1.优化程序分析逻辑,修改文件级别分析逻辑

master
liuzongren 9 months ago
parent d6a7cf0398
commit 09a0ffa44f
  1. 139
      src/main/java/com/keyware/composeanalysis/task/FileAnalysisTask.java

@ -1,5 +1,6 @@
package com.keyware.composeanalysis.task;
import cn.hutool.core.lang.Pair;
import com.keyware.common.constant.enums.AnalysisStatusEnum;
import com.keyware.composeanalysis.constant.FixedValue;
import com.keyware.composeanalysis.constant.RedisConst;
@ -101,7 +102,7 @@ public class FileAnalysisTask extends IAnalysisTask {
SolrDocumentList openSourceFileList = solrUtils.query(featureCoreName, querySb, "sourceMd5");
//如果当前文件在源码库中,匹配到了数据,则统计当前文件的开源率
if (CollectionUtils.isNotEmpty(openSourceFileList)) {
ananlyzeFileOpenRate(openSourceFileList);
ananlyzeFileOpenRate(openSourceFileList,codeFile);
}
}
}
@ -132,18 +133,13 @@ public class FileAnalysisTask extends IAnalysisTask {
/**
* 分析文件的开源率
*
* @param fileList 匹配的开源文件信息
* @param openSourceFileList 匹配的开源文件信息
* @throws IOException
*/
private void ananlyzeFileOpenRate(SolrDocumentList fileList) throws IOException {
//创建匹配开源文件信息匹配对象
MatchOpenFileMongoDto matchOpenFileInfo = new MatchOpenFileMongoDto();
matchOpenFileInfo.setId(IdGenerator.uuid32())
.setFileName(analysisFile.getName())
.setFilePath(analysisFile.getFileUrl());
private void ananlyzeFileOpenRate(SolrDocumentList openSourceFileList,CodeFile fileAnalysisRes) throws IOException {
//根据匹配的开源文件的md5 获取版本ID
Set<String> sourceFileMd5 = fileList.stream().map(solrDocument -> (String) solrDocument.get("sourceMd5")).collect(Collectors.toSet());
Set<String> sourceFileMd5 = openSourceFileList.stream().map(solrDocument -> (String) solrDocument.get("sourceMd5")).collect(Collectors.toSet());
String sourceCoreName = FixedValue.SUFFIX_SOLR_VERSION.get(analysisFile.getSuffix());
Map<String, SolrDocument> md5VersionObjMap = solrUtils.batchQueryVersionIdFromSourceFileBaseBySourceMd5(sourceCoreName, sourceFileMd5);
@ -160,52 +156,14 @@ public class FileAnalysisTask extends IAnalysisTask {
HashSet<Integer> openLineNum = new HashSet<>();
//开源文件结果集合
ArrayList<MatchOpenFile> matchOpenFileList = new ArrayList<>();
//遍历匹配到的开源文件列表
for (int i = 0; i < fileList.size(); i++) {
String openFileMd5 = (String) fileList.get(i).get("sourceMd5");
SolrDocument versionObj = md5VersionObjMap.get(openFileMd5);
String versionId = (String) versionObj.get("versionId");
VersionTree versionInfo = versionIdMap.get(versionId);
if (versionInfo == null) {
log.error("未在versionTree中找到版本信息,openFileMd5:{},versionId:{}",openFileMd5, versionId);
continue;
}
MatchOpenFile matchOpenFile = new MatchOpenFile();
matchOpenFile.setId(IdGenerator.uuid32())
.setVersionId(versionId)
.setSourceFilePath((String) versionObj.get("fullPath"))
.setSourceUrl(versionInfo.getDownUrl())
.setPId(versionInfo.getProId())
.setPName(versionInfo.getProName())
.setLicenseType(versionInfo.getLicenseType())
.setAnalyzeType(AnalysisLevelEnum.FILE_LEVEL.getCode())
.setVersion(versionInfo.getVersionName())
.setFeatureSimilarity(100.00f);
//计算被测件和开源文件的文本相似度
//根据文件的MD5的第一位获取solr库索引名称
String solrNameIndex =openFileMd5.substring(0, 1) + SolrDBConst.CORE_NAME_SUFFIX_SOURCE_FILE_INFO;
SolrDocumentList sourceFileInfo = solrUtils.query(solrNameIndex, "sourceFileMd5:" + openFileMd5, "sourceContent");
if (CollectionUtils.isNotEmpty(sourceFileInfo)) {
String openSourceContent = String.valueOf(sourceFileInfo.get(0).getFieldValue("sourceContent"));
//这里存在优化空间,被测件的文件行拆分 可以拿到循环外面
double similarity = SimilarityUtil.getSimilarityAndSaveRowNum(fileLines, openSourceContent, openLineNum);
matchOpenFile.setOpenRate(new BigDecimal(similarity * 100).setScale(2, RoundingMode.HALF_UP).floatValue());
//如果找不到源代码,直接将原文开源率置为 100%
} else {
log.error("找不到源代码,DBname:{},sourceFileMd5:{}", solrNameIndex, openFileMd5);
matchOpenFile.setOpenRate(100.00f);
}
matchOpenFile.setMd5(openFileMd5);
matchOpenFileList.add(matchOpenFile);
}
List<MatchOpenFile> matchOpenFilesRes = calculateOpenRate(openSourceFileList,fileAnalysisRes,sourceCoreName,openLineNum);
//统计被测件的总体开源率
//获取开源率阈值,判断当前文件是否开源
Integer openRateThreshold = analysisTask.getOpenRateThreshold();
int openLineCount = openLineNum.size();
BigDecimal totalLineCount = new BigDecimal(fileLines.size());
BigDecimal openRate = new BigDecimal(openLineCount).divide(totalLineCount, 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100));
BigDecimal openRate = new BigDecimal(openLineNum.size()).divide(new BigDecimal(fileLines.size()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2);
//超过阈值,则认为当前文件是开源文件
if (openRate.compareTo(new BigDecimal(openRateThreshold)) > 0) {
@ -215,18 +173,83 @@ public class FileAnalysisTask extends IAnalysisTask {
}
//修改保存测试文件信息
analysisFile.setOpenLineCount(openLineCount)
analysisFile.setOpenLineCount(openLineNum.size())
.setOpenRate(openRate.floatValue());
//组装开源信息
matchOpenFileInfo.setFilePath(analysisFile.getFileUrl())
//保存当前文件的开源信息到mongo库中
MatchOpenFileMongoDto matchOpenFileMongo = new MatchOpenFileMongoDto();
matchOpenFileMongo.setId(IdGenerator.uuid32())
.setFilePath(analysisFile.getFileUrl())
.setFileName(analysisFile.getName())
.setFeatureSimilarity(100.00f)
.setOpenRate(openRate.floatValue())
.setOpenType(analysisFile.getOpenType())
.setOpenRate(analysisFile.getOpenType() ? 100.00f : 0.00f)
.setMatchOpenFile(matchOpenFileList);
.setMatchOpenFile(matchOpenFilesRes);
//保存当前开源信息数据
mongoTemplate.insert(matchOpenFileInfo);
mongoTemplate.insert(matchOpenFileMongo);
}
/**
 * Builds the per-open-source-file match results for the file under analysis.
 *
 * <p>For every matched open-source file, the version record and version details are
 * resolved first; entries whose version information cannot be resolved are logged and
 * skipped. For the remaining entries the open-source file content is fetched, compared
 * against the analysed file's content, and the matched line numbers are accumulated into
 * {@code matchLineRowsNum} so the caller can derive the overall open-source rate.
 *
 * @param matchOpenFiles         all open-source files matched by MD5
 * @param fileAnalysisRes        parse result of the file under analysis; its source content is compared
 * @param sourceFileBaseCoreName Solr core name used to look up version records by source MD5
 * @param matchLineRowsNum       output accumulator: receives the line numbers matched across all open-source files
 * @return one {@link MatchOpenFile} per matched open-source file whose version information was found
 */
private List<MatchOpenFile> calculateOpenRate(SolrDocumentList matchOpenFiles, CodeFile fileAnalysisRes, String sourceFileBaseCoreName, Set<Integer> matchLineRowsNum) {
    // Result list of matched open-source files.
    List<MatchOpenFile> matchOpenFilesRes = new ArrayList<>();

    // Look up the version id and path information of each open-source file by its MD5.
    Set<String> openSourceFileMd5s = matchOpenFiles.stream().map(doc -> (String) doc.get("sourceMd5")).collect(Collectors.toSet());
    Map<String, SolrDocument> md5VersionInfoMap = solrUtils.batchQueryVersionIdFromSourceFileBaseBySourceMd5(sourceFileBaseCoreName, openSourceFileMd5s);

    // Resolve the detailed version information for every version id.
    // TODO: resolving one version currently needs two Solr cores plus the versionTree lookup; optimise later.
    Set<String> openSourceFileVersionIds = md5VersionInfoMap.values().stream().map(doc -> (String) doc.get("versionId")).collect(Collectors.toSet());
    List<VersionTree> versionTrees = solrUtils.queryBatchVersionInfoByVersionIds(openSourceFileVersionIds);
    Map<String, VersionTree> versionIdVersionInfoMap = versionTrees.stream().collect(Collectors.toMap(VersionTree::getVersionId, java.util.function.Function.identity()));

    for (SolrDocument openSourceFile : matchOpenFiles) {
        // MD5 of the matched open-source file.
        String openSourceFileMd5 = openSourceFile.getFieldValue("sourceMd5").toString();

        // Resolve version information before doing any expensive work. Without it the
        // result entry cannot be populated, so the file is skipped instead of failing
        // the whole analysis with a NullPointerException.
        SolrDocument openEntries = md5VersionInfoMap.get(openSourceFileMd5);
        if (openEntries == null) {
            log.error("找不到开源文件的版本记录,sourceMd5:{}", openSourceFileMd5);
            continue;
        }
        VersionTree versionInfo = versionIdVersionInfoMap.get(openEntries.get("versionId"));
        if (versionInfo == null) {
            log.error("找不到开源文件版本信息,versionId:{}", openEntries.get("versionId"));
            continue;
        }

        // Open-source rate of the analysed file against this open-source file.
        String openFileContent = solrUtils.getOpenFileContentByMd5(openSourceFileMd5);
        Pair<Float, HashSet<Integer>> openRateAndSaveRowNum = SimilarityUtil.getOpenRateAndSaveRowNum(fileAnalysisRes.getSourceFileContent(), openFileContent);

        // Collect the matched line numbers so the caller can compute the overall open-source rate.
        matchLineRowsNum.addAll(openRateAndSaveRowNum.getValue());

        // Assemble the open-source project information for this match.
        // NOTE(review): "fullPath" is stored via setSourceUrl here; confirm this is the intended field.
        MatchOpenFile matchOpenFileInfo = new MatchOpenFile();
        matchOpenFileInfo.setPId(versionInfo.getProId())
                .setPName(versionInfo.getProName())
                .setSourceUrl((String) openEntries.get("fullPath"))
                .setFeatureSimilarity(100.00f)
                .setOpenRate(openRateAndSaveRowNum.getKey())
                .setVersion(versionInfo.getVersionName())
                .setLicenseType(versionInfo.getLicenseType())
                .setAnalyzeType(AnalysisLevelEnum.FILE_LEVEL.getCode());
        matchOpenFilesRes.add(matchOpenFileInfo);
    }
    return matchOpenFilesRes;
}
}

Loading…
Cancel
Save