1. Optimize the program analysis logic; revise the line-level analysis logic

master
liuzongren 7 months ago
parent 0aa44dcca4
commit d6a7cf0398
  1.   4   src/main/java/com/keyware/composeanalysis/ComposeAnalysisApplication.java
  2.   4   src/main/java/com/keyware/composeanalysis/config/thread/TaskExecutePool.java
  3.  96   src/main/java/com/keyware/composeanalysis/constant/enums/AnalysisStatusEnum.java
  4.   2   src/main/java/com/keyware/composeanalysis/controller/ComposeAnalysisController.java
  5.   2   src/main/java/com/keyware/composeanalysis/service/impl/AnalysisTaskServiceImpl.java
  6. 115   src/main/java/com/keyware/composeanalysis/task/CodeBlockAnalysisTask.java
  7.   2   src/main/java/com/keyware/composeanalysis/task/FileAnalysisTask.java
  8.  69   src/main/java/com/keyware/composeanalysis/task/FunctionAnalysisTask.java
  9. 269   src/main/java/com/keyware/composeanalysis/task/LineAnalysisTask.java
  10.  4   src/main/java/com/keyware/composeanalysis/task/PorjectAnalysisTask.java
  11.  4   src/main/java/com/keyware/composeanalysis/util/SimilarityUtil.java
  12. 30   src/main/java/com/keyware/composeanalysis/util/SolrUtils.java

src/main/java/com/keyware/composeanalysis/ComposeAnalysisApplication.java

@@ -12,10 +12,10 @@ import org.springframework.scheduling.annotation.EnableAsync;
 @EnableDiscoveryClient
 @RefreshScope
 @EnableAsync
-public class ComposeAnalyzeApplication {
+public class ComposeAnalysisApplication {
     public static void main(String[] args) {
-        SpringApplication.run(ComposeAnalyzeApplication.class, args);
+        SpringApplication.run(ComposeAnalysisApplication.class, args);
     }
 }

src/main/java/com/keyware/composeanalysis/config/thread/TaskExecutePool.java

@@ -15,7 +15,7 @@ public class TaskExecutePool {
    /**
     * 核心线程数
     */
-   private int coreThreadsSize = 10;
+   private int coreThreadsSize = 20;
    /**
     * 最大线程数
@@ -45,7 +45,7 @@ public class TaskExecutePool {
        //活跃时间
        executor.setKeepAliveSeconds(keepAliveSeconds);
        //线程名字前缀
-       executor.setThreadNamePrefix("ComposeAnalysisExecutePool-");
+       executor.setThreadNamePrefix("analysis-exec-");
        // setRejectedExecutionHandler:当pool已经达到max size的时候,如何处理新任务
        // CallerRunsPolicy:不在新线程中执行任务,而是由调用者所在的线程来执行
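For context on the pool settings touched in the hunk above (core size, keep-alive, name prefix, CallerRunsPolicy), here is a minimal standalone sketch of a Spring ThreadPoolTaskExecutor configured the same way. The bean name and the max-pool/queue values are illustrative assumptions, not values taken from this repository.

import java.util.concurrent.ThreadPoolExecutor;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor;

@Configuration
public class ExampleTaskPoolConfig {

    @Bean("analysisExecutor")
    public ThreadPoolTaskExecutor analysisExecutor() {
        ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
        executor.setCorePoolSize(20);                    // core thread count, as raised in this commit
        executor.setMaxPoolSize(40);                     // illustrative value
        executor.setQueueCapacity(200);                  // illustrative value
        executor.setKeepAliveSeconds(60);                // keep-alive for idle threads above the core size
        executor.setThreadNamePrefix("analysis-exec-");  // prefix introduced by this commit
        // When the pool and its queue are full, run the task on the caller's thread instead of rejecting it
        executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
        executor.initialize();
        return executor;
    }
}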

src/main/java/com/keyware/composeanalysis/constant/enums/AnalysisStatusEnum.java

@@ -1,48 +1,48 @@
-package com.keyware.composeanalysis.constant.enums;
-
-
-/**
- * 分析状态枚举类
- */
-
-public enum AnalysisStatusEnum {
-
-    //0:未分析 1:正在分析 2:分析完成 3:暂停分析 4:等待 5:开始分析 6:终止分析 7:分析失败
-
-    UN_ANALYSIS("未分析",0),
-
-    ANALYSISING("正在分析",1),
-
-    ANALYSIS_DONE("分析完成",2),
-
-    PAUSE_ANALYSIS("暂停分析",3),
-
-    WAIT_ANALYSIS("等待分析",4),
-
-    START_ANALYSIS("开始分析",5),
-
-    STOP_ANALYSIS("终止分析",6),
-
-    FAIL_ANALYSIS("分析失败",7);
-
-    // 状态
-    private String status;
-    //状态码
-    private Integer code;
-
-    AnalysisStatusEnum(String status, Integer code) {
-        this.status = status;
-        this.code = code;
-    }
-
-    public String getStatus() {
-        return status;
-    }
-
-    public Integer getCode() {
-        return code;
-    }
-
-
-
-}
+//package com.keyware.composeanalysis.constant.enums;
+//
+//
+///**
+// * 分析状态枚举类
+// */
+//
+//public enum AnalysisStatusEnum {
+//
+//    //0:未分析 1:正在分析 2:分析完成 3:暂停分析 4:等待 5:开始分析 6:终止分析 7:分析失败
+//
+//    UN_ANALYSIS("未分析",0),
+//
+//    ANALYSISING("正在分析",1),
+//
+//    ANALYSIS_DONE("分析完成",2),
+//
+//    PAUSE_ANALYSIS("暂停分析",3),
+//
+//    WAIT_ANALYSIS("等待分析",4),
+//
+//    START_ANALYSIS("开始分析",5),
+//
+//    STOP_ANALYSIS("终止分析",6),
+//
+//    FAIL_ANALYSIS("分析失败",7);
+//
+//    // 状态
+//    private String status;
+//    //状态码
+//    private Integer code;
+//
+//    AnalysisStatusEnum(String status, Integer code) {
+//        this.status = status;
+//        this.code = code;
+//    }
+//
+//    public String getStatus() {
+//        return status;
+//    }
+//
+//    public Integer getCode() {
+//        return code;
+//    }
+//
+//
+//
+//}

src/main/java/com/keyware/composeanalysis/controller/ComposeAnalysisController.java

@@ -1,8 +1,8 @@
 package com.keyware.composeanalysis.controller;
 import com.keyware.common.constant.RedisConst;
+import com.keyware.common.constant.enums.AnalysisStatusEnum;
 import com.keyware.composeanalysis.api.ComposeAnalysisApi;
-import com.keyware.composeanalysis.constant.enums.AnalysisStatusEnum;
 import com.keyware.composeanalysis.entity.AnalysisTask;
 import com.keyware.composeanalysis.response.AnalysisResp;
 import com.keyware.composeanalysis.service.AnalysisTaskService;

src/main/java/com/keyware/composeanalysis/service/impl/AnalysisTaskServiceImpl.java

@@ -96,7 +96,7 @@ public class AnalysisTaskServiceImpl extends ServiceImpl<AnalyzeTaskMapper, Anal
     @Override
     public void stopComposeAnalysisTask(String taskId) {
         //将成分分析的任务状态的标志位置为暂停,让线程池中的排队的任务队列停止分析
-        redisUtil.set(String.format(RedisConst.TASK_RUNNING_STATUS_KEY_PREFIX, taskId), AnalysisStatusEnum.ANALYSIS_PAUSED.getCode());
+        redisUtil.set(String.format(RedisConst.TASK_RUNNING_STATUS_KEY_PREFIX, taskId), AnalysisStatusEnum.PAUSE_ANALYSIS.getCode());
     }

     @Override
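The change above only swaps which enum constant is written to the pause flag in Redis. As a rough illustration of that flag pattern, here is a sketch using Spring's StringRedisTemplate instead of the project's RedisUtil wrapper (which is not shown in this diff); the key pattern and the status code "3" (taken from the commented-out enum above) are assumptions.

import org.springframework.data.redis.core.StringRedisTemplate;

public class TaskStatusFlagSketch {

    // Assumed key pattern; the real RedisConst.TASK_RUNNING_STATUS_KEY_PREFIX is not shown in this diff
    static final String TASK_RUNNING_STATUS_KEY_PREFIX = "compose:task:status:%s";

    private final StringRedisTemplate redisTemplate;

    public TaskStatusFlagSketch(StringRedisTemplate redisTemplate) {
        this.redisTemplate = redisTemplate;
    }

    // Mark a task as paused (code 3 = "暂停分析" in the old enum) so queued workers can see the flag and stop
    public void pause(String taskId) {
        redisTemplate.opsForValue().set(String.format(TASK_RUNNING_STATUS_KEY_PREFIX, taskId), "3");
    }

    // Workers poll the flag before and during analysis and bail out when it reads "paused"
    public boolean isPaused(String taskId) {
        return "3".equals(redisTemplate.opsForValue().get(String.format(TASK_RUNNING_STATUS_KEY_PREFIX, taskId)));
    }
}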

src/main/java/com/keyware/composeanalysis/task/CodeBlockAnalysisTask.java

@@ -1,15 +1,16 @@
 package com.keyware.composeanalysis.task;
+import cn.hutool.core.collection.CollUtil;
 import cn.hutool.core.collection.CollectionUtil;
 import cn.hutool.core.lang.Pair;
 import com.alibaba.fastjson.JSONArray;
+import com.keyware.common.constant.enums.AnalysisStatusEnum;
 import com.keyware.composeanalysis.constant.FixedValue;
 import com.keyware.composeanalysis.constant.FunctionAndAnalysisAssemblyConst;
 import com.keyware.composeanalysis.constant.RedisConst;
 import com.keyware.composeanalysis.constant.SolrDBConst;
 import com.keyware.composeanalysis.constant.enums.AnalysisLevelEnum;
-import com.keyware.composeanalysis.constant.enums.AnalysisStatusEnum;
 import com.keyware.composeanalysis.constant.enums.FileAnalysisStatusEnum;
 import com.keyware.composeanalysis.entity.AnalysisTask;
 import com.keyware.composeanalysis.mongo.FileDataMongoDto;
@@ -92,8 +93,6 @@ public class CodeBlockAnalysisTask extends IAnalysisTask {
         String fileName = analysisFile.getName();
         try {
-            LineDataMongoDto lineDataMongoDto = new LineDataMongoDto();
-            lineDataMongoDto.setFileId(analysisFile.getId());
             Analysis analysis = AnalysisFactory.getAnalysis(filePath);
             //将代码块特征存入MongoDB
             //提取文件的代码块信息
@@ -103,11 +102,14 @@ public class CodeBlockAnalysisTask extends IAnalysisTask {
             //根据文件后缀判断需要查询的solr特征库库名称
             String featureCoreName = FixedValue.SUFFIX_SOLR_FILE.get(analysisFile.getSuffix());
+            //根据文件后缀,去检索sourceFileBase库,来获取文件版本信息
+            String sourceFileBaseCoreName = FixedValue.SUFFIX_SOLR_VERSION.get(analysisFile.getSuffix());
             //从solr库中获取特征相似的文件
             SolrDocumentList matchOpenSourceFiles = getFeatureSimilarityFromSolr(featureCoreName, lineFeatures);
             //计算开源率
-            doAnalysis(matchOpenSourceFiles, codeFile);
+            doAnalysis(matchOpenSourceFiles, sourceFileBaseCoreName, codeFile);
             //更新文件表的分析状态为3 行级特征以分析完毕
             analysisFile.setFileAnalysisStatus(FileAnalysisStatusEnum.ANALYSIS_DONE.getCode());
@@ -120,7 +122,7 @@ public class CodeBlockAnalysisTask extends IAnalysisTask {
             log.info("文件" + fileName + ":代码块级分析完成");
         } catch (Exception e) {
             AnalysisLogUtil.insertErrorInfo(mongoTemplate, "【代码块分析】失败" + fileName, e);
-            log.error("文件:" + fileName + "代码块级分析失败!", e);
+            log.error("文件:{}代码块级分析失败!", fileName, e);
             //修改当前文件分析状态未失败
             mongoTemplate.update(FileDataMongoDto.class)
                     .matching(where("_id").is(analysisFile.getId()))
@@ -144,10 +146,14 @@ public class CodeBlockAnalysisTask extends IAnalysisTask {
         Set<String> traitLineMd5Arr = functionAndCodeBlockInfos.stream().map(LineModel::getTraitLineMd5).collect(Collectors.toSet());
         Set<String> cuttLineMd5Arr = functionAndCodeBlockInfos.stream().map(LineModel::getCutLineMd5).collect(Collectors.toSet());
         Set<String> queryMd5Arr = Stream.concat(traitLineMd5Arr.stream(), cuttLineMd5Arr.stream()).collect(Collectors.toSet());
+        if (CollUtil.isEmpty(queryMd5Arr)) {
+            log.error("特征为空,无法查询:{}", analysisFile.getName());
+            return new SolrDocumentList();
+        }
         String queryStr = "line_hay:(" + StringUtils.join(queryMd5Arr, " OR ") + ")";
         log.info("查询条件: solrCoreName:{},queryStr:{}", solrCoreName, queryStr);
         SolrDocumentList result = solrUtils.query(solrCoreName, queryStr, "sourceMd5,line_hay");
-        log.info("查询结果: result:{}", result);
+        // log.info("查询结果: result:{}", result);
         return result;
     }
@@ -156,17 +162,17 @@ public class CodeBlockAnalysisTask extends IAnalysisTask {
      * 计算开源率 被测件的开源率
      *
      * @param matcheOpenSourceFiles  匹配的开源文件信息
+     * @param sourceFileBaseCoreName 查询版开源文件版本ID的 solr库名称
      * @param fileAnalysisRes        被测件的解析结果
      */
-    private void doAnalysis(SolrDocumentList matcheOpenSourceFiles, CodeFile fileAnalysisRes) {
+    private void doAnalysis(SolrDocumentList matcheOpenSourceFiles, String sourceFileBaseCoreName, CodeFile fileAnalysisRes) {
         if (CollectionUtil.isEmpty(matcheOpenSourceFiles)) {
+            //因为代码块的特征库较少,这里补充一个对比逻辑,如果当前文件解析失败,或者没有通过代码块匹配到数据,则直接通过文件的md5 再次查询一次solr库
+            checkByOriginalFileMd5(sourceFileBaseCoreName, analysisFile.getMd5());
             return;
         }
-        //根据文件后缀判断需要查询的文件版本库名称
-        String sourceFileBaseCoreName = FixedValue.SUFFIX_SOLR_VERSION.get(analysisFile.getSuffix());
         //保存所有匹配的特征代码块MD5信息,方便统计总的匹配行数
         Set<String> matchingTraitLineSet = new HashSet<>();
@@ -178,14 +184,14 @@ public class CodeBlockAnalysisTask extends IAnalysisTask {
         List<MatchOpenFile> matchOpenFilesRes = calculateSimilarityAndOpenRate(matcheOpenSourceFiles, fileAnalysisRes, sourceFileBaseCoreName, matchedLineRowsNum, matchingTraitLineSet);
         //计算文件的总体的特征相似度
-        Map<String, LineModel> traitMd5Map = fileAnalysisRes.getLine_hay().stream().collect(Collectors.toMap(LineModel::getTraitLineMd5, java.util.function.Function.identity()));
+        Map<String, Integer> traitsFeatureMd5AndFeatureLineNumMap = getTraitsFeatureMd5AndFeatureLineNumMap(fileAnalysisRes.getLine_hay());
         int matchCodeBlockLineCount = 0;
-        for (String matchFeatureFunctionMd5 : matchingTraitLineSet) {
-            LineModel lineModel = traitMd5Map.get(matchFeatureFunctionMd5);
-            matchCodeBlockLineCount += (Integer.valueOf(lineModel.getEndLine()) - Integer.valueOf(lineModel.getStartLine()));
+        for (String matchFeatureMd5 : matchingTraitLineSet) {
+            matchCodeBlockLineCount += traitsFeatureMd5AndFeatureLineNumMap.get(matchFeatureMd5);
         }
+        //计算文件的总体特征相似度
         BigDecimal featureSimilarity = new BigDecimal(matchCodeBlockLineCount).divide(new BigDecimal(analysisFile.getCodeRowNum()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2);
         //计算文件的总体开源率
@@ -235,7 +241,8 @@ public class CodeBlockAnalysisTask extends IAnalysisTask {
         List<VersionTree> versionTrees = solrUtils.queryBatchVersionInfoByVersionIds(openSourceFileVersionIds);
         Map<String, VersionTree> versionIdVersionInfoMap = versionTrees.stream().collect(Collectors.toMap(VersionTree::getVersionId, java.util.function.Function.identity()));
+        //按照特征行进行分组,一次匹配中,将所有的特征行进行累加
+        Map<String, Integer> traitsFeatureMd5AndFeatureLineNumMap = getTraitsFeatureMd5AndFeatureLineNumMap(fileAnalysisRes.getLine_hay());
         for (SolrDocument matchFile : matchOpenFiles) {
             //开源文件md5
@@ -247,25 +254,32 @@ public class CodeBlockAnalysisTask extends IAnalysisTask {
             //匹配的总特征行数
             int currentFileMatchFeatureLineCount = 0;
+            //当前文件所匹配的特征函数MD5
+            Set<String> currentFileMatchFeatureMd5 = new HashSet();
             //遍历当前文件的代码块特征,统计匹配的总行数
-            for (LineModel lineModel : fileAnalysisRes.getLine_hay()) {
-                String traitLineMd5 = lineModel.getTraitLineMd5();
+            for (String traitLineMd5 : traitsFeatureMd5AndFeatureLineNumMap.keySet()) {
                 //村换匹配到的文件的行信息
                 for (LineModel matchLine : openFileCodeBlockFeatureList) {
                     if (traitLineMd5.equals(matchLine.getTraitLineMd5())) {
-                        //计算匹配的特征行数
-                        currentFileMatchFeatureLineCount += (Integer.valueOf(matchLine.getEndLine()) - Integer.valueOf(matchLine.getStartLine()) + 1);
+                        if (!currentFileMatchFeatureMd5.contains(traitLineMd5)) {
+                            currentFileMatchFeatureMd5.add(traitLineMd5);
                             matchFeatureCodeBlockMd5s.add(traitLineMd5);
+                            currentFileMatchFeatureLineCount += traitsFeatureMd5AndFeatureLineNumMap.get(traitLineMd5);
+                        }
                     }
                 }
             }
             //根据源文件的MD5确定需要查询源码库的序号
             String openSourceCodeCoreIndex = openSourceFileMd5.substring(0, 1) + SolrDBConst.CORE_NAME_SUFFIX_SOURCE_FILE_INFO;
             //获取开源文件的文本信息
             SolrDocument openSourceContent = solrUtils.queryOne(openSourceCodeCoreIndex, "sourceFileMd5:" + openSourceFileMd5, "sourceContent");
+            if (openSourceContent == null) {
+                log.error("根据开源文件MD5查询源码失败,sourceFileMd5:{}", openSourceFileMd5);
+                continue;
+            }
             //当前文件的开源率
             Pair<Float, HashSet<Integer>> openRateAndSaveRowNum = getOpenRateAndSaveRowNum(fileAnalysisRes.getSourceFileContent(), openSourceContent.getFieldValue("sourceContent").toString());
@@ -274,10 +288,14 @@ public class CodeBlockAnalysisTask extends IAnalysisTask {
             matchLineRowsNum.addAll(openRateAndSaveRowNum.getValue());
             //统计当前文件的特征相似度
-            BigDecimal featureSimilarity = new BigDecimal(currentFileMatchFeatureLineCount).divide(fileAnalysisRes.getCodeRowNum(), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2);
+            BigDecimal featureSimilarity = new BigDecimal(currentFileMatchFeatureLineCount).divide(new BigDecimal(analysisFile.getCodeRowNum()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2);
             SolrDocument openEntries = md5VersionInfoMap.get(openSourceFileMd5);
             VersionTree versionInfo = versionIdVersionInfoMap.get(openEntries.get("versionId"));
+            if (versionInfo == null) {
+                log.error("根据开源文件版本ID查询版本信息失败,versionId:{}", openEntries.get("versionId"));
+                continue;
+            }
             //组装当前开源文件的开源项目信息
             MatchOpenFile matchOpenFileInfo = new MatchOpenFile();
@@ -295,6 +313,45 @@ public class CodeBlockAnalysisTask extends IAnalysisTask {
     }

+    /**
+     * 防止代码块特征库不全再次根据文件MD5查询开源文件信息, 做二次校验
+     *
+     * @param originalFileMd5
+     * @param versionIdCoreName
+     */
+    private void checkByOriginalFileMd5(String versionIdCoreName, String originalFileMd5) {
+        //根据文件的MD5,查询特征库,看当前文件是否在开源代码库中
+        SolrDocument versionIdAndPath = solrUtils.queryOne(versionIdCoreName, "sourceFileMd5:" + originalFileMd5, "versionId,fullPath,sourceFileMd5");
+        if (versionIdAndPath != null) {
+            //根据版本ID查询版本的详细信息
+            VersionTree versionInfo = solrUtils.queryVersionInfoByVersionId((String) versionIdAndPath.get("versionId"));
+            if (versionInfo != null) {
+                //当前开源文件的开源项目信息
+                MatchOpenFile matchOpenFileInfo = new MatchOpenFile();
+                matchOpenFileInfo.setPId(versionInfo.getProId())
+                        .setPName(versionInfo.getProName())
+                        .setSourceUrl(versionInfo.getDownUrl())
+                        .setFeatureSimilarity(100.00f)
+                        .setOpenRate(100.00f)
+                        .setAnalyzeType(AnalysisLevelEnum.FILE_LEVEL.getCode());
+                //保存当前文件的开源信息到mongo库中
+                MatchOpenFileMongoDto matchOpenFileMongo = new MatchOpenFileMongoDto();
+                matchOpenFileMongo.setId(IdGenerator.uuid32())
+                        .setFilePath(analysisFile.getFileUrl())
+                        .setFileName(analysisFile.getName())
+                        .setOpenRate(100.00f)
+                        .setOpenType(analysisFile.getOpenType())
+                        .setMatchOpenFile(Arrays.asList(matchOpenFileInfo));
+                mongoTemplate.save(matchOpenFileMongo);
+            }
+        }
+    }

     /**
      * 获取当前文件的代码块特征值
      *
@@ -310,6 +367,22 @@ public class CodeBlockAnalysisTask extends IAnalysisTask {
         return JSONArray.parseArray(lineFeatureMd5s, LineModel.class);
     }

+    /**
+     * 或者特征代码块的md5 当前md5包含的特征行数
+     *
+     * @param codeBlockInfos
+     * @return
+     */
+    private Map<String, Integer> getTraitsFeatureMd5AndFeatureLineNumMap(List<LineModel> codeBlockInfos) {
+        Map<String, List<LineModel>> traitMd5GroupMap = codeBlockInfos.stream().collect(Collectors.groupingBy(LineModel::getTraitLineMd5));
+        Map<String, Integer> resultMap = new HashMap<>();
+        for (String traitMd5 : traitMd5GroupMap.keySet()) {
+            List<LineModel> lineModels = traitMd5GroupMap.get(traitMd5);
+            int traitsLineNum = lineModels.stream().mapToInt(lineModel -> (Integer.valueOf(lineModel.getEndLine()) - Integer.valueOf(lineModel.getStartLine()) + 1)).sum();
+            resultMap.put(traitMd5, traitsLineNum);
+        }
+        return resultMap;
+    }

     /**
      * 将特征值插入到mongo库中
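The core of the change above is that matched code-block lines are now counted once per distinct feature MD5, via a map from trait MD5 to the number of lines it covers. Below is a self-contained sketch of that aggregation, using a simplified stand-in record instead of the real LineModel class (which lives in the keyswan dependency and is not shown in this diff).

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class FeatureLineCountSketch {

    // Simplified stand-in for LineModel: a trait MD5 plus the start/end line of the code block
    record Block(String traitLineMd5, int startLine, int endLine) {}

    // For each distinct trait MD5, sum the source lines covered by all blocks sharing that MD5
    static Map<String, Integer> featureMd5ToLineCount(List<Block> blocks) {
        return blocks.stream().collect(Collectors.groupingBy(
                Block::traitLineMd5,
                Collectors.summingInt(b -> b.endLine() - b.startLine() + 1)));
    }

    public static void main(String[] args) {
        List<Block> blocks = List.of(
                new Block("md5-a", 10, 19),   // 10 lines
                new Block("md5-a", 40, 44),   // 5 more lines with the same feature MD5
                new Block("md5-b", 60, 61));  // 2 lines
        // Prints {md5-a=15, md5-b=2}; a match on "md5-a" now contributes 15 lines exactly once
        System.out.println(featureMd5ToLineCount(blocks));
    }
}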

src/main/java/com/keyware/composeanalysis/task/FileAnalysisTask.java

@@ -1,10 +1,10 @@
 package com.keyware.composeanalysis.task;
+import com.keyware.common.constant.enums.AnalysisStatusEnum;
 import com.keyware.composeanalysis.constant.FixedValue;
 import com.keyware.composeanalysis.constant.RedisConst;
 import com.keyware.composeanalysis.constant.SolrDBConst;
 import com.keyware.composeanalysis.constant.enums.AnalysisLevelEnum;
-import com.keyware.composeanalysis.constant.enums.AnalysisStatusEnum;
 import com.keyware.composeanalysis.constant.enums.FileAnalysisStatusEnum;
 import com.keyware.composeanalysis.entity.AnalysisTask;
 import com.keyware.composeanalysis.mongo.FileDataMongoDto;

src/main/java/com/keyware/composeanalysis/task/FunctionAnalysisTask.java

@@ -1,15 +1,15 @@
 package com.keyware.composeanalysis.task;
+import cn.hutool.core.collection.CollUtil;
 import cn.hutool.core.collection.CollectionUtil;
 import cn.hutool.core.lang.Pair;
-import cn.hutool.core.util.ObjUtil;
 import com.alibaba.fastjson.JSONArray;
+import com.keyware.common.constant.enums.AnalysisStatusEnum;
 import com.keyware.composeanalysis.constant.FixedValue;
 import com.keyware.composeanalysis.constant.RedisConst;
 import com.keyware.composeanalysis.constant.SolrDBConst;
 import com.keyware.composeanalysis.constant.enums.AnalysisLevelEnum;
-import com.keyware.composeanalysis.constant.enums.AnalysisStatusEnum;
 import com.keyware.composeanalysis.constant.enums.FileAnalysisStatusEnum;
 import com.keyware.composeanalysis.entity.AnalysisTask;
 import com.keyware.composeanalysis.mongo.FileDataMongoDto;
@@ -102,34 +102,15 @@ public class FunctionAnalysisTask extends IAnalysisTask {
             //根据文件的名称获取函数解析器
             Analysis analysis = AnalysisFactory.getAnalysis(filePath);
             //解析文件
-            if (!ObjUtil.hasEmpty(featureCoreName, sourceFileBaseCoreName, analysis)) {
             CodeFile codeFile = analysis.analysisFile(new FileInputStream(filePath));
-                if (codeFile != null) {
-                    List<Function> functionList = codeFile.getFunctionList();
-                    if (CollectionUtil.isNotEmpty(functionList)) {
-                        //获取函数的特征MD5,cutMD5
-                        List<String> featureFunctionMd5List = functionList.stream().map(Function::getMd5).collect(Collectors.toList());
-                        List<String> cutFunctionMd5List = functionList.stream().map(Function::getSourceMd5).collect(Collectors.toList());
-                        Set<String> queryMd5List = Stream.concat(featureFunctionMd5List.stream(), cutFunctionMd5List.stream()).collect(Collectors.toSet());
-                        String queryStr = "fun_hay:(" + StringUtils.join(queryMd5List, " OR ") + ")";
-                        // log.info("检索函数特征,coreName:{} ,queryStr:{}", featureCoreName, queryStr);
-                        SolrDocumentList matchOpenFiles = solrUtils.query(featureCoreName, queryStr, "sourceMd5,fun_hay");
-                        // log.info("resp", sourceMd5);
-                        //如果函数级特征匹配,能够匹配到开源文件信息,则根据开源文件的md5或者开源文件信息,做相似度对比
-                        if (matchOpenFiles != null) {
-                            //对匹配到的文件进行分析
+            //根据函数特征去匹配到开源文件
+            SolrDocumentList matchOpenFiles = getFeatureSimilarityFromSolr(featureCoreName, codeFile.getFunctionList());
+            //计算开源率
             doAnalysis(matchOpenFiles, sourceFileBaseCoreName, codeFile);
-                        } else {
-                            //因为函数的特征库较少,这里补充一个对比逻辑,如果当前文件解析失败,或者没有通过函数匹配到数据,则直接通过文件的md5 再次查询一次solr库
-                            checkByOriginalFileMd5(sourceFileBaseCoreName, analysisFile.getMd5());
-                        }
-                    }
-                }
-            } else {
-                //因为函数的特征库较少,这里补充一个对比逻辑,如果当前文件解析失败,或者没有通过函数匹配到数据,则直接通过文件的md5 再次查询一次solr库
-                checkByOriginalFileMd5(sourceFileBaseCoreName, analysisFile.getMd5());
-            }
             //更新文件表的分析状态为3 函数级特征以分析完毕
             analysisFile.setFileAnalysisStatus(FileAnalysisStatusEnum.ANALYSIS_DONE.getCode());
@@ -160,9 +141,14 @@ public class FunctionAnalysisTask extends IAnalysisTask {
      * @param matchOpenFiles         通过特征匹配到的开源文件的md5
      * @param sourceFileBaseCoreName 查询版开源文件版本ID的 solr库名称
      * @param fileAnalysisRes        被测件的函数解析结果
-     * @throws Exception
      */
-    private void doAnalysis(SolrDocumentList matchOpenFiles, String sourceFileBaseCoreName, CodeFile fileAnalysisRes) throws Exception {
+    private void doAnalysis(SolrDocumentList matchOpenFiles, String sourceFileBaseCoreName, CodeFile fileAnalysisRes) {
+        if (CollectionUtil.isEmpty(matchOpenFiles)) {
+            //因为函数的特征库较少,这里补充一个对比逻辑,如果当前文件解析失败,或者没有通过函数匹配到数据,则直接通过文件的md5 再次查询一次solr库
+            checkByOriginalFileMd5(sourceFileBaseCoreName, analysisFile.getMd5());
+            return;
+        }
         //按照函数的特征md5进行分组,getter ,setter等方法的 特征值会重复
         Map<String, List<Function>> featureMd5FunctionMap = fileAnalysisRes.getFunctionList().stream().collect(Collectors.groupingBy(Function::getMd5));
@@ -239,7 +225,6 @@ public class FunctionAnalysisTask extends IAnalysisTask {
         List<VersionTree> versionTrees = solrUtils.queryBatchVersionInfoByVersionIds(openSourceFileVersionIds);
         Map<String, VersionTree> versionIdVersionInfoMap = versionTrees.stream().collect(Collectors.toMap(VersionTree::getVersionId, java.util.function.Function.identity()));
-
         //函数总行数
         BigDecimal totalFunctionLineCount = new BigDecimal(fileAnalysisRes.getFunctionList().stream().mapToInt(Function::getCodeRowNum).sum());
@@ -328,7 +313,7 @@ public class FunctionAnalysisTask extends IAnalysisTask {
                     .setSourceUrl(versionInfo.getDownUrl())
                     .setFeatureSimilarity(100.00f)
                     .setOpenRate(100.00f)
-                    .setAnalyzeType(AnalysisLevelEnum.FUNCTION_LEVEL.getCode());
+                    .setAnalyzeType(AnalysisLevelEnum.FILE_LEVEL.getCode());
             //保存当前文件的开源信息到mongo库中
             MatchOpenFileMongoDto matchOpenFileMongo = new MatchOpenFileMongoDto();
@@ -345,6 +330,28 @@ public class FunctionAnalysisTask extends IAnalysisTask {
     }

+    /**
+     * 根据 特征值 从特征库中检索 具有特征相似的
+     *
+     * @param featureCoreName 检索的solr 库名称
+     * @param functionList
+     * @return
+     */
+    private SolrDocumentList getFeatureSimilarityFromSolr(String featureCoreName, List<Function> functionList) {
+        //获取方法快的特征MD5,cutMD5
+        Set<String> featureFunctionMd5List = functionList.stream().map(Function::getMd5).collect(Collectors.toSet());
+        Set<String> cutFunctionMd5List = functionList.stream().map(Function::getSourceMd5).collect(Collectors.toSet());
+        Set<String> queryMd5List = Stream.concat(featureFunctionMd5List.stream(), cutFunctionMd5List.stream()).collect(Collectors.toSet());
+        if (CollUtil.isEmpty(queryMd5List)) {
+            log.error("特征为空,无法查询:{}", analysisFile.getName());
+            return new SolrDocumentList();
+        }
+        String queryStr = "fun_hay:(" + StringUtils.join(queryMd5List, " OR ") + ")";
+        // log.info("检索函数特征,coreName:{} ,queryStr:{}", featureCoreName, queryStr);
+        return solrUtils.query(featureCoreName, queryStr, "sourceMd5,fun_hay");
+    }

     /**
      * 获取当前文件的函数特征值
     *
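The new getFeatureSimilarityFromSolr helper above guards against an empty MD5 set before building the fun_hay OR query. Here is a minimal sketch of just that query-string construction in plain Java (no Solr client), with hypothetical MD5 values; it is an illustration of the pattern, not the project's API.

import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

public class FeatureQuerySketch {

    // Build a Solr query such as fun_hay:(md5a OR md5b); return null when there is nothing to query
    static String buildFeatureQuery(String field, Set<String> md5s) {
        if (md5s == null || md5s.isEmpty()) {
            return null; // mirrors the early return added in the diff when the feature set is empty
        }
        return field + ":(" + String.join(" OR ", md5s) + ")";
    }

    public static void main(String[] args) {
        Set<String> md5s = new LinkedHashSet<>(List.of("a1b2c3", "d4e5f6")); // hypothetical values
        System.out.println(buildFeatureQuery("fun_hay", md5s)); // fun_hay:(a1b2c3 OR d4e5f6)
    }
}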

src/main/java/com/keyware/composeanalysis/task/LineAnalysisTask.java

@@ -2,12 +2,14 @@ package com.keyware.composeanalysis.task;
 import cn.hutool.core.collection.CollectionUtil;
+import cn.hutool.core.lang.Pair;
+import cn.hutool.core.util.StrUtil;
+import com.keyware.common.constant.enums.AnalysisStatusEnum;
 import com.keyware.composeanalysis.constant.FixedValue;
 import com.keyware.composeanalysis.constant.FunctionAndAnalysisAssemblyConst;
 import com.keyware.composeanalysis.constant.RedisConst;
 import com.keyware.composeanalysis.constant.SolrDBConst;
 import com.keyware.composeanalysis.constant.enums.AnalysisLevelEnum;
-import com.keyware.composeanalysis.constant.enums.AnalysisStatusEnum;
 import com.keyware.composeanalysis.constant.enums.FileAnalysisStatusEnum;
 import com.keyware.composeanalysis.entity.AnalysisTask;
 import com.keyware.composeanalysis.mongo.FileDataMongoDto;
@@ -15,10 +17,7 @@ import com.keyware.composeanalysis.mongo.LineDataMongoDto;
 import com.keyware.composeanalysis.mongo.MatchOpenFile;
 import com.keyware.composeanalysis.mongo.MatchOpenFileMongoDto;
 import com.keyware.composeanalysis.solr.VersionTree;
-import com.keyware.composeanalysis.util.AnalysisLogUtil;
-import com.keyware.composeanalysis.util.RedisUtil;
-import com.keyware.composeanalysis.util.SolrUtils;
-import com.keyware.composeanalysis.util.SpringContextUtils;
+import com.keyware.composeanalysis.util.*;
 import com.keyware.keyswan.anaysis.Analysis;
 import com.keyware.keyswan.anaysis.AnalysisFactory;
 import com.keyware.keyswan.common.CodeFile;
@@ -34,6 +33,7 @@ import java.math.BigDecimal;
 import java.math.RoundingMode;
 import java.util.*;
 import java.util.concurrent.CountDownLatch;
+import java.util.stream.Collectors;

 import static org.springframework.data.mongodb.core.query.Criteria.where;
@@ -90,31 +90,17 @@
         AnalysisLogUtil.insert(mongoTemplate, "【行级特征提取】正在提取" + fileName);
         try {
-            LineDataMongoDto lineDataMongoDto = new LineDataMongoDto();
-            lineDataMongoDto.setFileId(analysisFile.getId())
-                    .setStatus(0)
-                    .setIsSelect(false);
             Analysis analysis = AnalysisFactory.getAnalysis(filePath);
-            CodeFile codeFile = null;
             //获取文件行级特征md5
-            codeFile = analysis.analysisFile(filePath, FunctionAndAnalysisAssemblyConst.LINE_EXTRACT, FunctionAndAnalysisAssemblyConst.LINE_EXTRACT);
-            //每一行原内容MD5值集合
-            // String cutFileLineMd5 = codeFile.getCutFileLineMd5();
-            //每一行特征内容MD5值集合
-            String traitFileLineMd5 = codeFile.getTraitFileLineMd5();
-            String[] featureMd5Arr = {};
-            if (StringUtils.isNotBlank(traitFileLineMd5)) {
-                featureMd5Arr = traitFileLineMd5.split(",");
-            }
-            List<String> lineFeatures = Arrays.asList(featureMd5Arr);
+            CodeFile codeFile = analysis.analysisFile(filePath, FunctionAndAnalysisAssemblyConst.LINE_EXTRACT, FunctionAndAnalysisAssemblyConst.LINE_EXTRACT);
             //从solr中获取特征相似的 文件
-            SolrDocumentList featureSimilarityFromSolr = getFeatureSimilarityFromSolr(lineFeatures);
+            SolrDocumentList featureSimilarityFromSolr = getFeatureSimilarityFromSolr(codeFile);
             //计算文件的开源率
-            calculateOpenRate(featureSimilarityFromSolr, lineFeatures);
+            doAnalysis(featureSimilarityFromSolr, codeFile);
             //更新文件表的分析状态为3 行级特征以分析完毕
             analysisFile.setFileAnalysisStatus(FileAnalysisStatusEnum.ANALYSIS_DONE.getCode());
@@ -123,11 +109,12 @@
                     .replaceWith(analysisFile)
                     .findAndReplace();
-            AnalysisLogUtil.insert(mongoTemplate, "【行级特征提取】提取完成" + fileName);
+            //插入日志
+            AnalysisLogUtil.insert(mongoTemplate, "【行级分析】完成" + fileName);
             log.info("文件" + fileName + ":行级分析完成");
         } catch (Exception e) {
-            AnalysisLogUtil.insertErrorInfo(mongoTemplate, "【行级特征提取】提取失败" + fileName, e);
-            log.error("文件:" + fileName + "行级别特征提取失败!", e);
+            AnalysisLogUtil.insertErrorInfo(mongoTemplate, "【行级分析】失败" + fileName, e);
+            log.error("文件:{}行级别分析失败!", fileName, e);
             //修改当前文件分析状态未失败
             mongoTemplate.update(FileDataMongoDto.class)
                     .matching(where("_id").is(analysisFile.getId()))
@@ -143,97 +130,190 @@
      * 计算开源率 被测件的开源率
      *
      * @param matcheOpenSourceFiles
-     * @param lineFeatures
+     * @param codeFile 文件解析结果
      */
-    private void calculateOpenRate(SolrDocumentList matcheOpenSourceFiles, List<String> lineFeatures) {
-        if (CollectionUtil.isEmpty(matcheOpenSourceFiles)) {
-            return;
-        }
-        //根据文件后缀判断需要查询的文件版本库名称
-        String versionIdCoreName = FixedValue.SUFFIX_SOLR_VERSION.get(analysisFile.getSuffix());
-        //定义结果集对象
-        MatchOpenFileMongoDto matchOpenFileMongo = new MatchOpenFileMongoDto();
-        matchOpenFileMongo.setId(IdGenerator.uuid32())
-                .setFilePath(analysisFile.getFileUrl())
-                .setFileName(analysisFile.getName());
-        //开源文件信息保存结果集
-        List<MatchOpenFile> matchOpenFileInfoList = new ArrayList<>();
-        //保存所有匹配的行数信息,方便统计总的匹配行数
-        Set<String> matchingLineSet = new HashSet<>();
-        //获取文件总行数
-        BigDecimal totalCodeRowNum = new BigDecimal(analysisFile.getCodeRowNum());
-        //统计每个开源文件和被测件的匹配行数
-        for (SolrDocument matchFile : matcheOpenSourceFiles) {
-            //解析文件的代码块特征值
-            String lineFeatureMd5s = (String) matchFile.get("tz_line_hay");
-            List<String> matchedLineFeatures = Arrays.asList(lineFeatureMd5s.split(","));
-            //匹配的总行数
-            int currentFileMatchLineCount = 0;
-            //遍历当前文件的代码块特征,统计匹配的总行数
-            for (String originalLineFeatureMd5 : lineFeatures) {
-                for (String matchLineFeatureMd5 : matchedLineFeatures) {
-                    if (originalLineFeatureMd5.equals(matchLineFeatureMd5)) {
-                        currentFileMatchLineCount++;
-                        matchingLineSet.add(originalLineFeatureMd5);
-                    }
-                }
-            }
-            //首先根据文件的MD5查询开源文件的版本ID,和路径信息
-            SolrDocument versionIdAndPath = solrUtils.queryOne(versionIdCoreName, "sourceFileMd5:" + matchFile.get("sourceMd5"), "versionId,fullPath,sourceFileMd5");
-            //根据版本ID查询版本的详细信息
-            //todo 这里 查询一个版本的信息 需要检索 两个 solr 库 而且还需要检索 versioinTree 后面需要优化
-            VersionTree versionInfo = solrUtils.queryVersionInfoByVersionId((String) versionIdAndPath.get("versionId"));
-            //计算与当前开源文件的开源率
-            BigDecimal openRate = new BigDecimal(currentFileMatchLineCount).divide(totalCodeRowNum, 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100));
-            //当前开源文件的开源项目信息
-            MatchOpenFile matchOpenFileInfo = new MatchOpenFile();
-            matchOpenFileInfo.setPId(versionInfo.getProId())
-                    .setPName(versionInfo.getProName())
-                    .setSourceUrl(versionInfo.getDownUrl())
-                    .setOpenRate(openRate.floatValue())
-                    .setVersion(versionInfo.getVersionName())
-                    .setLicenseType(versionInfo.getLicenseType())
-                    .setAnalyzeType(AnalysisLevelEnum.LINE_LEVEL.getCode());
-            matchOpenFileInfoList.add(matchOpenFileInfo);
-        }
-        //统计当前文件的整体开源率
-        BigDecimal openRate = new BigDecimal(matchingLineSet.size()).divide(totalCodeRowNum, 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100));
-        //获取开源率的阈值
-        Integer openRateThreshold = analysisTask.getOpenRateThreshold();
-        //如果开源率大于阈值,则将当前文件设置成开源
-        if (openRate.compareTo(new BigDecimal(openRateThreshold)) >= 0) {
-            analysisFile.setOpenType(true);
-        }
-        //保存当前文件的开源信息
-        matchOpenFileMongo.setOpenType(analysisFile.getOpenType())
-                .setMatchOpenFile(matchOpenFileInfoList);
-        mongoTemplate.save(matchOpenFileMongo);
-    }
+    private void doAnalysis(SolrDocumentList matcheOpenSourceFiles, CodeFile codeFile) {
+        //根据文件后缀判断需要查询的文件版本库名称
+        String versionIdCoreName = FixedValue.SUFFIX_SOLR_VERSION.get(analysisFile.getSuffix());
+        if (CollectionUtil.isEmpty(matcheOpenSourceFiles)) {
+            //因为行的特征库较少,这里补充一个对比逻辑,如果当前文件解析失败,或者没有通过代码块匹配到数据,则直接通过文件的md5 再次查询一次solr库
+            checkByOriginalFileMd5(versionIdCoreName, analysisFile.getMd5());
+            return;
+        }
+        //保存所有匹配的行数信息,方便统计总的匹配行数
+        Set<String> matchedFeatureMd5 = new HashSet<>();
+        //保存所有匹配的行数信息,方便统计总的匹配行数
+        Set<Integer> matchLineRowsNum = new HashSet<>();
+        //获取文件总特征行数
+        String traitFileLineMd5 = codeFile.getTraitFileLineMd5();
+        List<String> lineFeatureList = Arrays.asList(traitFileLineMd5.split(","));
+        //统计每个文件的开源率
+        List<MatchOpenFile> matchOpenFilesRes = calculateSimilarityAndOpenRate(matcheOpenSourceFiles, codeFile, versionIdCoreName, matchLineRowsNum, matchedFeatureMd5);
+        //计算文件的总体特征相似度
+        BigDecimal featureSimilarity = new BigDecimal(matchedFeatureMd5.size()).divide(new BigDecimal(lineFeatureList.size()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2);
+        //计算文件的总体开源率
+        BigDecimal openRate = new BigDecimal(matchLineRowsNum.size()).divide(new BigDecimal(analysisFile.getCodeRowNum()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2);
+        //获取开源率的阈值
+        Integer openRateThreshold = analysisTask.getOpenRateThreshold();
+        //如果开源率大于阈值,则将当前文件设置成开源
+        if (openRate.compareTo(new BigDecimal(openRateThreshold)) >= 0) {
+            analysisFile.setOpenType(true);
+        }
+        //保存当前文件的开源信息到mongo库中
+        MatchOpenFileMongoDto matchOpenFileMongo = new MatchOpenFileMongoDto();
+        matchOpenFileMongo.setId(IdGenerator.uuid32())
+                .setFilePath(analysisFile.getFileUrl())
+                .setFileName(analysisFile.getName())
+                .setFeatureSimilarity(featureSimilarity.floatValue())
+                .setOpenRate(openRate.floatValue())
+                .setOpenType(analysisFile.getOpenType())
+                .setMatchOpenFile(matchOpenFilesRes);
+        log.info("文件" + analysisFile.getName() + ":开源率:" + openRate.floatValue() + ",特征相似度:" + featureSimilarity.floatValue());
+        mongoTemplate.save(matchOpenFileMongo);
+    }
+
+    /**
+     * 计算当前文件的特征相似度 开源率
+     *
+     * @param matchOpenFiles 通过MD5 匹配到的所有开源文件
+     * @param sourceFileBaseCoreName 当前文件特征文件的 solr coreName
+     * @param matchLineRowsNum 所有开源文件匹配到的开源行号列表
+     * @param matchFeatureLineMd5s 所有开源文件匹配到的特征行MD5
+     * @return 匹配的开源文件解析后的结果集
+     */
+    private List<MatchOpenFile> calculateSimilarityAndOpenRate(SolrDocumentList matchOpenFiles, CodeFile fileAnalysisRes, String sourceFileBaseCoreName, Set<Integer> matchLineRowsNum, Set<String> matchFeatureLineMd5s) {
+        //匹配的开源文件列表
+        List<MatchOpenFile> matchOpenFilesRes = new ArrayList<>();
+        //首先根据文件的MD5查询开源文件的版本ID,和路径信息
+        Set<String> openSourceFileMd5s = matchOpenFiles.stream().map(doc -> (String) doc.get("sourceMd5")).collect(Collectors.toSet());
+        Map<String, SolrDocument> md5VersionInfoMap = solrUtils.batchQueryVersionIdFromSourceFileBaseBySourceMd5(sourceFileBaseCoreName, openSourceFileMd5s);
+        //根据版本ID查询版本的详细信息
+        //todo 这里 查询一个版本的信息 需要检索 两个 solr 库 而且还需要检索 versioinTree 后面需要优化
+        Set<String> openSourceFileVersionIds = md5VersionInfoMap.values().stream().map(doc -> (String) doc.get("versionId")).collect(Collectors.toSet());
+        List<VersionTree> versionTrees = solrUtils.queryBatchVersionInfoByVersionIds(openSourceFileVersionIds);
+        Map<String, VersionTree> versionIdVersionInfoMap = versionTrees.stream().collect(Collectors.toMap(VersionTree::getVersionId, java.util.function.Function.identity()));
+        String traitFileLineMd5 = fileAnalysisRes.getTraitFileLineMd5();
+        List<String> lineFeatureList = Arrays.asList(traitFileLineMd5.split(","));
+        for (SolrDocument openSourceFile : matchOpenFiles) {
+            //开源文件MD5
+            String openSourceFileMd5 = openSourceFile.getFieldValue("sourceMd5").toString();
+            //解析文件的特征行
+            String lineFeatureMd5s = (String) openSourceFile.get("tz_line_hay");
+            List<String> openFileLineFeatures = Arrays.asList(lineFeatureMd5s.split(","));
+            //获取开源文件的文本信息
+            String openSourceContent = solrUtils.getOpenFileContentByMd5(openSourceFileMd5);
+            //当前文件匹配特征行总行数
+            int currentFileMatchFeatureLineCount = 0;
+            //遍历函数特征MD5
+            for (String lineFeatureMd5 : lineFeatureList) {
+                //源文件的特征行列表
+                for (String openFileLineFeature : openFileLineFeatures) {
+                    if (lineFeatureMd5.equals(openFileLineFeature)) {
+                        matchFeatureLineMd5s.add(lineFeatureMd5);
+                        currentFileMatchFeatureLineCount++;
+                    }
+                }
+            }
+            //当前文件的开源率
+            Pair<Float, HashSet<Integer>> openRateAndSaveRowNum = SimilarityUtil.getOpenRateAndSaveRowNum(fileAnalysisRes.getSourceFileContent(), openSourceContent);
+            //将当前文件匹配的行号,存储到缓存中,方便统计整体的开源率
+            matchLineRowsNum.addAll(openRateAndSaveRowNum.getValue());
+            //统计当前文件的特征相似度
+            BigDecimal featureSimilarity = new BigDecimal(currentFileMatchFeatureLineCount).divide(new BigDecimal(lineFeatureList.size()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2);
+            SolrDocument openEntries = md5VersionInfoMap.get(openSourceFileMd5);
+            VersionTree versionInfo = versionIdVersionInfoMap.get(openEntries.get("versionId"));
+            if (versionInfo == null) {
+                log.error("根据版本ID,未查询到相关的版本信息。versionId:{}", openEntries.get("versionId"));
+                continue;
+            }
+            //组装当前开源文件的开源项目信息
+            MatchOpenFile matchOpenFileInfo = new MatchOpenFile();
+            matchOpenFileInfo.setPId(versionInfo.getProId())
+                    .setPName(versionInfo.getProName())
+                    .setSourceUrl((String) openEntries.get("fullPath"))
+                    .setFeatureSimilarity(featureSimilarity.floatValue())
+                    .setOpenRate(openRateAndSaveRowNum.getKey())
+                    .setVersion(versionInfo.getVersionName())
+                    .setLicenseType(versionInfo.getLicenseType())
+                    .setAnalyzeType(AnalysisLevelEnum.FUNCTION_LEVEL.getCode());
+            matchOpenFilesRes.add(matchOpenFileInfo);
+        }
+        return matchOpenFilesRes;
+    }
+
+    /**
+     * 防止代码块特征库不全再次根据文件MD5查询开源文件信息, 做二次校验
+     *
+     * @param originalFileMd5
+     * @param versionIdCoreName
+     */
+    private void checkByOriginalFileMd5(String versionIdCoreName, String originalFileMd5) {
+        //根据文件的MD5,查询特征库,看当前文件是否在开源代码库中
+        SolrDocument versionIdAndPath = solrUtils.queryOne(versionIdCoreName, "sourceFileMd5:" + originalFileMd5, "versionId,fullPath,sourceFileMd5");
+        if (versionIdAndPath != null) {
+            //根据版本ID查询版本的详细信息
+            VersionTree versionInfo = solrUtils.queryVersionInfoByVersionId((String) versionIdAndPath.get("versionId"));
+            if (versionInfo != null) {
+                //当前开源文件的开源项目信息
+                MatchOpenFile matchOpenFileInfo = new MatchOpenFile();
+                matchOpenFileInfo.setPId(versionInfo.getProId())
+                        .setPName(versionInfo.getProName())
+                        .setSourceUrl(versionInfo.getDownUrl())
+                        .setFeatureSimilarity(100.00f)
+                        .setOpenRate(100.00f)
+                        .setAnalyzeType(AnalysisLevelEnum.FILE_LEVEL.getCode());
+                //保存当前文件的开源信息到mongo库中
+                MatchOpenFileMongoDto matchOpenFileMongo = new MatchOpenFileMongoDto();
+                matchOpenFileMongo.setId(IdGenerator.uuid32())
+                        .setFilePath(analysisFile.getFileUrl())
+                        .setFileName(analysisFile.getName())
+                        .setOpenRate(100.00f)
+                        .setOpenType(analysisFile.getOpenType())
+                        .setMatchOpenFile(Arrays.asList(matchOpenFileInfo));
+                mongoTemplate.save(matchOpenFileMongo);
+            }
+        }
+    }

     /**
      * 将特征值插入到mongo库中
+     *
      * @param features 特征集合
      * @param lineDataMongoDto 当前分析任务 特征信息存储
      * todo 后期 看看有没有插入的必要
@@ -281,16 +361,31 @@ public class LineAnalysisTask extends IAnalysisTask {
     /**
      * 根据 特征值 从特征库中检索 具有特征相似的
      *
-     * @param lineFeatureList 行特征信息
+     * @param codeFile 行特征信息
      * @return
      */
-    private SolrDocumentList getFeatureSimilarityFromSolr(List<String> lineFeatureList) {
-        String solrCoreName = SolrDBConst.CORE_NAME_SOURCE_FILE_INFO_TEMP;
+    private SolrDocumentList getFeatureSimilarityFromSolr(CodeFile codeFile) {
+        Set<String> queryMd5Set = new HashSet<>();
+        //每一行原内容MD5值集合
+        String cutFileLineMd5 = codeFile.getCutFileLineMd5();
+        if (StrUtil.isNotBlank(cutFileLineMd5)) {
+            List<String> lineCutList = Arrays.asList(cutFileLineMd5.split(","));
+            queryMd5Set.addAll(lineCutList);
+        }
+        //每一行特征内容MD5值集合
+        String traitFileLineMd5 = codeFile.getTraitFileLineMd5();
+        if (StrUtil.isNotBlank(traitFileLineMd5)) {
+            List<String> lineFeatureList = Arrays.asList(traitFileLineMd5.split(","));
+            queryMd5Set.addAll(lineFeatureList);
+        }
+        if (CollectionUtil.isEmpty(queryMd5Set)) {
+            log.error("特征相似度检索失败,特征为空:{}", analysisFile.getName());
+            return new SolrDocumentList();
+        }
         //拼接行特征查询条件
-        String queryStr = "tz_line_hay:(" + StringUtils.join(lineFeatureList, " OR ") + ")";
-        log.info("查询条件: solrCoreName:{},queryStr:{}", solrCoreName, queryStr);
-        SolrDocumentList result = solrUtils.query(solrCoreName, queryStr, "sourceMd5,tz_line_hay");
-        log.info("查询结果: result:{}", result);
+        String queryStr = "tz_line_hay:(" + StringUtils.join(queryMd5Set, " OR ") + ")";
+        log.info("查询条件: solrCoreName:{},queryStr:{}", SolrDBConst.CORE_NAME_SOURCE_FILE_INFO_TEMP, queryStr);
+        SolrDocumentList result = solrUtils.query(SolrDBConst.CORE_NAME_SOURCE_FILE_INFO_TEMP, queryStr, "sourceMd5,tz_line_hay");
         return result;
     }
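Both the overall feature similarity and the overall open source rate in the rewritten doAnalysis above follow the same BigDecimal pattern: matched count divided by total with HALF_UP rounding at four decimal places, then scaled to a percentage with two decimals. A small standalone sketch of that arithmetic with made-up counts:

import java.math.BigDecimal;
import java.math.RoundingMode;

public class OpenRateSketch {

    // matched / total * 100, rounded the same way as in the tasks above
    static BigDecimal percentage(int matched, int total) {
        return new BigDecimal(matched)
                .divide(new BigDecimal(total), 4, RoundingMode.HALF_UP)
                .multiply(new BigDecimal(100))
                .setScale(2);
    }

    public static void main(String[] args) {
        // hypothetical counts: 37 of 120 feature lines matched, 52 of 120 source rows matched
        System.out.println(percentage(37, 120));  // 30.83
        System.out.println(percentage(52, 120));  // 43.33
    }
}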

src/main/java/com/keyware/composeanalysis/task/PorjectAnalysisTask.java

@@ -2,10 +2,10 @@ package com.keyware.composeanalysis.task;
 import cn.hutool.core.collection.CollectionUtil;
 import com.google.common.collect.Sets;
+import com.keyware.common.constant.enums.AnalysisStatusEnum;
 import com.keyware.composeanalysis.constant.FixedValue;
 import com.keyware.composeanalysis.constant.MongoDBConst;
 import com.keyware.composeanalysis.constant.enums.AnalysisLevelEnum;
-import com.keyware.composeanalysis.constant.enums.AnalysisStatusEnum;
 import com.keyware.composeanalysis.constant.enums.FileAnalysisStatusEnum;
 import com.keyware.composeanalysis.entity.AnalysisTask;
 import com.keyware.composeanalysis.mongo.*;
@@ -290,7 +290,7 @@ public class PorjectAnalysisTask {
             matchOpenFileInfo.setId(IdGenerator.uuid32())
                     .setFileName(originalFile.getName())
                     .setFilePath(originalFile.getFileUrl())
-                    .setOpenType(originalFile.getOpenType())
+                    .setOpenType(true)
                     .setFeatureSimilarity(100.00f)
                     .setOpenRate(100.00f)
                     .setMatchOpenFile(Arrays.asList(matchOpenFile));

src/main/java/com/keyware/composeanalysis/util/SimilarityUtil.java

@@ -83,7 +83,7 @@ public class SimilarityUtil {
 //        //计算开源率
 //        BigDecimal openRate = new BigDecimal(matchedRowsNum.size()).divide(new BigDecimal(analysisFileLineInfo.size()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2);
 //
-//        return new Pair<>(openRate.floatValue(), matchedRowsNum);
+//        return new Pair<>(openRate.toString(), matchedRowsNum);
 //    }
@@ -110,7 +110,7 @@ public class SimilarityUtil {
 //        //计算开源率
 //        BigDecimal openRate = new BigDecimal(matchedRowsNum.size()).divide(new BigDecimal(analysisFileLineInfo.size()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2);
 //
-//        return new Pair<>(openRate.floatValue(), matchedRowsNum);
+//        return new Pair<>(openRate.toString(), matchedRowsNum);
 //    }
 public static Pair<Float, HashSet<Integer>> getOpenRateAndSaveRowNum(String analysisFile, String openSourceFile) {

src/main/java/com/keyware/composeanalysis/util/SolrUtils.java

@@ -1,6 +1,7 @@
 package com.keyware.composeanalysis.util;
 import com.keyware.composeanalysis.constant.MongoDBConst;
+import com.keyware.composeanalysis.constant.SolrDBConst;
 import com.keyware.composeanalysis.solr.VersionTree;
 import lombok.Data;
 import lombok.extern.log4j.Log4j2;
@@ -141,10 +142,6 @@ public class SolrUtils {
     }
-
-
-
-
     /**
      * 简单查询,指定返回字段
      *
@@ -175,6 +172,31 @@ public class SolrUtils {
     }

+    /**
+     * 根据开源文件的MD5 获取开源文件的文本内容
+     *
+     * @param openSourceFileMd5 开源文件的MD5
+     * @return
+     * @throws Exception
+     */
+    public String getOpenFileContentByMd5(String openSourceFileMd5) {
+        //根据源文件的MD5确定需要查询源码库的序号
+        String openSourceCodeCoreIndex = openSourceFileMd5.substring(0, 1) + SolrDBConst.CORE_NAME_SUFFIX_SOURCE_FILE_INFO;
+        //获取开源文件的文本信息
+        SolrDocument openSourceContent = this.queryOne(openSourceCodeCoreIndex, "sourceFileMd5:" + openSourceFileMd5, "sourceContent");
+        if (openSourceContent == null) {
+            log.error("根据开源文件MD5:{}未找到对应的开源文件源码", openSourceFileMd5);
+        }
+        return openSourceContent.getFieldValue("sourceContent").toString();
+    }

     /**
      * 查询 versionTree
     *
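The new getOpenFileContentByMd5 method above routes to a per-prefix source core by taking the first character of the file MD5 and appending a fixed suffix. A tiny sketch of that routing rule follows; the suffix constant is an assumed stand-in, since SolrDBConst itself is not part of this diff.

public class SourceCoreRoutingSketch {

    // Assumed stand-in for SolrDBConst.CORE_NAME_SUFFIX_SOURCE_FILE_INFO (the real constant is not shown in this diff)
    static final String CORE_NAME_SUFFIX_SOURCE_FILE_INFO = "_sourceFileInfo";

    // Under this scheme, the source content of a file whose MD5 starts with "9" lives in core "9_sourceFileInfo"
    static String sourceCoreFor(String sourceFileMd5) {
        return sourceFileMd5.substring(0, 1) + CORE_NAME_SUFFIX_SOURCE_FILE_INFO;
    }

    public static void main(String[] args) {
        System.out.println(sourceCoreFor("9f8e7d6c5b4a39281716051403f2e1d0")); // 9_sourceFileInfo
    }
}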
