diff --git a/src/main/java/com/keyware/composeanalysis/ComposeAnalyzeApplication.java b/src/main/java/com/keyware/composeanalysis/ComposeAnalysisApplication.java similarity index 84% rename from src/main/java/com/keyware/composeanalysis/ComposeAnalyzeApplication.java rename to src/main/java/com/keyware/composeanalysis/ComposeAnalysisApplication.java index ad3071a..9c24d65 100644 --- a/src/main/java/com/keyware/composeanalysis/ComposeAnalyzeApplication.java +++ b/src/main/java/com/keyware/composeanalysis/ComposeAnalysisApplication.java @@ -12,10 +12,10 @@ import org.springframework.scheduling.annotation.EnableAsync; @EnableDiscoveryClient @RefreshScope @EnableAsync -public class ComposeAnalyzeApplication { +public class ComposeAnalysisApplication { public static void main(String[] args) { - SpringApplication.run(ComposeAnalyzeApplication.class, args); + SpringApplication.run(ComposeAnalysisApplication.class, args); } } diff --git a/src/main/java/com/keyware/composeanalysis/config/thread/TaskExecutePool.java b/src/main/java/com/keyware/composeanalysis/config/thread/TaskExecutePool.java index 02a716f..f998c5a 100644 --- a/src/main/java/com/keyware/composeanalysis/config/thread/TaskExecutePool.java +++ b/src/main/java/com/keyware/composeanalysis/config/thread/TaskExecutePool.java @@ -15,7 +15,7 @@ public class TaskExecutePool { /** * 核心线程数 */ - private int coreThreadsSize = 10; + private int coreThreadsSize = 20; /** * 最大线程数 @@ -45,7 +45,7 @@ public class TaskExecutePool { //活跃时间 executor.setKeepAliveSeconds(keepAliveSeconds); //线程名字前缀 - executor.setThreadNamePrefix("ComposeAnalysisExecutePool-"); + executor.setThreadNamePrefix("analysis-exec-"); // setRejectedExecutionHandler:当pool已经达到max size的时候,如何处理新任务 // CallerRunsPolicy:不在新线程中执行任务,而是由调用者所在的线程来执行 diff --git a/src/main/java/com/keyware/composeanalysis/constant/enums/AnalysisStatusEnum.java b/src/main/java/com/keyware/composeanalysis/constant/enums/AnalysisStatusEnum.java index bf87e3c..33e475a 100644 --- a/src/main/java/com/keyware/composeanalysis/constant/enums/AnalysisStatusEnum.java +++ b/src/main/java/com/keyware/composeanalysis/constant/enums/AnalysisStatusEnum.java @@ -1,48 +1,48 @@ -package com.keyware.composeanalysis.constant.enums; - - -/** - * 分析状态枚举类 - */ - -public enum AnalysisStatusEnum { - - //0:未分析 1:正在分析 2:分析完成 3:暂停分析 4:等待 5:开始分析 6:终止分析 7:分析失败 - - UN_ANALYSIS("未分析",0), - - ANALYSISING("正在分析",1), - - ANALYSIS_DONE("分析完成",2), - - PAUSE_ANALYSIS("暂停分析",3), - - WAIT_ANALYSIS("等待分析",4), - - START_ANALYSIS("开始分析",5), - - STOP_ANALYSIS("终止分析",6), - - FAIL_ANALYSIS("分析失败",7); - - // 状态 - private String status; - //状态码 - private Integer code; - - AnalysisStatusEnum(String status, Integer code) { - this.status = status; - this.code = code; - } - - public String getStatus() { - return status; - } - - public Integer getCode() { - return code; - } - - - -} \ No newline at end of file +//package com.keyware.composeanalysis.constant.enums; +// +// +///** +// * 分析状态枚举类 +// */ +// +//public enum AnalysisStatusEnum { +// +// //0:未分析 1:正在分析 2:分析完成 3:暂停分析 4:等待 5:开始分析 6:终止分析 7:分析失败 +// +// UN_ANALYSIS("未分析",0), +// +// ANALYSISING("正在分析",1), +// +// ANALYSIS_DONE("分析完成",2), +// +// PAUSE_ANALYSIS("暂停分析",3), +// +// WAIT_ANALYSIS("等待分析",4), +// +// START_ANALYSIS("开始分析",5), +// +// STOP_ANALYSIS("终止分析",6), +// +// FAIL_ANALYSIS("分析失败",7); +// +// // 状态 +// private String status; +// //状态码 +// private Integer code; +// +// AnalysisStatusEnum(String status, Integer code) { +// this.status = status; +// this.code = code; +// } +// +// public String getStatus() { +// return status; +// } +// +// public Integer getCode() { +// return code; +// } +// +// +// +//} \ No newline at end of file diff --git a/src/main/java/com/keyware/composeanalysis/controller/ComposeAnalysisController.java b/src/main/java/com/keyware/composeanalysis/controller/ComposeAnalysisController.java index 392e99d..fb984b6 100644 --- a/src/main/java/com/keyware/composeanalysis/controller/ComposeAnalysisController.java +++ b/src/main/java/com/keyware/composeanalysis/controller/ComposeAnalysisController.java @@ -1,8 +1,8 @@ package com.keyware.composeanalysis.controller; import com.keyware.common.constant.RedisConst; +import com.keyware.common.constant.enums.AnalysisStatusEnum; import com.keyware.composeanalysis.api.ComposeAnalysisApi; -import com.keyware.composeanalysis.constant.enums.AnalysisStatusEnum; import com.keyware.composeanalysis.entity.AnalysisTask; import com.keyware.composeanalysis.response.AnalysisResp; import com.keyware.composeanalysis.service.AnalysisTaskService; diff --git a/src/main/java/com/keyware/composeanalysis/service/impl/AnalysisTaskServiceImpl.java b/src/main/java/com/keyware/composeanalysis/service/impl/AnalysisTaskServiceImpl.java index 6528706..9fde7bb 100644 --- a/src/main/java/com/keyware/composeanalysis/service/impl/AnalysisTaskServiceImpl.java +++ b/src/main/java/com/keyware/composeanalysis/service/impl/AnalysisTaskServiceImpl.java @@ -96,7 +96,7 @@ public class AnalysisTaskServiceImpl extends ServiceImpl traitLineMd5Arr = functionAndCodeBlockInfos.stream().map(LineModel::getTraitLineMd5).collect(Collectors.toSet()); Set cuttLineMd5Arr = functionAndCodeBlockInfos.stream().map(LineModel::getCutLineMd5).collect(Collectors.toSet()); Set queryMd5Arr = Stream.concat(traitLineMd5Arr.stream(), cuttLineMd5Arr.stream()).collect(Collectors.toSet()); + if (CollUtil.isEmpty(queryMd5Arr)) { + log.error("特征为空,无法查询:{}", analysisFile.getName()); + return new SolrDocumentList(); + } String queryStr = "line_hay:(" + StringUtils.join(queryMd5Arr, " OR ") + ")"; log.info("查询条件: solrCoreName:{},queryStr:{}", solrCoreName, queryStr); SolrDocumentList result = solrUtils.query(solrCoreName, queryStr, "sourceMd5,line_hay"); - log.info("查询结果: result:{}", result); +// log.info("查询结果: result:{}", result); return result; } @@ -155,18 +161,18 @@ public class CodeBlockAnalysisTask extends IAnalysisTask { /** * 计算开源率 被测件的开源率 * - * @param matcheOpenSourceFiles 匹配的开源文件信息 - * @param fileAnalysisRes 被测件的解析结果 + * @param matcheOpenSourceFiles 匹配的开源文件信息 + * @param sourceFileBaseCoreName 查询版开源文件版本ID的 solr库名称 + * @param fileAnalysisRes 被测件的解析结果 */ - private void doAnalysis(SolrDocumentList matcheOpenSourceFiles, CodeFile fileAnalysisRes) { + private void doAnalysis(SolrDocumentList matcheOpenSourceFiles, String sourceFileBaseCoreName, CodeFile fileAnalysisRes) { if (CollectionUtil.isEmpty(matcheOpenSourceFiles)) { + //因为代码块的特征库较少,这里补充一个对比逻辑,如果当前文件解析失败,或者没有通过代码块匹配到数据,则直接通过文件的md5 再次查询一次solr库 + checkByOriginalFileMd5(sourceFileBaseCoreName, analysisFile.getMd5()); return; } - //根据文件后缀判断需要查询的文件版本库名称 - String sourceFileBaseCoreName = FixedValue.SUFFIX_SOLR_VERSION.get(analysisFile.getSuffix()); - //保存所有匹配的特征代码块MD5信息,方便统计总的匹配行数 Set matchingTraitLineSet = new HashSet<>(); @@ -178,14 +184,14 @@ public class CodeBlockAnalysisTask extends IAnalysisTask { List matchOpenFilesRes = calculateSimilarityAndOpenRate(matcheOpenSourceFiles, fileAnalysisRes, sourceFileBaseCoreName, matchedLineRowsNum, matchingTraitLineSet); //计算文件的总体的特征相似度 - Map traitMd5Map = fileAnalysisRes.getLine_hay().stream().collect(Collectors.toMap(LineModel::getTraitLineMd5, java.util.function.Function.identity())); + Map traitsFeatureMd5AndFeatureLineNumMap = getTraitsFeatureMd5AndFeatureLineNumMap(fileAnalysisRes.getLine_hay()); int matchCodeBlockLineCount = 0; - for (String matchFeatureFunctionMd5 : matchingTraitLineSet) { - LineModel lineModel = traitMd5Map.get(matchFeatureFunctionMd5); - matchCodeBlockLineCount += (Integer.valueOf(lineModel.getEndLine()) - Integer.valueOf(lineModel.getStartLine())); + for (String matchFeatureMd5 : matchingTraitLineSet) { + matchCodeBlockLineCount += traitsFeatureMd5AndFeatureLineNumMap.get(matchFeatureMd5); } + //计算文件的总体特征相似度 BigDecimal featureSimilarity = new BigDecimal(matchCodeBlockLineCount).divide(new BigDecimal(analysisFile.getCodeRowNum()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2); //计算文件的总体开源率 @@ -235,7 +241,8 @@ public class CodeBlockAnalysisTask extends IAnalysisTask { List versionTrees = solrUtils.queryBatchVersionInfoByVersionIds(openSourceFileVersionIds); Map versionIdVersionInfoMap = versionTrees.stream().collect(Collectors.toMap(VersionTree::getVersionId, java.util.function.Function.identity())); - + //按照特征行进行分组,一次匹配中,将所有的特征行进行累加 + Map traitsFeatureMd5AndFeatureLineNumMap = getTraitsFeatureMd5AndFeatureLineNumMap(fileAnalysisRes.getLine_hay()); for (SolrDocument matchFile : matchOpenFiles) { //开源文件md5 @@ -247,25 +254,32 @@ public class CodeBlockAnalysisTask extends IAnalysisTask { //匹配的总特征行数 int currentFileMatchFeatureLineCount = 0; + //当前文件所匹配的特征函数MD5 + Set currentFileMatchFeatureMd5 = new HashSet(); + //遍历当前文件的代码块特征,统计匹配的总行数 - for (LineModel lineModel : fileAnalysisRes.getLine_hay()) { - String traitLineMd5 = lineModel.getTraitLineMd5(); + for (String traitLineMd5 : traitsFeatureMd5AndFeatureLineNumMap.keySet()) { //村换匹配到的文件的行信息 for (LineModel matchLine : openFileCodeBlockFeatureList) { if (traitLineMd5.equals(matchLine.getTraitLineMd5())) { - //计算匹配的特征行数 - currentFileMatchFeatureLineCount += (Integer.valueOf(matchLine.getEndLine()) - Integer.valueOf(matchLine.getStartLine()) + 1); - matchFeatureCodeBlockMd5s.add(traitLineMd5); + if (!currentFileMatchFeatureMd5.contains(traitLineMd5)) { + currentFileMatchFeatureMd5.add(traitLineMd5); + matchFeatureCodeBlockMd5s.add(traitLineMd5); + currentFileMatchFeatureLineCount += traitsFeatureMd5AndFeatureLineNumMap.get(traitLineMd5); + } } } } - //根据源文件的MD5确定需要查询源码库的序号 String openSourceCodeCoreIndex = openSourceFileMd5.substring(0, 1) + SolrDBConst.CORE_NAME_SUFFIX_SOURCE_FILE_INFO; //获取开源文件的文本信息 SolrDocument openSourceContent = solrUtils.queryOne(openSourceCodeCoreIndex, "sourceFileMd5:" + openSourceFileMd5, "sourceContent"); + if (openSourceContent == null) { + log.error("根据开源文件MD5查询源码失败,sourceFileMd5:{}", openSourceFileMd5); + continue; + } //当前文件的开源率 Pair> openRateAndSaveRowNum = getOpenRateAndSaveRowNum(fileAnalysisRes.getSourceFileContent(), openSourceContent.getFieldValue("sourceContent").toString()); @@ -274,10 +288,14 @@ public class CodeBlockAnalysisTask extends IAnalysisTask { matchLineRowsNum.addAll(openRateAndSaveRowNum.getValue()); //统计当前文件的特征相似度 - BigDecimal featureSimilarity = new BigDecimal(currentFileMatchFeatureLineCount).divide(fileAnalysisRes.getCodeRowNum(), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2); + BigDecimal featureSimilarity = new BigDecimal(currentFileMatchFeatureLineCount).divide(new BigDecimal(analysisFile.getCodeRowNum()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2); SolrDocument openEntries = md5VersionInfoMap.get(openSourceFileMd5); VersionTree versionInfo = versionIdVersionInfoMap.get(openEntries.get("versionId")); + if (versionInfo == null) { + log.error("根据开源文件版本ID查询版本信息失败,versionId:{}", openEntries.get("versionId")); + continue; + } //组装当前开源文件的开源项目信息 MatchOpenFile matchOpenFileInfo = new MatchOpenFile(); @@ -295,6 +313,45 @@ public class CodeBlockAnalysisTask extends IAnalysisTask { } + /** + * 防止代码块特征库不全,再次根据文件MD5查询开源文件信息, 做二次校验 + * + * @param originalFileMd5 + * @param versionIdCoreName + */ + private void checkByOriginalFileMd5(String versionIdCoreName, String originalFileMd5) { + + //根据文件的MD5,查询特征库,看当前文件是否在开源代码库中 + SolrDocument versionIdAndPath = solrUtils.queryOne(versionIdCoreName, "sourceFileMd5:" + originalFileMd5, "versionId,fullPath,sourceFileMd5"); + + if (versionIdAndPath != null) { + //根据版本ID查询版本的详细信息 + VersionTree versionInfo = solrUtils.queryVersionInfoByVersionId((String) versionIdAndPath.get("versionId")); + if (versionInfo != null) { + //当前开源文件的开源项目信息 + MatchOpenFile matchOpenFileInfo = new MatchOpenFile(); + matchOpenFileInfo.setPId(versionInfo.getProId()) + .setPName(versionInfo.getProName()) + .setSourceUrl(versionInfo.getDownUrl()) + .setFeatureSimilarity(100.00f) + .setOpenRate(100.00f) + .setAnalyzeType(AnalysisLevelEnum.FILE_LEVEL.getCode()); + + //保存当前文件的开源信息到mongo库中 + MatchOpenFileMongoDto matchOpenFileMongo = new MatchOpenFileMongoDto(); + matchOpenFileMongo.setId(IdGenerator.uuid32()) + .setFilePath(analysisFile.getFileUrl()) + .setFileName(analysisFile.getName()) + .setOpenRate(100.00f) + .setOpenType(analysisFile.getOpenType()) + .setMatchOpenFile(Arrays.asList(matchOpenFileInfo)); + + mongoTemplate.save(matchOpenFileMongo); + } + } + } + + /** * 获取当前文件的代码块特征值 * @@ -310,13 +367,29 @@ public class CodeBlockAnalysisTask extends IAnalysisTask { return JSONArray.parseArray(lineFeatureMd5s, LineModel.class); } + /** + * 或者特征代码块的md5 和 当前md5包含的特征行数 + * + * @param codeBlockInfos + * @return + */ + private Map getTraitsFeatureMd5AndFeatureLineNumMap(List codeBlockInfos) { + Map> traitMd5GroupMap = codeBlockInfos.stream().collect(Collectors.groupingBy(LineModel::getTraitLineMd5)); + Map resultMap = new HashMap<>(); + for (String traitMd5 : traitMd5GroupMap.keySet()) { + List lineModels = traitMd5GroupMap.get(traitMd5); + int traitsLineNum = lineModels.stream().mapToInt(lineModel -> (Integer.valueOf(lineModel.getEndLine()) - Integer.valueOf(lineModel.getStartLine()) + 1)).sum(); + resultMap.put(traitMd5, traitsLineNum); + } + return resultMap; + } /** * 将特征值插入到mongo库中 * * @param features 特征集合 * @param lineDataMongoDto 当前分析任务 ,特征信息存储 - * todo 后期 看看有没有插入的必要 + * todo 后期 看看有没有插入的必要 * @param */ @Deprecated diff --git a/src/main/java/com/keyware/composeanalysis/task/FileAnalysisTask.java b/src/main/java/com/keyware/composeanalysis/task/FileAnalysisTask.java index 3a0aa4c..05b0280 100644 --- a/src/main/java/com/keyware/composeanalysis/task/FileAnalysisTask.java +++ b/src/main/java/com/keyware/composeanalysis/task/FileAnalysisTask.java @@ -1,10 +1,10 @@ package com.keyware.composeanalysis.task; +import com.keyware.common.constant.enums.AnalysisStatusEnum; import com.keyware.composeanalysis.constant.FixedValue; import com.keyware.composeanalysis.constant.RedisConst; import com.keyware.composeanalysis.constant.SolrDBConst; import com.keyware.composeanalysis.constant.enums.AnalysisLevelEnum; -import com.keyware.composeanalysis.constant.enums.AnalysisStatusEnum; import com.keyware.composeanalysis.constant.enums.FileAnalysisStatusEnum; import com.keyware.composeanalysis.entity.AnalysisTask; import com.keyware.composeanalysis.mongo.FileDataMongoDto; diff --git a/src/main/java/com/keyware/composeanalysis/task/FunctionAnalysisTask.java b/src/main/java/com/keyware/composeanalysis/task/FunctionAnalysisTask.java index 100c1d8..385a2ac 100644 --- a/src/main/java/com/keyware/composeanalysis/task/FunctionAnalysisTask.java +++ b/src/main/java/com/keyware/composeanalysis/task/FunctionAnalysisTask.java @@ -1,15 +1,15 @@ package com.keyware.composeanalysis.task; +import cn.hutool.core.collection.CollUtil; import cn.hutool.core.collection.CollectionUtil; import cn.hutool.core.lang.Pair; -import cn.hutool.core.util.ObjUtil; import com.alibaba.fastjson.JSONArray; +import com.keyware.common.constant.enums.AnalysisStatusEnum; import com.keyware.composeanalysis.constant.FixedValue; import com.keyware.composeanalysis.constant.RedisConst; import com.keyware.composeanalysis.constant.SolrDBConst; import com.keyware.composeanalysis.constant.enums.AnalysisLevelEnum; -import com.keyware.composeanalysis.constant.enums.AnalysisStatusEnum; import com.keyware.composeanalysis.constant.enums.FileAnalysisStatusEnum; import com.keyware.composeanalysis.entity.AnalysisTask; import com.keyware.composeanalysis.mongo.FileDataMongoDto; @@ -102,34 +102,15 @@ public class FunctionAnalysisTask extends IAnalysisTask { //根据文件的名称获取函数解析器 Analysis analysis = AnalysisFactory.getAnalysis(filePath); + //解析文件 - if (!ObjUtil.hasEmpty(featureCoreName, sourceFileBaseCoreName, analysis)) { - CodeFile codeFile = analysis.analysisFile(new FileInputStream(filePath)); - if (codeFile != null) { - List functionList = codeFile.getFunctionList(); - if (CollectionUtil.isNotEmpty(functionList)) { - //获取函数的特征MD5,cutMD5 - List featureFunctionMd5List = functionList.stream().map(Function::getMd5).collect(Collectors.toList()); - List cutFunctionMd5List = functionList.stream().map(Function::getSourceMd5).collect(Collectors.toList()); - Set queryMd5List = Stream.concat(featureFunctionMd5List.stream(), cutFunctionMd5List.stream()).collect(Collectors.toSet()); - String queryStr = "fun_hay:(" + StringUtils.join(queryMd5List, " OR ") + ")"; -// log.info("检索函数特征,coreName:{} ,queryStr:{}", featureCoreName, queryStr); - SolrDocumentList matchOpenFiles = solrUtils.query(featureCoreName, queryStr, "sourceMd5,fun_hay"); -// log.info("resp", sourceMd5); - //如果函数级特征匹配,能够匹配到开源文件信息,则根据开源文件的md5或者开源文件信息,做相似度对比 - if (matchOpenFiles != null) { - //对匹配到的文件进行分析 - doAnalysis(matchOpenFiles, sourceFileBaseCoreName, codeFile); - } else { - //因为函数的特征库较少,这里补充一个对比逻辑,如果当前文件解析失败,或者没有通过函数匹配到数据,则直接通过文件的md5 再次查询一次solr库 - checkByOriginalFileMd5(sourceFileBaseCoreName, analysisFile.getMd5()); - } - } - } - } else { - //因为函数的特征库较少,这里补充一个对比逻辑,如果当前文件解析失败,或者没有通过函数匹配到数据,则直接通过文件的md5 再次查询一次solr库 - checkByOriginalFileMd5(sourceFileBaseCoreName, analysisFile.getMd5()); - } + CodeFile codeFile = analysis.analysisFile(new FileInputStream(filePath)); + + //根据函数特征去匹配到开源文件 + SolrDocumentList matchOpenFiles = getFeatureSimilarityFromSolr(featureCoreName, codeFile.getFunctionList()); + + //计算开源率 + doAnalysis(matchOpenFiles, sourceFileBaseCoreName, codeFile); //更新文件表的分析状态为3 函数级特征以分析完毕 analysisFile.setFileAnalysisStatus(FileAnalysisStatusEnum.ANALYSIS_DONE.getCode()); @@ -160,9 +141,14 @@ public class FunctionAnalysisTask extends IAnalysisTask { * @param matchOpenFiles 通过特征匹配到的开源文件的md5 * @param sourceFileBaseCoreName 查询版开源文件版本ID的 solr库名称 * @param fileAnalysisRes 被测件的函数解析结果 - * @throws Exception */ - private void doAnalysis(SolrDocumentList matchOpenFiles, String sourceFileBaseCoreName, CodeFile fileAnalysisRes) throws Exception { + private void doAnalysis(SolrDocumentList matchOpenFiles, String sourceFileBaseCoreName, CodeFile fileAnalysisRes) { + + if (CollectionUtil.isEmpty(matchOpenFiles)) { + //因为函数的特征库较少,这里补充一个对比逻辑,如果当前文件解析失败,或者没有通过函数匹配到数据,则直接通过文件的md5 再次查询一次solr库 + checkByOriginalFileMd5(sourceFileBaseCoreName, analysisFile.getMd5()); + return; + } //按照函数的特征md5进行分组,getter ,setter等方法的 特征值会重复 Map> featureMd5FunctionMap = fileAnalysisRes.getFunctionList().stream().collect(Collectors.groupingBy(Function::getMd5)); @@ -219,7 +205,7 @@ public class FunctionAnalysisTask extends IAnalysisTask { * @param sourceFileBaseCoreName 当前文件特征文件的 solr coreName * @param matchLineRowsNum 所有开源文件匹配到的开源行号列表 * @param matchFeatureFunctionMd5s 所有开源文件匹配到的特征函数MD5 - * return 匹配的开源文件解析后的结果集 + * return 匹配的开源文件解析后的结果集 */ private List calculateSimilarityAndOpenRate(SolrDocumentList matchOpenFiles, CodeFile fileAnalysisRes, String sourceFileBaseCoreName, Set matchLineRowsNum, Set matchFeatureFunctionMd5s) { @@ -239,7 +225,6 @@ public class FunctionAnalysisTask extends IAnalysisTask { List versionTrees = solrUtils.queryBatchVersionInfoByVersionIds(openSourceFileVersionIds); Map versionIdVersionInfoMap = versionTrees.stream().collect(Collectors.toMap(VersionTree::getVersionId, java.util.function.Function.identity())); - //函数总行数 BigDecimal totalFunctionLineCount = new BigDecimal(fileAnalysisRes.getFunctionList().stream().mapToInt(Function::getCodeRowNum).sum()); @@ -302,7 +287,7 @@ public class FunctionAnalysisTask extends IAnalysisTask { .setAnalyzeType(AnalysisLevelEnum.FUNCTION_LEVEL.getCode()); matchOpenFilesRes.add(matchOpenFileInfo); } - return matchOpenFilesRes; + return matchOpenFilesRes; } @@ -328,7 +313,7 @@ public class FunctionAnalysisTask extends IAnalysisTask { .setSourceUrl(versionInfo.getDownUrl()) .setFeatureSimilarity(100.00f) .setOpenRate(100.00f) - .setAnalyzeType(AnalysisLevelEnum.FUNCTION_LEVEL.getCode()); + .setAnalyzeType(AnalysisLevelEnum.FILE_LEVEL.getCode()); //保存当前文件的开源信息到mongo库中 MatchOpenFileMongoDto matchOpenFileMongo = new MatchOpenFileMongoDto(); @@ -345,6 +330,28 @@ public class FunctionAnalysisTask extends IAnalysisTask { } + /** + * 根据 特征值 从特征库中检索 具有特征相似的 + * + * @param featureCoreName 检索的solr 库名称 + * @param functionList + * @return + */ + private SolrDocumentList getFeatureSimilarityFromSolr(String featureCoreName, List functionList) { + //获取方法快的特征MD5,cutMD5 + Set featureFunctionMd5List = functionList.stream().map(Function::getMd5).collect(Collectors.toSet()); + Set cutFunctionMd5List = functionList.stream().map(Function::getSourceMd5).collect(Collectors.toSet()); + Set queryMd5List = Stream.concat(featureFunctionMd5List.stream(), cutFunctionMd5List.stream()).collect(Collectors.toSet()); + if (CollUtil.isEmpty(queryMd5List)) { + log.error("特征为空,无法查询:{}", analysisFile.getName()); + return new SolrDocumentList(); + } + String queryStr = "fun_hay:(" + StringUtils.join(queryMd5List, " OR ") + ")"; +// log.info("检索函数特征,coreName:{} ,queryStr:{}", featureCoreName, queryStr); + return solrUtils.query(featureCoreName, queryStr, "sourceMd5,fun_hay"); + } + + /** * 获取当前文件的函数特征值 * @@ -359,8 +366,8 @@ public class FunctionAnalysisTask extends IAnalysisTask { .replace("\"{", "{") .replace("}\"", "}"); return JSONArray.parseArray(lineFeatureMd5s, Function.class); - }catch (Exception e){ - log.error("解析文件特征值失败",e); + } catch (Exception e) { + log.error("解析文件特征值失败", e); } return new ArrayList(); } diff --git a/src/main/java/com/keyware/composeanalysis/task/LineAnalysisTask.java b/src/main/java/com/keyware/composeanalysis/task/LineAnalysisTask.java index ac1bb75..de05217 100644 --- a/src/main/java/com/keyware/composeanalysis/task/LineAnalysisTask.java +++ b/src/main/java/com/keyware/composeanalysis/task/LineAnalysisTask.java @@ -2,12 +2,14 @@ package com.keyware.composeanalysis.task; import cn.hutool.core.collection.CollectionUtil; +import cn.hutool.core.lang.Pair; +import cn.hutool.core.util.StrUtil; +import com.keyware.common.constant.enums.AnalysisStatusEnum; import com.keyware.composeanalysis.constant.FixedValue; import com.keyware.composeanalysis.constant.FunctionAndAnalysisAssemblyConst; import com.keyware.composeanalysis.constant.RedisConst; import com.keyware.composeanalysis.constant.SolrDBConst; import com.keyware.composeanalysis.constant.enums.AnalysisLevelEnum; -import com.keyware.composeanalysis.constant.enums.AnalysisStatusEnum; import com.keyware.composeanalysis.constant.enums.FileAnalysisStatusEnum; import com.keyware.composeanalysis.entity.AnalysisTask; import com.keyware.composeanalysis.mongo.FileDataMongoDto; @@ -15,10 +17,7 @@ import com.keyware.composeanalysis.mongo.LineDataMongoDto; import com.keyware.composeanalysis.mongo.MatchOpenFile; import com.keyware.composeanalysis.mongo.MatchOpenFileMongoDto; import com.keyware.composeanalysis.solr.VersionTree; -import com.keyware.composeanalysis.util.AnalysisLogUtil; -import com.keyware.composeanalysis.util.RedisUtil; -import com.keyware.composeanalysis.util.SolrUtils; -import com.keyware.composeanalysis.util.SpringContextUtils; +import com.keyware.composeanalysis.util.*; import com.keyware.keyswan.anaysis.Analysis; import com.keyware.keyswan.anaysis.AnalysisFactory; import com.keyware.keyswan.common.CodeFile; @@ -34,6 +33,7 @@ import java.math.BigDecimal; import java.math.RoundingMode; import java.util.*; import java.util.concurrent.CountDownLatch; +import java.util.stream.Collectors; import static org.springframework.data.mongodb.core.query.Criteria.where; @@ -84,37 +84,23 @@ public class LineAnalysisTask extends IAnalysisTask { } //获取文件地址 - String filePath = analysisFile.getFileUrl(); + String filePath = analysisFile.getFileUrl(); //获取文件名称 String fileName = analysisFile.getName(); AnalysisLogUtil.insert(mongoTemplate, "【行级特征提取】正在提取" + fileName); try { - LineDataMongoDto lineDataMongoDto = new LineDataMongoDto(); - lineDataMongoDto.setFileId(analysisFile.getId()) - .setStatus(0) - .setIsSelect(false); + Analysis analysis = AnalysisFactory.getAnalysis(filePath); - CodeFile codeFile = null; //获取文件行级特征md5 - codeFile = analysis.analysisFile(filePath, FunctionAndAnalysisAssemblyConst.LINE_EXTRACT, FunctionAndAnalysisAssemblyConst.LINE_EXTRACT); - //每一行原内容MD5值集合 -// String cutFileLineMd5 = codeFile.getCutFileLineMd5(); - //每一行特征内容MD5值集合 - String traitFileLineMd5 = codeFile.getTraitFileLineMd5(); - - String[] featureMd5Arr = {}; - if (StringUtils.isNotBlank(traitFileLineMd5)) { - featureMd5Arr = traitFileLineMd5.split(","); - } - List lineFeatures = Arrays.asList(featureMd5Arr); + CodeFile codeFile = analysis.analysisFile(filePath, FunctionAndAnalysisAssemblyConst.LINE_EXTRACT, FunctionAndAnalysisAssemblyConst.LINE_EXTRACT); //从solr中获取特征相似的 文件 - SolrDocumentList featureSimilarityFromSolr = getFeatureSimilarityFromSolr(lineFeatures); + SolrDocumentList featureSimilarityFromSolr = getFeatureSimilarityFromSolr(codeFile); //计算文件的开源率 - calculateOpenRate(featureSimilarityFromSolr, lineFeatures); + doAnalysis(featureSimilarityFromSolr, codeFile); //更新文件表的分析状态为3 行级特征以分析完毕 analysisFile.setFileAnalysisStatus(FileAnalysisStatusEnum.ANALYSIS_DONE.getCode()); @@ -123,11 +109,12 @@ public class LineAnalysisTask extends IAnalysisTask { .replaceWith(analysisFile) .findAndReplace(); - AnalysisLogUtil.insert(mongoTemplate, "【行级特征提取】提取完成" + fileName); + //插入日志 + AnalysisLogUtil.insert(mongoTemplate, "【行级分析】完成" + fileName); log.info("文件" + fileName + ":行级分析完成"); } catch (Exception e) { - AnalysisLogUtil.insertErrorInfo(mongoTemplate, "【行级特征提取】提取失败" + fileName, e); - log.error("文件:" + fileName + "行级别特征提取失败!", e); + AnalysisLogUtil.insertErrorInfo(mongoTemplate, "【行级分析】失败" + fileName, e); + log.error("文件:{}行级别分析失败!", fileName,e); //修改当前文件分析状态未失败 mongoTemplate.update(FileDataMongoDto.class) .matching(where("_id").is(analysisFile.getId())) @@ -143,100 +130,193 @@ public class LineAnalysisTask extends IAnalysisTask { * 计算开源率 被测件的开源率 * * @param matcheOpenSourceFiles - * @param lineFeatures + * @param codeFile 文件解析结果 */ - private void calculateOpenRate(SolrDocumentList matcheOpenSourceFiles, List lineFeatures) { + private void doAnalysis(SolrDocumentList matcheOpenSourceFiles, CodeFile codeFile) { + + //根据文件后缀判断需要查询的文件版本库名称 + String versionIdCoreName = FixedValue.SUFFIX_SOLR_VERSION.get(analysisFile.getSuffix()); if (CollectionUtil.isEmpty(matcheOpenSourceFiles)) { + //因为行的特征库较少,这里补充一个对比逻辑,如果当前文件解析失败,或者没有通过代码块匹配到数据,则直接通过文件的md5 再次查询一次solr库 + checkByOriginalFileMd5(versionIdCoreName, analysisFile.getMd5()); return; } - //根据文件后缀判断需要查询的文件版本库名称 - String versionIdCoreName = FixedValue.SUFFIX_SOLR_VERSION.get(analysisFile.getSuffix()); + //保存所有匹配的行数信息,方便统计总的匹配行数 + Set matchedFeatureMd5 = new HashSet<>(); + + //保存所有匹配的行数信息,方便统计总的匹配行数 + Set matchLineRowsNum = new HashSet<>(); + + //获取文件总特征行数 + String traitFileLineMd5 = codeFile.getTraitFileLineMd5(); + List lineFeatureList = Arrays.asList(traitFileLineMd5.split(",")); + + //统计每个文件的开源率 + List matchOpenFilesRes = calculateSimilarityAndOpenRate(matcheOpenSourceFiles, codeFile, versionIdCoreName, matchLineRowsNum, matchedFeatureMd5); + + //计算文件的总体特征相似度 + BigDecimal featureSimilarity = new BigDecimal(matchedFeatureMd5.size()).divide(new BigDecimal(lineFeatureList.size()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2); + + //计算文件的总体开源率 + BigDecimal openRate = new BigDecimal(matchLineRowsNum.size()).divide(new BigDecimal(analysisFile.getCodeRowNum()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2); + + + //获取开源率的阈值 + Integer openRateThreshold = analysisTask.getOpenRateThreshold(); + + //如果开源率大于阈值,则将当前文件设置成开源 + if (openRate.compareTo(new BigDecimal(openRateThreshold)) >= 0) { + analysisFile.setOpenType(true); + } - //定义结果集对象 + //保存当前文件的开源信息到mongo库中 MatchOpenFileMongoDto matchOpenFileMongo = new MatchOpenFileMongoDto(); matchOpenFileMongo.setId(IdGenerator.uuid32()) .setFilePath(analysisFile.getFileUrl()) - .setFileName(analysisFile.getName()); + .setFileName(analysisFile.getName()) + .setFeatureSimilarity(featureSimilarity.floatValue()) + .setOpenRate(openRate.floatValue()) + .setOpenType(analysisFile.getOpenType()) + .setMatchOpenFile(matchOpenFilesRes); + log.info("文件" + analysisFile.getName() + ":开源率:" + openRate.floatValue() + ",特征相似度:" + featureSimilarity.floatValue()); + mongoTemplate.save(matchOpenFileMongo); + } - //开源文件信息保存结果集 - List matchOpenFileInfoList = new ArrayList<>(); - //保存所有匹配的行数信息,方便统计总的匹配行数 - Set matchingLineSet = new HashSet<>(); - - //获取文件总行数 - BigDecimal totalCodeRowNum = new BigDecimal(analysisFile.getCodeRowNum()); - - //统计每个开源文件和被测件的匹配行数 - for (SolrDocument matchFile : matcheOpenSourceFiles) { - //解析文件的代码块特征值 - String lineFeatureMd5s = (String) matchFile.get("tz_line_hay"); - List matchedLineFeatures = Arrays.asList(lineFeatureMd5s.split(",")); - - //匹配的总行数 - int currentFileMatchLineCount = 0; - - //遍历当前文件的代码块特征,统计匹配的总行数 - for (String originalLineFeatureMd5 : lineFeatures) { - for (String matchLineFeatureMd5 : matchedLineFeatures) { - if (originalLineFeatureMd5.equals(matchLineFeatureMd5)) { - currentFileMatchLineCount++; - matchingLineSet.add(originalLineFeatureMd5); + /** + * 计算当前文件的特征相似度 和 开源率 + * + * @param matchOpenFiles 通过MD5 匹配到的所有开源文件 + * @param sourceFileBaseCoreName 当前文件特征文件的 solr coreName + * @param matchLineRowsNum 所有开源文件匹配到的开源行号列表 + * @param matchFeatureLineMd5s 所有开源文件匹配到的特征行MD5 + * @return 匹配的开源文件解析后的结果集 + */ + private List calculateSimilarityAndOpenRate(SolrDocumentList matchOpenFiles, CodeFile fileAnalysisRes, String sourceFileBaseCoreName, Set matchLineRowsNum, Set matchFeatureLineMd5s) { + + //匹配的开源文件列表 + List matchOpenFilesRes = new ArrayList<>(); + + //首先根据文件的MD5查询开源文件的版本ID,和路径信息 + Set openSourceFileMd5s = matchOpenFiles.stream().map(doc -> (String) doc.get("sourceMd5")).collect(Collectors.toSet()); + Map md5VersionInfoMap = solrUtils.batchQueryVersionIdFromSourceFileBaseBySourceMd5(sourceFileBaseCoreName, openSourceFileMd5s); + + //根据版本ID查询版本的详细信息 + //todo 这里 查询一个版本的信息 需要检索 两个 solr 库 而且还需要检索 versioinTree 后面需要优化 + Set openSourceFileVersionIds = md5VersionInfoMap.values().stream().map(doc -> (String) doc.get("versionId")).collect(Collectors.toSet()); + List versionTrees = solrUtils.queryBatchVersionInfoByVersionIds(openSourceFileVersionIds); + Map versionIdVersionInfoMap = versionTrees.stream().collect(Collectors.toMap(VersionTree::getVersionId, java.util.function.Function.identity())); + + String traitFileLineMd5 = fileAnalysisRes.getTraitFileLineMd5(); + List lineFeatureList = Arrays.asList(traitFileLineMd5.split(",")); + + for (SolrDocument openSourceFile : matchOpenFiles) { + + //开源文件MD5 + String openSourceFileMd5 = openSourceFile.getFieldValue("sourceMd5").toString(); + + //解析文件的特征行 + String lineFeatureMd5s = (String) openSourceFile.get("tz_line_hay"); + + List openFileLineFeatures = Arrays.asList(lineFeatureMd5s.split(",")); + + //获取开源文件的文本信息 + String openSourceContent = solrUtils.getOpenFileContentByMd5(openSourceFileMd5); + + //当前文件匹配特征行总行数 + int currentFileMatchFeatureLineCount = 0; + + //遍历函数特征MD5 + for (String lineFeatureMd5 : lineFeatureList) { + //源文件的特征行列表 + for (String openFileLineFeature : openFileLineFeatures) { + if (lineFeatureMd5.equals(openFileLineFeature)) { + matchFeatureLineMd5s.add(lineFeatureMd5); + currentFileMatchFeatureLineCount++; } } } - //首先根据文件的MD5查询开源文件的版本ID,和路径信息 - SolrDocument versionIdAndPath = solrUtils.queryOne(versionIdCoreName, "sourceFileMd5:" + matchFile.get("sourceMd5"), "versionId,fullPath,sourceFileMd5"); + //当前文件的开源率 + Pair> openRateAndSaveRowNum = SimilarityUtil.getOpenRateAndSaveRowNum(fileAnalysisRes.getSourceFileContent(), openSourceContent); - //根据版本ID查询版本的详细信息 - //todo 这里 查询一个版本的信息 需要检索 两个 solr 库 而且还需要检索 versioinTree 后面需要优化 - VersionTree versionInfo = solrUtils.queryVersionInfoByVersionId((String) versionIdAndPath.get("versionId")); + //将当前文件匹配的行号,存储到缓存中,方便统计整体的开源率 + matchLineRowsNum.addAll(openRateAndSaveRowNum.getValue()); - //计算与当前开源文件的开源率 - BigDecimal openRate = new BigDecimal(currentFileMatchLineCount).divide(totalCodeRowNum, 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)); + //统计当前文件的特征相似度 + BigDecimal featureSimilarity = new BigDecimal(currentFileMatchFeatureLineCount).divide(new BigDecimal(lineFeatureList.size()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2); + + SolrDocument openEntries = md5VersionInfoMap.get(openSourceFileMd5); + VersionTree versionInfo = versionIdVersionInfoMap.get(openEntries.get("versionId")); + if (versionInfo == null) { + log.error("根据版本ID,未查询到相关的版本信息。versionId:{}", openEntries.get("versionId")); + continue; + } - //当前开源文件的开源项目信息 + //组装当前开源文件的开源项目信息 MatchOpenFile matchOpenFileInfo = new MatchOpenFile(); matchOpenFileInfo.setPId(versionInfo.getProId()) .setPName(versionInfo.getProName()) - .setSourceUrl(versionInfo.getDownUrl()) - .setOpenRate(openRate.floatValue()) + .setSourceUrl((String) openEntries.get("fullPath")) + .setFeatureSimilarity(featureSimilarity.floatValue()) + .setOpenRate(openRateAndSaveRowNum.getKey()) .setVersion(versionInfo.getVersionName()) .setLicenseType(versionInfo.getLicenseType()) - .setAnalyzeType(AnalysisLevelEnum.LINE_LEVEL.getCode()); - matchOpenFileInfoList.add(matchOpenFileInfo); + .setAnalyzeType(AnalysisLevelEnum.FUNCTION_LEVEL.getCode()); + matchOpenFilesRes.add(matchOpenFileInfo); } + return matchOpenFilesRes; + } - //统计当前文件的整体开源率 - BigDecimal openRate = new BigDecimal(matchingLineSet.size()).divide(totalCodeRowNum, 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)); - - //获取开源率的阈值 - Integer openRateThreshold = analysisTask.getOpenRateThreshold(); - //如果开源率大于阈值,则将当前文件设置成开源 - if (openRate.compareTo(new BigDecimal(openRateThreshold)) >= 0) { - analysisFile.setOpenType(true); - } + /** + * 防止代码块特征库不全,再次根据文件MD5查询开源文件信息, 做二次校验 + * + * @param originalFileMd5 + * @param versionIdCoreName + */ + private void checkByOriginalFileMd5(String versionIdCoreName, String originalFileMd5) { - //保存当前文件的开源信息 - matchOpenFileMongo.setOpenType(analysisFile.getOpenType()) - .setMatchOpenFile(matchOpenFileInfoList); - mongoTemplate.save(matchOpenFileMongo); + //根据文件的MD5,查询特征库,看当前文件是否在开源代码库中 + SolrDocument versionIdAndPath = solrUtils.queryOne(versionIdCoreName, "sourceFileMd5:" + originalFileMd5, "versionId,fullPath,sourceFileMd5"); + if (versionIdAndPath != null) { + //根据版本ID查询版本的详细信息 + VersionTree versionInfo = solrUtils.queryVersionInfoByVersionId((String) versionIdAndPath.get("versionId")); + if (versionInfo != null) { + //当前开源文件的开源项目信息 + MatchOpenFile matchOpenFileInfo = new MatchOpenFile(); + matchOpenFileInfo.setPId(versionInfo.getProId()) + .setPName(versionInfo.getProName()) + .setSourceUrl(versionInfo.getDownUrl()) + .setFeatureSimilarity(100.00f) + .setOpenRate(100.00f) + .setAnalyzeType(AnalysisLevelEnum.FILE_LEVEL.getCode()); + + //保存当前文件的开源信息到mongo库中 + MatchOpenFileMongoDto matchOpenFileMongo = new MatchOpenFileMongoDto(); + matchOpenFileMongo.setId(IdGenerator.uuid32()) + .setFilePath(analysisFile.getFileUrl()) + .setFileName(analysisFile.getName()) + .setOpenRate(100.00f) + .setOpenType(analysisFile.getOpenType()) + .setMatchOpenFile(Arrays.asList(matchOpenFileInfo)); + + mongoTemplate.save(matchOpenFileMongo); + } + } } /** * 将特征值插入到mongo库中 - * * @param features 特征集合 * @param lineDataMongoDto 当前分析任务 ,特征信息存储 - * todo 后期 看看有没有插入的必要 + * todo 后期 看看有没有插入的必要 * @param */ @Deprecated @@ -281,16 +361,31 @@ public class LineAnalysisTask extends IAnalysisTask { /** * 根据 特征值 从特征库中检索 具有特征相似的 * - * @param lineFeatureList 行特征信息 + * @param codeFile 行特征信息 * @return */ - private SolrDocumentList getFeatureSimilarityFromSolr(List lineFeatureList) { - String solrCoreName = SolrDBConst.CORE_NAME_SOURCE_FILE_INFO_TEMP; + private SolrDocumentList getFeatureSimilarityFromSolr(CodeFile codeFile) { + Set queryMd5Set = new HashSet<>(); + //每一行原内容MD5值集合 + String cutFileLineMd5 = codeFile.getCutFileLineMd5(); + if (StrUtil.isNotBlank(cutFileLineMd5)) { + List lineCutList = Arrays.asList(cutFileLineMd5.split(",")); + queryMd5Set.addAll(lineCutList); + } + //每一行特征内容MD5值集合 + String traitFileLineMd5 = codeFile.getTraitFileLineMd5(); + if (StrUtil.isNotBlank(traitFileLineMd5)) { + List lineFeatureList = Arrays.asList(traitFileLineMd5.split(",")); + queryMd5Set.addAll(lineFeatureList); + } + if (CollectionUtil.isEmpty(queryMd5Set)) { + log.error("特征相似度检索失败,特征为空:{}", analysisFile.getName()); + return new SolrDocumentList(); + } //拼接行特征查询条件 - String queryStr = "tz_line_hay:(" + StringUtils.join(lineFeatureList, " OR ") + ")"; - log.info("查询条件: solrCoreName:{},queryStr:{}", solrCoreName, queryStr); - SolrDocumentList result = solrUtils.query(solrCoreName, queryStr, "sourceMd5,tz_line_hay"); - log.info("查询结果: result:{}", result); + String queryStr = "tz_line_hay:(" + StringUtils.join(queryMd5Set, " OR ") + ")"; + log.info("查询条件: solrCoreName:{},queryStr:{}", SolrDBConst.CORE_NAME_SOURCE_FILE_INFO_TEMP, queryStr); + SolrDocumentList result = solrUtils.query(SolrDBConst.CORE_NAME_SOURCE_FILE_INFO_TEMP, queryStr, "sourceMd5,tz_line_hay"); return result; } diff --git a/src/main/java/com/keyware/composeanalysis/task/PorjectAnalysisTask.java b/src/main/java/com/keyware/composeanalysis/task/PorjectAnalysisTask.java index 4692050..40561e1 100644 --- a/src/main/java/com/keyware/composeanalysis/task/PorjectAnalysisTask.java +++ b/src/main/java/com/keyware/composeanalysis/task/PorjectAnalysisTask.java @@ -2,10 +2,10 @@ package com.keyware.composeanalysis.task; import cn.hutool.core.collection.CollectionUtil; import com.google.common.collect.Sets; +import com.keyware.common.constant.enums.AnalysisStatusEnum; import com.keyware.composeanalysis.constant.FixedValue; import com.keyware.composeanalysis.constant.MongoDBConst; import com.keyware.composeanalysis.constant.enums.AnalysisLevelEnum; -import com.keyware.composeanalysis.constant.enums.AnalysisStatusEnum; import com.keyware.composeanalysis.constant.enums.FileAnalysisStatusEnum; import com.keyware.composeanalysis.entity.AnalysisTask; import com.keyware.composeanalysis.mongo.*; @@ -290,7 +290,7 @@ public class PorjectAnalysisTask { matchOpenFileInfo.setId(IdGenerator.uuid32()) .setFileName(originalFile.getName()) .setFilePath(originalFile.getFileUrl()) - .setOpenType(originalFile.getOpenType()) + .setOpenType(true) .setFeatureSimilarity(100.00f) .setOpenRate(100.00f) .setMatchOpenFile(Arrays.asList(matchOpenFile)); diff --git a/src/main/java/com/keyware/composeanalysis/util/SimilarityUtil.java b/src/main/java/com/keyware/composeanalysis/util/SimilarityUtil.java index b920f98..cbc1565 100644 --- a/src/main/java/com/keyware/composeanalysis/util/SimilarityUtil.java +++ b/src/main/java/com/keyware/composeanalysis/util/SimilarityUtil.java @@ -83,7 +83,7 @@ public class SimilarityUtil { // //计算开源率 // BigDecimal openRate = new BigDecimal(matchedRowsNum.size()).divide(new BigDecimal(analysisFileLineInfo.size()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2); // -// return new Pair<>(openRate.floatValue(), matchedRowsNum); +// return new Pair<>(openRate.toString(), matchedRowsNum); // } @@ -110,7 +110,7 @@ public class SimilarityUtil { // //计算开源率 // BigDecimal openRate = new BigDecimal(matchedRowsNum.size()).divide(new BigDecimal(analysisFileLineInfo.size()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2); // -// return new Pair<>(openRate.floatValue(), matchedRowsNum); +// return new Pair<>(openRate.toString(), matchedRowsNum); // } public static Pair> getOpenRateAndSaveRowNum(String analysisFile, String openSourceFile) { diff --git a/src/main/java/com/keyware/composeanalysis/util/SolrUtils.java b/src/main/java/com/keyware/composeanalysis/util/SolrUtils.java index 342aa6e..f337c45 100644 --- a/src/main/java/com/keyware/composeanalysis/util/SolrUtils.java +++ b/src/main/java/com/keyware/composeanalysis/util/SolrUtils.java @@ -1,6 +1,7 @@ package com.keyware.composeanalysis.util; import com.keyware.composeanalysis.constant.MongoDBConst; +import com.keyware.composeanalysis.constant.SolrDBConst; import com.keyware.composeanalysis.solr.VersionTree; import lombok.Data; import lombok.extern.log4j.Log4j2; @@ -141,10 +142,6 @@ public class SolrUtils { } - - - - /** * 简单查询,指定返回字段 * @@ -175,6 +172,31 @@ public class SolrUtils { } + /** + * 根据开源文件的MD5 获取开源文件的文本内容 + * + * @param openSourceFileMd5 开源文件的MD5 + * @return + * @throws Exception + */ + public String getOpenFileContentByMd5(String openSourceFileMd5) { + + //根据源文件的MD5确定需要查询源码库的序号 + String openSourceCodeCoreIndex = openSourceFileMd5.substring(0, 1) + SolrDBConst.CORE_NAME_SUFFIX_SOURCE_FILE_INFO; + + //获取开源文件的文本信息 + SolrDocument openSourceContent = this.queryOne(openSourceCodeCoreIndex, "sourceFileMd5:" + openSourceFileMd5, "sourceContent"); + + if (openSourceContent == null) { + log.error("根据开源文件MD5:{}未找到对应的开源文件源码", openSourceFileMd5); + } + return openSourceContent.getFieldValue("sourceContent").toString(); + } + + + + + /** * 查询 versionTree *