diff --git a/src/main/java/com/keyware/composeanalysis/task/CodeBlockAnalysisTask.java b/src/main/java/com/keyware/composeanalysis/task/CodeBlockAnalysisTask.java index d96e262..c6f92ad 100644 --- a/src/main/java/com/keyware/composeanalysis/task/CodeBlockAnalysisTask.java +++ b/src/main/java/com/keyware/composeanalysis/task/CodeBlockAnalysisTask.java @@ -168,8 +168,6 @@ public class CodeBlockAnalysisTask extends IAnalysisTask { private void doAnalysis(SolrDocumentList matcheOpenSourceFiles, String sourceFileBaseCoreName, CodeFile fileAnalysisRes) { if (CollectionUtil.isEmpty(matcheOpenSourceFiles)) { - //因为代码块的特征库较少,这里补充一个对比逻辑,如果当前文件解析失败,或者没有通过代码块匹配到数据,则直接通过文件的md5 再次查询一次solr库 - checkByOriginalFileMd5(sourceFileBaseCoreName, analysisFile.getMd5()); return; } @@ -313,45 +311,6 @@ public class CodeBlockAnalysisTask extends IAnalysisTask { } - /** - * 防止代码块特征库不全,再次根据文件MD5查询开源文件信息, 做二次校验 - * - * @param originalFileMd5 - * @param versionIdCoreName - */ - private void checkByOriginalFileMd5(String versionIdCoreName, String originalFileMd5) { - - //根据文件的MD5,查询特征库,看当前文件是否在开源代码库中 - SolrDocument versionIdAndPath = solrUtils.queryOne(versionIdCoreName, "sourceFileMd5:" + originalFileMd5, "versionId,fullPath,sourceFileMd5"); - - if (versionIdAndPath != null) { - //根据版本ID查询版本的详细信息 - VersionTree versionInfo = solrUtils.queryVersionInfoByVersionId((String) versionIdAndPath.get("versionId")); - if (versionInfo != null) { - //当前开源文件的开源项目信息 - MatchOpenFile matchOpenFileInfo = new MatchOpenFile(); - matchOpenFileInfo.setPId(versionInfo.getProId()) - .setPName(versionInfo.getProName()) - .setSourceUrl(versionInfo.getDownUrl()) - .setFeatureSimilarity(100.00f) - .setOpenRate(100.00f) - .setAnalyzeType(AnalysisLevelEnum.FILE_LEVEL.getCode()); - - //保存当前文件的开源信息到mongo库中 - MatchOpenFileMongoDto matchOpenFileMongo = new MatchOpenFileMongoDto(); - matchOpenFileMongo.setId(IdGenerator.uuid32()) - .setFilePath(analysisFile.getFileUrl()) - .setFileName(analysisFile.getName()) - .setOpenRate(100.00f) - .setOpenType(analysisFile.getOpenType()) - .setMatchOpenFile(Arrays.asList(matchOpenFileInfo)); - - mongoTemplate.save(matchOpenFileMongo); - } - } - } - - /** * 获取当前文件的代码块特征值 * diff --git a/src/main/java/com/keyware/composeanalysis/task/FileAnalysisTask.java b/src/main/java/com/keyware/composeanalysis/task/FileAnalysisTask.java index bf9c7cc..339d0b8 100644 --- a/src/main/java/com/keyware/composeanalysis/task/FileAnalysisTask.java +++ b/src/main/java/com/keyware/composeanalysis/task/FileAnalysisTask.java @@ -4,7 +4,6 @@ import cn.hutool.core.lang.Pair; import com.keyware.common.constant.enums.AnalysisStatusEnum; import com.keyware.composeanalysis.constant.FixedValue; import com.keyware.composeanalysis.constant.RedisConst; -import com.keyware.composeanalysis.constant.SolrDBConst; import com.keyware.composeanalysis.constant.enums.AnalysisLevelEnum; import com.keyware.composeanalysis.constant.enums.FileAnalysisStatusEnum; import com.keyware.composeanalysis.entity.AnalysisTask; @@ -18,8 +17,6 @@ import com.keyware.keyswan.anaysis.AnalysisFactory; import com.keyware.keyswan.common.CodeFile; import com.keyware.utils.IdGenerator; import lombok.extern.log4j.Log4j2; -import org.apache.commons.collections.CollectionUtils; -import org.apache.commons.lang3.StringUtils; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocumentList; import org.springframework.data.mongodb.core.MongoTemplate; @@ -81,42 +78,31 @@ public class FileAnalysisTask extends IAnalysisTask { //获取当前文件名称 String fileName = analysisFile.getName(); - AnalysisLogUtil.insert(mongoTemplate, "【文件级分析】正在分析" + fileName); try { - //只有主流语言的才能解析 - //非32种主流语言的不能提取文件特征,在文件级MD5匹配的时候,已经做过匹配 - if (StringUtils.isNotEmpty(analysisFile.getSuffix()) && FixedValue.SUFFIX_SOLR_VERSION.containsKey(analysisFile.getSuffix())) { - //根据文件后缀 查询 *_CutFileInfo库名称 - String featureCoreName = FixedValue.SUFFIX_SOLR_FILE.get(analysisFile.getSuffix()); - //根据文件名称,获取文件解析器 - Analysis analysis = AnalysisFactory.getAnalysis(fileName); - //如果 analysis 返回值为null 说明还未支持这种语言的特征提取 可以直接通过文件的MD5值去solr库中匹配 - if (analysis != null) { - //如果文件大小超过3M,则不进行文件级行级特征提取 - Integer fileSize = analysisFile.getFileSize(); - if (fileSize < (3 * 1024 * 1024)) { - CodeFile codeFile = analysis.analysisFile(analysisFile.getFileUrl(), "1", "0"); - //根据文件的特征值,去相应文件文件后缀的特征库中进行查询 - if (codeFile != null) { - String querySb = "sourceMd5:" + codeFile.getSourceMd5() + " OR cutFileMd5:" + codeFile.getCutFileMd5() + " OR traitFileMd5:" + codeFile.getTraitFileMd5(); - SolrDocumentList openSourceFileList = solrUtils.query(featureCoreName, querySb, "sourceMd5"); - //如果当前文件在源码库中,匹配到了数据,则统计当前文件的开源率 - if (CollectionUtils.isNotEmpty(openSourceFileList)) { - ananlyzeFileOpenRate(openSourceFileList,codeFile); - } - } - } - } - } + //根据文件后缀 查询 *_CutFileInfo库名称 + String featureCoreName = FixedValue.SUFFIX_SOLR_FILE.get(analysisFile.getSuffix()); + //根据文件名称,获取文件解析器 + Analysis analysis = AnalysisFactory.getAnalysis(fileName); + //如果文件大小超过3M,则不进行文件级行级特征提取 + CodeFile codeFile = analysis.analysisFile(analysisFile.getFileUrl(), "1", "0"); + + //根据文件的特征值,去相应文件文件后缀的特征库中进行查询 + SolrDocumentList openSourceFileList = getFeatureSimilarityFromSolr(featureCoreName, codeFile); + + //则统计当前文件的开源率 + ananlyzeFileOpenRate(openSourceFileList, codeFile); + //更新文件级分析结果 analysisFile.setFileAnalysisStatus(FileAnalysisStatusEnum.ANALYSIS_DONE.getCode()); mongoTemplate.update(FileDataMongoDto.class) .matching(where("_id").is(analysisFile.getId())) .replaceWith(analysisFile) .findAndReplace(); + + AnalysisLogUtil.insert(mongoTemplate, "【文件级分析】成功" + fileName); } catch (Exception e) { - AnalysisLogUtil.insertErrorInfo(mongoTemplate, "【文件级】提取失败" + fileName, e); - log.error("文件:" + fileName + "文件级别特征提取失败!", e); + AnalysisLogUtil.insertErrorInfo(mongoTemplate, "【文件级】分析失败" + fileName, e); + log.error("文件:" + fileName + "文件级别分析失败!", e); //将当前文件的分析状态变更为失败 analysisFile.setFileAnalysisStatus(FileAnalysisStatusEnum.FAILED_ANALYSIS.getCode()); //更新文件级分析结果 @@ -130,40 +116,39 @@ public class FileAnalysisTask extends IAnalysisTask { } + /** + * 根据 特征值 从特征库中检索 具有特征相似的 + * + * @param featureCoreName 检索的solr 库名称 + * @param codeFile 源文件解析结果 + * @return + */ + private SolrDocumentList getFeatureSimilarityFromSolr(String featureCoreName, CodeFile codeFile) { + if (codeFile == null) { + log.error("特征为空,无法查询:{}", analysisFile.getName()); + return new SolrDocumentList(); + } + String queryStr = "sourceMd5:" + codeFile.getSourceMd5() + " OR cutFileMd5:" + codeFile.getCutFileMd5() + " OR traitFileMd5:" + codeFile.getTraitFileMd5(); + return solrUtils.query(featureCoreName, queryStr, "sourceMd5"); + } + /** * 分析文件的开源率 * * @param openSourceFileList 匹配的开源文件信息 * @throws IOException */ - private void ananlyzeFileOpenRate(SolrDocumentList openSourceFileList,CodeFile fileAnalysisRes) throws IOException { - - //根据匹配的开源文件的md5 获取版本ID - Set sourceFileMd5 = openSourceFileList.stream().map(solrDocument -> (String) solrDocument.get("sourceMd5")).collect(Collectors.toSet()); - String sourceCoreName = FixedValue.SUFFIX_SOLR_VERSION.get(analysisFile.getSuffix()); - Map md5VersionObjMap = solrUtils.batchQueryVersionIdFromSourceFileBaseBySourceMd5(sourceCoreName, sourceFileMd5); - - //根据版本ID获取版本信息 - Set versionIds = md5VersionObjMap.values().stream().map(solrDocument -> (String) solrDocument.get("versionId")).collect(Collectors.toSet()); - List treeInfoList = solrUtils.queryBatchVersionInfoByVersionIds(versionIds); - Map versionIdMap = treeInfoList.stream().collect(Collectors.toMap(VersionTree::getVersionId, Function.identity())); - - //获取被测件文本内容 - String fileContent = new String(Files.readAllBytes(Paths.get(analysisFile.getFileUrl())), "utf-8").replaceAll(" ", ""); - - //将被测件的文本内容拆分成行信息,用于匹配开源信息 - List fileLines = SimilarityUtil.getSplitWords(fileContent); + private void ananlyzeFileOpenRate(SolrDocumentList openSourceFileList, CodeFile fileAnalysisRes) { HashSet openLineNum = new HashSet<>(); - List matchOpenFilesRes = calculateOpenRate(openSourceFileList,fileAnalysisRes,sourceCoreName,openLineNum); - - //统计被测件的总体开源率 + //计算每个文件的开源率和特征相似度 + List matchOpenFilesRes = calculateOpenRate(openSourceFileList, fileAnalysisRes, openLineNum); //获取开源率阈值,判断当前文件是否开源 Integer openRateThreshold = analysisTask.getOpenRateThreshold(); - BigDecimal openRate = new BigDecimal(openLineNum.size()).divide(new BigDecimal(fileLines.size()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2); + BigDecimal openRate = new BigDecimal(openLineNum.size()).divide(fileAnalysisRes.getCodeRowNum(), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2); //超过阈值,则认为当前文件是开源文件 if (openRate.compareTo(new BigDecimal(openRateThreshold)) > 0) { @@ -192,20 +177,21 @@ public class FileAnalysisTask extends IAnalysisTask { } - /** * 计算当前文件的特征相似度 和 开源率 * - * @param matchOpenFiles 通过MD5 匹配到的所有开源文件 - * @param sourceFileBaseCoreName 当前文件特征文件的 solr coreName - * @param matchLineRowsNum 所有开源文件匹配到的开源行号列表 + * @param matchOpenFiles 通过MD5 匹配到的所有开源文件 + * @param matchLineRowsNum 所有开源文件匹配到的开源行号列表 * @return 匹配的开源文件解析后的结果集 */ - private List calculateOpenRate(SolrDocumentList matchOpenFiles, CodeFile fileAnalysisRes, String sourceFileBaseCoreName, Set matchLineRowsNum) { + private List calculateOpenRate(SolrDocumentList matchOpenFiles, CodeFile fileAnalysisRes, Set matchLineRowsNum) { //匹配的开源文件列表 List matchOpenFilesRes = new ArrayList<>(); + //根据匹配的开源文件的md5 获取版本ID + String sourceFileBaseCoreName = FixedValue.SUFFIX_SOLR_VERSION.get(analysisFile.getSuffix()); + //首先根据文件的MD5查询开源文件的版本ID,和路径信息 Set openSourceFileMd5s = matchOpenFiles.stream().map(doc -> (String) doc.get("sourceMd5")).collect(Collectors.toSet()); Map md5VersionInfoMap = solrUtils.batchQueryVersionIdFromSourceFileBaseBySourceMd5(sourceFileBaseCoreName, openSourceFileMd5s); @@ -230,7 +216,7 @@ public class FileAnalysisTask extends IAnalysisTask { SolrDocument openEntries = md5VersionInfoMap.get(openSourceFileMd5); VersionTree versionInfo = versionIdVersionInfoMap.get(openEntries.get("versionId")); - if (versionInfo == null){ + if (versionInfo == null) { log.error("找不到开源文件版本信息,versionId:{}", openEntries.get("versionId")); } @@ -250,6 +236,4 @@ public class FileAnalysisTask extends IAnalysisTask { } - - } diff --git a/src/main/java/com/keyware/composeanalysis/task/FunctionAnalysisTask.java b/src/main/java/com/keyware/composeanalysis/task/FunctionAnalysisTask.java index 385a2ac..e4f9884 100644 --- a/src/main/java/com/keyware/composeanalysis/task/FunctionAnalysisTask.java +++ b/src/main/java/com/keyware/composeanalysis/task/FunctionAnalysisTask.java @@ -145,8 +145,6 @@ public class FunctionAnalysisTask extends IAnalysisTask { private void doAnalysis(SolrDocumentList matchOpenFiles, String sourceFileBaseCoreName, CodeFile fileAnalysisRes) { if (CollectionUtil.isEmpty(matchOpenFiles)) { - //因为函数的特征库较少,这里补充一个对比逻辑,如果当前文件解析失败,或者没有通过函数匹配到数据,则直接通过文件的md5 再次查询一次solr库 - checkByOriginalFileMd5(sourceFileBaseCoreName, analysisFile.getMd5()); return; } @@ -290,46 +288,6 @@ public class FunctionAnalysisTask extends IAnalysisTask { return matchOpenFilesRes; } - - /** - * 防止函数特征库不全,再次根据文件MD5查询开源文件信息, 做二次校验 - * - * @param originalFileMd5 - * @param versionIdCoreName - */ - private void checkByOriginalFileMd5(String versionIdCoreName, String originalFileMd5) { - - //根据文件的MD5,查询特征库,看当前文件是否在开源代码库中 - SolrDocument versionIdAndPath = solrUtils.queryOne(versionIdCoreName, "sourceFileMd5:" + originalFileMd5, "versionId,fullPath,sourceFileMd5"); - - if (versionIdAndPath != null) { - //根据版本ID查询版本的详细信息 - VersionTree versionInfo = solrUtils.queryVersionInfoByVersionId((String) versionIdAndPath.get("versionId")); - if (versionInfo != null) { - //当前开源文件的开源项目信息 - MatchOpenFile matchOpenFileInfo = new MatchOpenFile(); - matchOpenFileInfo.setPId(versionInfo.getProId()) - .setPName(versionInfo.getProName()) - .setSourceUrl(versionInfo.getDownUrl()) - .setFeatureSimilarity(100.00f) - .setOpenRate(100.00f) - .setAnalyzeType(AnalysisLevelEnum.FILE_LEVEL.getCode()); - - //保存当前文件的开源信息到mongo库中 - MatchOpenFileMongoDto matchOpenFileMongo = new MatchOpenFileMongoDto(); - matchOpenFileMongo.setId(IdGenerator.uuid32()) - .setFilePath(analysisFile.getFileUrl()) - .setFileName(analysisFile.getName()) - .setOpenRate(100.00f) - .setOpenType(analysisFile.getOpenType()) - .setMatchOpenFile(Arrays.asList(matchOpenFileInfo)); - - mongoTemplate.save(matchOpenFileMongo); - } - } - } - - /** * 根据 特征值 从特征库中检索 具有特征相似的 * diff --git a/src/main/java/com/keyware/composeanalysis/task/LineAnalysisTask.java b/src/main/java/com/keyware/composeanalysis/task/LineAnalysisTask.java index de05217..ed0588c 100644 --- a/src/main/java/com/keyware/composeanalysis/task/LineAnalysisTask.java +++ b/src/main/java/com/keyware/composeanalysis/task/LineAnalysisTask.java @@ -138,8 +138,6 @@ public class LineAnalysisTask extends IAnalysisTask { String versionIdCoreName = FixedValue.SUFFIX_SOLR_VERSION.get(analysisFile.getSuffix()); if (CollectionUtil.isEmpty(matcheOpenSourceFiles)) { - //因为行的特征库较少,这里补充一个对比逻辑,如果当前文件解析失败,或者没有通过代码块匹配到数据,则直接通过文件的md5 再次查询一次solr库 - checkByOriginalFileMd5(versionIdCoreName, analysisFile.getMd5()); return; } @@ -271,47 +269,6 @@ public class LineAnalysisTask extends IAnalysisTask { return matchOpenFilesRes; } - - - /** - * 防止代码块特征库不全,再次根据文件MD5查询开源文件信息, 做二次校验 - * - * @param originalFileMd5 - * @param versionIdCoreName - */ - private void checkByOriginalFileMd5(String versionIdCoreName, String originalFileMd5) { - - //根据文件的MD5,查询特征库,看当前文件是否在开源代码库中 - SolrDocument versionIdAndPath = solrUtils.queryOne(versionIdCoreName, "sourceFileMd5:" + originalFileMd5, "versionId,fullPath,sourceFileMd5"); - - if (versionIdAndPath != null) { - //根据版本ID查询版本的详细信息 - VersionTree versionInfo = solrUtils.queryVersionInfoByVersionId((String) versionIdAndPath.get("versionId")); - if (versionInfo != null) { - //当前开源文件的开源项目信息 - MatchOpenFile matchOpenFileInfo = new MatchOpenFile(); - matchOpenFileInfo.setPId(versionInfo.getProId()) - .setPName(versionInfo.getProName()) - .setSourceUrl(versionInfo.getDownUrl()) - .setFeatureSimilarity(100.00f) - .setOpenRate(100.00f) - .setAnalyzeType(AnalysisLevelEnum.FILE_LEVEL.getCode()); - - //保存当前文件的开源信息到mongo库中 - MatchOpenFileMongoDto matchOpenFileMongo = new MatchOpenFileMongoDto(); - matchOpenFileMongo.setId(IdGenerator.uuid32()) - .setFilePath(analysisFile.getFileUrl()) - .setFileName(analysisFile.getName()) - .setOpenRate(100.00f) - .setOpenType(analysisFile.getOpenType()) - .setMatchOpenFile(Arrays.asList(matchOpenFileInfo)); - - mongoTemplate.save(matchOpenFileMongo); - } - } - } - - /** * 将特征值插入到mongo库中 * @param features 特征集合 diff --git a/src/main/resources/application.yaml b/src/main/resources/application.yaml index 8f0706a..45dd361 100644 --- a/src/main/resources/application.yaml +++ b/src/main/resources/application.yaml @@ -5,14 +5,18 @@ spring: application: name: compose-analysis-service cloud: + inetutils: + preferred-networks: + #优先使用下列网段的IP进行网络通信 + - 172.16 nacos: discovery: - server-addr: 172.16.36.100:8848 - namespace: 7f9bb282-8ee3-4948-8182-24b7dcadcd5a + server-addr: 172.16.36.7:8848 + namespace: 2fad0ca9-bc32-4afd-9f2e-ebc133d5e781 config: - server-addr: 172.16.36.100:8848 - namespace: 7f9bb282-8ee3-4948-8182-24b7dcadcd5a - group: dev_group + server-addr: 172.16.36.7:8848 + namespace: 2fad0ca9-bc32-4afd-9f2e-ebc133d5e781 file-extension: yaml config: import: nacos:compose-analysis-dev.yaml +