diff --git a/src/main/java/com/keyware/composeanalysis/task/FileAnalysisTask.java b/src/main/java/com/keyware/composeanalysis/task/FileAnalysisTask.java index 05b0280..bf9c7cc 100644 --- a/src/main/java/com/keyware/composeanalysis/task/FileAnalysisTask.java +++ b/src/main/java/com/keyware/composeanalysis/task/FileAnalysisTask.java @@ -1,5 +1,6 @@ package com.keyware.composeanalysis.task; +import cn.hutool.core.lang.Pair; import com.keyware.common.constant.enums.AnalysisStatusEnum; import com.keyware.composeanalysis.constant.FixedValue; import com.keyware.composeanalysis.constant.RedisConst; @@ -101,7 +102,7 @@ public class FileAnalysisTask extends IAnalysisTask { SolrDocumentList openSourceFileList = solrUtils.query(featureCoreName, querySb, "sourceMd5"); //如果当前文件在源码库中,匹配到了数据,则统计当前文件的开源率 if (CollectionUtils.isNotEmpty(openSourceFileList)) { - ananlyzeFileOpenRate(openSourceFileList); + ananlyzeFileOpenRate(openSourceFileList,codeFile); } } } @@ -132,18 +133,13 @@ public class FileAnalysisTask extends IAnalysisTask { /** * 分析文件的开源率 * - * @param fileList 匹配的开源文件信息 + * @param openSourceFileList 匹配的开源文件信息 * @throws IOException */ - private void ananlyzeFileOpenRate(SolrDocumentList fileList) throws IOException { - //创建匹配开源文件信息匹配对象 - MatchOpenFileMongoDto matchOpenFileInfo = new MatchOpenFileMongoDto(); - matchOpenFileInfo.setId(IdGenerator.uuid32()) - .setFileName(analysisFile.getName()) - .setFilePath(analysisFile.getFileUrl()); + private void ananlyzeFileOpenRate(SolrDocumentList openSourceFileList,CodeFile fileAnalysisRes) throws IOException { //根据匹配的开源文件的md5 获取版本ID - Set sourceFileMd5 = fileList.stream().map(solrDocument -> (String) solrDocument.get("sourceMd5")).collect(Collectors.toSet()); + Set sourceFileMd5 = openSourceFileList.stream().map(solrDocument -> (String) solrDocument.get("sourceMd5")).collect(Collectors.toSet()); String sourceCoreName = FixedValue.SUFFIX_SOLR_VERSION.get(analysisFile.getSuffix()); Map md5VersionObjMap = solrUtils.batchQueryVersionIdFromSourceFileBaseBySourceMd5(sourceCoreName, sourceFileMd5); @@ -160,52 +156,14 @@ public class FileAnalysisTask extends IAnalysisTask { HashSet openLineNum = new HashSet<>(); - //开源文件结果集合 - ArrayList matchOpenFileList = new ArrayList<>(); - //遍历匹配到的开源文件列表 - for (int i = 0; i < fileList.size(); i++) { - String openFileMd5 = (String) fileList.get(i).get("sourceMd5"); - SolrDocument versionObj = md5VersionObjMap.get(openFileMd5); - String versionId = (String) versionObj.get("versionId"); - VersionTree versionInfo = versionIdMap.get(versionId); - if (versionInfo == null) { - log.error("未在versionTree中找到版本信息,openFileMd5:{},versionId:{}",openFileMd5, versionId); - continue; - } - MatchOpenFile matchOpenFile = new MatchOpenFile(); - matchOpenFile.setId(IdGenerator.uuid32()) - .setVersionId(versionId) - .setSourceFilePath((String) versionObj.get("fullPath")) - .setSourceUrl(versionInfo.getDownUrl()) - .setPId(versionInfo.getProId()) - .setPName(versionInfo.getProName()) - .setLicenseType(versionInfo.getLicenseType()) - .setAnalyzeType(AnalysisLevelEnum.FILE_LEVEL.getCode()) - .setVersion(versionInfo.getVersionName()) - .setFeatureSimilarity(100.00f); - //计算被测件和开源文件的文本相似度 - //根据文件的MD5的第一位获取solr库索引名称 - String solrNameIndex =openFileMd5.substring(0, 1) + SolrDBConst.CORE_NAME_SUFFIX_SOURCE_FILE_INFO; - SolrDocumentList sourceFileInfo = solrUtils.query(solrNameIndex, "sourceFileMd5:" + openFileMd5, "sourceContent"); - if (CollectionUtils.isNotEmpty(sourceFileInfo)) { - String openSourceContent = String.valueOf(sourceFileInfo.get(0).getFieldValue("sourceContent")); - //这里存在优化空间,被测件的文件行拆分 可以拿到循环外面 - double similarity = SimilarityUtil.getSimilarityAndSaveRowNum(fileLines, openSourceContent, openLineNum); - matchOpenFile.setOpenRate(new BigDecimal(similarity * 100).setScale(2, RoundingMode.HALF_UP).floatValue()); - //如果找不到源代码,直接将原文开源率置为 100% - } else { - log.error("找不到源代码,DBname:{},sourceFileMd5:{}", solrNameIndex, openFileMd5); - matchOpenFile.setOpenRate(100.00f); - } - matchOpenFile.setMd5(openFileMd5); - matchOpenFileList.add(matchOpenFile); - } + List matchOpenFilesRes = calculateOpenRate(openSourceFileList,fileAnalysisRes,sourceCoreName,openLineNum); + //统计被测件的总体开源率 + //获取开源率阈值,判断当前文件是否开源 Integer openRateThreshold = analysisTask.getOpenRateThreshold(); - int openLineCount = openLineNum.size(); - BigDecimal totalLineCount = new BigDecimal(fileLines.size()); - BigDecimal openRate = new BigDecimal(openLineCount).divide(totalLineCount, 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)); + + BigDecimal openRate = new BigDecimal(openLineNum.size()).divide(new BigDecimal(fileLines.size()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2); //超过阈值,则认为当前文件是开源文件 if (openRate.compareTo(new BigDecimal(openRateThreshold)) > 0) { @@ -215,18 +173,83 @@ public class FileAnalysisTask extends IAnalysisTask { } //修改保存测试文件信息 - analysisFile.setOpenLineCount(openLineCount) + analysisFile.setOpenLineCount(openLineNum.size()) .setOpenRate(openRate.floatValue()); - //组装开源信息 - matchOpenFileInfo.setFilePath(analysisFile.getFileUrl()) + + //保存当前文件的开源信息到mongo库中 + MatchOpenFileMongoDto matchOpenFileMongo = new MatchOpenFileMongoDto(); + matchOpenFileMongo.setId(IdGenerator.uuid32()) + .setFilePath(analysisFile.getFileUrl()) + .setFileName(analysisFile.getName()) + .setFeatureSimilarity(100.00f) + .setOpenRate(openRate.floatValue()) .setOpenType(analysisFile.getOpenType()) - .setOpenRate(analysisFile.getOpenType() ? 100.00f : 0.00f) - .setMatchOpenFile(matchOpenFileList); + .setMatchOpenFile(matchOpenFilesRes); //保存当前开源信息数据 - mongoTemplate.insert(matchOpenFileInfo); + mongoTemplate.insert(matchOpenFileMongo); } + + /** + * 计算当前文件的特征相似度 和 开源率 + * + * @param matchOpenFiles 通过MD5 匹配到的所有开源文件 + * @param sourceFileBaseCoreName 当前文件特征文件的 solr coreName + * @param matchLineRowsNum 所有开源文件匹配到的开源行号列表 + * @return 匹配的开源文件解析后的结果集 + */ + private List calculateOpenRate(SolrDocumentList matchOpenFiles, CodeFile fileAnalysisRes, String sourceFileBaseCoreName, Set matchLineRowsNum) { + + //匹配的开源文件列表 + List matchOpenFilesRes = new ArrayList<>(); + + //首先根据文件的MD5查询开源文件的版本ID,和路径信息 + Set openSourceFileMd5s = matchOpenFiles.stream().map(doc -> (String) doc.get("sourceMd5")).collect(Collectors.toSet()); + Map md5VersionInfoMap = solrUtils.batchQueryVersionIdFromSourceFileBaseBySourceMd5(sourceFileBaseCoreName, openSourceFileMd5s); + + //根据版本ID查询版本的详细信息 + //todo 这里 查询一个版本的信息 需要检索 两个 solr 库 而且还需要检索 versioinTree 后面需要优化 + Set openSourceFileVersionIds = md5VersionInfoMap.values().stream().map(doc -> (String) doc.get("versionId")).collect(Collectors.toSet()); + List versionTrees = solrUtils.queryBatchVersionInfoByVersionIds(openSourceFileVersionIds); + Map versionIdVersionInfoMap = versionTrees.stream().collect(Collectors.toMap(VersionTree::getVersionId, java.util.function.Function.identity())); + + for (SolrDocument openSourceFile : matchOpenFiles) { + + //开源文件md5 + String openSourceFileMd5 = openSourceFile.getFieldValue("sourceMd5").toString(); + + String openFileContent = solrUtils.getOpenFileContentByMd5(openSourceFileMd5); + + //当前文件的开源率 + Pair> openRateAndSaveRowNum = SimilarityUtil.getOpenRateAndSaveRowNum(fileAnalysisRes.getSourceFileContent(), openFileContent); + //将当前文件匹配的行号,存储到缓存中,方便统计整体的开源率 + matchLineRowsNum.addAll(openRateAndSaveRowNum.getValue()); + + SolrDocument openEntries = md5VersionInfoMap.get(openSourceFileMd5); + VersionTree versionInfo = versionIdVersionInfoMap.get(openEntries.get("versionId")); + if (versionInfo == null){ + log.error("找不到开源文件版本信息,versionId:{}", openEntries.get("versionId")); + } + + //组装当前开源文件的开源项目信息 + MatchOpenFile matchOpenFileInfo = new MatchOpenFile(); + matchOpenFileInfo.setPId(versionInfo.getProId()) + .setPName(versionInfo.getProName()) + .setSourceUrl((String) openEntries.get("fullPath")) + .setFeatureSimilarity(100.00f) + .setOpenRate(openRateAndSaveRowNum.getKey()) + .setVersion(versionInfo.getVersionName()) + .setLicenseType(versionInfo.getLicenseType()) + .setAnalyzeType(AnalysisLevelEnum.FILE_LEVEL.getCode()); + matchOpenFilesRes.add(matchOpenFileInfo); + } + return matchOpenFilesRes; + } + + + + }