From 890ed17d84907984bad41b2acd979587b498530a Mon Sep 17 00:00:00 2001 From: liuzongren <15011502566@163.com> Date: Sun, 29 Sep 2024 13:52:31 +0800 Subject: [PATCH] =?UTF-8?q?1.=E4=BC=98=E5=8C=96=E7=9B=B8=E4=BC=BC=E5=BA=A6?= =?UTF-8?q?=E5=AF=B9=E6=AF=94=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pom.xml | 44 ++------- .../config/GlobalExceptionHandler.java | 32 +++++++ .../composeanalysis/mongo/MatchOpenFile.java | 32 +++---- .../mongo/MatchOpenFileMongoDto.java | 39 ++++---- .../composeanalysis/solr/FunctionInfo.java | 26 ++++++ .../task/CodeBlockAnalysisTask.java | 35 ++++++-- .../task/FileAnalysisTask.java | 51 +++++++---- .../task/FunctionAnalysisTask.java | 67 +++++++++----- .../task/LineAnalysisTask.java | 31 +++++-- .../task/PorjectAnalysisTask.java | 44 +++++---- .../composeanalysis/util/SimilarityUtil.java | 89 +++++++------------ src/main/resources/application.yaml | 10 ++- 12 files changed, 282 insertions(+), 218 deletions(-) create mode 100644 src/main/java/com/keyware/composeanalysis/config/GlobalExceptionHandler.java create mode 100644 src/main/java/com/keyware/composeanalysis/solr/FunctionInfo.java diff --git a/pom.xml b/pom.xml index df357c1..f56cef2 100644 --- a/pom.xml +++ b/pom.xml @@ -14,10 +14,6 @@ compose-analysis compose-analysis 源码溯源服务 - - 17 - - org.springframework.boot @@ -30,6 +26,13 @@ spring-cloud-starter-alibaba-nacos-discovery + + + org.springframework.cloud + spring-cloud-starter-loadbalancer + + + com.alibaba.cloud @@ -86,35 +89,6 @@ - - - keyware-repos - KeyWare Repository - http://218.30.67.85:19201/nexus/content/groups/public/ - - - keyware-repos-2 - KeyWare Repository-2 - http://218.30.67.85:19201/nexus/content/repositories/releases/ - - - - - - - - - - - - - - - - - - - @@ -126,8 +100,8 @@ org.apache.maven.plugins maven-compiler-plugin - 16 - 16 + 17 + 17 diff --git a/src/main/java/com/keyware/composeanalysis/config/GlobalExceptionHandler.java b/src/main/java/com/keyware/composeanalysis/config/GlobalExceptionHandler.java new file mode 100644 index 0000000..14c2da8 --- /dev/null +++ b/src/main/java/com/keyware/composeanalysis/config/GlobalExceptionHandler.java @@ -0,0 +1,32 @@ +package com.keyware.composeanalysis.config; + +import com.keyware.common.base.response.Result; +import com.keyware.common.constant.enums.ResultCode; +import com.keyware.common.exception.BusinessException; +import lombok.extern.log4j.Log4j2; +import org.springframework.web.bind.annotation.ExceptionHandler; +import org.springframework.web.bind.annotation.ResponseBody; +import org.springframework.web.bind.annotation.RestControllerAdvice; + + +@Log4j2 +@RestControllerAdvice +public class GlobalExceptionHandler { + + //全局异常处理 + @ExceptionHandler(value = Exception.class) + public Result defaultErrorHandler(Exception e) { + log.error("全局异常信息,ex={}",e.getMessage(),e); + return Result.fail(ResultCode.FAIL.getCode(), e.getMessage()); + } + + //自定义异常处理 + //业务异常 + @ExceptionHandler(value = BusinessException.class) + @ResponseBody + public Result businessExceptionHandler(BusinessException e) { + log.error("业务异常信息",e); + return Result.fail(e.getCode(), e.getMsg()); + } + +} diff --git a/src/main/java/com/keyware/composeanalysis/mongo/MatchOpenFile.java b/src/main/java/com/keyware/composeanalysis/mongo/MatchOpenFile.java index cb279e7..8d342d5 100644 --- a/src/main/java/com/keyware/composeanalysis/mongo/MatchOpenFile.java +++ b/src/main/java/com/keyware/composeanalysis/mongo/MatchOpenFile.java @@ -19,15 +19,11 @@ import java.util.List; @Accessors(chain =true) public class MatchOpenFile implements Serializable { - //ID @Id private String id; - //开源项目版本名称 - private String version; - - //组件版本id - private String versionId; + //匹配的开源文件名称 + private String fileName; //开源项目名称 private String pName; @@ -35,28 +31,28 @@ public class MatchOpenFile implements Serializable { //开源项目id private String pId; + //匹配的开源文件所在项目版本名称 + private String version; + + //版本id + private String versionId; + + //开源文件的详细路径 + private String sourceFilePath; + //与被测文件的特征相似度 private Float featureSimilarity; + //文件开源率 + private Float openRate; + //开源地址 private String sourceUrl; - //开源文件的详细路径 - private String sourceFilePath; - //开源许可协议类型 private List licenseType; - //长度 - private Integer fileSize; - //文件MD5值 private String md5; - //分析类型 (0文件,1函数 2:代码块 3:行) - private int analyzeType; - - //文件开源率 - private Float openRate; - } diff --git a/src/main/java/com/keyware/composeanalysis/mongo/MatchOpenFileMongoDto.java b/src/main/java/com/keyware/composeanalysis/mongo/MatchOpenFileMongoDto.java index 55b663e..2033db8 100644 --- a/src/main/java/com/keyware/composeanalysis/mongo/MatchOpenFileMongoDto.java +++ b/src/main/java/com/keyware/composeanalysis/mongo/MatchOpenFileMongoDto.java @@ -20,39 +20,32 @@ import java.util.List; @Accessors(chain =true) public class MatchOpenFileMongoDto implements Serializable { + //ID @Id private String id; - /** - * 文件名称 - */ + //文件名称 private String fileName; - /** - * 文件路径 - */ - @Deprecated + //文件路径 private String filePath; - /** - * 当前 文件 是否 开源 false:不开源 true:开源 - */ - private Boolean openType; + //分析类型 + private Integer analysisType; + //与被测文件的特征相似度 + private Float featureSimilarity; - /** - * 当前文件的开源率 - */ - private float openRate; + //文件开源率 + private Float openRate; - /** - * 特征相似度 - */ - private Float featureSimilarity; + //文件MD5值 + private String md5; + + //当前文件是否开源 + private Boolean openType; - /** - * 匹配的开源文件信息 - */ - List matchOpenFile; + //匹配的开源文件信息 + List subMatchOpenFiles; } diff --git a/src/main/java/com/keyware/composeanalysis/solr/FunctionInfo.java b/src/main/java/com/keyware/composeanalysis/solr/FunctionInfo.java new file mode 100644 index 0000000..ebe77fa --- /dev/null +++ b/src/main/java/com/keyware/composeanalysis/solr/FunctionInfo.java @@ -0,0 +1,26 @@ +package com.keyware.composeanalysis.solr; + +import lombok.Data; +import lombok.experimental.Accessors; + +import java.math.BigDecimal; + +/** + * @author liuzongren + * @date 2024/9/23 + * @description solr库中 函数结构体对象 + */ +@Data +@Accessors(chain = true) +public class FunctionInfo { + //函数名称 + private String funName; + //特征函数MD5 + private String traitFunMd5; + //函数字符长度 + private BigDecimal funSize; + //函数有效代码行数 + private int codeRowNum; + //原函数MD5 + private String cutFunMd5; +} diff --git a/src/main/java/com/keyware/composeanalysis/task/CodeBlockAnalysisTask.java b/src/main/java/com/keyware/composeanalysis/task/CodeBlockAnalysisTask.java index c6f92ad..ec4dc35 100644 --- a/src/main/java/com/keyware/composeanalysis/task/CodeBlockAnalysisTask.java +++ b/src/main/java/com/keyware/composeanalysis/task/CodeBlockAnalysisTask.java @@ -3,6 +3,7 @@ package com.keyware.composeanalysis.task; import cn.hutool.core.collection.CollUtil; import cn.hutool.core.collection.CollectionUtil; +import cn.hutool.core.io.FileUtil; import cn.hutool.core.lang.Pair; import com.alibaba.fastjson.JSONArray; import com.keyware.common.constant.enums.AnalysisStatusEnum; @@ -203,16 +204,20 @@ public class CodeBlockAnalysisTask extends IAnalysisTask { analysisFile.setOpenType(true); } + + //保存当前文件开源行数 + analysisFile.setOpenLineCount(matchedLineRowsNum.size()); + //保存当前文件的开源信息到mongo库中 MatchOpenFileMongoDto matchOpenFileMongo = new MatchOpenFileMongoDto(); matchOpenFileMongo.setId(IdGenerator.uuid32()) - .setFilePath(analysisFile.getFileUrl()) .setFileName(analysisFile.getName()) + .setFilePath(analysisFile.getFileUrl()) .setFeatureSimilarity(featureSimilarity.floatValue()) .setOpenRate(openRate.floatValue()) .setOpenType(analysisFile.getOpenType()) - .setMatchOpenFile(matchOpenFilesRes); - + .setAnalysisType(AnalysisLevelEnum.BLOCK_LEVEL.getCode()) + .setSubMatchOpenFiles(matchOpenFilesRes); mongoTemplate.save(matchOpenFileMongo); } @@ -241,6 +246,13 @@ public class CodeBlockAnalysisTask extends IAnalysisTask { //按照特征行进行分组,一次匹配中,将所有的特征行进行累加 Map traitsFeatureMd5AndFeatureLineNumMap = getTraitsFeatureMd5AndFeatureLineNumMap(fileAnalysisRes.getLine_hay()); + + //被测件文本内容 + String sourcefileContent= FileUtil.readUtf8String(analysisFile.getFileUrl()); + + //将文本内容解析成行信息,用于后续文件的开源率计算 + List analysisFileLineInfo = SimilarityUtil.getSplitWords(sourcefileContent); + for (SolrDocument matchFile : matchOpenFiles) { //开源文件md5 @@ -280,7 +292,7 @@ public class CodeBlockAnalysisTask extends IAnalysisTask { } //当前文件的开源率 - Pair> openRateAndSaveRowNum = getOpenRateAndSaveRowNum(fileAnalysisRes.getSourceFileContent(), openSourceContent.getFieldValue("sourceContent").toString()); + Pair> openRateAndSaveRowNum = getOpenRateAndSaveRowNum(analysisFileLineInfo, openSourceContent.getFieldValue("sourceContent").toString()); //将当前文件匹配的行号,存储到缓存中,方便统计整体的开源率 matchLineRowsNum.addAll(openRateAndSaveRowNum.getValue()); @@ -295,16 +307,21 @@ public class CodeBlockAnalysisTask extends IAnalysisTask { continue; } + String openFilePath = (String) openEntries.get("fullPath"); //组装当前开源文件的开源项目信息 MatchOpenFile matchOpenFileInfo = new MatchOpenFile(); - matchOpenFileInfo.setPId(versionInfo.getProId()) + matchOpenFileInfo.setId(IdGenerator.uuid32()) + .setFileName(FileUtil.getName(openFilePath)) .setPName(versionInfo.getProName()) - .setSourceUrl((String) openEntries.get("fullPath")) - .setFeatureSimilarity(featureSimilarity.floatValue()) - .setOpenRate(openRateAndSaveRowNum.getKey()) + .setPId(versionInfo.getProId()) .setVersion(versionInfo.getVersionName()) + .setVersionId(versionInfo.getVersionId()) + .setSourceFilePath(openFilePath) + .setSourceUrl(versionInfo.getDownUrl()) .setLicenseType(versionInfo.getLicenseType()) - .setAnalyzeType(AnalysisLevelEnum.BLOCK_LEVEL.getCode()); + .setFeatureSimilarity(featureSimilarity.floatValue()) + .setOpenRate(openRateAndSaveRowNum.getKey()) + .setMd5(openSourceFileMd5); matchOpenFilesRes.add(matchOpenFileInfo); } return matchOpenFilesRes; diff --git a/src/main/java/com/keyware/composeanalysis/task/FileAnalysisTask.java b/src/main/java/com/keyware/composeanalysis/task/FileAnalysisTask.java index 339d0b8..97fb6c3 100644 --- a/src/main/java/com/keyware/composeanalysis/task/FileAnalysisTask.java +++ b/src/main/java/com/keyware/composeanalysis/task/FileAnalysisTask.java @@ -1,5 +1,7 @@ package com.keyware.composeanalysis.task; +import cn.hutool.core.collection.CollUtil; +import cn.hutool.core.io.FileUtil; import cn.hutool.core.lang.Pair; import com.keyware.common.constant.enums.AnalysisStatusEnum; import com.keyware.composeanalysis.constant.FixedValue; @@ -25,11 +27,8 @@ import org.springframework.data.mongodb.core.query.Update; import java.io.IOException; import java.math.BigDecimal; import java.math.RoundingMode; -import java.nio.file.Files; -import java.nio.file.Paths; import java.util.*; import java.util.concurrent.CountDownLatch; -import java.util.function.Function; import java.util.stream.Collectors; import static org.springframework.data.mongodb.core.query.Criteria.where; @@ -140,6 +139,10 @@ public class FileAnalysisTask extends IAnalysisTask { */ private void ananlyzeFileOpenRate(SolrDocumentList openSourceFileList, CodeFile fileAnalysisRes) { + if (CollUtil.isEmpty(openSourceFileList)){ + return; + } + HashSet openLineNum = new HashSet<>(); //计算每个文件的开源率和特征相似度 @@ -148,7 +151,7 @@ public class FileAnalysisTask extends IAnalysisTask { //获取开源率阈值,判断当前文件是否开源 Integer openRateThreshold = analysisTask.getOpenRateThreshold(); - BigDecimal openRate = new BigDecimal(openLineNum.size()).divide(fileAnalysisRes.getCodeRowNum(), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2); + BigDecimal openRate = new BigDecimal(openLineNum.size()).divide(new BigDecimal(analysisFile.getCodeRowNum()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2); //超过阈值,则认为当前文件是开源文件 if (openRate.compareTo(new BigDecimal(openRateThreshold)) > 0) { @@ -163,17 +166,18 @@ public class FileAnalysisTask extends IAnalysisTask { //保存当前文件的开源信息到mongo库中 - MatchOpenFileMongoDto matchOpenFileMongo = new MatchOpenFileMongoDto(); - matchOpenFileMongo.setId(IdGenerator.uuid32()) - .setFilePath(analysisFile.getFileUrl()) + MatchOpenFileMongoDto matchOpenFileInfo = new MatchOpenFileMongoDto(); + matchOpenFileInfo.setId(IdGenerator.uuid32()) .setFileName(analysisFile.getName()) + .setFilePath(analysisFile.getFileUrl()) + .setOpenType(true) .setFeatureSimilarity(100.00f) .setOpenRate(openRate.floatValue()) - .setOpenType(analysisFile.getOpenType()) - .setMatchOpenFile(matchOpenFilesRes); + .setAnalysisType(AnalysisLevelEnum.FILE_LEVEL.getCode()) + .setSubMatchOpenFiles(matchOpenFilesRes); //保存当前开源信息数据 - mongoTemplate.insert(matchOpenFileMongo); + mongoTemplate.insert(matchOpenFileInfo); } @@ -202,6 +206,13 @@ public class FileAnalysisTask extends IAnalysisTask { List versionTrees = solrUtils.queryBatchVersionInfoByVersionIds(openSourceFileVersionIds); Map versionIdVersionInfoMap = versionTrees.stream().collect(Collectors.toMap(VersionTree::getVersionId, java.util.function.Function.identity())); + + //被测件文本内容 + String sourcefileContent= FileUtil.readUtf8String(analysisFile.getFileUrl()); + + //将文本内容解析成行信息,用于后续文件的开源率计算 + List analysisFileLineInfo = SimilarityUtil.getSplitWords(sourcefileContent); + for (SolrDocument openSourceFile : matchOpenFiles) { //开源文件md5 @@ -210,7 +221,7 @@ public class FileAnalysisTask extends IAnalysisTask { String openFileContent = solrUtils.getOpenFileContentByMd5(openSourceFileMd5); //当前文件的开源率 - Pair> openRateAndSaveRowNum = SimilarityUtil.getOpenRateAndSaveRowNum(fileAnalysisRes.getSourceFileContent(), openFileContent); + Pair> openRateAndSaveRowNum = SimilarityUtil.getOpenRateAndSaveRowNum(analysisFileLineInfo, openFileContent); //将当前文件匹配的行号,存储到缓存中,方便统计整体的开源率 matchLineRowsNum.addAll(openRateAndSaveRowNum.getValue()); @@ -220,17 +231,21 @@ public class FileAnalysisTask extends IAnalysisTask { log.error("找不到开源文件版本信息,versionId:{}", openEntries.get("versionId")); } + String openFilePath = (String) openEntries.get("fullPath"); //组装当前开源文件的开源项目信息 - MatchOpenFile matchOpenFileInfo = new MatchOpenFile(); - matchOpenFileInfo.setPId(versionInfo.getProId()) + MatchOpenFile matchOpenFile = new MatchOpenFile(); + matchOpenFile.setId(IdGenerator.uuid32()) + .setFileName(FileUtil.getName(openFilePath)) .setPName(versionInfo.getProName()) - .setSourceUrl((String) openEntries.get("fullPath")) - .setFeatureSimilarity(100.00f) - .setOpenRate(openRateAndSaveRowNum.getKey()) + .setPId(versionInfo.getProId()) .setVersion(versionInfo.getVersionName()) + .setVersionId(versionInfo.getVersionId()) + .setSourceFilePath(openFilePath) + .setSourceUrl(versionInfo.getDownUrl()) .setLicenseType(versionInfo.getLicenseType()) - .setAnalyzeType(AnalysisLevelEnum.FILE_LEVEL.getCode()); - matchOpenFilesRes.add(matchOpenFileInfo); + .setFeatureSimilarity(100.00f) + .setOpenRate(openRateAndSaveRowNum.getKey()); + matchOpenFilesRes.add(matchOpenFile); } return matchOpenFilesRes; } diff --git a/src/main/java/com/keyware/composeanalysis/task/FunctionAnalysisTask.java b/src/main/java/com/keyware/composeanalysis/task/FunctionAnalysisTask.java index e4f9884..40c5118 100644 --- a/src/main/java/com/keyware/composeanalysis/task/FunctionAnalysisTask.java +++ b/src/main/java/com/keyware/composeanalysis/task/FunctionAnalysisTask.java @@ -3,9 +3,11 @@ package com.keyware.composeanalysis.task; import cn.hutool.core.collection.CollUtil; import cn.hutool.core.collection.CollectionUtil; +import cn.hutool.core.io.FileUtil; import cn.hutool.core.lang.Pair; import com.alibaba.fastjson.JSONArray; import com.keyware.common.constant.enums.AnalysisStatusEnum; +import com.keyware.common.exception.BusinessException; import com.keyware.composeanalysis.constant.FixedValue; import com.keyware.composeanalysis.constant.RedisConst; import com.keyware.composeanalysis.constant.SolrDBConst; @@ -16,6 +18,7 @@ import com.keyware.composeanalysis.mongo.FileDataMongoDto; import com.keyware.composeanalysis.mongo.LineDataMongoDto; import com.keyware.composeanalysis.mongo.MatchOpenFile; import com.keyware.composeanalysis.mongo.MatchOpenFileMongoDto; +import com.keyware.composeanalysis.solr.FunctionInfo; import com.keyware.composeanalysis.solr.VersionTree; import com.keyware.composeanalysis.util.*; import com.keyware.keyswan.common.LineModel; @@ -101,7 +104,11 @@ public class FunctionAnalysisTask extends IAnalysisTask { String sourceFileBaseCoreName = FixedValue.SUFFIX_SOLR_VERSION.get(analysisFile.getSuffix()); //根据文件的名称获取函数解析器 - Analysis analysis = AnalysisFactory.getAnalysis(filePath); + Analysis analysis = AnalysisFactory.getAnalysis(fileName); + + if (analysis == null){ + throw new BusinessException("获取文件解析器失败,文件名称:"+fileName); + } //解析文件 CodeFile codeFile = analysis.analysisFile(new FileInputStream(filePath)); @@ -123,7 +130,7 @@ public class FunctionAnalysisTask extends IAnalysisTask { log.info("文件" + fileName + ":函数级分析完成"); } catch (Exception e) { AnalysisLogUtil.insertErrorInfo(mongoTemplate, "【函数级级分析】失败" + fileName, e); - log.error("文件:" + fileName + "函数级别特征提取失败!", e); + log.error("文件:" + fileName + "【函数级级分析】失败!", e); //修改当前文件分析状态未失败 mongoTemplate.update(FileDataMongoDto.class) .matching(where("_id").is(analysisFile.getId())) @@ -152,7 +159,7 @@ public class FunctionAnalysisTask extends IAnalysisTask { Map> featureMd5FunctionMap = fileAnalysisRes.getFunctionList().stream().collect(Collectors.groupingBy(Function::getMd5)); //函数代码总函数 - int totalFunctionLineCount = fileAnalysisRes.getFunctionList().stream().mapToInt(Function::getCodeRowNum).sum(); + BigDecimal totalFunctionLineCount = new BigDecimal(fileAnalysisRes.getFunctionList().stream().mapToInt(Function::getCodeRowNum).sum()); //匹配到的特征函数Md5 Set matchFeatureFunctionMd5s = new HashSet(); @@ -169,7 +176,7 @@ public class FunctionAnalysisTask extends IAnalysisTask { matchFunctionLineCount += featureMd5FunctionMap.get(matchFeatureFunctionMd5).stream().mapToInt(Function::getCodeRowNum).sum(); } - BigDecimal featureSimilarity = new BigDecimal(matchFunctionLineCount).divide(new BigDecimal(totalFunctionLineCount), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2); + BigDecimal featureSimilarity = new BigDecimal(matchFunctionLineCount).divide(totalFunctionLineCount, 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2); //计算文件的总体开源率 BigDecimal openRate = new BigDecimal(matchOpenLineRowsNum.size()).divide(new BigDecimal(analysisFile.getCodeRowNum()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2); @@ -182,16 +189,19 @@ public class FunctionAnalysisTask extends IAnalysisTask { analysisFile.setOpenType(true); } + //保存当前文件开源行数 + analysisFile.setOpenLineCount(matchOpenLineRowsNum.size()); + //保存当前文件的开源信息到mongo库中 MatchOpenFileMongoDto matchOpenFileMongo = new MatchOpenFileMongoDto(); matchOpenFileMongo.setId(IdGenerator.uuid32()) - .setFilePath(analysisFile.getFileUrl()) .setFileName(analysisFile.getName()) + .setFilePath(analysisFile.getFileUrl()) + .setOpenType(analysisFile.getOpenType()) .setFeatureSimilarity(featureSimilarity.floatValue()) .setOpenRate(openRate.floatValue()) - .setOpenType(analysisFile.getOpenType()) - .setMatchOpenFile(matchOpenFilesRes); - + .setAnalysisType(AnalysisLevelEnum.FUNCTION_LEVEL.getCode()) + .setSubMatchOpenFiles(matchOpenFilesRes); mongoTemplate.save(matchOpenFileMongo); } @@ -226,13 +236,20 @@ public class FunctionAnalysisTask extends IAnalysisTask { //函数总行数 BigDecimal totalFunctionLineCount = new BigDecimal(fileAnalysisRes.getFunctionList().stream().mapToInt(Function::getCodeRowNum).sum()); + + //被测件文本内容 + String sourcefileContent= FileUtil.readUtf8String(analysisFile.getFileUrl()); + + //将文本内容解析成行信息,用于后续文件的开源率计算 + List analysisFileLineInfo = SimilarityUtil.getSplitWords(sourcefileContent); + for (SolrDocument openSourceFile : matchOpenFiles) { //开源文件md5 String openSourceFileMd5 = openSourceFile.getFieldValue("sourceMd5").toString(); //解析文件的函数特征值 - List openFileFunctionList = getOpenFileFunctionList(openSourceFile); + List openFileFunctionList = getOpenFileFunctionList(openSourceFile); //根据源文件的MD5确定需要查询源码库的序号 String openSourceCodeCoreIndex = openSourceFileMd5.substring(0, 1) + SolrDBConst.CORE_NAME_SUFFIX_SOURCE_FILE_INFO; @@ -250,8 +267,8 @@ public class FunctionAnalysisTask extends IAnalysisTask { for (String funFeatureMd5 : featureMd5FunctionMap.keySet()) { List currentFueatureFunctionList = featureMd5FunctionMap.get(funFeatureMd5); //源文件的特征函数列表 - for (Function openFunction : openFileFunctionList) { - if (funFeatureMd5.equals(openFunction.getMd5())) { + for (FunctionInfo openFunction : openFileFunctionList) { + if (funFeatureMd5.equals(openFunction.getTraitFunMd5())) { //每个特征函数 不能多次匹配,影响整体特征相似度 //匹配成功后,相同的特征行 一并加上 if (!currentFileMatchFeatureFunctionMd5.contains(funFeatureMd5)) { @@ -264,7 +281,7 @@ public class FunctionAnalysisTask extends IAnalysisTask { } //当前文件的开源率 - Pair> openRateAndSaveRowNum = SimilarityUtil.getOpenRateAndSaveRowNum(new String(fileAnalysisRes.getFileContent()), openSourceContent.getFieldValue("sourceContent").toString()); + Pair> openRateAndSaveRowNum = SimilarityUtil.getOpenRateAndSaveRowNum(analysisFileLineInfo, openSourceContent.getFieldValue("sourceContent").toString()); //将当前文件匹配的行号,存储到缓存中,方便统计整体的开源率 matchLineRowsNum.addAll(openRateAndSaveRowNum.getValue()); @@ -273,16 +290,22 @@ public class FunctionAnalysisTask extends IAnalysisTask { SolrDocument openEntries = md5VersionInfoMap.get(openSourceFileMd5); VersionTree versionInfo = versionIdVersionInfoMap.get(openEntries.get("versionId")); + String openFilePath = (String) md5VersionInfoMap.get(openSourceFileMd5).getFieldValue("fullPath"); + //组装当前开源文件的开源项目信息 MatchOpenFile matchOpenFileInfo = new MatchOpenFile(); - matchOpenFileInfo.setPId(versionInfo.getProId()) + matchOpenFileInfo.setId(IdGenerator.uuid32()) + .setFileName(FileUtil.getName(openFilePath)) .setPName(versionInfo.getProName()) - .setSourceUrl((String) openEntries.get("fullPath")) - .setFeatureSimilarity(featureSimilarity.floatValue()) - .setOpenRate(openRateAndSaveRowNum.getKey()) + .setPId(versionInfo.getProId()) .setVersion(versionInfo.getVersionName()) + .setVersionId(versionInfo.getVersionId()) + .setSourceFilePath(openFilePath) + .setSourceUrl(versionInfo.getDownUrl()) .setLicenseType(versionInfo.getLicenseType()) - .setAnalyzeType(AnalysisLevelEnum.FUNCTION_LEVEL.getCode()); + .setFeatureSimilarity(featureSimilarity.floatValue()) + .setOpenRate(openRateAndSaveRowNum.getKey()) + .setMd5(openSourceFileMd5); matchOpenFilesRes.add(matchOpenFileInfo); } return matchOpenFilesRes; @@ -316,18 +339,20 @@ public class FunctionAnalysisTask extends IAnalysisTask { * @param matchOpenFile * @return */ - private List getOpenFileFunctionList(SolrDocument matchOpenFile) { + private List getOpenFileFunctionList(SolrDocument matchOpenFile) { try { //解析文件的函数特征值 String lineFeatureMd5s = matchOpenFile.getFieldValue("fun_hay").toString(); - lineFeatureMd5s = lineFeatureMd5s.replace("\\", "") + lineFeatureMd5s = lineFeatureMd5s + .replace("\\\\\\\"", "") + .replace("\\", "") .replace("\"{", "{") .replace("}\"", "}"); - return JSONArray.parseArray(lineFeatureMd5s, Function.class); + return JSONArray.parseArray(lineFeatureMd5s, FunctionInfo.class); } catch (Exception e) { log.error("解析文件特征值失败", e); } - return new ArrayList(); + return new ArrayList<>(); } /** diff --git a/src/main/java/com/keyware/composeanalysis/task/LineAnalysisTask.java b/src/main/java/com/keyware/composeanalysis/task/LineAnalysisTask.java index ed0588c..4489aed 100644 --- a/src/main/java/com/keyware/composeanalysis/task/LineAnalysisTask.java +++ b/src/main/java/com/keyware/composeanalysis/task/LineAnalysisTask.java @@ -2,6 +2,7 @@ package com.keyware.composeanalysis.task; import cn.hutool.core.collection.CollectionUtil; +import cn.hutool.core.io.FileUtil; import cn.hutool.core.lang.Pair; import cn.hutool.core.util.StrUtil; import com.keyware.common.constant.enums.AnalysisStatusEnum; @@ -170,6 +171,9 @@ public class LineAnalysisTask extends IAnalysisTask { analysisFile.setOpenType(true); } + //保存当前文件开源行数 + analysisFile.setOpenLineCount(matchLineRowsNum.size()); + //保存当前文件的开源信息到mongo库中 MatchOpenFileMongoDto matchOpenFileMongo = new MatchOpenFileMongoDto(); matchOpenFileMongo.setId(IdGenerator.uuid32()) @@ -178,8 +182,8 @@ public class LineAnalysisTask extends IAnalysisTask { .setFeatureSimilarity(featureSimilarity.floatValue()) .setOpenRate(openRate.floatValue()) .setOpenType(analysisFile.getOpenType()) - .setMatchOpenFile(matchOpenFilesRes); - log.info("文件" + analysisFile.getName() + ":开源率:" + openRate.floatValue() + ",特征相似度:" + featureSimilarity.floatValue()); + .setAnalysisType(AnalysisLevelEnum.LINE_LEVEL.getCode()) + .setSubMatchOpenFiles(matchOpenFilesRes); mongoTemplate.save(matchOpenFileMongo); } @@ -211,6 +215,12 @@ public class LineAnalysisTask extends IAnalysisTask { String traitFileLineMd5 = fileAnalysisRes.getTraitFileLineMd5(); List lineFeatureList = Arrays.asList(traitFileLineMd5.split(",")); + //被测件文本内容 + String sourcefileContent= FileUtil.readUtf8String(analysisFile.getFileUrl()); + + //将文本内容解析成行信息,用于后续文件的开源率计算 + List analysisFileLineInfo = SimilarityUtil.getSplitWords(sourcefileContent); + for (SolrDocument openSourceFile : matchOpenFiles) { //开源文件MD5 @@ -239,7 +249,7 @@ public class LineAnalysisTask extends IAnalysisTask { } //当前文件的开源率 - Pair> openRateAndSaveRowNum = SimilarityUtil.getOpenRateAndSaveRowNum(fileAnalysisRes.getSourceFileContent(), openSourceContent); + Pair> openRateAndSaveRowNum = SimilarityUtil.getOpenRateAndSaveRowNum(analysisFileLineInfo, openSourceContent); //将当前文件匹配的行号,存储到缓存中,方便统计整体的开源率 matchLineRowsNum.addAll(openRateAndSaveRowNum.getValue()); @@ -254,16 +264,21 @@ public class LineAnalysisTask extends IAnalysisTask { continue; } + String openFilePath = (String) openEntries.get("fullPath"); //组装当前开源文件的开源项目信息 MatchOpenFile matchOpenFileInfo = new MatchOpenFile(); - matchOpenFileInfo.setPId(versionInfo.getProId()) + matchOpenFileInfo.setId(IdGenerator.uuid32()) + .setFileName(FileUtil.getName(openFilePath)) .setPName(versionInfo.getProName()) - .setSourceUrl((String) openEntries.get("fullPath")) - .setFeatureSimilarity(featureSimilarity.floatValue()) - .setOpenRate(openRateAndSaveRowNum.getKey()) + .setPId(versionInfo.getProId()) .setVersion(versionInfo.getVersionName()) + .setVersionId(versionInfo.getVersionId()) + .setSourceFilePath(openFilePath) + .setSourceUrl(versionInfo.getDownUrl()) .setLicenseType(versionInfo.getLicenseType()) - .setAnalyzeType(AnalysisLevelEnum.FUNCTION_LEVEL.getCode()); + .setFeatureSimilarity(featureSimilarity.floatValue()) + .setOpenRate(openRateAndSaveRowNum.getKey()) + .setMd5(openSourceFileMd5); matchOpenFilesRes.add(matchOpenFileInfo); } return matchOpenFilesRes; diff --git a/src/main/java/com/keyware/composeanalysis/task/PorjectAnalysisTask.java b/src/main/java/com/keyware/composeanalysis/task/PorjectAnalysisTask.java index 40561e1..8306735 100644 --- a/src/main/java/com/keyware/composeanalysis/task/PorjectAnalysisTask.java +++ b/src/main/java/com/keyware/composeanalysis/task/PorjectAnalysisTask.java @@ -1,6 +1,7 @@ package com.keyware.composeanalysis.task; import cn.hutool.core.collection.CollectionUtil; +import cn.hutool.core.io.FileUtil; import com.google.common.collect.Sets; import com.keyware.common.constant.enums.AnalysisStatusEnum; import com.keyware.composeanalysis.constant.FixedValue; @@ -14,14 +15,12 @@ import com.keyware.composeanalysis.solr.VersionTree; import com.keyware.composeanalysis.solr.VersionTreeNode; import com.keyware.composeanalysis.util.AnalysisLogUtil; import com.keyware.composeanalysis.util.SolrUtils; -import com.keyware.composeanalysis.util.SpringContextUtils; import com.keyware.utils.IdGenerator; import com.mongodb.client.MongoClient; import lombok.extern.log4j.Log4j2; import org.apache.commons.collections.CollectionUtils; import org.apache.commons.lang3.StringUtils; import org.apache.solr.common.SolrDocument; -import org.springframework.core.task.TaskExecutor; import org.springframework.data.mongodb.core.MongoTemplate; import org.springframework.data.mongodb.core.query.Query; import org.springframework.data.mongodb.core.query.Update; @@ -48,7 +47,6 @@ public class PorjectAnalysisTask { private AnalysisTask analysisTask; private AnalysisTaskServiceImpl analysisService; private SolrUtils solrUtils; - private TaskExecutor taskExecutor; /** * 项目级分析 @@ -64,7 +62,6 @@ public class PorjectAnalysisTask { this.mongoTemplate = new MongoTemplate(mongoClient, MongoDBConst.DB_NAME_PREFIX + analysisTask.getId()); this.analysisTask = analysisTask; this.solrUtils = solrUtils; - this.taskExecutor = SpringContextUtils.getBean(TaskExecutor.class); } @@ -123,8 +120,6 @@ public class PorjectAnalysisTask { //当前文件开源信息存入数据库中 mongoTemplate.insert(projectAssembly); - analysisService.updateById(analysisTask); - //更新文件分析的状态 mongoTemplate.update(FileDataMongoDto.class) .matching(where("isDirectory").is(false)) @@ -196,12 +191,13 @@ public class PorjectAnalysisTask { Map md5VersionIdMap = solrUtils.batchQueryVersionIdFromSourceFileBaseBySourceMd5(MongoDBConst.TABLE_NAME_SOURCE_FILE_BASE, fileMd5s); if (md5VersionIdMap == null || md5VersionIdMap.isEmpty()) { //如果没有匹配到,直接更新文件分析状态已完成,因为非32种语言的文件,无法进行解析,通过源文件的MD5匹配不到,就匹配不到了,无需进行下一步的匹配 - updateFileAnalysisStatus(fileMd5s); + updateFileAnalysisStatus(fileMd5s,true); return; } saveMatchOpenFileInfo(md5VersionIdMap, otherLanguageFiles); - //直接更改没有匹配的文件分析状态 - updateFileAnalysisStatus(Sets.difference(fileMd5s, md5VersionIdMap.keySet())); + //直接更改没有匹配的文件分析状态,因为没有匹配上的文件,无法进行下一步的匹配(下一步的匹配只针对32种主流的语言) + Set notMatchFileMd5s = Sets.difference(fileMd5s, md5VersionIdMap.keySet()); + updateFileAnalysisStatus(notMatchFileMd5s,false); } } @@ -231,7 +227,7 @@ public class PorjectAnalysisTask { if (CollectionUtils.isNotEmpty(batchInsertCache)) { mongoTemplate.insert(batchInsertCache, MatchOpenFileMongoDto.class); //更新文件分析的状态 - updateFileAnalysisStatus(md5VersionIdMap.keySet()); + updateFileAnalysisStatus(md5VersionIdMap.keySet(),true); } } @@ -266,7 +262,7 @@ public class PorjectAnalysisTask { } //更新文件分析的状态 - updateFileAnalysisStatus(matchedMd5s); + updateFileAnalysisStatus(matchedMd5s,true); } //获取匹配到的开源文件信息 @@ -274,14 +270,15 @@ public class PorjectAnalysisTask { //设置匹配文件的信息 MatchOpenFile matchOpenFile = new MatchOpenFile(); matchOpenFile.setId(IdGenerator.uuid32()) + .setFileName(FileUtil.getName(openFilePath)) + .setPName(versionInfo.getProName()) + .setPId(versionInfo.getProId()) + .setVersion(versionInfo.getVersionName()) .setVersionId(versionInfo.getVersionId()) .setSourceFilePath(openFilePath) .setSourceUrl(versionInfo.getDownUrl()) - .setPId(versionInfo.getProId()) - .setPName(versionInfo.getProName()) .setLicenseType(versionInfo.getLicenseType()) - .setAnalyzeType(AnalysisLevelEnum.FILE_LEVEL.getCode()) - .setVersion(versionInfo.getVersionName()) + .setMd5(originalFile.getMd5()) .setFeatureSimilarity(100.00f) .setOpenRate(100.00f); @@ -293,7 +290,8 @@ public class PorjectAnalysisTask { .setOpenType(true) .setFeatureSimilarity(100.00f) .setOpenRate(100.00f) - .setMatchOpenFile(Arrays.asList(matchOpenFile)); + .setAnalysisType(AnalysisLevelEnum.FILE_LEVEL.getCode()) + .setSubMatchOpenFiles(Arrays.asList(matchOpenFile)); return matchOpenFileInfo; } @@ -325,8 +323,8 @@ public class PorjectAnalysisTask { break; } - //异步保存匹配的开源文件信息 - taskExecutor.execute(() -> saveProjectOpenInfo(openProject, projectFiles)); + //保存匹配的开源文件信息 + saveProjectOpenInfo(openProject, projectFiles); //获取开源项目的所有文件md5集合 List openFilesMd5 = openProject.getDirTree().stream().map(VersionTreeNode::getSourceFileMd5).collect(Collectors.toList()); @@ -339,8 +337,8 @@ public class PorjectAnalysisTask { //计算与当前项目的相似度 BigDecimal semblance = new BigDecimal(matchedFiles.size()).divide(new BigDecimal(projectFilesMd5.size()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)); - //当相似度小于30%,不保存项目级的信息 - if (semblance.compareTo(new BigDecimal(30)) < 0){ + //当相似度小于20%,不保存项目级的信息 + if (semblance.compareTo(new BigDecimal(20)) < 0){ break; } @@ -366,11 +364,11 @@ public class PorjectAnalysisTask { } //更新文件分析的状态 - private void updateFileAnalysisStatus(Set fileMd5Set) { + private void updateFileAnalysisStatus(Set fileMd5Set, Boolean openType) { mongoTemplate.update(FileDataMongoDto.class) .matching(where("md5").in(fileMd5Set)) - .apply(new Update().set("openType", true) - .set("openRate", 100.00f) + .apply(new Update().set("openType", openType) + .set("openRate", openType ? 100.00f : 0f) .set("fileAnalysisStatus", FileAnalysisStatusEnum.ANALYSIS_DONE.getCode())) .all(); } diff --git a/src/main/java/com/keyware/composeanalysis/util/SimilarityUtil.java b/src/main/java/com/keyware/composeanalysis/util/SimilarityUtil.java index cbc1565..8f57954 100644 --- a/src/main/java/com/keyware/composeanalysis/util/SimilarityUtil.java +++ b/src/main/java/com/keyware/composeanalysis/util/SimilarityUtil.java @@ -1,5 +1,6 @@ package com.keyware.composeanalysis.util; +import cn.hutool.core.collection.CollUtil; import cn.hutool.core.lang.Pair; import cn.hutool.core.util.ArrayUtil; import cn.hutool.core.util.ByteUtil; @@ -54,65 +55,6 @@ public class SimilarityUtil { } - /** - * 获取开源率和开源行号 - * @param analysisFile 被测件内容 - * @param openSourceFile 开源文件内容 - * @return - */ -// public static Pair> getOpenRateAndSaveRowNum(String analysisFile, String openSourceFile) { -// if (StrUtil.hasBlank(analysisFile,openSourceFile)){ -// return new Pair<>(0.00f,new HashSet<>()); -// } -// //匹配到的行号 -// HashSet matchedRowsNum = new HashSet<>(); -// -// //被测件文件行 -// List analysisFileLineInfo = getSplitWords(analysisFile); -// -// //溯源到文件行 -// HashSet openSourceFileLineInfo = getSplitWords1(openSourceFile); -// -// for (int i = 0; i < analysisFileLineInfo.size(); i++) { -// String sent1Word = analysisFileLineInfo.get(i); -// if (openSourceFileLineInfo.contains(sent1Word)) { -// matchedRowsNum.add(i); -// } -// } -// -// //计算开源率 -// BigDecimal openRate = new BigDecimal(matchedRowsNum.size()).divide(new BigDecimal(analysisFileLineInfo.size()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2); -// -// return new Pair<>(openRate.toString(), matchedRowsNum); -// } - - -// public static Pair> getOpenRateAndSaveRowNum(byte[] analysisFile, byte[] openSourceFile) { -// if (ArrayUtil.hasNull(analysisFile,openSourceFile)){ -// return new Pair<>(0.00f,new HashSet<>()); -// } -// //匹配到的行号 -// HashSet matchedRowsNum = new HashSet<>(); -// -// //被测件文件行 -// List analysisFileLineInfo = getSplitWords(new String(analysisFile)); -// -// //溯源到文件行 -// HashSet openSourceFileLineInfo = getSplitWords1(new String(openSourceFile)); -// -// for (int i = 0; i < analysisFileLineInfo.size(); i++) { -// String sent1Word = analysisFileLineInfo.get(i); -// if (openSourceFileLineInfo.contains(sent1Word)) { -// matchedRowsNum.add(i); -// } -// } -// -// //计算开源率 -// BigDecimal openRate = new BigDecimal(matchedRowsNum.size()).divide(new BigDecimal(analysisFileLineInfo.size()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2); -// -// return new Pair<>(openRate.toString(), matchedRowsNum); -// } - public static Pair> getOpenRateAndSaveRowNum(String analysisFile, String openSourceFile) { if (StrUtil.hasBlank(analysisFile,openSourceFile)){ return new Pair<>(0.00f,new HashSet<>()); @@ -140,6 +82,35 @@ public class SimilarityUtil { } + public static Pair> getOpenRateAndSaveRowNum(List analysisFileLineInfo , String openSourceFile) { + if (CollUtil.isEmpty(analysisFileLineInfo) || StrUtil.isBlank(openSourceFile)){ + return new Pair<>(0.00f,new HashSet<>()); + } + + //匹配到的行号 + HashSet matchedRowsNum = new HashSet<>(); + + //溯源到文件行 + HashSet openSourceFileLineInfo = getSplitWords1(openSourceFile); + + for (int i = 0; i < analysisFileLineInfo.size(); i++) { + String sent1Word = analysisFileLineInfo.get(i); + if (openSourceFileLineInfo.contains(sent1Word)) { + matchedRowsNum.add(i); + } + } + + //计算开源率 + BigDecimal openRate = new BigDecimal(matchedRowsNum.size()).divide(new BigDecimal(analysisFileLineInfo.size()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2); + + return new Pair<>(openRate.floatValue(), matchedRowsNum); + } + + + + + + /** * 获得两个文件的相似度,并将被匹配的行 * @param matchLineInfos 被匹配的行信息 diff --git a/src/main/resources/application.yaml b/src/main/resources/application.yaml index fc22299..e9f435a 100644 --- a/src/main/resources/application.yaml +++ b/src/main/resources/application.yaml @@ -1,12 +1,14 @@ spring: + application: + name: compose-analysis cloud: nacos: discovery: - server-addr: 172.16.36.7:8848 - namespace: 2fad0ca9-bc32-4afd-9f2e-ebc133d5e781 + server-addr: 127.0.0.1:8848 + namespace: 4ce70f33-8b88-4931-a88c-2b68e7259bd7 config: - server-addr: 172.16.36.7:8848 - namespace: 2fad0ca9-bc32-4afd-9f2e-ebc133d5e781 + server-addr: 127.0.0.1:8848 + namespace: 4ce70f33-8b88-4931-a88c-2b68e7259bd7 file-extension: yaml config: import: nacos:compose-analysis-dev.yaml