package com.keyware.composeanalysis.task; import cn.hutool.core.collection.CollectionUtil; import cn.hutool.core.lang.Pair; import cn.hutool.core.util.ObjUtil; import com.alibaba.fastjson.JSONArray; import com.keyware.composeanalysis.constant.FixedValue; import com.keyware.composeanalysis.constant.RedisConst; import com.keyware.composeanalysis.constant.SolrDBConst; import com.keyware.composeanalysis.constant.enums.AnalysisLevelEnum; import com.keyware.composeanalysis.constant.enums.AnalysisStatusEnum; import com.keyware.composeanalysis.constant.enums.FileAnalysisStatusEnum; import com.keyware.composeanalysis.entity.AnalysisTask; import com.keyware.composeanalysis.mongo.FileDataMongoDto; import com.keyware.composeanalysis.mongo.LineDataMongoDto; import com.keyware.composeanalysis.mongo.MatchOpenFile; import com.keyware.composeanalysis.mongo.MatchOpenFileMongoDto; import com.keyware.composeanalysis.solr.VersionTree; import com.keyware.composeanalysis.util.*; import com.keyware.keyswan.common.LineModel; import com.keyware.keyware.anaysis.Analysis; import com.keyware.keyware.anaysis.AnalysisFactory; import com.keyware.keyware.common.CodeFile; import com.keyware.keyware.common.Function; import com.keyware.utils.IdGenerator; import lombok.extern.log4j.Log4j2; import org.apache.commons.lang3.StringUtils; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocumentList; import org.springframework.data.mongodb.core.MongoTemplate; import org.springframework.data.mongodb.core.query.Update; import java.io.FileInputStream; import java.math.BigDecimal; import java.math.RoundingMode; import java.util.*; import java.util.concurrent.CountDownLatch; import java.util.stream.Collectors; import java.util.stream.Stream; import static org.springframework.data.mongodb.core.query.Criteria.where; /** * @author liuzongren * @ClassName LineAnalysisTask * @description: 函数级别溯源 任务 * @datetime 2024年 07月 25日 16:19 * @version: 1.0 */ @Log4j2 public class FunctionAnalysisTask extends IAnalysisTask { private MongoTemplate mongoTemplate; private AnalysisTask analysisTask; //被测件的文件信息 private FileDataMongoDto analysisFile; private SolrUtils solrUtils; private RedisUtil redisUtil; private CountDownLatch countDownLatch; public FunctionAnalysisTask(AnalysisTask analysisTask, FileDataMongoDto analysisFile, MongoTemplate mongoTemplate, CountDownLatch countDownLatch) { this.mongoTemplate = mongoTemplate; this.analysisTask = analysisTask; this.analysisFile = analysisFile; this.countDownLatch = countDownLatch; this.solrUtils = SpringContextUtils.getBean(SolrUtils.class); this.redisUtil = SpringContextUtils.getBean(RedisUtil.class); } /** * 方法 或者代码块 级别 源代码溯源 * 当前任务 需要在 文件级分析完成后 进行 */ @Override public void run() { //执行任务前,判断一下任务执行的状态 Object status = redisUtil.get(String.format(RedisConst.TASK_RUNNING_STATUS_KEY_PREFIX, analysisTask.getId())); if (status != null && (status.equals(AnalysisStatusEnum.STOP_ANALYSIS.getCode()) || status.equals(AnalysisStatusEnum.PAUSE_ANALYSIS.getCode()))) { log.info("任务已取消,fileName:{}", analysisFile.getName()); countDownLatch.countDown(); return; } //获取文件地址 String filePath = analysisFile.getFileUrl(); //获取文件名称 String fileName = analysisFile.getName(); try { //根据文件后缀判断需要查询的solr特征库库名称 String featureCoreName = FixedValue.SUFFIX_SOLR_FILE.get(analysisFile.getSuffix()); //根据文件后缀,去检索sourceFileBase库,来获取文件版本信息 String sourceFileBaseCoreName = FixedValue.SUFFIX_SOLR_VERSION.get(analysisFile.getSuffix()); //根据文件的名称获取函数解析器 Analysis analysis = AnalysisFactory.getAnalysis(filePath); //解析文件 if (!ObjUtil.hasEmpty(featureCoreName, sourceFileBaseCoreName, analysis)) { CodeFile codeFile = analysis.analysisFile(new FileInputStream(filePath)); if (codeFile != null) { List functionList = codeFile.getFunctionList(); if (CollectionUtil.isNotEmpty(functionList)) { //获取函数的特征MD5,cutMD5 List featureFunctionMd5List = functionList.stream().map(Function::getMd5).collect(Collectors.toList()); List cutFunctionMd5List = functionList.stream().map(Function::getSourceMd5).collect(Collectors.toList()); Set queryMd5List = Stream.concat(featureFunctionMd5List.stream(), cutFunctionMd5List.stream()).collect(Collectors.toSet()); String queryStr = "fun_hay:(" + StringUtils.join(queryMd5List, " OR ") + ")"; // log.info("检索函数特征,coreName:{} ,queryStr:{}", featureCoreName, queryStr); SolrDocumentList matchOpenFiles = solrUtils.query(featureCoreName, queryStr, "sourceMd5,fun_hay"); // log.info("resp", sourceMd5); //如果函数级特征匹配,能够匹配到开源文件信息,则根据开源文件的md5或者开源文件信息,做相似度对比 if (matchOpenFiles != null) { //对匹配到的文件进行分析 doAnalysis(matchOpenFiles, sourceFileBaseCoreName, codeFile); } else { //因为函数的特征库较少,这里补充一个对比逻辑,如果当前文件解析失败,或者没有通过函数匹配到数据,则直接通过文件的md5 再次查询一次solr库 checkByOriginalFileMd5(sourceFileBaseCoreName, analysisFile.getMd5()); } } } } else { //因为函数的特征库较少,这里补充一个对比逻辑,如果当前文件解析失败,或者没有通过函数匹配到数据,则直接通过文件的md5 再次查询一次solr库 checkByOriginalFileMd5(sourceFileBaseCoreName, analysisFile.getMd5()); } //更新文件表的分析状态为3 函数级特征以分析完毕 analysisFile.setFileAnalysisStatus(FileAnalysisStatusEnum.ANALYSIS_DONE.getCode()); mongoTemplate.update(FileDataMongoDto.class) .matching(where("_id").is(analysisFile.getId())) .replaceWith(analysisFile) .findAndReplace(); AnalysisLogUtil.insert(mongoTemplate, "【函数级分析】完成" + fileName); log.info("文件" + fileName + ":函数级分析完成"); } catch (Exception e) { AnalysisLogUtil.insertErrorInfo(mongoTemplate, "【函数级级分析】失败" + fileName, e); log.error("文件:" + fileName + "函数级别特征提取失败!", e); //修改当前文件分析状态未失败 mongoTemplate.update(FileDataMongoDto.class) .matching(where("_id").is(analysisFile.getId())) .apply(new Update().set("fileAnalysisStatus", FileAnalysisStatusEnum.FAILED_ANALYSIS.getCode())) .first(); } finally { countDownLatch.countDown(); } } /** * 对比函数级文本相似度 * * @param matchOpenFiles 通过特征匹配到的开源文件的md5 * @param sourceFileBaseCoreName 查询版开源文件版本ID的 solr库名称 * @param fileAnalysisRes 被测件的函数解析结果 * @throws Exception */ private void doAnalysis(SolrDocumentList matchOpenFiles, String sourceFileBaseCoreName, CodeFile fileAnalysisRes) throws Exception { //按照函数的特征md5进行分组,getter ,setter等方法的 特征值会重复 Map> featureMd5FunctionMap = fileAnalysisRes.getFunctionList().stream().collect(Collectors.groupingBy(Function::getMd5)); //函数代码总函数 int totalFunctionLineCount = fileAnalysisRes.getFunctionList().stream().mapToInt(Function::getCodeRowNum).sum(); //匹配到的特征函数Md5 Set matchFeatureFunctionMd5s = new HashSet(); //匹配到源码的行号 Set matchOpenLineRowsNum = new HashSet(); //计算与每个开源文件的开源率和特征相似度 List matchOpenFilesRes = calculateSimilarityAndOpenRate(matchOpenFiles, fileAnalysisRes, sourceFileBaseCoreName, matchOpenLineRowsNum, matchFeatureFunctionMd5s); //计算文件的总体的特征相似度 int matchFunctionLineCount = 0; for (String matchFeatureFunctionMd5 : matchFeatureFunctionMd5s) { matchFunctionLineCount += featureMd5FunctionMap.get(matchFeatureFunctionMd5).stream().mapToInt(Function::getCodeRowNum).sum(); } BigDecimal featureSimilarity = new BigDecimal(matchFunctionLineCount).divide(new BigDecimal(totalFunctionLineCount), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2); //计算文件的总体开源率 BigDecimal openRate = new BigDecimal(matchOpenLineRowsNum.size()).divide(new BigDecimal(analysisFile.getCodeRowNum()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2); //获取开源率的阈值 Integer openRateThreshold = analysisTask.getOpenRateThreshold(); //如果开源率大于阈值,则将当前文件设置成开源 if (openRate.floatValue() > openRateThreshold) { analysisFile.setOpenType(true); } //保存当前文件的开源信息到mongo库中 MatchOpenFileMongoDto matchOpenFileMongo = new MatchOpenFileMongoDto(); matchOpenFileMongo.setId(IdGenerator.uuid32()) .setFilePath(analysisFile.getFileUrl()) .setFileName(analysisFile.getName()) .setFeatureSimilarity(featureSimilarity.floatValue()) .setOpenRate(openRate.floatValue()) .setOpenType(analysisFile.getOpenType()) .setMatchOpenFile(matchOpenFilesRes); mongoTemplate.save(matchOpenFileMongo); } /** * 计算当前文件的特征相似度 和 开源率 * * @param matchOpenFiles 通过MD5 匹配到的所有开源文件 * @param sourceFileBaseCoreName 当前文件特征文件的 solr coreName * @param matchLineRowsNum 所有开源文件匹配到的开源行号列表 * @param matchFeatureFunctionMd5s 所有开源文件匹配到的特征函数MD5 * return 匹配的开源文件解析后的结果集 */ private List calculateSimilarityAndOpenRate(SolrDocumentList matchOpenFiles, CodeFile fileAnalysisRes, String sourceFileBaseCoreName, Set matchLineRowsNum, Set matchFeatureFunctionMd5s) { //匹配的开源文件列表 List matchOpenFilesRes = new ArrayList<>(); //按照函数的特征md5进行分组,getter ,setter等方法的 特征值会重复 Map> featureMd5FunctionMap = fileAnalysisRes.getFunctionList().stream().collect(Collectors.groupingBy(Function::getMd5)); //首先根据文件的MD5查询开源文件的版本ID,和路径信息 Set openSourceFileMd5s = matchOpenFiles.stream().map(doc -> (String) doc.get("sourceMd5")).collect(Collectors.toSet()); Map md5VersionInfoMap = solrUtils.batchQueryVersionIdFromSourceFileBaseBySourceMd5(sourceFileBaseCoreName, openSourceFileMd5s); //根据版本ID查询版本的详细信息 //todo 这里 查询一个版本的信息 需要检索 两个 solr 库 而且还需要检索 versioinTree 后面需要优化 Set openSourceFileVersionIds = md5VersionInfoMap.values().stream().map(doc -> (String) doc.get("versionId")).collect(Collectors.toSet()); List versionTrees = solrUtils.queryBatchVersionInfoByVersionIds(openSourceFileVersionIds); Map versionIdVersionInfoMap = versionTrees.stream().collect(Collectors.toMap(VersionTree::getVersionId, java.util.function.Function.identity())); //函数总行数 BigDecimal totalFunctionLineCount = new BigDecimal(fileAnalysisRes.getFunctionList().stream().mapToInt(Function::getCodeRowNum).sum()); for (SolrDocument openSourceFile : matchOpenFiles) { //开源文件md5 String openSourceFileMd5 = openSourceFile.getFieldValue("sourceMd5").toString(); //解析文件的函数特征值 List openFileFunctionList = getOpenFileFunctionList(openSourceFile); //根据源文件的MD5确定需要查询源码库的序号 String openSourceCodeCoreIndex = openSourceFileMd5.substring(0, 1) + SolrDBConst.CORE_NAME_SUFFIX_SOURCE_FILE_INFO; //获取开源文件的文本信息 SolrDocument openSourceContent = solrUtils.queryOne(openSourceCodeCoreIndex, "sourceFileMd5:" + openSourceFileMd5, "sourceContent"); //当前文件匹配特征函数总行数 int currentFileMatchFeatureLineCount = 0; //当前文件所匹配的特征函数MD5 Set currentFileMatchFeatureFunctionMd5 = new HashSet(); //遍历函数特征MD5 for (String funFeatureMd5 : featureMd5FunctionMap.keySet()) { List currentFueatureFunctionList = featureMd5FunctionMap.get(funFeatureMd5); //源文件的特征函数列表 for (Function openFunction : openFileFunctionList) { if (funFeatureMd5.equals(openFunction.getMd5())) { //每个特征函数 不能多次匹配,影响整体特征相似度 //匹配成功后,相同的特征行 一并加上 if (!currentFileMatchFeatureFunctionMd5.contains(funFeatureMd5)) { currentFileMatchFeatureFunctionMd5.add(funFeatureMd5); matchFeatureFunctionMd5s.add(funFeatureMd5); currentFileMatchFeatureLineCount += currentFueatureFunctionList.stream().mapToInt(Function::getCodeRowNum).sum(); } } } } //当前文件的开源率 Pair> openRateAndSaveRowNum = SimilarityUtil.getOpenRateAndSaveRowNum(new String(fileAnalysisRes.getFileContent()), openSourceContent.getFieldValue("sourceContent").toString()); //将当前文件匹配的行号,存储到缓存中,方便统计整体的开源率 matchLineRowsNum.addAll(openRateAndSaveRowNum.getValue()); //统计当前文件的特征相似度 BigDecimal featureSimilarity = new BigDecimal(currentFileMatchFeatureLineCount).divide(totalFunctionLineCount, 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2); SolrDocument openEntries = md5VersionInfoMap.get(openSourceFileMd5); VersionTree versionInfo = versionIdVersionInfoMap.get(openEntries.get("versionId")); //组装当前开源文件的开源项目信息 MatchOpenFile matchOpenFileInfo = new MatchOpenFile(); matchOpenFileInfo.setPId(versionInfo.getProId()) .setPName(versionInfo.getProName()) .setSourceUrl((String) openEntries.get("fullPath")) .setFeatureSimilarity(featureSimilarity.floatValue()) .setOpenRate(openRateAndSaveRowNum.getKey()) .setVersion(versionInfo.getVersionName()) .setLicenseType(versionInfo.getLicenseType()) .setAnalyzeType(AnalysisLevelEnum.FUNCTION_LEVEL.getCode()); matchOpenFilesRes.add(matchOpenFileInfo); } return matchOpenFilesRes; } /** * 防止函数特征库不全,再次根据文件MD5查询开源文件信息, 做二次校验 * * @param originalFileMd5 * @param versionIdCoreName */ private void checkByOriginalFileMd5(String versionIdCoreName, String originalFileMd5) { //根据文件的MD5,查询特征库,看当前文件是否在开源代码库中 SolrDocument versionIdAndPath = solrUtils.queryOne(versionIdCoreName, "sourceFileMd5:" + originalFileMd5, "versionId,fullPath,sourceFileMd5"); if (versionIdAndPath != null) { //根据版本ID查询版本的详细信息 VersionTree versionInfo = solrUtils.queryVersionInfoByVersionId((String) versionIdAndPath.get("versionId")); if (versionInfo != null) { //当前开源文件的开源项目信息 MatchOpenFile matchOpenFileInfo = new MatchOpenFile(); matchOpenFileInfo.setPId(versionInfo.getProId()) .setPName(versionInfo.getProName()) .setSourceUrl(versionInfo.getDownUrl()) .setFeatureSimilarity(100.00f) .setOpenRate(100.00f) .setAnalyzeType(AnalysisLevelEnum.FUNCTION_LEVEL.getCode()); //保存当前文件的开源信息到mongo库中 MatchOpenFileMongoDto matchOpenFileMongo = new MatchOpenFileMongoDto(); matchOpenFileMongo.setId(IdGenerator.uuid32()) .setFilePath(analysisFile.getFileUrl()) .setFileName(analysisFile.getName()) .setOpenRate(100.00f) .setOpenType(analysisFile.getOpenType()) .setMatchOpenFile(Arrays.asList(matchOpenFileInfo)); mongoTemplate.save(matchOpenFileMongo); } } } /** * 获取当前文件的函数特征值 * * @param matchOpenFile * @return */ private List getOpenFileFunctionList(SolrDocument matchOpenFile) { try { //解析文件的函数特征值 String lineFeatureMd5s = matchOpenFile.getFieldValue("fun_hay").toString(); lineFeatureMd5s = lineFeatureMd5s.replace("\\", "") .replace("\"{", "{") .replace("}\"", "}"); return JSONArray.parseArray(lineFeatureMd5s, Function.class); }catch (Exception e){ log.error("解析文件特征值失败",e); } return new ArrayList(); } /** * 将特征值插入到mongo库中 * * @param features 特征集合 * @param lineDataMongoDto 当前分析任务 ,特征信息存储 * @param */ @Deprecated private void insertFeatureValue(List features, LineDataMongoDto lineDataMongoDto) { List batchInsertList = new ArrayList<>(); if (CollectionUtil.isNotEmpty(features)) { //这里的批量插入逻辑可以进行校验 //每10条存一次,解析的数据量如果过大,可能会超过MongoDB数据限制 int batchInsertStpe = 10; int total = 0; for (int i = 0; i < features.size(); i++) { LineModel lineModel = features.get(i); if (total != batchInsertStpe) { batchInsertList.add(lineModel); total++; } if (i == features.size() - 1 && total != batchInsertStpe) { total = 0; lineDataMongoDto.setId(IdGenerator.uuid32()) .setLineModels(batchInsertList); mongoTemplate.insert(lineDataMongoDto); } if (total == batchInsertStpe) { total = 0; lineDataMongoDto.setId(IdGenerator.uuid32()) .setLineModels(batchInsertList); mongoTemplate.insert(lineDataMongoDto); batchInsertList.clear(); } } } else { lineDataMongoDto.setId(IdGenerator.uuid32()); mongoTemplate.insert(lineDataMongoDto); } } }