package com.keyware.composeanalysis.task;

import cn.hutool.core.collection.CollectionUtil;
import com.keyware.composeanalysis.constant.FixedValue;
import com.keyware.composeanalysis.constant.FunctionAndAnalysisAssemblyConst;
import com.keyware.composeanalysis.constant.RedisConst;
import com.keyware.composeanalysis.constant.SolrDBConst;
import com.keyware.composeanalysis.constant.enums.AnalysisLevelEnum;
import com.keyware.composeanalysis.constant.enums.AnalysisStatusEnum;
import com.keyware.composeanalysis.constant.enums.FileAnalysisStatusEnum;
import com.keyware.composeanalysis.entity.AnalysisTask;
import com.keyware.composeanalysis.mongo.FileDataMongoDto;
import com.keyware.composeanalysis.mongo.LineDataMongoDto;
import com.keyware.composeanalysis.mongo.MatchOpenFile;
import com.keyware.composeanalysis.mongo.MatchOpenFileMongoDto;
import com.keyware.composeanalysis.solr.VersionTree;
import com.keyware.composeanalysis.util.AnalysisLogUtil;
import com.keyware.composeanalysis.util.RedisUtil;
import com.keyware.composeanalysis.util.SolrUtils;
import com.keyware.composeanalysis.util.SpringContextUtils;
import com.keyware.keyswan.anaysis.Analysis;
import com.keyware.keyswan.anaysis.AnalysisFactory;
import com.keyware.keyswan.common.CodeFile;
import com.keyware.utils.IdGenerator;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.lang3.StringUtils;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.springframework.data.mongodb.core.MongoTemplate;
import org.springframework.data.mongodb.core.query.Update;

import java.math.BigDecimal;
import java.math.RoundingMode;
import java.util.*;
import java.util.concurrent.CountDownLatch;

import static org.springframework.data.mongodb.core.query.Criteria.where;

/**
 * @author liuzongren
 * @ClassName LineAnalysisTask
 * @description: line-level feature extraction task
 * @datetime 2024-07-25 16:19
 * @version: 1.0
 */
@Log4j2
public class LineAnalysisTask extends IAnalysisTask {

    private MongoTemplate mongoTemplate;

    private AnalysisTask analysisTask;

    //file information of the file under analysis
    private FileDataMongoDto analysisFile;

    private SolrUtils solrUtils;

    private RedisUtil redisUtil;

    private CountDownLatch countDownLatch;

    public LineAnalysisTask(AnalysisTask analysisTask, FileDataMongoDto analysisFile, MongoTemplate mongoTemplate, CountDownLatch countDownLatch) {
        this.mongoTemplate = mongoTemplate;
        this.analysisTask = analysisTask;
        this.analysisFile = analysisFile;
        this.countDownLatch = countDownLatch;
        this.solrUtils = SpringContextUtils.getBean(SolrUtils.class);
        this.redisUtil = SpringContextUtils.getBean(RedisUtil.class);
    }

    /**
     * Line-level source code tracing.
     * This task must run after the file-level analysis has completed.
     */
    @Override
    public void run() {
        //before doing any work, check whether the task is still allowed to run
        Object status = redisUtil.get(String.format(RedisConst.TASK_RUNNING_STATUS_KEY_PREFIX, analysisTask.getId()));
        if (status != null && (status.equals(AnalysisStatusEnum.STOP_ANALYSIS.getCode()) || status.equals(AnalysisStatusEnum.PAUSE_ANALYSIS.getCode()))) {
            log.info("Task has been cancelled, fileName:{}", analysisFile.getName());
            countDownLatch.countDown();
            return;
        }

        //path of the file under analysis
        String filePath = analysisFile.getFileUrl();
        //name of the file under analysis
        String fileName = analysisFile.getName();
        AnalysisLogUtil.insert(mongoTemplate, "[Line-level feature extraction] extracting " + fileName);
        try {
            LineDataMongoDto lineDataMongoDto = new LineDataMongoDto();
            lineDataMongoDto.setFileId(analysisFile.getId())
                    .setStatus(0)
                    .setIsSelect(false);
            Analysis analysis = AnalysisFactory.getAnalysis(filePath);

            //extract the line-level feature MD5s of the file
            CodeFile codeFile = analysis.analysisFile(filePath, FunctionAndAnalysisAssemblyConst.LINE_EXTRACT, FunctionAndAnalysisAssemblyConst.LINE_EXTRACT);
            //MD5 values of each original line
            //String cutFileLineMd5 = codeFile.getCutFileLineMd5();
            //MD5 values of each line's feature content
            String traitFileLineMd5 = codeFile.getTraitFileLineMd5();
            String[] featureMd5Arr = {};
            if (StringUtils.isNotBlank(traitFileLineMd5)) {
                featureMd5Arr = traitFileLineMd5.split(",");
            }
            List<String> lineFeatures = Arrays.asList(featureMd5Arr);

            //query solr for files with similar line features
            SolrDocumentList featureSimilarityFromSolr = getFeatureSimilarityFromSolr(lineFeatures);

            //calculate the open source rate of the file under analysis
            calculateOpenRate(featureSimilarityFromSolr, lineFeatures);

            //update the file's analysis status to ANALYSIS_DONE: line-level features have been extracted
            analysisFile.setFileAnalysisStatus(FileAnalysisStatusEnum.ANALYSIS_DONE.getCode());
            mongoTemplate.update(FileDataMongoDto.class)
                    .matching(where("_id").is(analysisFile.getId()))
                    .replaceWith(analysisFile)
                    .findAndReplace();

            AnalysisLogUtil.insert(mongoTemplate, "[Line-level feature extraction] finished " + fileName);
            log.info("File " + fileName + ": line-level analysis finished");
        } catch (Exception e) {
            AnalysisLogUtil.insertErrorInfo(mongoTemplate, "[Line-level feature extraction] failed " + fileName, e);
            log.error("File: " + fileName + " line-level feature extraction failed!", e);
            //mark the current file's analysis status as failed
            mongoTemplate.update(FileDataMongoDto.class)
                    .matching(where("_id").is(analysisFile.getId()))
                    .apply(new Update().set("fileAnalysisStatus", FileAnalysisStatusEnum.FAILED_ANALYSIS.getCode()))
                    .first();
        } finally {
            countDownLatch.countDown();
        }
    }
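
    // Worked example of the arithmetic in calculateOpenRate() below (illustrative
    // numbers only, not taken from real data): with 25 matched line features in a
    // 200-line file,
    //   new BigDecimal(25).divide(new BigDecimal(200), 4, RoundingMode.HALF_UP) -> 0.1250
    //   0.1250 * 100                                                            -> 12.50
    // so the open rate stored for that match is 12.5%, and the file is flagged as
    // open source only when the task's openRateThreshold is 12 or lower.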
    /**
     * Calculate the open source rate of the file under analysis.
     *
     * @param matchedOpenSourceFiles open source files whose line features overlap with the file under analysis
     * @param lineFeatures           line feature MD5s of the file under analysis
     */
    private void calculateOpenRate(SolrDocumentList matchedOpenSourceFiles, List<String> lineFeatures) {
        if (CollectionUtil.isEmpty(matchedOpenSourceFiles)) {
            return;
        }

        //determine the solr version core to query, based on the file suffix
        String versionIdCoreName = FixedValue.SUFFIX_SOLR_VERSION.get(analysisFile.getSuffix());

        //result object
        MatchOpenFileMongoDto matchOpenFileMongo = new MatchOpenFileMongoDto();
        matchOpenFileMongo.setId(IdGenerator.uuid32())
                .setFilePath(analysisFile.getFileUrl())
                .setFileName(analysisFile.getName());

        //details of the matched open source files
        List<MatchOpenFile> matchOpenFileInfoList = new ArrayList<>();

        //all matched line features, used to compute the total number of matched lines
        Set<String> matchingLineSet = new HashSet<>();

        //total number of code lines in the file under analysis
        BigDecimal totalCodeRowNum = new BigDecimal(analysisFile.getCodeRowNum());

        //count the matched lines between each open source file and the file under analysis
        for (SolrDocument matchFile : matchedOpenSourceFiles) {
            //parse the line feature MD5s of the open source file
            String lineFeatureMd5s = (String) matchFile.get("tz_line_hay");
            List<String> matchedLineFeatures = Arrays.asList(lineFeatureMd5s.split(","));

            //total number of matched lines for the current open source file
            int currentFileMatchLineCount = 0;

            //compare the line features of both files and count the matches
            for (String originalLineFeatureMd5 : lineFeatures) {
                for (String matchLineFeatureMd5 : matchedLineFeatures) {
                    if (originalLineFeatureMd5.equals(matchLineFeatureMd5)) {
                        currentFileMatchLineCount++;
                        matchingLineSet.add(originalLineFeatureMd5);
                    }
                }
            }

            //first, look up the open source file's version id and path by its MD5
            SolrDocument versionIdAndPath = solrUtils.queryOne(versionIdCoreName, "sourceFileMd5:" + matchFile.get("sourceMd5"), "versionId,fullPath,sourceFileMd5");

            //then, load the version details by version id
            //todo resolving one version currently hits two solr cores plus the versionTree; this should be optimized later
            VersionTree versionInfo = solrUtils.queryVersionInfoByVersionId((String) versionIdAndPath.get("versionId"));

            //open source rate against the current open source file
            BigDecimal openRate = new BigDecimal(currentFileMatchLineCount).divide(totalCodeRowNum, 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100));

            //open source project information of the current open source file
            MatchOpenFile matchOpenFileInfo = new MatchOpenFile();
            matchOpenFileInfo.setPId(versionInfo.getProId())
                    .setPName(versionInfo.getProName())
                    .setSourceUrl(versionInfo.getDownUrl())
                    .setOpenRate(openRate.floatValue())
                    .setVersion(versionInfo.getVersionName())
                    .setLicenseType(versionInfo.getLicenseType())
                    .setAnalyzeType(AnalysisLevelEnum.LINE_LEVEL.getCode());
            matchOpenFileInfoList.add(matchOpenFileInfo);
        }

        //overall open source rate of the file under analysis
        BigDecimal openRate = new BigDecimal(matchingLineSet.size()).divide(totalCodeRowNum, 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100));

        //open source rate threshold configured on the task
        Integer openRateThreshold = analysisTask.getOpenRateThreshold();

        //if the open source rate reaches the threshold, mark the file as open source
        if (openRate.compareTo(new BigDecimal(openRateThreshold)) >= 0) {
            analysisFile.setOpenType(true);
        }

        //save the open source information of the current file
        matchOpenFileMongo.setOpenType(analysisFile.getOpenType())
                .setMatchOpenFile(matchOpenFileInfoList);
        mongoTemplate.save(matchOpenFileMongo);
    }
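
    /**
     * Minimal sketch of a simpler batched insert, shown only to illustrate the batching
     * intent of the deprecated {@link #insertFeatureValue} below. It assumes hutool's
     * {@code CollectionUtil.split} is available (hutool is already on the classpath);
     * the method name is hypothetical and it is not called anywhere in this class.
     */
    private void insertFeatureValueInBatches(List<String> lineFeatures, LineDataMongoDto template) {
        //split the features into chunks of at most 5000 entries and store one document per chunk
        for (List<String> batch : CollectionUtil.split(lineFeatures, 5000)) {
            template.setId(IdGenerator.uuid32())
                    .setLineFeatueMd5s(new ArrayList<>(batch));
            mongoTemplate.insert(template);
        }
    }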
    /**
     * Persist the feature values into mongo.
     *
     * @param features         comma separated feature MD5s
     * @param lineDataMongoDto storage object for the current analysis task's feature information
     *                         todo review later whether this insert is still needed
     */
    @Deprecated
    private void insertFeatureValue(String features, LineDataMongoDto lineDataMongoDto) {
        String[] featureMd5Arr = {};
        if (StringUtils.isNotBlank(features)) {
            featureMd5Arr = features.split(",");
        }
        List<String> lineFeatures = Arrays.asList(featureMd5Arr);
        List<String> batchInsertList = new ArrayList<>();
        if (CollectionUtil.isNotEmpty(lineFeatures)) {
            //this batching logic still needs to be reviewed
            //insert in batches of 5000; if the extracted data is too large, a single document could exceed the MongoDB size limit
            int batchInsertStep = 5000;
            int total = 0;
            for (int i = 0; i < lineFeatures.size(); i++) {
                if (total != batchInsertStep) {
                    batchInsertList.add(lineFeatures.get(i));
                    total++;
                }
                if (i == lineFeatures.size() - 1 && total != batchInsertStep) {
                    total = 0;
                    lineDataMongoDto.setId(IdGenerator.uuid32())
                            .setLineFeatueMd5s(batchInsertList);
                    mongoTemplate.insert(lineDataMongoDto);
                }
                if (total == batchInsertStep) {
                    total = 0;
                    lineDataMongoDto.setId(IdGenerator.uuid32())
                            .setLineFeatueMd5s(batchInsertList);
                    mongoTemplate.insert(lineDataMongoDto);
                    batchInsertList.clear();
                }
            }
        } else {
            lineDataMongoDto.setId(IdGenerator.uuid32());
            mongoTemplate.insert(lineDataMongoDto);
        }
    }

    /**
     * Query the feature library for files whose line features are similar to the given ones.
     *
     * @param lineFeatureList line feature MD5s of the file under analysis
     * @return matching solr documents
     */
    private SolrDocumentList getFeatureSimilarityFromSolr(List<String> lineFeatureList) {
        String solrCoreName = SolrDBConst.CORE_NAME_SOURCE_FILE_INFO_TEMP;
        //build the line feature query
        String queryStr = "tz_line_hay:(" + StringUtils.join(lineFeatureList, " OR ") + ")";
        log.info("Query: solrCoreName:{}, queryStr:{}", solrCoreName, queryStr);
        SolrDocumentList result = solrUtils.query(solrCoreName, queryStr, "sourceMd5,tz_line_hay");
        log.info("Query result: {}", result);
        return result;
    }
}
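
/*
 * Usage sketch (illustrative only; it assumes IAnalysisTask is Runnable-compatible and
 * that the caller already holds the analysis task, the file documents and an executor.
 * The variable names filesToAnalyze and executor are hypothetical):
 *
 *     CountDownLatch latch = new CountDownLatch(filesToAnalyze.size());
 *     for (FileDataMongoDto file : filesToAnalyze) {
 *         executor.execute(new LineAnalysisTask(analysisTask, file, mongoTemplate, latch));
 *     }
 *     latch.await();  // every task counts down the latch, even on failure or cancellation
 */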