package com.keyware.composeanalysis.task;

import com.keyware.composeanalysis.constant.FixedValue;
import com.keyware.composeanalysis.constant.RedisConst;
import com.keyware.composeanalysis.constant.SolrDBConst;
import com.keyware.composeanalysis.constant.enums.AnalysisLevelEnum;
import com.keyware.composeanalysis.constant.enums.AnalysisStatusEnum;
import com.keyware.composeanalysis.constant.enums.FileAnalysisStatusEnum;
import com.keyware.composeanalysis.entity.AnalysisTask;
import com.keyware.composeanalysis.mongo.FileDataMongoDto;
import com.keyware.composeanalysis.mongo.MatchOpenFile;
import com.keyware.composeanalysis.mongo.MatchOpenFileMongoDto;
import com.keyware.composeanalysis.solr.VersionTree;
import com.keyware.composeanalysis.util.*;
import com.keyware.keyswan.anaysis.Analysis;
import com.keyware.keyswan.anaysis.AnalysisFactory;
import com.keyware.keyswan.common.CodeFile;
import com.keyware.utils.IdGenerator;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.springframework.data.mongodb.core.MongoTemplate;
import org.springframework.data.mongodb.core.query.Update;

import java.io.IOException;
import java.math.BigDecimal;
import java.math.RoundingMode;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.CountDownLatch;
import java.util.function.Function;
import java.util.stream.Collectors;

import static org.springframework.data.mongodb.core.query.Criteria.where;

/**
 * File-level provenance analysis task.
 *
 * @author liuzongren
 * @date 2024/7/23
 */
@Log4j2
public class FileAnalysisTask extends IAnalysisTask {

    private MongoTemplate mongoTemplate;
    private AnalysisTask analysisTask;
    private SolrUtils solrUtils;
    // File under analysis
    private FileDataMongoDto analysisFile;
    private RedisUtil redisUtil;
    private CountDownLatch countDownLatch;

    public FileAnalysisTask(AnalysisTask analysisTask, FileDataMongoDto analysisFile, MongoTemplate mongoTemplate, CountDownLatch countDownLatch) {
        this.mongoTemplate = mongoTemplate;
        this.analysisTask = analysisTask;
        this.analysisFile = analysisFile;
        this.countDownLatch = countDownLatch;
        this.solrUtils = SpringContextUtils.getBean(SolrUtils.class);
        this.redisUtil = SpringContextUtils.getBean(RedisUtil.class);
    }

    /**
     * File-level provenance analysis.
     * This level of analysis must run after the project-level analysis has completed.
     * The file's source MD5 no longer matches anything in the Solr library, so its
     * features have to be extracted and matched instead.
     */
    @Override
    public void run() {
        // Before doing any work, check the task's execution status
        Object status = redisUtil.get(String.format(RedisConst.TASK_RUNNING_STATUS_KEY_PREFIX, analysisTask.getId()));
        if (status != null && (status.equals(AnalysisStatusEnum.STOP_ANALYSIS.getCode()) || status.equals(AnalysisStatusEnum.PAUSE_ANALYSIS.getCode()))) {
            log.info("Analysis task has been stopped or paused, skipping file:{}", analysisFile.getName());
            countDownLatch.countDown();
            return;
        }

        // Current file name
        String fileName = analysisFile.getName();
        AnalysisLogUtil.insert(mongoTemplate, "[File-level analysis] analyzing " + fileName);
        try {
            // Only the 32 supported mainstream languages can have features extracted;
            // files outside them were already handled during file-level MD5 matching.
            if (StringUtils.isNotEmpty(analysisFile.getSuffix()) && FixedValue.SUFFIX_SOLR_VERSION.containsKey(analysisFile.getSuffix())) {
                // Look up the feature core (*_CutFileInfo) name by file suffix
                String featureCoreName = FixedValue.SUFFIX_SOLR_FILE.get(analysisFile.getSuffix());
                // Get the file parser for this file name
                Analysis analysis = AnalysisFactory.getAnalysis(fileName);
                // A null parser means feature extraction is not yet supported for this language;
                // matching then falls back to the file's MD5 against the Solr library.
                if (analysis != null) {
                    // Skip file-level and line-level feature extraction for files larger than 3 MB
                    Integer fileSize = analysisFile.getFileSize();
                    if (fileSize < (3 * 1024 * 1024)) {
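                        // Matching strategy below: the Solr query ORs the file's source MD5,
                        // cut-file MD5 and trait MD5 against the suffix-specific feature core,
                        // so a hit on any one of the three fingerprints counts as a candidate match.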
                        CodeFile codeFile = analysis.analysisFile(analysisFile.getFileUrl(), "1", "0");
                        // Query the suffix-specific feature core with the file's fingerprints
                        if (codeFile != null) {
                            String query = "sourceMd5:" + codeFile.getSourceMd5() + " OR cutFileMd5:" + codeFile.getCutFileMd5() + " OR traitFileMd5:" + codeFile.getTraitFileMd5();
                            SolrDocumentList openSourceFileList = solrUtils.query(featureCoreName, query, "sourceMd5");
                            // If the file matches anything in the source library, compute its open source rate
                            if (CollectionUtils.isNotEmpty(openSourceFileList)) {
                                analyzeFileOpenRate(openSourceFileList);
                            }
                        }
                    }
                }
            }
            // Persist the file-level analysis result
            analysisFile.setFileAnalysisStatus(FileAnalysisStatusEnum.ANALYSIS_DONE.getCode());
            mongoTemplate.update(FileDataMongoDto.class)
                    .matching(where("_id").is(analysisFile.getId()))
                    .replaceWith(analysisFile)
                    .findAndReplace();
        } catch (Exception e) {
            AnalysisLogUtil.insertErrorInfo(mongoTemplate, "[File-level] feature extraction failed for " + fileName, e);
            log.error("File-level feature extraction failed, file: " + fileName, e);
            // Mark the file's analysis status as failed
            analysisFile.setFileAnalysisStatus(FileAnalysisStatusEnum.FAILED_ANALYSIS.getCode());
            // Persist the failed status
            mongoTemplate.update(FileDataMongoDto.class)
                    .matching(where("_id").is(analysisFile.getId()))
                    .apply(new Update().set("fileAnalysisStatus", FileAnalysisStatusEnum.FAILED_ANALYSIS.getCode()))
                    .first();
        } finally {
            countDownLatch.countDown();
        }
    }

    /**
     * Compute the open source rate of the file.
     *
     * @param fileList matched open source file documents
     * @throws IOException if the file under test cannot be read
     */
    private void analyzeFileOpenRate(SolrDocumentList fileList) throws IOException {
        // Create the match result container for this file
        MatchOpenFileMongoDto matchOpenFileInfo = new MatchOpenFileMongoDto();
        matchOpenFileInfo.setId(IdGenerator.uuid32())
                .setFileName(analysisFile.getName())
                .setFilePath(analysisFile.getFileUrl());

        // Resolve version IDs from the matched open source files' MD5s
        Set<String> sourceFileMd5 = fileList.stream().map(solrDocument -> (String) solrDocument.get("sourceMd5")).collect(Collectors.toSet());
        String sourceCoreName = FixedValue.SUFFIX_SOLR_VERSION.get(analysisFile.getSuffix());
        Map<String, SolrDocument> md5VersionObjMap = solrUtils.batchQueryVersionIdFromSourceFileBaseBySourceMd5(sourceCoreName, sourceFileMd5);

        // Fetch version info by version ID
        Set<String> versionIds = md5VersionObjMap.values().stream().map(solrDocument -> (String) solrDocument.get("versionId")).collect(Collectors.toSet());
        List<VersionTree> treeInfoList = solrUtils.queryBatchVersionInfoByVersionIds(versionIds);
        Map<String, VersionTree> versionIdMap = treeInfoList.stream().collect(Collectors.toMap(VersionTree::getVersionId, Function.identity()));

        // Read the content of the file under test
        String fileContent = new String(Files.readAllBytes(Paths.get(analysisFile.getFileUrl())), "utf-8").replaceAll(" ", "");
        // Split the content into lines for matching against the open source content
        List<String> fileLines = SimilarityUtil.getSplitWords(fileContent);
        HashSet<Integer> openLineNum = new HashSet<>();

        // Result list of matched open source files
        List<MatchOpenFile> matchOpenFileList = new ArrayList<>();

        // Iterate over the matched open source files
        for (int i = 0; i < fileList.size(); i++) {
            String openFileMd5 = (String) fileList.get(i).get("sourceMd5");
            SolrDocument versionObj = md5VersionObjMap.get(openFileMd5);
            String versionId = (String) versionObj.get("versionId");
            VersionTree versionInfo = versionIdMap.get(versionId);
            if (versionInfo == null) {
                log.error("Version info not found in VersionTree, openFileMd5:{}, versionId:{}", openFileMd5, versionId);
                continue;
            }
            MatchOpenFile matchOpenFile = new MatchOpenFile();
            matchOpenFile.setId(IdGenerator.uuid32())
                    .setVersionId(versionId)
                    .setSourceFilePath((String) versionObj.get("fullPath"))
                    .setSourceUrl(versionInfo.getDownUrl())
                    .setPId(versionInfo.getProId())
                    .setPName(versionInfo.getProName())
                    .setLicenseType(versionInfo.getLicenseType())
                    .setAnalyzeType(AnalysisLevelEnum.FILE_LEVEL.getCode())
                    .setVersion(versionInfo.getVersionName())
                    .setFeatureSimilarity(100.00f);

            // Compute the text similarity between the file under test and the open source file
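            // The full source text of each matched file is fetched from the per-MD5-prefix
            // *_SourceFileInfo core and compared against the file under test; the line numbers
            // that match accumulate in openLineNum, which drives the overall open rate after the loop.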
            // Derive the Solr core name from the first character of the file MD5
            String solrNameIndex = openFileMd5.substring(0, 1) + SolrDBConst.CORE_NAME_SUFFIX_SOURCE_FILE_INFO;
            SolrDocumentList sourceFileInfo = solrUtils.query(solrNameIndex, "sourceFileMd5:" + openFileMd5, "sourceContent");
            if (CollectionUtils.isNotEmpty(sourceFileInfo)) {
                String openSourceContent = String.valueOf(sourceFileInfo.get(0).getFieldValue("sourceContent"));
                // Room for optimization here: splitting the file under test into lines could be hoisted out of the loop
                double similarity = SimilarityUtil.getSimilarityAndSaveRowNum(fileLines, openSourceContent, openLineNum);
                matchOpenFile.setOpenRate(new BigDecimal(similarity * 100).setScale(2, RoundingMode.HALF_UP).floatValue());
            } else {
                // If the original source content cannot be found, default the open rate to 100%
                log.error("Source content not found, core:{}, sourceFileMd5:{}", solrNameIndex, openFileMd5);
                matchOpenFile.setOpenRate(100.00f);
            }
            matchOpenFile.setMd5(openFileMd5);
            matchOpenFileList.add(matchOpenFile);
        }

        // Compute the overall open source rate of the file under test
        // The open rate threshold decides whether the file counts as open source
        Integer openRateThreshold = analysisTask.getOpenRateThreshold();
        int openLineCount = openLineNum.size();
        BigDecimal totalLineCount = new BigDecimal(fileLines.size());
        BigDecimal openRate = new BigDecimal(openLineCount).divide(totalLineCount, 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100));

        // If the rate exceeds the threshold, treat the file as open source
        analysisFile.setOpenType(openRate.compareTo(new BigDecimal(openRateThreshold)) > 0);

        // Update the file under test
        analysisFile.setOpenLineCount(openLineCount)
                .setOpenRate(openRate.floatValue());

        // Assemble the open source match info
        matchOpenFileInfo.setFilePath(analysisFile.getFileUrl())
                .setOpenType(analysisFile.getOpenType())
                .setOpenRate(analysisFile.getOpenType() ? 100.00f : 0.00f)
                .setMatchOpenFile(matchOpenFileList);

        // Save the match results
        mongoTemplate.insert(matchOpenFileInfo);
    }
}