package com.keyware.composeanalysis.task;
import com.keyware.composeanalysis.constant.FixedValue;
import com.keyware.composeanalysis.constant.RedisConst;
import com.keyware.composeanalysis.constant.SolrDBConst;
import com.keyware.composeanalysis.constant.enums.AnalysisLevelEnum;
import com.keyware.composeanalysis.constant.enums.AnalysisStatusEnum;
import com.keyware.composeanalysis.constant.enums.FileAnalysisStatusEnum;
import com.keyware.composeanalysis.entity.AnalysisTask;
import com.keyware.composeanalysis.mongo.FileDataMongoDto;
import com.keyware.composeanalysis.mongo.MatchOpenFile;
import com.keyware.composeanalysis.mongo.MatchOpenFileMongoDto;
import com.keyware.composeanalysis.solr.VersionTree;
import com.keyware.composeanalysis.util.*;
import com.keyware.keyswan.anaysis.Analysis;
import com.keyware.keyswan.anaysis.AnalysisFactory;
import com.keyware.keyswan.common.CodeFile;
import com.keyware.utils.IdGenerator;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.springframework.data.mongodb.core.MongoTemplate;
import org.springframework.data.mongodb.core.query.Update;
import java.io.IOException;
import java.math.BigDecimal;
import java.math.RoundingMode;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.CountDownLatch;
import java.util.function.Function;
import java.util.stream.Collectors;
import static org.springframework.data.mongodb.core.query.Criteria.where;
/**
* @author liuzongren
* @date 2024/7/23
* desc File-level provenance analysis task
*/
@Log4j2
public class FileAnalysisTask extends IAnalysisTask {
private MongoTemplate mongoTemplate;
//the analysis task this file belongs to
private AnalysisTask analysisTask;
private SolrUtils solrUtils;
//the file under analysis
private FileDataMongoDto analysisFile;
private RedisUtil redisUtil;
//latch the parent task waits on until every file-level task has finished
private CountDownLatch countDownLatch;
public FileAnalysisTask(AnalysisTask analysisTask, FileDataMongoDto analysisFile, MongoTemplate mongoTemplate, CountDownLatch countDownLatch) {
this.mongoTemplate = mongoTemplate;
this.analysisTask = analysisTask;
this.analysisFile = analysisFile;
this.countDownLatch = countDownLatch;
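//note: SolrUtils and RedisUtil are looked up from the Spring context below, presumably because this
//task object is constructed manually (new FileAnalysisTask(...)) rather than managed as a Spring bean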
this.solrUtils = SpringContextUtils.getBean(SolrUtils.class);
this.redisUtil = SpringContextUtils.getBean(RedisUtil.class);
}
/**
* File-level provenance analysis.
* This level of analysis must run after the project-level analysis has completed.
* The file's source MD5 could no longer be matched in the solr library, so its features need to be extracted and matched instead.
*/
@Override
public void run() {
//before executing the task, check its current execution status
Object status = redisUtil.get(String.format(RedisConst.TASK_RUNNING_STATUS_KEY_PREFIX, analysisTask.getId()));
if (status != null && (status.equals(AnalysisStatusEnum.STOP_ANALYSIS.getCode()) || status.equals(AnalysisStatusEnum.PAUSE_ANALYSIS.getCode()))) {
log.info("Task has been cancelled, fileName:{}", analysisFile.getName());
countDownLatch.countDown();
return;
}
//get the name of the current file
String fileName = analysisFile.getName();
AnalysisLogUtil.insert(mongoTemplate, "[File-level analysis] analyzing " + fileName);
try {
//only files written in mainstream languages can be parsed
//files outside the 32 supported mainstream languages cannot have features extracted; they were already handled during file-level MD5 matching
if (StringUtils.isNotEmpty(analysisFile.getSuffix()) && FixedValue.SUFFIX_SOLR_VERSION.containsKey(analysisFile.getSuffix())) {
//look up the *_CutFileInfo core name by the file suffix
String featureCoreName = FixedValue.SUFFIX_SOLR_FILE.get(analysisFile.getSuffix());
//get the file parser based on the file name
Analysis analysis = AnalysisFactory.getAnalysis(fileName);
//if analysis is null, feature extraction for this language is not yet supported, and the file can only be matched directly by its MD5 against the solr library
if (analysis != null) {
//skip file-level and line-level feature extraction for files larger than 3 MB
Integer fileSize = analysisFile.getFileSize();
if (fileSize < (3 * 1024 * 1024)) {
CodeFile codeFile = analysis.analysisFile(analysisFile.getFileUrl(), "1", "0");
//query the feature core for this file suffix using the file's feature values
if (codeFile != null) {
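//the query matches on any of the three fingerprints produced by the parser: the raw source MD5,
//the MD5 of the normalized ("cut") file, and the MD5 of the extracted trait/feature file
//(field semantics inferred from the field names; treat as an assumption)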
String querySb = "sourceMd5:" + codeFile.getSourceMd5() + " OR cutFileMd5:" + codeFile.getCutFileMd5() + " OR traitFileMd5:" + codeFile.getTraitFileMd5();
SolrDocumentList openSourceFileList = solrUtils.query(featureCoreName, querySb, "sourceMd5");
//if the file matched any record in the open source library, compute its open source rate
if (CollectionUtils.isNotEmpty(openSourceFileList)) {
analyzeFileOpenRate(openSourceFileList);
}
}
}
}
}
//update the file-level analysis result
analysisFile.setFileAnalysisStatus(FileAnalysisStatusEnum.ANALYSIS_DONE.getCode());
mongoTemplate.update(FileDataMongoDto.class)
.matching(where("_id").is(analysisFile.getId()))
.replaceWith(analysisFile)
.findAndReplace();
} catch (Exception e) {
AnalysisLogUtil.insertErrorInfo(mongoTemplate, "[File-level] feature extraction failed for " + fileName, e);
log.error("File: " + fileName + " file-level feature extraction failed!", e);
//mark the analysis status of the current file as failed
analysisFile.setFileAnalysisStatus(FileAnalysisStatusEnum.FAILED_ANALYSIS.getCode());
//update the file-level analysis result
mongoTemplate.update(FileDataMongoDto.class)
.matching(where("_id").is(analysisFile.getId()))
.apply(new Update().set("fileAnalysisStatus", FileAnalysisStatusEnum.FAILED_ANALYSIS.getCode()))
.first();
} finally {
countDownLatch.countDown();
}
}
/**
* Compute the open source rate of the file
*
* @param fileList matched open source file information
* @throws IOException
*/
private void analyzeFileOpenRate(SolrDocumentList fileList) throws IOException {
//create the result object that will hold the matched open source file information
MatchOpenFileMongoDto matchOpenFileInfo = new MatchOpenFileMongoDto();
matchOpenFileInfo.setId(IdGenerator.uuid32())
.setFileName(analysisFile.getName())
.setFilePath(analysisFile.getFileUrl());
//look up version IDs by the MD5s of the matched open source files
Set<String> sourceFileMd5 = fileList.stream().map(solrDocument -> (String) solrDocument.get("sourceMd5")).collect(Collectors.toSet());
String sourceCoreName = FixedValue.SUFFIX_SOLR_VERSION.get(analysisFile.getSuffix());
Map<String, SolrDocument> md5VersionObjMap = solrUtils.batchQueryVersionIdFromSourceFileBaseBySourceMd5(sourceCoreName, sourceFileMd5);
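//md5VersionObjMap maps each matched sourceMd5 to the solr document that carries its versionId
//(and, judging by the usage below, the file's fullPath inside that version)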
//fetch version information by version ID
Set<String> versionIds = md5VersionObjMap.values().stream().map(solrDocument -> (String) solrDocument.get("versionId")).collect(Collectors.toSet());
List<VersionTree> treeInfoList = solrUtils.queryBatchVersionInfoByVersionIds(versionIds);
Map<String, VersionTree> versionIdMap = treeInfoList.stream().collect(Collectors.toMap(VersionTree::getVersionId, Function.identity()));
//read the text content of the file under test
String fileContent = new String(Files.readAllBytes(Paths.get(analysisFile.getFileUrl())), "utf-8").replaceAll(" ", "");
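//spaces are stripped from the content so that whitespace-only differences do not affect the
//line-level similarity comparison (an inference from the code, not documented behaviour)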
//split the content of the file under test into lines, used for matching against open source content
List<String> fileLines = SimilarityUtil.getSplitWords(fileContent);
HashSet<Integer> openLineNum = new HashSet<>();
//result list of matched open source files
ArrayList<MatchOpenFile> matchOpenFileList = new ArrayList<>();
//iterate over the matched open source files
for (int i = 0; i < fileList.size(); i++) {
String openFileMd5 = (String) fileList.get(i).get("sourceMd5");
SolrDocument versionObj = md5VersionObjMap.get(openFileMd5);
String versionId = (String) versionObj.get("versionId");
VersionTree versionInfo = versionIdMap.get(versionId);
if (versionInfo == null) {
log.error("Version info not found in versionTree, openFileMd5:{}, versionId:{}", openFileMd5, versionId);
continue;
}
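//build the match record; featureSimilarity is fixed at 100% here, presumably because the file
//was matched in the feature library by an exact MD5 hit (assumption based on the query above)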
MatchOpenFile matchOpenFile = new MatchOpenFile();
matchOpenFile.setId(IdGenerator.uuid32())
.setVersionId(versionId)
.setSourceFilePath((String) versionObj.get("fullPath"))
.setSourceUrl(versionInfo.getDownUrl())
.setPId(versionInfo.getProId())
.setPName(versionInfo.getProName())
.setLicenseType(versionInfo.getLicenseType())
.setAnalyzeType(AnalysisLevelEnum.FILE_LEVEL.getCode())
.setVersion(versionInfo.getVersionName())
.setFeatureSimilarity(100.00f);
//compute the text similarity between the file under test and the open source file
//derive the solr core name from the first character of the file's MD5
String solrNameIndex = openFileMd5.substring(0, 1) + SolrDBConst.CORE_NAME_SUFFIX_SOURCE_FILE_INFO;
SolrDocumentList sourceFileInfo = solrUtils.query(solrNameIndex, "sourceFileMd5:" + openFileMd5, "sourceContent");
if (CollectionUtils.isNotEmpty(sourceFileInfo)) {
String openSourceContent = String.valueOf(sourceFileInfo.get(0).getFieldValue("sourceContent"));
//room for optimization: the line splitting of the file under test could be moved outside this loop
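//getSimilarityAndSaveRowNum appears to return a similarity ratio in the range 0..1 and to record
//the indexes of the matched lines into openLineNum (inferred from how the result is scaled below
//and how openLineNum is used afterwards)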
double similarity = SimilarityUtil.getSimilarityAndSaveRowNum(fileLines, openSourceContent, openLineNum);
matchOpenFile.setOpenRate(new BigDecimal(similarity * 100).setScale(2, RoundingMode.HALF_UP).floatValue());
//if the source code cannot be found, set the open rate of this match to 100% directly
} else {
log.error("Source code not found, DBname:{}, sourceFileMd5:{}", solrNameIndex, openFileMd5);
matchOpenFile.setOpenRate(100.00f);
}
matchOpenFile.setMd5(openFileMd5);
matchOpenFileList.add(matchOpenFile);
}
//compute the overall open source rate of the file under test
//fetch the open rate threshold used to decide whether this file counts as open source
Integer openRateThreshold = analysisTask.getOpenRateThreshold();
int openLineCount = openLineNum.size();
BigDecimal totalLineCount = new BigDecimal(fileLines.size());
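//open rate formula (as computed below): openRate = openLineCount / totalLineCount * 100,
//with the division rounded half-up to 4 decimal places; e.g. 37 matched lines out of 148 -> 0.2500 -> 25.00%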
BigDecimal openRate = new BigDecimal(openLineCount).divide(totalLineCount, 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100));
//if the rate exceeds the threshold, treat the current file as open source
if (openRate.compareTo(new BigDecimal(openRateThreshold)) > 0) {
analysisFile.setOpenType(true);
} else {
analysisFile.setOpenType(false);
}
//update and save the information of the file under test
analysisFile.setOpenLineCount(openLineCount)
.setOpenRate(openRate.floatValue());
//assemble the open source match information
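//note: the aggregate record stores a binary open rate (100% when the file is judged open source,
//otherwise 0%), while the line-based rate computed above is kept on analysisFile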
matchOpenFileInfo.setFilePath(analysisFile.getFileUrl())
.setOpenType(analysisFile.getOpenType())
.setOpenRate(analysisFile.getOpenType() ? 100.00f : 0.00f)
.setMatchOpenFile(matchOpenFileList);
//persist the open source match information
mongoTemplate.insert(matchOpenFileInfo);
}
}