forked from liuzongren/compose-analysis
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
233 lines
12 KiB
233 lines
12 KiB
7 months ago
|
package com.keyware.composeanalysis.task;
|
||
|
|
||
|
import com.keyware.composeanalysis.constant.FixedValue;
|
||
|
import com.keyware.composeanalysis.constant.RedisConst;
|
||
|
import com.keyware.composeanalysis.constant.SolrDBConst;
|
||
|
import com.keyware.composeanalysis.constant.enums.AnalysisLevelEnum;
|
||
|
import com.keyware.composeanalysis.constant.enums.AnalysisStatusEnum;
|
||
|
import com.keyware.composeanalysis.constant.enums.FileAnalysisStatusEnum;
|
||
|
import com.keyware.composeanalysis.entity.AnalysisTask;
|
||
|
import com.keyware.composeanalysis.mongo.FileDataMongoDto;
|
||
|
import com.keyware.composeanalysis.mongo.MatchOpenFile;
|
||
|
import com.keyware.composeanalysis.mongo.MatchOpenFileMongoDto;
|
||
|
import com.keyware.composeanalysis.solr.VersionTree;
|
||
|
import com.keyware.composeanalysis.util.*;
|
||
|
import com.keyware.keyswan.anaysis.Analysis;
|
||
|
import com.keyware.keyswan.anaysis.AnalysisFactory;
|
||
|
import com.keyware.keyswan.common.CodeFile;
|
||
|
import com.keyware.utils.IdGenerator;
|
||
|
import lombok.extern.log4j.Log4j2;
|
||
|
import org.apache.commons.collections.CollectionUtils;
|
||
|
import org.apache.commons.lang3.StringUtils;
|
||
|
import org.apache.solr.common.SolrDocument;
|
||
|
import org.apache.solr.common.SolrDocumentList;
|
||
|
import org.springframework.data.mongodb.core.MongoTemplate;
|
||
|
import org.springframework.data.mongodb.core.query.Update;
|
||
|
|
||
|
import java.io.IOException;
|
||
|
import java.math.BigDecimal;
|
||
|
import java.math.RoundingMode;
|
||
|
import java.nio.file.Files;
|
||
|
import java.nio.file.Paths;
|
||
|
import java.util.*;
|
||
|
import java.util.concurrent.CountDownLatch;
|
||
|
import java.util.function.Function;
|
||
|
import java.util.stream.Collectors;
|
||
|
|
||
|
import static org.springframework.data.mongodb.core.query.Criteria.where;
|
||
|
|
||
|
/**
|
||
|
* @author liuzongren
|
||
|
* @date 2024/7/23
|
||
|
* desc 文件级溯源分析任务
|
||
|
*/
|
||
|
@Log4j2
|
||
|
public class FileAnalysisTask extends IAnalysisTask {
|
||
|
|
||
|
private MongoTemplate mongoTemplate;
|
||
|
private AnalysisTask analysisTask;
|
||
|
private SolrUtils solrUtils;
|
||
|
//文件信息
|
||
|
private FileDataMongoDto analysisFile;
|
||
|
private RedisUtil redisUtil;
|
||
|
private CountDownLatch countDownLatch;
|
||
|
|
||
|
|
||
|
public FileAnalysisTask(AnalysisTask analysisTask, FileDataMongoDto analysisFile, MongoTemplate mongoTemplate, CountDownLatch countDownLatch) {
|
||
|
this.mongoTemplate = mongoTemplate;
|
||
|
this.analysisTask = analysisTask;
|
||
|
this.analysisFile = analysisFile;
|
||
|
this.countDownLatch = countDownLatch;
|
||
|
this.solrUtils = SpringContextUtils.getBean(SolrUtils.class);
|
||
|
this.redisUtil = SpringContextUtils.getBean(RedisUtil.class);
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* 文件级溯源分析
|
||
|
* 当前级别溯源分析 需要在 项目级级分析完成后执行
|
||
|
* 当前文件源MD5 已经在solr库中匹配不到了,需要提取特征去匹配
|
||
|
*/
|
||
|
@Override
|
||
|
public void run() {
|
||
|
//执行任务前,判断一下任务执行的状态
|
||
|
Object status = redisUtil.get(String.format(RedisConst.TASK_RUNNING_STATUS_KEY_PREFIX, analysisTask.getId()));
|
||
|
if (status != null && (status.equals(AnalysisStatusEnum.STOP_ANALYSIS.getCode()) || status.equals(AnalysisStatusEnum.PAUSE_ANALYSIS.getCode()))) {
|
||
|
log.info("任务已取消,fileName:{}", analysisFile.getName());
|
||
|
countDownLatch.countDown();
|
||
|
return;
|
||
|
}
|
||
|
//获取当前文件名称
|
||
|
String fileName = analysisFile.getName();
|
||
|
|
||
|
AnalysisLogUtil.insert(mongoTemplate, "【文件级分析】正在分析" + fileName);
|
||
|
try {
|
||
|
//只有主流语言的才能解析
|
||
|
//非32种主流语言的不能提取文件特征,在文件级MD5匹配的时候,已经做过匹配
|
||
|
if (StringUtils.isNotEmpty(analysisFile.getSuffix()) && FixedValue.SUFFIX_SOLR_VERSION.containsKey(analysisFile.getSuffix())) {
|
||
|
//根据文件后缀 查询 *_CutFileInfo库名称
|
||
|
String featureCoreName = FixedValue.SUFFIX_SOLR_FILE.get(analysisFile.getSuffix());
|
||
|
//根据文件名称,获取文件解析器
|
||
|
Analysis analysis = AnalysisFactory.getAnalysis(fileName);
|
||
|
//如果 analysis 返回值为null 说明还未支持这种语言的特征提取 可以直接通过文件的MD5值去solr库中匹配
|
||
|
if (analysis != null) {
|
||
|
//如果文件大小超过3M,则不进行文件级行级特征提取
|
||
|
Integer fileSize = analysisFile.getFileSize();
|
||
|
if (fileSize < (3 * 1024 * 1024)) {
|
||
|
CodeFile codeFile = analysis.analysisFile(analysisFile.getFileUrl(), "1", "0");
|
||
|
//根据文件的特征值,去相应文件文件后缀的特征库中进行查询
|
||
|
if (codeFile != null) {
|
||
|
String querySb = "sourceMd5:" + codeFile.getSourceMd5() + " OR cutFileMd5:" + codeFile.getCutFileMd5() + " OR traitFileMd5:" + codeFile.getTraitFileMd5();
|
||
|
SolrDocumentList openSourceFileList = solrUtils.query(featureCoreName, querySb, "sourceMd5");
|
||
|
//如果当前文件在源码库中,匹配到了数据,则统计当前文件的开源率
|
||
|
if (CollectionUtils.isNotEmpty(openSourceFileList)) {
|
||
|
ananlyzeFileOpenRate(openSourceFileList);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
//更新文件级分析结果
|
||
|
analysisFile.setFileAnalysisStatus(FileAnalysisStatusEnum.ANALYSIS_DONE.getCode());
|
||
|
mongoTemplate.update(FileDataMongoDto.class)
|
||
|
.matching(where("_id").is(analysisFile.getId()))
|
||
|
.replaceWith(analysisFile)
|
||
|
.findAndReplace();
|
||
|
} catch (Exception e) {
|
||
|
AnalysisLogUtil.insertErrorInfo(mongoTemplate, "【文件级】提取失败" + fileName, e);
|
||
|
log.error("文件:" + fileName + "文件级别特征提取失败!", e);
|
||
|
//将当前文件的分析状态变更为失败
|
||
|
analysisFile.setFileAnalysisStatus(FileAnalysisStatusEnum.FAILED_ANALYSIS.getCode());
|
||
|
//更新文件级分析结果
|
||
|
mongoTemplate.update(FileDataMongoDto.class)
|
||
|
.matching(where("_id").is(analysisFile.getId()))
|
||
|
.apply(new Update().set("fileAnalysisStatus", FileAnalysisStatusEnum.FAILED_ANALYSIS.getCode()))
|
||
|
.first();
|
||
|
} finally {
|
||
|
countDownLatch.countDown();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* 分析文件的开源率
|
||
|
*
|
||
|
* @param fileList 匹配的开源文件信息
|
||
|
* @throws IOException
|
||
|
*/
|
||
|
private void ananlyzeFileOpenRate(SolrDocumentList fileList) throws IOException {
|
||
|
//创建匹配开源文件信息匹配对象
|
||
|
MatchOpenFileMongoDto matchOpenFileInfo = new MatchOpenFileMongoDto();
|
||
|
matchOpenFileInfo.setId(IdGenerator.uuid32())
|
||
|
.setFileName(analysisFile.getName())
|
||
|
.setFilePath(analysisFile.getFileUrl());
|
||
|
|
||
|
//根据匹配的开源文件的md5 获取版本ID
|
||
|
Set<String> sourceFileMd5 = fileList.stream().map(solrDocument -> (String) solrDocument.get("sourceMd5")).collect(Collectors.toSet());
|
||
|
String sourceCoreName = FixedValue.SUFFIX_SOLR_VERSION.get(analysisFile.getSuffix());
|
||
|
Map<String, SolrDocument> md5VersionObjMap = solrUtils.batchQueryVersionIdFromSourceFileBaseBySourceMd5(sourceCoreName, sourceFileMd5);
|
||
|
|
||
|
//根据版本ID获取版本信息
|
||
|
Set<String> versionIds = md5VersionObjMap.values().stream().map(solrDocument -> (String) solrDocument.get("versionId")).collect(Collectors.toSet());
|
||
|
List<VersionTree> treeInfoList = solrUtils.queryBatchVersionInfoByVersionIds(versionIds);
|
||
|
Map<String, VersionTree> versionIdMap = treeInfoList.stream().collect(Collectors.toMap(VersionTree::getVersionId, Function.identity()));
|
||
|
|
||
|
//获取被测件文本内容
|
||
|
String fileContent = new String(Files.readAllBytes(Paths.get(analysisFile.getFileUrl())), "utf-8").replaceAll(" ", "");
|
||
|
|
||
|
//将被测件的文本内容拆分成行信息,用于匹配开源信息
|
||
|
List<String> fileLines = SimilarityUtil.getSplitWords(fileContent);
|
||
|
|
||
|
HashSet<Integer> openLineNum = new HashSet<>();
|
||
|
|
||
|
//开源文件结果集合
|
||
|
ArrayList<MatchOpenFile> matchOpenFileList = new ArrayList<>();
|
||
|
//遍历匹配到的开源文件列表
|
||
|
for (int i = 0; i < fileList.size(); i++) {
|
||
|
String openFileMd5 = (String) fileList.get(i).get("sourceMd5");
|
||
|
SolrDocument versionObj = md5VersionObjMap.get(openFileMd5);
|
||
|
String versionId = (String) versionObj.get("versionId");
|
||
|
VersionTree versionInfo = versionIdMap.get(versionId);
|
||
|
if (versionInfo == null) {
|
||
|
log.error("未在versionTree中找到版本信息,openFileMd5:{},versionId:{}",openFileMd5, versionId);
|
||
|
continue;
|
||
|
}
|
||
|
MatchOpenFile matchOpenFile = new MatchOpenFile();
|
||
|
matchOpenFile.setId(IdGenerator.uuid32())
|
||
|
.setVersionId(versionId)
|
||
|
.setSourceFilePath((String) versionObj.get("fullPath"))
|
||
|
.setSourceUrl(versionInfo.getDownUrl())
|
||
|
.setPId(versionInfo.getProId())
|
||
|
.setPName(versionInfo.getProName())
|
||
|
.setLicenseType(versionInfo.getLicenseType())
|
||
|
.setAnalyzeType(AnalysisLevelEnum.FILE_LEVEL.getCode())
|
||
|
.setVersion(versionInfo.getVersionName())
|
||
|
.setFeatureSimilarity(100.00f);
|
||
|
//计算被测件和开源文件的文本相似度
|
||
|
//根据文件的MD5的第一位获取solr库索引名称
|
||
|
String solrNameIndex =openFileMd5.substring(0, 1) + SolrDBConst.CORE_NAME_SUFFIX_SOURCE_FILE_INFO;
|
||
|
SolrDocumentList sourceFileInfo = solrUtils.query(solrNameIndex, "sourceFileMd5:" + openFileMd5, "sourceContent");
|
||
|
if (CollectionUtils.isNotEmpty(sourceFileInfo)) {
|
||
|
String openSourceContent = String.valueOf(sourceFileInfo.get(0).getFieldValue("sourceContent"));
|
||
|
//这里存在优化空间,被测件的文件行拆分 可以拿到循环外面
|
||
|
double similarity = SimilarityUtil.getSimilarityAndSaveRowNum(fileLines, openSourceContent, openLineNum);
|
||
|
matchOpenFile.setOpenRate(new BigDecimal(similarity * 100).setScale(2, RoundingMode.HALF_UP).floatValue());
|
||
|
//如果找不到源代码,直接将原文开源率置为 100%
|
||
|
} else {
|
||
|
log.error("找不到源代码,DBname:{},sourceFileMd5:{}", solrNameIndex, openFileMd5);
|
||
|
matchOpenFile.setOpenRate(100.00f);
|
||
|
}
|
||
|
matchOpenFile.setMd5(openFileMd5);
|
||
|
matchOpenFileList.add(matchOpenFile);
|
||
|
}
|
||
|
//统计被测件的总体开源率
|
||
|
//获取开源率阈值,判断当前文件是否开源
|
||
|
Integer openRateThreshold = analysisTask.getOpenRateThreshold();
|
||
|
int openLineCount = openLineNum.size();
|
||
|
BigDecimal totalLineCount = new BigDecimal(fileLines.size());
|
||
|
BigDecimal openRate = new BigDecimal(openLineCount).divide(totalLineCount, 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100));
|
||
|
|
||
|
//超过阈值,则认为当前文件是开源文件
|
||
|
if (openRate.compareTo(new BigDecimal(openRateThreshold)) > 0) {
|
||
|
analysisFile.setOpenType(true);
|
||
|
} else {
|
||
|
analysisFile.setOpenType(false);
|
||
|
}
|
||
|
|
||
|
//修改保存测试文件信息
|
||
|
analysisFile.setOpenLineCount(openLineCount)
|
||
|
.setOpenRate(openRate.floatValue());
|
||
|
|
||
|
//组装开源信息
|
||
|
matchOpenFileInfo.setFilePath(analysisFile.getFileUrl())
|
||
|
.setOpenType(analysisFile.getOpenType())
|
||
|
.setOpenRate(analysisFile.getOpenType() ? 100.00f : 0.00f)
|
||
|
.setMatchOpenFile(matchOpenFileList);
|
||
|
|
||
|
//保存当前开源信息数据
|
||
|
mongoTemplate.insert(matchOpenFileInfo);
|
||
|
|
||
|
}
|
||
|
|
||
|
}
|