You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
compose-analysis/src/main/java/com/keyware/composeanalysis/task/FunctionAnalysisTask.java

410 lines
20 KiB

7 months ago
package com.keyware.composeanalysis.task;
import cn.hutool.core.collection.CollectionUtil;
import cn.hutool.core.lang.Pair;
import cn.hutool.core.util.ObjUtil;
import com.alibaba.fastjson.JSONArray;
import com.keyware.composeanalysis.constant.FixedValue;
import com.keyware.composeanalysis.constant.RedisConst;
import com.keyware.composeanalysis.constant.SolrDBConst;
import com.keyware.composeanalysis.constant.enums.AnalysisLevelEnum;
import com.keyware.composeanalysis.constant.enums.AnalysisStatusEnum;
import com.keyware.composeanalysis.constant.enums.FileAnalysisStatusEnum;
import com.keyware.composeanalysis.entity.AnalysisTask;
import com.keyware.composeanalysis.mongo.FileDataMongoDto;
import com.keyware.composeanalysis.mongo.LineDataMongoDto;
import com.keyware.composeanalysis.mongo.MatchOpenFile;
import com.keyware.composeanalysis.mongo.MatchOpenFileMongoDto;
import com.keyware.composeanalysis.solr.VersionTree;
import com.keyware.composeanalysis.util.*;
import com.keyware.keyswan.common.LineModel;
import com.keyware.keyware.anaysis.Analysis;
import com.keyware.keyware.anaysis.AnalysisFactory;
import com.keyware.keyware.common.CodeFile;
import com.keyware.keyware.common.Function;
import com.keyware.utils.IdGenerator;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.lang3.StringUtils;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.springframework.data.mongodb.core.MongoTemplate;
import org.springframework.data.mongodb.core.query.Update;
import java.io.FileInputStream;
import java.math.BigDecimal;
import java.math.RoundingMode;
import java.util.*;
import java.util.concurrent.CountDownLatch;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static org.springframework.data.mongodb.core.query.Criteria.where;
/**
* @author liuzongren
* @ClassName FunctionAnalysisTask
* @description: Function-level source-code provenance (traceability) task
* @datetime 2024-07-25 16:19
* @version: 1.0
*/
@Log4j2
public class FunctionAnalysisTask extends IAnalysisTask {
/** Mongo access used to persist match results, analysis logs and file status updates. */
private final MongoTemplate mongoTemplate;
/** The analysis task this file belongs to; supplies the task id and the open-rate threshold. */
private final AnalysisTask analysisTask;
/** File information of the artifact under test. */
private final FileDataMongoDto analysisFile;
/** Solr query helper, looked up from the Spring context in the constructor. */
private final SolrUtils solrUtils;
/** Redis helper used to read the running status of the task. */
private final RedisUtil redisUtil;
/** Latch that {@link #run()} counts down once when it finishes (normally, on failure, or when cancelled). */
private final CountDownLatch countDownLatch;

/**
 * Creates a function-level analysis task for a single file.
 * <p>
 * {@code SolrUtils} and {@code RedisUtil} are not passed in; they are resolved through
 * {@code SpringContextUtils.getBean}.
 *
 * @param analysisTask   the owning analysis task
 * @param analysisFile   the file under test
 * @param mongoTemplate  Mongo template used for all persistence in this task
 * @param countDownLatch latch to count down when this task is done
 */
public FunctionAnalysisTask(AnalysisTask analysisTask, FileDataMongoDto analysisFile, MongoTemplate mongoTemplate, CountDownLatch countDownLatch) {
    this.mongoTemplate = mongoTemplate;
    this.analysisTask = analysisTask;
    this.analysisFile = analysisFile;
    this.countDownLatch = countDownLatch;
    this.solrUtils = SpringContextUtils.getBean(SolrUtils.class);
    this.redisUtil = SpringContextUtils.getBean(RedisUtil.class);
}
/**
 * Runs function-level (method / code-block) source provenance analysis for the file under test.
 * <p>
 * This task is expected to run after file-level analysis has completed.
 * <ol>
 *   <li>If the task status stored in Redis is "stop" or "pause", nothing is analysed.</li>
 *   <li>Otherwise the file is parsed into functions, and the function MD5s (feature MD5 and
 *       source MD5) are looked up in the Solr feature core selected by the file suffix.</li>
 *   <li>When Solr returns matching open-source files they are analysed by {@code doAnalysis}.
 *       In every other case (no parser/core for the suffix, parse produced no functions,
 *       no Solr hit) the whole-file MD5 is checked as a fallback, because the function
 *       feature library is known to be incomplete.</li>
 *   <li>The file status is updated to "analysis done", or to "failed" on any exception.</li>
 * </ol>
 * The latch is always counted down exactly once.
 */
@Override
public void run() {
    // Before doing any work, check whether the task has been stopped or paused.
    Object status = redisUtil.get(String.format(RedisConst.TASK_RUNNING_STATUS_KEY_PREFIX, analysisTask.getId()));
    if (status != null && (status.equals(AnalysisStatusEnum.STOP_ANALYSIS.getCode()) || status.equals(AnalysisStatusEnum.PAUSE_ANALYSIS.getCode()))) {
        log.info("任务已取消,fileName:{}", analysisFile.getName());
        countDownLatch.countDown();
        return;
    }
    // Location and name of the file under test.
    String filePath = analysisFile.getFileUrl();
    String fileName = analysisFile.getName();
    try {
        // Solr feature core to query, chosen by file suffix.
        String featureCoreName = FixedValue.SUFFIX_SOLR_FILE.get(analysisFile.getSuffix());
        // Solr sourceFileBase core (file -> version information), chosen by file suffix.
        String sourceFileBaseCoreName = FixedValue.SUFFIX_SOLR_VERSION.get(analysisFile.getSuffix());
        // Function parser selected from the file path.
        Analysis analysis = AnalysisFactory.getAnalysis(filePath);

        boolean matchedByFunctionFeature = false;
        if (!ObjUtil.hasEmpty(featureCoreName, sourceFileBaseCoreName, analysis)) {
            CodeFile codeFile;
            // try-with-resources: the stream used to be left open after parsing.
            try (FileInputStream sourceStream = new FileInputStream(filePath)) {
                codeFile = analysis.analysisFile(sourceStream);
            }
            List<Function> functionList = codeFile == null ? null : codeFile.getFunctionList();
            if (CollectionUtil.isNotEmpty(functionList)) {
                // Query by both the feature MD5 and the source MD5 of every function.
                Set<String> queryMd5s = Stream.concat(
                                functionList.stream().map(Function::getMd5),
                                functionList.stream().map(Function::getSourceMd5))
                        .collect(Collectors.toSet());
                String queryStr = "fun_hay:(" + StringUtils.join(queryMd5s, " OR ") + ")";
                SolrDocumentList matchOpenFiles = solrUtils.query(featureCoreName, queryStr, "sourceMd5,fun_hay");
                // An empty result means "no match" just like null does, so it must not reach doAnalysis.
                if (CollectionUtil.isNotEmpty(matchOpenFiles)) {
                    doAnalysis(matchOpenFiles, sourceFileBaseCoreName, codeFile);
                    matchedByFunctionFeature = true;
                }
            }
        }
        if (!matchedByFunctionFeature) {
            // The function feature library is sparse: when parsing failed or nothing matched at
            // function level, look the file up once more by its whole-file MD5.
            checkByOriginalFileMd5(sourceFileBaseCoreName, analysisFile.getMd5());
        }

        // Mark the file as done and persist it, including any openType change made during analysis.
        analysisFile.setFileAnalysisStatus(FileAnalysisStatusEnum.ANALYSIS_DONE.getCode());
        mongoTemplate.update(FileDataMongoDto.class)
                .matching(where("_id").is(analysisFile.getId()))
                .replaceWith(analysisFile)
                .findAndReplace();
        AnalysisLogUtil.insert(mongoTemplate, "【函数级分析】完成" + fileName);
        log.info("文件{}:函数级分析完成", fileName);
    } catch (Exception e) {
        AnalysisLogUtil.insertErrorInfo(mongoTemplate, "【函数级级分析】失败" + fileName, e);
        log.error("文件:{}函数级别特征提取失败!", fileName, e);
        // Mark the current file as failed.
        mongoTemplate.update(FileDataMongoDto.class)
                .matching(where("_id").is(analysisFile.getId()))
                .apply(new Update().set("fileAnalysisStatus", FileAnalysisStatusEnum.FAILED_ANALYSIS.getCode()))
                .first();
    } finally {
        countDownLatch.countDown();
    }
}
/**
 * Compares the file under test with the matched open-source files at function level and
 * stores the overall result in Mongo.
 * <p>
 * Two overall figures are computed:
 * <ul>
 *   <li>feature similarity: code rows of functions whose feature MD5 matched, divided by the
 *       code rows of all functions;</li>
 *   <li>open rate: number of distinct matched line numbers, divided by the file's code rows.</li>
 * </ul>
 * Both are percentages with two decimals; a zero denominator yields {@code 0.00} instead of
 * an {@link ArithmeticException}. When the open rate exceeds the task's threshold the file
 * is flagged as open source.
 *
 * @param matchOpenFiles         open-source files found through function feature matching
 * @param sourceFileBaseCoreName Solr core used to resolve open-source file version ids
 * @param fileAnalysisRes        function parse result of the file under test
 * @throws Exception declared for callers; propagated from the underlying analysis
 */
private void doAnalysis(SolrDocumentList matchOpenFiles, String sourceFileBaseCoreName, CodeFile fileAnalysisRes) throws Exception {
    // Group by feature MD5: getters, setters and similar methods share the same feature value.
    Map<String, List<Function>> featureMd5FunctionMap = fileAnalysisRes.getFunctionList().stream().collect(Collectors.groupingBy(Function::getMd5));
    // Total number of code rows across all functions.
    int totalFunctionLineCount = fileAnalysisRes.getFunctionList().stream().mapToInt(Function::getCodeRowNum).sum();
    // Accumulators filled by calculateSimilarityAndOpenRate across all matched open-source files.
    Set<String> matchFeatureFunctionMd5s = new HashSet<>();
    Set<Integer> matchOpenLineRowsNum = new HashSet<>();
    // Per open-source file: open rate and feature similarity.
    List<MatchOpenFile> matchOpenFilesRes = calculateSimilarityAndOpenRate(matchOpenFiles, fileAnalysisRes, sourceFileBaseCoreName, matchOpenLineRowsNum, matchFeatureFunctionMd5s);

    // Overall feature similarity: every matched feature MD5 contributes all functions sharing it.
    int matchFunctionLineCount = 0;
    for (String matchFeatureFunctionMd5 : matchFeatureFunctionMd5s) {
        matchFunctionLineCount += featureMd5FunctionMap.get(matchFeatureFunctionMd5).stream().mapToInt(Function::getCodeRowNum).sum();
    }
    BigDecimal featureSimilarity = toPercentage(new BigDecimal(matchFunctionLineCount), new BigDecimal(totalFunctionLineCount));
    // Overall open rate of the file.
    BigDecimal openRate = toPercentage(new BigDecimal(matchOpenLineRowsNum.size()), new BigDecimal(analysisFile.getCodeRowNum()));

    // Flag the file as open source when the open rate is above the configured threshold.
    Integer openRateThreshold = analysisTask.getOpenRateThreshold();
    if (openRate.floatValue() > openRateThreshold) {
        analysisFile.setOpenType(true);
    }

    // Persist the open-source information of the current file.
    MatchOpenFileMongoDto matchOpenFileMongo = new MatchOpenFileMongoDto();
    matchOpenFileMongo.setId(IdGenerator.uuid32())
            .setFilePath(analysisFile.getFileUrl())
            .setFileName(analysisFile.getName())
            .setFeatureSimilarity(featureSimilarity.floatValue())
            .setOpenRate(openRate.floatValue())
            .setOpenType(analysisFile.getOpenType())
            .setMatchOpenFile(matchOpenFilesRes);
    mongoTemplate.save(matchOpenFileMongo);
}

/**
 * Returns {@code part / whole} as a percentage with two decimals, or {@code 0.00} when
 * {@code whole} is not positive.
 */
private static BigDecimal toPercentage(BigDecimal part, BigDecimal whole) {
    if (whole.signum() <= 0) {
        return BigDecimal.ZERO.setScale(2);
    }
    return part.divide(whole, 4, RoundingMode.HALF_UP)
            .multiply(new BigDecimal(100))
            .setScale(2, RoundingMode.HALF_UP);
}
/**
 * Computes, for every matched open-source file, the feature similarity and open rate of the
 * file under test against it, and resolves the open-source project/version it belongs to.
 * <p>
 * As a side effect the two accumulator sets passed in are filled so the caller can compute
 * file-wide figures. A matched file is skipped (with a warning) when its version information
 * or its source content cannot be found; skipped files contribute nothing to the accumulators.
 *
 * @param matchOpenFiles           all open-source files matched through function MD5s
 * @param fileAnalysisRes          function parse result of the file under test
 * @param sourceFileBaseCoreName   Solr core used to map an open-source file MD5 to its version id
 * @param matchLineRowsNum         accumulator: matched line numbers over all open-source files
 * @param matchFeatureFunctionMd5s accumulator: matched function feature MD5s over all open-source files
 * @return one entry per open-source file that could be fully resolved
 */
private List<MatchOpenFile> calculateSimilarityAndOpenRate(SolrDocumentList matchOpenFiles, CodeFile fileAnalysisRes, String sourceFileBaseCoreName, Set<Integer> matchLineRowsNum, Set<String> matchFeatureFunctionMd5s) {
    List<MatchOpenFile> matchOpenFilesRes = new ArrayList<>();
    // Group by feature MD5: getters, setters and similar methods share the same feature value.
    Map<String, List<Function>> featureMd5FunctionMap = fileAnalysisRes.getFunctionList().stream().collect(Collectors.groupingBy(Function::getMd5));

    // Resolve version id and path for every matched open-source file MD5.
    Set<String> openSourceFileMd5s = matchOpenFiles.stream().map(doc -> (String) doc.get("sourceMd5")).collect(Collectors.toSet());
    Map<String, SolrDocument> md5VersionInfoMap = solrUtils.batchQueryVersionIdFromSourceFileBaseBySourceMd5(sourceFileBaseCoreName, openSourceFileMd5s);
    if (md5VersionInfoMap == null) {
        md5VersionInfoMap = Collections.emptyMap();
    }
    // Resolve the version details for those version ids.
    // TODO: looking up one version needs two Solr cores plus the version tree; optimise later.
    Set<String> openSourceFileVersionIds = md5VersionInfoMap.values().stream().map(doc -> (String) doc.get("versionId")).collect(Collectors.toSet());
    List<VersionTree> versionTrees = solrUtils.queryBatchVersionInfoByVersionIds(openSourceFileVersionIds);
    if (versionTrees == null) {
        versionTrees = Collections.emptyList();
    }
    // Keep the first entry on duplicate version ids instead of failing with IllegalStateException.
    Map<String, VersionTree> versionIdVersionInfoMap = versionTrees.stream()
            .collect(Collectors.toMap(VersionTree::getVersionId, java.util.function.Function.identity(), (first, duplicate) -> first));

    // Loop invariants: total function code rows and the text of the file under test.
    BigDecimal totalFunctionLineCount = new BigDecimal(fileAnalysisRes.getFunctionList().stream().mapToInt(Function::getCodeRowNum).sum());
    String analysedFileContent = new String(fileAnalysisRes.getFileContent());

    for (SolrDocument openSourceFile : matchOpenFiles) {
        String openSourceFileMd5 = openSourceFile.getFieldValue("sourceMd5").toString();

        // Validate everything that may be missing before touching the shared accumulators.
        SolrDocument openEntries = md5VersionInfoMap.get(openSourceFileMd5);
        VersionTree versionInfo = openEntries == null ? null : versionIdVersionInfoMap.get(openEntries.get("versionId"));
        if (versionInfo == null) {
            log.warn("No version information found for matched open-source file, skipping. sourceMd5:{}", openSourceFileMd5);
            continue;
        }
        // The source-content core is selected by the first character of the file MD5.
        String openSourceCodeCoreIndex = openSourceFileMd5.substring(0, 1) + SolrDBConst.CORE_NAME_SUFFIX_SOURCE_FILE_INFO;
        SolrDocument openSourceContent = solrUtils.queryOne(openSourceCodeCoreIndex, "sourceFileMd5:" + openSourceFileMd5, "sourceContent");
        Object openSourceText = openSourceContent == null ? null : openSourceContent.getFieldValue("sourceContent");
        if (openSourceText == null) {
            log.warn("No source content found for matched open-source file, skipping. sourceMd5:{}", openSourceFileMd5);
            continue;
        }

        // Feature MD5s of the functions in the open-source file.
        List<Function> openFileFunctionList = getOpenFileFunctionList(openSourceFile);
        Set<String> openFunctionMd5s = openFileFunctionList == null
                ? Collections.<String>emptySet()
                : openFileFunctionList.stream().filter(Objects::nonNull).map(Function::getMd5).collect(Collectors.toSet());

        // Each feature MD5 is counted once per open-source file; all local functions that
        // share the matched feature MD5 contribute their code rows together.
        int currentFileMatchFeatureLineCount = 0;
        for (Map.Entry<String, List<Function>> featureGroup : featureMd5FunctionMap.entrySet()) {
            if (openFunctionMd5s.contains(featureGroup.getKey())) {
                matchFeatureFunctionMd5s.add(featureGroup.getKey());
                currentFileMatchFeatureLineCount += featureGroup.getValue().stream().mapToInt(Function::getCodeRowNum).sum();
            }
        }

        // Open rate against this open-source file, plus the matched line numbers.
        Pair<Float, HashSet<Integer>> openRateAndSaveRowNum = SimilarityUtil.getOpenRateAndSaveRowNum(analysedFileContent, openSourceText.toString());
        // Accumulate matched line numbers so the caller can compute the overall open rate.
        matchLineRowsNum.addAll(openRateAndSaveRowNum.getValue());

        // Feature similarity against this open-source file; 0.00 when there are no function rows.
        BigDecimal featureSimilarity = totalFunctionLineCount.signum() <= 0
                ? BigDecimal.ZERO.setScale(2)
                : new BigDecimal(currentFileMatchFeatureLineCount)
                        .divide(totalFunctionLineCount, 4, RoundingMode.HALF_UP)
                        .multiply(new BigDecimal(100))
                        .setScale(2, RoundingMode.HALF_UP);

        // Assemble the open-source project information for this file.
        MatchOpenFile matchOpenFileInfo = new MatchOpenFile();
        matchOpenFileInfo.setPId(versionInfo.getProId())
                .setPName(versionInfo.getProName())
                .setSourceUrl((String) openEntries.get("fullPath"))
                .setFeatureSimilarity(featureSimilarity.floatValue())
                .setOpenRate(openRateAndSaveRowNum.getKey())
                .setVersion(versionInfo.getVersionName())
                .setLicenseType(versionInfo.getLicenseType())
                .setAnalyzeType(AnalysisLevelEnum.FUNCTION_LEVEL.getCode());
        matchOpenFilesRes.add(matchOpenFileInfo);
    }
    return matchOpenFilesRes;
}
/**
 * Fallback check: because the function feature library may be incomplete, looks the file up
 * once more by its whole-file MD5 and, when found, stores a 100% match in Mongo.
 * <p>
 * Nothing is stored when the MD5 is blank, when no open-source file has that MD5, or when
 * the version details cannot be resolved.
 *
 * @param versionIdCoreName Solr core that maps a source file MD5 to its version id and path
 * @param originalFileMd5   MD5 of the file under test
 */
private void checkByOriginalFileMd5(String versionIdCoreName, String originalFileMd5) {
    // Without an MD5 the query below would search for the literal text "null".
    if (StringUtils.isBlank(originalFileMd5)) {
        log.warn("Skipping file-MD5 fallback lookup because the file has no MD5. fileName:{}", analysisFile.getName());
        return;
    }
    // Is the file present in the open-source code library?
    SolrDocument versionIdAndPath = solrUtils.queryOne(versionIdCoreName, "sourceFileMd5:" + originalFileMd5, "versionId,fullPath,sourceFileMd5");
    if (versionIdAndPath == null) {
        return;
    }
    // Resolve the version details from the version id.
    VersionTree versionInfo = solrUtils.queryVersionInfoByVersionId((String) versionIdAndPath.get("versionId"));
    if (versionInfo == null) {
        return;
    }

    // Open-source project information; version and licence are filled in the same way as for
    // matches found through function features.
    // NOTE(review): "fullPath" is fetched above but the download URL is stored as source URL,
    // unlike the function-feature path which stores "fullPath" — confirm which one is intended.
    MatchOpenFile matchOpenFileInfo = new MatchOpenFile();
    matchOpenFileInfo.setPId(versionInfo.getProId())
            .setPName(versionInfo.getProName())
            .setSourceUrl(versionInfo.getDownUrl())
            .setFeatureSimilarity(100.00f)
            .setOpenRate(100.00f)
            .setVersion(versionInfo.getVersionName())
            .setLicenseType(versionInfo.getLicenseType())
            .setAnalyzeType(AnalysisLevelEnum.FUNCTION_LEVEL.getCode());

    // Persist the open-source information of the current file.
    // NOTE(review): openType is copied from analysisFile unchanged; an exact MD5 match does not
    // flag the file as open source here — confirm that an earlier stage is responsible for that.
    MatchOpenFileMongoDto matchOpenFileMongo = new MatchOpenFileMongoDto();
    matchOpenFileMongo.setId(IdGenerator.uuid32())
            .setFilePath(analysisFile.getFileUrl())
            .setFileName(analysisFile.getName())
            .setFeatureSimilarity(100.00f)
            .setOpenRate(100.00f)
            .setOpenType(analysisFile.getOpenType())
            .setMatchOpenFile(Collections.singletonList(matchOpenFileInfo));
    mongoTemplate.save(matchOpenFileMongo);
}
/**
 * Parses the function features stored in the {@code fun_hay} field of a matched open-source
 * file document.
 * <p>
 * The field text is un-escaped (backslashes removed, quoted JSON objects unquoted) and then
 * parsed as a JSON array of {@code Function}. This method never returns {@code null}: a
 * missing field, a parse failure (logged) or a null parse result all yield an empty list.
 *
 * @param matchOpenFile Solr document of the matched open-source file
 * @return the parsed functions, or an empty mutable list when none could be read
 */
private List<Function> getOpenFileFunctionList(SolrDocument matchOpenFile) {
    try {
        Object rawFeature = matchOpenFile.getFieldValue("fun_hay");
        if (rawFeature != null) {
            String functionFeatureJson = rawFeature.toString()
                    .replace("\\", "")
                    .replace("\"{", "{")
                    .replace("}\"", "}");
            List<Function> parsedFunctions = JSONArray.parseArray(functionFeatureJson, Function.class);
            // Guard against a null parse result so callers can iterate unconditionally.
            if (parsedFunctions != null) {
                return parsedFunctions;
            }
        }
    } catch (Exception e) {
        log.error("解析文件特征值失败", e);
    }
    return new ArrayList<>();
}
/**
 * Inserts feature values into Mongo in batches.
 * <p>
 * The features are written in chunks of 10, because a single document holding all parsed
 * data could exceed MongoDB's size limit. The same {@code lineDataMongoDto} instance is
 * reused for every chunk, receiving a fresh id and the current chunk before each insert.
 * When there are no features, the DTO is inserted once with a fresh id and no line models set.
 *
 * @param features         feature collection to store; may be null or empty
 * @param lineDataMongoDto carrier document for the current analysis task's feature information
 */
@Deprecated
private void insertFeatureValue(List<LineModel> features, LineDataMongoDto lineDataMongoDto) {
    if (CollectionUtil.isEmpty(features)) {
        lineDataMongoDto.setId(IdGenerator.uuid32());
        mongoTemplate.insert(lineDataMongoDto);
        return;
    }

    final int chunkSize = 10;
    final int featureCount = features.size();
    // One shared buffer is handed to the DTO for every chunk.
    List<LineModel> chunkBuffer = new ArrayList<>();
    for (int from = 0; from < featureCount; from += chunkSize) {
        int to = Math.min(from + chunkSize, featureCount);
        chunkBuffer.addAll(features.subList(from, to));
        lineDataMongoDto.setId(IdGenerator.uuid32())
                .setLineModels(chunkBuffer);
        mongoTemplate.insert(lineDataMongoDto);
        // Only a full chunk is cleared afterwards; a trailing partial chunk stays in the buffer.
        if (to - from == chunkSize) {
            chunkBuffer.clear();
        }
    }
}
}