You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
405 lines
19 KiB
405 lines
19 KiB
package com.keyware.composeanalysis.task;
|
|
|
|
|
|
import cn.hutool.core.collection.CollUtil;
|
|
import cn.hutool.core.collection.CollectionUtil;
|
|
import cn.hutool.core.io.FileUtil;
|
|
import cn.hutool.core.lang.Pair;
|
|
import com.alibaba.fastjson.JSONArray;
|
|
import com.keyware.common.constant.enums.AnalysisStatusEnum;
|
|
import com.keyware.composeanalysis.constant.FixedValue;
|
|
import com.keyware.composeanalysis.constant.FunctionAndAnalysisAssemblyConst;
|
|
import com.keyware.composeanalysis.constant.RedisConst;
|
|
import com.keyware.composeanalysis.constant.SolrDBConst;
|
|
import com.keyware.composeanalysis.constant.enums.AnalysisLevelEnum;
|
|
import com.keyware.composeanalysis.constant.enums.FileAnalysisStatusEnum;
|
|
import com.keyware.composeanalysis.entity.AnalysisTask;
|
|
import com.keyware.composeanalysis.mongo.FileDataMongoDto;
|
|
import com.keyware.composeanalysis.mongo.LineDataMongoDto;
|
|
import com.keyware.composeanalysis.mongo.MatchOpenFile;
|
|
import com.keyware.composeanalysis.mongo.MatchOpenFileMongoDto;
|
|
import com.keyware.composeanalysis.solr.VersionTree;
|
|
import com.keyware.composeanalysis.util.*;
|
|
import com.keyware.keyswan.anaysis.Analysis;
|
|
import com.keyware.keyswan.anaysis.AnalysisFactory;
|
|
import com.keyware.keyswan.common.CodeFile;
|
|
import com.keyware.keyswan.common.LineModel;
|
|
import com.keyware.utils.IdGenerator;
|
|
import lombok.extern.log4j.Log4j2;
|
|
import org.apache.commons.lang3.StringUtils;
|
|
import org.apache.solr.common.SolrDocument;
|
|
import org.apache.solr.common.SolrDocumentList;
|
|
import org.springframework.data.mongodb.core.MongoTemplate;
|
|
import org.springframework.data.mongodb.core.query.Update;
|
|
|
|
import java.math.BigDecimal;
|
|
import java.math.RoundingMode;
|
|
import java.util.*;
|
|
import java.util.concurrent.CountDownLatch;
|
|
import java.util.stream.Collectors;
|
|
import java.util.stream.Stream;
|
|
|
|
import static com.keyware.composeanalysis.util.SimilarityUtil.getOpenRateAndSaveRowNum;
|
|
import static org.springframework.data.mongodb.core.query.Criteria.where;
|
|
|
|
/**
 * Code-block-level source tracing (provenance) task.
 *
 * @author liuzongren
 * @ClassName CodeBlockAnalysisTask
 * @description: code-block-level source tracing task
 * @datetime 2024-07-25 16:19
 * @version: 1.0
 */
|
|
|
|
@Log4j2
|
|
public class CodeBlockAnalysisTask extends IAnalysisTask {
|
|
|
|
/** Mongo access used to persist match results, file status updates and analysis logs. */
private final MongoTemplate mongoTemplate;

/** The analysis task this file belongs to; supplies the task id and the open-rate threshold. */
private final AnalysisTask analysisTask;

/** File record of the artifact under test that this task analyses. */
private final FileDataMongoDto analysisFile;

/** Solr helper, looked up from the Spring context in the constructor. */
private final SolrUtils solrUtils;

/** Redis helper, looked up from the Spring context; used to read the task's running status. */
private final RedisUtil redisUtil;

/** Latch that {@link #run()} counts down when the analysis of this file ends. */
private final CountDownLatch countDownLatch;

/**
 * Creates a code-block-level analysis task for a single file.
 * <p>
 * All collaborators are assigned exactly once here, so the fields are {@code final}; this also
 * guarantees they are safely visible to whichever thread later executes {@link #run()}.
 *
 * @param analysisTask   the task the file belongs to
 * @param analysisFile   the file record to analyse
 * @param mongoTemplate  Mongo access used for persistence
 * @param countDownLatch latch to count down once this file has been processed
 */
public CodeBlockAnalysisTask(AnalysisTask analysisTask, FileDataMongoDto analysisFile, MongoTemplate mongoTemplate, CountDownLatch countDownLatch) {
    this.mongoTemplate = mongoTemplate;
    this.analysisTask = analysisTask;
    this.analysisFile = analysisFile;
    this.countDownLatch = countDownLatch;
    // This object is created with "new" rather than by Spring, so beans are fetched from the context.
    this.solrUtils = SpringContextUtils.getBean(SolrUtils.class);
    this.redisUtil = SpringContextUtils.getBean(RedisUtil.class);
}
|
|
|
|
/**
|
|
* 方法 或者代码块 级别 源代码溯源
|
|
* 当前任务 需要在 文件级分析完成后 进行
|
|
*/
|
|
|
|
@Override
|
|
public void run() {
|
|
//执行任务前,判断一下任务执行的状态
|
|
Object status = redisUtil.get(String.format(RedisConst.TASK_RUNNING_STATUS_KEY_PREFIX, analysisTask.getId()));
|
|
if (status != null && (status.equals(AnalysisStatusEnum.STOP_ANALYSIS.getCode()) || status.equals(AnalysisStatusEnum.PAUSE_ANALYSIS.getCode()))) {
|
|
log.info("任务已取消,fileName:{}", analysisFile.getName());
|
|
countDownLatch.countDown();
|
|
return;
|
|
}
|
|
|
|
//获取文件地址
|
|
String filePath = analysisFile.getFileUrl();
|
|
//获取文件名称
|
|
String fileName = analysisFile.getName();
|
|
|
|
try {
|
|
Analysis analysis = AnalysisFactory.getAnalysis(filePath);
|
|
//将代码块特征存入MongoDB
|
|
//提取文件的代码块信息
|
|
CodeFile codeFile = analysis.analysisFile(filePath, FunctionAndAnalysisAssemblyConst.LINE_EXTRACT, FunctionAndAnalysisAssemblyConst.LINE_EXTRACT_BY_6_LINE);
|
|
List<LineModel> lineFeatures = codeFile.getLine_hay();
|
|
|
|
//根据文件后缀判断需要查询的solr特征库库名称
|
|
String featureCoreName = FixedValue.SUFFIX_SOLR_FILE.get(analysisFile.getSuffix());
|
|
|
|
//根据文件后缀,去检索sourceFileBase库,来获取文件版本信息
|
|
String sourceFileBaseCoreName = FixedValue.SUFFIX_SOLR_VERSION.get(analysisFile.getSuffix());
|
|
|
|
//从solr库中获取特征相似的文件
|
|
SolrDocumentList matchOpenSourceFiles = getFeatureSimilarityFromSolr(featureCoreName, lineFeatures);
|
|
|
|
//计算开源率
|
|
doAnalysis(matchOpenSourceFiles, sourceFileBaseCoreName, codeFile);
|
|
|
|
//更新文件表的分析状态为3 行级特征以分析完毕
|
|
analysisFile.setFileAnalysisStatus(FileAnalysisStatusEnum.ANALYSIS_DONE.getCode());
|
|
mongoTemplate.update(FileDataMongoDto.class)
|
|
.matching(where("_id").is(analysisFile.getId()))
|
|
.replaceWith(analysisFile)
|
|
.findAndReplace();
|
|
|
|
AnalysisLogUtil.insert(mongoTemplate, "【代码块级分析】完成" + fileName);
|
|
log.info("文件" + fileName + ":代码块级分析完成");
|
|
} catch (Exception e) {
|
|
AnalysisLogUtil.insertErrorInfo(mongoTemplate, "【代码块分析】失败" + fileName, e);
|
|
log.error("文件:{}代码块级分析失败!",fileName, e);
|
|
//修改当前文件分析状态未失败
|
|
mongoTemplate.update(FileDataMongoDto.class)
|
|
.matching(where("_id").is(analysisFile.getId()))
|
|
.apply(new Update().set("fileAnalysisStatus", FileAnalysisStatusEnum.FAILED_ANALYSIS.getCode()))
|
|
.first();
|
|
} finally {
|
|
countDownLatch.countDown();
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
* 根据 特征值 从特征库中检索 具有特征相似的
|
|
*
|
|
* @param solrCoreName 检索的solr 库名称
|
|
* @param functionAndCodeBlockInfos
|
|
* @return
|
|
*/
|
|
private SolrDocumentList getFeatureSimilarityFromSolr(String solrCoreName, List<LineModel> functionAndCodeBlockInfos) {
|
|
//获取函数获取代码块的特征MD5值
|
|
Set<String> traitLineMd5Arr = functionAndCodeBlockInfos.stream().map(LineModel::getTraitLineMd5).collect(Collectors.toSet());
|
|
Set<String> cuttLineMd5Arr = functionAndCodeBlockInfos.stream().map(LineModel::getCutLineMd5).collect(Collectors.toSet());
|
|
Set<String> queryMd5Arr = Stream.concat(traitLineMd5Arr.stream(), cuttLineMd5Arr.stream()).collect(Collectors.toSet());
|
|
if (CollUtil.isEmpty(queryMd5Arr)) {
|
|
log.error("特征为空,无法查询:{}", analysisFile.getName());
|
|
return new SolrDocumentList();
|
|
}
|
|
String queryStr = "line_hay:(" + StringUtils.join(queryMd5Arr, " OR ") + ")";
|
|
log.info("查询条件: solrCoreName:{},queryStr:{}", solrCoreName, queryStr);
|
|
SolrDocumentList result = solrUtils.query(solrCoreName, queryStr, "sourceMd5,line_hay");
|
|
// log.info("查询结果: result:{}", result);
|
|
return result;
|
|
}
|
|
|
|
|
|
/**
|
|
* 计算开源率 被测件的开源率
|
|
*
|
|
* @param matcheOpenSourceFiles 匹配的开源文件信息
|
|
* @param sourceFileBaseCoreName 查询版开源文件版本ID的 solr库名称
|
|
* @param fileAnalysisRes 被测件的解析结果
|
|
*/
|
|
private void doAnalysis(SolrDocumentList matcheOpenSourceFiles, String sourceFileBaseCoreName, CodeFile fileAnalysisRes) {
|
|
|
|
if (CollectionUtil.isEmpty(matcheOpenSourceFiles)) {
|
|
return;
|
|
}
|
|
|
|
|
|
//保存所有匹配的特征代码块MD5信息,方便统计总的匹配行数
|
|
Set<String> matchingTraitLineSet = new HashSet<>();
|
|
|
|
//匹配的特征代码块MD5
|
|
Set<Integer> matchedLineRowsNum = new HashSet<>();
|
|
|
|
//统计每个文件的开源率
|
|
List<MatchOpenFile> matchOpenFilesRes = calculateSimilarityAndOpenRate(matcheOpenSourceFiles, fileAnalysisRes, sourceFileBaseCoreName, matchedLineRowsNum, matchingTraitLineSet);
|
|
|
|
//计算文件的总体的特征相似度
|
|
Map<String, Integer> traitsFeatureMd5AndFeatureLineNumMap = getTraitsFeatureMd5AndFeatureLineNumMap(fileAnalysisRes.getLine_hay());
|
|
|
|
int matchCodeBlockLineCount = 0;
|
|
for (String matchFeatureMd5 : matchingTraitLineSet) {
|
|
matchCodeBlockLineCount += traitsFeatureMd5AndFeatureLineNumMap.get(matchFeatureMd5);
|
|
}
|
|
|
|
//计算文件的总体特征相似度
|
|
BigDecimal featureSimilarity = new BigDecimal(matchCodeBlockLineCount).divide(new BigDecimal(analysisFile.getCodeRowNum()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2);
|
|
|
|
//计算文件的总体开源率
|
|
BigDecimal openRate = new BigDecimal(matchedLineRowsNum.size()).divide(new BigDecimal(analysisFile.getCodeRowNum()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2);
|
|
|
|
//获取开源率的阈值
|
|
Integer openRateThreshold = analysisTask.getOpenRateThreshold();
|
|
|
|
//如果开源率大于阈值,则将当前文件设置成开源
|
|
if (openRate.compareTo(new BigDecimal(openRateThreshold)) >= 0) {
|
|
analysisFile.setOpenType(true);
|
|
}
|
|
|
|
|
|
//保存当前文件开源行数
|
|
analysisFile.setOpenLineCount(matchedLineRowsNum.size());
|
|
|
|
//保存当前文件的开源信息到mongo库中
|
|
MatchOpenFileMongoDto matchOpenFileMongo = new MatchOpenFileMongoDto();
|
|
matchOpenFileMongo.setId(IdGenerator.uuid32())
|
|
.setFileName(analysisFile.getName())
|
|
.setFilePath(analysisFile.getFileUrl())
|
|
.setFeatureSimilarity(featureSimilarity.floatValue())
|
|
.setOpenRate(openRate.floatValue())
|
|
.setOpenType(analysisFile.getOpenType())
|
|
.setAnalysisType(AnalysisLevelEnum.BLOCK_LEVEL.getCode())
|
|
.setSubMatchOpenFiles(matchOpenFilesRes);
|
|
mongoTemplate.save(matchOpenFileMongo);
|
|
}
|
|
|
|
|
|
/**
|
|
* 计算当前文件的特征相似度 和 开源率
|
|
*
|
|
* @param matchOpenFiles 通过MD5 匹配到的所有开源文件
|
|
* @param sourceFileBaseCoreName 当前文件特征文件的 solr coreName
|
|
* @param matchLineRowsNum 所有开源文件匹配到的开源行号列表
|
|
* @param matchFeatureCodeBlockMd5s 所有开源文件匹配到的特征代码块MD5
|
|
*/
|
|
private List<MatchOpenFile> calculateSimilarityAndOpenRate(SolrDocumentList matchOpenFiles, CodeFile fileAnalysisRes, String sourceFileBaseCoreName, Set<Integer> matchLineRowsNum, Set<String> matchFeatureCodeBlockMd5s) {
|
|
|
|
List<MatchOpenFile> matchOpenFilesRes = new ArrayList<>();
|
|
|
|
//首先根据文件的MD5查询开源文件的版本ID,和路径信息
|
|
Set<String> openSourceFileMd5s = matchOpenFiles.stream().map(doc -> (String) doc.get("sourceMd5")).collect(Collectors.toSet());
|
|
Map<String, SolrDocument> md5VersionInfoMap = solrUtils.batchQueryVersionIdFromSourceFileBaseBySourceMd5(sourceFileBaseCoreName, openSourceFileMd5s);
|
|
|
|
//根据版本ID查询版本的详细信息
|
|
//todo 这里 查询一个版本的信息 需要检索 两个 solr 库 而且还需要检索 versioinTree 后面需要优化
|
|
Set<String> openSourceFileVersionIds = md5VersionInfoMap.values().stream().map(doc -> (String) doc.get("versionId")).collect(Collectors.toSet());
|
|
List<VersionTree> versionTrees = solrUtils.queryBatchVersionInfoByVersionIds(openSourceFileVersionIds);
|
|
Map<String, VersionTree> versionIdVersionInfoMap = versionTrees.stream().collect(Collectors.toMap(VersionTree::getVersionId, java.util.function.Function.identity()));
|
|
|
|
//按照特征行进行分组,一次匹配中,将所有的特征行进行累加
|
|
Map<String, Integer> traitsFeatureMd5AndFeatureLineNumMap = getTraitsFeatureMd5AndFeatureLineNumMap(fileAnalysisRes.getLine_hay());
|
|
|
|
//被测件文本内容
|
|
String sourcefileContent= FileUtil.readUtf8String(analysisFile.getFileUrl());
|
|
|
|
//将文本内容解析成行信息,用于后续文件的开源率计算
|
|
List<String> analysisFileLineInfo = SimilarityUtil.getSplitWords(sourcefileContent);
|
|
|
|
for (SolrDocument matchFile : matchOpenFiles) {
|
|
|
|
//开源文件md5
|
|
String openSourceFileMd5 = matchFile.getFieldValue("sourceMd5").toString();
|
|
|
|
//解析文件的代码块特征值
|
|
List<LineModel> openFileCodeBlockFeatureList = getOpenFileCodeBlockList(matchFile);
|
|
|
|
//匹配的总特征行数
|
|
int currentFileMatchFeatureLineCount = 0;
|
|
|
|
//当前文件所匹配的特征函数MD5
|
|
Set<String> currentFileMatchFeatureMd5 = new HashSet();
|
|
|
|
//遍历当前文件的代码块特征,统计匹配的总行数
|
|
for (String traitLineMd5 : traitsFeatureMd5AndFeatureLineNumMap.keySet()) {
|
|
//村换匹配到的文件的行信息
|
|
for (LineModel matchLine : openFileCodeBlockFeatureList) {
|
|
if (traitLineMd5.equals(matchLine.getTraitLineMd5())) {
|
|
if (!currentFileMatchFeatureMd5.contains(traitLineMd5)) {
|
|
currentFileMatchFeatureMd5.add(traitLineMd5);
|
|
matchFeatureCodeBlockMd5s.add(traitLineMd5);
|
|
currentFileMatchFeatureLineCount += traitsFeatureMd5AndFeatureLineNumMap.get(traitLineMd5);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
//根据源文件的MD5确定需要查询源码库的序号
|
|
String openSourceCodeCoreIndex = openSourceFileMd5.substring(0, 1) + SolrDBConst.CORE_NAME_SUFFIX_SOURCE_FILE_INFO;
|
|
|
|
//获取开源文件的文本信息
|
|
SolrDocument openSourceContent = solrUtils.queryOne(openSourceCodeCoreIndex, "sourceFileMd5:" + openSourceFileMd5, "sourceContent");
|
|
if (openSourceContent == null) {
|
|
log.error("根据开源文件MD5查询源码失败,sourceFileMd5:{}", openSourceFileMd5);
|
|
continue;
|
|
}
|
|
|
|
//当前文件的开源率
|
|
Pair<Float, HashSet<Integer>> openRateAndSaveRowNum = getOpenRateAndSaveRowNum(analysisFileLineInfo, openSourceContent.getFieldValue("sourceContent").toString());
|
|
|
|
//将当前文件匹配的行号,存储到缓存中,方便统计整体的开源率
|
|
matchLineRowsNum.addAll(openRateAndSaveRowNum.getValue());
|
|
|
|
//统计当前文件的特征相似度
|
|
BigDecimal featureSimilarity = new BigDecimal(currentFileMatchFeatureLineCount).divide(new BigDecimal(analysisFile.getCodeRowNum()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2);
|
|
|
|
SolrDocument openEntries = md5VersionInfoMap.get(openSourceFileMd5);
|
|
VersionTree versionInfo = versionIdVersionInfoMap.get(openEntries.get("versionId"));
|
|
if (versionInfo == null) {
|
|
log.error("根据开源文件版本ID查询版本信息失败,versionId:{}", openEntries.get("versionId"));
|
|
continue;
|
|
}
|
|
|
|
String openFilePath = (String) openEntries.get("fullPath");
|
|
//组装当前开源文件的开源项目信息
|
|
MatchOpenFile matchOpenFileInfo = new MatchOpenFile();
|
|
matchOpenFileInfo.setId(IdGenerator.uuid32())
|
|
.setFileName(FileUtil.getName(openFilePath))
|
|
.setPName(versionInfo.getProName())
|
|
.setPId(versionInfo.getProId())
|
|
.setVersion(versionInfo.getVersionName())
|
|
.setVersionId(versionInfo.getVersionId())
|
|
.setSourceFilePath(openFilePath)
|
|
.setSourceUrl(versionInfo.getDownUrl())
|
|
.setLicenseType(versionInfo.getLicenseType())
|
|
.setFeatureSimilarity(featureSimilarity.floatValue())
|
|
.setOpenRate(openRateAndSaveRowNum.getKey())
|
|
.setMd5(openSourceFileMd5);
|
|
matchOpenFilesRes.add(matchOpenFileInfo);
|
|
}
|
|
return matchOpenFilesRes;
|
|
}
|
|
|
|
|
|
/**
|
|
* 获取当前文件的代码块特征值
|
|
*
|
|
* @param openSourceFile
|
|
* @return
|
|
*/
|
|
private List<LineModel> getOpenFileCodeBlockList(SolrDocument openSourceFile) {
|
|
//解析文件的代码块特征值
|
|
String lineFeatureMd5s = (String) openSourceFile.get("line_hay");
|
|
lineFeatureMd5s = lineFeatureMd5s.replace("\\", "")
|
|
.replace("\"{", "{")
|
|
.replace("}\"", "}");
|
|
return JSONArray.parseArray(lineFeatureMd5s, LineModel.class);
|
|
}
|
|
|
|
/**
|
|
* 或者特征代码块的md5 和 当前md5包含的特征行数
|
|
*
|
|
* @param codeBlockInfos
|
|
* @return
|
|
*/
|
|
private Map<String, Integer> getTraitsFeatureMd5AndFeatureLineNumMap(List<LineModel> codeBlockInfos) {
|
|
Map<String, List<LineModel>> traitMd5GroupMap = codeBlockInfos.stream().collect(Collectors.groupingBy(LineModel::getTraitLineMd5));
|
|
Map<String, Integer> resultMap = new HashMap<>();
|
|
for (String traitMd5 : traitMd5GroupMap.keySet()) {
|
|
List<LineModel> lineModels = traitMd5GroupMap.get(traitMd5);
|
|
int traitsLineNum = lineModels.stream().mapToInt(lineModel -> (Integer.valueOf(lineModel.getEndLine()) - Integer.valueOf(lineModel.getStartLine()) + 1)).sum();
|
|
resultMap.put(traitMd5, traitsLineNum);
|
|
}
|
|
return resultMap;
|
|
}
|
|
|
|
/**
|
|
* 将特征值插入到mongo库中
|
|
*
|
|
* @param features 特征集合
|
|
* @param lineDataMongoDto 当前分析任务 ,特征信息存储
|
|
* todo 后期 看看有没有插入的必要
|
|
* @param
|
|
*/
|
|
@Deprecated
|
|
private void insertFeatureValue(List<LineModel> features, LineDataMongoDto lineDataMongoDto) {
|
|
List<LineModel> batchInsertList = new ArrayList<>();
|
|
if (CollectionUtil.isNotEmpty(features)) {
|
|
//这里的批量插入逻辑可以进行校验
|
|
//每10条存一次,解析的数据量如果过大,可能会超过MongoDB数据限制
|
|
int batchInsertStpe = 10;
|
|
int total = 0;
|
|
for (int i = 0; i < features.size(); i++) {
|
|
LineModel lineModel = features.get(i);
|
|
if (total != batchInsertStpe) {
|
|
batchInsertList.add(lineModel);
|
|
total++;
|
|
}
|
|
if (i == features.size() - 1 && total != batchInsertStpe) {
|
|
total = 0;
|
|
lineDataMongoDto.setId(IdGenerator.uuid32())
|
|
.setLineModels(batchInsertList);
|
|
mongoTemplate.insert(lineDataMongoDto);
|
|
}
|
|
if (total == batchInsertStpe) {
|
|
total = 0;
|
|
lineDataMongoDto.setId(IdGenerator.uuid32())
|
|
.setLineModels(batchInsertList);
|
|
mongoTemplate.insert(lineDataMongoDto);
|
|
batchInsertList.clear();
|
|
}
|
|
}
|
|
} else {
|
|
lineDataMongoDto.setId(IdGenerator.uuid32());
|
|
mongoTemplate.insert(lineDataMongoDto);
|
|
}
|
|
}
|
|
|
|
}
|
|
|