forked from liuzongren/compose-analysis
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
299 lines
13 KiB
299 lines
13 KiB
7 months ago
|
package com.keyware.composeanalysis.task;
|
||
|
|
||
|
|
||
|
import cn.hutool.core.collection.CollectionUtil;
|
||
|
import com.keyware.composeanalysis.constant.FixedValue;
|
||
|
import com.keyware.composeanalysis.constant.FunctionAndAnalysisAssemblyConst;
|
||
|
import com.keyware.composeanalysis.constant.RedisConst;
|
||
|
import com.keyware.composeanalysis.constant.SolrDBConst;
|
||
|
import com.keyware.composeanalysis.constant.enums.AnalysisLevelEnum;
|
||
|
import com.keyware.composeanalysis.constant.enums.AnalysisStatusEnum;
|
||
|
import com.keyware.composeanalysis.constant.enums.FileAnalysisStatusEnum;
|
||
|
import com.keyware.composeanalysis.entity.AnalysisTask;
|
||
|
import com.keyware.composeanalysis.mongo.FileDataMongoDto;
|
||
|
import com.keyware.composeanalysis.mongo.LineDataMongoDto;
|
||
|
import com.keyware.composeanalysis.mongo.MatchOpenFile;
|
||
|
import com.keyware.composeanalysis.mongo.MatchOpenFileMongoDto;
|
||
|
import com.keyware.composeanalysis.solr.VersionTree;
|
||
|
import com.keyware.composeanalysis.util.AnalysisLogUtil;
|
||
|
import com.keyware.composeanalysis.util.RedisUtil;
|
||
|
import com.keyware.composeanalysis.util.SolrUtils;
|
||
|
import com.keyware.composeanalysis.util.SpringContextUtils;
|
||
|
import com.keyware.keyswan.anaysis.Analysis;
|
||
|
import com.keyware.keyswan.anaysis.AnalysisFactory;
|
||
|
import com.keyware.keyswan.common.CodeFile;
|
||
|
import com.keyware.utils.IdGenerator;
|
||
|
import lombok.extern.log4j.Log4j2;
|
||
|
import org.apache.commons.lang3.StringUtils;
|
||
|
import org.apache.solr.common.SolrDocument;
|
||
|
import org.apache.solr.common.SolrDocumentList;
|
||
|
import org.springframework.data.mongodb.core.MongoTemplate;
|
||
|
import org.springframework.data.mongodb.core.query.Update;
|
||
|
|
||
|
import java.math.BigDecimal;
|
||
|
import java.math.RoundingMode;
|
||
|
import java.util.*;
|
||
|
import java.util.concurrent.CountDownLatch;
|
||
|
|
||
|
import static org.springframework.data.mongodb.core.query.Criteria.where;
|
||
|
|
||
|
/**
|
||
|
* @author liuzongren
|
||
|
* @ClassName LineAnalysisTask
|
||
|
* @description: 行级别 特征提取定时任务
|
||
|
* @datetime 2024年 07月 25日 16:19
|
||
|
* @version: 1.0
|
||
|
*/
|
||
|
|
||
|
@Log4j2
|
||
|
public class LineAnalysisTask extends IAnalysisTask {
|
||
|
|
||
|
private MongoTemplate mongoTemplate;
|
||
|
private AnalysisTask analysisTask;
|
||
|
//被测件的文件信息
|
||
|
private FileDataMongoDto analysisFile;
|
||
|
|
||
|
private SolrUtils solrUtils;
|
||
|
|
||
|
private RedisUtil redisUtil;
|
||
|
|
||
|
private CountDownLatch countDownLatch;
|
||
|
|
||
|
public LineAnalysisTask(AnalysisTask analysisTask, FileDataMongoDto analysisFile, MongoTemplate mongoTemplate, CountDownLatch countDownLatch) {
|
||
|
this.mongoTemplate = mongoTemplate;
|
||
|
this.analysisTask = analysisTask;
|
||
|
this.analysisFile = analysisFile;
|
||
|
this.countDownLatch = countDownLatch;
|
||
|
this.solrUtils = SpringContextUtils.getBean(SolrUtils.class);
|
||
|
this.redisUtil = SpringContextUtils.getBean(RedisUtil.class);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* 行级别 源代码溯源
|
||
|
* 当前任务 需要在 文件级分析完成后 进行
|
||
|
*/
|
||
|
|
||
|
@Override
|
||
|
public void run() {
|
||
|
//执行任务前,判断一下任务执行的状态
|
||
|
Object status = redisUtil.get(String.format(RedisConst.TASK_RUNNING_STATUS_KEY_PREFIX, analysisTask.getId()));
|
||
|
if (status != null && (status.equals(AnalysisStatusEnum.STOP_ANALYSIS.getCode()) || status.equals(AnalysisStatusEnum.PAUSE_ANALYSIS.getCode()))) {
|
||
|
log.info("任务已取消,fileName:{}", analysisFile.getName());
|
||
|
countDownLatch.countDown();
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
//获取文件地址
|
||
|
String filePath = analysisFile.getFileUrl();
|
||
|
//获取文件名称
|
||
|
String fileName = analysisFile.getName();
|
||
|
|
||
|
AnalysisLogUtil.insert(mongoTemplate, "【行级特征提取】正在提取" + fileName);
|
||
|
try {
|
||
|
LineDataMongoDto lineDataMongoDto = new LineDataMongoDto();
|
||
|
lineDataMongoDto.setFileId(analysisFile.getId())
|
||
|
.setStatus(0)
|
||
|
.setIsSelect(false);
|
||
|
Analysis analysis = AnalysisFactory.getAnalysis(filePath);
|
||
|
CodeFile codeFile = null;
|
||
|
|
||
|
//获取文件行级特征md5
|
||
|
codeFile = analysis.analysisFile(filePath, FunctionAndAnalysisAssemblyConst.LINE_EXTRACT, FunctionAndAnalysisAssemblyConst.LINE_EXTRACT);
|
||
|
//每一行原内容MD5值集合
|
||
|
// String cutFileLineMd5 = codeFile.getCutFileLineMd5();
|
||
|
//每一行特征内容MD5值集合
|
||
|
String traitFileLineMd5 = codeFile.getTraitFileLineMd5();
|
||
|
|
||
|
String[] featureMd5Arr = {};
|
||
|
if (StringUtils.isNotBlank(traitFileLineMd5)) {
|
||
|
featureMd5Arr = traitFileLineMd5.split(",");
|
||
|
}
|
||
|
List<String> lineFeatures = Arrays.asList(featureMd5Arr);
|
||
|
|
||
|
//从solr中获取特征相似的 文件
|
||
|
SolrDocumentList featureSimilarityFromSolr = getFeatureSimilarityFromSolr(lineFeatures);
|
||
|
|
||
|
//计算文件的开源率
|
||
|
calculateOpenRate(featureSimilarityFromSolr, lineFeatures);
|
||
|
|
||
|
//更新文件表的分析状态为3 行级特征以分析完毕
|
||
|
analysisFile.setFileAnalysisStatus(FileAnalysisStatusEnum.ANALYSIS_DONE.getCode());
|
||
|
mongoTemplate.update(FileDataMongoDto.class)
|
||
|
.matching(where("_id").is(analysisFile.getId()))
|
||
|
.replaceWith(analysisFile)
|
||
|
.findAndReplace();
|
||
|
|
||
|
AnalysisLogUtil.insert(mongoTemplate, "【行级特征提取】提取完成" + fileName);
|
||
|
log.info("文件" + fileName + ":行级分析完成");
|
||
|
} catch (Exception e) {
|
||
|
AnalysisLogUtil.insertErrorInfo(mongoTemplate, "【行级特征提取】提取失败" + fileName, e);
|
||
|
log.error("文件:" + fileName + "行级别特征提取失败!", e);
|
||
|
//修改当前文件分析状态未失败
|
||
|
mongoTemplate.update(FileDataMongoDto.class)
|
||
|
.matching(where("_id").is(analysisFile.getId()))
|
||
|
.apply(new Update().set("fileAnalysisStatus", FileAnalysisStatusEnum.FAILED_ANALYSIS.getCode()))
|
||
|
.first();
|
||
|
} finally {
|
||
|
countDownLatch.countDown();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* 计算开源率 被测件的开源率
|
||
|
*
|
||
|
* @param matcheOpenSourceFiles
|
||
|
* @param lineFeatures
|
||
|
*/
|
||
|
private void calculateOpenRate(SolrDocumentList matcheOpenSourceFiles, List<String> lineFeatures) {
|
||
|
|
||
|
if (CollectionUtil.isEmpty(matcheOpenSourceFiles)) {
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
//根据文件后缀判断需要查询的文件版本库名称
|
||
|
String versionIdCoreName = FixedValue.SUFFIX_SOLR_VERSION.get(analysisFile.getSuffix());
|
||
|
|
||
|
|
||
|
//定义结果集对象
|
||
|
MatchOpenFileMongoDto matchOpenFileMongo = new MatchOpenFileMongoDto();
|
||
|
matchOpenFileMongo.setId(IdGenerator.uuid32())
|
||
|
.setFilePath(analysisFile.getFileUrl())
|
||
|
.setFileName(analysisFile.getName());
|
||
|
|
||
|
//开源文件信息保存结果集
|
||
|
List<MatchOpenFile> matchOpenFileInfoList = new ArrayList<>();
|
||
|
|
||
|
//保存所有匹配的行数信息,方便统计总的匹配行数
|
||
|
Set<String> matchingLineSet = new HashSet<>();
|
||
|
|
||
|
//获取文件总行数
|
||
|
BigDecimal totalCodeRowNum = new BigDecimal(analysisFile.getCodeRowNum());
|
||
|
|
||
|
//统计每个开源文件和被测件的匹配行数
|
||
|
for (SolrDocument matchFile : matcheOpenSourceFiles) {
|
||
|
//解析文件的代码块特征值
|
||
|
String lineFeatureMd5s = (String) matchFile.get("tz_line_hay");
|
||
|
List<String> matchedLineFeatures = Arrays.asList(lineFeatureMd5s.split(","));
|
||
|
|
||
|
//匹配的总行数
|
||
|
int currentFileMatchLineCount = 0;
|
||
|
|
||
|
//遍历当前文件的代码块特征,统计匹配的总行数
|
||
|
for (String originalLineFeatureMd5 : lineFeatures) {
|
||
|
for (String matchLineFeatureMd5 : matchedLineFeatures) {
|
||
|
if (originalLineFeatureMd5.equals(matchLineFeatureMd5)) {
|
||
|
currentFileMatchLineCount++;
|
||
|
matchingLineSet.add(originalLineFeatureMd5);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
//首先根据文件的MD5查询开源文件的版本ID,和路径信息
|
||
|
SolrDocument versionIdAndPath = solrUtils.queryOne(versionIdCoreName, "sourceFileMd5:" + matchFile.get("sourceMd5"), "versionId,fullPath,sourceFileMd5");
|
||
|
|
||
|
//根据版本ID查询版本的详细信息
|
||
|
//todo 这里 查询一个版本的信息 需要检索 两个 solr 库 而且还需要检索 versioinTree 后面需要优化
|
||
|
VersionTree versionInfo = solrUtils.queryVersionInfoByVersionId((String) versionIdAndPath.get("versionId"));
|
||
|
|
||
|
//计算与当前开源文件的开源率
|
||
|
BigDecimal openRate = new BigDecimal(currentFileMatchLineCount).divide(totalCodeRowNum, 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100));
|
||
|
|
||
|
//当前开源文件的开源项目信息
|
||
|
MatchOpenFile matchOpenFileInfo = new MatchOpenFile();
|
||
|
matchOpenFileInfo.setPId(versionInfo.getProId())
|
||
|
.setPName(versionInfo.getProName())
|
||
|
.setSourceUrl(versionInfo.getDownUrl())
|
||
|
.setOpenRate(openRate.floatValue())
|
||
|
.setVersion(versionInfo.getVersionName())
|
||
|
.setLicenseType(versionInfo.getLicenseType())
|
||
|
.setAnalyzeType(AnalysisLevelEnum.LINE_LEVEL.getCode());
|
||
|
matchOpenFileInfoList.add(matchOpenFileInfo);
|
||
|
}
|
||
|
|
||
|
//统计当前文件的整体开源率
|
||
|
BigDecimal openRate = new BigDecimal(matchingLineSet.size()).divide(totalCodeRowNum, 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100));
|
||
|
|
||
|
//获取开源率的阈值
|
||
|
Integer openRateThreshold = analysisTask.getOpenRateThreshold();
|
||
|
|
||
|
|
||
|
//如果开源率大于阈值,则将当前文件设置成开源
|
||
|
if (openRate.compareTo(new BigDecimal(openRateThreshold)) >= 0) {
|
||
|
analysisFile.setOpenType(true);
|
||
|
}
|
||
|
|
||
|
//保存当前文件的开源信息
|
||
|
matchOpenFileMongo.setOpenType(analysisFile.getOpenType())
|
||
|
.setMatchOpenFile(matchOpenFileInfoList);
|
||
|
mongoTemplate.save(matchOpenFileMongo);
|
||
|
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* 将特征值插入到mongo库中
|
||
|
*
|
||
|
* @param features 特征集合
|
||
|
* @param lineDataMongoDto 当前分析任务 ,特征信息存储
|
||
|
* todo 后期 看看有没有插入的必要
|
||
|
* @param
|
||
|
*/
|
||
|
@Deprecated
|
||
|
private void insertFeatureValue(String features, LineDataMongoDto lineDataMongoDto) {
|
||
|
String[] featureMd5Arr = {};
|
||
|
if (StringUtils.isNotBlank(features)) {
|
||
|
featureMd5Arr = features.split(",");
|
||
|
}
|
||
|
List<String> lineFeatures = Arrays.asList(featureMd5Arr);
|
||
|
List<String> batchInsertList = new ArrayList<>();
|
||
|
if (CollectionUtil.isNotEmpty(lineFeatures)) {
|
||
|
//这里的批量插入逻辑可以进行校验
|
||
|
//每10条存一次,解析的数据量如果过大,可能会超过MongoDB数据限制
|
||
|
int batchInsertStpe = 5000;
|
||
|
int total = 0;
|
||
|
for (int i = 0; i < lineFeatures.size(); i++) {
|
||
|
if (total != batchInsertStpe) {
|
||
|
batchInsertList.add(lineFeatures.get(i));
|
||
|
total++;
|
||
|
}
|
||
|
if (i == lineFeatures.size() - 1 && total != batchInsertStpe) {
|
||
|
total = 0;
|
||
|
lineDataMongoDto.setId(IdGenerator.uuid32())
|
||
|
.setLineFeatueMd5s(batchInsertList);
|
||
|
mongoTemplate.insert(lineDataMongoDto);
|
||
|
}
|
||
|
if (total == batchInsertStpe) {
|
||
|
total = 0;
|
||
|
lineDataMongoDto.setId(IdGenerator.uuid32())
|
||
|
.setLineFeatueMd5s(batchInsertList);
|
||
|
mongoTemplate.insert(lineDataMongoDto);
|
||
|
batchInsertList.clear();
|
||
|
}
|
||
|
}
|
||
|
} else {
|
||
|
lineDataMongoDto.setId(IdGenerator.uuid32());
|
||
|
mongoTemplate.insert(lineDataMongoDto);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* 根据 特征值 从特征库中检索 具有特征相似的
|
||
|
*
|
||
|
* @param lineFeatureList 行特征信息
|
||
|
* @return
|
||
|
*/
|
||
|
private SolrDocumentList getFeatureSimilarityFromSolr(List<String> lineFeatureList) {
|
||
|
String solrCoreName = SolrDBConst.CORE_NAME_SOURCE_FILE_INFO_TEMP;
|
||
|
//拼接行特征查询条件
|
||
|
String queryStr = "tz_line_hay:(" + StringUtils.join(lineFeatureList, " OR ") + ")";
|
||
|
log.info("查询条件: solrCoreName:{},queryStr:{}", solrCoreName, queryStr);
|
||
|
SolrDocumentList result = solrUtils.query(solrCoreName, queryStr, "sourceMd5,tz_line_hay");
|
||
|
log.info("查询结果: result:{}", result);
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
|
||
|
}
|