1.优化程序分析逻辑,修改文件级别分析逻辑

master
liuzongren 7 months ago
parent 09a0ffa44f
commit e62e8824fa
  1. 41
      src/main/java/com/keyware/composeanalysis/task/CodeBlockAnalysisTask.java
  2. 104
      src/main/java/com/keyware/composeanalysis/task/FileAnalysisTask.java
  3. 42
      src/main/java/com/keyware/composeanalysis/task/FunctionAnalysisTask.java
  4. 43
      src/main/java/com/keyware/composeanalysis/task/LineAnalysisTask.java
  5. 14
      src/main/resources/application.yaml

@ -168,8 +168,6 @@ public class CodeBlockAnalysisTask extends IAnalysisTask {
private void doAnalysis(SolrDocumentList matcheOpenSourceFiles, String sourceFileBaseCoreName, CodeFile fileAnalysisRes) {
if (CollectionUtil.isEmpty(matcheOpenSourceFiles)) {
//因为代码块的特征库较少,这里补充一个对比逻辑,如果当前文件解析失败,或者没有通过代码块匹配到数据,则直接通过文件的md5 再次查询一次solr库
checkByOriginalFileMd5(sourceFileBaseCoreName, analysisFile.getMd5());
return;
}
@ -313,45 +311,6 @@ public class CodeBlockAnalysisTask extends IAnalysisTask {
}
/**
 * Secondary check used when the code-block feature library may be incomplete:
 * looks the file up again in Solr by its original MD5 and, when both the Solr
 * document and its version information are found, stores a file-level match
 * record in MongoDB.
 *
 * @param versionIdCoreName name of the Solr core that is queried by {@code sourceFileMd5}
 * @param originalFileMd5   MD5 of the file under analysis
 */
private void checkByOriginalFileMd5(String versionIdCoreName, String originalFileMd5) {
    // Ask Solr whether a document with this source-file MD5 exists in the given core.
    String md5Query = "sourceFileMd5:" + originalFileMd5;
    SolrDocument matchedDoc = solrUtils.queryOne(versionIdCoreName, md5Query, "versionId,fullPath,sourceFileMd5");
    if (matchedDoc == null) {
        // Nothing matched by MD5, so there is nothing to record.
        return;
    }

    // Resolve the version details from the versionId stored on the matched document.
    String versionId = (String) matchedDoc.get("versionId");
    VersionTree version = solrUtils.queryVersionInfoByVersionId(versionId);
    if (version == null) {
        return;
    }

    // Describe the matching open-source project; similarity and open rate are both set to 100.
    MatchOpenFile projectMatch = new MatchOpenFile();
    projectMatch.setPId(version.getProId())
            .setPName(version.getProName())
            .setSourceUrl(version.getDownUrl())
            .setFeatureSimilarity(100.00f)
            .setOpenRate(100.00f)
            .setAnalyzeType(AnalysisLevelEnum.FILE_LEVEL.getCode());

    // Persist the match for the analysed file through mongoTemplate.
    MatchOpenFileMongoDto matchRecord = new MatchOpenFileMongoDto();
    matchRecord.setId(IdGenerator.uuid32())
            .setFilePath(analysisFile.getFileUrl())
            .setFileName(analysisFile.getName())
            .setOpenRate(100.00f)
            .setOpenType(analysisFile.getOpenType())
            .setMatchOpenFile(Arrays.asList(projectMatch));
    mongoTemplate.save(matchRecord);
}
/**
* 获取当前文件的代码块特征值
*

@ -4,7 +4,6 @@ import cn.hutool.core.lang.Pair;
import com.keyware.common.constant.enums.AnalysisStatusEnum;
import com.keyware.composeanalysis.constant.FixedValue;
import com.keyware.composeanalysis.constant.RedisConst;
import com.keyware.composeanalysis.constant.SolrDBConst;
import com.keyware.composeanalysis.constant.enums.AnalysisLevelEnum;
import com.keyware.composeanalysis.constant.enums.FileAnalysisStatusEnum;
import com.keyware.composeanalysis.entity.AnalysisTask;
@ -18,8 +17,6 @@ import com.keyware.keyswan.anaysis.AnalysisFactory;
import com.keyware.keyswan.common.CodeFile;
import com.keyware.utils.IdGenerator;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.springframework.data.mongodb.core.MongoTemplate;
@ -81,42 +78,31 @@ public class FileAnalysisTask extends IAnalysisTask {
//获取当前文件名称
String fileName = analysisFile.getName();
AnalysisLogUtil.insert(mongoTemplate, "【文件级分析】正在分析" + fileName);
try {
//只有主流语言的才能解析
//非32种主流语言的不能提取文件特征,在文件级MD5匹配的时候,已经做过匹配
if (StringUtils.isNotEmpty(analysisFile.getSuffix()) && FixedValue.SUFFIX_SOLR_VERSION.containsKey(analysisFile.getSuffix())) {
//根据文件后缀 查询 *_CutFileInfo库名称
String featureCoreName = FixedValue.SUFFIX_SOLR_FILE.get(analysisFile.getSuffix());
//根据文件名称,获取文件解析器
Analysis analysis = AnalysisFactory.getAnalysis(fileName);
//如果 analysis 返回值为null 说明还未支持这种语言的特征提取 可以直接通过文件的MD5值去solr库中匹配
if (analysis != null) {
//如果文件大小超过3M,则不进行文件级行级特征提取
Integer fileSize = analysisFile.getFileSize();
if (fileSize < (3 * 1024 * 1024)) {
CodeFile codeFile = analysis.analysisFile(analysisFile.getFileUrl(), "1", "0");
//根据文件的特征值,去相应文件文件后缀的特征库中进行查询
if (codeFile != null) {
String querySb = "sourceMd5:" + codeFile.getSourceMd5() + " OR cutFileMd5:" + codeFile.getCutFileMd5() + " OR traitFileMd5:" + codeFile.getTraitFileMd5();
SolrDocumentList openSourceFileList = solrUtils.query(featureCoreName, querySb, "sourceMd5");
//如果当前文件在源码库中,匹配到了数据,则统计当前文件的开源率
if (CollectionUtils.isNotEmpty(openSourceFileList)) {
ananlyzeFileOpenRate(openSourceFileList,codeFile);
}
}
}
}
}
//根据文件后缀 查询 *_CutFileInfo库名称
String featureCoreName = FixedValue.SUFFIX_SOLR_FILE.get(analysisFile.getSuffix());
//根据文件名称,获取文件解析器
Analysis analysis = AnalysisFactory.getAnalysis(fileName);
//如果文件大小超过3M,则不进行文件级行级特征提取
CodeFile codeFile = analysis.analysisFile(analysisFile.getFileUrl(), "1", "0");
//根据文件的特征值,去相应文件文件后缀的特征库中进行查询
SolrDocumentList openSourceFileList = getFeatureSimilarityFromSolr(featureCoreName, codeFile);
//则统计当前文件的开源率
ananlyzeFileOpenRate(openSourceFileList, codeFile);
//更新文件级分析结果
analysisFile.setFileAnalysisStatus(FileAnalysisStatusEnum.ANALYSIS_DONE.getCode());
mongoTemplate.update(FileDataMongoDto.class)
.matching(where("_id").is(analysisFile.getId()))
.replaceWith(analysisFile)
.findAndReplace();
AnalysisLogUtil.insert(mongoTemplate, "【文件级分析】成功" + fileName);
} catch (Exception e) {
AnalysisLogUtil.insertErrorInfo(mongoTemplate, "【文件级】提取失败" + fileName, e);
log.error("文件:" + fileName + "文件级别特征提取失败!", e);
AnalysisLogUtil.insertErrorInfo(mongoTemplate, "【文件级】分析失败" + fileName, e);
log.error("文件:" + fileName + "文件级别分析失败!", e);
//将当前文件的分析状态变更为失败
analysisFile.setFileAnalysisStatus(FileAnalysisStatusEnum.FAILED_ANALYSIS.getCode());
//更新文件级分析结果
@ -130,40 +116,39 @@ public class FileAnalysisTask extends IAnalysisTask {
}
/**
 * Queries the given Solr feature core for documents whose {@code sourceMd5},
 * {@code cutFileMd5} or {@code traitFileMd5} field matches the corresponding
 * value of the parsed source file.
 *
 * @param featureCoreName name of the Solr core to search
 * @param codeFile        parse result of the source file; may be {@code null}
 * @return the documents returned by {@code solrUtils.query}, or a new empty
 *         {@link SolrDocumentList} when {@code codeFile} is {@code null}
 */
private SolrDocumentList getFeatureSimilarityFromSolr(String featureCoreName, CodeFile codeFile) {
    if (codeFile != null) {
        // Build one clause per MD5 field and OR them together.
        String sourceClause = "sourceMd5:" + codeFile.getSourceMd5();
        String cutClause = "cutFileMd5:" + codeFile.getCutFileMd5();
        String traitClause = "traitFileMd5:" + codeFile.getTraitFileMd5();
        String featureQuery = String.join(" OR ", sourceClause, cutClause, traitClause);
        // Only the sourceMd5 field is requested back from Solr.
        return solrUtils.query(featureCoreName, featureQuery, "sourceMd5");
    }
    // Without a parse result there are no feature values to search with.
    log.error("特征为空,无法查询:{}", analysisFile.getName());
    return new SolrDocumentList();
}
/**
* 分析文件的开源率
*
* @param openSourceFileList 匹配的开源文件信息
* @throws IOException
*/
private void ananlyzeFileOpenRate(SolrDocumentList openSourceFileList,CodeFile fileAnalysisRes) throws IOException {
//根据匹配的开源文件的md5 获取版本ID
Set<String> sourceFileMd5 = openSourceFileList.stream().map(solrDocument -> (String) solrDocument.get("sourceMd5")).collect(Collectors.toSet());
String sourceCoreName = FixedValue.SUFFIX_SOLR_VERSION.get(analysisFile.getSuffix());
Map<String, SolrDocument> md5VersionObjMap = solrUtils.batchQueryVersionIdFromSourceFileBaseBySourceMd5(sourceCoreName, sourceFileMd5);
//根据版本ID获取版本信息
Set<String> versionIds = md5VersionObjMap.values().stream().map(solrDocument -> (String) solrDocument.get("versionId")).collect(Collectors.toSet());
List<VersionTree> treeInfoList = solrUtils.queryBatchVersionInfoByVersionIds(versionIds);
Map<String, VersionTree> versionIdMap = treeInfoList.stream().collect(Collectors.toMap(VersionTree::getVersionId, Function.identity()));
//获取被测件文本内容
String fileContent = new String(Files.readAllBytes(Paths.get(analysisFile.getFileUrl())), "utf-8").replaceAll(" ", "");
//将被测件的文本内容拆分成行信息,用于匹配开源信息
List<String> fileLines = SimilarityUtil.getSplitWords(fileContent);
private void ananlyzeFileOpenRate(SolrDocumentList openSourceFileList, CodeFile fileAnalysisRes) {
HashSet<Integer> openLineNum = new HashSet<>();
List<MatchOpenFile> matchOpenFilesRes = calculateOpenRate(openSourceFileList,fileAnalysisRes,sourceCoreName,openLineNum);
//统计被测件的总体开源率
//计算每个文件的开源率和特征相似度
List<MatchOpenFile> matchOpenFilesRes = calculateOpenRate(openSourceFileList, fileAnalysisRes, openLineNum);
//获取开源率阈值,判断当前文件是否开源
Integer openRateThreshold = analysisTask.getOpenRateThreshold();
BigDecimal openRate = new BigDecimal(openLineNum.size()).divide(new BigDecimal(fileLines.size()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2);
BigDecimal openRate = new BigDecimal(openLineNum.size()).divide(fileAnalysisRes.getCodeRowNum(), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2);
//超过阈值,则认为当前文件是开源文件
if (openRate.compareTo(new BigDecimal(openRateThreshold)) > 0) {
@ -192,20 +177,21 @@ public class FileAnalysisTask extends IAnalysisTask {
}
/**
* 计算当前文件的特征相似度 开源率
*
* @param matchOpenFiles 通过MD5 匹配到的所有开源文件
* @param sourceFileBaseCoreName 当前文件特征文件的 solr coreName
* @param matchLineRowsNum 所有开源文件匹配到的开源行号列表
* @param matchOpenFiles 通过MD5 匹配到的所有开源文件
* @param matchLineRowsNum 所有开源文件匹配到的开源行号列表
* @return 匹配的开源文件解析后的结果集
*/
private List<MatchOpenFile> calculateOpenRate(SolrDocumentList matchOpenFiles, CodeFile fileAnalysisRes, String sourceFileBaseCoreName, Set<Integer> matchLineRowsNum) {
private List<MatchOpenFile> calculateOpenRate(SolrDocumentList matchOpenFiles, CodeFile fileAnalysisRes, Set<Integer> matchLineRowsNum) {
//匹配的开源文件列表
List<MatchOpenFile> matchOpenFilesRes = new ArrayList<>();
//根据匹配的开源文件的md5 获取版本ID
String sourceFileBaseCoreName = FixedValue.SUFFIX_SOLR_VERSION.get(analysisFile.getSuffix());
//首先根据文件的MD5查询开源文件的版本ID,和路径信息
Set<String> openSourceFileMd5s = matchOpenFiles.stream().map(doc -> (String) doc.get("sourceMd5")).collect(Collectors.toSet());
Map<String, SolrDocument> md5VersionInfoMap = solrUtils.batchQueryVersionIdFromSourceFileBaseBySourceMd5(sourceFileBaseCoreName, openSourceFileMd5s);
@ -230,7 +216,7 @@ public class FileAnalysisTask extends IAnalysisTask {
SolrDocument openEntries = md5VersionInfoMap.get(openSourceFileMd5);
VersionTree versionInfo = versionIdVersionInfoMap.get(openEntries.get("versionId"));
if (versionInfo == null){
if (versionInfo == null) {
log.error("找不到开源文件版本信息,versionId:{}", openEntries.get("versionId"));
}
@ -250,6 +236,4 @@ public class FileAnalysisTask extends IAnalysisTask {
}
}

@ -145,8 +145,6 @@ public class FunctionAnalysisTask extends IAnalysisTask {
private void doAnalysis(SolrDocumentList matchOpenFiles, String sourceFileBaseCoreName, CodeFile fileAnalysisRes) {
if (CollectionUtil.isEmpty(matchOpenFiles)) {
//因为函数的特征库较少,这里补充一个对比逻辑,如果当前文件解析失败,或者没有通过函数匹配到数据,则直接通过文件的md5 再次查询一次solr库
checkByOriginalFileMd5(sourceFileBaseCoreName, analysisFile.getMd5());
return;
}
@ -290,46 +288,6 @@ public class FunctionAnalysisTask extends IAnalysisTask {
return matchOpenFilesRes;
}
/**
 * Secondary check used when the function feature library may be incomplete:
 * looks the file up again in Solr by its original MD5 and, when both the Solr
 * document and its version information are found, stores a file-level match
 * record in MongoDB.
 *
 * @param versionIdCoreName name of the Solr core that is queried by {@code sourceFileMd5}
 * @param originalFileMd5   MD5 of the file under analysis
 */
private void checkByOriginalFileMd5(String versionIdCoreName, String originalFileMd5) {
    // Look for a document in the given core whose sourceFileMd5 equals the file's MD5.
    String md5Query = "sourceFileMd5:" + originalFileMd5;
    SolrDocument hit = solrUtils.queryOne(versionIdCoreName, md5Query, "versionId,fullPath,sourceFileMd5");
    if (hit == null) {
        // No MD5 match: leave without writing anything.
        return;
    }

    // Fetch the version details referenced by the matched document.
    String hitVersionId = (String) hit.get("versionId");
    VersionTree versionTree = solrUtils.queryVersionInfoByVersionId(hitVersionId);
    if (versionTree == null) {
        return;
    }

    // Open-source project information for the match; similarity and open rate are fixed at 100.
    MatchOpenFile openFile = new MatchOpenFile();
    openFile.setPId(versionTree.getProId())
            .setPName(versionTree.getProName())
            .setSourceUrl(versionTree.getDownUrl())
            .setFeatureSimilarity(100.00f)
            .setOpenRate(100.00f)
            .setAnalyzeType(AnalysisLevelEnum.FILE_LEVEL.getCode());

    // Save the match result for the analysed file through mongoTemplate.
    MatchOpenFileMongoDto mongoDto = new MatchOpenFileMongoDto();
    mongoDto.setId(IdGenerator.uuid32())
            .setFilePath(analysisFile.getFileUrl())
            .setFileName(analysisFile.getName())
            .setOpenRate(100.00f)
            .setOpenType(analysisFile.getOpenType())
            .setMatchOpenFile(Arrays.asList(openFile));
    mongoTemplate.save(mongoDto);
}
/**
* 根据 特征值 从特征库中检索 具有特征相似的
*

@ -138,8 +138,6 @@ public class LineAnalysisTask extends IAnalysisTask {
String versionIdCoreName = FixedValue.SUFFIX_SOLR_VERSION.get(analysisFile.getSuffix());
if (CollectionUtil.isEmpty(matcheOpenSourceFiles)) {
//因为行的特征库较少,这里补充一个对比逻辑,如果当前文件解析失败,或者没有通过代码块匹配到数据,则直接通过文件的md5 再次查询一次solr库
checkByOriginalFileMd5(versionIdCoreName, analysisFile.getMd5());
return;
}
@ -271,47 +269,6 @@ public class LineAnalysisTask extends IAnalysisTask {
return matchOpenFilesRes;
}
/**
 * Secondary check used when the feature library may be incomplete: looks the
 * file up again in Solr by its original MD5 and, when both the Solr document
 * and its version information are found, stores a file-level match record in
 * MongoDB.
 *
 * @param versionIdCoreName name of the Solr core that is queried by {@code sourceFileMd5}
 * @param originalFileMd5   MD5 of the file under analysis
 */
private void checkByOriginalFileMd5(String versionIdCoreName, String originalFileMd5) {
    // Query the given core for a document carrying this source-file MD5.
    String md5Query = "sourceFileMd5:" + originalFileMd5;
    SolrDocument solrHit = solrUtils.queryOne(versionIdCoreName, md5Query, "versionId,fullPath,sourceFileMd5");
    if (solrHit == null) {
        // The MD5 is not present in the core; nothing is saved.
        return;
    }

    // Load the version details using the versionId from the matched document.
    String matchedVersionId = (String) solrHit.get("versionId");
    VersionTree matchedVersion = solrUtils.queryVersionInfoByVersionId(matchedVersionId);
    if (matchedVersion == null) {
        return;
    }

    // Build the project-level match entry with similarity and open rate both at 100.
    MatchOpenFile matchEntry = new MatchOpenFile();
    matchEntry.setPId(matchedVersion.getProId())
            .setPName(matchedVersion.getProName())
            .setSourceUrl(matchedVersion.getDownUrl())
            .setFeatureSimilarity(100.00f)
            .setOpenRate(100.00f)
            .setAnalyzeType(AnalysisLevelEnum.FILE_LEVEL.getCode());

    // Write the match result for the analysed file through mongoTemplate.
    MatchOpenFileMongoDto resultDoc = new MatchOpenFileMongoDto();
    resultDoc.setId(IdGenerator.uuid32())
            .setFilePath(analysisFile.getFileUrl())
            .setFileName(analysisFile.getName())
            .setOpenRate(100.00f)
            .setOpenType(analysisFile.getOpenType())
            .setMatchOpenFile(Arrays.asList(matchEntry));
    mongoTemplate.save(resultDoc);
}
/**
* 将特征值插入到mongo库中
* @param features 特征集合

@ -5,14 +5,18 @@ spring:
application:
name: compose-analysis-service
cloud:
inetutils:
preferred-networks:
#优先使用下列网段的IP进行网络通信
- 172.16
nacos:
discovery:
server-addr: 172.16.36.100:8848
namespace: 7f9bb282-8ee3-4948-8182-24b7dcadcd5a
server-addr: 172.16.36.7:8848
namespace: 2fad0ca9-bc32-4afd-9f2e-ebc133d5e781
config:
server-addr: 172.16.36.100:8848
namespace: 7f9bb282-8ee3-4948-8182-24b7dcadcd5a
group: dev_group
server-addr: 172.16.36.7:8848
namespace: 2fad0ca9-bc32-4afd-9f2e-ebc133d5e781
file-extension: yaml
config:
import: nacos:compose-analysis-dev.yaml

Loading…
Cancel
Save