1.优化分析逻辑,处理分析过程中空指针的问题

without_nacos
liuzongren 5 months ago
parent bb737f8613
commit f86530cee1
  1. 1
      src/main/java/com/keyware/composeanalysis/service/impl/AnalysisTaskServiceImpl.java
  2. 5
      src/main/java/com/keyware/composeanalysis/task/FileAnalysisTask.java
  3. 4
      src/main/java/com/keyware/composeanalysis/task/LineAnalysisTask.java
  4. 68
      src/main/java/com/keyware/composeanalysis/task/PorjectAnalysisTask.java
  5. 1
      src/main/java/com/keyware/composeanalysis/util/SolrUtils.java

@@ -166,6 +166,7 @@ public class AnalysisTaskServiceImpl extends ServiceImpl<AnalyzeTaskMapper, Anal
//引入解压缩有可能会很慢,这里添加重试机制,最多重试6次,60s //引入解压缩有可能会很慢,这里添加重试机制,最多重试6次,60s
//todo 主程序 后续可以添加压缩标志位
private boolean retryGetDecompressionFlag(AnalysisTask analysisTask) { private boolean retryGetDecompressionFlag(AnalysisTask analysisTask) {
int retryCount = 0; int retryCount = 0;
while (retryCount < 60) { while (retryCount < 60) {

@@ -4,6 +4,7 @@ import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.io.FileUtil; import cn.hutool.core.io.FileUtil;
import cn.hutool.core.lang.Pair; import cn.hutool.core.lang.Pair;
import com.keyware.common.constant.enums.AnalysisStatusEnum; import com.keyware.common.constant.enums.AnalysisStatusEnum;
import com.keyware.common.exception.BusinessException;
import com.keyware.composeanalysis.constant.FixedValue; import com.keyware.composeanalysis.constant.FixedValue;
import com.keyware.composeanalysis.constant.RedisConst; import com.keyware.composeanalysis.constant.RedisConst;
import com.keyware.composeanalysis.constant.enums.AnalysisLevelEnum; import com.keyware.composeanalysis.constant.enums.AnalysisLevelEnum;
@@ -82,6 +83,9 @@ public class FileAnalysisTask extends IAnalysisTask {
String featureCoreName = FixedValue.SUFFIX_SOLR_FILE.get(analysisFile.getSuffix()); String featureCoreName = FixedValue.SUFFIX_SOLR_FILE.get(analysisFile.getSuffix());
//根据文件名称,获取文件解析器 //根据文件名称,获取文件解析器
Analysis analysis = AnalysisFactory.getAnalysis(fileName); Analysis analysis = AnalysisFactory.getAnalysis(fileName);
if (analysis == null){
throw new BusinessException("获取文件解析器失败,文件名称:" + fileName);
}
//如果文件大小超过3M,则不进行文件级行级特征提取 //如果文件大小超过3M,则不进行文件级行级特征提取
CodeFile codeFile = analysis.analysisFile(analysisFile.getFileUrl(), "1", "0"); CodeFile codeFile = analysis.analysisFile(analysisFile.getFileUrl(), "1", "0");
@@ -97,7 +101,6 @@ public class FileAnalysisTask extends IAnalysisTask {
.matching(where("_id").is(analysisFile.getId())) .matching(where("_id").is(analysisFile.getId()))
.replaceWith(analysisFile) .replaceWith(analysisFile)
.findAndReplace(); .findAndReplace();
AnalysisLogUtil.insert(mongoTemplate, "【文件级分析】成功" + fileName); AnalysisLogUtil.insert(mongoTemplate, "【文件级分析】成功" + fileName);
} catch (Exception e) { } catch (Exception e) {
AnalysisLogUtil.insertErrorInfo(mongoTemplate, "【文件级】分析失败" + fileName, e); AnalysisLogUtil.insertErrorInfo(mongoTemplate, "【文件级】分析失败" + fileName, e);

@@ -258,6 +258,10 @@ public class LineAnalysisTask extends IAnalysisTask {
BigDecimal featureSimilarity = new BigDecimal(currentFileMatchFeatureLineCount).divide(new BigDecimal(lineFeatureList.size()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2); BigDecimal featureSimilarity = new BigDecimal(currentFileMatchFeatureLineCount).divide(new BigDecimal(lineFeatureList.size()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2);
SolrDocument openEntries = md5VersionInfoMap.get(openSourceFileMd5); SolrDocument openEntries = md5VersionInfoMap.get(openSourceFileMd5);
if (openEntries == null){
log.error("根据开源文件MD5,未查询到相关的开源文件版本信息,md5:{}", openSourceFileMd5);
continue;
}
VersionTree versionInfo = versionIdVersionInfoMap.get(openEntries.get("versionId")); VersionTree versionInfo = versionIdVersionInfoMap.get(openEntries.get("versionId"));
if (versionInfo == null) { if (versionInfo == null) {
log.error("根据版本ID,未查询到相关的版本信息。versionId:{}", openEntries.get("versionId")); log.error("根据版本ID,未查询到相关的版本信息。versionId:{}", openEntries.get("versionId"));

@@ -1,10 +1,12 @@
package com.keyware.composeanalysis.task; package com.keyware.composeanalysis.task;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.collection.CollectionUtil; import cn.hutool.core.collection.CollectionUtil;
import cn.hutool.core.io.FileUtil; import cn.hutool.core.io.FileUtil;
import com.baomidou.mybatisplus.core.conditions.update.LambdaUpdateWrapper; import com.baomidou.mybatisplus.core.conditions.update.LambdaUpdateWrapper;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import com.keyware.common.constant.enums.AnalysisStatusEnum; import com.keyware.common.constant.enums.AnalysisStatusEnum;
import com.keyware.common.exception.BusinessException;
import com.keyware.composeanalysis.constant.FixedValue; import com.keyware.composeanalysis.constant.FixedValue;
import com.keyware.composeanalysis.constant.MongoDBConst; import com.keyware.composeanalysis.constant.MongoDBConst;
import com.keyware.composeanalysis.constant.enums.AnalysisLevelEnum; import com.keyware.composeanalysis.constant.enums.AnalysisLevelEnum;
@@ -84,7 +86,7 @@ public class PorjectAnalysisTask {
//todo 如果整体耗时较长,將matchOpenFileInfo存储到数据库的逻辑修改成异步的 //todo 如果整体耗时较长,將matchOpenFileInfo存储到数据库的逻辑修改成异步的
log.info("项目级分析完成,用时:" + (System.currentTimeMillis() - startTime) / 1000 + "s"); log.info("项目级分析完成,用时:" + (System.currentTimeMillis() - startTime) / 1000 + "s");
} catch (Exception e) { } catch (Exception e) {
AnalysisLogUtil.insert(mongoTemplate, "成分分析失败:" + e.getStackTrace()); AnalysisLogUtil.insert(mongoTemplate, "成分分析失败:" + e.getMessage());
log.error("项目级分析失败,项目名称:" + analysisTask.getFileName(), e); log.error("项目级分析失败,项目名称:" + analysisTask.getFileName(), e);
LambdaUpdateWrapper<AnalysisTask> updateWrapper = new LambdaUpdateWrapper<>(); LambdaUpdateWrapper<AnalysisTask> updateWrapper = new LambdaUpdateWrapper<>();
updateWrapper.eq(AnalysisTask::getId, analysisTask.getId()) updateWrapper.eq(AnalysisTask::getId, analysisTask.getId())
@@ -123,19 +125,16 @@ public class PorjectAnalysisTask {
//当前文件开源信息存入数据库中 //当前文件开源信息存入数据库中
mongoTemplate.insert(projectAssembly); mongoTemplate.insert(projectAssembly);
//更新文件分析的状态
mongoTemplate.update(FileDataMongoDto.class)
.matching(where("isDirectory").is(false))
.apply(new Update().set("openType", true)
.set("openRate", 100.00d)
.set("fileAnalysisStatus", FileAnalysisStatusEnum.ANALYSIS_DONE.getCode()))
.all();
//保存具体开源文件信息 //保存具体开源文件信息
VersionTree openProjectList = solrUtils.queryVersionTreeByVersionId(openSourceProject.getVersionId()); VersionTree openProject = solrUtils.queryVersionTreeByVersionId(openSourceProject.getVersionId());
if (openProject == null) {
throw new BusinessException("查询开源项目信息失败,项目versionId:" + openSourceProject.getVersionId());
}
Query fileQuery = new Query(where("isDirectory").is(false)); Query fileQuery = new Query(where("isDirectory").is(false));
List<FileDataMongoDto> fileDataMongoDtos = mongoTemplate.find(fileQuery, FileDataMongoDto.class); List<FileDataMongoDto> fileDataMongoDtos = mongoTemplate.find(fileQuery, FileDataMongoDto.class);
saveProjectOpenInfo(openProjectList, fileDataMongoDtos); Set<String> openFileMd5s = fileDataMongoDtos.stream().map(FileDataMongoDto::getMd5).collect(Collectors.toSet());
//匹配到了整个项目,则将被测件的所有文件设置为开源
saveProjectOpenInfo(openProject, fileDataMongoDtos,openFileMd5s);
return true; return true;
} }
return false; return false;
@@ -172,11 +171,10 @@ public class PorjectAnalysisTask {
//通过md5去*_SourceFileBase中匹配版本Id //通过md5去*_SourceFileBase中匹配版本Id
Set<String> fileMd5s = data.stream().map(FileDataMongoDto::getMd5).collect(Collectors.toSet()); Set<String> fileMd5s = data.stream().map(FileDataMongoDto::getMd5).collect(Collectors.toSet());
Map<String, SolrDocument> md5VersionObjMap = solrUtils.batchQueryVersionIdFromSourceFileBaseBySourceMd5(currentCoreName, fileMd5s); Map<String, SolrDocument> md5VersionObjMap = solrUtils.batchQueryVersionIdFromSourceFileBaseBySourceMd5(currentCoreName, fileMd5s);
if (CollectionUtil.isEmpty(md5VersionObjMap)) { if (CollectionUtil.isNotEmpty(md5VersionObjMap)) {
return;
}
//保存结果数据 //保存结果数据
saveMatchOpenFileInfo(md5VersionObjMap, data); saveMatchOpenFileInfo(md5VersionObjMap, data);
}
} else { } else {
//非主流语言的,没有单独的特征库,统一到默认的特征库进行检索 //非主流语言的,没有单独的特征库,统一到默认的特征库进行检索
otherLanguageFiles.addAll(data); otherLanguageFiles.addAll(data);
@@ -192,9 +190,9 @@ public class PorjectAnalysisTask {
//暂时忽略字符流md5的匹配,因为大部分都是一样的 //暂时忽略字符流md5的匹配,因为大部分都是一样的
Set<String> fileMd5s = otherLanguageFiles.stream().map(FileDataMongoDto::getMd5).collect(Collectors.toSet()); Set<String> fileMd5s = otherLanguageFiles.stream().map(FileDataMongoDto::getMd5).collect(Collectors.toSet());
Map<String, SolrDocument> md5VersionIdMap = solrUtils.batchQueryVersionIdFromSourceFileBaseBySourceMd5(MongoDBConst.TABLE_NAME_SOURCE_FILE_BASE, fileMd5s); Map<String, SolrDocument> md5VersionIdMap = solrUtils.batchQueryVersionIdFromSourceFileBaseBySourceMd5(MongoDBConst.TABLE_NAME_SOURCE_FILE_BASE, fileMd5s);
if (md5VersionIdMap == null || md5VersionIdMap.isEmpty()) { if (CollUtil.isEmpty(md5VersionIdMap)) {
//如果没有匹配到,直接更新文件分析状态已完成,因为非32种语言的文件,无法进行解析,通过源文件的MD5匹配不到,就匹配不到了,无需进行下一步的匹配 //如果没有匹配到,直接更新文件分析状态已完成,因为非32种语言的文件,无法进行解析,通过源文件的MD5匹配不到,就匹配不到了,无需进行下一步的匹配
updateFileAnalysisStatus(fileMd5s,true); updateFileAnalysisStatus(fileMd5s,false);
return; return;
} }
saveMatchOpenFileInfo(md5VersionIdMap, otherLanguageFiles); saveMatchOpenFileInfo(md5VersionIdMap, otherLanguageFiles);
@@ -226,46 +224,37 @@ public class PorjectAnalysisTask {
MatchOpenFileMongoDto matchOpenFile = getMatchOpenFile(versionInfo, fileDataMongoDto, versionObj.get("fullPath").toString()); MatchOpenFileMongoDto matchOpenFile = getMatchOpenFile(versionInfo, fileDataMongoDto, versionObj.get("fullPath").toString());
batchInsertCache.add(matchOpenFile); batchInsertCache.add(matchOpenFile);
}); });
if (CollectionUtils.isNotEmpty(batchInsertCache)) { if (CollectionUtils.isNotEmpty(batchInsertCache)) {
mongoTemplate.insert(batchInsertCache, MatchOpenFileMongoDto.class); mongoTemplate.insert(batchInsertCache, MatchOpenFileMongoDto.class);
//更新文件分析的状态 Set<String> openFileIds = batchInsertCache.stream().map(MatchOpenFileMongoDto::getId).collect(Collectors.toSet());
updateFileAnalysisStatus(md5VersionIdMap.keySet(),true); updateFileAnalysisStatus(openFileIds,true);
} }
} }
//匹配到开源项目后,保存各个文件的开源信息 //匹配到开源项目后,保存匹配到的各个文件的开源信息
private void saveProjectOpenInfo(VersionTree versionInfo, List<FileDataMongoDto> originalFiles) { private void saveProjectOpenInfo(VersionTree versionInfo, List<FileDataMongoDto> originalFiles,Set<String> matchedFileMd5s) {
Map<String, FileDataMongoDto> originalMd5ObjMap = originalFiles.stream().collect(Collectors.toMap(FileDataMongoDto::getMd5, Function.identity(), (key1, key2) -> key1)); Map<String, FileDataMongoDto> originalMd5ObjMap = originalFiles.stream().collect(Collectors.toMap(FileDataMongoDto::getMd5, Function.identity(), (key1, key2) -> key1));
Set<String> matchedMd5s = new HashSet<>();
List<MatchOpenFileMongoDto> batchInsertCache = new ArrayList<>(); List<MatchOpenFileMongoDto> batchInsertCache = new ArrayList<>();
List<VersionTreeNode> fileInfos = versionInfo.getDirTree(); List<VersionTreeNode> fileInfos = versionInfo.getDirTree();
//todo 这里会出现重复的md5数据,后续需要处理
Map<String, String> md5ToFullPathMap = fileInfos.stream().collect(Collectors.toMap(VersionTreeNode::getSourceFileMd5, VersionTreeNode::getFullPath, (key1, key2) -> key1));
fileInfos.forEach(versionTreeNodeObj->{ matchedFileMd5s.forEach(fileMd5 -> {
String openFileMd5 = versionTreeNodeObj.getSourceFileMd5();
//看是否和被测件的md5匹配
if (originalMd5ObjMap.keySet().contains(openFileMd5)) {
//匹配的文件只保存一次 //匹配的文件只保存一次
if (!matchedMd5s.contains(openFileMd5)) { MatchOpenFileMongoDto matchOpenFile = getMatchOpenFile(versionInfo, originalMd5ObjMap.get(fileMd5), md5ToFullPathMap.get(fileMd5));
MatchOpenFileMongoDto matchOpenFile = getMatchOpenFile(versionInfo, originalMd5ObjMap.get(openFileMd5),versionTreeNodeObj.getFullPath());
batchInsertCache.add(matchOpenFile); batchInsertCache.add(matchOpenFile);
matchedMd5s.add(openFileMd5);
}
}
//分批保存,防止单个项目太大,撑爆内存 或 超过 mongodb最大插入数 //分批保存,防止单个项目太大,撑爆内存 或 超过 mongodb最大插入数
if (batchInsertCache.size() >= 1000) { if (batchInsertCache.size() >= 1000) {
mongoTemplate.insert(batchInsertCache, MatchOpenFileMongoDto.class); mongoTemplate.insert(batchInsertCache, MatchOpenFileMongoDto.class);
batchInsertCache.clear(); batchInsertCache.clear();
} }
}); });
if (!batchInsertCache.isEmpty()) {
if (batchInsertCache.size() != 0) {
mongoTemplate.insert(batchInsertCache, MatchOpenFileMongoDto.class); mongoTemplate.insert(batchInsertCache, MatchOpenFileMongoDto.class);
} }
//更新文件分析的状态 //更新文件分析的状态
updateFileAnalysisStatus(matchedMd5s,true); updateFileAnalysisStatus(matchedFileMd5s,true);
} }
//获取匹配到的开源文件信息 //获取匹配到的开源文件信息
@@ -322,18 +311,19 @@ public class PorjectAnalysisTask {
VersionTree openProject = solrUtils.queryVersionTree(queryStr); VersionTree openProject = solrUtils.queryVersionTree(queryStr);
log.info("query versionTree cost:{}s", (System.currentTimeMillis() - startTime) / 1000); log.info("query versionTree cost:{}s", (System.currentTimeMillis() - startTime) / 1000);
//如果存在没有匹配到开源数据的情况,直接退出循环匹配 //如果存在没有匹配到开源数据的情况,直接退出循环匹配
if (openProject == null){ if (openProject == null) {
break; break;
} }
//保存匹配的开源文件信息
saveProjectOpenInfo(openProject, projectFiles);
//获取开源项目的所有文件md5集合 //获取开源项目的所有文件md5集合
List<String> openFilesMd5 = openProject.getDirTree().stream().map(VersionTreeNode::getSourceFileMd5).collect(Collectors.toList()); List<String> openFilesMd5 = openProject.getDirTree().stream().map(VersionTreeNode::getSourceFileMd5).collect(Collectors.toList());
//获取被测件和开源项目相同的文件 //获取被测件和开源项目相同的文件
Set<String> matchedFiles = unMatchedFileMd5s.stream().filter(item -> openFilesMd5.contains(item)).collect(Collectors.toSet()); Set<String> matchedFiles = unMatchedFileMd5s.stream().filter(item -> openFilesMd5.contains(item)).collect(Collectors.toSet());
//保存匹配的开源文件信息
//todo 这里会重复保存数据, 需要优化
saveProjectOpenInfo(openProject, projectFiles, matchedFiles);
//保存已匹配的文件md5,后续需要统计整体的开源率 //保存已匹配的文件md5,后续需要统计整体的开源率
matchedFileMd5Set.addAll(matchedFiles); matchedFileMd5Set.addAll(matchedFiles);

@@ -188,6 +188,7 @@ public class SolrUtils {
if (openSourceContent == null) { if (openSourceContent == null) {
log.error("根据开源文件MD5:{}未找到对应的开源文件源码", openSourceFileMd5); log.error("根据开源文件MD5:{}未找到对应的开源文件源码", openSourceFileMd5);
return "";
} }
return openSourceContent.getFieldValue("sourceContent").toString(); return openSourceContent.getFieldValue("sourceContent").toString();
} }

Loading…
Cancel
Save