|
|
@ -4,7 +4,6 @@ import cn.hutool.core.lang.Pair; |
|
|
|
import com.keyware.common.constant.enums.AnalysisStatusEnum; |
|
|
|
import com.keyware.common.constant.enums.AnalysisStatusEnum; |
|
|
|
import com.keyware.composeanalysis.constant.FixedValue; |
|
|
|
import com.keyware.composeanalysis.constant.FixedValue; |
|
|
|
import com.keyware.composeanalysis.constant.RedisConst; |
|
|
|
import com.keyware.composeanalysis.constant.RedisConst; |
|
|
|
import com.keyware.composeanalysis.constant.SolrDBConst; |
|
|
|
|
|
|
|
import com.keyware.composeanalysis.constant.enums.AnalysisLevelEnum; |
|
|
|
import com.keyware.composeanalysis.constant.enums.AnalysisLevelEnum; |
|
|
|
import com.keyware.composeanalysis.constant.enums.FileAnalysisStatusEnum; |
|
|
|
import com.keyware.composeanalysis.constant.enums.FileAnalysisStatusEnum; |
|
|
|
import com.keyware.composeanalysis.entity.AnalysisTask; |
|
|
|
import com.keyware.composeanalysis.entity.AnalysisTask; |
|
|
@ -18,8 +17,6 @@ import com.keyware.keyswan.anaysis.AnalysisFactory; |
|
|
|
import com.keyware.keyswan.common.CodeFile; |
|
|
|
import com.keyware.keyswan.common.CodeFile; |
|
|
|
import com.keyware.utils.IdGenerator; |
|
|
|
import com.keyware.utils.IdGenerator; |
|
|
|
import lombok.extern.log4j.Log4j2; |
|
|
|
import lombok.extern.log4j.Log4j2; |
|
|
|
import org.apache.commons.collections.CollectionUtils; |
|
|
|
|
|
|
|
import org.apache.commons.lang3.StringUtils; |
|
|
|
|
|
|
|
import org.apache.solr.common.SolrDocument; |
|
|
|
import org.apache.solr.common.SolrDocument; |
|
|
|
import org.apache.solr.common.SolrDocumentList; |
|
|
|
import org.apache.solr.common.SolrDocumentList; |
|
|
|
import org.springframework.data.mongodb.core.MongoTemplate; |
|
|
|
import org.springframework.data.mongodb.core.MongoTemplate; |
|
|
@ -81,42 +78,31 @@ public class FileAnalysisTask extends IAnalysisTask { |
|
|
|
//获取当前文件名称
|
|
|
|
//获取当前文件名称
|
|
|
|
String fileName = analysisFile.getName(); |
|
|
|
String fileName = analysisFile.getName(); |
|
|
|
|
|
|
|
|
|
|
|
AnalysisLogUtil.insert(mongoTemplate, "【文件级分析】正在分析" + fileName); |
|
|
|
|
|
|
|
try { |
|
|
|
try { |
|
|
|
//只有主流语言的才能解析
|
|
|
|
|
|
|
|
//非32种主流语言的不能提取文件特征,在文件级MD5匹配的时候,已经做过匹配
|
|
|
|
|
|
|
|
if (StringUtils.isNotEmpty(analysisFile.getSuffix()) && FixedValue.SUFFIX_SOLR_VERSION.containsKey(analysisFile.getSuffix())) { |
|
|
|
|
|
|
|
//根据文件后缀 查询 *_CutFileInfo库名称
|
|
|
|
//根据文件后缀 查询 *_CutFileInfo库名称
|
|
|
|
String featureCoreName = FixedValue.SUFFIX_SOLR_FILE.get(analysisFile.getSuffix()); |
|
|
|
String featureCoreName = FixedValue.SUFFIX_SOLR_FILE.get(analysisFile.getSuffix()); |
|
|
|
//根据文件名称,获取文件解析器
|
|
|
|
//根据文件名称,获取文件解析器
|
|
|
|
Analysis analysis = AnalysisFactory.getAnalysis(fileName); |
|
|
|
Analysis analysis = AnalysisFactory.getAnalysis(fileName); |
|
|
|
//如果 analysis 返回值为null 说明还未支持这种语言的特征提取 可以直接通过文件的MD5值去solr库中匹配
|
|
|
|
|
|
|
|
if (analysis != null) { |
|
|
|
|
|
|
|
//如果文件大小超过3M,则不进行文件级行级特征提取
|
|
|
|
//如果文件大小超过3M,则不进行文件级行级特征提取
|
|
|
|
Integer fileSize = analysisFile.getFileSize(); |
|
|
|
|
|
|
|
if (fileSize < (3 * 1024 * 1024)) { |
|
|
|
|
|
|
|
CodeFile codeFile = analysis.analysisFile(analysisFile.getFileUrl(), "1", "0"); |
|
|
|
CodeFile codeFile = analysis.analysisFile(analysisFile.getFileUrl(), "1", "0"); |
|
|
|
|
|
|
|
|
|
|
|
//根据文件的特征值,去相应文件文件后缀的特征库中进行查询
|
|
|
|
//根据文件的特征值,去相应文件文件后缀的特征库中进行查询
|
|
|
|
if (codeFile != null) { |
|
|
|
SolrDocumentList openSourceFileList = getFeatureSimilarityFromSolr(featureCoreName, codeFile); |
|
|
|
String querySb = "sourceMd5:" + codeFile.getSourceMd5() + " OR cutFileMd5:" + codeFile.getCutFileMd5() + " OR traitFileMd5:" + codeFile.getTraitFileMd5(); |
|
|
|
|
|
|
|
SolrDocumentList openSourceFileList = solrUtils.query(featureCoreName, querySb, "sourceMd5"); |
|
|
|
//则统计当前文件的开源率
|
|
|
|
//如果当前文件在源码库中,匹配到了数据,则统计当前文件的开源率
|
|
|
|
ananlyzeFileOpenRate(openSourceFileList, codeFile); |
|
|
|
if (CollectionUtils.isNotEmpty(openSourceFileList)) { |
|
|
|
|
|
|
|
ananlyzeFileOpenRate(openSourceFileList,codeFile); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
//更新文件级分析结果
|
|
|
|
//更新文件级分析结果
|
|
|
|
analysisFile.setFileAnalysisStatus(FileAnalysisStatusEnum.ANALYSIS_DONE.getCode()); |
|
|
|
analysisFile.setFileAnalysisStatus(FileAnalysisStatusEnum.ANALYSIS_DONE.getCode()); |
|
|
|
mongoTemplate.update(FileDataMongoDto.class) |
|
|
|
mongoTemplate.update(FileDataMongoDto.class) |
|
|
|
.matching(where("_id").is(analysisFile.getId())) |
|
|
|
.matching(where("_id").is(analysisFile.getId())) |
|
|
|
.replaceWith(analysisFile) |
|
|
|
.replaceWith(analysisFile) |
|
|
|
.findAndReplace(); |
|
|
|
.findAndReplace(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
AnalysisLogUtil.insert(mongoTemplate, "【文件级分析】成功" + fileName); |
|
|
|
} catch (Exception e) { |
|
|
|
} catch (Exception e) { |
|
|
|
AnalysisLogUtil.insertErrorInfo(mongoTemplate, "【文件级】提取失败" + fileName, e); |
|
|
|
AnalysisLogUtil.insertErrorInfo(mongoTemplate, "【文件级】分析失败" + fileName, e); |
|
|
|
log.error("文件:" + fileName + "文件级别特征提取失败!", e); |
|
|
|
log.error("文件:" + fileName + "文件级别分析失败!", e); |
|
|
|
//将当前文件的分析状态变更为失败
|
|
|
|
//将当前文件的分析状态变更为失败
|
|
|
|
analysisFile.setFileAnalysisStatus(FileAnalysisStatusEnum.FAILED_ANALYSIS.getCode()); |
|
|
|
analysisFile.setFileAnalysisStatus(FileAnalysisStatusEnum.FAILED_ANALYSIS.getCode()); |
|
|
|
//更新文件级分析结果
|
|
|
|
//更新文件级分析结果
|
|
|
@ -130,40 +116,39 @@ public class FileAnalysisTask extends IAnalysisTask { |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/** |
|
|
|
|
|
|
|
* 根据 特征值 从特征库中检索 具有特征相似的 |
|
|
|
|
|
|
|
* |
|
|
|
|
|
|
|
* @param featureCoreName 检索的solr 库名称 |
|
|
|
|
|
|
|
* @param codeFile 源文件解析结果 |
|
|
|
|
|
|
|
* @return |
|
|
|
|
|
|
|
*/ |
|
|
|
|
|
|
|
private SolrDocumentList getFeatureSimilarityFromSolr(String featureCoreName, CodeFile codeFile) { |
|
|
|
|
|
|
|
if (codeFile == null) { |
|
|
|
|
|
|
|
log.error("特征为空,无法查询:{}", analysisFile.getName()); |
|
|
|
|
|
|
|
return new SolrDocumentList(); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
String queryStr = "sourceMd5:" + codeFile.getSourceMd5() + " OR cutFileMd5:" + codeFile.getCutFileMd5() + " OR traitFileMd5:" + codeFile.getTraitFileMd5(); |
|
|
|
|
|
|
|
return solrUtils.query(featureCoreName, queryStr, "sourceMd5"); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/** |
|
|
|
/** |
|
|
|
* 分析文件的开源率 |
|
|
|
* 分析文件的开源率 |
|
|
|
* |
|
|
|
* |
|
|
|
* @param openSourceFileList 匹配的开源文件信息 |
|
|
|
* @param openSourceFileList 匹配的开源文件信息 |
|
|
|
* @throws IOException |
|
|
|
* @throws IOException |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
private void ananlyzeFileOpenRate(SolrDocumentList openSourceFileList,CodeFile fileAnalysisRes) throws IOException { |
|
|
|
private void ananlyzeFileOpenRate(SolrDocumentList openSourceFileList, CodeFile fileAnalysisRes) { |
|
|
|
|
|
|
|
|
|
|
|
//根据匹配的开源文件的md5 获取版本ID
|
|
|
|
|
|
|
|
Set<String> sourceFileMd5 = openSourceFileList.stream().map(solrDocument -> (String) solrDocument.get("sourceMd5")).collect(Collectors.toSet()); |
|
|
|
|
|
|
|
String sourceCoreName = FixedValue.SUFFIX_SOLR_VERSION.get(analysisFile.getSuffix()); |
|
|
|
|
|
|
|
Map<String, SolrDocument> md5VersionObjMap = solrUtils.batchQueryVersionIdFromSourceFileBaseBySourceMd5(sourceCoreName, sourceFileMd5); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//根据版本ID获取版本信息
|
|
|
|
|
|
|
|
Set<String> versionIds = md5VersionObjMap.values().stream().map(solrDocument -> (String) solrDocument.get("versionId")).collect(Collectors.toSet()); |
|
|
|
|
|
|
|
List<VersionTree> treeInfoList = solrUtils.queryBatchVersionInfoByVersionIds(versionIds); |
|
|
|
|
|
|
|
Map<String, VersionTree> versionIdMap = treeInfoList.stream().collect(Collectors.toMap(VersionTree::getVersionId, Function.identity())); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//获取被测件文本内容
|
|
|
|
|
|
|
|
String fileContent = new String(Files.readAllBytes(Paths.get(analysisFile.getFileUrl())), "utf-8").replaceAll(" ", ""); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//将被测件的文本内容拆分成行信息,用于匹配开源信息
|
|
|
|
|
|
|
|
List<String> fileLines = SimilarityUtil.getSplitWords(fileContent); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
HashSet<Integer> openLineNum = new HashSet<>(); |
|
|
|
HashSet<Integer> openLineNum = new HashSet<>(); |
|
|
|
|
|
|
|
|
|
|
|
List<MatchOpenFile> matchOpenFilesRes = calculateOpenRate(openSourceFileList,fileAnalysisRes,sourceCoreName,openLineNum); |
|
|
|
//计算每个文件的开源率和特征相似度
|
|
|
|
|
|
|
|
List<MatchOpenFile> matchOpenFilesRes = calculateOpenRate(openSourceFileList, fileAnalysisRes, openLineNum); |
|
|
|
//统计被测件的总体开源率
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//获取开源率阈值,判断当前文件是否开源
|
|
|
|
//获取开源率阈值,判断当前文件是否开源
|
|
|
|
Integer openRateThreshold = analysisTask.getOpenRateThreshold(); |
|
|
|
Integer openRateThreshold = analysisTask.getOpenRateThreshold(); |
|
|
|
|
|
|
|
|
|
|
|
BigDecimal openRate = new BigDecimal(openLineNum.size()).divide(new BigDecimal(fileLines.size()), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2); |
|
|
|
BigDecimal openRate = new BigDecimal(openLineNum.size()).divide(fileAnalysisRes.getCodeRowNum(), 4, RoundingMode.HALF_UP).multiply(new BigDecimal(100)).setScale(2); |
|
|
|
|
|
|
|
|
|
|
|
//超过阈值,则认为当前文件是开源文件
|
|
|
|
//超过阈值,则认为当前文件是开源文件
|
|
|
|
if (openRate.compareTo(new BigDecimal(openRateThreshold)) > 0) { |
|
|
|
if (openRate.compareTo(new BigDecimal(openRateThreshold)) > 0) { |
|
|
@ -192,20 +177,21 @@ public class FileAnalysisTask extends IAnalysisTask { |
|
|
|
|
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/** |
|
|
|
/** |
|
|
|
* 计算当前文件的特征相似度 和 开源率 |
|
|
|
* 计算当前文件的特征相似度 和 开源率 |
|
|
|
* |
|
|
|
* |
|
|
|
* @param matchOpenFiles 通过MD5 匹配到的所有开源文件 |
|
|
|
* @param matchOpenFiles 通过MD5 匹配到的所有开源文件 |
|
|
|
* @param sourceFileBaseCoreName 当前文件特征文件的 solr coreName |
|
|
|
|
|
|
|
* @param matchLineRowsNum 所有开源文件匹配到的开源行号列表 |
|
|
|
* @param matchLineRowsNum 所有开源文件匹配到的开源行号列表 |
|
|
|
* @return 匹配的开源文件解析后的结果集 |
|
|
|
* @return 匹配的开源文件解析后的结果集 |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
private List<MatchOpenFile> calculateOpenRate(SolrDocumentList matchOpenFiles, CodeFile fileAnalysisRes, String sourceFileBaseCoreName, Set<Integer> matchLineRowsNum) { |
|
|
|
private List<MatchOpenFile> calculateOpenRate(SolrDocumentList matchOpenFiles, CodeFile fileAnalysisRes, Set<Integer> matchLineRowsNum) { |
|
|
|
|
|
|
|
|
|
|
|
//匹配的开源文件列表
|
|
|
|
//匹配的开源文件列表
|
|
|
|
List<MatchOpenFile> matchOpenFilesRes = new ArrayList<>(); |
|
|
|
List<MatchOpenFile> matchOpenFilesRes = new ArrayList<>(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//根据匹配的开源文件的md5 获取版本ID
|
|
|
|
|
|
|
|
String sourceFileBaseCoreName = FixedValue.SUFFIX_SOLR_VERSION.get(analysisFile.getSuffix()); |
|
|
|
|
|
|
|
|
|
|
|
//首先根据文件的MD5查询开源文件的版本ID,和路径信息
|
|
|
|
//首先根据文件的MD5查询开源文件的版本ID,和路径信息
|
|
|
|
Set<String> openSourceFileMd5s = matchOpenFiles.stream().map(doc -> (String) doc.get("sourceMd5")).collect(Collectors.toSet()); |
|
|
|
Set<String> openSourceFileMd5s = matchOpenFiles.stream().map(doc -> (String) doc.get("sourceMd5")).collect(Collectors.toSet()); |
|
|
|
Map<String, SolrDocument> md5VersionInfoMap = solrUtils.batchQueryVersionIdFromSourceFileBaseBySourceMd5(sourceFileBaseCoreName, openSourceFileMd5s); |
|
|
|
Map<String, SolrDocument> md5VersionInfoMap = solrUtils.batchQueryVersionIdFromSourceFileBaseBySourceMd5(sourceFileBaseCoreName, openSourceFileMd5s); |
|
|
@ -230,7 +216,7 @@ public class FileAnalysisTask extends IAnalysisTask { |
|
|
|
|
|
|
|
|
|
|
|
SolrDocument openEntries = md5VersionInfoMap.get(openSourceFileMd5); |
|
|
|
SolrDocument openEntries = md5VersionInfoMap.get(openSourceFileMd5); |
|
|
|
VersionTree versionInfo = versionIdVersionInfoMap.get(openEntries.get("versionId")); |
|
|
|
VersionTree versionInfo = versionIdVersionInfoMap.get(openEntries.get("versionId")); |
|
|
|
if (versionInfo == null){ |
|
|
|
if (versionInfo == null) { |
|
|
|
log.error("找不到开源文件版本信息,versionId:{}", openEntries.get("versionId")); |
|
|
|
log.error("找不到开源文件版本信息,versionId:{}", openEntries.get("versionId")); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@ -250,6 +236,4 @@ public class FileAnalysisTask extends IAnalysisTask { |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|