|
|
@ -26,6 +26,8 @@ import javax.swing.text.DefaultStyledDocument; |
|
|
|
import javax.swing.text.rtf.RTFEditorKit; |
|
|
|
import javax.swing.text.rtf.RTFEditorKit; |
|
|
|
import java.io.*; |
|
|
|
import java.io.*; |
|
|
|
import java.nio.charset.StandardCharsets; |
|
|
|
import java.nio.charset.StandardCharsets; |
|
|
|
|
|
|
|
import java.nio.file.Files; |
|
|
|
|
|
|
|
import java.util.Arrays; |
|
|
|
import java.util.List; |
|
|
|
import java.util.List; |
|
|
|
import java.util.function.Consumer; |
|
|
|
import java.util.function.Consumer; |
|
|
|
|
|
|
|
|
|
|
@ -36,6 +38,7 @@ import java.util.function.Consumer; |
|
|
|
* @since 2021/6/29 |
|
|
|
* @since 2021/6/29 |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
public class PoiFileReadUtil { |
|
|
|
public class PoiFileReadUtil { |
|
|
|
|
|
|
|
// File extensions that isReadilyFile accepts directly, without inspecting the file's bytes.
private final static String[] readilyFileTypes = {"doc", "docx", "xls", "xlsx", "ppt", "pptx", "rtf", "pdf", "txt"}; |
|
|
|
|
|
|
|
|
|
|
|
/** |
|
|
|
/** |
|
|
|
* 解析文件文本内容 |
|
|
|
* 解析文件文本内容 |
|
|
@ -67,11 +70,28 @@ public class PoiFileReadUtil { |
|
|
|
case "txt": |
|
|
|
case "txt": |
|
|
|
return readContentByTxt(file); |
|
|
|
return readContentByTxt(file); |
|
|
|
default: |
|
|
|
default: |
|
|
|
return ""; |
|
|
|
if (isText(file)) { |
|
|
|
|
|
|
|
return readContentByTxt(file); |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} else { |
|
|
|
} else { |
|
|
|
throw new Exception("文件不存在"); |
|
|
|
throw new FileNotFoundException("文件不存在"); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
return null; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/** |
|
|
|
|
|
|
|
* 判断是否为可读文件 |
|
|
|
|
|
|
|
* |
|
|
|
|
|
|
|
* @param file 文件 |
|
|
|
|
|
|
|
* @return |
|
|
|
|
|
|
|
*/ |
|
|
|
|
|
|
|
public static boolean isReadilyFile(File file) { |
|
|
|
|
|
|
|
String fileSuffix = FileUtil.extName(file); |
|
|
|
|
|
|
|
if (Arrays.asList(readilyFileTypes).contains(fileSuffix)) { |
|
|
|
|
|
|
|
return true; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
return isText(file); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/** |
|
|
|
/** |
|
|
@ -81,7 +101,7 @@ public class PoiFileReadUtil { |
|
|
|
* @return 文件内容 |
|
|
|
* @return 文件内容 |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
private static String readContentByDoc(File file) throws IOException { |
|
|
|
private static String readContentByDoc(File file) throws IOException { |
|
|
|
InputStream fis = new FileInputStream(file); |
|
|
|
InputStream fis = Files.newInputStream(file.toPath()); |
|
|
|
WordExtractor wordExtractor = new WordExtractor(fis);//使用HWPF组件中WordExtractor类从Word文档中提取文本或段落
|
|
|
|
WordExtractor wordExtractor = new WordExtractor(fis);//使用HWPF组件中WordExtractor类从Word文档中提取文本或段落
|
|
|
|
StringBuilder result = new StringBuilder(); |
|
|
|
StringBuilder result = new StringBuilder(); |
|
|
|
for (String words : wordExtractor.getParagraphText()) {//获取段落内容
|
|
|
|
for (String words : wordExtractor.getParagraphText()) {//获取段落内容
|
|
|
@ -115,7 +135,7 @@ public class PoiFileReadUtil { |
|
|
|
* @return 文件内容 |
|
|
|
* @return 文件内容 |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
private static String readContentByXls(File file) throws IOException { |
|
|
|
private static String readContentByXls(File file) throws IOException { |
|
|
|
InputStream is = new FileInputStream(file); |
|
|
|
InputStream is = Files.newInputStream(file.toPath()); |
|
|
|
HSSFWorkbook wb = new HSSFWorkbook(new POIFSFileSystem(is)); |
|
|
|
HSSFWorkbook wb = new HSSFWorkbook(new POIFSFileSystem(is)); |
|
|
|
ExcelExtractor extractor = new ExcelExtractor(wb); |
|
|
|
ExcelExtractor extractor = new ExcelExtractor(wb); |
|
|
|
extractor.setFormulasNotResults(false); |
|
|
|
extractor.setFormulasNotResults(false); |
|
|
@ -133,7 +153,7 @@ public class PoiFileReadUtil { |
|
|
|
* @return 文件内容 |
|
|
|
* @return 文件内容 |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
private static String readContentByXlsx(File file) throws IOException { |
|
|
|
private static String readContentByXlsx(File file) throws IOException { |
|
|
|
InputStream is = new FileInputStream(file); |
|
|
|
InputStream is = Files.newInputStream(file.toPath()); |
|
|
|
XSSFExcelExtractor extractor = new XSSFExcelExtractor(new XSSFWorkbook(is)); |
|
|
|
XSSFExcelExtractor extractor = new XSSFExcelExtractor(new XSSFWorkbook(is)); |
|
|
|
extractor.setIncludeSheetNames(false); |
|
|
|
extractor.setIncludeSheetNames(false); |
|
|
|
String result = extractor.getText(); |
|
|
|
String result = extractor.getText(); |
|
|
@ -174,12 +194,12 @@ public class PoiFileReadUtil { |
|
|
|
private static String readContentByRtf(File file) throws IOException, BadLocationException { |
|
|
|
private static String readContentByRtf(File file) throws IOException, BadLocationException { |
|
|
|
DefaultStyledDocument styledDoc = new DefaultStyledDocument(); |
|
|
|
DefaultStyledDocument styledDoc = new DefaultStyledDocument(); |
|
|
|
// 创建文件输入流
|
|
|
|
// 创建文件输入流
|
|
|
|
InputStream is = new FileInputStream(file); |
|
|
|
InputStream is = Files.newInputStream(file.toPath()); |
|
|
|
new RTFEditorKit().read(is, styledDoc, 0); |
|
|
|
new RTFEditorKit().read(is, styledDoc, 0); |
|
|
|
is.close(); |
|
|
|
is.close(); |
|
|
|
byte[] buff = styledDoc.getText(0, styledDoc.getLength()).getBytes(StandardCharsets.ISO_8859_1); |
|
|
|
byte[] buff = styledDoc.getText(0, styledDoc.getLength()).getBytes(StandardCharsets.ISO_8859_1); |
|
|
|
|
|
|
|
|
|
|
|
return new String(buff, get_charset(buff)); |
|
|
|
return new String(buff, getCharset(buff)); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/** |
|
|
|
/** |
|
|
@ -190,7 +210,7 @@ public class PoiFileReadUtil { |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
private static String readContentByPpt(File file) throws IOException { |
|
|
|
private static String readContentByPpt(File file) throws IOException { |
|
|
|
// word 2003: 图片不会被读取
|
|
|
|
// word 2003: 图片不会被读取
|
|
|
|
InputStream fis = new FileInputStream(file); |
|
|
|
InputStream fis = Files.newInputStream(file.toPath()); |
|
|
|
PowerPointExtractor ex = new PowerPointExtractor(fis); |
|
|
|
PowerPointExtractor ex = new PowerPointExtractor(fis); |
|
|
|
String text = ex.getText().replace("\n", ""); |
|
|
|
String text = ex.getText().replace("\n", ""); |
|
|
|
ex.close(); |
|
|
|
ex.close(); |
|
|
@ -205,7 +225,7 @@ public class PoiFileReadUtil { |
|
|
|
* @return 文件内容 |
|
|
|
* @return 文件内容 |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
private static String readContentByPptx(File file) throws IOException { |
|
|
|
private static String readContentByPptx(File file) throws IOException { |
|
|
|
InputStream is = new FileInputStream(file); |
|
|
|
InputStream is = Files.newInputStream(file.toPath()); |
|
|
|
XMLSlideShow slide = new XMLSlideShow(is); |
|
|
|
XMLSlideShow slide = new XMLSlideShow(is); |
|
|
|
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(slide); |
|
|
|
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(slide); |
|
|
|
extractor.close(); |
|
|
|
extractor.close(); |
|
|
@ -221,7 +241,30 @@ public class PoiFileReadUtil { |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
private static String readContentByTxt(File file) throws IOException { |
|
|
|
private static String readContentByTxt(File file) throws IOException { |
|
|
|
FileInputStream fis = new FileInputStream(file); |
|
|
|
FileInputStream fis = new FileInputStream(file); |
|
|
|
return getCharset(fis); |
|
|
|
return getFileText(fis); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/** |
|
|
|
|
|
|
|
* 判断文件是否为文本格式的文件 |
|
|
|
|
|
|
|
* |
|
|
|
|
|
|
|
* @param file |
|
|
|
|
|
|
|
* @return |
|
|
|
|
|
|
|
*/ |
|
|
|
|
|
|
|
public static boolean isText(File file) { |
|
|
|
|
|
|
|
boolean isText = true; |
|
|
|
|
|
|
|
try (FileInputStream fin = new FileInputStream(file)) { |
|
|
|
|
|
|
|
long len = file.length(); |
|
|
|
|
|
|
|
for (int j = 0; j < (int) len; j++) { |
|
|
|
|
|
|
|
int t = fin.read(); |
|
|
|
|
|
|
|
if (t < 32 && t != 9 && t != 10 && t != 13) { |
|
|
|
|
|
|
|
isText = false; |
|
|
|
|
|
|
|
break; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} catch (Exception e) { |
|
|
|
|
|
|
|
e.printStackTrace(); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
return isText; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/** |
|
|
|
/** |
|
|
@ -231,7 +274,7 @@ public class PoiFileReadUtil { |
|
|
|
* @return - |
|
|
|
* @return - |
|
|
|
* @throws IOException - |
|
|
|
* @throws IOException - |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
public static String getCharset(InputStream is) throws IOException { |
|
|
|
public static String getFileText(InputStream is) throws IOException { |
|
|
|
BufferedInputStream bis = new BufferedInputStream(is); |
|
|
|
BufferedInputStream bis = new BufferedInputStream(is); |
|
|
|
|
|
|
|
|
|
|
|
int len; |
|
|
|
int len; |
|
|
@ -251,7 +294,7 @@ public class PoiFileReadUtil { |
|
|
|
bis.close(); |
|
|
|
bis.close(); |
|
|
|
is.close(); |
|
|
|
is.close(); |
|
|
|
|
|
|
|
|
|
|
|
return new String(buffer, get_charset(buffer)); |
|
|
|
return new String(buffer, getCharset(buffer)); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/** |
|
|
|
/** |
|
|
@ -261,7 +304,7 @@ public class PoiFileReadUtil { |
|
|
|
* @return - |
|
|
|
* @return - |
|
|
|
* @throws IOException - |
|
|
|
* @throws IOException - |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
private static String get_charset(byte[] file) throws IOException { |
|
|
|
private static String getCharset(byte[] file) throws IOException { |
|
|
|
String charset = "GBK"; |
|
|
|
String charset = "GBK"; |
|
|
|
byte[] first3Bytes = new byte[3]; |
|
|
|
byte[] first3Bytes = new byte[3]; |
|
|
|
InputStream bis = null; |
|
|
|
InputStream bis = null; |
|
|
@ -330,7 +373,7 @@ public class PoiFileReadUtil { |
|
|
|
|
|
|
|
|
|
|
|
public static void convertToUTF8(MultipartFile file, Consumer<? super MultipartFile> action) throws IOException { |
|
|
|
public static void convertToUTF8(MultipartFile file, Consumer<? super MultipartFile> action) throws IOException { |
|
|
|
File temp = new File(file.getName()); |
|
|
|
File temp = new File(file.getName()); |
|
|
|
String charset = get_charset(file.getBytes()); |
|
|
|
String charset = getCharset(file.getBytes()); |
|
|
|
if ("UTF-8".equalsIgnoreCase(charset)) { |
|
|
|
if ("UTF-8".equalsIgnoreCase(charset)) { |
|
|
|
action.accept(file); |
|
|
|
action.accept(file); |
|
|
|
} |
|
|
|
} |
|
|
@ -351,4 +394,5 @@ public class PoiFileReadUtil { |
|
|
|
action.accept(toMultipartFile); |
|
|
|
action.accept(toMultipartFile); |
|
|
|
temp.deleteOnExit(); |
|
|
|
temp.deleteOnExit(); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|