// Spring Boot document-processing utility class
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xwpf.usermodel.*;
import org.apache.tika.Tika;
import org.springframework.mock.web.MockMultipartFile;
import org.springframework.web.multipart.MultipartFile;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
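/**
 * Document-processing utilities: conversions between {@link MultipartFile},
 * Word documents and plain text, plus text-length and word counting for
 * DOCX, DOC, PDF and TXT uploads.
 */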
@Slf4j
public class DocumentUtil {
/**
 * Parses an uploaded file as a Word (.docx) document.
 *
 * <p>The upload's input stream is closed before this method returns; the
 * returned document remains usable and must be closed by the caller.
 *
 * @param multipartFile the uploaded file; must be non-null and non-empty
 * @return the parsed document, or {@code null} if reading the upload fails
 *         with an {@link IOException}
 * @throws IllegalArgumentException if {@code multipartFile} is null or empty
 */
public static XWPFDocument multi2word(MultipartFile multipartFile) {
    if (multipartFile == null) {
        throw new IllegalArgumentException("The provided MultipartFile is null.");
    }
    if (multipartFile.isEmpty()) {
        throw new IllegalArgumentException("The provided MultipartFile is empty.");
    }
    // Close the stream once the document has been constructed from it.
    try (InputStream inputStream = multipartFile.getInputStream()) {
        return new XWPFDocument(inputStream);
    } catch (IOException e) {
        // Callers rely on the null-on-failure contract, so log instead of rethrowing.
        log.error("Failed to read '{}' as a Word document", multipartFile.getOriginalFilename(), e);
        return null;
    }
}
/**
 * Serializes a Word document into an in-memory {@link MultipartFile}.
 *
 * <p>The same value is used for both the multipart parameter name and the
 * original filename; the content type is the DOCX MIME type.
 *
 * @param document the document to serialize
 * @param fileName name used for both the part name and the original filename
 * @return the multipart wrapper around the serialized bytes, or {@code null}
 *         if serialization or wrapping fails for any reason
 */
public static MultipartFile word2multi(XWPFDocument document, String fileName) {
    String contentType = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
    try {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        document.write(buffer);
        // Hand the bytes over directly; no need to wrap them in a stream first.
        return new MockMultipartFile(fileName, fileName, contentType, buffer.toByteArray());
    } catch (Exception e) {
        // Broad catch kept on purpose: existing callers expect null on any failure.
        log.error("Failed to convert Word document '{}' to a MultipartFile", fileName, e);
        return null;
    }
}
/**
 * Extracts the text of a Word document's paragraphs as plain text.
 *
 * <p>Only the paragraphs returned by {@code document.getParagraphs()} are
 * included. Paragraphs are joined with the platform line separator, with no
 * trailing separator.
 *
 * @param document the document to read
 * @return the joined paragraph text; empty if the document has no paragraphs
 */
public static String word2plain(XWPFDocument document) {
    final String separator = System.lineSeparator();
    StringBuilder plain = new StringBuilder();
    boolean firstParagraph = true;
    for (XWPFParagraph current : document.getParagraphs()) {
        // Put the separator in front of every paragraph except the first.
        if (!firstParagraph) {
            plain.append(separator);
        }
        plain.append(current.getText());
        firstParagraph = false;
    }
    return plain.toString();
}
/**
 * Builds a Word document from plain text, one paragraph per line.
 *
 * <p>Lines are split on {@code \n} or {@code \r\n}, so text with Windows line
 * endings does not leave a stray carriage return at the end of each paragraph.
 * Trailing empty lines are dropped by {@link String#split(String)}.
 *
 * @param text the text to convert; must not be null
 * @return a new document with one paragraph per line of {@code text}
 * @throws IllegalArgumentException if {@code text} is null
 */
public static XWPFDocument text2word(String text) {
    if (text == null) {
        throw new IllegalArgumentException("text must not be null");
    }
    List<String> textList = List.of(text.split("\r?\n"));
    return text2word(textList);
}
/**
 * Builds a Word document from a list of paragraph texts.
 *
 * <p>Each list element becomes one paragraph containing a single run.
 *
 * @param textList paragraph texts, in document order
 * @return a new document containing one paragraph per element
 */
public static XWPFDocument text2word(List<String> textList) {
    XWPFDocument result = new XWPFDocument();
    for (String paragraphText : textList) {
        result.createParagraph().createRun().setText(paragraphText);
    }
    return result;
}
/**
 * Converts plain text to a Word document and wraps it as a {@link MultipartFile}.
 *
 * @param text     the text to convert; must contain non-whitespace characters
 * @param filename name passed on to {@code word2multi}
 * @return the result of {@code word2multi} for the generated document
 * @throws IllegalArgumentException if {@code text} is null or blank after trimming
 */
public static MultipartFile text2word2multi(String text, String filename) {
    boolean blank = (text == null) || text.trim().isEmpty();
    if (blank) {
        throw new IllegalArgumentException("文本内容不能为空");
    }
    XWPFDocument generated = text2word(text);
    return word2multi(generated, filename);
}
/**
 * Wraps plain text as an in-memory {@code text/plain} {@link MultipartFile}.
 *
 * <p>The text is encoded as UTF-8; {@code filename} is used for both the part
 * name and the original filename.
 *
 * @param text     the text content
 * @param filename name used for both the part name and the original filename
 * @return a multipart file holding the UTF-8 bytes of {@code text}
 */
public static MultipartFile text2txt2multi(String text, String filename) {
    byte[] utf8Content = text.getBytes(StandardCharsets.UTF_8);
    // The byte[] constructor stores the same content the stream-based one would read.
    return new MockMultipartFile(filename, filename, "text/plain", utf8Content);
}
/**
 * Detects the type of an uploaded file from its content using Apache Tika.
 *
 * <p>Known MIME types are mapped to the short names {@code docx}, {@code doc},
 * {@code pdf}, {@code txt} and {@code rtf}; any other detected MIME type is
 * returned unchanged.
 *
 * @param file the uploaded file to inspect
 * @return the short type name, the raw detected MIME type if it is not one of
 *         the mapped types, or {@code null} if the upload could not be read
 */
public static String getFileTypeDetail(MultipartFile file) {
    Tika tika = new Tika();
    String detectedType;
    // Open the stream in try-with-resources so it is always closed after detection.
    try (InputStream inputStream = file.getInputStream()) {
        detectedType = tika.detect(inputStream);
    } catch (IOException e) {
        log.error("Failed to detect the type of '{}'", file.getOriginalFilename(), e);
        return null;
    }
    switch (detectedType) {
        case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            return "docx";
        case "application/x-tika-msoffice":
            return "doc";
        case "application/pdf":
            return "pdf";
        case "text/plain":
            return "txt";
        case "application/rtf":
            return "rtf";
        default:
            return detectedType;
    }
}
// public static int getDocxTextLength(MultipartFile file) {
// try (XWPFDocument document = new XWPFDocument(file.getInputStream())) {
// StringBuilder textBuilder = new StringBuilder();
// // 读取所有段落
// for (XWPFParagraph paragraph : document.getParagraphs()) {
// textBuilder.append(paragraph.getText());
// }
// // 读取所有表格中的文本
// for (XWPFTable table : document.getTables()) {
// for (XWPFTableRow row : table.getRows()) {
// for (XWPFTableCell cell : row.getTableCells()) {
// for (XWPFParagraph paragraph : cell.getParagraphs()) {
// textBuilder.append(paragraph.getText());
// }
// }
// }
// }
// // 获取纯文本内容
// String text = textBuilder.toString();
// // 计算文本长度
// return text.length();
// } catch (IOException e) {
// e.printStackTrace();
// return -1; // 或者抛出异常,根据你的需求处理
// }
// }
/**
 * Counts the characters of text in a DOCX upload.
 *
 * <p>The count is the summed {@link String#length()} of every body paragraph
 * and every paragraph inside the cells of the document's tables.
 *
 * @param file the uploaded DOCX file; must be non-null, non-empty and at most 40 MB
 * @return the total text length
 * @throws IllegalArgumentException if the file is null, empty or larger than 40 MB
 * @throws RuntimeException if the file cannot be read or parsed
 */
public static int getDocxTextLength(MultipartFile file) {
    if (file == null || file.isEmpty()) {
        throw new IllegalArgumentException("The uploaded DOCX file is empty or null.");
    }
    final long maxSize = 40L * 1024 * 1024; // 40 MB
    if (file.getSize() > maxSize) {
        // The message now states the limit that is actually enforced.
        throw new IllegalArgumentException("The DOCX file exceeds the maximum allowed size of 40MB.");
    }
    try (InputStream inputStream = file.getInputStream();
         XWPFDocument document = new XWPFDocument(inputStream)) {
        int length = docxParagraphsLength(document.getParagraphs());
        for (XWPFTable table : document.getTables()) {
            for (XWPFTableRow row : table.getRows()) {
                for (XWPFTableCell cell : row.getTableCells()) {
                    length += docxParagraphsLength(cell.getParagraphs());
                }
            }
        }
        return length;
    } catch (IOException e) {
        // The cause travels with the rethrown exception; no separate stack-trace dump.
        throw new RuntimeException("Failed to process the DOCX file: " + e.getMessage(), e);
    }
}

/** Sums the text length of the given paragraphs, skipping paragraphs whose text is null. */
private static int docxParagraphsLength(List<XWPFParagraph> paragraphs) {
    int length = 0;
    for (XWPFParagraph paragraph : paragraphs) {
        String text = paragraph.getText();
        if (text != null) {
            length += text.length();
        }
    }
    return length;
}
// public static int getDocTextLength(MultipartFile file) {
// try (InputStream inputStream = file.getInputStream()) {
// // 创建 HWPFDocument 对象
// HWPFDocument document = new HWPFDocument(inputStream);
//
// // 使用 WordExtractor 提取文本
// WordExtractor extractor = new WordExtractor(document);
//
// // 获取纯文本内容
// String text = extractor.getText();
//
// // 计算文本长度
// return text.length();
// } catch (IOException e) {
// e.printStackTrace();
// throw new RuntimeException("Failed to read the .doc file", e);
// }
// }
/**
 * Counts the characters of text in a legacy DOC upload.
 *
 * <p>The count is the {@link String#length()} of the text returned by
 * {@code WordExtractor.getText()}.
 *
 * @param file the uploaded DOC file; must be non-null, non-empty and at most 40 MB
 * @return the length of the extracted text
 * @throws IllegalArgumentException if the file is null, empty or larger than 40 MB
 * @throws RuntimeException if the file cannot be read or parsed
 */
public static int getDocTextLength(MultipartFile file) {
    if (file == null || file.isEmpty()) {
        throw new IllegalArgumentException("The uploaded DOC file is empty or null.");
    }
    final long maxSize = 40L * 1024 * 1024; // 40 MB
    if (file.getSize() > maxSize) {
        // The message now states the limit that is actually enforced.
        throw new IllegalArgumentException("The DOC file exceeds the maximum allowed size of 40MB.");
    }
    try (InputStream inputStream = file.getInputStream();
         HWPFDocument document = new HWPFDocument(inputStream)) {
        WordExtractor extractor = new WordExtractor(document);
        return extractor.getText().length();
    } catch (IOException e) {
        // The cause travels with the rethrown exception; no separate stack-trace dump.
        throw new RuntimeException("Failed to process the DOC file: " + e.getMessage(), e);
    }
}
// public static int getPdfTextLength(MultipartFile file) {
// try (InputStream inputStream = file.getInputStream()) {
// // 加载 PDF 文档
// PDDocument document = PDDocument.load(inputStream);
//
// // 使用 PDFTextStripper 提取文本
// PDFTextStripper pdfStripper = new PDFTextStripper();
// String text = pdfStripper.getText(document);
//
// // 计算文本长度
// return text.length();
// } catch (IOException e) {
// e.printStackTrace();
// throw new RuntimeException("Failed to read the .pdf file", e);
// }
// }
/**
 * Counts the characters of text in a PDF upload.
 *
 * <p>The count is the {@link String#length()} of the text returned by
 * {@code PDFTextStripper.getText(document)}.
 *
 * @param file the uploaded PDF file; must be non-null, non-empty and at most 40 MB
 * @return the length of the extracted text
 * @throws IllegalArgumentException if the file is null, empty or larger than 40 MB
 * @throws RuntimeException if the file cannot be read or parsed
 */
public static int getPdfTextLength(MultipartFile file) {
    if (file == null || file.isEmpty()) {
        throw new IllegalArgumentException("The uploaded PDF file is empty or null.");
    }
    final long maxSize = 40L * 1024 * 1024; // 40 MB
    if (file.getSize() > maxSize) {
        // The message now states the limit that is actually enforced.
        throw new IllegalArgumentException("The PDF file exceeds the maximum allowed size of 40MB.");
    }
    try (InputStream inputStream = file.getInputStream();
         PDDocument document = PDDocument.load(inputStream)) {
        PDFTextStripper pdfStripper = new PDFTextStripper();
        return pdfStripper.getText(document).length();
    } catch (IOException e) {
        // The cause travels with the rethrown exception; no separate stack-trace dump.
        throw new RuntimeException("Failed to process the PDF file: " + e.getMessage(), e);
    }
}
// public static int getTxtTextLength(MultipartFile file) {
// try (InputStream inputStream = file.getInputStream()) {
// // 读取文件内容
// byte[] bytes = inputStream.readAllBytes();
//
// // 将字节数组转换为字符串
// String text = new String(bytes, StandardCharsets.UTF_8);
//
// // 计算文本长度
// return text.length();
// } catch (IOException e) {
// e.printStackTrace();
// throw new RuntimeException("Failed to read the .txt file", e);
// }
// }
/**
 * Counts the characters in a plain-text upload.
 *
 * <p>The whole file is read into memory, decoded as UTF-8, and the
 * {@link String#length()} of the decoded text is returned.
 *
 * @param file the uploaded TXT file; must be non-null, non-empty and at most 40 MB
 * @return the length of the decoded text
 * @throws IllegalArgumentException if the file is null, empty or larger than 40 MB
 * @throws RuntimeException if the file cannot be read
 */
public static int getTxtTextLength(MultipartFile file) {
    if (file == null || file.isEmpty()) {
        throw new IllegalArgumentException("The uploaded TXT file is empty or null.");
    }
    final long maxSize = 40L * 1024 * 1024; // 40 MB
    if (file.getSize() > maxSize) {
        // The message now states the limit that is actually enforced.
        throw new IllegalArgumentException("The TXT file exceeds the maximum allowed size of 40MB.");
    }
    try (InputStream inputStream = file.getInputStream()) {
        String text = new String(inputStream.readAllBytes(), StandardCharsets.UTF_8);
        return text.length();
    } catch (IOException e) {
        // The cause travels with the rethrown exception; no separate stack-trace dump.
        throw new RuntimeException("Failed to read the TXT file: " + e.getMessage(), e);
    }
}
/**
 * Counts the characters of text in an upload, dispatching on its detected type.
 *
 * <p>The type comes from {@code getFileTypeDetail}; {@code docx}, {@code doc},
 * {@code pdf} and {@code txt} are supported.
 *
 * @param file the uploaded file; must not be null
 * @return the text length reported by the type-specific counter
 * @throws IllegalArgumentException if {@code file} is null
 * @throws RuntimeException if the type cannot be detected or is not supported,
 *         or if the type-specific counter fails
 */
public static int getDocumentTextLength(MultipartFile file) {
    if (file == null) {
        throw new IllegalArgumentException("The uploaded file is null.");
    }
    String fileType = getFileTypeDetail(file);
    // Detection returns null when the upload cannot be read; switching on null would throw a bare NPE.
    if (fileType == null) {
        throw new RuntimeException("无法计算字数:无法识别文件类型");
    }
    switch (fileType) {
        case "docx":
            return DocumentUtil.getDocxTextLength(file);
        case "doc":
            return DocumentUtil.getDocTextLength(file);
        case "pdf":
            return DocumentUtil.getPdfTextLength(file);
        case "txt":
            return DocumentUtil.getTxtTextLength(file);
        default:
            throw new RuntimeException("无法计算字数:无法识别文件类型");
    }
}
/**
 * Counts words in a DOCX upload.
 *
 * <p>The result is the number of {@code \b\w+\b} matches plus the number of
 * characters in the range U+4E00..U+9FA5, summed over all body paragraphs and
 * all paragraphs inside table cells.
 *
 * <p>NOTE(review): the PDF and TXT counterparts in this class match
 * {@code [a-zA-Z]+} instead of {@code \w+}, so digits and underscores are
 * counted here but not there. Confirm which definition is intended.
 *
 * @param file the uploaded DOCX file
 * @return the combined word and Chinese-character count
 * @throws RuntimeException if the upload cannot be read as a DOCX document or
 *         closing the document fails
 */
public static int getEnglishDocxTextLength(MultipartFile file) {
    try (XWPFDocument document = multi2word(file)) {
        // multi2word reports read failures by returning null; fail clearly instead of with an NPE.
        if (document == null) {
            throw new RuntimeException("读取 .docx 文件失败");
        }
        Pattern wordPattern = Pattern.compile("\\b\\w+\\b");
        Pattern chineseCharacterPattern = Pattern.compile("[\u4e00-\u9fa5]");

        int total = countDocxParagraphTokens(document.getParagraphs(), wordPattern, chineseCharacterPattern);
        for (XWPFTable table : document.getTables()) {
            for (XWPFTableRow row : table.getRows()) {
                for (XWPFTableCell cell : row.getTableCells()) {
                    total += countDocxParagraphTokens(cell.getParagraphs(), wordPattern, chineseCharacterPattern);
                }
            }
        }
        return total;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}

/** Counts matches of both patterns across the given paragraphs, skipping null or empty text. */
private static int countDocxParagraphTokens(List<XWPFParagraph> paragraphs,
                                            Pattern wordPattern,
                                            Pattern chineseCharacterPattern) {
    int count = 0;
    for (XWPFParagraph paragraph : paragraphs) {
        String text = paragraph.getText();
        if (text == null || text.isEmpty()) {
            continue;
        }
        Matcher wordMatcher = wordPattern.matcher(text);
        while (wordMatcher.find()) {
            count++;
        }
        Matcher characterMatcher = chineseCharacterPattern.matcher(text);
        while (characterMatcher.find()) {
            count++;
        }
    }
    return count;
}
/**
 * Counts words in a legacy DOC upload.
 *
 * <p>The result is the number of {@code \b\w+\b} matches plus the number of
 * characters in the range U+4E00..U+9FA5 in the document text.
 *
 * @param file the uploaded DOC file
 * @return the combined count; 0 if the document text is null or empty
 * @throws RuntimeException if the file cannot be read or parsed
 */
public static int getEnglishDocTextLength(MultipartFile file) {
    try (ByteArrayInputStream rawBytes = new ByteArrayInputStream(file.getBytes());
         POIFSFileSystem fileSystem = new POIFSFileSystem(rawBytes);
         HWPFDocument wordDoc = new HWPFDocument(fileSystem)) {
        String content = wordDoc.getDocumentText();
        if (content == null || content.isEmpty()) {
            return 0;
        }
        int total = 0;
        // Word-like tokens.
        for (Matcher words = Pattern.compile("\\b\\w+\\b").matcher(content); words.find(); ) {
            total++;
        }
        // Individual Chinese characters.
        for (Matcher chinese = Pattern.compile("[\u4e00-\u9fa5]").matcher(content); chinese.find(); ) {
            total++;
        }
        return total;
    } catch (IOException e) {
        throw new RuntimeException("读取 .doc 文件失败", e);
    }
}
/**
 * Counts words in a PDF upload.
 *
 * <p>The result is the number of {@code \b[a-zA-Z]+\b} matches plus the number
 * of characters in the range U+4E00..U+9FA5 in the text extracted by
 * {@code PDFTextStripper}.
 *
 * @param file the uploaded PDF file
 * @return the combined word and Chinese-character count
 * @throws RuntimeException if the file cannot be read or parsed
 */
public static int getEnglishPdfTextLength(MultipartFile file) {
    try (ByteArrayInputStream rawBytes = new ByteArrayInputStream(file.getBytes());
         PDDocument pdf = PDDocument.load(rawBytes)) {
        String content = new PDFTextStripper().getText(pdf);

        Matcher latinWords = Pattern.compile("\\b[a-zA-Z]+\\b").matcher(content);
        Matcher hanChars = Pattern.compile("[\u4e00-\u9fa5]").matcher(content);

        int total = 0;
        for (; latinWords.find(); ) {
            total++;
        }
        for (; hanChars.find(); ) {
            total++;
        }
        return total;
    } catch (IOException e) {
        throw new RuntimeException("读取 PDF 文件失败", e);
    }
}
/**
 * Counts words in a plain-text upload.
 *
 * <p>The file is read line by line as UTF-8 and the lines are joined with a
 * single space. The result is the number of {@code \b[a-zA-Z]+\b} matches plus
 * the number of characters in the range U+4E00..U+9FA5 in the joined text.
 *
 * @param file the uploaded TXT file
 * @return the combined word and Chinese-character count
 * @throws RuntimeException if the file cannot be read
 */
public static int getEnglishTxtTextLength(MultipartFile file) {
    try (InputStreamReader utf8Reader = new InputStreamReader(file.getInputStream(), StandardCharsets.UTF_8);
         BufferedReader lineReader = new BufferedReader(utf8Reader)) {
        StringBuilder joined = new StringBuilder();
        for (String current = lineReader.readLine(); current != null; current = lineReader.readLine()) {
            joined.append(current);
            joined.append(" ");
        }
        String content = joined.toString();

        int latinWordTotal = 0;
        Matcher latinWords = Pattern.compile("\\b[a-zA-Z]+\\b").matcher(content);
        while (latinWords.find()) {
            latinWordTotal += 1;
        }

        int hanCharTotal = 0;
        Matcher hanChars = Pattern.compile("[\u4e00-\u9fa5]").matcher(content);
        while (hanChars.find()) {
            hanCharTotal += 1;
        }

        return latinWordTotal + hanCharTotal;
    } catch (IOException e) {
        throw new RuntimeException("读取 TXT 文件失败", e);
    }
}
/**
 * Counts words in an upload, dispatching on its detected type.
 *
 * <p>The type comes from {@code getFileTypeDetail}; {@code docx}, {@code doc},
 * {@code pdf} and {@code txt} are supported.
 *
 * @param file the uploaded file; must not be null
 * @return the count reported by the type-specific word counter
 * @throws IllegalArgumentException if {@code file} is null
 * @throws RuntimeException if the type cannot be detected or is not supported,
 *         or if the type-specific counter fails
 */
public static int getEnglishDocumentTextLength(MultipartFile file) {
    if (file == null) {
        throw new IllegalArgumentException("The uploaded file is null.");
    }
    String fileType = getFileTypeDetail(file);
    // Detection returns null when the upload cannot be read; switching on null would throw a bare NPE.
    if (fileType == null) {
        throw new RuntimeException("无法计算字数:无法识别文件类型");
    }
    switch (fileType) {
        case "docx":
            return DocumentUtil.getEnglishDocxTextLength(file);
        case "doc":
            return DocumentUtil.getEnglishDocTextLength(file);
        case "pdf":
            return DocumentUtil.getEnglishPdfTextLength(file);
        case "txt":
            return DocumentUtil.getEnglishTxtTextLength(file);
        default:
            throw new RuntimeException("无法计算字数:无法识别文件类型");
    }
}
}