// Spring Boot utility class for document processing

import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xwpf.usermodel.*;
import org.apache.tika.Tika;
import org.springframework.mock.web.MockMultipartFile;
import org.springframework.web.multipart.MultipartFile;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * 文档处理
 */
@Slf4j
public class DocumentUtil {
    /**
     * 将MultipartFile转换为word
     *
     * @param multipartFile
     * @return
     */
    public static XWPFDocument multi2word(MultipartFile multipartFile) {
        try {
            // 检查multipartFile是否为空
            if (multipartFile.isEmpty()) {
                throw new IllegalArgumentException("The provided MultipartFile is empty.");
            }

            // 获取文件名和输入流
            String fileName = multipartFile.getOriginalFilename();
            InputStream inputStream = multipartFile.getInputStream();

            // 使用输入流创建XWPFDocument
            XWPFDocument document = new XWPFDocument(inputStream);

            return document;
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * 将word转换为MultipartFile
     *
     * @param document
     * @param fileName
     * @return
     */
    public static MultipartFile word2multi(XWPFDocument document, String fileName) {
        try {
            String contentType = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
            ByteArrayOutputStream bos = new ByteArrayOutputStream();
            document.write(bos);
            //XWPFDocument 转 byte[]
            byte[] barray = bos.toByteArray();
            //byte[] 转 InputStream
            InputStream is = new ByteArrayInputStream(barray);
            //InputStream 转 MultipartFile
            MultipartFile multipartFile = new MockMultipartFile(fileName, fileName, contentType, is);
            return multipartFile;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * 将word文件转换为文本
     *
     * @param document
     * @return
     */
    public static String word2plain(XWPFDocument document) {
        StringBuilder textBuilder = new StringBuilder();

        // 遍历文档中的所有段落
        List<XWPFParagraph> paragraphs = document.getParagraphs();
        for (XWPFParagraph paragraph : paragraphs) {
            // 获取段落中的文本并添加到StringBuilder中
            textBuilder.append(paragraph.getText());
            // 在每个段落后添加换行符
            textBuilder.append(System.lineSeparator());
        }

        // 删除最后一个换行符,除非文档为空
        if (!textBuilder.isEmpty()) {
            textBuilder.setLength(textBuilder.length() - System.lineSeparator().length());
        }

        return textBuilder.toString();
    }

    /**
     * 将文本转换为word
     *
     * @param text
     * @return
     */
    public static XWPFDocument text2word(String text) {
        List<String> textList = List.of(text.split("\n"));
        return text2word(textList);
    }

    /**
     * 将多个文本段落转换为word
     *
     * @param textList
     * @return
     */
    public static XWPFDocument text2word(List<String> textList) {
        XWPFDocument document = new XWPFDocument();
        for (int i = 0; i < textList.size(); i++) {
            XWPFParagraph paragraph = document.createParagraph();
            XWPFRun run = paragraph.createRun();
            run.setText(textList.get(i));
        }
        return document;
    }

    /**
     * 将文本转为word再转为MultipartFile
     *
     * @param text
     * @param filename
     * @return
     */
    public static MultipartFile text2word2multi(String text, String filename) {
        if (text == null || text.trim().isEmpty()) {
            throw new IllegalArgumentException("文本内容不能为空");
        }
        return word2multi(text2word(text), filename);
    }

    /**
     * 将文本转为MultipartFile
     * @param text
     * @param filename
     * @return
     */
    public static MultipartFile text2txt2multi(String text, String filename) {
        // 使用给定的文本和文件名创建一个MockMultipartFile
        try {
            return new MockMultipartFile(filename,
                    filename,
                    "text/plain",
                    new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    public static String getFileTypeDetail(MultipartFile file) {
        // 使用Apache Tika检测文件类型
        Tika tika = new Tika();
        try {
            String detectedType = tika.detect(file.getInputStream());
            switch (detectedType) {
                case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
                    return "docx";
                case "application/x-tika-msoffice":
                    return "doc";
                case "application/pdf":
                    return "pdf";
                case "text/plain":
                    return "txt";
                case "application/rtf":
                    return "rtf";
                default:
                    return detectedType;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
        // 根据需要进行其他处理...
    }

//    public static int getDocxTextLength(MultipartFile file) {
//        try (XWPFDocument document = new XWPFDocument(file.getInputStream())) {
//            StringBuilder textBuilder = new StringBuilder();
//            // 读取所有段落
//            for (XWPFParagraph paragraph : document.getParagraphs()) {
//                textBuilder.append(paragraph.getText());
//            }
//            // 读取所有表格中的文本
//            for (XWPFTable table : document.getTables()) {
//                for (XWPFTableRow row : table.getRows()) {
//                    for (XWPFTableCell cell : row.getTableCells()) {
//                        for (XWPFParagraph paragraph : cell.getParagraphs()) {
//                            textBuilder.append(paragraph.getText());
//                        }
//                    }
//                }
//            }
//            // 获取纯文本内容
//            String text = textBuilder.toString();
//            // 计算文本长度
//            return text.length();
//        } catch (IOException e) {
//            e.printStackTrace();
//            return -1; // 或者抛出异常,根据你的需求处理
//        }
//    }

    public static int getDocxTextLength(MultipartFile file) {
        // 检查文件是否为空
        if (file == null || file.isEmpty()) {
            throw new IllegalArgumentException("The uploaded DOCX file is empty or null.");
        }

        // 可选:限制文件大小(例如 40MB)
        long maxSize = 40 * 1024 * 1024; // 10 MB
        if (file.getSize() > maxSize) {
            throw new IllegalArgumentException("The DOCX file exceeds the maximum allowed size of 10MB.");
        }

        try (InputStream inputStream = file.getInputStream();
             XWPFDocument document = new XWPFDocument(inputStream)) {

            StringBuilder textBuilder = new StringBuilder();

            // 提取所有段落文本
            for (XWPFParagraph paragraph : document.getParagraphs()) {
                String text = paragraph.getText();
                if (text != null) {
                    textBuilder.append(text);
                }
            }

            // 提取所有表格单元格中的文本
            for (XWPFTable table : document.getTables()) {
                for (XWPFTableRow row : table.getRows()) {
                    for (XWPFTableCell cell : row.getTableCells()) {
                        for (XWPFParagraph paragraph : cell.getParagraphs()) {
                            String text = paragraph.getText();
                            if (text != null) {
                                textBuilder.append(text);
                            }
                        }
                    }
                }
            }

            return textBuilder.toString().length();

        } catch (IOException e) {
            e.printStackTrace();
            throw new RuntimeException("Failed to process the DOCX file: " + e.getMessage(), e);
        }
    }

    //    public static int getDocTextLength(MultipartFile file) {
//        try (InputStream inputStream = file.getInputStream()) {
//            // 创建 HWPFDocument 对象
//            HWPFDocument document = new HWPFDocument(inputStream);
//
//            // 使用 WordExtractor 提取文本
//            WordExtractor extractor = new WordExtractor(document);
//
//            // 获取纯文本内容
//            String text = extractor.getText();
//
//            // 计算文本长度
//            return text.length();
//        } catch (IOException e) {
//            e.printStackTrace();
//            throw new RuntimeException("Failed to read the .doc file", e);
//        }
//    }
    public static int getDocTextLength(MultipartFile file) {
        // 检查文件是否为空
        if (file == null || file.isEmpty()) {
            throw new IllegalArgumentException("The uploaded DOC file is empty or null.");
        }

        // 可选:限制文件大小(例如 40MB)
        long maxSize = 40 * 1024 * 1024; // 10 MB
        if (file.getSize() > maxSize) {
            throw new IllegalArgumentException("The DOC file exceeds the maximum allowed size of 10MB.");
        }

        try (InputStream inputStream = file.getInputStream();
             HWPFDocument document = new HWPFDocument(inputStream)) {

            WordExtractor extractor = new WordExtractor(document);
            String text = extractor.getText();

            return text.length();

        } catch (IOException e) {
            e.printStackTrace();
            throw new RuntimeException("Failed to process the DOC file: " + e.getMessage(), e);
        }
    }

    //    public static int getPdfTextLength(MultipartFile file) {
//        try (InputStream inputStream = file.getInputStream()) {
//            // 加载 PDF 文档
//            PDDocument document = PDDocument.load(inputStream);
//
//            // 使用 PDFTextStripper 提取文本
//            PDFTextStripper pdfStripper = new PDFTextStripper();
//            String text = pdfStripper.getText(document);
//
//            // 计算文本长度
//            return text.length();
//        } catch (IOException e) {
//            e.printStackTrace();
//            throw new RuntimeException("Failed to read the .pdf file", e);
//        }
//    }
    public static int getPdfTextLength(MultipartFile file) {
        // 检查文件是否为空
        if (file == null || file.isEmpty()) {
            throw new IllegalArgumentException("The uploaded PDF file is empty or null.");
        }

        // 可选:限制文件大小(例如 40MB)
        long maxSize = 40 * 1024 * 1024; // 10 MB
        if (file.getSize() > maxSize) {
            throw new IllegalArgumentException("The PDF file exceeds the maximum allowed size of 10MB.");
        }

        try (InputStream inputStream = file.getInputStream();
             PDDocument document = PDDocument.load(inputStream)) {

            PDFTextStripper pdfStripper = new PDFTextStripper();
            String text = pdfStripper.getText(document);

            return text.length();

        } catch (IOException e) {
            // 可记录日志
            e.printStackTrace();
            throw new RuntimeException("Failed to process the PDF file: " + e.getMessage(), e);
        }
    }

    //    public static int getTxtTextLength(MultipartFile file) {
//        try (InputStream inputStream = file.getInputStream()) {
//            // 读取文件内容
//            byte[] bytes = inputStream.readAllBytes();
//
//            // 将字节数组转换为字符串
//            String text = new String(bytes, StandardCharsets.UTF_8);
//
//            // 计算文本长度
//            return text.length();
//        } catch (IOException e) {
//            e.printStackTrace();
//            throw new RuntimeException("Failed to read the .txt file", e);
//        }
//    }

    public static int getTxtTextLength(MultipartFile file) {
        // 检查文件是否为空
        if (file == null || file.isEmpty()) {
            throw new IllegalArgumentException("The uploaded TXT file is empty or null.");
        }

        // 可选:限制文件大小(例如 40MB)
        long maxSize = 40 * 1024 * 1024; // 10 MB
        if (file.getSize() > maxSize) {
            throw new IllegalArgumentException("The TXT file exceeds the maximum allowed size of 10MB.");
        }

        try (InputStream inputStream = file.getInputStream()) {
            byte[] bytes = inputStream.readAllBytes();
            String text = new String(bytes, StandardCharsets.UTF_8);
            return text.length();

        } catch (IOException e) {
            e.printStackTrace();
            throw new RuntimeException("Failed to read the TXT file: " + e.getMessage(), e);
        }
    }

    public static int getDocumentTextLength(MultipartFile file) {
        String fileType = getFileTypeDetail(file);
        int textLength = 0;
        switch (fileType) {
            case "docx":
                textLength = DocumentUtil.getDocxTextLength(file);
                break;
            case "doc":
                textLength = DocumentUtil.getDocTextLength(file);
                break;
            case "pdf":
                textLength = DocumentUtil.getPdfTextLength(file);
                break;
            case "txt":
                textLength = DocumentUtil.getTxtTextLength(file);
                break;
            default:
                throw new RuntimeException("无法计算字数:无法识别文件类型");
        }
        return textLength;
    }

    /**
     * 获取英文文档词数,仅限docx
     *
     * @param file
     * @return
     */
    public static int getEnglishDocxTextLength(MultipartFile file) {

        try (XWPFDocument document = multi2word(file)) {
            // 使用正则表达式匹配英文单词
            Pattern wordPattern = Pattern.compile("\\b\\w+\\b");
            // 使用正则表达式匹配中文字符
            Pattern chineseCharacterPattern = Pattern.compile("[\u4e00-\u9fa5]");

            int englishWordCount = 0;
            int chineseCharacterCount = 0;

            // 遍历文档中的段落
            for (XWPFParagraph paragraph : document.getParagraphs()) {
                String text = paragraph.getText();
                if (text != null && !text.isEmpty()) {
                    Matcher wordMatcher = wordPattern.matcher(text);
                    while (wordMatcher.find()) {
                        englishWordCount++;
                    }

                    Matcher characterMatcher = chineseCharacterPattern.matcher(text);
                    while (characterMatcher.find()) {
                        chineseCharacterCount++;
                    }
                }
            }

            // 遍历文档中的表格
            for (XWPFTable table : document.getTables()) {
                for (XWPFTableRow row : table.getRows()) {
                    for (XWPFTableCell cell : row.getTableCells()) {
                        List<XWPFParagraph> cellParagraphs = cell.getParagraphs();
                        for (XWPFParagraph cellParagraph : cellParagraphs) {
                            String cellText = cellParagraph.getText();
                            if (cellText != null && !cellText.isEmpty()) {
                                Matcher wordMatcher = wordPattern.matcher(cellText);
                                while (wordMatcher.find()) {
                                    englishWordCount++;
                                }

                                Matcher characterMatcher = chineseCharacterPattern.matcher(cellText);
                                while (characterMatcher.find()) {
                                    chineseCharacterCount++;
                                }
                            }
                        }
                    }
                }
            }

            return englishWordCount + chineseCharacterCount;
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    public static int getEnglishDocTextLength(MultipartFile file) {
        try (ByteArrayInputStream inputStream = new ByteArrayInputStream(file.getBytes());
             POIFSFileSystem poifs = new POIFSFileSystem(inputStream);
             HWPFDocument document = new HWPFDocument(poifs)) {

            String text = document.getDocumentText();

            // 使用正则表达式匹配英文单词
            Pattern wordPattern = Pattern.compile("\\b\\w+\\b");
            // 使用正则表达式匹配中文字符
            Pattern chineseCharacterPattern = Pattern.compile("[\u4e00-\u9fa5]");

            int englishWordCount = 0;
            int chineseCharacterCount = 0;

            if (text != null && !text.isEmpty()) {
                Matcher wordMatcher = wordPattern.matcher(text);
                while (wordMatcher.find()) {
                    englishWordCount++;
                }

                Matcher characterMatcher = chineseCharacterPattern.matcher(text);
                while (characterMatcher.find()) {
                    chineseCharacterCount++;
                }
            }

            return englishWordCount + chineseCharacterCount;

        } catch (IOException e) {
            throw new RuntimeException("读取 .doc 文件失败", e);
        }
    }

    public static int getEnglishPdfTextLength(MultipartFile file) {
        try (ByteArrayInputStream inputStream = new ByteArrayInputStream(file.getBytes());
             PDDocument document = PDDocument.load(inputStream)) {

            // 提取整个 PDF 的纯文本
            PDFTextStripper stripper = new PDFTextStripper();
            String text = stripper.getText(document);

            // 定义正则表达式
            Pattern wordPattern = Pattern.compile("\\b[a-zA-Z]+\\b");           // 匹配英文单词
            Pattern chinesePattern = Pattern.compile("[\u4e00-\u9fa5]");       // 匹配中文字符

            Matcher wordMatcher = wordPattern.matcher(text);
            Matcher chineseMatcher = chinesePattern.matcher(text);

            int englishWordCount = 0;
            while (wordMatcher.find()) {
                englishWordCount++;
            }

            int chineseCharacterCount = 0;
            while (chineseMatcher.find()) {
                chineseCharacterCount++;
            }

            return englishWordCount + chineseCharacterCount;

        } catch (IOException e) {
            throw new RuntimeException("读取 PDF 文件失败", e);
        }
    }

    public static int getEnglishTxtTextLength(MultipartFile file) {
        try (InputStreamReader reader = new InputStreamReader(file.getInputStream(), StandardCharsets.UTF_8);
             BufferedReader bufferedReader = new BufferedReader(reader)) {

            StringBuilder textBuilder = new StringBuilder();
            String line;

            while ((line = bufferedReader.readLine()) != null) {
                textBuilder.append(line).append(" ");
            }

            String text = textBuilder.toString();

            // 定义正则表达式
            Pattern wordPattern = Pattern.compile("\\b[a-zA-Z]+\\b");           // 匹配英文单词
            Pattern chinesePattern = Pattern.compile("[\u4e00-\u9fa5]");       // 匹配中文字符

            Matcher wordMatcher = wordPattern.matcher(text);
            Matcher chineseMatcher = chinesePattern.matcher(text);

            int englishWordCount = 0;
            while (wordMatcher.find()) {
                englishWordCount++;
            }

            int chineseCharacterCount = 0;
            while (chineseMatcher.find()) {
                chineseCharacterCount++;
            }

            return englishWordCount + chineseCharacterCount;

        } catch (IOException e) {
            throw new RuntimeException("读取 TXT 文件失败", e);
        }
    }

    public static int getEnglishDocumentTextLength(MultipartFile file) {
        String fileType = getFileTypeDetail(file);
        int textLength = 0;
        switch (fileType) {
            case "docx":
                textLength = DocumentUtil.getEnglishDocxTextLength(file);
                break;
            case "doc":
                textLength = DocumentUtil.getEnglishDocTextLength(file);
                break;
            case "pdf":
                textLength = DocumentUtil.getEnglishPdfTextLength(file);
                break;
            case "txt":
                textLength = DocumentUtil.getEnglishTxtTextLength(file);
                break;
            default:
                throw new RuntimeException("无法计算字数:无法识别文件类型");
        }
        return textLength;
    }
}