java使用Apache PDFBox+POI实现PDF转Word

java使用Apache PDFBox+POI实现PDF转Word

注:仅限简单的纯文本转换,PDF 中的表格、图片等复杂结构无法保留。

<!-- PDFBox -->
<dependency>
   <groupId>org.apache.pdfbox</groupId>
   <artifactId>pdfbox</artifactId>
   <version>2.0.29</version> <!-- 请使用 2.0.x 系列的最新版本;3.x 已移除 PDDocument.load,需改用 Loader.loadPDF -->
</dependency>

<!-- Apache POI -->
<dependency>
   <groupId>org.apache.poi</groupId>
   <artifactId>poi-ooxml</artifactId> <!-- XWPF(.docx)相关类位于 poi-ooxml 中,而不是 poi -->
   <version>5.0.0</version>
</dependency>
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;

import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;

/**
 * Minimal PDF-to-Word converter: extracts the plain text of every PDF page with
 * PDFBox and writes it into a .docx document with Apache POI.
 *
 * <p>Only text is carried over; tables, images and other layout are not preserved.
 */
public class PdfToWord {
    /**
     * Converts a sample PDF into a Word document.
     *
     * @param args unused
     * @throws Exception if the PDF cannot be read or the Word file cannot be written
     */
    public static void main(String[] args) throws Exception {
        // Backslashes must be escaped: "\t" inside a Java string literal is a TAB character,
        // so "D:\test.pdf" would not name the intended file.
        // Arguments: PDF input path, Word output path.
        convertPdfToWord("D:\\test.pdf", "D:\\test.docx");
    }

    /**
     * Extracts the text of each page of a PDF and writes it to a new .docx file,
     * one paragraph per page.
     *
     * @param pdfFilePath  path of the PDF file to read
     * @param wordFilePath path of the .docx file to create (overwritten if it exists)
     * @throws IOException if the PDF cannot be loaded or the Word file cannot be written
     */
    public static void convertPdfToWord(String pdfFilePath, String wordFilePath) throws IOException {
        // try-with-resources guarantees both documents are closed even when text
        // extraction or writing fails. Loading from a File lets PDFBox manage the
        // underlying file handle itself, so no separate input stream can leak.
        try (PDDocument document = PDDocument.load(Paths.get(pdfFilePath).toFile());
             XWPFDocument wordDocument = new XWPFDocument()) {

            PDFTextStripper pdfTextStripper = new PDFTextStripper();
            int totalPages = document.getNumberOfPages();

            // PDFTextStripper page numbers are 1-based and the end page is inclusive.
            for (int pageNumber = 1; pageNumber <= totalPages; pageNumber++) {
                pdfTextStripper.setStartPage(pageNumber);
                pdfTextStripper.setEndPage(pageNumber);

                String text = pdfTextStripper.getText(document);
                addParagraphToWord(wordDocument, text);
            }

            try (FileOutputStream out = new FileOutputStream(wordFilePath)) {
                wordDocument.write(out);
            }
        }
    }

    /**
     * Appends {@code text} to the Word document as a single paragraph, keeping
     * its line structure.
     *
     * @param document the Word document to append to
     * @param text     the text to add; may contain line separators
     */
    private static void addParagraphToWord(XWPFDocument document, String text) {
        XWPFParagraph paragraph = document.createParagraph();
        XWPFRun run = paragraph.createRun();

        // XWPFRun.setText does not translate line separators into Word line breaks,
        // so emit each line separately with an explicit break in between.
        String[] lines = text.split("\\R");
        for (int i = 0; i < lines.length; i++) {
            if (i > 0) {
                run.addBreak();
            }
            run.setText(lines[i]);
        }
    }
}