Java 使用 Apache PDFBox + POI 实现 PDF 转 Word
注:仅限简单转换——只提取 PDF 中的纯文本写入 Word,表格等复杂结构无法保留
<!-- PDFBox: the sample code calls PDDocument.load(...), which is the 2.x API -->
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.29</version> <!-- use the latest 2.0.x release -->
</dependency>
<!-- Apache POI core -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>5.0.0</version>
</dependency>
<!-- Apache POI OOXML: provides the org.apache.poi.xwpf.* classes (XWPFDocument, XWPFParagraph, XWPFRun)
     that the sample code imports; the core "poi" artifact alone does not contain them -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>5.0.0</version>
</dependency>
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.poi.xwpf.usermodel.XWPFParagraph; import org.apache.poi.xwpf.usermodel.XWPFRun; import java.io.FileOutputStream; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Paths; public class PdfToWord { public static void main(String[] args) throws Exception { convertPdfToWord("D:\test.pdf", "D:\test.docx"); //pdf文件路径和输出的word文件路径 } public static void convertPdfToWord(String pdfFilePath, String wordFilePath) throws IOException { PDDocument document = PDDocument.load(Files.newInputStream(Paths.get(pdfFilePath))); XWPFDocument wordDocument = new XWPFDocument(); PDFTextStripper pdfTextStripper = new PDFTextStripper(); int totalPages = document.getNumberOfPages(); for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) { pdfTextStripper.setStartPage(pageIndex + 1); pdfTextStripper.setEndPage(pageIndex + 1); String text = pdfTextStripper.getText(document); addParagraphToWord(wordDocument, text); } document.close(); try (FileOutputStream out = new FileOutputStream(wordFilePath)) { wordDocument.write(out); } wordDocument.close(); } private static void addParagraphToWord(XWPFDocument document, String text) { XWPFParagraph paragraph = document.createParagraph(); XWPFRun run = paragraph.createRun(); run.setText(text); } }