Java 使用 Apache PDFBox + POI 实现 PDF 转 Word
注:仅适用于简单的文本转换,PDF 中的表格、图片等复杂结构无法保留。
<!-- PDFBox -->
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.29</version> <!-- 建议使用最新的 2.0.x 版本 -->
</dependency>
<!-- Apache POI:XWPF(.docx)相关类位于 poi-ooxml 中,而不是 poi -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>5.0.0</version>
</dependency>
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
/**
 * Converts the text content of a PDF file into a Word (.docx) document, using
 * Apache PDFBox for text extraction and Apache POI (XWPF) for writing.
 *
 * <p>Only plain text is carried over; tables, images and other layout
 * information from the PDF are not reproduced in the output.
 */
public class PdfToWord {

    /** Input PDF used when no command-line argument is supplied. */
    private static final String DEFAULT_PDF_PATH = "D:\\test.pdf";

    /** Output .docx used when no second command-line argument is supplied. */
    private static final String DEFAULT_WORD_PATH = "D:\\test.docx";

    /**
     * Command-line entry point.
     *
     * @param args optional: {@code args[0]} is the PDF to read and {@code args[1]} the
     *             .docx file to write; missing arguments fall back to the defaults
     * @throws Exception if the conversion fails
     */
    public static void main(String[] args) throws Exception {
        // Backslashes must be escaped in Java string literals: "D:\test.pdf" would
        // contain a TAB character ("\t") instead of the intended path separator.
        String pdfFilePath = args.length > 0 ? args[0] : DEFAULT_PDF_PATH;
        String wordFilePath = args.length > 1 ? args[1] : DEFAULT_WORD_PATH;
        convertPdfToWord(pdfFilePath, wordFilePath);
    }

    /**
     * Extracts the text of every page of the given PDF and writes it to a new
     * Word document.
     *
     * @param pdfFilePath  path of the PDF file to read
     * @param wordFilePath path of the .docx file to create (overwritten if it exists)
     * @throws IOException if the PDF cannot be read or the Word file cannot be written
     */
    public static void convertPdfToWord(String pdfFilePath, String wordFilePath) throws IOException {
        // try-with-resources closes both documents even when extraction or writing
        // fails. Loading from a File (rather than a caller-opened InputStream) leaves
        // no separately opened stream behind for this method to close.
        try (PDDocument document = PDDocument.load(Paths.get(pdfFilePath).toFile());
             XWPFDocument wordDocument = new XWPFDocument()) {
            PDFTextStripper pdfTextStripper = new PDFTextStripper();
            int totalPages = document.getNumberOfPages();
            // PDFTextStripper page numbers are 1-based and the end page is inclusive.
            for (int pageNumber = 1; pageNumber <= totalPages; pageNumber++) {
                pdfTextStripper.setStartPage(pageNumber);
                pdfTextStripper.setEndPage(pageNumber);
                addParagraphToWord(wordDocument, pdfTextStripper.getText(document));
            }
            try (FileOutputStream out = new FileOutputStream(wordFilePath)) {
                wordDocument.write(out);
            }
        }
    }

    /**
     * Appends the given text to the Word document, one paragraph per line of text.
     *
     * @param document the Word document to append to
     * @param text     extracted text, possibly spanning several lines
     */
    private static void addParagraphToWord(XWPFDocument document, String text) {
        // A run's text is stored as-is, so embedded line terminators would not show up
        // as line breaks in Word; split the text and emit one paragraph per line.
        for (String line : text.split("\\r\\n|\\r|\\n")) {
            XWPFParagraph paragraph = document.createParagraph();
            XWPFRun run = paragraph.createRun();
            run.setText(line);
        }
    }
}