package org.jeecg.modules.utils; import com.itextpdf.text.BaseColor; import com.itextpdf.text.DocumentException; import com.itextpdf.text.pdf.PdfContentByte; import com.itextpdf.text.pdf.PdfReader; import com.itextpdf.text.pdf.PdfStamper; import com.itextpdf.text.pdf.parser.PdfTextExtractor; import org.jeecg.modules.ai.vo.ItemPosition; import org.jeecg.modules.ai.vo.KeyWordPosition; import org.jeecg.modules.ai.vo.KeyWordPositionVo; import org.jeecg.modules.ai.vo.MyTextExtractionStrategy; import java.io.FileOutputStream; import java.io.IOException; import java.util.ArrayList; import java.util.List; /** * Static helpers that scan the text of a PDF for a keyword and collect the matches as KeyWordPosition entries. * * NOTE(review): this copy of the file is damaged: it is collapsed onto a few lines, every span that stood between angle brackets (generic type arguments and the tails of two loop headers together with the code that followed them) is missing, and the visible text stops inside the last method. Restore the original text before compiling or changing any logic. * * @author clown * * @date 2024/7/23 */ public class PdfUtils { /** * Entry point for external callers: opens the PDF at the given path and returns whatever getKeyWords finds in it. * NOTE(review): no close() call on the PdfReader is visible here. * * @param filepath path of the PDF, passed to the PdfReader constructor * @param keyWords keyword text to look for * @return the list built by getKeyWords, or null when an IOException is caught (its stack trace is printed) */ public static List getKeyWordsByPath(String filepath, String keyWords) { List matchItems = null; try{ PdfReader pdfReader = new PdfReader(filepath); matchItems = getKeyWords(pdfReader, keyWords); } catch (IOException e) { e.printStackTrace(); } return matchItems; } /** * Collects the positions of the keyword in the PDF, visiting the pages from 1 to getNumberOfPages(). * In the visible part, each match stores the items at indexes i-keyWords.length()+1 through i on a KeyWordPosition together with the keyword text, adds it to the result and clears the per-page text buffer. * Any exception is caught and its stack trace printed; the list gathered so far is still returned. * * @param pdfReader reader for the PDF to scan * @param keyWords keyword text to look for * @return the list of matches (created empty before the scan starts) */ private static List getKeyWords(PdfReader pdfReader, String keyWords) { int page = 0; List matchItems = new ArrayList<>(); try{ int pageNum = pdfReader.getNumberOfPages(); StringBuilder allText = null; // iterate over the pages for (page = 1; page <= pageNum; page++) { // holds only the current page's items; move it outside the loop to keep the items of all pages List allItems = new ArrayList<>(); // scan the page content MyTextExtractionStrategy myTextExtractionStrategy = new MyTextExtractionStrategy(allItems, page); PdfTextExtractor.getTextFromPage(pdfReader, page, myTextExtractionStrategy); // text of the current page, used for keyword matching allText = new StringBuilder(); // walk through the items one character at a time for (int i=0; i /* NOTE(review): source text lost here - the rest of this loop header and the statements up to and including the keyWordPosition declaration are not visible */ listItem = new ArrayList<>(); for(int j=i-keyWords.length()+1; j<=i; j++) { listItem.add(allItems.get(j)); } keyWordPosition.setListItem(listItem); keyWordPosition.setText(keyWords); matchItems.add(keyWordPosition); allText.setLength(0); } } } } catch (Exception e) { e.printStackTrace(); } return matchItems; } /** * Ad-hoc driver: searches a hard-coded PDF for a hard-coded keyword, prints the matches and the elapsed time, then calls andRectangleMark with a second hard-coded path, BaseColor.RED, 2 and 2. * NOTE(review): matchItems.size() is called without a null check although getKeyWordsByPath can return null. * * @param args not used * @throws Exception anything thrown by the calls below */ public static void 
main(String[] args) throws Exception { String keyword = "一旦发生机床安全事故"; String sourcePdf = "F:\\123.pdf"; String watermarkPdf = "F:\\12_bak.pdf"; Long start = System.currentTimeMillis(); System.out.println("开始扫描...."); List matchItems = getKeyWordsByPath(sourcePdf, keyword); System.out.println(matchItems); System.out.println("扫描结束["+(System.currentTimeMillis()-start)+"ms],共找到关键字["+keyword+"]出现["+matchItems.size()+"]次"); start = System.currentTimeMillis(); System.out.println("开始添加标记...."); andRectangleMark(sourcePdf , watermarkPdf , matchItems , BaseColor.RED , 2 , 2); // file cleanup System.out.println("标记添加完成["+(System.currentTimeMillis()-start)+"ms]"); } /** * Prints a start message and returns the result of getKeyWordsByPathOne for the given file and keyword. * * @param sourcePath path of the PDF to scan * @param keyWord keyword text to look for * @return the list from getKeyWordsByPathOne, or null when that call throws (the exception is discarded) */ public static List findListWord(String sourcePath, String keyWord){ System.out.println("开始扫描...."); try { List matchItems = getKeyWordsByPathOne(sourcePath, keyWord); return matchItems; }catch (Exception e) { return null; } } /** * Entry point for external callers: opens the PDF at the given path and returns whatever getKeyWordPage finds in it. * NOTE(review): no close() call on the PdfReader is visible here. * * @param filepath path of the PDF, passed to the PdfReader constructor * @param keyWords keyword text to look for * @return the list built by getKeyWordPage, or null when an IOException is caught (its stack trace is printed) */ public static List getKeyWordsByPathOne(String filepath, String keyWords) { List matchItems = null; try{ PdfReader pdfReader = new PdfReader(filepath); matchItems = getKeyWordPage(pdfReader, keyWords); } catch (IOException e) { e.printStackTrace(); } return matchItems; } /** * Collects the positions of the keyword in the PDF; according to the original comment only the first match on each page is taken (the code that would confirm this is missing from this copy). * * @param pdfReader reader for the PDF to scan * @param keyWords keyword text to look for * @return the list of matches (created empty before the scan starts) */ private static List getKeyWordPage(PdfReader pdfReader, String keyWords) { int page = 0; List matchItems = new ArrayList<>(); try{ int pageNum = pdfReader.getNumberOfPages(); StringBuilder allText = null; // iterate over the pages for (page = 1; page <= pageNum; page++) { // holds only the current page's items; move it outside the loop to keep the items of all pages List allItems = new ArrayList<>(); // scan the page content MyTextExtractionStrategy myTextExtractionStrategy = new MyTextExtractionStrategy(allItems, page); PdfTextExtractor.getTextFromPage(pdfReader, page, myTextExtractionStrategy); // text of the current page, used for keyword matching allText = new StringBuilder(); // walk through the items one character at a time for (int i=0; i /* NOTE(review): source text lost here - the remainder of getKeyWordPage and the beginning of the next method declaration are not visible; presumably that method is the andRectangleMark called from main, taking oldPath and newPath before the parameters below - confirm against the original */ matchItems, BaseColor color, int lineWidth, int padding) throws DocumentException, IOException{ // the file to be watermarked PdfReader 
reader = new PdfReader(oldPath); // the file after watermarking PdfStamper stamper = new PdfStamper(reader, new FileOutputStream(newPath)); PdfContentByte content; // set the font // loop, inserting the watermark on each page for (KeyWordPosition keyWordPosition:matchItems) { // coordinates of every character of one keyword List oneKeywordItems = keyWordPosition.getListItem(); for(int i=0; i