From 2166c1a14f9629aa49a5f3bb849ce878df4c4892 Mon Sep 17 00:00:00 2001 From: yangbin <yangbin@qq.com> Date: 星期三, 28 八月 2024 17:23:25 +0800 Subject: [PATCH] 2 --- lxzn-module-ai/src/main/java/org/jeecg/modules/utils/PdfUtils.java | 263 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 files changed, 263 insertions(+), 0 deletions(-) diff --git a/lxzn-module-ai/src/main/java/org/jeecg/modules/utils/PdfUtils.java b/lxzn-module-ai/src/main/java/org/jeecg/modules/utils/PdfUtils.java new file mode 100644 index 0000000..c0218e9 --- /dev/null +++ b/lxzn-module-ai/src/main/java/org/jeecg/modules/utils/PdfUtils.java @@ -0,0 +1,263 @@ +package org.jeecg.modules.utils; + +import com.itextpdf.text.BaseColor; +import com.itextpdf.text.DocumentException; +import com.itextpdf.text.pdf.PdfContentByte; +import com.itextpdf.text.pdf.PdfReader; +import com.itextpdf.text.pdf.PdfStamper; +import com.itextpdf.text.pdf.parser.PdfTextExtractor; +import org.jeecg.modules.ai.vo.ItemPosition; +import org.jeecg.modules.ai.vo.KeyWordPosition; +import org.jeecg.modules.ai.vo.KeyWordPositionVo; +import org.jeecg.modules.ai.vo.MyTextExtractionStrategy; + +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +/** + * @author clown + * * @date 2024/7/23 + */ +public class PdfUtils { + /** + * 鐢ㄤ簬渚涘閮ㄧ被璋冪敤鑾峰彇鍏抽敭瀛楁墍鍦≒DF鏂囦欢鍧愭爣 + * @param filepath + * @param keyWords + * @return + */ + public static List<KeyWordPosition> getKeyWordsByPath(String filepath, String keyWords) { + List<KeyWordPosition> matchItems = null; + try{ + PdfReader pdfReader = new PdfReader(filepath); + matchItems = getKeyWords(pdfReader, keyWords); + } catch (IOException e) { + e.printStackTrace(); + } + return matchItems; + } + + /** + * 鑾峰彇鍏抽敭瀛楁墍鍦≒DF鍧愭爣 + * @param pdfReader + * @param keyWords + * @return + */ + private static List<KeyWordPosition> getKeyWords(PdfReader pdfReader, String keyWords) { + int page = 0; + + List<KeyWordPosition> matchItems = new ArrayList<>(); + try{ + int pageNum = pdfReader.getNumberOfPages(); + StringBuilder allText = null; + + //閬嶅巻椤� + for (page = 1; page <= pageNum; page++) { + //鍙褰曞綋椤电殑鎵�鏈夊唴瀹癸紝闇�瑕佽褰曞叏閮ㄩ〉鏀惧湪寰幆澶栭潰 + List<ItemPosition> allItems = new ArrayList<>(); + //鎵弿鍐呭 + MyTextExtractionStrategy myTextExtractionStrategy = new MyTextExtractionStrategy(allItems, page); + PdfTextExtractor.getTextFromPage(pdfReader, page, myTextExtractionStrategy); + //褰撻〉鐨勬枃瀛楀唴瀹癸紝鐢ㄤ簬鍏抽敭璇嶅尮閰� + allText = new StringBuilder(); + //涓�涓瓧涓�涓瓧鐨勯亶鍘� + for (int i=0; i<allItems.size(); i++) { + ItemPosition item = allItems.get(i); + allText.append(item.getText()); + //鍏抽敭瀛楀瓨鍦ㄨ繛缁涓潡涓� + if(allText.indexOf(keyWords) != -1) { + KeyWordPosition keyWordPosition = new KeyWordPosition(); + //璁板綍鍏抽敭璇嶆瘡涓瓧鐨勪綅缃紝鍙褰曞紑濮嬬粨鏉熸爣璁版椂浼氭湁闂 + List<ItemPosition> listItem = new ArrayList<>(); + for(int j=i-keyWords.length()+1; j<=i; j++) { + listItem.add(allItems.get(j)); + } + keyWordPosition.setListItem(listItem); + keyWordPosition.setText(keyWords); + matchItems.add(keyWordPosition); + allText.setLength(0); + } + } + + + } + } catch (Exception e) { + e.printStackTrace(); + } + return matchItems; + } + + + + + public static void main(String[] args) throws Exception { + String keyword = "涓�鏃﹀彂鐢熸満搴婂畨鍏ㄤ簨鏁�"; + String sourcePdf = "F:\\123.pdf"; + String watermarkPdf = "F:\\12_bak.pdf"; + Long start = System.currentTimeMillis(); + System.out.println("寮�濮嬫壂鎻�...."); + List<KeyWordPosition> matchItems = getKeyWordsByPath(sourcePdf, keyword); + System.out.println(matchItems); + System.out.println("鎵弿缁撴潫["+(System.currentTimeMillis()-start)+"ms]锛屽叡鎵惧埌鍏抽敭瀛梉"+keyword+"]鍑虹幇["+matchItems.size()+"]娆�"); + start = System.currentTimeMillis(); + System.out.println("寮�濮嬫坊鍔犳爣璁�...."); + andRectangleMark(sourcePdf + , watermarkPdf + , matchItems + , BaseColor.RED + , 2 + , 2); + //鏂囦欢鏁寸悊 + System.out.println("鏍囪娣诲姞瀹屾垚["+(System.currentTimeMillis()-start)+"ms]"); + + } + + + public static List<KeyWordPositionVo> findListWord(String sourcePath, String keyWord){ + System.out.println("寮�濮嬫壂鎻�...."); + try { + List<KeyWordPositionVo> matchItems = getKeyWordsByPathOne(sourcePath, keyWord); + return matchItems; + }catch (Exception e) { + return null; + } + } + + /** + * 鐢ㄤ簬渚涘閮ㄧ被璋冪敤鑾峰彇鍏抽敭瀛楁墍鍦≒DF鏂囦欢鍧愭爣 + * @param filepath + * @param keyWords + * @return + */ + public static List<KeyWordPositionVo> getKeyWordsByPathOne(String filepath, String keyWords) { + List<KeyWordPositionVo> matchItems = null; + try{ + PdfReader pdfReader = new PdfReader(filepath); + matchItems = getKeyWordPage(pdfReader, keyWords); + } catch (IOException e) { + e.printStackTrace(); + } + return matchItems; + } + + /** + * 鑾峰彇鍏抽敭瀛楁墍鍦≒DF鍧愭爣 涓�椤靛彧鍙栫涓�鏉� + * @param pdfReader + * @param keyWords + * @return + */ + private static List<KeyWordPositionVo> getKeyWordPage(PdfReader pdfReader, String keyWords) { + int page = 0; + List<KeyWordPositionVo> matchItems = new ArrayList<>(); + try{ + int pageNum = pdfReader.getNumberOfPages(); + StringBuilder allText = null; + //閬嶅巻椤� + for (page = 1; page <= pageNum; page++) { + //鍙褰曞綋椤电殑鎵�鏈夊唴瀹癸紝闇�瑕佽褰曞叏閮ㄩ〉鏀惧湪寰幆澶栭潰 + List<ItemPosition> allItems = new ArrayList<>(); + //鎵弿鍐呭 + MyTextExtractionStrategy myTextExtractionStrategy = new MyTextExtractionStrategy(allItems, page); + PdfTextExtractor.getTextFromPage(pdfReader, page, myTextExtractionStrategy); + //褰撻〉鐨勬枃瀛楀唴瀹癸紝鐢ㄤ簬鍏抽敭璇嶅尮閰� + allText = new StringBuilder(); + //涓�涓瓧涓�涓瓧鐨勯亶鍘� + for (int i=0; i<allItems.size(); i++) { + ItemPosition item = allItems.get(i); + allText.append(item.getText()); + //鍏抽敭瀛楀瓨鍦ㄨ繛缁涓潡涓� + if(allText.indexOf(keyWords) != -1) { + KeyWordPosition keyWordPosition = new KeyWordPosition(); + //璁板綍鍏抽敭璇嶆瘡涓瓧鐨勪綅缃紝鍙褰曞紑濮嬬粨鏉熸爣璁版椂浼氭湁闂 + KeyWordPositionVo vo = new KeyWordPositionVo(); + for(int j=i-keyWords.length()+1; j<=i; j++) { + if (allItems.get(j).getText().indexOf(keyWords)!= -1) { + vo.setPage(allItems.get(j).getPage()); + vo.setRectangle(allItems.get(j).getRectangle()); + vo.setText(allItems.get(j).getText()); + vo.setRectangleString(allItems.get(j).getRectangle().toString()); + matchItems.add(vo); + break; + } + } + break; + } + } + } + } catch (Exception e) { + e.printStackTrace(); + } + return matchItems; + } + + /** + * 娣诲姞鐭╁舰鏍囪 + * @param oldPath + * @param newPath + * @param matchItems 鍏抽敭璇� + * @param color 鏍囪棰滆壊 + * @param lineWidth 绾挎潯绮楃粏 + * @param padding 杈规鍐呰竟璺� + * @throws DocumentException + * @throws IOException + */ + public static void andRectangleMark(String oldPath, String newPath, List<KeyWordPosition> matchItems, BaseColor color, int lineWidth, int padding) throws DocumentException, IOException{ + // 寰呭姞姘村嵃鐨勬枃浠� + PdfReader reader = new PdfReader(oldPath); + // 鍔犲畬姘村嵃鐨勬枃浠� + PdfStamper stamper = new PdfStamper(reader, new FileOutputStream(newPath)); + + PdfContentByte content; + + // 璁剧疆瀛椾綋 + // 寰幆瀵规瘡椤垫彃鍏ユ按鍗� + for (KeyWordPosition keyWordPosition:matchItems) + { + //涓�涓叧閿瘝鐨勬墍鏈夊瓧鍧愭爣 + List<ItemPosition> oneKeywordItems = keyWordPosition.getListItem(); + for(int i=0; i<oneKeywordItems.size(); i++) { + ItemPosition item = oneKeywordItems.get(i); + ItemPosition preItem = i==0?null:oneKeywordItems.get(i-1); + //甯︾‘瀹炴槸鍚︽按鍗� + + // 姘村嵃鐨勮捣濮� + content = stamper.getOverContent(item.getPage()); + // 寮�濮嬪啓鍏ユ按鍗� + content.setLineWidth(lineWidth); + content.setColorStroke(color); + System.out.println(item.toString()); + + //搴曠嚎 + content.moveTo(item.getRectangle().getLeft()-padding, item.getRectangle().getBottom()-padding); + content.lineTo(item.getRectangle().getRight()+padding, item.getRectangle().getBottom()-padding); + if(i!=0 && preItem!=null && (preItem.getRectangle().getBottom()-padding)==(item.getRectangle().getBottom()-padding) && (preItem.getRectangle().getRight()+padding)!=(item.getRectangle().getLeft()-padding)) { + content.moveTo(preItem.getRectangle().getRight()+padding, preItem.getRectangle().getBottom()-padding); + content.lineTo(item.getRectangle().getLeft()-padding, item.getRectangle().getBottom()-padding); + } + //涓婄嚎 + content.moveTo(item.getRectangle().getLeft()-padding, item.getRectangle().getTop()+padding); + content.lineTo(item.getRectangle().getRight()+padding, item.getRectangle().getTop()+padding); + if(i!=0 && preItem!=null && (preItem.getRectangle().getTop()+padding)==(item.getRectangle().getTop()+padding) && (preItem.getRectangle().getRight()+padding)!=(item.getRectangle().getLeft()-padding)) { + content.moveTo(preItem.getRectangle().getRight()+padding, preItem.getRectangle().getTop()+padding); + content.lineTo(item.getRectangle().getLeft()-padding, item.getRectangle().getTop()+padding); + } + + //宸︾嚎 + if(i==0) { + content.moveTo(item.getRectangle().getLeft()-padding, item.getRectangle().getBottom()-padding); + content.lineTo(item.getRectangle().getLeft()-padding, item.getRectangle().getTop()+padding); + } + //鍙崇嚎 + if(i==(oneKeywordItems.size()-1)) { + content.moveTo(item.getRectangle().getRight()+padding, item.getRectangle().getBottom()-padding); + content.lineTo(item.getRectangle().getRight()+padding, item.getRectangle().getTop()+padding); + } + + content.stroke(); + } + } + stamper.close(); + } +} + -- Gitblit v1.9.3