package org.jeecg.modules.utils;
|
|
import com.itextpdf.text.BaseColor;
|
import com.itextpdf.text.DocumentException;
|
import com.itextpdf.text.pdf.PdfContentByte;
|
import com.itextpdf.text.pdf.PdfReader;
|
import com.itextpdf.text.pdf.PdfStamper;
|
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
|
import org.jeecg.modules.ai.vo.ItemPosition;
|
import org.jeecg.modules.ai.vo.KeyWordPosition;
|
import org.jeecg.modules.ai.vo.KeyWordPositionVo;
|
import org.jeecg.modules.ai.vo.MyTextExtractionStrategy;
|
|
import java.io.FileOutputStream;
|
import java.io.IOException;
|
import java.util.ArrayList;
|
import java.util.List;
|
|
/**
 * Utilities for locating keywords in PDF files and outlining them with rectangle marks (iText based).
 *
 * @author clown
 * @date 2024/7/23
 */
|
public class PdfUtils {
|
/**
|
* 用于供外部类调用获取关键字所在PDF文件坐标
|
* @param filepath
|
* @param keyWords
|
* @return
|
*/
|
public static List<KeyWordPosition> getKeyWordsByPath(String filepath, String keyWords) {
|
List<KeyWordPosition> matchItems = null;
|
try{
|
PdfReader pdfReader = new PdfReader(filepath);
|
matchItems = getKeyWords(pdfReader, keyWords);
|
} catch (IOException e) {
|
e.printStackTrace();
|
}
|
return matchItems;
|
}
|
|
/**
|
* 获取关键字所在PDF坐标
|
* @param pdfReader
|
* @param keyWords
|
* @return
|
*/
|
private static List<KeyWordPosition> getKeyWords(PdfReader pdfReader, String keyWords) {
|
int page = 0;
|
|
List<KeyWordPosition> matchItems = new ArrayList<>();
|
try{
|
int pageNum = pdfReader.getNumberOfPages();
|
StringBuilder allText = null;
|
|
//遍历页
|
for (page = 1; page <= pageNum; page++) {
|
//只记录当页的所有内容,需要记录全部页放在循环外面
|
List<ItemPosition> allItems = new ArrayList<>();
|
//扫描内容
|
MyTextExtractionStrategy myTextExtractionStrategy = new MyTextExtractionStrategy(allItems, page);
|
PdfTextExtractor.getTextFromPage(pdfReader, page, myTextExtractionStrategy);
|
//当页的文字内容,用于关键词匹配
|
allText = new StringBuilder();
|
//一个字一个字的遍历
|
for (int i=0; i<allItems.size(); i++) {
|
ItemPosition item = allItems.get(i);
|
allText.append(item.getText());
|
//关键字存在连续多个块中
|
if(allText.indexOf(keyWords) != -1) {
|
KeyWordPosition keyWordPosition = new KeyWordPosition();
|
//记录关键词每个字的位置,只记录开始结束标记时会有问题
|
List<ItemPosition> listItem = new ArrayList<>();
|
for(int j=i-keyWords.length()+1; j<=i; j++) {
|
listItem.add(allItems.get(j));
|
}
|
keyWordPosition.setListItem(listItem);
|
keyWordPosition.setText(keyWords);
|
matchItems.add(keyWordPosition);
|
allText.setLength(0);
|
}
|
}
|
|
|
}
|
} catch (Exception e) {
|
e.printStackTrace();
|
}
|
return matchItems;
|
}
|
|
|
|
|
public static void main(String[] args) throws Exception {
|
String keyword = "一旦发生机床安全事故";
|
String sourcePdf = "F:\\123.pdf";
|
String watermarkPdf = "F:\\12_bak.pdf";
|
Long start = System.currentTimeMillis();
|
System.out.println("开始扫描....");
|
List<KeyWordPosition> matchItems = getKeyWordsByPath(sourcePdf, keyword);
|
System.out.println(matchItems);
|
System.out.println("扫描结束["+(System.currentTimeMillis()-start)+"ms],共找到关键字["+keyword+"]出现["+matchItems.size()+"]次");
|
start = System.currentTimeMillis();
|
System.out.println("开始添加标记....");
|
andRectangleMark(sourcePdf
|
, watermarkPdf
|
, matchItems
|
, BaseColor.RED
|
, 2
|
, 2);
|
//文件整理
|
System.out.println("标记添加完成["+(System.currentTimeMillis()-start)+"ms]");
|
|
}
|
|
|
public static List<KeyWordPositionVo> findListWord(String sourcePath, String keyWord){
|
System.out.println("开始扫描....");
|
try {
|
List<KeyWordPositionVo> matchItems = getKeyWordsByPathOne(sourcePath, keyWord);
|
return matchItems;
|
}catch (Exception e) {
|
return null;
|
}
|
}
|
|
/**
|
* 用于供外部类调用获取关键字所在PDF文件坐标
|
* @param filepath
|
* @param keyWords
|
* @return
|
*/
|
public static List<KeyWordPositionVo> getKeyWordsByPathOne(String filepath, String keyWords) {
|
List<KeyWordPositionVo> matchItems = null;
|
try{
|
PdfReader pdfReader = new PdfReader(filepath);
|
matchItems = getKeyWordPage(pdfReader, keyWords);
|
} catch (IOException e) {
|
e.printStackTrace();
|
}
|
return matchItems;
|
}
|
|
/**
|
* 获取关键字所在PDF坐标 一页只取第一条
|
* @param pdfReader
|
* @param keyWords
|
* @return
|
*/
|
private static List<KeyWordPositionVo> getKeyWordPage(PdfReader pdfReader, String keyWords) {
|
int page = 0;
|
List<KeyWordPositionVo> matchItems = new ArrayList<>();
|
try{
|
int pageNum = pdfReader.getNumberOfPages();
|
StringBuilder allText = null;
|
//遍历页
|
for (page = 1; page <= pageNum; page++) {
|
//只记录当页的所有内容,需要记录全部页放在循环外面
|
List<ItemPosition> allItems = new ArrayList<>();
|
//扫描内容
|
MyTextExtractionStrategy myTextExtractionStrategy = new MyTextExtractionStrategy(allItems, page);
|
PdfTextExtractor.getTextFromPage(pdfReader, page, myTextExtractionStrategy);
|
//当页的文字内容,用于关键词匹配
|
allText = new StringBuilder();
|
//一个字一个字的遍历
|
for (int i=0; i<allItems.size(); i++) {
|
ItemPosition item = allItems.get(i);
|
allText.append(item.getText());
|
//关键字存在连续多个块中
|
if(allText.indexOf(keyWords) != -1) {
|
KeyWordPosition keyWordPosition = new KeyWordPosition();
|
//记录关键词每个字的位置,只记录开始结束标记时会有问题
|
KeyWordPositionVo vo = new KeyWordPositionVo();
|
for(int j=i-keyWords.length()+1; j<=i; j++) {
|
if (allItems.get(j).getText().indexOf(keyWords)!= -1) {
|
vo.setPage(allItems.get(j).getPage());
|
vo.setRectangle(allItems.get(j).getRectangle());
|
vo.setText(allItems.get(j).getText());
|
vo.setRectangleString(allItems.get(j).getRectangle().toString());
|
matchItems.add(vo);
|
break;
|
}
|
}
|
break;
|
}
|
}
|
}
|
} catch (Exception e) {
|
e.printStackTrace();
|
}
|
return matchItems;
|
}
|
|
/**
|
* 添加矩形标记
|
* @param oldPath
|
* @param newPath
|
* @param matchItems 关键词
|
* @param color 标记颜色
|
* @param lineWidth 线条粗细
|
* @param padding 边框内边距
|
* @throws DocumentException
|
* @throws IOException
|
*/
|
public static void andRectangleMark(String oldPath, String newPath, List<KeyWordPosition> matchItems, BaseColor color, int lineWidth, int padding) throws DocumentException, IOException{
|
// 待加水印的文件
|
PdfReader reader = new PdfReader(oldPath);
|
// 加完水印的文件
|
PdfStamper stamper = new PdfStamper(reader, new FileOutputStream(newPath));
|
|
PdfContentByte content;
|
|
// 设置字体
|
// 循环对每页插入水印
|
for (KeyWordPosition keyWordPosition:matchItems)
|
{
|
//一个关键词的所有字坐标
|
List<ItemPosition> oneKeywordItems = keyWordPosition.getListItem();
|
for(int i=0; i<oneKeywordItems.size(); i++) {
|
ItemPosition item = oneKeywordItems.get(i);
|
ItemPosition preItem = i==0?null:oneKeywordItems.get(i-1);
|
//带确实是否水印
|
|
// 水印的起始
|
content = stamper.getOverContent(item.getPage());
|
// 开始写入水印
|
content.setLineWidth(lineWidth);
|
content.setColorStroke(color);
|
System.out.println(item.toString());
|
|
//底线
|
content.moveTo(item.getRectangle().getLeft()-padding, item.getRectangle().getBottom()-padding);
|
content.lineTo(item.getRectangle().getRight()+padding, item.getRectangle().getBottom()-padding);
|
if(i!=0 && preItem!=null && (preItem.getRectangle().getBottom()-padding)==(item.getRectangle().getBottom()-padding) && (preItem.getRectangle().getRight()+padding)!=(item.getRectangle().getLeft()-padding)) {
|
content.moveTo(preItem.getRectangle().getRight()+padding, preItem.getRectangle().getBottom()-padding);
|
content.lineTo(item.getRectangle().getLeft()-padding, item.getRectangle().getBottom()-padding);
|
}
|
//上线
|
content.moveTo(item.getRectangle().getLeft()-padding, item.getRectangle().getTop()+padding);
|
content.lineTo(item.getRectangle().getRight()+padding, item.getRectangle().getTop()+padding);
|
if(i!=0 && preItem!=null && (preItem.getRectangle().getTop()+padding)==(item.getRectangle().getTop()+padding) && (preItem.getRectangle().getRight()+padding)!=(item.getRectangle().getLeft()-padding)) {
|
content.moveTo(preItem.getRectangle().getRight()+padding, preItem.getRectangle().getTop()+padding);
|
content.lineTo(item.getRectangle().getLeft()-padding, item.getRectangle().getTop()+padding);
|
}
|
|
//左线
|
if(i==0) {
|
content.moveTo(item.getRectangle().getLeft()-padding, item.getRectangle().getBottom()-padding);
|
content.lineTo(item.getRectangle().getLeft()-padding, item.getRectangle().getTop()+padding);
|
}
|
//右线
|
if(i==(oneKeywordItems.size()-1)) {
|
content.moveTo(item.getRectangle().getRight()+padding, item.getRectangle().getBottom()-padding);
|
content.lineTo(item.getRectangle().getRight()+padding, item.getRectangle().getTop()+padding);
|
}
|
|
content.stroke();
|
}
|
}
|
stamper.close();
|
}
|
}
|