// ai/ai_service.git

package org.jeecg.modules.utils;
 
import com.itextpdf.text.BaseColor;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.PdfContentByte;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfStamper;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
import org.jeecg.modules.ai.vo.ItemPosition;
import org.jeecg.modules.ai.vo.KeyWordPosition;
import org.jeecg.modules.ai.vo.KeyWordPositionVo;
import org.jeecg.modules.ai.vo.MyTextExtractionStrategy;
 
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
 
/**
 * @author clown
 * * @date 2024/7/23
 */
public class PdfUtils {
    /**
     * 用于供外部类调用获取关键字所在PDF文件坐标
     * @param filepath
     * @param keyWords
     * @return
     */
    public static List<KeyWordPosition> getKeyWordsByPath(String filepath, String keyWords) {
        List<KeyWordPosition> matchItems = null;
        try{
            PdfReader pdfReader = new PdfReader(filepath);
            matchItems = getKeyWords(pdfReader, keyWords);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return matchItems;
    }
 
    /**
     * 获取关键字所在PDF坐标
     * @param pdfReader
     * @param keyWords
     * @return
     */
    private static List<KeyWordPosition> getKeyWords(PdfReader pdfReader, String keyWords) {
        int page = 0;
 
        List<KeyWordPosition> matchItems = new ArrayList<>();
        try{
            int pageNum = pdfReader.getNumberOfPages();
            StringBuilder allText = null;
 
            //遍历页
            for (page = 1; page <= pageNum; page++) {
                //只记录当页的所有内容，需要记录全部页放在循环外面
                List<ItemPosition> allItems = new ArrayList<>();
                //扫描内容
                MyTextExtractionStrategy myTextExtractionStrategy = new MyTextExtractionStrategy(allItems, page);
                PdfTextExtractor.getTextFromPage(pdfReader, page, myTextExtractionStrategy);
                //当页的文字内容，用于关键词匹配
                allText = new StringBuilder();
                //一个字一个字的遍历
                for (int i=0; i<allItems.size(); i++) {
                    ItemPosition item = allItems.get(i);
                    allText.append(item.getText());
                    //关键字存在连续多个块中
                    if(allText.indexOf(keyWords) != -1) {
                        KeyWordPosition keyWordPosition = new KeyWordPosition();
                        //记录关键词每个字的位置，只记录开始结束标记时会有问题
                        List<ItemPosition> listItem = new ArrayList<>();
                        for(int j=i-keyWords.length()+1; j<=i; j++) {
                            listItem.add(allItems.get(j));
                        }
                        keyWordPosition.setListItem(listItem);
                        keyWordPosition.setText(keyWords);
                        matchItems.add(keyWordPosition);
                        allText.setLength(0);
                    }
                }
 
 
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return matchItems;
    }
 
 
 
 
    public static void main(String[] args) throws Exception {
        String keyword = "一旦发生机床安全事故";
        String sourcePdf = "F:\\123.pdf";
        String watermarkPdf = "F:\\12_bak.pdf";
        Long start = System.currentTimeMillis();
        System.out.println("开始扫描....");
        List<KeyWordPosition> matchItems = getKeyWordsByPath(sourcePdf, keyword);
        System.out.println(matchItems);
        System.out.println("扫描结束["+(System.currentTimeMillis()-start)+"ms]，共找到关键字["+keyword+"]出现["+matchItems.size()+"]次");
        start = System.currentTimeMillis();
        System.out.println("开始添加标记....");
        andRectangleMark(sourcePdf
                , watermarkPdf
                , matchItems
                , BaseColor.RED
                , 2
                , 2);
        //文件整理
        System.out.println("标记添加完成["+(System.currentTimeMillis()-start)+"ms]");
 
    }
 
 
    public static List<KeyWordPositionVo> findListWord(String sourcePath, String keyWord){
        System.out.println("开始扫描....");
        try {
            List<KeyWordPositionVo> matchItems = getKeyWordsByPathOne(sourcePath, keyWord);
            return matchItems;
        }catch (Exception e) {
            return null;
        }
    }
 
    /**
     * 用于供外部类调用获取关键字所在PDF文件坐标
     * @param filepath
     * @param keyWords
     * @return
     */
    public static List<KeyWordPositionVo> getKeyWordsByPathOne(String filepath, String keyWords) {
        List<KeyWordPositionVo> matchItems = null;
        try{
            PdfReader pdfReader = new PdfReader(filepath);
            matchItems = getKeyWordPage(pdfReader, keyWords);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return matchItems;
    }
 
    /**
     * 获取关键字所在PDF坐标 一页只取第一条
     * @param pdfReader
     * @param keyWords
     * @return
     */
    private static List<KeyWordPositionVo> getKeyWordPage(PdfReader pdfReader, String keyWords) {
        int page = 0;
        List<KeyWordPositionVo> matchItems = new ArrayList<>();
        try{
            int pageNum = pdfReader.getNumberOfPages();
            StringBuilder allText = null;
            //遍历页
            for (page = 1; page <= pageNum; page++) {
                //只记录当页的所有内容，需要记录全部页放在循环外面
                List<ItemPosition> allItems = new ArrayList<>();
                //扫描内容
                MyTextExtractionStrategy myTextExtractionStrategy = new MyTextExtractionStrategy(allItems, page);
                PdfTextExtractor.getTextFromPage(pdfReader, page, myTextExtractionStrategy);
                //当页的文字内容，用于关键词匹配
                allText = new StringBuilder();
                //一个字一个字的遍历
                for (int i=0; i<allItems.size(); i++) {
                    ItemPosition item = allItems.get(i);
                    allText.append(item.getText());
                    //关键字存在连续多个块中
                    if(allText.indexOf(keyWords) != -1) {
                        KeyWordPosition keyWordPosition = new KeyWordPosition();
                        //记录关键词每个字的位置，只记录开始结束标记时会有问题
                        KeyWordPositionVo vo = new KeyWordPositionVo();
                        for(int j=i-keyWords.length()+1; j<=i; j++) {
                            if (allItems.get(j).getText().indexOf(keyWords)!= -1) {
                                vo.setPage(allItems.get(j).getPage());
                                vo.setRectangle(allItems.get(j).getRectangle());
                                vo.setText(allItems.get(j).getText());
                                vo.setRectangleString(allItems.get(j).getRectangle().toString());
                                matchItems.add(vo);
                                break;
                            }
                        }
                        break;
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return matchItems;
    }
 
    /**
     * 添加矩形标记
     * @param oldPath
     * @param newPath
     * @param matchItems 关键词
     * @param color 标记颜色
     * @param lineWidth 线条粗细
     * @param padding 边框内边距
     * @throws DocumentException
     * @throws IOException
     */
    public static void andRectangleMark(String oldPath, String newPath, List<KeyWordPosition> matchItems, BaseColor color, int lineWidth, int padding) throws DocumentException, IOException{
        // 待加水印的文件
        PdfReader reader = new PdfReader(oldPath);
        // 加完水印的文件
        PdfStamper stamper = new PdfStamper(reader, new FileOutputStream(newPath));
 
        PdfContentByte content;
 
        // 设置字体
        // 循环对每页插入水印
        for (KeyWordPosition keyWordPosition:matchItems)
        {
            //一个关键词的所有字坐标
            List<ItemPosition> oneKeywordItems = keyWordPosition.getListItem();
            for(int i=0; i<oneKeywordItems.size(); i++) {
                ItemPosition item = oneKeywordItems.get(i);
                ItemPosition preItem = i==0?null:oneKeywordItems.get(i-1);
                //带确实是否水印
 
                // 水印的起始
                content = stamper.getOverContent(item.getPage());
                // 开始写入水印
                content.setLineWidth(lineWidth);
                content.setColorStroke(color);
                System.out.println(item.toString());
 
                //底线
                content.moveTo(item.getRectangle().getLeft()-padding, item.getRectangle().getBottom()-padding);
                content.lineTo(item.getRectangle().getRight()+padding, item.getRectangle().getBottom()-padding);
                if(i!=0 && preItem!=null && (preItem.getRectangle().getBottom()-padding)==(item.getRectangle().getBottom()-padding) && (preItem.getRectangle().getRight()+padding)!=(item.getRectangle().getLeft()-padding)) {
                    content.moveTo(preItem.getRectangle().getRight()+padding, preItem.getRectangle().getBottom()-padding);
                    content.lineTo(item.getRectangle().getLeft()-padding, item.getRectangle().getBottom()-padding);
                }
                //上线
                content.moveTo(item.getRectangle().getLeft()-padding, item.getRectangle().getTop()+padding);
                content.lineTo(item.getRectangle().getRight()+padding, item.getRectangle().getTop()+padding);
                if(i!=0 && preItem!=null && (preItem.getRectangle().getTop()+padding)==(item.getRectangle().getTop()+padding) && (preItem.getRectangle().getRight()+padding)!=(item.getRectangle().getLeft()-padding)) {
                    content.moveTo(preItem.getRectangle().getRight()+padding, preItem.getRectangle().getTop()+padding);
                    content.lineTo(item.getRectangle().getLeft()-padding, item.getRectangle().getTop()+padding);
                }
 
                //左线
                if(i==0) {
                    content.moveTo(item.getRectangle().getLeft()-padding, item.getRectangle().getBottom()-padding);
                    content.lineTo(item.getRectangle().getLeft()-padding, item.getRectangle().getTop()+padding);
                }
                //右线
                if(i==(oneKeywordItems.size()-1)) {
                    content.moveTo(item.getRectangle().getRight()+padding, item.getRectangle().getBottom()-padding);
                    content.lineTo(item.getRectangle().getRight()+padding, item.getRectangle().getTop()+padding);
                }
 
                content.stroke();
            }
        }
        stamper.close();
    }
}