首页 > 作文

Java基于PDFbox实现读取处理PDF文件

更新时间:2023-04-04 23:19:06 阅读：评论：0

前言

嗨，大家好，2022年春节已经接近尾声，各地都陆陆续续开工了。近期有朋友做一个小项目，正好需要使用 Java 读取 PDF 文件信息，因此记录一下相关过程。

pdfbox介绍

PDFBox 是一个开源的、基于 Java 的、支持 PDF 文档生成的工具库，它可以用于创建新的 PDF 文档、修改现有的 PDF 文档，还可以从 PDF 文档中提取所需的内容。Apache PDFBox 还包含了数个命令行工具。

PDF 文件的数据是一系列基本对象的集合：数组、布尔型、字典、数字、字符串和二进制流。

开发环境

本次java基于pdfbox读取处理pdf文件的版本信息如下：

jdk1.8

Spring Boot 2.3.0.RELEASE

pdfbox 1.8.13

pdfbox依赖

在初次使用pdfbox的时候需要引入pdfbox依赖。本次使用的依赖包如下：

<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>1.8.13</version>
</dependency>

快速开始

本示例是将指定目录下的pdf文件中的信息读取出来，存储到新的指定路径的txt文本文件当中。

class pdftest {    public static void main(string[] args) throws exception {       string filepath ="c:\\urs\\admin\\desktop\\cxy1.pdf";           list<string> list = getfiles(bapath);        for (string filepath : list) {            long ltime = system.currenttimemillis();            string substring = filepath.substring(filepath.lastindexof("\\") + 1, filepath.lastindexof("."));            string project = "（juejin.cn）";            string textfrompdf = gettextfrompd自我评价50字f(filepath);            string s = writtertxt(textfrompdf, substring + "--", ltime, bapath);            stringbuffer stringbuffer = readertext(s, project);            writtertxt(stringbuffer.tostring(), substring + "-", ltime, bapath);        }        system.out.println("******************** end ************************")小鞋子电影;    }    public static list<string> getfiles(string path) {        list<string> files = new arraylist<string>();        file file = new file(path);        file[] templist = file.listfiles();        for (int i = c盘空间不足怎么办0; i < templist.length; i++) {            if (templist[i].isfile()) {                if (templist[i].tostring().contains(".pdf") || templist[i].tostring().contains(保密管理制度".pdf")) {                    files.add(templist[i].tostring());                }                //文件名，不包含路径                //string filename = templist[i].getname();            }            if (templist[i].isdirectory()) {                //这里就不递归了，            }        }        return files;    }    public static string gettextfrompdf(string filepath) throws exception {        string result = null;        fileinputstream is = null;        pddocument document = null;        try {            is = new fileinputstream(filepath);            pdfparr parr = new pdfparr(is);            parr.par();            document = parr.getpddocument();            pdftextstripper stripper = new pdftextstripper();            result = stripper.gettext(document);        } catch (filenotfoundexception e) {          
  e.printstacktrace();        } catch (ioexception e) {            e.printstacktrace();        } finally {            if (is != null) {                try {                    is.clo();                } catch (ioexception e) {                    e.printstacktrace();                }            }            if (document != null) {                try {                    document.clo();                } catch (ioexception e) {                    e.printstacktrace();                }            }        }        map<string, string> map = new hashmap<string, string>();        return result;    }    public static string writtertxt(string data, string text, long l, string bapath) {        string filename = null;        try {            if (text == null) {                filename = bapath + "javaio-" + l + ".txt";            } el {                filename = bapath + text + l + ".txt";            }            file file = new file(filename);            //if file doesnt exists, then create it            if (!file.exists()) {                file.createnewfile();            }            //true = append file            outputstream outputstream = new fileoutputstream(file);//            filewriter filewritter = new filewriter(file.getname(), true);//            filewritter.write(data);//            filewritter.clo();            outputstreamwriter outputstreamwriter = new outputstreamwriter(outputstream);            outputstreamwriter.write(data);            outputstreamwriter.clo();            outputstream.clo();            system.out.println("done");        } catch (ioexception e) {            e.printstacktrace();        }        return filename;    }    public static stringbuffer readertext(string name, string project) {        // 使用arraylist来存储每行读取到的字符串        stringbuffer stringbuffer = new stringbuffer();        try {            filereader fr = new filereader(name);            bufferedreader bf = new bufferedreader(fr);            string str;            // 按行读取字符串          
  while ((str = bf.readline()) != null) {                str = replaceall(str);                if (str.contains("d、") || str.contains("d.")) {                    stringbuffer.append(str);                    stringbuffer.append("\n");                    stringbuffer.append("参考： \n");                    stringbuffer.append("参考： \n");                    stringbuffer.append("\n\n\n\n");                } el if (str.contains("a、") || str.contains("a.")) {                    stringbuffer.deletecharat(stringbuffer.length() - 1);                    stringbuffer.append("。" + project + "\n");                    stringbuffer.append(str + "\n");                } el if (str.contains("b、") || str.contains("c、") || str.contains("b.") || str.contains("c.")) {                    stringbuffer.append(str + "\n");                } el {                    stringbuffer.append(str);                }            }            bf.clo();            fr.clo();        } catch (ioexception e) {            e.printstacktrace();        }        return stringbuffer;    }    public static string replaceall(string str) {        return str.replaceall("网", "");    }}