Java实现Word/Pdf/TXT转HTML的方法

一:java实现将word转换为html
1:引入依赖
<dependency>
  <groupId>fr.opensagres.xdocreport</groupId>
  <artifactId>fr.opensagres.xdocreport.document</artifactId>
  <version>1.0.5</version>
</dependency>
<dependency>
  <groupId>fr.opensagres.xdocreport</groupId>
  <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
  <version>1.0.5</version>
</dependency>
<dependency>
  <groupId>org.apache.poi</groupId>
  <artifactId>poi</artifactId>
  <version>3.12</version>
</dependency>
<dependency>
  <groupId>org.apache.poi</groupId>
  <artifactId>poi-scratchpad</artifactId>
  <version>3.12</version>
</dependency>
2:代码demo
  1 package com.svse.controller;  2   3 import javax.xml.parsers.documentbuilderfactory;  4 import javax.xml.parsers.parserconfigurationexception;  5 import javax.xml.transform.outputkeys;  6 import javax.xml.transform.transformer;  7 import javax.xml.transform.transformerexception;  8 import javax.xml.transform.transformerfactory;  9 import javax.xml.transform.dom.domsource; 10 import javax.xml.transform.stream.streamresult; 11  12 import org.apache.poi.hwpf.hwpfdocument; 13 import org.apache.poi.hwpf.converter.picturesmanager; 14 import org.apache.poi.hwpf.converter.wordtohtmlconverter; 15 import org.apache.poi.hwpf.usermodel.picturetype; 16 import org.apache.poi.xwpf.converter.core.basicuriresolver; 17 import org.apache.poi.xwpf.converter.core.fileimageextractor; 18 import org.apache.poi.xwpf.converter.core.fileuriresolver; 19 import org.apache.poi.xwpf.converter.core.iuriresolver; 20 import org.apache.poi.xwpf.converter.core.ixwpfconverter; 21 import org.apache.poi.xwpf.converter.xhtml.xhtmlconverter; 22 import org.apache.poi.xwpf.converter.xhtml.xhtmloptions; 23 import org.apache.poi.xwpf.usermodel.xwpfdocument; 24 /** 25  * word 转换成html 26  */ 27 public class testwordtohtml { 28  29     public static  final string storagepath=c://works//files//; 30     public static  final string ip=192.168.30.222; 31     public static  final string port=8010; 32     public static void main(string[] args) throws ioexception, transformerexception, parserconfigurationexception { 33         testwordtohtml wt=new testwordtohtml(); 34         //wt.word2003tohtml(甲骨文考证.doc); 35         wt.word2007tohtml(甲骨文考证.docx); 36  37     } 38        39      /** 40      * 2003版本word转换成html 41      * @throws ioexception 42      * @throws transformerexception 43      * @throws parserconfigurationexception 44      */ 45     public void word2003tohtml(string filename) throws ioexception, transformerexception, parserconfigurationexception { 46         47         final string imagepath = 
storagepath+fileimage/;//解析时候如果doc文件中有图片  图片会保存在此路径 48         final string strranstring=getrandomnum(); 49         string filepath =storagepath; 50         string htmlname =filename.substring(0, filename.indexof(.))+ 2003.html; 51         final string file = filepath + filename; 52         inputstream input = new fileinputstream(new file(file)); 53         hwpfdocument worddocument = new hwpfdocument(input); 54         wordtohtmlconverter wordtohtmlconverter = new wordtohtmlconverter(documentbuilderfactory.newinstance().newdocumentbuilder().newdocument()); 55         //设置图片存放的位置 56         wordtohtmlconverter.setpicturesmanager(new picturesmanager() { 57             public string savepicture(byte[] content, picturetype picturetype, string suggestedname, float widthinches, float heightinches) { 58                 file imgpath = new file(imagepath); 59                 if(!imgpath.exists()){//图片目录不存在则创建 60                     imgpath.mkdirs(); 61                 } 62                  63                 file file = new file(imagepath +strranstring+suggestedname); 64                 try { 65                     outputstream os = new fileoutputstream(file); 66                     os.write(content); 67                     os.close(); 68                 } catch (filenotfoundexception e) { 69                     e.printstacktrace(); 70                 } catch (ioexception e) { 71                     e.printstacktrace(); 72                 } 73                  74                 return  http://+ip+:+port+//uploadfile/fileimage/+strranstring+suggestedname; 75                // return imagepath +strranstring+suggestedname; 76             } 77         }); 78          79         //解析word文档 80         wordtohtmlconverter.processdocument(worddocument); 81         document htmldocument = wordtohtmlconverter.getdocument(); 82          83         file htmlfile = new file(filepath +strranstring+htmlname); 84         outputstream outstream = new fileoutputstream(htmlfile); 85         
 86  87         domsource domsource = new domsource(htmldocument); 88         streamresult streamresult = new streamresult(outstream); 89  90         transformerfactory factory = transformerfactory.newinstance(); 91         transformer serializer = factory.newtransformer(); 92         serializer.setoutputproperty(outputkeys.encoding, utf-8); 93         serializer.setoutputproperty(outputkeys.indent, yes); 94         serializer.setoutputproperty(outputkeys.method, html); 95          96         serializer.transform(domsource, streamresult); 97         outstream.close(); 98          99         system.out.println(生成html文件路径:+ http://+ip+:+port+//uploadfile/+strranstring+htmlname);100     }101 102     /**103      * 2007版本word转换成html104      * @throws ioexception105      */106     public void word2007tohtml(string filename) throws ioexception {107         108        final string strranstring=getrandomnum();109         110         string filepath = storagepath+strranstring;111         string htmlname =filename.substring(0, filename.indexof(.))+ 2007.html;112         file f = new file(storagepath+filename);  113         if (!f.exists()) {  114             system.out.println(sorry file does not exists!);  115         } else {  116             if (f.getname().endswith(.docx) || f.getname().endswith(.docx)) {  117                 try {118                     // 1) 加载word文档生成 xwpfdocument对象  119                     inputstream in = new fileinputstream(f);  120                     xwpfdocument document = new xwpfdocument(in);  121       122                     // 2) 解析 xhtml配置 (这里设置iuriresolver来设置图片存放的目录)  123                     file imagefolderfile = new file(filepath);  124                     xhtmloptions options = xhtmloptions.create().uriresolver(new fileuriresolver(imagefolderfile));  125                     options.setextractor(new fileimageextractor(imagefolderfile));  126                     options.uriresolver(new iuriresolver() {127                         public 
string resolve(string uri) {128                             //http://192.168.30.222:8010//uploadfile/....129                             return http://+ip+:+port+//uploadfile/+strranstring +/+ uri;130                         }131                     });132                     133                     options.setignorestylesifunused(false);  134                     options.setfragment(true);  135                       136                     // 3) 将 xwpfdocument转换成xhtml  137                     outputstream out = new fileoutputstream(new file(filepath + htmlname));  138                     ixwpfconverter<xhtmloptions> converter = xhtmlconverter.getinstance();139                     converter.convert(document,out, options);140                     //xhtmlconverter.getinstance().convert(document, out, options);  141                     system.out.println(html路径:+http://+ip+:+port+//uploadfile/+strranstring+htmlname);142                 } catch (exception e) {143                     e.printstacktrace();144                 }145             146             } else {  147                 system.out.println(enter only ms office 2007+ files);  148             }  149         }  150     }  151 152      /**153      *功能说明:生成时间戳154      *创建人:zsq155      *创建时间:2019年12月7日下午2:37:09156      *157      */158      public static string getrandomnum(){159          date dt = new date();160          simpledateformat sdf = new simpledateformat(yyyymmddhhmmss);  161          string str=sdf.format(dt);162          return str;163      }164      165    }
二:java实现将pdf转换为html
1: 引入依赖
<dependency>
  <groupId>net.sf.cssbox</groupId>
  <artifactId>pdf2dom</artifactId>
  <version>1.7</version>
</dependency>
<dependency>
  <groupId>org.apache.pdfbox</groupId>
  <artifactId>pdfbox</artifactId>
  <version>2.0.12</version>
</dependency>
<dependency>
  <groupId>org.apache.pdfbox</groupId>
  <artifactId>pdfbox-tools</artifactId>
  <version>2.0.12</version>
</dependency>
2:代码demo
1 public class pdftohtml { 2  3   /* 4     pdf转换html 5      */ 6     public void pdftohtmltest(string inpdfpath,string outputhtmlpath)  { 7        // string outputpath = c:\\works\\files\\zsq保密知识测试题库.html; 8     9        //try() 写在()里面会自动关闭流10         try{11             bufferedwriter out = new bufferedwriter(new outputstreamwriter(new fileoutputstream(new file(outputhtmlpath)),utf-8));12             //加载pdf文档13             //pddocument document = pddocument.load(bytes);14             pddocument document = pddocument.load(new file(inpdfpath));15             pdfdomtree pdfdomtree = new pdfdomtree();16             pdfdomtree.writetext(document,out);17         } catch (exception e) {18             e.printstacktrace();19         }20     }21 22     public static void main(string[] args) throws ioexception {23         pdftohtml ph=new pdftohtml();24         string pdfpath=c:\\works\\files\\武研中心行政考勤制度.pdf;25         string outputpath=c:\\works\\files\\武研中心行政考勤制度.html;26         ph.pdftohtmltest(pdfpath,outputpath);27   }28 29 }
三:java实现将txt转换为html
1  /* 2      * txt文档转html 3        filepath:txt原文件路径 4        htmlposition:转化后生成的html路径 5     */ 6     public static void txttohtml(string filepath, string htmlposition) { 7         try { 8             //string encoding = gbk; 9             file file = new file(filepath);10             if (file.isfile() && file.exists()) { // 判断文件是否存在11                 inputstreamreader read = new inputstreamreader(new fileinputstream(file), gbk);12                 // 考虑到编码格式13                 bufferedreader bufferedreader = new bufferedreader(read);14                 // 写文件15                 fileoutputstream fos = new fileoutputstream(new file(htmlposition));16                 outputstreamwriter osw = new outputstreamwriter(fos, gbk);17                 bufferedwriter bw = new bufferedwriter(osw);18                 string linetxt = null;19                 while ((linetxt = bufferedreader.readline()) != null) {20                     bw.write(   +linetxt + </br>);21                 }22                 bw.close();23                 osw.close();24                 fos.close();25                 read.close();26             } else {27                 system.out.println(找不到指定的文件);28             }29         } catch (exception e) {30             system.out.println(读取文件内容出错);31             e.printstacktrace();32         }33     }
以上就是java实现word/pdf/txt转html的方法的详细内容。

Java实现Word/Pdf/TXT转HTML的方法

推荐信息