需求要求
读取pptx保存到md
Apache POI 是用Java编写的免费开源的跨平台的 Java API,Apache POI提供API给Java程式对Microsoft Office格式档案读和写的功能。POI为"Poor Obfuscation Implementation"的首字母缩写,意为"可怜的模糊实现"。
环境java8
maven
创建一个maven项目在这里插入图片描述 使用apache接口
https://search.maven.org/artifact/org.lucee/poi/4.1.0/bundle
在这里插入图片描述
在pom.xml加入接口
在这里插入图片描述 这样就可以在类中导入 org.apache.poi.xslf 包
PPTXUtils.java
下面代码有些来源官网
package?com.geekbang.ppttools; //?导入org.apache.poi接口 import?com.google测试数据mon.base.Splitter; import?org.apache.poi.xslf.usermodel.XMLSlideShow; import?org.apache.poi.xslf.usermodel.XSLFShape; import?org.apache.poi.xslf.usermodel.XSLFSlide; import?org.apache.poi.xslf.usermodel.XSLFTextShape; import?java.io.File; import?java.io.FileInputStream; public?class?PPTXUtils?{ ????public?static?String?getToc(File?file)?throws?Exception?{ ????????//创建一个新的空幻灯片 ????????XMLSlideShow?ppt?=?new?XMLSlideShow(new?FileInputStream(file)); ????????//?拼接字符串 ????????StringBuilder?ret?=?new?StringBuilder(); ????????//?第一张幻灯片 ????????XSLFSlide?slide?=?ppt.getSlides().get(0); ????????int?currCha?=?-1; ????????int?count?=?1; ????????String?title?=?null; ????????boolean?firstLine?=?true; ????????for?(XSLFShape?shape?:?slide.getShapes())?{ ????????????if?(shape?instanceof?XSLFTextShape)?{ ????????????????XSLFTextShape?textShape?=?(XSLFTextShape)?shape; ????????????????String?text?=?textShape.getText(); ????????????????try?{ ????????????????????currCha?=?Integer.parseInt(text); ????????????????????if?(currCha?>=?0?&&?title?!=?null)?{ ????????????????????????ret.append("#?").append(currCha).append('.').append(title).append('\n'); ????????????????????} ????????????????????continue; ????????????????}?catch?(Exception?ex)?{ ????????????????} ????????????????for?(String?line?:?Splitter.on("\n").omitEmptyStrings().trimResults().split(text))?{ ????????????????????if?(firstLine)?{ ????????????????????????title?=?line; ????????????????????????firstLine?=?false; ????????????????????}?else?{ ????????????????????????ret.append(count).append(".?").append(line).append('\n'); ????????????????????????count++; ????????????????????} ????????????????} ????????????} ????????} ????????return?ret.toString(); ????} }
TOCGen.java
//?编译class对应的target路径 package?com.geekbang.ppttools; //?导入读取文件的模块 import?java.io.File; import?java.io.FileOutputStream; import?java.io.OutputStreamWriter; import?java.io.PrintWriter; //?编码 import?java.nio.charset.StandardCharsets; public?class?TOCGen?{ ????public?static?void?main(String[]?args)?throws?Exception?{ ????????//?toc?所在pptx的路径 ????????String?toc?=?genTocFromPPTX("D:\\学习资料\\极客时间\\java\\LetsJava\\第六章?Java?编程实战\\06.?一个从?pptx?文件中抽取文字的小工具\\code\\src\\main\\resources\\ppts"); ????????//?new输出一个toc.md?编码utf-8的PrintWriter对象pw ????????PrintWriter?pw?=?new?PrintWriter(new?OutputStreamWriter(new?FileOutputStream("toc.md"),?StandardCharsets.UTF_8)); ????????pw.println(toc); ????????pw.flush(); ????????pw.close(); ????????System.out.println("读取完成"); ????} ????//?定义genTocFromPPTX方法??s?参数pptx的路径 ????private?static?String?genTocFromPPTX(String?s)?throws?Exception?{ ????????File?rootDir?=?new?File(s); ????????//?new一个StringBuilder ????????StringBuilder?ret?=?new?StringBuilder("#?0.自我介绍\n\n"); ????????//?遍历路径的pptx ????????for?(File?pptx?:?rootDir.listFiles())?{ ????????????if?(isThePPT(pptx))?{ ????????????????//?调用PPTXUtils类的getToc方法 ????????????????ret.append('\n').append(PPTXUtils.getToc(pptx)).append('\n'); ????????????} ????????} ????????return?ret.toString(); ????} ????//?定义isThePPT方法 ????private?static?boolean?isThePPT(File?pptx)?{ ????????String?pptxName?=?pptx.getName(); ????????if?(!pptxName.endsWith("pptx"))?{ ????????????return?false; ????????} ????????for?(char?ch?:?pptxName.substring(0,?pptxName.indexOf('.')).toCharArray())?{ ????????????if?(!Character.isDigit(ch))?{ ????????????????return?false; ????????????} ????????} ????????return?true; ????} }
代码链接:
https://github.com/geektime-geekbang/LetsJava
查看更多关于学习记录——Java读取pptx的详细内容...
声明:本文来自网络,不代表【好得很程序员自学网】立场,转载请注明出处:http://www.haodehen.cn/did127775