读取文件信息所需依赖
<!-- 读取Excel XLS -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>4.1.2</version>
</dependency>
<!-- 读取PPT、DOC、Visio -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>4.1.2</version>
</dependency>
<!-- 读取Excel XLSX、PPTX、DOCX -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.1.2</version>
</dependency>
<!--读取pdf信息-->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.12</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.pdfbox/fontbox -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>fontbox</artifactId>
<version>2.0.12</version>
</dependency>
读取doc文件内容
/**
 * Reads the text of a Word document from the given file path using
 * {@code WordExtractor}.
 *
 * @param name path of the document on the local file system
 * @return the extracted text, or {@code null} if the file could not be
 *         opened or parsed (the failure is printed to stderr)
 */
public static String readWord(String name) {
    String text = null;
    // try-with-resources closes both the extractor and the file stream,
    // including on the failure path.
    try (FileInputStream in = new FileInputStream(name);
         WordExtractor extractor = new WordExtractor(in)) {
        text = extractor.getText();
    } catch (Exception e) {
        // Covers FileNotFoundException as well; callers see null on failure.
        e.printStackTrace();
    }
    return text;
}
读取doc文件内容(MultipartFile 版本;WordExtractor 仅支持 .doc 格式,读取 .docx 需改用 poi-ooxml 中的 XWPFWordExtractor)
/**
 * Reads the text of an uploaded Word document using {@code WordExtractor}.
 *
 * @param file the uploaded document; {@code null} or empty yields {@code ""}
 * @return the extracted text, or {@code ""} if the upload is empty or an
 *         {@link IOException} occurs while reading it
 */
public static String readDoc(MultipartFile file) {
    if (file == null || file.isEmpty()) {
        return "";
    }
    // Resources are closed automatically; on failure we return "" instead of
    // dereferencing an extractor that was never created.
    try (InputStream inputStream = file.getInputStream();
         WordExtractor wordExtractor = new WordExtractor(inputStream)) {
        return wordExtractor.getText();
    } catch (IOException e) {
        log.warn("Failed to read Word document content", e);
        return "";
    }
}
读取xls文件内容
/**
 * Reads the cell contents of the first sheet of an uploaded XLS workbook.
 * Every cell contributes its value followed by a tab character; cells that
 * are neither string nor numeric contribute only the tab.
 *
 * @param file the uploaded workbook; {@code null} or empty yields {@code ""}
 * @return the tab-separated cell values; if reading fails part-way, the
 *         content collected up to that point is returned
 */
public static String readXls(MultipartFile file) {
    if (file == null || file.isEmpty()) {
        return "";
    }
    StringBuilder content = new StringBuilder();
    try (HSSFWorkbook excel = new HSSFWorkbook(file.getInputStream())) {
        // Only the first sheet is read.
        HSSFSheet sheet0 = excel.getSheetAt(0);
        for (Iterator<?> rowIterator = sheet0.iterator(); rowIterator.hasNext(); ) {
            HSSFRow row = (HSSFRow) rowIterator.next();
            for (Iterator<?> cellIterator = row.cellIterator(); cellIterator.hasNext(); ) {
                HSSFCell cell = (HSSFCell) cellIterator.next();
                CellType type = cell.getCellType();
                if (type == CellType.FORMULA) {
                    // Read a formula by the type of its cached result; asking a
                    // string-valued formula for a numeric value throws.
                    type = cell.getCachedFormulaResultType();
                }
                if (type == CellType.STRING) {
                    content.append(cell.getStringCellValue());
                } else if (type == CellType.NUMERIC) {
                    content.append(cell.getNumericCellValue());
                }
                content.append('\t');
            }
        }
    } catch (Exception e) {
        log.warn("Failed to read XLS content", e);
    }
    return content.toString();
}
读取xlsx文件内容
/**
 * Reads the cell contents of the first sheet of an uploaded XLSX workbook.
 * Every cell contributes its value followed by a tab character; cells that
 * are neither string nor numeric contribute only the tab.
 *
 * @param file the uploaded workbook; {@code null} or empty yields {@code ""}
 * @return the tab-separated cell values; if reading fails part-way, the
 *         content collected up to that point is returned
 */
public static String readXlsx(MultipartFile file) {
    if (file == null || file.isEmpty()) {
        return "";
    }
    StringBuilder content = new StringBuilder();
    try (XSSFWorkbook excel = new XSSFWorkbook(file.getInputStream())) {
        // Only the first sheet is read.
        XSSFSheet sheet0 = excel.getSheetAt(0);
        for (Iterator<?> rowIterator = sheet0.iterator(); rowIterator.hasNext(); ) {
            XSSFRow row = (XSSFRow) rowIterator.next();
            for (Iterator<?> cellIterator = row.cellIterator(); cellIterator.hasNext(); ) {
                XSSFCell cell = (XSSFCell) cellIterator.next();
                CellType type = cell.getCellType();
                if (type == CellType.FORMULA) {
                    // Read a formula by the type of its cached result; asking a
                    // string-valued formula for a numeric value throws.
                    type = cell.getCachedFormulaResultType();
                }
                if (type == CellType.STRING) {
                    content.append(cell.getStringCellValue());
                } else if (type == CellType.NUMERIC) {
                    content.append(cell.getNumericCellValue());
                }
                content.append('\t');
            }
        }
    } catch (Exception e) {
        log.warn("Failed to read XLSX content", e);
    }
    return content.toString();
}
读取pdf文件内容
/**
 * Extracts the text of all pages of an uploaded PDF, sorted by position on
 * the page.
 *
 * @param file the uploaded PDF; {@code null} or empty yields {@code ""}
 * @return the extracted text, or {@code ""} if the upload is empty or the
 *         document cannot be loaded or parsed
 */
public static String readPdf(MultipartFile file) {
    if (file == null || file.isEmpty()) {
        return "";
    }
    // The document (and the upload stream) must be closed after use;
    // try-with-resources does so on both the success and failure paths.
    try (InputStream is = file.getInputStream();
         PDDocument document = PDDocument.load(is)) {
        PDFTextStripper stripper = new PDFTextStripper();
        // Emit text in reading order rather than in content-stream order.
        stripper.setSortByPosition(true);
        stripper.setStartPage(1);
        stripper.setEndPage(document.getNumberOfPages());
        return stripper.getText(document);
    } catch (Exception e) {
        log.warn("Failed to read PDF content", e);
        return "";
    }
}
PDF文件加载有两种方式,无明显差异,方式二代码较简洁:
// Option 1: parse the stream explicitly with PDFParser.
// NOTE(review): `document` and `pdfFile` are declared outside this fragment.
InputStream input = null;
input = new FileInputStream( pdfFile );
// Load the PDF document.
PDFParser parser = new PDFParser(new RandomAccessBuffer(input));
parser.parse();
document = parser.getPDDocument();
// Option 2: let PDDocument load the file directly (shorter).
// NOTE(review): neither option closes `input` or `document` in this fragment —
// presumably the surrounding code is expected to do so; confirm.
document=PDDocument.load(pdfFile);
读取ppt文件内容
/**
 * Extracts the text of every slide of an uploaded PPT presentation.
 *
 * @param file the uploaded presentation; {@code null} or empty yields {@code ""}
 * @return the concatenated slide text; if an {@link IOException} occurs
 *         part-way, the text collected up to that point is returned
 */
public static String readPPT(MultipartFile file) {
    if (file == null || file.isEmpty()) {
        return "";
    }
    StringBuilder content = new StringBuilder();
    // All three resources are closed in reverse order, also when extraction
    // fails. The extractor is kept as a raw type, as in the original code, so
    // no additional POI type names are required here.
    try (InputStream is = file.getInputStream();
         HSLFSlideShow hslfSlideShow = new HSLFSlideShow(is);
         SlideShowExtractor slideShowExtractor = new SlideShowExtractor(hslfSlideShow)) {
        for (HSLFSlide slide : hslfSlideShow.getSlides()) {
            content.append(slideShowExtractor.getText(slide));
        }
    } catch (IOException e) {
        log.warn("Failed to read PPT content", e);
    }
    return content.toString();
}
读取pptx文件内容
/**
 * Extracts the text runs of every slide of an uploaded PPTX presentation by
 * walking each slide's shape tree. Runs are concatenated without separators.
 *
 * @param file the uploaded presentation; {@code null} or empty yields {@code ""}
 * @return the concatenated run text; if an exception occurs part-way, the
 *         text collected up to that point is returned (the failure is
 *         printed to stderr)
 */
public static String readPPTX(MultipartFile file) {
    if (file == null || file.isEmpty()) {
        return "";
    }
    // Local, single-threaded accumulation: StringBuilder is sufficient.
    StringBuilder content = new StringBuilder();
    // The slideshow and the upload stream are closed even if extraction fails.
    try (InputStream is = file.getInputStream();
         XMLSlideShow xmlSlideShow = new XMLSlideShow(is)) {
        for (XSLFSlide slide : xmlSlideShow.getSlides()) {
            CTGroupShape spTree = slide.getXmlObject().getCSld().getSpTree();
            for (CTShape shape : spTree.getSpList()) {
                CTTextBody txBody = shape.getTxBody();
                if (txBody == null) {
                    // Shape carries no text body.
                    continue;
                }
                for (CTTextParagraph textParagraph : txBody.getPList()) {
                    for (CTRegularTextRun textRun : textParagraph.getRList()) {
                        content.append(textRun.getT());
                    }
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return content.toString();
}