前面我们介绍了用户模式如何解析excel,但是用户模式局限很大,其并不能解析大数据量的excel。我做过测试,在16G内存上的机子,07版excel导入6W条数据,内存一下彪了2个G,由于之前忘截图了,所以此处没有图,下面,我将介绍在大批量数据时候,如何解析excel。
当然是用 事件驱动模式(其实就是一种流式读取的名字)了!
网上有很多相似的代码,我也参考了很多,但是我发现不论哪个都有很大的问题,比如对公式和数字日期的解析。
下面不多说,直接上代码。参考了网上很多的代码,也给自己记录一下。
首先07版的,要继承 DefaultHandler 把他当做xml解析。这个网上有很多说明,我就不赘述了。xlsx改成rar后解压得到如下图。
package com.youth.excel_youth.util;
import com.youth.excel_youth.constants.Constants;
import com.youth.excel_youth.constants.XSSFDataType;
import com.youth.excel_youth.service.IRowReader;
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.BuiltinFormats;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
import java.io.InputStream;
import java.math.BigDecimal;
import java.text.SimpleDateFormat;
import java.util.*;
/**
* @author: youth_1231
* @Date: 2019/1/24 0024 15:39
* @Description: 抽象Excel2007读取器,excel2007的底层数据结构是xml文件,采用SAX的事件驱动的方法解析
* xml,需要继承DefaultHandler,在遇到文件内容时,事件会触发,这种做法可以大大降低
* 内存的耗费,特别使用于大数据量的文件。
*/
public class Excel07Parser extends DefaultHandler {
//共享字符串表
private SharedStringsTable sst;
//上一次的内容
private String readValue;
/**
* 存放一行中的数据
*/
private String[] rowList;
private int colIdx;
private int sheetIndex = -1;
//当前行
private int curRow = 0;
/**
* T元素标识
*/
private boolean isTElement;
/**
* 单元格类型
*/
private XSSFDataType dataType;
private StylesTable stylesTable;
private short dataFormat;
private String dataFormatString;
private IRowReader rowReader;
/**
* 可以将有的sheet页相关参数放到一个对象中,比如sheet中的错误,sheetName,业务数据等等,
* 对象放到集合中,其中的数据建议超过2000条持久化一次
*/
// private List<XXX> xxxxx;
public void setRowReader(IRowReader rowReader){
this.rowReader = rowReader;
}
/**
* 遍历工作簿中所有的电子表格
* @param filename
* @throws Exception
*/
public void process(String filename) throws Exception {
OPCPackage pkg = OPCPackage.open(filename);
XSSFReader r = new XSSFReader(pkg);
this.stylesTable = r.getStylesTable();
SharedStringsTable sst = r.getSharedStringsTable();
XMLReader parser = fetchSheetParser(sst);
XSSFReader.SheetIterator sheets = (XSSFReader.SheetIterator) r.getSheetsData();
while (sheets.hasNext()) {
curRow = 0;
sheetIndex++;
InputStream sheet = sheets.next();
/*获取当前sheet名称,有些同学需要*/
String sheetName = sheets.getSheetName();
InputSource sheetSource = new InputSource(sheet);
parser.parse(sheetSource);
sheet.close();
}
}
public XMLReader fetchSheetParser(SharedStringsTable sst)
throws SAXException {
XMLReader parser = XMLReaderFactory
.createXMLReader();
this.sst = sst;
parser.setContentHandler(this);
return parser;
}
@Override
public void startElement(String uri, String localName, String name,
Attributes attributes) throws SAXException {
// c => 单元格
if ("c".equals(name)) {
colIdx = getColumn(attributes);
dataFormat = -1;
dataFormatString = null;
// 如果下一个元素是 SST 的索引,则将nextIsString标记为true
String cellType = attributes.getValue("t");
String cellStyle = attributes.getValue("s");
this.dataType = XSSFDataType.NUMBER;
if ("b".equals(cellType)) {
this.dataType = XSSFDataType.BOOLEAN;
} else if ("e".equals(cellStyle)) {
this.dataType = XSSFDataType.ERROR;
} else if ("s".equals(cellStyle)) {
this.dataType = XSSFDataType.SSTINDEX;
} else if ("inlineStr".equals(cellStyle)) {
this.dataType = XSSFDataType.INLINESTR;
} else if ("str".equals(cellStyle)) {
this.dataType = XSSFDataType.FORMULA;
}
if (cellStyle != null) {
int styleIndex = Integer.parseInt(cellStyle);
XSSFCellStyle style = stylesTable.getStyleAt(styleIndex);
dataFormat = style.getDataFormat();
dataFormatString = style.getDataFormatString();
/**
* 07版本当前只发现了57 58 的时候formatString为空
*/
if (!Constants.EXCEL_FORMAT_INDEX_DATA_EXACT_NY.equals(dataFormat) && !Constants.EXCEL_FORMAT_INDEX_DATA_EXACT_YR.equals(dataFormat)
&& !Constants.EXCEL_FORMAT_INDEX_TIME_EXACT.contains(dataFormat)
&& dataFormatString == null){
this.dataType = XSSFDataType.NULL;
dataFormatString = BuiltinFormats.getBuiltinFormat(dataFormat);
}
}
}
//当元素为t时
if("t".equals(name)){
isTElement = true;
} else {
isTElement = false;
}
// 解析到一行开始处,初始化数据
if("row".equals(name)){
rowList = new String[getColsNum(attributes)];
}
// 置空
readValue = "";
}
@Override
public void endElement(String uri, String localName, String name)
throws SAXException {
if (isTElement) {
rowList[colIdx] = readValue.trim();
isTElement = false;
} else if ("v".equals(name)){
getValue();
rowList[colIdx] = readValue;
} else {
//如果标签名为 row,这说明已经到行尾,调用getRows()方法
if ("row".equals(name)){
rowReader.getRows(sheetIndex,curRow,new ArrayList<>(Arrays.asList(rowList)));
curRow++;
}
}
}
@Override
public void characters(char[] ch, int start, int length)
throws SAXException {
//得到单元格内容的值
readValue += new String(ch, start, length);
}
/**
* 事件模式: 得到当前cell在当前row的位置
* @param attributes
* @return
*/
private int getColumn(Attributes attributes){
String name = attributes.getValue("r");
int column = -1;
for (int i = 0; i < name.length(); ++i) {
if (Character.isDigit(name.charAt(i))){
break;
}
int c = name.charAt(i);
column = (column + 1) * 26 + c - 'A';
}
return column;
}
/**
* 事件模式: 得到当前cell在当前row的位置
* @param attributes
* @return
*/
private int getColsNum(Attributes attributes){
String spans = attributes.getValue("spans");
String cols = spans.substring(spans.indexOf(":") + 1);
return Integer.parseInt(cols);
}
private void getValue() throws SAXException{
switch (this.dataType){
case BOOLEAN:
readValue = readValue.charAt(0) == '0' ? "FALSE" : "TRUE";
break;
case ERROR:
readValue = "ERROR:" + readValue;
break;
case INLINESTR:
readValue = new XSSFRichTextString(readValue).toString();
break;
case SSTINDEX:
int idx = Integer.parseInt(readValue);
readValue = sst.getItemAt(idx).toString();
break;
case FORMULA:
break;
case NUMBER:
String formatValue = ExcelUtils.getDateValue(this.dataFormat, this.dataFormatString, Double.parseDouble(this.readValue));
formatValue = formatValue == null && dataFormatString != null ?
Constants.EXCEL_07_DATA_FORMAT.formatRawCellContents(Double.valueOf(readValue), dataFormat, dataFormatString) : formatValue;
if (formatValue == null){
readValue = Constants.PATTERN_DECIMAL.matcher(readValue).matches() ? String.valueOf(Double.parseDouble(readValue)) : readValue;
} else {
readValue = Constants.PATTERN_DECIMAL.matcher(formatValue).matches() ? String.valueOf(Double.parseDouble(formatValue)) : formatValue;
}
break;
default:
throw new SAXException("未知的单元格类型");
}
}
}
2003版的实现HSSFListener
package com.youth.excel_youth.util;
import com.youth.excel_youth.constants.Constants;
import com.youth.excel_youth.service.IRowReader;
import org.apache.poi.hssf.eventusermodel.*;
import org.apache.poi.hssf.eventusermodel.dummyrecord.LastCellOfRowDummyRecord;
import org.apache.poi.hssf.eventusermodel.dummyrecord.MissingCellDummyRecord;
import org.apache.poi.hssf.model.HSSFFormulaParser;
import org.apache.poi.hssf.record.*;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.ss.usermodel.DateUtil;
import org.springframework.util.StringUtils;
import org.xml.sax.SAXException;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
/**
* @author: youth_1231
* @Date: 2019/1/24 0024 15:37
* @Description: 抽象Excel2003读取器,通过实现HSSFListener监听器,采用事件驱动模式解析excel2003
* 中的内容,遇到特定事件才会触发,大大减少了内存的使用。
*/
public class Excel03Parser implements HSSFListener {
private int minColumns = -1;
private int lastRowNumber = -1;
private int lastColumnNumber = -1;
/** Should we output the formula, or the value it has? */
private boolean outputFormulaValues = true;
/** For parsing Formulas */
private EventWorkbookBuilder.SheetRecordCollectingListener workbookBuildingListener;
//excel2003工作薄
private HSSFWorkbook stubWorkbook;
// Records we pick up as we process
private SSTRecord sstRecord;
private FormatTrackingHSSFListener formatListener;
//表索引
private int sheetIndex = -1;
private BoundSheetRecord[] orderedBSRs;
@SuppressWarnings("unchecked")
private List<BoundSheetRecord> boundSheetRecords = new ArrayList();
// For handling formulas with string results
private int nextRow;
private int nextColumn;
private boolean outputNextStringRecord;
//当前行
private int curRow = 0;
//存储行记录的容器
private List<String> rowList = new ArrayList<String>();
@SuppressWarnings( "unused")
private String sheetName;
private IRowReader rowReader;
/**
* 中断读取当前sheet标识
*/
private boolean readCurSheetFlag = true;
public void setRowReader(IRowReader rowReader){
this.rowReader = rowReader;
}
/**
* 记录每个sheet页的信息,比如sheet名,每1000次一条数据等等
*/
/**
* 解析03版excel的方法
* @param fileName
*/
public void process(String fileName){
if (StringUtils.isEmpty(fileName)){
return;
}
FileInputStream fileInputStream = null;
POIFSFileSystem poifsFileSystem = null;
try {
fileInputStream = new FileInputStream(fileName);
poifsFileSystem = new POIFSFileSystem(fileInputStream);
MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(
this);
formatListener = new FormatTrackingHSSFListener(listener);
HSSFEventFactory factory = new HSSFEventFactory();
HSSFRequest request = new HSSFRequest();
if (outputFormulaValues) {
request.addListenerForAllRecords(formatListener);
} else {
workbookBuildingListener = new EventWorkbookBuilder.SheetRecordCollectingListener(
formatListener);
request.addListenerForAllRecords(workbookBuildingListener);
}
/**
* 解析单元格
*/
factory.processWorkbookEvents(request, poifsFileSystem);
poifsFileSystem.close();
fileInputStream.close();
} catch (Exception e) {
try {
poifsFileSystem.close();
} catch (IOException e1) {
poifsFileSystem = null;
}
try {
fileInputStream.close();
} catch (IOException e1) {
fileInputStream = null;
}
}
}
/**
* HSSFListener 监听方法,处理 Record
*/
@Override
public void processRecord(Record record) {
if (!readCurSheetFlag && record.getSid() != EOFRecord.sid) {
return;
}
int thisRow = -1;
int thisColumn = -1;
// String thisStr = null;
String formatValue = null;
switch (record.getSid()) {
case BoundSheetRecord.sid:
boundSheetRecords.add((BoundSheetRecord) record);
break;
case BOFRecord.sid:
BOFRecord br = (BOFRecord) record;
if (br.getType() == BOFRecord.TYPE_WORKSHEET) {
// 如果有需要,则建立子工作薄
if (workbookBuildingListener != null && stubWorkbook == null) {
stubWorkbook = workbookBuildingListener
.getStubHSSFWorkbook();
}
sheetIndex++;
if (orderedBSRs == null) {
orderedBSRs = BoundSheetRecord
.orderByBofPosition(boundSheetRecords);
}
sheetName = orderedBSRs[sheetIndex].getSheetname();
}
break;
case EOFRecord.sid:
/**
* 清空当前sheet页操作的值
*/
lastRowNumber = -1;
lastColumnNumber = -1;
readCurSheetFlag = true;
break;
case SSTRecord.sid:
sstRecord = (SSTRecord) record;
break;
case BlankRecord.sid:
BlankRecord brec = (BlankRecord) record;
thisRow = brec.getRow();
thisColumn = brec.getColumn();
rowList.add(thisColumn, "");
break;
case BoolErrRecord.sid:
BoolErrRecord berec = (BoolErrRecord) record;
thisRow = berec.getRow();
thisColumn = berec.getColumn();
formatValue = berec.getBooleanValue()+"";
rowList.add(thisColumn, formatValue);
break;
case FormulaRecord.sid:
FormulaRecord frec = (FormulaRecord) record;
thisRow = frec.getRow();
thisColumn = frec.getColumn();
if (outputFormulaValues) {
if (Double.isNaN(frec.getValue())) {
// Formula result is a string
// This is stored in the next record
outputNextStringRecord = true;
nextRow = frec.getRow();
nextColumn = frec.getColumn();
} else {
formatValue = formatListener.formatNumberDateCell(frec);
}
} else {
formatValue = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook,
frec.getParsedExpression()) + '"';
}
rowList.add(thisColumn,formatValue);
break;
case StringRecord.sid:
//单元格中公式的字符串
if (outputNextStringRecord) {
// String for formula
StringRecord srec = (StringRecord) record;
formatValue = srec.getString();
thisRow = nextRow;
thisColumn = nextColumn;
outputNextStringRecord = false;
}
break;
case LabelRecord.sid:
LabelRecord lrec = (LabelRecord) record;
curRow = thisRow = lrec.getRow();
thisColumn = lrec.getColumn();
formatValue = lrec.getValue().trim();
formatValue = formatValue.equals("")?" ":formatValue;
this.rowList.add(thisColumn, formatValue);
break;
case LabelSSTRecord.sid:
//单元格为字符串类型
LabelSSTRecord lsrec = (LabelSSTRecord) record;
curRow = thisRow = lsrec.getRow();
thisColumn = lsrec.getColumn();
if (sstRecord == null) {
rowList.add(thisColumn, " ");
} else {
formatValue = sstRecord
.getString(lsrec.getSSTIndex()).toString().trim();
formatValue = formatValue.equals("")?" ": formatValue;
rowList.add(thisColumn, formatValue);
}
break;
case NumberRecord.sid:
NumberRecord r = (NumberRecord) record;
curRow = thisRow = r.getRow();
thisColumn = r.getColumn();
formatValue = ExcelUtils.getDateValue((short) this.formatListener.getFormatIndex(r),
this.formatListener.getFormatString(r), r.getValue());
formatValue = StringUtils.isEmpty(formatValue) ? formatListener.formatNumberDateCell(r).trim() : formatValue;
formatValue = formatValue.equals("")?" ": formatValue;
// 向容器加入列值
rowList.add(thisColumn, formatValue);
break;
default:
break;
}
// 遇到新行的操作
if (thisRow != -1 && thisRow != lastRowNumber) {
lastColumnNumber = -1;
}
// 空值的操作
if (record instanceof MissingCellDummyRecord) {
MissingCellDummyRecord mc = (MissingCellDummyRecord) record;
curRow = thisRow = mc.getRow();
thisColumn = mc.getColumn();
rowList.add(thisColumn," ");
}
// 更新行和列的值
if (thisRow > -1) {
lastRowNumber = thisRow;
}
if (thisColumn > -1) {
lastColumnNumber = thisColumn;
}
// 行结束时的操作
if (record instanceof LastCellOfRowDummyRecord) {
if (minColumns > 0) {
// 列值重新置空
if (lastColumnNumber == -1) {
lastColumnNumber = 0;
}
}
lastColumnNumber = -1;
// 每行结束时, 调用getRows() 方法
try {
rowReader.getRows(sheetIndex,curRow, rowList);
} catch (SAXException e) {
readCurSheetFlag = false;
} catch (Exception e) {
readCurSheetFlag = false;
}
// 清空容器
rowList.clear();
}
}
}
接下来是一些辅助方法:
package com.youth.excel_youth.constants;
import org.apache.poi.ss.usermodel.DataFormatter;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;
/**
* @author: youth_1231
* @Date: 2019/1/7 0007 16:14
* @Description:
*/
public interface Constants {
/**
* 年月日时分秒 默认格式
*/
SimpleDateFormat COMMON_DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
/**
* 时间 默认格式
*/
SimpleDateFormat COMMON_TIME_FORMAT = new SimpleDateFormat("HH:mm:ss");
/**
* 年月日 默认格式
*/
SimpleDateFormat COMMON_DATE_FORMAT_NYR = new SimpleDateFormat("yyyy-MM-dd");
/**
* 年月 默认格式
*/
SimpleDateFormat COMMON_DATE_FORMAT_NY = new SimpleDateFormat("yyyy-MM");
/**
* 月日 默认格式
*/
SimpleDateFormat COMMON_DATE_FORMAT_YR = new SimpleDateFormat("MM-dd");
/**
* 月 默认格式
*/
SimpleDateFormat COMMON_DATE_FORMAT_Y = new SimpleDateFormat("MM");
/**
* 星期 默认格式
*/
String COMMON_DATE_FORMAT_XQ = "星期";
/**
* 周 默认格式
*/
String COMMON_DATE_FORMAT_Z = "周";
/**
* 07版时间(非日期) 总time
*/
List<Short> EXCEL_FORMAT_INDEX_07_TIME = Arrays.asList(
new Short[]{18, 19, 20, 21, 32, 33, 45, 46, 47, 55, 56, 176, 177, 178, 179, 180, 181,
182, 183, 184, 185, 186}
);
/**
* 07版日期(非时间) 总date
*/
List<Short> EXCEL_FORMAT_INDEX_07_DATE = Arrays.asList(
new Short[]{14, 15, 16, 17, 22, 30, 31, 57, 58, 187, 188, 189, 190, 191, 192, 193,
194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208}
);
/**
* 03版时间(非日期) 总time
*/
List<Short> EXCEL_FORMAT_INDEX_03_TIME = Arrays.asList(
new Short[]{18, 19, 20, 21, 32, 33, 45, 46, 47, 55, 56, 176, 177, 178, 179, 180, 181,
182, 183, 184, 185, 186}
);
/**
* 07版日期(非日期) 总date
*/
List<Short> EXCEL_FORMAT_INDEX_03_DATE = Arrays.asList(
new Short[]{14, 15, 16, 17, 22, 30, 31, 57, 58, 187, 188, 189, 190, 191, 192, 193,
194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208}
);
/**
* date-年月日时分秒
*/
List<String> EXCEL_FORMAT_INDEX_DATE_NYRSFM_STRING = Arrays.asList(
"yyyy/m/d\\ h:mm;@", "m/d/yy h:mm", "yyyy/m/d\\ h:mm\\ AM/PM",
"[$-409]yyyy/m/d\\ h:mm\\ AM/PM;@", "yyyy/mm/dd\\ hh:mm:dd", "yyyy/mm/dd\\ hh:mm", "yyyy/m/d\\ h:m", "yyyy/m/d\\ h:m:s",
"yyyy/m/d\\ h:mm", "m/d/yy h:mm;@", "yyyy/m/d\\ h:mm\\ AM/PM;@"
);
/**
* date-年月日
*/
List<String> EXCEL_FORMAT_INDEX_DATE_NYR_STRING = Arrays.asList(
"m/d/yy", "[$-F800]dddd\\,\\ mmmm\\ dd\\,\\ yyyy",
"[DBNum1][$-804]yyyy\"年\"m\"月\"d\"日\";@", "yyyy\"年\"m\"月\"d\"日\";@", "yyyy/m/d;@", "yy/m/d;@", "m/d/yy;@",
"[$-409]d/mmm/yy", "[$-409]dd/mmm/yy;@", "reserved-0x1F", "reserved-0x1E", "mm/dd/yy;@", "yyyy/mm/dd", "d-mmm-yy",
"[$-409]d\\-mmm\\-yy;@", "[$-409]d\\-mmm\\-yy", "[$-409]dd\\-mmm\\-yy;@", "[$-409]dd\\-mmm\\-yy",
"[DBNum1][$-804]yyyy\"年\"m\"月\"d\"日\"", "yy/m/d", "mm/dd/yy", "dd\\-mmm\\-yy"
);
/**
* date-年月
*/
List<String> EXCEL_FORMAT_INDEX_DATE_NY_STRING = Arrays.asList(
"[DBNum1][$-804]yyyy\"年\"m\"月\";@", "[DBNum1][$-804]yyyy\"年\"m\"月\"",
"yyyy\"年\"m\"月\";@", "yyyy\"年\"m\"月\"", "[$-409]mmm\\-yy;@", "[$-409]mmm\\-yy",
"[$-409]mmm/yy;@", "[$-409]mmm/yy", "[$-409]mmmm/yy;@","[$-409]mmmm/yy",
"[$-409]mmmmm/yy;@", "[$-409]mmmmm/yy", "mmm-yy", "yyyy/mm", "mmm/yyyy",
"[$-409]mmmm\\-yy;@", "[$-409]mmmmm\\-yy;@", "mmmm\\-yy", "mmmmm\\-yy"
);
/**
* date-月日
*/
List<String> EXCEL_FORMAT_INDEX_DATE_YR_STRING = Arrays.asList(
"[DBNum1][$-804]m\"月\"d\"日\";@", "[DBNum1][$-804]m\"月\"d\"日\"",
"m\"月\"d\"日\";@", "m\"月\"d\"日\"", "[$-409]d/mmm;@", "[$-409]d/mmm",
"m/d;@", "m/d", "d-mmm", "d-mmm;@", "mm/dd", "mm/dd;@", "[$-409]d\\-mmm;@", "[$-409]d\\-mmm"
);
/**
* date-星期X
*/
List<String> EXCEL_FORMAT_INDEX_DATE_XQ_STRING = Arrays.asList("[$-804]aaaa;@", "[$-804]aaaa");
/**
* date-周X
*/
List<String> EXCEL_FORMAT_INDEX_DATE_Z_STRING = Arrays.asList("[$-804]aaa;@", "[$-804]aaa");
/**
* date-月X
*/
List<String> EXCEL_FORMAT_INDEX_DATE_Y_STRING = Arrays.asList("[$-409]mmmmm;@","mmmmm","[$-409]mmmmm");
/**
* time - 时间
*/
List<String> EXCEL_FORMAT_INDEX_TIME_STRING = Arrays.asList(
"mm:ss.0", "h:mm", "h:mm\\ AM/PM", "h:mm:ss", "h:mm:ss\\ AM/PM",
"reserved-0x20", "reserved-0x21", "[DBNum1]h\"时\"mm\"分\"", "[DBNum1]上午/下午h\"时\"mm\"分\"", "mm:ss",
"[h]:mm:ss", "h:mm:ss;@", "[$-409]h:mm:ss\\ AM/PM;@", "h:mm;@", "[$-409]h:mm\\ AM/PM;@",
"h\"时\"mm\"分\";@", "h\"时\"mm\"分\"\\ AM/PM;@", "h\"时\"mm\"分\"ss\"秒\";@", "h\"时\"mm\"分\"ss\"秒\"_ AM/PM;@", "上午/下午h\"时\"mm\"分\";@",
"上午/下午h\"时\"mm\"分\"ss\"秒\";@", "[DBNum1][$-804]h\"时\"mm\"分\";@", "[DBNum1][$-804]上午/下午h\"时\"mm\"分\";@", "h:mm AM/PM","h:mm:ss AM/PM",
"[$-F400]h:mm:ss\\ AM/PM"
);
/**
* date-当formatString为空的时候-年月
*/
Short EXCEL_FORMAT_INDEX_DATA_EXACT_NY = 57;
/**
* date-当formatString为空的时候-月日
*/
Short EXCEL_FORMAT_INDEX_DATA_EXACT_YR = 58;
/**
* time-当formatString为空的时候-时间
*/
List<Short> EXCEL_FORMAT_INDEX_TIME_EXACT = Arrays.asList(new Short[]{55, 56});
/**
* 格式化星期或者周显示
*/
String[] WEEK_DAYS = { "日", "一", "二", "三", "四", "五", "六" };
/**
* 07版 excel dataformat
*/
DataFormatter EXCEL_07_DATA_FORMAT = new DataFormatter();
/**
* 小数 正则
*/
Pattern PATTERN_DECIMAL = Pattern.compile("^-?([1-9]\\d*\\.\\d*|0\\.\\d*[1-9]\\d*|0?\\.0+)$");
/**
* 07版excel后缀名
*/
String EXCEL_SUFFIX_07 = "xlsx";
/**
* 03版excel后缀名
*/
String EXCEL_SUFFIX_03 = "xls";
}
package com.youth.excel_youth.constants;
/**
* @author: youth_1231
* @Date: 2019/1/24 0024 15:37
* @Description: 07版的excel 单元格type
*/
public enum XSSFDataType {
BOOLEAN,
ERROR,
FORMULA,
INLINESTR,
SSTINDEX,
NUMBER,
NULL
}
package com.youth.excel_youth.service;
import org.xml.sax.SAXException;
import java.util.List;
public interface IRowReader {
/**
* 业务逻辑实现方法
* @param sheetIndex
* @param curRow
* @param rowList
* @throws SAXException
*/
void getRows(int sheetIndex,int curRow, List<String> rowList) throws SAXException;
}
package com.youth.excel_youth.service;
import java.util.List;
/**
* @author: youth_1231
* @Date: 2019/1/24 0024 15:44
* @Description:
*/
public class RowReaderImpl implements IRowReader {
@Override
public void getRows(int sheetIndex, int curRow, List<String> rowlist) {
// TODO 改写成自己所需的业务方法
StringBuilder sb = new StringBuilder();
sb.append(curRow).append(": ");
for (int i = 0; i < rowlist.size(); i++) {
sb.append(rowlist.get(i)).append(" | ");
}
System.out.println(sb);
}
}
package com.youth.excel_youth.util;
import com.youth.excel_youth.constants.Constants;
import java.util.Calendar;
import java.util.Date;
/**
* @author: youth_1231
* @Date: 2019/1/24 0024 09:43
* @Description:
*/
public class CommonUtils {
/**
* 日期转星期
* @param date
* @return
*/
public static String dateToWeek(Date date) {
if (date == null){
return "";
}
// 获得一个日历
Calendar cal = Calendar.getInstance();
cal.setTime(date);
// 指示一个星期中的某天。
int w = cal.get(Calendar.DAY_OF_WEEK) - 1;
if (w < 0)
w = 0;
return Constants.WEEK_DAYS[w];
}
}
package com.youth.excel_youth.util;
import com.youth.excel_youth.constants.Constants;
import com.youth.excel_youth.service.IRowReader;
import java.io.IOException;
/**
* @author: youth_1231
* @Date: 2019/1/24 0024 15:42
* @Description:
*/
public class ExcelReaderUtil {
//excel2003扩展名
public static final String EXCEL03_EXTENSION = ".xls";
//excel2007扩展名
public static final String EXCEL07_EXTENSION = ".xlsx";
/**
* 读取Excel文件,可能是03也可能是07版本
* @param fileName
* @throws Exception
*/
public static void readExcel(IRowReader reader, String fileName) throws Exception{
/**
* 手动改后缀名的情况
*/
boolean xlsxFlag;
if (fileName.endsWith(Constants.EXCEL_SUFFIX_07)){
xlsxFlag = true;
} else if (fileName.endsWith(Constants.EXCEL_SUFFIX_03)){
xlsxFlag = false;
} else {
throw new Exception("文件格式错误,fileName的扩展名只能是xls或xlsx");
}
try {
if (xlsxFlag) {
Excel07Parser excel07 = new Excel07Parser();
excel07.setRowReader(reader);
excel07.process(fileName);
} else {
Excel03Parser excel03 = new Excel03Parser();
excel03.setRowReader(reader);
excel03.process(fileName);
}
} catch (IOException e) {
if (xlsxFlag) {
Excel03Parser excel03 = new Excel03Parser();
excel03.setRowReader(reader);
excel03.process(fileName);
} else {
Excel07Parser excel07 = new Excel07Parser();
excel07.setRowReader(reader);
excel07.process(fileName);
}
}
}
}
package com.youth.excel_youth.util;
import com.youth.excel_youth.constants.Constants;
import org.apache.poi.ss.usermodel.*;
import org.springframework.util.Assert;
import java.util.Date;
/**
* @author: youth_1231
* @Date: 2019/1/7 0007 15:41
* @Description:
*/
public class ExcelUtils {
/**
* 用户模式得到单元格的值
* @param workbook
* @param cell
* @return
*/
public static String getCellValue(Workbook workbook, Cell cell){
Assert.notNull(workbook, "when you parse excel, workbook is not allowed to be null");
String cellValue = "";
if (cell == null){
return cellValue;
}
switch (cell.getCellType()){
case NUMERIC:
cellValue = getDateValue(cell.getCellStyle().getDataFormat(), cell.getCellStyle().getDataFormatString(),
cell.getNumericCellValue());
if (cellValue == null){
cellValue = String.valueOf(cell.getNumericCellValue());
}
break;
case STRING:
cellValue = String.valueOf(cell.getStringCellValue());
break;
case BOOLEAN:
cellValue = String.valueOf(cell.getBooleanCellValue());
break;
case FORMULA:
/**
* 格式化单元格
*/
FormulaEvaluator evaluator = workbook.getCreationHelper().createFormulaEvaluator();
cellValue = getCellValue(evaluator.evaluate(cell));
break;
case BLANK:
cellValue = "";
break;
case ERROR:
cellValue = String.valueOf(cell.getErrorCellValue());
break;
case _NONE:
cellValue = "";
break;
default:
cellValue = "未知类型";
break;
}
return cellValue;
}
/**
* 用户模式得到公式单元格的值
* @param formulaValue
* @return
*/
public static String getCellValue(CellValue formulaValue){
String cellValue = "";
if (formulaValue == null){
return cellValue;
}
switch (formulaValue.getCellType()){
case NUMERIC:
cellValue = String.valueOf(formulaValue.getNumberValue());
break;
case STRING:
cellValue = String.valueOf(formulaValue.getStringValue());
break;
case BOOLEAN:
cellValue = String.valueOf(formulaValue.getBooleanValue());
break;
case BLANK:
cellValue = "";
break;
case ERROR:
cellValue = String.valueOf(formulaValue.getErrorValue());
break;
case _NONE:
cellValue = "";
break;
default:
cellValue = "未知类型";
break;
}
return cellValue;
}
/**
* 得到date单元格格式的值
* @param dataFormat
* @param dataFormatString
* @param value
* @return
*/
public static String getDateValue(Short dataFormat, String dataFormatString, double value){
if (!DateUtil.isValidExcelDate(value)){
return null;
}
Date date = DateUtil.getJavaDate(value);
/**
* 年月日时分秒
*/
if (Constants.EXCEL_FORMAT_INDEX_DATE_NYRSFM_STRING.contains(dataFormatString)) {
return Constants.COMMON_DATE_FORMAT.format(date);
}
/**
* 年月日
*/
if (Constants.EXCEL_FORMAT_INDEX_DATE_NYR_STRING.contains(dataFormatString)) {
return Constants.COMMON_DATE_FORMAT_NYR.format(date);
}
/**
* 年月
*/
if (Constants.EXCEL_FORMAT_INDEX_DATE_NY_STRING.contains(dataFormatString) || Constants.EXCEL_FORMAT_INDEX_DATA_EXACT_NY.equals(dataFormat)) {
return Constants.COMMON_DATE_FORMAT_NY.format(date);
}
/**
* 月日
*/
if (Constants.EXCEL_FORMAT_INDEX_DATE_YR_STRING.contains(dataFormatString) || Constants.EXCEL_FORMAT_INDEX_DATA_EXACT_YR.equals(dataFormat)) {
return Constants.COMMON_DATE_FORMAT_YR.format(date);
}
/**
* 月
*/
if (Constants.EXCEL_FORMAT_INDEX_DATE_Y_STRING.contains(dataFormatString)) {
return Constants.COMMON_DATE_FORMAT_Y.format(date);
}
/**
* 星期X
*/
if (Constants.EXCEL_FORMAT_INDEX_DATE_XQ_STRING.contains(dataFormatString)) {
return Constants.COMMON_DATE_FORMAT_XQ + CommonUtils.dateToWeek(date);
}
/**
* 周X
*/
if (Constants.EXCEL_FORMAT_INDEX_DATE_Z_STRING.contains(dataFormatString)) {
return Constants.COMMON_DATE_FORMAT_Z + CommonUtils.dateToWeek(date);
}
/**
* 时间格式
*/
if (Constants.EXCEL_FORMAT_INDEX_TIME_STRING.contains(dataFormatString) || Constants.EXCEL_FORMAT_INDEX_TIME_EXACT.contains(dataFormat)) {
return Constants.COMMON_TIME_FORMAT.format(DateUtil.getJavaDate(value));
}
/**
* 单元格为其他未覆盖到的类型
*/
if (DateUtil.isADateFormat(dataFormat, dataFormatString)) {
return Constants.COMMON_TIME_FORMAT.format(value);
}
return null;
}
}
package com.youth.excel_youth.util;
import com.youth.excel_youth.service.IRowReader;
import com.youth.excel_youth.service.RowReaderImpl;
import java.io.IOException;
/**
* @author: youth_1231
* @Date: 2019/1/6 0006 15:23
* @Description:
*/
public class Test {
public static void main(String[] args) {
String path = "C:\\Users\\Administrator\\Desktop\\excel_03.xls";
try {
IRowReader rowReader = new RowReaderImpl();
ExcelReaderUtil.readExcel(rowReader, path);
} catch (IOException e) {
e.printStackTrace();
} catch (Exception e) {
e.printStackTrace();
}
}
}
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>4.0.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.0.1</version>
</dependency>
以上就是关于事件模式解析07和03版excel的方法,现在我们来上传单个sheet页100W的数据,用jProfile进行分析。
第二次第三次也差不多,我就不多放图了。用2个sheet100W数据,内存运行情况也差不多,但是时间会增长。
另外,提醒一下,如果07版解析要中断的话,在一行的具体解析业务方法中抛异常即可。而03版不仅要可以抛异常以后,用一个flag去标识,这个sheet有问题,之后可以循环但是不去解析。光抛出异常是不行的。
一般接口的话,最好用异步线程池去解析(git地址以后再补)。