Java poi 4.0+ (三) 事件驱动模式解析excel 包含03和07版的excel

时间:2024-05-18 15:53:36

前面我们介绍了用户模式如何解析excel,但是用户模式局限很大,其并不能解析大数据量的excel。我做过测试,在16G内存上的机子,07版excel导入6W条数据,内存一下彪了2个G,由于之前忘截图了,所以此处没有图,下面,我将介绍在大批量数据时候,如何解析excel。

当然是用 事件驱动模式(其实就是一种流式读取的名字)了!

网上有很多相似的代码,我也参考了很多,但是我发现不论哪个都有很大的问题,比如对公式和数字日期的解析。

下面不多说,直接上代码。参考了网上很多的代码,也给自己记录一下。

 

首先07版的,要继承  DefaultHandler 把他当做xml解析。这个网上有很多说明,我就不赘述了。xlsx改成rar后解压得到如下图。

Java poi 4.0+ (三) 事件驱动模式解析excel 包含03和07版的excel

 

 

package com.youth.excel_youth.util;

import com.youth.excel_youth.constants.Constants;
import com.youth.excel_youth.constants.XSSFDataType;
import com.youth.excel_youth.service.IRowReader;
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.BuiltinFormats;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

import java.io.InputStream;
import java.math.BigDecimal;
import java.text.SimpleDateFormat;
import java.util.*;

/**
 * @author: youth_1231
 * @Date: 2019/1/24 0024 15:39
 * @Description:   抽象Excel2007读取器,excel2007的底层数据结构是xml文件,采用SAX的事件驱动的方法解析
*                    xml,需要继承DefaultHandler,在遇到文件内容时,事件会触发,这种做法可以大大降低
 *                   内存的耗费,特别使用于大数据量的文件。
 */
public class Excel07Parser extends DefaultHandler {
    //共享字符串表
    private SharedStringsTable sst;
    //上一次的内容
    private String readValue;
    /**
     * 存放一行中的数据
     */
    private String[] rowList;

    private int colIdx;
    private int sheetIndex = -1;

    //当前行
    private int curRow = 0;
    /**
     * T元素标识
     */
    private boolean isTElement;
    /**
     * 单元格类型
     */
    private XSSFDataType dataType;

    private StylesTable stylesTable;
    private short dataFormat;
    private String dataFormatString;

    private IRowReader rowReader;
    /**
     * 可以将有的sheet页相关参数放到一个对象中,比如sheet中的错误,sheetName,业务数据等等,
     * 对象放到集合中,其中的数据建议超过2000条持久化一次
     */
//    private List<XXX> xxxxx;


    public void setRowReader(IRowReader rowReader){
        this.rowReader = rowReader;
    }

    /**
     * 遍历工作簿中所有的电子表格
     * @param filename
     * @throws Exception
     */
    public void process(String filename) throws Exception {
        OPCPackage pkg = OPCPackage.open(filename);
        XSSFReader r = new XSSFReader(pkg);
        this.stylesTable = r.getStylesTable();
        SharedStringsTable sst = r.getSharedStringsTable();
        XMLReader parser = fetchSheetParser(sst);
        XSSFReader.SheetIterator sheets = (XSSFReader.SheetIterator) r.getSheetsData();
        while (sheets.hasNext()) {
            curRow = 0;
            sheetIndex++;
            InputStream sheet = sheets.next();
            /*获取当前sheet名称,有些同学需要*/
            String sheetName = sheets.getSheetName();
            InputSource sheetSource = new InputSource(sheet);
            parser.parse(sheetSource);
            sheet.close();
        }
    }

    public XMLReader fetchSheetParser(SharedStringsTable sst)
            throws SAXException {
        XMLReader parser = XMLReaderFactory
                .createXMLReader();
        this.sst = sst;
        parser.setContentHandler(this);
        return parser;
    }

    @Override
    public void startElement(String uri, String localName, String name,
                             Attributes attributes) throws SAXException {

        // c => 单元格
        if ("c".equals(name)) {
            colIdx = getColumn(attributes);
            dataFormat = -1;
            dataFormatString = null;

            // 如果下一个元素是 SST 的索引,则将nextIsString标记为true
            String cellType = attributes.getValue("t");
            String cellStyle = attributes.getValue("s");

            this.dataType = XSSFDataType.NUMBER;
            if ("b".equals(cellType)) {
                this.dataType = XSSFDataType.BOOLEAN;
            } else if ("e".equals(cellStyle)) {
                this.dataType = XSSFDataType.ERROR;
            } else if ("s".equals(cellStyle)) {
                this.dataType = XSSFDataType.SSTINDEX;
            } else if ("inlineStr".equals(cellStyle)) {
                this.dataType = XSSFDataType.INLINESTR;
            } else if ("str".equals(cellStyle)) {
                this.dataType = XSSFDataType.FORMULA;
            }

            if (cellStyle != null) {
                int styleIndex = Integer.parseInt(cellStyle);
                XSSFCellStyle style = stylesTable.getStyleAt(styleIndex);
                dataFormat = style.getDataFormat();
                dataFormatString = style.getDataFormatString();
                /**
                 * 07版本当前只发现了57 58 的时候formatString为空
                 */
                if (!Constants.EXCEL_FORMAT_INDEX_DATA_EXACT_NY.equals(dataFormat) && !Constants.EXCEL_FORMAT_INDEX_DATA_EXACT_YR.equals(dataFormat)
                        && !Constants.EXCEL_FORMAT_INDEX_TIME_EXACT.contains(dataFormat)
                        && dataFormatString == null){
                    this.dataType = XSSFDataType.NULL;
                    dataFormatString = BuiltinFormats.getBuiltinFormat(dataFormat);
                }
            }

        }
        //当元素为t时
        if("t".equals(name)){
            isTElement = true;
        } else {
            isTElement = false;
        }

        // 解析到一行开始处,初始化数据
        if("row".equals(name)){
            rowList = new String[getColsNum(attributes)];
        }
        // 置空
        readValue = "";
    }

    @Override
    public void endElement(String uri, String localName, String name)
            throws SAXException {


        if (isTElement) {
            rowList[colIdx] = readValue.trim();
            isTElement = false;
        } else if ("v".equals(name)){
            getValue();
            rowList[colIdx] = readValue;
        } else {
            //如果标签名为 row,这说明已经到行尾,调用getRows()方法
            if ("row".equals(name)){
                rowReader.getRows(sheetIndex,curRow,new ArrayList<>(Arrays.asList(rowList)));
                curRow++;
            }
        }

    }

    @Override
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        //得到单元格内容的值
        readValue += new String(ch, start, length);
    }

    /**
     * 事件模式: 得到当前cell在当前row的位置
     * @param attributes
     * @return
     */
    private int getColumn(Attributes attributes){
        String name = attributes.getValue("r");
        int column = -1;
        for (int i = 0; i < name.length(); ++i) {
            if (Character.isDigit(name.charAt(i))){
                break;
            }

            int c = name.charAt(i);
            column = (column + 1) * 26 + c - 'A';
        }
        return column;
    }

    /**
     * 事件模式: 得到当前cell在当前row的位置
     * @param attributes
     * @return
     */
    private int getColsNum(Attributes attributes){
        String spans = attributes.getValue("spans");
        String cols = spans.substring(spans.indexOf(":") + 1);
        return Integer.parseInt(cols);
    }

    private void getValue() throws SAXException{
        switch (this.dataType){
            case BOOLEAN:
                readValue = readValue.charAt(0) == '0' ? "FALSE" : "TRUE";
                break;
            case ERROR:
                readValue = "ERROR:" + readValue;
                break;
            case INLINESTR:
                readValue = new XSSFRichTextString(readValue).toString();
                break;
            case SSTINDEX:
                int idx = Integer.parseInt(readValue);
                readValue = sst.getItemAt(idx).toString();
                break;
            case FORMULA:
                break;
            case NUMBER:
                String formatValue = ExcelUtils.getDateValue(this.dataFormat, this.dataFormatString, Double.parseDouble(this.readValue));
                formatValue = formatValue == null && dataFormatString != null ?
                        Constants.EXCEL_07_DATA_FORMAT.formatRawCellContents(Double.valueOf(readValue), dataFormat, dataFormatString) : formatValue;

                if (formatValue == null){
                    readValue = Constants.PATTERN_DECIMAL.matcher(readValue).matches() ? String.valueOf(Double.parseDouble(readValue)) : readValue;
                } else {
                    readValue = Constants.PATTERN_DECIMAL.matcher(formatValue).matches() ? String.valueOf(Double.parseDouble(formatValue)) : formatValue;
                }

                break;
            default:
                throw new SAXException("未知的单元格类型");
        }
    }


}

2003版的实现HSSFListener 

package com.youth.excel_youth.util;

import com.youth.excel_youth.constants.Constants;
import com.youth.excel_youth.service.IRowReader;
import org.apache.poi.hssf.eventusermodel.*;
import org.apache.poi.hssf.eventusermodel.dummyrecord.LastCellOfRowDummyRecord;
import org.apache.poi.hssf.eventusermodel.dummyrecord.MissingCellDummyRecord;
import org.apache.poi.hssf.model.HSSFFormulaParser;
import org.apache.poi.hssf.record.*;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.ss.usermodel.DateUtil;
import org.springframework.util.StringUtils;
import org.xml.sax.SAXException;

import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

/**
 * @author: youth_1231
 * @Date: 2019/1/24 0024 15:37
 * @Description: 抽象Excel2003读取器,通过实现HSSFListener监听器,采用事件驱动模式解析excel2003
 *                 中的内容,遇到特定事件才会触发,大大减少了内存的使用。
 */
public class Excel03Parser implements HSSFListener {
    private int minColumns = -1;
    private int lastRowNumber = -1;
    private int lastColumnNumber = -1;
    /** Should we output the formula, or the value it has? */
    private boolean outputFormulaValues = true;

    /** For parsing Formulas */
    private EventWorkbookBuilder.SheetRecordCollectingListener workbookBuildingListener;
    //excel2003工作薄
    private HSSFWorkbook stubWorkbook;
    // Records we pick up as we process
    private SSTRecord sstRecord;
    private FormatTrackingHSSFListener formatListener;
    //表索引
    private int sheetIndex = -1;
    private BoundSheetRecord[] orderedBSRs;

    @SuppressWarnings("unchecked")
    private List<BoundSheetRecord> boundSheetRecords = new ArrayList();

    // For handling formulas with string results
    private int nextRow;
    private int nextColumn;
    private boolean outputNextStringRecord;
    //当前行
    private int curRow = 0;
    //存储行记录的容器
    private List<String> rowList = new ArrayList<String>();
    @SuppressWarnings( "unused")
    private String sheetName;
    private IRowReader rowReader;
    /**
     * 中断读取当前sheet标识
     */
    private boolean readCurSheetFlag = true;
    public void setRowReader(IRowReader rowReader){
        this.rowReader = rowReader;
    }

    /**
     * 记录每个sheet页的信息,比如sheet名,每1000次一条数据等等
     */


    /**
     * 解析03版excel的方法
     * @param fileName
     */
    public void process(String fileName){
        if (StringUtils.isEmpty(fileName)){
            return;
        }
        FileInputStream fileInputStream = null;
        POIFSFileSystem poifsFileSystem = null;
        try {
            fileInputStream = new FileInputStream(fileName);
            poifsFileSystem = new POIFSFileSystem(fileInputStream);
            MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(
                    this);
            formatListener = new FormatTrackingHSSFListener(listener);
            HSSFEventFactory factory = new HSSFEventFactory();
            HSSFRequest request = new HSSFRequest();
            if (outputFormulaValues) {
                request.addListenerForAllRecords(formatListener);
            } else {
                workbookBuildingListener = new EventWorkbookBuilder.SheetRecordCollectingListener(
                        formatListener);
                request.addListenerForAllRecords(workbookBuildingListener);
            }
            /**
             * 解析单元格
             */
            factory.processWorkbookEvents(request, poifsFileSystem);
            poifsFileSystem.close();
            fileInputStream.close();
        } catch (Exception e) {
            try {
                poifsFileSystem.close();
            } catch (IOException e1) {
                poifsFileSystem = null;
            }
            try {
                fileInputStream.close();
            } catch (IOException e1) {
                fileInputStream = null;
            }
        }

    }

    /**
     * HSSFListener 监听方法,处理 Record
     */
    @Override
    public void processRecord(Record record) {

        if (!readCurSheetFlag && record.getSid() != EOFRecord.sid) {
            return;
        }

        int thisRow = -1;
        int thisColumn = -1;
//        String thisStr = null;
        String formatValue = null;
        switch (record.getSid()) {
            case BoundSheetRecord.sid:
                boundSheetRecords.add((BoundSheetRecord) record);
                break;
            case BOFRecord.sid:
                BOFRecord br = (BOFRecord) record;
                if (br.getType() == BOFRecord.TYPE_WORKSHEET) {
                    // 如果有需要,则建立子工作薄
                    if (workbookBuildingListener != null && stubWorkbook == null) {
                        stubWorkbook = workbookBuildingListener
                                .getStubHSSFWorkbook();
                    }

                    sheetIndex++;
                    if (orderedBSRs == null) {
                        orderedBSRs = BoundSheetRecord
                                .orderByBofPosition(boundSheetRecords);
                    }
                    sheetName = orderedBSRs[sheetIndex].getSheetname();
                }
                break;
            case EOFRecord.sid:
                /**
                 * 清空当前sheet页操作的值
                 */
                lastRowNumber = -1;
                lastColumnNumber = -1;
                readCurSheetFlag = true;
                break;

            case SSTRecord.sid:
                sstRecord = (SSTRecord) record;
                break;

            case BlankRecord.sid:
                BlankRecord brec = (BlankRecord) record;
                thisRow = brec.getRow();
                thisColumn = brec.getColumn();
                rowList.add(thisColumn, "");
                break;
            case BoolErrRecord.sid:
                BoolErrRecord berec = (BoolErrRecord) record;
                thisRow = berec.getRow();
                thisColumn = berec.getColumn();
                formatValue = berec.getBooleanValue()+"";
                rowList.add(thisColumn, formatValue);
                break;

            case FormulaRecord.sid:
                FormulaRecord frec = (FormulaRecord) record;
                thisRow = frec.getRow();
                thisColumn = frec.getColumn();
                if (outputFormulaValues) {
                    if (Double.isNaN(frec.getValue())) {
                        // Formula result is a string
                        // This is stored in the next record
                        outputNextStringRecord = true;
                        nextRow = frec.getRow();
                        nextColumn = frec.getColumn();
                    } else {
                        formatValue = formatListener.formatNumberDateCell(frec);
                    }
                } else {
                    formatValue = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook,
                            frec.getParsedExpression()) + '"';
                }
                rowList.add(thisColumn,formatValue);
                break;
            case StringRecord.sid:
                //单元格中公式的字符串
                if (outputNextStringRecord) {
                    // String for formula
                    StringRecord srec = (StringRecord) record;
                    formatValue = srec.getString();
                    thisRow = nextRow;
                    thisColumn = nextColumn;
                    outputNextStringRecord = false;
                }
                break;
            case LabelRecord.sid:
                LabelRecord lrec = (LabelRecord) record;
                curRow = thisRow = lrec.getRow();
                thisColumn = lrec.getColumn();
                formatValue = lrec.getValue().trim();
                formatValue = formatValue.equals("")?" ":formatValue;
                this.rowList.add(thisColumn, formatValue);
                break;
            case LabelSSTRecord.sid:
                //单元格为字符串类型
                LabelSSTRecord lsrec = (LabelSSTRecord) record;
                curRow = thisRow = lsrec.getRow();
                thisColumn = lsrec.getColumn();
                if (sstRecord == null) {
                    rowList.add(thisColumn, " ");
                } else {
                    formatValue =  sstRecord
                            .getString(lsrec.getSSTIndex()).toString().trim();
                    formatValue = formatValue.equals("")?" ": formatValue;
                    rowList.add(thisColumn, formatValue);
                }
                break;
            case NumberRecord.sid:
                NumberRecord r = (NumberRecord) record;
                curRow = thisRow = r.getRow();
                thisColumn = r.getColumn();
                formatValue = ExcelUtils.getDateValue((short) this.formatListener.getFormatIndex(r),
                        this.formatListener.getFormatString(r), r.getValue());

                formatValue = StringUtils.isEmpty(formatValue) ? formatListener.formatNumberDateCell(r).trim() : formatValue;
                formatValue = formatValue.equals("")?" ": formatValue;
                // 向容器加入列值
                rowList.add(thisColumn, formatValue);
                break;
            default:
                break;
        }

        // 遇到新行的操作
        if (thisRow != -1 && thisRow != lastRowNumber) {
            lastColumnNumber = -1;
        }

        // 空值的操作
        if (record instanceof MissingCellDummyRecord) {
            MissingCellDummyRecord mc = (MissingCellDummyRecord) record;
            curRow = thisRow = mc.getRow();
            thisColumn = mc.getColumn();
            rowList.add(thisColumn," ");
        }

        // 更新行和列的值
        if (thisRow > -1) {
            lastRowNumber = thisRow;
        }
        if (thisColumn > -1) {
            lastColumnNumber = thisColumn;
        }

        // 行结束时的操作
        if (record instanceof LastCellOfRowDummyRecord) {
            if (minColumns > 0) {
                // 列值重新置空
                if (lastColumnNumber == -1) {
                    lastColumnNumber = 0;
                }
            }
            lastColumnNumber = -1;
            // 每行结束时, 调用getRows() 方法
            try {
                rowReader.getRows(sheetIndex,curRow, rowList);
            } catch (SAXException e) {
                readCurSheetFlag = false;
            } catch (Exception e) {
                readCurSheetFlag = false;
            }

            // 清空容器
            rowList.clear();
        }
    }

}

接下来是一些辅助方法:

package com.youth.excel_youth.constants;

import org.apache.poi.ss.usermodel.DataFormatter;

import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

/**
 * @author: youth_1231
 * @Date: 2019/1/7 0007 16:14
 * @Description:
 */
public interface Constants {

    /**
     * 年月日时分秒 默认格式
     */
    SimpleDateFormat COMMON_DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    /**
     * 时间 默认格式
     */
    SimpleDateFormat COMMON_TIME_FORMAT = new SimpleDateFormat("HH:mm:ss");
    /**
     * 年月日 默认格式
     */
    SimpleDateFormat COMMON_DATE_FORMAT_NYR = new SimpleDateFormat("yyyy-MM-dd");
    /**
     * 年月 默认格式
     */
    SimpleDateFormat COMMON_DATE_FORMAT_NY = new SimpleDateFormat("yyyy-MM");
    /**
     * 月日 默认格式
     */
    SimpleDateFormat COMMON_DATE_FORMAT_YR = new SimpleDateFormat("MM-dd");
    /**
     * 月 默认格式
     */
    SimpleDateFormat COMMON_DATE_FORMAT_Y = new SimpleDateFormat("MM");
    /**
     * 星期 默认格式
     */
    String COMMON_DATE_FORMAT_XQ = "星期";
    /**
     * 周 默认格式
     */
    String COMMON_DATE_FORMAT_Z = "周";
    /**
     * 07版时间(非日期) 总time
     */
    List<Short> EXCEL_FORMAT_INDEX_07_TIME = Arrays.asList(
            new Short[]{18, 19, 20, 21, 32, 33, 45, 46, 47, 55, 56, 176, 177, 178, 179, 180, 181,
                    182, 183, 184, 185, 186}
    );
    /**
     * 07版日期(非时间) 总date
     */
    List<Short> EXCEL_FORMAT_INDEX_07_DATE = Arrays.asList(
            new Short[]{14, 15, 16, 17, 22, 30, 31, 57, 58, 187, 188, 189, 190, 191, 192, 193,
                    194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208}
    );
    /**
     * 03版时间(非日期) 总time
     */
    List<Short> EXCEL_FORMAT_INDEX_03_TIME = Arrays.asList(
            new Short[]{18, 19, 20, 21, 32, 33, 45, 46, 47, 55, 56, 176, 177, 178, 179, 180, 181,
                    182, 183, 184, 185, 186}
    );
    /**
     * 07版日期(非日期) 总date
     */
    List<Short> EXCEL_FORMAT_INDEX_03_DATE = Arrays.asList(
            new Short[]{14, 15, 16, 17, 22, 30, 31, 57, 58, 187, 188, 189, 190, 191, 192, 193,
                    194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208}
    );
    /**
     * date-年月日时分秒
     */
    List<String> EXCEL_FORMAT_INDEX_DATE_NYRSFM_STRING = Arrays.asList(
            "yyyy/m/d\\ h:mm;@", "m/d/yy h:mm", "yyyy/m/d\\ h:mm\\ AM/PM",
            "[$-409]yyyy/m/d\\ h:mm\\ AM/PM;@", "yyyy/mm/dd\\ hh:mm:dd", "yyyy/mm/dd\\ hh:mm", "yyyy/m/d\\ h:m", "yyyy/m/d\\ h:m:s",
            "yyyy/m/d\\ h:mm", "m/d/yy h:mm;@", "yyyy/m/d\\ h:mm\\ AM/PM;@"
    );
    /**
     * date-年月日
     */
    List<String> EXCEL_FORMAT_INDEX_DATE_NYR_STRING = Arrays.asList(
            "m/d/yy", "[$-F800]dddd\\,\\ mmmm\\ dd\\,\\ yyyy",
            "[DBNum1][$-804]yyyy\"年\"m\"月\"d\"日\";@", "yyyy\"年\"m\"月\"d\"日\";@", "yyyy/m/d;@", "yy/m/d;@", "m/d/yy;@",
            "[$-409]d/mmm/yy", "[$-409]dd/mmm/yy;@", "reserved-0x1F", "reserved-0x1E", "mm/dd/yy;@", "yyyy/mm/dd", "d-mmm-yy",
            "[$-409]d\\-mmm\\-yy;@", "[$-409]d\\-mmm\\-yy", "[$-409]dd\\-mmm\\-yy;@", "[$-409]dd\\-mmm\\-yy",
            "[DBNum1][$-804]yyyy\"年\"m\"月\"d\"日\"", "yy/m/d", "mm/dd/yy", "dd\\-mmm\\-yy"
    );
    /**
     * date-年月
     */
    List<String> EXCEL_FORMAT_INDEX_DATE_NY_STRING = Arrays.asList(
            "[DBNum1][$-804]yyyy\"年\"m\"月\";@", "[DBNum1][$-804]yyyy\"年\"m\"月\"",
            "yyyy\"年\"m\"月\";@", "yyyy\"年\"m\"月\"", "[$-409]mmm\\-yy;@", "[$-409]mmm\\-yy",
            "[$-409]mmm/yy;@", "[$-409]mmm/yy", "[$-409]mmmm/yy;@","[$-409]mmmm/yy",
            "[$-409]mmmmm/yy;@", "[$-409]mmmmm/yy", "mmm-yy", "yyyy/mm", "mmm/yyyy",
            "[$-409]mmmm\\-yy;@", "[$-409]mmmmm\\-yy;@", "mmmm\\-yy", "mmmmm\\-yy"
    );
    /**
     * date-月日
     */
    List<String> EXCEL_FORMAT_INDEX_DATE_YR_STRING = Arrays.asList(
            "[DBNum1][$-804]m\"月\"d\"日\";@", "[DBNum1][$-804]m\"月\"d\"日\"",
            "m\"月\"d\"日\";@", "m\"月\"d\"日\"", "[$-409]d/mmm;@", "[$-409]d/mmm",
            "m/d;@", "m/d", "d-mmm", "d-mmm;@", "mm/dd", "mm/dd;@", "[$-409]d\\-mmm;@", "[$-409]d\\-mmm"
    );
    /**
     * date-星期X
     */
    List<String> EXCEL_FORMAT_INDEX_DATE_XQ_STRING = Arrays.asList("[$-804]aaaa;@", "[$-804]aaaa");
    /**
     * date-周X
     */
    List<String> EXCEL_FORMAT_INDEX_DATE_Z_STRING = Arrays.asList("[$-804]aaa;@", "[$-804]aaa");
    /**
     * date-月X
     */
    List<String> EXCEL_FORMAT_INDEX_DATE_Y_STRING = Arrays.asList("[$-409]mmmmm;@","mmmmm","[$-409]mmmmm");
    /**
     * time - 时间
     */
    List<String> EXCEL_FORMAT_INDEX_TIME_STRING = Arrays.asList(
            "mm:ss.0", "h:mm", "h:mm\\ AM/PM", "h:mm:ss", "h:mm:ss\\ AM/PM",
            "reserved-0x20", "reserved-0x21", "[DBNum1]h\"时\"mm\"分\"", "[DBNum1]上午/下午h\"时\"mm\"分\"", "mm:ss",
            "[h]:mm:ss", "h:mm:ss;@", "[$-409]h:mm:ss\\ AM/PM;@", "h:mm;@", "[$-409]h:mm\\ AM/PM;@",
            "h\"时\"mm\"分\";@", "h\"时\"mm\"分\"\\ AM/PM;@", "h\"时\"mm\"分\"ss\"秒\";@", "h\"时\"mm\"分\"ss\"秒\"_ AM/PM;@", "上午/下午h\"时\"mm\"分\";@",
            "上午/下午h\"时\"mm\"分\"ss\"秒\";@", "[DBNum1][$-804]h\"时\"mm\"分\";@", "[DBNum1][$-804]上午/下午h\"时\"mm\"分\";@", "h:mm AM/PM","h:mm:ss AM/PM",
            "[$-F400]h:mm:ss\\ AM/PM"
    );
    /**
     * date-当formatString为空的时候-年月
     */
    Short EXCEL_FORMAT_INDEX_DATA_EXACT_NY = 57;
    /**
     * date-当formatString为空的时候-月日
     */
    Short EXCEL_FORMAT_INDEX_DATA_EXACT_YR = 58;
    /**
     * time-当formatString为空的时候-时间
     */
    List<Short> EXCEL_FORMAT_INDEX_TIME_EXACT = Arrays.asList(new Short[]{55, 56});
    /**
     * 格式化星期或者周显示
     */
    String[] WEEK_DAYS = { "日", "一", "二", "三", "四", "五", "六" };
    /**
     * 07版 excel dataformat
     */
    DataFormatter EXCEL_07_DATA_FORMAT = new DataFormatter();
    /**
     * 小数 正则
     */
    Pattern PATTERN_DECIMAL = Pattern.compile("^-?([1-9]\\d*\\.\\d*|0\\.\\d*[1-9]\\d*|0?\\.0+)$");
    /**
     * 07版excel后缀名
     */
    String EXCEL_SUFFIX_07 = "xlsx";
    /**
     * 03版excel后缀名
     */
    String EXCEL_SUFFIX_03 = "xls";
}
package com.youth.excel_youth.constants;

/**
 * @author: youth_1231
 * @Date: 2019/1/24 0024 15:37
 * @Description: 07版的excel 单元格type
 */
public enum XSSFDataType {
    BOOLEAN,
    ERROR,
    FORMULA,
    INLINESTR,
    SSTINDEX,
    NUMBER,
    NULL
}
package com.youth.excel_youth.service;

import org.xml.sax.SAXException;

import java.util.List;

public interface IRowReader {
    /**
     * 业务逻辑实现方法
     * @param sheetIndex
     * @param curRow
     * @param rowList
     * @throws SAXException
     */
    void getRows(int sheetIndex,int curRow, List<String> rowList) throws SAXException;
}
package com.youth.excel_youth.service;

import java.util.List;

/**
 * @author: youth_1231
 * @Date: 2019/1/24 0024 15:44
 * @Description:
 */
public class RowReaderImpl implements IRowReader {

    @Override
    public void getRows(int sheetIndex, int curRow, List<String> rowlist) {
        // TODO 改写成自己所需的业务方法
        StringBuilder sb = new StringBuilder();
        sb.append(curRow).append(": ");
        for (int i = 0; i < rowlist.size(); i++) {
            sb.append(rowlist.get(i)).append(" | ");
        }
        System.out.println(sb);
    }
}
package com.youth.excel_youth.util;

import com.youth.excel_youth.constants.Constants;

import java.util.Calendar;
import java.util.Date;

/**
 * @author: youth_1231
 * @Date: 2019/1/24 0024 09:43
 * @Description:
 */
public class CommonUtils {
    /**
     * 日期转星期
     * @param date
     * @return
     */
    public static String dateToWeek(Date date) {
        if (date == null){
            return "";
        }
        // 获得一个日历
        Calendar cal = Calendar.getInstance();
        cal.setTime(date);
        // 指示一个星期中的某天。
        int w = cal.get(Calendar.DAY_OF_WEEK) - 1;
        if (w < 0)
            w = 0;
        return Constants.WEEK_DAYS[w];
    }

}
package com.youth.excel_youth.util;

import com.youth.excel_youth.constants.Constants;
import com.youth.excel_youth.service.IRowReader;

import java.io.IOException;

/**
 * @author: youth_1231
 * @Date: 2019/1/24 0024 15:42
 * @Description:
 */
public class ExcelReaderUtil {
    //excel2003扩展名
    public static final String EXCEL03_EXTENSION = ".xls";
    //excel2007扩展名
    public static final String EXCEL07_EXTENSION = ".xlsx";

    /**
     * 读取Excel文件,可能是03也可能是07版本
     * @param fileName
     * @throws Exception
     */
    public static void readExcel(IRowReader reader, String fileName) throws Exception{
        /**
         * 手动改后缀名的情况
         */
        boolean xlsxFlag;
        if (fileName.endsWith(Constants.EXCEL_SUFFIX_07)){
            xlsxFlag = true;
        } else if (fileName.endsWith(Constants.EXCEL_SUFFIX_03)){
            xlsxFlag = false;
        } else {
            throw new Exception("文件格式错误,fileName的扩展名只能是xls或xlsx");
        }

        try {
            if (xlsxFlag) {
                Excel07Parser excel07 = new Excel07Parser();
                excel07.setRowReader(reader);
                excel07.process(fileName);
            } else {
                Excel03Parser excel03 = new Excel03Parser();
                excel03.setRowReader(reader);
                excel03.process(fileName);
            }
        } catch (IOException e) {
            if (xlsxFlag) {
                Excel03Parser excel03 = new Excel03Parser();
                excel03.setRowReader(reader);
                excel03.process(fileName);
            } else {
                Excel07Parser excel07 = new Excel07Parser();
                excel07.setRowReader(reader);
                excel07.process(fileName);
            }
        }

    }




}

 

package com.youth.excel_youth.util;

import com.youth.excel_youth.constants.Constants;
import org.apache.poi.ss.usermodel.*;
import org.springframework.util.Assert;

import java.util.Date;

/**
 * @author: youth_1231
 * @Date: 2019/1/7 0007 15:41
 * @Description:
 */
public class ExcelUtils {

    /**
     * 用户模式得到单元格的值
     * @param workbook
     * @param cell
     * @return
     */
    public static String getCellValue(Workbook workbook, Cell cell){
        Assert.notNull(workbook, "when you parse excel, workbook is not allowed to be null");
        String cellValue = "";
        if (cell == null){
            return cellValue;
        }

        switch (cell.getCellType()){
            case NUMERIC:

                cellValue = getDateValue(cell.getCellStyle().getDataFormat(), cell.getCellStyle().getDataFormatString(),
                        cell.getNumericCellValue());
                if (cellValue == null){
                    cellValue = String.valueOf(cell.getNumericCellValue());
                }
                break;
            case STRING:
                cellValue = String.valueOf(cell.getStringCellValue());
                break;
            case BOOLEAN:
                cellValue = String.valueOf(cell.getBooleanCellValue());
                break;
            case FORMULA:
                /**
                 * 格式化单元格
                 */
                FormulaEvaluator evaluator = workbook.getCreationHelper().createFormulaEvaluator();
                cellValue = getCellValue(evaluator.evaluate(cell));
                break;
            case BLANK:
                cellValue = "";
                break;
            case ERROR:
                cellValue = String.valueOf(cell.getErrorCellValue());
                break;
            case _NONE:
                cellValue = "";
                break;
            default:
                cellValue = "未知类型";
                break;
        }
        return cellValue;

    }


    /**
     * 用户模式得到公式单元格的值
     * @param formulaValue
     * @return
     */
    public static String getCellValue(CellValue formulaValue){
        String cellValue = "";
        if (formulaValue == null){
            return cellValue;
        }

        switch (formulaValue.getCellType()){
            case NUMERIC:
                cellValue = String.valueOf(formulaValue.getNumberValue());
                break;
            case STRING:
                cellValue = String.valueOf(formulaValue.getStringValue());
                break;
            case BOOLEAN:
                cellValue = String.valueOf(formulaValue.getBooleanValue());
                break;
            case BLANK:
                cellValue = "";
                break;
            case ERROR:
                cellValue = String.valueOf(formulaValue.getErrorValue());
                break;
            case _NONE:
                cellValue = "";
                break;
            default:
                cellValue = "未知类型";
                break;
        }
        return cellValue;

    }

    /**
     * 得到date单元格格式的值
     * @param dataFormat
     * @param dataFormatString
     * @param value
     * @return
     */
    public static String getDateValue(Short dataFormat, String dataFormatString, double value){
        if (!DateUtil.isValidExcelDate(value)){
            return null;
        }

        Date date = DateUtil.getJavaDate(value);
        /**
         * 年月日时分秒
         */
        if (Constants.EXCEL_FORMAT_INDEX_DATE_NYRSFM_STRING.contains(dataFormatString)) {
            return Constants.COMMON_DATE_FORMAT.format(date);
        }
        /**
         * 年月日
         */
        if (Constants.EXCEL_FORMAT_INDEX_DATE_NYR_STRING.contains(dataFormatString)) {
            return Constants.COMMON_DATE_FORMAT_NYR.format(date);
        }
        /**
         * 年月
         */
        if (Constants.EXCEL_FORMAT_INDEX_DATE_NY_STRING.contains(dataFormatString) || Constants.EXCEL_FORMAT_INDEX_DATA_EXACT_NY.equals(dataFormat)) {
            return Constants.COMMON_DATE_FORMAT_NY.format(date);
        }
        /**
         * 月日
         */
        if (Constants.EXCEL_FORMAT_INDEX_DATE_YR_STRING.contains(dataFormatString) || Constants.EXCEL_FORMAT_INDEX_DATA_EXACT_YR.equals(dataFormat)) {
            return Constants.COMMON_DATE_FORMAT_YR.format(date);

        }
        /**
         * 月
         */
        if (Constants.EXCEL_FORMAT_INDEX_DATE_Y_STRING.contains(dataFormatString)) {
            return Constants.COMMON_DATE_FORMAT_Y.format(date);
        }
        /**
         * 星期X
         */
        if (Constants.EXCEL_FORMAT_INDEX_DATE_XQ_STRING.contains(dataFormatString)) {
            return Constants.COMMON_DATE_FORMAT_XQ + CommonUtils.dateToWeek(date);
        }
        /**
         * 周X
         */
        if (Constants.EXCEL_FORMAT_INDEX_DATE_Z_STRING.contains(dataFormatString)) {
            return Constants.COMMON_DATE_FORMAT_Z + CommonUtils.dateToWeek(date);
        }
        /**
         * 时间格式
         */
        if (Constants.EXCEL_FORMAT_INDEX_TIME_STRING.contains(dataFormatString) || Constants.EXCEL_FORMAT_INDEX_TIME_EXACT.contains(dataFormat)) {
            return Constants.COMMON_TIME_FORMAT.format(DateUtil.getJavaDate(value));
        }
        /**
         * 单元格为其他未覆盖到的类型
         */
        if (DateUtil.isADateFormat(dataFormat, dataFormatString)) {
            return Constants.COMMON_TIME_FORMAT.format(value);
        }

        return null;
    }

}
package com.youth.excel_youth.util;

import com.youth.excel_youth.service.IRowReader;
import com.youth.excel_youth.service.RowReaderImpl;

import java.io.IOException;

/**
 * @author: youth_1231
 * @Date: 2019/1/6 0006 15:23
 * @Description:
 */
public class Test {
    public static void main(String[] args) {
        String path = "C:\\Users\\Administrator\\Desktop\\excel_03.xls";
        try {
            IRowReader rowReader = new RowReaderImpl();
            ExcelReaderUtil.readExcel(rowReader, path);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
<dependency>
   <groupId>org.apache.poi</groupId>
   <artifactId>poi</artifactId>
   <version>4.0.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
<dependency>
   <groupId>org.apache.poi</groupId>
   <artifactId>poi-ooxml</artifactId>
   <version>4.0.1</version>
</dependency>

以上就是关于事件模式解析07和03版excel的方法,现在我们来上传单个sheet页100W的数据,用jProfile进行分析。

Java poi 4.0+ (三) 事件驱动模式解析excel 包含03和07版的excel

Java poi 4.0+ (三) 事件驱动模式解析excel 包含03和07版的excelJava poi 4.0+ (三) 事件驱动模式解析excel 包含03和07版的excel 第二次第三次也差不多,我就不多放图了。用2个sheet100W数据,内存运行情况也差不多,但是时间会增长。

另外,提醒一下,如果07版解析要中断的话,在一行的具体解析业务方法中抛异常即可。而03版不仅要可以抛异常以后,用一个flag去标识,这个sheet有问题,之后可以循环但是不去解析。光抛出异常是不行的。

一般接口的话,最好用异步线程池去解析(git地址以后再补)。