xlnt/python/xlntpyarrow/__init__.py

import pyarrow as pa
import xlntpyarrow.lib as xpa

# Lookup table from an xlnt cell data type to the pyarrow type used for a
# column whose first data cell has that type.
#
# The values are the pyarrow type *factories* themselves (``pa.float64``, not
# ``pa.float64()``); xlsx2arrow calls the looked-up entry to obtain the actual
# type instance. All textual kinds, errors and empty cells map to strings.
COLUMN_TYPE_FIELD = {
    xpa.Cell.Type.Number: pa.float64,
    xpa.Cell.Type.SharedString: pa.string,
    xpa.Cell.Type.InlineString: pa.string,
    xpa.Cell.Type.FormulaString: pa.string,
    xpa.Cell.Type.Error: pa.string,
    xpa.Cell.Type.Boolean: pa.bool_,
    xpa.Cell.Type.Date: pa.date32,
    xpa.Cell.Type.Empty: pa.string,
}

def cell_to_pyarrow_array(cell, type):
    """Wrap the value of a single cell in a one-element pyarrow array.

    The accessor used to read the value is chosen from ``cell.data_type()``:
    ``value_double()`` for numbers, ``value_bool()`` for booleans,
    ``value_unsigned_int()`` for dates and ``value_string()`` for every
    string-like kind (shared/inline/formula strings, errors, empty cells).

    Args:
        cell: Cell object exposing ``data_type()`` and the ``value_*()``
            accessors listed above.
        type: Arrow data type forwarded as the second argument of
            ``pa.array``. (The name shadows the builtin but is part of the
            existing signature.)

    Returns:
        A ``pa.array`` holding exactly one value, or ``None`` when the cell's
        data type matches none of the known kinds.
    """
    kinds = xpa.Cell.Type
    kind = cell.data_type()

    if kind == kinds.Number:
        value = cell.value_double()
    elif kind == kinds.Boolean:
        value = cell.value_bool()
    elif kind == kinds.Date:
        value = cell.value_unsigned_int()
    elif kind in (
        kinds.SharedString,
        kinds.InlineString,
        kinds.FormulaString,
        kinds.Error,
        kinds.Empty,
    ):
        # Every textual kind (including errors and empty cells) is read
        # through the same string accessor.
        value = cell.value_string()
    else:
        # Unknown kind: keep the implicit "no result" outcome explicit.
        return None

    return pa.array([value], type)

def xlsx2arrow(io, sheetname=None):
    """Read one worksheet of an XLSX stream into a ``pyarrow.Table``.

    The worksheet is consumed in three phases:

    1. Row 1 cells are read one by one and supply the column names.
    2. Row 2 cells are read one by one; each cell's data type fixes the Arrow
       type of its column (numeric cells with a date number format become
       ``date32``), and the row itself becomes the first record batch.
    3. Once the schema is complete, the remaining cells are pulled through
       ``reader.read_batch(schema, 10000)`` until the reader is exhausted.

    Args:
        io: Object handed to ``StreamingWorkbookReader.open``; the module's
            own entry point passes a file opened in ``'rb'`` mode.
        sheetname: Which worksheet to read. An ``int`` indexes into the
            workbook's sheet titles, a ``str`` is used as the title itself,
            and ``None`` (or any other type) selects the first sheet.

    Returns:
        A ``pyarrow.Table`` assembled from all record batches.

    Raises:
        ValueError: If a cell beyond row 2 is encountered before every header
            column has received a row-2 value, i.e. the schema could not be
            inferred.
    """
    reader = xpa.StreamingWorkbookReader()
    reader.open(io)

    sheet_titles = reader.sheet_titles()
    sheet_title = sheet_titles[0]

    if isinstance(sheetname, int):
        sheet_title = sheet_titles[sheetname]
    elif isinstance(sheetname, str):
        sheet_title = sheetname

    reader.begin_worksheet(sheet_title)

    column_names = []
    fields = []
    batches = []
    schema = None
    first_batch = []
    max_column = 0

    while reader.has_cell():
        if schema is not None:
            # Schema is known: let the reader build whole batches natively.
            # NOTE(review): 10000 is presumably the maximum number of rows
            # per batch -- confirm against the read_batch binding.
            batches.append(reader.read_batch(schema, 10000))
            continue

        cell = reader.read_cell()
        row = cell.row()

        if row == 1:
            # Header row: collect names and remember the right-most column so
            # we know when row 2 is complete.
            column_names.append(cell.value_string())
            max_column = max(max_column, cell.column())
        elif row == 2:
            # Assumes header cells are contiguous starting at column 1.
            column_name = column_names[cell.column() - 1]
            cell_type = cell.data_type()

            if cell_type == xpa.Cell.Type.Number and cell.format_is_date():
                # Dates stored as numbers with a date number format.
                # pa.field needs a type *instance*, so the factory is called.
                # NOTE(review): the cell still reports Number, so its value
                # is read with value_double() below -- confirm pyarrow
                # accepts that for a date32 array.
                arrow_type = pa.date32()
            else:
                arrow_type = COLUMN_TYPE_FIELD[cell_type]()

            fields.append(pa.field(column_name, arrow_type))
            first_batch.append(cell_to_pyarrow_array(cell, arrow_type))

            if cell.column() == max_column:
                # Last header column reached: the schema is now complete and
                # row 2 becomes the first (single-row) batch.
                schema = pa.schema(fields)
                batches.append(
                    pa.RecordBatch.from_arrays(first_batch, column_names))
        else:
            # Row 2 ended before the last header column was seen, so there
            # is no schema to hand to read_batch.
            raise ValueError(
                'cannot infer schema: reached row {} before row 2 supplied a '
                'value for column {}'.format(row, max_column))

    reader.end_worksheet()

    return pa.Table.from_batches(batches)

if __name__ == '__main__':
    # Ad-hoc smoke test: convert Sheet1 of ./tmp.xlsx and print it as a
    # pandas DataFrame. The context manager guarantees the workbook file is
    # closed even if the conversion raises.
    with open('tmp.xlsx', 'rb') as workbook_file:
        table = xlsx2arrow(workbook_file, 'Sheet1')
        print(table.to_pandas())
use pybind11 instead of raw C Python API, python_streambuffer is broken for now 2017-07-31 00:48:57 +08:00			`import pyarrow as pa`
move most of the pyarrow logic to python wrapper, use pyarrow's cmake setup.py code 2017-08-05 01:12:25 +08:00			`import xlntpyarrow.lib as xpa`
use pybind11 instead of raw C Python API, python_streambuffer is broken for now 2017-07-31 00:48:57 +08:00
			`COLUMN_TYPE_FIELD = {`
			`xpa.Cell.Type.Number: pa.float64,`
			`xpa.Cell.Type.SharedString: pa.string,`
			`xpa.Cell.Type.InlineString: pa.string,`
			`xpa.Cell.Type.FormulaString: pa.string,`
			`xpa.Cell.Type.Error: pa.string,`
			`xpa.Cell.Type.Boolean: pa.bool_,`
			`xpa.Cell.Type.Date: pa.date32,`
			`xpa.Cell.Type.Empty: pa.string,`
			`}`

implement other type builders 2017-08-02 01:58:47 +08:00			`def cell_to_pyarrow_array(cell, type):`
			`if cell.data_type() == xpa.Cell.Type.Number:`
(xlntpyarrow) check builder append return status and use double cell values instead of long double 2017-09-28 21:05:23 +08:00			`return pa.array([cell.value_double()], type)`
implement other type builders 2017-08-02 01:58:47 +08:00			`elif cell.data_type() == xpa.Cell.Type.SharedString:`
			`return pa.array([cell.value_string()], type)`
			`elif cell.data_type() == xpa.Cell.Type.InlineString:`
			`return pa.array([cell.value_string()], type)`
			`elif cell.data_type() == xpa.Cell.Type.FormulaString:`
			`return pa.array([cell.value_string()], type)`
			`elif cell.data_type() == xpa.Cell.Type.Error:`
			`return pa.array([cell.value_string()], type)`
			`elif cell.data_type() == xpa.Cell.Type.Boolean:`
			`return pa.array([cell.value_bool()], type)`
			`elif cell.data_type() == xpa.Cell.Type.Date:`
			`return pa.array([cell.value_unsigned_int()], type)`
			`elif cell.data_type() == xpa.Cell.Type.Empty:`
			`return pa.array([cell.value_string()], type)`

use pybind11 instead of raw C Python API, python_streambuffer is broken for now 2017-07-31 00:48:57 +08:00			`def xlsx2arrow(io, sheetname):`
			`reader = xpa.StreamingWorkbookReader()`
			`reader.open(io)`

			`sheet_titles = reader.sheet_titles()`
			`sheet_title = sheet_titles[0]`

			`if sheetname is not None:`
			`if isinstance(sheetname, int):`
			`sheet_title = sheet_titles[sheetname]`
			`elif isinstance(sheetname, str):`
			`sheet_title = sheetname`

			`reader.begin_worksheet(sheet_title)`

			`column_names = []`
			`fields = []`
			`batches = []`
figured out the problem 2017-07-31 11:32:37 +08:00			`schema = None`
implement other type builders 2017-08-02 01:58:47 +08:00			`first_batch = []`
handle batches correctly. it works! 2017-08-02 07:32:06 +08:00			`max_column = 0`
use pybind11 instead of raw C Python API, python_streambuffer is broken for now 2017-07-31 00:48:57 +08:00
			`while reader.has_cell():`
handle batches correctly. it works! 2017-08-02 07:32:06 +08:00			`if schema is None:`
			`cell = reader.read_cell()`
			`type = cell.data_type()`
use pybind11 instead of raw C Python API, python_streambuffer is broken for now 2017-07-31 00:48:57 +08:00
handle batches correctly. it works! 2017-08-02 07:32:06 +08:00			`if cell.row() == 1:`
			`column_names.append(cell.value_string())`
			`max_column = max(max_column, cell.column())`
			`continue`
			`elif cell.row() == 2:`
			`column_name = column_names[cell.column() - 1]`
add special case for numeric type cells with a date number format 2017-09-14 00:02:17 +08:00			`if type == xpa.Cell.Type.Number and cell.format_is_date():`
			`fields.append(pa.field(column_name, pa.date32))`
			`else:`
			`fields.append(pa.field(column_name, COLUMN_TYPE_FIELD[type]()))`
handle batches correctly. it works! 2017-08-02 07:32:06 +08:00			`first_batch.append(cell_to_pyarrow_array(cell, fields[-1].type))`
			`if cell.column() == max_column:`
			`schema = pa.schema(fields)`
			`print(schema)`
			`batches.append(pa.RecordBatch.from_arrays(first_batch, column_names))`
			`continue`
use pybind11 instead of raw C Python API, python_streambuffer is broken for now 2017-07-31 00:48:57 +08:00
handle batches correctly. it works! 2017-08-02 07:32:06 +08:00			`batches.append(reader.read_batch(schema, 10000))`
use pybind11 instead of raw C Python API, python_streambuffer is broken for now 2017-07-31 00:48:57 +08:00
			`reader.end_worksheet()`

			`return pa.Table.from_batches(batches)`

			`if __name__ == '__main__':`
			`file = open('tmp.xlsx', 'rb')`
figured out the problem 2017-07-31 11:32:37 +08:00			`table = xlsx2arrow(file, 'Sheet1')`
			`print(table.to_pandas())`