# xlnt/xlntpyarrow/test.py

import pyarrow as pa
print('pyarrow loaded')
import xlntpyarrow as xpa
print('xlntpyarrow loaded')

# Lookup from an xlntpyarrow cell data type to the pyarrow type factory used
# for a column whose first data cell has that type. The values are the factory
# callables themselves (not type instances); xlsx2arrow calls the selected
# factory with no arguments when it builds each pa.field.
# Every string-valued kind, as well as Error and Empty, maps to pa.string.
COLUMN_TYPE_FIELD = {
    xpa.Cell.Type.Number: pa.float64,
    xpa.Cell.Type.SharedString: pa.string,
    xpa.Cell.Type.InlineString: pa.string,
    xpa.Cell.Type.FormulaString: pa.string,
    xpa.Cell.Type.Error: pa.string,
    xpa.Cell.Type.Boolean: pa.bool_,
    xpa.Cell.Type.Date: pa.date32,
    xpa.Cell.Type.Empty: pa.string,
}

def cell_to_pyarrow_array(cell, type):
    """Wrap the value of a single cell in a one-element pyarrow array.

    The accessor used to read the cell depends on its data type:
    ``value_long_double`` for numbers, ``value_bool`` for booleans,
    ``value_unsigned_int`` for dates and ``value_string`` for every
    string-like kind (shared, inline and formula strings, errors, empty).

    Args:
        cell: The xlntpyarrow cell to read.
        type: The pyarrow data type passed through to ``pa.array``.

    Returns:
        A ``pa.array`` holding exactly one value, or ``None`` when the
        cell's data type matches none of the handled kinds.
    """
    kinds = xpa.Cell.Type
    kind = cell.data_type()

    if kind == kinds.Number:
        value = cell.value_long_double()
    elif kind == kinds.Boolean:
        value = cell.value_bool()
    elif kind == kinds.Date:
        value = cell.value_unsigned_int()
    elif kind in (
        kinds.SharedString,
        kinds.InlineString,
        kinds.FormulaString,
        kinds.Error,
        kinds.Empty,
    ):
        value = cell.value_string()
    else:
        # Unhandled kind: keep the implicit "no result" contract.
        return None

    return pa.array([value], type)

def xlsx2arrow(io, sheetname):
    """Read one worksheet of an xlsx workbook into a pyarrow Table.

    Row 1 of the worksheet supplies the column names. Row 2 is the first
    data row: each of its cells determines the pyarrow type of its column
    (via ``COLUMN_TYPE_FIELD``) and is converted into a one-element array.
    As soon as a cell beyond row 2 is seen, the schema is fixed and the
    remaining data is pulled with ``reader.read_batch``.

    Args:
        io: Object handed to ``StreamingWorkbookReader.open``; the script
            entry point passes a file opened in binary mode.
        sheetname: Selects the worksheet. An ``int`` indexes into the
            workbook's sheet titles, a ``str`` is used as the title
            itself; ``None`` (or any other type) selects the first sheet.

    Returns:
        A ``pa.Table`` assembled from the first-row batch followed by the
        batches produced by the reader.
    """
    reader = xpa.StreamingWorkbookReader()
    reader.open(io)

    sheet_titles = reader.sheet_titles()
    sheet_title = sheet_titles[0]

    # None matches neither branch, so the first sheet stays selected.
    if isinstance(sheetname, int):
        sheet_title = sheet_titles[sheetname]
    elif isinstance(sheetname, str):
        sheet_title = sheetname

    reader.begin_worksheet(sheet_title)

    column_names = []
    fields = []
    batches = []
    schema = None
    first_batch = []

    while reader.has_cell():
        cell = reader.read_cell()
        row = cell.row()

        if row == 1:
            # Header row: every cell is a column name.
            column_names.append(cell.value_string())
            continue

        if row == 2:
            # First data row: infer the column type from the cell type.
            column_name = column_names[cell.column() - 1]
            cell_type = cell.data_type()
            field = pa.field(column_name, COLUMN_TYPE_FIELD[cell_type]())
            fields.append(field)
            first_batch.append(cell_to_pyarrow_array(cell, field.type))
            continue

        if schema is None:
            # First cell past row 2: the schema is now complete, so emit
            # the batch holding the first data row.
            schema = pa.schema(fields)
            batches.append(
                pa.RecordBatch.from_arrays(first_batch, column_names))

        # Diagnostic output, printed once per batch read.
        print(schema)
        print(batches[0])

        # NOTE(review): the cell consumed by read_cell() above is not added
        # to any batch here; confirm read_batch() accounts for it.
        # The second argument is presumably the maximum number of rows per
        # batch -- TODO confirm against the xlntpyarrow binding.
        batches.append(reader.read_batch(schema, 100000))

    if schema is None and first_batch:
        # The sheet ended right after the first data row, so the loop never
        # reached the branch that emits it. Flush it here; otherwise the row
        # is lost and Table.from_batches is handed an empty list.
        batches.append(pa.RecordBatch.from_arrays(first_batch, column_names))

    reader.end_worksheet()

    return pa.Table.from_batches(batches)

if __name__ == '__main__':
    # Script entry point: convert 'Sheet1' of tmp.xlsx and print it as a
    # pandas DataFrame. The context manager guarantees the file handle is
    # closed even if the conversion raises.
    with open('tmp.xlsx', 'rb') as file:
        table = xlsx2arrow(file, 'Sheet1')
        print(table.to_pandas())