xlnt/xlntpyarrow/test.py
2017-08-01 10:58:47 -07:00

85 lines
2.7 KiB
Python

import pyarrow as pa
print('pyarrow loaded')
import xlntpyarrow as xpa
print('xlntpyarrow loaded')
# Lookup from an xlnt cell type to the pyarrow type *factory* used for a
# column whose first data cell has that type.  The values are the factory
# callables themselves (e.g. ``pa.float64``), not type instances; callers
# invoke the looked-up entry to obtain the actual DataType.
COLUMN_TYPE_FIELD = {
    # Numeric cells become 64-bit floats.
    xpa.Cell.Type.Number: pa.float64,
    # Every text-like cell kind, including errors, is stored as a string.
    xpa.Cell.Type.SharedString: pa.string,
    xpa.Cell.Type.InlineString: pa.string,
    xpa.Cell.Type.FormulaString: pa.string,
    xpa.Cell.Type.Error: pa.string,
    xpa.Cell.Type.Boolean: pa.bool_,
    xpa.Cell.Type.Date: pa.date32,
    # Empty cells are also mapped to a string column.
    xpa.Cell.Type.Empty: pa.string,
}
def cell_to_pyarrow_array(cell, type):
    """Wrap the value of a single cell in a one-element pyarrow array.

    The accessor used to read the value is chosen from the cell's data type:
    ``value_long_double`` for numbers, ``value_bool`` for booleans,
    ``value_unsigned_int`` for dates and ``value_string`` for every other
    recognised kind (shared/inline/formula strings, errors, empty cells).

    Args:
        cell: Cell object exposing ``data_type()`` and the ``value_*``
            accessors listed above.
        type: pyarrow data type forwarded as the second argument of
            ``pa.array``.

    Returns:
        A pyarrow array of length one, or ``None`` when the cell's data type
        matches none of the handled kinds.
    """
    kinds = xpa.Cell.Type
    kind = cell.data_type()

    if kind == kinds.Number:
        value = cell.value_long_double()
    elif kind in (
        kinds.SharedString,
        kinds.InlineString,
        kinds.FormulaString,
        kinds.Error,
        kinds.Empty,
    ):
        # All text-like kinds (and empty cells) are read through the string
        # accessor.
        value = cell.value_string()
    elif kind == kinds.Boolean:
        value = cell.value_bool()
    elif kind == kinds.Date:
        value = cell.value_unsigned_int()
    else:
        # Unrecognised kind: mirror the historical behaviour of yielding
        # nothing rather than raising.
        return None

    return pa.array([value], type)
def xlsx2arrow(io, sheetname):
    """Read one worksheet of an XLSX stream into a pyarrow Table.

    Row 1 supplies the column names.  Row 2 is read cell by cell: each
    cell's data type selects the column's Arrow type through
    ``COLUMN_TYPE_FIELD`` and its value becomes part of the first record
    batch.  Everything after row 2 is pulled through ``reader.read_batch``
    using the schema derived from row 2.

    Args:
        io: Object handed to ``StreamingWorkbookReader.open``; the script
            entry point in this file passes a file opened in binary mode.
        sheetname: An ``int`` index into the workbook's sheet titles, a
            ``str`` sheet title, or any other value (e.g. ``None``) to
            select the first sheet.

    Returns:
        The ``pyarrow.Table`` assembled from all collected record batches.
    """
    reader = xpa.StreamingWorkbookReader()
    reader.open(io)

    # Default to the first sheet; an int selects by position, a str by title.
    sheet_titles = reader.sheet_titles()
    sheet_title = sheet_titles[0]
    if isinstance(sheetname, int):
        sheet_title = sheet_titles[sheetname]
    elif isinstance(sheetname, str):
        sheet_title = sheetname

    reader.begin_worksheet(sheet_title)

    column_names = []
    fields = []
    batches = []
    schema = None
    first_batch = []

    def finish_first_batch():
        # Freeze the schema inferred from row 2 and emit that row as the
        # first record batch.
        first_schema = pa.schema(fields)
        batches.append(pa.RecordBatch.from_arrays(first_batch, column_names))
        print(first_schema)
        print(batches[0])
        return first_schema

    while reader.has_cell():
        cell = reader.read_cell()
        row = cell.row()

        if row == 1:
            # Header row: every cell is a column name.
            column_names.append(cell.value_string())
            continue

        if row == 2:
            # First data row: infer each column's type from its cell.
            cell_type = cell.data_type()
            column_name = column_names[cell.column() - 1]
            fields.append(pa.field(column_name, COLUMN_TYPE_FIELD[cell_type]()))
            first_batch.append(cell_to_pyarrow_array(cell, fields[-1].type))
            continue

        if schema is None:
            # First cell past row 2: the schema is now complete.
            schema = finish_first_batch()

        # NOTE(review): the cell read above is not handed to read_batch;
        # presumably the reader accounts for it itself -- confirm against
        # the xlntpyarrow implementation.
        batches.append(reader.read_batch(schema, 100000))

    reader.end_worksheet()

    if schema is None and first_batch:
        # The sheet ended right after row 2, so the loop never flushed the
        # first data row.  Emit it now instead of silently dropping it and
        # handing an empty batch list to Table.from_batches.
        schema = finish_first_batch()

    return pa.Table.from_batches(batches)
if __name__ == '__main__':
    # Script entry point: convert sheet 'Sheet1' of tmp.xlsx to an Arrow
    # table and print it as a pandas DataFrame.  The context manager makes
    # sure the workbook file handle is closed even if the conversion raises.
    with open('tmp.xlsx', 'rb') as workbook_file:
        table = xlsx2arrow(workbook_file, 'Sheet1')
        print(table.to_pandas())