2017-07-31 00:48:57 +08:00
|
|
|
import pyarrow as pa
|
2017-08-05 01:12:25 +08:00
|
|
|
import xlntpyarrow.lib as xpa
|
2017-07-31 00:48:57 +08:00
|
|
|
|
|
|
|
COLUMN_TYPE_FIELD = {
|
|
|
|
xpa.Cell.Type.Number: pa.float64,
|
|
|
|
xpa.Cell.Type.SharedString: pa.string,
|
|
|
|
xpa.Cell.Type.InlineString: pa.string,
|
|
|
|
xpa.Cell.Type.FormulaString: pa.string,
|
|
|
|
xpa.Cell.Type.Error: pa.string,
|
|
|
|
xpa.Cell.Type.Boolean: pa.bool_,
|
|
|
|
xpa.Cell.Type.Date: pa.date32,
|
|
|
|
xpa.Cell.Type.Empty: pa.string,
|
|
|
|
}
|
|
|
|
|
2017-08-02 01:58:47 +08:00
|
|
|
def cell_to_pyarrow_array(cell, type):
|
|
|
|
if cell.data_type() == xpa.Cell.Type.Number:
|
2017-09-28 21:05:23 +08:00
|
|
|
return pa.array([cell.value_double()], type)
|
2017-08-02 01:58:47 +08:00
|
|
|
elif cell.data_type() == xpa.Cell.Type.SharedString:
|
|
|
|
return pa.array([cell.value_string()], type)
|
|
|
|
elif cell.data_type() == xpa.Cell.Type.InlineString:
|
|
|
|
return pa.array([cell.value_string()], type)
|
|
|
|
elif cell.data_type() == xpa.Cell.Type.FormulaString:
|
|
|
|
return pa.array([cell.value_string()], type)
|
|
|
|
elif cell.data_type() == xpa.Cell.Type.Error:
|
|
|
|
return pa.array([cell.value_string()], type)
|
|
|
|
elif cell.data_type() == xpa.Cell.Type.Boolean:
|
|
|
|
return pa.array([cell.value_bool()], type)
|
|
|
|
elif cell.data_type() == xpa.Cell.Type.Date:
|
|
|
|
return pa.array([cell.value_unsigned_int()], type)
|
|
|
|
elif cell.data_type() == xpa.Cell.Type.Empty:
|
|
|
|
return pa.array([cell.value_string()], type)
|
|
|
|
|
2017-07-31 00:48:57 +08:00
|
|
|
def xlsx2arrow(io, sheetname):
|
|
|
|
reader = xpa.StreamingWorkbookReader()
|
|
|
|
reader.open(io)
|
|
|
|
|
|
|
|
sheet_titles = reader.sheet_titles()
|
|
|
|
sheet_title = sheet_titles[0]
|
|
|
|
|
|
|
|
if sheetname is not None:
|
|
|
|
if isinstance(sheetname, int):
|
|
|
|
sheet_title = sheet_titles[sheetname]
|
|
|
|
elif isinstance(sheetname, str):
|
|
|
|
sheet_title = sheetname
|
|
|
|
|
|
|
|
reader.begin_worksheet(sheet_title)
|
|
|
|
|
|
|
|
column_names = []
|
|
|
|
fields = []
|
|
|
|
batches = []
|
2017-07-31 11:32:37 +08:00
|
|
|
schema = None
|
2017-08-02 01:58:47 +08:00
|
|
|
first_batch = []
|
2017-08-02 07:32:06 +08:00
|
|
|
max_column = 0
|
2017-07-31 00:48:57 +08:00
|
|
|
|
|
|
|
while reader.has_cell():
|
2017-08-02 07:32:06 +08:00
|
|
|
if schema is None:
|
|
|
|
cell = reader.read_cell()
|
|
|
|
type = cell.data_type()
|
2017-07-31 00:48:57 +08:00
|
|
|
|
2017-08-02 07:32:06 +08:00
|
|
|
if cell.row() == 1:
|
|
|
|
column_names.append(cell.value_string())
|
|
|
|
max_column = max(max_column, cell.column())
|
|
|
|
continue
|
|
|
|
elif cell.row() == 2:
|
|
|
|
column_name = column_names[cell.column() - 1]
|
2017-09-14 00:02:17 +08:00
|
|
|
if type == xpa.Cell.Type.Number and cell.format_is_date():
|
|
|
|
fields.append(pa.field(column_name, pa.date32))
|
|
|
|
else:
|
|
|
|
fields.append(pa.field(column_name, COLUMN_TYPE_FIELD[type]()))
|
2017-08-02 07:32:06 +08:00
|
|
|
first_batch.append(cell_to_pyarrow_array(cell, fields[-1].type))
|
|
|
|
if cell.column() == max_column:
|
|
|
|
schema = pa.schema(fields)
|
|
|
|
print(schema)
|
|
|
|
batches.append(pa.RecordBatch.from_arrays(first_batch, column_names))
|
|
|
|
continue
|
2017-07-31 00:48:57 +08:00
|
|
|
|
2017-08-02 07:32:06 +08:00
|
|
|
batches.append(reader.read_batch(schema, 10000))
|
2017-07-31 00:48:57 +08:00
|
|
|
|
|
|
|
reader.end_worksheet()
|
|
|
|
|
|
|
|
return pa.Table.from_batches(batches)
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
|
file = open('tmp.xlsx', 'rb')
|
2017-07-31 11:32:37 +08:00
|
|
|
table = xlsx2arrow(file, 'Sheet1')
|
|
|
|
print(table.to_pandas())
|