mirror of
https://github.com/tfussell/xlnt.git
synced 2024-03-22 13:11:17 +08:00
handle batches correctly. it works!
This commit is contained in:
parent
2aa9e62e62
commit
e3ba39681e
|
@ -52,27 +52,28 @@ def xlsx2arrow(io, sheetname):
|
||||||
batches = []
|
batches = []
|
||||||
schema = None
|
schema = None
|
||||||
first_batch = []
|
first_batch = []
|
||||||
|
max_column = 0
|
||||||
|
|
||||||
while reader.has_cell():
|
while reader.has_cell():
|
||||||
|
if schema is None:
|
||||||
cell = reader.read_cell()
|
cell = reader.read_cell()
|
||||||
type = cell.data_type()
|
type = cell.data_type()
|
||||||
|
|
||||||
if cell.row() == 1:
|
if cell.row() == 1:
|
||||||
column_names.append(cell.value_string())
|
column_names.append(cell.value_string())
|
||||||
|
max_column = max(max_column, cell.column())
|
||||||
continue
|
continue
|
||||||
elif cell.row() == 2:
|
elif cell.row() == 2:
|
||||||
column_name = column_names[cell.column() - 1]
|
column_name = column_names[cell.column() - 1]
|
||||||
fields.append(pa.field(column_name, COLUMN_TYPE_FIELD[type]()))
|
fields.append(pa.field(column_name, COLUMN_TYPE_FIELD[type]()))
|
||||||
first_batch.append(cell_to_pyarrow_array(cell, fields[-1].type))
|
first_batch.append(cell_to_pyarrow_array(cell, fields[-1].type))
|
||||||
continue
|
if cell.column() == max_column:
|
||||||
elif schema is None:
|
|
||||||
schema = pa.schema(fields)
|
schema = pa.schema(fields)
|
||||||
batches.append(pa.RecordBatch.from_arrays(first_batch, column_names))
|
|
||||||
|
|
||||||
print(schema)
|
print(schema)
|
||||||
print(batches[0])
|
batches.append(pa.RecordBatch.from_arrays(first_batch, column_names))
|
||||||
|
continue
|
||||||
|
|
||||||
batches.append(reader.read_batch(schema, 100000))
|
batches.append(reader.read_batch(schema, 10000))
|
||||||
|
|
||||||
reader.end_worksheet()
|
reader.end_worksheet()
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user