Handle batches correctly. It works!

This commit is contained in:
Thomas Fussell 2017-08-01 16:32:06 -07:00
parent 2aa9e62e62
commit e3ba39681e

View File

@@ -52,27 +52,28 @@ def xlsx2arrow(io, sheetname):
batches = [] batches = []
schema = None schema = None
first_batch = [] first_batch = []
max_column = 0
while reader.has_cell(): while reader.has_cell():
cell = reader.read_cell() if schema is None:
type = cell.data_type() cell = reader.read_cell()
type = cell.data_type()
if cell.row() == 1: if cell.row() == 1:
column_names.append(cell.value_string()) column_names.append(cell.value_string())
continue max_column = max(max_column, cell.column())
elif cell.row() == 2: continue
column_name = column_names[cell.column() - 1] elif cell.row() == 2:
fields.append(pa.field(column_name, COLUMN_TYPE_FIELD[type]())) column_name = column_names[cell.column() - 1]
first_batch.append(cell_to_pyarrow_array(cell, fields[-1].type)) fields.append(pa.field(column_name, COLUMN_TYPE_FIELD[type]()))
continue first_batch.append(cell_to_pyarrow_array(cell, fields[-1].type))
elif schema is None: if cell.column() == max_column:
schema = pa.schema(fields) schema = pa.schema(fields)
batches.append(pa.RecordBatch.from_arrays(first_batch, column_names)) print(schema)
batches.append(pa.RecordBatch.from_arrays(first_batch, column_names))
continue
print(schema) batches.append(reader.read_batch(schema, 10000))
print(batches[0])
batches.append(reader.read_batch(schema, 100000))
reader.end_worksheet() reader.end_worksheet()