handle batches correctly. it works!

2024-03-22 13:11:17 +08:00 · 2017-08-01 16:32:06 -07:00 · 2017-08-01 16:32:06 -07:00 · e3ba39681e
commit e3ba39681e
parent 2aa9e62e62
1 changed file with 18 additions and 17 deletions
--- a/xlntpyarrow/test.py
+++ b/xlntpyarrow/test.py
@@ -52,27 +52,28 @@ def xlsx2arrow(io, sheetname):
    batches = []
    schema = None
    first_batch = []
    max_column = 0
    while reader.has_cell():
-        cell = reader.read_cell()
+        if schema is None:
-        type = cell.data_type()
+            cell = reader.read_cell()
            type = cell.data_type()
-        if cell.row() == 1:
+            if cell.row() == 1:
-            column_names.append(cell.value_string())
+                column_names.append(cell.value_string())
-            continue
+                max_column = max(max_column, cell.column())
-        elif cell.row() == 2:
+                continue
-            column_name = column_names[cell.column() - 1]
+            elif cell.row() == 2:
-            fields.append(pa.field(column_name, COLUMN_TYPE_FIELD[type]()))
+                column_name = column_names[cell.column() - 1]
-            first_batch.append(cell_to_pyarrow_array(cell, fields[-1].type))
+                fields.append(pa.field(column_name, COLUMN_TYPE_FIELD[type]()))
-            continue
+                first_batch.append(cell_to_pyarrow_array(cell, fields[-1].type))
-        elif schema is None:
+                if cell.column() == max_column:
-            schema = pa.schema(fields)
+                    schema = pa.schema(fields)
-            batches.append(pa.RecordBatch.from_arrays(first_batch, column_names))
+                    print(schema)
                    batches.append(pa.RecordBatch.from_arrays(first_batch, column_names))
                continue
-        print(schema)
+        batches.append(reader.read_batch(schema, 10000))
        print(batches[0])
        batches.append(reader.read_batch(schema, 100000))
    reader.end_worksheet()