From 2aa9e62e627783e058232c072e4baf04ed5df43b Mon Sep 17 00:00:00 2001 From: Thomas Fussell Date: Tue, 1 Aug 2017 10:58:47 -0700 Subject: [PATCH] implement other type builders --- xlntpyarrow/CMakeLists.txt | 4 + xlntpyarrow/test.py | 38 ++++++---- xlntpyarrow/xlntpyarrow.cpp | 147 +++++++++++++++++++++++------------- 3 files changed, 125 insertions(+), 64 deletions(-) diff --git a/xlntpyarrow/CMakeLists.txt b/xlntpyarrow/CMakeLists.txt index fad3a224..79bab418 100644 --- a/xlntpyarrow/CMakeLists.txt +++ b/xlntpyarrow/CMakeLists.txt @@ -12,6 +12,10 @@ endif() pybind11_add_module(xlntpyarrow xlntpyarrow.cpp) +if(MSVC) + target_compile_definitions(xlntpyarrow PRIVATE _CRT_SECURE_NO_WARNINGS=1) +endif() + target_include_directories(xlntpyarrow PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../source diff --git a/xlntpyarrow/test.py b/xlntpyarrow/test.py index 321f9e54..d97a25c9 100644 --- a/xlntpyarrow/test.py +++ b/xlntpyarrow/test.py @@ -1,7 +1,7 @@ import pyarrow as pa +print('pyarrow loaded') import xlntpyarrow as xpa - -print(xpa) +print('xlntpyarrow loaded') COLUMN_TYPE_FIELD = { xpa.Cell.Type.Number: pa.float64, @@ -14,14 +14,29 @@ COLUMN_TYPE_FIELD = { xpa.Cell.Type.Empty: pa.string, } +def cell_to_pyarrow_array(cell, type): + if cell.data_type() == xpa.Cell.Type.Number: + return pa.array([cell.value_long_double()], type) + elif cell.data_type() == xpa.Cell.Type.SharedString: + return pa.array([cell.value_string()], type) + elif cell.data_type() == xpa.Cell.Type.InlineString: + return pa.array([cell.value_string()], type) + elif cell.data_type() == xpa.Cell.Type.FormulaString: + return pa.array([cell.value_string()], type) + elif cell.data_type() == xpa.Cell.Type.Error: + return pa.array([cell.value_string()], type) + elif cell.data_type() == xpa.Cell.Type.Boolean: + return pa.array([cell.value_bool()], type) + elif cell.data_type() == xpa.Cell.Type.Date: + return pa.array([cell.value_unsigned_int()], type) + elif cell.data_type() == xpa.Cell.Type.Empty: + return pa.array([cell.value_string()], type) + def xlsx2arrow(io, sheetname): reader = xpa.StreamingWorkbookReader() reader.open(io) - print('after open') - print('before titles') sheet_titles = reader.sheet_titles() - print('after titles', sheet_titles) sheet_title = sheet_titles[0] if sheetname is not None: @@ -30,37 +45,34 @@ def xlsx2arrow(io, sheetname): elif isinstance(sheetname, str): sheet_title = sheetname - print('before begin', sheet_title) reader.begin_worksheet(sheet_title) - print('after begin', sheet_title) column_names = [] fields = [] batches = [] schema = None + first_batch = [] while reader.has_cell(): cell = reader.read_cell() type = cell.data_type() - print('read_cell', cell.row(), cell.column()) - if cell.row() == 1: column_names.append(cell.value_string()) continue elif cell.row() == 2: column_name = column_names[cell.column() - 1] fields.append(pa.field(column_name, COLUMN_TYPE_FIELD[type]())) + first_batch.append(cell_to_pyarrow_array(cell, fields[-1].type)) continue elif schema is None: schema = pa.schema(fields) + batches.append(pa.RecordBatch.from_arrays(first_batch, column_names)) print(schema) + print(batches[0]) - batch = reader.read_batch(schema, 100000) - batches.append(batch) - - break + batches.append(reader.read_batch(schema, 100000)) reader.end_worksheet() diff --git a/xlntpyarrow/xlntpyarrow.cpp b/xlntpyarrow/xlntpyarrow.cpp index c08b4143..6fd2bddb 100644 --- a/xlntpyarrow/xlntpyarrow.cpp +++ b/xlntpyarrow/xlntpyarrow.cpp @@ -44,112 +44,121 @@ void import_pyarrow() } } -std::vector extract_schema_types(std::shared_ptr &schema) +std::unique_ptr make_array_builder(std::shared_ptr &type) { - auto types = std::vector(); - - for (auto i = 0; i < schema->num_fields(); ++i) - { - types.push_back(schema->field(i)->type()->id()); - } - - return types; -} - -std::unique_ptr make_array_builder(arrow::Type::type type) -{ - std::unique_ptr builder; auto pool = arrow::default_memory_pool(); + auto builder = static_cast(nullptr); - switch (type) + switch(type->id()) { case arrow::Type::NA: break; - case arrow::Type::BOOL: - builder.reset(new arrow::BooleanBuilder(pool)); - break; - case arrow::Type::UINT8: + builder = new arrow::TypeTraits::BuilderType(pool); break; case arrow::Type::INT8: + builder = new arrow::TypeTraits::BuilderType(pool); break; case arrow::Type::UINT16: + builder = new arrow::TypeTraits::BuilderType(pool); break; case arrow::Type::INT16: + builder = new arrow::TypeTraits::BuilderType(pool); break; case arrow::Type::UINT32: + builder = new arrow::TypeTraits::BuilderType(pool); break; case arrow::Type::INT32: + builder = new arrow::TypeTraits::BuilderType(pool); break; case arrow::Type::UINT64: + builder = new arrow::TypeTraits::BuilderType(pool); break; case arrow::Type::INT64: - break; - - case arrow::Type::HALF_FLOAT: - break; - - case arrow::Type::FLOAT: - break; - - case arrow::Type::DOUBLE: - builder.reset(new arrow::DoubleBuilder(pool)); - break; - - case arrow::Type::STRING: - builder.reset(new arrow::StringBuilder(pool)); - break; - - case arrow::Type::BINARY: - break; - - case arrow::Type::FIXED_SIZE_BINARY: - break; - - case arrow::Type::DATE32: - builder.reset(new arrow::Date32Builder(pool)); + builder = new arrow::TypeTraits::BuilderType(pool); break; case arrow::Type::DATE64: + builder = new arrow::TypeTraits::BuilderType(pool); break; + case arrow::Type::DATE32: + builder = new arrow::TypeTraits::BuilderType(pool); + break; +/* case arrow::Type::TIMESTAMP: + builder = new arrow::TypeTraits::BuilderType(pool); break; case arrow::Type::TIME32: + builder = new arrow::TypeTraits::BuilderType(pool); break; case arrow::Type::TIME64: + builder = new arrow::TypeTraits::BuilderType(pool); + break; +*/ + case arrow::Type::HALF_FLOAT: + builder = new arrow::TypeTraits::BuilderType(pool); break; - case arrow::Type::INTERVAL: + case arrow::Type::FLOAT: + builder = new arrow::TypeTraits::BuilderType(pool); + break; + + case arrow::Type::DOUBLE: + builder = new arrow::TypeTraits::BuilderType(pool); break; case arrow::Type::DECIMAL: + builder = new arrow::TypeTraits::BuilderType(pool, type); + break; + + case arrow::Type::BOOL: + builder = new arrow::TypeTraits::BuilderType(pool); + break; + + case arrow::Type::STRING: + builder = new arrow::TypeTraits::BuilderType(pool); + break; + + case arrow::Type::BINARY: + builder = new arrow::TypeTraits::BuilderType(pool); + break; +/* + case arrow::Type::FIXED_SIZE_BINARY: + builder = new arrow::TypeTraits::BuilderType(pool); break; case arrow::Type::LIST: + builder = new arrow::TypeTraits::BuilderType(pool); break; case arrow::Type::STRUCT: + builder = new arrow::TypeTraits::BuilderType(pool); break; case arrow::Type::UNION: + builder = new arrow::TypeTraits::BuilderType(pool); break; case arrow::Type::DICTIONARY: + builder = new arrow::TypeTraits::BuilderType(pool); break; +*/ + default: + throw std::exception("not implemented"); } - return builder; + return std::unique_ptr(builder); } void open_file(xlnt::streaming_workbook_reader &reader, pybind11::object file) @@ -165,13 +174,12 @@ pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader, std::shared_ptr schema; arrow::py::unwrap_schema(pyschema.ptr(), &schema); - auto column_types = extract_schema_types(schema); auto builders = std::vector>(); auto num_rows = std::int64_t(0); - for (auto type : column_types) + for (auto i = 0; i < schema->num_fields(); ++i) { - builders.push_back(make_array_builder(type)); + builders.push_back(make_array_builder(schema->field(i)->type())); } for (auto row = 0; row < max_rows; ++row) @@ -188,10 +196,10 @@ pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader, if (!reader.has_cell()) break; auto cell = reader.read_cell(); - auto column_type = column_types.at(column); + auto &column_type = schema->field(cell.column().index - 1)->type(); auto builder = builders.at(cell.column().index - 1).get(); - switch (column_type) + switch (column_type->id()) { case arrow::Type::NA: break; @@ -201,33 +209,43 @@ pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader, break; case arrow::Type::UINT8: + static_cast(builder)->Append(static_cast(cell.value())); break; case arrow::Type::INT8: + static_cast(builder)->Append(static_cast(cell.value())); break; case arrow::Type::UINT16: + static_cast(builder)->Append(static_cast(cell.value())); break; case arrow::Type::INT16: + static_cast(builder)->Append(static_cast(cell.value())); break; case arrow::Type::UINT32: + static_cast(builder)->Append(cell.value()); break; case arrow::Type::INT32: + static_cast(builder)->Append(cell.value()); break; case arrow::Type::UINT64: + static_cast(builder)->Append(cell.value()); break; case arrow::Type::INT64: + static_cast(builder)->Append(cell.value()); break; case arrow::Type::HALF_FLOAT: + static_cast(builder)->Append(static_cast(cell.value())); break; case arrow::Type::FLOAT: + static_cast(builder)->Append(cell.value()); break; case arrow::Type::DOUBLE: @@ -239,9 +257,11 @@ pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader, break; case arrow::Type::BINARY: + static_cast(builder)->Append(cell.value()); break; case arrow::Type::FIXED_SIZE_BINARY: + static_cast(builder)->Append(cell.value()); break; case arrow::Type::DATE32: @@ -249,34 +269,47 @@ pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader, break; case arrow::Type::DATE64: + static_cast(builder)->Append(cell.value()); break; case arrow::Type::TIMESTAMP: + static_cast(builder)->Append(cell.value()); break; case arrow::Type::TIME32: + static_cast(builder)->Append(cell.value()); break; case arrow::Type::TIME64: + static_cast(builder)->Append(cell.value()); break; - +/* case arrow::Type::INTERVAL: + static_cast(builder)->Append(cell.value()); break; case arrow::Type::DECIMAL: + static_cast(builder)->Append(cell.value()); break; case arrow::Type::LIST: + static_cast(builder)->Append(cell.value()); break; case arrow::Type::STRUCT: + static_cast(builder)->Append(cell.value()); break; case arrow::Type::UNION: + static_cast(builder)->Append(cell.value()); break; case arrow::Type::DICTIONARY: + static_cast(builder)->Append(cell.value()); break; +*/ + default: + throw std::exception("not implemented"); } } @@ -321,6 +354,18 @@ PYBIND11_MODULE(xlntpyarrow, m) { return cell.value(); }) + .def("value_bool", [](xlnt::cell &cell) + { + return cell.value(); + }) + .def("value_unsigned_int", [](xlnt::cell &cell) + { + return cell.value(); + }) + .def("value_long_double", [](xlnt::cell &cell) + { + return cell.value(); + }) .def("data_type", [](xlnt::cell &cell) { return cell.data_type();