diff --git a/.gitignore b/.gitignore index 0844650f..98a5778a 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,5 @@ node_modules/ __pycache__/ Win32/ *.pyd +python/record.txt +python/xlntpyarrow.egg-info/ diff --git a/python/xlntpyarrow.lib.cpp b/python/xlntpyarrow.lib.cpp index a860ac95..230cd8e2 100644 --- a/python/xlntpyarrow.lib.cpp +++ b/python/xlntpyarrow.lib.cpp @@ -45,12 +45,12 @@ void import_pyarrow() } } -std::unique_ptr make_array_builder(std::shared_ptr &type) +arrow::ArrayBuilder *make_array_builder(arrow::Type::type type) { auto pool = arrow::default_memory_pool(); auto builder = static_cast(nullptr); - switch(type->id()) + switch(type) { case arrow::Type::NA: break; @@ -118,11 +118,11 @@ std::unique_ptr make_array_builder(std::shared_ptr::BuilderType(pool); break; - +/* case arrow::Type::DECIMAL: builder = new arrow::TypeTraits::BuilderType(pool, type); break; - +*/ case arrow::Type::BOOL: builder = new arrow::TypeTraits::BuilderType(pool); break; @@ -159,7 +159,7 @@ std::unique_ptr make_array_builder(std::shared_ptr(builder); + return builder; } void open_file(xlnt::streaming_workbook_reader &reader, pybind11::object file) @@ -167,6 +167,165 @@ void open_file(xlnt::streaming_workbook_reader &reader, pybind11::object file) reader.open(std::unique_ptr(new xlnt::python_streambuf(file))); } +template +T cell_value(xlnt::cell cell) +{ + return static_cast(cell.value()); +} + +// from https://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion +std::uint16_t float_to_half(float f) +{ + auto x = static_cast(f); + auto half = ((x >> 16) & 0x8000) + | ((((x & 0x7f800000) - 0x38000000) >> 13) & 0x7c00) + | ((x >> 13) & 0x03ff); + + return half; +} + +void append_cell_value(arrow::ArrayBuilder *builder, arrow::Type::type type, xlnt::cell cell) +{ + switch (type) + { + case arrow::Type::NA: + break; + + case arrow::Type::BOOL: + static_cast(builder) + ->Append(cell.value()); + break; + + case arrow::Type::UINT8: + static_cast(builder) + ->Append(cell_value(cell)); + break; + + case arrow::Type::INT8: + static_cast(builder) + ->Append(cell_value(cell)); + break; + + case arrow::Type::UINT16: + static_cast(builder) + ->Append(cell_value(cell)); + break; + + case arrow::Type::INT16: + static_cast(builder) + ->Append(cell_value(cell)); + break; + + case arrow::Type::UINT32: + static_cast(builder) + ->Append(cell_value(cell)); + break; + + case arrow::Type::INT32: + static_cast(builder) + ->Append(cell_value(cell)); + break; + + case arrow::Type::UINT64: + static_cast(builder) + ->Append(cell_value(cell)); + break; + + case arrow::Type::INT64: + static_cast(builder) + ->Append(cell_value(cell)); + break; + + case arrow::Type::HALF_FLOAT: + static_cast(builder) + ->Append(float_to_half(cell_value(cell))); + break; + + case arrow::Type::FLOAT: + static_cast(builder) + ->Append(cell_value(cell)); + break; + + case arrow::Type::DOUBLE: + static_cast(builder) + ->Append(cell_value(cell)); + break; + + case arrow::Type::STRING: + static_cast(builder) + ->Append(cell.value()); + break; + + case arrow::Type::BINARY: + static_cast(builder) + ->Append(cell.value()); + break; + + case arrow::Type::FIXED_SIZE_BINARY: + static_cast(builder) + ->Append(cell.value()); + break; + + case arrow::Type::DATE32: + static_cast(builder) + ->Append(cell_value(cell)); + break; + + case arrow::Type::DATE64: + static_cast(builder) + ->Append(cell_value(cell)); + break; + + case arrow::Type::TIMESTAMP: + static_cast(builder) + ->Append(cell_value(cell)); + break; + + case arrow::Type::TIME32: + static_cast(builder) + ->Append(cell_value(cell)); + break; + + case arrow::Type::TIME64: + static_cast(builder) + ->Append(cell_value(cell)); + break; +/* + case arrow::Type::INTERVAL: + static_cast(builder) + ->Append(cell_value(cell)); + break; + + case arrow::Type::DECIMAL: + static_cast(builder) + ->Append(cell.value()); + break; + + case arrow::Type::LIST: + static_cast(builder) + ->Append(cell.value()); + break; + + case arrow::Type::STRUCT: + static_cast(builder) + ->Append(cell.value()); + break; + + case arrow::Type::UNION: + static_cast(builder) + ->Append(cell.value()); + break; + + case arrow::Type::DICTIONARY: + static_cast(builder) + ->Append(cell.value()); + break; +*/ + default: + throw std::runtime_error("not implemented"); + } +} + pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader, pybind11::object pyschema, int max_rows) { @@ -175,146 +334,39 @@ pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader, std::shared_ptr schema; arrow::py::unwrap_schema(pyschema.ptr(), &schema); - auto builders = std::vector>(); - auto num_rows = std::int64_t(0); + std::vector column_types; for (auto i = 0; i < schema->num_fields(); ++i) { - builders.push_back(make_array_builder(schema->field(i)->type())); + column_types.push_back(schema->field(i)->type()->id()); } - for (auto row = 0; row < max_rows; ++row) + auto builders = std::vector>(); + + for (auto type : column_types) + { + builders.emplace_back(make_array_builder(type)); + } + + auto row = std::int64_t(0); + + while (row < max_rows) { if (!reader.has_cell()) break; - if (row % 1000 == 0) - { - std::cout << row << std::endl; - } - for (auto column = 0; column < schema->num_fields(); ++column) { if (!reader.has_cell()) break; auto cell = reader.read_cell(); - auto &column_type = schema->field(cell.column().index - 1)->type(); - auto builder = builders.at(cell.column().index - 1).get(); + auto zero_indexed_column = cell.column().index - 1; + auto column_type = column_types.at(zero_indexed_column); + auto builder = builders.at(zero_indexed_column).get(); - switch (column_type->id()) - { - case arrow::Type::NA: - break; - - case arrow::Type::BOOL: - static_cast(builder)->Append(cell.value()); - break; - - case arrow::Type::UINT8: - static_cast(builder)->Append(static_cast(cell.value())); - break; - - case arrow::Type::INT8: - static_cast(builder)->Append(static_cast(cell.value())); - break; - - case arrow::Type::UINT16: - static_cast(builder)->Append(static_cast(cell.value())); - break; - - case arrow::Type::INT16: - static_cast(builder)->Append(static_cast(cell.value())); - break; - - case arrow::Type::UINT32: - static_cast(builder)->Append(cell.value()); - break; - - case arrow::Type::INT32: - static_cast(builder)->Append(cell.value()); - break; - - case arrow::Type::UINT64: - static_cast(builder)->Append(cell.value()); - break; - - case arrow::Type::INT64: - static_cast(builder)->Append(cell.value()); - break; - - case arrow::Type::HALF_FLOAT: - static_cast(builder)->Append(static_cast(cell.value())); - break; - - case arrow::Type::FLOAT: - static_cast(builder)->Append(cell.value()); - break; - - case arrow::Type::DOUBLE: - static_cast(builder)->Append(cell.value()); - break; - - case arrow::Type::STRING: - static_cast(builder)->Append(cell.value()); - break; - - case arrow::Type::BINARY: - static_cast(builder)->Append(cell.value()); - break; - - case arrow::Type::FIXED_SIZE_BINARY: - static_cast(builder)->Append(cell.value()); - break; - - case arrow::Type::DATE32: - static_cast(builder)->Append(cell.value()); - break; - - case arrow::Type::DATE64: - static_cast(builder)->Append(cell.value()); - break; - - case arrow::Type::TIMESTAMP: - static_cast(builder)->Append(cell.value()); - break; - - case arrow::Type::TIME32: - static_cast(builder)->Append(cell.value()); - break; - - case arrow::Type::TIME64: - static_cast(builder)->Append(cell.value()); - break; -/* - case arrow::Type::INTERVAL: - static_cast(builder)->Append(cell.value()); - break; - - case arrow::Type::DECIMAL: - static_cast(builder)->Append(cell.value()); - break; - - case arrow::Type::LIST: - static_cast(builder)->Append(cell.value()); - break; - - case arrow::Type::STRUCT: - static_cast(builder)->Append(cell.value()); - break; - - case arrow::Type::UNION: - static_cast(builder)->Append(cell.value()); - break; - - case arrow::Type::DICTIONARY: - static_cast(builder)->Append(cell.value()); - break; -*/ - default: - throw std::runtime_error("not implemented"); - } + append_cell_value(builder, column_type, cell); } - ++num_rows; + ++row; } auto columns = std::vector>(); @@ -326,7 +378,7 @@ pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader, columns.emplace_back(column); } - auto batch_pointer = std::make_shared(schema, num_rows, columns); + auto batch_pointer = std::make_shared(schema, row, columns); auto batch_object = arrow::py::wrap_record_batch(batch_pointer); auto batch_handle = pybind11::handle(batch_object); // don't need to incr. reference count, right?