implement other type builders

This commit is contained in:
Thomas Fussell 2017-08-01 10:58:47 -07:00
parent 8801a0e352
commit 2aa9e62e62
3 changed files with 125 additions and 64 deletions

View File

@ -12,6 +12,10 @@ endif()
pybind11_add_module(xlntpyarrow xlntpyarrow.cpp) pybind11_add_module(xlntpyarrow xlntpyarrow.cpp)
if(MSVC)
target_compile_definitions(xlntpyarrow PRIVATE _CRT_SECURE_NO_WARNINGS=1)
endif()
target_include_directories(xlntpyarrow target_include_directories(xlntpyarrow
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../source PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../source

View File

@ -1,7 +1,7 @@
import pyarrow as pa import pyarrow as pa
print('pyarrow loaded')
import xlntpyarrow as xpa import xlntpyarrow as xpa
print('xlntpyarrow loaded')
print(xpa)
COLUMN_TYPE_FIELD = { COLUMN_TYPE_FIELD = {
xpa.Cell.Type.Number: pa.float64, xpa.Cell.Type.Number: pa.float64,
@ -14,14 +14,29 @@ COLUMN_TYPE_FIELD = {
xpa.Cell.Type.Empty: pa.string, xpa.Cell.Type.Empty: pa.string,
} }
def cell_to_pyarrow_array(cell, type):
    """Wrap the value of a single cell in a one-element pyarrow array.

    Args:
        cell: object exposing ``data_type()`` plus the ``value_long_double()``,
            ``value_string()``, ``value_bool()`` and ``value_unsigned_int()``
            accessors (the ``xpa.Cell`` binding).
        type: forwarded unchanged as the second argument of ``pa.array``.
            The name shadows the builtin but is kept so keyword callers
            continue to work.

    Returns:
        The result of ``pa.array([value], type)``, where ``value`` is read
        with the accessor that matches the cell's data type.

    Raises:
        ValueError: if ``cell.data_type()`` is not one of the
            ``xpa.Cell.Type`` members handled below. Previously this case
            fell through and silently returned ``None``.
    """
    kinds = xpa.Cell.Type
    # Query the binding once instead of once per comparison.
    cell_type = cell.data_type()

    if cell_type == kinds.Number:
        value = cell.value_long_double()
    elif cell_type == kinds.Boolean:
        value = cell.value_bool()
    elif cell_type == kinds.Date:
        value = cell.value_unsigned_int()
    elif cell_type in (
        kinds.SharedString,
        kinds.InlineString,
        kinds.FormulaString,
        kinds.Error,
        kinds.Empty,
    ):
        # Every remaining handled type is read through its string form.
        value = cell.value_string()
    else:
        # Fail here rather than handing None to the caller, which would
        # only surface later as an unrelated pyarrow error.
        raise ValueError('unsupported cell data type: %r' % (cell_type,))

    return pa.array([value], type)
def xlsx2arrow(io, sheetname): def xlsx2arrow(io, sheetname):
reader = xpa.StreamingWorkbookReader() reader = xpa.StreamingWorkbookReader()
reader.open(io) reader.open(io)
print('after open')
print('before titles')
sheet_titles = reader.sheet_titles() sheet_titles = reader.sheet_titles()
print('after titles', sheet_titles)
sheet_title = sheet_titles[0] sheet_title = sheet_titles[0]
if sheetname is not None: if sheetname is not None:
@ -30,37 +45,34 @@ def xlsx2arrow(io, sheetname):
elif isinstance(sheetname, str): elif isinstance(sheetname, str):
sheet_title = sheetname sheet_title = sheetname
print('before begin', sheet_title)
reader.begin_worksheet(sheet_title) reader.begin_worksheet(sheet_title)
print('after begin', sheet_title)
column_names = [] column_names = []
fields = [] fields = []
batches = [] batches = []
schema = None schema = None
first_batch = []
while reader.has_cell(): while reader.has_cell():
cell = reader.read_cell() cell = reader.read_cell()
type = cell.data_type() type = cell.data_type()
print('read_cell', cell.row(), cell.column())
if cell.row() == 1: if cell.row() == 1:
column_names.append(cell.value_string()) column_names.append(cell.value_string())
continue continue
elif cell.row() == 2: elif cell.row() == 2:
column_name = column_names[cell.column() - 1] column_name = column_names[cell.column() - 1]
fields.append(pa.field(column_name, COLUMN_TYPE_FIELD[type]())) fields.append(pa.field(column_name, COLUMN_TYPE_FIELD[type]()))
first_batch.append(cell_to_pyarrow_array(cell, fields[-1].type))
continue continue
elif schema is None: elif schema is None:
schema = pa.schema(fields) schema = pa.schema(fields)
batches.append(pa.RecordBatch.from_arrays(first_batch, column_names))
print(schema) print(schema)
print(batches[0])
batch = reader.read_batch(schema, 100000) batches.append(reader.read_batch(schema, 100000))
batches.append(batch)
break
reader.end_worksheet() reader.end_worksheet()

View File

@ -44,112 +44,121 @@ void import_pyarrow()
} }
} }
std::vector<arrow::Type::type> extract_schema_types(std::shared_ptr<arrow::Schema> &schema) std::unique_ptr<arrow::ArrayBuilder> make_array_builder(std::shared_ptr<arrow::DataType> &type)
{ {
auto types = std::vector<arrow::Type::type>();
for (auto i = 0; i < schema->num_fields(); ++i)
{
types.push_back(schema->field(i)->type()->id());
}
return types;
}
std::unique_ptr<arrow::ArrayBuilder> make_array_builder(arrow::Type::type type)
{
std::unique_ptr<arrow::ArrayBuilder> builder;
auto pool = arrow::default_memory_pool(); auto pool = arrow::default_memory_pool();
auto builder = static_cast<arrow::ArrayBuilder *>(nullptr);
switch (type) switch(type->id())
{ {
case arrow::Type::NA: case arrow::Type::NA:
break; break;
case arrow::Type::BOOL:
builder.reset(new arrow::BooleanBuilder(pool));
break;
case arrow::Type::UINT8: case arrow::Type::UINT8:
builder = new arrow::TypeTraits<arrow::UInt8Type>::BuilderType(pool);
break; break;
case arrow::Type::INT8: case arrow::Type::INT8:
builder = new arrow::TypeTraits<arrow::Int8Type>::BuilderType(pool);
break; break;
case arrow::Type::UINT16: case arrow::Type::UINT16:
builder = new arrow::TypeTraits<arrow::UInt16Type>::BuilderType(pool);
break; break;
case arrow::Type::INT16: case arrow::Type::INT16:
builder = new arrow::TypeTraits<arrow::Int16Type>::BuilderType(pool);
break; break;
case arrow::Type::UINT32: case arrow::Type::UINT32:
builder = new arrow::TypeTraits<arrow::UInt32Type>::BuilderType(pool);
break; break;
case arrow::Type::INT32: case arrow::Type::INT32:
builder = new arrow::TypeTraits<arrow::Int32Type>::BuilderType(pool);
break; break;
case arrow::Type::UINT64: case arrow::Type::UINT64:
builder = new arrow::TypeTraits<arrow::UInt64Type>::BuilderType(pool);
break; break;
case arrow::Type::INT64: case arrow::Type::INT64:
break; builder = new arrow::TypeTraits<arrow::Int64Type>::BuilderType(pool);
case arrow::Type::HALF_FLOAT:
break;
case arrow::Type::FLOAT:
break;
case arrow::Type::DOUBLE:
builder.reset(new arrow::DoubleBuilder(pool));
break;
case arrow::Type::STRING:
builder.reset(new arrow::StringBuilder(pool));
break;
case arrow::Type::BINARY:
break;
case arrow::Type::FIXED_SIZE_BINARY:
break;
case arrow::Type::DATE32:
builder.reset(new arrow::Date32Builder(pool));
break; break;
case arrow::Type::DATE64: case arrow::Type::DATE64:
builder = new arrow::TypeTraits<arrow::Date64Type>::BuilderType(pool);
break; break;
case arrow::Type::DATE32:
builder = new arrow::TypeTraits<arrow::Date32Type>::BuilderType(pool);
break;
/*
case arrow::Type::TIMESTAMP: case arrow::Type::TIMESTAMP:
builder = new arrow::TypeTraits<arrow::TimestampType>::BuilderType(pool);
break; break;
case arrow::Type::TIME32: case arrow::Type::TIME32:
builder = new arrow::TypeTraits<arrow::Time32Type>::BuilderType(pool);
break; break;
case arrow::Type::TIME64: case arrow::Type::TIME64:
builder = new arrow::TypeTraits<arrow::Time64Type>::BuilderType(pool);
break;
*/
case arrow::Type::HALF_FLOAT:
builder = new arrow::TypeTraits<arrow::HalfFloatType>::BuilderType(pool);
break; break;
case arrow::Type::INTERVAL: case arrow::Type::FLOAT:
builder = new arrow::TypeTraits<arrow::FloatType>::BuilderType(pool);
break;
case arrow::Type::DOUBLE:
builder = new arrow::TypeTraits<arrow::DoubleType>::BuilderType(pool);
break; break;
case arrow::Type::DECIMAL: case arrow::Type::DECIMAL:
builder = new arrow::TypeTraits<arrow::DecimalType>::BuilderType(pool, type);
break;
case arrow::Type::BOOL:
builder = new arrow::TypeTraits<arrow::BooleanType>::BuilderType(pool);
break;
case arrow::Type::STRING:
builder = new arrow::TypeTraits<arrow::StringType>::BuilderType(pool);
break;
case arrow::Type::BINARY:
builder = new arrow::TypeTraits<arrow::BinaryType>::BuilderType(pool);
break;
/*
case arrow::Type::FIXED_SIZE_BINARY:
builder = new arrow::TypeTraits<arrow::FixedSizeBinaryType>::BuilderType(pool);
break; break;
case arrow::Type::LIST: case arrow::Type::LIST:
builder = new arrow::TypeTraits<arrow::ListType>::BuilderType(pool);
break; break;
case arrow::Type::STRUCT: case arrow::Type::STRUCT:
builder = new arrow::TypeTraits<arrow::StructType>::BuilderType(pool);
break; break;
case arrow::Type::UNION: case arrow::Type::UNION:
builder = new arrow::TypeTraits<arrow::UnionType>::BuilderType(pool);
break; break;
case arrow::Type::DICTIONARY: case arrow::Type::DICTIONARY:
builder = new arrow::TypeTraits<arrow::DictionaryType>::BuilderType(pool);
break; break;
*/
default:
throw std::exception("not implemented");
} }
return builder; return std::unique_ptr<arrow::ArrayBuilder>(builder);
} }
void open_file(xlnt::streaming_workbook_reader &reader, pybind11::object file) void open_file(xlnt::streaming_workbook_reader &reader, pybind11::object file)
@ -165,13 +174,12 @@ pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader,
std::shared_ptr<arrow::Schema> schema; std::shared_ptr<arrow::Schema> schema;
arrow::py::unwrap_schema(pyschema.ptr(), &schema); arrow::py::unwrap_schema(pyschema.ptr(), &schema);
auto column_types = extract_schema_types(schema);
auto builders = std::vector<std::shared_ptr<arrow::ArrayBuilder>>(); auto builders = std::vector<std::shared_ptr<arrow::ArrayBuilder>>();
auto num_rows = std::int64_t(0); auto num_rows = std::int64_t(0);
for (auto type : column_types) for (auto i = 0; i < schema->num_fields(); ++i)
{ {
builders.push_back(make_array_builder(type)); builders.push_back(make_array_builder(schema->field(i)->type()));
} }
for (auto row = 0; row < max_rows; ++row) for (auto row = 0; row < max_rows; ++row)
@ -188,10 +196,10 @@ pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader,
if (!reader.has_cell()) break; if (!reader.has_cell()) break;
auto cell = reader.read_cell(); auto cell = reader.read_cell();
auto column_type = column_types.at(column); auto &column_type = schema->field(cell.column().index - 1)->type();
auto builder = builders.at(cell.column().index - 1).get(); auto builder = builders.at(cell.column().index - 1).get();
switch (column_type) switch (column_type->id())
{ {
case arrow::Type::NA: case arrow::Type::NA:
break; break;
@ -201,33 +209,43 @@ pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader,
break; break;
case arrow::Type::UINT8: case arrow::Type::UINT8:
static_cast<arrow::UInt8Builder *>(builder)->Append(static_cast<std::uint8_t>(cell.value<unsigned int>()));
break; break;
case arrow::Type::INT8: case arrow::Type::INT8:
static_cast<arrow::Int8Builder *>(builder)->Append(static_cast<std::int8_t>(cell.value<int>()));
break; break;
case arrow::Type::UINT16: case arrow::Type::UINT16:
static_cast<arrow::UInt16Builder *>(builder)->Append(static_cast<std::uint16_t>(cell.value<unsigned int>()));
break; break;
case arrow::Type::INT16: case arrow::Type::INT16:
static_cast<arrow::Int16Builder *>(builder)->Append(static_cast<std::int16_t>(cell.value<int>()));
break; break;
case arrow::Type::UINT32: case arrow::Type::UINT32:
static_cast<arrow::UInt32Builder *>(builder)->Append(cell.value<std::uint32_t>());
break; break;
case arrow::Type::INT32: case arrow::Type::INT32:
static_cast<arrow::Int32Builder *>(builder)->Append(cell.value<std::int32_t>());
break; break;
case arrow::Type::UINT64: case arrow::Type::UINT64:
static_cast<arrow::UInt64Builder *>(builder)->Append(cell.value<std::uint64_t>());
break; break;
case arrow::Type::INT64: case arrow::Type::INT64:
static_cast<arrow::Int64Builder *>(builder)->Append(cell.value<std::int64_t>());
break; break;
case arrow::Type::HALF_FLOAT: case arrow::Type::HALF_FLOAT:
static_cast<arrow::HalfFloatBuilder *>(builder)->Append(static_cast<unsigned short>(cell.value<float>()));
break; break;
case arrow::Type::FLOAT: case arrow::Type::FLOAT:
static_cast<arrow::FloatBuilder *>(builder)->Append(cell.value<float>());
break; break;
case arrow::Type::DOUBLE: case arrow::Type::DOUBLE:
@ -239,9 +257,11 @@ pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader,
break; break;
case arrow::Type::BINARY: case arrow::Type::BINARY:
static_cast<arrow::BinaryBuilder *>(builder)->Append(cell.value<std::string>());
break; break;
case arrow::Type::FIXED_SIZE_BINARY: case arrow::Type::FIXED_SIZE_BINARY:
static_cast<arrow::FixedSizeBinaryBuilder *>(builder)->Append(cell.value<std::string>());
break; break;
case arrow::Type::DATE32: case arrow::Type::DATE32:
@ -249,34 +269,47 @@ pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader,
break; break;
case arrow::Type::DATE64: case arrow::Type::DATE64:
static_cast<arrow::Date64Builder *>(builder)->Append(cell.value<std::int64_t>());
break; break;
case arrow::Type::TIMESTAMP: case arrow::Type::TIMESTAMP:
static_cast<arrow::TimestampBuilder *>(builder)->Append(cell.value<std::int64_t>());
break; break;
case arrow::Type::TIME32: case arrow::Type::TIME32:
static_cast<arrow::Time32Builder *>(builder)->Append(cell.value<int>());
break; break;
case arrow::Type::TIME64: case arrow::Type::TIME64:
static_cast<arrow::Time64Builder *>(builder)->Append(cell.value<std::int64_t>());
break; break;
/*
case arrow::Type::INTERVAL: case arrow::Type::INTERVAL:
static_cast<arrow::IntervalBuilder *>(builder)->Append(cell.value<std::int64_t>());
break; break;
case arrow::Type::DECIMAL: case arrow::Type::DECIMAL:
static_cast<arrow::DecimalBuilder *>(builder)->Append(cell.value<std::string>());
break; break;
case arrow::Type::LIST: case arrow::Type::LIST:
static_cast<arrow::ListBuilder *>(builder)->Append(cell.value<std::string>());
break; break;
case arrow::Type::STRUCT: case arrow::Type::STRUCT:
static_cast<arrow::StructBuilder *>(builder)->Append(cell.value<std::string>());
break; break;
case arrow::Type::UNION: case arrow::Type::UNION:
static_cast<arrow::UnionBuilder *>(builder)->Append(cell.value<std::string>());
break; break;
case arrow::Type::DICTIONARY: case arrow::Type::DICTIONARY:
static_cast<arrow::DictionaryBuilder *>(builder)->Append(cell.value<std::string>());
break; break;
*/
default:
throw std::exception("not implemented");
} }
} }
@ -321,6 +354,18 @@ PYBIND11_MODULE(xlntpyarrow, m)
{ {
return cell.value<std::string>(); return cell.value<std::string>();
}) })
.def("value_bool", [](xlnt::cell &cell)
{
return cell.value<bool>();
})
.def("value_unsigned_int", [](xlnt::cell &cell)
{
return cell.value<unsigned int>();
})
.def("value_long_double", [](xlnt::cell &cell)
{
return cell.value<long double>();
})
.def("data_type", [](xlnt::cell &cell) .def("data_type", [](xlnt::cell &cell)
{ {
return cell.data_type(); return cell.data_type();