mirror of
https://github.com/tfussell/xlnt.git
synced 2024-03-22 13:11:17 +08:00
figured out the problem
This commit is contained in:
parent
de0e010056
commit
8801a0e352
|
@ -118,6 +118,12 @@ public:
|
||||||
/// </summary>
|
/// </summary>
|
||||||
void open(std::istream &stream);
|
void open(std::istream &stream);
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Holds the given streambuf internally, creates a std::istream backed
|
||||||
|
/// by the given buffer, and calls open(std::istream &) with that stream.
|
||||||
|
/// </summary>
|
||||||
|
void open(std::unique_ptr<std::streambuf> &&buffer);
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Returns a vector of the titles of sheets in the workbook in order.
|
/// Returns a vector of the titles of sheets in the workbook in order.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
|
|
|
@ -158,6 +158,13 @@ void streaming_workbook_reader::open(std::istream &stream)
|
||||||
const auto workbook_path = workbook_rel.target().path();
|
const auto workbook_path = workbook_rel.target().path();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void streaming_workbook_reader::open(std::unique_ptr<std::streambuf> &&buffer)
|
||||||
|
{
|
||||||
|
stream_buffer_.swap(buffer);
|
||||||
|
stream_.reset(new std::istream(stream_buffer_.get()));
|
||||||
|
open(*stream_);
|
||||||
|
}
|
||||||
|
|
||||||
std::vector<std::string> streaming_workbook_reader::sheet_titles()
|
std::vector<std::string> streaming_workbook_reader::sheet_titles()
|
||||||
{
|
{
|
||||||
return workbook_->sheet_titles();
|
return workbook_->sheet_titles();
|
||||||
|
|
|
@ -102,7 +102,6 @@ class python_streambuf : public std::basic_streambuf<char>
|
||||||
member function readsome to work correctly (c.f. 27.6.1.3, alinea 30)
|
member function readsome to work correctly (c.f. 27.6.1.3, alinea 30)
|
||||||
*/
|
*/
|
||||||
virtual std::streamsize showmanyc() {
|
virtual std::streamsize showmanyc() {
|
||||||
std::cout << "showmanyc" << std::endl;
|
|
||||||
int_type const failure = traits_type::eof();
|
int_type const failure = traits_type::eof();
|
||||||
int_type status = underflow();
|
int_type status = underflow();
|
||||||
if (status == failure) return -1;
|
if (status == failure) return -1;
|
||||||
|
@ -111,7 +110,6 @@ class python_streambuf : public std::basic_streambuf<char>
|
||||||
|
|
||||||
/// C.f. C++ standard section 27.5.2.4.3
|
/// C.f. C++ standard section 27.5.2.4.3
|
||||||
virtual int_type underflow() {
|
virtual int_type underflow() {
|
||||||
std::cout << "underflow" << std::endl;
|
|
||||||
int_type const failure = traits_type::eof();
|
int_type const failure = traits_type::eof();
|
||||||
if (py_read.is_none()) {
|
if (py_read.is_none()) {
|
||||||
throw std::invalid_argument(
|
throw std::invalid_argument(
|
||||||
|
@ -136,7 +134,6 @@ class python_streambuf : public std::basic_streambuf<char>
|
||||||
|
|
||||||
/// C.f. C++ standard section 27.5.2.4.5
|
/// C.f. C++ standard section 27.5.2.4.5
|
||||||
virtual int_type overflow(int_type c=traits_type_eof()) {
|
virtual int_type overflow(int_type c=traits_type_eof()) {
|
||||||
std::cout << "overflow" << std::endl;
|
|
||||||
if (py_write.is_none()) {
|
if (py_write.is_none()) {
|
||||||
throw std::invalid_argument(
|
throw std::invalid_argument(
|
||||||
"That Python file object has no 'write' attribute");
|
"That Python file object has no 'write' attribute");
|
||||||
|
@ -168,7 +165,6 @@ class python_streambuf : public std::basic_streambuf<char>
|
||||||
seek position in that read buffer.
|
seek position in that read buffer.
|
||||||
*/
|
*/
|
||||||
virtual int sync() {
|
virtual int sync() {
|
||||||
std::cout << "sync" << std::endl;
|
|
||||||
int result = 0;
|
int result = 0;
|
||||||
farthest_pptr = std::max(farthest_pptr, pptr());
|
farthest_pptr = std::max(farthest_pptr, pptr());
|
||||||
if (farthest_pptr && farthest_pptr > pbase()) {
|
if (farthest_pptr && farthest_pptr > pbase()) {
|
||||||
|
@ -201,7 +197,6 @@ class python_streambuf : public std::basic_streambuf<char>
|
||||||
std::ios_base::openmode which= std::ios_base::in
|
std::ios_base::openmode which= std::ios_base::in
|
||||||
| std::ios_base::out)
|
| std::ios_base::out)
|
||||||
{
|
{
|
||||||
std::cout << "seekoff" << std::endl;
|
|
||||||
/* In practice, "which" is either std::ios_base::in or out
|
/* In practice, "which" is either std::ios_base::in or out
|
||||||
since we end up here because either seekp or seekg was called
|
since we end up here because either seekp or seekg was called
|
||||||
on the stream using this buffer. That simplifies the code
|
on the stream using this buffer. That simplifies the code
|
||||||
|
@ -259,7 +254,6 @@ class python_streambuf : public std::basic_streambuf<char>
|
||||||
std::ios_base::openmode which= std::ios_base::in
|
std::ios_base::openmode which= std::ios_base::in
|
||||||
| std::ios_base::out)
|
| std::ios_base::out)
|
||||||
{
|
{
|
||||||
std::cout << "seekpos" << std::endl;
|
|
||||||
return python_streambuf::seekoff(sp, std::ios_base::beg, which);
|
return python_streambuf::seekoff(sp, std::ios_base::beg, which);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -37,24 +37,27 @@ def xlsx2arrow(io, sheetname):
|
||||||
column_names = []
|
column_names = []
|
||||||
fields = []
|
fields = []
|
||||||
batches = []
|
batches = []
|
||||||
|
schema = None
|
||||||
|
|
||||||
while reader.has_cell():
|
while reader.has_cell():
|
||||||
print('read_cell')
|
|
||||||
cell = reader.read_cell()
|
cell = reader.read_cell()
|
||||||
type = cell.data_type()
|
type = cell.data_type()
|
||||||
|
|
||||||
|
print('read_cell', cell.row(), cell.column())
|
||||||
|
|
||||||
if cell.row() == 1:
|
if cell.row() == 1:
|
||||||
column_names.push_back(cell.value_string())
|
column_names.append(cell.value_string())
|
||||||
continue
|
continue
|
||||||
elif cell.row() == 2:
|
elif cell.row() == 2:
|
||||||
column_name = column_names[cell.column() - 1]
|
column_name = column_names[cell.column() - 1]
|
||||||
fields.append(pa.Field(column_name, COLUMN_TYPE_FIELD[type]()))
|
fields.append(pa.field(column_name, COLUMN_TYPE_FIELD[type]()))
|
||||||
continue
|
continue
|
||||||
elif schema is None:
|
elif schema is None:
|
||||||
schema = pa.schema(fields)
|
schema = pa.schema(fields)
|
||||||
|
|
||||||
batch = xpa.read_batch(schema, 0)
|
print(schema)
|
||||||
print(batch)
|
|
||||||
|
batch = reader.read_batch(schema, 100000)
|
||||||
batches.append(batch)
|
batches.append(batch)
|
||||||
|
|
||||||
break
|
break
|
||||||
|
@ -65,4 +68,5 @@ def xlsx2arrow(io, sheetname):
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
file = open('tmp.xlsx', 'rb')
|
file = open('tmp.xlsx', 'rb')
|
||||||
print(xlsx2arrow(file, 'Sheet1'))
|
table = xlsx2arrow(file, 'Sheet1')
|
||||||
|
print(table.to_pandas())
|
||||||
|
|
|
@ -154,9 +154,7 @@ std::unique_ptr<arrow::ArrayBuilder> make_array_builder(arrow::Type::type type)
|
||||||
|
|
||||||
void open_file(xlnt::streaming_workbook_reader &reader, pybind11::object file)
|
void open_file(xlnt::streaming_workbook_reader &reader, pybind11::object file)
|
||||||
{
|
{
|
||||||
xlnt::python_streambuf buffer(file);
|
reader.open(std::unique_ptr<std::streambuf>(new xlnt::python_streambuf(file)));
|
||||||
std::istream stream(&buffer);
|
|
||||||
reader.open(stream);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader,
|
pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader,
|
||||||
|
@ -167,36 +165,29 @@ pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader,
|
||||||
std::shared_ptr<arrow::Schema> schema;
|
std::shared_ptr<arrow::Schema> schema;
|
||||||
arrow::py::unwrap_schema(pyschema.ptr(), &schema);
|
arrow::py::unwrap_schema(pyschema.ptr(), &schema);
|
||||||
|
|
||||||
std::cout << "1" << std::endl;
|
|
||||||
|
|
||||||
auto column_types = extract_schema_types(schema);
|
auto column_types = extract_schema_types(schema);
|
||||||
auto builders = std::vector<std::shared_ptr<arrow::ArrayBuilder>>();
|
auto builders = std::vector<std::shared_ptr<arrow::ArrayBuilder>>();
|
||||||
auto num_rows = std::int64_t(0);
|
auto num_rows = std::int64_t(0);
|
||||||
|
|
||||||
std::cout << "2" << std::endl;
|
|
||||||
|
|
||||||
for (auto type : column_types)
|
for (auto type : column_types)
|
||||||
{
|
{
|
||||||
builders.push_back(make_array_builder(type));
|
builders.push_back(make_array_builder(type));
|
||||||
}
|
}
|
||||||
|
|
||||||
std::cout << "3" << std::endl;
|
|
||||||
|
|
||||||
for (auto row = 0; row < max_rows; ++row)
|
for (auto row = 0; row < max_rows; ++row)
|
||||||
{
|
{
|
||||||
if (!reader.has_cell()) break;
|
if (!reader.has_cell()) break;
|
||||||
|
|
||||||
std::cout << "4" << std::endl;
|
if (row % 1000 == 0)
|
||||||
|
{
|
||||||
|
std::cout << row << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
for (auto column = 0; column < schema->num_fields(); ++column)
|
for (auto column = 0; column < schema->num_fields(); ++column)
|
||||||
{
|
{
|
||||||
if (!reader.has_cell()) break;
|
if (!reader.has_cell()) break;
|
||||||
|
|
||||||
std::cout << "5" << std::endl;
|
|
||||||
|
|
||||||
auto cell = reader.read_cell();
|
auto cell = reader.read_cell();
|
||||||
|
|
||||||
/*
|
|
||||||
auto column_type = column_types.at(column);
|
auto column_type = column_types.at(column);
|
||||||
auto builder = builders.at(cell.column().index - 1).get();
|
auto builder = builders.at(cell.column().index - 1).get();
|
||||||
|
|
||||||
|
@ -287,14 +278,11 @@ pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader,
|
||||||
case arrow::Type::DICTIONARY:
|
case arrow::Type::DICTIONARY:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
*/
|
|
||||||
}
|
}
|
||||||
|
|
||||||
++num_rows;
|
++num_rows;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::cout << "6" << std::endl;
|
|
||||||
|
|
||||||
auto columns = std::vector<std::shared_ptr<arrow::Array>>();
|
auto columns = std::vector<std::shared_ptr<arrow::Array>>();
|
||||||
|
|
||||||
for (auto &builder : builders)
|
for (auto &builder : builders)
|
||||||
|
@ -304,14 +292,10 @@ pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader,
|
||||||
columns.emplace_back(column);
|
columns.emplace_back(column);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::cout << "7" << std::endl;
|
|
||||||
|
|
||||||
auto batch_pointer = std::make_shared<arrow::RecordBatch>(schema, num_rows, columns);
|
auto batch_pointer = std::make_shared<arrow::RecordBatch>(schema, num_rows, columns);
|
||||||
auto batch_object = arrow::py::wrap_record_batch(batch_pointer);
|
auto batch_object = arrow::py::wrap_record_batch(batch_pointer);
|
||||||
auto batch_handle = pybind11::handle(batch_object); // don't need to incr. reference count, right?
|
auto batch_handle = pybind11::handle(batch_object); // don't need to incr. reference count, right?
|
||||||
|
|
||||||
std::cout << "8" << std::endl;
|
|
||||||
|
|
||||||
return batch_handle;
|
return batch_handle;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -330,10 +314,21 @@ PYBIND11_MODULE(xlntpyarrow, m)
|
||||||
.def("open", &open_file)
|
.def("open", &open_file)
|
||||||
.def("read_batch", &read_batch);
|
.def("read_batch", &read_batch);
|
||||||
|
|
||||||
|
pybind11::class_<xlnt::worksheet>(m, "Worksheet");
|
||||||
|
|
||||||
pybind11::class_<xlnt::cell> cell(m, "Cell");
|
pybind11::class_<xlnt::cell> cell(m, "Cell");
|
||||||
cell.def("value_string", [](xlnt::cell cell)
|
cell.def("value_string", [](xlnt::cell &cell)
|
||||||
{
|
{
|
||||||
return cell.value<std::string>();
|
return cell.value<std::string>();
|
||||||
|
})
|
||||||
|
.def("data_type", [](xlnt::cell &cell)
|
||||||
|
{
|
||||||
|
return cell.data_type();
|
||||||
|
})
|
||||||
|
.def("row", &xlnt::cell::row)
|
||||||
|
.def("column", [](xlnt::cell &cell)
|
||||||
|
{
|
||||||
|
return cell.column().index;
|
||||||
});
|
});
|
||||||
|
|
||||||
pybind11::enum_<xlnt::cell::type>(cell, "Type")
|
pybind11::enum_<xlnt::cell::type>(cell, "Type")
|
||||||
|
|
Loading…
Reference in New Issue
Block a user