mirror of
https://github.com/tfussell/xlnt.git
synced 2024-03-22 13:11:17 +08:00
figured out the problem
This commit is contained in:
parent
de0e010056
commit
8801a0e352
@ -118,6 +118,12 @@ public:
|
||||
/// </summary>
|
||||
void open(std::istream &stream);
|
||||
|
||||
/// <summary>
|
||||
/// Holds the given streambuf internally, creates a std::istream backed
|
||||
/// by the given buffer, and calls open(std::istream &) with that stream.
|
||||
/// </summary>
|
||||
void open(std::unique_ptr<std::streambuf> &&buffer);
|
||||
|
||||
/// <summary>
|
||||
/// Returns a vector of the titles of sheets in the workbook in order.
|
||||
/// </summary>
|
||||
|
@ -158,6 +158,13 @@ void streaming_workbook_reader::open(std::istream &stream)
|
||||
const auto workbook_path = workbook_rel.target().path();
|
||||
}
|
||||
|
||||
void streaming_workbook_reader::open(std::unique_ptr<std::streambuf> &&buffer)
|
||||
{
|
||||
stream_buffer_.swap(buffer);
|
||||
stream_.reset(new std::istream(stream_buffer_.get()));
|
||||
open(*stream_);
|
||||
}
|
||||
|
||||
std::vector<std::string> streaming_workbook_reader::sheet_titles()
|
||||
{
|
||||
return workbook_->sheet_titles();
|
||||
|
@ -102,7 +102,6 @@ class python_streambuf : public std::basic_streambuf<char>
|
||||
member function readsome to work correctly (cf. 27.6.1.3, paragraph 30)
|
||||
*/
|
||||
virtual std::streamsize showmanyc() {
|
||||
std::cout << "showmanyc" << std::endl;
|
||||
int_type const failure = traits_type::eof();
|
||||
int_type status = underflow();
|
||||
if (status == failure) return -1;
|
||||
@ -111,7 +110,6 @@ class python_streambuf : public std::basic_streambuf<char>
|
||||
|
||||
/// C.f. C++ standard section 27.5.2.4.3
|
||||
virtual int_type underflow() {
|
||||
std::cout << "underflow" << std::endl;
|
||||
int_type const failure = traits_type::eof();
|
||||
if (py_read.is_none()) {
|
||||
throw std::invalid_argument(
|
||||
@ -136,7 +134,6 @@ class python_streambuf : public std::basic_streambuf<char>
|
||||
|
||||
/// C.f. C++ standard section 27.5.2.4.5
|
||||
virtual int_type overflow(int_type c=traits_type_eof()) {
|
||||
std::cout << "overflow" << std::endl;
|
||||
if (py_write.is_none()) {
|
||||
throw std::invalid_argument(
|
||||
"That Python file object has no 'write' attribute");
|
||||
@ -168,7 +165,6 @@ class python_streambuf : public std::basic_streambuf<char>
|
||||
seek position in that read buffer.
|
||||
*/
|
||||
virtual int sync() {
|
||||
std::cout << "sync" << std::endl;
|
||||
int result = 0;
|
||||
farthest_pptr = std::max(farthest_pptr, pptr());
|
||||
if (farthest_pptr && farthest_pptr > pbase()) {
|
||||
@ -201,7 +197,6 @@ class python_streambuf : public std::basic_streambuf<char>
|
||||
std::ios_base::openmode which= std::ios_base::in
|
||||
| std::ios_base::out)
|
||||
{
|
||||
std::cout << "seekoff" << std::endl;
|
||||
/* In practice, "which" is either std::ios_base::in or out
|
||||
since we end up here because either seekp or seekg was called
|
||||
on the stream using this buffer. That simplifies the code
|
||||
@ -259,7 +254,6 @@ class python_streambuf : public std::basic_streambuf<char>
|
||||
std::ios_base::openmode which= std::ios_base::in
|
||||
| std::ios_base::out)
|
||||
{
|
||||
std::cout << "seekpos" << std::endl;
|
||||
return python_streambuf::seekoff(sp, std::ios_base::beg, which);
|
||||
}
|
||||
|
||||
|
@ -37,24 +37,27 @@ def xlsx2arrow(io, sheetname):
|
||||
column_names = []
|
||||
fields = []
|
||||
batches = []
|
||||
schema = None
|
||||
|
||||
while reader.has_cell():
|
||||
print('read_cell')
|
||||
cell = reader.read_cell()
|
||||
type = cell.data_type()
|
||||
|
||||
print('read_cell', cell.row(), cell.column())
|
||||
|
||||
if cell.row() == 1:
|
||||
column_names.push_back(cell.value_string())
|
||||
column_names.append(cell.value_string())
|
||||
continue
|
||||
elif cell.row() == 2:
|
||||
column_name = column_names[cell.column() - 1]
|
||||
fields.append(pa.Field(column_name, COLUMN_TYPE_FIELD[type]()))
|
||||
fields.append(pa.field(column_name, COLUMN_TYPE_FIELD[type]()))
|
||||
continue
|
||||
elif schema is None:
|
||||
schema = pa.schema(fields)
|
||||
|
||||
batch = xpa.read_batch(schema, 0)
|
||||
print(batch)
|
||||
print(schema)
|
||||
|
||||
batch = reader.read_batch(schema, 100000)
|
||||
batches.append(batch)
|
||||
|
||||
break
|
||||
@ -65,4 +68,5 @@ def xlsx2arrow(io, sheetname):
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke test: convert 'Sheet1' of tmp.xlsx and print the result of
    # its to_pandas() call.
    # The file object is handed to xlsx2arrow exactly once; a second call on
    # the same handle would start from an already-consumed position.
    # The context manager closes the handle even if the conversion raises.
    with open('tmp.xlsx', 'rb') as file:
        table = xlsx2arrow(file, 'Sheet1')
        print(table.to_pandas())
|
||||
|
@ -154,9 +154,7 @@ std::unique_ptr<arrow::ArrayBuilder> make_array_builder(arrow::Type::type type)
|
||||
|
||||
/// Opens a Python file-like object on the given streaming reader.
///
/// The file is wrapped in a heap-allocated xlnt::python_streambuf whose
/// ownership is passed to the reader's open(std::unique_ptr<std::streambuf> &&)
/// overload, so the buffer stays alive for as long as the reader holds it.
///
/// The reader is opened exactly once. A streambuf/istream pair local to this
/// function must not be used here: both would be destroyed on return, while
/// has_cell()/read_cell() are called on the reader afterwards.
/// NOTE(review): this assumes the reader keeps reading from the stream after
/// open() returns; confirm against streaming_workbook_reader.
void open_file(xlnt::streaming_workbook_reader &reader, pybind11::object file)
{
    reader.open(std::unique_ptr<std::streambuf>(new xlnt::python_streambuf(file)));
}
|
||||
|
||||
pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader,
|
||||
@ -167,36 +165,29 @@ pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader,
|
||||
std::shared_ptr<arrow::Schema> schema;
|
||||
arrow::py::unwrap_schema(pyschema.ptr(), &schema);
|
||||
|
||||
std::cout << "1" << std::endl;
|
||||
|
||||
auto column_types = extract_schema_types(schema);
|
||||
auto builders = std::vector<std::shared_ptr<arrow::ArrayBuilder>>();
|
||||
auto num_rows = std::int64_t(0);
|
||||
|
||||
std::cout << "2" << std::endl;
|
||||
|
||||
for (auto type : column_types)
|
||||
{
|
||||
builders.push_back(make_array_builder(type));
|
||||
}
|
||||
|
||||
std::cout << "3" << std::endl;
|
||||
|
||||
for (auto row = 0; row < max_rows; ++row)
|
||||
{
|
||||
if (!reader.has_cell()) break;
|
||||
|
||||
std::cout << "4" << std::endl;
|
||||
if (row % 1000 == 0)
|
||||
{
|
||||
std::cout << row << std::endl;
|
||||
}
|
||||
|
||||
for (auto column = 0; column < schema->num_fields(); ++column)
|
||||
{
|
||||
if (!reader.has_cell()) break;
|
||||
|
||||
std::cout << "5" << std::endl;
|
||||
|
||||
auto cell = reader.read_cell();
|
||||
|
||||
/*
|
||||
auto column_type = column_types.at(column);
|
||||
auto builder = builders.at(cell.column().index - 1).get();
|
||||
|
||||
@ -287,14 +278,11 @@ pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader,
|
||||
case arrow::Type::DICTIONARY:
|
||||
break;
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
++num_rows;
|
||||
}
|
||||
|
||||
std::cout << "6" << std::endl;
|
||||
|
||||
auto columns = std::vector<std::shared_ptr<arrow::Array>>();
|
||||
|
||||
for (auto &builder : builders)
|
||||
@ -304,14 +292,10 @@ pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader,
|
||||
columns.emplace_back(column);
|
||||
}
|
||||
|
||||
std::cout << "7" << std::endl;
|
||||
|
||||
auto batch_pointer = std::make_shared<arrow::RecordBatch>(schema, num_rows, columns);
|
||||
auto batch_object = arrow::py::wrap_record_batch(batch_pointer);
|
||||
auto batch_handle = pybind11::handle(batch_object); // don't need to incr. reference count, right?
|
||||
|
||||
std::cout << "8" << std::endl;
|
||||
|
||||
return batch_handle;
|
||||
}
|
||||
|
||||
@ -330,11 +314,22 @@ PYBIND11_MODULE(xlntpyarrow, m)
|
||||
.def("open", &open_file)
|
||||
.def("read_batch", &read_batch);
|
||||
|
||||
pybind11::class_<xlnt::worksheet>(m, "Worksheet");
|
||||
|
||||
pybind11::class_<xlnt::cell> cell(m, "Cell");
|
||||
cell.def("value_string", [](xlnt::cell cell)
|
||||
cell.def("value_string", [](xlnt::cell &cell)
|
||||
{
|
||||
return cell.value<std::string>();
|
||||
});
|
||||
})
|
||||
.def("data_type", [](xlnt::cell &cell)
|
||||
{
|
||||
return cell.data_type();
|
||||
})
|
||||
.def("row", &xlnt::cell::row)
|
||||
.def("column", [](xlnt::cell &cell)
|
||||
{
|
||||
return cell.column().index;
|
||||
});
|
||||
|
||||
pybind11::enum_<xlnt::cell::type>(cell, "Type")
|
||||
.value("Empty", xlnt::cell::type::empty)
|
||||
|
Loading…
x
Reference in New Issue
Block a user