figured out the problem (the reader now owns the Python streambuf instead of borrowing a stack-local stream)

2024-03-22 13:11:17 +08:00 · 2017-07-30 20:32:37 -07:00 · 2017-07-30 20:32:37 -07:00 · 8801a0e352
commit 8801a0e352
parent de0e010056
5 changed files with 41 additions and 35 deletions
--- a/include/xlnt/workbook/streaming_workbook_reader.hpp
+++ b/include/xlnt/workbook/streaming_workbook_reader.hpp
@ -118,6 +118,12 @@ public:
    /// </summary>
    void open(std::istream &stream);

+    /// <summary>
+    /// Holds the given streambuf internally (swapping out any buffer held before),
+    /// creates a std::istream backed by it, and calls open(std::istream &) with it.
+    /// </summary>
+    void open(std::unique_ptr<std::streambuf> &&buffer);
+
    /// <summary>
    /// Returns a vector of the titles of sheets in the workbook in order.
    /// </summary>
--- a/source/workbook/streaming_workbook_reader.cpp
+++ b/source/workbook/streaming_workbook_reader.cpp
@ -158,6 +158,13 @@ void streaming_workbook_reader::open(std::istream &stream)
    const auto workbook_path = workbook_rel.target().path();
 }

+void streaming_workbook_reader::open(std::unique_ptr<std::streambuf> &&buffer) // opens from a streambuf that the reader keeps
+{
+    stream_buffer_.swap(buffer); // reader holds the new buffer; `buffer` receives whatever was held before
+    stream_.reset(new std::istream(stream_buffer_.get())); // stream_ now points at a fresh std::istream over the held buffer
+    open(*stream_); // delegate to the std::istream & overload
+}
+
 std::vector<std::string> streaming_workbook_reader::sheet_titles()
 {
    return workbook_->sheet_titles();
--- a/xlntpyarrow/python_streambuf.hpp
+++ b/xlntpyarrow/python_streambuf.hpp
@ -102,7 +102,6 @@ class python_streambuf : public std::basic_streambuf<char>
        member function readsome to work correctly (c.f. 27.6.1.3, alinea 30)
     */
    virtual std::streamsize showmanyc() {
-      std::cout << "showmanyc" << std::endl;
      int_type const failure = traits_type::eof();
      int_type status = underflow();
      if (status == failure) return -1;
@ -111,7 +110,6 @@ class python_streambuf : public std::basic_streambuf<char>

    /// C.f. C++ standard section 27.5.2.4.3
    virtual int_type underflow() {
-      std::cout << "underflow" << std::endl;
      int_type const failure = traits_type::eof();
      if (py_read.is_none()) {
        throw std::invalid_argument(
@ -136,7 +134,6 @@ class python_streambuf : public std::basic_streambuf<char>

    /// C.f. C++ standard section 27.5.2.4.5
    virtual int_type overflow(int_type c=traits_type_eof()) {
-      std::cout << "overflow" << std::endl;
      if (py_write.is_none()) {
        throw std::invalid_argument(
          "That Python file object has no 'write' attribute");
@ -168,7 +165,6 @@ class python_streambuf : public std::basic_streambuf<char>
        seek position in that read buffer.
    */
    virtual int sync() {
-      std::cout << "sync" << std::endl;
      int result = 0;
      farthest_pptr = std::max(farthest_pptr, pptr());
      if (farthest_pptr && farthest_pptr > pbase()) {
@ -201,7 +197,6 @@ class python_streambuf : public std::basic_streambuf<char>
                     std::ios_base::openmode which=  std::ios_base::in
                                                   | std::ios_base::out)
    {
-      std::cout << "seekoff" << std::endl;
      /* In practice, "which" is either std::ios_base::in or out
         since we end up here because either seekp or seekg was called
         on the stream using this buffer. That simplifies the code
@ -259,7 +254,6 @@ class python_streambuf : public std::basic_streambuf<char>
                     std::ios_base::openmode which=  std::ios_base::in
                                                   | std::ios_base::out)
    {
-      std::cout << "seekpos" << std::endl;
      return python_streambuf::seekoff(sp, std::ios_base::beg, which);
    }

--- a/xlntpyarrow/test.py
+++ b/xlntpyarrow/test.py
@ -37,24 +37,27 @@ def xlsx2arrow(io, sheetname):
    column_names = []
    fields = []
    batches = []
+    schema = None

    while reader.has_cell():
-        print('read_cell')
        cell = reader.read_cell()
        type = cell.data_type()

+        print('read_cell', cell.row(), cell.column())
+
        if cell.row() == 1:
-            column_names.push_back(cell.value_string())
+            column_names.append(cell.value_string())
            continue
        elif cell.row() == 2:
            column_name = column_names[cell.column() - 1]
-            fields.append(pa.Field(column_name, COLUMN_TYPE_FIELD[type]()))
+            fields.append(pa.field(column_name, COLUMN_TYPE_FIELD[type]()))
            continue
        elif schema is None:
            schema = pa.schema(fields)

-        batch = xpa.read_batch(schema, 0)
-        print(batch)
+        print(schema)
+
+        batch = reader.read_batch(schema, 100000)
        batches.append(batch)

        break
@ -65,4 +68,5 @@ def xlsx2arrow(io, sheetname):

 if __name__ == '__main__':
    file = open('tmp.xlsx', 'rb')
-    print(xlsx2arrow(file, 'Sheet1'))
+    table = xlsx2arrow(file, 'Sheet1')
+    print(table.to_pandas())
--- a/xlntpyarrow/xlntpyarrow.cpp
+++ b/xlntpyarrow/xlntpyarrow.cpp
@ -154,9 +154,7 @@ std::unique_ptr<arrow::ArrayBuilder> make_array_builder(arrow::Type::type type)

 void open_file(xlnt::streaming_workbook_reader &reader, pybind11::object file)
 {
-    xlnt::python_streambuf buffer(file);
-    std::istream stream(&buffer);
-    reader.open(stream);
+    reader.open(std::unique_ptr<std::streambuf>(new xlnt::python_streambuf(file))); // pass a heap-allocated buffer to the reader rather than a stack-local buffer/stream
 }

 pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader,
@ -167,36 +165,29 @@ pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader,
    std::shared_ptr<arrow::Schema> schema;
    arrow::py::unwrap_schema(pyschema.ptr(), &schema);

-    std::cout << "1" << std::endl;
-
    auto column_types = extract_schema_types(schema);
    auto builders = std::vector<std::shared_ptr<arrow::ArrayBuilder>>();
    auto num_rows = std::int64_t(0);

-    std::cout << "2" << std::endl;
-
    for (auto type : column_types)
    {
        builders.push_back(make_array_builder(type));
    }

-    std::cout << "3" << std::endl;
-
    for (auto row = 0; row < max_rows; ++row)
    {
        if (!reader.has_cell()) break;

-        std::cout << "4" << std::endl;
+        if (row % 1000 == 0)
+        {
+            std::cout << row << std::endl;
+        }

        for (auto column = 0; column < schema->num_fields(); ++column)
        {
            if (!reader.has_cell()) break;

-            std::cout << "5" << std::endl;
-
            auto cell = reader.read_cell();
-
-            /*
            auto column_type = column_types.at(column);
            auto builder = builders.at(cell.column().index - 1).get();

@ -287,14 +278,11 @@ pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader,
            case arrow::Type::DICTIONARY:
                break;
            }
-            */
        }

        ++num_rows;
    }

-    std::cout << "6" << std::endl;
-
    auto columns = std::vector<std::shared_ptr<arrow::Array>>();

    for (auto &builder : builders)
@ -304,14 +292,10 @@ pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader,
        columns.emplace_back(column);
    }

-    std::cout << "7" << std::endl;
-
    auto batch_pointer = std::make_shared<arrow::RecordBatch>(schema, num_rows, columns);
    auto batch_object = arrow::py::wrap_record_batch(batch_pointer);
    auto batch_handle = pybind11::handle(batch_object); // don't need to incr. reference count, right?

-    std::cout << "8" << std::endl;
-
    return batch_handle;
 }

@ -330,11 +314,22 @@ PYBIND11_MODULE(xlntpyarrow, m)
        .def("open", &open_file)
        .def("read_batch", &read_batch);

+    pybind11::class_<xlnt::worksheet>(m, "Worksheet");
+
    pybind11::class_<xlnt::cell> cell(m, "Cell");
-    cell.def("value_string", [](xlnt::cell cell)
+    cell.def("value_string", [](xlnt::cell &cell)
        {
            return cell.value<std::string>();
-        });
+        })
+        .def("data_type", [](xlnt::cell &cell)
+            {
+                return cell.data_type();
+            })
+        .def("row", &xlnt::cell::row)
+        .def("column", [](xlnt::cell &cell)
+            {
+                return cell.column().index;
+            });

    pybind11::enum_<xlnt::cell::type>(cell, "Type")
        .value("Empty", xlnt::cell::type::empty)