begin implementing xlsx2arrow, fix msvc warnings, other stuff

This commit is contained in:
Thomas Fussell 2017-07-14 22:18:11 -07:00
parent 4367343e15
commit 5b95b3d463
6 changed files with 127 additions and 97 deletions

View File

@ -1,4 +1,6 @@
#include <iostream> #include <iostream>
#include <memory>
#include <xlnt/xlnt_config.hpp> #include <xlnt/xlnt_config.hpp>
namespace arrow { namespace arrow {
@ -6,10 +8,8 @@ class Table;
} }
namespace xlnt { namespace xlnt {
namespace arrow {
void XLNT_API xlsx2arrow(std::istream &s, ::arrow::Table &table); std::shared_ptr<arrow::Table> XLNT_API xlsx2arrow(std::istream &s);
void XLNT_API arrow2xlsx(const ::arrow::Table &table, std::ostream &s); void XLNT_API arrow2xlsx(std::shared_ptr<const arrow::Table> &table, std::ostream &s);
} // namespace arrow
} // namespace xlnt } // namespace xlnt

View File

@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.2) cmake_minimum_required(VERSION 3.2)
project(xlnt VERSION 0.9) project(xlnt VERSION 1.1)
set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_STANDARD_REQUIRED ON)

View File

@ -1,49 +1,148 @@
// Copyright (c) 2017 Thomas Fussell
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE
//
// @license: http://www.opensource.org/licenses/mit-license.php
// @author: see AUTHORS file
#pragma warning(push)
#pragma warning(disable: 4458)
#include <arrow/api.h> #include <arrow/api.h>
#pragma warning(pop)
#include <xlnt/cell/cell.hpp> #include <xlnt/cell/cell.hpp>
#include <xlnt/cell/cell_reference.hpp> #include <xlnt/cell/cell_reference.hpp>
#include <xlnt/utils/xlntarrow.hpp>
#include <xlnt/workbook/streaming_workbook_reader.hpp> #include <xlnt/workbook/streaming_workbook_reader.hpp>
#include <xlnt/workbook/streaming_workbook_writer.hpp> #include <xlnt/workbook/streaming_workbook_writer.hpp>
#include <xlnt/worksheet/worksheet.hpp> #include <xlnt/worksheet/worksheet.hpp>
#include <xlnt/utils/xlntarrow.hpp>
namespace {
/// Creates an empty Arrow array builder matching an xlnt cell type.
///
/// Mapping:
///   - number                                   -> DoubleBuilder (float64)
///   - inline_string, shared_string, error,
///     formula_string, empty                    -> StringBuilder
///   - boolean                                  -> BooleanBuilder
///   - date                                     -> Date32Builder
///
/// All builders allocate from arrow::default_memory_pool().
///
/// @param type  the xlnt cell data type of the column being converted
/// @return      an owning pointer to a freshly constructed builder
/// @throws xlnt::exception if `type` is not one of the enumerators above
std::unique_ptr<arrow::ArrayBuilder> make_array_builder(xlnt::cell::type type)
{
    auto *pool = arrow::default_memory_pool();

    switch (type)
    {
    case xlnt::cell::type::number:
        return std::make_unique<arrow::DoubleBuilder>(pool, arrow::float64());

    case xlnt::cell::type::inline_string:
    case xlnt::cell::type::shared_string:
    case xlnt::cell::type::error:
    case xlnt::cell::type::formula_string:
    case xlnt::cell::type::empty:
        return std::make_unique<arrow::StringBuilder>(pool);

    case xlnt::cell::type::boolean:
        return std::make_unique<arrow::BooleanBuilder>(pool, std::make_shared<arrow::BooleanType>());

    case xlnt::cell::type::date:
        return std::make_unique<arrow::Date32Builder>(pool);
    }

    // No `default:` label above so the compiler still warns when a new
    // enumerator is added; this throw keeps control from flowing off the end
    // of a non-void function (undefined behaviour, MSVC warning C4715).
    throw xlnt::exception("unhandled cell type");
}
/// Builds the Arrow schema field describing a column of the given cell type.
///
/// Mapping:
///   - number                                   -> float64
///   - inline_string, shared_string, error,
///     formula_string, empty                    -> StringType
///   - boolean                                  -> boolean
///   - date                                     -> date32
///
/// @param name  the column name to store in the field
/// @param type  the xlnt cell data type of the column
/// @return      an arrow::Field with the given name and the mapped Arrow type
/// @throws xlnt::exception if `type` is not one of the enumerators above
arrow::Field make_type_field(const std::string &name, xlnt::cell::type type)
{
    switch (type)
    {
    case xlnt::cell::type::number:
        return arrow::Field(name, arrow::float64());

    case xlnt::cell::type::inline_string:
    case xlnt::cell::type::shared_string:
    case xlnt::cell::type::error:
    case xlnt::cell::type::formula_string:
    case xlnt::cell::type::empty:
        return arrow::Field(name, std::make_shared<arrow::StringType>());

    case xlnt::cell::type::boolean:
        return arrow::Field(name, arrow::boolean());

    case xlnt::cell::type::date:
        return arrow::Field(name, arrow::date32());
    }

    // No `default:` label above so the compiler still warns when a new
    // enumerator is added; this throw keeps control from flowing off the end
    // of a non-void function (undefined behaviour, MSVC warning C4715).
    throw xlnt::exception("unhandled cell type");
}
} // namespace
namespace xlnt { namespace xlnt {
namespace arrow {
void XLNT_API xlsx2arrow(std::istream &s, ::arrow::Table &table) std::shared_ptr<arrow::Table> XLNT_API xlsx2arrow(std::istream &s)
{ {
xlnt::streaming_workbook_reader reader; xlnt::streaming_workbook_reader reader;
reader.open(s); reader.open(s);
reader.begin_worksheet(); reader.begin_worksheet();
int first_row = 0;
auto column_names = std::vector<std::string>();
auto columns = std::vector<std::unique_ptr<arrow::ArrayBuilder>>();
auto fields = std::vector<std::shared_ptr<arrow::Field>>();
auto arrow_check = [](arrow::Status s)
{
if (!s.ok())
{
throw xlnt::exception("conversion error");
}
};
while (reader.has_cell()) while (reader.has_cell())
{ {
auto cell = reader.read_cell(); auto cell = reader.read_cell();
if (first_row < 1) if (cell.row() == 1)
{ {
first_row = cell.row(); column_names.push_back(cell.value<std::string>());
} }
else if (cell.row() == 2)
if (cell.reference().row() % 1000 == 1)
{ {
std::cout << cell.reference().to_string() << std::endl; auto column_name = column_names.at(cell.column().index - 1);
auto field = make_type_field(column_name, cell.data_type());
fields.push_back(std::make_shared<arrow::Field>(field));
columns.push_back(make_array_builder(cell.data_type()));
} }
} }
reader.end_worksheet(); reader.end_worksheet();
auto schema = std::make_shared<arrow::Schema>(fields);
auto arrays = std::vector<std::shared_ptr<arrow::Array>>();
for (size_t i = 0; i != columns.size(); ++i)
{
std::shared_ptr<arrow::Array> array;
columns[i]->Finish(&array);
arrays.emplace_back(array);
}
std::shared_ptr<arrow::Table> table;
arrow_check(MakeTable(schema, arrays, &table));
return table;
} }
void XLNT_API arrow2xlsx(const ::arrow::Table &table, std::ostream &s) void XLNT_API arrow2xlsx(std::shared_ptr<const arrow::Table> &table, std::ostream &s)
{ {
xlnt::streaming_workbook_writer writer; xlnt::streaming_workbook_writer writer;
writer.open(s); writer.open(s);
writer.add_worksheet("Sheet1"); writer.add_worksheet("Sheet1");
writer.add_cell("A1").value("test");
for (auto i = 0; i < table->num_columns(); ++i)
{
auto column_name = table->schema()->field(i)->name();
writer.add_cell(xlnt::cell_reference(i + 1, 1)).value(column_name);
}
} }
} } // namespace xlnt
}

View File

@ -9,7 +9,6 @@
#include <Python.h> #include <Python.h>
namespace xlnt { namespace xlnt {
namespace arrow {
/// A stream buffer getting data from and putting data into a Python file object /// A stream buffer getting data from and putting data into a Python file object
/** The aims are as follow: /** The aims are as follow:
@ -84,7 +83,7 @@ namespace arrow {
Note: references are to the C++ standard (the numbers between parentheses Note: references are to the C++ standard (the numbers between parentheses
at the end of references are margin markers). at the end of references are margin markers).
*/ */
class streambuf : public std::basic_streambuf<char> class python_streambuf : public std::basic_streambuf<char>
{ {
private: private:
typedef std::basic_streambuf<char> base_t; typedef std::basic_streambuf<char> base_t;
@ -113,7 +112,7 @@ class streambuf : public std::basic_streambuf<char>
/// Construct from a Python file object /// Construct from a Python file object
/** if buffer_size is 0 the current default_buffer_size is used. /** if buffer_size is 0 the current default_buffer_size is used.
*/ */
streambuf( python_streambuf(
PyObject *python_file_obj, PyObject *python_file_obj,
std::size_t buffer_size_ = 0) std::size_t buffer_size_ = 0)
: :
@ -162,7 +161,7 @@ class streambuf : public std::basic_streambuf<char>
} }
/// Mundane destructor freeing the allocated resources /// Mundane destructor freeing the allocated resources
virtual ~streambuf() { virtual ~python_streambuf() {
if (write_buffer) delete[] write_buffer; if (write_buffer) delete[] write_buffer;
} }
@ -324,7 +323,7 @@ class streambuf : public std::basic_streambuf<char>
std::ios_base::openmode which= std::ios_base::in std::ios_base::openmode which= std::ios_base::in
| std::ios_base::out) | std::ios_base::out)
{ {
return streambuf::seekoff(sp, std::ios_base::beg, which); return python_streambuf::seekoff(sp, std::ios_base::beg, which);
} }
private: private:
@ -402,8 +401,8 @@ class streambuf : public std::basic_streambuf<char>
if (buf_sought < buf_begin || buf_sought >= upper_bound) return failure; if (buf_sought < buf_begin || buf_sought >= upper_bound) return failure;
// we are in wonderland // we are in wonderland
if (which == std::ios_base::in) gbump(buf_sought - buf_cur); if (which == std::ios_base::in) gbump(static_cast<int>(buf_sought - buf_cur));
else if (which == std::ios_base::out) pbump(buf_sought - buf_cur); else if (which == std::ios_base::out) pbump(static_cast<int>(buf_sought - buf_cur));
return pos_of_buffer_end_in_py_file + (buf_sought - buf_end); return pos_of_buffer_end_in_py_file + (buf_sought - buf_end);
} }
@ -415,73 +414,8 @@ class streambuf : public std::basic_streambuf<char>
return static_cast<T>(value); return static_cast<T>(value);
} }
public:
class istream : public std::istream
{
public:
istream(streambuf& buf) : std::istream(&buf)
{
exceptions(std::ios_base::badbit);
}
~istream() { if (this->good()) this->sync(); }
};
class ostream : public std::ostream
{
public:
ostream(streambuf& buf) : std::ostream(&buf)
{
exceptions(std::ios_base::badbit);
}
~ostream() { if (this->good()) this->flush(); }
};
}; };
std::size_t streambuf::default_buffer_size = 1024; std::size_t python_streambuf::default_buffer_size = 1024;
struct streambuf_capsule } // namespace xlnt
{
streambuf python_streambuf;
streambuf_capsule(
PyObject *python_file_obj,
std::size_t buffer_size=0)
:
python_streambuf(python_file_obj, buffer_size)
{}
};
struct ostream : private streambuf_capsule, streambuf::ostream
{
ostream(
PyObject *python_file_obj,
std::size_t buffer_size=0)
:
streambuf_capsule(python_file_obj, buffer_size),
streambuf::ostream(python_streambuf)
{}
~ostream()
{
if (this->good())
{
this->flush();
}
if (PyErr_Occurred() != nullptr)
{
PyErr_Clear();
throw std::runtime_error(
"Problem closing python ostream.\n"
" Known limitation: the error is unrecoverable. Sorry.\n"
" Suggestion for programmer: add ostream.flush() before"
" returning.");
}
}
};
}} // namespace xlnt::arrow

View File

@ -10,12 +10,9 @@
PyObject *xlsx2arrow(PyObject *file) PyObject *xlsx2arrow(PyObject *file)
{ {
xlnt::arrow::streambuf buffer(file); xlnt::python_streambuf buffer(file);
std::istream stream(&buffer); std::istream stream(&buffer);
std::shared_ptr<arrow::Schema> schema; auto table = xlnt::xlsx2arrow(stream);
std::vector<std::shared_ptr<arrow::Column>> columns;
arrow::Table table(schema, columns);
xlnt::arrow::xlsx2arrow(stream, table);
Py_RETURN_NONE; Py_RETURN_NONE;
} }