diff --git a/.gitignore b/.gitignore index 29b572bf..0844650f 100644 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,5 @@ node_modules/ *~ .DS_Store __pycache__/ -Win32/ \ No newline at end of file +Win32/ +*.pyd diff --git a/CMakeLists.txt b/CMakeLists.txt index f5ba09ec..c99fa633 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,6 +11,7 @@ option(STATIC "Set to ON to build xlnt as a static library instead of a shared l option(TESTS "Set to OFF to skip building test executable (in ./tests)" ON) option(SAMPLES "Set to ON to build executable code samples (in ./samples)" OFF) option(BENCHMARKS "Set to ON to build performance benchmarks (in ./benchmarks)" OFF) +option(ARROW "Set to ON to build Arrow conversion functions (in ./arrow/xlntarrow)" OFF) # Platform specific options if(NOT MSVC) @@ -30,4 +31,8 @@ if(TESTS) add_subdirectory(tests) endif() +if(ARROW) + add_subdirectory(arrow/xlntarrow) # keep in sync with the path named in the ARROW option text +endif() + add_subdirectory(source) diff --git a/README.md b/README.md index 62f14399..41800423 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,14 @@ xlnt
==== -[![Travis Build Status](https://travis-ci.org/tfussell/xlnt.svg)](https://travis-ci.org/tfussell/xlnt) +[![Travis Build Status](https://travis-ci.org/tfussell/xlnt.svg?branch=master)](https://travis-ci.org/tfussell/xlnt) [![AppVeyor Build status](https://ci.appveyor.com/api/projects/status/2hs79a1xoxy16sol?svg=true)](https://ci.appveyor.com/project/tfussell/xlnt) [![Coverage Status](https://coveralls.io/repos/github/tfussell/xlnt/badge.svg?branch=master)](https://coveralls.io/github/tfussell/xlnt?branch=master) [![ReadTheDocs Documentation Status](https://readthedocs.org/projects/xlnt/badge/?version=latest)](http://xlnt.readthedocs.org/en/latest/?badge=latest) [![License](http://img.shields.io/badge/license-MIT-blue.svg?style=flat)](http://opensource.org/licenses/MIT) ## Introduction -xlnt is a modern C++ library for manipulating spreadsheets in memory and reading/writing them from/to XLSX files as described in [ECMA 376 4th edition](http://www.ecma-international.org/publications/standards/Ecma-376.htm). xlnt is currently under active feature development and is on track for the version 1.0 release in the next few weeks. Until then, the API could have significant changes. For a high-level summary of what you can do with this library, see [the feature list](https://tfussell.gitbooks.io/xlnt/content/docs/introduction/Features.html). +xlnt is a modern C++ library for manipulating spreadsheets in memory and reading/writing them from/to XLSX files as described in [ECMA 376 4th edition](http://www.ecma-international.org/publications/standards/Ecma-376.htm). The first public release of xlnt version 1.0 was on May 10th, 2017. Current work is focused on increasing compatibility, improving performance, and brainstorming future development goals. For a high-level summary of what you can do with this library, see [the feature list](https://tfussell.gitbooks.io/xlnt/content/docs/introduction/Features.html). 
Contributions are welcome in the form of pull requests or discussions on [the repository's Issues page](https://github.com/tfussell/xlnt/issues). ## Example @@ -29,7 +29,7 @@ int main() wb.save("example.xlsx"); return 0; } -// compile with -std=c++14 -Ixlnt/include -Lxlnt/lib -lxlnt +// compile with -std=c++14 -Ixlnt/include -lxlnt ``` ## Documentation diff --git a/arrow/xlntarrow/CMakeLists.txt b/arrow/xlntarrow/CMakeLists.txt new file mode 100644 index 00000000..153cefc3 --- /dev/null +++ b/arrow/xlntarrow/CMakeLists.txt @@ -0,0 +1,15 @@ +cmake_minimum_required(VERSION 3.2) +project(xlntarrow) + +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +set(XLNT_ARROW + ${CMAKE_CURRENT_SOURCE_DIR}/xlntarrow.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/xlntarrow.cpp) + +add_library(xlntarrow SHARED ${XLNT_ARROW}) +target_link_libraries(xlntarrow PRIVATE xlnt) +target_include_directories(xlntarrow PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories(xlntarrow PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../../miniconda3/include) # NOTE(review): hard-coded path to a miniconda directory three levels up, presumably where the Arrow headers live - confirm / make configurable +target_include_directories(xlntarrow PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../include) diff --git a/arrow/xlntarrow/xlntarrow.cpp b/arrow/xlntarrow/xlntarrow.cpp new file mode 100644 index 00000000..917a4a06 --- /dev/null +++ b/arrow/xlntarrow/xlntarrow.cpp @@ -0,0 +1,39 @@ +#include +#include + +namespace xlnt { +namespace arrow { + +void xlsx2arrow(std::istream &s, ::arrow::Table &table) // walks every cell of the first worksheet begun after open(s); table is not written to yet +{ + xlnt::streaming_workbook_reader reader; + reader.open(s); + + reader.begin_worksheet(); + int first_row = 0; // assigned the row of a cell while still < 1; not read again in this function + + while (reader.has_cell()) + { + auto cell = reader.read_cell(); + + if (first_row < 1) + { + first_row = cell.row(); + } + + if (cell.reference().row() % 1000 == 1) // true for every cell located in rows 1, 1001, 2001, ... + { + std::cout << cell.reference().to_string() << std::endl; // progress output on stdout + } + } + + reader.end_worksheet(); +} + +void arrow2xlsx(const ::arrow::Table &table, std::istream &s) // empty placeholder. NOTE(review): s is a std::istream although this direction writes - confirm whether std::ostream was intended +{ + +} + +} +} diff --git a/arrow/xlntarrow/xlntarrow.hpp b/arrow/xlntarrow/xlntarrow.hpp new file mode
100644 index 00000000..8d3ae886 --- /dev/null +++ b/arrow/xlntarrow/xlntarrow.hpp @@ -0,0 +1,15 @@ +#pragma once + +#include +#include + +namespace xlnt { +namespace arrow { + +/// Reads XLSX data from s; table is presumably the destination of the conversion (confirm against the implementation). +void xlsx2arrow(std::istream &s, ::arrow::Table &table); +/// Counterpart for writing table as XLSX. NOTE(review): takes a std::istream although this direction writes - confirm whether std::ostream was intended. +void arrow2xlsx(const ::arrow::Table &table, std::istream &s); + +} +} diff --git a/arrow/xlntpyarrow/python_streambuf.hpp b/arrow/xlntpyarrow/python_streambuf.hpp new file mode 100644 index 00000000..6e8e3d66 --- /dev/null +++ b/arrow/xlntpyarrow/python_streambuf.hpp @@ -0,0 +1,487 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include + +namespace xlnt { +namespace arrow { + +/// A stream buffer getting data from and putting data into a Python file object +/** The aims are as follows: + + - Given a C++ function acting on a standard stream, e.g. + + \code + void read_inputs(std::istream& input) { + ... + input >> something >> something_else; + } + \endcode + + and given a piece of Python code which creates a file-like object, + to be able to pass this file object to that C++ function, e.g. + + \code + import gzip + gzip_file_obj = gzip.GzipFile(...) + read_inputs(gzip_file_obj) + \endcode + + and have the standard stream pull data from and put data into the Python + file object. + + - When Python \c read_inputs() returns, the Python object is able to + continue reading or writing where the C++ code left off. + + - Operations in C++ on mere files should be competitively fast compared + to the direct use of \c std::fstream. + + + \b Motivation + + - the standard Python library offer of file-like objects (files, + compressed files and archives, network, ...) is far superior to the + offer of streams in the C++ standard library and Boost C++ libraries. + + - i/o code involves a fair amount of text processing which is more + efficiently prototyped in Python but then one may need to rewrite + a time-critical part in C++, in as seamless a manner as possible.
+ + \b Usage + NOTE(review): the snippets below come from the Boost.Python original (boost_adaptbx); in this file the constructor takes a PyObject* and no Python-side streambuf type is visible - confirm before relying on them. + This is 2-step: + + - a trivial wrapper function + + \code + using boost_adaptbx::python::streambuf; + void read_inputs_wrapper(streambuf& input) + { + streambuf::istream is(input); + read_inputs(is); + } + + def("read_inputs", read_inputs_wrapper); + \endcode + + which has to be written every time one wants a Python binding for + such a C++ function. + + - the Python side + + \code + from boost.python import streambuf + read_inputs(streambuf(python_file_obj=obj, buffer_size=1024)) + \endcode + + \c buffer_size is optional. See also: \c default_buffer_size + + Note: references are to the C++ standard (the numbers between parentheses + at the end of references are margin markers). +*/ +class streambuf : public std::basic_streambuf +{ + private: + typedef std::basic_streambuf base_t; + + public: + /* The syntax + using base_t::char_type; + would be nicer but Visual Studio C++ 8 chokes on it + */ + typedef base_t::char_type char_type; + typedef base_t::int_type int_type; + typedef base_t::pos_type pos_type; + typedef base_t::off_type off_type; + typedef base_t::traits_type traits_type; + + // work around Visual C++ 7.1 problem + inline static int + traits_type_eof() { return traits_type::eof(); } + + /// The default size of the read and write buffer. + /** They are respectively used to buffer data read from and data written to + the Python file object. It is a mutable static member (NOTE(review): the original text says it can be modified from Python; no binding for it is visible here - confirm). + */ + static std::size_t default_buffer_size; + + /// Construct from a Python file object + /** if buffer_size is 0 the current default_buffer_size is used. The read, write, seek and tell attributes are looked up on python_file_obj with PyObject_GetAttrString. + */ + streambuf( + PyObject *python_file_obj, + std::size_t buffer_size_ = 0) + : + py_read (PyObject_GetAttrString(python_file_obj, "read")), + py_write(PyObject_GetAttrString(python_file_obj, "write")), + py_seek (PyObject_GetAttrString(python_file_obj, "seek")), + py_tell (PyObject_GetAttrString(python_file_obj, "tell")), + buffer_size(buffer_size_ != 0 ?
buffer_size_ : default_buffer_size), + write_buffer(0), + pos_of_read_buffer_end_in_py_file(0), + pos_of_write_buffer_end_in_py_file(buffer_size), + farthest_pptr(0) + { + assert(buffer_size != 0); + /* Some Python file objects (e.g. sys.stdout and sys.stdin) + have non-functional seek and tell. If so, release py_tell and + py_seek and reset both to nullptr. + */ + if (py_tell != nullptr) { + // Probe call: only success or failure matters, so the result is released at once. + Py_XDECREF(PyObject_CallFunction(py_tell, nullptr)); + if (PyErr_Occurred() != nullptr) + { + PyErr_Clear(); + Py_CLEAR(py_tell); + Py_CLEAR(py_seek); + } + } + + if (py_write != nullptr) { + // C-like string to make debugging easier + write_buffer = new char[buffer_size + 1]; + write_buffer[buffer_size] = '\0'; + setp(write_buffer, write_buffer + buffer_size); // 27.5.2.4.5 (5) + farthest_pptr = pptr(); + } + else { + // The first attempt at output will result in a call to overflow + setp(0, 0); + } + + if (py_tell != nullptr) { + auto py_pos = extract_int(PyObject_CallFunction(py_tell, nullptr)); + pos_of_read_buffer_end_in_py_file = py_pos; + pos_of_write_buffer_end_in_py_file = py_pos; + } + } + + /// Destructor freeing the write buffer and every Python reference still held + /** The attribute lookups done at construction and the chunk returned by read() + are owned references; they are dropped here. Dropping a Python reference + requires the GIL, so the object must be destroyed while the GIL is held. + */ + virtual ~streambuf() { + if (write_buffer) delete[] write_buffer; + Py_XDECREF(read_buffer); + Py_XDECREF(py_tell); + Py_XDECREF(py_seek); + Py_XDECREF(py_write); + Py_XDECREF(py_read); + } + + /// C.f. C++ standard section 27.5.2.4.3 + /** It is essential to override this virtual function for the stream + member function readsome to work correctly (c.f. 27.6.1.3, alinea 30) + */ + virtual std::streamsize showmanyc() { + int_type const failure = traits_type::eof(); + int_type status = underflow(); + if (status == failure) return -1; + return egptr() - gptr(); + } + + /// C.f.
C++ standard section 27.5.2.4.3 + virtual int_type underflow() { + int_type const failure = traits_type::eof(); + if (py_read == nullptr) { + throw std::invalid_argument( + "That Python file object has no 'read' attribute"); + } + // The get area is about to be replaced, so the previous chunk can be released. + Py_XDECREF(read_buffer); + // "n" converts a Py_ssize_t; "i" would read an int out of a std::size_t argument. + read_buffer = PyObject_CallFunction(py_read, "n", static_cast<Py_ssize_t>(buffer_size)); + char *read_buffer_data = nullptr; + Py_ssize_t py_n_read = 0; + // A null result means read() raised; it must not be handed to PyBytes_AsStringAndSize. + if (read_buffer == nullptr + || PyBytes_AsStringAndSize(read_buffer, &read_buffer_data, &py_n_read) == -1) { + setg(0, 0, 0); + throw std::invalid_argument( + "The method 'read' of the Python file object " + "did not return a string."); + } + auto n_read = (off_type)py_n_read; + pos_of_read_buffer_end_in_py_file += n_read; + setg(read_buffer_data, read_buffer_data, read_buffer_data + n_read); + // ^^^27.5.2.3.1 (4) + if (n_read == 0) return failure; + return traits_type::to_int_type(read_buffer_data[0]); + } + + /// C.f. C++ standard section 27.5.2.4.5 + /** Writes the pending part of the put area, plus c when it is not eof, to the + Python file object. Every temporary Python object created here is released + before returning; errors raised by write() are left pending, as before. + */ + virtual int_type overflow(int_type c=traits_type_eof()) { + if (py_write == nullptr) { + throw std::invalid_argument( + "That Python file object has no 'write' attribute"); + } + farthest_pptr = std::max(farthest_pptr, pptr()); + auto n_written = (off_type)(farthest_pptr - pbase()); + auto chunk = PyBytes_FromStringAndSize(pbase(), farthest_pptr - pbase()); + Py_XDECREF(PyObject_CallFunction(py_write, "O", chunk)); + Py_XDECREF(chunk); + if (!traits_type::eq_int_type(c, traits_type::eof())) { + auto ch = traits_type::to_char_type(c); + // Build the one-byte object explicitly: "y#" needs a Py_ssize_t length and PY_SSIZE_T_CLEAN. + auto extra = PyBytes_FromStringAndSize(&ch, 1); + Py_XDECREF(PyObject_CallFunction(py_write, "O", extra)); + Py_XDECREF(extra); + n_written++; + } + if (n_written) { + pos_of_write_buffer_end_in_py_file += n_written; + setp(pbase(), epptr()); + // ^^^ 27.5.2.4.5 (5) + farthest_pptr = pptr(); + } + return traits_type::eq_int_type( + c, traits_type::eof()) ? traits_type::not_eof(c) : c; + } + + /// Update the python file to reflect the state of this stream buffer + /** Empty the write buffer into the Python file object and set the seek + position of the latter accordingly (C++ standard section 27.5.2.4.2).
+ If there is no write buffer or it is empty, but there is a non-empty + read buffer, set the Python file object seek position to the + seek position in that read buffer. Both seeks are relative to the + current position of the Python file object (whence 1). + */ + virtual int sync() { + int result = 0; + farthest_pptr = std::max(farthest_pptr, pptr()); + if (farthest_pptr && farthest_pptr > pbase()) { + off_type delta = pptr() - farthest_pptr; + int_type status = overflow(); + if (traits_type::eq_int_type(status, traits_type::eof())) result = -1; + if (py_seek != nullptr) + { + // delta <= 0 is an offset from the end of what overflow() just wrote, hence whence 1; "L" carries the 64-bit value. + Py_XDECREF(PyObject_CallFunction(py_seek, "Li", static_cast<long long>(delta), 1)); + } + } + else if (gptr() && gptr() < egptr()) { + if (py_seek != nullptr) + { + // step back over the bytes fetched from Python but not consumed by the stream yet + Py_XDECREF(PyObject_CallFunction(py_seek, "Li", static_cast<long long>(gptr() - egptr()), 1)); + } + } + return result; + } + + /// C.f. C++ standard section 27.5.2.4.2 + /** This implementation is optimised to look whether the position is within + the buffers, so as to avoid calling Python seek or tell. It is + important for many applications that the overhead of calling into Python + is avoided as much as possible (e.g. parsers which may do a lot of + backtracking) + */ + virtual + pos_type seekoff(off_type off, std::ios_base::seekdir way, + std::ios_base::openmode which= std::ios_base::in + | std::ios_base::out) + { + /* In practice, "which" is either std::ios_base::in or out + since we end up here because either seekp or seekg was called + on the stream using this buffer. That simplifies the code + in a few places. + */ + int const failure = off_type(-1); + + if (py_seek == nullptr) { + throw std::invalid_argument( + "That Python file object has no 'seek' attribute"); + } + + // we need the read buffer to contain something!
+ if (which == std::ios_base::in && !gptr()) { + if (traits_type::eq_int_type(underflow(), traits_type::eof())) { + return failure; + } + } + + // compute the whence parameter for Python seek + int whence; + switch (way) { + case std::ios_base::beg: + whence = 0; + break; + case std::ios_base::cur: + whence = 1; + break; + case std::ios_base::end: + whence = 2; + break; + default: + return failure; + } + + // Let's have a go + boost::optional result = seekoff_without_calling_python( + off, way, which); + if (!result) { + // we need to call Python + if (which == std::ios_base::out) overflow(); + if (way == std::ios_base::cur) { + if (which == std::ios_base::in) off -= egptr() - gptr(); + else if (which == std::ios_base::out) off += pptr() - pbase(); + } + // "L" matches the 64-bit offset ("i" would read it as an int); the returned position is not needed here. + Py_XDECREF(PyObject_CallFunction(py_seek, "Li", static_cast<long long>(off), whence)); + result = extract_int(PyObject_CallFunction(py_tell, nullptr)); + if (which == std::ios_base::in) underflow(); + } + return *result; + } + + /// C.f. C++ standard section 27.5.2.4.2 + virtual + pos_type seekpos(pos_type sp, + std::ios_base::openmode which= std::ios_base::in + | std::ios_base::out) + { + return streambuf::seekoff(sp, std::ios_base::beg, which); + } + + private: + PyObject *py_read = nullptr; + PyObject *py_write = nullptr; + PyObject *py_seek = nullptr; + PyObject *py_tell = nullptr; + + std::size_t buffer_size; + + /* This is actually a Python bytes object and the actual read buffer is + its internal data, i.e. an array of characters. We keep the reference + returned by read() so as to hold on it: as a result, the actual buffer can't + go away. + */ + PyObject *read_buffer = nullptr; + + /* A mere array of char's allocated on the heap at construction time and + de-allocated only at destruction time.
+ */ + char *write_buffer = nullptr; + + off_type pos_of_read_buffer_end_in_py_file, + pos_of_write_buffer_end_in_py_file; + + // the farthest place the buffer has been written into + char *farthest_pptr = nullptr; + + /// Tries to satisfy a seek by moving gptr()/pptr() inside the current buffer; returns an empty optional when the target lies outside it or way is std::ios_base::end. + boost::optional seekoff_without_calling_python( + off_type off, + std::ios_base::seekdir way, + std::ios_base::openmode which) + { + boost::optional const failure; + + // Buffer range and current position + off_type buf_begin, buf_end, buf_cur, upper_bound; + off_type pos_of_buffer_end_in_py_file; + if (which == std::ios_base::in) { + pos_of_buffer_end_in_py_file = pos_of_read_buffer_end_in_py_file; + buf_begin = reinterpret_cast(eback()); + buf_cur = reinterpret_cast(gptr()); + buf_end = reinterpret_cast(egptr()); + upper_bound = buf_end; + } + else if (which == std::ios_base::out) { + pos_of_buffer_end_in_py_file = pos_of_write_buffer_end_in_py_file; + buf_begin = reinterpret_cast(pbase()); + buf_cur = reinterpret_cast(pptr()); + buf_end = reinterpret_cast(epptr()); + farthest_pptr = std::max(farthest_pptr, pptr()); + upper_bound = reinterpret_cast(farthest_pptr) + 1; + } + else { + throw std::runtime_error("unreachable"); + } + + // Sought position in "buffer coordinate" + off_type buf_sought; + if (way == std::ios_base::cur) { + buf_sought = buf_cur + off; + } + else if (way == std::ios_base::beg) { + buf_sought = buf_end + (off - pos_of_buffer_end_in_py_file); + } + else if (way == std::ios_base::end) { + return failure; + } + else { + throw std::runtime_error("unreachable"); + } + + // if the sought position is not in the buffer, give up + if (buf_sought < buf_begin || buf_sought >= upper_bound) return failure; + + // the sought position lies inside the buffer: moving the pointer is enough + if (which == std::ios_base::in) gbump(buf_sought - buf_cur); + else if (which == std::ios_base::out) pbump(buf_sought - buf_cur); + return pos_of_buffer_end_in_py_file + (buf_sought - buf_end); + } + /// Converts the Python integer o to T and releases the reference to o; o may be null when the call that produced it failed. + template + T extract_int(PyObject *o) + { + auto value = (o != nullptr) ? PyLong_AsLongLong(o) : -1LL; // long long: plain long is 32 bits wide on 64-bit Windows + Py_XDECREF(o); + + return
static_cast(value); + } + + public: + + class istream : public std::istream /* input stream over a streambuf; enables badbit exceptions and calls sync() on destruction while still good */ + { + public: + istream(streambuf& buf) : std::istream(&buf) + { + exceptions(std::ios_base::badbit); + } + + ~istream() { if (this->good()) this->sync(); } + }; + + class ostream : public std::ostream /* output stream over a streambuf; enables badbit exceptions and calls flush() on destruction while still good */ + { + public: + ostream(streambuf& buf) : std::ostream(&buf) + { + exceptions(std::ios_base::badbit); + } + + ~ostream() { if (this->good()) this->flush(); } + }; +}; + +std::size_t streambuf::default_buffer_size = 1024; /* NOTE(review): non-inline definition of a static member inside a header; including this header from more than one translation unit would presumably produce duplicate definitions - confirm */ + +struct streambuf_capsule /* holds the streambuf; it is the first base of ostream below, so it is constructed before streambuf::ostream receives a reference to it */ +{ + streambuf python_streambuf; + + streambuf_capsule( + PyObject *python_file_obj, + std::size_t buffer_size=0) + : + python_streambuf(python_file_obj, buffer_size) + {} +}; + +struct ostream : private streambuf_capsule, streambuf::ostream +{ + ostream( + PyObject *python_file_obj, + std::size_t buffer_size=0) + : + streambuf_capsule(python_file_obj, buffer_size), + streambuf::ostream(python_streambuf) + {} + + ~ostream() /* NOTE(review): throws below; destructors are implicitly noexcept since C++11, so the throw would presumably end in std::terminate - confirm the intended handling */ + { + if (this->good()) + { + this->flush(); + } + + if (PyErr_Occurred() != nullptr) + { + PyErr_Clear(); + throw std::runtime_error( + "Problem closing python ostream.\n" + " Known limitation: the error is unrecoverable. Sorry.\n" + " Suggestion for programmer: add ostream.flush() before" + " returning."); + } + } +}; + +}} // namespace xlnt::arrow diff --git a/arrow/xlntpyarrow/setup.py b/arrow/xlntpyarrow/setup.py new file mode 100644 index 00000000..7860ce7d --- /dev/null +++ b/arrow/xlntpyarrow/setup.py @@ -0,0 +1,64 @@ +from distutils.core import setup, Extension +from distutils import sysconfig + +description = """ +xlntpyarrow allows Apache Arrow tables to be written to and read from an XLSX +file efficiently using the C++ library xlnt.
+""".strip() + +cfg_vars = sysconfig.get_config_vars() +if 'CFLAGS' in cfg_vars: + cfg_vars['CFLAGS'] = cfg_vars['CFLAGS'].replace('-Wstrict-prototypes', '') + +xlntpyarrow_extension = Extension( + 'xlntpyarrow', + ['xlntpyarrow.cpp'], + language = 'c++', + include_dirs = [ + '/root/xlnt/arrow/xlntarrow', + '/root/xlnt/arrow/xlntpyarrow', + '/root/miniconda3/include' + ], + libraries = [ + 'arrow', + 'xlntarrow', + 'xlnt' + ], + library_dirs = [ + '/root/miniconda3/lib', + '/root/xlnt/build/arrow/xlntarrow', + '/root/xlnt/build/source' + ], + extra_compile_args=['-std=c++11'] +) + +classifiers = [ + 'Development Status :: 5 - Production/Stable', + 'Environment :: Plugins', + 'Intended Audience :: Science/Research', + 'License :: OSI Approved :: MIT License', + 'Natural Language :: English', + 'Operating System :: Microsoft :: Windows', + 'Operating System :: MacOS :: MacOS X', + 'Operating System :: POSIX :: Linux', + 'Programming Language :: C', + 'Programming Language :: C++', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: Implementation :: CPython', + 'Topic :: Database', + 'Topic :: Office/Business :: Financial :: Spreadsheet', + 'Topic :: Scientific/Engineering :: Information Analysis', + 'Topic :: Software Development :: Libraries :: Python Modules' +] + +setup( + name = 'xlntpyarrow', + version = '1.1.0', + classifiers = classifiers, + description = description, + ext_modules = [xlntpyarrow_extension], + author = 'Thomas Fussell', + author_email = 'thomas.fussell@gmail.com', + url = 'https://github.com/tfussell/xlnt' +) diff --git a/arrow/xlntpyarrow/xlntpyarrow.cpp b/arrow/xlntpyarrow/xlntpyarrow.cpp new file mode 100644 index 00000000..7c948c6e --- /dev/null +++ b/arrow/xlntpyarrow/xlntpyarrow.cpp @@ -0,0 +1,109 @@ +#include +#include +#include +#include +#include +#include + +PyObject *xlsx2arrow(PyObject *file) +{ + xlnt::arrow::streambuf buffer(file); + std::istream 
stream(&buffer); + std::shared_ptr schema; + std::vector> columns; + arrow::Table table(schema, columns); + xlnt::arrow::xlsx2arrow(stream, table); + + Py_RETURN_NONE; +} + +extern "C" { + +/* + * Implements XLSX->pyarrow table function. + */ +PyDoc_STRVAR(xlntpyarrow_xlsx2arrow_doc, "xlsx2arrow(in_file)\ +\ +Returns an arrow table representing the given XLSX file object."); + +PyObject *xlntpyarrow_xlsx2arrow(PyObject *self, PyObject *args, PyObject *kwargs) +{ + PyObject *file = nullptr; + static const char *keywords[] = { "file", nullptr }; + static auto keywords_nc = const_cast(keywords); + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O", keywords_nc, &file)) + { + return nullptr; + } + + return xlsx2arrow(file); +} + + +/* +* Implements pyarrow table->XLSX function. +* Only argument validation exists so far: the function accepts (table, file), +* positionally or by keyword, and returns None. +*/ +PyDoc_STRVAR(xlntpyarrow_arrow2xlsx_doc, "arrow2xlsx(table, out_file)\ +\ +Writes the given arrow table to out_file as an XLSX file."); + +PyObject *xlntpyarrow_arrow2xlsx(PyObject *self, PyObject *args, PyObject *kwargs) +{ + PyObject *table = nullptr; + PyObject *file = nullptr; + // One keyword and one output pointer per format unit: a shorter keyword list makes the parse call fail on every invocation. + static const char *keywords[] = { "table", "file", nullptr }; + static auto keywords_nc = const_cast(keywords); + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO", keywords_nc, &table, &file)) + { + return nullptr; + } + + Py_RETURN_NONE; +} + +static PyMethodDef xlntpyarrow_functions[] = +{ + { "xlsx2arrow", (PyCFunction)xlntpyarrow_xlsx2arrow, METH_VARARGS | METH_KEYWORDS, xlntpyarrow_xlsx2arrow_doc }, + { "arrow2xlsx", (PyCFunction)xlntpyarrow_arrow2xlsx, METH_VARARGS | METH_KEYWORDS, xlntpyarrow_arrow2xlsx_doc }, + { nullptr, nullptr, 0, nullptr } +}; + +/* + * Py_mod_exec slot: registers the module functions and constants. + * Returns 0 on success and -1, with the Python exception left set, on failure. + */ +int exec_xlntpyarrow(PyObject *module) +{ + // NOTE(review): setup.py declares version '1.1.0' - confirm which value is intended. + if (PyModule_AddFunctions(module, xlntpyarrow_functions) != 0 + || PyModule_AddStringConstant(module, "__author__", "Thomas Fussell") != 0 + || PyModule_AddStringConstant(module, "__version__", "0.9.0") != 0 + || PyModule_AddIntConstant(module, "year", 2017) != 0) + { + return -1; + } + + return 0; +} + +PyDoc_STRVAR(xlntpyarrow_doc, "The xlntpyarrow module"); + +static PyModuleDef_Slot
xlntpyarrow_slots[] = +{ + { Py_mod_exec, (void *)exec_xlntpyarrow }, + { 0, nullptr } +}; + +static PyModuleDef xlntpyarrow_def = +{ + PyModuleDef_HEAD_INIT, + "xlntpyarrow", + xlntpyarrow_doc, + 0, /* m_size */ + nullptr, /* m_methods: functions are added in exec_xlntpyarrow instead */ + xlntpyarrow_slots, + nullptr, /* m_traverse */ + nullptr, /* m_clear */ + nullptr, /* m_free */ +}; + +PyMODINIT_FUNC PyInit_xlntpyarrow() +{ + return PyModuleDef_Init(&xlntpyarrow_def); +} + +} // extern "C" diff --git a/include/xlnt/workbook/streaming_workbook_reader.hpp b/include/xlnt/workbook/streaming_workbook_reader.hpp new file mode 100644 index 00000000..6e975426 --- /dev/null +++ b/include/xlnt/workbook/streaming_workbook_reader.hpp @@ -0,0 +1,132 @@ +// Copyright (c) 2014-2017 Thomas Fussell +// Copyright (c) 2010-2015 openpyxl +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE +// +// @license: http://www.opensource.org/licenses/mit-license.php +// @author: see AUTHORS file +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace xml { +class parser; +} + +namespace xlnt { + +class cell; +template +class optional; +class path; +class workbook; +class worksheet; + +namespace detail { +class xlsx_consumer; +} + +/// +/// streaming_workbook_reader reads an XLSX file incrementally: worksheets are entered with begin_worksheet() and their cells are fetched one at a time with read_cell(). +/// +class XLNT_API streaming_workbook_reader +{ +public: + streaming_workbook_reader(); + ~streaming_workbook_reader(); + + /// + /// Closes currently open read stream. This will be called automatically + /// by the destructor if it hasn't already been called manually. + /// + void close(); + + bool has_cell(); + + /// + /// Reads the next cell in the current worksheet and optionally returns it if + /// the last cell in the sheet has not yet been read. + /// + cell read_cell(); + + bool has_worksheet(); + + /// + /// Begins reading of the next worksheet in the workbook. NOTE(review): earlier + /// wording said the title is optionally returned, but the function returns void. + /// + void begin_worksheet(); + + /// + /// Ends reading of the current worksheet in the workbook and optionally + /// returns a worksheet object corresponding to the worksheet with the title + /// returned by begin_worksheet(). + /// + worksheet end_worksheet(); + + /// + /// Interprets byte vector data as an XLSX file and sets the content of this + /// workbook to match that file. + /// + void open(const std::vector &data); + + /// + /// Interprets file with the given filename as an XLSX file and sets + /// the content of this workbook to match that file.
+ /// + void open(const std::string &filename); + +#ifdef _MSC_VER + /// + /// Interprets file with the given filename as an XLSX file and sets + /// the content of this workbook to match that file. + /// + void open(const std::wstring &filename); +#endif + + /// + /// Interprets file with the given filename as an XLSX file and sets the + /// content of this workbook to match that file. + /// + void open(const path &filename); + + /// + /// Interprets data in stream as an XLSX file and sets the content of this + /// workbook to match that file. + /// + void open(std::istream &stream); + +private: + std::vector worksheet_queue_; + std::unique_ptr consumer_; + std::unique_ptr workbook_; + std::unique_ptr stream_; + std::unique_ptr stream_buffer_; + std::unique_ptr part_stream_; + std::unique_ptr part_stream_buffer_; + std::unique_ptr parser_; +}; + +} // namespace xlnt diff --git a/include/xlnt/workbook/streaming_workbook_writer.hpp b/include/xlnt/workbook/streaming_workbook_writer.hpp new file mode 100644 index 00000000..38d27880 --- /dev/null +++ b/include/xlnt/workbook/streaming_workbook_writer.hpp @@ -0,0 +1,93 @@ +// Copyright (c) 2014-2017 Thomas Fussell +// Copyright (c) 2010-2015 openpyxl +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software.
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE +// +// @license: http://www.opensource.org/licenses/mit-license.php +// @author: see AUTHORS file +#pragma once + +#include +#include + +#include + +namespace xlnt { + +/// +/// streaming_workbook_writer writes an XLSX file incrementally: sheets are started with add_sheet() and cells are appended with add_cell(). +/// +class XLNT_API streaming_workbook_writer +{ +public: + ~streaming_workbook_writer(); + + /// + /// Finishes writing of the remaining contents of the workbook and closes + /// currently open write stream. This will be called automatically by the + /// destructor if it hasn't already been called manually. + /// + void close(); + + /// + /// Adds a cell to the currently active worksheet at the position given by + /// ref (there is no value parameter). ref should be to the right of or below + /// the previously written cell. + /// + cell add_cell(const cell_reference &ref); + + /// + /// Ends writing of data to the current sheet and begins writing a new sheet + /// with the given title. + /// + worksheet add_sheet(const std::string &title); + + /// + /// Serializes the workbook into an XLSX file and saves the bytes into + /// byte vector data. NOTE(review): every open() overload here is declared const and described in save-style wording - confirm once implemented. + /// + void open(std::vector &data) const; + + /// + /// Serializes the workbook into an XLSX file and saves the data into a file + /// named filename. + /// + void open(const std::string &filename) const; + +#ifdef _MSC_VER + /// + /// Serializes the workbook into an XLSX file and saves the data into a file + /// named filename.
+ /// + void open(const std::wstring &filename) const; +#endif + + /// + /// Serializes the workbook into an XLSX file and saves the data into a file + /// named filename. + /// + void open(const xlnt::path &filename) const; + + /// + /// Serializes the workbook into an XLSX file and saves the data into stream. + /// + void open(std::ostream &stream) const; +}; + +} // namespace xlnt diff --git a/include/xlnt/xlnt.hpp b/include/xlnt/xlnt.hpp index db839c3a..bf52609c 100644 --- a/include/xlnt/xlnt.hpp +++ b/include/xlnt/xlnt.hpp @@ -65,6 +65,7 @@ #include #include #include +#include #include #include #include diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 94a474df..cf71e1e7 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -28,6 +28,7 @@ endif() if(MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4") elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wno-unknown-pragmas") elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Weverything -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded -Werror -Wno-documentation-unknown-command") @@ -155,6 +156,11 @@ target_include_directories(xlnt PUBLIC ${XLNT_INCLUDE_DIR}) target_include_directories(xlnt PRIVATE ${XLNT_SOURCE_DIR}) target_include_directories(xlnt PRIVATE ${XLNT_SOURCE_DIR}/../third-party/libstudxml) +if(CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "5.0.0") + target_compile_definitions(xlnt PRIVATE UTFCPP=1) + target_include_directories(xlnt PRIVATE ${XLNT_SOURCE_DIR}/../third-party/utfcpp) +endif() + if(MSVC) set_target_properties(xlnt PROPERTIES COMPILE_FLAGS "/wd\"4251\" /wd\"4275\" /wd\"4068\" /MP") set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/detail/serialization/miniz.cpp PROPERTIES COMPILE_FLAGS "/wd\"4244\" /wd\"4334\" /wd\"4127\"") diff --git
a/source/detail/cryptography/xlsx_crypto_consumer.cpp b/source/detail/cryptography/xlsx_crypto_consumer.cpp index 10a319bf..fdc53a96 100644 --- a/source/detail/cryptography/xlsx_crypto_consumer.cpp +++ b/source/detail/cryptography/xlsx_crypto_consumer.cpp @@ -108,7 +108,7 @@ std::vector decrypt_xlsx_agile( ++segment; } - decrypted_package.resize(total_size); + decrypted_package.resize(static_cast(total_size)); return decrypted_package; } @@ -153,7 +153,8 @@ encryption_info::standard_encryption_info read_standard_encryption_info(std::ist throw xlnt::exception("invalid header"); } - const auto csp_name_length = (header_length - (info_stream.tellg() - index_at_start)) / 2; + const auto csp_name_length = static_cast((header_length + - (info_stream.tellg() - index_at_start)) / 2); auto csp_name = xlnt::detail::read_string(info_stream, csp_name_length); csp_name.pop_back(); // remove extraneous trailing null if (csp_name != u"Microsoft Enhanced RSA and AES Cryptographic Provider (Prototype)" diff --git a/source/detail/serialization/xlsx_consumer.cpp b/source/detail/serialization/xlsx_consumer.cpp index fce83802..500303e3 100644 --- a/source/detail/serialization/xlsx_consumer.cpp +++ b/source/detail/serialization/xlsx_consumer.cpp @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -132,10 +133,898 @@ xlsx_consumer::xlsx_consumer(workbook &target) { } +xlsx_consumer::~xlsx_consumer() +{ +} + void xlsx_consumer::read(std::istream &source) { archive_.reset(new izstream(source)); - populate_workbook(); + populate_workbook(false); +} + +void xlsx_consumer::open(std::istream &source) +{ + archive_.reset(new izstream(source)); + populate_workbook(true); +} + +cell xlsx_consumer::read_cell() +{ + if (!has_cell()) + { + return cell(nullptr); + } + + auto ws = worksheet(current_worksheet_); + + if (in_element(qn("spreadsheetml", "sheetData"))) + { + expect_start_element(qn("spreadsheetml", "row"), xml::content::complex); // CT_Row + auto row_index 
= parser().attribute("r"); + + if (parser().attribute_present("ht")) + { + ws.row_properties(row_index).height = parser().attribute("ht"); + } + + if (parser().attribute_present("customHeight")) + { + ws.row_properties(row_index).custom_height = is_true(parser().attribute("customHeight")); + } + + if (parser().attribute_present("hidden") && is_true(parser().attribute("hidden"))) + { + ws.row_properties(row_index).hidden = true; + } + + skip_attributes({ qn("x14ac", "dyDescent") }); + skip_attributes({ "customFormat", "s", "customFont", + "outlineLevel", "collapsed", "thickTop", "thickBot", + "ph", "spans" }); + } + + if (!in_element(qn("spreadsheetml", "row"))) + { + return cell(nullptr); + } + + expect_start_element(qn("spreadsheetml", "c"), xml::content::complex); + + auto cell = streaming_ ? xlnt::cell(streaming_cell_.get()) + : ws.cell(cell_reference(parser().attribute("r"))); + auto reference = cell_reference(parser().attribute("r")); + cell.d_->parent_ = current_worksheet_; + cell.d_->column_ = reference.column_index(); + cell.d_->row_ = reference.row(); + + auto has_type = parser().attribute_present("t"); + auto type = has_type ? parser().attribute("t") : "n"; + + auto has_format = parser().attribute_present("s"); + auto format_id = static_cast(has_format ? 
std::stoull(parser().attribute("s")) : 0LL); + + auto has_value = false; + auto value_string = std::string(); + + auto has_formula = false; + auto has_shared_formula = false; + auto formula_value_string = std::string(); + + while (in_element(qn("spreadsheetml", "c"))) + { + auto current_element = expect_start_element(xml::content::mixed); + + if (current_element == qn("spreadsheetml", "v")) // s:ST_Xstring + { + has_value = true; + value_string = read_text(); + } + else if (current_element == qn("spreadsheetml", "f")) // CT_CellFormula + { + has_formula = true; + + if (parser().attribute_present("t")) + { + has_shared_formula = parser().attribute("t") == "shared"; + } + + skip_attributes( + { "aca", "ref", "dt2D", "dtr", "del1", "del2", "r1", "r2", "ca", "si", "bx" }); + + formula_value_string = read_text(); + } + else if (current_element == qn("spreadsheetml", "is")) // CT_Rst + { + expect_start_element(qn("spreadsheetml", "t"), xml::content::simple); + has_value = true; // an inline string keeps its text in is/t and has no v element, so flag it here or the text is discarded + value_string = read_text(); + expect_end_element(qn("spreadsheetml", "t")); + } + else + { + unexpected_element(current_element); + } + + expect_end_element(current_element); + } + + expect_end_element(qn("spreadsheetml", "c")); + + if (has_formula && !has_shared_formula) + { + cell.formula(formula_value_string); + } + + if (has_value) + { + if (type == "str") + { + cell.d_->value_text_ = value_string; + cell.data_type(cell::type::formula_string); + } + else if (type == "inlineStr") + { + cell.d_->value_text_ = value_string; + cell.data_type(cell::type::inline_string); + } + else if (type == "s") + { + cell.d_->value_numeric_ = std::stold(value_string); + cell.data_type(cell::type::shared_string); + } + else if (type == "b") // boolean + { + cell.value(is_true(value_string)); + } + else if (type == "n") // numeric + { + cell.value(std::stold(value_string)); + } + else if (!value_string.empty() && value_string[0] == '#') + { + cell.error(value_string); + } + } + + if (has_format) + { + 
cell.format(target_.format(format_id)); + } + + if (!in_element(qn("spreadsheetml", "row"))) + { + expect_end_element(qn("spreadsheetml", "row")); + + if (!in_element(qn("spreadsheetml", "sheetData"))) + { + expect_end_element(qn("spreadsheetml", "sheetData")); + } + } + + return cell; +} + +void xlsx_consumer::read_worksheet(const std::string &rel_id) +{ + read_worksheet_begin(rel_id); + + if (!streaming_) + { + read_worksheet_sheetdata(); + read_worksheet_end(rel_id); + } +} + +// Creates the worksheet that rel_id refers to in the target workbook and parses its part up to the start of sheetData. +// Returns the title of that worksheet; throws invalid_file when rel_id does not belong to any sheet. +std::string xlsx_consumer::read_worksheet_begin(const std::string &rel_id) +{ + if (streaming_ && streaming_cell_ == nullptr) + { + streaming_cell_.reset(new detail::cell_impl()); + } + + const auto title_iter = std::find_if(target_.d_->sheet_title_rel_id_map_.begin(), + target_.d_->sheet_title_rel_id_map_.end(), + [&](const std::pair &p) { + return p.second == rel_id; + }); + + // rel_id originates from the file being read, so a miss is possible; dereferencing end() would be undefined behavior + if (title_iter == target_.d_->sheet_title_rel_id_map_.end()) + { + throw invalid_file("worksheet relationship id not found in workbook"); + } + + auto title = title_iter->first; + + auto id = sheet_title_id_map_[title]; + auto index = sheet_title_index_map_[title]; + + auto insertion_iter = target_.d_->worksheets_.begin(); + while (insertion_iter != target_.d_->worksheets_.end() && sheet_title_index_map_[insertion_iter->title_] < index) + { + ++insertion_iter; + } + + current_worksheet_ = &*target_.d_->worksheets_.emplace(insertion_iter, &target_, id, title); + auto ws = worksheet(current_worksheet_); + + expect_start_element(qn("spreadsheetml", "worksheet"), xml::content::complex); // CT_Worksheet + skip_attributes({ qn("mc", "Ignorable") }); + read_namespaces(); + + while (in_element(qn("spreadsheetml", "worksheet"))) + { + auto current_worksheet_element = expect_start_element(xml::content::complex); + + if (current_worksheet_element == qn("spreadsheetml", "sheetPr")) // CT_SheetPr 0-1 + { + while (in_element(current_worksheet_element)) + { + auto sheet_pr_child_element = expect_start_element(xml::content::simple); + + if (sheet_pr_child_element == qn("spreadsheetml", "tabColor")) // CT_Color 0-1 + { + read_color(); + } + else if (sheet_pr_child_element == qn("spreadsheetml", "outlinePr")) // 
CT_OutlinePr 0-1 + { + skip_attribute("applyStyles"); // optional, boolean, false + skip_attribute("summaryBelow"); // optional, boolean, true + skip_attribute("summaryRight"); // optional, boolean, true + skip_attribute("showOutlineSymbols"); // optional, boolean, true + } + else if (sheet_pr_child_element == qn("spreadsheetml", "pageSetUpPr")) // CT_PageSetUpPr 0-1 + { + skip_attribute("autoPageBreaks"); // optional, boolean, true + skip_attribute("fitToPage"); // optional, boolean, false + } + else + { + unexpected_element(sheet_pr_child_element); + } + + expect_end_element(sheet_pr_child_element); + } + + skip_attribute("syncHorizontal"); // optional, boolean, false + skip_attribute("syncVertical"); // optional, boolean, false + skip_attribute("syncRef"); // optional, ST_Ref, false + skip_attribute("transitionEvaluation"); // optional, boolean, false + skip_attribute("transitionEntry"); // optional, boolean, false + skip_attribute("published"); // optional, boolean, true + skip_attribute("codeName"); // optional, string + skip_attribute("filterMode"); // optional, boolean, false + skip_attribute("enableFormatConditionsCalculation"); // optional, boolean, true + } + else if (current_worksheet_element == qn("spreadsheetml", "dimension")) // CT_SheetDimension 0-1 + { + skip_remaining_content(current_worksheet_element); + } + else if (current_worksheet_element == qn("spreadsheetml", "sheetViews")) // CT_SheetViews 0-1 + { + while (in_element(current_worksheet_element)) + { + expect_start_element(qn("spreadsheetml", "sheetView"), xml::content::complex); // CT_SheetView 1+ + + sheet_view new_view; + new_view.id(parser().attribute("workbookViewId")); + + if (parser().attribute_present("showGridLines")) // default="true" + { + new_view.show_grid_lines(is_true(parser().attribute("showGridLines"))); + } + + if (parser().attribute_present("defaultGridColor")) // default="true" + { + new_view.default_grid_color(is_true(parser().attribute("defaultGridColor"))); + } + + if 
(parser().attribute_present("view") && parser().attribute("view") != "normal") + { + new_view.type(parser().attribute("view") == "pageBreakPreview" ? sheet_view_type::page_break_preview + : sheet_view_type::page_layout); + } + + skip_attributes({ "windowProtection", "showFormulas", "showRowColHeaders", "showZeros", "rightToLeft", + "tabSelected", "showRuler", "showOutlineSymbols", "showWhiteSpace", "view", "topLeftCell", + "colorId", "zoomScale", "zoomScaleNormal", "zoomScaleSheetLayoutView", "zoomScalePageLayoutView" }); + + while (in_element(qn("spreadsheetml", "sheetView"))) + { + auto sheet_view_child_element = expect_start_element(xml::content::simple); + + if (sheet_view_child_element == qn("spreadsheetml", "pane")) // CT_Pane 0-1 + { + pane new_pane; + + if (parser().attribute_present("topLeftCell")) + { + new_pane.top_left_cell = cell_reference(parser().attribute("topLeftCell")); + } + + if (parser().attribute_present("xSplit")) + { + new_pane.x_split = parser().attribute("xSplit"); + } + + if (parser().attribute_present("ySplit")) + { + new_pane.y_split = parser().attribute("ySplit"); + } + + if (parser().attribute_present("activePane")) + { + new_pane.active_pane = parser().attribute("activePane"); + } + + if (parser().attribute_present("state")) + { + new_pane.state = parser().attribute("state"); + } + + new_view.pane(new_pane); + } + else if (sheet_view_child_element == qn("spreadsheetml", "selection")) // CT_Selection 0-4 + { + skip_remaining_content(sheet_view_child_element); + } + else if (sheet_view_child_element == qn("spreadsheetml", "pivotSelection")) // CT_PivotSelection 0-4 + { + skip_remaining_content(sheet_view_child_element); + } + else if (sheet_view_child_element == qn("spreadsheetml", "extLst")) // CT_ExtensionList 0-1 + { + skip_remaining_content(sheet_view_child_element); + } + else + { + unexpected_element(sheet_view_child_element); + } + + expect_end_element(sheet_view_child_element); + } + + expect_end_element(qn("spreadsheetml", 
"sheetView")); + + ws.d_->views_.push_back(new_view); + } + } + else if (current_worksheet_element == qn("spreadsheetml", "sheetFormatPr")) // CT_SheetFormatPr 0-1 + { + skip_remaining_content(current_worksheet_element); + } + else if (current_worksheet_element == qn("spreadsheetml", "cols")) // CT_Cols 0+ + { + while (in_element(qn("spreadsheetml", "cols"))) + { + expect_start_element(qn("spreadsheetml", "col"), xml::content::simple); + + skip_attributes({ "bestFit", "collapsed", "outlineLevel" }); + + auto min = static_cast(std::stoull(parser().attribute("min"))); + auto max = static_cast(std::stoull(parser().attribute("max"))); + + optional width; + + if (parser().attribute_present("width")) + { + width = parser().attribute("width"); + } + + optional column_style; + + if (parser().attribute_present("style")) + { + column_style = parser().attribute("style"); + } + + auto custom = parser().attribute_present("customWidth") + ? is_true(parser().attribute("customWidth")) : false; + auto hidden = parser().attribute_present("hidden") + ? 
is_true(parser().attribute("hidden")) : false; + + expect_end_element(qn("spreadsheetml", "col")); + + for (auto column = min; column <= max; column++) + { + column_properties props; + + if (width.is_set()) + { + props.width = width.get(); + } + + if (column_style.is_set()) + { + props.style = column_style.get(); + } + + props.hidden = hidden; + props.custom_width = custom; + ws.add_column_properties(column, props); + } + } + } + else if (current_worksheet_element == qn("spreadsheetml", "sheetData")) // CT_SheetData 1 + { + return title; + } + + expect_end_element(current_worksheet_element); + } + + return title; +} + +void xlsx_consumer::read_worksheet_sheetdata() +{ + auto ws = worksheet(current_worksheet_); + + if (stack_.back() != qn("spreadsheetml", "sheetData")) + { + return; + } + + while (in_element(qn("spreadsheetml", "sheetData"))) + { + expect_start_element(qn("spreadsheetml", "row"), xml::content::complex); // CT_Row + auto row_index = parser().attribute("r"); + + if (parser().attribute_present("ht")) + { + ws.row_properties(row_index).height = parser().attribute("ht"); + } + + if (parser().attribute_present("customHeight")) + { + ws.row_properties(row_index).custom_height = is_true(parser().attribute("customHeight")); + } + + if (parser().attribute_present("hidden") && is_true(parser().attribute("hidden"))) + { + ws.row_properties(row_index).hidden = true; + } + + skip_attributes({ qn("x14ac", "dyDescent") }); + skip_attributes({ "customFormat", "s", "customFont", + "outlineLevel", "collapsed", "thickTop", "thickBot", + "ph", "spans" }); + + while (in_element(qn("spreadsheetml", "row"))) + { + expect_start_element(qn("spreadsheetml", "c"), xml::content::complex); + auto cell = ws.cell(cell_reference(parser().attribute("r"))); + + auto has_type = parser().attribute_present("t"); + auto type = has_type ? parser().attribute("t") : "n"; + + auto has_format = parser().attribute_present("s"); + auto format_id = static_cast(has_format ? 
std::stoull(parser().attribute("s")) : 0LL); + + auto has_value = false; + auto value_string = std::string(); + + auto has_formula = false; + auto has_shared_formula = false; + auto formula_value_string = std::string(); + + while (in_element(qn("spreadsheetml", "c"))) + { + auto current_element = expect_start_element(xml::content::mixed); + + if (current_element == qn("spreadsheetml", "v")) // s:ST_Xstring + { + has_value = true; + value_string = read_text(); + } + else if (current_element == qn("spreadsheetml", "f")) // CT_CellFormula + { + has_formula = true; + + if (parser().attribute_present("t")) + { + has_shared_formula = parser().attribute("t") == "shared"; + } + + skip_attributes( + { "aca", "ref", "dt2D", "dtr", "del1", "del2", "r1", "r2", "ca", "si", "bx" }); + + formula_value_string = read_text(); + } + else if (current_element == qn("spreadsheetml", "is")) // CT_Rst + { + expect_start_element(qn("spreadsheetml", "t"), xml::content::simple); + has_value = true; // an inline string keeps its text in is/t and has no v element, so flag it here or the text is discarded + value_string = read_text(); + expect_end_element(qn("spreadsheetml", "t")); + } + else + { + unexpected_element(current_element); + } + + expect_end_element(current_element); + } + + expect_end_element(qn("spreadsheetml", "c")); + + if (has_formula && !has_shared_formula) + { + cell.formula(formula_value_string); + } + + if (has_value) + { + if (type == "str") + { + cell.d_->value_text_ = value_string; + cell.data_type(cell::type::formula_string); + } + else if (type == "inlineStr") + { + cell.d_->value_text_ = value_string; + cell.data_type(cell::type::inline_string); + } + else if (type == "s") + { + cell.d_->value_numeric_ = std::stold(value_string); + cell.data_type(cell::type::shared_string); + } + else if (type == "b") // boolean + { + cell.value(is_true(value_string)); + } + else if (type == "n") // numeric + { + cell.value(std::stold(value_string)); + } + else if (!value_string.empty() && value_string[0] == '#') + { + cell.error(value_string); + } + } + + if (has_format) + { + 
cell.format(target_.format(format_id)); + } + } + + expect_end_element(qn("spreadsheetml", "row")); + } + + expect_end_element(qn("spreadsheetml", "sheetData")); +} + +worksheet xlsx_consumer::read_worksheet_end(const std::string &rel_id) +{ + auto &manifest = target_.manifest(); + + const auto workbook_rel = manifest.relationship(path("/"), relationship_type::office_document); + const auto sheet_rel = manifest.relationship(workbook_rel.target().path(), rel_id); + path sheet_path(sheet_rel.source().path().parent().append(sheet_rel.target().path())); + auto hyperlinks = manifest.relationships(sheet_path, xlnt::relationship_type::hyperlink); + + auto ws = worksheet(current_worksheet_); + + while (in_element(qn("spreadsheetml", "worksheet"))) + { + auto current_worksheet_element = expect_start_element(xml::content::complex); + + if (current_worksheet_element == qn("spreadsheetml", "sheetCalcPr")) // CT_SheetCalcPr 0-1 + { + skip_remaining_content(current_worksheet_element); + } + else if (current_worksheet_element == qn("spreadsheetml", "sheetProtection")) // CT_SheetProtection 0-1 + { + skip_remaining_content(current_worksheet_element); + } + else if (current_worksheet_element == qn("spreadsheetml", "protectedRanges")) // CT_ProtectedRanges 0-1 + { + skip_remaining_content(current_worksheet_element); + } + else if (current_worksheet_element == qn("spreadsheetml", "scenarios")) // CT_Scenarios 0-1 + { + skip_remaining_content(current_worksheet_element); + } + else if (current_worksheet_element == qn("spreadsheetml", "autoFilter")) // CT_AutoFilter 0-1 + { + ws.auto_filter(xlnt::range_reference(parser().attribute("ref"))); + // auto filter complex + skip_remaining_content(current_worksheet_element); + } + else if (current_worksheet_element == qn("spreadsheetml", "sortState")) // CT_SortState 0-1 + { + skip_remaining_content(current_worksheet_element); + } + else if (current_worksheet_element == qn("spreadsheetml", "dataConsolidate")) // CT_DataConsolidate 0-1 + { + 
skip_remaining_content(current_worksheet_element); + } + else if (current_worksheet_element == qn("spreadsheetml", "customSheetViews")) // CT_CustomSheetViews 0-1 + { + skip_remaining_content(current_worksheet_element); + } + else if (current_worksheet_element == qn("spreadsheetml", "mergeCells")) // CT_MergeCells 0-1 + { + auto count = std::stoull(parser().attribute("count")); + + while (in_element(qn("spreadsheetml", "mergeCells"))) + { + expect_start_element(qn("spreadsheetml", "mergeCell"), xml::content::simple); + ws.merge_cells(range_reference(parser().attribute("ref"))); + expect_end_element(qn("spreadsheetml", "mergeCell")); + + count--; + } + + if (count != 0) + { + throw invalid_file("sizes don't match"); + } + } + else if (current_worksheet_element == qn("spreadsheetml", "phoneticPr")) // CT_PhoneticPr 0-1 + { + skip_remaining_content(current_worksheet_element); + } + else if (current_worksheet_element == qn("spreadsheetml", "conditionalFormatting")) // CT_ConditionalFormatting 0+ + { + skip_remaining_content(current_worksheet_element); + } + else if (current_worksheet_element == qn("spreadsheetml", "dataValidations")) // CT_DataValidations 0-1 + { + skip_remaining_content(current_worksheet_element); + } + else if (current_worksheet_element == qn("spreadsheetml", "hyperlinks")) // CT_Hyperlinks 0-1 + { + while (in_element(qn("spreadsheetml", "hyperlinks"))) + { + expect_start_element(qn("spreadsheetml", "hyperlink"), xml::content::simple); + + auto cell = ws.cell(parser().attribute("ref")); + + if (parser().attribute_present(qn("r", "id"))) + { + auto hyperlink_rel_id = parser().attribute(qn("r", "id")); + auto hyperlink_rel = std::find_if(hyperlinks.begin(), hyperlinks.end(), + [&](const relationship &r) { return r.id() == hyperlink_rel_id; }); + + if (hyperlink_rel != hyperlinks.end()) + { + cell.hyperlink(hyperlink_rel->target().path().string()); + } + } + + skip_attributes({ "location", "tooltip", "display" }); + 
expect_end_element(qn("spreadsheetml", "hyperlink")); + } + } + else if (current_worksheet_element == qn("spreadsheetml", "printOptions")) // CT_PrintOptions 0-1 + { + skip_remaining_content(current_worksheet_element); + } + else if (current_worksheet_element == qn("spreadsheetml", "pageMargins")) // CT_PageMargins 0-1 + { + page_margins margins; + + margins.top(parser().attribute("top")); + margins.bottom(parser().attribute("bottom")); + margins.left(parser().attribute("left")); + margins.right(parser().attribute("right")); + margins.header(parser().attribute("header")); + margins.footer(parser().attribute("footer")); + + ws.page_margins(margins); + } + else if (current_worksheet_element == qn("spreadsheetml", "pageSetup")) // CT_PageSetup 0-1 + { + skip_remaining_content(current_worksheet_element); + } + else if (current_worksheet_element == qn("spreadsheetml", "headerFooter")) // CT_HeaderFooter 0-1 + { + header_footer hf; + + hf.align_with_margins( + !parser().attribute_present("alignWithMargins") || is_true(parser().attribute("alignWithMargins"))); + hf.scale_with_doc( + !parser().attribute_present("scaleWithDoc") || is_true(parser().attribute("scaleWithDoc"))); // previously read alignWithMargins (copy-paste); an absent scaleWithDoc still means true + auto different_odd_even = + parser().attribute_present("differentOddEven") && is_true(parser().attribute("differentOddEven")); + auto different_first = + parser().attribute_present("differentFirst") && is_true(parser().attribute("differentFirst")); + + optional, 3>> odd_header; + optional, 3>> odd_footer; + optional, 3>> even_header; + optional, 3>> even_footer; + optional, 3>> first_header; + optional, 3>> first_footer; + + using xlnt::detail::decode_header_footer; + + while (in_element(current_worksheet_element)) + { + auto current_hf_element = expect_start_element(xml::content::simple); + + if (current_hf_element == qn("spreadsheetml", "oddHeader")) + { + odd_header = decode_header_footer(read_text()); + } + else if (current_hf_element == qn("spreadsheetml", "oddFooter")) + { + odd_footer 
= decode_header_footer(read_text()); + } + else if (current_hf_element == qn("spreadsheetml", "evenHeader")) + { + even_header = decode_header_footer(read_text()); + } + else if (current_hf_element == qn("spreadsheetml", "evenFooter")) + { + even_footer = decode_header_footer(read_text()); + } + else if (current_hf_element == qn("spreadsheetml", "firstHeader")) + { + first_header = decode_header_footer(read_text()); + } + else if (current_hf_element == qn("spreadsheetml", "firstFooter")) + { + first_footer = decode_header_footer(read_text()); + } + else + { + unexpected_element(current_hf_element); + } + + expect_end_element(current_hf_element); + } + + for (std::size_t i = 0; i < 3; ++i) + { + auto loc = i == 0 ? header_footer::location::left + : i == 1 ? header_footer::location::center : header_footer::location::right; + + if (different_odd_even) + { + if (odd_header.is_set() && odd_header.get().at(i).is_set() && even_header.is_set() + && even_header.get().at(i).is_set()) + { + hf.odd_even_header(loc, odd_header.get().at(i).get(), even_header.get().at(i).get()); + } + + if (odd_footer.is_set() && odd_footer.get().at(i).is_set() && even_footer.is_set() + && even_footer.get().at(i).is_set()) + { + hf.odd_even_footer(loc, odd_footer.get().at(i).get(), even_footer.get().at(i).get()); + } + } + else + { + if (odd_header.is_set() && odd_header.get().at(i).is_set()) + { + hf.header(loc, odd_header.get().at(i).get()); + } + + if (odd_footer.is_set() && odd_footer.get().at(i).is_set()) + { + hf.footer(loc, odd_footer.get().at(i).get()); + } + } + + if (different_first) + { + } + } + + ws.header_footer(hf); + } + else if (current_worksheet_element == qn("spreadsheetml", "rowBreaks")) // CT_PageBreak 0-1 + { + auto count = parser().attribute_present("count") ? parser().attribute("count") : 0; + auto manual_break_count = parser().attribute_present("manualBreakCount") + ? 
parser().attribute("manualBreakCount") : 0; + + while (in_element(qn("spreadsheetml", "rowBreaks"))) + { + expect_start_element(qn("spreadsheetml", "brk"), xml::content::simple); + + if (parser().attribute_present("id")) + { + ws.page_break_at_row(parser().attribute("id")); + --count; + } + + if (parser().attribute_present("man") && is_true(parser().attribute("man"))) + { + --manual_break_count; + } + + skip_attributes({ "min", "max", "pt" }); + expect_end_element(qn("spreadsheetml", "brk")); + } + } + else if (current_worksheet_element == qn("spreadsheetml", "colBreaks")) // CT_PageBreak 0-1 + { + auto count = parser().attribute_present("count") ? parser().attribute("count") : 0; + auto manual_break_count = parser().attribute_present("manualBreakCount") + ? parser().attribute("manualBreakCount") + : 0; + + while (in_element(qn("spreadsheetml", "colBreaks"))) + { + expect_start_element(qn("spreadsheetml", "brk"), xml::content::simple); + + if (parser().attribute_present("id")) + { + ws.page_break_at_column(parser().attribute("id")); + --count; + } + + if (parser().attribute_present("man") && is_true(parser().attribute("man"))) + { + --manual_break_count; + } + + skip_attributes({ "min", "max", "pt" }); + expect_end_element(qn("spreadsheetml", "brk")); + } + } + else if (current_worksheet_element == qn("spreadsheetml", "customProperties")) // CT_CustomProperties 0-1 + { + skip_remaining_content(current_worksheet_element); + } + else if (current_worksheet_element == qn("spreadsheetml", "cellWatches")) // CT_CellWatches 0-1 + { + skip_remaining_content(current_worksheet_element); + } + else if (current_worksheet_element == qn("spreadsheetml", "ignoredErrors")) // CT_IgnoredErrors 0-1 + { + skip_remaining_content(current_worksheet_element); + } + else if (current_worksheet_element == qn("spreadsheetml", "smartTags")) // CT_SmartTags 0-1 + { + skip_remaining_content(current_worksheet_element); + } + else if (current_worksheet_element == qn("spreadsheetml", "drawing")) 
// CT_Drawing 0-1 + { + skip_remaining_content(current_worksheet_element); + } + else if (current_worksheet_element == qn("spreadsheetml", "legacyDrawing")) + { + skip_remaining_content(current_worksheet_element); + } + else if (current_worksheet_element == qn("spreadsheetml", "extLst")) + { + skip_remaining_content(current_worksheet_element); + } + else + { + unexpected_element(current_worksheet_element); + } + + expect_end_element(current_worksheet_element); + } + + expect_end_element(qn("spreadsheetml", "worksheet")); + + if (manifest.has_relationship(sheet_path, xlnt::relationship_type::comments)) + { + auto comments_part = manifest.canonicalize({ workbook_rel, sheet_rel, + manifest.relationship(sheet_path, xlnt::relationship_type::comments) }); + + auto receive = xml::parser::receive_default; + auto comments_part_streambuf = archive_->open(comments_part); + std::istream comments_part_stream(comments_part_streambuf.get()); + xml::parser parser(comments_part_stream, comments_part.string(), receive); + parser_ = &parser; + + read_comments(ws); + + if (manifest.has_relationship(sheet_path, xlnt::relationship_type::vml_drawing)) + { + auto vml_drawings_part = manifest.canonicalize({ workbook_rel, sheet_rel, + manifest.relationship(sheet_path, xlnt::relationship_type::vml_drawing) }); + + auto vml_drawings_part_streambuf = archive_->open(comments_part); + std::istream vml_drawings_part_stream(comments_part_streambuf.get()); + xml::parser vml_parser(vml_drawings_part_stream, vml_drawings_part.string(), receive); + parser_ = &vml_parser; + + read_vml_drawings(ws); + } + } + + return ws; } xml::parser &xlsx_consumer::parser() @@ -143,6 +1032,12 @@ xml::parser &xlsx_consumer::parser() return *parser_; } +bool xlsx_consumer::has_cell() +{ + return in_element(qn("spreadsheetml", "row")) + || in_element(qn("spreadsheetml", "sheetData")); +} + std::vector xlsx_consumer::read_relationships(const path &part) { const auto part_rels_path = part.parent().append("_rels") @@ 
-167,7 +1062,7 @@ std::vector xlsx_consumer::read_relationships(const path &part) : xlnt::target_mode::internal; auto target = xlnt::uri(parser.attribute("Target")); - if (target.path().is_absolute() && target_mode == target_mode::internal) + if (target.path().is_absolute() && target_mode == xlnt::target_mode::internal) { target = uri(target.path().relative_to(path(part.string()).resolve(path("/"))).string()); } @@ -321,8 +1216,10 @@ void xlsx_consumer::read_part(const std::vector &rel_chain) parser_ = nullptr; } -void xlsx_consumer::populate_workbook() +void xlsx_consumer::populate_workbook(bool streaming) { + streaming_ = streaming; + target_.clear(); read_content_types(); @@ -535,10 +1432,26 @@ void xlsx_consumer::read_office_document(const std::string &content_type) // CT_ "showHorizontalScroll", "showSheetTabs", "showVerticalScroll"}); workbook_view view; - view.x_window = parser().attribute("xWindow"); - view.y_window = parser().attribute("yWindow"); - view.window_width = parser().attribute("windowWidth"); - view.window_height = parser().attribute("windowHeight"); + + if (parser().attribute_present("xWindow")) + { + view.x_window = parser().attribute("xWindow"); + } + + if (parser().attribute_present("yWindow")) + { + view.y_window = parser().attribute("yWindow"); + } + + if (parser().attribute_present("windowWidth")) + { + view.window_width = parser().attribute("windowWidth"); + } + + if (parser().attribute_present("windowHeight")) + { + view.window_height = parser().attribute("windowHeight"); + } if (parser().attribute_present("tabRatio")) { @@ -640,17 +1553,28 @@ void xlsx_consumer::read_office_document(const std::string &content_type) // CT_ if (manifest().has_relationship(workbook_path, relationship_type::shared_string_table)) { - read_part({workbook_rel, manifest().relationship(workbook_path, relationship_type::shared_string_table)}); + read_part({workbook_rel, + manifest().relationship(workbook_path, + relationship_type::shared_string_table)}); } if 
(manifest().has_relationship(workbook_path, relationship_type::stylesheet)) { - read_part({workbook_rel, manifest().relationship(workbook_path, relationship_type::stylesheet)}); + read_part({workbook_rel, + manifest().relationship(workbook_path, + relationship_type::stylesheet)}); } if (manifest().has_relationship(workbook_path, relationship_type::theme)) { - read_part({workbook_rel, manifest().relationship(workbook_path, relationship_type::theme)}); + read_part({workbook_rel, + manifest().relationship(workbook_path, + relationship_type::theme)}); + } + + if (streaming_) + { + return; } for (auto worksheet_rel : manifest().relationships(workbook_path, relationship_type::worksheet)) @@ -1364,15 +2288,19 @@ void xlsx_consumer::read_stylesheet() void xlsx_consumer::read_theme() { - auto workbook_rel = manifest().relationship(path("/"), relationship_type::office_document); - auto theme_rel = manifest().relationship(workbook_rel.target().path(), relationship_type::theme); + auto workbook_rel = manifest().relationship(path("/"), + relationship_type::office_document); + auto theme_rel = manifest().relationship(workbook_rel.target().path(), + relationship_type::theme); auto theme_path = manifest().canonicalize({workbook_rel, theme_rel}); target_.theme(theme()); if (manifest().has_relationship(theme_path, relationship_type::image)) { - read_part({workbook_rel, theme_rel, manifest().relationship(theme_path, relationship_type::image)}); + read_part({workbook_rel, theme_rel, + manifest().relationship(theme_path, + relationship_type::image)}); } } @@ -1380,690 +2308,6 @@ void xlsx_consumer::read_volatile_dependencies() { } -// CT_Worksheet -void xlsx_consumer::read_worksheet(const std::string &rel_id) -{ -/* - static const auto &xmlns = constants::namespace_("spreadsheetml"); - static const auto &xmlns_mc = constants::namespace_("mc"); - static const auto &xmlns_x14ac = constants::namespace_("x14ac"); - static const auto &xmlns_r = constants::namespace_("r"); -*/ - auto title = 
std::find_if(target_.d_->sheet_title_rel_id_map_.begin(), - target_.d_->sheet_title_rel_id_map_.end(), - [&](const std::pair &p) { - return p.second == rel_id; - })->first; - - auto id = sheet_title_id_map_[title]; - auto index = sheet_title_index_map_[title]; - - auto insertion_iter = target_.d_->worksheets_.begin(); - while (insertion_iter != target_.d_->worksheets_.end() && sheet_title_index_map_[insertion_iter->title_] < index) - { - ++insertion_iter; - } - - target_.d_->worksheets_.emplace(insertion_iter, &target_, id, title); - - auto ws = target_.sheet_by_id(id); - - expect_start_element(qn("spreadsheetml", "worksheet"), xml::content::complex); // CT_Worksheet - skip_attributes({qn("mc", "Ignorable")}); - read_namespaces(); - - xlnt::range_reference full_range; - auto &manifest = target_.manifest(); - - const auto workbook_rel = manifest.relationship(path("/"), relationship_type::office_document); - const auto sheet_rel = manifest.relationship(workbook_rel.target().path(), rel_id); - path sheet_path(sheet_rel.source().path().parent().append(sheet_rel.target().path())); - auto hyperlinks = manifest.relationships(sheet_path, xlnt::relationship_type::hyperlink); - - while (in_element(qn("spreadsheetml", "worksheet"))) - { - auto current_worksheet_element = expect_start_element(xml::content::complex); - - if (current_worksheet_element == qn("spreadsheetml", "sheetPr")) // CT_SheetPr 0-1 - { - while (in_element(current_worksheet_element)) - { - auto sheet_pr_child_element = expect_start_element(xml::content::simple); - - if (sheet_pr_child_element == qn("spreadsheetml", "tabColor")) // CT_Color 0-1 - { - read_color(); - } - else if (sheet_pr_child_element == qn("spreadsheetml", "outlinePr")) // CT_OutlinePr 0-1 - { - skip_attribute("applyStyles"); // optional, boolean, false - skip_attribute("summaryBelow"); // optional, boolean, true - skip_attribute("summaryRight"); // optional, boolean, true - skip_attribute("showOutlineSymbols"); // optional, boolean, true - 
} - else if (sheet_pr_child_element == qn("spreadsheetml", "pageSetUpPr")) // CT_PageSetUpPr 0-1 - { - skip_attribute("autoPageBreaks"); // optional, boolean, true - skip_attribute("fitToPage"); // optional, boolean, false - } - else - { - unexpected_element(sheet_pr_child_element); - } - - expect_end_element(sheet_pr_child_element); - } - - skip_attribute("syncHorizontal"); // optional, boolean, false - skip_attribute("syncVertical"); // optional, boolean, false - skip_attribute("syncRef"); // optional, ST_Ref, false - skip_attribute("transitionEvaluation"); // optional, boolean, false - skip_attribute("transitionEntry"); // optional, boolean, false - skip_attribute("published"); // optional, boolean, true - skip_attribute("codeName"); // optional, string - skip_attribute("filterMode"); // optional, boolean, false - skip_attribute("enableFormatConditionsCalculation"); // optional, boolean, true - } - else if (current_worksheet_element == qn("spreadsheetml", "dimension")) // CT_SheetDimension 0-1 - { - full_range = xlnt::range_reference(parser().attribute("ref")); - } - else if (current_worksheet_element == qn("spreadsheetml", "sheetViews")) // CT_SheetViews 0-1 - { - while (in_element(current_worksheet_element)) - { - expect_start_element(qn("spreadsheetml", "sheetView"), xml::content::complex); // CT_SheetView 1+ - - sheet_view new_view; - new_view.id(parser().attribute("workbookViewId")); - - if (parser().attribute_present("showGridLines")) // default="true" - { - new_view.show_grid_lines(is_true(parser().attribute("showGridLines"))); - } - - if (parser().attribute_present("defaultGridColor")) // default="true" - { - new_view.default_grid_color(is_true(parser().attribute("defaultGridColor"))); - } - - if (parser().attribute_present("view") && parser().attribute("view") != "normal") - { - new_view.type(parser().attribute("view") == "pageBreakPreview" ? 
sheet_view_type::page_break_preview - : sheet_view_type::page_layout); - } - - skip_attributes({"windowProtection", "showFormulas", "showRowColHeaders", "showZeros", "rightToLeft", - "tabSelected", "showRuler", "showOutlineSymbols", "showWhiteSpace", "view", "topLeftCell", - "colorId", "zoomScale", "zoomScaleNormal", "zoomScaleSheetLayoutView", "zoomScalePageLayoutView"}); - - while (in_element(qn("spreadsheetml", "sheetView"))) - { - auto sheet_view_child_element = expect_start_element(xml::content::simple); - - if (sheet_view_child_element == qn("spreadsheetml", "pane")) // CT_Pane 0-1 - { - pane new_pane; - - if (parser().attribute_present("topLeftCell")) - { - new_pane.top_left_cell = cell_reference(parser().attribute("topLeftCell")); - } - - if (parser().attribute_present("xSplit")) - { - new_pane.x_split = parser().attribute("xSplit"); - } - - if (parser().attribute_present("ySplit")) - { - new_pane.y_split = parser().attribute("ySplit"); - } - - if (parser().attribute_present("activePane")) - { - new_pane.active_pane = parser().attribute("activePane"); - } - - if (parser().attribute_present("state")) - { - new_pane.state = parser().attribute("state"); - } - - new_view.pane(new_pane); - } - else if (sheet_view_child_element == qn("spreadsheetml", "selection")) // CT_Selection 0-4 - { - skip_remaining_content(sheet_view_child_element); - } - else if (sheet_view_child_element == qn("spreadsheetml", "pivotSelection")) // CT_PivotSelection 0-4 - { - skip_remaining_content(sheet_view_child_element); - } - else if (sheet_view_child_element == qn("spreadsheetml", "extLst")) // CT_ExtensionList 0-1 - { - skip_remaining_content(sheet_view_child_element); - } - else - { - unexpected_element(sheet_view_child_element); - } - - expect_end_element(sheet_view_child_element); - } - - expect_end_element(qn("spreadsheetml", "sheetView")); - - ws.d_->views_.push_back(new_view); - } - } - else if (current_worksheet_element == qn("spreadsheetml", "sheetFormatPr")) // 
CT_SheetFormatPr 0-1 - { - skip_remaining_content(current_worksheet_element); - } - else if (current_worksheet_element == qn("spreadsheetml", "cols")) // CT_Cols 0+ - { - while (in_element(qn("spreadsheetml", "cols"))) - { - expect_start_element(qn("spreadsheetml", "col"), xml::content::simple); - - skip_attributes({"bestFit", "collapsed", "outlineLevel"}); - - auto min = static_cast(std::stoull(parser().attribute("min"))); - auto max = static_cast(std::stoull(parser().attribute("max"))); - - optional width; - - if (parser().attribute_present("width")) - { - width = parser().attribute("width"); - } - - optional column_style; - - if (parser().attribute_present("style")) - { - column_style = parser().attribute("style"); - } - - auto custom = - parser().attribute_present("customWidth") ? is_true(parser().attribute("customWidth")) : false; - auto hidden = parser().attribute_present("hidden") ? is_true(parser().attribute("hidden")) : false; - - expect_end_element(qn("spreadsheetml", "col")); - - for (auto column = min; column <= max; column++) - { - column_properties props; - - if (width.is_set()) - { - props.width = width.get(); - } - - if (column_style.is_set()) - { - props.style = column_style.get(); - } - - props.hidden = hidden; - props.custom_width = custom; - ws.add_column_properties(column, props); - } - } - } - else if (current_worksheet_element == qn("spreadsheetml", "sheetData")) // CT_SheetData 1 - { - while (in_element(qn("spreadsheetml", "sheetData"))) - { - expect_start_element(qn("spreadsheetml", "row"), xml::content::complex); // CT_Row - auto row_index = parser().attribute("r"); - - if (parser().attribute_present("ht")) - { - ws.row_properties(row_index).height = parser().attribute("ht"); - } - - if (parser().attribute_present("customHeight")) - { - ws.row_properties(row_index).custom_height = is_true(parser().attribute("customHeight")); - } - - if (parser().attribute_present("hidden") && is_true(parser().attribute("hidden"))) - { - 
ws.row_properties(row_index).hidden = true; - } - - skip_attributes({qn("x14ac", "dyDescent")}); - skip_attributes({"customFormat", "s", "customFont", - "outlineLevel", "collapsed", "thickTop", "thickBot", - "ph", "spans"}); - - while (in_element(qn("spreadsheetml", "row"))) - { - expect_start_element(qn("spreadsheetml", "c"), xml::content::complex); - auto cell = ws.cell(cell_reference(parser().attribute("r"))); - - auto has_type = parser().attribute_present("t"); - auto type = has_type ? parser().attribute("t") : "n"; - - auto has_format = parser().attribute_present("s"); - auto format_id = static_cast(has_format ? std::stoull(parser().attribute("s")) : 0LL); - - auto has_value = false; - auto value_string = std::string(); - - auto has_formula = false; - auto has_shared_formula = false; - auto formula_value_string = std::string(); - - while (in_element(qn("spreadsheetml", "c"))) - { - auto current_element = expect_start_element(xml::content::mixed); - - if (current_element == qn("spreadsheetml", "v")) // s:ST_Xstring - { - has_value = true; - value_string = read_text(); - } - else if (current_element == qn("spreadsheetml", "f")) // CT_CellFormula - { - has_formula = true; - - if (parser().attribute_present("t")) - { - has_shared_formula = parser().attribute("t") == "shared"; - } - - skip_attributes( - {"aca", "ref", "dt2D", "dtr", "del1", "del2", "r1", "r2", "ca", "si", "bx"}); - - formula_value_string = read_text(); - } - else if (current_element == qn("spreadsheetml", "is")) // CT_Rst - { - expect_start_element(qn("spreadsheetml", "t"), xml::content::simple); - value_string = read_text(); - expect_end_element(qn("spreadsheetml", "t")); - } - else - { - unexpected_element(current_element); - } - - expect_end_element(current_element); - } - - expect_end_element(qn("spreadsheetml", "c")); - - if (has_formula && !has_shared_formula) - { - cell.formula(formula_value_string); - } - - if (has_value) - { - if (type == "str") - { - cell.d_->value_text_ = value_string; - 
cell.data_type(cell::type::formula_string); - } - else if (type == "inlineStr") - { - cell.d_->value_text_ = value_string; - cell.data_type(cell::type::inline_string); - } - else if (type == "s") - { - cell.d_->value_numeric_ = std::stold(value_string); - cell.data_type(cell::type::shared_string); - } - else if (type == "b") // boolean - { - cell.value(is_true(value_string)); - } - else if (type == "n") // numeric - { - cell.value(std::stold(value_string)); - } - else if (!value_string.empty() && value_string[0] == '#') - { - cell.error(value_string); - } - } - - if (has_format) - { - cell.format(target_.format(format_id)); - } - } - - expect_end_element(qn("spreadsheetml", "row")); - } - } - else if (current_worksheet_element == qn("spreadsheetml", "sheetCalcPr")) // CT_SheetCalcPr 0-1 - { - skip_remaining_content(current_worksheet_element); - } - else if (current_worksheet_element == qn("spreadsheetml", "sheetProtection")) // CT_SheetProtection 0-1 - { - skip_remaining_content(current_worksheet_element); - } - else if (current_worksheet_element == qn("spreadsheetml", "protectedRanges")) // CT_ProtectedRanges 0-1 - { - skip_remaining_content(current_worksheet_element); - } - else if (current_worksheet_element == qn("spreadsheetml", "scenarios")) // CT_Scenarios 0-1 - { - skip_remaining_content(current_worksheet_element); - } - else if (current_worksheet_element == qn("spreadsheetml", "autoFilter")) // CT_AutoFilter 0-1 - { - ws.auto_filter(xlnt::range_reference(parser().attribute("ref"))); - // auto filter complex - skip_remaining_content(current_worksheet_element); - } - else if (current_worksheet_element == qn("spreadsheetml", "sortState")) // CT_SortState 0-1 - { - skip_remaining_content(current_worksheet_element); - } - else if (current_worksheet_element == qn("spreadsheetml", "dataConsolidate")) // CT_DataConsolidate 0-1 - { - skip_remaining_content(current_worksheet_element); - } - else if (current_worksheet_element == qn("spreadsheetml", 
"customSheetViews")) // CT_CustomSheetViews 0-1 - { - skip_remaining_content(current_worksheet_element); - } - else if (current_worksheet_element == qn("spreadsheetml", "mergeCells")) // CT_MergeCells 0-1 - { - auto count = std::stoull(parser().attribute("count")); - - while (in_element(qn("spreadsheetml", "mergeCells"))) - { - expect_start_element(qn("spreadsheetml", "mergeCell"), xml::content::simple); - ws.merge_cells(range_reference(parser().attribute("ref"))); - expect_end_element(qn("spreadsheetml", "mergeCell")); - - count--; - } - - if (count != 0) - { - throw invalid_file("sizes don't match"); - } - } - else if (current_worksheet_element == qn("spreadsheetml", "phoneticPr")) // CT_PhoneticPr 0-1 - { - skip_remaining_content(current_worksheet_element); - } - else if (current_worksheet_element == qn("spreadsheetml", "conditionalFormatting")) // CT_ConditionalFormatting 0+ - { - skip_remaining_content(current_worksheet_element); - } - else if (current_worksheet_element == qn("spreadsheetml", "dataValidations")) // CT_DataValidations 0-1 - { - skip_remaining_content(current_worksheet_element); - } - else if (current_worksheet_element == qn("spreadsheetml", "hyperlinks")) // CT_Hyperlinks 0-1 - { - while (in_element(qn("spreadsheetml", "hyperlinks"))) - { - expect_start_element(qn("spreadsheetml", "hyperlink"), xml::content::simple); - - auto cell = ws.cell(parser().attribute("ref")); - - if (parser().attribute_present(qn("r", "id"))) - { - auto hyperlink_rel_id = parser().attribute(qn("r", "id")); - auto hyperlink_rel = std::find_if(hyperlinks.begin(), hyperlinks.end(), - [&](const relationship &r) { return r.id() == hyperlink_rel_id; }); - - if (hyperlink_rel != hyperlinks.end()) - { - cell.hyperlink(hyperlink_rel->target().path().string()); - } - } - - skip_attributes({"location", "tooltip", "display"}); - expect_end_element(qn("spreadsheetml", "hyperlink")); - } - } - else if (current_worksheet_element == qn("spreadsheetml", "printOptions")) // 
CT_PrintOptions 0-1 - { - skip_remaining_content(current_worksheet_element); - } - else if (current_worksheet_element == qn("spreadsheetml", "pageMargins")) // CT_PageMargins 0-1 - { - page_margins margins; - - margins.top(parser().attribute("top")); - margins.bottom(parser().attribute("bottom")); - margins.left(parser().attribute("left")); - margins.right(parser().attribute("right")); - margins.header(parser().attribute("header")); - margins.footer(parser().attribute("footer")); - - ws.page_margins(margins); - } - else if (current_worksheet_element == qn("spreadsheetml", "pageSetup")) // CT_PageSetup 0-1 - { - skip_remaining_content(current_worksheet_element); - } - else if (current_worksheet_element == qn("spreadsheetml", "headerFooter")) // CT_HeaderFooter 0-1 - { - header_footer hf; - - hf.align_with_margins( - !parser().attribute_present("alignWithMargins") || is_true(parser().attribute("alignWithMargins"))); - hf.scale_with_doc( - !parser().attribute_present("alignWithMargins") || is_true(parser().attribute("alignWithMargins"))); - auto different_odd_even = - parser().attribute_present("differentOddEven") && is_true(parser().attribute("differentOddEven")); - auto different_first = - parser().attribute_present("differentFirst") && is_true(parser().attribute("differentFirst")); - - optional, 3>> odd_header; - optional, 3>> odd_footer; - optional, 3>> even_header; - optional, 3>> even_footer; - optional, 3>> first_header; - optional, 3>> first_footer; - - using xlnt::detail::decode_header_footer; - - while (in_element(current_worksheet_element)) - { - auto current_hf_element = expect_start_element(xml::content::simple); - - if (current_hf_element == qn("spreadsheetml", "oddHeader")) - { - odd_header = decode_header_footer(read_text()); - } - else if (current_hf_element == qn("spreadsheetml", "oddFooter")) - { - odd_footer = decode_header_footer(read_text()); - } - else if (current_hf_element == qn("spreadsheetml", "evenHeader")) - { - even_header = 
decode_header_footer(read_text()); - } - else if (current_hf_element == qn("spreadsheetml", "evenFooter")) - { - even_footer = decode_header_footer(read_text()); - } - else if (current_hf_element == qn("spreadsheetml", "firstHeader")) - { - first_header = decode_header_footer(read_text()); - } - else if (current_hf_element == qn("spreadsheetml", "firstFooter")) - { - first_footer = decode_header_footer(read_text()); - } - else - { - unexpected_element(current_hf_element); - } - - expect_end_element(current_hf_element); - } - - for (std::size_t i = 0; i < 3; ++i) - { - auto loc = i == 0 ? header_footer::location::left - : i == 1 ? header_footer::location::center : header_footer::location::right; - - if (different_odd_even) - { - if (odd_header.is_set() && odd_header.get().at(i).is_set() && even_header.is_set() - && even_header.get().at(i).is_set()) - { - hf.odd_even_header(loc, odd_header.get().at(i).get(), even_header.get().at(i).get()); - } - - if (odd_footer.is_set() && odd_footer.get().at(i).is_set() && even_footer.is_set() - && even_footer.get().at(i).is_set()) - { - hf.odd_even_footer(loc, odd_footer.get().at(i).get(), even_footer.get().at(i).get()); - } - } - else - { - if (odd_header.is_set() && odd_header.get().at(i).is_set()) - { - hf.header(loc, odd_header.get().at(i).get()); - } - - if (odd_footer.is_set() && odd_footer.get().at(i).is_set()) - { - hf.footer(loc, odd_footer.get().at(i).get()); - } - } - - if (different_first) - { - } - } - - ws.header_footer(hf); - } - else if (current_worksheet_element == qn("spreadsheetml", "rowBreaks")) // CT_PageBreak 0-1 - { - auto count = parser().attribute_present("count") ? parser().attribute("count") : 0; - auto manual_break_count = parser().attribute_present("manualBreakCount") - ? 
parser().attribute("manualBreakCount") : 0; - - while (in_element(qn("spreadsheetml", "rowBreaks"))) - { - expect_start_element(qn("spreadsheetml", "brk"), xml::content::simple); - - if (parser().attribute_present("id")) - { - ws.page_break_at_row(parser().attribute("id")); - --count; - } - - if (parser().attribute_present("man") && is_true(parser().attribute("man"))) - { - --manual_break_count; - } - - skip_attributes({"min", "max", "pt"}); - expect_end_element(qn("spreadsheetml", "brk")); - } - } - else if (current_worksheet_element == qn("spreadsheetml", "colBreaks")) // CT_PageBreak 0-1 - { - auto count = parser().attribute_present("count") ? parser().attribute("count") : 0; - auto manual_break_count = parser().attribute_present("manualBreakCount") - ? parser().attribute("manualBreakCount") - : 0; - - while (in_element(qn("spreadsheetml", "colBreaks"))) - { - expect_start_element(qn("spreadsheetml", "brk"), xml::content::simple); - - if (parser().attribute_present("id")) - { - ws.page_break_at_column(parser().attribute("id")); - --count; - } - - if (parser().attribute_present("man") && is_true(parser().attribute("man"))) - { - --manual_break_count; - } - - skip_attributes({"min", "max", "pt"}); - expect_end_element(qn("spreadsheetml", "brk")); - } - } - else if (current_worksheet_element == qn("spreadsheetml", "customProperties")) // CT_CustomProperties 0-1 - { - skip_remaining_content(current_worksheet_element); - } - else if (current_worksheet_element == qn("spreadsheetml", "cellWatches")) // CT_CellWatches 0-1 - { - skip_remaining_content(current_worksheet_element); - } - else if (current_worksheet_element == qn("spreadsheetml", "ignoredErrors")) // CT_IgnoredErrors 0-1 - { - skip_remaining_content(current_worksheet_element); - } - else if (current_worksheet_element == qn("spreadsheetml", "smartTags")) // CT_SmartTags 0-1 - { - skip_remaining_content(current_worksheet_element); - } - else if (current_worksheet_element == qn("spreadsheetml", "drawing")) // 
CT_Drawing 0-1 - { - skip_remaining_content(current_worksheet_element); - } - else if (current_worksheet_element == qn("spreadsheetml", "legacyDrawing")) - { - skip_remaining_content(current_worksheet_element); - } - else if (current_worksheet_element == qn("spreadsheetml", "extLst")) - { - skip_remaining_content(current_worksheet_element); - } - else - { - unexpected_element(current_worksheet_element); - } - - expect_end_element(current_worksheet_element); - } - - expect_end_element(qn("spreadsheetml", "worksheet")); - - if (manifest.has_relationship(sheet_path, xlnt::relationship_type::comments)) - { - auto comments_part = manifest.canonicalize( - {workbook_rel, sheet_rel, manifest.relationship(sheet_path, xlnt::relationship_type::comments)}); - - auto receive = xml::parser::receive_default; - auto comments_part_streambuf = archive_->open(comments_part); - std::istream comments_part_stream(comments_part_streambuf.get()); - xml::parser parser(comments_part_stream, comments_part.string(), receive); - parser_ = &parser; - - read_comments(ws); - - if (manifest.has_relationship(sheet_path, xlnt::relationship_type::vml_drawing)) - { - auto vml_drawings_part = manifest.canonicalize( - {workbook_rel, sheet_rel, manifest.relationship(sheet_path, xlnt::relationship_type::vml_drawing)}); - - auto vml_drawings_part_streambuf = archive_->open(comments_part); - std::istream vml_drawings_part_stream(comments_part_streambuf.get()); - xml::parser vml_parser(vml_drawings_part_stream, vml_drawings_part.string(), receive); - parser_ = &vml_parser; - - read_vml_drawings(ws); - } - } -} - // Sheet Relationship Target Parts void xlsx_consumer::read_vml_drawings(worksheet /*ws*/) diff --git a/source/detail/serialization/xlsx_consumer.hpp b/source/detail/serialization/xlsx_consumer.hpp index f8d65344..76475e17 100644 --- a/source/detail/serialization/xlsx_consumer.hpp +++ b/source/detail/serialization/xlsx_consumer.hpp @@ -28,6 +28,7 @@ #include #include #include +#include #include 
#include @@ -36,11 +37,15 @@ namespace xlnt { +class cell; class color; class rich_text; class manifest; +template +class optional; class path; class relationship; +class streaming_workbook_reader; class variant; class workbook; class worksheet; @@ -48,6 +53,8 @@ class worksheet; namespace detail { class izstream; +struct cell_impl; +struct worksheet_impl; /// /// Handles writing a workbook into an XLSX file. @@ -57,16 +64,31 @@ class xlsx_consumer public: xlsx_consumer(workbook &destination); + ~xlsx_consumer(); + void read(std::istream &source); void read(std::istream &source, const std::string &password); private: + friend class xlnt::streaming_workbook_reader; + + void open(std::istream &source); + + bool has_cell(); + + /// + /// Reads the next cell in the current worksheet and optionally returns it if + /// the last cell in the sheet has not yet been read. An exception will be thrown + /// if this is not open as a streaming consumer. + /// + cell read_cell(); + /// /// Read all the files needed from the XLSX archive and initialize all of /// the data in the workbook to match. 
/// - void populate_workbook(); + void populate_workbook(bool streaming); /// /// @@ -106,27 +128,27 @@ private: void read_calculation_chain(); /// - /// + /// /// void read_connections(); /// - /// + /// /// void read_custom_property(); /// - /// + /// /// void read_custom_xml_mappings(); /// - /// + /// /// void read_external_workbook_references(); /// - /// + /// /// void read_pivot_table(); @@ -136,17 +158,17 @@ private: void read_shared_string_table(); /// - /// + /// /// void read_shared_workbook_revision_headers(); /// - /// + /// /// void read_shared_workbook(); /// - /// + /// /// void read_shared_workbook_user_data(); @@ -161,56 +183,71 @@ private: void read_theme(); /// - /// + /// /// void read_volatile_dependencies(); /// /// xl/sheets/*.xml /// - void read_chartsheet(const std::string &title); + void read_chartsheet(const std::string &rel_id); /// /// xl/sheets/*.xml /// - void read_dialogsheet(const std::string &title); + void read_dialogsheet(const std::string &rel_id); /// /// xl/sheets/*.xml /// - void read_worksheet(const std::string &title); + void read_worksheet(const std::string &rel_id); + + /// + /// xl/sheets/*.xml + /// + std::string read_worksheet_begin(const std::string &rel_id); + + /// + /// xl/sheets/*.xml + /// + void read_worksheet_sheetdata(); + + /// + /// xl/sheets/*.xml + /// + worksheet read_worksheet_end(const std::string &rel_id); // Sheet Relationship Target Parts /// - /// + /// /// void read_comments(worksheet ws); - + /// - /// + /// /// void read_vml_drawings(worksheet ws); /// - /// + /// /// void read_drawings(); // Unknown Parts /// - /// + /// /// void read_unknown_parts(); /// - /// + /// /// void read_unknown_relationships(); /// - /// + /// /// void read_image(const path &part); @@ -362,14 +399,22 @@ private: /// /// This pointer is generally set by instantiating an xml::parser in a function - /// scope and then calling a read_*() method which uses xlsx_consumer::parser() + /// scope and then calling a read_*() 
method which uses xlsx_consumer::parser() /// to access the object. /// xml::parser *parser_; - + std::vector stack_; bool preserve_space_ = false; + + bool streaming_ = false; + + std::unique_ptr streaming_cell_; + + detail::cell_impl *current_cell_; + + detail::worksheet_impl *current_worksheet_; }; } // namespace detail diff --git a/source/detail/serialization/xlsx_producer.cpp b/source/detail/serialization/xlsx_producer.cpp index 8ab586aa..1c1c1626 100644 --- a/source/detail/serialization/xlsx_producer.cpp +++ b/source/detail/serialization/xlsx_producer.cpp @@ -36,13 +36,12 @@ #include #include #include +#include #include #include #include #include -using namespace std::string_literals; - namespace { /// @@ -169,7 +168,7 @@ void xlsx_producer::write_content_types() const auto content_types_path = path("[Content_Types].xml"); begin_part(content_types_path); - const auto xmlns = "http://schemas.openxmlformats.org/package/2006/content-types"s; + const auto xmlns = "http://schemas.openxmlformats.org/package/2006/content-types"; write_start_element(xmlns, "Types"); write_namespace(xmlns, ""); @@ -288,7 +287,7 @@ void xlsx_producer::write_property(const std::string &name, const variant &value write_start_element(constants::ns("vt"), "vector"); auto vector = value.get>(); - std::unordered_set types; + std::unordered_set> types; for (const auto &element : vector) { @@ -2477,9 +2476,9 @@ void xlsx_producer::write_worksheet(const relationship &rel) auto first_header = std::string(); auto first_footer = std::string(); - const auto locations = + const auto locations = { - header_footer::location::left, + header_footer::location::left, header_footer::location::center, header_footer::location::right }; @@ -2715,7 +2714,7 @@ void xlsx_producer::write_comments(const relationship & /*rel*/, worksheet ws, c if (run.second.is_set()) { write_start_element(xmlns, "rPr"); - + if (run.second.get().bold()) { write_start_element(xmlns, "b"); diff --git 
a/source/detail/serialization/zstream.cpp b/source/detail/serialization/zstream.cpp index b9aab711..737e0300 100644 --- a/source/detail/serialization/zstream.cpp +++ b/source/detail/serialization/zstream.cpp @@ -482,7 +482,9 @@ std::unique_ptr ozstream::open(const path &filename) zheader header; header.filename = filename.string(); file_headers_.push_back(header); - return std::make_unique(&file_headers_.back(), destination_stream_); + auto buffer = new zip_streambuf_compress(&file_headers_.back(), destination_stream_); + + return std::unique_ptr(buffer); } izstream::izstream(std::istream &stream) @@ -526,7 +528,7 @@ bool izstream::read_central_header() } source_stream_.read(reinterpret_cast(buf.data()), read_start); - + if (buf[0] == 0xd0 && buf[1] == 0xcf && buf[2] == 0x11 && buf[3] == 0xe0 && buf[4] == 0xa1 && buf[5] == 0xb1 && buf[6] == 0x1a && buf[7] == 0xe1) { @@ -595,7 +597,9 @@ std::unique_ptr izstream::open(const path &filename) const auto header = file_headers_.at(filename.string()); source_stream_.seekg(header.header_offset); - return std::make_unique(source_stream_, header); + auto buffer = new zip_streambuf_decompress(source_stream_, header); + + return std::unique_ptr(buffer); } std::string izstream::read(const path &filename) const diff --git a/source/detail/unicode.cpp b/source/detail/unicode.cpp index e0e01d6b..722b6800 100644 --- a/source/detail/unicode.cpp +++ b/source/detail/unicode.cpp @@ -21,15 +21,21 @@ // @license: http://www.opensource.org/licenses/mit-license.php // @author: see AUTHORS file -#include #include #include #include +#ifdef UTFCPP +#include +#else +#include +#endif + namespace xlnt { namespace detail { +#ifndef UTFCPP #ifdef _MSC_VER std::u16string utf8_to_utf16(const std::string &utf8_string) { @@ -63,6 +69,23 @@ std::string utf16_to_utf8(const std::u16string &utf16_string) char16_t>{}.to_bytes(utf16_string); } #endif +#else +std::u16string utf8_to_utf16(const std::string &utf8_string) +{ + std::u16string result; + 
utf8::utf8to16(utf8_string.begin(), utf8_string.end(), std::back_inserter(result)); + + return result; +} + +std::string utf16_to_utf8(const std::u16string &utf16_string) +{ + std::string result; + utf8::utf16to8(utf16_string.begin(), utf16_string.end(), std::back_inserter(result)); + + return result; +} +#endif std::string latin1_to_utf8(const std::string &latin1) { diff --git a/source/utils/path.cpp b/source/utils/path.cpp index 01da6803..2d22888e 100644 --- a/source/utils/path.cpp +++ b/source/utils/path.cpp @@ -21,7 +21,6 @@ // @license: http://www.opensource.org/licenses/mit-license.php // @author: see AUTHORS file -#include #include #include #include @@ -32,6 +31,8 @@ #include #include #include +#elif defined(_MSC_VER) +#include #endif #include diff --git a/source/workbook/streaming_workbook_reader.cpp b/source/workbook/streaming_workbook_reader.cpp new file mode 100644 index 00000000..03c9a1ab --- /dev/null +++ b/source/workbook/streaming_workbook_reader.cpp @@ -0,0 +1,185 @@ +// Copyright (c) 2017 Thomas Fussell +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE +// +// @license: http://www.opensource.org/licenses/mit-license.php +// @author: see AUTHORS file + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace { + +//TODO: (important) this is duplicated from workbook.cpp, find a common place to keep it +#ifdef _MSC_VER +void open_stream(std::ifstream &stream, const std::wstring &path) +{ + stream.open(path, std::ios::binary); +} + +void open_stream(std::ofstream &stream, const std::wstring &path) +{ + stream.open(path, std::ios::binary); +} + +void open_stream(std::ifstream &stream, const std::string &path) +{ + open_stream(stream, xlnt::path(path).wstring()); +} + +void open_stream(std::ofstream &stream, const std::string &path) +{ + open_stream(stream, xlnt::path(path).wstring()); +} +#else +void open_stream(std::ifstream &stream, const std::string &path) +{ + stream.open(path, std::ios::binary); +} + +void open_stream(std::ofstream &stream, const std::string &path) +{ + stream.open(path, std::ios::binary); +} +#endif + +} // namespace + + +namespace xlnt { + +streaming_workbook_reader::streaming_workbook_reader() +{ + +} + +streaming_workbook_reader::~streaming_workbook_reader() +{ + close(); +} + +void streaming_workbook_reader::close() +{ + if (consumer_) + { + consumer_.reset(nullptr); + stream_buffer_.reset(nullptr); + } +} + +bool streaming_workbook_reader::has_cell() +{ + return consumer_->has_cell(); +} + +cell streaming_workbook_reader::read_cell() +{ + return consumer_->read_cell(); +} + +bool streaming_workbook_reader::has_worksheet() +{ + return !worksheet_queue_.empty(); +} + +void streaming_workbook_reader::begin_worksheet() +{ + const auto next_worksheet_rel = 
worksheet_queue_.back(); + const auto workbook_rel = workbook_->manifest() + .relationship(path("/"), relationship_type::office_document); + const auto worksheet_rel = workbook_->manifest() + .relationship(workbook_rel.target().path(), next_worksheet_rel); + + auto rel_chain = std::vector{ workbook_rel, worksheet_rel }; + + const auto &manifest = consumer_->target_.manifest(); + const auto part_path = manifest.canonicalize(rel_chain); + auto part_stream_buffer = consumer_->archive_->open(part_path); + part_stream_buffer_.swap(part_stream_buffer); + part_stream_.reset(new std::istream(part_stream_buffer_.get())); + parser_.reset(new xml::parser(*part_stream_, part_path.string())); + consumer_->parser_ = parser_.get(); + + consumer_->read_worksheet_begin(next_worksheet_rel); +} + +worksheet streaming_workbook_reader::end_worksheet() +{ + auto next_worksheet_rel = worksheet_queue_.back(); + worksheet_queue_.pop_back(); + return consumer_->read_worksheet_end(next_worksheet_rel); +} + +void streaming_workbook_reader::open(const std::vector &data) +{ + stream_buffer_.reset(new detail::vector_istreambuf(data)); + stream_.reset(new std::istream(stream_buffer_.get())); + open(*stream_); +} + +void streaming_workbook_reader::open(const std::string &filename) +{ + stream_.reset(new std::ifstream()); + open_stream((std::ifstream &)stream_, filename); + open(*stream_); +} + +#ifdef _MSC_VER +void streaming_workbook_reader::open(const std::wstring &filename) +{ + stream_.reset(new std::ifstream()); + open_stream((std::ifstream &)*stream_, filename); + open(*stream_); +} +#endif + +void streaming_workbook_reader::open(const xlnt::path &filename) +{ + stream_.reset(new std::ifstream()); + open_stream((std::ifstream &)*stream_, filename.string()); + open(*stream_); +} + +void streaming_workbook_reader::open(std::istream &stream) +{ + workbook_.reset(new workbook()); + consumer_.reset(new detail::xlsx_consumer(*workbook_)); + consumer_->open(stream); + + const auto workbook_rel = 
workbook_->manifest() + .relationship(path("/"), relationship_type::office_document); + const auto workbook_path = workbook_rel.target().path(); + + for (auto worksheet_rel : workbook_->manifest() + .relationships(workbook_path, relationship_type::worksheet)) + { + worksheet_queue_.push_back(worksheet_rel.id()); + } +} + +} // namespace xlnt diff --git a/source/workbook/workbook.cpp b/source/workbook/workbook.cpp index 60c22538..9dcbc06c 100644 --- a/source/workbook/workbook.cpp +++ b/source/workbook/workbook.cpp @@ -1508,14 +1508,14 @@ void workbook::garbage_collect_formulae() void workbook::update_sheet_properties() { - if (has_extended_property(extended_property::titles_of_parts)) + if (has_extended_property(xlnt::extended_property::titles_of_parts)) { - extended_property(extended_property::titles_of_parts, sheet_titles()); + extended_property(xlnt::extended_property::titles_of_parts, sheet_titles()); } - if (has_extended_property(extended_property::heading_pairs)) + if (has_extended_property(xlnt::extended_property::heading_pairs)) { - extended_property(extended_property::heading_pairs, + extended_property(xlnt::extended_property::heading_pairs, std::vector{variant("Worksheets"), variant(static_cast(sheet_count()))}); } } diff --git a/tests/cell/cell_test_suite.hpp b/tests/cell/cell_test_suite.hpp index 7b79d1fe..e7611ab5 100644 --- a/tests/cell/cell_test_suite.hpp +++ b/tests/cell/cell_test_suite.hpp @@ -619,9 +619,9 @@ private: xlnt_assert_equals(cell.value(), 3.141592); auto cell2 = ws.cell("A2"); - cell2.value(std::string(100'000, 'a')); + cell2.value(std::string(100000, 'a')); cell.value(cell2); - xlnt_assert_equals(cell.value(), std::string(32'767, 'a')); + xlnt_assert_equals(cell.value(), std::string(32767, 'a')); } void test_reference() diff --git a/tests/workbook/serialization_test_suite.hpp b/tests/workbook/serialization_test_suite.hpp index aa2d6def..f1907a72 100644 --- a/tests/workbook/serialization_test_suite.hpp +++ 
b/tests/workbook/serialization_test_suite.hpp @@ -31,6 +31,8 @@ #include #include #include +#include +#include #include class serialization_test_suite : public test_suite @@ -56,6 +58,8 @@ public: register_test(test_read_custom_properties); register_test(test_round_trip_rw); register_test(test_round_trip_rw_encrypted); + register_test(test_streaming_read); + //register_test(test_streaming_write); } bool workbook_matches_file(xlnt::workbook &wb, const xlnt::path &file) @@ -461,4 +465,42 @@ public: xlnt_assert(round_trip_matches_rw(path, password)); } } + + void test_streaming_read() + { + const auto path = path_helper::test_file("4_every_style.xlsx"); + xlnt::streaming_workbook_reader reader; + + reader.open(xlnt::path(path)); + + while (reader.has_worksheet()) + { + reader.begin_worksheet(); + + while (reader.has_cell()) + { + const auto cell = reader.read_cell(); + //std::cout << cell.reference().to_string() << std::endl; + } + + const auto ws = reader.end_worksheet(); + } + } + + void test_streaming_write() + { + const auto path = std::string("stream-out.xlsx"); + xlnt::streaming_workbook_writer writer; + + writer.open(path); + + writer.add_sheet("stream"); + + auto b2 = writer.add_cell("B2"); + b2.value("B2!"); + + auto c3 = writer.add_cell("C3"); + b2.value("should not change"); + c3.value("C3!"); + } }; diff --git a/third-party/utfcpp/utf8.h b/third-party/utfcpp/utf8.h new file mode 100644 index 00000000..82b13f59 --- /dev/null +++ b/third-party/utfcpp/utf8.h @@ -0,0 +1,34 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the 
Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include "utf8/checked.h" +#include "utf8/unchecked.h" + +#endif // header guard diff --git a/third-party/utfcpp/utf8/checked.h b/third-party/utfcpp/utf8/checked.h new file mode 100644 index 00000000..2aef5838 --- /dev/null +++ b/third-party/utfcpp/utf8/checked.h @@ -0,0 +1,327 @@ +// Copyright 2006-2016 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless 
such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include "core.h" +#include + +namespace utf8 +{ + // Base for the exceptions that may be thrown from the library + class exception : public ::std::exception { + }; + + // Exceptions that may be thrown from the library functions. + class invalid_code_point : public exception { + uint32_t cp; + public: + invalid_code_point(uint32_t codepoint) : cp(codepoint) {} + virtual const char* what() const throw() { return "Invalid code point"; } + uint32_t code_point() const {return cp;} + }; + + class invalid_utf8 : public exception { + uint8_t u8; + public: + invalid_utf8 (uint8_t u) : u8(u) {} + virtual const char* what() const throw() { return "Invalid UTF-8"; } + uint8_t utf8_octet() const {return u8;} + }; + + class invalid_utf16 : public exception { + uint16_t u16; + public: + invalid_utf16 (uint16_t u) : u16(u) {} + virtual const char* what() const throw() { return "Invalid UTF-16"; } + uint16_t utf16_word() const {return u16;} + }; + + class not_enough_room : public exception { + public: + virtual const char* what() const throw() { return "Not enough space"; } + }; + + /// The library API - functions intended to be called by the users + + template + octet_iterator append(uint32_t cp, octet_iterator result) + { + if 
(!utf8::internal::is_code_point_valid(cp)) + throw invalid_code_point(cp); + + if (cp < 0x80) // one octet + *(result++) = static_cast(cp); + else if (cp < 0x800) { // two octets + *(result++) = static_cast((cp >> 6) | 0xc0); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else if (cp < 0x10000) { // three octets + *(result++) = static_cast((cp >> 12) | 0xe0); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else { // four octets + *(result++) = static_cast((cp >> 18) | 0xf0); + *(result++) = static_cast(((cp >> 12) & 0x3f) | 0x80); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + return result; + } + + template + output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) + { + while (start != end) { + octet_iterator sequence_start = start; + internal::utf_error err_code = utf8::internal::validate_next(start, end); + switch (err_code) { + case internal::UTF8_OK : + for (octet_iterator it = sequence_start; it != start; ++it) + *out++ = *it; + break; + case internal::NOT_ENOUGH_ROOM: + throw not_enough_room(); + case internal::INVALID_LEAD: + out = utf8::append (replacement, out); + ++start; + break; + case internal::INCOMPLETE_SEQUENCE: + case internal::OVERLONG_SEQUENCE: + case internal::INVALID_CODE_POINT: + out = utf8::append (replacement, out); + ++start; + // just one replacement mark for the sequence + while (start != end && utf8::internal::is_trail(*start)) + ++start; + break; + } + } + return out; + } + + template + inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) + { + static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); + return utf8::replace_invalid(start, end, out, replacement_marker); + } + + template + uint32_t next(octet_iterator& it, octet_iterator end) + { + uint32_t cp = 0; + 
internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); + switch (err_code) { + case internal::UTF8_OK : + break; + case internal::NOT_ENOUGH_ROOM : + throw not_enough_room(); + case internal::INVALID_LEAD : + case internal::INCOMPLETE_SEQUENCE : + case internal::OVERLONG_SEQUENCE : + throw invalid_utf8(*it); + case internal::INVALID_CODE_POINT : + throw invalid_code_point(cp); + } + return cp; + } + + template + uint32_t peek_next(octet_iterator it, octet_iterator end) + { + return utf8::next(it, end); + } + + template + uint32_t prior(octet_iterator& it, octet_iterator start) + { + // can't do much if it == start + if (it == start) + throw not_enough_room(); + + octet_iterator end = it; + // Go back until we hit either a lead octet or start + while (utf8::internal::is_trail(*(--it))) + if (it == start) + throw invalid_utf8(*it); // error - no lead byte in the sequence + return utf8::peek_next(it, end); + } + + /// Deprecated in versions that include "prior" + template + uint32_t previous(octet_iterator& it, octet_iterator pass_start) + { + octet_iterator end = it; + while (utf8::internal::is_trail(*(--it))) + if (it == pass_start) + throw invalid_utf8(*it); // error - no lead byte in the sequence + octet_iterator temp = it; + return utf8::next(temp, end); + } + + template + void advance (octet_iterator& it, distance_type n, octet_iterator end) + { + for (distance_type i = 0; i < n; ++i) + utf8::next(it, end); + } + + template + typename std::iterator_traits::difference_type + distance (octet_iterator first, octet_iterator last) + { + typename std::iterator_traits::difference_type dist; + for (dist = 0; first < last; ++dist) + utf8::next(first, last); + return dist; + } + + template + octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) + { + while (start != end) { + uint32_t cp = utf8::internal::mask16(*start++); + // Take care of surrogate pairs first + if (utf8::internal::is_lead_surrogate(cp)) { + if 
(start != end) { + uint32_t trail_surrogate = utf8::internal::mask16(*start++); + if (utf8::internal::is_trail_surrogate(trail_surrogate)) + cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; + else + throw invalid_utf16(static_cast(trail_surrogate)); + } + else + throw invalid_utf16(static_cast(cp)); + + } + // Lone trail surrogate + else if (utf8::internal::is_trail_surrogate(cp)) + throw invalid_utf16(static_cast(cp)); + + result = utf8::append(cp, result); + } + return result; + } + + template + u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) + { + while (start < end) { + uint32_t cp = utf8::next(start, end); + if (cp > 0xffff) { //make a surrogate pair + *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); + *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); + } + else + *result++ = static_cast(cp); + } + return result; + } + + template + octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) + { + while (start != end) + result = utf8::append(*(start++), result); + + return result; + } + + template + u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) + { + while (start < end) + (*result++) = utf8::next(start, end); + + return result; + } + + // The iterator class + template + class iterator : public std::iterator { + octet_iterator it; + octet_iterator range_start; + octet_iterator range_end; + public: + iterator () {} + explicit iterator (const octet_iterator& octet_it, + const octet_iterator& rangestart, + const octet_iterator& rangeend) : + it(octet_it), range_start(rangestart), range_end(rangeend) + { + if (it < range_start || it > range_end) + throw std::out_of_range("Invalid utf-8 iterator position"); + } + // the default "big three" are OK + octet_iterator base () const { return it; } + uint32_t operator * () const + { + octet_iterator temp = it; + return utf8::next(temp, range_end); + } + 
bool operator == (const iterator& rhs) const + { + if (range_start != rhs.range_start || range_end != rhs.range_end) + throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); + return (it == rhs.it); + } + bool operator != (const iterator& rhs) const + { + return !(operator == (rhs)); + } + iterator& operator ++ () + { + utf8::next(it, range_end); + return *this; + } + iterator operator ++ (int) + { + iterator temp = *this; + utf8::next(it, range_end); + return temp; + } + iterator& operator -- () + { + utf8::prior(it, range_start); + return *this; + } + iterator operator -- (int) + { + iterator temp = *this; + utf8::prior(it, range_start); + return temp; + } + }; // class iterator + +} // namespace utf8 + +#endif //header guard + + diff --git a/third-party/utfcpp/utf8/core.h b/third-party/utfcpp/utf8/core.h new file mode 100644 index 00000000..ae0f367d --- /dev/null +++ b/third-party/utfcpp/utf8/core.h @@ -0,0 +1,332 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include + +namespace utf8 +{ + // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers + // You may need to change them to match your system. + // These typedefs have the same names as ones from cstdint, or boost/cstdint + typedef unsigned char uint8_t; + typedef unsigned short uint16_t; + typedef unsigned int uint32_t; + +// Helper code - not intended to be directly called by the library users. 
May be changed at any time +namespace internal /* Surrogate-range constants, octet and code-unit classifiers, and the UTF-8 sequence decoders used by validate_next. */ +{ + // Unicode constants + // Leading (high) surrogates: 0xd800 - 0xdbff + // Trailing (low) surrogates: 0xdc00 - 0xdfff + const uint16_t LEAD_SURROGATE_MIN = 0xd800u; + const uint16_t LEAD_SURROGATE_MAX = 0xdbffu; + const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u; + const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu; + const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10); + const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN; + + // Maximum valid value for a Unicode code point + const uint32_t CODE_POINT_MAX = 0x0010ffffu; + /* Returns oc masked with 0xff. */ + template + inline uint8_t mask8(octet_type oc) + { + return static_cast(0xff & oc); + } + /* Returns oc masked with 0xffff. */ template + inline uint16_t mask16(u16_type oc) + { + return static_cast(0xffff & oc); + } + /* True when the two highest bits of the masked octet are 10, i.e. oc is a trail octet. */ template + inline bool is_trail(octet_type oc) + { + return ((utf8::internal::mask8(oc) >> 6) == 0x2); + } + /* True when cp lies in [LEAD_SURROGATE_MIN, LEAD_SURROGATE_MAX]. */ + template + inline bool is_lead_surrogate(u16 cp) + { + return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX); + } + /* True when cp lies in [TRAIL_SURROGATE_MIN, TRAIL_SURROGATE_MAX]. */ + template + inline bool is_trail_surrogate(u16 cp) + { + return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); + } + /* True when cp lies anywhere in [LEAD_SURROGATE_MIN, TRAIL_SURROGATE_MAX]. */ + template + inline bool is_surrogate(u16 cp) + { + return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); + } + /* True when cp does not exceed CODE_POINT_MAX and is not a surrogate. */ + template + inline bool is_code_point_valid(u32 cp) + { + return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); + } + /* Sequence length announced by the lead octet: 0xxxxxxx -> 1, 110xxxxx -> 2, 1110xxxx -> 3, 11110xxx -> 4; any other octet yields 0. */ + template + inline typename std::iterator_traits::difference_type + sequence_length(octet_iterator lead_it) + { + uint8_t lead = utf8::internal::mask8(*lead_it); + if (lead < 0x80) + return 1; + else if ((lead >> 5) == 0x6) + return 2; + else if ((lead >> 4) == 0xe) + return 3; + else if ((lead >> 3) == 0x1e) + return 4; + else + return 0; + } + /* True when length differs from the octet count expected for cp: 1 below 0x80, 2 below 0x800, 3 below 0x10000. Values of cp from 0x10000 up are never reported as overlong. */ + template + inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length) + { + if (cp < 0x80) { + if (length != 1) + return true; + } + else if (cp < 0x800) { + if (length != 2) + return true; + } + else 
if (cp < 0x10000) { + if (length != 3) + return true; + } + + return false; + } + /* Status codes returned by the decoding helpers below and by validate_next. */ + enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT}; + + /// Helper for get_sequence_x + /* Advances it by one octet; returns NOT_ENOUGH_ROOM if that reaches end, INCOMPLETE_SEQUENCE if the new octet is not a trail octet, otherwise UTF8_OK. */ template + utf_error increase_safely(octet_iterator& it, octet_iterator end) + { + if (++it == end) + return NOT_ENOUGH_ROOM; + + if (!utf8::internal::is_trail(*it)) + return INCOMPLETE_SEQUENCE; + + return UTF8_OK; + } + /* Runs increase_safely(IT, END) and returns its status from the enclosing function when it is not UTF8_OK. */ + #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;} + + /// get_sequence_x functions decode utf-8 sequences of the length x + /* Each get_sequence_x returns NOT_ENOUGH_ROOM when it == end, and on success leaves it on the LAST octet of the sequence (not past it). get_sequence_1 does not move it at all. */ template + utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = utf8::internal::mask8(*it); + + return UTF8_OK; + } + /* Two octets: low bits of the lead octet shifted by 6, plus 6 bits from the trail octet. */ + template + utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = utf8::internal::mask8(*it); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); + + return UTF8_OK; + } + /* Three octets: lead bits shifted by 12, then two trail octets contributing 6 bits each. */ + template + utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = utf8::internal::mask8(*it); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point += (*it) & 0x3f; + + return UTF8_OK; + } + /* Four octets: lead bits shifted by 18, then three trail octets contributing 6 bits each. */ + template + utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = utf8::internal::mask8(*it); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 
0x3ffff); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point += (utf8::internal::mask8(*it) << 6) & 0xfff; + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point += (*it) & 0x3f; + + return UTF8_OK; + } + + #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR + /* Decodes and checks one sequence starting at it. On success: writes the value to code_point, moves it past the sequence, returns UTF8_OK. On any failure: code_point is not written and it ends up at its starting position. */ + template + utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + // Save the original value of it so we can go back in case of failure + // Of course, it does not make much sense with i.e. stream iterators + octet_iterator original_it = it; + + uint32_t cp = 0; + // Determine the sequence length based on the lead octet + typedef typename std::iterator_traits::difference_type octet_difference_type; + const octet_difference_type length = utf8::internal::sequence_length(it); + + // Get trail octets and calculate the code point + utf_error err = UTF8_OK; + switch (length) { + case 0: + return INVALID_LEAD; + case 1: + err = utf8::internal::get_sequence_1(it, end, cp); + break; + case 2: + err = utf8::internal::get_sequence_2(it, end, cp); + break; + case 3: + err = utf8::internal::get_sequence_3(it, end, cp); + break; + case 4: + err = utf8::internal::get_sequence_4(it, end, cp); + break; + } + + if (err == UTF8_OK) { + // Decoding succeeded. Now, security checks... + if (utf8::internal::is_code_point_valid(cp)) { + if (!utf8::internal::is_overlong_sequence(cp, length)){ + // Passed! Return here. 
+ code_point = cp; + ++it; /* get_sequence_x left it on the last octet of the sequence; step past it */ + return UTF8_OK; + } + else + err = OVERLONG_SEQUENCE; + } + else + err = INVALID_CODE_POINT; + } + + // Failure branch - restore the original value of the iterator + it = original_it; + return err; + } + /* Overload that validates the next sequence and discards the decoded code point. */ + template + inline utf_error validate_next(octet_iterator& it, octet_iterator end) { + uint32_t ignored; + return utf8::internal::validate_next(it, end, ignored); + } + +} // namespace internal + + /// The library API - functions intended to be called by the users + + // Byte order mark + const uint8_t bom[] = {0xef, 0xbb, 0xbf}; + /* Returns an iterator to the start of the first sequence in [start, end) that validate_next rejects, or end when none is rejected. */ + template + octet_iterator find_invalid(octet_iterator start, octet_iterator end) + { + octet_iterator result = start; + while (result != end) { + utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end); + if (err_code != internal::UTF8_OK) + return result; + } + return result; + } + /* True when find_invalid reaches end, i.e. no sequence in [start, end) was rejected. */ + template + inline bool is_valid(octet_iterator start, octet_iterator end) + { + return (utf8::find_invalid(start, end) == end); + } + /* True when the first three octets of [it, end) equal bom; it is compared with end before every read. */ + template + inline bool starts_with_bom (octet_iterator it, octet_iterator end) + { + return ( + ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) && + ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && + ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) + ); + } + + //Deprecated in release 2.3 + /* Same comparison as starts_with_bom but with no end iterator: reads up to three octets starting at it without any range check. */ template + inline bool is_bom (octet_iterator it) + { + return ( + (utf8::internal::mask8(*it++)) == bom[0] && + (utf8::internal::mask8(*it++)) == bom[1] && + (utf8::internal::mask8(*it)) == bom[2] + ); + } +} // namespace utf8 + +#endif // header guard + + diff --git a/third-party/utfcpp/utf8/unchecked.h b/third-party/utfcpp/utf8/unchecked.h new file mode 100644 index 00000000..cb242716 --- /dev/null +++ b/third-party/utfcpp/utf8/unchecked.h @@ -0,0 +1,228 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by 
+this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
+*/ + + +#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include "core.h" + +namespace utf8 +{ + namespace unchecked /* Encoding, decoding and iteration helpers that call no validation routine: octets are decoded as found and no utf_error is ever reported. */ + { + /* Writes cp to result as 1 to 4 octets chosen by magnitude (below 0x80, below 0x800, below 0x10000, otherwise four) and returns the advanced output iterator. cp itself is not checked. */ template + octet_iterator append(uint32_t cp, octet_iterator result) + { + if (cp < 0x80) // one octet + *(result++) = static_cast(cp); + else if (cp < 0x800) { // two octets + *(result++) = static_cast((cp >> 6) | 0xc0); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else if (cp < 0x10000) { // three octets + *(result++) = static_cast((cp >> 12) | 0xe0); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else { // four octets + *(result++) = static_cast((cp >> 18) | 0xf0); + *(result++) = static_cast(((cp >> 12) & 0x3f)| 0x80); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + return result; + } + /* Decodes the sequence at it using the length announced by its lead octet and leaves it just past that sequence. There is no end iterator. A length of 0 matches no case, so the masked lead octet is returned and it advances by one. */ + template + uint32_t next(octet_iterator& it) + { + uint32_t cp = utf8::internal::mask8(*it); + typename std::iterator_traits::difference_type length = utf8::internal::sequence_length(it); + switch (length) { + case 1: + break; + case 2: + it++; + cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); + break; + case 3: + ++it; + cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); + ++it; + cp += (*it) & 0x3f; + break; + case 4: + ++it; + cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); + ++it; + cp += (utf8::internal::mask8(*it) << 6) & 0xfff; + ++it; + cp += (*it) & 0x3f; + break; + } + ++it; + return cp; + } + /* Returns the code point at it; it is taken by value, so the caller's iterator does not move. */ + template + uint32_t peek_next(octet_iterator it) + { + return utf8::unchecked::next(it); + } + /* Steps it backwards until it rests on an octet that is not a trail octet, then returns the code point decoded from there; it stays on that octet. */ + template + uint32_t prior(octet_iterator& it) + { + while (utf8::internal::is_trail(*(--it))) ; + octet_iterator temp = it; + return utf8::unchecked::next(temp); + } + + // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous) + 
template + inline uint32_t previous(octet_iterator& it) + { /* Forwards to prior. */ + return utf8::unchecked::prior(it); + } + /* Calls next on it n times. */ + template + void advance (octet_iterator& it, distance_type n) + { + for (distance_type i = 0; i < n; ++i) + utf8::unchecked::next(it); + } + /* Counts how many next() calls it takes until first is no longer less than last; the iterators are compared with operator<. */ + template + typename std::iterator_traits::difference_type + distance (octet_iterator first, octet_iterator last) + { + typename std::iterator_traits::difference_type dist; + for (dist = 0; first < last; ++dist) + utf8::unchecked::next(first); + return dist; + } + /* Converts the UTF-16 code units in [start, end) to UTF-8 written through result. After a lead surrogate the following unit is consumed as its trail surrogate without testing it or comparing start with end. */ + template + octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) + { + while (start != end) { + uint32_t cp = utf8::internal::mask16(*start++); + // Take care of surrogate pairs first + if (utf8::internal::is_lead_surrogate(cp)) { + uint32_t trail_surrogate = utf8::internal::mask16(*start++); + cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; + } + result = utf8::unchecked::append(cp, result); + } + return result; + } + /* Decodes code points while start is less than end and writes them as UTF-16; values above 0xffff are written as two units built from LEAD_OFFSET and TRAIL_SURROGATE_MIN. */ + template + u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) + { + while (start < end) { + uint32_t cp = utf8::unchecked::next(start); + if (cp > 0xffff) { //make a surrogate pair + *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); + *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); + } + else + *result++ = static_cast(cp); + } + return result; + } + /* Appends every element of [start, end) to result as UTF-8 via append. */ + template + octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) + { + while (start != end) + result = utf8::unchecked::append(*(start++), result); + + return result; + } + /* Writes each code point decoded by next to result while start is less than end. */ + template + u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) + { + while (start < end) + (*result++) = utf8::unchecked::next(start); + + return result; + } + + // The iterator class + /* Wraps an octet iterator: operator* decodes the code point at the current position, ++ moves forward by sequence_length, -- moves back with prior. */ template + class iterator : public std::iterator { + octet_iterator it; + public: + iterator () {} + explicit iterator (const 
octet_iterator& octet_it): it(octet_it) {} + // the default "big three" are OK + octet_iterator base () const { return it; } + uint32_t operator * () const + { + octet_iterator temp = it; /* decode from a copy so the stored position does not move */ + return utf8::unchecked::next(temp); + } + bool operator == (const iterator& rhs) const + { + return (it == rhs.it); + } + bool operator != (const iterator& rhs) const + { + return !(operator == (rhs)); + } + iterator& operator ++ () + { + ::std::advance(it, utf8::internal::sequence_length(it)); /* skip the octet count announced by the current lead octet */ + return *this; + } + iterator operator ++ (int) + { + iterator temp = *this; + ::std::advance(it, utf8::internal::sequence_length(it)); + return temp; + } + iterator& operator -- () + { + utf8::unchecked::prior(it); /* prior leaves it on the first octet of the preceding sequence */ + return *this; + } + iterator operator -- (int) + { + iterator temp = *this; + utf8::unchecked::prior(it); + return temp; + } + }; // class iterator + + } // namespace utf8::unchecked +} // namespace utf8 + + +#endif // header guard +