start porting benchmarks

2024-03-22 13:11:17 +08:00 · 2016-02-06 10:04:41 -05:00 · 2016-02-06 10:04:41 -05:00 · a8be9fff32
commit a8be9fff32
parent 74bfdb6f7d
9 changed files with 496 additions and 0 deletions
--- a/benchmarks/bufzip.cpp
+++ b/benchmarks/bufzip.cpp
@ -0,0 +1,23 @@
+#include <xlnt/xlnt.hpp>
+#include <xlnt/serialization/xml_document.hpp>
+#include <xlnt/serialization/xml_node.hpp>
+#include <xlnt/serialization/xml_serializer.hpp>
+
+// Adds one million "test" children to an xml_document, then writes the
+// serialized document into a default-constructed zip_file under the entry
+// name "sheet.xml".
+void standard()
+{
+    const int node_count = 1000000;
+    xlnt::xml_document document;
+
+    int added = 0;
+
+    while (added < node_count)
+    {
+        document.add_child("test");
+        ++added;
+    }
+
+    xlnt::zip_file zip;
+    zip.writestr("sheet.xml", document.to_string());
+}
+
+// Entry point: runs the single benchmark; falling off the end of main
+// reports success (exit status 0).
+int main()
+{
+    standard();
+}
--- a/benchmarks/files/large.xlsx
+++ b/benchmarks/files/large.xlsx
--- a/benchmarks/files/very_large.xlsx
+++ b/benchmarks/files/very_large.xlsx
--- a/benchmarks/memory.cpp
+++ b/benchmarks/memory.cpp
@ -0,0 +1,65 @@
+#include <cassert>
+
+#ifdef __APPLE__
+#include<mach/mach.h>
+#endif
+
+#include <xlnt/xlnt.hpp>
+
+#include "../tests/helpers/path_helper.hpp"
+
+// Returns the virtual_size field reported by task_info() for the current
+// task when built for Apple platforms. Returns 0 if the query fails, and
+// always returns 0 on every other platform.
+int calc_memory_usage()
+{
+#ifdef __APPLE__
+    struct task_basic_info info;
+    mach_msg_type_number_t info_count = TASK_BASIC_INFO_COUNT;
+
+    const auto status = task_info(mach_task_self(), TASK_BASIC_INFO,
+                                  (task_info_t)&info, &info_count);
+
+    if (status == KERN_SUCCESS)
+    {
+        return info.virtual_size;
+    }
+
+    return 0;
+#else
+    return 0;
+#endif
+}
+
+// Loads very_large.xlsx, walks every row of the active sheet and samples
+// calc_memory_usage() on every 50th row. The first non-zero sample becomes
+// the baseline; every sample must stay below 120 % of that baseline.
+// Each sample is printed as "<row index> <usage>".
+void test_memory_use()
+{
+    // Naive test that assumes memory use will never be more than 120 % of
+    // the first sample taken.
+    auto current_folder = PathHelper::GetExecutableDirectory();
+    auto src = current_folder + "rks/files/very_large.xlsx";
+
+    xlnt::workbook wb;
+    wb.load(src);
+    auto ws = wb.get_active_sheet();
+
+    int initial_use = 0;
+    int n = 0;
+
+    for (auto line : ws.rows())
+    {
+        if (n % 50 == 0)
+        {
+            const auto use = calc_memory_usage();
+
+            if (initial_use == 0)
+            {
+                initial_use = use;
+            }
+
+            // calc_memory_usage() returns 0 when the query fails or the
+            // platform is unsupported; without a baseline there is nothing
+            // to compare against, and dividing by it would be undefined.
+            if (initial_use != 0)
+            {
+                // Divide in floating point: integer division truncates, so
+                // any growth below 200 % would wrongly satisfy "< 1.2".
+                assert(static_cast<double>(use) / initial_use < 1.2);
+            }
+
+            std::cout << n << " " << use << std::endl;
+        }
+
+        n++;
+    }
+}
+
+// Entry point: runs the memory-growth check and reports success.
+int main()
+{
+    test_memory_use();
+    return 0;
+}
--- a/benchmarks/profiling.cpp
+++ b/benchmarks/profiling.cpp
@ -0,0 +1,131 @@
+from io import BytesIO
+from lxml.etree import xmlfile
+import os
+from random import randint
+
+from openpyxl import Workbook
+from openpyxl.xml.functions import XMLGenerator
+
+def make_worksheet():
+    wb = Workbook()
+    ws = wb.active
+    for i in range(1000):
+        ws.append(list(range(100)))
+    return ws
+
+
+def lxml_writer(ws=None):
+    from openpyxl.writer.lxml_worksheet import write_rows
+    if ws is None:
+        ws = make_worksheet()
+
+    out = BytesIO()
+    with xmlfile(out) as xf:
+        write_rows(xf, ws)
+    #with open("lxml_writer.xml", "wb") as dump:
+        #dump.write(out.getvalue())
+    #ws.parent.save("lxml_writer.xlsx")
+
+
+def make_dump_worksheet():
+    wb = Workbook(write_only=True)
+    ws = wb.create_sheet()
+    return ws
+
+def dump_writer(ws=None):
+    if ws is None:
+        ws = make_dump_worksheet()
+    for i in range(1000):
+        ws.append(list(range(100)))
+
+
+COLUMNS = 100
+ROWS = 1000
+BOLD = 1
+ITALIC = 2
+UNDERLINE = 4
+RED_BG = 8
+formatData = [[None] * COLUMNS for _ in range(ROWS)]
+
+def generate_format_data():
+    for row in range(ROWS):
+        for col in range(COLUMNS):
+            formatData[row][col] = randint(1, 15)
+
+
+def styled_sheet():
+    from openpyxl import Workbook
+    from openpyxl.styles import Font, Style, PatternFill, Color, colors
+
+    wb = Workbook()
+    ws = wb.active
+    ws.title = 'Test 1'
+
+    red_fill = PatternFill(fill_type='solid', fgColor=Color(colors.RED), bgColor=Color(colors.RED))
+    empty_fill = PatternFill()
+    styles = []
+    # pregenerate relevant styles
+    for row in range(ROWS):
+        _row = []
+        for col in range(COLUMNS):
+            cell = ws.cell(row=row+1, column=col+1)
+            cell.value = 1
+            font = {}
+            fill = PatternFill()
+            if formatData[row][col] & BOLD:
+                font['bold'] = True
+            if formatData[row][col] & ITALIC:
+                font['italic'] = True
+            if formatData[row][col] & UNDERLINE:
+                font['underline'] = 'single'
+            if formatData[row][col] & RED_BG:
+                fill = red_fill
+            cell.style = Style(font=Font(**font), fill=fill)
+
+    #wb.save(get_output_path('test_openpyxl_style_std_pregen.xlsx'))
+
+
+def read_workbook():
+    from openpyxl import load_workbook
+    folder = os.path.split(__file__)[0]
+    src = os.path.join(folder, "files", "very_large.xlsx")
+    wb = load_workbook(src)
+    return wb
+
+
+def rows(wb):
+    ws = wb.active
+    rows = ws.iter_rows()
+    for r, row in enumerate(rows):
+        for c, col in enumerate(row):
+            pass
+    print((r+1)* (c+1), "cells")
+
+
+def col_index1():
+    from openpyxl.cell import get_column_letter
+    for i in range(1, 18279):
+        c = get_column_letter(i)
+
+
+
+"""
+Sample use (from an interactive session started next to this file):
+import cProfile
+import profiling
+ws = profiling.make_worksheet()
+cProfile.run("profiling.lxml_writer(ws)", sort="tottime")
+"""
+
+
+if __name__ == '__main__':
+    import cProfile
+    ws = make_worksheet()
+    #wb = read_workbook()
+    #cProfile.run("rows(wb)", sort="tottime")
+    #cProfile.run("make_worksheet()", sort="tottime")
+    #cProfile.run("lxml_writer(ws)", sort="tottime")
+    #generate_format_data()
+    #cProfile.run("styled_sheet()", sort="tottime")
+    #ws = make_dump_worksheet()
+    #cProfile.run("dump_writer(ws)", sort="tottime")
+    cProfile.run("col_index1()", sort="tottime")
--- a/benchmarks/reader.cpp
+++ b/benchmarks/reader.cpp
@ -0,0 +1,45 @@
+import os
+import sys
+import timeit
+
+import openpyxl
+
+
+def reader(optimised):
+    """
+    Loop through all cells of a workbook
+    """
+    folder = os.path.split(__file__)[0]
+    src = os.path.join(folder, "files", "very_large.xlsx")
+    wb = openpyxl.load_workbook(src, use_iterators=optimised)
+    ws = wb.active
+    rows = ws.iter_rows()
+    for r, row in enumerate(rows):
+        for c, col in enumerate(row):
+            pass
+    print((r+1)* (c+1), "cells")
+
+def timer(fn):
+    """
+    Create a timeit call to a function and pass in keyword arguments.
+    The function is called twice, once using the standard workbook, then with the optimised one.
+    Time from the best of three is taken.
+    """
+    print("lxml", openpyxl.LXML)
+    result = []
+    for opt in (False, True,):
+        print("Workbook is {0}".format(opt and "optimised" or "not optimised"))
+        times = timeit.repeat("{0}({1})".format(fn.__name__, opt),
+                              setup="from __main__ import {0}".format(fn.__name__),
+                              number = 1,
+                              repeat = 3
+        )
+        print("{0:.2f}s".format(min(times)))
+        result.append(min(times))
+    std, opt = result
+    print("Optimised takes {0:.2%} time\n".format(opt/std))
+    return std, opt
+
+
+if __name__ == "__main__":
+    timer(reader)
--- a/benchmarks/speed.cpp
+++ b/benchmarks/speed.cpp
@ -0,0 +1,31 @@
+"Benchmark some different implementations for cells"
+
+from openpyxl.compat import range
+
+from openpyxl.cell import Cell
+from openpyxl.cell.read_only import ReadOnlyCell
+from memory_profiler import memory_usage
+import time
+
+
+def standard():
+    c = Cell(None, "A", "0", None)
+
+def iterative():
+    c = ReadOnlyCell(None, None, None, 'n')
+
+def dictionary():
+    c = {'ws':'None', 'col':'A', 'row':0, 'value':1}
+
+
+if __name__ == '__main__':
+    initial_use = memory_usage(proc=-1, interval=1)[0]
+    for fn in (standard, iterative, dictionary):
+        t = time.time()
+        container = []
+        for i in range(1000000):
+            container.append(fn())
+        print("{0} {1} MB, {2:.2f}s".format(
+            fn.func_name,
+            memory_usage(proc=-1, interval=1)[0] - initial_use,
+            time.time() - t))
--- a/benchmarks/styles.cpp
+++ b/benchmarks/styles.cpp
@ -0,0 +1,118 @@
+#include <chrono>
+#include <iostream>
+#include <iterator>
+#include <random>
+#include <string>
+#include <vector>
+
+#include <xlnt/xlnt.hpp>
+
+// Returns an iterator to a uniformly chosen element of [start, end).
+// When the range is empty, end is returned and nothing is drawn; the caller
+// must not dereference it in that case.
+// The random engine is a function-local static seeded once from
+// std::random_device.
+template<typename Iter>
+Iter random_choice(Iter start, Iter end) {
+    using difference_type = typename std::iterator_traits<Iter>::difference_type;
+
+    static std::random_device rd;
+    static std::mt19937 gen(rd());
+
+    const difference_type count = std::distance(start, end);
+
+    // uniform_int_distribution requires a <= b; an empty range would ask
+    // for (0, -1), which is undefined behaviour.
+    if (count <= 0)
+    {
+        return end;
+    }
+
+    // Use the iterator's own difference type so large ranges are not
+    // narrowed to int.
+    std::uniform_int_distribution<difference_type> dis(0, count - 1);
+    std::advance(start, dis(gen));
+
+    return start;
+}
+
+// Builds one style for every combination of the vertical alignments,
+// horizontal alignments, font names, font sizes, bold flags, underline
+// styles and italic flags listed below (4 * 6 * 4 * 13 * 2 * 2 * 2 = 9984
+// styles). Styles are returned in nesting order, with the italic flag
+// varying fastest and the vertical alignment slowest.
+std::vector<xlnt::style> generate_all_styles()
+{
+    const std::vector<xlnt::vertical_alignment> verticals = {xlnt::vertical_alignment::center, xlnt::vertical_alignment::justify, xlnt::vertical_alignment::top, xlnt::vertical_alignment::bottom};
+    const std::vector<xlnt::horizontal_alignment> horizontals = {xlnt::horizontal_alignment::center, xlnt::horizontal_alignment::center_continuous, xlnt::horizontal_alignment::general, xlnt::horizontal_alignment::justify, xlnt::horizontal_alignment::left, xlnt::horizontal_alignment::right};
+    const std::vector<std::string> names = {"Calibri", "Tahoma", "Arial", "Times New Roman"};
+    const std::vector<int> sizes = {11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35};
+    const std::vector<bool> bold_flags = {true, false};
+    const std::vector<xlnt::font::underline_style> underlines = {xlnt::font::underline_style::single, xlnt::font::underline_style::none};
+    const std::vector<bool> italic_flags = {true, false};
+
+    std::vector<xlnt::style> result;
+
+    for(const auto vertical : verticals)
+    {
+        for(const auto horizontal : horizontals)
+        {
+            for(const auto &font_name : names)
+            {
+                for(const int font_size : sizes)
+                {
+                    for(const bool is_bold : bold_flags)
+                    {
+                        for(const auto underline : underlines)
+                        {
+                            for(const bool is_italic : italic_flags)
+                            {
+                                xlnt::font current_font;
+                                current_font.set_name(font_name);
+                                current_font.set_size(font_size);
+                                current_font.set_italic(is_italic);
+                                current_font.set_underline(underline);
+                                current_font.set_bold(is_bold);
+
+                                xlnt::alignment current_alignment;
+                                current_alignment.set_vertical(vertical);
+                                current_alignment.set_horizontal(horizontal);
+
+                                xlnt::style combined;
+                                combined.set_font(current_font);
+                                combined.set_alignment(current_alignment);
+
+                                result.push_back(combined);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    return result;
+}
+
+// Creates a workbook with optimized writing switched on, adds a sheet and
+// appends n - 1 rows to it. Each row is built from the value 0 paired with
+// a style picked at random from styles.
+xlnt::workbook optimized_workbook(const std::vector<xlnt::style> &styles, int n)
+{
+    xlnt::workbook wb;
+    wb.set_optimized_write(true);
+    auto sheet = wb.create_sheet();
+
+    // Counts down from n to 2, i.e. n - 1 iterations (none when n <= 1).
+    for(int remaining = n; remaining > 1; --remaining)
+    {
+        xlnt::style chosen = *random_choice(styles.begin(), styles.end());
+        sheet.append({{0, chosen}});
+    }
+
+    return wb;
+}
+
+// Creates a default workbook and, for idx = 1 .. n - 1, picks one of its
+// sheets at random and sets the cell in column 1, row idx + 1 to the value
+// 0 with a style picked at random from styles.
+xlnt::workbook non_optimized_workbook(const std::vector<xlnt::style> &styles, int n)
+{
+    xlnt::workbook wb;
+    int idx = 1;
+
+    while(idx < n)
+    {
+        auto sheet = *random_choice(wb.begin(), wb.end());
+        const auto row = static_cast<xlnt::row_t>(idx) + 1;
+        auto target = sheet.get_cell({1, row});
+        target.set_value(0);
+        target.set_style(*random_choice(styles.begin(), styles.end()));
+        ++idx;
+    }
+
+    return wb;
+}
+
+// Saves wb to the file f and prints how long the save took, in seconds,
+// together with the style count n supplied by the caller.
+void to_profile(xlnt::workbook &wb, const std::string &f, int n)
+{
+    // Time the save on a monotonic clock; the placeholder this replaces
+    // always reported 0 seconds.
+    const auto start = std::chrono::steady_clock::now();
+    wb.save(f);
+    const std::chrono::duration<double> elapsed = std::chrono::steady_clock::now() - start;
+
+    // End the line so consecutive reports do not run together.
+    std::cout << "took " << elapsed.count() << "s for " << n << " styles" << std::endl;
+}
+
+// Entry point: generates the style list once, then for each workbook
+// builder prints its name, builds a workbook with n = 10000 and hands it to
+// to_profile(), which saves it to /tmp/xlnt.xlsx.
+int main()
+{
+    using builder = xlnt::workbook (*)(const std::vector<xlnt::style> &, int);
+
+    struct scenario
+    {
+        const char *label;
+        builder build;
+    };
+
+    const scenario scenarios[] = {
+        {"optimized_workbook", &optimized_workbook},
+        {"non_optimized_workbook", &non_optimized_workbook},
+    };
+
+    const auto styles = generate_all_styles();
+    const int n = 10000;
+
+    for(const auto &current : scenarios)
+    {
+        std::cout << current.label << std::endl;
+        auto wb = current.build(styles, n);
+        std::string f = "/tmp/xlnt.xlsx";
+        to_profile(wb, f, n);
+    }
+}
--- a/benchmarks/writer.cpp
+++ b/benchmarks/writer.cpp
@ -0,0 +1,83 @@
+#include <chrono>
+#include <xlnt/xlnt.hpp>
+#include "path_helper.hpp"
+
+// Returns the number of whole milliseconds elapsed since the first call to
+// this function, measured on a monotonic clock. Only differences between
+// two return values are meaningful.
+//
+// Milliseconds since the Unix epoch do not fit in a 32-bit int, so the
+// value is measured from a process-local origin instead; it stays in range
+// for roughly 24 days of run time.
+int current_time()
+{
+    using clock = std::chrono::steady_clock;
+
+    // Captured once, on the first call.
+    static const clock::time_point origin = clock::now();
+
+    const auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(clock::now() - origin);
+    return static_cast<int>(elapsed.count());
+}
+
+// Create a worksheet with variable width rows. Because data must be
+// serialised row by row it is often the width of the rows which is most
+// important.
+//
+// Appends `rows` copies of the row 0 .. cols - 1 to a new sheet, printing a
+// line of progress dots as it goes, then saves the workbook to
+// "s/files/large.xlsx" appended to the executable directory.
+// `optimized` is currently unused: the call that would apply it is
+// commented out below.
+void writer(bool optimized, int cols, int rows)
+{
+    xlnt::workbook wb;
+//    wb.set_optimized_write(optimized);
+
+    auto ws = wb.create_sheet();
+
+    std::vector<int> row;
+
+    for(int i = 0; i < cols; i++)
+    {
+        row.push_back(i);
+    }
+
+    // Progress is refreshed every tenth of the rows. The step is at least 1
+    // so that fewer than 10 rows does not cause a division by zero.
+    const int step = rows >= 10 ? rows / 10 : 1;
+
+    for(int index = 0; index < rows; index++)
+    {
+        if ((index + 1) % step == 0)
+        {
+            std::string progress = std::string((index + 1) / (1 + rows / 10), '.');
+            std::cout << "\r" << progress;
+            std::cout.flush();
+        }
+
+        ws.append(row);
+    }
+
+    std::cout << std::endl;
+
+    auto filename = PathHelper::GetExecutableDirectory() + "s/files/large.xlsx";
+    wb.save(filename);
+}
+
+// Runs fn(opt, cols, rows) three times with opt = false and three times
+// with opt = true, timing each run with current_time(). The smallest
+// duration of each group is kept, the optimised/standard ratio is printed
+// as a percentage, and the pair {standard, optimised} is returned.
+std::pair<int, int> timer(std::function<void(bool, int, int)> fn, int cols, int rows)
+{
+    const int repeat = 3;
+
+    // best[0] holds the standard minimum, best[1] the optimised minimum.
+    int best[2] = {std::numeric_limits<int>::max(), std::numeric_limits<int>::max()};
+
+    for(int mode = 0; mode < 2; ++mode)
+    {
+        const bool opt = mode == 1;
+        std::cout << cols << " cols " << rows << " rows, Worksheet is " << (opt ? "optimised" : "not optimised") << std::endl;
+
+        int remaining = repeat;
+
+        while(remaining-- > 0)
+        {
+            const int start = current_time();
+            fn(opt, cols, rows);
+            const int elapsed = current_time() - start;
+
+            if (elapsed < best[mode])
+            {
+                best[mode] = elapsed;
+            }
+        }
+    }
+
+    const double ratio = best[1] / static_cast<double>(best[0]) * 100;
+    std::cout << "Optimised takes " << ratio << "% time" << std::endl;
+
+    return {best[0], best[1]};
+}
+
+// Entry point: times writer() for a fixed list of sheet shapes, in order.
+int main()
+{
+    // Each entry is {cols, rows}.
+    const int shapes[][2] = {
+        {100, 100},
+        {1000, 100},
+        {4000, 100},
+        {8192, 100},
+        {10, 10000},
+        {4000, 1000},
+    };
+
+    for(const auto &shape : shapes)
+    {
+        timer(&writer, shape[0], shape[1]);
+    }
+
+    return 0;
+}