Kouhei Sutou 2019-03-01 11:24:08 +0900 (Fri, 01 Mar 2019) Revision: 2de171d610017dfa074ae7ec77717e4d54e71304 https://github.com/ranguba/chupa-text/commit/2de171d610017dfa074ae7ec77717e4d54e71304 Message: office-open-xml-presentation,workbook: emit multiple data Modified files: lib/chupa-text/decomposers/office-open-xml-document.rb lib/chupa-text/decomposers/office-open-xml-presentation.rb lib/chupa-text/decomposers/office-open-xml-workbook.rb lib/chupa-text/decomposers/office-open-xml.rb test/decomposers/test-office-open-xml-presentation.rb test/decomposers/test-office-open-xml-workbook.rb Modified: lib/chupa-text/decomposers/office-open-xml-document.rb (+12 -0) =================================================================== --- lib/chupa-text/decomposers/office-open-xml-document.rb 2019-03-01 10:59:44 +0900 (48118e5) +++ lib/chupa-text/decomposers/office-open-xml-document.rb 2019-03-01 11:24:08 +0900 (d6cdaac) @@ -40,12 +40,24 @@ module ChupaText end private + def start_decompose(context) + context[:text] = "" + end + def process_entry(entry, context) case entry.zip_path when "word/document.xml" extract_text(entry, context[:text]) end end + + def finish_decompose(context, &block) + text_data = TextData.new(context[:text], source_data: context[:data]) + context[:attributes].each do |name, value| + text_data[name] = value + end + yield(text_data) + end end end end Modified: lib/chupa-text/decomposers/office-open-xml-presentation.rb (+17 -3) =================================================================== --- lib/chupa-text/decomposers/office-open-xml-presentation.rb 2019-03-01 10:59:44 +0900 (1ce5759) +++ lib/chupa-text/decomposers/office-open-xml-presentation.rb 2019-03-01 11:24:08 +0900 (7126eec) @@ -48,19 +48,33 @@ module ChupaText end private + def start_decompose(context) + context[:slides] = [] + end + def process_entry(entry, context) case entry.zip_path when /\Appt\/slides\/slide(\d+)\.xml/ nth_slide = Integer($1, 10) slide_text = "" extract_text(entry, slide_text) - context[:slides] ||= [] context[:slides] << [nth_slide, slide_text] end end - def accumulate_text(context) - context[:slides].sort_by(&:first).collect(&:last).join("\n") + def finish_decompose(context, &block) + metadata = TextData.new("", source_data: context[:data]) + context[:attributes].each do |name, value| + metadata[name] = value + end + yield(metadata) + + slide_texts = context[:slides].sort_by(&:first).collect(&:last) + slide_texts.each_with_index do |slide_text, i| + text_data = TextData.new(slide_text, source_data: context[:data]) + text_data["index"] = i + yield(text_data) + end end end end Modified: lib/chupa-text/decomposers/office-open-xml-workbook.rb (+39 -6) =================================================================== --- lib/chupa-text/decomposers/office-open-xml-workbook.rb 2019-03-01 10:59:44 +0900 (cdc09d1) +++ lib/chupa-text/decomposers/office-open-xml-workbook.rb 2019-03-01 11:24:08 +0900 (90bb5d3) @@ -40,25 +40,39 @@ module ChupaText end private + def start_decompose(context) + context[:shared_strings] = [] + context[:sheet_names] = [] + context[:sheets] = [] + end + def process_entry(entry, context) case entry.zip_path when "xl/sharedStrings.xml" - context[:shared_strings] = [] extract_text(entry, context[:shared_strings]) + when "xl/workbook.xml" + listener = WorkbookListener.new(context[:sheet_names]) + parse(entry.file_data, listener) when /\Axl\/worksheets\/sheet(\d+)\.xml\z/ nth_sheet = Integer($1, 10) sheet = [] listener = SheetListener.new(sheet) parse(entry.file_data, listener) - context[:sheets] ||= [] context[:sheets] << [nth_sheet, sheet] end end - def accumulate_text(context) + def finish_decompose(context, &block) + metadata = TextData.new("", source_data: context[:data]) + context[:attributes].each do |name, value| + metadata[name] = value + end + yield(metadata) + shared_strings = context[:shared_strings] sheets = context[:sheets].sort_by(&:first).collect(&:last) - sheet_texts = sheets.collect do |sheet| + sheet_names = context[:sheet_names] + sheets.each_with_index do |sheet, i| sheet_text = "" sheet.each do |row| row_texts = row.collect do |cell| @@ -71,9 +85,28 @@ module ChupaText end sheet_text << row_texts.join("\t") << "\n" end - sheet_text + text_data = TextData.new(sheet_text, source_data: context[:data]) + text_data["index"] = i + name = sheet_names[i] + text_data["name"] = name if name + yield(text_data) + end + end + + class WorkbookListener < SAXListener + URI = "http://schemas.openxmlformats.org/spreadsheetml/2006/main" + + def initialize(sheet_names) + @sheet_names = sheet_names + end + + def start_element(uri, local_name, qname, attributes) + return unless uri == URI + case local_name + when "sheet" + @sheet_names << attributes["name"] + end end - sheet_texts.join("\n") end class SheetListener < SAXListener Modified: lib/chupa-text/decomposers/office-open-xml.rb (+4 -8) =================================================================== --- lib/chupa-text/decomposers/office-open-xml.rb 2019-03-01 10:59:44 +0900 (f2e2483) +++ lib/chupa-text/decomposers/office-open-xml.rb 2019-03-01 11:24:08 +0900 (20543a8) @@ -34,11 +34,12 @@ module ChupaText end end - def decompose(data) + def decompose(data, &block) context = { - text: "", + data: data, attributes: {}, } + start_decompose(context) data.open do |input| Archive::Zip.open(input) do |zip| zip.each do |entry| @@ -56,12 +57,7 @@ module ChupaText end end end - text = accumulate_text(context) - text_data = TextData.new(text, source_data: data) - context[:attributes].each do |name, value| - text_data[name] = value - end - yield(text_data) + finish_decompose(context, &block) end private Modified: test/decomposers/test-office-open-xml-presentation.rb (+46 -46) =================================================================== --- test/decomposers/test-office-open-xml-presentation.rb 2019-03-01 10:59:44 +0900 (6cff112) +++ test/decomposers/test-office-open-xml-presentation.rb 2019-03-01 11:24:08 +0900 (a72fb13) @@ -48,41 +48,32 @@ class TestDecomposersOfficeOpenXMLPresentation < Test::Unit::TestCase sub_test_case("#decompose") do sub_test_case("attributes") do def decompose(attribute_name) - super(fixture_path("pptx", "attributes.pptx")).collect do |data| - data[attribute_name] - end + super(fixture_path("pptx", "attributes.pptx")).first[attribute_name] end def test_title - assert_equal(["Title"], decompose("title")) + assert_equal("Title", decompose("title")) end def test_author - assert_equal([nil], decompose("author")) + assert_equal(nil, decompose("author")) end def test_subject - assert_equal(["Subject"], decompose("subject")) + assert_equal("Subject", decompose("subject")) end def test_keywords - assert_equal(["Keyword1 Keyword2"], decompose("keywords")) + assert_equal("Keyword1 Keyword2", decompose("keywords")) end def test_modified_time - assert_equal([Time], - decompose("modified_time").collect(&:class)) + assert_equal(Time, decompose("modified_time").class) end def test_application - assert_equal(["LibreOffice"], - normalize_applications(decompose("application"))) - end - - def normalize_applications(applications) - applications.collect do |application| - normalize_application(application) - end + assert_equal("LibreOffice", + normalize_application(decompose("application"))) end def normalize_application(application) @@ -92,41 +83,50 @@ class TestDecomposersOfficeOpenXMLPresentation < Test::Unit::TestCase application end end - - def test_creation_date - assert_equal([nil], decompose("creation_date")) - end end - sub_test_case("one slide") do - def decompose - super(fixture_path("pptx", "one-slide.pptx")) - end - - def test_body - assert_equal([<<-BODY], decompose.collect(&:body)) -Slide1 title -Slide1 content - BODY + sub_test_case("slides") do + def decompose(path) + super(path).collect do |data| + [ + data["index"], + data.body, + ] + end end - end - sub_test_case("multi slides") do - def decompose - super(fixture_path("pptx", "multi-slides.pptx")) + def test_one_slide + assert_equal([ + [nil, ""], + [ + 0, + "Slide1 title\n" + + "Slide1 content\n", + ], + ], + decompose(fixture_path("pptx", "one-slide.pptx"))) end - def test_body - assert_equal([<<-BODY], decompose.collect(&:body)) -Slide1 title -Slide1 content - -Slide2 title -Slide2 content - -Slide3 title -Slide3 content - BODY + def test_multi_slides + assert_equal([ + [nil, ""], + [ + 0, + "Slide1 title\n" + + "Slide1 content\n", + ], + [ + 1, + "Slide2 title\n" + + "Slide2 content\n", + ], + [ + 2, + "Slide3 title\n" + + "Slide3 content\n", + ], + ], + decompose(fixture_path("pptx", "multi-slides.pptx"))) end end end Modified: test/decomposers/test-office-open-xml-workbook.rb (+68 -64) =================================================================== --- test/decomposers/test-office-open-xml-workbook.rb 2019-03-01 10:59:44 +0900 (16970b3) +++ test/decomposers/test-office-open-xml-workbook.rb 2019-03-01 11:24:08 +0900 (704f599) @@ -48,46 +48,36 @@ class TestDecomposersOfficeOpenXMLWorkbook < Test::Unit::TestCase sub_test_case("#decompose") do sub_test_case("attributes") do def decompose(attribute_name) - super(fixture_path("xlsx", "attributes.xlsx")).collect do |data| - data[attribute_name] - end + super(fixture_path("xlsx", "attributes.xlsx")).first[attribute_name] end def test_title - assert_equal(["Title"], decompose("title")) + assert_equal("Title", decompose("title")) end def test_author - assert_equal([nil], decompose("author")) + assert_equal(nil, decompose("author")) end def test_subject - assert_equal(["Subject"], decompose("subject")) + assert_equal("Subject", decompose("subject")) end def test_keywords - assert_equal(["Keyword1 Keyword2"], decompose("keywords")) + assert_equal("Keyword1 Keyword2", decompose("keywords")) end def test_created_time - assert_equal([Time], - decompose("created_time").collect(&:class)) + assert_equal(Time, decompose("created_time").class) end def test_modified_time - assert_equal([Time], - decompose("modified_time").collect(&:class)) + assert_equal(Time, decompose("modified_time").class) end def test_application - assert_equal(["LibreOffice"], - normalize_applications(decompose("application"))) - end - - def normalize_applications(applications) - applications.collect do |application| - normalize_application(application) - end + assert_equal("LibreOffice", + normalize_application(decompose("application"))) end def normalize_application(application) @@ -97,55 +87,69 @@ class TestDecomposersOfficeOpenXMLWorkbook < Test::Unit::TestCase application end end - - def test_creation_date - assert_equal([nil], decompose("creation_date")) - end end - sub_test_case("one sheet") do - def decompose - super(fixture_path("xlsx", "one-sheet.xlsx")) - end - - def test_body - assert_equal([<<-BODY], decompose.collect(&:body)) -Sheet1 - A1\tSheet1 - B1 -Sheet1 - A2\tSheet1 - B2 - BODY - end - end - - sub_test_case("not shared cell") do - def decompose - super(fixture_path("xlsx", "not-shared-cell.xlsx")) - end - - def test_body - assert_equal([<<-BODY], decompose.collect(&:body)) -Sheet1 - A1\tSheet1 - B1 -Sheet1 - A2\tSheet1 - B2 -0.5\t0.5 - BODY - end - end - - sub_test_case("multi sheets") do - def decompose - super(fixture_path("xlsx", "multi-sheets.xlsx")) + sub_test_case("sheets") do + def decompose(path) + super(path).collect do |data| + [ + data["index"], + data["name"], + data.body, + ] + end end - def test_body - assert_equal([<<-BODY], decompose.collect(&:body)) -Sheet1 - A1\tSheet1 - B1 -Sheet1 - A2\tSheet1 - B2 - -Sheet2 - A1\tSheet2 - B1 -Sheet2 - A2\tSheet2 - B2 - -Sheet3 - A1\tSheet3 - B1 -Sheet3 - A2\tSheet3 - B2 - BODY + def test_one_sheet + assert_equal([ + [nil, nil, ""], + [ + 0, + "Sheet1", + "Sheet1 - A1\tSheet1 - B1\n" + + "Sheet1 - A2\tSheet1 - B2\n", + ], + ], + decompose(fixture_path("xlsx", "one-sheet.xlsx"))) + end + + def test_no_shared_cell + assert_equal([ + [nil, nil, ""], + [ + 0, + "Sheet1", + "Sheet1 - A1\tSheet1 - B1\n" + + "Sheet1 - A2\tSheet1 - B2\n" + + "0.5\t0.5\n", + ], + ], + decompose(fixture_path("xlsx", "not-shared-cell.xlsx"))) + end + + def test_multi_sheets + assert_equal([ + [nil, nil, ""], + [ + 0, + "Sheet1", + "Sheet1 - A1\tSheet1 - B1\n" + + "Sheet1 - A2\tSheet1 - B2\n", + ], + [ + 1, + "Sheet2", + "Sheet2 - A1\tSheet2 - B1\n" + + "Sheet2 - A2\tSheet2 - B2\n", + ], + [ + 2, + "Sheet3", + "Sheet3 - A1\tSheet3 - B1\n" + + "Sheet3 - A2\tSheet3 - B2\n", + ], + ], + decompose(fixture_path("xlsx", "multi-sheets.xlsx"))) end end end -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190301/16654e58/attachment-0001.html>