[Groonga-commit] groonga/wikipedia-search at e79d1b8 [master] Support CSV output

Back to archive index

Kouhei Sutou null+****@clear*****
Sun Feb 7 15:24:30 JST 2016


Kouhei Sutou	2016-02-07 15:24:30 +0900 (Sun, 07 Feb 2016)

  New Revision: e79d1b8a49f234ea3285561c2bdea499eb26d39e
  https://github.com/groonga/wikipedia-search/commit/e79d1b8a49f234ea3285561c2bdea499eb26d39e

  Message:
    Support CSV output

  Added files:
    lib/wikipedia-search/csv-converter.rb
  Modified files:
    bin/wikipedia-convert
    lib/wikipedia-search/path.rb
    lib/wikipedia-search/task.rb

  Modified: bin/wikipedia-convert (+4 -1)
===================================================================
--- bin/wikipedia-convert    2015-05-29 12:23:33 +0900 (437da6e)
+++ bin/wikipedia-convert    2016-02-07 15:24:30 +0900 (9eaaa60)
@@ -12,8 +12,9 @@ $LOAD_PATH.unshift(lib_dir_path.to_s)
 
 require "wikipedia-search/groonga-converter"
 require "wikipedia-search/sql-converter"
+require "wikipedia-search/csv-converter"
 
-available_formats = [:groonga, :sql]
+available_formats = [:groonga, :sql, :csv]
 
 options = OpenStruct.new
 options.output = "-"
@@ -52,6 +53,8 @@ when :groonga
   converter_class = WikipediaSearch::GroongaConverter
 when :sql
   converter_class = WikipediaSearch::SQLConverter
+when :csv
+  converter_class = WikipediaSearch::CSVConverter
 end
 converter = converter_class.new(ARGF, converter_options)
 if options.output == "-"

  Added: lib/wikipedia-search/csv-converter.rb (+45 -0) 100644
===================================================================
--- /dev/null
+++ lib/wikipedia-search/csv-converter.rb    2016-02-07 15:24:30 +0900 (ae05a5e)
@@ -0,0 +1,45 @@
+require "csv"
+
+require "wikipedia-search/converter"
+
+module WikipediaSearch
+  class CSVConverter < Converter
+    private
+    def create_listener(output)
+      CSVListener.new(output, @options)
+    end
+
+    class CSVListener < Listener
+      def on_start
+        @csv = CSV.new(@output)
+      end
+
+      def on_finish
+        @csv.close
+      end
+
+      def on_page(page)
+        record_values = [
+          @page.id,
+          escape_string(@page.title),
+          escape_string(shorten_text(@page.text)),
+        ]
+        @csv << record_values
+      end
+
+      private
+      def escape_string(string)
+        string.gsub(/[\\\r\n]/) do |special_character|
+          case special_character
+          when "\r"
+            "\\r"
+          when "\n"
+            "\\n"
+          else
+            "\\#{special_character}"
+          end
+        end
+      end
+    end
+  end
+end

  Modified: lib/wikipedia-search/path.rb (+23 -0)
===================================================================
--- lib/wikipedia-search/path.rb    2015-05-29 12:23:33 +0900 (aedacf9)
+++ lib/wikipedia-search/path.rb    2016-02-07 15:24:30 +0900 (ce8a54e)
@@ -34,6 +34,10 @@ module WikipediaSearch
     def sql
       SQLPath.new(self, @language)
     end
+
+    def csv
+      CSVPath.new(self, @language)
+    end
   end
 
   class WikipediaPath
@@ -195,4 +199,23 @@ module WikipediaSearch
       data_dir + "#{@language}-all-pages.sql"
     end
   end
+
+  class CSVPath
+    def initialize(base_path, language)
+      @base_path = base_path
+      @language = language
+    end
+
+    def data_dir
+      @base_path.data_dir + "csv"
+    end
+
+    def pages
+      data_dir + "#{@language}-pages.csv"
+    end
+
+    def all_pages
+      data_dir + "#{@language}-all-pages.csv"
+    end
+  end
 end

  Modified: lib/wikipedia-search/task.rb (+11 -0)
===================================================================
--- lib/wikipedia-search/task.rb    2015-05-29 12:23:33 +0900 (87a4b2c)
+++ lib/wikipedia-search/task.rb    2016-02-07 15:24:30 +0900 (caf5bb7)
@@ -65,6 +65,7 @@ module WikipediaSearch
         define_data_convert_groonga_tasks
         define_data_convert_droonga_tasks
         define_data_convert_sql_tasks
+        define_data_convert_csv_tasks
       end
     end
 
@@ -159,6 +160,16 @@ module WikipediaSearch
       end
     end
 
+    def define_data_convert_csv_tasks
+      namespace :csv do
+        define_wikipedia_data_convert_tasks("csv",
+                                            @path.csv.pages,
+                                            @path.csv.all_pages)
+        desc "Convert Japanese Wikipedia page data to CSV data."
+        task :ja => @path.csv.pages.to_s
+      end
+    end
+
     def define_local_tasks
       namespace :local do
         define_local_groonga_tasks
-------------- next part --------------
HTMLの添付ファイルを保管しました...
Download 



More information about the Groonga-commit mailing list
Back to archive index