svnno****@sourc*****
svnno****@sourc*****
Thu Feb 28 22:56:39 JST 2008
Revision: 3411 http://svn.sourceforge.jp/cgi-bin/viewcvs.cgi?root=kazehakase&view=rev&rev=3411 Author: pal_gene Date: 2008-02-28 22:56:39 +0900 (Thu, 28 Feb 2008) Log Message: ----------- * modules/search/kz-hyper-estraier-search.c: try to extract the title of content, when index is completely rebuild (make_index()). Modified Paths: -------------- kazehakase/trunk/ChangeLog kazehakase/trunk/module/search/kz-hyper-estraier-search.c Modified: kazehakase/trunk/ChangeLog =================================================================== --- kazehakase/trunk/ChangeLog 2008-02-28 03:58:41 UTC (rev 3410) +++ kazehakase/trunk/ChangeLog 2008-02-28 13:56:39 UTC (rev 3411) @@ -1,3 +1,9 @@ +2008-02-28 Shunichi Fuji <palgl****@gmail*****> + + * modules/search/kz-hyper-estraier-search.c: + try to extract the title of content, + when index is completely rebuild (make_index()). + 2008-02-28 Hiroyuki Ikezoe <poinc****@ikezo*****> * module/embed/gecko/kz-mozlauncher.cpp: The directory which is Modified: kazehakase/trunk/module/search/kz-hyper-estraier-search.c =================================================================== --- kazehakase/trunk/module/search/kz-hyper-estraier-search.c 2008-02-28 03:58:41 UTC (rev 3410) +++ kazehakase/trunk/module/search/kz-hyper-estraier-search.c 2008-02-28 13:56:39 UTC (rev 3411) @@ -368,6 +368,7 @@ desc = est_doc_make_snippet(doc, highlights, num_summary, half_of_summary, half_of_summary); /* make highlight keyword */ + //TODO:more fast that don't split desc_highlight = g_strsplit_set(desc, "\n", 0); free(desc); /* don't g_free because born from estraier func */ desc_len = g_strv_length(desc_highlight); @@ -459,6 +460,89 @@ return encoding; } + +/* Type and functions that used in gMmarkupParser */ +/* drop at the tiem +typedef struct { + gint in_title; + gchar *title; +} title_parsing; +static void title_start_element(GMarkupParseContext *context, const gchar *element_name, + const gchar **attribute_names, const gchar **attribute_values, + gpointer user_data, GError **error) +{ + gchar *tag = g_utf8_strup(element_name, -1); +// printf("what elem:%s\n", tag); + if(g_strcmp(tag, "TITLE") == 0) { + ((title_parsing *) user_data)->in_title = 1; + } else if(g_strcmp(tag, "BODY") == 0) { + g_markup_parse_context_end_parse(context, error); + } + g_free(tag); +} +static void title_end_element(GMarkupParseContext *context, const gchar *element_name, + gpointer user_data, GError **error) +{ + gchar *tag = g_utf8_strup(element_name, -1); + if(g_strcmp(tag, "META") == 0) { + g_markup_parse_context_end_parse(context, error); + } + g_free(tag); +} +static void title_text(GMarkupParseContext *context, const gchar *text, + gsize text_len, gpointer user_data, GError **error) +{ + if(((title_parsing *) user_data)->in_title != 0) { + ((title_parsing *) user_data)->title = g_strndup(text, text_len); + printf("title get in func:%s\n", ((title_parsing *) user_data)->title); + g_markup_parse_context_end_parse(context, error); + ((title_parsing *) user_data)->in_title = 0; + } +} +drop at the time */ +/** + * create page title from html content string + * currentlly called from only register_documents_in_path. + * @parm contents utf-8 strings that contain html contents + * @return gchar* new allocated, should be g_free or NULL for failed parsing + */ +static gchar * +get_document_title(const gchar *contents) +{ + /* didn't work yet. + GMarkupParser title_parser_funcs = { + title_start_element, title_end_element, + title_text, NULL, NULL + }; + title_parsing data = {0, NULL}; + GMarkupParseContext *title_parser; + + title_parser = g_markup_parse_context_new(&title_parser_funcs, + 0, + &data, + NULL); + if(!title_parser) { + printf("parser create fail.\n"); + return NULL; + } + g_markup_parse_context_parse(title_parser, contents, 1024, NULL); + g_markup_parse_context_free(title_parser); + printf("get title:%s\n", data.title); + return data.title; + + */ + /* simple parser */ + //FIXME: parse correctly + gchar *start; + start = strcasestr(contents, "<title>"); + if(start == NULL) { + //printf("not in title.\n"); + return NULL; + } + start += strlen("<title>"); + return g_strndup(start, strstr(start, "<") - start); +} + gboolean register_document (KzSearch *search, const gchar *uri, const gchar *title, const gchar *contents, GTime mtime) { @@ -505,7 +589,7 @@ g_warning("register error: %s", est_err_msg(est_mtdb_error(he_search->db))); g_warning("retry..."); est_mtdb_sync(he_search->db); - if(est_mtdb_put_doc(he_search->db, doc, ESTPDCLEAN)) { + if(est_mtdb_put_doc(he_search->db, doc, 0)) { g_warning("succeed!"); } else { g_warning("register error: %s", est_err_msg(est_mtdb_error(he_search->db))); @@ -564,6 +648,14 @@ return NULL; } +/** + * register all documents in given path + * @parm search this search class + * @parm path path to the directory contains contents to register + * @return void + */ +//TODO:don't register image type +//TODO:correct parse title (get_document_title()) static void register_documents_in_path (KzSearch *search, const gchar *path) { @@ -596,8 +688,6 @@ uri = create_uri_from_filename(new_path + he_search->cache_path_len + 1); - title = uri; - encoding = get_document_encoding(contents); if (!encoding) encoding = g_strdup(est_enc_name(contents, @@ -611,23 +701,37 @@ "UTF-8", encoding, NULL, NULL, NULL); + if (utf8_contents) { - kz_search_register_document(search, + title = get_document_title(utf8_contents); + if(!title) { + // printf("fail title in %s\n", uri); + title = g_strdup(uri); + } + register_document(search, uri, title, utf8_contents, mtime); g_free(utf8_contents); + g_free(title); } } - else - kz_search_register_document(search, uri, title, + else { + title = get_document_title(contents); + if(!title) { + // printf("fail title in %s\n", uri); + title = g_strdup(uri); + } + register_document(search, uri, title, contents, mtime); - + g_free(title); + } if (encoding) g_free(encoding); g_free(uri); + g_free(contents); } g_free(new_path);