#!/usr/bin/perl
#------------------------------------------------------------------------
# Filename    : search_lib.pl
# Author      : Oyewole, Olanrewaju J.
# Version     : 1.0
# Date        : 27/04/2002
# Description :
#------------------------------------------------------------------------
# This library contains subroutines that provide the necessary services
# for implementing a search system for HTML pages on a website.
# The names of the subroutines are self-explanatory, as such no
# additional description is provided. See inline comments for
# further details.
#
#   parse_text
#   remove_html_tags
#   remove_conjunctions
#   write_index_file
#   build_data_file
#   resolve_keywords
#   find_documents
#   calculate_percent_match
#   spool_results
#
#   build_document_list
#   retrieve_page_description
#   retrieve_keywords_from_file
#   retrieve_documents_from_file
#   build_document_number_list
#   update_keyword_hash
#   write_keyword_file
#
#------------------------------------------------------------------------
# Version     : 1.1
# Date        : 16/05/2002
# Description :
#------------------------------------------------------------------------
# Altered the keyword-matching subroutine (its name is missing from this
# header; presumably retrieve_keywords_from_file) to do case-insensitive
# matching of keywords.
# This reflects the fact that keywords are stored in lowercase in the
# keywords file, and so mixed case searches always failed previously.
#
# Altered the subroutine spool_results, to emphasise/bold keywords in
# the Description field of the search results. If the value for the
# input parameter $description_field_name is supplied, keywords found
# within that field are returned in bold, else, keywords are returned
# using the default format specified by the $section_format parameter.
#
# A minor change added to this release was to alter the keyword matching
# to match any part of the word in the keyword file. Such that "bat"
# matches bat, batting, or abate. This occurs when the user selects
# the OR search option.
# A line was also added to the subroutine
# retrieve_keywords_from_file to perform exact match where specified.
#------------------------------------------------------------------------
# Maintenance notes
#------------------------------------------------------------------------
# Subroutine names, parameter order and return shapes are unchanged.
#
# The copy of this file that was reviewed had lost every piece of text
# that looked like an HTML tag (file-handle reads, the <<NAME>> header
# placeholders, the <B> and <A> markup, parts of several regular
# expressions).  Those pieces were reconstructed from the surrounding
# code and comments; each reconstruction is marked "NOTE(review)" and
# should be confirmed against the templates / CGI script that use it.
#------------------------------------------------------------------------

# use HTML::Parse;
# use HTML::FormatText;

use strict;
use warnings;

unshift @INC, '..';

#------------------------------------------------------------------------
# _single_line ($text)
#   Private helper.  Returns $text with every run of whitespace (tabs
#   and newlines included) collapsed to one space and the ends trimmed;
#   undef becomes ''.  The data file is tab-separated with one document
#   per line, so every field written to it must pass through here.
#------------------------------------------------------------------------
sub _single_line {
    my ($text) = @_;
    return '' unless defined $text;
    $text =~ s/\s+/ /g;
    $text =~ s/^ //;
    $text =~ s/ $//;
    return $text;
}

#------------------------------------------------------------------------
# write_index_file ($file_array, $index_file, $file_path)
#   Writes the elements of @$file_array, unmodified and without any
#   separator, to "$file_path/$index_file".
#   Dies if the file cannot be opened or written.  Returns 1.
#------------------------------------------------------------------------
sub write_index_file {
    my ($file_array, $index_file, $file_path) = @_;

    my $target = "$file_path/$index_file";
    open(my $index_fh, '>', $target)
        or die "\n\n\nFailed to open INDEX file ($target) for output: $!\n\n";
    print {$index_fh} @$file_array;

    # Buffered write errors only surface at close, so check it.
    close($index_fh)
        or die "\n\n\nFailed to write INDEX file ($target): $!\n\n";
    return 1;
}

#------------------------------------------------------------------------
# build_data_file ($file_array, $data_file, $keyword_file, $file_path,
#                  $remove_file_path, $replace_url, $description_length)
#   Reads every document named in @$file_array and writes one line per
#   document to "$file_path/$data_file":
#       URL TAB TITLE TAB DESCRIPTION TAB SIZE TAB DATE TAB KEYWORDS
#   then writes the site-wide keyword list via write_keyword_file.
#   The line number of a document in the data file is the page number
#   stored against its keywords.
#
#   Side effects kept from the original: the elements of @$file_array
#   are rewritten in place ($remove_file_path replaced by $replace_url)
#   and $| is switched on for the currently selected handle.
#
#   Dies if the data file or any document cannot be opened.  Returns 1.
#------------------------------------------------------------------------
sub build_data_file {
    my ($file_array, $data_file, $keyword_file, $file_path,
        $remove_file_path, $replace_url, $description_length) = @_;

    my %keyword_hash;
    my $counter = 0;
    my $target  = "$file_path/$data_file";

    open(my $data_fh, '>', $target)
        or die "\n\n\nFailed to open DATA file ($target) for output: $!\n\n";
    $| = 1;

    # $document aliases the caller's array element on purpose (see above).
    foreach my $document (@$file_array) {
        # Report the document that failed, not the data file.
        open(my $text_fh, '<', $document)
            or die "\n\n\nFailed to open Data file in search_lib::build_data_file\n\n"
                 . "File = $document: $!\n\n";
        my @file_contents = <$text_fh>;
        my @file_stats    = stat($text_fh);
        close($text_fh);

        my $file_date = gmtime($file_stats[9]);
        my $file_size = $file_stats[7] > 9999
                      ? int($file_stats[7] / 1024) . ' Kbytes'
                      : $file_stats[7] . ' bytes';
        $counter++;

        # Replace the File System path with the HTTP path so that when
        # users search we don't have to do this on the fly, saving CPU,
        # memory, and time.  The path is matched literally (\Q..\E): it
        # is a path, not a pattern, and an empty pattern would silently
        # re-use the last successful regex.
        if (defined $remove_file_path && length $remove_file_path) {
            my $url_prefix = defined $replace_url ? $replace_url : '';
            $document =~ s/\Q$remove_file_path\E/$url_prefix/;
        }

        my $page_hash = parse_text(join('', @file_contents), $description_length);

        # Select unique keywords from this file and add them to the
        # global list of site-wide keywords, while mentioning that
        # the keyword was found on this page (number NOT filename).
        # This call also trims and lower-cases the keywords in place,
        # which is the form written to the data file below.
        update_keyword_hash(\%keyword_hash, $page_hash->{KEYWORDS}, $counter);

        print {$data_fh} join("\t",
            $document,
            _single_line($page_hash->{TITLE}),
            _single_line($page_hash->{DESCRIPTION}),
            $file_size,
            $file_date,
            join(' ', @{ $page_hash->{KEYWORDS} }),
        ), "\n";
    }

    close($data_fh)
        or die "\n\n\nFailed to write DATA file ($target): $!\n\n";

    write_keyword_file(\%keyword_hash, $keyword_file, $file_path);
    return 1;
}

#------------------------------------------------------------------------
# update_keyword_hash ($keyword_hash, $keyword_array, $counter)
#   For each keyword found, we create an entry in the global keyword hash
#   "Keyword => (list of page numbers in which keyword was found)".  We
#   use a hash here to prevent keywords being duplicated in our list.
#   Eventually the hash is written to a file using the notation
#   "Keyword TAB (list of page numbers in which keyword was found)", e.g.
#       ability    1, 14, 23, 24, 27, 31
#
#   The keywords in @$keyword_array are trimmed and lower-cased IN PLACE;
#   build_data_file relies on that.  Returns nothing.
#------------------------------------------------------------------------
sub update_keyword_hash {
    my ($keyword_hash, $keyword_array, $counter) = @_;

    foreach my $keyword (@$keyword_array) {
        $keyword =~ s/^ +//;
        $keyword =~ s/ +$//;
        $keyword = lc($keyword);

        my $pages = $keyword_hash->{$keyword};
        if (!defined $pages || $pages eq '') {
            $keyword_hash->{$keyword} = "$counter";
        }
        # Compare whole list entries: a plain /$counter/ test treats
        # page 1 as already present once page 11 or 21 is listed.
        elsif ($pages !~ /(?:\A|,\s*)\Q$counter\E(?:,|\z)/) {
            $keyword_hash->{$keyword} .= ", $counter";
        }
    }
    return;
}

#------------------------------------------------------------------------
# write_keyword_file ($keyword_hash, $keyword_file, $file_path)
#   Writes the keyword hash, sorted by keyword, to
#   "$file_path/$keyword_file".  Each record is written as
#   NEWLINE keyword TAB page-list (so the file starts with an empty line
#   and has no trailing newline).  The empty keyword is skipped.
#   Dies if the file cannot be opened or written.  Returns 1.
#------------------------------------------------------------------------
sub write_keyword_file {
    my ($keyword_hash, $keyword_file, $file_path) = @_;

    my $target = "$file_path/$keyword_file";
    open(my $keyword_fh, '>', $target)
        or die "\n\n\nFailed to open KEYWORD file ($target) for output: $!\n\n";

    foreach my $hash_key (sort keys %$keyword_hash) {
        next if $hash_key eq '';
        print {$keyword_fh} "\n", $hash_key, "\t", $keyword_hash->{$hash_key};
    }

    close($keyword_fh)
        or die "\n\n\nFailed to write KEYWORD file ($target): $!\n\n";
    return 1;
}

#------------------------------------------------------------------------
# parse_text ($file_contents, $description_length [, $conjunctions_file])
#   Splits the text of one HTML page into the parts the index needs.
#   Returns a reference to a hash with the keys
#       TITLE        contents of the title element ('' if none)
#       DESCRIPTION  the meta description if one is found, otherwise the
#                    title followed by the first $description_length
#                    characters of the page text
#       KEYWORDS     reference to an array of the remaining words
#   $conjunctions_file is optional and defaults to './conjunctions.db'.
#
#   NOTE(review): the script/style, title, meta-description and
#   punctuation patterns below were rebuilt from damaged originals;
#   confirm they match what the site's pages need.
#------------------------------------------------------------------------
sub parse_text {
    my ($file_contents, $description_length, $conjunctions_file) = @_;
    my %page_hash;

    $file_contents     = ''                  unless defined $file_contents;
    $conjunctions_file = './conjunctions.db' unless defined $conjunctions_file;

    # Comments and script/style bodies never contain searchable text.
    $file_contents =~ s/<!--.*?-->//gs;
    $file_contents =~ s/<(script|style)\b[^>]*>.*?<\/\1\s*>//gis;

    # Only use $1 when the match succeeded; otherwise it still holds
    # the capture of some earlier, unrelated match.
    if ($file_contents =~ s/<title\b[^>]*>(.*?)<\/title\s*>//is) {
        $page_hash{TITLE} = _single_line($1);
    }
    else {
        $page_hash{TITLE} = '';
    }

    if ($file_contents =~ /<meta\s+name\s*=\s*["']?description["']?\s+content\s*=\s*(["'])(.*?)\1/is) {
        $page_hash{DESCRIPTION} = _single_line($2);
    }

    $file_contents = remove_html_tags($file_contents);

    # Punctuation separates words; turn it into spaces.
    $file_contents =~ s/[,.;"'?!()<>\[\]\/:|-]/ /g;
    $file_contents = _single_line($file_contents);

    # The fallback description is taken before the conjunctions are
    # removed so that it still reads as a sentence.
    $page_hash{DESCRIPTION} ||= _single_line(
        $page_hash{TITLE} . ' ' . substr($file_contents, 0, $description_length || 0)
    );

    $file_contents = remove_conjunctions($file_contents, $conjunctions_file);

    # Having removed everything else, what we have left in the string
    # should now just be the keywords, separated by whitespace.
    my @keywords = split(' ', $file_contents);
    $page_hash{KEYWORDS} = \@keywords;

    return \%page_hash;
}

#------------------------------------------------------------------------
# remove_html_tags ($file_contents)
#   Returns $file_contents with the markup replaced by spaces and all
#   runs of whitespace collapsed to a single space.  undef yields ''.
#
#   NOTE(review): the original patterns were damaged; the steps below
#   follow the surviving comments and pattern tails.
#------------------------------------------------------------------------
sub remove_html_tags {
    my ($file_contents) = @_;
    return '' unless defined $file_contents;

    # First we remove everything before the <body> tag; if there is one.
    $file_contents =~ s/\A.*?<body\b[^>]*>/ /is;

    $file_contents =~ s/<!--.*?-->/ /gs;               # comments
    $file_contents =~ s/<[^>]*>/ /gs;                  # tags
    $file_contents =~ s/&\#?[a-z0-9]{2,8};/ /ig;       # character entities
    $file_contents =~ s/--\>/ /g;                      # stray comment ends
    $file_contents =~ s/\s+/ /g;

    # return HTML::FormatText->new->format(parse_html($file_contents));
    ## Do not uncomment !!
    # This method is actually significantly SLOWER, and hard to debug!
    return $file_contents;
}

#------------------------------------------------------------------------
# remove_conjunctions ($text_string, $conjunctions_file_name)
#   Returns $text_string with every whole-word, case-insensitive
#   occurrence of a word listed in $conjunctions_file_name (one per
#   line) replaced by a single space.  If the file cannot be opened the
#   text is returned unchanged (deliberate best effort).
#------------------------------------------------------------------------
sub remove_conjunctions {
    my ($text_string, $conjunctions_file_name) = @_;

    return $text_string
        unless defined $text_string && defined $conjunctions_file_name;

    open(my $conjunctions_fh, '<', $conjunctions_file_name)
        or return $text_string;

    my @conjunctions;
    while (my $conjunction = <$conjunctions_fh>) {
        $conjunction =~ s/[\r\n]+\z//;
        # An empty line would produce /\b\b/, which matches at every
        # word boundary and litters the text with spaces.
        next unless length $conjunction;
        push @conjunctions, quotemeta($conjunction);
    }
    close($conjunctions_fh);

    if (@conjunctions) {
        # Longest first, so that a longer entry is preferred over a
        # shorter one that is a prefix of it.
        my $alternation = join('|',
            sort { length($b) <=> length($a) } @conjunctions);
        $text_string =~ s/\b(?:$alternation)\b/ /ig;
    }
    return $text_string;
}

#------------------------------------------------------------------------
# find_documents ($search_files_path, $keywords_file, $data_file,
#                 $search_operator, $keywords_array)
#   Runs a complete search.  Returns a reference to an array of document
#   hashes (URL, TITLE, DESCRIPTION, SIZE, DATE, KEYWORDS,
#   PERCENT_MATCH), best match first.
#
#   For an AND search every requested keyword must exist in the keywords
#   file; a keyword that matches nothing yields an empty result instead
#   of being silently ignored.
#------------------------------------------------------------------------
sub find_documents {
    my ($search_files_path, $keywords_file, $data_file,
        $search_operator, $keywords_array) = @_;

    my $keyword_array = retrieve_keywords_from_file(
        $search_files_path, $keywords_file, $keywords_array, $search_operator);

    if (defined $search_operator && uc($search_operator) eq 'AND') {
        my %matched = map { lc($_) => 1 } map { keys %$_ } @$keyword_array;
        foreach my $keyword (@$keywords_array) {
            next unless defined($keyword) && length($keyword);
            return [] unless $matched{ lc($keyword) };
        }
    }

    my $file_number_hash =
        build_document_number_list($keyword_array, $search_operator);

    # retrieve_documents_from_file scores each document itself (it needs
    # the score to sort), so no second scoring pass is needed here.
    return retrieve_documents_from_file(
        $search_files_path, $data_file, $file_number_hash, $keywords_array);
}

#------------------------------------------------------------------------
# retrieve_keywords_from_file ($search_files_path, $keywords_file,
#                              $keywords, $search_operator)
#   Scans the site-wide keywords file.  Each line holds a keyword, a
#   tab, and the list of data-file line numbers of the documents that
#   contain it.  Returns a reference to an array of one-pair hashes
#   { keyword-from-file => page-list }, one per (line, keyword) match.
#
#   Matching is case-insensitive.  With the OR operator (any case) the
#   search keyword may match any part of the stored keyword; otherwise
#   the whole stored keyword must match.  Search keywords are matched
#   literally, never as regular expressions.
#   Dies if the keywords file cannot be opened.
#------------------------------------------------------------------------
sub retrieve_keywords_from_file {
    my ($search_files_path, $keywords_file, $keywords, $search_operator) = @_;
    my @keyword_array;

    my $match_any_part =
        defined $search_operator && uc($search_operator) eq 'OR';

    my $source = "$search_files_path/$keywords_file";
    open(my $keywords_fh, '<', $source)
        or die "\nFailed to open Keywords file ($source): $!";

    # We scroll through the keywords file, and compare each of the
    # input keywords with the current keyword from the file; if there
    # is a match, we keep the keyword and its page list for later.
    while (my $keyword_string = <$keywords_fh>) {
        $keyword_string =~ s/[\r\n]+\z//;

        foreach my $keyword (@$keywords) {
            next unless defined($keyword) && length($keyword);

            my $found = $match_any_part
                ? $keyword_string =~ /^(\S*\Q$keyword\E\S*)\t(.*)/i
                : $keyword_string =~ /^(\Q$keyword\E)\t(.*)/i;
            next unless $found;

            push(@keyword_array, { $1 => $2 });
        }
    }
    close($keywords_fh);

    return \@keyword_array;
}

#------------------------------------------------------------------------
# retrieve_documents_from_file ($search_files_path, $data_file,
#                               $file_number_hash, $keywords_array)
#   Reads the data file and builds a document hash for every line whose
#   number has a true value in %$file_number_hash.  Each document is
#   scored with calculate_percent_match.  Returns a reference to the
#   array of document hashes ordered by PERCENT_MATCH, highest first;
#   equal scores keep the original ordering (later lines first).
#   Dies if the data file cannot be opened.
#------------------------------------------------------------------------
sub retrieve_documents_from_file {
    my ($search_files_path, $data_file, $file_number_hash, $keywords_array) = @_;

    my $source = "$search_files_path/$data_file";
    open(my $data_fh, '<', $source)
        or die "\n\nFailed to open Documents file ($source) at "
             . "search_lib::retrieve_documents_from_file: $!";

    my @scored;                 # [ score, line number, document hash ]
    my $line_number = 0;

    while (my $current_document = <$data_fh>) {
        $line_number++;
        next unless $file_number_hash->{$line_number};

        $current_document =~ s/[\r\n]+\z//;

        # split keeps empty fields in place; peeling fields off with
        # s/([^\t]+)\t// shifts every later field when one is empty.
        my @fields = split(/\t/, $current_document, 6);
        my %document_hash;
        foreach my $field_name (qw(URL TITLE DESCRIPTION SIZE DATE KEYWORDS)) {
            my $value = shift @fields;
            $document_hash{$field_name} = defined $value ? $value : '';
        }

        my $score = calculate_percent_match($keywords_array, \%document_hash);
        push(@scored, [ $score, $line_number, \%document_hash ]);
    }
    close($data_fh);

    my @document_array =
        map  { $_->[2] }
        sort { $b->[0] <=> $a->[0] || $b->[1] <=> $a->[1] } @scored;

    return \@document_array;
}

#------------------------------------------------------------------------
# build_document_number_list ($keyword_array, $search_operator)
#   Combines the page lists of the keyword hashes returned by
#   retrieve_keywords_from_file.  With the AND operator (any case) only
#   the page numbers present in every list survive; with any other
#   operator the lists are merged.  Returns a reference to a hash whose
#   keys are the page numbers (values are 1).
#------------------------------------------------------------------------
sub build_document_number_list {
    my ($keyword_array, $search_operator) = @_;

    my $intersect = defined $search_operator && uc($search_operator) eq 'AND';
    my %file_number_hash;
    my $first_list = 1;

    foreach my $keyword_hash (@$keyword_array) {
        my %current;
        foreach my $page_list (values %$keyword_hash) {
            next unless defined $page_list;
            foreach my $page_number (split(/,/, $page_list)) {
                $page_number =~ s/^\s+//;
                $page_number =~ s/\s+$//;
                $current{$page_number} = 1 if length $page_number;
            }
        }

        if ($first_list) {
            %file_number_hash = %current;
            $first_list = 0;
        }
        elsif ($intersect) {
            foreach my $page_number (keys %file_number_hash) {
                delete $file_number_hash{$page_number}
                    unless $current{$page_number};
            }
        }
        else {
            $file_number_hash{$_} = 1 for keys %current;
        }
    }

    return \%file_number_hash;
}

#------------------------------------------------------------------------
# calculate_percent_match ($input_keywords, $page_hash)
#   Scores a document against the search keywords, stores the score in
#   $$page_hash{PERCENT_MATCH} and returns it.  The score is a string
#   produced by sprintf("%2d"), between 40 and 99.
#
#   $$page_hash{KEYWORDS} may be a string (as read from the data file)
#   or a reference to an array of keywords.  Keywords are matched
#   literally and case-insensitively.  A document without any word
#   characters scores the minimum instead of dividing by zero.
#------------------------------------------------------------------------
sub calculate_percent_match {
    my ($input_keywords, $page_hash) = @_;

    my $keywords = $page_hash->{KEYWORDS};
    my $keywords_string = ref($keywords) eq 'ARRAY' ? join(' ', @$keywords)
                        : defined $keywords         ? $keywords
                        :                             '';

    # Number of word characters in the document's keyword text.
    my $count = () = $keywords_string =~ /\w/g;
    my $max   = 0;

    foreach my $keyword (@$input_keywords) {
        next unless defined($keyword) && length($keyword);

        my $match   = () = $keywords_string =~ /\Q$keyword\E/ig;
        my $percent = $count ? (($match * 3.7) / $count) * 100 : 0;
        $max = $percent if $percent > $max;
    }

    # Scaling kept exactly as in the original scoring formula.
    $max = $max * 100 if $max <= 0.999;
    $max = $max + 40;
    $max = 99 if $max >= 100;
    $max = sprintf("%2d", $max);

    $page_hash->{PERCENT_MATCH} = $max;
    return $max;
}

#------------------------------------------------------------------------
# resolve_keywords ($search_string, $search_option)
#   This subroutine breaks up the search string, and gives us a unique
#   list of keywords to be processed by the search routines.  The words
#   "and" and "or" standing between other words are dropped.  In case
#   the $search_option parameter specifies an Exact match, there is no
#   splitting of the search string.
#   Returns a reference to an array of keywords, in the order in which
#   they first appear in the search string.
#------------------------------------------------------------------------
sub resolve_keywords {
    my ($search_string, $search_option) = @_;
    $search_string = '' unless defined $search_string;

    # Lookarounds (rather than consuming the surrounding spaces) so that
    # adjacent connectives, as in "a and or b", are both removed.
    $search_string =~ s/(?<=\s)(?:and|or)(?=\s)/ /ig;
    $search_string = _single_line($search_string);

    my @candidates = (defined $search_option && $search_option =~ /exact/i)
                   ? ($search_string)
                   : split(' ', $search_string);

    my (%seen, @keywords_array);
    foreach my $keyword (@candidates) {
        next unless length $keyword;
        next if $seen{$keyword}++;
        push(@keywords_array, $keyword);
    }

    return \@keywords_array;
}

#------------------------------------------------------------------------
# _embolden_keywords ($text, $keywords)
#   Private helper.  Returns $text with the first occurrence of each
#   keyword wrapped in <B>..</B>.  Done in a single pass so that a later
#   keyword can never match inside markup inserted for an earlier one.
#------------------------------------------------------------------------
sub _embolden_keywords {
    my ($text, $keywords) = @_;
    return $text unless defined $text;

    my @terms = sort { length($b) <=> length($a) }
                grep { defined($_) && length($_) } @$keywords;
    return $text unless @terms;

    my $pattern = join('|', map { quotemeta($_) } @terms);
    my %already_bold;
    $text =~ s{($pattern)}{ $already_bold{ lc($1) }++ ? $1 : "<B>$1</B>" }gie;
    return $text;
}

#------------------------------------------------------------------------
# spool_results ($documents_list, $search_string_array, $search_header,
#                $search_section, $base_url, $search_start,
#                $search_increment, $search_target, $search_option,
#                $max_documents, $description_field_name)
#   This could use the template library routines, and a template file!
#   The template library is not being used here because this search
#   engine may be released as Open Source (GNU), while the template
#   library and engine, are for now proprietary.
#
#   Prints to the selected output handle: the header, one copy of
#   $search_section for each document numbered $search_start up to
#   $search_start + $search_increment - 1, and a "Pages" line with one
#   entry per block of $search_increment documents.  <<KEY>> in the
#   section is replaced by $$document{KEY}; COUNTER is set on every
#   document visited.  If $description_field_name is given, the search
#   keywords found in that field are printed in bold (the stored record
#   itself is left untouched).
#
#   $search_start defaults to 1, $search_increment to "everything on one
#   page", and an undefined or non-positive $max_documents means no
#   limit.  Returns 1.
#
#   NOTE(review): the header placeholder names and the page-link markup
#   were lost from the reviewed copy.  The names used below
#   (<<SEARCH_STRING>>, <<DOCUMENT_COUNT>>, <<SEARCH_START>>,
#   <<SEARCH_INCREMENT>>, <<SEARCH_STOP>>, <<SEARCH_OPTION>>) and the
#   "search_start" query parameter are assumptions - confirm them
#   against the header template and the CGI script before release.
#------------------------------------------------------------------------
sub spool_results {
    my ($documents_list, $search_string_array, $search_header,
        $search_section, $base_url, $search_start, $search_increment,
        $search_target, $search_option, $max_documents,
        $description_field_name) = @_;

    $documents_list      = [] unless ref $documents_list;
    $search_string_array = [] unless ref $search_string_array;
    $search_header       = '' unless defined $search_header;
    $search_section      = '' unless defined $search_section;
    $base_url            = '' unless defined $base_url;
    $search_option       = '' unless defined $search_option;

    my $document_count = scalar @$documents_list;

    $search_start = 1
        unless defined $search_start && $search_start >= 1;
    # A zero increment would otherwise end in "Illegal modulus zero".
    $search_increment = $document_count || 1
        unless defined $search_increment && $search_increment >= 1;
    my $unlimited = !(defined $max_documents && $max_documents > 0);

    my $last_on_page = $search_start + ($search_increment - 1);
    my $search_stop  = $document_count <= $last_on_page
                     ? $document_count
                     : $last_on_page;
    my $search_target_out = $search_target ? qq{ TARGET="$search_target" } : '';

    my %header_value_for = (
        SEARCH_STRING    => join(' ', @$search_string_array),
        DOCUMENT_COUNT   => $document_count,
        SEARCH_START     => $search_start,
        SEARCH_INCREMENT => $search_increment,
        SEARCH_STOP      => $search_stop,
        SEARCH_OPTION    => $search_option,
    );
    $search_header =~ s{<<(\w+)>>}
                       { exists $header_value_for{ uc($1) }
                           ? $header_value_for{ uc($1) }
                           : "<<$1>>" }ge;
    print "\n$search_header";

    my $pages      = 'Pages ';
    my $page_count = 0;
    my $counter    = 0;
    my $query_glue = index($base_url, '?') >= 0 ? '&' : '?';

    foreach my $document (@$documents_list) {
        $counter++;
        $document->{COUNTER} = $counter;

        if ($counter >= $search_start && $counter <= $search_stop) {
            # Render from a copy so that the bold markup is not written
            # back into the caller's record.
            my %view = %$document;
            if ($description_field_name) {
                $view{$description_field_name} = _embolden_keywords(
                    $view{$description_field_name}, $search_string_array);
            }

            my $section = $search_section;
            $section =~ s/<<([^>]+)>>/defined $view{$1} ? $view{$1} : ''/ge;
            print "\n$section";
        }

        # First document of a page.  ($counter % $increment) == 1 is
        # never true when the increment is 1, which hid the pager.
        if ((($counter - 1) % $search_increment) == 0) {
            $page_count++;
            my $page_url = $counter == $search_start
                ? $page_count
                : qq|<A HREF="$base_url${query_glue}search_start=$counter"$search_target_out>$page_count</A>|;
            $pages .= ($pages eq 'Pages ') ? $page_url : ' | ' . $page_url;
        }

        last if !$unlimited && $counter >= $max_documents;
    }

    print "\n\n\n$pages ";
    return 1;
}

1;

Failed to open INDEX file ($file_path/$index_file) for output!

Failed to open Data file in search_lib::build_data_file\nFile = $file_path/$data_file

Failed to open DATA file ($file_path/$data_file) for output!

Failed to open INDEX file ($file_path/$keyword_file) for output!

Failed to open Data file in search_lib::build_data_file\n
File = $file_path/$data_file