#!/usr/bin/perl

#------------------------------------------------------------------------
#  Filename	:	search_lib.pl
#  Author	:	Oyewole, Olanrewaju J.
#  Version	:	1.0
#  Date		:	27/04/2002
#  Description	:
#------------------------------------------------------------------------
# This library contains subroutines that provide the necessary services
# for implementing a search system for HTML pages on a website.
# The names of the subroutines are self-explanatory, as such no
# additional description is provided.  See inline comments for
# further details.
#
#  parse_text
#  remove_html_tags
#  remove_conjunctions
#  write_index_file
#  build_data_file
#  resolve_keywords
#  find_documents
#  calculate_percent_match
#  spool_results
#
#  build_document_list
#  retrieve_page_description
#  retrieve_keywords_from_file
#  retrieve_documents_from_file
#  build_document_number_list
#  update_keyword_hash
#  write_keyword_file
#
#------------------------------------------------------------------------
#  Version	:	1.1
#  Date		:	16/05/2002
#  Description	:
#------------------------------------------------------------------------
#  Altered the subroutine retrieve_keywords_from_file, to do
#  case-insensitive matching of keywords.
#  This reflects the fact that keywords are stored in lowercase in the
#  keywords file, and so mixed case searches always failed previously.
#  Altered the subroutine spool_results, to emphasise/bold keywords in
#  the Description field of the search results.  If the value for the
#  input parameter $description_field_name is supplied, keywords found
#  within that field are returned in bold, else, keywords are returned
#  using the default format specified by the $section_format parameter.
#  A minor change added to this release was to alter the keyword matching
#  to match any part of the word in the keyword file.  Such that "bat"
#  matches bat, batting, or abate.  This occurs when the user selects
#  the OR search option.  A line was also added to the subroutine
#  retrieve_keywords_from_file to perform exact match where specified.
#------------------------------------------------------------------------

# use HTML::Parse;
# use HTML::FormatText;

# Prepend the parent directory to Perl's module search path, so that it
# is consulted first by any later "require"/"use".
unshift @INC, '..';



# Writes the list of indexed files to the index file.
#
# Parameters:
#   $file_array - reference to an array of strings.  The elements are
#                 printed back-to-back exactly as given; no separator
#                 or newline is added.
#   $index_file - name of the index file to create (or overwrite).
#   $file_path  - directory in which the index file is written.
#
# Returns 1 on success.  Dies if the file cannot be opened, or if the
# buffered output cannot be written out when the file is closed.
sub write_index_file {
	my ($file_array, $index_file, $file_path) = @_;

	# Three-argument open with a lexical handle, so that the file name
	# can never be interpreted as an open mode.
	open(my $index_fh, '>', "$file_path/$index_file")
		or die "\n<h3>Failed to open INDEX file ($file_path/$index_file) for output!</h3>";

	print {$index_fh} @$file_array;

	# The handle used to be left open, which kept the data in the output
	# buffer until the program ended.  Closing it flushes the data now
	# and lets us report a failed write.
	close($index_fh)
		or die "\n<h3>Failed to write INDEX file ($file_path/$index_file): $!</h3>";

	return 1;
}



# Builds the data file and the keyword file for a list of documents.
#
# For every document one line is written to the data file:
#   URL TAB title TAB description TAB size TAB date TAB keywords
# The position of that line (1 for the first document, 2 for the
# second, ...) is the document number recorded against each keyword
# in the keyword file.
#
# Parameters:
#   $file_array         - reference to an array of document file names.
#                         NOTE: the elements are rewritten in place (file
#                         system path replaced by the URL), so the
#                         caller's array is modified.
#   $data_file          - name of the data file to create.
#   $keyword_file       - name of the keyword file to create.
#   $file_path          - directory in which both files are written.
#   $remove_file_path   - leading file system path to strip from each
#                         document name.  It is used as a regular
#                         expression, so metacharacters are significant.
#   $replace_url        - text that replaces $remove_file_path.
#   $description_length - passed through to parse_text.
#
# Returns the result of write_keyword_file.  Dies if the data file or
# any of the documents cannot be opened, or if the data file cannot be
# written out.
sub build_data_file {
	my ($file_array, $data_file, $keyword_file, $file_path, $remove_file_path, $replace_url, $description_length) = @_;
	my %keyword_hash;
	my $counter = 0;

	open(my $data_fh, '>', "$file_path/$data_file")
		or die "\n<h3>Failed to open DATA file ($file_path/$data_file) for output!</h3>";

	# Autoflush the currently selected output handle (normally STDOUT).
	# This does not affect the data file handle.
	$| = 1;

	foreach my $document (@$file_array) {
		# The message used to name the data file here, although it is
		# the source document that could not be opened.
		open(my $text_fh, '<', $document)
			or die "\n<h1>Failed to open Data file in search_lib::build_data_file\n<br>File = $document</h1>";
		my @file_contents = <$text_fh>;
		my @file_stats = stat($text_fh);
		close($text_fh);

		my $file_date = gmtime($file_stats[9]);
		my $file_size = $file_stats[7] > 9999 ? int($file_stats[7] / 1024) . ' Kbytes' : $file_stats[7] . ' bytes';
		$counter++;

		# Replace the File System path with the HTTP path so that when
		# users search we don't have to do this on the fly, saving CPU,
		# memory, and time.
		$document =~ s/$remove_file_path/$replace_url/;

		my $file_string = join("\n", @file_contents);
		my $page_hash = parse_text($file_string, $description_length);

		# Select unique keywords from this file and add them to the
		# global list of site-wide keywords, while mentioning that
		# the keyword was found on this page (number NOT filename).
		# This call also trims and lowercases the keywords in place,
		# which is the form they are written in below.
		update_keyword_hash(\%keyword_hash, $$page_hash{KEYWORDS}, $counter);

		# Print to Data file, including an explicit cast to retrieve
		# the array from the array reference inside $$page_hash{KEYWORDS}
		print {$data_fh} $document, "\t", $$page_hash{TITLE}, "\t", $$page_hash{DESCRIPTION}, "\t", $file_size, "\t", $file_date, "\t", join(' ', @{$$page_hash{KEYWORDS}}), "\n";
	}

	# The data file handle used to be left open; close it so the data is
	# flushed to disk and write errors are reported.
	close($data_fh)
		or die "\n<h3>Failed to write DATA file ($file_path/$data_file): $!</h3>";

	return write_keyword_file(\%keyword_hash, $keyword_file, $file_path);
}



sub update_keyword_hash {

	# For each keyword found, we create an entry in the global keyword hash
	# "Keyword => (list of page numbers in which keyword was found)".  We use
	# a Hashed Array here to prevent keywords being duplicated in our list.
	# Eventually the Hash will be written to a file using the notation
	# "Keyword TAB (list of page numbers in which keyword was found)", example
	# ability	1, 14, 23, 24, 27, 31
	#
	# Parameters:
	#   $keyword_hash  - reference to the hash that is updated.
	#   $keyword_array - reference to an array of keywords.  NOTE: every
	#                    element is trimmed of leading/trailing spaces and
	#                    lowercased IN PLACE; build_data_file depends on
	#                    this when it writes the keywords to the data file.
	#   $counter       - the page number to record against each keyword.
	#
	# Returns nothing useful.

	my ($keyword_hash, $keyword_array, $counter) = @_;

	foreach my $keyword (@$keyword_array) {
		$keyword =~ s/^ +//;
		$keyword =~ s/ +$//;
		$keyword = lc($keyword);

		my $page_list = $$keyword_hash{$keyword};
		if (!defined($page_list) || $page_list eq '') {
			$$keyword_hash{$keyword} = "$counter";
		} else {
			# Compare whole page numbers.  A plain pattern match would
			# treat page 4 as already listed when only page 14 is.
			my $already_listed = grep { $_ eq $counter } split(/, /, $page_list);
			$$keyword_hash{$keyword} .= ", $counter" unless ($already_listed);
		}
	}
}



# Writes the site-wide keyword list to the keyword file.
#
# Each keyword is written, in sorted order, as
#   NEWLINE keyword TAB list-of-page-numbers
# so the file starts with an empty line and has no trailing newline.
# The entry for the empty keyword ('') is skipped.
#
# Parameters:
#   $keyword_hash - reference to a hash of keyword => page number list.
#   $keyword_file - name of the keyword file to create (or overwrite).
#   $file_path    - directory in which the keyword file is written.
#
# Returns 1 on success.  Dies if the file cannot be opened or written.
sub write_keyword_file {
	my ($keyword_hash, $keyword_file, $file_path) = @_;

	# The message used to say "INDEX file", copied from write_index_file.
	open(my $keyword_fh, '>', "$file_path/$keyword_file")
		or die "\n<h3>Failed to open KEYWORD file ($file_path/$keyword_file) for output!</h3>";

	foreach my $hash_key (sort keys %$keyword_hash) {
		next if ($hash_key eq '');
		print {$keyword_fh} "\n", $hash_key, "\t", $$keyword_hash{$hash_key};
	}

	# Buffered write errors only surface when the handle is closed.
	close($keyword_fh)
		or die "\n<h3>Failed to write KEYWORD file ($file_path/$keyword_file): $!</h3>";

	return 1;
}



# Extracts the title, description and keywords from the text of an
# HTML page.
#
# Parameters:
#   $file_contents      - the page, as one string.
#   $description_length - number of characters of page text to use for
#                         the description when the page has no usable
#                         <meta name="description"> tag.
#   $conjunctions_file  - optional; file of words to leave out of the
#                         keywords.  Defaults to './conjunctions.db'.
#
# Returns a reference to a hash with the keys
#   TITLE       - text of the <title> element ('' when there is none),
#   DESCRIPTION - the meta description, else the title followed by the
#                 start of the page text,
#   KEYWORDS    - reference to an array of the remaining words.  The
#                 words are not trimmed: the separating space is left
#                 at the front of a word, and empty elements can occur.
sub parse_text {
	my ($file_contents, $description_length, $conjunctions_file) = @_;
	my @keywords;
	my %page_hash;

	$file_contents = '' unless (defined($file_contents));
	$conjunctions_file = './conjunctions.db' unless (defined($conjunctions_file) && $conjunctions_file ne '');

	# Comments and scripts are removed one at a time (non-greedy) and may
	# span several lines.  A greedy match would also swallow the real
	# text between two comments that share a line.
	$file_contents =~ s/<!--.*?-->//gs;
	$file_contents =~ s/<script\b.*?<\/script\s*>//igs;

	# Only use the capture when the title was really found; otherwise $1
	# still holds whatever an earlier match left behind.  Whitespace is
	# collapsed because the title ends up in a tab/newline separated file.
	$page_hash{TITLE} = '';
	if ($file_contents =~ s/<title>(.*?)<\/title>//is) {
		$page_hash{TITLE} = $1;
		$page_hash{TITLE} =~ s/\s+/ /g;
		$page_hash{TITLE} =~ s/^ //;
		$page_hash{TITLE} =~ s/ $//;
	}

	# The description stops at the closing quote of its own attribute.
	my $meta_description = '';
	if ($file_contents =~ /<meta\s+name="description"\s+content="([^"]*)"/i) {
		$meta_description = $1;
		$meta_description =~ s/\s+/ /g;
		$page_hash{DESCRIPTION} = $meta_description if ($meta_description ne '');
	}

	$file_contents = remove_html_tags($file_contents);

	# Title and description words are searchable too.  The leading space
	# keeps the title from being glued to the last word of the page.
	$file_contents .= " " . $page_hash{TITLE} . " " . $meta_description . " ";
	$file_contents =~ s/\n/ /g;

	# Remove special characters that may occur between words and make
	# the words hard to find, for those not au-fait with their particular format.
	$file_contents =~ s/([\w]{0,1})([\.|,|!|?|(|)|-|+|=|£|$|%|&|*|\#|"|'|"|'\>|\<|\]|\[|\/|:|-])([\w]{0,1})/$1 $3/g;

	my $excerpt_length = defined($description_length) ? $description_length : 0;
	$page_hash{DESCRIPTION} ||= $page_hash{TITLE} . " " . substr($file_contents, 0, $excerpt_length);
	$file_contents = remove_conjunctions($file_contents, $conjunctions_file);

	# Having removed everything else, what we have left in the
	# string should now just be the keywords.  First we replace
	# multiple spaces/tabs with a single space and a special
	# anchor character.  Next we use the anchor character to
	# split the keywords, leaving the spaces between them.
	$file_contents =~ s/\s+/¬ /g;
	@keywords = split("¬", $file_contents);
	$page_hash{KEYWORDS} = \@keywords;
	return \%page_hash;
}



# Strips the markup from an HTML page and returns the remaining text.
#
# Parameters:
#   $file_contents - the page, as one string.
#
# Returns the text with everything up to the <body> tag, all comments,
# document.write(...) calls, tags and character entities each replaced
# by a space, and every run of whitespace collapsed to one space.
sub remove_html_tags {

	my ($file_contents) = @_;

	# First we remove everything before the <body> tag; if there is a <body> tag
	# $file_contents = $2 if ($file_contents =~ /.*(\<body.*\>)(.+)/i);
	$file_contents =~ s/.+<body/</is;

	# Comments go before the tags.  The tag pattern below stops at the
	# first ">", so for a comment holding markup ("<!-- <b>old</b> -->")
	# it would strip the tags only and leave the commented-out text.
	$file_contents =~ s/<!--.*?-->/ /gs;

	$file_contents =~ s/document\.write\([^)]*\)/ /gs;
	$file_contents =~ s/<[^>]*>/ /gs;

	# Character entities: named (&eacute;), decimal (&#169;) and
	# hexadecimal (&#xA9;).
	$file_contents =~ s/&(?:#[0-9]+|#x[0-9a-f]+|[a-z][a-z0-9]{1,31});/ /ig;

	# Left over from a comment that was never opened.
	$file_contents =~ s/-->/ /g;
	$file_contents =~ s/\s+/ /g;

	# return the input value ($file_contents) after removing the tags
	# return HTML::FormatText->new->format(parse_html($file_contents));	## Do not uncomment !!
	# This method is actually significantly SLOWER, and hard to debug!

	return $file_contents;
}



# Replaces every word listed in the conjunctions file with a space.
#
# Parameters:
#   $text_string            - the text to clean.
#   $conjunctions_file_name - file holding one word per line.
#
# Words are matched literally, as whole words, ignoring case.  If the
# file cannot be opened the text is returned unchanged (best effort).
#
# Returns the cleaned text.
sub remove_conjunctions {
	my ($text_string, $conjunctions_file_name) = @_;

	# A three-argument open with an undefined name would create a
	# temporary file, so check for a usable name first.
	return $text_string unless (defined($conjunctions_file_name) && $conjunctions_file_name ne '');

	open(my $conjunctions_fh, '<', $conjunctions_file_name) or return $text_string;

	while (my $conjunction = <$conjunctions_fh>) {
		# Strip all surrounding whitespace, not just the newline.  A
		# word list saved with CR/LF line endings would otherwise keep
		# its "\r" and never match anything.
		$conjunction =~ s/^\s+//;
		$conjunction =~ s/\s+$//;

		# A blank line would match at every word boundary.
		next if ($conjunction eq '');

		# \Q..\E: the entry is a word, not a regular expression.
		$text_string =~ s/\b\Q$conjunction\E\b/ /ig;
	}
	close($conjunctions_fh);

	return $text_string;
}



# Runs a complete search and returns the matching documents.
#
# The work is done in three steps: the lines of the keywords file that
# match the input keywords are collected, they are turned into a list
# of document numbers, and the documents with those numbers are read
# from the data file.  calculate_percent_match is then applied to every
# document that was returned.
#
# Parameters:
#   $search_files_path - directory holding the keywords and data files.
#   $keywords_file     - name of the keywords file.
#   $data_file         - name of the data file.
#   $search_operator   - search operator, passed on to the helpers.
#   $keywords_array    - reference to an array of keywords to look for.
#
# Returns whatever retrieve_documents_from_file returned.
sub find_documents {
	my ($search_files_path, $keywords_file, $data_file, $search_operator, $keywords_array) = @_;

	my $matched_lines = retrieve_keywords_from_file($search_files_path, $keywords_file, $keywords_array, $search_operator);
	my $wanted_numbers = build_document_number_list($matched_lines, $search_operator);
	my $found_documents = retrieve_documents_from_file($search_files_path, $data_file, $wanted_numbers, $keywords_array);

	calculate_percent_match($keywords_array, $_) for @$found_documents;

	return $found_documents;
}


# Collects the lines of the keywords file that match the input keywords.
#
# Parameters:
#   $search_files_path - directory holding the keywords file.
#   $keywords_file     - name of the keywords file.
#   $keywords          - reference to an array of keywords to look for.
#   $search_operator   - 'OR' (in any letter case) matches a keyword
#                        anywhere inside a word of the file; any other
#                        value matches the whole word only.
#
# Returns a reference to an array with one single-pair hash per match:
# { keyword as found in the file => its list of document numbers }.
# Dies if the keywords file cannot be opened.
sub retrieve_keywords_from_file {
	my ($search_files_path, $keywords_file, $keywords, $search_operator) = @_;
	my @keyword_array;

	# build_document_number_list ignores the case of the operator, so
	# do the same here.
	my $partial_match = (defined($search_operator) && uc($search_operator) eq 'OR');

	# One pattern per keyword, built once instead of once per line.
	# \Q..\E makes the keyword match literally: it is typed in by the
	# user and must not be run as a regular expression.  The match is
	# case-insensitive.  Empty keywords are skipped; in OR mode they
	# would match every line of the file.
	my @patterns;
	foreach my $keyword (@$keywords) {
		next unless (defined($keyword) && $keyword ne '');
		if ($partial_match) {
			push(@patterns, qr/(\S*\Q$keyword\E\S*)\t(.*)/i);
		} else {
			push(@patterns, qr/^(\Q$keyword\E)\t(.*)/i);
		}
	}

	# Open file containing our unique list of site-wide keywords.
	# Each keyword is followed by a tab, and a list of numbers.
	# The numbers identify the location of documents within the
	# data file.  Each of these documents contain this keyword.
	open(my $keywords_fh, '<', "$search_files_path/$keywords_file")
		or die "\nFailed to open Keywords file!";

	# We scroll through the keywords file, and compare each of
	# the input keywords with the current keyword from the file,
	# if there is a match, we retrieve the line from the file,
	# and add it to an array, which we will use later.
	while (my $keyword_string = <$keywords_fh>) {
		chomp($keyword_string);
		foreach my $pattern (@patterns) {
			if ($keyword_string =~ $pattern) {
				my %keyword_hash = ($1 => $2);
				push(@keyword_array, \%keyword_hash);
			}
		}
	}
	close($keywords_fh);

	return \@keyword_array;
}



# Reads the selected documents from the data file and ranks them.
#
# Parameters:
#   $search_files_path - directory holding the data file.
#   $data_file         - name of the data file.
#   $file_number_hash  - reference to a hash whose keys are the line
#                        numbers (first line = 1) of the wanted
#                        documents; a line is wanted when its value is
#                        true.
#   $keywords_array    - reference to an array of search keywords,
#                        passed to calculate_percent_match.
#
# Returns a reference to an array of document hashes with the keys URL,
# TITLE, DESCRIPTION, SIZE, DATE, KEYWORDS and PERCENT_MATCH, best match
# first.  Documents with the same score are returned in reverse file
# order.  KEYWORDS keeps the newline that ends the line.
# Dies if the data file cannot be opened.
sub retrieve_documents_from_file {
	my ($search_files_path, $data_file, $file_number_hash, $keywords_array) = @_;

	# Everything is collected in lexical variables.  The package-wide
	# hashes used previously were never emptied, so a second search in
	# the same process also returned the results of the first one.
	my @ranked_documents;

	# Here, we open the data.txt file and retrieve information
	# about all the documents that matched our search requirements.
	open(my $data_fh, '<', "$search_files_path/$data_file")
		or die "\n\nFailed to open Documents file ($search_files_path/$data_file) at search_lib::retrieve_documents_from_file";

	my $line_number = 0;
	while (my $current_document = <$data_fh>) {
		$line_number++;
		next unless ($$file_number_hash{$line_number});

		# Split on the tab so that an empty field (a page without a
		# title, say) stays in its own column instead of shifting the
		# fields that follow it.  The limit keeps the keywords whole.
		my %document_hash;
		@document_hash{qw(URL TITLE DESCRIPTION SIZE DATE KEYWORDS)} = split(/\t/, $current_document, 6);
		$document_hash{KEYWORDS} = '' unless (defined($document_hash{KEYWORDS}));

		calculate_percent_match($keywords_array, \%document_hash);
		push(@ranked_documents, [ $document_hash{PERCENT_MATCH}, $line_number, \%document_hash ]);
	}
	close($data_fh);

	# Highest score first; for equal scores the later line comes first.
	my @document_array =
		map  { $_->[2] }
		sort { $b->[0] <=> $a->[0] || $b->[1] <=> $a->[1] }
		@ranked_documents;

	return \@document_array;
}



# Combines the document number lists of the matched keywords.
#
# Parameters:
#   $keyword_array   - reference to an array of hashes, each holding
#                      keyword => "n, n, n" (a list of document numbers).
#   $search_operator - 'AND' (in any letter case) keeps only the numbers
#                      that occur in every list; any other value keeps
#                      the numbers that occur in at least one list.
#
# Returns a reference to a hash whose keys are the selected document
# numbers; every value is 1.
sub build_document_number_list {
	my ($keyword_array, $search_operator) = @_;
	my %file_number_hash;
	my $match_all = (defined($search_operator) && uc($search_operator) eq 'AND');
	my $first_list = 1;

	# Scroll through the array of keywords retrieved from the
	# keywords file, and prepare to retrieve documents from the data.txt file.
	# However, there is a precaution.  If the $search_operator parameter
	# is 'AND', we need to ensure that we only retrieve those documents
	# in which all the keywords appear, i.e. the number appears on
	# the list of each of the keywords we retrieved from the file.
	foreach my $keyword_hash (@$keyword_array) {
		my %file_hash;
		foreach my $number_list (values(%$keyword_hash)) {
			next unless (defined($number_list));
			foreach my $item (split(/,/, $number_list)) {
				$item =~ s/^\s+//;
				$item =~ s/\s+$//;
				$file_hash{$item} = 1 if ($item ne '');
			}
		}

		if ($first_list) {
			%file_number_hash = %file_hash;
			$first_list = 0;
		} elsif ($match_all) {
			# keys() returns a copy of the key list, so deleting while
			# we walk over it is safe.
			foreach my $file_number (keys(%file_number_hash)) {
				delete($file_number_hash{$file_number}) unless (exists($file_hash{$file_number}));
			}
		} else {
			# Add the whole list in one step.  This used to happen
			# inside a loop over the numbers collected so far, so it
			# was skipped when nothing had been collected yet.
			@file_number_hash{keys(%file_hash)} = values(%file_hash);
		}
	}
	return \%file_number_hash;
}



# Scores how well a document matches the search keywords.
#
# For every keyword the number of times it occurs in the document's
# keywords is weighted against the number of word characters in those
# keywords.  The best keyword score is then scaled and capped, giving a
# value between 40 and 99.
#
# Parameters:
#   $input_keywords - reference to an array of search keywords.  They
#                     are matched literally, ignoring case, anywhere
#                     inside the text.
#   $page_hash      - reference to the document hash.  Its KEYWORDS
#                     entry may be a string or a reference to an array
#                     of words.
#
# Returns the score formatted with sprintf("%2d"), and also stores it in
# $$page_hash{PERCENT_MATCH}.
sub calculate_percent_match {
	my ($input_keywords, $page_hash) = @_;

	my $page_keywords = $$page_hash{KEYWORDS};
	$page_keywords = '' unless (defined($page_keywords));
	my $keywords_string = (ref($page_keywords) eq 'ARRAY') ? join(' ', @$page_keywords) : $page_keywords;

	# Number of word characters in the document's keywords.
	my $count = () = $keywords_string =~ /\w/g;
	my $max = 0;

	# A document without any keywords scores zero matches.  Dividing by
	# its character count would stop the program with a division by zero.
	if ($count > 0) {
		foreach my $keyword (@$input_keywords) {
			next unless (defined($keyword) && $keyword ne '');

			# \Q..\E: the keyword comes from the user and is text, not a
			# pattern ("." must not match every character).
			my $match = () = $keywords_string =~ /\Q$keyword\E/ig;
			my $percent = (($match * 3.7) / $count) * 100;
			$max = $percent if ($percent > $max);
		}
	}

	$max = $max * 100 if ($max <= 0.999);
	$max = $max + 40;
	$max = 99 if ($max >= 100);
	# $max = ($max <= 1 ) ? 1 : $max;
	$max = sprintf("%2d", $max);
	$$page_hash{PERCENT_MATCH} = $max;
	return $max;
}



# Breaks up the search string, and gives us a unique list of keywords
# to be processed by the search routines.
#
# The words "and" and "or" (in any letter case) are dropped when they
# stand between other words, surrounding whitespace is removed and runs
# of whitespace are reduced to one space.  In case the $search_option
# parameter specifies an Exact match, there is no splitting of the
# search string.
# Exact matching has yet to be implemented as of Version 1.0
#
# Parameters:
#   $search_string - the text typed in by the user.
#   $search_option - contains "exact" (any case) for an Exact match.
#
# Returns a reference to an array of the distinct keywords, in the order
# in which they first occur.  An empty search gives an empty array.
sub resolve_keywords {
	my ($search_string, $search_option) = @_;
	$search_string = '' unless (defined($search_string));

	# "and"/"or" as whole words.  The old pattern used a character class
	# ([and|or]), which removed single letters such as the "a" in
	# "vitamin a tablets" and left the words themselves in place.  The
	# look-ahead leaves the following space available, so that two such
	# words in a row are both removed.
	$search_string =~ s/\s(?:and|or)(?=\s)/ /ig;
	$search_string =~ s/^\s+//;
	$search_string =~ s/\s+$//;
	$search_string =~ s/\s+/ /g;

	return [] if ($search_string eq '');

	my $exact_match = (defined($search_option) && $search_option =~ /exact/i);
	my @candidates = $exact_match ? ($search_string) : split(' ', $search_string);

	# Drop duplicates but keep the order of the words.
	my %keywords_hash;
	my @keywords_array;
	foreach my $keyword (@candidates) {
		push(@keywords_array, $keyword) unless ($keywords_hash{$keyword}++);
	}
	return \@keywords_array;
}



# Returns $text with the characters that are special in HTML replaced
# by their entities.
sub _spool_escape_html {
	my ($text) = @_;
	$text = '' unless (defined($text));
	$text =~ s/&/&amp;/g;
	$text =~ s/</&lt;/g;
	$text =~ s/>/&gt;/g;
	$text =~ s/"/&quot;/g;
	return $text;
}

# Returns $text percent-encoded for use as a URL query value.
# NOTE(review): written for byte strings; characters above 255 would
# need to be encoded to bytes first.
sub _spool_escape_url {
	my ($text) = @_;
	$text = '' unless (defined($text));
	$text =~ s/([^A-Za-z0-9_.~-])/sprintf('%%%02X', ord($1))/eg;
	return $text;
}

sub spool_results {

	# This could use the template library routines, and a template file!
	# Inputs would include Start and Stop points, as well as template name.
	# The template library is not being used here because this search
	# engine may be released as Open Source (GNU), while the template
	# library and engine, are for now proprietary.

	# This subroutine simply dumps the records retrieved to standard output
	# It takes all its input from the invoker, and simply merges the data
	# from the Array of hashes with the $section_format, taking care to
	# provide blocks of records based on the $search_increment.
	#
	# Parameters:
	#   $documents_list         - reference to an array of document hashes.
	#                             COUNTER (position, from 1) is stored in
	#                             each hash that is visited.
	#   $search_string_array    - reference to an array of search keywords.
	#   $search_header          - header template; the <<cgi_...>> place
	#                             holders are filled in.
	#   $search_section         - template printed once per document;
	#                             <<NAME>> is replaced by the document's
	#                             NAME field.
	#   $base_url               - URL used for the page links.
	#   $search_start           - number of the first document to show;
	#                             values below 1 are treated as 1.
	#   $search_increment       - documents per page; when it is not a
	#                             positive number everything goes on one
	#                             page.
	#   $search_target          - optional TARGET for the page links.
	#   $search_option          - search option shown in the header.
	#   $max_documents          - stop after this many documents; no limit
	#                             when it is not a positive number.
	#   $description_field_name - optional; name of the document field in
	#                             which the keywords are shown in bold.
	#
	# Returns the result of the final print.

	my ($documents_list, $search_string_array, $search_header, $search_section, $base_url, $search_start, $search_increment, $search_target, $search_option, $max_documents, $description_field_name) = @_;
	my $document_count = scalar(@$documents_list);

	# An increment of zero would stop the program at the modulus below.
	$search_start = 1 unless ($search_start && $search_start > 0);
	$search_increment = ($document_count || 1) unless ($search_increment && $search_increment > 0);

	my $search_stop = $search_start + ($search_increment - 1);
	$search_stop = $document_count if ($document_count <= $search_stop);
	my $search_target_out = $search_target ? qq{ TARGET="$search_target" } : '';

	# The search text comes from the user.  It is escaped before it is
	# written into the page or into a link, so that it cannot add markup
	# of its own.
	my $search_string = "@$search_string_array";
	my $search_string_html = _spool_escape_html($search_string);
	my $search_string_url = _spool_escape_url($search_string);
	my $search_option_html = _spool_escape_html($search_option);

	$search_header =~ s/<<cgi_search_string>>/$search_string_html/ig;
	$search_header =~ s/<<cgi_record_count>>/$document_count/ig;
	$search_header =~ s/<<cgi_search_start>>/$search_start/ig;
	$search_header =~ s/<<cgi_search_increment>>/$search_increment/ig;
	$search_header =~ s/<<cgi_search_stop>>/$search_stop/ig;
	$search_header =~ s/<<cgi_search_option>>/$search_option_html/ig;
	print "\n$search_header";

	# One pattern for all keywords, longest first and matched literally.
	# Bolding the keywords one after the other let a later keyword match
	# inside the <B> tags inserted for an earlier one.
	my $highlight_pattern;
	if ($description_field_name) {
		my @terms = map { quotemeta($_) }
			sort { length($b) <=> length($a) }
			grep { defined($_) && $_ ne '' } @$search_string_array;
		$highlight_pattern = join('|', @terms) if (@terms);
	}

	my $pages = 'Pages ';
	my $page_count = 0;
	my $counter = 0;

	foreach my $document (@$documents_list) {
		$counter++;
		$$document{COUNTER} = $counter;

		if ($counter >= $search_start && $counter <= $search_stop) {
			# If the Description field has been defined as one of our input
			# parameters, it means we know the name of the field in which the
			# keywords will be found.  We therefore replace every keyword
			# found in the description field, with a bolded replacement of same.
			# This is done on a copy, so that the caller's documents are
			# not bolded again by a later call.
			my %fields = %$document;
			if (defined($highlight_pattern) && defined($fields{$description_field_name})) {
				$fields{$description_field_name} =~ s/($highlight_pattern)/<B>$1<\/B>/ig;
			}

			my $section = $search_section;
			$section =~ s/<<([^>]+)>>/defined($fields{$1}) ? $fields{$1} : ''/eg;
			print "\n$section";
		}

		# A new page starts at documents 1, 1 + increment, ...  Testing
		# "counter % increment == 1" never succeeded for an increment of 1.
		if ((($counter - 1) % $search_increment) == 0) {
			$page_count += 1;
			my $page_url;
			if ($counter == $search_start) { $page_url = $page_count; }
			else {
				$page_url = qq|<A HREF="$base_url?&cgi_search_start=${counter}&cgi_search_increment=$search_increment&cgi_search_string=$search_string_url" $search_target_out> $page_count </a>|;
			}
			$pages .= ($pages eq "Pages ") ? $page_url : '&nbsp;|&nbsp;' . $page_url;
		}
		last if ($max_documents && $max_documents > 0 && $counter >= $max_documents);
	}
	print "\n<P>$pages ";
}



1;
