#Copyright (c)2002-2003 Good News Publishers. This source code may not be published or used in any medium without
#express written consent from Good News Publishers: webmaster@gnpcb.org. However, comments are welcome.
#requires $spellfile, $specialsearchfile, and $abbrevsfile to be set to the files containing known spelling
#errors, special cases in which to show messages, and book abbreviations. All three are available from
#http://www.gnpcb.org/esv/share/about/
function parse_input($q) { #turn a raw query into the two arrays built by objectify_q
    #passing any second argument marks the query as a word search, in which case verse detection is skipped
    $wordsearch = (func_num_args() != 1);
    $cleaned = correct_known_misspells(normalize_query($q));
    identify_special_searches($cleaned); #called for its side effect only; the return value is not used
    list($markedq, $bookrefs) = make_book_refs($cleaned);
    $verserefs = "";
    if (!$wordsearch) {
        list($markedq, $verserefs) = make_verse_refs($markedq);
    }
    #print "$markedq\n"; print_r($bookrefs); print_r($verserefs); #exit;
    return objectify_q($markedq, $bookrefs, $verserefs);
}
function normalize_query($q) { #lowercase the query and reduce it to a predictable set of characters
    #swap the alternate separators for the standard ones first (1.2_3; 4 -> 1:2-3, 4), then lowercase
    $q = strtolower(strtr(trim($q), ".;_", ":,-"));
    #the rules run in this order, each one working on the output of the one before
    $patterns = array(
        "/(\d)\"(\d)/", #12"5 -> 12:5; a surprisingly common typo
        "/([^\d\s\-:,])(\d)/", #put a space in front of a number
        "/(\d)[ab]([\-:, ]|$)/", #12:1b -> 12:1; note that a search for 23a finds only 23
        "/(\d)([^\d\s\-:,])/", #put a space behind a number
        "/([a-z])-/", #so that james-1peter works
        "/([^\d\s\-:,])[\-:,]/", #punctuation that is not attached to a number becomes a space
        "/[^\w\s\-:,'\"]/", #every other character is dropped
        "/\s{2,}/", #collapse runs of whitespace
    );
    $replacements = array(
        "$1:$2",
        "$1 $2",
        "$1$2",
        "$1 $2",
        "$1 -",
        "$1 ",
        "",
        " ",
    );
    return preg_replace($patterns, $replacements, $q);
}
function make_book_refs($q) { #replace every recognized book name in $q with a #<LETTER# marker
    #returns array($q, $bookrefs): the marked-up query, plus an array keyed by marker letter in which each
    #entry holds "book" (the key from the abbreviation list) and "q" (the text that was actually matched)
    $abbreviations = get_simple_book_abbrevs(); #book => list of abbreviations; helper defined elsewhere
    $letters = get_letters(); #marker names in the order they are handed out; helper defined elsewhere
    $letter = "A";
    $bookrefs = array();
    foreach ($abbreviations as $book => $abbrevs) {
        foreach ($abbrevs as $abbrev) {
            if (strlen($abbrev) == 0) continue; #an empty abbreviation would match at every word boundary, forever
            #the abbreviation is interpolated into the pattern unescaped, so group it: the word boundaries
            #then apply to the whole abbreviation, and the loop test and the replacement agree on what matches
            $pattern = "/\b(?:$abbrev)\b/";
            while (preg_match($pattern, $q, $matches)) { #one match at a time; word boundaries so "1 time" != 1 tim
                $marked = preg_replace($pattern, "#<$letter#", $q, 1);
                if ($marked === null || $marked === $q) break; #nothing was replaced, so looping again would never end
                $q = $marked;
                $bookrefs[$letter] = array(
                    "book" => $book,
                    "q" => $matches[0], #the original match
                );
                $count = count($bookrefs);
                if ($count >= count($letters)) { #if over 26 books, we need to extend the array
                    $letters = extend_letters($letters);
                }
                $letter = $letters[$count]; #get what the next letter will be
            }
        }
    }
    return array($q, $bookrefs); #the marked-up string and the array describing each marker
}
#print "--nrmlu:$normalizedunit;";
#print "---ref:$ref;";
#print "---normalizedref:$normalizedref;";
$count = count(array_keys($verserefs));
if ($count >= count($letters)) { #if over 26 books, we need to extend the array
$letters = extend_letters($letters);
}
$letter = $letters[$count]; #get what the next letter will be
if (strlen($normalizedref) > 0) {
$verserefs[$letter] = array();
$verseunits = explode(",", $normalizedref); #treat each comma-separated unit individually
foreach ($verseunits as $verseunit) {
if (!preg_match("/\d/", $verseunit)) continue; #we want 0 to stay in the loop
list($chapter, $explicitch, $objverseunit) = objectify_reference($chapter, $explicitch, $verseunit);
if ($objverseunit) {
array_push($verserefs[$letter], $objverseunit);
}
else { #nothing
}
}
$verseunits = array(); #reset it for the next time
if (!$verserefs[$letter]) { #if there weren't any objverseunits, we don't need it
unset($verserefs[$letter]);
continue;
}
$normalizedunit = preg_replace("/(?:^|[^\w,'])$ref(?:[^\w,']|$)/", "#>$letter#", $normalizedunit, 1); #replace with a #->-letter-#
#don't use \b because it doesn't work if you enter "Psalm 1,"
array_push($verserefs[$letter], $ref); #always the last so we can pop it later
}
else {
#nothing; it's a number sequence but not a ref
}
}
$q = preg_replace("/$unit/", "$normalizedunit", $q, 1);
}
$q = preg_replace(array("/# +/", "/ +#/"), array("#", "#"), $q); #get rid of extra spaces
#print_r($verserefs);
#$q = strtr($q, "<>", "[]"); #debug
#print "\n$q";
#exit;
return array($q, $verserefs);
}
function normalize_query_unit($unit) { #rewrite looser, wordier phrasings into plain chapter:verse notation
    #each rule is applied in turn to the result of the one before it
    $find = array();
    $swap = array();
    $find[] = "/(\d+?) *?- *?end/"; $swap[] = "$1-999"; #4:1-end = 4:1-999; 4-end = 4-999
    $find[] = "/ch(?:apter|ap)?s? *?(\d+)/"; $swap[] = "$1:"; #gen ch 6 -> gen 6: ;; gen chap 6 -> gen 6
    $find[] = "/(\d) *?v(?:erse|v|er)?s? *?(\d)/"; $swap[] = "$1:$2"; #gen 6 vs 8 -> gen 6:8; gen 6 vv 8-10 -> gen 6:8-10
    $find[] = "/v(?:erse|v|er)?s? *?(\d)/"; $swap[] = "$1"; #gen ch 6 vs 8 (by now gen 6: vs 8) loses the "vs"
    $find[] = "/(\d) *?(?:to|through|thru) *?(\d)/"; $swap[] = "$1-$2"; #gen 6 vs 7 to 9 -> gen 6:7-9
    $find[] = "/(\d):? *?and *?(\d)/"; $swap[] = "$1,$2"; #gen 6 vs 7 and 9 -> gen 6:7,9
    $find[] = "/(\d) *?f{1,2}\b/"; $swap[] = "$1-999"; #gen 6:8ff -> gen 6:8-999; gen 6ff -> gen 6-999
    $find[] = "/:{2,}/"; $swap[] = ":"; #collapse repeated colons
    $find[] = "/^:$/"; $swap[] = ""; #"john :" => john
    $normalized = preg_replace($find, $swap, $unit);
    return $normalized;
}
function normalize_reference($ref) { #bring a reference into strict chapter:verse form, or return "" if it cannot be one
    #the caller has already trimmed $ref; the rules below run in order, each on the previous result
    $find = array();
    $swap = array();
    $find[] = "/[^\d ,\-:]/"; $swap[] = ""; #drop anything unexpected that slipped through
    $find[] = "/[^\d:\-]+$/"; $swap[] = ""; #trailing clutter; no leading version needed since $ref starts with a digit or colon
    $find[] = "/ *([\-:,]) */"; $swap[] = "$1"; #no spaces around punctuation
    $find[] = "/(\d) (\d+? ?(?:[^:]|$))/"; $swap[] = "$1.$2"; #rom 8 28 -> rom 8:28; a dot for now so later rules can tell it apart
    $find[] = "/(\d) (\d+? ?:)/"; $swap[] = "$1,$2"; #rom 8 2:7 -> rom 8,2:7
    $find[] = "/(:\d+?)[ .](\d)/"; $swap[] = "$1,$2"; #rom 8:28 7 -> rom 8:28,7
    $find[] = "/(:\d+?-\d+?)[ .](\d)/"; $swap[] = "$1,$2"; #rom 8:1-10 11-20 -> 8:1-10,11-20
    $find[] = "/,{2,}/"; $swap[] = ","; #doubled punctuation collapses to one mark
    $find[] = "/[.:]{2,}/"; $swap[] = ":";
    $find[] = "/-{2,}/"; $swap[] = "-";
    $find[] = "/\s/"; $swap[] = "";
    $ref = preg_replace($find, $swap, $ref);
    $impossible = (strstr($ref, "000") || preg_match("/\d{4,}/", $ref)); #these will never be references
    if ($impossible) {
        return "";
    }
    return strtr($ref, ".", ":"); #whatever dots the rules above left behind become colons
}
function cleanup_reference_unit($unit) { #fill in or strip the loose ends of a single reference unit
    #rules run in order, each on the previous result; 999 and 949 are the placeholder values used for open ends
    $find = array();
    $swap = array();
    $find[] = "/(^|\D):/"; $swap[] = "$1"; #a colon with no number in front of it is dropped
    $find[] = "/:-/"; $swap[] = ":1-"; #2:-5 -> 2:1-5
    $find[] = "/(-\d+?):$/"; $swap[] = "$1:999"; #2:3-4: -> 2:3-4:999
    #a rule turning 4:7: into 4:7 used to sit here: "/(:\d+?):(\D|$)/" => "$1$2"
    $find[] = "/:(\D)/"; $swap[] = ":1-999$1"; #4: -> 4:1-999; it was once (\D|$) but that made eccl. 2:21. fail
    $find[] = "/(\d+?)-(\D|$)/"; $swap[] = "$1-949"; #4:2- -> 4:2-949; 949 marks it as open-ended so mark 16:1 - luke 2:5 can work
    $find[] = "/[^\d]+$/"; $swap[] = ""; #final tidy-up; this is also what removes a trailing colon
    $cleaned = preg_replace($find, $swap, $unit);
    return $cleaned;
}
function objectify_reference($chapter, $explicitch, $unit) { #turn one reference unit into a begin/end array
    #$chapter:    chapter carried over from the previous unit; used when this unit names no chapter of its own
    #$explicitch: true once a chapter has been given explicitly, so that bare numbers are read as verses
    #$unit:       the unit text, eg "2:3-4:5", "2:4-5", "2:4", "3-4:5", "4-5" or "4"
    #returns array($chapter, $explicitch, $ref) with $chapter and $explicitch updated for the next unit;
    #$ref holds beginc/beginv/endc/endv (plus isrange => 1 when it covers more than one verse), or is an
    #empty array when the unit could not be understood
    $return = array();
    $original = $unit;
    $unit = cleanup_reference_unit($unit);
    #print "cleanup:$unit\n";
    if (substr_count($unit, "-") > 1) { #if there's more than one range, only use the outside values
        preg_match("/(.+?)-+?(.+)-(.+)/", $unit, $matches); #we want $2 to be greedy, so we don't make it lazy
        if (strstr($matches[2], ":") && !strstr($matches[3], ":")) { #if a ch is indicated in the discarded match, use that ch
            preg_match_all("/(\d+):/", $matches[2], $temp); #get all the chapters (eg, 23:6-24:7)
            $temp = array_pop($temp[1]); #use the last indicated chapter (eg, 24)
            $matches[3] = "$temp:$matches[3]"; #change the last match to include the chapter we just found
            unset ($temp); #cleanup
        }
        $unit = "$matches[1]-$matches[3]"; #turn it into a simple pairing
        save_error("discarded-reference", $matches[2]);
        unset ($matches);
        #print "u:$unit\n";
    }
    if (preg_match("/^(\d+?):(\d+?)-(\d+?):(\d+?)$/", $unit, $ranges)) { #2:3-4:5
        $case = "dd"; #d = double, or "1:2"; s = single, or "3"; only used by the debug print at the bottom
        array_shift($ranges); #ranges[0] includes the whole unit, which we don't care about
        list($beginc, $beginv, $endc, $endv) = $ranges;
        if (($endc < $beginc) || ($endv < $beginv && $endc == $beginc)) { #5:6-4:7; 2:4-2:3; invert them
            list($endc, $endv, $beginc, $beginv) = $ranges;
        }
        $isrange = 1; #is talking about more than one verse
        $explicitch = 1;
    }
    elseif (preg_match("/^(\d+?):(\d+?)-(\d+)$/", $unit, $ranges)) { #2:4-5
        $case = "ds";
        $isrange = 1;
        list($beginc, $beginv, $endv) = array_slice($ranges, 1, 3); #ignore ranges[0]
        $endc = $beginc;
        if ($endv < $beginv || $endc < $beginc) { #if it looks backwards
            list($beginc, $beginv, $endc, $endv, $isrange) =
                check_reference_range_error($beginc, $beginv, $endc, $endv, $isrange, $original); #original for use in error msg
        }
        $explicitch = 1;
    }
    elseif (preg_match("/^(\d+?):(\d+)$/", $unit, $ranges)) { #2:4
        $case = "d";
        list($beginc, $beginv) = array_slice($ranges, 1, 2); #again, ignore ranges[0]
        $isrange = 0;
        $explicitch = 1;
    }
    elseif (preg_match("/^(\d+?)-(\d+):(\d+)$/", $unit, $ranges)) { #3-4:5
        $case = "sd";
        $isrange = 1;
        if ($explicitch) { #(2:)3-4:5; the 2 is implied from a prev unit (2:1,3-4:5)
            list($beginc, $beginv, $endc, $endv) = array($chapter, $ranges[1], $ranges[2], $ranges[3]);
        }
        else { #3:1-4:5; otherwise we assume the single is a chapter ref
            list($beginc, $beginv, $endc, $endv) = array($ranges[1], 1, $ranges[2], $ranges[3]);
        }
        if (($endc < $beginc) || ($endv < $beginv && $endc <= $beginc)) { #oops
            list($beginc, $beginv, $endc, $endv) = double_reference_range_error($beginc, $beginv, $endc, $endv, $original);
            $isrange = 0;
        }
        $explicitch = 1;
    }
    elseif (preg_match("/^(\d+)-(\d+)$/", $unit, $ranges)) { #4-5
        $case = "ss";
        $isrange = 1;
        if ($explicitch) { #(1:)4-5
            list($beginc, $beginv, $endc, $endv) = array($chapter, $ranges[1], $chapter, $ranges[2]);
        }
        else { #4:1-5:999; no explicitch
            list($beginc, $beginv, $endc, $endv) = array($ranges[1], 1, $ranges[2], 999);
        }
        if (($endv < $beginv && $endc <= $beginc) || $endc < $beginc) {
            list($beginc, $beginv, $endc, $endv, $isrange) =
                check_reference_range_error($beginc, $beginv, $endc, $endv, $isrange, $original);
        }
    }
    elseif (preg_match("/^(\d+)$/", $unit, $ranges)) { #4
        $case = "s";
        if ($explicitch) { #(1:)4
            $isrange = 0;
            list($beginc, $beginv) = array($chapter, $ranges[1]);
        }
        else { #4:1-999; otherwise we assume they want the whole chapter
            $isrange = 1;
            list($beginc, $beginv, $endc, $endv) = array($ranges[1], 1, $ranges[1], 999);
        }
    }
    else { #we don't know what to do with it
        save_error("discarded-reference", $original);
        return array($chapter, $explicitch, array());
    }
    if ($beginc < 1) $beginc = 1; #just in case anything was 0; could conceivably lead to a false isrange (1:0-1:1)...
    if ($beginv < 1) $beginv = 1; #but nothing bad happens
    if ($isrange) {
        if ($endc < 1) $endc = 1;
        if ($endv < 1) $endv = 1;
        $return = array(
            "isrange" => 1,
            "beginc" => $beginc, "beginv" => $beginv,
            "endc" => $endc, "endv" => $endv,
        );
        $chapter = $endc; #the next unit continues from where this range ended
    }
    else { #a single verse: the end is the same as the beginning and no isrange key is set
        $return = array(
            "beginc" => $beginc, "beginv" => $beginv,
            "endc" => $beginc, "endv" => $beginv,
        );
        $chapter = $beginc;
    }
    #print "$case-$explicitch-$chapter\n";
    return array($chapter, $explicitch, $return);
}
function check_reference_range_error($beginc, $beginv, $endc, $endv, $isrange, $unit) { #repair a range whose end precedes its beginning
    #$beginc/$beginv/$endc/$endv: the chapter and verse numbers as parsed; $isrange: the caller's current flag
    #$unit: the text as originally entered, passed to save_error for the message
    #returns array($beginc, $beginv, $endc, $endv, $isrange) after one of three outcomes:
    #  the end was a filled-in placeholder in an earlier chapter: collapse to the first verse and record an error
    #  the end number could be a later chapter: treat it as one, running to verse 999, with no error
    #  anything else: collapse to the first verse and record an error
    if ($endv > 900 && $endc < $beginc) { #5:6--4 became 5:6-4:999; display one verse and set error
        $endc = $beginc;
        $endv = $beginv;
        $isrange = 0;
        save_error("end-before-begin", $unit);
    }
    elseif ($endv >= $endc) { #(2:)4-3 -> 2:4-3:999; assume they meant to use the chapter; no error
        $endc = $endv;
        $endv = 999;
        #use the isrange we were sent, ie, 1
    }
    else { #(2:)4-1 -> ??? 2:4-2:4; we don't even want to try to guess, because we'd probably guess wrong
        $endc = $beginc;
        $endv = $beginv;
        $isrange = 0; #display one verse and set error
        save_error("end-before-begin", $unit);
    }
    return array($beginc, $beginv, $endc, $endv, $isrange);
}
function double_reference_range_error($beginc, $beginv, $endc, $endv, $unit) { #a backwards range we refuse to second-guess
    #record the problem, then collapse the range onto its first verse; $endc and $endv are deliberately dropped
    save_error("end-before-begin", $unit);
    $firstverseonly = array($beginc, $beginv, $beginc, $beginv);
    return $firstverseonly;
}
function objectify_q($q, $bookrefs, $verserefs) { #make the actual objectq and objectqdesc
    #$q:         the marked-up query, in which #<LETTER# stands for a book and #>LETTER# for a verse reference
    #$bookrefs:  marker letter => array describing the book
    #$verserefs: marker letter => list of parsed reference units, with the original reference text as the last element
    #returns array($objectq, $objectqdesc): $objectq lists the query's parts in order (book arrays, ref arrays
    #with "q" and "ref", word arrays with "q" and "word"); $objectqdesc counts the parts under "book", "ref", "word"
    $objectq = array();
    $objectqdesc = array();
    $qarray = preg_split("/#+/", $q);
    if (strstr($q, '"')) $alreadyquotes = 0; #only set when the query has quotes at all; counts quotes seen inside markers
    if (preg_match("/#<[A-Z]+?#-#<[A-Z]+?#/", $q)) $checkbookspan = 1; #if someone types heb - james
    elseif (preg_match("/#<[A-Z]+?#-$/", $q)) $checkbookspan = 1; #if someone types heb -
    else $checkbookspan = 0;
    foreach ($qarray as $qunit) {
        if (!$qunit || !preg_match("/[^\s\"]/", $qunit)) continue; #nothing but spaces and quotes
        $marker = $qunit[0]; #square brackets: the curly-brace offset form no longer parses in PHP 8
        if ($marker == "<") { #we already suspect it's a book because the first char is a <
            preg_match("/^<([A-Z]+)/", $qunit, $matches); #put the letter into matches[1]
            array_push($objectq, $bookrefs[$matches[1]]);
            #add one explicitly: assigning the result of a post-increment stores the old value back
            $objectqdesc["book"] = (isset($objectqdesc["book"])) ? $objectqdesc["book"] + 1 : 1;
            if (isset($alreadyquotes) && strstr($qunit, '"')) $alreadyquotes += substr_count($qunit, '"');
        }
        elseif ($marker == ">") { #if it's a ref
            preg_match("/^>([A-Z]+)/", $qunit, $matches); #put the letter into matches[1]
            $refq = array_pop($verserefs[$matches[1]]); #the ref is always at the end
            array_push($objectq, array(
                "q" => $refq,
                "ref" => $verserefs[$matches[1]], #what is left after the pop: the parsed units for this marker
            ));
            $objectqdesc["ref"] = (isset($objectqdesc["ref"])) ? $objectqdesc["ref"] + 1 : 1;
            if (isset($alreadyquotes) && strstr($qunit, '"')) $alreadyquotes += substr_count($qunit, '"');
        }
        else { #otherwise it could be a sequence of words
            if (isset($alreadyquotes) && strstr($qunit, '"') && $alreadyquotes % 2 == 0) { #do phrase matching
                #if there's a mismatch of quotes, eg "Mark 7 "hello there", don't do phrase matching
                $matches = preg_split('/"/', $qunit, -1, PREG_SPLIT_NO_EMPTY);
            }
            else $matches = explode(" ", $qunit);
            foreach($matches as $match) {
                if (strlen($match) == 0 || $match == " ") continue;
                array_push($objectq, array(
                    "q" => $match,
                    "word" => 1,
                ));
            }
            #note: this counts every piece of the split, including any blank ones skipped just above
            $objectqdesc["word"] = (isset($objectqdesc["word"])) ? $objectqdesc["word"] + count($matches) : count($matches);
        }
    }
    if ($checkbookspan) list($objectq, $objectqdesc) = check_book_span($objectq, $objectqdesc);
    return array($objectq, $objectqdesc);
}
function check_book_span($objectq, $objectqdesc) { #a book range entered without chapter numbers (mk - jn) is taken to mean 1-end
    #a "-" that directly follows a book stops being a word and becomes an open-ended reference instead
    $openspan = array(
        "isrange" => 1,
        "beginc" => 1,
        "beginv" => 1,
        "endc" => 949, #949 indicates we should look for the end of the range in the next book
        "endv" => 949,
    );
    foreach (array_keys($objectq) as $i) {
        $isdash = ($objectq[$i]["q"] == "-");
        if (!$isdash || $i == 0 || !isset($objectq[$i-1]["book"])) {
            continue;
        }
        $objectq[$i]["ref"][0] = $openspan;
        unset($objectq[$i]["word"]); #this key gets looked for later
        $objectqdesc["word"]--;
    }
    if ($objectqdesc["word"] == 0) { #no words left at the end, so the count itself goes too
        unset($objectqdesc["word"]);
    }
    return array($objectq, $objectqdesc);
}
function correct_known_misspells($q) { #silently fix the frequently misspelled words listed in the spelling file
    foreach (get_known_misspells_from_file() as $wrong => $right) {
        $pattern = "/\b$wrong\b/"; #always check on boundaries
        if (!preg_match($pattern, $q)) {
            continue;
        }
        $q = preg_replace($pattern, "$right", $q);
    }
    return $q;
}
function get_known_misspells_from_file() { #read the misspelling => correction table from the file named by $spellfile
    #file format: misspell\tcorrected, one pair per line; anything after a further \t is ignored, so comments can go there
    #returns an array keyed by misspelling; stops the script with "No spelling file" if the file cannot be opened
    global $spellfile;
    $file = fopen($spellfile, "r") or die ("No spelling file");
    $misspells = array();
    while (($line = fgets($file)) !== false) { #no length limit, so a long line is not split into two entries
        $words = explode("\t", rtrim($line));
        #skip blank lines and lines lacking either field; test for "" explicitly so a correction of "0" is kept
        if ($words[0] === "" || !isset($words[1]) || $words[1] === "") continue;
        $misspells[$words[0]] = $words[1];
    }
    fclose($file); #the handle used to be left open
    return $misspells;
}
function identify_special_searches($q) { #give special error messages when certain queries are entered
    #the file named by $specialsearchfile holds trigger\tmessage-key pairs, one per line; the trigger is used as a
    #pattern anchored at the start of a word. For every line whose trigger occurs in $q the count for its key goes
    #up by one, and if anything matched the counts are stored in the global $specialsearches.
    #when nothing matches, $specialsearches is left as it was. Always returns 1.
    global $specialsearchfile;
    $specials = array();
    $file = fopen($specialsearchfile, "r") or die ("No special search file");
    while (($line = fgets($file)) !== false) {
        $words = explode("\t", rtrim($line));
        #a blank or one-field line must be skipped: an empty trigger gives the pattern /\b/, which matches any query
        if ($words[0] === "" || !isset($words[1])) continue;
        $trigger = $words[0];
        if (!preg_match("/\b$trigger/", $q)) continue;
        $key = $words[1];
        $specials[$key] = isset($specials[$key]) ? $specials[$key] + 1 : 1;
    }
    fclose($file); #the handle used to be left open
    if ($specials) {
        global $specialsearches;
        $specialsearches = $specials;
    }
    return 1;
}
#Also requires save_error(), which the code above calls with an error type and the offending text.
#Here is a blank function.
# function save_error($arg1, $arg2) {
# return 1;
# }
?>