{* Copyright 1999-2005 The Apache Software Foundation or its licensors, as * applicable. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. *} library mod_spelling; {$i define.inc} uses SysUtils, Classes, httpd, apr; var speling_module: module; {$ifdef Unix} cvar; public; {$endif} default_module_ptr: Pmodule; const MODULE_NAME = 'mod_speling.so'; {******************************************************************* * Free Pascal only supports exporting variables on Windows *******************************************************************} {$ifdef WINDOWS} exports speling_module name 'spelling_module'; {$endif} {#include "apr.h" #include "apr_file_io.h" #include "apr_strings.h" #include "apr_lib.h" #define APR_WANT_STRFUNC #include "apr_want.h" #define WANT_BASENAME_MATCH #include "httpd.h" #include "http_core.h" #include "http_config.h" #include "http_request.h" #include "http_log.h" } {* mod_speling.c - by Alexei Kosut <akosut@organic.com> June, 1996 * * Translated to pascal by Felipe Monteiro de Carvalho - July, 2006 * * This module is transparent, and simple. It attempts to correct * misspellings of URLs that users might have entered, namely by checking * capitalizations. If it finds a match, it sends a redirect. * * 08-Aug-1997 <Martin.Kraemer@Mch.SNI.De> * o Upgraded module interface to apache_1.3a2-dev API (more NULL's in * speling_module). * o Integrated tcsh's "spelling correction" routine which allows one * misspelling (character insertion/omission/typo/transposition). * Rewrote it to ignore case as well. This ought to catch the majority * of misspelled requests. * o Commented out the second pass where files' suffixes are stripped. * Given the better hit rate of the first pass, this rather ugly * (request index.html, receive index.db ?!?!) solution can be * omitted. * o wrote a "kind of" html page for mod_speling * * Activate it with "CheckSpelling On" } type spconfig = record enabled: Integer; end; Pspconfig = ^spconfig; { * Create a configuration specific to this module for a server or directory * location, and fill it with the default settings. * * The API says that in the absence of a merge function, the record for the * closest ancestor is used exclusively. That's what we want, so we don't * bother to have such a function. } function mkconfig(p: Papr_pool_t): Pointer; var cfg: Pspconfig; begin cfg := apr_pcalloc(p, sizeof(spconfig)); cfg^.enabled := 0; Result := cfg; end; { * Respond to a callback to create configuration record for a server or * vhost environment. } function create_mconfig_for_server(p: Papr_pool_t; s: Pserver_rec): Pointer; cdecl; begin Result := mkconfig(p); end; { * Respond to a callback to create a config record for a specific directory. } function create_mconfig_for_directory(p: Papr_pool_t; dir: PChar): Pointer; cdecl; begin Result := mkconfig(p); end; { * Handler for the CheckSpelling directive, which is FLAG. } function set_speling(cmd: Pcmd_parms; mconfig: Pointer; arg: Integer): PChar; cdecl; var cfg: Pspconfig; begin cfg := Pspconfig(mconfig); cfg^.enabled := arg; Result := nil; end; {* Define the directives specific to this module. This structure is referenced * later by the 'module' structure. } var speling_cmds: command_rec; type sp_reason = ( SP_IDENTICAL = 0, SP_MISCAPITALIZED = 1, SP_TRANSPOSITION = 2, SP_MISSINGCHAR = 3, SP_EXTRACHAR = 4, SP_SIMPLETYPO = 5, SP_VERYDIFFERENT = 6 ); const sp_reason_str: array [0..7] of PChar = ( 'identical', 'miscapitalized', 'transposed characters', 'character missing', 'extra character', 'mistyped character', 'common basename', nil ); type misspelled_file = record name: PChar; quality: sp_reason; end; Pmisspelled_file = ^misspelled_file; { * spdist() is taken from Kernighan & Pike, * _The_UNIX_Programming_Environment_ * and adapted somewhat to correspond better to psychological reality. * (Note the changes to the return values) * * According to Pollock and Zamora, CACM April 1984 (V. 27, No. 4), * page 363, the correct order for this is: * OMISSION = TRANSPOSITION > INSERTION > SUBSTITUTION * thus, it was exactly backwards in the old version. -- PWP * * This routine was taken out of tcsh's spelling correction code * (tcsh-6.07.04) and re-converted to apache data types ("char" type * instead of tcsh's NLS'ed "Char"). Plus it now ignores the case * during comparisons, so is a "approximate strcasecmp()". * NOTE that is still allows only _one_ real "typo", * it does NOT try to correct multiple errors. } { Extra notes about how this function works: * s and t are supposed different * s } function spdist(const cs, ct: PChar): sp_reason; var s, t, i, j: PChar; begin s := cs; t := ct; while apr_tolower(s^) = apr_tolower(t^) do begin if t^ = #0 then begin Result := SP_MISCAPITALIZED; { exact match (sans case) } Exit; end; Inc(s); Inc(t); end; if s^ <> #0 then begin if t^ <> #0 then begin i := s; Inc(i); j := t; Inc(j); if Integer(i^) and Integer(j^) <> 0 then if (apr_tolower(s^) = apr_tolower(j^)) and (apr_tolower(t^) = apr_tolower(i^)) then begin Inc(i); Inc(j); if stricomp(i, j) = 0 then begin Result := SP_TRANSPOSITION; { transposition } Exit; end; end; Dec(i); Dec(j); if (stricomp(i, j) = 0) then begin Result := SP_SIMPLETYPO; { 1 char mismatch } Exit; end; end; if (stricomp(i, t) = 0) then begin Result := SP_EXTRACHAR; { extra character } Exit; end; end; if (t^ <> #0) and (stricomp(s, t + 1) = 0) then begin Result := SP_MISSINGCHAR; { missing character } Exit; end; Result := SP_VERYDIFFERENT; { distance too large to fix. } end; function sort_by_quality(left, rite: Pointer): Integer; begin Result := Integer(Pmisspelled_file(left)^.quality) - Integer(Pmisspelled_file(rite)^.quality); end; function check_speling(r: Prequest_rec): Integer; cdecl; var cfg: Pspconfig; good, bad, postgood, url: PChar; dirent: apr_finfo_t; filoc, dotloc, urlen, pglen: Integer; candidates: Papr_array_header_t = nil; dir: Papr_dir_t; q: sp_reason; sp_new, variant_, nvariant_: Pmisspelled_file; nuri, ref, vuri, reason: PChar; i, entloc: Integer; p, sub_pool: Papr_pool_t; notes: Papr_table_t; t, v: Papr_array_header_t; List: TList; plist: Pointer; begin cfg := Pspconfig(ap_get_module_config(r^.per_dir_config, @speling_module)); if (cfg^ .enabled = 0) then begin Result := DECLINED; Exit; end; { We only want to worry about GETs } if (r^.method_number <> M_GET) then begin Result := DECLINED; Exit; end; { We've already got a file of some kind or another } if (Integer(r^.finfo.filetype) <> 0) then begin Result := DECLINED; Exit; end; { Not a file request } if (r^.proxyreq>0) or not assigned(r^.filename) then begin Result := DECLINED; Exit; end; { This is a sub request - don't mess with it } if (r^.main <> nil) then begin Result := DECLINED; Exit; end; { * The request should end up looking like this: * r->uri: /correct-url/mispelling/more * r->filename: /correct-file/mispelling r->path_info: /more * * So we do this in steps. First break r->filename into two pieces } filoc := ap_rind(r^.filename, '/'); { * Don't do anything if the request doesn't contain a slash, or * requests "/" } if (filoc = -1) or (strcomp(r^.uri, '/') = 0) then begin Result := DECLINED; Exit; end; { good = /correct-file } good := apr_pstrndup(r^.pool, r^.filename, filoc); { bad = mispelling } bad := apr_pstrdup(r^.pool, r^.filename + filoc + 1); { postgood = mispelling/more } postgood := apr_pstrcat(r^.pool, [bad, r^.path_info, nil]); urlen := strlen(r^.uri); pglen := strlen(postgood); { Check to see if the URL pieces add up } if (strcomp(postgood, r^.uri + (urlen - pglen))) <> 0 then begin Result := DECLINED; Exit; end; { url = /correct-url } url := apr_pstrndup(r^.pool, r^.uri, (urlen - pglen)); { Now open the directory and do ourselves a check... } if (apr_dir_open(@dir, good, r^.pool) <> APR_SUCCESS) then { Oops, not a directory... } begin Result := DECLINED; Exit; end; candidates := apr_array_make(r^.pool, 2, sizeof(misspelled_file)); dotloc := ap_ind(bad, '.'); if (dotloc = -1) then dotloc := strlen(bad); while (apr_dir_read(@dirent, APR_FINFO_DIRENT, dir) = APR_SUCCESS) do begin { * If we end up with a "fixed" URL which is identical to the * requested one, we must have found a broken symlink or some such. * Do _not_ try to redirect this, it causes a loop! } if (strcomp(bad, dirent.name) = 0) then begin apr_dir_close(dir); Result := OK; end { * miscapitalization errors are checked first (like, e.g., lower case * file, upper case request) } else if (stricomp(bad, dirent.name) = 0) then begin sp_new := Pmisspelled_file(apr_array_push(candidates)); sp_new^.name := apr_pstrdup(r^.pool, dirent.name); sp_new^.quality := SP_MISCAPITALIZED; end { * simple typing errors are checked next (like, e.g., * missing/extra/transposed char) } else if (spdist(bad, dirent.name) <> SP_VERYDIFFERENT) then begin q := spdist(bad, dirent.name); sp_new := Pmisspelled_file(apr_array_push(candidates)); sp_new^.name := apr_pstrdup(r^.pool, dirent.name); sp_new^.quality := q; end { * The spdist() should have found the majority of the misspelled * requests. It is of questionable use to continue looking for * files with the same base name, but potentially of totally wrong * type (index.html <-> index.db). * I would propose to not set the WANT_BASENAME_MATCH define. * 08-Aug-1997 <Martin.Kraemer@Mch.SNI.De> * * However, Alexei replied giving some reasons to add it anyway: * > Oh, by the way, I remembered why having the * > extension-stripping-and-matching stuff is a good idea: * > * > If you're using MultiViews, and have a file named foobar.html, * > which you refer to as "foobar", and someone tried to access * > "Foobar", mod_speling won't find it, because it won't find * > anything matching that spelling. With the extension-munging, * > it would locate "foobar.html". Not perfect, but I ran into * > that problem when I first wrote the module. } else begin {$ifdef WANT_BASENAME_MATCH} { * Okay... we didn't find anything. Now we take out the hard-core * power tools. There are several cases here. Someone might have * entered a wrong extension (.htm instead of .html or vice * versa) or the document could be negotiated. At any rate, now * we just compare stuff before the first dot. If it matches, we * figure we got us a match. This can result in wrong things if * there are files of different content types but the same prefix * (e.g. foo.gif and foo.html) This code will pick the first one * it finds. Better than a Not Found, though. } entloc := ap_ind(dirent.name, '.'); if (entloc = -1) then entloc := strlen(dirent.name); if ((dotloc = entloc) and not strncasecmp(bad, dirent.name, dotloc)) then begin sp_new := Pmisspelled_file(apr_array_push(candidates)); sp_new^.name := apr_pstrdup(r^.pool, dirent.name); sp_new^.quality := SP_VERYDIFFERENT; end; {$endif} end; end; apr_dir_close(dir); if (candidates^.nelts <> 0) then begin { Wow... we found us a mispelling. Construct a fixed url } variant_ := Pmisspelled_file(candidates^.elts); ref := apr_table_get(r^.headers_in, 'Referer'); List := TList.Create; try for i := 0 to candidates^.nelts - 1 do begin plist := Pointer(candidates^.elts); Inc(plist, sizeof(misspelled_file)); List.Add(plist); end; List.Sort(@sort_by_quality); finally List.Free; end; { * Conditions for immediate redirection: * a) the first candidate was not found by stripping the suffix * AND b) there exists only one candidate OR the best match is not * ambiguous * then return a redirection right away. } nvariant_ := variant_; Inc(nvariant_, sizeof(misspelled_file)); if (variant_^.quality <> SP_VERYDIFFERENT) and ( (candidates^.nelts = 1) or (Integer(variant_^.quality) <> Integer(nvariant_^.quality))) then begin nuri := ap_escape_uri(r^.pool, apr_pstrcat(r^.pool, [url, variant_^.name, r^.path_info, nil])); if (r^.parsed_uri.query^ <> #0) then nuri := apr_pstrcat(r^.pool, [nuri, PChar('?'), r^.parsed_uri.query, nil]); apr_table_setn(r^.headers_out, 'Location', ap_construct_url(r^.pool, nuri, r)); if ref^ <> #0 then ap_log_rerror(MODULE_NAME, 506, APLOG_INFO, APR_SUCCESS, r, 'Fixed spelling: %s to %s from %s', [r^.uri, nuri, ref]) else ap_log_rerror(MODULE_NAME, 506, APLOG_INFO, APR_SUCCESS, r, 'Fixed spelling: %s to %s', [r^.uri, nuri, ref]); Result := HTTP_MOVED_PERMANENTLY; Exit; end { * Otherwise, a "[300] Multiple Choices" list with the variants is * returned. } else begin if (r^.main = nil) then begin p := r^.pool; notes := r^.notes; end else begin p := r^.main^.pool; notes := r^.main^.notes; end; if (apr_pool_create(@sub_pool, p) <> APR_SUCCESS) then begin Result := DECLINED; Exit; end; t := apr_array_make(sub_pool, candidates^.nelts * 8 + 8, sizeof(PChar)); v := apr_array_make(sub_pool, candidates^.nelts * 5, sizeof(PChar)); { Generate the response text. } PPChar(apr_array_push(t))^ := 'The document name you requested (<code>'; PPChar(apr_array_push(t))^ := ap_escape_html(sub_pool, r^.uri); PPChar(apr_array_push(t))^ := '</code>) could not be found on this server.' + LineEnding + 'However, we found documents with names similar ' + 'to the one you requested.<p>' + 'Available documents:' + LineEnding + '<ul>' + LineEnding; for i := 0 to candidates^.nelts -1 do begin reason := sp_reason_str[Integer(variant_[i].quality)]; { The format isn't very neat... } if r^.parsed_uri.query <> nil then vuri := apr_pstrcat(sub_pool, [url, variant_[i].name, r^.path_info, '?', r^.parsed_uri.query, nil]) else vuri := apr_pstrcat(sub_pool, [url, variant_[i].name, r^.path_info, PChar(''), PChar(''), nil]); PPChar(apr_array_push(v))^ := '"'; PPChar(apr_array_push(v))^ := ap_escape_uri(sub_pool, vuri); PPChar(apr_array_push(v))^ := '";"'; PPChar(apr_array_push(v))^ := reason; PPChar(apr_array_push(v))^ := '"'; PPChar(apr_array_push(t))^ := '<li><a href="'; PPChar(apr_array_push(t))^ := ap_escape_uri(sub_pool, vuri); PPChar(apr_array_push(t))^ := '">'; PPChar(apr_array_push(t))^ := ap_escape_html(sub_pool, vuri); PPChar(apr_array_push(t))^ := '</a> ('; PPChar(apr_array_push(t))^ := reason; PPChar(apr_array_push(t))^ := ')' + LineEnding; { * when we have printed the "close matches" and there are * more "distant matches" (matched by stripping the suffix), * then we insert an additional separator text to suggest * that the user LOOK CLOSELY whether these are really the * files she wanted. } if (i > 0) and (i < candidates^.nelts - 1) and (variant_[i].quality <> SP_VERYDIFFERENT) and (variant_[i + 1].quality = SP_VERYDIFFERENT) then PPChar(apr_array_push(t))^ := '</ul>' + LineEnding + 'Furthermore, the following related ' + 'documents were found:' + LineEnding + '<ul>' + LineEnding; end; PPChar(apr_array_push(t))^ := '</ul>' + LineEnding; { If we know there was a referring page, add a note: } if (ref <> nil) then begin PPChar(apr_array_push(t))^ := 'Please consider informing the owner of the <a href="'; PPChar(apr_array_push(t))^ := ap_escape_uri(sub_pool, ref); PPChar(apr_array_push(t))^ := '">referring page</a> about the broken link.' + LineEnding; end; { Pass our apr_table_t to http_protocol.c (see mod_negotiation): } apr_table_setn(notes, 'variant-list', apr_array_pstrcat(p, t, #0)); apr_table_mergen(r^.subprocess_env, 'VARIANTS', apr_array_pstrcat(p, v, ',')); apr_pool_destroy(sub_pool); if ref <> '' then ap_log_rerror(MODULE_NAME, 609, APLOG_INFO, 0, r, 'Spelling fix: %s: %d candidates from %s', [r^.uri, candidates^.nelts, ref]) else ap_log_rerror(MODULE_NAME, 609, APLOG_INFO, 0, r, 'Spelling fix: %s: %d candidates', [r^.uri, candidates^.nelts, ref]); Result := HTTP_MULTIPLE_CHOICES; end; end; Result := OK; end; procedure register_hooks_(p: Papr_pool_t); cdecl; begin ap_hook_fixups(@check_speling, nil, nil, APR_HOOK_LAST); end; begin default_module_ptr := @speling_module; FillChar(default_module_ptr^, SizeOf(default_module_ptr^), 0); STANDARD20_MODULE_STUFF(default_module_ptr^); {* Define the directives specific to this module. This structure is referenced * later by the 'module' structure. } with speling_cmds do begin name := 'CheckSpelling'; func := cmd_func(@set_speling); cmd_data := nil; req_override := OR_OPTIONS; args_how := FLAG; errmsg := 'whether or not to fix miscapitalized/misspelled requests'; end; with speling_module do begin name := MODULE_NAME; magic := MODULE_MAGIC_COOKIE; create_dir_config := @create_mconfig_for_directory; { per-directory config creator } merge_dir_config := nil; { dir config merger } create_server_config := @create_mconfig_for_server; { server config creator } merge_server_config := nil; { server config merger } cmds := @speling_cmds; { command table } register_hooks := @register_hooks_; { set up other request processing hooks } end; end.