#!/usr/bin/perl -wT
# This script can run with 6755 permissions under a dedicated user.
# The user nobody often cannot delete any files, and leftover
# temporary files would then fill /tmp.

=head1 wchck - check for broken links or wrong html syntax

It uses wget, webgrep and tidy (from http://www.w3.org/People/Raggett/tidy/).
webgrep is available at http://linuxfocus.org/~guido/ and
wget is already part of most Linux distributions.

wchck has two functions:
1) check for broken links
2) check for html syntax errors.


Installation:
The program writes temporary files and a logfile to /tmp.
The temporary files are cleaned up except in case of error.
You might want to delete the logfile /tmp/wchck_log
from time to time.

Install wget, tidy and webgrep (this file is actually part of the
webgrep package from version 2.9 onwards) to /usr/bin.
Change the variable $url below to the right value and test it.
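
You can test the script on the command line before installing it as a
CGI: when no CGI environment is present, cgi_receive takes the query
string from the first command line argument, for example (the url is
just an example, use one from your own server):

  ./wchck 'url=http://10.0.0.1/index.html'
  ./wchck 'url=http://10.0.0.1/index.html&syntaxonly=on'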

Copyright: GPL
Written by: guido@bearix.oche.de

This program uses the HTTP_REFERER variable.

=cut

# -------------------------------------------------------
use strict;
#global data:
my %FORM;
my %html;
my $curl="";
my $ver="0.1";
my %cache;
#The url of this program:
my $url = "http://10.0.0.1/cgi-bin/wchck";
#
delete $ENV{'IFS'};
delete $ENV{'CDPATH'};
delete $ENV{'ENV'};
delete $ENV{'BASH_ENV'};
$ENV{'PATH'}="/bin:/usr/bin:/usr/local/bin";
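# set the real uid to the effective uid (matters if installed setuid):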
$<=$>;
#
#
&cgi_receive;
&readHTMLpage();
if ($FORM{'about'} || $FORM{'help'}){
    &printHTMLhead();
    &printHTMLpage('intro');
}elsif ($FORM{'url'}){
    $curl=$FORM{'url'};
    &checkpage();
}elsif ($FORM{'refer'} && $FORM{'HTTP_REFERER'}){
    $curl=$FORM{'HTTP_REFERER'};
    &checkpage();
}else{
    &printHTMLhead();
    &printHTMLpage('intro');
}
#--------------------------------------------------------
sub printHTMLhead(){
    #print "Pragma: no-cache\n";
    print "Content-type: text/html\n\n";
}
#--------------------------------------------------------
sub printHTMLpage($){
    my $reqpage = shift;
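    # $reqpage is a template name like 'intro' or 'nosuchpage',
    # matching one of the __ name __ sections after __END__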
    my $tmp;
    die "ERROR: no such template $reqpage\n" unless ($html{$reqpage});
    for (@{$html{$reqpage}}){
        # /o: compile the substitution patterns only once
        s/\$url/$url/o;
        s/\$curl/$curl/o;
        print;
    }
}
#--------------------------------------------------------
sub readHTMLpage(){
    #read and store any text between __ xxx __ and the next __ marker
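    # e.g. everything between "__ intro __" and the next "__ " line
    # ends up in @{$html{'intro'}}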
    my $pagename="nix";
    while(<DATA>){
        if (/^__ (\w+) __/){
            $pagename=$1;
            next;
        }
        next if (/^__ /);
        push(@{$html{$pagename}},$_);
    }
}
#--------------------------------------------------------
sub today(){
    my @ltime = localtime;
    #return a date in yyyy-mm-dd_hh:mm:ss format
    sprintf("%04d-%02d-%02d_%02d:%02d:%02d",1900+$ltime[5],$ltime[4]+1,$ltime[3],$ltime[2],$ltime[1],$ltime[0]);
}
#--------------------------------------------------------
sub logusage($){
    my $str=shift;
    open(LOG,">>/tmp/wchck_log")||die "ERROR: cannot write /tmp/wchck_log\n";
    print LOG "$str\n---\n";
    close LOG;
}
#--------------------------------------------------------
sub hck($){
    my $lnk=shift;
    my $res;
    my $rval=0;
    # for cgi links check only the part before the '?', without parameters:
    $lnk=~s/\?.*$//;
    #print "dbg[$lnk]"; 
    if ($cache{$lnk}){
        $res=$cache{$lnk};
        $rval=1 if ($res =~/error/i);
    }else{
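        # httpcheck -s is assumed to print something like
        # "url :: 200 OK" or "url :: ERROR ..."; keep only the status: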
        $res=`httpcheck -s \"$lnk\" 2>&1`;
        $res=~s/.+ :: //;
        chomp($res);
        $rval=$?;
        # remember the result so that repeated links are queried only once:
        $cache{$lnk}=$res;
    }
    if($rval){
        print "<font color=\"#FF0000\">$res</font>\n";
    }else{
        print "<font color=\"#00AF00\">$res</font>\n";
    }
}
#--------------------------------------------------------
# download a page and check it with tidy -e, lshtmlref -AD and httpcheck
#
# First we download the page, then we parse it with lshtmlref -AD.
# Next we read in the data from lshtmlref -AD and feed every link to
# httpcheck.
#
# The web page syntax is checked with tidy -e.
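#
# Done by hand the pipeline looks roughly like this (file names are
# only examples):
#   wget -nv -O page.html "http://host/page.html"
#   lshtmlref -AD page.html > links.txt    # list all links in the page
#   tidy -e page.html > errors.txt         # html syntax check
#   httpcheck -s "http://host/some/link"   # once per link in links.txt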
sub checkpage(){
    my $tmpfile="/tmp/wchck_$$";
    my ($i,$forlog,$res,$line,$len,$qurl,$domain);
    my @line;
    my $link;
    $|=1; # unbuffer output so the browser gets partial results immediately
    unless($curl=~m=(http://\w.+)=i){
        &printHTMLhead();
        &printHTMLpage('errornourl');
        exit(0);
    }
    $curl=$1; # untaint
    $domain=$curl;
    $domain=~s/(\w)\/.+/$1/;
    if($curl=~m=(\.gif|\.jpg|\.png|\.tiff|\.jpeg|\.gz|\.tar|\.zip)$=i){
        &printHTMLhead();
        &printHTMLpage('noimg');
        exit(0);
    }
    $qurl=$curl;
    $qurl=~s/[;\|<>!&\\\n\$]//g; # remove shell meta characters
    $forlog = "[".&today() ."] $url :";
    $forlog.=`wget -nv -nd -t 1 -T 15 -Q 240K -O $tmpfile.html \"$qurl\" 2>&1`;
    if($?){
        &printHTMLhead();
        &printHTMLpage('nosuchpage');
        exit(0);
    }
    unless( -T "$tmpfile.html"){
        &printHTMLhead();
        &printHTMLpage('errornoascii');
        exit(0);
    }
    $forlog.=`lshtmlref -AD $tmpfile.html > $tmpfile.l.txt 2>&1`;
    $forlog.=`tidy -e $tmpfile.html > $tmpfile.e.txt 2>&1`;
    unlink("$tmpfile.html");
    &printHTMLhead();
    &printHTMLpage('resulthead');
    print "<h2>The following syntax errors were found:</h2>\n<p>Errors must be corrected. Warnings may be left as they are.</p>\n<pre>\n";
    open(FF,"$tmpfile.e.txt")||die "ERROR: can not read $tmpfile.e.txt\n";
    while(<FF>){
        next if (/$tmpfile.html/o);
        s/</&lt;/g;
        s/>/&gt;/g;
        last if(/For further advice/);
        print;

    }
    close FF;
    if ($FORM{'syntaxonly'}){
        goto CHECKEND;
    }
    print "</pre>\n<br clear=all>\n<hr>\n<h2>Result of broken link check:</h2>\n";
    # check the links:
    open(FF,"$tmpfile.l.txt")||die "ERROR: can not read $tmpfile.l.txt\n";
    # here we have the hard time to figure out
    # if a link called ../index.html in http://linuxfocus.org/Deutsch
    # is http://linuxfocus.org/Deutsch/../index.html
    # or http://linuxfocus.org/../index.html because Deutsch is a html
    # file. We ask httpcheck for a new location:
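    # Example: if http://linuxfocus.org/Deutsch is a directory, httpcheck
    # is assumed to answer "New location: http://linuxfocus.org/Deutsch/"
    # and relative links are resolved below .../Deutsch/; if it is a
    # file, they are resolved below http://linuxfocus.org/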
    if ($qurl=~m=/$=){
        # that's clear it's a directory
        $qurl=~s=/$==;
    }else{
        $res=`httpcheck -s \"$qurl\" 2>&1`;
        $res=~s/.+ :: //;
        chomp($res);
        if ($res =~ m/New location: (.+)/){
            $qurl=$1;
        }
        # remove /xxx.html or just /
        $qurl=~s=/[^/]*$==;
    }
    $domain=~s=/$==;
    print "<pre>\n";
    $i=0;
    while(<FF>){
        next unless(/\w:/);
        $i++;
        last if ($i>70); #check only max 70 links
        chomp;
        # a line looks like: 
        # /tmp/wchck_591.html:52: href="Nederlands/" :rel-lnk
        s/^.+:(\d+): /$1 :/;
        @line=split(/ :/);
        print "line $line[0] == $line[1] == $line[2] == ";
        $line[1]=~s/^\w+="//;
        $line[1]=~s/"$//;
        unless($line[1]=~ m=([^;\|<>!&\\\n\$]+)=){
            print "ERROR in url syntax: $line[1]\n";
            next;
        }
        $link=$1;
        if ($line[2] eq "rel-lnk"){
            &hck("$qurl/$link");
        }elsif ($line[2] eq "file-from-docroot"){
            &hck("$domain/$link");
        }elsif ($line[2] eq "abs-ref"){
            &hck("$link");
        }else{
            print "not checked\n";
            next;
        }
    }
    close FF;
CHECKEND:
    print "</pre>\n<hr><a href=\"$url\">back to the html checker page</a>\n</body></html>\n";
    unlink("$tmpfile.e.txt","$tmpfile.l.txt");
}
#--------------------------------------------------------
#--------------------------------------------------------
sub cgi_receive{
    my $buffer = "";
    my $pair;
    my $name;
    my $value;
    if ($ENV{'GATEWAY_INTERFACE'} && $ENV{'GATEWAY_INTERFACE'} =~ /CGI/){
        if ($ENV{'REQUEST_METHOD'} eq 'GET') {
            if($ENV{'QUERY_STRING'}){
                $buffer = $ENV{'QUERY_STRING'};
            }
        }elsif ($ENV{'REQUEST_METHOD'} eq 'POST') {
            read(STDIN, $buffer,$ENV{'CONTENT_LENGTH'});
        }else{
            die "Unknown REQUEST_METHOD: $ENV{'REQUEST_METHOD'}";
        }
    }else {
        $buffer = $ARGV[0] if ($ARGV[0]);
    }
    if ($ENV{'HTTP_REFERER'}){
        $FORM{'HTTP_REFERER'}=$ENV{'HTTP_REFERER'};
    }
    # now decode it:
    #
    # Split the name-value pairs
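    # e.g. "url=http%3A%2F%2Fhost%2Fpage.html&syntaxonly=on" becomes
    # $FORM{'url'}="http://host/page.html" and $FORM{'syntaxonly'}="on"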
    foreach $pair (split(/\&/, $buffer)){
        ($name, $value) = split(/=/, $pair);
        $value = " " unless ($value);
        # Un-Webify plus signs and %-encoding
        $value =~ tr/+/ /;
        $value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
        $FORM{$name} = $value;
    }
}

#--------------------------------------------------------
__END__

__ intro __
<HTML>
<HEAD>
<TITLE>LinuxFocus html checker</TITLE>
</HEAD>
<BODY>
<h2>HTML checker</h2>
<p>With the help of this html checker page you can ensure that
your webpages can be displayed correctly in all web-browsers.
The program checks:
<ul>
<li>for broken links
<li>for errors in the html syntax
</ul>

<center>
<FORM METHOD="post" ACTION="$url">
      <TABLE BORDER="1" CELLPADDING="2" cellspacing="0" bgcolor="#9999BB" WIDTH="90%">
      <TR>
      <TD>
Enter the URL of the page you would like to check:<br>
      <INPUT TYPE="text" NAME="url" SIZE=70 VALUE="http://"><BR>
      <input type="checkbox" name="syntaxonly">Syntax check only (this is faster)<BR>
      <INPUT TYPE="submit" VALUE="check it">
      </TD>
      </TR>
      </TABLE>
</FORM>      
</center>
<p>
If you like you can set a special bookmark in your browser.
Clicking on that bookmark while viewing a given web page will execute
this program and check the page that you are currently looking at.
</p>
      <center>
      <TABLE BORDER="1" CELLPADDING="2" cellspacing="0" bgcolor="#CBCBCB" WIDTH="90%">
      <TR>
      <TD>
      You can check any web-page by adding a bookmark with the following
      settings in your browser:
      <i>Title:</i>HTML checker<br>
      <i>Location:</i><small> javascript:location.href= '$url?url=' +escape(window.location);</small>
      </TD>
      </TR>
      </TABLE>
     </center>
<p>For webpages with frames the program currently checks only
the main page and not the individual frames. Enter
the urls of the individual frames directly or do not
use frames at all (frames cause anyhow a lot of problems for the
reader and it is better to use server side includes instead).
</p>
<p>
A very good html validator can be found on the w3c homepage:
<br><a href="http://validator.w3.org/check/referer/"><b>http://validator.w3.org/check/referer/</b></a>. 
W3C also has a link validator at <a href="http://validator.w3.org/checklink"><b>
http://validator.w3.org/checklink</b></a>.

</p>
<!----------------------- -->
<hr>
<p><small>This program is based on Open Source Software and runs under Linux&reg;.
It uses the tidy utility, perl, wget and webgrep.<br>
Written and maintained by <a href="mailto:guido@linuxfocus.org">Guido Socher</a>. 
</small> 
</p>
</BODY>
</HTML>

__ noimg __
<HTML>
<HEAD>
<TITLE>error</TITLE>
</HEAD>
<BODY>
<p>ERROR: This program can only check html/text pages, not images or archives:
<br>
<a href="$curl">
$curl</a>
</p>
</BODY>
</HTML>

__ nosuchpage __
<HTML>
<HEAD>
<TITLE>error</TITLE>
</HEAD>
<BODY>
<p>ERROR: the requested page could not be retrieved
<br>
<a href="$curl">
$curl</a>
</p>
</BODY>
</HTML>

__ resulthead __
<HTML>
<HEAD>
<TITLE>result</TITLE>
</HEAD>
<BODY>
<P>Results for <b>$curl</b></P>

__ errornoascii __
<HTML>
<HEAD>
<TITLE>error</TITLE>
</HEAD>
<BODY>
<p>ERROR: not a webpage in ascii format:
<br>
<a href="$curl">
$curl</a>
</p>
</BODY>
</HTML>

__ errornourl __
<HTML>
<HEAD>
<TITLE>error</TITLE>
</HEAD>
<BODY>
<p>ERROR: wrong url syntax. The url must start with http://, which is not the case for
<br>
<a href="$curl">
$curl</a>
</p>
</BODY>
</HTML>


__ ende __

 vim: set sw=4 ts=4 et si: