#!/usr/bin/perl -wT
#
# URLblocked.cgi - explain to the user that the URL is blocked and by which rule set.
#
# Currently the error messages support
# en (English),
# de (German),
# pl (Polish),
# sv (Swedish),
# es (Spanish),
# it (Italian),
# pt (Portuguese),
# fr (French),
# tr (Turkish),
# nl (Dutch).
# You can add a language yourself: search for all occurrences of "NEWLANGUAGE"
# and add your language text.
use strict;
use Socket;
# This CGI script uses fastcgi and also requires Fcgid configured in Apache.
# Comment out the next line and the line with "while (new CGI::Fast)" (approximately at line 423)
# to revert this script to a regular CGI script.
### use CGI::Fast;
# Content classes returned by getContentType().  The main request handler
# picks the kind of substitute reply from this value.
use constant CT_IMAGE  => 1;    # picture: answered with a PNG file
use constant CT_JAVA   => 2;    # javascript / java archive
use constant CT_HTML   => 3;    # web page; also the fallback class
use constant CT_XML    => 4;    # XML document or feed
use constant CT_CSS    => 5;    # style sheet
use constant CT_TEXT   => 6;    # plain text
use constant CT_JSON   => 7;    # JSON data
use constant CT_STREAM => 8;    # binary download or media stream
use constant CT_204    => 9;    # answered with "204 No Content"
# Package variables shared by the subroutines and the request handler below.
use vars qw(
   $admin $clientaddr $clientname $clientuser $clientgroup $category $targetgroup
   $color $size $mode $textcolor $bgcolor $titlesize $textsize $httpcode $url $origurl
   $ufdbhost $ufdbscripturi $ufdbredirscripturi $ufdbsni $ufdbservername $ufdbrequesturi $ufdbrefurl
   $escaped_ufdbrequesturi $escaped_url
   @day @month @languages $lang $protocol $address $port $path
);

# Per-request scalars start out undefined; session_reinit() and
# parseQuery() give most of them their values.
local ( $admin, $clientaddr, $clientname, $clientuser, $clientgroup, $targetgroup,
        $color, $size, $mode, $textcolor, $bgcolor, $titlesize, $textsize,
        $httpcode, $url, $origurl,
        $ufdbhost, $ufdbscripturi, $ufdbredirscripturi, $ufdbsni,
        $ufdbservername, $ufdbrequesturi, $ufdbrefurl,
        $lang, $protocol, $address, $port, $path );

# Names used when an "Expires:" header is formatted.
local @day   = qw( Sunday Monday Tuesday Wednesday Thursday Friday Saturday );
local @month = qw( Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec );

# Supported languages.  Only the first word of an entry (the language code)
# is significant: getPreferedLanguage() cuts each entry at the first blank.
local @languages = (
   'de (German),',
   'nl (Dutch),',
   'pl (Polish),',
   'sv (Swedish),',
   'es (Spanish),',
   'it (Italian),',
   'pt (Portuguese),',
   'fr (French),',
   'tr (Turkish),',
   'NEW (NEWLANGUAGE),',
   'en (English),',
);

my $html_comment = "\n";

# Forward declarations; the prototypes must match the definitions below.
sub init ();
sub session_reinit ();
sub getPreferedLanguage (@);
sub parseURL ($);
sub parseQuery ($);
sub session_reinit ()
{
   # Put the per-request globals back to their start values before a
   # request is parsed.  Globals that are not assigned here are left alone.

   # requester and rule identification
   ( $admin, $clientaddr, $clientname, $clientuser, $clientgroup, $targetgroup ) = ('unknown') x 6;

   # blocked URL and language
   ( $url, $origurl, $lang ) = ('unknown') x 3;

   # presentation settings
   ( $color, $size, $mode ) = ( 'orange', 'normal', 'default' );

   $httpcode       = '200';
   $ufdbrequesturi = '';
}
sub init () 
{
   # Determine the language first, then take the request details from the
   # query string and store them in the corresponding globals.
   $lang = getPreferedLanguage( @languages );

   my @fields = parseQuery( $ENV{"QUERY_STRING"} );
   ( $httpcode, $admin, $clientaddr, $clientname, $clientuser, $clientgroup, $category, $url ) = @fields;
}
#
# Find the first supported language of the client.
#
# Parameters: the list of supported languages.  Each entry starts with the
#             language code; anything from the first whitespace onwards
#             (e.g. " (German),") is ignored.
# Returns:    the code of the first language in the client's Accept-Language
#             header that is supported, or 'en' when the header is absent or
#             names no supported language.
#
# The header entries are tried in the order in which the client sent them;
# quality values are not used for ordering.
#
sub getPreferedLanguage (@) 
{
  my @supported = @_;
  my $header = $ENV{"HTTP_ACCEPT_LANGUAGE"};

  return 'en'  unless (defined($header));   # default language is 'en'

  # reduce every supported entry to its bare language code
  my @codes;
  for my $entry (@supported)
  {
    my $code = $entry;
    $code =~ s/\s.*//;
    push( @codes, $code );
  }

  ### NO!  push(@clientLanguages,$supported[0]);
  for my $tag (split( /\s*,\s*/, $header ))
  {
    $tag =~ s/^\s+//;        # leading blanks of the first entry
    $tag =~ s/[;\s].*//;     # parameters such as ";q=0.8"
    $tag =~ s/-.*//;         # region subtag: "de-AT" counts as "de"
    $tag = lc( $tag );       # language tags are case-insensitive
    next if ($tag eq '');

    for my $code (@codes)
    {
      # hand back our own string, not the text supplied by the client
      return($code) if ($tag eq $code);
    }
  }
  return 'en';   # default language is 'en'
}
#
# Parse the query string of the request and collect the details of the
# blocked URL.
#
# Parameter:  the raw query string (may be undef).  It is read as a sequence
#             of key=value or key="value" pairs separated by '&'.
# Returns:    ( $httpcode, $admin, $clientaddr, $clientname, $clientuser,
#               $clientgroup, $category, $url )
#             where $url is shortened to 120 characters plus '...'.
# Side effects:
#   - sets the globals $ufdbservername, $ufdbhost, $ufdbrequesturi, $origurl
#     and $ufdbrefurl, and, when the matching parameter is present, $color,
#     $size, $mode, $ufdbscripturi, $ufdbredirscripturi, $ufdbsni,
#     $escaped_url and $escaped_ufdbrequesturi;
#   - sets the global $targetgroup from the X-Blocked-Category request header;
#   - appends a trace of the request to a debug file.
#
sub parseQuery ($) 
{
  my $query       = shift;
  my $admin       = 'The system administrator.';
  my $clientaddr  = '';
  my $clientname  = '';
  my $clientuser  = '';
  my $clientgroup = '';
  my $category    = '';
  my $httpcode    = '200';
  my $url         = 'undefined';
  my $dummy;

  $ufdbservername = '';
  $ufdbhost = '';
  $ufdbrequesturi = '';

  # Accepted parameter names and the variable that receives the value.
  # The first group are the lexicals above (the return values), the second
  # group are globals.  Several names are aliases of each other.
  my %store = (
    admin              => \$admin,
    clientaddr         => \$clientaddr,
    clientname         => \$clientname,
    clientuser         => \$clientuser,
    clientident        => \$clientuser,
    clientgroup        => \$clientgroup,
    source             => \$clientgroup,
    srcclass           => \$clientgroup,
    category           => \$category,
    targetgroup        => \$category,
    targetclass        => \$category,
    httpcode           => \$httpcode,
    url                => \$url,

    color              => \$color,
    size               => \$size,
    mode               => \$mode,
    ufdbhost           => \$ufdbhost,
    ufdbscripturi      => \$ufdbscripturi,
    ufdbredirscripturi => \$ufdbredirscripturi,
    ufdbsni            => \$ufdbsni,
    ufdbservername     => \$ufdbservername,
    ufdbrequesturi     => \$ufdbrequesturi,
    ufdbrefurl         => \$ufdbrefurl,
  );

  if (defined($query))
  {
    # peel one key=value pair off the front of $query per iteration
    while ($query =~ /^\&?([^\&=]+)=\"([^\"]*)\"(.*)/  || 
           $query =~ /^\&?([^\&=]+)=([^\&=]*)(.*)/)
    {
      my $key = $1;
      my $value = $2;
      # empty values and the value "0" are both shown as '??'
      $value = '??' unless(defined($value) && $value && $value ne '??');
      $query = $3;

      if (exists( $store{$key} ))
      {
        # keep the undecoded form of these two parameters
        $escaped_ufdbrequesturi = $value  if ($key eq 'ufdbrequesturi');
        $escaped_url = $value             if ($key eq 'url');

        # decode a fixed set of percent-escapes, in this order
        $value =~ s/%20/ /g;
        $value =~ s/%22/"/g;
        $value =~ s/%23/#/g;
        $value =~ s/%24/\$/g;
        $value =~ s/%26/\&/g;
        # NOTE(review): %27 is the apostrophe but is turned into '?' - confirm this is intended
        $value =~ s/%27/\?/g;
        $value =~ s/%2B/\+/ig;
        $value =~ s/%2C/,/ig;
        $value =~ s/%2F/\//ig;
        $value =~ s/%3A/:/ig;
        $value =~ s/%3B/;/ig;
        # NOTE(review): %3C ('<') is removed, not decoded, and %3E is left as is - confirm
        $value =~ s/%3C//ig;
        $value =~ s/%3F/?/ig;
        $value =~ s/%40/\@/ig;
        $value =~ s/%5C/\\/ig;
        $value =~ s/%25/%/g;            # must be last

        ${ $store{$key} } = $value;
      }

      # A remainder that starts with "url=" is taken verbatim as the URL.
      # NOTE(review): the remainder normally starts with '&', so this only
      # fires when "url=" directly follows a quoted value - confirm whether
      # /^\&?url=/ was meant.
      if ($query =~ /^url=(.*)/) 
      {
        $url = $1;
        last;
      }
    }
  }

  # request headers take precedence over the query string
  $dummy = $ENV{'HTTP_X_BLOCKED_URL'};
  $url = $dummy  if defined $dummy;
  # NOTE(review): this sets the global $targetgroup, not the $category that is returned - confirm
  $dummy = $ENV{'HTTP_X_BLOCKED_CATEGORY'};
  $targetgroup = $dummy  if defined $dummy;

  # no URL given: build one from the server name (or host) and request URI
  if ($url eq 'undefined')
  {
    if ($ufdbservername ne '')
    {
      $url = 'https://' . $ufdbservername;
    }
    elsif ($ufdbhost ne '')
    {
      $url = 'https://' . $ufdbhost;
    }
    if ($url ne 'undefined'  &&  $ufdbrequesturi ne '')
    {
      $ufdbrequesturi =~ s/\?.*//;      # the query part is dropped
      $url .= $ufdbrequesturi;
    }
  }

  $origurl = $url;
  $url = substr( $url, 0, 120 ) . '...'  if (length($url) > 120);

  $ufdbrefurl = $ENV{'HTTP_REFERER'};

  # Debug trace.
  # NOTE(review): tracing is always on and writes request details to a
  # fixed file name under /tmp - consider disabling it for production.
  if (1)
  {
    my $dbgf = '/tmp/debug-cgibin-cgi';     # '/local/websites/logs/cgidebug';
    # best effort: no trace is written when the file cannot be opened
    if (open( my $debug, '>>', $dbgf ))
    {
      print $debug "\n\nurl = $url\n";
      print $debug "lang = $lang\n";
      foreach my $key ('HTTP_HOST', 'HTTP_REFERER', 'HTTP_USER_AGENT', 'REMOTE_ADDR', 'REQUEST_METHOD', 
                       'HTTP_ACCEPT_CHARSET', 'HTTP_ACCEPT_LANGUAGE', 'HTTP_CONTENT_TYPE', 'QUERY_STRING') {
        my $value = $ENV{$key};
        $value = 'undefined'  if !defined($value);
        print $debug "$key = $value\n";
      }
      if (0)     # set to 1 to dump the complete environment
      {
        print $debug ":\n";
        foreach my $key (sort(keys(%ENV))) {
          print $debug "$key = $ENV{$key}\n";
        }
      }
      close( $debug );
    }
  }

  return ( $httpcode, $admin, $clientaddr, $clientname, $clientuser, $clientgroup, $category, $url );
}
#
# Split a URL of the form protocol://address[:port][path] into its parts.
#
# Parameter:  the URL.
# Returns:    ( $protocol, $address, $port, $path ).  $port still carries its
#             leading ':'.  Parts that are absent are returned as empty
#             strings; all four are empty when the URL does not have the
#             expected form.
#
sub parseURL ($) 
{
  my $url      = shift;
  my $protocol = "";
  my $address  = "";
  my $port     = "";
  my $path     = "";

  # Only read the capture variables after a successful match: after a failed
  # match they still hold the values of an earlier match made elsewhere.
  if (defined($url)  &&  $url =~ /^([^\/:]+):\/\/([^\/:]+)(:\d*)?(.*)/)
  {
    $protocol = $1;
    $address  = $2;
    $port     = $3  if (defined($3));     # the port group is optional
    $path     = $4;
  }
  return ( $protocol, $address, $port, $path );
}
sub getContentType( $ )
{
   # Guess what kind of content the given URL stands for and return the
   # matching CT_* constant.  A recognised file suffix decides first; after
   # that a list of known hosts and path patterns is tried.  The rules are
   # evaluated in order, the first hit wins, and CT_HTML is the fallback.
   my $target = shift;

   $target =~ s/[;\?\&].*//;                   # strip parameters and query string
   $target =~ s/^(ftp|http|https):\/\///;      # strip the scheme

   my $location = $target;                     # what follows the host name
   $location =~ s/^[^\/]*//;
   my $ext = $location;                        # the path from its last '.' onwards
   $ext =~ s/.*\././;

   my @suffix_rules = (
      [ CT_IMAGE,  qr/\.(bmp|gif|ico|jpg|jpeg|jpe|png|webp|tiff)$/i ],
      [ CT_CSS,    qr/\.css$/i ],
      [ CT_JSON,   qr/\.json$/i ],
      [ CT_JAVA,   qr/\.(js|jar)$/i ],
      [ CT_TEXT,   qr/\.(csv|txt)$/i ],
      [ CT_HTML,   qr/\.(htm|html|dhtml|shtml)$/i ],
      [ CT_XML,    qr/\.(xml|sxml|rss)$/i ],
      [ CT_STREAM, qr/\.(bin|bz2|cab|class|dat|doc|gz|h264|mp3|mpg|mpeg|msi|mst|ppt|pdf|rar|tar|ttf|xls|zip|ogv|divx|xvid|qt|ra|ram|rv|wmv|avi|mov|swf|mp4|mv4|flv)$/i ],
   );
   foreach my $rule (@suffix_rules)
   {
      my ($type, $pattern) = @$rule;
      return $type  if ($ext =~ $pattern);
   }

   # no suffix found, now we start with the guesswork
   return CT_HTML    if ($location eq ''  ||  $location eq '/');

   return CT_IMAGE   if ($target =~ /^googleadservices\.com\/pagead\/conversion\// );
   return CT_TEXT    if ($target =~ /^googleads\.g\.doubleclick\.net\/pagead\/ads\// );
   return CT_JAVA    if ($target =~ /^pubads\.g\.doubleclick\.net\/pagead\/ads\//  ||
                         $target eq 'a.analytics.yahoo.com/fpc.pl' );
   return CT_IMAGE   if ($target eq 'a.analytics.yahoo.com/p.pl'  ||
                         $target eq 'ping.chartbeat.net/ping' );

   return CT_STREAM  if ($target =~ '^www\.youtube\.com'  &&
                         ($location =~ /^\/cp\//  ||  $location =~ /^\/p\//  ||
                          $location =~ /^\/v\//   ||  $location =~ /^\/videoplayback/ ));
   return CT_204     if ($target =~ /^s\.youtube\.com\/api\/stats\//);

   return CT_IMAGE   if ($target =~ /^b\.scorecardresearch\.com\// );
   return CT_JAVA    if ($target =~ /\.doubleclick\.net\/adj\//  ||
                         $target =~ /\.doubleclick\.net\/pfadj\// );
   return CT_IMAGE   if ($target =~ /\.doubleclick\.net\/imp/  ||
                         $target =~ /\.tradedoubler\.com\/imp/ );

   if ($target =~ /^view\.atdmt\.com\//)
   {
      if    ($location =~ /^\/action\// )  { return CT_IMAGE; }
      elsif ($location =~ /^\/jview\// )   { return CT_JAVA; }
   }

   return CT_JAVA    if ($target eq 'static.ak.connect.facebook.com/connect.php' );
   return CT_IMAGE   if ($target eq 'secure-us.imrworldwide.com/cgi-bin/m'  ||
                         $target =~ /ftjcfx\.com\/image-/  ||
                         $target =~ /lduhtrp\.net\/image-/  ||
                         $target =~ /img\.pheedo\.com\/img\.phdo/ );

   if ($location =~ /\/realmedia\/ads\//i )
   {
      if ($location =~ /\/adstream_jx/  ||  $location =~ /\/adstream_mjx/ )
      {
         return CT_JAVA;
      }
      if ($location =~ /\/adstream_lx/  ||  $location =~ /\/adstream_nx/  ||
          $location =~ /\/ads\/cap\.cgi/ )
      {
         return CT_IMAGE;
      }
   }

   return CT_JAVA    if ($target =~ /overture\.com\/ls_js_/ );
   return CT_IMAGE   if ($location =~ /\/scripts\/beacon\.dll/  ||
                         $location =~ /\/scripts\/beacon2\.dll/  ||
                         $target eq 'rtd.tubemogul.com/upi/' );
   return CT_JAVA    if ($location =~ /\/javascript\//  ||  $location =~ /\/ajaxpro\//  ||
                         $location =~ /\/js\.php$/  ||  $location =~ /\/javascript\.php$/ );
   return CT_CSS     if ($location =~ /\/css\.php$/ );
   return CT_IMAGE   if ($location =~ /\/image\.php$/  ||  $location =~ /\/image\.php\// );
   return CT_JAVA    if ($location =~ /\/js\.ng\//  ||  $location =~ /\/js\//  ||
                         $location =~ /\/scripts\//  ||  $location =~ /\/script\// );
   return CT_XML     if ($target =~ /^xml\./ );

   return CT_IMAGE   if ($location =~ /\/b\/ss\//  &&
                         ($location =~ /\/FAS/i  ||  $location =~ /\/H\./i  ||  $location =~ /\/G\./i ));

   return CT_JAVA    if ($target =~ /\.channel\.facebook\.com\/x\// );
   return CT_TEXT    if ($target =~ /\.channel\.facebook\.com\/p/ );
   return CT_IMAGE   if ($target eq 'www.facebook.com/fr/u.php'  ||
                         $target eq 'pixel.mathtag.com/event/img' );
   return CT_JAVA    if ($target eq 'pixel.mathtag.com/event/js' );
   return CT_IMAGE   if ($target eq 'x.bidswitch.net/ul_cb/sync' );
   return CT_XML     if ($location =~ /\/xml-rpc/ );
   # NOTE(review): $location is either empty or starts with '/', so this
   # comparison looks unable to match - confirm whether '/open/1' was meant.
   return CT_STREAM  if ($location eq 'open/1');
   return CT_IMAGE   if ($target =~ /^pixel\./  ||  $location =~ /\/pixel$/ );
   return CT_TEXT    if ($target =~ /heatmap/ );
   return CT_204     if ($target eq 'analytics.livestream.com/track');

   return CT_HTML;
}
# comment out the next line if fastcgi is not configured
### while (new CGI::Fast)
{
   my $time = time;
   my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst);
   my $root;
   my $buffer;
   session_reinit();
   init();
   # the default is english
   my $forbidden = 'no access';
   my $title = $forbidden;
   my $explanation_prefix = 'URL blocked because it is';
   my $explanation_suffix = '';
   my $go_back = 'back';
   my $more_info = 'More information about ufdbGuard is here.';
   my $contentType = getContentType( $origurl );
   $contentType = CT_204  if ($httpcode eq '204');
   if ($contentType == CT_IMAGE)
   {
      print "Content-Type: image/png\n";
      ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = gmtime( $time + 180 );
      printf "Expires: %s, %02d %s %04d %02d:%02d:%02d GMT\n",
	     $day[$wday], $mday, $month[$mon], $year+1900, $hour, $min, $sec;
      print "\n";
      my $imgfile;
      if ($category eq 'ads')
      {
         $imgfile = "transparent.png";
         $imgfile = "no-ads.png"      if ($mode eq 'noads');
         $imgfile = "smallcross.png"  if ($mode eq 'cross');
         $imgfile = "square.png"      if ($mode eq 'square');
      }
      else
      {
	 if ($mode eq 'cross') {
	    $imgfile = "smallcross.png"  if ($mode eq 'cross');
	 }
	 elsif ($mode eq 'square') {
	    $imgfile = "square.png"      if ($mode eq 'square');
	 }
	 elsif ($mode eq 'simple-red'  ||  $mode eq 'transparent'  ||  $mode eq 'transparant') {
	    $imgfile = "transparent.png";
	 }
	 else  {
	    $imgfile = "forbidden-normal-" . $lang . ".png";
	 }
      }
      $root = $ENV{'DOCUMENT_ROOT'};
      open( BLOCKEDPNG, "$root/images/$imgfile" )  ||  print "failed to open $root/images/$imgfile\n";
      print $buffer while (read (BLOCKEDPNG,$buffer,8192));
      close( BLOCKEDPNG );
   }
   elsif ($contentType == CT_204)
   {
      print "Status: 204 No Content\n";
      ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = gmtime( $time + 180 );
      print "Content-Type: text/plain\n";
      printf "Expires: %s, %02d %s %04d %02d:%02d:%02d GMT\n",
             $day[$wday], $mday, $month[$mon], $year+1900, $hour, $min, $sec;
      printf "X-blocked-category: %s\n", $category;
      printf "X-blocked-URL: %s\n", $url;
      print "\n";
   }
   elsif ($contentType == CT_STREAM)
   {
      if (1)
      {
	 print "Status: 204 no content\n";
         print "Content-Type: text/plain\n";
	 ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = gmtime( $time + 180 );
	 printf "Expires: %s, %02d %s %04d %02d:%02d:%02d GMT\n",
		$day[$wday], $mday, $month[$mon], $year+1900, $hour, $min, $sec;
	 printf "X-blocked-category: %s\n", $category;
	 printf "X-blocked-URL: %s\n", $url;
      }
      else
      {
	 print "Content-Type: application/octet-stream\n";
	 ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = gmtime( $time + 180 );
	 printf "Expires: %s, %02d %s %04d %02d:%02d:%02d GMT\n",
		$day[$wday], $mday, $month[$mon], $year+1900, $hour, $min, $sec;
      }
      print "\n";
   }
   elsif ($contentType == CT_JAVA)
   {
      print "Content-Type: application/x-javascript\n";
      ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = gmtime( $time + 180 );
      printf "Expires: %s, %02d %s %04d %02d:%02d:%02d GMT\n",
	     $day[$wday], $mday, $month[$mon], $year+1900, $hour, $min, $sec;
      print "\n";
      print "\n";
   }
   elsif ($contentType == CT_JSON)
   {
      print "Content-Type: application/json\n";
      ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = gmtime( $time + 180 );
      printf "Expires: %s, %02d %s %04d %02d:%02d:%02d GMT\n",
	     $day[$wday], $mday, $month[$mon], $year+1900, $hour, $min, $sec;
      print "\n";
      print "\n";
   }
   elsif ($contentType == CT_CSS)
   {
      print "Content-Type: text/css\n";
      ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = gmtime( $time + 180 );
      printf "Expires: %s, %02d %s %04d %02d:%02d:%02d GMT\n",
	     $day[$wday], $mday, $month[$mon], $year+1900, $hour, $min, $sec;
      print "\n";
      print "\n";
   }
   elsif ($contentType == CT_TEXT)
   {
      print "Content-Type: text/plain\n";
      ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = gmtime( $time + 180 );
      printf "Expires: %s, %02d %s %04d %02d:%02d:%02d GMT\n",
	     $day[$wday], $mday, $month[$mon], $year+1900, $hour, $min, $sec;
      print "\n";
      print "\n";
   }
   elsif ($contentType == CT_XML)
   {
      print "Content-Type: text/xml\n";
      ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = gmtime( $time + 180 );
      printf "Expires: %s, %02d %s %04d %02d:%02d:%02d GMT\n",
	     $day[$wday], $mday, $month[$mon], $year+1900, $hour, $min, $sec;
      print "\n";
      print "\n";
      print "
\n";
	    print "https://blockedhttps.urlfilterdb.com is used by ufdbGuard
\n";
	    print "to display messages about blocked URLs.
\n";
	    print "
\n";
	       print "$forbidden
$category\n";
	       print "why is this URL blocked?\n";
	       print "
\n";
	       print "$go_back. 
\n";
	       print "$admin\n";
	       print "
\n";
	       print "$more_info\n";
	       print "
\n