index.php | castor.php | engines.php | snoopy.class.php | submit.php ··· back to urlator
<?
/*
 *	URLator v1.1, http://turma.sourceforge.net/web/urlator
 *	part of Violence by Design project, http://turma.sourceforge.net
 *	Author: Mircea MITU <mirceamitu@users.sourceforge.net>
 *	Licence: GPL v2
 *	Copyright: 2001, Mircea MITU
 *	Date:		October, 2001
 *
 *	Original source code copyright:	see below
 */


/*************************************************

Snoopy - the PHP net client
Author: Monte Ohrt <monte@ispi.net>
Copyright (c): 1999-2000 ispi, all rights reserved
Version: 0.92

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

You may contact the author of Snoopy by e-mail at:
monte@ispi.net

Or, write to:
Monte Ohrt
CTO, ispi
237 S. 70th suite 220
Lincoln, NE 68510

The latest version of Snoopy can be obtained from:
http://snoopy.sourceforge.com

*************************************************/

class Snoopy
{
    /**** Public variables ****/

    /* user definable vars */

    var $host           =   "sourceforge.net";  // host name we are connecting to
    var $port           =   80;                 // port we are connecting to
    var $proxy_host     =   "";                 // proxy host to use
    var $proxy_port     =   "";                 // proxy port to use
    var $agent          =   "Snoopy v0.92";     // agent we masquerade as
    var $referer        =   "";                 // referer info to pass
    var $cookies        =   array();            // array of cookies to pass
                                                // $cookies["username"]="joe";
    var $rawheaders     =   array();            // array of raw headers to send
                                                // $rawheaders["Content-type"]="text/html";

    var $maxredirs      =   5;                  // http redirection depth maximum. 0 = disallow
    var $offsiteok      =   true;               // allows redirection off-site
    var $maxframes      =   0;                  // frame content depth maximum. 0 = disallow
    var $expandlinks    =   true;               // expand links to fully qualified URLs.
                                                // this only applies to fetchlinks()
                                                // or submitlinks()
    var $passcookies    =   true;               // pass set cookies back through redirects
                                                // NOTE: this currently does not respect
                                                // dates, domains or paths.

    var $user           =   "";                 // user for http authentication
    var $pass           =   "";                 // password for http authentication

    // http accept types
    var $accept         =   "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, */*";

    var $results        =   "";                 // where the content is put

    var $error          =   "";                 // error messages sent here
    var $response_code  =   "";                 // response code returned from server
    var $status         =   0;                  // status code parsed from the status line,
                                                // the socket errno when connecting failed,
                                                // or -100 when a read timed out
    var $headers        =   array();            // headers returned from server sent here
    var $maxlength      =   500000;             // max return data length (body)
    var $read_timeout   =   0;                  // timeout on read operations, in seconds
                                                // supported only since PHP 4 Beta 4
                                                // set to 0 to disallow timeouts
    var $timed_out      =   false;              // if a read operation timed out

    /**** Private variables ****/

    var $_maxlinelen    =   4096;               // max line length (headers)

    var $_httpmethod    =   "GET";              // default http request method
    var $_httpversion   =   "HTTP/1.0";         // default http request version
    var $_submit_method =   "POST";             // default submit method
    var $_submittype    =   "application/x-www-form-urlencoded";    // default submit type
    var $_redirectaddr  =   false;              // will be set if page fetched is a redirect
    var $_redirectdepth =   0;                  // increments on an http redirect
    var $_frameurls     =   array();            // frame src urls
    var $_framedepth    =   0;                  // increments on frame depth

    var $_isproxy       =   false;              // set if using a proxy server
    var $_fp_timeout    =   30;                 // timeout for socket connection

/*======================================================================*\
    Function:   fetch
    Purpose:    fetch the contents of a web page over http, following
                redirects (up to $maxredirs) and frames (up to $maxframes)
    Input:      $URI    the location of the page to fetch
    Output:     $this->results  the output text from the fetch
                returns false when the URI is unusable or the
                connection could not be opened, true otherwise
\*======================================================================*/

    function fetch($URI)
    {
        $URI_PARTS = $this->_parseuri($URI);

        if(strtolower($URI_PARTS["scheme"]) != "http")
        {
            // not a valid protocol
            $this->error = 'Invalid protocol "'.$URI_PARTS["scheme"].'"'."\n";
            return false;
        }
        if($URI_PARTS["host"] == "")
        {
            $this->error = 'Invalid URI "'.$URI.'"'."\n";
            return false;
        }

        $this->host = $URI_PARTS["host"];
        if(!empty($URI_PARTS["port"]))
            $this->port = $URI_PARTS["port"];

        if(!$this->_connect($fp))
            return false;

        $this->_httprequest($this->_requesttarget($URI,$URI_PARTS),$fp,$URI,$this->_httpmethod);
        $this->_disconnect($fp);

        if($this->_canfollowredirect())
        {
            /* follow the redirect */
            $this->_redirectdepth++;
            $this->fetch($this->_redirectaddr);
        }

        $this->_fetchframes();

        return true;
    }

/*======================================================================*\
    Function:   submit
    Purpose:    submit an http form
    Input:      $URI    the location to post the data
                $formvars   the formvars to use.
                    format: $formvars["var"] = "val";
                    an array value is sent as repeated "var[]" fields
    Output:     $this->results  the text output from the post
                returns false when the URI is unusable or the
                connection could not be opened, true otherwise
\*======================================================================*/

    function submit($URI, $formvars="")
    {
        // the "" default means "no fields"; casting it would yield array(0 => "")
        if($formvars === "" || $formvars === null)
            $formvars = array();
        settype($formvars, "array");

        $postdata = "";
        foreach($formvars as $key => $val)
        {
            if(is_array($val))
            {
                foreach($val as $item)
                    $postdata .= urlencode((string)$key)."[]=".urlencode((string)$item)."&";
            }
            else
                $postdata .= urlencode((string)$key)."=".urlencode((string)$val)."&";
        }

        $URI_PARTS = $this->_parseuri($URI);

        if(strtolower($URI_PARTS["scheme"]) != "http")
        {
            // not a valid protocol
            $this->error = 'Invalid protocol "'.$URI_PARTS["scheme"].'"'."\n";
            return false;
        }
        if($URI_PARTS["host"] == "")
        {
            $this->error = 'Invalid URI "'.$URI.'"'."\n";
            return false;
        }

        $this->host = $URI_PARTS["host"];
        if(!empty($URI_PARTS["port"]))
            $this->port = $URI_PARTS["port"];

        if(!$this->_connect($fp))
            return false;

        $this->_httprequest($this->_requesttarget($URI,$URI_PARTS),$fp,$URI,$this->_submit_method,$this->_submittype,$postdata);
        $this->_disconnect($fp);

        if($this->_canfollowredirect())
        {
            /* follow the redirect, re-posting the same form data */
            $this->_redirectdepth++;
            $this->submit($this->_redirectaddr,$formvars);
        }

        $this->_fetchframes();

        return true;
    }

/*======================================================================*\
    Function:   fetchlinks
    Purpose:    fetch the links from a web page
    Input:      $URI    where you are fetching from
    Output:     $this->results  an array of the URLs
\*======================================================================*/

    function fetchlinks($URI)
    {
        if(!$this->fetch($URI))
            return false;

        $this->_processresults("_striplinks",$URI,$this->expandlinks);
        return true;
    }

/*======================================================================*\
    Function:   fetchtext
    Purpose:    fetch the text from a web page, stripping the links
    Input:      $URI    where you are fetching from
    Output:     $this->results  the text from the web page
\*======================================================================*/

    function fetchtext($URI)
    {
        if(!$this->fetch($URI))
            return false;

        $this->_processresults("_striptext",$URI,false);
        return true;
    }

/*======================================================================*\
    Function:   submitlinks
    Purpose:    grab links from a form submission
    Input:      $URI    where you are submitting from
    Output:     $this->results  an array of the links from the post
\*======================================================================*/

    function submitlinks($URI,$formvars)
    {
        if(!$this->submit($URI,$formvars))
            return false;

        $this->_processresults("_striplinks",$URI,$this->expandlinks);
        return true;
    }

/*======================================================================*\
    Function:   submittext
    Purpose:    grab text from a form submission
    Input:      $URI    where you are submitting from
    Output:     $this->results  the text from the web page
\*======================================================================*/

    function submittext($URI,$formvars)
    {
        if(!$this->submit($URI,$formvars))
            return false;

        // link expansion is for link lists only; running it over plain
        // text would just glue the base URL onto the front of the text
        $this->_processresults("_striptext",$URI,false);
        return true;
    }

/*======================================================================*\
    Function:   _parseuri
    Purpose:    split a URI with parse_url(), guaranteeing that the
                "scheme", "host", "path" and "query" keys exist, and
                remember any user/password embedded in the URI
    Input:      $URI    the URI to split
    Output:     the array of URI parts
\*======================================================================*/

    function _parseuri($URI)
    {
        $parts = parse_url($URI);
        if(!is_array($parts))
            $parts = array();       // parse_url() gives false on malformed input
        $parts += array("scheme" => "", "host" => "", "path" => "", "query" => "");

        if(!empty($parts["user"]))
            $this->user = $parts["user"];
        if(!empty($parts["pass"]))
            $this->pass = $parts["pass"];

        return $parts;
    }

/*======================================================================*\
    Function:   _requesttarget
    Purpose:    build what goes on the request line: the entire URI
                when talking to a proxy, otherwise path plus query
    Input:      $URI        the full URI
                $URI_PARTS  the parts as returned by _parseuri()
    Output:     the request target (may be empty; _httprequest
                substitutes "/")
\*======================================================================*/

    function _requesttarget($URI,$URI_PARTS)
    {
        if($this->_isproxy)
            return $URI;

        $target = $URI_PARTS["path"];
        if($URI_PARTS["query"] !== "")
            $target .= "?".$URI_PARTS["query"];
        return $target;
    }

/*======================================================================*\
    Function:   _issamesite
    Purpose:    tell whether a URL points at the host we just talked to
    Input:      $url    absolute URL to test
    Output:     true when its host equals $this->host (ignoring case)
\*======================================================================*/

    function _issamesite($url)
    {
        $parts = parse_url($url);
        return is_array($parts)
            && isset($parts["host"])
            && strcasecmp($parts["host"],$this->host) == 0;
    }

/*======================================================================*\
    Function:   _canfollowredirect
    Purpose:    decide whether the pending redirect may be followed:
                one must be pending, the depth limit must not be
                reached, and it must stay on-site unless $offsiteok
    Output:     true when the caller should follow $this->_redirectaddr
\*======================================================================*/

    function _canfollowredirect()
    {
        if(!$this->_redirectaddr)
            return false;
        if($this->maxredirs <= $this->_redirectdepth)
            return false;

        return $this->offsiteok || $this->_issamesite($this->_redirectaddr);
    }

/*======================================================================*\
    Function:   _fetchframes
    Purpose:    fetch the frame sources collected by the last request,
                as long as the frame depth limit allows
\*======================================================================*/

    function _fetchframes()
    {
        if($this->_framedepth >= $this->maxframes || count($this->_frameurls) == 0)
            return;

        // work on a copy: the nested fetches collect their own frame urls
        $frameurls = $this->_frameurls;
        $this->_frameurls = array();

        foreach($frameurls as $frameurl)
        {
            if($this->_framedepth >= $this->maxframes)
                break;

            $this->fetch($frameurl);
            $this->_framedepth++;
        }
    }

/*======================================================================*\
    Function:   _processresults
    Purpose:    run one of the strip methods over $this->results (a
                single document, or an array of documents when frames
                were fetched) and optionally expand the outcome
    Input:      $stripper   name of the method to apply
                            ("_striplinks" or "_striptext")
                $URI        the URI the content came from
                $expand     whether to pass the outcome to _expandlinks
    Output:     $this->results  replaced by the processed content
\*======================================================================*/

    function _processresults($stripper,$URI,$expand)
    {
        if(is_array($this->results))
        {
            foreach($this->results as $idx => $document)
            {
                $processed = $this->$stripper($document);
                if($expand)
                    $processed = $this->_expandlinks($processed,$URI);
                $this->results[$idx] = $processed;
            }
        }
        else
        {
            $processed = $this->$stripper($this->results);
            if($expand)
                $processed = $this->_expandlinks($processed,$URI);
            $this->results = $processed;
        }
    }

/*======================================================================*\
    Function:   _striplinks
    Purpose:    strip the hyperlinks from an html document
    Input:      $document   document to strip.
    Output:     $match      an array of the links: quoted hrefs first,
                            then unquoted ones; empty array if none
\*======================================================================*/

    function _striplinks($document)
    {
        preg_match_all("'<a\s+href\s*=\s*           # find <a href=
                        ([\"\'])?                   # find single or double quote
                        (?(1) (.*?)\\1 | ([^\s\>]+))        # if quote found, match up to next matching
                                                    # quote, otherwise match up to next space
                        'isx",$document,$links);

        // catenate the non-empty matches from the conditional subpattern:
        // group 2 holds quoted values, group 3 unquoted ones
        $match = array();
        foreach(array(2, 3) as $group)
        {
            if(!isset($links[$group]))
                continue;

            foreach($links[$group] as $val)
            {
                if(!empty($val))
                    $match[] = $val;
            }
        }

        // return the links
        return $match;
    }

/*======================================================================*\
    Function:   _striptext
    Purpose:    strip the text from an html document
    Input:      $document   document to strip.
    Output:     $text       the resulting text
\*======================================================================*/

    function _striptext($document)
    {
        // I didn't use preg eval (//e) since that is only available in PHP 4.0.
        // so, list your entities one by one here. I included some of the
        // more common ones.

        $search = array("'<script[^>]*?>.*?</script>'si",   // strip out javascript
                        "'<[\/\!]*?[^<>]*?>'si",            // strip out html tags
                        "'([\r\n])[\s]+'",                  // strip out white space
                        "'&(quote|#34);'i",                 // replace html entities
                        "'&(amp|#38);'i",
                        "'&(lt|#60);'i",
                        "'&(gt|#62);'i",
                        "'&(nbsp|#160);'i",
                        "'&(iexcl|#161);'i",
                        "'&(cent|#162);'i",
                        "'&(pound|#163);'i",
                        "'&(copy|#169);'i"
                        );
        $replace = array(   "",
                            "",
                            "\\1",
                            "\"",
                            "&",
                            "<",
                            ">",
                            " ",
                            chr(161),
                            chr(162),
                            chr(163),
                            chr(169));

        return preg_replace($search,$replace,$document);
    }

/*======================================================================*\
    Function:   _expandlinks
    Purpose:    expand each link into a fully qualified URL
    Input:      $links          the links to qualify (string or array)
                $URI            the full URI to get the base from
    Output:     $expandedLinks  the expanded links, same shape as $links
\*======================================================================*/

    function _expandlinks($links,$URI)
    {
        // drop the query string from the base URI
        preg_match("/^[^\?]+/",$URI,$match);
        $base = isset($match[0]) ? $match[0] : "";

        // separate "scheme://authority" from the path so that a bare
        // host such as http://example.com is never mistaken for a file
        if(preg_match('|^([a-z][a-z0-9+.\-]*://[^/]*)(.*)$|is',$base,$split))
        {
            $root = $split[1];
            $dir  = $split[2];
        }
        else
        {
            $root = "";
            $dir  = $base;
        }

        // reduce the path to its directory: drop a trailing "name.ext"
        // segment, then a trailing slash
        $dir = preg_replace('|/[^\/\.]+\.[^\/\.]+$|',"",$dir);
        $dir = preg_replace('|/$|',"",$dir);

        if(is_array($links))
        {
            $expandedLinks = array();
            foreach($links as $key => $link)
                $expandedLinks[$key] = $this->_expandlink($link,$root,$dir);
            return $expandedLinks;
        }

        return $this->_expandlink($links,$root,$dir);
    }

/*======================================================================*\
    Function:   _expandlink
    Purpose:    qualify a single link against a base
    Input:      $link   the link as found in the document
                $root   "scheme://authority" of the base URI
                $dir    directory part of the base path, without a
                        trailing slash
    Output:     the qualified link
\*======================================================================*/

    function _expandlink($link,$root,$dir)
    {
        $link = (string)$link;

        if(preg_match('|^[a-z][a-z0-9+.\-]*:|i',$link))
            $expanded = $link;                      // already has a scheme (http:, mailto:, ...)
        elseif(substr($link,0,1) == "/")
            $expanded = $root.$link;                // relative to the site root
        else
            $expanded = $root.$dir."/".$link;       // relative to the page directory

        // collapse "/./" and "/segment/../"
        return preg_replace(array('|/\./|','|/[^\/]+/\.\./|'),"/",$expanded);
    }

/*======================================================================*\
    Function:   _httprequest
    Purpose:    go get the http data from the server
    Input:      $url            the request target to put on the request line
                $fp             the current open file pointer
                $URI            the full URI
                $http_method    the request method (GET, POST, ...)
                $content_type   Content-type header value, if any
                $body           body contents to send if any (POST)
    Output:     $this->headers, $this->status, $this->response_code,
                $this->results, $this->_redirectaddr, $this->_frameurls
                returns false when a read timed out, true otherwise
\*======================================================================*/

    function _httprequest($url,$fp,$URI,$http_method,$content_type="",$body="")
    {
        // replay the cookies set by the response that redirected us here
        if($this->passcookies && $this->_redirectaddr)
            $this->setcookies();

        $URI_PARTS = parse_url($URI);
        $scheme = (is_array($URI_PARTS) && !empty($URI_PARTS["scheme"])) ? $URI_PARTS["scheme"] : "http";
        if(empty($url))
            $url = "/";
        $body = (string)$body;

        $headers = $http_method." ".$url." ".$this->_httpversion."\r\n";
        if(!empty($this->agent))
            $headers .= "User-Agent: ".$this->agent."\r\n";
        if(!empty($this->host))
        {
            $headers .= "Host: ".$this->host;
            // a non-default port is part of the Host header
            if(!empty($this->port) && $this->port != 80)
                $headers .= ":".$this->port;
            $headers .= "\r\n";
        }
        if(!empty($this->accept))
            $headers .= "Accept: ".$this->accept."\r\n";
        if(!empty($this->referer))
            $headers .= "Referer: ".$this->referer."\r\n";
        if(!empty($this->cookies))
        {
            if(!is_array($this->cookies))
                $this->cookies = (array)$this->cookies;

            foreach($this->cookies as $cookieKey => $cookieVal)
                $headers .= "Cookie: ".$cookieKey."=".$cookieVal."\r\n";
        }
        if(!empty($this->rawheaders))
        {
            if(!is_array($this->rawheaders))
                $this->rawheaders = (array)$this->rawheaders;

            foreach($this->rawheaders as $headerKey => $headerVal)
                $headers .= $headerKey.": ".$headerVal."\r\n";
        }
        if(!empty($content_type))
            $headers .= "Content-type: $content_type\r\n";
        if($body !== "")
            $headers .= "Content-length: ".strlen($body)."\r\n";
        if(!empty($this->user) || !empty($this->pass))
            $headers .= "Authorization: BASIC ".base64_encode($this->user.":".$this->pass)."\r\n";

        $headers .= "\r\n";

        // set the read timeout if needed
        if($this->read_timeout > 0)
            socket_set_timeout($fp, $this->read_timeout);
        $this->timed_out = false;

        fwrite($fp,$headers.$body);

        $this->_redirectaddr = false;
        $this->headers = array();

        while(($currentHeader = fgets($fp,$this->_maxlinelen)) !== false)
        {
            if($this->_check_timeout($fp))
            {
                $this->status = -100;
                return false;
            }

            // an empty line ends the header section
            if($currentHeader == "\r\n" || $currentHeader == "\n")
                break;

            // if a header begins with Location: or URI:, set the redirect
            if(preg_match('/^(?:Location|URI):\s*(.*)$/i',rtrim($currentHeader),$matches) && $matches[1] !== "")
            {
                $location = $matches[1];
                if(preg_match('|^[a-z][a-z0-9+.\-]*://|i',$location))
                {
                    // already a full URL
                    $this->_redirectaddr = $location;
                }
                else
                {
                    // only a path was given, so prepend scheme, host and port
                    $this->_redirectaddr = $scheme."://".$this->host;
                    if(!empty($this->port))
                        $this->_redirectaddr .= ":".$this->port;
                    // eliminate double slash
                    if(substr($location,0,1) != "/")
                        $this->_redirectaddr .= "/";
                    $this->_redirectaddr .= $location;
                }
            }

            if(preg_match("|^HTTP/|",$currentHeader))
            {
                if(preg_match("|^HTTP/[^\s]*\s(.*?)\s|",$currentHeader,$status))
                    $this->status = $status[1];
                $this->response_code = $currentHeader;
            }

            $this->headers[] = $currentHeader;
        }

        // one fread() on a socket returns at most a chunk, so keep
        // reading until end of stream or until $maxlength is reached
        $results = "";
        while(!feof($fp) && strlen($results) < $this->maxlength)
        {
            $chunk = fread($fp,$this->maxlength - strlen($results));
            if($chunk === false || $chunk === "")
                break;
            $results .= $chunk;
        }

        if($this->_check_timeout($fp))
        {
            $this->status = -100;
            return false;
        }

        // check if there is a a redirect meta tag
        if(preg_match("'<meta[\s]*http-equiv[^>]*?content[\s]*=[\s]*[\"\']?\d+;[\s]+URL[\s]*=[\s]*([^\"\']*?)[\"\']?>'i",$results,$match))
        {
            $this->_redirectaddr = $this->_expandlinks($match[1],$URI);
        }

        // have we hit our frame depth and is there frame src to fetch?
        if(($this->_framedepth < $this->maxframes) && preg_match_all("'<frame[\s]*src[\s]*=[\'\"]?([^\'\"\>]+)'i",$results,$match))
        {
            // results becomes a list of documents once frames are involved
            if(!is_array($this->results))
                $this->results = array();
            $this->results[] = $results;

            foreach($match[1] as $framesrc)
                $this->_frameurls[] = $this->_expandlinks($framesrc,$URI);
        }
        // have we already fetched framed content?
        elseif(is_array($this->results))
            $this->results[] = $results;
        // no framed content
        else
            $this->results = $results;

        return true;
    }

/*======================================================================*\
    Function:   setcookies()
    Purpose:    set cookies for a redirection: copy every Set-Cookie
                name/value pair found in $this->headers into
                $this->cookies
\*======================================================================*/

    function setcookies()
    {
        if(!is_array($this->headers))
            return;

        foreach($this->headers as $header)
        {
            // stop the value at ";" or the line end so that the CRLF
            // terminating the header never leaks into the cookie
            if(preg_match('/^set-cookie:[\s]+([^=]+)=([^;\r\n]+)/i',$header,$match))
                $this->cookies[$match[1]] = $match[2];
        }
    }

/*======================================================================*\
    Function:   _check_timeout
    Purpose:    checks whether timeout has occurred
    Input:      $fp file pointer
    Output:     true (and $this->timed_out set) when the last read
                timed out; always false when $read_timeout is 0
\*======================================================================*/

    function _check_timeout($fp)
    {
        if($this->read_timeout > 0)
        {
            $fp_status = socket_get_status($fp);
            if($fp_status["timed_out"])
            {
                $this->timed_out = true;
                return true;
            }
        }
        return false;
    }

/*======================================================================*\
    Function:   _connect
    Purpose:    make a socket connection to the proxy when both
                $proxy_host and $proxy_port are set, otherwise to
                $host:$port
    Input:      $fp file pointer (set by reference)
    Output:     true on success; on failure false, with $this->status
                holding the errno and $this->error a message
\*======================================================================*/

    function _connect(&$fp)
    {
        if(!empty($this->proxy_host) && !empty($this->proxy_port))
        {
            $this->_isproxy = true;
            $host = $this->proxy_host;
            $port = $this->proxy_port;
        }
        else
        {
            $this->_isproxy = false;
            $host = $this->host;
            $port = $this->port;
        }

        $this->status = 0;

        // fsockopen() takes errno/errstr by reference on its own; marking
        // them with & at the call site is a fatal error since PHP 5.4
        $errno = 0;
        $errstr = "";
        $fp = fsockopen($host,$port,$errno,$errstr,$this->_fp_timeout);

        if($fp)
        {
            // socket connection succeeded
            return true;
        }

        // socket connection failed
        $this->status = $errno;
        switch($errno)
        {
            case -3:
                $this->error = "socket creation failed (-3)";
                break;
            case -4:
                $this->error = "dns lookup failure (-4)";
                break;
            case -5:
                $this->error = "connection refused or timed out (-5)";
                break;
            default:
                $this->error = "connection failed (".$errno.")";
        }
        return false;
    }

/*======================================================================*\
    Function:   _disconnect
    Purpose:    disconnect a socket connection
    Input:      $fp file pointer
    Output:     the result of fclose()
\*======================================================================*/

    function _disconnect($fp)
    {
        return(fclose($fp));
    }

}

?>


index.php | castor.php | engines.php | snoopy.class.php | submit.php ··· back to urlator