"""This is a recursive web crawler.  Don't go pointing this at random sites;
it doesn't respect robots.txt and it is pretty brutal about how quickly it 
fetches pages.

This is a kind of "producer/consumer" example; the fetch function produces 
jobs, and the GreenPool itself is the consumer, farming out work concurrently.  
It's easier to write it this way rather than writing a standard consumer loop;
GreenPool handles any exceptions raised and arranges so that there's a set
number of "workers", so you don't have to write that tedious management code 
yourself.
"""
from __future__ import with_statement

from eventlet.green import urllib2
import eventlet
import re

# http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
# Note: Python's re module has no POSIX [:punct:] class, so the original
# pattern's trailing [^[:punct:]\s] is spelled out here as an explicit
# (approximate) punctuation set.
url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+'
                       r'(?:\([\w\d]+\)|([^\s`!()\[\]{};:\'".,<>?]|/)))')


def fetch(url, outq):
    """Fetch a url and push any urls found into a queue."""
    print "fetching", url
    data = ''
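    # Timeout's second argument (False) says: don't raise when the 5 seconds
    # run out, just silently abandon the with-block, leaving data = ''.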
    with eventlet.Timeout(5, False):
        data = urllib2.urlopen(url).read()
    for url_match in url_regex.finditer(data):
        new_url = url_match.group(0)
        outq.put(new_url)


def producer(start_url):
    """Recursively crawl starting from *start_url*.  Returns a set of 
    urls that were found."""
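    # GreenPool caps how many fetches run at once; the default size is 1000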
    pool = eventlet.GreenPool()
    seen = set()
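    # unbounded queue of urls to crawl; fetch() feeds new finds back into it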
    q = eventlet.Queue()
    q.put(start_url)
    # keep looping if there are new urls, or workers that may produce more urls
    while True: 
        while not q.empty():
            url = q.get()
            # limit requests to eventlet.net so we don't crash all over the internet
            if url not in seen and 'eventlet.net' in url:
                seen.add(url)
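                # spawn_n is fire-and-forget: unlike spawn it returns no
                # GreenThread, and an exception in fetch is printed to
                # stderr without stopping the crawl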
                pool.spawn_n(fetch, url, q)
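        # wait for every in-flight fetch to finish; each one may have put
        # more urls on the queue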
        pool.waitall()
        if q.empty():
            break

    return seen
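

# For contrast, a minimal sketch (not part of the original example) of the
# "standard consumer loop" that the module docstring mentions: a fixed set
# of worker greenthreads pulling urls off the queue.  num_workers and the
# function name are illustrative; task_done()/join() mirror the stdlib
# Queue API, which eventlet's Queue also provides.
def producer_with_workers(start_url, num_workers=10):
    seen = set()
    q = eventlet.Queue()
    q.put(start_url)

    def worker():
        while True:  # workers are simply left blocked on q.get() at the end
            url = q.get()
            try:
                if url not in seen and 'eventlet.net' in url:
                    seen.add(url)
                    fetch(url, q)
            finally:
                q.task_done()

    for _ in xrange(num_workers):
        eventlet.spawn_n(worker)
    q.join()  # returns once every queued url has been processed
    return seen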


seen = producer("http://eventlet.net")
print "I saw these urls:"
print "\n".join(seen)