Learning to Crawl the Web

# Start by finding all the links on the seed page
# and storing them in a list.
# Then go through the links in that list to continue the crawl, and keep going as long as there are more pages to crawl.

### The first step is to define a procedure get_all_links that takes as input a string
### that represents the text on a web page and produces
### as output a list containing all the URLs that are targets of link tags
### on that page.

import urllib

def get_all_links(page):
    """Collect every link target found in ``page``.

    Repeatedly asks ``get_next_target`` for the next URL and then continues
    scanning from just past the position it reports.

    Args:
        page: Text of a web page.

    Returns:
        A list of the URLs found, in the order they appear in ``page``.
    """
    links = []
    while True:
        url, endpos = get_next_target(page)
        # A falsy url means no further link was found. A non-positive endpos
        # would not advance the scan, so stop rather than risk looping forever.
        if not url or endpos <= 0:
            break
        links.append(url)
        print('%s %s' % (url, 'printing from get_all_links procedure'))
        page = page[endpos:]
    return links

def crawl_web(seed):
    """Crawl the web starting from ``seed`` and following every link found.

    Pages are taken from the end of the pending list, so the most recently
    discovered link is visited first. Each page is fetched once with
    ``get_page``, its links are extracted with ``get_all_links`` and merged
    into the pending list with ``union``. There is no depth or page limit:
    the crawl ends only when no pending pages remain.

    Args:
        seed: URL of the page to start crawling from.

    Returns:
        A list of the URLs that were crawled, in the order they were visited.
    """
    tocrawl = [seed]
    crawled = []
    print('printing tocrawl before while loop %s' % (tocrawl,))
    while tocrawl:
        page = tocrawl.pop()
        print('printing tocrawl before if statement %s %s' % (tocrawl, page))
        if page not in crawled:
            # Fetch once and reuse the content instead of re-downloading it.
            content = get_page(page)
            print('printing content%s' % (content,))
            links = get_all_links(content)
            print('printing links and to crawl variables %s %s'
                  % (links, tocrawl))
            union(tocrawl, links)
            print('%s after union' % (tocrawl,))
            # Record the visit so pages that link to each other are not
            # crawled again.
            crawled.append(page)
            print(crawled)
    return crawled

def union(p, q):
    """Append to ``p`` every element of ``q`` that ``p`` does not yet contain.

    ``p`` is modified in place and nothing is returned. Elements already in
    ``p`` keep their position; new elements are added in the order they
    appear in ``q``.

    Args:
        p: List to extend.
        q: Iterable of candidate elements.
    """
    for e in q:
        if e not in p:
            p.append(e)

def get_page(url):
    """Fetch ``url`` and return its content as text.

    The fetch is best-effort: if the URL is malformed or the request fails,
    an empty string is returned so the caller can carry on with other pages.

    Args:
        url: Address of the page to download.

    Returns:
        The page content as a string, or ``""`` if it could not be fetched.
    """
    # Imported locally so the function works on both Python 3 and Python 2.
    try:
        from urllib.request import urlopen
        from http.client import HTTPException
    except ImportError:
        from urllib import urlopen
        from httplib import HTTPException

    try:
        response = urlopen(url)
        try:
            raw = response.read()
        finally:
            # Always release the connection, even if the read fails.
            response.close()
    except (IOError, ValueError, HTTPException):
        return ""

    # Python 3 returns bytes; decode so callers can search with str methods.
    if not isinstance(raw, str):
        raw = raw.decode('utf-8', 'replace')
    return raw

# return url

def get_next_target(page):
    """Find the first link target in ``page``.

    Looks for the first ``<a href=`` and takes the text between the next
    pair of double quotes as the URL.

    Args:
        page: Text of a web page, or the not-yet-scanned remainder of one.

    Returns:
        A ``(url, end_quote)`` tuple, where ``end_quote`` is the index in
        ``page`` of the URL's closing quote. Returns ``(None, 0)`` when
        ``page`` contains no further complete link.
    """
    start_link = page.find('<a href=')
    if start_link == -1:
        # No link tag left; without this check the -1 would be used as a
        # search offset and yield a meaningless slice.
        return None, 0
    start_quote = page.find('"', start_link)
    if start_quote == -1:
        return None, 0
    end_quote = page.find('"', start_quote + 1)
    if end_quote == -1:
        # Unterminated attribute value: treat as no usable link.
        return None, 0
    url = page[start_quote + 1:end_quote]
    print('printing url %s %s' % (url, end_quote))
    return url, end_quote



Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google+ photo

You are commenting using your Google+ account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )


Connecting to %s