Learning to Crawl the Web


# Start by finding all the links on the seed page
# and store them in a list.
# Then go through the links in that list to continue the crawl, and keep going as long as there are more pages to crawl.

### The first step is to define a procedure get_all_links that takes as input a string
### that represents the text on a web page and produces
### as output a list containing all the URLs that are targets of link tags
### on that page.

import urllib

def get_all_links(page):
    """Return a list of the link targets found in ``page``.

    Repeatedly asks ``get_next_target`` for the next URL and the position
    where it ends, then keeps scanning the text that follows that position.
    Scanning stops at the first falsy URL (``None`` or an empty string).

    Args:
        page: Text of a web page.

    Returns:
        A list of URL strings in the order they were found; empty when no
        link is found.
    """
    links = []
    while True:
        url, endpos = get_next_target(page)
        if not url:
            # A falsy URL is the "no more links" signal.
            break
        print("%s printing from get_all_links procedure" % url)
        links.append(url)
        # Discard everything up to the end of this URL so the next search
        # starts after it.
        page = page[endpos:]
    return links

def crawl_web(seed):
    """Crawl outward from ``seed`` and return the list of pages visited.

    Keeps a work list of pages still to visit. Each iteration pops the most
    recently added page; if it has not been crawled yet, the page is
    downloaded once, its links are merged into the work list, and the page
    is recorded as crawled. The loop ends when the work list is empty.

    Args:
        seed: URL of the page to start from.

    Returns:
        A list of the URLs that were crawled, in the order they were
        visited.
    """
    tocrawl = [seed]
    crawled = []
    print("printing tocrawl before while loop %s" % (tocrawl,))
    while tocrawl:
        page = tocrawl.pop()
        print("printing tocrawl before if statement %s %s" % (tocrawl, page))
        if page not in crawled:
            # Download the page a single time and reuse the result for both
            # the trace output and link extraction.
            content = get_page(page)
            print("printing content%s" % content)
            links = get_all_links(content)
            print("printing links and to crawl variables %s %s"
                  % (links, tocrawl))
            union(tocrawl, links)
            print("%s after union" % (tocrawl,))
            crawled.append(page)
            print(crawled)
    return crawled

def union(p, q):
    """Append to ``p``, in place, every element of ``q`` not already in it.

    Args:
        p: List that is extended; it is mutated, not copied.
        q: Iterable of candidate elements.

    Returns:
        None. The result is left in ``p``.
    """
    for e in q:
        if e not in p:
            p.append(e)

def get_page(url):
    """Download ``url`` and return its content as text.

    This is a best-effort fetch: any error while opening or reading the URL
    results in an empty string instead of an exception.

    Args:
        url: Address of the page to download.

    Returns:
        The page content as a string, or "" when the page could not be
        fetched.
    """
    # urlopen lives in a different module depending on the Python version.
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    try:
        response = urlopen(url)
        try:
            data = response.read()
        finally:
            # Always release the connection, even if the read fails.
            response.close()
    except Exception:
        # Deliberately broad to keep the fetch best-effort, but unlike a bare
        # ``except`` this lets KeyboardInterrupt and SystemExit through.
        return ""

    # On Python 3 read() yields bytes; callers search the page with str
    # methods, so convert to text. On Python 2 the data is already a str.
    if not isinstance(data, str):
        data = data.decode("utf-8", "replace")
    return data

# return url

def get_next_target(page):
    """Find the first link target in ``page``.

    Looks for the first ``<a href=`` and takes the text between the next
    pair of double quotes as the URL.

    Args:
        page: Text to search.

    Returns:
        A tuple ``(url, end_quote)`` where ``end_quote`` is the index in
        ``page`` of the URL's closing double quote. Returns ``(None, 0)``
        when ``page`` has no ``<a href=`` or the URL is not enclosed in a
        complete pair of double quotes.
    """
    start_link = page.find('<a href=')
    if start_link == -1:
        # No link tag: without this check the -1 would be used as a search
        # offset and a bogus "url" would be sliced from unrelated text.
        return None, 0

    start_quote = page.find('"', start_link)
    if start_quote == -1:
        return None, 0

    end_quote = page.find('"', start_quote + 1)
    if end_quote == -1:
        return None, 0

    url = page[start_quote + 1:end_quote]
    print("printing url %s %s" % (url, end_quote))
    return url, end_quote

# Start the crawl from the blog's front page.
crawl_web('https://mewsigma.wordpress.com')

Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s