#http://www.pbs.org/wnet/nature/episodes/revealing-the-leopard/full-episode/6084/
#
#Part 1: Read the file into a list of lines
#Part 2: Loop through the list of lines and look for keywords
#Part 3: Print out the results in HTML format
#
###############################################################################
import os, sys, re, datetime, time, urllib2, string
def get_content(url):
    """Open *url* and return the response object, or None on failure.

    Args:
        url: The address to open; passed unchanged to ``urlopen``.

    Returns:
        The file-like response object returned by ``urlopen`` when the
        request succeeds.  If ``urlopen`` raises ``HTTPError`` or
        ``URLError``, a short message is printed and None is returned.

    Only those two exception types are handled here; any other exception
    raised by ``urlopen`` propagates to the caller.
    """
    # Resolve the urllib names locally so this function works whether the
    # interpreter provides urllib2 (Python 2) or urllib.request (Python 3).
    try:
        from urllib2 import HTTPError, URLError, urlopen
    except ImportError:
        from urllib.error import HTTPError, URLError
        from urllib.request import urlopen

    try:
        page = urlopen(url)
    # HTTPError is a subclass of URLError, so it has to be caught first.
    except HTTPError as e:
        print("HTTP Error code: %s" % e.code)
    except URLError as e:
        print("URL Error reason: %s" % e.reason)
    else:
        return page
    return None
def get_words(line):
    """Return the whitespace-separated words of *line* as a list.

    The line is split on runs of whitespace (spaces, tabs, newlines) only,
    never on punctuation, so tokens such as "Mr.", "1/23/2010", "A&B" or a
    URL each stay in one piece.  Leading and trailing whitespace is ignored,
    and an empty or all-whitespace line yields an empty list.

    Args:
        line: The text to split.

    Returns:
        A list of the words found in *line*, in order.
    """
    # str.split() with no separator splits on any run of whitespace, which
    # is the behavior described above.  The earlier regular expression was
    # not written as a raw string, so its "\b" was a backspace character
    # instead of a word boundary and the pattern never matched normal text.
    return line.split()
def find_keyword(line):
    """Report whether *line* contains the keyword "rat".

    The search is case-insensitive and matches the letters anywhere in the
    line, including inside longer words (for example "Rating" matches).

    Args:
        line: The text to search.

    Returns:
        True if the keyword occurs in *line*, otherwise False.
    """
    match = re.search('rat', line, re.IGNORECASE)
    # re.search returns None when nothing matched; compare by identity and
    # return an explicit bool so the no-match result is never implicit.
    return match is not None