'''
Exercise 17: Decode A Web Page

This is the first 4-chili exercise of this blog! We’ll see what people think,
and decide whether or not to continue with 4-chili exercises in the future.

Use the BeautifulSoup and requests Python packages to print out a list of all
the article titles on the New York Times homepage.

'''
# Solution
import requests
from bs4 import BeautifulSoup

def get_html_content_in_text(url):
    """
    Fetch the web page at url and return its content as text.

    Arguments:
    url -- a webpage url string.

    Returns:
    r.text -- the content of the webpage as text.

    """
    r = requests.get(url)
    return r.text

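# Optional: a minimal, more defensive sketch of the fetch helper. It is not
# part of the original solution and the function name is illustrative. It
# assumes you want to fail fast on HTTP errors and avoid hanging indefinitely;
# requests.get(timeout=...) and Response.raise_for_status() are standard
# requests features.
def get_html_content_in_text_checked(url, timeout=10):
    """Like get_html_content_in_text, but raise on HTTP errors and time out."""
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()  # raises requests.HTTPError for 4xx/5xx responses
    return r.text
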
def main():
    content = get_html_content_in_text('http://www.nytimes.com/')
    # html5lib is the most lenient parser; it requires the html5lib package
    # (pip install html5lib) in addition to beautifulsoup4.
    soup = BeautifulSoup(content, "html5lib")
    # On the 2018-era New York Times homepage, every headline sits in an
    # element with the class "story-heading".
    for element in soup.find_all(class_="story-heading"):
        if element.a:
            # Headline wrapped in a link: print the link text on one line.
            print(element.a.text.replace("\n", " ").strip())
        else:
            # Headline whose first child node is bare text.
            print(element.contents[0].strip())

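# The "story-heading" class was specific to the 2018 homepage layout and may
# no longer exist. The sketch below is a hedged fallback, not the original
# solution: it simply prints the text of every h2/h3 heading on the page, so
# it should catch headlines on most layouts but may also print some
# non-headline headings. The function name is illustrative and it is not
# called by main().
def print_headings_generic(soup):
    """Print the stripped text of every <h2> and <h3> element in soup."""
    for heading in soup.find_all(["h2", "h3"]):
        text = heading.get_text(strip=True)
        if text:
            print(text)
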
if __name__ == "__main__":
    main()

# Test Part - March 29, 2018
# Trump Lawyer Broached Idea of Pardons for 2 Top Ex-Aides
# Trump Aide Spoke in ’16 to Person Tied to Russia Intelligence
# Justice Dept. Opens Internal Investigation on Surveillance of Trump Campaign Official
# ‘Kiss Up, Kick Down’: Recalling Bolton’s Confirmation in 2005
# Veterans Affairs Chief Is Out, as Trump’s Shake-Up Continues
# The Trump Administration’s Major Departures