Commit acf90ea

authored

Merge pull request avinashkranjan#2934 from jaivsh/second

Added CNN web scraper

2 parents 79a66fb + 2fdeb80 commit acf90eaCopy full SHA for acf90ea

File tree

3 files changed

+120

-0

lines changed

CNN Scraper

3 files changed

+120

-0

lines changed

`‎CNN Scraper/README.md‎`

Lines changed: 6 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,6 @@`
	`1`	`+## CNN scraper`
	`2`	`+`
	`3`	`+This script scrapes CNN content from its website. Its functions are as follows:`
	`4`	`+`
	`5`	`+- news_by_location(): Provides news by location/country/continent`
	`6`	`+- news_by_category(): Provides news articles by category.`

`‎CNN Scraper/cnn.py‎`

Lines changed: 111 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,111 @@`
	`1`	`+from bs4 import BeautifulSoup`
	`2`	`+import requests`
	`3`	`+`
	`4`	`+`
	`5`	`+class NewsCNN:`
	`6`	`+ """`
	`7`	+ Create an instance of `NewsCNN` class.\n
	`8`	+ ```python
	`9`	`+ news = NewsCNN()`
	`10`	+ ```
	`11`	`+ \| Methods \| Details \|`
	`12`	`+ \| ---------------------------- \| -------------------------------------------------------------------------- \|`
	`13`	+ \| `.news_by_location(country="india)` \| Returns the list of articles by a specific country. \|
	`14`	+ \| `.news_by_category(type)` \| Returns the list of articles by a specific category. \|
	`15`	`+ """`
	`16`	`+`
	`17`	`+ def __init__(self):`
	`18`	`+ self.headers = {`
	`19`	`+ "User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win 64 ; x64) Apple WeKit /537.36(KHTML , like Gecko) Chrome/80.0.3987.162 Safari/537.36"`
	`20`	`+ }`
	`21`	`+`
	`22`	`+ def news_by_location(self, country: str):`
	`23`	`+ """`
	`24`	`+ Returns the relevant news articles corresponding to that particular geo-continent or country\n`
	`25`	+ Class - `NewsCNN`
	`26`	`+ Parameters: \n`
	`27`	`+ - country: Name of the country\n`
	`28`	+ ```python
	`29`	`+ news = newsCNN()`
	`30`	`+ news.news_by_location()`
	`31`	+ ```
	`32`	`+ """`
	`33`	`+`
	`34`	`+ try:`
	`35`	`+ sol = []`
	`36`	`+ obj_keys = ["news", "link"]`
	`37`	`+ location = country.lower()`
	`38`	`+ URL = f"https://edition.cnn.com/world/{location}"`
	`39`	`+ page = requests.get(URL)`
	`40`	`+ parse = BeautifulSoup(page.content, "html.parser")`
	`41`	`+ heads = parse.find_all("span", attrs={"data-editable": "headline"})`
	`42`	`+ links1 = parse.find_all(`
	`43`	`+ "a",`
	`44`	`+ attrs={`
	`45`	`+ "class": "container__link container_lead-plus-headlines-with-images__link"`
	`46`	`+ },`
	`47`	`+ )`
	`48`	`+ links2 = parse.find_all(`
	`49`	`+ "a", attrs={"class": "container__link container_vertical-strip__link"}`
	`50`	`+ )`
	`51`	`+ links3 = parse.find_all(`
	`52`	`+ "a",`
	`53`	`+ attrs={"class": "container__link container_lead-plus-headlines__link"},`
	`54`	`+ )`
	`55`	`+`
	`56`	`+ base = "https://edition.cnn.com/"`
	`57`	`+ allurls = []`
	`58`	`+ allheads = []`
	`59`	`+`
	`60`	`+ for i in heads:`
	`61`	`+ tmp = i.text`
	`62`	`+ allheads.append(tmp)`
	`63`	`+`
	`64`	`+ for i in links1 + links2 + links3:`
	`65`	`+ t = base + i["href"]`
	`66`	`+ allurls.append(t)`
	`67`	`+ allurls = list(set(allurls))`
	`68`	`+`
	`69`	`+ for i in range(len(allurls)):`
	`70`	`+ obj_values = [allheads[i], allurls[i]]`
	`71`	`+ new_obj = dict(zip(obj_keys, obj_values))`
	`72`	`+ sol.append(new_obj)`
	`73`	`+`
	`74`	`+ return sol`
	`75`	`+ except:`
	`76`	`+ return None`
	`77`	`+`
	`78`	`+ def news_by_category(self, type: str):`
	`79`	`+ """`
	`80`	`+ Returns a list of news articles from a specific category.`
	`81`	`+`
	`82`	`+ Parameters:`
	`83`	`+ - type (str): The category of news articles to retrieve. Allowable types are: "politics", "business", "opinions", "health", "style".`
	`84`	`+`
	`85`	`+ Returns:`
	`86`	`+ A list of dictionaries, each containing news article information including title and link, or an exception if an error occurs.`
	`87`	`+`
	`88`	`+ Example:`
	`89`	+ ```python
	`90`	`+ news = NewsCNN()`
	`91`	`+ politics_articles = news.news_by_category("politics")`
	`92`	+ ```
	`93`	`+ """`
	`94`	`+ try:`
	`95`	`+ sol = []`
	`96`	`+ type = type.lower()`
	`97`	`+ url = f"https://edition.cnn.com/{type}"`
	`98`	`+ page = requests.get(url, headers=self.headers)`
	`99`	`+ parse = BeautifulSoup(page.content, "html.parser")`
	`100`	`+ articles = parse.find_all(`
	`101`	`+ "a", {"class": "container__link container_lead-plus-headlines__link"}`
	`102`	`+ )`
	`103`	`+ for article in articles:`
	`104`	`+ text = article.find("span", {"data-editable": "headline"})`
	`105`	`+ if text:`
	`106`	`+ link = "https://edition.cnn.com" + article["href"]`
	`107`	`+ data = {"Title": text.text, "Link": link}`
	`108`	`+ sol.append(data)`
	`109`	`+ return sol`
	`110`	`+ except Exception as e:`
	`111`	`+ return e`

`‎CNN Scraper/requirements.txt‎`

Lines changed: 3 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+beautifulsoup4==4.9.1`
	`2`	`+bs4==0.0.1`
	`3`	`+requests==2.31.0`

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit acf90ea

File tree

3 files changed

3 files changed

`‎CNN Scraper/README.md‎`

`‎CNN Scraper/cnn.py‎`

`‎CNN Scraper/requirements.txt‎`

0 commit comments