Commit 7c56913

authored

Create webscraping.py

1 parent ebeaede, commit 7c56913 (Copy full SHA for 7c56913)

File tree

1 file changed

+114

-0

lines changed

WebScrapingScripts/Flipkart Mobiles Scraping
- webscraping.py

1 file changed

+114

-0

lines changed

`‎WebScrapingScripts/Flipkart Mobiles Scraping/webscraping.py‎`

Lines changed: 114 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,114 @@`
	`1`	`+# HTML Page Read and Upload`
	`2`	`+# Import useful libraries and classes.`
	`3`	`+`
	`4`	`+from urllib.request import urlopen as uReq`
	`5`	`+from bs4 import BeautifulSoup as soup`
	`6`	`+`
	`7`	`+# html page upload and read in web_page variable.`
	`8`	`+my_url= "https://www.flipkart.com/search?p%5B%5D=facets.brand%255B%255D%3DSamsung&sid=tyy%2F4io&sort=recency_desc&wid=1.productCard.PMU_V2_1"`
	`9`	`+web_page= uReq(my_url)`
	`10`	`+page_html= web_page.read()`
	`11`	`+`
	`12`	`+# Parsing`
	`13`	`+# html parser. It is to beautify the HTML code.`
	`14`	`+page_soup= soup(page_html)`
	`15`	`+`
	`16`	`+# Extraction of information`
	`17`	`+# read class attribute from web page in containers variable.`
	`18`	`+# Print the length of containers.`
	`19`	`+`
	`20`	`+containers= page_soup.findAll("div", {"class": "_2kHMtA"})`
	`21`	`+print(len(containers))`
	`22`	`+`
	`23`	`+# Extracting Product Name`
	`24`	`+# The product_name_list contains the name of product extracted using find function using div tag and`
	`25`	`+# class name`
	`26`	`+`
	`27`	`+product_name_list = []`
	`28`	`+`
	`29`	`+for box in containers:`
	`30`	`+ # name of product is extracted using div tag with class name as given in website`
	`31`	`+ product_name = box.find("div", class_="_4rR01T")`
	`32`	`+ # the extracted names is stored in a list`
	`33`	`+ product_name_list.append(product_name.string)`
	`34`	`+`
	`35`	`+# Extracting Ratings`
	`36`	`+# The rating_list contains the ratings of product extracted using find function using div tag and class name`
	`37`	`+`
	`38`	`+rating_list = []`
	`39`	`+`
	`40`	`+for box in containers:`
	`41`	`+ # rating of product is extracted using div tag with class name as given in website, if the rating is None, then 0.0 is used.`
	`42`	`+ rating = box.find("div", class_="_3LWZlK")`
	`43`	`+ if rating != None:`
	`44`	`+ rating_list.append(rating.text)`
	`45`	`+ else:`
	`46`	`+ rating_list.append('0.0')`
	`47`	`+# Extracting Price of Product`
	`48`	`+# The price_list contains the price of product extracted using find function using div tag and class name`
	`49`	`+price_list = []`
	`50`	`+`
	`51`	`+for box in containers:`
	`52`	`+ # price of product is extracted using div tag with class name as given in website`
	`53`	`+ price = box.find("div", class_="_30jeq3")`
	`54`	`+ # the extracted price is stored in a list after string Rupees sign.`
	`55`	`+ price_list.append(price.string.strip('₹'))`
	`56`	`+`
	`57`	`+# The container in website contains a list of information of the product which is`
	`58`	`+# extracted using find function using li tag and class name`
	`59`	`+# here n is length of containers in 1 page.`
	`60`	`+`
	`61`	`+n = len(containers)`
	`62`	`+`
	`63`	`+# list to store RAM of phones`
	`64`	`+ram_list = []`
	`65`	`+# list to store ROM of phones`
	`66`	`+rom_list = []`
	`67`	`+# list to store Display Screen of phones`
	`68`	`+display_list = []`
	`69`	`+# list to store Camera Specification of phones`
	`70`	`+camera_list = []`
	`71`	`+# list to store Battery Life of phones`
	`72`	`+battery_life_list = []`
	`73`	`+# list to store Warranty Period of phones`
	`74`	`+warranty_list = []`
	`75`	`+# temporary list to store the all the list of phones's specifications`
	`76`	`+temp_list = []`
	`77`	`+`
	`78`	`+`
	`79`	`+for box in containers:`
	`80`	`+ # one list out of all product list is extracted using li tag with class name as given in website`
	`81`	`+ temp_box = box.findAll("li", class_="rgWa7D")`
	`82`	`+ temp_list.append(temp_box)`
	`83`	`+`
	`84`	`+for i in range(n):`
	`85`	`+ # this loop extracts the values stored in the list of one container.`
	`86`	`+ # since in the website the RAM & ROM of phoes are listed together`
	`87`	`+ # so it is stored in a list and then splitted as per given splittor element.`
	`88`	`+ split_list = temp_list[i][0].string.split('\|')`
	`89`	`+ # the extracted RAM is stored in a list`
	`90`	`+ ram_list.append(split_list[0])`
	`91`	`+ # the extracted ROM is stored in a list`
	`92`	`+ rom_list.append(split_list[1])`
	`93`	`+ # the extracted display is stored in a list`
	`94`	`+ display_list.append(temp_list[i][1].string)`
	`95`	`+ # the extracted camera is stored in a list`
	`96`	`+ camera_list.append(temp_list[i][2].string)`
	`97`	`+ # the extracted battery is stored in a list`
	`98`	`+ battery_life_list.append(temp_list[i][3].string)`
	`99`	`+ # the extracted warranty is stored in a list`
	`100`	`+ warranty_list.append(temp_list[i][-1].string)`
	`101`	`+`
	`102`	`+# Creating Pandas DataFrame from Data scraped from Web`
	`103`	`+# Importing Pandas to create a DataFrame`
	`104`	`+import pandas as pd`
	`105`	`+# Creating a Dictionary to store List values and creating DataFrame`
	`106`	`+dictionary = {'Product_Name':product_name_list, 'Ratings':rating_list, 'Price':price_list, 'RAM_Storage':ram_list,`
	`107`	`+ 'ROM_Storage':rom_list, 'Display_Screen':display_list, 'Camera':camera_list, 'Battery_Life':battery_life_list,`
	`108`	`+ 'Warranty_Life':warranty_list}`
	`109`	`+dataframe = pd.DataFrame(dictionary)`
	`110`	`+# Head of DataFrame`
	`111`	`+dataframe.head()`
	`112`	`+`
	`113`	`+# Tail of DataFrame`
	`114`	`+dataframe.tail()`

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

Commit 7c56913

File tree

1 file changed

1 file changed

`‎WebScrapingScripts/Flipkart Mobiles Scraping/webscraping.py‎`

0 commit comments