Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 7c56913

Browse files
Create webscraping.py
1 parent ebeaede commit 7c56913

File tree

1 file changed

+114
-0
lines changed

1 file changed

+114
-0
lines changed
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
# HTML Page Read and Upload
2+
# Import useful libraries and classes.
3+
4+
from urllib.request import urlopen as uReq
5+
from bs4 import BeautifulSoup as soup
6+
# Download the search-results page and keep its raw HTML in page_html.
my_url = "https://www.flipkart.com/search?p%5B%5D=facets.brand%255B%255D%3DSamsung&sid=tyy%2F4io&sort=recency_desc&wid=1.productCard.PMU_V2_1"
# The context manager closes the HTTP response as soon as the body is read,
# instead of leaving the connection open for the rest of the script.
with uReq(my_url) as web_page:
    page_html = web_page.read()

# Parsing
# Build a searchable tree from the raw HTML. The parser is named explicitly so
# the result does not depend on which optional parsers happen to be installed.
page_soup = soup(page_html, "html.parser")

# Extraction of information
# Collect every <div> whose class is "_2kHMtA" into containers; the loops
# further down read one product's details out of each of these elements.
# Print how many were found.

# find_all is the current name of the deprecated findAll alias, and class_
# matches the keyword style used by the other lookups in this script.
containers = page_soup.find_all("div", class_="_2kHMtA")
print(len(containers))

# Extracting Product Name
# product_name_list holds, for each container, the text of its
# <div class="_4rR01T"> element (the product name).

product_name_list = []

for box in containers:
    # Look up the name element inside this container.
    product_name = box.find("div", class_="_4rR01T")
    # find() returns None when the element is absent. Store None in that case
    # so this list keeps exactly one entry per container and stays aligned
    # with the other column lists.
    product_name_list.append(
        product_name.string if product_name is not None else None
    )

# Extracting Ratings
# rating_list holds, for each container, the text of its
# <div class="_3LWZlK"> element (the rating).

rating_list = []

for box in containers:
    rating = box.find("div", class_="_3LWZlK")
    # Identity test against None: find() returns None when the element is
    # absent. Such containers get the placeholder '0.0'.
    if rating is not None:
        rating_list.append(rating.text)
    else:
        rating_list.append('0.0')
# Extracting Price of Product
# price_list holds, for each container, the text of its
# <div class="_30jeq3"> element with the Rupee sign stripped from both ends.
price_list = []

for box in containers:
    price = box.find("div", class_="_30jeq3")
    # Either the element or its single-string content may be missing; store
    # None then so the list keeps one entry per container.
    price_text = price.string if price is not None else None
    price_list.append(
        price_text.strip('₹') if price_text is not None else None
    )

# Each container holds a list of <li class="rgWa7D"> entries describing the
# product. They are read by position: entry 0 is split on '|' into RAM and
# ROM, entries 1-3 are display, camera and battery, and the last entry is the
# warranty.
# n is the number of containers found on this page.

n = len(containers)

# list to store RAM of phones
ram_list = []
# list to store ROM of phones
rom_list = []
# list to store Display Screen of phones
display_list = []
# list to store Camera Specification of phones
camera_list = []
# list to store Battery Life of phones
battery_life_list = []
# list to store Warranty Period of phones
warranty_list = []
# temporary list holding, per container, all of its specification entries
temp_list = []


for box in containers:
    # All specification <li> elements of this container.
    temp_box = box.find_all("li", class_="rgWa7D")
    temp_list.append(temp_box)

for temp_box in temp_list:
    # Text of every entry in this container (None where an entry is not a
    # single string).
    values = [item.string for item in temp_box]

    # RAM and ROM are listed together in the first entry, separated by '|'.
    # A container with no entries, or a first entry without text, gives an
    # empty split_list, so both fields fall back to None.
    first = values[0] if values else None
    split_list = first.split('|') if first is not None else []
    ram_list.append(split_list[0] if split_list else None)
    rom_list.append(split_list[1] if len(split_list) > 1 else None)

    # Positional entries: store None when a container has fewer entries than
    # expected, so every list keeps one value per container.
    display_list.append(values[1] if len(values) > 1 else None)
    camera_list.append(values[2] if len(values) > 2 else None)
    battery_life_list.append(values[3] if len(values) > 3 else None)
    # The warranty is taken from the last entry, whatever its position.
    warranty_list.append(values[-1] if values else None)
# Creating Pandas DataFrame from Data scraped from Web
# pandas supplies the DataFrame type used below.
import pandas as pd

# Map each column title to the list that holds that column's scraped values.
dictionary = {
    'Product_Name': product_name_list,
    'Ratings': rating_list,
    'Price': price_list,
    'RAM_Storage': ram_list,
    'ROM_Storage': rom_list,
    'Display_Screen': display_list,
    'Camera': camera_list,
    'Battery_Life': battery_life_list,
    'Warranty_Life': warranty_list,
}
dataframe = pd.DataFrame(dictionary)

# First rows of the DataFrame (the value is only echoed in an interactive
# session; as a plain script statement the result is discarded).
dataframe.head()

# Last rows of the DataFrame
dataframe.tail()

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /