Commit 2b88cb2

added scripts
1 parent ac7461c commit 2b88cb2

7 files changed: +386 additions, −15 deletions


FlipkartScraper/dbConnector.py

Lines changed: 44 additions & 0 deletions
import sqlite3
import os


class FlipkartDatabaseConnector:
    def __init__(self, stamp):
        # connects to (or creates) flipkart.db in the current working directory
        self.dbPath = "flipkart.db"
        self.conn = sqlite3.connect(self.dbPath)
        self.cur = self.conn.cursor()
        self.welcomeMessage = "Welcome to Flipkart Scraper. This is the database for the Flipkart Scraper. This database was created on {}.".format(stamp)

    def schemaMaker(self):
        # creating tables
        self.cur.execute("""CREATE TABLE products (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            sku TEXT NOT NULL,
            name TEXT NOT NULL,
            description TEXT NOT NULL,
            image_path TEXT NOT NULL,
            category TEXT NOT NULL,
            timestamp TEXT NOT NULL,
            URL TEXT NOT NULL,
            price TEXT NOT NULL
        );""")
        self.conn.commit()
        self.cur.execute("CREATE TABLE product_matches (id INTEGER PRIMARY KEY AUTOINCREMENT, product_id INTEGER NOT NULL, product_sku INTEGER NOT NULL, match_id INTEGER NOT NULL, match_sku INTEGER NOT NULL);")
        self.conn.commit()

    def insertProduct(self, productDetails):
        # parameterised insert; productDetails is the dict built by main.getProductDetails
        self.cur.execute("INSERT INTO products (sku, name, description, image_path, category, timestamp, URL, price) VALUES (?, ?, ?, ?, ?, ?, ?, ?)", (productDetails["sku"], productDetails["name"], productDetails["description"], productDetails["image_path"], productDetails["category"], productDetails["timestamp"], productDetails["URL"], productDetails["price"]))
        self.conn.commit()

    def fetchAllProducts(self):
        self.cur.execute("SELECT * FROM products")
        return self.cur.fetchall()

    def clearDatabase(self):
        self.cur.execute("DELETE FROM products")
        self.conn.commit()
        self.cur.execute("DELETE FROM product_matches")
        self.conn.commit()

    def removeDuplicates(self):
        # keep only the first row stored for each SKU
        self.cur.execute("DELETE FROM products WHERE rowid NOT IN (SELECT MIN(rowid) FROM products GROUP BY sku)")
        self.conn.commit()
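
The connector is a thin wrapper around sqlite3, so it can be exercised on its own. A minimal usage sketch (not part of the commit; the product values are made up, and it assumes it runs from the FlipkartScraper directory so flipkart.db is created next to the scripts):

# Minimal usage sketch for FlipkartDatabaseConnector (illustrative, not part of the commit).
from datetime import datetime
from dbConnector import FlipkartDatabaseConnector

stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
db = FlipkartDatabaseConnector(stamp)
db.schemaMaker()  # only needed once; raises sqlite3.OperationalError if the tables already exist

db.insertProduct({
    "sku": "MOBG6VF5Q82T3XYZ",                     # hypothetical example values
    "name": "Example Phone (64 GB)",
    "description": "Sample description",
    "image_path": "https://example.com/image.jpg",
    "category": "Mobiles",
    "timestamp": stamp,
    "URL": "https://www.flipkart.com/example-phone/p/itm123",
    "price": "9,999",
})

db.removeDuplicates()
print(db.fetchAllProducts())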

FlipkartScraper/flipkart.db

0 Bytes
Binary file not shown.

FlipkartScraper/genricHtmlib.py

Lines changed: 153 additions & 0 deletions
from multiprocessing import Pool
import os
from datetime import datetime
import lxml.html as html
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import warnings
import requests
warnings.filterwarnings("ignore")


class SeleniumScraper:
    def __init__(self, timeout=10):
        self.timeout = timeout
        self.reqSession = requests.Session()
        self.stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        self.storagePath = os.path.join(
            os.path.dirname(os.path.abspath(__file__))
        )

        # default request headers (note: 'authority' still points at amazon.com;
        # fetch_request_normal below builds its own User-Agent header)
        self.headers = {
            'authority': 'www.amazon.com',
            'pragma': 'no-cache',
            'cache-control': 'no-cache',
            'dnt': '1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'sec-fetch-site': 'none',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-dest': 'document',
            'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
        }

    def fetch_request_normal(self, url, params=None):
        # plain requests fetch; returns the HTML text or None on failure
        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
            }
            response = self.reqSession.get(url, headers=headers)

            if response.status_code == 200:
                return response.text

            if response.status_code == 301:
                # retry with the redirect target
                response = requests.get(response.headers['Location'])
                response.raise_for_status()
                if response.status_code == 200:
                    return response.text

            if response.status_code == 503:
                # blocked / rate limited; the caller falls back to Selenium
                return None

        except Exception as e:
            print("Exception occurred for url: {} and exception: {}".format(url, e))
        return None

    def get_xpath_link(self, doc, xpath, website):
        # extract hrefs and turn relative paths into absolute URLs
        try:
            name = doc.xpath("".join(xpath))
            for i in range(len(name)):
                if name[i].startswith("/"):
                    name[i] = website + name[i]
            return name

        except Exception as e:
            print("Error in getting links with {}: {}".format(xpath, e))
            return None

    def get_selenium_driver(self):
        # headless Chrome with images disabled to keep page loads light
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-extensions")
        chrome_options.add_argument("--disable-logging")
        chrome_options.add_argument("--log-level=3")
        chrome_options.add_argument("--silent")
        chrome_options.add_argument("--blink-settings=imagesEnabled=false")
        # note: newer Selenium releases expect options= instead of chrome_options=
        driver = webdriver.Chrome(chrome_options=chrome_options)
        return driver

    def fetch_request_selenium(self, url, waiting_time=1):
        # Selenium fallback; returns an lxml document or None
        try:
            driver = self.get_selenium_driver()
            driver.get(url)
            time.sleep(waiting_time)
            doc = html.fromstring(driver.page_source)
            driver.close()
            return doc

        except Exception as e:
            print("Exception occurred for url: {} and exception: {}".format(url, e))
            return None

    def get_xpath_data(self, doc, xpath):
        try:
            return doc.xpath(xpath)

        except Exception as e:
            print("Error in getting data with {}: {}".format(xpath, e))
            return None

    def slow_page_scroll(self, driver, speed):
        # scroll in 1000px steps so lazy-loaded content has time to render
        current_scroll_position = driver.execute_script("return window.pageYOffset;")
        while current_scroll_position < driver.execute_script(
            "return document.body.scrollHeight;"
        ):
            driver.execute_script(
                "window.scrollTo(0, arguments[0]);", current_scroll_position
            )
            current_scroll_position += 1000
            time.sleep(speed)

    def data_storage(self, df_list, unique_id, name, storageFormat, storagePath=None):
        # merge collected frames, drop duplicates and write csv/json next to this file
        df_combined = pd.concat(df_list, ignore_index=True)
        df_combined.drop_duplicates(subset=unique_id, inplace=True)
        if storageFormat == "csv":
            df_combined.to_csv(
                self.storagePath + "/{}_{}.csv".format(name, self.stamp),
                index=False,
            )
        elif storageFormat == "json":
            df_combined.to_json(
                self.storagePath + "/{}_{}.json".format(name, self.stamp),
                orient="records",
            )

    def cleanData(self, array):
        # strip whitespace, drop empties, remove non-ascii characters and newlines
        array = [x.strip() for x in array]
        array = list(filter(None, array))
        array = [x.encode("ascii", "ignore").decode() for x in array]
        array = [x.replace("\n", "") for x in array]
        return array
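
The two fetch methods are meant to be chained: try the plain requests session first and fall back to headless Chrome only when it returns None. A minimal sketch of that pattern (not part of the commit; the search URL and XPath mirror the ones main.py uses, and a working chromedriver is assumed for the Selenium fallback):

# Illustrative usage of SeleniumScraper (not part of the commit).
import lxml.html as html
from genricHtmlib import SeleniumScraper

scraper = SeleniumScraper()
url = "https://www.flipkart.com/search?q=mobiles"

page = scraper.fetch_request_normal(url)
if page is None:
    # requests was blocked (e.g. a 503), fall back to headless Chrome
    doc = scraper.fetch_request_selenium(url)
else:
    doc = html.fromstring(page)

links = scraper.get_xpath_link(doc, '//*[@rel="noopener noreferrer"]//@href', "https://www.flipkart.com")
print(len(links or []), "product links found")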

FlipkartScraper/main.py

Lines changed: 156 additions & 0 deletions
import logging
from datetime import datetime
from dbConnector import FlipkartDatabaseConnector
from productList import product_categories
from genricHtmlib import SeleniumScraper
import os
import lxml.html as html
import concurrent.futures

SeleniumScraper = SeleniumScraper()  # module-level helper instance (shadows the class name)


class Scraper:
    def __init__(self):
        self.brand: str = "flipkart"
        self.website = "https://www.flipkart.com/search?q="
        self.websiteName = "https://www.flipkart.com"
        self.stamp: str = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        self.storagePath: str = os.getcwd()

        self.productLinksXpath = '//*[@rel="noopener noreferrer"]//@href'
        self.skuXpath = '//tr[contains(@class, "row")]//td[contains(text(), "Model Number")]/following-sibling::td[1]/ul/li/text()'
        self.nameXpath = '//*[@class="B_NuCI"]//text()'
        self.description = '//div[contains(text(), "Description")]/following-sibling::div[1]/div/text()'
        self.image = '//*[@class="_396cs4 _2amPTt _3qGmMb"]//@src'
        self.category = '//*[@class="_3GIHBu"]//text()'
        self.price = '//*[@class="_30jeq3 _16Jk6d"]//text()'

    def getProductList(self, keyword):
        # collect product links from the first 19 search result pages for a keyword
        try:
            productLinks = []
            url = self.website + keyword
            response = SeleniumScraper.fetch_request_normal(url)
            if response is None:
                doc = SeleniumScraper.fetch_request_selenium(url)
            else:
                doc = html.fromstring(response)

            Links = SeleniumScraper.get_xpath_link(doc, self.productLinksXpath, self.websiteName)
            productLinks.extend(Links)

            for page in range(2, 20):
                print(f'Getting Page {page} for {keyword}')
                url = self.website + keyword + "&page=" + str(page)
                response = SeleniumScraper.fetch_request_normal(url)
                if response is None:
                    doc = SeleniumScraper.fetch_request_selenium(url)
                else:
                    doc = html.fromstring(response)

                Links = SeleniumScraper.get_xpath_link(doc, self.productLinksXpath, self.websiteName)
                productLinks.extend(Links)

            print(f'Total products for {keyword} is {len(productLinks)}')
            return productLinks

        except Exception as e:
            print(e)

    def getProductDetails(self, productLink):
        # scrape one product page; any missing field falls back to "None"
        print(f'Getting product details for {productLink}')
        response = SeleniumScraper.fetch_request_normal(productLink)
        if response is None:
            doc = SeleniumScraper.fetch_request_selenium(productLink)
        else:
            doc = html.fromstring(response)

        productDetails = {}

        try:
            sku = SeleniumScraper.get_xpath_data(doc, self.skuXpath)
            sku = sku[0]
        except:
            sku = "None"

        try:
            name = SeleniumScraper.get_xpath_data(doc, self.nameXpath)
            name = name[0]
        except:
            name = "None"

        try:
            description = SeleniumScraper.get_xpath_data(doc, self.description)
            description = ''.join(description)
        except:
            description = "None"

        try:
            image_path = SeleniumScraper.get_xpath_link(doc, self.image, self.websiteName)
            image_path = image_path[0]
        except:
            image_path = "None"

        try:
            category = SeleniumScraper.get_xpath_data(doc, self.category)
            category = category[1]
        except:
            category = "None"

        try:
            price = SeleniumScraper.get_xpath_data(doc, self.price)
            price = SeleniumScraper.cleanData(price)
            price = price[0]
        except:
            price = "None"

        productDetails["sku"] = str(sku)
        productDetails["name"] = str(name)
        productDetails["description"] = str(description)
        productDetails["image_path"] = str(image_path)
        productDetails["category"] = str(category)
        productDetails["timestamp"] = str(self.stamp)
        productDetails["URL"] = str(productLink)
        productDetails["price"] = price

        print(productDetails)
        return productDetails

    def start(self):
        productList = []
        number_of_threads: int = 1

        # Log start of scraper
        print(f"Starting {self.brand} scraper")

        # create flipkart.db with the schema if it doesn't exist yet
        if not os.path.exists(self.storagePath + "/" + self.brand + ".db"):
            print(f'Creating {self.brand}.db at {self.storagePath + "/" + self.brand + ".db"}')
            db = FlipkartDatabaseConnector(self.stamp)
            db.schemaMaker()
            print(db.welcomeMessage)

        self.db = FlipkartDatabaseConnector(self.stamp)
        print(self.db.welcomeMessage)

        # gather product links for every category keyword
        with concurrent.futures.ThreadPoolExecutor(max_workers=number_of_threads) as executor:
            productUrls = executor.map(self.getProductList, product_categories)
            productList.extend(productUrls)

        # flatten the list of per-category link lists
        productList = [item for sublist in productList for item in sublist]
        print(f'Total products for {self.brand} is {len(productList)}')

        # scrape each product page and save it to the database
        with concurrent.futures.ThreadPoolExecutor(max_workers=number_of_threads) as executor:
            results = executor.map(self.getProductDetails, productList)

            for result in results:
                print(f"Saving {result['sku']} to db")
                self.db.insertProduct(result)

        self.db.removeDuplicates()


if __name__ == '__main__':
    scraper = Scraper()
    scraper.start()
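
main.py wires everything together: one thread pool gathers product links per category keyword, a second scrapes each product page, and every result is written through the connector before duplicates are pruned. A small follow-up sketch (not part of the commit; it assumes the commands run from the FlipkartScraper directory so flipkart.db is in the working directory) runs the scraper and reads the stored rows back:

# Illustrative follow-up (not part of the commit): run the scraper, then inspect the database.
from datetime import datetime
from main import Scraper
from dbConnector import FlipkartDatabaseConnector

Scraper().start()  # writes results into flipkart.db in the current working directory

db = FlipkartDatabaseConnector(datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
for row in db.fetchAllProducts():
    print(row[1], row[8])  # sku and price columns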

FlipkartScraper/productList.py

Lines changed: 3 additions & 0 deletions
product_categories = [
    'mobiles',
]
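
productList.py is just the list of search keywords that main.py appends to https://www.flipkart.com/search?q=; this commit ships only 'mobiles'. Widening coverage is one line per keyword, for example (illustrative keywords, not part of the commit):

product_categories = [
    'mobiles',
    'laptops',      # each entry becomes a separate Flipkart search query
    'headphones',
]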

FlipkartScraper/requirements.txt

Lines changed: 0 additions & 15 deletions
@@ -1,25 +1,10 @@
-jupyter
-scikit-learn
-pandas
-numpy
-matplotlib
-seaborn
-tensorflow
-flask
-openai
 bs4
 requests
 pandas
-requests
 numpy
 bs4
-geopy
 boto3
 ndjson
 selenium
 httpx
 lxml
-python-dotenv
-paramiko
-undetected-chromedriver
-fastjsonschema

0 commit comments
