Task:

Collect data from the site in the following format:
book; user; book_rating; comment_rating; publication_date; comment
A single book can have several pages of reviews at once (more than one page).

Problem:

The site accepts only one request every 0.25 seconds, so asynchronous requests don't help.

Question:

Can data collection be accelerated?
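
A note on the constraint: if the 0.25-second limit caps how often a request may be started, rather than how many may be in flight (an assumption; the site's policy isn't spelled out here), requests can still overlap while each one starts at the permitted rate. A minimal sketch of that idea with a thread pool:

from concurrent.futures import ThreadPoolExecutor
from time import sleep

import requests


def fetch(url):
    # Each request runs in its own worker thread
    resp = requests.get(url, timeout=30)
    return url, resp.status_code, resp.text


def throttled_fetch_all(urls, delay=0.25, workers=8):
    futures = []
    with ThreadPoolExecutor(max_workers=workers) as pool:
        for url in urls:
            futures.append(pool.submit(fetch, url))
            sleep(delay)  # keep request starts 0.25 s apart
    return [f.result() for f in futures]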

Code structure:

Link to the page with the book from Top_link.txt
Get links to the pages with reviews of this book (function score_link)
Fetch each review page in a loop (function score_user)
Collect the data from each review (function score_user)

Code:

import os

import requests
from fake_useragent import UserAgent
from selectolax.lexbor import LexborHTMLParser
from time import sleep
from tqdm import tqdm


# The function concatenates the current directory and the file name
def path_to_file(name):
    return os.path.join(os.path.dirname(__file__), name)


# Read links to sites from top_link.txt
with open(path_to_file('top_link.txt'), 'r', encoding="utf-8") as f:
    text = f.read()

book_id = [int(element.strip("'{}")) for element in text.split(", ")]
sites = [f"https://fantlab.ru/work{i}" for i in sorted(book_id)]
# Activate UserAgent
useragent = UserAgent()


# Get the html page and the request's status code.
def get_html(url):
    headers = {"Accept": "*/*", "User-Agent": useragent.random}
    # Establish a persistent connection pool
    session = requests.Session()
    session.headers = headers
    adapter = requests.adapters.HTTPAdapter(pool_connections=100,
                                            pool_maxsize=100)
    # The target URLs are HTTPS, so mount the adapter on https://
    session.mount('https://', adapter)
    # Request through the session so the adapter and headers apply
    resp = session.get(url)
    return resp.text, resp.status_code
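# Note: building a new Session on every call defeats the pooling it sets
# up. A sketch of the usual pattern, assuming one shared session is
# acceptable for this scraper:
#
#     SESSION = requests.Session()
#     SESSION.mount('https://', requests.adapters.HTTPAdapter(
#         pool_connections=100, pool_maxsize=100))
#
#     def get_html(url):
#         resp = SESSION.get(url, headers={"Accept": "*/*",
#                                          "User-Agent": useragent.random})
#         return resp.text, resp.status_code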
# Get links to review pages
def score_link(html, url):
    tree = LexborHTMLParser(html)
    pagination = tree.css_first(r'span.page-links')
    link_list = []
    # Books without this element have a single page of reviews
    if pagination is not None:
        for page in pagination.css(r'a'):
            # Link to one page of reviews
            link_list.append(url + page.attributes['href'])
    else:
        link_list.append(url)
    return link_list
# Get the user reviews
def score_user(links):
    score_list = []
    # Follow the links to the review pages
    for url in links:
        html, status_code = get_html(url)
        tree = LexborHTMLParser(html)
        # Check the server response
        if status_code == 200:
            # css() returns a (possibly empty) list, never None
            for user in tree.css("div.responses-list > div.response-item"):
                book_link = url.split('?')[0]
                user_id = user.css_first(
                    r'p.response-autor-info>b>a').attributes['href']
                book_rating = user.css_first(
                    r'div.clearfix>div.response-autor-mark>b>span').text()
                comment_rating = user.css_first(
                    r'div.response-votetab>span:nth-of-type(2)').text()
                publication_date = user.css_first(
                    r'p.response-autor-info>span').attributes['content']
                comment = user.css_first(
                    r'div.response-body-home').text().replace('\n', ' ')
                score_list.append(
                    f'{book_link};{user_id};{book_rating};{comment_rating};'
                    f'{publication_date};{comment}\n'
                )
        elif status_code == 429:
            sleep(1)
            print('ERROR_429:', url)
        sleep(0.25)
    return score_list
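# Note: css_first() returns None when a selector matches nothing, so the
# chained .text() / .attributes lookups above raise AttributeError on a
# malformed review. A tiny guard helper (hypothetical, not part of the
# original code) would make the extraction defensive:
#
#     def first_text(node, selector, default=''):
#         hit = node.css_first(selector)
#         return hit.text() if hit is not None else default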
with open(path_to_file("user.csv"), "a+", encoding='utf-8') as file:
    file.write(
        "book; user; book_rating; comment_rating; publication_date; comment\n"
    )
    for url in tqdm(sites):
        html, status_code = get_html(url)
        line = ''.join(score_user(score_link(html, url)))
        # ''.join() never returns None; skip empty results instead
        if line:
            file.write(line)
        sleep(0.5)
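
One more note on the output: review texts can themselves contain semicolons, which would corrupt the hand-joined CSV above. A minimal sketch of writing the same rows with the standard csv module, which quotes such fields automatically (the sample row is made up):

import csv

# Hypothetical row in the task's format:
# book; user; book_rating; comment_rating; publication_date; comment
rows = [("https://fantlab.ru/work1", "/user1", "9", "5",
         "2020-01-01", "a comment; with a semicolon")]

with open("user.csv", "a+", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, delimiter=";")
    writer.writerow(["book", "user", "book_rating", "comment_rating",
                     "publication_date", "comment"])
    writer.writerows(rows)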