I have a class that:
- goes to a URL,
- grabs a link from that page, and a date (`filing_date`),
- navigates to the link, and
- writes the table from that page to a dataframe.
I am trying to add the respective `filing_date` from step 2 to the dataframe from step 4, but rather than passing the multiple `filing_date`s, like so:
     nameOfIssuer                         ...  cik         Filing Date
0    Agilent Technologies, Inc. (A)       ...  0000846222  2020-01-10
1    Adient PLC (ADNT)                    ...  0000846222  2020-01-10
..   ...                                  ...  ...         ...
662  Whirlpool Corp (WHR)                 ...  0000846222  2010-07-08
it only passes the last scraped date from the prior page to all rows:
     nameOfIssuer                         ...  cik         Filing Date
0    Agilent Technologies, Inc. (A)       ...  0000846222  2010-07-08
1    Adient PLC (ADNT)                    ...  0000846222  2010-07-08
..   ...                                  ...  ...         ...
662  Whirlpool Corp (WHR)                 ...  0000846222  2010-07-08
I've tried storing the dates in a list and then assigning that list to a column of the output dataframe, but because the length of the list doesn't match the length of the dataframe's index, I get `ValueError: Length of values does not match length of index`.
Can someone advise on the best approach (e.g., making another function to solely handle `filing_date`, or perhaps returning a dataframe instead)?
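For reference, the error comes from assigning a list whose length differs from the dataframe's index. A minimal sketch of that mismatch, separate from the scraper itself:

```python
import pandas as pd

df = pd.DataFrame({"nameOfIssuer": ["A", "B", "C"]})  # 3 rows scraped from one form

# One date per *filing*, not one per *row*, so the lengths don't line up
# and pandas raises "Length of values does not match length of index".
filing_dates = ["2020-01-10", "2010-07-08"]
try:
    df["Filing Date"] = filing_dates
except ValueError as exc:
    print(exc)
```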
import pandas as pd
from urllib.parse import urljoin
from bs4 import BeautifulSoup, SoupStrainer
import requests
class Scraper:
BASE_URL = "https://www.sec.gov"
FORMS_URL_TEMPLATE = "/cgi-bin/browse-edgar?action=getcompany&CIK={cik}&type=13F"
def __init__(self):
self.session = requests.Session()
def get_holdings(self, cik):
"""
Main function that first finds the most recent 13F form and then passes
it to scrapeForm to get the holdings for a particular institutional investor.
"""
# get the form urls
forms_url = urljoin(self.BASE_URL, self.FORMS_URL_TEMPLATE.format(cik=cik))
parse_only = SoupStrainer('a', {"id": "documentsbutton"})
soup = BeautifulSoup(self.session.get(forms_url).content, 'lxml', parse_only=parse_only)
urls = soup.find_all('a', href=True)
# get form document URLs
form_urls = []
for url in urls:
url = url.get("href")
url = urljoin(self.BASE_URL, str(url))
headers = {'User-Agent': 'Mozilla/5.0'}
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
# Get filing date and "period date"
dates = soup.find("div", {"class": "formContent"})
filing_date = dates.find_all("div", {"class": "formGrouping"})[0]
filing_date = filing_date.find_all("div", {"class": "info"})[0]
filing_date = filing_date.text
# get form table URLs
parse_only = SoupStrainer('tr', {"class": 'blueRow'})
soup = BeautifulSoup(self.session.get(url).content,'lxml', parse_only=parse_only)
form_url = soup.find_all('tr', {"class": 'blueRow'})[-1].find('a')['href']
if ".txt" in form_url:
pass
else:
form_url = urljoin(self.BASE_URL, form_url)
# print(form_url)
form_urls.append(form_url)
return self.scrape_document(form_urls, cik, filing_date)
def scrape_document(self, urls, cik, filing_date):
"""This function scrapes holdings from particular document URL"""
cols = ['nameOfIssuer', 'titleOfClass', 'cusip', 'value', 'sshPrnamt',
'sshPrnamtType', 'putCall', 'investmentDiscretion',
'otherManager', 'Sole', 'Shared', 'None']
data = []
for url in urls:
soup = BeautifulSoup(self.session.get(url).content, 'lxml')
for info_table in soup.find_all(['ns1:infotable', 'infotable']):
row = []
for col in cols:
d = info_table.find([col.lower(), 'ns1:' + col.lower()])
row.append(d.text.strip() if d else 'NaN')
data.append(row)
df = pd.DataFrame(data, columns=cols)
df['cik'] = cik
df['Filing Date'] = filing_date
return df
holdings = Scraper()
holdings = holdings.get_holdings("0000846222")
print(holdings)
- What version of Python is this written in? (Ben A, Mar 11, 2020 at 2:16)
- Sorry, it's Python 3.7 (user53526356, Mar 11, 2020 at 2:44)
1 Answer
It seems like you have as many `filing_date`s as you have URLs, so you should keep them together and handle them similarly.
Your problem comes from losing track of which row comes from which URL, so your only option becomes setting one date for the whole dataframe.
Here's an updated version that saves the dates at the same time as the URLs and uses a new `res_df` dataframe in `scrape_document` to aggregate the dataframes retrieved from each URL.
import pandas as pd
from urllib.parse import urljoin
from bs4 import BeautifulSoup, SoupStrainer
import requests
class Scraper:
BASE_URL = "https://www.sec.gov"
FORMS_URL_TEMPLATE = "/cgi-bin/browse-edgar?action=getcompany&CIK={cik}&type=13F"
def __init__(self):
self.session = requests.Session()
def get_holdings(self, cik):
"""
Main function that first finds the most recent 13F form and then passes
it to scrapeForm to get the holdings for a particular institutional investor.
"""
# get the form urls
forms_url = urljoin(self.BASE_URL, self.FORMS_URL_TEMPLATE.format(cik=cik))
parse_only = SoupStrainer('a', {"id": "documentsbutton"})
soup = BeautifulSoup(self.session.get(forms_url).content, 'lxml', parse_only=parse_only)
urls = soup.find_all('a', href=True)
# get form document URLs
form_urls = []
filing_dates = []
for url in urls:
url = url.get("href")
url = urljoin(self.BASE_URL, str(url))
headers = {'User-Agent': 'Mozilla/5.0'}
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
# Get filing date and "period date"
dates = soup.find("div", {"class": "formContent"})
filing_date = dates.find_all("div", {"class": "formGrouping"})[0]
filing_date = filing_date.find_all("div", {"class": "info"})[0]
filing_date = filing_date.text
# get form table URLs
parse_only = SoupStrainer('tr', {"class": 'blueRow'})
soup = BeautifulSoup(self.session.get(url).content,'lxml', parse_only=parse_only)
form_url = soup.find_all('tr', {"class": 'blueRow'})[-1].find('a')['href']
if ".txt" in form_url:
pass
else:
form_url = urljoin(self.BASE_URL, form_url)
# print(form_url)
form_urls.append(form_url)
# Save the filing date too
filing_dates.append(filing_date)
# Pass the dates list rather than the last one
return self.scrape_document(form_urls, cik, filing_dates)
def scrape_document(self, urls, cik, filing_dates):
"""This function scrapes holdings from particular document URL"""
cols = ['nameOfIssuer', 'titleOfClass', 'cusip', 'value', 'sshPrnamt',
'sshPrnamtType', 'putCall', 'investmentDiscretion',
'otherManager', 'Sole', 'Shared', 'None']
res_df = pd.DataFrame(columns=cols+["Filing Date"])
# Iterate over both list at the same time
for url, date in zip(urls, filing_dates):
data = []
soup = BeautifulSoup(self.session.get(url).content, 'lxml')
for info_table in soup.find_all(['ns1:infotable', 'infotable']):
row = []
for col in cols:
d = info_table.find([col.lower(), 'ns1:' + col.lower()])
row.append(d.text.strip() if d else 'NaN')
data.append(row)
url_df = pd.DataFrame(data, columns=cols)
url_df["Filing Date"] = date
res_df = res_df.append(url_df, ignore_index=True)
# CIK seems common to the whole DF, if not follow the example of dates
res_df['cik'] = cik
return res_df
holdings = Scraper()
holdings = holdings.get_holdings("0000846222")
print(holdings)
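One follow-up note that is not part of the original answer: `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0, so on current pandas the aggregation line above fails. The same result can be had by collecting the per-URL frames in a list and calling `pd.concat` once at the end. A minimal, self-contained sketch of that pattern, with stand-in values for the answer's `cols`, `urls`, and `filing_dates`:

```python
import pandas as pd

cols = ["nameOfIssuer", "value"]             # stand-in for the full column list
urls = ["form_url_1", "form_url_2"]          # stand-in for form_urls
filing_dates = ["2020-01-10", "2010-07-08"]  # stand-in for the scraped dates

frames = []
for url, date in zip(urls, filing_dates):
    # In the real scraper this is where the rows for `url` are parsed.
    data = [["Agilent Technologies, Inc. (A)", "100"],
            ["Whirlpool Corp (WHR)", "200"]]
    url_df = pd.DataFrame(data, columns=cols)
    url_df["Filing Date"] = date             # one date per source URL
    frames.append(url_df)

# Concatenate once instead of appending inside the loop.
res_df = pd.concat(frames, ignore_index=True)
res_df["cik"] = "0000846222"
print(res_df)
```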
- Thanks, I appreciate this. Made a couple of edits for some minor typos, but I think this will work. Out of curiosity, is there a better way I could have organized/structured this from the beginning? (user53526356, Mar 11, 2020 at 18:06)
- It's pretty clean already. Separating the data processing from the data fetching (GET requests) might prove useful if you plan to write unit tests. It'll make mocking easier. (Cal, Mar 12, 2020 at 17:05)
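To illustrate that last suggestion, here is one possible shape for the split; the method names (`fetch`, `parse_holdings`, `get_holdings_from`) are illustrative, not from the original post. The idea is that only one small method touches the network, while the parsing works on raw document bytes, so tests can feed it saved fixtures without hitting sec.gov:

```python
import pandas as pd
from bs4 import BeautifulSoup
import requests

class Scraper:
    """Hypothetical split between fetching (network I/O) and parsing (pure logic)."""

    def __init__(self):
        self.session = requests.Session()

    def fetch(self, url):
        """The only method that touches the network -- the one to mock in tests."""
        headers = {"User-Agent": "Mozilla/5.0"}
        return self.session.get(url, headers=headers).content

    @staticmethod
    def parse_holdings(content, cols):
        """Pure parsing: raw 13F document bytes -> dataframe, testable offline."""
        soup = BeautifulSoup(content, "lxml")
        data = []
        for info_table in soup.find_all(["ns1:infotable", "infotable"]):
            row = []
            for col in cols:
                d = info_table.find([col.lower(), "ns1:" + col.lower()])
                row.append(d.text.strip() if d else "NaN")
            data.append(row)
        return pd.DataFrame(data, columns=cols)

    def get_holdings_from(self, url, cols):
        """Thin glue: fetch, then parse."""
        return self.parse_holdings(self.fetch(url), cols)
```

In a unit test, `parse_holdings` can then be run against a saved XML fixture while `fetch` is replaced with a mock, without any request to sec.gov.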