I have a class that:
- goes to a URL,
- grabs a link from that page, and a date (`filing_date`),
- navigates to the link, and
- writes the table from that page to a dataframe.
I am trying to add the respective `filing_date` from step 2 to the dataframe from step 4, but rather than passing the multiple `filing_date`s, like so:
     nameOfIssuer                         ...  cik         Filing Date
0    Agilent Technologies, Inc. (A)       ...  0000846222  2020-01-10
1    Adient PLC (ADNT)                    ...  0000846222  2020-01-10
..   ...                                  ...  ...         ...
662  Whirlpool Corp (WHR)                 ...  0000846222  2010-07-08
it only passes the last scraped date from the prior page to all rows:
     nameOfIssuer                         ...  cik         Filing Date
0    Agilent Technologies, Inc. (A)       ...  0000846222  2010-07-08
1    Adient PLC (ADNT)                    ...  0000846222  2010-07-08
..   ...                                  ...  ...         ...
662  Whirlpool Corp (WHR)                 ...  0000846222  2010-07-08
I've tried storing the dates in a list and then assigning that list to a column of the output dataframe, but because the length of the list doesn't match the length of the dataframe's index, I get `ValueError: Length of values does not match length of index`.
Can someone advise on the best approach (e.g., making another function to solely handle `filing_date`, or perhaps returning a dataframe instead)?
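For reference, the error comes from assigning a list whose length differs from the dataframe's index. A minimal sketch of that mismatch, separate from the scraper itself:

```python
import pandas as pd

df = pd.DataFrame({"nameOfIssuer": ["A", "B", "C"]})  # 3 rows scraped from one form

# One date per *filing*, not one per *row*, so the lengths don't line up
# and pandas raises "Length of values does not match length of index".
filing_dates = ["2020-01-10", "2010-07-08"]
try:
    df["Filing Date"] = filing_dates
except ValueError as exc:
    print(exc)
```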
import pandas as pd
from urllib.parse import urljoin
from bs4 import BeautifulSoup, SoupStrainer
import requests
class Scraper:
BASE_URL = "https://www.sec.gov"
FORMS_URL_TEMPLATE = "/cgi-bin/browse-edgar?action=getcompany&CIK={cik}&type=13F"
def __init__(self):
self.session = requests.Session()
def get_holdings(self, cik):
"""
Main function that first finds the most recent 13F form and then passes
it to scrapeForm to get the holdings for a particular institutional investor.
"""
# get the form urls
forms_url = urljoin(self.BASE_URL, self.FORMS_URL_TEMPLATE.format(cik=cik))
parse_only = SoupStrainer('a', {"id": "documentsbutton"})
soup = BeautifulSoup(self.session.get(forms_url).content, 'lxml', parse_only=parse_only)
urls = soup.find_all('a', href=True)
# get form document URLs
form_urls = []
for url in urls:
url = url.get("href")
url = urljoin(self.BASE_URL, str(url))
headers = {'User-Agent': 'Mozilla/5.0'}
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
# Get filing date and "period date"
dates = soup.find("div", {"class": "formContent"})
filing_date = dates.find_all("div", {"class": "formGrouping"})[0]
filing_date = filing_date.find_all("div", {"class": "info"})[0]
filing_date = filing_date.text
# get form table URLs
parse_only = SoupStrainer('tr', {"class": 'blueRow'})
soup = BeautifulSoup(self.session.get(url).content,'lxml', parse_only=parse_only)
form_url = soup.find_all('tr', {"class": 'blueRow'})[-1].find('a')['href']
if ".txt" in form_url:
pass
else:
form_url = urljoin(self.BASE_URL, form_url)
# print(form_url)
form_urls.append(form_url)
return self.scrape_document(form_urls, cik, filing_date)
def scrape_document(self, urls, cik, filing_date):
"""This function scrapes holdings from particular document URL"""
cols = ['nameOfIssuer', 'titleOfClass', 'cusip', 'value', 'sshPrnamt',
'sshPrnamtType', 'putCall', 'investmentDiscretion',
'otherManager', 'Sole', 'Shared', 'None']
data = []
for url in urls:
soup = BeautifulSoup(self.session.get(url).content, 'lxml')
for info_table in soup.find_all(['ns1:infotable', 'infotable']):
row = []
for col in cols:
d = info_table.find([col.lower(), 'ns1:' + col.lower()])
row.append(d.text.strip() if d else 'NaN')
data.append(row)
df = pd.DataFrame(data, columns=cols)
df['cik'] = cik
df['Filing Date'] = filing_date
return df
holdings = Scraper()
holdings = holdings.get_holdings("0000846222")
print(holdings)
- What version of Python is this written in? (Ben A, Mar 11, 2020 at 2:16)
- Sorry, it's Python 3.7 (user53526356, Mar 11, 2020 at 2:44)
1 Answer
It seems like you have as many `filing_date`s as you have URLs, so you should keep them together and handle them similarly.
Your problem comes from losing track of which row comes from which URL, so your only option becomes setting one date for the whole dataframe.
Here's an updated version that saves the dates at the same time as the URLs and uses a new `res_df` dataframe in `scrape_document` to aggregate the dataframes retrieved from each URL.
import pandas as pd
from urllib.parse import urljoin
from bs4 import BeautifulSoup, SoupStrainer
import requests
class Scraper:
BASE_URL = "https://www.sec.gov"
FORMS_URL_TEMPLATE = "/cgi-bin/browse-edgar?action=getcompany&CIK={cik}&type=13F"
def __init__(self):
self.session = requests.Session()
def get_holdings(self, cik):
"""
Main function that first finds the most recent 13F form and then passes
it to scrapeForm to get the holdings for a particular institutional investor.
"""
# get the form urls
forms_url = urljoin(self.BASE_URL, self.FORMS_URL_TEMPLATE.format(cik=cik))
parse_only = SoupStrainer('a', {"id": "documentsbutton"})
soup = BeautifulSoup(self.session.get(forms_url).content, 'lxml', parse_only=parse_only)
urls = soup.find_all('a', href=True)
# get form document URLs
form_urls = []
filing_dates = []
for url in urls:
url = url.get("href")
url = urljoin(self.BASE_URL, str(url))
headers = {'User-Agent': 'Mozilla/5.0'}
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
# Get filing date and "period date"
dates = soup.find("div", {"class": "formContent"})
filing_date = dates.find_all("div", {"class": "formGrouping"})[0]
filing_date = filing_date.find_all("div", {"class": "info"})[0]
filing_date = filing_date.text
# get form table URLs
parse_only = SoupStrainer('tr', {"class": 'blueRow'})
soup = BeautifulSoup(self.session.get(url).content,'lxml', parse_only=parse_only)
form_url = soup.find_all('tr', {"class": 'blueRow'})[-1].find('a')['href']
if ".txt" in form_url:
pass
else:
form_url = urljoin(self.BASE_URL, form_url)
# print(form_url)
form_urls.append(form_url)
# Save the filing date too
filing_dates.append(filing_date)
# Pass the dates list rather than the last one
return self.scrape_document(form_urls, cik, filing_dates)
def scrape_document(self, urls, cik, filing_dates):
"""This function scrapes holdings from particular document URL"""
cols = ['nameOfIssuer', 'titleOfClass', 'cusip', 'value', 'sshPrnamt',
'sshPrnamtType', 'putCall', 'investmentDiscretion',
'otherManager', 'Sole', 'Shared', 'None']
res_df = pd.DataFrame(columns=cols+["Filing Date"])
# Iterate over both list at the same time
for url, date in zip(urls, filing_dates):
data = []
soup = BeautifulSoup(self.session.get(url).content, 'lxml')
for info_table in soup.find_all(['ns1:infotable', 'infotable']):
row = []
for col in cols:
d = info_table.find([col.lower(), 'ns1:' + col.lower()])
row.append(d.text.strip() if d else 'NaN')
data.append(row)
url_df = pd.DataFrame(data, columns=cols)
url_df["Filing Date"] = date
res_df = res_df.append(url_df, ignore_index=True)
# CIK seems common to the whole DF, if not follow the example of dates
res_df['cik'] = cik
return res_df
holdings = Scraper()
holdings = holdings.get_holdings("0000846222")
print(holdings)
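One follow-up note that is not part of the original answer: `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0, so on current pandas the aggregation line above fails. The same result can be had by collecting the per-URL frames in a list and calling `pd.concat` once at the end. A minimal, self-contained sketch of that pattern, with stand-in values for the answer's `cols`, `urls`, and `filing_dates`:

```python
import pandas as pd

cols = ["nameOfIssuer", "value"]             # stand-in for the full column list
urls = ["form_url_1", "form_url_2"]          # stand-in for form_urls
filing_dates = ["2020-01-10", "2010-07-08"]  # stand-in for the scraped dates

frames = []
for url, date in zip(urls, filing_dates):
    # In the real scraper this is where the rows for `url` are parsed.
    data = [["Agilent Technologies, Inc. (A)", "100"],
            ["Whirlpool Corp (WHR)", "200"]]
    url_df = pd.DataFrame(data, columns=cols)
    url_df["Filing Date"] = date             # one date per source URL
    frames.append(url_df)

# Concatenate once instead of appending inside the loop.
res_df = pd.concat(frames, ignore_index=True)
res_df["cik"] = "0000846222"
print(res_df)
```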
- Thanks, I appreciate this. Made a couple of edits for some minor typos, but I think this will work. Out of curiosity, is there a better way I could have organized/structured this from the beginning? (user53526356, Mar 11, 2020 at 18:06)
- It's pretty clean already. Separating the data processing from the data fetching (GET requests) might prove useful if you plan to write unit tests. It'll make mocking easier. (Cal, Mar 12, 2020 at 17:05)
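To illustrate that last suggestion, here is one possible shape for the split; the method names (`fetch`, `parse_holdings`, `get_holdings_from`) are illustrative, not from the original post. The idea is that only one small method touches the network, while the parsing works on raw document bytes, so tests can feed it saved fixtures without hitting sec.gov:

```python
import pandas as pd
from bs4 import BeautifulSoup
import requests

class Scraper:
    """Hypothetical split between fetching (network I/O) and parsing (pure logic)."""

    def __init__(self):
        self.session = requests.Session()

    def fetch(self, url):
        """The only method that touches the network -- the one to mock in tests."""
        headers = {"User-Agent": "Mozilla/5.0"}
        return self.session.get(url, headers=headers).content

    @staticmethod
    def parse_holdings(content, cols):
        """Pure parsing: raw 13F document bytes -> dataframe, testable offline."""
        soup = BeautifulSoup(content, "lxml")
        data = []
        for info_table in soup.find_all(["ns1:infotable", "infotable"]):
            row = []
            for col in cols:
                d = info_table.find([col.lower(), "ns1:" + col.lower()])
                row.append(d.text.strip() if d else "NaN")
            data.append(row)
        return pd.DataFrame(data, columns=cols)

    def get_holdings_from(self, url, cols):
        """Thin glue: fetch, then parse."""
        return self.parse_holdings(self.fetch(url), cols)
```

In a unit test, `parse_holdings` can then be run against a saved XML fixture while `fetch` is replaced with a mock, without any request to sec.gov.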