I wrote a class on top of BeautifulSoup, using the builder design pattern, that allows for navigation of the NCEP data directory.
There are a couple of navigation methods: navto, which builds on the base URL and returns a new instance, and inav, which navigates by index and is useful when the URLs are likely to change due to temporal updates.
import time
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup


# for context TSRAGR is TAF code for a thunderstorm with hail
class TSragr:
    def __init__(self, base_url: str = None) -> None:
        self._baseurl = base_url
        r = requests.get(base_url)
        r.raise_for_status()
        soup = BeautifulSoup(r.content, "lxml").find_all("a")
        if soup[0].text == "Parent Directory":
            soup = soup[1:]
        self._soup = pd.Series([x.text for x in soup])

    def __repr__(self) -> str:
        return f"{self.url}\n" + self._soup.__repr__()

    def __getitem__(self, args) -> "TSragr":
        self._soup = self._soup[args]
        return self

    @property
    def url(self) -> str:
        url = self._baseurl
        if not url.endswith("/"):
            url = url + "/"
        return url

    def navto(self, *args: str) -> "TSragr":
        return TSragr(self.url + "/".join(args))

    def navup(self) -> "TSragr":
        return TSragr(re.match(r"^(.*[\/])", self.url).group())

    def inav(self, index: int) -> "TSragr":
        return TSragr(self.url + self._soup[index])

    def download(self, save_to="./", wait: float = 10) -> None:
        soup = self._soup.copy()
        soup.index = self.url + self._soup
        for url, filename in soup.items():
            print("DOWNLOADING FILE")
            local_filename = save_to + filename
            with requests.get(url, stream=True) as r:
                r.raise_for_status()
                with open(local_filename, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            print("FILE SAVED")
            time.sleep(60 * wait)
Usage:
>>> from wxlab.scrape import TSragr
>>> ragr = TSragr("https://nomads.ncep.noaa.gov/pub/data")
>>> ragr
https://nomads.ncep.noaa.gov/pub/data/
0 DSRC
1 nccf/
dtype: object
>>> ragr.navto("nccf")
https://nomads.ncep.noaa.gov/pub/data/nccf/
0 charts/
1 com/
2 dcom/
3 nonoperational/
4 pcom/
5 radar/
dtype: object
>>> ragr.navto("nccf","com")
https://nomads.ncep.noaa.gov/pub/data/nccf/com/
0 557ww/
1 amsu_estimation/
2 aqm/
3 arch/
4 blend/
...
61 uvi/
62 wave/
63 wave_nfcens/
64 wfs/
65 wsa_enlil/
Length: 66, dtype: object
>>> ragr.navto("nccf","com","blend")
https://nomads.ncep.noaa.gov/pub/data/nccf/com/blend/
0 prod/
1 v4.0/
dtype: object
>>> ragr.navto("nccf","com","blend","prod")
https://nomads.ncep.noaa.gov/pub/data/nccf/com/blend/prod/
0 blend.20220604/
1 blend.20220605/
dtype: object
>>> ragr.navto("nccf","com","blend","prod").inav(0)
https://nomads.ncep.noaa.gov/pub/data/nccf/com/blend/prod/blend.20220604/
0 00/
1 01/
2 02/
3 03/
4 04/
5 05/
6 06/
7 07/
8 08/
9 09/
10 10/
11 11/
12 12/
13 13/
14 14/
15 15/
16 16/
17 17/
18 18/
19 19/
20 20/
21 21/
22 22/
23 23/
dtype: object
>>> ragr.navto("nccf","com","blend","prod").inav(0).inav(0)
https://nomads.ncep.noaa.gov/pub/data/nccf/com/blend/prod/blend.20220604/00/
0 core/
1 qmd/
2 text/
dtype: object
>>> ragr.navto("nccf","com","blend","prod").inav(0).inav(0).navto("core")
https://nomads.ncep.noaa.gov/pub/data/nccf/com/blend/prod/blend.20220604/00/core/
0 blend.t00z.core.f001.ak.grib2
1 blend.t00z.core.f001.ak.grib2.idx
2 blend.t00z.core.f001.co.grib2
3 blend.t00z.core.f001.co.grib2.idx
4 blend.t00z.core.f001.gu.grib2
...
1148 blend.t00z.core.f264.oc.grib2
1149 blend.t00z.core.f264.oc.grib2.idx
1150 blend.t00z.core.f264.pr.grib2
1151 blend.t00z.core.f264.pr.grib2.idx
1152 ls-l
Length: 1153, dtype: object
>>> ragr.navto("nccf","com","blend","prod").inav(0).inav(0).navto("core")[0:6:2]
https://nomads.ncep.noaa.gov/pub/data/nccf/com/blend/prod/blend.20220604/00/core/
0 blend.t00z.core.f001.ak.grib2
2 blend.t00z.core.f001.co.grib2
4 blend.t00z.core.f001.gu.grib2
dtype: object
>>> ragr.navto("nccf","com","blend","prod").inav(0).inav(0).navto("core")[0:6:2].download(save_to="/media/external/data/", wait=1)
DOWNLOADING FILE
FILE SAVED
DOWNLOADING FILE
FILE SAVED
Answer
Fundamentally you're scraping the default directory listing format for Apache, which is the web server used by this NOAA site.
I see no use for Pandas here; I've removed it in my example code.
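For context, here is a minimal sketch of what that Apache "fancy index" markup looks like and how the anchors and the size column can be read from it. The sample HTML below is a hand-written approximation of the format, not captured from the NOAA site; the convention that a trailing "-" in the size column marks a directory is what the suggested code later relies on.

from bs4 import BeautifulSoup

# Hand-written approximation of an Apache mod_autoindex listing: one <pre>
# block, a "Parent Directory" link, then one anchor per entry followed by a
# last-modified date and a size column, where "-" in the size column marks a
# directory.
SAMPLE_LISTING = """
<pre>
<a href="/pub/data/nccf/com/blend/">Parent Directory</a>                                  -
<a href="blend.20220604/">blend.20220604/</a>                    04-Jun-2022 02:45    -
<a href="blend.t00z.core.f001.ak.grib2">blend.t00z.core.f001.ak.grib2</a>  04-Jun-2022 03:10  112M
</pre>
"""

pre = BeautifulSoup(SAMPLE_LISTING, "lxml").pre
for anchor in pre.find_all("a", recursive=False):
    if anchor.text == "Parent Directory":
        continue  # skip the link back up the tree
    size = anchor.next_sibling.strip()  # text between this anchor and the next one
    kind = "directory" if size.endswith("-") else "file"
    print(f"{anchor['href']}: {kind} (size column: {size!r})")

Running this prints each entry with its classification, which is the same directory-versus-file test the suggested code below applies via anchor.next_sibling.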
You should re-think your class. Rather than attempting to be an eager-fetched directory listing, it's better represented as a lazy-fetched directory listing that can navigate either to child instances of subdirectories or to files, which amount to a different class. I don't think it's all that helpful to write your magic __getitem__ method; if you have an iterable, represent it directly as an iterable from a .children() method.
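As a rough illustration of that distinction (separate from the full suggestion below), here is a minimal sketch contrasting an eager fetch in __init__ with a lazy children() generator that only issues a request when iterated. The names EagerDir and LazyDir and the bare-bones parsing are mine, not part of the suggested code.

from typing import Iterator

import requests
from bs4 import BeautifulSoup


class EagerDir:
    """Fetches and parses the directory listing as soon as the object is built."""

    def __init__(self, url: str) -> None:
        self.url = url
        response = requests.get(url)  # network hit on every construction
        response.raise_for_status()
        self.entries = [a.text for a in BeautifulSoup(response.text, "lxml").find_all("a")]


class LazyDir:
    """Defers the request until somebody actually iterates over the children."""

    def __init__(self, url: str) -> None:
        self.url = url  # construction is free: no request yet

    def children(self) -> Iterator[str]:
        response = requests.get(self.url)  # network hit only when iterated
        response.raise_for_status()
        for anchor in BeautifulSoup(response.text, "lxml").find_all("a"):
            yield anchor.text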
Use a soup strainer where applicable to narrow your DOM tree search.
Your Parent Directory check can be carried over to the BeautifulSoup DOM search by use of a regular expression.
I see no reason to wait; delete your sleep.
Suggested code
import re
from itertools import islice
from pathlib import Path
from shutil import copyfileobj
from typing import Iterator, Optional
from urllib.parse import urljoin

from requests import Session
from bs4 import BeautifulSoup
from bs4.element import SoupStrainer


class ApacheNode:
    """
    This represents a tree node in the Apache directory page for the NOAA NOMADS site;
    for more details see https://nomads.ncep.noaa.gov/
    It's used to look up TSRAGR data. TSRAGR is Terminal Aerodrome Forecast (TAF) code
    for a thunderstorm with hail.
    """

    def __init__(self, session: Session, url: str, name: Optional[str] = None) -> None:
        self.session = session
        self.url = url
        if name is None:
            name = url.removesuffix('/').rsplit('/', maxsplit=1)[-1]
        self.name = name

    def __repr__(self) -> str:
        return self.url


class ApacheFile(ApacheNode):
    def download(self, save_to: Path) -> None:
        print(f'Downloading {self.name}...', end=' ')
        local_filename = save_to / self.name
        with self.session.get(self.url, stream=True) as response:
            response.raise_for_status()
            with local_filename.open('wb') as f:
                copyfileobj(response.raw, f)
        print('saved')


class ApacheDir(ApacheNode):
    pre_strainer = SoupStrainer(name='pre')

    # Text has at least one character and cannot contain Parent Directory
    link_pattern = re.compile(
        '(?i)'                   # ignore case
        '^'                      # string start
        '(?:'                    # non-capturing group
        '(?!parent directory)'   # negative lookahead: don't match 'parent directory'
        '.'                      # match any one character
        ')+'                     # match one or more of the above chars
        '$'                      # string end
    )

    def __init__(self, session: Session, url: str, name: Optional[str] = None) -> None:
        if not url.endswith('/'):
            url += '/'
        super().__init__(session, url, name)

    def children(self) -> Iterator[ApacheNode]:
        with self.session.get(self.url) as response:
            response.raise_for_status()
            soup = BeautifulSoup(markup=response.text, features='lxml', parse_only=self.pre_strainer)
        pre = soup.pre
        anchors = pre.find_all(name='a', text=self.link_pattern, recursive=False)
        for anchor in anchors:
            child_name = anchor['href']
            child_url = urljoin(self.url, child_name)
            size_text = anchor.next_sibling.strip()
            if size_text.endswith('-'):
                child_type = ApacheDir
            else:
                child_type = ApacheFile
            yield child_type(self.session, child_url, child_name)

    def navto(self, *args: str) -> 'ApacheDir':
        url = urljoin(self.url, '/'.join(args))
        return ApacheDir(self.session, url=url, name=args[-1])

    def inav(self, index: int) -> 'ApacheNode':
        child, = islice(self.children(), index, index + 1)
        return child


def test() -> None:
    with Session() as session:
        ragr = ApacheDir(session, url='https://nomads.ncep.noaa.gov/pub/data')
        print(ragr)
        print(ragr.navto('nccf'))
        print(ragr.navto('nccf', 'com'))
        print(ragr.navto('nccf', 'com', 'blend'))

        ragr = ragr.navto('nccf', 'com', 'blend', 'prod')
        print(ragr)
        ragr = ragr.inav(0)
        print(ragr)
        ragr = ragr.inav(0)
        print(ragr)
        ragr = ragr.navto('core')
        print(ragr)

        # anything that doesn't end in idx, first three
        first_three_files = islice(
            (
                child for child in ragr.children()
                if not child.name.endswith('idx')
            ), 3,
        )

        save_to = Path('ragr-test')  # '/media/external/data/'
        save_to.mkdir(exist_ok=True)
        for file in first_three_files:
            file.download(save_to)


if __name__ == '__main__':
    test()
Output
https://nomads.ncep.noaa.gov/pub/data/
https://nomads.ncep.noaa.gov/pub/data/nccf/
https://nomads.ncep.noaa.gov/pub/data/nccf/com/
https://nomads.ncep.noaa.gov/pub/data/nccf/com/blend/
https://nomads.ncep.noaa.gov/pub/data/nccf/com/blend/prod/
https://nomads.ncep.noaa.gov/pub/data/nccf/com/blend/prod/blend.20220605/
https://nomads.ncep.noaa.gov/pub/data/nccf/com/blend/prod/blend.20220605/00/
https://nomads.ncep.noaa.gov/pub/data/nccf/com/blend/prod/blend.20220605/00/core/
Downloading blend.t00z.core.f001.ak.grib2... saved
Downloading blend.t00z.core.f001.co.grib2... saved
Downloading blend.t00z.core.f001.gu.grib2... saved
Comment (Jason Leaver, Jun 8, 2022 at 0:43): Awesome answer as usual, thanks! I had the sleep because some of the files can be upwards of 5 GB, and I found that occasionally when downloading large files back to back I was getting temporarily blocked.
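If the server really does throttle back-to-back downloads of multi-gigabyte files, a gentler alternative to a fixed sleep is to mount an HTTPAdapter with a urllib3 Retry policy on the shared Session, so that throttling responses are retried with exponential backoff. This is a sketch, assuming a reasonably recent urllib3 (allowed_methods replaced method_whitelist in 1.26); the status codes and retry counts are illustrative guesses, not values confirmed for the NOMADS server.

from requests import Session
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def make_polite_session() -> Session:
    """Build a Session that backs off and retries when the server throttles us."""
    retry = Retry(
        total=5,                      # give up after five attempts
        backoff_factor=2.0,           # exponential backoff between attempts
        status_forcelist=(429, 503),  # assumed throttling responses; adjust as needed
        allowed_methods=("GET",),
    )
    session = Session()
    session.mount("https://", HTTPAdapter(max_retries=retry))
    return session


# The ApacheDir/ApacheFile classes above can be handed this session unchanged:
# ragr = ApacheDir(make_polite_session(), url='https://nomads.ncep.noaa.gov/pub/data')

Note that this only retries the initial request/response exchange; a connection dropped partway through a very large stream would still need handling (for example resuming via Range headers) at the download level.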