I wrote a class on top of BeautifulSoup, using the builder design pattern, that allows for navigation of the NCEP data directory.
There are a couple of navigation methods: navto, which builds on the base URL and returns a new instance, and inav, which navigates by index and is useful when the URLs are likely to change due to temporal updates.
import time
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup


# for context TSRAGR is TAF code for a thunderstorm with hail
class TSragr:
    def __init__(self, base_url: str = None) -> None:
        self._baseurl = base_url
        r = requests.get(base_url)
        r.raise_for_status()
        soup = BeautifulSoup(r.content, "lxml").find_all("a")
        if soup[0].text == "Parent Directory":
            soup = soup[1:]
        self._soup = pd.Series([x.text for x in soup])

    def __repr__(self) -> str:
        return f"{self.url}\n" + self._soup.__repr__()

    def __getitem__(self, args) -> "TSragr":
        self._soup = self._soup[args]
        return self

    @property
    def url(self) -> str:
        url = self._baseurl
        if not url.endswith("/"):
            url = url + "/"
        return url

    def navto(self, *args: str) -> "TSragr":
        return TSragr(self.url + "/".join(args))

    def navup(self) -> "TSragr":
        return TSragr(re.match(r"^(.*[\/])", self.url).group())

    def inav(self, index: int) -> "TSragr":
        return TSragr(self.url + self._soup[index])

    def download(self, save_to="./", wait: float = 10) -> None:
        soup = self._soup.copy()
        soup.index = self.url + self._soup
        for url, filename in soup.items():
            print("DOWNLOADING FILE")
            local_filename = save_to + filename
            with requests.get(url, stream=True) as r:
                r.raise_for_status()
                with open(local_filename, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            print("FILE SAVED")
            time.sleep(60 * wait)
Usage:
>>> from wxlab.scrape import TSragr
>>> ragr = TSragr("https://nomads.ncep.noaa.gov/pub/data")
>>> ragr
https://nomads.ncep.noaa.gov/pub/data/
0 DSRC
1 nccf/
dtype: object
>>> ragr.navto("nccf")
https://nomads.ncep.noaa.gov/pub/data/nccf/
0 charts/
1 com/
2 dcom/
3 nonoperational/
4 pcom/
5 radar/
dtype: object
>>> ragr.navto("nccf","com")
https://nomads.ncep.noaa.gov/pub/data/nccf/com/
0 557ww/
1 amsu_estimation/
2 aqm/
3 arch/
4 blend/
...
61 uvi/
62 wave/
63 wave_nfcens/
64 wfs/
65 wsa_enlil/
Length: 66, dtype: object
>>> ragr.navto("nccf","com","blend")
https://nomads.ncep.noaa.gov/pub/data/nccf/com/blend/
0 prod/
1 v4.0/
dtype: object
>>> ragr.navto("nccf","com","blend","prod")
https://nomads.ncep.noaa.gov/pub/data/nccf/com/blend/prod/
0 blend.20220604/
1 blend.20220605/
dtype: object
>>> ragr.navto("nccf","com","blend","prod").inav(0)
https://nomads.ncep.noaa.gov/pub/data/nccf/com/blend/prod/blend.20220604/
0 00/
1 01/
2 02/
3 03/
4 04/
5 05/
6 06/
7 07/
8 08/
9 09/
10 10/
11 11/
12 12/
13 13/
14 14/
15 15/
16 16/
17 17/
18 18/
19 19/
20 20/
21 21/
22 22/
23 23/
dtype: object
>>> ragr.navto("nccf","com","blend","prod").inav(0).inav(0)
https://nomads.ncep.noaa.gov/pub/data/nccf/com/blend/prod/blend.20220604/00/
0 core/
1 qmd/
2 text/
dtype: object
>>> ragr.navto("nccf","com","blend","prod").inav(0).inav(0).navto("core")
https://nomads.ncep.noaa.gov/pub/data/nccf/com/blend/prod/blend.20220604/00/core/
0 blend.t00z.core.f001.ak.grib2
1 blend.t00z.core.f001.ak.grib2.idx
2 blend.t00z.core.f001.co.grib2
3 blend.t00z.core.f001.co.grib2.idx
4 blend.t00z.core.f001.gu.grib2
...
1148 blend.t00z.core.f264.oc.grib2
1149 blend.t00z.core.f264.oc.grib2.idx
1150 blend.t00z.core.f264.pr.grib2
1151 blend.t00z.core.f264.pr.grib2.idx
1152 ls-l
Length: 1153, dtype: object
>>> ragr.navto("nccf","com","blend","prod").inav(0).inav(0).navto("core")[0:6:2]
https://nomads.ncep.noaa.gov/pub/data/nccf/com/blend/prod/blend.20220604/00/core/
0 blend.t00z.core.f001.ak.grib2
2 blend.t00z.core.f001.co.grib2
4 blend.t00z.core.f001.gu.grib2
dtype: object
>>> ragr.navto("nccf","com","blend","prod").inav(0).inav(0).navto("core")[0:6:2].download(save_to="/media/external/data/", wait=1)
DOWNLOADING FILE
FILE SAVED
DOWNLOADING FILE
FILE SAVED
Answer
Fundamentally you're scraping the default directory listing format for Apache, which is the web server used by this NOAA site.
I see no use for Pandas here; I've removed it in my example code.
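For context, here is a minimal sketch of what that Apache "fancy index" markup looks like and how the anchors and the size column can be read from it. The sample HTML below is a hand-written approximation of the format, not captured from the NOAA site; the convention that a trailing "-" in the size column marks a directory is what the suggested code later relies on.

from bs4 import BeautifulSoup

# Hand-written approximation of an Apache mod_autoindex listing: one <pre>
# block, a "Parent Directory" link, then one anchor per entry followed by a
# last-modified date and a size column, where "-" in the size column marks a
# directory.
SAMPLE_LISTING = """
<pre>
<a href="/pub/data/nccf/com/blend/">Parent Directory</a>                                  -
<a href="blend.20220604/">blend.20220604/</a>                    04-Jun-2022 02:45    -
<a href="blend.t00z.core.f001.ak.grib2">blend.t00z.core.f001.ak.grib2</a>  04-Jun-2022 03:10  112M
</pre>
"""

pre = BeautifulSoup(SAMPLE_LISTING, "lxml").pre
for anchor in pre.find_all("a", recursive=False):
    if anchor.text == "Parent Directory":
        continue  # skip the link back up the tree
    size = anchor.next_sibling.strip()  # text between this anchor and the next one
    kind = "directory" if size.endswith("-") else "file"
    print(f"{anchor['href']}: {kind} (size column: {size!r})")

Running this prints each entry with its classification, which is the same directory-versus-file test the suggested code below applies via anchor.next_sibling.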
You should re-think your class. Rather than attempting to be an eager-fetched directory listing, it's better represented as a lazy-fetched directory listing that can navigate either to child instances of subdirectories or to files, which amount to a different class. I don't think it's all that helpful to write your magic __getitem__ method; if you have an iterable, represent it directly as an iterable from a .children() method.
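As a rough illustration of that distinction (separate from the full suggestion below), here is a minimal sketch contrasting an eager fetch in __init__ with a lazy children() generator that only issues a request when iterated. The names EagerDir and LazyDir and the bare-bones parsing are mine, not part of the suggested code.

from typing import Iterator

import requests
from bs4 import BeautifulSoup


class EagerDir:
    """Fetches and parses the directory listing as soon as the object is built."""

    def __init__(self, url: str) -> None:
        self.url = url
        response = requests.get(url)  # network hit on every construction
        response.raise_for_status()
        self.entries = [a.text for a in BeautifulSoup(response.text, "lxml").find_all("a")]


class LazyDir:
    """Defers the request until somebody actually iterates over the children."""

    def __init__(self, url: str) -> None:
        self.url = url  # construction is free: no request yet

    def children(self) -> Iterator[str]:
        response = requests.get(self.url)  # network hit only when iterated
        response.raise_for_status()
        for anchor in BeautifulSoup(response.text, "lxml").find_all("a"):
            yield anchor.text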
Use a soup strainer where applicable to narrow your DOM tree search.
Your Parent Directory check can be carried over to the BeautifulSoup DOM search by use of a regular expression.
I see no reason to wait; delete your sleep.
Suggested code
import re
from itertools import islice
from pathlib import Path
from shutil import copyfileobj
from typing import Iterator, Optional
from urllib.parse import urljoin

from requests import Session
from bs4 import BeautifulSoup
from bs4.element import SoupStrainer


class ApacheNode:
    """
    This represents a tree node in the Apache directory page for the NOAA NOMADS site;
    for more details see https://nomads.ncep.noaa.gov/
    It's used to look up TSRAGR data. TSRAGR is Terminal Aerodrome Forecast (TAF) code
    for a thunderstorm with hail.
    """

    def __init__(self, session: Session, url: str, name: Optional[str] = None) -> None:
        self.session = session
        self.url = url
        if name is None:
            name = url.removesuffix('/').rsplit('/', maxsplit=1)[-1]
        self.name = name

    def __repr__(self) -> str:
        return self.url


class ApacheFile(ApacheNode):
    def download(self, save_to: Path) -> None:
        print(f'Downloading {self.name}...', end=' ')
        local_filename = save_to / self.name
        with self.session.get(self.url, stream=True) as response:
            response.raise_for_status()
            with local_filename.open('wb') as f:
                copyfileobj(response.raw, f)
        print('saved')


class ApacheDir(ApacheNode):
    pre_strainer = SoupStrainer(name='pre')

    # Text has at least one character and cannot contain Parent Directory
    link_pattern = re.compile(
        '(?i)'                   # ignore case
        '^'                      # string start
        '(?:'                    # non-capturing group
        '(?!parent directory)'   # negative lookahead: don't match 'parent directory'
        '.'                      # match any one character
        ')+'                     # match one or more of the above chars
        '$'                      # string end
    )

    def __init__(self, session: Session, url: str, name: Optional[str] = None) -> None:
        if not url.endswith('/'):
            url += '/'
        super().__init__(session, url, name)

    def children(self) -> Iterator[ApacheNode]:
        with self.session.get(self.url) as response:
            response.raise_for_status()
            soup = BeautifulSoup(markup=response.text, features='lxml', parse_only=self.pre_strainer)
        pre = soup.pre
        anchors = pre.find_all(name='a', text=self.link_pattern, recursive=False)
        for anchor in anchors:
            child_name = anchor['href']
            child_url = urljoin(self.url, child_name)
            size_text = anchor.next_sibling.strip()
            if size_text.endswith('-'):
                child_type = ApacheDir
            else:
                child_type = ApacheFile
            yield child_type(self.session, child_url, child_name)

    def navto(self, *args: str) -> 'ApacheDir':
        url = urljoin(self.url, '/'.join(args))
        return ApacheDir(self.session, url=url, name=args[-1])

    def inav(self, index: int) -> 'ApacheNode':
        child, = islice(self.children(), index, index + 1)
        return child


def test() -> None:
    with Session() as session:
        ragr = ApacheDir(session, url='https://nomads.ncep.noaa.gov/pub/data')
        print(ragr)
        print(ragr.navto('nccf'))
        print(ragr.navto('nccf', 'com'))
        print(ragr.navto('nccf', 'com', 'blend'))

        ragr = ragr.navto('nccf', 'com', 'blend', 'prod')
        print(ragr)
        ragr = ragr.inav(0)
        print(ragr)
        ragr = ragr.inav(0)
        print(ragr)
        ragr = ragr.navto('core')
        print(ragr)

        # anything that doesn't end in idx, first three
        first_three_files = islice(
            (
                child for child in ragr.children()
                if not child.name.endswith('idx')
            ), 3,
        )

        save_to = Path('ragr-test')  # '/media/external/data/'
        save_to.mkdir(exist_ok=True)
        for file in first_three_files:
            file.download(save_to)


if __name__ == '__main__':
    test()
Output
https://nomads.ncep.noaa.gov/pub/data/
https://nomads.ncep.noaa.gov/pub/data/nccf/
https://nomads.ncep.noaa.gov/pub/data/nccf/com/
https://nomads.ncep.noaa.gov/pub/data/nccf/com/blend/
https://nomads.ncep.noaa.gov/pub/data/nccf/com/blend/prod/
https://nomads.ncep.noaa.gov/pub/data/nccf/com/blend/prod/blend.20220605/
https://nomads.ncep.noaa.gov/pub/data/nccf/com/blend/prod/blend.20220605/00/
https://nomads.ncep.noaa.gov/pub/data/nccf/com/blend/prod/blend.20220605/00/core/
Downloading blend.t00z.core.f001.ak.grib2... saved
Downloading blend.t00z.core.f001.co.grib2... saved
Downloading blend.t00z.core.f001.gu.grib2... saved
Comment (Jason Leaver, Jun 8, 2022 at 0:43): Awesome answer as usual, thanks! I had the sleep because some of the files can be upwards of 5 GB, and I found that occasionally when downloading large files back to back I was getting temporarily blocked.
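If the server really does throttle back-to-back downloads of multi-gigabyte files, a gentler alternative to a fixed sleep is to mount an HTTPAdapter with a urllib3 Retry policy on the shared Session, so that throttling responses are retried with exponential backoff. This is a sketch, assuming a reasonably recent urllib3 (allowed_methods replaced method_whitelist in 1.26); the status codes and retry counts are illustrative guesses, not values confirmed for the NOMADS server.

from requests import Session
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def make_polite_session() -> Session:
    """Build a Session that backs off and retries when the server throttles us."""
    retry = Retry(
        total=5,                      # give up after five attempts
        backoff_factor=2.0,           # exponential backoff between attempts
        status_forcelist=(429, 503),  # assumed throttling responses; adjust as needed
        allowed_methods=("GET",),
    )
    session = Session()
    session.mount("https://", HTTPAdapter(max_retries=retry))
    return session


# The ApacheDir/ApacheFile classes above can be handed this session unchanged:
# ragr = ApacheDir(make_polite_session(), url='https://nomads.ncep.noaa.gov/pub/data')

Note that this only retries the initial request/response exchange; a connection dropped partway through a very large stream would still need handling (for example resuming via Range headers) at the download level.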