SCRAPE-AND-DOWNLOAD.pyw
import tkinter as tk
from tkinter import ttk, messagebox, filedialog
import webbrowser
import os
import re  # needed by WebScraperGUI.scrape for heading/file-extension matching
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import yt_dlp
import subprocess
from PIL import Image, ImageTk
import io
import threading
import shutil
import time  # used to pace the download polling loop
# ------------------------------
from tkinter import scrolledtext
stop_download_flag = False
#================ADD-IMAGE-ICON=================
import sys
def resource_path(relative_path):
    """Get the absolute path to a resource; works both normally and under PyInstaller."""
    if getattr(sys, '_MEIPASS', False):
        return os.path.join(sys._MEIPASS, relative_path)
    return os.path.join(os.path.abspath("."), relative_path)
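# Note: PyInstaller one-file builds unpack bundled data files into a temporary
# directory and expose its path as sys._MEIPASS; resource_path() falls back to
# the script's own directory when running unfrozen.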
# Use this function to load files:
#splash_image = resource_path("splash-1.png")
icon_path = resource_path("D.ico")
#================ADD-IMAGE-ICON=================
# Register browsers with full paths (Windows default install locations)
chrome_path = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
firefox_path = r"C:\Program Files\Mozilla Firefox\firefox.exe"
if os.path.exists(chrome_path):
    webbrowser.register("chrome", None, webbrowser.BackgroundBrowser(chrome_path))
if os.path.exists(firefox_path):
    webbrowser.register("firefox", None, webbrowser.BackgroundBrowser(firefox_path))
# === Main Window ===
window = tk.Tk()
window.title("NAJEEB SHAH KHAN SCRAPE WEB & Image Search Tool & Media Downloader")
window.geometry("965x700")
#window.configure(bg="#2c3e50")
if os.path.exists(icon_path):  # skip the icon instead of crashing if D.ico is missing
    window.iconbitmap(icon_path)
notebook = ttk.Notebook(window)
tab1 = ttk.Frame(notebook)
tab2 = ttk.Frame(notebook)
notebook.add(tab1, text="Image Search Tool")
notebook.add(tab2, text="Media Downloader")
notebook.pack(expand=True, fill="both")
# ====================
# === Tab 1 Content ===
# ====================
dark_mode_var = tk.BooleanVar()
keyword_var = tk.StringVar()
site_var = tk.StringVar()
extra_format_var = tk.StringVar()
query_preview_var = tk.StringVar()
browser_var = tk.StringVar(value="default")
format_vars = {
    "jpg": tk.BooleanVar(value=True),
    "png": tk.BooleanVar(value=True),
    "gif": tk.BooleanVar(),
    "bmp": tk.BooleanVar(),
    "webp": tk.BooleanVar(),
}
def update_query_preview():
    """Rebuild the Google dork query from the current form fields."""
    selected_formats = [f for f, var in format_vars.items() if var.get()]
    custom_format = extra_format_var.get().strip()
    keyword = keyword_var.get().strip()
    site = site_var.get().strip()
    all_formats = selected_formats.copy()
    if custom_format:
        all_formats.append(custom_format)
    filetype_str = ' | '.join(all_formats) if all_formats else "jpg | png"
    query = 'intitle:"index of"'
    if keyword:
        query += f' ({keyword})'
    query += f' ({filetype_str})'
    if site:
        query += f' site:{site}'
    query_preview_var.set(query)
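# Example of the query this builds (keyword "wallpaper", jpg/png checked,
# site filter ".edu"):
#   intitle:"index of" (wallpaper) (jpg | png) site:.edu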
def perform_search():
    query = query_preview_var.get()
    if not query:
        result_text.delete("1.0", tk.END)
        result_text.insert(tk.END, "⚠️ Query is empty.")
        return
    url = f"https://www.google.com/search?q={query.replace(' ', '+')}"
    result_text.delete("1.0", tk.END)
    result_text.insert(tk.END, f"🔍 Google Search URL:\n{url}")
    browser = browser_var.get()
    try:
        if browser == "chrome":
            webbrowser.get("chrome").open(url)
        elif browser == "firefox":
            webbrowser.get("firefox").open(url)
        else:
            webbrowser.open(url)
    except webbrowser.Error:
        result_text.insert(tk.END, f"\n⚠️ Failed to open {browser}, using default browser instead.")
        webbrowser.open(url)
def toggle_dark_mode():
    dark = dark_mode_var.get()
    bg = "#1e1e1e" if dark else "#ffffff"
    fg = "#ffffff" if dark else "#000000"
    widgets = [tab1, format_frame, keyword_label, keyword_entry,
               site_label, site_entry, extra_label, extra_entry,
               preview_label, preview_entry, search_button, dark_mode_check,
               browser_label, result_label, result_text]
    for widget in widgets:
        try:
            widget.config(bg=bg, fg=fg)
        except tk.TclError:
            pass  # ttk widgets do not accept bg/fg; skip them
    keyword_entry.config(insertbackground=fg)
    site_entry.config(insertbackground=fg)
    extra_entry.config(insertbackground=fg)
    result_text.config(insertbackground=fg)
# Tab 1 Layout
tk.Label(tab1, text="Select Image Formats:", bg="#ffffff").pack(anchor="w", padx=10, pady=5)
format_frame = tk.Frame(tab1, bg="#ffffff")
format_frame.pack(anchor="w", padx=20)
for fmt, var in format_vars.items():
    cb = tk.Checkbutton(format_frame, text=fmt, variable=var, bg="#ffffff", command=update_query_preview)
    cb.pack(side="left", padx=5)
extra_label = tk.Label(tab1, text="Type any extra format or word (e.g. tif, raw):", bg="#ffffff")
extra_label.pack(anchor="w", padx=10, pady=5)
extra_entry = tk.Entry(tab1, textvariable=extra_format_var, width=60, bg="#ffffff", fg="#000000")
extra_entry.pack(padx=10)
extra_entry.bind("<KeyRelease>", lambda e: update_query_preview())
keyword_label = tk.Label(tab1, text="Enter Keywords (e.g. wallpaper | backgrounds):", bg="#ffffff")
keyword_label.pack(anchor="w", padx=10, pady=5)
keyword_entry = tk.Entry(tab1, textvariable=keyword_var, width=60, bg="#ffffff", fg="#000000")
keyword_entry.pack(padx=10)
keyword_entry.bind("<KeyRelease>", lambda e: update_query_preview())
site_label = tk.Label(tab1, text="Optional Site Filter (e.g. .edu, example.com):", bg="#ffffff")
site_label.pack(anchor="w", padx=10, pady=5)
site_entry = tk.Entry(tab1, textvariable=site_var, width=60, bg="#ffffff", fg="#000000")
site_entry.pack(padx=10)
site_entry.bind("<KeyRelease>", lambda e: update_query_preview())
preview_label = tk.Label(tab1, text="🔎 Search Query Preview:", bg="#ffffff", font=("Arial", 10, "bold"))
preview_label.pack(anchor="w", padx=10, pady=5)
preview_entry = tk.Entry(tab1, textvariable=query_preview_var, width=80, state="readonly", bg="#eeeeee")
preview_entry.pack(padx=10, pady=5)
browser_label = tk.Label(tab1, text="Select Browser:", bg="#ffffff")
browser_label.pack(anchor="w", padx=10, pady=5)
browser_frame = tk.Frame(tab1, bg="#ffffff")
browser_frame.pack(anchor="w", padx=20)
tk.Radiobutton(browser_frame, text="Default", variable=browser_var, value="default", bg="#ffffff", command=update_query_preview).pack(side="left", padx=10)
tk.Radiobutton(browser_frame, text="Chrome", variable=browser_var, value="chrome", bg="#ffffff", command=update_query_preview).pack(side="left", padx=10)
tk.Radiobutton(browser_frame, text="Firefox", variable=browser_var, value="firefox", bg="#ffffff", command=update_query_preview).pack(side="left", padx=10)
search_button = tk.Button(tab1, text="Search on Google", command=perform_search)
search_button.pack(pady=10)
dark_mode_check = tk.Checkbutton(tab1, text="Dark Mode", variable=dark_mode_var, command=toggle_dark_mode, bg="#ffffff")
dark_mode_check.pack()
result_label = tk.Label(tab1, text="Generated Google Search URL:", bg="#ffffff")
result_label.pack(anchor="w", padx=10, pady=5)
result_text = tk.Text(tab1, height=4, width=80, wrap="word", bg="#f8f8f8")
result_text.pack(padx=10, pady=5)
update_query_preview()
# ====================
# === Tab 2 Content ===
# ====================
media_urls = []
special_sites = ['youtube.com', 'youtu.be', 'facebook.com', 'fb.watch', 'tiktok.com', 'instagram.com']
image_exts = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg', '.ico']
video_exts = ['.mp4', '.webm', '.ogg', '.mov', '.avi', '.mkv', '.flv', '.3gp', '.wmv', '.m3u', '.m3u8']
def is_special_site(url):
    return any(domain in url for domain in special_sites)
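# Caveat: the substring test above also matches look-alike hosts such as
# "youtube.com.evil.example"; parsing the hostname with urlparse() and
# comparing domains would be stricter.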
def browse_url_file():
    file_path = filedialog.askopenfilename(title="Open URL File", filetypes=[("Text files", "*.txt")])
    if file_path:
        with open(file_path, 'r') as f:
            for line in f:
                url = line.strip()
                if url and url not in media_urls:
                    media_urls.append(url)
                    result_box.insert(tk.END, url + "\n")
def save_urls_to_file():
    file_path = filedialog.asksaveasfilename(defaultextension=".txt", filetypes=[("Text files", "*.txt")])
    if file_path:
        with open(file_path, 'w') as f:
            f.write(result_box.get("1.0", tk.END).strip())
        messagebox.showinfo("Saved", f"URLs saved to {file_path}")
def scrape_normal_site(url):
    found_urls = set()
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return found_urls
        soup = BeautifulSoup(response.text, 'html.parser')
        for tag in soup.find_all(['img', 'video', 'source', 'a']):
            src = tag.get('src') or tag.get('href')
            if src:
                full_url = urljoin(url, src)
                parsed = urlparse(full_url)
                ext = os.path.splitext(parsed.path)[1].lower()
                if ext in image_exts + video_exts:
                    found_urls.add(full_url)
    except Exception:
        pass
    return found_urls
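# Note: filtering by file extension misses media served through query-string
# URLs (e.g. /image?id=123) and anything injected by JavaScript after page
# load, since only the static HTML is parsed here.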
def process_url():
    url = url_entry.get().strip()
    if not url:
        messagebox.showwarning("Input Error", "Please enter a valid URL.")
        return
    media_urls.clear()
    result_box.delete("1.0", tk.END)
    try:
        if is_special_site(url):
            ydl_opts = {
                'quiet': True,
                'skip_download': True,
                'force_generic_extractor': False
            }
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                info = ydl.extract_info(url, download=False)
            if 'entries' in info:
                for entry in info['entries']:
                    media_urls.append(entry['webpage_url'])
                    result_box.insert(tk.END, entry['webpage_url'] + "\n")
            else:
                media_urls.append(info['webpage_url'])
                result_box.insert(tk.END, info['webpage_url'] + "\n")
        else:
            scraped = scrape_normal_site(url)
            media_urls.extend(scraped)
            for media_url in scraped:
                result_box.insert(tk.END, media_url + "\n")
        if not media_urls:
            messagebox.showinfo("Info", "No media URLs found.")
        else:
            messagebox.showinfo("Success", f"{len(media_urls)} media URL(s) found!")
    except Exception as e:
        messagebox.showerror("Error", str(e))
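# Note: for playlists, extract_info() resolves full metadata for every entry,
# which can be slow; yt-dlp's 'extract_flat': 'in_playlist' option would
# return entry URLs without resolving each one (left as-is here).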
def download_media(url, save_path):
    try:
        if is_special_site(url):
            ytdlp_path = shutil.which("yt-dlp") or r"C:\Windows\yt-dlp.exe"
            command = [
                ytdlp_path,
                "-f", "best",
                "--no-playlist",
                "--extractor-args", "youtube:player_client=web",
                "-o", save_path,
                url
            ]
            result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
            if result.returncode != 0:
                raise Exception(result.stderr.strip())
        else:
            response = requests.get(url, stream=True, timeout=10)
            if response.status_code == 200:
                with open(save_path, 'wb') as f:
                    for chunk in response.iter_content(1024):
                        f.write(chunk)
    except Exception as e:
        messagebox.showerror("Download Error", f"Failed to download:\n{url}\n{str(e)}")
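# Note: this blocking variant backs the "Download All URLs" button; its
# special-site branch shells out to the external yt-dlp executable rather
# than the imported yt_dlp module, so yt-dlp.exe must be on PATH (or at the
# hard-coded fallback location).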
def download_selected_line():
    try:
        line_index = result_box.index(tk.INSERT).split(".")[0]
        selected_url = result_box.get(f"{line_index}.0", f"{line_index}.end").strip()
        if not selected_url:
            raise Exception("No line selected.")
        folder = filedialog.askdirectory(title="Select Folder to Save File")
        if not folder:
            return
        parsed = urlparse(selected_url)
        filename = os.path.basename(parsed.path)
        if not filename:
            filename = "downloaded_file"
        save_path = os.path.join(folder, filename)
        threading.Thread(target=threaded_download, args=(selected_url, save_path), daemon=True).start()
    except Exception as e:
        messagebox.showerror("Error", str(e))
def download_selected():
    selected_urls = result_box.get("1.0", tk.END).strip().splitlines()
    if not selected_urls:
        messagebox.showwarning("Selection Error", "No URLs to download.")
        return
    selected = filedialog.askdirectory(title="Select Folder to Save Files")
    if not selected:
        return
    for url in selected_urls:
        parsed = urlparse(url)
        filename = os.path.basename(parsed.path)
        if not filename:
            filename = "downloaded_file.mp4"
        save_path = os.path.join(selected, filename)
        download_media(url, save_path)
    messagebox.showinfo("Download Complete", f"Downloaded {len(selected_urls)} media files.")
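# Caveat: download_selected() runs every download sequentially on the Tk main
# thread, so the window freezes until the whole batch finishes; wrapping the
# loop in a threading.Thread (as download_selected_line does) would keep the
# UI responsive.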
def threaded_download(url, save_path):
    global stop_download_flag
    stop_download_flag = False
    try:
        if is_special_site(url):
            ytdlp_path = shutil.which("yt-dlp") or r"C:\Windows\yt-dlp.exe"
            command = [
                ytdlp_path,
                "-f", "mp4",
                "--no-part",  # Saves directly as .mp4
                "--downloader", "ffmpeg",
                "--downloader-args", "ffmpeg_i:-movflags +faststart",
                "-o", save_path,
                url
            ]
            # Discard output: leaving stdout=PIPE unread can deadlock the child
            # once the pipe buffer fills.
            proc = subprocess.Popen(command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            while proc.poll() is None:
                if stop_download_flag:
                    proc.kill()
                    break
                time.sleep(0.2)  # avoid spinning the CPU while polling
        else:
            response = requests.get(url, stream=True, timeout=10)
            if response.status_code == 200:
                with open(save_path, 'wb') as f:
                    for chunk in response.iter_content(1024 * 1024):  # 1 MB chunks
                        if stop_download_flag:
                            break
                        if chunk:
                            f.write(chunk)
        if stop_download_flag:
            fix_partial_video(save_path)  # Try to repair the truncated file
            messagebox.showinfo("Download Stopped", f"Download was stopped by user.\nSaved: {save_path}")
        else:
            messagebox.showinfo("Download Complete", f"Downloaded successfully to:\n{save_path}")
    except Exception as e:
        messagebox.showerror("Download Error", str(e))
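# Caveat: Tkinter is not thread-safe; the messagebox calls above run in a
# worker thread, which usually works on Windows but is not guaranteed.
# Scheduling them on the main loop with window.after(0, ...) would be the
# safer pattern.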
def stop_download():
    global stop_download_flag
    stop_download_flag = True
def fix_partial_video(input_path):
    try:
        if not os.path.exists(input_path) or not input_path.lower().endswith(".mp4"):
            return
        output_path = input_path.replace(".mp4", "_fixed.mp4")
        ffmpeg_path = shutil.which("ffmpeg") or r"C:\Program Files\ffmpeg\bin\ffmpeg.exe"
        # Try a quick remux first
        command = [
            ffmpeg_path,
            "-y",
            "-i", input_path,
            "-c", "copy",
            "-movflags", "+faststart",
            output_path
        ]
        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        # Fall back to a re-encode if the remux fails or produces a tiny file
        if result.returncode != 0 or not os.path.exists(output_path) or os.path.getsize(output_path) < 1024 * 1024:
            print("[INFO] Remux failed or file too small, retrying with re-encode...")
            command = [
                ffmpeg_path,
                "-y",
                "-i", input_path,
                "-preset", "ultrafast",
                output_path
            ]
            subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        # Replace the original file with the fixed one
        if os.path.exists(output_path):
            os.remove(input_path)
            os.rename(output_path, input_path)
    except Exception as e:
        print(f"[FFmpeg Fix Error] {e}")
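# Note: the first ffmpeg pass remuxes with "-c copy" (no re-encode), which is
# fast but fails if the truncated stream is unreadable; the fallback omits
# "-c copy", so ffmpeg re-encodes with its default codecs -- slower, but more
# tolerant of damaged input.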
def scrape_all_links(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        links = []
        for tag in soup.find_all('a', href=True):
            href = tag['href']
            full_url = urljoin(url, href)
            parsed_url = urlparse(full_url)
            if parsed_url.scheme in ['http', 'https']:
                links.append(full_url)
        return links
    except requests.exceptions.RequestException as e:
        messagebox.showerror("Network Error", f"Failed to scrape links: {e}")
        return []
    except Exception as e:
        messagebox.showerror("Error", f"An unexpected error occurred: {e}")
        return []
def scrape_all_button():
    url = url_entry.get().strip()
    if not url:
        messagebox.showwarning("Input Error", "Please enter a valid URL.")
        return
    result_box.delete("1.0", tk.END)
    media_urls.clear()
    all_links = scrape_all_links(url)
    media_urls.extend(all_links)
    for link in all_links:
        result_box.insert(tk.END, link + "\n")
    messagebox.showinfo("Done", f"{len(all_links)} total link(s) scraped.")
def open_in_vlc():
    line_index = result_box.index(tk.INSERT).split(".")[0]
    selected_url = result_box.get(f"{line_index}.0", f"{line_index}.end").strip()
    if not selected_url:
        messagebox.showwarning("No Selection", "Select a valid media URL.")
        return
    # Prefer VLC from PATH, then fall back to the default install location
    vlc_path = shutil.which("vlc") or r"C:\Program Files\VideoLAN\VLC\vlc.exe"
    if not os.path.exists(vlc_path):
        messagebox.showerror("VLC Error", "VLC is not installed or not found in PATH.")
        return
    try:
        subprocess.Popen([vlc_path, selected_url])
    except Exception as e:
        messagebox.showerror("VLC Error", f"Could not open VLC: {e}")
def preview_image_popup():
    try:
        line_index = result_box.index(tk.INSERT).split(".")[0]
        selected_url = result_box.get(f"{line_index}.0", f"{line_index}.end").strip()
        if not selected_url.lower().endswith(tuple(image_exts)):
            messagebox.showerror("Preview Error", "Selected link is not an image.")
            return
        response = requests.get(selected_url, timeout=10)
        if response.status_code != 200:
            messagebox.showerror("Preview Error", "Failed to load image.")
            return
        image = Image.open(io.BytesIO(response.content))
        popup = tk.Toplevel(window)
        popup.title("Image Preview")
        popup.geometry("600x600")
        img_resized = image.resize((500, 500), Image.LANCZOS)  # ANTIALIAS was removed in Pillow 10
        img_tk = ImageTk.PhotoImage(img_resized)
        label = tk.Label(popup, image=img_tk)
        label.image = img_tk  # keep a reference so the image is not garbage-collected
        label.pack()
    except Exception as e:
        messagebox.showerror("Preview Error", str(e))
def load_m3u_file():
    file_path = filedialog.askopenfilename(title="Open M3U File", filetypes=[("M3U/M3U8 Files", "*.m3u *.m3u8")])
    if file_path:
        result_box.delete("1.0", tk.END)
        media_urls.clear()
        with open(file_path, 'r', encoding="utf-8", errors="ignore") as f:
            for line in f:
                url = line.strip()
                if url and url.startswith("http"):
                    media_urls.append(url)
                    result_box.insert(tk.END, url + "\n")
        messagebox.showinfo("Loaded", f"{len(media_urls)} media URLs loaded from playlist.")
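# Note: only lines that start with "http" are kept, so #EXTINF metadata
# (channel names, logos, groups) is discarded; the playlist loads as a bare
# URL list.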
def load_online_m3u():
    url = url_entry.get().strip()
    if not url.lower().endswith((".m3u", ".m3u8")):
        messagebox.showwarning("URL Error", "Please enter a valid .m3u or .m3u8 URL.")
        return
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            raise Exception("Unable to fetch playlist.")
        result_box.delete("1.0", tk.END)
        media_urls.clear()
        for line in response.text.splitlines():
            line = line.strip()
            if line and line.startswith("http"):
                media_urls.append(line)
                result_box.insert(tk.END, line + "\n")
        messagebox.showinfo("Online M3U Loaded", f"{len(media_urls)} stream(s) loaded.")
    except Exception as e:
        messagebox.showerror("Error", str(e))
def scrape_xtream_m3u_url():
    url = url_entry.get().strip()
    if not url or "get.php" not in url:
        messagebox.showwarning("Input Error", "Please enter a valid Xtream M3U URL.")
        return
    try:
        headers = {
            "User-Agent": "VLC/3.0.18 LibVLC/3.0.18"
        }
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code == 404:
            raise Exception("404 Not Found — the playlist URL might be wrong or expired.")
        if response.status_code != 200:
            raise Exception(f"Failed to fetch playlist. Status code: {response.status_code}")
        content = response.text
        if "#EXTM3U" not in content:
            raise Exception("Invalid playlist. No M3U content found.")
        result_box.delete("1.0", tk.END)
        media_urls.clear()
        for line in content.splitlines():
            if line.startswith("http"):
                media_urls.append(line)
                result_box.insert(tk.END, line + "\n")
        if media_urls:
            messagebox.showinfo("Success", f"Scraped {len(media_urls)} stream URLs from Xtream playlist.")
        else:
            messagebox.showwarning("No URLs", "Playlist loaded, but no stream URLs found.")
    except Exception as e:
        messagebox.showerror("Error", str(e))
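# Note: Xtream-style playlists are typically served as
#   http://host:port/get.php?username=USER&password=PASS&type=m3u_plus
# (assumed format); the VLC User-Agent header is sent presumably because some
# panels refuse requests from unrecognized clients.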
def search_urls():
    query = search_entry.get().strip().lower()
    if not query:
        return
    result_box.tag_remove("highlight", "1.0", tk.END)
    lines = result_box.get("1.0", tk.END).splitlines()
    for i, line in enumerate(lines, 1):
        if query in line.lower():
            result_box.tag_add("highlight", f"{i}.0", f"{i}.end")
    result_box.tag_config("highlight", background="yellow", foreground="black")
def save_as_m3u():
    """Save the contents of the result box as an M3U/M3U8 playlist file."""
    file_path = filedialog.asksaveasfilename(
        defaultextension=".m3u",
        filetypes=[("Text File", "*.txt"), ("M3U Playlist", "*.m3u"), ("M3U8 Playlist", "*.m3u8")]
    )
    if file_path:
        try:
            with open(file_path, 'w', encoding="utf-8") as f:
                # Write content from the result box to the file
                f.write(result_box.get("1.0", tk.END).strip())
            messagebox.showinfo("Saved", f"Playlist saved to:\n{file_path}")
        except Exception as e:
            messagebox.showerror("Save Error", f"Failed to save playlist:\n{str(e)}")
def clear_url_field():
    """Clear the URL entry field."""
    url_entry.delete(0, tk.END)
def clear_result_box():
    """Clear the result box and reset the media URLs list."""
    result_box.delete("1.0", tk.END)
    media_urls.clear()
def clear_search():
    """Clear the search entry field and remove highlights from the result box."""
    search_entry.delete(0, tk.END)
    result_box.tag_remove("highlight", "1.0", tk.END)
def scrape_directory_media(url):
    """
    Scrape media URLs from subdirectories of the given URL.
    :param url: The base URL to start scraping from.
    """
    global media_urls
    result_box.delete("1.0", tk.END)
    media_urls.clear()
    def extract_directories(soup, base_url):
        """
        Extract directory links from the page.
        :param soup: BeautifulSoup object of the page.
        :param base_url: Base URL to resolve relative paths.
        :return: List of directory URLs.
        """
        directories = []
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            if href.endswith("/") and not href.startswith("#"):  # Subdirectory link
                full_href = urljoin(base_url, href)
                if full_href != base_url:  # Avoid infinite loops
                    directories.append(full_href)
        return directories
    def extract_media_urls(soup, base_url):
        """
        Extract media URLs from the page.
        :param soup: BeautifulSoup object of the page.
        :param base_url: Base URL to resolve relative paths.
        :return: Set of media URLs.
        """
        media_links = set()
        for tag in soup.find_all(['img', 'video', 'source', 'a']):
            src = tag.get('src') or tag.get('href')
            if src:
                full_url = urljoin(base_url, src)
                parsed = urlparse(full_url)
                ext = os.path.splitext(parsed.path)[1].lower()
                if ext in image_exts + video_exts:
                    media_links.add(full_url)
        return media_links
    try:
        # Fetch the base URL content
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            messagebox.showerror("Error", f"Failed to fetch {url} (Status Code: {response.status_code})")
            return
        soup = BeautifulSoup(response.text, 'html.parser')
        # Step 1: Extract all subdirectories
        directories = extract_directories(soup, url)
        # Step 2: Scrape media URLs from each subdirectory
        found_media = False
        for directory in directories:
            try:
                dir_response = requests.get(directory, timeout=10)
                if dir_response.status_code == 200:
                    dir_soup = BeautifulSoup(dir_response.text, 'html.parser')
                    media_links = extract_media_urls(dir_soup, directory)
                    if media_links:
                        found_media = True
                        for media_url in media_links:
                            if media_url not in media_urls:
                                media_urls.append(media_url)
                                result_box.insert(tk.END, media_url + "\n")
            except Exception as e:
                print(f"Error scraping directory {directory}: {e}")
        if not found_media:
            messagebox.showinfo("Info", "No media URLs found in subdirectories.")
        else:
            messagebox.showinfo("Success", f"{len(media_urls)} media URL(s) found!")
    except Exception as e:
        messagebox.showerror("Error", str(e))
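# Note: this scraper descends exactly one level: it collects media from each
# direct subdirectory of the starting page, does not recurse further, and
# skips media on the starting page itself.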
# Tab 2 Layout
tk.Label(tab2, text="Enter URL to Scrape Media:").pack(pady=5)
search_frame = tk.Frame(tab2)
search_frame.pack(pady=5)
search_entry = tk.Entry(search_frame, width=40)
search_entry.pack(side=tk.LEFT, padx=5)
tk.Button(search_frame, text="Search", command=search_urls, bg="lightblue").pack(side=tk.LEFT, padx=5)
url_entry = tk.Entry(search_frame, width=100)
url_entry.pack(pady=5)
frame_buttons = tk.Frame(tab2)
frame_buttons.pack(pady=5)
tk.Button(frame_buttons, text="Scrape Media", command=process_url, bg="lightgreen", width=20).pack(side=tk.LEFT, padx=5)
tk.Button(frame_buttons, text="Browse URL File", command=browse_url_file, bg="lightyellow", width=20).pack(side=tk.LEFT, padx=5)
tk.Button(frame_buttons, text="Download All URLs", command=download_selected, bg="lightblue", width=20).pack(side=tk.LEFT, padx=5)
tk.Button(frame_buttons, text="Download Selected URL", command=download_selected_line, bg="orange", width=20).pack(side=tk.LEFT, padx=5)
tk.Button(frame_buttons, text="Save URLs to File", command=save_urls_to_file, bg="lightgray", width=20).pack(side=tk.LEFT, padx=5)
tk.Button(frame_buttons, text="Stop Download", command=stop_download, bg="red", width=20).pack(side=tk.LEFT, padx=5)
frame_button = tk.Frame(tab2)
frame_button.pack(pady=5)
tk.Button(frame_button, text="Scrape All Links", command=scrape_all_button, bg="#e0c3fc", width=20).pack(side=tk.LEFT, padx=5)
tk.Button(frame_button, text="Open in VLC", command=open_in_vlc, bg="#c1f0c1", width=20).pack(side=tk.LEFT, padx=5)
tk.Button(frame_button, text="Preview Image", command=preview_image_popup, bg="#f0c1c1", width=20).pack(side=tk.LEFT, padx=5)
tk.Button(frame_button, text="Load Online M3U", command=load_online_m3u, bg="#c9f2ff", width=20).pack(side=tk.LEFT, padx=5)
tk.Button(frame_button, text="Scrape Xtream M3U", command=scrape_xtream_m3u_url, bg="#fff0b3", width=20).pack(side=tk.LEFT, padx=5)
tk.Button(frame_button, text="Load M3U File", command=load_m3u_file, bg="#d0f0fd", width=20).pack(side=tk.LEFT, padx=5)
result_frame = tk.Frame(tab2)
result_frame.pack(pady=5)
scrollbar = tk.Scrollbar(result_frame)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
result_box = tk.Text(result_frame, height=30, width=124, yscrollcommand=scrollbar.set)
result_box.pack(side=tk.LEFT, fill=tk.BOTH)
scrollbar.config(command=result_box.yview)
frame_clear = tk.Frame(tab2)
frame_clear.pack(pady=5)
tk.Button(frame_clear, text="Save Result", command=save_as_m3u, bg="#a7ffcc", width=20).pack(side=tk.LEFT, padx=5)
tk.Button(frame_clear, text="Clear Search", command=clear_search, bg="lightgray").pack(side=tk.LEFT, padx=2)
tk.Button(frame_clear, text="Clear URL Field", command=clear_url_field, bg="#ffd580", width=20).pack(side=tk.LEFT, padx=5)
tk.Button(frame_clear, text="Clear Result Field", command=clear_result_box, bg="#ffb3b3", width=20).pack(side=tk.LEFT, padx=5)
# Button for scraping subdirectories
tk.Button(frame_clear, text="Scrape Subdirectories", command=lambda: scrape_directory_media(url_entry.get().strip()), bg="#ffcccb", width=20).pack(side=tk.LEFT, padx=5)
# ====================
# === Tab 3 Content ===
# ====================
tab3 = ttk.Frame(notebook)
notebook.add(tab3, text="Web Scraper")
class WebScraperGUI:
    def __init__(self, root):
        self.root = root
        # Configure the style for ttk.Frame
        self.style = ttk.Style()
        self.style.configure("Background.TFrame", background="#336699")  # Define a custom style
        self.root.config(style="Background.TFrame")  # Apply the style to the root frame
        # URL Entry
        self.url_label = ttk.Label(root, text="Enter URL:")
        self.url_label.grid(column=0, row=0, sticky=tk.W, padx=10, pady=5)
        self.url_entry = ttk.Entry(root, width=120)
        self.url_entry.grid(column=1, row=0, columnspan=4, sticky=tk.W, padx=10, pady=5)
        # Options
        self.options_label = ttk.Label(root, text="Select Options:")
        self.options_label.grid(column=0, row=1, sticky=tk.W, padx=10, pady=5)
        # Checkboxes
        self.check_var_html = tk.BooleanVar()
        self.check_var_heading = tk.BooleanVar()
        self.check_var_paragraph = tk.BooleanVar()
        self.check_var_css = tk.BooleanVar()
        self.check_var_table = tk.BooleanVar()
        self.check_var_links = tk.BooleanVar()
        self.check_var_files = tk.BooleanVar()
        self.html_check = ttk.Checkbutton(root, text="Full HTML", variable=self.check_var_html)
        self.html_check.grid(column=1, row=1, sticky=tk.W, padx=10, pady=5)
        self.heading_check = ttk.Checkbutton(root, text="Headings", variable=self.check_var_heading)
        self.heading_check.grid(column=2, row=1, sticky=tk.W, padx=10, pady=5)
        self.paragraph_check = ttk.Checkbutton(root, text="Paragraphs", variable=self.check_var_paragraph)
        self.paragraph_check.grid(column=3, row=1, sticky=tk.W, padx=10, pady=5)
        self.css_check = ttk.Checkbutton(root, text="CSS", variable=self.check_var_css)
        self.css_check.grid(column=4, row=1, sticky=tk.W, padx=10, pady=5)
        self.table_check = ttk.Checkbutton(root, text="Tables", variable=self.check_var_table)
        self.table_check.grid(column=1, row=2, sticky=tk.W, padx=10, pady=5)
        self.links_check = ttk.Checkbutton(root, text="Links", variable=self.check_var_links)
        self.links_check.grid(column=2, row=2, sticky=tk.W, padx=10, pady=5)
        self.files_check = ttk.Checkbutton(root, text="Files", variable=self.check_var_files)
        self.files_check.grid(column=3, row=2, sticky=tk.W, padx=10, pady=5)
        # Result Text Field
        self.result_label = ttk.Label(root, text="Scraped Content of Websites:")
        self.result_label.grid(column=0, row=4, sticky=tk.W, padx=10, pady=5)
        #self.result_text = scrolledtext.ScrolledText(root, width=110, height=33, wrap=tk.WORD)
        self.result_text = scrolledtext.ScrolledText(root, width=116, height=33, wrap=tk.WORD, bg="#f0f0f0")
        self.result_text.grid(column=0, row=5, columnspan=5)
        # Scrape Button
        self.scrape_button = ttk.Button(root, text="SCRAPE", command=self.scrape)
        self.scrape_button.grid(column=4, row=4, columnspan=8, pady=10)
        # Save Result Button
        self.save_result_button = ttk.Button(root, text="Save Result", command=self.save_result, style='Red.TButton')
        self.save_result_button.grid(column=2, row=4, columnspan=8, pady=10)
        # Define style for the "Save Result" button
        self.style.configure('Red.TButton', foreground='red')
    def scrape(self):
        url = self.url_entry.get()
        if not url:
            messagebox.showwarning("Input Error", "Please enter a valid URL.")
            return
        options = {
            'html': self.check_var_html.get(),
            'heading': self.check_var_heading.get(),
            'paragraph': self.check_var_paragraph.get(),
            'css': self.check_var_css.get(),
            'table': self.check_var_table.get(),
            'links': self.check_var_links.get(),
            'files': self.check_var_files.get()
        }
        try:
            response = requests.get(url, timeout=15)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            result = ""
            if options['html']:
                result += str(soup) + '\n\n'
            if options['heading']:
                headings = soup.find_all(re.compile('^h[1-6]$'))
                result += "Headings:\n"
                for heading in headings:
                    result += heading.text.strip() + '\n'
                result += '\n'
            if options['paragraph']:
                paragraphs = soup.find_all('p')
                result += "Paragraphs:\n"
                for paragraph in paragraphs:
                    result += paragraph.text.strip() + '\n'
                result += '\n'
            if options['css']:
                css_links = [link['href'] for link in soup.find_all('link', rel='stylesheet')]
                result += "CSS Links:\n"
                for css_link in css_links:
                    full_url = urljoin(url, css_link)
                    result += full_url + '\n'
                result += '\n'
            if options['table']:
                tables = soup.find_all('table')
                result += "Tables:\n"
                for table in tables:
                    result += str(table) + '\n'
                result += '\n'
            if options['links']:
                links = soup.find_all('a', href=True)
                result += "Links:\n"
                for link in links:
                    if link['href'].startswith('http'):
                        result += f"Text: {link.text.strip()}, URL: {link['href']}\n"
                    else:
                        full_url = urljoin(url, link['href'])
                        result += f"Text: {link.text.strip()}, URL: {full_url}\n"
                result += '\n'
            if options['files']:
                file_links = [link['href'] for link in soup.find_all('a', href=True) if re.search(r'\.[^.]+$', link['href'])]
                result += "File Links:\n"
                for file_link in file_links:
                    full_url = urljoin(url, file_link)
                    result += full_url + '\n'
                result += '\n'
            self.result_text.delete(1.0, tk.END)
            self.result_text.insert(tk.END, result)
        except requests.exceptions.RequestException as e:
            messagebox.showerror("Network Error", f"Failed to fetch URL: {e}")
        except Exception as e:
            messagebox.showerror("Error", f"An unexpected error occurred: {e}")
    def save_result(self):
        result_text = self.result_text.get(1.0, tk.END)
        if not result_text.strip():
            messagebox.showwarning("Empty Result", "No content to save.")
            return
        file_path = filedialog.asksaveasfilename(defaultextension=".txt", filetypes=[("Text files", "*.txt")])
        if file_path:
            try:
                with open(file_path, "w", encoding="utf-8") as file:
                    file.write(result_text)
                messagebox.showinfo("Success", f"Result saved to {file_path}")
            except Exception as e:
                messagebox.showerror("Save Error", f"Failed to save file: {e}")
# Initialize WebScraperGUI in Tab 3
web_scraper_gui = WebScraperGUI(tab3)
# Run
window.mainloop()