Let us have fun! (A 3-minute web scraping exercise.)
As per the deliverables by Sethu (may be useful).
We have to back up the iMM website from GoDaddy.
Try this code:
import requests
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
def get_all_urls(website_url, timeout=30):
    """Collect the absolute URLs of every hyperlink on a single page.

    Fetches ``website_url``, parses the HTML and resolves each
    ``<a href=...>`` against ``website_url``. Only the given page is
    scanned; links are not followed recursively.

    Args:
        website_url: URL of the page to scan.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Returns:
        A set of absolute ``http``/``https`` URLs with any ``#fragment``
        removed. An empty set is returned (and the error printed) when the
        page cannot be fetched.
    """
    try:
        response = requests.get(website_url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return set()

    soup = BeautifulSoup(response.text, 'html.parser')
    urls = set()
    for a_tag in soup.find_all('a', href=True):
        try:
            parsed = urlparse(urljoin(website_url, a_tag['href'].strip()))
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal): skip this link
            # instead of aborting the whole scan.
            continue
        # mailto:, tel:, javascript: etc. are not pages that can be fetched.
        if parsed.scheme not in ("http", "https"):
            continue
        # "page#a" and "page#b" are the same document; keep one entry.
        urls.add(parsed._replace(fragment="").geturl())
    return urls
def backup_urls(urls, backup_dir="backups", timeout=30):
    """Download each URL and save its body as a file in ``backup_dir``.

    The file name is built from the URL's host, path and (if present)
    query string, with characters that are unsafe in file names replaced
    by ``_``, followed by ``.html``. A failure on one URL is printed and
    the remaining URLs are still processed.

    Args:
        urls: Iterable of URLs to download.
        backup_dir: Directory to write into; created if it does not exist.
        timeout: Seconds to wait for each server response before giving up.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    os.makedirs(backup_dir, exist_ok=True)
    # Path separators plus the characters Windows rejects in file names
    # (e.g. the ":" in "host:8080").
    unsafe_chars = str.maketrans({ch: "_" for ch in '<>:"/\\|?*'})
    for url in urls:
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            # Generate a safe file name based on the URL
            parsed_url = urlparse(url)
            stem = parsed_url.netloc + parsed_url.path
            if parsed_url.query:
                # Keep the query so "page?id=1" and "page?id=2" do not
                # overwrite each other.
                stem += "_" + parsed_url.query
            file_name = stem.translate(unsafe_chars) + ".html"
            file_path = os.path.join(backup_dir, file_name)
            # Save the raw bytes: re-encoding response.text relies on a
            # guessed charset and can corrupt the page.
            with open(file_path, 'wb') as file:
                file.write(response.content)
            print(f"Backup saved: {file_path}")
        except (requests.exceptions.RequestException, OSError) as e:
            # OSError covers file-system failures (bad or over-long name,
            # permissions) so one bad URL does not abort the whole backup.
            print(f"Failed to backup {url}: {e}")
# Example usage: scan the home page for links, then save every linked page.
website = "https://intmmsw.com/"

# Guarded so that importing this module does not trigger network requests
# or write files; the backup only runs when the file is executed directly.
if __name__ == "__main__":
    urls = get_all_urls(website)
    print(urls)
    print(f"Found {len(urls)} URLs. Starting backup...")
    backup_urls(urls)
Big fun! Sweet! You get all the HTML backup files on your desktop in a minute!
# Response
"""
Found 21 URLs. Starting backup...
Backup saved: backups\intmmsw.com_software-%26-hitech.html
Backup saved: backups\intmmsw.com_oil-%26-gas-1.html
Backup saved: backups\intmmsw.com_google-cloud-1.html
Backup saved: backups\intmmsw.com_rpa.html
Backup saved: backups\intmmsw.com_cloud-computing.html
Backup saved: backups\intmmsw.com_development-center.html
Backup saved: backups\intmmsw.com_software-as-a-service.html
Backup saved: backups\intmmsw.com_healthcare-%26-biotech-1.html
Backup saved: backups\intmmsw.com_professional-services-1.html
Backup saved: backups\intmmsw.com_oci.html
Backup saved: backups\intmmsw.com_artificial-intelligence.html
Backup saved: backups\intmmsw.com_data-analytics-%26-insights-1.html
Backup saved: backups\intmmsw.com_.html
Backup saved: backups\intmmsw.com_web-development.html
Backup saved: backups\intmmsw.com_digital-transformation-1.html
Backup saved: backups\intmmsw.com_education-%26-learning-1.html
Backup saved: backups\intmmsw.com_banking-%26-fintech-1.html
Backup saved: backups\intmmsw.com_govt-%26-public-sector-1.html
Backup saved: backups\intmmsw.com_data-warehouse%2Fdatalake.html
Backup saved: backups\intmmsw.com_azure-1.html
Backup saved: backups\intmmsw.com_aws.html
"""
Note :
Please clone our website with a suitable tool and download it to your desktop to capture the other files, such as jpg, png, js, css, and scss!!!☝
No comments:
Post a Comment