Sunday, 12 January 2025

Web Scratch - iMM (For Sethu?)

Let us have fun! (A 3-minute web scraping exercise.)

As per the deliverables from Sethu (may be useful).

We have to back up the iMM website from GoDaddy.

Try this code:

import requests

import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def get_all_urls(website_url, timeout=10):
    """Collect the absolute URLs of every hyperlink found on one page.

    Only the page at ``website_url`` is fetched; links are not followed.

    Args:
        website_url: Address of the page to fetch. It is also the base
            against which relative ``href`` values are resolved.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A set of absolute ``http``/``https`` URLs with any ``#fragment``
        removed. If the request fails, the error is printed and an empty
        set is returned.
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(website_url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return set()

    soup = BeautifulSoup(response.text, 'html.parser')

    urls = set()
    for a_tag in soup.find_all('a', href=True):
        try:
            parts = urlparse(urljoin(website_url, a_tag['href'].strip()))
        except ValueError:
            # A malformed href must not abort the whole scan.
            continue

        # mailto:, tel:, javascript: and similar links cannot be downloaded.
        if parts.scheme not in ("http", "https"):
            continue

        # A fragment only points inside a page, so "/page" and "/page#top"
        # are the same document and should be collected once.
        urls.add(parts._replace(fragment="").geturl())

    return urls

def _safe_file_name(url):
    """Return a file name for *url* that is valid on Windows, macOS and Linux."""
    parsed_url = urlparse(url)
    stem = parsed_url.netloc + parsed_url.path
    if parsed_url.query:
        # Pages that differ only by query string must not overwrite each other.
        stem += "_" + parsed_url.query

    # "/" would create sub-directories; the other characters are rejected by
    # Windows (for example the ":" in "host:8080").
    unsafe = '<>:"/\\|?*'
    return stem.translate(str.maketrans(unsafe, "_" * len(unsafe))) + ".html"


def backup_urls(urls, backup_dir="backups", timeout=10):
    """Download every URL and save its body as a file in ``backup_dir``.

    A URL that cannot be downloaded or written is reported on stdout and
    skipped, so one bad link never aborts the rest of the backup.

    Args:
        urls: Iterable of absolute URLs to download.
        backup_dir: Directory that receives the files; created if missing.
        timeout: Seconds to wait for each server before giving up.

    Returns:
        None. One file per successfully fetched URL is written, named after
        the URL's host, path and query string.
    """
    os.makedirs(backup_dir, exist_ok=True)

    for url in urls:
        try:
            # Without a timeout, one stalled server would hang the backup.
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Failed to backup {url}: {e}")
            continue

        file_path = os.path.join(backup_dir, _safe_file_name(url))

        try:
            # Write the raw bytes: response.text is decoded with a guessed
            # charset, and re-encoding it can corrupt pages whose real
            # encoding differs from the guess.
            with open(file_path, 'wb') as file:
                file.write(response.content)
        except OSError as e:
            print(f"Failed to backup {url}: {e}")
            continue

        print(f"Backup saved: {file_path}")

# Example usage

# Scan the home page for links, report what was found, then save each page
# into the default "backups" directory.
website = "https://intmmsw.com/"
urls = get_all_urls(website_url=website)
print(urls)
print(f"Found {len(urls)} URLs. Starting backup...")
backup_urls(urls=urls)

Big fun! Sweet! You get all the HTML backup files on your desktop in a minute!

# Response

"""
Found 21 URLs. Starting backup...
Backup saved: backups\intmmsw.com_software-%26-hitech.html
Backup saved: backups\intmmsw.com_oil-%26-gas-1.html
Backup saved: backups\intmmsw.com_google-cloud-1.html
Backup saved: backups\intmmsw.com_rpa.html
Backup saved: backups\intmmsw.com_cloud-computing.html
Backup saved: backups\intmmsw.com_development-center.html
Backup saved: backups\intmmsw.com_software-as-a-service.html
Backup saved: backups\intmmsw.com_healthcare-%26-biotech-1.html
Backup saved: backups\intmmsw.com_professional-services-1.html
Backup saved: backups\intmmsw.com_oci.html
Backup saved: backups\intmmsw.com_artificial-intelligence.html
Backup saved: backups\intmmsw.com_data-analytics-%26-insights-1.html
Backup saved: backups\intmmsw.com_.html
Backup saved: backups\intmmsw.com_web-development.html
Backup saved: backups\intmmsw.com_digital-transformation-1.html
Backup saved: backups\intmmsw.com_education-%26-learning-1.html
Backup saved: backups\intmmsw.com_banking-%26-fintech-1.html
Backup saved: backups\intmmsw.com_govt-%26-public-sector-1.html
Backup saved: backups\intmmsw.com_data-warehouse%2Fdatalake.html
Backup saved: backups\intmmsw.com_azure-1.html
Backup saved: backups\intmmsw.com_aws.html
"""

Note:

The script above saves only the HTML pages. Please also clone our website with a site-mirroring tool and download it to your desktop to capture the other files, such as jpg, png, js, css, and scss! ☝

 

No comments:

Post a Comment

Hands on with Llama #2

Let us play with Gradio UI! Creating a Gradio interface only requires adding a couple lines of code to your project.  Seamlessly use any pyt...