In [1]:
import os
import re
import json
import lxml
import requests
import warnings
import psycopg2
import time, random
from dotenv import load_dotenv
from bs4 import BeautifulSoup, Tag, NavigableString, XMLParsedAsHTMLWarning
from IPython.display import Markdown, display, update_display

SQL statement to select links that are not specific to a programming language and do not have `/reference/` in the URL:

```sql
select link from doc_links where link not like '%.com/java/%' and link not like '%.com/php%' and link not like '%.com/python%' and link not like '%.com/ruby%' and link not like '%.com/cpp%' and link not like '%.com/dotnet%' and link not like '%.com/nodejs%' and link not like '%/reference/%' limit 10;
```

In [2]:
# Read the PostgreSQL credentials from environment variables (load_dotenv()
# is called first so values from a .env file are available) instead of
# writing them into the notebook.
load_dotenv()
db_user, db_pwd = (os.environ.get(key) for key in ('POSTGRES_USER', 'POSTGRES_PASSWORD'))

# Show the user name as the cell output.
db_user

'admin'

In [None]:
def connect_to_db(host="localhost", port=5432, dbname="gcplinks"):
    """
    Open a connection to the PostgreSQL database used by this notebook.

    The user name and password are taken from the notebook-level `db_user`
    and `db_pwd` variables, which are read from the environment.

    Args:
        host: Database server host name. Defaults to "localhost".
        port: Database server port. Defaults to 5432.
        dbname: Name of the database to connect to. Defaults to "gcplinks".

    Returns:
        The open psycopg2 connection, or None if the connection attempt
        failed (the error is printed rather than raised).
    """
    try:
        conn = psycopg2.connect(
            host=host,
            port=port,
            dbname=dbname,  # `database` is only a deprecated alias of `dbname`
            user=db_user,
            password=db_pwd,
        )
    except psycopg2.Error as e:
        print(f"Error connecting to the database: {e}")
        return None

    print("Successfully connected to the database.")
    return conn

In [4]:
# Pieces of the sitemap address. `sitemap_url` keeps a `{}` placeholder that
# is later filled with the sitemap number via str.format().
base_url = 'https://cloud.google.com/'
sitemap_url = 'sitemap_{}_of_390'
url = f"{base_url}{sitemap_url}.xml"

# First and last sitemap number to download.
start_number, end_number = 1, 2

# Connect / read timeouts for HTTP requests, in seconds.
connect_timeout_secs, read_timeout_secs = 5, 15

# Lower / upper bound, in seconds, of the random pause between requests.
random_sleep_lower, random_sleep_upper = 5, 20

# Show the resulting URL template as the cell output.
url

'https://cloud.google.com/sitemap_{}_of_390.xml'

In [5]:
# Browser-like HTTP request headers for the sitemap downloads.
# The User-Agent is assembled from its three product tokens, joined by spaces.
headers = {
    "User-Agent": " ".join((
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/122.0.0.0 Safari/537.36",
    )),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Referer": "https://cloud.google.com/",
}


In [6]:
# Output folders, relative to the notebook's working directory.
raw_data_folder = './raw_data/'
page_data_folder = './page_data/'

In [7]:
# Silence BeautifulSoup's XMLParsedAsHTMLWarning.
#
# The warning text reads, in short: "It looks like you're using an HTML parser
# to parse an XML document." It recommends installing the 'lxml' package and
# passing `features="xml"` to the BeautifulSoup constructor, or, if an HTML
# parser is really wanted, filtering the warning before the constructor is
# called. This cell does the latter.

warnings.filterwarnings(action="ignore", category=XMLParsedAsHTMLWarning)

### Scrape page

The next code block downloads each sitemap XML file in the configured number range and saves it to the raw data folder.

It is disabled by default to avoid accidentally running the scrape again.

In [8]:
# Set to True to actually download the sitemaps. It is off by default so that
# running the whole notebook does not hit the remote site again.
RUN_SCRAPE = False


def scrape_sitemaps(first, last):
    """
    Download sitemap files `first`..`last` (inclusive) and save them locally.

    Each sitemap is requested with the notebook's `headers` and timeouts,
    parsed as XML, and written as text into `raw_data_folder`. After each
    file the function sleeps for a random time between `random_sleep_lower`
    and `random_sleep_upper` seconds.

    Args:
        first: Number of the first sitemap to download.
        last: Number of the last sitemap to download (inclusive).

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            that an error page is never saved as if it were a sitemap.
    """
    # Build the template here rather than relying on the global `url`, which
    # is a generic name that other cells can easily rebind.
    sitemap_template = base_url + sitemap_url + '.xml'
    os.makedirs(raw_data_folder, exist_ok=True)

    for i in range(first, last + 1):
        response = requests.get(
            sitemap_template.format(i),
            headers=headers,
            timeout=(connect_timeout_secs, read_timeout_secs),
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'xml')

        raw_file = raw_data_folder + sitemap_url.format(i) + ".txt"
        # Write UTF-8 explicitly: the saved files are read back as UTF-8.
        with open(raw_file, "w", encoding="utf-8") as f:
            f.write(str(soup))

        print("Wrote file: {}".format(raw_file))
        random_sleep_time_seconds = random.uniform(random_sleep_lower, random_sleep_upper)
        print("Sleeping for {} seconds.".format(random_sleep_time_seconds))
        time.sleep(random_sleep_time_seconds)


if RUN_SCRAPE:
    scrape_sitemaps(start_number, end_number)

In [11]:
# Read the saved sitemap files and insert every <url> entry into doc_links.
# Each row gets the link, the date part of <lastmod>, and the fixed date
# '1970-01-01' for date_last_visited.
insert = "insert into doc_links (link, date_last_modified, date_last_visited) values (%s, %s, '1970-01-01')"

# Inclusive range of saved sitemap files to load.
first_sitemap, last_sitemap = 1, 7

conn = connect_to_db()
if conn is None:
    # connect_to_db() has already printed the underlying error.
    raise RuntimeError("Could not connect to the database; no links were inserted.")

cur = conn.cursor()
try:
    for j in range(first_sitemap, last_sitemap + 1):
        raw_file = raw_data_folder + sitemap_url.format(j) + ".txt"
        with open(raw_file, "r", encoding='utf-8') as f:
            print("reading : {}".format(raw_file))
            soup = BeautifulSoup(f, 'xml')

        # The loop variable is deliberately not called `url`, so the global
        # URL template of that name is left intact.
        for url_tag in soup.find_all('url'):
            loc_tag = url_tag.find('loc')
            if loc_tag is None:
                # An entry without a location has nothing to store.
                print("Skipping <url> entry without <loc> in {}".format(raw_file))
                continue
            loc = loc_tag.get_text()

            # <lastmod> may be absent; store NULL then. Otherwise keep only
            # the date part before the 'T' of the timestamp.
            lastmod_tag = url_tag.find('lastmod')
            lastmod = None if lastmod_tag is None else lastmod_tag.get_text().split('T')[0]

            # Commit per row so one failing insert only rolls back that row.
            try:
                cur.execute(insert, (loc, lastmod))
                conn.commit()
            except psycopg2.Error as e:
                conn.rollback()
                print("Failed to insert link: {}".format(loc))
                print(e)
finally:
    # Release the cursor and connection even if reading or parsing fails.
    cur.close()
    conn.close()

Successfully connected to the database.
reading : ./raw_data/sitemap_1_of_390.txt
reading : ./raw_data/sitemap_2_of_390.txt
reading : ./raw_data/sitemap_3_of_390.txt
reading : ./raw_data/sitemap_4_of_390.txt
reading : ./raw_data/sitemap_5_of_390.txt
reading : ./raw_data/sitemap_6_of_390.txt
reading : ./raw_data/sitemap_7_of_390.txt
