In [None]:
import json
import os
import random
import re
import time
from typing import List

import gradio as gr
import psycopg2
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [None]:
# Load variables from a local .env file into the process environment, then
# read the PostgreSQL credentials that connect_to_db() passes to psycopg2.
load_dotenv()
db_user = os.getenv('POSTGRES_USER')
db_pwd = os.getenv('POSTGRES_PASSWORD')

# os.getenv returns None for an unset variable; say so here instead of letting
# the problem surface later as a less obvious connection error.
missing_db_vars = [
    name
    for name, value in (('POSTGRES_USER', db_user), ('POSTGRES_PASSWORD', db_pwd))
    if not value
]
if missing_db_vars:
    print(f"Warning: environment variable(s) not set: {', '.join(missing_db_vars)}")

In [None]:
def connect_to_db(host="localhost", port=5432, database="gcplinks"):
    """
    Open a connection to a PostgreSQL database.

    The user name and password are taken from the module-level ``db_user``
    and ``db_pwd`` values, which the notebook reads from the POSTGRES_USER
    and POSTGRES_PASSWORD environment variables.

    Args:
        host: Database server host name. Defaults to "localhost".
        port: Database server port. Defaults to 5432.
        database: Name of the database to open. Defaults to "gcplinks".

    Returns:
        The psycopg2 connection object, or None if the connection fails
        (the psycopg2 error is printed instead of being raised).
    """
    try:
        conn = psycopg2.connect(
            host=host,
            port=port,
            database=database,
            user=db_user,
            password=db_pwd,
        )
    except psycopg2.Error as e:
        print(f"Error connecting to the database: {e}")
        return None

    print("Successfully connected to the database.")
    return conn

In [None]:
# HTTP timeouts in seconds; Website passes them to requests.get as a
# (connect, read) tuple.
connect_timeout_secs, read_timeout_secs = 5, 15

# Lower and upper bounds for a randomized pause.
# NOTE(review): presumably seconds to sleep between requests; nothing in this
# part of the notebook uses them, so confirm against the later cells.
random_sleep_lower, random_sleep_upper = 5, 20

In [None]:
#
# from Ed Donner's github repo for LLM Engineering course on Udemy: 
# https://github.com/ed-donner/llm_engineering/blob/main/week1/day5.ipynb
#
# headers = {
# "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
# }

# Browser-like request headers that Website sends with its HTTP request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    # Advertise only encodings that requests always decodes. "br" (Brotli) is
    # decoded only when an optional Brotli package is installed; without it a
    # Brotli response would reach the HTML parser still compressed.
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "Referer": "https://cloud.google.com/",
}


class Website:
    """
    A utility class to represent a Website that we have scraped, now with links.

    Attributes set by the constructor:
        url:   the URL that was fetched
        body:  the raw response bytes
        title: the page title, or "No title found" when none can be read
        text:  text of <body> with script/style/img/input elements removed,
               or "" when the page has no <body>
        links: every non-empty href found on an <a> tag
    """

    def __init__(self, url):
        """
        Fetch `url` and parse the response with BeautifulSoup.

        The request uses the module-level `headers` and the
        (connect_timeout_secs, read_timeout_secs) timeout pair. Nothing is
        caught here, so any exception raised by requests propagates to the
        caller.
        """
        self.url = url
        response = requests.get(
            url,
            headers=headers,
            timeout=(connect_timeout_secs, read_timeout_secs),
        )
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # Tag.string is None for an empty <title> or one that wraps other
        # tags; fall back to the placeholder instead of storing None.
        title_text = soup.title.string if soup.title else None
        self.title = title_text if title_text else "No title found"

        if soup.body:
            # Remove elements that contribute no readable text.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        hrefs = [anchor.get('href') for anchor in soup.find_all('a')]
        # Drop anchors with a missing or empty href.
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the title and page text as one labelled block of text."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"


In [3]:
# gcp_products = Website("https://cloud.google.com/products")
# gcp_products = Website("https://cloud.google.com/docs")
# gcp_products = Website("https://cloud.google.com/compute/docs")
# doc_url = "/compute/docs/images/create-custom"

# Page to scrape: the site root plus a documentation path.
gcp_url = "https://cloud.google.com"
doc_url = "/compute/docs/instances"
url = f"{gcp_url}{doc_url}"

# Folders that receive the extracted links and the raw page body.
doc_link_folder = "gcp_pages/links/"
doc_html_folder = "gcp_pages/html/"

# Output file names come from the doc path: drop its first character (the
# leading slash) and turn the remaining slashes into underscores.
doc_link_file = "{}{}_links.txt".format(doc_link_folder, "_".join(doc_url[1:].split("/")))
doc_html_file = "{}{}_html.txt".format(doc_html_folder, "_".join(doc_url[1:].split("/")))

# Fetch and parse the page (performs the HTTP request).
website = Website(url)


In [4]:
# Keep only the links whose href contains "/docs"; the set drops duplicates.
doc_links = {link for link in website.links if "/docs" in link}

print(f"Found {len(doc_links)} links containing /docs")

# Create the output folders on a first run so open() does not fail.
os.makedirs(doc_link_folder, exist_ok=True)
os.makedirs(doc_html_folder, exist_ok=True)

if doc_links:
    # Sorted so the file is identical across runs; set order is not stable.
    with open(doc_link_file, 'w', encoding='utf-8') as f:
        for link in sorted(doc_links):
            f.write(link + '\n')

# website.body is bytes, so write it in binary mode. str() on bytes would save
# the Python repr (b'...' with escaped newlines) rather than the HTML itself.
with open(doc_html_file, 'wb') as f:
    f.write(website.body)

Found 622 links containing /docs
