In [1]:
import re
from openai import OpenAI
import gradio as gr
import requests
import json
from typing import List
from bs4 import BeautifulSoup, Tag, NavigableString
from IPython.display import Markdown, display, update_display

In [2]:
#
# from Ed Donner's github repo for LLM Engineering course on Udemy: 
# https://github.com/ed-donner/llm_engineering/blob/main/week1/day5.ipynb
#
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
 """
 A utility class to represent a Website that we have scraped, now with links
 """

 def __init__(self, url):
 
 self.url = url
 self.title = None
 self.links = set()
 self.last_updated = '1970-01-01'
 
 response = requests.get(url, headers=headers)
 self.body = response.content
 self.soup = BeautifulSoup(self.body, 'html.parser')
 self.article = self.soup.find('article', class_='devsite-article')
 # self.title = soup.title.string if soup.title else "No title found"
 # if soup.body:
 # for irrelevant in soup.body(["script", "style", "img", "input"]):
 # irrelevant.decompose()
 # self.text = soup.body.get_text(separator="\n", strip=True)
 # else:
 # self.text = ""
 # links = [link.get('href') for link in soup.find_all('a')]
 # self.links = [link for link in links if link]

 def get_doc_title(self):
 if self.title:
 return self.title

 title = self.soup.find('h1', class_='devsite-page-title').find(string=True, recursive=False).strip()
 if not title:
 self.title = '{} | No Title'.format(self.url)
 else:
 self.title = title

 return self.title

 def get_last_updated(self):
 footer = self.soup.find('devsite-content-footer')
 footer_paras = footer.find_all('p')
 for fp in footer_paras:
 last_updated_re = r'Last updated (.*) UTC'
 match = re.search(last_updated_re, fp.get_text())
 if match:
 self.last_updated = match.group(1)
 break

 def get_doc_links(self):
 nav_items = self.soup.find_all('li', class_='devsite-nav-item')
 links_in_article = self.article_section.find_all('a', href=True)
 all_links = nav_items + links_in_article
 
 for al in all_links:
 link = al.find('a', href=True)
 if link and 'docs' in link['href']:
 if link['href'][0] == '/':
 self.links.add('https://cloud.google.com{}'.format(link['href']))
 else:
 self.links.add(link['href'])


In [3]:
exclude_tags = ['devsite-feedback', 'devsite-actions', 'devsite-toc', 'aside', 'devsite-page-title-meta', 'devsite-thumb-rating']

In [4]:
# eventarc = Website('https://cloud.google.com/eventarc/docs')
eventarc = Website('https://cloud.google.com/compute/docs/overview')

In [5]:
eventarc.get_doc_title()

'Compute Engine overview'

In [6]:
eventarc.title

'Compute Engine overview'

In [7]:
soup = eventarc.soup

In [8]:
print(soup.find('h1', class_='devsite-page-title').find(string=True, recursive=False).strip())

Compute Engine overview


In [9]:
footer = soup.find('devsite-content-footer')

In [10]:
footer_paras = footer.find_all('p')

In [11]:
for fp in footer_paras:
 last_updated_re = r'Last updated (.*) UTC'
 match = re.search(last_updated_re, fp.get_text())
 print(match)
 if match:
 print(match.group(1))
 else:
 print('Nope')

None
Nope
<re.Match object; span=(0, 27), match='Last updated 2025-05-06 UTC'>
2025-05-06


In [12]:
eventarc.get_last_updated()
eventarc.last_updated

'2025-05-06'

In [13]:
nav_items = soup.find_all('li', class_='devsite-nav-item')
nav_items[0].find('a', href=True)['href']

'https://cloud.google.com/docs/ai-ml'

In [14]:
links_on_page = set()
for ni in nav_items:
 link = ni.find('a', href=True)
 if link and 'docs' in link['href']:
 if link['href'][0] == '/' and '#' not in link['href']:
 links_on_page.add('https://cloud.google.com{}'.format(link['href']))
 else:
 links_on_page.add(link['href'])

In [15]:
links_on_page

{'https://cloud.google.com/build/docs/deploying-builds/deploy-compute-engine',
 'https://cloud.google.com/compute/docs/about-confidential-vm',
 'https://cloud.google.com/compute/docs/about-shielded-vm',
 'https://cloud.google.com/compute/docs/accelerator-optimized-machines',
 'https://cloud.google.com/compute/docs/access',
 'https://cloud.google.com/compute/docs/access/app-authentication-methods',
 'https://cloud.google.com/compute/docs/access/authenticate-workloads',
 'https://cloud.google.com/compute/docs/access/authenticate-workloads-over-mtls',
 'https://cloud.google.com/compute/docs/access/create-enable-service-accounts-for-instances',
 'https://cloud.google.com/compute/docs/access/custom-constraints',
 'https://cloud.google.com/compute/docs/access/iam',
 'https://cloud.google.com/compute/docs/access/managing-access-to-resources',
 'https://cloud.google.com/compute/docs/access/service-accounts',
 'https://cloud.google.com/compute/docs/apis',
 'https://cloud.google.com/compute/docs

In [16]:
article_section = soup.find('article', class_='devsite-article')
links_in_article = article_section.find_all('a', href=True)

In [17]:
def make_google_cloud_link(link):
 return 'https://cloud.google.com{}'.format(link['href'])

In [18]:
article_links = set()
for lia in links_in_article:
 if lia and 'docs' in lia['href'] and '#' not in lia['href']:
 article_links.add(make_google_cloud_link(lia))

In [19]:
article_links

{'https://cloud.google.com/apis/docs/cloud-client-libraries',
 'https://cloud.google.com/compute/docs/choose-compute-deployment-option',
 'https://cloud.google.com/compute/docs/cpu-platforms',
 'https://cloud.google.com/compute/docs/disks',
 'https://cloud.google.com/compute/docs/gpus',
 'https://cloud.google.com/compute/docs/machine-resource',
 'https://cloud.google.com/compute/docs/networking/network-overview',
 'https://cloud.google.com/compute/docs/regions-zones',
 'https://cloud.google.comhttps://cloud.google.com/compute/docs',
 'https://cloud.google.comhttps://cloud.google.com/compute/docs/overview'}

In [None]:
# i = 0
# for elem in article_section:
# if i < 5:
 
# if elem.name in exclude_tags:
# continue

# print('** {} **: {}'.format(i,elem))
# if isinstance(elem, Tag):
# keys = elem.attrs.keys()
# print(keys)
# if 'class' in keys:
# print("---*** TAG NAME: {}".format(elem.name))
# print(elem['class'])
# class_list = elem['class']
# print("CLASS_LIST: {}".format(class_list))
# for cl in class_list:
# print("CLASS cl: {}".format(cl))
# if 'breadcrumb' in cl or 'meta' in cl:
# print('DECOMPOSING\n')
# elem.decompose()
# break
# else:
# print('No class or no breadcrumb')
# i = i + 1
# else:
# continue
# else:
# article_section = article_section
# break

In [None]:
# article_section

In [21]:
soupy = eventarc.soup

In [22]:
art_body = soupy.select('.devsite-article-body')

In [23]:
content = ""
for ab in art_body:
 a_tags = ab.find_all('a')
 for a_tag in a_tags:
 a_tag.decompose()
 content += ab.get_text(separator="\n").strip()

In [None]:
# content

"Compute Engine is an infrastructure as a service (IaaS) product that offers\nself-managed virtual machine (VM) instances and bare\nmetal instances. Compute Engine offers VMs with a KVM hypervisor,\noperating systems for both Linux and Windows, and local and durable\nstorage options. You can configure and control Compute Engine\nresources using the Google Cloud console, the Google Cloud CLI, or using a\nREST-based API. You can also use a variety of programming languages available\nwith Google's\n\n.\n\n\nHere are some of the benefits of using Compute Engine:\n\n\n\n\nExtensibility:\n Compute Engine integrates with Google Cloud\ntechnologies such as Cloud Storage, Google Kubernetes Engine, and\nBigQuery, to extend beyond the basic computational capability to\ncreate more complex and sophisticated applications.\n\n\nScalability:\n Scale the number of compute resources as needed without\nhaving to manage your own infrastructure. This is useful for businesses that\nexperience sudden increa

In [25]:
print(content)

Compute Engine is an infrastructure as a service (IaaS) product that offers
self-managed virtual machine (VM) instances and bare
metal instances. Compute Engine offers VMs with a KVM hypervisor,
operating systems for both Linux and Windows, and local and durable
storage options. You can configure and control Compute Engine
resources using the Google Cloud console, the Google Cloud CLI, or using a
REST-based API. You can also use a variety of programming languages available
with Google's

.


Here are some of the benefits of using Compute Engine:




Extensibility:
 Compute Engine integrates with Google Cloud
technologies such as Cloud Storage, Google Kubernetes Engine, and
BigQuery, to extend beyond the basic computational capability to
create more complex and sophisticated applications.


Scalability:
 Scale the number of compute resources as needed without
having to manage your own infrastructure. This is useful for businesses that
experience sudden increases in traffic, because you 