In [178]:
import json
import re
from pathlib import Path
from typing import List
from urllib.parse import urljoin

import gradio as gr
import requests
from bs4 import BeautifulSoup, Tag, NavigableString
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [179]:
#
# from Ed Donner's github repo for LLM Engineering course on Udemy: 
# https://github.com/ed-donner/llm_engineering/blob/main/week1/day5.ipynb
#
# Browser-like User-Agent; Website.__init__ passes `headers` to requests.get.
_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
headers = {"User-Agent": _USER_AGENT}

class Website:
    """
    A scraped documentation page: its parsed HTML plus the title, the
    "last updated" date and the documentation links found on it.

    Constructing an instance performs an HTTP GET of ``url`` and parses the
    response with BeautifulSoup; the other methods only inspect that tree.

    Attributes:
        url: The URL passed to the constructor.
        title: Cached page title; None until get_doc_title() has run.
        links: Set of documentation URLs filled in by get_doc_links().
        last_updated: Date string set by get_last_updated(); '1970-01-01'
            until a date has been found.
        body: Raw response bytes.
        soup: BeautifulSoup tree built from ``body``.
        article: The ``<article class="devsite-article">`` element, or None
            when the page has none.
    """

    # Host prepended to site-relative hrefs (those starting with '/').
    BASE_URL = 'https://cloud.google.com'

    def __init__(self, url, timeout=30):
        """
        Download and parse ``url``.

        Args:
            url: Page to fetch.
            timeout: Seconds to wait for the server before requests gives up;
                without it a stalled connection would block the kernel forever.

        Raises:
            requests.RequestException: If the request fails or times out.
        """
        self.url = url
        self.title = None
        self.links = set()
        self.last_updated = '1970-01-01'

        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        self.soup = BeautifulSoup(self.body, 'html.parser')
        self.article = self.soup.find('article', class_='devsite-article')

    def get_doc_title(self):
        """
        Return the page title, computing and caching it on first use.

        The title is the first direct text node of the first ``<h1>``. When
        the page has no ``<h1>``, or the heading has no direct text, the
        fallback ``'<url> | No Title'`` is used instead of raising.
        """
        if self.title:
            return self.title

        heading = self.soup.find('h1')
        # recursive=False: only text directly inside the <h1>, not text of nested tags.
        text = heading.find(string=True, recursive=False) if heading is not None else None
        title = text.strip() if text else ''
        self.title = title if title else '{} | No Title'.format(self.url)

        return self.title

    def get_last_updated(self):
        """
        Find the "Last updated <date> UTC" footer line and store the date.

        Returns:
            ``self.last_updated``: the captured date, or the previous value
            when the page has no footer or no matching paragraph.
        """
        footer = self.soup.find('devsite-content-footer')
        if footer is None:
            return self.last_updated

        last_updated_re = r'Last updated (.*) UTC'
        for paragraph in footer.find_all('p'):
            match = re.search(last_updated_re, paragraph.get_text())
            if match:
                self.last_updated = match.group(1)
                break

        return self.last_updated

    def get_doc_links(self):
        """
        Collect documentation links from the navigation items and the article.

        Any href containing 'docs' is kept; site-relative hrefs are prefixed
        with BASE_URL, all others are stored unchanged.

        Returns:
            ``self.links``, the set the links were added to.
        """
        nav_items = self.soup.find_all('li', class_='devsite-nav-item')
        links_in_article = (
            self.article.find_all('a', href=True) if self.article is not None else []
        )

        for element in list(nav_items) + list(links_in_article):
            # Nav entries are <li> wrappers around an <a>; article entries are
            # already the <a> itself, so searching inside them would find nothing.
            if getattr(element, 'name', None) == 'a':
                anchor = element
            else:
                anchor = element.find('a', href=True)
            if anchor is None:
                continue

            href = anchor.get('href')
            if not href or 'docs' not in href:
                continue

            if href.startswith('/'):
                self.links.add('{}{}'.format(self.BASE_URL, href))
            else:
                self.links.add(href)

        return self.links


In [180]:
# Tag names meant to be skipped when walking the article's children
# (currently referenced only by the commented-out exploration cell).
exclude_tags = [
    'devsite-feedback',
    'devsite-actions',
    'devsite-toc',
    'aside',
    'devsite-page-title-meta',
    'devsite-thumb-rating',
]

In [181]:
# Fetch and parse one documentation page. The commented-out lines are
# alternative pages; exactly one assignment should be active.
# NOTE(review): the variable is called `eventarc`, but the active URL is the
# Cloud Functions deploy page.
# eventarc = Website('https://cloud.google.com/eventarc/docs')
# eventarc = Website('https://cloud.google.com/build/docs/deploying-builds/deploy-compute-engine')
# eventarc = Website('https://cloud.google.com/compute/docs/troubleshooting/troubleshooting-using-serial-console')
eventarc = Website('https://cloud.google.com/functions/docs/deploy')

In [182]:
# Extract the page title; the method also caches it on eventarc.title.
eventarc.get_doc_title()

'Deploy a function'

In [183]:
# Confirm the title was cached on the instance by get_doc_title().
eventarc.title

'Deploy a function'

In [184]:
# Direct handle on the parsed tree for the exploration cells that follow.
soup = eventarc.soup

In [185]:
# First <devsite-content-footer> element; find() gives None if the page has none.
footer = soup.find('devsite-content-footer')

In [186]:
# Every <p> inside the footer; one of them should carry the "Last updated" line.
footer_paras = footer.find_all('p')

In [187]:
# Try the "Last updated <date> UTC" pattern against each footer paragraph and
# print the match object followed by either the captured date or 'Nope'.
last_updated_re = r'Last updated (.*) UTC'
for fp in footer_paras:
    match = re.search(last_updated_re, fp.get_text())
    print(match)
    print(match.group(1) if match else 'Nope')

None
Nope
<re.Match object; span=(0, 27), match='Last updated 2025-05-23 UTC'>
2025-05-23


In [188]:
# Run the same extraction through the class method, then show the value it
# stored on the instance.
eventarc.get_last_updated()
eventarc.last_updated

'2025-05-23'

In [189]:
# All <li class="devsite-nav-item"> elements on the page.
nav_items = soup.find_all('li', class_='devsite-nav-item')
# Peek at the href of the first anchor inside the first nav item.
nav_items[0].find('a', href=True)['href']

'https://cloud.google.com/docs/ai-ml'

In [190]:
# Collect documentation links from the navigation items.
links_on_page = set()
for ni in nav_items:
    link = ni.find('a', href=True)
    if link is None:
        continue
    href = link['href']
    # Keep only documentation links, and drop in-page fragment links entirely.
    # Previously a relative href containing '#' fell through to the else
    # branch and was stored as a raw relative path.
    if 'docs' not in href or '#' in href:
        continue
    if href.startswith('/'):
        links_on_page.add('https://cloud.google.com{}'.format(href))
    else:
        links_on_page.add(href)

In [191]:
# Inspect the set of navigation links collected above.
links_on_page

{'https://cloud.google.com/docs',
 'https://cloud.google.com/docs/access-resources',
 'https://cloud.google.com/docs/ai-ml',
 'https://cloud.google.com/docs/application-development',
 'https://cloud.google.com/docs/application-hosting',
 'https://cloud.google.com/docs/compute-area',
 'https://cloud.google.com/docs/costs-usage',
 'https://cloud.google.com/docs/cross-product-overviews',
 'https://cloud.google.com/docs/data',
 'https://cloud.google.com/docs/databases',
 'https://cloud.google.com/docs/devtools',
 'https://cloud.google.com/docs/dhm-cloud',
 'https://cloud.google.com/docs/generative-ai',
 'https://cloud.google.com/docs/iac',
 'https://cloud.google.com/docs/industry',
 'https://cloud.google.com/docs/migration',
 'https://cloud.google.com/docs/networking',
 'https://cloud.google.com/docs/observability',
 'https://cloud.google.com/docs/security',
 'https://cloud.google.com/docs/storage',
 'https://cloud.google.com/docs/tech-area-overviews',
 'https://cloud.google.com/functions/

In [192]:
# The main <article class="devsite-article"> element of the page.
article_section = soup.find('article', class_='devsite-article')
# Every <a> inside the article that has an href attribute.
links_in_article = article_section.find_all('a', href=True)

In [193]:
def make_google_cloud_link(link):
    """
    Return the absolute URL for a link element found on cloud.google.com.

    Args:
        link: Any object supporting ``link['href']`` (e.g. a BeautifulSoup
            ``<a>`` tag).

    Returns:
        The href resolved against ``https://cloud.google.com``. Site-relative
        hrefs get the host prepended; hrefs that are already absolute are
        returned unchanged instead of being glued onto the host.
    """
    return urljoin('https://cloud.google.com', link['href'])

In [194]:
# Absolute URLs of the article's documentation links, skipping any href that
# carries a '#' fragment.
article_links = {
    make_google_cloud_link(lia)
    for lia in links_in_article
    if lia and 'docs' in lia['href'] and '#' not in lia['href']
}

In [195]:
# Inspect the set of article links collected above.
article_links

{'https://cloud.google.com/functions/docs/apis',
 'https://cloud.google.com/functions/docs/building',
 'https://cloud.google.com/functions/docs/runtime-support',
 'https://cloud.google.com/functions/docs/tutorials/terraform',
 'https://cloud.google.com/functions/docs/tutorials/terraform-pubsub',
 'https://cloud.google.com/run/docs/configuring',
 'https://cloud.google.com/run/docs/configuring/concurrency',
 'https://cloud.google.com/run/docs/configuring/connect-cloudsql',
 'https://cloud.google.com/run/docs/configuring/connecting-vpc',
 'https://cloud.google.com/run/docs/configuring/execution-environments',
 'https://cloud.google.com/run/docs/configuring/http2',
 'https://cloud.google.com/run/docs/configuring/min-instances',
 'https://cloud.google.com/run/docs/configuring/networking-best-practices',
 'https://cloud.google.com/run/docs/configuring/request-timeout',
 'https://cloud.google.com/run/docs/configuring/services/containers',
 'https://cloud.google.com/run/docs/configuring/servic

In [196]:
# i = 0
# for elem in article_section:
# if i < 5:
 
# if elem.name in exclude_tags:
# continue

# print('** {} **: {}'.format(i,elem))
# if isinstance(elem, Tag):
# keys = elem.attrs.keys()
# print(keys)
# if 'class' in keys:
# print("---*** TAG NAME: {}".format(elem.name))
# print(elem['class'])
# class_list = elem['class']
# print("CLASS_LIST: {}".format(class_list))
# for cl in class_list:
# print("CLASS cl: {}".format(cl))
# if 'breadcrumb' in cl or 'meta' in cl:
# print('DECOMPOSING\n')
# elem.decompose()
# break
# else:
# print('No class or no breadcrumb')
# i = i + 1
# else:
# continue
# else:
# article_section = article_section
# break

In [197]:
# article_section

In [198]:
# Second name for the same parsed tree held by eventarc.soup (plain
# assignment, no copy), so edits made through `soupy` affect that tree too.
soupy = eventarc.soup

In [199]:
# CSS-select every element carrying the class "devsite-article-body".
art_body = soupy.select('.devsite-article-body')

In [200]:
# Flatten each article body to text, marking code with Markdown backticks.
# NOTE: replace_with() edits the parsed tree in place, so `art_body` (and the
# tree it was selected from) no longer contain the original <pre>/<code> tags
# after this cell has run.
content_parts = []
for ab in art_body:
    # <pre> blocks first: any <code> nested inside one disappears with it,
    # so it is not wrapped a second time by the loop below.
    for pre_tag in ab.find_all('pre'):
        pre_tag.replace_with("```" + pre_tag.text.strip() + "```")
    for code in ab.find_all('code'):
        code.replace_with("`" + code.text.strip() + "`")
    # The former loop over <a> tags only called .text.strip() and threw the
    # result away, so it changed nothing and has been removed.
    content_parts.append(ab.get_text(separator="\n").strip())
content = "".join(content_parts)

In [201]:
# Show the extracted text as a repr so the raw newline structure is visible.
content

"Deploy a function\n\n\nThis guide shows you how to deploy a function from source code using the\n\n`gcloud functions`\n command. To learn how to deploy a function using the\n\n`gcloud run`\n command, see\n\nDeploy a Cloud Run function using the gcloud CLI\n.\n\n\nThe deployment process takes your source code and configuration settings and\n\nbuilds a runnable image\n that Cloud Run functions\nmanages automatically in order to handle requests to your function.\n\n\nDeployment basics\n\n\nFor an introduction on the type of functions you can deploy, see\n\nWrite Cloud Run functions\n.\n\n\nUsers deploying functions must have the\n\nCloud Functions Developer\n\nIAM role or a role that includes the same permissions. See also\n\nAdditional configuration for deployment\n.\n\n\n\n\n\n\nIn the Google Cloud console, activate Cloud Shell.\n\n\nActivate Cloud Shell\n\n\n\n At the bottom of the Google Cloud console, a\n \nCloud Shell\n\n session starts and displays a command-line prompt. Cloud She

In [202]:
def normalize_newlines(text):
    """
    Reflow text: join soft-wrapped lines and collapse blank-line runs.

    Args:
        text: Input string.

    Returns:
        The text with every isolated newline replaced by a space and every
        run of two or more newlines replaced by a single newline.
    """
    # Order matters. Isolated newlines must become spaces *before* runs are
    # collapsed; in the opposite order every collapsed run turns into an
    # isolated newline and is flattened as well, leaving no line breaks at all.
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
    # Replace multiple newlines (\n\n or more) with a single \n
    text = re.sub(r'\n{2,}', '\n', text)
    return text

In [203]:
# Save the cleaned article text to page_content/<title>.txt.
folder = 'page_content'
filename = eventarc.get_doc_title().lower().replace(" ", "_")
folder_and_filename = folder + "/" + filename + ".txt"
# `clean_content` was not defined by any cell in this notebook, so a fresh
# Restart & Run All failed here with NameError. Derive it from `content`.
# NOTE(review): assumes normalize_newlines(content) is what was meant - confirm.
clean_content = normalize_newlines(content)
# open(..., "w") does not create missing directories.
Path(folder).mkdir(parents=True, exist_ok=True)
# Explicit UTF-8 so the result does not depend on the platform default encoding.
with open(folder_and_filename, "w", encoding="utf-8") as f:
    f.write(clean_content)

In [224]:
# Save the HTML of the first article-body element to raw_soup/<title>_RAW.txt.
raw_folder = 'raw_soup'
filename = eventarc.get_doc_title().lower().replace(" ", "_")
folder_and_filename = raw_folder + "/" + filename + "_RAW.txt"
# open(..., "w") does not create missing directories.
Path(raw_folder).mkdir(parents=True, exist_ok=True)
# Explicit UTF-8 so the result does not depend on the platform default encoding.
with open(folder_and_filename, "w", encoding="utf-8") as f:
    f.write(str(art_body[0]))

In [222]:
# Show the HTML of the first article-body element as a string.
str(art_body[0])


'<div class="devsite-article-body clearfix devsite-no-page-title">\n<h1 data-text="Deploy a function" id="deploy-a-function" tabindex="-1">Deploy a function</h1>\n<p>This guide shows you how to deploy a function from source code using the\n`gcloud functions` command. To learn how to deploy a function using the\n`gcloud run` command, see\n<a href="/run/docs/quickstarts/functions/deploy-functions-gcloud">Deploy a Cloud Run function using the gcloud CLI</a>.</p>\n<p>The deployment process takes your source code and configuration settings and\n<a href="/functions/docs/building">builds a runnable image</a> that Cloud Run functions\nmanages automatically in order to handle requests to your function.</p>\n<h2 data-text="Deployment basics" id="basics" tabindex="-1">Deployment basics</h2>\n<p>For an introduction on the type of functions you can deploy, see\n<a href="/run/docs/write-functions">Write Cloud Run functions</a>.</p>\n<p>Users deploying functions must have the\n<a href="/functions/doc

In [204]:
# def replace_pre_tags(soupy):
# # soup = BeautifulSoup(html_content, 'html.parser')
# for pre_tag in soupy.find_all('pre'):
# pre_tag.replace_with("```" + pre_tag.text.strip() + "```")
# return str(soupy)

In [213]:
def normalize_newlines_and_spaces(text):
    """
    Reflow text and squeeze repeated spaces.

    Applies three substitutions in order and returns the result:
      1. an isolated newline becomes a space,
      2. a run of two or more newlines becomes one newline,
      3. a run of two or more spaces becomes one space.
    """
    substitutions = (
        (r'(?<!\n)\n(?!\n)', ' '),
        (r'\n{2,}', '\n'),
        (r' {2,}', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text


In [214]:
# Reflow the extracted text, then show its repr to check the remaining newlines.
final_content = normalize_newlines_and_spaces(content)
final_content

"Deploy a function\nThis guide shows you how to deploy a function from source code using the\n`gcloud functions` command. To learn how to deploy a function using the\n`gcloud run` command, see\nDeploy a Cloud Run function using the gcloud CLI .\nThe deployment process takes your source code and configuration settings and\nbuilds a runnable image that Cloud Run functions manages automatically in order to handle requests to your function.\nDeployment basics\nFor an introduction on the type of functions you can deploy, see\nWrite Cloud Run functions .\nUsers deploying functions must have the\nCloud Functions Developer\nIAM role or a role that includes the same permissions. See also\nAdditional configuration for deployment .\nIn the Google Cloud console, activate Cloud Shell.\nActivate Cloud Shell\n At the bottom of the Google Cloud console, a Cloud Shell\n session starts and displays a command-line prompt. Cloud Shell is a shell environment with the Google Cloud CLI already installed and 

In [215]:
# print() is deliberate here: it renders the newlines as real line breaks,
# which the bare-expression repr shows only as \n escapes.
print(final_content)

Deploy a function
This guide shows you how to deploy a function from source code using the
`gcloud functions` command. To learn how to deploy a function using the
`gcloud run` command, see
Deploy a Cloud Run function using the gcloud CLI .
The deployment process takes your source code and configuration settings and
builds a runnable image that Cloud Run functions manages automatically in order to handle requests to your function.
Deployment basics
For an introduction on the type of functions you can deploy, see
Write Cloud Run functions .
Users deploying functions must have the
Cloud Functions Developer
IAM role or a role that includes the same permissions. See also
Additional configuration for deployment .
In the Google Cloud console, activate Cloud Shell.
Activate Cloud Shell
 At the bottom of the Google Cloud console, a Cloud Shell
 session starts and displays a command-line prompt. Cloud Shell is a shell environment with the Google Cloud CLI already installed and with values alread