In [1]:
import copy
import json
import re
from typing import List

import gradio as gr
import requests
from bs4 import BeautifulSoup, Tag, NavigableString
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [16]:
#
# From Ed Donner's GitHub repo for the LLM Engineering course on Udemy:
# https://github.com/ed-donner/llm_engineering/blob/main/week1/day5.ipynb
#
# Browser-like User-Agent header passed to requests.get() by Website.__init__.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    ),
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links.

    The page is fetched and parsed once in ``__init__``; the ``get_*`` methods
    read from the parsed tree and store their results on the instance.

    Attributes:
        url: The URL that was fetched.
        title: Page title; ``None`` until ``get_doc_title`` has run.
        links: Set of documentation URLs filled in by ``get_doc_links``.
        last_updated: Date string taken from the page footer by
            ``get_last_updated``; stays ``'1970-01-01'`` when none is found.
        body: Raw bytes of the HTTP response.
        soup: ``BeautifulSoup`` tree built from ``body``.
        article: The ``article`` element with class ``devsite-article``,
            or ``None`` when the page has no such element.
    """

    def __init__(self, url, timeout=30):
        """
        Fetch ``url`` and parse the response body with the ``html.parser`` backend.

        Args:
            url: Address of the page to scrape.
            timeout: Seconds to wait for the server before ``requests`` gives
                up; without one a stalled connection would block forever.
        """
        self.url = url
        self.title = None
        self.links = set()
        self.last_updated = '1970-01-01'

        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        self.soup = BeautifulSoup(self.body, 'html.parser')
        self.article = self.soup.find('article', class_='devsite-article')

    def get_doc_title(self):
        """
        Return the page title, computing and caching it on first use.

        The title is the first direct text node of the ``h1`` element with
        class ``devsite-page-title`` (text nested in child tags is ignored).
        When the heading or its text is missing, the title falls back to
        ``'<url> | No Title'``.
        """
        if self.title:
            return self.title

        heading = self.soup.find('h1', class_='devsite-page-title')
        # find() yields None for a missing element, so guard both lookups
        # instead of chaining them.
        text = heading.find(string=True, recursive=False) if heading else None
        title = text.strip() if text else ''

        self.title = title or '{} | No Title'.format(self.url)
        return self.title

    def get_last_updated(self):
        """
        Store the footer's "Last updated ... UTC" date in ``self.last_updated``.

        Scans the ``p`` elements of the ``devsite-content-footer`` element and
        keeps the first match. ``self.last_updated`` is left unchanged when
        the footer or the date is absent.
        """
        footer = self.soup.find('devsite-content-footer')
        if footer is None:
            return

        # Captures whatever sits between "Last updated " and " UTC".
        last_updated_re = r'Last updated (.*) UTC'
        for fp in footer.find_all('p'):
            match = re.search(last_updated_re, fp.get_text())
            if match:
                self.last_updated = match.group(1)
                break

    def get_doc_links(self):
        """
        Collect documentation links from the navigation and the article.

        Anchors are taken from every ``li`` with class ``devsite-nav-item``
        and from every ``a`` with an ``href`` inside ``self.article``. Only
        hrefs containing ``'docs'`` are kept; hrefs starting with ``/`` are
        prefixed with ``https://cloud.google.com``. Results accumulate in
        ``self.links``.
        """
        # Navigation entries wrap their anchor, so look one level down.
        anchors = [
            item.find('a', href=True)
            for item in self.soup.find_all('li', class_='devsite-nav-item')
        ]
        # Article results are already the anchors themselves.
        if self.article is not None:
            anchors.extend(self.article.find_all('a', href=True))

        for link in anchors:
            if link is None:
                continue
            href = link['href']
            if 'docs' not in href:
                continue
            if href.startswith('/'):
                self.links.add('https://cloud.google.com{}'.format(href))
            else:
                self.links.add(href)


In [30]:
# Tag names that the article inspection loop skips over.
exclude_tags = [
    'devsite-feedback',
    'devsite-actions',
    'devsite-toc',
    'aside',
    'devsite-page-title-meta',
    'devsite-thumb-rating',
]

In [7]:
# Fetch and parse the Eventarc docs page; Website.__init__ issues the HTTP request.
eventarc = Website('https://cloud.google.com/eventarc/docs')

In [8]:
# Extract the title from the h1.devsite-page-title heading; the result is cached on the instance.
eventarc.get_doc_title()

'Eventarc overview'

In [9]:
# The attribute now holds the value stored by get_doc_title().
eventarc.title

'Eventarc overview'

In [10]:
# Alias the parsed tree; the following cells query it directly.
soup = eventarc.soup

In [11]:
# Same lookup as get_doc_title(): the heading's first direct text node
# (recursive=False ignores text nested inside child tags), stripped of whitespace.
print(soup.find('h1', class_='devsite-page-title').find(string=True, recursive=False).strip())

Eventarc overview


In [12]:
# Locate the custom devsite-content-footer element by tag name.
footer = soup.find('devsite-content-footer')

In [13]:
# All p elements inside the footer; these are searched for the "Last updated" date.
footer_paras = footer.find_all('p')

In [14]:
# Try the "Last updated ... UTC" pattern against each footer paragraph and
# print the raw match followed by the captured date ('Nope' when absent).
last_updated_re = r'Last updated (.*) UTC'
for fp in footer_paras:
    match = re.search(last_updated_re, fp.get_text())
    print(match)
    print(match.group(1) if match else 'Nope')

None
Nope
<re.Match object; span=(0, 27), match='Last updated 2025-05-08 UTC'>
2025-05-08


In [19]:
# Run the same footer search through the class method, then show the stored value.
eventarc.get_last_updated()
eventarc.last_updated

'2025-05-08'

In [20]:
# Navigation entries are li elements with class devsite-nav-item.
nav_items = soup.find_all('li', class_='devsite-nav-item')
# Peek at the link target of the first entry.
nav_items[0].find('a', href=True)['href']

'https://cloud.google.com/docs/ai-ml'

In [21]:
# Collect the documentation links found in the navigation items.
links_on_page = set()
for ni in nav_items:
    link = ni.find('a', href=True)
    # Skip entries without an anchor, non-docs targets, and fragment links.
    # The fragment test belongs here: on the prefixing branch alone it let
    # relative "#" hrefs fall through and be stored without the host.
    if link is None or 'docs' not in link['href'] or '#' in link['href']:
        continue
    if link['href'][0] == '/':
        # Site-relative href: prepend the host.
        links_on_page.add('https://cloud.google.com{}'.format(link['href']))
    else:
        links_on_page.add(link['href'])

In [22]:
# Inspect the collected navigation links.
links_on_page

{'https://cloud.google.com/docs',
 'https://cloud.google.com/docs/access-resources',
 'https://cloud.google.com/docs/ai-ml',
 'https://cloud.google.com/docs/application-development',
 'https://cloud.google.com/docs/application-hosting',
 'https://cloud.google.com/docs/compute-area',
 'https://cloud.google.com/docs/costs-usage',
 'https://cloud.google.com/docs/cross-product-overviews',
 'https://cloud.google.com/docs/data',
 'https://cloud.google.com/docs/databases',
 'https://cloud.google.com/docs/devtools',
 'https://cloud.google.com/docs/dhm-cloud',
 'https://cloud.google.com/docs/generative-ai',
 'https://cloud.google.com/docs/iac',
 'https://cloud.google.com/docs/industry',
 'https://cloud.google.com/docs/migration',
 'https://cloud.google.com/docs/networking',
 'https://cloud.google.com/docs/observability',
 'https://cloud.google.com/docs/security',
 'https://cloud.google.com/docs/storage',
 'https://cloud.google.com/docs/tech-area-overviews',
 'https://cloud.google.com/eventarc/a

In [23]:
# The main article element (class devsite-article) of the page.
article_section = soup.find('article', class_='devsite-article')
# Every anchor inside it that carries an href attribute.
links_in_article = article_section.find_all('a', href=True)

In [24]:
def make_google_cloud_link(link):
    """
    Return the URL for an anchor on cloud.google.com.

    Args:
        link: Object supporting ``link['href']`` (e.g. a BeautifulSoup anchor tag).

    Returns:
        ``https://cloud.google.com`` + href when the href starts with ``/``;
        otherwise the href unchanged, so an already-absolute URL is not
        mangled by prepending the host a second time.
    """
    href = link['href']
    if href.startswith('/'):
        return 'https://cloud.google.com{}'.format(href)
    return href

In [25]:
# Docs links from the article body, excluding any href that contains a fragment.
article_links = {
    make_google_cloud_link(lia)
    for lia in links_in_article
    if lia and 'docs' in lia['href'] and '#' not in lia['href']
}

In [26]:
# Inspect the collected article links.
article_links

{'https://cloud.google.com/eventarc/advanced/docs/audit-logs',
 'https://cloud.google.com/eventarc/advanced/docs/event-providers-targets',
 'https://cloud.google.com/eventarc/advanced/docs/overview',
 'https://cloud.google.com/eventarc/advanced/docs/publish-events/publish-events-google-sources',
 'https://cloud.google.com/eventarc/advanced/docs/receive-events/configure-format-events',
 'https://cloud.google.com/eventarc/advanced/docs/receive-events/transform-events',
 'https://cloud.google.com/eventarc/advanced/docs/use-cmek',
 'https://cloud.google.com/eventarc/advanced/docs/using-vpc-service-controls',
 'https://cloud.google.com/eventarc/docs/access-control',
 'https://cloud.google.com/eventarc/docs/compliance',
 'https://cloud.google.com/eventarc/docs/event-driven-architectures',
 'https://cloud.google.com/eventarc/docs/event-format',
 'https://cloud.google.com/eventarc/docs/quotas',
 'https://cloud.google.com/eventarc/docs/reference/audit-logs',
 'https://cloud.google.com/eventarc/

In [31]:
# Print up to five non-excluded Tag children of the article and decompose any
# whose class names mention 'breadcrumb' or 'meta'.
i = 0
# Iterate over a snapshot of the children: decompose() removes the element
# from the live contents list, which would make the loop skip the next sibling.
for elem in list(article_section.contents):
    if i >= 5:
        break

    if elem.name in exclude_tags:
        continue

    print('** {} **: {}'.format(i,elem))
    if not isinstance(elem, Tag):
        # Text nodes are printed but neither inspected nor counted.
        continue

    keys = elem.attrs.keys()
    print(keys)
    if 'class' in keys:
        print("---*** TAG NAME: {}".format(elem.name))
        print(elem['class'])
        class_list = elem['class']
        print("CLASS_LIST: {}".format(class_list))
        for cl in class_list:
            print("CLASS cl: {}".format(cl))
            if 'breadcrumb' in cl or 'meta' in cl:
                print('DECOMPOSING\n')
                elem.decompose()
                break
    else:
        print('No class or no breadcrumb')
    i += 1

** 0 **: 

** 0 **: 

** 0 **: 

** 0 **: <h1 class="devsite-page-title" tabindex="-1">
      Eventarc overview
      <div class="devsite-actions" data-nosnippet=""><devsite-feature-tooltip ack-key="AckCollectionsBookmarkTooltipDismiss" analytics-action-close="Callout Profile dismissed" analytics-action-show="Callout Profile displayed" analytics-category="Site-Wide Custom Events" analytics-label="Create Collection Callout" class="devsite-page-bookmark-tooltip nocontent" close-button-text="Got it" dismiss-button="true" dismiss-button-text="Dismiss" id="devsite-collections-dropdown">
<devsite-bookmark></devsite-bookmark>
<span slot="popout-heading">
      
      Stay organized with collections
    </span>
<span slot="popout-contents">
      
      Save and categorize content based on your preferences.
    </span>
</devsite-feature-tooltip></div>
</h1>
dict_keys(['class', 'tabindex'])
---*** TAG NAME: h1
['devsite-page-title']
CLASS_LIST: ['devsite-page-title']
CLASS cl: devsite-page-titl

In [33]:
# Show the article element as it stands after the pruning loop.
article_section

<article class="devsite-article">

<devsite-feedback bucket="Documentation" class="nocontent" context="" data-label="Send Feedback Button" position="header" product-id="97037" project-icon="https://www.gstatic.com/devrel-devsite/prod/v6dc4611c4232bd02b2b914c4948f523846f90835f230654af18f87f75fe9f73c/cloud/images/favicons/onecloud/super_cloud.png" project-name="Eventarc" track-metadata-position="header" track-name="sendFeedbackLink" track-type="feedback" version="t-devsite-webserver-20250506-r00-rc01.469018407534597322">
<button>
  
    
    Send feedback
  
  </button>
</devsite-feedback>
<h1 class="devsite-page-title" tabindex="-1">
      Eventarc overview
      <div class="devsite-actions" data-nosnippet=""><devsite-feature-tooltip ack-key="AckCollectionsBookmarkTooltipDismiss" analytics-action-close="Callout Profile dismissed" analytics-action-show="Callout Profile displayed" analytics-category="Site-Wide Custom Events" analytics-label="Create Collection Callout" class="devsite-page-

In [34]:
# Another reference to the same parsed tree as `soup` (both point at eventarc.soup).
soupy = eventarc.soup

In [35]:
# CSS-select every element with the devsite-article-body class; the result is a list of matches.
art_body = soupy.select('.devsite-article-body')

[<div class="devsite-article-body clearfix">
 <div class="nocontent" id="tags">
 <a class="cloud-chip" data-title="Applies to Eventarc Advanced" href="/eventarc/advanced/docs/overview" track-name="Advanced" track-type="docChip">
   Advanced
 
   </a>
 <a class="cloud-chip" data-title="Applies to Eventarc Standard" href="/eventarc/standard/docs/overview" track-name="Standard" track-type="docChip">
   Standard
 
   </a>
 </div>
 <aside class="beta">
 <p>
 <strong>
       
         Preview
       
         — Eventarc Advanced
     </strong>
 </p>
 <p>
       
       
         This feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section
         of the <a href="/terms/service-terms#1" track-type="commonIncludes">Service Specific Terms</a>.
         
         Pre-GA features are available "as is" and might have limited support.
       
       For more information, see the
       <a href="/products#product-launch-stages" track-type="commonIncludes">launch stag

In [54]:
# Build the article text with all anchor elements removed.
# The anchors are stripped from detached copies so that the shared
# eventarc.soup tree stays intact and the link-collecting cells still find
# the article links when they are re-run.
text_parts = []
for ab in art_body:
    ab_copy = copy.copy(ab)  # copying a Tag yields a detached copy of its subtree
    for a_tag in ab_copy.find_all('a'):
        a_tag.decompose()
    text_parts.append(ab_copy.get_text(separator="\n").strip())
content = "".join(text_parts)

In [55]:
# Raw (repr) view of the extracted text, showing the embedded newlines.
content

'Preview\n      \n        — Eventarc Advanced\n    \n\n\n\n\n\n      \n      \n        This feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section\n        of the \n.\n        \n        Pre-GA features are available "as is" and might have limited support.\n      \n      For more information, see the\n      \n.\n  \n\n\n\n\nEventarc lets you build event-driven architectures without having to implement,\n    customize, or maintain the underlying infrastructure.\n\n\nEventarc is offered in two editions: \nEventarc Advanced\n and\n    \nEventarc Standard\n.\n\n\nBoth editions offer a scalable, serverless, and fully managed eventing solution that lets you\n  asynchronously route messages from sources to targets using loosely coupled services that are\n  triggered by and react to state changes known as \nevents\n. Both editions support a range of\n  event providers and destinations—including Google Cloud services, custom applications, SaaS\n  applications, an

In [51]:
# Rendered view of the same text.
print(content)

Preview
      
        — Eventarc Advanced
    





      
      
        This feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section
        of the 
.
        
        Pre-GA features are available "as is" and might have limited support.
      
      For more information, see the
      
.
  




Eventarc lets you build event-driven architectures without having to implement,
    customize, or maintain the underlying infrastructure.


Eventarc is offered in two editions: 
Eventarc Advanced
 and
    
Eventarc Standard
.


Both editions offer a scalable, serverless, and fully managed eventing solution that lets you
  asynchronously route messages from sources to targets using loosely coupled services that are
  triggered by and react to state changes known as 
events
. Both editions support a range of
  event providers and destinations—including Google Cloud services, custom applications, SaaS
  applications, and third-party services—while managing delive