# gcp_docs/scratch.py

import os
import re

import psycopg2
import requests
from bs4 import BeautifulSoup

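# The names below are used throughout this file but were not defined in the
# original snippet; the values here are assumed placeholders.
headers = {"User-Agent": "Mozilla/5.0"}          # assumed request headers
connect_timeout_secs = 5                         # assumed connect timeout
read_timeout_secs = 10                           # assumed read timeout
db_user = os.environ.get("DB_USER", "postgres")  # assumed DB credentials
db_pwd = os.environ.get("DB_PWD", "")
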
class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        # headers and the timeout values are module-level config (see top of file).
        response = requests.get(url, headers=headers, timeout=(connect_timeout_secs, read_timeout_secs))
        self.status_code = response.status_code
        self.body = response.content
        self.soup = BeautifulSoup(self.body, 'html.parser')
        self.title = self.soup.title.string if self.soup.title else "No title found"
        if self.soup.body:
            for irrelevant in self.soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = self.soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in self.soup.find_all('a')]
        exclude_strings = ["/go", "/java", "/nodejs", "/python",
                           "/ruby", "//", "/cpp", "/php", "#",
                           "/partner", "/product-terms", "/dotnet",
                           "/eclipse", "/customer", "/contac", "/contact",
                           "/free", "/marketplace", "/pricing",
                           "/release-notes", "/whitepapers", "/why-google"]
        self.links = {self.make_gcp_link(link)
                      for link in links
                      if link is not None
                      and not any(exclude in link for exclude in exclude_strings)}
        # make_gcp_link returns None for links to other domains; drop those.
        self.links.discard(None)
        self.get_last_updated()

    def make_gcp_link(self, link):
        """Normalise a scraped href into an absolute cloud.google.com URL."""
        if link.endswith('/'):
            link = link[:-1]
        if link.startswith('https://cloud.google.com'):
            return link
        # Only return paths from GCP; drop absolute links to any other domain
        # (the original 'https://' check let plain http:// links slip through).
        if not link.startswith('http'):
            return 'https://cloud.google.com{}'.format(link)
        return None


    def get_last_updated(self):
        """Read the 'Last updated ... UTC' stamp from the devsite footer."""
        footer = self.soup.find('devsite-content-footer')
        self.last_updated = '1980-01-01'  # sentinel date used when no stamp is found
        if footer:
            last_updated_re = r'Last updated (.*) UTC'
            for fp in footer.find_all('p'):
                match = re.search(last_updated_re, fp.get_text())
                if match:
                    self.last_updated = match.group(1)
                    break
        

    def get_doc_links(self):
        """Collect '/docs' links from the left-hand nav and the article body."""
        nav_items = self.soup.find_all('li', class_='devsite-nav-item')
        # self.article_section is never set in __init__ in the original snippet;
        # fall back to the page's <article> element if it is missing.
        article = getattr(self, 'article_section', None) or self.soup.find('article')
        links_in_article = article.find_all('a', href=True) if article else []
        all_links = nav_items + links_in_article

        for al in all_links:
            # Nav items wrap their anchor; article matches are already <a> tags.
            link = al if al.name == 'a' else al.find('a', href=True)
            if link and 'docs' in link['href']:
                if link['href'].startswith('/'):
                    self.links.add('https://cloud.google.com{}'.format(link['href']))
                else:
                    self.links.add(link['href'])

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"


def connect_to_db():
    """
    Connects to a PostgreSQL database on localhost:5432.
    Returns the connection object, or None if the connection fails.
    """
    try:
        conn = psycopg2.connect(
            host="localhost",
            port=5432,
            database="gcplinks",  # replace with your database name
            user=db_user,         # credentials come from module-level config
            password=db_pwd,
        )
        print("Successfully connected to the database.")
        return conn
    except psycopg2.Error as e:
        print(f"Error connecting to the database: {e}")
        return None
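

# Minimal end-to-end sketch, assuming network access and a reachable
# "gcplinks" database; the starting URL is only an example.
if __name__ == "__main__":
    page = Website("https://cloud.google.com/docs")
    print(f"Fetched {page.url} (HTTP {page.status_code}), last updated {page.last_updated}")
    page.get_doc_links()
    print(f"Found {len(page.links)} candidate links")

    conn = connect_to_db()
    if conn:
        conn.close()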