import re

import psycopg2
import requests
from bs4 import BeautifulSoup

# `headers`, `connect_timeout_secs`, `read_timeout_secs`, `db_user` and `db_pwd`
# are expected to be defined earlier in the script.


class Website:
    """
    A utility class to represent a Website that we have scraped, now with links.
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers,
                                timeout=(connect_timeout_secs, read_timeout_secs))
        self.status_code = response.status_code
        self.body = response.content
        self.soup = BeautifulSoup(self.body, 'html.parser')
        self.title = self.soup.title.string if self.soup.title else "No title found"
        if self.soup.body:
            # Strip elements that carry no useful text before extracting it
            for irrelevant in self.soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = self.soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in self.soup.find_all('a')]
        exclude_strings = ["/go", "/java", "/nodejs", "/python", "/ruby", "//",
                           "/cpp", "/php", "#", "/partner", "/product-terms",
                           "/dotnet", "/eclipse", "/customer", "/contac", "/contact",
                           "/free", "/marketplace", "/pricing", "/release-notes",
                           "/whitepapers", "/why-google"]
        self.links = {self.make_gcp_link(link) for link in links
                      if link is not None
                      and not any(exclude in link for exclude in exclude_strings)}
        # make_gcp_link returns None for non-GCP domains, so drop it from the set
        self.links.discard(None)
        self.get_last_updated()

    def make_gcp_link(self, link):
        if link.endswith('/'):
            link = link[:-1]
        if 'https://cloud.google.com' in link:
            return link
        # Only return paths from GCP; links to any other domain are dropped
        if 'https://' not in link:
            return 'https://cloud.google.com{}'.format(link)
        return None

    def get_last_updated(self):
        footer = self.soup.find('devsite-content-footer')
        self.last_updated = '1980-01-01'
        if footer:
            last_updated_re = r'Last updated (.*) UTC'
            for fp in footer.find_all('p'):
                match = re.search(last_updated_re, fp.get_text())
                if match:
                    self.last_updated = match.group(1)
                    break

    def get_doc_links(self):
        # self.article_section is expected to be set to the page's main article
        # element elsewhere before this method is called
        nav_items = self.soup.find_all('li', class_='devsite-nav-item')
        links_in_article = self.article_section.find_all('a', href=True)
        all_links = nav_items + links_in_article
        for al in all_links:
            # Nav items are <li> elements wrapping an <a>; article links are already <a> tags
            link = al if al.name == 'a' else al.find('a', href=True)
            if link and 'docs' in link['href']:
                if link['href'][0] == '/':
                    self.links.add('https://cloud.google.com{}'.format(link['href']))
                else:
                    self.links.add(link['href'])

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"


def connect_to_db():
    """
    Connects to a PostgreSQL database on localhost:5432.
    Returns the connection object, or None if the connection fails.
    """
    try:
        conn = psycopg2.connect(
            host="localhost",
            port=5432,
            database="gcplinks",  # Replace with your database name
            user=db_user,         # Replace with your user name
            password=db_pwd       # Replace with your password
        )
        print("Successfully connected to the database.")
        return conn
    except psycopg2.Error as e:
        print(f"Error connecting to the database: {e}")
        return None
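
# A minimal usage sketch, not part of the original scraper: it assumes the
# module-level names above (`headers`, timeouts, `db_user`, `db_pwd`) are
# defined, and the URL below is only a placeholder example.
if __name__ == "__main__":
    site = Website("https://cloud.google.com/docs")
    print(site.get_contents())
    print("Last updated:", site.last_updated)
    print("Collected", len(site.links), "GCP links")

    conn = connect_to_db()
    if conn:
        conn.close()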