{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bc99549d-4ea3-4c04-a065-1b3af7b5023a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports: stdlib first, then third-party.\n",
    "# `os` is required by the credentials cell below (os.getenv).\n",
    "import os\n",
    "import re\n",
    "import json\n",
    "import time\n",
    "import random\n",
    "\n",
    "import requests\n",
    "import psycopg2\n",
    "import gradio as gr\n",
    "from typing import List\n",
    "from openai import OpenAI\n",
    "from dotenv import load_dotenv\n",
    "from bs4 import BeautifulSoup\n",
    "from IPython.display import Markdown, display, update_display"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aa3c192c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load DB credentials from a .env file; never hardcode secrets in the notebook.\n",
    "load_dotenv()\n",
    "db_user = os.getenv('POSTGRES_USER')\n",
    "db_pwd = os.getenv('POSTGRES_PASSWORD')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fcab4d96",
   "metadata": {},
   "outputs": [],
   "source": [
    "def connect_to_db():\n",
    "    \"\"\"\n",
    "    Connects to the `gcplinks` PostgreSQL database on localhost:5432.\n",
    "    Credentials come from the POSTGRES_USER / POSTGRES_PASSWORD env vars\n",
    "    loaded in the cell above.\n",
    "    Returns the connection object, or None if the connection fails.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        conn = psycopg2.connect(\n",
    "            host=\"localhost\",\n",
    "            port=5432,\n",
    "            database=\"gcplinks\",\n",
    "            user=db_user,\n",
    "            password=db_pwd\n",
    "        )\n",
    "        print(\"Successfully connected to the database.\")\n",
    "        return conn\n",
    "    except psycopg2.Error as e:\n",
    "        print(f\"Error connecting to the database: {e}\")\n",
    "        return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e16c5158",
   "metadata": {},
   "outputs": [],
   "source": [
    "# HTTP timeouts (seconds) passed to requests.get as (connect, read).\n",
    "connect_timeout_secs = 5\n",
    "read_timeout_secs = 15\n",
    "\n",
    "# Bounds (seconds) for a randomized polite-crawl delay between requests.\n",
    "random_sleep_lower = 5\n",
    "random_sleep_upper = 20"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7dbabd45-29ab-450c-8e19-b105b6996610",
   "metadata": {},
   "outputs": [],
   "source": [
    "#\n",
    "# Browser-like request headers, adapted from Ed Donner's github repo for the\n",
    "# LLM Engineering course on Udemy:\n",
    "# https://github.com/ed-donner/llm_engineering/blob/main/week1/day5.ipynb\n",
    "#\n",
    "headers = {\n",
    "    \"User-Agent\": (\n",
    "        \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) \"\n",
    "        \"AppleWebKit/537.36 (KHTML, like Gecko) \"\n",
    "        \"Chrome/122.0.0.0 Safari/537.36\"\n",
    "    ),\n",
    "    \"Accept\": \"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\",\n",
    "    \"Accept-Language\": \"en-US,en;q=0.5\",\n",
    "    \"Accept-Encoding\": \"gzip, deflate, br\",\n",
    "    \"Connection\": \"keep-alive\",\n",
    "    \"Referer\": \"https://cloud.google.com/\",\n",
    "}\n",
    "\n",
    "\n",
    "class Website:\n",
    "    \"\"\"\n",
    "    A utility class to represent a Website that we have scraped, now with links\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, url):\n",
    "        self.url = url\n",
    "        response = requests.get(url, headers=headers, timeout=(connect_timeout_secs, read_timeout_secs))\n",
    "        self.body = response.content  # raw page bytes\n",
    "        soup = BeautifulSoup(self.body, 'html.parser')\n",
    "        self.title = soup.title.string if soup.title else \"No title found\"\n",
    "        if soup.body:\n",
    "            # Strip non-content elements before extracting the visible text.\n",
    "            for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
    "                irrelevant.decompose()\n",
    "            self.text = soup.body.get_text(separator=\"\\\\n\", strip=True)\n",
    "        else:\n",
    "            self.text = \"\"\n",
    "        # Keep only anchors that actually carry an href.\n",
    "        links = [link.get('href') for link in soup.find_all('a')]\n",
    "        self.links = [link for link in links if link]\n",
    "\n",
    "    def get_contents(self):\n",
    "        return f\"Webpage Title:\\\\n{self.title}\\\\nWebpage Contents:\\\\n{self.text}\\\\n\\\\n\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "6da96cc9-2efb-4967-9e94-0b89cf8d82d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pages previously scraped with this notebook:\n",
    "#   /products, /docs, /compute/docs, /compute/docs/images/create-custom\n",
    "\n",
    "doc_url = \"/compute/docs/instances\"\n",
    "gcp_url = \"https://cloud.google.com\"\n",
    "url = gcp_url + doc_url\n",
    "\n",
    "doc_link_folder = \"gcp_pages/links/\"\n",
    "doc_html_folder = \"gcp_pages/html/\"\n",
    "\n",
    "# e.g. /compute/docs/instances -> gcp_pages/links/compute_docs_instances_links.txt\n",
    "doc_link_file = doc_link_folder + doc_url[1:].replace(\"/\", \"_\") + \"_links.txt\"\n",
    "doc_html_file = doc_html_folder + doc_url[1:].replace(\"/\", \"_\") + \"_html.txt\"\n",
    "\n",
    "website = Website(url)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "d305e8dd-443d-4089-832e-b989a6d37fd5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 622 links containing /docs\n"
     ]
    }
   ],
   "source": [
    "# Collect the unique links that point at documentation pages.\n",
    "# A plain substring test replaces the old regex r'(.*)\\\\/docs',\n",
    "# which matched exactly the same set of links.\n",
    "doc_links = {link for link in website.links if '/docs' in link}\n",
    "\n",
    "print('Found {} links containing /docs'.format(len(doc_links)))\n",
    "\n",
    "if doc_links:\n",
    "    with open(doc_link_file, 'w') as f:\n",
    "        for link in doc_links:\n",
    "            f.write(link)\n",
    "            f.write('\\\\n')\n",
    "\n",
    "# website.body is raw bytes (response.content), so write in binary mode;\n",
    "# the previous str(website.body) saved a b'...' repr instead of the HTML.\n",
    "with open(doc_html_file, 'wb') as f:\n",
    "    f.write(website.body)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}