{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "fe9f3b7f", "metadata": {}, "outputs": [], "source": [ "import os\n", "import re\n", "import json\n", "import lxml\n", "import requests\n", "import warnings\n", "import psycopg2\n", "import time, random\n", "from dotenv import load_dotenv\n", "from bs4 import BeautifulSoup, Tag, NavigableString, XMLParsedAsHTMLWarning\n", "from IPython.display import Markdown, display, update_display" ] }, { "cell_type": "markdown", "id": "d3e2fdf4", "metadata": {}, "source": [ "SQL statement to select links not related to a programming language or that has `/reference/` in the url:\n", "\n", "```sql\n", "select link from doc_links where link not like '%.com/java/%' and link not like '%.com/php%' and link not like '%.com/python%' and link not like '%.com/ruby%' and link not like '%.com/cpp%' and link not like '%.com/dotnet%' and link not like '%.com/nodejs%' and link not like '%/reference/%' limit 10;\n", "```" ] }, { "cell_type": "code", "execution_count": 2, "id": "f3aad0b8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'admin'" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "load_dotenv()\n", "db_user = os.getenv('POSTGRES_USER')\n", "db_pwd = os.getenv('POSTGRES_PASSWORD')\n", "\n", "db_user" ] }, { "cell_type": "code", "execution_count": null, "id": "7293fa00", "metadata": {}, "outputs": [], "source": [ "def connect_to_db():\n", " \"\"\"\n", " Connects to a PostgreSQL database on localhost:5432.\n", " Returns the connection object, or None if the connection fails.\n", " \"\"\"\n", " try:\n", " conn = psycopg2.connect(\n", " host=\"localhost\",\n", " port=5432,\n", " database=\"gcplinks\", # Replace with your database name\n", " user=db_user, # Replace with your user name\n", " password=db_pwd # Replace with your password\n", " )\n", " print(\"Successfully connected to the database.\")\n", " return conn\n", " except psycopg2.Error as e:\n", " print(f\"Error connecting to the database: {e}\")\n", " return None" ] }, { "cell_type": "code", "execution_count": 4, "id": "c3a19cad", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'https://cloud.google.com/sitemap_{}_of_390.xml'" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "base_url = 'https://cloud.google.com/'\n", "sitemap_url = 'sitemap_{}_of_390'\n", "url = base_url + sitemap_url + '.xml'\n", "start_number = 1\n", "end_number = 2\n", "\n", "connect_timeout_secs = 5\n", "read_timeout_secs = 15 \n", "\n", "random_sleep_lower = 5\n", "random_sleep_upper = 20\n", "url" ] }, { "cell_type": "code", "execution_count": 5, "id": "e0eca8ad", "metadata": {}, "outputs": [], "source": [ "headers = {\n", " \"User-Agent\": (\n", " \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) \"\n", " \"AppleWebKit/537.36 (KHTML, like Gecko) \"\n", " \"Chrome/122.0.0.0 Safari/537.36\"\n", " ),\n", " \"Accept\": \"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\",\n", " \"Accept-Language\": \"en-US,en;q=0.5\",\n", " \"Accept-Encoding\": \"gzip, deflate, br\",\n", " \"Connection\": \"keep-alive\",\n", " \"Referer\": \"https://cloud.google.com/\",\n", "}\n" ] }, { "cell_type": "code", "execution_count": 6, "id": "b9f70537", "metadata": {}, "outputs": [], "source": [ "page_data_folder = './page_data/'\n", "raw_data_folder = './raw_data/'" ] }, { "cell_type": "code", "execution_count": 7, "id": "84551079", "metadata": {}, "outputs": [], "source": [ "# Disabling the following warning:\n", "#\n", "# 
{ "cell_type": "code", "execution_count": 7, "id": "84551079", "metadata": {}, "outputs": [], "source": [ "# Disabling the following warning:\n", "#\n", "# XMLParsedAsHTMLWarning: It looks like you're using an HTML parser to parse an XML document.\n", "# Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will\n", "# be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword\n", "# argument `features=\"xml\"` into the BeautifulSoup constructor.\n", "# If you want or need to use an HTML parser on this document, you can make this warning go away by filtering it. To do that,\n", "# run this code before calling the BeautifulSoup constructor:\n", "\n", "warnings.filterwarnings(\"ignore\", category=XMLParsedAsHTMLWarning)" ] },
{ "cell_type": "markdown", "id": "5417bc20", "metadata": {}, "source": [ "### Scrape sitemaps\n", "\n", "The next code block downloads each sitemap XML file and writes the raw response to `raw_data_folder`.\n", "\n", "It is commented out to avoid accidentally re-running the scrape; uncomment it to fetch the sitemaps again." ] },
{ "cell_type": "code", "execution_count": 8, "id": "7a061795", "metadata": {}, "outputs": [], "source": [ "# for i in range(start_number, end_number + 1):\n", "#     response = requests.get(url.format(i), headers=headers, timeout=(connect_timeout_secs, read_timeout_secs))\n", "#     body = response.content\n", "#     soup = BeautifulSoup(body, 'xml')\n", "#     raw_file = raw_data_folder + sitemap_url.format(i) + \".txt\"\n", "#     with open(raw_file, \"w\") as f:\n", "#         f.write(str(soup))\n", "\n", "#     print(\"Wrote file: {}\".format(raw_file))\n", "#     random_sleep_time_seconds = random.uniform(random_sleep_lower, random_sleep_upper)\n", "#     print(\"Sleeping for {} seconds.\".format(random_sleep_time_seconds))\n", "#     time.sleep(random_sleep_time_seconds)" ] },
{ "cell_type": "code", "execution_count": null, "id": "fd9b6dce", "metadata": {}, "outputs": [], "source": [] },
{ "cell_type": "code", "execution_count": 11, "id": "98e47247", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Successfully connected to the database.\n", "reading : ./raw_data/sitemap_1_of_390.txt\n", "reading : ./raw_data/sitemap_2_of_390.txt\n", "reading : ./raw_data/sitemap_3_of_390.txt\n", "reading : ./raw_data/sitemap_4_of_390.txt\n", "reading : ./raw_data/sitemap_5_of_390.txt\n", "reading : ./raw_data/sitemap_6_of_390.txt\n", "reading : ./raw_data/sitemap_7_of_390.txt\n" ] } ], "source": [ "insert = \"insert into doc_links (link, date_last_modified, date_last_visited) values (%s, %s, '1970-01-01')\"\n", "\n", "conn = connect_to_db()\n", "cur = conn.cursor()\n", "\n", "for j in range(1, 8):\n", "    soup = None\n", "    with open(\"./raw_data/sitemap_{}_of_390.txt\".format(j), \"r\", encoding='utf-8') as f:\n", "        print(\"reading : {}\".format(\"./raw_data/sitemap_{}_of_390.txt\".format(j)))\n", "        soup = BeautifulSoup(f, 'xml')\n", "\n", "    urls = soup.find_all('url')\n", "    for url in urls:\n", "        loc = url.find('loc').get_text()\n", "        lastmod_raw = url.find('lastmod').get_text()\n", "        lastmod_split = lastmod_raw.split('T')\n", "        lastmod = lastmod_split[0]\n", "        try:\n", "            cur.execute(insert, (loc, lastmod,))\n", "            conn.commit()\n", "            #print(\"Successfully inserted: {} | {}\".format(lastmod, loc))\n", "        except Exception as e:\n", "            conn.rollback()\n", "            print(\"Failed to insert link: {}\".format(loc))\n", "            print(e)\n", "\n", "cur.close()\n", "conn.close()" ] },
{ "cell_type": "code", "execution_count": null, "id": "2050f990", "metadata": {}, "outputs": [], "source": [] },
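{ "cell_type": "markdown", "id": "6b7c8d9e", "metadata": {}, "source": [ "As a quick sanity check after the inserts above, the sketch below (not executed here) counts the rows in `doc_links` and previews a few links using the same language/`/reference/` filter shown at the top of the notebook. It assumes the table has the columns used by the insert statement: `link`, `date_last_modified`, and `date_last_visited`." ] },
{ "cell_type": "code", "execution_count": null, "id": "9e8f7a6b", "metadata": {}, "outputs": [], "source": [ "# Sanity-check sketch: count stored links and preview a few non-language,\n", "# non-reference links using the filter from the markdown cell at the top.\n", "conn = connect_to_db()\n", "if conn is not None:\n", "    with conn.cursor() as cur:\n", "        cur.execute(\"select count(*) from doc_links\")\n", "        print(\"Total links stored:\", cur.fetchone()[0])\n", "\n", "        cur.execute(\"\"\"\n", "            select link, date_last_modified from doc_links\n", "            where link not like '%.com/java/%'\n", "              and link not like '%.com/php%'\n", "              and link not like '%.com/python%'\n", "              and link not like '%.com/ruby%'\n", "              and link not like '%.com/cpp%'\n", "              and link not like '%.com/dotnet%'\n", "              and link not like '%.com/nodejs%'\n", "              and link not like '%/reference/%'\n", "            limit 5;\n", "        \"\"\")\n", "        for link, lastmod in cur.fetchall():\n", "            print(lastmod, link)\n", "    conn.close()" ] }
], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": {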
"name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.17" } }, "nbformat": 4, "nbformat_minor": 5 }