gcp_docs_scrape/sitemap_data/sitemap_scrape.ipynb at main

Fork: 0
clewis / gcp_docs_scrape
Find file
Newer
Older
gcp_docs_scrape / sitemap_data / sitemap_scrape.ipynb
moreserverless on 29 May 8 KB Preparing to scrape and populate db to not visit a page more than one. Restructured folders and removed some extraneous pages no longer needed.
Raw Blame History
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "fe9f3b7f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import json\n",
    "import lxml\n",
    "import requests\n",
    "import warnings\n",
    "import psycopg2\n",
    "import time, random\n",
    "from dotenv import load_dotenv\n",
    "from bs4 import BeautifulSoup, Tag, NavigableString, XMLParsedAsHTMLWarning\n",
    "from IPython.display import Markdown, display, update_display"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d3e2fdf4",
   "metadata": {},
   "source": [
    "SQL statement that selects links whose URL is neither under a programming-language path (Java, PHP, Python, Ruby, C++, .NET, Node.js) nor contains `/reference/`:\n",
    "\n",
    "```sql\n",
    "select link from doc_links where link not like '%.com/java/%' and link not like '%.com/php%' and link not like '%.com/python%' and link not like '%.com/ruby%' and link not like '%.com/cpp%' and link not like '%.com/dotnet%' and link not like '%.com/nodejs%' and link not like '%/reference/%' limit 10;\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "f3aad0b8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'admin'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# load_dotenv() (python-dotenv) loads variables from a .env file into the environment.\n",
    "load_dotenv()\n",
    "db_user = os.environ.get('POSTGRES_USER')\n",
    "db_pwd = os.environ.get('POSTGRES_PASSWORD')\n",
    "\n",
    "# Echo the user name (never the password) as a quick check that the values loaded.\n",
    "db_user"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7293fa00",
   "metadata": {},
   "outputs": [],
   "source": [
    "def connect_to_db():\n",
    "    \"\"\"\n",
    "    Open a psycopg2 connection to the 'gcplinks' database on localhost:5432.\n",
    "\n",
    "    The user name and password come from the notebook-level db_user / db_pwd values.\n",
    "\n",
    "    Returns:\n",
    "        The open connection on success, or None when psycopg2 reports an\n",
    "        error (the error is printed, not re-raised).\n",
    "    \"\"\"\n",
    "    connection_settings = {\n",
    "        \"host\": \"localhost\",\n",
    "        \"port\": 5432,\n",
    "        \"database\": \"gcplinks\",\n",
    "        \"user\": db_user,\n",
    "        \"password\": db_pwd,\n",
    "    }\n",
    "    try:\n",
    "        connection = psycopg2.connect(**connection_settings)\n",
    "    except psycopg2.Error as err:\n",
    "        print(f\"Error connecting to the database: {err}\")\n",
    "        return None\n",
    "\n",
    "    print(\"Successfully connected to the database.\")\n",
    "    return connection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c3a19cad",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://cloud.google.com/sitemap_{}_of_390.xml'"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sitemap address; the '{}' placeholder is later filled with the sitemap number via str.format().\n",
    "base_url = 'https://cloud.google.com/'\n",
    "sitemap_url = 'sitemap_{}_of_390'\n",
    "url = '{}{}.xml'.format(base_url, sitemap_url)\n",
    "\n",
    "# First and last sitemap number for the (currently commented-out) scrape cell.\n",
    "start_number, end_number = 1, 2\n",
    "\n",
    "# Connect / read timeouts in seconds for requests.get().\n",
    "connect_timeout_secs, read_timeout_secs = 5, 15\n",
    "\n",
    "# Lower / upper bound in seconds for the random pause between requests.\n",
    "random_sleep_lower, random_sleep_upper = 5, 20\n",
    "\n",
    "url"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e0eca8ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Browser-like request headers for the scrape cell's requests.get() call.\n",
    "# NOTE(review): 'br' is advertised in Accept-Encoding; presumably a brotli decoder is\n",
    "# installed in this environment so such responses get decoded - TODO confirm.\n",
    "headers = {\n",
    "    \"User-Agent\": \" \".join([\n",
    "        \"Mozilla/5.0 (Windows NT 10.0; Win64; x64)\",\n",
    "        \"AppleWebKit/537.36 (KHTML, like Gecko)\",\n",
    "        \"Chrome/122.0.0.0 Safari/537.36\",\n",
    "    ]),\n",
    "    \"Accept\": \"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\",\n",
    "    \"Accept-Language\": \"en-US,en;q=0.5\",\n",
    "    \"Accept-Encoding\": \"gzip, deflate, br\",\n",
    "    \"Connection\": \"keep-alive\",\n",
    "    \"Referer\": \"https://cloud.google.com/\",\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "b9f70537",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Output folders, relative to the notebook's working directory (trailing slash included).\n",
    "page_data_folder, raw_data_folder = './page_data/', './raw_data/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "84551079",
   "metadata": {},
   "outputs": [],
   "source": [
    "# BeautifulSoup emits XMLParsedAsHTMLWarning when an XML document is handed to an\n",
    "# HTML parser. The warning text itself recommends installing lxml and passing\n",
    "# features=\"xml\" to the BeautifulSoup constructor as the more reliable option;\n",
    "# filtering the category, as done here, only hides the message.\n",
    "\n",
    "warnings.filterwarnings(action=\"ignore\", category=XMLParsedAsHTMLWarning)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5417bc20",
   "metadata": {},
   "source": [
    "### Scrape page\n",
    "\n",
    "The next code block scrapes the given url.\n",
    "\n",
    "The cell is kept commented out so that it is not run by accident; uncomment it to download the sitemap files again."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "7a061795",
   "metadata": {},
   "outputs": [],
   "source": [
    "# for i in range(start_number, end_number + 1):\n",
    "#     response = requests.get(url.format(i), headers=headers, timeout=(connect_timeout_secs, read_timeout_secs))\n",
    "#     body = response.content\n",
    "#     soup = BeautifulSoup(body, 'xml')\n",
    "#     raw_file = raw_data_folder + sitemap_url.format(i) + \".txt\"\n",
    "#     with open(raw_file, \"w\") as f:\n",
    "#         f.write(str(soup))\n",
    "\n",
    "#     print(\"Wrote file: {}\".format(raw_file))\n",
    "#     random_sleep_time_seconds = random.uniform(random_sleep_lower, random_sleep_upper)\n",
    "#     print(\"Sleeping for {} seconds.\".format(random_sleep_time_seconds))\n",
    "#     time.sleep(random_sleep_time_seconds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fd9b6dce",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "98e47247",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Successfully connected to the database.\n",
      "reading : ./raw_data/sitemap_1_of_390.txt\n",
      "reading : ./raw_data/sitemap_2_of_390.txt\n",
      "reading : ./raw_data/sitemap_3_of_390.txt\n",
      "reading : ./raw_data/sitemap_4_of_390.txt\n",
      "reading : ./raw_data/sitemap_5_of_390.txt\n",
      "reading : ./raw_data/sitemap_6_of_390.txt\n",
      "reading : ./raw_data/sitemap_7_of_390.txt\n"
     ]
    }
   ],
   "source": [
    "# Load every <url> entry of the saved sitemap files into doc_links.\n",
    "# date_last_visited is seeded with the '1970-01-01' placeholder in the statement below.\n",
    "insert = \"insert into doc_links (link, date_last_modified, date_last_visited) values (%s, %s, '1970-01-01')\"\n",
    "\n",
    "conn = connect_to_db()\n",
    "if conn is None:\n",
    "    # connect_to_db() has already printed the reason. Stop with a clear error\n",
    "    # rather than an AttributeError on conn.cursor().\n",
    "    raise RuntimeError(\"Could not connect to the database; no links were inserted.\")\n",
    "\n",
    "cur = conn.cursor()\n",
    "try:\n",
    "    for j in range(1, 8):\n",
    "        # Same path as before, now built from the configuration cells so the\n",
    "        # folder and file-name pattern are defined in one place only.\n",
    "        raw_file = raw_data_folder + sitemap_url.format(j) + \".txt\"\n",
    "        with open(raw_file, \"r\", encoding='utf-8') as f:\n",
    "            print(\"reading : {}\".format(raw_file))\n",
    "            soup = BeautifulSoup(f, 'xml')\n",
    "\n",
    "        # The loop variable is url_tag, not url, so it does not overwrite the\n",
    "        # sitemap URL template `url` defined in the configuration cell.\n",
    "        for url_tag in soup.find_all('url'):\n",
    "            loc_tag = url_tag.find('loc')\n",
    "            if loc_tag is None:\n",
    "                print(\"Skipping <url> entry without <loc>\")\n",
    "                continue\n",
    "            loc = loc_tag.get_text()\n",
    "\n",
    "            # Keep only the date part of an ISO timestamp (text before 'T').\n",
    "            # A missing <lastmod> becomes None, which psycopg2 sends as NULL.\n",
    "            lastmod_tag = url_tag.find('lastmod')\n",
    "            lastmod = None\n",
    "            if lastmod_tag is not None:\n",
    "                lastmod = lastmod_tag.get_text().split('T')[0]\n",
    "\n",
    "            try:\n",
    "                # Commit per row so one failing insert only rolls back itself.\n",
    "                cur.execute(insert, (loc, lastmod))\n",
    "                conn.commit()\n",
    "            except Exception as e:\n",
    "                conn.rollback()\n",
    "                print(\"Failed to insert link: {}\".format(loc))\n",
    "                print(e)\n",
    "finally:\n",
    "    # Release the cursor and connection even if reading or parsing a file raises.\n",
    "    cur.close()\n",
    "    conn.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2050f990",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}