diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0f711ba --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +venv/ +.ipynb_checkpoints/ +docs/ +docker/.ipynb_checkpoints/ +docker/pgdata/ diff --git a/docker/ORIG-schema.sql b/docker/ORIG-schema.sql new file mode 100644 index 0000000..1c5ce26 --- /dev/null +++ b/docker/ORIG-schema.sql @@ -0,0 +1,17 @@ +CREATE EXTENSION IF NOT EXISTS vector; +CREATE EXTENSION IF NOT EXISTS hstore; +CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; + +CREATE TABLE IF NOT EXISTS vector_store ( + id uuid DEFAULT uuid_generate_v4() PRIMARY KEY, + document_id text, + title text, + chunk_index text, + content text, + source_url text, + date_last_modified timestamp, + metadata jsonb, + embedding vector +); + +CREATE INDEX ON vector_store USING HNSW (embedding vector_cosine_ops); \ No newline at end of file diff --git a/docker/compose.yaml b/docker/compose.yaml new file mode 100644 index 0000000..dd6b65b --- /dev/null +++ b/docker/compose.yaml @@ -0,0 +1,50 @@ +services: + ollama: + image: 'ollama/ollama:latest' + volumes: + - ollama_data:/root/.ollama + ports: + - '11434:11434' + mem_limit: 4g + memswap_limit: 16g + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + # healthcheck: + # test: ["CMD", "curl", "-f", "http://localhost:11434/api/tags"] + # interval: 10s + # timeout: 5s + # retries: 3 + # start_period: 20s + +# ollama-pull: +# image: 'ollama/ollama:latest' + # depends_on: + # ollama: + # condition: service_healthy +# volumes: +# - ollama_data:/root/.ollama +# environment: +# - OLLAMA_HOST=http://ollama:11434 +# command: ollama pull llama3.2:latest nomic-embed-text +# restart: "no" + + pgvector: + image: 'pgvector/pgvector:pg16' + environment: + - 'POSTGRES_DB=gcp_docs' + - 'POSTGRES_PASSWORD=password' + - 'POSTGRES_USER=admin' + ports: + - '15432:5432' + volumes: + - ./schema.sql:/docker-entrypoint-initdb.d/schema.sql + - ./pgdata:/var/lib/postgresql/data + +volumes: + pgdata: + ollama_data: diff --git a/docker/schema.sql b/docker/schema.sql new file mode 100644 index 0000000..0b184e1 --- /dev/null +++ b/docker/schema.sql @@ -0,0 +1,23 @@ +CREATE EXTENSION IF NOT EXISTS vector; +CREATE EXTENSION IF NOT EXISTS hstore; +CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; + +-- Documents table +CREATE TABLE documents ( + id uuid DEFAULT uuid_generate_v4() PRIMARY KEY, + source_url text UNIQUE NOT NULL, + title text, + date_last_modified timestamp, + metadata jsonb, + created_at timestamp DEFAULT CURRENT_TIMESTAMP +); + +-- Chunks table +CREATE TABLE chunks ( + id uuid DEFAULT uuid_generate_v4() PRIMARY KEY, + document_id uuid REFERENCES documents(id) ON DELETE CASCADE, + chunk_index text, + content text, + embedding vector, + created_at timestamp DEFAULT CURRENT_TIMESTAMP +); \ No newline at end of file diff --git a/one_liner.ipynb b/one_liner.ipynb new file mode 100644 index 0000000..e96610f --- /dev/null +++ b/one_liner.ipynb @@ -0,0 +1,91 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "5d3d7989-2c36-4811-9469-5e6be6ef6990", + "metadata": {}, + "outputs": [], + "source": [ + "string = \"\"\"\n", + "You are working as a network administrator for a company with\n", + "two subnets (subnet-a and subnet-b) in their default VPC. The company’s\n", + "database servers are located in subnet-a, while the application servers and web\n", + "servers operate in subnet-b. 
Your task is to configure a firewall rule that permits\n", + "database traffic exclusively from the application servers to the database servers.\n", + "What steps should be taken to accomplish this?\n", + "A. • Create service accounts sa-app and sa-db. • Associate service account sa-\n", + "app with the application servers and the service account sa-db with the database\n", + "servers. • Create an ingress firewall rule to allow network traffic from source\n", + "service account sa-app to target service account sa-db.\n", + "B. Create network tags db-server and app-server. • Add the db-server tag to\n", + "the application servers and the app-server tag to the database servers. • Create\n", + "an egress firewall rule to allow network traffic from source network tag db-server\n", + "to target network tag app-server.\n", + "C. Create a service account sa-app and a network tag db-server. • Associate the\n", + "service account sa-app with the database servers and the network tag db-server\n", + "with the application servers. • Create an ingress firewall rule to allow network\n", + "traffic from source service account sa-app to target network tag db-server.\n", + "D. Create a service account sa-app and a network tag app-server. • Add the\n", + "service account sa-app to the application servers and the network tag app-server\n", + "to the database servers. • Create an ingress firewall rule to allow network traffic\n", + "from source VPC IP addresses and target the subnet-b IP addresses\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "bf144f00-fce3-4842-abf3-37d244bf4192", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "' You are working as a network administrator for a company with two subnets (subnet-a and subnet-b) in their default VPC. The company’s database servers are located in subnet-a, while the application servers and web servers operate in subnet-b. Your task is to configure a firewall rule that permits database traffic exclusively from the application servers to the database servers. What steps should be taken to accomplish this? A. • Create service accounts sa-app and sa-db. • Associate service account sa- app with the application servers and the service account sa-db with the database servers. • Create an ingress firewall rule to allow network traffic from source service account sa-app to target service account sa-db. B. Create network tags db-server and app-server. • Add the db-server tag to the application servers and the app-server tag to the database servers. • Create an egress firewall rule to allow network traffic from source network tag db-server to target network tag app-server. C. Create a service account sa-app and a network tag db-server. • Associate the service account sa-app with the database servers and the network tag db-server with the application servers. • Create an ingress firewall rule to allow network traffic from source service account sa-app to target network tag db-server. D. Create a service account sa-app and a network tag app-server. • Add the service account sa-app to the application servers and the network tag app-server to the database servers. 
• Create an ingress firewall rule to allow network traffic from source VPC IP addresses and target the subnet-b IP addresses '" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "one_line = string.replace(\"\\n\", \" \")\n", + "one_line" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d91d618-c1bf-434c-890e-e5610782643b", + "metadata": {}, + "outputs": [], + "source": [ + "# Ch 12 Q2\n", + "# You are an IT Specialist at a technology company, and your Dataproc cluster runs in a single Virtual Private Cloud (VPC) network in a single subnetwork with range 172.16.20.128/25. The subnetwork runs out of private IP addresses. Your manager asks you to find a way to add new VMs for communication with the cluster while minimizing the steps involved. What should you do? A. Create a new subnetwork in the existing VPC with a range of 172.16.21.0/24 and configure the VMs to use that subnetwork. B. Create a new VPC network for the VMs with a subnet of 172.32.0.0/16. Enable VPC network Peering between the Dataproc VPC network and the VMs VPC network. Configure a custom Route exchange. C. Configure Shared VPC for the existing VPC and add the VMs to a new subnetwork in the Shared VPC. D. Modify the existing subnet range to 172.16.20.0/24" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/rag_pipeline.ipynb b/rag_pipeline.ipynb new file mode 100644 index 0000000..ac8bab1 --- /dev/null +++ b/rag_pipeline.ipynb @@ -0,0 +1,926 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "6647b014-6ef7-4553-bfd9-a17a92a2374c", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import re\n", + "import uuid\n", + "import ollama\n", + "import logging\n", + "import psycopg2\n", + "from psycopg2.extras import Json\n", + "from datetime import date, datetime\n", + "from bs4 import BeautifulSoup" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "53625000-e642-459e-b496-07da97095c08", + "metadata": {}, + "outputs": [], + "source": [ + "# Set up logging\n", + "logging.basicConfig(level=logging.INFO)\n", + "logger = logging.getLogger(__name__)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "47c7f0b4-32c0-4952-a49a-01ee0099a89d", + "metadata": {}, + "outputs": [], + "source": [ + "def read_file(filename):\n", + " content = ''\n", + " with open(filename, 'r') as f:\n", + " content = f.read()\n", + "\n", + " return content" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "9c5202b0-bd87-458e-b2c8-5dc94033271f", + "metadata": {}, + "outputs": [], + "source": [ + "BASE_PATH = './docs'" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "bbb37d78-eaa1-4959-b315-938d55949919", + "metadata": {}, + "outputs": [], + "source": [ + "def create_chunk(content_type, content, heading_stack, extra_metadata=None):\n", + " \"\"\"Helper to create consistent chunk structure.\n", + " Generated by ClaudeAI\"\"\"\n", + "\n", + " content = content\n", + "\n", + " if isinstance(content, list):\n", + " # Join the list first, then replace newlines\n", + " content 
= ' '.join(content).replace('\\n', ' ')\n", + " else:\n", + " # It's already a string\n", + " content = content.replace('\\n', ' ')\n", + " \n", + " chunk = {\n", + " 'content': content,\n", + " 'content_type': content_type,\n", + " 'heading_path': ' > '.join(h['text'] for h in heading_stack),\n", + " 'immediate_heading': heading_stack[-1]['text'] if heading_stack else None,\n", + " 'headings': [h['text'] for h in heading_stack],\n", + " }\n", + " \n", + " if extra_metadata:\n", + " chunk.update(extra_metadata)\n", + " \n", + " return chunk" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "9c5ba591-0ccf-4f96-9fc2-a50b994be065", + "metadata": {}, + "outputs": [], + "source": [ + "def process_list(list_element, heading_stack):\n", + " \"\"\"Process ul/ol lists as single chunks or individual items\"\"\"\n", + " list_type = 'ordered_list' if list_element.name == 'ol' else 'unordered_list'\n", + " \n", + " # Extract all list items\n", + " items = []\n", + " for li in list_element.find_all('li', recursive=False): # Only direct children\n", + " item_text = li.get_text().strip()\n", + " if item_text:\n", + " # Clean up bullets and numbering from the text\n", + " cleaned_text = clean_list_item_text(item_text)\n", + " if cleaned_text: # Only add if there's content after cleaning\n", + " items.append(cleaned_text)\n", + " \n", + " if not items:\n", + " return None\n", + " \n", + " # Strategy 1: Treat entire list as one chunk\n", + " if len(items) <= 10: # Reasonable threshold\n", + " content = format_list_content(items, list_type)\n", + " return create_chunk(list_type, content, heading_stack, {\n", + " 'item_count': len(items),\n", + " 'list_items': items\n", + " })\n", + " \n", + " # Strategy 2: Split long lists into multiple chunks\n", + " else:\n", + " chunks = []\n", + " chunk_size = 8\n", + " for i in range(0, len(items), chunk_size):\n", + " chunk_items = items[i:i + chunk_size]\n", + " content = format_list_content(chunk_items, list_type)\n", + " chunk = create_chunk(f'{list_type}_part', content, heading_stack, {\n", + " 'item_count': len(chunk_items),\n", + " 'list_items': chunk_items,\n", + " 'part_number': i // chunk_size + 1,\n", + " 'total_parts': (len(items) + chunk_size - 1) // chunk_size\n", + " })\n", + " chunks.append(chunk)\n", + " return chunks\n", + "\n", + "def clean_list_item_text(text):\n", + " \"\"\"Remove bullets, numbers, and other list markers from text\"\"\"\n", + " \n", + " # First, split on bullet points if multiple items are concatenated\n", + " # This handles cases where multiple list items got joined together\n", + " if '•' in text:\n", + " # Split on bullets and clean each part\n", + " parts = text.split('•')\n", + " cleaned_parts = []\n", + " for part in parts:\n", + " cleaned_part = clean_single_item(part.strip())\n", + " if cleaned_part:\n", + " cleaned_parts.append(cleaned_part)\n", + " \n", + " if len(cleaned_parts) > 1:\n", + " # Multiple items were concatenated, return them separated\n", + " return ' | '.join(cleaned_parts)\n", + " else:\n", + " # Single item, continue with normal cleaning\n", + " text = parts[0] if parts else text\n", + " \n", + " # Clean single item\n", + " return clean_single_item(text)\n", + "\n", + "def clean_single_item(text):\n", + " \"\"\"Clean a single list item\"\"\"\n", + " if not text:\n", + " return \"\"\n", + " \n", + " # Common bullet characters and patterns to remove\n", + " bullet_patterns = [\n", + " r'^[•·▪▫‣⁃◦▸▹►▻○●◉◎⦿⦾]\\s*', # Various bullet characters\n", + " r'^[-–—*+]\\s*', # Dash, asterisk, plus 
bullets\n", + " r'^\\d+[\\.\\)]\\s*', # Numbers with periods or parentheses\n", + " r'^[a-zA-Z][\\.\\)]\\s*', # Letters with periods or parentheses\n", + " r'^[ivxlcdm]+[\\.\\)]\\s*', # Roman numerals\n", + " r'^\\([a-zA-Z0-9]+\\)\\s*', # Parenthesized numbers/letters\n", + " r'^\\s*\\u2022\\s*', # Unicode bullet\n", + " r'^\\s*\\u25E6\\s*', # White bullet\n", + " r'^\\s*\\u25AA\\s*', # Black small square\n", + " r'^\\s*\\u25AB\\s*', # White small square\n", + " ]\n", + " \n", + " cleaned_text = text\n", + " for pattern in bullet_patterns:\n", + " cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE)\n", + " \n", + " # Remove extra whitespace\n", + " cleaned_text = re.sub(r'\\s+', ' ', cleaned_text).strip()\n", + " \n", + " return cleaned_text\n", + "\n", + "def format_list_content(items, list_type):\n", + " \"\"\"Format list items into readable content WITHOUT adding bullets\"\"\"\n", + " if list_type == 'ordered_list':\n", + " return '\\n'.join(f\"{i+1}. {item}\" for i, item in enumerate(items))\n", + " else:\n", + " # For unordered lists, just join with newlines or separators\n", + " # Don't add bullets since we want clean text\n", + " return '\\n'.join(items)\n", + " # Alternative: use a separator instead of newlines\n", + " # return ' | '.join(items)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "62d7feea-1c66-419d-8fb7-66dc75c8e9f7", + "metadata": {}, + "outputs": [], + "source": [ + "def process_table(table_element, heading_stack):\n", + " \"\"\"Process tables with different strategies based on size.\n", + " Generated by ClaudeAI\"\"\"\n", + " \n", + " # Extract table data\n", + " table_data = extract_table_data(table_element)\n", + " \n", + " if not table_data['rows']:\n", + " return None\n", + " \n", + " row_count = len(table_data['rows'])\n", + " col_count = len(table_data['headers']) if table_data['headers'] else len(table_data['rows'][0])\n", + " \n", + " # Strategy based on table size\n", + " if row_count <= 20 and col_count <= 6:\n", + " # Small table: treat as single chunk\n", + " content = format_table_content(table_data)\n", + " return create_chunk('table', content, heading_stack, {\n", + " 'row_count': row_count,\n", + " 'column_count': col_count,\n", + " 'headers': table_data['headers'],\n", + " 'table_caption': table_data['caption']\n", + " })\n", + " \n", + " else:\n", + " # Large table: split by rows\n", + " return split_large_table(table_data, heading_stack)\n", + "\n", + "def extract_table_data(table_element):\n", + " \"\"\"Extract structured data from table.\n", + " Generated by ClaudeAI\"\"\"\n", + " \n", + " # Get caption if present\n", + " caption_elem = table_element.find('caption')\n", + " caption = caption_elem.get_text().strip() if caption_elem else None\n", + " \n", + " # Extract headers\n", + " headers = []\n", + " header_row = table_element.find('thead')\n", + " if header_row:\n", + " for th in header_row.find_all(['th', 'td']):\n", + " headers.append(th.get_text().strip())\n", + " else:\n", + " # Try first row as headers\n", + " first_row = table_element.find('tr')\n", + " if first_row:\n", + " for cell in first_row.find_all(['th', 'td']):\n", + " headers.append(cell.get_text().strip())\n", + " \n", + " # Extract data rows\n", + " rows = []\n", + " tbody = table_element.find('tbody') or table_element\n", + " \n", + " for tr in tbody.find_all('tr')[1 if not table_element.find('thead') and headers else 0:]:\n", + " row = []\n", + " for cell in tr.find_all(['td', 'th']):\n", + " row.append(cell.get_text().strip())\n", 
+ " if row: # Skip empty rows\n", + " rows.append(row)\n", + " \n", + " return {\n", + " 'caption': caption,\n", + " 'headers': headers,\n", + " 'rows': rows\n", + " }\n", + "\n", + "def format_table_content(table_data):\n", + " \"\"\"Format table data into readable text\n", + " Generated by ClaudeAI\"\"\"\n", + " content_parts = []\n", + " \n", + " if table_data['caption']:\n", + " content_parts.append(f\"Table: {table_data['caption']}\")\n", + " \n", + " headers = table_data['headers']\n", + " rows = table_data['rows']\n", + " \n", + " if headers:\n", + " content_parts.append(\"Columns: \" + \" | \".join(headers))\n", + " \n", + " # Format rows\n", + " for i, row in enumerate(rows):\n", + " if headers and len(row) == len(headers):\n", + " # Create key-value pairs\n", + " row_content = []\n", + " for header, value in zip(headers, row):\n", + " if value: # Skip empty cells\n", + " row_content.append(f\"{header}: {value}\")\n", + " if row_content:\n", + " content_parts.append(f\"Row {i+1}: \" + \"; \".join(row_content))\n", + " else:\n", + " # Simple row format\n", + " content_parts.append(f\"Row {i+1}: \" + \" | \".join(row))\n", + " \n", + " return '\\n'.join(content_parts)\n", + "\n", + "def split_large_table(table_data, heading_stack):\n", + " \"\"\"Split large tables into smaller chunks\n", + " Generated By ClaudeAI\"\"\"\n", + " chunks = []\n", + " headers = table_data['headers']\n", + " rows = table_data['rows']\n", + " \n", + " chunk_size = 10 # Rows per chunk\n", + " total_chunks = (len(rows) + chunk_size - 1) // chunk_size\n", + " \n", + " for i in range(0, len(rows), chunk_size):\n", + " chunk_rows = rows[i:i + chunk_size]\n", + " \n", + " chunk_table_data = {\n", + " 'caption': table_data['caption'],\n", + " 'headers': headers,\n", + " 'rows': chunk_rows\n", + " }\n", + " \n", + " content = format_table_content(chunk_table_data)\n", + " \n", + " chunk = create_chunk('table_part', content, heading_stack, {\n", + " 'row_count': len(chunk_rows),\n", + " 'column_count': len(headers) if headers else len(chunk_rows[0]),\n", + " 'headers': headers,\n", + " 'table_caption': table_data['caption'],\n", + " 'part_number': i // chunk_size + 1,\n", + " 'total_parts': total_chunks,\n", + " 'row_range': f\"{i+1}-{min(i+chunk_size, len(rows))}\"\n", + " })\n", + " \n", + " chunks.append(chunk)\n", + " \n", + " return chunks" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "fb41a9bf-01d3-4931-80c7-f9d3f853d761", + "metadata": {}, + "outputs": [], + "source": [ + "def extract_content(html_content):\n", + " ret_value = {}\n", + " soup = BeautifulSoup(html_content, 'html.parser')\n", + "\n", + " og_url = soup.find('meta', property='og:url')\n", + " og_description = soup.find('meta', property=\"og:description\")\n", + " og_title = soup.find('meta', property=\"og:title\")\n", + " print(og_title)\n", + " title_content = og_title.get('content') if og_title else None\n", + " title = re.sub(r'[\\s\\xa0]*\\|[\\s\\xa0]*', ' | ', title_content) if title_content else None\n", + "\n", + " article_body = soup.find('div', class_='devsite-article-body')\n", + " if not article_body:\n", + " return {}\n", + "\n", + " footer = soup.find('devsite-content-footer')\n", + " # footer_paras = footer.find_all('p') if footer else None\n", + " # second_para = footer_paras[1] if len(footer_paras) > 1 else None\n", + " date_last_modified = date.today().strftime('%Y-%m-%d')\n", + " if footer:\n", + " footer_paras = footer.find_all('p')\n", + " for fp in footer_paras:\n", + " last_updated_re = r'Last updated 
(.*) UTC'\n", + " match = re.search(last_updated_re, fp.get_text())\n", + " if match:\n", + " date_last_modified = match.group(1)\n", + " break\n", + "\n", + " #\n", + " # Start ClaudeAI generated Code\n", + " #\n", + " chunks = []\n", + " heading_stack = []\n", + " \n", + " # Process elements that can be chunks or provide context\n", + " for element in article_body.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'table']):\n", + " if element.name.startswith('h'):\n", + " level = int(element.name[1])\n", + " heading_text = element.get_text().strip()\n", + " \n", + " heading_stack = [h for h in heading_stack if h['level'] < level]\n", + " heading_stack.append({'level': level, 'text': heading_text})\n", + " \n", + " elif element.name == 'p':\n", + " raw_content = element.get_text().strip()\n", + "\n", + " if isinstance(raw_content, list):\n", + " # Join the list first, then replace newlines\n", + " content = ' '.join(raw_content).replace('\\n', ' ')\n", + " else:\n", + " # It's already a string\n", + " content = raw_content.replace('\\n', ' ')\n", + " \n", + " if content and len(content) > 10:\n", + " chunk = create_chunk('paragraph', content, heading_stack)\n", + " chunks.append(chunk)\n", + " \n", + " elif element.name in ['ul', 'ol']:\n", + " list_chunk = process_list(element, heading_stack)\n", + " if list_chunk:\n", + " chunks.append(list_chunk)\n", + " \n", + " elif element.name == 'table':\n", + " table_chunk = process_table(element, heading_stack)\n", + " if table_chunk:\n", + " chunks.append(table_chunk)\n", + " #\n", + " # End ClaudeAI generated code\n", + " #\n", + "\n", + " ret_value['url'] = og_url.get('content') if og_url else None\n", + " ret_value['description'] = og_description.get('content') if og_description else None\n", + " ret_value['title'] = title\n", + " ret_value['date_last_modified'] = date_last_modified\n", + " ret_value['chunks'] = chunks\n", + " # ret_value['article'] = article_body\n", + "\n", + " return ret_value" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "816faecf-b764-46ce-ac2a-1cd1bf0bd9c4", + "metadata": {}, + "outputs": [], + "source": [ + "def prepare_enhanced_documents(html_content, additional_metadata=None):\n", + " extracted = extract_content(html_content)\n", + "\n", + " if not extracted:\n", + " return []\n", + " # extracted = extract_all_content_chunks(html_content)\n", + "\n", + " additional_metadata = {\n", + " \"date_last_modified\": extracted['date_last_modified']\n", + " }\n", + " \n", + " documents = []\n", + " chunk_counter = 0\n", + " \n", + " for chunk in extracted['chunks']:\n", + " # Handle cases where list/table processing returns multiple chunks\n", + " if isinstance(chunk, list):\n", + " for sub_chunk in chunk:\n", + " doc = create_document_from_chunk(sub_chunk, extracted['url'], extracted['title'], chunk_counter, additional_metadata)\n", + " documents.append(doc)\n", + " chunk_counter += 1\n", + " else:\n", + " doc = create_document_from_chunk(chunk, extracted['url'], extracted['title'], chunk_counter, additional_metadata)\n", + " documents.append(doc)\n", + " chunk_counter += 1\n", + " \n", + " return documents\n", + "\n", + "def create_document_from_chunk(chunk, url, title, index, additional_metadata):\n", + " \"\"\"Create document object from chunk\"\"\"\n", + " \n", + " # Create enhanced content for embedding\n", + " content_parts = []\n", + " \n", + " # Add document title\n", + " content_parts.append(title)\n", + " \n", + " # Add heading context\n", + " if chunk['heading_path']:\n", + " 
content_parts.append(f\"Section: {chunk['heading_path']}\")\n", + " \n", + " # Add content type context\n", + " content_type_labels = {\n", + " 'paragraph': '',\n", + " 'unordered_list': 'List:',\n", + " 'ordered_list': 'Numbered list:',\n", + " 'table': 'Table:',\n", + " 'table_part': 'Table data:',\n", + " 'unordered_list_part': 'List items:',\n", + " 'ordered_list_part': 'Numbered list items:'\n", + " }\n", + " \n", + " type_label = content_type_labels.get(chunk['content_type'], '')\n", + " if type_label:\n", + " content_parts.append(type_label)\n", + " \n", + " # Add main content\n", + " content_parts.append(chunk['content'])\n", + " \n", + " # Enhanced content for embedding\n", + " embedding_content = ' '.join(content_parts)\n", + " \n", + " # Base metadata\n", + " metadata = {\n", + " 'source_url': url,\n", + " 'document_title': title,\n", + " 'chunk_index': index,\n", + " 'content_type': chunk['content_type'],\n", + " 'heading_path': chunk['heading_path'],\n", + " 'immediate_heading': chunk['immediate_heading'],\n", + " 'all_headings': chunk['headings'],\n", + " 'processed_at': date.today().strftime('%Y-%m-%d'),\n", + " **(additional_metadata or {})\n", + " }\n", + " \n", + " # Add content-specific metadata\n", + " for key in ['item_count', 'list_items', 'row_count', 'column_count', 'headers', 'table_caption', 'part_number', 'total_parts', 'row_range']:\n", + " if key in chunk:\n", + " metadata[key] = chunk[key]\n", + " \n", + " doc = {\n", + " 'id': f\"{url}#chunk{index}\",\n", + " 'content': chunk['content'],\n", + " 'embedding_content': embedding_content,\n", + " 'metadata': metadata\n", + " }\n", + " \n", + " return doc" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "5fe9b756-8d89-481c-8ba1-4f4daa79135e", + "metadata": {}, + "outputs": [], + "source": [ + "def create_embedding(text):\n", + " \"\"\"Create an embedding vector for a single text\"\"\"\n", + " response = ollama.embeddings(\n", + " model='nomic-embed-text',\n", + " prompt=text\n", + " )\n", + " return response['embedding']" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "123af885-1da9-4089-a9c6-87b4d26d3e2d", + "metadata": {}, + "outputs": [], + "source": [ + "# content = read_file('./docs/run/cloud.google.com/run/docs/overview/what-is-cloud-run')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "02eedbde-e1a1-4b40-8ac2-e86ff50afa73", + "metadata": {}, + "outputs": [], + "source": [ + "folders_to_read = [\n", + " 'docs/run/cloud.google.com/run/docs',\n", + " 'docs/compute/cloud.google.com/compute/docs',\n", + " 'docs/iam/cloud.google.com/iam/docs'\n", + "]\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "3355eab2-0e54-4dbc-bb54-065cf165a9fa", + "metadata": {}, + "outputs": [], + "source": [ + "def is_html_file(file_path: str) -> bool:\n", + " \"\"\"Check if file is likely an HTML file based on content.\"\"\"\n", + " try:\n", + " with open(file_path, 'r', encoding='utf-8') as file:\n", + " first_line = file.readline().lower().strip()\n", + " # Check for common HTML indicators\n", + " return (first_line.startswith('\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST 
http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n" + ] + } + ], + "source": [ + "#records = process('docs/run/cloud.google.com/run/docs--')\n", + "# records = process('docs/iam/cloud.google.com/iam/docs')\n", + "# records = process('docs/compute/cloud.google.com/compute/docs')\n", + "# records = 
process('docs/storage/cloud.google.com/storage/docs')\n", + "# records = process('docs/iap/cloud.google.com/iap/docs')\n", + "# records = process('docs/bigquery/cloud.google.com/bigquery/docs')\n", + "# records = process('docs/apigee/cloud.google.com/apigee/docs')\n", + "records = process('docs/functions/cloud.google.com/functions/docs')\n", + "# records = process('docs/pubsub/cloud.google.com/pubsub/docs')\n", + "# records = process('docs/sql/cloud.google.com/sql/docs')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7a4fe99-d9db-46c6-bbfb-8d4d36446d1c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/rag_pipeline.py b/rag_pipeline.py new file mode 100644 index 0000000..1e35eb8 --- /dev/null +++ b/rag_pipeline.py @@ -0,0 +1,705 @@ +#!/usr/bin/env python +# coding: utf-8 + +import os +import re +import uuid +import ollama +import logging +import psycopg2 +from psycopg2.extras import Json +from datetime import date, datetime +from bs4 import BeautifulSoup + +# Set up logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def read_file(filename): + content = '' + with open(filename, 'r') as f: + content = f.read() + + return content + +BASE_PATH = './docs' + +def create_chunk(content_type, content, heading_stack, extra_metadata=None): + """Helper to create consistent chunk structure. 
+ Generated by ClaudeAI""" + + content = content + + if isinstance(content, list): + # Join the list first, then replace newlines + content = ' '.join(content).replace('\n', ' ') + else: + # It's already a string + content = content.replace('\n', ' ') + + chunk = { + 'content': content, + 'content_type': content_type, + 'heading_path': ' > '.join(h['text'] for h in heading_stack), + 'immediate_heading': heading_stack[-1]['text'] if heading_stack else None, + 'headings': [h['text'] for h in heading_stack], + } + + if extra_metadata: + chunk.update(extra_metadata) + + return chunk + +def process_list(list_element, heading_stack): + """Process ul/ol lists as single chunks or individual items""" + list_type = 'ordered_list' if list_element.name == 'ol' else 'unordered_list' + + # Extract all list items + items = [] + for li in list_element.find_all('li', recursive=False): # Only direct children + item_text = li.get_text().strip() + if item_text: + # Clean up bullets and numbering from the text + cleaned_text = clean_list_item_text(item_text) + if cleaned_text: # Only add if there's content after cleaning + items.append(cleaned_text) + + if not items: + return None + + # Strategy 1: Treat entire list as one chunk + if len(items) <= 10: # Reasonable threshold + content = format_list_content(items, list_type) + return create_chunk(list_type, content, heading_stack, { + 'item_count': len(items), + 'list_items': items + }) + + # Strategy 2: Split long lists into multiple chunks + else: + chunks = [] + chunk_size = 8 + for i in range(0, len(items), chunk_size): + chunk_items = items[i:i + chunk_size] + content = format_list_content(chunk_items, list_type) + chunk = create_chunk(f'{list_type}_part', content, heading_stack, { + 'item_count': len(chunk_items), + 'list_items': chunk_items, + 'part_number': i // chunk_size + 1, + 'total_parts': (len(items) + chunk_size - 1) // chunk_size + }) + chunks.append(chunk) + return chunks + +def clean_list_item_text(text): + """Remove bullets, numbers, and other list markers from text""" + + # First, split on bullet points if multiple items are concatenated + # This handles cases where multiple list items got joined together + if '•' in text: + # Split on bullets and clean each part + parts = text.split('•') + cleaned_parts = [] + for part in parts: + cleaned_part = clean_single_item(part.strip()) + if cleaned_part: + cleaned_parts.append(cleaned_part) + + if len(cleaned_parts) > 1: + # Multiple items were concatenated, return them separated + return ' | '.join(cleaned_parts) + else: + # Single item, continue with normal cleaning + text = parts[0] if parts else text + + # Clean single item + return clean_single_item(text) + +def clean_single_item(text): + """Clean a single list item""" + if not text: + return "" + + # Common bullet characters and patterns to remove + bullet_patterns = [ + r'^[•·▪▫‣⁃◦▸▹►▻○●◉◎⦿⦾]\s*', # Various bullet characters + r'^[-–—*+]\s*', # Dash, asterisk, plus bullets + r'^\d+[\.\)]\s*', # Numbers with periods or parentheses + r'^[a-zA-Z][\.\)]\s*', # Letters with periods or parentheses + r'^[ivxlcdm]+[\.\)]\s*', # Roman numerals + r'^\([a-zA-Z0-9]+\)\s*', # Parenthesized numbers/letters + r'^\s*\u2022\s*', # Unicode bullet + r'^\s*\u25E6\s*', # White bullet + r'^\s*\u25AA\s*', # Black small square + r'^\s*\u25AB\s*', # White small square + ] + + cleaned_text = text + for pattern in bullet_patterns: + cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE) + + # Remove extra whitespace + cleaned_text = re.sub(r'\s+', ' ', 
cleaned_text).strip() + + return cleaned_text + +def format_list_content(items, list_type): + """Format list items into readable content WITHOUT adding bullets""" + if list_type == 'ordered_list': + return '\n'.join(f"{i+1}. {item}" for i, item in enumerate(items)) + else: + # For unordered lists, just join with newlines or separators + # Don't add bullets since we want clean text + return '\n'.join(items) + # Alternative: use a separator instead of newlines + # return ' | '.join(items) + +def process_table(table_element, heading_stack): + """Process tables with different strategies based on size. + Generated by ClaudeAI""" + + # Extract table data + table_data = extract_table_data(table_element) + + if not table_data['rows']: + return None + + row_count = len(table_data['rows']) + col_count = len(table_data['headers']) if table_data['headers'] else len(table_data['rows'][0]) + + # Strategy based on table size + if row_count <= 20 and col_count <= 6: + # Small table: treat as single chunk + content = format_table_content(table_data) + return create_chunk('table', content, heading_stack, { + 'row_count': row_count, + 'column_count': col_count, + 'headers': table_data['headers'], + 'table_caption': table_data['caption'] + }) + + else: + # Large table: split by rows + return split_large_table(table_data, heading_stack) + +def extract_table_data(table_element): + """Extract structured data from table. + Generated by ClaudeAI""" + + # Get caption if present + caption_elem = table_element.find('caption') + caption = caption_elem.get_text().strip() if caption_elem else None + + # Extract headers + headers = [] + header_row = table_element.find('thead') + if header_row: + for th in header_row.find_all(['th', 'td']): + headers.append(th.get_text().strip()) + else: + # Try first row as headers + first_row = table_element.find('tr') + if first_row: + for cell in first_row.find_all(['th', 'td']): + headers.append(cell.get_text().strip()) + + # Extract data rows + rows = [] + tbody = table_element.find('tbody') or table_element + + for tr in tbody.find_all('tr')[1 if not table_element.find('thead') and headers else 0:]: + row = [] + for cell in tr.find_all(['td', 'th']): + row.append(cell.get_text().strip()) + if row: # Skip empty rows + rows.append(row) + + return { + 'caption': caption, + 'headers': headers, + 'rows': rows + } + +def format_table_content(table_data): + """Format table data into readable text + Generated by ClaudeAI""" + content_parts = [] + + if table_data['caption']: + content_parts.append(f"Table: {table_data['caption']}") + + headers = table_data['headers'] + rows = table_data['rows'] + + if headers: + content_parts.append("Columns: " + " | ".join(headers)) + + # Format rows + for i, row in enumerate(rows): + if headers and len(row) == len(headers): + # Create key-value pairs + row_content = [] + for header, value in zip(headers, row): + if value: # Skip empty cells + row_content.append(f"{header}: {value}") + if row_content: + content_parts.append(f"Row {i+1}: " + "; ".join(row_content)) + else: + # Simple row format + content_parts.append(f"Row {i+1}: " + " | ".join(row)) + + return '\n'.join(content_parts) + +def split_large_table(table_data, heading_stack): + """Split large tables into smaller chunks + Generated By ClaudeAI""" + chunks = [] + headers = table_data['headers'] + rows = table_data['rows'] + + chunk_size = 10 # Rows per chunk + total_chunks = (len(rows) + chunk_size - 1) // chunk_size + + for i in range(0, len(rows), chunk_size): + chunk_rows = rows[i:i + 
chunk_size] + + chunk_table_data = { + 'caption': table_data['caption'], + 'headers': headers, + 'rows': chunk_rows + } + + content = format_table_content(chunk_table_data) + + chunk = create_chunk('table_part', content, heading_stack, { + 'row_count': len(chunk_rows), + 'column_count': len(headers) if headers else len(chunk_rows[0]), + 'headers': headers, + 'table_caption': table_data['caption'], + 'part_number': i // chunk_size + 1, + 'total_parts': total_chunks, + 'row_range': f"{i+1}-{min(i+chunk_size, len(rows))}" + }) + + chunks.append(chunk) + + return chunks + + +def extract_content(html_content): + ret_value = {} + soup = BeautifulSoup(html_content, 'html.parser') + + og_url = soup.find('meta', property='og:url') + og_description = soup.find('meta', property="og:description") + og_title = soup.find('meta', property="og:title") + print(og_title) + title_content = og_title.get('content') if og_title else None + title = re.sub(r'[\s\xa0]*\|[\s\xa0]*', ' | ', title_content) if title_content else None + + article_body = soup.find('div', class_='devsite-article-body') + if not article_body: + return {} + + footer = soup.find('devsite-content-footer') + # footer_paras = footer.find_all('p') if footer else None + # second_para = footer_paras[1] if len(footer_paras) > 1 else None + date_last_modified = date.today().strftime('%Y-%m-%d') + if footer: + footer_paras = footer.find_all('p') + for fp in footer_paras: + last_updated_re = r'Last updated (.*) UTC' + match = re.search(last_updated_re, fp.get_text()) + if match: + date_last_modified = match.group(1) + break + + # + # Start ClaudeAI generated Code + # + chunks = [] + heading_stack = [] + + # Process elements that can be chunks or provide context + for element in article_body.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'table']): + if element.name.startswith('h'): + level = int(element.name[1]) + heading_text = element.get_text().strip() + + heading_stack = [h for h in heading_stack if h['level'] < level] + heading_stack.append({'level': level, 'text': heading_text}) + + elif element.name == 'p': + raw_content = element.get_text().strip() + + if isinstance(raw_content, list): + # Join the list first, then replace newlines + content = ' '.join(raw_content).replace('\n', ' ') + else: + # It's already a string + content = raw_content.replace('\n', ' ') + + if content and len(content) > 10: + chunk = create_chunk('paragraph', content, heading_stack) + chunks.append(chunk) + + elif element.name in ['ul', 'ol']: + list_chunk = process_list(element, heading_stack) + if list_chunk: + chunks.append(list_chunk) + + elif element.name == 'table': + table_chunk = process_table(element, heading_stack) + if table_chunk: + chunks.append(table_chunk) + # + # End ClaudeAI generated code + # + + ret_value['url'] = og_url.get('content') if og_url else None + ret_value['description'] = og_description.get('content') if og_description else None + ret_value['title'] = title + ret_value['date_last_modified'] = date_last_modified + ret_value['chunks'] = chunks + # ret_value['article'] = article_body + + return ret_value + +def prepare_enhanced_documents(html_content, additional_metadata=None): + extracted = extract_content(html_content) + + if not extracted: + return [] + # extracted = extract_all_content_chunks(html_content) + + additional_metadata = { + "date_last_modified": extracted['date_last_modified'] + } + + documents = [] + chunk_counter = 0 + + for chunk in extracted['chunks']: + # Handle cases where list/table processing returns 
multiple chunks + if isinstance(chunk, list): + for sub_chunk in chunk: + doc = create_document_from_chunk(sub_chunk, extracted['url'], extracted['title'], chunk_counter, additional_metadata) + documents.append(doc) + chunk_counter += 1 + else: + doc = create_document_from_chunk(chunk, extracted['url'], extracted['title'], chunk_counter, additional_metadata) + documents.append(doc) + chunk_counter += 1 + + return documents + +def create_document_from_chunk(chunk, url, title, index, additional_metadata): + """Create document object from chunk""" + + # Create enhanced content for embedding + content_parts = [] + + # Add document title + content_parts.append(title) + + # Add heading context + if chunk['heading_path']: + content_parts.append(f"Section: {chunk['heading_path']}") + + # Add content type context + content_type_labels = { + 'paragraph': '', + 'unordered_list': 'List:', + 'ordered_list': 'Numbered list:', + 'table': 'Table:', + 'table_part': 'Table data:', + 'unordered_list_part': 'List items:', + 'ordered_list_part': 'Numbered list items:' + } + + type_label = content_type_labels.get(chunk['content_type'], '') + if type_label: + content_parts.append(type_label) + + # Add main content + content_parts.append(chunk['content']) + + # Enhanced content for embedding + embedding_content = ' '.join(content_parts) + + # Base metadata + metadata = { + 'source_url': url, + 'document_title': title, + 'chunk_index': index, + 'content_type': chunk['content_type'], + 'heading_path': chunk['heading_path'], + 'immediate_heading': chunk['immediate_heading'], + 'all_headings': chunk['headings'], + 'processed_at': date.today().strftime('%Y-%m-%d'), + **(additional_metadata or {}) + } + + # Add content-specific metadata + for key in ['item_count', 'list_items', 'row_count', 'column_count', 'headers', 'table_caption', 'part_number', 'total_parts', 'row_range']: + if key in chunk: + metadata[key] = chunk[key] + + doc = { + 'id': f"{url}#chunk{index}", + 'content': chunk['content'], + 'embedding_content': embedding_content, + 'metadata': metadata + } + + return doc + +def create_embedding(text): + """Create an embedding vector for a single text""" + response = ollama.embeddings( + model='nomic-embed-text', + prompt=text + ) + return response['embedding'] + +# content = read_file('./docs/run/cloud.google.com/run/docs/overview/what-is-cloud-run') + +folders_to_read = [ + 'docs/run/cloud.google.com/run/docs', + 'docs/compute/cloud.google.com/compute/docs', + 'docs/iam/cloud.google.com/iam/docs' +] + +def is_html_file(file_path: str) -> bool: + """Check if file is likely an HTML file based on content.""" + try: + with open(file_path, 'r', encoding='utf-8') as file: + first_line = file.readline().lower().strip() + # Check for common HTML indicators + return (first_line.startswith('
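
The diff cuts off before the code that writes the prepared chunks into Postgres and queries them, but the tables in docker/schema.sql, the compose service settings, and the psycopg2/Json/ollama imports make the intended flow clear. The sketch below is illustrative only and not part of the commit: it assumes the connection values exposed by docker/compose.yaml (localhost:15432, database gcp_docs, user admin, password password), reuses the create_embedding helper defined in rag_pipeline.py above, and the function names store_documents and search_chunks are hypothetical.

# Hypothetical sketch of the loading and retrieval steps that the truncated
# diff does not show; connection values come from docker/compose.yaml and the
# table layout from docker/schema.sql. Not the repository's actual code.
import psycopg2
from psycopg2.extras import Json

DB_SETTINGS = dict(host='localhost', port=15432, dbname='gcp_docs',
                   user='admin', password='password')

def store_documents(docs):
    """Persist documents produced by prepare_enhanced_documents()."""
    with psycopg2.connect(**DB_SETTINGS) as conn, conn.cursor() as cur:
        for doc in docs:
            meta = doc['metadata']
            # Upsert the parent row; source_url is UNIQUE in the documents table.
            cur.execute(
                "INSERT INTO documents (source_url, title, date_last_modified, metadata) "
                "VALUES (%s, %s, %s, %s) "
                "ON CONFLICT (source_url) DO UPDATE SET title = EXCLUDED.title "
                "RETURNING id",
                (meta['source_url'], meta['document_title'],
                 meta.get('date_last_modified'), Json(meta)))
            document_id = cur.fetchone()[0]
            # Embed the enriched text with the create_embedding helper above and
            # store it; the untyped vector column accepts a '[x, y, ...]' literal.
            embedding = create_embedding(doc['embedding_content'])
            cur.execute(
                "INSERT INTO chunks (document_id, chunk_index, content, embedding) "
                "VALUES (%s, %s, %s, %s::vector)",
                (document_id, str(meta['chunk_index']), doc['content'], str(embedding)))

def search_chunks(question, limit=5):
    """Return the chunks closest to the question by cosine distance (<=>)."""
    query_embedding = create_embedding(question)
    with psycopg2.connect(**DB_SETTINGS) as conn, conn.cursor() as cur:
        cur.execute(
            "SELECT d.title, d.source_url, c.content, "
            "       c.embedding <=> %s::vector AS distance "
            "FROM chunks c JOIN documents d ON d.id = c.document_id "
            "ORDER BY distance LIMIT %s",
            (str(query_embedding), limit))
        return cur.fetchall()

Since nomic-embed-text produces 768-dimensional vectors, declaring the embedding columns as vector(768) in schema.sql would also allow an HNSW index to be added (as ORIG-schema.sql attempts), because pgvector indexes require the column to have a fixed dimension.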