{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6647b014-6ef7-4553-bfd9-a17a92a2374c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import uuid\n",
    "import ollama\n",
    "import logging\n",
    "import psycopg2\n",
    "from psycopg2.extras import Json\n",
    "from datetime import date, datetime\n",
    "from bs4 import BeautifulSoup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "53625000-e642-459e-b496-07da97095c08",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set up logging\n",
    "logging.basicConfig(level=logging.INFO)\n",
    "logger = logging.getLogger(__name__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "47c7f0b4-32c0-4952-a49a-01ee0099a89d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_file(filename):\n",
    "    content = ''\n",
    "    with open(filename, 'r') as f:\n",
    "        content = f.read()\n",
    "\n",
    "    return content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "9c5202b0-bd87-458e-b2c8-5dc94033271f",
   "metadata": {},
   "outputs": [],
   "source": [
    "BASE_PATH = './docs'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "bbb37d78-eaa1-4959-b315-938d55949919",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_chunk(content_type, content, heading_stack, extra_metadata=None):\n",
    "    \"\"\"Helper to create consistent chunk structure.\n",
    "    Generated by ClaudeAI\"\"\"\n",
    "\n",
    "    content = content\n",
    "\n",
    "    if isinstance(content, list):\n",
    "        # Join the list first, then replace newlines\n",
    "        content = ' '.join(content).replace('\\n', ' ')\n",
    "    else:\n",
    "        # It's already a string\n",
    "        content = content.replace('\\n', ' ')\n",
    "                \n",
    "    chunk = {\n",
    "        'content': content,\n",
    "        'content_type': content_type,\n",
    "        'heading_path': ' > '.join(h['text'] for h in heading_stack),\n",
    "        'immediate_heading': heading_stack[-1]['text'] if heading_stack else None,\n",
    "        'headings': [h['text'] for h in heading_stack],\n",
    "    }\n",
    "    \n",
    "    if extra_metadata:\n",
    "        chunk.update(extra_metadata)\n",
    "    \n",
    "    return chunk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "9c5ba591-0ccf-4f96-9fc2-a50b994be065",
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_list(list_element, heading_stack):\n",
    "    \"\"\"Process ul/ol lists as single chunks or individual items\"\"\"\n",
    "    list_type = 'ordered_list' if list_element.name == 'ol' else 'unordered_list'\n",
    "    \n",
    "    # Extract all list items\n",
    "    items = []\n",
    "    for li in list_element.find_all('li', recursive=False):  # Only direct children\n",
    "        item_text = li.get_text().strip()\n",
    "        if item_text:\n",
    "            # Clean up bullets and numbering from the text\n",
    "            cleaned_text = clean_list_item_text(item_text)\n",
    "            if cleaned_text:  # Only add if there's content after cleaning\n",
    "                items.append(cleaned_text)\n",
    "    \n",
    "    if not items:\n",
    "        return None\n",
    "    \n",
    "    # Strategy 1: Treat entire list as one chunk\n",
    "    if len(items) <= 10:  # Reasonable threshold\n",
    "        content = format_list_content(items, list_type)\n",
    "        return create_chunk(list_type, content, heading_stack, {\n",
    "            'item_count': len(items),\n",
    "            'list_items': items\n",
    "        })\n",
    "    \n",
    "    # Strategy 2: Split long lists into multiple chunks\n",
    "    else:\n",
    "        chunks = []\n",
    "        chunk_size = 8\n",
    "        for i in range(0, len(items), chunk_size):\n",
    "            chunk_items = items[i:i + chunk_size]\n",
    "            content = format_list_content(chunk_items, list_type)\n",
    "            chunk = create_chunk(f'{list_type}_part', content, heading_stack, {\n",
    "                'item_count': len(chunk_items),\n",
    "                'list_items': chunk_items,\n",
    "                'part_number': i // chunk_size + 1,\n",
    "                'total_parts': (len(items) + chunk_size - 1) // chunk_size\n",
    "            })\n",
    "            chunks.append(chunk)\n",
    "        return chunks\n",
    "\n",
    "def clean_list_item_text(text):\n",
    "    \"\"\"Remove bullets, numbers, and other list markers from text\"\"\"\n",
    "    \n",
    "    # First, split on bullet points if multiple items are concatenated\n",
    "    # This handles cases where multiple list items got joined together\n",
    "    if '•' in text:\n",
    "        # Split on bullets and clean each part\n",
    "        parts = text.split('•')\n",
    "        cleaned_parts = []\n",
    "        for part in parts:\n",
    "            cleaned_part = clean_single_item(part.strip())\n",
    "            if cleaned_part:\n",
    "                cleaned_parts.append(cleaned_part)\n",
    "        \n",
    "        if len(cleaned_parts) > 1:\n",
    "            # Multiple items were concatenated, return them separated\n",
    "            return ' | '.join(cleaned_parts)\n",
    "        else:\n",
    "            # Single item, continue with normal cleaning\n",
    "            text = parts[0] if parts else text\n",
    "    \n",
    "    # Clean single item\n",
    "    return clean_single_item(text)\n",
    "\n",
    "def clean_single_item(text):\n",
    "    \"\"\"Clean a single list item\"\"\"\n",
    "    if not text:\n",
    "        return \"\"\n",
    "    \n",
    "    # Common bullet characters and patterns to remove\n",
    "    bullet_patterns = [\n",
    "        r'^[•·▪▫‣⁃◦▸▹►▻○●◉◎⦿⦾]\\s*',  # Various bullet characters\n",
    "        r'^[-–—*+]\\s*',                    # Dash, asterisk, plus bullets\n",
    "        r'^\\d+[\\.\\)]\\s*',                  # Numbers with periods or parentheses\n",
    "        r'^[a-zA-Z][\\.\\)]\\s*',             # Letters with periods or parentheses\n",
    "        r'^[ivxlcdm]+[\\.\\)]\\s*',           # Roman numerals\n",
    "        r'^\\([a-zA-Z0-9]+\\)\\s*',           # Parenthesized numbers/letters\n",
    "        r'^\\s*\\u2022\\s*',                  # Unicode bullet\n",
    "        r'^\\s*\\u25E6\\s*',                  # White bullet\n",
    "        r'^\\s*\\u25AA\\s*',                  # Black small square\n",
    "        r'^\\s*\\u25AB\\s*',                  # White small square\n",
    "    ]\n",
    "    \n",
    "    cleaned_text = text\n",
    "    for pattern in bullet_patterns:\n",
    "        cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE)\n",
    "    \n",
    "    # Remove extra whitespace\n",
    "    cleaned_text = re.sub(r'\\s+', ' ', cleaned_text).strip()\n",
    "    \n",
    "    return cleaned_text\n",
    "\n",
    "def format_list_content(items, list_type):\n",
    "    \"\"\"Format list items into readable content WITHOUT adding bullets\"\"\"\n",
    "    if list_type == 'ordered_list':\n",
    "        return '\\n'.join(f\"{i+1}. {item}\" for i, item in enumerate(items))\n",
    "    else:\n",
    "        # For unordered lists, just join with newlines or separators\n",
    "        # Don't add bullets since we want clean text\n",
    "        return '\\n'.join(items)\n",
    "        # Alternative: use a separator instead of newlines\n",
    "        # return ' | '.join(items)"
   ]
  },
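  {
   "cell_type": "markdown",
   "id": "1f3c9a02-5b7e-4c21-9d4a-111111111111",
   "metadata": {},
   "source": [
    "A quick illustration of the list handling on a toy snippet (the HTML below is made up for this example). A three-item list is under the 10-item threshold, so it becomes a single chunk; note that `create_chunk` flattens the newlines that `format_list_content` inserts."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2a4d8b13-6c8f-4d32-8e5b-222222222222",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Toy example (hypothetical HTML, not from the scraped docs)\n",
    "sample = BeautifulSoup(\n",
    "    '<ul><li>Create a service</li><li>Deploy a revision</li><li>View logs</li></ul>',\n",
    "    'html.parser')\n",
    "demo = process_list(sample.find('ul'), [{'level': 2, 'text': 'Quickstart'}])\n",
    "print(demo['content_type'])  # unordered_list\n",
    "print(demo['content'])       # Create a service Deploy a revision View logs\n",
    "print(demo['list_items'])    # ['Create a service', 'Deploy a revision', 'View logs']"
   ]
  },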
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "62d7feea-1c66-419d-8fb7-66dc75c8e9f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_table(table_element, heading_stack):\n",
    "    \"\"\"Process tables with different strategies based on size.\n",
    "    Generated by ClaudeAI\"\"\"\n",
    "    \n",
    "    # Extract table data\n",
    "    table_data = extract_table_data(table_element)\n",
    "    \n",
    "    if not table_data['rows']:\n",
    "        return None\n",
    "    \n",
    "    row_count = len(table_data['rows'])\n",
    "    col_count = len(table_data['headers']) if table_data['headers'] else len(table_data['rows'][0])\n",
    "    \n",
    "    # Strategy based on table size\n",
    "    if row_count <= 20 and col_count <= 6:\n",
    "        # Small table: treat as single chunk\n",
    "        content = format_table_content(table_data)\n",
    "        return create_chunk('table', content, heading_stack, {\n",
    "            'row_count': row_count,\n",
    "            'column_count': col_count,\n",
    "            'headers': table_data['headers'],\n",
    "            'table_caption': table_data['caption']\n",
    "        })\n",
    "    \n",
    "    else:\n",
    "        # Large table: split by rows\n",
    "        return split_large_table(table_data, heading_stack)\n",
    "\n",
    "def extract_table_data(table_element):\n",
    "    \"\"\"Extract structured data from table.\n",
    "    Generated by ClaudeAI\"\"\"\n",
    "    \n",
    "    # Get caption if present\n",
    "    caption_elem = table_element.find('caption')\n",
    "    caption = caption_elem.get_text().strip() if caption_elem else None\n",
    "    \n",
    "    # Extract headers\n",
    "    headers = []\n",
    "    header_row = table_element.find('thead')\n",
    "    if header_row:\n",
    "        for th in header_row.find_all(['th', 'td']):\n",
    "            headers.append(th.get_text().strip())\n",
    "    else:\n",
    "        # Try first row as headers\n",
    "        first_row = table_element.find('tr')\n",
    "        if first_row:\n",
    "            for cell in first_row.find_all(['th', 'td']):\n",
    "                headers.append(cell.get_text().strip())\n",
    "    \n",
    "    # Extract data rows\n",
    "    rows = []\n",
    "    tbody = table_element.find('tbody') or table_element\n",
    "    \n",
    "    for tr in tbody.find_all('tr')[1 if not table_element.find('thead') and headers else 0:]:\n",
    "        row = []\n",
    "        for cell in tr.find_all(['td', 'th']):\n",
    "            row.append(cell.get_text().strip())\n",
    "        if row:  # Skip empty rows\n",
    "            rows.append(row)\n",
    "    \n",
    "    return {\n",
    "        'caption': caption,\n",
    "        'headers': headers,\n",
    "        'rows': rows\n",
    "    }\n",
    "\n",
    "def format_table_content(table_data):\n",
    "    \"\"\"Format table data into readable text\n",
    "    Generated by ClaudeAI\"\"\"\n",
    "    content_parts = []\n",
    "    \n",
    "    if table_data['caption']:\n",
    "        content_parts.append(f\"Table: {table_data['caption']}\")\n",
    "    \n",
    "    headers = table_data['headers']\n",
    "    rows = table_data['rows']\n",
    "    \n",
    "    if headers:\n",
    "        content_parts.append(\"Columns: \" + \" | \".join(headers))\n",
    "    \n",
    "    # Format rows\n",
    "    for i, row in enumerate(rows):\n",
    "        if headers and len(row) == len(headers):\n",
    "            # Create key-value pairs\n",
    "            row_content = []\n",
    "            for header, value in zip(headers, row):\n",
    "                if value:  # Skip empty cells\n",
    "                    row_content.append(f\"{header}: {value}\")\n",
    "            if row_content:\n",
    "                content_parts.append(f\"Row {i+1}: \" + \"; \".join(row_content))\n",
    "        else:\n",
    "            # Simple row format\n",
    "            content_parts.append(f\"Row {i+1}: \" + \" | \".join(row))\n",
    "    \n",
    "    return '\\n'.join(content_parts)\n",
    "\n",
    "def split_large_table(table_data, heading_stack):\n",
    "    \"\"\"Split large tables into smaller chunks\n",
    "    Generated By ClaudeAI\"\"\"\n",
    "    chunks = []\n",
    "    headers = table_data['headers']\n",
    "    rows = table_data['rows']\n",
    "    \n",
    "    chunk_size = 10  # Rows per chunk\n",
    "    total_chunks = (len(rows) + chunk_size - 1) // chunk_size\n",
    "    \n",
    "    for i in range(0, len(rows), chunk_size):\n",
    "        chunk_rows = rows[i:i + chunk_size]\n",
    "        \n",
    "        chunk_table_data = {\n",
    "            'caption': table_data['caption'],\n",
    "            'headers': headers,\n",
    "            'rows': chunk_rows\n",
    "        }\n",
    "        \n",
    "        content = format_table_content(chunk_table_data)\n",
    "        \n",
    "        chunk = create_chunk('table_part', content, heading_stack, {\n",
    "            'row_count': len(chunk_rows),\n",
    "            'column_count': len(headers) if headers else len(chunk_rows[0]),\n",
    "            'headers': headers,\n",
    "            'table_caption': table_data['caption'],\n",
    "            'part_number': i // chunk_size + 1,\n",
    "            'total_parts': total_chunks,\n",
    "            'row_range': f\"{i+1}-{min(i+chunk_size, len(rows))}\"\n",
    "        })\n",
    "        \n",
    "        chunks.append(chunk)\n",
    "    \n",
    "    return chunks"
   ]
  },
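  {
   "cell_type": "markdown",
   "id": "3b5e9c24-7d90-4e43-9f6c-333333333333",
   "metadata": {},
   "source": [
    "The same idea for tables, again on a made-up snippet: two data rows and two columns are well under the 20-row / 6-column threshold, so the table is emitted as a single `table` chunk with `Row N: header: value` formatting (newlines flattened by `create_chunk`)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c6fad35-8ea1-4f54-a07d-444444444444",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Toy example (hypothetical HTML): a small table becomes one chunk\n",
    "sample = BeautifulSoup(\n",
    "    '<table><tr><th>Tier</th><th>vCPUs</th></tr>'\n",
    "    '<tr><td>small</td><td>1</td></tr>'\n",
    "    '<tr><td>large</td><td>4</td></tr></table>',\n",
    "    'html.parser')\n",
    "demo = process_table(sample.find('table'), [{'level': 2, 'text': 'Machine types'}])\n",
    "print(demo['content'])\n",
    "# Columns: Tier | vCPUs Row 1: Tier: small; vCPUs: 1 Row 2: Tier: large; vCPUs: 4"
   ]
  },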
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "fb41a9bf-01d3-4931-80c7-f9d3f853d761",
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_content(html_content):\n",
    "    ret_value = {}\n",
    "    soup = BeautifulSoup(html_content, 'html.parser')\n",
    "\n",
    "    og_url = soup.find('meta', property='og:url')\n",
    "    og_description = soup.find('meta', property=\"og:description\")\n",
    "    og_title = soup.find('meta', property=\"og:title\")\n",
    "    print(og_title)\n",
    "    title_content = og_title.get('content') if og_title else None\n",
    "    title = re.sub(r'[\\s\\xa0]*\\|[\\s\\xa0]*', ' | ', title_content) if title_content else None\n",
    "\n",
    "    article_body = soup.find('div', class_='devsite-article-body')\n",
    "    if not article_body:\n",
    "        return {}\n",
    "\n",
    "    footer = soup.find('devsite-content-footer')\n",
    "    # footer_paras = footer.find_all('p') if footer else None\n",
    "    # second_para = footer_paras[1] if len(footer_paras) > 1 else None\n",
    "    date_last_modified = date.today().strftime('%Y-%m-%d')\n",
    "    if footer:\n",
    "            footer_paras = footer.find_all('p')\n",
    "            for fp in footer_paras:\n",
    "                last_updated_re = r'Last updated (.*) UTC'\n",
    "                match = re.search(last_updated_re, fp.get_text())\n",
    "                if match:\n",
    "                    date_last_modified = match.group(1)\n",
    "                    break\n",
    "\n",
    "    #\n",
    "    # Start ClaudeAI generated Code\n",
    "    #\n",
    "    chunks = []\n",
    "    heading_stack = []\n",
    "    \n",
    "    # Process elements that can be chunks or provide context\n",
    "    for element in article_body.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'table']):\n",
    "        if element.name.startswith('h'):\n",
    "            level = int(element.name[1])\n",
    "            heading_text = element.get_text().strip()\n",
    "            \n",
    "            heading_stack = [h for h in heading_stack if h['level'] < level]\n",
    "            heading_stack.append({'level': level, 'text': heading_text})\n",
    "            \n",
    "        elif element.name == 'p':\n",
    "            raw_content = element.get_text().strip()\n",
    "\n",
    "            if isinstance(raw_content, list):\n",
    "                # Join the list first, then replace newlines\n",
    "                content = ' '.join(raw_content).replace('\\n', ' ')\n",
    "            else:\n",
    "                # It's already a string\n",
    "                content = raw_content.replace('\\n', ' ')\n",
    "                \n",
    "            if content and len(content) > 10:\n",
    "                chunk = create_chunk('paragraph', content, heading_stack)\n",
    "                chunks.append(chunk)\n",
    "                \n",
    "        elif element.name in ['ul', 'ol']:\n",
    "            list_chunk = process_list(element, heading_stack)\n",
    "            if list_chunk:\n",
    "                chunks.append(list_chunk)\n",
    "                \n",
    "        elif element.name == 'table':\n",
    "            table_chunk = process_table(element, heading_stack)\n",
    "            if table_chunk:\n",
    "                chunks.append(table_chunk)\n",
    "    #\n",
    "    # End ClaudeAI generated code\n",
    "    #\n",
    "\n",
    "    ret_value['url'] = og_url.get('content') if og_url else None\n",
    "    ret_value['description'] = og_description.get('content') if og_description else None\n",
    "    ret_value['title'] = title\n",
    "    ret_value['date_last_modified'] = date_last_modified\n",
    "    ret_value['chunks'] = chunks\n",
    "    # ret_value['article'] = article_body\n",
    "\n",
    "    return ret_value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "816faecf-b764-46ce-ac2a-1cd1bf0bd9c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def prepare_enhanced_documents(html_content, additional_metadata=None):\n",
    "    extracted = extract_content(html_content)\n",
    "\n",
    "    if not extracted:\n",
    "        return []\n",
    "    # extracted = extract_all_content_chunks(html_content)\n",
    "\n",
    "    additional_metadata = {\n",
    "        \"date_last_modified\": extracted['date_last_modified']\n",
    "    }\n",
    "    \n",
    "    documents = []\n",
    "    chunk_counter = 0\n",
    "    \n",
    "    for chunk in extracted['chunks']:\n",
    "        # Handle cases where list/table processing returns multiple chunks\n",
    "        if isinstance(chunk, list):\n",
    "            for sub_chunk in chunk:\n",
    "                doc = create_document_from_chunk(sub_chunk, extracted['url'], extracted['title'], chunk_counter, additional_metadata)\n",
    "                documents.append(doc)\n",
    "                chunk_counter += 1\n",
    "        else:\n",
    "            doc = create_document_from_chunk(chunk, extracted['url'], extracted['title'], chunk_counter, additional_metadata)\n",
    "            documents.append(doc)\n",
    "            chunk_counter += 1\n",
    "    \n",
    "    return documents\n",
    "\n",
    "def create_document_from_chunk(chunk, url, title, index, additional_metadata):\n",
    "    \"\"\"Create document object from chunk\"\"\"\n",
    "    \n",
    "    # Create enhanced content for embedding\n",
    "    content_parts = []\n",
    "    \n",
    "    # Add document title\n",
    "    content_parts.append(title)\n",
    "    \n",
    "    # Add heading context\n",
    "    if chunk['heading_path']:\n",
    "        content_parts.append(f\"Section: {chunk['heading_path']}\")\n",
    "    \n",
    "    # Add content type context\n",
    "    content_type_labels = {\n",
    "        'paragraph': '',\n",
    "        'unordered_list': 'List:',\n",
    "        'ordered_list': 'Numbered list:',\n",
    "        'table': 'Table:',\n",
    "        'table_part': 'Table data:',\n",
    "        'unordered_list_part': 'List items:',\n",
    "        'ordered_list_part': 'Numbered list items:'\n",
    "    }\n",
    "    \n",
    "    type_label = content_type_labels.get(chunk['content_type'], '')\n",
    "    if type_label:\n",
    "        content_parts.append(type_label)\n",
    "    \n",
    "    # Add main content\n",
    "    content_parts.append(chunk['content'])\n",
    "    \n",
    "    # Enhanced content for embedding\n",
    "    embedding_content = ' '.join(content_parts)\n",
    "    \n",
    "    # Base metadata\n",
    "    metadata = {\n",
    "        'source_url': url,\n",
    "        'document_title': title,\n",
    "        'chunk_index': index,\n",
    "        'content_type': chunk['content_type'],\n",
    "        'heading_path': chunk['heading_path'],\n",
    "        'immediate_heading': chunk['immediate_heading'],\n",
    "        'all_headings': chunk['headings'],\n",
    "        'processed_at': date.today().strftime('%Y-%m-%d'),\n",
    "        **(additional_metadata or {})\n",
    "    }\n",
    "    \n",
    "    # Add content-specific metadata\n",
    "    for key in ['item_count', 'list_items', 'row_count', 'column_count', 'headers', 'table_caption', 'part_number', 'total_parts', 'row_range']:\n",
    "        if key in chunk:\n",
    "            metadata[key] = chunk[key]\n",
    "    \n",
    "    doc = {\n",
    "        'id': f\"{url}#chunk{index}\",\n",
    "        'content': chunk['content'],\n",
    "        'embedding_content': embedding_content,\n",
    "        'metadata': metadata\n",
    "    }\n",
    "    \n",
    "    return doc"
   ]
  },
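  {
   "cell_type": "markdown",
   "id": "5d70be46-9fb2-4065-b18e-555555555555",
   "metadata": {},
   "source": [
    "End-to-end check on a minimal synthetic page (the HTML below is made up for illustration; real input comes from the scraped `docs/` tree). `extract_content` needs the `og:*` meta tags plus a `devsite-article-body` div to return anything."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e81cf57-a0c3-4176-82cf-666666666666",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Synthetic page exercising the extraction path end to end\n",
    "toy_html = '''<html><head>\n",
    "<meta property=\"og:url\" content=\"https://example.com/toy-doc\">\n",
    "<meta property=\"og:title\" content=\"Toy doc | Example\">\n",
    "<meta property=\"og:description\" content=\"A tiny test page.\">\n",
    "</head><body><div class=\"devsite-article-body\">\n",
    "<h2>Overview</h2>\n",
    "<p>This paragraph is long enough to survive the length filter.</p>\n",
    "</div></body></html>'''\n",
    "\n",
    "docs = prepare_enhanced_documents(toy_html)\n",
    "print(docs[0]['id'])                 # https://example.com/toy-doc#chunk0\n",
    "print(docs[0]['embedding_content'])  # title + section path + paragraph text"
   ]
  },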
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "5fe9b756-8d89-481c-8ba1-4f4daa79135e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_embedding(text):\n",
    "    \"\"\"Create an embedding vector for a single text\"\"\"\n",
    "    response = ollama.embeddings(\n",
    "        model='nomic-embed-text',\n",
    "        prompt=text\n",
    "    )\n",
    "    return response['embedding']"
   ]
  },
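  {
   "cell_type": "markdown",
   "id": "7f92d068-b1d4-4287-93d0-777777777777",
   "metadata": {},
   "source": [
    "Quick sanity check (assumes a local Ollama server on the default port with `nomic-embed-text` already pulled; that model produces 768-dimensional vectors):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "80a3e179-c2e5-4398-a4e1-888888888888",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Requires `ollama serve` running and `ollama pull nomic-embed-text`\n",
    "vec = create_embedding('Cloud Run is a managed compute platform.')\n",
    "print(len(vec))  # expect 768 for nomic-embed-text"
   ]
  },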
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "123af885-1da9-4089-a9c6-87b4d26d3e2d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# content = read_file('./docs/run/cloud.google.com/run/docs/overview/what-is-cloud-run')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "02eedbde-e1a1-4b40-8ac2-e86ff50afa73",
   "metadata": {},
   "outputs": [],
   "source": [
    "folders_to_read = [\n",
    "    'docs/run/cloud.google.com/run/docs',\n",
    "    'docs/compute/cloud.google.com/compute/docs',\n",
    "    'docs/iam/cloud.google.com/iam/docs'\n",
    "]\n",
    "    \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "3355eab2-0e54-4dbc-bb54-065cf165a9fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "def is_html_file(file_path: str) -> bool:\n",
    "    \"\"\"Check if file is likely an HTML file based on content.\"\"\"\n",
    "    try:\n",
    "        with open(file_path, 'r', encoding='utf-8') as file:\n",
    "            first_line = file.readline().lower().strip()\n",
    "            # Check for common HTML indicators\n",
    "            return (first_line.startswith('<!doctype html') or \n",
    "                   first_line.startswith('<html') or \n",
    "                   '<html' in first_line)\n",
    "    except:\n",
    "        return False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "f048c113-b928-41e1-8f97-f6a8315b1bcd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# def insert_to_database(conn, records):\n",
    "#     \"\"\"Insert records into the vector_store table.\"\"\"\n",
    "#     try:\n",
    "#         cursor = conn.cursor()\n",
    "        \n",
    "#         insert_query = \"\"\"\n",
    "#             INSERT INTO vector_store (\n",
    "#                 id, title, chunk_index, content, source_url, \n",
    "#                 date_last_modified, metadata, embedding\n",
    "#             ) VALUES (\n",
    "#                 %s, %s, %s, %s, %s, %s, %s, %s\n",
    "#             )\n",
    "#         \"\"\"\n",
    "        \n",
    "#         for record in records:\n",
    "#             cursor.execute(insert_query, (\n",
    "#                 record['id'],\n",
    "#                 record['title'],\n",
    "#                 record['chunk_index'],\n",
    "#                 record['content'],\n",
    "#                 record['source_url'],\n",
    "#                 record['date_last_modified'],\n",
    "#                 Json(record['metadata']),\n",
    "#                 record['embedding']\n",
    "#             ))\n",
    "        \n",
    "#         conn.commit()\n",
    "#         logger.info(f\"Inserted {len(records)} records into database\")\n",
    "        \n",
    "#     except Exception as e:\n",
    "#         logger.error(f\"Database insertion error: {str(e)}\")\n",
    "#         conn.rollback()\n",
    "#         raise\n",
    "#     finally:\n",
    "#         cursor.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "272d8cf0-5eab-494b-ad72-9f081aa36586",
   "metadata": {},
   "outputs": [],
   "source": [
    "def insert_to_database(conn, records):\n",
    "    \"\"\"Insert records into the documents and chunks tables.\"\"\"\n",
    "    try:\n",
    "        cursor = conn.cursor()\n",
    "        \n",
    "        # Group records by source_url to handle documents\n",
    "        documents_by_url = {}\n",
    "        for record in records:\n",
    "            source_url = record['source_url']\n",
    "            if source_url not in documents_by_url:\n",
    "                documents_by_url[source_url] = {\n",
    "                    'title': record['title'],\n",
    "                    'source_url': source_url,\n",
    "                    'date_last_modified': record['date_last_modified'],\n",
    "                    'metadata': record['metadata'],\n",
    "                    'chunks': []\n",
    "                }\n",
    "            documents_by_url[source_url]['chunks'].append(record)\n",
    "        \n",
    "        # Insert documents (with conflict handling for duplicates)\n",
    "        document_insert_query = \"\"\"\n",
    "            INSERT INTO documents (source_url, title, date_last_modified, metadata)\n",
    "            VALUES (%s, %s, %s, %s)\n",
    "            ON CONFLICT (source_url) DO UPDATE SET\n",
    "                title = EXCLUDED.title,\n",
    "                date_last_modified = EXCLUDED.date_last_modified,\n",
    "                metadata = EXCLUDED.metadata\n",
    "            RETURNING id\n",
    "        \"\"\"\n",
    "        \n",
    "        # Insert chunks\n",
    "        chunk_insert_query = \"\"\"\n",
    "            INSERT INTO chunks (document_id, chunk_index, content, embedding)\n",
    "            VALUES (%s, %s, %s, %s)\n",
    "        \"\"\"\n",
    "        \n",
    "        # Get document ID query for existing documents\n",
    "        get_document_id_query = \"\"\"\n",
    "            SELECT id FROM documents WHERE source_url = %s\n",
    "        \"\"\"\n",
    "        \n",
    "        total_chunks_inserted = 0\n",
    "        \n",
    "        for source_url, doc_data in documents_by_url.items():\n",
    "            # Try to insert the document (or update if exists)\n",
    "            try:\n",
    "                cursor.execute(document_insert_query, (\n",
    "                    doc_data['source_url'],\n",
    "                    doc_data['title'],\n",
    "                    doc_data['date_last_modified'],\n",
    "                    Json(doc_data['metadata'])\n",
    "                ))\n",
    "                document_id = cursor.fetchone()[0]\n",
    "            except Exception as e:\n",
    "                # If insert fails, try to get existing document ID\n",
    "                cursor.execute(get_document_id_query, (source_url,))\n",
    "                result = cursor.fetchone()\n",
    "                if result:\n",
    "                    document_id = result[0]\n",
    "                else:\n",
    "                    logger.error(f\"Failed to insert or find document for URL {source_url}: {str(e)}\")\n",
    "                    continue\n",
    "            \n",
    "            # Insert chunks for this document\n",
    "            for chunk in doc_data['chunks']:\n",
    "                cursor.execute(chunk_insert_query, (\n",
    "                    document_id,\n",
    "                    chunk['chunk_index'],\n",
    "                    chunk['content'],\n",
    "                    chunk['embedding']\n",
    "                ))\n",
    "                total_chunks_inserted += 1\n",
    "        \n",
    "        conn.commit()\n",
    "        logger.info(f\"Inserted {len(documents_by_url)} documents and {total_chunks_inserted} chunks into database\")\n",
    "        \n",
    "    except Exception as e:\n",
    "        logger.error(f\"Database insertion error: {str(e)}\")\n",
    "        conn.rollback()\n",
    "        raise\n",
    "    finally:\n",
    "        cursor.close()"
   ]
  },
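  {
   "cell_type": "markdown",
   "id": "91b4f28a-d3f6-44a9-b5f2-999999999999",
   "metadata": {},
   "source": [
    "`insert_to_database` assumes a `documents`/`chunks` schema that is not defined in this notebook. A minimal sketch that satisfies the queries above (hypothetical; the exact column types are assumptions, and `vector(768)` matches `nomic-embed-text`'s output size):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a2c5039b-e407-45ba-86a3-aaaaaaaaaaaa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical schema sketch -- not taken from this repo. The ON CONFLICT\n",
    "# clause above needs the UNIQUE constraint on documents.source_url, and\n",
    "# the embedding column assumes the pgvector extension is installed.\n",
    "SCHEMA_SQL = \"\"\"\n",
    "CREATE EXTENSION IF NOT EXISTS vector;\n",
    "\n",
    "CREATE TABLE IF NOT EXISTS documents (\n",
    "    id SERIAL PRIMARY KEY,\n",
    "    source_url TEXT UNIQUE NOT NULL,\n",
    "    title TEXT,\n",
    "    date_last_modified TEXT,\n",
    "    metadata JSONB\n",
    ");\n",
    "\n",
    "CREATE TABLE IF NOT EXISTS chunks (\n",
    "    id SERIAL PRIMARY KEY,\n",
    "    document_id INTEGER REFERENCES documents(id),\n",
    "    chunk_index INTEGER,\n",
    "    content TEXT,\n",
    "    embedding vector(768)\n",
    ");\n",
    "\"\"\"\n",
    "# To apply once: cursor.execute(SCHEMA_SQL); conn.commit()"
   ]
  },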
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "b67e5cb3-fa63-4429-a791-902ef2fa4356",
   "metadata": {},
   "outputs": [],
   "source": [
    "def process(folder_path):\n",
    "    \n",
    "    try:\n",
    "        # Connect to database\n",
    "        conn = psycopg2.connect(\n",
    "            host=\"localhost\",           # Database server host\n",
    "            port=15432,                 # Port number (default: 5432)\n",
    "            database=\"gcp_docs\",           # Database name\n",
    "            user=\"admin\",           # Username\n",
    "            password=\"password\"        # Password\n",
    "        )\n",
    "        logger.info(\"Connected to database successfully\")\n",
    "        \n",
    "        # Track processing statistics\n",
    "        total_files = 0\n",
    "        total_chunks = 0\n",
    "        \n",
    "        # Walk through all directories and files\n",
    "        for root, dirs, files in os.walk(folder_path):\n",
    "            logger.info(f\"Processing directory: {root}\")\n",
    "            for filename in files:\n",
    "                file_path = os.path.join(root, filename)\n",
    "               \n",
    "                html_page = read_file(file_path)\n",
    "                logger.info(f\"Processing file: {file_path}\")\n",
    "                \n",
    "                # Extract content from HTML\n",
    "                document = prepare_enhanced_documents(html_page)\n",
    "                \n",
    "                if not document:\n",
    "                    logger.warning(f\"No content extracted from {file_path}\")\n",
    "                    continue\n",
    "\n",
    "                i = 0\n",
    "                records = []\n",
    "                for chunk in document:\n",
    "                    embedding = create_embedding(chunk['embedding_content'])\n",
    "                    record = {\n",
    "                        'id': str(uuid.uuid4()),\n",
    "                        'title': chunk['metadata']['document_title'],\n",
    "                        'chunk_index':chunk['id'],\n",
    "                        'content': chunk['content'],\n",
    "                        'source_url': chunk['metadata']['source_url'],\n",
    "                        'date_last_modified': chunk['metadata']['date_last_modified'],\n",
    "                        'metadata': {\n",
    "                            **chunk['metadata'],\n",
    "                            'chunk_number': i,\n",
    "                            'total_chunks': len(document),\n",
    "                            'chunk_size': len(chunk['embedding_content'])\n",
    "                        },\n",
    "                        'embedding': embedding\n",
    "                    }\n",
    "                    records.append(record)\n",
    "                    i = i + 1\n",
    "                \n",
    "                # Insert records into database\n",
    "                if records:\n",
    "                    insert_to_database(conn, records)\n",
    "                    total_files += 1\n",
    "                    total_chunks += len(records)\n",
    "        \n",
    "        logger.info(f\"Processing complete. Processed {total_files} files, created {total_chunks} chunks\")\n",
    "        return records\n",
    "        \n",
    "    except Exception as e:\n",
    "        logger.error(f\"Error during processing: {str(e)}\")\n",
    "        raise\n",
    "    finally:\n",
    "        if 'conn' in locals():\n",
    "            conn.close()\n",
    "        logger.info(\"Database connection closed\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "b125fdf1-838f-4516-885d-004342ec0be7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# html_file = read_file('docs/run/cloud.google.com/run/docs/monitoring-overview')\n",
    "# prepare_enhanced_documents(html_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ed63e442-ede2-48f0-a378-884110a250f1",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:__main__:Connected to database successfully\n",
      "INFO:__main__:Processing directory: docs/functions/cloud.google.com/functions/docs\n",
      "INFO:__main__:Processing file: docs/functions/cloud.google.com/functions/docs/create-deploy-http-ruby\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n"
     ]
    }
   ],
   "source": [
    "#records = process('docs/run/cloud.google.com/run/docs--')\n",
    "# records = process('docs/iam/cloud.google.com/iam/docs')\n",
    "# records = process('docs/compute/cloud.google.com/compute/docs')\n",
    "# records = process('docs/storage/cloud.google.com/storage/docs')\n",
    "# records = process('docs/iap/cloud.google.com/iap/docs')\n",
    "# records = process('docs/bigquery/cloud.google.com/bigquery/docs')\n",
    "# records = process('docs/apigee/cloud.google.com/apigee/docs')\n",
    "records = process('docs/functions/cloud.google.com/functions/docs')\n",
    "# records = process('docs/pubsub/cloud.google.com/pubsub/docs')\n",
    "# records = process('docs/sql/cloud.google.com/sql/docs')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d7a4fe99-d9db-46c6-bbfb-8d4d36446d1c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}