diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0f711ba --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +venv/ +.ipynb_checkpoints/ +docs/ +docker/.ipynb_checkpoints/ +docker/pgdata/ diff --git a/docker/ORIG-schema.sql b/docker/ORIG-schema.sql new file mode 100644 index 0000000..1c5ce26 --- /dev/null +++ b/docker/ORIG-schema.sql @@ -0,0 +1,17 @@ +CREATE EXTENSION IF NOT EXISTS vector; +CREATE EXTENSION IF NOT EXISTS hstore; +CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; + +CREATE TABLE IF NOT EXISTS vector_store ( + id uuid DEFAULT uuid_generate_v4() PRIMARY KEY, + document_id text, + title text, + chunk_index text, + content text, + source_url text, + date_last_modified timestamp, + metadata jsonb, + embedding vector +); + +CREATE INDEX ON vector_store USING HNSW (embedding vector_cosine_ops); \ No newline at end of file diff --git a/docker/compose.yaml b/docker/compose.yaml new file mode 100644 index 0000000..dd6b65b --- /dev/null +++ b/docker/compose.yaml @@ -0,0 +1,50 @@ +services: + ollama: + image: 'ollama/ollama:latest' + volumes: + - ollama_data:/root/.ollama + ports: + - '11434:11434' + mem_limit: 4g + memswap_limit: 16g + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + # healthcheck: + # test: ["CMD", "curl", "-f", "http://localhost:11434/api/tags"] + # interval: 10s + # timeout: 5s + # retries: 3 + # start_period: 20s + +# ollama-pull: +# image: 'ollama/ollama:latest' + # depends_on: + # ollama: + # condition: service_healthy +# volumes: +# - ollama_data:/root/.ollama +# environment: +# - OLLAMA_HOST=http://ollama:11434 +# command: ollama pull llama3.2:latest nomic-embed-text +# restart: "no" + + pgvector: + image: 'pgvector/pgvector:pg16' + environment: + - 'POSTGRES_DB=gcp_docs' + - 'POSTGRES_PASSWORD=password' + - 'POSTGRES_USER=admin' + ports: + - '15432:5432' + volumes: + - ./schema.sql:/docker-entrypoint-initdb.d/schema.sql + - ./pgdata:/var/lib/postgresql/data + +volumes: + pgdata: + ollama_data: diff --git a/docker/schema.sql b/docker/schema.sql new file mode 100644 index 0000000..0b184e1 --- /dev/null +++ b/docker/schema.sql @@ -0,0 +1,23 @@ +CREATE EXTENSION IF NOT EXISTS vector; +CREATE EXTENSION IF NOT EXISTS hstore; +CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; + +-- Documents table +CREATE TABLE documents ( + id uuid DEFAULT uuid_generate_v4() PRIMARY KEY, + source_url text UNIQUE NOT NULL, + title text, + date_last_modified timestamp, + metadata jsonb, + created_at timestamp DEFAULT CURRENT_TIMESTAMP +); + +-- Chunks table +CREATE TABLE chunks ( + id uuid DEFAULT uuid_generate_v4() PRIMARY KEY, + document_id uuid REFERENCES documents(id) ON DELETE CASCADE, + chunk_index text, + content text, + embedding vector, + created_at timestamp DEFAULT CURRENT_TIMESTAMP +); \ No newline at end of file diff --git a/one_liner.ipynb b/one_liner.ipynb new file mode 100644 index 0000000..e96610f --- /dev/null +++ b/one_liner.ipynb @@ -0,0 +1,91 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "5d3d7989-2c36-4811-9469-5e6be6ef6990", + "metadata": {}, + "outputs": [], + "source": [ + "string = \"\"\"\n", + "You are working as a network administrator for a company with\n", + "two subnets (subnet-a and subnet-b) in their default VPC. The company’s\n", + "database servers are located in subnet-a, while the application servers and web\n", + "servers operate in subnet-b. 
Your task is to configure a firewall rule that permits\n", + "database traffic exclusively from the application servers to the database servers.\n", + "What steps should be taken to accomplish this?\n", + "A. • Create service accounts sa-app and sa-db. • Associate service account sa-\n", + "app with the application servers and the service account sa-db with the database\n", + "servers. • Create an ingress firewall rule to allow network traffic from source\n", + "service account sa-app to target service account sa-db.\n", + "B. Create network tags db-server and app-server. • Add the db-server tag to\n", + "the application servers and the app-server tag to the database servers. • Create\n", + "an egress firewall rule to allow network traffic from source network tag db-server\n", + "to target network tag app-server.\n", + "C. Create a service account sa-app and a network tag db-server. • Associate the\n", + "service account sa-app with the database servers and the network tag db-server\n", + "with the application servers. • Create an ingress firewall rule to allow network\n", + "traffic from source service account sa-app to target network tag db-server.\n", + "D. Create a service account sa-app and a network tag app-server. • Add the\n", + "service account sa-app to the application servers and the network tag app-server\n", + "to the database servers. • Create an ingress firewall rule to allow network traffic\n", + "from source VPC IP addresses and target the subnet-b IP addresses\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "bf144f00-fce3-4842-abf3-37d244bf4192", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "' You are working as a network administrator for a company with two subnets (subnet-a and subnet-b) in their default VPC. The company’s database servers are located in subnet-a, while the application servers and web servers operate in subnet-b. Your task is to configure a firewall rule that permits database traffic exclusively from the application servers to the database servers. What steps should be taken to accomplish this? A. • Create service accounts sa-app and sa-db. • Associate service account sa- app with the application servers and the service account sa-db with the database servers. • Create an ingress firewall rule to allow network traffic from source service account sa-app to target service account sa-db. B. Create network tags db-server and app-server. • Add the db-server tag to the application servers and the app-server tag to the database servers. • Create an egress firewall rule to allow network traffic from source network tag db-server to target network tag app-server. C. Create a service account sa-app and a network tag db-server. • Associate the service account sa-app with the database servers and the network tag db-server with the application servers. • Create an ingress firewall rule to allow network traffic from source service account sa-app to target network tag db-server. D. Create a service account sa-app and a network tag app-server. • Add the service account sa-app to the application servers and the network tag app-server to the database servers. 
• Create an ingress firewall rule to allow network traffic from source VPC IP addresses and target the subnet-b IP addresses '" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "one_line = string.replace(\"\\n\", \" \")\n", + "one_line" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d91d618-c1bf-434c-890e-e5610782643b", + "metadata": {}, + "outputs": [], + "source": [ + "# Ch 12 Q2\n", + "# You are an IT Specialist at a technology company, and your Dataproc cluster runs in a single Virtual Private Cloud (VPC) network in a single subnetwork with range 172.16.20.128/25. The subnetwork runs out of private IP addresses. Your manager asks you to find a way to add new VMs for communication with the cluster while minimizing the steps involved. What should you do? A. Create a new subnetwork in the existing VPC with a range of 172.16.21.0/24 and configure the VMs to use that subnetwork. B. Create a new VPC network for the VMs with a subnet of 172.32.0.0/16. Enable VPC network Peering between the Dataproc VPC network and the VMs VPC network. Configure a custom Route exchange. C. Configure Shared VPC for the existing VPC and add the VMs to a new subnetwork in the Shared VPC. D. Modify the existing subnet range to 172.16.20.0/24" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/rag_pipeline.ipynb b/rag_pipeline.ipynb new file mode 100644 index 0000000..ac8bab1 --- /dev/null +++ b/rag_pipeline.ipynb @@ -0,0 +1,926 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "6647b014-6ef7-4553-bfd9-a17a92a2374c", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import re\n", + "import uuid\n", + "import ollama\n", + "import logging\n", + "import psycopg2\n", + "from psycopg2.extras import Json\n", + "from datetime import date, datetime\n", + "from bs4 import BeautifulSoup" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "53625000-e642-459e-b496-07da97095c08", + "metadata": {}, + "outputs": [], + "source": [ + "# Set up logging\n", + "logging.basicConfig(level=logging.INFO)\n", + "logger = logging.getLogger(__name__)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "47c7f0b4-32c0-4952-a49a-01ee0099a89d", + "metadata": {}, + "outputs": [], + "source": [ + "def read_file(filename):\n", + " content = ''\n", + " with open(filename, 'r') as f:\n", + " content = f.read()\n", + "\n", + " return content" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "9c5202b0-bd87-458e-b2c8-5dc94033271f", + "metadata": {}, + "outputs": [], + "source": [ + "BASE_PATH = './docs'" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "bbb37d78-eaa1-4959-b315-938d55949919", + "metadata": {}, + "outputs": [], + "source": [ + "def create_chunk(content_type, content, heading_stack, extra_metadata=None):\n", + " \"\"\"Helper to create consistent chunk structure.\n", + " Generated by ClaudeAI\"\"\"\n", + "\n", + " content = content\n", + "\n", + " if isinstance(content, list):\n", + " # Join the list first, then replace newlines\n", + " content 
= ' '.join(content).replace('\\n', ' ')\n", + " else:\n", + " # It's already a string\n", + " content = content.replace('\\n', ' ')\n", + " \n", + " chunk = {\n", + " 'content': content,\n", + " 'content_type': content_type,\n", + " 'heading_path': ' > '.join(h['text'] for h in heading_stack),\n", + " 'immediate_heading': heading_stack[-1]['text'] if heading_stack else None,\n", + " 'headings': [h['text'] for h in heading_stack],\n", + " }\n", + " \n", + " if extra_metadata:\n", + " chunk.update(extra_metadata)\n", + " \n", + " return chunk" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "9c5ba591-0ccf-4f96-9fc2-a50b994be065", + "metadata": {}, + "outputs": [], + "source": [ + "def process_list(list_element, heading_stack):\n", + " \"\"\"Process ul/ol lists as single chunks or individual items\"\"\"\n", + " list_type = 'ordered_list' if list_element.name == 'ol' else 'unordered_list'\n", + " \n", + " # Extract all list items\n", + " items = []\n", + " for li in list_element.find_all('li', recursive=False): # Only direct children\n", + " item_text = li.get_text().strip()\n", + " if item_text:\n", + " # Clean up bullets and numbering from the text\n", + " cleaned_text = clean_list_item_text(item_text)\n", + " if cleaned_text: # Only add if there's content after cleaning\n", + " items.append(cleaned_text)\n", + " \n", + " if not items:\n", + " return None\n", + " \n", + " # Strategy 1: Treat entire list as one chunk\n", + " if len(items) <= 10: # Reasonable threshold\n", + " content = format_list_content(items, list_type)\n", + " return create_chunk(list_type, content, heading_stack, {\n", + " 'item_count': len(items),\n", + " 'list_items': items\n", + " })\n", + " \n", + " # Strategy 2: Split long lists into multiple chunks\n", + " else:\n", + " chunks = []\n", + " chunk_size = 8\n", + " for i in range(0, len(items), chunk_size):\n", + " chunk_items = items[i:i + chunk_size]\n", + " content = format_list_content(chunk_items, list_type)\n", + " chunk = create_chunk(f'{list_type}_part', content, heading_stack, {\n", + " 'item_count': len(chunk_items),\n", + " 'list_items': chunk_items,\n", + " 'part_number': i // chunk_size + 1,\n", + " 'total_parts': (len(items) + chunk_size - 1) // chunk_size\n", + " })\n", + " chunks.append(chunk)\n", + " return chunks\n", + "\n", + "def clean_list_item_text(text):\n", + " \"\"\"Remove bullets, numbers, and other list markers from text\"\"\"\n", + " \n", + " # First, split on bullet points if multiple items are concatenated\n", + " # This handles cases where multiple list items got joined together\n", + " if '•' in text:\n", + " # Split on bullets and clean each part\n", + " parts = text.split('•')\n", + " cleaned_parts = []\n", + " for part in parts:\n", + " cleaned_part = clean_single_item(part.strip())\n", + " if cleaned_part:\n", + " cleaned_parts.append(cleaned_part)\n", + " \n", + " if len(cleaned_parts) > 1:\n", + " # Multiple items were concatenated, return them separated\n", + " return ' | '.join(cleaned_parts)\n", + " else:\n", + " # Single item, continue with normal cleaning\n", + " text = parts[0] if parts else text\n", + " \n", + " # Clean single item\n", + " return clean_single_item(text)\n", + "\n", + "def clean_single_item(text):\n", + " \"\"\"Clean a single list item\"\"\"\n", + " if not text:\n", + " return \"\"\n", + " \n", + " # Common bullet characters and patterns to remove\n", + " bullet_patterns = [\n", + " r'^[•·▪▫‣⁃◦▸▹►▻○●◉◎⦿⦾]\\s*', # Various bullet characters\n", + " r'^[-–—*+]\\s*', # Dash, asterisk, plus 
bullets\n", + " r'^\\d+[\\.\\)]\\s*', # Numbers with periods or parentheses\n", + " r'^[a-zA-Z][\\.\\)]\\s*', # Letters with periods or parentheses\n", + " r'^[ivxlcdm]+[\\.\\)]\\s*', # Roman numerals\n", + " r'^\\([a-zA-Z0-9]+\\)\\s*', # Parenthesized numbers/letters\n", + " r'^\\s*\\u2022\\s*', # Unicode bullet\n", + " r'^\\s*\\u25E6\\s*', # White bullet\n", + " r'^\\s*\\u25AA\\s*', # Black small square\n", + " r'^\\s*\\u25AB\\s*', # White small square\n", + " ]\n", + " \n", + " cleaned_text = text\n", + " for pattern in bullet_patterns:\n", + " cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE)\n", + " \n", + " # Remove extra whitespace\n", + " cleaned_text = re.sub(r'\\s+', ' ', cleaned_text).strip()\n", + " \n", + " return cleaned_text\n", + "\n", + "def format_list_content(items, list_type):\n", + " \"\"\"Format list items into readable content WITHOUT adding bullets\"\"\"\n", + " if list_type == 'ordered_list':\n", + " return '\\n'.join(f\"{i+1}. {item}\" for i, item in enumerate(items))\n", + " else:\n", + " # For unordered lists, just join with newlines or separators\n", + " # Don't add bullets since we want clean text\n", + " return '\\n'.join(items)\n", + " # Alternative: use a separator instead of newlines\n", + " # return ' | '.join(items)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "62d7feea-1c66-419d-8fb7-66dc75c8e9f7", + "metadata": {}, + "outputs": [], + "source": [ + "def process_table(table_element, heading_stack):\n", + " \"\"\"Process tables with different strategies based on size.\n", + " Generated by ClaudeAI\"\"\"\n", + " \n", + " # Extract table data\n", + " table_data = extract_table_data(table_element)\n", + " \n", + " if not table_data['rows']:\n", + " return None\n", + " \n", + " row_count = len(table_data['rows'])\n", + " col_count = len(table_data['headers']) if table_data['headers'] else len(table_data['rows'][0])\n", + " \n", + " # Strategy based on table size\n", + " if row_count <= 20 and col_count <= 6:\n", + " # Small table: treat as single chunk\n", + " content = format_table_content(table_data)\n", + " return create_chunk('table', content, heading_stack, {\n", + " 'row_count': row_count,\n", + " 'column_count': col_count,\n", + " 'headers': table_data['headers'],\n", + " 'table_caption': table_data['caption']\n", + " })\n", + " \n", + " else:\n", + " # Large table: split by rows\n", + " return split_large_table(table_data, heading_stack)\n", + "\n", + "def extract_table_data(table_element):\n", + " \"\"\"Extract structured data from table.\n", + " Generated by ClaudeAI\"\"\"\n", + " \n", + " # Get caption if present\n", + " caption_elem = table_element.find('caption')\n", + " caption = caption_elem.get_text().strip() if caption_elem else None\n", + " \n", + " # Extract headers\n", + " headers = []\n", + " header_row = table_element.find('thead')\n", + " if header_row:\n", + " for th in header_row.find_all(['th', 'td']):\n", + " headers.append(th.get_text().strip())\n", + " else:\n", + " # Try first row as headers\n", + " first_row = table_element.find('tr')\n", + " if first_row:\n", + " for cell in first_row.find_all(['th', 'td']):\n", + " headers.append(cell.get_text().strip())\n", + " \n", + " # Extract data rows\n", + " rows = []\n", + " tbody = table_element.find('tbody') or table_element\n", + " \n", + " for tr in tbody.find_all('tr')[1 if not table_element.find('thead') and headers else 0:]:\n", + " row = []\n", + " for cell in tr.find_all(['td', 'th']):\n", + " row.append(cell.get_text().strip())\n", 
+ " if row: # Skip empty rows\n", + " rows.append(row)\n", + " \n", + " return {\n", + " 'caption': caption,\n", + " 'headers': headers,\n", + " 'rows': rows\n", + " }\n", + "\n", + "def format_table_content(table_data):\n", + " \"\"\"Format table data into readable text\n", + " Generated by ClaudeAI\"\"\"\n", + " content_parts = []\n", + " \n", + " if table_data['caption']:\n", + " content_parts.append(f\"Table: {table_data['caption']}\")\n", + " \n", + " headers = table_data['headers']\n", + " rows = table_data['rows']\n", + " \n", + " if headers:\n", + " content_parts.append(\"Columns: \" + \" | \".join(headers))\n", + " \n", + " # Format rows\n", + " for i, row in enumerate(rows):\n", + " if headers and len(row) == len(headers):\n", + " # Create key-value pairs\n", + " row_content = []\n", + " for header, value in zip(headers, row):\n", + " if value: # Skip empty cells\n", + " row_content.append(f\"{header}: {value}\")\n", + " if row_content:\n", + " content_parts.append(f\"Row {i+1}: \" + \"; \".join(row_content))\n", + " else:\n", + " # Simple row format\n", + " content_parts.append(f\"Row {i+1}: \" + \" | \".join(row))\n", + " \n", + " return '\\n'.join(content_parts)\n", + "\n", + "def split_large_table(table_data, heading_stack):\n", + " \"\"\"Split large tables into smaller chunks\n", + " Generated By ClaudeAI\"\"\"\n", + " chunks = []\n", + " headers = table_data['headers']\n", + " rows = table_data['rows']\n", + " \n", + " chunk_size = 10 # Rows per chunk\n", + " total_chunks = (len(rows) + chunk_size - 1) // chunk_size\n", + " \n", + " for i in range(0, len(rows), chunk_size):\n", + " chunk_rows = rows[i:i + chunk_size]\n", + " \n", + " chunk_table_data = {\n", + " 'caption': table_data['caption'],\n", + " 'headers': headers,\n", + " 'rows': chunk_rows\n", + " }\n", + " \n", + " content = format_table_content(chunk_table_data)\n", + " \n", + " chunk = create_chunk('table_part', content, heading_stack, {\n", + " 'row_count': len(chunk_rows),\n", + " 'column_count': len(headers) if headers else len(chunk_rows[0]),\n", + " 'headers': headers,\n", + " 'table_caption': table_data['caption'],\n", + " 'part_number': i // chunk_size + 1,\n", + " 'total_parts': total_chunks,\n", + " 'row_range': f\"{i+1}-{min(i+chunk_size, len(rows))}\"\n", + " })\n", + " \n", + " chunks.append(chunk)\n", + " \n", + " return chunks" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "fb41a9bf-01d3-4931-80c7-f9d3f853d761", + "metadata": {}, + "outputs": [], + "source": [ + "def extract_content(html_content):\n", + " ret_value = {}\n", + " soup = BeautifulSoup(html_content, 'html.parser')\n", + "\n", + " og_url = soup.find('meta', property='og:url')\n", + " og_description = soup.find('meta', property=\"og:description\")\n", + " og_title = soup.find('meta', property=\"og:title\")\n", + " print(og_title)\n", + " title_content = og_title.get('content') if og_title else None\n", + " title = re.sub(r'[\\s\\xa0]*\\|[\\s\\xa0]*', ' | ', title_content) if title_content else None\n", + "\n", + " article_body = soup.find('div', class_='devsite-article-body')\n", + " if not article_body:\n", + " return {}\n", + "\n", + " footer = soup.find('devsite-content-footer')\n", + " # footer_paras = footer.find_all('p') if footer else None\n", + " # second_para = footer_paras[1] if len(footer_paras) > 1 else None\n", + " date_last_modified = date.today().strftime('%Y-%m-%d')\n", + " if footer:\n", + " footer_paras = footer.find_all('p')\n", + " for fp in footer_paras:\n", + " last_updated_re = r'Last updated 
(.*) UTC'\n", + " match = re.search(last_updated_re, fp.get_text())\n", + " if match:\n", + " date_last_modified = match.group(1)\n", + " break\n", + "\n", + " #\n", + " # Start ClaudeAI generated Code\n", + " #\n", + " chunks = []\n", + " heading_stack = []\n", + " \n", + " # Process elements that can be chunks or provide context\n", + " for element in article_body.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'table']):\n", + " if element.name.startswith('h'):\n", + " level = int(element.name[1])\n", + " heading_text = element.get_text().strip()\n", + " \n", + " heading_stack = [h for h in heading_stack if h['level'] < level]\n", + " heading_stack.append({'level': level, 'text': heading_text})\n", + " \n", + " elif element.name == 'p':\n", + " raw_content = element.get_text().strip()\n", + "\n", + " if isinstance(raw_content, list):\n", + " # Join the list first, then replace newlines\n", + " content = ' '.join(raw_content).replace('\\n', ' ')\n", + " else:\n", + " # It's already a string\n", + " content = raw_content.replace('\\n', ' ')\n", + " \n", + " if content and len(content) > 10:\n", + " chunk = create_chunk('paragraph', content, heading_stack)\n", + " chunks.append(chunk)\n", + " \n", + " elif element.name in ['ul', 'ol']:\n", + " list_chunk = process_list(element, heading_stack)\n", + " if list_chunk:\n", + " chunks.append(list_chunk)\n", + " \n", + " elif element.name == 'table':\n", + " table_chunk = process_table(element, heading_stack)\n", + " if table_chunk:\n", + " chunks.append(table_chunk)\n", + " #\n", + " # End ClaudeAI generated code\n", + " #\n", + "\n", + " ret_value['url'] = og_url.get('content') if og_url else None\n", + " ret_value['description'] = og_description.get('content') if og_description else None\n", + " ret_value['title'] = title\n", + " ret_value['date_last_modified'] = date_last_modified\n", + " ret_value['chunks'] = chunks\n", + " # ret_value['article'] = article_body\n", + "\n", + " return ret_value" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "816faecf-b764-46ce-ac2a-1cd1bf0bd9c4", + "metadata": {}, + "outputs": [], + "source": [ + "def prepare_enhanced_documents(html_content, additional_metadata=None):\n", + " extracted = extract_content(html_content)\n", + "\n", + " if not extracted:\n", + " return []\n", + " # extracted = extract_all_content_chunks(html_content)\n", + "\n", + " additional_metadata = {\n", + " \"date_last_modified\": extracted['date_last_modified']\n", + " }\n", + " \n", + " documents = []\n", + " chunk_counter = 0\n", + " \n", + " for chunk in extracted['chunks']:\n", + " # Handle cases where list/table processing returns multiple chunks\n", + " if isinstance(chunk, list):\n", + " for sub_chunk in chunk:\n", + " doc = create_document_from_chunk(sub_chunk, extracted['url'], extracted['title'], chunk_counter, additional_metadata)\n", + " documents.append(doc)\n", + " chunk_counter += 1\n", + " else:\n", + " doc = create_document_from_chunk(chunk, extracted['url'], extracted['title'], chunk_counter, additional_metadata)\n", + " documents.append(doc)\n", + " chunk_counter += 1\n", + " \n", + " return documents\n", + "\n", + "def create_document_from_chunk(chunk, url, title, index, additional_metadata):\n", + " \"\"\"Create document object from chunk\"\"\"\n", + " \n", + " # Create enhanced content for embedding\n", + " content_parts = []\n", + " \n", + " # Add document title\n", + " content_parts.append(title)\n", + " \n", + " # Add heading context\n", + " if chunk['heading_path']:\n", + " 
content_parts.append(f\"Section: {chunk['heading_path']}\")\n", + " \n", + " # Add content type context\n", + " content_type_labels = {\n", + " 'paragraph': '',\n", + " 'unordered_list': 'List:',\n", + " 'ordered_list': 'Numbered list:',\n", + " 'table': 'Table:',\n", + " 'table_part': 'Table data:',\n", + " 'unordered_list_part': 'List items:',\n", + " 'ordered_list_part': 'Numbered list items:'\n", + " }\n", + " \n", + " type_label = content_type_labels.get(chunk['content_type'], '')\n", + " if type_label:\n", + " content_parts.append(type_label)\n", + " \n", + " # Add main content\n", + " content_parts.append(chunk['content'])\n", + " \n", + " # Enhanced content for embedding\n", + " embedding_content = ' '.join(content_parts)\n", + " \n", + " # Base metadata\n", + " metadata = {\n", + " 'source_url': url,\n", + " 'document_title': title,\n", + " 'chunk_index': index,\n", + " 'content_type': chunk['content_type'],\n", + " 'heading_path': chunk['heading_path'],\n", + " 'immediate_heading': chunk['immediate_heading'],\n", + " 'all_headings': chunk['headings'],\n", + " 'processed_at': date.today().strftime('%Y-%m-%d'),\n", + " **(additional_metadata or {})\n", + " }\n", + " \n", + " # Add content-specific metadata\n", + " for key in ['item_count', 'list_items', 'row_count', 'column_count', 'headers', 'table_caption', 'part_number', 'total_parts', 'row_range']:\n", + " if key in chunk:\n", + " metadata[key] = chunk[key]\n", + " \n", + " doc = {\n", + " 'id': f\"{url}#chunk{index}\",\n", + " 'content': chunk['content'],\n", + " 'embedding_content': embedding_content,\n", + " 'metadata': metadata\n", + " }\n", + " \n", + " return doc" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "5fe9b756-8d89-481c-8ba1-4f4daa79135e", + "metadata": {}, + "outputs": [], + "source": [ + "def create_embedding(text):\n", + " \"\"\"Create an embedding vector for a single text\"\"\"\n", + " response = ollama.embeddings(\n", + " model='nomic-embed-text',\n", + " prompt=text\n", + " )\n", + " return response['embedding']" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "123af885-1da9-4089-a9c6-87b4d26d3e2d", + "metadata": {}, + "outputs": [], + "source": [ + "# content = read_file('./docs/run/cloud.google.com/run/docs/overview/what-is-cloud-run')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "02eedbde-e1a1-4b40-8ac2-e86ff50afa73", + "metadata": {}, + "outputs": [], + "source": [ + "folders_to_read = [\n", + " 'docs/run/cloud.google.com/run/docs',\n", + " 'docs/compute/cloud.google.com/compute/docs',\n", + " 'docs/iam/cloud.google.com/iam/docs'\n", + "]\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "3355eab2-0e54-4dbc-bb54-065cf165a9fa", + "metadata": {}, + "outputs": [], + "source": [ + "def is_html_file(file_path: str) -> bool:\n", + " \"\"\"Check if file is likely an HTML file based on content.\"\"\"\n", + " try:\n", + " with open(file_path, 'r', encoding='utf-8') as file:\n", + " first_line = file.readline().lower().strip()\n", + " # Check for common HTML indicators\n", + " return (first_line.startswith('\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST 
http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n" + ] + } + ], + "source": [ + "#records = process('docs/run/cloud.google.com/run/docs--')\n", + "# records = process('docs/iam/cloud.google.com/iam/docs')\n", + "# records = process('docs/compute/cloud.google.com/compute/docs')\n", + "# records = 
process('docs/storage/cloud.google.com/storage/docs')\n", + "# records = process('docs/iap/cloud.google.com/iap/docs')\n", + "# records = process('docs/bigquery/cloud.google.com/bigquery/docs')\n", + "# records = process('docs/apigee/cloud.google.com/apigee/docs')\n", + "records = process('docs/functions/cloud.google.com/functions/docs')\n", + "# records = process('docs/pubsub/cloud.google.com/pubsub/docs')\n", + "# records = process('docs/sql/cloud.google.com/sql/docs')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7a4fe99-d9db-46c6-bbfb-8d4d36446d1c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/rag_pipeline.py b/rag_pipeline.py new file mode 100644 index 0000000..1e35eb8 --- /dev/null +++ b/rag_pipeline.py @@ -0,0 +1,705 @@ +#!/usr/bin/env python +# coding: utf-8 + +import os +import re +import uuid +import ollama +import logging +import psycopg2 +from psycopg2.extras import Json +from datetime import date, datetime +from bs4 import BeautifulSoup + +# Set up logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def read_file(filename): + content = '' + with open(filename, 'r') as f: + content = f.read() + + return content + +BASE_PATH = './docs' + +def create_chunk(content_type, content, heading_stack, extra_metadata=None): + """Helper to create consistent chunk structure. 
+ Generated by ClaudeAI""" + + content = content + + if isinstance(content, list): + # Join the list first, then replace newlines + content = ' '.join(content).replace('\n', ' ') + else: + # It's already a string + content = content.replace('\n', ' ') + + chunk = { + 'content': content, + 'content_type': content_type, + 'heading_path': ' > '.join(h['text'] for h in heading_stack), + 'immediate_heading': heading_stack[-1]['text'] if heading_stack else None, + 'headings': [h['text'] for h in heading_stack], + } + + if extra_metadata: + chunk.update(extra_metadata) + + return chunk + +def process_list(list_element, heading_stack): + """Process ul/ol lists as single chunks or individual items""" + list_type = 'ordered_list' if list_element.name == 'ol' else 'unordered_list' + + # Extract all list items + items = [] + for li in list_element.find_all('li', recursive=False): # Only direct children + item_text = li.get_text().strip() + if item_text: + # Clean up bullets and numbering from the text + cleaned_text = clean_list_item_text(item_text) + if cleaned_text: # Only add if there's content after cleaning + items.append(cleaned_text) + + if not items: + return None + + # Strategy 1: Treat entire list as one chunk + if len(items) <= 10: # Reasonable threshold + content = format_list_content(items, list_type) + return create_chunk(list_type, content, heading_stack, { + 'item_count': len(items), + 'list_items': items + }) + + # Strategy 2: Split long lists into multiple chunks + else: + chunks = [] + chunk_size = 8 + for i in range(0, len(items), chunk_size): + chunk_items = items[i:i + chunk_size] + content = format_list_content(chunk_items, list_type) + chunk = create_chunk(f'{list_type}_part', content, heading_stack, { + 'item_count': len(chunk_items), + 'list_items': chunk_items, + 'part_number': i // chunk_size + 1, + 'total_parts': (len(items) + chunk_size - 1) // chunk_size + }) + chunks.append(chunk) + return chunks + +def clean_list_item_text(text): + """Remove bullets, numbers, and other list markers from text""" + + # First, split on bullet points if multiple items are concatenated + # This handles cases where multiple list items got joined together + if '•' in text: + # Split on bullets and clean each part + parts = text.split('•') + cleaned_parts = [] + for part in parts: + cleaned_part = clean_single_item(part.strip()) + if cleaned_part: + cleaned_parts.append(cleaned_part) + + if len(cleaned_parts) > 1: + # Multiple items were concatenated, return them separated + return ' | '.join(cleaned_parts) + else: + # Single item, continue with normal cleaning + text = parts[0] if parts else text + + # Clean single item + return clean_single_item(text) + +def clean_single_item(text): + """Clean a single list item""" + if not text: + return "" + + # Common bullet characters and patterns to remove + bullet_patterns = [ + r'^[•·▪▫‣⁃◦▸▹►▻○●◉◎⦿⦾]\s*', # Various bullet characters + r'^[-–—*+]\s*', # Dash, asterisk, plus bullets + r'^\d+[\.\)]\s*', # Numbers with periods or parentheses + r'^[a-zA-Z][\.\)]\s*', # Letters with periods or parentheses + r'^[ivxlcdm]+[\.\)]\s*', # Roman numerals + r'^\([a-zA-Z0-9]+\)\s*', # Parenthesized numbers/letters + r'^\s*\u2022\s*', # Unicode bullet + r'^\s*\u25E6\s*', # White bullet + r'^\s*\u25AA\s*', # Black small square + r'^\s*\u25AB\s*', # White small square + ] + + cleaned_text = text + for pattern in bullet_patterns: + cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE) + + # Remove extra whitespace + cleaned_text = re.sub(r'\s+', ' ', 
cleaned_text).strip() + + return cleaned_text + +def format_list_content(items, list_type): + """Format list items into readable content WITHOUT adding bullets""" + if list_type == 'ordered_list': + return '\n'.join(f"{i+1}. {item}" for i, item in enumerate(items)) + else: + # For unordered lists, just join with newlines or separators + # Don't add bullets since we want clean text + return '\n'.join(items) + # Alternative: use a separator instead of newlines + # return ' | '.join(items) + +def process_table(table_element, heading_stack): + """Process tables with different strategies based on size. + Generated by ClaudeAI""" + + # Extract table data + table_data = extract_table_data(table_element) + + if not table_data['rows']: + return None + + row_count = len(table_data['rows']) + col_count = len(table_data['headers']) if table_data['headers'] else len(table_data['rows'][0]) + + # Strategy based on table size + if row_count <= 20 and col_count <= 6: + # Small table: treat as single chunk + content = format_table_content(table_data) + return create_chunk('table', content, heading_stack, { + 'row_count': row_count, + 'column_count': col_count, + 'headers': table_data['headers'], + 'table_caption': table_data['caption'] + }) + + else: + # Large table: split by rows + return split_large_table(table_data, heading_stack) + +def extract_table_data(table_element): + """Extract structured data from table. + Generated by ClaudeAI""" + + # Get caption if present + caption_elem = table_element.find('caption') + caption = caption_elem.get_text().strip() if caption_elem else None + + # Extract headers + headers = [] + header_row = table_element.find('thead') + if header_row: + for th in header_row.find_all(['th', 'td']): + headers.append(th.get_text().strip()) + else: + # Try first row as headers + first_row = table_element.find('tr') + if first_row: + for cell in first_row.find_all(['th', 'td']): + headers.append(cell.get_text().strip()) + + # Extract data rows + rows = [] + tbody = table_element.find('tbody') or table_element + + for tr in tbody.find_all('tr')[1 if not table_element.find('thead') and headers else 0:]: + row = [] + for cell in tr.find_all(['td', 'th']): + row.append(cell.get_text().strip()) + if row: # Skip empty rows + rows.append(row) + + return { + 'caption': caption, + 'headers': headers, + 'rows': rows + } + +def format_table_content(table_data): + """Format table data into readable text + Generated by ClaudeAI""" + content_parts = [] + + if table_data['caption']: + content_parts.append(f"Table: {table_data['caption']}") + + headers = table_data['headers'] + rows = table_data['rows'] + + if headers: + content_parts.append("Columns: " + " | ".join(headers)) + + # Format rows + for i, row in enumerate(rows): + if headers and len(row) == len(headers): + # Create key-value pairs + row_content = [] + for header, value in zip(headers, row): + if value: # Skip empty cells + row_content.append(f"{header}: {value}") + if row_content: + content_parts.append(f"Row {i+1}: " + "; ".join(row_content)) + else: + # Simple row format + content_parts.append(f"Row {i+1}: " + " | ".join(row)) + + return '\n'.join(content_parts) + +def split_large_table(table_data, heading_stack): + """Split large tables into smaller chunks + Generated By ClaudeAI""" + chunks = [] + headers = table_data['headers'] + rows = table_data['rows'] + + chunk_size = 10 # Rows per chunk + total_chunks = (len(rows) + chunk_size - 1) // chunk_size + + for i in range(0, len(rows), chunk_size): + chunk_rows = rows[i:i + 
chunk_size] + + chunk_table_data = { + 'caption': table_data['caption'], + 'headers': headers, + 'rows': chunk_rows + } + + content = format_table_content(chunk_table_data) + + chunk = create_chunk('table_part', content, heading_stack, { + 'row_count': len(chunk_rows), + 'column_count': len(headers) if headers else len(chunk_rows[0]), + 'headers': headers, + 'table_caption': table_data['caption'], + 'part_number': i // chunk_size + 1, + 'total_parts': total_chunks, + 'row_range': f"{i+1}-{min(i+chunk_size, len(rows))}" + }) + + chunks.append(chunk) + + return chunks + + +def extract_content(html_content): + ret_value = {} + soup = BeautifulSoup(html_content, 'html.parser') + + og_url = soup.find('meta', property='og:url') + og_description = soup.find('meta', property="og:description") + og_title = soup.find('meta', property="og:title") + print(og_title) + title_content = og_title.get('content') if og_title else None + title = re.sub(r'[\s\xa0]*\|[\s\xa0]*', ' | ', title_content) if title_content else None + + article_body = soup.find('div', class_='devsite-article-body') + if not article_body: + return {} + + footer = soup.find('devsite-content-footer') + # footer_paras = footer.find_all('p') if footer else None + # second_para = footer_paras[1] if len(footer_paras) > 1 else None + date_last_modified = date.today().strftime('%Y-%m-%d') + if footer: + footer_paras = footer.find_all('p') + for fp in footer_paras: + last_updated_re = r'Last updated (.*) UTC' + match = re.search(last_updated_re, fp.get_text()) + if match: + date_last_modified = match.group(1) + break + + # + # Start ClaudeAI generated Code + # + chunks = [] + heading_stack = [] + + # Process elements that can be chunks or provide context + for element in article_body.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'table']): + if element.name.startswith('h'): + level = int(element.name[1]) + heading_text = element.get_text().strip() + + heading_stack = [h for h in heading_stack if h['level'] < level] + heading_stack.append({'level': level, 'text': heading_text}) + + elif element.name == 'p': + raw_content = element.get_text().strip() + + if isinstance(raw_content, list): + # Join the list first, then replace newlines + content = ' '.join(raw_content).replace('\n', ' ') + else: + # It's already a string + content = raw_content.replace('\n', ' ') + + if content and len(content) > 10: + chunk = create_chunk('paragraph', content, heading_stack) + chunks.append(chunk) + + elif element.name in ['ul', 'ol']: + list_chunk = process_list(element, heading_stack) + if list_chunk: + chunks.append(list_chunk) + + elif element.name == 'table': + table_chunk = process_table(element, heading_stack) + if table_chunk: + chunks.append(table_chunk) + # + # End ClaudeAI generated code + # + + ret_value['url'] = og_url.get('content') if og_url else None + ret_value['description'] = og_description.get('content') if og_description else None + ret_value['title'] = title + ret_value['date_last_modified'] = date_last_modified + ret_value['chunks'] = chunks + # ret_value['article'] = article_body + + return ret_value + +def prepare_enhanced_documents(html_content, additional_metadata=None): + extracted = extract_content(html_content) + + if not extracted: + return [] + # extracted = extract_all_content_chunks(html_content) + + additional_metadata = { + "date_last_modified": extracted['date_last_modified'] + } + + documents = [] + chunk_counter = 0 + + for chunk in extracted['chunks']: + # Handle cases where list/table processing returns 
multiple chunks + if isinstance(chunk, list): + for sub_chunk in chunk: + doc = create_document_from_chunk(sub_chunk, extracted['url'], extracted['title'], chunk_counter, additional_metadata) + documents.append(doc) + chunk_counter += 1 + else: + doc = create_document_from_chunk(chunk, extracted['url'], extracted['title'], chunk_counter, additional_metadata) + documents.append(doc) + chunk_counter += 1 + + return documents + +def create_document_from_chunk(chunk, url, title, index, additional_metadata): + """Create document object from chunk""" + + # Create enhanced content for embedding + content_parts = [] + + # Add document title + content_parts.append(title) + + # Add heading context + if chunk['heading_path']: + content_parts.append(f"Section: {chunk['heading_path']}") + + # Add content type context + content_type_labels = { + 'paragraph': '', + 'unordered_list': 'List:', + 'ordered_list': 'Numbered list:', + 'table': 'Table:', + 'table_part': 'Table data:', + 'unordered_list_part': 'List items:', + 'ordered_list_part': 'Numbered list items:' + } + + type_label = content_type_labels.get(chunk['content_type'], '') + if type_label: + content_parts.append(type_label) + + # Add main content + content_parts.append(chunk['content']) + + # Enhanced content for embedding + embedding_content = ' '.join(content_parts) + + # Base metadata + metadata = { + 'source_url': url, + 'document_title': title, + 'chunk_index': index, + 'content_type': chunk['content_type'], + 'heading_path': chunk['heading_path'], + 'immediate_heading': chunk['immediate_heading'], + 'all_headings': chunk['headings'], + 'processed_at': date.today().strftime('%Y-%m-%d'), + **(additional_metadata or {}) + } + + # Add content-specific metadata + for key in ['item_count', 'list_items', 'row_count', 'column_count', 'headers', 'table_caption', 'part_number', 'total_parts', 'row_range']: + if key in chunk: + metadata[key] = chunk[key] + + doc = { + 'id': f"{url}#chunk{index}", + 'content': chunk['content'], + 'embedding_content': embedding_content, + 'metadata': metadata + } + + return doc + +def create_embedding(text): + """Create an embedding vector for a single text""" + response = ollama.embeddings( + model='nomic-embed-text', + prompt=text + ) + return response['embedding'] + +# content = read_file('./docs/run/cloud.google.com/run/docs/overview/what-is-cloud-run') + +folders_to_read = [ + 'docs/run/cloud.google.com/run/docs', + 'docs/compute/cloud.google.com/compute/docs', + 'docs/iam/cloud.google.com/iam/docs' +] + +def is_html_file(file_path: str) -> bool: + """Check if file is likely an HTML file based on content.""" + try: + with open(file_path, 'r', encoding='utf-8') as file: + first_line = file.readline().lower().strip() + # Check for common HTML indicators + return (first_line.startswith('
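
The diff cuts off before the code that writes the prepared chunks into Postgres and queries them, but the tables in docker/schema.sql, the compose service settings, and the psycopg2/Json/ollama imports make the intended flow clear. The sketch below is illustrative only and not part of the commit: it assumes the connection values exposed by docker/compose.yaml (localhost:15432, database gcp_docs, user admin, password password), reuses the create_embedding helper defined in rag_pipeline.py above, and the function names store_documents and search_chunks are hypothetical.

# Hypothetical sketch of the loading and retrieval steps that the truncated
# diff does not show; connection values come from docker/compose.yaml and the
# table layout from docker/schema.sql. Not the repository's actual code.
import psycopg2
from psycopg2.extras import Json

DB_SETTINGS = dict(host='localhost', port=15432, dbname='gcp_docs',
                   user='admin', password='password')

def store_documents(docs):
    """Persist documents produced by prepare_enhanced_documents()."""
    with psycopg2.connect(**DB_SETTINGS) as conn, conn.cursor() as cur:
        for doc in docs:
            meta = doc['metadata']
            # Upsert the parent row; source_url is UNIQUE in the documents table.
            cur.execute(
                "INSERT INTO documents (source_url, title, date_last_modified, metadata) "
                "VALUES (%s, %s, %s, %s) "
                "ON CONFLICT (source_url) DO UPDATE SET title = EXCLUDED.title "
                "RETURNING id",
                (meta['source_url'], meta['document_title'],
                 meta.get('date_last_modified'), Json(meta)))
            document_id = cur.fetchone()[0]
            # Embed the enriched text with the create_embedding helper above and
            # store it; the untyped vector column accepts a '[x, y, ...]' literal.
            embedding = create_embedding(doc['embedding_content'])
            cur.execute(
                "INSERT INTO chunks (document_id, chunk_index, content, embedding) "
                "VALUES (%s, %s, %s, %s::vector)",
                (document_id, str(meta['chunk_index']), doc['content'], str(embedding)))

def search_chunks(question, limit=5):
    """Return the chunks closest to the question by cosine distance (<=>)."""
    query_embedding = create_embedding(question)
    with psycopg2.connect(**DB_SETTINGS) as conn, conn.cursor() as cur:
        cur.execute(
            "SELECT d.title, d.source_url, c.content, "
            "       c.embedding <=> %s::vector AS distance "
            "FROM chunks c JOIN documents d ON d.id = c.document_id "
            "ORDER BY distance LIMIT %s",
            (str(query_embedding), limit))
        return cur.fetchall()

Since nomic-embed-text produces 768-dimensional vectors, declaring the embedding columns as vector(768) in schema.sql would also allow an HNSW index to be added (as ORIG-schema.sql attempts), because pgvector indexes require the column to have a fixed dimension.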