{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "6647b014-6ef7-4553-bfd9-a17a92a2374c", "metadata": {}, "outputs": [], "source": [
 "import os\n",
 "import re\n",
 "import uuid\n",
 "import ollama\n",
 "import logging\n",
 "import psycopg2\n",
 "from psycopg2.extras import Json\n",
 "from datetime import date\n",
 "from bs4 import BeautifulSoup"
] }, { "cell_type": "code", "execution_count": 2, "id": "53625000-e642-459e-b496-07da97095c08", "metadata": {}, "outputs": [], "source": [
 "# Set up logging\n",
 "logging.basicConfig(level=logging.INFO)\n",
 "logger = logging.getLogger(__name__)"
] }, { "cell_type": "code", "execution_count": 3, "id": "47c7f0b4-32c0-4952-a49a-01ee0099a89d", "metadata": {}, "outputs": [], "source": [
 "def read_file(filename):\n",
 "    with open(filename, 'r', encoding='utf-8') as f:\n",
 "        return f.read()"
] }, { "cell_type": "code", "execution_count": 4, "id": "9c5202b0-bd87-458e-b2c8-5dc94033271f", "metadata": {}, "outputs": [], "source": [
 "BASE_PATH = './docs'"
] }, { "cell_type": "code", "execution_count": 5, "id": "bbb37d78-eaa1-4959-b315-938d55949919", "metadata": {}, "outputs": [], "source": [
 "def create_chunk(content_type, content, heading_stack, extra_metadata=None):\n",
 "    \"\"\"Helper to create consistent chunk structure.\n",
 "    Generated by ClaudeAI\"\"\"\n",
 "\n",
 "    if isinstance(content, list):\n",
 "        # Join the list first, then replace newlines\n",
 "        content = ' '.join(content).replace('\\n', ' ')\n",
 "    else:\n",
 "        # It's already a string\n",
 "        content = content.replace('\\n', ' ')\n",
 "\n",
 "    chunk = {\n",
 "        'content': content,\n",
 "        'content_type': content_type,\n",
 "        'heading_path': ' > '.join(h['text'] for h in heading_stack),\n",
 "        'immediate_heading': heading_stack[-1]['text'] if heading_stack else None,\n",
 "        'headings': [h['text'] for h in heading_stack],\n",
 "    }\n",
 "\n",
 "    if extra_metadata:\n",
 "        chunk.update(extra_metadata)\n",
 "\n",
 "    return chunk"
] },
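{ "cell_type": "markdown", "id": "1de4a7b2-8c3f-4b5a-9e2d-7f6a1c0b3d4e", "metadata": {}, "source": [
 "A quick sanity check of `create_chunk` in isolation. The heading stack below is invented for illustration; it shows how `heading_path` and `immediate_heading` are derived and how embedded newlines are collapsed."
] }, { "cell_type": "code", "execution_count": null, "id": "2ab5c8d3-9d4e-4c6b-8f3e-0a7b2d1c4e5f", "metadata": {}, "outputs": [], "source": [
 "sample_stack = [\n",
 "    {'level': 1, 'text': 'Overview'},\n",
 "    {'level': 2, 'text': 'Pricing'},\n",
 "]\n",
 "# Expect newlines collapsed, heading_path 'Overview > Pricing',\n",
 "# immediate_heading 'Pricing'\n",
 "create_chunk('paragraph', 'Billing is per\\nrequest.', sample_stack)"
] },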
{ "cell_type": "code", "execution_count": 6, "id": "9c5ba591-0ccf-4f96-9fc2-a50b994be065", "metadata": {}, "outputs": [], "source": [
 "def process_list(list_element, heading_stack):\n",
 "    \"\"\"Process ul/ol lists as single chunks or individual items\"\"\"\n",
 "    list_type = 'ordered_list' if list_element.name == 'ol' else 'unordered_list'\n",
 "\n",
 "    # Extract all list items\n",
 "    items = []\n",
 "    for li in list_element.find_all('li', recursive=False):  # Only direct children\n",
 "        item_text = li.get_text().strip()\n",
 "        if item_text:\n",
 "            # Clean up bullets and numbering from the text\n",
 "            cleaned_text = clean_list_item_text(item_text)\n",
 "            if cleaned_text:  # Only add if there's content after cleaning\n",
 "                items.append(cleaned_text)\n",
 "\n",
 "    if not items:\n",
 "        return None\n",
 "\n",
 "    # Strategy 1: Treat entire list as one chunk\n",
 "    if len(items) <= 10:  # Reasonable threshold\n",
 "        content = format_list_content(items, list_type)\n",
 "        return create_chunk(list_type, content, heading_stack, {\n",
 "            'item_count': len(items),\n",
 "            'list_items': items\n",
 "        })\n",
 "\n",
 "    # Strategy 2: Split long lists into multiple chunks\n",
 "    else:\n",
 "        chunks = []\n",
 "        chunk_size = 8\n",
 "        for i in range(0, len(items), chunk_size):\n",
 "            chunk_items = items[i:i + chunk_size]\n",
 "            content = format_list_content(chunk_items, list_type)\n",
 "            chunk = create_chunk(f'{list_type}_part', content, heading_stack, {\n",
 "                'item_count': len(chunk_items),\n",
 "                'list_items': chunk_items,\n",
 "                'part_number': i // chunk_size + 1,\n",
 "                'total_parts': (len(items) + chunk_size - 1) // chunk_size\n",
 "            })\n",
 "            chunks.append(chunk)\n",
 "        return chunks\n",
 "\n",
 "def clean_list_item_text(text):\n",
 "    \"\"\"Remove bullets, numbers, and other list markers from text\"\"\"\n",
 "\n",
 "    # First, split on bullet characters in case multiple list items\n",
 "    # got concatenated into one string.\n",
 "    if '•' in text:\n",
 "        parts = [clean_single_item(part.strip()) for part in text.split('•')]\n",
 "        parts = [part for part in parts if part]\n",
 "\n",
 "        if len(parts) > 1:\n",
 "            # Multiple items were concatenated; return them separated\n",
 "            return ' | '.join(parts)\n",
 "        # A single leading bullet leaves an empty prefix before the split,\n",
 "        # so return the one surviving item rather than the empty prefix.\n",
 "        return parts[0] if parts else ''\n",
 "\n",
 "    # Clean single item\n",
 "    return clean_single_item(text)\n",
 "\n",
 "def clean_single_item(text):\n",
 "    \"\"\"Clean a single list item\"\"\"\n",
 "    if not text:\n",
 "        return \"\"\n",
 "\n",
 "    # Common bullet characters and patterns to remove\n",
 "    bullet_patterns = [\n",
 "        r'^[•·▪▫‣⁃◦▸▹►▻○●◉◎⦿⦾]\\s*',  # Various bullet characters\n",
 "        r'^[-–—*+]\\s*',  # Dash, asterisk, plus bullets\n",
 "        r'^\\d+[\\.\\)]\\s*',  # Numbers with periods or parentheses\n",
 "        r'^[a-zA-Z][\\.\\)]\\s*',  # Letters with periods or parentheses\n",
 "        r'^[ivxlcdm]+[\\.\\)]\\s*',  # Roman numerals\n",
 "        r'^\\([a-zA-Z0-9]+\\)\\s*',  # Parenthesized numbers/letters\n",
 "    ]\n",
 "\n",
 "    cleaned_text = text\n",
 "    for pattern in bullet_patterns:\n",
 "        cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE)\n",
 "\n",
 "    # Collapse extra whitespace\n",
 "    cleaned_text = re.sub(r'\\s+', ' ', cleaned_text).strip()\n",
 "\n",
 "    return cleaned_text\n",
 "\n",
 "def format_list_content(items, list_type):\n",
 "    \"\"\"Format list items into readable content WITHOUT adding bullets\"\"\"\n",
 "    if list_type == 'ordered_list':\n",
 "        return '\\n'.join(f\"{i+1}. {item}\" for i, item in enumerate(items))\n",
 "    else:\n",
 "        # For unordered lists, just join with newlines; don't re-add bullets\n",
 "        # since we want clean text. (Alternative: ' | '.join(items).)\n",
 "        return '\\n'.join(items)"
] },
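{ "cell_type": "markdown", "id": "4cd6e9f0-1a2b-4c3d-8e5f-6a7b8c9d0e1f", "metadata": {}, "source": [
 "A few spot checks of the list-item cleaning on invented inputs, including the concatenated-bullets case:"
] }, { "cell_type": "code", "execution_count": null, "id": "5de7f0a1-2b3c-4d4e-9f60-7a8b9c0d1e2f", "metadata": {}, "outputs": [], "source": [
 "print(clean_list_item_text('1. Enable the API'))         # Enable the API\n",
 "print(clean_list_item_text('• Grant the role'))          # Grant the role\n",
 "print(clean_list_item_text('First step • Second step'))  # First step | Second step"
] },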
{ "cell_type": "code", "execution_count": 7, "id": "62d7feea-1c66-419d-8fb7-66dc75c8e9f7", "metadata": {}, "outputs": [], "source": [
 "def process_table(table_element, heading_stack):\n",
 "    \"\"\"Process tables with different strategies based on size.\n",
 "    Generated by ClaudeAI\"\"\"\n",
 "\n",
 "    # Extract table data\n",
 "    table_data = extract_table_data(table_element)\n",
 "\n",
 "    if not table_data['rows']:\n",
 "        return None\n",
 "\n",
 "    row_count = len(table_data['rows'])\n",
 "    col_count = len(table_data['headers']) if table_data['headers'] else len(table_data['rows'][0])\n",
 "\n",
 "    # Strategy based on table size\n",
 "    if row_count <= 20 and col_count <= 6:\n",
 "        # Small table: treat as single chunk\n",
 "        content = format_table_content(table_data)\n",
 "        return create_chunk('table', content, heading_stack, {\n",
 "            'row_count': row_count,\n",
 "            'column_count': col_count,\n",
 "            'headers': table_data['headers'],\n",
 "            'table_caption': table_data['caption']\n",
 "        })\n",
 "\n",
 "    else:\n",
 "        # Large table: split by rows\n",
 "        return split_large_table(table_data, heading_stack)\n",
 "\n",
 "def extract_table_data(table_element):\n",
 "    \"\"\"Extract structured data from table.\n",
 "    Generated by ClaudeAI\"\"\"\n",
 "\n",
 "    # Get caption if present\n",
 "    caption_elem = table_element.find('caption')\n",
 "    caption = caption_elem.get_text().strip() if caption_elem else None\n",
 "\n",
 "    # Extract headers\n",
 "    headers = []\n",
 "    header_row = table_element.find('thead')\n",
 "    if header_row:\n",
 "        for th in header_row.find_all(['th', 'td']):\n",
 "            headers.append(th.get_text().strip())\n",
 "    else:\n",
 "        # Fall back to the first row as headers\n",
 "        first_row = table_element.find('tr')\n",
 "        if first_row:\n",
 "            for cell in first_row.find_all(['th', 'td']):\n",
 "                headers.append(cell.get_text().strip())\n",
 "\n",
 "    # Extract data rows\n",
 "    rows = []\n",
 "    tbody = table_element.find('tbody') or table_element\n",
 "\n",
 "    # If headers came from the first data row (no <thead>), skip that row\n",
 "    skip_first = 1 if headers and not table_element.find('thead') else 0\n",
 "    for tr in tbody.find_all('tr')[skip_first:]:\n",
 "        row = [cell.get_text().strip() for cell in tr.find_all(['td', 'th'])]\n",
 "        if row:  # Skip empty rows\n",
 "            rows.append(row)\n",
 "\n",
 "    return {\n",
 "        'caption': caption,\n",
 "        'headers': headers,\n",
 "        'rows': rows\n",
 "    }\n",
 "\n",
 "def format_table_content(table_data):\n",
 "    \"\"\"Format table data into readable text.\n",
 "    Generated by ClaudeAI\"\"\"\n",
 "    content_parts = []\n",
 "\n",
 "    if table_data['caption']:\n",
 "        content_parts.append(f\"Table: {table_data['caption']}\")\n",
 "\n",
 "    headers = table_data['headers']\n",
 "    rows = table_data['rows']\n",
 "\n",
 "    if headers:\n",
 "        content_parts.append(\"Columns: \" + \" | \".join(headers))\n",
 "\n",
 "    # Format rows\n",
 "    for i, row in enumerate(rows):\n",
 "        if headers and len(row) == len(headers):\n",
 "            # Create key-value pairs\n",
 "            row_content = []\n",
 "            for header, value in zip(headers, row):\n",
 "                if value:  # Skip empty cells\n",
 "                    row_content.append(f\"{header}: {value}\")\n",
 "            if row_content:\n",
 "                content_parts.append(f\"Row {i+1}: \" + \"; \".join(row_content))\n",
 "        else:\n",
 "            # Simple row format\n",
 "            content_parts.append(f\"Row {i+1}: \" + \" | \".join(row))\n",
 "\n",
 "    return '\\n'.join(content_parts)\n",
 "\n",
 "def split_large_table(table_data, heading_stack):\n",
 "    \"\"\"Split large tables into smaller chunks.\n",
 "    Generated by ClaudeAI\"\"\"\n",
 "    chunks = []\n",
 "    headers = table_data['headers']\n",
 "    rows = table_data['rows']\n",
 "\n",
 "    chunk_size = 10  # Rows per chunk\n",
 "    total_chunks = (len(rows) + chunk_size - 1) // chunk_size\n",
 "\n",
 "    for i in range(0, len(rows), chunk_size):\n",
 "        chunk_rows = rows[i:i + chunk_size]\n",
 "\n",
 "        chunk_table_data = {\n",
 "            'caption': table_data['caption'],\n",
 "            'headers': headers,\n",
 "            'rows': chunk_rows\n",
 "        }\n",
 "\n",
 "        content = format_table_content(chunk_table_data)\n",
 "\n",
 "        chunk = create_chunk('table_part', content, heading_stack, {\n",
 "            'row_count': len(chunk_rows),\n",
 "            'column_count': len(headers) if headers else len(chunk_rows[0]),\n",
 "            'headers': headers,\n",
 "            'table_caption': table_data['caption'],\n",
 "            'part_number': i // chunk_size + 1,\n",
 "            'total_parts': total_chunks,\n",
 "            'row_range': f\"{i+1}-{min(i+chunk_size, len(rows))}\"\n",
 "        })\n",
 "\n",
 "        chunks.append(chunk)\n",
 "\n",
 "    return chunks"
] }, { "cell_type": "code", "execution_count": 8, "id": "fb41a9bf-01d3-4931-80c7-f9d3f853d761", "metadata": {}, "outputs": [], "source": [
 "def extract_content(html_content):\n",
 "    ret_value = {}\n",
 "    soup = BeautifulSoup(html_content, 'html.parser')\n",
 "\n",
 "    og_url = soup.find('meta', property='og:url')\n",
 "    og_description = soup.find('meta', property=\"og:description\")\n",
 "    og_title = soup.find('meta', property=\"og:title\")\n",
 "    title_content = og_title.get('content') if og_title else None\n",
 "    title = re.sub(r'[\\s\\xa0]*\\|[\\s\\xa0]*', ' | ', title_content) if title_content else None\n",
 "\n",
 "    article_body = soup.find('div', class_='devsite-article-body')\n",
 "    if not article_body:\n",
 "        return {}\n",
 "\n",
 "    # Pull the 'Last updated ... UTC' date from the footer; default to today\n",
 "    footer = soup.find('devsite-content-footer')\n",
 "    date_last_modified = date.today().strftime('%Y-%m-%d')\n",
 "    if footer:\n",
 "        for fp in footer.find_all('p'):\n",
 "            match = re.search(r'Last updated (.*) UTC', fp.get_text())\n",
 "            if match:\n",
 "                date_last_modified = match.group(1)\n",
 "                break\n",
 "\n",
 "    #\n",
 "    # Start ClaudeAI generated Code\n",
 "    #\n",
 "    chunks = []\n",
 "    heading_stack = []\n",
 "\n",
 "    # Process elements that can be chunks or provide context\n",
 "    for element in article_body.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'table']):\n",
 "        if element.name.startswith('h'):\n",
 "            level = int(element.name[1])\n",
 "            heading_text = element.get_text().strip()\n",
 "\n",
 "            # Drop headings at the same or deeper level, then push this one\n",
 "            heading_stack = [h for h in heading_stack if h['level'] < level]\n",
 "            heading_stack.append({'level': level, 'text': heading_text})\n",
 "\n",
 "        elif element.name == 'p':\n",
 "            content = element.get_text().strip().replace('\\n', ' ')\n",
 "\n",
 "            if content and len(content) > 10:\n",
 "                chunk = create_chunk('paragraph', content, heading_stack)\n",
 "                chunks.append(chunk)\n",
 "\n",
 "        elif element.name in ['ul', 'ol']:\n",
 "            list_chunk = process_list(element, heading_stack)\n",
 "            if list_chunk:\n",
 "                chunks.append(list_chunk)\n",
 "\n",
 "        elif element.name == 'table':\n",
 "            table_chunk = process_table(element, heading_stack)\n",
 "            if table_chunk:\n",
 "                chunks.append(table_chunk)\n",
 "    #\n",
 "    # End ClaudeAI generated code\n",
 "    #\n",
 "\n",
 "    ret_value['url'] = og_url.get('content') if og_url else None\n",
 "    ret_value['description'] = og_description.get('content') if og_description else None\n",
 "    ret_value['title'] = title\n",
 "    ret_value['date_last_modified'] = date_last_modified\n",
 "    ret_value['chunks'] = chunks\n",
 "\n",
 "    return ret_value"
] },
{ "cell_type": "code", "execution_count": 9, "id": "816faecf-b764-46ce-ac2a-1cd1bf0bd9c4", "metadata": {}, "outputs": [], "source": [
 "def prepare_enhanced_documents(html_content, additional_metadata=None):\n",
 "    extracted = extract_content(html_content)\n",
 "\n",
 "    if not extracted:\n",
 "        return []\n",
 "\n",
 "    # Merge caller-supplied metadata instead of discarding it\n",
 "    additional_metadata = {\n",
 "        **(additional_metadata or {}),\n",
 "        \"date_last_modified\": extracted['date_last_modified']\n",
 "    }\n",
 "\n",
 "    documents = []\n",
 "    chunk_counter = 0\n",
 "\n",
 "    for chunk in extracted['chunks']:\n",
 "        # Handle cases where list/table processing returns multiple chunks\n",
 "        if isinstance(chunk, list):\n",
 "            for sub_chunk in chunk:\n",
 "                doc = create_document_from_chunk(sub_chunk, extracted['url'], extracted['title'], chunk_counter, additional_metadata)\n",
 "                documents.append(doc)\n",
 "                chunk_counter += 1\n",
 "        else:\n",
 "            doc = create_document_from_chunk(chunk, extracted['url'], extracted['title'], chunk_counter, additional_metadata)\n",
 "            documents.append(doc)\n",
 "            chunk_counter += 1\n",
 "\n",
 "    return documents\n",
 "\n",
 "def create_document_from_chunk(chunk, url, title, index, additional_metadata):\n",
 "    \"\"\"Create document object from chunk\"\"\"\n",
 "\n",
 "    # Create enhanced content for embedding\n",
 "    content_parts = []\n",
 "\n",
 "    # Add document title\n",
 "    content_parts.append(title)\n",
 "\n",
 "    # Add heading context\n",
 "    if chunk['heading_path']:\n",
 "        content_parts.append(f\"Section: {chunk['heading_path']}\")\n",
 "\n",
 "    # Add content type context\n",
 "    content_type_labels = {\n",
 "        'paragraph': '',\n",
 "        'unordered_list': 'List:',\n",
 "        'ordered_list': 'Numbered list:',\n",
 "        'table': 'Table:',\n",
 "        'table_part': 'Table data:',\n",
 "        'unordered_list_part': 'List items:',\n",
 "        'ordered_list_part': 'Numbered list items:'\n",
 "    }\n",
 "\n",
 "    type_label = content_type_labels.get(chunk['content_type'], '')\n",
 "    if type_label:\n",
 "        content_parts.append(type_label)\n",
 "\n",
 "    # Add main content\n",
 "    content_parts.append(chunk['content'])\n",
 "\n",
 "    # Enhanced content for embedding\n",
 "    embedding_content = ' '.join(content_parts)\n",
 "\n",
 "    # Base metadata\n",
 "    metadata = {\n",
 "        'source_url': url,\n",
 "        'document_title': title,\n",
 "        'chunk_index': index,\n",
 "        'content_type': chunk['content_type'],\n",
 "        'heading_path': chunk['heading_path'],\n",
 "        'immediate_heading': chunk['immediate_heading'],\n",
 "        'all_headings': chunk['headings'],\n",
 "        'processed_at': date.today().strftime('%Y-%m-%d'),\n",
 "        **(additional_metadata or {})\n",
 "    }\n",
 "\n",
 "    # Add content-specific metadata\n",
 "    for key in ['item_count', 'list_items', 'row_count', 'column_count', 'headers', 'table_caption', 'part_number', 'total_parts', 'row_range']:\n",
 "        if key in chunk:\n",
 "            metadata[key] = chunk[key]\n",
 "\n",
 "    doc = {\n",
 "        'id': f\"{url}#chunk{index}\",\n",
 "        'content': chunk['content'],\n",
 "        'embedding_content': embedding_content,\n",
 "        'metadata': metadata\n",
 "    }\n",
 "\n",
 "    return doc"
] },
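{ "cell_type": "markdown", "id": "7ef90a1b-3c4d-4e5f-a071-8b9c0d1e2f30", "metadata": {}, "source": [
 "A minimal end-to-end check of the extraction pipeline on a hand-built page that mimics the devsite layout the parser expects. The HTML below is invented for illustration and covers a paragraph, a list, and a table:"
] }, { "cell_type": "code", "execution_count": null, "id": "8f0a1b2c-4d5e-4f60-b182-9c0d1e2f3a41", "metadata": {}, "outputs": [], "source": [
 "sample_html = \"\"\"\n",
 "<html><head>\n",
 "<meta property=\"og:url\" content=\"https://example.com/doc\"/>\n",
 "<meta property=\"og:title\" content=\"Sample Doc | Example\"/>\n",
 "</head><body>\n",
 "<div class=\"devsite-article-body\">\n",
 "<h1>Overview</h1>\n",
 "<p>This paragraph is long enough to become a chunk.</p>\n",
 "<ul><li>First item</li><li>Second item</li></ul>\n",
 "<table><tr><th>Name</th><th>vCPUs</th></tr><tr><td>e2-micro</td><td>2</td></tr></table>\n",
 "</div>\n",
 "</body></html>\n",
 "\"\"\"\n",
 "\n",
 "extracted = extract_content(sample_html)\n",
 "[(c['content_type'], c['content']) for c in extracted['chunks']]"
] }, { "cell_type": "markdown", "id": "90a1b2c3-5e6f-4071-a293-0d1e2f3a4b52", "metadata": {}, "source": [
 "And the same page through `prepare_enhanced_documents`, to see the enhanced text that would actually be embedded (document title and `Section:` heading context prepended to the chunk):"
] }, { "cell_type": "code", "execution_count": null, "id": "a1b2c3d4-6f70-4182-b3a4-1e2f3a4b5c63", "metadata": {}, "outputs": [], "source": [
 "docs = prepare_enhanced_documents(sample_html)\n",
 "docs[0]['embedding_content']"
] },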
{ "cell_type": "code", "execution_count": 10, "id": "5fe9b756-8d89-481c-8ba1-4f4daa79135e", "metadata": {}, "outputs": [], "source": [
 "def create_embedding(text):\n",
 "    \"\"\"Create an embedding vector for a single text\"\"\"\n",
 "    response = ollama.embeddings(\n",
 "        model='nomic-embed-text',\n",
 "        prompt=text\n",
 "    )\n",
 "    return response['embedding']"
] },
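{ "cell_type": "markdown", "id": "b2c3d4e5-7081-4293-a4b5-2f3a4b5c6d74", "metadata": {}, "source": [
 "A smoke test for the embedding call, left commented out because it assumes a local Ollama server with the `nomic-embed-text` model already pulled (that model produces 768-dimensional vectors):"
] }, { "cell_type": "code", "execution_count": null, "id": "c3d4e5f6-8192-43a4-b5c6-3a4b5c6d7e85", "metadata": {}, "outputs": [], "source": [
 "# Requires a running Ollama server (default http://127.0.0.1:11434)\n",
 "# with `ollama pull nomic-embed-text` done beforehand.\n",
 "# vec = create_embedding('Cloud Run is a managed compute platform.')\n",
 "# len(vec)  # expected: 768"
] },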
{ "cell_type": "code", "execution_count": 11, "id": "123af885-1da9-4089-a9c6-87b4d26d3e2d", "metadata": {}, "outputs": [], "source": [
 "# content = read_file('./docs/run/cloud.google.com/run/docs/overview/what-is-cloud-run')"
] }, { "cell_type": "code", "execution_count": 12, "id": "02eedbde-e1a1-4b40-8ac2-e86ff50afa73", "metadata": {}, "outputs": [], "source": [
 "folders_to_read = [\n",
 "    'docs/run/cloud.google.com/run/docs',\n",
 "    'docs/compute/cloud.google.com/compute/docs',\n",
 "    'docs/iam/cloud.google.com/iam/docs'\n",
 "]"
] }, { "cell_type": "code", "execution_count": 13, "id": "3355eab2-0e54-4dbc-bb54-065cf165a9fa", "metadata": {}, "outputs": [], "source": [
 "def is_html_file(file_path: str) -> bool:\n",
 "    \"\"\"Check if file is likely an HTML file based on content.\"\"\"\n",
 "    try:\n",
 "        with open(file_path, 'r', encoding='utf-8') as file:\n",
 "            first_line = file.readline().lower().strip()\n",
 "            # Check for common HTML indicators\n",
 "            return (first_line.startswith('<!doctype html') or\n",
 "                    first_line.startswith('<html') or\n",
 "                    '<html' in first_line)\n",
 "    except (OSError, UnicodeDecodeError):\n",
 "        return False"
] }, { "cell_type": "code", "execution_count": 15, "id": "272d8cf0-5eab-494b-ad72-9f081aa36586", "metadata": {}, "outputs": [], "source": [
 "def insert_to_database(conn, records):\n",
 "    \"\"\"Insert records into the documents and chunks tables.\"\"\"\n",
 "    try:\n",
 "        cursor = conn.cursor()\n",
 "\n",
 "        # Group records by source_url to handle documents\n",
 "        documents_by_url = {}\n",
 "        for record in records:\n",
 "            source_url = record['source_url']\n",
 "            if source_url not in documents_by_url:\n",
 "                documents_by_url[source_url] = {\n",
 "                    'title': record['title'],\n",
 "                    'source_url': source_url,\n",
 "                    'date_last_modified': record['date_last_modified'],\n",
 "                    'metadata': record['metadata'],\n",
 "                    'chunks': []\n",
 "                }\n",
 "            documents_by_url[source_url]['chunks'].append(record)\n",
 "\n",
 "        # Upsert documents (conflict handling for re-processed pages)\n",
 "        document_insert_query = \"\"\"\n",
 "            INSERT INTO documents (source_url, title, date_last_modified, metadata)\n",
 "            VALUES (%s, %s, %s, %s)\n",
 "            ON CONFLICT (source_url) DO UPDATE SET\n",
 "                title = EXCLUDED.title,\n",
 "                date_last_modified = EXCLUDED.date_last_modified,\n",
 "                metadata = EXCLUDED.metadata\n",
 "            RETURNING id\n",
 "        \"\"\"\n",
 "\n",
 "        # Insert chunks\n",
 "        chunk_insert_query = \"\"\"\n",
 "            INSERT INTO chunks (document_id, chunk_index, content, embedding)\n",
 "            VALUES (%s, %s, %s, %s)\n",
 "        \"\"\"\n",
 "\n",
 "        # Look up document IDs for existing documents\n",
 "        get_document_id_query = \"\"\"\n",
 "            SELECT id FROM documents WHERE source_url = %s\n",
 "        \"\"\"\n",
 "\n",
 "        total_chunks_inserted = 0\n",
 "\n",
 "        for source_url, doc_data in documents_by_url.items():\n",
 "            # Upsert inside a savepoint: a failed statement aborts the whole\n",
 "            # transaction, so roll back to the savepoint before the fallback\n",
 "            # SELECT can run.\n",
 "            cursor.execute(\"SAVEPOINT doc_upsert\")\n",
 "            try:\n",
 "                cursor.execute(document_insert_query, (\n",
 "                    doc_data['source_url'],\n",
 "                    doc_data['title'],\n",
 "                    doc_data['date_last_modified'],\n",
 "                    Json(doc_data['metadata'])\n",
 "                ))\n",
 "                document_id = cursor.fetchone()[0]\n",
 "            except Exception as e:\n",
 "                cursor.execute(\"ROLLBACK TO SAVEPOINT doc_upsert\")\n",
 "                cursor.execute(get_document_id_query, (source_url,))\n",
 "                result = cursor.fetchone()\n",
 "                if result:\n",
 "                    document_id = result[0]\n",
 "                else:\n",
 "                    logger.error(f\"Failed to insert or find document for URL {source_url}: {str(e)}\")\n",
 "                    continue\n",
 "\n",
 "            # Insert chunks for this document\n",
 "            for chunk in doc_data['chunks']:\n",
 "                cursor.execute(chunk_insert_query, (\n",
 "                    document_id,\n",
 "                    chunk['chunk_index'],\n",
 "                    chunk['content'],\n",
 "                    chunk['embedding']\n",
 "                ))\n",
 "                total_chunks_inserted += 1\n",
 "\n",
 "        conn.commit()\n",
 "        logger.info(f\"Inserted {len(documents_by_url)} documents and {total_chunks_inserted} chunks into database\")\n",
 "\n",
 "    except Exception as e:\n",
 "        logger.error(f\"Database insertion error: {str(e)}\")\n",
 "        conn.rollback()\n",
 "        raise\n",
 "    finally:\n",
 "        cursor.close()"
] },
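{ "cell_type": "markdown", "id": "f6a7b8c9-b4c5-46d7-a8f9-6d7e8f9a0bb8", "metadata": {}, "source": [
 "The INSERT statements above imply a schema roughly like the one below. This DDL is reconstructed from the queries, not taken from the actual database; in particular `vector(768)` assumes the pgvector extension and 768-dimensional `nomic-embed-text` embeddings."
] }, { "cell_type": "code", "execution_count": null, "id": "a7b8c9d0-c5d6-47e8-b9a0-7e8f9a0b1cc9", "metadata": {}, "outputs": [], "source": [
 "# Hypothetical DDL matching insert_to_database; adjust types to your setup.\n",
 "ASSUMED_SCHEMA = \"\"\"\n",
 "CREATE EXTENSION IF NOT EXISTS vector;\n",
 "\n",
 "CREATE TABLE IF NOT EXISTS documents (\n",
 "    id                 SERIAL PRIMARY KEY,\n",
 "    source_url         TEXT UNIQUE NOT NULL,\n",
 "    title              TEXT,\n",
 "    date_last_modified TEXT,\n",
 "    metadata           JSONB\n",
 ");\n",
 "\n",
 "CREATE TABLE IF NOT EXISTS chunks (\n",
 "    id          SERIAL PRIMARY KEY,\n",
 "    document_id INTEGER REFERENCES documents(id),\n",
 "    chunk_index INTEGER,\n",
 "    content     TEXT,\n",
 "    embedding   vector(768)\n",
 ");\n",
 "\"\"\"\n",
 "# with psycopg2.connect(host='localhost', port=15432, database='gcp_docs',\n",
 "#                       user='admin', password='password') as conn:\n",
 "#     with conn.cursor() as cur:\n",
 "#         cur.execute(ASSUMED_SCHEMA)"
] },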
{ "cell_type": "code", "execution_count": 16, "id": "b67e5cb3-fa63-4429-a791-902ef2fa4356", "metadata": {}, "outputs": [], "source": [
 "def process(folder_path):\n",
 "\n",
 "    try:\n",
 "        # Connect to database\n",
 "        conn = psycopg2.connect(\n",
 "            host=\"localhost\",      # Database server host\n",
 "            port=15432,            # Port number (default: 5432)\n",
 "            database=\"gcp_docs\",   # Database name\n",
 "            user=\"admin\",          # Username\n",
 "            password=\"password\"    # Password\n",
 "        )\n",
 "        logger.info(\"Connected to database successfully\")\n",
 "\n",
 "        # Track processing statistics\n",
 "        total_files = 0\n",
 "        total_chunks = 0\n",
 "        records = []\n",
 "\n",
 "        # Walk through all directories and files\n",
 "        for root, dirs, files in os.walk(folder_path):\n",
 "            logger.info(f\"Processing directory: {root}\")\n",
 "            for filename in files:\n",
 "                file_path = os.path.join(root, filename)\n",
 "\n",
 "                # Skip files that don't look like HTML\n",
 "                if not is_html_file(file_path):\n",
 "                    logger.warning(f\"Skipping non-HTML file: {file_path}\")\n",
 "                    continue\n",
 "\n",
 "                html_page = read_file(file_path)\n",
 "                logger.info(f\"Processing file: {file_path}\")\n",
 "\n",
 "                # Extract content from HTML\n",
 "                document = prepare_enhanced_documents(html_page)\n",
 "\n",
 "                if not document:\n",
 "                    logger.warning(f\"No content extracted from {file_path}\")\n",
 "                    continue\n",
 "\n",
 "                records = []\n",
 "                for i, chunk in enumerate(document):\n",
 "                    embedding = create_embedding(chunk['embedding_content'])\n",
 "                    record = {\n",
 "                        'id': str(uuid.uuid4()),\n",
 "                        'title': chunk['metadata']['document_title'],\n",
 "                        'chunk_index': chunk['metadata']['chunk_index'],\n",
 "                        'content': chunk['content'],\n",
 "                        'source_url': chunk['metadata']['source_url'],\n",
 "                        'date_last_modified': chunk['metadata']['date_last_modified'],\n",
 "                        'metadata': {\n",
 "                            **chunk['metadata'],\n",
 "                            'chunk_number': i,\n",
 "                            'total_chunks': len(document),\n",
 "                            'chunk_size': len(chunk['embedding_content'])\n",
 "                        },\n",
 "                        'embedding': embedding\n",
 "                    }\n",
 "                    records.append(record)\n",
 "\n",
 "        # Insert records into database\n",
 "                if records:\n",
 "                    insert_to_database(conn, records)\n",
 "                    total_files += 1\n",
 "                    total_chunks += len(records)\n",
 "\n",
 "        logger.info(f\"Processing complete. Processed {total_files} files, created {total_chunks} chunks\")\n",
 "        return records  # records from the last file processed, for inspection\n",
 "\n",
 "    except Exception as e:\n",
 "        logger.error(f\"Error during processing: {str(e)}\")\n",
 "        raise\n",
 "    finally:\n",
 "        if 'conn' in locals():\n",
 "            conn.close()\n",
 "            logger.info(\"Database connection closed\")"
] },
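{ "cell_type": "markdown", "id": "d4e5f6a7-92a3-44b5-a6d7-4b5c6d7e8f96", "metadata": {}, "source": [
 "To ingest every configured product in one pass, the `folders_to_read` list defined earlier (currently unused) can drive the loop. Left commented out since it assumes all folders exist locally:"
] }, { "cell_type": "code", "execution_count": null, "id": "e5f6a7b8-a3b4-45c6-a7e8-5c6d7e8f9aa7", "metadata": {}, "outputs": [], "source": [
 "# for folder in folders_to_read:\n",
 "#     records = process(folder)  # each call opens/closes its own connection"
] },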
{ "cell_type": "code", "execution_count": 17, "id": "b125fdf1-838f-4516-885d-004342ec0be7", "metadata": {}, "outputs": [], "source": [
 "# html_file = read_file('docs/run/cloud.google.com/run/docs/monitoring-overview')\n",
 "# prepare_enhanced_documents(html_file)"
] }, { "cell_type": "code", "execution_count": null, "id": "ed63e442-ede2-48f0-a378-884110a250f1", "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [
 "INFO:__main__:Connected to database successfully\n",
 "INFO:__main__:Processing directory: docs/functions/cloud.google.com/functions/docs\n",
 "INFO:__main__:Processing file: docs/functions/cloud.google.com/functions/docs/create-deploy-http-ruby\n"
] }, { "name": "stderr", "output_type": "stream", "text": [
 "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
 "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n",
 "INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings \"HTTP/1.1 200 OK\"\n"
] } ], "source": [
 "#records = process('docs/run/cloud.google.com/run/docs--')\n",
 "# records = process('docs/iam/cloud.google.com/iam/docs')\n",
 "# records = process('docs/compute/cloud.google.com/compute/docs')\n",
 "# records = process('docs/storage/cloud.google.com/storage/docs')\n",
 "# records = process('docs/iap/cloud.google.com/iap/docs')\n",
 "# records = process('docs/bigquery/cloud.google.com/bigquery/docs')\n",
 "# records = process('docs/apigee/cloud.google.com/apigee/docs')\n",
 "records = process('docs/functions/cloud.google.com/functions/docs')\n",
 "# records = process('docs/pubsub/cloud.google.com/pubsub/docs')\n",
 "# records = process('docs/sql/cloud.google.com/sql/docs')"
] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.2" } }, "nbformat": 4, "nbformat_minor": 5 }