gcp_docs_scrape/sample.ipynb at b5f4e904deb986e26e2fef769f7d57e84fa73450

Fork: 0
clewis / gcp_docs_scrape
Find file
Newer
Older
gcp_docs_scrape / sample.ipynb
moreserverless on 25 May 10 KB Initial commit.
Raw Blame History
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "771cb720-1a7c-4c63-aeba-503ee926fce9",
   "metadata": {},
   "outputs": [],
   "source": [
    "from openai import OpenAI\n",
    "import gradio as gr\n",
    "import requests\n",
    "import json\n",
    "from typing import List\n",
    "from bs4 import BeautifulSoup\n",
    "from IPython.display import Markdown, display, update_display"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f67efb4a-efc3-483e-84aa-60a31b888305",
   "metadata": {},
   "source": [
    "The next 2 code blocks from [Ollama website](https://ollama.com/blog/openai-compatibility)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "1d8b2464-352f-4df9-8ea9-796d1f886ddd",
   "metadata": {},
   "outputs": [],
   "source": [
    "from openai import OpenAI\n",
    "\n",
    "client = OpenAI(\n",
    "    base_url = 'http://localhost:11434/v1',\n",
    "    api_key='ollama', # required, but unused\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "d629773a-c8cd-4ea9-a4c7-17e3d52f3415",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The 2020 World Series was played between the Los Angeles Dodgers and the Tampa Bay Rays at Globe Life Field, as well as Rogers Centre due to COVID-19 restrictions in Texas\n"
     ]
    }
   ],
   "source": [
    "response = client.chat.completions.create(\n",
    "  model=\"llama3.2\",\n",
    "  messages=[\n",
    "    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
    "    {\"role\": \"user\", \"content\": \"Who won the world series in 2020?\"},\n",
    "    {\"role\": \"assistant\", \"content\": \"The LA Dodgers won in 2020.\"},\n",
    "    {\"role\": \"user\", \"content\": \"Where was it played?\"}\n",
    "  ]\n",
    ")\n",
    "\n",
    "print(response.choices[0].message.content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c7d1eea8-cf7a-4f26-891f-f649e3ab6f03",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_chat_response(client, model, user_message, system_message=\"You are a helpful assistant.\"):\n",
    "    response = client.chat.completions.create(\n",
    "      model=model,\n",
    "      messages=[\n",
    "        {\"role\": \"system\", \"content\": system_message},\n",
    "        {\"role\": \"user\", \"content\": user_message}\n",
    "      ]\n",
    "    )\n",
    "\n",
    "    return response.choices[0].message.content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "744ae925-fd10-48f7-ba5f-28b9c2467582",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "I'm not aware of the current date, as I'm a text-based AI model and do not have real-time access to the current date or time. My training data only goes up until December 2023, but I don't know what day of the week it is or what month it currently is.\n",
      "\n",
      "If you need to know the current date or time, I recommend checking your device's clock or searching online for \"current date\" or \"current time.\"\n"
     ]
    }
   ],
   "source": [
    "print(get_chat_response(client, \"llama3.2\", \"What is today's date?\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "70e1608f-976c-4257-9a66-bb4cc19dec93",
   "metadata": {},
   "outputs": [],
   "source": [
    "#print(get_chat_response(client, \"gemma3\", \"What is your cutoff date?\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "9f9ca64a-c43d-4f7d-8fea-87a8d0a38835",
   "metadata": {},
   "outputs": [],
   "source": [
    "#print(get_chat_response(client, \"gemma3\", \"Are you able to generate images?\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "4b7f8415-783b-4ebd-abec-852443abd87c",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "#\n",
    "# from Ed Donner's github repo for LLM Engineering course on Udemy: \n",
    "# https://github.com/ed-donner/llm_engineering/blob/main/week1/day5.ipynb\n",
    "#\n",
    "headers = {\n",
    " \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
    "}\n",
    "\n",
    "class Website:\n",
    "    \"\"\"\n",
    "    A utility class to represent a Website that we have scraped, now with links\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, url):\n",
    "        self.url = url\n",
    "        response = requests.get(url, headers=headers)\n",
    "        self.body = response.content\n",
    "        soup = BeautifulSoup(self.body, 'html.parser')\n",
    "        self.title = soup.title.string if soup.title else \"No title found\"\n",
    "        if soup.body:\n",
    "            for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
    "                irrelevant.decompose()\n",
    "            self.text = soup.body.get_text(separator=\"\\n\", strip=True)\n",
    "        else:\n",
    "            self.text = \"\"\n",
    "        links = [link.get('href') for link in soup.find_all('a')]\n",
    "        self.links = [link for link in links if link]\n",
    "\n",
    "    def get_contents(self):\n",
    "        return f\"Webpage Title:\\n{self.title}\\nWebpage Contents:\\n{self.text}\\n\\n\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "badd4eac-8b1c-4068-ba46-01ef60948c59",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of links: 194\n"
     ]
    }
   ],
   "source": [
    "eventarc_website = Website(\"https://cloud.google.com/eventarc/docs\")\n",
    "eventarc_links = eventarc_website.links\n",
    "print('Number of links: {}'.format(len(eventarc_links)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "6b0b2f4c-027f-48aa-875a-24939a5120c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "system_message = \"You are an helpful assistant and an expert in determining if links are related to \"\n",
    "system_message += \"Google Cloud eventarcs.  Decide which links are relevant to the Google Cloud Professional \"\n",
    "system_message += \"Developer certification and return only those links.  respond with the full https URL in JSON format. \"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "f9f9b6d5-04cd-465f-a16a-6b8757ab7ec9",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_links_user_prompt(website):\n",
    "    user_prompt = f\"Here is the list of links on the website of {website.url} - \"\n",
    "    user_prompt += \"please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \\\n",
    "Do not include Terms of Service, Privacy, email links.\\n\"\n",
    "    user_prompt += \"Links (some might be relative links):\\n\"\n",
    "    user_prompt += \"\\n\".join(website.links)\n",
    "    return user_prompt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "6ceccbf7-156e-40bd-ac85-a84125c850bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_links(client, url, model, user_prompt, system_prompt):\n",
    "    website = Website(url)\n",
    "    response = client.chat.completions.create(\n",
    "        model=model,\n",
    "        messages=[\n",
    "            {\"role\": \"system\", \"content\": system_prompt},\n",
    "            {\"role\": \"user\", \"content\": user_prompt}\n",
    "      ],\n",
    "        response_format={\"type\": \"json_object\"}\n",
    "    )\n",
    "    result = response.choices[0].message.content\n",
    "    return json.loads(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "99fd93c8-6443-4b98-9e1f-5f1b24e92940",
   "metadata": {},
   "outputs": [],
   "source": [
    "user_prompt = get_links_user_prompt(eventarc_website)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "a733e5cc-0e22-47e0-84eb-0d6a4bbbf4f4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'links': ['https://cloud.google.com/eventarc/docs', 'https://cloud.google.com/eventarcStandard/docs/overview', 'https://cloud.google.com/eventarcAdvanced/docs/overview', 'https://cloud.google.com/eventarcStandard/docs/apis', 'https://cloud.google.com/eventarcStandard/docs/samples', 'https://cloud.google.com/eventarcaAdvanced/docs/resources', 'https://www.apache.org/licenses/LICENSE-2.0', 'https://github.com/googlecloudPlatform']}\n"
     ]
    }
   ],
   "source": [
    "llama32_relevant_links = get_links(client, eventarc_website.url, \"llama3.2\", user_prompt, system_message)\n",
    "print(llama32_relevant_links)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "423642fe-fb0e-4c8a-96bc-11e6416048dd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of relevant links found by llama3.2: 8\n"
     ]
    }
   ],
   "source": [
    "print(\"Number of relevant links found by llama3.2: {}\".format(len(llama32_relevant_links['links'])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "16511679-7b47-451e-b469-4ca9b23d3576",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{}\n"
     ]
    }
   ],
   "source": [
    "gemma3_relevant_links = get_links(client, eventarc_website.url, \"gemma3\", user_prompt, system_message)\n",
    "print(gemma3_relevant_links)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "66db5fe2-6a51-48d0-80d0-85abb6afec21",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}