gcp_docs/quiz_pdf.ipynb at 89aee1748f8084e1928ba5150a68d9813cbf73cf

Fork: 0
clewis / gcp_docs
Find file
Newer
Older
gcp_docs / quiz_pdf.ipynb
clewis 16 days ago 5 KB start of extracting question and answers from pdf for eval
Raw Blame History
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "a91a0116-3eca-4a85-b211-2e55361f653e",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pypdf import PdfReader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "269ece5f-6de7-4ca5-87a0-436396a041ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "pdf_path = \"/home/clewis/hdd1tb/docs/gcp/Aldovelio_Castremonte_1000_Practice_Questions_to_Master_the_GCP_Google_Cloud_Certified_Associate_Cloud_Engineer_Exam-ipgdou.pdf\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "d77bfee7-3890-447f-8403-79ef7e4d2a43",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of pages: 1039\n",
      "33\n"
     ]
    }
   ],
   "source": [
    "# Zero-based page indices, used as the half-open range [first, last), of the\n",
    "# pages that hold the solutions to the first 50 questions.\n",
    "# NOTE(review): the previous comment equated \"page 19\" with zero index 20, which\n",
    "# is inconsistent -- confirm which page numbering these offsets refer to.\n",
    "last_page_first_50_questions = 20  # not used in this cell yet\n",
    "first_page_first_50_answers = 20\n",
    "last_page_first_50_answers = 53\n",
    "\n",
    "# Marker text identifying a line that states the answer to a question.\n",
    "answer_phrase = \"Solution to Question\"\n",
    "\n",
    "try:\n",
    "    reader = PdfReader(pdf_path)\n",
    "except FileNotFoundError:\n",
    "    # Report the missing path, then re-raise: swallowing the error here would let\n",
    "    # later cells run against an undefined (or stale) `answers`.\n",
    "    print(f\"Error: The file '{pdf_path}' was not found.\")\n",
    "    raise\n",
    "\n",
    "print(f\"Number of pages: {len(reader.pages)}\")\n",
    "\n",
    "# For every page in the answer range, keep only the extracted text lines that\n",
    "# contain the answer marker. Any other failure (bad PDF, index out of range)\n",
    "# is deliberately left to propagate so the notebook stops at the real cause.\n",
    "pages = []\n",
    "page_answers = []\n",
    "for page_number in range(first_page_first_50_answers, last_page_first_50_answers):\n",
    "    page = reader.pages[page_number]\n",
    "    pages.append(page)\n",
    "    page_answers.append(\n",
    "        [line for line in page.extract_text().split(\"\\n\") if answer_phrase in line]\n",
    "    )\n",
    "\n",
    "# Number of pages scanned for answers.\n",
    "print(len(pages))\n",
    "\n",
    "# Flatten the per-page lists into a single list of answer lines, in page order.\n",
    "answers = [answer for answers_on_page in page_answers for answer in answers_on_page]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "c73cdb8f-808f-4968-894c-f89a52f4df5a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Solution to Question 1: D',\n",
       " 'Solution to Question 2: B',\n",
       " 'Solution to Question 3: C',\n",
       " 'Solution to Question 4: D',\n",
       " 'Solution to Question 5: C',\n",
       " 'Solution to Question 6: C',\n",
       " 'Solution to Question 7: C',\n",
       " 'Solution to Question 8: D',\n",
       " 'Solution to Question 9: D',\n",
       " 'Solution to Question 10: C',\n",
       " 'Solution to Question 11: B',\n",
       " 'Solution to Question 12: C',\n",
       " 'Solution to Question 13: D',\n",
       " 'Solution to Question 14: A',\n",
       " 'Solution to Question 15: A',\n",
       " 'Solution to Question 16: D',\n",
       " 'Solution to Question 17: D',\n",
       " 'Solution to Question 18: D',\n",
       " 'Solution to Question 19: A',\n",
       " 'Solution to Question 20: D',\n",
       " 'Solution to Question 21: C',\n",
       " 'Solution to Question 22: C',\n",
       " 'Solution to Question 23: B',\n",
       " 'Solution to Question 24: A',\n",
       " 'Solution to Question 25: D',\n",
       " 'Solution to Question 26: A',\n",
       " 'Solution to Question 27: C',\n",
       " 'Solution to Question 28: D',\n",
       " 'Solution to Question 29: D',\n",
       " 'Solution to Question 30: A',\n",
       " 'Solution to Question 31: D',\n",
       " 'Solution to Question 32: A',\n",
       " 'Solution to Question 33: D',\n",
       " 'Solution to Question 34: B',\n",
       " 'Solution to Question 35: D',\n",
       " 'Solution to Question 36: B',\n",
       " 'Solution to Question 37: A',\n",
       " 'Solution to Question 38: D',\n",
       " 'Solution to Question 39: D',\n",
       " 'Solution to Question 40: C',\n",
       " 'Solution to Question 41: B',\n",
       " 'Solution to Question 42: A',\n",
       " 'Solution to Question 43: C',\n",
       " 'Solution to Question 44: C',\n",
       " 'Solution to Question 45: D',\n",
       " 'Solution to Question 46: C',\n",
       " 'Solution to Question 47: D',\n",
       " 'Solution to Question 48: B',\n",
       " 'Solution to Question 49: C',\n",
       " 'Solution to Question 50: A']"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "answers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7cf9bf24-dbfa-437f-9625-1f5d6571394a",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}