gcp_docs/quiz_pdf.ipynb at main

Fork: 0
clewis / gcp_docs
Find file
Newer
Older
gcp_docs / quiz_pdf.ipynb
clewis 7 days ago 8 KB quiz_pdf reads Q and A pdf, extracts questions and answers for exam 1 into separate csv files
Raw Blame History
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "a91a0116-3eca-4a85-b211-2e55361f653e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import csv\n",
    "from pypdf import PdfReader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "269ece5f-6de7-4ca5-87a0-436396a041ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Location of the practice-exam PDF that the cells below open with PdfReader.\n",
    "pdf_path = (\n",
    "    \"/home/clewis/hdd1tb/docs/gcp/\"\n",
    "    \"Aldovelio_Castremonte_1000_Practice_Questions_to_Master_the_GCP_\"\n",
    "    \"Google_Cloud_Certified_Associate_Cloud_Engineer_Exam-ipgdou.pdf\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "211c2ab2-ff37-47d7-a9d3-410105027d7d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_answers(pdf_reader, first_page_num, last_page_num):\n",
    "    \"\"\"Collect the answer lines found on a range of PDF pages.\n",
    "\n",
    "    Each page index in range(first_page_num, last_page_num) is converted to\n",
    "    text and split on newlines; every line containing the phrase\n",
    "    \"Solution to Question\" is kept.\n",
    "\n",
    "    Args:\n",
    "        pdf_reader: Object with an indexable ``pages`` sequence whose items\n",
    "            provide ``extract_text()`` (the notebook passes a pypdf PdfReader).\n",
    "        first_page_num: Index of the first page to scan (inclusive).\n",
    "        last_page_num: Index at which scanning stops (exclusive).\n",
    "\n",
    "    Returns:\n",
    "        A flat list of the matching lines in page order. If reading any page\n",
    "        fails, the error is printed and an empty list is returned.\n",
    "    \"\"\"\n",
    "    answer_phrase = \"Solution to Question\"\n",
    "    answers = []\n",
    "    try:\n",
    "        for page_number in range(first_page_num, last_page_num):\n",
    "            # Read from the reader that was passed in, not a notebook global.\n",
    "            page_text = pdf_reader.pages[page_number].extract_text() or \"\"\n",
    "            answers.extend(\n",
    "                line for line in page_text.split(\"\\n\") if answer_phrase in line\n",
    "            )\n",
    "    except Exception as e:\n",
    "        # Best-effort: report the problem and discard any partial result.\n",
    "        # Only Exception is caught, so KeyboardInterrupt still propagates.\n",
    "        print(f\"An error occurred: {e}\")\n",
    "        return []\n",
    "    return answers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "d77bfee7-3890-447f-8403-79ef7e4d2a43",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# pages = []\n",
    "# try:\n",
    "#     # Create a PdfReader object\n",
    "#     reader = PdfReader(pdf_path)\n",
    "\n",
    "#     # Initialize an empty string to store extracted text\n",
    "#     extracted_text = \"\"\n",
    "\n",
    "#     # Last page of the first 50 questions = page 19, thus zero index = 20\n",
    "#     last_page_first_50_questions = 20\n",
    "#     first_page_first_50_answers = 20\n",
    "#     last_page_first_50_answers = 53\n",
    "#     answer_phrase = \"Solution to Question\"\n",
    "# # \n",
    "#     # Loop through each page and extract text\n",
    "#     print(f\"Number of pages: {len(reader.pages)}\")\n",
    "#     # pages = []\n",
    "#     page_answers = []\n",
    "#     for page_number in range(first_page_first_50_answers, last_page_first_50_answers):\n",
    "#         page = reader.pages[page_number]\n",
    "#         pages.append(page)\n",
    "        \n",
    "#         page_answers.append([answer for answer in page.extract_text().split(\"\\n\") if answer_phrase in answer])\n",
    "#         # extracted_text += page.extract_text() + \"\\n\"  # Add a newline for readability\n",
    "\n",
    "#     # Print or process the extracted text\n",
    "#     print(len(pages))\n",
    "#     answers = [item for sublist in page_answers for item in sublist]\n",
    "\n",
    "# except FileNotFoundError:\n",
    "#     print(f\"Error: The file '{pdf_path}' was not found.\")\n",
    "# except Exception as e:\n",
    "#     print(f\"An error occurred: {e}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "7cf9bf24-dbfa-437f-9625-1f5d6571394a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "33\n"
     ]
    }
   ],
   "source": [
    "reader = PdfReader(pdf_path)\n",
    "\n",
    "# Page range handed to get_answers for the exam 1 answers.\n",
    "first_page_num_of_exam_1_answers = 20\n",
    "last_page_num_of_exam_1_answers = 53\n",
    "\n",
    "answers = get_answers(reader, first_page_num_of_exam_1_answers, last_page_num_of_exam_1_answers)\n",
    "\n",
    "# newline='' leaves line endings to the csv module; the explicit UTF-8\n",
    "# encoding keeps non-ASCII characters from the PDF writable regardless of\n",
    "# the platform's default locale encoding.\n",
    "with open('docs/exam_1_answers.csv', mode='w', newline='', encoding='utf-8') as csv_file:\n",
    "    csv_writer = csv.writer(csv_file)\n",
    "    # One single-column row per answer line.\n",
    "    csv_writer.writerows([answer] for answer in answers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "306d1a68-75c3-4866-9976-a629c56d1c8e",
   "metadata": {},
   "outputs": [],
   "source": [
    "pages = []\n",
    "page_texts = []\n",
    "try:\n",
    "    # Open the PDF.\n",
    "    reader = PdfReader(pdf_path)\n",
    "\n",
    "    # Page indices scanned for the exam 1 questions:\n",
    "    # range(first_page, last_page), i.e. indices 3 through 20.\n",
    "    first_page = 3\n",
    "    last_page = 21\n",
    "\n",
    "    for page_number in range(first_page, last_page):\n",
    "        page = reader.pages[page_number]\n",
    "        # Flatten the page to a single line of text.\n",
    "        page_texts.append(page.extract_text().replace(\"\\n\", \" \"))\n",
    "\n",
    "except FileNotFoundError:\n",
    "    print(f\"Error: The file '{pdf_path}' was not found.\")\n",
    "except Exception as e:\n",
    "    print(f\"An error occurred: {e}\")\n",
    "finally:\n",
    "    # Pages are concatenated with no separator; whatever was read before an\n",
    "    # error is still kept.\n",
    "    extracted_text = \"\".join(page_texts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "12d06537-98ce-46b0-9549-a39440e62418",
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_question_v2(text, question_num, total_questions=50):\n",
    "    \"\"\"Return the text of one numbered question, or None if it is absent.\n",
    "\n",
    "    A question starts at \"Question <n>:\" and runs up to the heading of the\n",
    "    next question. The final question (question_num == total_questions) has\n",
    "    no next heading, so a list of end markers is tried in priority order, with\n",
    "    \"everything to the end of the text\" as the last resort. The returned\n",
    "    string has surrounding whitespace stripped.\n",
    "    \"\"\"\n",
    "    if question_num != total_questions:\n",
    "        # Ordinary case: stop right before the following question's heading.\n",
    "        candidates = [\n",
    "            rf'(Question {question_num}:.*?)(?=Question {question_num + 1}:)',\n",
    "        ]\n",
    "    else:\n",
    "        # Final question: end markers, most specific first.\n",
    "        candidates = [\n",
    "            # digits fused to the heading, e.g. \"19Practice Exam 1 Solutions\"\n",
    "            rf'(Question {question_num}:.*?)(?=\\d+Practice Exam \\d+ Solutions)',\n",
    "            # the heading alone, e.g. \"Practice Exam 1 Solutions\"\n",
    "            rf'(Question {question_num}:.*?)(?=Practice Exam \\d+ Solutions)',\n",
    "            # the bare word \"Solutions\"\n",
    "            rf'(Question {question_num}:.*?)(?=Solutions)',\n",
    "            # the first answer entry, \"Solution to Question\"\n",
    "            rf'(Question {question_num}:.*?)(?=Solution to Question)',\n",
    "            # an \"Answer Key\" heading\n",
    "            rf'(Question {question_num}:.*?)(?=Answer Key)',\n",
    "            # upper-case \"SOLUTIONS\"\n",
    "            rf'(Question {question_num}:.*)(?=SOLUTIONS)'.replace('.*)(', '.*?)('),\n",
    "            # nothing recognised: take the rest of the text\n",
    "            rf'(Question {question_num}:.*)',\n",
    "        ]\n",
    "\n",
    "    # Lazily evaluate the searches so later patterns run only if needed.\n",
    "    hits = (re.search(candidate, text, re.DOTALL) for candidate in candidates)\n",
    "    first_hit = next((hit for hit in hits if hit), None)\n",
    "    return first_hit.group(1).strip() if first_hit else None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "187c6b70-9ead-4ca2-9146-726540628716",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of questions to pull out; also tells extract_question_v2 which\n",
    "# question is the last one.\n",
    "total_questions = 50\n",
    "\n",
    "questions = []\n",
    "for i in range(1, total_questions + 1):\n",
    "    question = extract_question_v2(extracted_text, i, total_questions)\n",
    "    if question is None:\n",
    "        # Keep an empty placeholder so list position i-1 still maps to question i.\n",
    "        print(f\"Warning: question {i} was not found in the extracted text.\")\n",
    "        question = \"\"\n",
    "    # Store the raw text: csv.writer adds whatever quoting a field needs, so\n",
    "    # wrapping it in literal quotes here would come out doubled in the CSV.\n",
    "    questions.append(question)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "d503a074-3940-4994-a62d-8e5b1065e1c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# newline='' leaves line endings to the csv module; the explicit UTF-8\n",
    "# encoding keeps non-ASCII characters from the PDF writable regardless of\n",
    "# the platform's default locale encoding.\n",
    "with open('docs/exam_1_questions.csv', mode='w', newline='', encoding='utf-8') as csv_file:\n",
    "    csv_writer = csv.writer(csv_file)\n",
    "    # One single-column row per question.\n",
    "    csv_writer.writerows([question] for question in questions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fffd210c-52f6-4677-a8f0-a9da32dc0bd1",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}