def get_answers(pdf_reader, first_page_num, last_page_num):
    """Collect every line containing "Solution to Question" from a page range.

    Args:
        pdf_reader: Object exposing a ``pages`` sequence whose items provide
            ``extract_text()`` (for example a ``pypdf.PdfReader``).
        first_page_num: Zero-based index of the first page to scan.
        last_page_num: Zero-based index one past the last page to scan
            (half-open, same convention as ``range``).

    Returns:
        list[str]: Matching lines in page order. An empty list is returned
        when the range is empty or when reading any page fails; in the
        failure case the error is printed and partial results are discarded.
    """
    answer_phrase = "Solution to Question"
    answers = []
    try:
        for page_number in range(first_page_num, last_page_num):
            # Read from the reader that was passed in, not from a notebook
            # global, so the function works on a fresh kernel.
            page_text = pdf_reader.pages[page_number].extract_text() or ""
            answers.extend(
                line for line in page_text.split("\n") if answer_phrase in line
            )
    except Exception as e:
        # Best-effort contract kept from the original: report and return [].
        # No file is opened here, so a dedicated FileNotFoundError branch
        # (which also depended on a global path variable) is not needed.
        print(f"An error occurred: {e}")
        return []
    return answers
# --- Exam 1 answers: scan the solution pages, save one answer per CSV row ---
reader = PdfReader(pdf_path)

# Zero-based page indices; the end index is exclusive (passed to range()).
first_page_num_of_exam_1_answers = 20
last_page_num_of_exam_1_answers = 53

answers = get_answers(
    reader,
    first_page_num_of_exam_1_answers,
    last_page_num_of_exam_1_answers,
)

with open('docs/exam_1_answers.csv', mode='w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerows([answer] for answer in answers)

# --- Exam 1 questions: flatten the question pages into one long string ---
pages = []
extracted_text = ""
try:
    reader = PdfReader(pdf_path)

    # Pages 3..20 (zero-based; 21 is the exclusive end of the range).
    # Presumably these are the exam 1 question pages - confirm against the PDF.
    first_page = 3
    last_page = 21

    for page_number in range(first_page, last_page):
        page = reader.pages[page_number]
        # Newlines become spaces; pages are appended with no separator, so the
        # last token of one page runs directly into the first token of the next.
        extracted_text += " ".join(page.extract_text().split("\n"))

except FileNotFoundError:
    print(f"Error: The file '{pdf_path}' was not found.")
except Exception as e:
    print(f"An error occurred: {e}")
def extract_question_v2(text, question_num, total_questions=50):
    """Return the text of one numbered question, or None if it is not found.

    A question starts at the literal header ``Question <n>:``. For every
    question except the last, it ends just before ``Question <n+1>:``. For the
    last question (``question_num == total_questions``) a list of end markers
    is tried in order of preference, and if none is present the question runs
    to the end of ``text``.

    Args:
        text: The string to search.
        question_num: Number of the question to extract.
        total_questions: Number of the final question (default 50).

    Returns:
        The matched question with surrounding whitespace stripped, or None.
    """
    header = f"Question {question_num}:"

    if question_num != total_questions:
        # Ordinary question: the only valid terminator is the next header.
        terminators = [f"Question {question_num + 1}:"]
        run_to_end = False
    else:
        # Final question: candidate section markers, most specific first.
        terminators = [
            r"\d+Practice Exam \d+ Solutions",  # e.g. "19Practice Exam 1 Solutions"
            r"Practice Exam \d+ Solutions",     # e.g. "Practice Exam 1 Solutions"
            "Solutions",
            "Solution to Question",
            "Answer Key",
            "SOLUTIONS",
        ]
        run_to_end = True

    for terminator in terminators:
        # Lazy body + lookahead: stop at the first place the terminator starts.
        found = re.search(rf"({header}.*?)(?={terminator})", text, re.DOTALL)
        if found is not None:
            return found.group(1).strip()

    if run_to_end:
        # Fallback for the final question: everything up to the end of text.
        found = re.search(rf"({header}.*)", text, re.DOTALL)
        if found is not None:
            return found.group(1).strip()

    return None
# --- Exam 1 questions: pull questions 1..50 out of the text, one per CSV row ---
questions = []
for i in range(1, 51):
    question = extract_question_v2(extracted_text, i)
    if question is None:
        # Keep an (empty) row so that row i always corresponds to question i,
        # instead of writing the literal text "None".
        print(f"Warning: question {i} was not found in the extracted text")
        question = ""
    # Store the raw text. Quoting is the CSV writer's job; adding literal
    # quote characters here makes the writer double and re-quote them, so the
    # field reads back with the quotes embedded in the value.
    questions.append(question)

with open('docs/exam_1_questions.csv', mode='w', newline='') as csv_file:
    # QUOTE_ALL wraps every field in quotes and escapes embedded ones.
    csv_writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
    for question in questions:
        csv_writer.writerow([question])