diff --git a/quiz_pdf.ipynb b/quiz_pdf.ipynb index 8fe02e7..3eb0081 100644 --- a/quiz_pdf.ipynb +++ b/quiz_pdf.ipynb @@ -2,17 +2,19 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 16, "id": "a91a0116-3eca-4a85-b211-2e55361f653e", "metadata": {}, "outputs": [], "source": [ + "import re\n", + "import csv\n", "from pypdf import PdfReader" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "id": "269ece5f-6de7-4ca5-87a0-436396a041ea", "metadata": {}, "outputs": [], @@ -22,49 +24,127 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 3, + "id": "211c2ab2-ff37-47d7-a9d3-410105027d7d", + "metadata": {}, + "outputs": [], + "source": [ + "def get_answers(pdf_reader, first_page_num, last_page_num):\n", + " try:\n", + " answer_phrase = \"Solution to Question\"\n", + " pages = []\n", + " page_answers = []\n", + " for page_number in range(first_page_num, last_page_num):\n", + " page = reader.pages[page_number]\n", + " pages.append(page)\n", + " \n", + " page_answers.append([answer for answer in page.extract_text().split(\"\\n\") if answer_phrase in answer])\n", + " # extracted_text += page.extract_text() + \"\\n\" # Add a newline for readability\n", + " \n", + " answers = [item for sublist in page_answers for item in sublist]\n", + " \n", + " except FileNotFoundError:\n", + " print(f\"Error: The file '{pdf_path}' was not found.\")\n", + " answers = []\n", + " except Exception as e:\n", + " print(f\"An error occurred: {e}\")\n", + " answers = []\n", + " finally:\n", + " return answers" + ] + }, + { + "cell_type": "code", + "execution_count": 4, "id": "d77bfee7-3890-447f-8403-79ef7e4d2a43", "metadata": { "scrolled": true }, + "outputs": [], + "source": [ + "# pages = []\n", + "# try:\n", + "# # Create a PdfReader object\n", + "# reader = PdfReader(pdf_path)\n", + "\n", + "# # Initialize an empty string to store extracted text\n", + "# extracted_text = \"\"\n", + "\n", + "# # Last page of the first 50 questions = page 19, thus zero index = 20\n", + "# last_page_first_50_questions = 20\n", + "# first_page_first_50_answers = 20\n", + "# last_page_first_50_answers = 53\n", + "# answer_phrase = \"Solution to Question\"\n", + "# # \n", + "# # Loop through each page and extract text\n", + "# print(f\"Number of pages: {len(reader.pages)}\")\n", + "# # pages = []\n", + "# page_answers = []\n", + "# for page_number in range(first_page_first_50_answers, last_page_first_50_answers):\n", + "# page = reader.pages[page_number]\n", + "# pages.append(page)\n", + " \n", + "# page_answers.append([answer for answer in page.extract_text().split(\"\\n\") if answer_phrase in answer])\n", + "# # extracted_text += page.extract_text() + \"\\n\" # Add a newline for readability\n", + "\n", + "# # Print or process the extracted text\n", + "# print(len(pages))\n", + "# answers = [item for sublist in page_answers for item in sublist]\n", + "\n", + "# except FileNotFoundError:\n", + "# print(f\"Error: The file '{pdf_path}' was not found.\")\n", + "# except Exception as e:\n", + "# print(f\"An error occurred: {e}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "7cf9bf24-dbfa-437f-9625-1f5d6571394a", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Number of pages: 1039\n", "33\n" ] } ], "source": [ + "reader = PdfReader(pdf_path)\n", + "\n", + "first_page_num_of_exam_1_answers = 20\n", + "last_page_num_of_exam_1_answers = 53\n", + "\n", + "answers = get_answers(reader, first_page_num_of_exam_1_answers, last_page_num_of_exam_1_answers)\n", + "\n", + "with open('docs/exam_1_answers.csv', mode='w', newline='') as csv_file:\n", + " csv_writer = csv.writer(csv_file)\n", + " for answer in answers:\n", + " csv_writer.writerow([answer])" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "306d1a68-75c3-4866-9976-a629c56d1c8e", + "metadata": {}, + "outputs": [], + "source": [ + "pages = []\n", + "extracted_text = \"\"\n", "try:\n", " # Create a PdfReader object\n", " reader = PdfReader(pdf_path)\n", "\n", - " # Initialize an empty string to store extracted text\n", - " extracted_text = \"\"\n", - "\n", " # Last page of the first 50 questions = page 19, thus zero index = 20\n", - " last_page_first_50_questions = 20\n", - " first_page_first_50_answers = 20\n", - " last_page_first_50_answers = 53\n", - " answer_phrase = \"Solution to Question\"\n", + " first_page = 3\n", + " last_page = 21\n", "\n", - " # Loop through each page and extract text\n", - " print(f\"Number of pages: {len(reader.pages)}\")\n", - " pages = []\n", - " page_answers = []\n", - " for page_number in range(first_page_first_50_answers, last_page_first_50_answers):\n", + " for page_number in range(first_page, last_page):\n", " page = reader.pages[page_number]\n", - " pages.append(page)\n", - " \n", - " page_answers.append([answer for answer in page.extract_text().split(\"\\n\") if answer_phrase in answer])\n", - " # extracted_text += page.extract_text() + \"\\n\" # Add a newline for readability\n", - "\n", - " # Print or process the extracted text\n", - " print(len(pages))\n", - " answers = [item for sublist in page_answers for item in sublist]\n", + " extracted_text = extracted_text + page.extract_text().replace(\"\\n\", \" \")\n", "\n", "except FileNotFoundError:\n", " print(f\"Error: The file '{pdf_path}' was not found.\")\n", @@ -74,78 +154,70 @@ }, { "cell_type": "code", - "execution_count": 21, - "id": "c73cdb8f-808f-4968-894c-f89a52f4df5a", + "execution_count": 13, + "id": "12d06537-98ce-46b0-9549-a39440e62418", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Solution to Question 1: D',\n", - " 'Solution to Question 2: B',\n", - " 'Solution to Question 3: C',\n", - " 'Solution to Question 4: D',\n", - " 'Solution to Question 5: C',\n", - " 'Solution to Question 6: C',\n", - " 'Solution to Question 7: C',\n", - " 'Solution to Question 8: D',\n", - " 'Solution to Question 9: D',\n", - " 'Solution to Question 10: C',\n", - " 'Solution to Question 11: B',\n", - " 'Solution to Question 12: C',\n", - " 'Solution to Question 13: D',\n", - " 'Solution to Question 14: A',\n", - " 'Solution to Question 15: A',\n", - " 'Solution to Question 16: D',\n", - " 'Solution to Question 17: D',\n", - " 'Solution to Question 18: D',\n", - " 'Solution to Question 19: A',\n", - " 'Solution to Question 20: D',\n", - " 'Solution to Question 21: C',\n", - " 'Solution to Question 22: C',\n", - " 'Solution to Question 23: B',\n", - " 'Solution to Question 24: A',\n", - " 'Solution to Question 25: D',\n", - " 'Solution to Question 26: A',\n", - " 'Solution to Question 27: C',\n", - " 'Solution to Question 28: D',\n", - " 'Solution to Question 29: D',\n", - " 'Solution to Question 30: A',\n", - " 'Solution to Question 31: D',\n", - " 'Solution to Question 32: A',\n", - " 'Solution to Question 33: D',\n", - " 'Solution to Question 34: B',\n", - " 'Solution to Question 35: D',\n", - " 'Solution to Question 36: B',\n", - " 'Solution to Question 37: A',\n", - " 'Solution to Question 38: D',\n", - " 'Solution to Question 39: D',\n", - " 'Solution to Question 40: C',\n", - " 'Solution to Question 41: B',\n", - " 'Solution to Question 42: A',\n", - " 'Solution to Question 43: C',\n", - " 'Solution to Question 44: C',\n", - " 'Solution to Question 45: D',\n", - " 'Solution to Question 46: C',\n", - " 'Solution to Question 47: D',\n", - " 'Solution to Question 48: B',\n", - " 'Solution to Question 49: C',\n", - " 'Solution to Question 50: A']" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "answers" + "def extract_question_v2(text, question_num, total_questions=50):\n", + " \"\"\"Extract a specific question number from text\"\"\"\n", + " \n", + " if question_num == total_questions:\n", + " # For the last question, look for common patterns that indicate end of questions\n", + " # Try multiple patterns in order of preference\n", + " patterns = [\n", + " rf'(Question {question_num}:.*?)(?=\\d+Practice Exam \\d+ Solutions)', # \"19Practice Exam 1 Solutions\"\n", + " rf'(Question {question_num}:.*?)(?=Practice Exam \\d+ Solutions)', # \"Practice Exam 1 Solutions\"\n", + " rf'(Question {question_num}:.*?)(?=Solutions)', # Just \"Solutions\"\n", + " rf'(Question {question_num}:.*?)(?=Solution to Question)', # \"Solution to Question\"\n", + " rf'(Question {question_num}:.*?)(?=Answer Key)', # \"Answer Key\"\n", + " rf'(Question {question_num}:.*?)(?=SOLUTIONS)', # \"SOLUTIONS\" (uppercase)\n", + " rf'(Question {question_num}:.*)' # Fallback: to end of string\n", + " ]\n", + " \n", + " for pattern in patterns:\n", + " match = re.search(pattern, text, re.DOTALL)\n", + " if match:\n", + " return match.group(1).strip()\n", + " else:\n", + " # For other questions, match until next question\n", + " pattern = rf'(Question {question_num}:.*?)(?=Question {question_num + 1}:)'\n", + " match = re.search(pattern, text, re.DOTALL)\n", + " if match:\n", + " return match.group(1).strip()\n", + " return None" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "187c6b70-9ead-4ca2-9146-726540628716", + "metadata": {}, + "outputs": [], + "source": [ + "questions = []\n", + "for i in range(1, 51):\n", + " question = extract_question_v2(extracted_text, i)\n", + " questions.append(f'\"{question}\"')" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "d503a074-3940-4994-a62d-8e5b1065e1c0", + "metadata": {}, + "outputs": [], + "source": [ + "with open('docs/exam_1_questions.csv', mode='w', newline='') as csv_file:\n", + " csv_writer = csv.writer(csv_file)\n", + " for question in questions:\n", + " csv_writer.writerow([question])" ] }, { "cell_type": "code", "execution_count": null, - "id": "7cf9bf24-dbfa-437f-9625-1f5d6571394a", + "id": "fffd210c-52f6-4677-a8f0-a9da32dc0bd1", "metadata": {}, "outputs": [], "source": []