|
| 1 | +import PyPDF2 |
| 2 | +from PIL import Image |
| 3 | +import os |
| 4 | + |
def convert_pdf_to_text(pdf_path, text_output_path):
    """Converts a PDF file to text.

    The text of every page is written to the output file in page order.
    Each non-empty page is terminated with a newline so that the last line
    of one page is never fused with the first line of the next.

    Errors are not raised to the caller: any exception is reported on
    standard output and the function returns normally.

    Args:
        pdf_path (str): Path to the input PDF file.
        text_output_path (str): Path to save the converted text file.
    """
    try:
        with open(pdf_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # The output file is only created once the PDF has been opened
            # and parsed, so a bad input path leaves no empty text file.
            with open(text_output_path, 'w', encoding='utf-8') as text_file:
                for page in pdf_reader.pages:
                    # Guard against a page yielding no text at all
                    # (e.g. a scanned, image-only page).
                    page_text = page.extract_text() or ""
                    if not page_text:
                        continue
                    text_file.write(page_text)
                    if not page_text.endswith("\n"):
                        text_file.write("\n")
        print(f"PDF converted to text successfully. Text file saved at {text_output_path}")
    except Exception as e:
        # Best-effort CLI helper: report the failure instead of crashing.
        print(f"An error occurred: {e}")
| 24 | + |
| 25 | + |
def extract_images_from_pdf(pdf_path, image_output_folder):
    """Extracts images from a PDF file.

    Every image XObject found in a page's resources is written to
    ``image_output_folder`` as ``page<N>_<name>.<ext>``, where ``N`` is the
    1-based page number and ``name`` is the XObject name without its
    leading slash:

    * ``/FlateDecode`` images are rebuilt from their decoded bytes with
      Pillow and saved as PNG.
    * ``/DCTDecode`` and ``/JPXDecode`` images are written out unchanged as
      ``.jpg`` / ``.jp2`` files.
    * Images using any other filter are skipped.

    Pages without images are skipped rather than treated as an error.
    Errors are not raised to the caller: any exception is reported on
    standard output and the function returns normally.

    Args:
        pdf_path (str): Path to the input PDF file.
        image_output_folder (str): Folder to save the extracted images.
    """
    # Pillow modes for the simple device colour spaces. Anything else
    # falls back to "P", as the original implementation did.
    color_modes = {'/DeviceRGB': 'RGB', '/DeviceGray': 'L', '/DeviceCMYK': 'CMYK'}
    # Filters whose payload already is a complete image file.
    raw_extensions = {'/DCTDecode': 'jpg', '/JPXDecode': 'jp2'}

    def resolve(obj):
        """Follow an indirect reference, if any, to the actual PDF object."""
        return obj.get_object() if hasattr(obj, 'get_object') else obj

    def lookup(dictionary, key):
        """Return the resolved value stored under ``key``, or None if absent."""
        return resolve(dictionary.get(key))

    try:
        with open(pdf_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            for page_num, page in enumerate(pdf_reader.pages, start=1):
                resources = lookup(page, '/Resources')
                x_objects = lookup(resources, '/XObject') if resources else None
                if not x_objects:
                    # Text-only page: nothing to extract here.
                    continue

                for name in x_objects:
                    image = resolve(x_objects[name])
                    if lookup(image, '/Subtype') != '/Image':
                        continue

                    pdf_filter = lookup(image, '/Filter')
                    if isinstance(pdf_filter, list):
                        # Filters are decoded in order, so the last entry
                        # describes the data that get_data() hands back.
                        pdf_filter = resolve(pdf_filter[-1]) if pdf_filter else None

                    base_name = f"page{page_num}_{name[1:]}"
                    if pdf_filter == '/FlateDecode':
                        color_space = lookup(image, '/ColorSpace')
                        # Array-valued colour spaces (ICCBased, Indexed, ...)
                        # are lists and cannot be used as a dict key.
                        if isinstance(color_space, str):
                            mode = color_modes.get(color_space, 'P')
                        else:
                            mode = 'P'
                        size = (int(image['/Width']), int(image['/Height']))
                        img = Image.frombytes(mode, size, image.get_data())
                        img.save(os.path.join(image_output_folder, f"{base_name}.png"))
                    elif pdf_filter in raw_extensions:
                        target = os.path.join(
                            image_output_folder,
                            f"{base_name}.{raw_extensions[pdf_filter]}",
                        )
                        # Context manager closes the file even if the write fails.
                        with open(target, 'wb') as image_file:
                            image_file.write(image.get_data())
        print(f"Images extracted successfully. Saved in {image_output_folder}")
    except Exception as e:
        # Best-effort CLI helper: report the failure instead of crashing.
        print(f"An error occurred: {e}")
| 63 | + |
def main():
    """Interactive entry point: prompt for paths and run the chosen action.

    Asks for the PDF path, the output folder and the action to perform,
    creates the output folder if needed, then either converts the PDF to
    ``converted_text.txt`` or extracts its images into an
    ``extracted_images`` sub-folder.

    Surrounding whitespace in the answers is ignored. A blank output
    folder means the current directory. If a folder cannot be created the
    problem is reported on standard output and nothing else is done.
    """
    # Get input paths and output folder from user
    pdf_path = input("Enter the path to the PDF file: ").strip()
    # os.makedirs('') raises, so fall back to the current directory
    output_folder = input("Enter the output folder path: ").strip() or os.curdir

    # Create the output folder if it does not exist
    try:
        os.makedirs(output_folder, exist_ok=True)
    except OSError as e:
        print(f"Could not create output folder: {e}")
        return

    # Choose conversion option
    choice = input("Choose an option:\n1. Convert PDF to text\n2. Extract images from PDF\nEnter your choice: ").strip()

    if choice == '1':
        # Convert PDF to text
        text_output_path = os.path.join(output_folder, "converted_text.txt")
        convert_pdf_to_text(pdf_path, text_output_path)
    elif choice == '2':
        # Extract images from PDF
        image_output_folder = os.path.join(output_folder, "extracted_images")
        try:
            os.makedirs(image_output_folder, exist_ok=True)
        except OSError as e:
            print(f"Could not create output folder: {e}")
            return
        extract_images_from_pdf(pdf_path, image_output_folder)
    else:
        print("Invalid choice. Please choose 1 or 2.")


if __name__ == "__main__":
    main()
0 commit comments