Skip to content

pypdf_helper

PyPdfHelper

Source code in src/gemini-cli/pdf/pypdf_helper.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
class PyPdfHelper:
    """Small wrapper around a ``PdfReader`` that extracts text from a PDF file."""

    def __init__(self, path: str):
        """Initializes the PyPdfHelper class to read PDF files using PyPDF2 library

        Args:
            path (str): Path to the PDF file

        Raises:
            Exception: Any error raised while storing the path or creating
                the ``PdfReader``; it is logged and then re-raised unchanged.
        """
        try:
            self.path = path
            self.reader = PdfReader(path)
        except Exception as e:
            logging.error('Error reading PDF file: %s', e)
            raise

    def _extract_page_text(self, page_num: int) -> str:
        """Extracts the text of a single page, best effort

        Args:
            page_num (int): Index into ``self.reader.pages``

        Returns:
            str: Whatever ``extract_text()`` returns for that page, or an
                empty string if the lookup or the extraction raised (the
                error is logged instead of propagated).
        """
        try:
            return self.reader.pages[page_num].extract_text()
        except Exception as e:
            # Deliberately swallow per-page failures so one bad page does not
            # abort the extraction of the remaining pages.
            logging.error('Error extracting text from page %d: %s', page_num, e)
            return ""

    def get_text(self, start: int = 0, end: int = None) -> str:
        """Extracts text from the PDF file from the given start and end page numbers

        Pages are addressed by index and the range is half-open: ``start`` is
        included, ``end`` is excluded (``range(start, end)``).

        Args:
            start (int, optional): Start page index. Defaults to 0.
            end (int, optional): End page index (exclusive). Defaults to None,
                which means "up to the last page".

        Returns:
            str: Text of the selected pages concatenated in page order with
                no separator.

        Raises:
            Exception: Any error raised while computing the page range or
                joining the results; it is logged and then re-raised.
        """
        try:
            if end is None:
                # len(reader.pages) instead of reader.getNumPages(): the
                # camelCase accessor is not available on current PyPDF2/pypdf
                # readers, while ``pages`` is already relied on by
                # _extract_page_text.
                end = len(self.reader.pages)
            pages = range(start, end)

            with ThreadPoolExecutor() as executor:
                # map() yields results in input order, so the page order is
                # preserved; list() drains it while the pool is still open.
                texts = list(executor.map(self._extract_page_text, pages))

            return "".join(texts)
        except Exception as e:
            logging.error('Error reading PDF file: %s', e)
            raise
__init__(path)

Initializes the PyPdfHelper class to read PDF files using the PyPDF2 library. Args: path (str): Path to the PDF file.

Source code in src/gemini-cli/pdf/pypdf_helper.py
14
15
16
17
18
19
20
21
22
23
24
def __init__(self, path: str):
    """Remember the PDF location and open a reader on it.

    Args:
        path (str): Path to the PDF file

    Raises:
        Exception: Any failure while storing the path or building the
            ``PdfReader`` is logged and then re-raised unchanged.
    """
    try:
        self.path = path
        pdf_reader = PdfReader(path)
        self.reader = pdf_reader
    except Exception as err:
        # Log for diagnostics, but let the caller decide how to recover.
        logging.error('Error reading PDF file: %s', err)
        raise
get_text(start=0, end=None)

Extracts text from the PDF file between the given start and end page numbers. Args: start (int, optional): Start page index. Defaults to 0. end (int, optional): End page index. Defaults to None. Returns: str: Extracted text from the PDF file.

Source code in src/gemini-cli/pdf/pypdf_helper.py
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
def get_text(self, start: int = 0, end: int = None) -> str:
    """Extracts text from the PDF file from the given start and end page numbers

    Pages are addressed by index and the range is half-open: ``start`` is
    included, ``end`` is excluded (``range(start, end)``).

    Args:
        start (int, optional): Start page index. Defaults to 0.
        end (int, optional): End page index (exclusive). Defaults to None,
            which means "up to the last page".

    Returns:
        str: Text of the selected pages concatenated in page order with no
            separator.

    Raises:
        Exception: Any error raised while computing the page range or
            joining the results; it is logged and then re-raised.
    """
    try:
        if end is None:
            # len(reader.pages) instead of reader.getNumPages(): the camelCase
            # accessor is not available on current PyPDF2/pypdf readers.
            end = len(self.reader.pages)
        pages = range(start, end)

        with ThreadPoolExecutor() as executor:
            # map() yields results in input order, so page order is kept;
            # list() drains it while the pool is still open.
            texts = list(executor.map(self._extract_page_text, pages))

        return "".join(texts)
    except Exception as e:
        logging.error('Error reading PDF file: %s', e)
        raise