Extracts text from the pages of the PDF file between the given start and end page indexes
Args:
start (int, optional): Start page index. Defaults to 0.
end (int, optional): End page index (exclusive). Defaults to None, which means through the last page.
Returns:
str: Extracted text from the PDF file
Source code in src/gemini-cli/pdf/pypdf_helper.py
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
def get_text(self, start: int = 0, end: int = None) -> str:
    """Extract the text of a range of pages from the PDF file.

    Each page index in ``range(start, end)`` is passed to
    ``self._extract_page_text`` through a thread pool, and the per-page
    results are concatenated in page order.

    Args:
        start (int, optional): Index of the first page to extract.
            Defaults to 0.
        end (int, optional): Index one past the last page to extract
            (exclusive, as in ``range``). Defaults to None, which means
            "through the last page of the document".

    Returns:
        str: The extracted page texts joined with no separator.

    Raises:
        Exception: Any error raised while counting or extracting pages is
            logged and then re-raised unchanged.
    """
    try:
        if end is None:
            # len(reader.pages) instead of reader.getNumPages(): the latter
            # is deprecated in PyPDF2 1.28+ and removed in PyPDF2 3.0 / pypdf,
            # while the ``pages`` sequence exists in old and new releases.
            end = len(self.reader.pages)
        with ThreadPoolExecutor() as executor:
            # Executor.map yields results in submission order, so the pages
            # stay in document order; list() surfaces worker errors here.
            texts = list(executor.map(self._extract_page_text, range(start, end)))
        return "".join(texts)
    except Exception as e:
        logging.error('Error reading PDF file: %s', e)
        raise
|