Skip to content

Commit

Permalink
Browse files: browse the repository at this point in the history
  • Loading branch information
jakep-allenai committed Nov 8, 2024
2 parents 299819e + 60563d6 commit 9ff107b
Showing 1 changed file with 3 additions and 1 deletion.
4 changes: 3 additions & 1 deletion pdelfin/birrpipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,12 +502,14 @@ def process_jsonl_content(inference_s3_path: str) -> List[DatabaseManager.BatchI


def get_pdf_num_pages(s3_path: str) -> Optional[int]:
logger.debug(f"Startng to get_pdf_num_pages for {s3_path}")
try:
with tempfile.NamedTemporaryFile("wb+", suffix=".pdf") as tf:
tf.write(get_s3_bytes(pdf_s3, s3_path))
tf.flush()

reader = PdfReader(tf.name)
logger.debug(f"Built reader for {s3_path}")
return reader.get_num_pages()
except Exception as ex:
logger.warning(f"Warning, could not add {s3_path} due to {ex}")
Expand Down Expand Up @@ -717,6 +719,7 @@ def get_current_round(s3_workspace: str) -> int:
for future in tqdm(as_completed(future_to_path), total=len(future_to_path), desc="Adding PDFs"):
s3_path = future_to_path[future]
num_pages = future.result()
logger.debug(f"Got {num_pages} pages back for {s3_path}")
if num_pages and not db.pdf_exists(s3_path):
db.add_pdf(s3_path, num_pages, "pending")

Expand Down Expand Up @@ -782,7 +785,6 @@ def get_current_round(s3_workspace: str) -> int:
return_when=concurrent.futures.FIRST_COMPLETED,
)


for future in done:
pdf = pending_futures.pop(future)
inference_lines = future.result()
Expand Down

0 comments on commit 9ff107b

Please sign in to comment.