Skip to content

Commit

Permalink
Adding some skip logic
Browse files (browse the repository at this point in the history)
  • Loading branch information
jakep-allenai committed Oct 27, 2024
1 parent 8e6d0c6 commit 062abff
Showing 1 changed file with 5 additions and 1 deletion.
6 changes: 5 additions & 1 deletion pdelfin/birrpipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -610,6 +610,7 @@ def get_current_round(s3_workspace: str) -> int:
parser.add_argument('--pdf_profile', help='S3 configuration profile for accessing the raw pdf documents', default=None)
parser.add_argument('--max_size_mb', type=int, default=250, help='Max file size in MB')
parser.add_argument('--reindex', action='store_true', default=False, help='Reindex all of the page_results')
parser.add_argument('--skip_build_queries', action='store_true', default=False, help='Skip generation of new pdf page queries for batch inferencing')
args = parser.parse_args()

if args.workspace_profile:
Expand Down Expand Up @@ -687,6 +688,9 @@ def get_current_round(s3_workspace: str) -> int:
if db.get_last_indexed_round() < current_round - 1:
print(f"WARNING: No new batch inference results found, you need to run batch inference on {args.workspace}/inference_inputs/round_{current_round - 1}")
potentially_done_pdfs = db.get_pdfs_by_status("pending")
elif args.skip_build_queries:
print(f"Skipping generating new batch inference files")
potentially_done_pdfs = db.get_pdfs_by_status("pending")
else:
print(f"\nCreating batch inference files for new PDFs")
pdf_list = list(db.get_pdfs_by_status("pending"))
Expand All @@ -696,7 +700,7 @@ def get_current_round(s3_workspace: str) -> int:
lines_written = 0
new_inference_writer = BatchWriter(f"{args.workspace}/inference_inputs/round_{current_round}", args.max_size_mb)
total_pdfs = len(pdf_list)
-max_pending = 5000
+max_pending = 300

with tqdm(total=total_pdfs) as pbar:
# Submit initial batch of futures
Expand Down

0 comments on commit 062abff

Please sign in to comment.