
Commit

Trying to get reliability up
jakep-allenai committed Nov 11, 2024
1 parent fedda40 commit 24a9d23
Showing 2 changed files with 5 additions and 2 deletions.
pdelfin/beakerpipeline.py (4 additions, 2 deletions)
@@ -63,7 +63,7 @@ async def build_page_query(local_pdf_path: str, page: int, target_longest_image_
 
     # Allow the page rendering to process in the background while we get the anchor text (which blocks the main thread)
     image_base64 = asyncio.to_thread(render_pdf_to_base64png, local_pdf_path, page, target_longest_image_dim=target_longest_image_dim)
-    anchor_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport", target_length=target_anchor_text_len)
+    anchor_text = asyncio.to_thread(get_anchor_text, local_pdf_path, page, pdf_engine="pdfreport", target_length=target_anchor_text_len)
 
     image_base64 = await image_base64
     if image_rotation != 0:
@@ -78,6 +78,8 @@ async def build_page_query(local_pdf_path: str, page: int, target_longest_image_
         # Encode the rotated image back to base64
         image_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
 
+    anchor_text = await anchor_text
+
     return {
         "model": "Qwen/Qwen2-VL-7B-Instruct",
         "messages": [
@@ -246,7 +248,7 @@ async def process_pdf(args, pdf_s3_path: str):
         # List to hold the tasks for processing each page
         page_tasks = []
 
-        async with aiohttp.ClientSession() as session:
+        async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=3600), connector=TCPConnector(limit=100)) as session:
             for page_num in range(1, num_pages + 1):
                 # Create a task for each page
                 task = asyncio.create_task(process_page(args, session, pdf_s3_path, tf.name, page_num))
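
The process_pdf change bounds the shared HTTP session in two ways: aiohttp.ClientTimeout(total=3600) caps how long any single request may take overall, and TCPConnector(limit=100) caps how many connections the session keeps open at once, so a stalled page request eventually times out and a large PDF cannot exhaust sockets. A minimal sketch of the same session configuration (the URL, helper names, and request flow here are illustrative, not taken from the pipeline):

import asyncio
import aiohttp


async def fetch_all(urls: list[str]) -> list[str]:
    # Give every request a one-hour overall budget and keep at most
    # 100 simultaneous connections open for this session.
    timeout = aiohttp.ClientTimeout(total=3600)
    connector = aiohttp.TCPConnector(limit=100)

    async with aiohttp.ClientSession(timeout=timeout, connector=connector) as session:

        async def fetch(url: str) -> str:
            async with session.get(url) as resp:
                return await resp.text()

        # One task per URL, loosely mirroring one task per PDF page in process_pdf.
        return await asyncio.gather(*(fetch(u) for u in urls))


if __name__ == "__main__":
    pages = asyncio.run(fetch_all(["https://example.com"]))
    print(len(pages), "responses")
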
pdelfin/viewer/dolmaviewer.py (1 addition, 0 deletions)
@@ -109,6 +109,7 @@ def main(jsonl_path, output_dir, template_path):
                 future.result()
             except Exception as e:
                 print(f"An error occurred: {e}")
+                raise
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Generate HTML pages from a JSONL file with pre-signed S3 links.')
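
The dolmaviewer change re-raises after logging, so an exception from any worker future aborts the run instead of being reduced to a printed message. A small sketch of the pattern, with a made-up worker function:

from concurrent.futures import ThreadPoolExecutor


def render_page_html(page_id: int) -> str:
    # Hypothetical worker; fails for one input to show the failure path.
    if page_id == 2:
        raise ValueError(f"bad page {page_id}")
    return f"<html>page {page_id}</html>"


with ThreadPoolExecutor() as executor:
    futures = [executor.submit(render_page_html, i) for i in range(4)]
    for future in futures:
        try:
            future.result()
        except Exception as e:
            print(f"An error occurred: {e}")
            raise  # propagate so the failure is not silently swallowed
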
