Skip to content

Commit

Permalink
Fixing most ruff errors
Browse files Browse the repository at this point in the history
  • Loading branch information
jakep-allenai committed Jan 29, 2025
1 parent 5690377 commit 2c29533
Show file tree
Hide file tree
Showing 9 changed files with 21 additions and 24 deletions.
2 changes: 0 additions & 2 deletions olmocr/data/convertsilver_openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,6 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool):
match = re.search(pattern, obj["body"]["messages"][0]["content"][0]["text"], re.DOTALL)

if match:
raw_page_text = match.group(1).strip()

# Ok, now we want to try to see if it's better if we recalculate the anchor text
goldkey = obj["custom_id"]
s3_path = goldkey[: goldkey.rindex("-")]
Expand Down
4 changes: 2 additions & 2 deletions olmocr/data/renderpdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,8 @@ def get_png_dimensions_from_base64(base64_data) -> tuple[int, int]:

# Positions in the binary data where width and height are stored
width_start = 16 # Byte position where width starts (0-based indexing)
width_end = 20 # Byte position where width ends (exclusive)
height_start = 20
_width_end = 20 # Byte position where width ends (exclusive)
_height_start = 20
height_end = 24

# Compute the byte range needed (from width_start to height_end)
Expand Down
2 changes: 1 addition & 1 deletion olmocr/eval/runeval.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Optional
from typing import Dict, Optional, List

import boto3
import zstandard
Expand Down
2 changes: 1 addition & 1 deletion olmocr/prompts/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from .prompts import *
from .prompts import build_openai_silver_data_prompt, PageResponse, openai_response_format_schema, build_finetuning_prompt, extract_raw_text
2 changes: 1 addition & 1 deletion olmocr/prompts/_adv_anchor.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,7 +577,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
font_size,
)
except Exception:
logger_warning(
print(
f" impossible to decode XFormObject {operands[0]}",
__name__,
)
Expand Down
6 changes: 3 additions & 3 deletions olmocr/prompts/anchor.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,8 @@ def visitor_op(op, args, cm, tm):
if xobject and xobject["/Subtype"] == "/Image":
# Compute image bbox
# The image is placed according to the CTM
width = xobject.get("/Width")
height = xobject.get("/Height")
_width = xobject.get("/Width")
_height = xobject.get("/Height")
x0, y0 = _transform_point(0, 0, cm)
x1, y1 = _transform_point(1, 1, cm)
image_elements.append(ImageElement(xobject_name, BoundingBox(min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1))))
Expand Down Expand Up @@ -332,7 +332,7 @@ def _linearize_pdf_report(report: PageReport, max_length: int = 4000) -> str:

# Calculate remaining length
current_length = len(result) + sum(len(s) for _, _, s, _ in selected_elements)
remaining_length = max_length - current_length
_remaining_length = max_length - current_length

# Exclude edge elements from the pool
remaining_elements = [(elem_type, elem, s, position) for elem_type, elem, s, position in all_elements if id(elem) not in selected_element_ids]
Expand Down
5 changes: 1 addition & 4 deletions olmocr/train/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,13 @@
from .core.paths import copy_dir, is_local
from .core.state import BeakerState

# from .tokenization import ModelTokenizer

T = TypeVar("T")

from olmocr.train.dataloader import build_finetuning_dataset
from olmocr.train.dataprep import (
batch_prepare_data_for_molmo_training,
batch_prepare_data_for_qwen2_training,
)

T = TypeVar("T")

def accelerator_to_dtype(accelerator: Accelerator) -> torch.dtype:
pt = PrecisionType(accelerator.mixed_precision)
Expand Down
16 changes: 8 additions & 8 deletions olmocr/work_queue.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,13 @@
from dataclasses import dataclass
from typing import List, Optional

from olmocr.s3_utils import (
download_zstd_csv,
expand_s3_glob,
parse_s3_path,
upload_zstd_csv,
)

logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -355,16 +362,9 @@ def size(self) -> int:


# --------------------------------------------------------------------------------------
# S3WorkQueue Implementation (Preserves Original Comments)
# S3WorkQueue Implementation
# --------------------------------------------------------------------------------------

from olmocr.s3_utils import (
download_zstd_csv,
expand_s3_glob,
parse_s3_path,
upload_zstd_csv,
)


class S3WorkQueue(WorkQueue):
"""
Expand Down
6 changes: 4 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -130,8 +130,10 @@ multi_line_output = 3
reportPrivateImportUsage = false

[tool.ruff]
line-length = 115
target-version = "py39"
line-length = 160
target-version = "py311"
exclude = ["olmocr/train/molmo", "tests/*"]
ignore = ["E722"] # ignore bare except

[tool.ruff.per-file-ignores]
"__init__.py" = ["F401"]
Expand Down

0 comments on commit 2c29533

Please sign in to comment.