Fixing most ruff errors

allenai · Jan 29, 2025 · 2c29533 · 2c29533
1 parent 5690377
commit 2c29533
Show file tree

Hide file tree

Showing 9 changed files with 21 additions and 24 deletions.
diff --git a/olmocr/data/convertsilver_openai.py b/olmocr/data/convertsilver_openai.py
@@ -54,8 +54,6 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool):
                     match = re.search(pattern, obj["body"]["messages"][0]["content"][0]["text"], re.DOTALL)
 
                     if match:
-                        raw_page_text = match.group(1).strip()
-
                         # Ok, now we want to try to see if it's better if we recalculate the anchor text
                         goldkey = obj["custom_id"]
                         s3_path = goldkey[: goldkey.rindex("-")]

diff --git a/olmocr/data/renderpdf.py b/olmocr/data/renderpdf.py
@@ -93,8 +93,8 @@ def get_png_dimensions_from_base64(base64_data) -> tuple[int, int]:
 
     # Positions in the binary data where width and height are stored
     width_start = 16  # Byte position where width starts (0-based indexing)
-    width_end = 20  # Byte position where width ends (exclusive)
-    height_start = 20
+    _width_end = 20  # Byte position where width ends (exclusive)
+    _height_start = 20
     height_end = 24
 
     # Compute the byte range needed (from width_start to height_end)

diff --git a/olmocr/eval/runeval.py b/olmocr/eval/runeval.py
@@ -13,7 +13,7 @@
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Dict, Optional
+from typing import Dict, Optional, List
 
 import boto3
 import zstandard

diff --git a/olmocr/prompts/__init__.py b/olmocr/prompts/__init__.py
@@ -1 +1 @@
-from .prompts import *
+from .prompts import build_openai_silver_data_prompt, PageResponse, openai_response_format_schema, build_finetuning_prompt, extract_raw_text
diff --git a/olmocr/prompts/_adv_anchor.py b/olmocr/prompts/_adv_anchor.py
@@ -577,7 +577,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                             font_size,
                         )
             except Exception:
-                logger_warning(
+                print(
                     f" impossible to decode XFormObject {operands[0]}",
                     __name__,
                 )

diff --git a/olmocr/prompts/anchor.py b/olmocr/prompts/anchor.py
@@ -150,8 +150,8 @@ def visitor_op(op, args, cm, tm):
             if xobject and xobject["/Subtype"] == "/Image":
                 # Compute image bbox
                 # The image is placed according to the CTM
-                width = xobject.get("/Width")
-                height = xobject.get("/Height")
+                _width = xobject.get("/Width")
+                _height = xobject.get("/Height")
                 x0, y0 = _transform_point(0, 0, cm)
                 x1, y1 = _transform_point(1, 1, cm)
                 image_elements.append(ImageElement(xobject_name, BoundingBox(min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1))))
@@ -332,7 +332,7 @@ def _linearize_pdf_report(report: PageReport, max_length: int = 4000) -> str:
 
     # Calculate remaining length
     current_length = len(result) + sum(len(s) for _, _, s, _ in selected_elements)
-    remaining_length = max_length - current_length
+    _remaining_length = max_length - current_length
 
     # Exclude edge elements from the pool
     remaining_elements = [(elem_type, elem, s, position) for elem_type, elem, s, position in all_elements if id(elem) not in selected_element_ids]

diff --git a/olmocr/train/utils.py b/olmocr/train/utils.py
@@ -23,16 +23,13 @@
 from .core.paths import copy_dir, is_local
 from .core.state import BeakerState
 
-# from .tokenization import ModelTokenizer
-
-T = TypeVar("T")
-
 from olmocr.train.dataloader import build_finetuning_dataset
 from olmocr.train.dataprep import (
     batch_prepare_data_for_molmo_training,
     batch_prepare_data_for_qwen2_training,
 )
 
+T = TypeVar("T")
 
 def accelerator_to_dtype(accelerator: Accelerator) -> torch.dtype:
     pt = PrecisionType(accelerator.mixed_precision)

diff --git a/olmocr/work_queue.py b/olmocr/work_queue.py
@@ -8,6 +8,13 @@
 from dataclasses import dataclass
 from typing import List, Optional
 
+from olmocr.s3_utils import (
+    download_zstd_csv,
+    expand_s3_glob,
+    parse_s3_path,
+    upload_zstd_csv,
+)
+
 logger = logging.getLogger(__name__)
 
 
@@ -355,16 +362,9 @@ def size(self) -> int:
 
 
 # --------------------------------------------------------------------------------------
-# S3WorkQueue Implementation (Preserves Original Comments)
+# S3WorkQueue Implementation
 # --------------------------------------------------------------------------------------
 
-from olmocr.s3_utils import (
-    download_zstd_csv,
-    expand_s3_glob,
-    parse_s3_path,
-    upload_zstd_csv,
-)
-
 
 class S3WorkQueue(WorkQueue):
     """

diff --git a/pyproject.toml b/pyproject.toml
@@ -130,8 +130,10 @@ multi_line_output = 3
 reportPrivateImportUsage = false
 
 [tool.ruff]
-line-length = 115
-target-version = "py39"
+line-length = 160
+target-version = "py311"
+exclude = ["olmocr/train/molmo", "tests/*"]
+ignore = ["E722"] # ignore bare except
 
 [tool.ruff.per-file-ignores]
 "__init__.py" = ["F401"]
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		-from .prompts import *
		+from .prompts import build_openai_silver_data_prompt, PageResponse, openai_response_format_schema, build_finetuning_prompt, extract_raw_text