Commit

Add new tools (#369)
* add glee tools

* added qwen25 vl

* added glee to video tracking

* fixed countgd+ visual prompt

* add new countgd visual prompt

* add new countgd visual prompt

* update tool descriptions

* fixed docs for countgd visual prompt

* fix depth prompt

* add qwen25 to activity recognition

* add qwen25 to activity recognition

* fix tool names

* update df and emb

* fix typo

* fix typo

* address github comments

* remove fine tune id arg
dillonalaird authored Feb 14, 2025
1 parent 0141595 commit 14c4046
Showing 6 changed files with 523 additions and 114 deletions.
10 changes: 5 additions & 5 deletions tests/integ/test_tools.py
@@ -13,7 +13,7 @@
countgd_object_detection,
countgd_sam2_instance_segmentation,
countgd_sam2_video_tracking,
-    countgd_visual_prompt_object_detection,
+    countgd_visual_object_detection,
custom_object_detection,
depth_anything_v2,
detr_segmentation,
@@ -366,18 +366,18 @@ def test_countgd_object_detection_empty():
assert result == []


-def test_countgd_visual_prompt_object_detection():
+def test_countgd_visual_object_detection():
img = ski.data.coins()
-    result = countgd_visual_prompt_object_detection(
+    result = countgd_visual_object_detection(
visual_prompts=[[85, 106, 122, 145]],
image=img,
)
assert len(result) == 24
assert [res["label"] for res in result] == ["object"] * 24


-def test_countgd_visual_prompt_object_detection_empty():
-    result = countgd_visual_prompt_object_detection(
+def test_countgd_visual_object_detection_empty():
+    result = countgd_visual_object_detection(
visual_prompts=[[85, 106, 122, 145]],
image=np.zeros((0, 0, 3)).astype(np.uint8),
)
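The renamed tests pass visual prompts as pixel-coordinate `[x1, y1, x2, y2]` exemplar boxes over the `ski.data.coins()` image. As a sketch of what such a box refers to, here is a minimal NumPy crop of the exemplar region (the helper name and the pixel-coordinate assumption are mine, not part of the library):

```python
import numpy as np

def crop_visual_prompt(image: np.ndarray, box: list) -> np.ndarray:
    """Crop the exemplar region a visual prompt box points at.

    Assumes the box is [x1, y1, x2, y2] in pixel coordinates, as the
    test value [85, 106, 122, 145] suggests.
    """
    x1, y1, x2, y2 = box
    return image[y1:y2, x1:x2]

# Same shape as skimage's coins image (303 x 384, grayscale).
img = np.zeros((303, 384), dtype=np.uint8)
patch = crop_visual_prompt(img, [85, 106, 122, 145])
print(patch.shape)  # (39, 37)
```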
29 changes: 10 additions & 19 deletions vision_agent/.sim_tools/df.csv
@@ -1,5 +1,5 @@
desc,doc,name
-"'owlv2_object_detection' is a tool that can detect and count multiple objects given a text prompt such as category names or referring expressions on images. The categories in text prompt are separated by commas. It returns a list of bounding boxes with normalized coordinates, label names and associated probability scores.","owlv2_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.1, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
+"'owlv2_object_detection' is a tool that can detect and count multiple objects given a text prompt such as category names or referring expressions on images. The categories in text prompt are separated by commas. It returns a list of bounding boxes with normalized coordinates, label names and associated probability scores.","owlv2_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.1) -> List[Dict[str, Any]]:
'owlv2_object_detection' is a tool that can detect and count multiple objects
given a text prompt such as category names or referring expressions on images. The
categories in text prompt are separated by commas. It returns a list of bounding
@@ -10,8 +10,6 @@ desc,doc,name
image (np.ndarray): The image to ground the prompt to.
box_threshold (float, optional): The threshold for the box detection. Defaults
to 0.10.
-fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-    fine-tuned model ID here to use it.

Returns:
List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -65,7 +63,7 @@ desc,doc,name
},
]
",owlv2_sam2_instance_segmentation
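The detection tools above return bounding boxes in normalized coordinates. A small sketch of converting them back to pixels (the helper and sample image size are my own, not part of the library):

```python
def denormalize_bbox(bbox, height, width):
    """Convert a normalized [xmin, ymin, xmax, ymax] box to pixel coordinates."""
    xmin, ymin, xmax, ymax = bbox
    return [int(xmin * width), int(ymin * height),
            int(xmax * width), int(ymax * height)]

# A detection shaped like the documented return value.
det = {"score": 0.99, "label": "coyote", "bbox": [0.1, 0.11, 0.35, 0.4]}
print(denormalize_bbox(det["bbox"], 480, 640))  # [64, 52, 224, 192]
```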
-"'owlv2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","owlv2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], box_threshold: float = 0.1, chunk_length: Optional[int] = 25, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
+"'owlv2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","owlv2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], box_threshold: float = 0.1, chunk_length: Optional[int] = 25) -> List[List[Dict[str, Any]]]:
'owlv2_sam2_video_tracking' is a tool that can track and segment multiple
objects in a video given a text prompt such as category names or referring
expressions. The categories in the text prompt are separated by commas. It returns
@@ -79,8 +77,6 @@ desc,doc,name
to 0.10.
chunk_length (Optional[int]): The number of frames to re-run owlv2 to find
new objects.
-fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-    fine-tuned model ID here to use it.

Returns:
List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
@@ -240,7 +236,7 @@ desc,doc,name
{'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
]
",florence2_ocr
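The OCR result format shown above is a list of label/bbox/score dictionaries. A hedged sketch of flattening such results into text, reading top-to-bottom by the box's normalized ymin (the helper is my own, not part of the library):

```python
def ocr_text(results):
    """Concatenate detected text fragments, ordered top-to-bottom by bbox ymin."""
    ordered = sorted(results, key=lambda r: r["bbox"][1])
    return " ".join(r["label"] for r in ordered)

# A result shaped like the documented florence2_ocr return value.
results = [{"label": "hello world", "bbox": [0.1, 0.11, 0.35, 0.4], "score": 0.99}]
print(ocr_text(results))  # hello world
```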
-"'florence2_object_detection' is a tool that can detect multiple objects given a text prompt which can be object names or caption. You can optionally separate the object names in the text with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores of 1.0.","florence2_object_detection(prompt: str, image: numpy.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
+"'florence2_object_detection' is a tool that can detect multiple objects given a text prompt which can be object names or caption. You can optionally separate the object names in the text with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores of 1.0.","florence2_object_detection(prompt: str, image: numpy.ndarray) -> List[Dict[str, Any]]:
'florence2_object_detection' is a tool that can detect multiple objects given a
text prompt which can be object names or caption. You can optionally separate the
object names in the text with commas. It returns a list of bounding boxes with
@@ -250,8 +246,6 @@ desc,doc,name
prompt (str): The prompt to ground to the image. Use exclusive categories that
do not overlap such as 'person, car' and NOT 'person, athlete'.
image (np.ndarray): The image to used to detect objects
-fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-    fine-tuned model ID here to use it.

Returns:
List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -268,7 +262,7 @@ desc,doc,name
{'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
]
",florence2_object_detection
-"'florence2_sam2_instance_segmentation' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores of 1.0.","florence2_sam2_instance_segmentation(prompt: str, image: numpy.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
+"'florence2_sam2_instance_segmentation' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores of 1.0.","florence2_sam2_instance_segmentation(prompt: str, image: numpy.ndarray) -> List[Dict[str, Any]]:
'florence2_sam2_instance_segmentation' is a tool that can segment multiple
objects given a text prompt such as category names or referring expressions. The
categories in the text prompt are separated by commas. It returns a list of
@@ -279,8 +273,6 @@ desc,doc,name
prompt (str): The prompt to ground to the image. Use exclusive categories that
do not overlap such as 'person, car' and NOT 'person, athlete'.
image (np.ndarray): The image to ground the prompt to.
-fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-    fine-tuned model ID here to use it.

Returns:
List[Dict[str, Any]]: A list of dictionaries containing the score, label,
@@ -306,7 +298,7 @@ desc,doc,name
},
]
",florence2_sam2_instance_segmentation
-"'florence2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","florence2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 25, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
+"'florence2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","florence2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 25) -> List[List[Dict[str, Any]]]:
'florence2_sam2_video_tracking' is a tool that can track and segment multiple
objects in a video given a text prompt such as category names or referring
expressions. The categories in the text prompt are separated by commas. It returns
@@ -319,8 +311,6 @@ desc,doc,name
frames (List[np.ndarray]): The list of frames to ground the prompt to.
chunk_length (Optional[int]): The number of frames to re-run florence2 to find
new objects.
-fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-    fine-tuned model ID here to use it.

Returns:
List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
@@ -484,16 +474,17 @@ desc,doc,name
>>> activity_recognition('Did a goal happened?', frames)
[0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
",activity_recognition
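`activity_recognition` returns one score per frame, as in the example output above. A minimal sketch (helper name is mine) of turning those scores into the frame indices where the activity occurred:

```python
def frames_with_activity(scores, threshold=0.5):
    """Return indices of frames whose per-frame score meets the threshold."""
    return [i for i, s in enumerate(scores) if s >= threshold]

# Per-frame scores as shown in the activity_recognition example output.
scores = [0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
print(frames_with_activity(scores))  # [3, 4]
```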
-'depth_anything_v2' is a tool that runs depth_anythingv2 model to generate a depth image from a given RGB image. The returned depth image is monochrome and represents depth values as pixel intesities with pixel values ranging from 0 to 255.,"depth_anything_v2(image: numpy.ndarray) -> numpy.ndarray:
-'depth_anything_v2' is a tool that runs depth_anythingv2 model to generate a
+'depth_anything_v2' is a tool that runs depth anything v2 model to generate a depth image from a given RGB image. The returned depth image is monochrome and represents depth values as pixel intensities with pixel values ranging from 0 to 255.,"depth_anything_v2(image: numpy.ndarray) -> numpy.ndarray:
+'depth_anything_v2' is a tool that runs depth anything v2 model to generate a
depth image from a given RGB image. The returned depth image is monochrome and
-represents depth values as pixel intesities with pixel values ranging from 0 to 255.
+represents depth values as pixel intensities with pixel values ranging from 0 to 255.

Parameters:
image (np.ndarray): The image to used to generate depth image

Returns:
-np.ndarray: A grayscale depth image with pixel values ranging from 0 to 255.
+np.ndarray: A grayscale depth image with pixel values ranging from 0 to 255
+where high values represent closer objects and low values further.

Example
-------
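Since the updated docstring says higher pixel values mean closer surfaces, locating the nearest point in a returned depth map is just an argmax. A minimal NumPy sketch on synthetic data (the helper is mine, not part of the library):

```python
import numpy as np

def closest_point(depth: np.ndarray):
    """Find the (row, col) of the closest surface in a depth_anything_v2-style
    map, where higher intensity (0-255) means closer."""
    return np.unravel_index(np.argmax(depth), depth.shape)

depth = np.zeros((4, 4), dtype=np.uint8)
depth[2, 1] = 255  # pretend the closest surface is here
print(closest_point(depth))  # (2, 1)
```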
Binary file modified vision_agent/.sim_tools/embs.npy
8 changes: 7 additions & 1 deletion vision_agent/tools/__init__.py
@@ -23,7 +23,8 @@
countgd_object_detection,
countgd_sam2_instance_segmentation,
countgd_sam2_video_tracking,
-    countgd_visual_prompt_object_detection,
+    countgd_sam2_visual_instance_segmentation,
+    countgd_visual_object_detection,
custom_object_detection,
depth_anything_v2,
detr_segmentation,
@@ -41,6 +42,9 @@
get_tools_df,
get_tools_docstring,
get_utilties_docstring,
+    glee_object_detection,
+    glee_sam2_instance_segmentation,
+    glee_sam2_video_tracking,
load_image,
minimum_distance,
ocr,
@@ -53,6 +57,8 @@
owlv2_sam2_video_tracking,
qwen2_vl_images_vqa,
qwen2_vl_video_vqa,
+    qwen25_vl_images_vqa,
+    qwen25_vl_video_vqa,
sam2,
save_image,
save_json,