From 28e0958ff4e73cb0beab68fa31de628654162474 Mon Sep 17 00:00:00 2001
From: Eugene Yurtsev
Date: Wed, 7 Aug 2024 10:04:58 -0400
Subject: [PATCH] core[patch]: Relax rate limit unit tests in terms of timing
 (#25140)

Relax rate limit unit tests

---
 .../chat_models/test_rate_limiting.py         | 146 +++++++++---------
 1 file changed, 75 insertions(+), 71 deletions(-)

diff --git a/libs/core/tests/unit_tests/language_models/chat_models/test_rate_limiting.py b/libs/core/tests/unit_tests/language_models/chat_models/test_rate_limiting.py
index 2d251aadb92..ac633b8263f 100644
--- a/libs/core/tests/unit_tests/language_models/chat_models/test_rate_limiting.py
+++ b/libs/core/tests/unit_tests/language_models/chat_models/test_rate_limiting.py
@@ -9,9 +9,12 @@
 def test_rate_limit_invoke() -> None:
     """Add rate limiter."""
     model = GenericFakeChatModel(
-        messages=iter(["hello", "world", "!"]),
+        messages=iter(["hello", "world"]),
         rate_limiter=InMemoryRateLimiter(
-            requests_per_second=200, check_every_n_seconds=0.01, max_bucket_size=10
+            requests_per_second=20,
+            check_every_n_seconds=0.1,
+            max_bucket_size=10,
+            # At 20 requests per second we see a refresh every 0.05 seconds
         ),
     )
     tic = time.time()
@@ -19,22 +22,14 @@ def test_rate_limit_invoke() -> None:
     toc = time.time()
     # Should be larger than check every n seconds since the token bucket starts
     # with 0 tokens.
-    assert 0.01 < toc - tic < 0.02
+    assert 0.10 < toc - tic < 0.15
 
     tic = time.time()
     model.invoke("foo")
     toc = time.time()
-    # The second time we call the model, we should have 1 extra token
-    # to proceed immediately.
-    assert toc - tic < 0.005
-
-    # The third time we call the model, we need to wait again for a token
-    tic = time.time()
-    model.invoke("foo")
-    toc = time.time()
-    # Should be larger than check every n seconds since the token bucket starts
-    # with 0 tokens.
-    assert 0.01 < toc - tic < 0.02
+    # Second time we check the model, we should have 1 extra token
+    # since the sleep time is 0.1 seconds
+    assert 0.00 < toc - tic < 0.10
 
 
 async def test_rate_limit_ainvoke() -> None:
@@ -43,7 +38,10 @@ async def test_rate_limit_ainvoke() -> None:
     model = GenericFakeChatModel(
         messages=iter(["hello", "world", "!"]),
         rate_limiter=InMemoryRateLimiter(
-            requests_per_second=20, check_every_n_seconds=0.1, max_bucket_size=10
+            requests_per_second=20,
+            check_every_n_seconds=0.1,
+            max_bucket_size=10,
+            # At 20 requests per second we see a refresh every 0.05 seconds
         ),
     )
     tic = time.time()
@@ -58,7 +56,7 @@ async def test_rate_limit_ainvoke() -> None:
     toc = time.time()
     # The second time we call the model, we should have 1 extra token
     # to proceed immediately.
-    assert toc - tic < 0.01
+    assert toc - tic < 0.1
 
     # The third time we call the model, we need to wait again for a token
     tic = time.time()
@@ -74,17 +72,16 @@ def test_rate_limit_batch() -> None:
     model = GenericFakeChatModel(
         messages=iter(["hello", "world", "!"]),
         rate_limiter=InMemoryRateLimiter(
-            requests_per_second=200, check_every_n_seconds=0.01, max_bucket_size=10
+            requests_per_second=20,
+            check_every_n_seconds=0.01,
+            max_bucket_size=10,
+            # At 20 requests per second we see a refresh every 0.05 seconds
         ),
     )
-    # Need 2 tokens to proceed
-    time_to_fill = 2 / 200.0
     tic = time.time()
     model.batch(["foo", "foo"])
     toc = time.time()
-    # Should be larger than check every n seconds since the token bucket starts
-    # with 0 tokens.
-    assert time_to_fill < toc - tic < time_to_fill + 0.03
+    assert 0.1 < toc - tic < 0.2
 
 
 async def test_rate_limit_abatch() -> None:
@@ -92,17 +89,16 @@ async def test_rate_limit_abatch() -> None:
     model = GenericFakeChatModel(
         messages=iter(["hello", "world", "!"]),
         rate_limiter=InMemoryRateLimiter(
-            requests_per_second=200, check_every_n_seconds=0.01, max_bucket_size=10
+            requests_per_second=20,
+            check_every_n_seconds=0.01,
+            max_bucket_size=10,
+            # At 20 requests per second we see a refresh every 0.05 seconds
         ),
     )
-    # Need 2 tokens to proceed
-    time_to_fill = 2 / 200.0
     tic = time.time()
     await model.abatch(["foo", "foo"])
     toc = time.time()
-    # Should be larger than check every n seconds since the token bucket starts
-    # with 0 tokens.
-    assert time_to_fill < toc - tic < time_to_fill + 0.03
+    assert 0.1 < toc - tic < 0.2
 
 
 def test_rate_limit_stream() -> None:
@@ -110,7 +106,10 @@ def test_rate_limit_stream() -> None:
     model = GenericFakeChatModel(
         messages=iter(["hello world", "hello world", "hello world"]),
         rate_limiter=InMemoryRateLimiter(
-            requests_per_second=200, check_every_n_seconds=0.01, max_bucket_size=10
+            requests_per_second=20,
+            check_every_n_seconds=0.1,
+            max_bucket_size=10,
+            # At 20 requests per second we see a refresh every 0.05 seconds
         ),
     )
     # Check astream
@@ -119,52 +118,54 @@ def test_rate_limit_stream() -> None:
     assert [msg.content for msg in response] == ["hello", " ", "world"]
     toc = time.time()
     # Should be larger than check every n seconds since the token bucket starts
-    assert 0.01 < toc - tic < 0.02  # Slightly smaller than check every n seconds
-
-    # Second time around we should have 1 token left
-    tic = time.time()
-    response = list(model.stream("foo"))
-    assert [msg.content for msg in response] == ["hello", " ", "world"]
-    toc = time.time()
-    # Should be larger than check every n seconds since the token bucket starts
-    assert toc - tic < 0.005  # Slightly smaller than check every n seconds
-
-    # Third time around we should have 0 tokens left
-    tic = time.time()
-    response = list(model.stream("foo"))
-    assert [msg.content for msg in response] == ["hello", " ", "world"]
-    toc = time.time()
-    # Should be larger than check every n seconds since the token bucket starts
-    assert 0.01 < toc - tic < 0.02  # Slightly smaller than check every n seconds
-
-
-async def test_rate_limit_astream() -> None:
-    """Test rate limiting astream."""
-    rate_limiter = InMemoryRateLimiter(
-        requests_per_second=20, check_every_n_seconds=0.1, max_bucket_size=10
-    )
-    model = GenericFakeChatModel(
-        messages=iter(["hello world", "hello world", "hello world"]),
-        rate_limiter=rate_limiter,
-    )
-    # Check astream
-    tic = time.time()
-    response = [chunk async for chunk in model.astream("foo")]
-    assert [msg.content for msg in response] == ["hello", " ", "world"]
-    toc = time.time()
     assert 0.1 < toc - tic < 0.2
 
     # Second time around we should have 1 token left
     tic = time.time()
-    response = [chunk async for chunk in model.astream("foo")]
+    response = list(model.stream("foo"))
     assert [msg.content for msg in response] == ["hello", " ", "world"]
     toc = time.time()
     # Should be larger than check every n seconds since the token bucket starts
-    assert toc - tic < 0.01  # Slightly smaller than check every n seconds
+    assert toc - tic < 0.1  # Slightly smaller than check every n seconds
 
     # Third time around we should have 0 tokens left
     tic = time.time()
-    response = [chunk async for chunk in model.astream("foo")]
+    response = list(model.stream("foo"))
+    assert [msg.content for msg in response] == ["hello", " ", "world"]
+    toc = time.time()
+    assert 0.1 < toc - tic < 0.2
+
+
+async def test_rate_limit_astream() -> None:
+    """Test rate limiting astream."""
+    model = GenericFakeChatModel(
+        messages=iter(["hello world", "hello world", "hello world"]),
+        rate_limiter=InMemoryRateLimiter(
+            requests_per_second=20,
+            check_every_n_seconds=0.1,
+            max_bucket_size=10,
+            # At 20 requests per second we see a refresh every 0.05 seconds
+        ),
+    )
+    # Check astream
+    tic = time.time()
+    response = [msg async for msg in model.astream("foo")]
+    assert [msg.content for msg in response] == ["hello", " ", "world"]
+    toc = time.time()
+    # Should be larger than check every n seconds since the token bucket starts
+    assert 0.1 < toc - tic < 0.2
+
+    # Second time around we should have 1 token left
+    tic = time.time()
+    response = [msg async for msg in model.astream("foo")]
+    assert [msg.content for msg in response] == ["hello", " ", "world"]
+    toc = time.time()
+    # Should be larger than check every n seconds since the token bucket starts
+    assert toc - tic < 0.1  # Slightly smaller than check every n seconds
+
+    # Third time around we should have 0 tokens left
+    tic = time.time()
+    response = [msg async for msg in model.astream("foo")]
     assert [msg.content for msg in response] == ["hello", " ", "world"]
     toc = time.time()
     assert 0.1 < toc - tic < 0.2
@@ -176,7 +177,10 @@ def test_rate_limit_skips_cache() -> None:
     model = GenericFakeChatModel(
         messages=iter(["hello", "world", "!"]),
         rate_limiter=InMemoryRateLimiter(
-            requests_per_second=100, check_every_n_seconds=0.01, max_bucket_size=1
+            requests_per_second=20,
+            check_every_n_seconds=0.1,
+            max_bucket_size=1,
+            # At 20 requests per second we see a refresh every 0.05 seconds
         ),
         cache=cache,
     )
@@ -186,7 +190,7 @@ def test_rate_limit_skips_cache() -> None:
     toc = time.time()
     # Should be larger than check every n seconds since the token bucket starts
     # with 0 tokens.
-    assert 0.01 < toc - tic < 0.02
+    assert 0.1 < toc - tic < 0.2
 
     for _ in range(2):
         # Cache hits
@@ -195,7 +199,7 @@ def test_rate_limit_skips_cache() -> None:
     toc = time.time()
     # Should be larger than check every n seconds since the token bucket starts
     # with 0 tokens.
-    assert toc - tic < 0.005
+    assert toc - tic < 0.05
 
     # Test verifies that there's only a single key
     # Test also verifies that rate_limiter information is not part of the
@@ -236,7 +240,7 @@ async def test_rate_limit_skips_cache_async() -> None:
     model = GenericFakeChatModel(
         messages=iter(["hello", "world", "!"]),
         rate_limiter=InMemoryRateLimiter(
-            requests_per_second=100, check_every_n_seconds=0.01, max_bucket_size=1
+            requests_per_second=20, check_every_n_seconds=0.1, max_bucket_size=1
         ),
         cache=cache,
     )
@@ -246,7 +250,7 @@ async def test_rate_limit_skips_cache_async() -> None:
     toc = time.time()
     # Should be larger than check every n seconds since the token bucket starts
     # with 0 tokens.
-    assert 0.01 < toc - tic < 0.02
+    assert 0.1 < toc - tic < 0.2
 
     for _ in range(2):
         # Cache hits
@@ -255,4 +259,4 @@ async def test_rate_limit_skips_cache_async() -> None:
     toc = time.time()
     # Should be larger than check every n seconds since the token bucket starts
     # with 0 tokens.
-    assert toc - tic < 0.005
+    assert toc - tic < 0.05