From 28e0958ff4e73cb0beab68fa31de628654162474 Mon Sep 17 00:00:00 2001
From: Eugene Yurtsev
Date: Wed, 7 Aug 2024 10:04:58 -0400
Subject: [PATCH] core[patch]: Relax rate limit unit tests in terms of timing
 (#25140)

Relax rate limit unit tests

---
 .../chat_models/test_rate_limiting.py         | 146 +++++++++---------
 1 file changed, 75 insertions(+), 71 deletions(-)

diff --git a/libs/core/tests/unit_tests/language_models/chat_models/test_rate_limiting.py b/libs/core/tests/unit_tests/language_models/chat_models/test_rate_limiting.py
index 2d251aadb92..ac633b8263f 100644
--- a/libs/core/tests/unit_tests/language_models/chat_models/test_rate_limiting.py
+++ b/libs/core/tests/unit_tests/language_models/chat_models/test_rate_limiting.py
@@ -9,9 +9,12 @@
 def test_rate_limit_invoke() -> None:
     """Add rate limiter."""
     model = GenericFakeChatModel(
-        messages=iter(["hello", "world", "!"]),
+        messages=iter(["hello", "world"]),
         rate_limiter=InMemoryRateLimiter(
-            requests_per_second=200, check_every_n_seconds=0.01, max_bucket_size=10
+            requests_per_second=20,
+            check_every_n_seconds=0.1,
+            max_bucket_size=10,
+            # At 20 requests per second we see a refresh every 0.05 seconds
         ),
     )
     tic = time.time()
@@ -19,22 +22,14 @@ def test_rate_limit_invoke() -> None:
     toc = time.time()
     # Should be larger than check every n seconds since the token bucket starts
     # with 0 tokens.
-    assert 0.01 < toc - tic < 0.02
+    assert 0.10 < toc - tic < 0.15
 
     tic = time.time()
     model.invoke("foo")
     toc = time.time()
-    # The second time we call the model, we should have 1 extra token
-    # to proceed immediately.
-    assert toc - tic < 0.005
-
-    # The third time we call the model, we need to wait again for a token
-    tic = time.time()
-    model.invoke("foo")
-    toc = time.time()
-    # Should be larger than check every n seconds since the token bucket starts
-    # with 0 tokens.
-    assert 0.01 < toc - tic < 0.02
+    # Second time we check the model, we should have 1 extra token
+    # since the sleep time is 0.1 seconds
+    assert 0.00 < toc - tic < 0.10
 
 
 async def test_rate_limit_ainvoke() -> None:
@@ -43,7 +38,10 @@ async def test_rate_limit_ainvoke() -> None:
     model = GenericFakeChatModel(
         messages=iter(["hello", "world", "!"]),
         rate_limiter=InMemoryRateLimiter(
-            requests_per_second=20, check_every_n_seconds=0.1, max_bucket_size=10
+            requests_per_second=20,
+            check_every_n_seconds=0.1,
+            max_bucket_size=10,
+            # At 20 requests per second we see a refresh every 0.05 seconds
         ),
     )
     tic = time.time()
@@ -58,7 +56,7 @@ async def test_rate_limit_ainvoke() -> None:
     toc = time.time()
     # The second time we call the model, we should have 1 extra token
     # to proceed immediately.
-    assert toc - tic < 0.01
+    assert toc - tic < 0.1
 
     # The third time we call the model, we need to wait again for a token
     tic = time.time()
@@ -74,17 +72,16 @@ def test_rate_limit_batch() -> None:
     model = GenericFakeChatModel(
         messages=iter(["hello", "world", "!"]),
         rate_limiter=InMemoryRateLimiter(
-            requests_per_second=200, check_every_n_seconds=0.01, max_bucket_size=10
+            requests_per_second=20,
+            check_every_n_seconds=0.01,
+            max_bucket_size=10,
+            # At 20 requests per second we see a refresh every 0.05 seconds
         ),
     )
-    # Need 2 tokens to proceed
-    time_to_fill = 2 / 200.0
     tic = time.time()
     model.batch(["foo", "foo"])
     toc = time.time()
-    # Should be larger than check every n seconds since the token bucket starts
-    # with 0 tokens.
-    assert time_to_fill < toc - tic < time_to_fill + 0.03
+    assert 0.1 < toc - tic < 0.2
 
 
 async def test_rate_limit_abatch() -> None:
@@ -92,17 +89,16 @@ async def test_rate_limit_abatch() -> None:
     model = GenericFakeChatModel(
         messages=iter(["hello", "world", "!"]),
         rate_limiter=InMemoryRateLimiter(
-            requests_per_second=200, check_every_n_seconds=0.01, max_bucket_size=10
+            requests_per_second=20,
+            check_every_n_seconds=0.01,
+            max_bucket_size=10,
+            # At 20 requests per second we see a refresh every 0.05 seconds
         ),
     )
-    # Need 2 tokens to proceed
-    time_to_fill = 2 / 200.0
     tic = time.time()
     await model.abatch(["foo", "foo"])
     toc = time.time()
-    # Should be larger than check every n seconds since the token bucket starts
-    # with 0 tokens.
-    assert time_to_fill < toc - tic < time_to_fill + 0.03
+    assert 0.1 < toc - tic < 0.2
 
 
 def test_rate_limit_stream() -> None:
@@ -110,7 +106,10 @@ def test_rate_limit_stream() -> None:
     model = GenericFakeChatModel(
         messages=iter(["hello world", "hello world", "hello world"]),
         rate_limiter=InMemoryRateLimiter(
-            requests_per_second=200, check_every_n_seconds=0.01, max_bucket_size=10
+            requests_per_second=20,
+            check_every_n_seconds=0.1,
+            max_bucket_size=10,
+            # At 20 requests per second we see a refresh every 0.05 seconds
         ),
     )
     # Check astream
@@ -119,52 +118,54 @@ def test_rate_limit_stream() -> None:
     assert [msg.content for msg in response] == ["hello", " ", "world"]
     toc = time.time()
     # Should be larger than check every n seconds since the token bucket starts
-    assert 0.01 < toc - tic < 0.02  # Slightly smaller than check every n seconds
-
-    # Second time around we should have 1 token left
-    tic = time.time()
-    response = list(model.stream("foo"))
-    assert [msg.content for msg in response] == ["hello", " ", "world"]
-    toc = time.time()
-    # Should be larger than check every n seconds since the token bucket starts
-    assert toc - tic < 0.005  # Slightly smaller than check every n seconds
-
-    # Third time around we should have 0 tokens left
-    tic = time.time()
-    response = list(model.stream("foo"))
-    assert [msg.content for msg in response] == ["hello", " ", "world"]
-    toc = time.time()
-    # Should be larger than check every n seconds since the token bucket starts
-    assert 0.01 < toc - tic < 0.02  # Slightly smaller than check every n seconds
-
-
-async def test_rate_limit_astream() -> None:
-    """Test rate limiting astream."""
-    rate_limiter = InMemoryRateLimiter(
-        requests_per_second=20, check_every_n_seconds=0.1, max_bucket_size=10
-    )
-    model = GenericFakeChatModel(
-        messages=iter(["hello world", "hello world", "hello world"]),
-        rate_limiter=rate_limiter,
-    )
-    # Check astream
-    tic = time.time()
-    response = [chunk async for chunk in model.astream("foo")]
-    assert [msg.content for msg in response] == ["hello", " ", "world"]
-    toc = time.time()
     assert 0.1 < toc - tic < 0.2
 
     # Second time around we should have 1 token left
     tic = time.time()
-    response = [chunk async for chunk in model.astream("foo")]
+    response = list(model.stream("foo"))
     assert [msg.content for msg in response] == ["hello", " ", "world"]
     toc = time.time()
     # Should be larger than check every n seconds since the token bucket starts
-    assert toc - tic < 0.01  # Slightly smaller than check every n seconds
+    assert toc - tic < 0.1  # Slightly smaller than check every n seconds
 
     # Third time around we should have 0 tokens left
     tic = time.time()
-    response = [chunk async for chunk in model.astream("foo")]
+    response = list(model.stream("foo"))
+    assert [msg.content for msg in response] == ["hello", " ", "world"]
+    toc = time.time()
+    assert 0.1 < toc - tic < 0.2
+
+
+async def test_rate_limit_astream() -> None:
+    """Test rate limiting astream."""
+    model = GenericFakeChatModel(
+        messages=iter(["hello world", "hello world", "hello world"]),
+        rate_limiter=InMemoryRateLimiter(
+            requests_per_second=20,
+            check_every_n_seconds=0.1,
+            max_bucket_size=10,
+            # At 20 requests per second we see a refresh every 0.05 seconds
+        ),
+    )
+    # Check astream
+    tic = time.time()
+    response = [msg async for msg in model.astream("foo")]
+    assert [msg.content for msg in response] == ["hello", " ", "world"]
+    toc = time.time()
+    # Should be larger than check every n seconds since the token bucket starts
+    assert 0.1 < toc - tic < 0.2
+
+    # Second time around we should have 1 token left
+    tic = time.time()
+    response = [msg async for msg in model.astream("foo")]
+    assert [msg.content for msg in response] == ["hello", " ", "world"]
+    toc = time.time()
+    # Should be larger than check every n seconds since the token bucket starts
+    assert toc - tic < 0.1  # Slightly smaller than check every n seconds
+
+    # Third time around we should have 0 tokens left
+    tic = time.time()
+    response = [msg async for msg in model.astream("foo")]
     assert [msg.content for msg in response] == ["hello", " ", "world"]
     toc = time.time()
     assert 0.1 < toc - tic < 0.2
@@ -176,7 +177,10 @@ def test_rate_limit_skips_cache() -> None:
     model = GenericFakeChatModel(
         messages=iter(["hello", "world", "!"]),
         rate_limiter=InMemoryRateLimiter(
-            requests_per_second=100, check_every_n_seconds=0.01, max_bucket_size=1
+            requests_per_second=20,
+            check_every_n_seconds=0.1,
+            max_bucket_size=1,
+            # At 20 requests per second we see a refresh every 0.05 seconds
         ),
         cache=cache,
     )
@@ -186,7 +190,7 @@ def test_rate_limit_skips_cache() -> None:
     toc = time.time()
     # Should be larger than check every n seconds since the token bucket starts
     # with 0 tokens.
-    assert 0.01 < toc - tic < 0.02
+    assert 0.1 < toc - tic < 0.2
 
     for _ in range(2):
         # Cache hits
@@ -195,7 +199,7 @@ def test_rate_limit_skips_cache() -> None:
     toc = time.time()
     # Should be larger than check every n seconds since the token bucket starts
     # with 0 tokens.
-    assert toc - tic < 0.005
+    assert toc - tic < 0.05
 
     # Test verifies that there's only a single key
     # Test also verifies that rate_limiter information is not part of the
@@ -236,7 +240,7 @@ async def test_rate_limit_skips_cache_async() -> None:
     model = GenericFakeChatModel(
         messages=iter(["hello", "world", "!"]),
         rate_limiter=InMemoryRateLimiter(
-            requests_per_second=100, check_every_n_seconds=0.01, max_bucket_size=1
+            requests_per_second=20, check_every_n_seconds=0.1, max_bucket_size=1
         ),
         cache=cache,
     )
@@ -246,7 +250,7 @@ async def test_rate_limit_skips_cache_async() -> None:
     toc = time.time()
     # Should be larger than check every n seconds since the token bucket starts
     # with 0 tokens.
-    assert 0.01 < toc - tic < 0.02
+    assert 0.1 < toc - tic < 0.2
 
     for _ in range(2):
         # Cache hits
@@ -255,4 +259,4 @@ async def test_rate_limit_skips_cache_async() -> None:
     toc = time.time()
     # Should be larger than check every n seconds since the token bucket starts
     # with 0 tokens.
-    assert toc - tic < 0.005
+    assert toc - tic < 0.05