core[patch]: Relax rate limit unit tests in terms of timing (#25140)

Relax rate limit unit tests
Eugene Yurtsev 2024-08-07 10:04:58 -04:00 committed by GitHub
parent a2e9910268
commit 28e0958ff4

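For context, the tests below drive GenericFakeChatModel through an InMemoryRateLimiter token bucket. Here is a minimal sketch, not part of the commit, of the timing behaviour the relaxed bounds rely on; it assumes InMemoryRateLimiter is importable from langchain_core.rate_limiters and exposes a blocking acquire(), and it mirrors the new test parameters rather than exact guarantees:

import time

from langchain_core.rate_limiters import InMemoryRateLimiter

# Illustrative sketch only. The bucket starts empty; roughly one token is
# added every 1/20 = 0.05 s, and a blocked caller re-checks the bucket
# every check_every_n_seconds.
limiter = InMemoryRateLimiter(
    requests_per_second=20,
    check_every_n_seconds=0.1,
    max_bucket_size=10,
)

tic = time.time()
limiter.acquire()  # first call blocks for about one check interval (~0.1 s)
print(f"first acquire:  {time.time() - tic:.3f}s")

tic = time.time()
limiter.acquire()  # a spare token accrued during the wait, so this is fast
print(f"second acquire: {time.time() - tic:.3f}s")

Because real sleeps rather than exact token arithmetic dominate these figures, the assertions below are widened to coarse windows (e.g. 0.10-0.15 s) instead of the old millisecond-tight bounds.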

@@ -9,9 +9,12 @@ def test_rate_limit_invoke() -> None:
     """Add rate limiter."""
     model = GenericFakeChatModel(
-        messages=iter(["hello", "world", "!"]),
+        messages=iter(["hello", "world"]),
         rate_limiter=InMemoryRateLimiter(
-            requests_per_second=200, check_every_n_seconds=0.01, max_bucket_size=10
+            requests_per_second=20,
+            check_every_n_seconds=0.1,
+            max_bucket_size=10,
+            # At 20 requests per second we see a refresh every 0.05 seconds
         ),
     )
     tic = time.time()
@@ -19,22 +22,14 @@ def test_rate_limit_invoke() -> None:
     toc = time.time()
     # Should be larger than check every n seconds since the token bucket starts
     # with 0 tokens.
-    assert 0.01 < toc - tic < 0.02
+    assert 0.10 < toc - tic < 0.15

     tic = time.time()
     model.invoke("foo")
     toc = time.time()
-    # The second time we call the model, we should have 1 extra token
-    # to proceed immediately.
-    assert toc - tic < 0.005
-    # The third time we call the model, we need to wait again for a token
-    tic = time.time()
-    model.invoke("foo")
-    toc = time.time()
-    # Should be larger than check every n seconds since the token bucket starts
-    # with 0 tokens.
-    assert 0.01 < toc - tic < 0.02
+    # Second time we check the model, we should have 1 extra token
+    # since the sleep time is 0.1 seconds
+    assert 0.00 < toc - tic < 0.10


 async def test_rate_limit_ainvoke() -> None:
@@ -43,7 +38,10 @@ async def test_rate_limit_ainvoke() -> None:
     model = GenericFakeChatModel(
         messages=iter(["hello", "world", "!"]),
         rate_limiter=InMemoryRateLimiter(
-            requests_per_second=20, check_every_n_seconds=0.1, max_bucket_size=10
+            requests_per_second=20,
+            check_every_n_seconds=0.1,
+            max_bucket_size=10,
+            # At 20 requests per second we see a refresh every 0.05 seconds
         ),
     )
     tic = time.time()
@@ -58,7 +56,7 @@ async def test_rate_limit_ainvoke() -> None:
     toc = time.time()
     # The second time we call the model, we should have 1 extra token
     # to proceed immediately.
-    assert toc - tic < 0.01
+    assert toc - tic < 0.1

     # The third time we call the model, we need to wait again for a token
     tic = time.time()
@@ -74,17 +72,16 @@ def test_rate_limit_batch() -> None:
     model = GenericFakeChatModel(
         messages=iter(["hello", "world", "!"]),
         rate_limiter=InMemoryRateLimiter(
-            requests_per_second=200, check_every_n_seconds=0.01, max_bucket_size=10
+            requests_per_second=20,
+            check_every_n_seconds=0.01,
+            max_bucket_size=10,
+            # At 20 requests per second we see a refresh every 0.05 seconds
         ),
     )
-    # Need 2 tokens to proceed
-    time_to_fill = 2 / 200.0
     tic = time.time()
     model.batch(["foo", "foo"])
     toc = time.time()
-    # Should be larger than check every n seconds since the token bucket starts
-    # with 0 tokens.
-    assert time_to_fill < toc - tic < time_to_fill + 0.03
+    assert 0.1 < toc - tic < 0.2


 async def test_rate_limit_abatch() -> None:
@@ -92,17 +89,16 @@ async def test_rate_limit_abatch() -> None:
     model = GenericFakeChatModel(
         messages=iter(["hello", "world", "!"]),
         rate_limiter=InMemoryRateLimiter(
-            requests_per_second=200, check_every_n_seconds=0.01, max_bucket_size=10
+            requests_per_second=20,
+            check_every_n_seconds=0.01,
+            max_bucket_size=10,
+            # At 20 requests per second we see a refresh every 0.05 seconds
         ),
     )
-    # Need 2 tokens to proceed
-    time_to_fill = 2 / 200.0
     tic = time.time()
     await model.abatch(["foo", "foo"])
     toc = time.time()
-    # Should be larger than check every n seconds since the token bucket starts
-    # with 0 tokens.
-    assert time_to_fill < toc - tic < time_to_fill + 0.03
+    assert 0.1 < toc - tic < 0.2


 def test_rate_limit_stream() -> None:
@@ -110,7 +106,10 @@ def test_rate_limit_stream() -> None:
     model = GenericFakeChatModel(
         messages=iter(["hello world", "hello world", "hello world"]),
         rate_limiter=InMemoryRateLimiter(
-            requests_per_second=200, check_every_n_seconds=0.01, max_bucket_size=10
+            requests_per_second=20,
+            check_every_n_seconds=0.1,
+            max_bucket_size=10,
+            # At 20 requests per second we see a refresh every 0.05 seconds
         ),
     )
     # Check astream
@@ -119,52 +118,54 @@ def test_rate_limit_stream() -> None:
     assert [msg.content for msg in response] == ["hello", " ", "world"]
     toc = time.time()
     # Should be larger than check every n seconds since the token bucket starts
-    assert 0.01 < toc - tic < 0.02  # Slightly smaller than check every n seconds
+    assert 0.1 < toc - tic < 0.2

     # Second time around we should have 1 token left
     tic = time.time()
     response = list(model.stream("foo"))
     assert [msg.content for msg in response] == ["hello", " ", "world"]
     toc = time.time()
     # Should be larger than check every n seconds since the token bucket starts
-    assert toc - tic < 0.005  # Slightly smaller than check every n seconds
+    assert toc - tic < 0.1  # Slightly smaller than check every n seconds

     # Third time around we should have 0 tokens left
     tic = time.time()
     response = list(model.stream("foo"))
     assert [msg.content for msg in response] == ["hello", " ", "world"]
     toc = time.time()
-    # Should be larger than check every n seconds since the token bucket starts
-    assert 0.01 < toc - tic < 0.02  # Slightly smaller than check every n seconds
+    assert 0.1 < toc - tic < 0.2


 async def test_rate_limit_astream() -> None:
     """Test rate limiting astream."""
-    rate_limiter = InMemoryRateLimiter(
-        requests_per_second=20, check_every_n_seconds=0.1, max_bucket_size=10
-    )
     model = GenericFakeChatModel(
         messages=iter(["hello world", "hello world", "hello world"]),
-        rate_limiter=rate_limiter,
+        rate_limiter=InMemoryRateLimiter(
+            requests_per_second=20,
+            check_every_n_seconds=0.1,
+            max_bucket_size=10,
+            # At 20 requests per second we see a refresh every 0.05 seconds
+        ),
     )
     # Check astream
     tic = time.time()
-    response = [chunk async for chunk in model.astream("foo")]
+    response = [msg async for msg in model.astream("foo")]
     assert [msg.content for msg in response] == ["hello", " ", "world"]
     toc = time.time()
+    # Should be larger than check every n seconds since the token bucket starts
     assert 0.1 < toc - tic < 0.2

     # Second time around we should have 1 token left
     tic = time.time()
-    response = [chunk async for chunk in model.astream("foo")]
+    response = [msg async for msg in model.astream("foo")]
     assert [msg.content for msg in response] == ["hello", " ", "world"]
     toc = time.time()
     # Should be larger than check every n seconds since the token bucket starts
-    assert toc - tic < 0.01  # Slightly smaller than check every n seconds
+    assert toc - tic < 0.1  # Slightly smaller than check every n seconds

     # Third time around we should have 0 tokens left
     tic = time.time()
-    response = [chunk async for chunk in model.astream("foo")]
+    response = [msg async for msg in model.astream("foo")]
     assert [msg.content for msg in response] == ["hello", " ", "world"]
     toc = time.time()
     assert 0.1 < toc - tic < 0.2
@@ -176,7 +177,10 @@ def test_rate_limit_skips_cache() -> None:
     model = GenericFakeChatModel(
         messages=iter(["hello", "world", "!"]),
         rate_limiter=InMemoryRateLimiter(
-            requests_per_second=100, check_every_n_seconds=0.01, max_bucket_size=1
+            requests_per_second=20,
+            check_every_n_seconds=0.1,
+            max_bucket_size=1,
+            # At 20 requests per second we see a refresh every 0.05 seconds
         ),
         cache=cache,
     )
@@ -186,7 +190,7 @@ def test_rate_limit_skips_cache() -> None:
     toc = time.time()
     # Should be larger than check every n seconds since the token bucket starts
     # with 0 tokens.
-    assert 0.01 < toc - tic < 0.02
+    assert 0.1 < toc - tic < 0.2

     for _ in range(2):
         # Cache hits
@@ -195,7 +199,7 @@ def test_rate_limit_skips_cache() -> None:
     toc = time.time()
     # Should be larger than check every n seconds since the token bucket starts
     # with 0 tokens.
-    assert toc - tic < 0.005
+    assert toc - tic < 0.05

     # Test verifies that there's only a single key
     # Test also verifies that rate_limiter information is not part of the
@@ -236,7 +240,7 @@ async def test_rate_limit_skips_cache_async() -> None:
     model = GenericFakeChatModel(
         messages=iter(["hello", "world", "!"]),
         rate_limiter=InMemoryRateLimiter(
-            requests_per_second=100, check_every_n_seconds=0.01, max_bucket_size=1
+            requests_per_second=20, check_every_n_seconds=0.1, max_bucket_size=1
         ),
         cache=cache,
     )
@@ -246,7 +250,7 @@ async def test_rate_limit_skips_cache_async() -> None:
     toc = time.time()
     # Should be larger than check every n seconds since the token bucket starts
     # with 0 tokens.
-    assert 0.01 < toc - tic < 0.02
+    assert 0.1 < toc - tic < 0.2

     for _ in range(2):
         # Cache hits
@@ -255,4 +259,4 @@ async def test_rate_limit_skips_cache_async() -> None:
     toc = time.time()
     # Should be larger than check every n seconds since the token bucket starts
     # with 0 tokens.
-    assert toc - tic < 0.005
+    assert toc - tic < 0.05
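The batch hunks above also drop the old time_to_fill = 2 / 200.0 bookkeeping in favour of fixed windows. A rough back-of-the-envelope check of those new bounds, illustrative only and not part of the commit:

# Two requests in a batch need two tokens from the bucket.
tokens_needed = 2
requests_per_second = 20
time_to_fill = tokens_needed / requests_per_second  # 2 / 20 = 0.1 s
# Hence the relaxed assertion 0.1 < toc - tic < 0.2: the lower bound is the
# minimum refill time, and the extra ~0.1 s of headroom absorbs scheduler
# jitter, which is what "relax ... in terms of timing" is buying.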