Merge branch 'main' of github.com:hpcaitech/ColossalAI into prefetch

hxwang
2024-05-24 04:05:07 +00:00
16 changed files with 192 additions and 82 deletions


@@ -20,7 +20,7 @@ class QuickstartUser(HttpUser):
         self.client.post(
             "/chat",
             json={
-                "converation": [
+                "messages": [
                     {"role": "system", "content": "you are a helpful assistant"},
                     {"role": "user", "content": "what is 1+1?"},
                 ],
@@ -34,7 +34,7 @@ class QuickstartUser(HttpUser):
         self.client.post(
             "/chat",
             json={
-                "converation": [
+                "messages": [
                     {"role": "system", "content": "you are a helpful assistant"},
                     {"role": "user", "content": "what is 1+1?"},
                 ],
@@ -42,6 +42,7 @@ class QuickstartUser(HttpUser):
             },
         )

+    # offline-generation is only for demonstrating usage; it is never used in actual serving.
     @tag("offline-generation")
     @task(5)
     def generate_streaming(self):
@@ -54,5 +55,5 @@ class QuickstartUser(HttpUser):
     @tag("online-generation", "offline-generation")
     @task
-    def get_models(self):
-        self.client.get("/models")
+    def health_check(self):
+        self.client.get("/ping")
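
Taken together, the first file's changes fix the misspelled "converation" request key, document the offline-generation tag, and turn the /models listing task into a /ping health check. A minimal standalone sketch of the resulting locustfile follows; it is pieced together from the visible hunks rather than the literal file (the chat task's method name and the class skeleton are assumptions), and presumes locust is installed with an inference server exposing POST /chat and GET /ping:

from locust import HttpUser, tag, task

class QuickstartUser(HttpUser):
    @tag("online-generation")
    @task(5)
    def chat(self):
        # "messages" is the corrected request key (previously the typo "converation").
        self.client.post(
            "/chat",
            json={
                "messages": [
                    {"role": "system", "content": "you are a helpful assistant"},
                    {"role": "user", "content": "what is 1+1?"},
                ],
            },
        )

    @tag("online-generation", "offline-generation")
    @task
    def health_check(self):
        # Renamed from get_models: probes liveness via /ping instead of listing /models.
        self.client.get("/ping")

In practice such a file is driven with something like "locust -f locustfile.py --tags online-generation", which is how the offline-generation tasks can be excluded when benchmarking a real serving deployment.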


@@ -78,6 +78,8 @@ def main():
     parser.add_argument("--zero", type=int, default=0, help="Zero Stage when hybrid plugin is enabled")
     parser.add_argument("--custom-ckpt", action="store_true", help="Customize checkpoint", default=False)
     parser.add_argument("--profile", action="store_true", help="Enable profiling", default=False)
+    parser.add_argument("--disable-async-reduce", action="store_true", help="Disable asynchronous gradient reduce", default=False)
     args = parser.parse_args()

     colossalai.launch_from_torch()
@@ -113,6 +115,7 @@ def main():
             enable_fused_normalization=torch.cuda.is_available(),
             enable_flash_attention=args.xformers,
             max_prefetch=10,
+            enable_async_reduce=not args.disable_async_reduce,
         )
     elif args.plugin == "gemini_auto":
         plugin = GeminiPlugin(
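
The second file wires an opt-out flag into the Gemini plugin: asynchronous reduce stays enabled by default, and --disable-async-reduce negates it at the plugin boundary. A hedged sketch of that pattern (GeminiPlugin, max_prefetch, and enable_async_reduce appear in the diff; the import path and launch call follow standard ColossalAI usage, and the rest is illustrative):

import argparse

import colossalai
from colossalai.booster.plugin import GeminiPlugin

parser = argparse.ArgumentParser()
# Opt-out switch: async reduce is on unless the user explicitly disables it.
parser.add_argument("--disable-async-reduce", action="store_true", default=False,
                    help="Disable asynchronous gradient reduce")
args = parser.parse_args()

# Requires a torchrun-launched distributed environment.
colossalai.launch_from_torch()

# The plugin argument is the negation of the CLI switch, so the default
# behavior (flag absent) is enable_async_reduce=True.
plugin = GeminiPlugin(
    max_prefetch=10,  # prefetch depth set in the same diff
    enable_async_reduce=not args.disable_async_reduce,
)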