[legacy] clean up legacy code (#4743)

* [legacy] remove outdated pipeline code (#4692)

* [legacy] remove benchmark CLI and update optim (#4690)

* [legacy] remove benchmark CLI and update optim

* [doc] fix CLI doc test

* [legacy] fix engine clip grad norm

* [legacy] remove outdated colo tensor (#4694)

* [legacy] remove outdated colo tensor

* [test] fix test import

* [legacy] move outdated zero to legacy (#4696)

* [legacy] clean up utils (#4700)

* [legacy] clean up utils

* [example] update examples

* [legacy] clean up amp

* [legacy] fix amp module

* [legacy] clean up gpc (#4742)

* [legacy] clean up context

* [legacy] clean core, constants and global vars

* [legacy] refactor initialize

* [example] fix examples ci

* [example] fix examples ci

* [legacy] fix tests

* [example] fix gpt example

* [example] fix examples ci

* [devops] fix ci installation

* [example] fix examples ci
Author: Hongxin Liu
Date: 2023-09-18 16:31:06 +08:00 (committed by GitHub)
Parent: 32e7f99416
Commit: b5f9e37c70
342 changed files with 2919 additions and 4182 deletions
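
In short, this file's imports of the global context and the parallel-mode enum are redirected to the new colossalai.legacy namespace; other callers of the moved modules presumably need the same one-line changes. A minimal before/after sketch of the import migration shown in the first hunk below:

    # before (pre-#4743)
    from colossalai.core import global_context as gpc
    from colossalai.context import ParallelMode

    # after (post-#4743)
    from colossalai.legacy.core import global_context as gpc
    from colossalai.legacy.context import ParallelMode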


@@ -12,13 +12,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Megatron tokenizers."""
-from abc import ABC
-from abc import abstractmethod
-from colossalai.core import global_context as gpc
-from colossalai.context import ParallelMode
+from abc import ABC, abstractmethod
+from colossalai.legacy.context import ParallelMode
+from colossalai.legacy.core import global_context as gpc
 from .bert_tokenization import FullTokenizer as FullBertTokenizer
@@ -26,18 +25,13 @@ from .bert_tokenization import FullTokenizer as FullBertTokenizer
 def build_tokenizer(vocab_file, tokenizer_type, vocab_extra_ids=0):
     """Initialize tokenizer."""
     if not gpc.is_initialized(ParallelMode.GLOBAL) or gpc.get_global_rank() == 0:
-        print('> building {} tokenizer ...'.format(tokenizer_type),
-              flush=True)
+        print('> building {} tokenizer ...'.format(tokenizer_type), flush=True)
     # Select and instantiate the tokenizer.
     if tokenizer_type == 'BertWordPieceLowerCase':
-        tokenizer = _BertWordPieceTokenizer(vocab_file=vocab_file,
-                                            lower_case=True,
-                                            vocab_extra_ids=vocab_extra_ids)
+        tokenizer = _BertWordPieceTokenizer(vocab_file=vocab_file, lower_case=True, vocab_extra_ids=vocab_extra_ids)
     elif tokenizer_type == 'BertWordPieceCase':
-        tokenizer = _BertWordPieceTokenizer(vocab_file=vocab_file,
-                                            lower_case=False,
-                                            vocab_extra_ids=vocab_extra_ids)
+        tokenizer = _BertWordPieceTokenizer(vocab_file=vocab_file, lower_case=False, vocab_extra_ids=vocab_extra_ids)
     else:
         raise NotImplementedError('{} tokenizer is not '
                                   'implemented.'.format(tokenizer_type))
@@ -62,8 +56,8 @@ def _vocab_size_with_padding(orig_vocab_size, make_vocab_size_divisible_by=128):
         after += 1
     if not gpc.is_initialized(ParallelMode.GLOBAL) or gpc.get_global_rank() == 0:
         print(' > padded vocab (size: {}) with {} dummy tokens '
-              '(new size: {})'.format(
-                  orig_vocab_size, after - orig_vocab_size, after), flush=True)
+              '(new size: {})'.format(orig_vocab_size, after - orig_vocab_size, after),
+              flush=True)
     return after
@@ -142,8 +136,7 @@ class _BertWordPieceTokenizer(AbstractTokenizer):
         self._additional_special_tokens = []
         # (dsachan) Add BOS and EOS tokens
-        SPECIAL_TOKENS = {'eos_token': '[EOS]',
-                          'bos_token': '[BOS]'}
+        SPECIAL_TOKENS = {'eos_token': '[EOS]', 'bos_token': '[BOS]'}
         self._bos_token = '[BOS]'
         self.add_token(self._bos_token)
         self._bos_token_id = self.vocab.get(self._bos_token)
@@ -155,8 +148,7 @@ class _BertWordPieceTokenizer(AbstractTokenizer):
         # (dsachan) Add additional special tokens
         # These can be used as sentinel tokens in T5 model inputs
         additional_special_tokens = []
-        additional_special_tokens.extend(
-            ["<extra_id_{}>".format(i) for i in range(vocab_extra_ids)])
+        additional_special_tokens.extend(["<extra_id_{}>".format(i) for i in range(vocab_extra_ids)])
         self.add_additional_special_tokens(additional_special_tokens)

     def add_token(self, token):
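
For orientation, a minimal usage sketch of the factory and padding helper touched above. The vocab file path is hypothetical, and tokenizer.vocab_size is assumed to be exposed by AbstractTokenizer as in upstream Megatron-LM; build_tokenizer and _vocab_size_with_padding are the functions defined in the file shown in this diff.

    # Build a lowercase BERT WordPiece tokenizer with two extra <extra_id_*> sentinel tokens,
    # then pad its vocabulary to a multiple of 128 (the default divisor above).
    tokenizer = build_tokenizer('bert-large-uncased-vocab.txt',   # hypothetical vocab file
                                'BertWordPieceLowerCase',
                                vocab_extra_ids=2)
    padded_size = _vocab_size_with_padding(tokenizer.vocab_size)  # vocab_size: assumed attribute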