Mirror of https://github.com/hpcaitech/ColossalAI.git
(synced 2025-09-03 18:19:58 +00:00)
[misc] update pre-commit and run all files (#4752)
* [misc] update pre-commit
* [misc] run pre-commit
* [misc] remove useless configuration files
* [misc] ignore cuda for clang-format
This commit is contained in:
@@ -39,7 +39,7 @@ def meta_trace(module: torch.nn.Module, fake_device=None, *args, **kwargs) -> Gr
|
||||
_tensor: torch.Tensor
|
||||
_node: Node
|
||||
|
||||
__slots__ = ['_tensor', '_node']
|
||||
__slots__ = ["_tensor", "_node"]
|
||||
|
||||
@staticmethod
|
||||
def __new__(cls, tensor, fake_device=None, placeholder=False, name=None):
|
||||
@@ -51,22 +51,22 @@ def meta_trace(module: torch.nn.Module, fake_device=None, *args, **kwargs) -> Gr
|
||||
dtype=tensor.dtype,
|
||||
layout=tensor.layout,
|
||||
device=fake_device if fake_device is not None else tensor.device,
|
||||
requires_grad=tensor.requires_grad) # deceive the frontend for aten selections
|
||||
requires_grad=tensor.requires_grad,
|
||||
) # deceive the frontend for aten selections
|
||||
r._tensor = tensor
|
||||
if placeholder:
|
||||
if name is None:
|
||||
name = 'input'
|
||||
r._node = graph.create_node('placeholder',
|
||||
'placeholder', (graph._root,),
|
||||
name=namespace.create_name(name, tensor))
|
||||
name = "input"
|
||||
r._node = graph.create_node(
|
||||
"placeholder", "placeholder", (graph._root,), name=namespace.create_name(name, tensor)
|
||||
)
|
||||
# ...the real tensor is held as an element on the tensor.
|
||||
if not r._tensor.is_meta:
|
||||
r._tensor = r._tensor.to(torch.device('meta'))
|
||||
r._tensor = r._tensor.to(torch.device("meta"))
|
||||
return r
|
||||
|
||||
@classmethod
|
||||
def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
|
||||
|
||||
def unwrap(x):
|
||||
nonlocal fake_device
|
||||
if isinstance(x, MetaProxy):
|
||||
@@ -75,21 +75,21 @@ def meta_trace(module: torch.nn.Module, fake_device=None, *args, **kwargs) -> Gr
|
||||
# assert not isinstance(x, MetaProxy)
|
||||
elif isinstance(x, torch.Tensor):
|
||||
fake_device = x.device
|
||||
x = x.to(torch.device('meta'))
|
||||
x = x.to(torch.device("meta"))
|
||||
return x
|
||||
|
||||
def get_node(x):
|
||||
if isinstance(x, torch.Tensor) and not hasattr(x, '_node'):
|
||||
x = MetaProxy(x, placeholder=True, name='weight')
|
||||
return x if not hasattr(x, '_node') else x._node
|
||||
if isinstance(x, torch.Tensor) and not hasattr(x, "_node"):
|
||||
x = MetaProxy(x, placeholder=True, name="weight")
|
||||
return x if not hasattr(x, "_node") else x._node
|
||||
|
||||
args_node = tree_map(get_node, args)
|
||||
kwargs_node = tree_map(get_node, kwargs)
|
||||
node = graph.create_node('call_function', func, args_node, kwargs_node)
|
||||
node = graph.create_node("call_function", func, args_node, kwargs_node)
|
||||
|
||||
if 'device' in kwargs:
|
||||
fake_device = kwargs['device']
|
||||
kwargs['device'] = torch.device('meta')
|
||||
if "device" in kwargs:
|
||||
fake_device = kwargs["device"]
|
||||
kwargs["device"] = torch.device("meta")
|
||||
|
||||
args = tree_map(unwrap, args)
|
||||
kwargs = tree_map(unwrap, kwargs)
|
||||
@@ -103,9 +103,12 @@ def meta_trace(module: torch.nn.Module, fake_device=None, *args, **kwargs) -> Gr
|
||||
if isinstance(x, torch.Tensor):
|
||||
nonlocal fake_device
|
||||
if not x.is_meta:
|
||||
x = x.to(torch.device('meta'))
|
||||
return MetaProxy(
|
||||
x, fake_device=fake_device) if isinstance(x, torch.Tensor) and not hasattr(x, '_tensor') else x
|
||||
x = x.to(torch.device("meta"))
|
||||
return (
|
||||
MetaProxy(x, fake_device=fake_device)
|
||||
if isinstance(x, torch.Tensor) and not hasattr(x, "_tensor")
|
||||
else x
|
||||
)
|
||||
|
||||
def set_node(x):
|
||||
x._node = node
|
||||
@@ -125,9 +128,12 @@ def meta_trace(module: torch.nn.Module, fake_device=None, *args, **kwargs) -> Gr
|
||||
|
||||
for tensor in normalize_tuple(out):
|
||||
if is_autogradable(tensor) and tensor.requires_grad:
|
||||
grad = torch.empty_like(tensor._tensor, device=torch.device('meta')) if isinstance(
|
||||
tensor, MetaProxy) else torch.empty_like(tensor, device=torch.device('meta'))
|
||||
torch.autograd.backward(tensor,
|
||||
MetaProxy(grad, fake_device=tensor.device, placeholder=True),
|
||||
retain_graph=True)
|
||||
grad = (
|
||||
torch.empty_like(tensor._tensor, device=torch.device("meta"))
|
||||
if isinstance(tensor, MetaProxy)
|
||||
else torch.empty_like(tensor, device=torch.device("meta"))
|
||||
)
|
||||
torch.autograd.backward(
|
||||
tensor, MetaProxy(grad, fake_device=tensor.device, placeholder=True), retain_graph=True
|
||||
)
|
||||
return graph
|
||||
|
@@ -2,10 +2,10 @@ from typing import Any, List, Union
|
||||
|
||||
import torch
|
||||
|
||||
from ..proxy import ColoAttribute, ColoProxy
|
||||
from .meta_patch import meta_patched_function, meta_patched_module
|
||||
from ..proxy import ColoProxy
|
||||
from .meta_patch import meta_patched_function
|
||||
|
||||
__all__ = ['is_element_in_list', 'extract_meta']
|
||||
__all__ = ["is_element_in_list", "extract_meta"]
|
||||
|
||||
|
||||
def is_element_in_list(elements: Union[List[Any], Any], list_: List[Any]):
|
||||
@@ -21,7 +21,6 @@ def is_element_in_list(elements: Union[List[Any], Any], list_: List[Any]):
|
||||
|
||||
|
||||
def extract_meta(*args, **kwargs):
|
||||
|
||||
def _convert(val):
|
||||
if isinstance(val, ColoProxy):
|
||||
return val.meta_data
|
||||
|
@@ -1,7 +1,4 @@
|
||||
import operator
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from ...registry import bias_addition_function, bias_addition_method
|
||||
from .bias_addition_function import LinearBasedBiasFunc
|
||||
@@ -10,13 +7,12 @@ from .bias_addition_function import LinearBasedBiasFunc
|
||||
@bias_addition_method.register(torch.Tensor.addbmm)
|
||||
@bias_addition_function.register(torch.addbmm)
|
||||
class Addbmm(LinearBasedBiasFunc):
|
||||
|
||||
def extract_kwargs_from_origin_func(self):
|
||||
kwargs = {}
|
||||
if 'beta' in self.kwargs:
|
||||
kwargs['beta'] = self.kwargs['beta']
|
||||
if 'alpha' in self.kwargs:
|
||||
kwargs['alpha'] = self.kwargs['alpha']
|
||||
if "beta" in self.kwargs:
|
||||
kwargs["beta"] = self.kwargs["beta"]
|
||||
if "alpha" in self.kwargs:
|
||||
kwargs["alpha"] = self.kwargs["alpha"]
|
||||
return kwargs
|
||||
|
||||
def create_non_bias_func_proxy(self, input_proxy, other_proxy):
|
||||
@@ -25,7 +21,7 @@ class Addbmm(LinearBasedBiasFunc):
|
||||
compute the main computation, such as convolution, with bias option banned.
|
||||
"""
|
||||
assert self.substitute_func == torch.bmm
|
||||
node_kind = 'call_function'
|
||||
node_kind = "call_function"
|
||||
node_target = self.substitute_func
|
||||
|
||||
node_args = (input_proxy, other_proxy)
|
||||
@@ -35,10 +31,10 @@ class Addbmm(LinearBasedBiasFunc):
|
||||
return non_bias_func_proxy
|
||||
|
||||
def insert_sum_node(self, input_proxy, sum_dims=0):
|
||||
'''
|
||||
"""
|
||||
This method is used to sum the input_proxy through the sum_dims.
|
||||
'''
|
||||
node_kind = 'call_function'
|
||||
"""
|
||||
node_kind = "call_function"
|
||||
node_target = torch.sum
|
||||
node_args = (input_proxy, sum_dims)
|
||||
node_kwargs = {}
|
||||
@@ -55,15 +51,15 @@ class Addbmm(LinearBasedBiasFunc):
|
||||
sum_proxy = self.insert_sum_node(non_bias_linear_func_proxy)
|
||||
kwargs = self.extract_kwargs_from_origin_func()
|
||||
|
||||
if 'beta' in kwargs:
|
||||
beta = kwargs['beta']
|
||||
if "beta" in kwargs:
|
||||
beta = kwargs["beta"]
|
||||
# doing the multiplication with beta if it exists(temp_2 = beta * input)
|
||||
beta_proxy = self.create_mul_node(self.args[0], beta)
|
||||
else:
|
||||
beta_proxy = self.args[0]
|
||||
|
||||
if 'alpha' in kwargs:
|
||||
alpha = kwargs['alpha']
|
||||
if "alpha" in kwargs:
|
||||
alpha = kwargs["alpha"]
|
||||
# doing the multiplication with alpha if it exists(temp_3 = alpha * temp_1)
|
||||
alpha_proxy = self.create_mul_node(alpha, sum_proxy)
|
||||
else:
|
||||
|
@@ -1,7 +1,4 @@
|
||||
import operator
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from ...registry import bias_addition_function, bias_addition_method
|
||||
from .bias_addition_function import LinearBasedBiasFunc
|
||||
@@ -10,17 +7,16 @@ from .bias_addition_function import LinearBasedBiasFunc
|
||||
@bias_addition_method.register(torch.Tensor.addmm)
|
||||
@bias_addition_function.register(torch.addmm)
|
||||
class Addmm(LinearBasedBiasFunc):
|
||||
|
||||
def extract_kwargs_from_origin_func(self):
|
||||
kwargs = {}
|
||||
if 'beta' in self.kwargs:
|
||||
kwargs['beta'] = self.kwargs['beta']
|
||||
if 'alpha' in self.kwargs:
|
||||
kwargs['alpha'] = self.kwargs['alpha']
|
||||
if "beta" in self.kwargs:
|
||||
kwargs["beta"] = self.kwargs["beta"]
|
||||
if "alpha" in self.kwargs:
|
||||
kwargs["alpha"] = self.kwargs["alpha"]
|
||||
return kwargs
|
||||
|
||||
def transpose_other_operand_for_linear(self, other_proxy):
|
||||
'''
|
||||
"""
|
||||
This method is used to transpose the other operand for linear function.
|
||||
For example:
|
||||
input = torch.rand(3, 4)
|
||||
@@ -30,8 +26,8 @@ class Addmm(LinearBasedBiasFunc):
|
||||
# To keep the computation graph consistent with the origin computation graph, we need to transpose the m2
|
||||
# before we call the linear function.
|
||||
new_output = torch.linear(m1, m2.transpose(0, 1)) + input
|
||||
'''
|
||||
node_kind = 'call_function'
|
||||
"""
|
||||
node_kind = "call_function"
|
||||
node_target = torch.transpose
|
||||
node_args = (other_proxy, 0, 1)
|
||||
node_kwargs = {}
|
||||
@@ -43,14 +39,14 @@ class Addmm(LinearBasedBiasFunc):
|
||||
non_bias_linear_func_proxy = self.create_non_bias_func_proxy(self.args[1], transpose_proxy)
|
||||
kwargs = self.extract_kwargs_from_origin_func()
|
||||
|
||||
if 'beta' in kwargs:
|
||||
beta = kwargs['beta']
|
||||
if "beta" in kwargs:
|
||||
beta = kwargs["beta"]
|
||||
beta_proxy = self.create_mul_node(self.args[0], beta)
|
||||
else:
|
||||
beta_proxy = self.args[0]
|
||||
|
||||
if 'alpha' in kwargs:
|
||||
alpha = kwargs['alpha']
|
||||
if "alpha" in kwargs:
|
||||
alpha = kwargs["alpha"]
|
||||
alpha_proxy = self.create_mul_node(alpha, non_bias_linear_func_proxy)
|
||||
else:
|
||||
alpha_proxy = non_bias_linear_func_proxy
|
||||
|
@@ -29,7 +29,6 @@ class BiasAdditionFunc(ABC):
|
||||
to insert two more operator.mul nodes for the computation graph to compute the
|
||||
final result.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def generate(self):
|
||||
@@ -50,7 +49,6 @@ class BiasAdditionFunc(ABC):
|
||||
%mul_1 : [#users=1] = call_function[target=operator.mul](args = (2, %linear), kwargs = {})
|
||||
%add : [#users=1] = call_function[target=operator.add](args = (%mul_1, %mul), kwargs = {})
|
||||
"""
|
||||
pass
|
||||
|
||||
def create_mul_node(self, input_proxy, coefficent):
|
||||
"""
|
||||
@@ -59,7 +57,7 @@ class BiasAdditionFunc(ABC):
|
||||
Therefore, we need to use this method insert two more operator.mul nodes for
|
||||
the computation graph to compute the final result.
|
||||
"""
|
||||
node_kind = 'call_function'
|
||||
node_kind = "call_function"
|
||||
node_target = operator.mul
|
||||
node_args = (
|
||||
input_proxy,
|
||||
@@ -82,7 +80,7 @@ class LinearBasedBiasFunc(BiasAdditionFunc):
|
||||
compute the main computation, such as convolution, with bias option banned.
|
||||
"""
|
||||
assert self.substitute_func == torch.nn.functional.linear
|
||||
node_kind = 'call_function'
|
||||
node_kind = "call_function"
|
||||
node_target = self.substitute_func
|
||||
|
||||
node_args = (input_proxy, other_proxy)
|
||||
@@ -96,7 +94,7 @@ class LinearBasedBiasFunc(BiasAdditionFunc):
|
||||
This method is used to create the bias_addition_proxy, the node created by this proxy will
|
||||
compute the sum of non_bias_func result and bias with some reshape operation if needed.
|
||||
"""
|
||||
bias_add_node_kind = 'call_function'
|
||||
bias_add_node_kind = "call_function"
|
||||
bias_add_node_target = operator.add
|
||||
bias_add_args = (non_bias_func_proxy, bias_proxy)
|
||||
bias_add_proxy = self.tracer.create_proxy(bias_add_node_kind, bias_add_node_target, tuple(bias_add_args), {})
|
||||
|
@@ -1,6 +1,3 @@
|
||||
import operator
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from ...registry import bias_addition_function
|
||||
@@ -9,17 +6,16 @@ from .bias_addition_function import LinearBasedBiasFunc
|
||||
|
||||
@bias_addition_function.register(F.linear)
|
||||
class Linear(LinearBasedBiasFunc):
|
||||
|
||||
def extract_kwargs_from_origin_func(self):
|
||||
assert 'bias' in self.kwargs
|
||||
assert "bias" in self.kwargs
|
||||
kwargs = {}
|
||||
if 'bias' in self.kwargs:
|
||||
kwargs['bias'] = self.kwargs['bias']
|
||||
if "bias" in self.kwargs:
|
||||
kwargs["bias"] = self.kwargs["bias"]
|
||||
return kwargs
|
||||
|
||||
def generate(self):
|
||||
non_bias_linear_func_proxy = self.create_non_bias_func_proxy(self.args[0], self.args[1])
|
||||
kwargs = self.extract_kwargs_from_origin_func()
|
||||
bias_addition_proxy = self.create_bias_addition_proxy(non_bias_linear_func_proxy, kwargs['bias'])
|
||||
bias_addition_proxy = self.create_bias_addition_proxy(non_bias_linear_func_proxy, kwargs["bias"])
|
||||
|
||||
return bias_addition_proxy
|
||||
|
@@ -27,8 +27,8 @@ class BiasAdditionModule(ABC):
|
||||
Note: this function will be invoked during module initializing,
|
||||
you should never call this function.
|
||||
"""
|
||||
weight_node_kind = 'get_attr'
|
||||
weight_node_target = self.target + '.weight'
|
||||
weight_node_kind = "get_attr"
|
||||
weight_node_target = self.target + ".weight"
|
||||
weight_proxy = self.tracer.create_proxy(weight_node_kind, weight_node_target, (), {})
|
||||
return weight_proxy
|
||||
|
||||
@@ -39,8 +39,8 @@ class BiasAdditionModule(ABC):
|
||||
Note: this function will be invoked during module initializing,
|
||||
you should never call this function.
|
||||
"""
|
||||
bias_node_kind = 'get_attr'
|
||||
bias_node_target = self.target + '.bias'
|
||||
bias_node_kind = "get_attr"
|
||||
bias_node_target = self.target + ".bias"
|
||||
bias_proxy = self.tracer.create_proxy(bias_node_kind, bias_node_target, (), {})
|
||||
return bias_proxy
|
||||
|
||||
@@ -54,14 +54,13 @@ class BiasAdditionModule(ABC):
|
||||
considered during module initializing. However, we need to consider those attributes as kwargs
|
||||
in F.conv2d.
|
||||
"""
|
||||
pass
|
||||
|
||||
def create_non_bias_func_proxy(self, input_proxy=None):
|
||||
"""
|
||||
This method is used to create the non_bias_func proxy, the node created by this proxy will
|
||||
compute the main computation, such as convolution, with bias option banned.
|
||||
"""
|
||||
node_kind = 'call_function'
|
||||
node_kind = "call_function"
|
||||
node_target = self.substitute_func
|
||||
if input_proxy is None:
|
||||
input_proxy = self.args[0]
|
||||
@@ -75,7 +74,7 @@ class BiasAdditionModule(ABC):
|
||||
This method is used to create the bias_addition_proxy, the node created by this proxy will
|
||||
compute the sum of non_bias_func result and bias with some reshape operation if needed.
|
||||
"""
|
||||
bias_add_node_kind = 'call_function'
|
||||
bias_add_node_kind = "call_function"
|
||||
bias_add_node_target = operator.add
|
||||
bias_add_args = (non_bias_func_proxy, bias_proxy)
|
||||
bias_add_proxy = self.tracer.create_proxy(bias_add_node_kind, bias_add_node_target, tuple(bias_add_args), {})
|
||||
@@ -100,7 +99,6 @@ class BiasAdditionModule(ABC):
|
||||
%view : [#users=1] = call_method[target=view](args = (%conv_bias, [1, -1, 1, 1]), kwargs = {})
|
||||
%add : [#users=1] = call_function[target=operator.add](args = (%conv2d, %view), kwargs = {})
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
module_to_func_dict = {
|
||||
|
@@ -1,6 +1,5 @@
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch.nn.modules.utils import _pair, _reverse_repeat_tuple, _single, _triple
|
||||
from torch.nn.modules.utils import _pair, _single, _triple
|
||||
|
||||
from ...registry import bias_addition_module
|
||||
from .bias_addition_module import BiasAdditionModule
|
||||
@@ -10,17 +9,16 @@ from .bias_addition_module import BiasAdditionModule
|
||||
@bias_addition_module.register(torch.nn.Conv2d)
|
||||
@bias_addition_module.register(torch.nn.Conv3d)
|
||||
class BiasAdditionConv(BiasAdditionModule):
|
||||
|
||||
def extract_kwargs_from_mod(self):
|
||||
root = self.tracer.root
|
||||
conv_module = root.get_submodule(self.target)
|
||||
kwarg_attributes = ['groups', 'dilation', 'stride']
|
||||
kwarg_attributes = ["groups", "dilation", "stride"]
|
||||
non_bias_kwargs = {}
|
||||
for attr_name in kwarg_attributes:
|
||||
if hasattr(conv_module, attr_name):
|
||||
non_bias_kwargs[attr_name] = getattr(conv_module, attr_name)
|
||||
if conv_module.padding_mode != "zeros":
|
||||
#TODO: non zeros mode requires some extra processing for input
|
||||
# TODO: non zeros mode requires some extra processing for input
|
||||
conv_type = type(conv_module)
|
||||
if conv_type == "torch.nn.Conv1d":
|
||||
padding_element = _single(0)
|
||||
@@ -28,9 +26,9 @@ class BiasAdditionConv(BiasAdditionModule):
|
||||
padding_element = _pair(0)
|
||||
elif conv_type == "torch.nn.Conv3d":
|
||||
padding_element = _triple(0)
|
||||
non_bias_kwargs['padding'] = padding_element
|
||||
non_bias_kwargs["padding"] = padding_element
|
||||
else:
|
||||
non_bias_kwargs['padding'] = getattr(conv_module, 'padding')
|
||||
non_bias_kwargs["padding"] = getattr(conv_module, "padding")
|
||||
|
||||
return non_bias_kwargs
|
||||
|
||||
@@ -41,11 +39,12 @@ class BiasAdditionConv(BiasAdditionModule):
|
||||
"""
|
||||
bias_shape = [1] * (dimensions - 1)
|
||||
bias_shape[0] = -1
|
||||
bias_reshape_node_kind = 'call_method'
|
||||
bias_reshape_node_target = 'view'
|
||||
bias_reshape_node_kind = "call_method"
|
||||
bias_reshape_node_target = "view"
|
||||
bias_reshape_node_args = (self.bias_proxy, torch.Size(bias_shape))
|
||||
bias_reshape_proxy = self.tracer.create_proxy(bias_reshape_node_kind, bias_reshape_node_target,
|
||||
bias_reshape_node_args, {})
|
||||
bias_reshape_proxy = self.tracer.create_proxy(
|
||||
bias_reshape_node_kind, bias_reshape_node_target, bias_reshape_node_args, {}
|
||||
)
|
||||
return bias_reshape_proxy
|
||||
|
||||
def generate(self):
|
||||
|
@@ -1,5 +1,4 @@
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from ...registry import bias_addition_module
|
||||
from .bias_addition_module import BiasAdditionModule
|
||||
@@ -7,7 +6,6 @@ from .bias_addition_module import BiasAdditionModule
|
||||
|
||||
@bias_addition_module.register(torch.nn.Linear)
|
||||
class BiasAdditionLinear(BiasAdditionModule):
|
||||
|
||||
def extract_kwargs_from_mod(self):
|
||||
return {}
|
||||
|
||||
|
@@ -1,4 +1,3 @@
|
||||
import enum
|
||||
import functools
|
||||
import inspect
|
||||
import operator
|
||||
@@ -10,7 +9,7 @@ from torch.fx import Graph, Node, Proxy, Tracer
|
||||
from torch.utils._pytree import tree_map
|
||||
|
||||
from colossalai.fx import ColoGraphModule, compatibility, is_compatible_with_meta
|
||||
from colossalai.fx.tracer._tracer_utils import extract_meta, is_element_in_list
|
||||
from colossalai.fx.tracer._tracer_utils import is_element_in_list
|
||||
from colossalai.fx.tracer.bias_addition_patch import func_to_func_dict, method_to_func_dict, module_to_func_dict
|
||||
from colossalai.fx.tracer.registry import (
|
||||
bias_addition_function,
|
||||
@@ -24,31 +23,45 @@ if is_compatible_with_meta():
|
||||
from colossalai.fx.profiler import MetaTensor
|
||||
|
||||
Target = Union[Callable[..., Any], str]
|
||||
Argument = Optional[Union[Tuple[Any, ...], # actually Argument, but mypy can't represent recursive types
|
||||
List[Any], # actually Argument
|
||||
Dict[str, Any], # actually Argument
|
||||
slice, # Slice[Argument, Argument, Argument], but slice is not a templated type in typing
|
||||
'Node',]]
|
||||
_CScriptMethod = ['add', 'mul', 'sub', 'div']
|
||||
Argument = Optional[
|
||||
Union[
|
||||
Tuple[Any, ...], # actually Argument, but mypy can't represent recursive types
|
||||
List[Any], # actually Argument
|
||||
Dict[str, Any], # actually Argument
|
||||
slice, # Slice[Argument, Argument, Argument], but slice is not a templated type in typing
|
||||
"Node",
|
||||
]
|
||||
]
|
||||
_CScriptMethod = ["add", "mul", "sub", "div"]
|
||||
_TorchNewMethod = [
|
||||
"arange", "zeros", "zeros_like", "ones", "ones_like", "full", "full_like", "empty", "empty_like", "eye", "tensor",
|
||||
"finfo"
|
||||
"arange",
|
||||
"zeros",
|
||||
"zeros_like",
|
||||
"ones",
|
||||
"ones_like",
|
||||
"full",
|
||||
"full_like",
|
||||
"empty",
|
||||
"empty_like",
|
||||
"eye",
|
||||
"tensor",
|
||||
"finfo",
|
||||
]
|
||||
_TensorPropertyMethod = ["dtype", "shape", "device", "requires_grad", "grad", "grad_fn", "data"]
|
||||
|
||||
|
||||
def _truncate_suffix(s: str):
|
||||
import re
|
||||
return re.sub(r'_\d+$', '', s)
|
||||
|
||||
return re.sub(r"_\d+$", "", s)
|
||||
|
||||
|
||||
def default_device():
|
||||
return torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
|
||||
return torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
|
||||
|
||||
|
||||
@compatibility(is_backward_compatible=False)
|
||||
class ColoProxy(Proxy):
|
||||
|
||||
def __init__(self, *args, data=None, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self._meta_data = data
|
||||
@@ -100,7 +113,7 @@ class ColoProxy(Proxy):
|
||||
return ColoAttribute(self, k, getattr(self._meta_data, k, None))
|
||||
|
||||
def __setitem__(self, key, value):
|
||||
proxy = self.tracer.create_proxy('call_function', operator.setitem, (self, key, value), {})
|
||||
proxy = self.tracer.create_proxy("call_function", operator.setitem, (self, key, value), {})
|
||||
proxy.meta_data = self._meta_data
|
||||
return proxy
|
||||
|
||||
@@ -125,29 +138,28 @@ class ColoProxy(Proxy):
|
||||
|
||||
@property
|
||||
def device(self):
|
||||
proxy = self.tracer.create_proxy('call_function', getattr, (self, 'device'), {})
|
||||
proxy = self.tracer.create_proxy("call_function", getattr, (self, "device"), {})
|
||||
proxy.meta_data = self.meta_data.device
|
||||
return proxy
|
||||
|
||||
@property
|
||||
def dtype(self):
|
||||
proxy = self.tracer.create_proxy('call_function', getattr, (self, 'dtype'), {})
|
||||
proxy = self.tracer.create_proxy("call_function", getattr, (self, "dtype"), {})
|
||||
proxy.meta_data = self.meta_data.dtype
|
||||
return proxy
|
||||
|
||||
def to(self, *args, **kwargs):
|
||||
return self.tracer.create_proxy('call_method', 'to', (self, *args), {**kwargs})
|
||||
return self.tracer.create_proxy("call_method", "to", (self, *args), {**kwargs})
|
||||
|
||||
def cpu(self, *args, **kwargs):
|
||||
return self.tracer.create_proxy('call_method', 'cpu', (self, *args), {**kwargs})
|
||||
return self.tracer.create_proxy("call_method", "cpu", (self, *args), {**kwargs})
|
||||
|
||||
def cuda(self, *args, **kwargs):
|
||||
return self.tracer.create_proxy('call_method', 'cuda', (self, *args), {**kwargs})
|
||||
return self.tracer.create_proxy("call_method", "cuda", (self, *args), {**kwargs})
|
||||
|
||||
|
||||
@compatibility(is_backward_compatible=False)
|
||||
class ColoAttribute(ColoProxy):
|
||||
|
||||
def __init__(self, root, attr: str, data=None):
|
||||
self.root = root
|
||||
self.attr = attr
|
||||
@@ -160,11 +172,11 @@ class ColoAttribute(ColoProxy):
|
||||
# the node for attributes is added lazily, since most will just be method calls
|
||||
# which do not rely on the getitem call
|
||||
if self._node is None:
|
||||
self._node = self.tracer.create_proxy('call_function', getattr, (self.root, self.attr), {}).node
|
||||
self._node = self.tracer.create_proxy("call_function", getattr, (self.root, self.attr), {}).node
|
||||
return self._node
|
||||
|
||||
def __call__(self, *args, **kwargs):
|
||||
return self.tracer.create_proxy('call_method', self.attr, (self.root,) + args, kwargs)
|
||||
return self.tracer.create_proxy("call_method", self.attr, (self.root,) + args, kwargs)
|
||||
|
||||
def __repr__(self):
|
||||
return f"ColoAttribute({self.node.name}, attr={self.attr})"
|
||||
@@ -172,7 +184,6 @@ class ColoAttribute(ColoProxy):
|
||||
|
||||
@compatibility(is_backward_compatible=False)
|
||||
class ColoTracer(Tracer):
|
||||
|
||||
def __init__(self, trace_act_ckpt: bool = False, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self._disable_module_getattr = False
|
||||
@@ -184,24 +195,28 @@ class ColoTracer(Tracer):
|
||||
self.inside_torch_checkpoint_func = False
|
||||
self.act_ckpt_region_count = 0
|
||||
|
||||
def proxy(self, node: Node) -> 'ColoProxy':
|
||||
def proxy(self, node: Node) -> "ColoProxy":
|
||||
return ColoProxy(node, self)
|
||||
|
||||
def create_proxy(self,
|
||||
kind: str,
|
||||
target: Target,
|
||||
args: Tuple[Any, ...],
|
||||
kwargs: Dict[str, Any],
|
||||
name: Optional[str] = None,
|
||||
type_expr: Optional[Any] = None,
|
||||
proxy_factory_fn: Callable[[Node], 'Proxy'] = None):
|
||||
|
||||
def create_proxy(
|
||||
self,
|
||||
kind: str,
|
||||
target: Target,
|
||||
args: Tuple[Any, ...],
|
||||
kwargs: Dict[str, Any],
|
||||
name: Optional[str] = None,
|
||||
type_expr: Optional[Any] = None,
|
||||
proxy_factory_fn: Callable[[Node], "Proxy"] = None,
|
||||
):
|
||||
proxy: ColoProxy = super().create_proxy(kind, target, args, kwargs, name, type_expr, proxy_factory_fn)
|
||||
unwrap_fn = lambda p: p.meta_data if isinstance(p, ColoProxy) else p
|
||||
if kind == 'placeholder':
|
||||
proxy.meta_data = self.meta_args[target] if target in self.meta_args else self.concrete_args.get(
|
||||
_truncate_suffix(target), None)
|
||||
elif kind == 'get_attr':
|
||||
if kind == "placeholder":
|
||||
proxy.meta_data = (
|
||||
self.meta_args[target]
|
||||
if target in self.meta_args
|
||||
else self.concrete_args.get(_truncate_suffix(target), None)
|
||||
)
|
||||
elif kind == "get_attr":
|
||||
self._disable_module_getattr = True
|
||||
try:
|
||||
attr_itr = self.root
|
||||
@@ -211,20 +226,21 @@ class ColoTracer(Tracer):
|
||||
proxy.meta_data = attr_itr
|
||||
finally:
|
||||
self._disable_module_getattr = False
|
||||
elif kind == 'call_function':
|
||||
elif kind == "call_function":
|
||||
proxy.meta_data = target(*tree_map(unwrap_fn, args), **tree_map(unwrap_fn, kwargs))
|
||||
elif kind == 'call_method':
|
||||
elif kind == "call_method":
|
||||
self._disable_module_getattr = True
|
||||
try:
|
||||
if target == '__call__':
|
||||
if target == "__call__":
|
||||
proxy.meta_data = unwrap_fn(args[0])(*tree_map(unwrap_fn, args[1:]), **tree_map(unwrap_fn, kwargs))
|
||||
else:
|
||||
if target not in _TensorPropertyMethod:
|
||||
proxy._meta_data = getattr(unwrap_fn(args[0]), target)(*tree_map(unwrap_fn, args[1:]),
|
||||
**tree_map(unwrap_fn, kwargs))
|
||||
proxy._meta_data = getattr(unwrap_fn(args[0]), target)(
|
||||
*tree_map(unwrap_fn, args[1:]), **tree_map(unwrap_fn, kwargs)
|
||||
)
|
||||
finally:
|
||||
self._disable_module_getattr = False
|
||||
elif kind == 'call_module':
|
||||
elif kind == "call_module":
|
||||
mod = self.root.get_submodule(target)
|
||||
self._disable_module_getattr = True
|
||||
try:
|
||||
@@ -238,14 +254,15 @@ class ColoTracer(Tracer):
|
||||
|
||||
if self.inside_torch_checkpoint_func:
|
||||
# annotate the activation checkpoint module
|
||||
node.meta['activation_checkpoint'] = self.act_ckpt_region_count
|
||||
node.meta["activation_checkpoint"] = self.act_ckpt_region_count
|
||||
return node
|
||||
|
||||
def trace(self,
|
||||
root: torch.nn.Module,
|
||||
concrete_args: Optional[Dict[str, torch.Tensor]] = None,
|
||||
meta_args: Optional[Dict[str, torch.Tensor]] = None) -> Graph:
|
||||
|
||||
def trace(
|
||||
self,
|
||||
root: torch.nn.Module,
|
||||
concrete_args: Optional[Dict[str, torch.Tensor]] = None,
|
||||
meta_args: Optional[Dict[str, torch.Tensor]] = None,
|
||||
) -> Graph:
|
||||
if meta_args is None:
|
||||
meta_args = {}
|
||||
|
||||
@@ -260,20 +277,19 @@ class ColoTracer(Tracer):
|
||||
# update concrete args with default values
|
||||
non_meta_arg_names = sig_names - meta_arg_names
|
||||
for k, v in sig.parameters.items():
|
||||
if k in non_meta_arg_names and \
|
||||
k not in concrete_args and \
|
||||
v.default is not inspect.Parameter.empty:
|
||||
if k in non_meta_arg_names and k not in concrete_args and v.default is not inspect.Parameter.empty:
|
||||
concrete_args[k] = v.default
|
||||
|
||||
# get non concrete arg names
|
||||
concrete_arg_names = set(concrete_args.keys())
|
||||
non_concrete_arg_names = sig_names - concrete_arg_names
|
||||
sig_names - concrete_arg_names
|
||||
|
||||
def _check_arg_name_valid(names):
|
||||
success, element = is_element_in_list(names, sig_names)
|
||||
if not success:
|
||||
raise KeyError(
|
||||
f"argument {element} is not found in the signature of {root.__class__.__name__}'s forward function")
|
||||
f"argument {element} is not found in the signature of {root.__class__.__name__}'s forward function"
|
||||
)
|
||||
|
||||
_check_arg_name_valid(meta_arg_names)
|
||||
_check_arg_name_valid(concrete_arg_names)
|
||||
@@ -292,7 +308,6 @@ class ColoTracer(Tracer):
|
||||
orig_ckpt_func = torch.utils.checkpoint.CheckpointFunction
|
||||
|
||||
class PatchedCheckpointFunction(torch.autograd.Function):
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, run_function, preserve_rng_state, *args):
|
||||
# signal that the current tracing occurs within activation checkpoint part
|
||||
@@ -305,7 +320,8 @@ class ColoTracer(Tracer):
|
||||
@staticmethod
|
||||
def backward(ctx: Any, *grad_outputs: Any) -> Any:
|
||||
raise NotImplementedError(
|
||||
"We do not implement the backward pass as we only trace the forward pass.")
|
||||
"We do not implement the backward pass as we only trace the forward pass."
|
||||
)
|
||||
|
||||
# override the checkpoint function
|
||||
torch.utils.checkpoint.CheckpointFunction = PatchedCheckpointFunction
|
||||
@@ -356,10 +372,13 @@ class ColoTracer(Tracer):
|
||||
if attr_val is p:
|
||||
if n not in parameter_proxy_cache:
|
||||
kwargs = {}
|
||||
if 'proxy_factory_fn' in inspect.signature(self.create_proxy).parameters:
|
||||
kwargs['proxy_factory_fn'] = (None if not self.param_shapes_constant else
|
||||
lambda node: ColoProxy(self, node, n, attr_val))
|
||||
val_proxy = self.create_proxy('get_attr', n, (), {}, **kwargs) # type: ignore[arg-type]
|
||||
if "proxy_factory_fn" in inspect.signature(self.create_proxy).parameters:
|
||||
kwargs["proxy_factory_fn"] = (
|
||||
None
|
||||
if not self.param_shapes_constant
|
||||
else lambda node: ColoProxy(self, node, n, attr_val)
|
||||
)
|
||||
val_proxy = self.create_proxy("get_attr", n, (), {}, **kwargs) # type: ignore[arg-type]
|
||||
parameter_proxy_cache[n] = val_proxy
|
||||
return parameter_proxy_cache[n]
|
||||
return None
|
||||
@@ -370,8 +389,9 @@ class ColoTracer(Tracer):
|
||||
return maybe_buffer_proxy
|
||||
|
||||
if isinstance(attr_val, torch.nn.Parameter):
|
||||
maybe_parameter_proxy = maybe_get_proxy_for_attr(attr_val, self.root.named_parameters(),
|
||||
parameter_proxy_cache)
|
||||
maybe_parameter_proxy = maybe_get_proxy_for_attr(
|
||||
attr_val, self.root.named_parameters(), parameter_proxy_cache
|
||||
)
|
||||
if maybe_parameter_proxy is not None:
|
||||
return maybe_parameter_proxy
|
||||
|
||||
@@ -389,42 +409,41 @@ def symbolic_trace(
|
||||
if meta_args is not None:
|
||||
root.to(default_device())
|
||||
wrap_fn = lambda x: MetaTensor(x, fake_device=default_device()) if isinstance(x, torch.Tensor) else x
|
||||
graph = ColoTracer(trace_act_ckpt=trace_act_ckpt).trace(root,
|
||||
concrete_args=concrete_args,
|
||||
meta_args=tree_map(wrap_fn, meta_args))
|
||||
graph = ColoTracer(trace_act_ckpt=trace_act_ckpt).trace(
|
||||
root, concrete_args=concrete_args, meta_args=tree_map(wrap_fn, meta_args)
|
||||
)
|
||||
root.cpu()
|
||||
else:
|
||||
graph = Tracer().trace(root, concrete_args=concrete_args)
|
||||
else:
|
||||
from .tracer import ColoTracer as OrigColoTracer
|
||||
graph = OrigColoTracer(trace_act_ckpt=trace_act_ckpt).trace(root,
|
||||
concrete_args=concrete_args,
|
||||
meta_args=meta_args)
|
||||
|
||||
graph = OrigColoTracer(trace_act_ckpt=trace_act_ckpt).trace(
|
||||
root, concrete_args=concrete_args, meta_args=meta_args
|
||||
)
|
||||
name = root.__class__.__name__ if isinstance(root, torch.nn.Module) else root.__name__
|
||||
return ColoGraphModule(root, graph, name)
|
||||
|
||||
|
||||
@compatibility(is_backward_compatible=False)
|
||||
class _TorchTensorOverride(object):
|
||||
|
||||
def __init__(self, tracer: Tracer):
|
||||
self.overrides = {}
|
||||
self.tracer = tracer
|
||||
|
||||
def __enter__(self):
|
||||
|
||||
def wrap_tensor_method(target):
|
||||
|
||||
@functools.wraps(target)
|
||||
def wrapper(*args, **kwargs):
|
||||
is_proxy = any(isinstance(p, ColoProxy) for p in args) | any(
|
||||
isinstance(p, ColoProxy) for p in kwargs.values())
|
||||
isinstance(p, ColoProxy) for p in kwargs.values()
|
||||
)
|
||||
if is_proxy:
|
||||
# if the arg is a proxy, then need to record this function called on this proxy
|
||||
# e.g. torch.ones(size) where size is an input proxy
|
||||
self.tracer._disable_module_getattr = True
|
||||
try:
|
||||
proxy = self.tracer.create_proxy('call_function', target, args, kwargs)
|
||||
proxy = self.tracer.create_proxy("call_function", target, args, kwargs)
|
||||
finally:
|
||||
self.tracer._disable_module_getattr = False
|
||||
return proxy
|
||||
@@ -446,11 +465,12 @@ class _TorchTensorOverride(object):
|
||||
setattr(torch, name, orig)
|
||||
|
||||
|
||||
def meta_prop_pass(gm: ColoGraphModule,
|
||||
root: torch.nn.Module,
|
||||
meta_args: Optional[Dict[str, Any]] = None,
|
||||
concrete_args: Optional[Dict[str, torch.Tensor]] = None):
|
||||
|
||||
def meta_prop_pass(
|
||||
gm: ColoGraphModule,
|
||||
root: torch.nn.Module,
|
||||
meta_args: Optional[Dict[str, Any]] = None,
|
||||
concrete_args: Optional[Dict[str, torch.Tensor]] = None,
|
||||
):
|
||||
if meta_args is None:
|
||||
meta_args = {}
|
||||
|
||||
@@ -465,36 +485,36 @@ def meta_prop_pass(gm: ColoGraphModule,
|
||||
# update concrete args with default values
|
||||
non_meta_arg_names = sig_names - meta_arg_names
|
||||
for k, v in sig.parameters.items():
|
||||
if k in non_meta_arg_names and \
|
||||
k not in concrete_args and \
|
||||
v.default is not inspect.Parameter.empty:
|
||||
if k in non_meta_arg_names and k not in concrete_args and v.default is not inspect.Parameter.empty:
|
||||
concrete_args[k] = v.default
|
||||
|
||||
for node in gm.graph.nodes:
|
||||
node._meta_data = _meta_data_computing(meta_args, concrete_args, root, node.op, node.target, node.args,
|
||||
node.kwargs)
|
||||
node._meta_data = _meta_data_computing(
|
||||
meta_args, concrete_args, root, node.op, node.target, node.args, node.kwargs
|
||||
)
|
||||
|
||||
|
||||
def _meta_data_computing(meta_args, concrete_args, root, kind, target, args, kwargs):
|
||||
unwrap_fn = lambda n: n._meta_data if isinstance(n, Node) else n
|
||||
if kind == 'placeholder':
|
||||
if kind == "placeholder":
|
||||
meta_out = meta_args[target] if target in meta_args else concrete_args.get(_truncate_suffix(target), None)
|
||||
elif kind == 'get_attr':
|
||||
elif kind == "get_attr":
|
||||
attr_itr = root
|
||||
atoms = target.split(".")
|
||||
for atom in atoms:
|
||||
attr_itr = getattr(attr_itr, atom)
|
||||
meta_out = attr_itr
|
||||
elif kind == 'call_function':
|
||||
elif kind == "call_function":
|
||||
meta_out = target(*tree_map(unwrap_fn, args), **tree_map(unwrap_fn, kwargs))
|
||||
elif kind == 'call_method':
|
||||
if target == '__call__':
|
||||
elif kind == "call_method":
|
||||
if target == "__call__":
|
||||
meta_out = unwrap_fn(args[0])(*tree_map(unwrap_fn, args[1:]), **tree_map(unwrap_fn, kwargs))
|
||||
else:
|
||||
if target not in _TensorPropertyMethod:
|
||||
meta_out = getattr(unwrap_fn(args[0]), target)(*tree_map(unwrap_fn, args[1:]),
|
||||
**tree_map(unwrap_fn, kwargs))
|
||||
elif kind == 'call_module':
|
||||
meta_out = getattr(unwrap_fn(args[0]), target)(
|
||||
*tree_map(unwrap_fn, args[1:]), **tree_map(unwrap_fn, kwargs)
|
||||
)
|
||||
elif kind == "call_module":
|
||||
mod = root.get_submodule(target)
|
||||
meta_out = mod.forward(*tree_map(unwrap_fn, args), **tree_map(unwrap_fn, kwargs))
|
||||
else:
|
||||
@@ -603,26 +623,30 @@ def bias_addition_pass(gm: ColoGraphModule, root_model: torch.nn.Module, meta_ar
|
||||
if kind == "call_function":
|
||||
if bias_addition_function.has(target):
|
||||
if target == torch.nn.functional.linear:
|
||||
if 'bias' in kwargs and kwargs['bias'] is not None:
|
||||
if "bias" in kwargs and kwargs["bias"] is not None:
|
||||
function_to_substitute = func_to_func_dict[target]
|
||||
handle = bias_addition_function.get(target)(tracer, target, args_proxy, kwargs_proxy,
|
||||
function_to_substitute)
|
||||
handle = bias_addition_function.get(target)(
|
||||
tracer, target, args_proxy, kwargs_proxy, function_to_substitute
|
||||
)
|
||||
else:
|
||||
function_to_substitute = func_to_func_dict[target]
|
||||
handle = bias_addition_function.get(target)(tracer, target, args_proxy, kwargs_proxy,
|
||||
function_to_substitute)
|
||||
handle = bias_addition_function.get(target)(
|
||||
tracer, target, args_proxy, kwargs_proxy, function_to_substitute
|
||||
)
|
||||
elif bias_addition_function.has(target.__name__):
|
||||
# use name for some builtin op like @ (matmul)
|
||||
function_to_substitute = func_to_func_dict[target]
|
||||
handle = bias_addition_function.get(target.__name__)(tracer, target, args_proxy, kwargs_proxy,
|
||||
function_to_substitute)
|
||||
handle = bias_addition_function.get(target.__name__)(
|
||||
tracer, target, args_proxy, kwargs_proxy, function_to_substitute
|
||||
)
|
||||
|
||||
elif kind == "call_method":
|
||||
method = getattr(args_metas[0].__class__, target)
|
||||
if bias_addition_method.has(method):
|
||||
function_to_substitute = method_to_func_dict[method]
|
||||
handle = bias_addition_method.get(method)(tracer, target, args_proxy, kwargs_proxy,
|
||||
function_to_substitute)
|
||||
handle = bias_addition_method.get(method)(
|
||||
tracer, target, args_proxy, kwargs_proxy, function_to_substitute
|
||||
)
|
||||
|
||||
elif kind == "call_module":
|
||||
# if not hasattr(self, "orig_forward"):
|
||||
@@ -631,8 +655,9 @@ def bias_addition_pass(gm: ColoGraphModule, root_model: torch.nn.Module, meta_ar
|
||||
mod_type = type(mod)
|
||||
if bias_addition_module.has(mod_type) and mod.bias is not None:
|
||||
function_to_substitute = module_to_func_dict[mod_type]
|
||||
handle = bias_addition_module.get(mod_type)(tracer, target, args_proxy, kwargs_proxy,
|
||||
function_to_substitute)
|
||||
handle = bias_addition_module.get(mod_type)(
|
||||
tracer, target, args_proxy, kwargs_proxy, function_to_substitute
|
||||
)
|
||||
|
||||
if handle is not None:
|
||||
handle.generate()
|
||||
|
@@ -5,4 +5,4 @@ from ...registry import meta_patched_function
|
||||
|
||||
@meta_patched_function.register(torch.nn.functional.relu)
|
||||
def torch_nn_func_relu(input, inplace=False):
    """Meta-device stand-in for ``torch.nn.functional.relu``.

    Args:
        input: tensor whose ``shape`` determines the output shape.
        inplace: accepted for signature compatibility; not used here.

    Returns:
        An uninitialised tensor on the ``meta`` device with ``input.shape``.
    """
    # NOTE(review): the dtype of ``input`` is not propagated; the result uses
    # torch's default dtype.
    return torch.empty(input.shape, device="meta")
|
||||
|
@@ -4,7 +4,7 @@ from ...registry import meta_patched_function
|
||||
|
||||
|
||||
@meta_patched_function.register(torch.matmul)
|
||||
@meta_patched_function.register('matmul') # for built-in op @
|
||||
@meta_patched_function.register("matmul") # for built-in op @
|
||||
def torch_matmul(input, other, *, out=None):
|
||||
# copied from huggingface.utils.fx
|
||||
d1 = input.dim()
|
||||
@@ -44,8 +44,8 @@ def torch_matmul(input, other, *, out=None):
|
||||
|
||||
@meta_patched_function.register(torch.abs)
|
||||
def torch_abs(input, *, out=None):
    """Meta-device stand-in for ``torch.abs``.

    Args:
        input: tensor whose ``shape`` determines the output shape.
        out: must be ``None``; writing into an output tensor is not supported.

    Returns:
        An uninitialised tensor on the ``meta`` device with ``input.shape``.

    Raises:
        AssertionError: if ``out`` is not ``None``.
    """
    assert out is None, "out is not supported yet"
    return torch.empty(input.shape, device="meta")
|
||||
|
||||
|
||||
@meta_patched_function.register(torch.bmm)
|
||||
@@ -89,7 +89,7 @@ def torch_addmm(input, mat1, mat2, *, beta=1, alpha=1, out=None):
|
||||
|
||||
@meta_patched_function.register(torch.var_mean)
|
||||
def torch_var_mean(input, dim, unbiased=True, keepdim=False, *, out=None):
    """Meta-device stand-in for ``torch.var_mean``.

    Args:
        input, dim, unbiased, keepdim: accepted for signature compatibility;
            none of them influence the result of this patch.
        out: must be ``None``; writing into an output tensor is not supported.

    Returns:
        A ``(var, mean)`` pair of zero-dimensional tensors on the ``meta``
        device.

    Raises:
        AssertionError: if ``out`` is not ``None``.
    """
    assert out is None, "saving to out is not supported yet"
    # Create the 0-dim placeholders directly on the meta device instead of
    # allocating a real one-element tensor and moving it afterwards.
    var = torch.empty((), device="meta")
    mean = torch.empty((), device="meta")
    return var, mean
|
||||
|
@@ -8,7 +8,6 @@ from ...registry import meta_patched_function
|
||||
|
||||
|
||||
def _ntuple(n, name="parse"):
|
||||
|
||||
def parse(x):
|
||||
if isinstance(x, collections.abc.Iterable):
|
||||
return tuple(x)
|
||||
@@ -24,21 +23,21 @@ _triple = _ntuple(3, "_triple")
|
||||
|
||||
|
||||
def _extract_kwargs(kwargs):
|
||||
if 'stride' in kwargs:
|
||||
stride = kwargs['stride']
|
||||
if "stride" in kwargs:
|
||||
stride = kwargs["stride"]
|
||||
else:
|
||||
stride = 1
|
||||
# TODO: process str type padding
|
||||
if 'padding' in kwargs:
|
||||
padding = kwargs['padding']
|
||||
if "padding" in kwargs:
|
||||
padding = kwargs["padding"]
|
||||
else:
|
||||
padding = 0
|
||||
if 'dilation' in kwargs:
|
||||
dilation = kwargs['dilation']
|
||||
if "dilation" in kwargs:
|
||||
dilation = kwargs["dilation"]
|
||||
else:
|
||||
dilation = 1
|
||||
if 'output_padding' in kwargs:
|
||||
output_padding = kwargs['output_padding']
|
||||
if "output_padding" in kwargs:
|
||||
output_padding = kwargs["output_padding"]
|
||||
else:
|
||||
output_padding = 0
|
||||
|
||||
@@ -61,7 +60,7 @@ def torch_nn_functional_conv1d(input, weight, **kwargs):
|
||||
c_out,
|
||||
l_out,
|
||||
)
|
||||
return torch.empty(result_shape, device='meta')
|
||||
return torch.empty(result_shape, device="meta")
|
||||
|
||||
|
||||
@meta_patched_function.register(torch.nn.functional.conv2d)
|
||||
@@ -82,7 +81,7 @@ def torch_nn_functional_conv2d(input, weight, **kwargs):
|
||||
h_out,
|
||||
w_out,
|
||||
)
|
||||
return torch.empty(result_shape, device='meta')
|
||||
return torch.empty(result_shape, device="meta")
|
||||
|
||||
|
||||
@meta_patched_function.register(torch.nn.functional.conv3d)
|
||||
@@ -105,7 +104,7 @@ def torch_nn_functional_conv3d(input, weight, **kwargs):
|
||||
h_out,
|
||||
w_out,
|
||||
)
|
||||
return torch.empty(result_shape, device='meta')
|
||||
return torch.empty(result_shape, device="meta")
|
||||
|
||||
|
||||
@meta_patched_function.register(torch.nn.functional.conv_transpose1d)
|
||||
@@ -120,13 +119,14 @@ def torch_nn_functional_convtranspose1d(input, weight, **kwargs):
|
||||
kernel_size = weight.shape[2:]
|
||||
l_in = input.shape[-1]
|
||||
c_out = weight.shape[1]
|
||||
l_out = math.floor((l_in - 1) * stride[0] - 2 * padding[0] + dilation[0] * (kernel_size[0] - 1) +
|
||||
output_padding[0] + 1)
|
||||
l_out = math.floor(
|
||||
(l_in - 1) * stride[0] - 2 * padding[0] + dilation[0] * (kernel_size[0] - 1) + output_padding[0] + 1
|
||||
)
|
||||
result_shape = input.shape[:-2] + (
|
||||
c_out,
|
||||
l_out,
|
||||
)
|
||||
return torch.empty(result_shape, device='meta')
|
||||
return torch.empty(result_shape, device="meta")
|
||||
|
||||
|
||||
@meta_patched_function.register(torch.nn.functional.conv_transpose2d)
|
||||
@@ -141,16 +141,18 @@ def torch_nn_functional_convtranspose2d(input, weight, **kwargs):
|
||||
kernel_size = weight.shape[2:]
|
||||
h_in, w_in = input.shape[-2:]
|
||||
c_out = weight.shape[1]
|
||||
h_out = math.floor((h_in - 1) * stride[0] - 2 * padding[0] + dilation[0] * (kernel_size[0] - 1) +
|
||||
output_padding[0] + 1)
|
||||
w_out = math.floor((w_in - 1) * stride[1] - 2 * padding[1] + dilation[1] * (kernel_size[1] - 1) +
|
||||
output_padding[1] + 1)
|
||||
h_out = math.floor(
|
||||
(h_in - 1) * stride[0] - 2 * padding[0] + dilation[0] * (kernel_size[0] - 1) + output_padding[0] + 1
|
||||
)
|
||||
w_out = math.floor(
|
||||
(w_in - 1) * stride[1] - 2 * padding[1] + dilation[1] * (kernel_size[1] - 1) + output_padding[1] + 1
|
||||
)
|
||||
result_shape = input.shape[:-3] + (
|
||||
c_out,
|
||||
h_out,
|
||||
w_out,
|
||||
)
|
||||
return torch.empty(result_shape, device='meta')
|
||||
return torch.empty(result_shape, device="meta")
|
||||
|
||||
|
||||
@meta_patched_function.register(torch.nn.functional.conv_transpose3d)
|
||||
@@ -165,16 +167,19 @@ def torch_nn_functional_convtranspose3d(input, weight, **kwargs):
|
||||
kernel_size = weight.shape[2:]
|
||||
d_in, h_in, w_in = input.shape[-3:]
|
||||
c_out = weight.shape[1]
|
||||
d_out = math.floor((d_in - 1) * stride[0] - 2 * padding[0] + dilation[0] * (kernel_size[0] - 1) +
|
||||
output_padding[0] + 1)
|
||||
h_out = math.floor((h_in - 1) * stride[1] - 2 * padding[1] + dilation[1] * (kernel_size[1] - 1) +
|
||||
output_padding[1] + 1)
|
||||
w_out = math.floor((w_in - 1) * stride[2] - 2 * padding[2] + dilation[2] * (kernel_size[2] - 1) +
|
||||
output_padding[2] + 1)
|
||||
d_out = math.floor(
|
||||
(d_in - 1) * stride[0] - 2 * padding[0] + dilation[0] * (kernel_size[0] - 1) + output_padding[0] + 1
|
||||
)
|
||||
h_out = math.floor(
|
||||
(h_in - 1) * stride[1] - 2 * padding[1] + dilation[1] * (kernel_size[1] - 1) + output_padding[1] + 1
|
||||
)
|
||||
w_out = math.floor(
|
||||
(w_in - 1) * stride[2] - 2 * padding[2] + dilation[2] * (kernel_size[2] - 1) + output_padding[2] + 1
|
||||
)
|
||||
result_shape = input.shape[:-4] + (
|
||||
c_out,
|
||||
d_out,
|
||||
h_out,
|
||||
w_out,
|
||||
)
|
||||
return torch.empty(result_shape, device='meta')
|
||||
return torch.empty(result_shape, device="meta")
|
||||
|
@@ -4,11 +4,7 @@ from ...registry import meta_patched_function
|
||||
|
||||
|
||||
@meta_patched_function.register(torch.nn.functional.embedding)
|
||||
def torch_nn_functional_embedding(
    input, weight, padding_idx=None, max_norm=None, norm_type=2.0, scale_grad_by_freq=False, sparse=False
):
    """Meta-device stand-in for ``torch.nn.functional.embedding``.

    Args:
        input: index tensor; its full shape becomes the leading output dims.
        weight: embedding matrix; its last dimension becomes the trailing
            output dim.
        padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse:
            accepted for signature compatibility; not used here.

    Returns:
        An uninitialised ``meta`` tensor of shape
        ``(*input.shape, weight.shape[-1])``.
    """
    return torch.empty(*input.shape, weight.shape[-1], device="meta")
|
||||
|
@@ -5,16 +5,11 @@ from ...registry import meta_patched_function
|
||||
|
||||
@meta_patched_function.register(torch.nn.functional.layer_norm)
|
||||
def torch_nn_func_layernorm(input, normalized_shape, weight=None, bias=None, eps=1e-05):
    """Meta-device stand-in for ``torch.nn.functional.layer_norm``.

    Args:
        input: tensor whose ``shape`` determines the output shape.
        normalized_shape, weight, bias, eps: accepted for signature
            compatibility; not used here.

    Returns:
        An uninitialised tensor on the ``meta`` device with ``input.shape``.
    """
    return torch.empty(input.shape, device="meta")
|
||||
|
||||
|
||||
@meta_patched_function.register(torch.nn.functional.batch_norm)
|
||||
def torch_nn_func_batchnorm(
    input, running_mean, running_var, weight=None, bias=None, training=False, momentum=0.1, eps=1e-05
):
    """Meta-device stand-in for ``torch.nn.functional.batch_norm``.

    Args:
        input: tensor whose ``shape`` determines the output shape.
        running_mean, running_var, weight, bias, training, momentum, eps:
            accepted for signature compatibility; not used here.

    Returns:
        An uninitialised tensor on the ``meta`` device with ``input.shape``.
    """
    return torch.empty(input.shape, device="meta")
|
||||
|
@@ -19,9 +19,9 @@ def operator_getitem(a, b):
|
||||
return t
|
||||
|
||||
def _slice_convert(slice_obj):
    """Return a new ``slice`` built from the converted fields of ``slice_obj``.

    The ``start``/``stop``/``step`` values are gathered into a dict, passed
    through ``_slice_attr_convert`` (defined elsewhere in this module), and the
    converted values are used to construct the result.
    """
    attrs = {"start": slice_obj.start, "stop": slice_obj.stop, "step": slice_obj.step}
    new_attrs = _slice_attr_convert(attrs)
    return slice(new_attrs["start"], new_attrs["stop"], new_attrs["step"])
|
||||
|
||||
def _slice_attr_convert(attrs):
|
||||
|
@@ -105,14 +105,15 @@ def torch_cat(tensors, dim=None, axis=None, *, out=None):
|
||||
shapes = [t.shape for t in tensors]
|
||||
shape = list(shapes[0])
|
||||
concatenated_dim = sum(shape[dim] for shape in shapes)
|
||||
final_shape = shape[:dim] + [concatenated_dim] + shape[dim + 1:]
|
||||
final_shape = shape[:dim] + [concatenated_dim] + shape[dim + 1 :]
|
||||
return torch.empty(final_shape, device="meta")
|
||||
|
||||
|
||||
@meta_patched_function.register(torch.repeat_interleave)
|
||||
def torch_repeat_interleave(input, repeats, dim=None, output_size=None):
|
||||
assert isinstance(repeats, int) or isinstance(repeats, torch.Tensor), \
|
||||
"Argument 'repeats' should be of type 'torch.Tensor' or 'int'"
|
||||
assert isinstance(repeats, int) or isinstance(
|
||||
repeats, torch.Tensor
|
||||
), "Argument 'repeats' should be of type 'torch.Tensor' or 'int'"
|
||||
|
||||
shape = list(input.shape) if dim is not None else [input.numel()]
|
||||
dim = dim if dim is not None else 0
|
||||
@@ -132,36 +133,36 @@ def torch_tensor_repeat_interleave(self, repeats, dim=None, *, output_size=None)
|
||||
|
||||
@meta_patched_function.register(torch.roll)
|
||||
def torch_roll(input, shifts, dims=None):
    """Meta-device stand-in for ``torch.roll``.

    Args:
        input: tensor whose ``shape`` determines the output shape.
        shifts, dims: accepted for signature compatibility; not used here.

    Returns:
        An uninitialised tensor on the ``meta`` device with ``input.shape``.
    """
    return torch.empty(input.shape, device="meta")
|
||||
|
||||
|
||||
@meta_patched_function.register(torch.full)
|
||||
def torch_full(size, fill_value, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False):
    """Meta-device stand-in for ``torch.full``.

    Args:
        size: shape of the tensor to create.
        fill_value: accepted for signature compatibility; not used here.
        out: must be ``None``; writing into an output tensor is not supported.
        dtype, layout, requires_grad: forwarded to ``torch.empty``.
        device: ignored; the result is always placed on the ``meta`` device.

    Returns:
        An uninitialised ``meta`` tensor of shape ``size``.

    Raises:
        AssertionError: if ``out`` is not ``None``.
    """
    assert out is None, "assigning result to out is not supported yet"
    return torch.empty(size, device="meta", dtype=dtype, layout=layout, requires_grad=requires_grad)
|
||||
|
||||
|
||||
@meta_patched_function.register(torch.max)
|
||||
def torch_max(input, dim=None, keepdim=False, *, out=None):
    """Meta-device stand-in for ``torch.max``.

    Args:
        input: tensor being reduced.
        dim: ``None`` for a full reduction, an ``int`` for a reduction along
            one dimension, or a 0-D/1-D tensor.
        keepdim: when reducing along an ``int`` ``dim``, keep that dimension
            with size 1 instead of dropping it.
        out: must be ``None``; writing into an output tensor is not supported.

    Returns:
        * ``dim is None``: a zero-dimensional ``meta`` tensor with
          ``input.dtype``.
        * ``dim`` is an ``int``: a ``(values, indices)`` pair of ``meta``
          tensors with the reduced shape; ``values`` has ``input.dtype`` and
          ``indices`` is int64.
        * ``dim`` is a 0-D or 1-D tensor: a ``meta`` tensor shaped like
          ``input``.
        * any other ``dim`` type: ``None``.

    Raises:
        AssertionError: if ``out`` is not ``None``.
        ValueError: if ``dim`` is a tensor with more than one dimension.
    """
    assert out is None, "assigning value to out is not supported yet"
    if dim is None:
        return torch.empty([], device="meta", dtype=input.dtype)

    if isinstance(dim, int):
        shape = list(input.shape)
        if keepdim:
            # Overwrite in place: pop() followed by insert() puts the kept
            # dimension in the wrong slot when ``dim`` is negative.
            shape[dim] = 1
        else:
            shape.pop(dim)
        values = torch.empty(shape, device="meta", dtype=input.dtype)
        # torch.max returns the argmax positions as an int64 tensor.
        indices = torch.empty(shape, device="meta", dtype=torch.long)
        return values, indices

    if isinstance(dim, torch.Tensor):
        # when dim is a 0D or 1D tensor, it will maintain the same shape
        num_dims = dim.dim()
        if num_dims in [0, 1]:
            return torch.empty_like(input, device="meta")
        raise ValueError(f"Expected dim to a 0D or 1D tensor but got {num_dims} dimensions")

    # Unsupported ``dim`` types fall through and yield None.
    return None
|
||||
|
||||
|
||||
@meta_patched_function.register(torch.Tensor.cpu)
|
||||
|
@@ -4,4 +4,4 @@ from .embedding import *
|
||||
from .linear import *
|
||||
from .normalization import *
|
||||
from .pooling import *
|
||||
from .rnn import *
|
||||
from .rnn import *
|
||||
|
@@ -10,4 +10,4 @@ from ...registry import meta_patched_module
|
||||
@meta_patched_module.register(torch.nn.ReLU6)
|
||||
@meta_patched_module.register(torch.nn.PReLU)
|
||||
def torch_nn_non_linear_act(self, input):
    """Meta-device stand-in for the forward pass of element-wise activations.

    Args:
        self: the patched module instance; not read here.
        input: tensor whose ``shape`` determines the output shape.

    Returns:
        An uninitialised tensor on the ``meta`` device with ``input.shape``.
    """
    return torch.empty(input.shape, device="meta")
|
||||
|
@@ -11,13 +11,14 @@ def torch_nn_conv1d(self, input):
|
||||
# at https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html#torch.nn.Conv1d
|
||||
l_in = input.shape[-1]
|
||||
c_out = self.out_channels
|
||||
l_out = math.floor((l_in + 2 * self.padding[0] - self.dilation[0] *
|
||||
(self.kernel_size[0] - 1) - 1) / self.stride[0] + 1)
|
||||
l_out = math.floor(
|
||||
(l_in + 2 * self.padding[0] - self.dilation[0] * (self.kernel_size[0] - 1) - 1) / self.stride[0] + 1
|
||||
)
|
||||
result_shape = input.shape[:-2] + (
|
||||
c_out,
|
||||
l_out,
|
||||
)
|
||||
return torch.empty(result_shape, device='meta')
|
||||
return torch.empty(result_shape, device="meta")
|
||||
|
||||
|
||||
@meta_patched_module.register(torch.nn.Conv2d)
|
||||
@@ -26,16 +27,18 @@ def torch_nn_conv2d(self, input):
|
||||
# at https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html#torch.nn.Conv2d
|
||||
h_in, w_in = input.shape[-2:]
|
||||
c_out = self.out_channels
|
||||
h_out = math.floor((h_in + 2 * self.padding[0] - self.dilation[0] *
|
||||
(self.kernel_size[0] - 1) - 1) / self.stride[0] + 1)
|
||||
w_out = math.floor((w_in + 2 * self.padding[1] - self.dilation[1] *
|
||||
(self.kernel_size[1] - 1) - 1) / self.stride[1] + 1)
|
||||
h_out = math.floor(
|
||||
(h_in + 2 * self.padding[0] - self.dilation[0] * (self.kernel_size[0] - 1) - 1) / self.stride[0] + 1
|
||||
)
|
||||
w_out = math.floor(
|
||||
(w_in + 2 * self.padding[1] - self.dilation[1] * (self.kernel_size[1] - 1) - 1) / self.stride[1] + 1
|
||||
)
|
||||
result_shape = input.shape[:-3] + (
|
||||
c_out,
|
||||
h_out,
|
||||
w_out,
|
||||
)
|
||||
return torch.empty(result_shape, device='meta')
|
||||
return torch.empty(result_shape, device="meta")
|
||||
|
||||
|
||||
@meta_patched_module.register(torch.nn.Conv3d)
|
||||
@@ -44,19 +47,22 @@ def torch_nn_conv3d(self, input):
|
||||
# at https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html#torch.nn.Conv3d
|
||||
d_in, h_in, w_in = input.shape[-3:]
|
||||
c_out = self.out_channels
|
||||
d_out = math.floor((d_in + 2 * self.padding[0] - self.dilation[0] *
|
||||
(self.kernel_size[0] - 1) - 1) / self.stride[0] + 1)
|
||||
h_out = math.floor((h_in + 2 * self.padding[1] - self.dilation[1] *
|
||||
(self.kernel_size[1] - 1) - 1) / self.stride[1] + 1)
|
||||
w_out = math.floor((w_in + 2 * self.padding[2] - self.dilation[2] *
|
||||
(self.kernel_size[2] - 1) - 1) / self.stride[2] + 1)
|
||||
d_out = math.floor(
|
||||
(d_in + 2 * self.padding[0] - self.dilation[0] * (self.kernel_size[0] - 1) - 1) / self.stride[0] + 1
|
||||
)
|
||||
h_out = math.floor(
|
||||
(h_in + 2 * self.padding[1] - self.dilation[1] * (self.kernel_size[1] - 1) - 1) / self.stride[1] + 1
|
||||
)
|
||||
w_out = math.floor(
|
||||
(w_in + 2 * self.padding[2] - self.dilation[2] * (self.kernel_size[2] - 1) - 1) / self.stride[2] + 1
|
||||
)
|
||||
result_shape = input.shape[:-4] + (
|
||||
c_out,
|
||||
d_out,
|
||||
h_out,
|
||||
w_out,
|
||||
)
|
||||
return torch.empty(result_shape, device='meta')
|
||||
return torch.empty(result_shape, device="meta")
|
||||
|
||||
|
||||
@meta_patched_module.register(torch.nn.ConvTranspose1d)
|
||||
@@ -65,13 +71,18 @@ def torch_nn_convtranspose1d(self, input):
|
||||
# at https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose1d.html
|
||||
l_in = input.shape[-1]
|
||||
c_out = self.out_channels
|
||||
l_out = math.floor((l_in - 1) * self.stride[0] - 2 * self.padding[0] + self.dilation[0] *
|
||||
(self.kernel_size[0] - 1) + self.output_padding[0] + 1)
|
||||
l_out = math.floor(
|
||||
(l_in - 1) * self.stride[0]
|
||||
- 2 * self.padding[0]
|
||||
+ self.dilation[0] * (self.kernel_size[0] - 1)
|
||||
+ self.output_padding[0]
|
||||
+ 1
|
||||
)
|
||||
result_shape = input.shape[:-2] + (
|
||||
c_out,
|
||||
l_out,
|
||||
)
|
||||
return torch.empty(result_shape, device='meta')
|
||||
return torch.empty(result_shape, device="meta")
|
||||
|
||||
|
||||
@meta_patched_module.register(torch.nn.ConvTranspose2d)
|
||||
@@ -80,16 +91,26 @@ def torch_nn_convtranspose2d(self, input):
|
||||
# at https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose2d.html
|
||||
h_in, w_in = input.shape[-2:]
|
||||
c_out = self.out_channels
|
||||
h_out = math.floor((h_in - 1) * self.stride[0] - 2 * self.padding[0] + self.dilation[0] *
|
||||
(self.kernel_size[0] - 1) + self.output_padding[0] + 1)
|
||||
w_out = math.floor((w_in - 1) * self.stride[1] - 2 * self.padding[1] + self.dilation[1] *
|
||||
(self.kernel_size[1] - 1) + self.output_padding[1] + 1)
|
||||
h_out = math.floor(
|
||||
(h_in - 1) * self.stride[0]
|
||||
- 2 * self.padding[0]
|
||||
+ self.dilation[0] * (self.kernel_size[0] - 1)
|
||||
+ self.output_padding[0]
|
||||
+ 1
|
||||
)
|
||||
w_out = math.floor(
|
||||
(w_in - 1) * self.stride[1]
|
||||
- 2 * self.padding[1]
|
||||
+ self.dilation[1] * (self.kernel_size[1] - 1)
|
||||
+ self.output_padding[1]
|
||||
+ 1
|
||||
)
|
||||
result_shape = input.shape[:-3] + (
|
||||
c_out,
|
||||
h_out,
|
||||
w_out,
|
||||
)
|
||||
return torch.empty(result_shape, device='meta')
|
||||
return torch.empty(result_shape, device="meta")
|
||||
|
||||
|
||||
@meta_patched_module.register(torch.nn.ConvTranspose3d)
|
||||
@@ -98,16 +119,31 @@ def torch_nn_convtranspose3d(self, input):
|
||||
# at https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose3d.html
|
||||
d_in, h_in, w_in = input.shape[-3:]
|
||||
c_out = self.out_channels
|
||||
d_out = math.floor((d_in - 1) * self.stride[0] - 2 * self.padding[0] + self.dilation[0] *
|
||||
(self.kernel_size[0] - 1) + self.output_padding[0] + 1)
|
||||
h_out = math.floor((h_in - 1) * self.stride[1] - 2 * self.padding[1] + self.dilation[1] *
|
||||
(self.kernel_size[1] - 1) + self.output_padding[1] + 1)
|
||||
w_out = math.floor((w_in - 1) * self.stride[2] - 2 * self.padding[2] + self.dilation[2] *
|
||||
(self.kernel_size[2] - 1) + self.output_padding[2] + 1)
|
||||
d_out = math.floor(
|
||||
(d_in - 1) * self.stride[0]
|
||||
- 2 * self.padding[0]
|
||||
+ self.dilation[0] * (self.kernel_size[0] - 1)
|
||||
+ self.output_padding[0]
|
||||
+ 1
|
||||
)
|
||||
h_out = math.floor(
|
||||
(h_in - 1) * self.stride[1]
|
||||
- 2 * self.padding[1]
|
||||
+ self.dilation[1] * (self.kernel_size[1] - 1)
|
||||
+ self.output_padding[1]
|
||||
+ 1
|
||||
)
|
||||
w_out = math.floor(
|
||||
(w_in - 1) * self.stride[2]
|
||||
- 2 * self.padding[2]
|
||||
+ self.dilation[2] * (self.kernel_size[2] - 1)
|
||||
+ self.output_padding[2]
|
||||
+ 1
|
||||
)
|
||||
result_shape = input.shape[:-4] + (
|
||||
c_out,
|
||||
d_out,
|
||||
h_out,
|
||||
w_out,
|
||||
)
|
||||
return torch.empty(result_shape, device='meta')
|
||||
return torch.empty(result_shape, device="meta")
|
||||
|
@@ -6,4 +6,4 @@ from ...registry import meta_patched_module
|
||||
@meta_patched_module.register(torch.nn.Embedding)
|
||||
def torch_nn_embedding(self, input):
    """Meta-device stand-in for the forward pass of ``torch.nn.Embedding``.

    Args:
        self: the patched module; ``self.embedding_dim`` supplies the trailing
            output dimension.
        input: index tensor; its full shape becomes the leading output dims.

    Returns:
        An uninitialised ``meta`` tensor of shape
        ``input.shape + (self.embedding_dim,)``.
    """
    result_shape = input.shape + (self.embedding_dim,)
    return torch.empty(result_shape, device="meta")
|
||||
|
@@ -6,5 +6,7 @@ from ...registry import meta_patched_module
|
||||
@meta_patched_module.register(torch.nn.Linear)
|
||||
def torch_nn_linear(self, input):
    """Meta-device stand-in for the forward pass of ``torch.nn.Linear``.

    Args:
        self: the patched module; ``in_features`` and ``out_features`` are read.
        input: tensor whose last dimension must equal ``self.in_features``.

    Returns:
        An uninitialised ``meta`` tensor shaped like ``input`` with the last
        dimension replaced by ``self.out_features``.

    Raises:
        AssertionError: if the last dimension of ``input`` does not match
            ``self.in_features``.
    """
    last_dim = input.shape[-1]
    assert (
        last_dim == self.in_features
    ), f"Expected hidden size {self.in_features} but got {last_dim} for the torch.nn.Linear patch"
    return torch.empty(input.shape[:-1] + (self.out_features,), device="meta")
|
||||
|
@@ -23,6 +23,7 @@ def torch_nn_normalize(self, input):
|
||||
|
||||
try:
|
||||
import apex
|
||||
|
||||
meta_patched_module.register(apex.normalization.FusedLayerNorm)(torch_nn_normalize)
|
||||
meta_patched_module.register(apex.normalization.FusedRMSNorm)(torch_nn_normalize)
|
||||
meta_patched_module.register(apex.normalization.MixedFusedLayerNorm)(torch_nn_normalize)
|
||||
|
@@ -8,7 +8,7 @@ from ...registry import meta_patched_module
|
||||
@meta_patched_module.register(torch.nn.AvgPool1d)
|
||||
def torch_nn_avgpool1d(self, input):
|
||||
num_dim = input.dim()
|
||||
assert num_dim in [2, 3], f'expected the input to have 2 or 3 dimensions, but got {num_dim} dimensions'
|
||||
assert num_dim in [2, 3], f"expected the input to have 2 or 3 dimensions, but got {num_dim} dimensions"
|
||||
|
||||
l_in = input.shape[-1]
|
||||
|
||||
@@ -25,13 +25,13 @@ def torch_nn_avgpool1d(self, input):
|
||||
l_out = math.floor((l_in + 2 * padding[0] - kernel_size[0]) / stride[0] + 1)
|
||||
|
||||
result_shape = tuple(input.shape[:-1]) + (l_out,)
|
||||
return torch.empty(result_shape, device='meta')
|
||||
return torch.empty(result_shape, device="meta")
|
||||
|
||||
|
||||
@meta_patched_module.register(torch.nn.AvgPool2d)
|
||||
def torch_nn_avgpool2d(self, input):
|
||||
num_dim = input.dim()
|
||||
assert num_dim in [3, 4], f'expected the input to have 3 or 4 dimensions, but got {num_dim} dimensions'
|
||||
assert num_dim in [3, 4], f"expected the input to have 3 or 4 dimensions, but got {num_dim} dimensions"
|
||||
|
||||
h_in, w_in = input.shape[-2:]
|
||||
|
||||
@@ -52,13 +52,13 @@ def torch_nn_avgpool2d(self, input):
|
||||
h_out,
|
||||
w_out,
|
||||
)
|
||||
return torch.empty(result_shape, device='meta')
|
||||
return torch.empty(result_shape, device="meta")
|
||||
|
||||
|
||||
@meta_patched_module.register(torch.nn.AvgPool3d)
|
||||
def torch_nn_avgpool3d(self, input):
|
||||
num_dim = input.dim()
|
||||
assert num_dim in [4, 5], f'expected the input to have 4 or 5 dimensions, but got {num_dim} dimensions'
|
||||
assert num_dim in [4, 5], f"expected the input to have 4 or 5 dimensions, but got {num_dim} dimensions"
|
||||
|
||||
d_in, h_in, w_in = input.shape[-3:]
|
||||
|
||||
@@ -81,13 +81,13 @@ def torch_nn_avgpool3d(self, input):
|
||||
h_out,
|
||||
w_out,
|
||||
)
|
||||
return torch.empty(result_shape, device='meta')
|
||||
return torch.empty(result_shape, device="meta")
|
||||
|
||||
|
||||
@meta_patched_module.register(torch.nn.MaxPool1d)
|
||||
def torch_nn_maxpool1d(self, input):
|
||||
num_dim = input.dim()
|
||||
assert num_dim in [2, 3], f'expected the input to have 2 or 3 dimensions, but got {num_dim} dimensions'
|
||||
assert num_dim in [2, 3], f"expected the input to have 2 or 3 dimensions, but got {num_dim} dimensions"
|
||||
|
||||
l_in = input.shape[-1]
|
||||
|
||||
@@ -105,13 +105,13 @@ def torch_nn_maxpool1d(self, input):
|
||||
l_out = math.floor((l_in + 2 * padding[0] - dilation[0] * (kernel_size[0] - 1) - 1) / stride[0] + 1)
|
||||
|
||||
result_shape = tuple(input.shape[:-1]) + (l_out,)
|
||||
return torch.empty(result_shape, device='meta')
|
||||
return torch.empty(result_shape, device="meta")
|
||||
|
||||
|
||||
@meta_patched_module.register(torch.nn.MaxPool2d)
|
||||
def torch_nn_maxpool2d(self, input):
|
||||
num_dim = input.dim()
|
||||
assert num_dim in [3, 4], f'expected the input to have 3 or 4 dimensions, but got {num_dim} dimensions'
|
||||
assert num_dim in [3, 4], f"expected the input to have 3 or 4 dimensions, but got {num_dim} dimensions"
|
||||
|
||||
h_in, w_in = input.shape[-2:]
|
||||
|
||||
@@ -133,13 +133,13 @@ def torch_nn_maxpool2d(self, input):
|
||||
h_out,
|
||||
w_out,
|
||||
)
|
||||
return torch.empty(result_shape, device='meta')
|
||||
return torch.empty(result_shape, device="meta")
|
||||
|
||||
|
||||
@meta_patched_module.register(torch.nn.MaxPool3d)
|
||||
def torch_nn_maxpool3d(self, input):
|
||||
num_dim = input.dim()
|
||||
assert num_dim in [4, 5], f'expected the input to have 4 or 5 dimensions, but got {num_dim} dimensions'
|
||||
assert num_dim in [4, 5], f"expected the input to have 4 or 5 dimensions, but got {num_dim} dimensions"
|
||||
|
||||
d_in, h_in, w_in = input.shape[-3:]
|
||||
|
||||
@@ -163,7 +163,7 @@ def torch_nn_maxpool3d(self, input):
|
||||
h_out,
|
||||
w_out,
|
||||
)
|
||||
return torch.empty(result_shape, device='meta')
|
||||
return torch.empty(result_shape, device="meta")
|
||||
|
||||
|
||||
@meta_patched_module.register(torch.nn.AdaptiveAvgPool1d)
|
||||
@@ -175,7 +175,7 @@ def torch_nn_adapative_pooling_1d(self, input):
|
||||
else:
|
||||
output_size = self.output_size
|
||||
result_shape = tuple(input.shape[:-1]) + output_size
|
||||
return torch.empty(result_shape, device='meta')
|
||||
return torch.empty(result_shape, device="meta")
|
||||
|
||||
|
||||
@meta_patched_module.register(torch.nn.AdaptiveAvgPool2d)
|
||||
@@ -187,7 +187,7 @@ def torch_nn_adapative_pooling_2d(self, input):
|
||||
else:
|
||||
output_size = self.output_size
|
||||
result_shape = tuple(input.shape[:-2]) + output_size
|
||||
return torch.empty(result_shape, device='meta')
|
||||
return torch.empty(result_shape, device="meta")
|
||||
|
||||
|
||||
@meta_patched_module.register(torch.nn.AdaptiveAvgPool3d)
|
||||
@@ -199,4 +199,4 @@ def torch_nn_adapative_pooling_3d(self, input):
|
||||
else:
|
||||
output_size = self.output_size
|
||||
result_shape = tuple(input.shape[:-3]) + output_size
|
||||
return torch.empty(result_shape, device='meta')
|
||||
return torch.empty(result_shape, device="meta")
|
||||
|
@@ -1,5 +1,3 @@
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
|
||||
from ...registry import meta_patched_module
|
||||
@@ -8,9 +6,11 @@ from ...registry import meta_patched_module
|
||||
@meta_patched_module.register(torch.nn.GRU)
|
||||
@meta_patched_module.register(torch.nn.RNN)
|
||||
def torch_nn_rnn(self, input, hx):
|
||||
assert input.shape[
|
||||
-1] == self.input_size, f'Expected input to have input size {self.input_size} but got {input.shape[-1]} for the torch.nn.RNN patch'
|
||||
assert hx.shape[
|
||||
-1] == self.hidden_size, f'Expected hx to have hidden size {self.hidden_size} but got {hx.shape[-1]} for the torch.nn.RNN patch'
|
||||
assert (
|
||||
input.shape[-1] == self.input_size
|
||||
), f"Expected input to have input size {self.input_size} but got {input.shape[-1]} for the torch.nn.RNN patch"
|
||||
assert (
|
||||
hx.shape[-1] == self.hidden_size
|
||||
), f"Expected hx to have hidden size {self.hidden_size} but got {hx.shape[-1]} for the torch.nn.RNN patch"
|
||||
d = 2 if self.bidirectional else 1
|
||||
return torch.empty(input.shape[:-1] + (self.hidden_size * d,), device="meta"), hx
|
||||
|
@@ -1,11 +1,9 @@
|
||||
class PatchRegistry:
|
||||
|
||||
def __init__(self, name):
|
||||
self.name = name
|
||||
self.store = {}
|
||||
|
||||
def register(self, source):
|
||||
|
||||
def wrapper(func):
|
||||
self.store[source] = func
|
||||
return func
|
||||
@@ -21,8 +19,8 @@ class PatchRegistry:
|
||||
return source in self.store
|
||||
|
||||
|
||||
meta_patched_function = PatchRegistry(name='patched_functions_for_meta_execution')
|
||||
meta_patched_module = PatchRegistry(name='patched_modules_for_meta_execution')
|
||||
bias_addition_function = PatchRegistry(name='patched_function_for_bias_addition')
|
||||
bias_addition_module = PatchRegistry(name='patched_module_for_bias_addition')
|
||||
bias_addition_method = PatchRegistry(name='patched_method_for_bias_addition')
|
||||
meta_patched_function = PatchRegistry(name="patched_functions_for_meta_execution")
|
||||
meta_patched_module = PatchRegistry(name="patched_modules_for_meta_execution")
|
||||
bias_addition_function = PatchRegistry(name="patched_function_for_bias_addition")
|
||||
bias_addition_module = PatchRegistry(name="patched_module_for_bias_addition")
|
||||
bias_addition_method = PatchRegistry(name="patched_method_for_bias_addition")
|
||||
|
@@ -29,7 +29,7 @@ from .registry import (
|
||||
meta_patched_module,
|
||||
)
|
||||
|
||||
__all__ = ['ColoTracer']
|
||||
__all__ = ["ColoTracer"]
|
||||
|
||||
|
||||
class TracerType(enum.Enum):
|
||||
@@ -103,7 +103,7 @@ class ColoTracer(Tracer):
|
||||
if kind == "call_function":
|
||||
if bias_addition_function.has(target):
|
||||
if target == torch.nn.functional.linear:
|
||||
if 'bias' in kwargs and kwargs['bias'] is not None:
|
||||
if "bias" in kwargs and kwargs["bias"] is not None:
|
||||
function_to_substitute = func_to_func_dict[target]
|
||||
handle = bias_addition_function.get(target)(self, target, args, kwargs, function_to_substitute)
|
||||
else:
|
||||
@@ -160,22 +160,27 @@ class ColoTracer(Tracer):
|
||||
if n not in parameter_proxy_cache:
|
||||
kwargs = {}
|
||||
if "proxy_factory_fn" in inspect.signature(self.create_proxy).parameters:
|
||||
kwargs["proxy_factory_fn"] = (None if not self.param_shapes_constant else
|
||||
lambda node: ParameterProxy(self, node, n, attr_val))
|
||||
val_proxy = self.create_proxy("get_attr", n, (), {}, **kwargs) # type: ignore[arg-type]
|
||||
kwargs["proxy_factory_fn"] = (
|
||||
None
|
||||
if not self.param_shapes_constant
|
||||
else lambda node: ParameterProxy(self, node, n, attr_val)
|
||||
)
|
||||
val_proxy = self.create_proxy("get_attr", n, (), {}, **kwargs) # type: ignore[arg-type]
|
||||
parameter_proxy_cache[n] = val_proxy
|
||||
return parameter_proxy_cache[n]
|
||||
return None
|
||||
|
||||
if isinstance(attr_val, torch.nn.Parameter):
|
||||
maybe_parameter_proxy = maybe_get_proxy_for_attr(attr_val, self.root.named_parameters(),
|
||||
parameter_proxy_cache)
|
||||
maybe_parameter_proxy = maybe_get_proxy_for_attr(
|
||||
attr_val, self.root.named_parameters(), parameter_proxy_cache
|
||||
)
|
||||
if maybe_parameter_proxy is not None:
|
||||
return maybe_parameter_proxy
|
||||
|
||||
if self.proxy_buffer_attributes and isinstance(attr_val, torch.Tensor):
|
||||
maybe_buffer_proxy = maybe_get_proxy_for_attr(attr_val, self.root.named_buffers(),
|
||||
parameter_proxy_cache)
|
||||
maybe_buffer_proxy = maybe_get_proxy_for_attr(
|
||||
attr_val, self.root.named_buffers(), parameter_proxy_cache
|
||||
)
|
||||
if maybe_buffer_proxy is not None:
|
||||
return maybe_buffer_proxy
|
||||
|
||||
@@ -190,7 +195,7 @@ class ColoTracer(Tracer):
|
||||
# if a customized or third-party module like apex.normalization.FusedRMSNorm is patched,
|
||||
# we should treat it as leaf module as well
|
||||
if meta_patched_module.has(m.__class__) or self.is_leaf_module(m, module_qualified_name):
|
||||
return self.create_proxy('call_module', module_qualified_name, args, kwargs)
|
||||
return self.create_proxy("call_module", module_qualified_name, args, kwargs)
|
||||
else:
|
||||
return forward(*args, **kwargs)
|
||||
|
||||
@@ -211,7 +216,6 @@ class ColoTracer(Tracer):
|
||||
raise ValueError(f"Unrecognized tracer type {tracer_type}")
|
||||
|
||||
def _meta_data_computing(self, kind, target, args, kwargs):
|
||||
|
||||
if kind == "placeholder" and target in self.meta_args and self.meta_args[target].is_meta:
|
||||
meta_out = self.meta_args[target]
|
||||
return meta_out
|
||||
@@ -235,8 +239,9 @@ class ColoTracer(Tracer):
|
||||
# Therefore, I need to record the nn.parameter.Parameter attribute for the operation
|
||||
# added by the bias addition manipulation following the get_attr node.
|
||||
convert_to_parameter = False
|
||||
if target in (torch.transpose, torch.reshape) and isinstance(args_metas[0],
|
||||
torch.nn.parameter.Parameter):
|
||||
if target in (torch.transpose, torch.reshape) and isinstance(
|
||||
args_metas[0], torch.nn.parameter.Parameter
|
||||
):
|
||||
convert_to_parameter = True
|
||||
# fetch patched function
|
||||
if meta_patched_function.has(target):
|
||||
@@ -309,10 +314,12 @@ class ColoTracer(Tracer):
|
||||
|
||||
return meta_out
|
||||
|
||||
def trace(self,
|
||||
root: nn.Module,
|
||||
concrete_args: Optional[Dict[str, Tensor]] = None,
|
||||
meta_args: Optional[Dict[str, Tensor]] = None) -> Graph:
|
||||
def trace(
|
||||
self,
|
||||
root: nn.Module,
|
||||
concrete_args: Optional[Dict[str, Tensor]] = None,
|
||||
meta_args: Optional[Dict[str, Tensor]] = None,
|
||||
) -> Graph:
|
||||
"""
|
||||
Trace the forward computation graph using `torch.fx.Tracer`. This tracer enables data-dependent control flow.
|
||||
|
||||
@@ -341,9 +348,7 @@ class ColoTracer(Tracer):
|
||||
# update concrete args with default values
|
||||
non_meta_arg_names = sig_names - meta_arg_names
|
||||
for k, v in sig.parameters.items():
|
||||
if k in non_meta_arg_names and \
|
||||
k not in concrete_args and \
|
||||
v.default is not inspect.Parameter.empty:
|
||||
if k in non_meta_arg_names and k not in concrete_args and v.default is not inspect.Parameter.empty:
|
||||
concrete_args[k] = v.default
|
||||
|
||||
# get non concrete arg names
|
||||
@@ -354,7 +359,8 @@ class ColoTracer(Tracer):
|
||||
success, element = is_element_in_list(names, sig_names)
|
||||
if not success:
|
||||
raise KeyError(
|
||||
f"argument {element} is not found in the signature of {root.__class__.__name__}'s forward function")
|
||||
f"argument {element} is not found in the signature of {root.__class__.__name__}'s forward function"
|
||||
)
|
||||
|
||||
_check_arg_name_valid(meta_arg_names)
|
||||
_check_arg_name_valid(concrete_arg_names)
|
||||
@@ -363,11 +369,13 @@ class ColoTracer(Tracer):
|
||||
def _check_kwargs(kwargs, should_be_meta: bool):
|
||||
for k, v in kwargs.items():
|
||||
if not should_be_meta:
|
||||
assert not torch.is_tensor(v) or not v.is_meta, \
|
||||
f'Expected the {k} not to be a meta tensor, please check the args passed to the tracer'
|
||||
assert (
|
||||
not torch.is_tensor(v) or not v.is_meta
|
||||
), f"Expected the {k} not to be a meta tensor, please check the args passed to the tracer"
|
||||
else:
|
||||
assert v.is_meta == should_be_meta, \
|
||||
f'Expected the is_meta attribute of {k} to be {should_be_meta}, but got {v.is_meta}, please check the args passed to the tracer'
|
||||
assert (
|
||||
v.is_meta == should_be_meta
|
||||
), f"Expected the is_meta attribute of {k} to be {should_be_meta}, but got {v.is_meta}, please check the args passed to the tracer"
|
||||
|
||||
_check_kwargs(concrete_args, should_be_meta=False)
|
||||
_check_kwargs(meta_args, should_be_meta=True)
|
||||
@@ -442,7 +450,6 @@ class ColoTracer(Tracer):
|
||||
orig_ckpt_func = torch.utils.checkpoint.CheckpointFunction
|
||||
|
||||
class PatchedCheckpointFunction(torch.autograd.Function):
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, run_function, preserve_rng_state, *args):
|
||||
# signal that the current tracing occurs within activation checkpoint part
|
||||
@@ -455,7 +462,8 @@ class ColoTracer(Tracer):
|
||||
@staticmethod
|
||||
def backward(ctx: Any, *grad_outputs: Any) -> Any:
|
||||
raise NotImplementedError(
|
||||
"We do not implement the backward pass as we only trace the forward pass.")
|
||||
"We do not implement the backward pass as we only trace the forward pass."
|
||||
)
|
||||
|
||||
# override the checkpoint function
|
||||
torch.utils.checkpoint.CheckpointFunction = PatchedCheckpointFunction
|
||||
@@ -470,12 +478,11 @@ class ColoTracer(Tracer):
|
||||
|
||||
if self.inside_torch_checkpoint_func:
|
||||
# annotate the activation checkpoint module
|
||||
node.meta['activation_checkpoint'] = self.act_ckpt_region_count
|
||||
node.meta["activation_checkpoint"] = self.act_ckpt_region_count
|
||||
return node
|
||||
|
||||
|
||||
def wrap_tensor_constructor_method(target):
|
||||
|
||||
def look_for_proxy(*args, **kwargs):
|
||||
# find in pos vars
|
||||
for arg in args:
|
||||
@@ -518,12 +525,10 @@ def wrap_tensor_constructor_method(target):
|
||||
for method in magic_methods:
|
||||
|
||||
def _scope(method):
|
||||
|
||||
def impl(*args, **kwargs):
|
||||
|
||||
tracer = args[0].tracer
|
||||
target = getattr(operator, method)
|
||||
proxy = tracer.create_proxy('call_function', target, args, kwargs)
|
||||
proxy = tracer.create_proxy("call_function", target, args, kwargs)
|
||||
if not isinstance(proxy, ColoProxy):
|
||||
meta_out = compute_meta_data_for_functions_proxy(target, args, kwargs)
|
||||
proxy = ColoProxy(proxy.node)
|
||||
@@ -542,7 +547,7 @@ def _define_reflectable(orig_method_name):
|
||||
|
||||
def impl(self, rhs):
|
||||
target = getattr(operator, orig_method_name)
|
||||
proxy = self.tracer.create_proxy('call_function', target, (rhs, self), {})
|
||||
proxy = self.tracer.create_proxy("call_function", target, (rhs, self), {})
|
||||
if not isinstance(proxy, ColoProxy):
|
||||
meta_out = compute_meta_data_for_functions_proxy(target, *(rhs, self), {})
|
||||
proxy = ColoProxy(proxy.node)
|
||||
|
Reference in New Issue
Block a user