Commit e08e93f

albanD authored and facebook-github-bot committed on Aug 24, 2020

Reland of benchmark code (pytorch#43428)

Summary: Reland of the benchmark code that broke the slow tests because the GPUs were running out of memory.

Pull Request resolved: pytorch#43428
Reviewed By: ngimel
Differential Revision: D23296136
Pulled By: albanD
fbshipit-source-id: 0002ae23dc82f401604e33d0905d6b9eedebc851

1 parent 4cfac34 · commit e08e93f
11 files changed: +2078 −0 lines
benchmarks/functional_autograd_benchmark/README.md (new file, +48 lines)

# Benchmarking tool for the autograd API

This folder contains a set of self-contained scripts that allow you to benchmark autograd with different common models.
It is designed to run the benchmark before and after your change and will generate a table to share on the PR.

To do so, you can use `functional_autograd_benchmark.py` to run the benchmarks before your change (writing the output to `before.txt`) and after your change (writing the output to `after.txt`).
You can then use `compare.py` to get a markdown table comparing the two runs.

The default arguments of `functional_autograd_benchmark.py` should be used in general. You can change them, though, to force a given device or to run even the (very) slow settings.

### Sample usage

```bash
# Make sure you compile pytorch in release mode and with the same flags before/after
export DEBUG=0
# When running on CPU, it might be required to limit the number of cores to avoid oversubscription
export OMP_NUM_THREADS=10

# Compile pytorch with the base revision
git checkout master
python setup.py develop

# Run the benchmark for the base
# This will use the GPU if available.
pushd benchmarks/functional_autograd_benchmark
python functional_autograd_benchmark.py --output before.txt

# Compile pytorch with your change
popd
git checkout your_feature_branch
python setup.py develop

# Run the benchmark for the new version
pushd benchmarks/functional_autograd_benchmark
python functional_autograd_benchmark.py --output after.txt

# Get the markdown table that you can paste in your github PR
python compare.py

popd
```

### Files in this folder:
- `functional_autograd_benchmark.py` is the main entry point to run the benchmark.
- `compare.py` is the entry point to run the comparison script that generates a markdown table (see the sample table below).
- `torchaudio_models.py` and `torchvision_models.py` contain code extracted from torchaudio and torchvision so the models can run without a specific version of those libraries installed.
- `ppl_models.py`, `vision_models.py` and `audio_text_models.py` contain all the getter functions used for the benchmark.
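
For reference, the table `compare.py` emits has the shape below; the numbers here are invented purely to show the format (the header columns are defined in `compare.py`):

```
| model | task | speedup | mean (before) | var (before) | mean (after) | var (after) |
| -- | -- | -- | -- | -- | -- | -- |
| resnet18 | vjp | 1.05 | 0.21 | 1e-05 | 0.20 | 1e-05 |
```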
benchmarks/functional_autograd_benchmark/audio_text_models.py (new file, +122 lines)

import torch
from torch import nn, Tensor

import torchaudio_models as models

from utils import extract_weights, load_weights, GetterReturnType

def get_wav2letter(device: torch.device) -> GetterReturnType:
    N = 10
    input_frames = 700
    vocab_size = 28
    model = models.Wav2Letter(num_classes=vocab_size)
    criterion = torch.nn.NLLLoss()
    model.to(device)
    params, names = extract_weights(model)

    inputs = torch.rand([N, 1, input_frames], device=device)
    labels = torch.rand(N, 3, device=device).mul(vocab_size).long()

    def forward(*new_params: Tensor) -> Tensor:
        load_weights(model, names, new_params)
        out = model(inputs)

        loss = criterion(out, labels)
        return loss

    return forward, params

def get_deepspeech(device: torch.device) -> GetterReturnType:
    sample_rate = 16000
    window_size = 0.02
    window = "hamming"
    audio_conf = dict(sample_rate=sample_rate,
                      window_size=window_size,
                      window=window,
                      noise_dir=None)

    N = 10
    num_classes = 10
    spectrogram_size = 161
    # Commented are the original sizes in the code
    seq_length = 500  # 1343
    target_length = 10  # 50
    labels = torch.rand(num_classes, device=device)
    inputs = torch.rand(N, 1, spectrogram_size, seq_length, device=device)
    # Sequence length for each input
    inputs_sizes = torch.rand(N, device=device).mul(seq_length * 0.1).add(seq_length * 0.8)
    targets = torch.rand(N, target_length, device=device)
    targets_sizes = torch.full((N,), target_length, dtype=torch.int, device=device)

    model = models.DeepSpeech(rnn_type=nn.LSTM, labels=labels, rnn_hidden_size=1024, nb_layers=5,
                              audio_conf=audio_conf, bidirectional=True)
    model = model.to(device)
    criterion = nn.CTCLoss()
    params, names = extract_weights(model)

    def forward(*new_params: Tensor) -> Tensor:
        load_weights(model, names, new_params)
        out, out_sizes = model(inputs, inputs_sizes)
        out = out.transpose(0, 1)  # For ctc loss

        loss = criterion(out, targets, out_sizes, targets_sizes)
        return loss

    return forward, params

def get_transformer(device: torch.device) -> GetterReturnType:
    # For most SOTA research, you would like to have embed to 720, nhead to 12, bsz to 64, tgt_len/src_len to 128.
    N = 64
    seq_length = 128
    ntoken = 50
    model = models.TransformerModel(ntoken=ntoken, ninp=720, nhead=12, nhid=2048, nlayers=2)
    model.to(device)
    criterion = nn.NLLLoss()
    params, names = extract_weights(model)

    data = torch.rand(N, seq_length + 1, device=device).mul(ntoken).long()
    inputs = data.narrow(1, 0, seq_length)
    targets = data.narrow(1, 1, seq_length)

    def forward(*new_params: Tensor) -> Tensor:
        load_weights(model, names, new_params)
        out = model(inputs)

        loss = criterion(out.reshape(N * seq_length, ntoken), targets.reshape(N * seq_length))
        return loss

    return forward, params

def get_multiheadattn(device: torch.device) -> GetterReturnType:
    # From https://github.com/pytorch/text/blob/master/test/data/test_modules.py#L10
    embed_dim, nhead, tgt_len, src_len, bsz = 10, 5, 6, 10, 64
    # Build torchtext MultiheadAttention module
    in_proj = models.InProjContainer(torch.nn.Linear(embed_dim, embed_dim, bias=False),
                                     torch.nn.Linear(embed_dim, embed_dim, bias=False),
                                     torch.nn.Linear(embed_dim, embed_dim, bias=False))

    model = models.MultiheadAttentionContainer(nhead, in_proj,
                                               models.ScaledDotProduct(),
                                               torch.nn.Linear(embed_dim, embed_dim, bias=False))
    model.to(device)
    params, names = extract_weights(model)

    query = torch.rand((tgt_len, bsz, embed_dim), device=device)
    key = value = torch.rand((src_len, bsz, embed_dim), device=device)
    attn_mask_2D = torch.randint(0, 2, (tgt_len, src_len), device=device).to(torch.bool)
    bias_k = bias_v = torch.rand((1, 1, embed_dim), device=device)

    attn_mask = torch.stack([attn_mask_2D] * (bsz * nhead))
    bias_k = bias_k.repeat(1, bsz, 1).reshape(1, bsz * nhead, -1)
    bias_v = bias_v.repeat(1, bsz, 1).reshape(1, bsz * nhead, -1)

    def forward(*new_params: Tensor) -> Tensor:
        load_weights(model, names, new_params)
        mha_output, attn_weights = model(query, key, value, attn_mask=attn_mask, bias_k=bias_k, bias_v=bias_v)

        # Don't test any specific loss, just backprop ones for both outputs
        loss = mha_output.sum() + attn_weights.sum()

        return loss

    return forward, params
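
Each getter above returns a pair `(forward, params)` that the main benchmark script feeds to `torch.autograd.functional`. As a minimal sketch of that contract (not part of this commit; the wav2letter getter and CPU device are arbitrary choices):

```python
import torch
from torch.autograd import functional

import audio_text_models

# A getter returns a pure function of the parameters plus the parameter tensors.
forward, params = audio_text_models.get_wav2letter(torch.device("cpu"))

# For the "vjp" task, v must match the output shape (a scalar loss here).
v = torch.rand_like(forward(*params))
loss, grads = functional.vjp(forward, params, v=v, strict=True)
print(loss, grads[0].shape)
```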
benchmarks/functional_autograd_benchmark/compare.py (new file, +45 lines)

import argparse
from collections import defaultdict

from utils import to_markdown_table, from_markdown_table

def main():
    parser = argparse.ArgumentParser("Main script to compare results from the benchmarks")
    parser.add_argument("--before", type=str, default="before.txt", help="Text file containing the times to use as base")
    parser.add_argument("--after", type=str, default="after.txt", help="Text file containing the times to use as new version")
    parser.add_argument("--output", type=str, default="", help="Text file where to write the output")
    args = parser.parse_args()

    with open(args.before, "r") as f:
        content = f.read()
    res_before = from_markdown_table(content)

    with open(args.after, "r") as f:
        content = f.read()
    res_after = from_markdown_table(content)

    diff = defaultdict(defaultdict)
    for model in res_before:
        for task in res_before[model]:
            mean_before, var_before = res_before[model][task]
            if task not in res_after[model]:
                diff[model][task] = (None, mean_before, var_before, None, None)
            else:
                mean_after, var_after = res_after[model][task]
                diff[model][task] = (mean_before / mean_after, mean_before, var_before, mean_after, var_after)
    for model in res_after:
        for task in res_after[model]:
            if task not in res_before[model]:
                mean_after, var_after = res_after[model][task]
                diff[model][task] = (None, None, None, mean_after, var_after)

    header = ("model", "task", "speedup", "mean (before)", "var (before)", "mean (after)", "var (after)")
    out = to_markdown_table(diff, header=header)

    print(out)
    if args.output:
        with open(args.output, "w") as f:
            f.write(out)

if __name__ == "__main__":
    main()
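
With the defaults above, a typical comparison run is simply the following (`compare.md` is an arbitrary output name; all flags come from the argparse definitions above):

```bash
python compare.py --before before.txt --after after.txt --output compare.md
```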
benchmarks/functional_autograd_benchmark/functional_autograd_benchmark.py (new file, +153 lines)

import torch
from torch.autograd import functional

import time
from argparse import ArgumentParser
from collections import defaultdict
from typing import NamedTuple, Callable, List, Any

import ppl_models
import vision_models
import audio_text_models

from utils import to_markdown_table, TimingResultType, InputsType, GetterType, VType

# Listing of the different tasks
FAST_TASKS_NO_DOUBLE_BACK = [
    "vjp",
]

FAST_TASKS = FAST_TASKS_NO_DOUBLE_BACK + [
    "vhp",
    "jvp",
]

ALL_TASKS = FAST_TASKS + [
    "hvp",
    "jacobian",
    "hessian"
]

DOUBLE_BACKWARD_TASKS = ["jvp", "hvp", "vhp", "hessian"]

# Model definition which contains:
# - name: a string with the model name.
# - getter: a function to get the model. It takes as input the device on which the model
#     will run. It should return the forward function and the parameters (Tensors) used as
#     input for the forward function. Note that the forward must *not* have any side effect.
# - tasks: the list of recommended tasks that can run in a reasonable amount of time with this model.
# - unsupported: the list of tasks that this model cannot run.
class ModelDef(NamedTuple):
    name: str
    getter: GetterType
    tasks: List[str]
    unsupported: List[str]

MODELS = [
    ModelDef("resnet18", vision_models.get_resnet18, FAST_TASKS, []),
    ModelDef("fcn_resnet", vision_models.get_fcn_resnet, FAST_TASKS, []),
    ModelDef("detr", vision_models.get_detr, FAST_TASKS, []),
    ModelDef("ppl_simple_reg", ppl_models.get_simple_regression, ALL_TASKS, []),
    ModelDef("ppl_robust_reg", ppl_models.get_robust_regression, ALL_TASKS, []),
    ModelDef("wav2letter", audio_text_models.get_wav2letter, FAST_TASKS, []),
    ModelDef("deepspeech", audio_text_models.get_deepspeech, FAST_TASKS_NO_DOUBLE_BACK, DOUBLE_BACKWARD_TASKS),
    ModelDef("transformer", audio_text_models.get_transformer, FAST_TASKS, []),
    ModelDef("multiheadattn", audio_text_models.get_multiheadattn, FAST_TASKS, []),
]

def get_v_for(model: Callable, inp: InputsType, task: str) -> VType:
    v: VType

    if task in ["vjp"]:
        out = model(*inp)
        v = torch.rand_like(out)
    elif task in ["jvp", "hvp", "vhp"]:
        if isinstance(inp, tuple):
            v = tuple(torch.rand_like(i) for i in inp)
        else:
            v = torch.rand_like(inp)
    else:
        v = None

    return v

def run_once(model: Callable, inp: InputsType, task: str, v: VType) -> None:
    func = getattr(functional, task)

    if v is not None:
        res = func(model, inp, v=v, strict=True)
    else:
        res = func(model, inp, strict=True)

def run_model(model_getter: GetterType, args: Any, task: str) -> List[float]:
    if args.gpu == -1:
        device = torch.device("cpu")

        def noop():
            pass
        do_sync = noop
    else:
        device = torch.device("cuda:{}".format(args.gpu))
        do_sync = torch.cuda.synchronize

    model, inp = model_getter(device)

    v = get_v_for(model, inp, task)
    # Warmup
    run_once(model, inp, task, v)

    elapsed = []
    for it in range(args.num_iters):
        do_sync()
        start = time.time()
        run_once(model, inp, task, v)
        do_sync()
        elapsed.append(time.time() - start)

    return elapsed

def main():
    parser = ArgumentParser("Main script to benchmark functional API of the autograd.")
    parser.add_argument("--output", type=str, default="", help="Text file where to write the output")
    parser.add_argument("--num-iters", type=int, default=10)
    parser.add_argument("--gpu", type=int, default=-2, help="GPU to use, -1 for CPU and -2 for auto-detect")
    parser.add_argument("--run-slow-tasks", action="store_true", help="Run even the slow tasks")
    parser.add_argument("--model-filter", type=str, default="", help="Only run the models in this filter")
    parser.add_argument("--task-filter", type=str, default="", help="Only run the tasks in this filter")
    parser.add_argument("--num-threads", type=int, default=10,
                        help="Number of concurrent threads to use when running on cpu")
    parser.add_argument("--seed", type=int, default=0, help="The random seed to use.")
    args = parser.parse_args()

    results: TimingResultType = defaultdict(defaultdict)
    torch.set_num_threads(args.num_threads)
    torch.set_num_interop_threads(args.num_threads)

    # This automatically seeds cuda if it is available
    torch.manual_seed(args.seed)

    if args.gpu == -2:
        args.gpu = 0 if torch.cuda.is_available() else -1

    for name, model_getter, recommended_tasks, unsupported_tasks in MODELS:
        if args.model_filter and name not in args.model_filter:
            continue
        tasks = ALL_TASKS if args.run_slow_tasks else recommended_tasks
        for task in tasks:
            if task in unsupported_tasks:
                continue
            if args.task_filter and task not in args.task_filter:
                continue
            runtimes = run_model(model_getter, args, task)

            runtimes = torch.tensor(runtimes)
            mean, var = runtimes.mean(), runtimes.var()
            results[name][task] = (mean.item(), var.item())
            print("Results for model {} on task {}: {}s (var: {})".format(name, task, mean, var))

    if args.output:
        with open(args.output, "w") as f:
            f.write(to_markdown_table(results))

if __name__ == "__main__":
    main()
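
For example, to time a single model/task pair on CPU (all flags are defined in `main()` above; the values are illustrative):

```bash
python functional_autograd_benchmark.py --gpu -1 --num-iters 10 \
    --model-filter resnet18 --task-filter vjp --output before.txt
```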
benchmarks/functional_autograd_benchmark/ppl_models.py (new file, +93 lines)

import torch
from torch import Tensor
import torch.distributions as dist

from utils import GetterReturnType

def get_simple_regression(device: torch.device) -> GetterReturnType:
    N = 10
    K = 10

    loc_beta = 0.
    scale_beta = 1.

    beta_prior = dist.Normal(loc_beta, scale_beta)

    X = torch.rand(N, K + 1, device=device)
    Y = torch.rand(N, 1, device=device)

    # X.shape: (N, K + 1), Y.shape: (N, 1), beta_value.shape: (K + 1, 1)
    beta_value = beta_prior.sample((K + 1, 1))
    beta_value.requires_grad_(True)

    def forward(beta_value: Tensor) -> Tensor:
        mu = X.mm(beta_value)

        # We need to compute the first and second gradient of this score with respect
        # to beta_value.
        score = dist.Bernoulli(logits=mu).log_prob(Y).sum() + beta_prior.log_prob(beta_value).sum()
        return score

    return forward, (beta_value.to(device),)


def get_robust_regression(device: torch.device) -> GetterReturnType:
    N = 10
    K = 10

    # X.shape: (N, K + 1), Y.shape: (N, 1)
    X = torch.rand(N, K + 1, device=device)
    Y = torch.rand(N, 1, device=device)

    # Predefined nu_alpha and nu_beta, nu_alpha.shape: (1, 1), nu_beta.shape: (1, 1)
    nu_alpha = torch.randn(1, 1, device=device)
    nu_beta = torch.rand(1, 1, device=device)
    nu = dist.Gamma(nu_alpha, nu_beta)

    # Predefined sigma_rate: sigma_rate.shape: (N, 1)
    sigma_rate = torch.rand(N, 1, device=device)
    sigma = dist.Exponential(sigma_rate)

    # Predefined beta_mean and beta_sigma: beta_mean.shape: (K + 1, 1), beta_sigma.shape: (K + 1, 1)
    beta_mean = torch.rand(K + 1, 1, device=device)
    beta_sigma = torch.rand(K + 1, 1, device=device)
    beta = dist.Normal(beta_mean, beta_sigma)

    nu_value = nu.sample()
    nu_value.requires_grad_(True)

    sigma_value = sigma.sample()
    sigma_unconstrained_value = sigma_value.log()
    sigma_unconstrained_value.requires_grad_(True)

    beta_value = beta.sample()
    beta_value.requires_grad_(True)

    def forward(nu_value: Tensor, sigma_unconstrained_value: Tensor, beta_value: Tensor) -> Tensor:
        sigma_constrained_value = sigma_unconstrained_value.exp()
        mu = X.mm(beta_value)

        # For this model, we need to compute the following three scores:
        # We need to compute the first and second gradient of this score with respect
        # to nu_value.
        nu_score = dist.StudentT(nu_value, mu, sigma_constrained_value).log_prob(Y).sum() \
            + nu.log_prob(nu_value)

        # We need to compute the first and second gradient of this score with respect
        # to sigma_unconstrained_value.
        sigma_score = dist.StudentT(nu_value, mu, sigma_constrained_value).log_prob(Y).sum() \
            + sigma.log_prob(sigma_constrained_value) \
            + sigma_unconstrained_value

        # We need to compute the first and second gradient of this score with respect
        # to beta_value.
        beta_score = dist.StudentT(nu_value, mu, sigma_constrained_value).log_prob(Y).sum() \
            + beta.log_prob(beta_value)

        return nu_score.sum() + sigma_score.sum() + beta_score.sum()

    return forward, (nu_value.to(device), sigma_unconstrained_value.to(device), beta_value.to(device))
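
These two models are the only ones registered for the second-order tasks (`hvp`, `jacobian`, `hessian`) in the main script. As a minimal sketch of why that works (not part of this commit), the full Hessian of the simple regression score can be taken directly:

```python
import torch
from torch.autograd import functional

import ppl_models

forward, params = ppl_models.get_simple_regression(torch.device("cpu"))

# params is a 1-tuple holding beta_value of shape (K + 1, 1) = (11, 1), so the
# Hessian of the scalar score with respect to it has shape (11, 1, 11, 1).
hess = functional.hessian(forward, params, strict=True)
print(hess[0][0].shape)  # torch.Size([11, 1, 11, 1])
```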

benchmarks/functional_autograd_benchmark/torchaudio_models.py (+556 lines)
Large diffs are not rendered by default.

benchmarks/functional_autograd_benchmark/torchvision_models.py (+803 lines)
Large diffs are not rendered by default.
benchmarks/functional_autograd_benchmark/utils.py (new file, +103 lines)

import torch

from collections import defaultdict

from torch import nn, Tensor
from typing import List, Tuple, Dict, Union, Callable

# Type helpers
InputsType = Union[Tensor, Tuple[Tensor, ...]]
# A Getter takes in a device and returns a callable and the inputs to that callable
GetterReturnType = Tuple[Callable[..., Tensor], InputsType]
GetterType = Callable[[torch.device], GetterReturnType]
# V here refers to the v in either vjp, jvp, vhp or hvp
VType = Union[None, Tensor, Tuple[Tensor, ...]]
# Type used to store timing results. The first key is the model name, the second key
# is the task name, the result is a Tuple of: speedup, mean_before, var_before, mean_after, var_after.
TimingResultType = Dict[str, Dict[str, Tuple[float, ...]]]

# Utilities to make nn.Module "functional"
# In particular the goal is to be able to provide a function that takes as input
# the parameters and evaluates the nn.Module using fixed inputs.
def _del_nested_attr(obj: nn.Module, names: List[str]) -> None:
    """
    Deletes the attribute specified by the given list of names.
    For example, to delete the attribute obj.conv.weight,
    use _del_nested_attr(obj, ['conv', 'weight'])
    """
    if len(names) == 1:
        delattr(obj, names[0])
    else:
        _del_nested_attr(getattr(obj, names[0]), names[1:])

def _set_nested_attr(obj: nn.Module, names: List[str], value: Tensor) -> None:
    """
    Set the attribute specified by the given list of names to value.
    For example, to set the attribute obj.conv.weight,
    use _set_nested_attr(obj, ['conv', 'weight'], value)
    """
    if len(names) == 1:
        setattr(obj, names[0], value)
    else:
        _set_nested_attr(getattr(obj, names[0]), names[1:], value)

def extract_weights(mod: nn.Module) -> Tuple[Tuple[Tensor, ...], List[str]]:
    """
    This function removes all the Parameters from the model and
    returns them as a tuple as well as their original attribute names.
    The weights must be re-loaded with `load_weights` before the model
    can be used again.
    Note that this function modifies the model in place and after this
    call, mod.parameters() will be empty.
    """
    orig_params = tuple(mod.parameters())
    # Remove all the parameters in the model
    names = []
    for name, p in list(mod.named_parameters()):
        _del_nested_attr(mod, name.split("."))
        names.append(name)

    # Make params regular Tensors instead of nn.Parameter
    params = tuple(p.detach().requires_grad_() for p in orig_params)
    return params, names

def load_weights(mod: nn.Module, names: List[str], params: Tuple[Tensor, ...]) -> None:
    """
    Reload a set of weights so that `mod` can be used again to perform a forward pass.
    Note that the `params` are regular Tensors (that can have history) and so are left
    as Tensors. This means that mod.parameters() will still be empty after this call.
    """
    for name, p in zip(names, params):
        _set_nested_attr(mod, name.split("."), p)

# Utilities to read/write markdown table-like content.
def to_markdown_table(res: TimingResultType, header: Tuple[str, ...] = None) -> str:
    if header is None:
        header = ("model", "task", "mean", "var")
    out = ""

    def write_line(*args):
        nonlocal out
        out += "| {} |\n".format(" | ".join(str(a) for a in args))

    # Make it a markdown table
    write_line(*header)
    write_line(*["--"] * len(header))
    for model, tasks in res.items():
        for task, line in tasks.items():
            write_line(*(model, task) + line)

    return out

def from_markdown_table(data: str) -> TimingResultType:
    out = data.strip().split("\n")
    out = out[2:]  # Ignore the header lines

    res: TimingResultType
    res = defaultdict(defaultdict)

    for line in out:
        model, task, mean, var = [f.strip() for f in line.strip().split("|") if f]
        res[model][task] = (float(mean), float(var))

    return res
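
`extract_weights`/`load_weights` are what let a stateful `nn.Module` masquerade as a pure function of its parameters. A minimal round-trip on a toy module (not part of this commit) looks like:

```python
import torch
from torch import nn

from utils import extract_weights, load_weights

mod = nn.Linear(3, 2)
params, names = extract_weights(mod)
assert len(list(mod.parameters())) == 0  # parameters were stripped

x = torch.rand(4, 3)

def forward(*new_params):
    # Re-attach plain tensors under the original attribute names
    load_weights(mod, names, new_params)
    return mod(x).sum()

# forward is now a function of plain tensors, so autograd.functional applies:
_, grads = torch.autograd.functional.vjp(forward, params, strict=True)
print([g.shape for g in grads])  # [torch.Size([2, 3]), torch.Size([2])]
```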
benchmarks/functional_autograd_benchmark/vision_models.py (new file, +97 lines)

import torch
from torch import Tensor
import torchvision_models as models

from utils import extract_weights, load_weights, GetterReturnType

from typing import cast

def get_resnet18(device: torch.device) -> GetterReturnType:
    N = 32
    model = models.resnet18(pretrained=False)
    criterion = torch.nn.CrossEntropyLoss()
    model.to(device)
    params, names = extract_weights(model)

    inputs = torch.rand([N, 3, 224, 224], device=device)
    labels = torch.rand(N, device=device).mul(10).long()

    def forward(*new_params: Tensor) -> Tensor:
        load_weights(model, names, new_params)
        out = model(inputs)

        loss = criterion(out, labels)
        return loss

    return forward, params

def get_fcn_resnet(device: torch.device) -> GetterReturnType:
    N = 8
    criterion = torch.nn.MSELoss()
    model = models.fcn_resnet50(pretrained=False, pretrained_backbone=False)
    model.to(device)
    params, names = extract_weights(model)

    inputs = torch.rand([N, 3, 480, 480], device=device)
    # Given model has 21 classes
    labels = torch.rand([N, 21, 480, 480], device=device)

    def forward(*new_params: Tensor) -> Tensor:
        load_weights(model, names, new_params)
        out = model(inputs)['out']

        loss = criterion(out, labels)
        return loss

    return forward, params

def get_detr(device: torch.device) -> GetterReturnType:
    # All values below are from CLI defaults in https://github.com/facebookresearch/detr
    N = 2
    num_classes = 91
    hidden_dim = 256
    nheads = 8
    num_encoder_layers = 6
    num_decoder_layers = 6

    model = models.DETR(num_classes=num_classes, hidden_dim=hidden_dim, nheads=nheads,
                        num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers)
    losses = ['labels', 'boxes', 'cardinality']
    eos_coef = 0.1
    bbox_loss_coef = 5
    giou_loss_coef = 2
    weight_dict = {'loss_ce': 1, 'loss_bbox': bbox_loss_coef, 'loss_giou': giou_loss_coef}
    matcher = models.HungarianMatcher(1, 5, 2)
    criterion = models.SetCriterion(num_classes=num_classes, matcher=matcher, weight_dict=weight_dict,
                                    eos_coef=eos_coef, losses=losses)

    model = model.to(device)
    criterion = criterion.to(device)
    params, names = extract_weights(model)

    inputs = torch.rand(N, 3, 800, 1200, device=device)
    labels = []
    for idx in range(N):
        targets = {}
        n_targets: int = int(torch.randint(5, 10, size=tuple()).item())
        label = torch.randint(5, 10, size=(n_targets,))
        targets["labels"] = label
        boxes = torch.randint(100, 800, size=(n_targets, 4))
        for t in range(n_targets):
            if boxes[t, 0] > boxes[t, 2]:
                boxes[t, 0], boxes[t, 2] = boxes[t, 2], boxes[t, 0]
            if boxes[t, 1] > boxes[t, 3]:
                boxes[t, 1], boxes[t, 3] = boxes[t, 3], boxes[t, 1]
        targets["boxes"] = boxes.float()
        labels.append(targets)

    def forward(*new_params: Tensor) -> Tensor:
        load_weights(model, names, new_params)
        out = model(inputs)

        loss = criterion(out, labels)
        weight_dict = criterion.weight_dict
        final_loss = cast(Tensor, sum(loss[k] * weight_dict[k] for k in loss.keys() if k in weight_dict))
        return final_loss

    return forward, params

test/run_test.py (+1 line)

@@ -87,6 +87,7 @@
     'test_determination',
     'test_futures',
     'test_fx',
+    'test_functional_autograd_benchmark'
 ]

 WINDOWS_BLOCKLIST = [
test/test_functional_autograd_benchmark.py (new file, +57 lines)

from torch.testing._internal.common_utils import TestCase, run_tests, slowTest, IS_WINDOWS

import subprocess
import tempfile
import os
import unittest

# This is a very simple smoke test for the functional autograd benchmarking script.
class TestFunctionalAutogradBenchmark(TestCase):
    def _test_runner(self, model, disable_gpu=False):
        # Note about windows:
        # The temporary file is exclusively opened by this process and the child process
        # is not allowed to open it again. As this is a simple smoke test, we choose for now
        # not to run this on windows and keep the code here simple.
        with tempfile.NamedTemporaryFile() as out_file:
            cmd = ['python', '../benchmarks/functional_autograd_benchmark/functional_autograd_benchmark.py']
            # Only run the warmup
            cmd += ['--num-iters', '0']
            # Only run the vjp task (fastest one)
            cmd += ['--task-filter', 'vjp']
            # Only run the specified model
            cmd += ['--model-filter', model]
            # Output file
            cmd += ['--output', out_file.name]
            if disable_gpu:
                cmd += ['--gpu', '-1']

            res = subprocess.run(cmd)

            self.assertTrue(res.returncode == 0)
            # Check that something was written to the file
            out_file.seek(0, os.SEEK_END)
            self.assertTrue(out_file.tell() > 0)

    @unittest.skipIf(IS_WINDOWS, "NamedTemporaryFile on windows does not have all the features we need.")
    def test_fast_tasks(self):
        fast_tasks = ['resnet18', 'ppl_simple_reg', 'ppl_robust_reg', 'wav2letter',
                      'transformer', 'multiheadattn']

        for task in fast_tasks:
            self._test_runner(task)

    @slowTest
    @unittest.skipIf(IS_WINDOWS, "NamedTemporaryFile on windows does not have all the features we need.")
    def test_slow_tasks(self):
        slow_tasks = ['fcn_resnet', 'detr']
        # deepspeech is voluntarily excluded as it takes too long to run without
        # proper tuning of the number of threads it should use.

        for task in slow_tasks:
            # Disable GPU for slow tests as the CI GPUs don't have enough memory
            self._test_runner(task, disable_gpu=True)


if __name__ == '__main__':
    run_tests()
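
Assuming the usual PyTorch test workflow, the smoke test can then be invoked like any other test module (one possible invocation; flags may differ across versions):

```bash
cd test
python test_functional_autograd_benchmark.py TestFunctionalAutogradBenchmark.test_fast_tasks
```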
