added max-of-n #2
Conversation
Initial review comments
assert W_pos.shape == (
    n_ctx,
    d_model,
), f"W_pos.shape = {W_pos.shape} != {(n_ctx, d_model)} = (n_ctx, d_model)"
assert W_Q.shape == (
    1,
    1,
    d_model,
    d_model,
), f"W_Q.shape = {W_Q.shape} != {(1, 1, d_model, d_model)} = (1, 1, d_model, d_model)"
assert W_K.shape == (
    1,
    1,
    d_model,
    d_model,
), f"W_K.shape = {W_K.shape} != {(1, 1, d_model, d_model)} = (1, 1, d_model, d_model)"
assert W_E.shape == (
    d_vocab,
    d_model,
), f"W_E.shape = {W_E.shape} != {(d_vocab, d_model)} = (d_vocab, d_model)"
I don't suppose there's any way to tell the code formatter to compress the shape checking?
My ideal here would be to annotate the assignment of W_pos, etc., with types that include tensor dimensions, and import some package for runtime type checking. Do you know if this is possible with, e.g., jaxtyping, torchtyping, typeguard, beartype, mypy, etc.? (Maybe cf. patrick-kidger/jaxtyping#153.)
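For the function-boundary version, something like this might work (a rough sketch, not tested against this PR: pos_qk_circuit is a made-up helper, the dimension names in the annotations are assumptions, and the @jaxtyped(typechecker=...) form needs a reasonably recent jaxtyping):

from beartype import beartype
from jaxtyping import Float, jaxtyped
from torch import Tensor

@jaxtyped(typechecker=beartype)
def pos_qk_circuit(
    W_pos: Float[Tensor, "n_ctx d_model"],
    W_Q: Float[Tensor, "1 1 d_model d_model"],
    W_K: Float[Tensor, "1 1 d_model d_model"],
) -> Float[Tensor, "n_ctx n_ctx"]:
    # Shapes are checked at call time; a mismatched tensor raises immediately,
    # replacing the explicit assert blocks above.
    return W_pos @ W_Q[0, 0] @ W_K[0, 0].T @ W_pos.T

This only checks at function boundaries, though, so bare assignments would still need asserts or a small checked helper.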
@dataclass
class MaxOfN(ExperimentConfig):
    # Max of n (iterable dataset)
    n_train_samples: Optional[int] = None  # if none, infinite dataset
Nit: The dataset isn't actually infinite, though, right? Is there a better descriptor?
simpler_cfg = HookedTransformerConfig(
    d_model=config.d_model,
    n_layers=config.n_layers,
    n_heads=config.n_heads,
    d_head=config.d_head,
    n_ctx=config.n_ctx,
    d_vocab=config.d_vocab,
    seed=config.seed,
    attn_only=True,
    normalization_type=None,
    # device=default_device(deterministic=config.deterministic),
)
model = HookedTransformer(simpler_cfg)
Can we add a config option for float32 vs float64?
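For instance, a minimal sketch (PrecisionConfig / float_dtype / apply_precision are made-up names here; if HookedTransformerConfig already accepts a dtype kwarg, passing it there would be cleaner than casting after construction):

import torch
from dataclasses import dataclass

# Hypothetical sketch: a precision knob on the experiment config,
# applied to the model after construction.
@dataclass
class PrecisionConfig:
    float_dtype: torch.dtype = torch.float32  # or torch.float64

def apply_precision(model: torch.nn.Module, cfg: PrecisionConfig) -> torch.nn.Module:
    # nn.Module.to(dtype) casts all floating-point parameters and buffers
    return model.to(cfg.float_dtype)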
) -> Float[Tensor, ""]:
    logits = logits[:, -1, :]
    true_maximum = torch.max(tokens, dim=1)[0]
    log_probs = logits.log_softmax(-1)
Do we want to do the more accurate (currently untested) version below?
Suggested change:
-log_probs = logits.log_softmax(-1)
+# log_softmax is only accurate to around 2e-7, cf https://github.com/pytorch/pytorch/issues/113708
+# we can get better precision by using log1p
+logits_max_idxs = logits.argmax(dim=-1, keepdim=True)
+logits_centered = logits - logits.gather(dim=-1, index=logits_max_idxs)
+logits_exp = logits_centered.exp()
+# logits_exp[max] will be 1, so we can zero it and use log1p(x) = log(1 + x)
+logits_exp.scatter_(dim=-1, index=logits_max_idxs, value=0.0)
+log_probs = logits_centered - logits_exp.sum(dim=-1, keepdim=True).log1p()
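If we go that route, a quick sanity check along these lines (untested sketch, comparing against the builtin in float64) would make me more comfortable:

import torch

# Untested sketch: compare the log1p-based log-softmax against torch's builtin.
def log_softmax_log1p(logits: torch.Tensor) -> torch.Tensor:
    max_idxs = logits.argmax(dim=-1, keepdim=True)
    centered = logits - logits.gather(dim=-1, index=max_idxs)
    exps = centered.exp()
    exps.scatter_(dim=-1, index=max_idxs, value=0.0)  # zero out the max term
    return centered - exps.sum(dim=-1, keepdim=True).log1p()

logits = torch.randn(8, 64, dtype=torch.float64)
assert torch.allclose(log_softmax_log1p(logits), logits.log_softmax(-1))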
    lr=self.config.optimizer_kwargs["lr"],
    betas=self.config.optimizer_kwargs["betas"],
More compactly generalizable alternative (pick whichever seems better to you):
Suggested change:
-lr=self.config.optimizer_kwargs["lr"],
-betas=self.config.optimizer_kwargs["betas"],
+**{k: self.config.optimizer_kwargs[k] for k in ("lr", "betas")}
super().__init__(config)
self.config = config
self.seq_len = config.n_ctx
self.dataset_seed = config.seed * 10 + 1
Why this seed? (Leave a comment about how (non)arbitrary this is)
if __name__ == "__main__":
    print("Training model:", MAX_OF_10_CONFIG)
    train_or_load_model(MAX_OF_10_CONFIG, force="train")
I don't think this should be in exp_max_of_n/train.py; maybe in exp_max_of_n/train_max_of_10.py? Or else we should take command-line arguments (argparse? get ChatGPT to quickly write up the argparse code?) for --max-of {2|10} [--force-train].
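Untested sketch of the argparse wiring (assumes a MAX_OF_2_CONFIG exists alongside MAX_OF_10_CONFIG in this module):

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Train a max-of-n model")
    parser.add_argument("--max-of", type=int, choices=(2, 10), default=10)
    parser.add_argument("--force-train", action="store_true")
    args = parser.parse_args()
    # MAX_OF_2_CONFIG is assumed here; only MAX_OF_10_CONFIG appears in this diff
    config = {2: MAX_OF_2_CONFIG, 10: MAX_OF_10_CONFIG}[args.max_of]
    print("Training model:", config)
    train_or_load_model(config, force="train" if args.force_train else None)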
def train_or_load_model(
    config: Config,
    force: Optional[Literal["train", "load"]] = None,
Does it make sense to use enums or global constants (TRAIN = "train") here rather than strings, to make it harder to typo?
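For what it's worth, a small sketch of the constants variant (an Enum would also work, but would change the public type of force):

from typing import Literal

# Sketch: module-level constants so a typo becomes a NameError (or a type
# checker error) instead of a silently-ignored string.
FORCE_TRAIN: Literal["train"] = "train"
FORCE_LOAD: Literal["load"] = "load"

# call site: train_or_load_model(MAX_OF_10_CONFIG, force=FORCE_TRAIN)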
test_metrics: Sequence[Mapping[str, float]]


def _load_model(
Do we also want to do __all__ = [...] to control import behavior?
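For example (names guessed from this diff; underscore-prefixed helpers like _load_model stay out of star-imports either way):

# Sketch: explicit export list for the module
__all__ = ["Config", "train_or_load_model"]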
elif unit == "epochs":
    trainer_args = {"max_epochs": n}
else:
    raise ValueError
Suggested change:
-raise ValueError
+raise ValueError(f"Invalid unit {unit}")
def train_or_load_model(
    config: Config,
    force: Optional[Literal["train", "load"]] = None,
    save_to: Optional[Literal["disk", "disk_and_wandb"]] = "disk_and_wandb",
Do we want to make this a bitmask instead, a la re.match's flags argument?
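A sketch of what that could look like with enum.Flag (names made up; probably only worth it if we expect more save targets than disk and wandb):

import enum

# Sketch: combinable save targets, analogous to re.match's flags argument.
class SaveTo(enum.Flag):
    NONE = 0
    DISK = enum.auto()
    WANDB = enum.auto()

save_to = SaveTo.DISK | SaveTo.WANDB
if save_to & SaveTo.DISK:
    print("would save to disk")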
return "cuda" if torch.cuda.is_available() and not deterministic else "cpu" | ||
|
||
|
||
T = TypeVar("T") |
?
from numpy.random import Generator
from torch import Tensor

PROJECT_ROOT = Path(__file__).parent.parent.parent
This seems fragile, especially since it goes outside the top-level package, and with respect to refactoring. Maybe we want to have a get_git_project_root? Or ensure it's a subdirectory of the package?
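Untested sketch of a get_git_project_root, falling back to the current heuristic when git isn't available:

import subprocess
from pathlib import Path

def get_git_project_root(start: Path = Path(__file__).parent) -> Path:
    # Ask git for the repository root instead of counting .parent hops.
    try:
        out = subprocess.run(
            ["git", "rev-parse", "--show-toplevel"],
            cwd=start, capture_output=True, text=True, check=True,
        )
        return Path(out.stdout.strip())
    except (subprocess.CalledProcessError, FileNotFoundError):
        # Not a git checkout (e.g. an installed package): keep the old behaviour.
        return Path(__file__).parent.parent.parent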
from typing import Any
from typing import Dict

# Implemented for https://github.com/lemon24/reader/issues/179
Did you write this code @euanong? Do we need to stick some license notice at the top?
I'm going to merge this now so we can build on it; code review comments can be addressed in another PR.
No description provided.