diff --git a/Cargo.lock b/Cargo.lock
index 18b5a2953..bbfab695f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -205,17 +205,7 @@ dependencies = [
 [[package]]
 name = "bril-rs"
 version = "0.1.0"
-source = "git+https://github.com/uwplse/bril?rev=e2be3f5#e2be3f5d7e160f02b7aed0ef2bcc3e13ae722d2b"
-dependencies = [
- "serde",
- "serde_json",
- "thiserror",
-]
-
-[[package]]
-name = "bril-rs"
-version = "0.1.0"
-source = "git+https://github.com/uwplse/bril?rev=e2be3f5d7e160f02b7aed0ef2bcc3e13ae722d2b#e2be3f5d7e160f02b7aed0ef2bcc3e13ae722d2b"
+source = "git+https://github.com/uwplse/bril?rev=fe255deec1533960b20fff832971e45810202a5d#fe255deec1533960b20fff832971e45810202a5d"
 dependencies = [
  "serde",
  "serde_json",
@@ -235,9 +225,9 @@ dependencies = [
 [[package]]
 name = "bril2json"
 version = "0.1.0"
-source = "git+https://github.com/uwplse/bril?rev=e2be3f5d7e160f02b7aed0ef2bcc3e13ae722d2b#e2be3f5d7e160f02b7aed0ef2bcc3e13ae722d2b"
+source = "git+https://github.com/uwplse/bril?rev=fe255deec1533960b20fff832971e45810202a5d#fe255deec1533960b20fff832971e45810202a5d"
 dependencies = [
- "bril-rs 0.1.0 (git+https://github.com/uwplse/bril?rev=e2be3f5d7e160f02b7aed0ef2bcc3e13ae722d2b)",
+ "bril-rs 0.1.0 (git+https://github.com/uwplse/bril?rev=fe255deec1533960b20fff832971e45810202a5d)",
  "clap",
  "lalrpop",
  "lalrpop-util",
@@ -247,10 +237,10 @@ dependencies = [
 [[package]]
 name = "brilift"
 version = "0.1.0"
-source = "git+https://github.com/uwplse/bril?rev=e2be3f5d7e160f02b7aed0ef2bcc3e13ae722d2b#e2be3f5d7e160f02b7aed0ef2bcc3e13ae722d2b"
+source = "git+https://github.com/uwplse/bril?rev=fe255deec1533960b20fff832971e45810202a5d#fe255deec1533960b20fff832971e45810202a5d"
 dependencies = [
  "argh",
- "bril-rs 0.1.0 (git+https://github.com/uwplse/bril?rev=e2be3f5d7e160f02b7aed0ef2bcc3e13ae722d2b)",
+ "bril-rs 0.1.0 (git+https://github.com/uwplse/bril?rev=fe255deec1533960b20fff832971e45810202a5d)",
  "cranelift-codegen",
  "cranelift-frontend",
  "cranelift-jit",
@@ -264,9 +254,9 @@ dependencies = [
 [[package]]
 name = "brilirs"
 version = "0.1.0"
-source = "git+https://github.com/uwplse/bril?rev=e2be3f5d7e160f02b7aed0ef2bcc3e13ae722d2b#e2be3f5d7e160f02b7aed0ef2bcc3e13ae722d2b"
+source = "git+https://github.com/uwplse/bril?rev=fe255deec1533960b20fff832971e45810202a5d#fe255deec1533960b20fff832971e45810202a5d"
 dependencies = [
- "bril-rs 0.1.0 (git+https://github.com/uwplse/bril?rev=e2be3f5d7e160f02b7aed0ef2bcc3e13ae722d2b)",
+ "bril-rs 0.1.0 (git+https://github.com/uwplse/bril?rev=fe255deec1533960b20fff832971e45810202a5d)",
  "bril2json",
  "clap",
  "fxhash",
@@ -278,7 +268,7 @@ dependencies = [
 [[package]]
 name = "brillvm"
 version = "0.1.0"
-source = "git+https://github.com/uwplse/bril?rev=e2be3f5d7e160f02b7aed0ef2bcc3e13ae722d2b#e2be3f5d7e160f02b7aed0ef2bcc3e13ae722d2b"
+source = "git+https://github.com/uwplse/bril?rev=fe255deec1533960b20fff832971e45810202a5d#fe255deec1533960b20fff832971e45810202a5d"
 dependencies = [
  "bril-rs 0.1.0 (git+https://github.com/uwplse/bril)",
  "clap",
@@ -549,7 +539,7 @@ dependencies = [
 name = "dag_in_context"
 version = "0.1.0"
 dependencies = [
- "bril-rs 0.1.0 (git+https://github.com/uwplse/bril?rev=e2be3f5)",
+ "bril-rs 0.1.0 (git+https://github.com/uwplse/bril?rev=fe255deec1533960b20fff832971e45810202a5d)",
  "dot-structures",
  "egglog",
  "egraph-serialize",
@@ -627,7 +617,7 @@ checksum = "675e35c02a51bb4d4618cb4885b3839ce6d1787c97b664474d9208d074742e20"
 name = "eggcc"
 version = "0.1.0"
 dependencies = [
- "bril-rs 0.1.0 (git+https://github.com/uwplse/bril?rev=e2be3f5d7e160f02b7aed0ef2bcc3e13ae722d2b)",
+ "bril-rs 0.1.0 (git+https://github.com/uwplse/bril?rev=fe255deec1533960b20fff832971e45810202a5d)",
  "bril2json",
  "brilift",
  "brilirs",
@@ -1546,9 +1536,9 @@ dependencies = [
 
 [[package]]
 name = "regex-lite"
-version = "0.1.5"
+version = "0.1.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "30b661b2f27137bdbc16f00eda72866a92bb28af1753ffbd56744fb6e2e9cd8e"
+checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a"
 
 [[package]]
 name = "regex-syntax"
@@ -1580,9 +1570,9 @@ dependencies = [
 [[package]]
 name = "rs2bril"
 version = "0.1.0"
-source = "git+https://github.com/uwplse/bril?rev=e2be3f5d7e160f02b7aed0ef2bcc3e13ae722d2b#e2be3f5d7e160f02b7aed0ef2bcc3e13ae722d2b"
+source = "git+https://github.com/uwplse/bril?rev=fe255deec1533960b20fff832971e45810202a5d#fe255deec1533960b20fff832971e45810202a5d"
 dependencies = [
- "bril-rs 0.1.0 (git+https://github.com/uwplse/bril?rev=e2be3f5d7e160f02b7aed0ef2bcc3e13ae722d2b)",
+ "bril-rs 0.1.0 (git+https://github.com/uwplse/bril?rev=fe255deec1533960b20fff832971e45810202a5d)",
  "clap",
  "proc-macro2",
  "syn 2.0.66",
diff --git a/Cargo.toml b/Cargo.toml
index 15b0830da..289866827 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -22,14 +22,14 @@ smallvec = "1.11.1"
 
 syn = { version = "2.0", features = ["full", "extra-traits"] }
 # currently using the uwplse/bril fork of bril, on eggcc-main
-bril2json = { git = "https://github.com/uwplse/bril", rev = "e2be3f5d7e160f02b7aed0ef2bcc3e13ae722d2b" }
-brilirs = { git = "https://github.com/uwplse/bril", rev = "e2be3f5d7e160f02b7aed0ef2bcc3e13ae722d2b" }
-bril-rs = { git = "https://github.com/uwplse/bril", rev = "e2be3f5d7e160f02b7aed0ef2bcc3e13ae722d2b" }
-brilift = { git = "https://github.com/uwplse/bril", rev = "e2be3f5d7e160f02b7aed0ef2bcc3e13ae722d2b" }
-rs2bril = { git = "https://github.com/uwplse/bril", rev = "e2be3f5d7e160f02b7aed0ef2bcc3e13ae722d2b" ,features = [
+bril2json = { git = "https://github.com/uwplse/bril", rev = "fe255deec1533960b20fff832971e45810202a5d" }
+brilirs = { git = "https://github.com/uwplse/bril", rev = "fe255deec1533960b20fff832971e45810202a5d" }
+bril-rs = { git = "https://github.com/uwplse/bril", rev = "fe255deec1533960b20fff832971e45810202a5d" }
+brilift = { git = "https://github.com/uwplse/bril", rev = "fe255deec1533960b20fff832971e45810202a5d" }
+rs2bril = { git = "https://github.com/uwplse/bril", rev = "fe255deec1533960b20fff832971e45810202a5d" ,features = [
   "import",
 ] }
-brillvm = { git = "https://github.com/uwplse/bril", rev = "e2be3f5d7e160f02b7aed0ef2bcc3e13ae722d2b" }
+brillvm = { git = "https://github.com/uwplse/bril", rev = "fe255deec1533960b20fff832971e45810202a5d" }
 
 
 ordered-float = { version = "3.7" }
diff --git a/benchmarks/failing/polybench/_lib.bril b/benchmarks/failing/polybench/_lib.bril
index ac25081a9..5199221a3 100644
--- a/benchmarks/failing/polybench/_lib.bril
+++ b/benchmarks/failing/polybench/_lib.bril
@@ -142,7 +142,7 @@
     store ptr new_val;
 }
 
-@matrix_print(mtx: ptr<float>, Nrow: int, Ncol: int) {
+@matrix_sum(mtx: ptr<float>, Nrow: int, Ncol: int) {
     i: int = const 0;
     one: int = const 1;
     total: int = mul Nrow Ncol;
@@ -190,7 +190,7 @@
 
 # EXPECTS:
 #   @vector_get defined
-@vector_print(vec: ptr<float>, N: int) {
+@vector_sum(vec: ptr<float>, N: int) {
     i: int = const 0;
     one: int = const 1;
 .while:
diff --git a/benchmarks/passing/bril/core/ackermann.bril b/benchmarks/passing/bril/core/ackermann.bril
index f53a12601..f6a0e2da2 100644
--- a/benchmarks/passing/bril/core/ackermann.bril
+++ b/benchmarks/passing/bril/core/ackermann.bril
@@ -1,7 +1,12 @@
-# ARGS: 480
+# ARGS: 2 480
 # Compute the Ackermann function recursively.
 # WARNING: Will quickly exceed stack size
 
+@main(m: int, n: int) {
+  t: int = call @ack m n;
+  print t;
+}
+
 @ack(m: int, n: int): int {
   zero: int = const 0;
   one: int = const 1;
@@ -25,25 +30,3 @@
   ret t2;
 }
 
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: int = call @orig_main loop_counter;
-  final_output: int = add final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(n: int): int {
-  m: int = const 2;
-  tmp: int = call @ack m n;
-  ret tmp;
-}
-
diff --git a/benchmarks/passing/bril/core/armstrong.bril b/benchmarks/passing/bril/core/armstrong.bril
index 2843748af..bbda09722 100644
--- a/benchmarks/passing/bril/core/armstrong.bril
+++ b/benchmarks/passing/bril/core/armstrong.bril
@@ -1,24 +1,5 @@
-# ARGS: 2200000
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: bool = call @orig_main loop_counter;
-  br output .output_incr .skip;
-.output_incr:
-  final_output: int = add final_output loop_incr;
-.skip:
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(input : int): bool {
+# ARGS: 220001000
+@main(input : int) {
   zero : int = const 0;
   ten : int = const 10;
   sum : int = const 0;
@@ -35,7 +16,7 @@
   jmp .loop;
 .done:
   res : bool = eq input sum;
-  ret res;
+  print res;
 }
 
 @getDigits(n : int) : int {
diff --git a/benchmarks/passing/bril/core/binary-fmt.bril b/benchmarks/passing/bril/core/binary-fmt.bril
index 46bbef730..26504ca6e 100644
--- a/benchmarks/passing/bril/core/binary-fmt.bril
+++ b/benchmarks/passing/bril/core/binary-fmt.bril
@@ -1,24 +1,8 @@
-# ARGS: 3500000
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 1;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: int = call @orig_main loop_counter;
-  final_output: int = add final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(n : int): int {
+# ARGS: 3501010
+@main(n : int) {
 	zero: int = const 0;
 	output: int = call @printBinary n zero;
-	ret output;
+	print output;
 }
 
 @printBinary(n: int, sum: int): int {
diff --git a/benchmarks/passing/bril/core/birthday.bril b/benchmarks/passing/bril/core/birthday.bril
index 037883cab..ca1a42419 100644
--- a/benchmarks/passing/bril/core/birthday.bril
+++ b/benchmarks/passing/bril/core/birthday.bril
@@ -1,27 +1,11 @@
 # ARGS: 9500
-@main(loop_bound: float) {
-  loop_incr: float = const 1;
-  loop_counter: float = const 10;
-  final_output: float = const 0.0;
-.loop_cond:
-  loop_cond: bool = flt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: float = call @orig_main loop_counter;
-  final_output: float = fadd final_output output;
-  loop_counter: float = fadd loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(n: float): float {
+@main(n: float) {
   v0: float = id n;
   c: float = call @probability v0;
   c: float = id c;
   v1: float = id c;
   v2: int = const 0;
-  ret v1;
+  print v1;
 }
 @probability(n: float): float {
   v0: float = const 1;
diff --git a/benchmarks/passing/bril/core/bitshift.bril b/benchmarks/passing/bril/core/bitshift.bril
index 07f92eca1..c12c7af2e 100644
--- a/benchmarks/passing/bril/core/bitshift.bril
+++ b/benchmarks/passing/bril/core/bitshift.bril
@@ -1,4 +1,4 @@
-# ARGS: 61
+# ARGS: 4371 5 343234 2
 
 @pow(x: int, n: int): int {
   v1: int = id n;
@@ -68,47 +68,8 @@
   v4: int = div v2 v3;
   ret v4;
 }
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  loop2_counter: int = const 10;
-.loop2_cond:
-  loop2_cond: bool = lt loop2_counter loop_bound;
-  br loop2_cond .loop2_body .loop2_done;
-.loop2_body:
-  loop3_counter: int = const 10;
-.loop3_cond:
-  loop3_cond: bool = lt loop3_counter loop_bound;
-  br loop3_cond .loop3_body .loop3_done;
-.loop3_body:
-  loop4_counter: int = const 10;
-.loop4_cond:
-  loop4_cond: bool = lt loop4_counter loop_bound;
-  br loop4_cond .loop4_body .loop4_done;
-.loop4_body:
-  output: int = call @orig_main loop_counter loop2_counter loop3_counter loop4_counter;
-  final_output: int = add final_output output;
-  loop4_counter: int = add loop4_counter loop_incr;
-  jmp .loop4_cond;
-.loop4_done:  
-  loop3_counter: int = add loop3_counter loop_incr;
-  jmp .loop3_cond;
-.loop3_done:  
-  loop2_counter: int = add loop2_counter loop_incr;
-  jmp .loop2_cond;
-.loop2_done:
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
 
-@orig_main (a : int, b: int, c: int, d : int): int {
+@main (a : int, b: int, c: int, d : int) {
   v2: int = id a;
   v3: int = id b;
   ans1: int = call @LEFTSHIFT v2 v3;
@@ -116,5 +77,5 @@
   v5: int = id d;
   ans2: int = call @RIGHTSHIFT v4 v5;
   output: int = add ans1 ans2;
-  ret output;
+  print output;
 }
diff --git a/benchmarks/passing/bril/core/bitwise-ops.bril b/benchmarks/passing/bril/core/bitwise-ops.bril
index bdc98d241..dbf9087ec 100644
--- a/benchmarks/passing/bril/core/bitwise-ops.bril
+++ b/benchmarks/passing/bril/core/bitwise-ops.bril
@@ -1,4 +1,4 @@
-# ARGS: 84
+# ARGS: 823324 2313 9000
 
 # Bitwise Operator Library
 # Supports AND, OR, XOR. 0 for and, 1 for OR, 2+ for XOR
@@ -67,39 +67,7 @@
  ret ans;
 }
 
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  loop2_counter: int = const 10;
-.loop2_cond:
-  loop2_cond: bool = lt loop2_counter loop_bound;
-  br loop2_cond .loop2_body .loop2_done;
-.loop2_body:
-  loop3_counter: int = const 10;
-.loop3_cond:
-  loop3_cond: bool = lt loop3_counter loop_bound;
-  br loop3_cond .loop3_body .loop3_done;
-.loop3_body:
-  output: int = call @orig_main loop_counter loop2_counter loop3_counter;
-  final_output: int = add final_output output;
-  loop3_counter: int = add loop3_counter loop_incr;
-  jmp .loop3_cond;
-.loop3_done:  
-  loop2_counter: int = add loop2_counter loop_incr;
-  jmp .loop2_cond;
-.loop2_done:
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main (a : int, b: int, c: int): int {
+@main (a : int, b: int, c: int) {
   one: int = const 1;
   zero: int = const 0; 
 
@@ -124,5 +92,5 @@
 .xor_op:
   ans: int = call @XOR a b;
 .end:  
-  ret ans;
+  print ans;
 }
diff --git a/benchmarks/passing/bril/core/catalan.bril b/benchmarks/passing/bril/core/catalan.bril
index 9361a11b6..ccd37d59b 100644
--- a/benchmarks/passing/bril/core/catalan.bril
+++ b/benchmarks/passing/bril/core/catalan.bril
@@ -1,27 +1,12 @@
-# ARGS: 18
+# ARGS: 14
 
 # Compute the 10th element in the catalan sequence, which is given by:
 #   c0 = 0; c(n+1) = sum(ci * c(n-i) for i = 0..n)
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: int = call @orig_main loop_counter;
-  final_output: int = add final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(input : int): int {
+@main(input : int) {
   catn: int = call @catalan input;
-  ret catn;
+  print catn;
 }
+
 # Compute the nth term in the catalan sequence
 @catalan(n: int):int{
   one: int = const 1;
diff --git a/benchmarks/passing/bril/core/collatz.bril b/benchmarks/passing/bril/core/collatz.bril
index 5c1a102e3..36f907667 100644
--- a/benchmarks/passing/bril/core/collatz.bril
+++ b/benchmarks/passing/bril/core/collatz.bril
@@ -3,23 +3,7 @@
 # Compute the Collatz sequence from *n*. This may not terminate for all *n*, but
 # it is at least known to terminate for all *n* up to a large value.
 # see https://en.wikipedia.org/wiki/Collatz_conjecture
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: int = call @orig_main loop_counter;
-  final_output: int = add final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(x : int): int {
+@main(x : int) {
   one: int = const 1;
   two: int = const 2;
   three: int = const 3;
@@ -45,5 +29,5 @@
   sum: int = add sum x;
   jmp .cond;
 .end:
-  ret sum;
+  print sum;
 }
diff --git a/benchmarks/passing/bril/core/digital-root.bril b/benchmarks/passing/bril/core/digital-root.bril
index 85f50a83b..be3b6aece 100644
--- a/benchmarks/passing/bril/core/digital-root.bril
+++ b/benchmarks/passing/bril/core/digital-root.bril
@@ -4,24 +4,7 @@
 # adding each digit together until the result is a single number.
 # This is equivalent to the input mod 9 except if that value would be zero
 # in which case the digital root is nine.
-
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: int = call @orig_main loop_counter;
-  final_output: int = add final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(input : int): int {
+@main(input : int) {
     zero: int = const 0;
     ten: int = const 10;
     result: int = const 0;
@@ -47,7 +30,7 @@
 
  .done:
     sum: int = add sum result;
-    ret sum;
+    print sum;
 }
 
 @is_single_digit(input: int): bool {
diff --git a/benchmarks/passing/bril/core/dot-product.bril b/benchmarks/passing/bril/core/dot-product.bril
index 241d8066a..a96ff06f4 100644
--- a/benchmarks/passing/bril/core/dot-product.bril
+++ b/benchmarks/passing/bril/core/dot-product.bril
@@ -1,4 +1,4 @@
-# ARGS: 4100000
+# ARGS: 100 4100000
 
 @dot_product(vectorA: ptr<int>, vectorB: ptr<int>, size: int): int {
   one: int = const 1;
@@ -17,27 +17,9 @@
 .done:
   ret answer;
 }
-
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: int = call @orig_main loop_counter;
-  final_output: int = add final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(x: int): int {
+@main(c: int, x: int) {
   a: int = const 25;
   b: int = const 50;
-  c: int = const 100;
   d: int = const 150;
   e: int = const 250;
   f: int = const 2;
@@ -81,5 +63,5 @@
   free vectorA;
   free vectorB;
 
-  ret val;
+  print val;
 }
diff --git a/benchmarks/passing/bril/core/euclid.bril b/benchmarks/passing/bril/core/euclid.bril
index 789803575..392c59937 100644
--- a/benchmarks/passing/bril/core/euclid.bril
+++ b/benchmarks/passing/bril/core/euclid.bril
@@ -1,30 +1,6 @@
-# ARGS: 2550
+# ARGS: 38432 25
 
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  inner_counter: int = const 10;
-.inner_cond:
-  inner_cond: bool = lt inner_counter loop_bound;
-  br inner_cond .inner_body .inner_done;
-.inner_body:
-  output: int = call @orig_main loop_counter inner_counter;
-  final_output: int = add final_output output;
-  inner_counter: int = add inner_counter loop_incr;
-  jmp .inner_cond;
-.inner_done:
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(v0 : int, v1 : int ): int {
+@main(v0 : int, v1 : int ) {
   x: int = id v0;
   y: int = id v1;
   v2: int = id x;
@@ -33,7 +9,7 @@
   f: int = id f;
   v4: int = id f;
   v5: int = const 0;
-  ret v4;
+  print v4;
 }
 @mod(r: int, s: int): int {
   v0: int = id r;
diff --git a/benchmarks/passing/bril/core/fact.bril b/benchmarks/passing/bril/core/fact.bril
index 76aaf4963..db5bc102a 100644
--- a/benchmarks/passing/bril/core/fact.bril
+++ b/benchmarks/passing/bril/core/fact.bril
@@ -1,27 +1,10 @@
-# ARGS: 9500
+# ARGS: 920
 
 # Recursive factorial
-
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: int = call @orig_main loop_counter;
-  final_output: int = add final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(a: int): int {
+@main(a: int) {
   x: int = call @fact a;
   v13: int = const 0;
-  ret x;
+  print x;
 }
 
 
diff --git a/benchmarks/passing/bril/core/factors.bril b/benchmarks/passing/bril/core/factors.bril
index 2f6eb0c51..8758884a2 100644
--- a/benchmarks/passing/bril/core/factors.bril
+++ b/benchmarks/passing/bril/core/factors.bril
@@ -5,24 +5,7 @@
 
 # input: a positive integer
 # output: the integer's factors
-
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: int = call @orig_main loop_counter;
-  final_output: int = add final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(num: int): int {
+@main(num: int) {
   zer: int = const 0;
   one: int = const 1;
   fac: int = const 2;
@@ -49,5 +32,5 @@
   jmp .loopcheck;
 
 .loopend:
-  ret sum;
+  print sum;
 }
\ No newline at end of file
diff --git a/benchmarks/passing/bril/core/fitsinside.bril b/benchmarks/passing/bril/core/fitsinside.bril
index 469522c09..31ea4444a 100644
--- a/benchmarks/passing/bril/core/fitsinside.bril
+++ b/benchmarks/passing/bril/core/fitsinside.bril
@@ -1,52 +1,7 @@
-# ARGS: 112
-
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  loop2_counter: int = const 10;
-.loop2_cond:
-  loop2_cond: bool = lt loop2_counter loop_bound;
-  br loop2_cond .loop2_body .loop2_done;
-.loop2_body:
-  loop3_counter: int = const 10;
-.loop3_cond:
-  loop3_cond: bool = lt loop3_counter loop_bound;
-  br loop3_cond .loop3_body .loop3_done;
-.loop3_body:
-  loop4_counter: int = const 10;
-.loop4_cond:
-  loop4_cond: bool = lt loop4_counter loop_bound;
-  br loop4_cond .loop4_body .loop4_done;
-.loop4_body:
-  output: bool = call @orig_main loop_counter loop2_counter loop3_counter loop4_counter;
-  br output .output_incr .skip;
-.output_incr:
-  final_output: int = add final_output loop_incr;
-.skip:  
-  loop4_counter: int = add loop4_counter loop_incr;
-  jmp .loop4_cond;
-.loop4_done:  
-  loop3_counter: int = add loop3_counter loop_incr;
-  jmp .loop3_cond;
-.loop3_done:  
-  loop2_counter: int = add loop2_counter loop_incr;
-  jmp .loop2_cond;
-.loop2_done:
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-
-@orig_main (width1:int, height1:int, width2:int, height2:int): bool {
+# ARGS: 112 233 900 3211
+@main (width1:int, height1:int, width2:int, height2:int) {
   output: bool = call @fitsInside width1 height1 width2 height2;
-  ret output;
+  print output;
 }
 
 @fitsInside(w1: int, h1: int, w2: int, h2: int) : bool {
diff --git a/benchmarks/passing/bril/core/fizz-buzz.bril b/benchmarks/passing/bril/core/fizz-buzz.bril
index b47363cfe..70afae0f1 100644
--- a/benchmarks/passing/bril/core/fizz-buzz.bril
+++ b/benchmarks/passing/bril/core/fizz-buzz.bril
@@ -1,21 +1,5 @@
 # ARGS: 10300
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: int = call @orig_main loop_counter;
-  final_output: int = add final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(input: int): int {
+@main(input: int) {
   sum: int = const 0;
   v1: int = const 1;
   index: int = id v1;
@@ -86,5 +70,5 @@
   index: int = id v43;
   jmp .for.cond.0;
 .for.end.0:
-  ret sum;
+  print sum;
 }
diff --git a/benchmarks/passing/bril/core/gcd.bril b/benchmarks/passing/bril/core/gcd.bril
index 1de8c63fd..e0cb56c81 100644
--- a/benchmarks/passing/bril/core/gcd.bril
+++ b/benchmarks/passing/bril/core/gcd.bril
@@ -1,36 +1,13 @@
-# ARGS: 1190
+# ARGS: 1190 83
+
+# TODO: failing on nightly -- hangs during benchmarking
 
 # GCD: Greatest Common Divisor
 # Euclidean algorithm
 
 # input: two positive integer - op1, op2
 # output: one positive integer - gcd(op1, op2)
-
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  inner_counter: int = const 10;
-.inner_cond:
-  inner_cond: bool = lt inner_counter loop_bound;
-  br inner_cond .inner_body .inner_done;
-.inner_body:
-  output: int = call @orig_main loop_counter inner_counter;
-  final_output: int = add final_output output;
-  inner_counter: int = add inner_counter loop_incr;
-  jmp .inner_cond;
-.inner_done:
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(op1 : int, op2: int ): int {
+@main(op1 : int, op2: int) {
   # const
   vc0: int = const 0;
   # take two input ops, first iteration
@@ -61,5 +38,5 @@
   jmp .cmp.val;
   # print out the results
 .program.end:
-  ret v1;
+  print v1;
 }
diff --git a/benchmarks/passing/bril/core/lcm.bril b/benchmarks/passing/bril/core/lcm.bril
index 4a145f03e..5e9a1159a 100644
--- a/benchmarks/passing/bril/core/lcm.bril
+++ b/benchmarks/passing/bril/core/lcm.bril
@@ -1,30 +1,5 @@
-# ARGS: 165
-
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  inner_counter: int = const 10;
-.inner_cond:
-  inner_cond: bool = lt inner_counter loop_bound;
-  br inner_cond .inner_body .inner_done;
-.inner_body:
-  output: int = call @orig_main loop_counter inner_counter;
-  final_output: int = add final_output output;
-  inner_counter: int = add inner_counter loop_incr;
-  jmp .inner_cond;
-.inner_done:
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(x : int, y : int ): int {
+# ARGS: 165 233
+@main(x : int, y : int) {
   greater: int = id y;
   v4: bool = gt x y;
   br v4 .then.1 .else.1;
@@ -48,8 +23,7 @@
   greater:int = add greater one;
   jmp .foreverloop;
 .loopend:
-  ret greater;
-
+  print greater;
 }
 
 @getMod(val: int, mod: int): int{
diff --git a/benchmarks/passing/bril/core/loopfact.bril b/benchmarks/passing/bril/core/loopfact.bril
index 48ad31ed3..e098bfdb9 100644
--- a/benchmarks/passing/bril/core/loopfact.bril
+++ b/benchmarks/passing/bril/core/loopfact.bril
@@ -1,22 +1,5 @@
 # ARGS: 13500
-
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: int = call @orig_main loop_counter;
-  final_output: int = add final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(input : int): int {
+@main(input : int) {
   value: int = id input;
   v1: int = const 1;
   result: int = id v1;
@@ -40,5 +23,5 @@
 .for.end.2:
   v13: int = id result;
   v14: int = const 0;
-  ret v13;
+  print v13;
 }
diff --git a/benchmarks/passing/bril/core/mod_inv.bril b/benchmarks/passing/bril/core/mod_inv.bril
index 66bec76b1..6d7472c47 100644
--- a/benchmarks/passing/bril/core/mod_inv.bril
+++ b/benchmarks/passing/bril/core/mod_inv.bril
@@ -1,22 +1,5 @@
 # ARGS: 2800000
-
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: int = call @orig_main loop_counter;
-  final_output: int = add final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(n: int): int {
+@main(n: int) {
   p: int = const 10007;
   v0: int = const 2;
   two: int = id v0;
@@ -68,7 +51,7 @@
 .for.end.6:
   v32: int = id ans;
   v33: int = const 0;
-  ret v32;
+  print v32;
 }
 @mod(n: int, p: int): int {
   v0: int = id n;
diff --git a/benchmarks/passing/bril/core/palindrome.bril b/benchmarks/passing/bril/core/palindrome.bril
index 8b1101476..46a6bf57b 100644
--- a/benchmarks/passing/bril/core/palindrome.bril
+++ b/benchmarks/passing/bril/core/palindrome.bril
@@ -1,25 +1,6 @@
-# ARGS: 2050000
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: bool = call @orig_main loop_counter;
-  br output .output_incr .skip;
-.output_incr:
-  final_output: int = add final_output loop_incr;
-.skip:
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
+# ARGS: 2343553432
 
-@orig_main(in : int): bool {
-#in: int = const 2343553432;
+@main(in : int) {
 ten: int = const 10;
 zero: int = const 0;
 one: int = const 1;
@@ -41,7 +22,7 @@ not_finished: bool = const true;
 .for.end:
  exp: int = sub index one;
  is_palindrome: bool = call @palindrome in exp;
- ret is_palindrome;
+ print is_palindrome;
 }
 
 @pow(base: int, exp: int): int {
diff --git a/benchmarks/passing/bril/core/pascals-row.bril b/benchmarks/passing/bril/core/pascals-row.bril
index 711b3efb4..cbe219b7d 100644
--- a/benchmarks/passing/bril/core/pascals-row.bril
+++ b/benchmarks/passing/bril/core/pascals-row.bril
@@ -1,27 +1,10 @@
 # ARGS: 12000
-
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: int = call @orig_main loop_counter;
-  final_output: int = add final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(v0 : int): int {
+@main(v0 : int) {
   x: int = id v0;
   v1: int = id x;
   sum: int = call @generateNthRow v1;
   v2: int = const 0;
-  ret sum;
+  print sum;
 }
 @generateNthRow(x: int): int {
   sum: int = const 0;
diff --git a/benchmarks/passing/bril/core/perfect.bril b/benchmarks/passing/bril/core/perfect.bril
index 6f19b4b0c..7de1a6a58 100644
--- a/benchmarks/passing/bril/core/perfect.bril
+++ b/benchmarks/passing/bril/core/perfect.bril
@@ -1,22 +1,5 @@
 # ARGS: 265000
-
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: int = call @orig_main loop_counter;
-  final_output: int = add final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(input: int): int {
+@main(input: int) {
   n: int = id input;
   v0: int = const 0;
   v1: int = const 1;
@@ -46,5 +29,5 @@
   .if.success:
    result: int = id v0;
   .if.failure.end:
-  ret result;
+  print result;
 }
diff --git a/benchmarks/passing/bril/core/primes-between.bril b/benchmarks/passing/bril/core/primes-between.bril
index 0bad885e1..d765a5d0b 100644
--- a/benchmarks/passing/bril/core/primes-between.bril
+++ b/benchmarks/passing/bril/core/primes-between.bril
@@ -1,29 +1,5 @@
-# ARGS: 365
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  inner_counter: int = const 10;
-.inner_cond:
-  inner_cond: bool = lt inner_counter loop_bound;
-  br inner_cond .inner_body .inner_done;
-.inner_body:
-  output: int = call @orig_main loop_counter inner_counter;
-  final_output: int = add final_output output;
-  inner_counter: int = add inner_counter loop_incr;
-  jmp .inner_cond;
-.inner_done:
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(a : int, b: int ): int {
+# ARGS: 0 4000
+@main(a : int, b: int ) {
   sum: int = const 0;
   .for.outer.init: # start at interval unless a < 2, then start at 2.
     t0 : int = const 2;
@@ -72,7 +48,7 @@
     t2 : int = add t2 t17;
     jmp .for.outer.cond;
   .for.outer.end:
-  ret sum;
+  print sum;
 }
 
 @mod(a : int, b : int) : int {
diff --git a/benchmarks/passing/bril/core/recfact.bril b/benchmarks/passing/bril/core/recfact.bril
index 0ac0582b2..5687865bb 100644
--- a/benchmarks/passing/bril/core/recfact.bril
+++ b/benchmarks/passing/bril/core/recfact.bril
@@ -1,29 +1,12 @@
 # ARGS: 9300
-
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: int = call @orig_main loop_counter;
-  final_output: int = add final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(input: int): int {
+@main(input: int) {
   x: int = id input;
   v1: int = id x;
   f: int = call @fac v1;
   f: int = id f;
   v2: int = id f;
   v3: int = const 0;
-  ret v2;
+  print v2;
 }
 
 @fac(x: int): int {
diff --git a/benchmarks/passing/bril/core/rectangles-area-difference.bril b/benchmarks/passing/bril/core/rectangles-area-difference.bril
deleted file mode 100644
index 2b3afa8f2..000000000
--- a/benchmarks/passing/bril/core/rectangles-area-difference.bril
+++ /dev/null
@@ -1,59 +0,0 @@
-# ARGS: 106
-
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  loop2_counter: int = const 10;
-.loop2_cond:
-  loop2_cond: bool = lt loop2_counter loop_bound;
-  br loop2_cond .loop2_body .loop2_done;
-.loop2_body:
-  loop3_counter: int = const 10;
-.loop3_cond:
-  loop3_cond: bool = lt loop3_counter loop_bound;
-  br loop3_cond .loop3_body .loop3_done;
-.loop3_body:
-  loop4_counter: int = const 10;
-.loop4_cond:
-  loop4_cond: bool = lt loop4_counter loop_bound;
-  br loop4_cond .loop4_body .loop4_done;
-.loop4_body:
-  output: int = call @orig_main loop_counter loop2_counter loop3_counter loop4_counter;
-  final_output: int = add final_output output;
-  loop4_counter: int = add loop4_counter loop_incr;
-  jmp .loop4_cond;
-.loop4_done:  
-  loop3_counter: int = add loop3_counter loop_incr;
-  jmp .loop3_cond;
-.loop3_done:  
-  loop2_counter: int = add loop2_counter loop_incr;
-  jmp .loop2_cond;
-.loop2_done:
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main (x1 : int, y1: int, x2: int, y2 : int): int {
-      a1: int = call @area x1 y1;
-      a2: int = call @area x2 y2;
-      res: int = sub a1 a2;
-      a1_bigger: bool = gt a1 a2;
-      br a1_bigger .end .flip;
-.flip:
-      neg1: int = const -1;
-      res: int = mul res neg1;
-.end:
-      ret res;
-}
-
-@area (x : int, y : int) : int {
-      area: int = mul x y;
-      ret area;
-}
diff --git a/benchmarks/passing/bril/core/relative-primes.bril b/benchmarks/passing/bril/core/relative-primes.bril
index ffe09fbd7..32689e0d0 100644
--- a/benchmarks/passing/bril/core/relative-primes.bril
+++ b/benchmarks/passing/bril/core/relative-primes.bril
@@ -1,26 +1,10 @@
 # ARGS: 3400
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: int = call @orig_main loop_counter;
-  final_output: int = add final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(v0 : int): int {
+@main(v0 : int) {
   a: int = id v0;
   v1: int = id a;
   sum: int = call @relative_primes v1;
   v2: int = const 0;
-  ret sum;
+  print sum;
 }
 @mod(a: int, b: int): int {
   v0: int = id a;
diff --git a/benchmarks/passing/bril/core/reverse.bril b/benchmarks/passing/bril/core/reverse.bril
index 6bb5a9bdd..27d59247f 100644
--- a/benchmarks/passing/bril/core/reverse.bril
+++ b/benchmarks/passing/bril/core/reverse.bril
@@ -1,21 +1,6 @@
-# ARGS: 8000000
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: int = call @orig_main loop_counter;
-  final_output: int = add final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
+# ARGS: 8010002
 
-@orig_main(input : int): int {
+@main(input : int) {
   n: int = id input;
   v0: int = const 0;
   v1: int = const 10;
@@ -41,6 +26,6 @@
 .for.incre:
   jmp .for.cond.3;
 .for.end.3:
-  ret result;
+  print result;
 }
 
diff --git a/benchmarks/passing/bril/core/sum-bits.bril b/benchmarks/passing/bril/core/sum-bits.bril
index 487820422..f01084474 100644
--- a/benchmarks/passing/bril/core/sum-bits.bril
+++ b/benchmarks/passing/bril/core/sum-bits.bril
@@ -1,21 +1,5 @@
-# ARGS: 3100000
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: int = call @orig_main loop_counter;
-  final_output: int = add final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(input : int): int {
+# ARGS: 3124109
+@main(input: int) {
   sum : int = const 0;
   two : int = const 2;
   zero : int = const 0;
@@ -28,7 +12,7 @@
   sum : int = add sum bit;
   jmp .loop;
 .done:
-  ret sum;
+  print sum;
 }
 
 @mod(dividend : int, divisor : int) : int {
diff --git a/benchmarks/passing/bril/core/sum-check.bril b/benchmarks/passing/bril/core/sum-check.bril
index a777bc7f1..24d9c68bd 100644
--- a/benchmarks/passing/bril/core/sum-check.bril
+++ b/benchmarks/passing/bril/core/sum-check.bril
@@ -1,24 +1,7 @@
 # ARGS: 14500
 # compute the sum of [1, n] by both loop and formula
 # and compare them to see if the result is the same
-
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: int = call @orig_main loop_counter;
-  final_output: int = add final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(n: int): int {
+@main(n: int) {
     sum: int = const 0;
     first: int = call @sum_by_loop n;
     second: int = call @sum_by_formula n;
@@ -30,7 +13,7 @@
     one: int = const 1;
     sum: int = add sum one;
 .done:
-    ret sum;
+    print sum;
 }
 
 @sum_by_loop(n: int): int {
diff --git a/benchmarks/passing/bril/core/sum-divisors.bril b/benchmarks/passing/bril/core/sum-divisors.bril
index 6514242bc..2e2cdf4d3 100644
--- a/benchmarks/passing/bril/core/sum-divisors.bril
+++ b/benchmarks/passing/bril/core/sum-divisors.bril
@@ -2,24 +2,7 @@
 
 # Finds the sum of an integer input's positive divisors.
 # Prints the divisors as they are found, followed by the sum.
-
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: int = call @orig_main loop_counter;
-  final_output: int = add final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(n : int): int {
+@main(n : int) {
   sum: int = const 0;
   i   : int = const 0;
   res : int = const 0;
@@ -50,7 +33,7 @@
   res : int = add res d;
   jmp .begin;
 .end:
-  ret res;
+  print res;
 }
 
 @mod(dividend: int, divisor: int) : int {
diff --git a/benchmarks/passing/bril/core/sum-sq-diff.bril b/benchmarks/passing/bril/core/sum-sq-diff.bril
index fcd2519d2..02a5e4415 100644
--- a/benchmarks/passing/bril/core/sum-sq-diff.bril
+++ b/benchmarks/passing/bril/core/sum-sq-diff.bril
@@ -28,6 +28,7 @@
   v15: int = id res;
   ret v15;
 }
+
 @squareOfSum(n: int): int {
   v0: int = const 0;
   res: int = id v0;
@@ -56,23 +57,7 @@
   v15: int = id square;
   ret v15;
 }
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: int = call @orig_main loop_counter;
-  final_output: int = add final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(n: int): int {
+@main(n: int) {
   v0: int = id n;
   sum: int = call @sumOfSquares v0;
   sum: int = id sum;
@@ -86,5 +71,5 @@
   v5: int = id diff;
   v6: int = const 0;
   v7: int = id diff;
-  ret v5;
+  print v5;
 }
diff --git a/benchmarks/passing/bril/core/totient.bril b/benchmarks/passing/bril/core/totient.bril
index 96dd5a4af..05392db81 100644
--- a/benchmarks/passing/bril/core/totient.bril
+++ b/benchmarks/passing/bril/core/totient.bril
@@ -1,26 +1,9 @@
 # ARGS: 500000
-
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: int = call @orig_main loop_counter;
-  final_output: int = add final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(n: int): int {
+@main(n: int) {
   sum: int = id n;
   tot: int = call @totient n;
   sum: int = add sum tot;
-  ret sum;
+  print sum;
 }
 
 @totient (n: int): int {
diff --git a/benchmarks/passing/bril/core/up-arrow.bril b/benchmarks/passing/bril/core/up-arrow.bril
index a9fed7848..27a55a0b5 100644
--- a/benchmarks/passing/bril/core/up-arrow.bril
+++ b/benchmarks/passing/bril/core/up-arrow.bril
@@ -1,4 +1,4 @@
-# ARGS: 9
+# ARGS: 6
 @main(n: int) {
   arrows: int = const 2;
   repeats: int = const 3;
@@ -6,6 +6,8 @@
   print ans;
 }
 
+# Computes Knuth’s up arrow notation, with the first argument being the number, the second argument being the number of Knuth’s up arrows, and the third argument being the number of repeats.
+
 @up_arrow(num: int, arrows: int, repeats: int): int {
   one: int = const 1;
 
diff --git a/benchmarks/passing/bril/float/conjugate-gradient.bril b/benchmarks/passing/bril/float/conjugate-gradient.bril
index 9de513cc8..6fe8491b6 100644
--- a/benchmarks/passing/bril/float/conjugate-gradient.bril
+++ b/benchmarks/passing/bril/float/conjugate-gradient.bril
@@ -2,24 +2,7 @@
 
 # Conjugate gradient method to solve Ax=b. Currently A is a 3x3 diagonal with 
 # incrementing values, but any arbitrary spd A can be used
-
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: float = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: float = call @orig_main loop_counter;
-  final_output: float = fadd final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(n: int): float {
+@main(n: int) {
 	one: int = const 1;
 	fone: float = const 1;
 	a :ptr<float> = call @get_sym n;
@@ -51,7 +34,7 @@
 	free x0;
 	free b;
 	free a;
-	ret sum;
+	print sum;
 }
 
 # returns the scalar-vector product cv
diff --git a/benchmarks/passing/bril/float/cordic.bril b/benchmarks/passing/bril/float/cordic.bril
index 58aa74347..95f383e53 100644
--- a/benchmarks/passing/bril/float/cordic.bril
+++ b/benchmarks/passing/bril/float/cordic.bril
@@ -1,28 +1,12 @@
 # ARGS: 2250000
 
-@main(loop_bound: float) {
-  loop_incr: float = const 1;
-  loop_counter: float = const 10;
-  final_output: float = const 0;
-.loop_cond:
-  loop_cond: bool = flt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: float = call @orig_main loop_counter;
-  final_output: float = fadd final_output output;
-  loop_counter: float = fadd loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(theta: float): float {
+@main(theta: float) {
   v0: float = id theta;
   x: float = call @cordic v0;
   x: float = id x;
   v1: float = id x;
   v2: int = const 0;
-  ret v1;
+  print v1;
 }
 @cordic(theta: float): float {
   v0: float = const 0.7853981633974483;
diff --git a/benchmarks/passing/bril/float/euler.bril b/benchmarks/passing/bril/float/euler.bril
index 7af3aae21..504fd3bfa 100644
--- a/benchmarks/passing/bril/float/euler.bril
+++ b/benchmarks/passing/bril/float/euler.bril
@@ -8,7 +8,6 @@
   e: float = id e;
   v1: float = id e;
   print v1;
-  v2: int = const 0;
 }
 
 @factorial(n: float): float {
diff --git a/benchmarks/passing/bril/float/n_root.bril b/benchmarks/passing/bril/float/n_root.bril
index f3828dea1..853e0fcff 100644
--- a/benchmarks/passing/bril/float/n_root.bril
+++ b/benchmarks/passing/bril/float/n_root.bril
@@ -1,4 +1,4 @@
-# ARGS: 400000
+# ARGS: 380000
 
 @pow(x:float, k:int):float
 {
@@ -42,24 +42,8 @@
 }
 
 
-@main(loop_bound: float) {
-  loop_incr: float = const 1;
-  loop_counter: float = const 10;
-  final_output: float = const 0;
-.loop_cond:
-  loop_cond: bool = flt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: float = call @orig_main loop_counter;
-  final_output: float = fadd final_output output;
-  loop_counter: float = fadd loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(x: float): float {
-    n: int = const 5;
-    result: float = call @n_root x n;
-    ret result;
+@main(x: float) {
+  n: int = const 5;
+  result: float = call @n_root x n;
+  print result;
 }
diff --git a/benchmarks/passing/bril/float/newton.bril b/benchmarks/passing/bril/float/newton.bril
index 9d98792fc..4683e4a79 100644
--- a/benchmarks/passing/bril/float/newton.bril
+++ b/benchmarks/passing/bril/float/newton.bril
@@ -1,21 +1,5 @@
 # ARGS: 2200000
-@main(loop_bound: float) {
-  loop_incr: float = const 1;
-  loop_counter: float = const 10;
-  final_output: float = const 0;
-.loop_cond:
-  loop_cond: bool = flt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: float = call @orig_main loop_counter;
-  final_output: float = fadd final_output output;
-  loop_counter: float = fadd loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(n: float): float {
+@main(n: float) {
   two: float = const 2;
   x: float = fdiv n two; # fist approximation
   prev_x: float = id n;
@@ -29,7 +13,7 @@
   x: float = call @sqrt x n;
   jmp .for.cond.1;
 .for.end.1:
-  ret x;
+  print x;
 }
 
 # one iteration of the newton method:
diff --git a/benchmarks/passing/bril/float/norm.bril b/benchmarks/passing/bril/float/norm.bril
index 8c34b3b76..456481b72 100644
--- a/benchmarks/passing/bril/float/norm.bril
+++ b/benchmarks/passing/bril/float/norm.bril
@@ -1,4 +1,4 @@
-# ARGS: 25
+# ARGS: 25 23 34 23 10
 
 @pow(x:float, k:int):float
 {
@@ -101,59 +101,11 @@
     ret norm;
 }
 
-@main(loop_bound: float) {
-  loop_incr: float = const 1;
-  loop_counter: float = const 10;
-  final_output: float = const 0;
-.loop_cond:
-  loop_cond: bool = flt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  loop2_counter: float = const 10;
-.loop2_cond:
-  loop2_cond: bool = flt loop2_counter loop_bound;
-  br loop2_cond .loop2_body .loop2_done;
-.loop2_body:
-  loop3_counter: float = const 10;
-.loop3_cond:
-  loop3_cond: bool = flt loop3_counter loop_bound;
-  br loop3_cond .loop3_body .loop3_done;
-.loop3_body:
-  loop4_counter: float = const 11;
-.loop4_cond:
-  loop4_cond: bool = flt loop4_counter loop_bound;
-  br loop4_cond .loop4_body .loop4_done;
-.loop4_body:
-  loop5_counter: float = const 11;
-.loop5_cond:
-  loop5_cond: bool = flt loop5_counter loop_bound;
-  br loop5_cond .loop5_body .loop5_done;
-.loop5_body:
-  output: float = call @orig_main loop_counter loop2_counter loop3_counter loop4_counter loop5_counter;
-  final_output: float = fadd final_output output;
-  loop5_counter: float = fadd loop5_counter loop_incr;
-  jmp .loop5_cond;
-.loop5_done:  
-  loop4_counter: float = fadd loop4_counter loop_incr;
-  jmp .loop4_cond;
-.loop4_done:  
-  loop3_counter: float = fadd loop3_counter loop_incr;
-  jmp .loop3_cond;
-.loop3_done:  
-  loop2_counter: float = fadd loop2_counter loop_incr;
-  jmp .loop2_cond;
-.loop2_done:
-  loop_counter: float = fadd loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(n1: float, n2: float, n3: float, n4: float, n5: float) : float
+@main(n1: float, n2: float, n3: float, n4: float, n5: float)
 {
     size: int = const 5;
     array: ptr<float> = call @pack size n1 n2 n3 n4 n5;
     norm: float = call @euclidean_norm array size;
     free array;
-    ret norm;
+    print norm;
 }
\ No newline at end of file
diff --git a/benchmarks/passing/bril/float/ray-sphere-intersection.bril b/benchmarks/passing/bril/float/ray-sphere-intersection.bril
index 26fac3bf4..1cfd50aff 100644
--- a/benchmarks/passing/bril/float/ray-sphere-intersection.bril
+++ b/benchmarks/passing/bril/float/ray-sphere-intersection.bril
@@ -1,4 +1,4 @@
-# ARGS: 28
+# ARGS: 0.0 0.0 0.0 0.33 0.33 0.33 5.0 5.0 5.0 1.0
 # Ray-sphere intersection algorithm. Prints true if
 # the ray intersects the sphere, and false otherwise.
 #
@@ -97,72 +97,12 @@
   ret v13;
 }
 
-@main(loop_bound: float) {
-  loop_incr: float = const 1;
-  one: int = const 1;
-  loop_counter: float = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = flt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  loop2_counter: float = const 10;
-.loop2_cond:
-  loop2_cond: bool = flt loop2_counter loop_bound;
-  br loop2_cond .loop2_body .loop2_done;
-.loop2_body:
-  loop3_counter: float = const 10;
-.loop3_cond:
-  loop3_cond: bool = flt loop3_counter loop_bound;
-  br loop3_cond .loop3_body .loop3_done;
-.loop3_body:
-  loop4_counter: float = const 10;
-.loop4_cond:
-  loop4_cond: bool = flt loop4_counter loop_bound;
-  br loop4_cond .loop4_body .loop4_done;
-.loop4_body:
-  loop5_counter: float = const 11;
-.loop5_cond:
-  loop5_cond: bool = flt loop5_counter loop_bound;
-  br loop5_cond .loop5_body .loop5_done;
-.loop5_body:
-  loop6_counter: float = const 11;
-.loop6_cond:
-  loop6_cond: bool = flt loop6_counter loop_bound;
-  br loop6_cond .loop6_body .loop6_done;
-.loop6_body:
-  output: bool = call @orig_main loop_counter loop2_counter loop3_counter loop4_counter loop5_counter loop6_counter;
-  br output .output_incr .skip;
-.output_incr:
-  final_output: int = add final_output one;
-.skip:
-  loop6_counter: float = fadd loop6_counter loop_incr;
-  jmp .loop6_cond;
-.loop6_done:  
-  loop5_counter: float = fadd loop5_counter loop_incr;
-  jmp .loop5_cond;
-.loop5_done:  
-  loop4_counter: float = fadd loop4_counter loop_incr;
-  jmp .loop4_cond;
-.loop4_done:  
-  loop3_counter: float = fadd loop3_counter loop_incr;
-  jmp .loop3_cond;
-.loop3_done:  
-  loop2_counter: float = fadd loop2_counter loop_incr;
-  jmp .loop2_cond;
-.loop2_done:
-  loop_counter: float = fadd loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(rayDirectionX: float,
+@main(rayDirectionX: float,
            rayDirectionY: float,
            rayDirectionZ: float,
            circleCenterX: float,
            circleCenterY: float,
-           circleCenterZ: float): bool {
+           circleCenterZ: float) {
   rayOriginX: float = const 0.0;
   rayOriginY: float = const 0.0;
   rayOriginZ: float = const 0.0;
@@ -180,5 +120,5 @@
   v18: float = id circleCenterZ;
   v19: float = id radius;
   intersected: bool = call @RaySphereIntersection v10 v11 v12 v13 v14 v15 v16 v17 v18 v19;
-  ret intersected;
+  print intersected;
 }
diff --git a/benchmarks/passing/bril/float/riemann.bril b/benchmarks/passing/bril/float/riemann.bril
index 3da3b729d..139ea499a 100644
--- a/benchmarks/passing/bril/float/riemann.bril
+++ b/benchmarks/passing/bril/float/riemann.bril
@@ -4,12 +4,14 @@
 @main(n: float) {
     a: float = const 2.0;
     b: float = const 10.0;
+    res: float = const 0.0;
     left : float = call @left_riemann a b n;
-    print left;
+    res : float = fadd res left;
     midpoint: float = call @midpoint_riemann a b n;
-    print midpoint;
+    res : float = fadd res midpoint;
     right : float = call @right_riemann a b n;
-    print right;
+    res : float = fadd res right;
+    print res;
 }
 
 @square_function(x: float): float {
diff --git a/benchmarks/passing/bril/float/sqrt.bril b/benchmarks/passing/bril/float/sqrt.bril
index 50c60a04d..9fb603512 100644
--- a/benchmarks/passing/bril/float/sqrt.bril
+++ b/benchmarks/passing/bril/float/sqrt.bril
@@ -1,22 +1,5 @@
 # ARGS: 2000000
-
-@main(loop_bound: float) {
-  loop_incr: float = const 1;
-  loop_counter: float = const 10;
-  final_output: float = const 0;
-.loop_cond:
-  loop_cond: bool = flt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: float = call @orig_main loop_counter;
-  final_output: float = fadd final_output output;
-  loop_counter: float = fadd loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(v0: float): float {
+@main(v0: float) {
   n: float = id v0;
   v1: float = const 0.00001;
   precision: float = id v1;
@@ -72,5 +55,5 @@
 .for.end.4:
   v31: float = id x;
   v32: int = const 0;
-  ret v31;
+  print v31;
 }
diff --git a/benchmarks/passing/bril/long/function_call.bril b/benchmarks/passing/bril/long/function_call.bril
index da63a6f00..8a06b98ad 100644
--- a/benchmarks/passing/bril/long/function_call.bril
+++ b/benchmarks/passing/bril/long/function_call.bril
@@ -1,28 +1,38 @@
-# ARGS: 21
+# ARGS: 25
+
+# failing due to brillvm bug:
+# https://github.com/sampsyo/bril/issues/339
+
 @main(starting_m: int) {
-  call @orig_main starting_m;
+  res: int = call @myrec starting_m;
+  print res;
 }
 
-@orig_main(starting_m: int) {
+@myrec(starting_m: int): int {
   m : int = id starting_m;
   zero: int = const 0;
   one: int = const 1;
   two: int = const 2;
   cond_m: bool = eq m zero;
+  res: int = const 0;
+  tmp: int = const 0;
   br cond_m .end .m_nonzero;
 .m_nonzero:
   m:int = sub m one;
-  call @orig_main m;
+  tmp:int = call @myrec m;
+  res:int = add res tmp;
   cond_m: bool = eq m zero;
   br cond_m .end .m_nonzero2;
 .m_nonzero2:
   m:int = sub m one;
-  call @orig_main m;
+  tmp:int = call @myrec m;
+  res:int = add res tmp;
   cond_m: bool = eq m zero;
   br cond_m .end .m_nonzero3;
 .m_nonzero3:
   m:int = sub m one;
-  call @orig_main m;
+  tmp:int = call @myrec m;
+  res:int = add res tmp;
 .end:
-  print starting_m;
-}
+  ret res;
+}
\ No newline at end of file
diff --git a/benchmarks/passing/bril/mem/adj2csr.bril b/benchmarks/passing/bril/mem/adj2csr.bril
index 93ca3b9f0..bfa6c791a 100644
--- a/benchmarks/passing/bril/mem/adj2csr.bril
+++ b/benchmarks/passing/bril/mem/adj2csr.bril
@@ -130,23 +130,7 @@
   ret num_edges;
 }
 
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: int = call @orig_main loop_counter;
-  final_output: int = add final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(num_nodes: int): int {
+@main(num_nodes: int) {
   sum: int = const 0;
 
   seed: int = const 2348512;
@@ -176,5 +160,5 @@
   free csr_offset;
   free csr_edges;
   free rng;
-  ret sum;
+  print sum;
 }
diff --git a/benchmarks/passing/bril/mem/adler32.bril b/benchmarks/passing/bril/mem/adler32.bril
index 512e80e0f..3aea204bc 100644
--- a/benchmarks/passing/bril/mem/adler32.bril
+++ b/benchmarks/passing/bril/mem/adler32.bril
@@ -1,26 +1,10 @@
 # ARGS: 13000
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: int = call @orig_main loop_counter;
-  final_output: int = add final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(size : int): int {
+@main(size : int) {
   arr: ptr<int> = alloc size;
   call @fill_array arr size;
   checksum: int = call @adler32 arr size;
   free arr;
-  ret checksum;
+  print checksum;
 }
 
 # Calculate the modulo of the two numbers using arithmetic
diff --git a/benchmarks/passing/bril/mem/binary-search.bril b/benchmarks/passing/bril/mem/binary-search.bril
index 8675de9a6..b1199eb25 100644
--- a/benchmarks/passing/bril/mem/binary-search.bril
+++ b/benchmarks/passing/bril/mem/binary-search.bril
@@ -1,4 +1,4 @@
-# ARGS: 34
+# ARGS: 1 2 6 7 11
 
 # A standard binary search, not super interesting
 # Inputs: An array of 5 elements (fixed), and a target
@@ -81,7 +81,7 @@
     ret output;
 }
 
-@orig_main(e1: int, e2: int, e3: int, e4: int, e5: int): int {
+@main(e1: int, e2: int, e3: int, e4: int, e5: int) {
     target: int = const 7;
     size: int = const 5;
     zero: int = const 0;
@@ -89,53 +89,5 @@
     array: ptr<int> = call @pack size e1 e2 e3 e4 e5;
     output: int = call @binary_search array target zero four;
     free array;
-    ret output;
+    print output;
 }
-
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  loop2_counter: int = const 10;
-.loop2_cond:
-  loop2_cond: bool = lt loop2_counter loop_bound;
-  br loop2_cond .loop2_body .loop2_done;
-.loop2_body:
-  loop3_counter: int = const 10;
-.loop3_cond:
-  loop3_cond: bool = lt loop3_counter loop_bound;
-  br loop3_cond .loop3_body .loop3_done;
-.loop3_body:
-  loop4_counter: int = const 10;
-.loop4_cond:
-  loop4_cond: bool = lt loop4_counter loop_bound;
-  br loop4_cond .loop4_body .loop4_done;
-.loop4_body:
-  loop5_counter: int = const 10;
-.loop5_cond:
-  loop5_cond: bool = lt loop5_counter loop_bound;
-  br loop5_cond .loop5_body .loop5_done;
-.loop5_body:
-  output: int = call @orig_main loop_counter loop2_counter loop3_counter loop4_counter loop5_counter;
-  final_output: int = add final_output output;
-  loop5_counter: int = add loop5_counter loop_incr;
-  jmp .loop5_cond;
-.loop5_done:  
-  loop4_counter: int = add loop4_counter loop_incr;
-  jmp .loop4_cond;
-.loop4_done:  
-  loop3_counter: int = add loop3_counter loop_incr;
-  jmp .loop3_cond;
-.loop3_done:  
-  loop2_counter: int = add loop2_counter loop_incr;
-  jmp .loop2_cond;
-.loop2_done:
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
\ No newline at end of file
diff --git a/benchmarks/passing/bril/mem/bubblesort.bril b/benchmarks/passing/bril/mem/bubblesort.bril
index 8f10c9788..67aa05abe 100644
--- a/benchmarks/passing/bril/mem/bubblesort.bril
+++ b/benchmarks/passing/bril/mem/bubblesort.bril
@@ -1,4 +1,5 @@
-# ARGS: 31
+# ARGS: 5 3 10 1 9 7
+
 # Bubble Sort for a list containing 5 elements. It is sorted in ascending order. 
 # It can be easily extended to list with any other length. 
 
@@ -63,55 +64,7 @@
     ret;
 }
 
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  loop2_counter: int = const 10;
-.loop2_cond:
-  loop2_cond: bool = lt loop2_counter loop_bound;
-  br loop2_cond .loop2_body .loop2_done;
-.loop2_body:
-  loop3_counter: int = const 10;
-.loop3_cond:
-  loop3_cond: bool = lt loop3_counter loop_bound;
-  br loop3_cond .loop3_body .loop3_done;
-.loop3_body:
-  loop4_counter: int = const 10;
-.loop4_cond:
-  loop4_cond: bool = lt loop4_counter loop_bound;
-  br loop4_cond .loop4_body .loop4_done;
-.loop4_body:
-  loop5_counter: int = const 10;
-.loop5_cond:
-  loop5_cond: bool = lt loop5_counter loop_bound;
-  br loop5_cond .loop5_body .loop5_done;
-.loop5_body:
-  output: int = call @orig_main loop_counter loop2_counter loop3_counter loop4_counter loop5_counter;
-  final_output: int = add final_output output;
-  loop5_counter: int = add loop5_counter loop_incr;
-  jmp .loop5_cond;
-.loop5_done:  
-  loop4_counter: int = add loop4_counter loop_incr;
-  jmp .loop4_cond;
-.loop4_done:  
-  loop3_counter: int = add loop3_counter loop_incr;
-  jmp .loop3_cond;
-.loop3_done:  
-  loop2_counter: int = add loop2_counter loop_incr;
-  jmp .loop2_cond;
-.loop2_done:
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(n1: int, n2: int, n3: int, n4: int, n5: int): int {
+@main(n1: int, n2: int, n3: int, n4: int, n5: int) {
 # Pack the input elements into an array with a starting pointer
     size: int = const 5;
     array: ptr<int> = call @pack size n1 n2 n3 n4 n5;
@@ -142,9 +95,8 @@ sizei: int = sub size one;
     jmp .loopi;
 .donei:
 
-# Print array
     sum: int = call @print_array array size;
 
     free array;
-    ret sum;
+    print sum;
 }
\ No newline at end of file
diff --git a/benchmarks/passing/bril/mem/csrmv.bril b/benchmarks/passing/bril/mem/csrmv.bril
index e36ae3d77..80c404b3d 100644
--- a/benchmarks/passing/bril/mem/csrmv.bril
+++ b/benchmarks/passing/bril/mem/csrmv.bril
@@ -236,23 +236,7 @@
     ret sum;
 }
 
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: int = call @orig_main loop_counter;
-  final_output: int = add final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(n: int): int {
+@main(n: int) {
     rows: int = id n;
     cols: int = id n;
     degree: int = const 5;
@@ -288,5 +272,5 @@
     sum: int = add c sum;
     sum: int = add d sum;
     sum: int = add e sum;
-    ret sum;
+    print sum;
 }
diff --git a/benchmarks/passing/bril/mem/fib.bril b/benchmarks/passing/bril/mem/fib.bril
index b2c5f814a..80158eb03 100644
--- a/benchmarks/passing/bril/mem/fib.bril
+++ b/benchmarks/passing/bril/mem/fib.bril
@@ -1,21 +1,6 @@
 # ARGS: 12000
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: int = call @orig_main loop_counter;
-  final_output: int = add final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
 
-@orig_main(input : int): int {
+@main(input : int) {
   zero: int = const 0;
   one: int = const 1;
   neg_one: int = const -1;
@@ -45,5 +30,5 @@
   last: ptr<int> = ptradd vals i_minus_one;
   tmp: int = load last;
   free vals;
-  ret tmp;
+  print tmp;
 }
diff --git a/benchmarks/passing/bril/mem/major-elm.bril b/benchmarks/passing/bril/mem/major-elm.bril
index 048577075..0bd6b96ef 100644
--- a/benchmarks/passing/bril/mem/major-elm.bril
+++ b/benchmarks/passing/bril/mem/major-elm.bril
@@ -1,34 +1,11 @@
-# ARGS: 3300
+# ARGS: 3 2 3
 # Return the majority element (appears more than floor(n/2) times) of an array, assuming that the majority element is guaranteed to exist.
 # Inputs: an array of size 3 (fixed);
 # Output: the majority element of the array.
 # Not a big example in terms of the array size and the total dynamic instructions; but I found Boyer-Moore voting algorithm very interesting!
 # Acknowledgement: brought idea from the pack function in Pat-Lafon's binary-search.bril to my create_arr.
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  loop2_counter: int = const 10;
-.loop2_cond:
-  loop2_cond: bool = lt loop2_counter loop_bound;
-  br loop2_cond .loop2_body .loop2_done;
-.loop2_body:
-  output: int = call @orig_main loop_counter loop2_counter loop_counter;
-  final_output: int = add final_output output;
-  loop2_counter: int = add loop2_counter loop_incr;
-  jmp .loop2_cond;
-.loop2_done:
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
 
-@orig_main(e1: int, e2: int, e3: int): int {
+@main(e1: int, e2: int, e3: int) {
   arr_size: int = const 3;
 
   nums: ptr<int> = call @create_arr arr_size e1 e2 e3;
@@ -73,7 +50,7 @@
 
   .end:
     free nums;
-    ret major_elm;
+    print major_elm;
 }
 
 @create_arr(size: int, e1: int, e2: int, e3: int): ptr<int> {
diff --git a/benchmarks/passing/bril/mem/mat-mul.bril b/benchmarks/passing/bril/mem/mat-mul.bril
index 3a6407821..614c6a504 100644
--- a/benchmarks/passing/bril/mem/mat-mul.bril
+++ b/benchmarks/passing/bril/mem/mat-mul.bril
@@ -108,23 +108,7 @@
   ret;
 }
 
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: int = call @orig_main loop_counter;
-  final_output: int = add final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(size: int): int {
+@main(size: int) {
   seed: int = const 109658;
   one: int = const 1;
   rng: ptr<int> = alloc one;
@@ -145,5 +129,5 @@
   sum: int = add sum a;
   sum: int = add sum b;
   sum: int = add sum c;
-  ret sum;
+  print sum;
 }
diff --git a/benchmarks/passing/bril/mem/max-subarray.bril b/benchmarks/passing/bril/mem/max-subarray.bril
index 709d230e2..f66b3c26d 100644
--- a/benchmarks/passing/bril/mem/max-subarray.bril
+++ b/benchmarks/passing/bril/mem/max-subarray.bril
@@ -46,23 +46,7 @@
   ret n;
 }
 
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: int = call @orig_main loop_counter;
-  final_output: int = add final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(x: int): int {
+@main(x: int) {
     size: int = const 10;
     n1: int = const 1;
     n2: int = const 2;
@@ -106,5 +90,5 @@
     val: int = add max_sum x;
 
     free array;
-    ret val;
+    print val;
 }
diff --git a/benchmarks/passing/bril/mem/quickselect.bril b/benchmarks/passing/bril/mem/quickselect.bril
index 3ad44e509..d1824a24a 100644
--- a/benchmarks/passing/bril/mem/quickselect.bril
+++ b/benchmarks/passing/bril/mem/quickselect.bril
@@ -106,23 +106,8 @@
   i: int = call @quickselect array newl r newk;
   ret i;
 }
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: int = call @orig_main loop_counter;
-  final_output: int = add final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
 
-@orig_main(x: int): int {
+@main(x: int) {
   k: int = const 4;
   n1: int = const 97;
   n2: int = const 108;
@@ -138,5 +123,5 @@
   output: int = call @quickselect array zero five k;
   val: int = add output x;
   free array;
-  ret val;
+  print val;
 }
\ No newline at end of file
diff --git a/benchmarks/passing/bril/mem/quicksort.bril b/benchmarks/passing/bril/mem/quicksort.bril
index 0e9244110..1b48a6925 100644
--- a/benchmarks/passing/bril/mem/quicksort.bril
+++ b/benchmarks/passing/bril/mem/quicksort.bril
@@ -1,29 +1,11 @@
-# ARGS: 3500000
+# ARGS: 94
 # An implementation of Quicksort using the Lomuto partition scheme, adapted from the pseudocode on Wikipedia
 # Input: an array of length 6
 # Output: the input array sorted in ascending order
 # Adopted two helper functions, pack and print_array, from Jiajie Li's bubble sort benchmark
 
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-  final_output: int = const 0;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: int = call @orig_main loop_counter;
-  final_output: int = add final_output output;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(x: int): int {
+@main(n1: int) {
     size: int = const 6;
-    ninetyfour: int = const 94;
-    n1: int = add x ninetyfour;
     n2: int = const 21;
     n3: int = const 5;
     n4: int = const 6;
@@ -40,7 +22,7 @@
     sum: int = call @print_array array size;
 
     free array;
-    ret sum;
+    print sum;
 }
 
 @qsort(array : ptr<int>, l: int, r:int) {
diff --git a/benchmarks/passing/bril/mem/vsmul.bril b/benchmarks/passing/bril/mem/vsmul.bril
index abff04b42..707d29a13 100644
--- a/benchmarks/passing/bril/mem/vsmul.bril
+++ b/benchmarks/passing/bril/mem/vsmul.bril
@@ -37,26 +37,15 @@
   ret arr;
 }
 
-@main(loop_bound: int) {
-  loop_incr: int = const 1;
-  loop_counter: int = const 10;
-.loop_cond:
-  loop_cond: bool = lt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  call @orig_main loop_counter;
-  loop_counter: int = add loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-}
 
-@orig_main(size: int) {
+@main(size: int) {
   seed: int = const 2023;
   two: int = const 2;
   rng: ptr<int> = alloc seed; 
   store rng seed;
   arr: ptr<int> = call @randarray size rng; 
   i: int = const 0; 
+  val: int = const 0;
   .loop:
     cond: bool = lt i size;
     br cond .body .done;
@@ -68,4 +57,5 @@
   .done:
   free arr;
   free rng;
+  print val;
 }
\ No newline at end of file
diff --git a/benchmarks/passing/bril/mixed/cholesky.bril b/benchmarks/passing/bril/mixed/cholesky.bril
index 6ae7356d1..ee31d905c 100644
--- a/benchmarks/passing/bril/mixed/cholesky.bril
+++ b/benchmarks/passing/bril/mixed/cholesky.bril
@@ -1,4 +1,6 @@
-# ARGS: 270000
+# ARGS: 4
+# TODO size has to be 4
+
 # Cholesky decomposition algorithm
 
 # Cholesky decomposition transforms a Hermitian,
@@ -317,25 +319,8 @@
   ret;
 }
 
-@main(loop_bound: float) {
-  loop_incr: float = const 1;
-  loop_counter: float = const 10;
-  final_output: float = const 0;
-.loop_cond:
-  loop_cond: bool = flt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: float = call @orig_main loop_counter;
-  final_output: float = fadd final_output output;
-  loop_counter: float = fadd loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(x: float): float {
+@main(size: int) {
   one: int = const 1;
-  size: int = const 4;
   sqsize: int = mul size size;
   arr1: ptr<float> = call @fillarray;
   arr1_transposed: ptr<float> = call @fillarray;
@@ -345,10 +330,9 @@
   call @matmul size arr1 arr1_transposed hermitian;
   call @cholesky size hermitian res;
   sum: float = call @printarray sqsize res;
-  sum: float = fadd sum x;
   free arr1;
   free arr1_transposed;
   free hermitian;
   free res;
-  ret sum;
+  print sum;
 }
diff --git a/benchmarks/passing/bril/mixed/mat-inv.bril b/benchmarks/passing/bril/mixed/mat-inv.bril
index ae8a87771..d3f969426 100644
--- a/benchmarks/passing/bril/mixed/mat-inv.bril
+++ b/benchmarks/passing/bril/mixed/mat-inv.bril
@@ -1,4 +1,4 @@
-# ARGS: 1400000
+# ARGS: 7
 
 ## let's try to invert a 3x3 matrix lmao
 ## We'll keep an array of 9 integers
@@ -106,23 +106,8 @@
 }
 
 
-@main(loop_bound: float) {
-  loop_incr: float = const 1;
-  loop_counter: float = const 10;
-  final_output: float = const 0;
-.loop_cond:
-  loop_cond: bool = flt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: float = call @orig_main loop_counter;
-  final_output: float = fadd final_output output;
-  loop_counter: float = fadd loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
 
-@orig_main(x: float): float 
+@main(x: float) 
 {
   nine :int = const 9;
   one :int = const 1;
@@ -132,12 +117,11 @@
   three :float = const 3;
   four :float = const 4;
   five :float = const 5;
-  seven :float = const 7;
   store matrix zero;
   ptr :ptr<float> = ptradd matrix one;
   store ptr onef;
   ptr :ptr<float> = ptradd ptr one;
-  store ptr seven;
+  store ptr x;
   ptr :ptr<float> = ptradd ptr one;
   store ptr four;
   ptr :ptr<float> = ptradd ptr one;
@@ -145,7 +129,7 @@
   ptr :ptr<float> = ptradd ptr one;
   store ptr five;
   ptr :ptr<float> = ptradd ptr one;
-  store ptr seven;
+  store ptr x;
   ptr :ptr<float> = ptradd ptr one;
   store ptr four;
   ptr :ptr<float> = ptradd ptr one;
@@ -159,7 +143,7 @@
   free matrix;
   sum: float = fadd sum arr;
   sum: float = fadd sum x;
-  ret sum;
+  print sum;
 }
 
 
diff --git a/benchmarks/passing/polybench/linear-algebra/blas/gemm.bril b/benchmarks/passing/polybench/linear-algebra/blas/gemm.bril
index 28f467955..a0c505ea8 100644
--- a/benchmarks/passing/polybench/linear-algebra/blas/gemm.bril
+++ b/benchmarks/passing/polybench/linear-algebra/blas/gemm.bril
@@ -62,11 +62,12 @@
     jmp .main_i;
 .main_i_done:
 
-    call @matrix_print C NI NJ;
+    res: float = call @matrix_sum C NI NJ;
 
     free A;
     free B;
     free C;
+    print res;
 }
 
 @init(A: ptr<float>, B: ptr<float>, C: ptr<float>,
@@ -201,20 +202,22 @@
     store ptr new_val;
 }
 
-@matrix_print(mtx: ptr<float>, Nrow: int, Ncol: int) {
+@matrix_sum(mtx: ptr<float>, Nrow: int, Ncol: int): float {
     i: int = const 0;
     one: int = const 1;
     total: int = mul Nrow Ncol;
+    res: float = const 0;
 .while:
     cond: bool = lt i total;
     br cond .body .done;
 .body:
     mtx_loc: ptr<float> = ptradd mtx i;
     val: float = load mtx_loc;
-    print val;
+    res: float = fadd res val;
     i: int = add i one;
     jmp .while;
 .done:
+    ret res;
 }
 
 # Search for n % m where n and m are floats by
diff --git a/benchmarks/passing/polybench/linear-algebra/blas/gemver.bril b/benchmarks/passing/polybench/linear-algebra/blas/gemver.bril
index a1d625e14..fb38d2f08 100644
--- a/benchmarks/passing/polybench/linear-algebra/blas/gemver.bril
+++ b/benchmarks/passing/polybench/linear-algebra/blas/gemver.bril
@@ -112,7 +112,7 @@
     jmp .part4_i;
 .part4_i_done:
 
-    call @vector_print w N;
+    res: float = call @vector_sum w N;
 
     free A;
     free u1;
@@ -123,6 +123,7 @@
     free x;
     free y;
     free z;
+    print res;
 }
 
 
@@ -239,18 +240,20 @@
 
 # EXPECTS:
 #   @vector_get defined
-@vector_print(vec: ptr<float>, N: int) {
+@vector_sum(vec: ptr<float>, N: int): float {
     i: int = const 0;
     one: int = const 1;
+    res: float = const 0;
 .while:
     cond: bool = lt i N;
     br cond .body .done;
 .body:
     val: float = call @vector_get vec i;
-    print val;
+    res: float = fadd res val;
     i: int = add i one;
     jmp .while;
 .done:
+    ret res;
 }
 
 # Search for n % m where n and m are floats by
diff --git a/benchmarks/passing/polybench/linear-algebra/blas/gesummv.bril b/benchmarks/passing/polybench/linear-algebra/blas/gesummv.bril
index f1a65ee48..9c8e49083 100644
--- a/benchmarks/passing/polybench/linear-algebra/blas/gesummv.bril
+++ b/benchmarks/passing/polybench/linear-algebra/blas/gesummv.bril
@@ -62,13 +62,14 @@
     jmp .main_i;
 .main_i_done:
 
-    call @vector_print y N;
+    res: float = call @vector_sum y N;
 
     free A;
     free B;
     free tmp;
     free x;
     free y;
+    print res;
 }
 
 @init(A: ptr<float>, B: ptr<float>, x: ptr<float>,
@@ -161,18 +162,20 @@
 
 # EXPECTS:
 #   @vector_get defined
-@vector_print(vec: ptr<float>, N: int) {
+@vector_sum(vec: ptr<float>, N: int): float {
     i: int = const 0;
     one: int = const 1;
+    res: float = const 0;
 .while:
     cond: bool = lt i N;
     br cond .body .done;
 .body:
     val: float = call @vector_get vec i;
-    print val;
+    res: float = fadd res val;
     i: int = add i one;
     jmp .while;
 .done:
+    ret res;
 }
 
 # Search for n % m where n and m are floats by
diff --git a/benchmarks/passing/polybench/linear-algebra/blas/symm.bril b/benchmarks/passing/polybench/linear-algebra/blas/symm.bril
index cb9a07f75..2ca250eae 100644
--- a/benchmarks/passing/polybench/linear-algebra/blas/symm.bril
+++ b/benchmarks/passing/polybench/linear-algebra/blas/symm.bril
@@ -74,11 +74,12 @@
     jmp .main_i;
 .main_i_done:
 
-    call @matrix_print C M N;
+    res: float = call @matrix_sum C M N;
 
     free C;
     free A;
     free B;
+    print res;
 }
 
 @init(C: ptr<float>, A: ptr<float>, B: ptr<float>,
@@ -193,20 +194,22 @@
     store ptr new_val;
 }
 
-@matrix_print(mtx: ptr<float>, Nrow: int, Ncol: int) {
+@matrix_sum(mtx: ptr<float>, Nrow: int, Ncol: int): float {
     i: int = const 0;
     one: int = const 1;
     total: int = mul Nrow Ncol;
+    res: float = const 0;
 .while:
     cond: bool = lt i total;
     br cond .body .done;
 .body:
     mtx_loc: ptr<float> = ptradd mtx i;
     val: float = load mtx_loc;
-    print val;
+    res: float = fadd res val;
     i: int = add i one;
     jmp .while;
 .done:
+    ret res;
 }
 
 # Search for n % m where n and m are floats by
diff --git a/benchmarks/passing/polybench/linear-algebra/blas/syr2k.bril b/benchmarks/passing/polybench/linear-algebra/blas/syr2k.bril
index 7c35f0a9c..064bc2111 100644
--- a/benchmarks/passing/polybench/linear-algebra/blas/syr2k.bril
+++ b/benchmarks/passing/polybench/linear-algebra/blas/syr2k.bril
@@ -66,11 +66,12 @@
     jmp .main_i;
 .main_i_done:
 
-    call @matrix_print C N N;
+    res: float = call @matrix_sum C N N;
 
     free C;
     free A;
     free B;
+    print res;
 }
 
 @init(C: ptr<float>, A: ptr<float>, B: ptr<float>,
@@ -185,22 +186,25 @@
     store ptr new_val;
 }
 
-@matrix_print(mtx: ptr<float>, Nrow: int, Ncol: int) {
+@matrix_sum(mtx: ptr<float>, Nrow: int, Ncol: int): float {
     i: int = const 0;
     one: int = const 1;
     total: int = mul Nrow Ncol;
+    res: float = const 0;
 .while:
     cond: bool = lt i total;
     br cond .body .done;
 .body:
     mtx_loc: ptr<float> = ptradd mtx i;
     val: float = load mtx_loc;
-    print val;
+    res: float = fadd res val;
     i: int = add i one;
     jmp .while;
 .done:
+    ret res;
 }
 
+
 # Search for n % m where n and m are floats by
 # iteratively subtracting the largest m*2^k that
 # fits inside n. 
diff --git a/benchmarks/passing/polybench/linear-algebra/blas/syrk.bril b/benchmarks/passing/polybench/linear-algebra/blas/syrk.bril
index 9c9f637f0..1fa277b5e 100644
--- a/benchmarks/passing/polybench/linear-algebra/blas/syrk.bril
+++ b/benchmarks/passing/polybench/linear-algebra/blas/syrk.bril
@@ -1,6 +1,6 @@
 ## syr2k computes a symmetric rank k update
 
-@main {
+@main() {
     # constants
     # dimensions correspond to Polybench MEDIUM_DATASET
 
@@ -60,10 +60,11 @@
     jmp .main_i;
 .main_i_done:
 
-    call @matrix_print C N N;
+    res: float = call @matrix_sum C N N;
 
     free C;
     free A;
+    print res;
 }
 
 @init(C: ptr<float>, A: ptr<float>,
@@ -171,22 +172,25 @@
     store ptr new_val;
 }
 
-@matrix_print(mtx: ptr<float>, Nrow: int, Ncol: int) {
+@matrix_sum(mtx: ptr<float>, Nrow: int, Ncol: int): float {
     i: int = const 0;
     one: int = const 1;
     total: int = mul Nrow Ncol;
+    res: float = const 0;
 .while:
     cond: bool = lt i total;
     br cond .body .done;
 .body:
     mtx_loc: ptr<float> = ptradd mtx i;
     val: float = load mtx_loc;
-    print val;
+    res: float = fadd res val;
     i: int = add i one;
     jmp .while;
 .done:
+    ret res;
 }
 
+
 # Search for n % m where n and m are floats by
 # iteratively subtracting the largest m*2^k that
 # fits inside n. 
diff --git a/benchmarks/passing/polybench/linear-algebra/blas/trmm.bril b/benchmarks/passing/polybench/linear-algebra/blas/trmm.bril
index 95fa64e18..2bf52275e 100644
--- a/benchmarks/passing/polybench/linear-algebra/blas/trmm.bril
+++ b/benchmarks/passing/polybench/linear-algebra/blas/trmm.bril
@@ -51,10 +51,11 @@
     jmp .main_i;
 .main_i_done:
 
-    call @matrix_print B M N;
+    res: float = call @matrix_sum B M N;
 
     free A;
     free B;
+    print res;
 }
 
 @init(A: ptr<float>, B: ptr<float>,
@@ -144,22 +145,25 @@
     store ptr new_val;
 }
 
-@matrix_print(mtx: ptr<float>, Nrow: int, Ncol: int) {
+@matrix_sum(mtx: ptr<float>, Nrow: int, Ncol: int): float {
     i: int = const 0;
     one: int = const 1;
     total: int = mul Nrow Ncol;
+    res: float = const 0;
 .while:
     cond: bool = lt i total;
     br cond .body .done;
 .body:
     mtx_loc: ptr<float> = ptradd mtx i;
     val: float = load mtx_loc;
-    print val;
+    res: float = fadd res val;
     i: int = add i one;
     jmp .while;
 .done:
+    ret res;
 }
 
+
 # Search for n % m where n and m are floats by
 # iteratively subtracting the largest m*2^k that
 # fits inside n. 
diff --git a/benchmarks/passing/polybench/linear-algebra/kernels/2mm.bril b/benchmarks/passing/polybench/linear-algebra/kernels/2mm.bril
index 9feeb96b0..8f33f6250 100644
--- a/benchmarks/passing/polybench/linear-algebra/kernels/2mm.bril
+++ b/benchmarks/passing/polybench/linear-algebra/kernels/2mm.bril
@@ -93,13 +93,14 @@
     jmp .part2_i;
 .part2_i_done:
 
-    call @matrix_print D NI NL;
+    res: float = call @matrix_sum D NI NL;
 
     free A;
     free B;
     free C;
     free D;
     free tmp;
+    print res;
 }
 
 @init(A: ptr<float>, B: ptr<float>, C: ptr<float>, D: ptr<float>, 
@@ -262,22 +263,25 @@
     store ptr new_val;
 }
 
-@matrix_print(mtx: ptr<float>, Nrow: int, Ncol: int) {
+@matrix_sum(mtx: ptr<float>, Nrow: int, Ncol: int): float {
     i: int = const 0;
     one: int = const 1;
     total: int = mul Nrow Ncol;
+    res: float = const 0;
 .while:
     cond: bool = lt i total;
     br cond .body .done;
 .body:
     mtx_loc: ptr<float> = ptradd mtx i;
     val: float = load mtx_loc;
-    print val;
+    res: float = fadd res val;
     i: int = add i one;
     jmp .while;
 .done:
+    ret res;
 }
 
+
 # Search for n % m where n and m are floats by
 # iteratively subtracting the largest m*2^k that
 # fits inside n. 
diff --git a/benchmarks/passing/polybench/linear-algebra/kernels/3mm.bril b/benchmarks/passing/polybench/linear-algebra/kernels/3mm.bril
index 831c630f2..204755bb3 100644
--- a/benchmarks/passing/polybench/linear-algebra/kernels/3mm.bril
+++ b/benchmarks/passing/polybench/linear-algebra/kernels/3mm.bril
@@ -25,10 +25,6 @@
     C: ptr<float> = call @matrix_new NJ NM;
     D: ptr<float> = call @matrix_new NM NL;
     call @init A B C D NI fNI NJ fNJ NK fNK NL fNL NM fNM;
-    call @matrix_print A NI NK;
-    call @matrix_print B NK NJ;
-    call @matrix_print C NJ NM;
-    call @matrix_print D NM NL;
     # main computation
     # computes G := (A * B) * (C * D)
 
@@ -131,7 +127,7 @@
     jmp .part3_i;
 .part3_i_done:
 
-    call @matrix_print G NI NL;
+    res: float = call @matrix_sum G NI NL;
 
     free A;
     free B;
@@ -140,6 +136,7 @@
     free E;
     free F;
     free G;
+    print res;
 }
 
 @init(A: ptr<float>, B: ptr<float>, C: ptr<float>, D: ptr<float>,
@@ -299,22 +296,25 @@
     store ptr new_val;
 }
 
-@matrix_print(mtx: ptr<float>, Nrow: int, Ncol: int) {
+@matrix_sum(mtx: ptr<float>, Nrow: int, Ncol: int): float {
     i: int = const 0;
     one: int = const 1;
     total: int = mul Nrow Ncol;
+    res: float = const 0;
 .while:
     cond: bool = lt i total;
     br cond .body .done;
 .body:
     mtx_loc: ptr<float> = ptradd mtx i;
     val: float = load mtx_loc;
-    print val;
+    res: float = fadd res val;
     i: int = add i one;
     jmp .while;
 .done:
+    ret res;
 }
 
+
 # Search for n % m where n and m are floats by
 # iteratively subtracting the largest m*2^k that
 # fits inside n. 
diff --git a/benchmarks/passing/polybench/linear-algebra/kernels/atax.bril b/benchmarks/passing/polybench/linear-algebra/kernels/atax.bril
index d21410f9b..14db7080e 100644
--- a/benchmarks/passing/polybench/linear-algebra/kernels/atax.bril
+++ b/benchmarks/passing/polybench/linear-algebra/kernels/atax.bril
@@ -72,12 +72,13 @@
     jmp .main_i;
 .main_i_done:
     
-    call @vector_print y N;
+    res: float = call @vector_sum y N;
 
     free A;
     free x;
     free y;
     free tmp;
+    print res;
 }
 
 @init(A: ptr<float>, x: ptr<float>, 
@@ -174,18 +175,20 @@
 
 # EXPECTS:
 #   @vector_get defined
-@vector_print(vec: ptr<float>, N: int) {
+@vector_sum(vec: ptr<float>, N: int): float {
     i: int = const 0;
     one: int = const 1;
+    res: float = const 0;
 .while:
     cond: bool = lt i N;
     br cond .body .done;
 .body:
     val: float = call @vector_get vec i;
-    print val;
+    res: float = fadd res val;
     i: int = add i one;
     jmp .while;
 .done:
+    ret res;
 }
 
 # Search for n % m where n and m are floats by
diff --git a/benchmarks/passing/polybench/linear-algebra/kernels/bicg.bril b/benchmarks/passing/polybench/linear-algebra/kernels/bicg.bril
index ddf680811..6863c71e5 100644
--- a/benchmarks/passing/polybench/linear-algebra/kernels/bicg.bril
+++ b/benchmarks/passing/polybench/linear-algebra/kernels/bicg.bril
@@ -65,14 +65,16 @@
     jmp .main_i;
 .main_i_done:
 
-    call @vector_print s M;
-    call @vector_print q N;
+    res: float = call @vector_sum s M;
+    res2: float = call @vector_sum q N;
+    res3: float = fadd res res2;
 
     free A;
     free s;
     free q;
     free p;
     free r;
+    print res3;
 }
 
 @init(p: ptr<float>, r: ptr<float>, A: ptr<float>, 
@@ -171,18 +173,20 @@
 
 # EXPECTS:
 #   @vector_get defined
-@vector_print(vec: ptr<float>, N: int) {
+@vector_sum(vec: ptr<float>, N: int): float {
     i: int = const 0;
     one: int = const 1;
+    res: float = const 0;
 .while:
     cond: bool = lt i N;
     br cond .body .done;
 .body:
     val: float = call @vector_get vec i;
-    print val;
+    res: float = fadd res val;
     i: int = add i one;
     jmp .while;
 .done:
+    ret res;
 }
 
 # Search for n % m where n and m are floats by
diff --git a/benchmarks/passing/polybench/linear-algebra/kernels/doitgen.bril b/benchmarks/passing/polybench/linear-algebra/kernels/doitgen.bril
index b9160b00d..2ce953665 100644
--- a/benchmarks/passing/polybench/linear-algebra/kernels/doitgen.bril
+++ b/benchmarks/passing/polybench/linear-algebra/kernels/doitgen.bril
@@ -74,11 +74,12 @@
     jmp .main_r;
 .main_r_done:
 
-    call @tensor_print A NR NQ NP;
+    res: float = call @tensor_sum A NR NQ NP;
 
     free A;
     free sum;
     free C4;
+    print res;
 }
 
 
@@ -217,21 +218,23 @@
     store ptr new_val;
 }
 
-@tensor_print(tsr: ptr<float>, Ni: int, Nj: int, Nk: int) {
+@tensor_sum(tsr: ptr<float>, Ni: int, Nj: int, Nk: int): float {
     i: int = const 0;
     one: int = const 1;
     total: int = mul Ni Nj;
     total: int = mul total Nk;
+    res: float = const 0;
 .while:
     cond: bool = lt i total;
     br cond .body .done;
 .body:
     tsr_loc: ptr<float> = ptradd tsr i;
     val: float = load tsr_loc;
-    print val;
+    res: float = fadd res val;
     i: int = add i one;
     jmp .while;
 .done:
+    ret res;
 }
 
 @vector_new(N: int): ptr<float> {
diff --git a/benchmarks/passing/polybench/linear-algebra/kernels/mvt.bril b/benchmarks/passing/polybench/linear-algebra/kernels/mvt.bril
index 7fa9a7936..c27526ef4 100644
--- a/benchmarks/passing/polybench/linear-algebra/kernels/mvt.bril
+++ b/benchmarks/passing/polybench/linear-algebra/kernels/mvt.bril
@@ -63,14 +63,16 @@
     jmp .part2_i;
 .part2_i_done:
 
-    call @vector_print x1 N;
-    call @vector_print x2 N;
+    res: float = call @vector_sum x1 N;
+    res2: float = call @vector_sum x2 N;
+    res3: float = fadd res res2;
 
     free x1;
     free x2;
     free y_1;
     free y_2;
     free A;
+    print res3;
 }
 
 @init(x1: ptr<float>, x2: ptr<float>, y_1: ptr<float>, y_2: ptr<float>, A: ptr<float>,
@@ -144,18 +146,20 @@
 
 # EXPECTS:
 #   @vector_get defined
-@vector_print(vec: ptr<float>, N: int) {
+@vector_sum(vec: ptr<float>, N: int): float {
     i: int = const 0;
     one: int = const 1;
+    res: float = const 0;
 .while:
     cond: bool = lt i N;
     br cond .body .done;
 .body:
     val: float = call @vector_get vec i;
-    print val;
+    res: float = fadd res val;
     i: int = add i one;
     jmp .while;
 .done:
+    ret res;
 }
 
 @matrix_new(Nrow: int, Ncol: int): ptr<float> {
diff --git a/benchmarks/passing/polybench/linear-algebra/solvers/cholesky.bril b/benchmarks/passing/polybench/linear-algebra/solvers/cholesky.bril
index 784d49d4c..e36c7aa6b 100644
--- a/benchmarks/passing/polybench/linear-algebra/solvers/cholesky.bril
+++ b/benchmarks/passing/polybench/linear-algebra/solvers/cholesky.bril
@@ -60,9 +60,10 @@
     jmp .main_i;
 .main_i_done:
 
-    call @matrix_print A N N;
+    res: float = call @matrix_sum A N N;
 
     free A;
+    print res;
 }
 
 @init(A: ptr<float>, N: int, fN: float) {
@@ -230,20 +231,22 @@
     store ptr new_val;
 }
 
-@matrix_print(mtx: ptr<float>, Nrow: int, Ncol: int) {
+@matrix_sum(mtx: ptr<float>, Nrow: int, Ncol: int): float {
     i: int = const 0;
     one: int = const 1;
     total: int = mul Nrow Ncol;
+    res: float = const 0;
 .while:
     cond: bool = lt i total;
     br cond .body .done;
 .body:
     mtx_loc: ptr<float> = ptradd mtx i;
     val: float = load mtx_loc;
-    print val;
+    res: float = fadd res val;
     i: int = add i one;
     jmp .while;
 .done:
+    ret res;
 }
 
 # Loop Newton's method until convergence within a
diff --git a/benchmarks/passing/polybench/linear-algebra/solvers/durbin.bril b/benchmarks/passing/polybench/linear-algebra/solvers/durbin.bril
index 0ee58f13a..6fd76a457 100644
--- a/benchmarks/passing/polybench/linear-algebra/solvers/durbin.bril
+++ b/benchmarks/passing/polybench/linear-algebra/solvers/durbin.bril
@@ -90,11 +90,12 @@
     jmp .main_k;
 .main_k_done:
 
-    call @vector_print y N;
+    res: float = call @vector_sum y N;
 
     free r;
     free y;
     free z;
+    print res;
 }
 
 @init(r: ptr<float>, N: int, fN: float) {
@@ -141,16 +142,18 @@
 
 # EXPECTS:
 #   @vector_get defined
-@vector_print(vec: ptr<float>, N: int) {
+@vector_sum(vec: ptr<float>, N: int): float {
     i: int = const 0;
     one: int = const 1;
+    res: float = const 0;
 .while:
     cond: bool = lt i N;
     br cond .body .done;
 .body:
     val: float = call @vector_get vec i;
-    print val;
+    res: float = fadd res val;
     i: int = add i one;
     jmp .while;
 .done:
+    ret res;
 }
\ No newline at end of file
diff --git a/benchmarks/passing/polybench/linear-algebra/solvers/gramschmidt.bril b/benchmarks/passing/polybench/linear-algebra/solvers/gramschmidt.bril
index 67c1b139c..990a7f4ae 100644
--- a/benchmarks/passing/polybench/linear-algebra/solvers/gramschmidt.bril
+++ b/benchmarks/passing/polybench/linear-algebra/solvers/gramschmidt.bril
@@ -90,12 +90,14 @@
     jmp .main_k;
 .main_k_done:
 
-    call @matrix_print R N N;
-    call @matrix_print Q M N;
+    res: float = call @matrix_sum R N N;
+    res2: float = call @matrix_sum Q M N;
+    res3: float = fadd res res2;
 
     free A;
     free R;
     free Q;
+    print res3;
 }
 
 @init(A: ptr<float>, R: ptr<float>, Q: ptr<float>,
@@ -190,20 +192,22 @@
     store ptr new_val;
 }
 
-@matrix_print(mtx: ptr<float>, Nrow: int, Ncol: int) {
+@matrix_sum(mtx: ptr<float>, Nrow: int, Ncol: int): float {
     i: int = const 0;
     one: int = const 1;
     total: int = mul Nrow Ncol;
+    res: float = const 0;
 .while:
     cond: bool = lt i total;
     br cond .body .done;
 .body:
     mtx_loc: ptr<float> = ptradd mtx i;
     val: float = load mtx_loc;
-    print val;
+    res: float = fadd res val;
     i: int = add i one;
     jmp .while;
 .done:
+    ret res;
 }
 
 
diff --git a/benchmarks/passing/polybench/linear-algebra/solvers/lu.bril b/benchmarks/passing/polybench/linear-algebra/solvers/lu.bril
index dab2bcbd2..77d78a800 100644
--- a/benchmarks/passing/polybench/linear-algebra/solvers/lu.bril
+++ b/benchmarks/passing/polybench/linear-algebra/solvers/lu.bril
@@ -65,9 +65,10 @@
     jmp .main_i;
 .main_i_done:
 
-    call @matrix_print A N N;
+    res: float = call @matrix_sum A N N;
 
     free A;
+    print res;
 }
 
 @init(A: ptr<float>, N: int, fN: float) {
@@ -266,18 +267,20 @@
     store ptr new_val;
 }
 
-@matrix_print(mtx: ptr<float>, Nrow: int, Ncol: int) {
+@matrix_sum(mtx: ptr<float>, Nrow: int, Ncol: int): float {
     i: int = const 0;
     one: int = const 1;
     total: int = mul Nrow Ncol;
+    res: float = const 0;
 .while:
     cond: bool = lt i total;
     br cond .body .done;
 .body:
     mtx_loc: ptr<float> = ptradd mtx i;
     val: float = load mtx_loc;
-    print val;
+    res: float = fadd res val;
     i: int = add i one;
     jmp .while;
 .done:
-}
\ No newline at end of file
+    ret res;
+}
diff --git a/benchmarks/passing/polybench/linear-algebra/solvers/ludcmp.bril b/benchmarks/passing/polybench/linear-algebra/solvers/ludcmp.bril
index 1c9322fca..15c47d75b 100644
--- a/benchmarks/passing/polybench/linear-algebra/solvers/ludcmp.bril
+++ b/benchmarks/passing/polybench/linear-algebra/solvers/ludcmp.bril
@@ -122,13 +122,13 @@
     jmp .set_x_i;
 .set_x_i_done:
 
-    call @vector_print x N;
+    res:float = call @vector_sum x N;
 
     free A;
     free x;
     free y;
     free b;
-
+    print res;
 }
 
 
@@ -320,18 +320,20 @@
 
 # EXPECTS:
 #   @vector_get defined
-@vector_print(vec: ptr<float>, N: int) {
+@vector_sum(vec: ptr<float>, N: int): float {
     i: int = const 0;
     one: int = const 1;
+    res: float = const 0;
 .while:
     cond: bool = lt i N;
     br cond .body .done;
 .body:
     val: float = call @vector_get vec i;
-    print val;
+    res: float = fadd res val;
     i: int = add i one;
     jmp .while;
 .done:
+    ret res;
 }
 
 
diff --git a/benchmarks/passing/polybench/linear-algebra/solvers/trisolv.bril b/benchmarks/passing/polybench/linear-algebra/solvers/trisolv.bril
index 990aa5035..d9ab89df4 100644
--- a/benchmarks/passing/polybench/linear-algebra/solvers/trisolv.bril
+++ b/benchmarks/passing/polybench/linear-algebra/solvers/trisolv.bril
@@ -44,11 +44,12 @@
     jmp .main_i;
 .main_i_done:
 
-    call @vector_print x N;
+    res:float = call @vector_sum x N;
 
     free L;
     free x;
     free b;
+    print res;
 }
 
 @init(L: ptr<float>, b: ptr<float>, x: ptr<float>,
@@ -142,16 +143,18 @@
 
 # EXPECTS:
 #   @vector_get defined
-@vector_print(vec: ptr<float>, N: int) {
+@vector_sum(vec: ptr<float>, N: int): float {
     i: int = const 0;
     one: int = const 1;
+    res: float = const 0;
 .while:
     cond: bool = lt i N;
     br cond .body .done;
 .body:
     val: float = call @vector_get vec i;
-    print val;
+    res: float = fadd res val;
     i: int = add i one;
     jmp .while;
 .done:
+    ret res;
 }
\ No newline at end of file
diff --git a/dag_in_context/Cargo.lock b/dag_in_context/Cargo.lock
index f8f4d8a72..e0351160e 100644
--- a/dag_in_context/Cargo.lock
+++ b/dag_in_context/Cargo.lock
@@ -161,7 +161,7 @@ dependencies = [
 [[package]]
 name = "bril-rs"
 version = "0.1.0"
-source = "git+https://github.com/uwplse/bril?rev=e2be3f5#e2be3f5d7e160f02b7aed0ef2bcc3e13ae722d2b"
+source = "git+https://github.com/uwplse/bril?rev=fe255deec1533960b20fff832971e45810202a5d#fe255deec1533960b20fff832971e45810202a5d"
 dependencies = [
  "serde",
  "serde_json",
diff --git a/dag_in_context/Cargo.toml b/dag_in_context/Cargo.toml
index 0e56536be..5406f9a31 100644
--- a/dag_in_context/Cargo.toml
+++ b/dag_in_context/Cargo.toml
@@ -12,7 +12,7 @@ strum_macros = "0.25"
 main_error = "0.1.2"
 thiserror = "1.0"
 egraph-serialize = "0.2.0"
-bril-rs = { git = "https://github.com/uwplse/bril", rev = "e2be3f5" }
+bril-rs = { git = "https://github.com/uwplse/bril", rev = "fe255deec1533960b20fff832971e45810202a5d" }
 indexmap = "2.0.0"
 rustc-hash = "1.1.0"
 ordered-float = "3"
diff --git a/infra/generate_cfgs.py b/infra/generate_cfgs.py
index c1cfdee61..64d04450d 100755
--- a/infra/generate_cfgs.py
+++ b/infra/generate_cfgs.py
@@ -2,6 +2,8 @@
 import glob
 import os
 
+import concurrent.futures
+
 def make_cfgs(bench, data_dir):
   cwd = os.getcwd()
   path = f"{data_dir}/{bench}"
@@ -55,5 +57,16 @@ def make_cfgs(bench, data_dir):
       exit(1)
   data_dir = os.sys.argv[1]
   benchmarks = os.listdir(data_dir)
-  for bench in benchmarks:
-    make_cfgs(bench, data_dir)
+
+  # get the number of cores on this machine 
+  parallelism = os.cpu_count()
+  with concurrent.futures.ThreadPoolExecutor(max_workers = parallelism) as executor:
+    futures = {executor.submit(make_cfgs, bench, data_dir) for bench in benchmarks}
+
+    for future in concurrent.futures.as_completed(futures):
+      try:
+        future.result()
+      except Exception as e:
+        print(f"Shutting down executor due to error: {e}")
+        executor.shutdown(wait=False, cancel_futures=True)
+        raise e
diff --git a/infra/generate_line_counts.py b/infra/generate_line_counts.py
index c804b293e..7e10fe59b 100755
--- a/infra/generate_line_counts.py
+++ b/infra/generate_line_counts.py
@@ -4,6 +4,7 @@
 import json
 import subprocess
 import os
+import statistics 
 
 def header():
     return [
@@ -78,19 +79,36 @@ def detailed_linecount_table():
 def round_fmt(v):
     return "{:.3f}".format(round(v, 3))
 
+# given a list of integers (cycles taken for each run)
+# return the mean of the cycles
+def mean_cycles(cycles):
+    return sum(cycles) / len(cycles)
+
+# given a list of integers, return the max
+def max_cycles(cycles):
+    return max(cycles)
+
+def min_cycles(cycles):
+    return min(cycles)
+
+
+# given a list of integers, return the standard deviation
+def stddev_cycles(cycles):
+    return statistics.stdev(cycles)
+
 def get_rows_for_benchmark(bench, profile_data):
     data_for_bench = [x for x in profile_data if x["benchmark"] == bench]
     rows = []
     for (idx, entry) in enumerate(data_for_bench):
         fst_col = r'\multirow{' + str(len(data_for_bench)) + r'}{*}{' + bench.replace("_", r'\_') + r'}' if idx == 0 else ''
-        res = entry["hyperfine"]["results"][0]
+        cycles = entry["cycles"]
         row = " ".join([
             r'\multicolumn{1}{|l|}{' + fst_col + r'} &',
             r'\multicolumn{1}{l|}{' + entry["runMethod"] + r'}  &',
-            r'\multicolumn{1}{l|}{' + round_fmt(res["mean"]) + r'} &',
-            r'\multicolumn{1}{l|}{' + round_fmt(res["max"]) + r'} &',
-            r'\multicolumn{1}{l|}{' + round_fmt(res["min"]) + r'} &',
-            round_fmt(res["stddev"]) + r' \\',
+            r'\multicolumn{1}{l|}{' + round_fmt(mean_cycles(cycles)) + r'} &',
+            r'\multicolumn{1}{l|}{' + round_fmt(max_cycles(cycles)) + r'} &',
+            r'\multicolumn{1}{l|}{' + round_fmt(min_cycles(cycles)) + r'} &',
+            round_fmt(stddev_cycles(cycles)) + r' \\',
         ])
         rows.append(row)
     rows.append(r' \hline')
diff --git a/infra/localnightly.sh b/infra/localnightly.sh
index ee6b933dc..f7abec73a 100755
--- a/infra/localnightly.sh
+++ b/infra/localnightly.sh
@@ -4,4 +4,4 @@ set -x -e
 
 # pass arguments to nightly.sh
 LOCAL=1 bash infra/nightly.sh "$@"
-cd nightly/output && python3 -m http.server
\ No newline at end of file
+cd nightly/output && python3 -m http.server 8002
\ No newline at end of file
diff --git a/infra/nightly-resources/chart.js b/infra/nightly-resources/chart.js
index f49422b3e..2adddf92b 100644
--- a/infra/nightly-resources/chart.js
+++ b/infra/nightly-resources/chart.js
@@ -1,6 +1,5 @@
 const COLORS = {
   "rvsdg-round-trip-to-executable": "red",
-  "cranelift-O3": "blue",
   "llvm-O0": "purple",
   "llvm-O1": "green",
   "llvm-O2": "orange",
@@ -11,6 +10,42 @@ const COLORS = {
 
 const BASELINE_MODE = "llvm-O0";
 
+// TODO these functions (mean, median, etc.) are duplicated in generate_line_counts.py
+// we could move the computation of the latex table to js to solve this problem
+
+// Given a list of integers, compute the mean
+// number of cycles
+function mean_cycles(cycles) {
+  return cycles.reduce((a, b) => a + b, 0) / cycles.length;
+}
+
+// Median of a list of cycle counts; sorts a copy so the caller's array is not reordered
+function median_cycles(cycles) {
+  const sorted = [...cycles].sort((a, b) => a - b);
+  const mid = Math.floor(sorted.length / 2);
+  if (sorted.length % 2 === 0) {
+    return (sorted[mid - 1] + sorted[mid]) / 2;
+  }
+  return sorted[mid];
+}
+
+function max_cycles(cycles) {
+  return Math.max(...cycles);
+}
+
+function min_cycles(cycles) {
+  return Math.min(...cycles);
+}
+
+// Sample standard deviation (Bessel's correction: divide by n - 1), like Python's statistics.stdev
+function stddev_cycles(cycles) {
+  const mean = cycles.reduce((a, b) => a + b, 0) / cycles.length;
+  const squared_diffs = cycles.map((c) => (c - mean) ** 2);
+  const bessels_corrected =
+    squared_diffs.reduce((a, b) => a + b, 0) / (squared_diffs.length - 1);
+  return Math.sqrt(bessels_corrected);
+}
+
 function getEntry(benchmark, runMode) {
   const entries = GLOBAL_DATA.currentRun.filter(
     (entry) => entry.benchmark === benchmark && entry.runMethod === runMode,
@@ -28,14 +63,14 @@ function getEntry(benchmark, runMode) {
 
 function getValue(entry) {
   if (GLOBAL_DATA.chart.mode === "absolute") {
-    return entry.hyperfine.results[0].mean;
+    return mean_cycles(entry["cycles"]);
   } else if (GLOBAL_DATA.chart.mode === "speedup") {
     const baseline = getEntry(entry.benchmark, BASELINE_MODE);
     if (!baseline) {
       addWarning(`No speedup baseline for ${benchmark}`);
     }
-    const baseV = baseline.hyperfine.results[0].mean;
-    const expV = entry.hyperfine.results[0].mean;
+    const baseV = mean_cycles(baseline["cycles"]);
+    const expV = mean_cycles(entry["cycles"]);
     // If you change this, also change the displayed formula in index.html
     return baseV / expV;
   } else {
@@ -45,9 +80,31 @@ function getValue(entry) {
 
 function getError(entry) {
   if (GLOBAL_DATA.chart.mode === "absolute") {
-    return entry.hyperfine.results[0].stddev;
+    return stddev_cycles(entry["cycles"]);
   } else {
-    return undefined;
+    // Error is given using propagation of error formula for two variables
+    // f = baseV / expV
+    const baseline = getEntry(entry.benchmark, BASELINE_MODE);
+    if (!baseline) {
+      addWarning(`No speedup baseline for ${entry.benchmark}`);
+      return undefined; // no baseline means no speedup, hence no error bar
+    }
+    const baseV = mean_cycles(baseline["cycles"]);
+    const expV = mean_cycles(entry["cycles"]);
+    const baseStd = stddev_cycles(baseline["cycles"]);
+    const expStd = stddev_cycles(entry["cycles"]);
+
+    // Speedup calculation
+    const speedup = baseV / expV;
+
+    // Error propagation
+    const relativeBaseError = baseStd / baseV;
+    const relativeExpError = expStd / expV;
+
+    const speedupError =
+      speedup * Math.sqrt(relativeBaseError ** 2 + relativeExpError ** 2);
+
+    return speedupError;
   }
 }
 
diff --git a/infra/nightly-resources/data.js b/infra/nightly-resources/data.js
index 86dedd845..77d53e02f 100644
--- a/infra/nightly-resources/data.js
+++ b/infra/nightly-resources/data.js
@@ -10,7 +10,7 @@ async function fetchText(url) {
   return data;
 }
 
-function getBaselineHyperfine(benchmark, runMethod) {
+function getBaselineCycles(benchmark, runMethod) {
   const baselineData =
     GLOBAL_DATA.baselineRun?.filter((o) => o.benchmark === benchmark) || [];
   if (baselineData.length === 0) {
@@ -24,7 +24,7 @@ function getBaselineHyperfine(benchmark, runMethod) {
         `Baseline had multiple entries for ${benchmark} ${runMethod}`,
       );
     } else {
-      return baseline[0].hyperfine.results[0];
+      return baseline[0]["cycles"];
     }
   }
 }
@@ -53,24 +53,24 @@ function getBrilPathForBenchmark(benchmark) {
 
 function getDataForBenchmark(benchmark) {
   const executions = GLOBAL_DATA.currentRun
-    ?.filter((o) => o.benchmark === benchmark)
-    .map((o) => {
-      const baselineHyperfine = getBaselineHyperfine(o.benchmark, o.runMethod);
-      const hyperfine = o.hyperfine.results[0];
+    ?.filter((row) => row.benchmark === benchmark)
+    .map((row) => {
+      const baselineCycles = getBaselineCycles(row.benchmark, row.runMethod);
+      const cycles = row["cycles"];
       const rowData = {
-        runMethod: o.runMethod,
-        mean: { class: "", value: tryRound(hyperfine.mean) },
-        meanVsBaseline: diffAttribute(hyperfine, baselineHyperfine, "mean"),
-        min: { class: "", value: tryRound(hyperfine.min) },
-        minVsBaseline: diffAttribute(hyperfine, baselineHyperfine, "min"),
-        max: { class: "", value: tryRound(hyperfine.max) },
-        maxVsBaseline: diffAttribute(hyperfine, baselineHyperfine, "max"),
-        median: { class: "", value: tryRound(hyperfine.median) },
-        medianVsBaseline: diffAttribute(hyperfine, baselineHyperfine, "median"),
-        stddev: { class: "", value: tryRound(hyperfine.stddev) },
+        runMethod: row.runMethod,
+        mean: { class: "", value: tryRound(mean_cycles(cycles)) },
+        meanVsBaseline: getDifference(cycles, baselineCycles, mean_cycles),
+        min: { class: "", value: tryRound(min_cycles(cycles)) },
+        minVsBaseline: getDifference(cycles, baselineCycles, min_cycles),
+        max: { class: "", value: tryRound(max_cycles(cycles)) },
+        maxVsBaseline: getDifference(cycles, baselineCycles, max_cycles),
+        median: { class: "", value: tryRound(median_cycles(cycles)) },
+        medianVsBaseline: getDifference(cycles, baselineCycles, median_cycles),
+        stddev: { class: "", value: tryRound(stddev_cycles(cycles)) },
       };
-      if (shouldHaveLlvm(o.runMethod)) {
-        rowData.runMethod = `<a target="_blank" rel="noopener noreferrer" href="llvm.html?benchmark=${benchmark}&runmode=${o.runMethod}">${o.runMethod}</a>`;
+      if (shouldHaveLlvm(row.runMethod)) {
+        rowData.runMethod = `<a target="_blank" rel="noopener noreferrer" href="llvm.html?benchmark=${benchmark}&runmode=${row.runMethod}">${row.runMethod}</a>`;
       }
       return rowData;
     });
diff --git a/infra/nightly-resources/index.js b/infra/nightly-resources/index.js
index 18ea4fa08..d2d9e94c9 100644
--- a/infra/nightly-resources/index.js
+++ b/infra/nightly-resources/index.js
@@ -1,7 +1,6 @@
 // copied from profile.py
 const treatments = [
   "rvsdg-round-trip-to-executable",
-  "cranelift-O3",
   "llvm-O0",
   "llvm-O1",
   "llvm-O2",
diff --git a/infra/nightly-resources/utils.js b/infra/nightly-resources/utils.js
index b69926b30..3a222e5f3 100644
--- a/infra/nightly-resources/utils.js
+++ b/infra/nightly-resources/utils.js
@@ -14,13 +14,13 @@ function tryRound(v, precision) {
 
 // Outputs current_number - baseline_number in a human-readable format
 // If baseline_number is undefined, it will return N/A
-function getDifference(current, baseline) {
+function getDifference(current, baseline, comparison_func) {
   const THRESHOLD = 0.01;
   // if b is undefined, return a
   if (baseline === undefined) {
     return { class: "", value: "N/A" };
   } else {
-    var difference = current - baseline;
+    var difference = comparison_func(current) - comparison_func(baseline);
     // if the difference is negative it will already have a "-"
     var sign = difference < 0 ? "" : "+";
     var cssClass = "";
@@ -33,10 +33,3 @@ function getDifference(current, baseline) {
     return { class: cssClass, value: `${sign}${tryRound(difference)}` };
   }
 }
-
-// compare two objects at a particular attribute
-function diffAttribute(results, baseline, attribute) {
-  const current = results[attribute];
-  const baselineNum = baseline?.[attribute];
-  return getDifference(current, baselineNum);
-}
diff --git a/infra/nightly.sh b/infra/nightly.sh
index f1c27f492..44f60a81a 100755
--- a/infra/nightly.sh
+++ b/infra/nightly.sh
@@ -11,6 +11,8 @@ echo "Beginning eggcc nightly script..."
 # -x: before executing each command, print it
 # -e: exit immediately upon first error
 set -x -e
+# if anything in a pipeline fails, fail the whole pipeline
+set -o pipefail
 
 export PATH=~/.cargo/bin:$PATH
 
diff --git a/infra/output_test_egraphs.sh b/infra/output_test_egraphs.sh
index 4e75801b7..c46e32c65 100644
--- a/infra/output_test_egraphs.sh
+++ b/infra/output_test_egraphs.sh
@@ -10,8 +10,6 @@ fi
 # make serialized directory
 mkdir -p ./serialized
 
-# bench will benchmark a single bril file, outputting hyperfine contents to ./tmp/bench/<PROFILE_NAME>.json
-# and will output the number of instructions it executed to ./tmp/bench/<PROFILE_NAME>.profile
 bench() {
     echo "json for $1"
     # store just the file name to a variable
diff --git a/infra/profile.py b/infra/profile.py
index 27bd87052..5b262f95e 100755
--- a/infra/profile.py
+++ b/infra/profile.py
@@ -11,7 +11,7 @@
 
 treatments = [
   "rvsdg-round-trip-to-executable",
-  "cranelift-O3",
+  #"cranelift-O3", currently disabled since it doesn't support measuring cycles yet
   "llvm-O0",
   "llvm-O1",
   "llvm-O2",
@@ -64,25 +64,23 @@ def benchmark_profile_dir(name):
 
 def setup_benchmark(name):
   profile_dir = benchmark_profile_dir(name)
-  try:
-    os.mkdir(profile_dir)
-  except FileExistsError:
-    print(f'{profile_dir} exists, overwriting contents')
+  os.mkdir(profile_dir)
 
 def optimize(benchmark):
   print(f'[{benchmark.index}/{benchmark.total}] Optimizing {benchmark.name} with {benchmark.treatment}')
   profile_dir = benchmark_profile_dir(benchmark.name)
   cmd = f'cargo run --release {benchmark.path} {get_eggcc_options(benchmark.treatment, benchmark.name)} -o {profile_dir}/{benchmark.treatment}'
-  print(f'Running: {cmd}')
+  print(f'Running: {cmd}', flush=True)
   start = time.time()
-  subprocess.call(cmd, shell=True)
+  process = subprocess.run(cmd, shell=True)
+  process.check_returncode()
   end = time.time()
   return (f"{profile_dir}/{benchmark.treatment}", end-start)
 
 
 
 def bench(benchmark):
-  print(f'[{benchmark.index}/{benchmark.total}] Benchmarking {benchmark.name} with {benchmark.treatment}')
+  print(f'[{benchmark.index}/{benchmark.total}] Benchmarking {benchmark.name} with {benchmark.treatment}', flush=True)
   profile_dir = benchmark_profile_dir(benchmark.name)
 
   with open(f'{profile_dir}/{benchmark.treatment}-args') as f:
@@ -95,10 +93,26 @@ def bench(benchmark):
         #f.write(f'ERROR: No executable found for {name} in {benchmark.path}\n')
       return None
     else:
-      # TODO for final nightly results, remove `--max-runs 2` and let hyperfine find stable results
-      cmd = f'hyperfine --style none --warmup 1 --max-runs 2 --export-json /dev/stdout "{profile_dir}/{benchmark.treatment}{" " + args if len(args) > 0 else ""}"'
-      result = subprocess.run(cmd, capture_output=True, shell=True)
-      return (f'{profile_dir}/{benchmark.treatment}', json.loads(result.stdout))
+      # hyperfine command for measuring time, unused in favor of cycles
+      # cmd = f'hyperfine --style none --warmup 1 --max-runs 2 --export-json /dev/stdout "{profile_dir}/{benchmark.treatment}{" " + args if len(args) > 0 else ""}"'
+      time_per_benchmark = 1.0
+      resulting_num_cycles = []
+      time_start = time.time()
+      while True:
+        args_str = " " + args if len(args) > 0 else ""
+        cmd = f'{profile_dir}/{benchmark.treatment}{args_str}'
+        result = subprocess.run(cmd, capture_output=True, shell=True)
+        
+        if result.returncode != 0:
+          raise Exception(f'Error running {benchmark.name} with {benchmark.treatment}: {result.stderr}')
+        res_cycles = int(result.stderr)
+        resulting_num_cycles.append(res_cycles)
+
+        # if we have run for at least 1 second and we have at least 2 samples, stop
+        if time.time() - time_start > time_per_benchmark and len(resulting_num_cycles) >= 2:
+          break
+
+      return (f'{profile_dir}/{benchmark.treatment}', resulting_num_cycles)
 
 # Run modes that we expect to output llvm IR
 def should_have_llvm_ir(runMethod):
@@ -117,7 +131,7 @@ def aggregate(compile_times, bench_times, benchmark_metadata):
     for path in sorted(compile_times.keys()):
       name = path.split("/")[-2]
       runMethod = path.split("/")[-1]
-      result = {"runMethod": runMethod, "benchmark": name, "hyperfine": bench_times[path], "compileTime": compile_times[path], "metadata": benchmark_metadata[name]}
+      result = {"runMethod": runMethod, "benchmark": name, "cycles": bench_times[path], "compileTime": compile_times[path], "metadata": benchmark_metadata[name]}
 
       res.append(result)
     return res
@@ -137,7 +151,10 @@ def is_looped(bril_file):
   try:
     os.mkdir(TMP_DIR)
   except FileExistsError:
+    print(f"{TMP_DIR} exists, deleting contents")
+    print(f"{TMP_DIR} exits, deleting contents")
+    # remove the files in the directory
+    os.system(f"rm -rf {TMP_DIR}/*")
+
 
   bril_dir, DATA_DIR = os.sys.argv[1:]
   profiles = []
@@ -161,17 +178,26 @@ def is_looped(bril_file):
       to_run.append(Benchmark(benchmark_path, treatment, index, total))
       index += 1
 
-  for benchmark in to_run:
-    setup_benchmark(benchmark.name)
+  benchmark_names = set([benchmark.name for benchmark in to_run])
+  for benchmark_name in benchmark_names:
+    setup_benchmark(benchmark_name)
   
 
   compile_times = {}
+  # get the number of cores on this machine 
+  parallelism = os.cpu_count()
+
   # create a thread pool for running optimization
-  with concurrent.futures.ThreadPoolExecutor(max_workers = 6) as executor:
+  with concurrent.futures.ThreadPoolExecutor(max_workers = parallelism) as executor:
     futures = {executor.submit(optimize, benchmark) for benchmark in to_run}
     for future in concurrent.futures.as_completed(futures):
-      (path, compile_time) = future.result()
-      compile_times[path] = compile_time
+      try:
+        (path, compile_time) = future.result()
+        compile_times[path] = compile_time
+      except Exception as e:
+        print(f"Shutting down executor due to error: {e}")
+        executor.shutdown(wait=False, cancel_futures=True)
+        raise e
 
   # running benchmarks sequentially for more reliable results
   # can set this to true for testing
@@ -180,14 +206,19 @@ def is_looped(bril_file):
   bench_data = {}
   if isParallelBenchmark:
     # create a thread pool for running benchmarks
-    with concurrent.futures.ThreadPoolExecutor(max_workers = 6) as executor:
+    with concurrent.futures.ThreadPoolExecutor(max_workers = parallelism) as executor:
       futures = {executor.submit(bench, benchmark) for benchmark in to_run}
       for future in concurrent.futures.as_completed(futures):
-        res = future.result()
-        if res is None:
-          continue
-        (path, _bench_data) = res
-        bench_data[path] = _bench_data
+        try:
+          res = future.result()
+          if res is None:
+            continue
+          (path, _bench_data) = res
+          bench_data[path] = _bench_data
+        except Exception as e:
+          print(f"Shutting down executor due to error: {e}")
+          executor.shutdown(wait=False, cancel_futures=True)
+          raise e
   else:
     for benchmark in to_run:
       res = bench(benchmark)
diff --git a/runtime/install.sh b/runtime/install.sh
index 279a2ba5a..e36bc1239 100755
--- a/runtime/install.sh
+++ b/runtime/install.sh
@@ -1,14 +1,16 @@
 #!/bin/bash
 
+echo "Building runtime.bc and rt.o"
+
 set -e
 
 # remove rt.bc if it exists
-if [ -f brillvm/rt.bc ]; then
-    rm brillvm/rt.bc
+if [ -f runtime/rt.bc ]; then
+    rm runtime/rt.bc
 fi
 
-if [ -f brillvm/rt.bc ]; then
-    rm brillvm/rt.o
+if [ -f runtime/rt.o ]; then
+    rm runtime/rt.o
 fi
 
 cd runtime
diff --git a/runtime/rt.c b/runtime/rt.c
index 15dd03fc0..c4d936566 100644
--- a/runtime/rt.c
+++ b/runtime/rt.c
@@ -1,3 +1,5 @@
+// the c runtime is used for cranelift, while the rust one is used for llvm
+
 #include <stdio.h>
 #include <stdint.h>
 #include <inttypes.h>
diff --git a/runtime/src/main.rs b/runtime/src/main.rs
index 00147b3ff..72fa4ea06 100644
--- a/runtime/src/main.rs
+++ b/runtime/src/main.rs
@@ -2,12 +2,76 @@
 use std::alloc::{alloc, dealloc};
 use std::convert::TryInto;
 use std::mem::size_of; */
-#![no_std]
 #![no_main]
+#![no_std]
 
 use core::ffi::{c_char, CStr};
 
-use libc_print::std_name::{print, println};
+use core::arch::asm;
+use libc_print::std_name::{eprintln, print, println};
+
+// code for tick counter from
+// tick_counter = "0.4.5"
+
+// HACK: Some tool in our toolchain is dropping get_ticks_start
+// on aarch64 because it is identical to get_ticks_end.
+// So on aarch we call it something different
+#[no_mangle]
+#[inline(never)]
+#[cfg(target_arch = "aarch64")]
+pub extern "C" fn _bril_get_ticks() -> u64 {
+    let tick_counter: u64;
+    unsafe {
+        asm!(
+            "mrs x0, cntvct_el0",
+            out("x0") tick_counter
+        );
+    }
+    tick_counter
+}
+
+#[no_mangle]
+#[inline(never)]
+#[cfg(target_arch = "x86_64")]
+pub extern "C" fn _bril_get_ticks_start() -> u64 {
+    let res: u64;
+    unsafe {
+        asm!(
+            "mfence",
+            "lfence",
+            "rdtsc", // writes the counter to EDX:EAX (upper halves of rdx/rax are zeroed)
+            "shl rdx, 32",
+            "or rax, rdx", // combine high and low halves into one 64-bit value in rax
+            out("rax") res, // rdtsc always targets rax, so it must be the declared output
+            out("rdx") _,
+        );
+    }
+    res
+}
+
+#[no_mangle]
+#[inline(never)]
+#[cfg(target_arch = "x86_64")]
+pub extern "C" fn _bril_get_ticks_end() -> u64 {
+    let res: u64;
+    unsafe {
+        asm!(
+            "rdtsc", // writes the counter to EDX:EAX (upper halves of rdx/rax are zeroed)
+            "lfence",
+            "shl rdx, 32",
+            "or rax, rdx", // combine high and low halves into one 64-bit value in rax
+            out("rax") res, // rdtsc always targets rax, so it must be the declared output
+            out("rdx") _,
+        );
+    }
+    res
+}
+
+#[no_mangle]
+#[inline(never)]
+pub extern "C" fn _bril_eprintln_unsigned_int(i: u64) {
+    eprintln!("{}", i);
+}
 
 #[no_mangle]
 #[inline(never)]
diff --git a/src/lib.rs b/src/lib.rs
index f32f3a9b7..1989171b3 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -88,15 +88,15 @@ impl Optimizer {
     }
 
     /// Interpret a program in an `Interpretable` IR.
-    /// Returns the printed output of the program.
+    /// Returns the printed output of the program and optionally the cycles taken to run the program.
     /// The program should not return a value.
     pub fn interp(
         program: &Interpretable,
         args: Vec<String>,
         profile_out: Option<PathBuf>,
-    ) -> String {
+    ) -> (String, Option<u64>) {
         match program {
-            Interpretable::Bril(program) => Self::interp_bril(program, args, profile_out),
+            Interpretable::Bril(program) => (Self::interp_bril(program, args, profile_out), None),
             Interpretable::TreeProgram(program) => {
                 let mut parsed = Self::parse_arguments(args);
                 // add the state value to the end
@@ -107,7 +107,22 @@ impl Optimizer {
                 for line in printed.iter_mut() {
                     line.push('\n');
                 }
-                printed.join("")
+                (printed.join(""), None)
+            }
+            Interpretable::CycleMeasuringExecutable { executable } => {
+                let output = std::process::Command::new(
+                    std::path::Path::new(executable).canonicalize().unwrap(),
+                )
+                .args(args)
+                .output()
+                .unwrap();
+                let output_str = String::from_utf8(output.stdout).unwrap();
+                let output_err = String::from_utf8(output.stderr).unwrap();
+                let error_code = output.status.code().unwrap();
+                if error_code != 0 {
+                    panic!("Error code: {}", error_code);
+                }
+                (output_str, Some(output_err.trim().parse().unwrap()))
             }
             Interpretable::Executable { executable } => {
                 let output = std::process::Command::new(
@@ -118,7 +133,7 @@ impl Optimizer {
                 .unwrap()
                 .stdout;
 
-                String::from_utf8(output).unwrap()
+                (String::from_utf8(output).unwrap(), None)
             }
         }
     }
diff --git a/src/main.rs b/src/main.rs
index b31fd62f0..4e09b9aa9 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -26,10 +26,7 @@ struct Args {
     /// (only used when interpreting)
     bril_args: Vec<String>,
 
-    /// Where to put the executable (only for the CompileBrilfit run mode)
-    /// If not provided, the executable will be in a file with the same prefix as the
-    /// input file, but with no file extension. That is, if `abc.bril` is passed in,
-    /// then the executable will be in `abc`.
+    /// Where to put the executable (only for the brilift and llvm modes)
     #[clap(short)]
     output_path: Option<String>,
     /// Where to put intermediary files (only for OptimizeBrilLLVM mode)
@@ -89,6 +86,7 @@ fn main() {
         optimize_egglog: args.optimize_egglog,
         optimize_brilift: args.optimize_brilift,
         optimize_bril_llvm: args.optimize_bril_llvm,
+        add_timing: false,
     };
 
     let result = match run.run() {
@@ -101,6 +99,9 @@ fn main() {
     if args.interp {
         // just print out the result of interpreting the program
         println!("{}", result.result_interpreted.unwrap());
+        if let Some(cycles_taken) = result.cycles_taken {
+            eprintln!("{}", cycles_taken);
+        }
     } else if let &[visualization] = &result.visualizations.as_slice() {
         // when there is just one visualization, print it out without
         // the "visualization of: {}" header for convenience
diff --git a/src/util.rs b/src/util.rs
index 4bfb19c2e..352e95741 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -346,6 +346,7 @@ pub struct Run {
     pub optimize_egglog: Option<bool>,
     pub optimize_brilift: Option<bool>,
     pub optimize_bril_llvm: Option<LLVMOptLevel>,
+    pub add_timing: bool,
 }
 
 /// an enum of IRs that can be interpreted
@@ -353,7 +354,14 @@ pub struct Run {
 pub enum Interpretable {
     Bril(Program),
     TreeProgram(TreeProgram),
-    Executable { executable: String },
+    /// An executable that also prints the number of cycles it took to run to stderr (llvm)
+    CycleMeasuringExecutable {
+        executable: String,
+    },
+    /// An executable that doesn't measure number of cycles (cranelift)
+    Executable {
+        executable: String,
+    },
 }
 
 /// Some sort of visualization of the result, with a name
@@ -373,6 +381,7 @@ pub struct RunOutput {
     // if the result was interpreted, the stdout of interpreting it
     pub result_interpreted: Option<String>,
     pub original_interpreted: Option<String>,
+    pub cycles_taken: Option<u64>,
 }
 
 impl Run {
@@ -404,6 +413,7 @@ impl Run {
             optimize_egglog: None,
             optimize_brilift: Some(optimize_brilift),
             optimize_bril_llvm: None,
+            add_timing: false,
         }
     }
 
@@ -420,6 +430,7 @@ impl Run {
             optimize_egglog: None,
             optimize_brilift: None,
             optimize_bril_llvm: None,
+            add_timing: false,
         }
     }
 
@@ -449,6 +460,7 @@ impl Run {
                 optimize_egglog: None,
                 optimize_brilift: None,
                 optimize_bril_llvm: None,
+                add_timing: false,
             };
             if test_type.produces_interpretable() {
                 let interp = Run {
@@ -482,6 +494,7 @@ impl Run {
                         optimize_egglog: Some(optimize_egglog),
                         optimize_brilift: None,
                         optimize_bril_llvm: Some(optimize_llvm),
+                        add_timing: false,
                     });
                 }
             }
@@ -520,11 +533,14 @@ impl Run {
             ))
         } else if self.interp == InterpMode::InterpFast {
             let interpretable = self.run_brilift(self.prog_with_args.program.clone(), true);
-            let res = Some(Optimizer::interp(
-                interpretable.as_ref().unwrap(),
-                self.prog_with_args.args.clone(),
-                None,
-            ));
+            let res = Some(
+                Optimizer::interp(
+                    interpretable.as_ref().unwrap(),
+                    self.prog_with_args.args.clone(),
+                    None,
+                )
+                .0,
+            );
 
             // clean up binary
             if let Interpretable::Executable { executable } = interpretable.unwrap() {
@@ -585,7 +601,8 @@ impl Run {
                 let rvsdg = Optimizer::program_to_rvsdg(&self.prog_with_args.program)?;
                 let cfg = rvsdg.to_cfg();
                 let bril = cfg.to_bril();
-                let interpretable = self.run_bril_llvm(bril, false, LLVMOptLevel::O0)?;
+                let interpretable =
+                    self.run_bril_llvm(bril, false, LLVMOptLevel::O0, self.add_timing)?;
                 (vec![], Some(interpretable))
             }
             RunMode::DagToRvsdg => {
@@ -782,6 +799,7 @@ impl Run {
                     self.prog_with_args.program.clone(),
                     optimize_egglog,
                     optimize_brillvm,
+                    self.add_timing,
                 )?;
                 (vec![], Some(interpretable))
             }
@@ -807,8 +825,12 @@ impl Run {
                     };
 
                     for optimize_llvm in [LLVMOptLevel::O0, LLVMOptLevel::O3] {
-                        let interpretable =
-                            self.run_bril_llvm(resulting_bril.clone(), false, optimize_llvm)?;
+                        let interpretable = self.run_bril_llvm(
+                            resulting_bril.clone(),
+                            false,
+                            optimize_llvm,
+                            self.add_timing,
+                        )?;
                         let new_interpreted = Optimizer::interp(
                             &interpretable,
                             self.prog_with_args.args.clone(),
@@ -850,11 +872,13 @@ impl Run {
 
             res
         };
+        let cycles_taken = result_interpreted.as_ref().map(|val| val.1).unwrap_or(None);
 
         Ok(RunOutput {
             visualizations,
-            result_interpreted,
+            result_interpreted: result_interpreted.map(|val| val.0),
             original_interpreted,
+            cycles_taken,
         })
     }
 
@@ -930,6 +954,7 @@ impl Run {
         input_prog: Program,
         optimize_egglog: bool,
         llvm_level: LLVMOptLevel,
+        add_timing: bool,
     ) -> Result<Interpretable, EggCCError> {
         // Make a unique name for this test running bril llvm
         // so we don't have conflicts in /tmp
@@ -951,6 +976,7 @@ impl Run {
             args: vec![],
             program: Some(String::from_utf8(buf).unwrap()),
             interpreter: false,
+            add_timing,
         });
 
         let init_ll_name = format!("{}-init.ll", self.name());
@@ -1030,7 +1056,11 @@ impl Run {
             self.prog_with_args.args.join(" "),
         );
 
-        Ok(Interpretable::Executable { executable })
+        if add_timing {
+            Ok(Interpretable::CycleMeasuringExecutable { executable })
+        } else {
+            Ok(Interpretable::Executable { executable })
+        }
     }
 }
 
diff --git a/tests/files.rs b/tests/files.rs
index 8469180c8..c05fc3670 100644
--- a/tests/files.rs
+++ b/tests/files.rs
@@ -5,9 +5,8 @@ use insta::assert_snapshot;
 use libtest_mimic::Trial;
 
 /// Generate tests for all configurations of a given file
-/// If `just_brilift` is true, only generate tests that
-/// run the full pipeline with brilift
-fn generate_tests(glob: &str, benchmark_mode: bool) -> Vec<Trial> {
+/// If `slow_test` is true, the file is too slow to run through the interpreter, so only the benchmark configuration is generated.
+fn generate_tests(glob: &str, slow_test: bool) -> Vec<Trial> {
     let mut trials = vec![];
 
     let mut mk_trial = |run: Run, snapshot: bool| {
@@ -47,7 +46,7 @@ fn generate_tests(glob: &str, benchmark_mode: bool) -> Vec<Trial> {
 
         let snapshot = f.to_str().unwrap().contains("small");
 
-        let configurations = if benchmark_mode {
+        let configurations = if slow_test {
             // in benchmark mode, run a special test pipeline that only runs
             // a few modes, and shares intermediate results
             vec![Run::test_benchmark_config(TestProgram::BrilFile(f.clone()))]
@@ -66,6 +65,7 @@ fn generate_tests(glob: &str, benchmark_mode: bool) -> Vec<Trial> {
 fn main() {
     let args = libtest_mimic::Arguments::from_args();
     let mut tests = generate_tests("tests/passing/**/*.bril", false);
+    tests.extend(generate_tests("tests/slow/**/*.bril", true));
     // also generate tests for benchmarks
     tests.extend(generate_tests("benchmarks/passing/**/*.bril", true));
 
diff --git a/tests/passing/brils/core/rectangles-area-difference.bril b/tests/passing/brils/core/rectangles-area-difference.bril
new file mode 100644
index 000000000..0bf4cc6bd
--- /dev/null
+++ b/tests/passing/brils/core/rectangles-area-difference.bril
@@ -0,0 +1,18 @@
+# ARGS: 106 233 323 233
+@main (x1 : int, y1: int, x2: int, y2 : int) {
+      a1: int = call @area x1 y1;
+      a2: int = call @area x2 y2;
+      res: int = sub a1 a2;
+      a1_bigger: bool = gt a1 a2;
+      br a1_bigger .end .flip;
+.flip:
+      neg1: int = const -1;
+      res: int = mul res neg1;
+.end:
+      print res;
+}
+
+@area (x : int, y : int) : int {
+      area: int = mul x y;
+      ret area;
+}
diff --git a/benchmarks/passing/bril/float/pow.bril b/tests/passing/brils/float/pow.bril
similarity index 56%
rename from benchmarks/passing/bril/float/pow.bril
rename to tests/passing/brils/float/pow.bril
index 9c15538cf..8630c42c3 100644
--- a/benchmarks/passing/bril/float/pow.bril
+++ b/tests/passing/brils/float/pow.bril
@@ -1,31 +1,15 @@
-# ARGS: 43000000
+# ARGS: 4300.0
 
-@main(loop_bound: float) {
-  loop_incr: float = const 1;
-  loop_counter: float = const 10;
-  final_output: float = const 0;
-.loop_cond:
-  loop_cond: bool = flt loop_counter loop_bound;
-  br loop_cond .loop_body .loop_done;
-.loop_body:
-  output: float = call @orig_main loop_counter;
-  final_output: float = fadd final_output output;
-  loop_counter: float = fadd loop_counter loop_incr;
-  jmp .loop_cond;
-.loop_done:
-  print final_output;
-}
-
-@orig_main(v0: float): float {
+@main(v0: float) {
   x: float = id v0;
   v1: int = const 2;
   n: int = id v1;
   v2: float = id x;
   v3: int = id n;
   output: float = call @pow v2 v3;
-  v4: int = const 0;
-  ret output;
+  print output;
 }
+
 @pow(x: float, n: int): float {
   v0: float = id x;
   res: float = id v0;
diff --git a/tests/slow/polybench/README.md b/tests/slow/polybench/README.md
new file mode 100644
index 000000000..89f14af44
--- /dev/null
+++ b/tests/slow/polybench/README.md
@@ -0,0 +1,3 @@
+# Polybench/Bril
+
+A port of Polybench/C 4.2.1 benchmarks to Bril. Original source [here](https://github.com/MatthiasJReisinger/PolyBenchC-4.2.1).
\ No newline at end of file
diff --git a/tests/slow/polybench/linear-algebra/blas/gemm-test.bril b/tests/slow/polybench/linear-algebra/blas/gemm-test.bril
new file mode 100644
index 000000000..28f467955
--- /dev/null
+++ b/tests/slow/polybench/linear-algebra/blas/gemm-test.bril
@@ -0,0 +1,248 @@
+## gemm computes C := alpha * A * B + beta * C for
+## some procedurally generated matrices A, B, C.
+
+@main {
+    # constants
+    # dimensions correspond to Polybench MEDIUM_DATASET
+
+    NI: int = const 200;
+    fNI: float = const 200;
+    NJ: int = const 220;
+    fNJ: float = const 220;
+    NK: int = const 240;
+    fNK: float = const 240;
+
+    one: int = const 1;
+
+    # initialize arrays
+    A: ptr<float> = call @matrix_new NI NK;
+    B: ptr<float> = call @matrix_new NK NJ;
+    C: ptr<float> = call @matrix_new NI NJ;
+    alpha: float = const 1.5;
+    beta: float = const 1.2;
+    call @init A B C NI fNI NJ fNJ NK fNK;
+
+    # main computation
+    i: int = const 0;
+.main_i:
+    cond: bool = lt i NI;
+    br cond .main_i_body .main_i_done;
+.main_i_body:
+    j: int = const 0;
+.main_j:
+    cond: bool = lt j NJ;
+    br cond .main_j_body .main_j_done;
+.main_j_body:
+    call @matrix_scale C i j NJ beta;
+    j: int = add j one;
+    jmp .main_j;
+.main_j_done:
+    k: int = const 0;
+.main_k:
+    cond: bool = lt k NK;
+    br cond .main_k_body .main_k_done;
+.main_k_body:
+    j: int = const 0;
+.inner_j:
+    cond: bool = lt j NJ;
+    br cond .inner_j_body .inner_j_done;
+.inner_j_body:
+    Aik: float = call @matrix_get A i k NK;
+    Bkj: float = call @matrix_get B k j NJ;
+    incr: float = fmul alpha Aik;
+    incr: float = fmul incr Bkj;
+    call @matrix_incr C i j NJ incr;
+    j: int = add j one;
+    jmp .inner_j;
+.inner_j_done:
+    k: int = add k one;
+    jmp .main_k;
+.main_k_done:
+    i: int = add i one;
+    jmp .main_i;
+.main_i_done:
+
+    call @matrix_print C NI NJ;
+
+    free A;
+    free B;
+    free C;
+}
+
+@init(A: ptr<float>, B: ptr<float>, C: ptr<float>,
+      NI: int, fNI: float, NJ: int, fNJ: float, NK: int, fNK: float) {
+    one: int = const 1;
+    fone: float = const 1;
+    ftwo: float = const 2;
+
+    i: int = const 0;
+    fi: float = const 0;
+.init_C_i:
+    cond: bool = lt i NI;
+    br cond .init_C_i_body .init_C_i_done;
+.init_C_i_body:
+    j: int = const 0;
+    fj: float = const 0;
+.init_C_j:
+    cond: bool = lt j NJ;
+    br cond .init_C_j_body .init_C_j_done;
+.init_C_j_body:
+    val: float = fmul fi fj;
+    val: float = fadd fone val;
+    val: float = call @fmod val fNI;
+    val: float = fdiv val fNI;
+    call @matrix_set C i j NJ val;
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_C_j;
+.init_C_j_done:
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_C_i;
+.init_C_i_done:
+
+    i: int = const 0;
+    fi: float = const 0;
+.init_A_i:
+    cond: bool = lt i NI;
+    br cond .init_A_i_body .init_A_i_done;
+.init_A_i_body:
+    j: int = const 0;
+    fj: float = const 0;
+.init_A_j:
+    cond: bool = lt j NK;
+    br cond .init_A_j_body .init_A_j_done;
+.init_A_j_body:
+    val: float = fadd fj fone;
+    val: float = fmul fi val;
+    val: float = call @fmod val fNK;
+    val: float = fdiv val fNK;
+    call @matrix_set A i j NK val;
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_A_j;
+.init_A_j_done:
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_A_i;
+.init_A_i_done:
+
+    i: int = const 0;
+    fi: float = const 0;
+.init_B_i:
+    cond: bool = lt i NK;
+    br cond .init_B_i_body .init_B_i_done;
+.init_B_i_body:
+    j: int = const 0;
+    fj: float = const 0;
+.init_B_j:
+    cond: bool = lt j NJ;
+    br cond .init_B_j_body .init_B_j_done;
+.init_B_j_body:
+    val: float = fadd fj ftwo;
+    val: float = fmul fi val;
+    val: float = call @fmod val fNJ;
+    val: float = fdiv val fNJ;
+    call @matrix_set B i j NJ val;
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_B_j;
+.init_B_j_done:
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_B_i;
+.init_B_i_done:
+}
+
+
+@matrix_new(Nrow: int, Ncol: int): ptr<float> {
+    total: int = mul Nrow Ncol;
+    ptr: ptr<float> = alloc total;
+    ret ptr;
+}
+
+@matrix_loc(mtx: ptr<float>, row: int, col: int, Ncol: int): ptr<float> {
+    row_offset: int = mul row Ncol;
+    offset: int = add row_offset col;
+    new_ptr: ptr<float> = ptradd mtx offset;
+    ret new_ptr;
+}
+
+# EXPECTS:
+#   @matrix_loc defined
+@matrix_get(mtx: ptr<float>, row: int, col: int, Ncol: int): float {
+    ptr: ptr<float> = call @matrix_loc mtx row col Ncol;
+    val: float = load ptr;
+    ret val;
+}
+
+# EXPECTS:
+#   @matrix_loc defined
+@matrix_set(mtx: ptr<float>, row: int, col: int, Ncol: int, val: float) {
+    ptr: ptr<float> = call @matrix_loc mtx row col Ncol;
+    store ptr val;
+}
+
+# EXPECTS:
+#   @matrix_loc defined
+@matrix_incr(mtx: ptr<float>, row: int, col: int, Ncol: int, incr: float) {
+    ptr: ptr<float> = call @matrix_loc mtx row col Ncol;
+    val: float = load ptr;
+    new_val: float = fadd val incr;
+    store ptr new_val;
+}
+
+# EXPECTS:
+#   @matrix_loc defined
+@matrix_scale(mtx: ptr<float>, row: int, col: int, Ncol: int, scale: float) {
+    ptr: ptr<float> = call @matrix_loc mtx row col Ncol;
+    val: float = load ptr;
+    new_val: float = fmul val scale;
+    store ptr new_val;
+}
+
+@matrix_print(mtx: ptr<float>, Nrow: int, Ncol: int) {
+    i: int = const 0;
+    one: int = const 1;
+    total: int = mul Nrow Ncol;
+.while:
+    cond: bool = lt i total;
+    br cond .body .done;
+.body:
+    mtx_loc: ptr<float> = ptradd mtx i;
+    val: float = load mtx_loc;
+    print val;
+    i: int = add i one;
+    jmp .while;
+.done:
+}
+
+# Compute n % m, where n and m are floats, by
+# repeatedly subtracting the largest m*2^k that
+# still fits inside the running remainder.
+# Takes O((log(n/m))^2) time; assumes n >= 0 and m > 0.
+# NOTE: In C, this can be done with a built-in
+# function, but this is the best we can do.
+@fmod(n: float, m: float): float {
+    zero: float = const 0;
+    two: float = const 2;
+    rem: float = id n;
+.while:
+    cond: bool = fge rem m;
+    br cond .body .done;
+.body:
+    decr: float = id m;
+.while_inner:
+    diff: float = fsub rem decr;
+    cond: bool = fge diff zero;
+    br cond .body_inner .done_inner;
+.body_inner:
+    decr: float = fmul decr two;
+    jmp .while_inner;
+.done_inner:
+    decr: float = fdiv decr two;
+    rem: float = fsub rem decr;
+    jmp .while;
+.done:
+    ret rem;
+}
\ No newline at end of file
diff --git a/tests/slow/polybench/linear-algebra/blas/gemver-test.bril b/tests/slow/polybench/linear-algebra/blas/gemver-test.bril
new file mode 100644
index 000000000..a1d625e14
--- /dev/null
+++ b/tests/slow/polybench/linear-algebra/blas/gemver-test.bril
@@ -0,0 +1,285 @@
+## gemver computes A += u1*v1^T + u2*v2^T, x = beta*A^T*y + z, and
+## w = alpha*A*x for procedurally generated inputs, then prints w.
+
+@main {
+    # constants
+    # dimensions correspond to Polybench MEDIUM_DATASET
+
+    N: int = const 400;
+    fN: float = const 400;
+
+    one: int = const 1;
+
+    # initialize arrays
+    A: ptr<float> = call @matrix_new N N;
+    u1: ptr<float> = call @vector_new N;
+    v1: ptr<float> = call @vector_new N;
+    u2: ptr<float> = call @vector_new N;
+    v2: ptr<float> = call @vector_new N;
+    w: ptr<float> = call @vector_new N;
+    x: ptr<float> = call @vector_new N;
+    y: ptr<float> = call @vector_new N;
+    z: ptr<float> = call @vector_new N;
+    alpha: float = const 1.5;
+    beta: float = const 1.2;
+    call @init A u1 v1 u2 v2 w x y z N fN;
+
+    i: int = const 0;
+.part1_i:
+    cond: bool = lt i N;
+    br cond .part1_i_body .part1_i_done;
+.part1_i_body:
+    j: int = const 0;
+.part1_j:
+    cond: bool = lt j N;
+    br cond .part1_j_body .part1_j_done;
+.part1_j_body:
+    u1i: float = call @vector_get u1 i;
+    v1j: float = call @vector_get v1 j;
+    u2i: float = call @vector_get u2 i;
+    v2j: float = call @vector_get v2 j;
+    Aij: float = call @matrix_get A i j N;
+    tmp: float = fmul u2i v2j;
+    new_Aij: float = fmul u1i v1j;
+    new_Aij: float = fadd tmp new_Aij;
+    new_Aij: float = fadd Aij new_Aij;
+    call @matrix_set A i j N new_Aij;
+    j: int = add j one;
+    jmp .part1_j;
+.part1_j_done:
+    i: int = add i one;
+    jmp .part1_i;
+.part1_i_done:
+
+    i: int = const 0;
+.part2_i:
+    cond: bool = lt i N;
+    br cond .part2_i_body .part2_i_done;
+.part2_i_body:
+    j: int = const 0;
+.part2_j:
+    cond: bool = lt j N;
+    br cond .part2_j_body .part2_j_done;
+.part2_j_body:
+    Aji: float = call @matrix_get A j i N;
+    yj: float = call @vector_get y j;
+    xi: float = call @vector_get x i;
+    new_xi: float = fmul Aji yj;
+    new_xi: float = fmul beta new_xi;
+    new_xi: float = fadd xi new_xi;
+    call @vector_set x i new_xi;
+    j: int = add j one;
+    jmp .part2_j;
+.part2_j_done:
+    i: int = add i one;
+    jmp .part2_i;
+.part2_i_done:
+
+    i: int = const 0;
+.part3_i:
+    cond: bool = lt i N;
+    br cond .part3_i_body .part3_i_done;
+.part3_i_body:
+    xi: float = call @vector_get x i;
+    zi: float = call @vector_get z i;
+    new_xi: float = fadd xi zi;
+    call @vector_set x i new_xi;
+    i: int = add i one;
+    jmp .part3_i;
+.part3_i_done:
+
+    i: int = const 0;
+.part4_i:
+    cond: bool = lt i N;
+    br cond .part4_i_body .part4_i_done;
+.part4_i_body:
+    j: int = const 0;
+.part4_j:
+    cond: bool = lt j N;
+    br cond .part4_j_body .part4_j_done;
+.part4_j_body:
+    Aij: float = call @matrix_get A i j N;
+    xj: float = call @vector_get x j;
+    wi: float = call @vector_get w i;
+    new_wi: float = fmul Aij xj;
+    new_wi: float = fmul alpha new_wi;
+    new_wi: float = fadd wi new_wi;
+    call @vector_set w i new_wi;
+    j: int = add j one;
+    jmp .part4_j;
+.part4_j_done:
+    i: int = add i one;
+    jmp .part4_i;
+.part4_i_done:
+
+    call @vector_print w N;
+
+    free A;
+    free u1;
+    free v1;
+    free u2;
+    free v2;
+    free w;
+    free x;
+    free y;
+    free z;
+}
+
+
+@init(A: ptr<float>, u1: ptr<float>, v1: ptr<float>, u2: ptr<float>, v2: ptr<float>, w: ptr<float>, x: ptr<float>, y: ptr<float>, z: ptr<float>, N: int, fN: float) {
+    one: int = const 1;
+    fzero: float = const 0;
+    fone: float = const 1;
+    ftwo: float = const 2;
+    ffour: float = const 4;
+    fsix: float = const 6;
+    feight: float = const 8;
+    fnine: float = const 9;
+    
+    i: int = const 0;
+    fi: float = const 0;
+.init_i:
+    cond: bool = lt i N;
+    br cond .init_i_body .init_i_done;
+.init_i_body:
+    call @vector_set u1 i fi;
+
+    val: float = fadd fi fone;
+    val: float = fdiv val fN;
+    val: float = fdiv val ftwo;
+    call @vector_set u2 i val;
+
+    val: float = fadd fi fone;
+    val: float = fdiv val fN;
+    val: float = fdiv val ffour;
+    call @vector_set v1 i val;
+
+    val: float = fadd fi fone;
+    val: float = fdiv val fN;
+    val: float = fdiv val fsix;
+    call @vector_set v2 i val;
+
+    val: float = fadd fi fone;
+    val: float = fdiv val fN;
+    val: float = fdiv val feight;
+    call @vector_set y i val;
+
+    val: float = fadd fi fone;
+    val: float = fdiv val fN;
+    val: float = fdiv val fnine;
+    call @vector_set z i val;
+
+    call @vector_set x i fzero;
+    call @vector_set w i fzero;
+
+    j: int = const 0;
+    fj: float = const 0;
+.init_j:
+    cond: bool = lt j N;
+    br cond .init_j_body .init_j_done;
+.init_j_body:
+    val: float = fmul fi fj;
+    val: float = call @fmod val fN;
+    val: float = fdiv val fN;
+    call @matrix_set A i j N val;
+
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_j;
+.init_j_done:
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_i;
+.init_i_done:
+}
+
+@matrix_new(Nrow: int, Ncol: int): ptr<float> {
+    total: int = mul Nrow Ncol;
+    ptr: ptr<float> = alloc total;
+    ret ptr;
+}
+
+@matrix_loc(mtx: ptr<float>, row: int, col: int, Ncol: int): ptr<float> {
+    row_offset: int = mul row Ncol;
+    offset: int = add row_offset col;
+    new_ptr: ptr<float> = ptradd mtx offset;
+    ret new_ptr;
+}
+
+# EXPECTS:
+#   @matrix_loc defined
+@matrix_get(mtx: ptr<float>, row: int, col: int, Ncol: int): float {
+    ptr: ptr<float> = call @matrix_loc mtx row col Ncol;
+    val: float = load ptr;
+    ret val;
+}
+
+# EXPECTS:
+#   @matrix_loc defined
+@matrix_set(mtx: ptr<float>, row: int, col: int, Ncol: int, val: float) {
+    ptr: ptr<float> = call @matrix_loc mtx row col Ncol;
+    store ptr val;
+}
+
+@vector_new(N: int): ptr<float> {
+    ptr: ptr<float> = alloc N;
+    ret ptr;
+}
+
+@vector_get(vec: ptr<float>, i: int): float {
+    ptr: ptr<float> = ptradd vec i;
+    val: float = load ptr;
+    ret val;
+}
+
+@vector_set(vec: ptr<float>, i: int, val: float) {
+    ptr: ptr<float> = ptradd vec i;
+    store ptr val;
+}
+
+# EXPECTS:
+#   @vector_get defined
+@vector_print(vec: ptr<float>, N: int) {
+    i: int = const 0;
+    one: int = const 1;
+.while:
+    cond: bool = lt i N;
+    br cond .body .done;
+.body:
+    val: float = call @vector_get vec i;
+    print val;
+    i: int = add i one;
+    jmp .while;
+.done:
+}
+
+# Compute n % m, where n and m are floats, by
+# repeatedly subtracting the largest m*2^k that
+# still fits inside the running remainder.
+# Takes O((log(n/m))^2) time; assumes n >= 0 and m > 0.
+# NOTE: In C, this can be done with a built-in
+# function, but this is the best we can do.
+@fmod(n: float, m: float): float {
+    zero: float = const 0;
+    two: float = const 2;
+    rem: float = id n;
+.while:
+    cond: bool = fge rem m;
+    br cond .body .done;
+.body:
+    decr: float = id m;
+.while_inner:
+    diff: float = fsub rem decr;
+    cond: bool = fge diff zero;
+    br cond .body_inner .done_inner;
+.body_inner:
+    decr: float = fmul decr two;
+    jmp .while_inner;
+.done_inner:
+    decr: float = fdiv decr two;
+    rem: float = fsub rem decr;
+    jmp .while;
+.done:
+    ret rem;
+}
+
diff --git a/tests/slow/polybench/linear-algebra/blas/gesummv-test.bril b/tests/slow/polybench/linear-algebra/blas/gesummv-test.bril
new file mode 100644
index 000000000..f1a65ee48
--- /dev/null
+++ b/tests/slow/polybench/linear-algebra/blas/gesummv-test.bril
@@ -0,0 +1,206 @@
+## gesummv computes y = alpha*A*x + beta*B*x and prints y
+
+@main {
+    # constants
+    # dimensions correspond to Polybench MEDIUM_DATASET
+
+    N: int = const 250;
+    fN: float = const 250;
+
+    one: int = const 1;
+    fzero: float = const 0;
+
+    # initialize arrays
+    A: ptr<float> = call @matrix_new N N;
+    B: ptr<float> = call @matrix_new N N;
+    x: ptr<float> = call @vector_new N;
+    alpha: float = const 1.5;
+    beta: float = const 1.2;
+    call @init A B x N fN;
+
+    # main computation
+    tmp: ptr<float> = call @vector_new N;
+    y: ptr<float> = call @vector_new N;
+
+    i: int = const 0;
+.main_i:
+    cond: bool = lt i N;
+    br cond .main_i_body .main_i_done;
+.main_i_body:
+    call @vector_set tmp i fzero;
+    call @vector_set y i fzero;
+
+    j: int = const 0;
+.main_j:
+    cond: bool = lt j N;
+    br cond .main_j_body .main_j_done;
+.main_j_body:
+    Aij: float = call @matrix_get A i j N;
+    xj: float = call @vector_get x j;
+    tmpi: float = call @vector_get tmp i;
+    val: float = fmul Aij xj;
+    val: float = fadd val tmpi;
+    call @vector_set tmp i val;
+
+    Bij: float = call @matrix_get B i j N;
+    xj: float = call @vector_get x j;
+    yi: float = call @vector_get y i;
+    val: float = fmul Bij xj;
+    val: float = fadd val yi;
+    call @vector_set y i val;
+
+    j: int = add j one;
+    jmp .main_j;
+.main_j_done:
+    tmpi: float = call @vector_get tmp i;
+    yi: float = call @vector_get y i;
+    val1: float = fmul alpha tmpi;
+    val2: float = fmul beta yi;
+    new_yi: float = fadd val1 val2;
+    call @vector_set y i new_yi;
+    i: int = add i one;
+    jmp .main_i;
+.main_i_done:
+
+    call @vector_print y N;
+
+    free A;
+    free B;
+    free tmp;
+    free x;
+    free y;
+}
+
+@init(A: ptr<float>, B: ptr<float>, x: ptr<float>,
+      N: int, fN: float) {
+    one: int = const 1;
+    fone: float = const 1;
+    ftwo: float = const 2;
+
+    i: int = const 0;
+    fi: float = const 0;
+.init_i:
+    cond: bool = lt i N;
+    br cond .init_i_body .init_i_done;
+.init_i_body:
+    val: float = call @fmod fi fN;
+    val: float = fdiv val fN;
+    call @vector_set x i val;
+
+    j: int = const 0;
+    fj: float = const 0;
+.init_j:
+    cond: bool = lt j N;
+    br cond .init_j_body .init_j_done;
+.init_j_body:
+    val: float = fmul fi fj;
+    val: float = fadd val fone;
+    val: float = call @fmod val fN;
+    val: float = fdiv val fN;
+    call @matrix_set A i j N val;
+
+    val: float = fmul fi fj;
+    val: float = fadd val ftwo;
+    val: float = call @fmod val fN;
+    val: float = fdiv val fN;
+    call @matrix_set B i j N val;
+
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_j;
+.init_j_done:
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_i;
+.init_i_done:
+}
+
+@matrix_new(Nrow: int, Ncol: int): ptr<float> {
+    total: int = mul Nrow Ncol;
+    ptr: ptr<float> = alloc total;
+    ret ptr;
+}
+
+@matrix_loc(mtx: ptr<float>, row: int, col: int, Ncol: int): ptr<float> {
+    row_offset: int = mul row Ncol;
+    offset: int = add row_offset col;
+    new_ptr: ptr<float> = ptradd mtx offset;
+    ret new_ptr;
+}
+
+# EXPECTS:
+#   @matrix_loc defined
+@matrix_get(mtx: ptr<float>, row: int, col: int, Ncol: int): float {
+    ptr: ptr<float> = call @matrix_loc mtx row col Ncol;
+    val: float = load ptr;
+    ret val;
+}
+
+# EXPECTS:
+#   @matrix_loc defined
+@matrix_set(mtx: ptr<float>, row: int, col: int, Ncol: int, val: float) {
+    ptr: ptr<float> = call @matrix_loc mtx row col Ncol;
+    store ptr val;
+}
+
+@vector_new(N: int): ptr<float> {
+    ptr: ptr<float> = alloc N;
+    ret ptr;
+}
+
+@vector_get(vec: ptr<float>, i: int): float {
+    ptr: ptr<float> = ptradd vec i;
+    val: float = load ptr;
+    ret val;
+}
+
+@vector_set(vec: ptr<float>, i: int, val: float) {
+    ptr: ptr<float> = ptradd vec i;
+    store ptr val;
+}
+
+# EXPECTS:
+#   @vector_get defined
+@vector_print(vec: ptr<float>, N: int) {
+    i: int = const 0;
+    one: int = const 1;
+.while:
+    cond: bool = lt i N;
+    br cond .body .done;
+.body:
+    val: float = call @vector_get vec i;
+    print val;
+    i: int = add i one;
+    jmp .while;
+.done:
+}
+
+# Compute n % m, where n and m are floats, by
+# repeatedly subtracting the largest m*2^k that
+# still fits inside the running remainder.
+# Takes O((log(n/m))^2) time; assumes n >= 0 and m > 0.
+# NOTE: In C, this can be done with a built-in
+# function, but this is the best we can do.
+@fmod(n: float, m: float): float {
+    zero: float = const 0;
+    two: float = const 2;
+    rem: float = id n;
+.while:
+    cond: bool = fge rem m;
+    br cond .body .done;
+.body:
+    decr: float = id m;
+.while_inner:
+    diff: float = fsub rem decr;
+    cond: bool = fge diff zero;
+    br cond .body_inner .done_inner;
+.body_inner:
+    decr: float = fmul decr two;
+    jmp .while_inner;
+.done_inner:
+    decr: float = fdiv decr two;
+    rem: float = fsub rem decr;
+    jmp .while;
+.done:
+    ret rem;
+}
\ No newline at end of file
diff --git a/tests/slow/polybench/linear-algebra/blas/symm-test.bril b/tests/slow/polybench/linear-algebra/blas/symm-test.bril
new file mode 100644
index 000000000..cb9a07f75
--- /dev/null
+++ b/tests/slow/polybench/linear-algebra/blas/symm-test.bril
@@ -0,0 +1,240 @@
+## symm computes C := alpha * A * B + beta * C where A is
+## symmetric and only its lower triangle is stored and read.
+
+@main {
+    # constants
+    # dimensions correspond to Polybench MEDIUM_DATASET
+
+    M: int = const 200;
+    fM: float = const 200;
+    N: int = const 240;
+    fN: float = const 240;
+
+    one: int = const 1;
+    fzero: float = const 0;
+
+    # initialize arrays
+    C: ptr<float> = call @matrix_new M N;
+    A: ptr<float> = call @matrix_new M M;
+    B: ptr<float> = call @matrix_new M N;
+    alpha: float = const 1.5;
+    beta: float = const 1.2;
+    call @init C A B M fM N fN;
+
+    # main computation
+
+    i: int = const 0;
+.main_i:
+    cond: bool = lt i M;
+    br cond .main_i_body .main_i_done;
+.main_i_body:
+    j: int = const 0;
+.main_j:
+    cond: bool = lt j N;
+    br cond .main_j_body .main_j_done;
+.main_j_body:
+    temp2: float = const 0;
+    k: int = const 0;
+.main_k:
+    cond: bool = lt k i;
+    br cond .main_k_body .main_k_done;
+.main_k_body:
+    Bij: float = call @matrix_get B i j N;
+    Aik: float = call @matrix_get A i k M;
+    incr: float = fmul alpha Bij;
+    incr: float = fmul incr Aik;
+    call @matrix_incr C k j N incr;
+
+    Bkj: float = call @matrix_get B k j N;
+    Aik: float = call @matrix_get A i k M;
+    incr: float = fmul Bkj Aik;
+
+    temp2: float = fadd temp2 incr;
+
+    k: int = add k one;
+    jmp .main_k;
+.main_k_done:
+    Cij: float = call @matrix_get C i j N;
+    Bij: float = call @matrix_get B i j N;
+    Aii: float = call @matrix_get A i i M;
+
+    val1: float = fmul beta Cij;
+    val2: float = fmul alpha Bij;
+
+    val2: float = fmul val2 Aii;
+    val3: float = fmul alpha temp2;
+
+    val: float = fadd val1 val2;
+    val: float = fadd val val3;
+    call @matrix_set C i j N val;
+    j: int = add j one;
+    jmp .main_j;
+.main_j_done:
+    i: int = add i one;
+    jmp .main_i;
+.main_i_done:
+
+    call @matrix_print C M N;
+
+    free C;
+    free A;
+    free B;
+}
+
+@init(C: ptr<float>, A: ptr<float>, B: ptr<float>,
+      M: int, fM: float, N: int, fN: float) {
+    one: int = const 1;
+    fone: float = const 1;
+    hundred: float = const 100;
+    invalid: float = const -999;
+
+    i: int = const 0;
+    fi: float = const 0;
+.init_CB_i:
+    cond: bool = lt i M;
+    br cond .init_CB_i_body .init_CB_i_done;
+.init_CB_i_body:
+    j: int = const 0;
+    fj: float = const 0;
+.init_CB_j:
+    cond: bool = lt j N;
+    br cond .init_CB_j_body .init_CB_j_done;
+.init_CB_j_body:
+    val: float = fadd fi fj;
+    val: float = call @fmod val hundred;
+    val: float = fdiv val fM;
+    call @matrix_set C i j N val;
+
+    val: float = fadd fN fi;
+    val: float = fsub val fj;
+    val: float = call @fmod val hundred;
+    val: float = fdiv val fM;
+    call @matrix_set B i j N val;
+
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_CB_j;
+.init_CB_j_done:
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_CB_i;
+.init_CB_i_done:
+
+    i: int = const 0;
+    fi: float = const 0;
+.init_A_i:
+    cond: bool = lt i M;
+    br cond .init_A_i_body .init_A_i_done;
+.init_A_i_body:
+    j: int = const 0;
+    fj: float = const 0;
+.init_A_j1:
+    cond: bool = le j i;
+    br cond .init_A_j1_body .init_A_j1_done;
+.init_A_j1_body:
+    val: float = fadd fi fj;
+    val: float = call @fmod val hundred;
+    val: float = fdiv val fM;
+    call @matrix_set A i j M val;
+
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_A_j1;
+.init_A_j1_done:
+    j: int = add i one;
+.init_A_j2:
+    cond: bool = lt j M;
+    br cond .init_A_j2_body .init_A_j2_done;
+.init_A_j2_body:
+    call @matrix_set A i j M invalid;
+    j: int = add j one;
+    jmp .init_A_j2;
+.init_A_j2_done:
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_A_i;
+.init_A_i_done:
+}
+
+@matrix_new(Nrow: int, Ncol: int): ptr<float> {
+    total: int = mul Nrow Ncol;
+    ptr: ptr<float> = alloc total;
+    ret ptr;
+}
+
+@matrix_loc(mtx: ptr<float>, row: int, col: int, Ncol: int): ptr<float> {
+    row_offset: int = mul row Ncol;
+    offset: int = add row_offset col;
+    new_ptr: ptr<float> = ptradd mtx offset;
+    ret new_ptr;
+}
+
+# EXPECTS:
+#   @matrix_loc defined
+@matrix_get(mtx: ptr<float>, row: int, col: int, Ncol: int): float {
+    ptr: ptr<float> = call @matrix_loc mtx row col Ncol;
+    val: float = load ptr;
+    ret val;
+}
+
+# EXPECTS:
+#   @matrix_loc defined
+@matrix_set(mtx: ptr<float>, row: int, col: int, Ncol: int, val: float) {
+    ptr: ptr<float> = call @matrix_loc mtx row col Ncol;
+    store ptr val;
+}
+
+# EXPECTS:
+#   @matrix_loc defined
+@matrix_incr(mtx: ptr<float>, row: int, col: int, Ncol: int, incr: float) {
+    ptr: ptr<float> = call @matrix_loc mtx row col Ncol;
+    val: float = load ptr;
+    new_val: float = fadd val incr;
+    store ptr new_val;
+}
+
+@matrix_print(mtx: ptr<float>, Nrow: int, Ncol: int) {
+    i: int = const 0;
+    one: int = const 1;
+    total: int = mul Nrow Ncol;
+.while:
+    cond: bool = lt i total;
+    br cond .body .done;
+.body:
+    mtx_loc: ptr<float> = ptradd mtx i;
+    val: float = load mtx_loc;
+    print val;
+    i: int = add i one;
+    jmp .while;
+.done:
+}
+
+# Compute n % m, where n and m are floats, by
+# repeatedly subtracting the largest m*2^k that
+# still fits inside the running remainder.
+# Takes O((log(n/m))^2) time; assumes n >= 0 and m > 0.
+# NOTE: In C, this can be done with a built-in
+# function, but this is the best we can do.
+@fmod(n: float, m: float): float {
+    zero: float = const 0;
+    two: float = const 2;
+    rem: float = id n;
+.while:
+    cond: bool = fge rem m;
+    br cond .body .done;
+.body:
+    decr: float = id m;
+.while_inner:
+    diff: float = fsub rem decr;
+    cond: bool = fge diff zero;
+    br cond .body_inner .done_inner;
+.body_inner:
+    decr: float = fmul decr two;
+    jmp .while_inner;
+.done_inner:
+    decr: float = fdiv decr two;
+    rem: float = fsub rem decr;
+    jmp .while;
+.done:
+    ret rem;
+}
diff --git a/tests/slow/polybench/linear-algebra/blas/syr2k-test.bril b/tests/slow/polybench/linear-algebra/blas/syr2k-test.bril
new file mode 100644
index 000000000..7c35f0a9c
--- /dev/null
+++ b/tests/slow/polybench/linear-algebra/blas/syr2k-test.bril
@@ -0,0 +1,232 @@
+## syr2k computes a symmetric rank 2k update
+
+@main {
+    # Runs the syr2k kernel on generated inputs and prints the resulting C.
+    # Sizes follow the Polybench MEDIUM_DATASET; fM and fN are float copies of M and N.
+
+    M: int = const 200;
+    fM: float = const 200;
+    N: int = const 240;
+    fN: float = const 240;
+
+    one: int = const 1;
+    fzero: float = const 0;  # NOTE(review): never read in @main
+
+    # Allocate C (N x N) and A, B (N x M each), then fill them through @init.
+    C: ptr<float> = call @matrix_new N N;
+    A: ptr<float> = call @matrix_new N M;
+    B: ptr<float> = call @matrix_new N M;
+    alpha: float = const 1.5;
+    beta: float = const 1.2;
+    call @init C A B M fM N fN;
+
+    # Kernel, for every row i and only the columns j <= i:
+    #   C[i][j] = beta*C[i][j] + sum over k < M of (A[j][k]*alpha*B[i][k] + B[j][k]*alpha*A[i][k])
+    i: int = const 0;
+.main_i:
+    cond: bool = lt i N;
+    br cond .main_i_body .main_i_done;
+.main_i_body:
+    j: int = const 0;
+.inner_j:  # first pass over row i: scale C[i][0..i] by beta
+    cond: bool = le j i;
+    br cond .inner_j_body .inner_j_done;
+.inner_j_body:
+    call @matrix_scale C i j N beta;
+    j: int = add j one;
+    jmp .inner_j;
+.inner_j_done:
+    k: int = const 0;
+.main_k:  # second pass: accumulate the update terms, with k as the outer loop
+    cond: bool = lt k M;
+    br cond .main_k_body .main_k_done;
+.main_k_body:
+    j: int = const 0;
+.main_j:
+    cond: bool = le j i;
+    br cond .main_j_body .main_j_done;
+.main_j_body:
+    Ajk: float = call @matrix_get A j k M;
+    Bik: float = call @matrix_get B i k M;
+    Bjk: float = call @matrix_get B j k M;
+    Aik: float = call @matrix_get A i k M;
+    val1: float = fmul Ajk alpha;
+    val1: float = fmul val1 Bik;  # A[j][k] * alpha * B[i][k]
+    val2: float = fmul Bjk alpha;
+    val2: float = fmul val2 Aik;  # B[j][k] * alpha * A[i][k]
+    incr: float = fadd val1 val2;
+    call @matrix_incr C i j N incr;
+    j: int = add j one;
+    jmp .main_j;
+.main_j_done:
+    k: int = add k one;
+    jmp .main_k;
+.main_k_done:
+    i: int = add i one;
+    jmp .main_i;
+.main_i_done:
+
+    call @matrix_print C N N;  # prints all N*N entries, including the j > i ones the kernel leaves as initialized
+
+    free C;
+    free A;
+    free B;
+}
+
+@init(C: ptr<float>, A: ptr<float>, B: ptr<float>,
+      M: int, fM: float, N: int, fN: float) {  # fills A, B (N x M) and C (N x N) in place
+    one: int = const 1;
+    fone: float = const 1;
+    ftwo: float = const 2;
+    fthree: float = const 3;
+
+    i: int = const 0;
+    fi: float = const 0;  # fi / fj track i / j as floats for the arithmetic below
+.init_AB_i:
+    cond: bool = lt i N;
+    br cond .init_AB_i_body .init_AB_i_done;
+.init_AB_i_body:
+    j: int = const 0;
+    fj: float = const 0;
+.init_AB_j:
+    cond: bool = lt j M;
+    br cond .init_AB_j_body .init_AB_j_done;
+.init_AB_j_body:
+    val: float = fmul fi fj;
+    val: float = fadd val fone;
+    val: float = call @fmod val fN;
+    val: float = fdiv val fN;
+    call @matrix_set A i j M val;  # A[i][j] = fmod(i*j + 1, N) / N
+    # B uses the same pattern with offset 2 and modulus/divisor M.
+    val: float = fmul fi fj;
+    val: float = fadd val ftwo;
+    val: float = call @fmod val fM;
+    val: float = fdiv val fM;
+    call @matrix_set B i j M val;  # B[i][j] = fmod(i*j + 2, M) / M
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_AB_j;
+.init_AB_j_done:
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_AB_i;
+.init_AB_i_done:
+
+    i: int = const 0;
+    fi: float = const 0;
+.init_C_i:
+    cond: bool = lt i N;
+    br cond .init_C_i_body .init_C_i_done;
+.init_C_i_body:
+    j: int = const 0;
+    fj: float = const 0;
+.init_C_j:
+    cond: bool = lt j N;
+    br cond .init_C_j_body .init_C_j_done;
+.init_C_j_body:
+    val: float = fmul fi fj;
+    val: float = fadd val fthree;
+    val: float = call @fmod val fN;
+    val: float = fdiv val fM;  # note: modulus is N but the divisor is M
+    call @matrix_set C i j N val;  # C[i][j] = fmod(i*j + 3, N) / M
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_C_j;
+.init_C_j_done:
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_C_i;
+.init_C_i_done:
+}
+
+@matrix_new(Nrow: int, Ncol: int): ptr<float> {  # allocate room for Nrow*Ncol floats
+    count: int = mul Nrow Ncol;
+    buf: ptr<float> = alloc count;  # no element is written here
+    ret buf;  # returned to the caller; nothing is freed here
+}
+
+@matrix_loc(mtx: ptr<float>, row: int, col: int, Ncol: int): ptr<float> {  # address of element (row, col)
+    base: int = mul row Ncol;  # flat index of the first element of the row
+    idx: int = add base col;  # flat index = row * Ncol + col
+    cell: ptr<float> = ptradd mtx idx;  # no bounds check is performed
+    ret cell;
+}
+
+# Returns the element of mtx that @matrix_loc resolves for (row, col, Ncol).
+# Requires @matrix_loc to be defined in the same program.
+@matrix_get(mtx: ptr<float>, row: int, col: int, Ncol: int): float {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    elem: float = load cell;
+    ret elem;
+}
+
+# Stores val into the element of mtx that @matrix_loc resolves for (row, col, Ncol).
+# Requires @matrix_loc to be defined in the same program.
+@matrix_set(mtx: ptr<float>, row: int, col: int, Ncol: int, val: float) {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    store cell val;
+}
+
+# Adds incr, in place, to the element of mtx that @matrix_loc resolves for (row, col, Ncol).
+# Requires @matrix_loc to be defined in the same program.
+@matrix_incr(mtx: ptr<float>, row: int, col: int, Ncol: int, incr: float) {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    old: float = load cell;
+    sum: float = fadd old incr;
+    store cell sum;
+}
+
+# Multiplies, in place, the element of mtx that @matrix_loc resolves for (row, col, Ncol) by scale.
+# Requires @matrix_loc to be defined in the same program.
+@matrix_scale(mtx: ptr<float>, row: int, col: int, Ncol: int, scale: float) {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    old: float = load cell;
+    scaled: float = fmul old scale;
+    store cell scaled;
+}
+
+@matrix_print(mtx: ptr<float>, Nrow: int, Ncol: int) {  # print all Nrow*Ncol elements, one `print` each
+    count: int = mul Nrow Ncol;
+    step: int = const 1;
+    idx: int = const 0;
+.next:
+    more: bool = lt idx count;
+    br more .emit .finish;
+.emit:
+    cell: ptr<float> = ptradd mtx idx;  # elements are visited in storage order
+    elem: float = load cell;
+    print elem;
+    idx: int = add idx step;
+    jmp .next;
+.finish:
+}
+
+# Floating-point remainder of n by m, computed by repeated subtraction.
+# Each outer pass doubles a chunk, starting from m, until it no longer
+# fits in what is left, halves it once, and subtracts that chunk.
+# Takes O((log n/m)^2) time. If n < m on entry, n is returned unchanged.
+# m is expected to be positive; the loops are not guarded against m <= 0.
+# NOTE: C has a library function for this; here it is done by hand.
+@fmod(n: float, m: float): float {
+    two: float = const 2;
+    zero: float = const 0;
+    left: float = id n;
+.outer:
+    more: bool = fge left m;
+    br more .grow_init .finish;
+.grow_init:
+    chunk: float = id m;
+.grow:
+    slack: float = fsub left chunk;
+    fits: bool = fge slack zero;
+    br fits .double .shrink;
+.double:
+    chunk: float = fmul chunk two;
+    jmp .grow;
+.shrink:
+    chunk: float = fdiv chunk two;  # step back to the last chunk that fit
+    left: float = fsub left chunk;
+    jmp .outer;
+.finish:
+    ret left;
+}
\ No newline at end of file
diff --git a/tests/slow/polybench/linear-algebra/blas/syrk-test.bril b/tests/slow/polybench/linear-algebra/blas/syrk-test.bril
new file mode 100644
index 000000000..9c9f637f0
--- /dev/null
+++ b/tests/slow/polybench/linear-algebra/blas/syrk-test.bril
@@ -0,0 +1,218 @@
+## syrk computes a symmetric rank k update
+
+@main {
+    # Runs the syrk kernel on generated inputs and prints the resulting C.
+    # Sizes follow the Polybench MEDIUM_DATASET; fM and fN are float copies of M and N.
+
+    M: int = const 200;
+    fM: float = const 200;
+    N: int = const 240;
+    fN: float = const 240;
+
+    one: int = const 1;
+    fzero: float = const 0;  # NOTE(review): never read in @main
+
+    # Allocate C (N x N) and A (N x M), then fill them through @init.
+    C: ptr<float> = call @matrix_new N N;
+    A: ptr<float> = call @matrix_new N M;
+    alpha: float = const 1.5;
+    beta: float = const 1.2;
+    call @init C A M fM N fN;
+
+    # Kernel, for every row i and only the columns j <= i:
+    #   C[i][j] = beta*C[i][j] + sum over k < M of alpha*A[i][k]*A[j][k]
+    i: int = const 0;
+.main_i:
+    cond: bool = lt i N;
+    br cond .main_i_body .main_i_done;
+.main_i_body:
+    j: int = const 0;
+.inner_j:  # first pass over row i: scale C[i][0..i] by beta
+    cond: bool = le j i;
+    br cond .inner_j_body .inner_j_done;
+.inner_j_body:
+    call @matrix_scale C i j N beta;
+    j: int = add j one;
+    jmp .inner_j;
+.inner_j_done:
+    k: int = const 0;
+.main_k:  # second pass: accumulate the update terms, with k as the outer loop
+    cond: bool = lt k M;
+    br cond .main_k_body .main_k_done;
+.main_k_body:
+    j: int = const 0;
+.main_j:
+    cond: bool = le j i;
+    br cond .main_j_body .main_j_done;
+.main_j_body:
+    Ajk: float = call @matrix_get A j k M;
+    Aik: float = call @matrix_get A i k M;
+    incr: float = fmul alpha Aik;
+    incr: float = fmul incr Ajk;  # alpha * A[i][k] * A[j][k]
+    call @matrix_incr C i j N incr;
+    j: int = add j one;
+    jmp .main_j;
+.main_j_done:
+    k: int = add k one;
+    jmp .main_k;
+.main_k_done:
+    i: int = add i one;
+    jmp .main_i;
+.main_i_done:
+
+    call @matrix_print C N N;  # prints all N*N entries, including the j > i ones the kernel leaves as initialized
+
+    free C;
+    free A;
+}
+
+@init(C: ptr<float>, A: ptr<float>,
+      M: int, fM: float, N: int, fN: float) {  # fills A (N x M) and C (N x N) in place
+    one: int = const 1;
+    fone: float = const 1;
+    ftwo: float = const 2;
+
+    i: int = const 0;
+    fi: float = const 0;  # fi / fj track i / j as floats for the arithmetic below
+.init_A_i:
+    cond: bool = lt i N;
+    br cond .init_A_i_body .init_A_i_done;
+.init_A_i_body:
+    j: int = const 0;
+    fj: float = const 0;
+.init_A_j:
+    cond: bool = lt j M;
+    br cond .init_A_j_body .init_A_j_done;
+.init_A_j_body:
+    val: float = fmul fi fj;
+    val: float = fadd val fone;
+    val: float = call @fmod val fN;
+    val: float = fdiv val fN;
+    call @matrix_set A i j M val;  # A[i][j] = fmod(i*j + 1, N) / N
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_A_j;
+.init_A_j_done:
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_A_i;
+.init_A_i_done:
+
+    i: int = const 0;
+    fi: float = const 0;
+.init_C_i:
+    cond: bool = lt i N;
+    br cond .init_C_i_body .init_C_i_done;
+.init_C_i_body:
+    j: int = const 0;
+    fj: float = const 0;
+.init_C_j:
+    cond: bool = lt j N;
+    br cond .init_C_j_body .init_C_j_done;
+.init_C_j_body:
+    val: float = fmul fi fj;
+    val: float = fadd val ftwo;
+    val: float = call @fmod val fM;
+    val: float = fdiv val fM;
+    call @matrix_set C i j N val;  # C[i][j] = fmod(i*j + 2, M) / M
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_C_j;
+.init_C_j_done:
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_C_i;
+.init_C_i_done:
+}
+
+@matrix_new(Nrow: int, Ncol: int): ptr<float> {  # allocate room for Nrow*Ncol floats
+    count: int = mul Nrow Ncol;
+    buf: ptr<float> = alloc count;  # no element is written here
+    ret buf;  # returned to the caller; nothing is freed here
+}
+
+@matrix_loc(mtx: ptr<float>, row: int, col: int, Ncol: int): ptr<float> {  # address of element (row, col)
+    base: int = mul row Ncol;  # flat index of the first element of the row
+    idx: int = add base col;  # flat index = row * Ncol + col
+    cell: ptr<float> = ptradd mtx idx;  # no bounds check is performed
+    ret cell;
+}
+
+# Returns the element of mtx that @matrix_loc resolves for (row, col, Ncol).
+# Requires @matrix_loc to be defined in the same program.
+@matrix_get(mtx: ptr<float>, row: int, col: int, Ncol: int): float {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    elem: float = load cell;
+    ret elem;
+}
+
+# Stores val into the element of mtx that @matrix_loc resolves for (row, col, Ncol).
+# Requires @matrix_loc to be defined in the same program.
+@matrix_set(mtx: ptr<float>, row: int, col: int, Ncol: int, val: float) {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    store cell val;
+}
+
+# Adds incr, in place, to the element of mtx that @matrix_loc resolves for (row, col, Ncol).
+# Requires @matrix_loc to be defined in the same program.
+@matrix_incr(mtx: ptr<float>, row: int, col: int, Ncol: int, incr: float) {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    old: float = load cell;
+    sum: float = fadd old incr;
+    store cell sum;
+}
+
+# Multiplies, in place, the element of mtx that @matrix_loc resolves for (row, col, Ncol) by scale.
+# Requires @matrix_loc to be defined in the same program.
+@matrix_scale(mtx: ptr<float>, row: int, col: int, Ncol: int, scale: float) {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    old: float = load cell;
+    scaled: float = fmul old scale;
+    store cell scaled;
+}
+
+@matrix_print(mtx: ptr<float>, Nrow: int, Ncol: int) {  # print all Nrow*Ncol elements, one `print` each
+    count: int = mul Nrow Ncol;
+    step: int = const 1;
+    idx: int = const 0;
+.next:
+    more: bool = lt idx count;
+    br more .emit .finish;
+.emit:
+    cell: ptr<float> = ptradd mtx idx;  # elements are visited in storage order
+    elem: float = load cell;
+    print elem;
+    idx: int = add idx step;
+    jmp .next;
+.finish:
+}
+
+# Floating-point remainder of n by m, computed by repeated subtraction.
+# Each outer pass doubles a chunk, starting from m, until it no longer
+# fits in what is left, halves it once, and subtracts that chunk.
+# Takes O((log n/m)^2) time. If n < m on entry, n is returned unchanged.
+# m is expected to be positive; the loops are not guarded against m <= 0.
+# NOTE: C has a library function for this; here it is done by hand.
+@fmod(n: float, m: float): float {
+    two: float = const 2;
+    zero: float = const 0;
+    left: float = id n;
+.outer:
+    more: bool = fge left m;
+    br more .grow_init .finish;
+.grow_init:
+    chunk: float = id m;
+.grow:
+    slack: float = fsub left chunk;
+    fits: bool = fge slack zero;
+    br fits .double .shrink;
+.double:
+    chunk: float = fmul chunk two;
+    jmp .grow;
+.shrink:
+    chunk: float = fdiv chunk two;  # step back to the last chunk that fit
+    left: float = fsub left chunk;
+    jmp .outer;
+.finish:
+    ret left;
+}
\ No newline at end of file
diff --git a/tests/slow/polybench/linear-algebra/blas/trmm-test.bril b/tests/slow/polybench/linear-algebra/blas/trmm-test.bril
new file mode 100644
index 000000000..95fa64e18
--- /dev/null
+++ b/tests/slow/polybench/linear-algebra/blas/trmm-test.bril
@@ -0,0 +1,191 @@
+## trmm computes a triangular matrix multiplication
+
+@main {
+    # Runs the trmm kernel on generated inputs and prints the updated B.
+    # Sizes follow the Polybench MEDIUM_DATASET; fM and fN are float copies of M and N.
+
+    M: int = const 200;
+    fM: float = const 200;
+    N: int = const 240;
+    fN: float = const 240;
+
+    one: int = const 1;
+    fzero: float = const 0;  # NOTE(review): never read in @main
+
+    # Allocate A (M x M) and B (M x N), then fill them through @init.
+    A: ptr<float> = call @matrix_new M M;
+    B: ptr<float> = call @matrix_new M N;
+    alpha: float = const 1.5;
+    call @init A B M fM N fN;
+    # Kernel: B[i][j] = alpha * (B[i][j] + sum over k in (i, M) of A[k][i]*B[k][j]), updating B in place.
+
+    i: int = const 0;
+.main_i:
+    cond: bool = lt i M;
+    br cond .main_i_body .main_i_done;
+.main_i_body:
+    j: int = const 0;
+.main_j:
+    cond: bool = lt j N;
+    br cond .main_j_body .main_j_done;
+.main_j_body:
+    k: int = add i one;  # k starts just past i, so only A[k][i] with k > i is read
+.main_k:
+    cond: bool = lt k M;
+    br cond .main_k_body .main_k_done;
+.main_k_body:
+    Aki: float = call @matrix_get A k i M;
+    Bkj: float = call @matrix_get B k j N;  # row k > i of B has not been updated yet at this point
+    incr: float = fmul Aki Bkj;
+    call @matrix_incr B i j N incr;
+    k: int = add k one;
+    jmp .main_k;
+.main_k_done:
+    Bij: float = call @matrix_get B i j N;
+    new_Bij: float = fmul alpha Bij;  # scale the accumulated entry by alpha
+    call @matrix_set B i j N new_Bij;
+    j: int = add j one;
+    jmp .main_j;
+.main_j_done:
+    i: int = add i one;
+    jmp .main_i;
+.main_i_done:
+
+    call @matrix_print B M N;
+
+    free A;
+    free B;
+}
+
+@init(A: ptr<float>, B: ptr<float>,
+      M: int, fM: float, N: int, fN: float) {  # fills part of A (M x M) and all of B (M x N) in place
+    one: int = const 1;
+    fone: float = const 1;
+
+    i: int = const 0;
+    fi: float = const 0;  # fi / fj track i / j as floats for the arithmetic below
+.init_i:
+    cond: bool = lt i M;
+    br cond .init_i_body .init_i_done;
+.init_i_body:
+    j: int = const 0;
+    fj: float = const 0;
+.init_A_j:
+    cond: bool = lt j i;  # strictly below the diagonal only
+    br cond .init_A_j_body .init_A_j_done;
+.init_A_j_body:
+    val: float = fadd fi fj;
+    val: float = call @fmod val fM;
+    val: float = fdiv val fM;
+    call @matrix_set A i j M val;  # A[i][j] = fmod(i + j, M) / M for j < i
+
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_A_j;
+.init_A_j_done:
+    call @matrix_set A i i M fone;  # diagonal is 1; entries with j > i are never written here
+    j: int = const 0;
+    fj: float = const 0;
+.init_B_j:
+    cond: bool = lt j N;
+    br cond .init_B_j_body .init_B_j_done;
+.init_B_j_body:
+    val: float = fsub fi fj;
+    val: float = fadd fN val;  # shift by N before reducing: N + (i - j)
+    val: float = call @fmod val fN;
+    val: float = fdiv val fN;
+    call @matrix_set B i j N val;  # B[i][j] = fmod(N + (i - j), N) / N
+
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_B_j;
+.init_B_j_done:
+
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_i;
+.init_i_done:
+}
+
+@matrix_new(Nrow: int, Ncol: int): ptr<float> {  # allocate room for Nrow*Ncol floats
+    count: int = mul Nrow Ncol;
+    buf: ptr<float> = alloc count;  # no element is written here
+    ret buf;  # returned to the caller; nothing is freed here
+}
+
+@matrix_loc(mtx: ptr<float>, row: int, col: int, Ncol: int): ptr<float> {  # address of element (row, col)
+    base: int = mul row Ncol;  # flat index of the first element of the row
+    idx: int = add base col;  # flat index = row * Ncol + col
+    cell: ptr<float> = ptradd mtx idx;  # no bounds check is performed
+    ret cell;
+}
+
+# Returns the element of mtx that @matrix_loc resolves for (row, col, Ncol).
+# Requires @matrix_loc to be defined in the same program.
+@matrix_get(mtx: ptr<float>, row: int, col: int, Ncol: int): float {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    elem: float = load cell;
+    ret elem;
+}
+
+# Stores val into the element of mtx that @matrix_loc resolves for (row, col, Ncol).
+# Requires @matrix_loc to be defined in the same program.
+@matrix_set(mtx: ptr<float>, row: int, col: int, Ncol: int, val: float) {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    store cell val;
+}
+
+# Adds incr, in place, to the element of mtx that @matrix_loc resolves for (row, col, Ncol).
+# Requires @matrix_loc to be defined in the same program.
+@matrix_incr(mtx: ptr<float>, row: int, col: int, Ncol: int, incr: float) {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    old: float = load cell;
+    sum: float = fadd old incr;
+    store cell sum;
+}
+
+@matrix_print(mtx: ptr<float>, Nrow: int, Ncol: int) {  # print all Nrow*Ncol elements, one `print` each
+    count: int = mul Nrow Ncol;
+    step: int = const 1;
+    idx: int = const 0;
+.next:
+    more: bool = lt idx count;
+    br more .emit .finish;
+.emit:
+    cell: ptr<float> = ptradd mtx idx;  # elements are visited in storage order
+    elem: float = load cell;
+    print elem;
+    idx: int = add idx step;
+    jmp .next;
+.finish:
+}
+
+# Floating-point remainder of n by m, computed by repeated subtraction.
+# Each outer pass doubles a chunk, starting from m, until it no longer
+# fits in what is left, halves it once, and subtracts that chunk.
+# Takes O((log n/m)^2) time. If n < m on entry, n is returned unchanged.
+# m is expected to be positive; the loops are not guarded against m <= 0.
+# NOTE: C has a library function for this; here it is done by hand.
+@fmod(n: float, m: float): float {
+    two: float = const 2;
+    zero: float = const 0;
+    left: float = id n;
+.outer:
+    more: bool = fge left m;
+    br more .grow_init .finish;
+.grow_init:
+    chunk: float = id m;
+.grow:
+    slack: float = fsub left chunk;
+    fits: bool = fge slack zero;
+    br fits .double .shrink;
+.double:
+    chunk: float = fmul chunk two;
+    jmp .grow;
+.shrink:
+    chunk: float = fdiv chunk two;  # step back to the last chunk that fit
+    left: float = fsub left chunk;
+    jmp .outer;
+.finish:
+    ret left;
+}
\ No newline at end of file
diff --git a/tests/slow/polybench/linear-algebra/kernels/2mm-test.bril b/tests/slow/polybench/linear-algebra/kernels/2mm-test.bril
new file mode 100644
index 000000000..c83f9f266
--- /dev/null
+++ b/tests/slow/polybench/linear-algebra/kernels/2mm-test.bril
@@ -0,0 +1,311 @@
+## 2mm computes D := alpha * A * B * C + beta * D
+## for some procedurally generated matrices A, B, C, D
+
+@main {
+    # Runs the 2mm kernel on generated inputs and prints the resulting D.
+    # Sizes follow the Polybench MEDIUM_DATASET; each fN* is a float copy of the matching N*.
+
+    NI: int = const 180;
+    fNI: float = const 180;
+    NJ: int = const 190;
+    fNJ: float = const 190;
+    NK: int = const 210;
+    fNK: float = const 210;
+    NL: int = const 220;
+    fNL: float = const 220;
+
+    one: int = const 1;
+    fzero: float = const 0;
+
+    # Allocate A (NI x NK), B (NK x NJ), C (NJ x NL), D (NI x NL), then fill them through @init.
+    A: ptr<float> = call @matrix_new NI NK;
+    B: ptr<float> = call @matrix_new NK NJ;
+    C: ptr<float> = call @matrix_new NJ NL;
+    D: ptr<float> = call @matrix_new NI NL;
+    alpha: float = const 1.5;
+    beta: float = const 1.2;
+    call @init A B C D NI fNI NJ fNJ NK fNK NL fNL;
+
+    # main computation
+    # computes D := alpha * A * B * C + beta * D
+
+    # Part 1: tmp := alpha * A * B, an NI x NJ scratch matrix.
+    tmp: ptr<float> = call @matrix_new NI NJ;
+
+    i: int = const 0;
+.part1_i:
+    cond: bool = lt i NI;
+    br cond .part1_i_body .part1_i_done;
+.part1_i_body:
+    j: int = const 0;
+.part1_j:
+    cond: bool = lt j NJ;
+    br cond .part1_j_body .part1_j_done;
+.part1_j_body:
+    call @matrix_set tmp i j NJ fzero;  # clear the accumulator before summing over k
+    k: int = const 0;
+.part1_k:
+    cond: bool = lt k NK;
+    br cond .part1_k_body .part1_k_done;
+.part1_k_body:
+    Aik: float = call @matrix_get A i k NK;
+    Bkj: float = call @matrix_get B k j NJ;
+    incr: float = fmul alpha Aik;
+    incr: float = fmul incr Bkj;  # alpha * A[i][k] * B[k][j]
+    call @matrix_incr tmp i j NJ incr;
+    k: int = add k one;
+    jmp .part1_k;
+.part1_k_done:
+    j: int = add j one;
+    jmp .part1_j;
+.part1_j_done:
+    i: int = add i one;
+    jmp .part1_i;
+.part1_i_done:
+    # Part 2: D := beta * D + tmp * C, updating D in place.
+    i: int = const 0;
+.part2_i:
+    cond: bool = lt i NI;
+    br cond .part2_i_body .part2_i_done;
+.part2_i_body:
+    j: int = const 0;
+.part2_j:
+    cond: bool = lt j NL;
+    br cond .part2_j_body .part2_j_done;
+.part2_j_body:
+    call @matrix_scale D i j NL beta;  # scale first, then accumulate tmp[i][k] * C[k][j]
+    k: int = const 0;
+.part2_k:
+    cond: bool = lt k NJ;
+    br cond .part2_k_body .part2_k_done;
+.part2_k_body:
+    tmpik: float = call @matrix_get tmp i k NJ;
+    Ckj: float = call @matrix_get C k j NL;
+    incr: float = fmul tmpik Ckj;
+    call @matrix_incr D i j NL incr;
+    k: int = add k one;
+    jmp .part2_k;
+.part2_k_done:
+    j: int = add j one;
+    jmp .part2_j;
+.part2_j_done:
+    i: int = add i one;
+    jmp .part2_i;
+.part2_i_done:
+
+    call @matrix_print D NI NL;
+
+    res: float = load D;  # first element of D, read before D is freed
+    free A;
+    free B;
+    free C;
+    free D;
+    free tmp;
+    print res;  # printed once more, after the full matrix dump above
+}
+
+@init(A: ptr<float>, B: ptr<float>, C: ptr<float>, D: ptr<float>, 
+      NI: int, fNI: float, NJ: int, fNJ: float, NK: int, fNK: float, NL: int, fNL: float) {  # fills A, B, C, D in place
+    one: int = const 1;
+    fzero: float = const 0;  # NOTE(review): never read in @init
+    fone: float = const 1;
+    ftwo: float = const 2;
+    fthree: float = const 3;
+
+    i: int = const 0;
+    fi: float = const 0;  # fi / fj track i / j as floats for the arithmetic below
+.init_A_i:
+    cond: bool = lt i NI;
+    br cond .init_A_i_body .init_A_i_done;
+.init_A_i_body:
+    j: int = const 0;
+    fj: float = const 0;
+.init_A_j:
+    cond: bool = lt j NK;
+    br cond .init_A_j_body .init_A_j_done;
+.init_A_j_body:
+    val: float = fmul fi fj;
+    val: float = fadd val fone;
+    val: float = call @fmod val fNI;
+    val: float = fdiv val fNI;
+    call @matrix_set A i j NK val;  # A[i][j] = fmod(i*j + 1, NI) / NI
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_A_j;
+.init_A_j_done:
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_A_i;
+.init_A_i_done:
+
+    i: int = const 0;
+    fi: float = const 0;
+.init_B_i:
+    cond: bool = lt i NK;
+    br cond .init_B_i_body .init_B_i_done;
+.init_B_i_body:
+    j: int = const 0;
+    fj: float = const 0;
+.init_B_j:
+    cond: bool = lt j NJ;
+    br cond .init_B_j_body .init_B_j_done;
+.init_B_j_body:
+    val: float = fadd fj fone;
+    val: float = fmul fi val;
+    val: float = call @fmod val fNJ;
+    val: float = fdiv val fNJ;
+    call @matrix_set B i j NJ val;  # B[i][j] = fmod(i*(j + 1), NJ) / NJ
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_B_j;
+.init_B_j_done:
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_B_i;
+.init_B_i_done:
+    
+    i: int = const 0;
+    fi: float = const 0;
+.init_C_i:
+    cond: bool = lt i NJ;
+    br cond .init_C_i_body .init_C_i_done;
+.init_C_i_body:
+    j: int = const 0;
+    fj: float = const 0;
+.init_C_j:
+    cond: bool = lt j NL;
+    br cond .init_C_j_body .init_C_j_done;
+.init_C_j_body:
+    val: float = fadd fj fthree;
+    val: float = fmul fi val;
+    val: float = fadd val fone;
+    val: float = call @fmod val fNL;
+    val: float = fdiv val fNL;
+    call @matrix_set C i j NL val;  # C[i][j] = fmod(i*(j + 3) + 1, NL) / NL
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_C_j;
+.init_C_j_done:
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_C_i;
+.init_C_i_done:
+
+    i: int = const 0;
+    fi: float = const 0;
+.init_D_i:
+    cond: bool = lt i NI;
+    br cond .init_D_i_body .init_D_i_done;
+.init_D_i_body:
+    j: int = const 0;
+    fj: float = const 0;
+.init_D_j:
+    cond: bool = lt j NL;
+    br cond .init_D_j_body .init_D_j_done;
+.init_D_j_body:
+    val: float = fadd fj ftwo;
+    val: float = fmul fi val;
+    val: float = call @fmod val fNK;
+    val: float = fdiv val fNK;
+    call @matrix_set D i j NL val;  # D[i][j] = fmod(i*(j + 2), NK) / NK
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_D_j;
+.init_D_j_done:
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_D_i;
+.init_D_i_done:
+}
+
+@matrix_new(Nrow: int, Ncol: int): ptr<float> {  # allocate room for Nrow*Ncol floats
+    count: int = mul Nrow Ncol;
+    buf: ptr<float> = alloc count;  # no element is written here
+    ret buf;  # returned to the caller; nothing is freed here
+}
+
+@matrix_loc(mtx: ptr<float>, row: int, col: int, Ncol: int): ptr<float> {  # address of element (row, col)
+    base: int = mul row Ncol;  # flat index of the first element of the row
+    idx: int = add base col;  # flat index = row * Ncol + col
+    cell: ptr<float> = ptradd mtx idx;  # no bounds check is performed
+    ret cell;
+}
+
+# Returns the element of mtx that @matrix_loc resolves for (row, col, Ncol).
+# Requires @matrix_loc to be defined in the same program.
+@matrix_get(mtx: ptr<float>, row: int, col: int, Ncol: int): float {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    elem: float = load cell;
+    ret elem;
+}
+
+# Stores val into the element of mtx that @matrix_loc resolves for (row, col, Ncol).
+# Requires @matrix_loc to be defined in the same program.
+@matrix_set(mtx: ptr<float>, row: int, col: int, Ncol: int, val: float) {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    store cell val;
+}
+
+# Adds incr, in place, to the element of mtx that @matrix_loc resolves for (row, col, Ncol).
+# Requires @matrix_loc to be defined in the same program.
+@matrix_incr(mtx: ptr<float>, row: int, col: int, Ncol: int, incr: float) {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    old: float = load cell;
+    sum: float = fadd old incr;
+    store cell sum;
+}
+
+# Multiplies, in place, the element of mtx that @matrix_loc resolves for (row, col, Ncol) by scale.
+# Requires @matrix_loc to be defined in the same program.
+@matrix_scale(mtx: ptr<float>, row: int, col: int, Ncol: int, scale: float) {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    old: float = load cell;
+    scaled: float = fmul old scale;
+    store cell scaled;
+}
+
+@matrix_print(mtx: ptr<float>, Nrow: int, Ncol: int) {  # print all Nrow*Ncol elements, one `print` each
+    count: int = mul Nrow Ncol;
+    step: int = const 1;
+    idx: int = const 0;
+.next:
+    more: bool = lt idx count;
+    br more .emit .finish;
+.emit:
+    cell: ptr<float> = ptradd mtx idx;  # elements are visited in storage order
+    elem: float = load cell;
+    print elem;
+    idx: int = add idx step;
+    jmp .next;
+.finish:
+}
+
+# Floating-point remainder of n by m, computed by repeated subtraction.
+# Each outer pass doubles a chunk, starting from m, until it no longer
+# fits in what is left, halves it once, and subtracts that chunk.
+# Takes O((log n/m)^2) time. If n < m on entry, n is returned unchanged.
+# m is expected to be positive; the loops are not guarded against m <= 0.
+# NOTE: C has a library function for this; here it is done by hand.
+@fmod(n: float, m: float): float {
+    two: float = const 2;
+    zero: float = const 0;
+    left: float = id n;
+.outer:
+    more: bool = fge left m;
+    br more .grow_init .finish;
+.grow_init:
+    chunk: float = id m;
+.grow:
+    slack: float = fsub left chunk;
+    fits: bool = fge slack zero;
+    br fits .double .shrink;
+.double:
+    chunk: float = fmul chunk two;
+    jmp .grow;
+.shrink:
+    chunk: float = fdiv chunk two;  # step back to the last chunk that fit
+    left: float = fsub left chunk;
+    jmp .outer;
+.finish:
+    ret left;
+}
\ No newline at end of file
diff --git a/tests/slow/polybench/linear-algebra/kernels/3mm-test.bril b/tests/slow/polybench/linear-algebra/kernels/3mm-test.bril
new file mode 100644
index 000000000..831c630f2
--- /dev/null
+++ b/tests/slow/polybench/linear-algebra/kernels/3mm-test.bril
@@ -0,0 +1,346 @@
+## 3mm computes G := (A * B) * (C * D)
+## for some procedurally generated matrices A, B, C, D
+
+@main {
+    # Runs the 3mm kernel on generated inputs; prints the four inputs, then the result G.
+    # Sizes follow the Polybench MEDIUM_DATASET; each fN* is a float copy of the matching N*.
+
+    NI: int = const 180;
+    fNI: float = const 180;
+    NJ: int = const 190;
+    fNJ: float = const 190;
+    NK: int = const 200;
+    fNK: float = const 200;
+    NL: int = const 210;
+    fNL: float = const 210;
+    NM: int = const 220;
+    fNM: float = const 220;
+
+    one: int = const 1;
+    fzero: float = const 0;
+
+    # Allocate A (NI x NK), B (NK x NJ), C (NJ x NM), D (NM x NL), then fill them through @init.
+    A: ptr<float> = call @matrix_new NI NK;
+    B: ptr<float> = call @matrix_new NK NJ;
+    C: ptr<float> = call @matrix_new NJ NM;
+    D: ptr<float> = call @matrix_new NM NL;
+    call @init A B C D NI fNI NJ fNJ NK fNK NL fNL NM fNM;
+    call @matrix_print A NI NK;  # the generated inputs are part of the program output
+    call @matrix_print B NK NJ;
+    call @matrix_print C NJ NM;
+    call @matrix_print D NM NL;
+    # main computation
+    # computes G := (A * B) * (C * D)
+
+    # Part 1: E := A * B, an NI x NJ matrix.
+    E: ptr<float> = call @matrix_new NI NJ;
+
+    i: int = const 0;
+.part1_i:
+    cond: bool = lt i NI;
+    br cond .part1_i_body .part1_i_done;
+.part1_i_body:
+    j: int = const 0;
+.part1_j:
+    cond: bool = lt j NJ;
+    br cond .part1_j_body .part1_j_done;
+.part1_j_body:
+    call @matrix_set E i j NJ fzero;  # clear the accumulator before summing over k
+    k: int = const 0;
+.part1_k:
+    cond: bool = lt k NK;
+    br cond .part1_k_body .part1_k_done;
+.part1_k_body:
+    Aik: float = call @matrix_get A i k NK;
+    Bkj: float = call @matrix_get B k j NJ;
+    incr: float = fmul Aik Bkj;
+    call @matrix_incr E i j NJ incr;
+    k: int = add k one;
+    jmp .part1_k;
+.part1_k_done:
+    j: int = add j one;
+    jmp .part1_j;
+.part1_j_done:
+    i: int = add i one;
+    jmp .part1_i;
+.part1_i_done:
+    
+    # Part 2: F := C * D, an NJ x NL matrix.
+    F: ptr<float> = call @matrix_new NJ NL;
+
+    i: int = const 0;
+.part2_i:
+    cond: bool = lt i NJ;
+    br cond .part2_i_body .part2_i_done;
+.part2_i_body:
+    j: int = const 0;
+.part2_j:
+    cond: bool = lt j NL;
+    br cond .part2_j_body .part2_j_done;
+.part2_j_body:
+    call @matrix_set F i j NL fzero;  # clear the accumulator before summing over k
+    k: int = const 0;
+.part2_k:
+    cond: bool = lt k NM;
+    br cond .part2_k_body .part2_k_done;
+.part2_k_body:
+    Cik: float = call @matrix_get C i k NM;
+    Dkj: float = call @matrix_get D k j NL;
+    incr: float = fmul Cik Dkj;
+    call @matrix_incr F i j NL incr;
+    k: int = add k one;
+    jmp .part2_k;
+.part2_k_done:
+    j: int = add j one;
+    jmp .part2_j;
+.part2_j_done:
+    i: int = add i one;
+    jmp .part2_i;
+.part2_i_done:
+
+    # Part 3: G := E * F, an NI x NL matrix.
+    G: ptr<float> = call @matrix_new NI NL;
+
+    i: int = const 0;
+.part3_i:
+    cond: bool = lt i NI;
+    br cond .part3_i_body .part3_i_done;
+.part3_i_body:
+    j: int = const 0;
+.part3_j:
+    cond: bool = lt j NL;
+    br cond .part3_j_body .part3_j_done;
+.part3_j_body:
+    call @matrix_set G i j NL fzero;  # clear the accumulator before summing over k
+    k: int = const 0;
+.part3_k:
+    cond: bool = lt k NJ;
+    br cond .part3_k_body .part3_k_done;
+.part3_k_body:
+    Eik: float = call @matrix_get E i k NJ;
+    Fkj: float = call @matrix_get F k j NL;
+    incr: float = fmul Eik Fkj;
+    call @matrix_incr G i j NL incr;
+    k: int = add k one;
+    jmp .part3_k;
+.part3_k_done:
+    j: int = add j one;
+    jmp .part3_j;
+.part3_j_done:
+    i: int = add i one;
+    jmp .part3_i;
+.part3_i_done:
+
+    call @matrix_print G NI NL;
+
+    free A;
+    free B;
+    free C;
+    free D;
+    free E;
+    free F;
+    free G;
+}
+
+@init(A: ptr<float>, B: ptr<float>, C: ptr<float>, D: ptr<float>,
+      NI: int, fNI: float, NJ: int, fNJ: float, NK: int, fNK: float, NL: int, fNL: float, NM: int, fNM: float) {  # fills A, B, C, D in place
+    one: int = const 1;
+    fzero: float = const 0;  # NOTE(review): never read in @init
+    fone: float = const 1;
+    ftwo: float = const 2;
+    fthree: float = const 3;
+    ffive: float = const 5;
+
+    i: int = const 0;
+    fi: float = const 0;  # fi / fj track i / j as floats for the arithmetic below
+.init_A_i:
+    cond: bool = lt i NI;
+    br cond .init_A_i_body .init_A_i_done;
+.init_A_i_body:
+    j: int = const 0;
+    fj: float = const 0;
+.init_A_j:
+    cond: bool = lt j NK;
+    br cond .init_A_j_body .init_A_j_done;
+.init_A_j_body:
+    val: float = fmul fi fj;
+    val: float = fadd val fone;
+    val: float = call @fmod val fNI;
+    denom: float = fmul ffive fNI;
+    val: float = fdiv val denom;
+    call @matrix_set A i j NK val;  # A[i][j] = fmod(i*j + 1, NI) / (5*NI)
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_A_j;
+.init_A_j_done:
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_A_i;
+.init_A_i_done:
+
+    i: int = const 0;
+    fi: float = const 0;
+.init_B_i:
+    cond: bool = lt i NK;
+    br cond .init_B_i_body .init_B_i_done;
+.init_B_i_body:
+    j: int = const 0;
+    fj: float = const 0;
+.init_B_j:
+    cond: bool = lt j NJ;
+    br cond .init_B_j_body .init_B_j_done;
+.init_B_j_body:
+    val: float = fadd fj fone;
+    val: float = fmul fi val;
+    val: float = fadd val ftwo;
+    val: float = call @fmod val fNJ;
+    denom: float = fmul ffive fNJ;
+    val: float = fdiv val denom;
+    call @matrix_set B i j NJ val;  # B[i][j] = fmod(i*(j + 1) + 2, NJ) / (5*NJ)
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_B_j;
+.init_B_j_done:
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_B_i;
+.init_B_i_done:
+    
+    i: int = const 0;
+    fi: float = const 0;
+.init_C_i:
+    cond: bool = lt i NJ;
+    br cond .init_C_i_body .init_C_i_done;
+.init_C_i_body:
+    j: int = const 0;
+    fj: float = const 0;
+.init_C_j:
+    cond: bool = lt j NM;
+    br cond .init_C_j_body .init_C_j_done;
+.init_C_j_body:
+    val: float = fadd fj fthree;
+    val: float = fmul fi val;
+    val: float = call @fmod val fNL;
+    denom: float = fmul ffive fNL;
+    val: float = fdiv val denom;
+    call @matrix_set C i j NM val;  # C[i][j] = fmod(i*(j + 3), NL) / (5*NL)
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_C_j;
+.init_C_j_done:
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_C_i;
+.init_C_i_done:
+
+    i: int = const 0;
+    fi: float = const 0;
+.init_D_i:
+    cond: bool = lt i NM;
+    br cond .init_D_i_body .init_D_i_done;
+.init_D_i_body:
+    j: int = const 0;
+    fj: float = const 0;
+.init_D_j:
+    cond: bool = lt j NL;
+    br cond .init_D_j_body .init_D_j_done;
+.init_D_j_body:
+    val: float = fadd fj ftwo;
+    val: float = fmul fi val;
+    val: float = fadd val ftwo;
+    val: float = call @fmod val fNK;
+    denom: float = fmul ffive fNK;
+    val: float = fdiv val denom;
+    call @matrix_set D i j NL val;  # D[i][j] = fmod(i*(j + 2) + 2, NK) / (5*NK)
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_D_j;
+.init_D_j_done:
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_D_i;
+.init_D_i_done:
+}
+
+@matrix_new(Nrow: int, Ncol: int): ptr<float> {  # allocate room for Nrow*Ncol floats
+    count: int = mul Nrow Ncol;
+    buf: ptr<float> = alloc count;  # no element is written here
+    ret buf;  # returned to the caller; nothing is freed here
+}
+
+@matrix_loc(mtx: ptr<float>, row: int, col: int, Ncol: int): ptr<float> {  # address of element (row, col)
+    base: int = mul row Ncol;  # flat index of the first element of the row
+    idx: int = add base col;  # flat index = row * Ncol + col
+    cell: ptr<float> = ptradd mtx idx;  # no bounds check is performed
+    ret cell;
+}
+
+# Returns the element of mtx that @matrix_loc resolves for (row, col, Ncol).
+# Requires @matrix_loc to be defined in the same program.
+@matrix_get(mtx: ptr<float>, row: int, col: int, Ncol: int): float {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    elem: float = load cell;
+    ret elem;
+}
+
+# Stores val into the element of mtx that @matrix_loc resolves for (row, col, Ncol).
+# Requires @matrix_loc to be defined in the same program.
+@matrix_set(mtx: ptr<float>, row: int, col: int, Ncol: int, val: float) {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    store cell val;
+}
+
+# Adds incr, in place, to the element of mtx that @matrix_loc resolves for (row, col, Ncol).
+# Requires @matrix_loc to be defined in the same program.
+@matrix_incr(mtx: ptr<float>, row: int, col: int, Ncol: int, incr: float) {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    old: float = load cell;
+    sum: float = fadd old incr;
+    store cell sum;
+}
+
+@matrix_print(mtx: ptr<float>, Nrow: int, Ncol: int) {  # print all Nrow*Ncol elements, one `print` each
+    count: int = mul Nrow Ncol;
+    step: int = const 1;
+    idx: int = const 0;
+.next:
+    more: bool = lt idx count;
+    br more .emit .finish;
+.emit:
+    cell: ptr<float> = ptradd mtx idx;  # elements are visited in storage order
+    elem: float = load cell;
+    print elem;
+    idx: int = add idx step;
+    jmp .next;
+.finish:
+}
+
+# Floating-point remainder of n by m, computed by repeated subtraction.
+# Each outer pass doubles a chunk, starting from m, until it no longer
+# fits in what is left, halves it once, and subtracts that chunk.
+# Takes O((log n/m)^2) time. If n < m on entry, n is returned unchanged.
+# m is expected to be positive; the loops are not guarded against m <= 0.
+# NOTE: C has a library function for this; here it is done by hand.
+@fmod(n: float, m: float): float {
+    two: float = const 2;
+    zero: float = const 0;
+    left: float = id n;
+.outer:
+    more: bool = fge left m;
+    br more .grow_init .finish;
+.grow_init:
+    chunk: float = id m;
+.grow:
+    slack: float = fsub left chunk;
+    fits: bool = fge slack zero;
+    br fits .double .shrink;
+.double:
+    chunk: float = fmul chunk two;
+    jmp .grow;
+.shrink:
+    chunk: float = fdiv chunk two;  # step back to the last chunk that fit
+    left: float = fsub left chunk;
+    jmp .outer;
+.finish:
+    ret left;
+}
\ No newline at end of file
diff --git a/tests/slow/polybench/linear-algebra/kernels/atax-test.bril b/tests/slow/polybench/linear-algebra/kernels/atax-test.bril
new file mode 100644
index 000000000..d21410f9b
--- /dev/null
+++ b/tests/slow/polybench/linear-algebra/kernels/atax-test.bril
@@ -0,0 +1,219 @@
+## atax computes y := A^T * A * x
+## for some procedurally generated matrix A and 
+## some procedurally generated vector x
+
+@main {  # entry point: allocate and fill A and x, compute y, print y, free all buffers
+    # constants
+    # dimensions correspond to Polybench MEDIUM_DATASET
+
+    M: int = const 390;
+    fM: float = const 390;  # float copy of M, handed to @init
+    N: int = const 410;
+    fN: float = const 410;  # float copy of N, handed to @init
+
+    one: int = const 1;
+    fzero: float = const 0;
+
+    # initialize arrays
+    A: ptr<float> = call @matrix_new M N;  # M rows, N columns
+    x: ptr<float> = call @vector_new N;
+    call @init A x M fM N fN;
+
+    # main computation
+    # computes y := A^T * A * x
+
+    y: ptr<float> = call @vector_new N;
+    i: int = const 0;
+.init_y:  # zero all N entries of y
+    cond: bool = lt i N;
+    br cond .init_y_body .init_y_done;
+.init_y_body:
+    call @vector_set y i fzero;
+    i: int = add i one;
+    jmp .init_y;
+.init_y_done:
+    
+    tmp: ptr<float> = call @vector_new M;  # tmp[i] holds row i of A dotted with x
+    i: int = const 0;
+.main_i:  # one pass per row i of A
+    cond: bool = lt i M;
+    br cond .main_i_body .main_i_done;
+.main_i_body:
+    call @vector_set tmp i fzero;
+    j: int = const 0;
+.main_j1:  # tmp[i] += A[i][j] * x[j] for every column j
+    cond: bool = lt j N;
+    br cond .main_j1_body .main_j1_done;
+.main_j1_body:
+    Aij: float = call @matrix_get A i j N;
+    xj: float = call @vector_get x j;
+    tmpi: float = call @vector_get tmp i;
+    new_tmpi: float = fmul Aij xj;
+    new_tmpi: float = fadd tmpi new_tmpi;
+    call @vector_set tmp i new_tmpi;
+    j: int = add j one;
+    jmp .main_j1;
+.main_j1_done:
+    j: int = const 0;
+.main_j2:  # y[j] += A[i][j] * tmp[i] for every column j
+    cond: bool = lt j N;
+    br cond .main_j2_body .main_j2_done;
+.main_j2_body:
+    Aij: float = call @matrix_get A i j N;
+    tmpi: float = call @vector_get tmp i;
+    yj: float = call @vector_get y j;
+    new_yj: float = fmul Aij tmpi;
+    new_yj: float = fadd yj new_yj;
+    call @vector_set y j new_yj;
+    j: int = add j one;
+    jmp .main_j2;
+.main_j2_done:
+    i: int = add i one;
+    jmp .main_i;
+.main_i_done:
+    
+    call @vector_print y N;  # the program's only output
+
+    free A;
+    free x;
+    free y;
+    free tmp;
+}
+
+@init(A: ptr<float>, x: ptr<float>, 
+      M: int, fM: float, N: int, fN: float) {  # fills x (N entries) and A (M rows, N columns)
+    one: int = const 1;
+    fzero: float = const 0;  # NOTE: not read anywhere in this function
+    fone: float = const 1;
+    ffive: float = const 5;
+
+    i: int = const 0;
+    fi: float = const 0;  # fi tracks i as a float
+.init_x:  # x[i] = 1 + i / fN
+    cond: bool = lt i N;
+    br cond .init_x_body .init_x_done;
+.init_x_body:
+    val: float = fdiv fi fN;
+    val: float = fadd fone val;
+    call @vector_set x i val;
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_x;
+.init_x_done:
+
+    i: int = const 0;
+    fi: float = const 0;
+.init_A_i:  # for each row i
+    cond: bool = lt i M;
+    br cond .init_A_i_body .init_A_i_done;
+.init_A_i_body:
+    j: int = const 0;
+    fj: float = const 0;  # fj tracks j as a float
+.init_A_j:  # A[i][j] = fmod(i + j, fN) / (5 * fM)
+    cond: bool = lt j N;
+    br cond .init_A_j_body .init_A_j_done;
+.init_A_j_body:
+    val: float = fadd fi fj;
+    val: float = call @fmod val fN;
+    denom: float = fmul ffive fM;
+    val: float = fdiv val denom;
+    call @matrix_set A i j N val;
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_A_j;
+.init_A_j_done:
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_A_i;
+.init_A_i_done:
+}
+
+@matrix_new(Nrow: int, Ncol: int): ptr<float> {  # allocate Nrow*Ncol floats; nothing is stored into them here
+    cells: int = mul Nrow Ncol;
+    buf: ptr<float> = alloc cells;
+    ret buf;
+}
+
+@matrix_loc(mtx: ptr<float>, row: int, col: int, Ncol: int): ptr<float> {  # address of (row, col): mtx + row*Ncol + col
+    skipped: int = mul row Ncol;
+    index: int = add skipped col;
+    cell: ptr<float> = ptradd mtx index;
+    ret cell;
+}
+
+# EXPECTS:
+#   @matrix_loc defined
+@matrix_get(mtx: ptr<float>, row: int, col: int, Ncol: int): float {  # load the float at the address @matrix_loc returns
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    elem: float = load cell;
+    ret elem;
+}
+
+# EXPECTS:
+#   @matrix_loc defined
+@matrix_set(mtx: ptr<float>, row: int, col: int, Ncol: int, val: float) {  # store val at the address @matrix_loc returns
+    slot: ptr<float> = call @matrix_loc mtx row col Ncol;
+    store slot val;
+}
+
+@vector_new(N: int): ptr<float> {  # allocate N floats; nothing is stored into them here
+    buf: ptr<float> = alloc N;
+    ret buf;
+}
+
+@vector_get(vec: ptr<float>, i: int): float {  # return vec[i]
+    cell: ptr<float> = ptradd vec i;
+    elem: float = load cell;
+    ret elem;
+}
+
+@vector_set(vec: ptr<float>, i: int, val: float) {  # vec[i] = val
+    slot: ptr<float> = ptradd vec i;
+    store slot val;
+}
+
+# EXPECTS:
+#   @vector_get defined
+@vector_print(vec: ptr<float>, N: int) {  # print vec[0] .. vec[N-1], one value per print
+    idx: int = const 0;
+    step: int = const 1;
+.next_elem:
+    more: bool = lt idx N;
+    br more .emit .finished;
+.emit:
+    elem: float = call @vector_get vec idx;
+    print elem;
+    idx: int = add idx step;
+    jmp .next_elem;
+.finished:
+}
+
+# Search for n % m where n and m are floats by
+# iteratively subtracting the largest m*2^k that
+# fits inside n. 
+# Takes O((log n/m)^2) time.
+# NOTE: In C, this can be done with a built in 
+# function, but this is the best we can do.
+@fmod(n: float, m: float): float {  # n < m is returned unchanged; loops forever if m <= 0 and n >= m
+    fzero: float = const 0;
+    ftwo: float = const 2;
+    left: float = id n;
+.reduce:
+    at_least_m: bool = fge left m;
+    br at_least_m .pick_chunk .finished;
+.pick_chunk:
+    chunk: float = id m;  # doubled below until it exceeds what is left
+.grow:
+    slack: float = fsub left chunk;
+    fits: bool = fge slack fzero;
+    br fits .double .subtract;
+.double:
+    chunk: float = fmul chunk ftwo;
+    jmp .grow;
+.subtract:
+    chunk: float = fdiv chunk ftwo;  # step back to the last chunk that still fit
+    left: float = fsub left chunk;
+    jmp .reduce;
+.finished:
+    ret left;
+}
\ No newline at end of file
diff --git a/tests/slow/polybench/linear-algebra/kernels/bicg-test.bril b/tests/slow/polybench/linear-algebra/kernels/bicg-test.bril
new file mode 100644
index 000000000..ddf680811
--- /dev/null
+++ b/tests/slow/polybench/linear-algebra/kernels/bicg-test.bril
@@ -0,0 +1,216 @@
+## bicg computes the BiCG Sub Kernel of BiCGStab Linear Solver
+## for some procedurally generated inputs
+
+@main {  # entry point: allocate and fill p, r and A, compute s and q, print both, free all buffers
+    # constants
+    # dimensions correspond to Polybench MEDIUM_DATASET
+
+    M: int = const 390;
+    fM: float = const 390;  # float copy of M, handed to @init
+    N: int = const 410;
+    fN: float = const 410;  # float copy of N, handed to @init
+
+    one: int = const 1;
+    fzero: float = const 0;
+
+    # initialize arrays
+    p: ptr<float> = call @vector_new M;
+    r: ptr<float> = call @vector_new N;
+    A: ptr<float> = call @matrix_new N M;  # N rows, M columns
+    call @init p r A N fN M fM;
+    
+    # main computation
+    s: ptr<float> = call @vector_new M;
+    q: ptr<float> = call @vector_new N;
+
+    i: int = const 0;
+.init_s_i:  # zero all M entries of s
+    cond: bool = lt i M;
+    br cond .init_s_i_body .init_s_i_done;
+.init_s_i_body:
+    call @vector_set s i fzero;
+    i: int = add i one;
+    jmp .init_s_i;
+.init_s_i_done:
+
+    i: int = const 0;
+.main_i:  # one pass per row i of A
+    cond: bool = lt i N;
+    br cond .main_i_body .main_i_done;
+.main_i_body:
+    call @vector_set q i fzero;  # q[i] is zeroed right before it is accumulated
+    j: int = const 0;
+.main_j:
+    cond: bool = lt j M;
+    br cond .main_j_body .main_j_done;
+.main_j_body:
+    ri: float = call @vector_get r i;
+    Aij: float = call @matrix_get A i j M;
+    sj: float = call @vector_get s j;
+    new_sj: float = fmul ri Aij;
+    new_sj: float = fadd sj new_sj;
+    call @vector_set s j new_sj;  # s[j] += r[i] * A[i][j]
+
+    Aij: float = call @matrix_get A i j M;
+    pj: float = call @vector_get p j;
+    qi: float = call @vector_get q i;
+    new_qi: float = fmul Aij pj;
+    new_qi: float = fadd qi new_qi;
+    call @vector_set q i new_qi;  # q[i] += A[i][j] * p[j]
+
+    j: int = add j one;
+    jmp .main_j;
+.main_j_done:
+    i: int = add i one;
+    jmp .main_i;
+.main_i_done:
+
+    call @vector_print s M;
+    call @vector_print q N;
+
+    free A;
+    free s;
+    free q;
+    free p;
+    free r;
+}
+
+@init(p: ptr<float>, r: ptr<float>, A: ptr<float>,
+      N: int, fN: float, M: int, fM: float) {  # fills p (M entries), r (N entries) and A (N rows of M columns)
+    one: int = const 1;
+    fzero: float = const 0;
+    fone: float = const 1;
+
+    i: int = const 0;
+    fi: float = const 0;  # fi tracks i as a float
+.init_p_i:  # p[i] = fmod(i, fM) / fM for i in [0, M)
+    cond: bool = lt i M;
+    br cond .init_p_i_body .init_p_i_done;
+.init_p_i_body:
+    val: float = call @fmod fi fM;
+    val: float = fdiv val fM;
+    call @vector_set p i val;
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_p_i;
+.init_p_i_done:
+
+    i: int = const 0;
+    fi: float = const 0;
+.init_Ar_i:  # for each row i in [0, N): set r[i], then fill row i of A
+    cond: bool = lt i N;
+    br cond .init_Ar_i_body .init_Ar_i_done;
+.init_Ar_i_body:
+    val: float = call @fmod fi fN;
+    val: float = fdiv val fN;
+    call @vector_set r i val;  # r[i] = fmod(i, fN) / fN
+    j: int = const 0;
+    fj: float = const 0;  # fj tracks j as a float
+.init_Ar_j:
+    cond: bool = lt j M;  # A has M columns per row; a bound of N would store past the end of the last row
+    br cond .init_Ar_j_body .init_Ar_j_done;
+.init_Ar_j_body:
+    val: float = fadd fj fone;
+    val: float = fmul val fi;
+    val: float = call @fmod val fN;
+    val: float = fdiv val fN;
+    call @matrix_set A i j M val;  # A[i][j] = fmod((j + 1) * i, fN) / fN
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_Ar_j;
+.init_Ar_j_done:
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_Ar_i;
+.init_Ar_i_done:
+}
+
+@matrix_new(Nrow: int, Ncol: int): ptr<float> {  # allocate Nrow*Ncol floats; nothing is stored into them here
+    cells: int = mul Nrow Ncol;
+    buf: ptr<float> = alloc cells;
+    ret buf;
+}
+
+@matrix_loc(mtx: ptr<float>, row: int, col: int, Ncol: int): ptr<float> {  # address of (row, col): mtx + row*Ncol + col
+    skipped: int = mul row Ncol;
+    index: int = add skipped col;
+    cell: ptr<float> = ptradd mtx index;
+    ret cell;
+}
+
+# EXPECTS:
+#   @matrix_loc defined
+@matrix_get(mtx: ptr<float>, row: int, col: int, Ncol: int): float {  # load the float at the address @matrix_loc returns
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    elem: float = load cell;
+    ret elem;
+}
+
+# EXPECTS:
+#   @matrix_loc defined
+@matrix_set(mtx: ptr<float>, row: int, col: int, Ncol: int, val: float) {  # store val at the address @matrix_loc returns
+    slot: ptr<float> = call @matrix_loc mtx row col Ncol;
+    store slot val;
+}
+
+@vector_new(N: int): ptr<float> {  # allocate N floats; nothing is stored into them here
+    buf: ptr<float> = alloc N;
+    ret buf;
+}
+
+@vector_get(vec: ptr<float>, i: int): float {  # return vec[i]
+    cell: ptr<float> = ptradd vec i;
+    elem: float = load cell;
+    ret elem;
+}
+
+@vector_set(vec: ptr<float>, i: int, val: float) {  # vec[i] = val
+    slot: ptr<float> = ptradd vec i;
+    store slot val;
+}
+
+# EXPECTS:
+#   @vector_get defined
+@vector_print(vec: ptr<float>, N: int) {  # print vec[0] .. vec[N-1], one value per print
+    idx: int = const 0;
+    step: int = const 1;
+.next_elem:
+    more: bool = lt idx N;
+    br more .emit .finished;
+.emit:
+    elem: float = call @vector_get vec idx;
+    print elem;
+    idx: int = add idx step;
+    jmp .next_elem;
+.finished:
+}
+
+# Search for n % m where n and m are floats by
+# iteratively subtracting the largest m*2^k that
+# fits inside n. 
+# Takes O((log n/m)^2) time.
+# NOTE: In C, this can be done with a built in 
+# function, but this is the best we can do.
+@fmod(n: float, m: float): float {  # n < m is returned unchanged; loops forever if m <= 0 and n >= m
+    fzero: float = const 0;
+    ftwo: float = const 2;
+    left: float = id n;
+.reduce:
+    at_least_m: bool = fge left m;
+    br at_least_m .pick_chunk .finished;
+.pick_chunk:
+    chunk: float = id m;  # doubled below until it exceeds what is left
+.grow:
+    slack: float = fsub left chunk;
+    fits: bool = fge slack fzero;
+    br fits .double .subtract;
+.double:
+    chunk: float = fmul chunk ftwo;
+    jmp .grow;
+.subtract:
+    chunk: float = fdiv chunk ftwo;  # step back to the last chunk that still fit
+    left: float = fsub left chunk;
+    jmp .reduce;
+.finished:
+    ret left;
+}
\ No newline at end of file
diff --git a/tests/slow/polybench/linear-algebra/kernels/doitgen-test.bril b/tests/slow/polybench/linear-algebra/kernels/doitgen-test.bril
new file mode 100644
index 000000000..24128959a
--- /dev/null
+++ b/tests/slow/polybench/linear-algebra/kernels/doitgen-test.bril
@@ -0,0 +1,290 @@
+## doitgen is a kernel for the MADNESS framework for 
+## adaptive multiresolution methods in multiwavelet bases,
+## a topic in quantum chemistry.
+
+@main {  # entry point: allocate and fill A and C4, update A in place, print A, then print A's first element again
+    # constants
+    # dimensions correspond to Polybench MEDIUM_DATASET
+
+    NQ: int = const 40;
+    fNQ: float = const 40;
+    NR: int = const 50;
+    fNR: float = const 50;
+    NP: int = const 60;
+    fNP: float = const 60;
+
+    zero: int = const 0;  # NOTE: not read anywhere in @main
+    one: int = const 1;
+    fzero: float = const 0;
+
+    # initialize arrays
+    A: ptr<float> = call @tensor_new NR NQ NP;  # indexed A[r][q][p]
+    C4: ptr<float> = call @matrix_new NP NP;
+
+    call @init A C4 NQ fNQ NR fNR NP fNP;
+
+    # main computation
+    sum: ptr<float> = call @vector_new NP;  # scratch vector reused for every (r, q) pair
+
+    r: int = const 0;
+.main_r:
+    cond: bool = lt r NR;
+    br cond .main_r_body .main_r_done;
+.main_r_body:
+    q: int = const 0;
+.main_q:
+    cond: bool = lt q NQ;
+    br cond .main_q_body .main_q_done;
+.main_q_body:
+    p: int = const 0;
+.main_p1:  # sum[p] = sum over s of A[r][q][s] * C4[s][p]
+    cond: bool = lt p NP;
+    br cond .main_p1_body .main_p1_done;
+.main_p1_body:
+    call @vector_set sum p fzero;
+    s: int = const 0;
+.main_s:
+    cond: bool = lt s NP;
+    br cond .main_s_body .main_s_done;
+.main_s_body:
+    Arqs: float = call @tensor_get A r q s NQ NP;
+    C4sp: float = call @matrix_get C4 s p NP;
+    incr: float = fmul Arqs C4sp;
+    call @vector_incr sum p incr;
+    s: int = add s one;
+    jmp .main_s;
+.main_s_done:
+    p: int = add p one;
+    jmp .main_p1;
+.main_p1_done:
+    p: int = const 0;
+.main_p2:  # A[r][q][p] = sum[p], written only after the whole sum vector is complete
+    cond: bool = lt p NP;
+    br cond .main_p2_body .main_p2_done;
+.main_p2_body:
+    sump: float = call @vector_get sum p;
+    call @tensor_set A r q p NQ NP sump;
+    p: int = add p one;
+    jmp .main_p2;
+.main_p2_done:
+    q: int = add q one;
+    jmp .main_q;
+.main_q_done:
+    r: int = add r one;
+    jmp .main_r;
+.main_r_done:
+
+    call @tensor_print A NR NQ NP;
+    res: float = load A;  # first element of A, read before A is freed
+
+    free A;
+    free sum;
+    free C4;
+    print res;
+}
+
+
+@init(A: ptr<float>, C4: ptr<float>,
+      NQ: int, fNQ: float, NR: int, fNR: float, NP: int, fNP: float) {  # fills tensor A (NR x NQ x NP) and matrix C4 (NP x NP)
+    one: int = const 1;
+    fone: float = const 1;
+
+    i: int = const 0;
+    fi: float = const 0;  # fi, fj and fk track i, j and k as floats
+.init_A_i:
+    cond: bool = lt i NR;
+    br cond .init_A_i_body .init_A_i_done;
+.init_A_i_body:
+    j: int = const 0;
+    fj: float = const 0;
+.init_A_j:
+    cond: bool = lt j NQ;
+    br cond .init_A_j_body .init_A_j_done;
+.init_A_j_body:
+    k: int = const 0;
+    fk: float = const 0;
+.init_A_k:  # A[i][j][k] = fmod(i * j + k, fNP) / fNP
+    cond: bool = lt k NP;
+    br cond .init_A_k_body .init_A_k_done;
+.init_A_k_body:
+    val: float = fmul fi fj;
+    val: float = fadd val fk;
+    val: float = call @fmod val fNP;
+    val: float = fdiv val fNP;
+    call @tensor_set A i j k NQ NP val; 
+    k: int = add k one;
+    fk: float = fadd fk fone;
+    jmp .init_A_k;
+.init_A_k_done:
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_A_j;
+.init_A_j_done:
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_A_i;
+.init_A_i_done:
+
+    i: int = const 0;
+    fi: float = const 0;
+.init_C4_i:
+    cond: bool = lt i NP;
+    br cond .init_C4_i_body .init_C4_i_done;
+.init_C4_i_body:
+    j: int = const 0;
+    fj: float = const 0;
+.init_C4_j:  # C4[i][j] = fmod(i * j, fNP) / fNP
+    cond: bool = lt j NP;
+    br cond .init_C4_j_body .init_C4_j_done;
+.init_C4_j_body:
+    val: float = fmul fi fj;
+    val: float = call @fmod val fNP;
+    val: float = fdiv val fNP;
+    call @matrix_set C4 i j NP val; 
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_C4_j;
+.init_C4_j_done:
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_C4_i;
+.init_C4_i_done:
+}
+
+@matrix_new(Nrow: int, Ncol: int): ptr<float> {  # allocate Nrow*Ncol floats; nothing is stored into them here
+    cells: int = mul Nrow Ncol;
+    buf: ptr<float> = alloc cells;
+    ret buf;
+}
+
+@matrix_loc(mtx: ptr<float>, row: int, col: int, Ncol: int): ptr<float> {  # address of (row, col): mtx + row*Ncol + col
+    skipped: int = mul row Ncol;
+    index: int = add skipped col;
+    cell: ptr<float> = ptradd mtx index;
+    ret cell;
+}
+
+# EXPECTS:
+#   @matrix_loc defined
+@matrix_get(mtx: ptr<float>, row: int, col: int, Ncol: int): float {  # load the float at the address @matrix_loc returns
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    elem: float = load cell;
+    ret elem;
+}
+
+# EXPECTS:
+#   @matrix_loc defined
+@matrix_set(mtx: ptr<float>, row: int, col: int, Ncol: int, val: float) {  # store val at the address @matrix_loc returns
+    slot: ptr<float> = call @matrix_loc mtx row col Ncol;
+    store slot val;
+}
+
+@tensor_new(Ni: int, Nj: int, Nk: int): ptr<float> {  # allocate Ni*Nj*Nk floats; nothing is stored into them here
+    plane: int = mul Ni Nj;
+    cells: int = mul plane Nk;
+    buf: ptr<float> = alloc cells;
+    ret buf;
+}
+
+@tensor_loc(tsr: ptr<float>, i: int, j: int, k: int, Nj: int, Nk: int): ptr<float> {  # address of (i, j, k): tsr + (i*Nj + j)*Nk + k
+    rows_before: int = mul i Nj;
+    row: int = add rows_before j;
+    cells_before: int = mul row Nk;
+    index: int = add cells_before k;
+    cell: ptr<float> = ptradd tsr index;
+    ret cell;
+}
+
+# EXPECTS:
+#   @tensor_loc defined
+@tensor_get(tsr: ptr<float>, i: int, j: int, k: int, Nj: int, Nk: int): float {  # load the float at the address @tensor_loc returns
+    cell: ptr<float> = call @tensor_loc tsr i j k Nj Nk;
+    elem: float = load cell;
+    ret elem;
+}
+
+# EXPECTS:
+#   @tensor_loc defined
+@tensor_set(tsr: ptr<float>, i: int, j: int, k: int, Nj: int, Nk: int, val: float) {  # store val at the address @tensor_loc returns
+    slot: ptr<float> = call @tensor_loc tsr i j k Nj Nk;
+    store slot val;
+}
+
+# EXPECTS:
+#   @tensor_loc defined
+@tensor_incr(tsr: ptr<float>, i: int, j: int, k: int, Nj: int, Nk: int, incr: float) {  # add incr in place to the addressed element
+    slot: ptr<float> = call @tensor_loc tsr i j k Nj Nk;
+    before: float = load slot;
+    after: float = fadd before incr;
+    store slot after;
+}
+
+@tensor_print(tsr: ptr<float>, Ni: int, Nj: int, Nk: int) {  # print all Ni*Nj*Nk floats in storage order
+    idx: int = const 0;
+    step: int = const 1;
+    plane: int = mul Ni Nj;
+    count: int = mul plane Nk;
+.next_elem:
+    more: bool = lt idx count;
+    br more .emit .finished;
+.emit:
+    cursor: ptr<float> = ptradd tsr idx;
+    elem: float = load cursor;
+    print elem;
+    idx: int = add idx step;
+    jmp .next_elem;
+.finished:
+}
+
+@vector_new(N: int): ptr<float> {  # allocate N floats; nothing is stored into them here
+    buf: ptr<float> = alloc N;
+    ret buf;
+}
+
+@vector_get(vec: ptr<float>, i: int): float {  # return vec[i]
+    cell: ptr<float> = ptradd vec i;
+    elem: float = load cell;
+    ret elem;
+}
+
+@vector_set(vec: ptr<float>, i: int, val: float) {  # vec[i] = val
+    slot: ptr<float> = ptradd vec i;
+    store slot val;
+}
+
+@vector_incr(vec: ptr<float>, i: int, incr: float) {  # vec[i] += incr
+    slot: ptr<float> = ptradd vec i;
+    before: float = load slot;
+    after: float = fadd before incr;
+    store slot after;
+}
+
+# Search for n % m where n and m are floats by
+# iteratively subtracting the largest m*2^k that
+# fits inside n. 
+# Takes O((log n/m)^2) time.
+# NOTE: In C, this can be done with a built in 
+# function, but this is the best we can do.
+@fmod(n: float, m: float): float {  # n < m is returned unchanged; loops forever if m <= 0 and n >= m
+    fzero: float = const 0;
+    ftwo: float = const 2;
+    left: float = id n;
+.reduce:
+    at_least_m: bool = fge left m;
+    br at_least_m .pick_chunk .finished;
+.pick_chunk:
+    chunk: float = id m;  # doubled below until it exceeds what is left
+.grow:
+    slack: float = fsub left chunk;
+    fits: bool = fge slack fzero;
+    br fits .double .subtract;
+.double:
+    chunk: float = fmul chunk ftwo;
+    jmp .grow;
+.subtract:
+    chunk: float = fdiv chunk ftwo;  # step back to the last chunk that still fit
+    left: float = fsub left chunk;
+    jmp .reduce;
+.finished:
+    ret left;
+}
diff --git a/tests/slow/polybench/linear-algebra/kernels/mvt-test.bril b/tests/slow/polybench/linear-algebra/kernels/mvt-test.bril
new file mode 100644
index 000000000..7fa9a7936
--- /dev/null
+++ b/tests/slow/polybench/linear-algebra/kernels/mvt-test.bril
@@ -0,0 +1,217 @@
+## mvt computes a matrix vector product and transpose
+
+@main {  # entry point: allocate and fill the vectors and A, update x1 and x2, print both, free all buffers
+    # constants
+    # dimensions correspond to Polybench MEDIUM_DATASET
+
+    N: int = const 400;
+    fN: float = const 400;  # float copy of N, handed to @init
+
+    one: int = const 1;
+
+    x1: ptr<float> = call @vector_new N;
+    x2: ptr<float> = call @vector_new N;
+    y_1: ptr<float> = call @vector_new N;
+    y_2: ptr<float> = call @vector_new N;
+    A: ptr<float> = call @matrix_new N N;
+
+    call @init x1 x2 y_1 y_2 A N fN;
+
+    i: int = const 0;
+.part1_i:  # part 1: x1[i] += A[i][j] * y_1[j] for all i, j
+    cond: bool = lt i N;
+    br cond .part1_i_body .part1_i_done;
+.part1_i_body:
+    j: int = const 0;
+.part1_j:
+    cond: bool = lt j N;
+    br cond .part1_j_body .part1_j_done;
+.part1_j_body:
+    x1i: float = call @vector_get x1 i;
+    Aij: float = call @matrix_get A i j N;
+    y_1j: float = call @vector_get y_1 j;
+    val: float = fmul Aij y_1j;
+    val: float = fadd val x1i;
+    call @vector_set x1 i val;
+    j: int = add j one;
+    jmp .part1_j;
+.part1_j_done:
+    i: int = add i one;
+    jmp .part1_i;
+.part1_i_done:
+
+    i: int = const 0;
+.part2_i:  # part 2: x2[i] += A[j][i] * y_2[j], i.e. A is read transposed
+    cond: bool = lt i N;
+    br cond .part2_i_body .part2_i_done;
+.part2_i_body:
+    j: int = const 0;
+.part2_j:
+    cond: bool = lt j N;
+    br cond .part2_j_body .part2_j_done;
+.part2_j_body:
+    x2i: float = call @vector_get x2 i;
+    Aji: float = call @matrix_get A j i N;
+    y_2j: float = call @vector_get y_2 j;
+    val: float = fmul Aji y_2j;
+    val: float = fadd val x2i;
+    call @vector_set x2 i val;
+    j: int = add j one;
+    jmp .part2_j;
+.part2_j_done:
+    i: int = add i one;
+    jmp .part2_i;
+.part2_i_done:
+
+    call @vector_print x1 N;
+    call @vector_print x2 N;
+
+    free x1;
+    free x2;
+    free y_1;
+    free y_2;
+    free A;
+}
+
+@init(x1: ptr<float>, x2: ptr<float>, y_1: ptr<float>, y_2: ptr<float>, A: ptr<float>,
+      N: int, fN: float) {  # fills the four N-entry vectors and the N x N matrix A
+    one: int = const 1;
+    fone: float = const 1;
+    fthree: float = const 3;
+    ffour: float = const 4;
+
+    i: int = const 0;
+    fi: float = const 0;  # fi tracks i as a float
+.init_i:
+    cond: bool = lt i N;
+    br cond .init_i_body .init_i_done;
+.init_i_body:
+    val: float = call @fmod fi fN;
+    val: float = fdiv val fN;
+    call @vector_set x1 i val;  # x1[i] = fmod(i, fN) / fN
+
+    val: float = fadd fi fone;
+    val: float = call @fmod val fN;
+    val: float = fdiv val fN;
+    call @vector_set x2 i val;  # x2[i] = fmod(i + 1, fN) / fN
+
+    val: float = fadd fi fthree;
+    val: float = call @fmod val fN;
+    val: float = fdiv val fN;
+    call @vector_set y_1 i val;  # y_1[i] = fmod(i + 3, fN) / fN
+
+    val: float = fadd fi ffour;
+    val: float = call @fmod val fN;
+    val: float = fdiv val fN;
+    call @vector_set y_2 i val;  # y_2[i] = fmod(i + 4, fN) / fN
+
+    j: int = const 0;
+    fj: float = const 0;  # fj tracks j as a float
+.init_j:
+    cond: bool = lt j N;
+    br cond .init_j_body .init_j_done;
+.init_j_body:
+    val: float = fmul fi fj;
+    val: float = call @fmod val fN;
+    val: float = fdiv val fN;
+    call @matrix_set A i j N val;  # A[i][j] = fmod(i * j, fN) / fN
+
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_j;
+.init_j_done:
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_i;
+.init_i_done:
+}
+
+@vector_new(N: int): ptr<float> {  # allocate N floats; nothing is stored into them here
+    buf: ptr<float> = alloc N;
+    ret buf;
+}
+
+@vector_get(vec: ptr<float>, i: int): float {  # return vec[i]
+    cell: ptr<float> = ptradd vec i;
+    elem: float = load cell;
+    ret elem;
+}
+
+@vector_set(vec: ptr<float>, i: int, val: float) {  # vec[i] = val
+    slot: ptr<float> = ptradd vec i;
+    store slot val;
+}
+
+# EXPECTS:
+#   @vector_get defined
+@vector_print(vec: ptr<float>, N: int) {  # print vec[0] .. vec[N-1], one value per print
+    idx: int = const 0;
+    step: int = const 1;
+.next_elem:
+    more: bool = lt idx N;
+    br more .emit .finished;
+.emit:
+    elem: float = call @vector_get vec idx;
+    print elem;
+    idx: int = add idx step;
+    jmp .next_elem;
+.finished:
+}
+
+@matrix_new(Nrow: int, Ncol: int): ptr<float> {  # allocate Nrow*Ncol floats; nothing is stored into them here
+    cells: int = mul Nrow Ncol;
+    buf: ptr<float> = alloc cells;
+    ret buf;
+}
+
+@matrix_loc(mtx: ptr<float>, row: int, col: int, Ncol: int): ptr<float> {  # address of (row, col): mtx + row*Ncol + col
+    skipped: int = mul row Ncol;
+    index: int = add skipped col;
+    cell: ptr<float> = ptradd mtx index;
+    ret cell;
+}
+
+# EXPECTS:
+#   @matrix_loc defined
+@matrix_get(mtx: ptr<float>, row: int, col: int, Ncol: int): float {  # load the float at the address @matrix_loc returns
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    elem: float = load cell;
+    ret elem;
+}
+
+# EXPECTS:
+#   @matrix_loc defined
+@matrix_set(mtx: ptr<float>, row: int, col: int, Ncol: int, val: float) {  # store val at the address @matrix_loc returns
+    slot: ptr<float> = call @matrix_loc mtx row col Ncol;
+    store slot val;
+}
+
+# Search for n % m where n and m are floats by
+# iteratively subtracting the largest m*2^k that
+# fits inside n. 
+# Takes O((log n/m)^2) time.
+# NOTE: In C, this can be done with a built in 
+# function, but this is the best we can do.
+@fmod(n: float, m: float): float {  # n < m is returned unchanged; loops forever if m <= 0 and n >= m
+    fzero: float = const 0;
+    ftwo: float = const 2;
+    left: float = id n;
+.reduce:
+    at_least_m: bool = fge left m;
+    br at_least_m .pick_chunk .finished;
+.pick_chunk:
+    chunk: float = id m;  # doubled below until it exceeds what is left
+.grow:
+    slack: float = fsub left chunk;
+    fits: bool = fge slack fzero;
+    br fits .double .subtract;
+.double:
+    chunk: float = fmul chunk ftwo;
+    jmp .grow;
+.subtract:
+    chunk: float = fdiv chunk ftwo;  # step back to the last chunk that still fit
+    left: float = fsub left chunk;
+    jmp .reduce;
+.finished:
+    ret left;
+}
\ No newline at end of file
diff --git a/tests/slow/polybench/linear-algebra/solvers/cholesky-test.bril b/tests/slow/polybench/linear-algebra/solvers/cholesky-test.bril
new file mode 100644
index 000000000..784d49d4c
--- /dev/null
+++ b/tests/slow/polybench/linear-algebra/solvers/cholesky-test.bril
@@ -0,0 +1,326 @@
+## cholesky computes the Cholesky factor of A in place, in A's lower triangle.
+## Cannot be translated to int.
+
+@main {  # entry point: allocate and fill A, factor it in place, print all of A, free it
+    # constants
+    # dimensions correspond to Polybench MEDIUM_DATASET
+
+    N: int = const 400;
+    fN: float = const 400;  # float copy of N, handed to @init
+
+    one: int = const 1;
+
+    # initialize arrays
+    A: ptr<float> = call @matrix_new N N;
+    call @init A N fN;
+
+    # main computation
+    i: int = const 0;
+.main_i:  # one pass per row i
+    cond: bool = lt i N;
+    br cond .main_i_body .main_i_done;
+.main_i_body:
+    j: int = const 0;
+.case1_j:  # case 1: entries left of the diagonal (j < i)
+    cond: bool = lt j i;
+    br cond .case1_j_body .case1_j_done;
+.case1_j_body:
+    k: int = const 0;
+.case1_k:  # A[i][j] -= A[i][k] * A[j][k] for k < j
+    cond: bool = lt k j;
+    br cond .case1_k_body .case1_k_done;
+.case1_k_body:
+    Aik: float = call @matrix_get A i k N;
+    Ajk: float = call @matrix_get A j k N;
+    decr: float = fmul Aik Ajk;
+    call @matrix_decr A i j N decr;
+    k: int = add k one;
+    jmp .case1_k;
+.case1_k_done:
+    Ajj: float = call @matrix_get A j j N;
+    call @matrix_div A i j N Ajj;  # A[i][j] /= A[j][j]
+    j: int = add j one;
+    jmp .case1_j;
+.case1_j_done:
+    k: int = const 0;
+.case2_k:  # case 2: the diagonal entry; A[i][i] -= A[i][k]^2 for k < i
+    cond: bool = lt k i;
+    br cond .case2_k_body .case2_k_done;
+.case2_k_body:
+    Aik: float = call @matrix_get A i k N;
+    decr: float = fmul Aik Aik;
+    call @matrix_decr A i i N decr;
+    k: int = add k one;
+    jmp .case2_k;
+.case2_k_done:
+    Aii: float = call @matrix_get A i i N;
+    sqrtAii: float = call @sqrt Aii;
+    call @matrix_set A i i N sqrtAii;  # A[i][i] = sqrt(A[i][i])
+    i: int = add i one;
+    jmp .main_i;
+.main_i_done:
+
+    call @matrix_print A N N;
+
+    free A;
+}
+
+@init(A: ptr<float>, N: int, fN: float) {  # fills the N x N matrix A, then replaces it with A * A^T
+    one: int = const 1;
+    fzero: float = const 0;
+    fone: float = const 1;
+    fminusone: float = const -1;
+
+    i: int = const 0;
+.init_i:  # for each row i
+    cond: bool = lt i N;
+    br cond .init_i_body .init_i_done;
+.init_i_body:
+    j: int = const 0;
+    fj: float = const 0;  # fj tracks j as a float
+.init_j1:  # columns j <= i: A[i][j] = 1 - fmod(j, fN) / fN
+    cond: bool = le j i;
+    br cond .init_j1_body .init_j1_done;
+.init_j1_body:
+    val: float = call @fmod fj fN;
+    val: float = fmul val fminusone;
+    val: float = fdiv val fN;
+    val: float = fadd val fone;
+    call @matrix_set A i j N val;
+
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_j1;
+.init_j1_done:
+    j: int = add i one;
+.init_j2:  # columns j > i: A[i][j] = 0
+    cond: bool = lt j N;
+    br cond .init_j2_body .init_j2_done;
+.init_j2_body:
+    call @matrix_set A i j N fzero;
+    j: int = add j one;
+    jmp .init_j2;
+.init_j2_done:
+    call @matrix_set A i i N fone;  # the diagonal entry is overwritten with 1
+    i: int = add i one;
+    jmp .init_i;
+.init_i_done:
+    B: ptr<float> = call @matrix_new N N;  # scratch matrix, freed at the end of this function
+
+    r: int = const 0;
+.init_B_r:  # zero every entry of B
+    cond: bool = lt r N;
+    br cond .init_B_r_body .init_B_r_done;
+.init_B_r_body:
+    s: int = const 0;
+.init_B_s:
+    cond: bool = lt s N;
+    br cond .init_B_s_body .init_B_s_done;
+.init_B_s_body:
+    call @matrix_set B r s N fzero;
+    s: int = add s one;
+    jmp .init_B_s;
+.init_B_s_done:
+    r: int = add r one;
+    jmp .init_B_r;
+.init_B_r_done:
+
+    t: int = const 0;
+.psd_t:  # B[r][s] += A[r][t] * A[s][t] over all t, r, s, i.e. B = A * A^T
+    cond: bool = lt t N;
+    br cond .psd_t_body .psd_t_done;
+.psd_t_body:
+    r: int = const 0;
+.psd1_r:
+    cond: bool = lt r N;
+    br cond .psd1_r_body .psd1_r_done;
+.psd1_r_body:
+    s: int = const 0;
+.psd1_s:
+    cond: bool = lt s N;
+    br cond .psd1_s_body .psd1_s_done;
+.psd1_s_body:
+    Art: float = call @matrix_get A r t N;
+    Ast: float = call @matrix_get A s t N;
+    incr: float = fmul Art Ast;
+    call @matrix_incr B r s N incr;
+    s: int = add s one;
+    jmp .psd1_s;
+.psd1_s_done:
+    r: int = add r one;
+    jmp .psd1_r;
+.psd1_r_done:
+    t: int = add t one;
+    jmp .psd_t;
+.psd_t_done:
+
+    r: int = const 0;
+.psd2_r:  # copy B back into A
+    cond: bool = lt r N;
+    br cond .psd2_r_body .psd2_r_done;
+.psd2_r_body:
+    s: int = const 0;
+.psd2_s:
+    cond: bool = lt s N;
+    br cond .psd2_s_body .psd2_s_done;
+.psd2_s_body:
+    Brs: float = call @matrix_get B r s N;
+    call @matrix_set A r s N Brs;
+    s: int = add s one;
+    jmp .psd2_s;
+.psd2_s_done:
+    r: int = add r one;
+    jmp .psd2_r;
+.psd2_r_done:
+    free B;
+}
+
+@matrix_new(Nrow: int, Ncol: int): ptr<float> {  # allocate Nrow*Ncol floats; nothing is stored into them here
+    cells: int = mul Nrow Ncol;
+    buf: ptr<float> = alloc cells;
+    ret buf;
+}
+
+@matrix_loc(mtx: ptr<float>, row: int, col: int, Ncol: int): ptr<float> {  # address of (row, col): mtx + row*Ncol + col
+    skipped: int = mul row Ncol;
+    index: int = add skipped col;
+    cell: ptr<float> = ptradd mtx index;
+    ret cell;
+}
+
+# EXPECTS:
+#   @matrix_loc defined
+@matrix_get(mtx: ptr<float>, row: int, col: int, Ncol: int): float {  # load the float at the address @matrix_loc returns
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    elem: float = load cell;
+    ret elem;
+}
+
+# EXPECTS:
+#   @matrix_loc defined
+@matrix_set(mtx: ptr<float>, row: int, col: int, Ncol: int, val: float) {  # store val at the address @matrix_loc returns
+    slot: ptr<float> = call @matrix_loc mtx row col Ncol;
+    store slot val;
+}
+
+# EXPECTS:
+#   @matrix_loc defined
+@matrix_incr(mtx: ptr<float>, row: int, col: int, Ncol: int, incr: float) {  # add incr in place to the addressed element
+    slot: ptr<float> = call @matrix_loc mtx row col Ncol;
+    before: float = load slot;
+    after: float = fadd before incr;
+    store slot after;
+}
+
+# EXPECTS:
+#   @matrix_loc defined
+@matrix_decr(mtx: ptr<float>, row: int, col: int, Ncol: int, decr: float) {  # subtract decr in place from the addressed element
+    slot: ptr<float> = call @matrix_loc mtx row col Ncol;
+    before: float = load slot;
+    after: float = fsub before decr;
+    store slot after;
+}
+
+# EXPECTS:
+#   @matrix_loc defined
+@matrix_div(mtx: ptr<float>, row: int, col: int, Ncol: int, div: float) {  # divide the addressed element in place by div
+    slot: ptr<float> = call @matrix_loc mtx row col Ncol;
+    before: float = load slot;
+    after: float = fdiv before div;
+    store slot after;
+}
+
+@matrix_print(mtx: ptr<float>, Nrow: int, Ncol: int) {  # print all Nrow*Ncol floats in storage order
+    idx: int = const 0;
+    step: int = const 1;
+    count: int = mul Nrow Ncol;
+.next_elem:
+    more: bool = lt idx count;
+    br more .emit .finished;
+.emit:
+    cursor: ptr<float> = ptradd mtx idx;
+    elem: float = load cursor;
+    print elem;
+    idx: int = add idx step;
+    jmp .next_elem;
+.finished:
+}
+
+# Loop Newton's method until convergence within a
+# multiplicative 1 +/- 1e-10 factor. That is,
+#         x_(i+1) = 1/2 (x_i + n / x_i).
+# 
+# Takes O(log n) iterations.
+# NOTE: C uses a hardware instruction to compute 
+# sqrt, but this is the best we can do.
+@sqrt(n: float): float {  # returns 0 for n == 0, NaN for NaN or negative n, otherwise the Newton estimate
+    # handle zero
+    zero: float = const 0;
+    is_zero: bool = feq n zero;
+    br is_zero .ret_zero .continue_zero;
+.ret_zero:
+    ret zero;
+.continue_zero:
+
+    # handle nan
+    is_not_nan: bool = feq n n;  # NaN is the only float that does not compare equal to itself
+    br is_not_nan .continue_nan .ret_nan;
+.ret_nan:  # also the branch target for negative inputs below
+    nan: float = fdiv zero zero;  # 0 / 0 produces NaN
+    ret nan;
+.continue_nan:
+
+    # handle negative
+    is_negative: bool = flt n zero;
+    br is_negative .ret_nan .continue_neg;
+.continue_neg:
+
+    # main case
+    two: float = const 2;
+    oneminuseps: float = const 0.9999999999;
+    onepluseps: float = const 1.0000000001;
+    ans: float = const 1;  # initial estimate
+.do:  # next = (ans + n / ans) / 2
+    next: float = fdiv n ans;
+    next: float = fadd ans next;
+    next: float = fdiv next two;
+.while:  # stop once next / ans lies within [oneminuseps, onepluseps]
+    quot: float = fdiv next ans;
+    ans: float = id next;  # adopt the new estimate before testing, so .done returns it
+    big_enough: bool = fge quot oneminuseps;
+    small_enough: bool = fle quot onepluseps;
+    good: bool = and big_enough small_enough;
+    br good .done .do;
+.done:
+    ret ans;
+}
+
+# Search for n % m where n and m are floats by
+# iteratively subtracting the largest m*2^k that
+# fits inside n. 
+# Takes O((log n/m)^2) time.
+# NOTE: In C, this can be done with a built in 
+# function, but this is the best we can do.
+@fmod(n: float, m: float): float {  # n < m is returned unchanged; loops forever if m <= 0 and n >= m
+    fzero: float = const 0;
+    ftwo: float = const 2;
+    left: float = id n;
+.reduce:
+    at_least_m: bool = fge left m;
+    br at_least_m .pick_chunk .finished;
+.pick_chunk:
+    chunk: float = id m;  # doubled below until it exceeds what is left
+.grow:
+    slack: float = fsub left chunk;
+    fits: bool = fge slack fzero;
+    br fits .double .subtract;
+.double:
+    chunk: float = fmul chunk ftwo;
+    jmp .grow;
+.subtract:
+    chunk: float = fdiv chunk ftwo;  # step back to the last chunk that still fit
+    left: float = fsub left chunk;
+    jmp .reduce;
+.finished:
+    ret left;
+}
\ No newline at end of file
diff --git a/tests/slow/polybench/linear-algebra/solvers/durbin-test.bril b/tests/slow/polybench/linear-algebra/solvers/durbin-test.bril
new file mode 100644
index 000000000..0ee58f13a
--- /dev/null
+++ b/tests/slow/polybench/linear-algebra/solvers/durbin-test.bril
@@ -0,0 +1,156 @@
+## durbin is a Toeplitz system solver.
+
+@main {
+    # constants
+    # dimensions correspond to Polybench MEDIUM_DATASET
+
+    N: int = const 400;
+    fN: float = const 400;
+
+    one: int = const 1;
+    zero: int = const 0;
+
+    fone: float = const 1;
+    fminusone: float = const -1;
+
+    r: ptr<float> = call @vector_new N;
+    call @init r N fN;
+
+    # main computation: builds y in place, with z as scratch space
+
+    y: ptr<float> = call @vector_new N;
+    z: ptr<float> = call @vector_new N;
+
+    r0: float = call @vector_get r zero;
+    val: float = fmul fminusone r0;
+    call @vector_set y zero val;  # y[0] = -r[0]
+
+    beta: float = const 1;
+    r0: float = call @vector_get r zero;  # r[0] is read a second time here
+    alpha: float = fmul fminusone r0;  # alpha = -r[0]
+    # outer loop: k = 1 .. N-1
+    k: int = const 1;
+.main_k:
+    cond: bool = lt k N;
+    br cond .main_k_body .main_k_done;
+.main_k_body:
+    val: float = fmul alpha alpha;
+    val: float = fsub fone val;
+    beta: float = fmul val beta;  # beta = (1 - alpha^2) * beta
+    sum: float = const 0;
+    i: int = const 0;
+.compute_sum_i:
+    cond: bool = lt i k;
+    br cond .compute_sum_i_body .compute_sum_i_done;
+.compute_sum_i_body:
+    index: int = sub k i;
+    index: int = sub index one;
+    rx: float = call @vector_get r index;
+    yi: float = call @vector_get y i;
+    incr: float = fmul rx yi;
+    sum: float = fadd sum incr;  # sum += r[k-i-1] * y[i]
+    i: int = add i one;
+    jmp .compute_sum_i;
+.compute_sum_i_done:
+
+    rk: float = call @vector_get r k;
+    val: float = fadd rk sum;
+    val: float = fdiv val beta;
+    alpha: float = fmul fminusone val;  # alpha = -(r[k] + sum) / beta
+
+    i: int = const 0;
+.compute_z_i:
+    cond: bool = lt i k;
+    br cond .compute_z_i_body .compute_z_i_done;
+.compute_z_i_body:
+    yi: float = call @vector_get y i;
+    index: int = sub k i;
+    index: int = sub index one;
+    yx: float = call @vector_get y index;
+    val: float = fmul alpha yx;
+    val: float = fadd yi val;
+    call @vector_set z i val;  # z[i] = y[i] + alpha * y[k-i-1]
+    i: int = add i one;
+    jmp .compute_z_i;
+.compute_z_i_done:
+
+    i: int = const 0;
+.set_y_i:
+    cond: bool = lt i k;
+    br cond .set_y_i_body .set_y_i_done;
+.set_y_i_body:
+    zi: float = call @vector_get z i;
+    call @vector_set y i zi;  # copy z[i] back into y[i]
+    i: int = add i one;
+    jmp .set_y_i;
+.set_y_i_done:
+    call @vector_set y k alpha;  # y[k] = alpha
+
+    k: int = add k one;
+    jmp .main_k;
+.main_k_done:
+
+    call @vector_print y N;
+
+    free r;
+    free y;
+    free z;
+}
+
+@init(r: ptr<float>, N: int, fN: float) {
+    step: int = const 1;
+    fstep: float = const 1;
+    # fill r[k] = fN + 1 - k for k in [0, N)
+    k: int = const 0;
+    fk: float = const 0;
+.fill_check:
+    more: bool = lt k N;
+    br more .fill_body .fill_done;
+.fill_body:
+    top: float = fadd fN fstep;
+    entry: float = fsub top fk;
+    call @vector_set r k entry;
+    k: int = add k step;
+    fk: float = fadd fk fstep;
+    jmp .fill_check;
+.fill_done:
+}
+
+@vector_new(N: int): ptr<float> {  # allocate room for N floats
+    buf: ptr<float> = alloc N;
+    ret buf;
+}
+
+@vector_get(vec: ptr<float>, i: int): float {  # read vec[i]
+    slot: ptr<float> = ptradd vec i;
+    elem: float = load slot;
+    ret elem;
+}
+
+@vector_set(vec: ptr<float>, i: int, val: float) {  # vec[i] = val
+    slot: ptr<float> = ptradd vec i;
+    store slot val;
+}
+
+@vector_incr(vec: ptr<float>, i: int, incr: float) {  # vec[i] += incr
+    slot: ptr<float> = ptradd vec i;
+    before: float = load slot;
+    after: float = fadd before incr;
+    store slot after;
+}
+
+# Print vec[0] .. vec[N-1], one value per print.
+# EXPECTS: @vector_get defined
+@vector_print(vec: ptr<float>, N: int) {
+    step: int = const 1;
+    pos: int = const 0;
+.check:
+    more: bool = lt pos N;
+    br more .emit .finish;
+.emit:
+    elem: float = call @vector_get vec pos;
+    print elem;
+    pos: int = add pos step;
+    jmp .check;
+.finish:
+}
\ No newline at end of file
diff --git a/tests/slow/polybench/linear-algebra/solvers/gramschmidt-test.bril b/tests/slow/polybench/linear-algebra/solvers/gramschmidt-test.bril
new file mode 100644
index 000000000..67c1b139c
--- /dev/null
+++ b/tests/slow/polybench/linear-algebra/solvers/gramschmidt-test.bril
@@ -0,0 +1,287 @@
+## gramschmidt performs QR decomposition
+## with modified Gram Schmidt.
+
+@main {
+    # constants
+    # NOTE(review): M=20, N=30 look like Polybench MINI_DATASET, not MEDIUM -- confirm
+
+    M: int = const 20;
+    fM: float = const 20;
+    N: int = const 30;
+    fN: float = const 30;
+
+    one: int = const 1;
+    fzero: float = const 0;
+
+    A: ptr<float> = call @matrix_new M N;
+    R: ptr<float> = call @matrix_new N N;
+    Q: ptr<float> = call @matrix_new M N;
+    call @init A R Q M fM N fN;
+    # one pass per column k of A
+    k: int = const 0;
+.main_k:
+    cond: bool = lt k N;
+    br cond .main_k_body .main_k_done;
+.main_k_body:
+    nrm: float = const 0;
+    i: int = const 0;
+.sum_nrm_i:
+    cond: bool = lt i M;
+    br cond .sum_nrm_i_body .sum_nrm_i_done;
+.sum_nrm_i_body:
+    Aik: float = call @matrix_get A i k N;
+    incr: float = fmul Aik Aik;
+    nrm: float = fadd nrm incr;  # nrm += A[i][k]^2
+    i: int = add i one;
+    jmp .sum_nrm_i;
+.sum_nrm_i_done:
+    sqrt_nrm: float = call @sqrt nrm;
+    call @matrix_set R k k N sqrt_nrm;  # R[k][k] = sqrt(nrm)
+    i: int = const 0;
+.setQ_i:
+    cond: bool = lt i M;
+    br cond .setQ_i_body .setQ_i_done;
+.setQ_i_body:
+    Aik: float = call @matrix_get A i k N;
+    Rkk: float = call @matrix_get R k k N;
+    val: float = fdiv Aik Rkk;
+    call @matrix_set Q i k N val;  # Q[i][k] = A[i][k] / R[k][k]
+    i: int = add i one;
+    jmp .setQ_i;
+.setQ_i_done:
+    j: int = add k one;  # remaining columns: j in (k, N)
+.inner_j:
+    cond: bool = lt j N;
+    br cond .inner_j_body .inner_j_done;
+.inner_j_body:
+    call @matrix_set R k j N fzero;
+    i: int = const 0;
+.compute_Rkj_i:
+    cond: bool = lt i M;
+    br cond .compute_Rkj_i_body .compute_Rkj_i_done;
+.compute_Rkj_i_body:
+    Qik: float = call @matrix_get Q i k N;
+    Aij: float = call @matrix_get A i j N;
+    incr: float = fmul Qik Aij;
+    call @matrix_incr R k j N incr;  # R[k][j] += Q[i][k] * A[i][j]
+    i: int = add i one;
+    jmp .compute_Rkj_i;
+.compute_Rkj_i_done:
+    i: int = const 0;
+.update_Aij_i:
+    cond: bool = lt i M;
+    br cond .update_Aij_i_body .update_Aij_i_done;
+.update_Aij_i_body:
+    Aij: float = call @matrix_get A i j N;
+    Qik: float = call @matrix_get Q i k N;
+    Rkj: float = call @matrix_get R k j N;
+    val: float = fmul Qik Rkj;
+    val: float = fsub Aij val;
+    call @matrix_set A i j N val;  # A[i][j] -= Q[i][k] * R[k][j]
+    i: int = add i one;
+    jmp .update_Aij_i;
+.update_Aij_i_done:
+
+    j: int = add j one;
+    jmp .inner_j;
+.inner_j_done:
+
+    k: int = add k one;
+    jmp .main_k;
+.main_k_done:
+
+    call @matrix_print R N N;
+    call @matrix_print Q M N;
+
+    free A;
+    free R;
+    free Q;
+}
+
+@init(A: ptr<float>, R: ptr<float>, Q: ptr<float>,
+      M: int, fM: float, N: int, fN: float) {
+    one: int = const 1;
+    fone: float = const 1;
+    fzero: float = const 0;
+    fhundred: float = const 100;
+    ften: float = const 10;
+    # A and Q are M x N; fi/fj are float copies of the loop counters
+    i: int = const 0;
+    fi: float = const 0;
+.init_AQ_i:
+    cond: bool = lt i M;
+    br cond .init_AQ_i_body .init_AQ_i_done;
+.init_AQ_i_body:
+    j: int = const 0;
+    fj: float = const 0;
+.init_AQ_j:
+    cond: bool = lt j N;
+    br cond .init_AQ_j_body .init_AQ_j_done;
+.init_AQ_j_body:
+    val: float = fmul fi fj;
+    val: float = call @fmod val fM;
+    val: float = fdiv val fM;
+    val: float = fmul val fhundred;
+    val: float = fadd val ften;
+    call @matrix_set A i j N val;  # A[i][j] = ((i*j) % M) / M * 100 + 10
+    call @matrix_set Q i j N fzero;  # Q[i][j] = 0
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_AQ_j;
+.init_AQ_j_done:
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_AQ_i;
+.init_AQ_i_done:
+    # R is N x N and starts out all zero
+    i: int = const 0;
+.init_R_i:
+    cond: bool = lt i N;
+    br cond .init_R_i_body .init_R_i_done;
+.init_R_i_body:
+    j: int = const 0;
+.init_R_j:
+    cond: bool = lt j N;
+    br cond .init_R_j_body .init_R_j_done;
+.init_R_j_body:
+    call @matrix_set R i j N fzero;
+    j: int = add j one;
+    jmp .init_R_j;
+.init_R_j_done:
+    i: int = add i one;
+    jmp .init_R_i;
+.init_R_i_done:
+}
+
+@matrix_new(Nrow: int, Ncol: int): ptr<float> {  # one flat Nrow*Ncol buffer
+    cells: int = mul Nrow Ncol;
+    buf: ptr<float> = alloc cells;
+    ret buf;
+}
+
+@matrix_loc(mtx: ptr<float>, row: int, col: int, Ncol: int): ptr<float> {
+    base: int = mul row Ncol;  # row-major: rows are Ncol cells apart
+    idx: int = add base col;
+    cell: ptr<float> = ptradd mtx idx;
+    ret cell;
+}
+
+# Read mtx[row][col], where each row is Ncol cells wide.
+# EXPECTS: @matrix_loc defined
+@matrix_get(mtx: ptr<float>, row: int, col: int, Ncol: int): float {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    elem: float = load cell;
+    ret elem;
+}
+
+# Write val into mtx[row][col], where each row is Ncol cells wide.
+# EXPECTS: @matrix_loc defined
+@matrix_set(mtx: ptr<float>, row: int, col: int, Ncol: int, val: float) {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    store cell val;
+}
+
+# Add incr to mtx[row][col] in place.
+# EXPECTS: @matrix_loc defined
+@matrix_incr(mtx: ptr<float>, row: int, col: int, Ncol: int, incr: float) {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    before: float = load cell;
+    after: float = fadd before incr;
+    store cell after;
+}
+
+@matrix_print(mtx: ptr<float>, Nrow: int, Ncol: int) {  # print every cell in storage order
+    cells: int = mul Nrow Ncol;
+    step: int = const 1;
+    pos: int = const 0;
+.check:
+    more: bool = lt pos cells;
+    br more .emit .finish;
+.emit:
+    cell: ptr<float> = ptradd mtx pos;
+    elem: float = load cell;
+    print elem;
+    pos: int = add pos step;
+    jmp .check;
+.finish:
+}
+
+
+# Floating-point remainder n % m, computed by repeatedly
+# peeling off the largest chunk m*2^k that still fits in
+# what is left of n. Takes O((log n/m)^2) time.
+# Expects n >= 0 and m > 0: an n below m comes back as is,
+# and a non-positive m can keep the doubling loop spinning.
+# NOTE: C has a built-in for this; here it is done by hand.
+@fmod(n: float, m: float): float {
+    fzero: float = const 0;
+    ftwo: float = const 2;
+    left: float = id n;
+.outer_check:
+    more: bool = fge left m;
+    br more .outer_body .finish;
+.outer_body:
+    chunk: float = id m;
+.grow_check:
+    slack: float = fsub left chunk;
+    fits: bool = fge slack fzero;
+    br fits .grow_body .grow_done;
+.grow_body:
+    chunk: float = fmul chunk ftwo;
+    jmp .grow_check;
+.grow_done:
+    chunk: float = fdiv chunk ftwo;
+    left: float = fsub left chunk;
+    jmp .outer_check;
+.finish:
+    ret left;
+}
+
+# Square root by Newton's iteration
+#         x_(i+1) = 1/2 (x_i + n / x_i),
+# started at x_0 = 1 and stopped once two successive
+# iterates agree to within a factor of 1 +/- 1e-10.
+# Zero yields zero; NaN and negative inputs yield NaN.
+# Takes O(log n) iterations.
+# NOTE: C would use a hardware sqrt; here it is done by hand.
+@sqrt(n: float): float {
+    # zero maps to zero
+    fzero: float = const 0;
+    n_is_zero: bool = feq n fzero;
+    br n_is_zero .zero_out .not_zero;
+.zero_out:
+    ret fzero;
+.not_zero:
+
+    # a NaN does not compare equal to itself
+    n_is_num: bool = feq n n;
+    br n_is_num .not_nan .nan_out;
+.nan_out:
+    bad: float = fdiv fzero fzero;
+    ret bad;
+.not_nan:
+
+    # negative inputs share the NaN exit
+    n_is_neg: bool = flt n fzero;
+    br n_is_neg .nan_out .not_neg;
+.not_neg:
+
+    # Newton iteration
+    ftwo: float = const 2;
+    lo_ratio: float = const 0.9999999999;
+    hi_ratio: float = const 1.0000000001;
+    guess: float = const 1;
+.step:
+    part: float = fdiv n guess;
+    total: float = fadd guess part;
+    refined: float = fdiv total ftwo;
+.check:
+    ratio: float = fdiv refined guess;
+    guess: float = id refined;
+    above: bool = fge ratio lo_ratio;
+    below: bool = fle ratio hi_ratio;
+    settled: bool = and above below;
+    br settled .finish .step;
+.finish:
+    ret guess;
+}
diff --git a/tests/slow/polybench/linear-algebra/solvers/lu-test.bril b/tests/slow/polybench/linear-algebra/solvers/lu-test.bril
new file mode 100644
index 000000000..dab2bcbd2
--- /dev/null
+++ b/tests/slow/polybench/linear-algebra/solvers/lu-test.bril
@@ -0,0 +1,283 @@
+## lu performs LU decomposition.
+
+@main {
+    # constants
+    # dimensions correspond to Polybench MEDIUM_DATASET
+
+    N: int = const 400;
+    fN: float = const 400;
+
+    one: int = const 1;
+
+    # initialize arrays
+    A: ptr<float> = call @matrix_new N N;
+    call @init A N fN;
+
+    # main computation: A is overwritten in place, one row i at a time
+    i: int = const 0;
+.main_i:
+    cond: bool = lt i N;
+    br cond .main_i_body .main_i_done;
+.main_i_body:
+    j: int = const 0;  # columns left of the diagonal: j in [0, i)
+.lower_j:
+    cond: bool = lt j i;
+    br cond .lower_j_body .lower_j_done;
+.lower_j_body:
+    k: int = const 0;
+.lower_k:
+    cond: bool = lt k j;
+    br cond .lower_k_body .lower_k_done;
+.lower_k_body:
+    Aik: float = call @matrix_get A i k N;
+    Akj: float = call @matrix_get A k j N;
+    decr: float = fmul Aik Akj;
+    call @matrix_decr A i j N decr;  # A[i][j] -= A[i][k] * A[k][j]
+    k: int = add k one;
+    jmp .lower_k;
+.lower_k_done:
+    Ajj: float = call @matrix_get A j j N;
+    call @matrix_div A i j N Ajj;  # A[i][j] /= A[j][j]
+    j: int = add j one;
+    jmp .lower_j;
+.lower_j_done:
+    j: int = id i;  # diagonal and to its right: j in [i, N)
+.upper_j:
+    cond: bool = lt j N;
+    br cond .upper_j_body .upper_j_done;
+.upper_j_body:
+    k: int = const 0;
+.upper_k:
+    cond: bool = lt k i;
+    br cond .upper_k_body .upper_k_done;
+.upper_k_body:
+    Aik: float = call @matrix_get A i k N;
+    Akj: float = call @matrix_get A k j N;
+    decr: float = fmul Aik Akj;
+    call @matrix_decr A i j N decr;  # A[i][j] -= A[i][k] * A[k][j]
+    k: int = add k one;
+    jmp .upper_k;
+.upper_k_done:
+    j: int = add j one;
+    jmp .upper_j;
+.upper_j_done:
+    i: int = add i one;
+    jmp .main_i;
+.main_i_done:
+
+    call @matrix_print A N N;
+
+    free A;
+}
+
+@init(A: ptr<float>, N: int, fN: float) {
+    one: int = const 1;
+    fzero: float = const 0;
+    fone: float = const 1;
+    fminusone: float = const -1;
+    # step 1: fill A row by row (lower part from a formula, rest zero, unit diagonal)
+    i: int = const 0;
+.init_i:
+    cond: bool = lt i N;
+    br cond .init_i_body .init_i_done;
+.init_i_body:
+    j: int = const 0;
+    fj: float = const 0;
+.init_j1:
+    cond: bool = le j i;
+    br cond .init_j1_body .init_j1_done;
+.init_j1_body:
+    val: float = call @fmod fj fN;
+    val: float = fmul val fminusone;
+    val: float = fdiv val fN;
+    val: float = fadd val fone;
+    call @matrix_set A i j N val;  # A[i][j] = -(j % N) / N + 1 for j <= i
+
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_j1;
+.init_j1_done:
+    j: int = add i one;
+.init_j2:
+    cond: bool = lt j N;
+    br cond .init_j2_body .init_j2_done;
+.init_j2_body:
+    call @matrix_set A i j N fzero;  # A[i][j] = 0 for j > i
+    j: int = add j one;
+    jmp .init_j2;
+.init_j2_done:
+    call @matrix_set A i i N fone;  # diagonal is overwritten with 1
+    i: int = add i one;
+    jmp .init_i;
+.init_i_done:
+    B: ptr<float> = call @matrix_new N N;
+    # step 2: zero the scratch matrix B
+    r: int = const 0;
+.init_B_r:
+    cond: bool = lt r N;
+    br cond .init_B_r_body .init_B_r_done;
+.init_B_r_body:
+    s: int = const 0;
+.init_B_s:
+    cond: bool = lt s N;
+    br cond .init_B_s_body .init_B_s_done;
+.init_B_s_body:
+    call @matrix_set B r s N fzero;
+    s: int = add s one;
+    jmp .init_B_s;
+.init_B_s_done:
+    r: int = add r one;
+    jmp .init_B_r;
+.init_B_r_done:
+    # step 3: B[r][s] = sum over t of A[r][t] * A[s][t]
+    t: int = const 0;
+.psd_t:
+    cond: bool = lt t N;
+    br cond .psd_t_body .psd_t_done;
+.psd_t_body:
+    r: int = const 0;
+.psd1_r:
+    cond: bool = lt r N;
+    br cond .psd1_r_body .psd1_r_done;
+.psd1_r_body:
+    s: int = const 0;
+.psd1_s:
+    cond: bool = lt s N;
+    br cond .psd1_s_body .psd1_s_done;
+.psd1_s_body:
+    Art: float = call @matrix_get A r t N;
+    Ast: float = call @matrix_get A s t N;
+    incr: float = fmul Art Ast;
+    call @matrix_incr B r s N incr;
+    s: int = add s one;
+    jmp .psd1_s;
+.psd1_s_done:
+    r: int = add r one;
+    jmp .psd1_r;
+.psd1_r_done:
+    t: int = add t one;
+    jmp .psd_t;
+.psd_t_done:
+    # step 4: copy B back over A, then release B
+    r: int = const 0;
+.psd2_r:
+    cond: bool = lt r N;
+    br cond .psd2_r_body .psd2_r_done;
+.psd2_r_body:
+    s: int = const 0;
+.psd2_s:
+    cond: bool = lt s N;
+    br cond .psd2_s_body .psd2_s_done;
+.psd2_s_body:
+    Brs: float = call @matrix_get B r s N;
+    call @matrix_set A r s N Brs;
+    s: int = add s one;
+    jmp .psd2_s;
+.psd2_s_done:
+    r: int = add r one;
+    jmp .psd2_r;
+.psd2_r_done:
+    free B;
+}
+
+
+# Floating-point remainder n % m, computed by repeatedly
+# peeling off the largest chunk m*2^k that still fits in
+# what is left of n. Takes O((log n/m)^2) time.
+# Expects n >= 0 and m > 0: an n below m comes back as is,
+# and a non-positive m can keep the doubling loop spinning.
+# NOTE: C has a built-in for this; here it is done by hand.
+@fmod(n: float, m: float): float {
+    fzero: float = const 0;
+    ftwo: float = const 2;
+    left: float = id n;
+.outer_check:
+    more: bool = fge left m;
+    br more .outer_body .finish;
+.outer_body:
+    chunk: float = id m;
+.grow_check:
+    slack: float = fsub left chunk;
+    fits: bool = fge slack fzero;
+    br fits .grow_body .grow_done;
+.grow_body:
+    chunk: float = fmul chunk ftwo;
+    jmp .grow_check;
+.grow_done:
+    chunk: float = fdiv chunk ftwo;
+    left: float = fsub left chunk;
+    jmp .outer_check;
+.finish:
+    ret left;
+}
+
+@matrix_new(Nrow: int, Ncol: int): ptr<float> {  # one flat Nrow*Ncol buffer
+    cells: int = mul Nrow Ncol;
+    buf: ptr<float> = alloc cells;
+    ret buf;
+}
+
+@matrix_loc(mtx: ptr<float>, row: int, col: int, Ncol: int): ptr<float> {
+    base: int = mul row Ncol;  # row-major: rows are Ncol cells apart
+    idx: int = add base col;
+    cell: ptr<float> = ptradd mtx idx;
+    ret cell;
+}
+
+# Read mtx[row][col], where each row is Ncol cells wide.
+# EXPECTS: @matrix_loc defined
+@matrix_get(mtx: ptr<float>, row: int, col: int, Ncol: int): float {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    elem: float = load cell;
+    ret elem;
+}
+
+# Write val into mtx[row][col], where each row is Ncol cells wide.
+# EXPECTS: @matrix_loc defined
+@matrix_set(mtx: ptr<float>, row: int, col: int, Ncol: int, val: float) {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    store cell val;
+}
+
+# Add incr to mtx[row][col] in place.
+# EXPECTS: @matrix_loc defined
+@matrix_incr(mtx: ptr<float>, row: int, col: int, Ncol: int, incr: float) {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    before: float = load cell;
+    after: float = fadd before incr;
+    store cell after;
+}
+
+# Subtract decr from mtx[row][col] in place.
+# EXPECTS: @matrix_loc defined
+@matrix_decr(mtx: ptr<float>, row: int, col: int, Ncol: int, decr: float) {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    before: float = load cell;
+    after: float = fsub before decr;
+    store cell after;
+}
+
+# Divide mtx[row][col] by div in place.
+# EXPECTS: @matrix_loc defined
+@matrix_div(mtx: ptr<float>, row: int, col: int, Ncol: int, div: float) {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    before: float = load cell;
+    after: float = fdiv before div;
+    store cell after;
+}
+
+@matrix_print(mtx: ptr<float>, Nrow: int, Ncol: int) {  # print every cell in storage order
+    cells: int = mul Nrow Ncol;
+    step: int = const 1;
+    pos: int = const 0;
+.check:
+    more: bool = lt pos cells;
+    br more .emit .finish;
+.emit:
+    cell: ptr<float> = ptradd mtx pos;
+    elem: float = load cell;
+    print elem;
+    pos: int = add pos step;
+    jmp .check;
+.finish:
+}
\ No newline at end of file
diff --git a/tests/slow/polybench/linear-algebra/solvers/ludcmp-test.bril b/tests/slow/polybench/linear-algebra/solvers/ludcmp-test.bril
new file mode 100644
index 000000000..1c9322fca
--- /dev/null
+++ b/tests/slow/polybench/linear-algebra/solvers/ludcmp-test.bril
@@ -0,0 +1,366 @@
+## ludcmp performs LU decomposition followed
+## by forward and backward substitution.
+
+@main {
+    # constants
+    # dimensions correspond to Polybench MEDIUM_DATASET
+
+    N: int = const 400;
+    fN: float = const 400;
+
+    one: int = const 1;
+    zero: int = const 0;
+
+    # initialize arrays
+    A: ptr<float> = call @matrix_new N N;
+    b: ptr<float> = call @vector_new N;
+    x: ptr<float> = call @vector_new N;
+    y: ptr<float> = call @vector_new N;
+
+    call @init A b x y N fN;
+    # phase 1: factor A in place, one row i at a time
+    i: int = const 0;
+.main_i:
+    cond: bool = lt i N;
+    br cond .main_i_body .main_i_done;
+.main_i_body:
+    j: int = const 0;  # columns left of the diagonal: j in [0, i)
+.lower_j:
+    cond: bool = lt j i;
+    br cond .lower_j_body .lower_j_done;
+.lower_j_body:
+    w: float = call @matrix_get A i j N;
+    k: int = const 0;
+.lower_k:
+    cond: bool = lt k j;
+    br cond .lower_k_body .lower_k_done;
+.lower_k_body:
+    Aik: float = call @matrix_get A i k N;
+    Akj: float = call @matrix_get A k j N;
+    decr: float = fmul Aik Akj;
+    w: float = fsub w decr;  # w -= A[i][k] * A[k][j]
+    k: int = add k one;
+    jmp .lower_k;
+.lower_k_done:
+    Ajj: float = call @matrix_get A j j N;
+    val: float = fdiv w Ajj;
+    call @matrix_set A i j N val;  # A[i][j] = w / A[j][j]
+    j: int = add j one;
+    jmp .lower_j;
+.lower_j_done:
+    j: int = id i;  # diagonal and to its right: j in [i, N)
+.upper_j:
+    cond: bool = lt j N;
+    br cond .upper_j_body .upper_j_done;
+.upper_j_body:
+    w: float = call @matrix_get A i j N;
+    k: int = const 0;
+.upper_k:
+    cond: bool = lt k i;
+    br cond .upper_k_body .upper_k_done;
+.upper_k_body:
+    Aik: float = call @matrix_get A i k N;
+    Akj: float = call @matrix_get A k j N;
+    decr: float = fmul Aik Akj;
+    w: float = fsub w decr;  # w -= A[i][k] * A[k][j]
+    k: int = add k one;
+    jmp .upper_k;
+.upper_k_done:
+    call @matrix_set A i j N w;  # A[i][j] = w
+    j: int = add j one;
+    jmp .upper_j;
+.upper_j_done:
+    i: int = add i one;
+    jmp .main_i;
+.main_i_done:
+    # phase 2: forward pass, y[i] = b[i] - sum_{j<i} A[i][j] * y[j]
+    i: int = const 0;
+.set_y_i:
+    cond: bool = lt i N;
+    br cond .set_y_i_body .set_y_i_done;
+.set_y_i_body:
+    w: float = call @vector_get b i;
+    j: int = const 0;
+.set_y_j:
+    cond: bool = lt j i;
+    br cond .set_y_j_body .set_y_j_done;
+.set_y_j_body:
+    Aij: float = call @matrix_get A i j N;
+    yj: float = call @vector_get y j;
+    decr: float = fmul Aij yj;
+    w: float = fsub w decr;
+    j: int = add j one;
+    jmp .set_y_j;
+.set_y_j_done:
+    call @vector_set y i w;
+    i: int = add i one;
+    jmp .set_y_i;
+.set_y_i_done:
+    # phase 3: backward pass, i runs from N-1 down to 0
+    i: int = sub N one;
+.set_x_i:
+    cond: bool = ge i zero;
+    br cond .set_x_i_body .set_x_i_done;
+.set_x_i_body:
+    w: float = call @vector_get y i;
+    j: int = add i one;
+.set_x_j:
+    cond: bool = lt j N;
+    br cond .set_x_j_body .set_x_j_done;
+.set_x_j_body:
+    Aij: float = call @matrix_get A i j N;
+    xj: float = call @vector_get x j;
+    decr: float = fmul Aij xj;
+    w: float = fsub w decr;  # w -= A[i][j] * x[j]
+    j: int = add j one;
+    jmp .set_x_j;
+.set_x_j_done:
+    Aii: float = call @matrix_get A i i N;
+    val: float = fdiv w Aii;
+    call @vector_set x i val;  # x[i] = w / A[i][i]
+    i: int = sub i one;
+    jmp .set_x_i;
+.set_x_i_done:
+
+    call @vector_print x N;
+
+    free A;
+    free x;
+    free y;
+    free b;
+
+}
+
+
+@init(A: ptr<float>, b: ptr<float>, x: ptr<float>, y: ptr<float>,
+      N: int, fN: float) {
+    one: int = const 1;
+    fzero: float = const 0;
+    fone: float = const 1;
+    fminusone: float = const -1;
+
+    ftwo: float = const 2;
+    ffour: float = const 4;
+    # step 0: x and y start at zero; b[i] = (i + 1) / N / 2 + 4
+    i: int = const 0;
+    fi: float = const 0;
+.init_xyb_i:
+    cond: bool = lt i N;
+    br cond .init_xyb_i_body .init_xyb_i_done;
+.init_xyb_i_body:
+    call @vector_set x i fzero;
+    call @vector_set y i fzero;
+    val: float = fadd fi fone;
+    val: float = fdiv val fN;
+    val: float = fdiv val ftwo;
+    val: float = fadd val ffour;
+    call @vector_set b i val;
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_xyb_i;
+.init_xyb_i_done:
+
+    # step 1: fill A row by row (lower part from a formula, rest zero, unit diagonal)
+    i: int = const 0;
+.init_i:
+    cond: bool = lt i N;
+    br cond .init_i_body .init_i_done;
+.init_i_body:
+    j: int = const 0;
+    fj: float = const 0;
+.init_j1:
+    cond: bool = le j i;
+    br cond .init_j1_body .init_j1_done;
+.init_j1_body:
+    val: float = call @fmod fj fN;
+    val: float = fmul val fminusone;
+    val: float = fdiv val fN;
+    val: float = fadd val fone;
+    call @matrix_set A i j N val;  # A[i][j] = -(j % N) / N + 1 for j <= i
+
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_j1;
+.init_j1_done:
+    j: int = add i one;
+.init_j2:
+    cond: bool = lt j N;
+    br cond .init_j2_body .init_j2_done;
+.init_j2_body:
+    call @matrix_set A i j N fzero;  # A[i][j] = 0 for j > i
+    j: int = add j one;
+    jmp .init_j2;
+.init_j2_done:
+    call @matrix_set A i i N fone;  # diagonal is overwritten with 1
+    i: int = add i one;
+    jmp .init_i;
+.init_i_done:
+    B: ptr<float> = call @matrix_new N N;
+    # step 2: zero the scratch matrix B
+    r: int = const 0;
+.init_B_r:
+    cond: bool = lt r N;
+    br cond .init_B_r_body .init_B_r_done;
+.init_B_r_body:
+    s: int = const 0;
+.init_B_s:
+    cond: bool = lt s N;
+    br cond .init_B_s_body .init_B_s_done;
+.init_B_s_body:
+    call @matrix_set B r s N fzero;
+    s: int = add s one;
+    jmp .init_B_s;
+.init_B_s_done:
+    r: int = add r one;
+    jmp .init_B_r;
+.init_B_r_done:
+    # step 3: B[r][s] = sum over t of A[r][t] * A[s][t]
+    t: int = const 0;
+.psd_t:
+    cond: bool = lt t N;
+    br cond .psd_t_body .psd_t_done;
+.psd_t_body:
+    r: int = const 0;
+.psd1_r:
+    cond: bool = lt r N;
+    br cond .psd1_r_body .psd1_r_done;
+.psd1_r_body:
+    s: int = const 0;
+.psd1_s:
+    cond: bool = lt s N;
+    br cond .psd1_s_body .psd1_s_done;
+.psd1_s_body:
+    Art: float = call @matrix_get A r t N;
+    Ast: float = call @matrix_get A s t N;
+    incr: float = fmul Art Ast;
+    call @matrix_incr B r s N incr;
+    s: int = add s one;
+    jmp .psd1_s;
+.psd1_s_done:
+    r: int = add r one;
+    jmp .psd1_r;
+.psd1_r_done:
+    t: int = add t one;
+    jmp .psd_t;
+.psd_t_done:
+    # step 4: copy B back over A, then release B
+    r: int = const 0;
+.psd2_r:
+    cond: bool = lt r N;
+    br cond .psd2_r_body .psd2_r_done;
+.psd2_r_body:
+    s: int = const 0;
+.psd2_s:
+    cond: bool = lt s N;
+    br cond .psd2_s_body .psd2_s_done;
+.psd2_s_body:
+    Brs: float = call @matrix_get B r s N;
+    call @matrix_set A r s N Brs;
+    s: int = add s one;
+    jmp .psd2_s;
+.psd2_s_done:
+    r: int = add r one;
+    jmp .psd2_r;
+.psd2_r_done:
+    free B;
+}
+
+@matrix_new(Nrow: int, Ncol: int): ptr<float> {  # one flat Nrow*Ncol buffer
+    cells: int = mul Nrow Ncol;
+    buf: ptr<float> = alloc cells;
+    ret buf;
+}
+
+@matrix_loc(mtx: ptr<float>, row: int, col: int, Ncol: int): ptr<float> {
+    base: int = mul row Ncol;  # row-major: rows are Ncol cells apart
+    idx: int = add base col;
+    cell: ptr<float> = ptradd mtx idx;
+    ret cell;
+}
+
+# Read mtx[row][col], where each row is Ncol cells wide.
+# EXPECTS: @matrix_loc defined
+@matrix_get(mtx: ptr<float>, row: int, col: int, Ncol: int): float {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    elem: float = load cell;
+    ret elem;
+}
+
+# Write val into mtx[row][col], where each row is Ncol cells wide.
+# EXPECTS: @matrix_loc defined
+@matrix_set(mtx: ptr<float>, row: int, col: int, Ncol: int, val: float) {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    store cell val;
+}
+
+# Add incr to mtx[row][col] in place.
+# EXPECTS: @matrix_loc defined
+@matrix_incr(mtx: ptr<float>, row: int, col: int, Ncol: int, incr: float) {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    before: float = load cell;
+    after: float = fadd before incr;
+    store cell after;
+}
+
+@vector_new(N: int): ptr<float> {  # allocate room for N floats
+    buf: ptr<float> = alloc N;
+    ret buf;
+}
+
+@vector_get(vec: ptr<float>, i: int): float {  # read vec[i]
+    slot: ptr<float> = ptradd vec i;
+    elem: float = load slot;
+    ret elem;
+}
+
+@vector_set(vec: ptr<float>, i: int, val: float) {  # vec[i] = val
+    slot: ptr<float> = ptradd vec i;
+    store slot val;
+}
+
+# Print vec[0] .. vec[N-1], one value per print.
+# EXPECTS: @vector_get defined
+@vector_print(vec: ptr<float>, N: int) {
+    step: int = const 1;
+    pos: int = const 0;
+.check:
+    more: bool = lt pos N;
+    br more .emit .finish;
+.emit:
+    elem: float = call @vector_get vec pos;
+    print elem;
+    pos: int = add pos step;
+    jmp .check;
+.finish:
+}
+
+
+# Floating-point remainder n % m, computed by repeatedly
+# peeling off the largest chunk m*2^k that still fits in
+# what is left of n. Takes O((log n/m)^2) time.
+# Expects n >= 0 and m > 0: an n below m comes back as is,
+# and a non-positive m can keep the doubling loop spinning.
+# NOTE: C has a built-in for this; here it is done by hand.
+@fmod(n: float, m: float): float {
+    fzero: float = const 0;
+    ftwo: float = const 2;
+    left: float = id n;
+.outer_check:
+    more: bool = fge left m;
+    br more .outer_body .finish;
+.outer_body:
+    chunk: float = id m;
+.grow_check:
+    slack: float = fsub left chunk;
+    fits: bool = fge slack fzero;
+    br fits .grow_body .grow_done;
+.grow_body:
+    chunk: float = fmul chunk ftwo;
+    jmp .grow_check;
+.grow_done:
+    chunk: float = fdiv chunk ftwo;
+    left: float = fsub left chunk;
+    jmp .outer_check;
+.finish:
+    ret left;
+}
\ No newline at end of file
diff --git a/tests/slow/polybench/linear-algebra/solvers/trisolv-test.bril b/tests/slow/polybench/linear-algebra/solvers/trisolv-test.bril
new file mode 100644
index 000000000..990aa5035
--- /dev/null
+++ b/tests/slow/polybench/linear-algebra/solvers/trisolv-test.bril
@@ -0,0 +1,157 @@
+## trisolv is a triangular solver.
+
+@main {
+    # constants
+    # dimensions correspond to Polybench MEDIUM_DATASET
+
+    N: int = const 400;
+    fN: float = const 400;
+
+    one: int = const 1;
+    zero: int = const 0;
+
+    # initialize arrays
+    L: ptr<float> = call @matrix_new N N;
+    b: ptr<float> = call @vector_new N;
+    x: ptr<float> = call @vector_new N;
+    call @init L b x N fN;
+
+    # main computation: x[i] = (b[i] - sum_{j<i} L[i][j] * x[j]) / L[i][i]
+    i: int = const 0;
+.main_i:
+    cond: bool = lt i N;
+    br cond .main_i_body .main_i_done;
+.main_i_body:
+    bi: float = call @vector_get b i;
+    call @vector_set x i bi;  # x[i] starts as b[i]
+    j: int = const 0;
+.main_j:
+    cond: bool = lt j i;
+    br cond .main_j_body .main_j_done;
+.main_j_body:
+    Lij: float = call @matrix_get L i j N;
+    xj: float = call @vector_get x j;
+    val: float = fmul Lij xj;
+    call @vector_decr x i val;  # x[i] -= L[i][j] * x[j]
+    j: int = add j one;
+    jmp .main_j;
+.main_j_done:
+    xi: float = call @vector_get x i;
+    Lii: float = call @matrix_get L i i N;
+    val: float = fdiv xi Lii;
+    call @vector_set x i val;  # x[i] /= L[i][i]
+    i: int = add i one;
+    jmp .main_i;
+.main_i_done:
+
+    call @vector_print x N;
+
+    free L;
+    free x;
+    free b;
+}
+
+@init(L: ptr<float>, b: ptr<float>, x: ptr<float>,
+      N: int, fN: float) {
+    one: int = const 1;
+    fone: float = const 1;
+    ftwo: float = const 2;
+    finvalid: float = const -999;
+    # fi/fj are float copies of the loop counters i/j
+    i: int = const 0;
+    fi: float = const 0;
+.init_i:
+    cond: bool = lt i N;
+    br cond .init_i_body .init_i_done;
+.init_i_body:
+    call @vector_set x i finvalid;  # x[i] = -999 placeholder
+    call @vector_set b i fi;  # b[i] = i
+    j: int = const 0;
+    fj: float = const 0;
+.init_j:
+    cond: bool = le j i;  # only columns j <= i of row i are written
+    br cond .init_j_body .init_j_done;
+.init_j_body:
+    val: float = fadd fi fN;
+    val: float = fsub val fj;
+    val: float = fadd val fone;
+    val: float = fmul val ftwo;
+    val: float = fdiv val fN;
+    call @matrix_set L i j N val;  # L[i][j] = (i + N - j + 1) * 2 / N
+
+    j: int = add j one;
+    fj: float = fadd fj fone;
+    jmp .init_j;
+.init_j_done:
+    i: int = add i one;
+    fi: float = fadd fi fone;
+    jmp .init_i;
+.init_i_done:
+}
+
+@matrix_new(Nrow: int, Ncol: int): ptr<float> {  # one flat Nrow*Ncol buffer
+    cells: int = mul Nrow Ncol;
+    buf: ptr<float> = alloc cells;
+    ret buf;
+}
+
+@matrix_loc(mtx: ptr<float>, row: int, col: int, Ncol: int): ptr<float> {
+    base: int = mul row Ncol;  # row-major: rows are Ncol cells apart
+    idx: int = add base col;
+    cell: ptr<float> = ptradd mtx idx;
+    ret cell;
+}
+
+# Read mtx[row][col], where each row is Ncol cells wide.
+# EXPECTS: @matrix_loc defined
+@matrix_get(mtx: ptr<float>, row: int, col: int, Ncol: int): float {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    elem: float = load cell;
+    ret elem;
+}
+
+# Write val into mtx[row][col], where each row is Ncol cells wide.
+# EXPECTS: @matrix_loc defined
+@matrix_set(mtx: ptr<float>, row: int, col: int, Ncol: int, val: float) {
+    cell: ptr<float> = call @matrix_loc mtx row col Ncol;
+    store cell val;
+}
+
+@vector_new(N: int): ptr<float> {  # allocate room for N floats
+    buf: ptr<float> = alloc N;
+    ret buf;
+}
+
+@vector_get(vec: ptr<float>, i: int): float {  # read vec[i]
+    slot: ptr<float> = ptradd vec i;
+    elem: float = load slot;
+    ret elem;
+}
+
+@vector_set(vec: ptr<float>, i: int, val: float) {  # vec[i] = val
+    slot: ptr<float> = ptradd vec i;
+    store slot val;
+}
+
+@vector_decr(vec: ptr<float>, i: int, decr: float) {  # vec[i] -= decr
+    slot: ptr<float> = ptradd vec i;
+    before: float = load slot;
+    after: float = fsub before decr;
+    store slot after;
+}
+
+# Print vec[0] .. vec[N-1], one value per print.
+# EXPECTS: @vector_get defined
+@vector_print(vec: ptr<float>, N: int) {
+    step: int = const 1;
+    pos: int = const 0;
+.check:
+    more: bool = lt pos N;
+    br more .emit .finish;
+.emit:
+    elem: float = call @vector_get vec pos;
+    print elem;
+    pos: int = add pos step;
+    jmp .check;
+.finish:
+}
\ No newline at end of file