jda-ml-demo
Neural network training benchmark – Jda vs Python head-to-head comparison. Trains MLPs from scratch with no external dependencies on either side. Jda is up to 37x faster than Python.
Tasks
| # | Task | Architecture | Epochs |
|---|---|---|---|
| 1 | XOR Classification | 2->8->1 MLP | 5,000 |
| 2 | Sine Approximation | 1->16->1 MLP | 10,000 |
| 3 | Matrix Multiply | 64x64 matmul | 10 iters |
Performance (x86-64 Linux, best of 3)
| Task | Jda | Python (no NumPy) | Speedup |
|---|---|---|---|
| XOR training (5K epochs) | 21 ms | 778 ms | ~37x |
| Sine training (10K epochs) | 439 ms | 15,347 ms | ~35x |
| 64x64 matmul (per iter) | 3 ms | 75 ms | ~25x |
Both implementations use identical algorithms (same loop structure, same SGD, same loss function). The only difference is the runtime: Jda compiles to native x86-64 machine code, Python interprets through CPython.
Build and Run
# Build Jda binary
bash apps/build-ml-demo.sh
# Run Jda only (in Docker)
docker run --rm --platform linux/amd64 --ulimit stack=524288000:524288000 \
-v "$PWD":/jda -w /jda jda-build ./apps/jda-ml-demo
# Run Python only
python3 apps/ml-demo-python.py
# Run both side by side with comparison
bash apps/run-ml-benchmark.sh
Binary Size
~1.08 MB static ELF binary. Zero external dependencies.
Dependencies
- Jda: all tensor/f64 operations are compiler builtins. 486 lines.
- Python: only stdlib (`math`, `time`, `random`). No NumPy. 363 lines.
Source Code
// jda-ml-demo — Neural Network Training from Scratch in Pure Jda
// Trains MLPs on 3 tasks: XOR, Sine, Matmul. No external dependencies.
// Compare with: python3 apps/ml-demo-python.py
// Buffer handed to the raw syscall in time_now_ms(): tv_sec is read back as
// whole seconds and tv_nsec as the nanosecond part of the clock reading.
struct Timespec {
tv_sec: i64
tv_nsec: i64
}
// Returns a clock reading in whole milliseconds.
// Issues raw syscall 228 with clock id 1 (clock_gettime / CLOCK_MONOTONIC on
// x86-64 Linux). The syscall's return value is not checked. Callers in this
// file only use the difference between two readings.
fn time_now_ms() -> i64 {
    let now = Timespec{}
    syscall(228, 1, &now, 0)
    // Convert to a single nanosecond count, then truncate to milliseconds.
    let secs_as_ns = now.tv_sec * 1000000000
    let total_ns = secs_as_ns + now.tv_nsec
    ret total_ns / 1000000
}
// --- Float formatting ---
// Prints val as fixed-point text with exactly four fractional digits and no
// trailing newline. The value is multiplied by 10000 and converted with
// f64_to_int; the resulting integer is split into whole and fractional parts.
fn print_f64(val: f64) {
    let fixed: f64 = val * 10000.0
    let units = f64_to_int(fixed)
    // Work on the magnitude and remember whether a minus sign is needed.
    let negative = 0
    if units < 0 {
        negative = 1
        units = 0 - units
    }
    let whole = units / 10000
    let frac = units - whole * 10000
    if negative == 1 { print "-" }
    print "{whole}."
    // Left-pad the fraction to four digits, one zero per missing digit.
    if frac < 1000 { print "0" }
    if frac < 100 { print "0" }
    if frac < 10 { print "0" }
    print "{frac}"
}
// --- Neural network primitives ---
// Dense layer forward pass: out[i][j] = bias[j] + sum over k of x[i][k] * w[j][k].
// Sizes come from tensor_shape: x is indexed as batch x in_f and w as
// out_f x in_f, both through flat row-major offsets. out receives
// batch * out_f values.
fn linear_forward(x: i64, w: i64, bias: i64, out: i64) {
    let rows = tensor_shape(x, 0)
    let n_in = tensor_shape(x, 1)
    let n_out = tensor_shape(w, 0)
    for r in range(rows) {
        let x_base = r * n_in
        let out_base = r * n_out
        for o in range(n_out) {
            let w_base = o * n_in
            // The accumulator starts from the bias term.
            let total: f64 = tensor_get(bias, o)
            for c in range(n_in) {
                let x_val: f64 = tensor_get(x, x_base + c)
                let w_val: f64 = tensor_get(w, w_base + c)
                total += x_val * w_val
            }
            tensor_set(out, out_base + o, total)
        }
    }
}
// Element-wise ReLU: writes a[i] to out[i] when a[i] > 0.0, otherwise 0.0.
// Processes tensor_len(a) elements.
fn relu_forward(a: i64, out: i64) {
    let count = tensor_len(a)
    for idx in range(count) {
        let x: f64 = tensor_get(a, idx)
        // Default to zero and overwrite only for strictly positive inputs.
        let clamped: f64 = 0.0
        if x > 0.0 {
            clamped = x
        }
        tensor_set(out, idx, clamped)
    }
}
// ReLU backward pass: grad_in[i] = grad_out[i] where pre_relu[i] > 0.0,
// otherwise 0.0. grad_out is only read for positions that pass the test.
fn relu_backward(pre_relu: i64, grad_out: i64, grad_in: i64) {
    let count = tensor_len(pre_relu)
    for pos in range(count) {
        let act: f64 = tensor_get(pre_relu, pos)
        let passed: f64 = 0.0
        if act > 0.0 {
            let upstream: f64 = tensor_get(grad_out, pos)
            passed = upstream
        }
        tensor_set(grad_in, pos, passed)
    }
}
// Accumulates the weight gradient of a dense layer into grad_w:
// grad_w[j][k] += sum over i of grad_out[i][j] * x[i][k].
// w is consulted only for its first dimension (the output width). The result
// is added to whatever grad_w already holds.
fn backward_weights(x: i64, w: i64, grad_out: i64, grad_w: i64) {
    let rows = tensor_shape(x, 0)
    let n_in = tensor_shape(x, 1)
    let n_out = tensor_shape(w, 0)
    for o in range(n_out) {
        let w_base = o * n_in
        for c in range(n_in) {
            let partial: f64 = 0.0
            for r in range(rows) {
                let g: f64 = tensor_get(grad_out, r * n_out + o)
                let x_val: f64 = tensor_get(x, r * n_in + c)
                partial += g * x_val
            }
            let slot = w_base + c
            let prev: f64 = tensor_get(grad_w, slot)
            tensor_set(grad_w, slot, prev + partial)
        }
    }
}
// Accumulates the bias gradient: grad_b[j] += sum over i of grad_out[i][j],
// with grad_out indexed as batch x out_f through flat row-major offsets.
fn backward_bias(grad_out: i64, grad_b: i64, batch: i64, out_f: i64) {
    for col in range(out_f) {
        let col_sum: f64 = 0.0
        for row in range(batch) {
            let pos = row * out_f + col
            let g: f64 = tensor_get(grad_out, pos)
            col_sum += g
        }
        let prev: f64 = tensor_get(grad_b, col)
        tensor_set(grad_b, col, prev + col_sum)
    }
}
// Accumulates the input gradient of a dense layer into grad_in:
// grad_in[i][k] += sum over j of grad_out[i][j] * w[j][k].
// Batch and input width are taken from grad_in's shape, output width from w.
// The result is added to the existing contents of grad_in.
fn backward_input(w: i64, grad_out: i64, grad_in: i64) {
    let rows = tensor_shape(grad_in, 0)
    let n_in = tensor_shape(grad_in, 1)
    let n_out = tensor_shape(w, 0)
    for r in range(rows) {
        let go_base = r * n_out
        let gi_base = r * n_in
        for c in range(n_in) {
            let partial: f64 = 0.0
            for o in range(n_out) {
                let g: f64 = tensor_get(grad_out, go_base + o)
                let w_val: f64 = tensor_get(w, o * n_in + c)
                partial += g * w_val
            }
            let slot = gi_base + c
            let prev: f64 = tensor_get(grad_in, slot)
            tensor_set(grad_in, slot, prev + partial)
        }
    }
}
// Mean squared error over all tensor_len(pred) elements:
// sum of (pred[i] - target[i])^2 divided by the element count.
// The result is an f64 value; callers in this file bind it with
// `let ...: f64 = mse_loss(...)` even though the declared return type is i64.
fn mse_loss(pred: i64, target: i64) -> i64 {
    let count = tensor_len(pred)
    let sq_sum: f64 = 0.0
    for idx in range(count) {
        let got: f64 = tensor_get(pred, idx)
        let want: f64 = tensor_get(target, idx)
        let diff: f64 = got - want
        sq_sum += diff * diff
    }
    let denom: f64 = f64_from_int(count)
    ret sq_sum / denom
}
// Gradient of the mean squared error with respect to pred:
// grad_out[i] = (2 / n) * (pred[i] - target[i]), where n = tensor_len(pred).
// Overwrites grad_out rather than accumulating into it.
fn mse_backward(pred: i64, target: i64, grad_out: i64) {
    let count = tensor_len(pred)
    let count_f: f64 = f64_from_int(count)
    let factor: f64 = 2.0 / count_f
    for idx in range(count) {
        let got: f64 = tensor_get(pred, idx)
        let want: f64 = tensor_get(target, idx)
        let diff: f64 = got - want
        tensor_set(grad_out, idx, factor * diff)
    }
}
// One plain SGD update: params[i] -= lr * grads[i] for every element, after
// which grads[i] is reset to 0.0.
// lr is declared i64 but is rebound as f64 before use; callers in this file
// pass an f64 learning rate.
fn sgd_step(params: i64, grads: i64, lr: i64) {
    let rate: f64 = lr
    let count = tensor_len(params)
    for idx in range(count) {
        let weight: f64 = tensor_get(params, idx)
        let grad: f64 = tensor_get(grads, idx)
        tensor_set(params, idx, weight - rate * grad)
        // Clear the gradient slot once it has been applied.
        tensor_set(grads, idx, 0.0)
    }
}
// Sets every element of t to 0.0 through tensor_fill.
fn zero_tensor(t: i64) {
    let zero: f64 = 0.0
    tensor_fill(t, zero)
}
// --- PRNG for weight init ---
// Advances a linear congruential generator held in state[0] and returns the
// new state: next = prev * 6364136223846793005 + 1442695040888963407.
// The multiplier and increment match Knuth's MMIX LCG parameters.
// NOTE(review): relies on i64 arithmetic wrapping on overflow; confirm
// against the Jda language definition.
fn rng_next(state: &i64) -> i64 {
    let prev = state[0]
    let scaled = prev * 6364136223846793005
    let next = scaled + 1442695040888963407
    state[0] = next
    ret next
}
// Draws the next generator value and maps it to an f64 in [0, 1) with a
// resolution of one millionth: the magnitude is divided by 10^12, reduced
// modulo 10^6, and then divided by 1000000.0.
// The result is an f64 value even though the declared return type is i64.
fn rng_f64(state: &i64) -> i64 {
    let raw = rng_next(state)
    if raw < 0 { raw = 0 - raw }
    let top = raw / 1000000000000
    // Second sign check covers the case where negating raw did not make it
    // non-negative.
    if top < 0 { top = 0 - top }
    let micro = top % 1000000
    let micro_f: f64 = f64_from_int(micro)
    ret micro_f / 1000000.0
}
// Fills every element of t with a pseudo-random value r * scale - scale / 2,
// where r comes from rng_f64(state); the values are centred on zero and span
// a range of width scale.
// scale is declared i64 but is rebound as f64 before use; callers in this
// file pass an f64.
fn init_weights_uniform(t: i64, state: &i64, scale: i64) {
    let width: f64 = scale
    let offset: f64 = width / 2.0
    let count = tensor_len(t)
    for idx in range(count) {
        let unit: f64 = rng_f64(state)
        tensor_set(t, idx, unit * width - offset)
    }
}
// Naive triple-loop matrix product c = a * b, with a indexed as M x K,
// b as K x N and c as M x N through flat row-major offsets.
// Every element of c is overwritten.
fn matmul_bench(a: i64, b: i64, c: i64, M: i64, K: i64, N: i64) {
    for row in range(M) {
        let a_base = row * K
        let c_base = row * N
        for col in range(N) {
            let dot: f64 = 0.0
            for inner in range(K) {
                let lhs: f64 = tensor_get(a, a_base + inner)
                let rhs: f64 = tensor_get(b, inner * N + col)
                dot += lhs * rhs
            }
            tensor_set(c, c_base + col, dot)
        }
    }
}
// === Task 1: XOR Classification (2->8->1 MLP) ===
// Trains a 2 -> 8 -> 1 ReLU MLP on the four XOR samples with full-batch SGD
// (5000 epochs, learning rate 0.1, MSE loss), then prints the predictions,
// a PASS/FAIL verdict, the final loss and the elapsed training time in ms.
fn task_xor() {
print "=== Task 1: XOR Classification ===\n"
print "Architecture: 2 -> 8 -> 1 MLP\n"
print "Dataset: 4 XOR samples\n"
print "Epochs: 5000, LR: 0.1\n\n"
let batch = 4
let hidden = 8
// Inputs, one row per sample: [0,0], [0,1], [1,0], [1,1].
let x = tensor_new2(4, 2)
tensor_set(x, 0, 0.0)
tensor_set(x, 1, 0.0)
tensor_set(x, 2, 0.0)
tensor_set(x, 3, 1.0)
tensor_set(x, 4, 1.0)
tensor_set(x, 5, 0.0)
tensor_set(x, 6, 1.0)
tensor_set(x, 7, 1.0)
// Targets: XOR of the two inputs in each row.
let y = tensor_new2(4, 1)
tensor_set(y, 0, 0.0)
tensor_set(y, 1, 1.0)
tensor_set(y, 2, 1.0)
tensor_set(y, 3, 0.0)
// Layer parameters (w*, b*) and their gradient buffers (gw*, gb*).
let w1 = tensor_new2(hidden, 2)
let b1 = tensor_new1(hidden)
let gw1 = tensor_new2(hidden, 2)
let gb1 = tensor_new1(hidden)
let w2 = tensor_new2(1, hidden)
let b2 = tensor_new1(1)
let gw2 = tensor_new2(1, hidden)
let gb2 = tensor_new1(1)
// Fixed seed so every run starts from the same weights.
let rng_state: i64[1]
rng_state[0] = 42
let scale: f64 = 2.0
init_weights_uniform(w1, &rng_state, scale)
init_weights_uniform(w2, &rng_state, scale)
tensor_fill(b1, 0.0)
tensor_fill(b2, 0.0)
// Activation and gradient scratch buffers, allocated once and reused.
// h1 is the pre-activation hidden layer, h1r the post-ReLU values.
let h1 = tensor_new2(4, hidden)
let h1r = tensor_new2(4, hidden)
let out = tensor_new2(4, 1)
let g_out = tensor_new2(4, 1)
let g_h1r = tensor_new2(4, hidden)
let g_h1 = tensor_new2(4, hidden)
let g_x = tensor_new2(4, 2)
let lr: f64 = 0.1
// Only the training loop is timed.
let t0 = time_now_ms()
for epoch in range(5000) {
// Forward pass.
linear_forward(x, w1, b1, h1)
relu_forward(h1, h1r)
linear_forward(h1r, w2, b2, out)
// Report the loss every 1000 epochs.
if epoch % 1000 == 0 {
let loss: f64 = mse_loss(out, y)
print " epoch "
print "{epoch}"
print " loss="
print_f64(loss)
print "\n"
}
// Backward pass through the output layer. The backward_* helpers add into
// their destination, so each buffer is zeroed first.
mse_backward(out, y, g_out)
zero_tensor(gw2)
zero_tensor(gb2)
zero_tensor(g_h1r)
backward_weights(h1r, w2, g_out, gw2)
backward_bias(g_out, gb2, batch, 1)
backward_input(w2, g_out, g_h1r)
// Backward pass through the ReLU and the hidden layer.
relu_backward(h1, g_h1r, g_h1)
zero_tensor(gw1)
zero_tensor(gb1)
zero_tensor(g_x)
backward_weights(x, w1, g_h1, gw1)
backward_bias(g_h1, gb1, batch, hidden)
// g_x is filled here but not read again in this function.
backward_input(w1, g_h1, g_x)
// Parameter update; sgd_step also resets each gradient buffer to zero.
sgd_step(w1, gw1, lr)
sgd_step(b1, gb1, lr)
sgd_step(w2, gw2, lr)
sgd_step(b2, gb2, lr)
}
let t1 = time_now_ms()
let xor_ms = t1 - t0
// Untimed forward pass with the trained weights for the final report.
linear_forward(x, w1, b1, h1)
relu_forward(h1, h1r)
linear_forward(h1r, w2, b2, out)
print "\nPredictions:\n"
let half: f64 = 0.5
let p0: f64 = tensor_get(out, 0)
let p1: f64 = tensor_get(out, 1)
let p2: f64 = tensor_get(out, 2)
let p3: f64 = tensor_get(out, 3)
print " [0,0] -> "
print_f64(p0)
print " (expected ~0)\n"
print " [0,1] -> "
print_f64(p1)
print " (expected ~1)\n"
print " [1,0] -> "
print_f64(p2)
print " (expected ~1)\n"
print " [1,1] -> "
print_f64(p3)
print " (expected ~0)\n"
// PASS requires every prediction to fall on the correct side of 0.5.
if half > p0 and p1 > half and p2 > half and half > p3 {
print " Result: PASS (4/4 correct)\n"
} else {
print " Result: FAIL\n"
}
let final_loss: f64 = mse_loss(out, y)
print " Final loss: "
print_f64(final_loss)
print "\n Time: "
print "{xor_ms}"
print " ms\n\n"
}
// === Task 2: Sine Approximation (1->16->1 MLP) ===
// Trains a 1 -> 16 -> 1 ReLU MLP to approximate sin(x) on 32 evenly spaced
// samples with full-batch SGD (10000 epochs, learning rate 0.01, MSE loss),
// then prints eight sample predictions, the final loss and the elapsed
// training time in ms.
fn task_sine() {
print "=== Task 2: Sine Approximation ===\n"
print "Architecture: 1 -> 16 -> 1 MLP\n"
print "Dataset: 32 samples of sin(x), x in [0, 6.28]\n"
print "Epochs: 10000, LR: 0.01\n\n"
let n_samples = 32
let hidden = 16
let x = tensor_new2(n_samples, 1)
let y = tensor_new2(n_samples, 1)
// Sample si sits at x = si * (6.283185 / 32), so the last sample is one
// step short of 6.283185.
let two_pi: f64 = 6.283185
let n_f: f64 = f64_from_int(n_samples)
let step: f64 = two_pi / n_f
for si in range(n_samples) {
let si_f: f64 = f64_from_int(si)
let xv: f64 = si_f * step
tensor_set(x, si, xv)
tensor_set(y, si, f64_sin(xv))
}
// Layer parameters (w*, b*) and their gradient buffers (gw*, gb*).
let w1 = tensor_new2(hidden, 1)
let b1 = tensor_new1(hidden)
let gw1 = tensor_new2(hidden, 1)
let gb1 = tensor_new1(hidden)
let w2 = tensor_new2(1, hidden)
let b2 = tensor_new1(1)
let gw2 = tensor_new2(1, hidden)
let gb2 = tensor_new1(1)
// Fixed seed so every run starts from the same weights.
let rng_state: i64[1]
rng_state[0] = 12345
let scale: f64 = 2.0
init_weights_uniform(w1, &rng_state, scale)
init_weights_uniform(w2, &rng_state, scale)
tensor_fill(b1, 0.0)
tensor_fill(b2, 0.0)
// Activation and gradient scratch buffers, allocated once and reused.
// h1 is the pre-activation hidden layer, h1r the post-ReLU values.
let h1 = tensor_new2(n_samples, hidden)
let h1r = tensor_new2(n_samples, hidden)
let out = tensor_new2(n_samples, 1)
let g_out = tensor_new2(n_samples, 1)
let g_h1r = tensor_new2(n_samples, hidden)
let g_h1 = tensor_new2(n_samples, hidden)
let g_x = tensor_new2(n_samples, 1)
let lr: f64 = 0.01
// Only the training loop is timed.
let t0 = time_now_ms()
for epoch in range(10000) {
// Forward pass.
linear_forward(x, w1, b1, h1)
relu_forward(h1, h1r)
linear_forward(h1r, w2, b2, out)
// Report the loss every 2000 epochs.
if epoch % 2000 == 0 {
let loss: f64 = mse_loss(out, y)
print " epoch "
print "{epoch}"
print " loss="
print_f64(loss)
print "\n"
}
// Backward pass through the output layer. The backward_* helpers add into
// their destination, so each buffer is zeroed first.
mse_backward(out, y, g_out)
zero_tensor(gw2)
zero_tensor(gb2)
zero_tensor(g_h1r)
backward_weights(h1r, w2, g_out, gw2)
backward_bias(g_out, gb2, n_samples, 1)
backward_input(w2, g_out, g_h1r)
// Backward pass through the ReLU and the hidden layer.
relu_backward(h1, g_h1r, g_h1)
zero_tensor(gw1)
zero_tensor(gb1)
zero_tensor(g_x)
backward_weights(x, w1, g_h1, gw1)
backward_bias(g_h1, gb1, n_samples, hidden)
// g_x is filled here but not read again in this function.
backward_input(w1, g_h1, g_x)
// Parameter update; sgd_step also resets each gradient buffer to zero.
sgd_step(w1, gw1, lr)
sgd_step(b1, gb1, lr)
sgd_step(w2, gw2, lr)
sgd_step(b2, gb2, lr)
}
let t1 = time_now_ms()
let sine_ms = t1 - t0
// Untimed forward pass with the trained weights for the final report.
linear_forward(x, w1, b1, h1)
relu_forward(h1, h1r)
linear_forward(h1r, w2, b2, out)
print "\nSample predictions (x -> predicted vs actual):\n"
// Show every fourth sample (indices 0, 4, ..., 28).
for pi in range(8) {
let idx = pi * 4
let xv: f64 = tensor_get(x, idx)
let pv: f64 = tensor_get(out, idx)
let av: f64 = tensor_get(y, idx)
print " x="
print_f64(xv)
print " pred="
print_f64(pv)
print " actual="
print_f64(av)
print "\n"
}
let final_loss: f64 = mse_loss(out, y)
print " Final loss: "
print_f64(final_loss)
print "\n Time: "
print "{sine_ms}"
print " ms\n\n"
}
// === Task 3: Matrix Multiply Benchmark (64x64) ===
// Benchmarks matmul_bench on two 64x64 matrices filled from a fixed seed.
// Runs one untimed call, then times 10 calls and prints the total and the
// integer-average milliseconds per call. As a sanity check it sums the
// absolute values of the first 16 result elements and reports PASS when that
// sum is positive and FAIL otherwise.
fn task_matmul() {
    print "=== Task 3: Matrix Multiply Benchmark ===\n"
    print "Size: 64x64 @ 64x64 (10 iterations)\n\n"
    let M = 64
    let N = 64
    let K = 64
    let a = tensor_new2(M, K)
    let b = tensor_new2(K, N)
    let c = tensor_new2(M, N)
    // Fixed seed so every run multiplies the same matrices.
    let rng_state: i64[1]
    rng_state[0] = 999
    let scale: f64 = 1.0
    init_weights_uniform(a, &rng_state, scale)
    init_weights_uniform(b, &rng_state, scale)
    // One call before the clock starts (presumably a warm-up).
    matmul_bench(a, b, c, M, K, N)
    let t0 = time_now_ms()
    for iter in range(10) {
        matmul_bench(a, b, c, M, K, N)
    }
    let t1 = time_now_ms()
    let total_ms = t1 - t0
    // Integer division: the average is truncated to whole milliseconds.
    let avg_ms = total_ms / 10
    // Cheap sanity check on the result: only the first 16 elements of c are
    // inspected.
    let sum: f64 = 0.0
    for vi in range(16) {
        let v: f64 = tensor_get(c, vi)
        sum += f64_abs(v)
    }
    print " Total (10 iters): "
    print "{total_ms}"
    print " ms\n"
    print " Average: "
    print "{avg_ms}"
    print " ms per matmul\n"
    // Report both outcomes so a failed check is visible in the output, in
    // line with the PASS/FAIL report printed by task_xor.
    if sum > 0.0 {
        print " Verification: PASS\n"
    } else {
        print " Verification: FAIL\n"
    }
    // 2 * 64 * 64 * 64 floating-point operations per product.
    print " FLOP/matmul: 524288\n\n"
}
// === Main ===
// Program entry point: prints a banner, runs the three benchmark tasks in
// order (XOR, sine, matmul), prints a closing banner and returns 0.
fn main() -> i64 {
print "================================================================\n"
print " Jda ML Demo — Neural Network Training from Scratch\n"
print " Compiler: jda1 (self-hosted, x86-64 native code)\n"
print " No LLVM, no C, no external dependencies\n"
print "================================================================\n\n"
// Each task prints its own results and timing.
task_xor()
task_sine()
task_matmul()
print "================================================================\n"
print " All tasks complete. Compare with:\n"
print " python3 apps/ml-demo-python.py\n"
print "================================================================\n"
ret 0
}