dnl -*- mode: m4; comment-start: "%" -*-
include(`macros.m4')divert(-1)
% $Id: mars.m4,v 1.13 1999/04/15 03:01:06 geoffk Exp $

% RAM layout for the cipher and the key scheduler.
% Several names share an address: 0xD8, 0xDA, 0xDC, 0xE0 and 0xF0 are each
% defined more than once below.  In the code of this file the names that
% share an address are used by different routines, or by different phases
% of the same routine.
% Every description carries the comment leader so that m4 never scans the
% prose for macro names.
define(v_K,0x100)		% The expanded key array, 40 words (160 bytes).
define(v_sch_seenbits,0xD9)	% Number of bits the same as this one seen.
define(v_sch_str,0xDA)		% The number of stirring rounds to go
define(v_sch_bit,0xDA)		% Counter of number of bits in search for weak key
define(v_sch_cnt,0xDB)		% Counter for the scheduler
define(v_Kt,0xDC)		% A temporary key word used by the scheduler.
define(v_Kt2,0xF0)		% Another temporary.
define(v_k,0xE0)		% The key to schedule, 16 bytes
define(v_sbox_result,0xF0)	% Result of a 9-bit s-box lookup
define(v_sch_j,0xD8)		% Low byte of K[i] in pseudo-code

define(v_D,0xE0)		% The block to encrypt, MSB-first
define(v_L,0xD8)		% Output words of the E-function
define(v_M,0xDC)
define(v_R,0xF0)
define(v_tmp,0xF0)		% Temporary, 4 bytes
define(v_round,0xD5)		% The current round number

define(v_trotate,0xD7)		% Temporaries for the rotater.
define(v_rotvar,0xD6)
define(test_ram,0xF4)		% Some RAM for the test program.

divert`'dnl
include(`mars-sbox-out.m4')

% MARS encryption
% Input: The scheduled key in v_K, the block to encrypt in v_D
% Output: The encrypted block in v_D
% The routine runs straight through its stages: subkey addition, forward
% mixing, sixteen keyed rounds, backward mixing and subkey subtraction.
% It returns with the rts that follows subkey_sub_loop.
% The 32-bit helper macros used here come from macros.m4; their behaviour
% is inferred from their names and from how they are used.
mars:

% Phase 1, forwards mixing
% First add subkeys to data
% X is the byte offset of the word being processed: 12, 8, 4, then 0.
% The same offset indexes both v_D and v_K, so data word n is combined
% with key word n.
	ldx	#3*4
subkey_add_loop:
	add4(`v_D,X',`v_D,X',`v_K,X')
% Step X back by one word.  The loop ends when the subtraction goes
% negative; this assumes tax leaves the flags produced by sub untouched.
	txa
	sub	#4
	tax
	bpl	subkey_add_loop

% Then do eight rounds of forward mixing
% v_round counts down from 7 to 0.
	lda	#7
	sta	v_round
forwards_mix_loop:
% Each byte of the first data word indexes one half of the S-box table and
% the selected entry is combined into one of the other three words:
%   v_D+3 -> table at mars_sbox,     xor into the word at v_D+4
%   v_D+2 -> table at mars_sbox+256, add to the word at v_D+4
%   v_D+1 -> table at mars_sbox,     add to the word at v_D+8
%   v_D   -> table at mars_sbox+256, xor into the word at v_D+12
% While X holds each byte it is also stored into v_tmp in rotated order:
% v_D+1, v_D+2, v_D+3, v_D end up in v_tmp+0 .. v_tmp+3.
	ldx	v_D+3
	stx	v_tmp+2
	eor4(`v_D+4',`v_D+4',`0*512+mars_sbox,X')
	ldx	v_D+2
	stx	v_tmp+1
	add4(`v_D+4',`v_D+4',`0*512+mars_sbox+256,X')
	ldx	v_D+1
	stx	v_tmp+0
	add4(`v_D+8',`v_D+8',`0*512+mars_sbox,X')
	ldx	v_D
	stx	v_tmp+3
	eor4(`v_D+12',`v_D+12',`0*512+mars_sbox+256,X')
% Round-dependent extra addition, chosen by the low two bits of v_round:
%   0 or 1: nothing is added
%   2:      v_tmp += word at v_D+4
%   3:      v_tmp += word at v_D+12
% Because v_round counts down, the value 3 occurs in the first and fifth
% pass and the value 2 in the second and sixth pass.
	lda	v_round
	and	#3
	cmp	#2
	bmi	forwards_rounddep_done
	bne	forwards_not_15
	add4(v_tmp,v_tmp,`v_D+4')
	bra	forwards_rounddep_done
forwards_not_15:
	add4(v_tmp,v_tmp,`v_D+12')
forwards_rounddep_done:
	
% Move every word down one position and place v_tmp in the last word.
% X runs over the four bytes of a word, 3 down to 0.
	ldx	#3
forwards_shift_loop:
	lda	v_D+4,X
	sta	v_D,X
	lda	v_D+8,X
	sta	v_D+4,X
	lda	v_D+12,X
	sta	v_D+8,X
	lda	v_tmp,X
	sta	v_D+12,X
	decx
	bpl	forwards_shift_loop

% Next round.  The backward jump uses jmp rather than a relative branch,
% presumably because the loop body is too long for a branch offset.
	dec	v_round
	bmi	forwards_mix_loop_done
	jmp	forwards_mix_loop
forwards_mix_loop_done:

% Do 16 rounds of keyed transformation
% v_round holds eight times the round number: it starts at 0 and grows by
% 8 per round, so it doubles as the byte offset of the two key words of
% the round.  The loop ends when v_round reaches 128 and turns negative.
	clr	v_round
keyed_transform_loop:

% Compute the E-function
% v_M = first data word + key word at v_K+16+v_round.
% v_L temporarily holds the key word at v_K+20+v_round, the multiplier.
	ldx	v_round
	add4(v_M,v_D,`v_K+16,X')
	set4(v_L,`v_K+20,X')

% Rotate the first data word left by 13 bits.  This is done as a rotation
% by two bytes, presumably what rotbl4 does, followed by three single bit
% rotations to the right.
% NOTE(review): in the bit rotation X supplies the bits that enter the top
% of v_D+1, and the bits leaving v_D+3 enter v_D.  This relies on the
% value rotbl4 leaves in X - TODO confirm against macros.m4.
	rotbl4(v_D,2,`,X')
forloop(`i',0,2,`dnl
	lsrx
	ror	v_D+1
	ror	v_D+2
	ror	v_D+3
	ror	v_D
')dnl

% v_R = rotated data word times the key word held in v_L.
	mul4(v_R,v_D,v_L)
% S-box lookup indexed by the low nine bits of v_M: X is the low byte and
% bit 0 of v_M+2 selects the half of the table.  brset0 skips the first
% load when the bit is set and brclr0 skips the second load when it is
% clear, so exactly one of the two set4 loads runs.
	ldx	v_M+3
	brset0	v_M+2,e_use_s1
	set4(v_L,`0*512+mars_sbox,X')
e_use_s1:
	brclr0	v_M+2,e_used_s0
	set4(v_L,`0*512+256+mars_sbox,X')
e_used_s0:
% Rotate v_R, then rotate v_M left by an amount taken from the low byte of
% v_R.  dorotate receives the amount in A and the variable address in X.
	rotl4(v_R,5)
	lda	v_R+3
	ldx	#v_M
	jsr	dorotate
	eor4(v_L,v_L,v_R)
% Rotate v_R again, written as a one byte rotation followed by rotr4 by 3,
% then fold it into v_L and rotate v_L by the low byte of v_R.
	rotbl4(v_R,1,`,X')
	rotr4(v_R,3)
	eor4(v_L,v_L,v_R)
	lda	v_R+3
	ldx	#v_L
	jsr	dorotate

% Combine the three results with the data.  v_M is always added to the
% word at v_D+8.  Bit 6 of v_round is set once v_round reaches 64, that is
% from the ninth round on, and selects backward mode:
%   forward mode:  v_L = word at v_D+4 + v_L,  word at v_D+12 ^= v_R
%   backward mode: word at v_D+12 += v_L,      v_L = word at v_D+4 ^ v_R
% In both modes v_L ends up holding the new value derived from v_D+4.
	add4(v_D+8,v_D+8,v_M)
	brset6	v_round,backward_mode
	add4(v_L,v_D+4,v_L)
	eor4(v_D+12,v_D+12,v_R)
	bra	done_addLR
backward_mode:
	add4(v_D+12,v_D+12,v_L)
	eor4(v_L,v_D+4,v_R)
done_addLR:
% Rotate the block by one word.  The words at v_D+8 and v_D+12 move down
% one position, the rotated first word moves to v_D+12, and v_L becomes
% the new first word.
	ldx	#3
keyed_shift_loop:
	lda	v_D+8,X
	sta	v_D+4,X
	lda	v_D+12,X
	sta	v_D+8,X
	lda	v_D,X
	sta	v_D+12,X
	lda	v_L,X
	sta	v_D,X
	decx
	bpl	keyed_shift_loop

% Advance to the next pair of key words.
	lda	v_round
	add	#8
	sta	v_round
	bmi	keyed_transform_loop_done
	jmp	keyed_transform_loop
keyed_transform_loop_done:

% Do eight rounds of backward mixing
% v_round counts up from 0 and the loop stops when it reaches 8.
	clr	v_round
backwards_mix_loop:
% Round-dependent subtraction, done before the S-box lookups and chosen by
% the low two bits of v_round:
%   0 or 1: nothing is subtracted
%   2:      first word -= word at v_D+12
%   3:      first word -= word at v_D+4
	lda	v_round
	and	#3
	cmp	#2
	bmi	backwards_rounddep_done
	bne	backwards_not_26
	sub4(v_D,v_D,v_D+12)
	bra	backwards_rounddep_done
backwards_not_26:
	sub4(v_D,v_D,v_D+4)
backwards_rounddep_done:
	
% Each byte of the first data word indexes one half of the S-box table:
%   v_D+3 -> table at mars_sbox+256, xor into the word at v_D+4
%   v_D   -> table at mars_sbox,     subtract from the word at v_D+8
%   v_D+1 -> table at mars_sbox+256, subtract from the word at v_D+12
%   v_D+2 -> table at mars_sbox,     xor into the word at v_D+12
% The bytes are also stored into v_tmp in rotated order: v_D+3, v_D,
% v_D+1, v_D+2 end up in v_tmp+0 .. v_tmp+3.
	ldx	v_D+3
	stx	v_tmp+0
	eor4(`v_D+4',`v_D+4',`0*512+mars_sbox+256,X')
	ldx	v_D
	stx	v_tmp+1
	sub4(`v_D+8',`v_D+8',`0*512+mars_sbox,X')
	ldx	v_D+1
	stx	v_tmp+2
	sub4(`v_D+12',`v_D+12',`0*512+mars_sbox+256,X')
	ldx	v_D+2
	stx	v_tmp+3
	eor4(`v_D+12',`v_D+12',`0*512+mars_sbox,X')

% Move every word down one position and place v_tmp in the last word.
	ldx	#3
backwards_shift_loop:
	lda	v_D+4,X
	sta	v_D,X
	lda	v_D+8,X
	sta	v_D+4,X
	lda	v_D+12,X
	sta	v_D+8,X
	lda	v_tmp,X
	sta	v_D+12,X
	decx
	bpl	backwards_shift_loop

% Next round; leave the loop after the eighth one.
	inc	v_round
	lda	v_round
	cmp	#8
	beq	backwards_mix_loop_done
	jmp	backwards_mix_loop
backwards_mix_loop_done:
	
% Then subtract subkeys from data
% Data word n has key word 36+n subtracted, for n = 3 down to 0.
% X is the byte offset of the word: 12, 8, 4, then 0.
	ldx	#3*4
subkey_sub_loop:
	sub4(`v_D,X',`v_D,X',`v_K+eval(36*4),X')
% Step X back by one word; the loop ends when the subtraction goes
% negative, assuming tax leaves the flags produced by sub untouched.
	txa
	sub	#4
	tax
	bpl	subkey_sub_loop
	
% We're done!
% The result is left in v_D.
	rts

%  Rotate the variable at X left by A.
% The body is generated by the variable_rotate macro, which is not defined
% in this file.  Every caller in this file loads A with the low byte of a
% word that supplies the rotation amount and X with the address of a
% four byte variable before the jsr.
% NOTE(review): presumably only the low five bits of A are used and
% v_trotate and v_rotvar are its scratch bytes - confirm in macros.m4.
dorotate: variable_rotate

% Table for key word reordering.
% The table is generated at m4 time and read by the re-order loop in
% mars_schedule.  Each entry is a word index times 4, that is a byte
% offset into v_K.  Entries are emitted cycle by cycle, following the
% mapping j -> j*23 mod 40 until it returns to the starting index.
% Every entry of a cycle except the last is emitted as j*4+1, so bit 0 is
% set; the last entry of a cycle is emitted as j*4 and is even.
% Indices that map to themselves produce no entry, and index 0 is never
% visited because the outer loop starts at 1.
% Comment recognition is switched off around the generator and restored
% afterwards, presumably because the body uses the percent sign as the
% modulo operator inside eval.
mars_key_reorder:
dnl Note that 7*23==1 (mod 40)
changecom`'dnl
forloop(`i',1,39,`dnl
ifdef(`mars_key_reorder_seen_'i,,`dnl
pushdef(`j',i)dnl
whileloop(`eval( ((j*23)%40) != i )',`dnl
	byte	j*4+1
define(`mars_key_reorder_seen_'j,1)dnl
define(`j',eval((j*23)%40))dnl
')dnl
define(`mars_key_reorder_seen_'j,1)dnl
ifelse(j,i,,`dnl
	byte	j*4
')')')dnl
changecom(`%')dnl
% termination byte
% No real entry is zero, so the reader treats a zero byte as the end of
% the table.
	byte	0

% Fetch one S-box word into v_sbox_result.
% Input:  X = low eight bits of the index, bit 0 of A = ninth index bit
% Output: v_sbox_result = the selected four byte entry
% Intended to be constant-time: either path executes one conditional
% branch, one set4 and one rts.
mars_get_sbox:
	bit	#1
	beq	mars_get_sbox_lower
% Ninth index bit set: read from the upper half of the table.
	set4(v_sbox_result,`0*512+256+mars_sbox,X')
	rts
mars_get_sbox_lower:
% Ninth index bit clear: read from the lower half of the table.
	set4(v_sbox_result,`0*512+mars_sbox,X')
	rts

% The MARS key schedule
% Input: The key (128 bits) in v_k
% Output: The key scheduled into v_K
% The routine works in place on v_K in four stages: linear expansion,
% stirring, word re-ordering and the weak key-word fix-up.  It returns
% with the rts at weak_fix_loop_done.
mars_schedule:
% Initialise T with the key data
% Words 0 and 1 are the first two key words xored with constants that are
% evaluated when the file is assembled from the mars_sbox_N values, which
% are presumably defined in mars-sbox-out.m4.
	eor4i(v_K,v_k,rotli(mars_sbox_0 ^ mars_sbox_5, 3) ^ 0)
	eor4i(v_K+4,v_k+4,rotli(mars_sbox_1 ^ mars_sbox_6, 3) ^ 1)

% Words 2 to 38 are computed in a loop.  A and v_sch_cnt hold the byte
% offset of the word being produced, starting at 8.
	lda	#8
sch_key_add_loop:
	sta	v_sch_cnt

% Form in v_Kt the xor of the word two positions back with either the
% word seven positions back or, while that word does not exist yet, the
% S-box entry whose index equals the word number.  The branch is taken
% when the offset is at least 28, assuming cmp leaves carry clear when A
% is not below the operand on this CPU.
	tax
	cmp	#4*7
	bcc	sch_eor_from_vk
	set4(v_Kt,`v_K-eval(2*4),X')
% Two right shifts turn the byte offset in X into the word number.
	lsrx
	lsrx
 	eor4(v_Kt,v_Kt,`0*512+mars_sbox,X')
	bra	done_sch_vk_eor
sch_eor_from_vk:
	eor4(v_Kt,`v_K-eval(2*4),X',`v_K-eval(7*4),X')
done_sch_vk_eor:

% Rotate, then xor in one of the four key words.  Masking the offset with
% 0x0C selects the key word number modulo 4 as a byte offset.
	rotl4(v_Kt,3)
	lda	v_sch_cnt
	and	#0x0C
	tax
	eor4(v_Kt,v_Kt,`v_k,X')
% Store the word.  The low byte is additionally xored with the word
% number, obtained by shifting the byte offset right twice.
	lda	v_sch_cnt
	tax
	lsra
	lsra
	eor	v_Kt+3
	sta	v_K+3,X
forloop(`i',0,2,`dnl
	lda	v_Kt+i
	sta	v_K+i,X
')dnl
% Advance to the next word; stop before word 39.
	txa
	add	#4
	cmp	#39*4
	beq	sch_key_add_loop_done
	jmp	sch_key_add_loop
sch_key_add_loop_done:

% Word 39 is set to the constant 4: the low byte is 4 and the other three
% bytes are 0.  Presumably this is the key length in words.
	lda	#4
	sta	v_K+eval(39*4+3)
	lda	#0
	sta	v_K+eval(39*4+2)
	sta	v_K+eval(39*4+1)
	sta	v_K+eval(39*4)
	
% Do the stirring
% v_sch_str counts down from 6 and the loop exits when it goes negative,
% which gives seven passes over the array.
	lda	#6
	sta	v_sch_str
sch_stir_loop:
% Inner loop over words 1 to 39.  X and v_sch_cnt hold the byte offset of
% the word being updated.
	ldx	#4
sch_stir_loop2:
	stx	v_sch_cnt
% Look up the S-box entry selected by the previous word: its low byte
% goes in X and the byte above it in A, of which mars_get_sbox uses bit 0.
	lda	v_K-4+2,X
	ldx	v_K-4+3,X
	jsr	mars_get_sbox
	ldx	v_sch_cnt
% Add the current word to the looked-up value and store the result back
% through rot9lc4, presumably a rotate left by nine bits combined with a
% copy - confirm in macros.m4.
	add4(v_sbox_result,v_sbox_result,`v_K,X')
	rot9lc4(`v_K,X',v_sbox_result)
	txa
	add	#4
	tax
	cpx	#40*4
	bne	sch_stir_loop2

% Word 0 is updated last in each pass, using word 39 as the word that
% selects the S-box entry.
	ldx	v_K+eval(39*4)+3
	lda	v_K+eval(39*4)+2
	jsr	mars_get_sbox
	add4(v_sbox_result,v_sbox_result,v_K)
	rot9lc4(v_K, v_sbox_result)
	
	dec	v_sch_str
	bmi	sch_stir_loop_done
	jmp	sch_stir_loop
sch_stir_loop_done:

% re-order the words, using a simple FSM
% v_sch_cnt is the read position in the mars_key_reorder table.  The table
% lists cycles of byte offsets into v_K.  Within a cycle each word receives
% the contents of the word named by the next entry, and the word named by
% the last entry receives the saved contents of the first one.
% Entries with bit 0 set are used with the base v_K-1, which cancels the
% extra 1.  The carry flag records whether the entry just processed was
% the last of its cycle; this relies on the loads, stores, bit and inc
% between sec or clc and the bcc leaving the carry flag unchanged.
	clr	v_sch_cnt
start_reorder_cycle:
	ldx	v_sch_cnt
	ldx	mars_key_reorder,X
% A zero entry ends the table.
	beq	done_reorder_loop
% Save the first word of the cycle in v_Kt.
	set4(v_Kt,`v_K-1,X')
	clc

reorder_cycle_continues:
	inc	v_sch_cnt
	ldx	v_sch_cnt
	ldx	mars_key_reorder,X
	txa
	bit	#1
	bne	more_reorder_cycle
% Even entry: this is the last word of the cycle.  Add 1 to X so that the
% v_K-1 base below still addresses the word, and set carry to leave the
% loop after this copy.
	incx
	sec
more_reorder_cycle:
% Copy the word named by the current entry into the word named by the
% previous entry, going through v_Kt2.
	set4(v_Kt2,`v_K-1,X')
	ldx	v_sch_cnt
	ldx	mars_key_reorder-1,X
	set4(`v_K-1,X',v_Kt2)
	bcc	reorder_cycle_continues
reorder_cycle_ends:
% Close the cycle: the word named by the last entry, which is even and so
% used with the plain v_K base, receives the saved first word.
	ldx	v_sch_cnt
	ldx	mars_key_reorder,X
	set4(`v_K,X',v_Kt)
	inc	v_sch_cnt
	bra	start_reorder_cycle
done_reorder_loop:

% fix the `weak' key-words
% v_sch_cnt is the byte offset of the key word being fixed.  It starts at
% word 5 and advances by two words until it reaches word 37, so the odd
% words 5 to 35 are processed.
	lda	#5*4
	sta	v_sch_cnt
weak_fix_loop:

% Look for sequences of 10+ 0s or 1s
% Copy the key word into v_Kt.  The original low byte is saved in v_sch_j;
% the low byte is then stored, both in v_K and in v_Kt, with its two low
% bits set.
	ldx	v_sch_cnt
forloop(`i',0,3,`dnl
	lda	v_K+i,X
ifelse(i,3,`dnl
	sta	v_sch_j
	ora	#3
	sta	v_K+i,X
')dnl
	sta	v_Kt+i
')dnl

% Simply loop through the bits of v_Kt and count the number of 0/1 bits
% seen so far, being careful to be constant-time...
% Set v_Kt to be v_Kt ^ (v_Kt >> 1)
% After this a 0 bit marks a position whose bit equals the bit above it.
% The shift starts with lsr on the most significant byte and carries into
% the lower bytes with ror.
% NOTE(review): because a 0 is shifted in at the top, the top bit of the
% result equals the top bit of the word.  A word whose top bit is clear is
% therefore counted by the loop below as having one more equal position
% than a word whose top bit is set.  Confirm this edge case against the
% MARS specification and reference vectors.
forloop(`i',0,3,`dnl
	lda	v_Kt+i
ifelse(i,0,`dnl
	lsr	v_Kt+i
',`dnl
	ror	v_Kt+i
')dnl
	eor	v_Kt+i
	sta	v_Kt+i
')dnl

% Set a bit in v_Kt2 if the corresponding bit in v_Kt is part of a sequence
% of 10 or more consecutive 0 bits.
% The loop consumes v_Kt from its low bit upwards, 32 times.  Each pass
% shifts v_Kt2 right by one and, once the running count of 0 bits is 9 or
% more, sets all eight bits of its top byte.  The 32 shifts also push out
% whatever v_Kt2 held on entry.
	lda	#32
	sta	v_sch_bit
	clr	v_sch_seenbits
sch_bit_counting_loop:
	shr4(v_Kt2,1)
	shr4(v_Kt,1)	% also copies the low bit into the carry flag.
	lda	#0
	adc	#0xFF	% A is now 0xFF if the bit was 0, 0x00 if it was 1.
	and	v_sch_seenbits
	sbc	#0xFF	% Add 1 only if the bit was 0.
	sta	v_sch_seenbits
% The count restarts from 0 whenever a 1 bit is seen, because the and
% above clears it and the sbc then subtracts 0x100.
	cmp	#9
	lda	#0
	adc	#0xFF	% A is now 0xFF if 9 or more 0 bits have been seen.
	ora	v_Kt2
	sta	v_Kt2
	dec	v_sch_bit
	bne	sch_bit_counting_loop

% Bit 1 of the low mask byte is always cleared.
	bclr1	v_Kt2+3
	
% Generate the masked word and eor it with the schedule item
% The two saved low bits select one of four S-box entries starting at
% entry 265.  That entry is rotated by the low byte of the key word three
% positions further on, passed to dorotate in A with the address of v_Kt
% in X.
	lda	v_sch_j
	and	#3
	tax
	set4(v_Kt,`0*512+mars_sbox+265,X')
	ldx	v_sch_cnt
	lda	v_K+3+12,X
	ldx	#v_Kt
	jsr	dorotate

% Key word ^= mask and rotated pattern, one byte at a time.
	ldx	v_sch_cnt
forloop(`i',0,3,`dnl
	lda	v_Kt2+i
	and	v_Kt+i
	eor	v_K+i,X
	sta	v_K+i,X
')dnl
	

% Advance two words and stop at word 37.
	lda	v_sch_cnt
	add	#2*4
	sta	v_sch_cnt
	cmp	#37*4
	beq	weak_fix_loop_done
	jmp	weak_fix_loop

weak_fix_loop_done:
	rts

% Test harness.  test_program is a macro defined outside this file.  Its
% arguments here are the scratch RAM address, the key buffer v_k with a
% length of 16, v_D given twice with a length of 16, and the two calls to
% run: jsr mars_schedule and jsr mars.
% NOTE(review): presumably the harness loads each record from test_data,
% runs both calls and compares v_D with the expected value - confirm
% against the macro definition.
test_program(test_ram,v_k,16,v_D,v_D,16,jsr mars_schedule,jsr mars)

% Test vectors.  Each xbytes record holds three groups of 16 bytes,
% presumably key, input block and expected output block.  Records 2, 3
% and 5 use the last group of the preceding record as their middle group.
% Records 4 and 5 use the last group of record 3 as their first group.
test_data:
xbytes(00000000000000000000000000000000
 00000000000000000000000000000000  deb35132 83c296de 39069e6b 994c2438)
xbytes(00000000000000000000000000000000
 deb35132 83c296de 39069e6b 994c2438  64fc8e9c b429181f 72141f4b bf87af3b)
xbytes(00000000000000000000000000000000
 64fc8e9c b429181f 72141f4b bf87af3b  c97823ec f4435929 4a2e679f 38174e5b)
xbytes(c97823ec f4435929 4a2e679f 38174e5b
 00000000000000000000000000000000  31fd9f10 284745f2 77b4d619 e92474b5)
xbytes(c97823ec f4435929 4a2e679f 38174e5b
 31fd9f10 284745f2 77b4d619 e92474b5  b694cfd6 22349ffa 834f2121 35c92a84)
test_data_end:
