
		// SEGMENT Code.
		//

		// PROCEDURE init.
		//
		// Initialise the runtime library.
		//
init:
		cld			// Don't use BCD mode.

		if (target == target_7800)
		lda #0x07
		sta 0x08		// lock system to 7800 mode
_i1:
		sta 0x24
		bit 0x28
		bpl _i1			// wait for VBLANK
		lda #0x73
		sta 0x3c		// turn off DMA
		jsr movemem
		endif

		lda #0x00
		sta datastacks		// Reset the data stack pointer.
		sta bp			// Reset the base pointer.
		lda #charmode_raw_wait
		sta charmode		// Begin in raw wait ioctl mode.

		if (target == target_vicg)
		//
		// vicg mode.
		//
		// Move character generator to       0x1000 (4096).
		// Move framebuffer to         	     0x1800 (6144).
		// Runtime proper must then start at 0x1a00 (6656).
		//
		lda #(224+12)
		sta [36869]		// VIC Register #6
		lda [36866]             // VIC Register #3
		and #0x7f
		sta [36866]             // VIC Register #3
		lda #0x18		// MSB of new video buffer address.
		sta [648]		// Tell the kernel it's moved.
		lda #147		// The clear character
		jmp print_ch_a		// Print clear character and rts.

		elif (target == target_vich)
		//
		// vich mode.
		//
		// Leave framebuffer at        	     0x1000 (4096).
		// Move character generator to       0x1400 (5120).
		// Runtime proper must then start at 0x2400 (9216).
		//
		lda #(192+13)
		sta [36869]		// VIC Register #6

		lda [36866]             // VIC Register #3
		and #0x7f
		sta [36866]             // VIC Register #3

		lda [36867]             // VIC Register #4
		ora #0x01		// Double-height characters.
		sta [36867]             // VIC Register #4

		lda #0x10		// MSB of new video buffer address.
		sta [648]		// Tell the kernel it's moved.
		lda #147		// The clear character
		jmp print_ch_a		// Print clear character and rts.
		else
		rts
		endif



		// GROUP Push/Pop.
		//

		data_m3	= datastack - 3
		data_m2 = datastack - 2
		data_m1 = datastack - 1

		// PROCEDURE push_b.
		//
		// NOTE: push_b is stored in the Operator Group
		// 	 to allow relative branches to it


		// PROCEDURE cast_w2b.
		//
		// Change the word on the top of the stack into a byte.
		// (Simply discard the high byte; equivalent to pop_b).
		//
cast_w2b:
		/*FALLTHROUGH*/


		// PROCEDURE pop_b.
		//
		// Pop a byte from the stack into <A>.
		//
pop_b:
		ldy datastacks
pop_b_yok:
		lda datastack,y
		dey
		sty datastacks
a_rts:
		rts


		// PROCEDURE push_w.
		//
		// Push the word <A,X> on the stack.
		//
		// By convention when we store a word in the registers
		// <A=LSB, X=MSB>.
		//
		// The low byte is pushed first, the high byte second.
		// Thus we get: [under] = <A>, [top] = <X>.
		//
push_w:
		ldy datastacks
push_w_yok:
		iny
		sta datastack,y
		iny
		txa
		sta datastack,y
		sty datastacks
		rts


		// PROCEDURE pop_w.
		//
		// Pop a word from the stack into <A,X>.
		//
pop_w:
		ldy datastacks
// HFD			a bit smaller & faster - too bad we can't do STX abs,Y as well
pop_w_yok:
		ldx datastack,y
		dey
		lda datastack,y
		dey
		sty datastacks
		rts


		// PROCEDURE push_w_c.
		//
		// Push the word <C> on the stack.
		//
push_w_c:
		ldx ch
		lda cl
		jmp push_w	// rts.


		// PROCEDURE pop_w_c.
		//
		// Pop a word from the stack into <C>.
		//
pop_w_c:
		jsr pop_w
		stx ch
		sta cl
		rts


		// PROCEDURE push_b_c.
		//
		// Push the byte <CL> on the stack.
		//
push_b_c:
		lda cl
		jmp push_b	// rts.


		// PROCEDURE pop_w_c.
		//
		// Pop a byte from the stack into <CL>.
		//
pop_b_c:	jsr pop_b
		sta cl
		rts


		// PROCEDURE pop_b_cr.
		//
		// Pop two bytes from the stack into <A> and <CL>.
		//
		// <A> = [under]. <CL> = [top].
		//
pop_b_cr:
		jsr pop_b_c
		jmp pop_b_yok	// rts


		// PROCEDURE pop_w_cr.
		//
		// Pop two words from the stack into <A, X> and <C>.
		//
		// <A, X> = [under]. <C> = [top].
		//
pop_w_cr:
// HFD minor code reduction
		jsr pop_w_c
		jmp pop_w_yok	// rts


		// PROCEDURE pop_w_dc
		//
		// Pop two words from the stack into <D> and <C>.
		//
		// <C> = [under]. <D> = [top].
		//
pop_w_dc:
		jsr pop_w
		sta dl
		stx dh
		jmp pop_w_c	// rts





		// GROUP Cast.
		//

		// PROCEDURE cast_b2w.
		//
		// NOTE: cast_b2w is stored in the Operator Group
		//	 where it is identical to push_b #0.
		//


		// PROCEDURE cast_c2w.
		//
		// Change the signed byte (eg. char) on the top of the stack
		// into a word.
		//
		// MOD2: [bj 22sep1998] Wasn't looking at stack.
		//cast_b2w
// HFD a bit larger/faster
cast_c2w:
		ldx #0		// preset X for positive
		ldy datastacks	// Peek at [top]; copy into <A>.
		lda datastack,y
		bpl _cpos
		ldx #0xff
_cpos:
		txa
		jmp push_b_yok	// Push the extended MSB <X>.




		// GROUP Swap.
		//

		// PROCEDURE swap_bb.
		//
		// Swap bytes [under] and [top].
		//
swap_bb:
//HFD			still getting bigger and faster
		ldy datastacks
		lda data_m1,y
		sta cl
		lda datastack,y
		sta data_m1,y
		lda cl
		sta datastack,y
		rts


		// PROCEDURE swap_ww.
		//
		// Swap words [under] and [top].
		//
swap_ww:
//HFD			still getting bigger and faster
		ldy datastacks
		lda data_m2,y
		sta dl
		lda data_m3,y
		sta dh
		lda datastack,y
		sta data_m2,y
		lda data_m1,y
		sta data_m3,y
		lda dl
		sta datastack,y
		lda dh
		sta data_m1,y
		rts



		// GROUP Operators.
		//




		// PROCEDURE and_bb.  (Logical/Bit And with byte operands).
		//
		// [top] = [top] bitand [under].
		//
and_bb:
		jsr pop_b_cr	; Semicolon forces end of statement.
		and cl          ;
		jmp push_b_yok	// rts.


		// PROCEDURE or_bb.  (Logical/Bit Or with byte operands).
		//
		// [top] = [top] bitor [under].
		//
or_bb:
		jsr pop_b_cr
		ora cl
		jmp push_b_yok	// rts.


		// PROCEDURE and_ww.  (Logical/Bit And with word operands).
		//
		// [top] = [top] bitand [under].
		//
and_ww:		jsr pop_w_c     ;
		jsr pop_w_yok	; Semicolon forces end of statement.
		and cl		;
		sta cl          ;
		txa             ;
		and ch          ;
		sta ch          ;
		jmp push_w_c	// rts


		// PROCEDURE or_ww.  (Logical/Bit Or with word operands).
		//
		// [top] = [top] bitor [under].
		//
or_ww:		jsr pop_w_c
		jsr pop_w_yok
		ora cl
		sta cl
		txa
		ora ch
		sta ch
		jmp push_w_c	// rts


		// PROCEDURE pop_b_cr_cmp.
		//
		// Pop two bytes into <C> and <X>, then compare them
		// setting the carry/minus/zero flags as appropriate.
		//
pop_b_cr_cmp:
		jsr pop_b_cr
		cmp cl
		rts



		// PROCEDURE pop_w_cmp_cm.
		//
		// Pop two words into <C> and <X>, then compare them
		// setting the carry/minus flags as appropriate.
		//
_pop_w_cmp_cm:
		jsr pop_w_cr		// al,xh = [under].  c = [top].
		sec
		sbc cl
		txa
		sbc ch			// [under] - [top]
		rts


		// PROCEDURE pop_w_cmp_eq.
		//
		// Pop two words into <C> and <X>, then compare them
		// setting the zero flag as appropriate.
		//
_pop_w_cmp_eq:
		jsr pop_w_cr		// al,xh = [under].  c = [top].
		cmp cl
		bne _leave20
		cpx ch
_leave20:
		rts


		// PROCEDURE pop_w_cmp_all.
		//
		// Pop two words into <C> and <X>, then compare them
		// setting the carry/minus/zero flags as appropriate.
		//
_pop_w_cmp_all:
		jsr pop_w_cr		// al,xh = [under].  c = [top].

		cmp cl
		bne _set_nonzero
		cpx ch
		sec			// For upcoming subtraction
		beq _set_zero
_set_nonzero:
		sbc cl
		txa
		sbc ch			// [under] - [top]
		php
		pla		;
		and #253	;	// Clear the Z flag
		jmp _cont22     ;

_set_zero:
		sbc cl
		txa
		sbc ch			// [under] - [top]
		php
		pla
		ora #2			// Set Z flag.

_cont22:
		pha
		plp
		rts




		// PROCEDURE eq_ww.  [top] = [under] == [top] (words)
		//
eq_ww:
		jsr 	_pop_w_cmp_eq
		jmp	_res_eq

		// PROCEDURE ne_ww.  [top] = [under] != [top] (words)
		//
ne_ww:
		jsr 	_pop_w_cmp_eq
		jmp	_res_ne



		// PROCEDURE lt_ww.  [top] = [under] <  [top] (signed words)
		//
lt_ww:
		jsr 	_pop_w_cmp_cm	// [underneath] - [top]
		jmp	_res_lt


		// PROCEDURE le_ww.  [top] = [under] <= [top] (signed words)
		//
le_ww:
		jsr 	_pop_w_cmp_all
		jmp	_res_le


		// PROCEDURE gt_ww.  [top] = [under] >  [top] (signed words)
		//
gt_ww:
		jsr 	_pop_w_cmp_all
		jmp	_res_gt


		// PROCEDURE ge_ww.  [top] = [under] >= [top] (signed words)
		//
ge_ww:
		jsr 	_pop_w_cmp_cm
		jmp	_res_ge


		// PROCEDURE ult_ww. [top] = [under] <  [top] (unsigned words)
		//
ult_ww:
		jsr 	_pop_w_cmp_cm
		jmp	_res_ult


		// PROCEDURE ule_ww. [top] = [under] <= [top] (unsigned words)
		//
ule_ww:
		jsr 	_pop_w_cmp_all
		jmp	_res_ule


		// PROCEDURE ugt_ww. [top] = [under] >  [top] (unsigned words)
		//
ugt_ww:
		jsr 	_pop_w_cmp_all
		jmp	_res_ugt


		// PROCEDURE uge_ww. [top] = [under] >= [top] (unsigned words)
		//
uge_ww:
		jsr 	_pop_w_cmp_cm
		jmp	_res_uge




		// PROCEDURE push_b_1.
		//
		// Push true (-1).
		//
		// NOTE:
		//   We use -1 to represent true rather than 1,
		//   since -1 allows logical and bitwise
		//   boolean operations to be done by the
		//   same algorithm.  This is also faster.
		//   [bj 21sep1998].
		//
		//
push_b_1:
		lda #0xff
		/*FALLTHROUGH*/



		// PROCEDURE push_b.
		//
		// Push the byte <A> on the stack.
		//
push_b:
		ldy datastacks
push_b_yok:
		iny
		sta datastack,y
		sty datastacks
		rts



		// PROCEDURE cast_b2w.
		//
		// Change the unsigned byte on the top of the stack
		// into a word.  (Set the high byte to zero).
		//
		// This is identical to push_b_0.
		//
cast_b2w:
		/*FALLTHROUGH*/


		// PROCEDURE push_b_0.
		//
		// Push false (0).
		//
push_b_0:
		lda #0
		beq push_b	// rts.
		//NOTREACHED




		// PROCEDURE eq_bb.  [top] = [under] == [top] (bytes)
		//
eq_bb:
		jsr pop_b_cr_cmp
_res_eq:
		beq push_b_1
		bne push_b_0		// rts
		//NOTREACHED


		// PROCEDURE ne_bb.  [top] = [under] != [top] (bytes)
		//
ne_bb:
		jsr pop_b_cr_cmp
_res_ne:
		bne push_b_1
		beq push_b_0		// rts
		//NOTREACHED


// Unsigned Greater than: true iff [under] > [top]
//
ugt_bb:
		jsr pop_b_cr_cmp	// lda [under], cmp [top]
_res_ugt:
		beq push_b_0		// =  fails.
		bcs push_b_1		// >= suceeds.
		bcc push_b_0		// <  fails.
		//NOTREACHED


// Unsigned Greater than or equal: true iff [under] >= [top]
//
uge_bb:
		jsr pop_b_cr_cmp	// lda [under], cmp [top]
_res_uge:
		bcs push_b_1		// >= suceeds
		bcc push_b_0		// *  fails.
		//NOTREACHED


// Unsigned Less than: true iff [under] < [top]
//
ult_bb:
		jsr pop_b_cr_cmp	// lda [under], cmp [top]
_res_ult:
		bcc push_b_1		// < suceeds.
		bcs push_b_0		// * fails.
		//NOTREACHED


// Unsigned Less than or equal: true iff [under] <= [top]
//
ule_bb:
		jsr pop_b_cr_cmp	// lda [under], cmp [top]
_res_ule:
		bcc push_b_1		// < succeeds.
		beq push_b_1		// = succeeds.
		bne push_b_0		// * fails.
		//NOTREACHED


// Signed Greater than: true iff [under] > [top]
//				 [under] - [top] > 0
//
gt_bb:
		jsr pop_b_cr_cmp	// lda [under], cmp [top]
_res_gt:
		beq push_b_0		// =  fails.
		bpl push_b_1		// >= suceeds.
		bmi push_b_0		// <  fails.
		//NOTREACHED


// Signed Greater than or equal: true iff [under] >= [top]
//					  [under] -  [top] >= 0
//
ge_bb:
		jsr pop_b_cr_cmp	// lda [under], cmp [top]
_res_ge:
		bpl push_b_1		// >= suceeds
		bmi push_b_0		// <  fails.
		//NOTREACHED


// Signed Less than: true iff [under] < [top]
//			      [under] - [top] < 0
//
lt_bb:
		jsr pop_b_cr_cmp	// lda [under], cmp [top]
_res_lt:
		bmi push_b_1		// <  suceeds.
		bpl push_b_0		// *  fails.
		//NOTREACHED


// Signed Less than or equal: true iff [under] <= [top]
//				    [under] - [top] <= 0
//
le_bb:
		jsr pop_b_cr_cmp	// lda [under], cmp [top]
_res_le:
		bmi push_b_1		// <  suceeds
		beq push_b_1		// =  suceeds.
		bne push_b_0		// *  fails.
		//NOTREACHED




		// PROCEDURE pop_w_cd.
		//
		// Pop two words from the stack into <C>, <D>.
		//
		// <D> = [top]; <C> = [underneath]
pop_w_cd:
		jsr pop_w
		sta dl
		stx dh
		jmp pop_w_c		// rts.





		// GROUP Stack Management.
		//

		// PROCEDURE push_n.
		//
		// Push <A> bytes on the stack.
		// Their contents are undefined.
		//
push_n:
		clc
		adc datastacks
		sta datastacks
		rts


		// PROCEDURE pop_n.
		//
		// Pop <A> bytes off the stack and discard them.
		//
pop_n:
		sta el
		sec
		lda datastacks
		sbc el
		sta datastacks
		rts


		// PROCEDURE prolog.
		//
		// Start a new stack frame.  A stack frame is
		// needed when a subroutine is called that has
		// either parameters or local "automatic" variables.
		// When we return from the subroutine the stack
		// frame is discarded.
		//
		// Subroutines with none of these do not need a stack frame.
		//
		// A stack frame lives on the datastack.
		// It consists of local storage;
		// Parameters followed by local "automatic" variables.
		// The base pointer <BP> points to the first byte
		// underneath the the current stack frame.
		//
		//   datastack[BP+1] points to the first  byte of the frame.
		//   datastack[BP+2] points to the second byte of the frame.
		//   datastack[BP+3] points to the third  byte of the frame.
		//   etc.
		//
		// The prolog subroutine is called from the application
		// before calling the subroutine.  It should be called
		// immediately before the first parameter is pushed on
		// the stack.
		//
		// On entering the subroutine the bytes needed for
		// storage of local "automatic" variables needs to be
		// pushed on the stack (by calling push_n).  This is the
		// size of the stack frame, minus the size of the parameters.
		//
prolog:
		lda bp
		jsr push_b	// Saved old <BP> on stack.
		lda datastacks
		sta bp		// Current stack pointer becomes new <BP>.
		rts



		// PROCEDURE epilog.
		//
		// This subroutine is called before returning from
		// a stack-framed subroutine.  <A> is set to the size
		// of the local stack frame.
		//
		// Subroutines that return a value do so in <A, X>;
		// <A> (and <Y>) are destroyed by this subroutine,
		// and so <A> should be saved before calling it.
		//
epilog:
		jsr pop_n	// Discard local stack frame.
		jsr pop_b_yok   // Retrieve the previous <BP>.
		sta bp          // Restore  the previous <BP>.
		rts


		// ---------------------------------------------------
		if (target == target_7800)

		// HFD
		// PROCEDURE movemem
		//
		// moves data segment from ROM space to RAM
		// actually 3 move routines in 1
		// 1. copy parameters
		// 2. copy pages at a time
		// 3. copy less than 256 bytes
		//
movemem:
		ldx #5			// copy to e,f,c pseudo-registers
_lp1:
		lda params,x
		sta el,x
		dex
		bpl _lp1

					// e= start  f= end  c= size
					// then move data
		LDY #0
		LDX ch
		BEQ _MD2
_MD1:
		LDA (el),y		// move a page at a time
		STA (fl),y
		INY
		BNE _MD1
		INC eh
		INC fh
		DEX
		BNE _MD1
_MD2:
		LDX cl
		BEQ _MD4
_MD3:
		LDA (el),y		// move the remaining bytes
		STA (fl),y
		INY
		DEX
		BNE _MD3
_MD4:
		RTS

NMI:
IRQ:
		RTI
		endif
		// ---------------------------------------------------

// ===========================================================================
// char I/O
// ===========================================================================


		// SEGMENT Static Data.
		//

		// Runtime Error String.
		//
rerror_string:
		db "RERROR "
		db  0


		// GROUP Console I/O.
		//


		// PROCEDURE ioctl.
		//
		// Set the ioctl standard input mode from [top].
		//
ioctl:
		jsr pop_b
		sta charmode
		rts


		// PROCEDURE print_ch_b.
		//
		// Print [top] as a character.
		//
print_ch_b:
		jsr pop_b

		// ALTERNATE ENTRY print_ch_a.
		//
		// Print <A> as a character.
		//
print_ch_a:
		if (target != target_aim)
		 cmp #10			// convert '\n' to '\r'
		 bne _skip4
		 lda #13
_skip4:
		endif
		jmp call_print_ch_a	// rts

print_nl:
		if (target == target_aim)
		  jmp 0xea13
		else
		  lda #13		// convert '\n' to '\r'
		  bne _skip4	// small cut saves us a byte.
		endif
		//NOTREACHED


		// PROCEDURE print_string_ax.
		//
		// Print the null-terminated string pointed at by <A, X>.
		//
		// Two strings walk into a bar.
		// The first says, "I'll have a beer."
		// The second says, "I'll have a beer too1r380*)@$(C@&unrd3c"
		// The first says, "You'll have to excuse my friend;
		// He isn't null terminated."  :)
		//
		//
print_string_ax:
		sta cl
		stx ch
		jmp print_string_c

		// ALTERNATE ENTRY print_string_w.
		//
		// Print the null-terminated string pointed at by [top].
		//
print_string_w:
		jsr pop_w_c

		// ALTERNATE ENTRY print_string_w.
		//
		// Print the null-terminated string pointed at by <C>.
		//
print_string_c:
_loop5:		ldy #0
		lda (cl),y
		beq _leave5
		jsr print_ch_a
		inc cl
		bne _loop5
		inc ch
		jmp _loop5
_leave5:
__rts1:
		rts


		// PROCEDURE print_integer_ax_signed.
		//
		// Print the signed word <A, X> as an integer.
		//
print_integer_ax_signed:
		sta cl
		stx ch
		ldx #0			// Address <C>
		stx is_negative		// Clear <is_negative>.
		jsr _abs_cx_flag	// Make absolute; update <is_negative>
		ldx is_negative		// Test <is_negative>.
		beq print_integer_c	// Not set, so Positive; print as is.
		lda #45			// Set, so Negative; print '-'.
		jsr print_ch_a
		jmp print_integer_c	// Now print it.

		// PROCEDURE print_integer_ax.
		//
		// Print the unsigned word <A, X> as an integer.
		//
print_integer_ax:
		sta cl
		stx ch
		jmp print_integer_c

		// ALTERNATE ENTRY print_integer_w.
		//
		// Print the unsigned word [top] as an integer.
		//
print_integer_w:
		jsr pop_w_c

		// ALTERNATE ENTRY print_integer_c.
		//
		// Print the unsigned word <C> as an integer.
		//

		// ----------------------
		//   Better routine here
		//   no long division - no buffer
		// ----------------------
// -------------------------------------------------------------
// This section found in "6502 Software Design" by Leo J. Scanlon
// (page 156) and adapted/optimized by Harry Dodgson
// -------------------------------------------------------------
print_integer_c:
	ldy #0			; init table pointer
	sty dl			; leading zero suppresion
_nxtdig:
	ldx #0			; init digit count
_subem:
	lda cl			; fetch LSB of binary value
	sec
	sbc _subtbl_l,y		; subtract LSB of table value
	sta cl			; save LSB of result
	lda ch			; fetch MSB
	sbc _subtbl_h,y		; subtract MSB
	bcc _adback		; if results is negative, done with this digit
	sta ch			; save MSB of result
	inx			; increment digit count
	bne _subem		; loop	(bra)

_adback:
	lda cl			; fetch LSB 
	adc _subtbl_l,y		; add table value to put it back where it was
	sta cl			; save LSB
;
	bit dl			; test for non-zero printed
	bmi _prtit		; if so, then print this digit
	txa			; get this digit
	beq _skpit		; if zero, then don't print
	lda #0x80
	sta dl			; otherwise set flag
;
_prtit:
	txa			; get digit count
	ora #0x30		; convert to ascii
	jsr call_print_ch_a	; print it
_skpit:
	iny			; advance to next table value
	cpy #4			; test for end of table
	bcc _nxtdig		; continue with loop for next digit
	lda cl			; get last digit (remainder)
	ora #0x30		; convert to ascii
	jmp call_print_ch_a	; print it - rts

				; number table
				; 10,000 1,000 100 10
_subtbl_l:
	db 0x01, 0xe8, 0x64, 0x0a
_subtbl_h:
	db 0x27, 0x03, 0x00, 0x00
// -------------------------------------------------------------

		// PROCEDURE get_ch_a.
		//
		// Get a character from standard input.
		// Return the character in 'A'.
		//
		// This applies the ioctl standard input mode
		// before calling the appropriate kernel routines.
		//
get_ch_a:	lda charmode
		beq _get_cooked
		bmi _get_nowait

_get_wait:	jsr call_get_ch_raw_a
		cmp #0
		beq _get_wait
		bne __rts1

_get_nowait:	jmp call_get_ch_raw_a

_get_cooked:	jmp call_get_ch_cooked_a


get_ch_b:	jsr get_ch_a
		jmp push_b

		// GROUP Runtime Error.
		//


		// PROCEDURE rerror (report runtime error).
		//
		// Terminate reporting runtime error <A>.
		//

		// note that the 7800 doesn't have anywhere to return to
		// so we better not error out
rerror:
		pha
                lda #(rerror_string % 256)
                ldx #(rerror_string / 256)
		jsr print_string_ax		// Print "RERROR "
		pla
		ldx #0
		jsr print_integer_ax		// Print the error #.

		if (target == target_aim)
		jmp 0xe1a1			// return to AIM-65 monitor

		else
		jsr call_get_ch_cooked_a	// Wait for an Enter.
		brk				// Soft break.
		endif

// ===========================================================================
// MATH
// ===========================================================================

		//HFD  never used D register - makes smaller and faster
		// LOCAL PROCEDURE _abs_cx_flag.
		//
		//
		// Set <C> to the absolute value of itself.
		// If <C> was negative, then increment the
		// <is_negative> counter.
		//
		// NOTE:
		//   The <is_negative> counter can be used to count the
		//   total number of negative operands to an operator.
		//   If this number is odd, we know we need to negate
		//   the result of the operator.
		//

		// ----------------------
		//   Better routines here
		// ----------------------
_abs_cx_flag:
		lda ch
		bpl _skip6

		// ALTERNATE ENTRY _neg_cx_flag.
		//
_neg_cx_flag:
// HFD			<C> = 0-<C>
		inc is_negative
_neg_cx:
		sec
		lda #0
		sbc cl
		sta cl
		lda #0
		sbc ch
		sta ch
_skip6:
		rts


		// PROCEDURE pop_w_cd_abs.
		//
		// Pop two words from the stack into <C>, <D>.
		//
		// <D> = [top]; <C> = [underneath];
		// <is_negative> = number of negative operands.
		//
_pop_w_cd_abs:

		lda #0
		sta is_negative
		jsr pop_w_c
		jsr _abs_cx_flag
		lda cl
		sta dl
		lda ch
		sta dh
		jsr pop_w_c
		// this is because the remainder retains the dividend sign
		// put the sign bit into high bit of is_negative
		lda ch			;
		and #0x80		;
		ora is_negative		;
		sta is_negative		;
		jmp _abs_cx_flag

_push_w_abs2:
		jsr push_w
		bit is_negative
		bpl _skip07
		jsr _neg_w_yok
_skip07:
		rts

_push_w_abs:
		sta cl			// have to save this now
		lda is_negative         ;
		and #1			;
		beq _skip7              ;
		stx ch
		jsr _neg_cx
_skip7:
		jmp push_w_c		; rts





		// PROCEDURE mod_ww. [top] = [under] % [top] (signed word)
		//
mod_ww:
		jsr _pop_w_cd_abs
		jsr div_mod_w
_mod_ww_rt:
		lda el
		ldx eh
		jmp _push_w_abs2	; rts,


		// PROCEDURE div_ww. [top] = [under] / [top] (signed word)
		//
div_ww:
		jsr _pop_w_cd_abs
		jsr div_mod_w
		lda cl
		ldx ch
		jmp _push_w_abs		; rts,




		// PROCEDURE umod_ww. [top] = [under] % [top] (unsigned word)
		//
umod_ww:
		jsr pop_w_dc
		jsr div_mod_w
		lda el
		ldx eh
		jmp push_w		// rts


		// PROCEDURE udiv_ww. [top] = [under] / [top] (unsigned word)
		//
udiv_ww:
		jsr pop_w_dc
		jsr div_mod_w
		jmp push_w_c

		// PROCEDURE div_mod_w.
		//
		// Divides <C> by <D>.
		//
		// IN:
		//   C		dividend (number)
		//   D		divisor
		// OUT:
		//   E		Remainder.
		//   C		Quotient.
		//
div_mod_w:
		lda dl		// check for div/0
		bne _okdiv	// could ignore this and let the result go to 0xffff
		lda dh
		bne _okdiv
		// lda #0
		jmp rerror	// only runtime error at this time

// -------------------------------------------------------------
// This section found in "6502 Software Design" by Leo J. Scanlon
// (page 132)
// -------------------------------------------------------------
// This is the smallest and fastest one I've found

	reml   = el
	remh   = eh
	divndl = cl
	divndh = ch
	divsrl = dl
	divsrh = dh

_okdiv:
	lda #0			; clear partial dividend
	sta reml
	sta remh
	ldx #16			; set dividend bit count 
_nxtbt:
	asl divndl		; shift dividend/quotient left
	rol divndh
	rol reml		; shift partial dividend left
	rol remh
	lda reml		; subtract divisor LSB
	sec
	sbc divsrl
	tay			; save LSB in Y
	lda remh		; subtract divisor MSB
	sbc divsrh
	bcc _cntdn		; if divisor is less than dividend
	inc divndl		; set bit in quotient
	sta remh		; save subtraction result
	sty reml
_cntdn:
	dex			; decrement bit count
	bne _nxtbt		; loop until all bits done
	rts 			; only 38 bytes
// -------------------------------------------------------------



		// PROCEDURE add_bb.  [top] = [under] + [top] (bytes)
		//
add_bb:
// HFD			any pop then push will get faster this way
		ldy datastacks
		lda datastack,y		// a = [under].  c = [top].
		clc
		adc data_m1,y
		sta data_m1,y
		dey
		sty datastack
		rts


		// PROCEDURE sub_bb.  [top] = [under] - [top] (bytes)
		//
sub_bb:
// HFD			any pop then push will get faster this way
		ldy datastacks
		lda data_m1,y		// a = [under].  c = [top].
		sec
		sbc datastack,y
		sta data_m1,y
		dey
		sty datastack
		rts


		// PROCEDURE add_ww.  [top] = [under] + [top] (words)
		//
add_ww:
		jsr pop_w_cr		// al,xh = [under].  c = [top].
		clc
		adc cl
		sta cl
		txa
		adc ch
		sta ch
		jmp push_w_c		// rts.


		// PROCEDURE sub_ww.  [top] = [under] - [top] (words)
		//
sub_ww:
		jsr pop_w_cr		// al,xh = [under].  c = [top].
		sec
		sbc cl
		sta cl
		txa
		sbc ch
		sta ch
		jmp push_w_c		// rts.



		// PROCEDURE extend.
		//
		// Extend signed byte a <A> into signed word <A, X>.
		// If <A> is negative <X> is set to 0xffu.
		// If <A> is zero or positive<X> is set to 0x00.
		//
extend_ax:
		cmp #0
		bmi _extend_neg
		ldx #0
		rts
_extend_neg:	ldx #0xff
		rts

		// PROCEDURE not_w.   [top] = ~[top] 	(words)
		//
not_w:
// HFD			smaller and faster when combined
		ldy datastacks
		jsr not_b
		dey
		jmp _not_b_y		// saves 3 cycles

		// PROCEDURE not_b.   [top] = ~[top] 	(bytes)
		//
not_b:		
// HFD			
		ldy datastacks
_not_b_y:
		lda datastack,y
		eor #0xff
		sta datastack,y
		rts


		// PROCEDURE neg_w.   [top] = -[top] 	(words)
		//
neg_w:
// HFD			larger, but faster
		ldy datastacks
_neg_w_yok:
		sec
		lda #0
		sbc data_m1,y
		sta data_m1,y
		jmp _neg_b_y

		// PROCEDURE neg_b.   [top] = -[top] 	(bytes)
		//
neg_b:
// HFD
		ldy datastacks
		sec
_neg_b_y:
		lda #0
		sbc datastack,y
		sta datastack,y
		rts
		

	if (0)
// HFD  - this isn't used yet - should also have shift-right routine
		// PROCEDURE rot_w.  [top] = [top] << <Y>.
		//
rot_w:
		sty bitcount
		jsr pop_w_c
// HFD			fixed the following anyway
		ldy bitcount
		beq _leave10
_loop10:
		asl cl
		rol ch
		dey
		bne _loop10
_leave10:
		jmp push_w_c
	endif



		// PROCEDURE mul_ww. [top] = [under] / [top] (signed word)
		//
mul_ww:
		jsr _pop_w_cd_abs
		jsr umul_ww_cd
		lda is_negative	;
		and #1		;
		beq _skip20	;
		jmp neg_w
_skip20:
		rts


		// PROCEDURE mul_ww.  [top] = [under] * [top] (unsigned word)
		//
umul_ww:
		jsr pop_w_cd		// Order doesn't matter...

		// ALTERNATE ENTRY mul_ww_cd.  [top] = <C> * <D>.
		//
umul_ww_cd:
		lda #0
		sta el
		sta eh

		ldx #16

_loop0:
		asl el
		rol eh

		bit ch
		bpl _skip0

;		clc		// carry better be clear or we have overflow
		lda el
		adc dl
		sta el
		lda eh
		adc dh
		sta eh

_skip0:
		asl cl
		rol ch

		dex
		bne _loop0

		lda el
		ldx eh
		jmp push_w		// rts.


		// ---------------------------------------------------
		// returns remainder from last division
		// -- if used right after division --
"getrmdr()u":
		lda el
		ldx eh
		rts

		// signed remainder needs a little more work
		// if this was C++, we could overload one call
		// at the C level, but it's not
"getsrmdr()s":
		jsr _mod_ww_rt
		jmp pop_w               // returns in AX, not stack

// square root routine
//
// input in CL, CH
// output in CL

	odd_l	= dl
	odd_h	= dh
	val_l	= cl
	val_h	= ch

"sqrt(u)b":
	jsr pop_w_c

// -------------------------------------------------------------
// This section found in "6502 Software Design" By Leo J. Scanlon
//
// -------------------------------------------------------------

	LDY #1			; LSB of first odd number = 1
	STY odd_l
	DEY
	STY odd_h		; MSB of first odd number (sqrt = 0)
_again:
	SEC
	LDA val_l		; save remainder in X register
	TAX			; subtract odd lo from integer lo
	SBC odd_l
	STA val_l
	LDA val_h		; subtract odd hi from integer hi
	SBC odd_h
	STA val_h		; is subtract result negative?
	BCC _nomore		; no. increment square root
	INY
	LDA odd_l		; calculate next odd number
	ADC #1
	STA odd_l
	BCC _again
	INC odd_h
	JMP _again
_nomore:
	TYA 			; all done, copy result to Acc
				; remainder in X - don't care about it
	RTS


// ===========================================================================
// memory
// ===========================================================================

		// GROUP Memory Management.
		//

		// PROCEDURE peek_w_b.
		//
		// [top] = memory[[top]]
		//
		// Peek at the address in [top], returning the byte
		// of the value at that address on the stack.
		//
peek_w_b:
		jsr pop_w_c
_peek_w_b_c:
		ldy #0
		lda (cl),y
		jmp push_b



		// PROCEDURE peek_w_w.
		//
		// [top] = memory[[top]]
		//
		// Peek at the address in [top], returning the word
		// of the value at that address on the stack.
		//
// HFD			bigger, but faster and easier to read
peek_w_w:
		jsr pop_w_c
		ldy #0
		lda (cl),y
		pha
		iny
		lda (cl),y
		tax
		pla
		jmp push_w


		// PROCEDURE poke_w_b.
		//
		// memory[[under]] = [top]
		//
		// Set the address at [under] to the byte [top].
		//
poke_w_b:
		jsr pop_b
		pha
		jsr pop_w_c
		pla
_poke_w_b_c:
		ldy #0
		sta (cl),y
		rts


		// PROCEDURE poke_w_w.
		//
		// memory[[under]] = [top]
		//
		// Set the address at [under] to the word [top].
		//
// HFD			bigger, but faster and easier to read
poke_w_w:
		jsr pop_w
		sta dl
		stx dh
		jsr pop_w_c
		lda dl
		ldy #0
		sta (cl),y
		iny
		lda dh
		sta (cl),y
		rts

		// ---------------------------------------------------
		// C callable routines
		// previous instruction is always a push or cast
		// so it is safe to skip ldy datastacks in all cases

		// returns the address of the variable passed
		// if called as var1 = hl(&var2) since var1 = &var2 doesn't do that

		// hi() and lo() work with both data and addresses
"hl(u)u":
		ldx datastack,y
"lo(u)b":
		lda data_m1,y
		jmp _full_end

		// ---------------------------------------------------
		// HFD return high or low byte of uint
		// faster than var/256 or var%256
		// have to adjust the datastack pointer though
		// would be much better if done in the compiler
"hi(u)b":
		lda datastack,y
_full_end:
		dey
		dey
		sty datastacks
		rts

		// ---------------------------------------------------
		// MEMSET
		// similar to C string library routine
		// set memory to one value
		// not too efficient, but it works
"memset(ubu)":
		JSR pop_w_yok		// amount to set
		STA dl
		STX dh
		JSR pop_b_yok		// value
		PHA
		JSR pop_w_yok		// location to start

		CLC			// convert amount to end location
		STA cl
		ADC dl
		STA dl
		TXA
		STA ch
		ADC dh
		STA dh

		LDY #0
		PLA
_memsi:
		STA (cl),y
		INC cl
		BNE _memsl
		INC ch
_memsl:
		LDX cl
		CPX dl
		BNE _memsi
		LDX ch
		CPX dh
		BNE _memsi
		RTS

		// ---------------------------------------------------
		// MEMMOVE
		// similar to C string library routine
		// is safe to copy overlapping memory sections
		// r0 and r2 have to be in page 0
		// reuses other temporary memory locations

// -------------------------------------------------------------
// This section found in "The First Book of KIM" By Lew Edwards
// (page 127-128)
// -------------------------------------------------------------
	r0  = cl		// source
	r0h = ch
	r1  = el		// last source
	r1h = eh
	r2  = dl		// destination
	r2h = dh
	r3  = fl		// last destination
	r3h = fh
	r4  = bitcount		// size
	r4h = is_negative

"memmove(uuu)":
		JSR pop_w_yok		// amount to move
		STA r4
		STX r4h
		JSR pop_w_yok		// source
		STA r0
		STX r0h
		JSR pop_w_yok		// destination
		STA r2
		STX r2h

		LDY #0xff		// slight speedup

		CLC			// compute last source address
		LDA r4
		ADC r0
		STA r1
		LDA r0h
		ADC r2h
		STA r1h
		
		CLC			// compute last destination address
		LDA r4
		ADC r2
		STA r3
		LDA r4h
		ADC r2h
		STA r3h

		INC r4			// adjust (DEC works with BNE, not BPL)
		INC r4h

		SEC			// test if source higher than destination
		LDA r2
		SBC r0
		LDA r2h
		SBC r0h			// determine which way to copy
_mmem1:
		LDX #0
		BCC _skip_m1
		LDX #2
_skip_m1:
		LDA (r0,x)		// could be copying up or down
		STA (r2,x)		// from r0-r2 or r1-r3
		BCC _skip_m4
		DEC r1
		TYA
		EOR r1
		BNE _skip_m2
		DEC r1h
_skip_m2:
		DEC r3
		TYA
		EOR r3
		BNE _skip_m3
		DEC r3h
_skip_m3:
		BCS _skip_m6
_skip_m4:
		INC r0
		BNE _skip_m5
		INC r0h
_skip_m5:
		INC r2
		BNE _skip_m6
		INC r2h
_skip_m6:
		DEC r4			// decrement byte count
		BNE _skip_m7
		DEC r4h
_skip_m7:
		BNE _mmem1
		RTS

// ===========================================================================
		end
