;++
; MACRO64$PI.M64 -- Based on original algorithm and C program by Remy Dube.
;
;	Copyright 1992 Digital Equipment Corporation
;	All rights reserved.
;         
;	This software is furnished under a license and may be used and  copied
;	only in accordance with the terms of such license and with  the
;	inclusion of the above copyright notice.  This software and any copies
;	shall not be provided to any other person.  No  title to or ownership of
;	the software is hereby transferred.  The  information in this software
;	is subject to change without notice. DIGITAL assumes no responsibility
;	for the use, functionality or  reliability of its software on equipment
;	which is not supplied by  DIGITAL.
;
; This is a complete MACRO-64 program which computes PI to a specified 
; number of digits.  This example is one of three installed with MACRO-64:
;
;	MACRO64$HELLO.M64	- Simple Hello World program
;				  Demonstrates calling standard macros
;	MACRO64$WHAMI.M64	- Program that displays WHAMI IPR 
;				  Demonstrates system calls
;	MACRO64$PI.M64		- Program that computes PI (this program)
;				  Demonstrates general programming with 
;                                 MACRO-64 along with optimization techniques
;
; Digital suggests you become familiar with the concepts illustrated in 
; the MACRO64$HELLO.M64 example and the MACRO64$WHAMI.M64 example before 
; you attempt to understand this example example.
;
;
; The original algorithm implemented by this program is by Remy Dube.  
; This MACRO-64 implementation is adapted from Remy's C implementation.  
; While the algorithm and implementation are reasonably fast and accurate, 
; the intent of this program is to illustrate MACRO-64 programming 
; concepts rather than set world speed or precision records.  This program 
; illustrates the use of a number of optimization techniques, such as loop 
; unrolling and routine inlining.  In several instances, the context in 
; which these techniques are applied causes the technique to yield only a 
; marginal benefit.  Furthermore, due to the decreased development and 
; maintenance costs and increased portability associated with High Order 
; Languages, it would not normally be advisable to write an entire program 
; such as this in assembly language.  Nonetheless, this program serves to 
; illustrate a number of paradigms applicable to efficient programming in 
; MACRO-64.
;
; This program calls the DEC C Runtime Library to perform I/O and for 
; other ancillary tasks.  Note that the DEC C Runtime Library is available 
; on all OpenVMS AXP systems, regardless of whether you have purchased and 
; installed the DEC C compiler product.
;
; To run this program on an OpenVMS AXP system, use the following commands:
;
;	$ macro/alpha_axp/object=pi sys$examples:macro64$pi
;	$ link pi
;	$ run pi
;--

	;++
	; The $library_debug macro helps work around a problem that 
    	; exists between MACRO-64 V1.0 and the V1.0 symbolic debugger 
	; where the debugger fills the screen with messages indicating
	; "no source line available."  Invoking $library_debug near 
	; the beginning of your module causes these message to occur 
	; at the beginning of your module, rather than interdispersed 
	; throughout your routines.
	;--
	$library_debug

	; About 30 blank lines after the invocation of $library_debug
	; keeps the "source line not available" messages from 
	; overwriting your MACRO-64 statements in the source display 
	; window.


;+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
;                         ASSEMBLY-TIME CONSTANTS
;---------------------------------------------------------------------------

; Turn on optimizations and automatic alignment.
	.enable align_code, align_data, peephole, schedule

PRINT_WIDTH	= 100 ; Width of output device
BIGINT_BYTES	= 8   ; Number of bytes in a quadword
BIGINT_SHIFTER	= 3   ; L-Shift this much to convert array index to byte offset
BIGINT_BITS	= BIGINT_BYTES * 8

; Integer argument registers.
IARG0		= "r16"
IARG1		= "r17"
IARG2		= "r18"
IARG3		= "r19"
IARG4		= "r20"
IARG5		= "r21"

; Floating-point argument registers.
FARG0		= "f16"
FARG1		= "f17"
FARG2		= "f18"
FARG3		= "f19"
FARG4		= "f20"
FARG5		= "f21"


;+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
;                          MODULE VARIABLE DATA
; ---------------------------------------------------------------------------
	$data_section
WEIGHT:		.quad		0
FILE_VAR:	.quad		0
LOG_4:		.g_floating 	0.0
PRECISION:	.quad		0
IMAX:		.quad		0
CLUSTER_SIZE:	.quad		0
CLUSTER_VECT:	.quad		0
CLUSTER_VECT_SIZE:
		.quad		0
ISTOP:		.quad		0
START:		.quad		0
FINISH:		.quad		0
DURATION:	.quad		0
SCRATCH:	.byte 		%repeat(99,<0,>)0

CLK_TCK:	.g_floating 	100.0 ; From DECC$LIBRARY_INCLUDE:TIME.H

G1:		.g_floating 	1.0
G4:		.g_floating	4.0


;+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
;                        UTILITY MACROS
;---------------------------------------------------------------------------

;++
; Macro IPOW
; Abstract: Integer power function for assembly-time values.
; Inputs: A - Value to be raised to a power
;         B - Power to which to raise value
; Method: Multiply A times itself B times.
;--
	.macro IPOW A B RESULT
	    RESULT = %repeat(<<B>>,<<A>*>) 1
	.endm IPOW


; The LOOP_UNROLL assembly-time variable is used to control manual loop 
; unrolling:  Use 0 For no loop unrolling, 1 for normal loop unrolling, 
; or a value greater than 1 for a higher, global unroll factor.
LOOP_UNROLL	= 1


;++
; Macro UNROLL
; Abstract:
;   Macro UNROLL is used to assist in in manually unrolling a loop.   
;   It can be beneficial to repeat the body of a loop a number of times 
;   with branch-outs between each repetition to the loop exit.  At 
;   the least, you gain the benefit of a a branch not taken vs. a branch 
;   taken.  While this effect may not gain very much, when loop unrolling 
;   is combined with instruction scheduling (.ENABLE SCHEDULE), you may 
;   be able to schedule portions successive repetitions of the loop in 
;   parallel.  See Appendix B of the MACRO-64 Assembler for OpenVMS AXP 
;   Systems Reference Manual for more information on this optimization 
;   technique.
;
;   To use UNROLL, you must define a macro that expands to the body of the 
;   loop.  Specify the name of this macro with the BODY argument.  You
;   must also specify 2 branch instruction statements: 1 that tests a
;   condition and branches to the beginning of the loop to continue, and
;   another that tests the opposite condition and branches to the end of the
;   loop to exit.  Specify the loop continuation statement with the 
;   CONTINUE argument.  Specify the loop exit statement with the EXIT 
;   argument.  Specify the target label for the loop-continue statement
;   as %%LOOP_START%% and the target for the loop-exit statement as
;   %%LOOP_END%%.  For instance, the continue statement might be a BNE and 
;   the end statement would be BEQ, or vice versa.  Often, you must precede 
;   the branch instruction with a comparison instruction.  If so, specify
;   the comparison instruction statement with the COMPARE argument.
;
;   You can use the TEST_FIRST argument to specify the style of the loop.  
;   Specify 1 to get generate an exit test prior to the first iteration of 
;   the loop.  Specify 0 to always execute the loop at least once.
;
;   You can experiment with different values for the FACTOR argument.  A 
;   FACTOR of 0 or 1 results in a single copy of the loop body -- that is, no 
;   loop unrolling.  FACTORs greater than 1 can improve runtime performance 
;   at the expense of increased code size.  However, depending on the 
;   size of the loop body, too large a FACTOR can cause the repeated loop 
;   body to overflow the instruction cache, thus negatively impacting 
;   performance.
;
;   In addition, if the loop body contains a call to another routine, 
;   it is unlikely that the scheduler (.ENABLE SCHEDULE) will be able to
;   concurrently schedule successive repetitions of the loop body.  As a 
;   result, the benefit realized by unrolling the loop will likely 
;   reduce to the difference in cost between a branch taken and a branch 
;   not taken times the FACTOR you specify.
;
;   Thus, the best loops to unroll are those that are in your critical 
;   performance path, contain a small number of instructions, and do not 
;   call other routines.
;
; Inputs:
;   BODY - Name of the macro that expands to the loop body
;   FACTOR - Specific repetition factor to use when unrolling the loop
;   CONTINUE - A conditional branch instruction statement to continue the loop
;   EXIT - A conditional branch instruction statement to exit the loop
;   COMPARE - An optional comparison instruction to use prior to the CONTINUE 
;     or EXIT instruction
;   TEST_FIRST - non-zero means test the exit condition prior to the 1st 
;     iteration, zero means always execute the loop at least once
;   UNROLL_LOOP (assembly variable) - Controls whether or not to unroll loops
;     or whether to increase the unroll factor for all unrolled loops
;
; Outputs:
;   The body of the loop is repeated FACTOR*UNROLL_LOOP times.
;
; Example usage:
;
;	; First define the loop body as a macro
;	.macro LOOP_BODY
;	    ldq    r0, (r5)
;           lda	   r5, 8(r5)
;           lda    r1, 1(r1)
;	.endm LOOP_BODY
;
;	; Now invoke the UNROLL macro to unroll the loop
;	UNROLL BODY=LOOP_BODY, FACTOR=3, TEST_FIRST=1,	-
;	    COMPARE  = <cmple r0, r2, r22>,		-
;	    CONTINUE = <bne r22, %%LOOP_START%%>,	-
;	    EXIT     = <beq r22, %%LOOP_END%%>
;
;	; The above use of UNROLL would result in the following code 
;	; being generated (where LOOP_START and LOOP_END are unique labels 
;       ; for each invocation of UNROLL):
;
;		cmple r0, r2, r22    ; These 2 instructions are suppressed 
;		beq   r22, LOOP_END  ;  if you specify TEST_FIRST=0
;   	LOOP_START:
;		ldq   r0, (r5)
;		lda   r5, 8(r5)
;	        lda   r1, 1(r1)
;		cmple r0, r2, r22
;		beq   r22, LOOP_END
;		ldq   r0, (r5)
;	        lda   r5, 8(r5)
;		lda   r1, 1(r1)
;		cmple r0, r2, r22
;		beq   r22, LOOP_END
;		ldq   r0, (r5)
;		lda   r5, 8(r5)
;       	lda   r1, 1(r1)
;		cmple r0, r2, r22
;		bne   r22, LOOP_START
;	LOOP_END:
;--
.macro UNROLL BODY, FACTOR, CONTINUE, EXIT, COMPARE, TEST_FIRST=1
    .if not_defined, UNROLL_INDEX
	UNROLL_INDEX = 0
    .else
	UNROLL_INDEX = UNROLL_INDEX + 1
    .endc
    LOOP_START = "UNROLL_LOOP_START_%integer(UNROLL_INDEX)"
    LOOP_END   = "UNROLL_LOOP_END_%integer(UNROLL_INDEX)"
    UNROLL_HELPER <BODY>, <FACTOR>, <%LOOP_START%>, <%LOOP_END%>, -
	<CONTINUE>, <EXIT>, <COMPARE>, <TEST_FIRST>
.endm UNROLL

; Helper macro for loop unrolling
.macro UNROLL_HELPER BODY, FACTOR, START, END, CONTINUE, EXIT, COMPARE, -
    TEST_FIRST
    .if ne, <TEST_FIRST>
	COMPARE
	EXIT
    .endc
    START:
    .repeat <<FACTOR>-1> * LOOP_UNROLL
	BODY
	COMPARE
	EXIT
    .endr
    BODY
    COMPARE
    CONTINUE
    END:
.endm UNROLL_HELPER


;+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
;                       MODULE LOCAL ROUTINES
;---------------------------------------------------------------------------


	$routine ASK_NUMBER_OF_DIGITS, kind=stack, local=true, 		-
	    data_section_pointer=true,					-
	    saved_regs=<r2,r3>
;++
; Abstract:	Inquire from the user terminal who many digits to compute.
; Inputs:	Terminal I/O
; Outputs:	PRECISION - Number of digits to compute
;--
	$linkage_section
10$:	.asciz "How many digits do you want to compute? "
20$:	.asciz "Computing PI with %-d digits \x0A\x0A"
30$:	.asciz "Please specify a positive, non-zero integer value.\x0A"
	$code_section
	.base	r27, $ls			; 1st access link sect w/r27
	mov	r27, r2				; Later use r2 for link sect 
	ldq	r3, $dp				; Access...
	.base	r3, $ds				; ...data section
	.base	r2, $ls				; Now use r2 for link sect
100$:	$call 	DECC$GPRINTF, args=10$/a	; Print "How many?"
	$call	DECC$GETS, args=SCRATCH/a	; Get answer
	$call 	DECC$ATOL, args=SCRATCH/a	; Convert to longword
	bgt	r0, 200$
	$call	DECC$GPRINTF, args=30$/a	; Print "be positive"
	br	100$
200$:	stq	r0, PRECISION			; Set precision
	; Output "Computing with N digits"
	$call	DECC$GPRINTF, args=<20$/a, r0/l>, scratch_regs=<r22,r23>
	$return
	$end_routine ASK_NUMBER_OF_DIGITS


	$routine INITIALIZE, kind=stack, local=true, 			-
	    saved_regs=<r2,r3,r4,r5,r7>,				-
	    data_section_pointer=true
;++
; Abstract:	Start up processing.
; Inputs:	
;   PRECISION
; Outputs:
;   LOG_4		  - log10(4.0)
;   IMAX		  - PRECISION/LOG_4
;   WEIGHT		  - Scale factor for various computations
;   CLUSTER_SIZE	  - Number of decimal digits represented in a quadword 
;   CLUSTER_VECT_SIZE 	  - Number of quadwords needed for spec'd precision
;   CLUSTER_VECT	  - Ptr to array of quadwords 
;   CLUSTER_VECT[0..last] - Initialized to 0
;   FILE_VAR		  - Opened for output to PI.DAT
;--
	$linkage_section
POWX	= "r4"				; r4 holds pow(2,BIGINT_BITS-4)/IMAX*10
W	= "r5"				; r5 is temp for WEIGHT
CS	= "r6"				; r6 is temp for CLUSTER_SIZE
CVS	= "r7"				; r7 is temp for CLUSTER_VECT_SIZE
	IPOW	2 <BIGINT_BITS-4> T	; T=pow(2,BIGINT_BITS-4)
10$:	.quad		T
20$:	.asciz "pi.dat"
30$:	.asciz "w"
40$:	.asciz "        PI WITH %-d digits \x0A \x0A"
	$code_section
	.base	r27, $ls			; 1st access link sect w/r27
	mov	r27, r2				; Later use r2 for link sect 
	ldq	r3, $dp				; Access...
	.base	r3, $ds				; ...data section
	.base	r2, $ls				; Now use r2 for link sect
	.base	r3, $ds				; ...data section
	$call	DECC$GLOG10, args=G4/g		; r0=LOG10(4.0)
	stg	f0, LOG_4			; LOG_4=LOG10(4.0)
	ldt	f1, PRECISION			; Convert precision to...
	cvtqg	f1, f1				; ...G float
	divg	f1, f0, f1			; f1=PRECISION/LOG_4
	cvtgq/c	f1, f1				; Convert to quadword
	stt	f1, IMAX			; IMAX=PRECISION/LOG_4
	ldq	%IARG1%, IMAX			; IARG1=IMAX
	mulq	%IARG1%, #10, %IARG1%		; IARG1=IMAX*10

	;++
	; The OTS$DIV_L routine takes 2 quad word arguments and returns 
	; in r0 the quadword quotient of the first argument divided by the 
	; second argument.  OTS$DIV_L is in the sharable library LIBOTS.EXE, 
	; which is available at link time by default.  A similar routine, 
	; OTS$DIV_I, is available in that same library for longword integer 
	; division.
	;--
	; r0=pow(2,BIGINT_BITS-4)/(IMAX*10)
	$call	OTS$DIV_L, args=<10$/q, %IARG1%/q> 
	; POWX=pow(2,BIGINT_BITS-4)/(IMAX*10)
	mov	r0, %POWX%
	mov 	1, %W% 				; WEIGHT=1
	mov	0, %CS%				; CLUSTER_SIZE=0

	; Loop computing WEIGHT and CLUSTER_SIZE
	.macro LOOP_BODY			; Loop body for unrolling
	    mulq	%W%, #10, %W%		; WEIGHT=WEIGHT*10
	    lda		%CS%, 1(%CS%)		; CLUSTER_SIZE=CLUSTER_SIZE+1
	.endm LOOP_BODY
	UNROLL BODY=LOOP_BODY, FACTOR=10, TEST_FIRST=1, 		-
	    COMPARE  = <cmple %W%, %POWX%, r23>,			-
	    CONTINUE = <bne r23, %%LOOP_START%%>,			-
	    EXIT     = <beq r23, %%LOOP_END%%>

	stq	%W%, WEIGHT			; Store local copies of 
	stq	%CS%, CLUSTER_SIZE		;  WEIGHT and CLUSTER_SIZE
	ldq	%IARG0%, PRECISION		; IARG0=PRECISION
	addq	%CS%, %IARG0%, %IARG0%		; IARG0=PRECISION+CLUSTER_SIZE
	lda	%IARG0%, -1(%IARG0%)		; IARG0=PRECISION+CLUSTER_SIZE-1
	; r0=(PRECISION+CLUSTER_SIZE-1)/CLUSTER_SIZE
	$call	OTS$DIV_L, args=<%IARG0%/q,%CS%/q>
	; r0=((PRECISION+CLUSTER_SIZE-1)/CLUSTER_SIZE)+1
	lda	r0, 1(r0)
	; CLUSTER_VECT_SIZE=((PRECISION+CLUSTER_SIZE-1)/CLUSTER_SIZE)+1
	mov	r0, %CVS%
	stq	r0, CLUSTER_VECT_SIZE
	; Allocate CLUSTER_VECT
	sll	r0, #BIGINT_SHIFTER, r0
	$call	DECC$MALLOC, args=r0/l, scratch_regs=<r22,r23>
	stq	r0, CLUSTER_VECT
CCA	= "r0"					; r0 is current cluster addr
	; Initialize CLUSTER_VECT
	mov	%CVS%, r1			; r1=CLUSTER_VECT_SIZE
100$:	stq	r31, (%CCA%)			; *CCA = 0
	lda	r1, -1(r1)			; --r1
	lda	r0, BIGINT_BYTES(R0)		; CCA++
	bgt	r1, 100$			; Branch if not done	
	$call	DECC$FOPEN, args=<20$/a, 30$/a>	; Open output file
	stq	r0, FILE_VAR
	$call	DECC$GFPRINTF, args=<r0/q, 40$/a, PRECISION/l>, 	-
	    scratch_regs=<r22,r23>
	$return
	$end_routine INITIALIZE


	$routine MULTIPLY_AND_DIVIDE, kind=stack, 			-
	    saved_regs=<r2,r3,r4,r5,r6,r7,r8,r9,r10>,			-
	    data_section_pointer=true,					-
	    local=true
;++
; Abstract: Walk the CLUSTER_VECTOR array performing a series of
;            multiplications and divisions.
; Inputs:
;   IARG0 - NUMERATOR
;   IARG1 - DENOMINATOR
;   ISTOP - Indicator of how much to compute
;   WEIGHT
;   CLUSTER_VECT
;   CLUSTER_VECT[0..ISTOP]
; Outputs:
;   CLUSTER_VECT[0..ISTOP]
;--
	.base	r27, $ls			; 1st access link sect w/r27
	mov	r27, r2				; Later use r2 for link sect 
	ldq	r3, $dp				; Access...
	.base	r3, $ds				; ...data section
	.base	r2, $ls				; Now use r2 for link sect
TEMP	= "r4"					; r4 is variable 'temp'
I	= "r5"					; r5 is variable 'i'
W	= "r6"					; r6 is local WEIGHT
CCA	= "r7"					; r7 is curr cluster addr
NUM	= "r8"					; r8 is local copy of NUM
DEN	= "r9"					; r9 is local copy of DEN
CC	= "r10"					; r10 is curr cluster
	clr	%TEMP%				; TEMP=0
	ldq	%I%, ISTOP			; I=ISTOP
	lda	%I%, 1(%I%)			; I=ISTOP+1
	ldq	%W%, WEIGHT			; Load local copy of WEIGHT
	ldq	%CCA%, CLUSTER_VECT		; CCA = &CLUSTER_VECT[0]
	mov	%IARG0%, %NUM%			; Get local copy of NUM
	mov	%IARG1%, %DEN%			; Get local copy of DEN
	.macro LOOP_BODY			; Unroll loop
	    mulq	%TEMP%, %W%, %TEMP%	; TEMP *= WEIGHT
	    ldq		%CC%, (%CCA%)		; CC = *CCA
	    mulq	%NUM%, %CC%, r22	; r22 = CC * NUM
	    addq	%TEMP%, r22, %TEMP%	; TEMP += CC * NUM
	    $call	OTS$DIV_L, -		; r0=TEMP/DEN
		args=<%TEMP%/q, %DEN%/q>
	    stq		r0, (%CCA%)		; *CCA=TEMP/DEN
	    mulq	%DEN%, r0, r0		; r0=DEN * (*CCA)
	    subq	%TEMP%, r0, %TEMP%	; TEMP -= DEN * (*CCA)
	    lda		%CCA%, BIGINT_BYTES(%CCA%); Advance to next cluster
	    lda		%I%, -1(%I%)		; Decrement loop index
	.endm LOOP_BODY
	UNROLL BODY=LOOP_BODY, FACTOR=8, TEST_FIRST=1, 			-
	    CONTINUE = <bne %I%, %%LOOP_START%%>,			-
	    EXIT     = <beq %I%, %%LOOP_END%%>
	$return
	$end_routine MULTIPLY_AND_DIVIDE


    ;++
    ; Macro PROPAGATE_CARRY
    ; Abstract:  Walk the CLUSTER_VECTOR array in reverse order propagating
    ;            a carry out from one cluster to its predecessor as necessary.
    ;            (A manually inlined routine.)
    ; Inputs:
    ;	%CA% - register that holds CLUSTER_VECT
    ;	WEIGHT
    ;	CLUSTER_VECT[0..last]
    ; Outputs:
    ;	CLUSTER_VECT[0..last]
    ;--
	.macro PROPAGATE_CARRY
	    CCA 	= "r0"			; r0 is addr of current cluster
	    CC		= "r1"			; r1 is current cluster
	    CARRY	= "r22"			; r22 is 'CARRY' variable
	    W		= "r23"			; r23 is local WEIGHT
	    clr		%CARRY%			; CARRY=0
	    ldq		%W%, WEIGHT		; load local copy of WEIGHT
	    ldq		%CCA%, CLUSTER_VECT_SIZE; CCA=sizeof(CLUSTER_VECT)
	    lda		%CCA%, -1(%CCA%)	; CCA=sizeof(CLUSTER_VECT)-1
	    ; CCA=(sizeof(CLUSTER_VECT)-1)*sizeof(CLUSTER_VECT[0])
	    sll		%CCA%, #BIGINT_SHIFTER, %CCA%
	    addq	%CA%, %CCA%, %CCA%	; CCA --> last cluster
	    .macro LOOP1_BODY			; Unroll loop 1
		ldq	%CC%, (%CCA%)		; r22 = *CCA
		addq	%CC%, %CARRY%, %CC%	; r22 = *CCA + CARRY
		clr	%CARRY%			; CARRY=0
		.macro LOOP2_BODY 		; Unroll loop 2
		    subq	%CC%, %W%, %CC% ; CC -= WEIGHT
		    lda		%CARRY%, 1(%CARRY%); CARRY++
		.endm LOOP2_BODY
		UNROLL BODY=LOOP2_BODY, FACTOR=8, TEST_FIRST=1, 	-
		    COMPARE  = <cmplt %CC%, %W%, r16>,			-
		    CONTINUE = <beq   r16, %%LOOP_START%%>,		-
		    EXIT     = <bne   r16, %%LOOP_END%%>
		stq	%CC%, (%CCA%)		; Save current cluster
		lda	%CCA%, -BIGINT_BYTES(%CCA%) ; Backup 1 cluster
	    .endm LOOP1_BODY
	    UNROLL BODY=LOOP1_BODY, FACTOR=2, TEST_FIRST=1, 		-
		COMPARE  = <cmplt  %CCA%, %CA%, r16>,			-
		CONTINUE = <beq    r16, %%LOOP_START%%>,		-
		EXIT     = <bne    r16, %%LOOP_END%%>
	.endm PROPAGATE_CARRY


	$routine COMPUTE, kind=stack, 					-
	    saved_regs=<r2,r3,r4,r5,r6,r7,r8,r9,r10,f2,f3,f4>, 		-
	    local=true,							-
	    data_section_pointer=true,					-
	    size=$rsa_end+16
;++
; Abstract: Build up a binary representation of PI in the CLUSTER_VECT array.
;
; Inputs:
;   PRECISION
;   CLUSTER_SIZE
;   IMAX
;   LOG_4
;   CLUSTER_VECT[0]
;
; Outputs:
;   CLUSTER_VECT[0..last]
;
; Notes:
;   $ROUTINE defines the assembly-time variable $RSA_END as the offset 
;   from the beginning of the stack frame to just beyond the end of the 
;   the register save area.  You can use this variable both to define 
;   the stack size with $ROUTINE's SIZE argument as above, and in 
;   referencing the stack beyond the register save area.  In this case,
;   we have allocated an additional 16 bytes of stack storage beyond the
;   end of the register-save area.  Note that the size of the stack frame
;   must be an even multiple of 16.
;--
	$linkage_section
I	= "r4"					; r4 is variable 'I'
CS	= "r5"					; r5 is local CLUSTER_SIZE
TMP	= "r6"					; r6 is variable 'TMP'
CA	= "r7"					; r7 is local &CLUSTER_VECT[0]
PCC	= "r8"					; r8 is variable 'PCC'
I2M1	= "r9"					; r9 is 2*I-1
I4	= "r10"					; r10 is 4*I and 4*I+2
I.G	= "f2"					; f2 is variable 'I.G', G float 
						;  shadow of I
L4	= "f3"					; f3 is local LOG_4
ONE.G	= "f4"					; f4 holds 1.0 in G floating
	$code_section
	.base	r27, $ls			; 1st access link sect w/r27
	mov	r27, r2				; Later use r2 for link sect 
	ldq	r3, $dp				; Access...
	.base	r3, $ds				; ...data section
	.base	r2, $ls				; Now use r2 for link sect
	ldq	%I%, IMAX			; I=IMAX
	ldt	%I.G%, IMAX			; I.G shadows I
	cvtqg	%I.G%, %I.G%			; Convert to G format
	ldg	%L4%, LOG_4			; Load local LOG_4
	ldq	%CS%, CLUSTER_SIZE		; Load local CLUSTER_SIZE
	ldq	%TMP%, PRECISION		; TMP=PRECISION
	mov	%TMP%, %PCC%			; PCC=PRECISION
	lda	%TMP%, 2(%TMP%)			; TMP=PRECISION+2
	addq	%CS%, %TMP%, %TMP%		; TMP=CLUSTER_SIZE+PRECISION+2
	ldq	%CA%, CLUSTER_VECT		; CA = &CLUSTER_VECT[0]
	addq	%PCC%, %CS%, %PCC%		; PCC=PRECISION+CLUSTER_SIZE
	lda	%PCC%, -1(%PCC%)		; PCC=PRECISION+CLUSTER_SIZE-1
	; r0=(PRECISION+CLUSTER_SIZE-1)/CLUSTER_SIZE
	$call	OTS$DIV_L, args=<%PCC%/q,%CS%/q>
	; PCC=(PRECISION+CLUSTER_SIZE-1)/CLUSTER_SIZE
	mov	r0, %PCC%
	ldg	%ONE.G%, G1			; Load ONE.G with 1.0
	.macro	LOOP_BODY ?LABEL		; Unroll loop
	    mulg	%I.G%, %L4%, f0		; f0=I*LOG_4
	    cvtgq/c	f0, f0			; Convert to integer

	    ;++
	    ; We must store F0 to memory in order to load it into an 
	    ; integer register.  Our invocation of the $ROUTINE macro
	    ; above allocated an additional 16 bytes of stack storage,
	    ; 8 of which we will use now for that purpose.  We can 
	    ; reference the stack beyond the end of the register save 
	    ; area relative to the frame pointer, FP, using the 
	    ; $RSA_END assembly-time variable that is defined by the 
	    ; $ROUTINE macro.
	    ;--
	    stt		f0, $rsa_end(fp)	; Put to memory
	    ldq		%IARG0%, $rsa_end(fp)	; Read back to I reg
	    ; IARG0=(PRECISION-I*LOG_4+2+CLUSTER_SIZE)
	    subq	%TMP%, %IARG0%, %IARG0%
	    $call	OTS$DIV_L, args=<%IARG0%/q, %CS%/q>
	    cmple	r0, %PCC%, r1		; result <= PCC?
	    bne		r1, LABEL    		; Yes, branch
	    mov		%PCC%, r0		; No, use PCC
LABEL:	    stq		r0, ISTOP		; ISTOP=min(result,PCC)
	    sll		%I%, #1, %I2M1%		; I2M1 = 2*I
	    lda		%I2M1%, -1(%I2M1%)	; I2M1 = 2*I-1
	    sll		%I%, #2, %I4%		; I4 = 4*I
	    $call	MULTIPLY_AND_DIVIDE, args=<%I2M1%/q, %I4%/q>, -
		local=true
	    lda		%I4%, 2(%I4%)		; I4=4*I+2
	    $call	MULTIPLY_AND_DIVIDE, args=<%I2M1%/q, %I4%/q>, -
		local=true
	    ldq		r0, (%CA%)		; r0=CLUSTER_VECT[0]
	    lda		r0, 3(R0)		; r0=CLUSTER_VECT[0] + 3
	    stq		r0, (%CA%)		; CLUSTER_VECT[0] += 3
	    lda		%I%, -1(%I%)		; I--
	    subg	%I.G%, %ONE.G%, %I.G%	; I.G shadows I
	.endm LOOP_BODY
	UNROLL BODY=LOOP_BODY, FACTOR=4, TEST_FIRST=1, 			-
	    CONTINUE = <bne %I%, %%LOOP_START%%>,			-
	    EXIT     = <beq %I%, %%LOOP_END%%>

	PROPAGATE_CARRY				; Inlined routine call

	$return
	$end_routine COMPUTE


	$routine PRINT_RESULT, kind=stack, 				-
	    saved_regs=<r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,f2>,-
	    local=true,							-
	    data_section_pointer=true,					-
	    size=$rsa_end+16
;++
; Abstract:  Print the results of the computation.
; Inputs:
;   START  - seconds at start of computation
;   FINISH - seconds at end of computation
;   FILE_VAR
;   PRECISION
;   WEIGHT
;   CLUSTER_SIZE
;   CLUSTER_VECT_SIZE
;   CLUSTER_VECT[0..last]
;   PRINT_WIDTH
;--
	$linkage_section
SECONDS		= "f2"				; f2 is variable 'SECONDS' 
MINUTES		= "r4"				; r4 is variable 'MINUTES'
HOURS		= "r5"				; r5 is variable 'HOURS'
I		= "r6"				; r6 is variable 'I' 
CCA		= "r7"				; r7 is variable 'CCA' 
CC		= "r8"				; r8 is variable 'CC'
CLIM		= "r9"				; r9 is variable 'CLIM'
J		= "r10"				; r10 is variable 'J'
CS		= "r11"				; r11 is local CLUSTER_SIZE
W		= "r12"				; r12 is local WEIGHT
COUNT		= "r13"				; r13 is variable COUNT
P		= "r14"				; r14 is local PRECISION
10$:	.asciz	-
"        Calculation time :  %-d hours %-d  minutes %f seconds \x0A \x0A"
20$:	.asciz	"%-d."
30$:	.asciz  "%-d"
40$:	.asciz	"\x0A  "
50$:	.asciz	"\x0A"


	$code_section
	.base	r27, $ls			; 1st access link sect w/r27
	mov	r27, r2				; Later use r2 for link sect 
	ldq	r3, $dp				; Access...
	.base	r3, $ds				; ...data section
	.base	r2, $ls				; Now use r2 for link sect
	ldq	r22, FINISH			; r22=FINISH
	ldq	r23, START			; r23=START
	subq	r22, r23, r23			; r23=FINISH-START
	stq	r23, $rsa_end(fp)		; Convert FINISH-START
	ldt	f0,  $rsa_end(fp)		; ...to...
	cvtqg	f0, f0				; ...G floating
	ldg	f1, CLK_TCK			; f1=CLK_TCK
	divg	f0, f1, %SECONDS%		; SECONDS=(FINISH-START)/CLK_TCK
	cvtgq/c	%SECONDS%, f0			; Convert...
	stt	f0,  $rsa_end(fp)		; ...to integer
	$call	OTS$DIV_L, args=<$rsa_end(fp)/q, 60/a>	; r0=SECONDS/60
	mov	r0, %MINUTES%			; MINUTES=SECONDS/60
	mulq	r0, #60, r0			; r0=MINUTES*60
	stq	r0, $rsa_end(fp)		; Convert...
	ldt	f0, $rsa_end(fp)		; ...to...
	cvtqg	f0, f0				; G floating
	subg	%SECONDS%, f0, %SECONDS%	; SECONDS -= MINUTES*60
	$call	OTS$DIV_L, -			; r0=MINUTES/60
	    args=<%MINUTES%/q, 60/a>
	mov	r0, %HOURS%			; HOURS=MINUTES/60
	mulq	%HOURS%, #60, r0		; r0=HOURS*60
	subq	%MINUTES%, r0, %MINUTES%	; MINUTES -= HOURS*60
	$call	DECC$GFPRINTF, -
	    args=<FILE_VAR/q, 10$/a, %HOURS%/l, %MINUTES%/l, %SECONDS%/g>
	ldq	%CCA%, CLUSTER_VECT		; CCA = &CLUSTER_VECT[0]
	ldq	%CLIM%, CLUSTER_VECT_SIZE	; CLIM=sizeof(CLUSTER_VECT)
	; CLIM=sizeof(CLUSTER_VECT)*sizeof(CLUSTER_VECT[0])
	sll	%CLIM%, #BIGINT_SHIFTER, %CLIM%	
	addq	%CCA%, %CLIM%, %CLIM%		; CLIM --> beyond last cluster
	ldq	%CC%, (%CCA%)			; CC = CLUSTER_VECT[0]
	$call	DECC$GFPRINTF, -
	    args=<FILE_VAR/q, 20$/a, %CC%/l>
	mov	1, %I%				; I=1
	ldq	%CS%, CLUSTER_SIZE		; Load local cluster size
	lda	%CCA%, BIGINT_BYTES(%CCA%)	; Advance to next cluster
	cmplt	%CCA%, %CLIM%, r0		; Beyond last cluster?
	beq	r0, 400$			; Yes, branch to exit loop 1
	ldq	%W%, WEIGHT			; Load local copy of WEIGHT
	ldq	%P%, PRECISION			; Load local copy of PRECISION
	clr	%COUNT%				; COUNT=0
100$:	ldq	%CC%, (%CCA%)			; CC = *CCA
	mov	1, %J%				; J=1
	cmple	%J%, %CS%, r1			; J<=CLUSTER_SIZE?
	beq	r1, 300$			; No, branch to exit loop 2
200$:	mulq	%CC%, #10, %CC%			; CC *= 10
	$call	OTS$DIV_L, args=<%CC%/q, %W%/q>	; r0=CC/WEIGHT
	mov	r0, %IARG2%			; IARG2=CC/WEIGHT
	mulq	%W%, r0, r0			; r0=IARG2*WEIGHT
	subq	%CC%, r0, %CC%			; CC -= IARG2*WEIGHT
	cmple	%COUNT%, %P%, r1		; COUNT<=PRECISION?
	beq	r1, 210$			; No, branch
	sextl	%IARG2%, %IARG2%		; Convert to longword
	$call	DECC$GFPRINTF, -
	    args=<FILE_VAR/q, 30$/a, %IARG2%/l>
210$:	lda	%COUNT%, 1(%COUNT%)		; COUNT++
	cmplt	%I%, #PRINT_WIDTH, r0		; I>=PRINT_WIDTH?
	bne	r0, 220$			; No, branch
	clr	%I%				; I=0
	$call	DECC$GFPRINTF, -		; Print linefeed + 2 spaces
	    args=<FILE_VAR/q, 40$/a>
220$:	lda	%I%, 1(%I%)			; I++
	lda	%J%, 1(%J%)			; J++
	cmple	%J%, %CS%, r0			; J<=CLUSTER_SIZE?
	bne	r0, 200$			; Yes, branch to continue loop 2
300$:	lda	%CCA%, BIGINT_BYTES(%CCA%)	; Advance to next cluster
	cmplt	%CCA%, %CLIM%, r0		; Beyond last cluster?
	bne	r0, 100$			; No, branch to continue loop 1
	$call	DECC$GFPRINTF, -		; Print linefeed 
	    args=<FILE_VAR/q, 50$/a>
400$:	$return
	$end_routine PRINT_RESULT


	$routine CLEANUP, kind=stack,					-
	    local=true,							-
	    data_section_pointer=true
;++
; Abstract: Perform shutdown trivia.
; Inputs:  FILE_VAR
; Outputs: The output file is closed.
;--
	.base	r27, $ls			; Access linkage section
	ldq	r22, $dp			; Access...
	.base	r22, $ds			; ...data section
	$call	DECC$FCLOSE, args=FILE_VAR/q
	$return
	$end_routine CLEANUP


;+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
;                            EXTERNAL ROUTINES
;---------------------------------------------------------------------------

	$routine PI, kind=stack, saved_regs=<r2,r3>,			-
	    data_section_pointer=true
;++
; Abstract: Program entry point.  
; Outputs:
;   PI.DAT with the specified number of decimal places.
;--
	.base	r27, $ls			; 1st access link sect w/r27
	mov	r27, r2				; Later use r2 for link sect 
	ldq	r3, $dp				; Access...
	.base	r3, $ds				; ...data section
	.base	r2, $ls				; Now use r2 for link sect
	$call	ASK_NUMBER_OF_DIGITS, local=true
	$call	INITIALIZE, local=true
	$call	DECC$CLOCK
	stq	r0, START
	$call	COMPUTE, local=true
	$call	DECC$CLOCK
	stq	r0, FINISH
	$call	PRINT_RESULT, local=true
	$call	CLEANUP, local=true
	mov	1, r0				; Return success status
	$return
	$end_routine PI

	.end PI