\
\  Some routines for checking the efficiency of SIMD/SWAR instructions
\  in a vector-oriented virtual machine architecture
\  (see VECTORVM.DOC, VECTORVM.PS for details)
\
\  Using P5 233(MMX), SMAL32 Forth System v97.4.30
\
\  (C) 1998 Peter Sobolev
\

\ array0: first source matrix, 4 rows x 4 columns, stored row by row.
\ Elements are laid down with `w,` and read back elsewhere in this file as
\ 2-byte words (ROW copies 8 bytes per row, COL steps 2 bytes per column).
\ Contents are simply the values 0..15 in order.
CREATE array0
               0  w, 1  w, 2  w, 3  w,
               4  w, 5  w, 6  w, 7  w,     
               8  w, 9  w, 10 w, 11 w,
               12 w, 13 w, 14 w, 15 w,

\ array1: second source matrix, 4 rows x 4 columns, stored row by row
\ with the same 2-byte element layout as array0.
\ MATRIX* reads this matrix column by column (via COL).
CREATE array1
               0  w, 1  w, 2  w, 3  w,
               1  w, 1  w, 1  w, 1  w,
               0  w, 1  w, 2  w, 3  w,
               2  w, 2  w, 2  w, 2  w,

\ arraydst: destination matrix for MATRIX*.
\ 32 bytes are reserved (16 elements of 2 bytes each) and cleared to zero
\ when this file is loaded. The size is a literal because #N / #Nb are only
\ defined further down.
CREATE arraydst
	       32 ALLOT arraydst 32 0 FILL

\ Working vectors, 8 bytes each (four 2-byte elements):
\   vec0   - filled by ROW, left operand of the vector multiply
\   vec1   - filled by COL, right operand of the vector multiply
\   vecdst - element-wise product written by VECTOR* / mmxVECTOR* / nommxVECTOR*
\ The buffers are not initialised here; ROW and COL fill vec0/vec1 before use.
\ NOTE(review): the CODE words access each buffer with a single 8-byte MOVQ;
\ nothing here forces 8-byte alignment - confirm if alignment matters for timing.
CREATE vec0 8 ALLOT
CREATE vec1 8 ALLOT
CREATE vecdst 8 ALLOT

\ Matrix dimension and the derived row size.
\ #Nb is computed from #N (2 bytes per element) instead of being a second,
\ independent literal, so the two values cannot drift apart.
\ NOTE: the data tables, the 8-byte vec buffers and the CODE words in this
\ file are still hard-wired to 4 elements, so #N cannot be changed on its own.
4     VALUE #N     \ src, dst matrices' dimensions ( 4x4 )
#N 2* VALUE #Nb    \ length of one matrix row in bytes ( 2 bytes per element )

\ NOTE(review): presumably makes the buffer names visible to the assembler
\ used inside CODE ... ENDCODE (they are referenced there via `offset`) - confirm.
>PUBLIC vec0 vec1 vecdst

\ ROW ( row addr --> )
\ Copy one row of the matrix at `addr` into vec0.
\   row  - zero-based row number
\   addr - start address of the matrix
\ A row occupies #Nb bytes, so the source address is addr + row * #Nb.
: ROW ( row addr --> )                         \ result -> vec0
   SWAP #Nb * +                                \ start of the requested row
   vec0 #Nb CMOVE                              \ copy the whole row into vec0
;

\ COL ( col addr --> )
\ Gather one column of the matrix at `addr` into vec1.
\   col  - zero-based column number
\   addr - start address of the matrix
\ Elements are 2 bytes wide and consecutive rows are #Nb bytes apart.
: COL ( col addr --> )                         \ result -> vec1
   SWAP 2* +                                   \ address of element [0][col]
   #N 0 DO
      DUP I #Nb * + W@                         \ fetch element [I][col]
      I 2* vec1 + W!                           \ store it as vec1[I]
   LOOP
   DROP                                        \ discard the column base address
;

\ GETWORD ( addr n --> word )
\ Fetch element number n (zero-based) from the array of 2-byte words at addr.
: GETWORD ( addr n --> word )
   2* +                                        \ byte address of element n
   W@                                          \ 16-bit fetch
;

\ PUTWORD ( a addr n --> )
\ Store the value `a` as element number n (zero-based) of the array of
\ 2-byte words at addr. Only the low 16 bits of `a` are written (W!).
\ The stack comment now lists the index n, which the code consumes and
\ every caller supplies (e.g. `vecdst I PUTWORD`).
: PUTWORD ( a addr n --> )
   2* +                                        \ byte address of element n
   W!                                          \ 16-bit store
;

\ VECTOR* ( --> )
\ High-level (pure Forth) element-wise multiply: vecdst[i] = vec0[i] * vec1[i]
\ for i = 0 .. #N-1. PUTWORD keeps only the low 16 bits of each product.
: VECTOR* ( --> )                              \ result (vec0*vec1) -> vecdst
   #N 0 DO
      vec0 I GETWORD                           \ left operand
      vec1 I GETWORD                           \ right operand
      *
      vecdst I PUTWORD                         \ store the product
   LOOP
;

\ VECTORSUM ( --> sum )
\ Return the sum of the #N elements vecdst[0] .. vecdst[#N-1].
: VECTORSUM ( --> sum )
   0                                           \ running total
   #N 0 DO
      vecdst I GETWORD +
   LOOP
;

\ mmxVECTOR* ( --> )
\ MMX version of VECTOR*: multiplies the four 16-bit elements of vec0 and
\ vec1 in one PMULLW and writes the four low 16-bit results to vecdst.
\ The MMX instructions are emitted as raw opcode bytes (db/dd), each
\ followed by the 32-bit address of its memory operand.
\ NOTE(review): no EMMS (0Fh, 77h) is executed after the MMX instructions.
\ That is harmless only as long as no x87 floating-point code runs
\ afterwards - confirm, or issue EMMS once after the benchmark loop.
CODE mmxVECTOR* ; ( --> )                      \ result (vec0*vec1) -> vecdst


        db	0Fh, 6Fh, 05h       ; 0F 6F /r = movq mm,m64 ; ModRM 05h = mm0,[disp32]
        dd      offset vec0         ; movq	mm0,vec0

        db	0Fh, 6fh, 0dh       ; movq	mm1,vec1  (ModRM 0Dh = mm1,[disp32])
        dd	offset vec1

       	db	0fh, 0d5h, 0c1h     ; pmullw	mm0,mm1  (low 16 bits of each of the 4 products)

	db      0Fh, 7Fh, 05h       ; movq	vecdst,mm0  (store all 4 results at once)
        dd	offset vecdst

        next
ENDCODE

\ nommxVECTOR* ( --> )
\ Scalar x86 version of VECTOR*: vecdst[i] = low 16 bits of vec0[i] * vec1[i],
\ processed from the last element down to the first.
\ The element count 4 is a literal here; it is not taken from #N.
\ NOTE(review): EAX, EBX, EDX (written by MUL) and EDI are modified and not
\ restored - confirm these are free scratch registers inside SMAL32 CODE words.
CODE nommxVECTOR* ; ( --> )                      \ result (vec0*vec1) -> vecdst

        mov     edi,4                           ; edi = elements left, counts 4..1
again:        
        movzx   eax,word ptr [vec0+edi*2-2]     ; eax = vec0[edi-1], zero-extended
        movzx   ebx,word ptr [vec1+edi*2-2]     ; ebx = vec1[edi-1], zero-extended
        mul     ebx                             ; edx:eax = eax * ebx

        mov     word ptr [vecdst+edi*2-2],ax    ; keep only the low 16 bits
        dec     edi
        jnz	again                           ; loop until all 4 elements are done

        next
ENDCODE

\ emptyVECTOR* ( --> )
\ Does nothing. It has the same stack effect as the other *VECTOR* words,
\ presumably so it can be substituted in MATRIX* to measure loop overhead.
: emptyVECTOR* ( --> ) ;

VARIABLE D                                     \ index of the next arraydst element

\ MATRIX* ( --> )
\ Matrix product arraydst = array0 x array1, one output element at a time:
\ row J of array0 goes to vec0, column I of array1 goes to vec1, the two are
\ multiplied element-wise into vecdst and the sum of vecdst is the result.
\ Results are stored in row-major order using D as the running index;
\ D is reset to 0 on entry and advanced once per output element.
: MATRIX* ( --> )
   0 D !
   #N 0 DO                                     \ J: row of array0
      #N 0 DO                                  \ I: column of array1
         J array0 ROW                          \ vec0 <- row J
         I array1 COL                          \ vec1 <- column I
         VECTOR*                               \ change HERE! ( word under test )
         VECTORSUM                             \ dot product of row and column
         arraydst D @ PUTWORD                  \ arraydst[D] <- sum
         D 1+!                                 \ next output slot
      LOOP
   LOOP
;

\ main ( --> )
\ Benchmark driver: performs the 4x4 matrix multiply MATRIX* 1000000 times.
: main ( --> )
   1000000 0 DO
      MATRIX*
   LOOP
;

\ Build directives of the SMAL32 system.
\ NOTE(review): their exact semantics are not visible in this file;
\ presumably `compress off` disables image compression and `Build` writes
\ the program image .\vectorvm - confirm against the SMAL32 documentation.
\ The two commented-out lines below are left disabled as in the original.
compress off
\ NoErrors
\ NoTraps
Build .\vectorvm