aboutsummaryrefslogtreecommitdiff
path: root/openssl/trunk/crypto/sha/asm
diff options
context:
space:
mode:
Diffstat (limited to 'openssl/trunk/crypto/sha/asm')
-rw-r--r--openssl/trunk/crypto/sha/asm/README1
-rw-r--r--openssl/trunk/crypto/sha/asm/sha1-586.pl430
-rw-r--r--openssl/trunk/crypto/sha/asm/sha1-ia64.pl549
-rwxr-xr-xopenssl/trunk/crypto/sha/asm/sha512-ia64.pl432
-rw-r--r--openssl/trunk/crypto/sha/asm/sha512-sse2.pl404
5 files changed, 0 insertions, 1816 deletions
diff --git a/openssl/trunk/crypto/sha/asm/README b/openssl/trunk/crypto/sha/asm/README
deleted file mode 100644
index b7e75576..00000000
--- a/openssl/trunk/crypto/sha/asm/README
+++ /dev/null
@@ -1 +0,0 @@
-C2.pl works
diff --git a/openssl/trunk/crypto/sha/asm/sha1-586.pl b/openssl/trunk/crypto/sha/asm/sha1-586.pl
deleted file mode 100644
index 4f8521f1..00000000
--- a/openssl/trunk/crypto/sha/asm/sha1-586.pl
+++ /dev/null
@@ -1,430 +0,0 @@
-#!/usr/local/bin/perl
-
-# It was noted that Intel IA-32 C compiler generates code which
-# performs ~30% *faster* on P4 CPU than original *hand-coded*
-# SHA1 assembler implementation. To address this problem (and
-# prove that humans are still better than machines:-), the
-# original code was overhauled, which resulted in following
-# performance changes:
-#
-# compared with original compared with Intel cc
-# assembler impl. generated code
-# Pentium -16% +48%
-# PIII/AMD +8% +16%
-# P4 +85%(!) +45%
-#
-# As you can see Pentium came out as loser:-( Yet I reckoned that
-# improvement on P4 outweighs the loss and incorporated this
-# re-tuned code into 0.9.7 and later.
-# ----------------------------------------------------------------
-# Those who for any particular reason absolutely must score on
-# Pentium can replace this module with one from 0.9.6 distribution.
-# This "offer" shall be revoked the moment programming interface to
-# this module is changed, in which case this paragraph should be
-# removed.
-# ----------------------------------------------------------------
-# <appro@fy.chalmers.se>
-
-$normal=0; # when false, sha1_block_data emits the "start" label that its block loop branches back to
-
-push(@INC,"perlasm","../../perlasm");
-require "x86asm.pl"; # presumably the source of &mov, &DWP, &swtmp, &wparam etc. used below -- not visible here
-
-&asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386"); # third argument is true when the last command-line argument is "386"
-
-$A="eax"; # $A..$E name the registers that carry the five SHA-1 working variables
-$B="ecx";
-$C="ebx";
-$D="edx";
-$E="edi";
-$T="esi"; # sixth register: receives a round's result, so the register roles rotate from round to round
-$tmp1="ebp"; # scratch register used inside the round generators
-
-$off=9*4; # NOTE(review): not referenced anywhere in the visible code
-
-@K=(0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6); # additive round constants: $K[0] rounds 0-19, $K[1] 20-39, $K[2] 40-59, $K[3] 60-79
-
-&sha1_block_data("sha1_block_asm_data_order"); # also emits sha1_block_asm_host_order as its last step
-
-&asm_finish();
-
-sub Nn
- {
- # Look up the register that sits one step earlier in the
- # A,B,C,D,E,T cycle: A->T, B->A, C->B, D->C, E->D, T->E.
- # Returns undef for a name that is not one of the six registers.
- my($reg)=@_;
- my %earlier=(
-  $A => $T,
-  $B => $A,
-  $C => $B,
-  $D => $C,
-  $E => $D,
-  $T => $E,
- );
- return $earlier{$reg};
- }
-
-sub Np
- {
- # Return the register that follows $p in the A,B,C,D,E,T cycle:
- # A->B, B->C, C->D, D->E, E->T, T->A (the inverse of Nn).
- # Returns undef for a name that is not one of the six registers.
- #
- # The table used to be declared twice; the first declaration was a
- # copy of Nn's mapping that the second one immediately replaced, so
- # only the effective table is kept.
- local($p)=@_;
- local(%n)=($A,$B,$B,$C,$C,$D,$D,$E,$E,$T,$T,$A);
- return($n{$p});
- }
-
-sub Na
- {
- # For round number $n, return the five 16-entry ring-buffer indices
- # n, n+2, n+8, n+13 and n+1, each reduced modulo 16.
- my($round)=@_;
- my $mask=0x0f;
- return( ($round)&$mask, ($round+2)&$mask, ($round+8)&$mask, ($round+13)&$mask, ($round+1)&$mask );
- }
-
-sub X_expand # emit code that copies the 16 input words at [$in] into stack slots swtmp(0..15), byte-swapping each one
- {
- local($in)=@_; # $in: name of the register that holds the input pointer
-
- &comment("First, load the words onto the stack in network byte order");
- for ($i=0; $i<16; $i+=2) # two words per iteration, using $A and $B as temporaries; $i is not localized
- {
- &mov($A,&DWP(($i+0)*4,$in,"",0));# word i (a former "unless $i == 0" guard is commented out)
- &mov($B,&DWP(($i+1)*4,$in,"",0)); # word i+1
- &bswap($A);
- &bswap($B);
- &mov(&swtmp($i+0),$A);
- &mov(&swtmp($i+1),$B);
- }
-
- &comment("We now have the X array on the stack");
- &comment("starting at sp-4");
- }
-
-# Rules of engagement
-# F is always trashable at the start, the running total.
-# E becomes the next F so it can be trashed after it has been 'accumulated'
-# F becomes A in the next round. We don't need to access it much.
-# During the X update part, the result ends up in $X[$n0].
-
-sub BODY_00_15 # emit one of rounds 0..15: F_00_19(b,c,d)+ROTATE(a,5)+e+K+X[n], with X[n] read straight from the stack
- {
- local($pos,$K,$X,$n,$a,$b,$c,$d,$e,$f)=@_; # $pos and $X are accepted but not used in this body
-
- &comment("00_15 $n");
-
- &mov($f,$c); # f to hold F_00_19(b,c,d)
- if ($n==0) { &mov($tmp1,$a); } # first round: seed tmp1 with a
- else { &mov($a,$tmp1); } # later rounds: the previous round left its result in tmp1, which is this round's a
- &rotl($tmp1,5); # tmp1=ROTATE(a,5)
- &xor($f,$d);
- &and($f,$b);
- &add($tmp1,$e); # tmp1+=e;
- &mov($e,&swtmp($n)); # e becomes volatile and
- # is loaded with xi
- &xor($f,$d); # f holds F_00_19(b,c,d), computed as ((c^d)&b)^d
- &rotr($b,2); # b=ROTATE(b,30)
- &lea($tmp1,&DWP($K,$tmp1,$e,1));# tmp1+=K_00_19+xi
-
- if ($n==15) { &add($f,$tmp1); } # f+=tmp1: the last round of this group leaves the result in f
- else { &add($tmp1,$f); } # otherwise the result stays in tmp1 for the next round to pick up
- }
-
-sub BODY_16_19 # emit one of rounds 16..19: F_00_19 combined with the in-place update of stack slot X[n mod 16]
- {
- local($pos,$K,$X,$n,$a,$b,$c,$d,$e,$f)=@_; # $pos and $X are accepted but not used in this body
- local($n0,$n1,$n2,$n3,$np)=&Na($n); # slots n, n+2, n+8, n+13 (mod 16); $np (n+1) is not used here
-
- &comment("16_19 $n");
-
- &mov($f,&swtmp($n1)); # f to hold Xupdate(xi,xa,xb,xc,xd)
- &mov($tmp1,$c); # tmp1 to hold F_00_19(b,c,d)
- &xor($f,&swtmp($n0));
- &xor($tmp1,$d);
- &xor($f,&swtmp($n2));
- &and($tmp1,$b); # tmp1 holds (c^d)&b, one xor short of F_00_19(b,c,d)
- &rotr($b,2); # b=ROTATE(b,30)
- &xor($f,&swtmp($n3)); # f holds xa^xb^xc^xd
- &rotl($f,1); # f=ROTATE(f,1)
- &xor($tmp1,$d); # tmp1=F_00_19(b,c,d)
- &mov(&swtmp($n0),$f); # xi=f
- &lea($f,&DWP($K,$f,$e,1)); # f+=K_00_19+e
- &mov($e,$a); # e becomes volatile
- &rotl($e,5); # e=ROTATE(a,5)
- &add($f,$tmp1); # f+=F_00_19(b,c,d)
- &add($f,$e); # f+=ROTATE(a,5)
- }
-
-sub BODY_20_39 # emit one round using F(b,c,d)=b^c^d plus the in-place update of stack slot X[n mod 16]; also reused for rounds 60..79
- {
- local($pos,$K,$X,$n,$a,$b,$c,$d,$e,$f)=@_; # $pos and $X are accepted but not used in this body
-
- &comment("20_39 $n");
- local($n0,$n1,$n2,$n3,$np)=&Na($n); # slots n, n+2, n+8, n+13 (mod 16); $np (n+1) is not used here
-
- &mov($tmp1,$b); # tmp1 to hold F_20_39(b,c,d)
- &mov($f,&swtmp($n0)); # f to hold Xupdate(xi,xa,xb,xc,xd)
- &rotr($b,2); # b=ROTATE(b,30)
- &xor($f,&swtmp($n1));
- &xor($tmp1,$c);
- &xor($f,&swtmp($n2));
- &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d)
- &xor($f,&swtmp($n3)); # f holds xa^xb^xc^xd
- &rotl($f,1); # f=ROTATE(f,1)
- &add($tmp1,$e); # tmp1=F_20_39(b,c,d)+e
- &mov(&swtmp($n0),$f); # xi=f
- &mov($e,$a); # e becomes volatile
- &rotl($e,5); # e=ROTATE(a,5)
- &lea($f,&DWP($K,$f,$tmp1,1)); # f+=K+tmp1, i.e. f+=K+F_20_39(b,c,d)+e
- &add($f,$e); # f+=ROTATE(a,5)
- }
-
-sub BODY_40_59 # emit one of rounds 40..59: F_40_59 computed as ((b|c)&d)|(b&c), plus the in-place update of stack slot X[n mod 16]
- {
- local($pos,$K,$X,$n,$a,$b,$c,$d,$e,$f)=@_; # $pos and $X are accepted but not used in this body
-
- &comment("40_59 $n");
- local($n0,$n1,$n2,$n3,$np)=&Na($n); # slots n, n+2, n+8, n+13 (mod 16); $np (n+1) is not used here
-
- &mov($f,&swtmp($n0)); # f to hold Xupdate(xi,xa,xb,xc,xd)
- &mov($tmp1,&swtmp($n1)); # here each X word goes through tmp1 instead of being xor-ed from memory
- &xor($f,$tmp1);
- &mov($tmp1,&swtmp($n2));
- &xor($f,$tmp1);
- &mov($tmp1,&swtmp($n3));
- &xor($f,$tmp1); # f holds xa^xb^xc^xd
- &mov($tmp1,$b); # tmp1 to hold F_40_59(b,c,d)
- &rotl($f,1); # f=ROTATE(f,1)
- &or($tmp1,$c); # tmp1=b|c
- &mov(&swtmp($n0),$f); # xi=f
- &and($tmp1,$d); # tmp1=(b|c)&d
- &lea($f,&DWP($K,$f,$e,1)); # f+=K_40_59+e
- &mov($e,$b); # e becomes volatile and is used
- # to calculate F_40_59(b,c,d)
- &rotr($b,2); # b=ROTATE(b,30)
- &and($e,$c); # e=b&c, using the copy of b taken before the rotate
- &or($tmp1,$e); # tmp1 holds F_40_59(b,c,d)
- &mov($e,$a);
- &rotl($e,5); # e=ROTATE(a,5)
- &add($f,$tmp1); # f+=tmp1;
- &add($f,$e); # f+=ROTATE(a,5)
- }
-
-sub BODY_60_79
- {
- # Rounds 60..79 reuse the 20..39 generator unchanged; the caller
- # supplies the constant for this stage in the argument list.
- return &BODY_20_39(@_);
- }
-
-sub sha1_block_host # emit the host-order entry point: same prologue as sha1_block_data, copy 16 words to the stack without bswap, jump to $sclabel
- {
- local($name, $sclabel)=@_; # $name: function to emit; $sclabel: label inside the data-order function to jump to
-
- &function_begin_B($name,"");
-
- # parameter 1 is the SHA-1 state structure (not an MD5_CTX); word offsets:
- # A 0
- # B 4
- # C 8
- # D 12
- # E 16
-
- &mov("ecx", &wparam(2)); # third argument; scaled by 64 below, so presumably a count of 64-byte blocks
- &push("esi");
- &shl("ecx",6);
- &mov("esi", &wparam(1)); # second argument: input pointer
- &push("ebp");
- &add("ecx","esi"); # offset to leave on
- &push("ebx");
- &mov("ebp", &wparam(0)); # first argument: state pointer
- &push("edi");
- &mov($D, &DWP(12,"ebp","",0));
- &stack_push(18+9);
- &mov($E, &DWP(16,"ebp","",0));
- &mov($C, &DWP( 8,"ebp","",0));
- &mov(&swtmp(17),"ecx"); # end-of-input pointer, compared against esi by the shared loop
-
- &comment("First we need to setup the X array");
-
- for ($i=0; $i<16; $i+=2) # same copy as X_expand but without bswap
- {
- &mov($A,&DWP(($i+0)*4,"esi","",0));# word i (a former "unless $i == 0" guard is commented out)
- &mov($B,&DWP(($i+1)*4,"esi","",0));
- &mov(&swtmp($i+0),$A);
- &mov(&swtmp($i+1),$B);
- }
- &jmp($sclabel); # NOTE(review): the shared loop branches back to "start", which byte-swaps via X_expand, so blocks after the first would be read as data-order -- confirm callers pass a single block
- &function_end_B($name);
- }
-
-
-sub sha1_block_data # emit the data-order SHA-1 block function $name (80 unrolled rounds inside a per-block loop), then the host-order entry point
- {
- local($name)=@_;
-
- &function_begin_B($name,"");
-
- # parameter 1 is the SHA-1 state structure (not an MD5_CTX); word offsets:
- # A 0
- # B 4
- # C 8
- # D 12
- # E 16
-
- &mov("ecx", &wparam(2)); # third argument; scaled by 64 below, so presumably a count of 64-byte blocks
- &push("esi");
- &shl("ecx",6);
- &mov("esi", &wparam(1)); # second argument: input pointer
- &push("ebp");
- &add("ecx","esi"); # offset to leave on
- &push("ebx");
- &mov("ebp", &wparam(0)); # first argument: state pointer
- &push("edi");
- &mov($D, &DWP(12,"ebp","",0));
- &stack_push(18+9);
- &mov($E, &DWP(16,"ebp","",0));
- &mov($C, &DWP( 8,"ebp","",0));
- &mov(&swtmp(17),"ecx"); # end-of-input pointer, checked at the bottom of the loop
-
- &comment("First we need to setup the X array");
-
- &set_label("start") unless $normal; # loop head: one iteration per 64-byte block
-
- &X_expand("esi"); # byte-swap the block into swtmp(0..15)
- &mov(&wparam(1),"esi"); # keep the current input pointer in the argument slot; esi ($T) is reused by the rounds
-
- &set_label("shortcut", 0, 1); # entry point for the host-order function, which fills the X array itself
- &comment("");
- &comment("Start processing");
-
- # odd start
- &mov($A, &DWP( 0,"ebp","",0)); # A and B are loaded here because X_expand used them as temporaries
- &mov($B, &DWP( 4,"ebp","",0));
- $X="esp";
- &BODY_00_15(-2,$K[0],$X, 0,$A,$B,$C,$D,$E,$T); # the six register arguments shift by one position every round
- &BODY_00_15( 0,$K[0],$X, 1,$T,$A,$B,$C,$D,$E);
- &BODY_00_15( 0,$K[0],$X, 2,$E,$T,$A,$B,$C,$D);
- &BODY_00_15( 0,$K[0],$X, 3,$D,$E,$T,$A,$B,$C);
- &BODY_00_15( 0,$K[0],$X, 4,$C,$D,$E,$T,$A,$B);
- &BODY_00_15( 0,$K[0],$X, 5,$B,$C,$D,$E,$T,$A);
- &BODY_00_15( 0,$K[0],$X, 6,$A,$B,$C,$D,$E,$T);
- &BODY_00_15( 0,$K[0],$X, 7,$T,$A,$B,$C,$D,$E);
- &BODY_00_15( 0,$K[0],$X, 8,$E,$T,$A,$B,$C,$D);
- &BODY_00_15( 0,$K[0],$X, 9,$D,$E,$T,$A,$B,$C);
- &BODY_00_15( 0,$K[0],$X,10,$C,$D,$E,$T,$A,$B);
- &BODY_00_15( 0,$K[0],$X,11,$B,$C,$D,$E,$T,$A);
- &BODY_00_15( 0,$K[0],$X,12,$A,$B,$C,$D,$E,$T);
- &BODY_00_15( 0,$K[0],$X,13,$T,$A,$B,$C,$D,$E);
- &BODY_00_15( 0,$K[0],$X,14,$E,$T,$A,$B,$C,$D);
- &BODY_00_15( 1,$K[0],$X,15,$D,$E,$T,$A,$B,$C);
- &BODY_16_19(-1,$K[0],$X,16,$C,$D,$E,$T,$A,$B); # rounds 16-19 keep $K[0] but start updating X in place
- &BODY_16_19( 0,$K[0],$X,17,$B,$C,$D,$E,$T,$A);
- &BODY_16_19( 0,$K[0],$X,18,$A,$B,$C,$D,$E,$T);
- &BODY_16_19( 1,$K[0],$X,19,$T,$A,$B,$C,$D,$E);
-
- &BODY_20_39(-1,$K[1],$X,20,$E,$T,$A,$B,$C,$D);
- &BODY_20_39( 0,$K[1],$X,21,$D,$E,$T,$A,$B,$C);
- &BODY_20_39( 0,$K[1],$X,22,$C,$D,$E,$T,$A,$B);
- &BODY_20_39( 0,$K[1],$X,23,$B,$C,$D,$E,$T,$A);
- &BODY_20_39( 0,$K[1],$X,24,$A,$B,$C,$D,$E,$T);
- &BODY_20_39( 0,$K[1],$X,25,$T,$A,$B,$C,$D,$E);
- &BODY_20_39( 0,$K[1],$X,26,$E,$T,$A,$B,$C,$D);
- &BODY_20_39( 0,$K[1],$X,27,$D,$E,$T,$A,$B,$C);
- &BODY_20_39( 0,$K[1],$X,28,$C,$D,$E,$T,$A,$B);
- &BODY_20_39( 0,$K[1],$X,29,$B,$C,$D,$E,$T,$A);
- &BODY_20_39( 0,$K[1],$X,30,$A,$B,$C,$D,$E,$T);
- &BODY_20_39( 0,$K[1],$X,31,$T,$A,$B,$C,$D,$E);
- &BODY_20_39( 0,$K[1],$X,32,$E,$T,$A,$B,$C,$D);
- &BODY_20_39( 0,$K[1],$X,33,$D,$E,$T,$A,$B,$C);
- &BODY_20_39( 0,$K[1],$X,34,$C,$D,$E,$T,$A,$B);
- &BODY_20_39( 0,$K[1],$X,35,$B,$C,$D,$E,$T,$A);
- &BODY_20_39( 0,$K[1],$X,36,$A,$B,$C,$D,$E,$T);
- &BODY_20_39( 0,$K[1],$X,37,$T,$A,$B,$C,$D,$E);
- &BODY_20_39( 0,$K[1],$X,38,$E,$T,$A,$B,$C,$D);
- &BODY_20_39( 1,$K[1],$X,39,$D,$E,$T,$A,$B,$C);
-
- &BODY_40_59(-1,$K[2],$X,40,$C,$D,$E,$T,$A,$B);
- &BODY_40_59( 0,$K[2],$X,41,$B,$C,$D,$E,$T,$A);
- &BODY_40_59( 0,$K[2],$X,42,$A,$B,$C,$D,$E,$T);
- &BODY_40_59( 0,$K[2],$X,43,$T,$A,$B,$C,$D,$E);
- &BODY_40_59( 0,$K[2],$X,44,$E,$T,$A,$B,$C,$D);
- &BODY_40_59( 0,$K[2],$X,45,$D,$E,$T,$A,$B,$C);
- &BODY_40_59( 0,$K[2],$X,46,$C,$D,$E,$T,$A,$B);
- &BODY_40_59( 0,$K[2],$X,47,$B,$C,$D,$E,$T,$A);
- &BODY_40_59( 0,$K[2],$X,48,$A,$B,$C,$D,$E,$T);
- &BODY_40_59( 0,$K[2],$X,49,$T,$A,$B,$C,$D,$E);
- &BODY_40_59( 0,$K[2],$X,50,$E,$T,$A,$B,$C,$D);
- &BODY_40_59( 0,$K[2],$X,51,$D,$E,$T,$A,$B,$C);
- &BODY_40_59( 0,$K[2],$X,52,$C,$D,$E,$T,$A,$B);
- &BODY_40_59( 0,$K[2],$X,53,$B,$C,$D,$E,$T,$A);
- &BODY_40_59( 0,$K[2],$X,54,$A,$B,$C,$D,$E,$T);
- &BODY_40_59( 0,$K[2],$X,55,$T,$A,$B,$C,$D,$E);
- &BODY_40_59( 0,$K[2],$X,56,$E,$T,$A,$B,$C,$D);
- &BODY_40_59( 0,$K[2],$X,57,$D,$E,$T,$A,$B,$C);
- &BODY_40_59( 0,$K[2],$X,58,$C,$D,$E,$T,$A,$B);
- &BODY_40_59( 1,$K[2],$X,59,$B,$C,$D,$E,$T,$A);
-
- &BODY_60_79(-1,$K[3],$X,60,$A,$B,$C,$D,$E,$T);
- &BODY_60_79( 0,$K[3],$X,61,$T,$A,$B,$C,$D,$E);
- &BODY_60_79( 0,$K[3],$X,62,$E,$T,$A,$B,$C,$D);
- &BODY_60_79( 0,$K[3],$X,63,$D,$E,$T,$A,$B,$C);
- &BODY_60_79( 0,$K[3],$X,64,$C,$D,$E,$T,$A,$B);
- &BODY_60_79( 0,$K[3],$X,65,$B,$C,$D,$E,$T,$A);
- &BODY_60_79( 0,$K[3],$X,66,$A,$B,$C,$D,$E,$T);
- &BODY_60_79( 0,$K[3],$X,67,$T,$A,$B,$C,$D,$E);
- &BODY_60_79( 0,$K[3],$X,68,$E,$T,$A,$B,$C,$D);
- &BODY_60_79( 0,$K[3],$X,69,$D,$E,$T,$A,$B,$C);
- &BODY_60_79( 0,$K[3],$X,70,$C,$D,$E,$T,$A,$B);
- &BODY_60_79( 0,$K[3],$X,71,$B,$C,$D,$E,$T,$A);
- &BODY_60_79( 0,$K[3],$X,72,$A,$B,$C,$D,$E,$T);
- &BODY_60_79( 0,$K[3],$X,73,$T,$A,$B,$C,$D,$E);
- &BODY_60_79( 0,$K[3],$X,74,$E,$T,$A,$B,$C,$D);
- &BODY_60_79( 0,$K[3],$X,75,$D,$E,$T,$A,$B,$C);
- &BODY_60_79( 0,$K[3],$X,76,$C,$D,$E,$T,$A,$B);
- &BODY_60_79( 0,$K[3],$X,77,$B,$C,$D,$E,$T,$A);
- &BODY_60_79( 0,$K[3],$X,78,$A,$B,$C,$D,$E,$T);
- &BODY_60_79( 2,$K[3],$X,79,$T,$A,$B,$C,$D,$E);
-
- &comment("End processing");
- &comment("");
- # D is the tmp value
-
- # Where the new working variables sit after round 79 (register -> role):
- # E -> A
- # T -> B
- # A -> C
- # B -> D
- # C -> E
- # D -> T
-
- &mov($tmp1,&wparam(0)); # state pointer again; ebp was used as scratch during the rounds
-
- &mov($D, &DWP(12,$tmp1,"",0));
- &add($D,$B); # state D += new D
- &mov($B, &DWP( 4,$tmp1,"",0));
- &add($B,$T); # state B += new B
- &mov($T, $A); # park new C, because $A is about to be reloaded
- &mov($A, &DWP( 0,$tmp1,"",0));
- &mov(&DWP(12,$tmp1,"",0),$D);
-
- &add($A,$E); # state A += new A
- &mov($E, &DWP(16,$tmp1,"",0));
- &add($E,$C); # state E += new E
- &mov($C, &DWP( 8,$tmp1,"",0));
- &add($C,$T); # state C += new C
-
- &mov(&DWP( 0,$tmp1,"",0),$A);
- &mov("esi",&wparam(1)); # input pointer saved at the top of the loop
- &mov(&DWP( 8,$tmp1,"",0),$C);
- &add("esi",64); # advance to the next block
- &mov("eax",&swtmp(17)); # end-of-input pointer
- &mov(&DWP(16,$tmp1,"",0),$E);
- &cmp("esi","eax");
- &mov(&DWP( 4,$tmp1,"",0),$B);
- &jb(&label("start")); # unsigned compare: loop while input remains
-
- &stack_pop(18+9);
- &pop("edi");
- &pop("ebx");
- &pop("ebp");
- &pop("esi");
- &ret();
-
- # keep a note of shortcut label so it can be used outside
- # block.
- my $sclabel = &label("shortcut");
-
- &function_end_B($name);
- # Putting this here avoids problems with MASM in debugging mode
- &sha1_block_host("sha1_block_asm_host_order", $sclabel);
- }
-
diff --git a/openssl/trunk/crypto/sha/asm/sha1-ia64.pl b/openssl/trunk/crypto/sha/asm/sha1-ia64.pl
deleted file mode 100644
index cb9dfad1..00000000
--- a/openssl/trunk/crypto/sha/asm/sha1-ia64.pl
+++ /dev/null
@@ -1,549 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. Rights for redistribution and usage in source and binary
-# forms are granted according to the OpenSSL license.
-# ====================================================================
-#
-# Eternal question is what's wrong with compiler generated code? The
-# trick is that it's possible to reduce the number of shifts required
-# to perform rotations by maintaining copy of 32-bit value in upper
-# bits of 64-bit register. Just follow mux2 and shrp instructions...
-# Performance under big-endian OS such as HP-UX is 179MBps*1GHz, which
-# is >50% better than HP C and >2x better than gcc. As of this moment
-# performance under little-endian OS such as Linux and Windows will be
-# a bit lower, because data has to be picked in reverse byte-order.
-# It's possible to resolve this issue by implementing third function,
-# sha1_block_asm_data_order_aligned, which would temporarily flip
-# BE field in User Mask register...
-
-$code=<<___;
-.ident \"sha1-ia64.s, version 1.0\"
-.ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
-.explicit
-
-___
-
-
-# Pick the pointer-add instruction and the input byte order from the
-# host OS and the compiler flags passed on the command line.
-if ($^O eq "hpux") {
- $ADDP="addp4"; # HP-UX default; switched to plain "add" when a 64-bit flag is present
- # Match the literal flags +DD64 or -mlp64. The previous pattern used
- # square brackets, i.e. a character class, so any one of "+D|-mlp"
- # followed by "64" matched.
- for (@ARGV) { $ADDP="add" if (/(?:\+DD|-mlp)64/); }
-} else { $ADDP="add"; }
-for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
- $big_endian=0 if (/\-DL_ENDIAN/); }
-if (!defined($big_endian)) # no explicit flag: fall back to the byte order of the machine running this script
- { $big_endian=(unpack('L',pack('N',1))==1); }
-
-#$human=1; # uncomment to emit symbolic names instead of register names
-if ($human) { # useful for visual code auditing...
- ($A,$B,$C,$D,$E,$T) = ("A","B","C","D","E","T");
- ($h0,$h1,$h2,$h3,$h4) = ("h0","h1","h2","h3","h4");
- ($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
- ( "K_00_19","K_20_39","K_40_59","K_60_79" );
- @X= ( "X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7",
- "X8", "X9","X10","X11","X12","X13","X14","X15" );
-}
-else { # normal case: working variables, chaining values, round constants and the 16-word X window all get registers
- ($A,$B,$C,$D,$E,$T) = ("loc0","loc1","loc2","loc3","loc4","loc5");
- ($h0,$h1,$h2,$h3,$h4) = ("loc6","loc7","loc8","loc9","loc10");
- ($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
- ( "r14", "r15", "loc11", "loc12" );
- @X= ( "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
- "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31" );
-}
-
-sub BODY_00_15 { # append the assembler text for one of rounds 0..15 to the caller's code string
-local *code=shift; # first argument is a reference to the output string; aliasing the glob makes $code refer to it
-local ($i,$a,$b,$c,$d,$e,$f,$unaligned)=@_; # round number, six register names, and a flag selecting byte-wise input loads
-
-if ($unaligned) { # gather X[i] from four single-byte loads, most significant byte first
- $code.=<<___;
-{ .mmi; ld1 tmp0=[inp],2 // MSB
- ld1 tmp1=[tmp3],2 };;
-{ .mmi; ld1 tmp2=[inp],2
- ld1 $X[$i&0xf]=[tmp3],2 // LSB
- dep tmp1=tmp0,tmp1,8,8 };;
-{ .mii; cmp.ne p16,p0=r0,r0 // no misaligned prefetch
- dep $X[$i&0xf]=tmp2,$X[$i&0xf],8,8;;
- dep $X[$i&0xf]=tmp1,$X[$i&0xf],16,16 };;
-{ .mmi; nop.m 0
-___
- }
-elsif ($i<15) { # word-wise input: load the next round's word while this round computes
- $code.=<<___;
-{ .mmi; ld4 $X[($i+1)&0xf]=[inp],4 // prefetch
-___
- }
-else { # round 15 of the word-wise path: nothing left to load
- $code.=<<___;
-{ .mmi; nop.m 0
-___
- }
-if ($i<15) { # plain round
- $code.=<<___;
- and tmp0=$c,$b
- dep.z tmp5=$a,5,27 } // a<<5
-{ .mmi; andcm tmp1=$d,$b
- add tmp4=$e,$K_00_19 };;
-{ .mmi; or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
- add $f=tmp4,$X[$i&0xf] // f=xi+e+K_00_19
- extr.u tmp1=$a,27,5 };; // a>>27
-{ .mib; add $f=$f,tmp0 // f+=F_00_19(b,c,d)
- shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
-{ .mib; or tmp1=tmp1,tmp5 // ROTATE(a,5)
- mux2 tmp6=$a,0x44 };; // see b in next iteration
-{ .mii; add $f=$f,tmp1 // f+=ROTATE(a,5)
- mux2 $X[$i&0xf]=$X[$i&0xf],0x44
- nop.i 0 };;
-
-___
- }
-else { # round 15: same round, plus the first message-schedule word for round 16, left in $e
- $code.=<<___;
- and tmp0=$c,$b
- dep.z tmp5=$a,5,27 } // a<<5 ;;?
-{ .mmi; andcm tmp1=$d,$b
- add tmp4=$e,$K_00_19 };;
-{ .mmi; or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
- add $f=tmp4,$X[$i&0xf] // f=xi+e+K_00_19
- extr.u tmp1=$a,27,5 } // a>>27
-{ .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1
- xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1
- nop.i 0 };;
-{ .mmi; add $f=$f,tmp0 // f+=F_00_19(b,c,d)
- xor tmp2=tmp2,tmp3 // +1
- shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
-{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
- mux2 tmp6=$a,0x44 };; // see b in next iteration
-{ .mii; add $f=$f,tmp1 // f+=ROTATE(a,5)
- shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
- mux2 $X[$i&0xf]=$X[$i&0xf],0x44 };;
-
-___
- }
-}
-
-sub BODY_16_19 { # append one of rounds 16..19: F_00_19 round that stores the incoming $f as X[i] and prepares the next X word in $e
-local *code=shift; # first argument is a reference to the output string; aliasing the glob makes $code refer to it
-local ($i,$a,$b,$c,$d,$e,$f)=@_; # round number and six register names
-
-$code.=<<___;
-{ .mmi; mov $X[$i&0xf]=$f // Xupdate
- and tmp0=$c,$b
- dep.z tmp5=$a,5,27 } // a<<5
-{ .mmi; andcm tmp1=$d,$b
- add tmp4=$e,$K_00_19 };;
-{ .mmi; or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
- add $f=$f,tmp4 // f+=e+K_00_19
- extr.u tmp1=$a,27,5 } // a>>27
-{ .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1
- xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1
- nop.i 0 };;
-{ .mmi; add $f=$f,tmp0 // f+=F_00_19(b,c,d)
- xor tmp2=tmp2,tmp3 // +1
- shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
-{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
- mux2 tmp6=$a,0x44 };; // see b in next iteration
-{ .mii; add $f=$f,tmp1 // f+=ROTATE(a,5)
- shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
- nop.i 0 };;
-
-___
-}
-
-sub BODY_20_39 { # append one b^c^d round (20..39, and 60..79 via BODY_60_79); round 79 also folds the result into h1..h4
-local *code=shift; # first argument is a reference to the output string; aliasing the glob makes $code refer to it
-local ($i,$a,$b,$c,$d,$e,$f,$Konst)=@_; # round number, six register names, optional register holding the round constant
- $Konst = $K_20_39 if (!defined($Konst)); # default constant when the caller passes none
-
-if ($i<79) { # ordinary round: also prepares the next message-schedule word in $e
-$code.=<<___;
-{ .mib; mov $X[$i&0xf]=$f // Xupdate
- dep.z tmp5=$a,5,27 } // a<<5
-{ .mib; xor tmp0=$c,$b
- add tmp4=$e,$Konst };;
-{ .mmi; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d
- add $f=$f,tmp4 // f+=e+K_20_39
- extr.u tmp1=$a,27,5 } // a>>27
-{ .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1
- xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1
- nop.i 0 };;
-{ .mmi; add $f=$f,tmp0 // f+=F_20_39(b,c,d)
- xor tmp2=tmp2,tmp3 // +1
- shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
-{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
- mux2 tmp6=$a,0x44 };; // see b in next iteration
-{ .mii; add $f=$f,tmp1 // f+=ROTATE(a,5)
- shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
- nop.i 0 };;
-
-___
-}
-else { # round 79: no further X word is needed; add a,b,c,d into h1..h4 and set up the next block's loads
-$code.=<<___;
-{ .mib; mov $X[$i&0xf]=$f // Xupdate
- dep.z tmp5=$a,5,27 } // a<<5
-{ .mib; xor tmp0=$c,$b
- add tmp4=$e,$Konst };;
-{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d
- extr.u tmp1=$a,27,5 } // a>>27
-{ .mib; add $f=$f,tmp4 // f+=e+K_20_39
- add $h1=$h1,$a };; // wrap up
-{ .mmi;
-(p16) ld4.s $X[0]=[inp],4 // non-faulting prefetch
- add $f=$f,tmp0 // f+=F_20_39(b,c,d)
- shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) ;;?
-{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
- add $h3=$h3,$c };; // wrap up
-{ .mib; add tmp3=1,inp // used in unaligned codepath
- add $f=$f,tmp1 } // f+=ROTATE(a,5)
-{ .mib; add $h2=$h2,$b // wrap up
- add $h4=$h4,$d };; // wrap up
-
-___
-}
-}
-
-sub BODY_40_59 { # append one of rounds 40..59: F_40_59 round that stores the incoming $f as X[i] and prepares the next X word in $e
-local *code=shift; # first argument is a reference to the output string; aliasing the glob makes $code refer to it
-local ($i,$a,$b,$c,$d,$e,$f)=@_; # round number and six register names
-
-$code.=<<___;
-{ .mmi; mov $X[$i&0xf]=$f // Xupdate
- and tmp0=$c,$b
- dep.z tmp5=$a,5,27 } // a<<5
-{ .mmi; and tmp1=$d,$b
- add tmp4=$e,$K_40_59 };;
-{ .mmi; or tmp0=tmp0,tmp1 // (b&c)|(b&d)
- add $f=$f,tmp4 // f+=e+K_40_59
- extr.u tmp1=$a,27,5 } // a>>27
-{ .mmi; and tmp4=$c,$d
- xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1
- xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1
- };;
-{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
- xor tmp2=tmp2,tmp3 // +1
- shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
-{ .mmi; or tmp0=tmp0,tmp4 // F_40_59(b,c,d)=(b&c)|(b&d)|(c&d)
- mux2 tmp6=$a,0x44 };; // see b in next iteration
-{ .mii; add $f=$f,tmp0 // f+=F_40_59(b,c,d)
- shrp $e=tmp2,tmp2,31;; // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
- add $f=$f,tmp1 };; // f+=ROTATE(a,5)
-
-___
-}
-sub BODY_60_79 {
-    # Rounds 60..79 are the 20..39 round with a different constant:
-    # forward every argument and append the K_60_79 register name.
-    return &BODY_20_39(@_, $K_60_79);
-}
-
-$code.=<<___; # sha1_block_asm_host_order: register aliases, prologue, constant and chaining-value loads, loop head .Lhtop
-.text
-
-tmp0=r8;
-tmp1=r9;
-tmp2=r10;
-tmp3=r11;
-ctx=r32; // in0
-inp=r33; // in1
-
-// void sha1_block_asm_host_order(SHA_CTX *c,const void *p,size_t num);
-.global sha1_block_asm_host_order#
-.proc sha1_block_asm_host_order#
-.align 32
-sha1_block_asm_host_order:
- .prologue
- .fframe 0
- .save ar.pfs,r0
- .save ar.lc,r3
-{ .mmi; alloc tmp1=ar.pfs,3,15,0,0
- $ADDP tmp0=4,ctx
- mov r3=ar.lc }
-{ .mmi; $ADDP ctx=0,ctx
- $ADDP inp=0,inp
- mov r2=pr };;
-tmp4=in2;
-tmp5=loc13;
-tmp6=loc14;
- .body
-{ .mlx; ld4 $h0=[ctx],8
- movl $K_00_19=0x5a827999 }
-{ .mlx; ld4 $h1=[tmp0],8
- movl $K_20_39=0x6ed9eba1 };;
-{ .mlx; ld4 $h2=[ctx],8
- movl $K_40_59=0x8f1bbcdc }
-{ .mlx; ld4 $h3=[tmp0]
- movl $K_60_79=0xca62c1d6 };;
-{ .mmi; ld4 $h4=[ctx],-16
- add in2=-1,in2 // adjust num for ar.lc
- mov ar.ec=1 };;
-{ .mmi; ld4 $X[0]=[inp],4 // prefetch
- cmp.ne p16,p0=r0,in2 // prefecth at loop end
- mov ar.lc=in2 };; // brp.loop.imp: too far
-
-.Lhtop:
-{ .mmi; mov $A=$h0
- mov $B=$h1
- mux2 tmp6=$h1,0x44 }
-{ .mmi; mov $C=$h2
- mov $D=$h3
- mov $E=$h4 };;
-
-___
-
- &BODY_00_15(\$code, 0,$A,$B,$C,$D,$E,$T); # rounds 0-15 without the unaligned flag: input is read with ld4; register roles shift by one each round
- &BODY_00_15(\$code, 1,$T,$A,$B,$C,$D,$E);
- &BODY_00_15(\$code, 2,$E,$T,$A,$B,$C,$D);
- &BODY_00_15(\$code, 3,$D,$E,$T,$A,$B,$C);
- &BODY_00_15(\$code, 4,$C,$D,$E,$T,$A,$B);
- &BODY_00_15(\$code, 5,$B,$C,$D,$E,$T,$A);
- &BODY_00_15(\$code, 6,$A,$B,$C,$D,$E,$T);
- &BODY_00_15(\$code, 7,$T,$A,$B,$C,$D,$E);
- &BODY_00_15(\$code, 8,$E,$T,$A,$B,$C,$D);
- &BODY_00_15(\$code, 9,$D,$E,$T,$A,$B,$C);
- &BODY_00_15(\$code,10,$C,$D,$E,$T,$A,$B);
- &BODY_00_15(\$code,11,$B,$C,$D,$E,$T,$A);
- &BODY_00_15(\$code,12,$A,$B,$C,$D,$E,$T);
- &BODY_00_15(\$code,13,$T,$A,$B,$C,$D,$E);
- &BODY_00_15(\$code,14,$E,$T,$A,$B,$C,$D);
- &BODY_00_15(\$code,15,$D,$E,$T,$A,$B,$C);
-
- &BODY_16_19(\$code,16,$C,$D,$E,$T,$A,$B);
- &BODY_16_19(\$code,17,$B,$C,$D,$E,$T,$A);
- &BODY_16_19(\$code,18,$A,$B,$C,$D,$E,$T);
- &BODY_16_19(\$code,19,$T,$A,$B,$C,$D,$E);
-
- &BODY_20_39(\$code,20,$E,$T,$A,$B,$C,$D);
- &BODY_20_39(\$code,21,$D,$E,$T,$A,$B,$C);
- &BODY_20_39(\$code,22,$C,$D,$E,$T,$A,$B);
- &BODY_20_39(\$code,23,$B,$C,$D,$E,$T,$A);
- &BODY_20_39(\$code,24,$A,$B,$C,$D,$E,$T);
- &BODY_20_39(\$code,25,$T,$A,$B,$C,$D,$E);
- &BODY_20_39(\$code,26,$E,$T,$A,$B,$C,$D);
- &BODY_20_39(\$code,27,$D,$E,$T,$A,$B,$C);
- &BODY_20_39(\$code,28,$C,$D,$E,$T,$A,$B);
- &BODY_20_39(\$code,29,$B,$C,$D,$E,$T,$A);
- &BODY_20_39(\$code,30,$A,$B,$C,$D,$E,$T);
- &BODY_20_39(\$code,31,$T,$A,$B,$C,$D,$E);
- &BODY_20_39(\$code,32,$E,$T,$A,$B,$C,$D);
- &BODY_20_39(\$code,33,$D,$E,$T,$A,$B,$C);
- &BODY_20_39(\$code,34,$C,$D,$E,$T,$A,$B);
- &BODY_20_39(\$code,35,$B,$C,$D,$E,$T,$A);
- &BODY_20_39(\$code,36,$A,$B,$C,$D,$E,$T);
- &BODY_20_39(\$code,37,$T,$A,$B,$C,$D,$E);
- &BODY_20_39(\$code,38,$E,$T,$A,$B,$C,$D);
- &BODY_20_39(\$code,39,$D,$E,$T,$A,$B,$C);
-
- &BODY_40_59(\$code,40,$C,$D,$E,$T,$A,$B);
- &BODY_40_59(\$code,41,$B,$C,$D,$E,$T,$A);
- &BODY_40_59(\$code,42,$A,$B,$C,$D,$E,$T);
- &BODY_40_59(\$code,43,$T,$A,$B,$C,$D,$E);
- &BODY_40_59(\$code,44,$E,$T,$A,$B,$C,$D);
- &BODY_40_59(\$code,45,$D,$E,$T,$A,$B,$C);
- &BODY_40_59(\$code,46,$C,$D,$E,$T,$A,$B);
- &BODY_40_59(\$code,47,$B,$C,$D,$E,$T,$A);
- &BODY_40_59(\$code,48,$A,$B,$C,$D,$E,$T);
- &BODY_40_59(\$code,49,$T,$A,$B,$C,$D,$E);
- &BODY_40_59(\$code,50,$E,$T,$A,$B,$C,$D);
- &BODY_40_59(\$code,51,$D,$E,$T,$A,$B,$C);
- &BODY_40_59(\$code,52,$C,$D,$E,$T,$A,$B);
- &BODY_40_59(\$code,53,$B,$C,$D,$E,$T,$A);
- &BODY_40_59(\$code,54,$A,$B,$C,$D,$E,$T);
- &BODY_40_59(\$code,55,$T,$A,$B,$C,$D,$E);
- &BODY_40_59(\$code,56,$E,$T,$A,$B,$C,$D);
- &BODY_40_59(\$code,57,$D,$E,$T,$A,$B,$C);
- &BODY_40_59(\$code,58,$C,$D,$E,$T,$A,$B);
- &BODY_40_59(\$code,59,$B,$C,$D,$E,$T,$A);
-
- &BODY_60_79(\$code,60,$A,$B,$C,$D,$E,$T);
- &BODY_60_79(\$code,61,$T,$A,$B,$C,$D,$E);
- &BODY_60_79(\$code,62,$E,$T,$A,$B,$C,$D);
- &BODY_60_79(\$code,63,$D,$E,$T,$A,$B,$C);
- &BODY_60_79(\$code,64,$C,$D,$E,$T,$A,$B);
- &BODY_60_79(\$code,65,$B,$C,$D,$E,$T,$A);
- &BODY_60_79(\$code,66,$A,$B,$C,$D,$E,$T);
- &BODY_60_79(\$code,67,$T,$A,$B,$C,$D,$E);
- &BODY_60_79(\$code,68,$E,$T,$A,$B,$C,$D);
- &BODY_60_79(\$code,69,$D,$E,$T,$A,$B,$C);
- &BODY_60_79(\$code,70,$C,$D,$E,$T,$A,$B);
- &BODY_60_79(\$code,71,$B,$C,$D,$E,$T,$A);
- &BODY_60_79(\$code,72,$A,$B,$C,$D,$E,$T);
- &BODY_60_79(\$code,73,$T,$A,$B,$C,$D,$E);
- &BODY_60_79(\$code,74,$E,$T,$A,$B,$C,$D);
- &BODY_60_79(\$code,75,$D,$E,$T,$A,$B,$C);
- &BODY_60_79(\$code,76,$C,$D,$E,$T,$A,$B);
- &BODY_60_79(\$code,77,$B,$C,$D,$E,$T,$A);
- &BODY_60_79(\$code,78,$A,$B,$C,$D,$E,$T);
- &BODY_60_79(\$code,79,$T,$A,$B,$C,$D,$E); # round 79 takes the wrap-up branch of BODY_20_39 and updates h1..h4
-
-$code.=<<___; # loop tail (h0 += result of round 79, branch back to .Lhtop), then store h0..h4 and return
-{ .mmb; add $h0=$h0,$E
- nop.m 0
- br.ctop.dptk.many .Lhtop };;
-.Lhend:
-{ .mmi; add tmp0=4,ctx
- mov ar.lc=r3 };;
-{ .mmi; st4 [ctx]=$h0,8
- st4 [tmp0]=$h1,8 };;
-{ .mmi; st4 [ctx]=$h2,8
- st4 [tmp0]=$h3 };;
-{ .mib; st4 [ctx]=$h4,-16
- mov pr=r2,0x1ffff
- br.ret.sptk.many b0 };;
-.endp sha1_block_asm_host_order#
-___
-
-
-$code.=<<___; # sha1_block_asm_data_order: symbol declaration and entry label
-// void sha1_block_asm_data_order(SHA_CTX *c,const void *p,size_t num);
-.global sha1_block_asm_data_order#
-.proc sha1_block_asm_data_order#
-.align 32
-sha1_block_asm_data_order:
-___
-$code.=<<___ if ($big_endian); # big-endian builds only: input whose low two address bits are zero is handed to the host-order routine
-{ .mmi; and r2=3,inp };;
-{ .mib; cmp.eq p6,p0=r0,r2
-(p6) br.dptk.many sha1_block_asm_host_order };;
-___
-$code.=<<___; # prologue, constant and chaining-value loads, loop head .Ldtop (tmp3 = inp+1 for the byte-wise loads)
- .prologue
- .fframe 0
- .save ar.pfs,r0
- .save ar.lc,r3
-{ .mmi; alloc tmp1=ar.pfs,3,15,0,0
- $ADDP tmp0=4,ctx
- mov r3=ar.lc }
-{ .mmi; $ADDP ctx=0,ctx
- $ADDP inp=0,inp
- mov r2=pr };;
-tmp4=in2;
-tmp5=loc13;
-tmp6=loc14;
- .body
-{ .mlx; ld4 $h0=[ctx],8
- movl $K_00_19=0x5a827999 }
-{ .mlx; ld4 $h1=[tmp0],8
- movl $K_20_39=0x6ed9eba1 };;
-{ .mlx; ld4 $h2=[ctx],8
- movl $K_40_59=0x8f1bbcdc }
-{ .mlx; ld4 $h3=[tmp0]
- movl $K_60_79=0xca62c1d6 };;
-{ .mmi; ld4 $h4=[ctx],-16
- add in2=-1,in2 // adjust num for ar.lc
- mov ar.ec=1 };;
-{ .mmi; nop.m 0
- add tmp3=1,inp
- mov ar.lc=in2 };; // brp.loop.imp: too far
-
-.Ldtop:
-{ .mmi; mov $A=$h0
- mov $B=$h1
- mux2 tmp6=$h1,0x44 }
-{ .mmi; mov $C=$h2
- mov $D=$h3
- mov $E=$h4 };;
-
-___
-
- &BODY_00_15(\$code, 0,$A,$B,$C,$D,$E,$T,1); # trailing 1 = unaligned flag: each input word is assembled from four ld1 byte loads
- &BODY_00_15(\$code, 1,$T,$A,$B,$C,$D,$E,1);
- &BODY_00_15(\$code, 2,$E,$T,$A,$B,$C,$D,1);
- &BODY_00_15(\$code, 3,$D,$E,$T,$A,$B,$C,1);
- &BODY_00_15(\$code, 4,$C,$D,$E,$T,$A,$B,1);
- &BODY_00_15(\$code, 5,$B,$C,$D,$E,$T,$A,1);
- &BODY_00_15(\$code, 6,$A,$B,$C,$D,$E,$T,1);
- &BODY_00_15(\$code, 7,$T,$A,$B,$C,$D,$E,1);
- &BODY_00_15(\$code, 8,$E,$T,$A,$B,$C,$D,1);
- &BODY_00_15(\$code, 9,$D,$E,$T,$A,$B,$C,1);
- &BODY_00_15(\$code,10,$C,$D,$E,$T,$A,$B,1);
- &BODY_00_15(\$code,11,$B,$C,$D,$E,$T,$A,1);
- &BODY_00_15(\$code,12,$A,$B,$C,$D,$E,$T,1);
- &BODY_00_15(\$code,13,$T,$A,$B,$C,$D,$E,1);
- &BODY_00_15(\$code,14,$E,$T,$A,$B,$C,$D,1);
- &BODY_00_15(\$code,15,$D,$E,$T,$A,$B,$C,1);
-
- &BODY_16_19(\$code,16,$C,$D,$E,$T,$A,$B); # rounds 16-79 are generated exactly as in the host-order function
- &BODY_16_19(\$code,17,$B,$C,$D,$E,$T,$A);
- &BODY_16_19(\$code,18,$A,$B,$C,$D,$E,$T);
- &BODY_16_19(\$code,19,$T,$A,$B,$C,$D,$E);
-
- &BODY_20_39(\$code,20,$E,$T,$A,$B,$C,$D);
- &BODY_20_39(\$code,21,$D,$E,$T,$A,$B,$C);
- &BODY_20_39(\$code,22,$C,$D,$E,$T,$A,$B);
- &BODY_20_39(\$code,23,$B,$C,$D,$E,$T,$A);
- &BODY_20_39(\$code,24,$A,$B,$C,$D,$E,$T);
- &BODY_20_39(\$code,25,$T,$A,$B,$C,$D,$E);
- &BODY_20_39(\$code,26,$E,$T,$A,$B,$C,$D);
- &BODY_20_39(\$code,27,$D,$E,$T,$A,$B,$C);
- &BODY_20_39(\$code,28,$C,$D,$E,$T,$A,$B);
- &BODY_20_39(\$code,29,$B,$C,$D,$E,$T,$A);
- &BODY_20_39(\$code,30,$A,$B,$C,$D,$E,$T);
- &BODY_20_39(\$code,31,$T,$A,$B,$C,$D,$E);
- &BODY_20_39(\$code,32,$E,$T,$A,$B,$C,$D);
- &BODY_20_39(\$code,33,$D,$E,$T,$A,$B,$C);
- &BODY_20_39(\$code,34,$C,$D,$E,$T,$A,$B);
- &BODY_20_39(\$code,35,$B,$C,$D,$E,$T,$A);
- &BODY_20_39(\$code,36,$A,$B,$C,$D,$E,$T);
- &BODY_20_39(\$code,37,$T,$A,$B,$C,$D,$E);
- &BODY_20_39(\$code,38,$E,$T,$A,$B,$C,$D);
- &BODY_20_39(\$code,39,$D,$E,$T,$A,$B,$C);
-
- &BODY_40_59(\$code,40,$C,$D,$E,$T,$A,$B);
- &BODY_40_59(\$code,41,$B,$C,$D,$E,$T,$A);
- &BODY_40_59(\$code,42,$A,$B,$C,$D,$E,$T);
- &BODY_40_59(\$code,43,$T,$A,$B,$C,$D,$E);
- &BODY_40_59(\$code,44,$E,$T,$A,$B,$C,$D);
- &BODY_40_59(\$code,45,$D,$E,$T,$A,$B,$C);
- &BODY_40_59(\$code,46,$C,$D,$E,$T,$A,$B);
- &BODY_40_59(\$code,47,$B,$C,$D,$E,$T,$A);
- &BODY_40_59(\$code,48,$A,$B,$C,$D,$E,$T);
- &BODY_40_59(\$code,49,$T,$A,$B,$C,$D,$E);
- &BODY_40_59(\$code,50,$E,$T,$A,$B,$C,$D);
- &BODY_40_59(\$code,51,$D,$E,$T,$A,$B,$C);
- &BODY_40_59(\$code,52,$C,$D,$E,$T,$A,$B);
- &BODY_40_59(\$code,53,$B,$C,$D,$E,$T,$A);
- &BODY_40_59(\$code,54,$A,$B,$C,$D,$E,$T);
- &BODY_40_59(\$code,55,$T,$A,$B,$C,$D,$E);
- &BODY_40_59(\$code,56,$E,$T,$A,$B,$C,$D);
- &BODY_40_59(\$code,57,$D,$E,$T,$A,$B,$C);
- &BODY_40_59(\$code,58,$C,$D,$E,$T,$A,$B);
- &BODY_40_59(\$code,59,$B,$C,$D,$E,$T,$A);
-
- &BODY_60_79(\$code,60,$A,$B,$C,$D,$E,$T);
- &BODY_60_79(\$code,61,$T,$A,$B,$C,$D,$E);
- &BODY_60_79(\$code,62,$E,$T,$A,$B,$C,$D);
- &BODY_60_79(\$code,63,$D,$E,$T,$A,$B,$C);
- &BODY_60_79(\$code,64,$C,$D,$E,$T,$A,$B);
- &BODY_60_79(\$code,65,$B,$C,$D,$E,$T,$A);
- &BODY_60_79(\$code,66,$A,$B,$C,$D,$E,$T);
- &BODY_60_79(\$code,67,$T,$A,$B,$C,$D,$E);
- &BODY_60_79(\$code,68,$E,$T,$A,$B,$C,$D);
- &BODY_60_79(\$code,69,$D,$E,$T,$A,$B,$C);
- &BODY_60_79(\$code,70,$C,$D,$E,$T,$A,$B);
- &BODY_60_79(\$code,71,$B,$C,$D,$E,$T,$A);
- &BODY_60_79(\$code,72,$A,$B,$C,$D,$E,$T);
- &BODY_60_79(\$code,73,$T,$A,$B,$C,$D,$E);
- &BODY_60_79(\$code,74,$E,$T,$A,$B,$C,$D);
- &BODY_60_79(\$code,75,$D,$E,$T,$A,$B,$C);
- &BODY_60_79(\$code,76,$C,$D,$E,$T,$A,$B);
- &BODY_60_79(\$code,77,$B,$C,$D,$E,$T,$A);
- &BODY_60_79(\$code,78,$A,$B,$C,$D,$E,$T);
- &BODY_60_79(\$code,79,$T,$A,$B,$C,$D,$E); # round 79 takes the wrap-up branch of BODY_20_39 and updates h1..h4
-
-$code.=<<___; # loop tail (h0 += result of round 79, branch back to .Ldtop), then store h0..h4 and return
-{ .mmb; add $h0=$h0,$E
- nop.m 0
- br.ctop.dptk.many .Ldtop };;
-.Ldend:
-{ .mmi; add tmp0=4,ctx
- mov ar.lc=r3 };;
-{ .mmi; st4 [ctx]=$h0,8
- st4 [tmp0]=$h1,8 };;
-{ .mmi; st4 [ctx]=$h2,8
- st4 [tmp0]=$h3 };;
-{ .mib; st4 [ctx]=$h4,-16
- mov pr=r2,0x1ffff
- br.ret.sptk.many b0 };;
-.endp sha1_block_asm_data_order#
-___
-
-print $code; # the accumulated assembler text goes to standard output
diff --git a/openssl/trunk/crypto/sha/asm/sha512-ia64.pl b/openssl/trunk/crypto/sha/asm/sha512-ia64.pl
deleted file mode 100755
index 0aea0239..00000000
--- a/openssl/trunk/crypto/sha/asm/sha512-ia64.pl
+++ /dev/null
@@ -1,432 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. Rights for redistribution and usage in source and binary
-# forms are granted according to the OpenSSL license.
-# ====================================================================
-#
-# SHA256/512_Transform for Itanium.
-#
-# sha512_block runs in 1003 cycles on Itanium 2, which is almost 50%
-# faster than gcc and >60%(!) faster than code generated by HP-UX
-# compiler (yes, HP-UX is generating slower code, because unlike gcc,
-# it failed to deploy "shift right pair," 'shrp' instruction, which
-# substitutes for 64-bit rotate).
-#
-# 924 cycles long sha256_block outperforms gcc by over factor of 2(!)
-# and HP-UX compiler - by >40% (yes, gcc won sha512_block, but lost
-# this one big time). Note that "formally" 924 is about 100 cycles
-# too much. I mean it's 64 32-bit rounds vs. 80 virtually identical
-# 64-bit ones and 1003*64/80 gives 802. Extra cycles, 2 per round,
-# are spent on extra work to provide for 32-bit rotations. 32-bit
-# rotations are still handled by 'shrp' instruction and for this
-# reason lower 32 bits are deposited to upper half of 64-bit register
-# prior to 'shrp' issue. And in order to minimize the amount of such
-# operations, X[16] values are *maintained* with copies of lower
-# halves in upper halves, which is why you'll spot such instructions
-# as custom 'mux2', "parallel 32-bit add," 'padd4' and "parallel
-# 32-bit unsigned right shift," 'pshr4.u' instructions here.
-#
-# Rules of engagement.
-#
-# There is only one integer shifter meaning that if I have two rotate,
-# deposit or extract instructions in adjacent bundles, they shall
-# split [at run-time if they have to]. But note that variable and
-# parallel shifts are performed by multi-media ALU and *are* pairable
-# with rotates [and alike]. On the backside MMALU is rather slow: it
-# takes 2 extra cycles before the result of integer operation is
-# available *to* MMALU and 2(*) extra cycles before the result of MM
-# operation is available "back" *to* integer ALU, not to mention that
-# MMALU itself has 2 cycles latency. However! I explicitly scheduled
-# these MM instructions to avoid MM stalls, so that all these extra
-# latencies get "hidden" in instruction-level parallelism.
-#
-# (*) 2 cycles on Itanium 1 and 1 cycle on Itanium 2. But I schedule
-# for 2 in order to provide for best *overall* performance,
-# because on Itanium 1 stall on MM result is accompanied by
-# pipeline flush, which takes 6 cycles:-(
-#
-# Resulting performance numbers for 900MHz Itanium 2 system:
-#
-# The 'numbers' are in 1000s of bytes per second processed.
-# type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes
-# sha1(*) 6210.14k 20376.30k 52447.83k 85870.05k 105478.12k
-# sha256 7476.45k 20572.05k 41538.34k 56062.29k 62093.18k
-# sha512 4996.56k 20026.28k 47597.20k 85278.79k 111501.31k
-#
-# (*) SHA1 numbers are for HP-UX compiler and are presented purely
-# for reference purposes. I bet it can be improved too...
-#
-# To generate code, pass the file name with either 256 or 512 in its
-# name and compiler flags.
-
-$output=shift;
-# Pick the SHA-512 or SHA-256 parameter set from the output file name.
-if ($output =~ /512.*\.(?:s|asm)/) { # alternation; [s|asm] was a one-character class
- $SZ=8;
- $BITS=8*$SZ;
- $LDW="ld8";
- $STW="st8";
- $ADD="add";
- $SHRU="shr.u";
- $TABLE="K512";
- $func="sha512_block";
- @Sigma0=(28,34,39);
- @Sigma1=(14,18,41);
- @sigma0=(1, 8, 7);
- @sigma1=(19,61, 6);
- $rounds=80;
-} elsif ($output =~ /256.*\.(?:s|asm)/) {
- $SZ=4;
- $BITS=8*$SZ;
- $LDW="ld4";
- $STW="st4";
- $ADD="padd4";
- $SHRU="pshr4.u";
- $TABLE="K256";
- $func="sha256_block";
- @Sigma0=( 2,13,22);
- @Sigma1=( 6,11,25);
- @sigma0=( 7,18, 3);
- @sigma1=(17,19,10);
- $rounds=64;
-} else { die "nonsense $output"; }
-# Parenthesized so that || tests open()'s result rather than the file name string.
-open(STDOUT,">$output") || die "can't open $output: $!";
-# On HP-UX use addp4 for pointers unless a 64-bit flag (+DD64 or -mlp64) is passed.
-if ($^O eq "hpux") {
- $ADDP="addp4";
- for (@ARGV) { $ADDP="add" if (/(?:\+DD|\-mlp)64/); }
-} else { $ADDP="add"; }
-for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
- $big_endian=0 if (/\-DL_ENDIAN/); }
-if (!defined($big_endian)) # no explicit flag: fall back to the byte order of the build host
- { $big_endian=(unpack('L',pack('N',1))==1); }
-
-$code=<<___;
-.ident \"$output, version 1.0\"
-.ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
-.explicit
-.text
-
-prsave=r14;
-K=r15;
-A=r16; B=r17; C=r18; D=r19;
-E=r20; F=r21; G=r22; H=r23;
-T1=r24; T2=r25;
-s0=r26; s1=r27; t0=r28; t1=r29;
-Ktbl=r30;
-ctx=r31; // 1st arg
-input=r48; // 2nd arg
-num=r49; // 3rd arg
-sgm0=r50; sgm1=r51; // small constants
-
-// void $func (SHA_CTX *ctx, const void *in,size_t num[,int host])
-.global $func#
-.proc $func#
-.align 32
-$func:
- .prologue
- .fframe 0
- .save ar.pfs,r2
- .save ar.lc,r3
- .save pr,prsave
-{ .mmi; alloc r2=ar.pfs,3,17,0,16
- $ADDP ctx=0,r32 // 1st arg
- mov r3=ar.lc }
-{ .mmi; $ADDP input=0,r33 // 2nd arg
- addl Ktbl=\@ltoff($TABLE#),gp
- mov prsave=pr };;
-
- .body
-{ .mii; ld8 Ktbl=[Ktbl]
- mov num=r34 };; // 3rd arg
-
-{ .mib; add r8=0*$SZ,ctx
- add r9=1*$SZ,ctx
- brp.loop.imp .L_first16,.L_first16_ctop
- }
-{ .mib; add r10=2*$SZ,ctx
- add r11=3*$SZ,ctx
- brp.loop.imp .L_rest,.L_rest_ctop
- };;
-// load A-H
-{ .mmi; $LDW A=[r8],4*$SZ
- $LDW B=[r9],4*$SZ
- mov sgm0=$sigma0[2] }
-{ .mmi; $LDW C=[r10],4*$SZ
- $LDW D=[r11],4*$SZ
- mov sgm1=$sigma1[2] };;
-{ .mmi; $LDW E=[r8]
- $LDW F=[r9] }
-{ .mmi; $LDW G=[r10]
- $LDW H=[r11]
- cmp.ne p15,p14=0,r35 };; // used in sha256_block
-
-.L_outer:
-{ .mii; mov ar.lc=15
- mov ar.ec=1 };;
-.align 32
-.L_first16:
-.rotr X[16]
-___
-$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32);
-{ .mib; (p14) add r9=1,input
- (p14) add r10=2,input }
-{ .mib; (p14) add r11=3,input
- (p15) br.dptk.few .L_host };;
-{ .mmi; (p14) ld1 r8=[input],$SZ
- (p14) ld1 r9=[r9] }
-{ .mmi; (p14) ld1 r10=[r10]
- (p14) ld1 r11=[r11] };;
-{ .mii; (p14) dep r9=r8,r9,8,8
- (p14) dep r11=r10,r11,8,8 };;
-{ .mib; (p14) dep X[15]=r9,r11,16,16 };;
-.L_host:
-{ .mib; (p15) $LDW X[15]=[input],$SZ // X[i]=*input++
- dep.z $t1=E,32,32 }
-{ .mib; $LDW K=[Ktbl],$SZ
- zxt4 E=E };;
-{ .mmi; or $t1=$t1,E
- and T1=F,E
- and T2=A,B }
-{ .mmi; andcm r8=G,E
- and r9=A,C
- mux2 $t0=A,0x44 };; // copy lower half to upper
-{ .mib; xor T1=T1,r8 // T1=((e & f) ^ (~e & g))
- _rotr r11=$t1,$Sigma1[0] } // ROTR(e,14)
-{ .mib; and r10=B,C
- xor T2=T2,r9 };;
-___
-$t0="A", $t1="E", $code.=<<___ if ($BITS==64);
-{ .mmi; $LDW X[15]=[input],$SZ // X[i]=*input++
- and T1=F,E
- and T2=A,B }
-{ .mmi; $LDW K=[Ktbl],$SZ
- andcm r8=G,E
- and r9=A,C };;
-{ .mmi; xor T1=T1,r8 //T1=((e & f) ^ (~e & g))
- and r10=B,C
- _rotr r11=$t1,$Sigma1[0] } // ROTR(e,14)
-{ .mmi; xor T2=T2,r9
- mux1 X[15]=X[15],\@rev };; // eliminated in big-endian
-___
-$code.=<<___;
-{ .mib; add T1=T1,H // T1=Ch(e,f,g)+h
- _rotr r8=$t1,$Sigma1[1] } // ROTR(e,18)
-{ .mib; xor T2=T2,r10 // T2=((a & b) ^ (a & c) ^ (b & c))
- mov H=G };;
-{ .mib; xor r11=r8,r11
- _rotr r9=$t1,$Sigma1[2] } // ROTR(e,41)
-{ .mib; mov G=F
- mov F=E };;
-{ .mib; xor r9=r9,r11 // r9=Sigma1(e)
- _rotr r10=$t0,$Sigma0[0] } // ROTR(a,28)
-{ .mib; add T1=T1,K // T1=Ch(e,f,g)+h+K512[i]
- mov E=D };;
-{ .mib; add T1=T1,r9 // T1+=Sigma1(e)
- _rotr r11=$t0,$Sigma0[1] } // ROTR(a,34)
-{ .mib; mov D=C
- mov C=B };;
-{ .mib; add T1=T1,X[15] // T1+=X[i]
- _rotr r8=$t0,$Sigma0[2] } // ROTR(a,39)
-{ .mib; xor r10=r10,r11
- mux2 X[15]=X[15],0x44 };; // eliminated in 64-bit
-{ .mmi; xor r10=r8,r10 // r10=Sigma0(a)
- mov B=A
- add A=T1,T2 };;
-.L_first16_ctop:
-{ .mib; add E=E,T1
- add A=A,r10 // T2=Maj(a,b,c)+Sigma0(a)
- br.ctop.sptk .L_first16 };;
-
-{ .mib; mov ar.lc=$rounds-17 }
-{ .mib; mov ar.ec=1 };;
-.align 32
-.L_rest:
-.rotr X[16]
-{ .mib; $LDW K=[Ktbl],$SZ
- _rotr r8=X[15-1],$sigma0[0] } // ROTR(s0,1)
-{ .mib; $ADD X[15]=X[15],X[15-9] // X[i&0xF]+=X[(i+9)&0xF]
- $SHRU s0=X[15-1],sgm0 };; // s0=X[(i+1)&0xF]>>7
-{ .mib; and T1=F,E
- _rotr r9=X[15-1],$sigma0[1] } // ROTR(s0,8)
-{ .mib; andcm r10=G,E
- $SHRU s1=X[15-14],sgm1 };; // s1=X[(i+14)&0xF]>>6
-{ .mmi; xor T1=T1,r10 // T1=((e & f) ^ (~e & g))
- xor r9=r8,r9
- _rotr r10=X[15-14],$sigma1[0] };;// ROTR(s1,19)
-{ .mib; and T2=A,B
- _rotr r11=X[15-14],$sigma1[1] }// ROTR(s1,61)
-{ .mib; and r8=A,C };;
-___
-$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32);
-// I adhere to mmi; in order to hold Itanium 1 back and avoid 6 cycle
-// pipeline flush in last bundle. Note that even on Itanium2 the
-// latter stalls for one clock cycle...
-{ .mmi; xor s0=s0,r9 // s0=sigma0(X[(i+1)&0xF])
- dep.z $t1=E,32,32 }
-{ .mmi; xor r10=r11,r10
- zxt4 E=E };;
-{ .mmi; or $t1=$t1,E
- xor s1=s1,r10 // s1=sigma1(X[(i+14)&0xF])
- mux2 $t0=A,0x44 };; // copy lower half to upper
-{ .mmi; xor T2=T2,r8
- _rotr r9=$t1,$Sigma1[0] } // ROTR(e,14)
-{ .mmi; and r10=B,C
- add T1=T1,H // T1=Ch(e,f,g)+h
- $ADD X[15]=X[15],s0 };; // X[i&0xF]+=sigma0(X[(i+1)&0xF])
-___
-$t0="A", $t1="E", $code.=<<___ if ($BITS==64);
-{ .mib; xor s0=s0,r9 // s0=sigma0(X[(i+1)&0xF])
- _rotr r9=$t1,$Sigma1[0] } // ROTR(e,14)
-{ .mib; xor r10=r11,r10
- xor T2=T2,r8 };;
-{ .mib; xor s1=s1,r10 // s1=sigma1(X[(i+14)&0xF])
- add T1=T1,H }
-{ .mib; and r10=B,C
- $ADD X[15]=X[15],s0 };; // X[i&0xF]+=sigma0(X[(i+1)&0xF])
-___
-$code.=<<___;
-{ .mmi; xor T2=T2,r10 // T2=((a & b) ^ (a & c) ^ (b & c))
- mov H=G
- _rotr r8=$t1,$Sigma1[1] };; // ROTR(e,18)
-{ .mmi; xor r11=r8,r9
- $ADD X[15]=X[15],s1 // X[i&0xF]+=sigma1(X[(i+14)&0xF])
- _rotr r9=$t1,$Sigma1[2] } // ROTR(e,41)
-{ .mmi; mov G=F
- mov F=E };;
-{ .mib; xor r9=r9,r11 // r9=Sigma1(e)
- _rotr r10=$t0,$Sigma0[0] } // ROTR(a,28)
-{ .mib; add T1=T1,K // T1=Ch(e,f,g)+h+K512[i]
- mov E=D };;
-{ .mib; add T1=T1,r9 // T1+=Sigma1(e)
- _rotr r11=$t0,$Sigma0[1] } // ROTR(a,34)
-{ .mib; mov D=C
- mov C=B };;
-{ .mmi; add T1=T1,X[15] // T1+=X[i]
- xor r10=r10,r11
- _rotr r8=$t0,$Sigma0[2] };; // ROTR(a,39)
-{ .mmi; xor r10=r8,r10 // r10=Sigma0(a)
- mov B=A
- add A=T1,T2 };;
-.L_rest_ctop:
-{ .mib; add E=E,T1
- add A=A,r10 // T2=Maj(a,b,c)+Sigma0(a)
- br.ctop.sptk .L_rest };;
-
-{ .mib; add r8=0*$SZ,ctx
- add r9=1*$SZ,ctx }
-{ .mib; add r10=2*$SZ,ctx
- add r11=3*$SZ,ctx };;
-{ .mmi; $LDW r32=[r8],4*$SZ
- $LDW r33=[r9],4*$SZ }
-{ .mmi; $LDW r34=[r10],4*$SZ
- $LDW r35=[r11],4*$SZ
- cmp.ltu p6,p7=1,num };;
-{ .mmi; $LDW r36=[r8],-4*$SZ
- $LDW r37=[r9],-4*$SZ
-(p6) add Ktbl=-$SZ*$rounds,Ktbl }
-{ .mmi; $LDW r38=[r10],-4*$SZ
- $LDW r39=[r11],-4*$SZ
-(p7) mov ar.lc=r3 };;
-{ .mmi; add A=A,r32
- add B=B,r33
- add C=C,r34 }
-{ .mmi; add D=D,r35
- add E=E,r36
- add F=F,r37 };;
-{ .mmi; $STW [r8]=A,4*$SZ
- $STW [r9]=B,4*$SZ
- add G=G,r38 }
-{ .mmi; $STW [r10]=C,4*$SZ
- $STW [r11]=D,4*$SZ
- add H=H,r39 };;
-{ .mmi; $STW [r8]=E
- $STW [r9]=F
-(p6) add num=-1,num }
-{ .mmb; $STW [r10]=G
- $STW [r11]=H
-(p6) br.dptk.many .L_outer };;
-
-{ .mib; mov pr=prsave,0x1ffff
- br.ret.sptk.many b0 };;
-.endp $func#
-___
-
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/_rotr(\s+)([^=]+)=([^,]+),([0-9]+)/shrp$1$2=$3,$3,$4/gm;
-if ($BITS==64) {
- $code =~ s/mux2(\s+)\S+/nop.i$1 0x0/gm;
- $code =~ s/mux1(\s+)\S+/nop.i$1 0x0/gm if ($big_endian);
-}
-
-print $code;
-
-print<<___ if ($BITS==32);
-.align 64
-.type K256#,\@object
-K256: data4 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- data4 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- data4 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- data4 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- data4 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- data4 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- data4 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- data4 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- data4 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- data4 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- data4 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- data4 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- data4 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- data4 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- data4 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- data4 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-.size K256#,$SZ*$rounds
-___
-print<<___ if ($BITS==64);
-.align 64
-.type K512#,\@object
-K512: data8 0x428a2f98d728ae22,0x7137449123ef65cd
- data8 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
- data8 0x3956c25bf348b538,0x59f111f1b605d019
- data8 0x923f82a4af194f9b,0xab1c5ed5da6d8118
- data8 0xd807aa98a3030242,0x12835b0145706fbe
- data8 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
- data8 0x72be5d74f27b896f,0x80deb1fe3b1696b1
- data8 0x9bdc06a725c71235,0xc19bf174cf692694
- data8 0xe49b69c19ef14ad2,0xefbe4786384f25e3
- data8 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
- data8 0x2de92c6f592b0275,0x4a7484aa6ea6e483
- data8 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
- data8 0x983e5152ee66dfab,0xa831c66d2db43210
- data8 0xb00327c898fb213f,0xbf597fc7beef0ee4
- data8 0xc6e00bf33da88fc2,0xd5a79147930aa725
- data8 0x06ca6351e003826f,0x142929670a0e6e70
- data8 0x27b70a8546d22ffc,0x2e1b21385c26c926
- data8 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
- data8 0x650a73548baf63de,0x766a0abb3c77b2a8
- data8 0x81c2c92e47edaee6,0x92722c851482353b
- data8 0xa2bfe8a14cf10364,0xa81a664bbc423001
- data8 0xc24b8b70d0f89791,0xc76c51a30654be30
- data8 0xd192e819d6ef5218,0xd69906245565a910
- data8 0xf40e35855771202a,0x106aa07032bbd1b8
- data8 0x19a4c116b8d2d0c8,0x1e376c085141ab53
- data8 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
- data8 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
- data8 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
- data8 0x748f82ee5defb2fc,0x78a5636f43172f60
- data8 0x84c87814a1f0ab72,0x8cc702081a6439ec
- data8 0x90befffa23631e28,0xa4506cebde82bde9
- data8 0xbef9a3f7b2c67915,0xc67178f2e372532b
- data8 0xca273eceea26619c,0xd186b8c721c0c207
- data8 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
- data8 0x06f067aa72176fba,0x0a637dc5a2c898a6
- data8 0x113f9804bef90dae,0x1b710b35131c471b
- data8 0x28db77f523047d84,0x32caab7b40c72493
- data8 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
- data8 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
- data8 0x5fcb6fab3ad6faec,0x6c44198c4a475817
-.size K512#,$SZ*$rounds
-___
diff --git a/openssl/trunk/crypto/sha/asm/sha512-sse2.pl b/openssl/trunk/crypto/sha/asm/sha512-sse2.pl
deleted file mode 100644
index 10902bf6..00000000
--- a/openssl/trunk/crypto/sha/asm/sha512-sse2.pl
+++ /dev/null
@@ -1,404 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. Rights for redistribution and usage in source and binary
-# forms are granted according to the OpenSSL license.
-# ====================================================================
-#
-# SHA512_Transform_SSE2.
-#
-# As the name suggests, this is an IA-32 SSE2 implementation of
-# SHA512_Transform. Motivating factor for the undertaken effort was that
-# SHA512 was observed to *consistently* perform *significantly* poorer
-# than SHA256 [2x and slower is common] on 32-bit platforms. On 64-bit
-# platforms on the other hand SHA512 tends to outperform SHA256 [~50%
-# seem to be common improvement factor]. All this is perfectly natural,
-# as SHA512 is a 64-bit algorithm. But isn't IA-32 SSE2 essentially
-# a 64-bit instruction set? Is it rich enough to implement SHA512?
-# If answer was "no," then you wouldn't have been reading this...
-#
-# Throughput performance in MBps (larger is better):
-#
-# 2.4GHz P4 1.4GHz AMD32 1.4GHz AMD64(*)
-# SHA256/gcc(*) 54 43 59
-# SHA512/gcc 17 23 92
-# SHA512/sse2 61(**) 57(**)
-# SHA512/icc 26 28
-# SHA256/icc(*) 65 54
-#
-# (*) AMD64 and SHA256 numbers are presented mostly for amusement or
-# reference purposes.
-# (**) I.e. it gives ~2-3x speed-up if compared with compiler generated
-# code. One can argue that hand-coded *non*-SSE2 implementation
-# would perform better than compiler generated one as well, and
-# that comparison is therefore not exactly fair. Well, as SHA512
-# puts enormous pressure on IA-32 GP register bank, I reckon that
-# hand-coded version wouldn't perform significantly better than
-# one compiled with icc, ~20% perhaps... So that this code would
-# still outperform it with distinguishing marginal. But feel free
-# to prove me wrong:-)
-# <appro@fy.chalmers.se>
-push(@INC,"perlasm","../../perlasm");
-require "x86asm.pl";
-
-&asm_init($ARGV[0],"sha512-sse2.pl",$ARGV[$#ARGV] eq "386");
-
-$K512="esi"; # K512[80] table, found at the end...
-#$W512="esp"; # $W512 is not just W512[16]: it comprises *two* copies
- # of W512[16] and a copy of A-H variables...
-$W512_SZ=8*(16+16+8); # see above...
-#$Kidx="ebx"; # index in K512 table, advances from 0 to 80...
-$Widx="edx"; # index in W512, wraps around at 16...
-$data="edi"; # 16 qwords of input data...
-$A="mm0"; # B-D and
-$E="mm1"; # F-H are allocated dynamically...
-$Aoff=256+0; # A-H offsets relative to $W512...
-$Boff=256+8;
-$Coff=256+16;
-$Doff=256+24;
-$Eoff=256+32;
-$Foff=256+40;
-$Goff=256+48;
-$Hoff=256+56;
-
-sub SHA2_ROUND()
-{ local ($kidx,$widx)=@_; # registers used to index K512[] and W512[] respectively
- # Emits one SHA-512 round: a and e live in $A/$E, b-d and f-h in memory at $W512.
- # One can argue that one could reorder instructions for better
- # performance. Well, I tried and it doesn't seem to make any
- # noticeable difference. Modern out-of-order execution cores
- # reorder instructions to their liking in either case and they
- # apparently do decent job. So we can keep the code more
- # readable/regular/comprehensible:-)
-
- # I adhere to 64-bit %mmX registers in order to avoid/not care
- # about #GP exceptions on misaligned 128-bit access, most
- # notably in paddq with memory operand. Not to mention that
- # SSE2 instructions operating on %mmX can be scheduled every
- # cycle [and not every second one if operating on %xmmN].
-
- &movq ("mm4",&QWP($Foff,$W512)); # load f
- &movq ("mm5",&QWP($Goff,$W512)); # load g
- &movq ("mm6",&QWP($Hoff,$W512)); # load h
-
- &movq ("mm2",$E); # %mm2 is sliding right
- &movq ("mm3",$E); # %mm3 is sliding left
- &psrlq ("mm2",14); # e>>14
- &psllq ("mm3",23); # e<<23
- &movq ("mm7","mm2"); # %mm7 is T1
- &pxor ("mm7","mm3");
- &psrlq ("mm2",4); # e>>18 (cumulative)
- &psllq ("mm3",23); # e<<46 (cumulative)
- &pxor ("mm7","mm2");
- &pxor ("mm7","mm3");
- &psrlq ("mm2",23); # e>>41 (cumulative)
- &psllq ("mm3",4); # e<<50 (cumulative)
- &pxor ("mm7","mm2");
- &pxor ("mm7","mm3"); # T1=Sigma1_512(e)
-
- &movq (&QWP($Foff,$W512),$E); # f = e
- &movq (&QWP($Goff,$W512),"mm4"); # g = f
- &movq (&QWP($Hoff,$W512),"mm5"); # h = g
-
- &pxor ("mm4","mm5"); # f^=g
- &pand ("mm4",$E); # f&=e
- &pxor ("mm4","mm5"); # f^=g, i.e. %mm4=((f^g)&e)^g
- &paddq ("mm7","mm4"); # T1+=Ch(e,f,g)
-
- &movq ("mm2",&QWP($Boff,$W512)); # load b
- &movq ("mm3",&QWP($Coff,$W512)); # load c
- &movq ($E,&QWP($Doff,$W512)); # e = d
-
- &paddq ("mm7","mm6"); # T1+=h
- &paddq ("mm7",&QWP(0,$K512,$kidx,8)); # T1+=K512[i]
- &paddq ("mm7",&QWP(0,$W512,$widx,8)); # T1+=W512[i]
- &paddq ($E,"mm7"); # e += T1
-
- &movq ("mm4",$A); # %mm4 is sliding right
- &movq ("mm5",$A); # %mm5 is sliding left
- &psrlq ("mm4",28); # a>>28
- &psllq ("mm5",25); # a<<25
- &movq ("mm6","mm4"); # %mm6 is T2
- &pxor ("mm6","mm5");
- &psrlq ("mm4",6); # a>>34 (cumulative)
- &psllq ("mm5",5); # a<<30 (cumulative)
- &pxor ("mm6","mm4");
- &pxor ("mm6","mm5");
- &psrlq ("mm4",5); # a>>39 (cumulative)
- &psllq ("mm5",6); # a<<36 (cumulative)
- &pxor ("mm6","mm4");
- &pxor ("mm6","mm5"); # T2=Sigma0_512(a)
-
- &movq (&QWP($Boff,$W512),$A); # b = a
- &movq (&QWP($Coff,$W512),"mm2"); # c = b
- &movq (&QWP($Doff,$W512),"mm3"); # d = c
-
- &movq ("mm4",$A); # %mm4=a
- &por ($A,"mm3"); # a=a|c
- &pand ("mm4","mm3"); # %mm4=a&c
- &pand ($A,"mm2"); # a=(a|c)&b
- &por ("mm4",$A); # %mm4=(a&c)|((a|c)&b)
- &paddq ("mm6","mm4"); # T2+=Maj(a,b,c)
-
- &movq ($A,"mm7"); # a=T1
- &paddq ($A,"mm6"); # a+=T2
-}
-
-$func="sha512_block_sse2";
-
-&function_begin_B($func);
- if (0) {# Caller is expected to check if it's appropriate to
- # call this routine. Below 3 lines are retained for
- # debugging purposes...
- &picmeup("eax","OPENSSL_ia32cap");
- &bt (&DWP(0,"eax"),26);
- &jnc ("SHA512_Transform");
- }
-
- &push ("ebp");
- &mov ("ebp","esp");
- &push ("ebx");
- &push ("esi");
- &push ("edi");
-
- &mov ($Widx,&DWP(8,"ebp")); # A-H state, 1st arg
- &mov ($data,&DWP(12,"ebp")); # input data, 2nd arg
- &call (&label("pic_point")); # make it PIC!
-&set_label("pic_point");
- &blindpop($K512);
- &lea ($K512,&DWP(&label("K512")."-".&label("pic_point"),$K512));
-
- $W512 = "esp"; # start using %esp as W512
- &sub ($W512,$W512_SZ);
- &and ($W512,-16); # ensure 128-bit alignment
-
- # make private copy of A-H
- # v assume the worst and stick to unaligned load
- &movdqu ("xmm0",&QWP(0,$Widx));
- &movdqu ("xmm1",&QWP(16,$Widx));
- &movdqu ("xmm2",&QWP(32,$Widx));
- &movdqu ("xmm3",&QWP(48,$Widx));
-
-&align(8);
-&set_label("_chunk_loop");
-
- &movdqa (&QWP($Aoff,$W512),"xmm0"); # a,b
- &movdqa (&QWP($Coff,$W512),"xmm1"); # c,d
- &movdqa (&QWP($Eoff,$W512),"xmm2"); # e,f
- &movdqa (&QWP($Goff,$W512),"xmm3"); # g,h
-
- &xor ($Widx,$Widx);
-
- &movdq2q($A,"xmm0"); # load a
- &movdq2q($E,"xmm2"); # load e
-
- # Why aren't loops unrolled? It makes sense to unroll if
- # execution time for loop body is comparable with branch
- # penalties and/or if whole data-set resides in register bank.
- # Neither is case here... Well, it would be possible to
- # eliminate few store operations, but it would hardly affect
- # so to say stop-watch performance, as there is a lot of
- # available memory slots to fill. It will only relieve some
- # pressure off memory bus...
-
- # flip input stream byte order...
- &mov ("eax",&DWP(0,$data,$Widx,8));
- &mov ("ebx",&DWP(4,$data,$Widx,8));
- &bswap ("eax");
- &bswap ("ebx");
- &mov (&DWP(0,$W512,$Widx,8),"ebx"); # W512[i]
- &mov (&DWP(4,$W512,$Widx,8),"eax");
- &mov (&DWP(128+0,$W512,$Widx,8),"ebx"); # copy of W512[i]
- &mov (&DWP(128+4,$W512,$Widx,8),"eax");
-
-&align(8);
-&set_label("_1st_loop"); # 0-15
- # flip input stream byte order of the *next* word, W512[i+1]...
- &mov ("eax",&DWP(0+8,$data,$Widx,8));
- &mov ("ebx",&DWP(4+8,$data,$Widx,8));
- &bswap ("eax");
- &bswap ("ebx");
- &mov (&DWP(0+8,$W512,$Widx,8),"ebx"); # W512[i+1]
- &mov (&DWP(4+8,$W512,$Widx,8),"eax");
- &mov (&DWP(128+0+8,$W512,$Widx,8),"ebx"); # copy of W512[i+1]
- &mov (&DWP(128+4+8,$W512,$Widx,8),"eax");
-&set_label("_1st_looplet");
- &SHA2_ROUND($Widx,$Widx); &inc($Widx);
-# i<15: fetch the next word and loop; i==15: final round, nothing left to fetch
-&cmp ($Widx,15); # terminate the statement: it used to fuse with &jl via binary '&'
-&jl (&label("_1st_loop"));
-&je (&label("_1st_looplet")); # playing similar trick on 2nd loop
- # does not improve performance...
-
- $Kidx = "ebx"; # start using %ebx as Kidx
- &mov ($Kidx,$Widx);
-
-&align(8);
-&set_label("_2nd_loop"); # 16-79
- &and($Widx,0xf);
-
- # 128-bit fragment! I update W512[i] and W512[i+1] in
- # parallel:-) Note that I refer to W512[(i&0xf)+N] and not to
- # W512[(i+N)&0xf]! This is exactly what I maintain the second
- # copy of W512[16] for...
- &movdqu ("xmm0",&QWP(8*1,$W512,$Widx,8)); # s0=W512[i+1]
- &movdqa ("xmm2","xmm0"); # %xmm2 is sliding right
- &movdqa ("xmm3","xmm0"); # %xmm3 is sliding left
- &psrlq ("xmm2",1);
- &psllq ("xmm3",56);
- &movdqa ("xmm0","xmm2");
- &pxor ("xmm0","xmm3");
- &psrlq ("xmm2",6);
- &psllq ("xmm3",7);
- &pxor ("xmm0","xmm2");
- &pxor ("xmm0","xmm3");
- &psrlq ("xmm2",1);
- &pxor ("xmm0","xmm2"); # s0 = sigma0_512(s0);
-
- &movdqa ("xmm1",&QWP(8*14,$W512,$Widx,8)); # s1=W512[i+14]
- &movdqa ("xmm4","xmm1"); # %xmm4 is sliding right
- &movdqa ("xmm5","xmm1"); # %xmm5 is sliding left
- &psrlq ("xmm4",6);
- &psllq ("xmm5",3);
- &movdqa ("xmm1","xmm4");
- &pxor ("xmm1","xmm5");
- &psrlq ("xmm4",13);
- &psllq ("xmm5",42);
- &pxor ("xmm1","xmm4");
- &pxor ("xmm1","xmm5");
- &psrlq ("xmm4",42);
- &pxor ("xmm1","xmm4"); # s1 = sigma1_512(s1);
-
- # + have to explicitly load W512[i+9] as it's not 128-bit
- # v aligned and paddq would throw an exception...
- &movdqu ("xmm6",&QWP(8*9,$W512,$Widx,8));
- &paddq ("xmm0","xmm1"); # s0 += s1
- &paddq ("xmm0","xmm6"); # s0 += W512[i+9]
- &paddq ("xmm0",&QWP(0,$W512,$Widx,8)); # s0 += W512[i]
-
- &movdqa (&QWP(0,$W512,$Widx,8),"xmm0"); # W512[i] = s0
- &movdqa (&QWP(16*8,$W512,$Widx,8),"xmm0"); # copy of W512[i]
-
- # as the above fragment was 128-bit, we "owe" 2 rounds...
- &SHA2_ROUND($Kidx,$Widx); &inc($Kidx); &inc($Widx);
- &SHA2_ROUND($Kidx,$Widx); &inc($Kidx); &inc($Widx);
-
-&cmp ($Kidx,80);
-&jl (&label("_2nd_loop"));
-
- # update A-H state
- &mov ($Widx,&DWP(8,"ebp")); # A-H state, 1st arg
- &movq (&QWP($Aoff,$W512),$A); # write out a
- &movq (&QWP($Eoff,$W512),$E); # write out e
- &movdqu ("xmm0",&QWP(0,$Widx));
- &movdqu ("xmm1",&QWP(16,$Widx));
- &movdqu ("xmm2",&QWP(32,$Widx));
- &movdqu ("xmm3",&QWP(48,$Widx));
- &paddq ("xmm0",&QWP($Aoff,$W512)); # 128-bit additions...
- &paddq ("xmm1",&QWP($Coff,$W512));
- &paddq ("xmm2",&QWP($Eoff,$W512));
- &paddq ("xmm3",&QWP($Goff,$W512));
- &movdqu (&QWP(0,$Widx),"xmm0");
- &movdqu (&QWP(16,$Widx),"xmm1");
- &movdqu (&QWP(32,$Widx),"xmm2");
- &movdqu (&QWP(48,$Widx),"xmm3");
-
-&add ($data,16*8); # advance input data pointer
-&dec (&DWP(16,"ebp")); # decrement 3rd arg
-&jnz (&label("_chunk_loop"));
-
- # epilogue
- &emms (); # required for at least ELF and Win32 ABIs
- &mov ("edi",&DWP(-12,"ebp"));
- &mov ("esi",&DWP(-8,"ebp"));
- &mov ("ebx",&DWP(-4,"ebp"));
- &leave ();
-&ret ();
-
-&align(64);
-&set_label("K512"); # Yes! I keep it in the code segment!
- &data_word(0xd728ae22,0x428a2f98); # u64
- &data_word(0x23ef65cd,0x71374491); # u64
- &data_word(0xec4d3b2f,0xb5c0fbcf); # u64
- &data_word(0x8189dbbc,0xe9b5dba5); # u64
- &data_word(0xf348b538,0x3956c25b); # u64
- &data_word(0xb605d019,0x59f111f1); # u64
- &data_word(0xaf194f9b,0x923f82a4); # u64
- &data_word(0xda6d8118,0xab1c5ed5); # u64
- &data_word(0xa3030242,0xd807aa98); # u64
- &data_word(0x45706fbe,0x12835b01); # u64
- &data_word(0x4ee4b28c,0x243185be); # u64
- &data_word(0xd5ffb4e2,0x550c7dc3); # u64
- &data_word(0xf27b896f,0x72be5d74); # u64
- &data_word(0x3b1696b1,0x80deb1fe); # u64
- &data_word(0x25c71235,0x9bdc06a7); # u64
- &data_word(0xcf692694,0xc19bf174); # u64
- &data_word(0x9ef14ad2,0xe49b69c1); # u64
- &data_word(0x384f25e3,0xefbe4786); # u64
- &data_word(0x8b8cd5b5,0x0fc19dc6); # u64
- &data_word(0x77ac9c65,0x240ca1cc); # u64
- &data_word(0x592b0275,0x2de92c6f); # u64
- &data_word(0x6ea6e483,0x4a7484aa); # u64
- &data_word(0xbd41fbd4,0x5cb0a9dc); # u64
- &data_word(0x831153b5,0x76f988da); # u64
- &data_word(0xee66dfab,0x983e5152); # u64
- &data_word(0x2db43210,0xa831c66d); # u64
- &data_word(0x98fb213f,0xb00327c8); # u64
- &data_word(0xbeef0ee4,0xbf597fc7); # u64
- &data_word(0x3da88fc2,0xc6e00bf3); # u64
- &data_word(0x930aa725,0xd5a79147); # u64
- &data_word(0xe003826f,0x06ca6351); # u64
- &data_word(0x0a0e6e70,0x14292967); # u64
- &data_word(0x46d22ffc,0x27b70a85); # u64
- &data_word(0x5c26c926,0x2e1b2138); # u64
- &data_word(0x5ac42aed,0x4d2c6dfc); # u64
- &data_word(0x9d95b3df,0x53380d13); # u64
- &data_word(0x8baf63de,0x650a7354); # u64
- &data_word(0x3c77b2a8,0x766a0abb); # u64
- &data_word(0x47edaee6,0x81c2c92e); # u64
- &data_word(0x1482353b,0x92722c85); # u64
- &data_word(0x4cf10364,0xa2bfe8a1); # u64
- &data_word(0xbc423001,0xa81a664b); # u64
- &data_word(0xd0f89791,0xc24b8b70); # u64
- &data_word(0x0654be30,0xc76c51a3); # u64
- &data_word(0xd6ef5218,0xd192e819); # u64
- &data_word(0x5565a910,0xd6990624); # u64
- &data_word(0x5771202a,0xf40e3585); # u64
- &data_word(0x32bbd1b8,0x106aa070); # u64
- &data_word(0xb8d2d0c8,0x19a4c116); # u64
- &data_word(0x5141ab53,0x1e376c08); # u64
- &data_word(0xdf8eeb99,0x2748774c); # u64
- &data_word(0xe19b48a8,0x34b0bcb5); # u64
- &data_word(0xc5c95a63,0x391c0cb3); # u64
- &data_word(0xe3418acb,0x4ed8aa4a); # u64
- &data_word(0x7763e373,0x5b9cca4f); # u64
- &data_word(0xd6b2b8a3,0x682e6ff3); # u64
- &data_word(0x5defb2fc,0x748f82ee); # u64
- &data_word(0x43172f60,0x78a5636f); # u64
- &data_word(0xa1f0ab72,0x84c87814); # u64
- &data_word(0x1a6439ec,0x8cc70208); # u64
- &data_word(0x23631e28,0x90befffa); # u64
- &data_word(0xde82bde9,0xa4506ceb); # u64
- &data_word(0xb2c67915,0xbef9a3f7); # u64
- &data_word(0xe372532b,0xc67178f2); # u64
- &data_word(0xea26619c,0xca273ece); # u64
- &data_word(0x21c0c207,0xd186b8c7); # u64
- &data_word(0xcde0eb1e,0xeada7dd6); # u64
- &data_word(0xee6ed178,0xf57d4f7f); # u64
- &data_word(0x72176fba,0x06f067aa); # u64
- &data_word(0xa2c898a6,0x0a637dc5); # u64
- &data_word(0xbef90dae,0x113f9804); # u64
- &data_word(0x131c471b,0x1b710b35); # u64
- &data_word(0x23047d84,0x28db77f5); # u64
- &data_word(0x40c72493,0x32caab7b); # u64
- &data_word(0x15c9bebc,0x3c9ebe0a); # u64
- &data_word(0x9c100d4c,0x431d67c4); # u64
- &data_word(0xcb3e42b6,0x4cc5d4be); # u64
- &data_word(0xfc657e2a,0x597f299c); # u64
- &data_word(0x3ad6faec,0x5fcb6fab); # u64
- &data_word(0x4a475817,0x6c44198c); # u64
-
-&function_end_B($func);
-
-&asm_finish();