File cpuminer-2.5.1.obscpio of Package cpuminer

File cpuminer-2.5.1/AUTHORS

Jeff Garzik <jgarzik@pobox.com>

ArtForz

pooler <pooler@litecoinpool.org>

File cpuminer-2.5.1/COPYING

		    GNU GENERAL PUBLIC LICENSE
		       Version 2, June 1991

 Copyright (C) 1989, 1991 Free Software Foundation, Inc.
     59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.

			    Preamble

  The licenses for most software are designed to take away your
freedom to share and change it.  By contrast, the GNU General Public
License is intended to guarantee your freedom to share and change free
software--to make sure the software is free for all its users.  This
General Public License applies to most of the Free Software
Foundation's software and to any other program whose authors commit to
using it.  (Some other Free Software Foundation software is covered by
the GNU Library General Public License instead.)  You can apply it to
your programs, too.

  When we speak of free software, we are referring to freedom, not
price.  Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
this service if you wish), that you receive source code or can get it
if you want it, that you can change the software or use pieces of it
in new free programs; and that you know you can do these things.

  To protect your rights, we need to make restrictions that forbid
anyone to deny you these rights or to ask you to surrender the rights.
These restrictions translate to certain responsibilities for you if you
distribute copies of the software, or if you modify it.

  For example, if you distribute copies of such a program, whether
gratis or for a fee, you must give the recipients all the rights that
you have.  You must make sure that they, too, receive or can get the
source code.  And you must show them these terms so they know their
rights.

  We protect your rights with two steps: (1) copyright the software, and
(2) offer you this license which gives you legal permission to copy,
distribute and/or modify the software.

  Also, for each author's protection and ours, we want to make certain
that everyone understands that there is no warranty for this free
software.  If the software is modified by someone else and passed on, we
want its recipients to know that what they have is not the original, so
that any problems introduced by others will not reflect on the original
authors' reputations.

  Finally, any free program is threatened constantly by software
patents.  We wish to avoid the danger that redistributors of a free
program will individually obtain patent licenses, in effect making the
program proprietary.  To prevent this, we have made it clear that any
patent must be licensed for everyone's free use or not licensed at all.

  The precise terms and conditions for copying, distribution and
modification follow.

		    GNU GENERAL PUBLIC LICENSE
   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

  0. This License applies to any program or other work which contains
a notice placed by the copyright holder saying it may be distributed
under the terms of this General Public License.  The "Program", below,
refers to any such program or work, and a "work based on the Program"
means either the Program or any derivative work under copyright law:
that is to say, a work containing the Program or a portion of it,
either verbatim or with modifications and/or translated into another
language.  (Hereinafter, translation is included without limitation in
the term "modification".)  Each licensee is addressed as "you".

Activities other than copying, distribution and modification are not
covered by this License; they are outside its scope.  The act of
running the Program is not restricted, and the output from the Program
is covered only if its contents constitute a work based on the
Program (independent of having been made by running the Program).
Whether that is true depends on what the Program does.

  1. You may copy and distribute verbatim copies of the Program's
source code as you receive it, in any medium, provided that you
conspicuously and appropriately publish on each copy an appropriate
copyright notice and disclaimer of warranty; keep intact all the
notices that refer to this License and to the absence of any warranty;
and give any other recipients of the Program a copy of this License
along with the Program.

You may charge a fee for the physical act of transferring a copy, and
you may at your option offer warranty protection in exchange for a fee.

  2. You may modify your copy or copies of the Program or any portion
of it, thus forming a work based on the Program, and copy and
distribute such modifications or work under the terms of Section 1
above, provided that you also meet all of these conditions:

    a) You must cause the modified files to carry prominent notices
    stating that you changed the files and the date of any change.

    b) You must cause any work that you distribute or publish, that in
    whole or in part contains or is derived from the Program or any
    part thereof, to be licensed as a whole at no charge to all third
    parties under the terms of this License.

    c) If the modified program normally reads commands interactively
    when run, you must cause it, when started running for such
    interactive use in the most ordinary way, to print or display an
    announcement including an appropriate copyright notice and a
    notice that there is no warranty (or else, saying that you provide
    a warranty) and that users may redistribute the program under
    these conditions, and telling the user how to view a copy of this
    License.  (Exception: if the Program itself is interactive but
    does not normally print such an announcement, your work based on
    the Program is not required to print an announcement.)

These requirements apply to the modified work as a whole.  If
identifiable sections of that work are not derived from the Program,
and can be reasonably considered independent and separate works in
themselves, then this License, and its terms, do not apply to those
sections when you distribute them as separate works.  But when you
distribute the same sections as part of a whole which is a work based
on the Program, the distribution of the whole must be on the terms of
this License, whose permissions for other licensees extend to the
entire whole, and thus to each and every part regardless of who wrote it.

Thus, it is not the intent of this section to claim rights or contest
your rights to work written entirely by you; rather, the intent is to
exercise the right to control the distribution of derivative or
collective works based on the Program.

In addition, mere aggregation of another work not based on the Program
with the Program (or with a work based on the Program) on a volume of
a storage or distribution medium does not bring the other work under
the scope of this License.

  3. You may copy and distribute the Program (or a work based on it,
under Section 2) in object code or executable form under the terms of
Sections 1 and 2 above provided that you also do one of the following:

    a) Accompany it with the complete corresponding machine-readable
    source code, which must be distributed under the terms of Sections
    1 and 2 above on a medium customarily used for software interchange; or,

    b) Accompany it with a written offer, valid for at least three
    years, to give any third party, for a charge no more than your
    cost of physically performing source distribution, a complete
    machine-readable copy of the corresponding source code, to be
    distributed under the terms of Sections 1 and 2 above on a medium
    customarily used for software interchange; or,

    c) Accompany it with the information you received as to the offer
    to distribute corresponding source code.  (This alternative is
    allowed only for noncommercial distribution and only if you
    received the program in object code or executable form with such
    an offer, in accord with Subsection b above.)

The source code for a work means the preferred form of the work for
making modifications to it.  For an executable work, complete source
code means all the source code for all modules it contains, plus any
associated interface definition files, plus the scripts used to
control compilation and installation of the executable.  However, as a
special exception, the source code distributed need not include
anything that is normally distributed (in either source or binary
form) with the major components (compiler, kernel, and so on) of the
operating system on which the executable runs, unless that component
itself accompanies the executable.

If distribution of executable or object code is made by offering
access to copy from a designated place, then offering equivalent
access to copy the source code from the same place counts as
distribution of the source code, even though third parties are not
compelled to copy the source along with the object code.

  4. You may not copy, modify, sublicense, or distribute the Program
except as expressly provided under this License.  Any attempt
otherwise to copy, modify, sublicense or distribute the Program is
void, and will automatically terminate your rights under this License.
However, parties who have received copies, or rights, from you under
this License will not have their licenses terminated so long as such
parties remain in full compliance.

  5. You are not required to accept this License, since you have not
signed it.  However, nothing else grants you permission to modify or
distribute the Program or its derivative works.  These actions are
prohibited by law if you do not accept this License.  Therefore, by
modifying or distributing the Program (or any work based on the
Program), you indicate your acceptance of this License to do so, and
all its terms and conditions for copying, distributing or modifying
the Program or works based on it.

  6. Each time you redistribute the Program (or any work based on the
Program), the recipient automatically receives a license from the
original licensor to copy, distribute or modify the Program subject to
these terms and conditions.  You may not impose any further
restrictions on the recipients' exercise of the rights granted herein.
You are not responsible for enforcing compliance by third parties to
this License.

  7. If, as a consequence of a court judgment or allegation of patent
infringement or for any other reason (not limited to patent issues),
conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License.  If you cannot
distribute so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you
may not distribute the Program at all.  For example, if a patent
license would not permit royalty-free redistribution of the Program by
all those who receive copies directly or indirectly through you, then
the only way you could satisfy both it and this License would be to
refrain entirely from distribution of the Program.

If any portion of this section is held invalid or unenforceable under
any particular circumstance, the balance of the section is intended to
apply and the section as a whole is intended to apply in other
circumstances.

It is not the purpose of this section to induce you to infringe any
patents or other property right claims or to contest validity of any
such claims; this section has the sole purpose of protecting the
integrity of the free software distribution system, which is
implemented by public license practices.  Many people have made
generous contributions to the wide range of software distributed
through that system in reliance on consistent application of that
system; it is up to the author/donor to decide if he or she is willing
to distribute software through any other system and a licensee cannot
impose that choice.

This section is intended to make thoroughly clear what is believed to
be a consequence of the rest of this License.

  8. If the distribution and/or use of the Program is restricted in
certain countries either by patents or by copyrighted interfaces, the
original copyright holder who places the Program under this License
may add an explicit geographical distribution limitation excluding
those countries, so that distribution is permitted only in or among
countries not thus excluded.  In such case, this License incorporates
the limitation as if written in the body of this License.

  9. The Free Software Foundation may publish revised and/or new versions
of the General Public License from time to time.  Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.

Each version is given a distinguishing version number.  If the Program
specifies a version number of this License which applies to it and "any
later version", you have the option of following the terms and conditions
either of that version or of any later version published by the Free
Software Foundation.  If the Program does not specify a version number of
this License, you may choose any version ever published by the Free Software
Foundation.

  10. If you wish to incorporate parts of the Program into other free
programs whose distribution conditions are different, write to the author
to ask for permission.  For software which is copyrighted by the Free
Software Foundation, write to the Free Software Foundation; we sometimes
make exceptions for this.  Our decision will be guided by the two goals
of preserving the free status of all derivatives of our free software and
of promoting the sharing and reuse of software generally.

			    NO WARRANTY

  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
REPAIR OR CORRECTION.

  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
POSSIBILITY OF SUCH DAMAGES.

		     END OF TERMS AND CONDITIONS

	    How to Apply These Terms to Your New Programs

  If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.

  To do so, attach the following notices to the program.  It is safest
to attach them to the start of each source file to most effectively
convey the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.

    <one line to give the program's name and a brief idea of what it does.>
    Copyright (C) <year>  <name of author>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA


Also add information on how to contact you by electronic and paper mail.

If the program is interactive, make it output a short notice like this
when it starts in an interactive mode:

    Gnomovision version 69, Copyright (C) year  name of author
    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
    This is free software, and you are welcome to redistribute it
    under certain conditions; type `show c' for details.

The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License.  Of course, the commands you use may
be called something other than `show w' and `show c'; they could even be
mouse-clicks or menu items--whatever suits your program.

You should also get your employer (if you work as a programmer) or your
school, if any, to sign a "copyright disclaimer" for the program, if
necessary.  Here is a sample; alter the names:

  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
  `Gnomovision' (which makes passes at compilers) written by James Hacker.

  <signature of Ty Coon>, 1 April 1989
  Ty Coon, President of Vice

This General Public License does not permit incorporating your program into
proprietary programs.  If your program is a subroutine library, you may
consider it more useful to permit linking proprietary applications with the
library.  If this is what you want to do, use the GNU Library General
Public License instead of this License.

File cpuminer-2.5.1/ChangeLog

See git repository ('git log') for full changelog.

File cpuminer-2.5.1/Dockerfile

#
# Dockerfile for cpuminer
# usage: docker run creack/cpuminer --url xxxx --user xxxx --pass xxxx
# ex: docker run creack/cpuminer --url stratum+tcp://ltc.pool.com:80 --user creack.worker1 --pass abcdef
#
#

FROM            ubuntu:16.04
MAINTAINER      Guillaume J. Charmes <guillaume@charmes.net>

RUN             apt-get update -qq && \
                apt-get install -qqy automake libcurl4-openssl-dev git make gcc

RUN             git clone https://github.com/pooler/cpuminer

RUN             cd cpuminer && \
                ./autogen.sh && \
                ./configure CFLAGS="-O3" && \
                make

WORKDIR         /cpuminer
ENTRYPOINT      ["./minerd"]

File cpuminer-2.5.1/LICENSE

cpuminer is available under the terms of the GNU General Public License, version 2.

See COPYING for details.

File cpuminer-2.5.1/Makefile.am

if WANT_JANSSON
JANSSON_INCLUDES= -I$(top_srcdir)/compat/jansson
else
JANSSON_INCLUDES=
endif

EXTRA_DIST	= example-cfg.json nomacro.pl

SUBDIRS		= compat

bin_PROGRAMS	= minerd

dist_man_MANS	= minerd.1

minerd_SOURCES	= elist.h miner.h compat.h \
		  cpu-miner.c util.c \
		  sha2.c scrypt.c
if USE_ASM
if ARCH_x86
minerd_SOURCES += sha2-x86.S scrypt-x86.S
endif
if ARCH_x86_64
minerd_SOURCES += sha2-x64.S scrypt-x64.S
endif
if ARCH_ARM
minerd_SOURCES += sha2-arm.S scrypt-arm.S
endif
if ARCH_PPC
minerd_SOURCES += sha2-ppc.S scrypt-ppc.S
endif
endif
minerd_LDFLAGS	= $(PTHREAD_FLAGS)
minerd_LDADD	= @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@
minerd_CFLAGS	= -fno-strict-aliasing
minerd_CPPFLAGS	= @LIBCURL_CPPFLAGS@ $(JANSSON_INCLUDES) $(PTHREAD_FLAGS)

File cpuminer-2.5.1/NEWS

Version 2.5.1 - Jun 25, 2020

- Add support for bech32 addresses (BIP 173)

Version 2.5.0 - Jun 22, 2017

- Add Segwit support
- Add support for 64-bit PowerPC

Version 2.4.5 - Jun 10, 2016

- Fix a memory leak affecting long polling

Version 2.4.4 - Mar 24, 2016

- Fix memory leaks affecting getblocktemplate

Version 2.4.3 - Jan 24, 2016

- Add support for the VIA PadLock Hash Engine on x86-64
- Allow block version 4 when using getblocktemplate

Version 2.4.2 - Apr 28, 2015

- Add support for Stratum over TLS
- Allow block version 3 when using getblocktemplate

Version 2.4.1 - Feb 25, 2015

- Add support for scrypt(N, 1, 1)
- Add optimized PowerPC code for scrypt and SHA-256d
- Allow use of getblocktemplate with merge-mined cryptocurrencies
- Automatically switch to getwork if no payout address is provided
- Fix CVE-2014-6251

Version 2.4 - May 20, 2014

- Add support for the getblocktemplate RPC method (BIP 22)
- Allow tunnelling Stratum through HTTP proxies
- Add a --no-redirect option to ignore redirection requests
- Timeout for long polling is now disabled by default
- Fix CPU affinity on Linux (kiyominer)
- Add support for building under 64-bit Cygwin
- Expand version information with build details

Version 2.3.3 - Feb 27, 2014

- The --url option is now mandatory
- Do not switch to Stratum when using an HTTP proxy
- Fix scheduling policy change on Linux (clbr)
- Fix CPU affinity on FreeBSD (ache)
- Compatibility fixes for various platforms, including Solaris 8
  and old versions of OS X
- A man page for minerd is now available

Version 2.3.2 - Jul 10, 2013

- Add optimizations for AVX2-capable x86-64 processors
- Ensure that the output stream is flushed after every log message
- Fix an undefined-behavior bug in the Stratum code

Version 2.3.1 - Jun 18, 2013

- Add a --cert option for specifying an SSL certificate (martinwguy)
- Fix a bug that only made SHA-256d mining work at difficulty 1
- Fix a couple of compatibility issues with some Stratum servers

Version 2.3 - Jun 12, 2013

- Add support for the Stratum mining protocol
- Automatically switch to Stratum if the mining server supports
  the X-Stratum extension, unless --no-stratum is used
- Set CPU affinity on FreeBSD (lye)
- Fix a bug in libcurl initialization (martinwguy)

Version 2.2.3 - Aug 5, 2012

- Add optimized ARM NEON code for scrypt and SHA-256d
- Add a --benchmark option that allows offline testing
- Support for the X-Reject-Reason extension

Version 2.2.2 - Jun 7, 2012

- Various performance improvements for x86 and x86-64
- Optimize scrypt for ARMv5E and later processors
- Set the priority of miner threads to idle on Windows
- Add an option to start minerd as a daemon on POSIX systems

Version 2.2.1 - May 2, 2012

- Add optimized code for ARM processors
- Support for building on NetBSD and OpenBSD
- Various compatibility fixes for AIX (pontius)

Version 2.2 - Apr 2, 2012

- Add an optimized SHA-256d algorithm, with specialized code
  for x86 and x86-64 and support for AVX and XOP instructions
- Slight performance increase for scrypt on x86 and x86-64
- The default timeout is now 270 seconds

Version 2.1.5 - Mar 7, 2012

- Add optimizations for AVX-capable x86-64 processors
- Assume HTTP if no protocol is specified for the mining server
- Fix MinGW compatibility issues and update build instructions
- Add support for building on Solaris using gcc (pontius)

Version 2.1.4 - Feb 28, 2012

- Implement 4-way SHA-256 on x86-64
- Add TCP keepalive to long polling connections
- Support HTTP and SOCKS proxies via the --proxy option
- Username and password are no longer mandatory
- Add a script that makes assembly code compatible with old versions
  of the GNU assembler that do not support macros

Version 2.1.3 - Feb 12, 2012

- Smart handling of long polling failures: switch to short scan time
  if long polling fails, and only try to reactivate it if the server
  continues to advertise the feature in HTTP headers
- Add "X-Mining-Extensions: midstate" to HTTP headers (p2k)
- Add support for the "submitold" extension, used by p2pool
- It is now possible to specify username and password in the URL,
  like this: http://username:password@host:port/
- Add a --version option, and clean up --help output
- Avoid division by zero when computing hash rates
- Handle empty responses properly (TimothyA)
- Eliminate the delay between starting threads

Version 2.1.2 - Jan 26, 2012

- Do not submit work that is known to be stale
- Allow miner threads to ask for new work if the current one is at least
  45 seconds old and long polling is enabled
- Refresh work when long polling times out
- Fix minor speed regression
- Modify x86-64 code to make it compatible with older versions of binutils

Version 2.1.1 - Jan 20, 2012

- Handle network errors properly
- Make scantime retargeting more accurate

Version 2.1 - Jan 19, 2012

- Share the same work among all threads
- Do not ask for new work if the current one is not expired
- Do not discard the work returned by long polling

Version 2.0 - Jan 16, 2012

- Change default port to 9332 for Litecoin and remove default credentials
- Add 'scrypt' as the default algorithm and remove other algorithms (ArtForz)
- Optimize scrypt for x86 and x86-64
- Make scantime retargeting less granular (ArtForz)
- Test the whole hash instead of just looking at the high 32 bits
- Add configurable timeout, with a default of 180 seconds
- Add share summary output (inlikeflynn)
- Fix priority and CPU count detection on Windows
- Fix parameters -u and -p, and add short options -o and -O

Version 1.0.2 - Jun 13, 2011

- Linux x86_64 optimisations - Con Kolivas
- Optimise for x86_64 by default by using sse2_64 algo
- Detects CPUs and sets number of threads accordingly
- Uses CPU affinity for each thread where appropriate
- Sets scheduling policy to lowest possible
- Minor performance tweaks

Version 1.0.1 - May 14, 2011

- OSX support

Version 1.0 - May 9, 2011

- jansson 2.0 compatibility
- correct off-by-one in date (month) display output
- fix platform detection
- improve yasm configure bits
- support full URL, in X-Long-Polling header

Version 0.8.1 - March 22, 2011

- Make --user, --pass actually work

- Add User-Agent HTTP header to requests, so that server operators may
  more easily identify the miner client.

- Fix minor bug in example JSON config file

Version 0.8 - March 21, 2011

- Support long polling: http://deepbit.net/longpolling.php

- Adjust max workload based on scantime (default 5 seconds,
  or 60 seconds for longpoll)

- Standardize program output, and support syslog on Unix platforms

- Support --user/--pass options (and "user" and "pass" in config file),
  as an alternative to the current --userpass

Version 0.7.2 - March 14, 2011

- Add port of ufasoft's sse2 assembly implementation (Linux only)
  This is a substantial speed improvement on Intel CPUs.

- Move all JSON-RPC I/O to a separate thread.  This reduces the
  number of HTTP connections from one per thread to one, lowering resource
  usage on the upstream bitcoind / pool server.

Version 0.7.1 - March 2, 2011

- Add support for JSON-format configuration file.  See example
  file example-cfg.json.  Any long argument on the command line
  may be stored in the config file.
- Timestamp each solution found
- Improve sha256_4way performance.  NOTE: This optimization makes
  the 'hash' debug-print output for sha256_4way incorrect.
- Use __builtin_expect() intrinsic as compiler micro-optimization
- Build on Intel compiler
- HTTP library now follows HTTP redirects

Version 0.7 - February 12, 2011

- Re-use CURL object, thereby reusing DNS cache and HTTP connections
- Use bswap_32, if compiler intrinsic is not available
- Disable full target validation (as opposed to simply H==0) for now

Version 0.6.1 - February 4, 2011

- Fully validate "hash < target", rather than simply stopping our scan
  if the high 32 bits are 00000000.
- Add --retry-pause, to set length of pause time between failure retries
- Display proof-of-work hash and target, if -D (debug mode) enabled
- Fix max-nonce auto-adjustment to actually work.  This means if your
  scan takes longer than 5 seconds (--scantime), the miner will slowly
  reduce the number of hashes you work on, before fetching a new work unit.

Version 0.6 - January 29, 2011

- Fetch new work unit, if scanhash takes longer than 5 seconds (--scantime)
- BeeCee1's sha256 4way optimizations
- lfm's byte swap optimization (improves via, cryptopp)
- Fix non-working short options -q, -r

Version 0.5 - December 28, 2010

- Exit program, when all threads have exited
- Improve JSON-RPC failure diagnostics and resilience
- Add --quiet option, to disable hashmeter output.

Version 0.3.3 - December 27, 2010

- Critical fix for sha256_cryptopp 'cryptopp_asm' algo

Version 0.3.2 - December 23, 2010

- Critical fix for sha256_via

Version 0.3.1 - December 19, 2010

- Critical fix for sha256_via
- Retry JSON-RPC failures (see --retry, under "minerd --help" output)

Version 0.3 - December 18, 2010

- Add crypto++ 32bit assembly implementation
- show version upon 'minerd --help'
- work around gcc 4.5.x bug that killed 4way performance

Version 0.2.2 - December 6, 2010

- VIA padlock implementation works now
- Minor build and runtime fixes

Version 0.2.1 - November 29, 2010

- avoid buffer overflow when submitting solutions
- add Crypto++ sha256 implementation (C only, ASM elided for now)
- minor internal optimizations and cleanups

Version 0.2 - November 27, 2010

- Add script for building a Windows installer
- improve hash performance (hashmeter) statistics
- add tcatm 4way sha256 implementation
- Add experimental VIA Padlock sha256 implementation

Version 0.1.2 - November 26, 2010

- many small cleanups and micro-optimizations
- build win32 exe using mingw
- RPC URL, username/password become command line arguments
- remove unused OpenSSL dependency

Version 0.1.1 - November 24, 2010

- Do not build sha256_generic module separately from cpuminer.

Version 0.1 - November 24, 2010

- Initial release.

File cpuminer-2.5.1/README

This is a multi-threaded CPU miner for Litecoin and Bitcoin,
a fork of Jeff Garzik's reference cpuminer.

License: GPLv2.  See COPYING for details.

Downloads:  https://sourceforge.net/projects/cpuminer/files/
Git tree:   https://github.com/pooler/cpuminer

Dependencies:
	libcurl			http://curl.haxx.se/libcurl/
	jansson			http://www.digip.org/jansson/
		(jansson is included in-tree)

Basic *nix build instructions:
	./autogen.sh	# only needed if building from git repo
	./nomacro.pl	# in case the assembler doesn't support macros
	./configure CFLAGS="-O3" # make sure -O3 is an O and not a zero!
	make

Notes for AIX users:
	* To build a 64-bit binary, export OBJECT_MODE=64
	* GNU-style long options are not supported, but the same settings
	  are accessible via the configuration file

Basic Windows build instructions, using MinGW:
	Install MinGW and the MSYS Developer Tool Kit (http://www.mingw.org/)
		* Make sure you have mstcpip.h in MinGW\include
	If using MinGW-w64, install pthreads-w64
	Install libcurl devel (http://curl.haxx.se/download.html)
		* Make sure you have libcurl.m4 in MinGW\share\aclocal
		* Make sure you have curl-config in MinGW\bin
	In the MSYS shell, run:
		./autogen.sh	# only needed if building from git repo
		LIBCURL="-lcurldll" ./configure CFLAGS="-O3"
		make

Architecture-specific notes:
	ARM:	No runtime CPU detection. The miner can take advantage
		of some instructions specific to ARMv5E and later processors,
		but the decision whether to use them is made at compile time,
		based on compiler-defined macros.
		To use NEON instructions, add "-mfpu=neon" to CFLAGS.
	PowerPC: No runtime CPU detection.
		To use AltiVec instructions, add "-maltivec" to CFLAGS.
	x86:	The miner checks for SSE2 instructions support at runtime,
		and uses them if they are available.
	x86-64:	The miner can take advantage of AVX, AVX2 and XOP instructions,
		but only if both the CPU and the operating system support them.
		    * Linux supports AVX starting from kernel version 2.6.30.
		    * FreeBSD supports AVX starting with 9.1-RELEASE.
		    * Mac OS X added AVX support in the 10.6.8 update.
		    * Windows supports AVX starting from Windows 7 SP1 and
		      Windows Server 2008 R2 SP1.
		The configure script outputs a warning if the assembler
		doesn't support some instruction sets. In that case, the miner
		can still be built, but unavailable optimizations are left off.
		The miner uses the VIA Padlock Hash Engine where available.

Usage instructions:  Run "minerd --help" to see options.

Connecting through a proxy:  Use the --proxy option.
To use a SOCKS proxy, add a socks4:// or socks5:// prefix to the proxy host.
Protocols socks4a and socks5h, allowing remote name resolving, are also
available since libcurl 7.18.0.
If no protocol is specified, the proxy is assumed to be an HTTP proxy.
When the --proxy option is not used, the program honors the http_proxy
and all_proxy environment variables.
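
For example, to mine through a local SOCKS5 proxy (the pool URL,
credentials and proxy address below are placeholders):

	minerd --url stratum+tcp://pool.example.com:3333 \
	       --userpass myuser:mypass --proxy socks5://127.0.0.1:9050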

Many issues and FAQs are also covered in the forum thread
dedicated to this program:
	https://bitcointalk.org/index.php?topic=55038.0

File cpuminer-2.5.1/autogen.sh

#!/bin/sh

# You need autoconf 2.5x, preferably 2.57 or later
# You need automake 1.7 or later. 1.6 might work.

set -e

aclocal
autoheader
automake --gnu --add-missing --copy
autoconf

File cpuminer-2.5.1/compat/ (directory entry)

File cpuminer-2.5.1/compat.h

#ifndef __COMPAT_H__
#define __COMPAT_H__

#ifdef WIN32
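
/* Minimal Win32 compatibility shims for POSIX calls used by the miner.
   POSIX sleep() takes seconds, while the Win32 Sleep() call takes
   milliseconds, hence the conversion below. */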

#include <windows.h>

#define sleep(secs) Sleep((secs) * 1000)

enum {
	PRIO_PROCESS		= 0,
};

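/* Stand-in for POSIX setpriority(): the arguments are ignored and the
   calling thread is simply dropped to idle priority.  Returns 0 on
   success and -1 on failure, following the POSIX convention. */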
static inline int setpriority(int which, int who, int prio)
{
	return -!SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_IDLE);
}

#endif /* WIN32 */

#endif /* __COMPAT_H__ */

File cpuminer-2.5.1/compat/Makefile.am

if WANT_JANSSON
SUBDIRS	= jansson
else
SUBDIRS	=
endif

File cpuminer-2.5.1/compat/jansson/ (directory entry)

File cpuminer-2.5.1/compat/jansson/LICENSE

Copyright (c) 2009, 2010 Petri Lehtinen <petri@digip.org>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

File cpuminer-2.5.1/compat/jansson/Makefile.am

noinst_LIBRARIES	= libjansson.a

libjansson_a_SOURCES	= \
			  config.h		\
			  dump.c		\
			  hashtable.c		\
			  hashtable.h		\
			  jansson.h		\
			  jansson_private.h	\
			  load.c		\
			  strbuffer.c		\
			  strbuffer.h		\
			  utf.c			\
			  utf.h			\
			  util.h		\
			  value.c

File cpuminer-2.5.1/compat/jansson/config.h

/* config.h.  Generated from config.h.in by configure.  */
/* config.h.in.  Generated from configure.ac by autoheader.  */

/* Define to 1 if you have the <dlfcn.h> header file. */
#define HAVE_DLFCN_H 1

/* Define to 1 if you have the <inttypes.h> header file. */
#define HAVE_INTTYPES_H 1

/* Define to 1 if you have the <memory.h> header file. */
#define HAVE_MEMORY_H 1

/* Define to 1 if you have the <stdint.h> header file. */
#define HAVE_STDINT_H 1

/* Define to 1 if you have the <stdlib.h> header file. */
#define HAVE_STDLIB_H 1

/* Define to 1 if you have the <strings.h> header file. */
#define HAVE_STRINGS_H 1

/* Define to 1 if you have the <string.h> header file. */
#define HAVE_STRING_H 1

/* Define to 1 if you have the <sys/stat.h> header file. */
#define HAVE_SYS_STAT_H 1

/* Define to 1 if you have the <sys/types.h> header file. */
#define HAVE_SYS_TYPES_H 1

/* Define to 1 if you have the <unistd.h> header file. */
#define HAVE_UNISTD_H 1

/* Define to the sub-directory in which libtool stores uninstalled libraries.
   */
#define LT_OBJDIR ".libs/"

/* Name of package */
#define PACKAGE "jansson"

/* Define to the address where bug reports for this package should be sent. */
#define PACKAGE_BUGREPORT "petri@digip.org"

/* Define to the full name of this package. */
#define PACKAGE_NAME "jansson"

/* Define to the full name and version of this package. */
#define PACKAGE_STRING "jansson 1.3"

/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "jansson"

/* Define to the home page for this package. */
#define PACKAGE_URL ""

/* Define to the version of this package. */
#define PACKAGE_VERSION "1.3"

/* Define to 1 if you have the ANSI C header files. */
#define STDC_HEADERS 1

/* Version number of package */
#define VERSION "1.3"

/* Define to `__inline__' or `__inline' if that's what the C compiler
   calls it, or to nothing if 'inline' is not supported under any name.  */
#ifndef __cplusplus
/* #undef inline */
#endif

/* Define to the type of a signed integer type of width exactly 32 bits if
   such a type exists and the standard includes do not define it. */
/* #undef int32_t */

File cpuminer-2.5.1/compat/jansson/dump.c

/*
 * Copyright (c) 2009, 2010 Petri Lehtinen <petri@digip.org>
 *
 * Jansson is free software; you can redistribute it and/or modify
 * it under the terms of the MIT license. See LICENSE for details.
 */

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

#include <jansson.h>
#include "jansson_private.h"
#include "strbuffer.h"
#include "utf.h"

#define MAX_INTEGER_STR_LENGTH  100
#define MAX_REAL_STR_LENGTH     100

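/* Output callback: writes 'size' bytes from 'buffer' to the destination
   described by 'data' (a FILE * or a strbuffer), returning 0 on success
   and -1 on error. */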
typedef int (*dump_func)(const char *buffer, int size, void *data);

struct string
{
    char *buffer;
    int length;
    int size;
};

static int dump_to_strbuffer(const char *buffer, int size, void *data)
{
    return strbuffer_append_bytes((strbuffer_t *)data, buffer, size);
}

static int dump_to_file(const char *buffer, int size, void *data)
{
    FILE *dest = (FILE *)data;
    if(fwrite(buffer, size, 1, dest) != 1)
        return -1;
    return 0;
}

/* 256 spaces (the maximum indentation size) */
static char whitespace[] = "                                                                                                                                                                                                                                                                ";

static int dump_indent(unsigned long flags, int depth, int space, dump_func dump, void *data)
{
    if(JSON_INDENT(flags) > 0)
    {
        int i, ws_count = JSON_INDENT(flags);

        if(dump("\n", 1, data))
            return -1;

        for(i = 0; i < depth; i++)
        {
            if(dump(whitespace, ws_count, data))
                return -1;
        }
    }
    else if(space && !(flags & JSON_COMPACT))
    {
        return dump(" ", 1, data);
    }
    return 0;
}

static int dump_string(const char *str, int ascii, dump_func dump, void *data)
{
    const char *pos, *end;
    int32_t codepoint;

    if(dump("\"", 1, data))
        return -1;

    end = pos = str;
    while(1)
    {
        const char *text;
        char seq[13];
        int length;

        while(*end)
        {
            end = utf8_iterate(pos, &codepoint);
            if(!end)
                return -1;

            /* mandatory escape or control char */
            if(codepoint == '\\' || codepoint == '"' || codepoint < 0x20)
                break;

            /* non-ASCII */
            if(ascii && codepoint > 0x7F)
                break;

            pos = end;
        }

        if(pos != str) {
            if(dump(str, pos - str, data))
                return -1;
        }

        if(end == pos)
            break;

        /* handle \, ", and control codes */
        length = 2;
        switch(codepoint)
        {
            case '\\': text = "\\\\"; break;
            case '\"': text = "\\\""; break;
            case '\b': text = "\\b"; break;
            case '\f': text = "\\f"; break;
            case '\n': text = "\\n"; break;
            case '\r': text = "\\r"; break;
            case '\t': text = "\\t"; break;
            default:
            {
                /* codepoint is in BMP */
                if(codepoint < 0x10000)
                {
                    sprintf(seq, "\\u%04x", codepoint);
                    length = 6;
                }

                /* not in BMP -> construct a UTF-16 surrogate pair */
                else
                {
                    int32_t first, last;

                    codepoint -= 0x10000;
                    first = 0xD800 | ((codepoint & 0xffc00) >> 10);
                    last = 0xDC00 | (codepoint & 0x003ff);

                    sprintf(seq, "\\u%04x\\u%04x", first, last);
                    length = 12;
                }

                text = seq;
                break;
            }
        }

        if(dump(text, length, data))
            return -1;

        str = pos = end;
    }

    return dump("\"", 1, data);
}

static int object_key_compare_keys(const void *key1, const void *key2)
{
    return strcmp((*(const object_key_t **)key1)->key,
                  (*(const object_key_t **)key2)->key);
}

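/* Used for JSON_PRESERVE_ORDER: compare the serial numbers assigned to
   keys at insertion time, so sorting restores insertion order. */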
static int object_key_compare_serials(const void *key1, const void *key2)
{
    return (*(const object_key_t **)key1)->serial -
           (*(const object_key_t **)key2)->serial;
}

static int do_dump(const json_t *json, unsigned long flags, int depth,
                   dump_func dump, void *data)
{
    int ascii = flags & JSON_ENSURE_ASCII ? 1 : 0;

    switch(json_typeof(json)) {
        case JSON_NULL:
            return dump("null", 4, data);

        case JSON_TRUE:
            return dump("true", 4, data);

        case JSON_FALSE:
            return dump("false", 5, data);

        case JSON_INTEGER:
        {
            char buffer[MAX_INTEGER_STR_LENGTH];
            int size;

            size = snprintf(buffer, MAX_INTEGER_STR_LENGTH, "%d", json_integer_value(json));
            if(size >= MAX_INTEGER_STR_LENGTH)
                return -1;

            return dump(buffer, size, data);
        }

        case JSON_REAL:
        {
            char buffer[MAX_REAL_STR_LENGTH];
            int size;

            size = snprintf(buffer, MAX_REAL_STR_LENGTH, "%.17g",
                            json_real_value(json));
            if(size >= MAX_REAL_STR_LENGTH)
                return -1;

            /* Make sure there's a dot or 'e' in the output. Otherwise
               a real is converted to an integer when decoding */
            if(strchr(buffer, '.') == NULL &&
               strchr(buffer, 'e') == NULL)
            {
                if(size + 2 >= MAX_REAL_STR_LENGTH) {
                    /* No space to append ".0" */
                    return -1;
                }
                buffer[size] = '.';
                buffer[size + 1] = '0';
                size += 2;
            }

            return dump(buffer, size, data);
        }

        case JSON_STRING:
            return dump_string(json_string_value(json), ascii, dump, data);

        case JSON_ARRAY:
        {
            int i;
            int n;
            json_array_t *array;

            /* detect circular references */
            array = json_to_array(json);
            if(array->visited)
                goto array_error;
            array->visited = 1;

            n = json_array_size(json);

            if(dump("[", 1, data))
                goto array_error;
            if(n == 0) {
                array->visited = 0;
                return dump("]", 1, data);
            }
            if(dump_indent(flags, depth + 1, 0, dump, data))
                goto array_error;

            for(i = 0; i < n; ++i) {
                if(do_dump(json_array_get(json, i), flags, depth + 1,
                           dump, data))
                    goto array_error;

                if(i < n - 1)
                {
                    if(dump(",", 1, data) ||
                       dump_indent(flags, depth + 1, 1, dump, data))
                        goto array_error;
                }
                else
                {
                    if(dump_indent(flags, depth, 0, dump, data))
                        goto array_error;
                }
            }

            array->visited = 0;
            return dump("]", 1, data);

        array_error:
            array->visited = 0;
            return -1;
        }

        case JSON_OBJECT:
        {
            json_object_t *object;
            void *iter;
            const char *separator;
            int separator_length;

            if(flags & JSON_COMPACT) {
                separator = ":";
                separator_length = 1;
            }
            else {
                separator = ": ";
                separator_length = 2;
            }

            /* detect circular references */
            object = json_to_object(json);
            if(object->visited)
                goto object_error;
            object->visited = 1;

            iter = json_object_iter((json_t *)json);

            if(dump("{", 1, data))
                goto object_error;
            if(!iter) {
                object->visited = 0;
                return dump("}", 1, data);
            }
            if(dump_indent(flags, depth + 1, 0, dump, data))
                goto object_error;

            if(flags & JSON_SORT_KEYS || flags & JSON_PRESERVE_ORDER)
            {
                const object_key_t **keys;
                unsigned int size;
                unsigned int i;
                int (*cmp_func)(const void *, const void *);

                size = json_object_size(json);
                keys = malloc(size * sizeof(object_key_t *));
                if(!keys)
                    goto object_error;

                i = 0;
                while(iter)
                {
                    keys[i] = jsonp_object_iter_fullkey(iter);
                    iter = json_object_iter_next((json_t *)json, iter);
                    i++;
                }
                assert(i == size);

                if(flags & JSON_SORT_KEYS)
                    cmp_func = object_key_compare_keys;
                else
                    cmp_func = object_key_compare_serials;

                qsort(keys, size, sizeof(object_key_t *), cmp_func);

                for(i = 0; i < size; i++)
                {
                    const char *key;
                    json_t *value;

                    key = keys[i]->key;
                    value = json_object_get(json, key);
                    assert(value);

                    dump_string(key, ascii, dump, data);
                    if(dump(separator, separator_length, data) ||
                       do_dump(value, flags, depth + 1, dump, data))
                    {
                        free(keys);
                        goto object_error;
                    }

                    if(i < size - 1)
                    {
                        if(dump(",", 1, data) ||
                           dump_indent(flags, depth + 1, 1, dump, data))
                        {
                            free(keys);
                            goto object_error;
                        }
                    }
                    else
                    {
                        if(dump_indent(flags, depth, 0, dump, data))
                        {
                            free(keys);
                            goto object_error;
                        }
                    }
                }

                free(keys);
            }
            else
            {
                /* Don't sort keys */

                while(iter)
                {
                    void *next = json_object_iter_next((json_t *)json, iter);

                    dump_string(json_object_iter_key(iter), ascii, dump, data);
                    if(dump(separator, separator_length, data) ||
                       do_dump(json_object_iter_value(iter), flags, depth + 1,
                               dump, data))
                        goto object_error;

                    if(next)
                    {
                        if(dump(",", 1, data) ||
                           dump_indent(flags, depth + 1, 1, dump, data))
                            goto object_error;
                    }
                    else
                    {
                        if(dump_indent(flags, depth, 0, dump, data))
                            goto object_error;
                    }

                    iter = next;
                }
            }

            object->visited = 0;
            return dump("}", 1, data);

        object_error:
            object->visited = 0;
            return -1;
        }

        default:
            /* not reached */
            return -1;
    }
}


char *json_dumps(const json_t *json, unsigned long flags)
{
    strbuffer_t strbuff;
    char *result;

    if(!json_is_array(json) && !json_is_object(json))
        return NULL;

    if(strbuffer_init(&strbuff))
        return NULL;

    if(do_dump(json, flags, 0, dump_to_strbuffer, (void *)&strbuff)) {
        strbuffer_close(&strbuff);
        return NULL;
    }

    result = strdup(strbuffer_value(&strbuff));
    strbuffer_close(&strbuff);

    return result;
}

int json_dumpf(const json_t *json, FILE *output, unsigned long flags)
{
    if(!json_is_array(json) && !json_is_object(json))
        return -1;

    return do_dump(json, flags, 0, dump_to_file, (void *)output);
}

int json_dump_file(const json_t *json, const char *path, unsigned long flags)
{
    int result;

    FILE *output = fopen(path, "w");
    if(!output)
        return -1;

    result = json_dumpf(json, output, flags);

    fclose(output);
    return result;
}

File cpuminer-2.5.1/compat/jansson/hashtable.c

/*
 * Copyright (c) 2009, 2010 Petri Lehtinen <petri@digip.org>
 *
 * This library is free software; you can redistribute it and/or modify
 * it under the terms of the MIT license. See LICENSE for details.
 */

#include <config.h>

#include <stdlib.h>
#include "hashtable.h"

typedef struct hashtable_list list_t;
typedef struct hashtable_pair pair_t;
typedef struct hashtable_bucket bucket_t;

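/* Recover a pointer to the enclosing structure from a pointer to one of
   its members by subtracting the member's byte offset within the type. */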
#define container_of(ptr_, type_, member_)                      \
    ((type_ *)((char *)ptr_ - (size_t)&((type_ *)0)->member_))

#define list_to_pair(list_)  container_of(list_, pair_t, list)

static inline void list_init(list_t *list)
{
    list->next = list;
    list->prev = list;
}

static inline void list_insert(list_t *list, list_t *node)
{
    node->next = list;
    node->prev = list->prev;
    list->prev->next = node;
    list->prev = node;
}

static inline void list_remove(list_t *list)
{
    list->prev->next = list->next;
    list->next->prev = list->prev;
}

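/* All pairs are kept on a single global, insertion-ordered list; each
   bucket marks a contiguous range of that list.  A bucket is empty when
   both of its range markers point at the list's sentinel node. */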
static inline int bucket_is_empty(hashtable_t *hashtable, bucket_t *bucket)
{
    return bucket->first == &hashtable->list && bucket->first == bucket->last;
}

static void insert_to_bucket(hashtable_t *hashtable, bucket_t *bucket,
                             list_t *list)
{
    if(bucket_is_empty(hashtable, bucket))
    {
        list_insert(&hashtable->list, list);
        bucket->first = bucket->last = list;
    }
    else
    {
        list_insert(bucket->first, list);
        bucket->first = list;
    }
}

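/* Bucket-count table: each entry is prime and roughly double the previous
   one, so each rehash approximately doubles the number of buckets. */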
static unsigned int primes[] = {
    5, 13, 23, 53, 97, 193, 389, 769, 1543, 3079, 6151, 12289, 24593,
    49157, 98317, 196613, 393241, 786433, 1572869, 3145739, 6291469,
    12582917, 25165843, 50331653, 100663319, 201326611, 402653189,
    805306457, 1610612741
};
static const unsigned int num_primes = sizeof(primes) / sizeof(unsigned int);

static inline unsigned int num_buckets(hashtable_t *hashtable)
{
    return primes[hashtable->num_buckets];
}


static pair_t *hashtable_find_pair(hashtable_t *hashtable, bucket_t *bucket,
                                   const void *key, unsigned int hash)
{
    list_t *list;
    pair_t *pair;

    if(bucket_is_empty(hashtable, bucket))
        return NULL;

    list = bucket->first;
    while(1)
    {
        pair = list_to_pair(list);
        if(pair->hash == hash && hashtable->cmp_keys(pair->key, key))
            return pair;

        if(list == bucket->last)
            break;

        list = list->next;
    }

    return NULL;
}

/* returns 0 on success, -1 if key was not found */
static int hashtable_do_del(hashtable_t *hashtable,
                            const void *key, unsigned int hash)
{
    pair_t *pair;
    bucket_t *bucket;
    unsigned int index;

    index = hash % num_buckets(hashtable);
    bucket = &hashtable->buckets[index];

    pair = hashtable_find_pair(hashtable, bucket, key, hash);
    if(!pair)
        return -1;

    if(&pair->list == bucket->first && &pair->list == bucket->last)
        bucket->first = bucket->last = &hashtable->list;

    else if(&pair->list == bucket->first)
        bucket->first = pair->list.next;

    else if(&pair->list == bucket->last)
        bucket->last = pair->list.prev;

    list_remove(&pair->list);

    if(hashtable->free_key)
        hashtable->free_key(pair->key);
    if(hashtable->free_value)
        hashtable->free_value(pair->value);

    free(pair);
    hashtable->size--;

    return 0;
}

static void hashtable_do_clear(hashtable_t *hashtable)
{
    list_t *list, *next;
    pair_t *pair;

    for(list = hashtable->list.next; list != &hashtable->list; list = next)
    {
        next = list->next;
        pair = list_to_pair(list);
        if(hashtable->free_key)
            hashtable->free_key(pair->key);
        if(hashtable->free_value)
            hashtable->free_value(pair->value);
        free(pair);
    }
}

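/* Grow to the next bucket count and redistribute all pairs.  The old
   bucket array can be freed up front because every pair stays reachable
   through the global list. */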
static int hashtable_do_rehash(hashtable_t *hashtable)
{
    list_t *list, *next;
    pair_t *pair;
    unsigned int i, index, new_size;

    free(hashtable->buckets);

    hashtable->num_buckets++;
    new_size = num_buckets(hashtable);

    hashtable->buckets = malloc(new_size * sizeof(bucket_t));
    if(!hashtable->buckets)
        return -1;

    for(i = 0; i < num_buckets(hashtable); i++)
    {
        hashtable->buckets[i].first = hashtable->buckets[i].last =
            &hashtable->list;
    }

    list = hashtable->list.next;
    list_init(&hashtable->list);

    for(; list != &hashtable->list; list = next) {
        next = list->next;
        pair = list_to_pair(list);
        index = pair->hash % new_size;
        insert_to_bucket(hashtable, &hashtable->buckets[index], &pair->list);
    }

    return 0;
}


hashtable_t *hashtable_create(key_hash_fn hash_key, key_cmp_fn cmp_keys,
                              free_fn free_key, free_fn free_value)
{
    hashtable_t *hashtable = malloc(sizeof(hashtable_t));
    if(!hashtable)
        return NULL;

    if(hashtable_init(hashtable, hash_key, cmp_keys, free_key, free_value))
    {
        free(hashtable);
        return NULL;
    }

    return hashtable;
}

void hashtable_destroy(hashtable_t *hashtable)
{
    hashtable_close(hashtable);
    free(hashtable);
}

int hashtable_init(hashtable_t *hashtable,
                   key_hash_fn hash_key, key_cmp_fn cmp_keys,
                   free_fn free_key, free_fn free_value)
{
    unsigned int i;

    hashtable->size = 0;
    hashtable->num_buckets = 0;  /* index to primes[] */
    hashtable->buckets = malloc(num_buckets(hashtable) * sizeof(bucket_t));
    if(!hashtable->buckets)
        return -1;

    list_init(&hashtable->list);

    hashtable->hash_key = hash_key;
    hashtable->cmp_keys = cmp_keys;
    hashtable->free_key = free_key;
    hashtable->free_value = free_value;

    for(i = 0; i < num_buckets(hashtable); i++)
    {
        hashtable->buckets[i].first = hashtable->buckets[i].last =
            &hashtable->list;
    }

    return 0;
}

void hashtable_close(hashtable_t *hashtable)
{
    hashtable_do_clear(hashtable);
    free(hashtable->buckets);
}

int hashtable_set(hashtable_t *hashtable, void *key, void *value)
{
    pair_t *pair;
    bucket_t *bucket;
    unsigned int hash, index;

    /* rehash when the load ratio reaches 1 */
    if(hashtable->size >= num_buckets(hashtable))
        if(hashtable_do_rehash(hashtable))
            return -1;

    hash = hashtable->hash_key(key);
    index = hash % num_buckets(hashtable);
    bucket = &hashtable->buckets[index];
    pair = hashtable_find_pair(hashtable, bucket, key, hash);

    if(pair)
    {
        if(hashtable->free_key)
            hashtable->free_key(key);
        if(hashtable->free_value)
            hashtable->free_value(pair->value);
        pair->value = value;
    }
    else
    {
        pair = malloc(sizeof(pair_t));
        if(!pair)
            return -1;

        pair->key = key;
        pair->value = value;
        pair->hash = hash;
        list_init(&pair->list);

        insert_to_bucket(hashtable, bucket, &pair->list);

        hashtable->size++;
    }
    return 0;
}

void *hashtable_get(hashtable_t *hashtable, const void *key)
{
    pair_t *pair;
    unsigned int hash;
    bucket_t *bucket;

    hash = hashtable->hash_key(key);
    bucket = &hashtable->buckets[hash % num_buckets(hashtable)];

    pair = hashtable_find_pair(hashtable, bucket, key, hash);
    if(!pair)
        return NULL;

    return pair->value;
}

int hashtable_del(hashtable_t *hashtable, const void *key)
{
    unsigned int hash = hashtable->hash_key(key);
    return hashtable_do_del(hashtable, key, hash);
}

void hashtable_clear(hashtable_t *hashtable)
{
    unsigned int i;

    hashtable_do_clear(hashtable);

    for(i = 0; i < num_buckets(hashtable); i++)
    {
        hashtable->buckets[i].first = hashtable->buckets[i].last =
            &hashtable->list;
    }

    list_init(&hashtable->list);
    hashtable->size = 0;
}

void *hashtable_iter(hashtable_t *hashtable)
{
    return hashtable_iter_next(hashtable, &hashtable->list);
}

void *hashtable_iter_at(hashtable_t *hashtable, const void *key)
{
    pair_t *pair;
    unsigned int hash;
    bucket_t *bucket;

    hash = hashtable->hash_key(key);
    bucket = &hashtable->buckets[hash % num_buckets(hashtable)];

    pair = hashtable_find_pair(hashtable, bucket, key, hash);
    if(!pair)
        return NULL;

    return &pair->list;
}

void *hashtable_iter_next(hashtable_t *hashtable, void *iter)
{
    list_t *list = (list_t *)iter;
    if(list->next == &hashtable->list)
        return NULL;
    return list->next;
}

void *hashtable_iter_key(void *iter)
{
    pair_t *pair = list_to_pair((list_t *)iter);
    return pair->key;
}

void *hashtable_iter_value(void *iter)
{
    pair_t *pair = list_to_pair((list_t *)iter);
    return pair->value;
}

void hashtable_iter_set(hashtable_t *hashtable, void *iter, void *value)
{
    pair_t *pair = list_to_pair((list_t *)iter);

    if(hashtable->free_value)
        hashtable->free_value(pair->value);

    pair->value = value;
}
07070100000012000081A4000003E800000064000000015EF4BCA100001777000000000000000000000000000000000000002A00000000cpuminer-2.5.1/compat/jansson/hashtable.h/*
 * Copyright (c) 2009, 2010 Petri Lehtinen <petri@digip.org>
 *
 * This library is free software; you can redistribute it and/or modify
 * it under the terms of the MIT license. See LICENSE for details.
 */

#ifndef HASHTABLE_H
#define HASHTABLE_H

typedef unsigned int (*key_hash_fn)(const void *key);
typedef int (*key_cmp_fn)(const void *key1, const void *key2);
typedef void (*free_fn)(void *key);

struct hashtable_list {
    struct hashtable_list *prev;
    struct hashtable_list *next;
};

struct hashtable_pair {
    void *key;
    void *value;
    unsigned int hash;
    struct hashtable_list list;
};

struct hashtable_bucket {
    struct hashtable_list *first;
    struct hashtable_list *last;
};

typedef struct hashtable {
    unsigned int size;
    struct hashtable_bucket *buckets;
    unsigned int num_buckets;  /* index to primes[] */
    struct hashtable_list list;

    key_hash_fn hash_key;
    key_cmp_fn cmp_keys;  /* returns non-zero for equal keys */
    free_fn free_key;
    free_fn free_value;
} hashtable_t;

/**
 * hashtable_create - Create a hashtable object
 *
 * @hash_key: The key hashing function
 * @cmp_keys: The key compare function. Returns non-zero for equal and
 *     zero for unequal keys
 * @free_key: If non-NULL, called for a key that is no longer referenced.
 * @free_value: If non-NULL, called for a value that is no longer referenced.
 *
 * Returns a new hashtable object that should be freed with
 * hashtable_destroy when it's no longer used, or NULL on failure (out
 * of memory).
 */
hashtable_t *hashtable_create(key_hash_fn hash_key, key_cmp_fn cmp_keys,
                              free_fn free_key, free_fn free_value);

/**
 * hashtable_destroy - Destroy a hashtable object
 *
 * @hashtable: The hashtable
 *
 * Destroys a hashtable created with hashtable_create().
 */
void hashtable_destroy(hashtable_t *hashtable);

/**
 * hashtable_init - Initialize a hashtable object
 *
 * @hashtable: The (statically allocated) hashtable object
 * @hash_key: The key hashing function
 * @cmp_keys: The key compare function. Returns non-zero for equal and
 *     zero for unequal keys
 * @free_key: If non-NULL, called for a key that is no longer referenced.
 * @free_value: If non-NULL, called for a value that is no longer referenced.
 *
 * Initializes a statically allocated hashtable object. The object
 * should be cleared with hashtable_close when it's no longer used.
 *
 * Returns 0 on success, -1 on error (out of memory).
 */
int hashtable_init(hashtable_t *hashtable,
                   key_hash_fn hash_key, key_cmp_fn cmp_keys,
                   free_fn free_key, free_fn free_value);

/**
 * hashtable_close - Release all resources used by a hashtable object
 *
 * @hashtable: The hashtable
 *
 * Destroys a statically allocated hashtable object.
 */
void hashtable_close(hashtable_t *hashtable);

/**
 * hashtable_set - Add/modify value in hashtable
 *
 * @hashtable: The hashtable object
 * @key: The key
 * @value: The value
 *
 * If a value with the given key already exists, its value is replaced
 * with the new value.
 *
 * Key and value are "stolen" in the sense that the hashtable frees
 * them automatically when they are no longer used. The freeing is
 * accomplished by calling the free_key and free_value functions that
 * were supplied to hashtable_create. If one or both of the free
 * functions is NULL, the corresponding item is not stolen.
 *
 * Returns 0 on success, -1 on failure (out of memory).
 */
int hashtable_set(hashtable_t *hashtable, void *key, void *value);
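
/* A minimal usage sketch (illustrative, not part of this header): a
 * table of heap-allocated C strings.  The str_hash/str_equal helpers
 * are hypothetical; note that the compare callback returns non-zero
 * for EQUAL keys, the opposite convention of strcmp().
 *
 *     static unsigned int str_hash(const void *key)
 *     {
 *         const unsigned char *s = key;
 *         unsigned int h = 5381;
 *         while(*s)
 *             h = ((h << 5) + h) + *s++;   // djb2: h = h * 33 + c
 *         return h;
 *     }
 *
 *     static int str_equal(const void *a, const void *b)
 *     {
 *         return strcmp(a, b) == 0;
 *     }
 *
 *     hashtable_t *ht = hashtable_create(str_hash, str_equal, free, free);
 *     hashtable_set(ht, strdup("algo"), strdup("scrypt"));
 *     puts((char *)hashtable_get(ht, "algo"));   // prints "scrypt"
 *     hashtable_destroy(ht);
 */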

/**
 * hashtable_get - Get a value associated with a key
 *
 * @hashtable: The hashtable object
 * @key: The key
 *
 * Returns value if it is found, or NULL otherwise.
 */
void *hashtable_get(hashtable_t *hashtable, const void *key);

/**
 * hashtable_del - Remove a value from the hashtable
 *
 * @hashtable: The hashtable object
 * @key: The key
 *
 * Returns 0 on success, or -1 if the key was not found.
 */
int hashtable_del(hashtable_t *hashtable, const void *key);

/**
 * hashtable_clear - Clear hashtable
 *
 * @hashtable: The hashtable object
 *
 * Removes all items from the hashtable.
 */
void hashtable_clear(hashtable_t *hashtable);

/**
 * hashtable_iter - Iterate over hashtable
 *
 * @hashtable: The hashtable object
 *
 * Returns an opaque iterator to the first element in the hashtable.
 * The iterator should be passed to hashtable_iter_* functions.
 * The hashtable items are not iterated over in any particular order.
 *
 * There's no need to free the iterator in any way. The iterator is
 * valid as long as the item that is referenced by the iterator is not
 * deleted. Other values may be added or deleted. In particular,
 * hashtable_iter_next() may be called on an iterator, and after that
 * the key/value pair pointed to by the old iterator may be deleted.
 */
void *hashtable_iter(hashtable_t *hashtable);

/**
 * hashtable_iter_at - Return an iterator at a specific key
 *
 * @hashtable: The hashtable object
 * @key: The key that the iterator should point to
 *
 * Like hashtable_iter() but returns an iterator pointing to a
 * specific key.
 */
void *hashtable_iter_at(hashtable_t *hashtable, const void *key);

/**
 * hashtable_iter_next - Advance an iterator
 *
 * @hashtable: The hashtable object
 * @iter: The iterator
 *
 * Returns a new iterator pointing to the next element in the
 * hashtable, or NULL if the whole hashtable has been iterated over.
 */
void *hashtable_iter_next(hashtable_t *hashtable, void *iter);

/**
 * hashtable_iter_key - Retrieve the key pointed to by an iterator
 *
 * @iter: The iterator
 */
void *hashtable_iter_key(void *iter);

/**
 * hashtable_iter_value - Retrieve the value pointed to by an iterator
 *
 * @iter: The iterator
 */
void *hashtable_iter_value(void *iter);

/**
 * hashtable_iter_set - Set the value pointed to by an iterator
 *
 * @hashtable: The hashtable object
 * @iter: The iterator
 * @value: The value to set
 */
void hashtable_iter_set(hashtable_t *hashtable, void *iter, void *value);
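
/* Continuing the string-table sketch above, a full iteration pass
 * (illustrative only):
 *
 *     void *iter = hashtable_iter(ht);
 *     while(iter) {
 *         printf("%s -> %s\n",
 *                (char *)hashtable_iter_key(iter),
 *                (char *)hashtable_iter_value(iter));
 *         iter = hashtable_iter_next(ht, iter);
 *     }
 */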

#endif
07070100000013000081A4000003E800000064000000015EF4BCA1000015FB000000000000000000000000000000000000002800000000cpuminer-2.5.1/compat/jansson/jansson.h/*
 * Copyright (c) 2009, 2010 Petri Lehtinen <petri@digip.org>
 *
 * Jansson is free software; you can redistribute it and/or modify
 * it under the terms of the MIT license. See LICENSE for details.
 */

#ifndef JANSSON_H
#define JANSSON_H

#include <stdio.h>

#ifndef __cplusplus
#define JSON_INLINE inline
#else
#define JSON_INLINE inline
extern "C" {
#endif

/* types */

typedef enum {
    JSON_OBJECT,
    JSON_ARRAY,
    JSON_STRING,
    JSON_INTEGER,
    JSON_REAL,
    JSON_TRUE,
    JSON_FALSE,
    JSON_NULL
} json_type;

typedef struct {
    json_type type;
    unsigned long refcount;
} json_t;

#define json_typeof(json)      ((json)->type)
#define json_is_object(json)   (json && json_typeof(json) == JSON_OBJECT)
#define json_is_array(json)    (json && json_typeof(json) == JSON_ARRAY)
#define json_is_string(json)   (json && json_typeof(json) == JSON_STRING)
#define json_is_integer(json)  (json && json_typeof(json) == JSON_INTEGER)
#define json_is_real(json)     (json && json_typeof(json) == JSON_REAL)
#define json_is_number(json)   (json_is_integer(json) || json_is_real(json))
#define json_is_true(json)     (json && json_typeof(json) == JSON_TRUE)
#define json_is_false(json)    (json && json_typeof(json) == JSON_FALSE)
#define json_is_boolean(json)  (json_is_true(json) || json_is_false(json))
#define json_is_null(json)     (json && json_typeof(json) == JSON_NULL)

/* construction, destruction, reference counting */

json_t *json_object(void);
json_t *json_array(void);
json_t *json_string(const char *value);
json_t *json_string_nocheck(const char *value);
json_t *json_integer(int value);
json_t *json_real(double value);
json_t *json_true(void);
json_t *json_false(void);
json_t *json_null(void);

static JSON_INLINE
json_t *json_incref(json_t *json)
{
    if(json && json->refcount != (unsigned int)-1)
        ++json->refcount;
    return json;
}

/* do not call json_delete directly */
void json_delete(json_t *json);

static JSON_INLINE
void json_decref(json_t *json)
{
    if(json && json->refcount != (unsigned int)-1 && --json->refcount == 0)
        json_delete(json);
}
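
/* Reference counting in a nutshell (illustrative sketch): every value
 * starts with a refcount of 1, and json_decref() frees it when the
 * last reference is dropped.
 *
 *     json_t *s = json_string("scrypt");   // refcount 1
 *     json_incref(s);                      // refcount 2
 *     json_decref(s);                      // refcount 1
 *     json_decref(s);                      // refcount 0, s is freed
 */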


/* getters, setters, manipulation */

unsigned int json_object_size(const json_t *object);
json_t *json_object_get(const json_t *object, const char *key);
int json_object_set_new(json_t *object, const char *key, json_t *value);
int json_object_set_new_nocheck(json_t *object, const char *key, json_t *value);
int json_object_del(json_t *object, const char *key);
int json_object_clear(json_t *object);
int json_object_update(json_t *object, json_t *other);
void *json_object_iter(json_t *object);
void *json_object_iter_at(json_t *object, const char *key);
void *json_object_iter_next(json_t *object, void *iter);
const char *json_object_iter_key(void *iter);
json_t *json_object_iter_value(void *iter);
int json_object_iter_set_new(json_t *object, void *iter, json_t *value);

static JSON_INLINE
int json_object_set(json_t *object, const char *key, json_t *value)
{
    return json_object_set_new(object, key, json_incref(value));
}

static JSON_INLINE
int json_object_set_nocheck(json_t *object, const char *key, json_t *value)
{
    return json_object_set_new_nocheck(object, key, json_incref(value));
}

static JSON_INLINE
int json_object_iter_set(json_t *object, void *iter, json_t *value)
{
    return json_object_iter_set_new(object, iter, json_incref(value));
}
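
/* The _new setters steal the caller's reference, while the wrappers
 * above keep it borrowed by taking a reference of their own.  A short
 * sketch of the difference (illustrative only):
 *
 *     json_t *obj = json_object();
 *     json_object_set_new(obj, "n", json_integer(42)); // ref stolen
 *
 *     json_t *user = json_string("pooler");
 *     json_object_set(obj, "user", user);  // obj takes its own ref
 *     json_decref(user);                   // drop the caller's ref
 *     json_decref(obj);
 */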

unsigned int json_array_size(const json_t *array);
json_t *json_array_get(const json_t *array, unsigned int index);
int json_array_set_new(json_t *array, unsigned int index, json_t *value);
int json_array_append_new(json_t *array, json_t *value);
int json_array_insert_new(json_t *array, unsigned int index, json_t *value);
int json_array_remove(json_t *array, unsigned int index);
int json_array_clear(json_t *array);
int json_array_extend(json_t *array, json_t *other);

static JSON_INLINE
int json_array_set(json_t *array, unsigned int index, json_t *value)
{
    return json_array_set_new(array, index, json_incref(value));
}

static JSON_INLINE
int json_array_append(json_t *array, json_t *value)
{
    return json_array_append_new(array, json_incref(value));
}

static JSON_INLINE
int json_array_insert(json_t *array, unsigned int index, json_t *value)
{
    return json_array_insert_new(array, index, json_incref(value));
}

const char *json_string_value(const json_t *string);
int json_integer_value(const json_t *integer);
double json_real_value(const json_t *real);
double json_number_value(const json_t *json);

int json_string_set(json_t *string, const char *value);
int json_string_set_nocheck(json_t *string, const char *value);
int json_integer_set(json_t *integer, int value);
int json_real_set(json_t *real, double value);


/* equality */

int json_equal(json_t *value1, json_t *value2);


/* copying */

json_t *json_copy(json_t *value);
json_t *json_deep_copy(json_t *value);


/* loading, printing */

#define JSON_ERROR_TEXT_LENGTH  160

typedef struct {
    char text[JSON_ERROR_TEXT_LENGTH];
    int line;
} json_error_t;

json_t *json_loads(const char *input, json_error_t *error);
json_t *json_loadf(FILE *input, json_error_t *error);
json_t *json_load_file(const char *path, json_error_t *error);

#define JSON_INDENT(n)      (n & 0xFF)
#define JSON_COMPACT        0x100
#define JSON_ENSURE_ASCII   0x200
#define JSON_SORT_KEYS      0x400
#define JSON_PRESERVE_ORDER 0x800

char *json_dumps(const json_t *json, unsigned long flags);
int json_dumpf(const json_t *json, FILE *output, unsigned long flags);
int json_dump_file(const json_t *json, const char *path, unsigned long flags);
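
/* Example (illustrative): the same value serialized two ways.  Both
 * returned strings must be released with free().
 *
 *     char *pretty  = json_dumps(json, JSON_INDENT(2));
 *     char *compact = json_dumps(json, JSON_COMPACT);
 */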

#ifdef __cplusplus
}
#endif

#endif
07070100000014000081A4000003E800000064000000015EF4BCA100000563000000000000000000000000000000000000003000000000cpuminer-2.5.1/compat/jansson/jansson_private.h/*
 * Copyright (c) 2009, 2010 Petri Lehtinen <petri@digip.org>
 *
 * Jansson is free software; you can redistribute it and/or modify
 * it under the terms of the MIT license. See LICENSE for details.
 */

#ifndef JANSSON_PRIVATE_H
#define JANSSON_PRIVATE_H

#include "jansson.h"
#include "hashtable.h"

#define container_of(ptr_, type_, member_)  \
    ((type_ *)((char *)ptr_ - (size_t)&((type_ *)0)->member_))
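
/* container_of() is the classic offsetof trick: given a pointer to a
   member, step back to the enclosing struct, e.g. json_to_object(json)
   is (json_object_t *)((char *)json - offsetof(json_object_t, json)) */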

typedef struct {
    json_t json;
    hashtable_t hashtable;
    unsigned long serial;
    int visited;
} json_object_t;

typedef struct {
    json_t json;
    unsigned int size;
    unsigned int entries;
    json_t **table;
    int visited;
} json_array_t;

typedef struct {
    json_t json;
    char *value;
} json_string_t;

typedef struct {
    json_t json;
    double value;
} json_real_t;

typedef struct {
    json_t json;
    int value;
} json_integer_t;

#define json_to_object(json_)  container_of(json_, json_object_t, json)
#define json_to_array(json_)   container_of(json_, json_array_t, json)
#define json_to_string(json_)  container_of(json_, json_string_t, json)
#define json_to_real(json_)   container_of(json_, json_real_t, json)
#define json_to_integer(json_) container_of(json_, json_integer_t, json)

typedef struct {
    unsigned long serial;
    char key[];
} object_key_t;

const object_key_t *jsonp_object_iter_fullkey(void *iter);

#endif
07070100000015000081A4000003E800000064000000015EF4BCA100004FB9000000000000000000000000000000000000002500000000cpuminer-2.5.1/compat/jansson/load.c/*
 * Copyright (c) 2009, 2010 Petri Lehtinen <petri@digip.org>
 *
 * Jansson is free software; you can redistribute it and/or modify
 * it under the terms of the MIT license. See LICENSE for details.
 */

#define _GNU_SOURCE
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <assert.h>

#include <jansson.h>
#include "jansson_private.h"
#include "strbuffer.h"
#include "utf.h"

#define TOKEN_INVALID         -1
#define TOKEN_EOF              0
#define TOKEN_STRING         256
#define TOKEN_INTEGER        257
#define TOKEN_REAL           258
#define TOKEN_TRUE           259
#define TOKEN_FALSE          260
#define TOKEN_NULL           261

/* read one byte from stream, return EOF on end of file */
typedef int (*get_func)(void *data);

/* return non-zero if end of file has been reached */
typedef int (*eof_func)(void *data);

typedef struct {
    get_func get;
    eof_func eof;
    void *data;
    int stream_pos;
    char buffer[5];
    int buffer_pos;
} stream_t;


typedef struct {
    stream_t stream;
    strbuffer_t saved_text;
    int token;
    int line, column;
    union {
        char *string;
        int integer;
        double real;
    } value;
} lex_t;


/*** error reporting ***/

static void error_init(json_error_t *error)
{
    if(error)
    {
        error->text[0] = '\0';
        error->line = -1;
    }
}

static void error_set(json_error_t *error, const lex_t *lex,
                      const char *msg, ...)
{
    va_list ap;
    char text[JSON_ERROR_TEXT_LENGTH];

    if(!error || error->text[0] != '\0') {
        /* error already set */
        return;
    }

    va_start(ap, msg);
    vsnprintf(text, JSON_ERROR_TEXT_LENGTH, msg, ap);
    va_end(ap);

    if(lex)
    {
        const char *saved_text = strbuffer_value(&lex->saved_text);
        error->line = lex->line;
        if(saved_text && saved_text[0])
        {
            if(lex->saved_text.length <= 20) {
                snprintf(error->text, JSON_ERROR_TEXT_LENGTH,
                         "%s near '%s'", text, saved_text);
            }
            else
                snprintf(error->text, JSON_ERROR_TEXT_LENGTH, "%s", text);
        }
        else
        {
            snprintf(error->text, JSON_ERROR_TEXT_LENGTH,
                     "%s near end of file", text);
        }
    }
    else
    {
        error->line = -1;
        snprintf(error->text, JSON_ERROR_TEXT_LENGTH, "%s", text);
    }
}


/*** lexical analyzer ***/

static void
stream_init(stream_t *stream, get_func get, eof_func eof, void *data)
{
    stream->get = get;
    stream->eof = eof;
    stream->data = data;
    stream->stream_pos = 0;
    stream->buffer[0] = '\0';
    stream->buffer_pos = 0;
}

static char stream_get(stream_t *stream, json_error_t *error)
{
    char c;

    if(!stream->buffer[stream->buffer_pos])
    {
        stream->buffer[0] = stream->get(stream->data);
        stream->buffer_pos = 0;

        c = stream->buffer[0];

        if((unsigned char)c >= 0x80 && c != (char)EOF)
        {
            /* multi-byte UTF-8 sequence */
            int i, count;

            count = utf8_check_first(c);
            if(!count)
                goto out;

            assert(count >= 2);

            for(i = 1; i < count; i++)
                stream->buffer[i] = stream->get(stream->data);

            if(!utf8_check_full(stream->buffer, count, NULL))
                goto out;

            stream->stream_pos += count;
            stream->buffer[count] = '\0';
        }
        else {
            stream->buffer[1] = '\0';
            stream->stream_pos++;
        }
    }

    return stream->buffer[stream->buffer_pos++];

out:
    error_set(error, NULL, "unable to decode byte 0x%x at position %d",
              (unsigned char)c, stream->stream_pos);

    stream->buffer[0] = EOF;
    stream->buffer[1] = '\0';
    stream->buffer_pos = 1;

    return EOF;
}

static void stream_unget(stream_t *stream, char c)
{
    assert(stream->buffer_pos > 0);
    stream->buffer_pos--;
    assert(stream->buffer[stream->buffer_pos] == c);
}


static int lex_get(lex_t *lex, json_error_t *error)
{
    return stream_get(&lex->stream, error);
}

static int lex_eof(lex_t *lex)
{
    return lex->stream.eof(lex->stream.data);
}

static void lex_save(lex_t *lex, char c)
{
    strbuffer_append_byte(&lex->saved_text, c);
}

static int lex_get_save(lex_t *lex, json_error_t *error)
{
    char c = stream_get(&lex->stream, error);
    lex_save(lex, c);
    return c;
}

static void lex_unget_unsave(lex_t *lex, char c)
{
    char d;
    stream_unget(&lex->stream, c);
    d = strbuffer_pop(&lex->saved_text);
    assert(c == d);
}

static void lex_save_cached(lex_t *lex)
{
    while(lex->stream.buffer[lex->stream.buffer_pos] != '\0')
    {
        lex_save(lex, lex->stream.buffer[lex->stream.buffer_pos]);
        lex->stream.buffer_pos++;
    }
}

/* assumes that str points to 'u' plus at least 4 valid hex digits */
static int32_t decode_unicode_escape(const char *str)
{
    int i;
    int32_t value = 0;

    assert(str[0] == 'u');

    for(i = 1; i <= 4; i++) {
        char c = str[i];
        value <<= 4;
        if(isdigit(c))
            value += c - '0';
        else if(islower(c))
            value += c - 'a' + 10;
        else if(isupper(c))
            value += c - 'A' + 10;
        else
            assert(0);
    }

    return value;
}

static void lex_scan_string(lex_t *lex, json_error_t *error)
{
    char c;
    const char *p;
    char *t;
    int i;

    lex->value.string = NULL;
    lex->token = TOKEN_INVALID;

    c = lex_get_save(lex, error);

    while(c != '"') {
        if(c == (char)EOF) {
            lex_unget_unsave(lex, c);
            if(lex_eof(lex))
                error_set(error, lex, "premature end of input");
            goto out;
        }

        else if((unsigned char)c <= 0x1F) {
            /* control character */
            lex_unget_unsave(lex, c);
            if(c == '\n')
                error_set(error, lex, "unexpected newline");
            else
                error_set(error, lex, "control character 0x%x", c);
            goto out;
        }

        else if(c == '\\') {
            c = lex_get_save(lex, error);
            if(c == 'u') {
                c = lex_get_save(lex, error);
                for(i = 0; i < 4; i++) {
                    if(!isxdigit(c)) {
                        lex_unget_unsave(lex, c);
                        error_set(error, lex, "invalid escape");
                        goto out;
                    }
                    c = lex_get_save(lex, error);
                }
            }
            else if(c == '"' || c == '\\' || c == '/' || c == 'b' ||
                    c == 'f' || c == 'n' || c == 'r' || c == 't')
                c = lex_get_save(lex, error);
            else {
                lex_unget_unsave(lex, c);
                error_set(error, lex, "invalid escape");
                goto out;
            }
        }
        else
            c = lex_get_save(lex, error);
    }

    /* the actual value is at most of the same length as the source
       string, because:
         - shortcut escapes (e.g. "\t") (length 2) are converted to 1 byte
         - a single \uXXXX escape (length 6) is converted to at most 3 bytes
         - two \uXXXX escapes (length 12) forming a UTF-16 surrogate pair
           are converted to 4 bytes
    */
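    /* worked example: "\uD834\uDD1E" decodes below to
       ((0xD834 - 0xD800) << 10) + (0xDD1E - 0xDC00) + 0x10000 = 0x1D11E,
       which utf8_encode() emits as the four bytes F0 9D 84 9E */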
    lex->value.string = malloc(lex->saved_text.length + 1);
    if(!lex->value.string) {
        /* this is not very nice, since TOKEN_INVALID is returned */
        goto out;
    }

    /* the target */
    t = lex->value.string;

    /* + 1 to skip the " */
    p = strbuffer_value(&lex->saved_text) + 1;

    while(*p != '"') {
        if(*p == '\\') {
            p++;
            if(*p == 'u') {
                char buffer[4];
                int length;
                int32_t value;

                value = decode_unicode_escape(p);
                p += 5;

                if(0xD800 <= value && value <= 0xDBFF) {
                    /* surrogate pair */
                    if(*p == '\\' && *(p + 1) == 'u') {
                        int32_t value2 = decode_unicode_escape(++p);
                        p += 5;

                        if(0xDC00 <= value2 && value2 <= 0xDFFF) {
                            /* valid second surrogate */
                            value =
                                ((value - 0xD800) << 10) +
                                (value2 - 0xDC00) +
                                0x10000;
                        }
                        else {
                            /* invalid second surrogate */
                            error_set(error, lex,
                                      "invalid Unicode '\\u%04X\\u%04X'",
                                      value, value2);
                            goto out;
                        }
                    }
                    else {
                        /* no second surrogate */
                        error_set(error, lex, "invalid Unicode '\\u%04X'",
                                  value);
                        goto out;
                    }
                }
                else if(0xDC00 <= value && value <= 0xDFFF) {
                    error_set(error, lex, "invalid Unicode '\\u%04X'", value);
                    goto out;
                }
                else if(value == 0)
                {
                    error_set(error, lex, "\\u0000 is not allowed");
                    goto out;
                }

                if(utf8_encode(value, buffer, &length))
                    assert(0);

                memcpy(t, buffer, length);
                t += length;
            }
            else {
                switch(*p) {
                    case '"': case '\\': case '/':
                        *t = *p; break;
                    case 'b': *t = '\b'; break;
                    case 'f': *t = '\f'; break;
                    case 'n': *t = '\n'; break;
                    case 'r': *t = '\r'; break;
                    case 't': *t = '\t'; break;
                    default: assert(0);
                }
                t++;
                p++;
            }
        }
        else
            *(t++) = *(p++);
    }
    *t = '\0';
    lex->token = TOKEN_STRING;
    return;

out:
    free(lex->value.string);
}

static int lex_scan_number(lex_t *lex, char c, json_error_t *error)
{
    const char *saved_text;
    char *end;
    double value;

    lex->token = TOKEN_INVALID;

    if(c == '-')
        c = lex_get_save(lex, error);

    if(c == '0') {
        c = lex_get_save(lex, error);
        if(isdigit(c)) {
            lex_unget_unsave(lex, c);
            goto out;
        }
    }
    else if(isdigit(c)) {
        c = lex_get_save(lex, error);
        while(isdigit(c))
            c = lex_get_save(lex, error);
    }
    else {
      lex_unget_unsave(lex, c);
      goto out;
    }

    if(c != '.' && c != 'E' && c != 'e') {
        long value;

        lex_unget_unsave(lex, c);

        saved_text = strbuffer_value(&lex->saved_text);
        value = strtol(saved_text, &end, 10);
        assert(end == saved_text + lex->saved_text.length);

        if((value == LONG_MAX && errno == ERANGE) || value > INT_MAX) {
            error_set(error, lex, "too big integer");
            goto out;
        }
        else if((value == LONG_MIN && errno == ERANGE) || value < INT_MIN) {
            error_set(error, lex, "too big negative integer");
            goto out;
        }

        lex->token = TOKEN_INTEGER;
        lex->value.integer = (int)value;
        return 0;
    }

    if(c == '.') {
        c = lex_get(lex, error);
        if(!isdigit(c))
            goto out;
        lex_save(lex, c);

        c = lex_get_save(lex, error);
        while(isdigit(c))
            c = lex_get_save(lex, error);
    }

    if(c == 'E' || c == 'e') {
        c = lex_get_save(lex, error);
        if(c == '+' || c == '-')
            c = lex_get_save(lex, error);

        if(!isdigit(c)) {
            lex_unget_unsave(lex, c);
            goto out;
        }

        c = lex_get_save(lex, error);
        while(isdigit(c))
            c = lex_get_save(lex, error);
    }

    lex_unget_unsave(lex, c);

    saved_text = strbuffer_value(&lex->saved_text);
    value = strtod(saved_text, &end);
    assert(end == saved_text + lex->saved_text.length);

    if(errno == ERANGE && value != 0) {
        error_set(error, lex, "real number overflow");
        goto out;
    }

    lex->token = TOKEN_REAL;
    lex->value.real = value;
    return 0;

out:
    return -1;
}

static int lex_scan(lex_t *lex, json_error_t *error)
{
    char c;

    strbuffer_clear(&lex->saved_text);

    if(lex->token == TOKEN_STRING) {
        free(lex->value.string);
        lex->value.string = NULL;
    }

    c = lex_get(lex, error);
    while(c == ' ' || c == '\t' || c == '\n' || c == '\r')
    {
        if(c == '\n')
            lex->line++;

        c = lex_get(lex, error);
    }

    if(c == (char)EOF) {
        if(lex_eof(lex))
            lex->token = TOKEN_EOF;
        else
            lex->token = TOKEN_INVALID;
        goto out;
    }

    lex_save(lex, c);

    if(c == '{' || c == '}' || c == '[' || c == ']' || c == ':' || c == ',')
        lex->token = c;

    else if(c == '"')
        lex_scan_string(lex, error);

    else if(isdigit(c) || c == '-') {
        if(lex_scan_number(lex, c, error))
            goto out;
    }

    else if(isupper(c) || islower(c)) {
        /* eat up the whole identifier for clearer error messages */
        const char *saved_text;

        c = lex_get_save(lex, error);
        while(isupper(c) || islower(c))
            c = lex_get_save(lex, error);
        lex_unget_unsave(lex, c);

        saved_text = strbuffer_value(&lex->saved_text);

        if(strcmp(saved_text, "true") == 0)
            lex->token = TOKEN_TRUE;
        else if(strcmp(saved_text, "false") == 0)
            lex->token = TOKEN_FALSE;
        else if(strcmp(saved_text, "null") == 0)
            lex->token = TOKEN_NULL;
        else
            lex->token = TOKEN_INVALID;
    }

    else {
        /* save the rest of the cached UTF-8 sequence so that the error
           message contains only complete, valid UTF-8 */
        lex_save_cached(lex);
        lex->token = TOKEN_INVALID;
    }

out:
    return lex->token;
}

static char *lex_steal_string(lex_t *lex)
{
    char *result = NULL;
    if(lex->token == TOKEN_STRING)
    {
        result = lex->value.string;
        lex->value.string = NULL;
    }
    return result;
}

static int lex_init(lex_t *lex, get_func get, eof_func eof, void *data)
{
    stream_init(&lex->stream, get, eof, data);
    if(strbuffer_init(&lex->saved_text))
        return -1;

    lex->token = TOKEN_INVALID;
    lex->line = 1;

    return 0;
}

static void lex_close(lex_t *lex)
{
    if(lex->token == TOKEN_STRING)
        free(lex->value.string);
    strbuffer_close(&lex->saved_text);
}


/*** parser ***/

static json_t *parse_value(lex_t *lex, json_error_t *error);

static json_t *parse_object(lex_t *lex, json_error_t *error)
{
    json_t *object = json_object();
    if(!object)
        return NULL;

    lex_scan(lex, error);
    if(lex->token == '}')
        return object;

    while(1) {
        char *key;
        json_t *value;

        if(lex->token != TOKEN_STRING) {
            error_set(error, lex, "string or '}' expected");
            goto error;
        }

        key = lex_steal_string(lex);
        if(!key)
            goto error;   /* decref the partially built object */

        lex_scan(lex, error);
        if(lex->token != ':') {
            free(key);
            error_set(error, lex, "':' expected");
            goto error;
        }

        lex_scan(lex, error);
        value = parse_value(lex, error);
        if(!value) {
            free(key);
            goto error;
        }

        if(json_object_set_nocheck(object, key, value)) {
            free(key);
            json_decref(value);
            goto error;
        }

        json_decref(value);
        free(key);

        lex_scan(lex, error);
        if(lex->token != ',')
            break;

        lex_scan(lex, error);
    }

    if(lex->token != '}') {
        error_set(error, lex, "'}' expected");
        goto error;
    }

    return object;

error:
    json_decref(object);
    return NULL;
}

static json_t *parse_array(lex_t *lex, json_error_t *error)
{
    json_t *array = json_array();
    if(!array)
        return NULL;

    lex_scan(lex, error);
    if(lex->token == ']')
        return array;

    while(lex->token) {
        json_t *elem = parse_value(lex, error);
        if(!elem)
            goto error;

        if(json_array_append(array, elem)) {
            json_decref(elem);
            goto error;
        }
        json_decref(elem);

        lex_scan(lex, error);
        if(lex->token != ',')
            break;

        lex_scan(lex, error);
    }

    if(lex->token != ']') {
        error_set(error, lex, "']' expected");
        goto error;
    }

    return array;

error:
    json_decref(array);
    return NULL;
}

static json_t *parse_value(lex_t *lex, json_error_t *error)
{
    json_t *json;

    switch(lex->token) {
        case TOKEN_STRING: {
            json = json_string_nocheck(lex->value.string);
            break;
        }

        case TOKEN_INTEGER: {
            json = json_integer(lex->value.integer);
            break;
        }

        case TOKEN_REAL: {
            json = json_real(lex->value.real);
            break;
        }

        case TOKEN_TRUE:
            json = json_true();
            break;

        case TOKEN_FALSE:
            json = json_false();
            break;

        case TOKEN_NULL:
            json = json_null();
            break;

        case '{':
            json = parse_object(lex, error);
            break;

        case '[':
            json = parse_array(lex, error);
            break;

        case TOKEN_INVALID:
            error_set(error, lex, "invalid token");
            return NULL;

        default:
            error_set(error, lex, "unexpected token");
            return NULL;
    }

    if(!json)
        return NULL;

    return json;
}

static json_t *parse_json(lex_t *lex, json_error_t *error)
{
    error_init(error);

    lex_scan(lex, error);
    if(lex->token != '[' && lex->token != '{') {
        error_set(error, lex, "'[' or '{' expected");
        return NULL;
    }

    return parse_value(lex, error);
}

typedef struct
{
    const char *data;
    int pos;
} string_data_t;

static int string_get(void *data)
{
    char c;
    string_data_t *stream = (string_data_t *)data;
    c = stream->data[stream->pos];
    if(c == '\0')
        return EOF;
    else
    {
        stream->pos++;
        return c;
    }
}

static int string_eof(void *data)
{
    string_data_t *stream = (string_data_t *)data;
    return (stream->data[stream->pos] == '\0');
}

json_t *json_loads(const char *string, json_error_t *error)
{
    lex_t lex;
    json_t *result;

    string_data_t stream_data = {
        .data = string,
        .pos = 0
    };

    if(lex_init(&lex, string_get, string_eof, (void *)&stream_data))
        return NULL;

    result = parse_json(&lex, error);
    if(!result)
        goto out;

    lex_scan(&lex, error);
    if(lex.token != TOKEN_EOF) {
        error_set(error, &lex, "end of file expected");
        json_decref(result);
        result = NULL;
    }

out:
    lex_close(&lex);
    return result;
}
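
/* Example (illustrative): parsing a string and reporting failures.
 *
 *     json_error_t err;
 *     json_t *root = json_loads("{\"found\": true}", &err);
 *     if(!root)
 *         fprintf(stderr, "parse error on line %d: %s\n",
 *                 err.line, err.text);
 *     else
 *         json_decref(root);
 */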

json_t *json_loadf(FILE *input, json_error_t *error)
{
    lex_t lex;
    json_t *result;

    if(lex_init(&lex, (get_func)fgetc, (eof_func)feof, input))
        return NULL;

    result = parse_json(&lex, error);
    if(!result)
        goto out;

    lex_scan(&lex, error);
    if(lex.token != TOKEN_EOF) {
        error_set(error, &lex, "end of file expected");
        json_decref(result);
        result = NULL;
    }

out:
    lex_close(&lex);
    return result;
}

json_t *json_load_file(const char *path, json_error_t *error)
{
    json_t *result;
    FILE *fp;

    error_init(error);

    fp = fopen(path, "r");
    if(!fp)
    {
        error_set(error, NULL, "unable to open %s: %s",
                  path, strerror(errno));
        return NULL;
    }

    result = json_loadf(fp, error);

    fclose(fp);
    return result;
}
07070100000016000081A4000003E800000064000000015EF4BCA100000852000000000000000000000000000000000000002A00000000cpuminer-2.5.1/compat/jansson/strbuffer.c/*
 * Copyright (c) 2009, 2010 Petri Lehtinen <petri@digip.org>
 *
 * Jansson is free software; you can redistribute it and/or modify
 * it under the terms of the MIT license. See LICENSE for details.
 */

#define _GNU_SOURCE
#include <stdlib.h>
#include <string.h>
#include "strbuffer.h"
#include "util.h"

#define STRBUFFER_MIN_SIZE  16
#define STRBUFFER_FACTOR    2

int strbuffer_init(strbuffer_t *strbuff)
{
    strbuff->size = STRBUFFER_MIN_SIZE;
    strbuff->length = 0;

    strbuff->value = malloc(strbuff->size);
    if(!strbuff->value)
        return -1;

    /* initialize to empty */
    strbuff->value[0] = '\0';
    return 0;
}

void strbuffer_close(strbuffer_t *strbuff)
{
    free(strbuff->value);
    strbuff->size = 0;
    strbuff->length = 0;
    strbuff->value = NULL;
}

void strbuffer_clear(strbuffer_t *strbuff)
{
    strbuff->length = 0;
    strbuff->value[0] = '\0';
}

const char *strbuffer_value(const strbuffer_t *strbuff)
{
    return strbuff->value;
}

char *strbuffer_steal_value(strbuffer_t *strbuff)
{
    char *result = strbuff->value;
    strbuffer_init(strbuff);
    return result;
}

int strbuffer_append(strbuffer_t *strbuff, const char *string)
{
    return strbuffer_append_bytes(strbuff, string, strlen(string));
}

int strbuffer_append_byte(strbuffer_t *strbuff, char byte)
{
    return strbuffer_append_bytes(strbuff, &byte, 1);
}

int strbuffer_append_bytes(strbuffer_t *strbuff, const char *data, int size)
{
    if(strbuff->length + size >= strbuff->size)
    {
        /* grow geometrically; keep the old buffer if realloc fails
           instead of overwriting (and leaking) the pointer */
        int new_size = max(strbuff->size * STRBUFFER_FACTOR,
                           strbuff->length + size + 1);
        char *new_value = realloc(strbuff->value, new_size);
        if(!new_value)
            return -1;

        strbuff->value = new_value;
        strbuff->size = new_size;
    }

    memcpy(strbuff->value + strbuff->length, data, size);
    strbuff->length += size;
    strbuff->value[strbuff->length] = '\0';

    return 0;
}

char strbuffer_pop(strbuffer_t *strbuff)
{
    if(strbuff->length > 0) {
        char c = strbuff->value[--strbuff->length];
        strbuff->value[strbuff->length] = '\0';
        return c;
    }
    else
        return '\0';
}
07070100000017000081A4000003E800000064000000015EF4BCA100000362000000000000000000000000000000000000002A00000000cpuminer-2.5.1/compat/jansson/strbuffer.h/*
 * Copyright (c) 2009, 2010 Petri Lehtinen <petri@digip.org>
 *
 * Jansson is free software; you can redistribute it and/or modify
 * it under the terms of the MIT license. See LICENSE for details.
 */

#ifndef STRBUFFER_H
#define STRBUFFER_H

typedef struct {
    char *value;
    int length;   /* bytes used */
    int size;     /* bytes allocated */
} strbuffer_t;

int strbuffer_init(strbuffer_t *strbuff);
void strbuffer_close(strbuffer_t *strbuff);

void strbuffer_clear(strbuffer_t *strbuff);

const char *strbuffer_value(const strbuffer_t *strbuff);
char *strbuffer_steal_value(strbuffer_t *strbuff);

int strbuffer_append(strbuffer_t *strbuff, const char *string);
int strbuffer_append_byte(strbuffer_t *strbuff, char byte);
int strbuffer_append_bytes(strbuffer_t *strbuff, const char *data, int size);

char strbuffer_pop(strbuffer_t *strbuff);

#endif
07070100000018000081A4000003E800000064000000015EF4BCA100000FB6000000000000000000000000000000000000002400000000cpuminer-2.5.1/compat/jansson/utf.c/*
 * Copyright (c) 2009, 2010 Petri Lehtinen <petri@digip.org>
 *
 * Jansson is free software; you can redistribute it and/or modify
 * it under the terms of the MIT license. See LICENSE for details.
 */

#include <string.h>
#include "utf.h"

int utf8_encode(int32_t codepoint, char *buffer, int *size)
{
    if(codepoint < 0)
        return -1;
    else if(codepoint < 0x80)
    {
        buffer[0] = (char)codepoint;
        *size = 1;
    }
    else if(codepoint < 0x800)
    {
        buffer[0] = 0xC0 + ((codepoint & 0x7C0) >> 6);
        buffer[1] = 0x80 + ((codepoint & 0x03F));
        *size = 2;
    }
    else if(codepoint < 0x10000)
    {
        buffer[0] = 0xE0 + ((codepoint & 0xF000) >> 12);
        buffer[1] = 0x80 + ((codepoint & 0x0FC0) >> 6);
        buffer[2] = 0x80 + ((codepoint & 0x003F));
        *size = 3;
    }
    else if(codepoint <= 0x10FFFF)
    {
        buffer[0] = 0xF0 + ((codepoint & 0x1C0000) >> 18);
        buffer[1] = 0x80 + ((codepoint & 0x03F000) >> 12);
        buffer[2] = 0x80 + ((codepoint & 0x000FC0) >> 6);
        buffer[3] = 0x80 + ((codepoint & 0x00003F));
        *size = 4;
    }
    else
        return -1;

    return 0;
}
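
/* worked example: U+20AC (the euro sign) is below 0x10000, so it takes
   the three-byte branch above and encodes as
   0xE0 + 0x02 = 0xE2, 0x80 + 0x02 = 0x82, 0x80 + 0x2C = 0xAC,
   i.e. the UTF-8 sequence E2 82 AC */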

int utf8_check_first(char byte)
{
    unsigned char u = (unsigned char)byte;

    if(u < 0x80)
        return 1;

    if(0x80 <= u && u <= 0xBF) {
        /* second, third or fourth byte of a multi-byte
           sequence, i.e. a "continuation byte" */
        return 0;
    }
    else if(u == 0xC0 || u == 0xC1) {
        /* overlong encoding of an ASCII byte */
        return 0;
    }
    else if(0xC2 <= u && u <= 0xDF) {
        /* 2-byte sequence */
        return 2;
    }

    else if(0xE0 <= u && u <= 0xEF) {
        /* 3-byte sequence */
        return 3;
    }
    else if(0xF0 <= u && u <= 0xF4) {
        /* 4-byte sequence */
        return 4;
    }
    else { /* u >= 0xF5 */
        /* Restricted (start of 4-, 5- or 6-byte sequence) or invalid
           UTF-8 */
        return 0;
    }
}

int utf8_check_full(const char *buffer, int size, int32_t *codepoint)
{
    int i;
    int32_t value = 0;
    unsigned char u = (unsigned char)buffer[0];

    if(size == 2)
    {
        value = u & 0x1F;
    }
    else if(size == 3)
    {
        value = u & 0xF;
    }
    else if(size == 4)
    {
        value = u & 0x7;
    }
    else
        return 0;

    for(i = 1; i < size; i++)
    {
        u = (unsigned char)buffer[i];

        if(u < 0x80 || u > 0xBF) {
            /* not a continuation byte */
            return 0;
        }

        value = (value << 6) + (u & 0x3F);
    }

    if(value > 0x10FFFF) {
        /* not in Unicode range */
        return 0;
    }

    else if(0xD800 <= value && value <= 0xDFFF) {
        /* invalid code point (UTF-16 surrogate halves) */
        return 0;
    }

    else if((size == 2 && value < 0x80) ||
            (size == 3 && value < 0x800) ||
            (size == 4 && value < 0x10000)) {
        /* overlong encoding */
        return 0;
    }

    if(codepoint)
        *codepoint = value;

    return 1;
}

const char *utf8_iterate(const char *buffer, int32_t *codepoint)
{
    int count;
    int32_t value;

    if(!*buffer)
        return buffer;

    count = utf8_check_first(buffer[0]);
    if(count <= 0)
        return NULL;

    if(count == 1)
        value = (unsigned char)buffer[0];
    else
    {
        if(!utf8_check_full(buffer, count, &value))
            return NULL;
    }

    if(codepoint)
        *codepoint = value;

    return buffer + count;
}

int utf8_check_string(const char *string, int length)
{
    int i;

    if(length == -1)
        length = strlen(string);

    for(i = 0; i < length; i++)
    {
        int count = utf8_check_first(string[i]);
        if(count == 0)
            return 0;
        else if(count > 1)
        {
            if(i + count > length)
                return 0;

            if(!utf8_check_full(&string[i], count, NULL))
                return 0;

            i += count - 1;
        }
    }

    return 1;
}
07070100000019000081A4000003E800000064000000015EF4BCA10000030A000000000000000000000000000000000000002400000000cpuminer-2.5.1/compat/jansson/utf.h/*
 * Copyright (c) 2009, 2010 Petri Lehtinen <petri@digip.org>
 *
 * Jansson is free software; you can redistribute it and/or modify
 * it under the terms of the MIT license. See LICENSE for details.
 */

#ifndef UTF_H
#define UTF_H

#include <config.h>

#ifdef HAVE_INTTYPES_H
/* inttypes.h includes stdint.h in a standard environment, so there's
no need to include stdint.h separately. If inttypes.h doesn't define
int32_t, it's defined in config.h. */
#include <inttypes.h>
#endif

int utf8_encode(int32_t codepoint, char *buffer, int *size);

int utf8_check_first(char byte);
int utf8_check_full(const char *buffer, int size, int32_t *codepoint);
const char *utf8_iterate(const char *buffer, int32_t *codepoint);

int utf8_check_string(const char *string, int length);

#endif
0707010000001A000081A4000003E800000064000000015EF4BCA100000120000000000000000000000000000000000000002500000000cpuminer-2.5.1/compat/jansson/util.h/*
 * Copyright (c) 2009, 2010 Petri Lehtinen <petri@digip.org>
 *
 * Jansson is free software; you can redistribute it and/or modify
 * it under the terms of the MIT license. See LICENSE for details.
 */

#ifndef UTIL_H
#define UTIL_H

#define max(a, b)  ((a) > (b) ? (a) : (b))

#endif
0707010000001B000081A4000003E800000064000000015EF4BCA100004C51000000000000000000000000000000000000002600000000cpuminer-2.5.1/compat/jansson/value.c/*
 * Copyright (c) 2009, 2010 Petri Lehtinen <petri@digip.org>
 *
 * Jansson is free software; you can redistribute it and/or modify
 * it under the terms of the MIT license. See LICENSE for details.
 */

#define _GNU_SOURCE

#include <config.h>

#include <stdlib.h>
#include <string.h>

#include <jansson.h>
#include "hashtable.h"
#include "jansson_private.h"
#include "utf.h"
#include "util.h"


static inline void json_init(json_t *json, json_type type)
{
    json->type = type;
    json->refcount = 1;
}


/*** object ***/

/* This macro just returns a pointer that's a few bytes backwards from
   string. This makes it possible to pass a pointer to object_key_t
   when only the string inside it is used, without actually creating
   an object_key_t instance. */
#define string_to_key(string)  container_of(string, object_key_t, key)
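/* e.g. json_object_get() below passes string_to_key(key) straight to
   hashtable_get(), so lookups never allocate an object_key_t */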

/* the classic djb2 string hash: h = h * 33 + c over the key bytes */
static unsigned int hash_key(const void *ptr)
{
    const char *str = ((const object_key_t *)ptr)->key;

    unsigned int hash = 5381;
    unsigned int c;

    while((c = (unsigned int)*str))
    {
        hash = ((hash << 5) + hash) + c;
        str++;
    }

    return hash;
}

static int key_equal(const void *ptr1, const void *ptr2)
{
    return strcmp(((const object_key_t *)ptr1)->key,
                  ((const object_key_t *)ptr2)->key) == 0;
}

static void value_decref(void *value)
{
    json_decref((json_t *)value);
}

json_t *json_object(void)
{
    json_object_t *object = malloc(sizeof(json_object_t));
    if(!object)
        return NULL;
    json_init(&object->json, JSON_OBJECT);

    if(hashtable_init(&object->hashtable, hash_key, key_equal,
                      free, value_decref))
    {
        free(object);
        return NULL;
    }

    object->serial = 0;
    object->visited = 0;

    return &object->json;
}

static void json_delete_object(json_object_t *object)
{
    hashtable_close(&object->hashtable);
    free(object);
}

unsigned int json_object_size(const json_t *json)
{
    json_object_t *object;

    if(!json_is_object(json))
        return -1;

    object = json_to_object(json);
    return object->hashtable.size;
}

json_t *json_object_get(const json_t *json, const char *key)
{
    json_object_t *object;

    if(!json_is_object(json))
        return NULL;

    object = json_to_object(json);
    return hashtable_get(&object->hashtable, string_to_key(key));
}

int json_object_set_new_nocheck(json_t *json, const char *key, json_t *value)
{
    json_object_t *object;
    object_key_t *k;

    if(!key || !value)
        return -1;

    if(!json_is_object(json) || json == value)
    {
        json_decref(value);
        return -1;
    }
    object = json_to_object(json);

    k = malloc(sizeof(object_key_t) + strlen(key) + 1);
    if(!k)
        return -1;

    k->serial = object->serial++;
    strcpy(k->key, key);

    if(hashtable_set(&object->hashtable, k, value))
    {
        json_decref(value);
        return -1;
    }

    return 0;
}

int json_object_set_new(json_t *json, const char *key, json_t *value)
{
    if(!key || !utf8_check_string(key, -1))
    {
        json_decref(value);
        return -1;
    }

    return json_object_set_new_nocheck(json, key, value);
}

int json_object_del(json_t *json, const char *key)
{
    json_object_t *object;

    if(!json_is_object(json))
        return -1;

    object = json_to_object(json);
    return hashtable_del(&object->hashtable, string_to_key(key));
}

int json_object_clear(json_t *json)
{
    json_object_t *object;

    if(!json_is_object(json))
        return -1;

    object = json_to_object(json);
    hashtable_clear(&object->hashtable);

    return 0;
}

int json_object_update(json_t *object, json_t *other)
{
    void *iter;

    if(!json_is_object(object) || !json_is_object(other))
        return -1;

    iter = json_object_iter(other);
    while(iter) {
        const char *key;
        json_t *value;

        key = json_object_iter_key(iter);
        value = json_object_iter_value(iter);

        if(json_object_set_nocheck(object, key, value))
            return -1;

        iter = json_object_iter_next(other, iter);
    }

    return 0;
}

void *json_object_iter(json_t *json)
{
    json_object_t *object;

    if(!json_is_object(json))
        return NULL;

    object = json_to_object(json);
    return hashtable_iter(&object->hashtable);
}

void *json_object_iter_at(json_t *json, const char *key)
{
    json_object_t *object;

    if(!key || !json_is_object(json))
        return NULL;

    object = json_to_object(json);
    return hashtable_iter_at(&object->hashtable, string_to_key(key));
}

void *json_object_iter_next(json_t *json, void *iter)
{
    json_object_t *object;

    if(!json_is_object(json) || iter == NULL)
        return NULL;

    object = json_to_object(json);
    return hashtable_iter_next(&object->hashtable, iter);
}

const object_key_t *jsonp_object_iter_fullkey(void *iter)
{
    if(!iter)
        return NULL;

    return hashtable_iter_key(iter);
}

const char *json_object_iter_key(void *iter)
{
    if(!iter)
        return NULL;

    return jsonp_object_iter_fullkey(iter)->key;
}

json_t *json_object_iter_value(void *iter)
{
    if(!iter)
        return NULL;

    return (json_t *)hashtable_iter_value(iter);
}

int json_object_iter_set_new(json_t *json, void *iter, json_t *value)
{
    json_object_t *object;

    if(!json_is_object(json) || !iter || !value)
        return -1;

    object = json_to_object(json);
    hashtable_iter_set(&object->hashtable, iter, value);

    return 0;
}

static int json_object_equal(json_t *object1, json_t *object2)
{
    void *iter;

    if(json_object_size(object1) != json_object_size(object2))
        return 0;

    iter = json_object_iter(object1);
    while(iter)
    {
        const char *key;
        json_t *value1, *value2;

        key = json_object_iter_key(iter);
        value1 = json_object_iter_value(iter);
        value2 = json_object_get(object2, key);

        if(!json_equal(value1, value2))
            return 0;

        iter = json_object_iter_next(object1, iter);
    }

    return 1;
}

static json_t *json_object_copy(json_t *object)
{
    json_t *result;
    void *iter;

    result = json_object();
    if(!result)
        return NULL;

    iter = json_object_iter(object);
    while(iter)
    {
        const char *key;
        json_t *value;

        key = json_object_iter_key(iter);
        value = json_object_iter_value(iter);
        json_object_set_nocheck(result, key, value);

        iter = json_object_iter_next(object, iter);
    }

    return result;
}

static json_t *json_object_deep_copy(json_t *object)
{
    json_t *result;
    void *iter;

    result = json_object();
    if(!result)
        return NULL;

    iter = json_object_iter(object);
    while(iter)
    {
        const char *key;
        json_t *value;

        key = json_object_iter_key(iter);
        value = json_object_iter_value(iter);
        json_object_set_new_nocheck(result, key, json_deep_copy(value));

        iter = json_object_iter_next(object, iter);
    }

    return result;
}


/*** array ***/

json_t *json_array(void)
{
    json_array_t *array = malloc(sizeof(json_array_t));
    if(!array)
        return NULL;
    json_init(&array->json, JSON_ARRAY);

    array->entries = 0;
    array->size = 8;

    array->table = malloc(array->size * sizeof(json_t *));
    if(!array->table) {
        free(array);
        return NULL;
    }

    array->visited = 0;

    return &array->json;
}

static void json_delete_array(json_array_t *array)
{
    unsigned int i;

    for(i = 0; i < array->entries; i++)
        json_decref(array->table[i]);

    free(array->table);
    free(array);
}

unsigned int json_array_size(const json_t *json)
{
    if(!json_is_array(json))
        return 0;

    return json_to_array(json)->entries;
}

json_t *json_array_get(const json_t *json, unsigned int index)
{
    json_array_t *array;
    if(!json_is_array(json))
        return NULL;
    array = json_to_array(json);

    if(index >= array->entries)
        return NULL;

    return array->table[index];
}

int json_array_set_new(json_t *json, unsigned int index, json_t *value)
{
    json_array_t *array;

    if(!value)
        return -1;

    if(!json_is_array(json) || json == value)
    {
        json_decref(value);
        return -1;
    }
    array = json_to_array(json);

    if(index >= array->entries)
    {
        json_decref(value);
        return -1;
    }

    json_decref(array->table[index]);
    array->table[index] = value;

    return 0;
}

static void array_move(json_array_t *array, unsigned int dest,
                       unsigned int src, unsigned int count)
{
    memmove(&array->table[dest], &array->table[src], count * sizeof(json_t *));
}

static void array_copy(json_t **dest, unsigned int dpos,
                       json_t **src, unsigned int spos,
                       unsigned int count)
{
    memcpy(&dest[dpos], &src[spos], count * sizeof(json_t *));
}

/* Ensure room for `amount` more entries.  Returns NULL on allocation
   failure.  With copy != 0 the old entries are copied into the new
   table and the old table is freed; with copy == 0 the old table is
   returned unfreed so the caller can place the entries itself (see
   json_array_insert_new). */
static json_t **json_array_grow(json_array_t *array,
                                unsigned int amount,
                                int copy)
{
    unsigned int new_size;
    json_t **old_table, **new_table;

    if(array->entries + amount <= array->size)
        return array->table;

    old_table = array->table;

    new_size = max(array->size + amount, array->size * 2);
    new_table = malloc(new_size * sizeof(json_t *));
    if(!new_table)
        return NULL;

    array->size = new_size;
    array->table = new_table;

    if(copy) {
        array_copy(array->table, 0, old_table, 0, array->entries);
        free(old_table);
        return array->table;
    }

    return old_table;
}

int json_array_append_new(json_t *json, json_t *value)
{
    json_array_t *array;

    if(!value)
        return -1;

    if(!json_is_array(json) || json == value)
    {
        json_decref(value);
        return -1;
    }
    array = json_to_array(json);

    if(!json_array_grow(array, 1, 1)) {
        json_decref(value);
        return -1;
    }

    array->table[array->entries] = value;
    array->entries++;

    return 0;
}

int json_array_insert_new(json_t *json, unsigned int index, json_t *value)
{
    json_array_t *array;
    json_t **old_table;

    if(!value)
        return -1;

    if(!json_is_array(json) || json == value) {
        json_decref(value);
        return -1;
    }
    array = json_to_array(json);

    if(index > array->entries) {
        json_decref(value);
        return -1;
    }

    old_table = json_array_grow(array, 1, 0);
    if(!old_table) {
        json_decref(value);
        return -1;
    }

    if(old_table != array->table) {
        array_copy(array->table, 0, old_table, 0, index);
        array_copy(array->table, index + 1, old_table, index,
                   array->entries - index);
        free(old_table);
    }
    else
        array_move(array, index + 1, index, array->entries - index);

    array->table[index] = value;
    array->entries++;

    return 0;
}

int json_array_remove(json_t *json, unsigned int index)
{
    json_array_t *array;

    if(!json_is_array(json))
        return -1;
    array = json_to_array(json);

    if(index >= array->entries)
        return -1;

    json_decref(array->table[index]);

    array_move(array, index, index + 1, array->entries - index - 1);
    array->entries--;

    return 0;
}

int json_array_clear(json_t *json)
{
    json_array_t *array;
    unsigned int i;

    if(!json_is_array(json))
        return -1;
    array = json_to_array(json);

    for(i = 0; i < array->entries; i++)
        json_decref(array->table[i]);

    array->entries = 0;
    return 0;
}

int json_array_extend(json_t *json, json_t *other_json)
{
    json_array_t *array, *other;
    unsigned int i;

    if(!json_is_array(json) || !json_is_array(other_json))
        return -1;
    array = json_to_array(json);
    other = json_to_array(other_json);

    if(!json_array_grow(array, other->entries, 1))
        return -1;

    for(i = 0; i < other->entries; i++)
        json_incref(other->table[i]);

    array_copy(array->table, array->entries, other->table, 0, other->entries);

    array->entries += other->entries;
    return 0;
}

static int json_array_equal(json_t *array1, json_t *array2)
{
    unsigned int i, size;

    size = json_array_size(array1);
    if(size != json_array_size(array2))
        return 0;

    for(i = 0; i < size; i++)
    {
        json_t *value1, *value2;

        value1 = json_array_get(array1, i);
        value2 = json_array_get(array2, i);

        if(!json_equal(value1, value2))
            return 0;
    }

    return 1;
}

static json_t *json_array_copy(json_t *array)
{
    json_t *result;
    unsigned int i;

    result = json_array();
    if(!result)
        return NULL;

    for(i = 0; i < json_array_size(array); i++)
        json_array_append(result, json_array_get(array, i));

    return result;
}

static json_t *json_array_deep_copy(json_t *array)
{
    json_t *result;
    unsigned int i;

    result = json_array();
    if(!result)
        return NULL;

    for(i = 0; i < json_array_size(array); i++)
        json_array_append_new(result, json_deep_copy(json_array_get(array, i)));

    return result;
}

/*** string ***/

json_t *json_string_nocheck(const char *value)
{
    json_string_t *string;

    if(!value)
        return NULL;

    string = malloc(sizeof(json_string_t));
    if(!string)
        return NULL;
    json_init(&string->json, JSON_STRING);

    string->value = strdup(value);
    if(!string->value) {
        free(string);
        return NULL;
    }

    return &string->json;
}

json_t *json_string(const char *value)
{
    if(!value || !utf8_check_string(value, -1))
        return NULL;

    return json_string_nocheck(value);
}

const char *json_string_value(const json_t *json)
{
    if(!json_is_string(json))
        return NULL;

    return json_to_string(json)->value;
}

int json_string_set_nocheck(json_t *json, const char *value)
{
    char *dup;
    json_string_t *string;

    if(!json_is_string(json) || !value)
        return -1;

    dup = strdup(value);
    if(!dup)
        return -1;

    string = json_to_string(json);
    free(string->value);
    string->value = dup;

    return 0;
}

int json_string_set(json_t *json, const char *value)
{
    if(!value || !utf8_check_string(value, -1))
        return -1;

    return json_string_set_nocheck(json, value);
}

static void json_delete_string(json_string_t *string)
{
    free(string->value);
    free(string);
}

static int json_string_equal(json_t *string1, json_t *string2)
{
    return strcmp(json_string_value(string1), json_string_value(string2)) == 0;
}

static json_t *json_string_copy(json_t *string)
{
    return json_string_nocheck(json_string_value(string));
}


/*** integer ***/

json_t *json_integer(int value)
{
    json_integer_t *integer = malloc(sizeof(json_integer_t));
    if(!integer)
        return NULL;
    json_init(&integer->json, JSON_INTEGER);

    integer->value = value;
    return &integer->json;
}

int json_integer_value(const json_t *json)
{
    if(!json_is_integer(json))
        return 0;

    return json_to_integer(json)->value;
}

int json_integer_set(json_t *json, int value)
{
    if(!json_is_integer(json))
        return -1;

    json_to_integer(json)->value = value;

    return 0;
}

static void json_delete_integer(json_integer_t *integer)
{
    free(integer);
}

static int json_integer_equal(json_t *integer1, json_t *integer2)
{
    return json_integer_value(integer1) == json_integer_value(integer2);
}

static json_t *json_integer_copy(json_t *integer)
{
    return json_integer(json_integer_value(integer));
}


/*** real ***/

json_t *json_real(double value)
{
    json_real_t *real = malloc(sizeof(json_real_t));
    if(!real)
        return NULL;
    json_init(&real->json, JSON_REAL);

    real->value = value;
    return &real->json;
}

double json_real_value(const json_t *json)
{
    if(!json_is_real(json))
        return 0;

    return json_to_real(json)->value;
}

int json_real_set(json_t *json, double value)
{
    if(!json_is_real(json))
        return -1;

    json_to_real(json)->value = value;

    return 0;
}

static void json_delete_real(json_real_t *real)
{
    free(real);
}

static int json_real_equal(json_t *real1, json_t *real2)
{
    return json_real_value(real1) == json_real_value(real2);
}

static json_t *json_real_copy(json_t *real)
{
    return json_real(json_real_value(real));
}


/*** number ***/

double json_number_value(const json_t *json)
{
    if(json_is_integer(json))
        return json_integer_value(json);
    else if(json_is_real(json))
        return json_real_value(json);
    else
        return 0.0;
}


/*** simple values ***/

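/* The values below are statically allocated singletons.  Their refcount is
 * the sentinel (unsigned int)-1, which keeps reference counting from ever
 * freeing them; json_delete() likewise never touches these types (see the
 * comment there). */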
json_t *json_true(void)
{
    static json_t the_true = {
        .type = JSON_TRUE,
        .refcount = (unsigned int)-1
    };
    return &the_true;
}


json_t *json_false(void)
{
    static json_t the_false = {
        .type = JSON_FALSE,
        .refcount = (unsigned int)-1
    };
    return &the_false;
}


json_t *json_null(void)
{
    static json_t the_null = {
        .type = JSON_NULL,
        .refcount = (unsigned int)-1
    };
    return &the_null;
}


/*** deletion ***/

void json_delete(json_t *json)
{
    if(json_is_object(json))
        json_delete_object(json_to_object(json));

    else if(json_is_array(json))
        json_delete_array(json_to_array(json));

    else if(json_is_string(json))
        json_delete_string(json_to_string(json));

    else if(json_is_integer(json))
        json_delete_integer(json_to_integer(json));

    else if(json_is_real(json))
        json_delete_real(json_to_real(json));

    /* json_delete is not called for true, false or null */
}


/*** equality ***/

int json_equal(json_t *json1, json_t *json2)
{
    if(!json1 || !json2)
        return 0;

    if(json_typeof(json1) != json_typeof(json2))
        return 0;

    /* this covers true, false and null as they are singletons */
    if(json1 == json2)
        return 1;

    if(json_is_object(json1))
        return json_object_equal(json1, json2);

    if(json_is_array(json1))
        return json_array_equal(json1, json2);

    if(json_is_string(json1))
        return json_string_equal(json1, json2);

    if(json_is_integer(json1))
        return json_integer_equal(json1, json2);

    if(json_is_real(json1))
        return json_real_equal(json1, json2);

    return 0;
}


/*** copying ***/

json_t *json_copy(json_t *json)
{
    if(!json)
        return NULL;

    if(json_is_object(json))
        return json_object_copy(json);

    if(json_is_array(json))
        return json_array_copy(json);

    if(json_is_string(json))
        return json_string_copy(json);

    if(json_is_integer(json))
        return json_integer_copy(json);

    if(json_is_real(json))
        return json_real_copy(json);

    if(json_is_true(json) || json_is_false(json) || json_is_null(json))
        return json;

    return NULL;
}

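/* Note on copy semantics (illustrative, hypothetical caller): json_copy()
 * duplicates only the top-level container, so the children of
 *
 *     json_t *shallow = json_copy(root);
 *
 * remain aliased with root's (their refcounts are merely bumped), while
 * json_deep_copy() below duplicates the whole tree and yields a fully
 * independent value. */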
json_t *json_deep_copy(json_t *json)
{
    if(!json)
        return NULL;

    if(json_is_object(json))
        return json_object_deep_copy(json);

    if(json_is_array(json))
        return json_array_deep_copy(json);

    /* for the rest of the types, deep copying doesn't differ from
       shallow copying */

    if(json_is_string(json))
        return json_string_copy(json);

    if(json_is_integer(json))
        return json_integer_copy(json);

    if(json_is_real(json))
        return json_real_copy(json);

    if(json_is_true(json) || json_is_false(json) || json_is_null(json))
        return json;

    return NULL;
}
0707010000001C000081A4000003E800000064000000015EF4BCA100000E55000000000000000000000000000000000000001C00000000cpuminer-2.5.1/configure.acAC_INIT([cpuminer], [2.5.1])

AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM
AC_CONFIG_SRCDIR([cpu-miner.c])
AM_INIT_AUTOMAKE([gnu])
AC_CONFIG_HEADERS([cpuminer-config.h])

dnl Make sure anyone changing configure.ac/Makefile.am has a clue
AM_MAINTAINER_MODE

dnl Checks for programs
AC_PROG_CC_C99
AC_PROG_GCC_TRADITIONAL
AM_PROG_CC_C_O
AM_PROG_AS
AC_PROG_RANLIB

dnl Checks for header files
AC_HEADER_STDC
AC_CHECK_HEADERS([sys/endian.h sys/param.h syslog.h])
# sys/sysctl.h requires sys/types.h on FreeBSD
# sys/sysctl.h requires sys/param.h on OpenBSD
AC_CHECK_HEADERS([sys/sysctl.h], [], [],
[#include <sys/types.h>
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
])

AC_CHECK_DECLS([be32dec, le32dec, be32enc, le32enc], [], [],
[AC_INCLUDES_DEFAULT
#ifdef HAVE_SYS_ENDIAN_H
#include <sys/endian.h>
#endif
])

AC_FUNC_ALLOCA
AC_CHECK_FUNCS([getopt_long])

case $target in
  i*86-*-*)
    have_x86=true
    ;;
  x86_64-*-*|amd64-*-*)
    have_x86_64=true
    ;;
  arm*-*-*)
    have_arm=true
    ;;
  powerpc*-*-*)
    have_ppc=true
    ;;
esac

PTHREAD_FLAGS="-pthread"
WS2_LIBS=""

case $target in
  *-*-mingw*)
    have_win32=true
    PTHREAD_FLAGS=""
    WS2_LIBS="-lws2_32"
    ;;
esac

AC_ARG_ENABLE([assembly],
  AS_HELP_STRING([--disable-assembly], [disable assembly-language routines]))
if test x$enable_assembly != xno; then
  AC_DEFINE([USE_ASM], [1], [Define to 1 if assembly routines are wanted.])
fi

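dnl Note: the XOP and AVX2 probes below are deliberately nested inside the
dnl AVX probe, since they are only meaningful once the assembler accepts AVX
dnl encodings; a failed probe just drops that feature with a warning rather
dnl than aborting configure.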
if test x$enable_assembly != xno -a x$have_x86_64 = xtrue
then
  AC_MSG_CHECKING(whether we can compile AVX code)
  AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vmovdqa %ymm0, %ymm1");])],
    AC_DEFINE(USE_AVX, 1, [Define to 1 if AVX assembly is available.])
    AC_MSG_RESULT(yes)
    AC_MSG_CHECKING(whether we can compile XOP code)
    AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vprotd \$7, %xmm0, %xmm1");])],
      AC_DEFINE(USE_XOP, 1, [Define to 1 if XOP assembly is available.])
      AC_MSG_RESULT(yes)
    ,
      AC_MSG_RESULT(no)
      AC_MSG_WARN([The assembler does not support the XOP instruction set.])
    )
    AC_MSG_CHECKING(whether we can compile AVX2 code)
    AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %ymm0, %ymm1, %ymm2");])],
      AC_DEFINE(USE_AVX2, 1, [Define to 1 if AVX2 assembly is available.])
      AC_MSG_RESULT(yes)
    ,
      AC_MSG_RESULT(no)
      AC_MSG_WARN([The assembler does not support the AVX2 instruction set.])
    )
  ,
    AC_MSG_RESULT(no)
    AC_MSG_WARN([The assembler does not support the AVX instruction set.])
  )
fi

AC_CHECK_LIB(jansson, json_loads, request_jansson=false, request_jansson=true)
AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthread",
  AC_CHECK_LIB([pthreadGC2], [pthread_create], PTHREAD_LIBS="-lpthreadGC2",
    AC_CHECK_LIB([pthreadGC1], [pthread_create], PTHREAD_LIBS="-lpthreadGC1",
      AC_CHECK_LIB([pthreadGC], [pthread_create], PTHREAD_LIBS="-lpthreadGC"
))))

AM_CONDITIONAL([WANT_JANSSON], [test x$request_jansson = xtrue])
AM_CONDITIONAL([HAVE_WINDOWS], [test x$have_win32 = xtrue])
AM_CONDITIONAL([USE_ASM], [test x$enable_assembly != xno])
AM_CONDITIONAL([ARCH_x86], [test x$have_x86 = xtrue])
AM_CONDITIONAL([ARCH_x86_64], [test x$have_x86_64 = xtrue])
AM_CONDITIONAL([ARCH_ARM], [test x$have_arm = xtrue])
AM_CONDITIONAL([ARCH_PPC], [test x$have_ppc = xtrue])

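dnl Prefer a system libjansson; if none was found above, fall back to the
dnl bundled copy in compat/jansson via the WANT_JANSSON conditional.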
if test x$request_jansson = xtrue
then
	JANSSON_LIBS="compat/jansson/libjansson.a"
else
	JANSSON_LIBS=-ljansson
fi

LIBCURL_CHECK_CONFIG(, 7.15.2, ,
  [AC_MSG_ERROR([Missing required libcurl >= 7.15.2])])

AC_SUBST(JANSSON_LIBS)
AC_SUBST(PTHREAD_FLAGS)
AC_SUBST(PTHREAD_LIBS)
AC_SUBST(WS2_LIBS)

AC_CONFIG_FILES([
	Makefile
	compat/Makefile
	compat/jansson/Makefile
	])
AC_OUTPUT
0707010000001D000081A4000003E800000064000000015EF4BCA10000C4D2000000000000000000000000000000000000001B00000000cpuminer-2.5.1/cpu-miner.c/*
 * Copyright 2010 Jeff Garzik
 * Copyright 2012-2017 pooler
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.  See COPYING for more details.
 */

#include "cpuminer-config.h"
#define _GNU_SOURCE

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <inttypes.h>
#include <unistd.h>
#include <sys/time.h>
#include <time.h>
#ifdef WIN32
#include <windows.h>
#else
#include <errno.h>
#include <signal.h>
#include <sys/resource.h>
#if HAVE_SYS_SYSCTL_H
#include <sys/types.h>
#if HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#include <sys/sysctl.h>
#endif
#endif
#include <jansson.h>
#include <curl/curl.h>
#include "compat.h"
#include "miner.h"

#define PROGRAM_NAME		"minerd"
#define LP_SCANTIME		60

#ifdef __linux /* Linux specific policy and affinity management */
#include <sched.h>
static inline void drop_policy(void)
{
	struct sched_param param;
	param.sched_priority = 0;

#ifdef SCHED_IDLE
	if (unlikely(sched_setscheduler(0, SCHED_IDLE, &param) == -1))
#endif
#ifdef SCHED_BATCH
		sched_setscheduler(0, SCHED_BATCH, &param);
#endif
}

static inline void affine_to_cpu(int id, int cpu)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	sched_setaffinity(0, sizeof(set), &set);
}
#elif defined(__FreeBSD__) /* FreeBSD specific policy and affinity management */
#include <sys/cpuset.h>
static inline void drop_policy(void)
{
}

static inline void affine_to_cpu(int id, int cpu)
{
	cpuset_t set;
	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(cpuset_t), &set);
}
#else
static inline void drop_policy(void)
{
}

static inline void affine_to_cpu(int id, int cpu)
{
}
#endif
		
enum workio_commands {
	WC_GET_WORK,
	WC_SUBMIT_WORK,
};

struct workio_cmd {
	enum workio_commands	cmd;
	struct thr_info		*thr;
	union {
		struct work	*work;
	} u;
};

enum algos {
	ALGO_SCRYPT,		/* scrypt(1024,1,1) */
	ALGO_SHA256D,		/* SHA-256d */
};

static const char *algo_names[] = {
	[ALGO_SCRYPT]		= "scrypt",
	[ALGO_SHA256D]		= "sha256d",
};

bool opt_debug = false;
bool opt_protocol = false;
static bool opt_benchmark = false;
bool opt_redirect = true;
bool want_longpoll = true;
bool have_longpoll = false;
bool have_gbt = true;
bool allow_getwork = true;
bool want_stratum = true;
bool have_stratum = false;
bool use_syslog = false;
static bool opt_background = false;
static bool opt_quiet = false;
static int opt_retries = -1;
static int opt_fail_pause = 30;
int opt_timeout = 0;
static int opt_scantime = 5;
static enum algos opt_algo = ALGO_SCRYPT;
static int opt_scrypt_n = 1024;
static int opt_n_threads;
static int num_processors;
static char *rpc_url;
static char *rpc_userpass;
static char *rpc_user, *rpc_pass;
static int pk_script_size;
static unsigned char pk_script[42];
static char coinbase_sig[101] = "";
char *opt_cert;
char *opt_proxy;
long opt_proxy_type;
struct thr_info *thr_info;
static int work_thr_id;
int longpoll_thr_id = -1;
int stratum_thr_id = -1;
struct work_restart *work_restart = NULL;
static struct stratum_ctx stratum;

pthread_mutex_t applog_lock;
static pthread_mutex_t stats_lock;

static unsigned long accepted_count = 0L;
static unsigned long rejected_count = 0L;
static double *thr_hashrates;

#ifdef HAVE_GETOPT_LONG
#include <getopt.h>
#else
struct option {
	const char *name;
	int has_arg;
	int *flag;
	int val;
};
#endif

static char const usage[] = "\
Usage: " PROGRAM_NAME " [OPTIONS]\n\
Options:\n\
  -a, --algo=ALGO       specify the algorithm to use\n\
                          scrypt    scrypt(1024, 1, 1) (default)\n\
                          scrypt:N  scrypt(N, 1, 1)\n\
                          sha256d   SHA-256d\n\
  -o, --url=URL         URL of mining server\n\
  -O, --userpass=U:P    username:password pair for mining server\n\
  -u, --user=USERNAME   username for mining server\n\
  -p, --pass=PASSWORD   password for mining server\n\
      --cert=FILE       certificate for mining server using SSL\n\
  -x, --proxy=[PROTOCOL://]HOST[:PORT]  connect through a proxy\n\
  -t, --threads=N       number of miner threads (default: number of processors)\n\
  -r, --retries=N       number of times to retry if a network call fails\n\
                          (default: retry indefinitely)\n\
  -R, --retry-pause=N   time to pause between retries, in seconds (default: 30)\n\
  -T, --timeout=N       timeout for long polling, in seconds (default: none)\n\
  -s, --scantime=N      upper bound on time spent scanning current work when\n\
                          long polling is unavailable, in seconds (default: 5)\n\
      --coinbase-addr=ADDR  payout address for solo mining\n\
      --coinbase-sig=TEXT  data to insert in the coinbase when possible\n\
      --no-longpoll     disable long polling support\n\
      --no-getwork      disable getwork support\n\
      --no-gbt          disable getblocktemplate support\n\
      --no-stratum      disable X-Stratum support\n\
      --no-redirect     ignore requests to change the URL of the mining server\n\
  -q, --quiet           disable per-thread hashmeter output\n\
  -D, --debug           enable debug output\n\
  -P, --protocol-dump   verbose dump of protocol-level activities\n"
#ifdef HAVE_SYSLOG_H
"\
  -S, --syslog          use system log for output messages\n"
#endif
#ifndef WIN32
"\
  -B, --background      run the miner in the background\n"
#endif
"\
      --benchmark       run in offline benchmark mode\n\
  -c, --config=FILE     load a JSON-format configuration file\n\
  -V, --version         display version information and exit\n\
  -h, --help            display this help text and exit\n\
";

static char const short_options[] =
#ifndef WIN32
	"B"
#endif
#ifdef HAVE_SYSLOG_H
	"S"
#endif
	"a:c:Dhp:Px:qr:R:s:t:T:o:u:O:V";

static struct option const options[] = {
	{ "algo", 1, NULL, 'a' },
#ifndef WIN32
	{ "background", 0, NULL, 'B' },
#endif
	{ "benchmark", 0, NULL, 1005 },
	{ "cert", 1, NULL, 1001 },
	{ "coinbase-addr", 1, NULL, 1013 },
	{ "coinbase-sig", 1, NULL, 1015 },
	{ "config", 1, NULL, 'c' },
	{ "debug", 0, NULL, 'D' },
	{ "help", 0, NULL, 'h' },
	{ "no-gbt", 0, NULL, 1011 },
	{ "no-getwork", 0, NULL, 1010 },
	{ "no-longpoll", 0, NULL, 1003 },
	{ "no-redirect", 0, NULL, 1009 },
	{ "no-stratum", 0, NULL, 1007 },
	{ "pass", 1, NULL, 'p' },
	{ "protocol-dump", 0, NULL, 'P' },
	{ "proxy", 1, NULL, 'x' },
	{ "quiet", 0, NULL, 'q' },
	{ "retries", 1, NULL, 'r' },
	{ "retry-pause", 1, NULL, 'R' },
	{ "scantime", 1, NULL, 's' },
#ifdef HAVE_SYSLOG_H
	{ "syslog", 0, NULL, 'S' },
#endif
	{ "threads", 1, NULL, 't' },
	{ "timeout", 1, NULL, 'T' },
	{ "url", 1, NULL, 'o' },
	{ "user", 1, NULL, 'u' },
	{ "userpass", 1, NULL, 'O' },
	{ "version", 0, NULL, 'V' },
	{ 0, 0, 0, 0 }
};

struct work {
	uint32_t data[32];
	uint32_t target[8];

	int height;
	char *txs;
	char *workid;

	char *job_id;
	size_t xnonce2_len;
	unsigned char *xnonce2;
};

static struct work g_work;
static time_t g_work_time;
static pthread_mutex_t g_work_lock;
static bool submit_old = false;
static char *lp_id;

static inline void work_free(struct work *w)
{
	free(w->txs);
	free(w->workid);
	free(w->job_id);
	free(w->xnonce2);
}

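/* Duplicate a work unit; dest receives its own heap copies of txs, workid,
 * job_id and xnonce2, so work_free() can later be applied to src and dest
 * independently. */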
static inline void work_copy(struct work *dest, const struct work *src)
{
	memcpy(dest, src, sizeof(struct work));
	if (src->txs)
		dest->txs = strdup(src->txs);
	if (src->workid)
		dest->workid = strdup(src->workid);
	if (src->job_id)
		dest->job_id = strdup(src->job_id);
	if (src->xnonce2) {
		dest->xnonce2 = malloc(src->xnonce2_len);
		memcpy(dest->xnonce2, src->xnonce2, src->xnonce2_len);
	}
}

static bool jobj_binary(const json_t *obj, const char *key,
			void *buf, size_t buflen)
{
	const char *hexstr;
	json_t *tmp;

	tmp = json_object_get(obj, key);
	if (unlikely(!tmp)) {
		applog(LOG_ERR, "JSON key '%s' not found", key);
		return false;
	}
	hexstr = json_string_value(tmp);
	if (unlikely(!hexstr)) {
		applog(LOG_ERR, "JSON key '%s' is not a string", key);
		return false;
	}
	if (!hex2bin(buf, hexstr, buflen))
		return false;

	return true;
}

static bool work_decode(const json_t *val, struct work *work)
{
	int i;

	if (unlikely(!jobj_binary(val, "data", work->data, sizeof(work->data)))) {
		applog(LOG_ERR, "JSON invalid data");
		goto err_out;
	}
	if (unlikely(!jobj_binary(val, "target", work->target, sizeof(work->target)))) {
		applog(LOG_ERR, "JSON invalid target");
		goto err_out;
	}

	for (i = 0; i < ARRAY_SIZE(work->data); i++)
		work->data[i] = le32dec(work->data + i);
	for (i = 0; i < ARRAY_SIZE(work->target); i++)
		work->target[i] = le32dec(work->target + i);

	return true;

err_out:
	return false;
}

static bool gbt_work_decode(const json_t *val, struct work *work)
{
	int i, n;
	uint32_t version, curtime, bits;
	uint32_t prevhash[8];
	uint32_t target[8];
	int cbtx_size;
	unsigned char *cbtx = NULL;
	int tx_count, tx_size;
	unsigned char txc_vi[9];
	unsigned char (*merkle_tree)[32] = NULL;
	bool coinbase_append = false;
	bool submit_coinbase = false;
	bool segwit = false;
	json_t *tmp, *txa;
	bool rc = false;

	tmp = json_object_get(val, "rules");
	if (tmp && json_is_array(tmp)) {
		n = json_array_size(tmp);
		for (i = 0; i < n; i++) {
			const char *s = json_string_value(json_array_get(tmp, i));
			if (!s)
				continue;
			if (!strcmp(s, "segwit") || !strcmp(s, "!segwit"))
				segwit = true;
		}
	}

	tmp = json_object_get(val, "mutable");
	if (tmp && json_is_array(tmp)) {
		n = json_array_size(tmp);
		for (i = 0; i < n; i++) {
			const char *s = json_string_value(json_array_get(tmp, i));
			if (!s)
				continue;
			if (!strcmp(s, "coinbase/append"))
				coinbase_append = true;
			else if (!strcmp(s, "submit/coinbase"))
				submit_coinbase = true;
		}
	}

	tmp = json_object_get(val, "height");
	if (!tmp || !json_is_integer(tmp)) {
		applog(LOG_ERR, "JSON invalid height");
		goto out;
	}
	work->height = json_integer_value(tmp);

	tmp = json_object_get(val, "version");
	if (!tmp || !json_is_integer(tmp)) {
		applog(LOG_ERR, "JSON invalid version");
		goto out;
	}
	version = json_integer_value(tmp);

	if (unlikely(!jobj_binary(val, "previousblockhash", prevhash, sizeof(prevhash)))) {
		applog(LOG_ERR, "JSON invalid previousblockhash");
		goto out;
	}

	tmp = json_object_get(val, "curtime");
	if (!tmp || !json_is_integer(tmp)) {
		applog(LOG_ERR, "JSON invalid curtime");
		goto out;
	}
	curtime = json_integer_value(tmp);

	if (unlikely(!jobj_binary(val, "bits", &bits, sizeof(bits)))) {
		applog(LOG_ERR, "JSON invalid bits");
		goto out;
	}

	/* find count and size of transactions */
	txa = json_object_get(val, "transactions");
	if (!txa || !json_is_array(txa)) {
		applog(LOG_ERR, "JSON invalid transactions");
		goto out;
	}
	tx_count = json_array_size(txa);
	tx_size = 0;
	for (i = 0; i < tx_count; i++) {
		const json_t *tx = json_array_get(txa, i);
		const char *tx_hex = json_string_value(json_object_get(tx, "data"));
		if (!tx_hex) {
			applog(LOG_ERR, "JSON invalid transactions");
			goto out;
		}
		tx_size += strlen(tx_hex) / 2;
	}

	/* build coinbase transaction */
	tmp = json_object_get(val, "coinbasetxn");
	if (tmp) {
		const char *cbtx_hex = json_string_value(json_object_get(tmp, "data"));
		cbtx_size = cbtx_hex ? strlen(cbtx_hex) / 2 : 0;
		cbtx = malloc(cbtx_size + 100);
		if (cbtx_size < 60 || !hex2bin(cbtx, cbtx_hex, cbtx_size)) {
			applog(LOG_ERR, "JSON invalid coinbasetxn");
			goto out;
		}
	} else {
		int64_t cbvalue;
		if (!pk_script_size) {
			if (allow_getwork) {
				applog(LOG_INFO, "No payout address provided, switching to getwork");
				have_gbt = false;
			} else
				applog(LOG_ERR, "No payout address provided");
			goto out;
		}
		tmp = json_object_get(val, "coinbasevalue");
		if (!tmp || !json_is_number(tmp)) {
			applog(LOG_ERR, "JSON invalid coinbasevalue");
			goto out;
		}
		cbvalue = json_is_integer(tmp) ? json_integer_value(tmp) : json_number_value(tmp);
		cbtx = malloc(256);
		le32enc((uint32_t *)cbtx, 1); /* version */
		cbtx[4] = 1; /* in-counter */
		memset(cbtx+5, 0x00, 32); /* prev txout hash */
		le32enc((uint32_t *)(cbtx+37), 0xffffffff); /* prev txout index */
		cbtx_size = 43;
		/* BIP 34: height in coinbase */
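		/* Illustrative example (not data emitted here): height 1000000
		 * = 0x0f4240 is serialized little-endian as 40 42 0f; a 0x00
		 * pad byte is appended only when the top byte is >= 0x80, so
		 * the script number stays positive. */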
		for (n = work->height; n; n >>= 8) {
			cbtx[cbtx_size++] = n & 0xff;
			if (n < 0x100 && n >= 0x80)
				cbtx[cbtx_size++] = 0;
		}
		cbtx[42] = cbtx_size - 43;
		cbtx[41] = cbtx_size - 42; /* scriptsig length */
		le32enc((uint32_t *)(cbtx+cbtx_size), 0xffffffff); /* sequence */
		cbtx_size += 4;
		cbtx[cbtx_size++] = segwit ? 2 : 1; /* out-counter */
		le32enc((uint32_t *)(cbtx+cbtx_size), (uint32_t)cbvalue); /* value */
		le32enc((uint32_t *)(cbtx+cbtx_size+4), cbvalue >> 32);
		cbtx_size += 8;
		cbtx[cbtx_size++] = pk_script_size; /* txout-script length */
		memcpy(cbtx+cbtx_size, pk_script, pk_script_size);
		cbtx_size += pk_script_size;
		if (segwit) {
			unsigned char (*wtree)[32] = calloc(tx_count + 2, 32);
			memset(cbtx+cbtx_size, 0, 8); /* value */
			cbtx_size += 8;
			cbtx[cbtx_size++] = 38; /* txout-script length */
			cbtx[cbtx_size++] = 0x6a; /* txout-script */
			cbtx[cbtx_size++] = 0x24;
			cbtx[cbtx_size++] = 0xaa;
			cbtx[cbtx_size++] = 0x21;
			cbtx[cbtx_size++] = 0xa9;
			cbtx[cbtx_size++] = 0xed;
			for (i = 0; i < tx_count; i++) {
				const json_t *tx = json_array_get(txa, i);
				const json_t *hash = json_object_get(tx, "hash");
				if (!hash || !hex2bin(wtree[1+i], json_string_value(hash), 32)) {
					applog(LOG_ERR, "JSON invalid transaction hash");
					free(wtree);
					goto out;
				}
				memrev(wtree[1+i], 32);
			}
			n = tx_count + 1;
			while (n > 1) {
				if (n % 2)
					memcpy(wtree[n], wtree[n-1], 32);
				n = (n + 1) / 2;
				for (i = 0; i < n; i++)
					sha256d(wtree[i], wtree[2*i], 64);
			}
			memset(wtree[1], 0, 32);  /* witness reserved value = 0 */
			sha256d(cbtx+cbtx_size, wtree[0], 64);
			cbtx_size += 32;
			free(wtree);
		}
		le32enc((uint32_t *)(cbtx+cbtx_size), 0); /* lock time */
		cbtx_size += 4;
		coinbase_append = true;
	}
	if (coinbase_append) {
		unsigned char xsig[100];
		int xsig_len = 0;
		if (*coinbase_sig) {
			n = strlen(coinbase_sig);
			if (cbtx[41] + xsig_len + n <= 100) {
				memcpy(xsig+xsig_len, coinbase_sig, n);
				xsig_len += n;
			} else {
				applog(LOG_WARNING, "Signature does not fit in coinbase, skipping");
			}
		}
		tmp = json_object_get(val, "coinbaseaux");
		if (tmp && json_is_object(tmp)) {
			void *iter = json_object_iter(tmp);
			while (iter) {
				unsigned char buf[100];
				const char *s = json_string_value(json_object_iter_value(iter));
				n = s ? strlen(s) / 2 : 0;
				if (!s || n > 100 || !hex2bin(buf, s, n)) {
					applog(LOG_ERR, "JSON invalid coinbaseaux");
					break;
				}
				if (cbtx[41] + xsig_len + n <= 100) {
					memcpy(xsig+xsig_len, buf, n);
					xsig_len += n;
				}
				iter = json_object_iter_next(tmp, iter);
			}
		}
		if (xsig_len) {
			unsigned char *ssig_end = cbtx + 42 + cbtx[41];
			int push_len = cbtx[41] + xsig_len < 76 ? 1 :
			               cbtx[41] + 2 + xsig_len > 100 ? 0 : 2;
			n = xsig_len + push_len;
			memmove(ssig_end + n, ssig_end, cbtx_size - 42 - cbtx[41]);
			cbtx[41] += n;
			if (push_len == 2)
				*(ssig_end++) = 0x4c; /* OP_PUSHDATA1 */
			if (push_len)
				*(ssig_end++) = xsig_len;
			memcpy(ssig_end, xsig, xsig_len);
			cbtx_size += n;
		}
	}

	n = varint_encode(txc_vi, 1 + tx_count);
	work->txs = malloc(2 * (n + cbtx_size + tx_size) + 1);
	bin2hex(work->txs, txc_vi, n);
	bin2hex(work->txs + 2*n, cbtx, cbtx_size);

	/* generate merkle root */
	merkle_tree = malloc(32 * ((1 + tx_count + 1) & ~1));
	sha256d(merkle_tree[0], cbtx, cbtx_size);
	for (i = 0; i < tx_count; i++) {
		tmp = json_array_get(txa, i);
		const char *tx_hex = json_string_value(json_object_get(tmp, "data"));
		const int tx_size = tx_hex ? strlen(tx_hex) / 2 : 0;
		if (segwit) {
			const char *txid = json_string_value(json_object_get(tmp, "txid"));
			if (!txid || !hex2bin(merkle_tree[1 + i], txid, 32)) {
				applog(LOG_ERR, "JSON invalid transaction txid");
				goto out;
			}
			memrev(merkle_tree[1 + i], 32);
		} else {
			unsigned char *tx = malloc(tx_size);
			if (!tx_hex || !hex2bin(tx, tx_hex, tx_size)) {
				applog(LOG_ERR, "JSON invalid transactions");
				free(tx);
				goto out;
			}
			sha256d(merkle_tree[1 + i], tx, tx_size);
			free(tx);
		}
		if (!submit_coinbase)
			strcat(work->txs, tx_hex);
	}
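	/* Fold the tree bottom-up: each level hashes adjacent pairs with
	 * sha256d, duplicating the last node when a level has an odd count,
	 * as in the standard bitcoin merkle construction. */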
	n = 1 + tx_count;
	while (n > 1) {
		if (n % 2) {
			memcpy(merkle_tree[n], merkle_tree[n-1], 32);
			++n;
		}
		n /= 2;
		for (i = 0; i < n; i++)
			sha256d(merkle_tree[i], merkle_tree[2*i], 64);
	}

	/* assemble block header */
	work->data[0] = swab32(version);
	for (i = 0; i < 8; i++)
		work->data[8 - i] = le32dec(prevhash + i);
	for (i = 0; i < 8; i++)
		work->data[9 + i] = be32dec((uint32_t *)merkle_tree[0] + i);
	work->data[17] = swab32(curtime);
	work->data[18] = le32dec(&bits);
	memset(work->data + 19, 0x00, 52);
	work->data[20] = 0x80000000;
	work->data[31] = 0x00000280;

	if (unlikely(!jobj_binary(val, "target", target, sizeof(target)))) {
		applog(LOG_ERR, "JSON invalid target");
		goto out;
	}
	for (i = 0; i < ARRAY_SIZE(work->target); i++)
		work->target[7 - i] = be32dec(target + i);

	tmp = json_object_get(val, "workid");
	if (tmp) {
		if (!json_is_string(tmp)) {
			applog(LOG_ERR, "JSON invalid workid");
			goto out;
		}
		work->workid = strdup(json_string_value(tmp));
	}

	/* Long polling */
	tmp = json_object_get(val, "longpollid");
	if (want_longpoll && json_is_string(tmp)) {
		free(lp_id);
		lp_id = strdup(json_string_value(tmp));
		if (!have_longpoll) {
			char *lp_uri;
			tmp = json_object_get(val, "longpolluri");
			lp_uri = strdup(json_is_string(tmp) ? json_string_value(tmp) : rpc_url);
			have_longpoll = true;
			tq_push(thr_info[longpoll_thr_id].q, lp_uri);
		}
	}

	rc = true;

out:
	free(cbtx);
	free(merkle_tree);
	return rc;
}

static void share_result(int result, const char *reason)
{
	char s[345];
	double hashrate;
	int i;

	hashrate = 0.;
	pthread_mutex_lock(&stats_lock);
	for (i = 0; i < opt_n_threads; i++)
		hashrate += thr_hashrates[i];
	result ? accepted_count++ : rejected_count++;
	pthread_mutex_unlock(&stats_lock);
	
	sprintf(s, hashrate >= 1e6 ? "%.0f" : "%.2f", 1e-3 * hashrate);
	applog(LOG_INFO, "accepted: %lu/%lu (%.2f%%), %s khash/s %s",
		   accepted_count,
		   accepted_count + rejected_count,
		   100. * accepted_count / (accepted_count + rejected_count),
		   s,
		   result ? "(yay!!!)" : "(booooo)");

	if (opt_debug && reason)
		applog(LOG_DEBUG, "DEBUG: reject reason: %s", reason);
}

static bool submit_upstream_work(CURL *curl, struct work *work)
{
	json_t *val, *res, *reason;
	char data_str[2 * sizeof(work->data) + 1];
	char s[345];
	int i;
	bool rc = false;

	/* pass if the previous hash is not the current previous hash */
	if (!submit_old && memcmp(work->data + 1, g_work.data + 1, 32)) {
		if (opt_debug)
			applog(LOG_DEBUG, "DEBUG: stale work detected, discarding");
		return true;
	}

	if (have_stratum) {
		uint32_t ntime, nonce;
		char ntimestr[9], noncestr[9], *xnonce2str, *req;

		le32enc(&ntime, work->data[17]);
		le32enc(&nonce, work->data[19]);
		bin2hex(ntimestr, (const unsigned char *)(&ntime), 4);
		bin2hex(noncestr, (const unsigned char *)(&nonce), 4);
		xnonce2str = abin2hex(work->xnonce2, work->xnonce2_len);
		req = malloc(256 + strlen(rpc_user) + strlen(work->job_id) + 2 * work->xnonce2_len);
		sprintf(req,
			"{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
			rpc_user, work->job_id, xnonce2str, ntimestr, noncestr);
		free(xnonce2str);

		rc = stratum_send_line(&stratum, req);
		free(req);
		if (unlikely(!rc)) {
			applog(LOG_ERR, "submit_upstream_work stratum_send_line failed");
			goto out;
		}
	} else if (work->txs) {
		char *req;

		for (i = 0; i < ARRAY_SIZE(work->data); i++)
			be32enc(work->data + i, work->data[i]);
		bin2hex(data_str, (unsigned char *)work->data, 80);
		if (work->workid) {
			char *params;
			val = json_object();
			json_object_set_new(val, "workid", json_string(work->workid));
			params = json_dumps(val, 0);
			json_decref(val);
			req = malloc(128 + 2*80 + strlen(work->txs) + strlen(params));
			sprintf(req,
				"{\"method\": \"submitblock\", \"params\": [\"%s%s\", %s], \"id\":1}\r\n",
				data_str, work->txs, params);
			free(params);
		} else {
			req = malloc(128 + 2*80 + strlen(work->txs));
			sprintf(req,
				"{\"method\": \"submitblock\", \"params\": [\"%s%s\"], \"id\":1}\r\n",
				data_str, work->txs);
		}
		val = json_rpc_call(curl, rpc_url, rpc_userpass, req, NULL, 0);
		free(req);
		if (unlikely(!val)) {
			applog(LOG_ERR, "submit_upstream_work json_rpc_call failed");
			goto out;
		}

		res = json_object_get(val, "result");
		if (json_is_object(res)) {
			char *res_str;
			bool sumres = false;
			void *iter = json_object_iter(res);
			while (iter) {
				if (json_is_null(json_object_iter_value(iter))) {
					sumres = true;
					break;
				}
				iter = json_object_iter_next(res, iter);
			}
			res_str = json_dumps(res, 0);
			share_result(sumres, res_str);
			free(res_str);
		} else
			share_result(json_is_null(res), json_string_value(res));

		json_decref(val);
	} else {
		/* build hex string */
		for (i = 0; i < ARRAY_SIZE(work->data); i++)
			le32enc(work->data + i, work->data[i]);
		bin2hex(data_str, (unsigned char *)work->data, sizeof(work->data));

		/* build JSON-RPC request */
		sprintf(s,
			"{\"method\": \"getwork\", \"params\": [ \"%s\" ], \"id\":1}\r\n",
			data_str);

		/* issue JSON-RPC request */
		val = json_rpc_call(curl, rpc_url, rpc_userpass, s, NULL, 0);
		if (unlikely(!val)) {
			applog(LOG_ERR, "submit_upstream_work json_rpc_call failed");
			goto out;
		}

		res = json_object_get(val, "result");
		reason = json_object_get(val, "reject-reason");
		share_result(json_is_true(res), reason ? json_string_value(reason) : NULL);

		json_decref(val);
	}

	rc = true;

out:
	return rc;
}

static const char *getwork_req =
	"{\"method\": \"getwork\", \"params\": [], \"id\":0}\r\n";

#define GBT_CAPABILITIES "[\"coinbasetxn\", \"coinbasevalue\", \"longpoll\", \"workid\"]"
#define GBT_RULES "[\"segwit\"]"

static const char *gbt_req =
	"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
	GBT_CAPABILITIES ", \"rules\": " GBT_RULES "}], \"id\":0}\r\n";
static const char *gbt_lp_req =
	"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
	GBT_CAPABILITIES ", \"rules\": " GBT_RULES ", \"longpollid\": \"%s\"}], \"id\":0}\r\n";

static bool get_upstream_work(CURL *curl, struct work *work)
{
	json_t *val;
	int err;
	bool rc;
	struct timeval tv_start, tv_end, diff;

start:
	gettimeofday(&tv_start, NULL);
	val = json_rpc_call(curl, rpc_url, rpc_userpass,
			    have_gbt ? gbt_req : getwork_req,
			    &err, have_gbt ? JSON_RPC_QUIET_404 : 0);
	gettimeofday(&tv_end, NULL);

	if (have_stratum) {
		if (val)
			json_decref(val);
		return true;
	}

	if (!have_gbt && !allow_getwork) {
		applog(LOG_ERR, "No usable protocol");
		if (val)
			json_decref(val);
		return false;
	}

	if (have_gbt && allow_getwork && !val && err == CURLE_OK) {
		applog(LOG_INFO, "getblocktemplate failed, falling back to getwork");
		have_gbt = false;
		goto start;
	}

	if (!val)
		return false;

	if (have_gbt) {
		rc = gbt_work_decode(json_object_get(val, "result"), work);
		if (!have_gbt) {
			json_decref(val);
			goto start;
		}
	} else
		rc = work_decode(json_object_get(val, "result"), work);

	if (opt_debug && rc) {
		timeval_subtract(&diff, &tv_end, &tv_start);
		applog(LOG_DEBUG, "DEBUG: got new work in %d ms",
		       diff.tv_sec * 1000 + diff.tv_usec / 1000);
	}

	json_decref(val);

	return rc;
}

static void workio_cmd_free(struct workio_cmd *wc)
{
	if (!wc)
		return;

	switch (wc->cmd) {
	case WC_SUBMIT_WORK:
		work_free(wc->u.work);
		free(wc->u.work);
		break;
	default: /* do nothing */
		break;
	}

	memset(wc, 0, sizeof(*wc));	/* poison */
	free(wc);
}

static bool workio_get_work(struct workio_cmd *wc, CURL *curl)
{
	struct work *ret_work;
	int failures = 0;

	ret_work = calloc(1, sizeof(*ret_work));
	if (!ret_work)
		return false;

	/* obtain new work from bitcoin via JSON-RPC */
	while (!get_upstream_work(curl, ret_work)) {
		if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) {
			applog(LOG_ERR, "json_rpc_call failed, terminating workio thread");
			free(ret_work);
			return false;
		}

		/* pause, then restart work-request loop */
		applog(LOG_ERR, "json_rpc_call failed, retry after %d seconds",
			opt_fail_pause);
		sleep(opt_fail_pause);
	}

	/* send work to requesting thread */
	if (!tq_push(wc->thr->q, ret_work))
		free(ret_work);

	return true;
}

static bool workio_submit_work(struct workio_cmd *wc, CURL *curl)
{
	int failures = 0;

	/* submit solution to bitcoin via JSON-RPC */
	while (!submit_upstream_work(curl, wc->u.work)) {
		if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) {
			applog(LOG_ERR, "...terminating workio thread");
			return false;
		}

		/* pause, then restart work-submit loop */
		applog(LOG_ERR, "...retry after %d seconds",
			opt_fail_pause);
		sleep(opt_fail_pause);
	}

	return true;
}

static void *workio_thread(void *userdata)
{
	struct thr_info *mythr = userdata;
	CURL *curl;
	bool ok = true;

	curl = curl_easy_init();
	if (unlikely(!curl)) {
		applog(LOG_ERR, "CURL initialization failed");
		return NULL;
	}

	while (ok) {
		struct workio_cmd *wc;

		/* wait for workio_cmd sent to us, on our queue */
		wc = tq_pop(mythr->q, NULL);
		if (!wc) {
			ok = false;
			break;
		}

		/* process workio_cmd */
		switch (wc->cmd) {
		case WC_GET_WORK:
			ok = workio_get_work(wc, curl);
			break;
		case WC_SUBMIT_WORK:
			ok = workio_submit_work(wc, curl);
			break;

		default:		/* should never happen */
			ok = false;
			break;
		}

		workio_cmd_free(wc);
	}

	tq_freeze(mythr->q);
	curl_easy_cleanup(curl);

	return NULL;
}

static bool get_work(struct thr_info *thr, struct work *work)
{
	struct workio_cmd *wc;
	struct work *work_heap;

	if (opt_benchmark) {
		memset(work->data, 0x55, 76);
		work->data[17] = swab32(time(NULL));
		memset(work->data + 19, 0x00, 52);
		work->data[20] = 0x80000000;
		work->data[31] = 0x00000280;
		memset(work->target, 0x00, sizeof(work->target));
		return true;
	}

	/* fill out work request message */
	wc = calloc(1, sizeof(*wc));
	if (!wc)
		return false;

	wc->cmd = WC_GET_WORK;
	wc->thr = thr;

	/* send work request to workio thread */
	if (!tq_push(thr_info[work_thr_id].q, wc)) {
		workio_cmd_free(wc);
		return false;
	}

	/* wait for response, a unit of work */
	work_heap = tq_pop(thr->q, NULL);
	if (!work_heap)
		return false;

	/* copy returned work into storage provided by caller */
	memcpy(work, work_heap, sizeof(*work));
	free(work_heap);

	return true;
}

static bool submit_work(struct thr_info *thr, const struct work *work_in)
{
	struct workio_cmd *wc;
	
	/* fill out work request message */
	wc = calloc(1, sizeof(*wc));
	if (!wc)
		return false;

	wc->u.work = malloc(sizeof(*work_in));
	if (!wc->u.work)
		goto err_out;

	wc->cmd = WC_SUBMIT_WORK;
	wc->thr = thr;
	work_copy(wc->u.work, work_in);

	/* send solution to workio thread */
	if (!tq_push(thr_info[work_thr_id].q, wc))
		goto err_out;

	return true;

err_out:
	workio_cmd_free(wc);
	return false;
}

static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
{
	unsigned char merkle_root[64];
	int i;

	pthread_mutex_lock(&sctx->work_lock);

	free(work->job_id);
	work->job_id = strdup(sctx->job.job_id);
	work->xnonce2_len = sctx->xnonce2_size;
	work->xnonce2 = realloc(work->xnonce2, sctx->xnonce2_size);
	memcpy(work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size);

	/* Generate merkle root */
	sha256d(merkle_root, sctx->job.coinbase, sctx->job.coinbase_size);
	for (i = 0; i < sctx->job.merkle_count; i++) {
		memcpy(merkle_root + 32, sctx->job.merkle[i], 32);
		sha256d(merkle_root, merkle_root, 64);
	}
	
	/* Increment extranonce2 */
	for (i = 0; i < sctx->xnonce2_size && !++sctx->job.xnonce2[i]; i++);

	/* Assemble block header */
	memset(work->data, 0, 128);
	work->data[0] = le32dec(sctx->job.version);
	for (i = 0; i < 8; i++)
		work->data[1 + i] = le32dec((uint32_t *)sctx->job.prevhash + i);
	for (i = 0; i < 8; i++)
		work->data[9 + i] = be32dec((uint32_t *)merkle_root + i);
	work->data[17] = le32dec(sctx->job.ntime);
	work->data[18] = le32dec(sctx->job.nbits);
	work->data[20] = 0x80000000;
	work->data[31] = 0x00000280;

	pthread_mutex_unlock(&sctx->work_lock);

	if (opt_debug) {
		char *xnonce2str = abin2hex(work->xnonce2, work->xnonce2_len);
		applog(LOG_DEBUG, "DEBUG: job_id='%s' extranonce2=%s ntime=%08x",
		       work->job_id, xnonce2str, swab32(work->data[17]));
		free(xnonce2str);
	}

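	/* For scrypt, pool difficulty is conventionally defined such that
	 * diff 1 corresponds to a target 65536 times easier than sha256d's
	 * diff 1, hence the division before converting to a target. */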
	if (opt_algo == ALGO_SCRYPT)
		diff_to_target(work->target, sctx->job.diff / 65536.0);
	else
		diff_to_target(work->target, sctx->job.diff);
}

static void *miner_thread(void *userdata)
{
	struct thr_info *mythr = userdata;
	int thr_id = mythr->id;
	struct work work = {{0}};
	uint32_t max_nonce;
	uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 0x20;
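	/* each thread scans a disjoint slice of the 32-bit nonce space:
	 * [0xffffffffU / N * thr_id, 0xffffffffU / N * (thr_id + 1) - 0x20) */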
	unsigned char *scratchbuf = NULL;
	char s[16];
	int i;

	/* Set worker threads to nice 19, then preferentially to SCHED_IDLE,
	 * falling back to SCHED_BATCH if that is unavailable. Failure here
	 * is harmless, so it is not treated as an error. */
	if (!opt_benchmark) {
		setpriority(PRIO_PROCESS, 0, 19);
		drop_policy();
	}

	/* CPU affinity only makes sense if the number of threads is a multiple
	 * of the number of CPUs */
	if (num_processors > 1 && opt_n_threads % num_processors == 0) {
		if (!opt_quiet)
			applog(LOG_INFO, "Binding thread %d to cpu %d",
			       thr_id, thr_id % num_processors);
		affine_to_cpu(thr_id, thr_id % num_processors);
	}
	
	if (opt_algo == ALGO_SCRYPT) {
		scratchbuf = scrypt_buffer_alloc(opt_scrypt_n);
		if (!scratchbuf) {
			applog(LOG_ERR, "scrypt buffer allocation failed");
			pthread_mutex_lock(&applog_lock);
			exit(1);
		}
	}

	while (1) {
		unsigned long hashes_done;
		struct timeval tv_start, tv_end, diff;
		int64_t max64;
		int rc;

		if (have_stratum) {
			while (time(NULL) >= g_work_time + 120)
				sleep(1);
			pthread_mutex_lock(&g_work_lock);
			if (work.data[19] >= end_nonce && !memcmp(work.data, g_work.data, 76))
				stratum_gen_work(&stratum, &g_work);
		} else {
			int min_scantime = have_longpoll ? LP_SCANTIME : opt_scantime;
			/* obtain new work from internal workio thread */
			pthread_mutex_lock(&g_work_lock);
			if (!have_stratum &&
			    (time(NULL) - g_work_time >= min_scantime ||
			     work.data[19] >= end_nonce)) {
				work_free(&g_work);
				if (unlikely(!get_work(mythr, &g_work))) {
					applog(LOG_ERR, "work retrieval failed, exiting "
						"mining thread %d", mythr->id);
					pthread_mutex_unlock(&g_work_lock);
					goto out;
				}
				g_work_time = have_stratum ? 0 : time(NULL);
			}
			if (have_stratum) {
				pthread_mutex_unlock(&g_work_lock);
				continue;
			}
		}
		if (memcmp(work.data, g_work.data, 76)) {
			work_free(&work);
			work_copy(&work, &g_work);
			work.data[19] = 0xffffffffU / opt_n_threads * thr_id;
		} else
			work.data[19]++;
		pthread_mutex_unlock(&g_work_lock);
		work_restart[thr_id].restart = 0;
		
		/* adjust max_nonce to meet target scan time */
		if (have_stratum)
			max64 = LP_SCANTIME;
		else
			max64 = g_work_time + (have_longpoll ? LP_SCANTIME : opt_scantime)
			      - time(NULL);
		max64 *= thr_hashrates[thr_id];
		if (max64 <= 0) {
			switch (opt_algo) {
			case ALGO_SCRYPT:
				max64 = opt_scrypt_n < 16 ? 0x3ffff : 0x3fffff / opt_scrypt_n;
				break;
			case ALGO_SHA256D:
				max64 = 0x1fffff;
				break;
			}
		}
		if (work.data[19] + max64 > end_nonce)
			max_nonce = end_nonce;
		else
			max_nonce = work.data[19] + max64;
		
		hashes_done = 0;
		gettimeofday(&tv_start, NULL);

		/* scan nonces for a proof-of-work hash */
		switch (opt_algo) {
		case ALGO_SCRYPT:
			rc = scanhash_scrypt(thr_id, work.data, scratchbuf, work.target,
			                     max_nonce, &hashes_done, opt_scrypt_n);
			break;

		case ALGO_SHA256D:
			rc = scanhash_sha256d(thr_id, work.data, work.target,
			                      max_nonce, &hashes_done);
			break;

		default:
			/* should never happen */
			goto out;
		}

		/* record scanhash elapsed time */
		gettimeofday(&tv_end, NULL);
		timeval_subtract(&diff, &tv_end, &tv_start);
		if (diff.tv_usec || diff.tv_sec) {
			pthread_mutex_lock(&stats_lock);
			thr_hashrates[thr_id] =
				hashes_done / (diff.tv_sec + 1e-6 * diff.tv_usec);
			pthread_mutex_unlock(&stats_lock);
		}
		if (!opt_quiet) {
			sprintf(s, thr_hashrates[thr_id] >= 1e6 ? "%.0f" : "%.2f",
				1e-3 * thr_hashrates[thr_id]);
			applog(LOG_INFO, "thread %d: %lu hashes, %s khash/s",
				thr_id, hashes_done, s);
		}
		if (opt_benchmark && thr_id == opt_n_threads - 1) {
			double hashrate = 0.;
			for (i = 0; i < opt_n_threads && thr_hashrates[i]; i++)
				hashrate += thr_hashrates[i];
			if (i == opt_n_threads) {
				sprintf(s, hashrate >= 1e6 ? "%.0f" : "%.2f", 1e-3 * hashrate);
				applog(LOG_INFO, "Total: %s khash/s", s);
			}
		}

		/* if nonce found, submit work */
		if (rc && !opt_benchmark && !submit_work(mythr, &work))
			break;
	}

out:
	tq_freeze(mythr->q);

	return NULL;
}

static void restart_threads(void)
{
	int i;

	for (i = 0; i < opt_n_threads; i++)
		work_restart[i].restart = 1;
}

static void *longpoll_thread(void *userdata)
{
	struct thr_info *mythr = userdata;
	CURL *curl = NULL;
	char *copy_start, *hdr_path = NULL, *lp_url = NULL;
	bool need_slash = false;

	curl = curl_easy_init();
	if (unlikely(!curl)) {
		applog(LOG_ERR, "CURL initialization failed");
		goto out;
	}

start:
	hdr_path = tq_pop(mythr->q, NULL);
	if (!hdr_path)
		goto out;

	/* full URL */
	if (strstr(hdr_path, "://")) {
		lp_url = hdr_path;
		hdr_path = NULL;
	}
	
	/* absolute path, on current server */
	else {
		copy_start = (*hdr_path == '/') ? (hdr_path + 1) : hdr_path;
		if (rpc_url[strlen(rpc_url) - 1] != '/')
			need_slash = true;

		lp_url = malloc(strlen(rpc_url) + strlen(copy_start) + 2);
		if (!lp_url)
			goto out;

		sprintf(lp_url, "%s%s%s", rpc_url, need_slash ? "/" : "", copy_start);
	}

	applog(LOG_INFO, "Long-polling activated for %s", lp_url);

	while (1) {
		json_t *val, *res, *soval;
		char *req = NULL;
		int err;

		if (have_gbt) {
			req = malloc(strlen(gbt_lp_req) + strlen(lp_id) + 1);
			sprintf(req, gbt_lp_req, lp_id);
		}
		val = json_rpc_call(curl, lp_url, rpc_userpass,
				    req ? req : getwork_req, &err,
				    JSON_RPC_LONGPOLL);
		free(req);
		if (have_stratum) {
			if (val)
				json_decref(val);
			goto out;
		}
		if (likely(val)) {
			bool rc;
			applog(LOG_INFO, "LONGPOLL pushed new work");
			res = json_object_get(val, "result");
			soval = json_object_get(res, "submitold");
			submit_old = soval ? json_is_true(soval) : false;
			pthread_mutex_lock(&g_work_lock);
			work_free(&g_work);
			if (have_gbt)
				rc = gbt_work_decode(res, &g_work);
			else
				rc = work_decode(res, &g_work);
			if (rc) {
				time(&g_work_time);
				restart_threads();
			}
			pthread_mutex_unlock(&g_work_lock);
			json_decref(val);
		} else {
			pthread_mutex_lock(&g_work_lock);
			g_work_time -= LP_SCANTIME;
			pthread_mutex_unlock(&g_work_lock);
			if (err == CURLE_OPERATION_TIMEDOUT) {
				restart_threads();
			} else {
				have_longpoll = false;
				restart_threads();
				free(hdr_path);
				free(lp_url);
				lp_url = NULL;
				sleep(opt_fail_pause);
				goto start;
			}
		}
	}

out:
	free(hdr_path);
	free(lp_url);
	tq_freeze(mythr->q);
	if (curl)
		curl_easy_cleanup(curl);

	return NULL;
}

static bool stratum_handle_response(char *buf)
{
	json_t *val, *err_val, *res_val, *id_val;
	json_error_t err;
	bool ret = false;

	val = JSON_LOADS(buf, &err);
	if (!val) {
		applog(LOG_INFO, "JSON decode failed(%d): %s", err.line, err.text);
		goto out;
	}

	res_val = json_object_get(val, "result");
	err_val = json_object_get(val, "error");
	id_val = json_object_get(val, "id");

	if (!id_val || json_is_null(id_val) || !res_val)
		goto out;

	share_result(json_is_true(res_val),
		err_val ? json_string_value(json_array_get(err_val, 1)) : NULL);

	ret = true;
out:
	if (val)
		json_decref(val);

	return ret;
}

static void *stratum_thread(void *userdata)
{
	struct thr_info *mythr = userdata;
	char *s;

	stratum.url = tq_pop(mythr->q, NULL);
	if (!stratum.url)
		goto out;
	applog(LOG_INFO, "Starting Stratum on %s", stratum.url);

	while (1) {
		int failures = 0;

		while (!stratum.curl) {
			pthread_mutex_lock(&g_work_lock);
			g_work_time = 0;
			pthread_mutex_unlock(&g_work_lock);
			restart_threads();

			if (!stratum_connect(&stratum, stratum.url) ||
			    !stratum_subscribe(&stratum) ||
			    !stratum_authorize(&stratum, rpc_user, rpc_pass)) {
				stratum_disconnect(&stratum);
				if (opt_retries >= 0 && ++failures > opt_retries) {
					applog(LOG_ERR, "...terminating workio thread");
					tq_push(thr_info[work_thr_id].q, NULL);
					goto out;
				}
				applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause);
				sleep(opt_fail_pause);
			}
		}

		if (stratum.job.job_id &&
		    (!g_work_time || strcmp(stratum.job.job_id, g_work.job_id))) {
			pthread_mutex_lock(&g_work_lock);
			stratum_gen_work(&stratum, &g_work);
			time(&g_work_time);
			pthread_mutex_unlock(&g_work_lock);
			if (stratum.job.clean) {
				applog(LOG_INFO, "Stratum requested work restart");
				restart_threads();
			}
		}
		
		if (!stratum_socket_full(&stratum, 120)) {
			applog(LOG_ERR, "Stratum connection timed out");
			s = NULL;
		} else
			s = stratum_recv_line(&stratum);
		if (!s) {
			stratum_disconnect(&stratum);
			applog(LOG_ERR, "Stratum connection interrupted");
			continue;
		}
		if (!stratum_handle_method(&stratum, s))
			stratum_handle_response(s);
		free(s);
	}

out:
	return NULL;
}

static void show_version_and_exit(void)
{
	printf(PACKAGE_STRING "\n built on " __DATE__ "\n features:"
#if defined(USE_ASM) && defined(__i386__)
		" i386"
#endif
#if defined(USE_ASM) && defined(__x86_64__)
		" x86_64"
#endif
#if defined(USE_ASM) && (defined(__i386__) || defined(__x86_64__))
		" SSE2"
#endif
#if defined(__x86_64__) && defined(USE_AVX)
		" AVX"
#endif
#if defined(__x86_64__) && defined(USE_AVX2)
		" AVX2"
#endif
#if defined(__x86_64__) && defined(USE_XOP)
		" XOP"
#endif
#if defined(USE_ASM) && defined(__arm__) && defined(__APCS_32__)
		" ARM"
#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \
	defined(__ARM_ARCH_5TEJ__) || defined(__ARM_ARCH_6__) || \
	defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || \
	defined(__ARM_ARCH_6M__) || defined(__ARM_ARCH_6T2__) || \
	defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || \
	defined(__ARM_ARCH_7__) || \
	defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || \
	defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7EM__)
		" ARMv5E"
#endif
#if defined(__ARM_NEON__)
		" NEON"
#endif
#endif
#if defined(USE_ASM) && (defined(__powerpc__) || defined(__ppc__) || defined(__PPC__))
		" PowerPC"
#if defined(__ALTIVEC__)
		" AltiVec"
#endif
#endif
		"\n");

	printf("%s\n", curl_version());
#ifdef JANSSON_VERSION
	printf("libjansson %s\n", JANSSON_VERSION);
#endif
	exit(0);
}

static void show_usage_and_exit(int status)
{
	if (status)
		fprintf(stderr, "Try `" PROGRAM_NAME " --help' for more information.\n");
	else
		printf(usage);
	exit(status);
}

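/* Scrub a credential in place so it does not linger in argv (and thus in
 * ps output or /proc/<pid>/cmdline): the first character becomes 'x' and
 * the rest are zeroed, e.g. "hunter2" -> "x". */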
static void strhide(char *s)
{
	if (*s) *s++ = 'x';
	while (*s) *s++ = '\0';
}

static void parse_config(json_t *config, char *pname, char *ref);

static void parse_arg(int key, char *arg, char *pname)
{
	char *p;
	int v, i;

	switch(key) {
	case 'a':
		for (i = 0; i < ARRAY_SIZE(algo_names); i++) {
			v = strlen(algo_names[i]);
			if (!strncmp(arg, algo_names[i], v)) {
				if (arg[v] == '\0') {
					opt_algo = i;
					break;
				}
				if (arg[v] == ':' && i == ALGO_SCRYPT) {
					char *ep;
					v = strtol(arg+v+1, &ep, 10);
					if (*ep || v & (v-1) || v < 2)
						continue;
					opt_algo = i;
					opt_scrypt_n = v;
					break;
				}
			}
		}
		if (i == ARRAY_SIZE(algo_names)) {
			fprintf(stderr, "%s: unknown algorithm -- '%s'\n",
				pname, arg);
			show_usage_and_exit(1);
		}
		break;
	case 'B':
		opt_background = true;
		break;
	case 'c': {
		json_error_t err;
		json_t *config = JSON_LOAD_FILE(arg, &err);
		if (!json_is_object(config)) {
			if (err.line < 0)
				fprintf(stderr, "%s: %s\n", pname, err.text);
			else
				fprintf(stderr, "%s: %s:%d: %s\n",
					pname, arg, err.line, err.text);
			exit(1);
		}
		parse_config(config, pname, arg);
		json_decref(config);
		break;
	}
	case 'q':
		opt_quiet = true;
		break;
	case 'D':
		opt_debug = true;
		break;
	case 'p':
		free(rpc_pass);
		rpc_pass = strdup(arg);
		strhide(arg);
		break;
	case 'P':
		opt_protocol = true;
		break;
	case 'r':
		v = atoi(arg);
		if (v < -1 || v > 9999)	/* sanity check */
			show_usage_and_exit(1);
		opt_retries = v;
		break;
	case 'R':
		v = atoi(arg);
		if (v < 1 || v > 9999)	/* sanity check */
			show_usage_and_exit(1);
		opt_fail_pause = v;
		break;
	case 's':
		v = atoi(arg);
		if (v < 1 || v > 9999)	/* sanity check */
			show_usage_and_exit(1);
		opt_scantime = v;
		break;
	case 'T':
		v = atoi(arg);
		if (v < 1 || v > 99999)	/* sanity check */
			show_usage_and_exit(1);
		opt_timeout = v;
		break;
	case 't':
		v = atoi(arg);
		if (v < 1 || v > 9999)	/* sanity check */
			show_usage_and_exit(1);
		opt_n_threads = v;
		break;
	case 'u':
		free(rpc_user);
		rpc_user = strdup(arg);
		break;
	case 'o': {			/* --url */
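		/* The URL may embed credentials ([scheme://][user[:pass]@]host).
		 * They are copied into rpc_user/rpc_pass, the password is then
		 * scrubbed from the argument itself (cf. strhide), and rpc_url
		 * is rebuilt without the user:pass@ part. */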
		char *ap, *hp;
		ap = strstr(arg, "://");
		ap = ap ? ap + 3 : arg;
		hp = strrchr(arg, '@');
		if (hp) {
			*hp = '\0';
			p = strchr(ap, ':');
			if (p) {
				free(rpc_userpass);
				rpc_userpass = strdup(ap);
				free(rpc_user);
				rpc_user = calloc(p - ap + 1, 1);
				strncpy(rpc_user, ap, p - ap);
				free(rpc_pass);
				rpc_pass = strdup(++p);
				if (*p) *p++ = 'x';
				v = strlen(hp + 1) + 1;
				memmove(p + 1, hp + 1, v);
				memset(p + v, 0, hp - p);
				hp = p;
			} else {
				free(rpc_user);
				rpc_user = strdup(ap);
			}
			*hp++ = '@';
		} else
			hp = ap;
		if (ap != arg) {
			if (strncasecmp(arg, "http://", 7) &&
			    strncasecmp(arg, "https://", 8) &&
			    strncasecmp(arg, "stratum+tcp://", 14) &&
			    strncasecmp(arg, "stratum+tcps://", 15)) {
				fprintf(stderr, "%s: unknown protocol -- '%s'\n",
					pname, arg);
				show_usage_and_exit(1);
			}
			free(rpc_url);
			rpc_url = strdup(arg);
			strcpy(rpc_url + (ap - arg), hp);
		} else {
			if (*hp == '\0' || *hp == '/') {
				fprintf(stderr, "%s: invalid URL -- '%s'\n",
					pname, arg);
				show_usage_and_exit(1);
			}
			free(rpc_url);
			rpc_url = malloc(strlen(hp) + 8);
			sprintf(rpc_url, "http://%s", hp);
		}
		have_stratum = !opt_benchmark && !strncasecmp(rpc_url, "stratum", 7);
		break;
	}
	case 'O':			/* --userpass */
		p = strchr(arg, ':');
		if (!p) {
			fprintf(stderr, "%s: invalid username:password pair -- '%s'\n",
				pname, arg);
			show_usage_and_exit(1);
		}
		free(rpc_userpass);
		rpc_userpass = strdup(arg);
		free(rpc_user);
		rpc_user = calloc(p - arg + 1, 1);
		strncpy(rpc_user, arg, p - arg);
		free(rpc_pass);
		rpc_pass = strdup(++p);
		strhide(p);
		break;
	case 'x':			/* --proxy */
		if (!strncasecmp(arg, "socks4://", 9))
			opt_proxy_type = CURLPROXY_SOCKS4;
		else if (!strncasecmp(arg, "socks5://", 9))
			opt_proxy_type = CURLPROXY_SOCKS5;
#if LIBCURL_VERSION_NUM >= 0x071200
		else if (!strncasecmp(arg, "socks4a://", 10))
			opt_proxy_type = CURLPROXY_SOCKS4A;
		else if (!strncasecmp(arg, "socks5h://", 10))
			opt_proxy_type = CURLPROXY_SOCKS5_HOSTNAME;
#endif
		else
			opt_proxy_type = CURLPROXY_HTTP;
		free(opt_proxy);
		opt_proxy = strdup(arg);
		break;
	case 1001:
		free(opt_cert);
		opt_cert = strdup(arg);
		break;
	case 1005:
		opt_benchmark = true;
		want_longpoll = false;
		want_stratum = false;
		have_stratum = false;
		break;
	case 1003:
		want_longpoll = false;
		break;
	case 1007:
		want_stratum = false;
		break;
	case 1009:
		opt_redirect = false;
		break;
	case 1010:
		allow_getwork = false;
		break;
	case 1011:
		have_gbt = false;
		break;
	case 1013:			/* --coinbase-addr */
		pk_script_size = address_to_script(pk_script, sizeof(pk_script), arg);
		if (!pk_script_size) {
			fprintf(stderr, "%s: invalid address -- '%s'\n",
				pname, arg);
			show_usage_and_exit(1);
		}
		break;
	case 1015:			/* --coinbase-sig */
		if (strlen(arg) + 1 > sizeof(coinbase_sig)) {
			fprintf(stderr, "%s: coinbase signature too long\n", pname);
			show_usage_and_exit(1);
		}
		strcpy(coinbase_sig, arg);
		break;
	case 'S':
		use_syslog = true;
		break;
	case 'V':
		show_version_and_exit();
	case 'h':
		show_usage_and_exit(0);
	default:
		show_usage_and_exit(1);
	}
}

static void parse_config(json_t *config, char *pname, char *ref)
{
	int i;
	char *s;
	json_t *val;

	for (i = 0; i < ARRAY_SIZE(options); i++) {
		if (!options[i].name)
			break;

		val = json_object_get(config, options[i].name);
		if (!val)
			continue;

		if (options[i].has_arg && json_is_string(val)) {
			if (!strcmp(options[i].name, "config")) {
				fprintf(stderr, "%s: %s: option '%s' not allowed here\n",
					pname, ref, options[i].name);
				exit(1);
			}
			s = strdup(json_string_value(val));
			if (!s)
				break;
			parse_arg(options[i].val, s, pname);
			free(s);
		} else if (!options[i].has_arg && json_is_true(val)) {
			parse_arg(options[i].val, "", pname);
		} else {
			fprintf(stderr, "%s: invalid argument for option '%s'\n",
				pname, options[i].name);
			exit(1);
		}
	}
}

static void parse_cmdline(int argc, char *argv[])
{
	int key;

	while (1) {
#if HAVE_GETOPT_LONG
		key = getopt_long(argc, argv, short_options, options, NULL);
#else
		key = getopt(argc, argv, short_options);
#endif
		if (key < 0)
			break;

		parse_arg(key, optarg, argv[0]);
	}
	if (optind < argc) {
		fprintf(stderr, "%s: unsupported non-option argument -- '%s'\n",
			argv[0], argv[optind]);
		show_usage_and_exit(1);
	}
}

#ifndef WIN32
static void signal_handler(int sig)
{
	switch (sig) {
	case SIGHUP:
		applog(LOG_INFO, "SIGHUP received");
		break;
	case SIGINT:
		applog(LOG_INFO, "SIGINT received, exiting");
		exit(0);
		break;
	case SIGTERM:
		applog(LOG_INFO, "SIGTERM received, exiting");
		exit(0);
		break;
	}
}
#endif

int main(int argc, char *argv[])
{
	struct thr_info *thr;
	long flags;
	int i;

	rpc_user = strdup("");
	rpc_pass = strdup("");

	/* parse command line */
	parse_cmdline(argc, argv);

	if (!opt_benchmark && !rpc_url) {
		fprintf(stderr, "%s: no URL supplied\n", argv[0]);
		show_usage_and_exit(1);
	}

	if (!rpc_userpass) {
		rpc_userpass = malloc(strlen(rpc_user) + strlen(rpc_pass) + 2);
		if (!rpc_userpass)
			return 1;
		sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass);
	}

	pthread_mutex_init(&applog_lock, NULL);
	pthread_mutex_init(&stats_lock, NULL);
	pthread_mutex_init(&g_work_lock, NULL);
	pthread_mutex_init(&stratum.sock_lock, NULL);
	pthread_mutex_init(&stratum.work_lock, NULL);

	flags = opt_benchmark || (strncasecmp(rpc_url, "https://", 8) &&
	                          strncasecmp(rpc_url, "stratum+tcps://", 15))
	      ? (CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL)
	      : CURL_GLOBAL_ALL;
	if (curl_global_init(flags)) {
		applog(LOG_ERR, "CURL initialization failed");
		return 1;
	}

#ifndef WIN32
	if (opt_background) {
		i = fork();
		if (i < 0) exit(1);
		if (i > 0) exit(0);
		i = setsid();
		if (i < 0)
			applog(LOG_ERR, "setsid() failed (errno = %d)", errno);
		i = chdir("/");
		if (i < 0)
			applog(LOG_ERR, "chdir() failed (errno = %d)", errno);
		signal(SIGHUP, signal_handler);
		signal(SIGINT, signal_handler);
		signal(SIGTERM, signal_handler);
	}
#endif

#if defined(WIN32)
	SYSTEM_INFO sysinfo;
	GetSystemInfo(&sysinfo);
	num_processors = sysinfo.dwNumberOfProcessors;
#elif defined(_SC_NPROCESSORS_CONF)
	num_processors = sysconf(_SC_NPROCESSORS_CONF);
#elif defined(CTL_HW) && defined(HW_NCPU)
	int req[] = { CTL_HW, HW_NCPU };
	size_t len = sizeof(num_processors);
	sysctl(req, 2, &num_processors, &len, NULL, 0);
#else
	num_processors = 1;
#endif
	if (num_processors < 1)
		num_processors = 1;
	if (!opt_n_threads)
		opt_n_threads = num_processors;

#ifdef HAVE_SYSLOG_H
	if (use_syslog)
		openlog("cpuminer", LOG_PID, LOG_USER);
#endif

	work_restart = calloc(opt_n_threads, sizeof(*work_restart));
	if (!work_restart)
		return 1;

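	/* one entry per miner thread, plus the workio, longpoll
	   and stratum threads */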
	thr_info = calloc(opt_n_threads + 3, sizeof(*thr));
	if (!thr_info)
		return 1;
	
	thr_hashrates = (double *) calloc(opt_n_threads, sizeof(double));
	if (!thr_hashrates)
		return 1;

	/* init workio thread info */
	work_thr_id = opt_n_threads;
	thr = &thr_info[work_thr_id];
	thr->id = work_thr_id;
	thr->q = tq_new();
	if (!thr->q)
		return 1;

	/* start work I/O thread */
	if (pthread_create(&thr->pth, NULL, workio_thread, thr)) {
		applog(LOG_ERR, "workio thread create failed");
		return 1;
	}

	if (want_longpoll && !have_stratum) {
		/* init longpoll thread info */
		longpoll_thr_id = opt_n_threads + 1;
		thr = &thr_info[longpoll_thr_id];
		thr->id = longpoll_thr_id;
		thr->q = tq_new();
		if (!thr->q)
			return 1;

		/* start longpoll thread */
		if (unlikely(pthread_create(&thr->pth, NULL, longpoll_thread, thr))) {
			applog(LOG_ERR, "longpoll thread create failed");
			return 1;
		}
	}
	if (want_stratum) {
		/* init stratum thread info */
		stratum_thr_id = opt_n_threads + 2;
		thr = &thr_info[stratum_thr_id];
		thr->id = stratum_thr_id;
		thr->q = tq_new();
		if (!thr->q)
			return 1;

		/* start stratum thread */
		if (unlikely(pthread_create(&thr->pth, NULL, stratum_thread, thr))) {
			applog(LOG_ERR, "stratum thread create failed");
			return 1;
		}

		if (have_stratum)
			tq_push(thr_info[stratum_thr_id].q, strdup(rpc_url));
	}

	/* start mining threads */
	for (i = 0; i < opt_n_threads; i++) {
		thr = &thr_info[i];

		thr->id = i;
		thr->q = tq_new();
		if (!thr->q)
			return 1;

		if (unlikely(pthread_create(&thr->pth, NULL, miner_thread, thr))) {
			applog(LOG_ERR, "thread %d create failed", i);
			return 1;
		}
	}

	applog(LOG_INFO, "%d miner threads started, "
		"using '%s' algorithm.",
		opt_n_threads,
		algo_names[opt_algo]);

	/* main loop - simply wait for workio thread to exit */
	pthread_join(thr_info[work_thr_id].pth, NULL);

	applog(LOG_INFO, "workio thread dead, exiting.");

	return 0;
}
0707010000001E000081A4000003E800000064000000015EF4BCA100001BD9000000000000000000000000000000000000001700000000cpuminer-2.5.1/elist.h#ifndef _LINUX_LIST_H
#define _LINUX_LIST_H

/*
 * Simple doubly linked list implementation.
 *
 * Some of the internal functions ("__xxx") are useful when
 * manipulating whole lists rather than single entries, as
 * sometimes we already know the next/prev entries and we can
 * generate better code by using them directly rather than
 * using the generic single-entry routines.
 */

struct list_head {
	struct list_head *next, *prev;
};

#define LIST_HEAD_INIT(name) { &(name), &(name) }

#define LIST_HEAD(name) \
	struct list_head name = LIST_HEAD_INIT(name)

#define INIT_LIST_HEAD(ptr) do { \
	(ptr)->next = (ptr); (ptr)->prev = (ptr); \
} while (0)

/*
 * Insert a new entry between two known consecutive entries.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 */
static inline void __list_add(struct list_head *new,
			      struct list_head *prev,
			      struct list_head *next)
{
	next->prev = new;
	new->next = next;
	new->prev = prev;
	prev->next = new;
}

/**
 * list_add - add a new entry
 * @new: new entry to be added
 * @head: list head to add it after
 *
 * Insert a new entry after the specified head.
 * This is good for implementing stacks.
 */
static inline void list_add(struct list_head *new, struct list_head *head)
{
	__list_add(new, head, head->next);
}

/**
 * list_add_tail - add a new entry
 * @new: new entry to be added
 * @head: list head to add it before
 *
 * Insert a new entry before the specified head.
 * This is useful for implementing queues.
 */
static inline void list_add_tail(struct list_head *new, struct list_head *head)
{
	__list_add(new, head->prev, head);
}

/*
 * Delete a list entry by making the prev/next entries
 * point to each other.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 */
static inline void __list_del(struct list_head *prev, struct list_head *next)
{
	next->prev = prev;
	prev->next = next;
}

/**
 * list_del - deletes entry from list.
 * @entry: the element to delete from the list.
 * Note: list_empty on entry does not return true after this, the entry is in an undefined state.
 */
static inline void list_del(struct list_head *entry)
{
	__list_del(entry->prev, entry->next);
	entry->next = (void *) 0;
	entry->prev = (void *) 0;
}

/**
 * list_del_init - deletes entry from list and reinitialize it.
 * @entry: the element to delete from the list.
 */
static inline void list_del_init(struct list_head *entry)
{
	__list_del(entry->prev, entry->next);
	INIT_LIST_HEAD(entry);
}

/**
 * list_move - delete from one list and add as another's head
 * @list: the entry to move
 * @head: the head that will precede our entry
 */
static inline void list_move(struct list_head *list, struct list_head *head)
{
        __list_del(list->prev, list->next);
        list_add(list, head);
}

/**
 * list_move_tail - delete from one list and add as another's tail
 * @list: the entry to move
 * @head: the head that will follow our entry
 */
static inline void list_move_tail(struct list_head *list,
				  struct list_head *head)
{
        __list_del(list->prev, list->next);
        list_add_tail(list, head);
}

/**
 * list_empty - tests whether a list is empty
 * @head: the list to test.
 */
static inline int list_empty(struct list_head *head)
{
	return head->next == head;
}

static inline void __list_splice(struct list_head *list,
				 struct list_head *head)
{
	struct list_head *first = list->next;
	struct list_head *last = list->prev;
	struct list_head *at = head->next;

	first->prev = head;
	head->next = first;

	last->next = at;
	at->prev = last;
}

/**
 * list_splice - join two lists
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 */
static inline void list_splice(struct list_head *list, struct list_head *head)
{
	if (!list_empty(list))
		__list_splice(list, head);
}

/**
 * list_splice_init - join two lists and reinitialise the emptied list.
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * The list at @list is reinitialised
 */
static inline void list_splice_init(struct list_head *list,
				    struct list_head *head)
{
	if (!list_empty(list)) {
		__list_splice(list, head);
		INIT_LIST_HEAD(list);
	}
}

/**
 * list_entry - get the struct for this entry
 * @ptr:	the &struct list_head pointer.
 * @type:	the type of the struct this is embedded in.
 * @member:	the name of the list_struct within the struct.
 */
#define list_entry(ptr, type, member) \
	((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
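/* The (type *)0 cast computes the byte offset of @member within @type
 * (the classic offsetof trick); subtracting it from @ptr yields a pointer
 * to the enclosing structure. */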

/**
 * list_for_each	-	iterate over a list
 * @pos:	the &struct list_head to use as a loop counter.
 * @head:	the head for your list.
 */
#define list_for_each(pos, head) \
	for (pos = (head)->next; pos != (head); \
        	pos = pos->next)

/**
 * list_for_each_prev	-	iterate over a list backwards
 * @pos:	the &struct list_head to use as a loop counter.
 * @head:	the head for your list.
 */
#define list_for_each_prev(pos, head) \
	for (pos = (head)->prev; pos != (head); \
        	pos = pos->prev)

/**
 * list_for_each_safe	-	iterate over a list safe against removal of list entry
 * @pos:	the &struct list_head to use as a loop counter.
 * @n:		another &struct list_head to use as temporary storage
 * @head:	the head for your list.
 */
#define list_for_each_safe(pos, n, head) \
	for (pos = (head)->next, n = pos->next; pos != (head); \
		pos = n, n = pos->next)

/**
 * list_for_each_entry	-	iterate over list of given type
 * @pos:	the type * to use as a loop counter.
 * @head:	the head for your list.
 * @member:	the name of the list_struct within the struct.
 * @type:	the type of the struct.
 */
#define list_for_each_entry(pos, head, member, type)			\
	for (pos = list_entry((head)->next, type, member);	\
	     &pos->member != (head); 					\
	     pos = list_entry(pos->member.next, type, member))
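
/*
 * Illustrative example (hypothetical struct, not part of this header):
 *
 *	struct item {
 *		int value;
 *		struct list_head node;
 *	};
 *
 *	LIST_HEAD(items);
 *	struct item *it;
 *	list_for_each_entry(it, &items, node, struct item)
 *		use(it->value);
 */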

/**
 * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
 * @pos:	the type * to use as a loop counter.
 * @n:		another type * to use as temporary storage
 * @head:	the head for your list.
 * @member:	the name of the list_struct within the struct.
 * @type:	the type of the struct.
 */
#define list_for_each_entry_safe(pos, n, head, member, type)		\
	for (pos = list_entry((head)->next, type, member),	\
		n = list_entry(pos->member.next, type, member);	\
	     &pos->member != (head); 					\
	     pos = n, n = list_entry(n->member.next, type, member))

/**
 * list_for_each_entry_continue -       iterate over list of given type
 *                      continuing after existing point
 * @pos:        the type * to use as a loop counter.
 * @head:       the head for your list.
 * @member:     the name of the list_struct within the struct.
 * @type:       the type of the struct.
 */
#ifndef prefetch
#define prefetch(x) ((void) 0)	/* fallback: prefetch is not defined in this header */
#endif
#define list_for_each_entry_continue(pos, head, member, type)		\
	for (pos = list_entry(pos->member.next, type, member),	\
		     prefetch(pos->member.next);			\
	     &pos->member != (head);					\
	     pos = list_entry(pos->member.next, type, member),	\
		     prefetch(pos->member.next))

#endif
0707010000001F000081A4000003E800000064000000015EF4BCA100000101000000000000000000000000000000000000002000000000cpuminer-2.5.1/example-cfg.json{
	"_comment1" : "Any long-format command line argument ",
	"_comment2" : "may be used in this JSON configuration file",

	"url" : "http://127.0.0.1:9332/",
	"user" : "rpcuser",
	"pass" : "rpcpass",

	"algo" : "scrypt",
	"threads" : "4",

	"quiet" : true
}
07070100000020000081A4000003E800000064000000015EF4BCA100001B0D000000000000000000000000000000000000001700000000cpuminer-2.5.1/miner.h#ifndef __MINER_H__
#define __MINER_H__

#include "cpuminer-config.h"

#include <stdbool.h>
#include <inttypes.h>
#include <sys/time.h>
#include <pthread.h>
#include <jansson.h>
#include <curl/curl.h>

#ifdef STDC_HEADERS
# include <stdlib.h>
# include <stddef.h>
#else
# ifdef HAVE_STDLIB_H
#  include <stdlib.h>
# endif
#endif
#ifdef HAVE_ALLOCA_H
# include <alloca.h>
#elif !defined alloca
# ifdef __GNUC__
#  define alloca __builtin_alloca
# elif defined _AIX
#  define alloca __alloca
# elif defined _MSC_VER
#  include <malloc.h>
#  define alloca _alloca
# elif !defined HAVE_ALLOCA
#  ifdef  __cplusplus
extern "C"
#  endif
void *alloca (size_t);
# endif
#endif

#ifdef HAVE_SYSLOG_H
#include <syslog.h>
#else
enum {
	LOG_ERR,
	LOG_WARNING,
	LOG_NOTICE,
	LOG_INFO,
	LOG_DEBUG,
};
#endif

#undef unlikely
#undef likely
#if defined(__GNUC__) && (__GNUC__ > 2) && defined(__OPTIMIZE__)
#define unlikely(expr) (__builtin_expect(!!(expr), 0))
#define likely(expr) (__builtin_expect(!!(expr), 1))
#else
#define unlikely(expr) (expr)
#define likely(expr) (expr)
#endif

#ifndef ARRAY_SIZE
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
#endif

#if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
#define WANT_BUILTIN_BSWAP
#else
#define bswap_32(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \
                   | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
#endif

static inline uint32_t swab32(uint32_t v)
{
#ifdef WANT_BUILTIN_BSWAP
	return __builtin_bswap32(v);
#else
	return bswap_32(v);
#endif
}

#ifdef HAVE_SYS_ENDIAN_H
#include <sys/endian.h>
#endif

#if !HAVE_DECL_BE32DEC
static inline uint32_t be32dec(const void *pp)
{
	const uint8_t *p = (uint8_t const *)pp;
	return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) +
	    ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24));
}
#endif

#if !HAVE_DECL_LE32DEC
static inline uint32_t le32dec(const void *pp)
{
	const uint8_t *p = (uint8_t const *)pp;
	return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) +
	    ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24));
}
#endif

#if !HAVE_DECL_BE32ENC
static inline void be32enc(void *pp, uint32_t x)
{
	uint8_t *p = (uint8_t *)pp;
	p[3] = x & 0xff;
	p[2] = (x >> 8) & 0xff;
	p[1] = (x >> 16) & 0xff;
	p[0] = (x >> 24) & 0xff;
}
#endif

#if !HAVE_DECL_LE32ENC
static inline void le32enc(void *pp, uint32_t x)
{
	uint8_t *p = (uint8_t *)pp;
	p[0] = x & 0xff;
	p[1] = (x >> 8) & 0xff;
	p[2] = (x >> 16) & 0xff;
	p[3] = (x >> 24) & 0xff;
}
#endif

#if JANSSON_MAJOR_VERSION >= 2
#define JSON_LOADS(str, err_ptr) json_loads(str, 0, err_ptr)
#define JSON_LOAD_FILE(path, err_ptr) json_load_file(path, 0, err_ptr)
#else
#define JSON_LOADS(str, err_ptr) json_loads(str, err_ptr)
#define JSON_LOAD_FILE(path, err_ptr) json_load_file(path, err_ptr)
#endif
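
/* Example use (cf. the -c option handler in cpuminer.c):
 *	json_error_t err;
 *	json_t *config = JSON_LOAD_FILE(path, &err);
 */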

#define USER_AGENT PACKAGE_NAME "/" PACKAGE_VERSION

void sha256_init(uint32_t *state);
void sha256_transform(uint32_t *state, const uint32_t *block, int swap);
void sha256d(unsigned char *hash, const unsigned char *data, int len);

#ifdef USE_ASM
#if defined(__ARM_NEON__) || defined(__ALTIVEC__) || defined(__i386__) || defined(__x86_64__)
#define HAVE_SHA256_4WAY 1
int sha256_use_4way();
void sha256_init_4way(uint32_t *state);
void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap);
#endif
#if defined(__x86_64__) && defined(USE_AVX2)
#define HAVE_SHA256_8WAY 1
int sha256_use_8way();
void sha256_init_8way(uint32_t *state);
void sha256_transform_8way(uint32_t *state, const uint32_t *block, int swap);
#endif
#endif

extern int scanhash_sha256d(int thr_id, uint32_t *pdata,
	const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done);

extern unsigned char *scrypt_buffer_alloc(int N);
extern int scanhash_scrypt(int thr_id, uint32_t *pdata,
	unsigned char *scratchbuf, const uint32_t *ptarget,
	uint32_t max_nonce, unsigned long *hashes_done, int N);

struct thr_info {
	int		id;
	pthread_t	pth;
	struct thread_q	*q;
};

struct work_restart {
	volatile unsigned long	restart;
	char			padding[128 - sizeof(unsigned long)];
};

extern bool opt_debug;
extern bool opt_protocol;
extern bool opt_redirect;
extern int opt_timeout;
extern bool want_longpoll;
extern bool have_longpoll;
extern bool have_gbt;
extern bool allow_getwork;
extern bool want_stratum;
extern bool have_stratum;
extern char *opt_cert;
extern char *opt_proxy;
extern long opt_proxy_type;
extern bool use_syslog;
extern pthread_mutex_t applog_lock;
extern struct thr_info *thr_info;
extern int longpoll_thr_id;
extern int stratum_thr_id;
extern struct work_restart *work_restart;

#define JSON_RPC_LONGPOLL	(1 << 0)
#define JSON_RPC_QUIET_404	(1 << 1)

extern void applog(int prio, const char *fmt, ...);
extern json_t *json_rpc_call(CURL *curl, const char *url, const char *userpass,
	const char *rpc_req, int *curl_err, int flags);
void memrev(unsigned char *p, size_t len);
extern void bin2hex(char *s, const unsigned char *p, size_t len);
extern char *abin2hex(const unsigned char *p, size_t len);
extern bool hex2bin(unsigned char *p, const char *hexstr, size_t len);
extern int varint_encode(unsigned char *p, uint64_t n);
extern size_t address_to_script(unsigned char *out, size_t outsz, const char *addr);
extern int timeval_subtract(struct timeval *result, struct timeval *x,
	struct timeval *y);
extern bool fulltest(const uint32_t *hash, const uint32_t *target);
extern void diff_to_target(uint32_t *target, double diff);

struct stratum_job {
	char *job_id;
	unsigned char prevhash[32];
	size_t coinbase_size;
	unsigned char *coinbase;
	unsigned char *xnonce2;
	int merkle_count;
	unsigned char **merkle;
	unsigned char version[4];
	unsigned char nbits[4];
	unsigned char ntime[4];
	bool clean;
	double diff;
};

struct stratum_ctx {
	char *url;

	CURL *curl;
	char *curl_url;
	char curl_err_str[CURL_ERROR_SIZE];
	curl_socket_t sock;
	size_t sockbuf_size;
	char *sockbuf;
	pthread_mutex_t sock_lock;

	double next_diff;

	char *session_id;
	size_t xnonce1_size;
	unsigned char *xnonce1;
	size_t xnonce2_size;
	struct stratum_job job;
	pthread_mutex_t work_lock;
};

bool stratum_socket_full(struct stratum_ctx *sctx, int timeout);
bool stratum_send_line(struct stratum_ctx *sctx, char *s);
char *stratum_recv_line(struct stratum_ctx *sctx);
bool stratum_connect(struct stratum_ctx *sctx, const char *url);
void stratum_disconnect(struct stratum_ctx *sctx);
bool stratum_subscribe(struct stratum_ctx *sctx);
bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass);
bool stratum_handle_method(struct stratum_ctx *sctx, const char *s);
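
/* Roughly the call sequence used by the stratum thread:
 * stratum_connect(), stratum_subscribe(), stratum_authorize(),
 * then stratum_recv_line() / stratum_handle_method() in a loop. */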

struct thread_q;

extern struct thread_q *tq_new(void);
extern void tq_free(struct thread_q *tq);
extern bool tq_push(struct thread_q *tq, void *data);
extern void *tq_pop(struct thread_q *tq, const struct timespec *abstime);
extern void tq_freeze(struct thread_q *tq);
extern void tq_thaw(struct thread_q *tq);

#endif /* __MINER_H__ */
07070100000021000081A4000003E800000064000000015EF4BCA100001DB7000000000000000000000000000000000000001800000000cpuminer-2.5.1/minerd.1.TH MINERD 1 "June 2020" "cpuminer 2.5.1"
.SH NAME
minerd \- CPU miner for Bitcoin and Litecoin
.SH SYNOPSIS
.B minerd
[\fIOPTION\fR]...
.SH DESCRIPTION
.B minerd
is a multi-threaded CPU miner for Bitcoin, Litecoin and other cryptocurrencies.
It supports the getwork and getblocktemplate (BIP 22) methods,
as well as the Stratum mining protocol.
.PP
In its normal mode of operation, \fBminerd\fR connects to a mining server
(specified with the \fB\-o\fR option), receives work from it and starts hashing.
As soon as a solution is found, it is submitted to the same mining server,
which can accept or reject it.
When using getwork or getblocktemplate,
\fBminerd\fR can take advantage of long polling, if the server supports it;
in any case, fresh work is fetched as needed.
When using the Stratum protocol this is not possible,
and the server is responsible for sending fresh work at least every minute;
if it fails to do so,
\fBminerd\fR may drop the connection and try to reconnect.
.PP
By default, \fBminerd\fR writes all its messages to standard error.
On systems that have a syslog, the \fB\-\-syslog\fR option can be used
to write to it instead.
.PP
On start, the nice value of all miner threads is set to 19.
On Linux, the scheduling policy is also changed to SCHED_IDLE,
or to SCHED_BATCH if that fails.
On multiprocessor systems, \fBminerd\fR
automatically sets the CPU affinity of miner threads
if the number of threads is a multiple of the number of processors.
.SH EXAMPLES
To connect to a Litecoin mining pool that provides a Stratum server
at example.com on port 3333, authenticating as worker "foo" with password "bar":
.PP
.nf
.RS
minerd \-o stratum+tcp://example.com:3333 \-O foo:bar
.RE
.fi
.PP
To mine to a local Bitcoin testnet instance running on port 18332,
authenticating with username "rpcuser" and password "rpcpass":
.PP
.nf
.RS
minerd \-a sha256d \-o http://localhost:18332 \-O rpcuser:rpcpass \\
	\-\-coinbase\-addr=mpXwg4jMtRhuSpVq4xS3HFHmCmWp9NyGKt
.RE
.fi
.PP
To connect to a Litecoin P2Pool node running at my.server on port 9327,
mining in the background, sending output to the syslog facility,
and omitting the per-thread hashmeter output:
.PP
.nf
.RS
minerd \-BSq \-o http://my.server:9327
.RE
.fi
.SH OPTIONS
.TP
\fB\-a\fR, \fB\-\-algo\fR=\fIALGORITHM\fR
Set the hashing algorithm to use.
Default is scrypt.
Possible values are:
.RS 11
.TP 10
.B scrypt
scrypt(1024, 1, 1) (used by Litecoin)
.TP
.B scrypt:\fIN\fR
scrypt(\fIN\fR, 1, 1) (\fIN\fR must be a power of 2 greater than 1)
.TP
.B sha256d
SHA-256d (used by Bitcoin)
.RE
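For example, \fB\-a scrypt:2048\fR selects scrypt(2048, 1, 1).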
.TP
\fB\-\-benchmark\fR
Run in offline benchmark mode.
.TP
\fB\-B\fR, \fB\-\-background\fR
Run in the background as a daemon.
.TP
\fB\-\-cert\fR=\fIFILE\fR
Set an SSL certificate to use with the mining server.
Only supported when using the HTTPS protocol.
.TP
\fB\-\-coinbase\-addr\fR=\fIADDRESS\fR
Set a payout address for solo mining.
This is only used in getblocktemplate mode,
and only if the server does not provide a coinbase transaction.
It can be either a base-58 address, or a bech32 address (BIP 173).
.TP
\fB\-\-coinbase\-sig\fR=\fITEXT\fR
Set a string to be included in the coinbase (if allowed by the server).
This is only used in getblocktemplate mode.
.TP
\fB\-c\fR, \fB\-\-config\fR=\fIFILE\fR
Load options from a configuration file.
\fIFILE\fR must contain a JSON object
mapping long options to their arguments (as strings),
or to \fBtrue\fR if no argument is required.
Sample configuration file:

.nf
	{
		"url": "stratum+tcp://example.com:3333",
		"userpass": "foo:bar",
		"retry-pause": "10",
		"quiet": true
	}
.fi
.TP
\fB\-D\fR, \fB\-\-debug\fR
Enable debug output.
.TP
\fB\-h\fR, \fB\-\-help\fR
Print a help message and exit.
.TP
\fB\-\-no\-gbt\fR
Do not use the getblocktemplate RPC method.
.TP
\fB\-\-no\-getwork\fR
Do not use the getwork RPC method.
.TP
\fB\-\-no\-longpoll\fR
Do not use long polling.
.TP
\fB\-\-no\-redirect\fR
Ignore requests from the server to switch to a different URL.
.TP
\fB\-\-no\-stratum\fR
Do not switch to Stratum, even if the server advertises support for it.
.TP
\fB\-o\fR, \fB\-\-url\fR=[\fISCHEME\fR://][\fIUSERNAME\fR[:\fIPASSWORD\fR]@]\fIHOST\fR:\fIPORT\fR[/\fIPATH\fR]
Set the URL of the mining server to connect to.
Supported schemes are \fBhttp\fR, \fBhttps\fR, \fBstratum+tcp\fR
and \fBstratum+tcps\fR.
If no scheme is specified, http is assumed.
Specifying a \fIPATH\fR is only supported for HTTP and HTTPS.
Specifying credentials has the same effect as using the \fB\-O\fR option.

By default, on HTTP and HTTPS,
the miner tries to use the getblocktemplate RPC method,
and falls back to using getwork if getblocktemplate is unavailable.
This behavior can be modified by using the \fB\-\-no\-gbt\fR
and \fB\-\-no\-getwork\fR options.
.TP
\fB\-O\fR, \fB\-\-userpass\fR=\fIUSERNAME\fR:\fIPASSWORD\fR
Set the credentials to use for connecting to the mining server.
Any value previously set with \fB\-u\fR or \fB\-p\fR is discarded.
.TP
\fB\-p\fR, \fB\-\-pass\fR=\fIPASSWORD\fR
Set the password to use for connecting to the mining server.
Any password previously set with \fB\-O\fR is discarded.
.TP
\fB\-P\fR, \fB\-\-protocol\-dump\fR
Enable output of all protocol-level activities.
.TP
\fB\-q\fR, \fB\-\-quiet\fR
Disable per-thread hashmeter output.
.TP
\fB\-r\fR, \fB\-\-retries\fR=\fIN\fR
Set the maximum number of times to retry if a network call fails.
If not specified, the miner will retry indefinitely.
.TP
\fB\-R\fR, \fB\-\-retry\-pause\fR=\fISECONDS\fR
Set how long to wait between retries. Default is 30 seconds.
.TP
\fB\-s\fR, \fB\-\-scantime\fR=\fISECONDS\fR
Set an upper bound on the time the miner can go without fetching fresh work.
This setting has no effect in Stratum mode or when long polling is activated.
Default is 5 seconds.
.TP
\fB\-S\fR, \fB\-\-syslog\fR
Log to the syslog facility instead of standard error.
.TP
\fB\-t\fR, \fB\-\-threads\fR=\fIN\fR
Set the number of miner threads.
If not specified, the miner will try to detect the number of available processors
and use that.
.TP
\fB\-T\fR, \fB\-\-timeout\fR=\fISECONDS\fR
Set a timeout for long polling.
.TP
\fB\-u\fR, \fB\-\-user\fR=\fIUSERNAME\fR
Set the username to use for connecting to the mining server.
Any username previously set with \fB\-O\fR is discarded.
.TP
\fB\-V\fR, \fB\-\-version\fR
Display version information and quit.
.TP
\fB\-x\fR, \fB\-\-proxy\fR=[\fISCHEME\fR://][\fIUSERNAME\fR:\fIPASSWORD\fR@]\fIHOST\fR:\fIPORT\fR
Connect to the mining server through a proxy.
Supported schemes are: \fBhttp\fR, \fBsocks4\fR, \fBsocks5\fR.
Since libcurl 7.18.0, the following are also supported:
\fBsocks4a\fR, \fBsocks5h\fR (SOCKS5 with remote name resolving).
If no scheme is specified, the proxy is treated as an HTTP proxy.
.SH ENVIRONMENT
The following environment variables can be specified in lower case or upper case;
the lower-case version has precedence. \fBhttp_proxy\fR is an exception
as it is only available in lower case.
.PP
.RS
.TP
\fBhttp_proxy\fR [\fISCHEME\fR://]\fIHOST\fR:\fIPORT\fR
Sets the proxy server to use for HTTP.
.TP
\fBHTTPS_PROXY\fR [\fISCHEME\fR://]\fIHOST\fR:\fIPORT\fR
Sets the proxy server to use for HTTPS.
.TP
\fBALL_PROXY\fR [\fISCHEME\fR://]\fIHOST\fR:\fIPORT\fR
Sets the proxy server to use if no protocol-specific proxy is set.
.RE
.PP
Using an environment variable to set the proxy has the same effect as
using the \fB\-x\fR option.
.SH AUTHOR
Most of the code in the current version of minerd was written by
Pooler <pooler@litecoinpool.org> with contributions from others.

The original minerd was written by Jeff Garzik <jeff@garzik.org>.
07070100000022000081ED000003E800000064000000015EF4BCA100000432000000000000000000000000000000000000001A00000000cpuminer-2.5.1/nomacro.pl#!/usr/bin/perl
# Copyright 2012, 2015 pooler@litecoinpool.org
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.  See COPYING for more details.
#
# nomacro.pl - expand assembler macros.

use strict;

foreach my $f (<*.S>) {
	rename $f, "$f.orig" unless -e "$f.orig";
	open FIN, "$f.orig";
	open FOUT, ">$f";
	my %macros = ();
	my %m = ();
	while (<FIN>) {
		# start of a macro definition: record its name and argument list
		if (m/^\.macro\s+(\w+)\s*(.*)$/) {
			$m{name} = $1;
			$m{args} = [split /\s*,\s*/, $2];
			$m{body} = "";
			next;
		}
		# end of a macro definition: file it away
		if (m/^\.endm/) {
			$macros{$m{name}} = {%m};
			%m = ();
			next;
		}
		# expand an invocation of a known macro in place
		for my $n (keys %macros) {
			if (m/^\s*$n\b\s*(.*)$/) {
				my @a = split /\s*,\s*/, $1;
				$_ = $macros{$n}{body};
				for my $i (0 .. $#{$macros{$n}{args}}) {
					s/\\$macros{$n}{args}[$i]\b/$a[$i]/g;
				}
				last;
			}
		}
		# inside a definition: accumulate the body instead of printing
		if (%m) {
			$m{body} .= $_;
			next;
		}
		print FOUT;
	}
	close FOUT;
	close FIN;
}
07070100000023000081A4000003E800000064000000015EF4BCA100005A43000000000000000000000000000000000000001C00000000cpuminer-2.5.1/scrypt-arm.S/*
 * Copyright 2012, 2014 pooler@litecoinpool.org
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.  See COPYING for more details.
 */

#include "cpuminer-config.h"

#if defined(USE_ASM) && defined(__arm__) && defined(__APCS_32__)

#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \
	defined(__ARM_ARCH_5TEJ__) || defined(__ARM_ARCH_6__) || \
	defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || \
	defined(__ARM_ARCH_6M__) || defined(__ARM_ARCH_6T2__) || \
	defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__)
#define __ARM_ARCH_5E_OR_6__
#endif

#if defined(__ARM_ARCH_5E_OR_6__) || defined(__ARM_ARCH_7__) || \
	defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || \
	defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7EM__)
#define __ARM_ARCH_5E_OR_6_OR_7__
#endif

#ifdef __ARM_ARCH_5E_OR_6__

.macro scrypt_shuffle
	add	lr, r0, #9*4
	ldmia	r0, {r2-r7}
	ldmia	lr, {r2, r8-r12, lr}
	str	r3, [r0, #5*4]
	str	r5, [r0, #15*4]
	str	r6, [r0, #12*4]
	str	r7, [r0, #1*4]
	ldr r5, [r0, #7*4]
	str	r2, [r0, #13*4]
	str	r8, [r0, #2*4]
	strd	r4, [r0, #10*4]
	str	r9, [r0, #7*4]
	str	r10, [r0, #4*4]
	str	r11, [r0, #9*4]
	str	lr, [r0, #3*4]
	
	add	r2, r0, #64+0*4
	add	lr, r0, #64+9*4
	ldmia	r2, {r2-r7}
	ldmia	lr, {r2, r8-r12, lr}
	str	r3, [r0, #64+5*4]
	str	r5, [r0, #64+15*4]
	str	r6, [r0, #64+12*4]
	str	r7, [r0, #64+1*4]
	ldr r5, [r0, #64+7*4]
	str	r2, [r0, #64+13*4]
	str	r8, [r0, #64+2*4]
	strd	r4, [r0, #64+10*4]
	str	r9, [r0, #64+7*4]
	str	r10, [r0, #64+4*4]
	str	r11, [r0, #64+9*4]
	str	lr, [r0, #64+3*4]
.endm

.macro salsa8_core_doubleround_body
	add	r6, r2, r6
	add	r7, r3, r7
	eor	r10, r10, r6, ror #25
	add	r6, r0, r4
	eor	r11, r11, r7, ror #25
	add	r7, r1, r5
	strd	r10, [sp, #14*4]
	eor	r12, r12, r6, ror #25
	eor	lr, lr, r7, ror #25
	
	ldrd	r6, [sp, #10*4]
	add	r2, r10, r2
	add	r3, r11, r3
	eor	r6, r6, r2, ror #23
	add	r2, r12, r0
	eor	r7, r7, r3, ror #23
	add	r3, lr, r1
	strd	r6, [sp, #10*4]
	eor	r8, r8, r2, ror #23
	eor	r9, r9, r3, ror #23
	
	ldrd	r2, [sp, #6*4]
	add	r10, r6, r10
	add	r11, r7, r11
	eor	r2, r2, r10, ror #19
	add	r10, r8, r12
	eor	r3, r3, r11, ror #19
	add	r11, r9, lr
	eor	r4, r4, r10, ror #19
	eor	r5, r5, r11, ror #19
	
	ldrd	r10, [sp, #2*4]
	add	r6, r2, r6
	add	r7, r3, r7
	eor	r10, r10, r6, ror #14
	add	r6, r4, r8
	eor	r11, r11, r7, ror #14
	add	r7, r5, r9
	eor	r0, r0, r6, ror #14
	eor	r1, r1, r7, ror #14
	
	
	ldrd	r6, [sp, #14*4]
	strd	r2, [sp, #6*4]
	strd	r10, [sp, #2*4]
	add	r6, r11, r6
	add	r7, r0, r7
	eor	r4, r4, r6, ror #25
	add	r6, r1, r12
	eor	r5, r5, r7, ror #25
	add	r7, r10, lr
	eor	r2, r2, r6, ror #25
	eor	r3, r3, r7, ror #25
	strd	r2, [sp, #6*4]
	
	add	r10, r3, r10
	ldrd	r6, [sp, #10*4]
	add	r11, r4, r11
	eor	r8, r8, r10, ror #23
	add	r10, r5, r0
	eor	r9, r9, r11, ror #23
	add	r11, r2, r1
	eor	r6, r6, r10, ror #23
	eor	r7, r7, r11, ror #23
	strd	r6, [sp, #10*4]
	
	add	r2, r7, r2
	ldrd	r10, [sp, #14*4]
	add	r3, r8, r3
	eor	r12, r12, r2, ror #19
	add	r2, r9, r4
	eor	lr, lr, r3, ror #19
	add	r3, r6, r5
	eor	r10, r10, r2, ror #19
	eor	r11, r11, r3, ror #19
	
	ldrd	r2, [sp, #2*4]
	add	r6, r11, r6
	add	r7, r12, r7
	eor	r0, r0, r6, ror #14
	add	r6, lr, r8
	eor	r1, r1, r7, ror #14
	add	r7, r10, r9
	eor	r2, r2, r6, ror #14
	eor	r3, r3, r7, ror #14
.endm

.macro salsa8_core
	ldmia	sp, {r0-r12, lr}
	
	ldrd	r10, [sp, #14*4]
	salsa8_core_doubleround_body
	ldrd	r6, [sp, #6*4]
	strd	r2, [sp, #2*4]
	strd	r10, [sp, #14*4]
	salsa8_core_doubleround_body
	ldrd	r6, [sp, #6*4]
	strd	r2, [sp, #2*4]
	strd	r10, [sp, #14*4]
	salsa8_core_doubleround_body
	ldrd	r6, [sp, #6*4]
	strd	r2, [sp, #2*4]
	strd	r10, [sp, #14*4]
	salsa8_core_doubleround_body
	
	stmia	sp, {r0-r5}
	strd	r8, [sp, #8*4]
	str	r12, [sp, #12*4]
	str	lr, [sp, #13*4]
	strd	r10, [sp, #14*4]
.endm

#else

.macro scrypt_shuffle
.endm

.macro salsa8_core_doubleround_body
	ldr	r8, [sp, #8*4]
	add	r11, r11, r10
	ldr	lr, [sp, #13*4]
	add	r12, r12, r3
	eor	r2, r2, r11, ror #23
	add	r11, r4, r0
	eor	r7, r7, r12, ror #23
	add	r12, r9, r5
	str	r9, [sp, #9*4]
	eor	r8, r8, r11, ror #23
	str	r10, [sp, #14*4]
	eor	lr, lr, r12, ror #23
	
	ldr	r11, [sp, #11*4]
	add	r9, lr, r9
	ldr	r12, [sp, #12*4]
	add	r10, r2, r10
	eor	r1, r1, r9, ror #19
	add	r9, r7, r3
	eor	r6, r6, r10, ror #19
	add	r10, r8, r4
	str	r8, [sp, #8*4]
	eor	r11, r11, r9, ror #19
	str	lr, [sp, #13*4]
	eor	r12, r12, r10, ror #19
	
	ldr	r9, [sp, #10*4]
	add	r8, r12, r8
	ldr	r10, [sp, #15*4]
	add	lr, r1, lr
	eor	r0, r0, r8, ror #14
	add	r8, r6, r2
	eor	r5, r5, lr, ror #14
	add	lr, r11, r7
	eor	r9, r9, r8, ror #14
	ldr	r8, [sp, #9*4]
	eor	r10, r10, lr, ror #14
	ldr	lr, [sp, #14*4]
	
	
	add	r8, r9, r8
	str	r9, [sp, #10*4]
	add	lr, r10, lr
	str	r10, [sp, #15*4]
	eor	r11, r11, r8, ror #25
	add	r8, r0, r3
	eor	r12, r12, lr, ror #25
	add	lr, r5, r4
	eor	r1, r1, r8, ror #25
	ldr	r8, [sp, #8*4]
	eor	r6, r6, lr, ror #25
	
	add	r9, r11, r9
	ldr	lr, [sp, #13*4]
	add	r10, r12, r10
	eor	r8, r8, r9, ror #23
	add	r9, r1, r0
	eor	lr, lr, r10, ror #23
	add	r10, r6, r5
	str	r11, [sp, #11*4]
	eor	r2, r2, r9, ror #23
	str	r12, [sp, #12*4]
	eor	r7, r7, r10, ror #23
	
	ldr	r9, [sp, #9*4]
	add	r11, r8, r11
	ldr	r10, [sp, #14*4]
	add	r12, lr, r12
	eor	r9, r9, r11, ror #19
	add	r11, r2, r1
	eor	r10, r10, r12, ror #19
	add	r12, r7, r6
	str	r8, [sp, #8*4]
	eor	r3, r3, r11, ror #19
	str	lr, [sp, #13*4]
	eor	r4, r4, r12, ror #19
.endm

.macro salsa8_core
	ldmia	sp, {r0-r7}
	
	ldr	r12, [sp, #15*4]
	ldr	r8, [sp, #11*4]
	ldr	lr, [sp, #12*4]
	
	ldr	r9, [sp, #9*4]
	add	r8, r8, r12
	ldr	r11, [sp, #10*4]
	add	lr, lr, r0
	eor	r3, r3, r8, ror #25
	add	r8, r5, r1
	ldr	r10, [sp, #14*4]
	eor	r4, r4, lr, ror #25
	add	lr, r11, r6
	eor	r9, r9, r8, ror #25
	eor	r10, r10, lr, ror #25
	
	salsa8_core_doubleround_body
	
	ldr	r11, [sp, #10*4]
	add	r8, r9, r8
	ldr	r12, [sp, #15*4]
	add	lr, r10, lr
	eor	r11, r11, r8, ror #14
	add	r8, r3, r2
	eor	r12, r12, lr, ror #14
	add	lr, r4, r7
	eor	r0, r0, r8, ror #14
	ldr	r8, [sp, #11*4]
	eor	r5, r5, lr, ror #14
	ldr	lr, [sp, #12*4]
	
	add	r8, r8, r12
	str	r11, [sp, #10*4]
	add	lr, lr, r0
	str	r12, [sp, #15*4]
	eor	r3, r3, r8, ror #25
	add	r8, r5, r1
	eor	r4, r4, lr, ror #25
	add	lr, r11, r6
	str	r9, [sp, #9*4]
	eor	r9, r9, r8, ror #25
	str	r10, [sp, #14*4]
	eor	r10, r10, lr, ror #25
	
	salsa8_core_doubleround_body
	
	ldr	r11, [sp, #10*4]
	add	r8, r9, r8
	ldr	r12, [sp, #15*4]
	add	lr, r10, lr
	eor	r11, r11, r8, ror #14
	add	r8, r3, r2
	eor	r12, r12, lr, ror #14
	add	lr, r4, r7
	eor	r0, r0, r8, ror #14
	ldr	r8, [sp, #11*4]
	eor	r5, r5, lr, ror #14
	ldr	lr, [sp, #12*4]
	
	add	r8, r8, r12
	str	r11, [sp, #10*4]
	add	lr, lr, r0
	str	r12, [sp, #15*4]
	eor	r3, r3, r8, ror #25
	add	r8, r5, r1
	eor	r4, r4, lr, ror #25
	add	lr, r11, r6
	str	r9, [sp, #9*4]
	eor	r9, r9, r8, ror #25
	str	r10, [sp, #14*4]
	eor	r10, r10, lr, ror #25
	
	salsa8_core_doubleround_body
	
	ldr	r11, [sp, #10*4]
	add	r8, r9, r8
	ldr	r12, [sp, #15*4]
	add	lr, r10, lr
	eor	r11, r11, r8, ror #14
	add	r8, r3, r2
	eor	r12, r12, lr, ror #14
	add	lr, r4, r7
	eor	r0, r0, r8, ror #14
	ldr	r8, [sp, #11*4]
	eor	r5, r5, lr, ror #14
	ldr	lr, [sp, #12*4]
	
	add	r8, r8, r12
	str	r11, [sp, #10*4]
	add	lr, lr, r0
	str	r12, [sp, #15*4]
	eor	r3, r3, r8, ror #25
	add	r8, r5, r1
	eor	r4, r4, lr, ror #25
	add	lr, r11, r6
	str	r9, [sp, #9*4]
	eor	r9, r9, r8, ror #25
	str	r10, [sp, #14*4]
	eor	r10, r10, lr, ror #25
	
	salsa8_core_doubleround_body
	
	ldr	r11, [sp, #10*4]
	add	r8, r9, r8
	ldr	r12, [sp, #15*4]
	add	lr, r10, lr
	str	r9, [sp, #9*4]
	eor	r11, r11, r8, ror #14
	eor	r12, r12, lr, ror #14
	add	r8, r3, r2
	str	r10, [sp, #14*4]
	add	lr, r4, r7
	str	r11, [sp, #10*4]
	eor	r0, r0, r8, ror #14
	str	r12, [sp, #15*4]
	eor	r5, r5, lr, ror #14
	
	stmia	sp, {r0-r7}
.endm

#endif


.macro scrypt_core_macro1a_x4
	ldmia	r0, {r4-r7}
	ldmia	lr!, {r8-r11}
	stmia	r1!, {r4-r7}
	stmia	r3!, {r8-r11}
	eor	r4, r4, r8
	eor	r5, r5, r9
	eor	r6, r6, r10
	eor	r7, r7, r11
	stmia	r0!, {r4-r7}
	stmia	r12!, {r4-r7}
.endm

.macro scrypt_core_macro1b_x4
	ldmia	r3!, {r8-r11}
	ldmia	r2, {r4-r7}
	eor	r8, r8, r4
	eor	r9, r9, r5
	eor	r10, r10, r6
	eor	r11, r11, r7
	ldmia	r0, {r4-r7}
	stmia	r2!, {r8-r11}
	eor	r4, r4, r8
	eor	r5, r5, r9
	eor	r6, r6, r10
	eor	r7, r7, r11
	ldmia	r1!, {r8-r11}
	eor	r4, r4, r8
	eor	r5, r5, r9
	eor	r6, r6, r10
	eor	r7, r7, r11
	stmia	r0!, {r4-r7}
	stmia	r12!, {r4-r7}
.endm

.macro scrypt_core_macro2_x4
	ldmia	r12, {r4-r7}
	ldmia	r0, {r8-r11}
	add	r4, r4, r8
	add	r5, r5, r9
	add	r6, r6, r10
	add	r7, r7, r11
	stmia	r0!, {r4-r7}
	ldmia	r2, {r8-r11}
	eor	r4, r4, r8
	eor	r5, r5, r9
	eor	r6, r6, r10
	eor	r7, r7, r11
	stmia	r2!, {r4-r7}
	stmia	r12!, {r4-r7}
.endm

.macro scrypt_core_macro3_x4
	ldmia	r1!, {r4-r7}
	ldmia	r0, {r8-r11}
	add	r4, r4, r8
	add	r5, r5, r9
	add	r6, r6, r10
	add	r7, r7, r11
	stmia	r0!, {r4-r7}
.endm

.macro scrypt_core_macro3_x6
	ldmia	r1!, {r2-r7}
	ldmia	r0, {r8-r12, lr}
	add	r2, r2, r8
	add	r3, r3, r9
	add	r4, r4, r10
	add	r5, r5, r11
	add	r6, r6, r12
	add	r7, r7, lr
	stmia	r0!, {r2-r7}
.endm


	.text
	.code 32
	.align 2
	.globl scrypt_core
	.globl _scrypt_core
#ifdef __ELF__
	.type scrypt_core, %function
#endif
scrypt_core:
_scrypt_core:
	stmfd	sp!, {r4-r11, lr}
	mov	r12, sp
	sub	sp, sp, #22*4
	bic	sp, sp, #63
	str	r12, [sp, #20*4]
	str	r2, [sp, #21*4]
	
	scrypt_shuffle
	
	ldr	r2, [sp, #21*4]
	str	r0, [sp, #16*4]
	add	r12, r1, r2, lsl #7
	str	r12, [sp, #18*4]
scrypt_core_loop1:
	add	lr, r0, #16*4
	add	r3, r1, #16*4
	mov	r12, sp
	scrypt_core_macro1a_x4
	scrypt_core_macro1a_x4
	scrypt_core_macro1a_x4
	scrypt_core_macro1a_x4
	str	r1, [sp, #17*4]
	
	salsa8_core
	
	ldr	r0, [sp, #16*4]
	mov	r12, sp
	add	r2, r0, #16*4
	scrypt_core_macro2_x4
	scrypt_core_macro2_x4
	scrypt_core_macro2_x4
	scrypt_core_macro2_x4
	
	salsa8_core
	
	ldr	r0, [sp, #16*4]
	mov	r1, sp
	add	r0, r0, #16*4
	scrypt_core_macro3_x6
	scrypt_core_macro3_x6
	ldr	r3, [sp, #17*4]
	ldr	r12, [sp, #18*4]
	scrypt_core_macro3_x4
	
	add	r1, r3, #16*4
	sub	r0, r0, #32*4
	cmp	r1, r12
	bne	scrypt_core_loop1
	
	ldr	r12, [sp, #21*4]
	ldr	r4, [r0, #16*4]
	sub	r2, r12, #1
	str	r2, [sp, #21*4]
	sub	r1, r1, r12, lsl #7
	str	r1, [sp, #17*4]
	and	r4, r4, r2
	add	r1, r1, r4, lsl #7
scrypt_core_loop2:
	add	r2, r0, #16*4
	add	r3, r1, #16*4
	str	r12, [sp, #18*4]
	mov	r12, sp
#ifdef __ARM_ARCH_5E_OR_6_OR_7__
	pld [r1, #24*4]
	pld [r1, #8*4]
#endif
	scrypt_core_macro1b_x4
	scrypt_core_macro1b_x4
	scrypt_core_macro1b_x4
	scrypt_core_macro1b_x4
	
	salsa8_core
	
	ldr	r0, [sp, #16*4]
	mov	r12, sp
	add	r2, r0, #16*4
	scrypt_core_macro2_x4
	scrypt_core_macro2_x4
	scrypt_core_macro2_x4
	scrypt_core_macro2_x4
	
	salsa8_core
	
	ldr	r0, [sp, #16*4]
	mov	r1, sp
	ldr	r3, [sp, #17*4]
	add	r0, r0, #16*4
	ldr	r2, [sp, #21*4]
	scrypt_core_macro3_x4
	and	r4, r4, r2
	add	r3, r3, r4, lsl #7
	str	r3, [sp, #19*4]
#ifdef __ARM_ARCH_5E_OR_6_OR_7__
	pld	[r3, #16*4]
	pld	[r3]
#endif
	scrypt_core_macro3_x6
	scrypt_core_macro3_x6
	
	ldr	r12, [sp, #18*4]
	sub	r0, r0, #32*4
	ldr	r1, [sp, #19*4]
	subs	r12, r12, #1
	bne	scrypt_core_loop2
	
	scrypt_shuffle
	
	ldr	sp, [sp, #20*4]
#ifdef __thumb__
	ldmfd	sp!, {r4-r11, lr}
	bx	lr
#else
	ldmfd	sp!, {r4-r11, pc}
#endif


#ifdef __ARM_NEON__

.macro salsa8_core_3way_doubleround
	ldrd	r6, [sp, #6*4]
	vadd.u32	q4, q0, q1
	add	r6, r2, r6
	vadd.u32	q6, q8, q9
	add	r7, r3, r7
	vshl.u32	q5, q4, #7
	eor	r10, r10, r6, ror #25
	vshl.u32	q7, q6, #7
	add	r6, r0, r4
	vshr.u32	q4, q4, #32-7
	eor	r11, r11, r7, ror #25
	vshr.u32	q6, q6, #32-7
	add	r7, r1, r5
	veor.u32	q3, q3, q5
	strd	r10, [sp, #14*4]
	veor.u32	q11, q11, q7
	eor	r12, r12, r6, ror #25
	veor.u32	q3, q3, q4
	eor	lr, lr, r7, ror #25
	veor.u32	q11, q11, q6
	
	ldrd	r6, [sp, #10*4]
	vadd.u32	q4, q3, q0
	add	r2, r10, r2
	vadd.u32	q6, q11, q8
	add	r3, r11, r3
	vshl.u32	q5, q4, #9
	eor	r6, r6, r2, ror #23
	vshl.u32	q7, q6, #9
	add	r2, r12, r0
	vshr.u32	q4, q4, #32-9
	eor	r7, r7, r3, ror #23
	vshr.u32	q6, q6, #32-9
	add	r3, lr, r1
	veor.u32	q2, q2, q5
	strd	r6, [sp, #10*4]
	veor.u32	q10, q10, q7
	eor	r8, r8, r2, ror #23
	veor.u32	q2, q2, q4
	eor	r9, r9, r3, ror #23
	veor.u32	q10, q10, q6
	
	ldrd	r2, [sp, #6*4]
	vadd.u32	q4, q2, q3
	add	r10, r6, r10
	vadd.u32	q6, q10, q11
	add	r11, r7, r11
	vext.u32	q3, q3, q3, #3
	eor	r2, r2, r10, ror #19
	vshl.u32	q5, q4, #13
	add	r10, r8, r12
	vext.u32	q11, q11, q11, #3
	eor	r3, r3, r11, ror #19
	vshl.u32	q7, q6, #13
	add	r11, r9, lr
	vshr.u32	q4, q4, #32-13
	eor	r4, r4, r10, ror #19
	vshr.u32	q6, q6, #32-13
	eor	r5, r5, r11, ror #19
	veor.u32	q1, q1, q5
	veor.u32	q9, q9, q7
	veor.u32	q1, q1, q4
	veor.u32	q9, q9, q6
	
	ldrd	r10, [sp, #2*4]
	vadd.u32	q4, q1, q2
	add	r6, r2, r6
	vadd.u32	q6, q9, q10
	add	r7, r3, r7
	vswp.u32	d4, d5
	eor	r10, r10, r6, ror #14
	vshl.u32	q5, q4, #18
	add	r6, r4, r8
	vswp.u32	d20, d21
	eor	r11, r11, r7, ror #14
	vshl.u32	q7, q6, #18
	add	r7, r5, r9
	vshr.u32	q4, q4, #32-18
	eor	r0, r0, r6, ror #14
	vshr.u32	q6, q6, #32-18
	eor	r1, r1, r7, ror #14
	veor.u32	q0, q0, q5
	ldrd	r6, [sp, #14*4]
	veor.u32	q8, q8, q7
	veor.u32	q0, q0, q4
	veor.u32	q8, q8, q6
	
	
	strd	r2, [sp, #6*4]
	vadd.u32	q4, q0, q3
	strd	r10, [sp, #2*4]
	vadd.u32	q6, q8, q11
	add	r6, r11, r6
	vext.u32	q1, q1, q1, #1
	add	r7, r0, r7
	vshl.u32	q5, q4, #7
	eor	r4, r4, r6, ror #25
	vext.u32	q9, q9, q9, #1
	add	r6, r1, r12
	vshl.u32	q7, q6, #7
	eor	r5, r5, r7, ror #25
	vshr.u32	q4, q4, #32-7
	add	r7, r10, lr
	vshr.u32	q6, q6, #32-7
	eor	r2, r2, r6, ror #25
	veor.u32	q1, q1, q5
	eor	r3, r3, r7, ror #25
	veor.u32	q9, q9, q7
	strd	r2, [sp, #6*4]
	veor.u32	q1, q1, q4
	veor.u32	q9, q9, q6
	
	add	r10, r3, r10
	vadd.u32	q4, q1, q0
	ldrd	r6, [sp, #10*4]
	vadd.u32	q6, q9, q8
	add	r11, r4, r11
	vshl.u32	q5, q4, #9
	eor	r8, r8, r10, ror #23
	vshl.u32	q7, q6, #9
	add	r10, r5, r0
	vshr.u32	q4, q4, #32-9
	eor	r9, r9, r11, ror #23
	vshr.u32	q6, q6, #32-9
	add	r11, r2, r1
	veor.u32	q2, q2, q5
	eor	r6, r6, r10, ror #23
	veor.u32	q10, q10, q7
	eor	r7, r7, r11, ror #23
	veor.u32	q2, q2, q4
	strd	r6, [sp, #10*4]
	veor.u32	q10, q10, q6
	
	add	r2, r7, r2
	vadd.u32	q4, q2, q1
	ldrd	r10, [sp, #14*4]
	vadd.u32	q6, q10, q9
	add	r3, r8, r3
	vext.u32	q1, q1, q1, #3
	eor	r12, r12, r2, ror #19
	vshl.u32	q5, q4, #13
	add	r2, r9, r4
	vext.u32	q9, q9, q9, #3
	eor	lr, lr, r3, ror #19
	vshl.u32	q7, q6, #13
	add	r3, r6, r5
	vshr.u32	q4, q4, #32-13
	eor	r10, r10, r2, ror #19
	vshr.u32	q6, q6, #32-13
	eor	r11, r11, r3, ror #19
	veor.u32	q3, q3, q5
	veor.u32	q11, q11, q7
	veor.u32	q3, q3, q4
	veor.u32	q11, q11, q6
	
	ldrd	r2, [sp, #2*4]
	vadd.u32	q4, q3, q2
	add	r6, r11, r6
	vadd.u32	q6, q11, q10
	add	r7, r12, r7
	vswp.u32	d4, d5
	eor	r0, r0, r6, ror #14
	vshl.u32	q5, q4, #18
	add	r6, lr, r8
	vswp.u32	d20, d21
	eor	r1, r1, r7, ror #14
	vshl.u32	q7, q6, #18
	add	r7, r10, r9
	vext.u32	q3, q3, q3, #1
	eor	r2, r2, r6, ror #14
	vshr.u32	q4, q4, #32-18
	eor	r3, r3, r7, ror #14
	vshr.u32	q6, q6, #32-18
	strd	r2, [sp, #2*4]
	vext.u32	q11, q11, q11, #1
	strd	r10, [sp, #14*4]
	veor.u32	q0, q0, q5
	veor.u32	q8, q8, q7
	veor.u32	q0, q0, q4
	veor.u32	q8, q8, q6
.endm

.macro salsa8_core_3way
	ldmia	sp, {r0-r12, lr}
	ldrd	r10, [sp, #14*4]
	salsa8_core_3way_doubleround
	salsa8_core_3way_doubleround
	salsa8_core_3way_doubleround
	salsa8_core_3way_doubleround
	stmia	sp, {r0-r5}
	strd	r8, [sp, #8*4]
	str	r12, [sp, #12*4]
	str	lr, [sp, #13*4]
.endm

	.text
	.code 32
	.align 2
	.globl scrypt_core_3way
	.globl _scrypt_core_3way
#ifdef __ELF__
	.type scrypt_core_3way, %function
#endif
scrypt_core_3way:
_scrypt_core_3way:
	stmfd	sp!, {r4-r11, lr}
	vpush	{q4-q7}
	mov	r12, sp
	sub	sp, sp, #24*16
	bic	sp, sp, #63
	str	r2, [sp, #4*16+3*4]
	str	r12, [sp, #4*16+4*4]
	
	mov	r3, r0
	vldmia	r3!, {q8-q15}
	vmov.u64	q0, #0xffffffff
	vmov.u32	q1, q8
	vmov.u32	q2, q12
	vbif.u32	q8, q9, q0
	vbif.u32	q12, q13, q0
	vbif.u32	q9, q10, q0
	vbif.u32	q13, q14, q0
	vbif.u32	q10, q11, q0
	vbif.u32	q14, q15, q0
	vbif.u32	q11, q1, q0
	vbif.u32	q15, q2, q0
	vldmia	r3!, {q0-q7}
	vswp.u32	d17, d21
	vswp.u32	d25, d29
	vswp.u32	d18, d22
	vswp.u32	d26, d30
	vstmia	r0, {q8-q15}
	vmov.u64	q8, #0xffffffff
	vmov.u32	q9, q0
	vmov.u32	q10, q4
	vbif.u32	q0, q1, q8
	vbif.u32	q4, q5, q8
	vbif.u32	q1, q2, q8
	vbif.u32	q5, q6, q8
	vbif.u32	q2, q3, q8
	vbif.u32	q6, q7, q8
	vbif.u32	q3, q9, q8
	vbif.u32	q7, q10, q8
	vldmia	r3, {q8-q15}
	vswp.u32	d1, d5
	vswp.u32	d9, d13
	vswp.u32	d2, d6
	vswp.u32	d10, d14
	add	r12, sp, #8*16
	vstmia	r12!, {q0-q7}
	vmov.u64	q0, #0xffffffff
	vmov.u32	q1, q8
	vmov.u32	q2, q12
	vbif.u32	q8, q9, q0
	vbif.u32	q12, q13, q0
	vbif.u32	q9, q10, q0
	vbif.u32	q13, q14, q0
	vbif.u32	q10, q11, q0
	vbif.u32	q14, q15, q0
	vbif.u32	q11, q1, q0
	vbif.u32	q15, q2, q0
	vswp.u32	d17, d21
	vswp.u32	d25, d29
	vswp.u32	d18, d22
	vswp.u32	d26, d30
	vstmia	r12, {q8-q15}
	
	add	lr, sp, #128
	vldmia	lr, {q0-q7}
	add	r2, r1, r2, lsl #7
	str	r0, [sp, #4*16+0*4]
	str	r2, [sp, #4*16+2*4]
scrypt_core_3way_loop1:
	add	lr, r0, #16*4
	add	r3, r1, #16*4
	str	r1, [sp, #4*16+1*4]
	mov	r12, sp
	scrypt_core_macro1a_x4
	scrypt_core_macro1a_x4
	scrypt_core_macro1a_x4
	ldr	r2, [sp, #4*16+3*4]
	scrypt_core_macro1a_x4
	sub	r1, r1, #4*16
	
	add	r1, r1, r2, lsl #7
	vstmia	r1, {q0-q7}
	add	r3, r1, r2, lsl #7
	vstmia	r3, {q8-q15}
	
	add	lr, sp, #128
	veor.u32	q0, q0, q4
	veor.u32	q1, q1, q5
	veor.u32	q2, q2, q6
	veor.u32	q3, q3, q7
	vstmia	lr, {q0-q3}
	veor.u32	q8, q8, q12
	veor.u32	q9, q9, q13
	veor.u32	q10, q10, q14
	veor.u32	q11, q11, q15
	add	r12, sp, #256
	vstmia	r12, {q8-q11}
	
	salsa8_core_3way
	
	ldr	r0, [sp, #4*16+0*4]
	mov	r12, sp
	add	r2, r0, #16*4
	scrypt_core_macro2_x4
	scrypt_core_macro2_x4
	scrypt_core_macro2_x4
	scrypt_core_macro2_x4
	
	add	lr, sp, #128
	vldmia	lr, {q4-q7}
	vadd.u32	q4, q4, q0
	vadd.u32	q5, q5, q1
	vadd.u32	q6, q6, q2
	vadd.u32	q7, q7, q3
	add	r12, sp, #256
	vldmia	r12, {q0-q3}
	vstmia	lr, {q4-q7}
	vadd.u32	q8, q8, q0
	vadd.u32	q9, q9, q1
	vadd.u32	q10, q10, q2
	vadd.u32	q11, q11, q3
	
	add	r4, sp, #128+4*16
	vldmia	r4, {q0-q3}
	vstmia	r12, {q8-q11}
	veor.u32	q0, q0, q4
	veor.u32	q1, q1, q5
	veor.u32	q2, q2, q6
	veor.u32	q3, q3, q7
	vstmia	r4, {q0-q3}
	veor.u32	q8, q8, q12
	veor.u32	q9, q9, q13
	veor.u32	q10, q10, q14
	veor.u32	q11, q11, q15
	vmov	q12, q8
	vmov	q13, q9
	vmov	q14, q10
	vmov	q15, q11
	
	salsa8_core_3way
	
	ldr	r0, [sp, #4*16+0*4]
	mov	r1, sp
	add	r0, r0, #16*4
	scrypt_core_macro3_x6
	scrypt_core_macro3_x6
	scrypt_core_macro3_x4
	sub	r0, r0, #8*16
	
	ldr	r1, [sp, #4*16+1*4]
	ldr	r2, [sp, #4*16+2*4]
	add	lr, sp, #128
	add	r4, sp, #128+4*16
	vldmia	r4, {q4-q7}
	vadd.u32	q4, q4, q0
	vadd.u32	q5, q5, q1
	vadd.u32	q6, q6, q2
	vadd.u32	q7, q7, q3
	vstmia	r4, {q4-q7}
	vldmia	lr, {q0-q3}
	vadd.u32	q12, q12, q8
	vadd.u32	q13, q13, q9
	vadd.u32	q14, q14, q10
	vadd.u32	q15, q15, q11
	add	r12, sp, #256
	vldmia	r12, {q8-q11}
	
	add	r1, r1, #8*16
	cmp	r1, r2
	bne	scrypt_core_3way_loop1
	
	ldr	r2, [sp, #4*16+3*4]
	add	r5, sp, #256+4*16
	vstmia	r5, {q12-q15}
	
	sub	r1, r1, r2, lsl #7
	str	r1, [sp, #4*16+1*4]
scrypt_core_3way_loop2:
	str	r2, [sp, #4*16+2*4]
	
	ldr	r0, [sp, #4*16+0*4]
	ldr	r1, [sp, #4*16+1*4]
	ldr	r2, [sp, #4*16+3*4]
	ldr	r4, [r0, #16*4]
	sub	r2, r2, #1
	and	r4, r4, r2
	add	r1, r1, r4, lsl #7
	add	r2, r0, #16*4
	add	r3, r1, #16*4
	mov	r12, sp
	scrypt_core_macro1b_x4
	scrypt_core_macro1b_x4
	scrypt_core_macro1b_x4
	scrypt_core_macro1b_x4
	
	ldr	r1, [sp, #4*16+1*4]
	ldr	r2, [sp, #4*16+3*4]
	add	r1, r1, r2, lsl #7
	add	r3, r1, r2, lsl #7
	sub	r2, r2, #1
	vmov	r6, r7, d8
	and	r6, r6, r2
	add	r6, r1, r6, lsl #7
	vmov	r7, r8, d24
	add	lr, sp, #128
	vldmia	lr, {q0-q3}
	pld	[r6]
	pld	[r6, #8*4]
	pld	[r6, #16*4]
	pld	[r6, #24*4]
	vldmia	r6, {q8-q15}
	and	r7, r7, r2
	add	r7, r3, r7, lsl #7
	veor.u32	q8, q8, q0
	veor.u32	q9, q9, q1
	veor.u32	q10, q10, q2
	veor.u32	q11, q11, q3
	pld	[r7]
	pld	[r7, #8*4]
	pld	[r7, #16*4]
	pld	[r7, #24*4]
	veor.u32	q12, q12, q4
	veor.u32	q13, q13, q5
	veor.u32	q14, q14, q6
	veor.u32	q15, q15, q7
	vldmia	r7, {q0-q7}
	vstmia	lr, {q8-q15}
	add	r12, sp, #256
	vldmia	r12, {q8-q15}
	veor.u32	q8, q8, q0
	veor.u32	q9, q9, q1
	veor.u32	q10, q10, q2
	veor.u32	q11, q11, q3
	veor.u32	q12, q12, q4
	veor.u32	q13, q13, q5
	veor.u32	q14, q14, q6
	veor.u32	q15, q15, q7
	
	vldmia	lr, {q0-q7}
	veor.u32	q0, q0, q4
	veor.u32	q1, q1, q5
	veor.u32	q2, q2, q6
	veor.u32	q3, q3, q7
	vstmia	lr, {q0-q3}
	veor.u32	q8, q8, q12
	veor.u32	q9, q9, q13
	veor.u32	q10, q10, q14
	veor.u32	q11, q11, q15
	vstmia	r12, {q8-q15}
	
	salsa8_core_3way
	
	ldr	r0, [sp, #4*16+0*4]
	mov	r12, sp
	add	r2, r0, #16*4
	scrypt_core_macro2_x4
	scrypt_core_macro2_x4
	scrypt_core_macro2_x4
	scrypt_core_macro2_x4
	
	add	lr, sp, #128
	vldmia	lr, {q4-q7}
	vadd.u32	q4, q4, q0
	vadd.u32	q5, q5, q1
	vadd.u32	q6, q6, q2
	vadd.u32	q7, q7, q3
	add	r12, sp, #256
	vldmia	r12, {q12-q15}
	vstmia	lr, {q4-q7}
	vadd.u32	q12, q12, q8
	vadd.u32	q13, q13, q9
	vadd.u32	q14, q14, q10
	vadd.u32	q15, q15, q11
	
	add	r4, sp, #128+4*16
	vldmia	r4, {q0-q3}
	vstmia	r12, {q12-q15}
	veor.u32	q0, q0, q4
	veor.u32	q1, q1, q5
	veor.u32	q2, q2, q6
	veor.u32	q3, q3, q7
	add	r5, sp, #256+4*16
	vldmia	r5, {q8-q11}
	vstmia	r4, {q0-q3}
	veor.u32	q8, q8, q12
	veor.u32	q9, q9, q13
	veor.u32	q10, q10, q14
	veor.u32	q11, q11, q15
	vmov	q12, q8
	vmov	q13, q9
	vmov	q14, q10
	vmov	q15, q11
	
	salsa8_core_3way
	
	ldr	r0, [sp, #4*16+0*4]
	ldr	r3, [sp, #4*16+1*4]
	ldr	r2, [sp, #4*16+3*4]
	mov	r1, sp
	add	r0, r0, #16*4
	sub	r2, r2, #1
	scrypt_core_macro3_x4
	and	r4, r4, r2
	add	r3, r3, r4, lsl #7
	pld	[r3, #16*4]
	pld	[r3]
	pld	[r3, #24*4]
	pld	[r3, #8*4]
	scrypt_core_macro3_x6
	scrypt_core_macro3_x6
	
	add	lr, sp, #128
	add	r4, sp, #128+4*16
	vldmia	r4, {q4-q7}
	vadd.u32	q4, q4, q0
	vadd.u32	q5, q5, q1
	vadd.u32	q6, q6, q2
	vadd.u32	q7, q7, q3
	vstmia	r4, {q4-q7}
	vadd.u32	q12, q12, q8
	vadd.u32	q13, q13, q9
	vadd.u32	q14, q14, q10
	vadd.u32	q15, q15, q11
	add	r5, sp, #256+4*16
	vstmia	r5, {q12-q15}
	
	ldr	r2, [sp, #4*16+2*4]
	subs	r2, r2, #1
	bne	scrypt_core_3way_loop2
	
	ldr	r0, [sp, #4*16+0*4]
	vldmia	r0, {q8-q15}
	vmov.u64	q0, #0xffffffff
	vmov.u32	q1, q8
	vmov.u32	q2, q12
	vbif.u32	q8, q9, q0
	vbif.u32	q12, q13, q0
	vbif.u32	q9, q10, q0
	vbif.u32	q13, q14, q0
	vbif.u32	q10, q11, q0
	vbif.u32	q14, q15, q0
	vbif.u32	q11, q1, q0
	vbif.u32	q15, q2, q0
	add	r12, sp, #8*16
	vldmia	r12!, {q0-q7}
	vswp.u32	d17, d21
	vswp.u32	d25, d29
	vswp.u32	d18, d22
	vswp.u32	d26, d30
	vstmia	r0!, {q8-q15}
	vmov.u64	q8, #0xffffffff
	vmov.u32	q9, q0
	vmov.u32	q10, q4
	vbif.u32	q0, q1, q8
	vbif.u32	q4, q5, q8
	vbif.u32	q1, q2, q8
	vbif.u32	q5, q6, q8
	vbif.u32	q2, q3, q8
	vbif.u32	q6, q7, q8
	vbif.u32	q3, q9, q8
	vbif.u32	q7, q10, q8
	vldmia	r12, {q8-q15}
	vswp.u32	d1, d5
	vswp.u32	d9, d13
	vswp.u32	d2, d6
	vswp.u32	d10, d14
	vstmia	r0!, {q0-q7}
	vmov.u64	q0, #0xffffffff
	vmov.u32	q1, q8
	vmov.u32	q2, q12
	vbif.u32	q8, q9, q0
	vbif.u32	q12, q13, q0
	vbif.u32	q9, q10, q0
	vbif.u32	q13, q14, q0
	vbif.u32	q10, q11, q0
	vbif.u32	q14, q15, q0
	vbif.u32	q11, q1, q0
	vbif.u32	q15, q2, q0
	vswp.u32	d17, d21
	vswp.u32	d25, d29
	vswp.u32	d18, d22
	vswp.u32	d26, d30
	vstmia	r0, {q8-q15}
	
	ldr	sp, [sp, #4*16+4*4]
	vpop	{q4-q7}
	ldmfd	sp!, {r4-r11, pc}

#endif /* __ARM_NEON__ */

#endif
07070100000024000081A4000003E800000064000000015EF4BCA100004C75000000000000000000000000000000000000001C00000000cpuminer-2.5.1/scrypt-ppc.S/*
 * Copyright 2014-2015 pooler@litecoinpool.org
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.  See COPYING for more details.
 */

#include "cpuminer-config.h"

#if defined(USE_ASM) && (defined(__powerpc__) || defined(__ppc__) || defined(__PPC__))

#ifndef __APPLE__

#define r0 0
#define r1 1
#define r2 2
#define r3 3
#define r4 4
#define r5 5
#define r6 6
#define r7 7
#define r8 8
#define r9 9
#define r10 10
#define r11 11
#define r12 12
#define r13 13
#define r14 14
#define r15 15
#define r16 16
#define r17 17
#define r18 18
#define r19 19
#define r20 20
#define r21 21
#define r22 22
#define r23 23
#define r24 24
#define r25 25
#define r26 26
#define r27 27
#define r28 28
#define r29 29
#define r30 30
#define r31 31

#ifdef __ALTIVEC__
#define v0 0
#define v1 1
#define v2 2
#define v3 3
#define v4 4
#define v5 5
#define v6 6
#define v7 7
#define v8 8
#define v9 9
#define v10 10
#define v11 11
#define v12 12
#define v13 13
#define v14 14
#define v15 15
#define v16 16
#define v17 17
#define v18 18
#define v19 19
#define v20 20
#define v21 21
#define v22 22
#define v23 23
#define v24 24
#define v25 25
#define v26 26
#define v27 27
#define v28 28
#define v29 29
#define v30 30
#define v31 31
#endif

#endif

#if !(defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) || \
	defined(__64BIT__) || defined(_LP64) || defined(__LP64__))
#define ld lwz
#define std stw
#define stdu stwu
#define stdux stwux
#endif


#ifdef __ALTIVEC__

#ifdef __APPLE__
	.machine ppc7400
#endif

.macro salsa8_core_doubleround
	vadduwm	v4, v0, v1
	vrlw	v4, v4, v16
	vxor	v3, v3, v4
	
	vadduwm	v4, v3, v0
	vrlw	v4, v4, v17
	vxor	v2, v2, v4
	
	vadduwm	v4, v2, v3
	vrlw	v4, v4, v18
	vsldoi	v3, v3, v3, 12
	vxor	v1, v1, v4
	
	vadduwm	v4, v1, v2
	vrlw	v4, v4, v19
	vsldoi	v1, v1, v1, 4
	vxor	v0, v0, v4
	
	vadduwm	v4, v0, v3
	vrlw	v4, v4, v16
	vsldoi	v2, v2, v2, 8
	vxor	v1, v1, v4
	
	vadduwm	v4, v1, v0
	vrlw	v4, v4, v17
	vxor	v2, v2, v4
	
	vadduwm	v4, v2, v1
	vrlw	v4, v4, v18
	vsldoi	v1, v1, v1, 12
	vxor	v3, v3, v4
	
	vadduwm	v4, v3, v2
	vrlw	v4, v4, v19
	vsldoi	v3, v3, v3, 4
	vxor	v0, v0, v4
	vsldoi	v2, v2, v2, 8
.endm

.macro salsa8_core
	salsa8_core_doubleround
	salsa8_core_doubleround
	salsa8_core_doubleround
	salsa8_core_doubleround
.endm

#ifdef _AIX
	.csect .text[PR]
#else
	.text
#endif
	.align 2
	.globl scrypt_core
	.globl _scrypt_core
	.globl .scrypt_core
#ifdef __ELF__
	.type scrypt_core, %function
#endif
scrypt_core:
_scrypt_core:
.scrypt_core:
	stdu	r1, -4*4(r1)
	mfspr	r0, 256
	std	r0, 2*4(r1)
	oris	r0, r0, 0xffff
	ori	r0, r0, 0xf000
	mtspr	256, r0
	
	li	r6, 1*16
	li	r7, 2*16
	li	r8, 3*16
	li	r9, 4*16
	li	r10, 5*16
	li	r11, 6*16
	li	r12, 7*16
	
	lvx	v8, 0, r3
	lvx	v9, r3, r6
	lvx	v10, r3, r7
	lvx	v11, r3, r8
	lvx	v12, r3, r9
	lvx	v13, r3, r10
	lvx	v14, r3, r11
	lvx	v15, r3, r12
	
	vxor	v0, v0, v0
	vnor	v1, v0, v0
	vsldoi	v2, v0, v1, 4
	vsldoi	v3, v2, v0, 8
	vor	v3, v3, v2
	vsldoi	v1, v0, v1, 8
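	/* build two constant select masks without touching memory: per
	 * 32-bit lane, v3 = { 0, -1, 0, -1 } and v1 = { 0, 0, -1, -1 };
	 * the vsel chains below use them to permute the sixteen state
	 * words into the diagonal order the salsa8 kernel expects */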
	
	vor	v4, v8, v8
	vsel	v8, v8, v9, v3
	vsel	v9, v9, v10, v3
	vsel	v10, v10, v11, v3
	vsel	v11, v11, v4, v3
	vor	v4, v8, v8
	vor	v5, v9, v9
	vsel	v8, v8, v10, v1
	vsel	v9, v11, v9, v1
	vsel	v10, v10, v4, v1
	vsel	v11, v5, v11, v1
	
	vor	v4, v12, v12
	vsel	v12, v12, v13, v3
	vsel	v13, v13, v14, v3
	vsel	v14, v14, v15, v3
	vsel	v15, v15, v4, v3
	vor	v4, v12, v12
	vor	v5, v13, v13
	vsel	v12, v12, v14, v1
	vsel	v13, v15, v13, v1
	vsel	v14, v14, v4, v1
	vsel	v15, v5, v15, v1
	
	vspltisw	v16, 7
	vspltisw	v17, 9
	vspltisw	v18, 13
	vadduwm	v19, v17, v17
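	/* rotation counts for vrlw; vspltisw only encodes immediates in
	 * -16..15, so the count 18 is formed as 9 + 9 */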
	
	mtctr	r5
scrypt_core_loop1:
	vxor	v8, v8, v12
	stvx	v8, 0, r4
	vxor	v9, v9, v13
	stvx	v9, r4, r6
	vxor	v10, v10, v14
	stvx	v10, r4, r7
	vxor	v11, v11, v15
	stvx	v11, r4, r8
	vor	v0, v8, v8
	stvx	v12, r4, r9
	vor	v1, v9, v9
	stvx	v13, r4, r10
	vor	v2, v10, v10
	stvx	v14, r4, r11
	vor	v3, v11, v11
	stvx	v15, r4, r12
	
	salsa8_core
	
	vadduwm	v8, v8, v0
	vadduwm	v9, v9, v1
	vadduwm	v10, v10, v2
	vadduwm	v11, v11, v3
	
	vxor	v12, v12, v8
	vxor	v13, v13, v9
	vxor	v14, v14, v10
	vxor	v15, v15, v11
	vor	v0, v12, v12
	vor	v1, v13, v13
	vor	v2, v14, v14
	vor	v3, v15, v15
	
	salsa8_core
	
	vadduwm	v12, v12, v0
	vadduwm	v13, v13, v1
	vadduwm	v14, v14, v2
	vadduwm	v15, v15, v3
	
	addi	r4, r4, 32*4
	bdnz	scrypt_core_loop1
	
	stvx	v12, 0, r3
	slwi	r6, r5, 7
	subf	r4, r6, r4
	mtctr	r5
	addi	r5, r5, -1
	addi	r7, r4, 1*16
	addi	r8, r4, 2*16
	addi	r9, r4, 3*16
scrypt_core_loop2:
	lwz	r6, 0(r3)
	and	r6, r6, r5
	slwi	r6, r6, 7
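	/* integerify: r6 = 128 * (X[16] mod N); N is a power of two, so
	 * the reduction is an AND with N - 1, kept in r5 */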
	lvx	v0, r4, r6
	vxor	v8, v8, v12
	lvx	v1, r7, r6
	vxor	v9, v9, v13
	lvx	v2, r8, r6
	vxor	v10, v10, v14
	lvx	v3, r9, r6
	vxor	v11, v11, v15
	vxor	v0, v0, v8
	vxor	v1, v1, v9
	vxor	v2, v2, v10
	vxor	v3, v3, v11
	addi	r6, r6, 64
	vor	v8, v0, v0
	vor	v9, v1, v1
	lvx	v5, r4, r6
	vor	v10, v2, v2
	lvx	v6, r7, r6
	vor	v11, v3, v3
	lvx	v7, r8, r6
	
	salsa8_core
	
	vadduwm	v8, v8, v0
	lvx	v0, r9, r6
	vadduwm	v9, v9, v1
	vadduwm	v10, v10, v2
	vadduwm	v11, v11, v3
	
	vxor	v12, v12, v5
	vxor	v13, v13, v6
	vxor	v14, v14, v7
	vxor	v15, v15, v0
	vxor	v12, v12, v8
	vxor	v13, v13, v9
	vxor	v14, v14, v10
	vxor	v15, v15, v11
	vor	v0, v12, v12
	vor	v1, v13, v13
	vor	v2, v14, v14
	vor	v3, v15, v15
	
	salsa8_core
	
	vadduwm	v12, v12, v0
	stvx	v12, 0, r3
	vadduwm	v13, v13, v1
	vadduwm	v14, v14, v2
	vadduwm	v15, v15, v3
	
	bdnz	scrypt_core_loop2
	
	vxor	v0, v0, v0
	vnor	v1, v0, v0
	vsldoi	v2, v0, v1, 4
	vsldoi	v3, v2, v0, 8
	vor	v3, v3, v2
	vsldoi	v1, v0, v1, 8
	
	vor	v4, v8, v8
	vsel	v8, v8, v9, v3
	vsel	v9, v9, v10, v3
	vsel	v10, v10, v11, v3
	vsel	v11, v11, v4, v3
	vor	v4, v8, v8
	vor	v5, v9, v9
	vsel	v8, v8, v10, v1
	vsel	v9, v11, v9, v1
	vsel	v10, v10, v4, v1
	vsel	v11, v5, v11, v1
	
	vor	v4, v12, v12
	vsel	v12, v12, v13, v3
	vsel	v13, v13, v14, v3
	vsel	v14, v14, v15, v3
	vsel	v15, v15, v4, v3
	vor	v4, v12, v12
	vor	v5, v13, v13
	vsel	v12, v12, v14, v1
	vsel	v13, v15, v13, v1
	vsel	v14, v14, v4, v1
	vsel	v15, v5, v15, v1
	
	li	r6, 1*16
	li	r7, 2*16
	li	r8, 3*16
	li	r9, 4*16
	
	stvx	v8, 0, r3
	stvx	v9, r3, r6
	stvx	v10, r3, r7
	stvx	v11, r3, r8
	stvx	v12, r3, r9
	stvx	v13, r3, r10
	stvx	v14, r3, r11
	stvx	v15, r3, r12
	
	ld	r0, 2*4(r1)
	mtspr	256, r0
	addi	r1, r1, 4*4
	blr

#else /* __ALTIVEC__ */

.macro salsa8_core_doubleround
	add	r0, r16, r28
	add	r5, r21, r17
	add	r6, r26, r22
	add	r7, r31, r27
	rotlwi	r0, r0, 7
	rotlwi	r5, r5, 7
	rotlwi	r6, r6, 7
	rotlwi	r7, r7, 7
	xor	r20, r20, r0
	xor	r25, r25, r5
	xor	r30, r30, r6
	xor	r19, r19, r7
	
	add	r0, r20, r16
	add	r5, r25, r21
	add	r6, r30, r26
	add	r7, r19, r31
	rotlwi	r0, r0, 9
	rotlwi	r5, r5, 9
	rotlwi	r6, r6, 9
	rotlwi	r7, r7, 9
	xor	r24, r24, r0
	xor	r29, r29, r5
	xor	r18, r18, r6
	xor	r23, r23, r7
	
	add	r0, r24, r20
	add	r5, r29, r25
	add	r6, r18, r30
	add	r7, r23, r19
	rotlwi	r0, r0, 13
	rotlwi	r5, r5, 13
	rotlwi	r6, r6, 13
	rotlwi	r7, r7, 13
	xor	r28, r28, r0
	xor	r17, r17, r5
	xor	r22, r22, r6
	xor	r27, r27, r7
	
	add	r0, r28, r24
	add	r5, r17, r29
	add	r6, r22, r18
	add	r7, r27, r23
	rotlwi	r0, r0, 18
	rotlwi	r5, r5, 18
	rotlwi	r6, r6, 18
	rotlwi	r7, r7, 18
	xor	r16, r16, r0
	xor	r21, r21, r5
	xor	r26, r26, r6
	xor	r31, r31, r7
	
	add	r0, r16, r19
	add	r5, r21, r20
	add	r6, r26, r25
	add	r7, r31, r30
	rotlwi	r0, r0, 7
	rotlwi	r5, r5, 7
	rotlwi	r6, r6, 7
	rotlwi	r7, r7, 7
	xor	r17, r17, r0
	xor	r22, r22, r5
	xor	r27, r27, r6
	xor	r28, r28, r7
	
	add	r0, r17, r16
	add	r5, r22, r21
	add	r6, r27, r26
	add	r7, r28, r31
	rotlwi	r0, r0, 9
	rotlwi	r5, r5, 9
	rotlwi	r6, r6, 9
	rotlwi	r7, r7, 9
	xor	r18, r18, r0
	xor	r23, r23, r5
	xor	r24, r24, r6
	xor	r29, r29, r7
	
	add	r0, r18, r17
	add	r5, r23, r22
	add	r6, r24, r27
	add	r7, r29, r28
	rotlwi	r0, r0, 13
	rotlwi	r5, r5, 13
	rotlwi	r6, r6, 13
	rotlwi	r7, r7, 13
	xor	r19, r19, r0
	xor	r20, r20, r5
	xor	r25, r25, r6
	xor	r30, r30, r7
	
	add	r0, r19, r18
	add	r5, r20, r23
	add	r6, r25, r24
	add	r7, r30, r29
	rotlwi	r0, r0, 18
	rotlwi	r5, r5, 18
	rotlwi	r6, r6, 18
	rotlwi	r7, r7, 18
	xor	r16, r16, r0
	xor	r21, r21, r5
	xor	r26, r26, r6
	xor	r31, r31, r7
.endm
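
/* In the scalar variant the sixteen state words live in r16-r31 (word i
 * in r16+i); each block of four add/rotlwi/xor lines performs the same
 * step of four independent quarter-rounds, interleaved to keep the
 * integer pipelines busy. */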

.macro salsa8_core
	salsa8_core_doubleround
	salsa8_core_doubleround
	salsa8_core_doubleround
	salsa8_core_doubleround
.endm

#ifdef _AIX
	.csect .text[PR]
#else
	.text
#endif
	.align 2
	.globl scrypt_core
	.globl _scrypt_core
	.globl .scrypt_core
#ifdef __ELF__
	.type scrypt_core, %function
#endif
scrypt_core:
_scrypt_core:
.scrypt_core:
	stdu	r1, -68*4(r1)
	stw	r5, 2*4(r1)
	std	r13, 4*4(r1)
	std	r14, 6*4(r1)
	std	r15, 8*4(r1)
	std	r16, 10*4(r1)
	std	r17, 12*4(r1)
	std	r18, 14*4(r1)
	std	r19, 16*4(r1)
	std	r20, 18*4(r1)
	std	r21, 20*4(r1)
	std	r3, 22*4(r1)
	std	r22, 48*4(r1)
	std	r23, 50*4(r1)
	std	r24, 52*4(r1)
	std	r25, 54*4(r1)
	std	r26, 56*4(r1)
	std	r27, 58*4(r1)
	std	r28, 60*4(r1)
	std	r29, 62*4(r1)
	std	r30, 64*4(r1)
	std	r31, 66*4(r1)
	
	lwz	r16, 0*4(r3)
	lwz	r17, 1*4(r3)
	lwz	r18, 2*4(r3)
	lwz	r19, 3*4(r3)
	lwz	r20, 4*4(r3)
	lwz	r21, 5*4(r3)
	lwz	r22, 6*4(r3)
	lwz	r23, 7*4(r3)
	stw	r16, 24*4(r1)
	stw	r17, 25*4(r1)
	stw	r18, 26*4(r1)
	stw	r19, 27*4(r1)
	stw	r20, 28*4(r1)
	stw	r21, 29*4(r1)
	stw	r22, 30*4(r1)
	stw	r23, 31*4(r1)
	lwz	r24, 8*4(r3)
	lwz	r25, 9*4(r3)
	lwz	r26, 10*4(r3)
	lwz	r27, 11*4(r3)
	lwz	r28, 12*4(r3)
	lwz	r29, 13*4(r3)
	lwz	r30, 14*4(r3)
	lwz	r31, 15*4(r3)
	stw	r24, 32*4(r1)
	stw	r25, 33*4(r1)
	stw	r26, 34*4(r1)
	stw	r27, 35*4(r1)
	stw	r28, 36*4(r1)
	stw	r29, 37*4(r1)
	stw	r30, 38*4(r1)
	stw	r31, 39*4(r1)
	lwz	r16, 16*4(r3)
	lwz	r17, 17*4(r3)
	lwz	r18, 18*4(r3)
	lwz	r19, 19*4(r3)
	lwz	r20, 20*4(r3)
	lwz	r21, 21*4(r3)
	lwz	r22, 22*4(r3)
	lwz	r23, 23*4(r3)
	stw	r16, 40*4(r1)
	stw	r17, 41*4(r1)
	stw	r18, 42*4(r1)
	stw	r19, 43*4(r1)
	stw	r20, 44*4(r1)
	stw	r21, 45*4(r1)
	stw	r22, 46*4(r1)
	stw	r23, 47*4(r1)
	lwz	r8, 24*4(r3)
	lwz	r9, 25*4(r3)
	lwz	r10, 26*4(r3)
	lwz	r11, 27*4(r3)
	lwz	r12, 28*4(r3)
	lwz	r13, 29*4(r3)
	lwz	r14, 30*4(r3)
	lwz	r15, 31*4(r3)
	
	mtctr	r5
scrypt_core_loop1:
	lwz	r16, 24*4(r1)
	lwz	r17, 25*4(r1)
	lwz	r18, 26*4(r1)
	lwz	r19, 27*4(r1)
	lwz	r20, 28*4(r1)
	lwz	r21, 29*4(r1)
	lwz	r22, 30*4(r1)
	lwz	r23, 31*4(r1)
	lwz	r24, 32*4(r1)
	lwz	r25, 33*4(r1)
	lwz	r26, 34*4(r1)
	lwz	r27, 35*4(r1)
	lwz	r28, 36*4(r1)
	lwz	r29, 37*4(r1)
	lwz	r30, 38*4(r1)
	lwz	r31, 39*4(r1)
	
	lwz	r0, 40*4(r1)
	lwz	r5, 41*4(r1)
	lwz	r6, 42*4(r1)
	lwz	r7, 43*4(r1)
	xor	r16, r16, r0
	xor	r17, r17, r5
	xor	r18, r18, r6
	xor	r19, r19, r7
	stw	r16, 0*4(r4)
	stw	r17, 1*4(r4)
	stw	r18, 2*4(r4)
	stw	r19, 3*4(r4)
	stw	r0, 16*4(r4)
	stw	r5, 17*4(r4)
	stw	r6, 18*4(r4)
	stw	r7, 19*4(r4)
	
	lwz	r0, 44*4(r1)
	lwz	r5, 45*4(r1)
	lwz	r6, 46*4(r1)
	lwz	r7, 47*4(r1)
	xor	r20, r20, r0
	xor	r21, r21, r5
	xor	r22, r22, r6
	xor	r23, r23, r7
	stw	r0, 20*4(r4)
	stw	r5, 21*4(r4)
	stw	r6, 22*4(r4)
	stw	r7, 23*4(r4)
	stw	r20, 4*4(r4)
	stw	r21, 5*4(r4)
	stw	r22, 6*4(r4)
	stw	r23, 7*4(r4)
	
	xor	r24, r24, r8
	xor	r25, r25, r9
	xor	r26, r26, r10
	xor	r27, r27, r11
	xor	r28, r28, r12
	xor	r29, r29, r13
	xor	r30, r30, r14
	xor	r31, r31, r15
	stw	r24, 8*4(r4)
	stw	r25, 9*4(r4)
	stw	r26, 10*4(r4)
	stw	r27, 11*4(r4)
	stw	r28, 12*4(r4)
	stw	r29, 13*4(r4)
	stw	r30, 14*4(r4)
	stw	r31, 15*4(r4)
	stw	r8, 24*4(r4)
	stw	r9, 25*4(r4)
	stw	r10, 26*4(r4)
	stw	r11, 27*4(r4)
	stw	r12, 28*4(r4)
	stw	r13, 29*4(r4)
	stw	r14, 30*4(r4)
	stw	r15, 31*4(r4)
	
	salsa8_core
	
	lwz	r0, 0*4(r4)
	lwz	r5, 1*4(r4)
	lwz	r6, 2*4(r4)
	lwz	r7, 3*4(r4)
	add	r16, r16, r0
	add	r17, r17, r5
	add	r18, r18, r6
	add	r19, r19, r7
	lwz	r0, 4*4(r4)
	lwz	r5, 5*4(r4)
	lwz	r6, 6*4(r4)
	lwz	r7, 7*4(r4)
	add	r20, r20, r0
	add	r21, r21, r5
	add	r22, r22, r6
	add	r23, r23, r7
	lwz	r0, 8*4(r4)
	lwz	r5, 9*4(r4)
	lwz	r6, 10*4(r4)
	lwz	r7, 11*4(r4)
	add	r24, r24, r0
	add	r25, r25, r5
	add	r26, r26, r6
	add	r27, r27, r7
	lwz	r0, 12*4(r4)
	lwz	r5, 13*4(r4)
	lwz	r6, 14*4(r4)
	lwz	r7, 15*4(r4)
	add	r28, r28, r0
	add	r29, r29, r5
	add	r30, r30, r6
	add	r31, r31, r7
	
	stw	r16, 24*4(r1)
	stw	r17, 25*4(r1)
	stw	r18, 26*4(r1)
	stw	r19, 27*4(r1)
	stw	r20, 28*4(r1)
	stw	r21, 29*4(r1)
	stw	r22, 30*4(r1)
	stw	r23, 31*4(r1)
	stw	r24, 32*4(r1)
	stw	r25, 33*4(r1)
	stw	r26, 34*4(r1)
	stw	r27, 35*4(r1)
	stw	r28, 36*4(r1)
	stw	r29, 37*4(r1)
	stw	r30, 38*4(r1)
	stw	r31, 39*4(r1)
	
	lwz	r0, 40*4(r1)
	lwz	r5, 41*4(r1)
	lwz	r6, 42*4(r1)
	lwz	r7, 43*4(r1)
	xor	r16, r16, r0
	xor	r17, r17, r5
	xor	r18, r18, r6
	xor	r19, r19, r7
	lwz	r0, 44*4(r1)
	lwz	r5, 45*4(r1)
	lwz	r6, 46*4(r1)
	lwz	r7, 47*4(r1)
	xor	r20, r20, r0
	xor	r21, r21, r5
	xor	r22, r22, r6
	xor	r23, r23, r7
	xor	r24, r24, r8
	xor	r25, r25, r9
	xor	r26, r26, r10
	xor	r27, r27, r11
	xor	r28, r28, r12
	xor	r29, r29, r13
	xor	r30, r30, r14
	xor	r31, r31, r15
	stw	r16, 40*4(r1)
	stw	r17, 41*4(r1)
	stw	r18, 42*4(r1)
	stw	r19, 43*4(r1)
	mr	r8, r24
	mr	r9, r25
	mr	r10, r26
	mr	r11, r27
	stw	r20, 44*4(r1)
	stw	r21, 45*4(r1)
	stw	r22, 46*4(r1)
	stw	r23, 47*4(r1)
	mr	r12, r28
	mr	r13, r29
	mr	r14, r30
	mr	r15, r31
	
	salsa8_core
	
	lwz	r0, 40*4(r1)
	lwz	r5, 41*4(r1)
	lwz	r6, 42*4(r1)
	lwz	r7, 43*4(r1)
	add	r16, r16, r0
	add	r17, r17, r5
	add	r18, r18, r6
	add	r19, r19, r7
	lwz	r0, 44*4(r1)
	lwz	r5, 45*4(r1)
	lwz	r6, 46*4(r1)
	lwz	r7, 47*4(r1)
	add	r20, r20, r0
	add	r21, r21, r5
	add	r22, r22, r6
	add	r23, r23, r7
	add	r8, r8, r24
	add	r9, r9, r25
	add	r10, r10, r26
	add	r11, r11, r27
	stw	r16, 40*4(r1)
	stw	r17, 41*4(r1)
	stw	r18, 42*4(r1)
	stw	r19, 43*4(r1)
	add	r12, r12, r28
	add	r13, r13, r29
	add	r14, r14, r30
	add	r15, r15, r31
	stw	r20, 44*4(r1)
	stw	r21, 45*4(r1)
	stw	r22, 46*4(r1)
	stw	r23, 47*4(r1)
	
	addi	r4, r4, 32*4
	bdnz	scrypt_core_loop1
	
	lwz	r5, 2*4(r1)
	slwi	r3, r5, 7
	subf	r4, r3, r4
	mtctr	r5
	addi	r5, r5, -1
	stw	r5, 2*4(r1)
scrypt_core_loop2:
	and	r3, r16, r5
	slwi	r3, r3, 7
	add	r3, r3, r4
	mr	r0, r16
	mr	r5, r17
	mr	r6, r18
	mr	r7, r19
	lwz	r16, 24*4(r1)
	lwz	r17, 25*4(r1)
	lwz	r18, 26*4(r1)
	lwz	r19, 27*4(r1)
	lwz	r20, 28*4(r1)
	lwz	r21, 29*4(r1)
	lwz	r22, 30*4(r1)
	lwz	r23, 31*4(r1)
	lwz	r24, 32*4(r1)
	lwz	r25, 33*4(r1)
	lwz	r26, 34*4(r1)
	lwz	r27, 35*4(r1)
	lwz	r28, 36*4(r1)
	lwz	r29, 37*4(r1)
	lwz	r30, 38*4(r1)
	lwz	r31, 39*4(r1)
	xor	r16, r16, r0
	xor	r17, r17, r5
	xor	r18, r18, r6
	xor	r19, r19, r7
	lwz	r0, 44*4(r1)
	lwz	r5, 45*4(r1)
	lwz	r6, 46*4(r1)
	lwz	r7, 47*4(r1)
	xor	r20, r20, r0
	xor	r21, r21, r5
	xor	r22, r22, r6
	xor	r23, r23, r7
	xor	r24, r24, r8
	xor	r25, r25, r9
	xor	r26, r26, r10
	xor	r27, r27, r11
	xor	r28, r28, r12
	xor	r29, r29, r13
	xor	r30, r30, r14
	xor	r31, r31, r15
	
	lwz	r0, 0*4(r3)
	lwz	r5, 1*4(r3)
	lwz	r6, 2*4(r3)
	lwz	r7, 3*4(r3)
	xor	r16, r16, r0
	xor	r17, r17, r5
	xor	r18, r18, r6
	xor	r19, r19, r7
	lwz	r0, 4*4(r3)
	lwz	r5, 5*4(r3)
	lwz	r6, 6*4(r3)
	lwz	r7, 7*4(r3)
	xor	r20, r20, r0
	xor	r21, r21, r5
	xor	r22, r22, r6
	xor	r23, r23, r7
	lwz	r0, 8*4(r3)
	lwz	r5, 9*4(r3)
	lwz	r6, 10*4(r3)
	lwz	r7, 11*4(r3)
	xor	r24, r24, r0
	xor	r25, r25, r5
	xor	r26, r26, r6
	xor	r27, r27, r7
	lwz	r0, 12*4(r3)
	lwz	r5, 13*4(r3)
	lwz	r6, 14*4(r3)
	lwz	r7, 15*4(r3)
	xor	r28, r28, r0
	xor	r29, r29, r5
	xor	r30, r30, r6
	xor	r31, r31, r7
	
	stw	r16, 24*4(r1)
	stw	r17, 25*4(r1)
	stw	r18, 26*4(r1)
	stw	r19, 27*4(r1)
	stw	r20, 28*4(r1)
	stw	r21, 29*4(r1)
	stw	r22, 30*4(r1)
	stw	r23, 31*4(r1)
	stw	r24, 32*4(r1)
	stw	r25, 33*4(r1)
	stw	r26, 34*4(r1)
	stw	r27, 35*4(r1)
	stw	r28, 36*4(r1)
	stw	r29, 37*4(r1)
	stw	r30, 38*4(r1)
	stw	r31, 39*4(r1)
	
	salsa8_core
	
	lwz	r0, 24*4(r1)
	lwz	r5, 25*4(r1)
	lwz	r6, 26*4(r1)
	lwz	r7, 27*4(r1)
	add	r16, r16, r0
	add	r17, r17, r5
	add	r18, r18, r6
	add	r19, r19, r7
	lwz	r0, 28*4(r1)
	lwz	r5, 29*4(r1)
	lwz	r6, 30*4(r1)
	lwz	r7, 31*4(r1)
	add	r20, r20, r0
	add	r21, r21, r5
	add	r22, r22, r6
	add	r23, r23, r7
	lwz	r0, 32*4(r1)
	lwz	r5, 33*4(r1)
	lwz	r6, 34*4(r1)
	lwz	r7, 35*4(r1)
	add	r24, r24, r0
	add	r25, r25, r5
	add	r26, r26, r6
	add	r27, r27, r7
	lwz	r0, 36*4(r1)
	lwz	r5, 37*4(r1)
	lwz	r6, 38*4(r1)
	lwz	r7, 39*4(r1)
	add	r28, r28, r0
	add	r29, r29, r5
	add	r30, r30, r6
	add	r31, r31, r7
	
	stw	r16, 24*4(r1)
	stw	r17, 25*4(r1)
	stw	r18, 26*4(r1)
	stw	r19, 27*4(r1)
	stw	r20, 28*4(r1)
	stw	r21, 29*4(r1)
	stw	r22, 30*4(r1)
	stw	r23, 31*4(r1)
	stw	r24, 32*4(r1)
	stw	r25, 33*4(r1)
	stw	r26, 34*4(r1)
	stw	r27, 35*4(r1)
	stw	r28, 36*4(r1)
	stw	r29, 37*4(r1)
	stw	r30, 38*4(r1)
	stw	r31, 39*4(r1)
	
	lwz	r0, 16*4(r3)
	lwz	r5, 17*4(r3)
	lwz	r6, 18*4(r3)
	lwz	r7, 19*4(r3)
	xor	r16, r16, r0
	xor	r17, r17, r5
	xor	r18, r18, r6
	xor	r19, r19, r7
	lwz	r0, 20*4(r3)
	lwz	r5, 21*4(r3)
	lwz	r6, 22*4(r3)
	lwz	r7, 23*4(r3)
	xor	r20, r20, r0
	xor	r21, r21, r5
	xor	r22, r22, r6
	xor	r23, r23, r7
	lwz	r0, 24*4(r3)
	lwz	r5, 25*4(r3)
	lwz	r6, 26*4(r3)
	lwz	r7, 27*4(r3)
	xor	r24, r24, r0
	xor	r25, r25, r5
	xor	r26, r26, r6
	xor	r27, r27, r7
	lwz	r0, 28*4(r3)
	lwz	r5, 29*4(r3)
	lwz	r6, 30*4(r3)
	lwz	r7, 31*4(r3)
	xor	r28, r28, r0
	xor	r29, r29, r5
	xor	r30, r30, r6
	xor	r31, r31, r7
	
	lwz	r0, 40*4(r1)
	lwz	r5, 41*4(r1)
	lwz	r6, 42*4(r1)
	lwz	r7, 43*4(r1)
	xor	r16, r16, r0
	xor	r17, r17, r5
	xor	r18, r18, r6
	xor	r19, r19, r7
	lwz	r0, 44*4(r1)
	lwz	r5, 45*4(r1)
	lwz	r6, 46*4(r1)
	lwz	r7, 47*4(r1)
	xor	r20, r20, r0
	xor	r21, r21, r5
	xor	r22, r22, r6
	xor	r23, r23, r7
	xor	r24, r24, r8
	xor	r25, r25, r9
	xor	r26, r26, r10
	xor	r27, r27, r11
	xor	r28, r28, r12
	xor	r29, r29, r13
	xor	r30, r30, r14
	xor	r31, r31, r15
	stw	r16, 40*4(r1)
	stw	r17, 41*4(r1)
	stw	r18, 42*4(r1)
	stw	r19, 43*4(r1)
	mr	r8, r24
	mr	r9, r25
	mr	r10, r26
	mr	r11, r27
	stw	r20, 44*4(r1)
	stw	r21, 45*4(r1)
	stw	r22, 46*4(r1)
	stw	r23, 47*4(r1)
	mr	r12, r28
	mr	r13, r29
	mr	r14, r30
	mr	r15, r31
	
	salsa8_core
	
	lwz	r0, 40*4(r1)
	lwz	r5, 41*4(r1)
	lwz	r6, 42*4(r1)
	lwz	r7, 43*4(r1)
	add	r16, r16, r0
	add	r17, r17, r5
	add	r18, r18, r6
	add	r19, r19, r7
	lwz	r0, 44*4(r1)
	lwz	r5, 45*4(r1)
	lwz	r6, 46*4(r1)
	lwz	r7, 47*4(r1)
	add	r20, r20, r0
	add	r21, r21, r5
	add	r22, r22, r6
	add	r23, r23, r7
	lwz	r5, 2*4(r1)
	add	r8, r8, r24
	add	r9, r9, r25
	add	r10, r10, r26
	add	r11, r11, r27
	add	r12, r12, r28
	add	r13, r13, r29
	add	r14, r14, r30
	add	r15, r15, r31
	stw	r16, 40*4(r1)
	stw	r17, 41*4(r1)
	stw	r18, 42*4(r1)
	stw	r19, 43*4(r1)
	stw	r20, 44*4(r1)
	stw	r21, 45*4(r1)
	stw	r22, 46*4(r1)
	stw	r23, 47*4(r1)
	bdnz	scrypt_core_loop2
	
	ld	r3, 22*4(r1)
	
	lwz	r16, 24*4(r1)
	lwz	r17, 25*4(r1)
	lwz	r18, 26*4(r1)
	lwz	r19, 27*4(r1)
	lwz	r20, 28*4(r1)
	lwz	r21, 29*4(r1)
	lwz	r22, 30*4(r1)
	lwz	r23, 31*4(r1)
	stw	r16, 0*4(r3)
	stw	r17, 1*4(r3)
	stw	r18, 2*4(r3)
	stw	r19, 3*4(r3)
	stw	r20, 4*4(r3)
	stw	r21, 5*4(r3)
	stw	r22, 6*4(r3)
	stw	r23, 7*4(r3)
	lwz	r24, 32*4(r1)
	lwz	r25, 33*4(r1)
	lwz	r26, 34*4(r1)
	lwz	r27, 35*4(r1)
	lwz	r28, 36*4(r1)
	lwz	r29, 37*4(r1)
	lwz	r30, 38*4(r1)
	lwz	r31, 39*4(r1)
	stw	r24, 8*4(r3)
	stw	r25, 9*4(r3)
	stw	r26, 10*4(r3)
	stw	r27, 11*4(r3)
	stw	r28, 12*4(r3)
	stw	r29, 13*4(r3)
	stw	r30, 14*4(r3)
	stw	r31, 15*4(r3)
	lwz	r16, 40*4(r1)
	lwz	r17, 41*4(r1)
	lwz	r18, 42*4(r1)
	lwz	r19, 43*4(r1)
	lwz	r20, 44*4(r1)
	lwz	r21, 45*4(r1)
	lwz	r22, 46*4(r1)
	lwz	r23, 47*4(r1)
	stw	r16, 16*4(r3)
	stw	r17, 17*4(r3)
	stw	r18, 18*4(r3)
	stw	r19, 19*4(r3)
	stw	r20, 20*4(r3)
	stw	r21, 21*4(r3)
	stw	r22, 22*4(r3)
	stw	r23, 23*4(r3)
	stw	r8, 24*4(r3)
	stw	r9, 25*4(r3)
	stw	r10, 26*4(r3)
	stw	r11, 27*4(r3)
	stw	r12, 28*4(r3)
	stw	r13, 29*4(r3)
	stw	r14, 30*4(r3)
	stw	r15, 31*4(r3)
	
	ld	r13, 4*4(r1)
	ld	r14, 6*4(r1)
	ld	r15, 8*4(r1)
	ld	r16, 10*4(r1)
	ld	r17, 12*4(r1)
	ld	r18, 14*4(r1)
	ld	r19, 16*4(r1)
	ld	r20, 18*4(r1)
	ld	r21, 20*4(r1)
	ld	r22, 48*4(r1)
	ld	r23, 50*4(r1)
	ld	r24, 52*4(r1)
	ld	r25, 54*4(r1)
	ld	r26, 56*4(r1)
	ld	r27, 58*4(r1)
	ld	r28, 60*4(r1)
	ld	r29, 62*4(r1)
	ld	r30, 64*4(r1)
	ld	r31, 66*4(r1)
	addi	r1, r1, 68*4
	blr

#endif /* __ALTIVEC__ */

#endif
07070100000025000081A4000003E800000064000000015EF4BCA1000116C0000000000000000000000000000000000000001C00000000cpuminer-2.5.1/scrypt-x64.S
/*
 * Copyright 2011-2014 pooler@litecoinpool.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "cpuminer-config.h"

#if defined(__linux__) && defined(__ELF__)
	.section .note.GNU-stack,"",%progbits
#endif

#if defined(USE_ASM) && defined(__x86_64__)

	.text
	.p2align 6
	.globl scrypt_best_throughput
	.globl _scrypt_best_throughput
scrypt_best_throughput:
_scrypt_best_throughput:
	pushq	%rbx
#if defined(USE_AVX2)
	/* Check for AVX and OSXSAVE support */
	movl	$1, %eax
	cpuid
	andl	$0x18000000, %ecx
	cmpl	$0x18000000, %ecx
	jne scrypt_best_throughput_no_avx2
	/* Check for AVX2 support */
	movl	$7, %eax
	xorl	%ecx, %ecx
	cpuid
	andl	$0x00000020, %ebx
	cmpl	$0x00000020, %ebx
	jne scrypt_best_throughput_no_avx2
	/* Check for XMM and YMM state support */
	xorl	%ecx, %ecx
	xgetbv
	andl	$0x00000006, %eax
	cmpl	$0x00000006, %eax
	jne scrypt_best_throughput_no_avx2
	movl	$6, %eax
	jmp scrypt_best_throughput_exit
scrypt_best_throughput_no_avx2:
#endif
	/* Check for AuthenticAMD */
	xorq	%rax, %rax
	cpuid
	movl	$3, %eax
	cmpl	$0x444d4163, %ecx
	jne scrypt_best_throughput_not_amd
	cmpl	$0x69746e65, %edx
	jne scrypt_best_throughput_not_amd
	cmpl	$0x68747541, %ebx
	jne scrypt_best_throughput_not_amd
	/* Check for AMD K8 or Bobcat */
	movl	$1, %eax
	cpuid
	andl	$0x0ff00000, %eax
	jz scrypt_best_throughput_one
	cmpl	$0x00500000, %eax
	je scrypt_best_throughput_one
	movl	$3, %eax
	jmp scrypt_best_throughput_exit
scrypt_best_throughput_not_amd:
	/* Check for GenuineIntel */
	cmpl	$0x6c65746e, %ecx
	jne scrypt_best_throughput_exit
	cmpl	$0x49656e69, %edx
	jne scrypt_best_throughput_exit
	cmpl	$0x756e6547, %ebx
	jne scrypt_best_throughput_exit
	/* Check for Intel Atom */
	movl	$1, %eax
	cpuid
	movl	%eax, %edx
	andl	$0x0ff00f00, %eax
	cmpl	$0x00000600, %eax
	movl	$3, %eax
	jnz scrypt_best_throughput_exit
	andl	$0x000f00f0, %edx
	cmpl	$0x000100c0, %edx
	je scrypt_best_throughput_one
	cmpl	$0x00020060, %edx
	je scrypt_best_throughput_one
	cmpl	$0x00030060, %edx
	jne scrypt_best_throughput_exit
scrypt_best_throughput_one:
	movl	$1, %eax
scrypt_best_throughput_exit:
	popq	%rbx
	ret
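
/* For reference, a rough C equivalent of the AVX2 capability test
 * above, assuming GCC/Clang's <cpuid.h> (__get_cpuid_count needs a
 * reasonably recent compiler); illustrative only, not part of the
 * build:
 *
 *	#include <cpuid.h>
 *	#include <stdint.h>
 *
 *	static int have_avx2(void)
 *	{
 *		unsigned int eax, ebx, ecx, edx;
 *		// CPUID.1:ECX bits 27 (OSXSAVE) and 28 (AVX)
 *		if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx) ||
 *		    (ecx & 0x18000000) != 0x18000000)
 *			return 0;
 *		// CPUID.(7,0):EBX bit 5 (AVX2)
 *		if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) ||
 *		    !(ebx & 0x00000020))
 *			return 0;
 *		// XCR0 bits 1 and 2: the OS saves XMM and YMM state
 *		uint32_t lo, hi;
 *		__asm__ volatile ("xgetbv" : "=a" (lo), "=d" (hi) : "c" (0));
 *		return (lo & 6) == 6;
 *	}
 */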
	
	
.macro scrypt_shuffle src, so, dest, do
	movl	\so+60(\src), %eax
	movl	\so+44(\src), %ebx
	movl	\so+28(\src), %ecx
	movl	\so+12(\src), %edx
	movl	%eax, \do+12(\dest)
	movl	%ebx, \do+28(\dest)
	movl	%ecx, \do+44(\dest)
	movl	%edx, \do+60(\dest)
	movl	\so+40(\src), %eax
	movl	\so+8(\src), %ebx
	movl	\so+48(\src), %ecx
	movl	\so+16(\src), %edx
	movl	%eax, \do+8(\dest)
	movl	%ebx, \do+40(\dest)
	movl	%ecx, \do+16(\dest)
	movl	%edx, \do+48(\dest)
	movl	\so+20(\src), %eax
	movl	\so+4(\src), %ebx
	movl	\so+52(\src), %ecx
	movl	\so+36(\src), %edx
	movl	%eax, \do+4(\dest)
	movl	%ebx, \do+20(\dest)
	movl	%ecx, \do+36(\dest)
	movl	%edx, \do+52(\dest)
	movl	\so+0(\src), %eax
	movl	\so+24(\src), %ebx
	movl	\so+32(\src), %ecx
	movl	\so+56(\src), %edx
	movl	%eax, \do+0(\dest)
	movl	%ebx, \do+24(\dest)
	movl	%ecx, \do+32(\dest)
	movl	%edx, \do+56(\dest)
.endm
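
/* scrypt_shuffle applies a fixed permutation of the sixteen 32-bit
 * words in a 64-byte block, converting between scrypt's linear word
 * order and the diagonal order the SIMD salsa8 kernels work in.  The
 * permutation is its own inverse, which is why the same macro is used
 * in both directions.  Equivalent C, with the table read off the moves
 * above (reference only):
 *
 *	#include <stdint.h>
 *
 *	static const int shuf[16] = {
 *		0,  5, 10, 15, 12,  1,  6, 11,
 *		8, 13,  2,  7,  4,  9, 14,  3
 *	};
 *
 *	static void scrypt_shuffle_c(const uint32_t src[16],
 *	                             uint32_t dst[16])
 *	{
 *		for (int i = 0; i < 16; i++)
 *			dst[i] = src[shuf[i]];
 *	}
 */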


.macro salsa8_core_gen_doubleround
	movq	72(%rsp), %r15
	
	leaq	(%r14, %rdx), %rbp
	roll	$7, %ebp
	xorl	%ebp, %r9d
	leaq	(%rdi, %r15), %rbp
	roll	$7, %ebp
	xorl	%ebp, %r10d
	leaq	(%rdx, %r9), %rbp
	roll	$9, %ebp
	xorl	%ebp, %r11d
	leaq	(%r15, %r10), %rbp
	roll	$9, %ebp
	xorl	%ebp, %r13d
	
	leaq	(%r9, %r11), %rbp
	roll	$13, %ebp
	xorl	%ebp, %r14d
	leaq	(%r10, %r13), %rbp
	roll	$13, %ebp
	xorl	%ebp, %edi
	leaq	(%r11, %r14), %rbp
	roll	$18, %ebp
	xorl	%ebp, %edx
	leaq	(%r13, %rdi), %rbp
	roll	$18, %ebp
	xorl	%ebp, %r15d
	
	movq	48(%rsp), %rbp
	movq	%r15, 72(%rsp)
	
	leaq	(%rax, %rbp), %r15
	roll	$7, %r15d
	xorl	%r15d, %ebx
	leaq	(%rbp, %rbx), %r15
	roll	$9, %r15d
	xorl	%r15d, %ecx
	leaq	(%rbx, %rcx), %r15
	roll	$13, %r15d
	xorl	%r15d, %eax
	leaq	(%rcx, %rax), %r15
	roll	$18, %r15d
	xorl	%r15d, %ebp
	
	movq	88(%rsp), %r15
	movq	%rbp, 48(%rsp)
	
	leaq	(%r12, %r15), %rbp
	roll	$7, %ebp
	xorl	%ebp, %esi
	leaq	(%r15, %rsi), %rbp
	roll	$9, %ebp
	xorl	%ebp, %r8d
	leaq	(%rsi, %r8), %rbp
	roll	$13, %ebp
	xorl	%ebp, %r12d
	leaq	(%r8, %r12), %rbp
	roll	$18, %ebp
	xorl	%ebp, %r15d
	
	movq	%r15, 88(%rsp)
	movq	72(%rsp), %r15
	
	leaq	(%rsi, %rdx), %rbp
	roll	$7, %ebp
	xorl	%ebp, %edi
	leaq	(%r9, %r15), %rbp
	roll	$7, %ebp
	xorl	%ebp, %eax
	leaq	(%rdx, %rdi), %rbp
	roll	$9, %ebp
	xorl	%ebp, %ecx
	leaq	(%r15, %rax), %rbp
	roll	$9, %ebp
	xorl	%ebp, %r8d
	
	leaq	(%rdi, %rcx), %rbp
	roll	$13, %ebp
	xorl	%ebp, %esi
	leaq	(%rax, %r8), %rbp
	roll	$13, %ebp
	xorl	%ebp, %r9d
	leaq	(%rcx, %rsi), %rbp
	roll	$18, %ebp
	xorl	%ebp, %edx
	leaq	(%r8, %r9), %rbp
	roll	$18, %ebp
	xorl	%ebp, %r15d
	
	movq	48(%rsp), %rbp
	movq	%r15, 72(%rsp)
	
	leaq	(%r10, %rbp), %r15
	roll	$7, %r15d
	xorl	%r15d, %r12d
	leaq	(%rbp, %r12), %r15
	roll	$9, %r15d
	xorl	%r15d, %r11d
	leaq	(%r12, %r11), %r15
	roll	$13, %r15d
	xorl	%r15d, %r10d
	leaq	(%r11, %r10), %r15
	roll	$18, %r15d
	xorl	%r15d, %ebp
	
	movq	88(%rsp), %r15
	movq	%rbp, 48(%rsp)
	
	leaq	(%rbx, %r15), %rbp
	roll	$7, %ebp
	xorl	%ebp, %r14d
	leaq	(%r15, %r14), %rbp
	roll	$9, %ebp
	xorl	%ebp, %r13d
	leaq	(%r14, %r13), %rbp
	roll	$13, %ebp
	xorl	%ebp, %ebx
	leaq	(%r13, %rbx), %rbp
	roll	$18, %ebp
	xorl	%ebp, %r15d
	
	movq	%r15, 88(%rsp)
.endm

	.text
	.p2align 6
salsa8_core_gen:
	/* words 0-3 in %rdx, %rdi, %rcx, %rsi */
	movq	8(%rsp), %rdi
	movq	%rdi, %rdx
	shrq	$32, %rdi
	movq	16(%rsp), %rsi
	movq	%rsi, %rcx
	shrq	$32, %rsi
	/* words 4-7 in %r9, 72(%rsp), %rax, %r8 */
	movq	24(%rsp), %r8
	movq	%r8, %r9
	shrq	$32, %r8
	movq	%r8, 72(%rsp)
	movq	32(%rsp), %r8
	movq	%r8, %rax
	shrq	$32, %r8
	/* words 8-11 in %r11, %r10, 48(%rsp), %r12 */
	movq	40(%rsp), %r10
	movq	%r10, %r11
	shrq	$32, %r10
	movq	48(%rsp), %r12
	/* movq	%r12, %r13 */
	/* movq	%r13, 48(%rsp) */
	shrq	$32, %r12
	/* words 12-15 in %r14, %r13, %rbx, 88(%rsp) */
	movq	56(%rsp), %r13
	movq	%r13, %r14
	shrq	$32, %r13
	movq	64(%rsp), %r15
	movq	%r15, %rbx
	shrq	$32, %r15
	movq	%r15, 88(%rsp)
	
	salsa8_core_gen_doubleround
	salsa8_core_gen_doubleround
	salsa8_core_gen_doubleround
	salsa8_core_gen_doubleround
	
	shlq	$32, %rdi
	xorq	%rdi, %rdx
	movq	%rdx, 24(%rsp)
	
	shlq	$32, %rsi
	xorq	%rsi, %rcx
	movq	%rcx, 32(%rsp)
	
	movl	72(%rsp), %edi
	shlq	$32, %rdi
	xorq	%rdi, %r9
	movq	%r9, 40(%rsp)
	
	movl	48(%rsp), %ebp
	shlq	$32, %r8
	xorq	%r8, %rax
	movq	%rax, 48(%rsp)
	
	shlq	$32, %r10
	xorq	%r10, %r11
	movq	%r11, 56(%rsp)
	
	shlq	$32, %r12
	xorq	%r12, %rbp
	movq	%rbp, 64(%rsp)
	
	shlq	$32, %r13
	xorq	%r13, %r14
	movq	%r14, 72(%rsp)
	
	movdqa	24(%rsp), %xmm0
	
	shlq	$32, %r15
	xorq	%r15, %rbx
	movq	%rbx, 80(%rsp)
	
	movdqa	40(%rsp), %xmm1
	movdqa	56(%rsp), %xmm2
	movdqa	72(%rsp), %xmm3
	
	ret
	
	
	.text
	.p2align 6
	.globl scrypt_core
	.globl _scrypt_core
scrypt_core:
_scrypt_core:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
#if defined(_WIN64) || defined(__CYGWIN__)
	subq	$176, %rsp
	movdqa	%xmm6, 8(%rsp)
	movdqa	%xmm7, 24(%rsp)
	movdqa	%xmm8, 40(%rsp)
	movdqa	%xmm9, 56(%rsp)
	movdqa	%xmm10, 72(%rsp)
	movdqa	%xmm11, 88(%rsp)
	movdqa	%xmm12, 104(%rsp)
	movdqa	%xmm13, 120(%rsp)
	movdqa	%xmm14, 136(%rsp)
	movdqa	%xmm15, 152(%rsp)
	pushq	%rdi
	pushq	%rsi
	movq	%rcx, %rdi
	movq	%rdx, %rsi
#else
	movq	%rdx, %r8
#endif
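
	/* On Win64 the first arguments arrive in %rcx/%rdx and %xmm6-%xmm15
	   are callee-saved, so the block above preserves them and moves the
	   arguments into the System V registers the body expects; on other
	   ABIs only N is moved out of %rdx to free it as scratch. */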

.macro scrypt_core_cleanup
#if defined(_WIN64) || defined(__CYGWIN__)
	popq	%rsi
	popq	%rdi
	movdqa	8(%rsp), %xmm6
	movdqa	24(%rsp), %xmm7
	movdqa	40(%rsp), %xmm8
	movdqa	56(%rsp), %xmm9
	movdqa	72(%rsp), %xmm10
	movdqa	88(%rsp), %xmm11
	movdqa	104(%rsp), %xmm12
	movdqa	120(%rsp), %xmm13
	movdqa	136(%rsp), %xmm14
	movdqa	152(%rsp), %xmm15
	addq	$176, %rsp
#endif
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx
.endm
	
	/* GenuineIntel processors have fast SIMD */
	xorl	%eax, %eax
	cpuid
	cmpl	$0x6c65746e, %ecx
	jne scrypt_core_gen
	cmpl	$0x49656e69, %edx
	jne scrypt_core_gen
	cmpl	$0x756e6547, %ebx
	je scrypt_core_xmm
	
	.p2align 6
scrypt_core_gen:
	subq	$136, %rsp
	movdqa	0(%rdi), %xmm8
	movdqa	16(%rdi), %xmm9
	movdqa	32(%rdi), %xmm10
	movdqa	48(%rdi), %xmm11
	movdqa	64(%rdi), %xmm12
	movdqa	80(%rdi), %xmm13
	movdqa	96(%rdi), %xmm14
	movdqa	112(%rdi), %xmm15
	
	movq	%r8, %rcx
	shlq	$7, %rcx
	addq	%rsi, %rcx
	movq	%r8, 96(%rsp)
	movq	%rdi, 104(%rsp)
	movq	%rsi, 112(%rsp)
	movq	%rcx, 120(%rsp)
scrypt_core_gen_loop1:
	movdqa	%xmm8, 0(%rsi)
	movdqa	%xmm9, 16(%rsi)
	movdqa	%xmm10, 32(%rsi)
	movdqa	%xmm11, 48(%rsi)
	movdqa	%xmm12, 64(%rsi)
	movdqa	%xmm13, 80(%rsi)
	movdqa	%xmm14, 96(%rsi)
	movdqa	%xmm15, 112(%rsi)
	
	pxor	%xmm12, %xmm8
	pxor	%xmm13, %xmm9
	pxor	%xmm14, %xmm10
	pxor	%xmm15, %xmm11
	movdqa	%xmm8, 0(%rsp)
	movdqa	%xmm9, 16(%rsp)
	movdqa	%xmm10, 32(%rsp)
	movdqa	%xmm11, 48(%rsp)
	movq	%rsi, 128(%rsp)
	call salsa8_core_gen
	paddd	%xmm0, %xmm8
	paddd	%xmm1, %xmm9
	paddd	%xmm2, %xmm10
	paddd	%xmm3, %xmm11
	
	pxor	%xmm8, %xmm12
	pxor	%xmm9, %xmm13
	pxor	%xmm10, %xmm14
	pxor	%xmm11, %xmm15
	movdqa	%xmm12, 0(%rsp)
	movdqa	%xmm13, 16(%rsp)
	movdqa	%xmm14, 32(%rsp)
	movdqa	%xmm15, 48(%rsp)
	call salsa8_core_gen
	movq	128(%rsp), %rsi
	paddd	%xmm0, %xmm12
	paddd	%xmm1, %xmm13
	paddd	%xmm2, %xmm14
	paddd	%xmm3, %xmm15
	
	addq	$128, %rsi
	movq	120(%rsp), %rcx
	cmpq	%rcx, %rsi
	jne scrypt_core_gen_loop1
	
	movq	96(%rsp), %r8
	movq	%r8, %rcx
	subl	$1, %r8d
	movq	%r8, 96(%rsp)
	movd	%xmm12, %edx
scrypt_core_gen_loop2:
	movq	112(%rsp), %rsi
	andl	%r8d, %edx
	shll	$7, %edx
	addq	%rsi, %rdx
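	/* integerify: %rdx = V + 128 * (X[16] & (N - 1)) */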
	movdqa	0(%rdx), %xmm0
	movdqa	16(%rdx), %xmm1
	movdqa	32(%rdx), %xmm2
	movdqa	48(%rdx), %xmm3
	movdqa	64(%rdx), %xmm4
	movdqa	80(%rdx), %xmm5
	movdqa	96(%rdx), %xmm6
	movdqa	112(%rdx), %xmm7
	pxor	%xmm0, %xmm8
	pxor	%xmm1, %xmm9
	pxor	%xmm2, %xmm10
	pxor	%xmm3, %xmm11
	pxor	%xmm4, %xmm12
	pxor	%xmm5, %xmm13
	pxor	%xmm6, %xmm14
	pxor	%xmm7, %xmm15
	
	pxor	%xmm12, %xmm8
	pxor	%xmm13, %xmm9
	pxor	%xmm14, %xmm10
	pxor	%xmm15, %xmm11
	movdqa	%xmm8, 0(%rsp)
	movdqa	%xmm9, 16(%rsp)
	movdqa	%xmm10, 32(%rsp)
	movdqa	%xmm11, 48(%rsp)
	movq	%rcx, 128(%rsp)
	call salsa8_core_gen
	paddd	%xmm0, %xmm8
	paddd	%xmm1, %xmm9
	paddd	%xmm2, %xmm10
	paddd	%xmm3, %xmm11
	
	pxor	%xmm8, %xmm12
	pxor	%xmm9, %xmm13
	pxor	%xmm10, %xmm14
	pxor	%xmm11, %xmm15
	movdqa	%xmm12, 0(%rsp)
	movdqa	%xmm13, 16(%rsp)
	movdqa	%xmm14, 32(%rsp)
	movdqa	%xmm15, 48(%rsp)
	call salsa8_core_gen
	movq	96(%rsp), %r8
	movq	128(%rsp), %rcx
	addl	0(%rsp), %edx
	paddd	%xmm0, %xmm12
	paddd	%xmm1, %xmm13
	paddd	%xmm2, %xmm14
	paddd	%xmm3, %xmm15
	
	subq	$1, %rcx
	ja scrypt_core_gen_loop2
	
	movq	104(%rsp), %rdi
	movdqa	%xmm8, 0(%rdi)
	movdqa	%xmm9, 16(%rdi)
	movdqa	%xmm10, 32(%rdi)
	movdqa	%xmm11, 48(%rdi)
	movdqa	%xmm12, 64(%rdi)
	movdqa	%xmm13, 80(%rdi)
	movdqa	%xmm14, 96(%rdi)
	movdqa	%xmm15, 112(%rdi)
	
	addq	$136, %rsp
	scrypt_core_cleanup
	ret


.macro salsa8_core_xmm_doubleround
	movdqa	%xmm1, %xmm4
	paddd	%xmm0, %xmm4
	movdqa	%xmm4, %xmm5
	pslld	$7, %xmm4
	psrld	$25, %xmm5
	pxor	%xmm4, %xmm3
	movdqa	%xmm0, %xmm4
	pxor	%xmm5, %xmm3
	
	paddd	%xmm3, %xmm4
	movdqa	%xmm4, %xmm5
	pslld	$9, %xmm4
	psrld	$23, %xmm5
	pxor	%xmm4, %xmm2
	movdqa	%xmm3, %xmm4
	pxor	%xmm5, %xmm2
	pshufd	$0x93, %xmm3, %xmm3
	
	paddd	%xmm2, %xmm4
	movdqa	%xmm4, %xmm5
	pslld	$13, %xmm4
	psrld	$19, %xmm5
	pxor	%xmm4, %xmm1
	movdqa	%xmm2, %xmm4
	pxor	%xmm5, %xmm1
	pshufd	$0x4e, %xmm2, %xmm2
	
	paddd	%xmm1, %xmm4
	movdqa	%xmm4, %xmm5
	pslld	$18, %xmm4
	psrld	$14, %xmm5
	pxor	%xmm4, %xmm0
	movdqa	%xmm3, %xmm4
	pxor	%xmm5, %xmm0
	pshufd	$0x39, %xmm1, %xmm1
	
	paddd	%xmm0, %xmm4
	movdqa	%xmm4, %xmm5
	pslld	$7, %xmm4
	psrld	$25, %xmm5
	pxor	%xmm4, %xmm1
	movdqa	%xmm0, %xmm4
	pxor	%xmm5, %xmm1
	
	paddd	%xmm1, %xmm4
	movdqa	%xmm4, %xmm5
	pslld	$9, %xmm4
	psrld	$23, %xmm5
	pxor	%xmm4, %xmm2
	movdqa	%xmm1, %xmm4
	pxor	%xmm5, %xmm2
	pshufd	$0x93, %xmm1, %xmm1
	
	paddd	%xmm2, %xmm4
	movdqa	%xmm4, %xmm5
	pslld	$13, %xmm4
	psrld	$19, %xmm5
	pxor	%xmm4, %xmm3
	movdqa	%xmm2, %xmm4
	pxor	%xmm5, %xmm3
	pshufd	$0x4e, %xmm2, %xmm2
	
	paddd	%xmm3, %xmm4
	movdqa	%xmm4, %xmm5
	pslld	$18, %xmm4
	psrld	$14, %xmm5
	pxor	%xmm4, %xmm0
	pshufd	$0x39, %xmm3, %xmm3
	pxor	%xmm5, %xmm0
.endm
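
/* SSE2 has no vector rotate, so each rotate above is emulated with a
 * shift-left/shift-right pair xored into the destination.  The same
 * idiom with intrinsics (illustrative helper, not part of this
 * source):
 *
 *	#include <emmintrin.h>
 *
 *	static inline __m128i rotl_epi32(__m128i x, int n)
 *	{
 *		return _mm_or_si128(_mm_slli_epi32(x, n),
 *		                    _mm_srli_epi32(x, 32 - n));
 *	}
 */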

.macro salsa8_core_xmm
	salsa8_core_xmm_doubleround
	salsa8_core_xmm_doubleround
	salsa8_core_xmm_doubleround
	salsa8_core_xmm_doubleround
.endm
	
	.p2align 6
scrypt_core_xmm:
	pcmpeqw	%xmm1, %xmm1
	psrlq	$32, %xmm1
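	/* pcmpeqw/psrlq forms the per-quadword mask 0x00000000ffffffff;
	   each pxor/pand/pxor triple below merges the even words of one
	   register with the odd words of another, applying the
	   scrypt_shuffle permutation entirely in registers */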
	
	movdqa	0(%rdi), %xmm8
	movdqa	16(%rdi), %xmm11
	movdqa	32(%rdi), %xmm10
	movdqa	48(%rdi), %xmm9
	movdqa	%xmm8, %xmm0
	pxor	%xmm11, %xmm8
	pand	%xmm1, %xmm8
	pxor	%xmm11, %xmm8
	pxor	%xmm10, %xmm11
	pand	%xmm1, %xmm11
	pxor	%xmm10, %xmm11
	pxor	%xmm9, %xmm10
	pand	%xmm1, %xmm10
	pxor	%xmm9, %xmm10
	pxor	%xmm0, %xmm9
	pand	%xmm1, %xmm9
	pxor	%xmm0, %xmm9
	movdqa	%xmm8, %xmm0
	pshufd	$0x4e, %xmm10, %xmm10
	punpcklqdq	%xmm10, %xmm8
	punpckhqdq	%xmm0, %xmm10
	movdqa	%xmm11, %xmm0
	pshufd	$0x4e, %xmm9, %xmm9
	punpcklqdq	%xmm9, %xmm11
	punpckhqdq	%xmm0, %xmm9
	
	movdqa	64(%rdi), %xmm12
	movdqa	80(%rdi), %xmm15
	movdqa	96(%rdi), %xmm14
	movdqa	112(%rdi), %xmm13
	movdqa	%xmm12, %xmm0
	pxor	%xmm15, %xmm12
	pand	%xmm1, %xmm12
	pxor	%xmm15, %xmm12
	pxor	%xmm14, %xmm15
	pand	%xmm1, %xmm15
	pxor	%xmm14, %xmm15
	pxor	%xmm13, %xmm14
	pand	%xmm1, %xmm14
	pxor	%xmm13, %xmm14
	pxor	%xmm0, %xmm13
	pand	%xmm1, %xmm13
	pxor	%xmm0, %xmm13
	movdqa	%xmm12, %xmm0
	pshufd	$0x4e, %xmm14, %xmm14
	punpcklqdq	%xmm14, %xmm12
	punpckhqdq	%xmm0, %xmm14
	movdqa	%xmm15, %xmm0
	pshufd	$0x4e, %xmm13, %xmm13
	punpcklqdq	%xmm13, %xmm15
	punpckhqdq	%xmm0, %xmm13
	
	movq	%rsi, %rdx
	movq	%r8, %rcx
	shlq	$7, %rcx
	addq	%rsi, %rcx
scrypt_core_xmm_loop1:
	pxor	%xmm12, %xmm8
	pxor	%xmm13, %xmm9
	pxor	%xmm14, %xmm10
	pxor	%xmm15, %xmm11
	movdqa	%xmm8, 0(%rdx)
	movdqa	%xmm9, 16(%rdx)
	movdqa	%xmm10, 32(%rdx)
	movdqa	%xmm11, 48(%rdx)
	movdqa	%xmm12, 64(%rdx)
	movdqa	%xmm13, 80(%rdx)
	movdqa	%xmm14, 96(%rdx)
	movdqa	%xmm15, 112(%rdx)
	
	movdqa	%xmm8, %xmm0
	movdqa	%xmm9, %xmm1
	movdqa	%xmm10, %xmm2
	movdqa	%xmm11, %xmm3
	salsa8_core_xmm
	paddd	%xmm0, %xmm8
	paddd	%xmm1, %xmm9
	paddd	%xmm2, %xmm10
	paddd	%xmm3, %xmm11
	
	pxor	%xmm8, %xmm12
	pxor	%xmm9, %xmm13
	pxor	%xmm10, %xmm14
	pxor	%xmm11, %xmm15
	movdqa	%xmm12, %xmm0
	movdqa	%xmm13, %xmm1
	movdqa	%xmm14, %xmm2
	movdqa	%xmm15, %xmm3
	salsa8_core_xmm
	paddd	%xmm0, %xmm12
	paddd	%xmm1, %xmm13
	paddd	%xmm2, %xmm14
	paddd	%xmm3, %xmm15
	
	addq	$128, %rdx
	cmpq	%rcx, %rdx
	jne scrypt_core_xmm_loop1
	
	movq	%r8, %rcx
	subl	$1, %r8d
scrypt_core_xmm_loop2:
	movd	%xmm12, %edx
	andl	%r8d, %edx
	shll	$7, %edx
	pxor	0(%rsi, %rdx), %xmm8
	pxor	16(%rsi, %rdx), %xmm9
	pxor	32(%rsi, %rdx), %xmm10
	pxor	48(%rsi, %rdx), %xmm11
	
	pxor	%xmm12, %xmm8
	pxor	%xmm13, %xmm9
	pxor	%xmm14, %xmm10
	pxor	%xmm15, %xmm11
	movdqa	%xmm8, %xmm0
	movdqa	%xmm9, %xmm1
	movdqa	%xmm10, %xmm2
	movdqa	%xmm11, %xmm3
	salsa8_core_xmm
	paddd	%xmm0, %xmm8
	paddd	%xmm1, %xmm9
	paddd	%xmm2, %xmm10
	paddd	%xmm3, %xmm11
	
	pxor	64(%rsi, %rdx), %xmm12
	pxor	80(%rsi, %rdx), %xmm13
	pxor	96(%rsi, %rdx), %xmm14
	pxor	112(%rsi, %rdx), %xmm15
	pxor	%xmm8, %xmm12
	pxor	%xmm9, %xmm13
	pxor	%xmm10, %xmm14
	pxor	%xmm11, %xmm15
	movdqa	%xmm12, %xmm0
	movdqa	%xmm13, %xmm1
	movdqa	%xmm14, %xmm2
	movdqa	%xmm15, %xmm3
	salsa8_core_xmm
	paddd	%xmm0, %xmm12
	paddd	%xmm1, %xmm13
	paddd	%xmm2, %xmm14
	paddd	%xmm3, %xmm15
	
	subq	$1, %rcx
	ja scrypt_core_xmm_loop2
	
	pcmpeqw	%xmm1, %xmm1
	psrlq	$32, %xmm1
	
	movdqa	%xmm8, %xmm0
	pxor	%xmm9, %xmm8
	pand	%xmm1, %xmm8
	pxor	%xmm9, %xmm8
	pxor	%xmm10, %xmm9
	pand	%xmm1, %xmm9
	pxor	%xmm10, %xmm9
	pxor	%xmm11, %xmm10
	pand	%xmm1, %xmm10
	pxor	%xmm11, %xmm10
	pxor	%xmm0, %xmm11
	pand	%xmm1, %xmm11
	pxor	%xmm0, %xmm11
	movdqa	%xmm8, %xmm0
	pshufd	$0x4e, %xmm10, %xmm10
	punpcklqdq	%xmm10, %xmm8
	punpckhqdq	%xmm0, %xmm10
	movdqa	%xmm9, %xmm0
	pshufd	$0x4e, %xmm11, %xmm11
	punpcklqdq	%xmm11, %xmm9
	punpckhqdq	%xmm0, %xmm11
	movdqa	%xmm8, 0(%rdi)
	movdqa	%xmm11, 16(%rdi)
	movdqa	%xmm10, 32(%rdi)
	movdqa	%xmm9, 48(%rdi)
	
	movdqa	%xmm12, %xmm0
	pxor	%xmm13, %xmm12
	pand	%xmm1, %xmm12
	pxor	%xmm13, %xmm12
	pxor	%xmm14, %xmm13
	pand	%xmm1, %xmm13
	pxor	%xmm14, %xmm13
	pxor	%xmm15, %xmm14
	pand	%xmm1, %xmm14
	pxor	%xmm15, %xmm14
	pxor	%xmm0, %xmm15
	pand	%xmm1, %xmm15
	pxor	%xmm0, %xmm15
	movdqa	%xmm12, %xmm0
	pshufd	$0x4e, %xmm14, %xmm14
	punpcklqdq	%xmm14, %xmm12
	punpckhqdq	%xmm0, %xmm14
	movdqa	%xmm13, %xmm0
	pshufd	$0x4e, %xmm15, %xmm15
	punpcklqdq	%xmm15, %xmm13
	punpckhqdq	%xmm0, %xmm15
	movdqa	%xmm12, 64(%rdi)
	movdqa	%xmm15, 80(%rdi)
	movdqa	%xmm14, 96(%rdi)
	movdqa	%xmm13, 112(%rdi)
	
	scrypt_core_cleanup
	ret
	
	
#if defined(USE_AVX)
.macro salsa8_core_3way_avx_doubleround
	vpaddd	%xmm0, %xmm1, %xmm4
	vpaddd	%xmm8, %xmm9, %xmm6
	vpaddd	%xmm12, %xmm13, %xmm7
	vpslld	$7, %xmm4, %xmm5
	vpsrld	$25, %xmm4, %xmm4
	vpxor	%xmm5, %xmm3, %xmm3
	vpxor	%xmm4, %xmm3, %xmm3
	vpslld	$7, %xmm6, %xmm5
	vpsrld	$25, %xmm6, %xmm6
	vpxor	%xmm5, %xmm11, %xmm11
	vpxor	%xmm6, %xmm11, %xmm11
	vpslld	$7, %xmm7, %xmm5
	vpsrld	$25, %xmm7, %xmm7
	vpxor	%xmm5, %xmm15, %xmm15
	vpxor	%xmm7, %xmm15, %xmm15
	
	vpaddd	%xmm3, %xmm0, %xmm4
	vpaddd	%xmm11, %xmm8, %xmm6
	vpaddd	%xmm15, %xmm12, %xmm7
	vpslld	$9, %xmm4, %xmm5
	vpsrld	$23, %xmm4, %xmm4
	vpxor	%xmm5, %xmm2, %xmm2
	vpxor	%xmm4, %xmm2, %xmm2
	vpslld	$9, %xmm6, %xmm5
	vpsrld	$23, %xmm6, %xmm6
	vpxor	%xmm5, %xmm10, %xmm10
	vpxor	%xmm6, %xmm10, %xmm10
	vpslld	$9, %xmm7, %xmm5
	vpsrld	$23, %xmm7, %xmm7
	vpxor	%xmm5, %xmm14, %xmm14
	vpxor	%xmm7, %xmm14, %xmm14
	
	vpaddd	%xmm2, %xmm3, %xmm4
	vpaddd	%xmm10, %xmm11, %xmm6
	vpaddd	%xmm14, %xmm15, %xmm7
	vpslld	$13, %xmm4, %xmm5
	vpsrld	$19, %xmm4, %xmm4
	vpshufd	$0x93, %xmm3, %xmm3
	vpshufd	$0x93, %xmm11, %xmm11
	vpshufd	$0x93, %xmm15, %xmm15
	vpxor	%xmm5, %xmm1, %xmm1
	vpxor	%xmm4, %xmm1, %xmm1
	vpslld	$13, %xmm6, %xmm5
	vpsrld	$19, %xmm6, %xmm6
	vpxor	%xmm5, %xmm9, %xmm9
	vpxor	%xmm6, %xmm9, %xmm9
	vpslld	$13, %xmm7, %xmm5
	vpsrld	$19, %xmm7, %xmm7
	vpxor	%xmm5, %xmm13, %xmm13
	vpxor	%xmm7, %xmm13, %xmm13
	
	vpaddd	%xmm1, %xmm2, %xmm4
	vpaddd	%xmm9, %xmm10, %xmm6
	vpaddd	%xmm13, %xmm14, %xmm7
	vpslld	$18, %xmm4, %xmm5
	vpsrld	$14, %xmm4, %xmm4
	vpshufd	$0x4e, %xmm2, %xmm2
	vpshufd	$0x4e, %xmm10, %xmm10
	vpshufd	$0x4e, %xmm14, %xmm14
	vpxor	%xmm5, %xmm0, %xmm0
	vpxor	%xmm4, %xmm0, %xmm0
	vpslld	$18, %xmm6, %xmm5
	vpsrld	$14, %xmm6, %xmm6
	vpxor	%xmm5, %xmm8, %xmm8
	vpxor	%xmm6, %xmm8, %xmm8
	vpslld	$18, %xmm7, %xmm5
	vpsrld	$14, %xmm7, %xmm7
	vpxor	%xmm5, %xmm12, %xmm12
	vpxor	%xmm7, %xmm12, %xmm12
	
	vpaddd	%xmm0, %xmm3, %xmm4
	vpaddd	%xmm8, %xmm11, %xmm6
	vpaddd	%xmm12, %xmm15, %xmm7
	vpslld	$7, %xmm4, %xmm5
	vpsrld	$25, %xmm4, %xmm4
	vpshufd	$0x39, %xmm1, %xmm1
	vpxor	%xmm5, %xmm1, %xmm1
	vpxor	%xmm4, %xmm1, %xmm1
	vpslld	$7, %xmm6, %xmm5
	vpsrld	$25, %xmm6, %xmm6
	vpshufd	$0x39, %xmm9, %xmm9
	vpxor	%xmm5, %xmm9, %xmm9
	vpxor	%xmm6, %xmm9, %xmm9
	vpslld	$7, %xmm7, %xmm5
	vpsrld	$25, %xmm7, %xmm7
	vpshufd	$0x39, %xmm13, %xmm13
	vpxor	%xmm5, %xmm13, %xmm13
	vpxor	%xmm7, %xmm13, %xmm13
	
	vpaddd	%xmm1, %xmm0, %xmm4
	vpaddd	%xmm9, %xmm8, %xmm6
	vpaddd	%xmm13, %xmm12, %xmm7
	vpslld	$9, %xmm4, %xmm5
	vpsrld	$23, %xmm4, %xmm4
	vpxor	%xmm5, %xmm2, %xmm2
	vpxor	%xmm4, %xmm2, %xmm2
	vpslld	$9, %xmm6, %xmm5
	vpsrld	$23, %xmm6, %xmm6
	vpxor	%xmm5, %xmm10, %xmm10
	vpxor	%xmm6, %xmm10, %xmm10
	vpslld	$9, %xmm7, %xmm5
	vpsrld	$23, %xmm7, %xmm7
	vpxor	%xmm5, %xmm14, %xmm14
	vpxor	%xmm7, %xmm14, %xmm14
	
	vpaddd	%xmm2, %xmm1, %xmm4
	vpaddd	%xmm10, %xmm9, %xmm6
	vpaddd	%xmm14, %xmm13, %xmm7
	vpslld	$13, %xmm4, %xmm5
	vpsrld	$19, %xmm4, %xmm4
	vpshufd	$0x93, %xmm1, %xmm1
	vpshufd	$0x93, %xmm9, %xmm9
	vpshufd	$0x93, %xmm13, %xmm13
	vpxor	%xmm5, %xmm3, %xmm3
	vpxor	%xmm4, %xmm3, %xmm3
	vpslld	$13, %xmm6, %xmm5
	vpsrld	$19, %xmm6, %xmm6
	vpxor	%xmm5, %xmm11, %xmm11
	vpxor	%xmm6, %xmm11, %xmm11
	vpslld	$13, %xmm7, %xmm5
	vpsrld	$19, %xmm7, %xmm7
	vpxor	%xmm5, %xmm15, %xmm15
	vpxor	%xmm7, %xmm15, %xmm15
	
	vpaddd	%xmm3, %xmm2, %xmm4
	vpaddd	%xmm11, %xmm10, %xmm6
	vpaddd	%xmm15, %xmm14, %xmm7
	vpslld	$18, %xmm4, %xmm5
	vpsrld	$14, %xmm4, %xmm4
	vpshufd	$0x4e, %xmm2, %xmm2
	vpshufd	$0x4e, %xmm10, %xmm10
	vpxor	%xmm5, %xmm0, %xmm0
	vpxor	%xmm4, %xmm0, %xmm0
	vpslld	$18, %xmm6, %xmm5
	vpsrld	$14, %xmm6, %xmm6
	vpshufd	$0x4e, %xmm14, %xmm14
	vpshufd	$0x39, %xmm11, %xmm11
	vpxor	%xmm5, %xmm8, %xmm8
	vpxor	%xmm6, %xmm8, %xmm8
	vpslld	$18, %xmm7, %xmm5
	vpsrld	$14, %xmm7, %xmm7
	vpshufd	$0x39, %xmm3, %xmm3
	vpshufd	$0x39, %xmm15, %xmm15
	vpxor	%xmm5, %xmm12, %xmm12
	vpxor	%xmm7, %xmm12, %xmm12
.endm

.macro salsa8_core_3way_avx
	salsa8_core_3way_avx_doubleround
	salsa8_core_3way_avx_doubleround
	salsa8_core_3way_avx_doubleround
	salsa8_core_3way_avx_doubleround
.endm
#endif /* USE_AVX */
	
	.text
	.p2align 6
	.globl scrypt_core_3way
	.globl _scrypt_core_3way
scrypt_core_3way:
_scrypt_core_3way:
	pushq	%rbx
	pushq	%rbp
#if defined(_WIN64) || defined(__CYGWIN__)
	subq	$176, %rsp
	movdqa	%xmm6, 8(%rsp)
	movdqa	%xmm7, 24(%rsp)
	movdqa	%xmm8, 40(%rsp)
	movdqa	%xmm9, 56(%rsp)
	movdqa	%xmm10, 72(%rsp)
	movdqa	%xmm11, 88(%rsp)
	movdqa	%xmm12, 104(%rsp)
	movdqa	%xmm13, 120(%rsp)
	movdqa	%xmm14, 136(%rsp)
	movdqa	%xmm15, 152(%rsp)
	pushq	%rdi
	pushq	%rsi
	movq	%rcx, %rdi
	movq	%rdx, %rsi
#else
	movq	%rdx, %r8
#endif
	subq	$392, %rsp
	
.macro scrypt_core_3way_cleanup
	addq	$392, %rsp
#if defined(_WIN64) || defined(__CYGWIN__)
	popq	%rsi
	popq	%rdi
	movdqa	8(%rsp), %xmm6
	movdqa	24(%rsp), %xmm7
	movdqa	40(%rsp), %xmm8
	movdqa	56(%rsp), %xmm9
	movdqa	72(%rsp), %xmm10
	movdqa	88(%rsp), %xmm11
	movdqa	104(%rsp), %xmm12
	movdqa	120(%rsp), %xmm13
	movdqa	136(%rsp), %xmm14
	movdqa	152(%rsp), %xmm15
	addq	$176, %rsp
#endif
	popq	%rbp
	popq	%rbx
.endm
	
#if !defined(USE_AVX)
	jmp scrypt_core_3way_xmm
#else
	/* Check for AVX and OSXSAVE support */
	movl	$1, %eax
	cpuid
	andl	$0x18000000, %ecx
	cmpl	$0x18000000, %ecx
	jne scrypt_core_3way_xmm
	/* Check for XMM and YMM state support */
	xorl	%ecx, %ecx
	xgetbv
	andl	$0x00000006, %eax
	cmpl	$0x00000006, %eax
	jne scrypt_core_3way_xmm
#if defined(USE_XOP)
	/* Check for XOP support */
	movl	$0x80000001, %eax
	cpuid
	andl	$0x00000800, %ecx
	jnz scrypt_core_3way_xop
#endif
	
scrypt_core_3way_avx:
	scrypt_shuffle %rdi, 0, %rsp, 0
	scrypt_shuffle %rdi, 64, %rsp, 64
	scrypt_shuffle %rdi, 128, %rsp, 128
	scrypt_shuffle %rdi, 192, %rsp, 192
	scrypt_shuffle %rdi, 256, %rsp, 256
	scrypt_shuffle %rdi, 320, %rsp, 320
	
	movdqa	64(%rsp), %xmm0
	movdqa	80(%rsp), %xmm1
	movdqa	96(%rsp), %xmm2
	movdqa	112(%rsp), %xmm3
	movdqa	128+64(%rsp), %xmm8
	movdqa	128+80(%rsp), %xmm9
	movdqa	128+96(%rsp), %xmm10
	movdqa	128+112(%rsp), %xmm11
	movdqa	256+64(%rsp), %xmm12
	movdqa	256+80(%rsp), %xmm13
	movdqa	256+96(%rsp), %xmm14
	movdqa	256+112(%rsp), %xmm15
	
	movq	%rsi, %rbx
	leaq	(%r8, %r8, 2), %rax
	shlq	$7, %rax
	addq	%rsi, %rax
scrypt_core_3way_avx_loop1:
	movdqa	%xmm0, 64(%rbx)
	movdqa	%xmm1, 80(%rbx)
	movdqa	%xmm2, 96(%rbx)
	movdqa	%xmm3, 112(%rbx)
	pxor	0(%rsp), %xmm0
	pxor	16(%rsp), %xmm1
	pxor	32(%rsp), %xmm2
	pxor	48(%rsp), %xmm3
	movdqa	%xmm8, 128+64(%rbx)
	movdqa	%xmm9, 128+80(%rbx)
	movdqa	%xmm10, 128+96(%rbx)
	movdqa	%xmm11, 128+112(%rbx)
	pxor	128+0(%rsp), %xmm8
	pxor	128+16(%rsp), %xmm9
	pxor	128+32(%rsp), %xmm10
	pxor	128+48(%rsp), %xmm11
	movdqa	%xmm12, 256+64(%rbx)
	movdqa	%xmm13, 256+80(%rbx)
	movdqa	%xmm14, 256+96(%rbx)
	movdqa	%xmm15, 256+112(%rbx)
	pxor	256+0(%rsp), %xmm12
	pxor	256+16(%rsp), %xmm13
	pxor	256+32(%rsp), %xmm14
	pxor	256+48(%rsp), %xmm15
	movdqa	%xmm0, 0(%rbx)
	movdqa	%xmm1, 16(%rbx)
	movdqa	%xmm2, 32(%rbx)
	movdqa	%xmm3, 48(%rbx)
	movdqa	%xmm8, 128+0(%rbx)
	movdqa	%xmm9, 128+16(%rbx)
	movdqa	%xmm10, 128+32(%rbx)
	movdqa	%xmm11, 128+48(%rbx)
	movdqa	%xmm12, 256+0(%rbx)
	movdqa	%xmm13, 256+16(%rbx)
	movdqa	%xmm14, 256+32(%rbx)
	movdqa	%xmm15, 256+48(%rbx)
	
	salsa8_core_3way_avx
	paddd	0(%rbx), %xmm0
	paddd	16(%rbx), %xmm1
	paddd	32(%rbx), %xmm2
	paddd	48(%rbx), %xmm3
	paddd	128+0(%rbx), %xmm8
	paddd	128+16(%rbx), %xmm9
	paddd	128+32(%rbx), %xmm10
	paddd	128+48(%rbx), %xmm11
	paddd	256+0(%rbx), %xmm12
	paddd	256+16(%rbx), %xmm13
	paddd	256+32(%rbx), %xmm14
	paddd	256+48(%rbx), %xmm15
	movdqa	%xmm0, 0(%rsp)
	movdqa	%xmm1, 16(%rsp)
	movdqa	%xmm2, 32(%rsp)
	movdqa	%xmm3, 48(%rsp)
	movdqa	%xmm8, 128+0(%rsp)
	movdqa	%xmm9, 128+16(%rsp)
	movdqa	%xmm10, 128+32(%rsp)
	movdqa	%xmm11, 128+48(%rsp)
	movdqa	%xmm12, 256+0(%rsp)
	movdqa	%xmm13, 256+16(%rsp)
	movdqa	%xmm14, 256+32(%rsp)
	movdqa	%xmm15, 256+48(%rsp)
	
	pxor	64(%rbx), %xmm0
	pxor	80(%rbx), %xmm1
	pxor	96(%rbx), %xmm2
	pxor	112(%rbx), %xmm3
	pxor	128+64(%rbx), %xmm8
	pxor	128+80(%rbx), %xmm9
	pxor	128+96(%rbx), %xmm10
	pxor	128+112(%rbx), %xmm11
	pxor	256+64(%rbx), %xmm12
	pxor	256+80(%rbx), %xmm13
	pxor	256+96(%rbx), %xmm14
	pxor	256+112(%rbx), %xmm15
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm1, 80(%rsp)
	movdqa	%xmm2, 96(%rsp)
	movdqa	%xmm3, 112(%rsp)
	movdqa	%xmm8, 128+64(%rsp)
	movdqa	%xmm9, 128+80(%rsp)
	movdqa	%xmm10, 128+96(%rsp)
	movdqa	%xmm11, 128+112(%rsp)
	movdqa	%xmm12, 256+64(%rsp)
	movdqa	%xmm13, 256+80(%rsp)
	movdqa	%xmm14, 256+96(%rsp)
	movdqa	%xmm15, 256+112(%rsp)
	salsa8_core_3way_avx
	paddd	64(%rsp), %xmm0
	paddd	80(%rsp), %xmm1
	paddd	96(%rsp), %xmm2
	paddd	112(%rsp), %xmm3
	paddd	128+64(%rsp), %xmm8
	paddd	128+80(%rsp), %xmm9
	paddd	128+96(%rsp), %xmm10
	paddd	128+112(%rsp), %xmm11
	paddd	256+64(%rsp), %xmm12
	paddd	256+80(%rsp), %xmm13
	paddd	256+96(%rsp), %xmm14
	paddd	256+112(%rsp), %xmm15
	
	addq	$3*128, %rbx
	cmpq	%rax, %rbx
	jne scrypt_core_3way_avx_loop1
	
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm1, 80(%rsp)
	movdqa	%xmm2, 96(%rsp)
	movdqa	%xmm3, 112(%rsp)
	movdqa	%xmm8, 128+64(%rsp)
	movdqa	%xmm9, 128+80(%rsp)
	movdqa	%xmm10, 128+96(%rsp)
	movdqa	%xmm11, 128+112(%rsp)
	movdqa	%xmm12, 256+64(%rsp)
	movdqa	%xmm13, 256+80(%rsp)
	movdqa	%xmm14, 256+96(%rsp)
	movdqa	%xmm15, 256+112(%rsp)
	
	movq	%r8, %rcx
	subq	$1, %r8
scrypt_core_3way_avx_loop2:
	movd	%xmm0, %ebp
	movd	%xmm8, %ebx
	movd	%xmm12, %eax
	pxor	0(%rsp), %xmm0
	pxor	16(%rsp), %xmm1
	pxor	32(%rsp), %xmm2
	pxor	48(%rsp), %xmm3
	pxor	128+0(%rsp), %xmm8
	pxor	128+16(%rsp), %xmm9
	pxor	128+32(%rsp), %xmm10
	pxor	128+48(%rsp), %xmm11
	pxor	256+0(%rsp), %xmm12
	pxor	256+16(%rsp), %xmm13
	pxor	256+32(%rsp), %xmm14
	pxor	256+48(%rsp), %xmm15
	andl	%r8d, %ebp
	leaq	(%rbp, %rbp, 2), %rbp
	shll	$7, %ebp
	andl	%r8d, %ebx
	leaq	1(%rbx, %rbx, 2), %rbx
	shll	$7, %ebx
	andl	%r8d, %eax
	leaq	2(%rax, %rax, 2), %rax
	shll	$7, %eax
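	/* lane k (0..2) keeps its V entries at byte offset (3*j + k)*128:
	   leaq (r,r,2) forms 3*j, the 0/1/2 displacements select the lane,
	   and the shift by 7 scales by 128 */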
	pxor	0(%rsi, %rbp), %xmm0
	pxor	16(%rsi, %rbp), %xmm1
	pxor	32(%rsi, %rbp), %xmm2
	pxor	48(%rsi, %rbp), %xmm3
	pxor	0(%rsi, %rbx), %xmm8
	pxor	16(%rsi, %rbx), %xmm9
	pxor	32(%rsi, %rbx), %xmm10
	pxor	48(%rsi, %rbx), %xmm11
	pxor	0(%rsi, %rax), %xmm12
	pxor	16(%rsi, %rax), %xmm13
	pxor	32(%rsi, %rax), %xmm14
	pxor	48(%rsi, %rax), %xmm15
	
	movdqa	%xmm0, 0(%rsp)
	movdqa	%xmm1, 16(%rsp)
	movdqa	%xmm2, 32(%rsp)
	movdqa	%xmm3, 48(%rsp)
	movdqa	%xmm8, 128+0(%rsp)
	movdqa	%xmm9, 128+16(%rsp)
	movdqa	%xmm10, 128+32(%rsp)
	movdqa	%xmm11, 128+48(%rsp)
	movdqa	%xmm12, 256+0(%rsp)
	movdqa	%xmm13, 256+16(%rsp)
	movdqa	%xmm14, 256+32(%rsp)
	movdqa	%xmm15, 256+48(%rsp)
	salsa8_core_3way_avx
	paddd	0(%rsp), %xmm0
	paddd	16(%rsp), %xmm1
	paddd	32(%rsp), %xmm2
	paddd	48(%rsp), %xmm3
	paddd	128+0(%rsp), %xmm8
	paddd	128+16(%rsp), %xmm9
	paddd	128+32(%rsp), %xmm10
	paddd	128+48(%rsp), %xmm11
	paddd	256+0(%rsp), %xmm12
	paddd	256+16(%rsp), %xmm13
	paddd	256+32(%rsp), %xmm14
	paddd	256+48(%rsp), %xmm15
	movdqa	%xmm0, 0(%rsp)
	movdqa	%xmm1, 16(%rsp)
	movdqa	%xmm2, 32(%rsp)
	movdqa	%xmm3, 48(%rsp)
	movdqa	%xmm8, 128+0(%rsp)
	movdqa	%xmm9, 128+16(%rsp)
	movdqa	%xmm10, 128+32(%rsp)
	movdqa	%xmm11, 128+48(%rsp)
	movdqa	%xmm12, 256+0(%rsp)
	movdqa	%xmm13, 256+16(%rsp)
	movdqa	%xmm14, 256+32(%rsp)
	movdqa	%xmm15, 256+48(%rsp)
	
	pxor	64(%rsi, %rbp), %xmm0
	pxor	80(%rsi, %rbp), %xmm1
	pxor	96(%rsi, %rbp), %xmm2
	pxor	112(%rsi, %rbp), %xmm3
	pxor	64(%rsi, %rbx), %xmm8
	pxor	80(%rsi, %rbx), %xmm9
	pxor	96(%rsi, %rbx), %xmm10
	pxor	112(%rsi, %rbx), %xmm11
	pxor	64(%rsi, %rax), %xmm12
	pxor	80(%rsi, %rax), %xmm13
	pxor	96(%rsi, %rax), %xmm14
	pxor	112(%rsi, %rax), %xmm15
	pxor	64(%rsp), %xmm0
	pxor	80(%rsp), %xmm1
	pxor	96(%rsp), %xmm2
	pxor	112(%rsp), %xmm3
	pxor	128+64(%rsp), %xmm8
	pxor	128+80(%rsp), %xmm9
	pxor	128+96(%rsp), %xmm10
	pxor	128+112(%rsp), %xmm11
	pxor	256+64(%rsp), %xmm12
	pxor	256+80(%rsp), %xmm13
	pxor	256+96(%rsp), %xmm14
	pxor	256+112(%rsp), %xmm15
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm1, 80(%rsp)
	movdqa	%xmm2, 96(%rsp)
	movdqa	%xmm3, 112(%rsp)
	movdqa	%xmm8, 128+64(%rsp)
	movdqa	%xmm9, 128+80(%rsp)
	movdqa	%xmm10, 128+96(%rsp)
	movdqa	%xmm11, 128+112(%rsp)
	movdqa	%xmm12, 256+64(%rsp)
	movdqa	%xmm13, 256+80(%rsp)
	movdqa	%xmm14, 256+96(%rsp)
	movdqa	%xmm15, 256+112(%rsp)
	salsa8_core_3way_avx
	paddd	64(%rsp), %xmm0
	paddd	80(%rsp), %xmm1
	paddd	96(%rsp), %xmm2
	paddd	112(%rsp), %xmm3
	paddd	128+64(%rsp), %xmm8
	paddd	128+80(%rsp), %xmm9
	paddd	128+96(%rsp), %xmm10
	paddd	128+112(%rsp), %xmm11
	paddd	256+64(%rsp), %xmm12
	paddd	256+80(%rsp), %xmm13
	paddd	256+96(%rsp), %xmm14
	paddd	256+112(%rsp), %xmm15
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm1, 80(%rsp)
	movdqa	%xmm2, 96(%rsp)
	movdqa	%xmm3, 112(%rsp)
	movdqa	%xmm8, 128+64(%rsp)
	movdqa	%xmm9, 128+80(%rsp)
	movdqa	%xmm10, 128+96(%rsp)
	movdqa	%xmm11, 128+112(%rsp)
	movdqa	%xmm12, 256+64(%rsp)
	movdqa	%xmm13, 256+80(%rsp)
	movdqa	%xmm14, 256+96(%rsp)
	movdqa	%xmm15, 256+112(%rsp)
	
	subq	$1, %rcx
	ja scrypt_core_3way_avx_loop2
	
	scrypt_shuffle %rsp, 0, %rdi, 0
	scrypt_shuffle %rsp, 64, %rdi, 64
	scrypt_shuffle %rsp, 128, %rdi, 128
	scrypt_shuffle %rsp, 192, %rdi, 192
	scrypt_shuffle %rsp, 256, %rdi, 256
	scrypt_shuffle %rsp, 320, %rdi, 320
	
	scrypt_core_3way_cleanup
	ret

#if defined(USE_XOP)
.macro salsa8_core_3way_xop_doubleround
	vpaddd	%xmm0, %xmm1, %xmm4
	vpaddd	%xmm8, %xmm9, %xmm6
	vpaddd	%xmm12, %xmm13, %xmm7
	vprotd	$7, %xmm4, %xmm4
	vprotd	$7, %xmm6, %xmm6
	vprotd	$7, %xmm7, %xmm7
	vpxor	%xmm4, %xmm3, %xmm3
	vpxor	%xmm6, %xmm11, %xmm11
	vpxor	%xmm7, %xmm15, %xmm15
	
	vpaddd	%xmm3, %xmm0, %xmm4
	vpaddd	%xmm11, %xmm8, %xmm6
	vpaddd	%xmm15, %xmm12, %xmm7
	vprotd	$9, %xmm4, %xmm4
	vprotd	$9, %xmm6, %xmm6
	vprotd	$9, %xmm7, %xmm7
	vpxor	%xmm4, %xmm2, %xmm2
	vpxor	%xmm6, %xmm10, %xmm10
	vpxor	%xmm7, %xmm14, %xmm14
	
	vpaddd	%xmm2, %xmm3, %xmm4
	vpaddd	%xmm10, %xmm11, %xmm6
	vpaddd	%xmm14, %xmm15, %xmm7
	vprotd	$13, %xmm4, %xmm4
	vprotd	$13, %xmm6, %xmm6
	vprotd	$13, %xmm7, %xmm7
	vpshufd	$0x93, %xmm3, %xmm3
	vpshufd	$0x93, %xmm11, %xmm11
	vpshufd	$0x93, %xmm15, %xmm15
	vpxor	%xmm4, %xmm1, %xmm1
	vpxor	%xmm6, %xmm9, %xmm9
	vpxor	%xmm7, %xmm13, %xmm13
	
	vpaddd	%xmm1, %xmm2, %xmm4
	vpaddd	%xmm9, %xmm10, %xmm6
	vpaddd	%xmm13, %xmm14, %xmm7
	vprotd	$18, %xmm4, %xmm4
	vprotd	$18, %xmm6, %xmm6
	vprotd	$18, %xmm7, %xmm7
	vpshufd	$0x4e, %xmm2, %xmm2
	vpshufd	$0x4e, %xmm10, %xmm10
	vpshufd	$0x4e, %xmm14, %xmm14
	vpxor	%xmm6, %xmm8, %xmm8
	vpxor	%xmm4, %xmm0, %xmm0
	vpxor	%xmm7, %xmm12, %xmm12
	
	vpaddd	%xmm0, %xmm3, %xmm4
	vpaddd	%xmm8, %xmm11, %xmm6
	vpaddd	%xmm12, %xmm15, %xmm7
	vprotd	$7, %xmm4, %xmm4
	vprotd	$7, %xmm6, %xmm6
	vprotd	$7, %xmm7, %xmm7
	vpshufd	$0x39, %xmm1, %xmm1
	vpshufd	$0x39, %xmm9, %xmm9
	vpshufd	$0x39, %xmm13, %xmm13
	vpxor	%xmm4, %xmm1, %xmm1
	vpxor	%xmm6, %xmm9, %xmm9
	vpxor	%xmm7, %xmm13, %xmm13
	
	vpaddd	%xmm1, %xmm0, %xmm4
	vpaddd	%xmm9, %xmm8, %xmm6
	vpaddd	%xmm13, %xmm12, %xmm7
	vprotd	$9, %xmm4, %xmm4
	vprotd	$9, %xmm6, %xmm6
	vprotd	$9, %xmm7, %xmm7
	vpxor	%xmm4, %xmm2, %xmm2
	vpxor	%xmm6, %xmm10, %xmm10
	vpxor	%xmm7, %xmm14, %xmm14
	
	vpaddd	%xmm2, %xmm1, %xmm4
	vpaddd	%xmm10, %xmm9, %xmm6
	vpaddd	%xmm14, %xmm13, %xmm7
	vprotd	$13, %xmm4, %xmm4
	vprotd	$13, %xmm6, %xmm6
	vprotd	$13, %xmm7, %xmm7
	vpshufd	$0x93, %xmm1, %xmm1
	vpshufd	$0x93, %xmm9, %xmm9
	vpshufd	$0x93, %xmm13, %xmm13
	vpxor	%xmm4, %xmm3, %xmm3
	vpxor	%xmm6, %xmm11, %xmm11
	vpxor	%xmm7, %xmm15, %xmm15
	
	vpaddd	%xmm3, %xmm2, %xmm4
	vpaddd	%xmm11, %xmm10, %xmm6
	vpaddd	%xmm15, %xmm14, %xmm7
	vprotd	$18, %xmm4, %xmm4
	vprotd	$18, %xmm6, %xmm6
	vprotd	$18, %xmm7, %xmm7
	vpshufd	$0x4e, %xmm2, %xmm2
	vpshufd	$0x4e, %xmm10, %xmm10
	vpshufd	$0x4e, %xmm14, %xmm14
	vpxor	%xmm4, %xmm0, %xmm0
	vpxor	%xmm6, %xmm8, %xmm8
	vpxor	%xmm7, %xmm12, %xmm12
	vpshufd	$0x39, %xmm3, %xmm3
	vpshufd	$0x39, %xmm11, %xmm11
	vpshufd	$0x39, %xmm15, %xmm15
.endm
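
/* Same schedule as the AVX variant above, but XOP's vprotd is a true
 * vector rotate, so each vpslld/vpsrld/vpxor/vpxor group collapses
 * into a single vprotd and one vpxor. */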

.macro salsa8_core_3way_xop
	salsa8_core_3way_xop_doubleround
	salsa8_core_3way_xop_doubleround
	salsa8_core_3way_xop_doubleround
	salsa8_core_3way_xop_doubleround
.endm
	
	.p2align 6
scrypt_core_3way_xop:
	scrypt_shuffle %rdi, 0, %rsp, 0
	scrypt_shuffle %rdi, 64, %rsp, 64
	scrypt_shuffle %rdi, 128, %rsp, 128
	scrypt_shuffle %rdi, 192, %rsp, 192
	scrypt_shuffle %rdi, 256, %rsp, 256
	scrypt_shuffle %rdi, 320, %rsp, 320
	
	movdqa	64(%rsp), %xmm0
	movdqa	80(%rsp), %xmm1
	movdqa	96(%rsp), %xmm2
	movdqa	112(%rsp), %xmm3
	movdqa	128+64(%rsp), %xmm8
	movdqa	128+80(%rsp), %xmm9
	movdqa	128+96(%rsp), %xmm10
	movdqa	128+112(%rsp), %xmm11
	movdqa	256+64(%rsp), %xmm12
	movdqa	256+80(%rsp), %xmm13
	movdqa	256+96(%rsp), %xmm14
	movdqa	256+112(%rsp), %xmm15
	
	movq	%rsi, %rbx
	leaq	(%r8, %r8, 2), %rax
	shlq	$7, %rax
	addq	%rsi, %rax
scrypt_core_3way_xop_loop1:
	movdqa	%xmm0, 64(%rbx)
	movdqa	%xmm1, 80(%rbx)
	movdqa	%xmm2, 96(%rbx)
	movdqa	%xmm3, 112(%rbx)
	pxor	0(%rsp), %xmm0
	pxor	16(%rsp), %xmm1
	pxor	32(%rsp), %xmm2
	pxor	48(%rsp), %xmm3
	movdqa	%xmm8, 128+64(%rbx)
	movdqa	%xmm9, 128+80(%rbx)
	movdqa	%xmm10, 128+96(%rbx)
	movdqa	%xmm11, 128+112(%rbx)
	pxor	128+0(%rsp), %xmm8
	pxor	128+16(%rsp), %xmm9
	pxor	128+32(%rsp), %xmm10
	pxor	128+48(%rsp), %xmm11
	movdqa	%xmm12, 256+64(%rbx)
	movdqa	%xmm13, 256+80(%rbx)
	movdqa	%xmm14, 256+96(%rbx)
	movdqa	%xmm15, 256+112(%rbx)
	pxor	256+0(%rsp), %xmm12
	pxor	256+16(%rsp), %xmm13
	pxor	256+32(%rsp), %xmm14
	pxor	256+48(%rsp), %xmm15
	movdqa	%xmm0, 0(%rbx)
	movdqa	%xmm1, 16(%rbx)
	movdqa	%xmm2, 32(%rbx)
	movdqa	%xmm3, 48(%rbx)
	movdqa	%xmm8, 128+0(%rbx)
	movdqa	%xmm9, 128+16(%rbx)
	movdqa	%xmm10, 128+32(%rbx)
	movdqa	%xmm11, 128+48(%rbx)
	movdqa	%xmm12, 256+0(%rbx)
	movdqa	%xmm13, 256+16(%rbx)
	movdqa	%xmm14, 256+32(%rbx)
	movdqa	%xmm15, 256+48(%rbx)
	
	salsa8_core_3way_xop
	paddd	0(%rbx), %xmm0
	paddd	16(%rbx), %xmm1
	paddd	32(%rbx), %xmm2
	paddd	48(%rbx), %xmm3
	paddd	128+0(%rbx), %xmm8
	paddd	128+16(%rbx), %xmm9
	paddd	128+32(%rbx), %xmm10
	paddd	128+48(%rbx), %xmm11
	paddd	256+0(%rbx), %xmm12
	paddd	256+16(%rbx), %xmm13
	paddd	256+32(%rbx), %xmm14
	paddd	256+48(%rbx), %xmm15
	movdqa	%xmm0, 0(%rsp)
	movdqa	%xmm1, 16(%rsp)
	movdqa	%xmm2, 32(%rsp)
	movdqa	%xmm3, 48(%rsp)
	movdqa	%xmm8, 128+0(%rsp)
	movdqa	%xmm9, 128+16(%rsp)
	movdqa	%xmm10, 128+32(%rsp)
	movdqa	%xmm11, 128+48(%rsp)
	movdqa	%xmm12, 256+0(%rsp)
	movdqa	%xmm13, 256+16(%rsp)
	movdqa	%xmm14, 256+32(%rsp)
	movdqa	%xmm15, 256+48(%rsp)
	
	pxor	64(%rbx), %xmm0
	pxor	80(%rbx), %xmm1
	pxor	96(%rbx), %xmm2
	pxor	112(%rbx), %xmm3
	pxor	128+64(%rbx), %xmm8
	pxor	128+80(%rbx), %xmm9
	pxor	128+96(%rbx), %xmm10
	pxor	128+112(%rbx), %xmm11
	pxor	256+64(%rbx), %xmm12
	pxor	256+80(%rbx), %xmm13
	pxor	256+96(%rbx), %xmm14
	pxor	256+112(%rbx), %xmm15
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm1, 80(%rsp)
	movdqa	%xmm2, 96(%rsp)
	movdqa	%xmm3, 112(%rsp)
	movdqa	%xmm8, 128+64(%rsp)
	movdqa	%xmm9, 128+80(%rsp)
	movdqa	%xmm10, 128+96(%rsp)
	movdqa	%xmm11, 128+112(%rsp)
	movdqa	%xmm12, 256+64(%rsp)
	movdqa	%xmm13, 256+80(%rsp)
	movdqa	%xmm14, 256+96(%rsp)
	movdqa	%xmm15, 256+112(%rsp)
	salsa8_core_3way_xop
	paddd	64(%rsp), %xmm0
	paddd	80(%rsp), %xmm1
	paddd	96(%rsp), %xmm2
	paddd	112(%rsp), %xmm3
	paddd	128+64(%rsp), %xmm8
	paddd	128+80(%rsp), %xmm9
	paddd	128+96(%rsp), %xmm10
	paddd	128+112(%rsp), %xmm11
	paddd	256+64(%rsp), %xmm12
	paddd	256+80(%rsp), %xmm13
	paddd	256+96(%rsp), %xmm14
	paddd	256+112(%rsp), %xmm15
	
	addq	$3*128, %rbx
	cmpq	%rax, %rbx
	jne scrypt_core_3way_xop_loop1
	
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm1, 80(%rsp)
	movdqa	%xmm2, 96(%rsp)
	movdqa	%xmm3, 112(%rsp)
	movdqa	%xmm8, 128+64(%rsp)
	movdqa	%xmm9, 128+80(%rsp)
	movdqa	%xmm10, 128+96(%rsp)
	movdqa	%xmm11, 128+112(%rsp)
	movdqa	%xmm12, 256+64(%rsp)
	movdqa	%xmm13, 256+80(%rsp)
	movdqa	%xmm14, 256+96(%rsp)
	movdqa	%xmm15, 256+112(%rsp)
	
	movq	%r8, %rcx
	subq	$1, %r8
scrypt_core_3way_xop_loop2:
	movd	%xmm0, %ebp
	movd	%xmm8, %ebx
	movd	%xmm12, %eax
	pxor	0(%rsp), %xmm0
	pxor	16(%rsp), %xmm1
	pxor	32(%rsp), %xmm2
	pxor	48(%rsp), %xmm3
	pxor	128+0(%rsp), %xmm8
	pxor	128+16(%rsp), %xmm9
	pxor	128+32(%rsp), %xmm10
	pxor	128+48(%rsp), %xmm11
	pxor	256+0(%rsp), %xmm12
	pxor	256+16(%rsp), %xmm13
	pxor	256+32(%rsp), %xmm14
	pxor	256+48(%rsp), %xmm15
	andl	%r8d, %ebp
	leaq	(%rbp, %rbp, 2), %rbp
	shll	$7, %ebp
	andl	%r8d, %ebx
	leaq	1(%rbx, %rbx, 2), %rbx
	shll	$7, %ebx
	andl	%r8d, %eax
	leaq	2(%rax, %rax, 2), %rax
	shll	$7, %eax
	pxor	0(%rsi, %rbp), %xmm0
	pxor	16(%rsi, %rbp), %xmm1
	pxor	32(%rsi, %rbp), %xmm2
	pxor	48(%rsi, %rbp), %xmm3
	pxor	0(%rsi, %rbx), %xmm8
	pxor	16(%rsi, %rbx), %xmm9
	pxor	32(%rsi, %rbx), %xmm10
	pxor	48(%rsi, %rbx), %xmm11
	pxor	0(%rsi, %rax), %xmm12
	pxor	16(%rsi, %rax), %xmm13
	pxor	32(%rsi, %rax), %xmm14
	pxor	48(%rsi, %rax), %xmm15
	
	movdqa	%xmm0, 0(%rsp)
	movdqa	%xmm1, 16(%rsp)
	movdqa	%xmm2, 32(%rsp)
	movdqa	%xmm3, 48(%rsp)
	movdqa	%xmm8, 128+0(%rsp)
	movdqa	%xmm9, 128+16(%rsp)
	movdqa	%xmm10, 128+32(%rsp)
	movdqa	%xmm11, 128+48(%rsp)
	movdqa	%xmm12, 256+0(%rsp)
	movdqa	%xmm13, 256+16(%rsp)
	movdqa	%xmm14, 256+32(%rsp)
	movdqa	%xmm15, 256+48(%rsp)
	salsa8_core_3way_xop
	paddd	0(%rsp), %xmm0
	paddd	16(%rsp), %xmm1
	paddd	32(%rsp), %xmm2
	paddd	48(%rsp), %xmm3
	paddd	128+0(%rsp), %xmm8
	paddd	128+16(%rsp), %xmm9
	paddd	128+32(%rsp), %xmm10
	paddd	128+48(%rsp), %xmm11
	paddd	256+0(%rsp), %xmm12
	paddd	256+16(%rsp), %xmm13
	paddd	256+32(%rsp), %xmm14
	paddd	256+48(%rsp), %xmm15
	movdqa	%xmm0, 0(%rsp)
	movdqa	%xmm1, 16(%rsp)
	movdqa	%xmm2, 32(%rsp)
	movdqa	%xmm3, 48(%rsp)
	movdqa	%xmm8, 128+0(%rsp)
	movdqa	%xmm9, 128+16(%rsp)
	movdqa	%xmm10, 128+32(%rsp)
	movdqa	%xmm11, 128+48(%rsp)
	movdqa	%xmm12, 256+0(%rsp)
	movdqa	%xmm13, 256+16(%rsp)
	movdqa	%xmm14, 256+32(%rsp)
	movdqa	%xmm15, 256+48(%rsp)
	
	pxor	64(%rsi, %rbp), %xmm0
	pxor	80(%rsi, %rbp), %xmm1
	pxor	96(%rsi, %rbp), %xmm2
	pxor	112(%rsi, %rbp), %xmm3
	pxor	64(%rsi, %rbx), %xmm8
	pxor	80(%rsi, %rbx), %xmm9
	pxor	96(%rsi, %rbx), %xmm10
	pxor	112(%rsi, %rbx), %xmm11
	pxor	64(%rsi, %rax), %xmm12
	pxor	80(%rsi, %rax), %xmm13
	pxor	96(%rsi, %rax), %xmm14
	pxor	112(%rsi, %rax), %xmm15
	pxor	64(%rsp), %xmm0
	pxor	80(%rsp), %xmm1
	pxor	96(%rsp), %xmm2
	pxor	112(%rsp), %xmm3
	pxor	128+64(%rsp), %xmm8
	pxor	128+80(%rsp), %xmm9
	pxor	128+96(%rsp), %xmm10
	pxor	128+112(%rsp), %xmm11
	pxor	256+64(%rsp), %xmm12
	pxor	256+80(%rsp), %xmm13
	pxor	256+96(%rsp), %xmm14
	pxor	256+112(%rsp), %xmm15
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm1, 80(%rsp)
	movdqa	%xmm2, 96(%rsp)
	movdqa	%xmm3, 112(%rsp)
	movdqa	%xmm8, 128+64(%rsp)
	movdqa	%xmm9, 128+80(%rsp)
	movdqa	%xmm10, 128+96(%rsp)
	movdqa	%xmm11, 128+112(%rsp)
	movdqa	%xmm12, 256+64(%rsp)
	movdqa	%xmm13, 256+80(%rsp)
	movdqa	%xmm14, 256+96(%rsp)
	movdqa	%xmm15, 256+112(%rsp)
	salsa8_core_3way_xop
	paddd	64(%rsp), %xmm0
	paddd	80(%rsp), %xmm1
	paddd	96(%rsp), %xmm2
	paddd	112(%rsp), %xmm3
	paddd	128+64(%rsp), %xmm8
	paddd	128+80(%rsp), %xmm9
	paddd	128+96(%rsp), %xmm10
	paddd	128+112(%rsp), %xmm11
	paddd	256+64(%rsp), %xmm12
	paddd	256+80(%rsp), %xmm13
	paddd	256+96(%rsp), %xmm14
	paddd	256+112(%rsp), %xmm15
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm1, 80(%rsp)
	movdqa	%xmm2, 96(%rsp)
	movdqa	%xmm3, 112(%rsp)
	movdqa	%xmm8, 128+64(%rsp)
	movdqa	%xmm9, 128+80(%rsp)
	movdqa	%xmm10, 128+96(%rsp)
	movdqa	%xmm11, 128+112(%rsp)
	movdqa	%xmm12, 256+64(%rsp)
	movdqa	%xmm13, 256+80(%rsp)
	movdqa	%xmm14, 256+96(%rsp)
	movdqa	%xmm15, 256+112(%rsp)
	
	subq	$1, %rcx
	ja scrypt_core_3way_xop_loop2
	
	scrypt_shuffle %rsp, 0, %rdi, 0
	scrypt_shuffle %rsp, 64, %rdi, 64
	scrypt_shuffle %rsp, 128, %rdi, 128
	scrypt_shuffle %rsp, 192, %rdi, 192
	scrypt_shuffle %rsp, 256, %rdi, 256
	scrypt_shuffle %rsp, 320, %rdi, 320
	
	scrypt_core_3way_cleanup
	ret
#endif /* USE_XOP */
#endif /* USE_AVX */
	
.macro salsa8_core_3way_xmm_doubleround
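	/* One Salsa20 double-round on three interleaved states (%xmm0-3,
	 * %xmm8-11, %xmm12-15). Plain SSE2 has no vector rotate, so each
	 * rotate is a shift pair combined with two pxor; pshufd realigns the
	 * rows between quarter-round groups. */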
	movdqa	%xmm1, %xmm4
	movdqa	%xmm9, %xmm6
	movdqa	%xmm13, %xmm7
	paddd	%xmm0, %xmm4
	paddd	%xmm8, %xmm6
	paddd	%xmm12, %xmm7
	movdqa	%xmm4, %xmm5
	pslld	$7, %xmm4
	psrld	$25, %xmm5
	pxor	%xmm4, %xmm3
	pxor	%xmm5, %xmm3
	movdqa	%xmm0, %xmm4
	movdqa	%xmm6, %xmm5
	pslld	$7, %xmm6
	psrld	$25, %xmm5
	pxor	%xmm6, %xmm11
	pxor	%xmm5, %xmm11
	movdqa	%xmm8, %xmm6
	movdqa	%xmm7, %xmm5
	pslld	$7, %xmm7
	psrld	$25, %xmm5
	pxor	%xmm7, %xmm15
	pxor	%xmm5, %xmm15
	movdqa	%xmm12, %xmm7
	
	paddd	%xmm3, %xmm4
	paddd	%xmm11, %xmm6
	paddd	%xmm15, %xmm7
	movdqa	%xmm4, %xmm5
	pslld	$9, %xmm4
	psrld	$23, %xmm5
	pxor	%xmm4, %xmm2
	movdqa	%xmm3, %xmm4
	pshufd	$0x93, %xmm3, %xmm3
	pxor	%xmm5, %xmm2
	movdqa	%xmm6, %xmm5
	pslld	$9, %xmm6
	psrld	$23, %xmm5
	pxor	%xmm6, %xmm10
	movdqa	%xmm11, %xmm6
	pshufd	$0x93, %xmm11, %xmm11
	pxor	%xmm5, %xmm10
	movdqa	%xmm7, %xmm5
	pslld	$9, %xmm7
	psrld	$23, %xmm5
	pxor	%xmm7, %xmm14
	movdqa	%xmm15, %xmm7
	pxor	%xmm5, %xmm14
	pshufd	$0x93, %xmm15, %xmm15
	
	paddd	%xmm2, %xmm4
	paddd	%xmm10, %xmm6
	paddd	%xmm14, %xmm7
	movdqa	%xmm4, %xmm5
	pslld	$13, %xmm4
	psrld	$19, %xmm5
	pxor	%xmm4, %xmm1
	movdqa	%xmm2, %xmm4
	pshufd	$0x4e, %xmm2, %xmm2
	pxor	%xmm5, %xmm1
	movdqa	%xmm6, %xmm5
	pslld	$13, %xmm6
	psrld	$19, %xmm5
	pxor	%xmm6, %xmm9
	movdqa	%xmm10, %xmm6
	pshufd	$0x4e, %xmm10, %xmm10
	pxor	%xmm5, %xmm9
	movdqa	%xmm7, %xmm5
	pslld	$13, %xmm7
	psrld	$19, %xmm5
	pxor	%xmm7, %xmm13
	movdqa	%xmm14, %xmm7
	pshufd	$0x4e, %xmm14, %xmm14
	pxor	%xmm5, %xmm13
	
	paddd	%xmm1, %xmm4
	paddd	%xmm9, %xmm6
	paddd	%xmm13, %xmm7
	movdqa	%xmm4, %xmm5
	pslld	$18, %xmm4
	psrld	$14, %xmm5
	pxor	%xmm4, %xmm0
	pshufd	$0x39, %xmm1, %xmm1
	pxor	%xmm5, %xmm0
	movdqa	%xmm3, %xmm4
	movdqa	%xmm6, %xmm5
	pslld	$18, %xmm6
	psrld	$14, %xmm5
	pxor	%xmm6, %xmm8
	pshufd	$0x39, %xmm9, %xmm9
	pxor	%xmm5, %xmm8
	movdqa	%xmm11, %xmm6
	movdqa	%xmm7, %xmm5
	pslld	$18, %xmm7
	psrld	$14, %xmm5
	pxor	%xmm7, %xmm12
	movdqa	%xmm15, %xmm7
	pxor	%xmm5, %xmm12
	pshufd	$0x39, %xmm13, %xmm13
	
	paddd	%xmm0, %xmm4
	paddd	%xmm8, %xmm6
	paddd	%xmm12, %xmm7
	movdqa	%xmm4, %xmm5
	pslld	$7, %xmm4
	psrld	$25, %xmm5
	pxor	%xmm4, %xmm1
	pxor	%xmm5, %xmm1
	movdqa	%xmm0, %xmm4
	movdqa	%xmm6, %xmm5
	pslld	$7, %xmm6
	psrld	$25, %xmm5
	pxor	%xmm6, %xmm9
	pxor	%xmm5, %xmm9
	movdqa	%xmm8, %xmm6
	movdqa	%xmm7, %xmm5
	pslld	$7, %xmm7
	psrld	$25, %xmm5
	pxor	%xmm7, %xmm13
	pxor	%xmm5, %xmm13
	movdqa	%xmm12, %xmm7
	
	paddd	%xmm1, %xmm4
	paddd	%xmm9, %xmm6
	paddd	%xmm13, %xmm7
	movdqa	%xmm4, %xmm5
	pslld	$9, %xmm4
	psrld	$23, %xmm5
	pxor	%xmm4, %xmm2
	movdqa	%xmm1, %xmm4
	pshufd	$0x93, %xmm1, %xmm1
	pxor	%xmm5, %xmm2
	movdqa	%xmm6, %xmm5
	pslld	$9, %xmm6
	psrld	$23, %xmm5
	pxor	%xmm6, %xmm10
	movdqa	%xmm9, %xmm6
	pshufd	$0x93, %xmm9, %xmm9
	pxor	%xmm5, %xmm10
	movdqa	%xmm7, %xmm5
	pslld	$9, %xmm7
	psrld	$23, %xmm5
	pxor	%xmm7, %xmm14
	movdqa	%xmm13, %xmm7
	pshufd	$0x93, %xmm13, %xmm13
	pxor	%xmm5, %xmm14
	
	paddd	%xmm2, %xmm4
	paddd	%xmm10, %xmm6
	paddd	%xmm14, %xmm7
	movdqa	%xmm4, %xmm5
	pslld	$13, %xmm4
	psrld	$19, %xmm5
	pxor	%xmm4, %xmm3
	movdqa	%xmm2, %xmm4
	pshufd	$0x4e, %xmm2, %xmm2
	pxor	%xmm5, %xmm3
	movdqa	%xmm6, %xmm5
	pslld	$13, %xmm6
	psrld	$19, %xmm5
	pxor	%xmm6, %xmm11
	movdqa	%xmm10, %xmm6
	pshufd	$0x4e, %xmm10, %xmm10
	pxor	%xmm5, %xmm11
	movdqa	%xmm7, %xmm5
	pslld	$13, %xmm7
	psrld	$19, %xmm5
	pxor	%xmm7, %xmm15
	movdqa	%xmm14, %xmm7
	pshufd	$0x4e, %xmm14, %xmm14
	pxor	%xmm5, %xmm15
	
	paddd	%xmm3, %xmm4
	paddd	%xmm11, %xmm6
	paddd	%xmm15, %xmm7
	movdqa	%xmm4, %xmm5
	pslld	$18, %xmm4
	psrld	$14, %xmm5
	pxor	%xmm4, %xmm0
	pshufd	$0x39, %xmm3, %xmm3
	pxor	%xmm5, %xmm0
	movdqa	%xmm6, %xmm5
	pslld	$18, %xmm6
	psrld	$14, %xmm5
	pxor	%xmm6, %xmm8
	pshufd	$0x39, %xmm11, %xmm11
	pxor	%xmm5, %xmm8
	movdqa	%xmm7, %xmm5
	pslld	$18, %xmm7
	psrld	$14, %xmm5
	pxor	%xmm7, %xmm12
	pshufd	$0x39, %xmm15, %xmm15
	pxor	%xmm5, %xmm12
.endm

.macro salsa8_core_3way_xmm
	salsa8_core_3way_xmm_doubleround
	salsa8_core_3way_xmm_doubleround
	salsa8_core_3way_xmm_doubleround
	salsa8_core_3way_xmm_doubleround
.endm
	
	.p2align 6
scrypt_core_3way_xmm:
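	/* SSE2 fallback of the 3-way core. On entry %rdi = X (three
	 * 128-byte blocks), %rsi = V, %r8 = N, as set up by the common
	 * 3-way prologue earlier in this file. */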
	scrypt_shuffle %rdi, 0, %rsp, 0
	scrypt_shuffle %rdi, 64, %rsp, 64
	scrypt_shuffle %rdi, 128, %rsp, 128
	scrypt_shuffle %rdi, 192, %rsp, 192
	scrypt_shuffle %rdi, 256, %rsp, 256
	scrypt_shuffle %rdi, 320, %rsp, 320
	
	movdqa	64(%rsp), %xmm0
	movdqa	80(%rsp), %xmm1
	movdqa	96(%rsp), %xmm2
	movdqa	112(%rsp), %xmm3
	movdqa	128+64(%rsp), %xmm8
	movdqa	128+80(%rsp), %xmm9
	movdqa	128+96(%rsp), %xmm10
	movdqa	128+112(%rsp), %xmm11
	movdqa	256+64(%rsp), %xmm12
	movdqa	256+80(%rsp), %xmm13
	movdqa	256+96(%rsp), %xmm14
	movdqa	256+112(%rsp), %xmm15
	
	movq	%rsi, %rbx
	leaq	(%r8, %r8, 2), %rax
	shlq	$7, %rax
	addq	%rsi, %rax
scrypt_core_3way_xmm_loop1:
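	/* Loop 1: V[i] = X, then X = BlockMix(X); %rbx walks V in steps of
	 * 3*128 bytes, one interleaved block per lane. */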
	movdqa	%xmm0, 64(%rbx)
	movdqa	%xmm1, 80(%rbx)
	movdqa	%xmm2, 96(%rbx)
	movdqa	%xmm3, 112(%rbx)
	pxor	0(%rsp), %xmm0
	pxor	16(%rsp), %xmm1
	pxor	32(%rsp), %xmm2
	pxor	48(%rsp), %xmm3
	movdqa	%xmm8, 128+64(%rbx)
	movdqa	%xmm9, 128+80(%rbx)
	movdqa	%xmm10, 128+96(%rbx)
	movdqa	%xmm11, 128+112(%rbx)
	pxor	128+0(%rsp), %xmm8
	pxor	128+16(%rsp), %xmm9
	pxor	128+32(%rsp), %xmm10
	pxor	128+48(%rsp), %xmm11
	movdqa	%xmm12, 256+64(%rbx)
	movdqa	%xmm13, 256+80(%rbx)
	movdqa	%xmm14, 256+96(%rbx)
	movdqa	%xmm15, 256+112(%rbx)
	pxor	256+0(%rsp), %xmm12
	pxor	256+16(%rsp), %xmm13
	pxor	256+32(%rsp), %xmm14
	pxor	256+48(%rsp), %xmm15
	movdqa	%xmm0, 0(%rbx)
	movdqa	%xmm1, 16(%rbx)
	movdqa	%xmm2, 32(%rbx)
	movdqa	%xmm3, 48(%rbx)
	movdqa	%xmm8, 128+0(%rbx)
	movdqa	%xmm9, 128+16(%rbx)
	movdqa	%xmm10, 128+32(%rbx)
	movdqa	%xmm11, 128+48(%rbx)
	movdqa	%xmm12, 256+0(%rbx)
	movdqa	%xmm13, 256+16(%rbx)
	movdqa	%xmm14, 256+32(%rbx)
	movdqa	%xmm15, 256+48(%rbx)
	
	salsa8_core_3way_xmm
	paddd	0(%rbx), %xmm0
	paddd	16(%rbx), %xmm1
	paddd	32(%rbx), %xmm2
	paddd	48(%rbx), %xmm3
	paddd	128+0(%rbx), %xmm8
	paddd	128+16(%rbx), %xmm9
	paddd	128+32(%rbx), %xmm10
	paddd	128+48(%rbx), %xmm11
	paddd	256+0(%rbx), %xmm12
	paddd	256+16(%rbx), %xmm13
	paddd	256+32(%rbx), %xmm14
	paddd	256+48(%rbx), %xmm15
	movdqa	%xmm0, 0(%rsp)
	movdqa	%xmm1, 16(%rsp)
	movdqa	%xmm2, 32(%rsp)
	movdqa	%xmm3, 48(%rsp)
	movdqa	%xmm8, 128+0(%rsp)
	movdqa	%xmm9, 128+16(%rsp)
	movdqa	%xmm10, 128+32(%rsp)
	movdqa	%xmm11, 128+48(%rsp)
	movdqa	%xmm12, 256+0(%rsp)
	movdqa	%xmm13, 256+16(%rsp)
	movdqa	%xmm14, 256+32(%rsp)
	movdqa	%xmm15, 256+48(%rsp)
	
	pxor	64(%rbx), %xmm0
	pxor	80(%rbx), %xmm1
	pxor	96(%rbx), %xmm2
	pxor	112(%rbx), %xmm3
	pxor	128+64(%rbx), %xmm8
	pxor	128+80(%rbx), %xmm9
	pxor	128+96(%rbx), %xmm10
	pxor	128+112(%rbx), %xmm11
	pxor	256+64(%rbx), %xmm12
	pxor	256+80(%rbx), %xmm13
	pxor	256+96(%rbx), %xmm14
	pxor	256+112(%rbx), %xmm15
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm1, 80(%rsp)
	movdqa	%xmm2, 96(%rsp)
	movdqa	%xmm3, 112(%rsp)
	movdqa	%xmm8, 128+64(%rsp)
	movdqa	%xmm9, 128+80(%rsp)
	movdqa	%xmm10, 128+96(%rsp)
	movdqa	%xmm11, 128+112(%rsp)
	movdqa	%xmm12, 256+64(%rsp)
	movdqa	%xmm13, 256+80(%rsp)
	movdqa	%xmm14, 256+96(%rsp)
	movdqa	%xmm15, 256+112(%rsp)
	salsa8_core_3way_xmm
	paddd	64(%rsp), %xmm0
	paddd	80(%rsp), %xmm1
	paddd	96(%rsp), %xmm2
	paddd	112(%rsp), %xmm3
	paddd	128+64(%rsp), %xmm8
	paddd	128+80(%rsp), %xmm9
	paddd	128+96(%rsp), %xmm10
	paddd	128+112(%rsp), %xmm11
	paddd	256+64(%rsp), %xmm12
	paddd	256+80(%rsp), %xmm13
	paddd	256+96(%rsp), %xmm14
	paddd	256+112(%rsp), %xmm15
	
	addq	$3*128, %rbx
	cmpq	%rax, %rbx
	jne scrypt_core_3way_xmm_loop1
	
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm1, 80(%rsp)
	movdqa	%xmm2, 96(%rsp)
	movdqa	%xmm3, 112(%rsp)
	movdqa	%xmm8, 128+64(%rsp)
	movdqa	%xmm9, 128+80(%rsp)
	movdqa	%xmm10, 128+96(%rsp)
	movdqa	%xmm11, 128+112(%rsp)
	movdqa	%xmm12, 256+64(%rsp)
	movdqa	%xmm13, 256+80(%rsp)
	movdqa	%xmm14, 256+96(%rsp)
	movdqa	%xmm15, 256+112(%rsp)
	
	movq	%r8, %rcx
	subq	$1, %r8
scrypt_core_3way_xmm_loop2:
	movd	%xmm0, %ebp
	movd	%xmm8, %ebx
	movd	%xmm12, %eax
	pxor	0(%rsp), %xmm0
	pxor	16(%rsp), %xmm1
	pxor	32(%rsp), %xmm2
	pxor	48(%rsp), %xmm3
	pxor	128+0(%rsp), %xmm8
	pxor	128+16(%rsp), %xmm9
	pxor	128+32(%rsp), %xmm10
	pxor	128+48(%rsp), %xmm11
	pxor	256+0(%rsp), %xmm12
	pxor	256+16(%rsp), %xmm13
	pxor	256+32(%rsp), %xmm14
	pxor	256+48(%rsp), %xmm15
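	/* Compute each lane's lookup offset (3*j + lane) * 128 from the j
	 * values captured above, masked with N-1 in %r8d. */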
	andl	%r8d, %ebp
	leaq	(%rbp, %rbp, 2), %rbp
	shll	$7, %ebp
	andl	%r8d, %ebx
	leaq	1(%rbx, %rbx, 2), %rbx
	shll	$7, %ebx
	andl	%r8d, %eax
	leaq	2(%rax, %rax, 2), %rax
	shll	$7, %eax
	pxor	0(%rsi, %rbp), %xmm0
	pxor	16(%rsi, %rbp), %xmm1
	pxor	32(%rsi, %rbp), %xmm2
	pxor	48(%rsi, %rbp), %xmm3
	pxor	0(%rsi, %rbx), %xmm8
	pxor	16(%rsi, %rbx), %xmm9
	pxor	32(%rsi, %rbx), %xmm10
	pxor	48(%rsi, %rbx), %xmm11
	pxor	0(%rsi, %rax), %xmm12
	pxor	16(%rsi, %rax), %xmm13
	pxor	32(%rsi, %rax), %xmm14
	pxor	48(%rsi, %rax), %xmm15
	
	movdqa	%xmm0, 0(%rsp)
	movdqa	%xmm1, 16(%rsp)
	movdqa	%xmm2, 32(%rsp)
	movdqa	%xmm3, 48(%rsp)
	movdqa	%xmm8, 128+0(%rsp)
	movdqa	%xmm9, 128+16(%rsp)
	movdqa	%xmm10, 128+32(%rsp)
	movdqa	%xmm11, 128+48(%rsp)
	movdqa	%xmm12, 256+0(%rsp)
	movdqa	%xmm13, 256+16(%rsp)
	movdqa	%xmm14, 256+32(%rsp)
	movdqa	%xmm15, 256+48(%rsp)
	salsa8_core_3way_xmm
	paddd	0(%rsp), %xmm0
	paddd	16(%rsp), %xmm1
	paddd	32(%rsp), %xmm2
	paddd	48(%rsp), %xmm3
	paddd	128+0(%rsp), %xmm8
	paddd	128+16(%rsp), %xmm9
	paddd	128+32(%rsp), %xmm10
	paddd	128+48(%rsp), %xmm11
	paddd	256+0(%rsp), %xmm12
	paddd	256+16(%rsp), %xmm13
	paddd	256+32(%rsp), %xmm14
	paddd	256+48(%rsp), %xmm15
	movdqa	%xmm0, 0(%rsp)
	movdqa	%xmm1, 16(%rsp)
	movdqa	%xmm2, 32(%rsp)
	movdqa	%xmm3, 48(%rsp)
	movdqa	%xmm8, 128+0(%rsp)
	movdqa	%xmm9, 128+16(%rsp)
	movdqa	%xmm10, 128+32(%rsp)
	movdqa	%xmm11, 128+48(%rsp)
	movdqa	%xmm12, 256+0(%rsp)
	movdqa	%xmm13, 256+16(%rsp)
	movdqa	%xmm14, 256+32(%rsp)
	movdqa	%xmm15, 256+48(%rsp)
	
	pxor	64(%rsi, %rbp), %xmm0
	pxor	80(%rsi, %rbp), %xmm1
	pxor	96(%rsi, %rbp), %xmm2
	pxor	112(%rsi, %rbp), %xmm3
	pxor	64(%rsi, %rbx), %xmm8
	pxor	80(%rsi, %rbx), %xmm9
	pxor	96(%rsi, %rbx), %xmm10
	pxor	112(%rsi, %rbx), %xmm11
	pxor	64(%rsi, %rax), %xmm12
	pxor	80(%rsi, %rax), %xmm13
	pxor	96(%rsi, %rax), %xmm14
	pxor	112(%rsi, %rax), %xmm15
	pxor	64(%rsp), %xmm0
	pxor	80(%rsp), %xmm1
	pxor	96(%rsp), %xmm2
	pxor	112(%rsp), %xmm3
	pxor	128+64(%rsp), %xmm8
	pxor	128+80(%rsp), %xmm9
	pxor	128+96(%rsp), %xmm10
	pxor	128+112(%rsp), %xmm11
	pxor	256+64(%rsp), %xmm12
	pxor	256+80(%rsp), %xmm13
	pxor	256+96(%rsp), %xmm14
	pxor	256+112(%rsp), %xmm15
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm1, 80(%rsp)
	movdqa	%xmm2, 96(%rsp)
	movdqa	%xmm3, 112(%rsp)
	movdqa	%xmm8, 128+64(%rsp)
	movdqa	%xmm9, 128+80(%rsp)
	movdqa	%xmm10, 128+96(%rsp)
	movdqa	%xmm11, 128+112(%rsp)
	movdqa	%xmm12, 256+64(%rsp)
	movdqa	%xmm13, 256+80(%rsp)
	movdqa	%xmm14, 256+96(%rsp)
	movdqa	%xmm15, 256+112(%rsp)
	salsa8_core_3way_xmm
	paddd	64(%rsp), %xmm0
	paddd	80(%rsp), %xmm1
	paddd	96(%rsp), %xmm2
	paddd	112(%rsp), %xmm3
	paddd	128+64(%rsp), %xmm8
	paddd	128+80(%rsp), %xmm9
	paddd	128+96(%rsp), %xmm10
	paddd	128+112(%rsp), %xmm11
	paddd	256+64(%rsp), %xmm12
	paddd	256+80(%rsp), %xmm13
	paddd	256+96(%rsp), %xmm14
	paddd	256+112(%rsp), %xmm15
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm1, 80(%rsp)
	movdqa	%xmm2, 96(%rsp)
	movdqa	%xmm3, 112(%rsp)
	movdqa	%xmm8, 128+64(%rsp)
	movdqa	%xmm9, 128+80(%rsp)
	movdqa	%xmm10, 128+96(%rsp)
	movdqa	%xmm11, 128+112(%rsp)
	movdqa	%xmm12, 256+64(%rsp)
	movdqa	%xmm13, 256+80(%rsp)
	movdqa	%xmm14, 256+96(%rsp)
	movdqa	%xmm15, 256+112(%rsp)
	
	subq	$1, %rcx
	ja scrypt_core_3way_xmm_loop2
	
	scrypt_shuffle %rsp, 0, %rdi, 0
	scrypt_shuffle %rsp, 64, %rdi, 64
	scrypt_shuffle %rsp, 128, %rdi, 128
	scrypt_shuffle %rsp, 192, %rdi, 192
	scrypt_shuffle %rsp, 256, %rdi, 256
	scrypt_shuffle %rsp, 320, %rdi, 320
	
	scrypt_core_3way_cleanup
	ret


#if defined(USE_AVX2)

.macro salsa8_core_6way_avx2_doubleround
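	/* One Salsa20 double-round on six interleaved states: each %ymm
	 * group (0-3, 8-11, 12-15) carries two 64-byte states, one per
	 * 128-bit lane. Rotates are again a shift pair plus two vpxor. */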
	vpaddd	%ymm0, %ymm1, %ymm4
	vpaddd	%ymm8, %ymm9, %ymm6
	vpaddd	%ymm12, %ymm13, %ymm7
	vpslld	$7, %ymm4, %ymm5
	vpsrld	$25, %ymm4, %ymm4
	vpxor	%ymm5, %ymm3, %ymm3
	vpxor	%ymm4, %ymm3, %ymm3
	vpslld	$7, %ymm6, %ymm5
	vpsrld	$25, %ymm6, %ymm6
	vpxor	%ymm5, %ymm11, %ymm11
	vpxor	%ymm6, %ymm11, %ymm11
	vpslld	$7, %ymm7, %ymm5
	vpsrld	$25, %ymm7, %ymm7
	vpxor	%ymm5, %ymm15, %ymm15
	vpxor	%ymm7, %ymm15, %ymm15
	
	vpaddd	%ymm3, %ymm0, %ymm4
	vpaddd	%ymm11, %ymm8, %ymm6
	vpaddd	%ymm15, %ymm12, %ymm7
	vpslld	$9, %ymm4, %ymm5
	vpsrld	$23, %ymm4, %ymm4
	vpxor	%ymm5, %ymm2, %ymm2
	vpxor	%ymm4, %ymm2, %ymm2
	vpslld	$9, %ymm6, %ymm5
	vpsrld	$23, %ymm6, %ymm6
	vpxor	%ymm5, %ymm10, %ymm10
	vpxor	%ymm6, %ymm10, %ymm10
	vpslld	$9, %ymm7, %ymm5
	vpsrld	$23, %ymm7, %ymm7
	vpxor	%ymm5, %ymm14, %ymm14
	vpxor	%ymm7, %ymm14, %ymm14
	
	vpaddd	%ymm2, %ymm3, %ymm4
	vpaddd	%ymm10, %ymm11, %ymm6
	vpaddd	%ymm14, %ymm15, %ymm7
	vpslld	$13, %ymm4, %ymm5
	vpsrld	$19, %ymm4, %ymm4
	vpshufd	$0x93, %ymm3, %ymm3
	vpshufd	$0x93, %ymm11, %ymm11
	vpshufd	$0x93, %ymm15, %ymm15
	vpxor	%ymm5, %ymm1, %ymm1
	vpxor	%ymm4, %ymm1, %ymm1
	vpslld	$13, %ymm6, %ymm5
	vpsrld	$19, %ymm6, %ymm6
	vpxor	%ymm5, %ymm9, %ymm9
	vpxor	%ymm6, %ymm9, %ymm9
	vpslld	$13, %ymm7, %ymm5
	vpsrld	$19, %ymm7, %ymm7
	vpxor	%ymm5, %ymm13, %ymm13
	vpxor	%ymm7, %ymm13, %ymm13
	
	vpaddd	%ymm1, %ymm2, %ymm4
	vpaddd	%ymm9, %ymm10, %ymm6
	vpaddd	%ymm13, %ymm14, %ymm7
	vpslld	$18, %ymm4, %ymm5
	vpsrld	$14, %ymm4, %ymm4
	vpshufd	$0x4e, %ymm2, %ymm2
	vpshufd	$0x4e, %ymm10, %ymm10
	vpshufd	$0x4e, %ymm14, %ymm14
	vpxor	%ymm5, %ymm0, %ymm0
	vpxor	%ymm4, %ymm0, %ymm0
	vpslld	$18, %ymm6, %ymm5
	vpsrld	$14, %ymm6, %ymm6
	vpxor	%ymm5, %ymm8, %ymm8
	vpxor	%ymm6, %ymm8, %ymm8
	vpslld	$18, %ymm7, %ymm5
	vpsrld	$14, %ymm7, %ymm7
	vpxor	%ymm5, %ymm12, %ymm12
	vpxor	%ymm7, %ymm12, %ymm12
	
	vpaddd	%ymm0, %ymm3, %ymm4
	vpaddd	%ymm8, %ymm11, %ymm6
	vpaddd	%ymm12, %ymm15, %ymm7
	vpslld	$7, %ymm4, %ymm5
	vpsrld	$25, %ymm4, %ymm4
	vpshufd	$0x39, %ymm1, %ymm1
	vpxor	%ymm5, %ymm1, %ymm1
	vpxor	%ymm4, %ymm1, %ymm1
	vpslld	$7, %ymm6, %ymm5
	vpsrld	$25, %ymm6, %ymm6
	vpshufd	$0x39, %ymm9, %ymm9
	vpxor	%ymm5, %ymm9, %ymm9
	vpxor	%ymm6, %ymm9, %ymm9
	vpslld	$7, %ymm7, %ymm5
	vpsrld	$25, %ymm7, %ymm7
	vpshufd	$0x39, %ymm13, %ymm13
	vpxor	%ymm5, %ymm13, %ymm13
	vpxor	%ymm7, %ymm13, %ymm13
	
	vpaddd	%ymm1, %ymm0, %ymm4
	vpaddd	%ymm9, %ymm8, %ymm6
	vpaddd	%ymm13, %ymm12, %ymm7
	vpslld	$9, %ymm4, %ymm5
	vpsrld	$23, %ymm4, %ymm4
	vpxor	%ymm5, %ymm2, %ymm2
	vpxor	%ymm4, %ymm2, %ymm2
	vpslld	$9, %ymm6, %ymm5
	vpsrld	$23, %ymm6, %ymm6
	vpxor	%ymm5, %ymm10, %ymm10
	vpxor	%ymm6, %ymm10, %ymm10
	vpslld	$9, %ymm7, %ymm5
	vpsrld	$23, %ymm7, %ymm7
	vpxor	%ymm5, %ymm14, %ymm14
	vpxor	%ymm7, %ymm14, %ymm14
	
	vpaddd	%ymm2, %ymm1, %ymm4
	vpaddd	%ymm10, %ymm9, %ymm6
	vpaddd	%ymm14, %ymm13, %ymm7
	vpslld	$13, %ymm4, %ymm5
	vpsrld	$19, %ymm4, %ymm4
	vpshufd	$0x93, %ymm1, %ymm1
	vpshufd	$0x93, %ymm9, %ymm9
	vpshufd	$0x93, %ymm13, %ymm13
	vpxor	%ymm5, %ymm3, %ymm3
	vpxor	%ymm4, %ymm3, %ymm3
	vpslld	$13, %ymm6, %ymm5
	vpsrld	$19, %ymm6, %ymm6
	vpxor	%ymm5, %ymm11, %ymm11
	vpxor	%ymm6, %ymm11, %ymm11
	vpslld	$13, %ymm7, %ymm5
	vpsrld	$19, %ymm7, %ymm7
	vpxor	%ymm5, %ymm15, %ymm15
	vpxor	%ymm7, %ymm15, %ymm15
	
	vpaddd	%ymm3, %ymm2, %ymm4
	vpaddd	%ymm11, %ymm10, %ymm6
	vpaddd	%ymm15, %ymm14, %ymm7
	vpslld	$18, %ymm4, %ymm5
	vpsrld	$14, %ymm4, %ymm4
	vpshufd	$0x4e, %ymm2, %ymm2
	vpshufd	$0x4e, %ymm10, %ymm10
	vpxor	%ymm5, %ymm0, %ymm0
	vpxor	%ymm4, %ymm0, %ymm0
	vpslld	$18, %ymm6, %ymm5
	vpsrld	$14, %ymm6, %ymm6
	vpshufd	$0x4e, %ymm14, %ymm14
	vpshufd	$0x39, %ymm11, %ymm11
	vpxor	%ymm5, %ymm8, %ymm8
	vpxor	%ymm6, %ymm8, %ymm8
	vpslld	$18, %ymm7, %ymm5
	vpsrld	$14, %ymm7, %ymm7
	vpshufd	$0x39, %ymm3, %ymm3
	vpshufd	$0x39, %ymm15, %ymm15
	vpxor	%ymm5, %ymm12, %ymm12
	vpxor	%ymm7, %ymm12, %ymm12
.endm

.macro salsa8_core_6way_avx2
	salsa8_core_6way_avx2_doubleround
	salsa8_core_6way_avx2_doubleround
	salsa8_core_6way_avx2_doubleround
	salsa8_core_6way_avx2_doubleround
.endm
	
	.text
	.p2align 6
	.globl scrypt_core_6way
	.globl _scrypt_core_6way
scrypt_core_6way:
_scrypt_core_6way:
	pushq	%rbx
	pushq	%rbp
#if defined(_WIN64) || defined(__CYGWIN__)
	subq	$176, %rsp
	vmovdqa	%xmm6, 8(%rsp)
	vmovdqa	%xmm7, 24(%rsp)
	vmovdqa	%xmm8, 40(%rsp)
	vmovdqa	%xmm9, 56(%rsp)
	vmovdqa	%xmm10, 72(%rsp)
	vmovdqa	%xmm11, 88(%rsp)
	vmovdqa	%xmm12, 104(%rsp)
	vmovdqa	%xmm13, 120(%rsp)
	vmovdqa	%xmm14, 136(%rsp)
	vmovdqa	%xmm15, 152(%rsp)
	pushq	%rdi
	pushq	%rsi
	movq	%rcx, %rdi
	movq	%rdx, %rsi
#else
	movq	%rdx, %r8
#endif
	movq	%rsp, %rdx
	subq	$768, %rsp
	andq	$-128, %rsp
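	/* Reserve 768 bytes of 128-byte-aligned scratch for the six X
	 * blocks; %rdx keeps the caller's %rsp for the cleanup macro. */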
	
.macro scrypt_core_6way_cleanup
	movq	%rdx, %rsp
#if defined(_WIN64) || defined(__CYGWIN__)
	popq	%rsi
	popq	%rdi
	vmovdqa	8(%rsp), %xmm6
	vmovdqa	24(%rsp), %xmm7
	vmovdqa	40(%rsp), %xmm8
	vmovdqa	56(%rsp), %xmm9
	vmovdqa	72(%rsp), %xmm10
	vmovdqa	88(%rsp), %xmm11
	vmovdqa	104(%rsp), %xmm12
	vmovdqa	120(%rsp), %xmm13
	vmovdqa	136(%rsp), %xmm14
	vmovdqa	152(%rsp), %xmm15
	addq	$176, %rsp
#endif
	popq	%rbp
	popq	%rbx
.endm

.macro scrypt_shuffle_pack2 src, so, dest, do
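	/* Gather two 64-byte Salsa states (\so and \so+128) into %ymm0-3,
	 * one state per 128-bit lane, and apply the scrypt word shuffle via
	 * vpblendd before storing 128 contiguous bytes at \do. */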
	vmovdqa	\so+0*16(\src), %xmm0
	vmovdqa	\so+1*16(\src), %xmm1
	vmovdqa	\so+2*16(\src), %xmm2
	vmovdqa	\so+3*16(\src), %xmm3
	vinserti128	$1, \so+128+0*16(\src), %ymm0, %ymm0
	vinserti128	$1, \so+128+1*16(\src), %ymm1, %ymm1
	vinserti128	$1, \so+128+2*16(\src), %ymm2, %ymm2
	vinserti128	$1, \so+128+3*16(\src), %ymm3, %ymm3
	vpblendd	$0x33, %ymm0, %ymm2, %ymm4
	vpblendd	$0xcc, %ymm1, %ymm3, %ymm5
	vpblendd	$0x33, %ymm2, %ymm0, %ymm6
	vpblendd	$0xcc, %ymm3, %ymm1, %ymm7
	vpblendd	$0x55, %ymm7, %ymm6, %ymm3
	vpblendd	$0x55, %ymm6, %ymm5, %ymm2
	vpblendd	$0x55, %ymm5, %ymm4, %ymm1
	vpblendd	$0x55, %ymm4, %ymm7, %ymm0
	vmovdqa	%ymm0, \do+0*32(\dest)
	vmovdqa	%ymm1, \do+1*32(\dest)
	vmovdqa	%ymm2, \do+2*32(\dest)
	vmovdqa	%ymm3, \do+3*32(\dest)
.endm

.macro scrypt_shuffle_unpack2 src, so, dest, do
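	/* Inverse of scrypt_shuffle_pack2: the same (self-inverse) blend
	 * permutation, then each %ymm is split back into two 64-byte states
	 * 128 bytes apart. */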
	vmovdqa	\so+0*32(\src), %ymm0
	vmovdqa	\so+1*32(\src), %ymm1
	vmovdqa	\so+2*32(\src), %ymm2
	vmovdqa	\so+3*32(\src), %ymm3
	vpblendd	$0x33, %ymm0, %ymm2, %ymm4
	vpblendd	$0xcc, %ymm1, %ymm3, %ymm5
	vpblendd	$0x33, %ymm2, %ymm0, %ymm6
	vpblendd	$0xcc, %ymm3, %ymm1, %ymm7
	vpblendd	$0x55, %ymm7, %ymm6, %ymm3
	vpblendd	$0x55, %ymm6, %ymm5, %ymm2
	vpblendd	$0x55, %ymm5, %ymm4, %ymm1
	vpblendd	$0x55, %ymm4, %ymm7, %ymm0
	vmovdqa	%xmm0, \do+0*16(\dest)
	vmovdqa	%xmm1, \do+1*16(\dest)
	vmovdqa	%xmm2, \do+2*16(\dest)
	vmovdqa	%xmm3, \do+3*16(\dest)
	vextracti128	$1, %ymm0, \do+128+0*16(\dest)
	vextracti128	$1, %ymm1, \do+128+1*16(\dest)
	vextracti128	$1, %ymm2, \do+128+2*16(\dest)
	vextracti128	$1, %ymm3, \do+128+3*16(\dest)
.endm
	
scrypt_core_6way_avx2:
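	/* 6-way AVX2 core: pack six 128-byte X blocks into aligned scratch,
	 * fill V in loop 1, perform the N Integerify lookups in loop 2, and
	 * unpack the results back to %rdi. */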
	scrypt_shuffle_pack2 %rdi, 0*256+0, %rsp, 0*128
	scrypt_shuffle_pack2 %rdi, 0*256+64, %rsp, 1*128
	scrypt_shuffle_pack2 %rdi, 1*256+0, %rsp, 2*128
	scrypt_shuffle_pack2 %rdi, 1*256+64, %rsp, 3*128
	scrypt_shuffle_pack2 %rdi, 2*256+0, %rsp, 4*128
	scrypt_shuffle_pack2 %rdi, 2*256+64, %rsp, 5*128
	
	vmovdqa	0*256+4*32(%rsp), %ymm0
	vmovdqa	0*256+5*32(%rsp), %ymm1
	vmovdqa	0*256+6*32(%rsp), %ymm2
	vmovdqa	0*256+7*32(%rsp), %ymm3
	vmovdqa	1*256+4*32(%rsp), %ymm8
	vmovdqa	1*256+5*32(%rsp), %ymm9
	vmovdqa	1*256+6*32(%rsp), %ymm10
	vmovdqa	1*256+7*32(%rsp), %ymm11
	vmovdqa	2*256+4*32(%rsp), %ymm12
	vmovdqa	2*256+5*32(%rsp), %ymm13
	vmovdqa	2*256+6*32(%rsp), %ymm14
	vmovdqa	2*256+7*32(%rsp), %ymm15
	
	movq	%rsi, %rbx
	leaq	(%r8, %r8, 2), %rax
	shlq	$8, %rax
	addq	%rsi, %rax
scrypt_core_6way_avx2_loop1:
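	/* Loop 1: V[i] = X, then X = BlockMix(X); the %ymm registers carry
	 * the second 64-byte halves between iterations, and %rbx advances
	 * by 6*128 bytes per step. */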
	vmovdqa	%ymm0, 0*256+4*32(%rbx)
	vmovdqa	%ymm1, 0*256+5*32(%rbx)
	vmovdqa	%ymm2, 0*256+6*32(%rbx)
	vmovdqa	%ymm3, 0*256+7*32(%rbx)
	vpxor	0*256+0*32(%rsp), %ymm0, %ymm0
	vpxor	0*256+1*32(%rsp), %ymm1, %ymm1
	vpxor	0*256+2*32(%rsp), %ymm2, %ymm2
	vpxor	0*256+3*32(%rsp), %ymm3, %ymm3
	vmovdqa	%ymm8, 1*256+4*32(%rbx)
	vmovdqa	%ymm9, 1*256+5*32(%rbx)
	vmovdqa	%ymm10, 1*256+6*32(%rbx)
	vmovdqa	%ymm11, 1*256+7*32(%rbx)
	vpxor	1*256+0*32(%rsp), %ymm8, %ymm8
	vpxor	1*256+1*32(%rsp), %ymm9, %ymm9
	vpxor	1*256+2*32(%rsp), %ymm10, %ymm10
	vpxor	1*256+3*32(%rsp), %ymm11, %ymm11
	vmovdqa	%ymm12, 2*256+4*32(%rbx)
	vmovdqa	%ymm13, 2*256+5*32(%rbx)
	vmovdqa	%ymm14, 2*256+6*32(%rbx)
	vmovdqa	%ymm15, 2*256+7*32(%rbx)
	vpxor	2*256+0*32(%rsp), %ymm12, %ymm12
	vpxor	2*256+1*32(%rsp), %ymm13, %ymm13
	vpxor	2*256+2*32(%rsp), %ymm14, %ymm14
	vpxor	2*256+3*32(%rsp), %ymm15, %ymm15
	vmovdqa	%ymm0, 0*256+0*32(%rbx)
	vmovdqa	%ymm1, 0*256+1*32(%rbx)
	vmovdqa	%ymm2, 0*256+2*32(%rbx)
	vmovdqa	%ymm3, 0*256+3*32(%rbx)
	vmovdqa	%ymm8, 1*256+0*32(%rbx)
	vmovdqa	%ymm9, 1*256+1*32(%rbx)
	vmovdqa	%ymm10, 1*256+2*32(%rbx)
	vmovdqa	%ymm11, 1*256+3*32(%rbx)
	vmovdqa	%ymm12, 2*256+0*32(%rbx)
	vmovdqa	%ymm13, 2*256+1*32(%rbx)
	vmovdqa	%ymm14, 2*256+2*32(%rbx)
	vmovdqa	%ymm15, 2*256+3*32(%rbx)
	
	salsa8_core_6way_avx2
	vpaddd	0*256+0*32(%rbx), %ymm0, %ymm0
	vpaddd	0*256+1*32(%rbx), %ymm1, %ymm1
	vpaddd	0*256+2*32(%rbx), %ymm2, %ymm2
	vpaddd	0*256+3*32(%rbx), %ymm3, %ymm3
	vpaddd	1*256+0*32(%rbx), %ymm8, %ymm8
	vpaddd	1*256+1*32(%rbx), %ymm9, %ymm9
	vpaddd	1*256+2*32(%rbx), %ymm10, %ymm10
	vpaddd	1*256+3*32(%rbx), %ymm11, %ymm11
	vpaddd	2*256+0*32(%rbx), %ymm12, %ymm12
	vpaddd	2*256+1*32(%rbx), %ymm13, %ymm13
	vpaddd	2*256+2*32(%rbx), %ymm14, %ymm14
	vpaddd	2*256+3*32(%rbx), %ymm15, %ymm15
	vmovdqa	%ymm0, 0*256+0*32(%rsp)
	vmovdqa	%ymm1, 0*256+1*32(%rsp)
	vmovdqa	%ymm2, 0*256+2*32(%rsp)
	vmovdqa	%ymm3, 0*256+3*32(%rsp)
	vmovdqa	%ymm8, 1*256+0*32(%rsp)
	vmovdqa	%ymm9, 1*256+1*32(%rsp)
	vmovdqa	%ymm10, 1*256+2*32(%rsp)
	vmovdqa	%ymm11, 1*256+3*32(%rsp)
	vmovdqa	%ymm12, 2*256+0*32(%rsp)
	vmovdqa	%ymm13, 2*256+1*32(%rsp)
	vmovdqa	%ymm14, 2*256+2*32(%rsp)
	vmovdqa	%ymm15, 2*256+3*32(%rsp)
	
	vpxor	0*256+4*32(%rbx), %ymm0, %ymm0
	vpxor	0*256+5*32(%rbx), %ymm1, %ymm1
	vpxor	0*256+6*32(%rbx), %ymm2, %ymm2
	vpxor	0*256+7*32(%rbx), %ymm3, %ymm3
	vpxor	1*256+4*32(%rbx), %ymm8, %ymm8
	vpxor	1*256+5*32(%rbx), %ymm9, %ymm9
	vpxor	1*256+6*32(%rbx), %ymm10, %ymm10
	vpxor	1*256+7*32(%rbx), %ymm11, %ymm11
	vpxor	2*256+4*32(%rbx), %ymm12, %ymm12
	vpxor	2*256+5*32(%rbx), %ymm13, %ymm13
	vpxor	2*256+6*32(%rbx), %ymm14, %ymm14
	vpxor	2*256+7*32(%rbx), %ymm15, %ymm15
	vmovdqa	%ymm0, 0*256+4*32(%rsp)
	vmovdqa	%ymm1, 0*256+5*32(%rsp)
	vmovdqa	%ymm2, 0*256+6*32(%rsp)
	vmovdqa	%ymm3, 0*256+7*32(%rsp)
	vmovdqa	%ymm8, 1*256+4*32(%rsp)
	vmovdqa	%ymm9, 1*256+5*32(%rsp)
	vmovdqa	%ymm10, 1*256+6*32(%rsp)
	vmovdqa	%ymm11, 1*256+7*32(%rsp)
	vmovdqa	%ymm12, 2*256+4*32(%rsp)
	vmovdqa	%ymm13, 2*256+5*32(%rsp)
	vmovdqa	%ymm14, 2*256+6*32(%rsp)
	vmovdqa	%ymm15, 2*256+7*32(%rsp)
	salsa8_core_6way_avx2
	vpaddd	0*256+4*32(%rsp), %ymm0, %ymm0
	vpaddd	0*256+5*32(%rsp), %ymm1, %ymm1
	vpaddd	0*256+6*32(%rsp), %ymm2, %ymm2
	vpaddd	0*256+7*32(%rsp), %ymm3, %ymm3
	vpaddd	1*256+4*32(%rsp), %ymm8, %ymm8
	vpaddd	1*256+5*32(%rsp), %ymm9, %ymm9
	vpaddd	1*256+6*32(%rsp), %ymm10, %ymm10
	vpaddd	1*256+7*32(%rsp), %ymm11, %ymm11
	vpaddd	2*256+4*32(%rsp), %ymm12, %ymm12
	vpaddd	2*256+5*32(%rsp), %ymm13, %ymm13
	vpaddd	2*256+6*32(%rsp), %ymm14, %ymm14
	vpaddd	2*256+7*32(%rsp), %ymm15, %ymm15
	
	addq	$6*128, %rbx
	cmpq	%rax, %rbx
	jne scrypt_core_6way_avx2_loop1
	
	vmovdqa	%ymm0, 0*256+4*32(%rsp)
	vmovdqa	%ymm1, 0*256+5*32(%rsp)
	vmovdqa	%ymm2, 0*256+6*32(%rsp)
	vmovdqa	%ymm3, 0*256+7*32(%rsp)
	vmovdqa	%ymm8, 1*256+4*32(%rsp)
	vmovdqa	%ymm9, 1*256+5*32(%rsp)
	vmovdqa	%ymm10, 1*256+6*32(%rsp)
	vmovdqa	%ymm11, 1*256+7*32(%rsp)
	vmovdqa	%ymm12, 2*256+4*32(%rsp)
	vmovdqa	%ymm13, 2*256+5*32(%rsp)
	vmovdqa	%ymm14, 2*256+6*32(%rsp)
	vmovdqa	%ymm15, 2*256+7*32(%rsp)
	
	movq	%r8, %rcx
	leaq	-1(%r8), %r11
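	/* Loop 2: X ^= V[Integerify(X) mod N], X = BlockMix(X). %r11 = N-1
	 * is the Integerify mask; %r8 is reused below as the fourth lane's
	 * index. */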
scrypt_core_6way_avx2_loop2:
	vmovd	%xmm0, %ebp
	vmovd	%xmm8, %ebx
	vmovd	%xmm12, %eax
	vextracti128	$1, %ymm0, %xmm4
	vextracti128	$1, %ymm8, %xmm5
	vextracti128	$1, %ymm12, %xmm6
	vmovd	%xmm4, %r8d
	vmovd	%xmm5, %r9d
	vmovd	%xmm6, %r10d
	vpxor	0*256+0*32(%rsp), %ymm0, %ymm0
	vpxor	0*256+1*32(%rsp), %ymm1, %ymm1
	vpxor	0*256+2*32(%rsp), %ymm2, %ymm2
	vpxor	0*256+3*32(%rsp), %ymm3, %ymm3
	vpxor	1*256+0*32(%rsp), %ymm8, %ymm8
	vpxor	1*256+1*32(%rsp), %ymm9, %ymm9
	vpxor	1*256+2*32(%rsp), %ymm10, %ymm10
	vpxor	1*256+3*32(%rsp), %ymm11, %ymm11
	vpxor	2*256+0*32(%rsp), %ymm12, %ymm12
	vpxor	2*256+1*32(%rsp), %ymm13, %ymm13
	vpxor	2*256+2*32(%rsp), %ymm14, %ymm14
	vpxor	2*256+3*32(%rsp), %ymm15, %ymm15
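	/* Per-lane offsets into the interleaved V: (3*j + k) * 256 bytes.
	 * %rbp/%rbx/%rax index the low 128-bit lanes, %r8/%r9/%r10 the high
	 * ones; the +16 in the vinserti128 addressing below selects the
	 * high half of each 32-byte row. */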
	andl	%r11d, %ebp
	leaq	0(%rbp, %rbp, 2), %rbp
	shll	$8, %ebp
	andl	%r11d, %ebx
	leaq	1(%rbx, %rbx, 2), %rbx
	shll	$8, %ebx
	andl	%r11d, %eax
	leaq	2(%rax, %rax, 2), %rax
	shll	$8, %eax
	andl	%r11d, %r8d
	leaq	0(%r8, %r8, 2), %r8
	shll	$8, %r8d
	andl	%r11d, %r9d
	leaq	1(%r9, %r9, 2), %r9
	shll	$8, %r9d
	andl	%r11d, %r10d
	leaq	2(%r10, %r10, 2), %r10
	shll	$8, %r10d
	vmovdqa	0*32(%rsi, %rbp), %xmm4
	vinserti128	$1, 0*32+16(%rsi, %r8), %ymm4, %ymm4
	vmovdqa	1*32(%rsi, %rbp), %xmm5
	vinserti128	$1, 1*32+16(%rsi, %r8), %ymm5, %ymm5
	vmovdqa	2*32(%rsi, %rbp), %xmm6
	vinserti128	$1, 2*32+16(%rsi, %r8), %ymm6, %ymm6
	vmovdqa	3*32(%rsi, %rbp), %xmm7
	vinserti128	$1, 3*32+16(%rsi, %r8), %ymm7, %ymm7
	vpxor	%ymm4, %ymm0, %ymm0
	vpxor	%ymm5, %ymm1, %ymm1
	vpxor	%ymm6, %ymm2, %ymm2
	vpxor	%ymm7, %ymm3, %ymm3
	vmovdqa	0*32(%rsi, %rbx), %xmm4
	vinserti128	$1, 0*32+16(%rsi, %r9), %ymm4, %ymm4
	vmovdqa	1*32(%rsi, %rbx), %xmm5
	vinserti128	$1, 1*32+16(%rsi, %r9), %ymm5, %ymm5
	vmovdqa	2*32(%rsi, %rbx), %xmm6
	vinserti128	$1, 2*32+16(%rsi, %r9), %ymm6, %ymm6
	vmovdqa	3*32(%rsi, %rbx), %xmm7
	vinserti128	$1, 3*32+16(%rsi, %r9), %ymm7, %ymm7
	vpxor	%ymm4, %ymm8, %ymm8
	vpxor	%ymm5, %ymm9, %ymm9
	vpxor	%ymm6, %ymm10, %ymm10
	vpxor	%ymm7, %ymm11, %ymm11
	vmovdqa	0*32(%rsi, %rax), %xmm4
	vinserti128	$1, 0*32+16(%rsi, %r10), %ymm4, %ymm4
	vmovdqa	1*32(%rsi, %rax), %xmm5
	vinserti128	$1, 1*32+16(%rsi, %r10), %ymm5, %ymm5
	vmovdqa	2*32(%rsi, %rax), %xmm6
	vinserti128	$1, 2*32+16(%rsi, %r10), %ymm6, %ymm6
	vmovdqa	3*32(%rsi, %rax), %xmm7
	vinserti128	$1, 3*32+16(%rsi, %r10), %ymm7, %ymm7
	vpxor	%ymm4, %ymm12, %ymm12
	vpxor	%ymm5, %ymm13, %ymm13
	vpxor	%ymm6, %ymm14, %ymm14
	vpxor	%ymm7, %ymm15, %ymm15
	
	vmovdqa	%ymm0, 0*256+0*32(%rsp)
	vmovdqa	%ymm1, 0*256+1*32(%rsp)
	vmovdqa	%ymm2, 0*256+2*32(%rsp)
	vmovdqa	%ymm3, 0*256+3*32(%rsp)
	vmovdqa	%ymm8, 1*256+0*32(%rsp)
	vmovdqa	%ymm9, 1*256+1*32(%rsp)
	vmovdqa	%ymm10, 1*256+2*32(%rsp)
	vmovdqa	%ymm11, 1*256+3*32(%rsp)
	vmovdqa	%ymm12, 2*256+0*32(%rsp)
	vmovdqa	%ymm13, 2*256+1*32(%rsp)
	vmovdqa	%ymm14, 2*256+2*32(%rsp)
	vmovdqa	%ymm15, 2*256+3*32(%rsp)
	salsa8_core_6way_avx2
	vpaddd	0*256+0*32(%rsp), %ymm0, %ymm0
	vpaddd	0*256+1*32(%rsp), %ymm1, %ymm1
	vpaddd	0*256+2*32(%rsp), %ymm2, %ymm2
	vpaddd	0*256+3*32(%rsp), %ymm3, %ymm3
	vpaddd	1*256+0*32(%rsp), %ymm8, %ymm8
	vpaddd	1*256+1*32(%rsp), %ymm9, %ymm9
	vpaddd	1*256+2*32(%rsp), %ymm10, %ymm10
	vpaddd	1*256+3*32(%rsp), %ymm11, %ymm11
	vpaddd	2*256+0*32(%rsp), %ymm12, %ymm12
	vpaddd	2*256+1*32(%rsp), %ymm13, %ymm13
	vpaddd	2*256+2*32(%rsp), %ymm14, %ymm14
	vpaddd	2*256+3*32(%rsp), %ymm15, %ymm15
	vmovdqa	%ymm0, 0*256+0*32(%rsp)
	vmovdqa	%ymm1, 0*256+1*32(%rsp)
	vmovdqa	%ymm2, 0*256+2*32(%rsp)
	vmovdqa	%ymm3, 0*256+3*32(%rsp)
	vmovdqa	%ymm8, 1*256+0*32(%rsp)
	vmovdqa	%ymm9, 1*256+1*32(%rsp)
	vmovdqa	%ymm10, 1*256+2*32(%rsp)
	vmovdqa	%ymm11, 1*256+3*32(%rsp)
	vmovdqa	%ymm12, 2*256+0*32(%rsp)
	vmovdqa	%ymm13, 2*256+1*32(%rsp)
	vmovdqa	%ymm14, 2*256+2*32(%rsp)
	vmovdqa	%ymm15, 2*256+3*32(%rsp)
	
	vmovdqa	4*32(%rsi, %rbp), %xmm4
	vinserti128	$1, 4*32+16(%rsi, %r8), %ymm4, %ymm4
	vmovdqa	5*32(%rsi, %rbp), %xmm5
	vinserti128	$1, 5*32+16(%rsi, %r8), %ymm5, %ymm5
	vmovdqa	6*32(%rsi, %rbp), %xmm6
	vinserti128	$1, 6*32+16(%rsi, %r8), %ymm6, %ymm6
	vmovdqa	7*32(%rsi, %rbp), %xmm7
	vinserti128	$1, 7*32+16(%rsi, %r8), %ymm7, %ymm7
	vpxor	%ymm4, %ymm0, %ymm0
	vpxor	%ymm5, %ymm1, %ymm1
	vpxor	%ymm6, %ymm2, %ymm2
	vpxor	%ymm7, %ymm3, %ymm3
	vmovdqa	4*32(%rsi, %rbx), %xmm4
	vinserti128	$1, 4*32+16(%rsi, %r9), %ymm4, %ymm4
	vmovdqa	5*32(%rsi, %rbx), %xmm5
	vinserti128	$1, 5*32+16(%rsi, %r9), %ymm5, %ymm5
	vmovdqa	6*32(%rsi, %rbx), %xmm6
	vinserti128	$1, 6*32+16(%rsi, %r9), %ymm6, %ymm6
	vmovdqa	7*32(%rsi, %rbx), %xmm7
	vinserti128	$1, 7*32+16(%rsi, %r9), %ymm7, %ymm7
	vpxor	%ymm4, %ymm8, %ymm8
	vpxor	%ymm5, %ymm9, %ymm9
	vpxor	%ymm6, %ymm10, %ymm10
	vpxor	%ymm7, %ymm11, %ymm11
	vmovdqa	4*32(%rsi, %rax), %xmm4
	vinserti128	$1, 4*32+16(%rsi, %r10), %ymm4, %ymm4
	vmovdqa	5*32(%rsi, %rax), %xmm5
	vinserti128	$1, 5*32+16(%rsi, %r10), %ymm5, %ymm5
	vmovdqa	6*32(%rsi, %rax), %xmm6
	vinserti128	$1, 6*32+16(%rsi, %r10), %ymm6, %ymm6
	vmovdqa	7*32(%rsi, %rax), %xmm7
	vinserti128	$1, 7*32+16(%rsi, %r10), %ymm7, %ymm7
	vpxor	%ymm4, %ymm12, %ymm12
	vpxor	%ymm5, %ymm13, %ymm13
	vpxor	%ymm6, %ymm14, %ymm14
	vpxor	%ymm7, %ymm15, %ymm15
	vpxor	0*256+4*32(%rsp), %ymm0, %ymm0
	vpxor	0*256+5*32(%rsp), %ymm1, %ymm1
	vpxor	0*256+6*32(%rsp), %ymm2, %ymm2
	vpxor	0*256+7*32(%rsp), %ymm3, %ymm3
	vpxor	1*256+4*32(%rsp), %ymm8, %ymm8
	vpxor	1*256+5*32(%rsp), %ymm9, %ymm9
	vpxor	1*256+6*32(%rsp), %ymm10, %ymm10
	vpxor	1*256+7*32(%rsp), %ymm11, %ymm11
	vpxor	2*256+4*32(%rsp), %ymm12, %ymm12
	vpxor	2*256+5*32(%rsp), %ymm13, %ymm13
	vpxor	2*256+6*32(%rsp), %ymm14, %ymm14
	vpxor	2*256+7*32(%rsp), %ymm15, %ymm15
	vmovdqa	%ymm0, 0*256+4*32(%rsp)
	vmovdqa	%ymm1, 0*256+5*32(%rsp)
	vmovdqa	%ymm2, 0*256+6*32(%rsp)
	vmovdqa	%ymm3, 0*256+7*32(%rsp)
	vmovdqa	%ymm8, 1*256+4*32(%rsp)
	vmovdqa	%ymm9, 1*256+5*32(%rsp)
	vmovdqa	%ymm10, 1*256+6*32(%rsp)
	vmovdqa	%ymm11, 1*256+7*32(%rsp)
	vmovdqa	%ymm12, 2*256+4*32(%rsp)
	vmovdqa	%ymm13, 2*256+5*32(%rsp)
	vmovdqa	%ymm14, 2*256+6*32(%rsp)
	vmovdqa	%ymm15, 2*256+7*32(%rsp)
	salsa8_core_6way_avx2
	vpaddd	0*256+4*32(%rsp), %ymm0, %ymm0
	vpaddd	0*256+5*32(%rsp), %ymm1, %ymm1
	vpaddd	0*256+6*32(%rsp), %ymm2, %ymm2
	vpaddd	0*256+7*32(%rsp), %ymm3, %ymm3
	vpaddd	1*256+4*32(%rsp), %ymm8, %ymm8
	vpaddd	1*256+5*32(%rsp), %ymm9, %ymm9
	vpaddd	1*256+6*32(%rsp), %ymm10, %ymm10
	vpaddd	1*256+7*32(%rsp), %ymm11, %ymm11
	vpaddd	2*256+4*32(%rsp), %ymm12, %ymm12
	vpaddd	2*256+5*32(%rsp), %ymm13, %ymm13
	vpaddd	2*256+6*32(%rsp), %ymm14, %ymm14
	vpaddd	2*256+7*32(%rsp), %ymm15, %ymm15
	vmovdqa	%ymm0, 0*256+4*32(%rsp)
	vmovdqa	%ymm1, 0*256+5*32(%rsp)
	vmovdqa	%ymm2, 0*256+6*32(%rsp)
	vmovdqa	%ymm3, 0*256+7*32(%rsp)
	vmovdqa	%ymm8, 1*256+4*32(%rsp)
	vmovdqa	%ymm9, 1*256+5*32(%rsp)
	vmovdqa	%ymm10, 1*256+6*32(%rsp)
	vmovdqa	%ymm11, 1*256+7*32(%rsp)
	vmovdqa	%ymm12, 2*256+4*32(%rsp)
	vmovdqa	%ymm13, 2*256+5*32(%rsp)
	vmovdqa	%ymm14, 2*256+6*32(%rsp)
	vmovdqa	%ymm15, 2*256+7*32(%rsp)
	
	subq	$1, %rcx
	ja scrypt_core_6way_avx2_loop2
	
	scrypt_shuffle_unpack2 %rsp, 0*128, %rdi, 0*256+0
	scrypt_shuffle_unpack2 %rsp, 1*128, %rdi, 0*256+64
	scrypt_shuffle_unpack2 %rsp, 2*128, %rdi, 1*256+0
	scrypt_shuffle_unpack2 %rsp, 3*128, %rdi, 1*256+64
	scrypt_shuffle_unpack2 %rsp, 4*128, %rdi, 2*256+0
	scrypt_shuffle_unpack2 %rsp, 5*128, %rdi, 2*256+64
	
	scrypt_core_6way_cleanup
	ret

#endif /* USE_AVX2 */

#endif
07070100000026000081A4000003E800000064000000015EF4BCA10000435F000000000000000000000000000000000000001C00000000cpuminer-2.5.1/scrypt-x86.S/*
 * Copyright 2011-2012, 2014 pooler@litecoinpool.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "cpuminer-config.h"

#if defined(__linux__) && defined(__ELF__)
	.section .note.GNU-stack,"",%progbits
#endif

#if defined(USE_ASM) && defined(__i386__)
	
.macro scrypt_shuffle src, so, dest, do
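	/* Self-inverse permutation of the 16 words of a 64-byte block,
	 * converting between scrypt's in-memory order and the diagonal
	 * order the SSE2 rounds operate on. */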
	movl	\so+60(\src), %eax
	movl	\so+44(\src), %ebx
	movl	\so+28(\src), %ecx
	movl	\so+12(\src), %edx
	movl	%eax, \do+12(\dest)
	movl	%ebx, \do+28(\dest)
	movl	%ecx, \do+44(\dest)
	movl	%edx, \do+60(\dest)
	movl	\so+40(\src), %eax
	movl	\so+8(\src), %ebx
	movl	\so+48(\src), %ecx
	movl	\so+16(\src), %edx
	movl	%eax, \do+8(\dest)
	movl	%ebx, \do+40(\dest)
	movl	%ecx, \do+16(\dest)
	movl	%edx, \do+48(\dest)
	movl	\so+20(\src), %eax
	movl	\so+4(\src), %ebx
	movl	\so+52(\src), %ecx
	movl	\so+36(\src), %edx
	movl	%eax, \do+4(\dest)
	movl	%ebx, \do+20(\dest)
	movl	%ecx, \do+36(\dest)
	movl	%edx, \do+52(\dest)
	movl	\so+0(\src), %eax
	movl	\so+24(\src), %ebx
	movl	\so+32(\src), %ecx
	movl	\so+56(\src), %edx
	movl	%eax, \do+0(\dest)
	movl	%ebx, \do+24(\dest)
	movl	%ecx, \do+32(\dest)
	movl	%edx, \do+56(\dest)
.endm

.macro salsa8_core_gen_quadround
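	/* Two Salsa20 double-rounds (four rounds) over the sixteen state
	 * words kept at 4..64(%esp); salsa8_core_gen runs this twice for
	 * the full Salsa20/8. */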
	movl	52(%esp), %ecx
	movl	4(%esp), %edx
	movl	20(%esp), %ebx
	movl	8(%esp), %esi
	leal	(%ecx, %edx), %edi
	roll	$7, %edi
	xorl	%edi, %ebx
	movl	%ebx, 4(%esp)
	movl	36(%esp), %edi
	leal	(%edx, %ebx), %ebp
	roll	$9, %ebp
	xorl	%ebp, %edi
	movl	24(%esp), %ebp
	movl	%edi, 8(%esp)
	addl	%edi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %ecx
	movl	40(%esp), %ebx
	movl	%ecx, 20(%esp)
	addl	%edi, %ecx
	roll	$18, %ecx
	leal	(%esi, %ebp), %edi
	roll	$7, %edi
	xorl	%edi, %ebx
	movl	%ebx, 24(%esp)
	movl	56(%esp), %edi
	xorl	%ecx, %edx
	leal	(%ebp, %ebx), %ecx
	roll	$9, %ecx
	xorl	%ecx, %edi
	movl	%edi, 36(%esp)
	movl	28(%esp), %ecx
	movl	%edx, 28(%esp)
	movl	44(%esp), %edx
	addl	%edi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %esi
	movl	60(%esp), %ebx
	movl	%esi, 40(%esp)
	addl	%edi, %esi
	roll	$18, %esi
	leal	(%ecx, %edx), %edi
	roll	$7, %edi
	xorl	%edi, %ebx
	movl	%ebx, 44(%esp)
	movl	12(%esp), %edi
	xorl	%esi, %ebp
	leal	(%edx, %ebx), %esi
	roll	$9, %esi
	xorl	%esi, %edi
	movl	%edi, 12(%esp)
	movl	48(%esp), %esi
	movl	%ebp, 48(%esp)
	movl	64(%esp), %ebp
	addl	%edi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %ecx
	movl	16(%esp), %ebx
	movl	%ecx, 16(%esp)
	addl	%edi, %ecx
	roll	$18, %ecx
	leal	(%esi, %ebp), %edi
	roll	$7, %edi
	xorl	%edi, %ebx
	movl	32(%esp), %edi
	xorl	%ecx, %edx
	leal	(%ebp, %ebx), %ecx
	roll	$9, %ecx
	xorl	%ecx, %edi
	movl	%edi, 32(%esp)
	movl	%ebx, %ecx
	movl	%edx, 52(%esp)
	movl	28(%esp), %edx
	addl	%edi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %esi
	movl	40(%esp), %ebx
	movl	%esi, 28(%esp)
	addl	%edi, %esi
	roll	$18, %esi
	leal	(%ecx, %edx), %edi
	roll	$7, %edi
	xorl	%edi, %ebx
	movl	%ebx, 40(%esp)
	movl	12(%esp), %edi
	xorl	%esi, %ebp
	leal	(%edx, %ebx), %esi
	roll	$9, %esi
	xorl	%esi, %edi
	movl	%edi, 12(%esp)
	movl	4(%esp), %esi
	movl	%ebp, 4(%esp)
	movl	48(%esp), %ebp
	addl	%edi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %ecx
	movl	16(%esp), %ebx
	movl	%ecx, 16(%esp)
	addl	%edi, %ecx
	roll	$18, %ecx
	leal	(%esi, %ebp), %edi
	roll	$7, %edi
	xorl	%edi, %ebx
	movl	%ebx, 48(%esp)
	movl	32(%esp), %edi
	xorl	%ecx, %edx
	leal	(%ebp, %ebx), %ecx
	roll	$9, %ecx
	xorl	%ecx, %edi
	movl	%edi, 32(%esp)
	movl	24(%esp), %ecx
	movl	%edx, 24(%esp)
	movl	52(%esp), %edx
	addl	%edi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %esi
	movl	28(%esp), %ebx
	movl	%esi, 28(%esp)
	addl	%edi, %esi
	roll	$18, %esi
	leal	(%ecx, %edx), %edi
	roll	$7, %edi
	xorl	%edi, %ebx
	movl	%ebx, 52(%esp)
	movl	8(%esp), %edi
	xorl	%esi, %ebp
	leal	(%edx, %ebx), %esi
	roll	$9, %esi
	xorl	%esi, %edi
	movl	%edi, 8(%esp)
	movl	44(%esp), %esi
	movl	%ebp, 44(%esp)
	movl	4(%esp), %ebp
	addl	%edi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %ecx
	movl	20(%esp), %ebx
	movl	%ecx, 4(%esp)
	addl	%edi, %ecx
	roll	$18, %ecx
	leal	(%esi, %ebp), %edi
	roll	$7, %edi
	xorl	%edi, %ebx
	movl	36(%esp), %edi
	xorl	%ecx, %edx
	leal	(%ebp, %ebx), %ecx
	roll	$9, %ecx
	xorl	%ecx, %edi
	movl	%edi, 20(%esp)
	movl	%ebx, %ecx
	movl	%edx, 36(%esp)
	movl	24(%esp), %edx
	addl	%edi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %esi
	movl	28(%esp), %ebx
	movl	%esi, 24(%esp)
	addl	%edi, %esi
	roll	$18, %esi
	leal	(%ecx, %edx), %edi
	roll	$7, %edi
	xorl	%edi, %ebx
	movl	%ebx, 28(%esp)
	xorl	%esi, %ebp
	movl	8(%esp), %esi
	leal	(%edx, %ebx), %edi
	roll	$9, %edi
	xorl	%edi, %esi
	movl	40(%esp), %edi
	movl	%ebp, 8(%esp)
	movl	44(%esp), %ebp
	movl	%esi, 40(%esp)
	addl	%esi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %ecx
	movl	4(%esp), %ebx
	movl	%ecx, 44(%esp)
	addl	%esi, %ecx
	roll	$18, %ecx
	leal	(%edi, %ebp), %esi
	roll	$7, %esi
	xorl	%esi, %ebx
	movl	%ebx, 4(%esp)
	movl	20(%esp), %esi
	xorl	%ecx, %edx
	leal	(%ebp, %ebx), %ecx
	roll	$9, %ecx
	xorl	%ecx, %esi
	movl	%esi, 56(%esp)
	movl	48(%esp), %ecx
	movl	%edx, 20(%esp)
	movl	36(%esp), %edx
	addl	%esi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %edi
	movl	24(%esp), %ebx
	movl	%edi, 24(%esp)
	addl	%esi, %edi
	roll	$18, %edi
	leal	(%ecx, %edx), %esi
	roll	$7, %esi
	xorl	%esi, %ebx
	movl	%ebx, 60(%esp)
	movl	12(%esp), %esi
	xorl	%edi, %ebp
	leal	(%edx, %ebx), %edi
	roll	$9, %edi
	xorl	%edi, %esi
	movl	%esi, 12(%esp)
	movl	52(%esp), %edi
	movl	%ebp, 36(%esp)
	movl	8(%esp), %ebp
	addl	%esi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %ecx
	movl	16(%esp), %ebx
	movl	%ecx, 16(%esp)
	addl	%esi, %ecx
	roll	$18, %ecx
	leal	(%edi, %ebp), %esi
	roll	$7, %esi
	xorl	%esi, %ebx
	movl	32(%esp), %esi
	xorl	%ecx, %edx
	leal	(%ebp, %ebx), %ecx
	roll	$9, %ecx
	xorl	%ecx, %esi
	movl	%esi, 32(%esp)
	movl	%ebx, %ecx
	movl	%edx, 48(%esp)
	movl	20(%esp), %edx
	addl	%esi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %edi
	movl	24(%esp), %ebx
	movl	%edi, 20(%esp)
	addl	%esi, %edi
	roll	$18, %edi
	leal	(%ecx, %edx), %esi
	roll	$7, %esi
	xorl	%esi, %ebx
	movl	%ebx, 8(%esp)
	movl	12(%esp), %esi
	xorl	%edi, %ebp
	leal	(%edx, %ebx), %edi
	roll	$9, %edi
	xorl	%edi, %esi
	movl	%esi, 12(%esp)
	movl	28(%esp), %edi
	movl	%ebp, 52(%esp)
	movl	36(%esp), %ebp
	addl	%esi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %ecx
	movl	16(%esp), %ebx
	movl	%ecx, 16(%esp)
	addl	%esi, %ecx
	roll	$18, %ecx
	leal	(%edi, %ebp), %esi
	roll	$7, %esi
	xorl	%esi, %ebx
	movl	%ebx, 28(%esp)
	movl	32(%esp), %esi
	xorl	%ecx, %edx
	leal	(%ebp, %ebx), %ecx
	roll	$9, %ecx
	xorl	%ecx, %esi
	movl	%esi, 32(%esp)
	movl	4(%esp), %ecx
	movl	%edx, 4(%esp)
	movl	48(%esp), %edx
	addl	%esi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %edi
	movl	20(%esp), %ebx
	movl	%edi, 20(%esp)
	addl	%esi, %edi
	roll	$18, %edi
	leal	(%ecx, %edx), %esi
	roll	$7, %esi
	xorl	%esi, %ebx
	movl	%ebx, 48(%esp)
	movl	40(%esp), %esi
	xorl	%edi, %ebp
	leal	(%edx, %ebx), %edi
	roll	$9, %edi
	xorl	%edi, %esi
	movl	%esi, 36(%esp)
	movl	60(%esp), %edi
	movl	%ebp, 24(%esp)
	movl	52(%esp), %ebp
	addl	%esi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %ecx
	movl	44(%esp), %ebx
	movl	%ecx, 40(%esp)
	addl	%esi, %ecx
	roll	$18, %ecx
	leal	(%edi, %ebp), %esi
	roll	$7, %esi
	xorl	%esi, %ebx
	movl	%ebx, 52(%esp)
	movl	56(%esp), %esi
	xorl	%ecx, %edx
	leal	(%ebp, %ebx), %ecx
	roll	$9, %ecx
	xorl	%ecx, %esi
	movl	%esi, 56(%esp)
	addl	%esi, %ebx
	movl	%edx, 44(%esp)
	roll	$13, %ebx
	xorl	%ebx, %edi
	movl	%edi, 60(%esp)
	addl	%esi, %edi
	roll	$18, %edi
	xorl	%edi, %ebp
	movl	%ebp, 64(%esp)
.endm

	.text
	.p2align 5
salsa8_core_gen:
	salsa8_core_gen_quadround
	salsa8_core_gen_quadround
	ret
	
	
	.text
	.p2align 5
	.globl scrypt_core
	.globl _scrypt_core
scrypt_core:
_scrypt_core:
	pushl	%ebx
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	
	/* Check for SSE2 availability */
	movl	$1, %eax
	cpuid
	andl	$0x04000000, %edx
	jnz scrypt_core_sse2
	
scrypt_core_gen:
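	/* Generic scalar path. With four registers pushed, the cdecl
	 * arguments sit at 20/24/28(%esp): X, V, N. */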
	movl	20(%esp), %edi
	movl	24(%esp), %esi
	movl	28(%esp), %ecx
	subl	$72, %esp
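	/* The helper macros below each handle one pair of 32-bit words of a
	 * BlockMix step: macro1a saves X to V and computes X[p] ^= X[q]
	 * into the stack scratch fed to salsa8_core_gen; macro1b does the
	 * same while also xoring in V[j]; macro2 adds the salsa output back
	 * and xors it into the second half; macro3 adds the second salsa
	 * output into the second half. */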
	
.macro scrypt_core_macro1a p, q
	movl	\p(%edi), %eax
	movl	\q(%edi), %edx
	movl	%eax, \p(%esi)
	movl	%edx, \q(%esi)
	xorl	%edx, %eax
	movl	%eax, \p(%edi)
	movl	%eax, \p(%esp)
.endm
	
.macro scrypt_core_macro1b p, q
	movl	\p(%edi), %eax
	xorl	\p(%esi, %edx), %eax
	movl	\q(%edi), %ebx
	xorl	\q(%esi, %edx), %ebx
	movl	%ebx, \q(%edi)
	xorl	%ebx, %eax
	movl	%eax, \p(%edi)
	movl	%eax, \p(%esp)
.endm
	
.macro scrypt_core_macro2 p, q
	movl	\p(%esp), %eax
	addl	\p(%edi), %eax
	movl	%eax, \p(%edi)
	xorl	\q(%edi), %eax
	movl	%eax, \q(%edi)
	movl	%eax, \p(%esp)
.endm
	
.macro scrypt_core_macro3 p, q
	movl	\p(%esp), %eax
	addl	\q(%edi), %eax
	movl	%eax, \q(%edi)
.endm
	
	shll	$7, %ecx
	addl	%esi, %ecx
scrypt_core_gen_loop1:
	movl	%esi, 64(%esp)
	movl	%ecx, 68(%esp)
	
	scrypt_core_macro1a	0, 64
	scrypt_core_macro1a	4, 68
	scrypt_core_macro1a	8, 72
	scrypt_core_macro1a	12, 76
	scrypt_core_macro1a	16, 80
	scrypt_core_macro1a	20, 84
	scrypt_core_macro1a	24, 88
	scrypt_core_macro1a	28, 92
	scrypt_core_macro1a	32, 96
	scrypt_core_macro1a	36, 100
	scrypt_core_macro1a	40, 104
	scrypt_core_macro1a	44, 108
	scrypt_core_macro1a	48, 112
	scrypt_core_macro1a	52, 116
	scrypt_core_macro1a	56, 120
	scrypt_core_macro1a	60, 124
	
	call salsa8_core_gen
	
	movl	92(%esp), %edi
	scrypt_core_macro2	0, 64
	scrypt_core_macro2	4, 68
	scrypt_core_macro2	8, 72
	scrypt_core_macro2	12, 76
	scrypt_core_macro2	16, 80
	scrypt_core_macro2	20, 84
	scrypt_core_macro2	24, 88
	scrypt_core_macro2	28, 92
	scrypt_core_macro2	32, 96
	scrypt_core_macro2	36, 100
	scrypt_core_macro2	40, 104
	scrypt_core_macro2	44, 108
	scrypt_core_macro2	48, 112
	scrypt_core_macro2	52, 116
	scrypt_core_macro2	56, 120
	scrypt_core_macro2	60, 124
	
	call salsa8_core_gen
	
	movl	92(%esp), %edi
	scrypt_core_macro3	0, 64
	scrypt_core_macro3	4, 68
	scrypt_core_macro3	8, 72
	scrypt_core_macro3	12, 76
	scrypt_core_macro3	16, 80
	scrypt_core_macro3	20, 84
	scrypt_core_macro3	24, 88
	scrypt_core_macro3	28, 92
	scrypt_core_macro3	32, 96
	scrypt_core_macro3	36, 100
	scrypt_core_macro3	40, 104
	scrypt_core_macro3	44, 108
	scrypt_core_macro3	48, 112
	scrypt_core_macro3	52, 116
	scrypt_core_macro3	56, 120
	scrypt_core_macro3	60, 124
	
	movl	64(%esp), %esi
	movl	68(%esp), %ecx
	addl	$128, %esi
	cmpl	%ecx, %esi
	jne scrypt_core_gen_loop1

	movl	96(%esp), %esi
	movl	100(%esp), %ecx
	movl	%ecx, %eax
	subl	$1, %eax
	movl	%eax, 100(%esp)
scrypt_core_gen_loop2:
	movl	%ecx, 68(%esp)
	
	movl	64(%edi), %edx
	andl	100(%esp), %edx
	shll	$7, %edx
	
	scrypt_core_macro1b	0, 64
	scrypt_core_macro1b	4, 68
	scrypt_core_macro1b	8, 72
	scrypt_core_macro1b	12, 76
	scrypt_core_macro1b	16, 80
	scrypt_core_macro1b	20, 84
	scrypt_core_macro1b	24, 88
	scrypt_core_macro1b	28, 92
	scrypt_core_macro1b	32, 96
	scrypt_core_macro1b	36, 100
	scrypt_core_macro1b	40, 104
	scrypt_core_macro1b	44, 108
	scrypt_core_macro1b	48, 112
	scrypt_core_macro1b	52, 116
	scrypt_core_macro1b	56, 120
	scrypt_core_macro1b	60, 124
	
	call salsa8_core_gen
	
	movl	92(%esp), %edi
	scrypt_core_macro2	0, 64
	scrypt_core_macro2	4, 68
	scrypt_core_macro2	8, 72
	scrypt_core_macro2	12, 76
	scrypt_core_macro2	16, 80
	scrypt_core_macro2	20, 84
	scrypt_core_macro2	24, 88
	scrypt_core_macro2	28, 92
	scrypt_core_macro2	32, 96
	scrypt_core_macro2	36, 100
	scrypt_core_macro2	40, 104
	scrypt_core_macro2	44, 108
	scrypt_core_macro2	48, 112
	scrypt_core_macro2	52, 116
	scrypt_core_macro2	56, 120
	scrypt_core_macro2	60, 124
	
	call salsa8_core_gen
	
	movl	92(%esp), %edi
	movl	96(%esp), %esi
	scrypt_core_macro3	0, 64
	scrypt_core_macro3	4, 68
	scrypt_core_macro3	8, 72
	scrypt_core_macro3	12, 76
	scrypt_core_macro3	16, 80
	scrypt_core_macro3	20, 84
	scrypt_core_macro3	24, 88
	scrypt_core_macro3	28, 92
	scrypt_core_macro3	32, 96
	scrypt_core_macro3	36, 100
	scrypt_core_macro3	40, 104
	scrypt_core_macro3	44, 108
	scrypt_core_macro3	48, 112
	scrypt_core_macro3	52, 116
	scrypt_core_macro3	56, 120
	scrypt_core_macro3	60, 124
	
	movl	68(%esp), %ecx
	subl	$1, %ecx
	ja scrypt_core_gen_loop2
	
	addl	$72, %esp
	popl	%esi
	popl	%edi
	popl	%ebp
	popl	%ebx
	ret


.macro salsa8_core_sse2_doubleround
	movdqa	%xmm1, %xmm4
	paddd	%xmm0, %xmm4
	movdqa	%xmm4, %xmm5
	pslld	$7, %xmm4
	psrld	$25, %xmm5
	pxor	%xmm4, %xmm3
	movdqa	%xmm0, %xmm4
	pxor	%xmm5, %xmm3
	
	paddd	%xmm3, %xmm4
	movdqa	%xmm4, %xmm5
	pslld	$9, %xmm4
	psrld	$23, %xmm5
	pxor	%xmm4, %xmm2
	movdqa	%xmm3, %xmm4
	pxor	%xmm5, %xmm2
	pshufd	$0x93, %xmm3, %xmm3
	
	paddd	%xmm2, %xmm4
	movdqa	%xmm4, %xmm5
	pslld	$13, %xmm4
	psrld	$19, %xmm5
	pxor	%xmm4, %xmm1
	movdqa	%xmm2, %xmm4
	pxor	%xmm5, %xmm1
	pshufd	$0x4e, %xmm2, %xmm2
	
	paddd	%xmm1, %xmm4
	movdqa	%xmm4, %xmm5
	pslld	$18, %xmm4
	psrld	$14, %xmm5
	pxor	%xmm4, %xmm0
	movdqa	%xmm3, %xmm4
	pxor	%xmm5, %xmm0
	pshufd	$0x39, %xmm1, %xmm1
	
	paddd	%xmm0, %xmm4
	movdqa	%xmm4, %xmm5
	pslld	$7, %xmm4
	psrld	$25, %xmm5
	pxor	%xmm4, %xmm1
	movdqa	%xmm0, %xmm4
	pxor	%xmm5, %xmm1
	
	paddd	%xmm1, %xmm4
	movdqa	%xmm4, %xmm5
	pslld	$9, %xmm4
	psrld	$23, %xmm5
	pxor	%xmm4, %xmm2
	movdqa	%xmm1, %xmm4
	pxor	%xmm5, %xmm2
	pshufd	$0x93, %xmm1, %xmm1
	
	paddd	%xmm2, %xmm4
	movdqa	%xmm4, %xmm5
	pslld	$13, %xmm4
	psrld	$19, %xmm5
	pxor	%xmm4, %xmm3
	movdqa	%xmm2, %xmm4
	pxor	%xmm5, %xmm3
	pshufd	$0x4e, %xmm2, %xmm2
	
	paddd	%xmm3, %xmm4
	movdqa	%xmm4, %xmm5
	pslld	$18, %xmm4
	psrld	$14, %xmm5
	pxor	%xmm4, %xmm0
	pshufd	$0x39, %xmm3, %xmm3
	pxor	%xmm5, %xmm0
.endm

.macro salsa8_core_sse2
	salsa8_core_sse2_doubleround
	salsa8_core_sse2_doubleround
	salsa8_core_sse2_doubleround
	salsa8_core_sse2_doubleround
.endm
	
	.p2align 5
scrypt_core_sse2:
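	/* SSE2 path: X is kept in 128 bytes of 16-byte-aligned stack; rows
	 * 2-3 of the second half stay cached in %xmm6/%xmm7 across both
	 * loops and are only written back at the end. */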
	movl	20(%esp), %edi
	movl	24(%esp), %esi
	movl	%esp, %ebp
	subl	$128, %esp
	andl	$-16, %esp
	
	scrypt_shuffle %edi, 0, %esp, 0
	scrypt_shuffle %edi, 64, %esp, 64
	
	movdqa	96(%esp), %xmm6
	movdqa	112(%esp), %xmm7
	
	movl	%esi, %edx
	movl	28(%ebp), %ecx
	shll	$7, %ecx
	addl	%esi, %ecx
scrypt_core_sse2_loop1:
	movdqa	0(%esp), %xmm0
	movdqa	16(%esp), %xmm1
	movdqa	32(%esp), %xmm2
	movdqa	48(%esp), %xmm3
	movdqa	64(%esp), %xmm4
	movdqa	80(%esp), %xmm5
	pxor	%xmm4, %xmm0
	pxor	%xmm5, %xmm1
	movdqa	%xmm0, 0(%edx)
	movdqa	%xmm1, 16(%edx)
	pxor	%xmm6, %xmm2
	pxor	%xmm7, %xmm3
	movdqa	%xmm2, 32(%edx)
	movdqa	%xmm3, 48(%edx)
	movdqa	%xmm4, 64(%edx)
	movdqa	%xmm5, 80(%edx)
	movdqa	%xmm6, 96(%edx)
	movdqa	%xmm7, 112(%edx)
	
	salsa8_core_sse2
	paddd	0(%edx), %xmm0
	paddd	16(%edx), %xmm1
	paddd	32(%edx), %xmm2
	paddd	48(%edx), %xmm3
	movdqa	%xmm0, 0(%esp)
	movdqa	%xmm1, 16(%esp)
	movdqa	%xmm2, 32(%esp)
	movdqa	%xmm3, 48(%esp)
	
	pxor	64(%esp), %xmm0
	pxor	80(%esp), %xmm1
	pxor	%xmm6, %xmm2
	pxor	%xmm7, %xmm3
	movdqa	%xmm0, 64(%esp)
	movdqa	%xmm1, 80(%esp)
	movdqa	%xmm2, %xmm6
	movdqa	%xmm3, %xmm7
	salsa8_core_sse2
	paddd	64(%esp), %xmm0
	paddd	80(%esp), %xmm1
	paddd	%xmm2, %xmm6
	paddd	%xmm3, %xmm7
	movdqa	%xmm0, 64(%esp)
	movdqa	%xmm1, 80(%esp)
	
	addl	$128, %edx
	cmpl	%ecx, %edx
	jne scrypt_core_sse2_loop1
	
	movdqa	64(%esp), %xmm4
	movdqa	80(%esp), %xmm5
	
	movl	28(%ebp), %ecx
	movl	%ecx, %eax
	subl	$1, %eax
scrypt_core_sse2_loop2:
	movd	%xmm4, %edx
	movdqa	0(%esp), %xmm0
	movdqa	16(%esp), %xmm1
	movdqa	32(%esp), %xmm2
	movdqa	48(%esp), %xmm3
	andl	%eax, %edx
	shll	$7, %edx
	pxor	0(%esi, %edx), %xmm0
	pxor	16(%esi, %edx), %xmm1
	pxor	32(%esi, %edx), %xmm2
	pxor	48(%esi, %edx), %xmm3
	
	pxor	%xmm4, %xmm0
	pxor	%xmm5, %xmm1
	movdqa	%xmm0, 0(%esp)
	movdqa	%xmm1, 16(%esp)
	pxor	%xmm6, %xmm2
	pxor	%xmm7, %xmm3
	movdqa	%xmm2, 32(%esp)
	movdqa	%xmm3, 48(%esp)
	salsa8_core_sse2
	paddd	0(%esp), %xmm0
	paddd	16(%esp), %xmm1
	paddd	32(%esp), %xmm2
	paddd	48(%esp), %xmm3
	movdqa	%xmm0, 0(%esp)
	movdqa	%xmm1, 16(%esp)
	movdqa	%xmm2, 32(%esp)
	movdqa	%xmm3, 48(%esp)
	
	pxor	64(%esi, %edx), %xmm0
	pxor	80(%esi, %edx), %xmm1
	pxor	96(%esi, %edx), %xmm2
	pxor	112(%esi, %edx), %xmm3
	pxor	64(%esp), %xmm0
	pxor	80(%esp), %xmm1
	pxor	%xmm6, %xmm2
	pxor	%xmm7, %xmm3
	movdqa	%xmm0, 64(%esp)
	movdqa	%xmm1, 80(%esp)
	movdqa	%xmm2, %xmm6
	movdqa	%xmm3, %xmm7
	salsa8_core_sse2
	paddd	64(%esp), %xmm0
	paddd	80(%esp), %xmm1
	paddd	%xmm2, %xmm6
	paddd	%xmm3, %xmm7
	movdqa	%xmm0, %xmm4
	movdqa	%xmm1, %xmm5
	movdqa	%xmm0, 64(%esp)
	movdqa	%xmm1, 80(%esp)
	
	subl	$1, %ecx
	ja scrypt_core_sse2_loop2
	
	movdqa	%xmm6, 96(%esp)
	movdqa	%xmm7, 112(%esp)
	
	scrypt_shuffle %esp, 0, %edi, 0
	scrypt_shuffle %esp, 64, %edi, 64
	
	movl	%ebp, %esp
	popl	%esi
	popl	%edi
	popl	%ebp
	popl	%ebx
	ret

#endif
07070100000027000081A4000003E800000064000000015EF4BCA10000607F000000000000000000000000000000000000001800000000cpuminer-2.5.1/scrypt.c/*
 * Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2014 pooler
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * This file was originally written by Colin Percival as part of the Tarsnap
 * online backup system.
 */

#include "cpuminer-config.h"
#include "miner.h"

#include <stdlib.h>
#include <string.h>
#include <inttypes.h>

static const uint32_t keypad[12] = {
	0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280
};
static const uint32_t innerpad[11] = {
	0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0
};
static const uint32_t outerpad[8] = {
	0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300
};
static const uint32_t finalblk[16] = {
	0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620
};

static inline void HMAC_SHA256_80_init(const uint32_t *key,
	uint32_t *tstate, uint32_t *ostate)
{
	uint32_t ihash[8];
	uint32_t pad[16];
	int i;

	/* tstate is assumed to contain the midstate of key */
	memcpy(pad, key + 16, 16);
	memcpy(pad + 4, keypad, 48);
	sha256_transform(tstate, pad, 0);
	memcpy(ihash, tstate, 32);

	sha256_init(ostate);
	for (i = 0; i < 8; i++)
		pad[i] = ihash[i] ^ 0x5c5c5c5c;
	for (; i < 16; i++)
		pad[i] = 0x5c5c5c5c;
	sha256_transform(ostate, pad, 0);

	sha256_init(tstate);
	for (i = 0; i < 8; i++)
		pad[i] = ihash[i] ^ 0x36363636;
	for (; i < 16; i++)
		pad[i] = 0x36363636;
	sha256_transform(tstate, pad, 0);
}

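/* PBKDF2-HMAC-SHA256 with a single iteration: four HMAC invocations over
 * the 80-byte salt plus block index yield 128 bytes of output, using the
 * precomputed inner/outer midstates. */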
static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate,
	const uint32_t *ostate, const uint32_t *salt, uint32_t *output)
{
	uint32_t istate[8], ostate2[8];
	uint32_t ibuf[16], obuf[16];
	int i, j;

	memcpy(istate, tstate, 32);
	sha256_transform(istate, salt, 0);
	
	memcpy(ibuf, salt + 16, 16);
	memcpy(ibuf + 5, innerpad, 44);
	memcpy(obuf + 8, outerpad, 32);

	for (i = 0; i < 4; i++) {
		memcpy(obuf, istate, 32);
		ibuf[4] = i + 1;
		sha256_transform(obuf, ibuf, 0);

		memcpy(ostate2, ostate, 32);
		sha256_transform(ostate2, obuf, 0);
		for (j = 0; j < 8; j++)
			output[8 * i + j] = swab32(ostate2[j]);
	}
}

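/* Second PBKDF2 pass: one HMAC over the 128-byte mixed salt produces the
 * final 32-byte output. */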
static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate,
	const uint32_t *salt, uint32_t *output)
{
	uint32_t buf[16];
	int i;
	
	sha256_transform(tstate, salt, 1);
	sha256_transform(tstate, salt + 16, 1);
	sha256_transform(tstate, finalblk, 0);
	memcpy(buf, tstate, 32);
	memcpy(buf + 8, outerpad, 32);

	sha256_transform(ostate, buf, 0);
	for (i = 0; i < 8; i++)
		output[i] = swab32(ostate[i]);
}


#ifdef HAVE_SHA256_4WAY

static const uint32_t keypad_4way[4 * 12] = {
	0x80000000, 0x80000000, 0x80000000, 0x80000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000280, 0x00000280, 0x00000280, 0x00000280
};
static const uint32_t innerpad_4way[4 * 11] = {
	0x80000000, 0x80000000, 0x80000000, 0x80000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x000004a0, 0x000004a0, 0x000004a0, 0x000004a0
};
static const uint32_t outerpad_4way[4 * 8] = {
	0x80000000, 0x80000000, 0x80000000, 0x80000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000300, 0x00000300, 0x00000300, 0x00000300
};
static const uint32_t finalblk_4way[4 * 16] __attribute__((aligned(16))) = {
	0x00000001, 0x00000001, 0x00000001, 0x00000001,
	0x80000000, 0x80000000, 0x80000000, 0x80000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000620, 0x00000620, 0x00000620, 0x00000620
};

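/* The 4-way code runs four SHA-256 streams interleaved word by word
 * (element i of lane k at index 4*i + k), so every size and offset above
 * scales by 4. */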
static inline void HMAC_SHA256_80_init_4way(const uint32_t *key,
	uint32_t *tstate, uint32_t *ostate)
{
	uint32_t ihash[4 * 8] __attribute__((aligned(16)));
	uint32_t pad[4 * 16] __attribute__((aligned(16)));
	int i;

	/* tstate is assumed to contain the midstate of key */
	memcpy(pad, key + 4 * 16, 4 * 16);
	memcpy(pad + 4 * 4, keypad_4way, 4 * 48);
	sha256_transform_4way(tstate, pad, 0);
	memcpy(ihash, tstate, 4 * 32);

	sha256_init_4way(ostate);
	for (i = 0; i < 4 * 8; i++)
		pad[i] = ihash[i] ^ 0x5c5c5c5c;
	for (; i < 4 * 16; i++)
		pad[i] = 0x5c5c5c5c;
	sha256_transform_4way(ostate, pad, 0);

	sha256_init_4way(tstate);
	for (i = 0; i < 4 * 8; i++)
		pad[i] = ihash[i] ^ 0x36363636;
	for (; i < 4 * 16; i++)
		pad[i] = 0x36363636;
	sha256_transform_4way(tstate, pad, 0);
}

static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate,
	const uint32_t *ostate, const uint32_t *salt, uint32_t *output)
{
	uint32_t istate[4 * 8] __attribute__((aligned(16)));
	uint32_t ostate2[4 * 8] __attribute__((aligned(16)));
	uint32_t ibuf[4 * 16] __attribute__((aligned(16)));
	uint32_t obuf[4 * 16] __attribute__((aligned(16)));
	int i, j;

	memcpy(istate, tstate, 4 * 32);
	sha256_transform_4way(istate, salt, 0);
	
	memcpy(ibuf, salt + 4 * 16, 4 * 16);
	memcpy(ibuf + 4 * 5, innerpad_4way, 4 * 44);
	memcpy(obuf + 4 * 8, outerpad_4way, 4 * 32);

	for (i = 0; i < 4; i++) {
		memcpy(obuf, istate, 4 * 32);
		ibuf[4 * 4 + 0] = i + 1;
		ibuf[4 * 4 + 1] = i + 1;
		ibuf[4 * 4 + 2] = i + 1;
		ibuf[4 * 4 + 3] = i + 1;
		sha256_transform_4way(obuf, ibuf, 0);

		memcpy(ostate2, ostate, 4 * 32);
		sha256_transform_4way(ostate2, obuf, 0);
		for (j = 0; j < 4 * 8; j++)
			output[4 * 8 * i + j] = swab32(ostate2[j]);
	}
}

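/*
 * Final PBKDF2 stage: finish the inner hash over the 128-byte salt (the
 * scrypt core output) and the block index, run the outer hash, and emit
 * the 32-byte result per lane in big-endian byte order.
 */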
static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate,
	uint32_t *ostate, const uint32_t *salt, uint32_t *output)
{
	uint32_t buf[4 * 16] __attribute__((aligned(16)));
	int i;
	
	sha256_transform_4way(tstate, salt, 1);
	sha256_transform_4way(tstate, salt + 4 * 16, 1);
	sha256_transform_4way(tstate, finalblk_4way, 0);
	memcpy(buf, tstate, 4 * 32);
	memcpy(buf + 4 * 8, outerpad_4way, 4 * 32);

	sha256_transform_4way(ostate, buf, 0);
	for (i = 0; i < 4 * 8; i++)
		output[i] = swab32(ostate[i]);
}

#endif /* HAVE_SHA256_4WAY */


#ifdef HAVE_SHA256_8WAY

static const uint32_t finalblk_8way[8 * 16] __attribute__((aligned(32))) = {
	0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001,
	0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620
};

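/* Eight-lane variant of HMAC_SHA256_80_init_4way; the key and length
 * padding is generated inline instead of copied from constant tables. */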
static inline void HMAC_SHA256_80_init_8way(const uint32_t *key,
	uint32_t *tstate, uint32_t *ostate)
{
	uint32_t ihash[8 * 8] __attribute__((aligned(32)));
	uint32_t pad[8 * 16] __attribute__((aligned(32)));
	int i;
	
	/* tstate is assumed to contain the midstate of the key */
	memcpy(pad, key + 8 * 16, 8 * 16);
	for (i = 0; i < 8; i++)
		pad[8 * 4 + i] = 0x80000000;
	memset(pad + 8 * 5, 0x00, 8 * 40);
	for (i = 0; i < 8; i++)
		pad[8 * 15 + i] = 0x00000280;
	sha256_transform_8way(tstate, pad, 0);
	memcpy(ihash, tstate, 8 * 32);
	
	sha256_init_8way(ostate);
	for (i = 0; i < 8 * 8; i++)
		pad[i] = ihash[i] ^ 0x5c5c5c5c;
	for (; i < 8 * 16; i++)
		pad[i] = 0x5c5c5c5c;
	sha256_transform_8way(ostate, pad, 0);
	
	sha256_init_8way(tstate);
	for (i = 0; i < 8 * 8; i++)
		pad[i] = ihash[i] ^ 0x36363636;
	for (; i < 8 * 16; i++)
		pad[i] = 0x36363636;
	sha256_transform_8way(tstate, pad, 0);
}

static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate,
	const uint32_t *ostate, const uint32_t *salt, uint32_t *output)
{
	uint32_t istate[8 * 8] __attribute__((aligned(32)));
	uint32_t ostate2[8 * 8] __attribute__((aligned(32)));
	uint32_t ibuf[8 * 16] __attribute__((aligned(32)));
	uint32_t obuf[8 * 16] __attribute__((aligned(32)));
	int i, j;
	
	memcpy(istate, tstate, 8 * 32);
	sha256_transform_8way(istate, salt, 0);
	
	memcpy(ibuf, salt + 8 * 16, 8 * 16);
	for (i = 0; i < 8; i++)
		ibuf[8 * 5 + i] = 0x80000000;
	memset(ibuf + 8 * 6, 0x00, 8 * 36);
	for (i = 0; i < 8; i++)
		ibuf[8 * 15 + i] = 0x000004a0;
	
	for (i = 0; i < 8; i++)
		obuf[8 * 8 + i] = 0x80000000;
	memset(obuf + 8 * 9, 0x00, 8 * 24);
	for (i = 0; i < 8; i++)
		obuf[8 * 15 + i] = 0x00000300;
	
	for (i = 0; i < 4; i++) {
		memcpy(obuf, istate, 8 * 32);
		ibuf[8 * 4 + 0] = i + 1;
		ibuf[8 * 4 + 1] = i + 1;
		ibuf[8 * 4 + 2] = i + 1;
		ibuf[8 * 4 + 3] = i + 1;
		ibuf[8 * 4 + 4] = i + 1;
		ibuf[8 * 4 + 5] = i + 1;
		ibuf[8 * 4 + 6] = i + 1;
		ibuf[8 * 4 + 7] = i + 1;
		sha256_transform_8way(obuf, ibuf, 0);
		
		memcpy(ostate2, ostate, 8 * 32);
		sha256_transform_8way(ostate2, obuf, 0);
		for (j = 0; j < 8 * 8; j++)
			output[8 * 8 * i + j] = swab32(ostate2[j]);
	}
}

static inline void PBKDF2_SHA256_128_32_8way(uint32_t *tstate,
	uint32_t *ostate, const uint32_t *salt, uint32_t *output)
{
	uint32_t buf[8 * 16] __attribute__((aligned(32)));
	int i;
	
	sha256_transform_8way(tstate, salt, 1);
	sha256_transform_8way(tstate, salt + 8 * 16, 1);
	sha256_transform_8way(tstate, finalblk_8way, 0);
	
	memcpy(buf, tstate, 8 * 32);
	for (i = 0; i < 8; i++)
		buf[8 * 8 + i] = 0x80000000;
	memset(buf + 8 * 9, 0x00, 8 * 24);
	for (i = 0; i < 8; i++)
		buf[8 * 15 + i] = 0x00000300;
	sha256_transform_8way(ostate, buf, 0);
	
	for (i = 0; i < 8 * 8; i++)
		output[i] = swab32(ostate[i]);
}

#endif /* HAVE_SHA256_8WAY */


#if defined(USE_ASM) && defined(__x86_64__)

#define SCRYPT_MAX_WAYS 12
#define HAVE_SCRYPT_3WAY 1
int scrypt_best_throughput(void);
void scrypt_core(uint32_t *X, uint32_t *V, int N);
void scrypt_core_3way(uint32_t *X, uint32_t *V, int N);
#if defined(USE_AVX2)
#undef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 24
#define HAVE_SCRYPT_6WAY 1
void scrypt_core_6way(uint32_t *X, uint32_t *V, int N);
#endif

#elif defined(USE_ASM) && defined(__i386__)

#define SCRYPT_MAX_WAYS 4
#define scrypt_best_throughput() 1
void scrypt_core(uint32_t *X, uint32_t *V, int N);

#elif defined(USE_ASM) && defined(__arm__) && defined(__APCS_32__)

void scrypt_core(uint32_t *X, uint32_t *V, int N);
#if defined(__ARM_NEON__)
#undef HAVE_SHA256_4WAY
#define SCRYPT_MAX_WAYS 3
#define HAVE_SCRYPT_3WAY 1
#define scrypt_best_throughput() 3
void scrypt_core_3way(uint32_t *X, uint32_t *V, int N);
#endif

#elif defined(USE_ASM) && (defined(__powerpc__) || defined(__ppc__) || defined(__PPC__))

#define SCRYPT_MAX_WAYS 4
#define scrypt_best_throughput() 1
void scrypt_core(uint32_t *X, uint32_t *V, int N);

#else

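/*
 * Portable Salsa20/8 core: XOR Bx into B, then apply eight Salsa20
 * rounds (four column/row double-rounds) and the usual feed-forward
 * addition of the round input.
 */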
static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16])
{
	uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15;
	int i;

	x00 = (B[ 0] ^= Bx[ 0]);
	x01 = (B[ 1] ^= Bx[ 1]);
	x02 = (B[ 2] ^= Bx[ 2]);
	x03 = (B[ 3] ^= Bx[ 3]);
	x04 = (B[ 4] ^= Bx[ 4]);
	x05 = (B[ 5] ^= Bx[ 5]);
	x06 = (B[ 6] ^= Bx[ 6]);
	x07 = (B[ 7] ^= Bx[ 7]);
	x08 = (B[ 8] ^= Bx[ 8]);
	x09 = (B[ 9] ^= Bx[ 9]);
	x10 = (B[10] ^= Bx[10]);
	x11 = (B[11] ^= Bx[11]);
	x12 = (B[12] ^= Bx[12]);
	x13 = (B[13] ^= Bx[13]);
	x14 = (B[14] ^= Bx[14]);
	x15 = (B[15] ^= Bx[15]);
	for (i = 0; i < 8; i += 2) {
#define R(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
		/* Operate on columns. */
		x04 ^= R(x00+x12, 7);	x09 ^= R(x05+x01, 7);
		x14 ^= R(x10+x06, 7);	x03 ^= R(x15+x11, 7);
		
		x08 ^= R(x04+x00, 9);	x13 ^= R(x09+x05, 9);
		x02 ^= R(x14+x10, 9);	x07 ^= R(x03+x15, 9);
		
		x12 ^= R(x08+x04,13);	x01 ^= R(x13+x09,13);
		x06 ^= R(x02+x14,13);	x11 ^= R(x07+x03,13);
		
		x00 ^= R(x12+x08,18);	x05 ^= R(x01+x13,18);
		x10 ^= R(x06+x02,18);	x15 ^= R(x11+x07,18);
		
		/* Operate on rows. */
		x01 ^= R(x00+x03, 7);	x06 ^= R(x05+x04, 7);
		x11 ^= R(x10+x09, 7);	x12 ^= R(x15+x14, 7);
		
		x02 ^= R(x01+x00, 9);	x07 ^= R(x06+x05, 9);
		x08 ^= R(x11+x10, 9);	x13 ^= R(x12+x15, 9);
		
		x03 ^= R(x02+x01,13);	x04 ^= R(x07+x06,13);
		x09 ^= R(x08+x11,13);	x14 ^= R(x13+x12,13);
		
		x00 ^= R(x03+x02,18);	x05 ^= R(x04+x07,18);
		x10 ^= R(x09+x08,18);	x15 ^= R(x14+x13,18);
#undef R
	}
	B[ 0] += x00;
	B[ 1] += x01;
	B[ 2] += x02;
	B[ 3] += x03;
	B[ 4] += x04;
	B[ 5] += x05;
	B[ 6] += x06;
	B[ 7] += x07;
	B[ 8] += x08;
	B[ 9] += x09;
	B[10] += x10;
	B[11] += x11;
	B[12] += x12;
	B[13] += x13;
	B[14] += x14;
	B[15] += x15;
}

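/*
 * Portable scrypt ROMix (r = 1): the first loop stores the N successive
 * 128-byte states of X into V; the second loop XORs back in the V entry
 * selected by X[16] mod N, the Integerify value of the current state.
 */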
static inline void scrypt_core(uint32_t *X, uint32_t *V, int N)
{
	uint32_t i, j, k;
	
	for (i = 0; i < N; i++) {
		memcpy(&V[i * 32], X, 128);
		xor_salsa8(&X[0], &X[16]);
		xor_salsa8(&X[16], &X[0]);
	}
	for (i = 0; i < N; i++) {
		j = 32 * (X[16] & (N - 1));
		for (k = 0; k < 32; k++)
			X[k] ^= V[j + k];
		xor_salsa8(&X[0], &X[16]);
		xor_salsa8(&X[16], &X[0]);
	}
}

#endif

#ifndef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 1
#define scrypt_best_throughput() 1
#endif

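/* The 63 extra bytes let the hashing routines round the scratchpad base
 * up to a 64-byte cache-line boundary. */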
unsigned char *scrypt_buffer_alloc(int N)
{
	return malloc((size_t)N * SCRYPT_MAX_WAYS * 128 + 63);
}

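/* One scrypt hash (r = 1, p = 1) of an 80-byte header; despite the
 * historical name, N is a runtime parameter.  PBKDF2 stretches the header
 * into X, the memory-hard core mixes X through V, and a final PBKDF2 pass
 * compresses X into the 256-bit output. */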
static void scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output,
	uint32_t *midstate, unsigned char *scratchpad, int N)
{
	uint32_t tstate[8], ostate[8];
	uint32_t X[32] __attribute__((aligned(128)));
	uint32_t *V;
	
	V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));

	memcpy(tstate, midstate, 32);
	HMAC_SHA256_80_init(input, tstate, ostate);
	PBKDF2_SHA256_80_128(tstate, ostate, input, X);

	scrypt_core(X, V, N);

	PBKDF2_SHA256_128_32(tstate, ostate, X, output);
}

#ifdef HAVE_SHA256_4WAY
static void scrypt_1024_1_1_256_4way(const uint32_t *input,
	uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N)
{
	uint32_t tstate[4 * 8] __attribute__((aligned(128)));
	uint32_t ostate[4 * 8] __attribute__((aligned(128)));
	uint32_t W[4 * 32] __attribute__((aligned(128)));
	uint32_t X[4 * 32] __attribute__((aligned(128)));
	uint32_t *V;
	int i, k;
	
	V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));

	for (i = 0; i < 20; i++)
		for (k = 0; k < 4; k++)
			W[4 * i + k] = input[k * 20 + i];
	for (i = 0; i < 8; i++)
		for (k = 0; k < 4; k++)
			tstate[4 * i + k] = midstate[i];
	HMAC_SHA256_80_init_4way(W, tstate, ostate);
	PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W);
	for (i = 0; i < 32; i++)
		for (k = 0; k < 4; k++)
			X[k * 32 + i] = W[4 * i + k];
	scrypt_core(X + 0 * 32, V, N);
	scrypt_core(X + 1 * 32, V, N);
	scrypt_core(X + 2 * 32, V, N);
	scrypt_core(X + 3 * 32, V, N);
	for (i = 0; i < 32; i++)
		for (k = 0; k < 4; k++)
			W[4 * i + k] = X[k * 32 + i];
	PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W);
	for (i = 0; i < 8; i++)
		for (k = 0; k < 4; k++)
			output[k * 8 + i] = W[4 * i + k];
}
#endif /* HAVE_SHA256_4WAY */

#ifdef HAVE_SCRYPT_3WAY

static void scrypt_1024_1_1_256_3way(const uint32_t *input,
	uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N)
{
	uint32_t tstate[3 * 8], ostate[3 * 8];
	uint32_t X[3 * 32] __attribute__((aligned(64)));
	uint32_t *V;
	
	V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));

	memcpy(tstate +  0, midstate, 32);
	memcpy(tstate +  8, midstate, 32);
	memcpy(tstate + 16, midstate, 32);
	HMAC_SHA256_80_init(input +  0, tstate +  0, ostate +  0);
	HMAC_SHA256_80_init(input + 20, tstate +  8, ostate +  8);
	HMAC_SHA256_80_init(input + 40, tstate + 16, ostate + 16);
	PBKDF2_SHA256_80_128(tstate +  0, ostate +  0, input +  0, X +  0);
	PBKDF2_SHA256_80_128(tstate +  8, ostate +  8, input + 20, X + 32);
	PBKDF2_SHA256_80_128(tstate + 16, ostate + 16, input + 40, X + 64);

	scrypt_core_3way(X, V, N);

	PBKDF2_SHA256_128_32(tstate +  0, ostate +  0, X +  0, output +  0);
	PBKDF2_SHA256_128_32(tstate +  8, ostate +  8, X + 32, output +  8);
	PBKDF2_SHA256_128_32(tstate + 16, ostate + 16, X + 64, output + 16);
}

#ifdef HAVE_SHA256_4WAY
static void scrypt_1024_1_1_256_12way(const uint32_t *input,
	uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N)
{
	uint32_t tstate[12 * 8] __attribute__((aligned(128)));
	uint32_t ostate[12 * 8] __attribute__((aligned(128)));
	uint32_t W[12 * 32] __attribute__((aligned(128)));
	uint32_t X[12 * 32] __attribute__((aligned(128)));
	uint32_t *V;
	int i, j, k;
	
	V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));

	for (j = 0; j < 3; j++)
		for (i = 0; i < 20; i++)
			for (k = 0; k < 4; k++)
				W[128 * j + 4 * i + k] = input[80 * j + k * 20 + i];
	for (j = 0; j < 3; j++)
		for (i = 0; i < 8; i++)
			for (k = 0; k < 4; k++)
				tstate[32 * j + 4 * i + k] = midstate[i];
	HMAC_SHA256_80_init_4way(W +   0, tstate +  0, ostate +  0);
	HMAC_SHA256_80_init_4way(W + 128, tstate + 32, ostate + 32);
	HMAC_SHA256_80_init_4way(W + 256, tstate + 64, ostate + 64);
	PBKDF2_SHA256_80_128_4way(tstate +  0, ostate +  0, W +   0, W +   0);
	PBKDF2_SHA256_80_128_4way(tstate + 32, ostate + 32, W + 128, W + 128);
	PBKDF2_SHA256_80_128_4way(tstate + 64, ostate + 64, W + 256, W + 256);
	for (j = 0; j < 3; j++)
		for (i = 0; i < 32; i++)
			for (k = 0; k < 4; k++)
				X[128 * j + k * 32 + i] = W[128 * j + 4 * i + k];
	scrypt_core_3way(X + 0 * 96, V, N);
	scrypt_core_3way(X + 1 * 96, V, N);
	scrypt_core_3way(X + 2 * 96, V, N);
	scrypt_core_3way(X + 3 * 96, V, N);
	for (j = 0; j < 3; j++)
		for (i = 0; i < 32; i++)
			for (k = 0; k < 4; k++)
				W[128 * j + 4 * i + k] = X[128 * j + k * 32 + i];
	PBKDF2_SHA256_128_32_4way(tstate +  0, ostate +  0, W +   0, W +   0);
	PBKDF2_SHA256_128_32_4way(tstate + 32, ostate + 32, W + 128, W + 128);
	PBKDF2_SHA256_128_32_4way(tstate + 64, ostate + 64, W + 256, W + 256);
	for (j = 0; j < 3; j++)
		for (i = 0; i < 8; i++)
			for (k = 0; k < 4; k++)
				output[32 * j + k * 8 + i] = W[128 * j + 4 * i + k];
}
#endif /* HAVE_SHA256_4WAY */

#endif /* HAVE_SCRYPT_3WAY */

#ifdef HAVE_SCRYPT_6WAY
static void scrypt_1024_1_1_256_24way(const uint32_t *input,
	uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N)
{
	uint32_t tstate[24 * 8] __attribute__((aligned(128)));
	uint32_t ostate[24 * 8] __attribute__((aligned(128)));
	uint32_t W[24 * 32] __attribute__((aligned(128)));
	uint32_t X[24 * 32] __attribute__((aligned(128)));
	uint32_t *V;
	int i, j, k;
	
	V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
	
	for (j = 0; j < 3; j++)
		for (i = 0; i < 20; i++)
			for (k = 0; k < 8; k++)
				W[8 * 32 * j + 8 * i + k] = input[8 * 20 * j + k * 20 + i];
	for (j = 0; j < 3; j++)
		for (i = 0; i < 8; i++)
			for (k = 0; k < 8; k++)
				tstate[8 * 8 * j + 8 * i + k] = midstate[i];
	HMAC_SHA256_80_init_8way(W +   0, tstate +   0, ostate +   0);
	HMAC_SHA256_80_init_8way(W + 256, tstate +  64, ostate +  64);
	HMAC_SHA256_80_init_8way(W + 512, tstate + 128, ostate + 128);
	PBKDF2_SHA256_80_128_8way(tstate +   0, ostate +   0, W +   0, W +   0);
	PBKDF2_SHA256_80_128_8way(tstate +  64, ostate +  64, W + 256, W + 256);
	PBKDF2_SHA256_80_128_8way(tstate + 128, ostate + 128, W + 512, W + 512);
	for (j = 0; j < 3; j++)
		for (i = 0; i < 32; i++)
			for (k = 0; k < 8; k++)
				X[8 * 32 * j + k * 32 + i] = W[8 * 32 * j + 8 * i + k];
	scrypt_core_6way(X + 0 * 32, V, N);
	scrypt_core_6way(X + 6 * 32, V, N);
	scrypt_core_6way(X + 12 * 32, V, N);
	scrypt_core_6way(X + 18 * 32, V, N);
	for (j = 0; j < 3; j++)
		for (i = 0; i < 32; i++)
			for (k = 0; k < 8; k++)
				W[8 * 32 * j + 8 * i + k] = X[8 * 32 * j + k * 32 + i];
	PBKDF2_SHA256_128_32_8way(tstate +   0, ostate +   0, W +   0, W +   0);
	PBKDF2_SHA256_128_32_8way(tstate +  64, ostate +  64, W + 256, W + 256);
	PBKDF2_SHA256_128_32_8way(tstate + 128, ostate + 128, W + 512, W + 512);
	for (j = 0; j < 3; j++)
		for (i = 0; i < 8; i++)
			for (k = 0; k < 8; k++)
				output[8 * 8 * j + k * 8 + i] = W[8 * 32 * j + 8 * i + k];
}
#endif /* HAVE_SCRYPT_6WAY */

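/* Scan nonces in groups of `throughput`, reusing the SHA-256 midstate of
 * the first 64 header bytes; returns 1 as soon as one lane passes both
 * the quick hash[7] check and the full target test. */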
int scanhash_scrypt(int thr_id, uint32_t *pdata,
	unsigned char *scratchbuf, const uint32_t *ptarget,
	uint32_t max_nonce, unsigned long *hashes_done, int N)
{
	uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8];
	uint32_t midstate[8];
	uint32_t n = pdata[19] - 1;
	const uint32_t Htarg = ptarget[7];
	int throughput = scrypt_best_throughput();
	int i;
	
#ifdef HAVE_SHA256_4WAY
	if (sha256_use_4way())
		throughput *= 4;
#endif
	
	for (i = 0; i < throughput; i++)
		memcpy(data + i * 20, pdata, 80);
	
	sha256_init(midstate);
	sha256_transform(midstate, data, 0);
	
	do {
		for (i = 0; i < throughput; i++)
			data[i * 20 + 19] = ++n;
		
#if defined(HAVE_SHA256_4WAY)
		if (throughput == 4)
			scrypt_1024_1_1_256_4way(data, hash, midstate, scratchbuf, N);
		else
#endif
#if defined(HAVE_SCRYPT_3WAY) && defined(HAVE_SHA256_4WAY)
		if (throughput == 12)
			scrypt_1024_1_1_256_12way(data, hash, midstate, scratchbuf, N);
		else
#endif
#if defined(HAVE_SCRYPT_6WAY)
		if (throughput == 24)
			scrypt_1024_1_1_256_24way(data, hash, midstate, scratchbuf, N);
		else
#endif
#if defined(HAVE_SCRYPT_3WAY)
		if (throughput == 3)
			scrypt_1024_1_1_256_3way(data, hash, midstate, scratchbuf, N);
		else
#endif
		scrypt_1024_1_1_256(data, hash, midstate, scratchbuf, N);
		
		for (i = 0; i < throughput; i++) {
			if (hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget)) {
				*hashes_done = n - pdata[19] + 1;
				pdata[19] = data[i * 20 + 19];
				return 1;
			}
		}
	} while (n < max_nonce && !work_restart[thr_id].restart);
	
	*hashes_done = n - pdata[19] + 1;
	pdata[19] = n;
	return 0;
}
07070100000028000081A4000003E800000064000000015EF4BCA10000A37A000000000000000000000000000000000000001A00000000cpuminer-2.5.1/sha2-arm.S/*
 * Copyright 2012 pooler@litecoinpool.org
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.  See COPYING for more details.
 */

#include "cpuminer-config.h"

#if defined(USE_ASM) && defined(__arm__) && defined(__APCS_32__)

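/* The 64 SHA-256 round constants K[0..63], wrapped in a macro so that
 * each routine can embed a copy within reach of PC-relative loads. */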
.macro sha256_k
	.align 2
	.long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
.endm

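/*
 * Two steps of the SHA-256 message schedule,
 *   W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16],
 * for t = i+16 and t = i+17; r11 and lr carry W[i] and W[i+1] in,
 * while \ra and \rb carry the W[i+9]/W[i+10] addends in and the two
 * new words out.
 */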
.macro sha256_extend_doubleround_core i, rw, ra, rb, ry, rz
	mov	r12, \ry, ror #17
	add	r11, r11, \ra
	eor	r12, r12, \ry, ror #19
	mov	\ra, lr, ror #7
	eor	r12, r12, \ry, lsr #10
	eor	\ra, \ra, lr, ror #18
	add	r12, r12, r11
	ldr	r11, [\rw, #(\i+2)*4]
	eor	\ra, \ra, lr, lsr #3
	add	\ra, \ra, r12

	mov	r12, \rz, ror #17
	str	\ra, [\rw, #(\i+16)*4]
	add	lr, lr, \rb
	eor	r12, r12, \rz, ror #19
	mov	\rb, r11, ror #7
	eor	r12, r12, \rz, lsr #10
	eor	\rb, \rb, r11, ror #18
	add	lr, lr, r12
	eor	\rb, \rb, r11, lsr #3
	add	\rb, \rb, lr
.endm

.macro sha256_extend_doubleround_head i, rw, ra, rb, ry, rz
	ldr	lr, [\rw, #(\i+1)*4]
	sha256_extend_doubleround_core \i, \rw, \ra, \rb, \ry, \rz
	ldr	lr, [\rw, #(\i+3)*4]
.endm

.macro sha256_extend_doubleround_body i, rw, ra, rb, ry, rz
	str	\rz, [\rw, #(\i+15)*4]
	sha256_extend_doubleround_core \i, \rw, \ra, \rb, \ry, \rz
	ldr	lr, [\rw, #(\i+3)*4]
.endm

.macro sha256_extend_doubleround_foot i, rw, ra, rb, ry, rz
	str	\rz, [\rw, #(\i+15)*4]
	sha256_extend_doubleround_core \i, \rw, \ra, \rb, \ry, \rz
	str	\rb, [\rw, #(\i+17)*4]
.endm

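/* One SHA-256 compression round: Ch via and/bic/orr, Maj via the a^b
 * masking trick, and the Sigma terms folded so that a single final
 * rotation (ror #6, ror #2) completes each of them. */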
.macro sha256_main_round i, ka, rw, ra, rb, rc, rd, re, rf, rg, rh
	ldr	r12, [\rw, #(\i)*4]
	and	r3, \rf, \re
	bic	lr, \rg, \re
	orr	lr, lr, r3
	ldr	r3, \ka + (\i)*4
	add	\rh, \rh, lr
	eor	lr, \re, \re, ror #5
	add	\rh, \rh, r12
	eor	lr, lr, \re, ror #19
	add	\rh, \rh, r3
	eor	r3, \ra, \rb
	add	\rh, \rh, lr, ror #6

	and	r3, r3, \rc
	eor	r12, \ra, \ra, ror #11
	and	lr, \ra, \rb
	eor	r12, r12, \ra, ror #20
	eor	lr, lr, r3
	add	r3, \rh, lr
	add	\rh, \rh, \rd
	add	\rd, r3, r12, ror #2
.endm

.macro sha256_main_quadround i, ka, rw
	sha256_main_round \i+0, \ka, \rw, r4, r5, r6, r7, r8, r9, r10, r11
	sha256_main_round \i+1, \ka, \rw, r7, r4, r5, r6, r11, r8, r9, r10
	sha256_main_round \i+2, \ka, \rw, r6, r7, r4, r5, r10, r11, r8, r9
	sha256_main_round \i+3, \ka, \rw, r5, r6, r7, r4, r9, r10, r11, r8
.endm


	.text
	.code 32
	.align 2
	.globl sha256_transform
	.globl _sha256_transform
#ifdef __ELF__
	.type sha256_transform, %function
#endif
sha256_transform:
_sha256_transform:
	stmfd	sp!, {r4-r11, lr}
	cmp	r2, #0
	sub	sp, sp, #64*4
	bne	sha256_transform_swap
	
	ldmia	r1!, {r4-r11}
	stmia	sp, {r4-r11}
	add	r3, sp, #8*4
	ldmia	r1, {r4-r11}
	stmia	r3, {r4-r11}
	b	sha256_transform_extend

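/* 32-bit byte swap using only ARMv4 operations, avoiding the ARMv6-only
 * rev instruction. */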
.macro bswap rd, rn
	eor	r12, \rn, \rn, ror #16
	bic	r12, r12, #0x00ff0000
	mov	\rd, \rn, ror #8
	eor	\rd, \rd, r12, lsr #8
.endm

sha256_transform_swap:
	ldmia	r1!, {r4-r11}
	bswap	r4, r4
	bswap	r5, r5
	bswap	r6, r6
	bswap	r7, r7
	bswap	r8, r8
	bswap	r9, r9
	bswap	r10, r10
	bswap	r11, r11
	stmia	sp, {r4-r11}
	add	r3, sp, #8*4
	ldmia	r1, {r4-r11}
	bswap	r4, r4
	bswap	r5, r5
	bswap	r6, r6
	bswap	r7, r7
	bswap	r8, r8
	bswap	r9, r9
	bswap	r10, r10
	bswap	r11, r11
	stmia	r3, {r4-r11}
	
sha256_transform_extend:
	add	r12, sp, #9*4
	ldr	r11, [sp, #0*4]
	ldmia	r12, {r4-r10}
	sha256_extend_doubleround_head  0, sp, r4, r5, r9, r10
	sha256_extend_doubleround_body  2, sp, r6, r7, r4, r5
	sha256_extend_doubleround_body  4, sp, r8, r9, r6, r7
	sha256_extend_doubleround_body  6, sp, r10, r4, r8, r9
	sha256_extend_doubleround_body  8, sp, r5, r6, r10, r4
	sha256_extend_doubleround_body 10, sp, r7, r8, r5, r6
	sha256_extend_doubleround_body 12, sp, r9, r10, r7, r8
	sha256_extend_doubleround_body 14, sp, r4, r5, r9, r10
	sha256_extend_doubleround_body 16, sp, r6, r7, r4, r5
	sha256_extend_doubleround_body 18, sp, r8, r9, r6, r7
	sha256_extend_doubleround_body 20, sp, r10, r4, r8, r9
	sha256_extend_doubleround_body 22, sp, r5, r6, r10, r4
	sha256_extend_doubleround_body 24, sp, r7, r8, r5, r6
	sha256_extend_doubleround_body 26, sp, r9, r10, r7, r8
	sha256_extend_doubleround_body 28, sp, r4, r5, r9, r10
	sha256_extend_doubleround_body 30, sp, r6, r7, r4, r5
	sha256_extend_doubleround_body 32, sp, r8, r9, r6, r7
	sha256_extend_doubleround_body 34, sp, r10, r4, r8, r9
	sha256_extend_doubleround_body 36, sp, r5, r6, r10, r4
	sha256_extend_doubleround_body 38, sp, r7, r8, r5, r6
	sha256_extend_doubleround_body 40, sp, r9, r10, r7, r8
	sha256_extend_doubleround_body 42, sp, r4, r5, r9, r10
	sha256_extend_doubleround_body 44, sp, r6, r7, r4, r5
	sha256_extend_doubleround_foot 46, sp, r8, r9, r6, r7
	
	ldmia	r0, {r4-r11}
	sha256_main_quadround  0, sha256_transform_k, sp
	sha256_main_quadround  4, sha256_transform_k, sp
	sha256_main_quadround  8, sha256_transform_k, sp
	sha256_main_quadround 12, sha256_transform_k, sp
	sha256_main_quadround 16, sha256_transform_k, sp
	sha256_main_quadround 20, sha256_transform_k, sp
	sha256_main_quadround 24, sha256_transform_k, sp
	sha256_main_quadround 28, sha256_transform_k, sp
	b	sha256_transform_k_over
sha256_transform_k:
	sha256_k
sha256_transform_k_over:
	sha256_main_quadround 32, sha256_transform_k, sp
	sha256_main_quadround 36, sha256_transform_k, sp
	sha256_main_quadround 40, sha256_transform_k, sp
	sha256_main_quadround 44, sha256_transform_k, sp
	sha256_main_quadround 48, sha256_transform_k, sp
	sha256_main_quadround 52, sha256_transform_k, sp
	sha256_main_quadround 56, sha256_transform_k, sp
	sha256_main_quadround 60, sha256_transform_k, sp
	
	ldmia	r0, {r1, r2, r3, r12}
	add	r4, r4, r1
	add	r5, r5, r2
	add	r6, r6, r3
	add	r7, r7, r12
	stmia	r0!, {r4-r7}
	ldmia	r0, {r1, r2, r3, r12}
	add	r8, r8, r1
	add	r9, r9, r2
	add	r10, r10, r3
	add	r11, r11, r12
	stmia	r0, {r8-r11}
	
	add	sp, sp, #64*4
#ifdef __thumb__
	ldmfd	sp!, {r4-r11, lr}
	bx	lr
#else
	ldmfd	sp!, {r4-r11, pc}
#endif


	.text
	.code 32
	.align 2
	.globl sha256d_ms
	.globl _sha256d_ms
#ifdef __ELF__
	.type sha256d_ms, %function
#endif
sha256d_ms:
_sha256d_ms:
	stmfd	sp!, {r4-r11, lr}
	sub	sp, sp, #64*4
	
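	/* Set the Z flag, so that the conditional exits to the coda and
	 * finish code below fall through on this first hashing pass. */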
	cmp	r0, r0
	
	ldr	lr, [r1, #3*4]
	ldr	r6, [r1, #18*4]
	ldr	r7, [r1, #19*4]
	
	mov	r12, lr, ror #7
	str	r6, [sp, #18*4]
	eor	r12, r12, lr, ror #18
	str	r7, [sp, #19*4]
	eor	r12, r12, lr, lsr #3
	ldr	r8, [r1, #20*4]
	add	r6, r6, r12
	ldr	r10, [r1, #22*4]
	add	r7, r7, lr
	str	r6, [r1, #18*4]
	
	mov	r12, r6, ror #17
	str	r7, [r1, #19*4]
	eor	r12, r12, r6, ror #19
	str	r8, [sp, #20*4]
	eor	r12, r12, r6, lsr #10
	ldr	r4, [r1, #23*4]
	add	r8, r8, r12
	ldr	r5, [r1, #24*4]
	
	mov	r9, r7, ror #17
	str	r8, [r1, #20*4]
	eor	r9, r9, r7, ror #19
	str	r10, [sp, #21*4]
	eor	r9, r9, r7, lsr #10
	str	r4, [sp, #22*4]
	
	mov	r12, r8, ror #17
	str	r9, [r1, #21*4]
	eor	r12, r12, r8, ror #19
	str	r5, [sp, #23*4]
	eor	r12, r12, r8, lsr #10
	mov	lr, r9, ror #17
	add	r10, r10, r12
	ldr	r11, [r1, #30*4]
	
	eor	lr, lr, r9, ror #19
	str	r10, [r1, #22*4]
	eor	lr, lr, r9, lsr #10
	str	r11, [sp, #24*4]
	add	r4, r4, lr
	
	mov	r12, r10, ror #17
	str	r4, [r1, #23*4]
	eor	r12, r12, r10, ror #19
	mov	lr, r4, ror #17
	eor	r12, r12, r10, lsr #10
	eor	lr, lr, r4, ror #19
	add	r5, r5, r12
	eor	lr, lr, r4, lsr #10
	str	r5, [r1, #24*4]
	add	r6, r6, lr
	
	mov	r12, r5, ror #17
	str	r6, [r1, #25*4]
	eor	r12, r12, r5, ror #19
	mov	lr, r6, ror #17
	eor	r12, r12, r5, lsr #10
	eor	lr, lr, r6, ror #19
	add	r7, r7, r12
	eor	lr, lr, r6, lsr #10
	str	r7, [r1, #26*4]
	add	r8, r8, lr
	
	mov	r12, r7, ror #17
	str	r8, [r1, #27*4]
	eor	r12, r12, r7, ror #19
	mov	lr, r8, ror #17
	eor	r12, r12, r7, lsr #10
	eor	lr, lr, r8, ror #19
	add	r9, r9, r12
	eor	lr, lr, r8, lsr #10
	str	r9, [r1, #28*4]
	add	r10, r10, lr
	
	ldr	lr, [r1, #31*4]
	mov	r12, r9, ror #17
	str	r10, [r1, #29*4]
	eor	r12, r12, r9, ror #19
	str	lr, [sp, #25*4]
	eor	r12, r12, r9, lsr #10
	add	r11, r11, r12
	add	r5, r5, lr
	mov	r12, r10, ror #17
	add	r4, r4, r11
	
	ldr	r11, [r1, #16*4]
	eor	r12, r12, r10, ror #19
	str	r4, [r1, #30*4]
	eor	r12, r12, r10, lsr #10
	add	r5, r5, r12
	ldr	lr, [r1, #17*4]
	
sha256d_ms_extend_loop2:
	sha256_extend_doubleround_body 16, r1, r6, r7, r4, r5
	sha256_extend_doubleround_body 18, r1, r8, r9, r6, r7
	sha256_extend_doubleround_body 20, r1, r10, r4, r8, r9
	sha256_extend_doubleround_body 22, r1, r5, r6, r10, r4
	sha256_extend_doubleround_body 24, r1, r7, r8, r5, r6
	sha256_extend_doubleround_body 26, r1, r9, r10, r7, r8
	sha256_extend_doubleround_body 28, r1, r4, r5, r9, r10
	sha256_extend_doubleround_body 30, r1, r6, r7, r4, r5
	sha256_extend_doubleround_body 32, r1, r8, r9, r6, r7
	sha256_extend_doubleround_body 34, r1, r10, r4, r8, r9
	sha256_extend_doubleround_body 36, r1, r5, r6, r10, r4
	sha256_extend_doubleround_body 38, r1, r7, r8, r5, r6
	sha256_extend_doubleround_body 40, r1, r9, r10, r7, r8
	sha256_extend_doubleround_body 42, r1, r4, r5, r9, r10
	bne	sha256d_ms_extend_coda2
	sha256_extend_doubleround_body 44, r1, r6, r7, r4, r5
	sha256_extend_doubleround_foot 46, r1, r8, r9, r6, r7
	
	ldr	r4,  [r3, #0*4]
	ldr	r9,  [r3, #1*4]
	ldr	r10, [r3, #2*4]
	ldr	r11, [r3, #3*4]
	ldr	r8,  [r3, #4*4]
	ldr	r5,  [r3, #5*4]
	ldr	r6,  [r3, #6*4]
	ldr	r7,  [r3, #7*4]
	b	sha256d_ms_main_loop1
	
sha256d_ms_main_loop2:
	sha256_main_round  0, sha256d_ms_k, r1, r4, r5, r6, r7, r8, r9, r10, r11
	sha256_main_round  1, sha256d_ms_k, r1, r7, r4, r5, r6, r11, r8, r9, r10
	sha256_main_round  2, sha256d_ms_k, r1, r6, r7, r4, r5, r10, r11, r8, r9
sha256d_ms_main_loop1:
	sha256_main_round  3, sha256d_ms_k, r1, r5, r6, r7, r4, r9, r10, r11, r8
	sha256_main_quadround  4, sha256d_ms_k, r1
	sha256_main_quadround  8, sha256d_ms_k, r1
	sha256_main_quadround 12, sha256d_ms_k, r1
	sha256_main_quadround 16, sha256d_ms_k, r1
	sha256_main_quadround 20, sha256d_ms_k, r1
	sha256_main_quadround 24, sha256d_ms_k, r1
	sha256_main_quadround 28, sha256d_ms_k, r1
	b	sha256d_ms_k_over
sha256d_ms_k:
	sha256_k
sha256d_ms_k_over:
	sha256_main_quadround 32, sha256d_ms_k, r1
	sha256_main_quadround 36, sha256d_ms_k, r1
	sha256_main_quadround 40, sha256d_ms_k, r1
	sha256_main_quadround 44, sha256d_ms_k, r1
	sha256_main_quadround 48, sha256d_ms_k, r1
	sha256_main_quadround 52, sha256d_ms_k, r1
	sha256_main_round 56, sha256d_ms_k, r1, r4, r5, r6, r7, r8, r9, r10, r11
	bne	sha256d_ms_finish
	sha256_main_round 57, sha256d_ms_k, r1, r7, r4, r5, r6, r11, r8, r9, r10
	sha256_main_round 58, sha256d_ms_k, r1, r6, r7, r4, r5, r10, r11, r8, r9
	sha256_main_round 59, sha256d_ms_k, r1, r5, r6, r7, r4, r9, r10, r11, r8
	sha256_main_quadround 60, sha256d_ms_k, r1
	
	ldmia	r2!, {r3, r12, lr}
	add	r4, r4, r3
	add	r5, r5, r12
	add	r6, r6, lr
	stmia	sp, {r4-r6}
	ldmia	r2, {r3, r4, r5, r6, r12}
	add	lr, sp, #3*4
	add	r7, r7, r3
	add	r8, r8, r4
	add	r9, r9, r5
	add	r10, r10, r6
	add	r11, r11, r12
	add	r12, sp, #18*4
	stmia	lr!, {r7-r11}
	
	ldmia	r12, {r4-r11}
	str	r4,  [r1, #18*4]
	str	r5,  [r1, #19*4]
	str	r6,  [r1, #20*4]
	str	r7,  [r1, #22*4]
	str	r8,  [r1, #23*4]
	str	r9,  [r1, #24*4]
	str	r10, [r1, #30*4]
	str	r11, [r1, #31*4]
	
	mov	r3,  #0x80000000
	mov	r4,  #0
	mov	r5,  #0
	mov	r6,  #0
	mov	r7,  #0
	mov	r8,  #0
	mov	r9,  #0
	mov	r10, #0x00000100
	stmia	lr, {r3-r10}
	
	ldr	lr, [sp, #1*4]
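	/* movs also clears the Z flag (sp is nonzero), steering the second
	 * hashing pass into the extend coda and finish code. */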
	movs	r1, sp
	ldr	r4, [sp, #0*4]
	
	ldr	r11, [sp, #2*4]
	mov	r12, lr, ror #7
	eor	r12, r12, lr, ror #18
	add	r5, lr, #0x00a00000
	eor	r12, r12, lr, lsr #3
	mov	lr, r11, ror #7
	add	r4, r4, r12
	eor	lr, lr, r11, ror #18
	str	r4, [sp, #16*4]
	eor	lr, lr, r11, lsr #3
	mov	r12, r4, ror #17
	add	r5, r5, lr
	ldr	lr, [sp, #3*4]
	
	str	r5, [sp, #17*4]
	eor	r12, r12, r4, ror #19
	mov	r6, lr, ror #7
	eor	r12, r12, r4, lsr #10
	eor	r6, r6, lr, ror #18
	add	r11, r11, r12
	eor	r6, r6, lr, lsr #3
	mov	r12, r5, ror #17
	add	r6, r6, r11
	ldr	r11, [sp, #4*4]
	
	str	r6, [sp, #18*4]
	eor	r12, r12, r5, ror #19
	mov	r7, r11, ror #7
	eor	r12, r12, r5, lsr #10
	eor	r7, r7, r11, ror #18
	add	lr, lr, r12
	eor	r7, r7, r11, lsr #3
	mov	r12, r6, ror #17
	add	r7, r7, lr
	ldr	lr, [sp, #5*4]
	
	str	r7, [sp, #19*4]
	eor	r12, r12, r6, ror #19
	mov	r8, lr, ror #7
	eor	r12, r12, r6, lsr #10
	eor	r8, r8, lr, ror #18
	add	r11, r11, r12
	eor	r8, r8, lr, lsr #3
	mov	r12, r7, ror #17
	add	r8, r8, r11
	ldr	r11, [sp, #6*4]
	
	str	r8, [sp, #20*4]
	eor	r12, r12, r7, ror #19
	mov	r9, r11, ror #7
	eor	r12, r12, r7, lsr #10
	eor	r9, r9, r11, ror #18
	add	lr, lr, r12
	eor	r9, r9, r11, lsr #3
	mov	r12, r8, ror #17
	add	r9, r9, lr
	ldr	lr, [sp, #7*4]
	
	str	r9, [sp, #21*4]
	eor	r12, r12, r8, ror #19
	mov	r10, lr, ror #7
	eor	r12, r12, r8, lsr #10
	eor	r10, r10, lr, ror #18
	add	r11, r11, r12
	eor	r10, r10, lr, lsr #3
	mov	r12, r9, ror #17
	add	r11, r11, #0x00000100
	add	lr, lr, r4
	add	r10, r10, r11
	
	eor	r12, r12, r9, ror #19
	str	r10, [sp, #22*4]
	add	lr, lr, #0x11000000
	eor	r12, r12, r9, lsr #10
	add	lr, lr, r12
	mov	r12, r10, ror #17
	add	r4, lr, #0x00002000
	eor	r12, r12, r10, ror #19
	str	r4, [sp, #23*4]
	add	r5, r5, #0x80000000
	eor	r12, r12, r10, lsr #10
	add	r5, r5, r12

	mov	r12, r4, ror #17
	str	r5, [sp, #24*4]
	eor	r12, r12, r4, ror #19
	mov	r11, r5, ror #17
	eor	r12, r12, r4, lsr #10
	eor	r11, r11, r5, ror #19
	add	r6, r6, r12
	eor	r11, r11, r5, lsr #10
	str	r6, [sp, #25*4]
	add	r7, r7, r11
	
	mov	r12, r6, ror #17
	str	r7, [sp, #26*4]
	eor	r12, r12, r6, ror #19
	mov	r11, r7, ror #17
	eor	r12, r12, r6, lsr #10
	eor	r11, r11, r7, ror #19
	add	r8, r8, r12
	eor	r11, r11, r7, lsr #10
	str	r8, [sp, #27*4]
	add	r9, r9, r11
	
	mov	lr, r8, ror #17
	mov	r12, r9, ror #17
	str	r9, [sp, #28*4]
	add	r4, r4, #0x00400000
	eor	lr, lr, r8, ror #19
	eor	r12, r12, r9, ror #19
	eor	lr, lr, r8, lsr #10
	eor	r12, r12, r9, lsr #10
	add	r4, r4, #0x00000022
	add	r10, r10, lr
	add	r4, r4, r12
	ldr	r11, [sp, #16*4]
	
	add	r5, r5, #0x00000100
	str	r4, [sp, #30*4]
	mov	lr, r11, ror #7
	str	r10, [sp, #29*4]
	mov	r12, r10, ror #17
	eor	lr, lr, r11, ror #18
	eor	r12, r12, r10, ror #19
	eor	lr, lr, r11, lsr #3
	eor	r12, r12, r10, lsr #10
	add	r5, r5, lr
	ldr	lr, [r1, #17*4]
	add	r5, r5, r12
	
	b	sha256d_ms_extend_loop2
	
sha256d_ms_extend_coda2:
	str	r5, [r1, #(44+15)*4]
	mov	r12, r4, ror #17
	add	r11, r11, r6
	mov	r6, lr, ror #7
	eor	r12, r12, r4, ror #19
	eor	r6, r6, lr, ror #18
	eor	r12, r12, r4, lsr #10
	eor	r6, r6, lr, lsr #3
	add	r12, r12, r11
	add	r6, r6, r12
	str	r6, [r1, #(44+16)*4]
	
	adr	r2, sha256d_ms_h
	ldmia	r2, {r4-r11}
	b	sha256d_ms_main_loop2

sha256d_ms_h:
	.long 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a
	.long 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19

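/* Reduced final round: computes only the new h, which is all that is
 * needed to produce the single output word of sha256d_ms. */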
.macro sha256_main_round_red i, ka, rw, rd, re, rf, rg, rh
	ldr	r12, [\rw, #(\i)*4]
	and	r3, \rf, \re
	bic	lr, \rg, \re
	add	\rh, \rh, \rd
	orr	lr, lr, r3
	ldr	r3, \ka + (\i)*4
	add	\rh, \rh, lr
	eor	lr, \re, \re, ror #5
	add	\rh, \rh, r12
	eor	lr, lr, \re, ror #19
	add	\rh, \rh, r3
	add	\rh, \rh, lr, ror #6
.endm
	
sha256d_ms_finish:
	sha256_main_round_red 57, sha256d_ms_k, r1, r6, r11, r8, r9, r10
	sha256_main_round_red 58, sha256d_ms_k, r1, r5, r10, r11, r8, r9
	sha256_main_round_red 59, sha256d_ms_k, r1, r4, r9, r10, r11, r8
	ldr	r5, [r2, #7*4]
	sha256_main_round_red 60, sha256d_ms_k, r1, r7, r8, r9, r10, r11
	
	add	r11, r11, r5
	str	r11, [r0, #7*4]
	
	add	sp, sp, #64*4
#ifdef __thumb__
	ldmfd	sp!, {r4-r11, lr}
	bx	lr
#else
	ldmfd	sp!, {r4-r11, pc}
#endif


#ifdef __ARM_NEON__

	.text
	.code 32
	.align 2
	.globl sha256_init_4way
	.globl _sha256_init_4way
#ifdef __ELF__
	.type sha256_init_4way, %function
#endif
sha256_init_4way:
_sha256_init_4way:
	adr	r12, sha256_4h
	vldmia	r12, {q8-q15}
	vstmia	r0, {q8-q15}
	bx	lr
	.align 4
sha256_4h:
	.long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
	.long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
	.long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
	.long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
	.long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
	.long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
	.long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
	.long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19

.macro sha256_4k
	.long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98
	.long 0x71374491, 0x71374491, 0x71374491, 0x71374491
	.long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf
	.long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5
	.long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b
	.long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1
	.long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4
	.long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5
	.long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98
	.long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01
	.long 0x243185be, 0x243185be, 0x243185be, 0x243185be
	.long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3
	.long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74
	.long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe
	.long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7
	.long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174
	.long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1
	.long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786
	.long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6
	.long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc
	.long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f
	.long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa
	.long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc
	.long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da
	.long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152
	.long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d
	.long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8
	.long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7
	.long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3
	.long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147
	.long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351
	.long 0x14292967, 0x14292967, 0x14292967, 0x14292967
	.long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85
	.long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138
	.long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc
	.long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13
	.long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354
	.long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb
	.long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e
	.long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85
	.long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1
	.long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b
	.long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70
	.long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3
	.long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819
	.long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624
	.long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585
	.long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070
	.long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116
	.long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08
	.long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c
	.long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5
	.long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3
	.long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a
	.long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f
	.long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3
	.long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee
	.long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f
	.long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814
	.long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208
	.long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa
	.long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb
	.long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7
	.long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2
.endm

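/* NEON four-lane version of the schedule double-round: q0/q1/q4 hold
 * the s0/s1 scratch terms, q5/q6 stream the low schedule words in from
 * \rr, and the two new words are built in \ra/\rb and stored via \rw. */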
.macro sha256_4way_extend_doubleround_core i, rr, rw, ra, rb, ry, rz
	vadd.u32	q5, q5, \ra
	veor.u32	q4, q4, q0
	vshr.u32	q0, \ry, #19
	vshl.u32	q1, \ry, #32-19
	veor.u32	q4, q4, q0
	vshr.u32	\ra, q6, #7
	vshl.u32	q0, q6, #32-7
	veor.u32	q4, q4, q1
	veor.u32	\ra, \ra, q0
	vshr.u32	q1, \ry, #10
	vshr.u32	q0, q6, #18
	veor.u32	q4, q4, q1
	veor.u32	\ra, \ra, q0
	vshl.u32	q1, q6, #32-18
	vshr.u32	q0, q6, #3
	veor.u32	\ra, \ra, q1
	vadd.u32	q4, q4, q5
	veor.u32	\ra, \ra, q0
	vld1.u32	{q5}, [\rr]!
	vadd.u32	\ra, \ra, q4

	vshr.u32	q4, \rz, #17
	vshl.u32	q0, \rz, #32-17
	vadd.u32	q6, q6, \rb
	vst1.u32	{\ra}, [\rw]!
	veor.u32	q4, q4, q0
	vshr.u32	q0, \rz, #19
	vshl.u32	q1, \rz, #32-19
	veor.u32	q4, q4, q0
	vshr.u32	\rb, q5, #7
	veor.u32	q4, q4, q1
	vshl.u32	q0, q5, #32-7
	vshr.u32	q1, \rz, #10
	veor.u32	\rb, \rb, q0
	vshr.u32	q0, q5, #18
	veor.u32	q4, q4, q1
	veor.u32	\rb, \rb, q0
	vshl.u32	q1, q5, #32-18
	vshr.u32	q0, q5, #3
	veor.u32	\rb, \rb, q1
	vadd.u32	q1, q6, q4
	veor.u32	\rb, \rb, q0
.endm

.macro sha256_4way_extend_doubleround_head i, rr, rw, ra, rb, ry, rz
	vld1.u32	{q6}, [\rr]!
	vshr.u32	q4, \ry, #17
	vshl.u32	q0, \ry, #32-17
	sha256_4way_extend_doubleround_core \i, \rr, \rw, \ra, \rb, \ry, \rz
	vld1.u32	{q6}, [\rr]!
	vadd.u32	\rb, \rb, q1
.endm

.macro sha256_4way_extend_doubleround_body i, rr, rw, ra, rb, ry, rz
	vshr.u32	q4, \ry, #17
	vshl.u32	q0, \ry, #32-17
	vst1.u32	{\rz}, [\rw]!
	sha256_4way_extend_doubleround_core \i, \rr, \rw, \ra, \rb, \ry, \rz
	vld1.u32	{q6}, [\rr]!
	vadd.u32	\rb, \rb, q1
.endm

.macro sha256_4way_extend_doubleround_foot i, rr, rw, ra, rb, ry, rz
	vshr.u32	q4, \ry, #17
	vshl.u32	q0, \ry, #32-17
	vst1.u32	{\rz}, [\rw]!
	sha256_4way_extend_doubleround_core \i, \rr, \rw, \ra, \rb, \ry, \rz
	vadd.u32	\rb, \rb, q1
	vst1.u32	{\rb}, [\rw]!
.endm

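/* NEON four-lane SHA-256 main round; rotations are synthesized from
 * shift pairs, as NEON has no 32-bit rotate instruction. */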
.macro sha256_4way_main_round i, rk, rw, ra, rb, rc, rd, re, rf, rg, rh
	vld1.u32	{q8}, [\rw]!
	vand.u32	q9, \rf, \re
	vbic.u32	q10, \rg, \re
	vshr.u32	q11, \re, #5
	vorr.u32	q10, q10, q9
	vld1.u32	{q9}, [\rk]!
	vadd.u32	\rh, \rh, q10
	vshl.u32	q12, \re, #32-5
	veor.u32	q10, \re, q11
	vshr.u32	q11, \re, #19
	veor.u32	q10, q10, q12
	vshl.u32	q12, \re, #32-19
	veor.u32	q10, q10, q11
	vadd.u32	\rh, \rh, q8
	veor.u32	q10, q10, q12
	vadd.u32	\rh, \rh, q9
	veor.u32	q9, \ra, \rb
	vshr.u32	q11, q10, #6
	vshl.u32	q13, q10, #32-6
	vadd.u32	\rh, \rh, q11

	vshr.u32	q11, \ra, #11
	vshl.u32	q12, \ra, #32-11
	veor.u32	q8, \ra, q11
	vand.u32	q10, \ra, \rb
	veor.u32	q8, q8, q12
	vshr.u32	q11, \ra, #20
	vshl.u32	q12, \ra, #32-20
	veor.u32	q8, q8, q11
	vand.u32	q9, q9, \rc
	veor.u32	q8, q8, q12
	vadd.u32	\rh, \rh, q13
	veor.u32	q10, q10, q9
	vshr.u32	q11, q8, #2
	vshl.u32	q12, q8, #32-2
	vadd.u32	q9, \rh, q10
	vadd.u32	q12, q12, q11
	vadd.u32	\rh, \rh, \rd
	vadd.u32	\rd, q9, q12
.endm

.macro sha256_4way_main_quadround i, rk, rw
	sha256_4way_main_round \i+0, \rk, \rw, q0, q1, q2, q3, q4, q5, q6, q7
	sha256_4way_main_round \i+1, \rk, \rw, q3, q0, q1, q2, q7, q4, q5, q6
	sha256_4way_main_round \i+2, \rk, \rw, q2, q3, q0, q1, q6, q7, q4, q5
	sha256_4way_main_round \i+3, \rk, \rw, q1, q2, q3, q0, q5, q6, q7, q4
.endm


	.text
	.code 32
	.align 2
	.globl sha256_transform_4way
	.globl _sha256_transform_4way
#ifdef __ELF__
	.type sha256_transform_4way, %function
#endif
sha256_transform_4way:
_sha256_transform_4way:
	stmfd	sp!, {r4, lr}
	vpush	{q4-q7}
	mov	r12, sp
	sub	sp, sp, #64*16
	bic	sp, sp, #63
	cmp	r2, #0
	bne	sha256_transform_4way_swap
	
	vldmia	r1!, {q0-q7}
	vstmia	sp, {q0-q7}
	add	r3, sp, #8*16
	vldmia	r1, {q8-q15}
	vstmia	r3, {q8-q15}
	b	sha256_transform_4way_extend

sha256_transform_4way_swap:
	vldmia	r1!, {q0-q7}
	vrev32.8	q0, q0
	vrev32.8	q1, q1
	vrev32.8	q2, q2
	vrev32.8	q3, q3
	vldmia	r1, {q8-q15}
	vrev32.8	q4, q4
	vrev32.8	q5, q5
	vrev32.8	q6, q6
	vrev32.8	q7, q7
	vstmia	sp, {q0-q7}
	vrev32.8	q8, q8
	vrev32.8	q9, q9
	vrev32.8	q10, q10
	vrev32.8	q11, q11
	vrev32.8	q12, q12
	vrev32.8	q13, q13
	vrev32.8	q14, q14
	vrev32.8	q15, q15
	add	r3, sp, #8*16
	vstmia	r3, {q8-q15}
	
sha256_transform_4way_extend:
	add	r1, sp, #1*16
	add	r2, sp, #16*16
	vmov.u32	q5, q0
	sha256_4way_extend_doubleround_head  0, r1, r2,  q9, q10, q14, q15
	sha256_4way_extend_doubleround_body  2, r1, r2, q11, q12,  q9, q10
	sha256_4way_extend_doubleround_body  4, r1, r2, q13, q14, q11, q12
	sha256_4way_extend_doubleround_body  6, r1, r2, q15,  q9, q13, q14
	sha256_4way_extend_doubleround_body  8, r1, r2, q10, q11, q15,  q9
	sha256_4way_extend_doubleround_body 10, r1, r2, q12, q13, q10, q11
	sha256_4way_extend_doubleround_body 12, r1, r2, q14, q15, q12, q13
	sha256_4way_extend_doubleround_body 14, r1, r2,  q9, q10, q14, q15
	sha256_4way_extend_doubleround_body 16, r1, r2, q11, q12,  q9, q10
	sha256_4way_extend_doubleround_body 18, r1, r2, q13, q14, q11, q12
	sha256_4way_extend_doubleround_body 20, r1, r2, q15,  q9, q13, q14
	sha256_4way_extend_doubleround_body 22, r1, r2, q10, q11, q15,  q9
	sha256_4way_extend_doubleround_body 24, r1, r2, q12, q13, q10, q11
	sha256_4way_extend_doubleround_body 26, r1, r2, q14, q15, q12, q13
	sha256_4way_extend_doubleround_body 28, r1, r2,  q9, q10, q14, q15
	sha256_4way_extend_doubleround_body 30, r1, r2, q11, q12,  q9, q10
	sha256_4way_extend_doubleround_body 32, r1, r2, q13, q14, q11, q12
	sha256_4way_extend_doubleround_body 34, r1, r2, q15,  q9, q13, q14
	sha256_4way_extend_doubleround_body 36, r1, r2, q10, q11, q15,  q9
	sha256_4way_extend_doubleround_body 38, r1, r2, q12, q13, q10, q11
	sha256_4way_extend_doubleround_body 40, r1, r2, q14, q15, q12, q13
	sha256_4way_extend_doubleround_body 42, r1, r2,  q9, q10, q14, q15
	sha256_4way_extend_doubleround_body 44, r1, r2, q11, q12,  q9, q10
	sha256_4way_extend_doubleround_foot 46, r1, r2, q13, q14, q11, q12
	
	vldmia	r0, {q0-q7}
	adr	r4, sha256_transform_4way_4k
	b	sha256_transform_4way_4k_over
	.align 4
sha256_transform_4way_4k:
	sha256_4k
sha256_transform_4way_4k_over:
	sha256_4way_main_quadround  0, r4, sp
	sha256_4way_main_quadround  4, r4, sp
	sha256_4way_main_quadround  8, r4, sp
	sha256_4way_main_quadround 12, r4, sp
	sha256_4way_main_quadround 16, r4, sp
	sha256_4way_main_quadround 20, r4, sp
	sha256_4way_main_quadround 24, r4, sp
	sha256_4way_main_quadround 28, r4, sp
	sha256_4way_main_quadround 32, r4, sp
	sha256_4way_main_quadround 36, r4, sp
	sha256_4way_main_quadround 40, r4, sp
	sha256_4way_main_quadround 44, r4, sp
	sha256_4way_main_quadround 48, r4, sp
	sha256_4way_main_quadround 52, r4, sp
	sha256_4way_main_quadround 56, r4, sp
	sha256_4way_main_quadround 60, r4, sp
	
	vldmia	r0, {q8-q15}
	vadd.u32	q0, q0, q8
	vadd.u32	q1, q1, q9
	vadd.u32	q2, q2, q10
	vadd.u32	q3, q3, q11
	vadd.u32	q4, q4, q12
	vadd.u32	q5, q5, q13
	vadd.u32	q6, q6, q14
	vadd.u32	q7, q7, q15
	vstmia	r0, {q0-q7}
	
	mov	sp, r12
	vpop	{q4-q7}
	ldmfd	sp!, {r4, pc}
	

	.text
	.code 32
	.align 2
	.globl sha256d_ms_4way
	.globl _sha256d_ms_4way
#ifdef __ELF__
	.type sha256d_ms_4way, %function
#endif
sha256d_ms_4way:
_sha256d_ms_4way:
	stmfd	sp!, {r4, lr}
	vpush	{q4-q7}
	mov	r12, sp
	sub	sp, sp, #64*16
	bic	sp, sp, #63
	
	add	r4, r1, #3*16
	vld1.u32	{q6}, [r4]!
	add	r1, r1, #18*16
	vldmia	r1, {q11-q13}
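	/* Set the Z flag, so that the conditional exits to the coda and
	 * finish code below fall through on this first hashing pass. */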
	cmp	r0, r0
	
	vshr.u32	q10, q6, #7
	vshl.u32	q0, q6, #32-7
	vshr.u32	q1, q6, #18
	veor.u32	q10, q10, q0
	vshl.u32	q0, q6, #32-18
	veor.u32	q10, q10, q1
	vshr.u32	q1, q6, #3
	veor.u32	q10, q10, q0
	vstmia	sp!, {q11-q13}
	veor.u32	q4, q10, q1
	vadd.u32	q12, q12, q6
	vadd.u32	q11, q11, q4
	
	vshr.u32	q14, q12, #17
	vshr.u32	q4, q11, #17
	vshl.u32	q0, q11, #32-17
	vst1.u32	{q11}, [r1]!
	veor.u32	q4, q4, q0
	vshr.u32	q0, q11, #19
	vshl.u32	q1, q11, #32-19
	veor.u32	q4, q4, q0
	vst1.u32	{q12}, [r1]!
	veor.u32	q4, q4, q1
	vshr.u32	q1, q11, #10
	vshl.u32	q0, q12, #32-17
	veor.u32	q4, q4, q1
	veor.u32	q14, q14, q0
	vadd.u32	q13, q13, q4
	vshr.u32	q0, q12, #19
	vshl.u32	q1, q12, #32-19
	veor.u32	q14, q14, q0
	vst1.u32	{q13}, [r1]!
	veor.u32	q14, q14, q1
	vshr.u32	q1, q12, #10
	
	vshr.u32	q4, q13, #17
	vshl.u32	q0, q13, #32-17
	veor.u32	q14, q14, q1
	veor.u32	q4, q4, q0
	vshr.u32	q0, q13, #19
	vshl.u32	q1, q13, #32-19
	veor.u32	q4, q4, q0
	vst1.u32	{q14}, [r1]!
	veor.u32	q4, q4, q1
	vshr.u32	q1, q13, #10
	vld1.u32	{q15}, [r1]
	veor.u32	q4, q4, q1
	vst1.u32	{q15}, [sp]!
	vadd.u32	q15, q15, q4
	vshr.u32	q4, q14, #17
	vshl.u32	q0, q14, #32-17
	vshl.u32	q1, q14, #32-19
	veor.u32	q4, q4, q0
	vshr.u32	q0, q14, #19
	vst1.u32	{q15}, [r1]!
	veor.u32	q4, q4, q0
	vld1.u32	{q9}, [r1]
	veor.u32	q4, q4, q1
	vshr.u32	q1, q14, #10
	vst1.u32	{q9}, [sp]!
	veor.u32	q5, q4, q1
	
	vshr.u32	q4, q15, #17
	vadd.u32	q9, q9, q5
	vshl.u32	q0, q15, #32-17
	vshl.u32	q1, q15, #32-19
	veor.u32	q4, q4, q0
	vshr.u32	q0, q15, #19
	vst1.u32	{q9}, [r1]!
	veor.u32	q4, q4, q0
	vld1.u32	{q10}, [r1]
	veor.u32	q4, q4, q1
	vshr.u32	q1, q15, #10
	vst1.u32	{q10}, [sp]!
	veor.u32	q4, q4, q1
	vshl.u32	q0, q9, #32-17
	vadd.u32	q10, q10, q4
	vshr.u32	q4, q9, #17
	vshl.u32	q1, q9, #32-19
	veor.u32	q4, q4, q0
	vshr.u32	q0, q9, #19
	veor.u32	q4, q4, q1
	vshr.u32	q1, q9, #10
	veor.u32	q4, q4, q0
	vst1.u32	{q10}, [r1]!
	veor.u32	q5, q4, q1
	
	vshr.u32	q4, q10, #17
	vshl.u32	q0, q10, #32-17
	vadd.u32	q11, q11, q5
	veor.u32	q4, q4, q0
	vshr.u32	q0, q10, #19
	vshl.u32	q1, q10, #32-19
	veor.u32	q4, q4, q0
	vst1.u32	{q11}, [r1]!
	veor.u32	q4, q4, q1
	vshr.u32	q1, q10, #10
	vshl.u32	q0, q11, #32-17
	veor.u32	q2, q4, q1
	vshr.u32	q4, q11, #17
	vadd.u32	q12, q12, q2
	vshl.u32	q1, q11, #32-19
	veor.u32	q4, q4, q0
	vshr.u32	q0, q11, #19
	veor.u32	q4, q4, q1
	vshr.u32	q1, q11, #10
	veor.u32	q4, q4, q0
	vst1.u32	{q12}, [r1]!
	veor.u32	q5, q4, q1
	
	vshr.u32	q4, q12, #17
	vshl.u32	q0, q12, #32-17
	vadd.u32	q13, q13, q5
	veor.u32	q4, q4, q0
	vshr.u32	q0, q12, #19
	vshl.u32	q1, q12, #32-19
	veor.u32	q4, q4, q0
	vst1.u32	{q13}, [r1]!
	veor.u32	q4, q4, q1
	vshr.u32	q1, q12, #10
	vshl.u32	q0, q13, #32-17
	veor.u32	q2, q4, q1
	vshr.u32	q4, q13, #17
	vadd.u32	q14, q14, q2
	vshl.u32	q1, q13, #32-19
	veor.u32	q4, q4, q0
	vshr.u32	q0, q13, #19
	veor.u32	q4, q4, q1
	vshr.u32	q1, q13, #10
	veor.u32	q4, q4, q0
	vst1.u32	{q14}, [r1]!
	veor.u32	q5, q4, q1
	add	r4, r4, #12*16
	
	vshr.u32	q4, q14, #17
	vshl.u32	q0, q14, #32-17
	vadd.u32	q15, q15, q5
	veor.u32	q4, q4, q0
	vshr.u32	q0, q14, #19
	vshl.u32	q1, q14, #32-19
	veor.u32	q4, q4, q0
	vst1.u32	{q15}, [r1]!
	veor.u32	q4, q4, q1
	vshr.u32	q1, q14, #10
	vld1.u32	{q2}, [r1]
	veor.u32	q4, q4, q1
	vshl.u32	q0, q15, #32-17
	vadd.u32	q9, q9, q4
	vst1.u32	{q2}, [sp]!
	vadd.u32	q9, q9, q2
	vshr.u32	q4, q15, #17
	vshr.u32	q2, q15, #19
	veor.u32	q4, q4, q0
	vst1.u32	{q9}, [r1]!
	vshl.u32	q1, q15, #32-19
	veor.u32	q4, q4, q2
	vshr.u32	q0, q15, #10
	veor.u32	q4, q4, q1
	vld1.u32	{q5-q6}, [r4]!
	veor.u32	q4, q4, q0
	vld1.u32	{q2}, [r1]
	vadd.u32	q10, q10, q4
	vst1.u32	{q2}, [sp]!
	vadd.u32	q10, q10, q2
	
	sub	sp, sp, #8*16
	
sha256d_ms_4way_extend_loop2:
	sha256_4way_extend_doubleround_body 16, r4, r1, q11, q12,  q9, q10
	sha256_4way_extend_doubleround_body 18, r4, r1, q13, q14, q11, q12
	sha256_4way_extend_doubleround_body 20, r4, r1, q15,  q9, q13, q14
	sha256_4way_extend_doubleround_body 22, r4, r1, q10, q11, q15,  q9
	sha256_4way_extend_doubleround_body 24, r4, r1, q12, q13, q10, q11
	sha256_4way_extend_doubleround_body 26, r4, r1, q14, q15, q12, q13
	sha256_4way_extend_doubleround_body 28, r4, r1,  q9, q10, q14, q15
	sha256_4way_extend_doubleround_body 30, r4, r1, q11, q12,  q9, q10
	sha256_4way_extend_doubleround_body 32, r4, r1, q13, q14, q11, q12
	sha256_4way_extend_doubleround_body 34, r4, r1, q15,  q9, q13, q14
	sha256_4way_extend_doubleround_body 36, r4, r1, q10, q11, q15,  q9
	sha256_4way_extend_doubleround_body 38, r4, r1, q12, q13, q10, q11
	sha256_4way_extend_doubleround_body 40, r4, r1, q14, q15, q12, q13
	sha256_4way_extend_doubleround_body 42, r4, r1,  q9, q10, q14, q15
	sha256_4way_extend_doubleround_body 44, r4, r1, q11, q12,  q9, q10
	sha256_4way_extend_doubleround_foot 46, r4, r1, q13, q14, q11, q12
	bne	sha256d_ms_4way_extend_coda2
	
	vldmia	r3!, {q4-q7}
	vldmia	r3, {q0-q3}
	vswp	q0, q4
	adr	r3, sha256d_ms_4way_4k+3*16
	sub r1, r1, #(64-3)*16
	b	sha256d_ms_4way_main_loop1
	
	.align 4
sha256d_ms_4way_4k:
	sha256_4k
	
sha256d_ms_4way_main_loop2:
	sha256_4way_main_round  0, r3, r1, q0, q1, q2, q3, q4, q5, q6, q7
	sha256_4way_main_round  1, r3, r1, q3, q0, q1, q2, q7, q4, q5, q6
	sha256_4way_main_round  2, r3, r1, q2, q3, q0, q1, q6, q7, q4, q5
sha256d_ms_4way_main_loop1:
	sha256_4way_main_round  3, r3, r1, q1, q2, q3, q0, q5, q6, q7, q4
	sha256_4way_main_quadround  4, r3, r1
	sha256_4way_main_quadround  8, r3, r1
	sha256_4way_main_quadround 12, r3, r1
	sha256_4way_main_quadround 16, r3, r1
	sha256_4way_main_quadround 20, r3, r1
	sha256_4way_main_quadround 24, r3, r1
	sha256_4way_main_quadround 28, r3, r1
	sha256_4way_main_quadround 32, r3, r1
	sha256_4way_main_quadround 36, r3, r1
	sha256_4way_main_quadround 40, r3, r1
	sha256_4way_main_quadround 44, r3, r1
	sha256_4way_main_quadround 48, r3, r1
	sha256_4way_main_quadround 52, r3, r1
	sha256_4way_main_round 56, r3, r1, q0, q1, q2, q3, q4, q5, q6, q7
	bne	sha256d_ms_4way_finish
	sha256_4way_main_round 57, r3, r1, q3, q0, q1, q2, q7, q4, q5, q6
	sha256_4way_main_round 58, r3, r1, q2, q3, q0, q1, q6, q7, q4, q5
	sha256_4way_main_round 59, r3, r1, q1, q2, q3, q0, q5, q6, q7, q4
	sha256_4way_main_quadround 60, r3, r1
	
	vldmia	r2, {q8-q15}
	vadd.u32	q0, q0, q8
	vadd.u32	q1, q1, q9
	vadd.u32	q2, q2, q10
	vadd.u32	q3, q3, q11
	vadd.u32	q4, q4, q12
	vadd.u32	q5, q5, q13
	vadd.u32	q6, q6, q14
	vadd.u32	q7, q7, q15
	
	vldmia	sp, {q8-q15}
	sub	r1, r1, #(64-18)*16
	vstmia	r1, {q8-q10}
	add	r1, r1, #4*16
	vstmia	r1, {q11-q13}
	add	r1, r1, #8*16
	vstmia	r1, {q14-q15}
	
	vstmia	sp, {q0-q7}
	vmov.u32	q8,  #0x80000000
	vmov.u32	q9,  #0
	vmov.u32	q10, #0
	vmov.u32	q11, #0
	vmov.u32	q12, #0
	vmov.u32	q13, #0
	vmov.u32	q14, #0
	vmov.u32	q15, #0x00000100
	add	r1, sp, #8*16
	vstmia	r1!, {q8-q15}
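	/* adds clears the Z flag (the result is nonzero), steering the
	 * second hashing pass into the extend coda and finish code. */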
	adds	r4, sp, #2*16
	
	vshr.u32	q9, q1, #7
	vshl.u32	q2, q1, #32-7
	vshr.u32	q4, q1, #18
	veor.u32	q9, q9, q2
	vshl.u32	q3, q1, #32-18
	veor.u32	q9, q9, q4
	vshr.u32	q2, q1, #3
	veor.u32	q9, q9, q3
	vld1.u32	{q5}, [r4]!
	veor.u32	q9, q9, q2
	vmov.u32	q7, #0x00a00000
	vadd.u32	q9, q9, q0
	vshr.u32	q10, q5, #7
	vshl.u32	q0, q5, #32-7
	vshl.u32	q3, q5, #32-18
	veor.u32	q10, q10, q0
	vshr.u32	q0, q5, #18
	veor.u32	q10, q10, q3
	vst1.u32	{q9}, [r1]!
	vadd.u32	q3, q1, q7
	veor.u32	q10, q10, q0
	vshr.u32	q0, q5, #3
	vld1.u32	{q6}, [r4]!
	veor.u32	q10, q10, q0
	
	vshr.u32	q4, q9, #17
	vshl.u32	q0, q9, #32-17
	vadd.u32	q10, q10, q3
	veor.u32	q4, q4, q0
	vshr.u32	q0, q9, #19
	vshl.u32	q1, q9, #32-19
	veor.u32	q4, q4, q0
	vshr.u32	q11, q6, #7
	vshl.u32	q0, q6, #32-7
	veor.u32	q4, q4, q1
	veor.u32	q11, q11, q0
	vshr.u32	q1, q9, #10
	vshr.u32	q0, q6, #18
	veor.u32	q4, q4, q1
	veor.u32	q11, q11, q0
	vshl.u32	q1, q6, #32-18
	vshr.u32	q0, q6, #3
	veor.u32	q11, q11, q1
	vadd.u32	q4, q4, q5
	veor.u32	q11, q11, q0
	vld1.u32	{q5}, [r4]!
	vadd.u32	q11, q11, q4
	vshr.u32	q4, q10, #17
	vshl.u32	q0, q10, #32-17
	vst1.u32	{q10}, [r1]!
	veor.u32	q4, q4, q0
	vshr.u32	q0, q10, #19
	vshl.u32	q1, q10, #32-19
	veor.u32	q4, q4, q0
	vshr.u32	q12, q5, #7
	veor.u32	q4, q4, q1
	vshl.u32	q0, q5, #32-7
	vshr.u32	q1, q10, #10
	veor.u32	q12, q12, q0
	vshr.u32	q0, q5, #18
	veor.u32	q4, q4, q1
	veor.u32	q12, q12, q0
	vshl.u32	q1, q5, #32-18
	vst1.u32	{q11}, [r1]!
	veor.u32	q12, q12, q1
	vshr.u32	q0, q5, #3
	vadd.u32	q1, q6, q4
	veor.u32	q12, q12, q0
	
	vshr.u32	q4, q11, #17
	vshl.u32	q0, q11, #32-17
	vadd.u32	q12, q12, q1
	vld1.u32	{q6}, [r4]!
	veor.u32	q4, q4, q0
	vshr.u32	q0, q11, #19
	vshl.u32	q1, q11, #32-19
	veor.u32	q4, q4, q0
	vshr.u32	q13, q6, #7
	vshl.u32	q0, q6, #32-7
	veor.u32	q4, q4, q1
	veor.u32	q13, q13, q0
	vshr.u32	q1, q11, #10
	vshr.u32	q0, q6, #18
	veor.u32	q4, q4, q1
	veor.u32	q13, q13, q0
	vshl.u32	q1, q6, #32-18
	vshr.u32	q0, q6, #3
	veor.u32	q13, q13, q1
	vadd.u32	q4, q4, q5
	veor.u32	q13, q13, q0
	vld1.u32	{q5}, [r4]!
	vadd.u32	q13, q13, q4
	vshr.u32	q4, q12, #17
	vshl.u32	q0, q12, #32-17
	vst1.u32	{q12}, [r1]!
	veor.u32	q4, q4, q0
	vshr.u32	q0, q12, #19
	vshl.u32	q1, q12, #32-19
	veor.u32	q4, q4, q0
	vshr.u32	q14, q5, #7
	veor.u32	q4, q4, q1
	vshl.u32	q0, q5, #32-7
	vshr.u32	q1, q12, #10
	veor.u32	q14, q14, q0
	vshr.u32	q0, q5, #18
	veor.u32	q4, q4, q1
	veor.u32	q14, q14, q0
	vshl.u32	q1, q5, #32-18
	vst1.u32	{q13}, [r1]!
	veor.u32	q14, q14, q1
	vshr.u32	q0, q5, #3
	vadd.u32	q1, q6, q4
	veor.u32	q14, q14, q0
	
	vshr.u32	q4, q13, #17
	vshl.u32	q0, q13, #32-17
	vadd.u32	q14, q14, q1
	vld1.u32	{q6}, [r4]!
	vadd.u32	q5, q5, q15
	veor.u32	q4, q4, q0
	vshr.u32	q0, q13, #19
	vshl.u32	q1, q13, #32-19
	veor.u32	q4, q4, q0
	vshr.u32	q15, q6, #7
	vshl.u32	q0, q6, #32-7
	veor.u32	q4, q4, q1
	veor.u32	q15, q15, q0
	vshr.u32	q1, q13, #10
	vshr.u32	q0, q6, #18
	veor.u32	q4, q4, q1
	veor.u32	q15, q15, q0
	vshl.u32	q1, q6, #32-18
	vshr.u32	q0, q6, #3
	veor.u32	q15, q15, q1
	vadd.u32	q4, q4, q5
	veor.u32	q15, q15, q0
	vmov.u32	q5, #0x80000000
	vadd.u32	q15, q15, q4
	vshr.u32	q4, q14, #17
	vshl.u32	q0, q14, #32-17
	vadd.u32	q6, q6, q9
	vst1.u32	{q14}, [r1]!
	vmov.u32	q7, #0x11000000
	veor.u32	q4, q4, q0
	vshr.u32	q0, q14, #19
	vshl.u32	q1, q14, #32-19
	vadd.u32	q6, q6, q7
	vmov.u32	q2, #0x00002000
	veor.u32	q4, q4, q0
	vst1.u32	{q15}, [r1]!
	veor.u32	q4, q4, q1
	vshr.u32	q1, q14, #10
	vadd.u32	q6, q6, q2
	veor.u32	q1, q4, q1
	add	r4, r4, #8*16
	
	vshr.u32	q4, q15, #17
	vshl.u32	q0, q15, #32-17
	vadd.u32	q9, q6, q1
	veor.u32	q4, q4, q0
	vshr.u32	q0, q15, #19
	vshl.u32	q1, q15, #32-19
	veor.u32	q4, q4, q0
	vst1.u32	{q9}, [r1]!
	vadd.u32	q5, q5, q10
	veor.u32	q4, q4, q1
	vshr.u32	q1, q15, #10
	vshl.u32	q0, q9, #32-17
	veor.u32	q10, q4, q1
	vshr.u32	q4, q9, #17
	vadd.u32	q10, q10, q5
	veor.u32	q4, q4, q0
	vshr.u32	q0, q9, #19
	vshl.u32	q1, q9, #32-19
	veor.u32	q4, q4, q0
	vshr.u32	q0, q9, #10
	veor.u32	q4, q4, q1
	vst1.u32	{q10}, [r1]!
	veor.u32	q1, q4, q0
	
	vshr.u32	q4, q10, #17
	vshl.u32	q0, q10, #32-17
	vadd.u32	q11, q11, q1
	veor.u32	q4, q4, q0
	vshr.u32	q0, q10, #19
	vshl.u32	q1, q10, #32-19
	veor.u32	q4, q4, q0
	vst1.u32	{q11}, [r1]!
	veor.u32	q4, q4, q1
	vshr.u32	q1, q10, #10
	vshl.u32	q0, q11, #32-17
	veor.u32	q1, q4, q1
	vshr.u32	q4, q11, #17
	vadd.u32	q12, q12, q1
	veor.u32	q4, q4, q0
	vshr.u32	q0, q11, #19
	vshl.u32	q1, q11, #32-19
	veor.u32	q4, q4, q0
	vshr.u32	q0, q11, #10
	veor.u32	q4, q4, q1
	vst1.u32	{q12}, [r1]!
	veor.u32	q1, q4, q0
	
	vshr.u32	q4, q12, #17
	vshl.u32	q0, q12, #32-17
	vadd.u32	q13, q13, q1
	veor.u32	q4, q4, q0
	vshr.u32	q0, q12, #19
	vshl.u32	q1, q12, #32-19
	veor.u32	q4, q4, q0
	vst1.u32	{q13}, [r1]!
	veor.u32	q4, q4, q1
	vshr.u32	q1, q12, #10
	vshl.u32	q0, q13, #32-17
	veor.u32	q1, q4, q1
	vshr.u32	q4, q13, #17
	vadd.u32	q14, q14, q1
	veor.u32	q4, q4, q0
	vshr.u32	q0, q13, #19
	vshl.u32	q1, q13, #32-19
	veor.u32	q4, q4, q0
	vshr.u32	q0, q13, #10
	veor.u32	q4, q4, q1
	vst1.u32	{q14}, [r1]!
	veor.u32	q4, q4, q0
	vmov.u32	q6, #0x00000100
	vadd.u32	q15, q15, q4
	
	vshr.u32	q4, q14, #17
	vshl.u32	q0, q14, #32-17
	vmov.u32	q7, #0x00400000
	vst1.u32	{q15}, [r1]!
	veor.u32	q4, q4, q0
	vshr.u32	q0, q14, #19
	vshl.u32	q1, q14, #32-19
	veor.u32	q4, q4, q0
	vadd.u32	q9, q9, q7
	veor.u32	q4, q4, q1
	vshr.u32	q1, q14, #10
	vmov.u32	q2, #0x00000022
	veor.u32	q4, q4, q1
	vadd.u32	q9, q9, q2
	vld1.u32	{q5}, [r4]!
	vadd.u32	q9, q9, q4
	vshr.u32	q4, q15, #17
	vshl.u32	q0, q15, #32-17
	vadd.u32	q6, q6, q10
	vst1.u32	{q9}, [r1]!
	veor.u32	q4, q4, q0
	vshr.u32	q0, q15, #19
	vshl.u32	q1, q15, #32-19
	veor.u32	q4, q4, q0
	vshr.u32	q10, q5, #7
	veor.u32	q4, q4, q1
	vshl.u32	q0, q5, #32-7
	vshr.u32	q1, q15, #10
	veor.u32	q10, q10, q0
	vshr.u32	q0, q5, #18
	veor.u32	q4, q4, q1
	veor.u32	q10, q10, q0
	vshl.u32	q1, q5, #32-18
	vshr.u32	q0, q5, #3
	veor.u32	q10, q10, q1
	vadd.u32	q1, q6, q4
	veor.u32	q10, q10, q0
	vld1.u32	{q6}, [r4]!
	vadd.u32	q10, q10, q1
	
	b	sha256d_ms_4way_extend_loop2
	
	.align 4
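/* SHA-256 initial state H0..H7, replicated across the four lanes. */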
sha256d_ms_4way_4h:
	.long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
	.long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
	.long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
	.long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
	.long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
	.long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
	.long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
	.long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19
	
sha256d_ms_4way_extend_coda2:
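	/* Second hash: reload the replicated initial state and rejoin the main loop. */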
	adr	r4, sha256d_ms_4way_4h
	mov	r1, sp
	vldmia	r4, {q0-q7}
	vmov.u32	q15, q7
	sub	r3, r3, #64*16
	b	sha256d_ms_4way_main_loop2

.macro sha256_4way_main_round_red i, rk, rw, rd, re, rf, rg, rh
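	/* Reduced round: only h += k[i] + w[i] + d + Ch(e,f,g) + Sigma1(e) is
	 * computed; the a/b/c half is skipped since later rounds never use it. */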
	vld1.u32	{q8}, [\rw]!
	vand.u32	q9, \rf, \re
	vbic.u32	q10, \rg, \re
	vshr.u32	q11, \re, #5
	vorr.u32	q10, q10, q9
	vshl.u32	q12, \re, #32-5
	vadd.u32	\rh, \rh, q10
	veor.u32	q10, \re, q11
	vshr.u32	q11, \re, #19
	veor.u32	q10, q10, q12
	vshl.u32	q12, \re, #32-19
	veor.u32	q10, q10, q11
	vadd.u32	\rh, \rh, q8
	veor.u32	q10, q10, q12
	vld1.u32	{q9}, [\rk]!
	vadd.u32	\rh, \rh, \rd
	vshr.u32	q11, q10, #6
	vadd.u32	\rh, \rh, q9
	vshl.u32	q13, q10, #32-6
	vadd.u32	\rh, \rh, q11
	vadd.u32	\rh, \rh, q13
.endm

sha256d_ms_4way_finish:
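	/* Rounds 57-60 in reduced form: they feed only the final addition that
	 * yields the last output word, hash[7]. */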
	sha256_4way_main_round_red 57, r3, r1, q2, q7, q4, q5, q6
	sha256_4way_main_round_red 58, r3, r1, q1, q6, q7, q4, q5
	sha256_4way_main_round_red 59, r3, r1, q0, q5, q6, q7, q4
	sha256_4way_main_round_red 60, r3, r1, q3, q4, q5, q6, q7
	
	vadd.u32	q7, q7, q15
	add	r0, r0, #7*16
	vst1.u32	{q7}, [r0]
	
	mov	sp, r12
	vpop	{q4-q7}
	ldmfd	sp!, {r4, pc}


	.text
	.code 32
	.align 2
	.globl sha256_use_4way
	.globl _sha256_use_4way
#ifdef __ELF__
	.type sha256_use_4way, %function
#endif
sha256_use_4way:
_sha256_use_4way:
	mov	r0, #1
	bx	lr

#endif /* __ARM_NEON__ */

#endif
07070100000029000081A4000003E800000064000000015EF4BCA10000B174000000000000000000000000000000000000001A00000000cpuminer-2.5.1/sha2-ppc.S
/*
 * Copyright 2014-2015 pooler@litecoinpool.org
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.  See COPYING for more details.
 */

#include "cpuminer-config.h"

#if defined(USE_ASM) && (defined(__powerpc__) || defined(__ppc__) || defined(__PPC__))

#ifdef __APPLE__

#define HI(name) ha16(name)
#define LO(name) lo16(name)

#else

#define HI(name) name@ha
#define LO(name) name@l

#define r0 0
#define r1 1
#define r2 2
#define r3 3
#define r4 4
#define r5 5
#define r6 6
#define r7 7
#define r8 8
#define r9 9
#define r10 10
#define r11 11
#define r12 12
#define r13 13
#define r14 14
#define r15 15
#define r16 16
#define r17 17
#define r18 18
#define r19 19
#define r20 20
#define r21 21
#define r22 22
#define r23 23
#define r24 24
#define r25 25
#define r26 26
#define r27 27
#define r28 28
#define r29 29
#define r30 30
#define r31 31

#ifdef __ALTIVEC__
#define v0 0
#define v1 1
#define v2 2
#define v3 3
#define v4 4
#define v5 5
#define v6 6
#define v7 7
#define v8 8
#define v9 9
#define v10 10
#define v11 11
#define v12 12
#define v13 13
#define v14 14
#define v15 15
#define v16 16
#define v17 17
#define v18 18
#define v19 19
#define v20 20
#define v21 21
#define v22 22
#define v23 23
#define v24 24
#define v25 25
#define v26 26
#define v27 27
#define v28 28
#define v29 29
#define v30 30
#define v31 31
#endif

#endif

#if !(defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) || \
	defined(__64BIT__) || defined(_LP64) || defined(__LP64__))
#define ld lwz
#define std stw
#define stdu stwu
#define stdux stwux
#endif


#ifdef _AIX
	.csect .text[RO]
#else
	.data
#endif
	.align 2
sha256_h:
	.long 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a
	.long 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19

	.align 2
sha256_k:
	.long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2

#ifdef _AIX
	.toc
T.sha256_h:
	.tc sha256_h[TC], sha256_h
T.sha256_k:
	.tc sha256_k[TC], sha256_k
#endif


.macro sha256_extend_doubleround i, rw, wo, ra, rb, ry, rz
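	/* Two message-schedule steps, in C (standard SHA-256 schedule):
	 *   w[i+16] = w[i]   + s0(w[i+1]) + w[i+9]  + s1(w[i+14])
	 *   w[i+17] = w[i+1] + s0(w[i+2]) + w[i+10] + s1(w[i+15])
	 * with s0(x) = ROTR(x,7) ^ ROTR(x,18) ^ (x >> 3)
	 *      s1(x) = ROTR(x,17) ^ ROTR(x,19) ^ (x >> 10)
	 * \ry/\rz carry w[i+14]/w[i+15]; r11 enters holding w[i] and
	 * \ra/\rb enter holding w[i+9]/w[i+10]. */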
	lwz	r14, \wo+(\i+1)*4(\rw)
	rotrwi	r12, \ry, 17
	rotrwi	r13, \ry, 19
	add	r11, r11, \ra
	xor	r12, r12, r13
	srwi	r13, \ry, 10
	rotrwi	\ra, r14, 7
	xor	r12, r12, r13
	rotrwi	r13, r14, 18
	add	r12, r12, r11
	xor	\ra, \ra, r13
	srwi	r13, r14, 3
	lwz	r11, \wo+(\i+2)*4(\rw)
	xor	\ra, \ra, r13
	rotrwi	r13, \rz, 19
	add	\ra, \ra, r12

	rotrwi	r12, \rz, 17
	add	r14, r14, \rb
	xor	r12, r12, r13
	srwi	r13, \rz, 10
	rotrwi	\rb, r11, 7
	xor	r12, r12, r13
	rotrwi	r13, r11, 18
	stw	\ra, \wo+(\i+16)*4(\rw)
	xor	\rb, \rb, r13
	srwi	r13, r11, 3
	add	r14, r14, r12
	xor	\rb, \rb, r13
	add	\rb, \rb, r14
	stw	\rb, \wo+(\i+17)*4(\rw)
.endm


.macro sha256_main_round i, rk, rw, wo, ra, rb, rc, rd, re, rf, rg, rh
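	/* One compression round: t1 = h + Sigma1(e) + Ch(e,f,g) + k[i] + w[i],
	 * t2 = Sigma0(a) + Maj(a,b,c); on exit \rh = d + t1 (the new e) and
	 * \rd = t1 + t2 (the new a).  Sigma1 is factored as
	 * ROTR(e ^ ROTR(e,5) ^ ROTR(e,19), 6) = ROTR6 ^ ROTR11 ^ ROTR25,
	 * and Sigma0 as ROTR(a ^ ROTR(a,11) ^ ROTR(a,20), 2). */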
	lwz	r12, \wo+(\i)*4(\rw)
	and	r13, \rf, \re
	andc	r14, \rg, \re
	lwz	r15, (\i)*4(\rk)
	or	r14, r14, r13
	rotrwi	r13, \re, 5
	add	\rh, \rh, r14
	xor	r14, \re, r13
	rotrwi	r13, \re, 19
	add	\rh, \rh, r12
	xor	r14, r14, r13
	add	\rh, \rh, r15
	rotrwi	r13, r14, 6
	xor	r15, \ra, \rb
	add	\rh, \rh, r13

	rotrwi	r13, \ra, 11
	and	r15, r15, \rc
	xor	r12, \ra, r13
	rotrwi	r13, \ra, 20
	and	r14, \ra, \rb
	xor	r12, r12, r13
	xor	r14, r14, r15
	rotrwi	r13, r12, 2
	add	r15, \rh, r14
	add	\rh, \rh, \rd
	add	\rd, r15, r13
.endm

.macro sha256_main_quadround i, rk, rw, wo
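	/* Four consecutive rounds; the register assignments rotate so the
	 * working variables never need to be moved. */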
	sha256_main_round \i+0, \rk, \rw, \wo, r4, r5, r6, r7, r8, r9, r10, r11
	sha256_main_round \i+1, \rk, \rw, \wo, r7, r4, r5, r6, r11, r8, r9, r10
	sha256_main_round \i+2, \rk, \rw, \wo, r6, r7, r4, r5, r10, r11, r8, r9
	sha256_main_round \i+3, \rk, \rw, \wo, r5, r6, r7, r4, r9, r10, r11, r8
.endm


#ifdef _AIX
	.csect .text[PR]
#else
	.text
#endif
	.align 2
	.globl sha256_transform
	.globl _sha256_transform
	.globl .sha256_transform
#ifdef __ELF__
	.type sha256_transform, %function
#endif
sha256_transform:
_sha256_transform:
.sha256_transform:
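	/* r3 = state (8 words), r4 = 64-byte block, r5 = nonzero to byte-swap input. */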
	stdu	r1, -76*4(r1)
	cmpwi	0, r5, 0
	std	r13, 2*4(r1)
	std	r14, 4*4(r1)
	std	r15, 6*4(r1)
	std	r16, 72*4(r1)
	
	bne	0, sha256_transform_swap
	
	lwz	r11, 0*4(r4)
	lwz	r14, 1*4(r4)
	lwz	r15, 2*4(r4)
	lwz	r7, 3*4(r4)
	lwz	r8, 4*4(r4)
	lwz	r9, 5*4(r4)
	lwz	r10, 6*4(r4)
	lwz	r0, 7*4(r4)
	lwz	r12, 8*4(r4)
	lwz	r13, 9*4(r4)
	lwz	r5, 10*4(r4)
	lwz	r6, 11*4(r4)
	stw	r11, 8*4+0*4(r1)
	stw	r14, 8*4+1*4(r1)
	stw	r15, 8*4+2*4(r1)
	stw	r7, 8*4+3*4(r1)
	stw	r8, 8*4+4*4(r1)
	stw	r9, 8*4+5*4(r1)
	stw	r10, 8*4+6*4(r1)
	stw	r0, 8*4+7*4(r1)
	stw	r12, 8*4+8*4(r1)
	stw	r13, 8*4+9*4(r1)
	stw	r5, 8*4+10*4(r1)
	stw	r6, 8*4+11*4(r1)
	lwz	r7, 12*4(r4)
	lwz	r8, 13*4(r4)
	lwz	r9, 14*4(r4)
	lwz	r10, 15*4(r4)
	mr	r4, r13
	stw	r7, 8*4+12*4(r1)
	stw	r8, 8*4+13*4(r1)
	stw	r9, 8*4+14*4(r1)
	stw	r10, 8*4+15*4(r1)
	b	sha256_transform_extend
	
sha256_transform_swap:
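	/* Byte-swapping load path: lwbrx fetches each input word byte-reversed. */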
	li	r13, 1*4
	li	r14, 2*4
	li	r15, 3*4
	lwbrx	r11, 0, r4
	lwbrx	r7, r4, r13
	lwbrx	r8, r4, r14
	lwbrx	r9, r4, r15
	addi	r4, r4, 4*4
	stw	r11, 8*4+0*4(r1)
	stw	r7, 8*4+1*4(r1)
	stw	r8, 8*4+2*4(r1)
	stw	r9, 8*4+3*4(r1)
	lwbrx	r7, 0, r4
	lwbrx	r8, r4, r13
	lwbrx	r9, r4, r14
	lwbrx	r10, r4, r15
	addi	r4, r4, 4*4
	stw	r7, 8*4+4*4(r1)
	stw	r8, 8*4+5*4(r1)
	stw	r9, 8*4+6*4(r1)
	stw	r10, 8*4+7*4(r1)
	lwbrx	r8, 0, r4
	lwbrx	r12, r4, r13
	lwbrx	r5, r4, r14
	lwbrx	r6, r4, r15
	addi	r4, r4, 4*4
	stw	r8, 8*4+8*4(r1)
	stw	r12, 8*4+9*4(r1)
	stw	r5, 8*4+10*4(r1)
	stw	r6, 8*4+11*4(r1)
	lwbrx	r7, 0, r4
	lwbrx	r8, r4, r13
	lwbrx	r9, r4, r14
	lwbrx	r10, r4, r15
	mr	r4, r12
	stw	r7, 8*4+12*4(r1)
	stw	r8, 8*4+13*4(r1)
	stw	r9, 8*4+14*4(r1)
	stw	r10, 8*4+15*4(r1)
	
sha256_transform_extend:
	sha256_extend_doubleround  0, r1, 8*4, r4, r5, r9, r10
	sha256_extend_doubleround  2, r1, 8*4, r6, r7, r4, r5
	sha256_extend_doubleround  4, r1, 8*4, r8, r9, r6, r7
	sha256_extend_doubleround  6, r1, 8*4, r10, r4, r8, r9
	sha256_extend_doubleround  8, r1, 8*4, r5, r6, r10, r4
	sha256_extend_doubleround 10, r1, 8*4, r7, r8, r5, r6
	sha256_extend_doubleround 12, r1, 8*4, r9, r10, r7, r8
	sha256_extend_doubleround 14, r1, 8*4, r4, r5, r9, r10
	sha256_extend_doubleround 16, r1, 8*4, r6, r7, r4, r5
	sha256_extend_doubleround 18, r1, 8*4, r8, r9, r6, r7
	sha256_extend_doubleround 20, r1, 8*4, r10, r4, r8, r9
	sha256_extend_doubleround 22, r1, 8*4, r5, r6, r10, r4
	sha256_extend_doubleround 24, r1, 8*4, r7, r8, r5, r6
	sha256_extend_doubleround 26, r1, 8*4, r9, r10, r7, r8
	sha256_extend_doubleround 28, r1, 8*4, r4, r5, r9, r10
	sha256_extend_doubleround 30, r1, 8*4, r6, r7, r4, r5
	sha256_extend_doubleround 32, r1, 8*4, r8, r9, r6, r7
	sha256_extend_doubleround 34, r1, 8*4, r10, r4, r8, r9
	sha256_extend_doubleround 36, r1, 8*4, r5, r6, r10, r4
	sha256_extend_doubleround 38, r1, 8*4, r7, r8, r5, r6
	sha256_extend_doubleround 40, r1, 8*4, r9, r10, r7, r8
	sha256_extend_doubleround 42, r1, 8*4, r4, r5, r9, r10
	sha256_extend_doubleround 44, r1, 8*4, r6, r7, r4, r5
	sha256_extend_doubleround 46, r1, 8*4, r8, r9, r6, r7
	
	lwz	r4, 0*4(r3)
	lwz	r5, 1*4(r3)
	lwz	r6, 2*4(r3)
	lwz	r7, 3*4(r3)
	lwz	r8, 4*4(r3)
	lwz	r9, 5*4(r3)
	lwz	r10, 6*4(r3)
	lwz	r11, 7*4(r3)
#ifdef _AIX
	ld	r16, T.sha256_k(r2)
#else
	lis	r16, HI(sha256_k)
	addi	r16, r16, LO(sha256_k)
#endif
	sha256_main_quadround  0, r16, r1, 8*4
	sha256_main_quadround  4, r16, r1, 8*4
	sha256_main_quadround  8, r16, r1, 8*4
	sha256_main_quadround 12, r16, r1, 8*4
	sha256_main_quadround 16, r16, r1, 8*4
	sha256_main_quadround 20, r16, r1, 8*4
	sha256_main_quadround 24, r16, r1, 8*4
	sha256_main_quadround 28, r16, r1, 8*4
	sha256_main_quadround 32, r16, r1, 8*4
	sha256_main_quadround 36, r16, r1, 8*4
	sha256_main_quadround 40, r16, r1, 8*4
	sha256_main_quadround 44, r16, r1, 8*4
	sha256_main_quadround 48, r16, r1, 8*4
	sha256_main_quadround 52, r16, r1, 8*4
	sha256_main_quadround 56, r16, r1, 8*4
	sha256_main_quadround 60, r16, r1, 8*4
	
	lwz	r12, 0*4(r3)
	lwz	r13, 1*4(r3)
	lwz	r14, 2*4(r3)
	lwz	r15, 3*4(r3)
	add	r4, r4, r12
	add	r5, r5, r13
	add	r6, r6, r14
	add	r7, r7, r15
	stw	r4, 0*4(r3)
	stw	r5, 1*4(r3)
	stw	r6, 2*4(r3)
	stw	r7, 3*4(r3)
	lwz	r12, 4*4(r3)
	lwz	r13, 5*4(r3)
	lwz	r14, 6*4(r3)
	lwz	r15, 7*4(r3)
	add	r8, r8, r12
	add	r9, r9, r13
	add	r10, r10, r14
	add	r11, r11, r15
	stw	r8, 4*4(r3)
	stw	r9, 5*4(r3)
	stw	r10, 6*4(r3)
	stw	r11, 7*4(r3)
	
	ld	r13, 2*4(r1)
	ld	r14, 4*4(r1)
	ld	r15, 6*4(r1)
	ld	r16, 72*4(r1)
	addi	r1, r1, 76*4
	blr


	.align 2
	.globl sha256d_ms
	.globl _sha256d_ms
	.globl .sha256d_ms
#ifdef __ELF__
	.type sha256d_ms, %function
#endif
sha256d_ms:
_sha256d_ms:
.sha256d_ms:
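	/* Specialized double SHA-256 for mining; argument roles as in cpuminer's
	 * C prototype: r3 = output hash, r4 = 64-entry W array (partially
	 * precomputed), r5 = midstate, r6 = prehash.  Only hash word 7 is
	 * fully produced. */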
	stdu	r1, -80*4(r1)
	std	r13, 2*4(r1)
	std	r14, 4*4(r1)
	std	r15, 6*4(r1)
	std	r16, 72*4(r1)
	std	r17, 74*4(r1)
	std	r18, 76*4(r1)
	
	mr	r17, r4
	mr	r18, r5
	mr	r16, r6
	
	lwz	r14, 3*4(r17)
	lwz	r6, 18*4(r17)
	lwz	r7, 19*4(r17)
	
	rotrwi	r12, r14, 7
	rotrwi	r13, r14, 18
	stw	r6, 8*4+18*4(r1)
	xor	r12, r12, r13
	srwi	r13, r14, 3
	stw	r7, 8*4+19*4(r1)
	xor	r12, r12, r13
	lwz	r8, 20*4(r17)
	add	r6, r6, r12
	lwz	r10, 22*4(r17)
	add	r7, r7, r14
	stw	r6, 18*4(r17)
	
	rotrwi	r12, r6, 17
	rotrwi	r13, r6, 19
	stw	r7, 19*4(r17)
	xor	r12, r12, r13
	srwi	r13, r6, 10
	stw	r8, 8*4+20*4(r1)
	xor	r12, r12, r13
	lwz	r4, 23*4(r17)
	add	r8, r8, r12
	lwz	r5, 24*4(r17)
	
	rotrwi	r9, r7, 17
	rotrwi	r13, r7, 19
	stw	r8, 20*4(r17)
	xor	r9, r9, r13
	srwi	r13, r7, 10
	stw	r10, 8*4+21*4(r1)
	xor	r9, r9, r13
	stw	r4, 8*4+22*4(r1)
	
	rotrwi	r12, r8, 17
	rotrwi	r13, r8, 19
	stw	r9, 21*4(r17)
	xor	r12, r12, r13
	srwi	r13, r8, 10
	stw	r5, 8*4+23*4(r1)
	xor	r12, r12, r13
	rotrwi	r14, r9, 17
	rotrwi	r13, r9, 19
	add	r10, r10, r12
	lwz	r11, 30*4(r17)
	
	xor	r14, r14, r13
	srwi	r13, r9, 10
	stw	r10, 22*4(r17)
	xor	r14, r14, r13
	stw	r11, 8*4+24*4(r1)
	add	r4, r4, r14
	
	rotrwi	r12, r10, 17
	rotrwi	r13, r10, 19
	stw	r4, 23*4(r17)
	xor	r12, r12, r13
	srwi	r13, r10, 10
	rotrwi	r14, r4, 17
	xor	r12, r12, r13
	rotrwi	r13, r4, 19
	xor	r14, r14, r13
	srwi	r13, r4, 10
	add	r5, r5, r12
	xor	r14, r14, r13
	stw	r5, 24*4(r17)
	add	r6, r6, r14
	
	rotrwi	r12, r5, 17
	rotrwi	r13, r5, 19
	stw	r6, 25*4(r17)
	xor	r12, r12, r13
	srwi	r13, r5, 10
	rotrwi	r14, r6, 17
	xor	r12, r12, r13
	rotrwi	r13, r6, 19
	xor	r14, r14, r13
	srwi	r13, r6, 10
	add	r7, r7, r12
	xor	r14, r14, r13
	stw	r7, 26*4(r17)
	add	r8, r8, r14
	
	rotrwi	r12, r7, 17
	rotrwi	r13, r7, 19
	stw	r8, 27*4(r17)
	xor	r12, r12, r13
	srwi	r13, r7, 10
	rotrwi	r14, r8, 17
	xor	r12, r12, r13
	rotrwi	r13, r8, 19
	xor	r14, r14, r13
	srwi	r13, r8, 10
	add	r9, r9, r12
	xor	r14, r14, r13
	stw	r9, 28*4(r17)
	add	r10, r10, r14
	
	lwz	r14, 31*4(r17)
	rotrwi	r12, r9, 17
	rotrwi	r13, r9, 19
	stw	r10, 29*4(r17)
	xor	r12, r12, r13
	srwi	r13, r9, 10
	stw	r14, 8*4+25*4(r1)
	xor	r12, r12, r13
	add	r11, r11, r12
	add	r5, r5, r14
	rotrwi	r12, r10, 17
	rotrwi	r13, r10, 19
	add	r4, r4, r11
	
	lwz	r11, 16*4(r17)
	xor	r12, r12, r13
	srwi	r13, r10, 10
	stw	r4, 30*4(r17)
	xor	r12, r12, r13
	add	r5, r5, r12
	stw	r5, 31*4(r17)
	
	sha256_extend_doubleround 16, r17, 0, r6, r7, r4, r5
	sha256_extend_doubleround 18, r17, 0, r8, r9, r6, r7
	sha256_extend_doubleround 20, r17, 0, r10, r4, r8, r9
	sha256_extend_doubleround 22, r17, 0, r5, r6, r10, r4
	sha256_extend_doubleround 24, r17, 0, r7, r8, r5, r6
	sha256_extend_doubleround 26, r17, 0, r9, r10, r7, r8
	sha256_extend_doubleround 28, r17, 0, r4, r5, r9, r10
	sha256_extend_doubleround 30, r17, 0, r6, r7, r4, r5
	sha256_extend_doubleround 32, r17, 0, r8, r9, r6, r7
	sha256_extend_doubleround 34, r17, 0, r10, r4, r8, r9
	sha256_extend_doubleround 36, r17, 0, r5, r6, r10, r4
	sha256_extend_doubleround 38, r17, 0, r7, r8, r5, r6
	sha256_extend_doubleround 40, r17, 0, r9, r10, r7, r8
	sha256_extend_doubleround 42, r17, 0, r4, r5, r9, r10
	sha256_extend_doubleround 44, r17, 0, r6, r7, r4, r5
	sha256_extend_doubleround 46, r17, 0, r8, r9, r6, r7
	
	lwz	r4,  0*4(r16)
	lwz	r9,  1*4(r16)
	lwz	r10, 2*4(r16)
	lwz	r11, 3*4(r16)
	lwz	r8,  4*4(r16)
	lwz	r5,  5*4(r16)
	lwz	r6,  6*4(r16)
	lwz	r7,  7*4(r16)
#ifdef _AIX
	ld	r16, T.sha256_k(r2)
#else
	lis	r16, HI(sha256_k)
	addi	r16, r16, LO(sha256_k)
#endif
	
	sha256_main_round  3, r16, r17, 0, r5, r6, r7, r4, r9, r10, r11, r8
	sha256_main_quadround  4, r16, r17, 0
	sha256_main_quadround  8, r16, r17, 0
	sha256_main_quadround 12, r16, r17, 0
	sha256_main_quadround 16, r16, r17, 0
	sha256_main_quadround 20, r16, r17, 0
	sha256_main_quadround 24, r16, r17, 0
	sha256_main_quadround 28, r16, r17, 0
	sha256_main_quadround 32, r16, r17, 0
	sha256_main_quadround 36, r16, r17, 0
	sha256_main_quadround 40, r16, r17, 0
	sha256_main_quadround 44, r16, r17, 0
	sha256_main_quadround 48, r16, r17, 0
	sha256_main_quadround 52, r16, r17, 0
	sha256_main_quadround 56, r16, r17, 0
	sha256_main_quadround 60, r16, r17, 0
	
	lwz	r12, 0*4(r18)
	lwz	r13, 1*4(r18)
	lwz	r14, 2*4(r18)
	lwz	r15, 3*4(r18)
	add	r4, r4, r12
	add	r5, r5, r13
	add	r6, r6, r14
	add	r7, r7, r15
	stw	r4, 8*4+0*4(r1)
	stw	r5, 8*4+1*4(r1)
	stw	r6, 8*4+2*4(r1)
	stw	r7, 8*4+3*4(r1)
	lwz	r12, 4*4(r18)
	lwz	r13, 5*4(r18)
	lwz	r14, 6*4(r18)
	lwz	r15, 7*4(r18)
	add	r8, r8, r12
	add	r9, r9, r13
	add	r10, r10, r14
	add	r11, r11, r15
	stw	r8, 8*4+4*4(r1)
	stw	r9, 8*4+5*4(r1)
	stw	r10, 8*4+6*4(r1)
	stw	r11, 8*4+7*4(r1)

	lwz	r4, 8*4+18*4(r1)
	lwz	r5, 8*4+19*4(r1)
	lwz	r6, 8*4+20*4(r1)
	lwz	r7, 8*4+21*4(r1)
	lwz	r8, 8*4+22*4(r1)
	lwz	r9, 8*4+23*4(r1)
	lwz	r10, 8*4+24*4(r1)
	lwz	r11, 8*4+25*4(r1)
	stw	r4,  18*4(r17)
	stw	r5,  19*4(r17)
	stw	r6,  20*4(r17)
	stw	r7,  22*4(r17)
	stw	r8,  23*4(r17)
	stw	r9,  24*4(r17)
	stw	r10, 30*4(r17)
	stw	r11, 31*4(r17)
	
	lis	r8, 0x8000
	li	r9,  0
	li	r10, 0x0100
	
	lwz	r14, 8*4+1*4(r1)
	lwz	r4, 8*4+0*4(r1)
	
	lwz	r11, 8*4+2*4(r1)
	rotrwi	r12, r14, 7
	rotrwi	r13, r14, 18
	
	stw	r8, 8*4+8*4(r1)
	stw	r9, 8*4+9*4(r1)
	stw	r9, 8*4+10*4(r1)
	stw	r9, 8*4+11*4(r1)
	stw	r9, 8*4+12*4(r1)
	stw	r9, 8*4+13*4(r1)
	stw	r9, 8*4+14*4(r1)
	stw	r10, 8*4+15*4(r1)
	
	xor	r12, r12, r13
	srwi	r13, r14, 3
	addis	r5, r14, 0x00a0
	xor	r12, r12, r13
	rotrwi	r14, r11, 7
	rotrwi	r13, r11, 18
	add	r4, r4, r12
	xor	r14, r14, r13
	srwi	r13, r11, 3
	stw	r4, 8*4+16*4(r1)
	xor	r14, r14, r13
	rotrwi	r12, r4, 17
	rotrwi	r13, r4, 19
	add	r5, r5, r14
	lwz	r14, 8*4+3*4(r1)
	
	stw	r5, 8*4+17*4(r1)
	xor	r12, r12, r13
	srwi	r13, r4, 10
	rotrwi	r6, r14, 7
	xor	r12, r12, r13
	rotrwi	r13, r14, 18
	xor	r6, r6, r13
	srwi	r13, r14, 3
	add	r11, r11, r12
	xor	r6, r6, r13
	rotrwi	r12, r5, 17
	rotrwi	r13, r5, 19
	add	r6, r6, r11
	lwz	r11, 8*4+4*4(r1)
	
	stw	r6, 8*4+18*4(r1)
	xor	r12, r12, r13
	srwi	r13, r5, 10
	rotrwi	r7, r11, 7
	xor	r12, r12, r13
	rotrwi	r13, r11, 18
	xor	r7, r7, r13
	srwi	r13, r11, 3
	add	r14, r14, r12
	xor	r7, r7, r13
	rotrwi	r12, r6, 17
	rotrwi	r13, r6, 19
	add	r7, r7, r14
	lwz	r14, 8*4+5*4(r1)
	
	stw	r7, 8*4+19*4(r1)
	xor	r12, r12, r13
	srwi	r13, r6, 10
	rotrwi	r8, r14, 7
	xor	r12, r12, r13
	rotrwi	r13, r14, 18
	xor	r8, r8, r13
	srwi	r13, r14, 3
	add	r11, r11, r12
	xor	r8, r8, r13
	rotrwi	r12, r7, 17
	rotrwi	r13, r7, 19
	add	r8, r8, r11
	lwz	r11, 8*4+6*4(r1)
	
	stw	r8, 8*4+20*4(r1)
	xor	r12, r12, r13
	srwi	r13, r7, 10
	rotrwi	r9, r11, 7
	xor	r12, r12, r13
	rotrwi	r13, r11, 18
	xor	r9, r9, r13
	srwi	r13, r11, 3
	add	r14, r14, r12
	xor	r9, r9, r13
	rotrwi	r12, r8, 17
	rotrwi	r13, r8, 19
	add	r9, r9, r14
	lwz	r14, 8*4+7*4(r1)
	
	stw	r9, 8*4+21*4(r1)
	xor	r12, r12, r13
	srwi	r13, r8, 10
	rotrwi	r10, r14, 7
	xor	r12, r12, r13
	rotrwi	r13, r14, 18
	xor	r10, r10, r13
	srwi	r13, r14, 3
	add	r11, r11, r12
	xor	r10, r10, r13
	rotrwi	r12, r9, 17
	rotrwi	r13, r9, 19
	addi	r11, r11, 0x0100
	add	r14, r14, r4
	add	r10, r10, r11
	
	xor	r12, r12, r13
	srwi	r13, r9, 10
	stw	r10, 8*4+22*4(r1)
	addis	r14, r14, 0x1100
	xor	r12, r12, r13
	add	r14, r14, r12
	rotrwi	r12, r10, 17
	rotrwi	r13, r10, 19
	addi	r4, r14, 0x2000
	xor	r12, r12, r13
	srwi	r13, r10, 10
	stw	r4, 8*4+23*4(r1)
	addis	r5, r5, 0x8000
	xor	r12, r12, r13
	add	r5, r5, r12

	rotrwi	r12, r4, 17
	rotrwi	r13, r4, 19
	stw	r5, 8*4+24*4(r1)
	xor	r12, r12, r13
	srwi	r13, r4, 10
	rotrwi	r11, r5, 17
	xor	r12, r12, r13
	rotrwi	r13, r5, 19
	xor	r11, r11, r13
	srwi	r13, r5, 10
	add	r6, r6, r12
	xor	r11, r11, r13
	stw	r6, 8*4+25*4(r1)
	add	r7, r7, r11
	
	rotrwi	r12, r6, 17
	rotrwi	r13, r6, 19
	stw	r7, 8*4+26*4(r1)
	xor	r12, r12, r13
	srwi	r13, r6, 10
	rotrwi	r11, r7, 17
	xor	r12, r12, r13
	rotrwi	r13, r7, 19
	xor	r11, r11, r13
	srwi	r13, r7, 10
	add	r8, r8, r12
	xor	r11, r11, r13
	stw	r8, 8*4+27*4(r1)
	add	r9, r9, r11
	
	rotrwi	r14, r8, 17
	rotrwi	r13, r8, 19
	rotrwi	r12, r9, 17
	stw	r9, 8*4+28*4(r1)
	addis	r4, r4, 0x0040
	xor	r14, r14, r13
	rotrwi	r13, r9, 19
	xor	r12, r12, r13
	srwi	r13, r8, 10
	xor	r14, r14, r13
	srwi	r13, r9, 10
	xor	r12, r12, r13
	addi	r4, r4, 0x0022
	add	r10, r10, r14
	add	r4, r4, r12
	lwz	r11, 8*4+16*4(r1)
	
	addi	r5, r5, 0x0100
	stw	r4, 8*4+30*4(r1)
	rotrwi	r14, r11, 7
	stw	r10, 8*4+29*4(r1)
	rotrwi	r13, r11, 18
	rotrwi	r12, r10, 17
	xor	r14, r14, r13
	rotrwi	r13, r10, 19
	xor	r12, r12, r13
	srwi	r13, r11, 3
	xor	r14, r14, r13
	srwi	r13, r10, 10
	xor	r12, r12, r13
	add	r5, r5, r14
	add	r5, r5, r12
	stw	r5, 8*4+31*4(r1)
	
	sha256_extend_doubleround 16, r1, 8*4, r6, r7, r4, r5
	sha256_extend_doubleround 18, r1, 8*4, r8, r9, r6, r7
	sha256_extend_doubleround 20, r1, 8*4, r10, r4, r8, r9
	sha256_extend_doubleround 22, r1, 8*4, r5, r6, r10, r4
	sha256_extend_doubleround 24, r1, 8*4, r7, r8, r5, r6
	sha256_extend_doubleround 26, r1, 8*4, r9, r10, r7, r8
	sha256_extend_doubleround 28, r1, 8*4, r4, r5, r9, r10
	sha256_extend_doubleround 30, r1, 8*4, r6, r7, r4, r5
	sha256_extend_doubleround 32, r1, 8*4, r8, r9, r6, r7
	sha256_extend_doubleround 34, r1, 8*4, r10, r4, r8, r9
	sha256_extend_doubleround 36, r1, 8*4, r5, r6, r10, r4
	sha256_extend_doubleround 38, r1, 8*4, r7, r8, r5, r6
	sha256_extend_doubleround 40, r1, 8*4, r9, r10, r7, r8
	sha256_extend_doubleround 42, r1, 8*4, r4, r5, r9, r10
	
#ifdef _AIX
	ld	r18, T.sha256_h(r2)
#else
	lis	r18, HI(sha256_h)
	addi	r18, r18, LO(sha256_h)
#endif
	
	lwz	r14, 8*4+(44+1)*4(r1)
	rotrwi	r12, r4, 17
	rotrwi	r13, r4, 19
	add	r15, r11, r6
	rotrwi	r6, r14, 7
	rotrwi	r11, r14, 18
	xor	r12, r12, r13
	xor	r6, r6, r11
	
	lwz	r8, 4*4(r18)
	lwz	r9, 5*4(r18)
	lwz	r10, 6*4(r18)
	lwz	r11, 7*4(r18)
	
	srwi	r13, r4, 10
	srwi	r14, r14, 3
	xor	r12, r12, r13
	xor	r6, r6, r14
	add	r12, r12, r15
	add	r6, r6, r12
	stw	r6, 8*4+(44+16)*4(r1)
	
	lwz	r4, 0*4(r18)
	lwz	r5, 1*4(r18)
	lwz	r6, 2*4(r18)
	lwz	r7, 3*4(r18)
	
	sha256_main_quadround  0, r16, r1, 8*4
	sha256_main_quadround  4, r16, r1, 8*4
	sha256_main_quadround  8, r16, r1, 8*4
	sha256_main_quadround 12, r16, r1, 8*4
	sha256_main_quadround 16, r16, r1, 8*4
	sha256_main_quadround 20, r16, r1, 8*4
	sha256_main_quadround 24, r16, r1, 8*4
	sha256_main_quadround 28, r16, r1, 8*4
	sha256_main_quadround 32, r16, r1, 8*4
	sha256_main_quadround 36, r16, r1, 8*4
	sha256_main_quadround 40, r16, r1, 8*4
	sha256_main_quadround 44, r16, r1, 8*4
	sha256_main_quadround 48, r16, r1, 8*4
	sha256_main_quadround 52, r16, r1, 8*4
	sha256_main_round 56, r16, r1, 8*4, r4, r5, r6, r7, r8, r9, r10, r11

.macro sha256_main_round_red i, rk, rw, wo, rd, re, rf, rg, rh
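	/* Reduced round for the tail: only h is updated
	 * (h += d + Ch(e,f,g) + w[i] + k[i] + Sigma1(e)); the Maj/Sigma0
	 * half is dropped. */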
	lwz	r12, \wo+(\i)*4(\rw)
	and	r15, \rf, \re
	andc	r14, \rg, \re
	add	\rh, \rh, \rd
	or	r14, r14, r15
	lwz	r15, (\i)*4(\rk)
	rotrwi	r13, \re, 5
	add	\rh, \rh, r14
	xor	r14, \re, r13
	rotrwi	r13, \re, 19
	add	\rh, \rh, r12
	xor	r14, r14, r13
	add	\rh, \rh, r15
	rotrwi	r13, r14, 6
	add	\rh, \rh, r13
.endm
	
	sha256_main_round_red 57, r16, r1, 8*4, r6, r11, r8, r9, r10
	sha256_main_round_red 58, r16, r1, 8*4, r5, r10, r11, r8, r9
	sha256_main_round_red 59, r16, r1, 8*4, r4, r9, r10, r11, r8
	lwz	r5, 7*4(r18)
	sha256_main_round_red 60, r16, r1, 8*4, r7, r8, r9, r10, r11
	
	add	r11, r11, r5
	stw	r11, 7*4(r3)
	
	ld	r13, 2*4(r1)
	ld	r14, 4*4(r1)
	ld	r15, 6*4(r1)
	ld	r16, 72*4(r1)
	ld	r17, 74*4(r1)
	ld	r18, 76*4(r1)
	addi	r1, r1, 80*4
	blr


#ifdef __ALTIVEC__

#ifdef __APPLE__
	.machine ppc7400
#endif

#ifdef _AIX
	.csect .text[RO]
#else
	.data
#endif
	.align 4
sha256_4h:
	.long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
	.long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
	.long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
	.long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
	.long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
	.long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
	.long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
	.long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19

	.align 4
sha256_4k:
	.long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98
	.long 0x71374491, 0x71374491, 0x71374491, 0x71374491
	.long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf
	.long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5
	.long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b
	.long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1
	.long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4
	.long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5
	.long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98
	.long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01
	.long 0x243185be, 0x243185be, 0x243185be, 0x243185be
	.long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3
	.long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74
	.long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe
	.long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7
	.long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174
	.long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1
	.long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786
	.long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6
	.long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc
	.long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f
	.long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa
	.long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc
	.long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da
	.long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152
	.long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d
	.long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8
	.long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7
	.long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3
	.long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147
	.long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351
	.long 0x14292967, 0x14292967, 0x14292967, 0x14292967
	.long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85
	.long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138
	.long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc
	.long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13
	.long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354
	.long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb
	.long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e
	.long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85
	.long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1
	.long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b
	.long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70
	.long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3
	.long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819
	.long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624
	.long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585
	.long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070
	.long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116
	.long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08
	.long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c
	.long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5
	.long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3
	.long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a
	.long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f
	.long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3
	.long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee
	.long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f
	.long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814
	.long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208
	.long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa
	.long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb
	.long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7
	.long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2

	.align 4
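/* Precomputed contributions of the fixed padding words of the 32-byte
 * second-hash message, folded directly into its schedule extension. */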
sha256d_4preext2:
	.long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000
	.long 0x11002000, 0x11002000, 0x11002000, 0x11002000
	.long 0x80000000, 0x80000000, 0x80000000, 0x80000000
	.long 0x00400022, 0x00400022, 0x00400022, 0x00400022

	.align 4
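/* vperm control vector: reverses the four bytes of each 32-bit word. */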
br_perm:
	.long 0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c

#ifdef _AIX
	.toc
T.sha256_4h:
	.tc sha256_4h[TC], sha256_4h
T.sha256_4k:
	.tc sha256_4k[TC], sha256_4k
T.sha256d_4preext2:
	.tc sha256d_4preext2[TC], sha256d_4preext2
T.br_perm:
	.tc br_perm[TC], br_perm
#endif


.macro sha256_4way_extend_setup
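	/* vrlw rotates left, so a count of 32-n (vspltisw immediates are taken
	 * mod 32) yields rotate-right by n: v1 -> ROTR7, v18 -> ROTR18,
	 * v17 -> ROTR17, v19 -> ROTR19; v16 and v0 are the >>3 and >>10
	 * shift counts. */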
	vspltisw	v0, 10
	vspltisw	v1, -7
	vspltisw	v16, 3
	vspltisw	v17, 15
	vspltisw	v18, 14
	vspltisw	v19, 13
.endm

.macro sha256_4way_extend_doubleround i, rw, va, vb, vy, vz
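	/* Vector form of the scalar extend doubleround: two schedule words,
	 * four lanes at a time; r7/r8 index w[i+1]/w[i+2] and r10/r11 index
	 * w[i+16]/w[i+17] relative to \rw. */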
	lvx	v14, \rw, r7
	vrlw	v12, \vy, v17
	vrlw	v13, \vy, v19
	vadduwm	v11, v11, \va
	vxor	v12, v12, v13
	vsrw	v13, \vy, v0
	vrlw	\va, v14, v1
	vxor	v12, v12, v13
	vrlw	v13, v14, v18
	vadduwm	v12, v12, v11
	vxor	\va, \va, v13
	vsrw	v13, v14, v16
	lvx	v11, \rw, r8
	vxor	\va, \va, v13
	vrlw	v13, \vz, v19
	vadduwm	\va, \va, v12

	vrlw	v12, \vz, v17
	vadduwm	v14, v14, \vb
	vxor	v12, v12, v13
	vsrw	v13, \vz, v0
	vrlw	\vb, v11, v1
	vxor	v12, v12, v13
	vrlw	v13, v11, v18
	stvx	\va, \rw, r10
	vxor	\vb, \vb, v13
	vsrw	v13, v11, v16
	vadduwm	v14, v14, v12
	vxor	\vb, \vb, v13
	vadduwm	\vb, \vb, v14
	stvx	\vb, \rw, r11
	addi	\rw, \rw, 2*16
.endm


.macro sha256_4way_main_setup
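	/* Left-rotate counts for the main rounds: v3 -> ROTR5, v16 -> ROTR6,
	 * v17 -> ROTR11, v2 -> ROTR20, v18 -> ROTR2 (v19 from the extend
	 * setup doubles as ROTR19). */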
	vspltisw	v2, 12
	vspltisw	v3, -5
	vspltisw	v16, -6
	vspltisw	v17, -11
	vspltisw	v18, -2
.endm

.macro sha256_4way_main_round i, rk, rw, va, vb, vc, vd, ve, vf, vg, vh
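	/* Four-lane main round; same Ch/Maj and Sigma factorings as the scalar
	 * sha256_main_round above. */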
	li	r6, (\i)*16
	lvx	v12, \rw, r6
	vand	v13, \vf, \ve
	vandc	v14, \vg, \ve
	lvx	v15, \rk, r6
	vor	v14, v14, v13
	vrlw	v13, \ve, v3
	vadduwm	\vh, \vh, v14
	vxor	v14, \ve, v13
	vrlw	v13, \ve, v19
	vadduwm	\vh, \vh, v12
	vxor	v14, v14, v13
	vadduwm	\vh, \vh, v15
	vrlw	v13, v14, v16
	vxor	v15, \va, \vb
	vadduwm	\vh, \vh, v13

	vrlw	v13, \va, v17
	vand	v15, v15, \vc
	vxor	v12, \va, v13
	vrlw	v13, \va, v2
	vand	v14, \va, \vb
	vxor	v12, v12, v13
	vxor	v14, v14, v15
	vrlw	v13, v12, v18
	vadduwm	v15, \vh, v14
	vadduwm	\vh, \vh, \vd
	vadduwm	\vd, v15, v13
.endm

.macro sha256_4way_main_quadround i, rk, rw
	sha256_4way_main_round \i+0, \rk, \rw, v4, v5, v6, v7, v8, v9, v10, v11
	sha256_4way_main_round \i+1, \rk, \rw, v7, v4, v5, v6, v11, v8, v9, v10
	sha256_4way_main_round \i+2, \rk, \rw, v6, v7, v4, v5, v10, v11, v8, v9
	sha256_4way_main_round \i+3, \rk, \rw, v5, v6, v7, v4, v9, v10, v11, v8
.endm


#ifdef _AIX
	.csect .text[PR]
#else
	.text
#endif
	.align 2
	.globl sha256_init_4way
	.globl _sha256_init_4way
	.globl .sha256_init_4way
#ifdef __ELF__
	.type sha256_init_4way, %function
#endif
sha256_init_4way:
_sha256_init_4way:
.sha256_init_4way:
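	/* Save VRSAVE, mark v0-v7 live, and copy the replicated initial state to r3. */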
	mfspr	r0, 256
	oris	r12, r0, 0xff00
	mtspr	256, r12
	
#ifdef _AIX
	ld	r4, T.sha256_4h(r2)
#else
	lis	r4, HI(sha256_4h)
	addi	r4, r4, LO(sha256_4h)
#endif
	li	r5, 1*16
	li	r6, 2*16
	li	r7, 3*16
	li	r8, 4*16
	li	r9, 5*16
	li	r10, 6*16
	li	r11, 7*16
	lvx	v0, 0, r4
	lvx	v1, r4, r5
	lvx	v2, r4, r6
	lvx	v3, r4, r7
	lvx	v4, r4, r8
	lvx	v5, r4, r9
	lvx	v6, r4, r10
	lvx	v7, r4, r11
	stvx	v0, 0, r3
	stvx	v1, r3, r5
	stvx	v2, r3, r6
	stvx	v3, r3, r7
	stvx	v4, r3, r8
	stvx	v5, r3, r9
	stvx	v6, r3, r10
	stvx	v7, r3, r11
	
	mtspr	256, r0
	blr


	.align 2
	.globl sha256_transform_4way
	.globl _sha256_transform_4way
	.globl .sha256_transform_4way
#ifdef __ELF__
	.type sha256_transform_4way, %function
#endif
sha256_transform_4way:
_sha256_transform_4way:
.sha256_transform_4way:
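	/* r3 = 4-lane state, r4 = 4-lane data, r5 = swap flag; carves a 16-byte
	 * aligned stack area for the 64-entry message schedule. */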
	mfspr	r0, 256
	oris	r12, r0, 0xffff
	ori	r12, r12, 0xf000
	mtspr	256, r12
	
	andi.	r6, r1, 15
	cmpwi	0, r5, 0
	li	r7, -(4*4+64*16)
	subf	r6, r6, r7
	stdux	r1, r1, r6

	li	r7, 1*16
	li	r8, 2*16
	li	r9, 3*16
	li	r10, 4*16
	li	r11, 5*16
	li	r12, 6*16
	li	r6, 7*16
	
	bne	0, sha256_transform_4way_swap
	
	lvx	v11, 0, r4
	lvx	v1, r4, r7
	lvx	v2, r4, r8
	lvx	v3, r4, r9
	lvx	v4, r4, r10
	lvx	v5, r4, r11
	lvx	v6, r4, r12
	lvx	v7, r4, r6
	addi	r5, r1, 4*4
	stvx	v11, 0, r5
	stvx	v1, r5, r7
	stvx	v2, r5, r8
	stvx	v3, r5, r9
	stvx	v4, r5, r10
	stvx	v5, r5, r11
	stvx	v6, r5, r12
	stvx	v7, r5, r6
	addi	r4, r4, 8*16
	lvx	v0, 0, r4
	lvx	v4, r4, r7
	lvx	v5, r4, r8
	lvx	v6, r4, r9
	lvx	v7, r4, r10
	lvx	v8, r4, r11
	lvx	v9, r4, r12
	lvx	v10, r4, r6
	addi	r4, r1, 4*4+8*16
	stvx	v0, 0, r4
	stvx	v4, r4, r7
	stvx	v5, r4, r8
	stvx	v6, r4, r9
	stvx	v7, r4, r10
	stvx	v8, r4, r11
	stvx	v9, r4, r12
	stvx	v10, r4, r6
	b	sha256_transform_4way_extend

sha256_transform_4way_swap:
#ifdef _AIX
	ld	r5, T.br_perm(r2)
#else
	lis	r5, HI(br_perm)
	addi	r5, r5, LO(br_perm)
#endif
	lvx	v19, 0, r5
	
	lvx	v11, 0, r4
	lvx	v1, r4, r7
	lvx	v2, r4, r8
	lvx	v3, r4, r9
	lvx	v4, r4, r10
	lvx	v5, r4, r11
	lvx	v6, r4, r12
	lvx	v7, r4, r6
	vperm	v11, v11, v11, v19
	vperm	v1, v1, v1, v19
	vperm	v2, v2, v2, v19
	vperm	v3, v3, v3, v19
	vperm	v4, v4, v4, v19
	vperm	v5, v5, v5, v19
	vperm	v6, v6, v6, v19
	vperm	v7, v7, v7, v19
	addi	r5, r1, 4*4
	stvx	v11, 0, r5
	stvx	v1, r5, r7
	stvx	v2, r5, r8
	stvx	v3, r5, r9
	stvx	v4, r5, r10
	stvx	v5, r5, r11
	stvx	v6, r5, r12
	stvx	v7, r5, r6
	addi	r4, r4, 8*16
	lvx	v0, 0, r4
	lvx	v4, r4, r7
	lvx	v5, r4, r8
	lvx	v6, r4, r9
	lvx	v7, r4, r10
	lvx	v8, r4, r11
	lvx	v9, r4, r12
	lvx	v10, r4, r6
	vperm	v0, v0, v0, v19
	vperm	v4, v4, v4, v19
	vperm	v5, v5, v5, v19
	vperm	v6, v6, v6, v19
	vperm	v7, v7, v7, v19
	vperm	v8, v8, v8, v19
	vperm	v9, v9, v9, v19
	vperm	v10, v10, v10, v19
	addi	r4, r1, 4*4+8*16
	stvx	v0, 0, r4
	stvx	v4, r4, r7
	stvx	v5, r4, r8
	stvx	v6, r4, r9
	stvx	v7, r4, r10
	stvx	v8, r4, r11
	stvx	v9, r4, r12
	stvx	v10, r4, r6
	
sha256_transform_4way_extend:
	li	r10, 16*16
	li	r11, 17*16
	sha256_4way_extend_setup
	sha256_4way_extend_doubleround  0, r5, v4, v5, v9, v10
	sha256_4way_extend_doubleround  2, r5, v6, v7, v4, v5
	sha256_4way_extend_doubleround  4, r5, v8, v9, v6, v7
	sha256_4way_extend_doubleround  6, r5, v10, v4, v8, v9
	sha256_4way_extend_doubleround  8, r5, v5, v6, v10, v4
	sha256_4way_extend_doubleround 10, r5, v7, v8, v5, v6
	sha256_4way_extend_doubleround 12, r5, v9, v10, v7, v8
	sha256_4way_extend_doubleround 14, r5, v4, v5, v9, v10
	sha256_4way_extend_doubleround 16, r5, v6, v7, v4, v5
	sha256_4way_extend_doubleround 18, r5, v8, v9, v6, v7
	sha256_4way_extend_doubleround 20, r5, v10, v4, v8, v9
	sha256_4way_extend_doubleround 22, r5, v5, v6, v10, v4
	sha256_4way_extend_doubleround 24, r5, v7, v8, v5, v6
	sha256_4way_extend_doubleround 26, r5, v9, v10, v7, v8
	sha256_4way_extend_doubleround 28, r5, v4, v5, v9, v10
	sha256_4way_extend_doubleround 30, r5, v6, v7, v4, v5
	sha256_4way_extend_doubleround 32, r5, v8, v9, v6, v7
	sha256_4way_extend_doubleround 34, r5, v10, v4, v8, v9
	sha256_4way_extend_doubleround 36, r5, v5, v6, v10, v4
	sha256_4way_extend_doubleround 38, r5, v7, v8, v5, v6
	sha256_4way_extend_doubleround 40, r5, v9, v10, v7, v8
	sha256_4way_extend_doubleround 42, r5, v4, v5, v9, v10
	sha256_4way_extend_doubleround 44, r5, v6, v7, v4, v5
	sha256_4way_extend_doubleround 46, r5, v8, v9, v6, v7
	
	addi	r11, r3, 4*16
	lvx	v4, 0, r3
	lvx	v5, r3, r7
	lvx	v6, r3, r8
	lvx	v7, r3, r9
	lvx	v8, 0, r11
	lvx	v9, r11, r7
	lvx	v10, r11, r8
	lvx	v11, r11, r9
#ifdef _AIX
	ld	r12, T.sha256_4k(r2)
#else
	lis	r12, HI(sha256_4k)
	addi	r12, r12, LO(sha256_4k)
#endif
	addi	r5, r1, 4*4
	sha256_4way_main_setup
	sha256_4way_main_quadround  0, r12, r5
	sha256_4way_main_quadround  4, r12, r5
	sha256_4way_main_quadround  8, r12, r5
	sha256_4way_main_quadround 12, r12, r5
	sha256_4way_main_quadround 16, r12, r5
	sha256_4way_main_quadround 20, r12, r5
	sha256_4way_main_quadround 24, r12, r5
	sha256_4way_main_quadround 28, r12, r5
	sha256_4way_main_quadround 32, r12, r5
	sha256_4way_main_quadround 36, r12, r5
	sha256_4way_main_quadround 40, r12, r5
	sha256_4way_main_quadround 44, r12, r5
	sha256_4way_main_quadround 48, r12, r5
	sha256_4way_main_quadround 52, r12, r5
	sha256_4way_main_quadround 56, r12, r5
	sha256_4way_main_quadround 60, r12, r5
	
	lvx	v12, 0, r3
	lvx	v13, r3, r7
	lvx	v14, r3, r8
	lvx	v15, r3, r9
	lvx	v16, 0, r11
	lvx	v17, r11, r7
	lvx	v18, r11, r8
	lvx	v19, r11, r9
	vadduwm	v4, v4, v12
	vadduwm	v5, v5, v13
	vadduwm	v6, v6, v14
	vadduwm	v7, v7, v15
	vadduwm	v8, v8, v16
	vadduwm	v9, v9, v17
	vadduwm	v10, v10, v18
	vadduwm	v11, v11, v19
	stvx	v4, 0, r3
	stvx	v5, r3, r7
	stvx	v6, r3, r8
	stvx	v7, r3, r9
	stvx	v8, 0, r11
	stvx	v9, r11, r7
	stvx	v10, r11, r8
	stvx	v11, r11, r9
	
	ld	r1, 0(r1)
	mtspr	256, r0
	blr


	.align 2
	.globl sha256d_ms_4way
	.globl _sha256d_ms_4way
	.globl .sha256d_ms_4way
#ifdef __ELF__
	.type sha256d_ms_4way, %function
#endif
sha256d_ms_4way:
_sha256d_ms_4way:
.sha256d_ms_4way:
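	/* 4-way sha256d_ms: r3 = output hash, r4 = W, r5 = midstate,
	 * r6 = prehash; same roles as the scalar routine above, four lanes wide. */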
	mfspr	r0, 256
	oris	r12, r0, 0xffff
	ori	r12, r12, 0xf000
	mtspr	256, r12
	
	andi.	r12, r1, 15
	li	r11, -(4*4+64*16)
	subf	r12, r12, r11
	stdux	r1, r1, r12
	
	li	r7, 1*16
	li	r8, 2*16
	li	r9, 3*16
	li	r10, 16*16
	li	r11, 17*16
	
	sha256_4way_extend_setup
	
	addi	r4, r4, 2*16
	addi	r12, r1, 4*4+18*16
	lvx	v14, r4, r7
	lvx	v6, r4, r10
	lvx	v7, r4, r11
	
	vrlw	v12, v14, v1
	vrlw	v13, v14, v18
	stvx	v6, 0, r12
	vxor	v12, v12, v13
	vsrw	v13, v14, v16
	stvx	v7, r12, r7
	vxor	v12, v12, v13
	vadduwm	v6, v6, v12
	vadduwm	v7, v7, v14
	stvx	v6, r4, r10
	
	vrlw	v12, v6, v17
	vrlw	v13, v6, v19
	stvx	v7, r4, r11
	addi	r4, r4, 18*16
	lvx	v8, 0, r4
	vxor	v12, v12, v13
	vsrw	v13, v6, v0
	stvx	v8, r12, r8
	vxor	v12, v12, v13
	vadduwm	v8, v8, v12
	
	vrlw	v9, v7, v17
	vrlw	v13, v7, v19
	stvx	v8, 0, r4
	vxor	v9, v9, v13
	vsrw	v13, v7, v0
	vxor	v9, v9, v13
	
	vrlw	v12, v8, v17
	vrlw	v13, v8, v19
	stvx	v9, r4, r7
	vxor	v12, v12, v13
	vsrw	v13, v8, v0
	lvx	v10, r4, r8
	lvx	v4, r4, r9
	vxor	v12, v12, v13
	stvx	v10, r12, r9
	addi	r12, r12, 4*16
	stvx	v4, 0, r12
	vrlw	v14, v9, v17
	vrlw	v13, v9, v19
	vadduwm	v10, v10, v12
	
	vxor	v14, v14, v13
	vsrw	v13, v9, v0
	stvx	v10, r4, r8
	vxor	v14, v14, v13
	vadduwm	v4, v4, v14
	
	vrlw	v12, v10, v17
	vrlw	v13, v10, v19
	stvx	v4, r4, r9
	vxor	v12, v12, v13
	vsrw	v13, v10, v0
	vrlw	v14, v4, v17
	vxor	v12, v12, v13
	vrlw	v13, v4, v19
	addi	r4, r4, 4*16
	lvx	v5, 0, r4
	vxor	v14, v14, v13
	stvx	v5, r12, r7
	vsrw	v13, v4, v0
	vadduwm	v5, v5, v12
	vxor	v14, v14, v13
	stvx	v5, 0, r4
	vadduwm	v6, v6, v14
	
	vrlw	v12, v5, v17
	vrlw	v13, v5, v19
	stvx	v6, r4, r7
	vxor	v12, v12, v13
	vsrw	v13, v5, v0
	vrlw	v14, v6, v17
	vxor	v12, v12, v13
	vrlw	v13, v6, v19
	vxor	v14, v14, v13
	vsrw	v13, v6, v0
	vadduwm	v7, v7, v12
	vxor	v14, v14, v13
	stvx	v7, r4, r8
	vadduwm	v8, v8, v14
	
	vrlw	v12, v7, v17
	vrlw	v13, v7, v19
	stvx	v8, r4, r9
	vxor	v12, v12, v13
	vsrw	v13, v7, v0
	vrlw	v14, v8, v17
	vxor	v12, v12, v13
	vrlw	v13, v8, v19
	vxor	v14, v14, v13
	vsrw	v13, v8, v0
	vadduwm	v9, v9, v12
	vxor	v14, v14, v13
	addi	r4, r4, 4*16
	stvx	v9, 0, r4
	vadduwm	v10, v10, v14
	
	vrlw	v12, v9, v17
	vrlw	v13, v9, v19
	stvx	v10, r4, r7
	vxor	v12, v12, v13
	vsrw	v13, v9, v0
	lvx	v11, r4, r8
	lvx	v14, r4, r9
	stvx	v11, r12, r8
	stvx	v14, r12, r9
	vxor	v12, v12, v13
	vadduwm	v11, v11, v12
	vadduwm	v5, v5, v14
	vrlw	v12, v10, v17
	vrlw	v13, v10, v19
	vadduwm	v4, v4, v11
	
	vxor	v12, v12, v13
	vsrw	v13, v10, v0
	stvx	v4, r4, r8
	vxor	v12, v12, v13
	vadduwm	v5, v5, v12
	stvx	v5, r4, r9
	addi	r4, r4, -12*16
	lvx	v11, 0, r4
	
	sha256_4way_extend_doubleround 16, r4, v6, v7, v4, v5
	sha256_4way_extend_doubleround 18, r4, v8, v9, v6, v7
	sha256_4way_extend_doubleround 20, r4, v10, v4, v8, v9
	sha256_4way_extend_doubleround 22, r4, v5, v6, v10, v4
	sha256_4way_extend_doubleround 24, r4, v7, v8, v5, v6
	sha256_4way_extend_doubleround 26, r4, v9, v10, v7, v8
	sha256_4way_extend_doubleround 28, r4, v4, v5, v9, v10
	sha256_4way_extend_doubleround 30, r4, v6, v7, v4, v5
	sha256_4way_extend_doubleround 32, r4, v8, v9, v6, v7
	sha256_4way_extend_doubleround 34, r4, v10, v4, v8, v9
	sha256_4way_extend_doubleround 36, r4, v5, v6, v10, v4
	sha256_4way_extend_doubleround 38, r4, v7, v8, v5, v6
	sha256_4way_extend_doubleround 40, r4, v9, v10, v7, v8
	sha256_4way_extend_doubleround 42, r4, v4, v5, v9, v10
	sha256_4way_extend_doubleround 44, r4, v6, v7, v4, v5
	sha256_4way_extend_doubleround 46, r4, v8, v9, v6, v7
	addi	r4, r4, -48*16
	
	lvx	v4, 0, r6
	lvx	v9, r6, r7
	lvx	v10, r6, r8
	lvx	v11, r6, r9
	addi	r12, r6, 4*16
	lvx	v8, 0, r12
	lvx	v5, r12, r7
	lvx	v6, r12, r8
	lvx	v7, r12, r9
#ifdef _AIX
	ld	r12, T.sha256_4k(r2)
#else
	lis	r12, HI(sha256_4k)
	addi	r12, r12, LO(sha256_4k)
#endif
	sha256_4way_main_setup
	sha256_4way_main_round  3, r12, r4, v5, v6, v7, v4, v9, v10, v11, v8
	sha256_4way_main_quadround  4, r12, r4
	sha256_4way_main_quadround  8, r12, r4
	sha256_4way_main_quadround 12, r12, r4
	sha256_4way_main_quadround 16, r12, r4
	sha256_4way_main_quadround 20, r12, r4
	sha256_4way_main_quadround 24, r12, r4
	sha256_4way_main_quadround 28, r12, r4
	sha256_4way_main_quadround 32, r12, r4
	sha256_4way_main_quadround 36, r12, r4
	sha256_4way_main_quadround 40, r12, r4
	sha256_4way_main_quadround 44, r12, r4
	sha256_4way_main_quadround 48, r12, r4
	sha256_4way_main_quadround 52, r12, r4
	sha256_4way_main_quadround 56, r12, r4
	sha256_4way_main_quadround 60, r12, r4
	
	lvx	v12, 0, r5
	lvx	v13, r5, r7
	lvx	v14, r5, r8
	lvx	v15, r5, r9
	addi	r12, r5, 4*16
	lvx	v16, 0, r12
	lvx	v17, r12, r7
	lvx	v18, r12, r8
	lvx	v19, r12, r9
	vadduwm	v4, v4, v12
	vadduwm	v5, v5, v13
	vadduwm	v6, v6, v14
	vadduwm	v7, v7, v15
	vadduwm	v8, v8, v16
	vadduwm	v9, v9, v17
	vadduwm	v10, v10, v18
	vadduwm	v11, v11, v19
	addi	r12, r1, 4*4
	stvx	v4, 0, r12
	stvx	v5, r12, r7
	stvx	v6, r12, r8
	stvx	v7, r12, r9
	addi	r12, r12, 4*16
	stvx	v8, 0, r12
	stvx	v9, r12, r7
	stvx	v10, r12, r8
	stvx	v11, r12, r9
	
	addi	r12, r1, 4*4+18*16
	lvx	v4, 0, r12
	lvx	v5, r12, r7
	lvx	v6, r12, r8
	lvx	v7, r12, r9
	addi	r12, r12, 4*16
	lvx	v8, 0, r12
	lvx	v9, r12, r7
	lvx	v10, r12, r8
	lvx	v11, r12, r9
	addi	r12, r4, 18*16
	stvx	v4, 0, r12
	stvx	v5, r12, r7
	stvx	v6, r12, r8
	addi	r12, r4, 22*16
	stvx	v7, 0, r12
	stvx	v8, r12, r7
	stvx	v9, r12, r8
	addi	r12, r4, 30*16
	stvx	v10, 0, r12
	stvx	v11, r12, r7
	
	addi	r4, r1, 4*4
	
	sha256_4way_extend_setup
	
#ifdef _AIX
	ld	r12, T.sha256d_4preext2(r2)
#else
	lis	r12, HI(sha256d_4preext2)
	addi	r12, r12, LO(sha256d_4preext2)
#endif
	lvx	v2, 0, r12
	
	vxor	v9, v9, v9
	vspltisw	v3, 1
	lvx	v4, r12, r8
	vsldoi	v3, v3, v3, 1
	addi	r5, r1, 4*4+8*16
	stvx	v4, 0, r5
	stvx	v9, r5, r7
	stvx	v9, r5, r8
	stvx	v9, r5, r9
	addi	r5, r5, 4*16
	stvx	v9, 0, r5
	stvx	v9, r5, r7
	stvx	v9, r5, r8
	stvx	v3, r5, r9
	
	lvx	v4, 0, r4
	lvx	v14, r4, r7
	
	lvx	v11, r4, r8
	vrlw	v12, v14, v1
	vrlw	v13, v14, v18
	
	vxor	v12, v12, v13
	vsrw	v13, v14, v16
	vadduwm	v5, v14, v2
	vxor	v12, v12, v13
	vrlw	v14, v11, v1
	vrlw	v13, v11, v18
	vadduwm	v4, v4, v12
	vxor	v14, v14, v13
	vsrw	v13, v11, v16
	stvx	v4, r4, r10
	vxor	v14, v14, v13
	vrlw	v12, v4, v17
	vrlw	v13, v4, v19
	vadduwm	v5, v5, v14
	
	stvx	v5, r4, r11
	addi	r4, r4, 2*16
	lvx	v14, r4, r7
	vxor	v12, v12, v13
	vsrw	v13, v4, v0
	vrlw	v6, v14, v1
	vxor	v12, v12, v13
	vrlw	v13, v14, v18
	vxor	v6, v6, v13
	vsrw	v13, v14, v16
	vadduwm	v11, v11, v12
	vxor	v6, v6, v13
	vrlw	v12, v5, v17
	vrlw	v13, v5, v19
	vadduwm	v6, v6, v11
	lvx	v11, r4, r8
	
	stvx	v6, r4, r10
	vxor	v12, v12, v13
	vsrw	v13, v5, v0
	vrlw	v7, v11, v1
	vxor	v12, v12, v13
	vrlw	v13, v11, v18
	vxor	v7, v7, v13
	vsrw	v13, v11, v16
	vadduwm	v14, v14, v12
	vxor	v7, v7, v13
	vrlw	v12, v6, v17
	vrlw	v13, v6, v19
	vadduwm	v7, v7, v14
	
	stvx	v7, r4, r11
	addi	r4, r4, 2*16
	lvx	v14, r4, r7
	vxor	v12, v12, v13
	vsrw	v13, v6, v0
	vrlw	v8, v14, v1
	vxor	v12, v12, v13
	vrlw	v13, v14, v18
	vxor	v8, v8, v13
	vsrw	v13, v14, v16
	vadduwm	v11, v11, v12
	vxor	v8, v8, v13
	vrlw	v12, v7, v17
	vrlw	v13, v7, v19
	vadduwm	v8, v8, v11
	lvx	v11, r4, r8
	
	stvx	v8, r4, r10
	vxor	v12, v12, v13
	vsrw	v13, v7, v0
	vrlw	v9, v11, v1
	vxor	v12, v12, v13
	vrlw	v13, v11, v18
	vxor	v9, v9, v13
	vsrw	v13, v11, v16
	vadduwm	v14, v14, v12
	vxor	v9, v9, v13
	vrlw	v12, v8, v17
	vrlw	v13, v8, v19
	vadduwm	v9, v9, v14
	
	stvx	v9, r4, r11
	addi	r4, r4, 2*16
	lvx	v14, r4, r7
	vxor	v12, v12, v13
	vsrw	v13, v8, v0
	vrlw	v10, v14, v1
	vxor	v12, v12, v13
	vrlw	v13, v14, v18
	vxor	v10, v10, v13
	vsrw	v13, v14, v16
	vadduwm	v11, v11, v12
	vxor	v10, v10, v13
	vrlw	v12, v9, v17
	vrlw	v13, v9, v19
	vadduwm	v11, v11, v3
	vadduwm	v14, v14, v4
	vadduwm	v10, v10, v11
	
	lvx	v2, r12, r7
	vxor	v12, v12, v13
	vsrw	v13, v9, v0
	stvx	v10, r4, r10
	vxor	v12, v12, v13
	vadduwm	v14, v14, v12
	vrlw	v12, v10, v17
	vrlw	v13, v10, v19
	vadduwm	v4, v14, v2
	lvx	v2, r12, r8
	vxor	v12, v12, v13
	vsrw	v13, v10, v0
	stvx	v4, r4, r11
	vadduwm	v5, v5, v2
	vxor	v12, v12, v13
	vadduwm	v5, v5, v12

	vrlw	v12, v4, v17
	vrlw	v13, v4, v19
	addi	r4, r4, 2*16
	stvx	v5, r4, r10
	vxor	v12, v12, v13
	vsrw	v13, v4, v0
	vrlw	v11, v5, v17
	vxor	v12, v12, v13
	vrlw	v13, v5, v19
	vxor	v11, v11, v13
	vsrw	v13, v5, v0
	vadduwm	v6, v6, v12
	vxor	v11, v11, v13
	stvx	v6, r4, r11
	vadduwm	v7, v7, v11
	
	vrlw	v12, v6, v17
	vrlw	v13, v6, v19
	addi	r4, r4, 2*16
	stvx	v7, r4, r10
	vxor	v12, v12, v13
	vsrw	v13, v6, v0
	vrlw	v11, v7, v17
	vxor	v12, v12, v13
	vrlw	v13, v7, v19
	vxor	v11, v11, v13
	vsrw	v13, v7, v0
	vadduwm	v8, v8, v12
	vxor	v11, v11, v13
	stvx	v8, r4, r11
	vadduwm	v9, v9, v11
	
	lvx	v2, r12, r9
	vrlw	v14, v8, v17
	vrlw	v13, v8, v19
	vrlw	v12, v9, v17
	addi	r4, r4, 2*16
	stvx	v9, r4, r10
	vxor	v14, v14, v13
	vrlw	v13, v9, v19
	vxor	v12, v12, v13
	vsrw	v13, v8, v0
	vxor	v14, v14, v13
	vsrw	v13, v9, v0
	vxor	v12, v12, v13
	vadduwm	v4, v4, v2
	vadduwm	v10, v10, v14
	vadduwm	v4, v4, v12
	stvx	v10, r4, r11
	addi	r4, r4, 2*16
	lvx	v11, r4, r8
	
	vadduwm	v5, v5, v3
	stvx	v4, r4, r10
	vrlw	v14, v11, v1
	vrlw	v13, v11, v18
	vrlw	v12, v10, v17
	vxor	v14, v14, v13
	vrlw	v13, v10, v19
	vxor	v12, v12, v13
	vsrw	v13, v11, v16
	vxor	v14, v14, v13
	vsrw	v13, v10, v0
	vxor	v12, v12, v13
	vadduwm	v5, v5, v14
	vadduwm	v5, v5, v12
	stvx	v5, r4, r11
	addi	r4, r4, 2*16
	
	sha256_4way_extend_doubleround 16, r4, v6, v7, v4, v5
	sha256_4way_extend_doubleround 18, r4, v8, v9, v6, v7
	sha256_4way_extend_doubleround 20, r4, v10, v4, v8, v9
	sha256_4way_extend_doubleround 22, r4, v5, v6, v10, v4
	sha256_4way_extend_doubleround 24, r4, v7, v8, v5, v6
	sha256_4way_extend_doubleround 26, r4, v9, v10, v7, v8
	sha256_4way_extend_doubleround 28, r4, v4, v5, v9, v10
	sha256_4way_extend_doubleround 30, r4, v6, v7, v4, v5
	sha256_4way_extend_doubleround 32, r4, v8, v9, v6, v7
	sha256_4way_extend_doubleround 34, r4, v10, v4, v8, v9
	sha256_4way_extend_doubleround 36, r4, v5, v6, v10, v4
	sha256_4way_extend_doubleround 38, r4, v7, v8, v5, v6
	sha256_4way_extend_doubleround 40, r4, v9, v10, v7, v8
	sha256_4way_extend_doubleround 42, r4, v4, v5, v9, v10
	
	lvx	v14, r4, r7
	vrlw	v12, v4, v17
	vrlw	v13, v4, v19
	vadduwm	v15, v11, v6
	vrlw	v6, v14, v1
	vrlw	v11, v14, v18
	vxor	v12, v12, v13
	vxor	v6, v6, v11
	vsrw	v13, v4, v0
	vsrw	v14, v14, v16
	vxor	v12, v12, v13
	vxor	v6, v6, v14
	vadduwm	v12, v12, v15
	vadduwm	v6, v6, v12
	stvx	v6, r4, r10
	addi	r4, r4, -44*16
	
#ifdef _AIX
	ld	r5, T.sha256_4h(r2)
#else
	lis	r5, HI(sha256_4h)
	addi	r5, r5, LO(sha256_4h)
#endif
	lvx	v4, 0, r5
	lvx	v5, r5, r7
	lvx	v6, r5, r8
	lvx	v7, r5, r9
	addi	r12, r5, 4*16
	lvx	v8, 0, r12
	lvx	v9, r12, r7
	lvx	v10, r12, r8
	lvx	v11, r12, r9
#ifdef _AIX
	ld	r12, T.sha256_4k(r2)
#else
	lis	r12, HI(sha256_4k)
	addi	r12, r12, LO(sha256_4k)
#endif
	sha256_4way_main_setup
	sha256_4way_main_quadround  0, r12, r4
	sha256_4way_main_quadround  4, r12, r4
	sha256_4way_main_quadround  8, r12, r4
	sha256_4way_main_quadround 12, r12, r4
	sha256_4way_main_quadround 16, r12, r4
	sha256_4way_main_quadround 20, r12, r4
	sha256_4way_main_quadround 24, r12, r4
	sha256_4way_main_quadround 28, r12, r4
	sha256_4way_main_quadround 32, r12, r4
	sha256_4way_main_quadround 36, r12, r4
	sha256_4way_main_quadround 40, r12, r4
	sha256_4way_main_quadround 44, r12, r4
	sha256_4way_main_quadround 48, r12, r4
	sha256_4way_main_quadround 52, r12, r4
	sha256_4way_main_round 56, r12, r4, v4, v5, v6, v7, v8, v9, v10, v11

.macro sha256_4way_main_round_red i, rk, rw, vd, ve, vf, vg, vh
	li	r6, (\i)*16
	vand	v15, \vf, \ve
	vandc	v14, \vg, \ve
	lvx	v12, \rw, r6
	vadduwm	\vh, \vh, \vd
	vor	v14, v14, v15
	lvx	v15, \rk, r6
	vrlw	v13, \ve, v3
	vadduwm	\vh, \vh, v14
	vxor	v14, \ve, v13
	vrlw	v13, \ve, v19
	vadduwm	\vh, \vh, v12
	vxor	v14, v14, v13
	vadduwm	\vh, \vh, v15
	vrlw	v13, v14, v16
	vadduwm	\vh, \vh, v13
.endm

	sha256_4way_main_round_red 57, r12, r4, v6, v11, v8, v9, v10
	sha256_4way_main_round_red 58, r12, r4, v5, v10, v11, v8, v9
	sha256_4way_main_round_red 59, r12, r4, v4, v9, v10, v11, v8
	sha256_4way_main_round_red 60, r12, r4, v7, v8, v9, v10, v11
	
	li	r12, 7*16
	lvx	v19, r5, r12
	vadduwm	v11, v11, v19
	stvx	v11, r3, r12
	
	ld	r1, 0(r1)
	mtspr	256, r0
	blr


	.align 2
	.globl sha256_use_4way
	.globl _sha256_use_4way
	.globl .sha256_use_4way
#ifdef __ELF__
	.type sha256_use_4way, %function
#endif
sha256_use_4way:
_sha256_use_4way:
.sha256_use_4way:
	li	r3, 1
	blr

#endif /* __ALTIVEC__ */

#endif
0707010000002A000081A4000003E800000064000000015EF4BCA10001A5DF000000000000000000000000000000000000001A00000000cpuminer-2.5.1/sha2-x64.S
/*
 * Copyright 2012-2015 pooler@litecoinpool.org
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.  See COPYING for more details.
 */

#include "cpuminer-config.h"

#if defined(__linux__) && defined(__ELF__)
	.section .note.GNU-stack,"",%progbits
#endif

#if defined(USE_ASM) && defined(__x86_64__)

	.data
	.p2align 4
sha256_h:
	.long 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a
	.long 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19

	.data
	.p2align 6
sha256_k:
	.long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2

bswap_xmm_mask:
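	/* pshufb control: byte-swaps each 32-bit word. */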
	.long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f


.macro sha256_mixed_quadround ra, rb, rc, rd, re, rf, rg, rh, x0, x1, x2, x3
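	/* Four main rounds interleaved with the SSE2 computation of the next
	 * four message-schedule words (\x0..\x3 hold w[i..i+15], four words each). */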
	movdqa	\x3, %xmm4
	movl	\re, %eax
	movdqa	\x2, %xmm6
	rorl	$(25-11), %eax
	movl	\ra, %ebx
	pslldq	$12, %xmm4
	rorl	$(22-13), %ebx
	psrldq	$4, %xmm6
	xorl	\re, %eax
	movl	\rf, %ecx
	rorl	$(11-6), %eax
	pxor	%xmm6, %xmm4
	movdqa	\x1, %xmm5
	xorl	\ra, %ebx
	xorl	\rg, %ecx
	xorl	\re, %eax
	paddd	\x0, %xmm4
	movdqa	\x0, %xmm7
	andl	\re, %ecx
	rorl	$(13-2), %ebx
	xorl	\ra, %ebx
	pslldq	$12, %xmm5
	psrldq	$4, %xmm7
	rorl	$6, %eax
	xorl	\rg, %ecx
	pxor	%xmm7, %xmm5
	rorl	$2, %ebx
	addl	%eax, %ecx
	addl	(%rsp), %ecx
	movdqa	%xmm5, %xmm6
	movl	\ra, %eax
	addl	%ecx, \rh
	movl	\ra, %ecx
	movdqa	%xmm5, %xmm7
	orl	\rc, %eax
	addl	\rh, \rd
	andl	\rc, %ecx
	pslld	$(32-7), %xmm5
	psrld	$7, %xmm6
	andl	\rb, %eax
	addl	%ebx, \rh
	orl	%ecx, %eax
	por	%xmm6, %xmm5
	addl	%eax, \rh
	
	movl	\rd, %eax
	movdqa	%xmm7, %xmm6
	movl	\rh, %ebx
	rorl	$(25-11), %eax
	xorl	\rd, %eax
	movdqa	%xmm7, %xmm8
	movl	\re, %ecx
	rorl	$(22-13), %ebx
	xorl	\rh, %ebx
	pslld	$(32-18), %xmm7
	rorl	$(11-6), %eax
	xorl	\rf, %ecx
	rorl	$(13-2), %ebx
	psrld	$18, %xmm6
	xorl	\rd, %eax
	andl	\rd, %ecx
	rorl	$6, %eax
	pxor	%xmm7, %xmm5
	xorl	\rh, %ebx
	xorl	\rf, %ecx
	psrld	$3, %xmm8
	addl	%eax, %ecx
	addl	1*4(%rsp), %ecx
	rorl	$2, %ebx
	pxor	%xmm6, %xmm5
	movl	\rh, %eax
	addl	%ecx, \rg
	movl	\rh, %ecx
	pxor	%xmm8, %xmm5
	orl	\rb, %eax
	addl	\rg, \rc
	andl	\rb, %ecx
	pshufd	$0xfa, \x3, %xmm6
	andl	\ra, %eax
	addl	%ebx, \rg
	paddd	%xmm5, %xmm4
	orl	%ecx, %eax
	addl	%eax, \rg
	
	movl	\rc, %eax
	movdqa	%xmm6, %xmm7
	movl	\rg, %ebx
	rorl	$(25-11), %eax
	xorl	\rc, %eax
	movdqa	%xmm6, %xmm8
	rorl	$(22-13), %ebx
	movl	\rd, %ecx
	xorl	\rg, %ebx
	psrlq	$17, %xmm6
	psrlq	$19, %xmm7
	rorl	$(11-6), %eax
	xorl	\re, %ecx
	xorl	\rc, %eax
	psrld	$10, %xmm8
	pxor	%xmm7, %xmm6
	andl	\rc, %ecx
	rorl	$(13-2), %ebx
	xorl	\rg, %ebx
	pxor	%xmm6, %xmm8
	xorl	\re, %ecx
	rorl	$6, %eax
	addl	%eax, %ecx
	pshufd	$0x8f, %xmm8, %xmm8
	rorl	$2, %ebx
	addl	2*4(%rsp), %ecx
	movl	\rg, %eax
	psrldq	$8, %xmm8
	addl	%ecx, \rf
	movl	\rg, %ecx
	orl	\ra, %eax
	paddd	%xmm8, %xmm4
	addl	\rf, \rb
	andl	\ra, %ecx
	andl	\rh, %eax
	pshufd	$0x50, %xmm4, %xmm6
	addl	%ebx, \rf
	orl	%ecx, %eax
	addl	%eax, \rf
	
	movdqa	%xmm6, %xmm7
	movl	\rb, %eax
	rorl	$(25-11), %eax
	movl	\rf, %ebx
	movdqa	%xmm6, \x0
	rorl	$(22-13), %ebx
	xorl	\rb, %eax
	movl	\rc, %ecx
	psrlq	$17, %xmm6
	rorl	$(11-6), %eax
	xorl	\rf, %ebx
	xorl	\rd, %ecx
	psrlq	$19, %xmm7
	xorl	\rb, %eax
	andl	\rb, %ecx
	rorl	$(13-2), %ebx
	psrld	$10, \x0
	xorl	\rf, %ebx
	rorl	$6, %eax
	pxor	%xmm7, %xmm6
	xorl	\rd, %ecx
	rorl	$2, %ebx
	addl	%eax, %ecx
	pxor	%xmm6, \x0
	addl	3*4(%rsp), %ecx
	movl	\rf, %eax
	addl	%ecx, \re
	pshufd	$0xf8, \x0, \x0
	movl	\rf, %ecx
	orl	\rh, %eax
	addl	\re, \ra
	pslldq	$8, \x0
	andl	\rh, %ecx
	andl	\rg, %eax
	paddd	%xmm4, \x0
	addl	%ebx, \re
	orl	%ecx, %eax
	addl	%eax, \re
.endm

.macro sha256_main_round i, ra, rb, rc, rd, re, rf, rg, rh
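	/* One main round; \i*4(%rsp) already holds w[i] + k[i] (stored by the
	 * paddd in the caller). */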
	movl	\re, %eax
	rorl	$(25-11), %eax
	movl	\ra, %ebx
	xorl	\re, %eax
	rorl	$(22-13), %ebx
	movl	\rf, %ecx
	xorl	\ra, %ebx
	rorl	$(11-6), %eax
	xorl	\rg, %ecx
	xorl	\re, %eax
	rorl	$(13-2), %ebx
	andl	\re, %ecx
	xorl	\ra, %ebx
	rorl	$6, %eax
	xorl	\rg, %ecx
	addl	%eax, %ecx
	rorl	$2, %ebx
	addl	\i*4(%rsp), %ecx
	movl	\ra, %eax
	addl	%ecx, \rh
	movl	\ra, %ecx
	orl	\rc, %eax
	addl	\rh, \rd
	andl	\rc, %ecx
	andl	\rb, %eax
	addl	%ebx, \rh
	orl	%ecx, %eax
	addl	%eax, \rh
.endm


	.text
	.p2align 6
sha256_transform_sse2:
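	/* Win64/Cygwin: xmm6-xmm9 are callee-saved and the first three integer
	 * arguments arrive in rcx/rdx/r8, so shuffle them into rdi/rsi/rdx. */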
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
#if defined(_WIN64) || defined(__CYGWIN__)
	pushq	%rdi
	pushq	%rsi
	subq	$5*16, %rsp
	movdqa	%xmm6, 1*16(%rsp)
	movdqa	%xmm7, 2*16(%rsp)
	movdqa	%xmm8, 3*16(%rsp)
	movdqa	%xmm9, 4*16(%rsp)
	movq	%rcx, %rdi
	movq	%rdx, %rsi
	movq	%r8, %rdx
#else
	subq	$16, %rsp
#endif
	
	movl	0*4(%rdi), %r8d
	movl	1*4(%rdi), %r9d
	movl	2*4(%rdi), %r10d
	movl	3*4(%rdi), %r11d
	movl	4*4(%rdi), %r12d
	movl	5*4(%rdi), %r13d
	movl	6*4(%rdi), %r14d
	movl	7*4(%rdi), %r15d
	
	testq	%rdx, %rdx
	jnz sha256_transform_sse2_swap
	
	movdqu	0*16(%rsi), %xmm0
	movdqu	1*16(%rsi), %xmm1
	movdqu	2*16(%rsi), %xmm2
	movdqu	3*16(%rsi), %xmm3
	jmp sha256_transform_sse2_core
	
sha256_transform_sse2_swap:
	movdqu	0*16(%rsi), %xmm0
	movdqu	1*16(%rsi), %xmm1
	movdqu	2*16(%rsi), %xmm2
	movdqu	3*16(%rsi), %xmm3
	pshuflw	$0xb1, %xmm0, %xmm0
	pshuflw	$0xb1, %xmm1, %xmm1
	pshuflw	$0xb1, %xmm2, %xmm2
	pshuflw	$0xb1, %xmm3, %xmm3
	pshufhw	$0xb1, %xmm0, %xmm0
	pshufhw	$0xb1, %xmm1, %xmm1
	pshufhw	$0xb1, %xmm2, %xmm2
	pshufhw	$0xb1, %xmm3, %xmm3
	movdqa	%xmm0, %xmm4
	movdqa	%xmm1, %xmm5
	movdqa	%xmm2, %xmm6
	movdqa	%xmm3, %xmm7
	psrlw	$8, %xmm4
	psrlw	$8, %xmm5
	psrlw	$8, %xmm6
	psrlw	$8, %xmm7
	psllw	$8, %xmm0
	psllw	$8, %xmm1
	psllw	$8, %xmm2
	psllw	$8, %xmm3
	pxor	%xmm4, %xmm0
	pxor	%xmm5, %xmm1
	pxor	%xmm6, %xmm2
	pxor	%xmm7, %xmm3
	
sha256_transform_sse2_core:
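	/* 48 rounds with on-the-fly message extension (three 16-round loop
	 * iterations), then 16 tail rounds on the final schedule words. */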
	leaq	sha256_k(%rip), %rdx
	movq	$48, %rsi
	.p2align 4
sha256_transform_sse2_loop:
	movdqa	0*16(%rdx), %xmm9
	paddd	%xmm0, %xmm9
	movdqa	%xmm9, (%rsp)
	sha256_mixed_quadround %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %xmm0, %xmm1, %xmm2, %xmm3
	movdqa	1*16(%rdx), %xmm9
	paddd	%xmm1, %xmm9
	movdqa	%xmm9, (%rsp)
	sha256_mixed_quadround %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %xmm1, %xmm2, %xmm3, %xmm0
	movdqa	2*16(%rdx), %xmm9
	paddd	%xmm2, %xmm9
	movdqa	%xmm9, (%rsp)
	sha256_mixed_quadround %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %xmm2, %xmm3, %xmm0, %xmm1
	movdqa	3*16(%rdx), %xmm9
	paddd	%xmm3, %xmm9
	movdqa	%xmm9, (%rsp)
	addq	$4*16, %rdx
	sha256_mixed_quadround %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %xmm3, %xmm0, %xmm1, %xmm2
	
	subq	$16, %rsi
	jne sha256_transform_sse2_loop
	
	paddd	0*16(%rdx), %xmm0
	movdqa	%xmm0, (%rsp)
	sha256_main_round 0, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d
	sha256_main_round 1, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d
	sha256_main_round 2, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d
	sha256_main_round 3, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d
	paddd	1*16(%rdx), %xmm1
	movdqa	%xmm1, (%rsp)
	sha256_main_round 0, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d
	sha256_main_round 1, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d
	sha256_main_round 2, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d
	sha256_main_round 3, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d
	paddd	2*16(%rdx), %xmm2
	movdqa	%xmm2, (%rsp)
	sha256_main_round 0, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d
	sha256_main_round 1, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d
	sha256_main_round 2, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d
	sha256_main_round 3, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d
	paddd	3*16(%rdx), %xmm3
	movdqa	%xmm3, (%rsp)
	sha256_main_round 0, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d
	sha256_main_round 1, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d
	sha256_main_round 2, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d
	sha256_main_round 3, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d
	
	addl	%r8d, 0*4(%rdi)
	addl	%r9d, 1*4(%rdi)
	addl	%r10d, 2*4(%rdi)
	addl	%r11d, 3*4(%rdi)
	addl	%r12d, 4*4(%rdi)
	addl	%r13d, 5*4(%rdi)
	addl	%r14d, 6*4(%rdi)
	addl	%r15d, 7*4(%rdi)
	
#if defined(_WIN64) || defined(__CYGWIN__)
	movdqa	1*16(%rsp), %xmm6
	movdqa	2*16(%rsp), %xmm7
	movdqa	3*16(%rsp), %xmm8
	movdqa	4*16(%rsp), %xmm9
	addq	$5*16, %rsp
	popq	%rsi
	popq	%rdi
#else
	addq	$16, %rsp
#endif
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	ret


	.text
	.p2align 6
sha256_transform_phe:
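	/* VIA PadLock Hash Engine path: the raw-encoded 'rep xsha256' below
	 * runs the SHA-256 rounds in hardware on the buffer at rsi, updating
	 * the digest at rdi. */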
#if defined(_WIN64) || defined(__CYGWIN__)
	pushq	%rdi
	pushq	%rsi
	movq	%rcx, %rdi
	movq	%rdx, %rsi
	movq	%r8, %rdx
#endif
	movq	%rsp, %r8
	subq	$64, %rsp
	andq	$-64, %rsp
	
	testq	%rdx, %rdx
	jnz sha256_transform_phe_noswap
	
	movl	0*4(%rsi), %eax
	movl	1*4(%rsi), %ecx
	movl	2*4(%rsi), %edx
	movl	3*4(%rsi), %r9d
	bswapl	%eax
	bswapl	%ecx
	bswapl	%edx
	bswapl	%r9d
	movl	%eax, 0*4(%rsp)
	movl	%ecx, 1*4(%rsp)
	movl	%edx, 2*4(%rsp)
	movl	%r9d, 3*4(%rsp)
	movl	4*4(%rsi), %eax
	movl	5*4(%rsi), %ecx
	movl	6*4(%rsi), %edx
	movl	7*4(%rsi), %r9d
	bswapl	%eax
	bswapl	%ecx
	bswapl	%edx
	bswapl	%r9d
	movl	%eax, 4*4(%rsp)
	movl	%ecx, 5*4(%rsp)
	movl	%edx, 6*4(%rsp)
	movl	%r9d, 7*4(%rsp)
	
	movdqu	2*16(%rsi), %xmm0
	movdqu	3*16(%rsi), %xmm2
	pshuflw	$0xb1, %xmm0, %xmm0
	pshuflw	$0xb1, %xmm2, %xmm2
	pshufhw	$0xb1, %xmm0, %xmm0
	pshufhw	$0xb1, %xmm2, %xmm2
	movdqa	%xmm0, %xmm1
	movdqa	%xmm2, %xmm3
	psrlw	$8, %xmm1
	psrlw	$8, %xmm3
	psllw	$8, %xmm0
	psllw	$8, %xmm2
	pxor	%xmm1, %xmm0
	pxor	%xmm3, %xmm2
	movdqa	%xmm0, 2*16(%rsp)
	movdqa	%xmm2, 3*16(%rsp)
	
	jmp sha256_transform_phe_core
	
sha256_transform_phe_noswap:
	movdqu	0*16(%rsi), %xmm0
	movdqu	1*16(%rsi), %xmm1
	movdqu	2*16(%rsi), %xmm2
	movdqu	3*16(%rsi), %xmm3
	movdqa	%xmm0, 0*16(%rsp)
	movdqa	%xmm1, 1*16(%rsp)
	movdqa	%xmm2, 2*16(%rsp)
	movdqa	%xmm3, 3*16(%rsp)
	
sha256_transform_phe_core:
	movq	%rsp, %rsi
	movq	$-1, %rax
	movq	$1, %rcx
	/* rep xsha256 */
	.byte 0xf3, 0x0f, 0xa6, 0xd0
	
	movq	%r8, %rsp
#if defined(_WIN64) || defined(__CYGWIN__)
	popq	%rsi
	popq	%rdi
#endif
	ret
	
	
	.data
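	/* Function pointer selecting the sha256_transform implementation;
	   it defaults to the SSE2 version and is presumably repointed at
	   startup once CPU features (e.g. PadLock) are detected. */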
	.p2align 3
sha256_transform_addr:
	.quad sha256_transform_sse2
	
	.text
	.p2align 3
	.globl sha256_transform
	.globl _sha256_transform
sha256_transform:
_sha256_transform:
	jmp *sha256_transform_addr(%rip)


	.text
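	/*
	 * PadLock version of sha256d_ms: double SHA-256 of an 80-byte
	 * header with a precomputed midstate.  The first xsha256 call
	 * continues from the midstate over the final 16 header bytes,
	 * the digest is byte-swapped, and a second finalizing xsha256
	 * hashes the resulting 32 bytes.
	 */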
	.p2align 6
	.globl sha256d_ms
	.globl _sha256d_ms
sha256d_ms:
_sha256d_ms:
#if defined(_WIN64) || defined(__CYGWIN__)
	pushq	%rdi
	pushq	%rsi
	movq	%rcx, %rdi
	movq	%rdx, %rsi
	movq	%r8, %rdx
#endif
	movq	%rsp, %r8
	subq	$32, %rsp
	andq	$-32, %rsp
	
	movdqa	0*16(%rdx), %xmm0
	movdqa	1*16(%rdx), %xmm1
	movdqa	%xmm0, 0*16(%rdi)
	movdqa	%xmm1, 1*16(%rdi)
	
	movl	0*4(%rsi), %eax
	movl	1*4(%rsi), %ecx
	movl	2*4(%rsi), %edx
	movl	3*4(%rsi), %r9d
	bswapl	%eax
	bswapl	%ecx
	bswapl	%edx
	bswapl	%r9d
	movl	%eax, 0*4(%rsp)
	movl	%ecx, 1*4(%rsp)
	movl	%edx, 2*4(%rsp)
	movl	%r9d, 3*4(%rsp)
	
	movq	%rsp, %rsi
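	/* rax = bytes already hashed (64), rcx = total message length
	   (80): apparently this directs the engine to finish an 80-byte
	   message from the midstate already sitting in (%rdi).  PadLock
	   register semantics inferred from usage, not documented here. */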
	movl	$64, %eax
	movl	$80, %ecx
	/* rep xsha256 */
	.byte 0xf3, 0x0f, 0xa6, 0xd0
	
	movdqa	bswap_xmm_mask(%rip), %xmm1
	movdqa	0*16(%rdi), %xmm0
	movdqa	1*16(%rdi), %xmm2
	pshufb	%xmm1, %xmm0
	pshufb	%xmm1, %xmm2
	movdqa	%xmm0, 0*16(%rsp)
	movdqa	%xmm2, 1*16(%rsp)
	
	movdqa	sha256_h+0*16(%rip), %xmm0
	movdqa	sha256_h+1*16(%rip), %xmm1
	movdqa	%xmm0, 0*16(%rdi)
	movdqa	%xmm1, 1*16(%rdi)
	
	movq	%rsp, %rsi
	xorq	%rax, %rax
	movl	$32, %ecx
	/* rep xsha256 */
	.byte 0xf3, 0x0f, 0xa6, 0xd0
	
	movq	%r8, %rsp
#if defined(_WIN64) || defined(__CYGWIN__)
	popq	%rsi
	popq	%rdi
#endif
	ret


	.data
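	/* SHA-256 initial state H0..H7, each value replicated into all
	   four 32-bit lanes for the 4-way code. */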
	.p2align 7
sha256_4h:
	.long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
	.long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
	.long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
	.long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
	.long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
	.long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
	.long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
	.long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19

	.data
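	/* SHA-256 round constants K[0..63], 4-wide. */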
	.p2align 7
sha256_4k:
	.long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98
	.long 0x71374491, 0x71374491, 0x71374491, 0x71374491
	.long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf
	.long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5
	.long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b
	.long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1
	.long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4
	.long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5
	.long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98
	.long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01
	.long 0x243185be, 0x243185be, 0x243185be, 0x243185be
	.long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3
	.long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74
	.long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe
	.long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7
	.long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174
	.long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1
	.long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786
	.long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6
	.long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc
	.long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f
	.long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa
	.long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc
	.long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da
	.long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152
	.long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d
	.long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8
	.long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7
	.long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3
	.long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147
	.long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351
	.long 0x14292967, 0x14292967, 0x14292967, 0x14292967
	.long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85
	.long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138
	.long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc
	.long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13
	.long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354
	.long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb
	.long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e
	.long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85
	.long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1
	.long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b
	.long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70
	.long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3
	.long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819
	.long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624
	.long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585
	.long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070
	.long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116
	.long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08
	.long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c
	.long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5
	.long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3
	.long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a
	.long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f
	.long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3
	.long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee
	.long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f
	.long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814
	.long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208
	.long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa
	.long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb
	.long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7
	.long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2

	.data
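	/* Precomputed schedule terms for the fixed padding words of the
	   second sha256d block (W[8]=0x80000000, W[9..14]=0, W[15]=256):
	   _17 = sigma1(W[15]), _23 = sigma0(W[8]), _24 = W[8],
	   _30 = sigma0(W[15]). */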
	.p2align 6
sha256d_4preext2_17:
	.long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000
sha256d_4preext2_23:
	.long 0x11002000, 0x11002000, 0x11002000, 0x11002000
sha256d_4preext2_24:
	.long 0x80000000, 0x80000000, 0x80000000, 0x80000000
sha256d_4preext2_30:
	.long 0x00400022, 0x00400022, 0x00400022, 0x00400022


#ifdef USE_AVX2

	.data
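	/* 8-way (AVX2) replicas of the initial-state and round-constant
	   tables above. */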
	.p2align 7
sha256_8h:
	.long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
	.long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
	.long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
	.long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
	.long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
	.long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
	.long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
	.long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19

	.data
	.p2align 7
sha256_8k:
	.long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98
	.long 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491
	.long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf
	.long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5
	.long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b
	.long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1
	.long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4
	.long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5
	.long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98
	.long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01
	.long 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be
	.long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3
	.long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74
	.long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe
	.long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7
	.long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174
	.long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1
	.long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786
	.long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6
	.long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc
	.long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f
	.long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa
	.long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc
	.long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da
	.long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152
	.long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d
	.long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8
	.long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7
	.long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3
	.long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147
	.long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351
	.long 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967
	.long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85
	.long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138
	.long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc
	.long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13
	.long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354
	.long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb
	.long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e
	.long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85
	.long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1
	.long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b
	.long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70
	.long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3
	.long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819
	.long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624
	.long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585
	.long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070
	.long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116
	.long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08
	.long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c
	.long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5
	.long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3
	.long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a
	.long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f
	.long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3
	.long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee
	.long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f
	.long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814
	.long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208
	.long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa
	.long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb
	.long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7
	.long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2

	.data
	.p2align 6
sha256d_8preext2_17:
	.long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000
sha256d_8preext2_23:
	.long 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000
sha256d_8preext2_24:
	.long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
sha256d_8preext2_30:
	.long 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022

#endif /* USE_AVX2 */


	.text
	.p2align 6
	.globl sha256_init_4way
	.globl _sha256_init_4way
sha256_init_4way:
_sha256_init_4way:
#if defined(_WIN64) || defined(__CYGWIN__)
	pushq	%rdi
	movq	%rcx, %rdi
#endif
	movdqa	sha256_4h+0(%rip), %xmm0
	movdqa	sha256_4h+16(%rip), %xmm1
	movdqa	sha256_4h+32(%rip), %xmm2
	movdqa	sha256_4h+48(%rip), %xmm3
	movdqu	%xmm0, 0(%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqa	sha256_4h+64(%rip), %xmm0
	movdqa	sha256_4h+80(%rip), %xmm1
	movdqa	sha256_4h+96(%rip), %xmm2
	movdqa	sha256_4h+112(%rip), %xmm3
	movdqu	%xmm0, 64(%rdi)
	movdqu	%xmm1, 80(%rdi)
	movdqu	%xmm2, 96(%rdi)
	movdqu	%xmm3, 112(%rdi)
#if defined(_WIN64) || defined(__CYGWIN__)
	popq	%rdi
#endif
	ret


#ifdef USE_AVX2
	.text
	.p2align 6
	.globl sha256_init_8way
	.globl _sha256_init_8way
sha256_init_8way:
_sha256_init_8way:
#if defined(_WIN64) || defined(__CYGWIN__)
	pushq	%rdi
	movq	%rcx, %rdi
#endif
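	/* vpbroadcastd needs only one dword per constant, so the 4-way
	   table is reused: each 16-byte row of sha256_4h starts with the
	   value to broadcast. */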
	vpbroadcastd	sha256_4h+0(%rip), %ymm0
	vpbroadcastd	sha256_4h+16(%rip), %ymm1
	vpbroadcastd	sha256_4h+32(%rip), %ymm2
	vpbroadcastd	sha256_4h+48(%rip), %ymm3
	vmovdqu	%ymm0, 0*32(%rdi)
	vmovdqu	%ymm1, 1*32(%rdi)
	vmovdqu	%ymm2, 2*32(%rdi)
	vmovdqu	%ymm3, 3*32(%rdi)
	vpbroadcastd	sha256_4h+64(%rip), %ymm0
	vpbroadcastd	sha256_4h+80(%rip), %ymm1
	vpbroadcastd	sha256_4h+96(%rip), %ymm2
	vpbroadcastd	sha256_4h+112(%rip), %ymm3
	vmovdqu	%ymm0, 4*32(%rdi)
	vmovdqu	%ymm1, 5*32(%rdi)
	vmovdqu	%ymm2, 6*32(%rdi)
	vmovdqu	%ymm3, 7*32(%rdi)
#if defined(_WIN64) || defined(__CYGWIN__)
	popq	%rdi
#endif
	ret
#endif /* USE_AVX2 */
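/*
 * Message-schedule extension, four lanes at a time:
 *   W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
 *   sigma0(x) = (x ror  7) ^ (x ror 18) ^ (x >>  3)
 *   sigma1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
 * SSE2 has no vector rotate, so each rotate is synthesized from a
 * pair of shifts folded into the xor chains below.
 */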


.macro sha256_sse2_extend_round i
	movdqa	(\i-15)*16(%rax), %xmm0
	movdqa	%xmm0, %xmm2
	psrld	$3, %xmm0
	movdqa	%xmm0, %xmm1
	pslld	$14, %xmm2
	psrld	$4, %xmm1
	pxor	%xmm1, %xmm0
	pxor	%xmm2, %xmm0
	psrld	$11, %xmm1
	pslld	$11, %xmm2
	pxor	%xmm1, %xmm0
	pxor	%xmm2, %xmm0
	paddd	(\i-16)*16(%rax), %xmm0
	paddd	(\i-7)*16(%rax), %xmm0

	movdqa	%xmm3, %xmm2
	psrld	$10, %xmm3
	pslld	$13, %xmm2
	movdqa	%xmm3, %xmm1
	psrld	$7, %xmm1
	pxor	%xmm1, %xmm3
	pxor	%xmm2, %xmm3
	psrld	$2, %xmm1
	pslld	$2, %xmm2
	pxor	%xmm1, %xmm3
	pxor	%xmm2, %xmm3
	paddd	%xmm0, %xmm3
	movdqa	%xmm3, \i*16(%rax)
.endm

.macro sha256_sse2_extend_doubleround i
	movdqa	(\i-15)*16(%rax), %xmm0
	movdqa	(\i-14)*16(%rax), %xmm4
	movdqa	%xmm0, %xmm2
	movdqa	%xmm4, %xmm6
	psrld	$3, %xmm0
	psrld	$3, %xmm4
	movdqa	%xmm0, %xmm1
	movdqa	%xmm4, %xmm5
	pslld	$14, %xmm2
	pslld	$14, %xmm6
	psrld	$4, %xmm1
	psrld	$4, %xmm5
	pxor	%xmm1, %xmm0
	pxor	%xmm5, %xmm4
	psrld	$11, %xmm1
	psrld	$11, %xmm5
	pxor	%xmm2, %xmm0
	pxor	%xmm6, %xmm4
	pslld	$11, %xmm2
	pslld	$11, %xmm6
	pxor	%xmm1, %xmm0
	pxor	%xmm5, %xmm4
	pxor	%xmm2, %xmm0
	pxor	%xmm6, %xmm4

	paddd	(\i-16)*16(%rax), %xmm0
	paddd	(\i-15)*16(%rax), %xmm4

	movdqa	%xmm3, %xmm2
	movdqa	%xmm7, %xmm6
	psrld	$10, %xmm3
	psrld	$10, %xmm7
	movdqa	%xmm3, %xmm1
	movdqa	%xmm7, %xmm5
	pslld	$13, %xmm2
	pslld	$13, %xmm6
	psrld	$7, %xmm1
	psrld	$7, %xmm5

	paddd	(\i-7)*16(%rax), %xmm0
	paddd	(\i-6)*16(%rax), %xmm4

	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	psrld	$2, %xmm1
	psrld	$2, %xmm5
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	pslld	$2, %xmm2
	pslld	$2, %xmm6
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7

	paddd	%xmm0, %xmm3
	paddd	%xmm4, %xmm7
	movdqa	%xmm3, \i*16(%rax)
	movdqa	%xmm7, (\i+1)*16(%rax)
.endm
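/*
 * One 4-way compression round:
 *   T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i]
 *   T2 = Sigma0(a) + Maj(a,b,c);  new e = d + T1, new a = T1 + T2
 *   Sigma1(e) = (e ror 6) ^ (e ror 11) ^ (e ror 25)
 *   Sigma0(a) = (a ror 2) ^ (a ror 13) ^ (a ror 22)
 * Register budget: e stays in %xmm0 while f/g/h rotate through
 * 0/16/32(%rsp); a..d live in %xmm7/%xmm5/%xmm4/%xmm3 and shift
 * down one position per round.
 */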

.macro sha256_sse2_main_round i
	movdqa	16*(\i)(%rax), %xmm6

	movdqa	%xmm0, %xmm1
	movdqa	16(%rsp), %xmm2
	pandn	%xmm2, %xmm1
	paddd	32(%rsp), %xmm6

	movdqa	%xmm2, 32(%rsp)
	movdqa	0(%rsp), %xmm2
	movdqa	%xmm2, 16(%rsp)

	pand	%xmm0, %xmm2
	pxor	%xmm2, %xmm1
	movdqa	%xmm0, 0(%rsp)

	paddd	%xmm1, %xmm6

	movdqa	%xmm0, %xmm1
	psrld	$6, %xmm0
	paddd	16*(\i)(%rcx), %xmm6
	movdqa	%xmm0, %xmm2
	pslld	$7, %xmm1
	psrld	$5, %xmm2
	pxor	%xmm1, %xmm0
	pxor	%xmm2, %xmm0
	pslld	$14, %xmm1
	psrld	$14, %xmm2
	pxor	%xmm1, %xmm0
	pslld	$5, %xmm1
	pxor	%xmm2, %xmm0
	pxor	%xmm1, %xmm0
	movdqa	%xmm5, %xmm1
	paddd	%xmm0, %xmm6

	movdqa	%xmm3, %xmm0
	movdqa	%xmm4, %xmm3
	movdqa	%xmm4, %xmm2
	paddd	%xmm6, %xmm0
	pand	%xmm5, %xmm2
	pand	%xmm7, %xmm1
	pand	%xmm7, %xmm4
	pxor	%xmm4, %xmm1
	movdqa	%xmm5, %xmm4
	movdqa	%xmm7, %xmm5
	pxor	%xmm2, %xmm1
	paddd	%xmm1, %xmm6

	movdqa	%xmm7, %xmm2
	psrld	$2, %xmm7
	movdqa	%xmm7, %xmm1
	pslld	$10, %xmm2
	psrld	$11, %xmm1
	pxor	%xmm2, %xmm7
	pslld	$9, %xmm2
	pxor	%xmm1, %xmm7
	psrld	$9, %xmm1
	pxor	%xmm2, %xmm7
	pslld	$11, %xmm2
	pxor	%xmm1, %xmm7
	pxor	%xmm2, %xmm7
	paddd	%xmm6, %xmm7
.endm

.macro sha256_sse2_main_quadround i
	sha256_sse2_main_round \i+0
	sha256_sse2_main_round \i+1
	sha256_sse2_main_round \i+2
	sha256_sse2_main_round \i+3
.endm


#if defined(USE_AVX)
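/* AVX variants: the same dataflow as the SSE2 macros above, but the
   three-operand non-destructive forms eliminate most of the
   register-copy movdqa instructions. */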

.macro sha256_avx_extend_round i
	vmovdqa	(\i-15)*16(%rax), %xmm0
	vpslld	$14, %xmm0, %xmm2
	vpsrld	$3, %xmm0, %xmm0
	vpsrld	$4, %xmm0, %xmm1
	vpxor	%xmm1, %xmm0, %xmm0
	vpxor	%xmm2, %xmm0, %xmm0
	vpsrld	$11, %xmm1, %xmm1
	vpslld	$11, %xmm2, %xmm2
	vpxor	%xmm1, %xmm0, %xmm0
	vpxor	%xmm2, %xmm0, %xmm0
	vpaddd	(\i-16)*16(%rax), %xmm0, %xmm0
	vpaddd	(\i-7)*16(%rax), %xmm0, %xmm0

	vpslld	$13, %xmm3, %xmm2
	vpsrld	$10, %xmm3, %xmm3
	vpsrld	$7, %xmm3, %xmm1
	vpxor	%xmm1, %xmm3, %xmm3
	vpxor	%xmm2, %xmm3, %xmm3
	vpsrld	$2, %xmm1, %xmm1
	vpslld	$2, %xmm2, %xmm2
	vpxor	%xmm1, %xmm3, %xmm3
	vpxor	%xmm2, %xmm3, %xmm3
	vpaddd	%xmm0, %xmm3, %xmm3
	vmovdqa	%xmm3, \i*16(%rax)
.endm

.macro sha256_avx_extend_doubleround i
	vmovdqa	(\i-15)*16(%rax), %xmm0
	vmovdqa	(\i-14)*16(%rax), %xmm4
	vpslld	$14, %xmm0, %xmm2
	vpslld	$14, %xmm4, %xmm6
	vpsrld	$3, %xmm0, %xmm8
	vpsrld	$3, %xmm4, %xmm4
	vpsrld	$7, %xmm0, %xmm1
	vpsrld	$4, %xmm4, %xmm5
	vpxor	%xmm1, %xmm8, %xmm8
	vpxor	%xmm5, %xmm4, %xmm4
	vpsrld	$11, %xmm1, %xmm1
	vpsrld	$11, %xmm5, %xmm5
	vpxor	%xmm2, %xmm8, %xmm8
	vpxor	%xmm6, %xmm4, %xmm4
	vpslld	$11, %xmm2, %xmm2
	vpslld	$11, %xmm6, %xmm6
	vpxor	%xmm1, %xmm8, %xmm8
	vpxor	%xmm5, %xmm4, %xmm4
	vpxor	%xmm2, %xmm8, %xmm8
	vpxor	%xmm6, %xmm4, %xmm4

	vpaddd	%xmm0, %xmm4, %xmm4
	vpaddd	(\i-16)*16(%rax), %xmm8, %xmm0

	vpslld	$13, %xmm3, %xmm2
	vpslld	$13, %xmm7, %xmm6
	vpsrld	$10, %xmm3, %xmm3
	vpsrld	$10, %xmm7, %xmm7

	vpaddd	(\i-7)*16(%rax), %xmm0, %xmm0
	vpaddd	(\i-6)*16(%rax), %xmm4, %xmm4

	vpsrld	$7, %xmm3, %xmm1
	vpsrld	$7, %xmm7, %xmm5
	vpxor	%xmm1, %xmm3, %xmm3
	vpxor	%xmm5, %xmm7, %xmm7
	vpsrld	$2, %xmm1, %xmm1
	vpsrld	$2, %xmm5, %xmm5
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpslld	$2, %xmm2, %xmm2
	vpslld	$2, %xmm6, %xmm6
	vpxor	%xmm1, %xmm3, %xmm3
	vpxor	%xmm5, %xmm7, %xmm7
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7

	vpaddd	%xmm0, %xmm3, %xmm3
	vpaddd	%xmm4, %xmm7, %xmm7
	vmovdqa	%xmm3, \i*16(%rax)
	vmovdqa	%xmm7, (\i+1)*16(%rax)
.endm

.macro sha256_avx_main_round i, r0, r1, r2, r3, r4, r5, r6, r7
	vpaddd	16*(\i)(%rax), \r0, %xmm6
	vpaddd	16*(\i)(%rcx), %xmm6, %xmm6

	vpandn	\r1, \r3, %xmm1
	vpand	\r3, \r2, %xmm2
	vpxor	%xmm2, %xmm1, %xmm1
	vpaddd	%xmm1, %xmm6, %xmm6

	vpslld	$7, \r3, %xmm1
	vpsrld	$6, \r3, \r0
	vpsrld	$5, \r0, %xmm2
	vpxor	%xmm1, \r0, \r0
	vpxor	%xmm2, \r0, \r0
	vpslld	$14, %xmm1, %xmm1
	vpsrld	$14, %xmm2, %xmm2
	vpxor	%xmm1, \r0, \r0
	vpxor	%xmm2, \r0, \r0
	vpslld	$5, %xmm1, %xmm1
	vpxor	%xmm1, \r0, \r0
	vpaddd	\r0, %xmm6, %xmm6
	vpaddd	%xmm6, \r4, \r0

	vpand	\r6, \r5, %xmm2
	vpand	\r7, \r5, \r4
	vpand	\r7, \r6, %xmm1
	vpxor	\r4, %xmm1, %xmm1
	vpxor	%xmm2, %xmm1, %xmm1
	vpaddd	%xmm1, %xmm6, %xmm6

	vpslld	$10, \r7, %xmm2
	vpsrld	$2, \r7, \r4
	vpsrld	$11, \r4, %xmm1
	vpxor	%xmm2, \r4, \r4
	vpxor	%xmm1, \r4, \r4
	vpslld	$9, %xmm2, %xmm2
	vpsrld	$9, %xmm1, %xmm1
	vpxor	%xmm2, \r4, \r4
	vpxor	%xmm1, \r4, \r4
	vpslld	$11, %xmm2, %xmm2
	vpxor	%xmm2, \r4, \r4
	vpaddd	%xmm6, \r4, \r4
.endm

.macro sha256_avx_main_quadround i
	sha256_avx_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
	sha256_avx_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
	sha256_avx_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
	sha256_avx_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
.endm

#endif /* USE_AVX */


#if defined(USE_AVX2)
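/* AVX2 variants: the AVX round structure widened to eight lanes in
   the ymm registers; W entries are 32 bytes apart and constants come
   from sha256_8k. */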

.macro sha256_avx2_extend_round i
	vmovdqa	(\i-15)*32(%rax), %ymm0
	vpslld	$14, %ymm0, %ymm2
	vpsrld	$3, %ymm0, %ymm0
	vpsrld	$4, %ymm0, %ymm1
	vpxor	%ymm1, %ymm0, %ymm0
	vpxor	%ymm2, %ymm0, %ymm0
	vpsrld	$11, %ymm1, %ymm1
	vpslld	$11, %ymm2, %ymm2
	vpxor	%ymm1, %ymm0, %ymm0
	vpxor	%ymm2, %ymm0, %ymm0
	vpaddd	(\i-16)*32(%rax), %ymm0, %ymm0
	vpaddd	(\i-7)*32(%rax), %ymm0, %ymm0

	vpslld	$13, %ymm3, %ymm2
	vpsrld	$10, %ymm3, %ymm3
	vpsrld	$7, %ymm3, %ymm1
	vpxor	%ymm1, %ymm3, %ymm3
	vpxor	%ymm2, %ymm3, %ymm3
	vpsrld	$2, %ymm1, %ymm1
	vpslld	$2, %ymm2, %ymm2
	vpxor	%ymm1, %ymm3, %ymm3
	vpxor	%ymm2, %ymm3, %ymm3
	vpaddd	%ymm0, %ymm3, %ymm3
	vmovdqa	%ymm3, \i*32(%rax)
.endm

.macro sha256_avx2_extend_doubleround i
	vmovdqa	(\i-15)*32(%rax), %ymm0
	vmovdqa	(\i-14)*32(%rax), %ymm4
	vpslld	$14, %ymm0, %ymm2
	vpslld	$14, %ymm4, %ymm6
	vpsrld	$3, %ymm0, %ymm8
	vpsrld	$3, %ymm4, %ymm4
	vpsrld	$7, %ymm0, %ymm1
	vpsrld	$4, %ymm4, %ymm5
	vpxor	%ymm1, %ymm8, %ymm8
	vpxor	%ymm5, %ymm4, %ymm4
	vpsrld	$11, %ymm1, %ymm1
	vpsrld	$11, %ymm5, %ymm5
	vpxor	%ymm2, %ymm8, %ymm8
	vpxor	%ymm6, %ymm4, %ymm4
	vpslld	$11, %ymm2, %ymm2
	vpslld	$11, %ymm6, %ymm6
	vpxor	%ymm1, %ymm8, %ymm8
	vpxor	%ymm5, %ymm4, %ymm4
	vpxor	%ymm2, %ymm8, %ymm8
	vpxor	%ymm6, %ymm4, %ymm4

	vpaddd	%ymm0, %ymm4, %ymm4
	vpaddd	(\i-16)*32(%rax), %ymm8, %ymm0

	vpslld	$13, %ymm3, %ymm2
	vpslld	$13, %ymm7, %ymm6
	vpsrld	$10, %ymm3, %ymm3
	vpsrld	$10, %ymm7, %ymm7

	vpaddd	(\i-7)*32(%rax), %ymm0, %ymm0
	vpaddd	(\i-6)*32(%rax), %ymm4, %ymm4

	vpsrld	$7, %ymm3, %ymm1
	vpsrld	$7, %ymm7, %ymm5
	vpxor	%ymm1, %ymm3, %ymm3
	vpxor	%ymm5, %ymm7, %ymm7
	vpsrld	$2, %ymm1, %ymm1
	vpsrld	$2, %ymm5, %ymm5
	vpxor	%ymm2, %ymm3, %ymm3
	vpxor	%ymm6, %ymm7, %ymm7
	vpslld	$2, %ymm2, %ymm2
	vpslld	$2, %ymm6, %ymm6
	vpxor	%ymm1, %ymm3, %ymm3
	vpxor	%ymm5, %ymm7, %ymm7
	vpxor	%ymm2, %ymm3, %ymm3
	vpxor	%ymm6, %ymm7, %ymm7

	vpaddd	%ymm0, %ymm3, %ymm3
	vpaddd	%ymm4, %ymm7, %ymm7
	vmovdqa	%ymm3, \i*32(%rax)
	vmovdqa	%ymm7, (\i+1)*32(%rax)
.endm

.macro sha256_avx2_main_round i, r0, r1, r2, r3, r4, r5, r6, r7
	vpaddd	32*(\i)(%rax), \r0, %ymm6
	vpaddd	32*(\i)(%rcx), %ymm6, %ymm6

	vpandn	\r1, \r3, %ymm1
	vpand	\r3, \r2, %ymm2
	vpxor	%ymm2, %ymm1, %ymm1
	vpaddd	%ymm1, %ymm6, %ymm6

	vpslld	$7, \r3, %ymm1
	vpsrld	$6, \r3, \r0
	vpsrld	$5, \r0, %ymm2
	vpxor	%ymm1, \r0, \r0
	vpxor	%ymm2, \r0, \r0
	vpslld	$14, %ymm1, %ymm1
	vpsrld	$14, %ymm2, %ymm2
	vpxor	%ymm1, \r0, \r0
	vpxor	%ymm2, \r0, \r0
	vpslld	$5, %ymm1, %ymm1
	vpxor	%ymm1, \r0, \r0
	vpaddd	\r0, %ymm6, %ymm6
	vpaddd	%ymm6, \r4, \r0

	vpand	\r6, \r5, %ymm2
	vpand	\r7, \r5, \r4
	vpand	\r7, \r6, %ymm1
	vpxor	\r4, %ymm1, %ymm1
	vpxor	%ymm2, %ymm1, %ymm1
	vpaddd	%ymm1, %ymm6, %ymm6

	vpslld	$10, \r7, %ymm2
	vpsrld	$2, \r7, \r4
	vpsrld	$11, \r4, %ymm1
	vpxor	%ymm2, \r4, \r4
	vpxor	%ymm1, \r4, \r4
	vpslld	$9, %ymm2, %ymm2
	vpsrld	$9, %ymm1, %ymm1
	vpxor	%ymm2, \r4, \r4
	vpxor	%ymm1, \r4, \r4
	vpslld	$11, %ymm2, %ymm2
	vpxor	%ymm2, \r4, \r4
	vpaddd	%ymm6, \r4, \r4
.endm

.macro sha256_avx2_main_quadround i
	sha256_avx2_main_round \i+0, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7
	sha256_avx2_main_round \i+1, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3
	sha256_avx2_main_round \i+2, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4
	sha256_avx2_main_round \i+3, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5
.endm

#endif /* USE_AVX2 */


#if defined(USE_XOP)
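/* XOP variants: vprotd is a true vector rotate, so each sigma
   collapses to two rotates and a shift, e.g.
   sigma0(x) = (x rol 25) ^ (x rol 14) ^ (x >> 3),
   where rol 25 == ror 7 and rol 14 == ror 18. */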

.macro sha256_xop_extend_round i
	vmovdqa	(\i-15)*16(%rax), %xmm0
	vprotd	$25, %xmm0, %xmm1
	vprotd	$14, %xmm0, %xmm2
	vpsrld	$3, %xmm0, %xmm0
	vpxor	%xmm1, %xmm2, %xmm2
	vpxor	%xmm2, %xmm0, %xmm0

	vpaddd	(\i-16)*16(%rax), %xmm0, %xmm0
	vpaddd	(\i-7)*16(%rax), %xmm0, %xmm0

	vprotd	$15, %xmm3, %xmm1
	vprotd	$13, %xmm3, %xmm2
	vpsrld	$10, %xmm3, %xmm3
	vpxor	%xmm1, %xmm2, %xmm2
	vpxor	%xmm2, %xmm3, %xmm3
	vpaddd	%xmm0, %xmm3, %xmm3
	vmovdqa	%xmm3, \i*16(%rax)
.endm

.macro sha256_xop_extend_doubleround i
	vmovdqa	(\i-15)*16(%rax), %xmm0
	vmovdqa	(\i-14)*16(%rax), %xmm4
	vprotd	$25, %xmm0, %xmm1
	vprotd	$25, %xmm4, %xmm5
	vprotd	$14, %xmm0, %xmm2
	vprotd	$14, %xmm4, %xmm6
	vpxor	%xmm1, %xmm2, %xmm2
	vpxor	%xmm5, %xmm6, %xmm6
	vpsrld	$3, %xmm0, %xmm0
	vpsrld	$3, %xmm4, %xmm4
	vpxor	%xmm2, %xmm0, %xmm0
	vpxor	%xmm6, %xmm4, %xmm4

	vpaddd	(\i-16)*16(%rax), %xmm0, %xmm0
	vpaddd	(\i-15)*16(%rax), %xmm4, %xmm4

	vprotd	$15, %xmm3, %xmm1
	vprotd	$15, %xmm7, %xmm5
	vprotd	$13, %xmm3, %xmm2
	vprotd	$13, %xmm7, %xmm6
	vpxor	%xmm1, %xmm2, %xmm2
	vpxor	%xmm5, %xmm6, %xmm6

	vpaddd	(\i-7)*16(%rax), %xmm0, %xmm0
	vpaddd	(\i-6)*16(%rax), %xmm4, %xmm4

	vpsrld	$10, %xmm3, %xmm3
	vpsrld	$10, %xmm7, %xmm7
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7

	vpaddd	%xmm0, %xmm3, %xmm3
	vpaddd	%xmm4, %xmm7, %xmm7
	vmovdqa	%xmm3, \i*16(%rax)
	vmovdqa	%xmm7, (\i+1)*16(%rax)
.endm
	
.macro sha256_xop_main_round i, r0, r1, r2, r3, r4, r5, r6, r7
	vpaddd	16*(\i)(%rax), \r0, %xmm6
	vpaddd	16*(\i)(%rcx), %xmm6, %xmm6

	vpandn	\r1, \r3, %xmm1
	vpand	\r3, \r2, %xmm2
	vpxor	%xmm2, %xmm1, %xmm1
	vpaddd	%xmm1, %xmm6, %xmm6

	vprotd	$26, \r3, %xmm1
	vprotd	$21, \r3, %xmm2
	vpxor	%xmm1, %xmm2, %xmm2
	vprotd	$7, \r3, \r0
	vpxor	%xmm2, \r0, \r0
	vpaddd	\r0, %xmm6, %xmm6
	vpaddd	%xmm6, \r4, \r0

	vpand	\r6, \r5, %xmm2
	vpand	\r7, \r5, \r4
	vpand	\r7, \r6, %xmm1
	vpxor	\r4, %xmm1, %xmm1
	vpxor	%xmm2, %xmm1, %xmm1
	vpaddd	%xmm1, %xmm6, %xmm6

	vprotd	$30, \r7, %xmm1
	vprotd	$19, \r7, %xmm2
	vpxor	%xmm1, %xmm2, %xmm2
	vprotd	$10, \r7, \r4
	vpxor	%xmm2, \r4, \r4
	vpaddd	%xmm6, \r4, \r4
.endm

.macro sha256_xop_main_quadround i
	sha256_xop_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
	sha256_xop_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
	sha256_xop_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
	sha256_xop_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
.endm

#endif /* USE_XOP */


	.text
	.p2align 6
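/* SSE2 core: the caller has stored the 16 input words (4-wide) at
   0..255(%rsp); the loop below extends them in place to W[0..63],
   then the main loop runs all 64 rounds against sha256_4k. */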
sha256_transform_4way_core_sse2:
	leaq	256(%rsp), %rcx
	leaq	48*16(%rcx), %rax
	movdqa	-2*16(%rcx), %xmm3
	movdqa	-1*16(%rcx), %xmm7
sha256_transform_4way_sse2_extend_loop:
	movdqa	-15*16(%rcx), %xmm0
	movdqa	-14*16(%rcx), %xmm4
	movdqa	%xmm0, %xmm2
	movdqa	%xmm4, %xmm6
	psrld	$3, %xmm0
	psrld	$3, %xmm4
	movdqa	%xmm0, %xmm1
	movdqa	%xmm4, %xmm5
	pslld	$14, %xmm2
	pslld	$14, %xmm6
	psrld	$4, %xmm1
	psrld	$4, %xmm5
	pxor	%xmm1, %xmm0
	pxor	%xmm5, %xmm4
	psrld	$11, %xmm1
	psrld	$11, %xmm5
	pxor	%xmm2, %xmm0
	pxor	%xmm6, %xmm4
	pslld	$11, %xmm2
	pslld	$11, %xmm6
	pxor	%xmm1, %xmm0
	pxor	%xmm5, %xmm4
	pxor	%xmm2, %xmm0
	pxor	%xmm6, %xmm4

	paddd	-16*16(%rcx), %xmm0
	paddd	-15*16(%rcx), %xmm4

	movdqa	%xmm3, %xmm2
	movdqa	%xmm7, %xmm6
	psrld	$10, %xmm3
	psrld	$10, %xmm7
	movdqa	%xmm3, %xmm1
	movdqa	%xmm7, %xmm5
	pslld	$13, %xmm2
	pslld	$13, %xmm6
	psrld	$7, %xmm1
	psrld	$7, %xmm5

	paddd	-7*16(%rcx), %xmm0
	paddd	-6*16(%rcx), %xmm4

	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	psrld	$2, %xmm1
	psrld	$2, %xmm5
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	pslld	$2, %xmm2
	pslld	$2, %xmm6
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7

	paddd	%xmm0, %xmm3
	paddd	%xmm4, %xmm7
	movdqa	%xmm3, (%rcx)
	movdqa	%xmm7, 16(%rcx)
	addq	$2*16, %rcx
	cmpq	%rcx, %rax
	jne sha256_transform_4way_sse2_extend_loop
	
	movdqu	0(%rdi), %xmm7
	movdqu	16(%rdi), %xmm5
	movdqu	32(%rdi), %xmm4
	movdqu	48(%rdi), %xmm3
	movdqu	64(%rdi), %xmm0
	movdqu	80(%rdi), %xmm8
	movdqu	96(%rdi), %xmm9
	movdqu	112(%rdi), %xmm10
	
	leaq	sha256_4k(%rip), %rcx
	xorq	%rax, %rax
sha256_transform_4way_sse2_main_loop:
	movdqa	(%rsp, %rax), %xmm6
	paddd	(%rcx, %rax), %xmm6
	paddd	%xmm10, %xmm6

	movdqa	%xmm0, %xmm1
	movdqa	%xmm9, %xmm2
	pandn	%xmm2, %xmm1

	movdqa	%xmm2, %xmm10
	movdqa	%xmm8, %xmm2
	movdqa	%xmm2, %xmm9

	pand	%xmm0, %xmm2
	pxor	%xmm2, %xmm1
	movdqa	%xmm0, %xmm8

	paddd	%xmm1, %xmm6

	movdqa	%xmm0, %xmm1
	psrld	$6, %xmm0
	movdqa	%xmm0, %xmm2
	pslld	$7, %xmm1
	psrld	$5, %xmm2
	pxor	%xmm1, %xmm0
	pxor	%xmm2, %xmm0
	pslld	$14, %xmm1
	psrld	$14, %xmm2
	pxor	%xmm1, %xmm0
	pxor	%xmm2, %xmm0
	pslld	$5, %xmm1
	pxor	%xmm1, %xmm0
	paddd	%xmm0, %xmm6

	movdqa	%xmm3, %xmm0
	paddd	%xmm6, %xmm0

	movdqa	%xmm5, %xmm1
	movdqa	%xmm4, %xmm3
	movdqa	%xmm4, %xmm2
	pand	%xmm5, %xmm2
	pand	%xmm7, %xmm4
	pand	%xmm7, %xmm1
	pxor	%xmm4, %xmm1
	movdqa	%xmm5, %xmm4
	movdqa	%xmm7, %xmm5
	pxor	%xmm2, %xmm1
	paddd	%xmm1, %xmm6

	movdqa	%xmm7, %xmm2
	psrld	$2, %xmm7
	movdqa	%xmm7, %xmm1
	pslld	$10, %xmm2
	psrld	$11, %xmm1
	pxor	%xmm2, %xmm7
	pxor	%xmm1, %xmm7
	pslld	$9, %xmm2
	psrld	$9, %xmm1
	pxor	%xmm2, %xmm7
	pxor	%xmm1, %xmm7
	pslld	$11, %xmm2
	pxor	%xmm2, %xmm7
	paddd	%xmm6, %xmm7
	
	addq	$16, %rax
	cmpq	$16*64, %rax
	jne sha256_transform_4way_sse2_main_loop
	jmp sha256_transform_4way_finish


#if defined(USE_AVX)
	.text
	.p2align 6
sha256_transform_4way_core_avx:
	leaq	256(%rsp), %rax
	movdqa	-2*16(%rax), %xmm3
	movdqa	-1*16(%rax), %xmm7
	sha256_avx_extend_doubleround 0
	sha256_avx_extend_doubleround 2
	sha256_avx_extend_doubleround 4
	sha256_avx_extend_doubleround 6
	sha256_avx_extend_doubleround 8
	sha256_avx_extend_doubleround 10
	sha256_avx_extend_doubleround 12
	sha256_avx_extend_doubleround 14
	sha256_avx_extend_doubleround 16
	sha256_avx_extend_doubleround 18
	sha256_avx_extend_doubleround 20
	sha256_avx_extend_doubleround 22
	sha256_avx_extend_doubleround 24
	sha256_avx_extend_doubleround 26
	sha256_avx_extend_doubleround 28
	sha256_avx_extend_doubleround 30
	sha256_avx_extend_doubleround 32
	sha256_avx_extend_doubleround 34
	sha256_avx_extend_doubleround 36
	sha256_avx_extend_doubleround 38
	sha256_avx_extend_doubleround 40
	sha256_avx_extend_doubleround 42
	sha256_avx_extend_doubleround 44
	sha256_avx_extend_doubleround 46
	movdqu	0(%rdi), %xmm7
	movdqu	16(%rdi), %xmm5
	movdqu	32(%rdi), %xmm4
	movdqu	48(%rdi), %xmm3
	movdqu	64(%rdi), %xmm0
	movdqu	80(%rdi), %xmm8
	movdqu	96(%rdi), %xmm9
	movdqu	112(%rdi), %xmm10
	movq	%rsp, %rax
	leaq	sha256_4k(%rip), %rcx
	sha256_avx_main_quadround 0
	sha256_avx_main_quadround 4
	sha256_avx_main_quadround 8
	sha256_avx_main_quadround 12
	sha256_avx_main_quadround 16
	sha256_avx_main_quadround 20
	sha256_avx_main_quadround 24
	sha256_avx_main_quadround 28
	sha256_avx_main_quadround 32
	sha256_avx_main_quadround 36
	sha256_avx_main_quadround 40
	sha256_avx_main_quadround 44
	sha256_avx_main_quadround 48
	sha256_avx_main_quadround 52
	sha256_avx_main_quadround 56
	sha256_avx_main_quadround 60
	jmp sha256_transform_4way_finish
#endif /* USE_AVX */


#if defined(USE_XOP)
	.text
	.p2align 6
sha256_transform_4way_core_xop:
	leaq	256(%rsp), %rax
	movdqa	-2*16(%rax), %xmm3
	movdqa	-1*16(%rax), %xmm7
	sha256_xop_extend_doubleround 0
	sha256_xop_extend_doubleround 2
	sha256_xop_extend_doubleround 4
	sha256_xop_extend_doubleround 6
	sha256_xop_extend_doubleround 8
	sha256_xop_extend_doubleround 10
	sha256_xop_extend_doubleround 12
	sha256_xop_extend_doubleround 14
	sha256_xop_extend_doubleround 16
	sha256_xop_extend_doubleround 18
	sha256_xop_extend_doubleround 20
	sha256_xop_extend_doubleround 22
	sha256_xop_extend_doubleround 24
	sha256_xop_extend_doubleround 26
	sha256_xop_extend_doubleround 28
	sha256_xop_extend_doubleround 30
	sha256_xop_extend_doubleround 32
	sha256_xop_extend_doubleround 34
	sha256_xop_extend_doubleround 36
	sha256_xop_extend_doubleround 38
	sha256_xop_extend_doubleround 40
	sha256_xop_extend_doubleround 42
	sha256_xop_extend_doubleround 44
	sha256_xop_extend_doubleround 46
	movdqu	0(%rdi), %xmm7
	movdqu	16(%rdi), %xmm5
	movdqu	32(%rdi), %xmm4
	movdqu	48(%rdi), %xmm3
	movdqu	64(%rdi), %xmm0
	movdqu	80(%rdi), %xmm8
	movdqu	96(%rdi), %xmm9
	movdqu	112(%rdi), %xmm10
	movq	%rsp, %rax
	leaq	sha256_4k(%rip), %rcx
	sha256_xop_main_quadround 0
	sha256_xop_main_quadround 4
	sha256_xop_main_quadround 8
	sha256_xop_main_quadround 12
	sha256_xop_main_quadround 16
	sha256_xop_main_quadround 20
	sha256_xop_main_quadround 24
	sha256_xop_main_quadround 28
	sha256_xop_main_quadround 32
	sha256_xop_main_quadround 36
	sha256_xop_main_quadround 40
	sha256_xop_main_quadround 44
	sha256_xop_main_quadround 48
	sha256_xop_main_quadround 52
	sha256_xop_main_quadround 56
	sha256_xop_main_quadround 60
	jmp sha256_transform_4way_finish
#endif /* USE_XOP */


	.data
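	/* Core selector for sha256_transform_4way, filled in at startup
	   with the sse2/avx/xop core best suited to the CPU. */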
	.p2align 3
sha256_transform_4way_core_addr:
	.quad 0x0

.macro p2bswap_rsi_rsp i
	movdqu	\i*16(%rsi), %xmm0
	movdqu	(\i+1)*16(%rsi), %xmm2
	pshuflw	$0xb1, %xmm0, %xmm0
	pshuflw	$0xb1, %xmm2, %xmm2
	pshufhw	$0xb1, %xmm0, %xmm0
	pshufhw	$0xb1, %xmm2, %xmm2
	movdqa	%xmm0, %xmm1
	movdqa	%xmm2, %xmm3
	psrlw	$8, %xmm1
	psrlw	$8, %xmm3
	psllw	$8, %xmm0
	psllw	$8, %xmm2
	pxor	%xmm1, %xmm0
	pxor	%xmm3, %xmm2
	movdqa	%xmm0, \i*16(%rsp)
	movdqa	%xmm2, (\i+1)*16(%rsp)
.endm
	
	.text
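	/*
	 * sha256_transform_4way(state, data, swap): four interleaved
	 * SHA-256 transforms.  On Win64 the non-volatile xmm6-xmm11 are
	 * preserved and the System V argument registers reconstructed.
	 */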
	.p2align 6
	.globl sha256_transform_4way
	.globl _sha256_transform_4way
sha256_transform_4way:
_sha256_transform_4way:
#if defined(_WIN64) || defined(__CYGWIN__)
	pushq	%rdi
	subq	$96, %rsp
	movdqa	%xmm6, 0(%rsp)
	movdqa	%xmm7, 16(%rsp)
	movdqa	%xmm8, 32(%rsp)
	movdqa	%xmm9, 48(%rsp)
	movdqa	%xmm10, 64(%rsp)
	movdqa	%xmm11, 80(%rsp)
	pushq	%rsi
	movq	%rcx, %rdi
	movq	%rdx, %rsi
	movq	%r8, %rdx
#endif
	movq	%rsp, %r8
	subq	$1032, %rsp
	andq	$-128, %rsp
	
	testq	%rdx, %rdx
	jnz sha256_transform_4way_swap
	
	movdqu	0*16(%rsi), %xmm0
	movdqu	1*16(%rsi), %xmm1
	movdqu	2*16(%rsi), %xmm2
	movdqu	3*16(%rsi), %xmm3
	movdqu	4*16(%rsi), %xmm4
	movdqu	5*16(%rsi), %xmm5
	movdqu	6*16(%rsi), %xmm6
	movdqu	7*16(%rsi), %xmm7
	movdqa	%xmm0, 0*16(%rsp)
	movdqa	%xmm1, 1*16(%rsp)
	movdqa	%xmm2, 2*16(%rsp)
	movdqa	%xmm3, 3*16(%rsp)
	movdqa	%xmm4, 4*16(%rsp)
	movdqa	%xmm5, 5*16(%rsp)
	movdqa	%xmm6, 6*16(%rsp)
	movdqa	%xmm7, 7*16(%rsp)
	movdqu	8*16(%rsi), %xmm0
	movdqu	9*16(%rsi), %xmm1
	movdqu	10*16(%rsi), %xmm2
	movdqu	11*16(%rsi), %xmm3
	movdqu	12*16(%rsi), %xmm4
	movdqu	13*16(%rsi), %xmm5
	movdqu	14*16(%rsi), %xmm6
	movdqu	15*16(%rsi), %xmm7
	movdqa	%xmm0, 8*16(%rsp)
	movdqa	%xmm1, 9*16(%rsp)
	movdqa	%xmm2, 10*16(%rsp)
	movdqa	%xmm3, 11*16(%rsp)
	movdqa	%xmm4, 12*16(%rsp)
	movdqa	%xmm5, 13*16(%rsp)
	movdqa	%xmm6, 14*16(%rsp)
	movdqa	%xmm7, 15*16(%rsp)
	jmp *sha256_transform_4way_core_addr(%rip)
	
	.p2align 6
sha256_transform_4way_swap:
	p2bswap_rsi_rsp 0
	p2bswap_rsi_rsp 2
	p2bswap_rsi_rsp 4
	p2bswap_rsi_rsp 6
	p2bswap_rsi_rsp 8
	p2bswap_rsi_rsp 10
	p2bswap_rsi_rsp 12
	p2bswap_rsi_rsp 14
	jmp *sha256_transform_4way_core_addr(%rip)
	
	.p2align 6
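	/* Common tail for all 4-way cores: fold the working variables
	   back into the caller's state. */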
sha256_transform_4way_finish:
	movdqu	0(%rdi), %xmm2
	movdqu	16(%rdi), %xmm6
	movdqu	32(%rdi), %xmm11
	movdqu	48(%rdi), %xmm1
	paddd	%xmm2, %xmm7
	paddd	%xmm6, %xmm5
	paddd	%xmm11, %xmm4
	paddd	%xmm1, %xmm3
	movdqu	64(%rdi), %xmm2
	movdqu	80(%rdi), %xmm6
	movdqu	96(%rdi), %xmm11
	movdqu	112(%rdi), %xmm1
	paddd	%xmm2, %xmm0
	paddd	%xmm6, %xmm8
	paddd	%xmm11, %xmm9
	paddd	%xmm1, %xmm10
	
	movdqu	%xmm7, 0(%rdi)
	movdqu	%xmm5, 16(%rdi)
	movdqu	%xmm4, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm0, 64(%rdi)
	movdqu	%xmm8, 80(%rdi)
	movdqu	%xmm9, 96(%rdi)
	movdqu	%xmm10, 112(%rdi)
	
	movq	%r8, %rsp
#if defined(_WIN64) || defined(__CYGWIN__)
	popq	%rsi
	movdqa	0(%rsp), %xmm6
	movdqa	16(%rsp), %xmm7
	movdqa	32(%rsp), %xmm8
	movdqa	48(%rsp), %xmm9
	movdqa	64(%rsp), %xmm10
	movdqa	80(%rsp), %xmm11
	addq	$96, %rsp
	popq	%rdi
#endif
	ret


#ifdef USE_AVX2

	.text
	.p2align 6
sha256_transform_8way_core_avx2:
	leaq	8*64(%rsp), %rax
	vmovdqa	-2*32(%rax), %ymm3
	vmovdqa	-1*32(%rax), %ymm7
	sha256_avx2_extend_doubleround 0
	sha256_avx2_extend_doubleround 2
	sha256_avx2_extend_doubleround 4
	sha256_avx2_extend_doubleround 6
	sha256_avx2_extend_doubleround 8
	sha256_avx2_extend_doubleround 10
	sha256_avx2_extend_doubleround 12
	sha256_avx2_extend_doubleround 14
	sha256_avx2_extend_doubleround 16
	sha256_avx2_extend_doubleround 18
	sha256_avx2_extend_doubleround 20
	sha256_avx2_extend_doubleround 22
	sha256_avx2_extend_doubleround 24
	sha256_avx2_extend_doubleround 26
	sha256_avx2_extend_doubleround 28
	sha256_avx2_extend_doubleround 30
	sha256_avx2_extend_doubleround 32
	sha256_avx2_extend_doubleround 34
	sha256_avx2_extend_doubleround 36
	sha256_avx2_extend_doubleround 38
	sha256_avx2_extend_doubleround 40
	sha256_avx2_extend_doubleround 42
	sha256_avx2_extend_doubleround 44
	sha256_avx2_extend_doubleround 46
	vmovdqu	0*32(%rdi), %ymm7
	vmovdqu	1*32(%rdi), %ymm5
	vmovdqu	2*32(%rdi), %ymm4
	vmovdqu	3*32(%rdi), %ymm3
	vmovdqu	4*32(%rdi), %ymm0
	vmovdqu	5*32(%rdi), %ymm8
	vmovdqu	6*32(%rdi), %ymm9
	vmovdqu	7*32(%rdi), %ymm10
	movq	%rsp, %rax
	leaq	sha256_8k(%rip), %rcx
	sha256_avx2_main_quadround 0
	sha256_avx2_main_quadround 4
	sha256_avx2_main_quadround 8
	sha256_avx2_main_quadround 12
	sha256_avx2_main_quadround 16
	sha256_avx2_main_quadround 20
	sha256_avx2_main_quadround 24
	sha256_avx2_main_quadround 28
	sha256_avx2_main_quadround 32
	sha256_avx2_main_quadround 36
	sha256_avx2_main_quadround 40
	sha256_avx2_main_quadround 44
	sha256_avx2_main_quadround 48
	sha256_avx2_main_quadround 52
	sha256_avx2_main_quadround 56
	sha256_avx2_main_quadround 60
	jmp sha256_transform_8way_finish

.macro p2bswap_avx2_rsi_rsp i
	vmovdqu	\i*32(%rsi), %ymm0
	vmovdqu	(\i+1)*32(%rsi), %ymm2
	vpshuflw	$0xb1, %ymm0, %ymm0
	vpshuflw	$0xb1, %ymm2, %ymm2
	vpshufhw	$0xb1, %ymm0, %ymm0
	vpshufhw	$0xb1, %ymm2, %ymm2
	vpsrlw	$8, %ymm0, %ymm1
	vpsrlw	$8, %ymm2, %ymm3
	vpsllw	$8, %ymm0, %ymm0
	vpsllw	$8, %ymm2, %ymm2
	vpxor	%ymm1, %ymm0, %ymm0
	vpxor	%ymm3, %ymm2, %ymm2
	vmovdqa	%ymm0, \i*32(%rsp)
	vmovdqa	%ymm2, (\i+1)*32(%rsp)
.endm
	
	.text
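	/* sha256_transform_8way: AVX2 analogue of sha256_transform_4way,
	   processing eight interleaved transforms. */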
	.p2align 6
	.globl sha256_transform_8way
	.globl _sha256_transform_8way
sha256_transform_8way:
_sha256_transform_8way:
#if defined(_WIN64) || defined(__CYGWIN__)
	pushq	%rdi
	subq	$96, %rsp
	vmovdqa	%xmm6, 0(%rsp)
	vmovdqa	%xmm7, 16(%rsp)
	vmovdqa	%xmm8, 32(%rsp)
	vmovdqa	%xmm9, 48(%rsp)
	vmovdqa	%xmm10, 64(%rsp)
	vmovdqa	%xmm11, 80(%rsp)
	pushq	%rsi
	movq	%rcx, %rdi
	movq	%rdx, %rsi
	movq	%r8, %rdx
#endif
	movq	%rsp, %r8
	subq	$64*32, %rsp
	andq	$-128, %rsp
	
	testq	%rdx, %rdx
	jnz sha256_transform_8way_swap
	
	vmovdqu	0*32(%rsi), %ymm0
	vmovdqu	1*32(%rsi), %ymm1
	vmovdqu	2*32(%rsi), %ymm2
	vmovdqu	3*32(%rsi), %ymm3
	vmovdqu	4*32(%rsi), %ymm4
	vmovdqu	5*32(%rsi), %ymm5
	vmovdqu	6*32(%rsi), %ymm6
	vmovdqu	7*32(%rsi), %ymm7
	vmovdqa	%ymm0, 0*32(%rsp)
	vmovdqa	%ymm1, 1*32(%rsp)
	vmovdqa	%ymm2, 2*32(%rsp)
	vmovdqa	%ymm3, 3*32(%rsp)
	vmovdqa	%ymm4, 4*32(%rsp)
	vmovdqa	%ymm5, 5*32(%rsp)
	vmovdqa	%ymm6, 6*32(%rsp)
	vmovdqa	%ymm7, 7*32(%rsp)
	vmovdqu	8*32(%rsi), %ymm0
	vmovdqu	9*32(%rsi), %ymm1
	vmovdqu	10*32(%rsi), %ymm2
	vmovdqu	11*32(%rsi), %ymm3
	vmovdqu	12*32(%rsi), %ymm4
	vmovdqu	13*32(%rsi), %ymm5
	vmovdqu	14*32(%rsi), %ymm6
	vmovdqu	15*32(%rsi), %ymm7
	vmovdqa	%ymm0, 8*32(%rsp)
	vmovdqa	%ymm1, 9*32(%rsp)
	vmovdqa	%ymm2, 10*32(%rsp)
	vmovdqa	%ymm3, 11*32(%rsp)
	vmovdqa	%ymm4, 12*32(%rsp)
	vmovdqa	%ymm5, 13*32(%rsp)
	vmovdqa	%ymm6, 14*32(%rsp)
	vmovdqa	%ymm7, 15*32(%rsp)
	jmp sha256_transform_8way_core_avx2
	
	.p2align 6
sha256_transform_8way_swap:
	p2bswap_avx2_rsi_rsp 0
	p2bswap_avx2_rsi_rsp 2
	p2bswap_avx2_rsi_rsp 4
	p2bswap_avx2_rsi_rsp 6
	p2bswap_avx2_rsi_rsp 8
	p2bswap_avx2_rsi_rsp 10
	p2bswap_avx2_rsi_rsp 12
	p2bswap_avx2_rsi_rsp 14
	jmp sha256_transform_8way_core_avx2
	
	.p2align 6
sha256_transform_8way_finish:
	vmovdqu	0*32(%rdi), %ymm2
	vmovdqu	1*32(%rdi), %ymm6
	vmovdqu	2*32(%rdi), %ymm11
	vmovdqu	3*32(%rdi), %ymm1
	vpaddd	%ymm2, %ymm7, %ymm7
	vpaddd	%ymm6, %ymm5, %ymm5
	vpaddd	%ymm11, %ymm4, %ymm4
	vpaddd	%ymm1, %ymm3, %ymm3
	vmovdqu	4*32(%rdi), %ymm2
	vmovdqu	5*32(%rdi), %ymm6
	vmovdqu	6*32(%rdi), %ymm11
	vmovdqu	7*32(%rdi), %ymm1
	vpaddd	%ymm2, %ymm0, %ymm0
	vpaddd	%ymm6, %ymm8, %ymm8
	vpaddd	%ymm11, %ymm9, %ymm9
	vpaddd	%ymm1, %ymm10, %ymm10
	
	vmovdqu	%ymm7, 0*32(%rdi)
	vmovdqu	%ymm5, 1*32(%rdi)
	vmovdqu	%ymm4, 2*32(%rdi)
	vmovdqu	%ymm3, 3*32(%rdi)
	vmovdqu	%ymm0, 4*32(%rdi)
	vmovdqu	%ymm8, 5*32(%rdi)
	vmovdqu	%ymm9, 6*32(%rdi)
	vmovdqu	%ymm10, 7*32(%rdi)
	
	movq	%r8, %rsp
#if defined(_WIN64) || defined(__CYGWIN__)
	popq	%rsi
	vmovdqa	0(%rsp), %xmm6
	vmovdqa	16(%rsp), %xmm7
	vmovdqa	32(%rsp), %xmm8
	vmovdqa	48(%rsp), %xmm9
	vmovdqa	64(%rsp), %xmm10
	vmovdqa	80(%rsp), %xmm11
	addq	$96, %rsp
	popq	%rdi
#endif
	ret

#endif /* USE_AVX2 */
	
	
	.data
	.p2align 3
sha256d_ms_4way_addr:
	.quad 0x0
	
	.text
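	/*
	 * sha256d_ms_4way(hash, data, midstate, prehash): 4-way double
	 * SHA-256 specialized for mining, where only data[3] (the nonce
	 * word) changes between calls.  Schedule entries and rounds that
	 * do not depend on the nonce are reused from values cached in
	 * data[] and from the precomputed midstate/prehash; the public
	 * entry tail-jumps through a pointer set at startup.
	 */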
	.p2align 6
	.globl sha256d_ms_4way
	.globl _sha256d_ms_4way
sha256d_ms_4way:
_sha256d_ms_4way:
	jmp *sha256d_ms_4way_addr(%rip)
	
	
	.p2align 6
sha256d_ms_4way_sse2:
#if defined(_WIN64) || defined(__CYGWIN__)
	pushq	%rdi
	subq	$32, %rsp
	movdqa	%xmm6, 0(%rsp)
	movdqa	%xmm7, 16(%rsp)
	pushq	%rsi
	movq	%rcx, %rdi
	movq	%rdx, %rsi
	movq	%r8, %rdx
	movq	%r9, %rcx
#endif
	subq	$8+67*16, %rsp
	
	leaq	256(%rsi), %rax
	
sha256d_ms_4way_sse2_extend_loop1:
	movdqa	3*16(%rsi), %xmm0
	movdqa	2*16(%rax), %xmm3
	movdqa	3*16(%rax), %xmm7
	movdqa	%xmm3, 5*16(%rsp)
	movdqa	%xmm7, 6*16(%rsp)
	movdqa	%xmm0, %xmm2
	paddd	%xmm0, %xmm7
	psrld	$3, %xmm0
	movdqa	%xmm0, %xmm1
	pslld	$14, %xmm2
	psrld	$4, %xmm1
	pxor	%xmm1, %xmm0
	pxor	%xmm2, %xmm0
	psrld	$11, %xmm1
	pslld	$11, %xmm2
	pxor	%xmm1, %xmm0
	pxor	%xmm2, %xmm0
	paddd	%xmm0, %xmm3
	movdqa	%xmm3, 2*16(%rax)
	movdqa	%xmm7, 3*16(%rax)
	
	movdqa	4*16(%rax), %xmm0
	movdqa	%xmm0, 7*16(%rsp)
	movdqa	%xmm3, %xmm2
	movdqa	%xmm7, %xmm6
	psrld	$10, %xmm3
	psrld	$10, %xmm7
	movdqa	%xmm3, %xmm1
	movdqa	%xmm7, %xmm5
	pslld	$13, %xmm2
	pslld	$13, %xmm6
	psrld	$7, %xmm1
	psrld	$7, %xmm5
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	psrld	$2, %xmm1
	psrld	$2, %xmm5
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	pslld	$2, %xmm2
	pslld	$2, %xmm6
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	paddd	%xmm0, %xmm3
	movdqa	%xmm3, 4*16(%rax)
	movdqa	%xmm7, 5*16(%rax)
	
	movdqa	6*16(%rax), %xmm0
	movdqa	7*16(%rax), %xmm4
	movdqa	%xmm0, 9*16(%rsp)
	movdqa	%xmm4, 10*16(%rsp)
	movdqa	%xmm3, %xmm2
	movdqa	%xmm7, %xmm6
	psrld	$10, %xmm3
	psrld	$10, %xmm7
	movdqa	%xmm3, %xmm1
	movdqa	%xmm7, %xmm5
	pslld	$13, %xmm2
	pslld	$13, %xmm6
	psrld	$7, %xmm1
	psrld	$7, %xmm5
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	psrld	$2, %xmm1
	psrld	$2, %xmm5
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	pslld	$2, %xmm2
	pslld	$2, %xmm6
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	paddd	%xmm0, %xmm3
	paddd	%xmm4, %xmm7
	movdqa	%xmm3, 6*16(%rax)
	movdqa	%xmm7, 7*16(%rax)
	
	movdqa	8*16(%rax), %xmm0
	movdqa	2*16(%rax), %xmm4
	movdqa	%xmm0, 11*16(%rsp)
	movdqa	%xmm3, %xmm2
	movdqa	%xmm7, %xmm6
	psrld	$10, %xmm3
	psrld	$10, %xmm7
	movdqa	%xmm3, %xmm1
	movdqa	%xmm7, %xmm5
	pslld	$13, %xmm2
	pslld	$13, %xmm6
	psrld	$7, %xmm1
	psrld	$7, %xmm5
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	psrld	$2, %xmm1
	psrld	$2, %xmm5
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	pslld	$2, %xmm2
	pslld	$2, %xmm6
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	paddd	%xmm0, %xmm3
	paddd	%xmm4, %xmm7
	movdqa	%xmm3, 8*16(%rax)
	movdqa	%xmm7, 9*16(%rax)
	
	movdqa	%xmm3, %xmm2
	movdqa	%xmm7, %xmm6
	psrld	$10, %xmm3
	psrld	$10, %xmm7
	movdqa	%xmm3, %xmm1
	movdqa	%xmm7, %xmm5
	pslld	$13, %xmm2
	pslld	$13, %xmm6
	psrld	$7, %xmm1
	psrld	$7, %xmm5
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	psrld	$2, %xmm1
	psrld	$2, %xmm5
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	pslld	$2, %xmm2
	pslld	$2, %xmm6
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	paddd	3*16(%rax), %xmm3
	paddd	4*16(%rax), %xmm7
	movdqa	%xmm3, 10*16(%rax)
	movdqa	%xmm7, 11*16(%rax)
	
	movdqa	%xmm3, %xmm2
	movdqa	%xmm7, %xmm6
	psrld	$10, %xmm3
	psrld	$10, %xmm7
	movdqa	%xmm3, %xmm1
	movdqa	%xmm7, %xmm5
	pslld	$13, %xmm2
	pslld	$13, %xmm6
	psrld	$7, %xmm1
	psrld	$7, %xmm5
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	psrld	$2, %xmm1
	psrld	$2, %xmm5
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	pslld	$2, %xmm2
	pslld	$2, %xmm6
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	paddd	5*16(%rax), %xmm3
	paddd	6*16(%rax), %xmm7
	movdqa	%xmm3, 12*16(%rax)
	movdqa	%xmm7, 13*16(%rax)
	
	movdqa	14*16(%rax), %xmm0
	movdqa	15*16(%rax), %xmm4
	movdqa	%xmm0, 17*16(%rsp)
	movdqa	%xmm4, 18*16(%rsp)
	movdqa	%xmm3, %xmm2
	movdqa	%xmm7, %xmm6
	psrld	$10, %xmm3
	psrld	$10, %xmm7
	movdqa	%xmm3, %xmm1
	movdqa	%xmm7, %xmm5
	paddd	7*16(%rax), %xmm0
	paddd	8*16(%rax), %xmm4
	pslld	$13, %xmm2
	pslld	$13, %xmm6
	psrld	$7, %xmm1
	psrld	$7, %xmm5
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	psrld	$2, %xmm1
	psrld	$2, %xmm5
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	pslld	$2, %xmm2
	pslld	$2, %xmm6
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	paddd	%xmm0, %xmm3
	paddd	%xmm4, %xmm7
	movdqa	%xmm3, 14*16(%rax)
	movdqa	%xmm7, 15*16(%rax)
	
sha256d_ms_4way_sse2_extend_loop2:
	sha256_sse2_extend_doubleround 16
	sha256_sse2_extend_doubleround 18
	sha256_sse2_extend_doubleround 20
	sha256_sse2_extend_doubleround 22
	sha256_sse2_extend_doubleround 24
	sha256_sse2_extend_doubleround 26
	sha256_sse2_extend_doubleround 28
	sha256_sse2_extend_doubleround 30
	sha256_sse2_extend_doubleround 32
	sha256_sse2_extend_doubleround 34
	sha256_sse2_extend_doubleround 36
	sha256_sse2_extend_doubleround 38
	sha256_sse2_extend_doubleround 40
	sha256_sse2_extend_doubleround 42
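	/* Taken only for the second hash: ZF was set by the
	   "cmpq %rax, %rax" executed before re-entering this loop, and
	   the SSE instructions in between leave EFLAGS untouched. */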
	jz sha256d_ms_4way_sse2_extend_coda2
	sha256_sse2_extend_doubleround 44
	sha256_sse2_extend_doubleround 46
	
	movdqa	0(%rcx), %xmm3
	movdqa	16(%rcx), %xmm0
	movdqa	32(%rcx), %xmm1
	movdqa	48(%rcx), %xmm2
	movdqa	64(%rcx), %xmm6
	movdqa	80(%rcx), %xmm7
	movdqa	96(%rcx), %xmm5
	movdqa	112(%rcx), %xmm4
	movdqa	%xmm1, 0(%rsp)
	movdqa	%xmm2, 16(%rsp)
	movdqa	%xmm6, 32(%rsp)
	
	movq	%rsi, %rax
	leaq	sha256_4k(%rip), %rcx
	jmp sha256d_ms_4way_sse2_main_loop1
	
sha256d_ms_4way_sse2_main_loop2:
	sha256_sse2_main_round 0
	sha256_sse2_main_round 1
	sha256_sse2_main_round 2
sha256d_ms_4way_sse2_main_loop1:
	sha256_sse2_main_round 3
	sha256_sse2_main_quadround 4
	sha256_sse2_main_quadround 8
	sha256_sse2_main_quadround 12
	sha256_sse2_main_quadround 16
	sha256_sse2_main_quadround 20
	sha256_sse2_main_quadround 24
	sha256_sse2_main_quadround 28
	sha256_sse2_main_quadround 32
	sha256_sse2_main_quadround 36
	sha256_sse2_main_quadround 40
	sha256_sse2_main_quadround 44
	sha256_sse2_main_quadround 48
	sha256_sse2_main_quadround 52
	sha256_sse2_main_round 56
	jz sha256d_ms_4way_sse2_finish
	sha256_sse2_main_round 57
	sha256_sse2_main_round 58
	sha256_sse2_main_round 59
	sha256_sse2_main_quadround 60
	
	movdqa	5*16(%rsp), %xmm1
	movdqa	6*16(%rsp), %xmm2
	movdqa	7*16(%rsp), %xmm6
	movdqa	%xmm1, 18*16(%rsi)
	movdqa	%xmm2, 19*16(%rsi)
	movdqa	%xmm6, 20*16(%rsi)
	movdqa	9*16(%rsp), %xmm1
	movdqa	10*16(%rsp), %xmm2
	movdqa	11*16(%rsp), %xmm6
	movdqa	%xmm1, 22*16(%rsi)
	movdqa	%xmm2, 23*16(%rsi)
	movdqa	%xmm6, 24*16(%rsi)
	movdqa	17*16(%rsp), %xmm1
	movdqa	18*16(%rsp), %xmm2
	movdqa	%xmm1, 30*16(%rsi)
	movdqa	%xmm2, 31*16(%rsi)
	
	movdqa	0(%rsp), %xmm1
	movdqa	16(%rsp), %xmm2
	movdqa	32(%rsp), %xmm6
	paddd	0(%rdx), %xmm7
	paddd	16(%rdx), %xmm5
	paddd	32(%rdx), %xmm4
	paddd	48(%rdx), %xmm3
	paddd	64(%rdx), %xmm0
	paddd	80(%rdx), %xmm1
	paddd	96(%rdx), %xmm2
	paddd	112(%rdx), %xmm6
	
	movdqa	%xmm7, 48+0(%rsp)
	movdqa	%xmm5, 48+16(%rsp)
	movdqa	%xmm4, 48+32(%rsp)
	movdqa	%xmm3, 48+48(%rsp)
	movdqa	%xmm0, 48+64(%rsp)
	movdqa	%xmm1, 48+80(%rsp)
	movdqa	%xmm2, 48+96(%rsp)
	movdqa	%xmm6, 48+112(%rsp)
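	/* Build the single block of the second hash: the eight digest
	   words stored above, the 0x80000000 padding bit in W[8], zero
	   fill, and the 256-bit message length in W[15].  movd/pshufd
	   splits both constants out of one 64-bit immediate. */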
	
	pxor	%xmm0, %xmm0
	movq	$0x8000000000000100, %rax
	movd	%rax, %xmm1
	pshufd	$0x55, %xmm1, %xmm2
	pshufd	$0x00, %xmm1, %xmm1
	movdqa	%xmm2, 48+128(%rsp)
	movdqa	%xmm0, 48+144(%rsp)
	movdqa	%xmm0, 48+160(%rsp)
	movdqa	%xmm0, 48+176(%rsp)
	movdqa	%xmm0, 48+192(%rsp)
	movdqa	%xmm0, 48+208(%rsp)
	movdqa	%xmm0, 48+224(%rsp)
	movdqa	%xmm1, 48+240(%rsp)
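	/* The compare below is %rax against itself purely to set ZF=1:
	   the shared extend and main loops test it with jz to take their
	   shortened second-pass exits. */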
	
	leaq	19*16(%rsp), %rax
	cmpq	%rax, %rax
	
	movdqa	-15*16(%rax), %xmm0
	movdqa	-14*16(%rax), %xmm4
	movdqa	%xmm0, %xmm2
	movdqa	%xmm4, %xmm6
	psrld	$3, %xmm0
	psrld	$3, %xmm4
	movdqa	%xmm0, %xmm1
	movdqa	%xmm4, %xmm5
	pslld	$14, %xmm2
	pslld	$14, %xmm6
	psrld	$4, %xmm1
	psrld	$4, %xmm5
	pxor	%xmm1, %xmm0
	pxor	%xmm5, %xmm4
	psrld	$11, %xmm1
	psrld	$11, %xmm5
	pxor	%xmm2, %xmm0
	pxor	%xmm6, %xmm4
	pslld	$11, %xmm2
	pslld	$11, %xmm6
	pxor	%xmm1, %xmm0
	pxor	%xmm5, %xmm4
	pxor	%xmm2, %xmm0
	pxor	%xmm6, %xmm4
	paddd	-16*16(%rax), %xmm0
	paddd	-15*16(%rax), %xmm4
	paddd	sha256d_4preext2_17(%rip), %xmm4
	movdqa	%xmm0, %xmm3
	movdqa	%xmm4, %xmm7
	movdqa	%xmm3, 0*16(%rax)
	movdqa	%xmm7, 1*16(%rax)
	
	sha256_sse2_extend_doubleround 2
	sha256_sse2_extend_doubleround 4
	
	movdqa	-9*16(%rax), %xmm0
	movdqa	sha256d_4preext2_23(%rip), %xmm4
	movdqa	%xmm0, %xmm2
	psrld	$3, %xmm0
	movdqa	%xmm0, %xmm1
	pslld	$14, %xmm2
	psrld	$4, %xmm1
	pxor	%xmm1, %xmm0
	pxor	%xmm2, %xmm0
	psrld	$11, %xmm1
	pslld	$11, %xmm2
	pxor	%xmm1, %xmm0
	pxor	%xmm2, %xmm0
	paddd	-10*16(%rax), %xmm0
	paddd	-9*16(%rax), %xmm4
	movdqa	%xmm3, %xmm2
	movdqa	%xmm7, %xmm6
	psrld	$10, %xmm3
	psrld	$10, %xmm7
	movdqa	%xmm3, %xmm1
	movdqa	%xmm7, %xmm5
	paddd	-1*16(%rax), %xmm0
	pslld	$13, %xmm2
	pslld	$13, %xmm6
	psrld	$7, %xmm1
	psrld	$7, %xmm5
	paddd	0*16(%rax), %xmm4
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	psrld	$2, %xmm1
	psrld	$2, %xmm5
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	pslld	$2, %xmm2
	pslld	$2, %xmm6
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	paddd	%xmm0, %xmm3
	paddd	%xmm4, %xmm7
	movdqa	%xmm3, 6*16(%rax)
	movdqa	%xmm7, 7*16(%rax)
	
	movdqa	sha256d_4preext2_24(%rip), %xmm0
	movdqa	%xmm3, %xmm2
	movdqa	%xmm7, %xmm6
	psrld	$10, %xmm3
	psrld	$10, %xmm7
	movdqa	%xmm3, %xmm1
	movdqa	%xmm7, %xmm5
	paddd	1*16(%rax), %xmm0
	pslld	$13, %xmm2
	pslld	$13, %xmm6
	psrld	$7, %xmm1
	psrld	$7, %xmm5
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	psrld	$2, %xmm1
	psrld	$2, %xmm5
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	pslld	$2, %xmm2
	pslld	$2, %xmm6
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	paddd	%xmm0, %xmm3
	paddd	2*16(%rax), %xmm7
	movdqa	%xmm3, 8*16(%rax)
	movdqa	%xmm7, 9*16(%rax)
	
	movdqa	%xmm3, %xmm2
	movdqa	%xmm7, %xmm6
	psrld	$10, %xmm3
	psrld	$10, %xmm7
	movdqa	%xmm3, %xmm1
	movdqa	%xmm7, %xmm5
	pslld	$13, %xmm2
	pslld	$13, %xmm6
	psrld	$7, %xmm1
	psrld	$7, %xmm5
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	psrld	$2, %xmm1
	psrld	$2, %xmm5
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	pslld	$2, %xmm2
	pslld	$2, %xmm6
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	paddd	3*16(%rax), %xmm3
	paddd	4*16(%rax), %xmm7
	movdqa	%xmm3, 10*16(%rax)
	movdqa	%xmm7, 11*16(%rax)
	
	movdqa	%xmm3, %xmm2
	movdqa	%xmm7, %xmm6
	psrld	$10, %xmm3
	psrld	$10, %xmm7
	movdqa	%xmm3, %xmm1
	movdqa	%xmm7, %xmm5
	pslld	$13, %xmm2
	pslld	$13, %xmm6
	psrld	$7, %xmm1
	psrld	$7, %xmm5
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	psrld	$2, %xmm1
	psrld	$2, %xmm5
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	pslld	$2, %xmm2
	pslld	$2, %xmm6
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	paddd	5*16(%rax), %xmm3
	paddd	6*16(%rax), %xmm7
	movdqa	%xmm3, 12*16(%rax)
	movdqa	%xmm7, 13*16(%rax)
	
	movdqa	sha256d_4preext2_30(%rip), %xmm0
	movdqa	0*16(%rax), %xmm4
	movdqa	%xmm4, %xmm6
	psrld	$3, %xmm4
	movdqa	%xmm4, %xmm5
	pslld	$14, %xmm6
	psrld	$4, %xmm5
	pxor	%xmm5, %xmm4
	pxor	%xmm6, %xmm4
	psrld	$11, %xmm5
	pslld	$11, %xmm6
	pxor	%xmm5, %xmm4
	pxor	%xmm6, %xmm4
	paddd	-1*16(%rax), %xmm4
	movdqa	%xmm3, %xmm2
	movdqa	%xmm7, %xmm6
	psrld	$10, %xmm3
	psrld	$10, %xmm7
	movdqa	%xmm3, %xmm1
	movdqa	%xmm7, %xmm5
	paddd	7*16(%rax), %xmm0
	pslld	$13, %xmm2
	pslld	$13, %xmm6
	psrld	$7, %xmm1
	psrld	$7, %xmm5
	paddd	8*16(%rax), %xmm4
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	psrld	$2, %xmm1
	psrld	$2, %xmm5
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	pslld	$2, %xmm2
	pslld	$2, %xmm6
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	paddd	%xmm0, %xmm3
	paddd	%xmm4, %xmm7
	movdqa	%xmm3, 14*16(%rax)
	movdqa	%xmm7, 15*16(%rax)
	
	jmp sha256d_ms_4way_sse2_extend_loop2
	
sha256d_ms_4way_sse2_extend_coda2:
	sha256_sse2_extend_round 44
	
	movdqa	sha256_4h+0(%rip), %xmm7
	movdqa	sha256_4h+16(%rip), %xmm5
	movdqa	sha256_4h+32(%rip), %xmm4
	movdqa	sha256_4h+48(%rip), %xmm3
	movdqa	sha256_4h+64(%rip), %xmm0
	movdqa	sha256_4h+80(%rip), %xmm1
	movdqa	sha256_4h+96(%rip), %xmm2
	movdqa	sha256_4h+112(%rip), %xmm6
	movdqa	%xmm1, 0(%rsp)
	movdqa	%xmm2, 16(%rsp)
	movdqa	%xmm6, 32(%rsp)
	
	leaq	48(%rsp), %rax
	leaq	sha256_4k(%rip), %rcx
	jmp sha256d_ms_4way_sse2_main_loop2
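/*
 * Reduced final rounds: only output word 7 of the second hash is
 * needed for the early target test, so rounds 57-60 compute just the
 * successive e values (the last of which becomes working-variable h
 * after rounds 61-63, which are skipped entirely) and the result is
 * folded into sha256_4h[7] alone.
 */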

.macro sha256_sse2_main_round_red i, r7
	movdqa	16*\i(%rax), %xmm6
	paddd	16*\i(%rcx), %xmm6
	paddd	32(%rsp), %xmm6
	movdqa	%xmm0, %xmm1
	movdqa	16(%rsp), %xmm2
	paddd	\r7, %xmm6
	pandn	%xmm2, %xmm1
	movdqa	%xmm2, 32(%rsp)
	movdqa	0(%rsp), %xmm2
	movdqa	%xmm2, 16(%rsp)
	pand	%xmm0, %xmm2
	pxor	%xmm2, %xmm1
	movdqa	%xmm0, 0(%rsp)
	paddd	%xmm1, %xmm6
	movdqa	%xmm0, %xmm1
	psrld	$6, %xmm0
	movdqa	%xmm0, %xmm2
	pslld	$7, %xmm1
	psrld	$5, %xmm2
	pxor	%xmm1, %xmm0
	pxor	%xmm2, %xmm0
	pslld	$14, %xmm1
	psrld	$14, %xmm2
	pxor	%xmm1, %xmm0
	pxor	%xmm2, %xmm0
	pslld	$5, %xmm1
	pxor	%xmm1, %xmm0
	paddd	%xmm6, %xmm0
.endm

sha256d_ms_4way_sse2_finish:
	sha256_sse2_main_round_red 57, %xmm3
	sha256_sse2_main_round_red 58, %xmm4
	sha256_sse2_main_round_red 59, %xmm5
	sha256_sse2_main_round_red 60, %xmm7
	
	paddd	sha256_4h+112(%rip), %xmm0
	movdqa	%xmm0, 112(%rdi)
	
	addq	$8+67*16, %rsp
#if defined(_WIN64) || defined(__CYGWIN__)
	popq	%rsi
	movdqa	0(%rsp), %xmm6
	movdqa	16(%rsp), %xmm7
	addq	$32, %rsp
	popq	%rdi
#endif
	ret
	
	
#if defined(USE_AVX)
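	/* AVX build of the same nonce-specialized double hash. */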
	
	.p2align 6
sha256d_ms_4way_avx:
#if defined(_WIN64) || defined(__CYGWIN__)
	pushq	%rdi
	subq	$80, %rsp
	movdqa	%xmm6, 0(%rsp)
	movdqa	%xmm7, 16(%rsp)
	movdqa	%xmm8, 32(%rsp)
	movdqa	%xmm9, 48(%rsp)
	movdqa	%xmm10, 64(%rsp)
	pushq	%rsi
	movq	%rcx, %rdi
	movq	%rdx, %rsi
	movq	%r8, %rdx
	movq	%r9, %rcx
#endif
	subq	$1032, %rsp
	
	leaq	256(%rsi), %rax
	
sha256d_ms_4way_avx_extend_loop1:
	vmovdqa	3*16(%rsi), %xmm0
	vmovdqa	2*16(%rax), %xmm3
	vmovdqa	3*16(%rax), %xmm7
	vmovdqa	%xmm3, 2*16(%rsp)
	vmovdqa	%xmm7, 3*16(%rsp)
	vpaddd	%xmm0, %xmm7, %xmm7
	vpslld	$14, %xmm0, %xmm2
	vpsrld	$3, %xmm0, %xmm0
	vpsrld	$4, %xmm0, %xmm1
	vpxor	%xmm1, %xmm0, %xmm0
	vpxor	%xmm2, %xmm0, %xmm0
	vpsrld	$11, %xmm1, %xmm1
	vpslld	$11, %xmm2, %xmm2
	vpxor	%xmm1, %xmm0, %xmm0
	vpxor	%xmm2, %xmm0, %xmm0
	vpaddd	%xmm0, %xmm3, %xmm3
	vmovdqa	%xmm3, 2*16(%rax)
	vmovdqa	%xmm7, 3*16(%rax)
	
	vmovdqa	4*16(%rax), %xmm0
	vmovdqa	%xmm0, 4*16(%rsp)
	vpslld	$13, %xmm3, %xmm2
	vpslld	$13, %xmm7, %xmm6
	vpsrld	$10, %xmm3, %xmm3
	vpsrld	$10, %xmm7, %xmm7
	vpsrld	$7, %xmm3, %xmm1
	vpsrld	$7, %xmm7, %xmm5
	vpxor	%xmm1, %xmm3, %xmm3
	vpxor	%xmm5, %xmm7, %xmm7
	vpsrld	$2, %xmm1, %xmm1
	vpsrld	$2, %xmm5, %xmm5
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpslld	$2, %xmm2, %xmm2
	vpslld	$2, %xmm6, %xmm6
	vpxor	%xmm1, %xmm3, %xmm3
	vpxor	%xmm5, %xmm7, %xmm7
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpaddd	%xmm0, %xmm3, %xmm3
	vmovdqa	%xmm3, 4*16(%rax)
	vmovdqa	%xmm7, 5*16(%rax)
	
	vmovdqa	6*16(%rax), %xmm0
	vmovdqa	7*16(%rax), %xmm4
	vmovdqa	%xmm0, 6*16(%rsp)
	vmovdqa	%xmm4, 7*16(%rsp)
	vpslld	$13, %xmm3, %xmm2
	vpslld	$13, %xmm7, %xmm6
	vpsrld	$10, %xmm3, %xmm3
	vpsrld	$10, %xmm7, %xmm7
	vpsrld	$7, %xmm3, %xmm1
	vpsrld	$7, %xmm7, %xmm5
	vpxor	%xmm1, %xmm3, %xmm3
	vpxor	%xmm5, %xmm7, %xmm7
	vpsrld	$2, %xmm1, %xmm1
	vpsrld	$2, %xmm5, %xmm5
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpslld	$2, %xmm2, %xmm2
	vpslld	$2, %xmm6, %xmm6
	vpxor	%xmm1, %xmm3, %xmm3
	vpxor	%xmm5, %xmm7, %xmm7
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpaddd	%xmm0, %xmm3, %xmm3
	vpaddd	%xmm4, %xmm7, %xmm7
	vmovdqa	%xmm3, 6*16(%rax)
	vmovdqa	%xmm7, 7*16(%rax)
	
	vmovdqa	8*16(%rax), %xmm0
	vmovdqa	2*16(%rax), %xmm4
	vmovdqa	%xmm0, 8*16(%rsp)
	vpslld	$13, %xmm3, %xmm2
	vpslld	$13, %xmm7, %xmm6
	vpsrld	$10, %xmm3, %xmm3
	vpsrld	$10, %xmm7, %xmm7
	vpsrld	$7, %xmm3, %xmm1
	vpsrld	$7, %xmm7, %xmm5
	vpxor	%xmm1, %xmm3, %xmm3
	vpxor	%xmm5, %xmm7, %xmm7
	vpsrld	$2, %xmm1, %xmm1
	vpsrld	$2, %xmm5, %xmm5
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpslld	$2, %xmm2, %xmm2
	vpslld	$2, %xmm6, %xmm6
	vpxor	%xmm1, %xmm3, %xmm3
	vpxor	%xmm5, %xmm7, %xmm7
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpaddd	%xmm0, %xmm3, %xmm3
	vpaddd	%xmm4, %xmm7, %xmm7
	vmovdqa	%xmm3, 8*16(%rax)
	vmovdqa	%xmm7, 9*16(%rax)
	
	vpslld	$13, %xmm3, %xmm2
	vpslld	$13, %xmm7, %xmm6
	vpsrld	$10, %xmm3, %xmm3
	vpsrld	$10, %xmm7, %xmm7
	vpsrld	$7, %xmm3, %xmm1
	vpsrld	$7, %xmm7, %xmm5
	vpxor	%xmm1, %xmm3, %xmm3
	vpxor	%xmm5, %xmm7, %xmm7
	vpsrld	$2, %xmm1, %xmm1
	vpsrld	$2, %xmm5, %xmm5
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpslld	$2, %xmm2, %xmm2
	vpslld	$2, %xmm6, %xmm6
	vpxor	%xmm1, %xmm3, %xmm3
	vpxor	%xmm5, %xmm7, %xmm7
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpaddd	3*16(%rax), %xmm3, %xmm3
	vpaddd	4*16(%rax), %xmm7, %xmm7
	vmovdqa	%xmm3, 10*16(%rax)
	vmovdqa	%xmm7, 11*16(%rax)
	
	vpslld	$13, %xmm3, %xmm2
	vpslld	$13, %xmm7, %xmm6
	vpsrld	$10, %xmm3, %xmm3
	vpsrld	$10, %xmm7, %xmm7
	vpsrld	$7, %xmm3, %xmm1
	vpsrld	$7, %xmm7, %xmm5
	vpxor	%xmm1, %xmm3, %xmm3
	vpxor	%xmm5, %xmm7, %xmm7
	vpsrld	$2, %xmm1, %xmm1
	vpsrld	$2, %xmm5, %xmm5
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpslld	$2, %xmm2, %xmm2
	vpslld	$2, %xmm6, %xmm6
	vpxor	%xmm1, %xmm3, %xmm3
	vpxor	%xmm5, %xmm7, %xmm7
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpaddd	5*16(%rax), %xmm3, %xmm3
	vpaddd	6*16(%rax), %xmm7, %xmm7
	vmovdqa	%xmm3, 12*16(%rax)
	vmovdqa	%xmm7, 13*16(%rax)
	
	vmovdqa	14*16(%rax), %xmm0
	vmovdqa	15*16(%rax), %xmm4
	vmovdqa	%xmm0, 14*16(%rsp)
	vmovdqa	%xmm4, 15*16(%rsp)
	vpslld	$13, %xmm3, %xmm2
	vpslld	$13, %xmm7, %xmm6
	vpsrld	$10, %xmm3, %xmm3
	vpsrld	$10, %xmm7, %xmm7
	vpaddd	7*16(%rax), %xmm0, %xmm0
	vpaddd	8*16(%rax), %xmm4, %xmm4
	vpsrld	$7, %xmm3, %xmm1
	vpsrld	$7, %xmm7, %xmm5
	vpxor	%xmm1, %xmm3, %xmm3
	vpxor	%xmm5, %xmm7, %xmm7
	vpsrld	$2, %xmm1, %xmm1
	vpsrld	$2, %xmm5, %xmm5
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpslld	$2, %xmm2, %xmm2
	vpslld	$2, %xmm6, %xmm6
	vpxor	%xmm1, %xmm3, %xmm3
	vpxor	%xmm5, %xmm7, %xmm7
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpaddd	%xmm0, %xmm3, %xmm3
	vpaddd	%xmm4, %xmm7, %xmm7
	vmovdqa	%xmm3, 14*16(%rax)
	vmovdqa	%xmm7, 15*16(%rax)
	
sha256d_ms_4way_avx_extend_loop2:
	sha256_avx_extend_doubleround 16
	sha256_avx_extend_doubleround 18
	sha256_avx_extend_doubleround 20
	sha256_avx_extend_doubleround 22
	sha256_avx_extend_doubleround 24
	sha256_avx_extend_doubleround 26
	sha256_avx_extend_doubleround 28
	sha256_avx_extend_doubleround 30
	sha256_avx_extend_doubleround 32
	sha256_avx_extend_doubleround 34
	sha256_avx_extend_doubleround 36
	sha256_avx_extend_doubleround 38
	sha256_avx_extend_doubleround 40
	sha256_avx_extend_doubleround 42
	jz sha256d_ms_4way_avx_extend_coda2
	sha256_avx_extend_doubleround 44
	sha256_avx_extend_doubleround 46
	
	movdqa	0(%rcx), %xmm7
	movdqa	16(%rcx), %xmm8
	movdqa	32(%rcx), %xmm9
	movdqa	48(%rcx), %xmm10
	movdqa	64(%rcx), %xmm0
	movdqa	80(%rcx), %xmm5
	movdqa	96(%rcx), %xmm4
	movdqa	112(%rcx), %xmm3
	
	movq	%rsi, %rax
	leaq	sha256_4k(%rip), %rcx
	jmp sha256d_ms_4way_avx_main_loop1
	
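	/* The first pass enters at main_loop1 (round 3): rounds 0-2 depend
	   only on invariant input words and are already folded into the
	   state loaded from the fourth argument.  The second pass enters at
	   main_loop2 and runs all 64 rounds. */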
sha256d_ms_4way_avx_main_loop2:
	sha256_avx_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
	sha256_avx_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
	sha256_avx_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
sha256d_ms_4way_avx_main_loop1:
	sha256_avx_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
	sha256_avx_main_quadround 4
	sha256_avx_main_quadround 8
	sha256_avx_main_quadround 12
	sha256_avx_main_quadround 16
	sha256_avx_main_quadround 20
	sha256_avx_main_quadround 24
	sha256_avx_main_quadround 28
	sha256_avx_main_quadround 32
	sha256_avx_main_quadround 36
	sha256_avx_main_quadround 40
	sha256_avx_main_quadround 44
	sha256_avx_main_quadround 48
	sha256_avx_main_quadround 52
	sha256_avx_main_round 56, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
	jz sha256d_ms_4way_avx_finish
	sha256_avx_main_round 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
	sha256_avx_main_round 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
	sha256_avx_main_round 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
	sha256_avx_main_quadround 60
	
	movdqa	2*16(%rsp), %xmm1
	movdqa	3*16(%rsp), %xmm2
	movdqa	4*16(%rsp), %xmm6
	movdqa	%xmm1, 18*16(%rsi)
	movdqa	%xmm2, 19*16(%rsi)
	movdqa	%xmm6, 20*16(%rsi)
	movdqa	6*16(%rsp), %xmm1
	movdqa	7*16(%rsp), %xmm2
	movdqa	8*16(%rsp), %xmm6
	movdqa	%xmm1, 22*16(%rsi)
	movdqa	%xmm2, 23*16(%rsi)
	movdqa	%xmm6, 24*16(%rsi)
	movdqa	14*16(%rsp), %xmm1
	movdqa	15*16(%rsp), %xmm2
	movdqa	%xmm1, 30*16(%rsi)
	movdqa	%xmm2, 31*16(%rsi)
	
	paddd	0(%rdx), %xmm7
	paddd	16(%rdx), %xmm5
	paddd	32(%rdx), %xmm4
	paddd	48(%rdx), %xmm3
	paddd	64(%rdx), %xmm0
	paddd	80(%rdx), %xmm8
	paddd	96(%rdx), %xmm9
	paddd	112(%rdx), %xmm10
	
	movdqa	%xmm7, 0(%rsp)
	movdqa	%xmm5, 16(%rsp)
	movdqa	%xmm4, 32(%rsp)
	movdqa	%xmm3, 48(%rsp)
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm8, 80(%rsp)
	movdqa	%xmm9, 96(%rsp)
	movdqa	%xmm10, 112(%rsp)
	
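	/* Build the second 16-word block in place: the eight hash words
	   stored above are followed by the fixed SHA-256 padding for a
	   256-bit message, i.e. the 0x80000000 end marker, six zero words,
	   and the bit length 0x100, broadcast across all four lanes. */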
	pxor	%xmm0, %xmm0
	movq	$0x8000000000000100, %rax
	movd	%rax, %xmm1
	pshufd	$0x55, %xmm1, %xmm2
	pshufd	$0x00, %xmm1, %xmm1
	movdqa	%xmm2, 128(%rsp)
	movdqa	%xmm0, 144(%rsp)
	movdqa	%xmm0, 160(%rsp)
	movdqa	%xmm0, 176(%rsp)
	movdqa	%xmm0, 192(%rsp)
	movdqa	%xmm0, 208(%rsp)
	movdqa	%xmm0, 224(%rsp)
	movdqa	%xmm1, 240(%rsp)
	
	leaq	256(%rsp), %rax
	cmpq	%rax, %rax
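	/* Comparing a register with itself forces ZF=1; none of the SIMD
	   instructions below touch the flags, so the jz tests in the shared
	   extend/main loops divert to the coda and finish paths on this
	   second pass. */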
	
	vmovdqa	-15*16(%rax), %xmm0
	vmovdqa	-14*16(%rax), %xmm4
	vpslld	$14, %xmm0, %xmm2
	vpslld	$14, %xmm4, %xmm6
	vpsrld	$3, %xmm0, %xmm8
	vpsrld	$3, %xmm4, %xmm4
	vpsrld	$7, %xmm0, %xmm1
	vpsrld	$4, %xmm4, %xmm5
	vpxor	%xmm1, %xmm8, %xmm8
	vpxor	%xmm5, %xmm4, %xmm4
	vpsrld	$11, %xmm1, %xmm1
	vpsrld	$11, %xmm5, %xmm5
	vpxor	%xmm2, %xmm8, %xmm8
	vpxor	%xmm6, %xmm4, %xmm4
	vpslld	$11, %xmm2, %xmm2
	vpslld	$11, %xmm6, %xmm6
	vpxor	%xmm1, %xmm8, %xmm8
	vpxor	%xmm5, %xmm4, %xmm4
	vpxor	%xmm2, %xmm8, %xmm8
	vpxor	%xmm6, %xmm4, %xmm4
	vpaddd	%xmm0, %xmm4, %xmm4
	vpaddd	-16*16(%rax), %xmm8, %xmm3
	vpaddd	sha256d_4preext2_17(%rip), %xmm4, %xmm7
	vmovdqa	%xmm3, 0*16(%rax)
	vmovdqa	%xmm7, 1*16(%rax)
	
	sha256_avx_extend_doubleround 2
	sha256_avx_extend_doubleround 4
	
	vmovdqa	-9*16(%rax), %xmm0
	vpslld	$14, %xmm0, %xmm2
	vpsrld	$3, %xmm0, %xmm8
	vpsrld	$7, %xmm0, %xmm1
	vpxor	%xmm1, %xmm8, %xmm8
	vpxor	%xmm2, %xmm8, %xmm8
	vpsrld	$11, %xmm1, %xmm1
	vpslld	$11, %xmm2, %xmm2
	vpxor	%xmm1, %xmm8, %xmm8
	vpxor	%xmm2, %xmm8, %xmm8
	vpaddd	sha256d_4preext2_23(%rip), %xmm0, %xmm4
	vpaddd	-10*16(%rax), %xmm8, %xmm0
	vpslld	$13, %xmm3, %xmm2
	vpslld	$13, %xmm7, %xmm6
	vpsrld	$10, %xmm3, %xmm3
	vpsrld	$10, %xmm7, %xmm7
	vpaddd	-1*16(%rax), %xmm0, %xmm0
	vpaddd	0*16(%rax), %xmm4, %xmm4
	vpsrld	$7, %xmm3, %xmm1
	vpsrld	$7, %xmm7, %xmm5
	vpxor	%xmm1, %xmm3, %xmm3
	vpxor	%xmm5, %xmm7, %xmm7
	vpsrld	$2, %xmm1, %xmm1
	vpsrld	$2, %xmm5, %xmm5
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpslld	$2, %xmm2, %xmm2
	vpslld	$2, %xmm6, %xmm6
	vpxor	%xmm1, %xmm3, %xmm3
	vpxor	%xmm5, %xmm7, %xmm7
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpaddd	%xmm0, %xmm3, %xmm3
	vpaddd	%xmm4, %xmm7, %xmm7
	vmovdqa	%xmm3, 6*16(%rax)
	vmovdqa	%xmm7, 7*16(%rax)
	
	vpslld	$13, %xmm3, %xmm2
	vpslld	$13, %xmm7, %xmm6
	vpsrld	$10, %xmm3, %xmm3
	vpsrld	$10, %xmm7, %xmm7
	vpsrld	$7, %xmm3, %xmm1
	vpsrld	$7, %xmm7, %xmm5
	vpxor	%xmm1, %xmm3, %xmm3
	vpxor	%xmm5, %xmm7, %xmm7
	vpsrld	$2, %xmm1, %xmm1
	vpsrld	$2, %xmm5, %xmm5
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpslld	$2, %xmm2, %xmm2
	vpslld	$2, %xmm6, %xmm6
	vpxor	%xmm1, %xmm3, %xmm3
	vpxor	%xmm5, %xmm7, %xmm7
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpaddd	sha256d_4preext2_24(%rip), %xmm3, %xmm3
	vpaddd	1*16(%rax), %xmm3, %xmm3
	vpaddd	2*16(%rax), %xmm7, %xmm7
	vmovdqa	%xmm3, 8*16(%rax)
	vmovdqa	%xmm7, 9*16(%rax)
	
	vpslld	$13, %xmm3, %xmm2
	vpslld	$13, %xmm7, %xmm6
	vpsrld	$10, %xmm3, %xmm3
	vpsrld	$10, %xmm7, %xmm7
	vpsrld	$7, %xmm3, %xmm1
	vpsrld	$7, %xmm7, %xmm5
	vpxor	%xmm1, %xmm3, %xmm3
	vpxor	%xmm5, %xmm7, %xmm7
	vpsrld	$2, %xmm1, %xmm1
	vpsrld	$2, %xmm5, %xmm5
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpslld	$2, %xmm2, %xmm2
	vpslld	$2, %xmm6, %xmm6
	vpxor	%xmm1, %xmm3, %xmm3
	vpxor	%xmm5, %xmm7, %xmm7
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpaddd	3*16(%rax), %xmm3, %xmm3
	vpaddd	4*16(%rax), %xmm7, %xmm7
	vmovdqa	%xmm3, 10*16(%rax)
	vmovdqa	%xmm7, 11*16(%rax)
	
	vpslld	$13, %xmm3, %xmm2
	vpslld	$13, %xmm7, %xmm6
	vpsrld	$10, %xmm3, %xmm3
	vpsrld	$10, %xmm7, %xmm7
	vpsrld	$7, %xmm3, %xmm1
	vpsrld	$7, %xmm7, %xmm5
	vpxor	%xmm1, %xmm3, %xmm3
	vpxor	%xmm5, %xmm7, %xmm7
	vpsrld	$2, %xmm1, %xmm1
	vpsrld	$2, %xmm5, %xmm5
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpslld	$2, %xmm2, %xmm2
	vpslld	$2, %xmm6, %xmm6
	vpxor	%xmm1, %xmm3, %xmm3
	vpxor	%xmm5, %xmm7, %xmm7
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpaddd	5*16(%rax), %xmm3, %xmm3
	vpaddd	6*16(%rax), %xmm7, %xmm7
	vmovdqa	%xmm3, 12*16(%rax)
	vmovdqa	%xmm7, 13*16(%rax)
	
	vmovdqa	sha256d_4preext2_30(%rip), %xmm0
	vmovdqa	0*16(%rax), %xmm4
	vpslld	$14, %xmm4, %xmm6
	vpsrld	$3, %xmm4, %xmm4
	vpsrld	$4, %xmm4, %xmm5
	vpxor	%xmm5, %xmm4, %xmm4
	vpxor	%xmm6, %xmm4, %xmm4
	vpsrld	$11, %xmm5, %xmm5
	vpslld	$11, %xmm6, %xmm6
	vpxor	%xmm5, %xmm4, %xmm4
	vpxor	%xmm6, %xmm4, %xmm4
	vpaddd	-1*16(%rax), %xmm4, %xmm4
	vpslld	$13, %xmm3, %xmm2
	vpslld	$13, %xmm7, %xmm6
	vpsrld	$10, %xmm3, %xmm3
	vpsrld	$10, %xmm7, %xmm7
	vpaddd	7*16(%rax), %xmm0, %xmm0
	vpaddd	8*16(%rax), %xmm4, %xmm4
	vpsrld	$7, %xmm3, %xmm1
	vpsrld	$7, %xmm7, %xmm5
	vpxor	%xmm1, %xmm3, %xmm3
	vpxor	%xmm5, %xmm7, %xmm7
	vpsrld	$2, %xmm1, %xmm1
	vpsrld	$2, %xmm5, %xmm5
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpslld	$2, %xmm2, %xmm2
	vpslld	$2, %xmm6, %xmm6
	vpxor	%xmm1, %xmm3, %xmm3
	vpxor	%xmm5, %xmm7, %xmm7
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpaddd	%xmm0, %xmm3, %xmm3
	vpaddd	%xmm4, %xmm7, %xmm7
	vmovdqa	%xmm3, 14*16(%rax)
	vmovdqa	%xmm7, 15*16(%rax)
	
	jmp sha256d_ms_4way_avx_extend_loop2
	
sha256d_ms_4way_avx_extend_coda2:
	sha256_avx_extend_round 44
	
	movdqa	sha256_4h+0(%rip), %xmm7
	movdqa	sha256_4h+16(%rip), %xmm5
	movdqa	sha256_4h+32(%rip), %xmm4
	movdqa	sha256_4h+48(%rip), %xmm3
	movdqa	sha256_4h+64(%rip), %xmm0
	movdqa	sha256_4h+80(%rip), %xmm8
	movdqa	sha256_4h+96(%rip), %xmm9
	movdqa	sha256_4h+112(%rip), %xmm10
	
	movq	%rsp, %rax
	leaq	sha256_4k(%rip), %rcx
	jmp sha256d_ms_4way_avx_main_loop2

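/* AVX form of the reduced main round described before
   sha256_sse2_main_round_red: only the new e value, returned in \r0,
   is computed. */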
.macro sha256_avx_main_round_red i, r0, r1, r2, r3, r4
	vpaddd	16*\i(%rax), \r0, %xmm6
	vpaddd	16*\i(%rcx), %xmm6, %xmm6
	vpandn	\r1, \r3, %xmm1
	vpand	\r3, \r2, %xmm2
	vpxor	%xmm2, %xmm1, %xmm1
	vpaddd	%xmm1, %xmm6, %xmm6
	vpslld	$7, \r3, %xmm1
	vpsrld	$6, \r3, \r0
	vpsrld	$5, \r0, %xmm2
	vpxor	%xmm1, \r0, \r0
	vpxor	%xmm2, \r0, \r0
	vpslld	$14, %xmm1, %xmm1
	vpsrld	$14, %xmm2, %xmm2
	vpxor	%xmm1, \r0, \r0
	vpxor	%xmm2, \r0, \r0
	vpslld	$5, %xmm1, %xmm1
	vpxor	%xmm1, \r0, \r0
	vpaddd	\r0, %xmm6, %xmm6
	vpaddd	%xmm6, \r4, \r0
.endm

sha256d_ms_4way_avx_finish:
	sha256_avx_main_round_red 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4
	sha256_avx_main_round_red 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5
	sha256_avx_main_round_red 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7
	sha256_avx_main_round_red 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3
	
	paddd	sha256_4h+112(%rip), %xmm10
	movdqa	%xmm10, 112(%rdi)
	
	addq	$1032, %rsp
#if defined(_WIN64) || defined(__CYGWIN__)
	popq	%rsi
	movdqa	0(%rsp), %xmm6
	movdqa	16(%rsp), %xmm7
	movdqa	32(%rsp), %xmm8
	movdqa	48(%rsp), %xmm9
	movdqa	64(%rsp), %xmm10
	addq	$80, %rsp
	popq	%rdi
#endif
	ret
	
#endif /* USE_AVX */
	
	
#if defined(USE_XOP)
	
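/*
 * XOP variant: AMD's vprotd performs a full 32-bit rotate in one
 * instruction, so each ROTR in the sigma functions collapses from a
 * shift/shift/xor sequence into a single vprotd; e.g. ROTR(x,7) becomes
 * vprotd $25 (rotate left by 32-7).
 */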
	.p2align 6
sha256d_ms_4way_xop:
#if defined(_WIN64) || defined(__CYGWIN__)
	pushq	%rdi
	subq	$80, %rsp
	movdqa	%xmm6, 0(%rsp)
	movdqa	%xmm7, 16(%rsp)
	movdqa	%xmm8, 32(%rsp)
	movdqa	%xmm9, 48(%rsp)
	movdqa	%xmm10, 64(%rsp)
	pushq	%rsi
	movq	%rcx, %rdi
	movq	%rdx, %rsi
	movq	%r8, %rdx
	movq	%r9, %rcx
#endif
	subq	$1032, %rsp
	
	leaq	256(%rsi), %rax
	
sha256d_ms_4way_xop_extend_loop1:
	vmovdqa	3*16(%rsi), %xmm0
	vmovdqa	2*16(%rax), %xmm3
	vmovdqa	3*16(%rax), %xmm7
	vmovdqa	%xmm3, 2*16(%rsp)
	vmovdqa	%xmm7, 3*16(%rsp)
	vpaddd	%xmm0, %xmm7, %xmm7
	vprotd	$25, %xmm0, %xmm1
	vprotd	$14, %xmm0, %xmm2
	vpsrld	$3, %xmm0, %xmm0
	vpxor	%xmm1, %xmm2, %xmm2
	vpxor	%xmm2, %xmm0, %xmm0
	vpaddd	%xmm0, %xmm3, %xmm3
	vmovdqa	%xmm3, 2*16(%rax)
	vmovdqa	%xmm7, 3*16(%rax)
	
	vmovdqa	4*16(%rax), %xmm0
	vmovdqa	%xmm0, 4*16(%rsp)
	vprotd	$15, %xmm3, %xmm1
	vprotd	$15, %xmm7, %xmm5
	vprotd	$13, %xmm3, %xmm2
	vprotd	$13, %xmm7, %xmm6
	vpxor	%xmm1, %xmm2, %xmm2
	vpxor	%xmm5, %xmm6, %xmm6
	vpsrld	$10, %xmm3, %xmm3
	vpsrld	$10, %xmm7, %xmm7
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpaddd	%xmm0, %xmm3, %xmm3
	vmovdqa	%xmm3, 4*16(%rax)
	vmovdqa	%xmm7, 5*16(%rax)
	
	vmovdqa	6*16(%rax), %xmm0
	vmovdqa	7*16(%rax), %xmm4
	vmovdqa	%xmm0, 6*16(%rsp)
	vmovdqa	%xmm4, 7*16(%rsp)
	vprotd	$15, %xmm3, %xmm1
	vprotd	$15, %xmm7, %xmm5
	vprotd	$13, %xmm3, %xmm2
	vprotd	$13, %xmm7, %xmm6
	vpxor	%xmm1, %xmm2, %xmm2
	vpxor	%xmm5, %xmm6, %xmm6
	vpsrld	$10, %xmm3, %xmm3
	vpsrld	$10, %xmm7, %xmm7
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpaddd	%xmm0, %xmm3, %xmm3
	vpaddd	%xmm4, %xmm7, %xmm7
	vmovdqa	%xmm3, 6*16(%rax)
	vmovdqa	%xmm7, 7*16(%rax)
	
	vmovdqa	8*16(%rax), %xmm0
	vmovdqa	2*16(%rax), %xmm4
	vmovdqa	%xmm0, 8*16(%rsp)
	vprotd	$15, %xmm3, %xmm1
	vprotd	$15, %xmm7, %xmm5
	vprotd	$13, %xmm3, %xmm2
	vprotd	$13, %xmm7, %xmm6
	vpxor	%xmm1, %xmm2, %xmm2
	vpxor	%xmm5, %xmm6, %xmm6
	vpsrld	$10, %xmm3, %xmm3
	vpsrld	$10, %xmm7, %xmm7
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpaddd	%xmm0, %xmm3, %xmm3
	vpaddd	%xmm4, %xmm7, %xmm7
	vmovdqa	%xmm3, 8*16(%rax)
	vmovdqa	%xmm7, 9*16(%rax)
	
	vprotd	$15, %xmm3, %xmm1
	vprotd	$15, %xmm7, %xmm5
	vprotd	$13, %xmm3, %xmm2
	vprotd	$13, %xmm7, %xmm6
	vpxor	%xmm1, %xmm2, %xmm2
	vpxor	%xmm5, %xmm6, %xmm6
	vpsrld	$10, %xmm3, %xmm3
	vpsrld	$10, %xmm7, %xmm7
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpaddd	3*16(%rax), %xmm3, %xmm3
	vpaddd	4*16(%rax), %xmm7, %xmm7
	vmovdqa	%xmm3, 10*16(%rax)
	vmovdqa	%xmm7, 11*16(%rax)
	
	vprotd	$15, %xmm3, %xmm1
	vprotd	$15, %xmm7, %xmm5
	vprotd	$13, %xmm3, %xmm2
	vprotd	$13, %xmm7, %xmm6
	vpxor	%xmm1, %xmm2, %xmm2
	vpxor	%xmm5, %xmm6, %xmm6
	vpsrld	$10, %xmm3, %xmm3
	vpsrld	$10, %xmm7, %xmm7
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpaddd	5*16(%rax), %xmm3, %xmm3
	vpaddd	6*16(%rax), %xmm7, %xmm7
	vmovdqa	%xmm3, 12*16(%rax)
	vmovdqa	%xmm7, 13*16(%rax)
	
	vmovdqa	14*16(%rax), %xmm0
	vmovdqa	15*16(%rax), %xmm4
	vmovdqa	%xmm0, 14*16(%rsp)
	vmovdqa	%xmm4, 15*16(%rsp)
	vprotd	$15, %xmm3, %xmm1
	vprotd	$15, %xmm7, %xmm5
	vprotd	$13, %xmm3, %xmm2
	vprotd	$13, %xmm7, %xmm6
	vpxor	%xmm1, %xmm2, %xmm2
	vpxor	%xmm5, %xmm6, %xmm6
	vpaddd	7*16(%rax), %xmm0, %xmm0
	vpaddd	8*16(%rax), %xmm4, %xmm4
	vpsrld	$10, %xmm3, %xmm3
	vpsrld	$10, %xmm7, %xmm7
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpaddd	%xmm0, %xmm3, %xmm3
	vpaddd	%xmm4, %xmm7, %xmm7
	vmovdqa	%xmm3, 14*16(%rax)
	vmovdqa	%xmm7, 15*16(%rax)
	
sha256d_ms_4way_xop_extend_loop2:
	sha256_xop_extend_doubleround 16
	sha256_xop_extend_doubleround 18
	sha256_xop_extend_doubleround 20
	sha256_xop_extend_doubleround 22
	sha256_xop_extend_doubleround 24
	sha256_xop_extend_doubleround 26
	sha256_xop_extend_doubleround 28
	sha256_xop_extend_doubleround 30
	sha256_xop_extend_doubleround 32
	sha256_xop_extend_doubleround 34
	sha256_xop_extend_doubleround 36
	sha256_xop_extend_doubleround 38
	sha256_xop_extend_doubleround 40
	sha256_xop_extend_doubleround 42
	jz sha256d_ms_4way_xop_extend_coda2
	sha256_xop_extend_doubleround 44
	sha256_xop_extend_doubleround 46
	
	movdqa	0(%rcx), %xmm7
	movdqa	16(%rcx), %xmm8
	movdqa	32(%rcx), %xmm9
	movdqa	48(%rcx), %xmm10
	movdqa	64(%rcx), %xmm0
	movdqa	80(%rcx), %xmm5
	movdqa	96(%rcx), %xmm4
	movdqa	112(%rcx), %xmm3
	
	movq	%rsi, %rax
	leaq	sha256_4k(%rip), %rcx
	jmp sha256d_ms_4way_xop_main_loop1
	
sha256d_ms_4way_xop_main_loop2:
	sha256_xop_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
	sha256_xop_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
	sha256_xop_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
sha256d_ms_4way_xop_main_loop1:
	sha256_xop_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
	sha256_xop_main_quadround 4
	sha256_xop_main_quadround 8
	sha256_xop_main_quadround 12
	sha256_xop_main_quadround 16
	sha256_xop_main_quadround 20
	sha256_xop_main_quadround 24
	sha256_xop_main_quadround 28
	sha256_xop_main_quadround 32
	sha256_xop_main_quadround 36
	sha256_xop_main_quadround 40
	sha256_xop_main_quadround 44
	sha256_xop_main_quadround 48
	sha256_xop_main_quadround 52
	sha256_xop_main_round 56, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
	jz sha256d_ms_4way_xop_finish
	sha256_xop_main_round 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
	sha256_xop_main_round 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
	sha256_xop_main_round 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
	sha256_xop_main_quadround 60
	
	movdqa	2*16(%rsp), %xmm1
	movdqa	3*16(%rsp), %xmm2
	movdqa	4*16(%rsp), %xmm6
	movdqa	%xmm1, 18*16(%rsi)
	movdqa	%xmm2, 19*16(%rsi)
	movdqa	%xmm6, 20*16(%rsi)
	movdqa	6*16(%rsp), %xmm1
	movdqa	7*16(%rsp), %xmm2
	movdqa	8*16(%rsp), %xmm6
	movdqa	%xmm1, 22*16(%rsi)
	movdqa	%xmm2, 23*16(%rsi)
	movdqa	%xmm6, 24*16(%rsi)
	movdqa	14*16(%rsp), %xmm1
	movdqa	15*16(%rsp), %xmm2
	movdqa	%xmm1, 30*16(%rsi)
	movdqa	%xmm2, 31*16(%rsi)
	
	paddd	0(%rdx), %xmm7
	paddd	16(%rdx), %xmm5
	paddd	32(%rdx), %xmm4
	paddd	48(%rdx), %xmm3
	paddd	64(%rdx), %xmm0
	paddd	80(%rdx), %xmm8
	paddd	96(%rdx), %xmm9
	paddd	112(%rdx), %xmm10
	
	movdqa	%xmm7, 0(%rsp)
	movdqa	%xmm5, 16(%rsp)
	movdqa	%xmm4, 32(%rsp)
	movdqa	%xmm3, 48(%rsp)
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm8, 80(%rsp)
	movdqa	%xmm9, 96(%rsp)
	movdqa	%xmm10, 112(%rsp)
	
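	/* Fixed padding block for the second hash, as in the AVX path above. */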
	pxor	%xmm0, %xmm0
	movq	$0x8000000000000100, %rax
	movd	%rax, %xmm1
	pshufd	$0x55, %xmm1, %xmm2
	pshufd	$0x00, %xmm1, %xmm1
	movdqa	%xmm2, 128(%rsp)
	movdqa	%xmm0, 144(%rsp)
	movdqa	%xmm0, 160(%rsp)
	movdqa	%xmm0, 176(%rsp)
	movdqa	%xmm0, 192(%rsp)
	movdqa	%xmm0, 208(%rsp)
	movdqa	%xmm0, 224(%rsp)
	movdqa	%xmm1, 240(%rsp)
	
	leaq	256(%rsp), %rax
	cmpq	%rax, %rax
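	/* Force ZF=1 again so the shared loops take the coda/finish paths
	   on this second pass. */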
	
	vmovdqa	-15*16(%rax), %xmm0
	vmovdqa	-14*16(%rax), %xmm4
	vprotd	$25, %xmm0, %xmm1
	vprotd	$25, %xmm4, %xmm5
	vprotd	$14, %xmm0, %xmm2
	vprotd	$14, %xmm4, %xmm6
	vpxor	%xmm1, %xmm2, %xmm2
	vpxor	%xmm5, %xmm6, %xmm6
	vpsrld	$3, %xmm0, %xmm8
	vpsrld	$3, %xmm4, %xmm4
	vpxor	%xmm2, %xmm8, %xmm8
	vpxor	%xmm6, %xmm4, %xmm4
	vpaddd	%xmm0, %xmm4, %xmm4
	vpaddd	-16*16(%rax), %xmm8, %xmm3
	vpaddd	sha256d_4preext2_17(%rip), %xmm4, %xmm7
	vmovdqa	%xmm3, 0*16(%rax)
	vmovdqa	%xmm7, 1*16(%rax)
	
	sha256_xop_extend_doubleround 2
	sha256_xop_extend_doubleround 4
	
	vmovdqa	-9*16(%rax), %xmm0
	vprotd	$25, %xmm0, %xmm1
	vprotd	$14, %xmm0, %xmm2
	vpsrld	$3, %xmm0, %xmm8
	vpxor	%xmm1, %xmm2, %xmm2
	vpxor	%xmm2, %xmm8, %xmm8
	vpaddd	sha256d_4preext2_23(%rip), %xmm0, %xmm4
	vpaddd	-10*16(%rax), %xmm8, %xmm0
	vprotd	$15, %xmm3, %xmm1
	vprotd	$15, %xmm7, %xmm5
	vprotd	$13, %xmm3, %xmm2
	vprotd	$13, %xmm7, %xmm6
	vpxor	%xmm1, %xmm2, %xmm2
	vpxor	%xmm5, %xmm6, %xmm6
	vpaddd	-1*16(%rax), %xmm0, %xmm0
	vpaddd	0*16(%rax), %xmm4, %xmm4
	vpsrld	$10, %xmm3, %xmm3
	vpsrld	$10, %xmm7, %xmm7
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpaddd	%xmm0, %xmm3, %xmm3
	vpaddd	%xmm4, %xmm7, %xmm7
	vmovdqa	%xmm3, 6*16(%rax)
	vmovdqa	%xmm7, 7*16(%rax)
	
	vprotd	$15, %xmm3, %xmm1
	vprotd	$15, %xmm7, %xmm5
	vprotd	$13, %xmm3, %xmm2
	vprotd	$13, %xmm7, %xmm6
	vpxor	%xmm1, %xmm2, %xmm2
	vpxor	%xmm5, %xmm6, %xmm6
	vpsrld	$10, %xmm3, %xmm3
	vpsrld	$10, %xmm7, %xmm7
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpaddd	sha256d_4preext2_24(%rip), %xmm3, %xmm3
	vpaddd	1*16(%rax), %xmm3, %xmm3
	vpaddd	2*16(%rax), %xmm7, %xmm7
	vmovdqa	%xmm3, 8*16(%rax)
	vmovdqa	%xmm7, 9*16(%rax)
	
	vprotd	$15, %xmm3, %xmm1
	vprotd	$15, %xmm7, %xmm5
	vprotd	$13, %xmm3, %xmm2
	vprotd	$13, %xmm7, %xmm6
	vpxor	%xmm1, %xmm2, %xmm2
	vpxor	%xmm5, %xmm6, %xmm6
	vpsrld	$10, %xmm3, %xmm3
	vpsrld	$10, %xmm7, %xmm7
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpaddd	3*16(%rax), %xmm3, %xmm3
	vpaddd	4*16(%rax), %xmm7, %xmm7
	vmovdqa	%xmm3, 10*16(%rax)
	vmovdqa	%xmm7, 11*16(%rax)
	
	vprotd	$15, %xmm3, %xmm1
	vprotd	$15, %xmm7, %xmm5
	vprotd	$13, %xmm3, %xmm2
	vprotd	$13, %xmm7, %xmm6
	vpxor	%xmm1, %xmm2, %xmm2
	vpxor	%xmm5, %xmm6, %xmm6
	vpsrld	$10, %xmm3, %xmm3
	vpsrld	$10, %xmm7, %xmm7
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpaddd	5*16(%rax), %xmm3, %xmm3
	vpaddd	6*16(%rax), %xmm7, %xmm7
	vmovdqa	%xmm3, 12*16(%rax)
	vmovdqa	%xmm7, 13*16(%rax)
	
	vmovdqa	sha256d_4preext2_30(%rip), %xmm0
	vmovdqa	0*16(%rax), %xmm4
	vprotd	$25, %xmm4, %xmm5
	vprotd	$14, %xmm4, %xmm6
	vpxor	%xmm5, %xmm6, %xmm6
	vpsrld	$3, %xmm4, %xmm4
	vpxor	%xmm6, %xmm4, %xmm4
	vpaddd	-1*16(%rax), %xmm4, %xmm4
	vprotd	$15, %xmm3, %xmm1
	vprotd	$15, %xmm7, %xmm5
	vprotd	$13, %xmm3, %xmm2
	vprotd	$13, %xmm7, %xmm6
	vpxor	%xmm1, %xmm2, %xmm2
	vpxor	%xmm5, %xmm6, %xmm6
	vpaddd	7*16(%rax), %xmm0, %xmm0
	vpaddd	8*16(%rax), %xmm4, %xmm4
	vpsrld	$10, %xmm3, %xmm3
	vpsrld	$10, %xmm7, %xmm7
	vpxor	%xmm2, %xmm3, %xmm3
	vpxor	%xmm6, %xmm7, %xmm7
	vpaddd	%xmm0, %xmm3, %xmm3
	vpaddd	%xmm4, %xmm7, %xmm7
	vmovdqa	%xmm3, 14*16(%rax)
	vmovdqa	%xmm7, 15*16(%rax)
	
	jmp sha256d_ms_4way_xop_extend_loop2
	
sha256d_ms_4way_xop_extend_coda2:
	sha256_xop_extend_round 44
	
	movdqa	sha256_4h+0(%rip), %xmm7
	movdqa	sha256_4h+16(%rip), %xmm5
	movdqa	sha256_4h+32(%rip), %xmm4
	movdqa	sha256_4h+48(%rip), %xmm3
	movdqa	sha256_4h+64(%rip), %xmm0
	movdqa	sha256_4h+80(%rip), %xmm8
	movdqa	sha256_4h+96(%rip), %xmm9
	movdqa	sha256_4h+112(%rip), %xmm10
	
	movq	%rsp, %rax
	leaq	sha256_4k(%rip), %rcx
	jmp sha256d_ms_4way_xop_main_loop2

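/* XOP form of the reduced main round: vprotd supplies the three rotates
   of Sigma1 directly; only the new e (\r0) is produced. */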
.macro sha256_xop_main_round_red i, r0, r1, r2, r3, r4
	vpaddd	16*\i(%rax), \r0, %xmm6
	vpaddd	16*\i(%rcx), %xmm6, %xmm6
	vpandn	\r1, \r3, %xmm1
	vpand	\r3, \r2, %xmm2
	vpxor	%xmm2, %xmm1, %xmm1
	vpaddd	%xmm1, %xmm6, %xmm6
	vprotd	$26, \r3, %xmm1
	vprotd	$21, \r3, %xmm2
	vpxor	%xmm1, %xmm2, %xmm2
	vprotd	$7, \r3, \r0
	vpxor	%xmm2, \r0, \r0
	vpaddd	\r0, %xmm6, %xmm6
	vpaddd	%xmm6, \r4, \r0
.endm

sha256d_ms_4way_xop_finish:
	sha256_xop_main_round_red 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4
	sha256_xop_main_round_red 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5
	sha256_xop_main_round_red 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7
	sha256_xop_main_round_red 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3
	
	paddd	sha256_4h+112(%rip), %xmm10
	movdqa	%xmm10, 112(%rdi)
	
	addq	$1032, %rsp
#if defined(_WIN64) || defined(__CYGWIN__)
	popq	%rsi
	movdqa	0(%rsp), %xmm6
	movdqa	16(%rsp), %xmm7
	movdqa	32(%rsp), %xmm8
	movdqa	48(%rsp), %xmm9
	movdqa	64(%rsp), %xmm10
	addq	$80, %rsp
	popq	%rdi
#endif
	ret
	
#endif /* USE_XOP */


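/*
 * Runtime dispatcher.  Probes CPUID and installs the best available
 * kernels through the sha256d_ms_4way_addr and
 * sha256_transform_4way_core_addr function pointers.  Returns 0 in eax
 * if the VIA PadLock Hash Engine is present (the scalar PHE transform is
 * installed via sha256_transform_addr instead), 1 otherwise; a C-side
 * prototype of roughly `int sha256_use_4way(void);` is assumed here,
 * judging by the return value.
 */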
	.text
	.p2align 6
	.globl sha256_use_4way
	.globl _sha256_use_4way
sha256_use_4way:
_sha256_use_4way:
	pushq	%rbx
	pushq	%rcx
	pushq	%rdx
	
	/* Check for VIA PadLock Hash Engine */
	movl	$0xc0000000, %eax
	cpuid
	cmpl	$0xc0000001, %eax
	jb	sha256_use_4way_no_phe
	movl	$0xc0000001, %eax
	cpuid
	andl	$0x00000c00, %edx
	cmpl	$0x00000c00, %edx
	jne	sha256_use_4way_no_phe
	leaq	sha256_transform_phe(%rip), %rdx
	movq	%rdx, sha256_transform_addr(%rip)
	xorl	%eax, %eax
	jmp	sha256_use_4way_exit
sha256_use_4way_no_phe:
#if defined(USE_AVX)
	/* Check for AVX and OSXSAVE support */
	movl	$1, %eax
	cpuid
	andl	$0x18000000, %ecx
	cmpl	$0x18000000, %ecx
	jne sha256_use_4way_base
	/* Check for XMM and YMM state support */
	xorl	%ecx, %ecx
	xgetbv
	andl	$0x00000006, %eax
	cmpl	$0x00000006, %eax
	jne sha256_use_4way_base
#if defined(USE_XOP)
	/* Check for XOP support */
	movl	$0x80000001, %eax
	cpuid
	andl	$0x00000800, %ecx
	jz sha256_use_4way_avx
	
sha256_use_4way_xop:
	leaq	sha256d_ms_4way_xop(%rip), %rcx
	leaq	sha256_transform_4way_core_xop(%rip), %rdx
	jmp sha256_use_4way_done
#endif /* USE_XOP */
	
sha256_use_4way_avx:
	leaq	sha256d_ms_4way_avx(%rip), %rcx
	leaq	sha256_transform_4way_core_avx(%rip), %rdx
	jmp sha256_use_4way_done
#endif /* USE_AVX */
	
sha256_use_4way_base:
	leaq	sha256d_ms_4way_sse2(%rip), %rcx
	leaq	sha256_transform_4way_core_sse2(%rip), %rdx
	
sha256_use_4way_done:
	movq	%rcx, sha256d_ms_4way_addr(%rip)
	movq	%rdx, sha256_transform_4way_core_addr(%rip)
	movl	$1, %eax
sha256_use_4way_exit:
	popq	%rdx
	popq	%rcx
	popq	%rbx
	ret


#if defined(USE_AVX2)

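/*
 * 8-way AVX2 variant: identical structure to the 4-way kernels, with ymm
 * registers holding eight 32-bit lanes and all array strides doubled from
 * 16 to 32 bytes.  An rbp frame is kept because the stack is realigned to
 * 128 bytes for the aligned 32-byte vmovdqa stores.
 */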
	.text
	.p2align 6
	.globl sha256d_ms_8way
	.globl _sha256d_ms_8way
sha256d_ms_8way:
_sha256d_ms_8way:
sha256d_ms_8way_avx2:
#if defined(_WIN64) || defined(__CYGWIN__)
	pushq	%rdi
	subq	$80, %rsp
	vmovdqa	%xmm6, 0(%rsp)
	vmovdqa	%xmm7, 16(%rsp)
	vmovdqa	%xmm8, 32(%rsp)
	vmovdqa	%xmm9, 48(%rsp)
	vmovdqa	%xmm10, 64(%rsp)
	pushq	%rsi
	movq	%rcx, %rdi
	movq	%rdx, %rsi
	movq	%r8, %rdx
	movq	%r9, %rcx
#endif
	pushq	%rbp
	movq	%rsp, %rbp
	subq	$64*32, %rsp
	andq	$-128, %rsp
	
	leaq	16*32(%rsi), %rax
	
sha256d_ms_8way_avx2_extend_loop1:
	vmovdqa	3*32(%rsi), %ymm0
	vmovdqa	2*32(%rax), %ymm3
	vmovdqa	3*32(%rax), %ymm7
	vmovdqa	%ymm3, 2*32(%rsp)
	vmovdqa	%ymm7, 3*32(%rsp)
	vpaddd	%ymm0, %ymm7, %ymm7
	vpslld	$14, %ymm0, %ymm2
	vpsrld	$3, %ymm0, %ymm0
	vpsrld	$4, %ymm0, %ymm1
	vpxor	%ymm1, %ymm0, %ymm0
	vpxor	%ymm2, %ymm0, %ymm0
	vpsrld	$11, %ymm1, %ymm1
	vpslld	$11, %ymm2, %ymm2
	vpxor	%ymm1, %ymm0, %ymm0
	vpxor	%ymm2, %ymm0, %ymm0
	vpaddd	%ymm0, %ymm3, %ymm3
	vmovdqa	%ymm3, 2*32(%rax)
	vmovdqa	%ymm7, 3*32(%rax)
	
	vmovdqa	4*32(%rax), %ymm0
	vmovdqa	%ymm0, 4*32(%rsp)
	vpslld	$13, %ymm3, %ymm2
	vpslld	$13, %ymm7, %ymm6
	vpsrld	$10, %ymm3, %ymm3
	vpsrld	$10, %ymm7, %ymm7
	vpsrld	$7, %ymm3, %ymm1
	vpsrld	$7, %ymm7, %ymm5
	vpxor	%ymm1, %ymm3, %ymm3
	vpxor	%ymm5, %ymm7, %ymm7
	vpsrld	$2, %ymm1, %ymm1
	vpsrld	$2, %ymm5, %ymm5
	vpxor	%ymm2, %ymm3, %ymm3
	vpxor	%ymm6, %ymm7, %ymm7
	vpslld	$2, %ymm2, %ymm2
	vpslld	$2, %ymm6, %ymm6
	vpxor	%ymm1, %ymm3, %ymm3
	vpxor	%ymm5, %ymm7, %ymm7
	vpxor	%ymm2, %ymm3, %ymm3
	vpxor	%ymm6, %ymm7, %ymm7
	vpaddd	%ymm0, %ymm3, %ymm3
	vmovdqa	%ymm3, 4*32(%rax)
	vmovdqa	%ymm7, 5*32(%rax)
	
	vmovdqa	6*32(%rax), %ymm0
	vmovdqa	7*32(%rax), %ymm4
	vmovdqa	%ymm0, 6*32(%rsp)
	vmovdqa	%ymm4, 7*32(%rsp)
	vpslld	$13, %ymm3, %ymm2
	vpslld	$13, %ymm7, %ymm6
	vpsrld	$10, %ymm3, %ymm3
	vpsrld	$10, %ymm7, %ymm7
	vpsrld	$7, %ymm3, %ymm1
	vpsrld	$7, %ymm7, %ymm5
	vpxor	%ymm1, %ymm3, %ymm3
	vpxor	%ymm5, %ymm7, %ymm7
	vpsrld	$2, %ymm1, %ymm1
	vpsrld	$2, %ymm5, %ymm5
	vpxor	%ymm2, %ymm3, %ymm3
	vpxor	%ymm6, %ymm7, %ymm7
	vpslld	$2, %ymm2, %ymm2
	vpslld	$2, %ymm6, %ymm6
	vpxor	%ymm1, %ymm3, %ymm3
	vpxor	%ymm5, %ymm7, %ymm7
	vpxor	%ymm2, %ymm3, %ymm3
	vpxor	%ymm6, %ymm7, %ymm7
	vpaddd	%ymm0, %ymm3, %ymm3
	vpaddd	%ymm4, %ymm7, %ymm7
	vmovdqa	%ymm3, 6*32(%rax)
	vmovdqa	%ymm7, 7*32(%rax)
	
	vmovdqa	8*32(%rax), %ymm0
	vmovdqa	2*32(%rax), %ymm4
	vmovdqa	%ymm0, 8*32(%rsp)
	vpslld	$13, %ymm3, %ymm2
	vpslld	$13, %ymm7, %ymm6
	vpsrld	$10, %ymm3, %ymm3
	vpsrld	$10, %ymm7, %ymm7
	vpsrld	$7, %ymm3, %ymm1
	vpsrld	$7, %ymm7, %ymm5
	vpxor	%ymm1, %ymm3, %ymm3
	vpxor	%ymm5, %ymm7, %ymm7
	vpsrld	$2, %ymm1, %ymm1
	vpsrld	$2, %ymm5, %ymm5
	vpxor	%ymm2, %ymm3, %ymm3
	vpxor	%ymm6, %ymm7, %ymm7
	vpslld	$2, %ymm2, %ymm2
	vpslld	$2, %ymm6, %ymm6
	vpxor	%ymm1, %ymm3, %ymm3
	vpxor	%ymm5, %ymm7, %ymm7
	vpxor	%ymm2, %ymm3, %ymm3
	vpxor	%ymm6, %ymm7, %ymm7
	vpaddd	%ymm0, %ymm3, %ymm3
	vpaddd	%ymm4, %ymm7, %ymm7
	vmovdqa	%ymm3, 8*32(%rax)
	vmovdqa	%ymm7, 9*32(%rax)
	
	vpslld	$13, %ymm3, %ymm2
	vpslld	$13, %ymm7, %ymm6
	vpsrld	$10, %ymm3, %ymm3
	vpsrld	$10, %ymm7, %ymm7
	vpsrld	$7, %ymm3, %ymm1
	vpsrld	$7, %ymm7, %ymm5
	vpxor	%ymm1, %ymm3, %ymm3
	vpxor	%ymm5, %ymm7, %ymm7
	vpsrld	$2, %ymm1, %ymm1
	vpsrld	$2, %ymm5, %ymm5
	vpxor	%ymm2, %ymm3, %ymm3
	vpxor	%ymm6, %ymm7, %ymm7
	vpslld	$2, %ymm2, %ymm2
	vpslld	$2, %ymm6, %ymm6
	vpxor	%ymm1, %ymm3, %ymm3
	vpxor	%ymm5, %ymm7, %ymm7
	vpxor	%ymm2, %ymm3, %ymm3
	vpxor	%ymm6, %ymm7, %ymm7
	vpaddd	3*32(%rax), %ymm3, %ymm3
	vpaddd	4*32(%rax), %ymm7, %ymm7
	vmovdqa	%ymm3, 10*32(%rax)
	vmovdqa	%ymm7, 11*32(%rax)
	
	vpslld	$13, %ymm3, %ymm2
	vpslld	$13, %ymm7, %ymm6
	vpsrld	$10, %ymm3, %ymm3
	vpsrld	$10, %ymm7, %ymm7
	vpsrld	$7, %ymm3, %ymm1
	vpsrld	$7, %ymm7, %ymm5
	vpxor	%ymm1, %ymm3, %ymm3
	vpxor	%ymm5, %ymm7, %ymm7
	vpsrld	$2, %ymm1, %ymm1
	vpsrld	$2, %ymm5, %ymm5
	vpxor	%ymm2, %ymm3, %ymm3
	vpxor	%ymm6, %ymm7, %ymm7
	vpslld	$2, %ymm2, %ymm2
	vpslld	$2, %ymm6, %ymm6
	vpxor	%ymm1, %ymm3, %ymm3
	vpxor	%ymm5, %ymm7, %ymm7
	vpxor	%ymm2, %ymm3, %ymm3
	vpxor	%ymm6, %ymm7, %ymm7
	vpaddd	5*32(%rax), %ymm3, %ymm3
	vpaddd	6*32(%rax), %ymm7, %ymm7
	vmovdqa	%ymm3, 12*32(%rax)
	vmovdqa	%ymm7, 13*32(%rax)
	
	vmovdqa	14*32(%rax), %ymm0
	vmovdqa	15*32(%rax), %ymm4
	vmovdqa	%ymm0, 14*32(%rsp)
	vmovdqa	%ymm4, 15*32(%rsp)
	vpslld	$13, %ymm3, %ymm2
	vpslld	$13, %ymm7, %ymm6
	vpsrld	$10, %ymm3, %ymm3
	vpsrld	$10, %ymm7, %ymm7
	vpaddd	7*32(%rax), %ymm0, %ymm0
	vpaddd	8*32(%rax), %ymm4, %ymm4
	vpsrld	$7, %ymm3, %ymm1
	vpsrld	$7, %ymm7, %ymm5
	vpxor	%ymm1, %ymm3, %ymm3
	vpxor	%ymm5, %ymm7, %ymm7
	vpsrld	$2, %ymm1, %ymm1
	vpsrld	$2, %ymm5, %ymm5
	vpxor	%ymm2, %ymm3, %ymm3
	vpxor	%ymm6, %ymm7, %ymm7
	vpslld	$2, %ymm2, %ymm2
	vpslld	$2, %ymm6, %ymm6
	vpxor	%ymm1, %ymm3, %ymm3
	vpxor	%ymm5, %ymm7, %ymm7
	vpxor	%ymm2, %ymm3, %ymm3
	vpxor	%ymm6, %ymm7, %ymm7
	vpaddd	%ymm0, %ymm3, %ymm3
	vpaddd	%ymm4, %ymm7, %ymm7
	vmovdqa	%ymm3, 14*32(%rax)
	vmovdqa	%ymm7, 15*32(%rax)
	
sha256d_ms_8way_avx2_extend_loop2:
	sha256_avx2_extend_doubleround 16
	sha256_avx2_extend_doubleround 18
	sha256_avx2_extend_doubleround 20
	sha256_avx2_extend_doubleround 22
	sha256_avx2_extend_doubleround 24
	sha256_avx2_extend_doubleround 26
	sha256_avx2_extend_doubleround 28
	sha256_avx2_extend_doubleround 30
	sha256_avx2_extend_doubleround 32
	sha256_avx2_extend_doubleround 34
	sha256_avx2_extend_doubleround 36
	sha256_avx2_extend_doubleround 38
	sha256_avx2_extend_doubleround 40
	sha256_avx2_extend_doubleround 42
	jz sha256d_ms_8way_avx2_extend_coda2
	sha256_avx2_extend_doubleround 44
	sha256_avx2_extend_doubleround 46
	
	vmovdqa	0(%rcx), %ymm7
	vmovdqa	32(%rcx), %ymm8
	vmovdqa	64(%rcx), %ymm9
	vmovdqa	96(%rcx), %ymm10
	vmovdqa	128(%rcx), %ymm0
	vmovdqa	160(%rcx), %ymm5
	vmovdqa	192(%rcx), %ymm4
	vmovdqa	224(%rcx), %ymm3
	
	movq	%rsi, %rax
	leaq	sha256_8k(%rip), %rcx
	jmp sha256d_ms_8way_avx2_main_loop1
	
sha256d_ms_8way_avx2_main_loop2:
	sha256_avx2_main_round 0, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7
	sha256_avx2_main_round 1, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3
	sha256_avx2_main_round 2, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4
sha256d_ms_8way_avx2_main_loop1:
	sha256_avx2_main_round 3, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5
	sha256_avx2_main_quadround 4
	sha256_avx2_main_quadround 8
	sha256_avx2_main_quadround 12
	sha256_avx2_main_quadround 16
	sha256_avx2_main_quadround 20
	sha256_avx2_main_quadround 24
	sha256_avx2_main_quadround 28
	sha256_avx2_main_quadround 32
	sha256_avx2_main_quadround 36
	sha256_avx2_main_quadround 40
	sha256_avx2_main_quadround 44
	sha256_avx2_main_quadround 48
	sha256_avx2_main_quadround 52
	sha256_avx2_main_round 56, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7
	jz sha256d_ms_8way_avx2_finish
	sha256_avx2_main_round 57, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3
	sha256_avx2_main_round 58, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4
	sha256_avx2_main_round 59, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5
	sha256_avx2_main_quadround 60
	
	vmovdqa	2*32(%rsp), %ymm1
	vmovdqa	3*32(%rsp), %ymm2
	vmovdqa	4*32(%rsp), %ymm6
	vmovdqa	%ymm1, 18*32(%rsi)
	vmovdqa	%ymm2, 19*32(%rsi)
	vmovdqa	%ymm6, 20*32(%rsi)
	vmovdqa	6*32(%rsp), %ymm1
	vmovdqa	7*32(%rsp), %ymm2
	vmovdqa	8*32(%rsp), %ymm6
	vmovdqa	%ymm1, 22*32(%rsi)
	vmovdqa	%ymm2, 23*32(%rsi)
	vmovdqa	%ymm6, 24*32(%rsi)
	vmovdqa	14*32(%rsp), %ymm1
	vmovdqa	15*32(%rsp), %ymm2
	vmovdqa	%ymm1, 30*32(%rsi)
	vmovdqa	%ymm2, 31*32(%rsi)
	
	vpaddd	0(%rdx), %ymm7, %ymm7
	vpaddd	32(%rdx), %ymm5, %ymm5
	vpaddd	64(%rdx), %ymm4, %ymm4
	vpaddd	96(%rdx), %ymm3, %ymm3
	vpaddd	128(%rdx), %ymm0, %ymm0
	vpaddd	160(%rdx), %ymm8, %ymm8
	vpaddd	192(%rdx), %ymm9, %ymm9
	vpaddd	224(%rdx), %ymm10, %ymm10
	
	vmovdqa	%ymm7, 0(%rsp)
	vmovdqa	%ymm5, 32(%rsp)
	vmovdqa	%ymm4, 64(%rsp)
	vmovdqa	%ymm3, 96(%rsp)
	vmovdqa	%ymm0, 128(%rsp)
	vmovdqa	%ymm8, 160(%rsp)
	vmovdqa	%ymm9, 192(%rsp)
	vmovdqa	%ymm10, 224(%rsp)
	
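	/* Fixed padding block for the second hash, eight lanes wide. */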
	vpxor	%ymm0, %ymm0, %ymm0
	movq	$0x8000000000000100, %rax
	vmovd	%rax, %xmm1
	vinserti128	$1, %xmm1, %ymm1, %ymm1
	vpshufd	$0x55, %ymm1, %ymm2
	vpshufd	$0x00, %ymm1, %ymm1
	vmovdqa	%ymm2, 8*32(%rsp)
	vmovdqa	%ymm0, 9*32(%rsp)
	vmovdqa	%ymm0, 10*32(%rsp)
	vmovdqa	%ymm0, 11*32(%rsp)
	vmovdqa	%ymm0, 12*32(%rsp)
	vmovdqa	%ymm0, 13*32(%rsp)
	vmovdqa	%ymm0, 14*32(%rsp)
	vmovdqa	%ymm1, 15*32(%rsp)
	
	leaq	16*32(%rsp), %rax
	cmpq	%rax, %rax
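	/* Force ZF=1, as in the 4-way kernels, so the shared loops take the
	   coda/finish paths on this second pass. */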
	
	vmovdqa	-15*32(%rax), %ymm0
	vmovdqa	-14*32(%rax), %ymm4
	vpslld	$14, %ymm0, %ymm2
	vpslld	$14, %ymm4, %ymm6
	vpsrld	$3, %ymm0, %ymm8
	vpsrld	$3, %ymm4, %ymm4
	vpsrld	$7, %ymm0, %ymm1
	vpsrld	$4, %ymm4, %ymm5
	vpxor	%ymm1, %ymm8, %ymm8
	vpxor	%ymm5, %ymm4, %ymm4
	vpsrld	$11, %ymm1, %ymm1
	vpsrld	$11, %ymm5, %ymm5
	vpxor	%ymm2, %ymm8, %ymm8
	vpxor	%ymm6, %ymm4, %ymm4
	vpslld	$11, %ymm2, %ymm2
	vpslld	$11, %ymm6, %ymm6
	vpxor	%ymm1, %ymm8, %ymm8
	vpxor	%ymm5, %ymm4, %ymm4
	vpxor	%ymm2, %ymm8, %ymm8
	vpxor	%ymm6, %ymm4, %ymm4
	vpaddd	%ymm0, %ymm4, %ymm4
	vpaddd	-16*32(%rax), %ymm8, %ymm3
	vpaddd	sha256d_8preext2_17(%rip), %ymm4, %ymm7
	vmovdqa	%ymm3, 0*32(%rax)
	vmovdqa	%ymm7, 1*32(%rax)
	
	sha256_avx2_extend_doubleround 2
	sha256_avx2_extend_doubleround 4
	
	vmovdqa	-9*32(%rax), %ymm0
	vpslld	$14, %ymm0, %ymm2
	vpsrld	$3, %ymm0, %ymm8
	vpsrld	$7, %ymm0, %ymm1
	vpxor	%ymm1, %ymm8, %ymm8
	vpxor	%ymm2, %ymm8, %ymm8
	vpsrld	$11, %ymm1, %ymm1
	vpslld	$11, %ymm2, %ymm2
	vpxor	%ymm1, %ymm8, %ymm8
	vpxor	%ymm2, %ymm8, %ymm8
	vpaddd	sha256d_8preext2_23(%rip), %ymm0, %ymm4
	vpaddd	-10*32(%rax), %ymm8, %ymm0
	vpslld	$13, %ymm3, %ymm2
	vpslld	$13, %ymm7, %ymm6
	vpsrld	$10, %ymm3, %ymm3
	vpsrld	$10, %ymm7, %ymm7
	vpaddd	-1*32(%rax), %ymm0, %ymm0
	vpaddd	0*32(%rax), %ymm4, %ymm4
	vpsrld	$7, %ymm3, %ymm1
	vpsrld	$7, %ymm7, %ymm5
	vpxor	%ymm1, %ymm3, %ymm3
	vpxor	%ymm5, %ymm7, %ymm7
	vpsrld	$2, %ymm1, %ymm1
	vpsrld	$2, %ymm5, %ymm5
	vpxor	%ymm2, %ymm3, %ymm3
	vpxor	%ymm6, %ymm7, %ymm7
	vpslld	$2, %ymm2, %ymm2
	vpslld	$2, %ymm6, %ymm6
	vpxor	%ymm1, %ymm3, %ymm3
	vpxor	%ymm5, %ymm7, %ymm7
	vpxor	%ymm2, %ymm3, %ymm3
	vpxor	%ymm6, %ymm7, %ymm7
	vpaddd	%ymm0, %ymm3, %ymm3
	vpaddd	%ymm4, %ymm7, %ymm7
	vmovdqa	%ymm3, 6*32(%rax)
	vmovdqa	%ymm7, 7*32(%rax)
	
	vpslld	$13, %ymm3, %ymm2
	vpslld	$13, %ymm7, %ymm6
	vpsrld	$10, %ymm3, %ymm3
	vpsrld	$10, %ymm7, %ymm7
	vpsrld	$7, %ymm3, %ymm1
	vpsrld	$7, %ymm7, %ymm5
	vpxor	%ymm1, %ymm3, %ymm3
	vpxor	%ymm5, %ymm7, %ymm7
	vpsrld	$2, %ymm1, %ymm1
	vpsrld	$2, %ymm5, %ymm5
	vpxor	%ymm2, %ymm3, %ymm3
	vpxor	%ymm6, %ymm7, %ymm7
	vpslld	$2, %ymm2, %ymm2
	vpslld	$2, %ymm6, %ymm6
	vpxor	%ymm1, %ymm3, %ymm3
	vpxor	%ymm5, %ymm7, %ymm7
	vpxor	%ymm2, %ymm3, %ymm3
	vpxor	%ymm6, %ymm7, %ymm7
	vpaddd	sha256d_8preext2_24(%rip), %ymm3, %ymm3
	vpaddd	1*32(%rax), %ymm3, %ymm3
	vpaddd	2*32(%rax), %ymm7, %ymm7
	vmovdqa	%ymm3, 8*32(%rax)
	vmovdqa	%ymm7, 9*32(%rax)
	
	vpslld	$13, %ymm3, %ymm2
	vpslld	$13, %ymm7, %ymm6
	vpsrld	$10, %ymm3, %ymm3
	vpsrld	$10, %ymm7, %ymm7
	vpsrld	$7, %ymm3, %ymm1
	vpsrld	$7, %ymm7, %ymm5
	vpxor	%ymm1, %ymm3, %ymm3
	vpxor	%ymm5, %ymm7, %ymm7
	vpsrld	$2, %ymm1, %ymm1
	vpsrld	$2, %ymm5, %ymm5
	vpxor	%ymm2, %ymm3, %ymm3
	vpxor	%ymm6, %ymm7, %ymm7
	vpslld	$2, %ymm2, %ymm2
	vpslld	$2, %ymm6, %ymm6
	vpxor	%ymm1, %ymm3, %ymm3
	vpxor	%ymm5, %ymm7, %ymm7
	vpxor	%ymm2, %ymm3, %ymm3
	vpxor	%ymm6, %ymm7, %ymm7
	vpaddd	3*32(%rax), %ymm3, %ymm3
	vpaddd	4*32(%rax), %ymm7, %ymm7
	vmovdqa	%ymm3, 10*32(%rax)
	vmovdqa	%ymm7, 11*32(%rax)
	
	vpslld	$13, %ymm3, %ymm2
	vpslld	$13, %ymm7, %ymm6
	vpsrld	$10, %ymm3, %ymm3
	vpsrld	$10, %ymm7, %ymm7
	vpsrld	$7, %ymm3, %ymm1
	vpsrld	$7, %ymm7, %ymm5
	vpxor	%ymm1, %ymm3, %ymm3
	vpxor	%ymm5, %ymm7, %ymm7
	vpsrld	$2, %ymm1, %ymm1
	vpsrld	$2, %ymm5, %ymm5
	vpxor	%ymm2, %ymm3, %ymm3
	vpxor	%ymm6, %ymm7, %ymm7
	vpslld	$2, %ymm2, %ymm2
	vpslld	$2, %ymm6, %ymm6
	vpxor	%ymm1, %ymm3, %ymm3
	vpxor	%ymm5, %ymm7, %ymm7
	vpxor	%ymm2, %ymm3, %ymm3
	vpxor	%ymm6, %ymm7, %ymm7
	vpaddd	5*32(%rax), %ymm3, %ymm3
	vpaddd	6*32(%rax), %ymm7, %ymm7
	vmovdqa	%ymm3, 12*32(%rax)
	vmovdqa	%ymm7, 13*32(%rax)
	
	vmovdqa	sha256d_8preext2_30(%rip), %ymm0
	vmovdqa	0*32(%rax), %ymm4
	vpslld	$14, %ymm4, %ymm6
	vpsrld	$3, %ymm4, %ymm4
	vpsrld	$4, %ymm4, %ymm5
	vpxor	%ymm5, %ymm4, %ymm4
	vpxor	%ymm6, %ymm4, %ymm4
	vpsrld	$11, %ymm5, %ymm5
	vpslld	$11, %ymm6, %ymm6
	vpxor	%ymm5, %ymm4, %ymm4
	vpxor	%ymm6, %ymm4, %ymm4
	vpaddd	-1*32(%rax), %ymm4, %ymm4
	vpslld	$13, %ymm3, %ymm2
	vpslld	$13, %ymm7, %ymm6
	vpsrld	$10, %ymm3, %ymm3
	vpsrld	$10, %ymm7, %ymm7
	vpaddd	7*32(%rax), %ymm0, %ymm0
	vpaddd	8*32(%rax), %ymm4, %ymm4
	vpsrld	$7, %ymm3, %ymm1
	vpsrld	$7, %ymm7, %ymm5
	vpxor	%ymm1, %ymm3, %ymm3
	vpxor	%ymm5, %ymm7, %ymm7
	vpsrld	$2, %ymm1, %ymm1
	vpsrld	$2, %ymm5, %ymm5
	vpxor	%ymm2, %ymm3, %ymm3
	vpxor	%ymm6, %ymm7, %ymm7
	vpslld	$2, %ymm2, %ymm2
	vpslld	$2, %ymm6, %ymm6
	vpxor	%ymm1, %ymm3, %ymm3
	vpxor	%ymm5, %ymm7, %ymm7
	vpxor	%ymm2, %ymm3, %ymm3
	vpxor	%ymm6, %ymm7, %ymm7
	vpaddd	%ymm0, %ymm3, %ymm3
	vpaddd	%ymm4, %ymm7, %ymm7
	vmovdqa	%ymm3, 14*32(%rax)
	vmovdqa	%ymm7, 15*32(%rax)
	
	jmp sha256d_ms_8way_avx2_extend_loop2
	
sha256d_ms_8way_avx2_extend_coda2:
	sha256_avx2_extend_round 44
	
	vmovdqa	sha256_8h+0(%rip), %ymm7
	vmovdqa	sha256_8h+32(%rip), %ymm5
	vmovdqa	sha256_8h+64(%rip), %ymm4
	vmovdqa	sha256_8h+96(%rip), %ymm3
	vmovdqa	sha256_8h+128(%rip), %ymm0
	vmovdqa	sha256_8h+160(%rip), %ymm8
	vmovdqa	sha256_8h+192(%rip), %ymm9
	vmovdqa	sha256_8h+224(%rip), %ymm10
	
	movq	%rsp, %rax
	leaq	sha256_8k(%rip), %rcx
	jmp sha256d_ms_8way_avx2_main_loop2

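/* AVX2 form of the reduced main round, eight lanes wide. */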
.macro sha256_avx2_main_round_red i, r0, r1, r2, r3, r4
	vpaddd	32*\i(%rax), \r0, %ymm6
	vpaddd	32*\i(%rcx), %ymm6, %ymm6
	vpandn	\r1, \r3, %ymm1
	vpand	\r3, \r2, %ymm2
	vpxor	%ymm2, %ymm1, %ymm1
	vpaddd	%ymm1, %ymm6, %ymm6
	vpslld	$7, \r3, %ymm1
	vpsrld	$6, \r3, \r0
	vpsrld	$5, \r0, %ymm2
	vpxor	%ymm1, \r0, \r0
	vpxor	%ymm2, \r0, \r0
	vpslld	$14, %ymm1, %ymm1
	vpsrld	$14, %ymm2, %ymm2
	vpxor	%ymm1, \r0, \r0
	vpxor	%ymm2, \r0, \r0
	vpslld	$5, %ymm1, %ymm1
	vpxor	%ymm1, \r0, \r0
	vpaddd	\r0, %ymm6, %ymm6
	vpaddd	%ymm6, \r4, \r0
.endm

sha256d_ms_8way_avx2_finish:
	sha256_avx2_main_round_red 57, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4
	sha256_avx2_main_round_red 58, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5
	sha256_avx2_main_round_red 59, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7
	sha256_avx2_main_round_red 60, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3
	
	vpaddd	sha256_8h+224(%rip), %ymm10, %ymm10
	vmovdqa	%ymm10, 224(%rdi)
	
	movq	%rbp, %rsp
	popq	%rbp
#if defined(_WIN64) || defined(__CYGWIN__)
	popq	%rsi
	vmovdqa	0(%rsp), %xmm6
	vmovdqa	16(%rsp), %xmm7
	vmovdqa	32(%rsp), %xmm8
	vmovdqa	48(%rsp), %xmm9
	vmovdqa	64(%rsp), %xmm10
	addq	$80, %rsp
	popq	%rdi
#endif
	ret


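/*
 * Reports whether the 8-way AVX2 path is usable: requires the AVX and
 * OSXSAVE CPUID bits, the AVX2 feature bit (leaf 7, EBX bit 5), and
 * XMM/YMM state enabled in XCR0.  Returns 1 or 0 in eax; unlike
 * sha256_use_4way it installs no function pointers.
 */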
	.text
	.p2align 6
	.globl sha256_use_8way
	.globl _sha256_use_8way
sha256_use_8way:
_sha256_use_8way:
	pushq	%rbx
	
	/* Check for AVX and OSXSAVE support */
	movl	$1, %eax
	cpuid
	andl	$0x18000000, %ecx
	cmpl	$0x18000000, %ecx
	jne sha256_use_8way_no
	/* Check for AVX2 support */
	movl	$7, %eax
	xorl	%ecx, %ecx
	cpuid
	andl	$0x00000020, %ebx
	cmpl	$0x00000020, %ebx
	jne sha256_use_8way_no
	/* Check for XMM and YMM state support */
	xorl	%ecx, %ecx
	xgetbv
	andl	$0x00000006, %eax
	cmpl	$0x00000006, %eax
	jne sha256_use_8way_no
	
sha256_use_8way_yes:
	movl	$1, %eax
	jmp sha256_use_8way_done
	
sha256_use_8way_no:
	xorl	%eax, %eax
	
sha256_use_8way_done:
	popq	%rbx
	ret

#endif /* USE_AVX2 */

#endif
0707010000002B000081A4000003E800000064000000015EF4BCA100006795000000000000000000000000000000000000001A00000000cpuminer-2.5.1/sha2-x86.S
/*
 * Copyright 2012 pooler@litecoinpool.org
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.  See COPYING for more details.
 */

#include "cpuminer-config.h"

#if defined(__linux__) && defined(__ELF__)
	.section .note.GNU-stack,"",%progbits
#endif

#if defined(USE_ASM) && defined(__i386__)

	.data
	.p2align 7
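/* SHA-256 initial state H0-H7, each word repeated across the four lanes. */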
sha256_4h:
	.long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
	.long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
	.long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
	.long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
	.long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
	.long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
	.long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
	.long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19

	.data
	.p2align 7
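/* The 64 SHA-256 round constants K[0..63], each broadcast four-wide. */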
sha256_4k:
	.long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98
	.long 0x71374491, 0x71374491, 0x71374491, 0x71374491
	.long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf
	.long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5
	.long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b
	.long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1
	.long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4
	.long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5
	.long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98
	.long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01
	.long 0x243185be, 0x243185be, 0x243185be, 0x243185be
	.long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3
	.long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74
	.long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe
	.long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7
	.long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174
	.long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1
	.long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786
	.long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6
	.long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc
	.long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f
	.long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa
	.long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc
	.long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da
	.long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152
	.long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d
	.long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8
	.long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7
	.long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3
	.long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147
	.long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351
	.long 0x14292967, 0x14292967, 0x14292967, 0x14292967
	.long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85
	.long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138
	.long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc
	.long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13
	.long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354
	.long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb
	.long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e
	.long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85
	.long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1
	.long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b
	.long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70
	.long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3
	.long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819
	.long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624
	.long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585
	.long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070
	.long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116
	.long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08
	.long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c
	.long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5
	.long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3
	.long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a
	.long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f
	.long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3
	.long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee
	.long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f
	.long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814
	.long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208
	.long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa
	.long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb
	.long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7
	.long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2

	.data
	.p2align 6
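/* Precomputed message-schedule terms for the second hash of sha256d:
   words 8-15 of its single block are fixed padding (0x80000000, zeros,
   bit length 0x100), so, judging by the labels and values, the constant
   contributions they make to schedule words w[15], w[17], w[23], w[24]
   and w[30] are folded in up front. */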
sha256d_4preext2_15:
	.long 0x00000100, 0x00000100, 0x00000100, 0x00000100
sha256d_4preext2_17:
	.long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000
sha256d_4preext2_23:
	.long 0x11002000, 0x11002000, 0x11002000, 0x11002000
sha256d_4preext2_24:
	.long 0x80000000, 0x80000000, 0x80000000, 0x80000000
sha256d_4preext2_30:
	.long 0x00400022, 0x00400022, 0x00400022, 0x00400022


	.text
	.p2align 5
	.globl sha256_init_4way
	.globl _sha256_init_4way
sha256_init_4way:
_sha256_init_4way:
	movl	4(%esp), %edx
	movdqa	sha256_4h+0, %xmm0
	movdqa	sha256_4h+16, %xmm1
	movdqa	sha256_4h+32, %xmm2
	movdqa	sha256_4h+48, %xmm3
	movdqu	%xmm0, 0(%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqa	sha256_4h+64, %xmm0
	movdqa	sha256_4h+80, %xmm1
	movdqa	sha256_4h+96, %xmm2
	movdqa	sha256_4h+112, %xmm3
	movdqu	%xmm0, 64(%edx)
	movdqu	%xmm1, 80(%edx)
	movdqu	%xmm2, 96(%edx)
	movdqu	%xmm3, 112(%edx)
	ret


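/*
 * One message-schedule round for four lanes:
 *
 *   w[i] = sigma1(w[i-2]) + w[i-7] + sigma0(w[i-15]) + w[i-16]
 *
 * with sigma0(x) = ROTR(x,7) ^ ROTR(x,18) ^ (x >> 3) and
 * sigma1(x) = ROTR(x,17) ^ ROTR(x,19) ^ (x >> 10).  SSE2 has no vector
 * rotate, so each ROTR is built from two shifts whose pieces are folded
 * together with pxor; w[i-2] is expected in xmm3, and the new w[i] is
 * left there for the next round.
 */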
.macro sha256_sse2_extend_round i
	movdqa	(\i-15)*16(%eax), %xmm0
	movdqa	%xmm0, %xmm2
	psrld	$3, %xmm0
	movdqa	%xmm0, %xmm1
	pslld	$14, %xmm2
	psrld	$4, %xmm1
	pxor	%xmm1, %xmm0
	pxor	%xmm2, %xmm0
	psrld	$11, %xmm1
	pslld	$11, %xmm2
	pxor	%xmm1, %xmm0
	pxor	%xmm2, %xmm0
	paddd	(\i-16)*16(%eax), %xmm0
	paddd	(\i-7)*16(%eax), %xmm0

	movdqa	%xmm3, %xmm2
	psrld	$10, %xmm3
	pslld	$13, %xmm2
	movdqa	%xmm3, %xmm1
	psrld	$7, %xmm1
	pxor	%xmm1, %xmm3
	pxor	%xmm2, %xmm3
	psrld	$2, %xmm1
	pslld	$2, %xmm2
	pxor	%xmm1, %xmm3
	pxor	%xmm2, %xmm3
	paddd	%xmm0, %xmm3
	movdqa	%xmm3, \i*16(%eax)
.endm

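/* Two consecutive schedule rounds with their instruction streams
   interleaved to hide latencies; produces w[i] and w[i+1], left in
   xmm3 and xmm7. */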
.macro sha256_sse2_extend_doubleround i
	movdqa	(\i-15)*16(%eax), %xmm0
	movdqa	(\i-14)*16(%eax), %xmm4
	movdqa	%xmm0, %xmm2
	movdqa	%xmm4, %xmm6
	psrld	$3, %xmm0
	psrld	$3, %xmm4
	movdqa	%xmm0, %xmm1
	movdqa	%xmm4, %xmm5
	pslld	$14, %xmm2
	pslld	$14, %xmm6
	psrld	$4, %xmm1
	psrld	$4, %xmm5
	pxor	%xmm1, %xmm0
	pxor	%xmm5, %xmm4
	psrld	$11, %xmm1
	psrld	$11, %xmm5
	pxor	%xmm2, %xmm0
	pxor	%xmm6, %xmm4
	pslld	$11, %xmm2
	pslld	$11, %xmm6
	pxor	%xmm1, %xmm0
	pxor	%xmm5, %xmm4
	pxor	%xmm2, %xmm0
	pxor	%xmm6, %xmm4

	paddd	(\i-16)*16(%eax), %xmm0
	paddd	(\i-15)*16(%eax), %xmm4

	movdqa	%xmm3, %xmm2
	movdqa	%xmm7, %xmm6
	psrld	$10, %xmm3
	psrld	$10, %xmm7
	movdqa	%xmm3, %xmm1
	movdqa	%xmm7, %xmm5
	pslld	$13, %xmm2
	pslld	$13, %xmm6
	psrld	$7, %xmm1
	psrld	$7, %xmm5

	paddd	(\i-7)*16(%eax), %xmm0
	paddd	(\i-6)*16(%eax), %xmm4

	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	psrld	$2, %xmm1
	psrld	$2, %xmm5
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	pslld	$2, %xmm2
	pslld	$2, %xmm6
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7

	paddd	%xmm0, %xmm3
	paddd	%xmm4, %xmm7
	movdqa	%xmm3, \i*16(%eax)
	movdqa	%xmm7, (\i+1)*16(%eax)
.endm

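/*
 * One main round for four lanes.  Registers are too scarce on i386 to
 * hold all eight state words, so e..h rotate through xmm0 and the three
 * stack slots at 0/16/32(%esp), while a..d rotate through
 * xmm7/xmm5/xmm4/xmm3; each round renames the registers instead of
 * moving data where it can, and the round constant is added straight
 * from the sha256_4k table.
 */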
.macro sha256_sse2_main_round i
	movdqa	16*(\i)(%eax), %xmm6

	movdqa	%xmm0, %xmm1
	movdqa	16(%esp), %xmm2
	pandn	%xmm2, %xmm1
	paddd	32(%esp), %xmm6

	movdqa	%xmm2, 32(%esp)
	movdqa	0(%esp), %xmm2
	movdqa	%xmm2, 16(%esp)

	pand	%xmm0, %xmm2
	pxor	%xmm2, %xmm1
	movdqa	%xmm0, 0(%esp)

	paddd	%xmm1, %xmm6

	movdqa	%xmm0, %xmm1
	psrld	$6, %xmm0
	paddd	16*(\i)+sha256_4k, %xmm6
	movdqa	%xmm0, %xmm2
	pslld	$7, %xmm1
	psrld	$5, %xmm2
	pxor	%xmm1, %xmm0
	pxor	%xmm2, %xmm0
	pslld	$14, %xmm1
	psrld	$14, %xmm2
	pxor	%xmm1, %xmm0
	pslld	$5, %xmm1
	pxor	%xmm2, %xmm0
	pxor	%xmm1, %xmm0
	movdqa	%xmm5, %xmm1
	paddd	%xmm0, %xmm6

	movdqa	%xmm3, %xmm0
	movdqa	%xmm4, %xmm3
	movdqa	%xmm4, %xmm2
	paddd	%xmm6, %xmm0
	pand	%xmm5, %xmm2
	pand	%xmm7, %xmm1
	pand	%xmm7, %xmm4
	pxor	%xmm4, %xmm1
	movdqa	%xmm5, %xmm4
	movdqa	%xmm7, %xmm5
	pxor	%xmm2, %xmm1
	paddd	%xmm1, %xmm6

	movdqa	%xmm7, %xmm2
	psrld	$2, %xmm7
	movdqa	%xmm7, %xmm1
	pslld	$10, %xmm2
	psrld	$11, %xmm1
	pxor	%xmm2, %xmm7
	pslld	$9, %xmm2
	pxor	%xmm1, %xmm7
	psrld	$9, %xmm1
	pxor	%xmm2, %xmm7
	pslld	$11, %xmm2
	pxor	%xmm1, %xmm7
	pxor	%xmm2, %xmm7
	paddd	%xmm6, %xmm7
.endm

.macro sha256_sse2_main_quadround i
	sha256_sse2_main_round \i+0
	sha256_sse2_main_round \i+1
	sha256_sse2_main_round \i+2
	sha256_sse2_main_round \i+3
.endm


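/* Byte-swap two 16-byte input blocks without SSSE3: pshuflw/pshufhw
   $0xb1 swap the 16-bit halves of every dword, then the 8-bit
   shift/xor pair swaps the bytes inside each half. */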
.macro p2bswap_esi_esp i
	movdqu	\i*16(%esi), %xmm0
	movdqu	(\i+1)*16(%esi), %xmm2
	pshuflw	$0xb1, %xmm0, %xmm0
	pshuflw	$0xb1, %xmm2, %xmm2
	pshufhw	$0xb1, %xmm0, %xmm0
	pshufhw	$0xb1, %xmm2, %xmm2
	movdqa	%xmm0, %xmm1
	movdqa	%xmm2, %xmm3
	psrlw	$8, %xmm1
	psrlw	$8, %xmm3
	psllw	$8, %xmm0
	psllw	$8, %xmm2
	pxor	%xmm1, %xmm0
	pxor	%xmm3, %xmm2
	movdqa	%xmm0, (\i+3)*16(%esp)
	movdqa	%xmm2, (\i+4)*16(%esp)
.endm

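/*
 * void sha256_transform_4way(uint32_t *state, const uint32_t *data,
 *                            int swap);
 * -- prototype assumed from the cdecl stack accesses below.  Processes
 * one 64-byte block per lane for four lanes; a nonzero swap converts
 * the input from big-endian first.
 */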
	.text
	.p2align 5
	.globl sha256_transform_4way
	.globl _sha256_transform_4way
sha256_transform_4way:
_sha256_transform_4way:
	pushl	%edi
	pushl	%esi
	movl	12(%esp), %edi
	movl	16(%esp), %esi
	movl	20(%esp), %ecx
	movl	%esp, %edx
	subl	$67*16, %esp
	andl	$-128, %esp
	
	testl	%ecx, %ecx
	jnz sha256_transform_4way_swap
	
	movdqu	0*16(%esi), %xmm0
	movdqu	1*16(%esi), %xmm1
	movdqu	2*16(%esi), %xmm2
	movdqu	3*16(%esi), %xmm3
	movdqu	4*16(%esi), %xmm4
	movdqu	5*16(%esi), %xmm5
	movdqu	6*16(%esi), %xmm6
	movdqu	7*16(%esi), %xmm7
	movdqa	%xmm0, 3*16(%esp)
	movdqa	%xmm1, 4*16(%esp)
	movdqa	%xmm2, 5*16(%esp)
	movdqa	%xmm3, 6*16(%esp)
	movdqa	%xmm4, 7*16(%esp)
	movdqa	%xmm5, 8*16(%esp)
	movdqa	%xmm6, 9*16(%esp)
	movdqa	%xmm7, 10*16(%esp)
	movdqu	8*16(%esi), %xmm0
	movdqu	9*16(%esi), %xmm1
	movdqu	10*16(%esi), %xmm2
	movdqu	11*16(%esi), %xmm3
	movdqu	12*16(%esi), %xmm4
	movdqu	13*16(%esi), %xmm5
	movdqu	14*16(%esi), %xmm6
	movdqu	15*16(%esi), %xmm7
	movdqa	%xmm0, 11*16(%esp)
	movdqa	%xmm1, 12*16(%esp)
	movdqa	%xmm2, 13*16(%esp)
	movdqa	%xmm3, 14*16(%esp)
	movdqa	%xmm4, 15*16(%esp)
	movdqa	%xmm5, 16*16(%esp)
	movdqa	%xmm6, 17*16(%esp)
	movdqa	%xmm7, 18*16(%esp)
	jmp sha256_transform_4way_extend
	
	.p2align 5
sha256_transform_4way_swap:
	p2bswap_esi_esp 0
	p2bswap_esi_esp 2
	p2bswap_esi_esp 4
	p2bswap_esi_esp 6
	p2bswap_esi_esp 8
	p2bswap_esi_esp 10
	p2bswap_esi_esp 12
	p2bswap_esi_esp 14
	
sha256_transform_4way_extend:
	leal	19*16(%esp), %ecx
	leal	48*16(%ecx), %eax
	movdqa	-2*16(%ecx), %xmm3
	movdqa	-1*16(%ecx), %xmm7
sha256_transform_4way_extend_loop:
	movdqa	-15*16(%ecx), %xmm0
	movdqa	-14*16(%ecx), %xmm4
	movdqa	%xmm0, %xmm2
	movdqa	%xmm4, %xmm6
	psrld	$3, %xmm0
	psrld	$3, %xmm4
	movdqa	%xmm0, %xmm1
	movdqa	%xmm4, %xmm5
	pslld	$14, %xmm2
	pslld	$14, %xmm6
	psrld	$4, %xmm1
	psrld	$4, %xmm5
	pxor	%xmm1, %xmm0
	pxor	%xmm5, %xmm4
	psrld	$11, %xmm1
	psrld	$11, %xmm5
	pxor	%xmm2, %xmm0
	pxor	%xmm6, %xmm4
	pslld	$11, %xmm2
	pslld	$11, %xmm6
	pxor	%xmm1, %xmm0
	pxor	%xmm5, %xmm4
	pxor	%xmm2, %xmm0
	pxor	%xmm6, %xmm4

	paddd	-16*16(%ecx), %xmm0
	paddd	-15*16(%ecx), %xmm4

	movdqa	%xmm3, %xmm2
	movdqa	%xmm7, %xmm6
	psrld	$10, %xmm3
	psrld	$10, %xmm7
	movdqa	%xmm3, %xmm1
	movdqa	%xmm7, %xmm5
	pslld	$13, %xmm2
	pslld	$13, %xmm6
	psrld	$7, %xmm1
	psrld	$7, %xmm5

	paddd	-7*16(%ecx), %xmm0
	paddd	-6*16(%ecx), %xmm4

	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	psrld	$2, %xmm1
	psrld	$2, %xmm5
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	pslld	$2, %xmm2
	pslld	$2, %xmm6
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7

	paddd	%xmm0, %xmm3
	paddd	%xmm4, %xmm7
	movdqa	%xmm3, (%ecx)
	movdqa	%xmm7, 16(%ecx)
	addl	$2*16, %ecx
	cmpl	%ecx, %eax
	jne sha256_transform_4way_extend_loop
	
	movdqu	0(%edi), %xmm7
	movdqu	16(%edi), %xmm5
	movdqu	32(%edi), %xmm4
	movdqu	48(%edi), %xmm3
	movdqu	64(%edi), %xmm0
	movdqu	80(%edi), %xmm1
	movdqu	96(%edi), %xmm2
	movdqu	112(%edi), %xmm6
	movdqa	%xmm1, 0(%esp)
	movdqa	%xmm2, 16(%esp)
	movdqa	%xmm6, 32(%esp)
	
	xorl	%eax, %eax
sha256_transform_4way_main_loop:
	movdqa	3*16(%esp, %eax), %xmm6
	paddd	sha256_4k(%eax), %xmm6
	paddd	32(%esp), %xmm6

	movdqa	%xmm0, %xmm1
	movdqa	16(%esp), %xmm2
	pandn	%xmm2, %xmm1

	movdqa	%xmm2, 32(%esp)
	movdqa	0(%esp), %xmm2
	movdqa	%xmm2, 16(%esp)

	pand	%xmm0, %xmm2
	pxor	%xmm2, %xmm1
	movdqa	%xmm0, 0(%esp)

	paddd	%xmm1, %xmm6

	movdqa	%xmm0, %xmm1
	psrld	$6, %xmm0
	movdqa	%xmm0, %xmm2
	pslld	$7, %xmm1
	psrld	$5, %xmm2
	pxor	%xmm1, %xmm0
	pxor	%xmm2, %xmm0
	pslld	$14, %xmm1
	psrld	$14, %xmm2
	pxor	%xmm1, %xmm0
	pxor	%xmm2, %xmm0
	pslld	$5, %xmm1
	pxor	%xmm1, %xmm0
	paddd	%xmm0, %xmm6

	movdqa	%xmm3, %xmm0
	paddd	%xmm6, %xmm0

	movdqa	%xmm5, %xmm1
	movdqa	%xmm4, %xmm3
	movdqa	%xmm4, %xmm2
	pand	%xmm5, %xmm2
	pand	%xmm7, %xmm4
	pand	%xmm7, %xmm1
	pxor	%xmm4, %xmm1
	movdqa	%xmm5, %xmm4
	movdqa	%xmm7, %xmm5
	pxor	%xmm2, %xmm1
	paddd	%xmm1, %xmm6

	movdqa	%xmm7, %xmm2
	psrld	$2, %xmm7
	movdqa	%xmm7, %xmm1
	pslld	$10, %xmm2
	psrld	$11, %xmm1
	pxor	%xmm2, %xmm7
	pxor	%xmm1, %xmm7
	pslld	$9, %xmm2
	psrld	$9, %xmm1
	pxor	%xmm2, %xmm7
	pxor	%xmm1, %xmm7
	pslld	$11, %xmm2
	pxor	%xmm2, %xmm7
	paddd	%xmm6, %xmm7
	
	addl	$16, %eax
	cmpl	$16*64, %eax
	jne sha256_transform_4way_main_loop
	
	movdqu	0(%edi), %xmm1
	movdqu	16(%edi), %xmm2
	paddd	%xmm1, %xmm7
	paddd	%xmm2, %xmm5
	movdqu	32(%edi), %xmm1
	movdqu	48(%edi), %xmm2
	paddd	%xmm1, %xmm4
	paddd	%xmm2, %xmm3
	
	movdqu	%xmm7, 0(%edi)
	movdqu	%xmm5, 16(%edi)
	movdqu	%xmm4, 32(%edi)
	movdqu	%xmm3, 48(%edi)
	
	movdqu	64(%edi), %xmm1
	movdqu	80(%edi), %xmm2
	movdqu	96(%edi), %xmm6
	movdqu	112(%edi), %xmm7
	paddd	%xmm1, %xmm0
	paddd	0(%esp), %xmm2
	paddd	16(%esp), %xmm6
	paddd	32(%esp), %xmm7
	
	movdqu	%xmm0, 64(%edi)
	movdqu	%xmm2, 80(%edi)
	movdqu	%xmm6, 96(%edi)
	movdqu	%xmm7, 112(%edi)
	
	movl	%edx, %esp
	popl	%esi
	popl	%edi
	ret


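/*
 * sha256d_ms ("midstate") computes a specialized double SHA-256 for
 * nonce scanning, reusing schedule words and partial rounds that do not
 * depend on the changing input words.  The four cdecl arguments below
 * are, judging by their use, the output hash, the expanded data buffer,
 * the midstate, and a precomputed round state.
 */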
	.text
	.p2align 5
	.globl sha256d_ms_4way
	.globl _sha256d_ms_4way
sha256d_ms_4way:
_sha256d_ms_4way:
	pushl	%edi
	pushl	%esi
	pushl	%ebp
	movl	16(%esp), %edi
	movl	20(%esp), %esi
	movl	24(%esp), %edx
	movl	28(%esp), %ecx
	movl	%esp, %ebp
	subl	$67*16, %esp
	andl	$-128, %esp
	
	leal	256(%esi), %eax
	
sha256d_ms_4way_extend_loop1:
	movdqa	3*16(%esi), %xmm0
	movdqa	2*16(%eax), %xmm3
	movdqa	3*16(%eax), %xmm7
	movdqa	%xmm3, 5*16(%esp)
	movdqa	%xmm7, 6*16(%esp)
	movdqa	%xmm0, %xmm2
	paddd	%xmm0, %xmm7
	psrld	$3, %xmm0
	movdqa	%xmm0, %xmm1
	pslld	$14, %xmm2
	psrld	$4, %xmm1
	pxor	%xmm1, %xmm0
	pxor	%xmm2, %xmm0
	psrld	$11, %xmm1
	pslld	$11, %xmm2
	pxor	%xmm1, %xmm0
	pxor	%xmm2, %xmm0
	paddd	%xmm0, %xmm3
	movdqa	%xmm3, 2*16(%eax)
	movdqa	%xmm7, 3*16(%eax)
	
	movdqa	4*16(%eax), %xmm0
	movdqa	%xmm0, 7*16(%esp)
	movdqa	%xmm3, %xmm2
	movdqa	%xmm7, %xmm6
	psrld	$10, %xmm3
	psrld	$10, %xmm7
	movdqa	%xmm3, %xmm1
	movdqa	%xmm7, %xmm5
	pslld	$13, %xmm2
	pslld	$13, %xmm6
	psrld	$7, %xmm1
	psrld	$7, %xmm5
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	psrld	$2, %xmm1
	psrld	$2, %xmm5
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	pslld	$2, %xmm2
	pslld	$2, %xmm6
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	paddd	%xmm0, %xmm3
	movdqa	%xmm3, 4*16(%eax)
	movdqa	%xmm7, 5*16(%eax)
	
	movdqa	6*16(%eax), %xmm0
	movdqa	7*16(%eax), %xmm4
	movdqa	%xmm0, 9*16(%esp)
	movdqa	%xmm4, 10*16(%esp)
	movdqa	%xmm3, %xmm2
	movdqa	%xmm7, %xmm6
	psrld	$10, %xmm3
	psrld	$10, %xmm7
	movdqa	%xmm3, %xmm1
	movdqa	%xmm7, %xmm5
	pslld	$13, %xmm2
	pslld	$13, %xmm6
	psrld	$7, %xmm1
	psrld	$7, %xmm5
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	psrld	$2, %xmm1
	psrld	$2, %xmm5
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	pslld	$2, %xmm2
	pslld	$2, %xmm6
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	paddd	%xmm0, %xmm3
	paddd	%xmm4, %xmm7
	movdqa	%xmm3, 6*16(%eax)
	movdqa	%xmm7, 7*16(%eax)
	
	movdqa	8*16(%eax), %xmm0
	movdqa	2*16(%eax), %xmm4
	movdqa	%xmm0, 11*16(%esp)
	movdqa	%xmm3, %xmm2
	movdqa	%xmm7, %xmm6
	psrld	$10, %xmm3
	psrld	$10, %xmm7
	movdqa	%xmm3, %xmm1
	movdqa	%xmm7, %xmm5
	pslld	$13, %xmm2
	pslld	$13, %xmm6
	psrld	$7, %xmm1
	psrld	$7, %xmm5
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	psrld	$2, %xmm1
	psrld	$2, %xmm5
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	pslld	$2, %xmm2
	pslld	$2, %xmm6
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	paddd	%xmm0, %xmm3
	paddd	%xmm4, %xmm7
	movdqa	%xmm3, 8*16(%eax)
	movdqa	%xmm7, 9*16(%eax)
	
	movdqa	%xmm3, %xmm2
	movdqa	%xmm7, %xmm6
	psrld	$10, %xmm3
	psrld	$10, %xmm7
	movdqa	%xmm3, %xmm1
	movdqa	%xmm7, %xmm5
	pslld	$13, %xmm2
	pslld	$13, %xmm6
	psrld	$7, %xmm1
	psrld	$7, %xmm5
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	psrld	$2, %xmm1
	psrld	$2, %xmm5
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	pslld	$2, %xmm2
	pslld	$2, %xmm6
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	paddd	3*16(%eax), %xmm3
	paddd	4*16(%eax), %xmm7
	movdqa	%xmm3, 10*16(%eax)
	movdqa	%xmm7, 11*16(%eax)
	
	movdqa	%xmm3, %xmm2
	movdqa	%xmm7, %xmm6
	psrld	$10, %xmm3
	psrld	$10, %xmm7
	movdqa	%xmm3, %xmm1
	movdqa	%xmm7, %xmm5
	pslld	$13, %xmm2
	pslld	$13, %xmm6
	psrld	$7, %xmm1
	psrld	$7, %xmm5
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	psrld	$2, %xmm1
	psrld	$2, %xmm5
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	pslld	$2, %xmm2
	pslld	$2, %xmm6
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	paddd	5*16(%eax), %xmm3
	paddd	6*16(%eax), %xmm7
	movdqa	%xmm3, 12*16(%eax)
	movdqa	%xmm7, 13*16(%eax)
	
	movdqa	14*16(%eax), %xmm0
	movdqa	15*16(%eax), %xmm4
	movdqa	%xmm0, 17*16(%esp)
	movdqa	%xmm4, 18*16(%esp)
	movdqa	%xmm3, %xmm2
	movdqa	%xmm7, %xmm6
	psrld	$10, %xmm3
	psrld	$10, %xmm7
	movdqa	%xmm3, %xmm1
	movdqa	%xmm7, %xmm5
	paddd	7*16(%eax), %xmm0
	paddd	8*16(%eax), %xmm4
	pslld	$13, %xmm2
	pslld	$13, %xmm6
	psrld	$7, %xmm1
	psrld	$7, %xmm5
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	psrld	$2, %xmm1
	psrld	$2, %xmm5
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	pslld	$2, %xmm2
	pslld	$2, %xmm6
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	paddd	%xmm0, %xmm3
	paddd	%xmm4, %xmm7
	movdqa	%xmm3, 14*16(%eax)
	movdqa	%xmm7, 15*16(%eax)
	
sha256d_ms_4way_extend_loop2:
	sha256_sse2_extend_doubleround 16
	sha256_sse2_extend_doubleround 18
	sha256_sse2_extend_doubleround 20
	sha256_sse2_extend_doubleround 22
	sha256_sse2_extend_doubleround 24
	sha256_sse2_extend_doubleround 26
	sha256_sse2_extend_doubleround 28
	sha256_sse2_extend_doubleround 30
	sha256_sse2_extend_doubleround 32
	sha256_sse2_extend_doubleround 34
	sha256_sse2_extend_doubleround 36
	sha256_sse2_extend_doubleround 38
	sha256_sse2_extend_doubleround 40
	sha256_sse2_extend_doubleround 42
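	/* ZF was last set by the always-equal cmpl preceding the
	 * second-hash extension: none of the SSE2 integer ops in the
	 * doublerounds touch EFLAGS, so this jz is taken only on the
	 * second pass through the extend code. */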
	jz sha256d_ms_4way_extend_coda2
	sha256_sse2_extend_doubleround 44
	sha256_sse2_extend_doubleround 46
	
	movdqa	0(%ecx), %xmm3
	movdqa	16(%ecx), %xmm0
	movdqa	32(%ecx), %xmm1
	movdqa	48(%ecx), %xmm2
	movdqa	64(%ecx), %xmm6
	movdqa	80(%ecx), %xmm7
	movdqa	96(%ecx), %xmm5
	movdqa	112(%ecx), %xmm4
	movdqa	%xmm1, 0(%esp)
	movdqa	%xmm2, 16(%esp)
	movdqa	%xmm6, 32(%esp)
	
	movl	%esi, %eax
	jmp sha256d_ms_4way_main_loop1
	
sha256d_ms_4way_main_loop2:
	sha256_sse2_main_round 0
	sha256_sse2_main_round 1
	sha256_sse2_main_round 2
sha256d_ms_4way_main_loop1:
	sha256_sse2_main_round 3
	sha256_sse2_main_quadround 4
	sha256_sse2_main_quadround 8
	sha256_sse2_main_quadround 12
	sha256_sse2_main_quadround 16
	sha256_sse2_main_quadround 20
	sha256_sse2_main_quadround 24
	sha256_sse2_main_quadround 28
	sha256_sse2_main_quadround 32
	sha256_sse2_main_quadround 36
	sha256_sse2_main_quadround 40
	sha256_sse2_main_quadround 44
	sha256_sse2_main_quadround 48
	sha256_sse2_main_quadround 52
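	/* Same flag trick as in the extend loop: ZF is still clear on the
	 * first hash (fall through to the full tail) and still set on the
	 * second (jump to the reduced finish). */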
	sha256_sse2_main_round 56
	jz sha256d_ms_4way_finish
	sha256_sse2_main_round 57
	sha256_sse2_main_round 58
	sha256_sse2_main_round 59
	sha256_sse2_main_quadround 60
	
	movdqa	5*16(%esp), %xmm1
	movdqa	6*16(%esp), %xmm2
	movdqa	7*16(%esp), %xmm6
	movdqa	%xmm1, 18*16(%esi)
	movdqa	%xmm2, 19*16(%esi)
	movdqa	%xmm6, 20*16(%esi)
	movdqa	9*16(%esp), %xmm1
	movdqa	10*16(%esp), %xmm2
	movdqa	11*16(%esp), %xmm6
	movdqa	%xmm1, 22*16(%esi)
	movdqa	%xmm2, 23*16(%esi)
	movdqa	%xmm6, 24*16(%esi)
	movdqa	17*16(%esp), %xmm1
	movdqa	18*16(%esp), %xmm2
	movdqa	%xmm1, 30*16(%esi)
	movdqa	%xmm2, 31*16(%esi)
	
	movdqa	0(%esp), %xmm1
	movdqa	16(%esp), %xmm2
	movdqa	32(%esp), %xmm6
	paddd	0(%edx), %xmm7
	paddd	16(%edx), %xmm5
	paddd	32(%edx), %xmm4
	paddd	48(%edx), %xmm3
	paddd	64(%edx), %xmm0
	paddd	80(%edx), %xmm1
	paddd	96(%edx), %xmm2
	paddd	112(%edx), %xmm6
	
	movdqa	%xmm7, 48+0(%esp)
	movdqa	%xmm5, 48+16(%esp)
	movdqa	%xmm4, 48+32(%esp)
	movdqa	%xmm3, 48+48(%esp)
	movdqa	%xmm0, 48+64(%esp)
	movdqa	%xmm1, 48+80(%esp)
	movdqa	%xmm2, 48+96(%esp)
	movdqa	%xmm6, 48+112(%esp)
	
	movdqa	sha256d_4preext2_15, %xmm1
	movdqa	sha256d_4preext2_24, %xmm2
	pxor	%xmm0, %xmm0
	movdqa	%xmm2, 48+128(%esp)
	movdqa	%xmm0, 48+144(%esp)
	movdqa	%xmm0, 48+160(%esp)
	movdqa	%xmm0, 48+176(%esp)
	movdqa	%xmm0, 48+192(%esp)
	movdqa	%xmm0, 48+208(%esp)
	movdqa	%xmm0, 48+224(%esp)
	movdqa	%xmm1, 48+240(%esp)
	
	leal	19*16(%esp), %eax
	cmpl	%eax, %eax
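	/* Always-equal compare: set ZF without disturbing the xmm state,
	 * so that the jz checks above select the second-hash paths. */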
	
	movdqa	-15*16(%eax), %xmm0
	movdqa	-14*16(%eax), %xmm4
	movdqa	%xmm0, %xmm2
	movdqa	%xmm4, %xmm6
	psrld	$3, %xmm0
	psrld	$3, %xmm4
	movdqa	%xmm0, %xmm1
	movdqa	%xmm4, %xmm5
	pslld	$14, %xmm2
	pslld	$14, %xmm6
	psrld	$4, %xmm1
	psrld	$4, %xmm5
	pxor	%xmm1, %xmm0
	pxor	%xmm5, %xmm4
	psrld	$11, %xmm1
	psrld	$11, %xmm5
	pxor	%xmm2, %xmm0
	pxor	%xmm6, %xmm4
	pslld	$11, %xmm2
	pslld	$11, %xmm6
	pxor	%xmm1, %xmm0
	pxor	%xmm5, %xmm4
	pxor	%xmm2, %xmm0
	pxor	%xmm6, %xmm4
	paddd	-16*16(%eax), %xmm0
	paddd	-15*16(%eax), %xmm4
	paddd	sha256d_4preext2_17, %xmm4
	movdqa	%xmm0, %xmm3
	movdqa	%xmm4, %xmm7
	movdqa	%xmm3, 0*16(%eax)
	movdqa	%xmm7, 1*16(%eax)
	
	sha256_sse2_extend_doubleround 2
	sha256_sse2_extend_doubleround 4
	
	movdqa	-9*16(%eax), %xmm0
	movdqa	sha256d_4preext2_23, %xmm4
	movdqa	%xmm0, %xmm2
	psrld	$3, %xmm0
	movdqa	%xmm0, %xmm1
	pslld	$14, %xmm2
	psrld	$4, %xmm1
	pxor	%xmm1, %xmm0
	pxor	%xmm2, %xmm0
	psrld	$11, %xmm1
	pslld	$11, %xmm2
	pxor	%xmm1, %xmm0
	pxor	%xmm2, %xmm0
	paddd	-10*16(%eax), %xmm0
	paddd	-9*16(%eax), %xmm4
	movdqa	%xmm3, %xmm2
	movdqa	%xmm7, %xmm6
	psrld	$10, %xmm3
	psrld	$10, %xmm7
	movdqa	%xmm3, %xmm1
	movdqa	%xmm7, %xmm5
	paddd	-1*16(%eax), %xmm0
	pslld	$13, %xmm2
	pslld	$13, %xmm6
	psrld	$7, %xmm1
	psrld	$7, %xmm5
	paddd	0*16(%eax), %xmm4
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	psrld	$2, %xmm1
	psrld	$2, %xmm5
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	pslld	$2, %xmm2
	pslld	$2, %xmm6
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	paddd	%xmm0, %xmm3
	paddd	%xmm4, %xmm7
	movdqa	%xmm3, 6*16(%eax)
	movdqa	%xmm7, 7*16(%eax)
	
	movdqa	sha256d_4preext2_24, %xmm0
	movdqa	%xmm3, %xmm2
	movdqa	%xmm7, %xmm6
	psrld	$10, %xmm3
	psrld	$10, %xmm7
	movdqa	%xmm3, %xmm1
	movdqa	%xmm7, %xmm5
	paddd	1*16(%eax), %xmm0
	pslld	$13, %xmm2
	pslld	$13, %xmm6
	psrld	$7, %xmm1
	psrld	$7, %xmm5
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	psrld	$2, %xmm1
	psrld	$2, %xmm5
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	pslld	$2, %xmm2
	pslld	$2, %xmm6
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	paddd	%xmm0, %xmm3
	paddd	2*16(%eax), %xmm7
	movdqa	%xmm3, 8*16(%eax)
	movdqa	%xmm7, 9*16(%eax)
	
	movdqa	%xmm3, %xmm2
	movdqa	%xmm7, %xmm6
	psrld	$10, %xmm3
	psrld	$10, %xmm7
	movdqa	%xmm3, %xmm1
	movdqa	%xmm7, %xmm5
	pslld	$13, %xmm2
	pslld	$13, %xmm6
	psrld	$7, %xmm1
	psrld	$7, %xmm5
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	psrld	$2, %xmm1
	psrld	$2, %xmm5
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	pslld	$2, %xmm2
	pslld	$2, %xmm6
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	paddd	3*16(%eax), %xmm3
	paddd	4*16(%eax), %xmm7
	movdqa	%xmm3, 10*16(%eax)
	movdqa	%xmm7, 11*16(%eax)
	
	movdqa	%xmm3, %xmm2
	movdqa	%xmm7, %xmm6
	psrld	$10, %xmm3
	psrld	$10, %xmm7
	movdqa	%xmm3, %xmm1
	movdqa	%xmm7, %xmm5
	pslld	$13, %xmm2
	pslld	$13, %xmm6
	psrld	$7, %xmm1
	psrld	$7, %xmm5
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	psrld	$2, %xmm1
	psrld	$2, %xmm5
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	pslld	$2, %xmm2
	pslld	$2, %xmm6
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	paddd	5*16(%eax), %xmm3
	paddd	6*16(%eax), %xmm7
	movdqa	%xmm3, 12*16(%eax)
	movdqa	%xmm7, 13*16(%eax)
	
	movdqa	sha256d_4preext2_30, %xmm0
	movdqa	0*16(%eax), %xmm4
	movdqa	%xmm4, %xmm6
	psrld	$3, %xmm4
	movdqa	%xmm4, %xmm5
	pslld	$14, %xmm6
	psrld	$4, %xmm5
	pxor	%xmm5, %xmm4
	pxor	%xmm6, %xmm4
	psrld	$11, %xmm5
	pslld	$11, %xmm6
	pxor	%xmm5, %xmm4
	pxor	%xmm6, %xmm4
	paddd	-1*16(%eax), %xmm4
	movdqa	%xmm3, %xmm2
	movdqa	%xmm7, %xmm6
	psrld	$10, %xmm3
	psrld	$10, %xmm7
	movdqa	%xmm3, %xmm1
	movdqa	%xmm7, %xmm5
	paddd	7*16(%eax), %xmm0
	pslld	$13, %xmm2
	pslld	$13, %xmm6
	psrld	$7, %xmm1
	psrld	$7, %xmm5
	paddd	8*16(%eax), %xmm4
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	psrld	$2, %xmm1
	psrld	$2, %xmm5
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	pslld	$2, %xmm2
	pslld	$2, %xmm6
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	paddd	%xmm0, %xmm3
	paddd	%xmm4, %xmm7
	movdqa	%xmm3, 14*16(%eax)
	movdqa	%xmm7, 15*16(%eax)
	
	jmp sha256d_ms_4way_extend_loop2
	
sha256d_ms_4way_extend_coda2:
	sha256_sse2_extend_round 44
	
	movdqa	sha256_4h+0, %xmm7
	movdqa	sha256_4h+16, %xmm5
	movdqa	sha256_4h+32, %xmm4
	movdqa	sha256_4h+48, %xmm3
	movdqa	sha256_4h+64, %xmm0
	movdqa	sha256_4h+80, %xmm1
	movdqa	sha256_4h+96, %xmm2
	movdqa	sha256_4h+112, %xmm6
	movdqa	%xmm1, 0(%esp)
	movdqa	%xmm2, 16(%esp)
	movdqa	%xmm6, 32(%esp)
	
	leal	48(%esp), %eax
	jmp sha256d_ms_4way_main_loop2

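/* Reduced main round for the tail of the second hash: it computes only
 * t0 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i] plus the incoming word
 * r7; the Maj/Sigma0 half of the round is dropped, since only the
 * final value of the last state word is compared against the target. */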
.macro sha256_sse2_main_round_red i, r7
	movdqa	16*(\i)(%eax), %xmm6
	paddd	16*(\i)+sha256_4k, %xmm6
	paddd	32(%esp), %xmm6
	movdqa	%xmm0, %xmm1
	movdqa	16(%esp), %xmm2
	paddd	\r7, %xmm6
	pandn	%xmm2, %xmm1
	movdqa	%xmm2, 32(%esp)
	movdqa	0(%esp), %xmm2
	movdqa	%xmm2, 16(%esp)
	pand	%xmm0, %xmm2
	pxor	%xmm2, %xmm1
	movdqa	%xmm0, 0(%esp)
	paddd	%xmm1, %xmm6
	movdqa	%xmm0, %xmm1
	psrld	$6, %xmm0
	movdqa	%xmm0, %xmm2
	pslld	$7, %xmm1
	psrld	$5, %xmm2
	pxor	%xmm1, %xmm0
	pxor	%xmm2, %xmm0
	pslld	$14, %xmm1
	psrld	$14, %xmm2
	pxor	%xmm1, %xmm0
	pxor	%xmm2, %xmm0
	pslld	$5, %xmm1
	pxor	%xmm1, %xmm0
	paddd	%xmm6, %xmm0
.endm

sha256d_ms_4way_finish:
	sha256_sse2_main_round_red 57, %xmm3
	sha256_sse2_main_round_red 58, %xmm4
	sha256_sse2_main_round_red 59, %xmm5
	sha256_sse2_main_round_red 60, %xmm7
	
	paddd	sha256_4h+112, %xmm0
	movdqa	%xmm0, 112(%edi)
	
	movl	%ebp, %esp
	popl	%ebp
	popl	%esi
	popl	%edi
	ret


	.text
	.p2align 5
	.globl sha256_use_4way
	.globl _sha256_use_4way
sha256_use_4way:
_sha256_use_4way:
	pushl	%ebx
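	/* cpuid clobbers %ebx, which is callee-saved */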
	
	/* Check for SSE2 availability */
	movl	$1, %eax
	cpuid
	andl	$0x04000000, %edx
	jnz sha256_use_4way_sse2
	xorl	%eax, %eax
	popl	%ebx
	ret
	
sha256_use_4way_sse2:
	movl	$1, %eax
	popl	%ebx
	ret

#endif
0707010000002C000081A4000003E800000064000000015EF4BCA100003EC7000000000000000000000000000000000000001600000000cpuminer-2.5.1/sha2.c/*
 * Copyright 2011 ArtForz
 * Copyright 2011-2013 pooler
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.  See COPYING for more details.
 */

#include "cpuminer-config.h"
#include "miner.h"

#include <string.h>
#include <inttypes.h>

#if defined(USE_ASM) && \
	(defined(__x86_64__) || \
	 (defined(__arm__) && defined(__APCS_32__)) || \
	 (defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)))
#define EXTERN_SHA256
#endif

static const uint32_t sha256_h[8] = {
	0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
	0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
};

static const uint32_t sha256_k[64] = {
	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};

void sha256_init(uint32_t *state)
{
	memcpy(state, sha256_h, 32);
}

/* Elementary functions used by SHA256 */
#define Ch(x, y, z)     ((x & (y ^ z)) ^ z)
#define Maj(x, y, z)    ((x & (y | z)) | (y & z))
#define ROTR(x, n)      ((x >> n) | (x << (32 - n)))
#define S0(x)           (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x)           (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define s0(x)           (ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3))
#define s1(x)           (ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10))

/* SHA256 round function */
#define RND(a, b, c, d, e, f, g, h, k) \
	do { \
		t0 = h + S1(e) + Ch(e, f, g) + k; \
		t1 = S0(a) + Maj(a, b, c); \
		d += t0; \
		h  = t0 + t1; \
	} while (0)

/* Adjusted round function for rotating state */
#define RNDr(S, W, i) \
	RND(S[(64 - i) % 8], S[(65 - i) % 8], \
	    S[(66 - i) % 8], S[(67 - i) % 8], \
	    S[(68 - i) % 8], S[(69 - i) % 8], \
	    S[(70 - i) % 8], S[(71 - i) % 8], \
	    W[i] + sha256_k[i])
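
/*
 * The rotating indices save shuffling the eight state words around:
 * at i = 0 the roles are (a, ..., h) = (S[0], ..., S[7]); at i = 1
 * they become a = S[7], b = S[0], ..., h = S[6].  The word written by
 * round i (the "h" slot, which receives t0 + t1) is thus the one read
 * as "a" by round i + 1, and after 64 rounds (a multiple of 8) the
 * roles line up with S[0..7] again.
 */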

#ifndef EXTERN_SHA256

/*
 * SHA256 block compression function.  The 256-bit state is transformed via
 * the 512-bit input block to produce a new state.
 */
void sha256_transform(uint32_t *state, const uint32_t *block, int swap)
{
	uint32_t W[64];
	uint32_t S[8];
	uint32_t t0, t1;
	int i;

	/* 1. Prepare message schedule W. */
	if (swap) {
		for (i = 0; i < 16; i++)
			W[i] = swab32(block[i]);
	} else
		memcpy(W, block, 64);
	for (i = 16; i < 64; i += 2) {
		W[i]   = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
		W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15];
	}

	/* 2. Initialize working variables. */
	memcpy(S, state, 32);

	/* 3. Mix. */
	RNDr(S, W,  0);
	RNDr(S, W,  1);
	RNDr(S, W,  2);
	RNDr(S, W,  3);
	RNDr(S, W,  4);
	RNDr(S, W,  5);
	RNDr(S, W,  6);
	RNDr(S, W,  7);
	RNDr(S, W,  8);
	RNDr(S, W,  9);
	RNDr(S, W, 10);
	RNDr(S, W, 11);
	RNDr(S, W, 12);
	RNDr(S, W, 13);
	RNDr(S, W, 14);
	RNDr(S, W, 15);
	RNDr(S, W, 16);
	RNDr(S, W, 17);
	RNDr(S, W, 18);
	RNDr(S, W, 19);
	RNDr(S, W, 20);
	RNDr(S, W, 21);
	RNDr(S, W, 22);
	RNDr(S, W, 23);
	RNDr(S, W, 24);
	RNDr(S, W, 25);
	RNDr(S, W, 26);
	RNDr(S, W, 27);
	RNDr(S, W, 28);
	RNDr(S, W, 29);
	RNDr(S, W, 30);
	RNDr(S, W, 31);
	RNDr(S, W, 32);
	RNDr(S, W, 33);
	RNDr(S, W, 34);
	RNDr(S, W, 35);
	RNDr(S, W, 36);
	RNDr(S, W, 37);
	RNDr(S, W, 38);
	RNDr(S, W, 39);
	RNDr(S, W, 40);
	RNDr(S, W, 41);
	RNDr(S, W, 42);
	RNDr(S, W, 43);
	RNDr(S, W, 44);
	RNDr(S, W, 45);
	RNDr(S, W, 46);
	RNDr(S, W, 47);
	RNDr(S, W, 48);
	RNDr(S, W, 49);
	RNDr(S, W, 50);
	RNDr(S, W, 51);
	RNDr(S, W, 52);
	RNDr(S, W, 53);
	RNDr(S, W, 54);
	RNDr(S, W, 55);
	RNDr(S, W, 56);
	RNDr(S, W, 57);
	RNDr(S, W, 58);
	RNDr(S, W, 59);
	RNDr(S, W, 60);
	RNDr(S, W, 61);
	RNDr(S, W, 62);
	RNDr(S, W, 63);

	/* 4. Mix local working variables into global state */
	for (i = 0; i < 8; i++)
		state[i] += S[i];
}

#endif /* EXTERN_SHA256 */


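/*
 * Padding block for the second SHA-256 pass of sha256d: the 32-byte
 * first-pass digest occupies words 0..7, word 8 carries the 0x80
 * end-of-message marker, and word 15 holds the message length in bits
 * (0x100 = 256).  Callers copy only words 8..15 over a freshly
 * computed digest, which is why the first half is all zeros here.
 */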
static const uint32_t sha256d_hash1[16] = {
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x80000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000100
};

static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)
{
	uint32_t S[16];
	int i;

	sha256_init(S);
	sha256_transform(S, data, 0);
	sha256_transform(S, data + 16, 0);
	memcpy(S + 8, sha256d_hash1 + 8, 32);
	sha256_init(hash);
	sha256_transform(hash, S, 0);
	for (i = 0; i < 8; i++)
		hash[i] = swab32(hash[i]);
}

void sha256d(unsigned char *hash, const unsigned char *data, int len)
{
	uint32_t S[16], T[16];
	int i, r;

	sha256_init(S);
	for (r = len; r > -9; r -= 64) {
		if (r < 64)
			memset(T, 0, 64);
		memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 0 : r));
		if (r >= 0 && r < 64)
			((unsigned char *)T)[r] = 0x80;
		for (i = 0; i < 16; i++)
			T[i] = be32dec(T + i);
		if (r < 56)
			T[15] = 8 * len;
		sha256_transform(S, T, 0);
	}
	memcpy(S + 8, sha256d_hash1 + 8, 32);
	sha256_init(T);
	sha256_transform(T, S, 0);
	for (i = 0; i < 8; i++)
		be32enc((uint32_t *)hash + i, T[i]);
}
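
/*
 * Usage sketch (illustrative): double SHA-256 of the empty message.
 *
 *   unsigned char out[32];
 *   sha256d(out, (const unsigned char *)"", 0);
 *
 * out should then hold the well-known value
 * 5df6e0e2761359d30a8275058e299fcc0381534545f55cf43e41983f5d4c9456.
 */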

static inline void sha256d_preextend(uint32_t *W)
{
	W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0];
	W[17] = s1(W[15]) + W[10] + s0(W[ 2]) + W[ 1];
	W[18] = s1(W[16]) + W[11]             + W[ 2];
	W[19] = s1(W[17]) + W[12] + s0(W[ 4]);
	W[20] =             W[13] + s0(W[ 5]) + W[ 4];
	W[21] =             W[14] + s0(W[ 6]) + W[ 5];
	W[22] =             W[15] + s0(W[ 7]) + W[ 6];
	W[23] =             W[16] + s0(W[ 8]) + W[ 7];
	W[24] =             W[17] + s0(W[ 9]) + W[ 8];
	W[25] =                     s0(W[10]) + W[ 9];
	W[26] =                     s0(W[11]) + W[10];
	W[27] =                     s0(W[12]) + W[11];
	W[28] =                     s0(W[13]) + W[12];
	W[29] =                     s0(W[14]) + W[13];
	W[30] =                     s0(W[15]) + W[14];
	W[31] =                     s0(W[16]) + W[15];
}
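
/*
 * sha256d_preextend() computes the nonce-independent parts of
 * W[16..31] for the second 64-byte block of an 80-byte header.
 * data[3] is the nonce, so every term that depends on W[3] (directly
 * or through another extended word) is left out here and added back
 * per nonce in sha256d_ms().  Words 4..15 of this block are the fixed
 * padding (0x80000000, zeros, and the 640-bit length), which is why
 * several of the omitted combinations collapse in sha256d_ms(), e.g.
 * W[21] = s1(W[19]) because W[5], W[6] and W[14] are all zero.
 */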

static inline void sha256d_prehash(uint32_t *S, const uint32_t *W)
{
	uint32_t t0, t1;
	RNDr(S, W, 0);
	RNDr(S, W, 1);
	RNDr(S, W, 2);
}
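
/*
 * Rounds 0..2 depend only on W[0..2]; the nonce is W[3] and is first
 * used in round 3.  These rounds are therefore hoisted out of the
 * scan loop: the partially advanced state is computed once per work
 * unit and passed to sha256d_ms() as "prehash".
 */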

#ifdef EXTERN_SHA256

void sha256d_ms(uint32_t *hash, uint32_t *W,
	const uint32_t *midstate, const uint32_t *prehash);

#else

static inline void sha256d_ms(uint32_t *hash, uint32_t *W,
	const uint32_t *midstate, const uint32_t *prehash)
{
	uint32_t S[64];
	uint32_t t0, t1;
	int i;

	S[18] = W[18];
	S[19] = W[19];
	S[20] = W[20];
	S[22] = W[22];
	S[23] = W[23];
	S[24] = W[24];
	S[30] = W[30];
	S[31] = W[31];

	W[18] += s0(W[3]);
	W[19] += W[3];
	W[20] += s1(W[18]);
	W[21]  = s1(W[19]);
	W[22] += s1(W[20]);
	W[23] += s1(W[21]);
	W[24] += s1(W[22]);
	W[25]  = s1(W[23]) + W[18];
	W[26]  = s1(W[24]) + W[19];
	W[27]  = s1(W[25]) + W[20];
	W[28]  = s1(W[26]) + W[21];
	W[29]  = s1(W[27]) + W[22];
	W[30] += s1(W[28]) + W[23];
	W[31] += s1(W[29]) + W[24];
	for (i = 32; i < 64; i += 2) {
		W[i]   = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
		W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15];
	}

	memcpy(S, prehash, 32);

	RNDr(S, W,  3);
	RNDr(S, W,  4);
	RNDr(S, W,  5);
	RNDr(S, W,  6);
	RNDr(S, W,  7);
	RNDr(S, W,  8);
	RNDr(S, W,  9);
	RNDr(S, W, 10);
	RNDr(S, W, 11);
	RNDr(S, W, 12);
	RNDr(S, W, 13);
	RNDr(S, W, 14);
	RNDr(S, W, 15);
	RNDr(S, W, 16);
	RNDr(S, W, 17);
	RNDr(S, W, 18);
	RNDr(S, W, 19);
	RNDr(S, W, 20);
	RNDr(S, W, 21);
	RNDr(S, W, 22);
	RNDr(S, W, 23);
	RNDr(S, W, 24);
	RNDr(S, W, 25);
	RNDr(S, W, 26);
	RNDr(S, W, 27);
	RNDr(S, W, 28);
	RNDr(S, W, 29);
	RNDr(S, W, 30);
	RNDr(S, W, 31);
	RNDr(S, W, 32);
	RNDr(S, W, 33);
	RNDr(S, W, 34);
	RNDr(S, W, 35);
	RNDr(S, W, 36);
	RNDr(S, W, 37);
	RNDr(S, W, 38);
	RNDr(S, W, 39);
	RNDr(S, W, 40);
	RNDr(S, W, 41);
	RNDr(S, W, 42);
	RNDr(S, W, 43);
	RNDr(S, W, 44);
	RNDr(S, W, 45);
	RNDr(S, W, 46);
	RNDr(S, W, 47);
	RNDr(S, W, 48);
	RNDr(S, W, 49);
	RNDr(S, W, 50);
	RNDr(S, W, 51);
	RNDr(S, W, 52);
	RNDr(S, W, 53);
	RNDr(S, W, 54);
	RNDr(S, W, 55);
	RNDr(S, W, 56);
	RNDr(S, W, 57);
	RNDr(S, W, 58);
	RNDr(S, W, 59);
	RNDr(S, W, 60);
	RNDr(S, W, 61);
	RNDr(S, W, 62);
	RNDr(S, W, 63);

	for (i = 0; i < 8; i++)
		S[i] += midstate[i];
	
	W[18] = S[18];
	W[19] = S[19];
	W[20] = S[20];
	W[22] = S[22];
	W[23] = S[23];
	W[24] = S[24];
	W[30] = S[30];
	W[31] = S[31];
	
	memcpy(S + 8, sha256d_hash1 + 8, 32);
	S[16] = s1(sha256d_hash1[14]) + sha256d_hash1[ 9] + s0(S[ 1]) + S[ 0];
	S[17] = s1(sha256d_hash1[15]) + sha256d_hash1[10] + s0(S[ 2]) + S[ 1];
	S[18] = s1(S[16]) + sha256d_hash1[11] + s0(S[ 3]) + S[ 2];
	S[19] = s1(S[17]) + sha256d_hash1[12] + s0(S[ 4]) + S[ 3];
	S[20] = s1(S[18]) + sha256d_hash1[13] + s0(S[ 5]) + S[ 4];
	S[21] = s1(S[19]) + sha256d_hash1[14] + s0(S[ 6]) + S[ 5];
	S[22] = s1(S[20]) + sha256d_hash1[15] + s0(S[ 7]) + S[ 6];
	S[23] = s1(S[21]) + S[16] + s0(sha256d_hash1[ 8]) + S[ 7];
	S[24] = s1(S[22]) + S[17] + s0(sha256d_hash1[ 9]) + sha256d_hash1[ 8];
	S[25] = s1(S[23]) + S[18] + s0(sha256d_hash1[10]) + sha256d_hash1[ 9];
	S[26] = s1(S[24]) + S[19] + s0(sha256d_hash1[11]) + sha256d_hash1[10];
	S[27] = s1(S[25]) + S[20] + s0(sha256d_hash1[12]) + sha256d_hash1[11];
	S[28] = s1(S[26]) + S[21] + s0(sha256d_hash1[13]) + sha256d_hash1[12];
	S[29] = s1(S[27]) + S[22] + s0(sha256d_hash1[14]) + sha256d_hash1[13];
	S[30] = s1(S[28]) + S[23] + s0(sha256d_hash1[15]) + sha256d_hash1[14];
	S[31] = s1(S[29]) + S[24] + s0(S[16])             + sha256d_hash1[15];
	for (i = 32; i < 60; i += 2) {
		S[i]   = s1(S[i - 2]) + S[i - 7] + s0(S[i - 15]) + S[i - 16];
		S[i+1] = s1(S[i - 1]) + S[i - 6] + s0(S[i - 14]) + S[i - 15];
	}
	S[60] = s1(S[58]) + S[53] + s0(S[45]) + S[44];

	sha256_init(hash);

	RNDr(hash, S,  0);
	RNDr(hash, S,  1);
	RNDr(hash, S,  2);
	RNDr(hash, S,  3);
	RNDr(hash, S,  4);
	RNDr(hash, S,  5);
	RNDr(hash, S,  6);
	RNDr(hash, S,  7);
	RNDr(hash, S,  8);
	RNDr(hash, S,  9);
	RNDr(hash, S, 10);
	RNDr(hash, S, 11);
	RNDr(hash, S, 12);
	RNDr(hash, S, 13);
	RNDr(hash, S, 14);
	RNDr(hash, S, 15);
	RNDr(hash, S, 16);
	RNDr(hash, S, 17);
	RNDr(hash, S, 18);
	RNDr(hash, S, 19);
	RNDr(hash, S, 20);
	RNDr(hash, S, 21);
	RNDr(hash, S, 22);
	RNDr(hash, S, 23);
	RNDr(hash, S, 24);
	RNDr(hash, S, 25);
	RNDr(hash, S, 26);
	RNDr(hash, S, 27);
	RNDr(hash, S, 28);
	RNDr(hash, S, 29);
	RNDr(hash, S, 30);
	RNDr(hash, S, 31);
	RNDr(hash, S, 32);
	RNDr(hash, S, 33);
	RNDr(hash, S, 34);
	RNDr(hash, S, 35);
	RNDr(hash, S, 36);
	RNDr(hash, S, 37);
	RNDr(hash, S, 38);
	RNDr(hash, S, 39);
	RNDr(hash, S, 40);
	RNDr(hash, S, 41);
	RNDr(hash, S, 42);
	RNDr(hash, S, 43);
	RNDr(hash, S, 44);
	RNDr(hash, S, 45);
	RNDr(hash, S, 46);
	RNDr(hash, S, 47);
	RNDr(hash, S, 48);
	RNDr(hash, S, 49);
	RNDr(hash, S, 50);
	RNDr(hash, S, 51);
	RNDr(hash, S, 52);
	RNDr(hash, S, 53);
	RNDr(hash, S, 54);
	RNDr(hash, S, 55);
	RNDr(hash, S, 56);
	
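	/*
	 * Rounds 57..60, reduced: only the "t0" half of each round is
	 * applied, and rounds 61..63 are skipped entirely.  This still
	 * yields the exact final value of hash[7], the only word the
	 * scan loop compares against the target; the other words are
	 * left partial, and sha256d_80_swap() recomputes the full digest
	 * for any candidate that passes.
	 */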
	hash[2] += hash[6] + S1(hash[3]) + Ch(hash[3], hash[4], hash[5])
	         + S[57] + sha256_k[57];
	hash[1] += hash[5] + S1(hash[2]) + Ch(hash[2], hash[3], hash[4])
	         + S[58] + sha256_k[58];
	hash[0] += hash[4] + S1(hash[1]) + Ch(hash[1], hash[2], hash[3])
	         + S[59] + sha256_k[59];
	hash[7] += hash[3] + S1(hash[0]) + Ch(hash[0], hash[1], hash[2])
	         + S[60] + sha256_k[60]
	         + sha256_h[7];
}

#endif /* EXTERN_SHA256 */

#ifdef HAVE_SHA256_4WAY

void sha256d_ms_4way(uint32_t *hash,  uint32_t *data,
	const uint32_t *midstate, const uint32_t *prehash);

static inline int scanhash_sha256d_4way(int thr_id, uint32_t *pdata,
	const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done)
{
	uint32_t data[4 * 64] __attribute__((aligned(128)));
	uint32_t hash[4 * 8] __attribute__((aligned(32)));
	uint32_t midstate[4 * 8] __attribute__((aligned(32)));
	uint32_t prehash[4 * 8] __attribute__((aligned(32)));
	uint32_t n = pdata[19] - 1;
	const uint32_t first_nonce = pdata[19];
	const uint32_t Htarg = ptarget[7];
	int i, j;
	
	memcpy(data, pdata + 16, 64);
	sha256d_preextend(data);
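	/* Interleave the 32 pre-extended words 4-wide for the SSE2 code,
	 * walking i downwards so that each data[i] is read before the
	 * slots it expands into are overwritten. */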
	for (i = 31; i >= 0; i--)
		for (j = 0; j < 4; j++)
			data[i * 4 + j] = data[i];
	
	sha256_init(midstate);
	sha256_transform(midstate, pdata, 0);
	memcpy(prehash, midstate, 32);
	sha256d_prehash(prehash, pdata + 16);
	for (i = 7; i >= 0; i--) {
		for (j = 0; j < 4; j++) {
			midstate[i * 4 + j] = midstate[i];
			prehash[i * 4 + j] = prehash[i];
		}
	}
	
	do {
		for (i = 0; i < 4; i++)
			data[4 * 3 + i] = ++n;
		
		sha256d_ms_4way(hash, data, midstate, prehash);
		
		for (i = 0; i < 4; i++) {
			if (swab32(hash[4 * 7 + i]) <= Htarg) {
				pdata[19] = data[4 * 3 + i];
				sha256d_80_swap(hash, pdata);
				if (fulltest(hash, ptarget)) {
					*hashes_done = n - first_nonce + 1;
					return 1;
				}
			}
		}
	} while (n < max_nonce && !work_restart[thr_id].restart);
	
	*hashes_done = n - first_nonce + 1;
	pdata[19] = n;
	return 0;
}

#endif /* HAVE_SHA256_4WAY */

#ifdef HAVE_SHA256_8WAY

void sha256d_ms_8way(uint32_t *hash,  uint32_t *data,
	const uint32_t *midstate, const uint32_t *prehash);

static inline int scanhash_sha256d_8way(int thr_id, uint32_t *pdata,
	const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done)
{
	uint32_t data[8 * 64] __attribute__((aligned(128)));
	uint32_t hash[8 * 8] __attribute__((aligned(32)));
	uint32_t midstate[8 * 8] __attribute__((aligned(32)));
	uint32_t prehash[8 * 8] __attribute__((aligned(32)));
	uint32_t n = pdata[19] - 1;
	const uint32_t first_nonce = pdata[19];
	const uint32_t Htarg = ptarget[7];
	int i, j;
	
	memcpy(data, pdata + 16, 64);
	sha256d_preextend(data);
	for (i = 31; i >= 0; i--)
		for (j = 0; j < 8; j++)
			data[i * 8 + j] = data[i];
	
	sha256_init(midstate);
	sha256_transform(midstate, pdata, 0);
	memcpy(prehash, midstate, 32);
	sha256d_prehash(prehash, pdata + 16);
	for (i = 7; i >= 0; i--) {
		for (j = 0; j < 8; j++) {
			midstate[i * 8 + j] = midstate[i];
			prehash[i * 8 + j] = prehash[i];
		}
	}
	
	do {
		for (i = 0; i < 8; i++)
			data[8 * 3 + i] = ++n;
		
		sha256d_ms_8way(hash, data, midstate, prehash);
		
		for (i = 0; i < 8; i++) {
			if (swab32(hash[8 * 7 + i]) <= Htarg) {
				pdata[19] = data[8 * 3 + i];
				sha256d_80_swap(hash, pdata);
				if (fulltest(hash, ptarget)) {
					*hashes_done = n - first_nonce + 1;
					return 1;
				}
			}
		}
	} while (n < max_nonce && !work_restart[thr_id].restart);
	
	*hashes_done = n - first_nonce + 1;
	pdata[19] = n;
	return 0;
}

#endif /* HAVE_SHA256_8WAY */

int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
	uint32_t max_nonce, unsigned long *hashes_done)
{
	uint32_t data[64] __attribute__((aligned(128)));
	uint32_t hash[8] __attribute__((aligned(32)));
	uint32_t midstate[8] __attribute__((aligned(32)));
	uint32_t prehash[8] __attribute__((aligned(32)));
	uint32_t n = pdata[19] - 1;
	const uint32_t first_nonce = pdata[19];
	const uint32_t Htarg = ptarget[7];
	
#ifdef HAVE_SHA256_8WAY
	if (sha256_use_8way())
		return scanhash_sha256d_8way(thr_id, pdata, ptarget,
			max_nonce, hashes_done);
#endif
#ifdef HAVE_SHA256_4WAY
	if (sha256_use_4way())
		return scanhash_sha256d_4way(thr_id, pdata, ptarget,
			max_nonce, hashes_done);
#endif
	
	memcpy(data, pdata + 16, 64);
	sha256d_preextend(data);
	
	sha256_init(midstate);
	sha256_transform(midstate, pdata, 0);
	memcpy(prehash, midstate, 32);
	sha256d_prehash(prehash, pdata + 16);
	
	do {
		data[3] = ++n;
		sha256d_ms(hash, data, midstate, prehash);
		if (swab32(hash[7]) <= Htarg) {
			pdata[19] = data[3];
			sha256d_80_swap(hash, pdata);
			if (fulltest(hash, ptarget)) {
				*hashes_done = n - first_nonce + 1;
				return 1;
			}
		}
	} while (n < max_nonce && !work_restart[thr_id].restart);
	
	*hashes_done = n - first_nonce + 1;
	pdata[19] = n;
	return 0;
}
0707010000002D000081A4000003E800000064000000015EF4BCA100009AA3000000000000000000000000000000000000001600000000cpuminer-2.5.1/util.c/*
 * Copyright 2010 Jeff Garzik
 * Copyright 2012 Luke Dashjr
 * Copyright 2012-2020 pooler
 * Copyright 2017 Pieter Wuille
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.  See COPYING for more details.
 */

#define _GNU_SOURCE
#include "cpuminer-config.h"

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <stdarg.h>
#include <string.h>
#include <stdbool.h>
#include <inttypes.h>
#include <limits.h>
#include <errno.h>
#include <unistd.h>
#include <jansson.h>
#include <curl/curl.h>
#include <time.h>
#if defined(WIN32)
#include <winsock2.h>
#include <mstcpip.h>
#else
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#endif
#include "compat.h"
#include "miner.h"
#include "elist.h"

struct data_buffer {
	void		*buf;
	size_t		len;
};

struct upload_buffer {
	const void	*buf;
	size_t		len;
	size_t		pos;
};

struct header_info {
	char		*lp_path;
	char		*reason;
	char		*stratum_url;
};

struct tq_ent {
	void			*data;
	struct list_head	q_node;
};

struct thread_q {
	struct list_head	q;

	bool frozen;

	pthread_mutex_t		mutex;
	pthread_cond_t		cond;
};

void applog(int prio, const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);

#ifdef HAVE_SYSLOG_H
	if (use_syslog) {
		va_list ap2;
		char *buf;
		int len;
		
		va_copy(ap2, ap);
		len = vsnprintf(NULL, 0, fmt, ap2) + 1;
		va_end(ap2);
		buf = alloca(len);
		if (vsnprintf(buf, len, fmt, ap) >= 0)
			syslog(prio, "%s", buf);
	}
#else
	if (0) {}
#endif
	else {
		char *f;
		int len;
		time_t now;
		struct tm tm, *tm_p;

		time(&now);

		pthread_mutex_lock(&applog_lock);
		tm_p = localtime(&now);
		memcpy(&tm, tm_p, sizeof(tm));
		pthread_mutex_unlock(&applog_lock);

		len = 40 + strlen(fmt) + 2;
		f = alloca(len);
		sprintf(f, "[%d-%02d-%02d %02d:%02d:%02d] %s\n",
			tm.tm_year + 1900,
			tm.tm_mon + 1,
			tm.tm_mday,
			tm.tm_hour,
			tm.tm_min,
			tm.tm_sec,
			fmt);
		pthread_mutex_lock(&applog_lock);
		vfprintf(stderr, f, ap);	/* atomic write to stderr */
		fflush(stderr);
		pthread_mutex_unlock(&applog_lock);
	}
	va_end(ap);
}

/* Rewrite integer literals that would overflow so that they are parsed
 * as floating-point numbers.
 * This is a hack to work around the limitations of some versions of Jansson. */
static char *hack_json_numbers(const char *in)
{
	char *out;
	int i, off, intoff;
	bool in_str, in_int;

	out = calloc(2 * strlen(in) + 1, 1);
	if (!out)
		return NULL;
	off = intoff = 0;
	in_str = in_int = false;
	for (i = 0; in[i]; i++) {
		char c = in[i];
		if (c == '"') {
			in_str = !in_str;
		} else if (c == '\\') {
			out[off++] = c;
			if (!in[++i])
				break;
		} else if (!in_str && !in_int && isdigit(c)) {
			intoff = off;
			in_int = true;
		} else if (in_int && !isdigit(c)) {
			if (c != '.' && c != 'e' && c != 'E' && c != '+' && c != '-') {
				in_int = false;
				if (off - intoff > 4) {
					char *end;
#if JSON_INTEGER_IS_LONG_LONG
					errno = 0;
					strtoll(out + intoff, &end, 10);
					if (!*end && errno == ERANGE) {
#else
					long l;
					errno = 0;
					l = strtol(out + intoff, &end, 10);
					if (!*end && (errno == ERANGE || l > INT_MAX)) {
#endif
						out[off++] = '.';
						out[off++] = '0';
					}
				}
			}
		}
		out[off++] = in[i];
	}
	return out;
}
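
/*
 * Example: with a Jansson built without long-long support, the input
 *   {"balance": 5000000000}
 * is rewritten as
 *   {"balance": 5000000000.0}
 * so the value is parsed as a double instead of overflowing.
 */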

static void databuf_free(struct data_buffer *db)
{
	if (!db)
		return;

	free(db->buf);

	memset(db, 0, sizeof(*db));
}

static size_t all_data_cb(const void *ptr, size_t size, size_t nmemb,
			  void *user_data)
{
	struct data_buffer *db = user_data;
	size_t len = size * nmemb;
	size_t oldlen, newlen;
	void *newmem;
	static const unsigned char zero = 0;

	oldlen = db->len;
	newlen = oldlen + len;

	newmem = realloc(db->buf, newlen + 1);
	if (!newmem)
		return 0;

	db->buf = newmem;
	db->len = newlen;
	memcpy(db->buf + oldlen, ptr, len);
	memcpy(db->buf + newlen, &zero, 1);	/* null terminate */

	return len;
}

static size_t upload_data_cb(void *ptr, size_t size, size_t nmemb,
			     void *user_data)
{
	struct upload_buffer *ub = user_data;
	int len = size * nmemb;

	if (len > ub->len - ub->pos)
		len = ub->len - ub->pos;

	if (len) {
		memcpy(ptr, ub->buf + ub->pos, len);
		ub->pos += len;
	}

	return len;
}

#if LIBCURL_VERSION_NUM >= 0x071200
static int seek_data_cb(void *user_data, curl_off_t offset, int origin)
{
	struct upload_buffer *ub = user_data;
	
	switch (origin) {
	case SEEK_SET:
		ub->pos = offset;
		break;
	case SEEK_CUR:
		ub->pos += offset;
		break;
	case SEEK_END:
		ub->pos = ub->len + offset;
		break;
	default:
		return 1; /* CURL_SEEKFUNC_FAIL */
	}

	return 0; /* CURL_SEEKFUNC_OK */
}
#endif

static size_t resp_hdr_cb(void *ptr, size_t size, size_t nmemb, void *user_data)
{
	struct header_info *hi = user_data;
	size_t remlen, slen, ptrlen = size * nmemb;
	char *rem, *val = NULL, *key = NULL;
	void *tmp;

	val = calloc(1, ptrlen);
	key = calloc(1, ptrlen);
	if (!key || !val)
		goto out;

	tmp = memchr(ptr, ':', ptrlen);
	if (!tmp || (tmp == ptr))	/* skip empty keys / blanks */
		goto out;
	slen = tmp - ptr;
	if ((slen + 1) == ptrlen)	/* skip key w/ no value */
		goto out;
	memcpy(key, ptr, slen);		/* store & nul term key */
	key[slen] = 0;

	rem = ptr + slen + 1;		/* trim value's leading whitespace */
	remlen = ptrlen - slen - 1;
	while ((remlen > 0) && (isspace(*rem))) {
		remlen--;
		rem++;
	}

	memcpy(val, rem, remlen);	/* store value, trim trailing ws */
	val[remlen] = 0;
	while ((*val) && (isspace(val[strlen(val) - 1]))) {
		val[strlen(val) - 1] = 0;
	}
	if (!*val)			/* skip blank value */
		goto out;

	if (!strcasecmp("X-Long-Polling", key)) {
		hi->lp_path = val;	/* steal memory reference */
		val = NULL;
	}

	if (!strcasecmp("X-Reject-Reason", key)) {
		hi->reason = val;	/* steal memory reference */
		val = NULL;
	}

	if (!strcasecmp("X-Stratum", key)) {
		hi->stratum_url = val;	/* steal memory reference */
		val = NULL;
	}

out:
	free(key);
	free(val);
	return ptrlen;
}

#if LIBCURL_VERSION_NUM >= 0x070f06
static int sockopt_keepalive_cb(void *userdata, curl_socket_t fd,
	curlsocktype purpose)
{
	int keepalive = 1;
	int tcp_keepcnt = 3;
	int tcp_keepidle = 50;
	int tcp_keepintvl = 50;

#ifndef WIN32
	if (unlikely(setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &keepalive,
		sizeof(keepalive))))
		return 1;
#ifdef __linux
	if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPCNT,
		&tcp_keepcnt, sizeof(tcp_keepcnt))))
		return 1;
	if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPIDLE,
		&tcp_keepidle, sizeof(tcp_keepidle))))
		return 1;
	if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPINTVL,
		&tcp_keepintvl, sizeof(tcp_keepintvl))))
		return 1;
#endif /* __linux */
#ifdef __APPLE_CC__
	if (unlikely(setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE,
		&tcp_keepintvl, sizeof(tcp_keepintvl))))
		return 1;
#endif /* __APPLE_CC__ */
#else /* WIN32 */
	struct tcp_keepalive vals;
	vals.onoff = 1;
	vals.keepalivetime = tcp_keepidle * 1000;
	vals.keepaliveinterval = tcp_keepintvl * 1000;
	DWORD outputBytes;
	if (unlikely(WSAIoctl(fd, SIO_KEEPALIVE_VALS, &vals, sizeof(vals),
		NULL, 0, &outputBytes, NULL, NULL)))
		return 1;
#endif /* WIN32 */

	return 0;
}
#endif

json_t *json_rpc_call(CURL *curl, const char *url,
		      const char *userpass, const char *rpc_req,
		      int *curl_err, int flags)
{
	json_t *val, *err_val, *res_val;
	int rc;
	long http_rc;
	struct data_buffer all_data = {0};
	struct upload_buffer upload_data;
	char *json_buf;
	json_error_t err;
	struct curl_slist *headers = NULL;
	char len_hdr[64];
	char curl_err_str[CURL_ERROR_SIZE];
	long timeout = (flags & JSON_RPC_LONGPOLL) ? opt_timeout : 30;
	struct header_info hi = {0};

	/* it is assumed that 'curl' is freshly [re]initialized at this point */

	if (opt_protocol)
		curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
	curl_easy_setopt(curl, CURLOPT_URL, url);
	if (opt_cert)
		curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert);
	curl_easy_setopt(curl, CURLOPT_ENCODING, "");
	curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1);
	curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
	curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1);
	curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, all_data_cb);
	curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data);
	curl_easy_setopt(curl, CURLOPT_READFUNCTION, upload_data_cb);
	curl_easy_setopt(curl, CURLOPT_READDATA, &upload_data);
#if LIBCURL_VERSION_NUM >= 0x071200
	curl_easy_setopt(curl, CURLOPT_SEEKFUNCTION, &seek_data_cb);
	curl_easy_setopt(curl, CURLOPT_SEEKDATA, &upload_data);
#endif
	curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str);
	if (opt_redirect)
		curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1);
	curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout);
	curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, resp_hdr_cb);
	curl_easy_setopt(curl, CURLOPT_HEADERDATA, &hi);
	if (opt_proxy) {
		curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy);
		curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type);
	}
	if (userpass) {
		curl_easy_setopt(curl, CURLOPT_USERPWD, userpass);
		curl_easy_setopt(curl, CURLOPT_HTTPAUTH, CURLAUTH_BASIC);
	}
#if LIBCURL_VERSION_NUM >= 0x070f06
	if (flags & JSON_RPC_LONGPOLL)
		curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb);
#endif
	curl_easy_setopt(curl, CURLOPT_POST, 1);

	if (opt_protocol)
		applog(LOG_DEBUG, "JSON protocol request:\n%s\n", rpc_req);

	upload_data.buf = rpc_req;
	upload_data.len = strlen(rpc_req);
	upload_data.pos = 0;
	sprintf(len_hdr, "Content-Length: %lu",
		(unsigned long) upload_data.len);

	headers = curl_slist_append(headers, "Content-Type: application/json");
	headers = curl_slist_append(headers, len_hdr);
	headers = curl_slist_append(headers, "User-Agent: " USER_AGENT);
	headers = curl_slist_append(headers, "X-Mining-Extensions: midstate");
	headers = curl_slist_append(headers, "Accept:"); /* disable Accept hdr*/
	headers = curl_slist_append(headers, "Expect:"); /* disable Expect hdr*/

	curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);

	rc = curl_easy_perform(curl);
	if (curl_err != NULL)
		*curl_err = rc;
	if (rc) {
		curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_rc);
		if (!((flags & JSON_RPC_LONGPOLL) && rc == CURLE_OPERATION_TIMEDOUT) &&
		    !((flags & JSON_RPC_QUIET_404) && http_rc == 404))
			applog(LOG_ERR, "HTTP request failed: %s", curl_err_str);
		if (curl_err && (flags & JSON_RPC_QUIET_404) && http_rc == 404)
			*curl_err = CURLE_OK;
		goto err_out;
	}

	/* If X-Stratum was found, activate Stratum */
	if (want_stratum && hi.stratum_url &&
	    !strncasecmp(hi.stratum_url, "stratum+tcp://", 14)) {
		have_stratum = true;
		tq_push(thr_info[stratum_thr_id].q, hi.stratum_url);
		hi.stratum_url = NULL;
	}

	/* If X-Long-Polling was found, activate long polling */
	if (!have_longpoll && want_longpoll && hi.lp_path && !have_gbt &&
	    allow_getwork && !have_stratum) {
		have_longpoll = true;
		tq_push(thr_info[longpoll_thr_id].q, hi.lp_path);
		hi.lp_path = NULL;
	}

	if (!all_data.buf) {
		applog(LOG_ERR, "Empty data received in json_rpc_call.");
		goto err_out;
	}

	json_buf = hack_json_numbers(all_data.buf);
	errno = 0; /* needed for Jansson < 2.1 */
	val = JSON_LOADS(json_buf, &err);
	free(json_buf);
	if (!val) {
		applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text);
		goto err_out;
	}

	if (opt_protocol) {
		char *s = json_dumps(val, JSON_INDENT(3));
		applog(LOG_DEBUG, "JSON protocol response:\n%s", s);
		free(s);
	}

	/* JSON-RPC valid response returns a 'result' and a null 'error'. */
	res_val = json_object_get(val, "result");
	err_val = json_object_get(val, "error");

	if (!res_val || (err_val && !json_is_null(err_val))) {
		char *s;

		if (err_val)
			s = json_dumps(err_val, JSON_INDENT(3));
		else
			s = strdup("(unknown reason)");

		applog(LOG_ERR, "JSON-RPC call failed: %s", s);

		free(s);

		goto err_out;
	}

	if (hi.reason)
		json_object_set_new(val, "reject-reason", json_string(hi.reason));

	databuf_free(&all_data);
	curl_slist_free_all(headers);
	curl_easy_reset(curl);
	return val;

err_out:
	free(hi.lp_path);
	free(hi.reason);
	free(hi.stratum_url);
	databuf_free(&all_data);
	curl_slist_free_all(headers);
	curl_easy_reset(curl);
	return NULL;
}

void memrev(unsigned char *p, size_t len)
{
	unsigned char c, *q;
	for (q = p + len - 1; p < q; p++, q--) {
		c = *p;
		*p = *q;
		*q = c;
	}
}

void bin2hex(char *s, const unsigned char *p, size_t len)
{
	int i;
	for (i = 0; i < len; i++)
		sprintf(s + (i * 2), "%02x", (unsigned int) p[i]);
}

char *abin2hex(const unsigned char *p, size_t len)
{
	char *s = malloc((len * 2) + 1);
	if (!s)
		return NULL;
	bin2hex(s, p, len);
	return s;
}

bool hex2bin(unsigned char *p, const char *hexstr, size_t len)
{
	char hex_byte[3];
	char *ep;

	hex_byte[2] = '\0';

	while (*hexstr && len) {
		if (!hexstr[1]) {
			applog(LOG_ERR, "hex2bin str truncated");
			return false;
		}
		hex_byte[0] = hexstr[0];
		hex_byte[1] = hexstr[1];
		*p = (unsigned char) strtol(hex_byte, &ep, 16);
		if (*ep) {
			applog(LOG_ERR, "hex2bin failed on '%s'", hex_byte);
			return false;
		}
		p++;
		hexstr += 2;
		len--;
	}

	return (len == 0 && *hexstr == 0) ? true : false;
}

int varint_encode(unsigned char *p, uint64_t n)
{
	int i;
	if (n < 0xfd) {
		p[0] = n;
		return 1;
	}
	if (n <= 0xffff) {
		p[0] = 0xfd;
		p[1] = n & 0xff;
		p[2] = n >> 8;
		return 3;
	}
	if (n <= 0xffffffff) {
		p[0] = 0xfe;
		for (i = 1; i < 5; i++) {
			p[i] = n & 0xff;
			n >>= 8;
		}
		return 5;
	}
	p[0] = 0xff;
	for (i = 1; i < 9; i++) {
		p[i] = n & 0xff;
		n >>= 8;
	}
	return 9;
}
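
/*
 * Encoding examples (Bitcoin CompactSize):
 *   n = 0xfc       -> fc              (1 byte)
 *   n = 0x1234     -> fd 34 12        (3 bytes)
 *   n = 0x12345678 -> fe 78 56 34 12  (5 bytes)
 * Larger values take the 0xff prefix followed by 8 little-endian
 * bytes.
 */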

static const char b58digits[] = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz";

static bool b58dec(unsigned char *bin, size_t binsz, const char *b58)
{
	size_t i, j;
	uint64_t t;
	uint32_t c;
	uint32_t *outi;
	size_t outisz = (binsz + 3) / 4;
	int rem = binsz % 4;
	uint32_t remmask = 0xffffffff << (8 * rem);
	size_t b58sz = strlen(b58);
	bool rc = false;

	outi = calloc(outisz, sizeof(*outi));

	for (i = 0; i < b58sz; ++i) {
		for (c = 0; b58digits[c] != b58[i]; c++)
			if (!b58digits[c])
				goto out;
		for (j = outisz; j--; ) {
			t = (uint64_t)outi[j] * 58 + c;
			c = t >> 32;
			outi[j] = t & 0xffffffff;
		}
		if (c || outi[0] & remmask)
			goto out;
	}

	j = 0;
	switch (rem) {
		case 3:
			*(bin++) = (outi[0] >> 16) & 0xff;
		case 2:
			*(bin++) = (outi[0] >> 8) & 0xff;
		case 1:
			*(bin++) = outi[0] & 0xff;
			++j;
		default:
			break;
	}
	for (; j < outisz; ++j) {
		be32enc((uint32_t *)bin, outi[j]);
		bin += sizeof(uint32_t);
	}

	rc = true;
out:
	free(outi);
	return rc;
}

static int b58check(unsigned char *bin, size_t binsz, const char *b58)
{
	unsigned char buf[32];
	int i;

	sha256d(buf, bin, binsz - 4);
	if (memcmp(&bin[binsz - 4], buf, 4))
		return -1;

	/* Check number of zeros is correct AFTER verifying checksum
	 * (to avoid possibility of accessing the string beyond the end) */
	for (i = 0; bin[i] == '\0' && b58[i] == '1'; ++i);
	if (bin[i] == '\0' || b58[i] == '1')
		return -3;

	return bin[0];
}

static uint32_t bech32_polymod_step(uint32_t pre) {
	uint8_t b = pre >> 25;
	return ((pre & 0x1FFFFFF) << 5) ^
		(-((b >> 0) & 1) & 0x3b6a57b2UL) ^
		(-((b >> 1) & 1) & 0x26508e6dUL) ^
		(-((b >> 2) & 1) & 0x1ea119faUL) ^
		(-((b >> 3) & 1) & 0x3d4233ddUL) ^
		(-((b >> 4) & 1) & 0x2a1462b3UL);
}

static const int8_t bech32_charset_rev[128] = {
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	15, -1, 10, 17, 21, 20, 26, 30,  7,  5, -1, -1, -1, -1, -1, -1,
	-1, 29, -1, 24, 13, 25,  9,  8, 23, -1, 18, 22, 31, 27, 19, -1,
	 1,  0,  3, 16, 11, 28, 12, 14,  6,  4,  2, -1, -1, -1, -1, -1,
	-1, 29, -1, 24, 13, 25,  9,  8, 23, -1, 18, 22, 31, 27, 19, -1,
	 1,  0,  3, 16, 11, 28, 12, 14,  6,  4,  2, -1, -1, -1, -1, -1
};

static bool bech32_decode(char *hrp, uint8_t *data, size_t *data_len, const char *input) {
	uint32_t chk = 1;
	size_t i;
	size_t input_len = strlen(input);
	size_t hrp_len;
	int have_lower = 0, have_upper = 0;
	if (input_len < 8 || input_len > 90) {
		return false;
	}
	*data_len = 0;
	while (*data_len < input_len && input[(input_len - 1) - *data_len] != '1') {
		++(*data_len);
	}
	hrp_len = input_len - (1 + *data_len);
	if (1 + *data_len >= input_len || *data_len < 6) {
		return false;
	}
	*(data_len) -= 6;
	for (i = 0; i < hrp_len; ++i) {
		int ch = input[i];
		if (ch < 33 || ch > 126) {
			return false;
		}
		if (ch >= 'a' && ch <= 'z') {
			have_lower = 1;
		} else if (ch >= 'A' && ch <= 'Z') {
			have_upper = 1;
			ch = (ch - 'A') + 'a';
		}
		hrp[i] = ch;
		chk = bech32_polymod_step(chk) ^ (ch >> 5);
	}
	hrp[i] = 0;
	chk = bech32_polymod_step(chk);
	for (i = 0; i < hrp_len; ++i) {
		chk = bech32_polymod_step(chk) ^ (input[i] & 0x1f);
	}
	++i;
	while (i < input_len) {
		int v = (input[i] & 0x80) ? -1 : bech32_charset_rev[(int)input[i]];
		if (input[i] >= 'a' && input[i] <= 'z') have_lower = 1;
		if (input[i] >= 'A' && input[i] <= 'Z') have_upper = 1;
		if (v == -1) {
			return false;
		}
		chk = bech32_polymod_step(chk) ^ v;
		if (i + 6 < input_len) {
			data[i - (1 + hrp_len)] = v;
		}
		++i;
	}
	if (have_lower && have_upper) {
		return false;
	}
	return chk == 1;
}

static bool convert_bits(uint8_t *out, size_t *outlen, int outbits, const uint8_t *in, size_t inlen, int inbits, int pad) {
	uint32_t val = 0;
	int bits = 0;
	uint32_t maxv = (((uint32_t)1) << outbits) - 1;
	while (inlen--) {
		val = (val << inbits) | *(in++);
		bits += inbits;
		while (bits >= outbits) {
			bits -= outbits;
			out[(*outlen)++] = (val >> bits) & maxv;
		}
	}
	if (pad) {
		if (bits) {
			out[(*outlen)++] = (val << (outbits - bits)) & maxv;
		}
	} else if (((val << (outbits - bits)) & maxv) || bits >= inbits) {
		return false;
	}
	return true;
}
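
/*
 * Example: regrouping two 8-bit bytes into 5-bit groups with padding
 * (inbits = 8, outbits = 5, pad = 1):
 *   {0xff, 0xff} -> {31, 31, 31, 16}
 * The 16 input bits form three full groups; the single leftover bit
 * is left-shifted into the final group and zero-padded.
 */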

static bool segwit_addr_decode(int *witver, uint8_t *witdata, size_t *witdata_len, const char *addr) {
	uint8_t data[84];
	char hrp_actual[84];
	size_t data_len;
	if (!bech32_decode(hrp_actual, data, &data_len, addr)) return false;
	if (data_len == 0 || data_len > 65) return false;
	if (data[0] > 16) return false;
	*witdata_len = 0;
	if (!convert_bits(witdata, witdata_len, 8, data + 1, data_len - 1, 5, 0)) return false;
	if (*witdata_len < 2 || *witdata_len > 40) return false;
	if (data[0] == 0 && *witdata_len != 20 && *witdata_len != 32) return false;
	*witver = data[0];
	return true;
}

static size_t bech32_to_script(uint8_t *out, size_t outsz, const char *addr) {
	uint8_t witprog[40];
	size_t witprog_len;
	int witver;

	if (!segwit_addr_decode(&witver, witprog, &witprog_len, addr))
		return 0;
	if (outsz < witprog_len + 2)
		return 0;
	out[0] = witver ? (0x50 + witver) : 0;
	out[1] = witprog_len;
	memcpy(out + 2, witprog, witprog_len);
	return witprog_len + 2;
}

size_t address_to_script(unsigned char *out, size_t outsz, const char *addr)
{
	unsigned char addrbin[25];
	int addrver;
	size_t rv;

	if (!b58dec(addrbin, sizeof(addrbin), addr))
		return bech32_to_script(out, outsz, addr);
	addrver = b58check(addrbin, sizeof(addrbin), addr);
	if (addrver < 0)
		return 0;
	switch (addrver) {
		case 5:    /* Bitcoin script hash */
		case 196:  /* Testnet script hash */
			if (outsz < (rv = 23))
				return rv;
			out[ 0] = 0xa9;  /* OP_HASH160 */
			out[ 1] = 0x14;  /* push 20 bytes */
			memcpy(&out[2], &addrbin[1], 20);
			out[22] = 0x87;  /* OP_EQUAL */
			return rv;
		default:
			if (outsz < (rv = 25))
				return rv;
			out[ 0] = 0x76;  /* OP_DUP */
			out[ 1] = 0xa9;  /* OP_HASH160 */
			out[ 2] = 0x14;  /* push 20 bytes */
			memcpy(&out[3], &addrbin[1], 20);
			out[23] = 0x88;  /* OP_EQUALVERIFY */
			out[24] = 0xac;  /* OP_CHECKSIG */
			return rv;
	}
}

/* Subtract the `struct timeval' values X and Y,
   storing the result in RESULT.
   Return 1 if the difference is negative, otherwise 0.  */
int timeval_subtract(struct timeval *result, struct timeval *x,
	struct timeval *y)
{
	/* Perform the carry for the later subtraction by updating Y. */
	if (x->tv_usec < y->tv_usec) {
		int nsec = (y->tv_usec - x->tv_usec) / 1000000 + 1;
		y->tv_usec -= 1000000 * nsec;
		y->tv_sec += nsec;
	}
	if (x->tv_usec - y->tv_usec > 1000000) {
		int nsec = (x->tv_usec - y->tv_usec) / 1000000;
		y->tv_usec += 1000000 * nsec;
		y->tv_sec -= nsec;
	}

	/* Compute the time remaining to wait.
	 * `tv_usec' is certainly positive. */
	result->tv_sec = x->tv_sec - y->tv_sec;
	result->tv_usec = x->tv_usec - y->tv_usec;

	/* Return 1 if result is negative. */
	return x->tv_sec < y->tv_sec;
}

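/* Return true if hash <= target, comparing the two 256-bit values as
 * arrays of 32-bit words from the most significant word (index 7)
 * down. */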
bool fulltest(const uint32_t *hash, const uint32_t *target)
{
	int i;
	bool rc = true;
	
	for (i = 7; i >= 0; i--) {
		if (hash[i] > target[i]) {
			rc = false;
			break;
		}
		if (hash[i] < target[i]) {
			rc = true;
			break;
		}
	}

	if (opt_debug) {
		uint32_t hash_be[8], target_be[8];
		char hash_str[65], target_str[65];
		
		for (i = 0; i < 8; i++) {
			be32enc(hash_be + i, hash[7 - i]);
			be32enc(target_be + i, target[7 - i]);
		}
		bin2hex(hash_str, (unsigned char *)hash_be, 32);
		bin2hex(target_str, (unsigned char *)target_be, 32);

		applog(LOG_DEBUG, "DEBUG: %s\nHash:   %s\nTarget: %s",
			rc ? "hash <= target"
			   : "hash > target (false positive)",
			hash_str,
			target_str);
	}

	return rc;
}

void diff_to_target(uint32_t *target, double diff)
{
	uint64_t m;
	int k;
	
	for (k = 6; k > 0 && diff > 1.0; k--)
		diff /= 4294967296.0;
	m = 4294901760.0 / diff;
	if (m == 0 && k == 6)
		memset(target, 0xff, 32);
	else {
		memset(target, 0, 32);
		target[k] = (uint32_t)m;
		target[k + 1] = (uint32_t)(m >> 32);
	}
}
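
/*
 * Example: diff = 1.0 leaves k = 6 and m = 0xffff0000, giving the
 * standard difficulty-1 share target: target[7] = 0x00000000,
 * target[6] = 0xffff0000, and all lower words zero.
 */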

#ifdef WIN32
#define socket_blocks() (WSAGetLastError() == WSAEWOULDBLOCK)
#else
#define socket_blocks() (errno == EAGAIN || errno == EWOULDBLOCK)
#endif

static bool send_line(struct stratum_ctx *sctx, char *s)
{
	ssize_t len, sent = 0;
	
	len = strlen(s);
	s[len++] = '\n';

	while (len > 0) {
		struct timeval timeout = {0, 0};
		ssize_t n;
		fd_set wd;

		FD_ZERO(&wd);
		FD_SET(sctx->sock, &wd);
		if (select(sctx->sock + 1, NULL, &wd, NULL, &timeout) < 1)
			return false;
#if LIBCURL_VERSION_NUM >= 0x071202
		CURLcode rc = curl_easy_send(sctx->curl, s + sent, len, (size_t *)&n);
		if (rc != CURLE_OK) {
			if (rc != CURLE_AGAIN)
#else
		n = send(sctx->sock, s + sent, len, 0);
		if (n < 0) {
			if (!socket_blocks())
#endif
				return false;
			n = 0;
		}
		sent += n;
		len -= n;
	}

	return true;
}

bool stratum_send_line(struct stratum_ctx *sctx, char *s)
{
	bool ret = false;

	if (opt_protocol)
		applog(LOG_DEBUG, "> %s", s);

	pthread_mutex_lock(&sctx->sock_lock);
	ret = send_line(sctx, s);
	pthread_mutex_unlock(&sctx->sock_lock);

	return ret;
}

static bool socket_full(curl_socket_t sock, int timeout)
{
	struct timeval tv;
	fd_set rd;

	FD_ZERO(&rd);
	FD_SET(sock, &rd);
	tv.tv_sec = timeout;
	tv.tv_usec = 0;
	if (select(sock + 1, &rd, NULL, NULL, &tv) > 0)
		return true;
	return false;
}

bool stratum_socket_full(struct stratum_ctx *sctx, int timeout)
{
	return strlen(sctx->sockbuf) || socket_full(sctx->sock, timeout);
}

#define RBUFSIZE 2048
#define RECVSIZE (RBUFSIZE - 4)

static void stratum_buffer_append(struct stratum_ctx *sctx, const char *s)
{
	size_t old, new;

	old = strlen(sctx->sockbuf);
	new = old + strlen(s) + 1;
	if (new >= sctx->sockbuf_size) {
		sctx->sockbuf_size = new + (RBUFSIZE - (new % RBUFSIZE));
		sctx->sockbuf = realloc(sctx->sockbuf, sctx->sockbuf_size);
	}
	strcpy(sctx->sockbuf + old, s);
}

char *stratum_recv_line(struct stratum_ctx *sctx)
{
	ssize_t len, buflen;
	char *tok, *sret = NULL;

	if (!strstr(sctx->sockbuf, "\n")) {
		bool ret = true;
		time_t rstart;

		time(&rstart);
		if (!socket_full(sctx->sock, 60)) {
			applog(LOG_ERR, "stratum_recv_line timed out");
			goto out;
		}
		do {
			char s[RBUFSIZE];
			ssize_t n;

			memset(s, 0, RBUFSIZE);
#if LIBCURL_VERSION_NUM >= 0x071202
			CURLcode rc = curl_easy_recv(sctx->curl, s, RECVSIZE, (size_t *)&n);
			if (rc == CURLE_OK && !n) {
				ret = false;
				break;
			}
			if (rc != CURLE_OK) {
				if (rc != CURLE_AGAIN || !socket_full(sctx->sock, 1)) {
#else
			n = recv(sctx->sock, s, RECVSIZE, 0);
			if (!n) {
				ret = false;
				break;
			}
			if (n < 0) {
				if (!socket_blocks() || !socket_full(sctx->sock, 1)) {
#endif
					ret = false;
					break;
				}
			} else
				stratum_buffer_append(sctx, s);
		} while (time(NULL) - rstart < 60 && !strstr(sctx->sockbuf, "\n"));

		if (!ret) {
			applog(LOG_ERR, "stratum_recv_line failed");
			goto out;
		}
	}

	buflen = strlen(sctx->sockbuf);
	tok = strtok(sctx->sockbuf, "\n");
	if (!tok) {
		applog(LOG_ERR, "stratum_recv_line failed to parse a newline-terminated string");
		goto out;
	}
	sret = strdup(tok);
	len = strlen(sret);

	if (buflen > len + 1)
		memmove(sctx->sockbuf, sctx->sockbuf + len + 1, buflen - len + 1);
	else
		sctx->sockbuf[0] = '\0';

out:
	if (sret && opt_protocol)
		applog(LOG_DEBUG, "< %s", sret);
	return sret;
}

#if LIBCURL_VERSION_NUM >= 0x071101 && LIBCURL_VERSION_NUM < 0x072d00
static curl_socket_t opensocket_grab_cb(void *clientp, curlsocktype purpose,
	struct curl_sockaddr *addr)
{
	curl_socket_t *sock = clientp;
	*sock = socket(addr->family, addr->socktype, addr->protocol);
	return *sock;
}
#endif

bool stratum_connect(struct stratum_ctx *sctx, const char *url)
{
	CURL *curl;
	int rc;

	pthread_mutex_lock(&sctx->sock_lock);
	if (sctx->curl)
		curl_easy_cleanup(sctx->curl);
	sctx->curl = curl_easy_init();
	if (!sctx->curl) {
		applog(LOG_ERR, "CURL initialization failed");
		pthread_mutex_unlock(&sctx->sock_lock);
		return false;
	}
	curl = sctx->curl;
	if (!sctx->sockbuf) {
		sctx->sockbuf = calloc(RBUFSIZE, 1);
		sctx->sockbuf_size = RBUFSIZE;
	}
	sctx->sockbuf[0] = '\0';
	pthread_mutex_unlock(&sctx->sock_lock);

	if (url != sctx->url) {
		free(sctx->url);
		sctx->url = strdup(url);
	}
	free(sctx->curl_url);
	sctx->curl_url = malloc(strlen(url));
	sprintf(sctx->curl_url, "http%s", url + 11);

	if (opt_protocol)
		curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
	curl_easy_setopt(curl, CURLOPT_URL, sctx->curl_url);
	if (opt_cert)
		curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert);
	curl_easy_setopt(curl, CURLOPT_FRESH_CONNECT, 1);
	curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 30);
	curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, sctx->curl_err_str);
	curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
	curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1);
	if (opt_proxy) {
		curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy);
		curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type);
	}
	curl_easy_setopt(curl, CURLOPT_HTTPPROXYTUNNEL, 1);
#if LIBCURL_VERSION_NUM >= 0x070f06
	curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb);
#endif
#if LIBCURL_VERSION_NUM >= 0x071101 && LIBCURL_VERSION_NUM < 0x072d00
	curl_easy_setopt(curl, CURLOPT_OPENSOCKETFUNCTION, opensocket_grab_cb);
	curl_easy_setopt(curl, CURLOPT_OPENSOCKETDATA, &sctx->sock);
#endif
	curl_easy_setopt(curl, CURLOPT_CONNECT_ONLY, 1);

	rc = curl_easy_perform(curl);
	if (rc) {
		applog(LOG_ERR, "Stratum connection failed: %s", sctx->curl_err_str);
		curl_easy_cleanup(curl);
		sctx->curl = NULL;
		return false;
	}

#if LIBCURL_VERSION_NUM >= 0x072d00
	curl_easy_getinfo(curl, CURLINFO_ACTIVESOCKET, &sctx->sock);
#elif LIBCURL_VERSION_NUM < 0x071101
	/* CURLINFO_LASTSOCKET is broken on Win64; only use it as a last resort */
	curl_easy_getinfo(curl, CURLINFO_LASTSOCKET, (long *)&sctx->sock);
#endif

	return true;
}

void stratum_disconnect(struct stratum_ctx *sctx)
{
	pthread_mutex_lock(&sctx->sock_lock);
	if (sctx->curl) {
		curl_easy_cleanup(sctx->curl);
		sctx->curl = NULL;
		sctx->sockbuf[0] = '\0';
	}
	pthread_mutex_unlock(&sctx->sock_lock);
}

static const char *get_stratum_session_id(json_t *val)
{
	json_t *arr_val;
	int i, n;

	arr_val = json_array_get(val, 0);
	if (!arr_val || !json_is_array(arr_val))
		return NULL;
	n = json_array_size(arr_val);
	for (i = 0; i < n; i++) {
		const char *notify;
		json_t *arr = json_array_get(arr_val, i);

		if (!arr || !json_is_array(arr))
			break;
		notify = json_string_value(json_array_get(arr, 0));
		if (!notify)
			continue;
		if (!strcasecmp(notify, "mining.notify"))
			return json_string_value(json_array_get(arr, 1));
	}
	return NULL;
}

bool stratum_subscribe(struct stratum_ctx *sctx)
{
	char *s, *sret = NULL;
	const char *sid, *xnonce1;
	int xn2_size;
	json_t *val = NULL, *res_val, *err_val;
	json_error_t err;
	bool ret = false, retry = false;

start:
	s = malloc(128 + (sctx->session_id ? strlen(sctx->session_id) : 0));
	if (retry)
		sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": []}");
	else if (sctx->session_id)
		sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": [\"" USER_AGENT "\", \"%s\"]}", sctx->session_id);
	else
		sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": [\"" USER_AGENT "\"]}");

	if (!stratum_send_line(sctx, s)) {
		applog(LOG_ERR, "stratum_subscribe send failed");
		goto out;
	}

	if (!socket_full(sctx->sock, 30)) {
		applog(LOG_ERR, "stratum_subscribe timed out");
		goto out;
	}

	sret = stratum_recv_line(sctx);
	if (!sret)
		goto out;

	val = JSON_LOADS(sret, &err);
	free(sret);
	if (!val) {
		applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text);
		goto out;
	}

	res_val = json_object_get(val, "result");
	err_val = json_object_get(val, "error");

	if (!res_val || json_is_null(res_val) ||
	    (err_val && !json_is_null(err_val))) {
		if (opt_debug || retry) {
			free(s);
			if (err_val)
				s = json_dumps(err_val, JSON_INDENT(3));
			else
				s = strdup("(unknown reason)");
			applog(LOG_ERR, "JSON-RPC call failed: %s", s);
		}
		goto out;
	}

	sid = get_stratum_session_id(res_val);
	if (opt_debug && !sid)
		applog(LOG_DEBUG, "Failed to get Stratum session id");
	xnonce1 = json_string_value(json_array_get(res_val, 1));
	if (!xnonce1) {
		applog(LOG_ERR, "Failed to get extranonce1");
		goto out;
	}
	xn2_size = json_integer_value(json_array_get(res_val, 2));
	if (!xn2_size) {
		applog(LOG_ERR, "Failed to get extranonce2_size");
		goto out;
	}
	if (xn2_size < 0 || xn2_size > 100) {
		applog(LOG_ERR, "Invalid value of extranonce2_size");
		goto out;
	}

	pthread_mutex_lock(&sctx->work_lock);
	free(sctx->session_id);
	free(sctx->xnonce1);
	sctx->session_id = sid ? strdup(sid) : NULL;
	sctx->xnonce1_size = strlen(xnonce1) / 2;
	sctx->xnonce1 = malloc(sctx->xnonce1_size);
	hex2bin(sctx->xnonce1, xnonce1, sctx->xnonce1_size);
	sctx->xnonce2_size = xn2_size;
	sctx->next_diff = 1.0;
	pthread_mutex_unlock(&sctx->work_lock);

	if (opt_debug && sid)
		applog(LOG_DEBUG, "Stratum session id: %s", sctx->session_id);

	ret = true;

out:
	free(s);
	if (val) {
		json_decref(val);
		val = NULL;	/* avoid a double decref if we retry and fail early */
	}

	if (!ret) {
		if (sret && !retry) {
			retry = true;
			goto start;
		}
	}

	return ret;
}

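/* Send mining.authorize for the given worker credentials and wait for the
 * result, dispatching any server method calls that arrive in the meantime. */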
bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass)
{
	json_t *val = NULL, *res_val, *err_val;
	char *s, *sret;
	json_error_t err;
	bool ret = false;

	s = malloc(80 + strlen(user) + strlen(pass));
	sprintf(s, "{\"id\": 2, \"method\": \"mining.authorize\", \"params\": [\"%s\", \"%s\"]}",
	        user, pass);

	if (!stratum_send_line(sctx, s))
		goto out;

	while (1) {
		sret = stratum_recv_line(sctx);
		if (!sret)
			goto out;
		if (!stratum_handle_method(sctx, sret))
			break;
		free(sret);
	}

	val = JSON_LOADS(sret, &err);
	free(sret);
	if (!val) {
		applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text);
		goto out;
	}

	res_val = json_object_get(val, "result");
	err_val = json_object_get(val, "error");

	if (!res_val || json_is_false(res_val) ||
	    (err_val && !json_is_null(err_val)))  {
		applog(LOG_ERR, "Stratum authentication failed");
		goto out;
	}

	ret = true;

out:
	free(s);
	if (val)
		json_decref(val);

	return ret;
}

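/* Handle mining.notify: validate the job parameters, decode the Merkle
 * branches, rebuild the coinbase around our extranonce area, and publish
 * the new job under work_lock. */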
static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
{
	const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *ntime;
	size_t coinb1_size, coinb2_size;
	bool clean, ret = false;
	int merkle_count, i;
	json_t *merkle_arr;
	unsigned char **merkle;

	job_id = json_string_value(json_array_get(params, 0));
	prevhash = json_string_value(json_array_get(params, 1));
	coinb1 = json_string_value(json_array_get(params, 2));
	coinb2 = json_string_value(json_array_get(params, 3));
	merkle_arr = json_array_get(params, 4);
	if (!merkle_arr || !json_is_array(merkle_arr))
		goto out;
	merkle_count = json_array_size(merkle_arr);
	version = json_string_value(json_array_get(params, 5));
	nbits = json_string_value(json_array_get(params, 6));
	ntime = json_string_value(json_array_get(params, 7));
	clean = json_is_true(json_array_get(params, 8));

	if (!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !ntime ||
	    strlen(prevhash) != 64 || strlen(version) != 8 ||
	    strlen(nbits) != 8 || strlen(ntime) != 8) {
		applog(LOG_ERR, "Stratum notify: invalid parameters");
		goto out;
	}
	merkle = malloc(merkle_count * sizeof(char *));
	for (i = 0; i < merkle_count; i++) {
		const char *s = json_string_value(json_array_get(merkle_arr, i));
		if (!s || strlen(s) != 64) {
			while (i--)
				free(merkle[i]);
			free(merkle);
			applog(LOG_ERR, "Stratum notify: invalid Merkle branch");
			goto out;
		}
		merkle[i] = malloc(32);
		hex2bin(merkle[i], s, 32);
	}

	pthread_mutex_lock(&sctx->work_lock);

	coinb1_size = strlen(coinb1) / 2;
	coinb2_size = strlen(coinb2) / 2;
	sctx->job.coinbase_size = coinb1_size + sctx->xnonce1_size +
	                          sctx->xnonce2_size + coinb2_size;
	sctx->job.coinbase = realloc(sctx->job.coinbase, sctx->job.coinbase_size);
	sctx->job.xnonce2 = sctx->job.coinbase + coinb1_size + sctx->xnonce1_size;
	hex2bin(sctx->job.coinbase, coinb1, coinb1_size);
	memcpy(sctx->job.coinbase + coinb1_size, sctx->xnonce1, sctx->xnonce1_size);
	if (!sctx->job.job_id || strcmp(sctx->job.job_id, job_id))
		memset(sctx->job.xnonce2, 0, sctx->xnonce2_size);
	hex2bin(sctx->job.xnonce2 + sctx->xnonce2_size, coinb2, coinb2_size);

	free(sctx->job.job_id);
	sctx->job.job_id = strdup(job_id);
	hex2bin(sctx->job.prevhash, prevhash, 32);

	for (i = 0; i < sctx->job.merkle_count; i++)
		free(sctx->job.merkle[i]);
	free(sctx->job.merkle);
	sctx->job.merkle = merkle;
	sctx->job.merkle_count = merkle_count;

	hex2bin(sctx->job.version, version, 4);
	hex2bin(sctx->job.nbits, nbits, 4);
	hex2bin(sctx->job.ntime, ntime, 4);
	sctx->job.clean = clean;

	sctx->job.diff = sctx->next_diff;

	pthread_mutex_unlock(&sctx->work_lock);

	ret = true;

out:
	return ret;
}

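/* Handle mining.set_difficulty: remember the new difficulty; it takes
 * effect with the next job delivered by mining.notify. */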
static bool stratum_set_difficulty(struct stratum_ctx *sctx, json_t *params)
{
	double diff;

	diff = json_number_value(json_array_get(params, 0));
	if (diff == 0)
		return false;

	pthread_mutex_lock(&sctx->work_lock);
	sctx->next_diff = diff;
	pthread_mutex_unlock(&sctx->work_lock);

	if (opt_debug)
		applog(LOG_DEBUG, "Stratum difficulty set to %g", diff);

	return true;
}

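/* Handle client.reconnect: rebuild the URL from the host and port given by
 * the server and drop the connection, unless redirects are disabled. */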
static bool stratum_reconnect(struct stratum_ctx *sctx, json_t *params)
{
	json_t *port_val;
	char *url;
	const char *host;
	int port;

	host = json_string_value(json_array_get(params, 0));
	port_val = json_array_get(params, 1);
	if (json_is_string(port_val))
		port = atoi(json_string_value(port_val));
	else
		port = json_integer_value(port_val);
	if (!host || !port)
		return false;

	url = malloc(32 + strlen(host));
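	/* copy the scheme prefix (e.g. "stratum+tcp://") from the current URL;
	 * 15 bytes are enough to include the "://" separator that the sprintf
	 * below locates */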
	strncpy(url, sctx->url, 15);
	sprintf(strstr(url, "://") + 3, "%s:%d", host, port);

	if (!opt_redirect) {
		applog(LOG_INFO, "Ignoring request to reconnect to %s", url);
		free(url);
		return true;
	}

	applog(LOG_NOTICE, "Server requested reconnection to %s", url);

	free(sctx->url);
	sctx->url = url;
	stratum_disconnect(sctx);

	return true;
}

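/* Handle client.get_version by answering with our user agent string. */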
static bool stratum_get_version(struct stratum_ctx *sctx, json_t *id)
{
	char *s;
	json_t *val;
	bool ret;
	
	if (!id || json_is_null(id))
		return false;

	val = json_object();
	json_object_set(val, "id", id);
	json_object_set_new(val, "error", json_null());
	json_object_set_new(val, "result", json_string(USER_AGENT));
	s = json_dumps(val, 0);
	ret = stratum_send_line(sctx, s);
	json_decref(val);
	free(s);

	return ret;
}

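/* Handle client.show_message: log the server's message and, if the request
 * carries an id, acknowledge it with a true result. */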
static bool stratum_show_message(struct stratum_ctx *sctx, json_t *id, json_t *params)
{
	char *s;
	json_t *val;
	bool ret;

	val = json_array_get(params, 0);
	if (val && json_is_string(val))
		applog(LOG_NOTICE, "MESSAGE FROM SERVER: %s", json_string_value(val));

	if (!id || json_is_null(id))
		return true;

	val = json_object();
	json_object_set(val, "id", id);
	json_object_set_new(val, "error", json_null());
	json_object_set_new(val, "result", json_true());
	s = json_dumps(val, 0);
	ret = stratum_send_line(sctx, s);
	json_decref(val);
	free(s);

	return ret;
}

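/* Parse a line received from the server; if it is a method call we know,
 * dispatch it and return true.  Returns false for responses and unknown
 * methods so that callers can handle the line themselves. */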
bool stratum_handle_method(struct stratum_ctx *sctx, const char *s)
{
	json_t *val, *id, *params;
	json_error_t err;
	const char *method;
	bool ret = false;

	val = JSON_LOADS(s, &err);
	if (!val) {
		applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text);
		goto out;
	}

	method = json_string_value(json_object_get(val, "method"));
	if (!method)
		goto out;
	id = json_object_get(val, "id");
	params = json_object_get(val, "params");

	if (!strcasecmp(method, "mining.notify")) {
		ret = stratum_notify(sctx, params);
		goto out;
	}
	if (!strcasecmp(method, "mining.set_difficulty")) {
		ret = stratum_set_difficulty(sctx, params);
		goto out;
	}
	if (!strcasecmp(method, "client.reconnect")) {
		ret = stratum_reconnect(sctx, params);
		goto out;
	}
	if (!strcasecmp(method, "client.get_version")) {
		ret = stratum_get_version(sctx, id);
		goto out;
	}
	if (!strcasecmp(method, "client.show_message")) {
		ret = stratum_show_message(sctx, id, params);
		goto out;
	}

out:
	if (val)
		json_decref(val);

	return ret;
}

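/* Mutex-protected FIFO used to hand work items between threads.
 * Typical usage (sketch):
 *
 *	struct thread_q *tq = tq_new();
 *	tq_push(tq, item);               // producer side
 *	void *item = tq_pop(tq, NULL);   // consumer blocks until data arrives
 *	tq_free(tq);
 */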
struct thread_q *tq_new(void)
{
	struct thread_q *tq;

	tq = calloc(1, sizeof(*tq));
	if (!tq)
		return NULL;

	INIT_LIST_HEAD(&tq->q);
	pthread_mutex_init(&tq->mutex, NULL);
	pthread_cond_init(&tq->cond, NULL);

	return tq;
}

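/* Destroy a queue, freeing any entries still linked on it.  Note that the
 * data pointers held by those entries are not freed. */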
void tq_free(struct thread_q *tq)
{
	struct tq_ent *ent, *iter;

	if (!tq)
		return;

	list_for_each_entry_safe(ent, iter, &tq->q, q_node, struct tq_ent) {
		list_del(&ent->q_node);
		free(ent);
	}

	pthread_cond_destroy(&tq->cond);
	pthread_mutex_destroy(&tq->mutex);

	memset(tq, 0, sizeof(*tq));	/* poison */
	free(tq);
}

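/* Set the frozen flag and wake any waiter; while frozen, tq_push rejects
 * new entries. */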
static void tq_freezethaw(struct thread_q *tq, bool frozen)
{
	pthread_mutex_lock(&tq->mutex);

	tq->frozen = frozen;

	pthread_cond_signal(&tq->cond);
	pthread_mutex_unlock(&tq->mutex);
}

void tq_freeze(struct thread_q *tq)
{
	tq_freezethaw(tq, true);
}

void tq_thaw(struct thread_q *tq)
{
	tq_freezethaw(tq, false);
}

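/* Append an item to the queue and signal one waiting consumer.  Fails if
 * the entry cannot be allocated or the queue is frozen. */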
bool tq_push(struct thread_q *tq, void *data)
{
	struct tq_ent *ent;
	bool rc = true;

	ent = calloc(1, sizeof(*ent));
	if (!ent)
		return false;

	ent->data = data;
	INIT_LIST_HEAD(&ent->q_node);

	pthread_mutex_lock(&tq->mutex);

	if (!tq->frozen) {
		list_add_tail(&ent->q_node, &tq->q);
	} else {
		free(ent);
		rc = false;
	}

	pthread_cond_signal(&tq->cond);
	pthread_mutex_unlock(&tq->mutex);

	return rc;
}

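/* Dequeue the oldest item, blocking on the condition variable (with an
 * optional absolute deadline) while the queue is empty.  Returns NULL if
 * the wait fails or times out, or the queue is still empty on wakeup. */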
void *tq_pop(struct thread_q *tq, const struct timespec *abstime)
{
	struct tq_ent *ent;
	void *rval = NULL;
	int rc;

	pthread_mutex_lock(&tq->mutex);

	if (!list_empty(&tq->q))
		goto pop;

	if (abstime)
		rc = pthread_cond_timedwait(&tq->cond, &tq->mutex, abstime);
	else
		rc = pthread_cond_wait(&tq->cond, &tq->mutex);
	if (rc)
		goto out;
	if (list_empty(&tq->q))
		goto out;

pop:
	ent = list_entry(tq->q.next, struct tq_ent, q_node);
	rval = ent->data;

	list_del(&ent->q_node);
	free(ent);

out:
	pthread_mutex_unlock(&tq->mutex);
	return rval;
}