Feat: gemm, linpack, stream, whetstone out-of-box in am-kernel.

2024-06-18 23:17:09 +08:00 · 2024-06-18 23:17:09 +08:00 · cbee9324c1
commit cbee9324c1
parent 5cd6edc02e
399 changed files with 55469 additions and 4 deletions
--- a/src/gemm/Makefile
+++ b/src/gemm/Makefile
@ -0,0 +1,3 @@
+NAME = gemm
+SRCS = $(shell find soft-fp/ -name "*.c") gemm.c matmul.c
+include $(AM_HOME)/Makefile
--- a/src/gemm/gemm.c
+++ b/src/gemm/gemm.c
@ -34,9 +34,9 @@ void display(double * matrix, int m, int n){

 int main(){

-    int m = 200;
-    int n = 200;
-    int k = 200;
+    int m = 100;
+    int n = 100;
+    int k = 100;

    double * A = (double*)malloc(m*k*sizeof(double));
    double * B = (double*)malloc(k*n*sizeof(double));
@ -72,4 +72,4 @@ int main(){
    
    printf("Dot product took %f seconds GFLOPS : %f\n",duration,gflops/duration);
    return 0;
-}
+}
--- a/src/gemm/include/gemm.h
+++ b/src/gemm/include/gemm.h
@ -0,0 +1,10 @@
+#include <am.h>
+#include <klib.h>
+#include <klib-macros.h>
+
+
+void AddDot4x4( int, double *, int, double *, int, double *, int );  /* parameter names omitted here; definition is outside this header (presumably matmul.c -- TODO confirm) */
+void PackMatrixA( int, double *, int, double * );  /* NOTE(review): name suggests packing a panel of A; semantics not visible from this header */
+void PackMatrixB( int, double *, int, double * );  /* NOTE(review): name suggests packing a panel of B; semantics not visible from this header */
+void InnerKernel( int, int, int, double *, int, double *, int, double *, int, int );  /* parameter meaning not visible from this header */
+void matmul( int m, int n, int k, double *a, int lda, double *b, int ldb,double *c, int ldc );  /* lda/ldb/ldc are presumably the leading dimensions of a/b/c -- confirm against the definition */
--- a/src/gemm/soft-fp/aa-README.txt
+++ b/src/gemm/soft-fp/aa-README.txt
@ -0,0 +1,76 @@
+https://gcc.gnu.org/onlinedocs/gccint/Soft-float-library-routines.html
+
+1. Arithmetic functions
+
+Runtime Function: float __addsf3 (float a, float b)
+Runtime Function: double __adddf3 (double a, double b)
+    These functions return the sum of a and b.
+
+Runtime Function: float __subsf3 (float a, float b)
+Runtime Function: double __subdf3 (double a, double b)
+    These functions return the difference between a and b; that is, a - b.
+
+Runtime Function: float __mulsf3 (float a, float b)
+Runtime Function: double __muldf3 (double a, double b)
+    These functions return the product of a and b.
+
+Runtime Function: float __divsf3 (float a, float b)
+Runtime Function: double __divdf3 (double a, double b)
+    These functions return the quotient of a and b; that is, a / b.
+
+Runtime Function: float __negsf2 (float a)
+Runtime Function: double __negdf2 (double a)
+    These functions return the negation of a. They simply flip the sign bit, so they can produce negative zero and negative NaN.
+
+2. Conversion functions
+
+Runtime Function: double __extendsfdf2 (float a)
+    This function extends a to the wider mode of its return type.
+
+Runtime Function: float __truncdfsf2 (double a)
+    This function truncates a to the narrower mode of its return type, rounding toward zero.
+
+Runtime Function: int __fixsfsi (float a)
+Runtime Function: int __fixdfsi (double a)
+    These functions convert a to a signed integer, rounding toward zero.
+
+Runtime Function: long __fixsfdi (float a)
+Runtime Function: long __fixdfdi (double a)
+    These functions convert a to a signed long, rounding toward zero.
+
+Runtime Function: long long __fixsfti (float a)
+Runtime Function: long long __fixdfti (double a)
+    These functions convert a to a signed long long, rounding toward zero.
+
+
+Runtime Function: unsigned int __fixunssfsi (float a)
+Runtime Function: unsigned int __fixunsdfsi (double a)
+    These functions convert a to an unsigned integer, rounding toward zero. Negative values all become zero.
+
+Runtime Function: unsigned long __fixunssfdi (float a)
+Runtime Function: unsigned long __fixunsdfdi (double a)
+    These functions convert a to an unsigned long, rounding toward zero. Negative values all become zero.
+
+Runtime Function: unsigned long long __fixunssfti (float a)
+Runtime Function: unsigned long long __fixunsdfti (double a)
+    These functions convert a to an unsigned long long, rounding toward zero. Negative values all become zero.
+
+
+Runtime Function: float __floatsisf (int i)
+Runtime Function: double __floatsidf (int i)
+    These functions convert i, a signed integer, to floating point.
+
+Runtime Function: float __floatdisf (long i)
+Runtime Function: double __floatdidf (long i)
+    These functions convert i, a signed long, to floating point.
+
+
+Runtime Function: float __floatunsisf (unsigned int i)
+Runtime Function: double __floatunsidf (unsigned int i)
+    These functions convert i, an unsigned integer, to floating point.
+
+Runtime Function: float __floatundisf (unsigned long i)
+Runtime Function: double __floatundidf (unsigned long i)
+    These functions convert i, an unsigned long, to floating point.
+
+3. Comparison functions
--- a/src/gemm/soft-fp/adddf3.c
+++ b/src/gemm/soft-fp/adddf3.c
@ -0,0 +1,21 @@
+#include "soft-fp.h"
+#include "double.h"
+
+DFtype
+__adddf3 (DFtype a, DFtype b)  /* Soft-float double-precision addition: returns a + b. */
+{
+  FP_DECL_EX;  /* exception-state declaration; macro comes from soft-fp.h (not shown here) */
+  FP_DECL_D (A);  /* unpacked working form of a */
+  FP_DECL_D (B);  /* unpacked working form of b */
+  FP_DECL_D (R);  /* unpacked working form of the result */
+  DFtype r;
+
+  FP_INIT_ROUNDMODE;  /* presumably latches the active rounding mode -- defined outside this file, TODO confirm */
+  FP_UNPACK_SEMIRAW_D (A, a);  /* raw unpack followed by _FP_UNPACK_SEMIRAW (see double.h) */
+  FP_UNPACK_SEMIRAW_D (B, b);
+  FP_ADD_D (R, A, B);  /* R = A + B, expands to _FP_ADD */
+  FP_PACK_SEMIRAW_D (r, R);  /* _FP_PACK_SEMIRAW followed by raw pack of R into r */
+  FP_HANDLE_EXCEPTIONS;  /* presumably reports exceptions recorded above -- TODO confirm in soft-fp.h */
+
+  return r;
+}
--- a/src/gemm/soft-fp/addsf3.c
+++ b/src/gemm/soft-fp/addsf3.c
@ -0,0 +1,23 @@
+#include "soft-fp.h"
+#include "single.h"
+
+SFtype
+__addsf3 (SFtype a, SFtype b)  /* Soft-float single-precision addition: returns a + b. */
+{
+  FP_DECL_EX;  /* exception-state declaration; macro comes from soft-fp.h (not shown here) */
+  FP_DECL_S (A);  /* unpacked working form of a */
+  FP_DECL_S (B);  /* unpacked working form of b */
+  FP_DECL_S (R);  /* unpacked working form of the result */
+  SFtype r;
+
+  FP_INIT_ROUNDMODE;  /* presumably latches the active rounding mode -- defined outside this file, TODO confirm */
+  FP_UNPACK_SEMIRAW_S (A, a);  /* single-precision counterpart of FP_UNPACK_SEMIRAW_D; defined in single.h (not shown) */
+  FP_UNPACK_SEMIRAW_S (B, b);
+  FP_ADD_S (R, A, B);  /* R = A + B */
+  FP_PACK_SEMIRAW_S (r, R);  /* pack R back into the SFtype result r */
+  FP_HANDLE_EXCEPTIONS;  /* presumably reports exceptions recorded above -- TODO confirm in soft-fp.h */
+
+  return r;
+}
+
+
--- a/src/gemm/soft-fp/divdf3.c
+++ b/src/gemm/soft-fp/divdf3.c
@ -0,0 +1,21 @@
+#include "soft-fp.h"
+#include "double.h"
+
+DFtype
+__divdf3 (DFtype a, DFtype b)  /* Soft-float double-precision division: returns a / b. */
+{
+  FP_DECL_EX;  /* exception-state declaration; macro comes from soft-fp.h (not shown here) */
+  FP_DECL_D (A);  /* unpacked working form of the dividend a */
+  FP_DECL_D (B);  /* unpacked working form of the divisor b */
+  FP_DECL_D (R);  /* unpacked working form of the quotient */
+  DFtype r;
+
+  FP_INIT_ROUNDMODE;  /* presumably latches the active rounding mode -- defined outside this file, TODO confirm */
+  FP_UNPACK_D (A, a);  /* raw unpack followed by _FP_UNPACK_CANONICAL (see double.h); unlike add, division uses the canonical form */
+  FP_UNPACK_D (B, b);
+  FP_DIV_D (R, A, B);  /* R = A / B, expands to _FP_DIV */
+  FP_PACK_D (r, R);  /* _FP_PACK_CANONICAL followed by raw pack of R into r */
+  FP_HANDLE_EXCEPTIONS;  /* presumably reports exceptions recorded above -- TODO confirm in soft-fp.h */
+
+  return r;
+}
--- a/src/gemm/soft-fp/divsf3.c
+++ b/src/gemm/soft-fp/divsf3.c
@ -0,0 +1,21 @@
+#include "soft-fp.h"
+#include "single.h"
+
+SFtype
+__divsf3 (SFtype a, SFtype b)  /* Soft-float single-precision division: returns a / b. */
+{
+  FP_DECL_EX;  /* exception-state declaration; macro comes from soft-fp.h (not shown here) */
+  FP_DECL_S (A);  /* unpacked working form of the dividend a */
+  FP_DECL_S (B);  /* unpacked working form of the divisor b */
+  FP_DECL_S (R);  /* unpacked working form of the quotient */
+  SFtype r;
+
+  FP_INIT_ROUNDMODE;  /* presumably latches the active rounding mode -- defined outside this file, TODO confirm */
+  FP_UNPACK_S (A, a);  /* single-precision counterpart of FP_UNPACK_D; defined in single.h (not shown) */
+  FP_UNPACK_S (B, b);
+  FP_DIV_S (R, A, B);  /* R = A / B */
+  FP_PACK_S (r, R);  /* pack R back into the SFtype result r */
+  FP_HANDLE_EXCEPTIONS;  /* presumably reports exceptions recorded above -- TODO confirm in soft-fp.h */
+
+  return r;
+}
--- a/src/gemm/soft-fp/double.h
+++ b/src/gemm/soft-fp/double.h
@ -0,0 +1,323 @@
+/* Software floating-point emulation.
+   Definitions for IEEE Double Precision
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com),
+		  Jakub Jelinek (jj@ultra.linux.cz),
+		  David S. Miller (davem@redhat.com) and
+		  Peter Maydell (pmaydell@chiark.greenend.org.uk).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef SOFT_FP_DOUBLE_H
+#define SOFT_FP_DOUBLE_H	1
+
+#if _FP_W_TYPE_SIZE < 32
+# error "Here's a nickel kid.  Go buy yourself a real computer."
+#endif
+
+#if _FP_W_TYPE_SIZE < 64
+# define _FP_FRACTBITS_D	(2 * _FP_W_TYPE_SIZE)
+# define _FP_FRACTBITS_DW_D	(4 * _FP_W_TYPE_SIZE)
+#else
+# define _FP_FRACTBITS_D	_FP_W_TYPE_SIZE
+# define _FP_FRACTBITS_DW_D	(2 * _FP_W_TYPE_SIZE)
+#endif
+
+#define _FP_FRACBITS_D		53
+#define _FP_FRACXBITS_D		(_FP_FRACTBITS_D - _FP_FRACBITS_D)
+#define _FP_WFRACBITS_D		(_FP_WORKBITS + _FP_FRACBITS_D)
+#define _FP_WFRACXBITS_D	(_FP_FRACTBITS_D - _FP_WFRACBITS_D)
+#define _FP_EXPBITS_D		11
+#define _FP_EXPBIAS_D		1023
+#define _FP_EXPMAX_D		2047
+
+#define _FP_QNANBIT_D		\
+	((_FP_W_TYPE) 1 << (_FP_FRACBITS_D-2) % _FP_W_TYPE_SIZE)
+#define _FP_QNANBIT_SH_D		\
+	((_FP_W_TYPE) 1 << (_FP_FRACBITS_D-2+_FP_WORKBITS) % _FP_W_TYPE_SIZE)
+#define _FP_IMPLBIT_D		\
+	((_FP_W_TYPE) 1 << (_FP_FRACBITS_D-1) % _FP_W_TYPE_SIZE)
+#define _FP_IMPLBIT_SH_D		\
+	((_FP_W_TYPE) 1 << (_FP_FRACBITS_D-1+_FP_WORKBITS) % _FP_W_TYPE_SIZE)
+#define _FP_OVERFLOW_D		\
+	((_FP_W_TYPE) 1 << _FP_WFRACBITS_D % _FP_W_TYPE_SIZE)
+
+#define _FP_WFRACBITS_DW_D	(2 * _FP_WFRACBITS_D)
+#define _FP_WFRACXBITS_DW_D	(_FP_FRACTBITS_DW_D - _FP_WFRACBITS_DW_D)
+#define _FP_HIGHBIT_DW_D	\
+  ((_FP_W_TYPE) 1 << (_FP_WFRACBITS_DW_D - 1) % _FP_W_TYPE_SIZE)
+
+typedef float DFtype __attribute__ ((mode (DF)));
+
+#if _FP_W_TYPE_SIZE < 64
+
+union _FP_UNION_D  /* Overlay of a DFtype value with its bit fields; this variant is used when _FP_W_TYPE_SIZE < 64. */
+{
+  DFtype flt;  /* the value viewed as a double */
+  struct _FP_STRUCT_LAYOUT
+  {
+# if __BYTE_ORDER == __BIG_ENDIAN  /* field order is reversed between the two byte orders */
+    unsigned sign  : 1;
+    unsigned exp   : _FP_EXPBITS_D;  /* 11 exponent bits */
+    unsigned frac1 : _FP_FRACBITS_D - (_FP_IMPLBIT_D != 0) - _FP_W_TYPE_SIZE;  /* fraction bits above the low word; one bit is subtracted when _FP_IMPLBIT_D is nonzero */
+    unsigned frac0 : _FP_W_TYPE_SIZE;  /* low word of the fraction */
+# else
+    unsigned frac0 : _FP_W_TYPE_SIZE;  /* low word of the fraction */
+    unsigned frac1 : _FP_FRACBITS_D - (_FP_IMPLBIT_D != 0) - _FP_W_TYPE_SIZE;  /* fraction bits above the low word; one bit is subtracted when _FP_IMPLBIT_D is nonzero */
+    unsigned exp   : _FP_EXPBITS_D;  /* 11 exponent bits */
+    unsigned sign  : 1;
+# endif
+  } bits;  /* the same storage viewed as sign / exponent / fraction fields */
+};
+
+# define FP_DECL_D(X)		_FP_DECL (2, X)
+# define FP_UNPACK_RAW_D(X, val)	_FP_UNPACK_RAW_2 (D, X, (val))
+# define FP_UNPACK_RAW_DP(X, val)	_FP_UNPACK_RAW_2_P (D, X, (val))
+# define FP_PACK_RAW_D(val, X)	_FP_PACK_RAW_2 (D, (val), X)
+# define FP_PACK_RAW_DP(val, X)			\
+  do						\
+    {						\
+      if (!FP_INHIBIT_RESULTS)			\
+	_FP_PACK_RAW_2_P (D, (val), X);		\
+    }						\
+  while (0)
+
+# define FP_UNPACK_D(X, val)			\
+  do						\
+    {						\
+      _FP_UNPACK_RAW_2 (D, X, (val));		\
+      _FP_UNPACK_CANONICAL (D, 2, X);		\
+    }						\
+  while (0)
+
+# define FP_UNPACK_DP(X, val)			\
+  do						\
+    {						\
+      _FP_UNPACK_RAW_2_P (D, X, (val));		\
+      _FP_UNPACK_CANONICAL (D, 2, X);		\
+    }						\
+  while (0)
+
+# define FP_UNPACK_SEMIRAW_D(X, val)		\
+  do						\
+    {						\
+      _FP_UNPACK_RAW_2 (D, X, (val));		\
+      _FP_UNPACK_SEMIRAW (D, 2, X);		\
+    }						\
+  while (0)
+
+# define FP_UNPACK_SEMIRAW_DP(X, val)		\
+  do						\
+    {						\
+      _FP_UNPACK_RAW_2_P (D, X, (val));		\
+      _FP_UNPACK_SEMIRAW (D, 2, X);		\
+    }						\
+  while (0)
+
+# define FP_PACK_D(val, X)			\
+  do						\
+    {						\
+      _FP_PACK_CANONICAL (D, 2, X);		\
+      _FP_PACK_RAW_2 (D, (val), X);		\
+    }						\
+  while (0)
+
+# define FP_PACK_DP(val, X)			\
+  do						\
+    {						\
+      _FP_PACK_CANONICAL (D, 2, X);		\
+      if (!FP_INHIBIT_RESULTS)			\
+	_FP_PACK_RAW_2_P (D, (val), X);		\
+    }						\
+  while (0)
+
+# define FP_PACK_SEMIRAW_D(val, X)		\
+  do						\
+    {						\
+      _FP_PACK_SEMIRAW (D, 2, X);		\
+      _FP_PACK_RAW_2 (D, (val), X);		\
+    }						\
+  while (0)
+
+# define FP_PACK_SEMIRAW_DP(val, X)		\
+  do						\
+    {						\
+      _FP_PACK_SEMIRAW (D, 2, X);		\
+      if (!FP_INHIBIT_RESULTS)			\
+	_FP_PACK_RAW_2_P (D, (val), X);		\
+    }						\
+  while (0)
+
+# define FP_ISSIGNAN_D(X)		_FP_ISSIGNAN (D, 2, X)
+# define FP_NEG_D(R, X)			_FP_NEG (D, 2, R, X)
+# define FP_ADD_D(R, X, Y)		_FP_ADD (D, 2, R, X, Y)
+# define FP_SUB_D(R, X, Y)		_FP_SUB (D, 2, R, X, Y)
+# define FP_MUL_D(R, X, Y)		_FP_MUL (D, 2, R, X, Y)
+# define FP_DIV_D(R, X, Y)		_FP_DIV (D, 2, R, X, Y)
+# define FP_SQRT_D(R, X)		_FP_SQRT (D, 2, R, X)
+# define _FP_SQRT_MEAT_D(R, S, T, X, Q)	_FP_SQRT_MEAT_2 (R, S, T, X, (Q))
+# define FP_FMA_D(R, X, Y, Z)		_FP_FMA (D, 2, 4, R, X, Y, Z)
+
+# define FP_CMP_D(r, X, Y, un, ex)	_FP_CMP (D, 2, (r), X, Y, (un), (ex))
+# define FP_CMP_EQ_D(r, X, Y, ex)	_FP_CMP_EQ (D, 2, (r), X, Y, (ex))
+# define FP_CMP_UNORD_D(r, X, Y, ex)	_FP_CMP_UNORD (D, 2, (r), X, Y, (ex))
+
+# define FP_TO_INT_D(r, X, rsz, rsg)	_FP_TO_INT (D, 2, (r), X, (rsz), (rsg))
+# define FP_TO_INT_ROUND_D(r, X, rsz, rsg)	\
+  _FP_TO_INT_ROUND (D, 2, (r), X, (rsz), (rsg))
+# define FP_FROM_INT_D(X, r, rs, rt)	_FP_FROM_INT (D, 2, X, (r), (rs), rt)
+
+# define _FP_FRAC_HIGH_D(X)	_FP_FRAC_HIGH_2 (X)
+# define _FP_FRAC_HIGH_RAW_D(X)	_FP_FRAC_HIGH_2 (X)
+
+# define _FP_FRAC_HIGH_DW_D(X)	_FP_FRAC_HIGH_4 (X)
+
+#else
+
+union _FP_UNION_D  /* Overlay of a DFtype value with its bit fields; this variant is used when _FP_W_TYPE_SIZE >= 64. */
+{
+  DFtype flt;  /* the value viewed as a double */
+  struct _FP_STRUCT_LAYOUT
+  {
+# if __BYTE_ORDER == __BIG_ENDIAN  /* field order is reversed between the two byte orders */
+    unsigned sign   : 1;
+    unsigned exp    : _FP_EXPBITS_D;  /* 11 exponent bits */
+    _FP_W_TYPE frac : _FP_FRACBITS_D - (_FP_IMPLBIT_D != 0);  /* whole fraction in one word-typed field; one bit is subtracted when _FP_IMPLBIT_D is nonzero */
+# else
+    _FP_W_TYPE frac : _FP_FRACBITS_D - (_FP_IMPLBIT_D != 0);  /* whole fraction in one word-typed field; one bit is subtracted when _FP_IMPLBIT_D is nonzero */
+    unsigned exp    : _FP_EXPBITS_D;  /* 11 exponent bits */
+    unsigned sign   : 1;
+# endif
+  } bits;  /* the same storage viewed as sign / exponent / fraction fields */
+};
+
+# define FP_DECL_D(X)		_FP_DECL (1, X)
+# define FP_UNPACK_RAW_D(X, val)	_FP_UNPACK_RAW_1 (D, X, (val))
+# define FP_UNPACK_RAW_DP(X, val)	_FP_UNPACK_RAW_1_P (D, X, (val))
+# define FP_PACK_RAW_D(val, X)	_FP_PACK_RAW_1 (D, (val), X)
+# define FP_PACK_RAW_DP(val, X)			\
+  do						\
+    {						\
+      if (!FP_INHIBIT_RESULTS)			\
+	_FP_PACK_RAW_1_P (D, (val), X);		\
+    }						\
+  while (0)
+
+# define FP_UNPACK_D(X, val)			\
+  do						\
+    {						\
+      _FP_UNPACK_RAW_1 (D, X, (val));		\
+      _FP_UNPACK_CANONICAL (D, 1, X);		\
+    }						\
+  while (0)
+
+# define FP_UNPACK_DP(X, val)			\
+  do						\
+    {						\
+      _FP_UNPACK_RAW_1_P (D, X, (val));		\
+      _FP_UNPACK_CANONICAL (D, 1, X);		\
+    }						\
+  while (0)
+
+# define FP_UNPACK_SEMIRAW_D(X, val)		\
+  do						\
+    {						\
+      _FP_UNPACK_RAW_1 (D, X, (val));		\
+      _FP_UNPACK_SEMIRAW (D, 1, X);		\
+    }						\
+  while (0)
+
+# define FP_UNPACK_SEMIRAW_DP(X, val)		\
+  do						\
+    {						\
+      _FP_UNPACK_RAW_1_P (D, X, (val));		\
+      _FP_UNPACK_SEMIRAW (D, 1, X);		\
+    }						\
+  while (0)
+
+# define FP_PACK_D(val, X)			\
+  do						\
+    {						\
+      _FP_PACK_CANONICAL (D, 1, X);		\
+      _FP_PACK_RAW_1 (D, (val), X);		\
+    }						\
+  while (0)
+
+# define FP_PACK_DP(val, X)			\
+  do						\
+    {						\
+      _FP_PACK_CANONICAL (D, 1, X);		\
+      if (!FP_INHIBIT_RESULTS)			\
+	_FP_PACK_RAW_1_P (D, (val), X);		\
+    }						\
+  while (0)
+
+# define FP_PACK_SEMIRAW_D(val, X)		\
+  do						\
+    {						\
+      _FP_PACK_SEMIRAW (D, 1, X);		\
+      _FP_PACK_RAW_1 (D, (val), X);		\
+    }						\
+  while (0)
+
+# define FP_PACK_SEMIRAW_DP(val, X)		\
+  do						\
+    {						\
+      _FP_PACK_SEMIRAW (D, 1, X);		\
+      if (!FP_INHIBIT_RESULTS)			\
+	_FP_PACK_RAW_1_P (D, (val), X);		\
+    }						\
+  while (0)
+
+# define FP_ISSIGNAN_D(X)		_FP_ISSIGNAN (D, 1, X)
+# define FP_NEG_D(R, X)			_FP_NEG (D, 1, R, X)
+# define FP_ADD_D(R, X, Y)		_FP_ADD (D, 1, R, X, Y)
+# define FP_SUB_D(R, X, Y)		_FP_SUB (D, 1, R, X, Y)
+# define FP_MUL_D(R, X, Y)		_FP_MUL (D, 1, R, X, Y)
+# define FP_DIV_D(R, X, Y)		_FP_DIV (D, 1, R, X, Y)
+# define FP_SQRT_D(R, X)		_FP_SQRT (D, 1, R, X)
+# define _FP_SQRT_MEAT_D(R, S, T, X, Q)	_FP_SQRT_MEAT_1 (R, S, T, X, (Q))
+# define FP_FMA_D(R, X, Y, Z)		_FP_FMA (D, 1, 2, R, X, Y, Z)
+
+/* The implementation of _FP_MUL_D and _FP_DIV_D should be chosen by
+   the target machine.  */
+
+# define FP_CMP_D(r, X, Y, un, ex)	_FP_CMP (D, 1, (r), X, Y, (un), (ex))
+# define FP_CMP_EQ_D(r, X, Y, ex)	_FP_CMP_EQ (D, 1, (r), X, Y, (ex))
+# define FP_CMP_UNORD_D(r, X, Y, ex)	_FP_CMP_UNORD (D, 1, (r), X, Y, (ex))
+
+# define FP_TO_INT_D(r, X, rsz, rsg)	_FP_TO_INT (D, 1, (r), X, (rsz), (rsg))
+# define FP_TO_INT_ROUND_D(r, X, rsz, rsg)	\
+  _FP_TO_INT_ROUND (D, 1, (r), X, (rsz), (rsg))
+# define FP_FROM_INT_D(X, r, rs, rt)	_FP_FROM_INT (D, 1, X, (r), (rs), rt)
+
+# define _FP_FRAC_HIGH_D(X)	_FP_FRAC_HIGH_1 (X)
+# define _FP_FRAC_HIGH_RAW_D(X)	_FP_FRAC_HIGH_1 (X)
+
+# define _FP_FRAC_HIGH_DW_D(X)	_FP_FRAC_HIGH_2 (X)
+
+#endif /* W_TYPE_SIZE < 64 */
+
+#endif /* !SOFT_FP_DOUBLE_H */
--- a/src/gemm/soft-fp/eqdf2.c
+++ b/src/gemm/soft-fp/eqdf2.c
@ -0,0 +1,21 @@
+#include "soft-fp.h"
+#include "double.h"
+
+CMPtype
+__eqdf2 (DFtype a, DFtype b)  /* Soft-float double-precision equality comparison of a and b. */
+{
+  FP_DECL_EX;  /* exception-state declaration; macro comes from soft-fp.h (not shown here) */
+  FP_DECL_D (A);  /* unpacked working form of a */
+  FP_DECL_D (B);  /* unpacked working form of b */
+  CMPtype r;
+
+  FP_INIT_EXCEPTIONS;  /* defined outside this file; note no rounding-mode init is used for a comparison */
+  FP_UNPACK_RAW_D (A, a);  /* raw unpack only -- no canonical/semi-raw step */
+  FP_UNPACK_RAW_D (B, b);
+  FP_CMP_EQ_D (r, A, B, 1);  /* expands to _FP_CMP_EQ; the 1 is its `ex` argument (see FP_CMP_EQ_D in double.h); the encoding of r is set by that macro */
+  FP_HANDLE_EXCEPTIONS;  /* presumably reports exceptions recorded above -- TODO confirm in soft-fp.h */
+
+  return r;
+}
+
+strong_alias (__eqdf2, __nedf2);  /* presumably exports the same code as __nedf2; strong_alias is defined outside this file */
--- a/src/gemm/soft-fp/eqsf2.c
+++ b/src/gemm/soft-fp/eqsf2.c
@ -0,0 +1,21 @@
+#include "soft-fp.h"
+#include "single.h"
+
+CMPtype
+__eqsf2 (SFtype a, SFtype b)  /* Soft-float single-precision equality comparison of a and b. */
+{
+  FP_DECL_EX;  /* exception-state declaration; macro comes from soft-fp.h (not shown here) */
+  FP_DECL_S (A);  /* unpacked working form of a */
+  FP_DECL_S (B);  /* unpacked working form of b */
+  CMPtype r;
+
+  FP_INIT_EXCEPTIONS;  /* defined outside this file; note no rounding-mode init is used for a comparison */
+  FP_UNPACK_RAW_S (A, a);  /* raw unpack only; macro defined in single.h (not shown) */
+  FP_UNPACK_RAW_S (B, b);
+  FP_CMP_EQ_S (r, A, B, 1);  /* single-precision counterpart of FP_CMP_EQ_D; the encoding of r is set by that macro */
+  FP_HANDLE_EXCEPTIONS;  /* presumably reports exceptions recorded above -- TODO confirm in soft-fp.h */
+
+  return r;
+}
+
+strong_alias (__eqsf2, __nesf2);  /* presumably exports the same code as __nesf2; strong_alias is defined outside this file */
--- a/src/gemm/soft-fp/extendsfdf2.c
+++ b/src/gemm/soft-fp/extendsfdf2.c
@ -0,0 +1,26 @@
+#define FP_NO_EXACT_UNDERFLOW
+#include "soft-fp.h"
+#include "single.h"
+#include "double.h"
+
+DFtype
+__extendsfdf2 (SFtype a)  /* Soft-float conversion: widens single-precision a to double precision. */
+{
+  FP_DECL_EX;  /* exception-state declaration; macro comes from soft-fp.h (not shown here) */
+  FP_DECL_S (A);  /* unpacked single-precision input */
+  FP_DECL_D (R);  /* unpacked double-precision result */
+  DFtype r;
+
+  FP_INIT_EXCEPTIONS;  /* defined outside this file */
+  FP_UNPACK_RAW_S (A, a);  /* raw unpack only; macro defined in single.h (not shown) */
+#if _FP_W_TYPE_SIZE < _FP_FRACBITS_D  /* word narrower than the 53-bit double fraction */
+  FP_EXTEND (D, S, 2, 1, R, A);  /* the 2 and 1 are presumably the word counts of the D and S fractions -- TODO confirm */
+#else
+  FP_EXTEND (D, S, 1, 1, R, A);  /* the double fraction fits in one word here */
+#endif
+  FP_PACK_RAW_D (r, R);  /* raw pack of R into r */
+  FP_HANDLE_EXCEPTIONS;  /* presumably reports exceptions recorded above -- TODO confirm in soft-fp.h */
+
+  return r;
+}
+
--- a/src/gemm/soft-fp/fixdfdi.c
+++ b/src/gemm/soft-fp/fixdfdi.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a to 64bit signed integer
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "soft-fp.h"
+#include "double.h"
+
+DItype
+__fixdfdi (DFtype a)  /* Soft-float conversion of double a to a 64-bit signed integer. */
+{
+  FP_DECL_EX;  /* exception-state declaration; macro comes from soft-fp.h (not shown here) */
+  FP_DECL_D (A);  /* unpacked working form of a */
+  UDItype r;  /* result is built in an unsigned temporary and converted to DItype on return */
+
+  FP_INIT_EXCEPTIONS;  /* defined outside this file */
+  FP_UNPACK_RAW_D (A, a);  /* raw unpack only */
+  FP_TO_INT_D (r, A, DI_BITS, 1);  /* expands to _FP_TO_INT with rsz = DI_BITS and rsg = 1 (signed result) */
+  FP_HANDLE_EXCEPTIONS;  /* presumably reports exceptions recorded above -- TODO confirm in soft-fp.h */
+
+  return r;
+}
--- a/src/gemm/soft-fp/fixdfsi.c
+++ b/src/gemm/soft-fp/fixdfsi.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a to 32bit signed integer
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "soft-fp.h"
+#include "double.h"
+
+SItype
+__fixdfsi (DFtype a)  /* Soft-float conversion of double a to a 32-bit signed integer. */
+{
+  FP_DECL_EX;  /* exception-state declaration; macro comes from soft-fp.h (not shown here) */
+  FP_DECL_D (A);  /* unpacked working form of a */
+  USItype r;  /* result is built in an unsigned temporary and converted to SItype on return */
+
+  FP_INIT_EXCEPTIONS;  /* defined outside this file */
+  FP_UNPACK_RAW_D (A, a);  /* raw unpack only */
+  FP_TO_INT_D (r, A, SI_BITS, 1);  /* expands to _FP_TO_INT with rsz = SI_BITS and rsg = 1 (signed result) */
+  FP_HANDLE_EXCEPTIONS;  /* presumably reports exceptions recorded above -- TODO confirm in soft-fp.h */
+
+  return r;
+}
--- a/src/gemm/soft-fp/fixdfti.c
+++ b/src/gemm/soft-fp/fixdfti.c
@ -0,0 +1,46 @@
+/* Software floating-point emulation.
+   Convert IEEE double to 128bit signed integer
+   Copyright (C) 2007-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Uros Bizjak (ubizjak@gmail.com).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+// #include "soft-fp.h"
+// #include "double.h"
+
+// TItype
+// __fixdfti (DFtype a)
+// {
+//   FP_DECL_EX;
+//   FP_DECL_D (A);
+//   UTItype r;
+
+//   FP_INIT_EXCEPTIONS;
+//   FP_UNPACK_RAW_D (A, a);
+//   FP_TO_INT_D (r, A, TI_BITS, 1);
+//   FP_HANDLE_EXCEPTIONS;
+
+//   return r;
+// }
--- a/src/gemm/soft-fp/fixsfdi.c
+++ b/src/gemm/soft-fp/fixsfdi.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a to 64bit signed integer
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "soft-fp.h"
+#include "single.h"
+
+DItype
+__fixsfdi (SFtype a)  /* Soft-float conversion of single-precision a to a 64-bit signed integer. */
+{
+  FP_DECL_EX;  /* exception-state declaration; macro comes from soft-fp.h (not shown here) */
+  FP_DECL_S (A);  /* unpacked working form of a */
+  UDItype r;  /* result is built in an unsigned temporary and converted to DItype on return */
+
+  FP_INIT_EXCEPTIONS;  /* defined outside this file */
+  FP_UNPACK_RAW_S (A, a);  /* raw unpack only; macro defined in single.h (not shown) */
+  FP_TO_INT_S (r, A, DI_BITS, 1);  /* single-precision counterpart of FP_TO_INT_D: size DI_BITS, last argument 1 selects a signed result */
+  FP_HANDLE_EXCEPTIONS;  /* presumably reports exceptions recorded above -- TODO confirm in soft-fp.h */
+
+  return r;
+}
--- a/src/gemm/soft-fp/fixsfsi.c
+++ b/src/gemm/soft-fp/fixsfsi.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a to 32bit signed integer
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "soft-fp.h"
+#include "single.h"
+
+SItype
+__fixsfsi (SFtype a)  /* Soft-float conversion of single-precision a to a 32-bit signed integer. */
+{
+  FP_DECL_EX;  /* exception-state declaration; macro comes from soft-fp.h (not shown here) */
+  FP_DECL_S (A);  /* unpacked working form of a */
+  USItype r;  /* result is built in an unsigned temporary and converted to SItype on return */
+
+  FP_INIT_EXCEPTIONS;  /* defined outside this file */
+  FP_UNPACK_RAW_S (A, a);  /* raw unpack only; macro defined in single.h (not shown) */
+  FP_TO_INT_S (r, A, SI_BITS, 1);  /* single-precision counterpart of FP_TO_INT_D: size SI_BITS, last argument 1 selects a signed result */
+  FP_HANDLE_EXCEPTIONS;  /* presumably reports exceptions recorded above -- TODO confirm in soft-fp.h */
+
+  return r;
+}
--- a/src/gemm/soft-fp/fixsfti.c
+++ b/src/gemm/soft-fp/fixsfti.c
@ -0,0 +1,46 @@
+/* Software floating-point emulation.
+   Convert IEEE single to 128bit signed integer
+   Copyright (C) 2007-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Uros Bizjak (ubizjak@gmail.com).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+// #include "soft-fp.h"
+// #include "single.h"
+
+// TItype
+// __fixsfti (SFtype a)
+// {
+//   FP_DECL_EX;
+//   FP_DECL_S (A);
+//   UTItype r;
+
+//   FP_INIT_EXCEPTIONS;
+//   FP_UNPACK_RAW_S (A, a);
+//   FP_TO_INT_S (r, A, TI_BITS, 1);
+//   FP_HANDLE_EXCEPTIONS;
+
+//   return r;
+// }
--- a/src/gemm/soft-fp/fixunsdfdi.c
+++ b/src/gemm/soft-fp/fixunsdfdi.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a to 64bit unsigned integer
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "soft-fp.h"
+#include "double.h"
+
+UDItype
+__fixunsdfdi (DFtype a)  /* Soft-float conversion of double a to a 64-bit unsigned integer. */
+{
+  FP_DECL_EX;  /* exception-state declaration; macro comes from soft-fp.h (not shown here) */
+  FP_DECL_D (A);  /* unpacked working form of a */
+  UDItype r;
+
+  FP_INIT_EXCEPTIONS;  /* defined outside this file */
+  FP_UNPACK_RAW_D (A, a);  /* raw unpack only */
+  FP_TO_INT_D (r, A, DI_BITS, 0);  /* expands to _FP_TO_INT with rsz = DI_BITS and rsg = 0 (unsigned result) */
+  FP_HANDLE_EXCEPTIONS;  /* presumably reports exceptions recorded above -- TODO confirm in soft-fp.h */
+
+  return r;
+}
--- a/src/gemm/soft-fp/fixunsdfsi.c
+++ b/src/gemm/soft-fp/fixunsdfsi.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a to 32bit unsigned integer
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "soft-fp.h"
+#include "double.h"
+
+USItype  /* Convert the double `a' to a 32-bit unsigned integer.  */
+__fixunsdfsi (DFtype a)
+{
+  FP_DECL_EX;
+  FP_DECL_D (A);  /* Working variables that hold the unpacked form of `a'.  */
+  USItype r;
+  /* Order matters: exception setup, raw unpack, convert, then exception handling.  */
+  FP_INIT_EXCEPTIONS;
+  FP_UNPACK_RAW_D (A, a);
+  FP_TO_INT_D (r, A, SI_BITS, 0);  /* SI_BITS-wide result; the final 0 presumably selects unsigned conversion -- confirm in the macro definition.  */
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
--- a/src/gemm/soft-fp/fixunsdfti.c
+++ b/src/gemm/soft-fp/fixunsdfti.c
@ -0,0 +1,46 @@
+/* Software floating-point emulation.
+   Convert IEEE double to 128bit unsigned integer
+   Copyright (C) 2007-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Uros Bizjak (ubizjak@gmail.com).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+// #include "soft-fp.h"
+// #include "double.h"
+
+// UTItype
+// __fixunsdfti (DFtype a)
+// {
+//   FP_DECL_EX;
+//   FP_DECL_D (A);
+//   UTItype r;
+
+//   FP_INIT_EXCEPTIONS;
+//   FP_UNPACK_RAW_D (A, a);
+//   FP_TO_INT_D (r, A, TI_BITS, 0);
+//   FP_HANDLE_EXCEPTIONS;
+
+//   return r;
+// }
--- a/src/gemm/soft-fp/fixunssfdi.c
+++ b/src/gemm/soft-fp/fixunssfdi.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a to 64bit unsigned integer
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "soft-fp.h"
+#include "single.h"
+
+UDItype  /* Convert the single-precision `a' to a 64-bit unsigned integer.  */
+__fixunssfdi (SFtype a)
+{
+  FP_DECL_EX;
+  FP_DECL_S (A);  /* Working variables that hold the unpacked form of `a'.  */
+  UDItype r;
+  /* Order matters: exception setup, raw unpack, convert, then exception handling.  */
+  FP_INIT_EXCEPTIONS;
+  FP_UNPACK_RAW_S (A, a);
+  FP_TO_INT_S (r, A, DI_BITS, 0);  /* DI_BITS-wide result; the final 0 presumably selects unsigned conversion -- confirm in the macro definition.  */
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
--- a/src/gemm/soft-fp/fixunssfsi.c
+++ b/src/gemm/soft-fp/fixunssfsi.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a to 32bit unsigned integer
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "soft-fp.h"
+#include "single.h"
+
+USItype  /* Convert the single-precision `a' to a 32-bit unsigned integer.  */
+__fixunssfsi (SFtype a)
+{
+  FP_DECL_EX;
+  FP_DECL_S (A);  /* Working variables that hold the unpacked form of `a'.  */
+  USItype r;
+  /* Order matters: exception setup, raw unpack, convert, then exception handling.  */
+  FP_INIT_EXCEPTIONS;
+  FP_UNPACK_RAW_S (A, a);
+  FP_TO_INT_S (r, A, SI_BITS, 0);  /* SI_BITS-wide result; the final 0 presumably selects unsigned conversion -- confirm in the macro definition.  */
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
--- a/src/gemm/soft-fp/fixunssfti.c
+++ b/src/gemm/soft-fp/fixunssfti.c
@ -0,0 +1,46 @@
+/* Software floating-point emulation.
+   Convert IEEE single to 128bit unsigned integer
+   Copyright (C) 2007-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Uros Bizjak (ubizjak@gmail.com).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+// #include "soft-fp.h"
+// #include "single.h"
+
+// UTItype
+// __fixunssfti (SFtype a)
+// {
+//   FP_DECL_EX;
+//   FP_DECL_S (A);
+//   UTItype r;
+
+//   FP_INIT_EXCEPTIONS;
+//   FP_UNPACK_RAW_S (A, a);
+//   FP_TO_INT_S (r, A, TI_BITS, 0);
+//   FP_HANDLE_EXCEPTIONS;
+
+//   return r;
+// }
--- a/src/gemm/soft-fp/floatdidf.c
+++ b/src/gemm/soft-fp/floatdidf.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a 64bit signed integer to IEEE double
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "soft-fp.h"
+#include "double.h"
+
+DFtype  /* Convert the 64-bit signed integer `i' to an IEEE double.  */
+__floatdidf (DItype i)
+{
+  FP_DECL_EX;
+  FP_DECL_D (A);  /* Unpacked double that is filled from `i' and then packed into `a'.  */
+  DFtype a;
+  /* Order matters: rounding-mode setup, integer -> unpacked, raw pack, exception handling.  */
+  FP_INIT_ROUNDMODE;
+  FP_FROM_INT_D (A, i, DI_BITS, UDItype);  /* UDItype is passed although `i' is signed; presumably the macro's unsigned working type -- confirm.  */
+  FP_PACK_RAW_D (a, A);
+  FP_HANDLE_EXCEPTIONS;
+
+  return a;
+}
--- a/src/gemm/soft-fp/floatdisf.c
+++ b/src/gemm/soft-fp/floatdisf.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a 64bit signed integer to IEEE single
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "soft-fp.h"
+#include "single.h"
+
+SFtype  /* Convert the 64-bit signed integer `i' to an IEEE single.  */
+__floatdisf (DItype i)
+{
+  FP_DECL_EX;
+  FP_DECL_S (A);  /* Unpacked single that is filled from `i' and then packed into `a'.  */
+  SFtype a;
+  /* Order matters: rounding-mode setup, integer -> unpacked, raw pack, exception handling.  */
+  FP_INIT_ROUNDMODE;
+  FP_FROM_INT_S (A, i, DI_BITS, UDItype);  /* UDItype is passed although `i' is signed; presumably the macro's unsigned working type -- confirm.  */
+  FP_PACK_RAW_S (a, A);
+  FP_HANDLE_EXCEPTIONS;
+
+  return a;
+}
--- a/src/gemm/soft-fp/floatsidf.c
+++ b/src/gemm/soft-fp/floatsidf.c
@ -0,0 +1,49 @@
+/* Software floating-point emulation.
+   Convert a 32bit signed integer to IEEE double
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define FP_NO_EXCEPTIONS
+#include "soft-fp.h"
+#include "double.h"
+
+DFtype  /* Convert the 32-bit signed integer `i' to an IEEE double.  */
+__floatsidf (SItype i)
+{
+  FP_DECL_EX;  /* This file defines FP_NO_EXCEPTIONS before including soft-fp.h.  */
+  FP_DECL_D (A);  /* Unpacked double that is filled from `i' and then packed into `a'.  */
+  DFtype a;
+  /* Order matters: rounding-mode setup, integer -> unpacked, raw pack, exception handling.  */
+  FP_INIT_ROUNDMODE;
+  FP_FROM_INT_D (A, i, SI_BITS, USItype);  /* USItype is passed although `i' is signed; presumably the macro's unsigned working type -- confirm.  */
+  FP_PACK_RAW_D (a, A);
+  FP_HANDLE_EXCEPTIONS;
+
+  return a;
+}
+
--- a/src/gemm/soft-fp/floatsisf.c
+++ b/src/gemm/soft-fp/floatsisf.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a 32bit signed integer to IEEE single
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "soft-fp.h"
+#include "single.h"
+
+SFtype  /* Convert the 32-bit signed integer `i' to an IEEE single.  */
+__floatsisf (SItype i)
+{
+  FP_DECL_EX;
+  FP_DECL_S (A);  /* Unpacked single that is filled from `i' and then packed into `a'.  */
+  SFtype a;
+  /* Order matters: rounding-mode setup, integer -> unpacked, raw pack, exception handling.  */
+  FP_INIT_ROUNDMODE;
+  FP_FROM_INT_S (A, i, SI_BITS, USItype);  /* USItype is passed although `i' is signed; presumably the macro's unsigned working type -- confirm.  */
+  FP_PACK_RAW_S (a, A);
+  FP_HANDLE_EXCEPTIONS;
+
+  return a;
+}
--- a/src/gemm/soft-fp/floatundidf.c
+++ b/src/gemm/soft-fp/floatundidf.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a 64bit unsigned integer to IEEE double
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "soft-fp.h"
+#include "double.h"
+
+DFtype  /* Convert the 64-bit unsigned integer `i' to an IEEE double.  */
+__floatundidf (UDItype i)
+{
+  FP_DECL_EX;
+  FP_DECL_D (A);  /* Unpacked double that is filled from `i' and then packed into `a'.  */
+  DFtype a;
+  /* Order matters: rounding-mode setup, integer -> unpacked, raw pack, exception handling.  */
+  FP_INIT_ROUNDMODE;
+  FP_FROM_INT_D (A, i, DI_BITS, UDItype);  /* Source width DI_BITS; UDItype matches the type of `i'.  */
+  FP_PACK_RAW_D (a, A);
+  FP_HANDLE_EXCEPTIONS;
+
+  return a;
+}
--- a/src/gemm/soft-fp/floatundisf.c
+++ b/src/gemm/soft-fp/floatundisf.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a 64bit unsigned integer to IEEE single
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "soft-fp.h"
+#include "single.h"
+
+SFtype  /* Convert the 64-bit unsigned integer `i' to an IEEE single.  */
+__floatundisf (UDItype i)
+{
+  FP_DECL_EX;
+  FP_DECL_S (A);  /* Unpacked single that is filled from `i' and then packed into `a'.  */
+  SFtype a;
+  /* Order matters: rounding-mode setup, integer -> unpacked, raw pack, exception handling.  */
+  FP_INIT_ROUNDMODE;
+  FP_FROM_INT_S (A, i, DI_BITS, UDItype);  /* Source width DI_BITS; UDItype matches the type of `i'.  */
+  FP_PACK_RAW_S (a, A);
+  FP_HANDLE_EXCEPTIONS;
+
+  return a;
+}
--- a/src/gemm/soft-fp/floatunsidf.c
+++ b/src/gemm/soft-fp/floatunsidf.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a 32bit unsigned integer to IEEE double
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define FP_NO_EXCEPTIONS
+#include "soft-fp.h"
+#include "double.h"
+
+DFtype  /* Convert the 32-bit unsigned integer `i' to an IEEE double.  */
+__floatunsidf (USItype i)
+{
+  FP_DECL_EX;  /* This file defines FP_NO_EXCEPTIONS before including soft-fp.h.  */
+  FP_DECL_D (A);  /* Unpacked double that is filled from `i' and then packed into `a'.  */
+  DFtype a;
+  /* NOTE(review): unlike __floatsidf, no FP_HANDLE_EXCEPTIONS follows the pack; presumably harmless under FP_NO_EXCEPTIONS -- confirm.  */
+  FP_INIT_ROUNDMODE;
+  FP_FROM_INT_D (A, i, SI_BITS, USItype);  /* Source width SI_BITS; USItype matches the type of `i'.  */
+  FP_PACK_RAW_D (a, A);
+
+  return a;
+}
--- a/src/gemm/soft-fp/floatunsisf.c
+++ b/src/gemm/soft-fp/floatunsisf.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a 32bit unsigned integer to IEEE single
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "soft-fp.h"
+#include "single.h"
+
+SFtype  /* Convert the 32-bit unsigned integer `i' to an IEEE single.  */
+__floatunsisf (USItype i)
+{
+  FP_DECL_EX;
+  FP_DECL_S (A);  /* Unpacked single that is filled from `i' and then packed into `a'.  */
+  SFtype a;
+  /* Order matters: rounding-mode setup, integer -> unpacked, raw pack, exception handling.  */
+  FP_INIT_ROUNDMODE;
+  FP_FROM_INT_S (A, i, SI_BITS, USItype);  /* Source width SI_BITS; USItype matches the type of `i'.  */
+  FP_PACK_RAW_S (a, A);
+  FP_HANDLE_EXCEPTIONS;
+
+  return a;
+}
--- a/src/gemm/soft-fp/gedf2.c
+++ b/src/gemm/soft-fp/gedf2.c
@ -0,0 +1,21 @@
+#include "soft-fp.h"
+#include "double.h"
+
+CMPtype  /* Compare doubles `a' and `b'; presumably libgcc's >= / > helper -- see the GCC soft-float routine docs.  */
+__gedf2 (DFtype a, DFtype b)
+{
+  FP_DECL_EX;
+  FP_DECL_D (A);
+  FP_DECL_D (B);
+  CMPtype r;  /* Comparison result written by FP_CMP_D and returned as-is.  */
+  /* Both operands are unpacked raw before the comparison.  */
+  FP_INIT_EXCEPTIONS;
+  FP_UNPACK_RAW_D (A, a);
+  FP_UNPACK_RAW_D (B, b);
+  FP_CMP_D (r, A, B, -2, 2);  /* -2 is presumably the result for unordered (NaN) operands, so a caller's ">= 0" test fails -- confirm.  */
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
+
+strong_alias (__gedf2, __gtdf2);  /* Presumably exports the same code under the name __gtdf2; strong_alias is defined elsewhere.  */
--- a/src/gemm/soft-fp/gesf2.c
+++ b/src/gemm/soft-fp/gesf2.c
@ -0,0 +1,21 @@
+#include "soft-fp.h"
+#include "single.h"
+
+CMPtype  /* Compare singles `a' and `b'; presumably libgcc's >= / > helper -- see the GCC soft-float routine docs.  */
+__gesf2 (SFtype a, SFtype b)
+{
+  FP_DECL_EX;
+  FP_DECL_S (A);
+  FP_DECL_S (B);
+  CMPtype r;  /* Comparison result written by FP_CMP_S and returned as-is.  */
+  /* Both operands are unpacked raw before the comparison.  */
+  FP_INIT_EXCEPTIONS;
+  FP_UNPACK_RAW_S (A, a);
+  FP_UNPACK_RAW_S (B, b);
+  FP_CMP_S (r, A, B, -2, 2);  /* -2 is presumably the result for unordered (NaN) operands, so a caller's ">= 0" test fails -- confirm.  */
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
+
+strong_alias (__gesf2, __gtsf2);  /* Presumably exports the same code under the name __gtsf2; strong_alias is defined elsewhere.  */
--- a/src/gemm/soft-fp/ledf2.c
+++ b/src/gemm/soft-fp/ledf2.c
@ -0,0 +1,21 @@
+#include "soft-fp.h"
+#include "double.h"
+
+CMPtype  /* Compare doubles `a' and `b'; presumably libgcc's <= / < helper -- see the GCC soft-float routine docs.  */
+__ledf2 (DFtype a, DFtype b)
+{
+  FP_DECL_EX;
+  FP_DECL_D (A);
+  FP_DECL_D (B);
+  CMPtype r;  /* Comparison result written by FP_CMP_D and returned as-is.  */
+  /* Both operands are unpacked raw before the comparison.  */
+  FP_INIT_EXCEPTIONS;
+  FP_UNPACK_RAW_D (A, a);
+  FP_UNPACK_RAW_D (B, b);
+  FP_CMP_D (r, A, B, 2, 2);  /* Third argument is 2 here (-2 in __gedf2); presumably the unordered (NaN) result, so a caller's "<= 0" test fails -- confirm.  */
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
+
+strong_alias (__ledf2, __ltdf2);  /* Presumably exports the same code under the name __ltdf2; strong_alias is defined elsewhere.  */
--- a/src/gemm/soft-fp/lesf2.c
+++ b/src/gemm/soft-fp/lesf2.c
@ -0,0 +1,22 @@
+#include "soft-fp.h"
+#include "single.h"
+
+CMPtype  /* Compare singles `a' and `b'; presumably libgcc's <= / < helper -- see the GCC soft-float routine docs.  */
+__lesf2 (SFtype a, SFtype b)
+{
+  FP_DECL_EX;
+  FP_DECL_S (A);
+  FP_DECL_S (B);
+  CMPtype r;  /* Comparison result written by FP_CMP_S and returned as-is.  */
+  /* Both operands are unpacked raw before the comparison.  */
+  FP_INIT_EXCEPTIONS;
+  FP_UNPACK_RAW_S (A, a);
+  FP_UNPACK_RAW_S (B, b);
+  FP_CMP_S (r, A, B, 2, 2);  /* Third argument is 2 here (-2 in __gesf2); presumably the unordered (NaN) result, so a caller's "<= 0" test fails -- confirm.  */
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
+
+strong_alias (__lesf2, __ltsf2);  /* Presumably exports the same code under the name __ltsf2; strong_alias is defined elsewhere.  */
+
--- a/src/gemm/soft-fp/longlong.h
+++ b/src/gemm/soft-fp/longlong.h
--- a/src/gemm/soft-fp/muldf3.c
+++ b/src/gemm/soft-fp/muldf3.c
@ -0,0 +1,21 @@
+#include "soft-fp.h"
+#include "double.h"
+
+DFtype  /* Multiply the doubles `a' and `b' in software and return the product.  */
+__muldf3 (DFtype a, DFtype b)
+{
+  FP_DECL_EX;
+  FP_DECL_D (A);
+  FP_DECL_D (B);
+  FP_DECL_D (R);  /* Unpacked product, packed into `r' below.  */
+  DFtype r;
+  /* Uses FP_UNPACK_D / FP_PACK_D rather than the _RAW variants used by the conversion and compare routines.  */
+  FP_INIT_ROUNDMODE;
+  FP_UNPACK_D (A, a);
+  FP_UNPACK_D (B, b);
+  FP_MUL_D (R, A, B);
+  FP_PACK_D (r, R);
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
--- a/src/gemm/soft-fp/mulsf3.c
+++ b/src/gemm/soft-fp/mulsf3.c
@ -0,0 +1,22 @@
+#include "soft-fp.h"
+#include "single.h"
+
+SFtype  /* Multiply the singles `a' and `b' in software and return the product.  */
+__mulsf3 (SFtype a, SFtype b)
+{
+  FP_DECL_EX;
+  FP_DECL_S (A);
+  FP_DECL_S (B);
+  FP_DECL_S (R);  /* Unpacked product, packed into `r' below.  */
+  SFtype r;
+  /* Uses FP_UNPACK_S / FP_PACK_S rather than the _RAW variants used by the conversion and compare routines.  */
+  FP_INIT_ROUNDMODE;
+  FP_UNPACK_S (A, a);
+  FP_UNPACK_S (B, b);
+  FP_MUL_S (R, A, B);
+  FP_PACK_S (r, R);
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
+
--- a/src/gemm/soft-fp/negdf2.c
+++ b/src/gemm/soft-fp/negdf2.c
@ -0,0 +1,16 @@
+#include "soft-fp.h"
+#include "double.h"
+
+DFtype  /* Return the negation of the double `a'.  */
+__negdf2 (DFtype a)
+{
+  FP_DECL_D (A);
+  FP_DECL_D (R);  /* Unpacked negated value, packed into `r' below.  */
+  DFtype r;
+  /* No exception or rounding-mode macros are used here: raw unpack, negate, raw pack.  */
+  FP_UNPACK_RAW_D (A, a);
+  FP_NEG_D (R, A);
+  FP_PACK_RAW_D (r, R);
+
+  return r;
+}
--- a/src/gemm/soft-fp/negsf2.c
+++ b/src/gemm/soft-fp/negsf2.c
@ -0,0 +1,16 @@
+#include "soft-fp.h"
+#include "single.h"
+
+SFtype  /* Return the negation of the single `a'.  */
+__negsf2 (SFtype a)
+{
+  FP_DECL_S (A);
+  FP_DECL_S (R);  /* Unpacked negated value, packed into `r' below.  */
+  SFtype r;
+  /* No exception or rounding-mode macros are used here: raw unpack, negate, raw pack.  */
+  FP_UNPACK_RAW_S (A, a);
+  FP_NEG_S (R, A);
+  FP_PACK_RAW_S (r, R);
+
+  return r;
+}
--- a/src/gemm/soft-fp/op-1.h
+++ b/src/gemm/soft-fp/op-1.h
@ -0,0 +1,369 @@
+/* Software floating-point emulation.
+   Basic one-word fraction declaration and manipulation.
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com),
+		  Jakub Jelinek (jj@ultra.linux.cz),
+		  David S. Miller (davem@redhat.com) and
+		  Peter Maydell (pmaydell@chiark.greenend.org.uk).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef SOFT_FP_OP_1_H
+#define SOFT_FP_OP_1_H	1
+
+/* One-word fraction primitives.  The fraction of a value named X lives in
+   the single _FP_W_TYPE variable X##_f.  With only one word, the HIGH, LOW
+   and WORD accessors all name X##_f; the word index w is ignored.  */
+#define _FP_FRAC_DECL_1(X)	_FP_W_TYPE X##_f _FP_ZERO_INIT
+#define _FP_FRAC_COPY_1(D, S)	(D##_f = S##_f)
+#define _FP_FRAC_SET_1(X, I)	(X##_f = I)
+#define _FP_FRAC_HIGH_1(X)	(X##_f)
+#define _FP_FRAC_LOW_1(X)	(X##_f)
+#define _FP_FRAC_WORD_1(X, w)	(X##_f)
+
+/* Add the integer I to the fraction of X, in place.  */
+#define _FP_FRAC_ADDI_1(X, I)	(X##_f += I)
+/* Shift the fraction of X left by N bits, in place.  When N is the
+   compile-time constant 1 the shift is spelled as X##_f += X##_f.  */
+#define _FP_FRAC_SLL_1(X, N)			\
+  do						\
+    {						\
+      if (__builtin_constant_p (N) && (N) == 1)	\
+	X##_f += X##_f;				\
+      else					\
+	X##_f <<= (N);				\
+    }						\
+  while (0)
+/* Shift the fraction of X right by N bits, in place; the bits shifted
+   out are dropped (see the SRST/SRS variants for sticky shifts).  */
+#define _FP_FRAC_SRL_1(X, N)	(X##_f >>= N)
+
+/* Right shift with sticky-lsb.
+   _FP_FRAC_SRST_1 shifts the fraction of X right by N and stores in S
+   whether any nonzero bit was shifted out; the flag is not merged into
+   the fraction.
+   _FP_FRAC_SRS_1 shifts right by N and ORs that same "bits were lost"
+   flag into bit 0 of the shifted fraction.
+   The sz argument is accepted but unused by both one-word forms.
+   NOTE(review): for a non-constant-1 N the lost bits are found with
+   X << (_FP_W_TYPE_SIZE - (N)), so N is expected to be at least 1.  */
+#define _FP_FRAC_SRST_1(X, S, N, sz)	__FP_FRAC_SRST_1 (X##_f, S, (N), (sz))
+#define _FP_FRAC_SRS_1(X, N, sz)	__FP_FRAC_SRS_1 (X##_f, (N), (sz))
+
+/* Worker for _FP_FRAC_SRST_1; X is the fraction word itself.  S is
+   computed from the unshifted X before X is overwritten.  */
+#define __FP_FRAC_SRST_1(X, S, N, sz)			\
+  do							\
+    {							\
+      S = (__builtin_constant_p (N) && (N) == 1		\
+	   ? X & 1					\
+	   : (X << (_FP_W_TYPE_SIZE - (N))) != 0);	\
+      X = X >> (N);					\
+    }							\
+  while (0)
+
+/* Worker for _FP_FRAC_SRS_1; X is the fraction word itself.  The whole
+   right-hand side is evaluated from the unshifted X, then assigned.  */
+#define __FP_FRAC_SRS_1(X, N, sz)				\
+  (X = (X >> (N) | (__builtin_constant_p (N) && (N) == 1	\
+		    ? X & 1					\
+		    : (X << (_FP_W_TYPE_SIZE - (N))) != 0)))
+
+/* One-word fraction arithmetic: R = X + Y, R = X - Y, and X -= Y.
+   _FP_FRAC_CLZ_1 hands the fraction word to __FP_CLZ (defined elsewhere;
+   presumably a count-leading-zeros primitive) with z as the output.  */
+#define _FP_FRAC_ADD_1(R, X, Y)	(R##_f = X##_f + Y##_f)
+#define _FP_FRAC_SUB_1(R, X, Y)	(R##_f = X##_f - Y##_f)
+#define _FP_FRAC_DEC_1(X, Y)	(X##_f -= Y##_f)
+#define _FP_FRAC_CLZ_1(z, X)	__FP_CLZ ((z), X##_f)
+
+/* Predicates.
+   NEGP tests whether the fraction word is negative when reinterpreted as
+   _FP_WS_TYPE.  ZEROP tests for an all-zero fraction.  OVERP and
+   HIGHBIT_DW mask the fraction with the per-format _FP_OVERFLOW_##fs and
+   _FP_HIGHBIT_DW_##fs constants; CLEAR_OVERP clears the overflow bits in
+   place.  EQ, GE and GT compare two fraction words directly.  */
+#define _FP_FRAC_NEGP_1(X)	((_FP_WS_TYPE) X##_f < 0)
+#define _FP_FRAC_ZEROP_1(X)	(X##_f == 0)
+#define _FP_FRAC_OVERP_1(fs, X)	(X##_f & _FP_OVERFLOW_##fs)
+#define _FP_FRAC_CLEAR_OVERP_1(fs, X)	(X##_f &= ~_FP_OVERFLOW_##fs)
+#define _FP_FRAC_HIGHBIT_DW_1(fs, X)	(X##_f & _FP_HIGHBIT_DW_##fs)
+#define _FP_FRAC_EQ_1(X, Y)	(X##_f == Y##_f)
+#define _FP_FRAC_GE_1(X, Y)	(X##_f >= Y##_f)
+#define _FP_FRAC_GT_1(X, Y)	(X##_f > Y##_f)
+
+/* Fraction constants: zero, one, and all bits set (as _FP_WS_TYPE).  */
+#define _FP_ZEROFRAC_1		0
+#define _FP_MINFRAC_1		1
+#define _FP_MAXFRAC_1		(~(_FP_WS_TYPE) 0)
+
+/* Unpack the raw bits of a native fp value.  Do not classify or
+   normalize the data.
+   _FP_UNPACK_RAW_1 takes the value itself, stores it in the flt member of
+   a local union _FP_UNION_##fs and reads bits.frac, bits.exp and
+   bits.sign into X##_f, X##_e and X##_s.
+   _FP_UNPACK_RAW_1_P does the same through a pointer: val is cast to
+   union _FP_UNION_##fs * and read in place.  */
+
+#define _FP_UNPACK_RAW_1(fs, X, val)			\
+  do							\
+    {							\
+      union _FP_UNION_##fs _FP_UNPACK_RAW_1_flo;	\
+      _FP_UNPACK_RAW_1_flo.flt = (val);			\
+							\
+      X##_f = _FP_UNPACK_RAW_1_flo.bits.frac;		\
+      X##_e = _FP_UNPACK_RAW_1_flo.bits.exp;		\
+      X##_s = _FP_UNPACK_RAW_1_flo.bits.sign;		\
+    }							\
+  while (0)
+
+#define _FP_UNPACK_RAW_1_P(fs, X, val)			\
+  do							\
+    {							\
+      union _FP_UNION_##fs *_FP_UNPACK_RAW_1_P_flo	\
+	= (union _FP_UNION_##fs *) (val);		\
+							\
+      X##_f = _FP_UNPACK_RAW_1_P_flo->bits.frac;	\
+      X##_e = _FP_UNPACK_RAW_1_P_flo->bits.exp;		\
+      X##_s = _FP_UNPACK_RAW_1_P_flo->bits.sign;	\
+    }							\
+  while (0)
+
+/* Repack the raw bits of a native fp value.
+   _FP_PACK_RAW_1 writes X##_f, X##_e and X##_s into the bits members of
+   a local union _FP_UNION_##fs and assigns its flt member to val.
+   _FP_PACK_RAW_1_P writes the same three members directly through val,
+   which is cast to union _FP_UNION_##fs *.  */
+
+#define _FP_PACK_RAW_1(fs, val, X)		\
+  do						\
+    {						\
+      union _FP_UNION_##fs _FP_PACK_RAW_1_flo;	\
+						\
+      _FP_PACK_RAW_1_flo.bits.frac = X##_f;	\
+      _FP_PACK_RAW_1_flo.bits.exp  = X##_e;	\
+      _FP_PACK_RAW_1_flo.bits.sign = X##_s;	\
+						\
+      (val) = _FP_PACK_RAW_1_flo.flt;		\
+    }						\
+  while (0)
+
+#define _FP_PACK_RAW_1_P(fs, val, X)			\
+  do							\
+    {							\
+      union _FP_UNION_##fs *_FP_PACK_RAW_1_P_flo	\
+	= (union _FP_UNION_##fs *) (val);		\
+							\
+      _FP_PACK_RAW_1_P_flo->bits.frac = X##_f;		\
+      _FP_PACK_RAW_1_P_flo->bits.exp  = X##_e;		\
+      _FP_PACK_RAW_1_P_flo->bits.sign = X##_s;		\
+    }							\
+  while (0)
+
+
+/* Multiplication algorithms: */
+
+/* Basic.  Assuming the host word size is >= 2*FRACBITS, we can do the
+   multiplication immediately.
+   _FP_MUL_MEAT_DW_1_imm stores the plain product X##_f * Y##_f in R##_f
+   (wfracbits is not used there).  _FP_MUL_MEAT_1_imm then shifts that
+   product right by wfracbits-1 with a sticky low bit.  */
+
+#define _FP_MUL_MEAT_DW_1_imm(wfracbits, R, X, Y)	\
+  do							\
+    {							\
+      R##_f = X##_f * Y##_f;				\
+    }							\
+  while (0)
+
+#define _FP_MUL_MEAT_1_imm(wfracbits, R, X, Y)				\
+  do									\
+    {									\
+      _FP_MUL_MEAT_DW_1_imm ((wfracbits), R, X, Y);			\
+      /* Normalize since we know where the msb of the multiplicands	\
+	 were (bit B), we know that the msb of the product is		\
+	 at either 2B or 2B-1.  */					\
+      _FP_FRAC_SRS_1 (R, (wfracbits)-1, 2*(wfracbits));			\
+    }									\
+  while (0)
+
+/* Given a 1W * 1W => 2W primitive, do the extended multiplication.
+   doit is invoked as doit (R##_f1, R##_f0, X##_f, Y##_f), so R must be a
+   two-word fraction receiving the double-width product.
+   _FP_MUL_MEAT_1_wide computes that product into a local two-word
+   temporary, shifts it right by wfracbits-1 with a sticky low bit, and
+   keeps only the low word as R##_f.  */
+
+#define _FP_MUL_MEAT_DW_1_wide(wfracbits, R, X, Y, doit)	\
+  do								\
+    {								\
+      doit (R##_f1, R##_f0, X##_f, Y##_f);			\
+    }								\
+  while (0)
+
+#define _FP_MUL_MEAT_1_wide(wfracbits, R, X, Y, doit)			\
+  do									\
+    {									\
+      _FP_FRAC_DECL_2 (_FP_MUL_MEAT_1_wide_Z);				\
+      _FP_MUL_MEAT_DW_1_wide ((wfracbits), _FP_MUL_MEAT_1_wide_Z,	\
+			      X, Y, doit);				\
+      /* Normalize since we know where the msb of the multiplicands	\
+	 were (bit B), we know that the msb of the product is		\
+	 at either 2B or 2B-1.  */					\
+      _FP_FRAC_SRS_2 (_FP_MUL_MEAT_1_wide_Z, (wfracbits)-1,		\
+		      2*(wfracbits));					\
+      R##_f = _FP_MUL_MEAT_1_wide_Z_f0;					\
+    }									\
+  while (0)
+
+/* Finally, a simple widening multiply algorithm.  What fun!
+   _FP_MUL_MEAT_DW_1_hard builds the two-word product of X##_f and Y##_f
+   in R (R##_f1:R##_f0) from four half-word multiplications, using only
+   single-word arithmetic.  _FP_MUL_MEAT_1_hard then normalizes that
+   product with a sticky right shift by wfracbits-1 and keeps the low
+   word as R##_f.  */
+
+#define _FP_MUL_MEAT_DW_1_hard(wfracbits, R, X, Y)			\
+  do									\
+    {									\
+      _FP_W_TYPE _FP_MUL_MEAT_DW_1_hard_xh, _FP_MUL_MEAT_DW_1_hard_xl;	\
+      _FP_W_TYPE _FP_MUL_MEAT_DW_1_hard_yh, _FP_MUL_MEAT_DW_1_hard_yl;	\
+      _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_1_hard_a);			\
+									\
+      /* Split the words in half.  */					\
+      _FP_MUL_MEAT_DW_1_hard_xh = X##_f >> (_FP_W_TYPE_SIZE/2);		\
+      _FP_MUL_MEAT_DW_1_hard_xl						\
+	= X##_f & (((_FP_W_TYPE) 1 << (_FP_W_TYPE_SIZE/2)) - 1);	\
+      _FP_MUL_MEAT_DW_1_hard_yh = Y##_f >> (_FP_W_TYPE_SIZE/2);		\
+      _FP_MUL_MEAT_DW_1_hard_yl						\
+	= Y##_f & (((_FP_W_TYPE) 1 << (_FP_W_TYPE_SIZE/2)) - 1);	\
+									\
+      /* Multiply the pieces.  */					\
+      R##_f0 = _FP_MUL_MEAT_DW_1_hard_xl * _FP_MUL_MEAT_DW_1_hard_yl;	\
+      _FP_MUL_MEAT_DW_1_hard_a_f0					\
+	= _FP_MUL_MEAT_DW_1_hard_xh * _FP_MUL_MEAT_DW_1_hard_yl;	\
+      _FP_MUL_MEAT_DW_1_hard_a_f1					\
+	= _FP_MUL_MEAT_DW_1_hard_xl * _FP_MUL_MEAT_DW_1_hard_yh;	\
+      R##_f1 = _FP_MUL_MEAT_DW_1_hard_xh * _FP_MUL_MEAT_DW_1_hard_yh;	\
+									\
+      /* Reassemble into two full words.  The two cross products are	\
+	 summed first; a carry out of that sum is worth one unit at	\
+	 half-word position in the high word.  The sum is then split	\
+	 across the word boundary and added to R.  */			\
+      if ((_FP_MUL_MEAT_DW_1_hard_a_f0 += _FP_MUL_MEAT_DW_1_hard_a_f1)	\
+	  < _FP_MUL_MEAT_DW_1_hard_a_f1)				\
+	R##_f1 += (_FP_W_TYPE) 1 << (_FP_W_TYPE_SIZE/2);		\
+      _FP_MUL_MEAT_DW_1_hard_a_f1					\
+	= _FP_MUL_MEAT_DW_1_hard_a_f0 >> (_FP_W_TYPE_SIZE/2);		\
+      _FP_MUL_MEAT_DW_1_hard_a_f0					\
+	= _FP_MUL_MEAT_DW_1_hard_a_f0 << (_FP_W_TYPE_SIZE/2);		\
+      _FP_FRAC_ADD_2 (R, R, _FP_MUL_MEAT_DW_1_hard_a);			\
+    }									\
+  while (0)
+
+#define _FP_MUL_MEAT_1_hard(wfracbits, R, X, Y)			\
+  do								\
+    {								\
+      _FP_FRAC_DECL_2 (_FP_MUL_MEAT_1_hard_z);			\
+      _FP_MUL_MEAT_DW_1_hard ((wfracbits),			\
+			      _FP_MUL_MEAT_1_hard_z, X, Y);	\
+								\
+      /* Normalize.  */						\
+      _FP_FRAC_SRS_2 (_FP_MUL_MEAT_1_hard_z,			\
+		      (wfracbits) - 1, 2*(wfracbits));		\
+      R##_f = _FP_MUL_MEAT_1_hard_z_f0;				\
+    }								\
+  while (0)
+
+
+/* Division algorithms: */
+
+/* Basic.  Assuming the host word size is >= 2*FRACBITS, we can do the
+   division immediately.  Give this macro either _FP_DIV_HELP_imm for
+   C primitives or _FP_DIV_HELP_ldiv for the ISO function.  Which you
+   choose will depend on what the compiler does with divrem4.
+   X##_f is modified in place: it is shifted left by _FP_WFRACBITS_##fs
+   when X##_f < Y##_f (in which case R##_e is also decremented), and by
+   one bit less otherwise.  doit is called as doit (q, r, X##_f, Y##_f);
+   R##_f receives q with bit 0 forced on when r is nonzero.  */
+
+#define _FP_DIV_MEAT_1_imm(fs, R, X, Y, doit)				\
+  do									\
+    {									\
+      _FP_W_TYPE _FP_DIV_MEAT_1_imm_q, _FP_DIV_MEAT_1_imm_r;		\
+      X##_f <<= (X##_f < Y##_f						\
+		 ? R##_e--, _FP_WFRACBITS_##fs				\
+		 : _FP_WFRACBITS_##fs - 1);				\
+      doit (_FP_DIV_MEAT_1_imm_q, _FP_DIV_MEAT_1_imm_r, X##_f, Y##_f);	\
+      R##_f = _FP_DIV_MEAT_1_imm_q | (_FP_DIV_MEAT_1_imm_r != 0);	\
+    }									\
+  while (0)
+
+/* GCC's longlong.h defines a 2W / 1W => (1W,1W) primitive udiv_qrnnd
+   that may be useful in this situation.  This first is for a primitive
+   that requires normalization, the second for one that does not.  Look
+   for UDIV_NEEDS_NORMALIZATION to tell which your machine needs.
+   Both macros build a two-word numerator (nh:nl) from X##_f, decrement
+   R##_e when X##_f < Y##_f, call udiv_qrnnd (q, r, nh, nl, divisor) and
+   store q in R##_f with bit 0 forced on when the remainder is nonzero.
+   Neither macro modifies X##_f or Y##_f.  */
+
+#define _FP_DIV_MEAT_1_udiv_norm(fs, R, X, Y)				\
+  do									\
+    {									\
+      _FP_W_TYPE _FP_DIV_MEAT_1_udiv_norm_nh;				\
+      _FP_W_TYPE _FP_DIV_MEAT_1_udiv_norm_nl;				\
+      _FP_W_TYPE _FP_DIV_MEAT_1_udiv_norm_q;				\
+      _FP_W_TYPE _FP_DIV_MEAT_1_udiv_norm_r;				\
+      _FP_W_TYPE _FP_DIV_MEAT_1_udiv_norm_y;				\
+									\
+      /* Normalize Y -- i.e. make the most significant bit set.  */	\
+      _FP_DIV_MEAT_1_udiv_norm_y = Y##_f << _FP_WFRACXBITS_##fs;	\
+									\
+      /* Shift X op correspondingly high, that is, up one full word.  */ \
+      if (X##_f < Y##_f)						\
+	{								\
+	  R##_e--;							\
+	  _FP_DIV_MEAT_1_udiv_norm_nl = 0;				\
+	  _FP_DIV_MEAT_1_udiv_norm_nh = X##_f;				\
+	}								\
+      else								\
+	{								\
+	  _FP_DIV_MEAT_1_udiv_norm_nl = X##_f << (_FP_W_TYPE_SIZE - 1);	\
+	  _FP_DIV_MEAT_1_udiv_norm_nh = X##_f >> 1;			\
+	}								\
+									\
+      udiv_qrnnd (_FP_DIV_MEAT_1_udiv_norm_q,				\
+		  _FP_DIV_MEAT_1_udiv_norm_r,				\
+		  _FP_DIV_MEAT_1_udiv_norm_nh,				\
+		  _FP_DIV_MEAT_1_udiv_norm_nl,				\
+		  _FP_DIV_MEAT_1_udiv_norm_y);				\
+      R##_f = (_FP_DIV_MEAT_1_udiv_norm_q				\
+	       | (_FP_DIV_MEAT_1_udiv_norm_r != 0));			\
+    }									\
+  while (0)
+
+/* Variant that divides by Y##_f as-is (no normalization of the divisor);
+   the numerator is X##_f shifted left by _FP_WFRACBITS_##fs, or by one
+   bit less when X##_f >= Y##_f, spread over two words.  */
+#define _FP_DIV_MEAT_1_udiv(fs, R, X, Y)				\
+  do									\
+    {									\
+      _FP_W_TYPE _FP_DIV_MEAT_1_udiv_nh, _FP_DIV_MEAT_1_udiv_nl;	\
+      _FP_W_TYPE _FP_DIV_MEAT_1_udiv_q, _FP_DIV_MEAT_1_udiv_r;		\
+      if (X##_f < Y##_f)						\
+	{								\
+	  R##_e--;							\
+	  _FP_DIV_MEAT_1_udiv_nl = X##_f << _FP_WFRACBITS_##fs;		\
+	  _FP_DIV_MEAT_1_udiv_nh = X##_f >> _FP_WFRACXBITS_##fs;	\
+	}								\
+      else								\
+	{								\
+	  _FP_DIV_MEAT_1_udiv_nl = X##_f << (_FP_WFRACBITS_##fs - 1);	\
+	  _FP_DIV_MEAT_1_udiv_nh = X##_f >> (_FP_WFRACXBITS_##fs + 1);	\
+	}								\
+      udiv_qrnnd (_FP_DIV_MEAT_1_udiv_q, _FP_DIV_MEAT_1_udiv_r,		\
+		  _FP_DIV_MEAT_1_udiv_nh, _FP_DIV_MEAT_1_udiv_nl,	\
+		  Y##_f);						\
+      R##_f = _FP_DIV_MEAT_1_udiv_q | (_FP_DIV_MEAT_1_udiv_r != 0);	\
+    }									\
+  while (0)
+
+
+/* Square root algorithms:
+   We have just one right now, maybe Newton approximation
+   should be added for those machines where division is fast.
+   _FP_SQRT_MEAT_1 produces the result one bit per iteration.  q is the
+   bit currently being tried; it is shifted right each pass and the loop
+   stops once q equals _FP_WORK_ROUND.  R accumulates the result bits, S
+   and T are working fractions, and X holds the running remainder.  All
+   of R, S, T, X and q are modified.  After the loop a nonzero remainder
+   always sets _FP_WORK_STICKY in R, and additionally sets
+   _FP_WORK_ROUND when S##_f < X##_f.  */
+
+#define _FP_SQRT_MEAT_1(R, S, T, X, q)		\
+  do						\
+    {						\
+      while ((q) != _FP_WORK_ROUND)		\
+	{					\
+	  T##_f = S##_f + (q);			\
+	  if (T##_f <= X##_f)			\
+	    {					\
+	      S##_f = T##_f + (q);		\
+	      X##_f -= T##_f;			\
+	      R##_f += (q);			\
+	    }					\
+	  _FP_FRAC_SLL_1 (X, 1);		\
+	  (q) >>= 1;				\
+	}					\
+      if (X##_f)				\
+	{					\
+	  if (S##_f < X##_f)			\
+	    R##_f |= _FP_WORK_ROUND;		\
+	  R##_f |= _FP_WORK_STICKY;		\
+	}					\
+    }						\
+  while (0)
+
+/* Assembly/disassembly for converting to/from integral types.
+   No shifting or overflow handled here.
+   ASSEMBLE copies the fraction word into r; DISASSEMBLE copies r into
+   the fraction word.  rsize is accepted but unused in the one-word
+   case.  */
+
+#define _FP_FRAC_ASSEMBLE_1(r, X, rsize)	((r) = X##_f)
+#define _FP_FRAC_DISASSEMBLE_1(X, r, rsize)	(X##_f = (r))
+
+
+/* Convert FP values between word sizes.  One word to one word is a
+   plain copy.  */
+
+#define _FP_FRAC_COPY_1_1(D, S)		(D##_f = S##_f)
+
+#endif /* !SOFT_FP_OP_1_H */
--- a/src/gemm/soft-fp/op-2.h
+++ b/src/gemm/soft-fp/op-2.h
@ -0,0 +1,705 @@
+/* Software floating-point emulation.
+   Basic two-word fraction declaration and manipulation.
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com),
+		  Jakub Jelinek (jj@ultra.linux.cz),
+		  David S. Miller (davem@redhat.com) and
+		  Peter Maydell (pmaydell@chiark.greenend.org.uk).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef SOFT_FP_OP_2_H
+#define SOFT_FP_OP_2_H	1
+
+/* Two-word fraction primitives.  The fraction of a value named X lives in
+   two _FP_W_TYPE variables: X##_f0 (the LOW word) and X##_f1 (the HIGH
+   word).  _FP_FRAC_WORD_2 pastes w onto the name, so w must be the
+   literal token 0 or 1.  _FP_FRAC_SET_2 forwards to __FP_FRAC_SET_2,
+   which takes two values; I is therefore expected to expand to a
+   "high, low" pair such as _FP_ZEROFRAC_2.  */
+#define _FP_FRAC_DECL_2(X)				\
+  _FP_W_TYPE X##_f0 _FP_ZERO_INIT, X##_f1 _FP_ZERO_INIT
+#define _FP_FRAC_COPY_2(D, S)	(D##_f0 = S##_f0, D##_f1 = S##_f1)
+#define _FP_FRAC_SET_2(X, I)	__FP_FRAC_SET_2 (X, I)
+#define _FP_FRAC_HIGH_2(X)	(X##_f1)
+#define _FP_FRAC_LOW_2(X)	(X##_f0)
+#define _FP_FRAC_WORD_2(X, w)	(X##_f##w)
+
+/* Shift the two-word fraction of X left by N bits, in place.  For
+   N < _FP_W_TYPE_SIZE bits move from the low word into the high word;
+   otherwise the low word becomes the high word (shifted by the excess)
+   and the low word is cleared.  Uses GNU statement expressions.
+   NOTE(review): the general N < _FP_W_TYPE_SIZE path shifts X##_f0 right
+   by _FP_W_TYPE_SIZE - (N), so N is expected to be at least 1.  */
+#define _FP_FRAC_SLL_2(X, N)						\
+  (void) (((N) < _FP_W_TYPE_SIZE)					\
+	  ? ({								\
+	      if (__builtin_constant_p (N) && (N) == 1)			\
+		{							\
+		  X##_f1 = X##_f1 + X##_f1 + (((_FP_WS_TYPE) (X##_f0)) < 0); \
+		  X##_f0 += X##_f0;					\
+		}							\
+	      else							\
+		{							\
+		  X##_f1 = X##_f1 << (N) | X##_f0 >> (_FP_W_TYPE_SIZE - (N)); \
+		  X##_f0 <<= (N);					\
+		}							\
+	      0;							\
+	    })								\
+	  : ({								\
+	      X##_f1 = X##_f0 << ((N) - _FP_W_TYPE_SIZE);		\
+	      X##_f0 = 0;						\
+	    }))
+
+
+/* Shift the two-word fraction of X right by N bits, in place; bits
+   shifted out are dropped.  For N >= _FP_W_TYPE_SIZE the high word
+   becomes the low word (shifted by the excess) and the high word is
+   cleared.
+   NOTE(review): the N < _FP_W_TYPE_SIZE path shifts X##_f1 left by
+   _FP_W_TYPE_SIZE - (N), so N is expected to be at least 1.  */
+#define _FP_FRAC_SRL_2(X, N)						\
+  (void) (((N) < _FP_W_TYPE_SIZE)					\
+	  ? ({								\
+	      X##_f0 = X##_f0 >> (N) | X##_f1 << (_FP_W_TYPE_SIZE - (N)); \
+	      X##_f1 >>= (N);						\
+	    })								\
+	  : ({								\
+	      X##_f0 = X##_f1 >> ((N) - _FP_W_TYPE_SIZE);		\
+	      X##_f1 = 0;						\
+	    }))
+
+/* Right shift with sticky-lsb.
+   _FP_FRAC_SRST_2 shifts the two-word fraction of X right by N and
+   stores in S whether any nonzero bit was shifted out.
+   _FP_FRAC_SRS_2 shifts right by N and ORs that same flag into bit 0 of
+   the low word.
+   When N >= _FP_W_TYPE_SIZE the whole old low word counts as lost, plus
+   the low N - _FP_W_TYPE_SIZE bits of the old high word (nothing extra
+   when N equals _FP_W_TYPE_SIZE exactly).  The sz argument is accepted
+   but unused.  */
+#define _FP_FRAC_SRST_2(X, S, N, sz)					\
+  (void) (((N) < _FP_W_TYPE_SIZE)					\
+	  ? ({								\
+	      S = (__builtin_constant_p (N) && (N) == 1			\
+		   ? X##_f0 & 1						\
+		   : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0);		\
+	      X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N)); \
+	      X##_f1 >>= (N);						\
+	    })								\
+	  : ({								\
+	      S = ((((N) == _FP_W_TYPE_SIZE				\
+		     ? 0						\
+		     : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N))))		\
+		    | X##_f0) != 0);					\
+	      X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE));		\
+	      X##_f1 = 0;						\
+	    }))
+
+#define _FP_FRAC_SRS_2(X, N, sz)					\
+  (void) (((N) < _FP_W_TYPE_SIZE)					\
+	  ? ({								\
+	      X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N) \
+			| (__builtin_constant_p (N) && (N) == 1		\
+			   ? X##_f0 & 1					\
+			   : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0)); \
+	      X##_f1 >>= (N);						\
+	    })								\
+	  : ({								\
+	      X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE)		\
+			| ((((N) == _FP_W_TYPE_SIZE			\
+			     ? 0					\
+			     : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N))))	\
+			    | X##_f0) != 0));				\
+	      X##_f1 = 0;						\
+	    }))
+
+/* Two-word fraction arithmetic.  Each macro splits its fraction
+   arguments into (high, low) word pairs and forwards them to the
+   matching __FP_FRAC_*_2 internal defined later in this file.  */
+#define _FP_FRAC_ADDI_2(X, I)	\
+  __FP_FRAC_ADDI_2 (X##_f1, X##_f0, I)
+
+#define _FP_FRAC_ADD_2(R, X, Y)	\
+  __FP_FRAC_ADD_2 (R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)
+
+#define _FP_FRAC_SUB_2(R, X, Y)	\
+  __FP_FRAC_SUB_2 (R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)
+
+#define _FP_FRAC_DEC_2(X, Y)	\
+  __FP_FRAC_DEC_2 (X##_f1, X##_f0, Y##_f1, Y##_f0)
+
+/* Apply __FP_CLZ across the two-word fraction of X, leaving the result
+   in R.  When the high word is zero the low word is examined instead and
+   _FP_W_TYPE_SIZE is added to R to account for the skipped word.  */
+#define _FP_FRAC_CLZ_2(R, X)			\
+  do						\
+    {						\
+      if (X##_f1 == 0)				\
+	{					\
+	  __FP_CLZ ((R), X##_f0);		\
+	  (R) += _FP_W_TYPE_SIZE;		\
+	}					\
+      else					\
+	{					\
+	  __FP_CLZ ((R), X##_f1);		\
+	}					\
+    }						\
+  while (0)
+
+/* Predicates.
+   NEGP tests whether the high word is negative when reinterpreted as
+   _FP_WS_TYPE.  ZEROP tests that both words are zero.  OVERP and
+   CLEAR_OVERP apply _FP_OVERFLOW_##fs to the word selected by
+   _FP_FRAC_HIGH_##fs; HIGHBIT_DW applies _FP_HIGHBIT_DW_##fs to the word
+   selected by _FP_FRAC_HIGH_DW_##fs.  EQ, GT and GE compare the high
+   words first and fall back to the low words on a tie.  */
+#define _FP_FRAC_NEGP_2(X)	((_FP_WS_TYPE) X##_f1 < 0)
+#define _FP_FRAC_ZEROP_2(X)	((X##_f1 | X##_f0) == 0)
+#define _FP_FRAC_OVERP_2(fs, X)	(_FP_FRAC_HIGH_##fs (X) & _FP_OVERFLOW_##fs)
+#define _FP_FRAC_CLEAR_OVERP_2(fs, X)	(_FP_FRAC_HIGH_##fs (X) &= ~_FP_OVERFLOW_##fs)
+#define _FP_FRAC_HIGHBIT_DW_2(fs, X)	\
+  (_FP_FRAC_HIGH_DW_##fs (X) & _FP_HIGHBIT_DW_##fs)
+#define _FP_FRAC_EQ_2(X, Y)	(X##_f1 == Y##_f1 && X##_f0 == Y##_f0)
+#define _FP_FRAC_GT_2(X, Y)	\
+  (X##_f1 > Y##_f1 || (X##_f1 == Y##_f1 && X##_f0 > Y##_f0))
+#define _FP_FRAC_GE_2(X, Y)	\
+  (X##_f1 > Y##_f1 || (X##_f1 == Y##_f1 && X##_f0 >= Y##_f0))
+
+/* Fraction constants, written as "high, low" pairs so they can be
+   passed through _FP_FRAC_SET_2: zero, one, and all bits set.  */
+#define _FP_ZEROFRAC_2		0, 0
+#define _FP_MINFRAC_2		0, 1
+#define _FP_MAXFRAC_2		(~(_FP_WS_TYPE) 0), (~(_FP_WS_TYPE) 0)
+
+/* Internals.  */
+
+/* Set the two-word fraction of X: I1 goes to the high word (X##_f1) and
+   I0 to the low word (X##_f0).  */
+#define __FP_FRAC_SET_2(X, I1, I0)	(X##_f0 = I0, X##_f1 = I1)
+
+/* Apply __FP_CLZ across the word pair (xh, xl), leaving the result in R.
+   When xh is zero, xl is examined instead and _FP_W_TYPE_SIZE is added
+   to R to account for the skipped high word.  */
+#define __FP_CLZ_2(R, xh, xl)			\
+  do						\
+    {						\
+      if ((xh) == 0)				\
+	{					\
+	  __FP_CLZ ((R), xl);			\
+	  (R) += _FP_W_TYPE_SIZE;		\
+	}					\
+      else					\
+	{					\
+	  __FP_CLZ ((R), xh);			\
+	}					\
+    }						\
+  while (0)
+
+/* Double-word add/subtract internals.  The first branch holds plain C
+   implementations that detect carry/borrow by comparing the low-word
+   result against an operand; it is disabled by "#if 0".  The branch that
+   is actually compiled discards any earlier definitions and maps the
+   internals onto add_ssaaaa and sub_ddmmss (defined outside this file;
+   presumably the double-word primitives from longlong.h -- confirm).  */
+#if 0
+
+# ifndef __FP_FRAC_ADDI_2
+#  define __FP_FRAC_ADDI_2(xh, xl, i)	\
+  (xh += ((xl += i) < i))
+# endif
+# ifndef __FP_FRAC_ADD_2
+#  define __FP_FRAC_ADD_2(rh, rl, xh, xl, yh, yl)	\
+  (rh = xh + yh + ((rl = xl + yl) < xl))
+# endif
+# ifndef __FP_FRAC_SUB_2
+#  define __FP_FRAC_SUB_2(rh, rl, xh, xl, yh, yl)	\
+  (rh = xh - yh - ((rl = xl - yl) > xl))
+# endif
+# ifndef __FP_FRAC_DEC_2
+#  define __FP_FRAC_DEC_2(xh, xl, yh, yl)		\
+  do							\
+    {							\
+      UWtype __FP_FRAC_DEC_2_t = xl;			\
+      xh -= yh + ((xl -= yl) > __FP_FRAC_DEC_2_t);	\
+    }							\
+  while (0)
+# endif
+
+#else
+
+/* ADDI adds the single word i to (xh:xl) by passing 0 as the high word
+   of the second operand; DEC subtracts (yh:yl) from (xh:xl) in place.  */
+# undef __FP_FRAC_ADDI_2
+# define __FP_FRAC_ADDI_2(xh, xl, i)	add_ssaaaa (xh, xl, xh, xl, 0, i)
+# undef __FP_FRAC_ADD_2
+# define __FP_FRAC_ADD_2		add_ssaaaa
+# undef __FP_FRAC_SUB_2
+# define __FP_FRAC_SUB_2		sub_ddmmss
+# undef __FP_FRAC_DEC_2
+# define __FP_FRAC_DEC_2(xh, xl, yh, yl)	\
+  sub_ddmmss (xh, xl, xh, xl, yh, yl)
+
+#endif
+
+/* Unpack the raw bits of a native fp value.  Do not classify or
+   normalize the data.
+   _FP_UNPACK_RAW_2 takes the value itself, stores it in the flt member of
+   a local union _FP_UNION_##fs and reads bits.frac0, bits.frac1,
+   bits.exp and bits.sign into X##_f0, X##_f1, X##_e and X##_s.
+   _FP_UNPACK_RAW_2_P does the same through a pointer: val is cast to
+   union _FP_UNION_##fs * and read in place.  */
+
+#define _FP_UNPACK_RAW_2(fs, X, val)			\
+  do							\
+    {							\
+      union _FP_UNION_##fs _FP_UNPACK_RAW_2_flo;	\
+      _FP_UNPACK_RAW_2_flo.flt = (val);			\
+							\
+      X##_f0 = _FP_UNPACK_RAW_2_flo.bits.frac0;		\
+      X##_f1 = _FP_UNPACK_RAW_2_flo.bits.frac1;		\
+      X##_e  = _FP_UNPACK_RAW_2_flo.bits.exp;		\
+      X##_s  = _FP_UNPACK_RAW_2_flo.bits.sign;		\
+    }							\
+  while (0)
+
+#define _FP_UNPACK_RAW_2_P(fs, X, val)			\
+  do							\
+    {							\
+      union _FP_UNION_##fs *_FP_UNPACK_RAW_2_P_flo	\
+	= (union _FP_UNION_##fs *) (val);		\
+							\
+      X##_f0 = _FP_UNPACK_RAW_2_P_flo->bits.frac0;	\
+      X##_f1 = _FP_UNPACK_RAW_2_P_flo->bits.frac1;	\
+      X##_e  = _FP_UNPACK_RAW_2_P_flo->bits.exp;	\
+      X##_s  = _FP_UNPACK_RAW_2_P_flo->bits.sign;	\
+    }							\
+  while (0)
+
+
+/* Repack the raw bits of a native fp value.
+   _FP_PACK_RAW_2 writes X##_f0, X##_f1, X##_e and X##_s into the bits
+   members of a local union _FP_UNION_##fs and assigns its flt member to
+   val.  _FP_PACK_RAW_2_P writes the same four members directly through
+   val, which is cast to union _FP_UNION_##fs *.  */
+
+#define _FP_PACK_RAW_2(fs, val, X)		\
+  do						\
+    {						\
+      union _FP_UNION_##fs _FP_PACK_RAW_2_flo;	\
+						\
+      _FP_PACK_RAW_2_flo.bits.frac0 = X##_f0;	\
+      _FP_PACK_RAW_2_flo.bits.frac1 = X##_f1;	\
+      _FP_PACK_RAW_2_flo.bits.exp   = X##_e;	\
+      _FP_PACK_RAW_2_flo.bits.sign  = X##_s;	\
+						\
+      (val) = _FP_PACK_RAW_2_flo.flt;		\
+    }						\
+  while (0)
+
+#define _FP_PACK_RAW_2_P(fs, val, X)			\
+  do							\
+    {							\
+      union _FP_UNION_##fs *_FP_PACK_RAW_2_P_flo	\
+	= (union _FP_UNION_##fs *) (val);		\
+							\
+      _FP_PACK_RAW_2_P_flo->bits.frac0 = X##_f0;	\
+      _FP_PACK_RAW_2_P_flo->bits.frac1 = X##_f1;	\
+      _FP_PACK_RAW_2_P_flo->bits.exp   = X##_e;		\
+      _FP_PACK_RAW_2_P_flo->bits.sign  = X##_s;		\
+    }							\
+  while (0)
+
+
+/* Multiplication algorithms: */
+
+/* Given a 1W * 1W => 2W primitive, do the extended multiplication.
+   R must be a four-word fraction (it is accessed through
+   _FP_FRAC_WORD_4).  Four partial products are formed with doit, which
+   is called as doit (high, low, x, y): X0*Y0 lands in words 1:0 of R,
+   X1*Y1 in words 3:2, and the two cross products X0*Y1 and X1*Y0 go to
+   temporaries that are then added into words 3..1 with __FP_FRAC_ADD_3
+   (defined outside this file).
+   _FP_MUL_MEAT_2_wide normalizes the four-word product with a sticky
+   right shift by wfracbits-1 and keeps words 1:0 as the result.  */
+
+#define _FP_MUL_MEAT_DW_2_wide(wfracbits, R, X, Y, doit)		\
+  do									\
+    {									\
+      _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_2_wide_b);			\
+      _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_2_wide_c);			\
+									\
+      doit (_FP_FRAC_WORD_4 (R, 1), _FP_FRAC_WORD_4 (R, 0),		\
+	    X##_f0, Y##_f0);						\
+      doit (_FP_MUL_MEAT_DW_2_wide_b_f1, _FP_MUL_MEAT_DW_2_wide_b_f0,	\
+	    X##_f0, Y##_f1);						\
+      doit (_FP_MUL_MEAT_DW_2_wide_c_f1, _FP_MUL_MEAT_DW_2_wide_c_f0,	\
+	    X##_f1, Y##_f0);						\
+      doit (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),		\
+	    X##_f1, Y##_f1);						\
+									\
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
+		       _FP_FRAC_WORD_4 (R, 1), 0,			\
+		       _FP_MUL_MEAT_DW_2_wide_b_f1,			\
+		       _FP_MUL_MEAT_DW_2_wide_b_f0,			\
+		       _FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
+		       _FP_FRAC_WORD_4 (R, 1));				\
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
+		       _FP_FRAC_WORD_4 (R, 1), 0,			\
+		       _FP_MUL_MEAT_DW_2_wide_c_f1,			\
+		       _FP_MUL_MEAT_DW_2_wide_c_f0,			\
+		       _FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
+		       _FP_FRAC_WORD_4 (R, 1));				\
+    }									\
+  while (0)
+
+#define _FP_MUL_MEAT_2_wide(wfracbits, R, X, Y, doit)			\
+  do									\
+    {									\
+      _FP_FRAC_DECL_4 (_FP_MUL_MEAT_2_wide_z);				\
+									\
+      _FP_MUL_MEAT_DW_2_wide ((wfracbits), _FP_MUL_MEAT_2_wide_z,	\
+			      X, Y, doit);				\
+									\
+      /* Normalize since we know where the msb of the multiplicands	\
+	 were (bit B), we know that the msb of the product is		\
+	 at either 2B or 2B-1.  */					\
+      _FP_FRAC_SRS_4 (_FP_MUL_MEAT_2_wide_z, (wfracbits)-1,		\
+		      2*(wfracbits));					\
+      R##_f0 = _FP_FRAC_WORD_4 (_FP_MUL_MEAT_2_wide_z, 0);		\
+      R##_f1 = _FP_FRAC_WORD_4 (_FP_MUL_MEAT_2_wide_z, 1);		\
+    }									\
+  while (0)
+
+/* Given a 1W * 1W => 2W primitive, do the extended multiplication.
+   Do only 3 multiplications instead of four. This one is for machines
+   where multiplication is much more expensive than subtraction.
+   The three products are X0*Y0, (X0+X1)*(Y0+Y1) and X1*Y1.  c1 and c2
+   record whether the sums X0+X1 and Y0+Y1 wrapped around a word; the
+   corrections for those carries and the subtraction of the two outer
+   products from the middle one are applied with the __FP_FRAC_ADD_3,
+   __FP_FRAC_ADDI_2 and __FP_FRAC_DEC_3 steps below.  R must be a
+   four-word fraction.
+   _FP_MUL_MEAT_2_wide_3mul normalizes the four-word product with a
+   sticky right shift by wfracbits-1 and keeps words 1:0 as the
+   result.  */
+
+#define _FP_MUL_MEAT_DW_2_wide_3mul(wfracbits, R, X, Y, doit)		\
+  do									\
+    {									\
+      _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_2_wide_3mul_b);			\
+      _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_2_wide_3mul_c);			\
+      _FP_W_TYPE _FP_MUL_MEAT_DW_2_wide_3mul_d;				\
+      int _FP_MUL_MEAT_DW_2_wide_3mul_c1;				\
+      int _FP_MUL_MEAT_DW_2_wide_3mul_c2;				\
+									\
+      _FP_MUL_MEAT_DW_2_wide_3mul_b_f0 = X##_f0 + X##_f1;		\
+      _FP_MUL_MEAT_DW_2_wide_3mul_c1					\
+	= _FP_MUL_MEAT_DW_2_wide_3mul_b_f0 < X##_f0;			\
+      _FP_MUL_MEAT_DW_2_wide_3mul_b_f1 = Y##_f0 + Y##_f1;		\
+      _FP_MUL_MEAT_DW_2_wide_3mul_c2					\
+	= _FP_MUL_MEAT_DW_2_wide_3mul_b_f1 < Y##_f0;			\
+      doit (_FP_MUL_MEAT_DW_2_wide_3mul_d, _FP_FRAC_WORD_4 (R, 0),	\
+	    X##_f0, Y##_f0);						\
+      doit (_FP_FRAC_WORD_4 (R, 2), _FP_FRAC_WORD_4 (R, 1),		\
+	    _FP_MUL_MEAT_DW_2_wide_3mul_b_f0,				\
+	    _FP_MUL_MEAT_DW_2_wide_3mul_b_f1);				\
+      doit (_FP_MUL_MEAT_DW_2_wide_3mul_c_f1,				\
+	    _FP_MUL_MEAT_DW_2_wide_3mul_c_f0, X##_f1, Y##_f1);		\
+									\
+      /* Keep each wrapped sum only if the other sum carried.  */	\
+      _FP_MUL_MEAT_DW_2_wide_3mul_b_f0					\
+	&= -_FP_MUL_MEAT_DW_2_wide_3mul_c2;				\
+      _FP_MUL_MEAT_DW_2_wide_3mul_b_f1					\
+	&= -_FP_MUL_MEAT_DW_2_wide_3mul_c1;				\
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
+		       _FP_FRAC_WORD_4 (R, 1),				\
+		       (_FP_MUL_MEAT_DW_2_wide_3mul_c1			\
+			& _FP_MUL_MEAT_DW_2_wide_3mul_c2), 0,		\
+		       _FP_MUL_MEAT_DW_2_wide_3mul_d,			\
+		       0, _FP_FRAC_WORD_4 (R, 2), _FP_FRAC_WORD_4 (R, 1)); \
+      __FP_FRAC_ADDI_2 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
+			_FP_MUL_MEAT_DW_2_wide_3mul_b_f0);		\
+      __FP_FRAC_ADDI_2 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
+			_FP_MUL_MEAT_DW_2_wide_3mul_b_f1);		\
+      __FP_FRAC_DEC_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
+		       _FP_FRAC_WORD_4 (R, 1),				\
+		       0, _FP_MUL_MEAT_DW_2_wide_3mul_d,		\
+		       _FP_FRAC_WORD_4 (R, 0));				\
+      __FP_FRAC_DEC_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
+		       _FP_FRAC_WORD_4 (R, 1), 0,			\
+		       _FP_MUL_MEAT_DW_2_wide_3mul_c_f1,		\
+		       _FP_MUL_MEAT_DW_2_wide_3mul_c_f0);		\
+      __FP_FRAC_ADD_2 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
+		       _FP_MUL_MEAT_DW_2_wide_3mul_c_f1,		\
+		       _FP_MUL_MEAT_DW_2_wide_3mul_c_f0,		\
+		       _FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2));	\
+    }									\
+  while (0)
+
+#define _FP_MUL_MEAT_2_wide_3mul(wfracbits, R, X, Y, doit)		\
+  do									\
+    {									\
+      _FP_FRAC_DECL_4 (_FP_MUL_MEAT_2_wide_3mul_z);			\
+									\
+      _FP_MUL_MEAT_DW_2_wide_3mul ((wfracbits),				\
+				   _FP_MUL_MEAT_2_wide_3mul_z,		\
+				   X, Y, doit);				\
+									\
+      /* Normalize since we know where the msb of the multiplicands	\
+	 were (bit B), we know that the msb of the product is		\
+	 at either 2B or 2B-1.  */					\
+      _FP_FRAC_SRS_4 (_FP_MUL_MEAT_2_wide_3mul_z,			\
+		      (wfracbits)-1, 2*(wfracbits));			\
+      R##_f0 = _FP_FRAC_WORD_4 (_FP_MUL_MEAT_2_wide_3mul_z, 0);		\
+      R##_f1 = _FP_FRAC_WORD_4 (_FP_MUL_MEAT_2_wide_3mul_z, 1);		\
+    }									\
+  while (0)
+
+/* Multiplication through mpn_mul_n (defined outside this file;
+   presumably the GMP low-level routine -- confirm).  The two words of X
+   and of Y are copied, low word first, into local two-element arrays and
+   multiplied into R##_f, which is used as an array here, so R is
+   expected to be a four-word fraction.
+   _FP_MUL_MEAT_2_gmp normalizes that product with a sticky right shift
+   by wfracbits-1 and keeps elements 0 and 1 as the result.  */
+#define _FP_MUL_MEAT_DW_2_gmp(wfracbits, R, X, Y)	\
+  do							\
+    {							\
+      _FP_W_TYPE _FP_MUL_MEAT_DW_2_gmp_x[2];		\
+      _FP_W_TYPE _FP_MUL_MEAT_DW_2_gmp_y[2];		\
+      _FP_MUL_MEAT_DW_2_gmp_x[0] = X##_f0;		\
+      _FP_MUL_MEAT_DW_2_gmp_x[1] = X##_f1;		\
+      _FP_MUL_MEAT_DW_2_gmp_y[0] = Y##_f0;		\
+      _FP_MUL_MEAT_DW_2_gmp_y[1] = Y##_f1;		\
+							\
+      mpn_mul_n (R##_f, _FP_MUL_MEAT_DW_2_gmp_x,	\
+		 _FP_MUL_MEAT_DW_2_gmp_y, 2);		\
+    }							\
+  while (0)
+
+#define _FP_MUL_MEAT_2_gmp(wfracbits, R, X, Y)				\
+  do									\
+    {									\
+      _FP_FRAC_DECL_4 (_FP_MUL_MEAT_2_gmp_z);				\
+									\
+      _FP_MUL_MEAT_DW_2_gmp ((wfracbits), _FP_MUL_MEAT_2_gmp_z, X, Y);	\
+									\
+      /* Normalize since we know where the msb of the multiplicands	\
+	 were (bit B), we know that the msb of the product is		\
+	 at either 2B or 2B-1.  */					\
+      _FP_FRAC_SRS_4 (_FP_MUL_MEAT_2_gmp_z, (wfracbits)-1,		\
+		      2*(wfracbits));					\
+      R##_f0 = _FP_MUL_MEAT_2_gmp_z_f[0];				\
+      R##_f1 = _FP_MUL_MEAT_2_gmp_z_f[1];				\
+    }									\
+  while (0)
+
+/* Do at most 120x120=240 bits multiplication using double floating
+   point multiplication.  This is useful if floating point
+   multiplication has much bigger throughput than integer multiply.
+   It is supposed to work for _FP_W_TYPE_SIZE 64 and wfracbits
+   between 106 and 120 only.
+   Caller guarantees that X and Y have (1LL << (wfracbits - 1)) set.
+   SETFETZ is a macro which will disable all FPU exceptions and set rounding
+   towards zero; RESETFE should optionally reset it back.
+   The result is written to R##_f1:R##_f0; bit 0 of R##_f0 additionally
+   receives a sticky flag (_y240) that is set when any of the four
+   lowest partial sums still holds a nonzero residue.  */
+
+#define _FP_MUL_MEAT_2_120_240_double(wfracbits, R, X, Y, setfetz, resetfe) \
+  do									\
+    {									\
+      static const double _const[] =					\
+	{								\
+	  /* 2^-24 */ 5.9604644775390625e-08,				\
+	  /* 2^-48 */ 3.5527136788005009e-15,				\
+	  /* 2^-72 */ 2.1175823681357508e-22,				\
+	  /* 2^-96 */ 1.2621774483536189e-29,				\
+	  /* 2^28 */ 2.68435456e+08,					\
+	  /* 2^4 */ 1.600000e+01,					\
+	  /* 2^-20 */ 9.5367431640625e-07,				\
+	  /* 2^-44 */ 5.6843418860808015e-14,				\
+	  /* 2^-68 */ 3.3881317890172014e-21,				\
+	  /* 2^-92 */ 2.0194839173657902e-28,				\
+	  /* 2^-116 */ 1.2037062152420224e-35				\
+	};								\
+      double _a240, _b240, _c240, _d240, _e240, _f240,			\
+	_g240, _h240, _i240, _j240, _k240;				\
+      union { double d; UDItype i; } _l240, _m240, _n240, _o240,	\
+				       _p240, _q240, _r240, _s240;	\
+      UDItype _t240, _u240, _v240, _w240, _x240, _y240 = 0;		\
+									\
+      _FP_STATIC_ASSERT ((wfracbits) >= 106 && (wfracbits) <= 120,	\
+			 "wfracbits out of range");			\
+									\
+      setfetz;								\
+									\
+      /* Cut X (a..e) and Y (f..j) into 24-bit limbs, lowest limb in	\
+	 e/j; the top limb a/f takes everything from bit 96 up.  Each	\
+	 limb is converted to double.  */				\
+      _e240 = (double) (long) (X##_f0 & 0xffffff);			\
+      _j240 = (double) (long) (Y##_f0 & 0xffffff);			\
+      _d240 = (double) (long) ((X##_f0 >> 24) & 0xffffff);		\
+      _i240 = (double) (long) ((Y##_f0 >> 24) & 0xffffff);		\
+      _c240 = (double) (long) (((X##_f1 << 16) & 0xffffff) | (X##_f0 >> 48)); \
+      _h240 = (double) (long) (((Y##_f1 << 16) & 0xffffff) | (Y##_f0 >> 48)); \
+      _b240 = (double) (long) ((X##_f1 >> 8) & 0xffffff);		\
+      _g240 = (double) (long) ((Y##_f1 >> 8) & 0xffffff);		\
+      _a240 = (double) (long) (X##_f1 >> 32);				\
+      _f240 = (double) (long) (Y##_f1 >> 32);				\
+      /* Scale the lower limbs by 2^-96, 2^-72, 2^-48 and 2^-24.  */	\
+      _e240 *= _const[3];						\
+      _j240 *= _const[3];						\
+      _d240 *= _const[2];						\
+      _i240 *= _const[2];						\
+      _c240 *= _const[1];						\
+      _h240 *= _const[1];						\
+      _b240 *= _const[0];						\
+      _g240 *= _const[0];						\
+      /* Column sums of the limb products, lowest column in _s240.  */	\
+      _s240.d =							      _e240*_j240; \
+      _r240.d =						_d240*_j240 + _e240*_i240; \
+      _q240.d =				  _c240*_j240 + _d240*_i240 + _e240*_h240; \
+      _p240.d =		    _b240*_j240 + _c240*_i240 + _d240*_h240 + _e240*_g240; \
+      _o240.d = _a240*_j240 + _b240*_i240 + _c240*_h240 + _d240*_g240 + _e240*_f240; \
+      _n240.d = _a240*_i240 + _b240*_h240 + _c240*_g240 + _d240*_f240;	\
+      _m240.d = _a240*_h240 + _b240*_g240 + _c240*_f240;		\
+      _l240.d = _a240*_g240 + _b240*_f240;				\
+      _k240 =   _a240*_f240;						\
+      _r240.d += _s240.d;						\
+      _q240.d += _r240.d;						\
+      _p240.d += _q240.d;						\
+      _o240.d += _p240.d;						\
+      _n240.d += _o240.d;						\
+      _m240.d += _n240.d;						\
+      _l240.d += _m240.d;						\
+      _k240 += _l240.d;							\
+      _s240.d -= ((_const[10]+_s240.d)-_const[10]);			\
+      _r240.d -= ((_const[9]+_r240.d)-_const[9]);			\
+      _q240.d -= ((_const[8]+_q240.d)-_const[8]);			\
+      _p240.d -= ((_const[7]+_p240.d)-_const[7]);			\
+      _o240.d += _const[7];						\
+      _n240.d += _const[6];						\
+      _m240.d += _const[5];						\
+      _l240.d += _const[4];						\
+      /* Any residue left in the four lowest sums makes the result	\
+	 inexact; remember that in the sticky flag.  */			\
+      if (_s240.d != 0.0)						\
+	_y240 = 1;							\
+      if (_r240.d != 0.0)						\
+	_y240 = 1;							\
+      if (_q240.d != 0.0)						\
+	_y240 = 1;							\
+      if (_p240.d != 0.0)						\
+	_y240 = 1;							\
+      _t240 = (DItype) _k240;						\
+      _u240 = _l240.i;							\
+      _v240 = _m240.i;							\
+      _w240 = _n240.i;							\
+      _x240 = _o240.i;							\
+      R##_f1 = ((_t240 << (128 - (wfracbits - 1)))			\
+		| ((_u240 & 0xffffff) >> ((wfracbits - 1) - 104)));	\
+      R##_f0 = (((_u240 & 0xffffff) << (168 - (wfracbits - 1)))		\
+		| ((_v240 & 0xffffff) << (144 - (wfracbits - 1)))	\
+		| ((_w240 & 0xffffff) << (120 - (wfracbits - 1)))	\
+		| ((_x240 & 0xffffff) >> ((wfracbits - 1) - 96))	\
+		| _y240);						\
+      resetfe;								\
+    }									\
+  while (0)
+
+/* Division algorithms: */
+
+/* Divide the two-word fraction X by the two-word fraction Y, storing the
+   two-word quotient in R (R##_f1 is the high word, R##_f0 the low word).
+
+   fs selects _FP_WFRACXBITS_##fs, the left shift applied to Y before
+   dividing.  Side effects visible to the caller: Y is shifted in place,
+   and R##_e is decremented when X < Y.
+
+   Each quotient word is first estimated with udiv_qrnnd against Y##_f1
+   and then corrected downwards, at most twice, by comparing the product
+   of the estimate and Y##_f0 (from umul_ppmm) against the running
+   remainder.  */
+#define _FP_DIV_MEAT_2_udiv(fs, R, X, Y)				\
+  do									\
+    {									\
+      _FP_W_TYPE _FP_DIV_MEAT_2_udiv_n_f2;				\
+      _FP_W_TYPE _FP_DIV_MEAT_2_udiv_n_f1;				\
+      _FP_W_TYPE _FP_DIV_MEAT_2_udiv_n_f0;				\
+      _FP_W_TYPE _FP_DIV_MEAT_2_udiv_r_f1;				\
+      _FP_W_TYPE _FP_DIV_MEAT_2_udiv_r_f0;				\
+      _FP_W_TYPE _FP_DIV_MEAT_2_udiv_m_f1;				\
+      _FP_W_TYPE _FP_DIV_MEAT_2_udiv_m_f0;				\
+      /* Build the three-word numerator n_f2:n_f1:n_f0 from X.  */	\
+      if (_FP_FRAC_GE_2 (X, Y))						\
+	{								\
+	  /* X >= Y: use X shifted right by one bit.  */		\
+	  _FP_DIV_MEAT_2_udiv_n_f2 = X##_f1 >> 1;			\
+	  _FP_DIV_MEAT_2_udiv_n_f1					\
+	    = X##_f1 << (_FP_W_TYPE_SIZE - 1) | X##_f0 >> 1;		\
+	  _FP_DIV_MEAT_2_udiv_n_f0					\
+	    = X##_f0 << (_FP_W_TYPE_SIZE - 1);				\
+	}								\
+      else								\
+	{								\
+	  /* X < Y: use X as is and compensate in the exponent.  */	\
+	  R##_e--;							\
+	  _FP_DIV_MEAT_2_udiv_n_f2 = X##_f1;				\
+	  _FP_DIV_MEAT_2_udiv_n_f1 = X##_f0;				\
+	  _FP_DIV_MEAT_2_udiv_n_f0 = 0;					\
+	}								\
+									\
+      /* Normalize, i.e. make the most significant bit of the		\
+	 denominator set.  */						\
+      _FP_FRAC_SLL_2 (Y, _FP_WFRACXBITS_##fs);				\
+									\
+      /* High quotient word: estimate, then correct.  */		\
+      udiv_qrnnd (R##_f1, _FP_DIV_MEAT_2_udiv_r_f1,			\
+		  _FP_DIV_MEAT_2_udiv_n_f2, _FP_DIV_MEAT_2_udiv_n_f1,	\
+		  Y##_f1);						\
+      umul_ppmm (_FP_DIV_MEAT_2_udiv_m_f1, _FP_DIV_MEAT_2_udiv_m_f0,	\
+		 R##_f1, Y##_f0);					\
+      _FP_DIV_MEAT_2_udiv_r_f0 = _FP_DIV_MEAT_2_udiv_n_f0;		\
+      if (_FP_FRAC_GT_2 (_FP_DIV_MEAT_2_udiv_m, _FP_DIV_MEAT_2_udiv_r))	\
+	{								\
+	  R##_f1--;							\
+	  _FP_FRAC_ADD_2 (_FP_DIV_MEAT_2_udiv_r, Y,			\
+			  _FP_DIV_MEAT_2_udiv_r);			\
+	  if (_FP_FRAC_GE_2 (_FP_DIV_MEAT_2_udiv_r, Y)			\
+	      && _FP_FRAC_GT_2 (_FP_DIV_MEAT_2_udiv_m,			\
+				_FP_DIV_MEAT_2_udiv_r))			\
+	    {								\
+	      R##_f1--;							\
+	      _FP_FRAC_ADD_2 (_FP_DIV_MEAT_2_udiv_r, Y,			\
+			      _FP_DIV_MEAT_2_udiv_r);			\
+	    }								\
+	}								\
+      _FP_FRAC_DEC_2 (_FP_DIV_MEAT_2_udiv_r, _FP_DIV_MEAT_2_udiv_m);	\
+									\
+      /* Low quotient word.  */						\
+      if (_FP_DIV_MEAT_2_udiv_r_f1 == Y##_f1)				\
+	{								\
+	  /* This is a special case, not an optimization		\
+	     (_FP_DIV_MEAT_2_udiv_r/Y##_f1 would not fit into UWtype).	\
+	     As _FP_DIV_MEAT_2_udiv_r is guaranteed to be < Y,		\
+	     R##_f0 can be either (UWtype)-1 or (UWtype)-2.  But as we	\
+	     know what kind of bits it is (sticky, guard, round),	\
+	     we don't care.  We also don't care what the remainder is,	\
+	     because the guard bit will be set anyway.  -jj */		\
+	  R##_f0 = -1;							\
+	}								\
+      else								\
+	{								\
+	  udiv_qrnnd (R##_f0, _FP_DIV_MEAT_2_udiv_r_f1,			\
+		      _FP_DIV_MEAT_2_udiv_r_f1,				\
+		      _FP_DIV_MEAT_2_udiv_r_f0, Y##_f1);		\
+	  umul_ppmm (_FP_DIV_MEAT_2_udiv_m_f1,				\
+		     _FP_DIV_MEAT_2_udiv_m_f0, R##_f0, Y##_f0);		\
+	  _FP_DIV_MEAT_2_udiv_r_f0 = 0;					\
+	  if (_FP_FRAC_GT_2 (_FP_DIV_MEAT_2_udiv_m,			\
+			     _FP_DIV_MEAT_2_udiv_r))			\
+	    {								\
+	      R##_f0--;							\
+	      _FP_FRAC_ADD_2 (_FP_DIV_MEAT_2_udiv_r, Y,			\
+			      _FP_DIV_MEAT_2_udiv_r);			\
+	      if (_FP_FRAC_GE_2 (_FP_DIV_MEAT_2_udiv_r, Y)		\
+		  && _FP_FRAC_GT_2 (_FP_DIV_MEAT_2_udiv_m,		\
+				    _FP_DIV_MEAT_2_udiv_r))		\
+		{							\
+		  R##_f0--;						\
+		  _FP_FRAC_ADD_2 (_FP_DIV_MEAT_2_udiv_r, Y,		\
+				  _FP_DIV_MEAT_2_udiv_r);		\
+		}							\
+	    }								\
+	  /* Remainder and product differ: record it as sticky.  */	\
+	  if (!_FP_FRAC_EQ_2 (_FP_DIV_MEAT_2_udiv_r,			\
+			      _FP_DIV_MEAT_2_udiv_m))			\
+	    R##_f0 |= _FP_WORK_STICKY;					\
+	}								\
+    }									\
+  while (0)
+
+
+/* Square root algorithms:
+   We have just one right now, maybe Newton approximation
+   should be added for those machines where division is fast.  */
+
+/* Bit-at-a-time square root of the two-word fraction X.
+
+   R accumulates the accepted bits (q is added for each one) and S
+   accumulates twice each accepted bit (T + q); both are only added to
+   here, so the caller presumably zeroes them first -- TODO confirm
+   against the caller.  T is scratch.  X is consumed: it is reduced when
+   a bit is accepted and shifted left by one bit on every iteration.
+   q must be a modifiable lvalue holding the first candidate bit for the
+   high word; it is clobbered.
+
+   The first loop produces the bits of R##_f1; the second produces the
+   bits of R##_f0 from the top bit down to, but excluding,
+   _FP_WORK_ROUND.  If X is still nonzero afterwards, _FP_WORK_STICKY is
+   set in R##_f0, and _FP_WORK_ROUND as well when S < X.  */
+#define _FP_SQRT_MEAT_2(R, S, T, X, q)				\
+  do								\
+    {								\
+      /* High word.  */						\
+      while (q)							\
+	{							\
+	  T##_f1 = S##_f1 + (q);				\
+	  if (T##_f1 <= X##_f1)					\
+	    {							\
+	      S##_f1 = T##_f1 + (q);				\
+	      X##_f1 -= T##_f1;					\
+	      R##_f1 += (q);					\
+	    }							\
+	  _FP_FRAC_SLL_2 (X, 1);				\
+	  (q) >>= 1;						\
+	}							\
+      /* Low word: restart q at the top bit.  */		\
+      (q) = (_FP_W_TYPE) 1 << (_FP_W_TYPE_SIZE - 1);		\
+      while ((q) != _FP_WORK_ROUND)				\
+	{							\
+	  T##_f0 = S##_f0 + (q);				\
+	  T##_f1 = S##_f1;					\
+	  if (T##_f1 < X##_f1					\
+	      || (T##_f1 == X##_f1 && T##_f0 <= X##_f0))	\
+	    {							\
+	      S##_f0 = T##_f0 + (q);				\
+	      /* Carry from the low word of S into the high.  */	\
+	      S##_f1 += (T##_f0 > S##_f0);			\
+	      _FP_FRAC_DEC_2 (X, T);				\
+	      R##_f0 += (q);					\
+	    }							\
+	  _FP_FRAC_SLL_2 (X, 1);				\
+	  (q) >>= 1;						\
+	}							\
+      /* Nonzero remainder: set round (if S < X) and sticky.  */	\
+      if (X##_f0 | X##_f1)					\
+	{							\
+	  if (S##_f1 < X##_f1					\
+	      || (S##_f1 == X##_f1 && S##_f0 < X##_f0))		\
+	    R##_f0 |= _FP_WORK_ROUND;				\
+	  R##_f0 |= _FP_WORK_STICKY;				\
+	}							\
+    }								\
+  while (0)
+
+
+/* Assembly/disassembly for converting to/from integral types.
+   No shifting or overflow handled here.  */
+
+/* Build the integer r from the two-word fraction X.  When rsize is
+   larger than one word, r becomes X##_f1 shifted up by one word plus
+   X##_f0; otherwise only X##_f0 is stored.  The whole expression is
+   cast to void.  */
+#define _FP_FRAC_ASSEMBLE_2(r, X, rsize)		\
+  (void) (((rsize) > _FP_W_TYPE_SIZE)			\
+	  ? ({						\
+	      (r) = X##_f1;				\
+	      (r) <<= _FP_W_TYPE_SIZE;			\
+	      (r) += X##_f0;				\
+	    })						\
+	  : ({ (r) = X##_f0; }))
+
+/* Split the integer r into the two-word fraction X: X##_f0 receives r
+   itself, and X##_f1 receives r shifted down by one word when rsize is
+   larger than a word, or zero otherwise.  */
+#define _FP_FRAC_DISASSEMBLE_2(X, r, rsize)		\
+  do							\
+    {							\
+      X##_f0 = (r);					\
+      X##_f1 = ((rsize) > _FP_W_TYPE_SIZE		\
+		? (r) >> _FP_W_TYPE_SIZE		\
+		: 0);					\
+    }							\
+  while (0)
+
+/* Convert FP values between word sizes.  */
+
+/* Two-word source to one-word destination: keep only the low word
+   S##_f0; S##_f1 is not read.  */
+#define _FP_FRAC_COPY_1_2(D, S)		(D##_f = S##_f0)
+
+/* One-word source to two-word destination: the low word is copied and
+   the high word is cleared.  */
+#define _FP_FRAC_COPY_2_1(D, S)		((D##_f0 = S##_f), (D##_f1 = 0))
+
+/* Two-word to two-word: forwards to _FP_FRAC_COPY_2.  */
+#define _FP_FRAC_COPY_2_2(D, S)		_FP_FRAC_COPY_2 (D, S)
+
+#endif /* !SOFT_FP_OP_2_H */
--- a/src/gemm/soft-fp/op-4.h
+++ b/src/gemm/soft-fp/op-4.h
@ -0,0 +1,882 @@
+/* Software floating-point emulation.
+   Basic four-word fraction declaration and manipulation.
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com),
+		  Jakub Jelinek (jj@ultra.linux.cz),
+		  David S. Miller (davem@redhat.com) and
+		  Peter Maydell (pmaydell@chiark.greenend.org.uk).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef SOFT_FP_OP_4_H
+#define SOFT_FP_OP_4_H	1
+
+/* A four-word fraction named X is the array X##_f of four _FP_W_TYPE
+   words; X##_f[0] is the least and X##_f[3] the most significant word
+   (see _FP_FRAC_LOW_4 / _FP_FRAC_HIGH_4 below).  */
+#define _FP_FRAC_DECL_4(X)	_FP_W_TYPE X##_f[4]
+/* Copy all four words of S into D.  */
+#define _FP_FRAC_COPY_4(D, S)			\
+  (D##_f[0] = S##_f[0], D##_f[1] = S##_f[1],	\
+   D##_f[2] = S##_f[2], D##_f[3] = S##_f[3])
+/* Set X from I.  I must expand to four comma-separated word values,
+   most significant first (for example _FP_ZEROFRAC_4), because it is
+   forwarded to the five-argument __FP_FRAC_SET_4.  */
+#define _FP_FRAC_SET_4(X, I)	__FP_FRAC_SET_4 (X, I)
+/* Most significant word, least significant word, and word number w
+   (0 is the least significant) of X, each usable as an lvalue.  */
+#define _FP_FRAC_HIGH_4(X)	(X##_f[3])
+#define _FP_FRAC_LOW_4(X)	(X##_f[0])
+#define _FP_FRAC_WORD_4(X, w)	(X##_f[w])
+
+/* Shift the four-word fraction X left by N bits, in place.
+
+   N is split into a whole-word part (skip) and a sub-word part (up).
+   Words are moved towards the high index, highest word first, and the
+   vacated low words are cleared by the final loop.  A sub-word shift of
+   zero is handled by its own branch, which avoids shifting a word by
+   _FP_W_TYPE_SIZE bits (down would equal the word size).
+
+   NOTE(review): N is not range-checked here; presumably callers keep it
+   below 4 * _FP_W_TYPE_SIZE -- confirm.  */
+#define _FP_FRAC_SLL_4(X, N)						\
+  do									\
+    {									\
+      _FP_I_TYPE _FP_FRAC_SLL_4_up, _FP_FRAC_SLL_4_down;		\
+      _FP_I_TYPE _FP_FRAC_SLL_4_skip, _FP_FRAC_SLL_4_i;			\
+      _FP_FRAC_SLL_4_skip = (N) / _FP_W_TYPE_SIZE;			\
+      _FP_FRAC_SLL_4_up = (N) % _FP_W_TYPE_SIZE;			\
+      _FP_FRAC_SLL_4_down = _FP_W_TYPE_SIZE - _FP_FRAC_SLL_4_up;	\
+      if (!_FP_FRAC_SLL_4_up)						\
+	/* Whole-word shift: plain word moves.  */			\
+	for (_FP_FRAC_SLL_4_i = 3;					\
+	     _FP_FRAC_SLL_4_i >= _FP_FRAC_SLL_4_skip;			\
+	     --_FP_FRAC_SLL_4_i)					\
+	  X##_f[_FP_FRAC_SLL_4_i]					\
+	    = X##_f[_FP_FRAC_SLL_4_i-_FP_FRAC_SLL_4_skip];		\
+      else								\
+	{								\
+	  /* Each result word combines two neighbouring source words.  */ \
+	  for (_FP_FRAC_SLL_4_i = 3;					\
+	       _FP_FRAC_SLL_4_i > _FP_FRAC_SLL_4_skip;			\
+	       --_FP_FRAC_SLL_4_i)					\
+	    X##_f[_FP_FRAC_SLL_4_i]					\
+	      = ((X##_f[_FP_FRAC_SLL_4_i-_FP_FRAC_SLL_4_skip]		\
+		  << _FP_FRAC_SLL_4_up)					\
+		 | (X##_f[_FP_FRAC_SLL_4_i-_FP_FRAC_SLL_4_skip-1]	\
+		    >> _FP_FRAC_SLL_4_down));				\
+	  /* Lowest surviving word has no lower neighbour.  */		\
+	  X##_f[_FP_FRAC_SLL_4_i--] = X##_f[0] << _FP_FRAC_SLL_4_up;	\
+	}								\
+      /* The index is carried over: clear the remaining low words.  */	\
+      for (; _FP_FRAC_SLL_4_i >= 0; --_FP_FRAC_SLL_4_i)			\
+	X##_f[_FP_FRAC_SLL_4_i] = 0;					\
+    }									\
+  while (0)
+
+/* This one was broken too.  */
+/* Shift the four-word fraction X right by N bits, in place; bits shifted
+   out are discarded.
+
+   N is split into a whole-word part (skip) and a sub-word part (down).
+   Words are moved towards the low index, lowest word first, and the
+   vacated high words are cleared by the final loop.  A sub-word shift of
+   zero is handled by its own branch, which avoids shifting a word by
+   _FP_W_TYPE_SIZE bits (up would equal the word size).
+
+   NOTE(review): N is not range-checked here; presumably callers keep it
+   below 4 * _FP_W_TYPE_SIZE -- confirm.  */
+#define _FP_FRAC_SRL_4(X, N)						\
+  do									\
+    {									\
+      _FP_I_TYPE _FP_FRAC_SRL_4_up, _FP_FRAC_SRL_4_down;		\
+      _FP_I_TYPE _FP_FRAC_SRL_4_skip, _FP_FRAC_SRL_4_i;			\
+      _FP_FRAC_SRL_4_skip = (N) / _FP_W_TYPE_SIZE;			\
+      _FP_FRAC_SRL_4_down = (N) % _FP_W_TYPE_SIZE;			\
+      _FP_FRAC_SRL_4_up = _FP_W_TYPE_SIZE - _FP_FRAC_SRL_4_down;	\
+      if (!_FP_FRAC_SRL_4_down)						\
+	/* Whole-word shift: plain word moves.  */			\
+	for (_FP_FRAC_SRL_4_i = 0;					\
+	     _FP_FRAC_SRL_4_i <= 3-_FP_FRAC_SRL_4_skip;			\
+	     ++_FP_FRAC_SRL_4_i)					\
+	  X##_f[_FP_FRAC_SRL_4_i]					\
+	    = X##_f[_FP_FRAC_SRL_4_i+_FP_FRAC_SRL_4_skip];		\
+      else								\
+	{								\
+	  /* Each result word combines two neighbouring source words.  */ \
+	  for (_FP_FRAC_SRL_4_i = 0;					\
+	       _FP_FRAC_SRL_4_i < 3-_FP_FRAC_SRL_4_skip;		\
+	       ++_FP_FRAC_SRL_4_i)					\
+	    X##_f[_FP_FRAC_SRL_4_i]					\
+	      = ((X##_f[_FP_FRAC_SRL_4_i+_FP_FRAC_SRL_4_skip]		\
+		  >> _FP_FRAC_SRL_4_down)				\
+		 | (X##_f[_FP_FRAC_SRL_4_i+_FP_FRAC_SRL_4_skip+1]	\
+		    << _FP_FRAC_SRL_4_up));				\
+	  /* Highest surviving word has no upper neighbour.  */		\
+	  X##_f[_FP_FRAC_SRL_4_i++] = X##_f[3] >> _FP_FRAC_SRL_4_down;	\
+	}								\
+      /* The index is carried over: clear the remaining high words.  */	\
+      for (; _FP_FRAC_SRL_4_i < 4; ++_FP_FRAC_SRL_4_i)			\
+	X##_f[_FP_FRAC_SRL_4_i] = 0;					\
+    }									\
+  while (0)
+
+
+/* Right shift with sticky-lsb.
+   What this actually means is that we do a standard right-shift,
+   but that if any of the bits that fall off the right hand side
+   were one then we always set the LSbit.  */
+/* Shift the four-word fraction X right by N bits, in place, and store
+   in S whether any nonzero bit was shifted out (S becomes 1 or 0).
+   This macro does not itself modify the result's LSB; see
+   _FP_FRAC_SRS_4 for the variant that ORs S into X##_f[0].
+
+   The first loop ORs together the words that are dropped entirely; the
+   sub-word branch additionally ORs in the bits dropped from word
+   number skip.  The size argument is not used in this body.  */
+#define _FP_FRAC_SRST_4(X, S, N, size)					\
+  do									\
+    {									\
+      _FP_I_TYPE _FP_FRAC_SRST_4_up, _FP_FRAC_SRST_4_down;		\
+      _FP_I_TYPE _FP_FRAC_SRST_4_skip, _FP_FRAC_SRST_4_i;		\
+      _FP_W_TYPE _FP_FRAC_SRST_4_s;					\
+      _FP_FRAC_SRST_4_skip = (N) / _FP_W_TYPE_SIZE;			\
+      _FP_FRAC_SRST_4_down = (N) % _FP_W_TYPE_SIZE;			\
+      _FP_FRAC_SRST_4_up = _FP_W_TYPE_SIZE - _FP_FRAC_SRST_4_down;	\
+      /* Collect the words that fall off completely.  */		\
+      for (_FP_FRAC_SRST_4_s = _FP_FRAC_SRST_4_i = 0;			\
+	   _FP_FRAC_SRST_4_i < _FP_FRAC_SRST_4_skip;			\
+	   ++_FP_FRAC_SRST_4_i)						\
+	_FP_FRAC_SRST_4_s |= X##_f[_FP_FRAC_SRST_4_i];			\
+      if (!_FP_FRAC_SRST_4_down)					\
+	for (_FP_FRAC_SRST_4_i = 0;					\
+	     _FP_FRAC_SRST_4_i <= 3-_FP_FRAC_SRST_4_skip;		\
+	     ++_FP_FRAC_SRST_4_i)					\
+	  X##_f[_FP_FRAC_SRST_4_i]					\
+	    = X##_f[_FP_FRAC_SRST_4_i+_FP_FRAC_SRST_4_skip];		\
+      else								\
+	{								\
+	  /* Here the index equals skip: collect the low bits of that	\
+	     word, which are about to be shifted out.  */		\
+	  _FP_FRAC_SRST_4_s						\
+	    |= X##_f[_FP_FRAC_SRST_4_i] << _FP_FRAC_SRST_4_up;		\
+	  for (_FP_FRAC_SRST_4_i = 0;					\
+	       _FP_FRAC_SRST_4_i < 3-_FP_FRAC_SRST_4_skip;		\
+	       ++_FP_FRAC_SRST_4_i)					\
+	    X##_f[_FP_FRAC_SRST_4_i]					\
+	      = ((X##_f[_FP_FRAC_SRST_4_i+_FP_FRAC_SRST_4_skip]		\
+		  >> _FP_FRAC_SRST_4_down)				\
+		 | (X##_f[_FP_FRAC_SRST_4_i+_FP_FRAC_SRST_4_skip+1]	\
+		    << _FP_FRAC_SRST_4_up));				\
+	  X##_f[_FP_FRAC_SRST_4_i++]					\
+	    = X##_f[3] >> _FP_FRAC_SRST_4_down;				\
+	}								\
+      /* The index is carried over: clear the remaining high words.  */	\
+      for (; _FP_FRAC_SRST_4_i < 4; ++_FP_FRAC_SRST_4_i)		\
+	X##_f[_FP_FRAC_SRST_4_i] = 0;					\
+      S = (_FP_FRAC_SRST_4_s != 0);					\
+    }									\
+  while (0)
+
+/* Sticky right shift: shift X right by N bits with _FP_FRAC_SRST_4 and
+   OR the "bits were lost" flag it reports into the least significant
+   word of X.  size is passed through unchanged.  */
+#define _FP_FRAC_SRS_4(X, N, size)					\
+  do									\
+    {									\
+      int _FP_FRAC_SRS_4_lost;						\
+      _FP_FRAC_SRST_4 (X, _FP_FRAC_SRS_4_lost, (N), (size));		\
+      X##_f[0] = X##_f[0] | _FP_FRAC_SRS_4_lost;			\
+    }									\
+  while (0)
+
+/* The wrappers below spread a four-word fraction into its individual
+   words, most significant first, and forward to the word-level
+   primitives.  */
+
+/* R = X + Y, via __FP_FRAC_ADD_4.  */
+#define _FP_FRAC_ADD_4(R, X, Y)					\
+  __FP_FRAC_ADD_4 (R##_f[3], R##_f[2], R##_f[1], R##_f[0],	\
+		   X##_f[3], X##_f[2], X##_f[1], X##_f[0],	\
+		   Y##_f[3], Y##_f[2], Y##_f[1], Y##_f[0])
+
+/* Forwards R, X, Y to __FP_FRAC_SUB_4 (presumably R = X - Y; the
+   primitive is defined elsewhere in this file).  */
+#define _FP_FRAC_SUB_4(R, X, Y)					\
+  __FP_FRAC_SUB_4 (R##_f[3], R##_f[2], R##_f[1], R##_f[0],	\
+		   X##_f[3], X##_f[2], X##_f[1], X##_f[0],	\
+		   Y##_f[3], Y##_f[2], Y##_f[1], Y##_f[0])
+
+/* Forwards X, Y to __FP_FRAC_DEC_4 (presumably X -= Y; the primitive
+   is defined elsewhere in this file).  */
+#define _FP_FRAC_DEC_4(X, Y)					\
+  __FP_FRAC_DEC_4 (X##_f[3], X##_f[2], X##_f[1], X##_f[0],	\
+		   Y##_f[3], Y##_f[2], Y##_f[1], Y##_f[0])
+
+/* Forwards X and the single word I to __FP_FRAC_ADDI_4 (presumably
+   X += I; the primitive is defined elsewhere in this file).  */
+#define _FP_FRAC_ADDI_4(X, I)					\
+  __FP_FRAC_ADDI_4 (X##_f[3], X##_f[2], X##_f[1], X##_f[0], I)
+
+/* Four-word constants, written as comma-separated word lists with the
+   most significant word first, in the form _FP_FRAC_SET_4 expects:
+   all zero, the value one, and all bits set.  */
+#define _FP_ZEROFRAC_4  0, 0, 0, 0
+#define _FP_MINFRAC_4   0, 0, 0, 1
+#define _FP_MAXFRAC_4	(~(_FP_WS_TYPE) 0), (~(_FP_WS_TYPE) 0), (~(_FP_WS_TYPE) 0), (~(_FP_WS_TYPE) 0)
+
+/* Nonzero when all four words of X are zero.  */
+#define _FP_FRAC_ZEROP_4(X)     ((X##_f[0] | X##_f[1] | X##_f[2] | X##_f[3]) == 0)
+/* Nonzero when the most significant word of X, reinterpreted as the
+   signed type _FP_WS_TYPE, is negative.  */
+#define _FP_FRAC_NEGP_4(X)      ((_FP_WS_TYPE) X##_f[3] < 0)
+/* Mask the high word of X (as selected by _FP_FRAC_HIGH_##fs) with
+   _FP_OVERFLOW_##fs; nonzero when any of those bits is set.  */
+#define _FP_FRAC_OVERP_4(fs, X)  (_FP_FRAC_HIGH_##fs (X) & _FP_OVERFLOW_##fs)
+/* Same test against _FP_HIGHBIT_DW_##fs, using the word selected by
+   _FP_FRAC_HIGH_DW_##fs.  */
+#define _FP_FRAC_HIGHBIT_DW_4(fs, X)	\
+  (_FP_FRAC_HIGH_DW_##fs (X) & _FP_HIGHBIT_DW_##fs)
+/* Clear the _FP_OVERFLOW_##fs bits in the high word of X.  */
+#define _FP_FRAC_CLEAR_OVERP_4(fs, X)  (_FP_FRAC_HIGH_##fs (X) &= ~_FP_OVERFLOW_##fs)
+
+/* 1 when the four-word fractions X and Y are equal word for word,
+   0 otherwise.  */
+#define _FP_FRAC_EQ_4(X, Y)				\
+  (!(X##_f[3] != Y##_f[3] || X##_f[2] != Y##_f[2]	\
+     || X##_f[1] != Y##_f[1] || X##_f[0] != Y##_f[0]))
+
+/* 1 when X > Y as four-word unsigned quantities, 0 otherwise.  Words
+   are compared from the most significant down; the first pair that
+   differs decides.  */
+#define _FP_FRAC_GT_4(X, Y)				\
+  (X##_f[3] != Y##_f[3] ? X##_f[3] > Y##_f[3]		\
+   : X##_f[2] != Y##_f[2] ? X##_f[2] > Y##_f[2]		\
+   : X##_f[1] != Y##_f[1] ? X##_f[1] > Y##_f[1]		\
+   : X##_f[0] > Y##_f[0])
+
+/* 1 when X >= Y as four-word unsigned quantities, 0 otherwise.  Words
+   are compared from the most significant down; the first pair that
+   differs decides, and fully equal operands compare as true.  */
+#define _FP_FRAC_GE_4(X, Y)				\
+  (X##_f[3] != Y##_f[3] ? X##_f[3] > Y##_f[3]		\
+   : X##_f[2] != Y##_f[2] ? X##_f[2] > Y##_f[2]		\
+   : X##_f[1] != Y##_f[1] ? X##_f[1] > Y##_f[1]		\
+   : X##_f[0] >= Y##_f[0])
+
+
+/* Store in R a leading-zero count for the four-word fraction X.
+
+   The most significant nonzero word is handed to __FP_CLZ, and one
+   _FP_W_TYPE_SIZE is added for every all-zero word above it.
+   __FP_CLZ is defined elsewhere; presumably it counts the leading zero
+   bits of a single word.  When every word is zero, __FP_CLZ is applied
+   to X##_f[0] (which is 0), so the result then depends on how __FP_CLZ
+   treats a zero argument -- NOTE(review): confirm callers never rely on
+   that case.  */
+#define _FP_FRAC_CLZ_4(R, X)			\
+  do						\
+    {						\
+      if (X##_f[3])				\
+	__FP_CLZ ((R), X##_f[3]);		\
+      else if (X##_f[2])			\
+	{					\
+	  __FP_CLZ ((R), X##_f[2]);		\
+	  (R) += _FP_W_TYPE_SIZE;		\
+	}					\
+      else if (X##_f[1])			\
+	{					\
+	  __FP_CLZ ((R), X##_f[1]);		\
+	  (R) += _FP_W_TYPE_SIZE*2;		\
+	}					\
+      else					\
+	{					\
+	  __FP_CLZ ((R), X##_f[0]);		\
+	  (R) += _FP_W_TYPE_SIZE*3;		\
+	}					\
+    }						\
+  while (0)
+
+
+/* Split the raw value val of format fs into X: val is stored into the
+   flt member of a temporary union _FP_UNION_##fs, and the sign,
+   exponent and four fraction bit-fields are then copied out to X##_s,
+   X##_e and X##_f[3..0].  val is evaluated once.  */
+#define _FP_UNPACK_RAW_4(fs, X, val)			\
+  do							\
+    {							\
+      union _FP_UNION_##fs _FP_UNPACK_RAW_4_u;		\
+      _FP_UNPACK_RAW_4_u.flt = (val);			\
+      X##_s  = _FP_UNPACK_RAW_4_u.bits.sign;		\
+      X##_e  = _FP_UNPACK_RAW_4_u.bits.exp;		\
+      X##_f[3] = _FP_UNPACK_RAW_4_u.bits.frac3;		\
+      X##_f[2] = _FP_UNPACK_RAW_4_u.bits.frac2;		\
+      X##_f[1] = _FP_UNPACK_RAW_4_u.bits.frac1;		\
+      X##_f[0] = _FP_UNPACK_RAW_4_u.bits.frac0;		\
+    }							\
+  while (0)
+
+/* Pointer variant of _FP_UNPACK_RAW_4: val is cast to a pointer to
+   union _FP_UNION_##fs and the sign, exponent and four fraction
+   bit-fields are read through it into X##_s, X##_e and X##_f[3..0].
+   val is evaluated once.  */
+#define _FP_UNPACK_RAW_4_P(fs, X, val)			\
+  do							\
+    {							\
+      union _FP_UNION_##fs *_FP_UNPACK_RAW_4_P_src	\
+	= (union _FP_UNION_##fs *) (val);		\
+							\
+      X##_s  = _FP_UNPACK_RAW_4_P_src->bits.sign;	\
+      X##_e  = _FP_UNPACK_RAW_4_P_src->bits.exp;	\
+      X##_f[3] = _FP_UNPACK_RAW_4_P_src->bits.frac3;	\
+      X##_f[2] = _FP_UNPACK_RAW_4_P_src->bits.frac2;	\
+      X##_f[1] = _FP_UNPACK_RAW_4_P_src->bits.frac1;	\
+      X##_f[0] = _FP_UNPACK_RAW_4_P_src->bits.frac0;	\
+    }							\
+  while (0)
+
+/* Assemble the raw value val of format fs from X: the sign, exponent
+   and four fraction words are written into the bit-fields of a
+   temporary union _FP_UNION_##fs, whose flt member is then assigned to
+   val.  */
+#define _FP_PACK_RAW_4(fs, val, X)		\
+  do						\
+    {						\
+      union _FP_UNION_##fs _FP_PACK_RAW_4_u;	\
+      _FP_PACK_RAW_4_u.bits.sign  = X##_s;	\
+      _FP_PACK_RAW_4_u.bits.exp   = X##_e;	\
+      _FP_PACK_RAW_4_u.bits.frac3 = X##_f[3];	\
+      _FP_PACK_RAW_4_u.bits.frac2 = X##_f[2];	\
+      _FP_PACK_RAW_4_u.bits.frac1 = X##_f[1];	\
+      _FP_PACK_RAW_4_u.bits.frac0 = X##_f[0];	\
+      (val) = _FP_PACK_RAW_4_u.flt;		\
+    }						\
+  while (0)
+
+/* Pointer variant of _FP_PACK_RAW_4: val is cast to a pointer to
+   union _FP_UNION_##fs and the sign, exponent and four fraction words
+   of X are written through it into the corresponding bit-fields.
+   val is evaluated once.  */
+#define _FP_PACK_RAW_4_P(fs, val, X)			\
+  do							\
+    {							\
+      union _FP_UNION_##fs *_FP_PACK_RAW_4_P_dst	\
+	= (union _FP_UNION_##fs *) (val);		\
+							\
+      _FP_PACK_RAW_4_P_dst->bits.sign  = X##_s;		\
+      _FP_PACK_RAW_4_P_dst->bits.exp   = X##_e;		\
+      _FP_PACK_RAW_4_P_dst->bits.frac3 = X##_f[3];	\
+      _FP_PACK_RAW_4_P_dst->bits.frac2 = X##_f[2];	\
+      _FP_PACK_RAW_4_P_dst->bits.frac1 = X##_f[1];	\
+      _FP_PACK_RAW_4_P_dst->bits.frac0 = X##_f[0];	\
+    }							\
+  while (0)
+
+/* Multiplication algorithms: */
+
+/* Given a 1W * 1W => 2W primitive, do the extended multiplication.  */
+
+/* Full product of two four-word fractions: R = X * Y, where R is an
+   eight-word fraction accessed through _FP_FRAC_WORD_8.
+
+   doit is the word multiply primitive, invoked as
+   doit (high, low, a, b).  wfracbits is not used in this body.
+
+   This is schoolbook multiplication.  The sixteen partial products
+   X[i] * Y[j] are computed into five two-word temporaries (b..f) and
+   added into R column by column (column = i + j), with each addition
+   spanning three words so that carries reach the next two words.  The
+   temporaries are reused between groups, so the statement order
+   matters.  */
+#define _FP_MUL_MEAT_DW_4_wide(wfracbits, R, X, Y, doit)		\
+  do									\
+    {									\
+      _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_4_wide_b);			\
+      _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_4_wide_c);			\
+      _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_4_wide_d);			\
+      _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_4_wide_e);			\
+      _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_4_wide_f);			\
+									\
+      /* Column 0 goes straight into R[1]:R[0]; b, c hold column 1	\
+	 and d, e, f hold column 2.  */					\
+      doit (_FP_FRAC_WORD_8 (R, 1), _FP_FRAC_WORD_8 (R, 0),		\
+	    X##_f[0], Y##_f[0]);					\
+      doit (_FP_MUL_MEAT_DW_4_wide_b_f1, _FP_MUL_MEAT_DW_4_wide_b_f0,	\
+	    X##_f[0], Y##_f[1]);					\
+      doit (_FP_MUL_MEAT_DW_4_wide_c_f1, _FP_MUL_MEAT_DW_4_wide_c_f0,	\
+	    X##_f[1], Y##_f[0]);					\
+      doit (_FP_MUL_MEAT_DW_4_wide_d_f1, _FP_MUL_MEAT_DW_4_wide_d_f0,	\
+	    X##_f[1], Y##_f[1]);					\
+      doit (_FP_MUL_MEAT_DW_4_wide_e_f1, _FP_MUL_MEAT_DW_4_wide_e_f0,	\
+	    X##_f[0], Y##_f[2]);					\
+      doit (_FP_MUL_MEAT_DW_4_wide_f_f1, _FP_MUL_MEAT_DW_4_wide_f_f0,	\
+	    X##_f[2], Y##_f[0]);					\
+      /* Add column 1 into R[3]:R[2]:R[1].  */				\
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 3), _FP_FRAC_WORD_8 (R, 2),	\
+		       _FP_FRAC_WORD_8 (R, 1), 0,			\
+		       _FP_MUL_MEAT_DW_4_wide_b_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_b_f0,			\
+		       0, 0, _FP_FRAC_WORD_8 (R, 1));			\
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 3), _FP_FRAC_WORD_8 (R, 2),	\
+		       _FP_FRAC_WORD_8 (R, 1), 0,			\
+		       _FP_MUL_MEAT_DW_4_wide_c_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_c_f0,			\
+		       _FP_FRAC_WORD_8 (R, 3), _FP_FRAC_WORD_8 (R, 2),	\
+		       _FP_FRAC_WORD_8 (R, 1));				\
+      /* Add column 2 into R[4]:R[3]:R[2].  */				\
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 4), _FP_FRAC_WORD_8 (R, 3),	\
+		       _FP_FRAC_WORD_8 (R, 2), 0,			\
+		       _FP_MUL_MEAT_DW_4_wide_d_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_d_f0,			\
+		       0, _FP_FRAC_WORD_8 (R, 3), _FP_FRAC_WORD_8 (R, 2)); \
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 4), _FP_FRAC_WORD_8 (R, 3),	\
+		       _FP_FRAC_WORD_8 (R, 2), 0,			\
+		       _FP_MUL_MEAT_DW_4_wide_e_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_e_f0,			\
+		       _FP_FRAC_WORD_8 (R, 4), _FP_FRAC_WORD_8 (R, 3),	\
+		       _FP_FRAC_WORD_8 (R, 2));				\
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 4), _FP_FRAC_WORD_8 (R, 3),	\
+		       _FP_FRAC_WORD_8 (R, 2), 0,			\
+		       _FP_MUL_MEAT_DW_4_wide_f_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_f_f0,			\
+		       _FP_FRAC_WORD_8 (R, 4), _FP_FRAC_WORD_8 (R, 3),	\
+		       _FP_FRAC_WORD_8 (R, 2));				\
+      /* Column 3: four products, added into R[5]:R[4]:R[3].  */	\
+      doit (_FP_MUL_MEAT_DW_4_wide_b_f1,				\
+	    _FP_MUL_MEAT_DW_4_wide_b_f0, X##_f[0], Y##_f[3]);		\
+      doit (_FP_MUL_MEAT_DW_4_wide_c_f1,				\
+	    _FP_MUL_MEAT_DW_4_wide_c_f0, X##_f[3], Y##_f[0]);		\
+      doit (_FP_MUL_MEAT_DW_4_wide_d_f1, _FP_MUL_MEAT_DW_4_wide_d_f0,	\
+	    X##_f[1], Y##_f[2]);					\
+      doit (_FP_MUL_MEAT_DW_4_wide_e_f1, _FP_MUL_MEAT_DW_4_wide_e_f0,	\
+	    X##_f[2], Y##_f[1]);					\
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 5), _FP_FRAC_WORD_8 (R, 4),	\
+		       _FP_FRAC_WORD_8 (R, 3), 0,			\
+		       _FP_MUL_MEAT_DW_4_wide_b_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_b_f0,			\
+		       0, _FP_FRAC_WORD_8 (R, 4), _FP_FRAC_WORD_8 (R, 3)); \
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 5), _FP_FRAC_WORD_8 (R, 4),	\
+		       _FP_FRAC_WORD_8 (R, 3), 0,			\
+		       _FP_MUL_MEAT_DW_4_wide_c_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_c_f0,			\
+		       _FP_FRAC_WORD_8 (R, 5), _FP_FRAC_WORD_8 (R, 4),	\
+		       _FP_FRAC_WORD_8 (R, 3));				\
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 5), _FP_FRAC_WORD_8 (R, 4),	\
+		       _FP_FRAC_WORD_8 (R, 3), 0,			\
+		       _FP_MUL_MEAT_DW_4_wide_d_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_d_f0,			\
+		       _FP_FRAC_WORD_8 (R, 5), _FP_FRAC_WORD_8 (R, 4),	\
+		       _FP_FRAC_WORD_8 (R, 3));				\
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 5), _FP_FRAC_WORD_8 (R, 4),	\
+		       _FP_FRAC_WORD_8 (R, 3), 0,			\
+		       _FP_MUL_MEAT_DW_4_wide_e_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_e_f0,			\
+		       _FP_FRAC_WORD_8 (R, 5), _FP_FRAC_WORD_8 (R, 4),	\
+		       _FP_FRAC_WORD_8 (R, 3));				\
+      /* b, c, d hold column 4 (into R[6]:R[5]:R[4]); e, f hold	\
+	 column 5 (into R[7]:R[6]:R[5]).  */				\
+      doit (_FP_MUL_MEAT_DW_4_wide_b_f1, _FP_MUL_MEAT_DW_4_wide_b_f0,	\
+	    X##_f[2], Y##_f[2]);					\
+      doit (_FP_MUL_MEAT_DW_4_wide_c_f1, _FP_MUL_MEAT_DW_4_wide_c_f0,	\
+	    X##_f[1], Y##_f[3]);					\
+      doit (_FP_MUL_MEAT_DW_4_wide_d_f1, _FP_MUL_MEAT_DW_4_wide_d_f0,	\
+	    X##_f[3], Y##_f[1]);					\
+      doit (_FP_MUL_MEAT_DW_4_wide_e_f1, _FP_MUL_MEAT_DW_4_wide_e_f0,	\
+	    X##_f[2], Y##_f[3]);					\
+      doit (_FP_MUL_MEAT_DW_4_wide_f_f1, _FP_MUL_MEAT_DW_4_wide_f_f0,	\
+	    X##_f[3], Y##_f[2]);					\
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 6), _FP_FRAC_WORD_8 (R, 5),	\
+		       _FP_FRAC_WORD_8 (R, 4), 0,			\
+		       _FP_MUL_MEAT_DW_4_wide_b_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_b_f0,			\
+		       0, _FP_FRAC_WORD_8 (R, 5), _FP_FRAC_WORD_8 (R, 4)); \
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 6), _FP_FRAC_WORD_8 (R, 5),	\
+		       _FP_FRAC_WORD_8 (R, 4), 0,			\
+		       _FP_MUL_MEAT_DW_4_wide_c_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_c_f0,			\
+		       _FP_FRAC_WORD_8 (R, 6), _FP_FRAC_WORD_8 (R, 5),	\
+		       _FP_FRAC_WORD_8 (R, 4));				\
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 6), _FP_FRAC_WORD_8 (R, 5),	\
+		       _FP_FRAC_WORD_8 (R, 4), 0,			\
+		       _FP_MUL_MEAT_DW_4_wide_d_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_d_f0,			\
+		       _FP_FRAC_WORD_8 (R, 6), _FP_FRAC_WORD_8 (R, 5),	\
+		       _FP_FRAC_WORD_8 (R, 4));				\
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 7), _FP_FRAC_WORD_8 (R, 6),	\
+		       _FP_FRAC_WORD_8 (R, 5), 0,			\
+		       _FP_MUL_MEAT_DW_4_wide_e_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_e_f0,			\
+		       0, _FP_FRAC_WORD_8 (R, 6), _FP_FRAC_WORD_8 (R, 5)); \
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 7), _FP_FRAC_WORD_8 (R, 6),	\
+		       _FP_FRAC_WORD_8 (R, 5), 0,			\
+		       _FP_MUL_MEAT_DW_4_wide_f_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_f_f0,			\
+		       _FP_FRAC_WORD_8 (R, 7), _FP_FRAC_WORD_8 (R, 6),	\
+		       _FP_FRAC_WORD_8 (R, 5));				\
+      /* Column 6: the top product, added into R[7]:R[6].  */		\
+      doit (_FP_MUL_MEAT_DW_4_wide_b_f1, _FP_MUL_MEAT_DW_4_wide_b_f0,	\
+	    X##_f[3], Y##_f[3]);					\
+      __FP_FRAC_ADD_2 (_FP_FRAC_WORD_8 (R, 7), _FP_FRAC_WORD_8 (R, 6),	\
+		       _FP_MUL_MEAT_DW_4_wide_b_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_b_f0,			\
+		       _FP_FRAC_WORD_8 (R, 7), _FP_FRAC_WORD_8 (R, 6));	\
+    }									\
+  while (0)
+
+/* Four-word fraction multiply: R = X * Y, reduced back to four words.
+   The full eight-word product is formed with _FP_MUL_MEAT_DW_4_wide
+   (doit is its word multiply primitive), shifted right by
+   wfracbits - 1 with sticky collection via _FP_FRAC_SRS_8, and its low
+   four words are stored into R.  */
+#define _FP_MUL_MEAT_4_wide(wfracbits, R, X, Y, doit)			\
+  do									\
+    {									\
+      _FP_FRAC_DECL_8 (_FP_MUL_MEAT_4_wide_z);				\
+									\
+      _FP_MUL_MEAT_DW_4_wide ((wfracbits), _FP_MUL_MEAT_4_wide_z,	\
+			      X, Y, doit);				\
+									\
+      /* Normalize since we know where the msb of the multiplicands	\
+	 were (bit B), we know that the msb of the product is		\
+	 at either 2B or 2B-1.  */					\
+      _FP_FRAC_SRS_8 (_FP_MUL_MEAT_4_wide_z, (wfracbits)-1,		\
+		      2*(wfracbits));					\
+      __FP_FRAC_SET_4 (R, _FP_FRAC_WORD_8 (_FP_MUL_MEAT_4_wide_z, 3),	\
+		       _FP_FRAC_WORD_8 (_FP_MUL_MEAT_4_wide_z, 2),	\
+		       _FP_FRAC_WORD_8 (_FP_MUL_MEAT_4_wide_z, 1),	\
+		       _FP_FRAC_WORD_8 (_FP_MUL_MEAT_4_wide_z, 0));	\
+    }									\
+  while (0)
+
+/* Full product of two four-word fractions using GMP: R = X * Y.
+
+   X##_f and Y##_f (four words each) are passed directly to mpn_mul_n
+   as the source limb arrays, and the result is written to R##_f, which
+   must therefore have room for the 2 * 4 = 8 limbs mpn_mul_n produces.
+   wfracbits is accepted for symmetry with _FP_MUL_MEAT_DW_4_wide and is
+   not used.
+
+   The operands are taken from the X and Y macro arguments rather than
+   from fixed identifiers, so the macro works whatever the caller's
+   fraction variables are called.  */
+#define _FP_MUL_MEAT_DW_4_gmp(wfracbits, R, X, Y)	\
+  do							\
+    {							\
+      mpn_mul_n (R##_f, X##_f, Y##_f, 4);		\
+    }							\
+  while (0)
+
+/* Four-word fraction multiply using GMP: R = X * Y, reduced back to
+   four words.  The full eight-word product is formed with
+   _FP_MUL_MEAT_DW_4_gmp, shifted right by wfracbits - 1 with sticky
+   collection via _FP_FRAC_SRS_8, and its low four words are stored
+   into R.  */
+#define _FP_MUL_MEAT_4_gmp(wfracbits, R, X, Y)				\
+  do									\
+    {									\
+      _FP_FRAC_DECL_8 (_FP_MUL_MEAT_4_gmp_z);				\
+									\
+      _FP_MUL_MEAT_DW_4_gmp ((wfracbits), _FP_MUL_MEAT_4_gmp_z, X, Y);	\
+									\
+      /* Normalize since we know where the msb of the multiplicands	\
+	 were (bit B), we know that the msb of the product is		\
+	 at either 2B or 2B-1.  */					\
+      _FP_FRAC_SRS_8 (_FP_MUL_MEAT_4_gmp_z, (wfracbits)-1,		\
+		      2*(wfracbits));					\
+      __FP_FRAC_SET_4 (R, _FP_FRAC_WORD_8 (_FP_MUL_MEAT_4_gmp_z, 3),	\
+		       _FP_FRAC_WORD_8 (_FP_MUL_MEAT_4_gmp_z, 2),	\
+		       _FP_FRAC_WORD_8 (_FP_MUL_MEAT_4_gmp_z, 1),	\
+		       _FP_FRAC_WORD_8 (_FP_MUL_MEAT_4_gmp_z, 0));	\
+    }									\
+  while (0)
+
+/* Helper utility for _FP_DIV_MEAT_4_udiv:
+   pppp = m * nnn, i.e. multiply the three-word value n2:n1:n0 by the
+   single word m, giving the four-word product p3:p2:p1:p0.
+
+   Each of the three word products comes from umul_ppmm; the low half
+   of each later product is added into the running result with
+   __FP_FRAC_ADDI_2, which is handed the two words that must absorb it
+   and its carry.  p3..p0 must be lvalues.  */
+#define umul_ppppmnnn(p3, p2, p1, p0, m, n2, n1, n0)	\
+  do							\
+    {							\
+      UWtype umul_ppppmnnn_t;				\
+      /* p1:p0 = m * n0.  */				\
+      umul_ppmm (p1, p0, m, n0);			\
+      /* p2:t = m * n1, then fold t into p2:p1.  */	\
+      umul_ppmm (p2, umul_ppppmnnn_t, m, n1);		\
+      __FP_FRAC_ADDI_2 (p2, p1, umul_ppppmnnn_t);	\
+      /* p3:t = m * n2, then fold t into p3:p2.  */	\
+      umul_ppmm (p3, umul_ppppmnnn_t, m, n2);		\
+      __FP_FRAC_ADDI_2 (p3, p2, umul_ppppmnnn_t);	\
+    }							\
+  while (0)
+
+/* Division algorithms: */
+
+/* Divide the four-word fraction X by the four-word fraction Y, producing
+   the four-word quotient R one word at a time, from R##_f[3] down to
+   R##_f[0].
+
+   fs selects _FP_WFRACXBITS_##fs, the left shift applied to Y.  X and Y
+   are both modified in place (X is used as the running remainder), and
+   R##_e is decremented when X < Y.  When X >= Y, X is shifted right by
+   one bit and the bit shifted out is kept in the local n_f[3], from
+   where it is fed back in as the low word on the first iteration; the
+   other words of n stay zero.
+
+   Each iteration estimates a quotient word with udiv_qrnnd on the top
+   two words of X against Y##_f[3], multiplies the estimate by the
+   three low words of Y with umul_ppppmnnn, shifts X up by one word,
+   and corrects the estimate downwards at most twice.  On the last
+   iteration _FP_WORK_STICKY is ORed into R##_f[0] when the final
+   comparison finds X and the partial product unequal.  */
+#define _FP_DIV_MEAT_4_udiv(fs, R, X, Y)				\
+  do									\
+    {									\
+      int _FP_DIV_MEAT_4_udiv_i;					\
+      _FP_FRAC_DECL_4 (_FP_DIV_MEAT_4_udiv_n);				\
+      _FP_FRAC_DECL_4 (_FP_DIV_MEAT_4_udiv_m);				\
+      _FP_FRAC_SET_4 (_FP_DIV_MEAT_4_udiv_n, _FP_ZEROFRAC_4);		\
+      if (_FP_FRAC_GE_4 (X, Y))						\
+	{								\
+	  /* Keep the bit that the shift below drops from X.  */	\
+	  _FP_DIV_MEAT_4_udiv_n_f[3]					\
+	    = X##_f[0] << (_FP_W_TYPE_SIZE - 1);			\
+	  _FP_FRAC_SRL_4 (X, 1);					\
+	}								\
+      else								\
+	R##_e--;							\
+									\
+      /* Normalize, i.e. make the most significant bit of the		\
+	 denominator set.  */						\
+      _FP_FRAC_SLL_4 (Y, _FP_WFRACXBITS_##fs);				\
+									\
+      /* No loop condition: both branches break when the index	\
+	 reaches zero.  */						\
+      for (_FP_DIV_MEAT_4_udiv_i = 3; ; _FP_DIV_MEAT_4_udiv_i--)	\
+	{								\
+	  if (X##_f[3] == Y##_f[3])					\
+	    {								\
+	      /* This is a special case, not an optimization		\
+		 (X##_f[3]/Y##_f[3] would not fit into UWtype).		\
+		 As X## is guaranteed to be < Y,			\
+		 R##_f[_FP_DIV_MEAT_4_udiv_i] can be either		\
+		 (UWtype)-1 or (UWtype)-2.  */				\
+	      R##_f[_FP_DIV_MEAT_4_udiv_i] = -1;			\
+	      if (!_FP_DIV_MEAT_4_udiv_i)				\
+		break;							\
+	      __FP_FRAC_SUB_4 (X##_f[3], X##_f[2], X##_f[1], X##_f[0],	\
+			       Y##_f[2], Y##_f[1], Y##_f[0], 0,		\
+			       X##_f[2], X##_f[1], X##_f[0],		\
+			       _FP_DIV_MEAT_4_udiv_n_f[_FP_DIV_MEAT_4_udiv_i]); \
+	      _FP_FRAC_SUB_4 (X, Y, X);					\
+	      if (X##_f[3] > Y##_f[3])					\
+		{							\
+		  R##_f[_FP_DIV_MEAT_4_udiv_i] = -2;			\
+		  _FP_FRAC_ADD_4 (X, Y, X);				\
+		}							\
+	    }								\
+	  else								\
+	    {								\
+	      /* Estimate this quotient word; the remainder of the	\
+		 word division lands back in X##_f[3].  */		\
+	      udiv_qrnnd (R##_f[_FP_DIV_MEAT_4_udiv_i],			\
+			  X##_f[3], X##_f[3], X##_f[2], Y##_f[3]);	\
+	      umul_ppppmnnn (_FP_DIV_MEAT_4_udiv_m_f[3],		\
+			     _FP_DIV_MEAT_4_udiv_m_f[2],		\
+			     _FP_DIV_MEAT_4_udiv_m_f[1],		\
+			     _FP_DIV_MEAT_4_udiv_m_f[0],		\
+			     R##_f[_FP_DIV_MEAT_4_udiv_i],		\
+			     Y##_f[2], Y##_f[1], Y##_f[0]);		\
+	      /* Shift the lower words of X up by one word.  */		\
+	      X##_f[2] = X##_f[1];					\
+	      X##_f[1] = X##_f[0];					\
+	      X##_f[0]							\
+		= _FP_DIV_MEAT_4_udiv_n_f[_FP_DIV_MEAT_4_udiv_i];	\
+	      if (_FP_FRAC_GT_4 (_FP_DIV_MEAT_4_udiv_m, X))		\
+		{							\
+		  R##_f[_FP_DIV_MEAT_4_udiv_i]--;			\
+		  _FP_FRAC_ADD_4 (X, Y, X);				\
+		  if (_FP_FRAC_GE_4 (X, Y)				\
+		      && _FP_FRAC_GT_4 (_FP_DIV_MEAT_4_udiv_m, X))	\
+		    {							\
+		      R##_f[_FP_DIV_MEAT_4_udiv_i]--;			\
+		      _FP_FRAC_ADD_4 (X, Y, X);				\
+		    }							\
+		}							\
+	      _FP_FRAC_DEC_4 (X, _FP_DIV_MEAT_4_udiv_m);		\
+	      if (!_FP_DIV_MEAT_4_udiv_i)				\
+		{							\
+		  if (!_FP_FRAC_EQ_4 (X, _FP_DIV_MEAT_4_udiv_m))	\
+		    R##_f[0] |= _FP_WORK_STICKY;			\
+		  break;						\
+		}							\
+	    }								\
+	}								\
+    }									\
+  while (0)
+
+
+/* Square root algorithms:
+   We have just one right now, maybe Newton approximation
+   should be added for those machines where division is fast.  */
+
+/* Bit-at-a-time square root of the four-word fraction X.
+
+   R accumulates the accepted bits (q is added for each one) and S
+   accumulates twice each accepted bit (T + q); both are only added to
+   here, so the caller presumably zeroes them first -- TODO confirm
+   against the caller.  T is scratch.  X is consumed: it is reduced when
+   a bit is accepted and shifted left by one bit on every iteration.
+   q must be a modifiable lvalue holding the first candidate bit for the
+   top word; it is clobbered.
+
+   There is one loop per result word, from R##_f[3] down to R##_f[0].
+   Each later loop restarts q at the top bit of a word and compares one
+   more word of T against X than the previous one.  The last loop stops
+   when q reaches _FP_WORK_ROUND.  If X is still nonzero afterwards,
+   _FP_WORK_STICKY is set in R##_f[0], and _FP_WORK_ROUND as well when
+   X > S.  */
+#define _FP_SQRT_MEAT_4(R, S, T, X, q)					\
+  do									\
+    {									\
+      /* Word 3 (most significant).  */					\
+      while (q)								\
+	{								\
+	  T##_f[3] = S##_f[3] + (q);					\
+	  if (T##_f[3] <= X##_f[3])					\
+	    {								\
+	      S##_f[3] = T##_f[3] + (q);				\
+	      X##_f[3] -= T##_f[3];					\
+	      R##_f[3] += (q);						\
+	    }								\
+	  _FP_FRAC_SLL_4 (X, 1);					\
+	  (q) >>= 1;							\
+	}								\
+      /* Word 2.  */							\
+      (q) = (_FP_W_TYPE) 1 << (_FP_W_TYPE_SIZE - 1);			\
+      while (q)								\
+	{								\
+	  T##_f[2] = S##_f[2] + (q);					\
+	  T##_f[3] = S##_f[3];						\
+	  if (T##_f[3] < X##_f[3]					\
+	      || (T##_f[3] == X##_f[3] && T##_f[2] <= X##_f[2]))	\
+	    {								\
+	      S##_f[2] = T##_f[2] + (q);				\
+	      /* Carry from word 2 of S into word 3.  */		\
+	      S##_f[3] += (T##_f[2] > S##_f[2]);			\
+	      __FP_FRAC_DEC_2 (X##_f[3], X##_f[2],			\
+			       T##_f[3], T##_f[2]);			\
+	      R##_f[2] += (q);						\
+	    }								\
+	  _FP_FRAC_SLL_4 (X, 1);					\
+	  (q) >>= 1;							\
+	}								\
+      /* Word 1.  */							\
+      (q) = (_FP_W_TYPE) 1 << (_FP_W_TYPE_SIZE - 1);			\
+      while (q)								\
+	{								\
+	  T##_f[1] = S##_f[1] + (q);					\
+	  T##_f[2] = S##_f[2];						\
+	  T##_f[3] = S##_f[3];						\
+	  if (T##_f[3] < X##_f[3]					\
+	      || (T##_f[3] == X##_f[3]					\
+		  && (T##_f[2] < X##_f[2]				\
+		      || (T##_f[2] == X##_f[2]				\
+			  && T##_f[1] <= X##_f[1]))))			\
+	    {								\
+	      S##_f[1] = T##_f[1] + (q);				\
+	      S##_f[2] += (T##_f[1] > S##_f[1]);			\
+	      S##_f[3] += (T##_f[2] > S##_f[2]);			\
+	      __FP_FRAC_DEC_3 (X##_f[3], X##_f[2], X##_f[1],		\
+			       T##_f[3], T##_f[2], T##_f[1]);		\
+	      R##_f[1] += (q);						\
+	    }								\
+	  _FP_FRAC_SLL_4 (X, 1);					\
+	  (q) >>= 1;							\
+	}								\
+      /* Word 0: stop at the round bit.  */				\
+      (q) = (_FP_W_TYPE) 1 << (_FP_W_TYPE_SIZE - 1);			\
+      while ((q) != _FP_WORK_ROUND)					\
+	{								\
+	  T##_f[0] = S##_f[0] + (q);					\
+	  T##_f[1] = S##_f[1];						\
+	  T##_f[2] = S##_f[2];						\
+	  T##_f[3] = S##_f[3];						\
+	  if (_FP_FRAC_GE_4 (X, T))					\
+	    {								\
+	      S##_f[0] = T##_f[0] + (q);				\
+	      S##_f[1] += (T##_f[0] > S##_f[0]);			\
+	      S##_f[2] += (T##_f[1] > S##_f[1]);			\
+	      S##_f[3] += (T##_f[2] > S##_f[2]);			\
+	      _FP_FRAC_DEC_4 (X, T);					\
+	      R##_f[0] += (q);						\
+	    }								\
+	  _FP_FRAC_SLL_4 (X, 1);					\
+	  (q) >>= 1;							\
+	}								\
+      /* Nonzero remainder: set round (if X > S) and sticky.  */	\
+      if (!_FP_FRAC_ZEROP_4 (X))					\
+	{								\
+	  if (_FP_FRAC_GT_4 (X, S))					\
+	    R##_f[0] |= _FP_WORK_ROUND;					\
+	  R##_f[0] |= _FP_WORK_STICKY;					\
+	}								\
+    }									\
+  while (0)
+
+
+/* Internals.  */
+
+/* Store the four words I3 (most significant) .. I0 (least significant)
+   into X.  The assignments are a comma expression evaluated from
+   X##_f[3] down to X##_f[0], so an argument that reads a word of X sees
+   any higher words already overwritten.  */
+#define __FP_FRAC_SET_4(X, I3, I2, I1, I0)			\
+  (X##_f[3] = I3, X##_f[2] = I2, X##_f[1] = I1, X##_f[0] = I0)
+
+/* Generic three-word addition, used unless __FP_FRAC_ADD_3 is already
+   defined: r2:r1:r0 = x2:x1:x0 + y2:y1:y0.  Any carry out of the top
+   word is discarded.
+
+   A carry out of a word is detected by checking whether the sum is
+   smaller than the x operand, after the sum has been stored.  A result
+   word may therefore be the same lvalue as the corresponding y word
+   (callers in this file rely on that), but not the same lvalue as the
+   corresponding x word.  */
+#ifndef __FP_FRAC_ADD_3
+# define __FP_FRAC_ADD_3(r2, r1, r0, x2, x1, x0, y2, y1, y0)	\
+  do								\
+    {								\
+      _FP_W_TYPE __FP_FRAC_ADD_3_c1, __FP_FRAC_ADD_3_c2;	\
+      r0 = x0 + y0;						\
+      __FP_FRAC_ADD_3_c1 = r0 < x0;				\
+      r1 = x1 + y1;						\
+      __FP_FRAC_ADD_3_c2 = r1 < x1;				\
+      /* Adding the incoming carry can itself wrap r1.  */	\
+      r1 += __FP_FRAC_ADD_3_c1;					\
+      __FP_FRAC_ADD_3_c2 |= r1 < __FP_FRAC_ADD_3_c1;		\
+      r2 = x2 + y2 + __FP_FRAC_ADD_3_c2;			\
+    }								\
+  while (0)
+#endif
+
+/* Four-word add: (r3,r2,r1,r0) = (x3,x2,x1,x0) + (y3,y2,y1,y0), where
+   the suffix 0 names the least significant word.  A carry out of the
+   top word is discarded.
+
+   The three low sums are staged in a temporary array and stored only
+   after r3 has been computed, the same way __FP_FRAC_SUB_4 does it.
+   This lets the result words alias the x operands as well as the y
+   operands: storing r0 before evaluating `r0 < x0' would compare a word
+   with itself when r0 and x0 are the same lvalue and silently lose the
+   carry.  For non-aliased operands the result is unchanged.  */
+#ifndef __FP_FRAC_ADD_4
+# define __FP_FRAC_ADD_4(r3, r2, r1, r0, x3, x2, x1, x0, y3, y2, y1, y0)  \
+  do                                                                      \
+    {                                                                     \
+      _FP_W_TYPE __FP_FRAC_ADD_4_tmp[3];                                  \
+      _FP_W_TYPE __FP_FRAC_ADD_4_c1, __FP_FRAC_ADD_4_c2;                  \
+      _FP_W_TYPE __FP_FRAC_ADD_4_c3;                                      \
+      __FP_FRAC_ADD_4_tmp[0] = x0 + y0;                                   \
+      /* Unsigned wrap-around: the sum is smaller than an addend.  */    \
+      __FP_FRAC_ADD_4_c1 = __FP_FRAC_ADD_4_tmp[0] < x0;                   \
+      __FP_FRAC_ADD_4_tmp[1] = x1 + y1;                                   \
+      __FP_FRAC_ADD_4_c2 = __FP_FRAC_ADD_4_tmp[1] < x1;                   \
+      __FP_FRAC_ADD_4_tmp[1] += __FP_FRAC_ADD_4_c1;                       \
+      /* Adding the incoming carry may itself wrap to zero.  */          \
+      __FP_FRAC_ADD_4_c2 |= __FP_FRAC_ADD_4_tmp[1] < __FP_FRAC_ADD_4_c1;  \
+      __FP_FRAC_ADD_4_tmp[2] = x2 + y2;                                   \
+      __FP_FRAC_ADD_4_c3 = __FP_FRAC_ADD_4_tmp[2] < x2;                   \
+      __FP_FRAC_ADD_4_tmp[2] += __FP_FRAC_ADD_4_c2;                       \
+      __FP_FRAC_ADD_4_c3 |= __FP_FRAC_ADD_4_tmp[2] < __FP_FRAC_ADD_4_c2;  \
+      r3 = x3 + y3 + __FP_FRAC_ADD_4_c3;                                  \
+      r2 = __FP_FRAC_ADD_4_tmp[2];                                        \
+      r1 = __FP_FRAC_ADD_4_tmp[1];                                        \
+      r0 = __FP_FRAC_ADD_4_tmp[0];                                        \
+    }                                                                     \
+  while (0)
+#endif
+
+/* Three-word subtract: (r2,r1,r0) = (x2,x1,x0) - (y2,y1,y0), where the
+   suffix 0 names the least significant word.  A borrow out of the top
+   word is discarded.  The two low differences are staged in a temporary
+   array and stored only after r2 has been computed, so r1 and r0 are
+   not written until every operand word has been read.  */
+#ifndef __FP_FRAC_SUB_3
+# define __FP_FRAC_SUB_3(r2, r1, r0, x2, x1, x0, y2, y1, y0)    \
+  do                                                            \
+    {                                                           \
+      _FP_W_TYPE __FP_FRAC_SUB_3_tmp[2];                        \
+      _FP_W_TYPE __FP_FRAC_SUB_3_c1, __FP_FRAC_SUB_3_c2;        \
+      __FP_FRAC_SUB_3_tmp[0] = x0 - y0;                         \
+      /* Unsigned wrap: a difference above x0 means a borrow.  */ \
+      __FP_FRAC_SUB_3_c1 = __FP_FRAC_SUB_3_tmp[0] > x0;         \
+      __FP_FRAC_SUB_3_tmp[1] = x1 - y1;                         \
+      __FP_FRAC_SUB_3_c2 = __FP_FRAC_SUB_3_tmp[1] > x1;         \
+      __FP_FRAC_SUB_3_tmp[1] -= __FP_FRAC_SUB_3_c1;             \
+      /* x1 == y1 gave 0; taking the incoming borrow wraps it.  */ \
+      __FP_FRAC_SUB_3_c2 |= __FP_FRAC_SUB_3_c1 && (y1 == x1);   \
+      r2 = x2 - y2 - __FP_FRAC_SUB_3_c2;                        \
+      r1 = __FP_FRAC_SUB_3_tmp[1];                              \
+      r0 = __FP_FRAC_SUB_3_tmp[0];                              \
+    }                                                           \
+  while (0)
+#endif
+
+/* Four-word subtract: (r3,r2,r1,r0) = (x3,x2,x1,x0) - (y3,y2,y1,y0),
+   where the suffix 0 names the least significant word.  A borrow out
+   of the top word is discarded.  The three low differences are staged
+   in a temporary array and stored only after r3 has been computed, so
+   r2, r1 and r0 are not written until every operand word has been
+   read.  */
+#ifndef __FP_FRAC_SUB_4
+# define __FP_FRAC_SUB_4(r3, r2, r1, r0, x3, x2, x1, x0, y3, y2, y1, y0) \
+  do                                                                     \
+    {                                                                    \
+      _FP_W_TYPE __FP_FRAC_SUB_4_tmp[3];                                 \
+      _FP_W_TYPE __FP_FRAC_SUB_4_c1, __FP_FRAC_SUB_4_c2;                 \
+      _FP_W_TYPE __FP_FRAC_SUB_4_c3;                                     \
+      __FP_FRAC_SUB_4_tmp[0] = x0 - y0;                                  \
+      /* Unsigned wrap: a difference above x0 means a borrow.  */       \
+      __FP_FRAC_SUB_4_c1 = __FP_FRAC_SUB_4_tmp[0] > x0;                  \
+      __FP_FRAC_SUB_4_tmp[1] = x1 - y1;                                  \
+      __FP_FRAC_SUB_4_c2 = __FP_FRAC_SUB_4_tmp[1] > x1;                  \
+      __FP_FRAC_SUB_4_tmp[1] -= __FP_FRAC_SUB_4_c1;                      \
+      /* Equal words gave 0; taking the incoming borrow wraps it.  */   \
+      __FP_FRAC_SUB_4_c2 |= __FP_FRAC_SUB_4_c1 && (y1 == x1);            \
+      __FP_FRAC_SUB_4_tmp[2] = x2 - y2;                                  \
+      __FP_FRAC_SUB_4_c3 = __FP_FRAC_SUB_4_tmp[2] > x2;                  \
+      __FP_FRAC_SUB_4_tmp[2] -= __FP_FRAC_SUB_4_c2;                      \
+      __FP_FRAC_SUB_4_c3 |= __FP_FRAC_SUB_4_c2 && (y2 == x2);            \
+      r3 = x3 - y3 - __FP_FRAC_SUB_4_c3;                                 \
+      r2 = __FP_FRAC_SUB_4_tmp[2];                                       \
+      r1 = __FP_FRAC_SUB_4_tmp[1];                                       \
+      r0 = __FP_FRAC_SUB_4_tmp[0];                                       \
+    }                                                                    \
+  while (0)
+#endif
+
+/* In-place three-word subtract: (x2,x1,x0) -= (y2,y1,y0).
+   The minuend is snapshotted word by word first; the snapshot is then
+   handed to __FP_FRAC_SUB_3 as the left operand while x itself receives
+   the difference.  */
+#ifndef __FP_FRAC_DEC_3
+# define __FP_FRAC_DEC_3(x2, x1, x0, y2, y1, y0)                 \
+  do                                                             \
+    {                                                            \
+      UWtype __FP_FRAC_DEC_3_old[3];                             \
+      __FP_FRAC_DEC_3_old[0] = x0;                               \
+      __FP_FRAC_DEC_3_old[1] = x1;                               \
+      __FP_FRAC_DEC_3_old[2] = x2;                               \
+      __FP_FRAC_SUB_3 (x2, x1, x0,                               \
+                       __FP_FRAC_DEC_3_old[2],                   \
+                       __FP_FRAC_DEC_3_old[1],                   \
+                       __FP_FRAC_DEC_3_old[0],                   \
+                       y2, y1, y0);                              \
+    }                                                            \
+  while (0)
+#endif
+
+/* In-place four-word subtract: (x3,x2,x1,x0) -= (y3,y2,y1,y0).
+   The minuend is snapshotted word by word first; the snapshot is then
+   handed to __FP_FRAC_SUB_4 as the left operand while x itself receives
+   the difference.  */
+#ifndef __FP_FRAC_DEC_4
+# define __FP_FRAC_DEC_4(x3, x2, x1, x0, y3, y2, y1, y0)         \
+  do                                                             \
+    {                                                            \
+      UWtype __FP_FRAC_DEC_4_old[4];                             \
+      __FP_FRAC_DEC_4_old[0] = x0;                               \
+      __FP_FRAC_DEC_4_old[1] = x1;                               \
+      __FP_FRAC_DEC_4_old[2] = x2;                               \
+      __FP_FRAC_DEC_4_old[3] = x3;                               \
+      __FP_FRAC_SUB_4 (x3, x2, x1, x0,                           \
+                       __FP_FRAC_DEC_4_old[3],                   \
+                       __FP_FRAC_DEC_4_old[2],                   \
+                       __FP_FRAC_DEC_4_old[1],                   \
+                       __FP_FRAC_DEC_4_old[0],                   \
+                       y3, y2, y1, y0);                          \
+    }                                                            \
+  while (0)
+#endif
+
+/* Add the single word I to the four-word value (x3,x2,x1,x0) in place,
+   rippling the carry upward; a carry out of x3 is discarded.
+
+   I is captured once in a local of the fraction word type.  The
+   argument is therefore evaluated exactly once and is parenthesized, so
+   an expression with side effects or with low-precedence operators
+   (e.g. a conditional expression) behaves as a single addend both in
+   the addition and in the carry test.  */
+#ifndef __FP_FRAC_ADDI_4
+# define __FP_FRAC_ADDI_4(x3, x2, x1, x0, i)                     \
+  do                                                             \
+    {                                                            \
+      _FP_W_TYPE __FP_FRAC_ADDI_4_i = (i);                       \
+      UWtype __FP_FRAC_ADDI_4_t;                                 \
+      /* Unsigned wrap: the sum is smaller than the addend.  */ \
+      __FP_FRAC_ADDI_4_t = ((x0 += __FP_FRAC_ADDI_4_i)           \
+                            < __FP_FRAC_ADDI_4_i);               \
+      x1 += __FP_FRAC_ADDI_4_t;                                  \
+      __FP_FRAC_ADDI_4_t = (x1 < __FP_FRAC_ADDI_4_t);            \
+      x2 += __FP_FRAC_ADDI_4_t;                                  \
+      __FP_FRAC_ADDI_4_t = (x2 < __FP_FRAC_ADDI_4_t);            \
+      x3 += __FP_FRAC_ADDI_4_t;                                  \
+    }                                                            \
+  while (0)
+#endif
+
+/* Convert FP values between word sizes. This appears to be more
+   complicated than I'd have expected it to be, so these might be
+   wrong... These macros are in any case somewhat bogus because they
+   use information about what various FRAC_n variables look like
+   internally [eg, that 2 word vars are X_f0 and X_f1]. But so do
+   the ones in op-2.h and op-1.h.  */
+/* Narrow a four-word fraction S to the one-word fraction D by keeping
+   only word 0 of S; words 1..3 are dropped without any check.  */
+#define _FP_FRAC_COPY_1_4(D, S)		(D##_f = S##_f[0])
+
+/* Narrow a four-word fraction S to the two-word fraction D: D_f1 and
+   D_f0 receive words 1 and 0 of S; words 2 and 3 are dropped.  */
+#define _FP_FRAC_COPY_2_4(D, S)                  \
+  do                                             \
+    {                                            \
+      D##_f1 = S##_f[1];                         \
+      D##_f0 = S##_f[0];                         \
+    }                                            \
+  while (0)
+
+/* Assembly/disassembly for converting to/from integral types.
+   No shifting or overflow handled here.  */
+/* Put the FP value X into r, which is an integer of size rsize (a bit
+   count, compared against _FP_W_TYPE_SIZE).  Only as many low words of
+   X as rsize can hold are used: word 0 alone, words 1..0, or all four
+   words with X_f[3] ending up in the most significant position.
+
+   Inside the multi-word branches the
+   `(rsize) <= _FP_W_TYPE_SIZE ? 0 : (r) << _FP_W_TYPE_SIZE' guards can
+   never select 0 at run time, because the enclosing `if' has already
+   excluded that case.  NOTE(review): they presumably exist so that the
+   compiler never sees an unguarded shift by the full width of a
+   one-word r in code that is dead for that instantiation -- confirm
+   before simplifying them away.  */
+#define _FP_FRAC_ASSEMBLE_4(r, X, rsize)				\
+  do									\
+    {									\
+      if ((rsize) <= _FP_W_TYPE_SIZE)					\
+	(r) = X##_f[0];							\
+	else if ((rsize) <= 2*_FP_W_TYPE_SIZE)				\
+	{								\
+	  (r) = X##_f[1];						\
+	  (r) = ((rsize) <= _FP_W_TYPE_SIZE				\
+		 ? 0							\
+		 : (r) << _FP_W_TYPE_SIZE);				\
+	  (r) += X##_f[0];						\
+	}								\
+      else								\
+	{								\
+	  /* I'm feeling lazy so we deal with int == 3words		\
+	     (implausible) and int == 4words as a single case.  */	\
+	  (r) = X##_f[3];						\
+	  (r) = ((rsize) <= _FP_W_TYPE_SIZE				\
+		 ? 0							\
+		 : (r) << _FP_W_TYPE_SIZE);				\
+	  (r) += X##_f[2];						\
+	  (r) = ((rsize) <= _FP_W_TYPE_SIZE				\
+		 ? 0							\
+		 : (r) << _FP_W_TYPE_SIZE);				\
+	  (r) += X##_f[1];						\
+	  (r) = ((rsize) <= _FP_W_TYPE_SIZE				\
+		 ? 0							\
+		 : (r) << _FP_W_TYPE_SIZE);				\
+	  (r) += X##_f[0];						\
+	}								\
+    }									\
+  while (0)
+
+/* "No disassemble Number Five!" */
+/* Move an integer of size rsize into X's fractional part. We rely on
+   the _f[] array consisting of words of size _FP_W_TYPE_SIZE to avoid
+   having to mask the values we store into it.
+
+   Word k of X receives r shifted right by k words.  Each shift is
+   guarded by a comparison of rsize against k words, and the word is set
+   to 0 instead when r is not wide enough to reach it, so no shift by
+   the full width (or more) of r is ever evaluated.  Note that r is
+   expanded four times.  */
+#define _FP_FRAC_DISASSEMBLE_4(X, r, rsize)	\
+  do						\
+    {						\
+      X##_f[0] = (r);				\
+      X##_f[1] = ((rsize) <= _FP_W_TYPE_SIZE	\
+		  ? 0				\
+		  : (r) >> _FP_W_TYPE_SIZE);	\
+      X##_f[2] = ((rsize) <= 2*_FP_W_TYPE_SIZE	\
+		  ? 0				\
+		  : (r) >> 2*_FP_W_TYPE_SIZE);	\
+      X##_f[3] = ((rsize) <= 3*_FP_W_TYPE_SIZE	\
+		  ? 0				\
+		  : (r) >> 3*_FP_W_TYPE_SIZE);	\
+    }						\
+  while (0)
+
+/* Widen the one-word fraction S to the four-word fraction D: word 0 of
+   D takes S's single word and words 1..3 are cleared.  */
+#define _FP_FRAC_COPY_4_1(D, S)                  \
+  do                                             \
+    {                                            \
+      D##_f[3] = 0;                              \
+      D##_f[2] = 0;                              \
+      D##_f[1] = 0;                              \
+      D##_f[0] = S##_f;                          \
+    }                                            \
+  while (0)
+
+/* Widen the two-word fraction S to the four-word fraction D: words 0
+   and 1 of D take S_f0 and S_f1, words 2 and 3 are cleared.  */
+#define _FP_FRAC_COPY_4_2(D, S)                  \
+  do                                             \
+    {                                            \
+      D##_f[3] = 0;                              \
+      D##_f[2] = 0;                              \
+      D##_f[1] = S##_f1;                         \
+      D##_f[0] = S##_f0;                         \
+    }                                            \
+  while (0)
+
+/* Same-width copy; provided so that generic code pasting source and
+   destination word counts into the macro name also resolves for 4/4.
+   Simply forwards to _FP_FRAC_COPY_4.  */
+#define _FP_FRAC_COPY_4_4(D, S)	_FP_FRAC_COPY_4 (D, S)
+
+#endif /* !SOFT_FP_OP_4_H */
--- a/src/gemm/soft-fp/op-8.h
+++ b/src/gemm/soft-fp/op-8.h
@ -0,0 +1,238 @@
+/* Software floating-point emulation.
+   Basic eight-word fraction declaration and manipulation.
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com),
+		  Jakub Jelinek (jj@ultra.linux.cz) and
+		  Peter Maydell (pmaydell@chiark.greenend.org.uk).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef SOFT_FP_OP_8_H
+#define SOFT_FP_OP_8_H	1
+
+/* Only the handful of eight-word macros that op-4 needs are provided
+   here; further macros can be added if they are ever required.  */
+/* Declare the eight-word fraction array X_f for X.  */
+#define _FP_FRAC_DECL_8(X)	_FP_W_TYPE X##_f[8]
+/* I must expand to eight comma-separated word values (see
+   _FP_MINFRAC_8), which become the I7..I0 arguments of
+   __FP_FRAC_SET_8.  */
+#define _FP_FRAC_SET_8(X, I)    __FP_FRAC_SET_8 (X, I)
+/* Word accessors: index 7 is the high word, index 0 the low word.  */
+#define _FP_FRAC_HIGH_8(X)	(X##_f[7])
+#define _FP_FRAC_LOW_8(X)	(X##_f[0])
+#define _FP_FRAC_WORD_8(X, w)	(X##_f[w])
+
+/* Shift the eight-word fraction X left by N bits in place, filling with
+   zeros.  N is split into a whole-word distance (skip) and a residual
+   bit count (up).  When the residual is zero the words are simply moved
+   up by skip positions; otherwise every destination word is stitched
+   together from two neighbouring source words, and the lowest surviving
+   word takes only the shifted X_f[0].  All words below that are then
+   cleared.  The loops run from the high word downward so that a source
+   word is always read before it is overwritten.
+   NOTE(review): the code indexes X##_f[i - skip] without a range check,
+   so N is presumably required to satisfy 0 <= N < 8 * _FP_W_TYPE_SIZE
+   -- confirm with the callers.  */
+#define _FP_FRAC_SLL_8(X, N)						\
+  do									\
+    {									\
+      _FP_I_TYPE _FP_FRAC_SLL_8_up, _FP_FRAC_SLL_8_down;		\
+      _FP_I_TYPE _FP_FRAC_SLL_8_skip, _FP_FRAC_SLL_8_i;			\
+      _FP_FRAC_SLL_8_skip = (N) / _FP_W_TYPE_SIZE;			\
+      _FP_FRAC_SLL_8_up = (N) % _FP_W_TYPE_SIZE;			\
+      _FP_FRAC_SLL_8_down = _FP_W_TYPE_SIZE - _FP_FRAC_SLL_8_up;	\
+      if (!_FP_FRAC_SLL_8_up)						\
+	for (_FP_FRAC_SLL_8_i = 7;					\
+	     _FP_FRAC_SLL_8_i >= _FP_FRAC_SLL_8_skip;			\
+	     --_FP_FRAC_SLL_8_i)					\
+	  X##_f[_FP_FRAC_SLL_8_i]					\
+	    = X##_f[_FP_FRAC_SLL_8_i-_FP_FRAC_SLL_8_skip];		\
+      else								\
+	{								\
+	  for (_FP_FRAC_SLL_8_i = 7;					\
+	       _FP_FRAC_SLL_8_i > _FP_FRAC_SLL_8_skip;			\
+	       --_FP_FRAC_SLL_8_i)					\
+	    X##_f[_FP_FRAC_SLL_8_i]					\
+	      = ((X##_f[_FP_FRAC_SLL_8_i-_FP_FRAC_SLL_8_skip]		\
+		  << _FP_FRAC_SLL_8_up)					\
+		 | (X##_f[_FP_FRAC_SLL_8_i-_FP_FRAC_SLL_8_skip-1]	\
+		    >> _FP_FRAC_SLL_8_down));				\
+	  X##_f[_FP_FRAC_SLL_8_i--] = X##_f[0] << _FP_FRAC_SLL_8_up;	\
+	}								\
+      for (; _FP_FRAC_SLL_8_i >= 0; --_FP_FRAC_SLL_8_i)			\
+	X##_f[_FP_FRAC_SLL_8_i] = 0;					\
+    }									\
+  while (0)
+
+/* Shift the eight-word fraction X right by N bits in place, filling
+   with zeros; bits shifted out at the low end are lost.  N is split
+   into a whole-word distance (skip) and a residual bit count (down).
+   When the residual is zero the words are simply moved down by skip
+   positions; otherwise every destination word is stitched together from
+   two neighbouring source words, and the highest surviving word takes
+   only the shifted X_f[7].  All words above that are then cleared.  The
+   loops run from the low word upward so that a source word is always
+   read before it is overwritten.
+   NOTE(review): the code indexes X##_f[i + skip] without a range check,
+   so N is presumably required to satisfy 0 <= N < 8 * _FP_W_TYPE_SIZE
+   -- confirm with the callers.  */
+#define _FP_FRAC_SRL_8(X, N)						\
+  do									\
+    {									\
+      _FP_I_TYPE _FP_FRAC_SRL_8_up, _FP_FRAC_SRL_8_down;		\
+      _FP_I_TYPE _FP_FRAC_SRL_8_skip, _FP_FRAC_SRL_8_i;			\
+      _FP_FRAC_SRL_8_skip = (N) / _FP_W_TYPE_SIZE;			\
+      _FP_FRAC_SRL_8_down = (N) % _FP_W_TYPE_SIZE;			\
+      _FP_FRAC_SRL_8_up = _FP_W_TYPE_SIZE - _FP_FRAC_SRL_8_down;	\
+      if (!_FP_FRAC_SRL_8_down)						\
+	for (_FP_FRAC_SRL_8_i = 0;					\
+	     _FP_FRAC_SRL_8_i <= 7-_FP_FRAC_SRL_8_skip;			\
+	     ++_FP_FRAC_SRL_8_i)					\
+	  X##_f[_FP_FRAC_SRL_8_i]					\
+	    = X##_f[_FP_FRAC_SRL_8_i+_FP_FRAC_SRL_8_skip];		\
+      else								\
+	{								\
+	  for (_FP_FRAC_SRL_8_i = 0;					\
+	       _FP_FRAC_SRL_8_i < 7-_FP_FRAC_SRL_8_skip;		\
+	       ++_FP_FRAC_SRL_8_i)					\
+	    X##_f[_FP_FRAC_SRL_8_i]					\
+	      = ((X##_f[_FP_FRAC_SRL_8_i+_FP_FRAC_SRL_8_skip]		\
+		  >> _FP_FRAC_SRL_8_down)				\
+		 | (X##_f[_FP_FRAC_SRL_8_i+_FP_FRAC_SRL_8_skip+1]	\
+		    << _FP_FRAC_SRL_8_up));				\
+	  X##_f[_FP_FRAC_SRL_8_i++] = X##_f[7] >> _FP_FRAC_SRL_8_down;	\
+	}								\
+      for (; _FP_FRAC_SRL_8_i < 8; ++_FP_FRAC_SRL_8_i)			\
+	X##_f[_FP_FRAC_SRL_8_i] = 0;					\
+    }									\
+  while (0)
+
+
+/* Right shift with sticky-lsb.
+   What this actually means is that we do a standard right-shift,
+   but that if any of the bits that fall off the right hand side
+   were one then we always set the LSbit.
+
+   The lost bits are gathered in a scratch word: first every word that
+   is skipped entirely is OR-ed in, then (for a non-zero residual) the
+   low bits of the first surviving word, isolated by shifting them up to
+   the top.  The shift itself is the same as in _FP_FRAC_SRL_8.  The
+   `size' parameter is not used by this expansion.
+   NOTE(review): N is presumably required to satisfy
+   0 <= N < 8 * _FP_W_TYPE_SIZE, since X##_f[skip] is read without a
+   range check -- confirm with the callers.  */
+#define _FP_FRAC_SRS_8(X, N, size)					\
+  do									\
+    {									\
+      _FP_I_TYPE _FP_FRAC_SRS_8_up, _FP_FRAC_SRS_8_down;		\
+      _FP_I_TYPE _FP_FRAC_SRS_8_skip, _FP_FRAC_SRS_8_i;			\
+      _FP_W_TYPE _FP_FRAC_SRS_8_s;					\
+      _FP_FRAC_SRS_8_skip = (N) / _FP_W_TYPE_SIZE;			\
+      _FP_FRAC_SRS_8_down = (N) % _FP_W_TYPE_SIZE;			\
+      _FP_FRAC_SRS_8_up = _FP_W_TYPE_SIZE - _FP_FRAC_SRS_8_down;	\
+      for (_FP_FRAC_SRS_8_s = _FP_FRAC_SRS_8_i = 0;			\
+	   _FP_FRAC_SRS_8_i < _FP_FRAC_SRS_8_skip;			\
+	   ++_FP_FRAC_SRS_8_i)						\
+	_FP_FRAC_SRS_8_s |= X##_f[_FP_FRAC_SRS_8_i];			\
+      if (!_FP_FRAC_SRS_8_down)						\
+	for (_FP_FRAC_SRS_8_i = 0;					\
+	     _FP_FRAC_SRS_8_i <= 7-_FP_FRAC_SRS_8_skip;			\
+	     ++_FP_FRAC_SRS_8_i)					\
+	  X##_f[_FP_FRAC_SRS_8_i]					\
+	    = X##_f[_FP_FRAC_SRS_8_i+_FP_FRAC_SRS_8_skip];		\
+      else								\
+	{								\
+	  /* Here the index still equals skip from the loop above.  */	\
+	  _FP_FRAC_SRS_8_s						\
+	    |= X##_f[_FP_FRAC_SRS_8_i] << _FP_FRAC_SRS_8_up;		\
+	  for (_FP_FRAC_SRS_8_i = 0;					\
+	       _FP_FRAC_SRS_8_i < 7-_FP_FRAC_SRS_8_skip;		\
+	       ++_FP_FRAC_SRS_8_i)					\
+	    X##_f[_FP_FRAC_SRS_8_i]					\
+	      = ((X##_f[_FP_FRAC_SRS_8_i+_FP_FRAC_SRS_8_skip]		\
+		  >> _FP_FRAC_SRS_8_down)				\
+		 | (X##_f[_FP_FRAC_SRS_8_i+_FP_FRAC_SRS_8_skip+1]	\
+		    << _FP_FRAC_SRS_8_up));				\
+	  X##_f[_FP_FRAC_SRS_8_i++] = X##_f[7] >> _FP_FRAC_SRS_8_down;	\
+	}								\
+      for (; _FP_FRAC_SRS_8_i < 8; ++_FP_FRAC_SRS_8_i)			\
+	X##_f[_FP_FRAC_SRS_8_i] = 0;					\
+      /* Don't fix the LSB until the very end when we're sure f[0] is	\
+	 stable.  */							\
+      X##_f[0] |= (_FP_FRAC_SRS_8_s != 0);				\
+    }									\
+  while (0)
+
+/* Eight-word add R = X + Y, walking from word 0 upward with a running
+   carry.  With no incoming carry the word overflowed iff the sum is
+   below X's word; with an incoming carry a sum equal to X's word also
+   means overflow.  The carry out of word 7 is discarded.  */
+#define _FP_FRAC_ADD_8(R, X, Y)                                             \
+  do                                                                        \
+    {                                                                       \
+      _FP_W_TYPE _FP_FRAC_ADD_8_carry = 0;                                  \
+      _FP_I_TYPE _FP_FRAC_ADD_8_w = 0;                                      \
+      while (_FP_FRAC_ADD_8_w < 8)                                          \
+        {                                                                   \
+          R##_f[_FP_FRAC_ADD_8_w]                                           \
+            = (X##_f[_FP_FRAC_ADD_8_w] + Y##_f[_FP_FRAC_ADD_8_w]            \
+               + _FP_FRAC_ADD_8_carry);                                     \
+          if (_FP_FRAC_ADD_8_carry)                                         \
+            {                                                               \
+              _FP_FRAC_ADD_8_carry                                          \
+                = R##_f[_FP_FRAC_ADD_8_w] <= X##_f[_FP_FRAC_ADD_8_w];       \
+            }                                                               \
+          else                                                              \
+            {                                                               \
+              _FP_FRAC_ADD_8_carry                                          \
+                = R##_f[_FP_FRAC_ADD_8_w] < X##_f[_FP_FRAC_ADD_8_w];        \
+            }                                                               \
+          ++_FP_FRAC_ADD_8_w;                                               \
+        }                                                                   \
+    }                                                                       \
+  while (0)
+
+/* Eight-word subtract R = X - Y, walking from word 0 upward with a
+   running borrow.  The differences are collected in a scratch array
+   and copied into R only after all eight words have been computed.
+   With no incoming borrow the word underflowed iff the difference is
+   above X's word; with an incoming borrow a difference equal to X's
+   word also means underflow.  The borrow out of word 7 is discarded.  */
+#define _FP_FRAC_SUB_8(R, X, Y)                                             \
+  do                                                                        \
+    {                                                                       \
+      _FP_W_TYPE _FP_FRAC_SUB_8_diff[8];                                    \
+      _FP_W_TYPE _FP_FRAC_SUB_8_borrow = 0;                                 \
+      _FP_I_TYPE _FP_FRAC_SUB_8_w = 0;                                      \
+      while (_FP_FRAC_SUB_8_w < 8)                                          \
+        {                                                                   \
+          _FP_FRAC_SUB_8_diff[_FP_FRAC_SUB_8_w]                             \
+            = (X##_f[_FP_FRAC_SUB_8_w] - Y##_f[_FP_FRAC_SUB_8_w]            \
+               - _FP_FRAC_SUB_8_borrow);                                    \
+          if (_FP_FRAC_SUB_8_borrow)                                        \
+            {                                                               \
+              _FP_FRAC_SUB_8_borrow                                         \
+                = (_FP_FRAC_SUB_8_diff[_FP_FRAC_SUB_8_w]                    \
+                   >= X##_f[_FP_FRAC_SUB_8_w]);                             \
+            }                                                               \
+          else                                                              \
+            {                                                               \
+              _FP_FRAC_SUB_8_borrow                                         \
+                = (_FP_FRAC_SUB_8_diff[_FP_FRAC_SUB_8_w]                    \
+                   > X##_f[_FP_FRAC_SUB_8_w]);                              \
+            }                                                               \
+          ++_FP_FRAC_SUB_8_w;                                               \
+        }                                                                   \
+      for (_FP_FRAC_SUB_8_w = 0; _FP_FRAC_SUB_8_w < 8; ++_FP_FRAC_SUB_8_w)  \
+        R##_f[_FP_FRAC_SUB_8_w] = _FP_FRAC_SUB_8_diff[_FP_FRAC_SUB_8_w];    \
+    }                                                                       \
+  while (0)
+
+/* Count the leading zero bits of the eight-word fraction X into R.
+   Scan downward from word 7 for the first non-zero word (stopping at
+   word 0 regardless of its value), take __FP_CLZ of that word and add
+   one full word width for every all-zero word above it.  */
+#define _FP_FRAC_CLZ_8(R, X)                                                \
+  do                                                                        \
+    {                                                                       \
+      _FP_I_TYPE _FP_FRAC_CLZ_8_top = 7;                                    \
+      while (_FP_FRAC_CLZ_8_top > 0 && !X##_f[_FP_FRAC_CLZ_8_top])          \
+        --_FP_FRAC_CLZ_8_top;                                               \
+      __FP_CLZ ((R), X##_f[_FP_FRAC_CLZ_8_top]);                            \
+      (R) += _FP_W_TYPE_SIZE * (7 - _FP_FRAC_CLZ_8_top);                    \
+    }                                                                       \
+  while (0)
+
+/* Eight word values in I7..I0 order (as consumed by __FP_FRAC_SET_8):
+   everything zero except word 0, which is 1.  */
+#define _FP_MINFRAC_8   0, 0, 0, 0, 0, 0, 0, 1
+
+/* True when the top word, reinterpreted as the signed word type, is
+   negative, i.e. when the most significant bit of X is set.  */
+#define _FP_FRAC_NEGP_8(X)      ((_FP_WS_TYPE) X##_f[7] < 0)
+/* True when all eight words of X are zero.  */
+#define _FP_FRAC_ZEROP_8(X)                                             \
+  ((X##_f[0] | X##_f[1] | X##_f[2] | X##_f[3]                           \
+    | X##_f[4] | X##_f[5] | X##_f[6] | X##_f[7]) == 0)
+/* Mask the format-specific double-width high word of X (selected by
+   pasting the format suffix fs) with that format's double-width high
+   bit; non-zero when the bit is set.  */
+#define _FP_FRAC_HIGHBIT_DW_8(fs, X)                                    \
+  (_FP_FRAC_HIGH_DW_##fs (X) & _FP_HIGHBIT_DW_##fs)
+
+/* Narrow the eight-word fraction S to the four-word fraction D by
+   copying words 0..3; words 4..7 of S are dropped.  */
+#define _FP_FRAC_COPY_4_8(D, S)                                     \
+  do                                                                \
+    {                                                               \
+      int _FP_FRAC_COPY_4_8_w;                                      \
+      for (_FP_FRAC_COPY_4_8_w = 0; _FP_FRAC_COPY_4_8_w < 4;        \
+           ++_FP_FRAC_COPY_4_8_w)                                   \
+        D##_f[_FP_FRAC_COPY_4_8_w] = S##_f[_FP_FRAC_COPY_4_8_w];    \
+    }                                                               \
+  while (0)
+
+/* Widen the four-word fraction S to the eight-word fraction D: words
+   0..3 are copied from S and words 4..7 are cleared.  */
+#define _FP_FRAC_COPY_8_4(D, S)                                     \
+  do                                                                \
+    {                                                               \
+      int _FP_FRAC_COPY_8_4_w;                                      \
+      for (_FP_FRAC_COPY_8_4_w = 0; _FP_FRAC_COPY_8_4_w < 4;        \
+           ++_FP_FRAC_COPY_8_4_w)                                   \
+        D##_f[_FP_FRAC_COPY_8_4_w] = S##_f[_FP_FRAC_COPY_8_4_w];    \
+      for (_FP_FRAC_COPY_8_4_w = 4; _FP_FRAC_COPY_8_4_w < 8;        \
+           ++_FP_FRAC_COPY_8_4_w)                                   \
+        D##_f[_FP_FRAC_COPY_8_4_w] = 0;                             \
+    }                                                               \
+  while (0)
+
+/* Store eight words into X's fraction array: I7 goes to X_f[7] (the
+   high word) and I0 to X_f[0] (the low word).  The expansion is a
+   single comma expression, so the stores are performed in the order
+   written and the macro can be used wherever an expression is
+   allowed.  */
+#define __FP_FRAC_SET_8(X, I7, I6, I5, I4, I3, I2, I1, I0)             \
+  (X##_f[7] = I7, X##_f[6] = I6, X##_f[5] = I5, X##_f[4] = I4,         \
+   X##_f[3] = I3, X##_f[2] = I2, X##_f[1] = I1, X##_f[0] = I0)
+
+#endif /* !SOFT_FP_OP_8_H */
--- a/src/gemm/soft-fp/op-common.h
+++ b/src/gemm/soft-fp/op-common.h
--- a/src/gemm/soft-fp/sfp-machine.h
+++ b/src/gemm/soft-fp/sfp-machine.h
@ -0,0 +1,117 @@
+
+/* Word-size dependent configuration.
+
+   RV32: fractions are built from 32-bit words, so the double routines
+   use the two-word helpers and the quad routines the four-word helpers.
+
+   Every other case: 64-bit words, so a double uses the one-word helpers
+   and a quad the two-word helpers.  Note that this branch is also taken
+   when __riscv_xlen is not defined at all, because an undefined
+   identifier evaluates to 0 inside #if.
+
+   _FP_NANFRAC_* lists the fraction words of the generated NaN, high
+   word first: only the quiet bit is set.  The number of entries must
+   match the word count used for that format in the same branch.  */
+#if __riscv_xlen == 32
+
+#define _FP_W_TYPE_SIZE		32
+#define _FP_W_TYPE		unsigned long
+#define _FP_WS_TYPE		signed long
+#define _FP_I_TYPE		long
+
+#define _FP_MUL_MEAT_S(R,X,Y)				\
+  _FP_MUL_MEAT_1_wide(_FP_WFRACBITS_S,R,X,Y,umul_ppmm)
+#define _FP_MUL_MEAT_D(R,X,Y)				\
+  _FP_MUL_MEAT_2_wide(_FP_WFRACBITS_D,R,X,Y,umul_ppmm)
+#define _FP_MUL_MEAT_Q(R,X,Y)				\
+  _FP_MUL_MEAT_4_wide(_FP_WFRACBITS_Q,R,X,Y,umul_ppmm)
+
+#define _FP_DIV_MEAT_S(R,X,Y)	_FP_DIV_MEAT_1_udiv_norm(S,R,X,Y)
+#define _FP_DIV_MEAT_D(R,X,Y)	_FP_DIV_MEAT_2_udiv(D,R,X,Y)
+#define _FP_DIV_MEAT_Q(R,X,Y)	_FP_DIV_MEAT_4_udiv(Q,R,X,Y)
+
+#define _FP_NANFRAC_S		_FP_QNANBIT_S
+#define _FP_NANFRAC_D		_FP_QNANBIT_D, 0
+#define _FP_NANFRAC_Q		_FP_QNANBIT_Q, 0, 0, 0
+
+#else
+
+#define _FP_W_TYPE_SIZE		64
+#define _FP_W_TYPE		unsigned long long
+#define _FP_WS_TYPE		signed long long
+#define _FP_I_TYPE		long long
+
+#define _FP_MUL_MEAT_S(R,X,Y)					\
+  _FP_MUL_MEAT_1_imm(_FP_WFRACBITS_S,R,X,Y)
+#define _FP_MUL_MEAT_D(R,X,Y)					\
+  _FP_MUL_MEAT_1_wide(_FP_WFRACBITS_D,R,X,Y,umul_ppmm)
+#define _FP_MUL_MEAT_Q(R,X,Y)					\
+  _FP_MUL_MEAT_2_wide_3mul(_FP_WFRACBITS_Q,R,X,Y,umul_ppmm)
+
+#define _FP_DIV_MEAT_S(R,X,Y)	_FP_DIV_MEAT_1_imm(S,R,X,Y,_FP_DIV_HELP_imm)
+#define _FP_DIV_MEAT_D(R,X,Y)	_FP_DIV_MEAT_1_udiv_norm(D,R,X,Y)
+#define _FP_DIV_MEAT_Q(R,X,Y)	_FP_DIV_MEAT_2_udiv(Q,R,X,Y)
+
+#define _FP_NANFRAC_S		_FP_QNANBIT_S
+#define _FP_NANFRAC_D		_FP_QNANBIT_D
+#define _FP_NANFRAC_Q		_FP_QNANBIT_Q, 0
+
+#endif
+
+/* 128-bit integer types (GCC machine mode TI) and their width in bits.
+   They are only declared for RV64; in particular they are NOT declared
+   when __riscv_xlen is undefined, even though the 64-bit word
+   configuration above is selected in that case.  */
+#if __riscv_xlen == 64
+typedef int TItype __attribute__ ((mode (TI)));
+typedef unsigned int UTItype __attribute__ ((mode (TI)));
+#define TI_BITS (__CHAR_BIT__ * (int)sizeof(TItype))
+#endif
+
+/* The type of the result of a floating point comparison.  This must
+   match __libgcc_cmp_return__ in GCC for the target.  The typedef uses
+   a reserved-style name and CMPtype is mapped onto it by a macro, so
+   code written against the plain CMPtype name picks it up.  */
+typedef int __gcc_CMPtype __attribute__ ((mode (__libgcc_cmp_return__)));
+#define CMPtype __gcc_CMPtype
+
+/* Sign bit given to a generated NaN in each format (0 = positive).  */
+#define _FP_NANSIGN_S		0
+#define _FP_NANSIGN_D		0
+#define _FP_NANSIGN_Q		0
+
+/* NaN policy switches consumed by the generic soft-fp code.
+   NOTE(review): going by the names, 0 here means "do not keep the
+   fraction of an input NaN" and "the quiet bit is not negated (set =
+   quiet)" -- the consumers are not visible in this chunk, confirm in
+   op-common.h.  */
+#define _FP_KEEPNANFRACP 0
+#define _FP_QNANNEGATEDP 0
+
+/* Produce the NaN result for an operation on NaN operands.  The
+   operands X and Y and the operation OP are ignored: R always becomes
+   the fixed NaN of format fs, i.e. class NaN, sign _FP_NANSIGN_<fs> and
+   fraction _FP_NANFRAC_<fs> (stored with the wc-word set macro).  */
+#define _FP_CHOOSENAN(fs, wc, R, X, Y, OP)		\
+  do							\
+    {							\
+      R##_c = FP_CLS_NAN;				\
+      R##_s = _FP_NANSIGN_##fs;				\
+      _FP_FRAC_SET_##wc(R,_FP_NANFRAC_##fs);		\
+    }							\
+  while (0)
+
+/* Per-function state: a local `_frm' holding the rounding mode.  The
+   expansion already ends in a semicolon.  */
+#define _FP_DECL_EX		int _frm __attribute__ ((unused));
+#define FP_ROUNDMODE		_frm
+
+/* Rounding modes.  _frm may be loaded straight from the frm CSR (see
+   FP_INIT_ROUNDMODE), so these use the RISC-V frm encoding:
+   RNE = 0, RTZ = 1, RDN = 2 (toward -inf), RUP = 3 (toward +inf).  */
+#define FP_RND_NEAREST		0x0
+#define FP_RND_ZERO		0x1
+#define FP_RND_PINF		0x3
+#define FP_RND_MINF		0x2
+
+/* Exception flags.  These may be OR-ed straight into the fflags CSR
+   (see FP_HANDLE_EXCEPTIONS), so they use the RISC-V fflags layout:
+   NV = 0x10, DZ = 0x08, OF = 0x04, UF = 0x02, NX = 0x01.  */
+#define FP_EX_INVALID		0x10
+#define FP_EX_OVERFLOW		0x04
+#define FP_EX_UNDERFLOW		0x02
+#define FP_EX_DIVZERO		0x08
+#define FP_EX_INEXACT		0x01
+
+/* Tininess (for underflow) is detected after rounding.  */
+#define _FP_TININESS_AFTER_ROUNDING 1
+
+/* Coupling with the hardware FP state.
+
+   With an FPU (__riscv_flen defined): the rounding mode is read from
+   the frm CSR with `frrm', and any exception bits accumulated in _fex
+   are set in the fflags CSR with `csrs'.  The "rK" constraint lets the
+   compiler pass _fex either in a register or as a small immediate.
+
+   Without an FPU: the rounding mode is fixed to round-to-nearest and
+   FP_HANDLE_EXCEPTIONS is not defined here at all.
+   NOTE(review): presumably the generic soft-fp.h supplies an empty
+   default for it -- that part of the file is not visible here.  */
+#ifdef __riscv_flen
+#define FP_INIT_ROUNDMODE			\
+do {						\
+  __asm__ volatile ("frrm %0" : "=r" (_frm));	\
+} while (0)
+
+#define FP_HANDLE_EXCEPTIONS					\
+do {								\
+  if (__builtin_expect (_fex, 0))				\
+    __asm__ volatile ("csrs fflags, %0" : : "rK" (_fex));	\
+} while (0)
+#else
+#define FP_INIT_ROUNDMODE	_frm = FP_RND_NEAREST
+#endif
+
+/* Byte-order constants in the style of <endian.h>, defined locally.
+   __BYTE_ORDER is derived from the compiler's predefined
+   __BYTE_ORDER__; if the compiler does not provide that macro the
+   target is assumed to be little-endian.  The value selects the
+   bit-field order in the _FP_UNION_* layouts.  */
+#define	__LITTLE_ENDIAN	1234
+#define	__BIG_ENDIAN	4321
+
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#define __BYTE_ORDER __BIG_ENDIAN
+#else
+#define __BYTE_ORDER __LITTLE_ENDIAN
+#endif
+
+
+/* Define ALIASNAME as a strong alias for NAME.
+   The extra level of indirection lets NAME and ALIASNAME be
+   macro-expanded before NAME is turned into a string by `#name'.  The
+   expansion is a complete declaration including its semicolon, giving
+   ALIASNAME the same type as NAME.  */
+# define strong_alias(name, aliasname) _strong_alias(name, aliasname)
+# define _strong_alias(name, aliasname) \
+  extern __typeof (name) aliasname __attribute__ ((alias (#name)));
--- a/src/gemm/soft-fp/single.h
+++ b/src/gemm/soft-fp/single.h
@ -0,0 +1,199 @@
+/* Software floating-point emulation.
+   Definitions for IEEE Single Precision.
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com),
+		  Jakub Jelinek (jj@ultra.linux.cz),
+		  David S. Miller (davem@redhat.com) and
+		  Peter Maydell (pmaydell@chiark.greenend.org.uk).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef SOFT_FP_SINGLE_H
+#define SOFT_FP_SINGLE_H	1
+
+/* A single-precision fraction plus working bits must fit in one word.  */
+#if _FP_W_TYPE_SIZE < 32
+# error "Here's a nickel kid.  Go buy yourself a real computer."
+#endif
+
+/* Total number of bits available in the one-word fraction.  */
+#define _FP_FRACTBITS_S		_FP_W_TYPE_SIZE
+
+/* Bits available for a double-width (product-sized) fraction: two
+   words when words are narrower than 64 bits, otherwise one.  */
+#if _FP_W_TYPE_SIZE < 64
+# define _FP_FRACTBITS_DW_S	(2 * _FP_W_TYPE_SIZE)
+#else
+# define _FP_FRACTBITS_DW_S	_FP_W_TYPE_SIZE
+#endif
+
+/* Format parameters.  _FP_FRACBITS_S counts the implicit leading bit
+   (the stored bit-field is one bit narrower, see union _FP_UNION_S);
+   the W* variants add the _FP_WORKBITS working bits, and the X*
+   variants give the number of unused bits left in the word.  The *_SH
+   masks are the same bits as their plain counterparts, shifted up by
+   _FP_WORKBITS.  */
+#define _FP_FRACBITS_S		24
+#define _FP_FRACXBITS_S		(_FP_FRACTBITS_S - _FP_FRACBITS_S)
+#define _FP_WFRACBITS_S		(_FP_WORKBITS + _FP_FRACBITS_S)
+#define _FP_WFRACXBITS_S	(_FP_FRACTBITS_S - _FP_WFRACBITS_S)
+#define _FP_EXPBITS_S		8
+#define _FP_EXPBIAS_S		127
+#define _FP_EXPMAX_S		255
+#define _FP_QNANBIT_S		((_FP_W_TYPE) 1 << (_FP_FRACBITS_S-2))
+#define _FP_QNANBIT_SH_S	((_FP_W_TYPE) 1 << (_FP_FRACBITS_S-2+_FP_WORKBITS))
+#define _FP_IMPLBIT_S		((_FP_W_TYPE) 1 << (_FP_FRACBITS_S-1))
+#define _FP_IMPLBIT_SH_S	((_FP_W_TYPE) 1 << (_FP_FRACBITS_S-1+_FP_WORKBITS))
+#define _FP_OVERFLOW_S		((_FP_W_TYPE) 1 << (_FP_WFRACBITS_S))
+
+/* Double-width counterparts.  In _FP_HIGHBIT_DW_S the `%' binds tighter
+   than `<<', so the mask is the top fraction bit's position within the
+   word that contains it.  */
+#define _FP_WFRACBITS_DW_S	(2 * _FP_WFRACBITS_S)
+#define _FP_WFRACXBITS_DW_S	(_FP_FRACTBITS_DW_S - _FP_WFRACBITS_DW_S)
+#define _FP_HIGHBIT_DW_S	\
+  ((_FP_W_TYPE) 1 << (_FP_WFRACBITS_DW_S - 1) % _FP_W_TYPE_SIZE)
+
+/* The implementation of _FP_MUL_MEAT_S and _FP_DIV_MEAT_S should be
+   chosen by the target machine.  */
+
+/* The single-precision type, pinned to GCC machine mode SF.  */
+typedef float SFtype __attribute__ ((mode (SF)));
+
+/* Overlay of a single-precision value with its raw fields.  The
+   bit-fields are 1 sign bit, _FP_EXPBITS_S exponent bits and
+   _FP_FRACBITS_S - 1 stored fraction bits (the implicit bit is not
+   stored, since _FP_IMPLBIT_S is non-zero).  The declaration order is
+   flipped according to __BYTE_ORDER so that the fields line up with the
+   in-memory format.  */
+union _FP_UNION_S
+{
+  SFtype flt;
+  struct _FP_STRUCT_LAYOUT
+  {
+#if __BYTE_ORDER == __BIG_ENDIAN
+    unsigned sign : 1;
+    unsigned exp  : _FP_EXPBITS_S;
+    unsigned frac : _FP_FRACBITS_S - (_FP_IMPLBIT_S != 0);
+#else
+    unsigned frac : _FP_FRACBITS_S - (_FP_IMPLBIT_S != 0);
+    unsigned exp  : _FP_EXPBITS_S;
+    unsigned sign : 1;
+#endif
+  } bits;
+};
+
+/* Single-precision front ends for the generic one-word pack/unpack
+   machinery.  Every macro below only binds the format tag S and the
+   word count 1.
+
+   The variants whose name ends in `P' go through the `_1_P' raw
+   helpers.  NOTE(review): presumably those take a pointer to the value
+   rather than the value itself -- confirm in op-1.h.  The packing `P'
+   variants additionally skip the store when FP_INHIBIT_RESULTS is
+   true.  */
+
+/* Declaration and raw (field-level) unpack/pack.  */
+#define FP_DECL_S(X)		_FP_DECL (1, X)
+#define FP_UNPACK_RAW_S(X, val)	_FP_UNPACK_RAW_1 (S, X, (val))
+#define FP_UNPACK_RAW_SP(X, val)	_FP_UNPACK_RAW_1_P (S, X, (val))
+#define FP_PACK_RAW_S(val, X)	_FP_PACK_RAW_1 (S, (val), X)
+#define FP_PACK_RAW_SP(val, X)			\
+  do						\
+    {						\
+      if (!FP_INHIBIT_RESULTS)			\
+	_FP_PACK_RAW_1_P (S, (val), X);		\
+    }						\
+  while (0)
+
+/* Raw unpack followed by conversion to the canonical form.  */
+#define FP_UNPACK_S(X, val)			\
+  do						\
+    {						\
+      _FP_UNPACK_RAW_1 (S, X, (val));		\
+      _FP_UNPACK_CANONICAL (S, 1, X);		\
+    }						\
+  while (0)
+
+#define FP_UNPACK_SP(X, val)			\
+  do						\
+    {						\
+      _FP_UNPACK_RAW_1_P (S, X, (val));		\
+      _FP_UNPACK_CANONICAL (S, 1, X);		\
+    }						\
+  while (0)
+
+/* Raw unpack followed by conversion to the semi-raw form.  */
+#define FP_UNPACK_SEMIRAW_S(X, val)		\
+  do						\
+    {						\
+      _FP_UNPACK_RAW_1 (S, X, (val));		\
+      _FP_UNPACK_SEMIRAW (S, 1, X);		\
+    }						\
+  while (0)
+
+#define FP_UNPACK_SEMIRAW_SP(X, val)		\
+  do						\
+    {						\
+      _FP_UNPACK_RAW_1_P (S, X, (val));		\
+      _FP_UNPACK_SEMIRAW (S, 1, X);		\
+    }						\
+  while (0)
+
+/* Convert back from the canonical form, then raw pack.  */
+#define FP_PACK_S(val, X)			\
+  do						\
+    {						\
+      _FP_PACK_CANONICAL (S, 1, X);		\
+      _FP_PACK_RAW_1 (S, (val), X);		\
+    }						\
+  while (0)
+
+#define FP_PACK_SP(val, X)			\
+  do						\
+    {						\
+      _FP_PACK_CANONICAL (S, 1, X);		\
+      if (!FP_INHIBIT_RESULTS)			\
+	_FP_PACK_RAW_1_P (S, (val), X);		\
+    }						\
+  while (0)
+
+/* Convert back from the semi-raw form, then raw pack.  */
+#define FP_PACK_SEMIRAW_S(val, X)		\
+  do						\
+    {						\
+      _FP_PACK_SEMIRAW (S, 1, X);		\
+      _FP_PACK_RAW_1 (S, (val), X);		\
+    }						\
+  while (0)
+
+#define FP_PACK_SEMIRAW_SP(val, X)		\
+  do						\
+    {						\
+      _FP_PACK_SEMIRAW (S, 1, X);		\
+      if (!FP_INHIBIT_RESULTS)			\
+	_FP_PACK_RAW_1_P (S, (val), X);		\
+    }						\
+  while (0)
+
+/* Single-precision arithmetic: each macro forwards to the generic
+   implementation with the format tag S and a word count of 1.  */
+#define FP_ISSIGNAN_S(X)		_FP_ISSIGNAN (S, 1, X)
+#define FP_NEG_S(R, X)			_FP_NEG (S, 1, R, X)
+#define FP_ADD_S(R, X, Y)		_FP_ADD (S, 1, R, X, Y)
+#define FP_SUB_S(R, X, Y)		_FP_SUB (S, 1, R, X, Y)
+#define FP_MUL_S(R, X, Y)		_FP_MUL (S, 1, R, X, Y)
+#define FP_DIV_S(R, X, Y)		_FP_DIV (S, 1, R, X, Y)
+#define FP_SQRT_S(R, X)			_FP_SQRT (S, 1, R, X)
+#define _FP_SQRT_MEAT_S(R, S, T, X, Q)	_FP_SQRT_MEAT_1 (R, S, T, X, (Q))
+
+/* Fused multiply-add.  The third argument of _FP_FMA is the word count
+   of the double-width intermediate: two words when a word is narrower
+   than 64 bits, otherwise one.  */
+#if _FP_W_TYPE_SIZE < 64
+# define FP_FMA_S(R, X, Y, Z)	_FP_FMA (S, 1, 2, R, X, Y, Z)
+#else
+# define FP_FMA_S(R, X, Y, Z)	_FP_FMA (S, 1, 1, R, X, Y, Z)
+#endif
+
+/* Comparisons.  */
+#define FP_CMP_S(r, X, Y, un, ex)	_FP_CMP (S, 1, (r), X, Y, (un), (ex))
+#define FP_CMP_EQ_S(r, X, Y, ex)	_FP_CMP_EQ (S, 1, (r), X, Y, (ex))
+#define FP_CMP_UNORD_S(r, X, Y, ex)	_FP_CMP_UNORD (S, 1, (r), X, Y, (ex))
+
+/* Conversions to and from integers.  */
+#define FP_TO_INT_S(r, X, rsz, rsg)	_FP_TO_INT (S, 1, (r), X, (rsz), (rsg))
+#define FP_TO_INT_ROUND_S(r, X, rsz, rsg)	\
+  _FP_TO_INT_ROUND (S, 1, (r), X, (rsz), (rsg))
+#define FP_FROM_INT_S(X, r, rs, rt)	_FP_FROM_INT (S, 1, X, (r), (rs), rt)
+
+/* High-word accessors.  The plain and raw forms both map to the single
+   fraction word; the double-width form follows the same word-size split
+   as FP_FMA_S.  */
+#define _FP_FRAC_HIGH_S(X)	_FP_FRAC_HIGH_1 (X)
+#define _FP_FRAC_HIGH_RAW_S(X)	_FP_FRAC_HIGH_1 (X)
+
+#if _FP_W_TYPE_SIZE < 64
+# define _FP_FRAC_HIGH_DW_S(X)	_FP_FRAC_HIGH_2 (X)
+#else
+# define _FP_FRAC_HIGH_DW_S(X)	_FP_FRAC_HIGH_1 (X)
+#endif
+
+#endif /* !SOFT_FP_SINGLE_H */
--- a/src/gemm/soft-fp/soft-fp.h
+++ b/src/gemm/soft-fp/soft-fp.h
@ -0,0 +1,230 @@
+#ifndef __SOFT_FP_H__
+#define __SOFT_FP_H__
+
+#include "sfp-machine.h"
+
+/* NOTE(review): this defines abort() as a function-like macro with an
+   empty expansion, so every `abort ()` that follows becomes an empty
+   statement instead of terminating the program, in every translation
+   unit that includes this header.  Confirm that is intended.  */
+#define abort()   // 54
+/* For unreachable default cases in switch statements over bitwise OR
+   of FP_CLS_* values.  With GCC >= 4.5 this is __builtin_unreachable ();
+   otherwise it is abort (), which the macro above reduces to nothing.  */
+#if (defined __GNUC__							\
+     && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)))
+# define _FP_UNREACHABLE	__builtin_unreachable ()
+#else
+# define _FP_UNREACHABLE	abort ()
+#endif
+// 63
+/* Compile-time assertion.  _Static_assert is used with GCC >= 4.6 or a
+   compiler reporting C11 or later.  The fallback declares a function
+   returning a pointer to an array whose size expression contains a
+   bit-field of width -1 when expr is false, which fails to compile; msg
+   is not used in that branch.  */
+#if ((defined __GNUC__							\
+      && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))	\
+     || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 201112L))
+# define _FP_STATIC_ASSERT(expr, msg)		\
+  _Static_assert ((expr), msg)
+#else
+# define _FP_STATIC_ASSERT(expr, msg)					\
+  extern int (*__Static_assert_function (void))				\
+    [!!sizeof (struct { int __error_if_negative: (expr) ? 2 : -1; })]
+#endif
+
+
+/* Initializer suffix that expands to `= 0`.  Presumably appended to the
+   declarations emitted by _FP_DECL -- confirm in the op-*.h headers.  */
+#define _FP_ZERO_INIT	  = 0		// 82
+/* Three extra low-order bits are carried below the result's least
+   significant bit.  In a fraction word, bit 3 is the result LSB and
+   bits 2, 1 and 0 are named round, guard and sticky; the _FP_ROUND*
+   macros test these masks.  */
+#define _FP_WORKBITS		3			// 85
+#define _FP_WORK_LSB		((_FP_W_TYPE) 1 << 3)
+#define _FP_WORK_ROUND		((_FP_W_TYPE) 1 << 2)	// 87
+#define _FP_WORK_GUARD		((_FP_W_TYPE) 1 << 1)
+#define _FP_WORK_STICKY		((_FP_W_TYPE) 1 << 0)	// 89
+
+/* Rounding-mode selectors.  These defaults apply only when
+   FP_RND_NEAREST has not already been defined (for example by
+   sfp-machine.h, which is included above).  */
+#ifndef FP_RND_NEAREST
+# define FP_RND_NEAREST		0
+# define FP_RND_ZERO		1
+# define FP_RND_PINF		2
+# define FP_RND_MINF		3
+#endif
+/* Active rounding mode; round-to-nearest unless overridden.  */
+#ifndef FP_ROUNDMODE
+# define FP_ROUNDMODE		FP_RND_NEAREST
+#endif
+
+/* By default don't care about exceptions.  Every FP_EX_* flag below
+   falls back to 0 unless it was defined before this point, so with the
+   defaults FP_SET_EXCEPTION ORs zero into the accumulator.  */	// 101
+#ifndef FP_EX_INVALID
+# define FP_EX_INVALID		0
+#endif
+#ifndef FP_EX_OVERFLOW
+# define FP_EX_OVERFLOW		0
+#endif
+#ifndef FP_EX_UNDERFLOW
+# define FP_EX_UNDERFLOW	0
+#endif
+#ifndef FP_EX_DIVZERO
+# define FP_EX_DIVZERO		0
+#endif
+#ifndef FP_EX_INEXACT
+# define FP_EX_INEXACT		0
+#endif
+#ifndef FP_EX_DENORM
+# define FP_EX_DENORM		0
+#endif
+
+/* Sub-exceptions of "invalid".  Each also defaults to 0.  */		// 121
+/* Signaling NaN operand.  */
+#ifndef FP_EX_INVALID_SNAN
+# define FP_EX_INVALID_SNAN	0
+#endif
+/* Inf * 0.  */							// 126
+#ifndef FP_EX_INVALID_IMZ
+# define FP_EX_INVALID_IMZ	0
+#endif
+
+/* Inf - Inf.  */						// 134
+#ifndef FP_EX_INVALID_ISI
+# define FP_EX_INVALID_ISI	0
+#endif
+/* 0 / 0.  */
+#ifndef FP_EX_INVALID_ZDZ
+# define FP_EX_INVALID_ZDZ	0
+#endif
+/* Inf / Inf.  */
+#ifndef FP_EX_INVALID_IDI
+# define FP_EX_INVALID_IDI	0
+#endif
+
+/* Invalid conversion to integer.  */
+#ifndef FP_EX_INVALID_CVI
+# define FP_EX_INVALID_CVI	0
+#endif
+/* Invalid comparison.  */				// 154
+#ifndef FP_EX_INVALID_VC				
+# define FP_EX_INVALID_VC	0			
+#endif
+
+/* _FP_STRUCT_LAYOUT may be defined as an attribute to determine the
+   struct layout variant used for structures where bit-fields are used
+   to access specific parts of binary floating-point numbers.  This is
+   required for systems where the default ABI uses struct layout with
+   differences in how consecutive bit-fields are laid out from the
+   default expected by soft-fp.  */
+#ifndef _FP_STRUCT_LAYOUT
+# define _FP_STRUCT_LAYOUT
+#endif
+
+/* Declare the per-function exception accumulator _fex, initialised to
+   0, followed by any machine-specific declarations supplied through
+   _FP_DECL_EX.  */
+#ifdef _FP_DECL_EX
+# define FP_DECL_EX					\
+  int _fex = 0;						\
+  _FP_DECL_EX
+#else
+# define FP_DECL_EX int _fex = 0
+#endif
+
+/* Every hook below is a default: it is defined only when sfp-machine.h
+   (included above) has not already provided its own definition, matching
+   the #ifndef convention used for FP_ROUNDMODE and the FP_EX_* flags.  */
+
+/* Initialize any machine-specific state used in FP_ROUNDMODE,
+   FP_TRAPPING_EXCEPTIONS or FP_HANDLE_EXCEPTIONS.  */
+#ifndef FP_INIT_ROUNDMODE
+# define FP_INIT_ROUNDMODE do {} while (0)
+#endif
+
+/* Initialize any machine-specific state used in
+   FP_TRAPPING_EXCEPTIONS or FP_HANDLE_EXCEPTIONS.  */
+#ifndef FP_INIT_TRAPPING_EXCEPTIONS
+# define FP_INIT_TRAPPING_EXCEPTIONS FP_INIT_ROUNDMODE
+#endif
+
+/* Initialize any machine-specific state used in
+   FP_HANDLE_EXCEPTIONS.  */
+#ifndef FP_INIT_EXCEPTIONS
+# define FP_INIT_EXCEPTIONS FP_INIT_TRAPPING_EXCEPTIONS
+#endif
+
+/* Called at the end of each routine; the default does nothing.  */
+#ifndef FP_HANDLE_EXCEPTIONS
+# define FP_HANDLE_EXCEPTIONS do {} while (0)
+#endif
+
+/* Tested by the FP_PACK_*P macros in single.h and double.h before they
+   store a result; the default of 0 means results are always stored.  */
+#ifndef FP_INHIBIT_RESULTS
+# define FP_INHIBIT_RESULTS 0
+#endif
+
+#ifndef FP_DENORM_ZERO
+# define FP_DENORM_ZERO 0
+#endif
+
+/* OR exception bits into, and read back, the accumulator declared by
+   FP_DECL_EX.  */
+#define FP_SET_EXCEPTION(ex)	_fex |= (ex)
+#define FP_CUR_EXCEPTIONS		 (_fex)
+
+#ifndef FP_TRAPPING_EXCEPTIONS
+# define FP_TRAPPING_EXCEPTIONS 0
+#endif
+ 
+
+													// 259
+/* Round to nearest, ties to even.  _FP_WORK_ROUND (half of the result
+   LSB) is added unless the low four bits are exactly 0b0100, i.e. the
+   result LSB is clear and the work bits are exactly one half; in that
+   case the value is left as it is.  Any carry out of the work bits
+   performs the actual increment.  */
+#define _FP_ROUND_NEAREST(wc, X)				\
+  do								\
+    {								\
+      if ((_FP_FRAC_LOW_##wc (X) & 15) != _FP_WORK_ROUND)	\
+	_FP_FRAC_ADDI_##wc (X, _FP_WORK_ROUND);			\
+    }								\
+  while (0)
+
+/* Round toward zero: no adjustment.  */
+#define _FP_ROUND_ZERO(wc, X)		(void) 0
+
+/* Round toward +Inf: add one result LSB when the sign field X_s is zero
+   and any of the three work bits is set.  */
+#define _FP_ROUND_PINF(wc, X)				\
+  do							\
+    {							\
+      if (!X##_s && (_FP_FRAC_LOW_##wc (X) & 7))	\
+	_FP_FRAC_ADDI_##wc (X, _FP_WORK_LSB);		\
+    }							\
+  while (0)
+
+/* Round toward -Inf: add one result LSB when the sign field X_s is
+   nonzero and any of the three work bits is set.  */
+#define _FP_ROUND_MINF(wc, X)			\
+  do						\
+    {						\
+      if (X##_s && (_FP_FRAC_LOW_##wc (X) & 7))	\
+	_FP_FRAC_ADDI_##wc (X, _FP_WORK_LSB);	\
+    }						\
+  while (0)
+
+/* Round X according to FP_ROUNDMODE.  Nothing happens when the three
+   work bits are all zero.  Otherwise FP_EX_INEXACT is raised and the
+   matching _FP_ROUND_* macro is applied.  The switch has no default
+   case, so a mode other than the four FP_RND_* values leaves the
+   fraction unchanged.  */
+#define _FP_ROUND(wc, X)			\
+  do						\
+    {						\
+      if (_FP_FRAC_LOW_##wc (X) & 7)		\
+	{					\
+	  FP_SET_EXCEPTION (FP_EX_INEXACT);	\
+	  switch (FP_ROUNDMODE)			\
+	    {					\
+	    case FP_RND_NEAREST:		\
+	      _FP_ROUND_NEAREST (wc, X);	\
+	      break;				\
+	    case FP_RND_ZERO:			\
+	      _FP_ROUND_ZERO (wc, X);		\
+	      break;				\
+	    case FP_RND_PINF:			\
+	      _FP_ROUND_PINF (wc, X);		\
+	      break;				\
+	    case FP_RND_MINF:			\
+	      _FP_ROUND_MINF (wc, X);		\
+	      break;				\
+	    }					\
+	}					\
+    }						\
+  while (0)
+
+/* Operand class codes; each fits in two bits.  */
+#define FP_CLS_NORMAL		0		// 310
+#define FP_CLS_ZERO		1
+#define FP_CLS_INF		2
+#define FP_CLS_NAN		3
+
+/* Combine two class codes into one value: x is shifted left by two
+   bits and OR-ed with y.  */
+#define _FP_CLS_COMBINE(x, y)	(((x) << 2) | (y))	// 315
+
+#include "op-1.h"
+#include "op-2.h"
+#include "op-4.h"
+#include "op-8.h"
+#include "op-common.h"
+
+/* Sigh.  Silly things longlong.h needs.  The two macros map its word
+   type and word size onto the soft-fp configuration.  */
+#define UWtype		_FP_W_TYPE
+#define W_TYPE_SIZE	_FP_W_TYPE_SIZE
+
+/* Integer types selected through the GCC `mode` attribute (QI, SI and
+   DI machine modes), in signed and unsigned forms.  */
+typedef int QItype __attribute__ ((mode (QI)));
+typedef int SItype __attribute__ ((mode (SI)));
+typedef int DItype __attribute__ ((mode (DI)));
+typedef unsigned int UQItype __attribute__ ((mode (QI)));
+typedef unsigned int USItype __attribute__ ((mode (SI)));
+typedef unsigned int UDItype __attribute__ ((mode (DI)));
+/* Unsigned half-word type: HI mode for a 32-bit word, USItype for a
+   64-bit word.  For any other word size UHWtype is left undefined.  */
+#if _FP_W_TYPE_SIZE == 32
+typedef unsigned int UHWtype __attribute__ ((mode (HI)));
+#elif _FP_W_TYPE_SIZE == 64
+typedef USItype UHWtype;
+#endif
+
+/* Return type of the comparison routines; int unless overridden.  */
+#ifndef CMPtype
+# define CMPtype	int
+#endif
+
+/* Bit widths of SItype and DItype, passed as the result size to the
+   FP_TO_INT_* macros by the conversion routines.  */
+#define SI_BITS		(__CHAR_BIT__ * (int) sizeof (SItype))
+#define DI_BITS		(__CHAR_BIT__ * (int) sizeof (DItype))
+
+#include "longlong.h"
+
+#endif
+
+
+
--- a/src/gemm/soft-fp/subdf3.c
+++ b/src/gemm/soft-fp/subdf3.c
@ -0,0 +1,21 @@
+#include "soft-fp.h"
+#include "double.h"
+
+/* Double-precision subtraction runtime routine; aa-README.txt documents
+   the result as a - b.  The operands are unpacked into A and B, the
+   difference is formed in R by FP_SUB_D, and R is packed into r.  */
+DFtype
+__subdf3 (DFtype a, DFtype b)
+{
+  /* Local exception accumulator plus the unpacked operands and result.  */
+  FP_DECL_EX;
+  FP_DECL_D (A);
+  FP_DECL_D (B);
+  FP_DECL_D (R);
+  DFtype r;
+
+  FP_INIT_ROUNDMODE;
+  /* Semi-raw unpack and pack are used as a matched pair.  */
+  FP_UNPACK_SEMIRAW_D (A, a);
+  FP_UNPACK_SEMIRAW_D (B, b);
+  FP_SUB_D (R, A, B);
+  FP_PACK_SEMIRAW_D (r, R);
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
--- a/src/gemm/soft-fp/subsf3.c
+++ b/src/gemm/soft-fp/subsf3.c
@ -0,0 +1,22 @@
+#include "soft-fp.h"
+#include "single.h"
+
+/* Single-precision subtraction runtime routine; aa-README.txt documents
+   the result as a - b.  The operands are unpacked into A and B, the
+   difference is formed in R by FP_SUB_S, and R is packed into r.  */
+SFtype
+__subsf3 (SFtype a, SFtype b)
+{
+  /* Local exception accumulator plus the unpacked operands and result.  */
+  FP_DECL_EX;
+  FP_DECL_S (A);
+  FP_DECL_S (B);
+  FP_DECL_S (R);
+  SFtype r;
+
+  FP_INIT_ROUNDMODE;
+  /* Semi-raw unpack and pack are used as a matched pair.  */
+  FP_UNPACK_SEMIRAW_S (A, a);
+  FP_UNPACK_SEMIRAW_S (B, b);
+  FP_SUB_S (R, A, B);
+  FP_PACK_SEMIRAW_S (r, R);
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
+
--- a/src/gemm/soft-fp/truncdfsf2.c
+++ b/src/gemm/soft-fp/truncdfsf2.c
@ -0,0 +1,24 @@
+#include "soft-fp.h"
+#include "single.h"
+#include "double.h"
+
+/* Double to single conversion runtime routine.  The double a is
+   unpacked into A, narrowed into the single-precision R by FP_TRUNC,
+   and R is packed into r.  */
+SFtype
+__truncdfsf2 (DFtype a)
+{
+  FP_DECL_EX;
+  FP_DECL_D (A);
+  FP_DECL_S (R);
+  SFtype r;
+
+  FP_INIT_ROUNDMODE;
+  FP_UNPACK_SEMIRAW_D (A, a);
+  /* The fourth FP_TRUNC argument is 2 when one word is narrower than
+     the double fraction (_FP_FRACBITS_D bits), 1 otherwise; presumably
+     it is the source word count -- confirm against op-common.h.  */
+#if _FP_W_TYPE_SIZE < _FP_FRACBITS_D
+  FP_TRUNC (S, D, 1, 2, R, A);
+#else
+  FP_TRUNC (S, D, 1, 1, R, A);
+#endif
+  FP_PACK_SEMIRAW_S (r, R);
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
--- a/src/linpack/Makefile
+++ b/src/linpack/Makefile
@ -0,0 +1,3 @@
+# Name of the program built from this directory.
+NAME = linpack
+# Sources: every .c file found under soft-fp/ plus linpack.c.  The find
+# command is given a relative path, so it resolves against the directory
+# make runs in.
+SRCS = $(shell find soft-fp/ -name "*.c") linpack.c
+# Shared build rules come from the Makefile under $(AM_HOME).
+include $(AM_HOME)/Makefile
--- a/src/linpack/soft-fp/aa-README.txt
+++ b/src/linpack/soft-fp/aa-README.txt
@ -0,0 +1,76 @@
+https://gcc.gnu.org/onlinedocs/gccint/Soft-float-library-routines.html
+
+1.Arithmetic functions
+
+Runtime Function: float __addsf3 (float a, float b)
+Runtime Function: double __adddf3 (double a, double b)
+    These functions return the sum of a and b.
+
+Runtime Function: float __subsf3 (float a, float b)
+Runtime Function: double __subdf3 (double a, double b)
+    These functions return the difference of a and b; that is, a - b.
+
+Runtime Function: float __mulsf3 (float a, float b)
+Runtime Function: double __muldf3 (double a, double b)
+    These functions return the product of a and b.
+
+Runtime Function: float __divsf3 (float a, float b)
+Runtime Function: double __divdf3 (double a, double b)
+    These functions return the quotient of a and b; that is, a / b.
+
+Runtime Function: float __negsf2 (float a)
+Runtime Function: double __negdf2 (double a)
+    These functions return the negation of a. They simply flip the sign bit, so they can produce negative zero and negative NaN.
+
+2.Conversion functions
+
+Runtime Function: double __extendsfdf2 (float a)
+    These functions extend a to the wider mode of their return type.
+
+Runtime Function: float __truncdfsf2 (double a)
+    These functions truncate a to the narrower mode of their return type, rounding toward zero.
+
+Runtime Function: int __fixsfsi (float a)
+Runtime Function: int __fixdfsi (double a)
+    These functions convert a to a signed integer, rounding toward zero.
+
+Runtime Function: long __fixsfdi (float a)
+Runtime Function: long __fixdfdi (double a)
+    These functions convert a to a signed long, rounding toward zero.
+
+Runtime Function: long long __fixsfti (float a)
+Runtime Function: long long __fixdfti (double a)
+    These functions convert a to a signed long long, rounding toward zero.
+
+
+Runtime Function: unsigned int __fixunssfsi (float a)
+Runtime Function: unsigned int __fixunsdfsi (double a)
+    These functions convert a to an unsigned integer, rounding toward zero. Negative values all become zero.
+
+Runtime Function: unsigned long __fixunssfdi (float a)
+Runtime Function: unsigned long __fixunsdfdi (double a)
+    These functions convert a to an unsigned long, rounding toward zero. Negative values all become zero.
+
+Runtime Function: unsigned long long __fixunssfti (float a)
+Runtime Function: unsigned long long __fixunsdfti (double a)
+    These functions convert a to an unsigned long long, rounding toward zero. Negative values all become zero.
+
+
+Runtime Function: float __floatsisf (int i)
+Runtime Function: double __floatsidf (int i)
+    These functions convert i, a signed integer, to floating point.
+
+Runtime Function: float __floatdisf (long i)
+Runtime Function: double __floatdidf (long i)
+    These functions convert i, a signed long, to floating point.
+
+
+Runtime Function: float __floatunsisf (unsigned int i)
+Runtime Function: double __floatunsidf (unsigned int i)
+    These functions convert i, an unsigned integer, to floating point.
+
+Runtime Function: float __floatundisf (unsigned long i)
+Runtime Function: double __floatundidf (unsigned long i)
+    These functions convert i, an unsigned long, to floating point.
+
+3.Comparison functions
--- a/src/linpack/soft-fp/adddf3.c
+++ b/src/linpack/soft-fp/adddf3.c
@ -0,0 +1,21 @@
+#include "soft-fp.h"
+#include "double.h"
+
+/* Double-precision addition runtime routine; aa-README.txt documents
+   the result as the sum of a and b.  The operands are unpacked into A
+   and B, added into R by FP_ADD_D, and R is packed into r.  */
+DFtype
+__adddf3 (DFtype a, DFtype b)
+{
+  /* Exception bookkeeping plus the unpacked operands and result.  */
+  FP_DECL_EX;
+  FP_DECL_D (A);
+  FP_DECL_D (B);
+  FP_DECL_D (R);
+  DFtype r;
+
+  FP_INIT_ROUNDMODE;
+  /* Semi-raw unpack and pack are used as a matched pair.  */
+  FP_UNPACK_SEMIRAW_D (A, a);
+  FP_UNPACK_SEMIRAW_D (B, b);
+  FP_ADD_D (R, A, B);
+  FP_PACK_SEMIRAW_D (r, R);
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
--- a/src/linpack/soft-fp/addsf3.c
+++ b/src/linpack/soft-fp/addsf3.c
@ -0,0 +1,23 @@
+#include "soft-fp.h"
+#include "single.h"
+
+/* Single-precision addition runtime routine; aa-README.txt documents
+   the result as the sum of a and b.  The operands are unpacked into A
+   and B, added into R by FP_ADD_S, and R is packed into r.  */
+SFtype
+__addsf3 (SFtype a, SFtype b)
+{
+  /* Exception bookkeeping plus the unpacked operands and result.  */
+  FP_DECL_EX;
+  FP_DECL_S (A);
+  FP_DECL_S (B);
+  FP_DECL_S (R);
+  SFtype r;
+
+  FP_INIT_ROUNDMODE;
+  /* Semi-raw unpack and pack are used as a matched pair.  */
+  FP_UNPACK_SEMIRAW_S (A, a);
+  FP_UNPACK_SEMIRAW_S (B, b);
+  FP_ADD_S (R, A, B);
+  FP_PACK_SEMIRAW_S (r, R);
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
+
+
--- a/src/linpack/soft-fp/divdf3.c
+++ b/src/linpack/soft-fp/divdf3.c
@ -0,0 +1,21 @@
+#include "soft-fp.h"
+#include "double.h"
+
+/* Double-precision division runtime routine; aa-README.txt documents
+   the result as a / b.  Unlike the add/sub routines this one uses the
+   canonical unpack/pack pair (FP_UNPACK_D / FP_PACK_D) around
+   FP_DIV_D.  */
+DFtype
+__divdf3 (DFtype a, DFtype b)
+{
+  /* Exception bookkeeping plus the unpacked operands and result.  */
+  FP_DECL_EX;
+  FP_DECL_D (A);
+  FP_DECL_D (B);
+  FP_DECL_D (R);
+  DFtype r;
+
+  FP_INIT_ROUNDMODE;
+  FP_UNPACK_D (A, a);
+  FP_UNPACK_D (B, b);
+  FP_DIV_D (R, A, B);
+  FP_PACK_D (r, R);
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
--- a/src/linpack/soft-fp/divsf3.c
+++ b/src/linpack/soft-fp/divsf3.c
@ -0,0 +1,21 @@
+#include "soft-fp.h"
+#include "single.h"
+
+/* Single-precision division runtime routine; aa-README.txt documents
+   the result as a / b.  Unlike the add/sub routines this one uses the
+   canonical unpack/pack pair (FP_UNPACK_S / FP_PACK_S) around
+   FP_DIV_S.  */
+SFtype
+__divsf3 (SFtype a, SFtype b)
+{
+  /* Exception bookkeeping plus the unpacked operands and result.  */
+  FP_DECL_EX;
+  FP_DECL_S (A);
+  FP_DECL_S (B);
+  FP_DECL_S (R);
+  SFtype r;
+
+  FP_INIT_ROUNDMODE;
+  FP_UNPACK_S (A, a);
+  FP_UNPACK_S (B, b);
+  FP_DIV_S (R, A, B);
+  FP_PACK_S (r, R);
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
--- a/src/linpack/soft-fp/double.h
+++ b/src/linpack/soft-fp/double.h
@ -0,0 +1,323 @@
+/* Software floating-point emulation.
+   Definitions for IEEE Double Precision
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com),
+		  Jakub Jelinek (jj@ultra.linux.cz),
+		  David S. Miller (davem@redhat.com) and
+		  Peter Maydell (pmaydell@chiark.greenend.org.uk).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef SOFT_FP_DOUBLE_H
+#define SOFT_FP_DOUBLE_H	1
+
+/* Word sizes below 32 bits are rejected at compile time.  */
+#if _FP_W_TYPE_SIZE < 32
+# error "Here's a nickel kid.  Go buy yourself a real computer."
+#endif
+
+/* Bits of fraction storage: two words (four for the double-width form)
+   when a word is narrower than 64 bits, otherwise one word (two).  */
+#if _FP_W_TYPE_SIZE < 64
+# define _FP_FRACTBITS_D	(2 * _FP_W_TYPE_SIZE)
+# define _FP_FRACTBITS_DW_D	(4 * _FP_W_TYPE_SIZE)
+#else
+# define _FP_FRACTBITS_D	_FP_W_TYPE_SIZE
+# define _FP_FRACTBITS_DW_D	(2 * _FP_W_TYPE_SIZE)
+#endif
+
+/* Format parameters.  _FP_FRACBITS_D counts 53 fraction bits; the union
+   below stores one bit fewer whenever _FP_IMPLBIT_D is nonzero.
+   _FP_WFRACBITS_D adds the _FP_WORKBITS working bits, and the *XBITS
+   values are the storage bits left over.  */
+#define _FP_FRACBITS_D		53
+#define _FP_FRACXBITS_D		(_FP_FRACTBITS_D - _FP_FRACBITS_D)
+#define _FP_WFRACBITS_D		(_FP_WORKBITS + _FP_FRACBITS_D)
+#define _FP_WFRACXBITS_D	(_FP_FRACTBITS_D - _FP_WFRACBITS_D)
+#define _FP_EXPBITS_D		11
+#define _FP_EXPBIAS_D		1023
+#define _FP_EXPMAX_D		2047
+
+/* Single-bit masks.  `%` binds tighter than `<<`, so each shift count
+   is the bit index reduced modulo the word size, i.e. the position of
+   the bit inside the word that holds it.  The _SH forms are the same
+   bits moved up by _FP_WORKBITS.  */
+#define _FP_QNANBIT_D		\
+	((_FP_W_TYPE) 1 << (_FP_FRACBITS_D-2) % _FP_W_TYPE_SIZE)
+#define _FP_QNANBIT_SH_D		\
+	((_FP_W_TYPE) 1 << (_FP_FRACBITS_D-2+_FP_WORKBITS) % _FP_W_TYPE_SIZE)
+#define _FP_IMPLBIT_D		\
+	((_FP_W_TYPE) 1 << (_FP_FRACBITS_D-1) % _FP_W_TYPE_SIZE)
+#define _FP_IMPLBIT_SH_D		\
+	((_FP_W_TYPE) 1 << (_FP_FRACBITS_D-1+_FP_WORKBITS) % _FP_W_TYPE_SIZE)
+#define _FP_OVERFLOW_D		\
+	((_FP_W_TYPE) 1 << _FP_WFRACBITS_D % _FP_W_TYPE_SIZE)
+
+/* Double-width counterparts of the working-fraction sizes.  */
+#define _FP_WFRACBITS_DW_D	(2 * _FP_WFRACBITS_D)
+#define _FP_WFRACXBITS_DW_D	(_FP_FRACTBITS_DW_D - _FP_WFRACBITS_DW_D)
+#define _FP_HIGHBIT_DW_D	\
+  ((_FP_W_TYPE) 1 << (_FP_WFRACBITS_DW_D - 1) % _FP_W_TYPE_SIZE)
+
+/* The floating type handled by this header, selected through the GCC
+   DF machine mode.  */
+typedef float DFtype __attribute__ ((mode (DF)));
+
+#if _FP_W_TYPE_SIZE < 64
+
+/* Overlay of a DFtype with its bit-fields, for word sizes below 64
+   bits.  The fraction is split into frac0 (one full word) and frac1
+   (the remaining stored fraction bits); exp and sign complete the
+   value.  The field order is reversed between the two byte orders.
+   NOTE(review): if neither __BYTE_ORDER nor __BIG_ENDIAN is defined
+   when this is preprocessed, both evaluate to 0 in the #if and the
+   big-endian layout is chosen -- confirm a header defines them.  */
+union _FP_UNION_D
+{
+  DFtype flt;
+  struct _FP_STRUCT_LAYOUT
+  {
+# if __BYTE_ORDER == __BIG_ENDIAN
+    unsigned sign  : 1;
+    unsigned exp   : _FP_EXPBITS_D;
+    unsigned frac1 : _FP_FRACBITS_D - (_FP_IMPLBIT_D != 0) - _FP_W_TYPE_SIZE;
+    unsigned frac0 : _FP_W_TYPE_SIZE;
+# else
+    unsigned frac0 : _FP_W_TYPE_SIZE;
+    unsigned frac1 : _FP_FRACBITS_D - (_FP_IMPLBIT_D != 0) - _FP_W_TYPE_SIZE;
+    unsigned exp   : _FP_EXPBITS_D;
+    unsigned sign  : 1;
+# endif
+  } bits;
+};
+
+/* Declaration, unpack and pack macros for word sizes below 64 bits:
+   everything is routed to the two-word (_2) helpers.  The plain forms
+   take the value itself; the forms ending in P use the _P raw helpers
+   (presumably taking a pointer -- confirm against op-2.h).  */
+# define FP_DECL_D(X)		_FP_DECL (2, X)
+# define FP_UNPACK_RAW_D(X, val)	_FP_UNPACK_RAW_2 (D, X, (val))
+# define FP_UNPACK_RAW_DP(X, val)	_FP_UNPACK_RAW_2_P (D, X, (val))
+# define FP_PACK_RAW_D(val, X)	_FP_PACK_RAW_2 (D, (val), X)
+/* The raw store is skipped when FP_INHIBIT_RESULTS is nonzero.  */
+# define FP_PACK_RAW_DP(val, X)			\
+  do						\
+    {						\
+      if (!FP_INHIBIT_RESULTS)			\
+	_FP_PACK_RAW_2_P (D, (val), X);		\
+    }						\
+  while (0)
+
+/* Raw unpack followed by _FP_UNPACK_CANONICAL.  */
+# define FP_UNPACK_D(X, val)			\
+  do						\
+    {						\
+      _FP_UNPACK_RAW_2 (D, X, (val));		\
+      _FP_UNPACK_CANONICAL (D, 2, X);		\
+    }						\
+  while (0)
+
+# define FP_UNPACK_DP(X, val)			\
+  do						\
+    {						\
+      _FP_UNPACK_RAW_2_P (D, X, (val));		\
+      _FP_UNPACK_CANONICAL (D, 2, X);		\
+    }						\
+  while (0)
+
+/* Raw unpack followed by _FP_UNPACK_SEMIRAW.  */
+# define FP_UNPACK_SEMIRAW_D(X, val)		\
+  do						\
+    {						\
+      _FP_UNPACK_RAW_2 (D, X, (val));		\
+      _FP_UNPACK_SEMIRAW (D, 2, X);		\
+    }						\
+  while (0)
+
+# define FP_UNPACK_SEMIRAW_DP(X, val)		\
+  do						\
+    {						\
+      _FP_UNPACK_RAW_2_P (D, X, (val));		\
+      _FP_UNPACK_SEMIRAW (D, 2, X);		\
+    }						\
+  while (0)
+
+/* _FP_PACK_CANONICAL followed by a raw store.  */
+# define FP_PACK_D(val, X)			\
+  do						\
+    {						\
+      _FP_PACK_CANONICAL (D, 2, X);		\
+      _FP_PACK_RAW_2 (D, (val), X);		\
+    }						\
+  while (0)
+
+/* As FP_PACK_D, but the store is skipped when FP_INHIBIT_RESULTS is
+   nonzero; the canonical step always runs.  */
+# define FP_PACK_DP(val, X)			\
+  do						\
+    {						\
+      _FP_PACK_CANONICAL (D, 2, X);		\
+      if (!FP_INHIBIT_RESULTS)			\
+	_FP_PACK_RAW_2_P (D, (val), X);		\
+    }						\
+  while (0)
+
+/* _FP_PACK_SEMIRAW followed by a raw store.  */
+# define FP_PACK_SEMIRAW_D(val, X)		\
+  do						\
+    {						\
+      _FP_PACK_SEMIRAW (D, 2, X);		\
+      _FP_PACK_RAW_2 (D, (val), X);		\
+    }						\
+  while (0)
+
+/* As FP_PACK_SEMIRAW_D, but the store is skipped when
+   FP_INHIBIT_RESULTS is nonzero.  */
+# define FP_PACK_SEMIRAW_DP(val, X)		\
+  do						\
+    {						\
+      _FP_PACK_SEMIRAW (D, 2, X);		\
+      if (!FP_INHIBIT_RESULTS)			\
+	_FP_PACK_RAW_2_P (D, (val), X);		\
+    }						\
+  while (0)
+
+/* Double-precision front ends for word sizes below 64 bits.  Each one
+   forwards to the generic _FP_* macro with format tag D and 2 as the
+   second argument; FP_FMA_D additionally passes 4.  */
+# define FP_ISSIGNAN_D(X)		_FP_ISSIGNAN (D, 2, X)
+# define FP_NEG_D(R, X)			_FP_NEG (D, 2, R, X)
+# define FP_ADD_D(R, X, Y)		_FP_ADD (D, 2, R, X, Y)
+# define FP_SUB_D(R, X, Y)		_FP_SUB (D, 2, R, X, Y)
+# define FP_MUL_D(R, X, Y)		_FP_MUL (D, 2, R, X, Y)
+# define FP_DIV_D(R, X, Y)		_FP_DIV (D, 2, R, X, Y)
+# define FP_SQRT_D(R, X)		_FP_SQRT (D, 2, R, X)
+# define _FP_SQRT_MEAT_D(R, S, T, X, Q)	_FP_SQRT_MEAT_2 (R, S, T, X, (Q))
+# define FP_FMA_D(R, X, Y, Z)		_FP_FMA (D, 2, 4, R, X, Y, Z)
+
+/* Comparisons; the result is written to r.  */
+# define FP_CMP_D(r, X, Y, un, ex)	_FP_CMP (D, 2, (r), X, Y, (un), (ex))
+# define FP_CMP_EQ_D(r, X, Y, ex)	_FP_CMP_EQ (D, 2, (r), X, Y, (ex))
+# define FP_CMP_UNORD_D(r, X, Y, ex)	_FP_CMP_UNORD (D, 2, (r), X, Y, (ex))
+
+/* Integer conversions; rsz, rsg, rs and rt are passed straight through
+   to the generic macros.  */
+# define FP_TO_INT_D(r, X, rsz, rsg)	_FP_TO_INT (D, 2, (r), X, (rsz), (rsg))
+# define FP_TO_INT_ROUND_D(r, X, rsz, rsg)	\
+  _FP_TO_INT_ROUND (D, 2, (r), X, (rsz), (rsg))
+# define FP_FROM_INT_D(X, r, rs, rt)	_FP_FROM_INT (D, 2, X, (r), (rs), rt)
+
+/* High fraction word accessors: the two-word helper for the normal
+   forms, the four-word helper for the double-width form.  */
+# define _FP_FRAC_HIGH_D(X)	_FP_FRAC_HIGH_2 (X)
+# define _FP_FRAC_HIGH_RAW_D(X)	_FP_FRAC_HIGH_2 (X)
+
+# define _FP_FRAC_HIGH_DW_D(X)	_FP_FRAC_HIGH_4 (X)
+
+#else
+
+/* Overlay of a DFtype with its bit-fields, for word sizes of 64 bits or
+   more.  The stored fraction fits in a single _FP_W_TYPE bit-field; exp
+   and sign complete the value.  The field order is reversed between
+   the two byte orders.
+   NOTE(review): if neither __BYTE_ORDER nor __BIG_ENDIAN is defined
+   when this is preprocessed, both evaluate to 0 in the #if and the
+   big-endian layout is chosen -- confirm a header defines them.  */
+union _FP_UNION_D
+{
+  DFtype flt;
+  struct _FP_STRUCT_LAYOUT
+  {
+# if __BYTE_ORDER == __BIG_ENDIAN
+    unsigned sign   : 1;
+    unsigned exp    : _FP_EXPBITS_D;
+    _FP_W_TYPE frac : _FP_FRACBITS_D - (_FP_IMPLBIT_D != 0);
+# else
+    _FP_W_TYPE frac : _FP_FRACBITS_D - (_FP_IMPLBIT_D != 0);
+    unsigned exp    : _FP_EXPBITS_D;
+    unsigned sign   : 1;
+# endif
+  } bits;
+};
+
+/* Declaration, unpack and pack macros for word sizes of 64 bits or
+   more: everything is routed to the one-word (_1) helpers.  The plain
+   forms take the value itself; the forms ending in P use the _P raw
+   helpers (presumably taking a pointer -- confirm against op-1.h).  */
+# define FP_DECL_D(X)		_FP_DECL (1, X)
+# define FP_UNPACK_RAW_D(X, val)	_FP_UNPACK_RAW_1 (D, X, (val))
+# define FP_UNPACK_RAW_DP(X, val)	_FP_UNPACK_RAW_1_P (D, X, (val))
+# define FP_PACK_RAW_D(val, X)	_FP_PACK_RAW_1 (D, (val), X)
+/* The raw store is skipped when FP_INHIBIT_RESULTS is nonzero.  */
+# define FP_PACK_RAW_DP(val, X)			\
+  do						\
+    {						\
+      if (!FP_INHIBIT_RESULTS)			\
+	_FP_PACK_RAW_1_P (D, (val), X);		\
+    }						\
+  while (0)
+
+/* Raw unpack followed by _FP_UNPACK_CANONICAL.  */
+# define FP_UNPACK_D(X, val)			\
+  do						\
+    {						\
+      _FP_UNPACK_RAW_1 (D, X, (val));		\
+      _FP_UNPACK_CANONICAL (D, 1, X);		\
+    }						\
+  while (0)
+
+# define FP_UNPACK_DP(X, val)			\
+  do						\
+    {						\
+      _FP_UNPACK_RAW_1_P (D, X, (val));		\
+      _FP_UNPACK_CANONICAL (D, 1, X);		\
+    }						\
+  while (0)
+
+/* Raw unpack followed by _FP_UNPACK_SEMIRAW.  */
+# define FP_UNPACK_SEMIRAW_D(X, val)		\
+  do						\
+    {						\
+      _FP_UNPACK_RAW_1 (D, X, (val));		\
+      _FP_UNPACK_SEMIRAW (D, 1, X);		\
+    }						\
+  while (0)
+
+# define FP_UNPACK_SEMIRAW_DP(X, val)		\
+  do						\
+    {						\
+      _FP_UNPACK_RAW_1_P (D, X, (val));		\
+      _FP_UNPACK_SEMIRAW (D, 1, X);		\
+    }						\
+  while (0)
+
+/* _FP_PACK_CANONICAL followed by a raw store.  */
+# define FP_PACK_D(val, X)			\
+  do						\
+    {						\
+      _FP_PACK_CANONICAL (D, 1, X);		\
+      _FP_PACK_RAW_1 (D, (val), X);		\
+    }						\
+  while (0)
+
+/* As FP_PACK_D, but the store is skipped when FP_INHIBIT_RESULTS is
+   nonzero; the canonical step always runs.  */
+# define FP_PACK_DP(val, X)			\
+  do						\
+    {						\
+      _FP_PACK_CANONICAL (D, 1, X);		\
+      if (!FP_INHIBIT_RESULTS)			\
+	_FP_PACK_RAW_1_P (D, (val), X);		\
+    }						\
+  while (0)
+
+/* _FP_PACK_SEMIRAW followed by a raw store.  */
+# define FP_PACK_SEMIRAW_D(val, X)		\
+  do						\
+    {						\
+      _FP_PACK_SEMIRAW (D, 1, X);		\
+      _FP_PACK_RAW_1 (D, (val), X);		\
+    }						\
+  while (0)
+
+/* As FP_PACK_SEMIRAW_D, but the store is skipped when
+   FP_INHIBIT_RESULTS is nonzero.  */
+# define FP_PACK_SEMIRAW_DP(val, X)		\
+  do						\
+    {						\
+      _FP_PACK_SEMIRAW (D, 1, X);		\
+      if (!FP_INHIBIT_RESULTS)			\
+	_FP_PACK_RAW_1_P (D, (val), X);		\
+    }						\
+  while (0)
+
+/* Double-precision front ends for word sizes of 64 bits or more.  Each
+   one forwards to the generic _FP_* macro with format tag D and 1 as
+   the second argument; FP_FMA_D additionally passes 2.  */
+# define FP_ISSIGNAN_D(X)		_FP_ISSIGNAN (D, 1, X)
+# define FP_NEG_D(R, X)			_FP_NEG (D, 1, R, X)
+# define FP_ADD_D(R, X, Y)		_FP_ADD (D, 1, R, X, Y)
+# define FP_SUB_D(R, X, Y)		_FP_SUB (D, 1, R, X, Y)
+# define FP_MUL_D(R, X, Y)		_FP_MUL (D, 1, R, X, Y)
+# define FP_DIV_D(R, X, Y)		_FP_DIV (D, 1, R, X, Y)
+# define FP_SQRT_D(R, X)		_FP_SQRT (D, 1, R, X)
+# define _FP_SQRT_MEAT_D(R, S, T, X, Q)	_FP_SQRT_MEAT_1 (R, S, T, X, (Q))
+# define FP_FMA_D(R, X, Y, Z)		_FP_FMA (D, 1, 2, R, X, Y, Z)
+
+/* The implementation of _FP_MUL_D and _FP_DIV_D should be chosen by
+   the target machine.  */
+
+/* Comparisons; the result is written to r.  */
+# define FP_CMP_D(r, X, Y, un, ex)	_FP_CMP (D, 1, (r), X, Y, (un), (ex))
+# define FP_CMP_EQ_D(r, X, Y, ex)	_FP_CMP_EQ (D, 1, (r), X, Y, (ex))
+# define FP_CMP_UNORD_D(r, X, Y, ex)	_FP_CMP_UNORD (D, 1, (r), X, Y, (ex))
+
+/* Integer conversions; rsz, rsg, rs and rt are passed straight through
+   to the generic macros.  */
+# define FP_TO_INT_D(r, X, rsz, rsg)	_FP_TO_INT (D, 1, (r), X, (rsz), (rsg))
+# define FP_TO_INT_ROUND_D(r, X, rsz, rsg)	\
+  _FP_TO_INT_ROUND (D, 1, (r), X, (rsz), (rsg))
+# define FP_FROM_INT_D(X, r, rs, rt)	_FP_FROM_INT (D, 1, X, (r), (rs), rt)
+
+/* High fraction word accessors: the one-word helper for the normal
+   forms, the two-word helper for the double-width form.  */
+# define _FP_FRAC_HIGH_D(X)	_FP_FRAC_HIGH_1 (X)
+# define _FP_FRAC_HIGH_RAW_D(X)	_FP_FRAC_HIGH_1 (X)
+
+# define _FP_FRAC_HIGH_DW_D(X)	_FP_FRAC_HIGH_2 (X)
+
+#endif /* W_TYPE_SIZE < 64 */
+
+#endif /* !SOFT_FP_DOUBLE_H */
--- a/src/linpack/soft-fp/eqdf2.c
+++ b/src/linpack/soft-fp/eqdf2.c
@ -0,0 +1,21 @@
+#include "soft-fp.h"
+#include "double.h"
+
+/* Double-precision equality comparison runtime routine.  Both operands
+   are unpacked raw and compared by FP_CMP_EQ_D, which writes the
+   result into r.  The literal 1 is the macro's `ex` argument.
+   NOTE(review): the GCC soft-float contract expects 0 when the operands
+   are equal and neither is NaN -- confirm _FP_CMP_EQ matches.  */
+CMPtype
+__eqdf2 (DFtype a, DFtype b)
+{
+  FP_DECL_EX;
+  FP_DECL_D (A);
+  FP_DECL_D (B);
+  CMPtype r;
+
+  FP_INIT_EXCEPTIONS;
+  FP_UNPACK_RAW_D (A, a);
+  FP_UNPACK_RAW_D (B, b);
+  FP_CMP_EQ_D (r, A, B, 1);
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
+
+/* __nedf2 is exported as an alias of the same routine; strong_alias is
+   not defined in this file and must come from an included header.  */
+strong_alias (__eqdf2, __nedf2);
--- a/src/linpack/soft-fp/eqsf2.c
+++ b/src/linpack/soft-fp/eqsf2.c
@ -0,0 +1,21 @@
+#include "soft-fp.h"
+#include "single.h"
+
+/* Single-precision equality comparison runtime routine.  Both operands
+   are unpacked raw and compared by FP_CMP_EQ_S, which writes the
+   result into r.  The literal 1 is the macro's `ex` argument.
+   NOTE(review): the GCC soft-float contract expects 0 when the operands
+   are equal and neither is NaN -- confirm _FP_CMP_EQ matches.  */
+CMPtype
+__eqsf2 (SFtype a, SFtype b)
+{
+  FP_DECL_EX;
+  FP_DECL_S (A);
+  FP_DECL_S (B);
+  CMPtype r;
+
+  FP_INIT_EXCEPTIONS;
+  FP_UNPACK_RAW_S (A, a);
+  FP_UNPACK_RAW_S (B, b);
+  FP_CMP_EQ_S (r, A, B, 1);
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
+
+/* __nesf2 is exported as an alias of the same routine; strong_alias is
+   not defined in this file and must come from an included header.  */
+strong_alias (__eqsf2, __nesf2);
--- a/src/linpack/soft-fp/extendsfdf2.c
+++ b/src/linpack/soft-fp/extendsfdf2.c
@ -0,0 +1,26 @@
+#define FP_NO_EXACT_UNDERFLOW
+#include "soft-fp.h"
+#include "single.h"
+#include "double.h"
+
+/* Single to double conversion runtime routine; aa-README.txt documents
+   it as extending a to the wider type.  The single a is unpacked raw
+   into A, widened into R by FP_EXTEND, and R is packed raw into r.  */
+DFtype
+__extendsfdf2 (SFtype a)
+{
+  FP_DECL_EX;
+  FP_DECL_S (A);
+  FP_DECL_D (R);
+  DFtype r;
+
+  FP_INIT_EXCEPTIONS;
+  FP_UNPACK_RAW_S (A, a);
+  /* The third FP_EXTEND argument is 2 when one word is narrower than
+     the double fraction (_FP_FRACBITS_D bits), 1 otherwise; presumably
+     it is the destination word count -- confirm against op-common.h.  */
+#if _FP_W_TYPE_SIZE < _FP_FRACBITS_D
+  FP_EXTEND (D, S, 2, 1, R, A);
+#else
+  FP_EXTEND (D, S, 1, 1, R, A);
+#endif
+  FP_PACK_RAW_D (r, R);
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
+
--- a/src/linpack/soft-fp/fixdfdi.c
+++ b/src/linpack/soft-fp/fixdfdi.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a to 64bit signed integer
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "soft-fp.h"
+#include "double.h"
+
+/* Double to 64-bit signed integer conversion; aa-README.txt documents
+   it as rounding toward zero.  */
+DItype
+__fixdfdi (DFtype a)
+{
+  FP_DECL_EX;
+  FP_DECL_D (A);
+  /* The result is built in an unsigned variable and converted to the
+     signed return type by the return statement.  */
+  UDItype r;
+
+  FP_INIT_EXCEPTIONS;
+  FP_UNPACK_RAW_D (A, a);
+  /* DI_BITS is the result size (rsz); the trailing 1 is the rsg
+     argument, presumably selecting a signed result -- confirm against
+     _FP_TO_INT.  */
+  FP_TO_INT_D (r, A, DI_BITS, 1);
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
--- a/src/linpack/soft-fp/fixdfsi.c
+++ b/src/linpack/soft-fp/fixdfsi.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a to 32bit signed integer
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "soft-fp.h"
+#include "double.h"
+
+/* Double to 32-bit signed integer conversion; aa-README.txt documents
+   it as rounding toward zero.  */
+SItype
+__fixdfsi (DFtype a)
+{
+  FP_DECL_EX;
+  FP_DECL_D (A);
+  /* The result is built in an unsigned variable and converted to the
+     signed return type by the return statement.  */
+  USItype r;
+
+  FP_INIT_EXCEPTIONS;
+  FP_UNPACK_RAW_D (A, a);
+  /* SI_BITS is the result size (rsz); the trailing 1 is the rsg
+     argument, presumably selecting a signed result -- confirm against
+     _FP_TO_INT.  */
+  FP_TO_INT_D (r, A, SI_BITS, 1);
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
--- a/src/linpack/soft-fp/fixdfti.c
+++ b/src/linpack/soft-fp/fixdfti.c
@ -0,0 +1,46 @@
+/* Software floating-point emulation.
+   Convert IEEE double to 128bit signed integer
+   Copyright (C) 2007-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Uros Bizjak (ubizjak@gmail.com).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+// #include "soft-fp.h"
+// #include "double.h"
+
+// TItype
+// __fixdfti (DFtype a)
+// {
+//   FP_DECL_EX;
+//   FP_DECL_D (A);
+//   UTItype r;
+
+//   FP_INIT_EXCEPTIONS;
+//   FP_UNPACK_RAW_D (A, a);
+//   FP_TO_INT_D (r, A, TI_BITS, 1);
+//   FP_HANDLE_EXCEPTIONS;
+
+//   return r;
+// }
--- a/src/linpack/soft-fp/fixsfdi.c
+++ b/src/linpack/soft-fp/fixsfdi.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a to 64bit signed integer
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "soft-fp.h"
+#include "single.h"
+
+/* Single to 64-bit signed integer conversion; aa-README.txt documents
+   it as rounding toward zero.  */
+DItype
+__fixsfdi (SFtype a)
+{
+  FP_DECL_EX;
+  FP_DECL_S (A);
+  /* The result is built in an unsigned variable and converted to the
+     signed return type by the return statement.  */
+  UDItype r;
+
+  FP_INIT_EXCEPTIONS;
+  FP_UNPACK_RAW_S (A, a);
+  /* DI_BITS is the result size; the trailing 1 is presumably the
+     signedness selector -- confirm against _FP_TO_INT.  */
+  FP_TO_INT_S (r, A, DI_BITS, 1);
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
--- a/src/linpack/soft-fp/fixsfsi.c
+++ b/src/linpack/soft-fp/fixsfsi.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a to 32bit signed integer
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "soft-fp.h"
+#include "single.h"
+
+SItype
+__fixsfsi (SFtype a)  /* Convert the SFtype value a to a 32-bit signed integer.  */
+{
+  FP_DECL_EX;  /* Presumably declares the exception state used by the FP_*_EXCEPTIONS macros below (defined in soft-fp headers, not shown).  */
+  FP_DECL_S (A);  /* Working variable A for the unpacked form of a.  */
+  USItype r;  /* Unsigned scratch for the result; converted to SItype by the return.  */
+
+  FP_INIT_EXCEPTIONS;
+  FP_UNPACK_RAW_S (A, a);  /* Unpack a into A.  */
+  FP_TO_INT_S (r, A, SI_BITS, 1);  /* Last argument is 1 here and 0 in __fixunssfsi: presumably the "signed result" flag -- confirm in op-common.h.  */
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
--- a/src/linpack/soft-fp/fixsfti.c
+++ b/src/linpack/soft-fp/fixsfti.c
@ -0,0 +1,46 @@
+/* Software floating-point emulation.
+   Convert IEEE single to 128bit signed integer
+   Copyright (C) 2007-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Uros Bizjak (ubizjak@gmail.com).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+// #include "soft-fp.h"
+// #include "single.h"
+
+// TItype
+// __fixsfti (SFtype a)
+// {
+//   FP_DECL_EX;
+//   FP_DECL_S (A);
+//   UTItype r;
+
+//   FP_INIT_EXCEPTIONS;
+//   FP_UNPACK_RAW_S (A, a);
+//   FP_TO_INT_S (r, A, TI_BITS, 1);
+//   FP_HANDLE_EXCEPTIONS;
+
+//   return r;
+// }
--- a/src/linpack/soft-fp/fixunsdfdi.c
+++ b/src/linpack/soft-fp/fixunsdfdi.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a to 64bit unsigned integer
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "soft-fp.h"
+#include "double.h"
+
+UDItype
+__fixunsdfdi (DFtype a)  /* Convert the DFtype value a to a 64-bit unsigned integer.  */
+{
+  FP_DECL_EX;  /* Presumably declares the exception state used by the FP_*_EXCEPTIONS macros below (defined in soft-fp headers, not shown).  */
+  FP_DECL_D (A);  /* Working variable A for the unpacked form of a.  */
+  UDItype r;  /* Receives the converted integer.  */
+
+  FP_INIT_EXCEPTIONS;
+  FP_UNPACK_RAW_D (A, a);  /* Unpack a into A.  */
+  FP_TO_INT_D (r, A, DI_BITS, 0);  /* Last argument is 0 for this unsigned variant (the signed __fix* routines pass 1) -- presumably the signedness flag.  */
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
--- a/src/linpack/soft-fp/fixunsdfsi.c
+++ b/src/linpack/soft-fp/fixunsdfsi.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a to 32bit unsigned integer
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "soft-fp.h"
+#include "double.h"
+
+USItype
+__fixunsdfsi (DFtype a)  /* Convert the DFtype value a to a 32-bit unsigned integer.  */
+{
+  FP_DECL_EX;  /* Presumably declares the exception state used by the FP_*_EXCEPTIONS macros below (defined in soft-fp headers, not shown).  */
+  FP_DECL_D (A);  /* Working variable A for the unpacked form of a.  */
+  USItype r;  /* Receives the converted integer.  */
+
+  FP_INIT_EXCEPTIONS;
+  FP_UNPACK_RAW_D (A, a);  /* Unpack a into A.  */
+  FP_TO_INT_D (r, A, SI_BITS, 0);  /* Last argument is 0 for this unsigned variant (the signed __fix* routines pass 1) -- presumably the signedness flag.  */
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
--- a/src/linpack/soft-fp/fixunsdfti.c
+++ b/src/linpack/soft-fp/fixunsdfti.c
@ -0,0 +1,46 @@
+/* Software floating-point emulation.
+   Convert IEEE double to 128bit unsigned integer
+   Copyright (C) 2007-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Uros Bizjak (ubizjak@gmail.com).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+// #include "soft-fp.h"
+// #include "double.h"
+
+// UTItype
+// __fixunsdfti (DFtype a)
+// {
+//   FP_DECL_EX;
+//   FP_DECL_D (A);
+//   UTItype r;
+
+//   FP_INIT_EXCEPTIONS;
+//   FP_UNPACK_RAW_D (A, a);
+//   FP_TO_INT_D (r, A, TI_BITS, 0);
+//   FP_HANDLE_EXCEPTIONS;
+
+//   return r;
+// }
--- a/src/linpack/soft-fp/fixunssfdi.c
+++ b/src/linpack/soft-fp/fixunssfdi.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a to 64bit unsigned integer
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "soft-fp.h"
+#include "single.h"
+
+UDItype
+__fixunssfdi (SFtype a)  /* Convert the SFtype value a to a 64-bit unsigned integer.  */
+{
+  FP_DECL_EX;  /* Presumably declares the exception state used by the FP_*_EXCEPTIONS macros below (defined in soft-fp headers, not shown).  */
+  FP_DECL_S (A);  /* Working variable A for the unpacked form of a.  */
+  UDItype r;  /* Receives the converted integer.  */
+
+  FP_INIT_EXCEPTIONS;
+  FP_UNPACK_RAW_S (A, a);  /* Unpack a into A.  */
+  FP_TO_INT_S (r, A, DI_BITS, 0);  /* Last argument is 0 for this unsigned variant (__fixsfdi passes 1) -- presumably the signedness flag.  */
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
--- a/src/linpack/soft-fp/fixunssfsi.c
+++ b/src/linpack/soft-fp/fixunssfsi.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a to 32bit unsigned integer
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "soft-fp.h"
+#include "single.h"
+
+USItype
+__fixunssfsi (SFtype a)  /* Convert the SFtype value a to a 32-bit unsigned integer.  */
+{
+  FP_DECL_EX;  /* Presumably declares the exception state used by the FP_*_EXCEPTIONS macros below (defined in soft-fp headers, not shown).  */
+  FP_DECL_S (A);  /* Working variable A for the unpacked form of a.  */
+  USItype r;  /* Receives the converted integer.  */
+
+  FP_INIT_EXCEPTIONS;
+  FP_UNPACK_RAW_S (A, a);  /* Unpack a into A.  */
+  FP_TO_INT_S (r, A, SI_BITS, 0);  /* Last argument is 0 for this unsigned variant (__fixsfsi passes 1) -- presumably the signedness flag.  */
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
--- a/src/linpack/soft-fp/fixunssfti.c
+++ b/src/linpack/soft-fp/fixunssfti.c
@ -0,0 +1,46 @@
+/* Software floating-point emulation.
+   Convert IEEE single to 128bit unsigned integer
+   Copyright (C) 2007-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Uros Bizjak (ubizjak@gmail.com).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+// #include "soft-fp.h"
+// #include "single.h"
+
+// UTItype
+// __fixunssfti (SFtype a)
+// {
+//   FP_DECL_EX;
+//   FP_DECL_S (A);
+//   UTItype r;
+
+//   FP_INIT_EXCEPTIONS;
+//   FP_UNPACK_RAW_S (A, a);
+//   FP_TO_INT_S (r, A, TI_BITS, 0);
+//   FP_HANDLE_EXCEPTIONS;
+
+//   return r;
+// }
--- a/src/linpack/soft-fp/floatdidf.c
+++ b/src/linpack/soft-fp/floatdidf.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a 64bit signed integer to IEEE double
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "soft-fp.h"
+#include "double.h"
+
+DFtype
+__floatdidf (DItype i)  /* Convert the 64-bit signed integer i to a DFtype (IEEE double per the file header).  */
+{
+  FP_DECL_EX;  /* Presumably declares the exception state consumed by FP_HANDLE_EXCEPTIONS (defined in soft-fp headers, not shown).  */
+  FP_DECL_D (A);  /* Working variable A that receives the unpacked floating-point form of i.  */
+  DFtype a;  /* Packed result returned to the caller.  */
+
+  FP_INIT_ROUNDMODE;
+  FP_FROM_INT_D (A, i, DI_BITS, UDItype);  /* Build A from i; DI_BITS and UDItype presumably give the integer width and the unsigned type the macro computes in.  */
+  FP_PACK_RAW_D (a, A);  /* Pack A into a.  */
+  FP_HANDLE_EXCEPTIONS;
+
+  return a;
+}
--- a/src/linpack/soft-fp/floatdisf.c
+++ b/src/linpack/soft-fp/floatdisf.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a 64bit signed integer to IEEE single
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "soft-fp.h"
+#include "single.h"
+
+SFtype
+__floatdisf (DItype i)  /* Convert the 64-bit signed integer i to an SFtype (IEEE single per the file header).  */
+{
+  FP_DECL_EX;  /* Presumably declares the exception state consumed by FP_HANDLE_EXCEPTIONS (defined in soft-fp headers, not shown).  */
+  FP_DECL_S (A);  /* Working variable A that receives the unpacked floating-point form of i.  */
+  SFtype a;  /* Packed result returned to the caller.  */
+
+  FP_INIT_ROUNDMODE;
+  FP_FROM_INT_S (A, i, DI_BITS, UDItype);  /* Build A from i; DI_BITS and UDItype presumably give the integer width and the unsigned type the macro computes in.  */
+  FP_PACK_RAW_S (a, A);  /* Pack A into a.  */
+  FP_HANDLE_EXCEPTIONS;
+
+  return a;
+}
--- a/src/linpack/soft-fp/floatsidf.c
+++ b/src/linpack/soft-fp/floatsidf.c
@ -0,0 +1,49 @@
+/* Software floating-point emulation.
+   Convert a 32bit signed integer to IEEE double
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define FP_NO_EXCEPTIONS
+#include "soft-fp.h"
+#include "double.h"
+
+DFtype
+__floatsidf (SItype i)  /* Convert the 32-bit signed integer i to a DFtype (IEEE double per the file header).  */
+{
+  FP_DECL_EX;  /* NOTE(review): this file defines FP_NO_EXCEPTIONS before including soft-fp.h; check how that affects the exception macros used here.  */
+  FP_DECL_D (A);  /* Working variable A that receives the unpacked floating-point form of i.  */
+  DFtype a;  /* Packed result returned to the caller.  */
+
+  FP_INIT_ROUNDMODE;
+  FP_FROM_INT_D (A, i, SI_BITS, USItype);  /* Build A from i; SI_BITS and USItype presumably give the integer width and the unsigned type the macro computes in.  */
+  FP_PACK_RAW_D (a, A);  /* Pack A into a.  */
+  FP_HANDLE_EXCEPTIONS;
+
+  return a;
+}
+
--- a/src/linpack/soft-fp/floatsisf.c
+++ b/src/linpack/soft-fp/floatsisf.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a 32bit signed integer to IEEE single
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "soft-fp.h"
+#include "single.h"
+
+SFtype
+__floatsisf (SItype i)  /* Convert the 32-bit signed integer i to an SFtype (IEEE single per the file header).  */
+{
+  FP_DECL_EX;  /* Presumably declares the exception state consumed by FP_HANDLE_EXCEPTIONS (defined in soft-fp headers, not shown).  */
+  FP_DECL_S (A);  /* Working variable A that receives the unpacked floating-point form of i.  */
+  SFtype a;  /* Packed result returned to the caller.  */
+
+  FP_INIT_ROUNDMODE;
+  FP_FROM_INT_S (A, i, SI_BITS, USItype);  /* Build A from i; SI_BITS and USItype presumably give the integer width and the unsigned type the macro computes in.  */
+  FP_PACK_RAW_S (a, A);  /* Pack A into a.  */
+  FP_HANDLE_EXCEPTIONS;
+
+  return a;
+}
--- a/src/linpack/soft-fp/floatundidf.c
+++ b/src/linpack/soft-fp/floatundidf.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a 64bit unsigned integer to IEEE double
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "soft-fp.h"
+#include "double.h"
+
+DFtype
+__floatundidf (UDItype i)  /* Convert the 64-bit unsigned integer i to a DFtype (IEEE double per the file header).  */
+{
+  FP_DECL_EX;  /* Presumably declares the exception state consumed by FP_HANDLE_EXCEPTIONS (defined in soft-fp headers, not shown).  */
+  FP_DECL_D (A);  /* Working variable A that receives the unpacked floating-point form of i.  */
+  DFtype a;  /* Packed result returned to the caller.  */
+
+  FP_INIT_ROUNDMODE;
+  FP_FROM_INT_D (A, i, DI_BITS, UDItype);  /* Same macro arguments as __floatdidf; only the declared type of i differs (UDItype here).  */
+  FP_PACK_RAW_D (a, A);  /* Pack A into a.  */
+  FP_HANDLE_EXCEPTIONS;
+
+  return a;
+}
--- a/src/linpack/soft-fp/floatundisf.c
+++ b/src/linpack/soft-fp/floatundisf.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a 64bit unsigned integer to IEEE single
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "soft-fp.h"
+#include "single.h"
+
+SFtype
+__floatundisf (UDItype i)  /* Convert the 64-bit unsigned integer i to an SFtype (IEEE single per the file header).  */
+{
+  FP_DECL_EX;  /* Presumably declares the exception state consumed by FP_HANDLE_EXCEPTIONS (defined in soft-fp headers, not shown).  */
+  FP_DECL_S (A);  /* Working variable A that receives the unpacked floating-point form of i.  */
+  SFtype a;  /* Packed result returned to the caller.  */
+
+  FP_INIT_ROUNDMODE;
+  FP_FROM_INT_S (A, i, DI_BITS, UDItype);  /* Same macro arguments as __floatdisf; only the declared type of i differs (UDItype here).  */
+  FP_PACK_RAW_S (a, A);  /* Pack A into a.  */
+  FP_HANDLE_EXCEPTIONS;
+
+  return a;
+}
--- a/src/linpack/soft-fp/floatunsidf.c
+++ b/src/linpack/soft-fp/floatunsidf.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a 32bit unsigned integer to IEEE double
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define FP_NO_EXCEPTIONS
+#include "soft-fp.h"
+#include "double.h"
+
+DFtype
+__floatunsidf (USItype i)  /* Convert the 32-bit unsigned integer i to a DFtype (IEEE double per the file header).  */
+{
+  FP_DECL_EX;  /* NOTE(review): this file defines FP_NO_EXCEPTIONS and, unlike __floatsidf, never invokes FP_HANDLE_EXCEPTIONS -- confirm that is intended.  */
+  FP_DECL_D (A);  /* Working variable A that receives the unpacked floating-point form of i.  */
+  DFtype a;  /* Packed result returned to the caller.  */
+
+  FP_INIT_ROUNDMODE;
+  FP_FROM_INT_D (A, i, SI_BITS, USItype);  /* Build A from i; SI_BITS and USItype presumably give the integer width and the unsigned type the macro computes in.  */
+  FP_PACK_RAW_D (a, A);  /* Pack A into a.  */
+
+  return a;
+}
--- a/src/linpack/soft-fp/floatunsisf.c
+++ b/src/linpack/soft-fp/floatunsisf.c
@ -0,0 +1,47 @@
+/* Software floating-point emulation.
+   Convert a 32bit unsigned integer to IEEE single
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com) and
+		  Jakub Jelinek (jj@ultra.linux.cz).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "soft-fp.h"
+#include "single.h"
+
+SFtype
+__floatunsisf (USItype i)  /* Convert the 32-bit unsigned integer i to an SFtype (IEEE single per the file header).  */
+{
+  FP_DECL_EX;  /* Presumably declares the exception state consumed by FP_HANDLE_EXCEPTIONS (defined in soft-fp headers, not shown).  */
+  FP_DECL_S (A);  /* Working variable A that receives the unpacked floating-point form of i.  */
+  SFtype a;  /* Packed result returned to the caller.  */
+
+  FP_INIT_ROUNDMODE;
+  FP_FROM_INT_S (A, i, SI_BITS, USItype);  /* Build A from i; SI_BITS and USItype presumably give the integer width and the unsigned type the macro computes in.  */
+  FP_PACK_RAW_S (a, A);  /* Pack A into a.  */
+  FP_HANDLE_EXCEPTIONS;
+
+  return a;
+}
--- a/src/linpack/soft-fp/gedf2.c
+++ b/src/linpack/soft-fp/gedf2.c
@ -0,0 +1,21 @@
+#include "soft-fp.h"
+#include "double.h"
+
+CMPtype
+__gedf2 (DFtype a, DFtype b)  /* Compare the DFtype values a and b and return the FP_CMP_D result.  */
+{
+  FP_DECL_EX;  /* Presumably declares the exception state used by the FP_*_EXCEPTIONS macros below (defined in soft-fp headers, not shown).  */
+  FP_DECL_D (A);  /* Working variable for the unpacked form of a.  */
+  FP_DECL_D (B);  /* Working variable for the unpacked form of b.  */
+  CMPtype r;  /* Comparison result returned to the caller.  */
+
+  FP_INIT_EXCEPTIONS;
+  FP_UNPACK_RAW_D (A, a);
+  FP_UNPACK_RAW_D (B, b);
+  FP_CMP_D (r, A, B, -2, 2);  /* -2 is presumably the value stored in r for unordered (NaN) operands (__ledf2 passes 2 instead); meaning of the final 2 is not visible here -- confirm in op-common.h.  */
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
+
+strong_alias (__gedf2, __gtdf2);  /* Presumably exports the same function under the name __gtdf2 (strong_alias is defined elsewhere).  */
--- a/src/linpack/soft-fp/gesf2.c
+++ b/src/linpack/soft-fp/gesf2.c
@ -0,0 +1,21 @@
+#include "soft-fp.h"
+#include "single.h"
+
+CMPtype
+__gesf2 (SFtype a, SFtype b)  /* Compare the SFtype values a and b and return the FP_CMP_S result.  */
+{
+  FP_DECL_EX;  /* Presumably declares the exception state used by the FP_*_EXCEPTIONS macros below (defined in soft-fp headers, not shown).  */
+  FP_DECL_S (A);  /* Working variable for the unpacked form of a.  */
+  FP_DECL_S (B);  /* Working variable for the unpacked form of b.  */
+  CMPtype r;  /* Comparison result returned to the caller.  */
+
+  FP_INIT_EXCEPTIONS;
+  FP_UNPACK_RAW_S (A, a);
+  FP_UNPACK_RAW_S (B, b);
+  FP_CMP_S (r, A, B, -2, 2);  /* -2 is presumably the value stored in r for unordered (NaN) operands (__lesf2 passes 2 instead); meaning of the final 2 is not visible here -- confirm in op-common.h.  */
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
+
+strong_alias (__gesf2, __gtsf2);  /* Presumably exports the same function under the name __gtsf2 (strong_alias is defined elsewhere).  */
--- a/src/linpack/soft-fp/ledf2.c
+++ b/src/linpack/soft-fp/ledf2.c
@ -0,0 +1,21 @@
+#include "soft-fp.h"
+#include "double.h"
+
+CMPtype
+__ledf2 (DFtype a, DFtype b)  /* Compare the DFtype values a and b and return the FP_CMP_D result.  */
+{
+  FP_DECL_EX;  /* Presumably declares the exception state used by the FP_*_EXCEPTIONS macros below (defined in soft-fp headers, not shown).  */
+  FP_DECL_D (A);  /* Working variable for the unpacked form of a.  */
+  FP_DECL_D (B);  /* Working variable for the unpacked form of b.  */
+  CMPtype r;  /* Comparison result returned to the caller.  */
+
+  FP_INIT_EXCEPTIONS;
+  FP_UNPACK_RAW_D (A, a);
+  FP_UNPACK_RAW_D (B, b);
+  FP_CMP_D (r, A, B, 2, 2);  /* First 2 is presumably the value stored in r for unordered (NaN) operands (__gedf2 passes -2 instead); meaning of the final 2 is not visible here -- confirm in op-common.h.  */
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
+
+strong_alias (__ledf2, __ltdf2);  /* Presumably exports the same function under the name __ltdf2 (strong_alias is defined elsewhere).  */
--- a/src/linpack/soft-fp/lesf2.c
+++ b/src/linpack/soft-fp/lesf2.c
@ -0,0 +1,22 @@
+#include "soft-fp.h"
+#include "single.h"
+
+CMPtype
+__lesf2 (SFtype a, SFtype b)  /* Compare the SFtype values a and b and return the FP_CMP_S result.  */
+{
+  FP_DECL_EX;  /* Presumably declares the exception state used by the FP_*_EXCEPTIONS macros below (defined in soft-fp headers, not shown).  */
+  FP_DECL_S (A);  /* Working variable for the unpacked form of a.  */
+  FP_DECL_S (B);  /* Working variable for the unpacked form of b.  */
+  CMPtype r;  /* Comparison result returned to the caller.  */
+
+  FP_INIT_EXCEPTIONS;
+  FP_UNPACK_RAW_S (A, a);
+  FP_UNPACK_RAW_S (B, b);
+  FP_CMP_S (r, A, B, 2, 2);  /* First 2 is presumably the value stored in r for unordered (NaN) operands (__gesf2 passes -2 instead); meaning of the final 2 is not visible here -- confirm in op-common.h.  */
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
+
+strong_alias (__lesf2, __ltsf2);  /* Presumably exports the same function under the name __ltsf2 (strong_alias is defined elsewhere).  */
+
--- a/src/linpack/soft-fp/longlong.h
+++ b/src/linpack/soft-fp/longlong.h
--- a/src/linpack/soft-fp/muldf3.c
+++ b/src/linpack/soft-fp/muldf3.c
@ -0,0 +1,21 @@
+#include "soft-fp.h"
+#include "double.h"
+
+DFtype
+__muldf3 (DFtype a, DFtype b)  /* Multiply the DFtype values a and b via FP_MUL_D and return the packed product.  */
+{
+  FP_DECL_EX;  /* Presumably declares the exception state consumed by FP_HANDLE_EXCEPTIONS (defined in soft-fp headers, not shown).  */
+  FP_DECL_D (A);  /* Unpacked form of a.  */
+  FP_DECL_D (B);  /* Unpacked form of b.  */
+  FP_DECL_D (R);  /* Unpacked result of the multiplication.  */
+  DFtype r;  /* Packed result returned to the caller.  */
+
+  FP_INIT_ROUNDMODE;
+  FP_UNPACK_D (A, a);  /* Uses the non-RAW unpack/pack macros, unlike the conversion and negate routines in this directory.  */
+  FP_UNPACK_D (B, b);
+  FP_MUL_D (R, A, B);  /* R = A * B in unpacked form.  */
+  FP_PACK_D (r, R);  /* Pack R into r.  */
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
--- a/src/linpack/soft-fp/mulsf3.c
+++ b/src/linpack/soft-fp/mulsf3.c
@ -0,0 +1,22 @@
+#include "soft-fp.h"
+#include "single.h"
+
+/* Single-precision soft-float multiply: returns the product of A and
+   B.  Both operands are unpacked with FP_UNPACK_S, multiplied with
+   FP_MUL_S and repacked with FP_PACK_S under the rounding mode set up
+   by FP_INIT_ROUNDMODE; FP_HANDLE_EXCEPTIONS runs before the result
+   is returned.  */
+SFtype
+__mulsf3 (SFtype a, SFtype b)
+{
+  FP_DECL_EX;
+  FP_DECL_S (lhs);
+  FP_DECL_S (rhs);
+  FP_DECL_S (prod);
+  SFtype packed;
+
+  FP_INIT_ROUNDMODE;
+  FP_UNPACK_S (lhs, a);
+  FP_UNPACK_S (rhs, b);
+  FP_MUL_S (prod, lhs, rhs);
+  FP_PACK_S (packed, prod);
+  FP_HANDLE_EXCEPTIONS;
+
+  return packed;
+}
+
--- a/src/linpack/soft-fp/negdf2.c
+++ b/src/linpack/soft-fp/negdf2.c
@ -0,0 +1,16 @@
+#include "soft-fp.h"
+#include "double.h"
+
+/* Double-precision soft-float negation: returns the result of
+   FP_NEG_D applied to A.  Only the raw unpack/pack macros are used,
+   and no exception or rounding-mode macros are invoked.  */
+DFtype
+__negdf2 (DFtype a)
+{
+  FP_DECL_D (src);
+  FP_DECL_D (neg);
+  DFtype packed;
+
+  FP_UNPACK_RAW_D (src, a);
+  FP_NEG_D (neg, src);
+  FP_PACK_RAW_D (packed, neg);
+
+  return packed;
+}
--- a/src/linpack/soft-fp/negsf2.c
+++ b/src/linpack/soft-fp/negsf2.c
@ -0,0 +1,16 @@
+#include "soft-fp.h"
+#include "single.h"
+
+/* Single-precision soft-float negation: returns the result of
+   FP_NEG_S applied to A.  Only the raw unpack/pack macros are used,
+   and no exception or rounding-mode macros are invoked.  */
+SFtype
+__negsf2 (SFtype a)
+{
+  FP_DECL_S (src);
+  FP_DECL_S (neg);
+  SFtype packed;
+
+  FP_UNPACK_RAW_S (src, a);
+  FP_NEG_S (neg, src);
+  FP_PACK_RAW_S (packed, neg);
+
+  return packed;
+}
--- a/src/linpack/soft-fp/op-1.h
+++ b/src/linpack/soft-fp/op-1.h
@ -0,0 +1,369 @@
+/* Software floating-point emulation.
+   Basic one-word fraction declaration and manipulation.
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com),
+		  Jakub Jelinek (jj@ultra.linux.cz),
+		  David S. Miller (davem@redhat.com) and
+		  Peter Maydell (pmaydell@chiark.greenend.org.uk).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef SOFT_FP_OP_1_H
+#define SOFT_FP_OP_1_H	1
+
+/* One-word fractions: the fraction of X is held in the single variable
+   X_f of type _FP_W_TYPE, declared here with _FP_ZERO_INIT appended.  */
+#define _FP_FRAC_DECL_1(X)	_FP_W_TYPE X##_f _FP_ZERO_INIT
+/* Copy the fraction of S into D, or set the fraction of X to I.  */
+#define _FP_FRAC_COPY_1(D, S)	(D##_f = S##_f)
+#define _FP_FRAC_SET_1(X, I)	(X##_f = I)
+/* With a single word, the high word, the low word and word W all name
+   the same variable; the index W is ignored.  */
+#define _FP_FRAC_HIGH_1(X)	(X##_f)
+#define _FP_FRAC_LOW_1(X)	(X##_f)
+#define _FP_FRAC_WORD_1(X, w)	(X##_f)
+
+/* Add the value I to the fraction of X.  */
+#define _FP_FRAC_ADDI_1(X, I)	(X##_f += I)
+/* Shift the fraction of X left by N bits.  When N is the compile-time
+   constant 1, the shift is written as adding the word to itself.  */
+#define _FP_FRAC_SLL_1(X, N)			\
+  do						\
+    {						\
+      if (__builtin_constant_p (N) && (N) == 1)	\
+	X##_f += X##_f;				\
+      else					\
+	X##_f <<= (N);				\
+    }						\
+  while (0)
+/* Shift the fraction of X right by N bits, dropping the bits shifted
+   out.  Note that N is not parenthesized in the expansion.  */
+#define _FP_FRAC_SRL_1(X, N)	(X##_f >>= N)
+
+/* Right shift with sticky-lsb.
+   _FP_FRAC_SRST_1 shifts the fraction of X right by N bits and stores
+   in S whether any of the shifted-out bits was set.  _FP_FRAC_SRS_1
+   shifts right by N bits and ORs that same "a set bit was lost" flag
+   into the least significant bit of the result.  The SZ argument is
+   not used by these one-word forms.
+   NOTE(review): unless N is the constant 1, both forms evaluate
+   X << (_FP_W_TYPE_SIZE - N), so N is presumably expected to lie in
+   1 .. _FP_W_TYPE_SIZE - 1 -- confirm against the callers.  */
+#define _FP_FRAC_SRST_1(X, S, N, sz)	__FP_FRAC_SRST_1 (X##_f, S, (N), (sz))
+#define _FP_FRAC_SRS_1(X, N, sz)	__FP_FRAC_SRS_1 (X##_f, (N), (sz))
+
+/* Worker for _FP_FRAC_SRST_1; X here is the fraction word itself.  */
+#define __FP_FRAC_SRST_1(X, S, N, sz)			\
+  do							\
+    {							\
+      S = (__builtin_constant_p (N) && (N) == 1		\
+	   ? X & 1					\
+	   : (X << (_FP_W_TYPE_SIZE - (N))) != 0);	\
+      X = X >> (N);					\
+    }							\
+  while (0)
+
+/* Worker for _FP_FRAC_SRS_1; X here is the fraction word itself.  The
+   sticky flag is computed from the old value of X before the
+   assignment takes place.  */
+#define __FP_FRAC_SRS_1(X, N, sz)				\
+  (X = (X >> (N) | (__builtin_constant_p (N) && (N) == 1	\
+		    ? X & 1					\
+		    : (X << (_FP_W_TYPE_SIZE - (N))) != 0)))
+
+/* Word-wise fraction arithmetic: R = X + Y, R = X - Y and X -= Y.
+   _FP_FRAC_CLZ_1 passes Z and the fraction word of X to __FP_CLZ
+   (defined elsewhere; presumably a count of leading zeros).  */
+#define _FP_FRAC_ADD_1(R, X, Y)	(R##_f = X##_f + Y##_f)
+#define _FP_FRAC_SUB_1(R, X, Y)	(R##_f = X##_f - Y##_f)
+#define _FP_FRAC_DEC_1(X, Y)	(X##_f -= Y##_f)
+#define _FP_FRAC_CLZ_1(z, X)	__FP_CLZ ((z), X##_f)
+
+/* Predicates.  */
+/* True when the fraction word, converted to _FP_WS_TYPE, is below
+   zero.  */
+#define _FP_FRAC_NEGP_1(X)	((_FP_WS_TYPE) X##_f < 0)
+/* True when the fraction word is zero.  */
+#define _FP_FRAC_ZEROP_1(X)	(X##_f == 0)
+/* Test and clear the _FP_OVERFLOW_<fs> bits of the fraction, and test
+   its _FP_HIGHBIT_DW_<fs> bits.  */
+#define _FP_FRAC_OVERP_1(fs, X)	(X##_f & _FP_OVERFLOW_##fs)
+#define _FP_FRAC_CLEAR_OVERP_1(fs, X)	(X##_f &= ~_FP_OVERFLOW_##fs)
+#define _FP_FRAC_HIGHBIT_DW_1(fs, X)	(X##_f & _FP_HIGHBIT_DW_##fs)
+/* Compare the fraction words of X and Y.  */
+#define _FP_FRAC_EQ_1(X, Y)	(X##_f == Y##_f)
+#define _FP_FRAC_GE_1(X, Y)	(X##_f >= Y##_f)
+#define _FP_FRAC_GT_1(X, Y)	(X##_f > Y##_f)
+
+/* One-word fraction values: zero, one, and the bitwise complement of
+   a zero of type _FP_WS_TYPE (all bits set).  */
+#define _FP_ZEROFRAC_1		0
+#define _FP_MINFRAC_1		1
+#define _FP_MAXFRAC_1		(~(_FP_WS_TYPE) 0)
+
+/* Unpack the raw bits of a native fp value.  Do not classify or
+   normalize the data.
+   VAL is stored into the flt member of a local union _FP_UNION_<fs>,
+   and the union's bits.frac, bits.exp and bits.sign members are then
+   copied to X_f, X_e and X_s.  */
+
+#define _FP_UNPACK_RAW_1(fs, X, val)			\
+  do							\
+    {							\
+      union _FP_UNION_##fs _FP_UNPACK_RAW_1_flo;	\
+      _FP_UNPACK_RAW_1_flo.flt = (val);			\
+							\
+      X##_f = _FP_UNPACK_RAW_1_flo.bits.frac;		\
+      X##_e = _FP_UNPACK_RAW_1_flo.bits.exp;		\
+      X##_s = _FP_UNPACK_RAW_1_flo.bits.sign;		\
+    }							\
+  while (0)
+
+/* Pointer form of _FP_UNPACK_RAW_1: VAL is cast to a pointer to
+   union _FP_UNION_<fs> and the frac, exp and sign members are read
+   through it into X_f, X_e and X_s.  */
+#define _FP_UNPACK_RAW_1_P(fs, X, val)			\
+  do							\
+    {							\
+      union _FP_UNION_##fs *_FP_UNPACK_RAW_1_P_flo	\
+	= (union _FP_UNION_##fs *) (val);		\
+							\
+      X##_f = _FP_UNPACK_RAW_1_P_flo->bits.frac;	\
+      X##_e = _FP_UNPACK_RAW_1_P_flo->bits.exp;		\
+      X##_s = _FP_UNPACK_RAW_1_P_flo->bits.sign;	\
+    }							\
+  while (0)
+
+/* Repack the raw bits of a native fp value.
+   X_f, X_e and X_s are written to the bits.frac, bits.exp and
+   bits.sign members of a local union _FP_UNION_<fs>, whose flt member
+   is then assigned to VAL.  */
+
+#define _FP_PACK_RAW_1(fs, val, X)		\
+  do						\
+    {						\
+      union _FP_UNION_##fs _FP_PACK_RAW_1_flo;	\
+						\
+      _FP_PACK_RAW_1_flo.bits.frac = X##_f;	\
+      _FP_PACK_RAW_1_flo.bits.exp  = X##_e;	\
+      _FP_PACK_RAW_1_flo.bits.sign = X##_s;	\
+						\
+      (val) = _FP_PACK_RAW_1_flo.flt;		\
+    }						\
+  while (0)
+
+/* Pointer form of _FP_PACK_RAW_1: VAL is cast to a pointer to
+   union _FP_UNION_<fs> and X_f, X_e and X_s are stored through it
+   into the frac, exp and sign members.  */
+#define _FP_PACK_RAW_1_P(fs, val, X)			\
+  do							\
+    {							\
+      union _FP_UNION_##fs *_FP_PACK_RAW_1_P_flo	\
+	= (union _FP_UNION_##fs *) (val);		\
+							\
+      _FP_PACK_RAW_1_P_flo->bits.frac = X##_f;		\
+      _FP_PACK_RAW_1_P_flo->bits.exp  = X##_e;		\
+      _FP_PACK_RAW_1_P_flo->bits.sign = X##_s;		\
+    }							\
+  while (0)
+
+
+/* Multiplication algorithms: */
+
+/* Basic.  Assuming the host word size is >= 2*FRACBITS, we can do the
+   multiplication immediately.
+   _FP_MUL_MEAT_DW_1_imm stores the plain C product X_f * Y_f in R_f;
+   WFRACBITS is not used by it.  */
+
+#define _FP_MUL_MEAT_DW_1_imm(wfracbits, R, X, Y)	\
+  do							\
+    {							\
+      R##_f = X##_f * Y##_f;				\
+    }							\
+  while (0)
+
+/* Multiply the one-word fractions of X and Y into R with
+   _FP_MUL_MEAT_DW_1_imm, then shift the product right by
+   WFRACBITS - 1 bits with sticky-lsb.  */
+#define _FP_MUL_MEAT_1_imm(wfracbits, R, X, Y)				\
+  do									\
+    {									\
+      _FP_MUL_MEAT_DW_1_imm ((wfracbits), R, X, Y);			\
+      /* Normalize since we know where the msb of the multiplicands	\
+	 were (bit B), we know that the msb of the product is		\
+	 at either 2B or 2B-1.  */					\
+      _FP_FRAC_SRS_1 (R, (wfracbits)-1, 2*(wfracbits));			\
+    }									\
+  while (0)
+
+/* Given a 1W * 1W => 2W primitive, do the extended multiplication.
+   DOIT is invoked as DOIT (high, low, x, y) and its two result words
+   land in R_f1 and R_f0, so R is a two-word fraction here.
+   WFRACBITS is not used.  */
+
+#define _FP_MUL_MEAT_DW_1_wide(wfracbits, R, X, Y, doit)	\
+  do								\
+    {								\
+      doit (R##_f1, R##_f0, X##_f, Y##_f);			\
+    }								\
+  while (0)
+
+/* Multiply the one-word fractions of X and Y through the widening
+   primitive DOIT into a two-word temporary, shift that temporary
+   right by WFRACBITS - 1 bits with sticky-lsb, and store its low word
+   in R_f.  */
+#define _FP_MUL_MEAT_1_wide(wfracbits, R, X, Y, doit)			\
+  do									\
+    {									\
+      _FP_FRAC_DECL_2 (_FP_MUL_MEAT_1_wide_Z);				\
+      _FP_MUL_MEAT_DW_1_wide ((wfracbits), _FP_MUL_MEAT_1_wide_Z,	\
+			      X, Y, doit);				\
+      /* Normalize since we know where the msb of the multiplicands	\
+	 were (bit B), we know that the msb of the product is		\
+	 at either 2B or 2B-1.  */					\
+      _FP_FRAC_SRS_2 (_FP_MUL_MEAT_1_wide_Z, (wfracbits)-1,		\
+		      2*(wfracbits));					\
+      R##_f = _FP_MUL_MEAT_1_wide_Z_f0;					\
+    }									\
+  while (0)
+
+/* Finally, a simple widening multiply algorithm.  What fun!
+   X_f and Y_f are each split into a high and a low half-word and the
+   four half-word products are formed.  low*low goes to R_f0 and
+   high*high to R_f1; the two cross products are summed (a carry out
+   of that sum is added to R_f1 at bit _FP_W_TYPE_SIZE/2), and the sum
+   is then split across both words and added to R with
+   _FP_FRAC_ADD_2.  R is a two-word fraction; WFRACBITS is not
+   used.  */
+
+#define _FP_MUL_MEAT_DW_1_hard(wfracbits, R, X, Y)			\
+  do									\
+    {									\
+      _FP_W_TYPE _FP_MUL_MEAT_DW_1_hard_xh, _FP_MUL_MEAT_DW_1_hard_xl;	\
+      _FP_W_TYPE _FP_MUL_MEAT_DW_1_hard_yh, _FP_MUL_MEAT_DW_1_hard_yl;	\
+      _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_1_hard_a);			\
+									\
+      /* Split the words in half.  */					\
+      _FP_MUL_MEAT_DW_1_hard_xh = X##_f >> (_FP_W_TYPE_SIZE/2);		\
+      _FP_MUL_MEAT_DW_1_hard_xl						\
+	= X##_f & (((_FP_W_TYPE) 1 << (_FP_W_TYPE_SIZE/2)) - 1);	\
+      _FP_MUL_MEAT_DW_1_hard_yh = Y##_f >> (_FP_W_TYPE_SIZE/2);		\
+      _FP_MUL_MEAT_DW_1_hard_yl						\
+	= Y##_f & (((_FP_W_TYPE) 1 << (_FP_W_TYPE_SIZE/2)) - 1);	\
+									\
+      /* Multiply the pieces.  */					\
+      R##_f0 = _FP_MUL_MEAT_DW_1_hard_xl * _FP_MUL_MEAT_DW_1_hard_yl;	\
+      _FP_MUL_MEAT_DW_1_hard_a_f0					\
+	= _FP_MUL_MEAT_DW_1_hard_xh * _FP_MUL_MEAT_DW_1_hard_yl;	\
+      _FP_MUL_MEAT_DW_1_hard_a_f1					\
+	= _FP_MUL_MEAT_DW_1_hard_xl * _FP_MUL_MEAT_DW_1_hard_yh;	\
+      R##_f1 = _FP_MUL_MEAT_DW_1_hard_xh * _FP_MUL_MEAT_DW_1_hard_yh;	\
+									\
+      /* Reassemble into two full words.  */				\
+      if ((_FP_MUL_MEAT_DW_1_hard_a_f0 += _FP_MUL_MEAT_DW_1_hard_a_f1)	\
+	  < _FP_MUL_MEAT_DW_1_hard_a_f1)				\
+	R##_f1 += (_FP_W_TYPE) 1 << (_FP_W_TYPE_SIZE/2);		\
+      _FP_MUL_MEAT_DW_1_hard_a_f1					\
+	= _FP_MUL_MEAT_DW_1_hard_a_f0 >> (_FP_W_TYPE_SIZE/2);		\
+      _FP_MUL_MEAT_DW_1_hard_a_f0					\
+	= _FP_MUL_MEAT_DW_1_hard_a_f0 << (_FP_W_TYPE_SIZE/2);		\
+      _FP_FRAC_ADD_2 (R, R, _FP_MUL_MEAT_DW_1_hard_a);			\
+    }									\
+  while (0)
+
+/* Multiply the one-word fractions of X and Y with
+   _FP_MUL_MEAT_DW_1_hard into a two-word temporary, shift that
+   temporary right by WFRACBITS - 1 bits with sticky-lsb, and store
+   its low word in R_f.  */
+#define _FP_MUL_MEAT_1_hard(wfracbits, R, X, Y)			\
+  do								\
+    {								\
+      _FP_FRAC_DECL_2 (_FP_MUL_MEAT_1_hard_z);			\
+      _FP_MUL_MEAT_DW_1_hard ((wfracbits),			\
+			      _FP_MUL_MEAT_1_hard_z, X, Y);	\
+								\
+      /* Normalize.  */						\
+      _FP_FRAC_SRS_2 (_FP_MUL_MEAT_1_hard_z,			\
+		      (wfracbits) - 1, 2*(wfracbits));		\
+      R##_f = _FP_MUL_MEAT_1_hard_z_f0;				\
+    }								\
+  while (0)
+
+
+/* Division algorithms: */
+
+/* Basic.  Assuming the host word size is >= 2*FRACBITS, we can do the
+   division immediately.  Give this macro either _FP_DIV_HELP_imm for
+   C primitives or _FP_DIV_HELP_ldiv for the ISO function.  Which you
+   choose will depend on what the compiler does with divrem4.
+   X_f is modified: it is shifted left by _FP_WFRACBITS_<fs> (and R_e
+   is decremented) when X_f < Y_f, otherwise by _FP_WFRACBITS_<fs> - 1.
+   DOIT is then invoked as DOIT (quotient, remainder, X_f, Y_f), and
+   R_f receives the quotient with its lsb ORed with 1 when the
+   remainder is nonzero.  */
+
+#define _FP_DIV_MEAT_1_imm(fs, R, X, Y, doit)				\
+  do									\
+    {									\
+      _FP_W_TYPE _FP_DIV_MEAT_1_imm_q, _FP_DIV_MEAT_1_imm_r;		\
+      X##_f <<= (X##_f < Y##_f						\
+		 ? R##_e--, _FP_WFRACBITS_##fs				\
+		 : _FP_WFRACBITS_##fs - 1);				\
+      doit (_FP_DIV_MEAT_1_imm_q, _FP_DIV_MEAT_1_imm_r, X##_f, Y##_f);	\
+      R##_f = _FP_DIV_MEAT_1_imm_q | (_FP_DIV_MEAT_1_imm_r != 0);	\
+    }									\
+  while (0)
+
+/* GCC's longlong.h defines a 2W / 1W => (1W,1W) primitive udiv_qrnnd
+   that may be useful in this situation.  This first is for a primitive
+   that requires normalization, the second for one that does not.  Look
+   for UDIV_NEEDS_NORMALIZATION to tell which your machine needs.
+   In this form the divisor is Y_f shifted left by
+   _FP_WFRACXBITS_<fs>.  The two-word dividend is X_f in the high word
+   with a zero low word (and R_e decremented) when X_f < Y_f;
+   otherwise it is X_f shifted right by one bit across both words.
+   R_f receives the quotient with its lsb ORed with 1 when the
+   remainder is nonzero.  */
+
+#define _FP_DIV_MEAT_1_udiv_norm(fs, R, X, Y)				\
+  do									\
+    {									\
+      _FP_W_TYPE _FP_DIV_MEAT_1_udiv_norm_nh;				\
+      _FP_W_TYPE _FP_DIV_MEAT_1_udiv_norm_nl;				\
+      _FP_W_TYPE _FP_DIV_MEAT_1_udiv_norm_q;				\
+      _FP_W_TYPE _FP_DIV_MEAT_1_udiv_norm_r;				\
+      _FP_W_TYPE _FP_DIV_MEAT_1_udiv_norm_y;				\
+									\
+      /* Normalize Y -- i.e. make the most significant bit set.  */	\
+      _FP_DIV_MEAT_1_udiv_norm_y = Y##_f << _FP_WFRACXBITS_##fs;	\
+									\
+      /* Shift X op correspondingly high, that is, up one full word.  */ \
+      if (X##_f < Y##_f)						\
+	{								\
+	  R##_e--;							\
+	  _FP_DIV_MEAT_1_udiv_norm_nl = 0;				\
+	  _FP_DIV_MEAT_1_udiv_norm_nh = X##_f;				\
+	}								\
+      else								\
+	{								\
+	  _FP_DIV_MEAT_1_udiv_norm_nl = X##_f << (_FP_W_TYPE_SIZE - 1);	\
+	  _FP_DIV_MEAT_1_udiv_norm_nh = X##_f >> 1;			\
+	}								\
+									\
+      udiv_qrnnd (_FP_DIV_MEAT_1_udiv_norm_q,				\
+		  _FP_DIV_MEAT_1_udiv_norm_r,				\
+		  _FP_DIV_MEAT_1_udiv_norm_nh,				\
+		  _FP_DIV_MEAT_1_udiv_norm_nl,				\
+		  _FP_DIV_MEAT_1_udiv_norm_y);				\
+      R##_f = (_FP_DIV_MEAT_1_udiv_norm_q				\
+	       | (_FP_DIV_MEAT_1_udiv_norm_r != 0));			\
+    }									\
+  while (0)
+
+/* Divide the one-word fraction of X by that of Y with udiv_qrnnd,
+   using Y_f unshifted as the divisor.  The two-word dividend is X_f
+   shifted left by _FP_WFRACBITS_<fs> (with R_e decremented) when
+   X_f < Y_f, otherwise by one bit less.  R_f receives the quotient
+   with its lsb ORed with 1 when the remainder is nonzero.  */
+#define _FP_DIV_MEAT_1_udiv(fs, R, X, Y)				\
+  do									\
+    {									\
+      _FP_W_TYPE _FP_DIV_MEAT_1_udiv_nh, _FP_DIV_MEAT_1_udiv_nl;	\
+      _FP_W_TYPE _FP_DIV_MEAT_1_udiv_q, _FP_DIV_MEAT_1_udiv_r;		\
+      if (X##_f < Y##_f)						\
+	{								\
+	  R##_e--;							\
+	  _FP_DIV_MEAT_1_udiv_nl = X##_f << _FP_WFRACBITS_##fs;		\
+	  _FP_DIV_MEAT_1_udiv_nh = X##_f >> _FP_WFRACXBITS_##fs;	\
+	}								\
+      else								\
+	{								\
+	  _FP_DIV_MEAT_1_udiv_nl = X##_f << (_FP_WFRACBITS_##fs - 1);	\
+	  _FP_DIV_MEAT_1_udiv_nh = X##_f >> (_FP_WFRACXBITS_##fs + 1);	\
+	}								\
+      udiv_qrnnd (_FP_DIV_MEAT_1_udiv_q, _FP_DIV_MEAT_1_udiv_r,		\
+		  _FP_DIV_MEAT_1_udiv_nh, _FP_DIV_MEAT_1_udiv_nl,	\
+		  Y##_f);						\
+      R##_f = _FP_DIV_MEAT_1_udiv_q | (_FP_DIV_MEAT_1_udiv_r != 0);	\
+    }									\
+  while (0)
+
+
+/* Square root algorithms:
+   We have just one right now, maybe Newton approximation
+   should be added for those machines where division is fast.
+   The loop runs while Q differs from _FP_WORK_ROUND, shifting Q right
+   by one bit per pass.  Each pass forms T = S + Q; when T <= X it
+   sets S = T + Q, subtracts T from X and adds Q to R.  X is doubled at
+   the end of every pass.  After the loop, a nonzero X sets
+   _FP_WORK_STICKY in R, and also _FP_WORK_ROUND when S < X.  Q and the
+   fractions of S, T and X are all modified.  */
+
+#define _FP_SQRT_MEAT_1(R, S, T, X, q)		\
+  do						\
+    {						\
+      while ((q) != _FP_WORK_ROUND)		\
+	{					\
+	  T##_f = S##_f + (q);			\
+	  if (T##_f <= X##_f)			\
+	    {					\
+	      S##_f = T##_f + (q);		\
+	      X##_f -= T##_f;			\
+	      R##_f += (q);			\
+	    }					\
+	  _FP_FRAC_SLL_1 (X, 1);		\
+	  (q) >>= 1;				\
+	}					\
+      if (X##_f)				\
+	{					\
+	  if (S##_f < X##_f)			\
+	    R##_f |= _FP_WORK_ROUND;		\
+	  R##_f |= _FP_WORK_STICKY;		\
+	}					\
+    }						\
+  while (0)
+
+/* Assembly/disassembly for converting to/from integral types.
+   No shifting or overflow handled here.
+   ASSEMBLE assigns the fraction word of X to R; DISASSEMBLE assigns R
+   to the fraction word of X.  RSIZE is not used by these one-word
+   forms.  */
+
+#define _FP_FRAC_ASSEMBLE_1(r, X, rsize)	((r) = X##_f)
+#define _FP_FRAC_DISASSEMBLE_1(X, r, rsize)	(X##_f = (r))
+
+
+/* Convert FP values between word sizes.
+   One word to one word is a plain copy of S_f into D_f.  */
+
+#define _FP_FRAC_COPY_1_1(D, S)		(D##_f = S##_f)
+
+#endif /* !SOFT_FP_OP_1_H */
--- a/src/linpack/soft-fp/op-2.h
+++ b/src/linpack/soft-fp/op-2.h
@ -0,0 +1,705 @@
+/* Software floating-point emulation.
+   Basic two-word fraction declaration and manipulation.
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com),
+		  Jakub Jelinek (jj@ultra.linux.cz),
+		  David S. Miller (davem@redhat.com) and
+		  Peter Maydell (pmaydell@chiark.greenend.org.uk).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef SOFT_FP_OP_2_H
+#define SOFT_FP_OP_2_H	1
+
+/* Two-word fractions: the fraction of X is held in the variables X_f0
+   and X_f1 of type _FP_W_TYPE, each declared with _FP_ZERO_INIT
+   appended.  */
+#define _FP_FRAC_DECL_2(X)				\
+  _FP_W_TYPE X##_f0 _FP_ZERO_INIT, X##_f1 _FP_ZERO_INIT
+/* Copy both words of S into D.  */
+#define _FP_FRAC_COPY_2(D, S)	(D##_f0 = S##_f0, D##_f1 = S##_f1)
+/* Set the fraction of X from I, which is forwarded to the
+   three-parameter __FP_FRAC_SET_2 and so must expand to a
+   "high, low" pair such as _FP_ZEROFRAC_2.  */
+#define _FP_FRAC_SET_2(X, I)	__FP_FRAC_SET_2 (X, I)
+/* Word accessors: X_f1 is the high word, X_f0 the low word, and
+   WORD pastes the index W onto X_f.  */
+#define _FP_FRAC_HIGH_2(X)	(X##_f1)
+#define _FP_FRAC_LOW_2(X)	(X##_f0)
+#define _FP_FRAC_WORD_2(X, w)	(X##_f##w)
+
+/* Shift the two-word fraction of X left by N bits.  For
+   N < _FP_W_TYPE_SIZE the top N bits of X_f0 move into X_f1 (a
+   constant N of 1 is done with additions and a sign test of X_f0);
+   otherwise X_f0 shifted by N - _FP_W_TYPE_SIZE becomes X_f1 and X_f0
+   is cleared.  Written as a conditional over GNU statement
+   expressions whose value is discarded.
+   NOTE(review): a non-constant-1 N of 0 would evaluate
+   X_f0 >> _FP_W_TYPE_SIZE, so N is presumably expected to be
+   nonzero -- confirm against the callers.  */
+#define _FP_FRAC_SLL_2(X, N)						\
+  (void) (((N) < _FP_W_TYPE_SIZE)					\
+	  ? ({								\
+	      if (__builtin_constant_p (N) && (N) == 1)			\
+		{							\
+		  X##_f1 = X##_f1 + X##_f1 + (((_FP_WS_TYPE) (X##_f0)) < 0); \
+		  X##_f0 += X##_f0;					\
+		}							\
+	      else							\
+		{							\
+		  X##_f1 = X##_f1 << (N) | X##_f0 >> (_FP_W_TYPE_SIZE - (N)); \
+		  X##_f0 <<= (N);					\
+		}							\
+	      0;							\
+	    })								\
+	  : ({								\
+	      X##_f1 = X##_f0 << ((N) - _FP_W_TYPE_SIZE);		\
+	      X##_f0 = 0;						\
+	    }))
+
+
+/* Shift the two-word fraction of X right by N bits, dropping the bits
+   shifted out.  For N < _FP_W_TYPE_SIZE the low N bits of X_f1 move
+   into the top of X_f0; otherwise X_f1 shifted by
+   N - _FP_W_TYPE_SIZE becomes X_f0 and X_f1 is cleared.
+   NOTE(review): N == 0 would evaluate X_f1 << _FP_W_TYPE_SIZE, so N is
+   presumably expected to be nonzero -- confirm against the callers.  */
+#define _FP_FRAC_SRL_2(X, N)						\
+  (void) (((N) < _FP_W_TYPE_SIZE)					\
+	  ? ({								\
+	      X##_f0 = X##_f0 >> (N) | X##_f1 << (_FP_W_TYPE_SIZE - (N)); \
+	      X##_f1 >>= (N);						\
+	    })								\
+	  : ({								\
+	      X##_f0 = X##_f1 >> ((N) - _FP_W_TYPE_SIZE);		\
+	      X##_f1 = 0;						\
+	    }))
+
+/* Right shift with sticky-lsb.
+   _FP_FRAC_SRST_2 shifts the two-word fraction of X right by N bits
+   and stores in S whether any shifted-out bit was set.  For
+   N >= _FP_W_TYPE_SIZE the whole of X_f0 and the low
+   N - _FP_W_TYPE_SIZE bits of X_f1 count as shifted out.  The SZ
+   argument is not used.  */
+#define _FP_FRAC_SRST_2(X, S, N, sz)					\
+  (void) (((N) < _FP_W_TYPE_SIZE)					\
+	  ? ({								\
+	      S = (__builtin_constant_p (N) && (N) == 1			\
+		   ? X##_f0 & 1						\
+		   : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0);		\
+	      X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N)); \
+	      X##_f1 >>= (N);						\
+	    })								\
+	  : ({								\
+	      S = ((((N) == _FP_W_TYPE_SIZE				\
+		     ? 0						\
+		     : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N))))		\
+		    | X##_f0) != 0);					\
+	      X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE));		\
+	      X##_f1 = 0;						\
+	    }))
+
+/* Shift the two-word fraction of X right by N bits and OR a 1 into
+   the lsb of the result when any shifted-out bit was set.  For
+   N >= _FP_W_TYPE_SIZE the whole of X_f0 and the low
+   N - _FP_W_TYPE_SIZE bits of X_f1 count as shifted out.  The SZ
+   argument is not used.  */
+#define _FP_FRAC_SRS_2(X, N, sz)					\
+  (void) (((N) < _FP_W_TYPE_SIZE)					\
+	  ? ({								\
+	      X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N) \
+			| (__builtin_constant_p (N) && (N) == 1		\
+			   ? X##_f0 & 1					\
+			   : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0)); \
+	      X##_f1 >>= (N);						\
+	    })								\
+	  : ({								\
+	      X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE)		\
+			| ((((N) == _FP_W_TYPE_SIZE			\
+			     ? 0					\
+			     : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N))))	\
+			    | X##_f0) != 0));				\
+	      X##_f1 = 0;						\
+	    }))
+
+/* Two-word fraction arithmetic.  Each macro splits its operands into
+   (high, low) word pairs and forwards them to the matching
+   __FP_FRAC_*_2 primitive: ADDI adds the value I to X, ADD and SUB
+   compute R from X and Y, and DEC subtracts Y from X in place.  */
+#define _FP_FRAC_ADDI_2(X, I)	\
+  __FP_FRAC_ADDI_2 (X##_f1, X##_f0, I)
+
+#define _FP_FRAC_ADD_2(R, X, Y)	\
+  __FP_FRAC_ADD_2 (R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)
+
+#define _FP_FRAC_SUB_2(R, X, Y)	\
+  __FP_FRAC_SUB_2 (R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)
+
+#define _FP_FRAC_DEC_2(X, Y)	\
+  __FP_FRAC_DEC_2 (X##_f1, X##_f0, Y##_f1, Y##_f0)
+
+/* Apply __FP_CLZ to the two-word fraction of X, storing the result in
+   R.  When the high word is nonzero only it is examined; otherwise
+   the result for the low word is increased by _FP_W_TYPE_SIZE.  */
+#define _FP_FRAC_CLZ_2(R, X)			\
+  do						\
+    {						\
+      if (X##_f1)				\
+	__FP_CLZ ((R), X##_f1);			\
+      else					\
+	{					\
+	  __FP_CLZ ((R), X##_f0);		\
+	  (R) += _FP_W_TYPE_SIZE;		\
+	}					\
+    }						\
+  while (0)
+
+/* Predicates.  */
+/* True when the high word, converted to _FP_WS_TYPE, is below zero.  */
+#define _FP_FRAC_NEGP_2(X)	((_FP_WS_TYPE) X##_f1 < 0)
+/* True when both words are zero.  */
+#define _FP_FRAC_ZEROP_2(X)	((X##_f1 | X##_f0) == 0)
+/* Test and clear the _FP_OVERFLOW_<fs> bits in the word selected by
+   _FP_FRAC_HIGH_<fs>, and test the _FP_HIGHBIT_DW_<fs> bits in the
+   word selected by _FP_FRAC_HIGH_DW_<fs>.  */
+#define _FP_FRAC_OVERP_2(fs, X)	(_FP_FRAC_HIGH_##fs (X) & _FP_OVERFLOW_##fs)
+#define _FP_FRAC_CLEAR_OVERP_2(fs, X)	(_FP_FRAC_HIGH_##fs (X) &= ~_FP_OVERFLOW_##fs)
+#define _FP_FRAC_HIGHBIT_DW_2(fs, X)	\
+  (_FP_FRAC_HIGH_DW_##fs (X) & _FP_HIGHBIT_DW_##fs)
+/* Compare two fractions, high word first and then low word.  */
+#define _FP_FRAC_EQ_2(X, Y)	(X##_f1 == Y##_f1 && X##_f0 == Y##_f0)
+#define _FP_FRAC_GT_2(X, Y)	\
+  (X##_f1 > Y##_f1 || (X##_f1 == Y##_f1 && X##_f0 > Y##_f0))
+#define _FP_FRAC_GE_2(X, Y)	\
+  (X##_f1 > Y##_f1 || (X##_f1 == Y##_f1 && X##_f0 >= Y##_f0))
+
+/* Two-word fraction values written as "high, low" pairs: zero, one,
+   and all bits set in both words.  */
+#define _FP_ZEROFRAC_2		0, 0
+#define _FP_MINFRAC_2		0, 1
+#define _FP_MAXFRAC_2		(~(_FP_WS_TYPE) 0), (~(_FP_WS_TYPE) 0)
+
+/* Internals.  */
+
+/* Set the fraction of X: I1 goes to the high word X_f1 and I0 to the
+   low word X_f0.  */
+#define __FP_FRAC_SET_2(X, I1, I0)	(X##_f0 = I0, X##_f1 = I1)
+
+/* Apply __FP_CLZ to the word pair (XH, XL), storing the result in R.
+   When XH is nonzero only it is examined; otherwise the result for XL
+   is increased by _FP_W_TYPE_SIZE.  XH and XL are used
+   unparenthesized.  */
+#define __FP_CLZ_2(R, xh, xl)			\
+  do						\
+    {						\
+      if (xh)					\
+	__FP_CLZ ((R), xh);			\
+      else					\
+	{					\
+	  __FP_CLZ ((R), xl);			\
+	  (R) += _FP_W_TYPE_SIZE;		\
+	}					\
+    }						\
+  while (0)
+
+/* Two-word add/subtract primitives.  The "#if 0" arm holds plain C
+   definitions (carry and borrow detected by comparing the low-word
+   result with an operand) and is compiled out.  The active "#else"
+   arm unconditionally redefines the four primitives in terms of
+   add_ssaaaa and sub_ddmmss, which are defined elsewhere (presumably
+   in longlong.h -- confirm).  */
+#if 0
+
+# ifndef __FP_FRAC_ADDI_2
+#  define __FP_FRAC_ADDI_2(xh, xl, i)	\
+  (xh += ((xl += i) < i))
+# endif
+# ifndef __FP_FRAC_ADD_2
+#  define __FP_FRAC_ADD_2(rh, rl, xh, xl, yh, yl)	\
+  (rh = xh + yh + ((rl = xl + yl) < xl))
+# endif
+# ifndef __FP_FRAC_SUB_2
+#  define __FP_FRAC_SUB_2(rh, rl, xh, xl, yh, yl)	\
+  (rh = xh - yh - ((rl = xl - yl) > xl))
+# endif
+# ifndef __FP_FRAC_DEC_2
+#  define __FP_FRAC_DEC_2(xh, xl, yh, yl)		\
+  do							\
+    {							\
+      UWtype __FP_FRAC_DEC_2_t = xl;			\
+      xh -= yh + ((xl -= yl) > __FP_FRAC_DEC_2_t);	\
+    }							\
+  while (0)
+# endif
+
+#else
+
+# undef __FP_FRAC_ADDI_2
+# define __FP_FRAC_ADDI_2(xh, xl, i)	add_ssaaaa (xh, xl, xh, xl, 0, i)
+# undef __FP_FRAC_ADD_2
+# define __FP_FRAC_ADD_2		add_ssaaaa
+# undef __FP_FRAC_SUB_2
+# define __FP_FRAC_SUB_2		sub_ddmmss
+# undef __FP_FRAC_DEC_2
+# define __FP_FRAC_DEC_2(xh, xl, yh, yl)	\
+  sub_ddmmss (xh, xl, xh, xl, yh, yl)
+
+#endif
+
+/* Unpack the raw bits of a native fp value.  Do not classify or
+   normalize the data.
+   VAL is stored into the flt member of a local union _FP_UNION_<fs>,
+   and the union's bits.frac0, bits.frac1, bits.exp and bits.sign
+   members are then copied to X_f0, X_f1, X_e and X_s.  */
+
+#define _FP_UNPACK_RAW_2(fs, X, val)			\
+  do							\
+    {							\
+      union _FP_UNION_##fs _FP_UNPACK_RAW_2_flo;	\
+      _FP_UNPACK_RAW_2_flo.flt = (val);			\
+							\
+      X##_f0 = _FP_UNPACK_RAW_2_flo.bits.frac0;		\
+      X##_f1 = _FP_UNPACK_RAW_2_flo.bits.frac1;		\
+      X##_e  = _FP_UNPACK_RAW_2_flo.bits.exp;		\
+      X##_s  = _FP_UNPACK_RAW_2_flo.bits.sign;		\
+    }							\
+  while (0)
+
+/* Pointer form of _FP_UNPACK_RAW_2: VAL is cast to a pointer to
+   union _FP_UNION_<fs> and the frac0, frac1, exp and sign members are
+   read through it into X_f0, X_f1, X_e and X_s.  */
+#define _FP_UNPACK_RAW_2_P(fs, X, val)			\
+  do							\
+    {							\
+      union _FP_UNION_##fs *_FP_UNPACK_RAW_2_P_flo	\
+	= (union _FP_UNION_##fs *) (val);		\
+							\
+      X##_f0 = _FP_UNPACK_RAW_2_P_flo->bits.frac0;	\
+      X##_f1 = _FP_UNPACK_RAW_2_P_flo->bits.frac1;	\
+      X##_e  = _FP_UNPACK_RAW_2_P_flo->bits.exp;	\
+      X##_s  = _FP_UNPACK_RAW_2_P_flo->bits.sign;	\
+    }							\
+  while (0)
+
+
+/* Repack the raw bits of a native fp value.
+   X_f0, X_f1, X_e and X_s are written to the bits.frac0, bits.frac1,
+   bits.exp and bits.sign members of a local union _FP_UNION_<fs>,
+   whose flt member is then assigned to VAL.  */
+
+#define _FP_PACK_RAW_2(fs, val, X)		\
+  do						\
+    {						\
+      union _FP_UNION_##fs _FP_PACK_RAW_2_flo;	\
+						\
+      _FP_PACK_RAW_2_flo.bits.frac0 = X##_f0;	\
+      _FP_PACK_RAW_2_flo.bits.frac1 = X##_f1;	\
+      _FP_PACK_RAW_2_flo.bits.exp   = X##_e;	\
+      _FP_PACK_RAW_2_flo.bits.sign  = X##_s;	\
+						\
+      (val) = _FP_PACK_RAW_2_flo.flt;		\
+    }						\
+  while (0)
+
+/* Pointer form of _FP_PACK_RAW_2: VAL is cast to a pointer to
+   union _FP_UNION_<fs> and X_f0, X_f1, X_e and X_s are stored through
+   it into the frac0, frac1, exp and sign members.  */
+#define _FP_PACK_RAW_2_P(fs, val, X)			\
+  do							\
+    {							\
+      union _FP_UNION_##fs *_FP_PACK_RAW_2_P_flo	\
+	= (union _FP_UNION_##fs *) (val);		\
+							\
+      _FP_PACK_RAW_2_P_flo->bits.frac0 = X##_f0;	\
+      _FP_PACK_RAW_2_P_flo->bits.frac1 = X##_f1;	\
+      _FP_PACK_RAW_2_P_flo->bits.exp   = X##_e;		\
+      _FP_PACK_RAW_2_P_flo->bits.sign  = X##_s;		\
+    }							\
+  while (0)
+
+
+/* Multiplication algorithms: */
+
+/* Given a 1W * 1W => 2W primitive, do the extended multiplication.
+   R is a four-word fraction.  DOIT is invoked four times as
+   DOIT (high, low, x, y): low*low fills words 1:0 of R and high*high
+   fills words 3:2, while the two cross products go to two-word
+   temporaries that are each added into words 3:1 of R with
+   __FP_FRAC_ADD_3.  WFRACBITS is not used.  */
+
+#define _FP_MUL_MEAT_DW_2_wide(wfracbits, R, X, Y, doit)		\
+  do									\
+    {									\
+      _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_2_wide_b);			\
+      _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_2_wide_c);			\
+									\
+      doit (_FP_FRAC_WORD_4 (R, 1), _FP_FRAC_WORD_4 (R, 0),		\
+	    X##_f0, Y##_f0);						\
+      doit (_FP_MUL_MEAT_DW_2_wide_b_f1, _FP_MUL_MEAT_DW_2_wide_b_f0,	\
+	    X##_f0, Y##_f1);						\
+      doit (_FP_MUL_MEAT_DW_2_wide_c_f1, _FP_MUL_MEAT_DW_2_wide_c_f0,	\
+	    X##_f1, Y##_f0);						\
+      doit (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),		\
+	    X##_f1, Y##_f1);						\
+									\
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
+		       _FP_FRAC_WORD_4 (R, 1), 0,			\
+		       _FP_MUL_MEAT_DW_2_wide_b_f1,			\
+		       _FP_MUL_MEAT_DW_2_wide_b_f0,			\
+		       _FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
+		       _FP_FRAC_WORD_4 (R, 1));				\
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
+		       _FP_FRAC_WORD_4 (R, 1), 0,			\
+		       _FP_MUL_MEAT_DW_2_wide_c_f1,			\
+		       _FP_MUL_MEAT_DW_2_wide_c_f0,			\
+		       _FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
+		       _FP_FRAC_WORD_4 (R, 1));				\
+    }									\
+  while (0)
+
+/* Multiply the two-word fractions of X and Y with
+   _FP_MUL_MEAT_DW_2_wide into a four-word temporary, shift that
+   temporary right by WFRACBITS - 1 bits with sticky-lsb, and store
+   its two low words in R_f0 and R_f1.  */
+#define _FP_MUL_MEAT_2_wide(wfracbits, R, X, Y, doit)			\
+  do									\
+    {									\
+      _FP_FRAC_DECL_4 (_FP_MUL_MEAT_2_wide_z);				\
+									\
+      _FP_MUL_MEAT_DW_2_wide ((wfracbits), _FP_MUL_MEAT_2_wide_z,	\
+			      X, Y, doit);				\
+									\
+      /* Normalize since we know where the msb of the multiplicands	\
+	 were (bit B), we know that the msb of the product is		\
+	 at either 2B or 2B-1.  */					\
+      _FP_FRAC_SRS_4 (_FP_MUL_MEAT_2_wide_z, (wfracbits)-1,		\
+		      2*(wfracbits));					\
+      R##_f0 = _FP_FRAC_WORD_4 (_FP_MUL_MEAT_2_wide_z, 0);		\
+      R##_f1 = _FP_FRAC_WORD_4 (_FP_MUL_MEAT_2_wide_z, 1);		\
+    }									\
+  while (0)
+
+/* Given a 1W * 1W => 2W primitive, do the extended multiplication.
+   Do only 3 multiplications instead of four. This one is for machines
+   where multiplication is much more expensive than subtraction.
+   R is a four-word fraction.  DOIT is invoked three times: on the low
+   words, on the high words, and on the word sums X_f0 + X_f1 and
+   Y_f0 + Y_f1.  The carries out of those two sums are kept in c1 and
+   c2 and are used to mask and add the correction terms; the low and
+   high products are then subtracted from the middle product, and the
+   high product is finally added into words 3:2 of R.  WFRACBITS is
+   not used.  */
+
+#define _FP_MUL_MEAT_DW_2_wide_3mul(wfracbits, R, X, Y, doit)		\
+  do									\
+    {									\
+      _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_2_wide_3mul_b);			\
+      _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_2_wide_3mul_c);			\
+      _FP_W_TYPE _FP_MUL_MEAT_DW_2_wide_3mul_d;				\
+      int _FP_MUL_MEAT_DW_2_wide_3mul_c1;				\
+      int _FP_MUL_MEAT_DW_2_wide_3mul_c2;				\
+									\
+      _FP_MUL_MEAT_DW_2_wide_3mul_b_f0 = X##_f0 + X##_f1;		\
+      _FP_MUL_MEAT_DW_2_wide_3mul_c1					\
+	= _FP_MUL_MEAT_DW_2_wide_3mul_b_f0 < X##_f0;			\
+      _FP_MUL_MEAT_DW_2_wide_3mul_b_f1 = Y##_f0 + Y##_f1;		\
+      _FP_MUL_MEAT_DW_2_wide_3mul_c2					\
+	= _FP_MUL_MEAT_DW_2_wide_3mul_b_f1 < Y##_f0;			\
+      doit (_FP_MUL_MEAT_DW_2_wide_3mul_d, _FP_FRAC_WORD_4 (R, 0),	\
+	    X##_f0, Y##_f0);						\
+      doit (_FP_FRAC_WORD_4 (R, 2), _FP_FRAC_WORD_4 (R, 1),		\
+	    _FP_MUL_MEAT_DW_2_wide_3mul_b_f0,				\
+	    _FP_MUL_MEAT_DW_2_wide_3mul_b_f1);				\
+      doit (_FP_MUL_MEAT_DW_2_wide_3mul_c_f1,				\
+	    _FP_MUL_MEAT_DW_2_wide_3mul_c_f0, X##_f1, Y##_f1);		\
+									\
+      _FP_MUL_MEAT_DW_2_wide_3mul_b_f0					\
+	&= -_FP_MUL_MEAT_DW_2_wide_3mul_c2;				\
+      _FP_MUL_MEAT_DW_2_wide_3mul_b_f1					\
+	&= -_FP_MUL_MEAT_DW_2_wide_3mul_c1;				\
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
+		       _FP_FRAC_WORD_4 (R, 1),				\
+		       (_FP_MUL_MEAT_DW_2_wide_3mul_c1			\
+			& _FP_MUL_MEAT_DW_2_wide_3mul_c2), 0,		\
+		       _FP_MUL_MEAT_DW_2_wide_3mul_d,			\
+		       0, _FP_FRAC_WORD_4 (R, 2), _FP_FRAC_WORD_4 (R, 1)); \
+      __FP_FRAC_ADDI_2 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
+			_FP_MUL_MEAT_DW_2_wide_3mul_b_f0);		\
+      __FP_FRAC_ADDI_2 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
+			_FP_MUL_MEAT_DW_2_wide_3mul_b_f1);		\
+      __FP_FRAC_DEC_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
+		       _FP_FRAC_WORD_4 (R, 1),				\
+		       0, _FP_MUL_MEAT_DW_2_wide_3mul_d,		\
+		       _FP_FRAC_WORD_4 (R, 0));				\
+      __FP_FRAC_DEC_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
+		       _FP_FRAC_WORD_4 (R, 1), 0,			\
+		       _FP_MUL_MEAT_DW_2_wide_3mul_c_f1,		\
+		       _FP_MUL_MEAT_DW_2_wide_3mul_c_f0);		\
+      __FP_FRAC_ADD_2 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
+		       _FP_MUL_MEAT_DW_2_wide_3mul_c_f1,		\
+		       _FP_MUL_MEAT_DW_2_wide_3mul_c_f0,		\
+		       _FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2));	\
+    }									\
+  while (0)
+
+/* Multiply the two-word fractions of X and Y with
+   _FP_MUL_MEAT_DW_2_wide_3mul into a four-word temporary, shift that
+   temporary right by WFRACBITS - 1 bits with sticky-lsb, and store
+   its two low words in R_f0 and R_f1.  */
+#define _FP_MUL_MEAT_2_wide_3mul(wfracbits, R, X, Y, doit)		\
+  do									\
+    {									\
+      _FP_FRAC_DECL_4 (_FP_MUL_MEAT_2_wide_3mul_z);			\
+									\
+      _FP_MUL_MEAT_DW_2_wide_3mul ((wfracbits),				\
+				   _FP_MUL_MEAT_2_wide_3mul_z,		\
+				   X, Y, doit);				\
+									\
+      /* Normalize since we know where the msb of the multiplicands	\
+	 were (bit B), we know that the msb of the product is		\
+	 at either 2B or 2B-1.  */					\
+      _FP_FRAC_SRS_4 (_FP_MUL_MEAT_2_wide_3mul_z,			\
+		      (wfracbits)-1, 2*(wfracbits));			\
+      R##_f0 = _FP_FRAC_WORD_4 (_FP_MUL_MEAT_2_wide_3mul_z, 0);		\
+      R##_f1 = _FP_FRAC_WORD_4 (_FP_MUL_MEAT_2_wide_3mul_z, 1);		\
+    }									\
+  while (0)
+
+/* Multiply the two-word fractions of X and Y with mpn_mul_n.  The
+   words of X and Y are copied, low word first, into two-element
+   arrays, and mpn_mul_n is called with R_f as the destination and a
+   length of 2, so R_f must be usable as the destination array.
+   WFRACBITS is not used.  */
+#define _FP_MUL_MEAT_DW_2_gmp(wfracbits, R, X, Y)	\
+  do							\
+    {							\
+      _FP_W_TYPE _FP_MUL_MEAT_DW_2_gmp_x[2];		\
+      _FP_W_TYPE _FP_MUL_MEAT_DW_2_gmp_y[2];		\
+      _FP_MUL_MEAT_DW_2_gmp_x[0] = X##_f0;		\
+      _FP_MUL_MEAT_DW_2_gmp_x[1] = X##_f1;		\
+      _FP_MUL_MEAT_DW_2_gmp_y[0] = Y##_f0;		\
+      _FP_MUL_MEAT_DW_2_gmp_y[1] = Y##_f1;		\
+							\
+      mpn_mul_n (R##_f, _FP_MUL_MEAT_DW_2_gmp_x,	\
+		 _FP_MUL_MEAT_DW_2_gmp_y, 2);		\
+    }							\
+  while (0)
+
+/* Multiply the two-word fractions of X and Y with
+   _FP_MUL_MEAT_DW_2_gmp into a four-word temporary, shift that
+   temporary right by WFRACBITS - 1 bits with sticky-lsb, and store
+   elements 0 and 1 of its _f array in R_f0 and R_f1.  */
+#define _FP_MUL_MEAT_2_gmp(wfracbits, R, X, Y)				\
+  do									\
+    {									\
+      _FP_FRAC_DECL_4 (_FP_MUL_MEAT_2_gmp_z);				\
+									\
+      _FP_MUL_MEAT_DW_2_gmp ((wfracbits), _FP_MUL_MEAT_2_gmp_z, X, Y);	\
+									\
+      /* Normalize since we know where the msb of the multiplicands	\
+	 were (bit B), we know that the msb of the product is		\
+	 at either 2B or 2B-1.  */					\
+      _FP_FRAC_SRS_4 (_FP_MUL_MEAT_2_gmp_z, (wfracbits)-1,		\
+		      2*(wfracbits));					\
+      R##_f0 = _FP_MUL_MEAT_2_gmp_z_f[0];				\
+      R##_f1 = _FP_MUL_MEAT_2_gmp_z_f[1];				\
+    }									\
+  while (0)
+
+/* Do at most 120x120=240 bits multiplication using double floating
+   point multiplication.  This is useful if floating point
+   multiplication has much bigger throughput than integer multiply.
+   It is supposed to work for _FP_W_TYPE_SIZE 64 and wfracbits
+   between 106 and 120 only.
+   Caller guarantees that X and Y have bit (wfracbits - 1) set.
+   SETFETZ is a macro which will disable all FPU exceptions and set rounding
+   towards zero,  RESETFE should optionally reset it back.
+   Each operand is cut into five pieces (four 24-bit fields and the
+   remaining bits X_f1 >> 32), every piece is converted to double and
+   the lower four are scaled by the 2^-24 .. 2^-96 constants.  The
+   partial products are summed per column and accumulated upwards;
+   R_f1 and R_f0 are then assembled from the integer view of the upper
+   sums.  A 1 is ORed into the lsb of R_f0 when any of the four lowest
+   residues is nonzero.  */
+
+#define _FP_MUL_MEAT_2_120_240_double(wfracbits, R, X, Y, setfetz, resetfe) \
+  do									\
+    {									\
+      static const double _const[] =					\
+	{								\
+	  /* 2^-24 */ 5.9604644775390625e-08,				\
+	  /* 2^-48 */ 3.5527136788005009e-15,				\
+	  /* 2^-72 */ 2.1175823681357508e-22,				\
+	  /* 2^-96 */ 1.2621774483536189e-29,				\
+	  /* 2^28 */ 2.68435456e+08,					\
+	  /* 2^4 */ 1.600000e+01,					\
+	  /* 2^-20 */ 9.5367431640625e-07,				\
+	  /* 2^-44 */ 5.6843418860808015e-14,				\
+	  /* 2^-68 */ 3.3881317890172014e-21,				\
+	  /* 2^-92 */ 2.0194839173657902e-28,				\
+	  /* 2^-116 */ 1.2037062152420224e-35				\
+	};								\
+      double _a240, _b240, _c240, _d240, _e240, _f240,			\
+	_g240, _h240, _i240, _j240, _k240;				\
+      union { double d; UDItype i; } _l240, _m240, _n240, _o240,	\
+				       _p240, _q240, _r240, _s240;	\
+      UDItype _t240, _u240, _v240, _w240, _x240, _y240 = 0;		\
+									\
+      _FP_STATIC_ASSERT ((wfracbits) >= 106 && (wfracbits) <= 120,	\
+			 "wfracbits out of range");			\
+									\
+      setfetz;								\
+									\
+      _e240 = (double) (long) (X##_f0 & 0xffffff);			\
+      _j240 = (double) (long) (Y##_f0 & 0xffffff);			\
+      _d240 = (double) (long) ((X##_f0 >> 24) & 0xffffff);		\
+      _i240 = (double) (long) ((Y##_f0 >> 24) & 0xffffff);		\
+      _c240 = (double) (long) (((X##_f1 << 16) & 0xffffff) | (X##_f0 >> 48)); \
+      _h240 = (double) (long) (((Y##_f1 << 16) & 0xffffff) | (Y##_f0 >> 48)); \
+      _b240 = (double) (long) ((X##_f1 >> 8) & 0xffffff);		\
+      _g240 = (double) (long) ((Y##_f1 >> 8) & 0xffffff);		\
+      _a240 = (double) (long) (X##_f1 >> 32);				\
+      _f240 = (double) (long) (Y##_f1 >> 32);				\
+      _e240 *= _const[3];						\
+      _j240 *= _const[3];						\
+      _d240 *= _const[2];						\
+      _i240 *= _const[2];						\
+      _c240 *= _const[1];						\
+      _h240 *= _const[1];						\
+      _b240 *= _const[0];						\
+      _g240 *= _const[0];						\
+      _s240.d =							      _e240*_j240; \
+      _r240.d =						_d240*_j240 + _e240*_i240; \
+      _q240.d =				  _c240*_j240 + _d240*_i240 + _e240*_h240; \
+      _p240.d =		    _b240*_j240 + _c240*_i240 + _d240*_h240 + _e240*_g240; \
+      _o240.d = _a240*_j240 + _b240*_i240 + _c240*_h240 + _d240*_g240 + _e240*_f240; \
+      _n240.d = _a240*_i240 + _b240*_h240 + _c240*_g240 + _d240*_f240;	\
+      _m240.d = _a240*_h240 + _b240*_g240 + _c240*_f240;		\
+      _l240.d = _a240*_g240 + _b240*_f240;				\
+      _k240 =   _a240*_f240;						\
+      _r240.d += _s240.d;						\
+      _q240.d += _r240.d;						\
+      _p240.d += _q240.d;						\
+      _o240.d += _p240.d;						\
+      _n240.d += _o240.d;						\
+      _m240.d += _n240.d;						\
+      _l240.d += _m240.d;						\
+      _k240 += _l240.d;							\
+      _s240.d -= ((_const[10]+_s240.d)-_const[10]);			\
+      _r240.d -= ((_const[9]+_r240.d)-_const[9]);			\
+      _q240.d -= ((_const[8]+_q240.d)-_const[8]);			\
+      _p240.d -= ((_const[7]+_p240.d)-_const[7]);			\
+      _o240.d += _const[7];						\
+      _n240.d += _const[6];						\
+      _m240.d += _const[5];						\
+      _l240.d += _const[4];						\
+      if (_s240.d != 0.0)						\
+	_y240 = 1;							\
+      if (_r240.d != 0.0)						\
+	_y240 = 1;							\
+      if (_q240.d != 0.0)						\
+	_y240 = 1;							\
+      if (_p240.d != 0.0)						\
+	_y240 = 1;							\
+      _t240 = (DItype) _k240;						\
+      _u240 = _l240.i;							\
+      _v240 = _m240.i;							\
+      _w240 = _n240.i;							\
+      _x240 = _o240.i;							\
+      R##_f1 = ((_t240 << (128 - (wfracbits - 1)))			\
+		| ((_u240 & 0xffffff) >> ((wfracbits - 1) - 104)));	\
+      R##_f0 = (((_u240 & 0xffffff) << (168 - (wfracbits - 1)))		\
+		| ((_v240 & 0xffffff) << (144 - (wfracbits - 1)))	\
+		| ((_w240 & 0xffffff) << (120 - (wfracbits - 1)))	\
+		| ((_x240 & 0xffffff) >> ((wfracbits - 1) - 96))	\
+		| _y240);						\
+      resetfe;								\
+    }									\
+  while (0)
+
+/* Division algorithms: */
+
+/* Divide the two-word fraction X by the two-word fraction Y and store
+   the two-word quotient in R, using the udiv_qrnnd and umul_ppmm
+   primitives.  When X < Y the exponent R##_e is decremented; otherwise
+   the dividend is pre-shifted right by one bit.  Y is shifted left in
+   place by _FP_WFRACXBITS_##fs.  The high quotient word is produced
+   first, then the low one; _FP_WORK_STICKY is ORed into the low word
+   when the final remainder and partial product differ.  */
+#define _FP_DIV_MEAT_2_udiv(fs, R, X, Y)				\
+  do									\
+    {									\
+      _FP_W_TYPE _FP_DIV_MEAT_2_udiv_n_f2;				\
+      _FP_W_TYPE _FP_DIV_MEAT_2_udiv_n_f1;				\
+      _FP_W_TYPE _FP_DIV_MEAT_2_udiv_n_f0;				\
+      _FP_W_TYPE _FP_DIV_MEAT_2_udiv_r_f1;				\
+      _FP_W_TYPE _FP_DIV_MEAT_2_udiv_r_f0;				\
+      _FP_W_TYPE _FP_DIV_MEAT_2_udiv_m_f1;				\
+      _FP_W_TYPE _FP_DIV_MEAT_2_udiv_m_f0;				\
+      if (_FP_FRAC_GE_2 (X, Y))						\
+	{								\
+	  _FP_DIV_MEAT_2_udiv_n_f2 = X##_f1 >> 1;			\
+	  _FP_DIV_MEAT_2_udiv_n_f1					\
+	    = X##_f1 << (_FP_W_TYPE_SIZE - 1) | X##_f0 >> 1;		\
+	  _FP_DIV_MEAT_2_udiv_n_f0					\
+	    = X##_f0 << (_FP_W_TYPE_SIZE - 1);				\
+	}								\
+      else								\
+	{								\
+	  R##_e--;							\
+	  _FP_DIV_MEAT_2_udiv_n_f2 = X##_f1;				\
+	  _FP_DIV_MEAT_2_udiv_n_f1 = X##_f0;				\
+	  _FP_DIV_MEAT_2_udiv_n_f0 = 0;					\
+	}								\
+									\
+      /* Normalize, i.e. make the most significant bit of the		\
+	 denominator set.  */						\
+      _FP_FRAC_SLL_2 (Y, _FP_WFRACXBITS_##fs);				\
+									\
+      udiv_qrnnd (R##_f1, _FP_DIV_MEAT_2_udiv_r_f1,			\
+		  _FP_DIV_MEAT_2_udiv_n_f2, _FP_DIV_MEAT_2_udiv_n_f1,	\
+		  Y##_f1);						\
+      umul_ppmm (_FP_DIV_MEAT_2_udiv_m_f1, _FP_DIV_MEAT_2_udiv_m_f0,	\
+		 R##_f1, Y##_f0);					\
+      _FP_DIV_MEAT_2_udiv_r_f0 = _FP_DIV_MEAT_2_udiv_n_f0;		\
+      if (_FP_FRAC_GT_2 (_FP_DIV_MEAT_2_udiv_m, _FP_DIV_MEAT_2_udiv_r))	\
+	{								\
+	  R##_f1--;							\
+	  _FP_FRAC_ADD_2 (_FP_DIV_MEAT_2_udiv_r, Y,			\
+			  _FP_DIV_MEAT_2_udiv_r);			\
+	  if (_FP_FRAC_GE_2 (_FP_DIV_MEAT_2_udiv_r, Y)			\
+	      && _FP_FRAC_GT_2 (_FP_DIV_MEAT_2_udiv_m,			\
+				_FP_DIV_MEAT_2_udiv_r))			\
+	    {								\
+	      R##_f1--;							\
+	      _FP_FRAC_ADD_2 (_FP_DIV_MEAT_2_udiv_r, Y,			\
+			      _FP_DIV_MEAT_2_udiv_r);			\
+	    }								\
+	}								\
+      _FP_FRAC_DEC_2 (_FP_DIV_MEAT_2_udiv_r, _FP_DIV_MEAT_2_udiv_m);	\
+									\
+      if (_FP_DIV_MEAT_2_udiv_r_f1 == Y##_f1)				\
+	{								\
+	  /* This is a special case, not an optimization		\
+	     (_FP_DIV_MEAT_2_udiv_r/Y##_f1 would not fit into UWtype).	\
+	     As _FP_DIV_MEAT_2_udiv_r is guaranteed to be < Y,		\
+	     R##_f0 can be either (UWtype)-1 or (UWtype)-2.  But as we	\
+	     know what kind of bits it is (sticky, guard, round),	\
+	     we don't care.  We also don't care what the remainder is,	\
+	     because the guard bit will be set anyway.  -jj */		\
+	  R##_f0 = -1;							\
+	}								\
+      else								\
+	{								\
+	  udiv_qrnnd (R##_f0, _FP_DIV_MEAT_2_udiv_r_f1,			\
+		      _FP_DIV_MEAT_2_udiv_r_f1,				\
+		      _FP_DIV_MEAT_2_udiv_r_f0, Y##_f1);		\
+	  umul_ppmm (_FP_DIV_MEAT_2_udiv_m_f1,				\
+		     _FP_DIV_MEAT_2_udiv_m_f0, R##_f0, Y##_f0);		\
+	  _FP_DIV_MEAT_2_udiv_r_f0 = 0;					\
+	  if (_FP_FRAC_GT_2 (_FP_DIV_MEAT_2_udiv_m,			\
+			     _FP_DIV_MEAT_2_udiv_r))			\
+	    {								\
+	      R##_f0--;							\
+	      _FP_FRAC_ADD_2 (_FP_DIV_MEAT_2_udiv_r, Y,			\
+			      _FP_DIV_MEAT_2_udiv_r);			\
+	      if (_FP_FRAC_GE_2 (_FP_DIV_MEAT_2_udiv_r, Y)		\
+		  && _FP_FRAC_GT_2 (_FP_DIV_MEAT_2_udiv_m,		\
+				    _FP_DIV_MEAT_2_udiv_r))		\
+		{							\
+		  R##_f0--;						\
+		  _FP_FRAC_ADD_2 (_FP_DIV_MEAT_2_udiv_r, Y,		\
+				  _FP_DIV_MEAT_2_udiv_r);		\
+		}							\
+	    }								\
+	  if (!_FP_FRAC_EQ_2 (_FP_DIV_MEAT_2_udiv_r,			\
+			      _FP_DIV_MEAT_2_udiv_m))			\
+	    R##_f0 |= _FP_WORK_STICKY;					\
+	}								\
+    }									\
+  while (0)
+
+
+/* Square root algorithms:
+   We have just one right now; a Newton approximation could be
+   added for those machines where division is fast.  */
+
+/* Bit-by-bit square root on two-word fractions.  q is the trial bit:
+   the first loop walks it down through the high word, then q is
+   reloaded with the top bit of a word and the second loop walks it
+   through the low word until it equals _FP_WORK_ROUND.  S and T are
+   working fractions, result bits are added into R, and X is shifted
+   left by one bit after every trial.  If X is still nonzero at the
+   end, _FP_WORK_STICKY is ORed into R##_f0, together with
+   _FP_WORK_ROUND when X is greater than S.  */
+#define _FP_SQRT_MEAT_2(R, S, T, X, q)				\
+  do								\
+    {								\
+      while (q)							\
+	{							\
+	  T##_f1 = S##_f1 + (q);				\
+	  if (T##_f1 <= X##_f1)					\
+	    {							\
+	      S##_f1 = T##_f1 + (q);				\
+	      X##_f1 -= T##_f1;					\
+	      R##_f1 += (q);					\
+	    }							\
+	  _FP_FRAC_SLL_2 (X, 1);				\
+	  (q) >>= 1;						\
+	}							\
+      (q) = (_FP_W_TYPE) 1 << (_FP_W_TYPE_SIZE - 1);		\
+      while ((q) != _FP_WORK_ROUND)				\
+	{							\
+	  T##_f0 = S##_f0 + (q);				\
+	  T##_f1 = S##_f1;					\
+	  if (T##_f1 < X##_f1					\
+	      || (T##_f1 == X##_f1 && T##_f0 <= X##_f0))	\
+	    {							\
+	      S##_f0 = T##_f0 + (q);				\
+	      S##_f1 += (T##_f0 > S##_f0);			\
+	      _FP_FRAC_DEC_2 (X, T);				\
+	      R##_f0 += (q);					\
+	    }							\
+	  _FP_FRAC_SLL_2 (X, 1);				\
+	  (q) >>= 1;						\
+	}							\
+      if (X##_f0 | X##_f1)					\
+	{							\
+	  if (S##_f1 < X##_f1					\
+	      || (S##_f1 == X##_f1 && S##_f0 < X##_f0))		\
+	    R##_f0 |= _FP_WORK_ROUND;				\
+	  R##_f0 |= _FP_WORK_STICKY;				\
+	}							\
+    }								\
+  while (0)
+
+
+/* Assembly/disassembly for converting to/from integral types.
+   No shifting or overflow handled here.  */
+
+/* Store the two-word fraction X into the integer lvalue r.  When rsize
+   fits in one word only the low word is stored; otherwise r is built
+   from the high word shifted up by _FP_W_TYPE_SIZE plus the low word.
+   The shift is performed on r itself, i.e. in r's own type.  */
+#define _FP_FRAC_ASSEMBLE_2(r, X, rsize)	\
+  (void) (((rsize) <= _FP_W_TYPE_SIZE)		\
+	  ? ({ (r) = X##_f0; })			\
+	  : ({					\
+	      (r) = X##_f1;			\
+	      (r) <<= _FP_W_TYPE_SIZE;		\
+	      (r) += X##_f0;			\
+	    }))
+
+/* Split the integer r into the two-word fraction X.  The low word is
+   assigned r directly; the high word receives r shifted right by
+   _FP_W_TYPE_SIZE, or zero when rsize fits in a single word.  */
+#define _FP_FRAC_DISASSEMBLE_2(X, r, rsize)	\
+  do						\
+    {						\
+      X##_f0 = (r);				\
+      if ((rsize) <= _FP_W_TYPE_SIZE)		\
+	X##_f1 = 0;				\
+      else					\
+	X##_f1 = (r) >> _FP_W_TYPE_SIZE;	\
+    }						\
+  while (0)
+
+/* Convert FP values between word sizes.  */
+
+/* Two-word source into one-word destination: only the low word of S
+   is kept.  */
+#define _FP_FRAC_COPY_1_2(D, S)		(D##_f = S##_f0)
+
+/* One-word source into two-word destination: the low word takes S and
+   the high word is cleared.  */
+#define _FP_FRAC_COPY_2_1(D, S)		((D##_f0 = S##_f), (D##_f1 = 0))
+
+/* Same width on both sides: forwards to _FP_FRAC_COPY_2.  */
+#define _FP_FRAC_COPY_2_2(D, S)		_FP_FRAC_COPY_2 (D, S)
+
+#endif /* !SOFT_FP_OP_2_H */
--- a/src/linpack/soft-fp/op-4.h
+++ b/src/linpack/soft-fp/op-4.h
@ -0,0 +1,882 @@
+/* Software floating-point emulation.
+   Basic four-word fraction declaration and manipulation.
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com),
+		  Jakub Jelinek (jj@ultra.linux.cz),
+		  David S. Miller (davem@redhat.com) and
+		  Peter Maydell (pmaydell@chiark.greenend.org.uk).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef SOFT_FP_OP_4_H
+#define SOFT_FP_OP_4_H	1
+
+/* A four-word fraction named X is held in the array X##_f of four
+   _FP_W_TYPE words.  Index 3 is the word returned by _FP_FRAC_HIGH_4
+   and index 0 the one returned by _FP_FRAC_LOW_4.  */
+#define _FP_FRAC_DECL_4(X)	_FP_W_TYPE X##_f[4]
+/* Word-by-word copy of S into D.  */
+#define _FP_FRAC_COPY_4(D, S)			\
+  (D##_f[0] = S##_f[0], D##_f[1] = S##_f[1],	\
+   D##_f[2] = S##_f[2], D##_f[3] = S##_f[3])
+/* I is expected to expand to four comma-separated word values, such as
+   one of the _FP_*FRAC_4 lists; the extra macro level lets it expand
+   before __FP_FRAC_SET_4 counts its arguments.  */
+#define _FP_FRAC_SET_4(X, I)	__FP_FRAC_SET_4 (X, I)
+#define _FP_FRAC_HIGH_4(X)	(X##_f[3])
+#define _FP_FRAC_LOW_4(X)	(X##_f[0])
+#define _FP_FRAC_WORD_4(X, w)	(X##_f[w])
+
+/* Shift the four-word fraction X left by N bits, in place.  N is split
+   into a whole-word count (skip) and a remaining bit count (up).  Words
+   are moved from the top index downward, and the loop index is carried
+   over into the final loop, which zeroes the vacated low words.
+   NOTE(review): array indices are not range-checked here; presumably
+   callers keep N below 4 * _FP_W_TYPE_SIZE -- confirm.  */
+#define _FP_FRAC_SLL_4(X, N)						\
+  do									\
+    {									\
+      _FP_I_TYPE _FP_FRAC_SLL_4_up, _FP_FRAC_SLL_4_down;		\
+      _FP_I_TYPE _FP_FRAC_SLL_4_skip, _FP_FRAC_SLL_4_i;			\
+      _FP_FRAC_SLL_4_skip = (N) / _FP_W_TYPE_SIZE;			\
+      _FP_FRAC_SLL_4_up = (N) % _FP_W_TYPE_SIZE;			\
+      _FP_FRAC_SLL_4_down = _FP_W_TYPE_SIZE - _FP_FRAC_SLL_4_up;	\
+      if (!_FP_FRAC_SLL_4_up)						\
+	for (_FP_FRAC_SLL_4_i = 3;					\
+	     _FP_FRAC_SLL_4_i >= _FP_FRAC_SLL_4_skip;			\
+	     --_FP_FRAC_SLL_4_i)					\
+	  X##_f[_FP_FRAC_SLL_4_i]					\
+	    = X##_f[_FP_FRAC_SLL_4_i-_FP_FRAC_SLL_4_skip];		\
+      else								\
+	{								\
+	  for (_FP_FRAC_SLL_4_i = 3;					\
+	       _FP_FRAC_SLL_4_i > _FP_FRAC_SLL_4_skip;			\
+	       --_FP_FRAC_SLL_4_i)					\
+	    X##_f[_FP_FRAC_SLL_4_i]					\
+	      = ((X##_f[_FP_FRAC_SLL_4_i-_FP_FRAC_SLL_4_skip]		\
+		  << _FP_FRAC_SLL_4_up)					\
+		 | (X##_f[_FP_FRAC_SLL_4_i-_FP_FRAC_SLL_4_skip-1]	\
+		    >> _FP_FRAC_SLL_4_down));				\
+	  X##_f[_FP_FRAC_SLL_4_i--] = X##_f[0] << _FP_FRAC_SLL_4_up;	\
+	}								\
+      for (; _FP_FRAC_SLL_4_i >= 0; --_FP_FRAC_SLL_4_i)			\
+	X##_f[_FP_FRAC_SLL_4_i] = 0;					\
+    }									\
+  while (0)
+
+/* Shift the four-word fraction X right by N bits, in place.  N is split
+   into a whole-word count (skip) and a remaining bit count (down).
+   Words are moved from index 0 upward, and the loop index is carried
+   over into the final loop, which zeroes the vacated high words.  Bits
+   shifted out at the bottom are discarded.
+   (Historical note from the original authors: this one was broken
+   too.)  */
+#define _FP_FRAC_SRL_4(X, N)						\
+  do									\
+    {									\
+      _FP_I_TYPE _FP_FRAC_SRL_4_up, _FP_FRAC_SRL_4_down;		\
+      _FP_I_TYPE _FP_FRAC_SRL_4_skip, _FP_FRAC_SRL_4_i;			\
+      _FP_FRAC_SRL_4_skip = (N) / _FP_W_TYPE_SIZE;			\
+      _FP_FRAC_SRL_4_down = (N) % _FP_W_TYPE_SIZE;			\
+      _FP_FRAC_SRL_4_up = _FP_W_TYPE_SIZE - _FP_FRAC_SRL_4_down;	\
+      if (!_FP_FRAC_SRL_4_down)						\
+	for (_FP_FRAC_SRL_4_i = 0;					\
+	     _FP_FRAC_SRL_4_i <= 3-_FP_FRAC_SRL_4_skip;			\
+	     ++_FP_FRAC_SRL_4_i)					\
+	  X##_f[_FP_FRAC_SRL_4_i]					\
+	    = X##_f[_FP_FRAC_SRL_4_i+_FP_FRAC_SRL_4_skip];		\
+      else								\
+	{								\
+	  for (_FP_FRAC_SRL_4_i = 0;					\
+	       _FP_FRAC_SRL_4_i < 3-_FP_FRAC_SRL_4_skip;		\
+	       ++_FP_FRAC_SRL_4_i)					\
+	    X##_f[_FP_FRAC_SRL_4_i]					\
+	      = ((X##_f[_FP_FRAC_SRL_4_i+_FP_FRAC_SRL_4_skip]		\
+		  >> _FP_FRAC_SRL_4_down)				\
+		 | (X##_f[_FP_FRAC_SRL_4_i+_FP_FRAC_SRL_4_skip+1]	\
+		    << _FP_FRAC_SRL_4_up));				\
+	  X##_f[_FP_FRAC_SRL_4_i++] = X##_f[3] >> _FP_FRAC_SRL_4_down;	\
+	}								\
+      for (; _FP_FRAC_SRL_4_i < 4; ++_FP_FRAC_SRL_4_i)			\
+	X##_f[_FP_FRAC_SRL_4_i] = 0;					\
+    }									\
+  while (0)
+
+
+/* Right shift with a separate sticky result.
+   X is shifted right by N bits exactly as a standard right shift
+   would do, and S is set to 1 if any of the bits that fell off the
+   right hand side were one, 0 otherwise.  X's least significant bit is
+   not touched here; _FP_FRAC_SRS_4 is the variant that ORs the sticky
+   flag into it.  The size argument is not used by this macro.  */
+#define _FP_FRAC_SRST_4(X, S, N, size)					\
+  do									\
+    {									\
+      _FP_I_TYPE _FP_FRAC_SRST_4_up, _FP_FRAC_SRST_4_down;		\
+      _FP_I_TYPE _FP_FRAC_SRST_4_skip, _FP_FRAC_SRST_4_i;		\
+      _FP_W_TYPE _FP_FRAC_SRST_4_s;					\
+      _FP_FRAC_SRST_4_skip = (N) / _FP_W_TYPE_SIZE;			\
+      _FP_FRAC_SRST_4_down = (N) % _FP_W_TYPE_SIZE;			\
+      _FP_FRAC_SRST_4_up = _FP_W_TYPE_SIZE - _FP_FRAC_SRST_4_down;	\
+      for (_FP_FRAC_SRST_4_s = _FP_FRAC_SRST_4_i = 0;			\
+	   _FP_FRAC_SRST_4_i < _FP_FRAC_SRST_4_skip;			\
+	   ++_FP_FRAC_SRST_4_i)						\
+	_FP_FRAC_SRST_4_s |= X##_f[_FP_FRAC_SRST_4_i];			\
+      if (!_FP_FRAC_SRST_4_down)					\
+	for (_FP_FRAC_SRST_4_i = 0;					\
+	     _FP_FRAC_SRST_4_i <= 3-_FP_FRAC_SRST_4_skip;		\
+	     ++_FP_FRAC_SRST_4_i)					\
+	  X##_f[_FP_FRAC_SRST_4_i]					\
+	    = X##_f[_FP_FRAC_SRST_4_i+_FP_FRAC_SRST_4_skip];		\
+      else								\
+	{								\
+	  _FP_FRAC_SRST_4_s						\
+	    |= X##_f[_FP_FRAC_SRST_4_i] << _FP_FRAC_SRST_4_up;		\
+	  for (_FP_FRAC_SRST_4_i = 0;					\
+	       _FP_FRAC_SRST_4_i < 3-_FP_FRAC_SRST_4_skip;		\
+	       ++_FP_FRAC_SRST_4_i)					\
+	    X##_f[_FP_FRAC_SRST_4_i]					\
+	      = ((X##_f[_FP_FRAC_SRST_4_i+_FP_FRAC_SRST_4_skip]		\
+		  >> _FP_FRAC_SRST_4_down)				\
+		 | (X##_f[_FP_FRAC_SRST_4_i+_FP_FRAC_SRST_4_skip+1]	\
+		    << _FP_FRAC_SRST_4_up));				\
+	  X##_f[_FP_FRAC_SRST_4_i++]					\
+	    = X##_f[3] >> _FP_FRAC_SRST_4_down;				\
+	}								\
+      for (; _FP_FRAC_SRST_4_i < 4; ++_FP_FRAC_SRST_4_i)		\
+	X##_f[_FP_FRAC_SRST_4_i] = 0;					\
+      S = (_FP_FRAC_SRST_4_s != 0);					\
+    }									\
+  while (0)
+
+/* Right shift with sticky-lsb: shift X right by N through
+   _FP_FRAC_SRST_4, then OR the sticky flag it reports into the lowest
+   word of X.  */
+#define _FP_FRAC_SRS_4(X, N, size)				\
+  do								\
+    {								\
+      int _FP_FRAC_SRS_4_sticky;				\
+      _FP_FRAC_SRST_4 (X, _FP_FRAC_SRS_4_sticky, (N), (size));	\
+      X##_f[0] |= _FP_FRAC_SRS_4_sticky;			\
+    }								\
+  while (0)
+
+/* The four macros below only spell out the words of their fraction
+   arguments, highest index first, and forward them to the matching
+   __FP_FRAC_*_4 internal, which does the actual arithmetic.  */
+
+/* R, X, Y: result and the two operands handed to __FP_FRAC_ADD_4.  */
+#define _FP_FRAC_ADD_4(R, X, Y)					\
+  __FP_FRAC_ADD_4 (R##_f[3], R##_f[2], R##_f[1], R##_f[0],	\
+		   X##_f[3], X##_f[2], X##_f[1], X##_f[0],	\
+		   Y##_f[3], Y##_f[2], Y##_f[1], Y##_f[0])
+
+/* R, X, Y: result and the two operands handed to __FP_FRAC_SUB_4.  */
+#define _FP_FRAC_SUB_4(R, X, Y)					\
+  __FP_FRAC_SUB_4 (R##_f[3], R##_f[2], R##_f[1], R##_f[0],	\
+		   X##_f[3], X##_f[2], X##_f[1], X##_f[0],	\
+		   Y##_f[3], Y##_f[2], Y##_f[1], Y##_f[0])
+
+/* X is both operand and destination of __FP_FRAC_DEC_4; Y is the other
+   operand.  */
+#define _FP_FRAC_DEC_4(X, Y)					\
+  __FP_FRAC_DEC_4 (X##_f[3], X##_f[2], X##_f[1], X##_f[0],	\
+		   Y##_f[3], Y##_f[2], Y##_f[1], Y##_f[0])
+
+/* X is both operand and destination of __FP_FRAC_ADDI_4; I is passed
+   through unchanged as a single argument.  */
+#define _FP_FRAC_ADDI_4(X, I)					\
+  __FP_FRAC_ADDI_4 (X##_f[3], X##_f[2], X##_f[1], X##_f[0], I)
+
+/* Four-word constant lists, written with the word for index 3 first
+   and the word for index 0 last, in the argument order of
+   __FP_FRAC_SET_4.  _FP_MAXFRAC_4 sets every bit of every word.  */
+#define _FP_ZEROFRAC_4  0, 0, 0, 0
+#define _FP_MINFRAC_4   0, 0, 0, 1
+#define _FP_MAXFRAC_4	(~(_FP_WS_TYPE) 0), (~(_FP_WS_TYPE) 0), (~(_FP_WS_TYPE) 0), (~(_FP_WS_TYPE) 0)
+
+/* True when all four words of X are zero.  */
+#define _FP_FRAC_ZEROP_4(X)     ((X##_f[0] | X##_f[1] | X##_f[2] | X##_f[3]) == 0)
+/* True when the highest word of X, reinterpreted as _FP_WS_TYPE, is
+   negative.  */
+#define _FP_FRAC_NEGP_4(X)      ((_FP_WS_TYPE) X##_f[3] < 0)
+/* The next three select their masks and high-word accessor from the
+   format name fs (_FP_OVERFLOW_##fs, _FP_HIGHBIT_DW_##fs,
+   _FP_FRAC_HIGH_##fs, _FP_FRAC_HIGH_DW_##fs); OVERP and HIGHBIT_DW
+   return the masked bits, CLEAR_OVERP clears them in place.  */
+#define _FP_FRAC_OVERP_4(fs, X)  (_FP_FRAC_HIGH_##fs (X) & _FP_OVERFLOW_##fs)
+#define _FP_FRAC_HIGHBIT_DW_4(fs, X)	\
+  (_FP_FRAC_HIGH_DW_##fs (X) & _FP_HIGHBIT_DW_##fs)
+#define _FP_FRAC_CLEAR_OVERP_4(fs, X)  (_FP_FRAC_HIGH_##fs (X) &= ~_FP_OVERFLOW_##fs)
+
+/* Equality of two four-word fractions: 1 when no word differs,
+   0 otherwise.  */
+#define _FP_FRAC_EQ_4(X, Y)				\
+  (!(X##_f[0] != Y##_f[0] || X##_f[1] != Y##_f[1]	\
+     || X##_f[2] != Y##_f[2] || X##_f[3] != Y##_f[3]))
+
+/* "Greater than" on four-word fractions.  Words are compared from
+   index 3 down to index 0 and the first pair that differs decides;
+   equal fractions yield 0.  */
+#define _FP_FRAC_GT_4(X, Y)				\
+  (X##_f[3] != Y##_f[3] ? X##_f[3] > Y##_f[3]		\
+   : X##_f[2] != Y##_f[2] ? X##_f[2] > Y##_f[2]	\
+   : X##_f[1] != Y##_f[1] ? X##_f[1] > Y##_f[1]	\
+   : X##_f[0] > Y##_f[0])
+
+/* "Greater than or equal" on four-word fractions.  Words are compared
+   from index 3 down to index 0 and the first pair that differs
+   decides; equal fractions yield 1.  */
+#define _FP_FRAC_GE_4(X, Y)				\
+  (X##_f[3] != Y##_f[3] ? X##_f[3] > Y##_f[3]		\
+   : X##_f[2] != Y##_f[2] ? X##_f[2] > Y##_f[2]	\
+   : X##_f[1] != Y##_f[1] ? X##_f[1] > Y##_f[1]	\
+   : X##_f[0] >= Y##_f[0])
+
+
+/* Count leading zeros of the four-word fraction X into R.  The scan
+   starts at word 3 and stops at the first nonzero word (or at word 0
+   if none is nonzero); __FP_CLZ is applied to that word and one
+   _FP_W_TYPE_SIZE is added for every higher word that was skipped.  */
+#define _FP_FRAC_CLZ_4(R, X)					\
+  do								\
+    {								\
+      int _FP_FRAC_CLZ_4_w = 3;					\
+      while (_FP_FRAC_CLZ_4_w > 0 && !X##_f[_FP_FRAC_CLZ_4_w])	\
+	--_FP_FRAC_CLZ_4_w;					\
+      __FP_CLZ ((R), X##_f[_FP_FRAC_CLZ_4_w]);			\
+      (R) += _FP_W_TYPE_SIZE * (3 - _FP_FRAC_CLZ_4_w);		\
+    }								\
+  while (0)
+
+
+/* Split the value val of format fs into its raw fields: val is copied
+   into a union _FP_UNION_##fs and the sign, exponent and four fraction
+   words are read back out of its bits view into X##_s, X##_e and
+   X##_f[3..0].  */
+#define _FP_UNPACK_RAW_4(fs, X, val)			\
+  do							\
+    {							\
+      union _FP_UNION_##fs _FP_UNPACK_RAW_4_raw;	\
+      _FP_UNPACK_RAW_4_raw.flt = (val);			\
+      X##_s = _FP_UNPACK_RAW_4_raw.bits.sign;		\
+      X##_e = _FP_UNPACK_RAW_4_raw.bits.exp;		\
+      X##_f[3] = _FP_UNPACK_RAW_4_raw.bits.frac3;	\
+      X##_f[2] = _FP_UNPACK_RAW_4_raw.bits.frac2;	\
+      X##_f[1] = _FP_UNPACK_RAW_4_raw.bits.frac1;	\
+      X##_f[0] = _FP_UNPACK_RAW_4_raw.bits.frac0;	\
+    }							\
+  while (0)
+
+/* Pointer variant of the raw unpack: val is cast to a pointer to
+   union _FP_UNION_##fs and the sign, exponent and four fraction words
+   are read through it into X##_s, X##_e and X##_f[3..0].  */
+#define _FP_UNPACK_RAW_4_P(fs, X, val)			\
+  do							\
+    {							\
+      union _FP_UNION_##fs *_FP_UNPACK_RAW_4_P_src	\
+	= (union _FP_UNION_##fs *) (val);		\
+							\
+      X##_s = _FP_UNPACK_RAW_4_P_src->bits.sign;	\
+      X##_e = _FP_UNPACK_RAW_4_P_src->bits.exp;		\
+      X##_f[3] = _FP_UNPACK_RAW_4_P_src->bits.frac3;	\
+      X##_f[2] = _FP_UNPACK_RAW_4_P_src->bits.frac2;	\
+      X##_f[1] = _FP_UNPACK_RAW_4_P_src->bits.frac1;	\
+      X##_f[0] = _FP_UNPACK_RAW_4_P_src->bits.frac0;	\
+    }							\
+  while (0)
+
+/* Assemble a value of format fs from raw fields: the sign, exponent and
+   four fraction words of X are written into the bits view of a local
+   union _FP_UNION_##fs, whose flt view is then assigned to val.  */
+#define _FP_PACK_RAW_4(fs, val, X)		\
+  do						\
+    {						\
+      union _FP_UNION_##fs _FP_PACK_RAW_4_raw;	\
+      _FP_PACK_RAW_4_raw.bits.sign  = X##_s;	\
+      _FP_PACK_RAW_4_raw.bits.exp   = X##_e;	\
+      _FP_PACK_RAW_4_raw.bits.frac3 = X##_f[3];	\
+      _FP_PACK_RAW_4_raw.bits.frac2 = X##_f[2];	\
+      _FP_PACK_RAW_4_raw.bits.frac1 = X##_f[1];	\
+      _FP_PACK_RAW_4_raw.bits.frac0 = X##_f[0];	\
+      (val) = _FP_PACK_RAW_4_raw.flt;		\
+    }						\
+  while (0)
+
+/* Pointer variant of the raw pack: val is cast to a pointer to
+   union _FP_UNION_##fs and the sign, exponent and four fraction words
+   of X are stored through it.  */
+#define _FP_PACK_RAW_4_P(fs, val, X)			\
+  do							\
+    {							\
+      union _FP_UNION_##fs *_FP_PACK_RAW_4_P_dst	\
+	= (union _FP_UNION_##fs *) (val);		\
+							\
+      _FP_PACK_RAW_4_P_dst->bits.sign  = X##_s;		\
+      _FP_PACK_RAW_4_P_dst->bits.exp   = X##_e;		\
+      _FP_PACK_RAW_4_P_dst->bits.frac3 = X##_f[3];	\
+      _FP_PACK_RAW_4_P_dst->bits.frac2 = X##_f[2];	\
+      _FP_PACK_RAW_4_P_dst->bits.frac1 = X##_f[1];	\
+      _FP_PACK_RAW_4_P_dst->bits.frac0 = X##_f[0];	\
+    }							\
+  while (0)
+
+/* Multiplication algorithms: */
+
+/* Given a 1W * 1W => 2W primitive, do the extended multiplication.  */
+
+/* Double-width multiply of two four-word fractions.  R is an eight-word
+   fraction (accessed through _FP_FRAC_WORD_8) that receives the sum of
+   all sixteen word-by-word products of X and Y.  doit is invoked as
+   doit (high, low, a, b) and must leave the two-word product of the
+   words a and b in high:low.  Products are formed in batches into the
+   two-word temporaries b..f and folded into R from the low words
+   upward with __FP_FRAC_ADD_3, finishing with __FP_FRAC_ADD_2 for the
+   top pair.  Several of the additions name the same R word as both an
+   output and an input, so the statement order must be preserved.
+   wfracbits is not used by this macro.  */
+#define _FP_MUL_MEAT_DW_4_wide(wfracbits, R, X, Y, doit)		\
+  do									\
+    {									\
+      _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_4_wide_b);			\
+      _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_4_wide_c);			\
+      _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_4_wide_d);			\
+      _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_4_wide_e);			\
+      _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_4_wide_f);			\
+									\
+      doit (_FP_FRAC_WORD_8 (R, 1), _FP_FRAC_WORD_8 (R, 0),		\
+	    X##_f[0], Y##_f[0]);					\
+      doit (_FP_MUL_MEAT_DW_4_wide_b_f1, _FP_MUL_MEAT_DW_4_wide_b_f0,	\
+	    X##_f[0], Y##_f[1]);					\
+      doit (_FP_MUL_MEAT_DW_4_wide_c_f1, _FP_MUL_MEAT_DW_4_wide_c_f0,	\
+	    X##_f[1], Y##_f[0]);					\
+      doit (_FP_MUL_MEAT_DW_4_wide_d_f1, _FP_MUL_MEAT_DW_4_wide_d_f0,	\
+	    X##_f[1], Y##_f[1]);					\
+      doit (_FP_MUL_MEAT_DW_4_wide_e_f1, _FP_MUL_MEAT_DW_4_wide_e_f0,	\
+	    X##_f[0], Y##_f[2]);					\
+      doit (_FP_MUL_MEAT_DW_4_wide_f_f1, _FP_MUL_MEAT_DW_4_wide_f_f0,	\
+	    X##_f[2], Y##_f[0]);					\
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 3), _FP_FRAC_WORD_8 (R, 2),	\
+		       _FP_FRAC_WORD_8 (R, 1), 0,			\
+		       _FP_MUL_MEAT_DW_4_wide_b_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_b_f0,			\
+		       0, 0, _FP_FRAC_WORD_8 (R, 1));			\
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 3), _FP_FRAC_WORD_8 (R, 2),	\
+		       _FP_FRAC_WORD_8 (R, 1), 0,			\
+		       _FP_MUL_MEAT_DW_4_wide_c_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_c_f0,			\
+		       _FP_FRAC_WORD_8 (R, 3), _FP_FRAC_WORD_8 (R, 2),	\
+		       _FP_FRAC_WORD_8 (R, 1));				\
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 4), _FP_FRAC_WORD_8 (R, 3),	\
+		       _FP_FRAC_WORD_8 (R, 2), 0,			\
+		       _FP_MUL_MEAT_DW_4_wide_d_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_d_f0,			\
+		       0, _FP_FRAC_WORD_8 (R, 3), _FP_FRAC_WORD_8 (R, 2)); \
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 4), _FP_FRAC_WORD_8 (R, 3),	\
+		       _FP_FRAC_WORD_8 (R, 2), 0,			\
+		       _FP_MUL_MEAT_DW_4_wide_e_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_e_f0,			\
+		       _FP_FRAC_WORD_8 (R, 4), _FP_FRAC_WORD_8 (R, 3),	\
+		       _FP_FRAC_WORD_8 (R, 2));				\
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 4), _FP_FRAC_WORD_8 (R, 3),	\
+		       _FP_FRAC_WORD_8 (R, 2), 0,			\
+		       _FP_MUL_MEAT_DW_4_wide_f_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_f_f0,			\
+		       _FP_FRAC_WORD_8 (R, 4), _FP_FRAC_WORD_8 (R, 3),	\
+		       _FP_FRAC_WORD_8 (R, 2));				\
+      doit (_FP_MUL_MEAT_DW_4_wide_b_f1,				\
+	    _FP_MUL_MEAT_DW_4_wide_b_f0, X##_f[0], Y##_f[3]);		\
+      doit (_FP_MUL_MEAT_DW_4_wide_c_f1,				\
+	    _FP_MUL_MEAT_DW_4_wide_c_f0, X##_f[3], Y##_f[0]);		\
+      doit (_FP_MUL_MEAT_DW_4_wide_d_f1, _FP_MUL_MEAT_DW_4_wide_d_f0,	\
+	    X##_f[1], Y##_f[2]);					\
+      doit (_FP_MUL_MEAT_DW_4_wide_e_f1, _FP_MUL_MEAT_DW_4_wide_e_f0,	\
+	    X##_f[2], Y##_f[1]);					\
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 5), _FP_FRAC_WORD_8 (R, 4),	\
+		       _FP_FRAC_WORD_8 (R, 3), 0,			\
+		       _FP_MUL_MEAT_DW_4_wide_b_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_b_f0,			\
+		       0, _FP_FRAC_WORD_8 (R, 4), _FP_FRAC_WORD_8 (R, 3)); \
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 5), _FP_FRAC_WORD_8 (R, 4),	\
+		       _FP_FRAC_WORD_8 (R, 3), 0,			\
+		       _FP_MUL_MEAT_DW_4_wide_c_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_c_f0,			\
+		       _FP_FRAC_WORD_8 (R, 5), _FP_FRAC_WORD_8 (R, 4),	\
+		       _FP_FRAC_WORD_8 (R, 3));				\
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 5), _FP_FRAC_WORD_8 (R, 4),	\
+		       _FP_FRAC_WORD_8 (R, 3), 0,			\
+		       _FP_MUL_MEAT_DW_4_wide_d_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_d_f0,			\
+		       _FP_FRAC_WORD_8 (R, 5), _FP_FRAC_WORD_8 (R, 4),	\
+		       _FP_FRAC_WORD_8 (R, 3));				\
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 5), _FP_FRAC_WORD_8 (R, 4),	\
+		       _FP_FRAC_WORD_8 (R, 3), 0,			\
+		       _FP_MUL_MEAT_DW_4_wide_e_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_e_f0,			\
+		       _FP_FRAC_WORD_8 (R, 5), _FP_FRAC_WORD_8 (R, 4),	\
+		       _FP_FRAC_WORD_8 (R, 3));				\
+      doit (_FP_MUL_MEAT_DW_4_wide_b_f1, _FP_MUL_MEAT_DW_4_wide_b_f0,	\
+	    X##_f[2], Y##_f[2]);					\
+      doit (_FP_MUL_MEAT_DW_4_wide_c_f1, _FP_MUL_MEAT_DW_4_wide_c_f0,	\
+	    X##_f[1], Y##_f[3]);					\
+      doit (_FP_MUL_MEAT_DW_4_wide_d_f1, _FP_MUL_MEAT_DW_4_wide_d_f0,	\
+	    X##_f[3], Y##_f[1]);					\
+      doit (_FP_MUL_MEAT_DW_4_wide_e_f1, _FP_MUL_MEAT_DW_4_wide_e_f0,	\
+	    X##_f[2], Y##_f[3]);					\
+      doit (_FP_MUL_MEAT_DW_4_wide_f_f1, _FP_MUL_MEAT_DW_4_wide_f_f0,	\
+	    X##_f[3], Y##_f[2]);					\
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 6), _FP_FRAC_WORD_8 (R, 5),	\
+		       _FP_FRAC_WORD_8 (R, 4), 0,			\
+		       _FP_MUL_MEAT_DW_4_wide_b_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_b_f0,			\
+		       0, _FP_FRAC_WORD_8 (R, 5), _FP_FRAC_WORD_8 (R, 4)); \
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 6), _FP_FRAC_WORD_8 (R, 5),	\
+		       _FP_FRAC_WORD_8 (R, 4), 0,			\
+		       _FP_MUL_MEAT_DW_4_wide_c_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_c_f0,			\
+		       _FP_FRAC_WORD_8 (R, 6), _FP_FRAC_WORD_8 (R, 5),	\
+		       _FP_FRAC_WORD_8 (R, 4));				\
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 6), _FP_FRAC_WORD_8 (R, 5),	\
+		       _FP_FRAC_WORD_8 (R, 4), 0,			\
+		       _FP_MUL_MEAT_DW_4_wide_d_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_d_f0,			\
+		       _FP_FRAC_WORD_8 (R, 6), _FP_FRAC_WORD_8 (R, 5),	\
+		       _FP_FRAC_WORD_8 (R, 4));				\
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 7), _FP_FRAC_WORD_8 (R, 6),	\
+		       _FP_FRAC_WORD_8 (R, 5), 0,			\
+		       _FP_MUL_MEAT_DW_4_wide_e_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_e_f0,			\
+		       0, _FP_FRAC_WORD_8 (R, 6), _FP_FRAC_WORD_8 (R, 5)); \
+      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 7), _FP_FRAC_WORD_8 (R, 6),	\
+		       _FP_FRAC_WORD_8 (R, 5), 0,			\
+		       _FP_MUL_MEAT_DW_4_wide_f_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_f_f0,			\
+		       _FP_FRAC_WORD_8 (R, 7), _FP_FRAC_WORD_8 (R, 6),	\
+		       _FP_FRAC_WORD_8 (R, 5));				\
+      doit (_FP_MUL_MEAT_DW_4_wide_b_f1, _FP_MUL_MEAT_DW_4_wide_b_f0,	\
+	    X##_f[3], Y##_f[3]);					\
+      __FP_FRAC_ADD_2 (_FP_FRAC_WORD_8 (R, 7), _FP_FRAC_WORD_8 (R, 6),	\
+		       _FP_MUL_MEAT_DW_4_wide_b_f1,			\
+		       _FP_MUL_MEAT_DW_4_wide_b_f0,			\
+		       _FP_FRAC_WORD_8 (R, 7), _FP_FRAC_WORD_8 (R, 6));	\
+    }									\
+  while (0)
+
+/* Multiply the four-word fractions X and Y into the four-word fraction
+   R.  The eight-word product is formed in a local temporary by
+   _FP_MUL_MEAT_DW_4_wide, shifted right by wfracbits-1 with sticky
+   (_FP_FRAC_SRS_8), and its four low words are then copied into R.
+   doit is passed through to the double-width multiply.  */
+#define _FP_MUL_MEAT_4_wide(wfracbits, R, X, Y, doit)			\
+  do									\
+    {									\
+      _FP_FRAC_DECL_8 (_FP_MUL_MEAT_4_wide_z);				\
+									\
+      _FP_MUL_MEAT_DW_4_wide ((wfracbits), _FP_MUL_MEAT_4_wide_z,	\
+			      X, Y, doit);				\
+									\
+      /* Normalize since we know where the msb of the multiplicands	\
+	 were (bit B), we know that the msb of the product is		\
+	 at either 2B or 2B-1.  */					\
+      _FP_FRAC_SRS_8 (_FP_MUL_MEAT_4_wide_z, (wfracbits)-1,		\
+		      2*(wfracbits));					\
+      __FP_FRAC_SET_4 (R, _FP_FRAC_WORD_8 (_FP_MUL_MEAT_4_wide_z, 3),	\
+		       _FP_FRAC_WORD_8 (_FP_MUL_MEAT_4_wide_z, 2),	\
+		       _FP_FRAC_WORD_8 (_FP_MUL_MEAT_4_wide_z, 1),	\
+		       _FP_FRAC_WORD_8 (_FP_MUL_MEAT_4_wide_z, 0));	\
+    }									\
+  while (0)
+
+/* Double-width multiply through GMP: the eight-word fraction R receives
+   the full product of the four-word fractions X and Y.  mpn_mul_n
+   takes the destination limb array, the two source limb arrays and the
+   limb count, so the word arrays R##_f, X##_f and Y##_f are passed
+   directly.  (The operands used to be spelled with the fixed
+   identifiers _x_f and _y_f, which ignored the X and Y arguments.)
+   wfracbits is not used by this macro.  */
+#define _FP_MUL_MEAT_DW_4_gmp(wfracbits, R, X, Y)	\
+  do							\
+    {							\
+      mpn_mul_n (R##_f, X##_f, Y##_f, 4);		\
+    }							\
+  while (0)
+
+/* Multiply the four-word fractions X and Y into the four-word fraction
+   R.  The eight-word product is formed in a local temporary by
+   _FP_MUL_MEAT_DW_4_gmp, shifted right by wfracbits-1 with sticky
+   (_FP_FRAC_SRS_8), and its four low words are then copied into R.  */
+#define _FP_MUL_MEAT_4_gmp(wfracbits, R, X, Y)				\
+  do									\
+    {									\
+      _FP_FRAC_DECL_8 (_FP_MUL_MEAT_4_gmp_z);				\
+									\
+      _FP_MUL_MEAT_DW_4_gmp ((wfracbits), _FP_MUL_MEAT_4_gmp_z, X, Y);	\
+									\
+      /* Normalize since we know where the msb of the multiplicands	\
+	 were (bit B), we know that the msb of the product is		\
+	 at either 2B or 2B-1.  */					\
+      _FP_FRAC_SRS_8 (_FP_MUL_MEAT_4_gmp_z, (wfracbits)-1,		\
+		      2*(wfracbits));					\
+      __FP_FRAC_SET_4 (R, _FP_FRAC_WORD_8 (_FP_MUL_MEAT_4_gmp_z, 3),	\
+		       _FP_FRAC_WORD_8 (_FP_MUL_MEAT_4_gmp_z, 2),	\
+		       _FP_FRAC_WORD_8 (_FP_MUL_MEAT_4_gmp_z, 1),	\
+		       _FP_FRAC_WORD_8 (_FP_MUL_MEAT_4_gmp_z, 0));	\
+    }									\
+  while (0)
+
+/* Helper utility for _FP_DIV_MEAT_4_udiv:
+ * pppp = m * nnn.
+ * m is a single word and n2, n1, n0 are the three words of the other
+ * factor; the result is delivered in the four words p3, p2, p1, p0.
+ * Each word product comes from umul_ppmm; for the n1 and n2 products
+ * the low half goes through a temporary and is then added into the
+ * running pair with __FP_FRAC_ADDI_2.  */
+#define umul_ppppmnnn(p3, p2, p1, p0, m, n2, n1, n0)	\
+  do							\
+    {							\
+      UWtype umul_ppppmnnn_t;				\
+      umul_ppmm (p1, p0, m, n0);			\
+      umul_ppmm (p2, umul_ppppmnnn_t, m, n1);		\
+      __FP_FRAC_ADDI_2 (p2, p1, umul_ppppmnnn_t);	\
+      umul_ppmm (p3, umul_ppppmnnn_t, m, n2);		\
+      __FP_FRAC_ADDI_2 (p3, p2, umul_ppppmnnn_t);	\
+    }							\
+  while (0)
+
+/* Division algorithms: */
+
+/* Divide the four-word fraction X by the four-word fraction Y, filling
+   the quotient words of R from index 3 down to index 0, one word per
+   loop iteration.  Both X and Y are modified: when X >= Y, X is
+   shifted right by one bit and the bit shifted out is kept in the
+   local n_f[3]; otherwise R##_e is decremented.  Y is shifted left by
+   _FP_WFRACXBITS_##fs.  Each iteration either takes the special case
+   (top words equal) or estimates a quotient word with udiv_qrnnd and
+   corrects it by at most two using the product from umul_ppppmnnn.
+   On the last iteration of the regular path _FP_WORK_STICKY may be
+   ORed into R##_f[0].
+   NOTE(review): that final sticky test compares X with the partial
+   product m after m has already been subtracted from X; confirm that
+   this, rather than a test of X against zero, is what is intended.  */
+#define _FP_DIV_MEAT_4_udiv(fs, R, X, Y)				\
+  do									\
+    {									\
+      int _FP_DIV_MEAT_4_udiv_i;					\
+      _FP_FRAC_DECL_4 (_FP_DIV_MEAT_4_udiv_n);				\
+      _FP_FRAC_DECL_4 (_FP_DIV_MEAT_4_udiv_m);				\
+      _FP_FRAC_SET_4 (_FP_DIV_MEAT_4_udiv_n, _FP_ZEROFRAC_4);		\
+      if (_FP_FRAC_GE_4 (X, Y))						\
+	{								\
+	  _FP_DIV_MEAT_4_udiv_n_f[3]					\
+	    = X##_f[0] << (_FP_W_TYPE_SIZE - 1);			\
+	  _FP_FRAC_SRL_4 (X, 1);					\
+	}								\
+      else								\
+	R##_e--;							\
+									\
+      /* Normalize, i.e. make the most significant bit of the		\
+	 denominator set.  */						\
+      _FP_FRAC_SLL_4 (Y, _FP_WFRACXBITS_##fs);				\
+									\
+      for (_FP_DIV_MEAT_4_udiv_i = 3; ; _FP_DIV_MEAT_4_udiv_i--)	\
+	{								\
+	  if (X##_f[3] == Y##_f[3])					\
+	    {								\
+	      /* This is a special case, not an optimization		\
+		 (X##_f[3]/Y##_f[3] would not fit into UWtype).		\
+		 As X## is guaranteed to be < Y,			\
+		 R##_f[_FP_DIV_MEAT_4_udiv_i] can be either		\
+		 (UWtype)-1 or (UWtype)-2.  */				\
+	      R##_f[_FP_DIV_MEAT_4_udiv_i] = -1;			\
+	      if (!_FP_DIV_MEAT_4_udiv_i)				\
+		break;							\
+	      __FP_FRAC_SUB_4 (X##_f[3], X##_f[2], X##_f[1], X##_f[0],	\
+			       Y##_f[2], Y##_f[1], Y##_f[0], 0,		\
+			       X##_f[2], X##_f[1], X##_f[0],		\
+			       _FP_DIV_MEAT_4_udiv_n_f[_FP_DIV_MEAT_4_udiv_i]); \
+	      _FP_FRAC_SUB_4 (X, Y, X);					\
+	      if (X##_f[3] > Y##_f[3])					\
+		{							\
+		  R##_f[_FP_DIV_MEAT_4_udiv_i] = -2;			\
+		  _FP_FRAC_ADD_4 (X, Y, X);				\
+		}							\
+	    }								\
+	  else								\
+	    {								\
+	      udiv_qrnnd (R##_f[_FP_DIV_MEAT_4_udiv_i],			\
+			  X##_f[3], X##_f[3], X##_f[2], Y##_f[3]);	\
+	      umul_ppppmnnn (_FP_DIV_MEAT_4_udiv_m_f[3],		\
+			     _FP_DIV_MEAT_4_udiv_m_f[2],		\
+			     _FP_DIV_MEAT_4_udiv_m_f[1],		\
+			     _FP_DIV_MEAT_4_udiv_m_f[0],		\
+			     R##_f[_FP_DIV_MEAT_4_udiv_i],		\
+			     Y##_f[2], Y##_f[1], Y##_f[0]);		\
+	      X##_f[2] = X##_f[1];					\
+	      X##_f[1] = X##_f[0];					\
+	      X##_f[0]							\
+		= _FP_DIV_MEAT_4_udiv_n_f[_FP_DIV_MEAT_4_udiv_i];	\
+	      if (_FP_FRAC_GT_4 (_FP_DIV_MEAT_4_udiv_m, X))		\
+		{							\
+		  R##_f[_FP_DIV_MEAT_4_udiv_i]--;			\
+		  _FP_FRAC_ADD_4 (X, Y, X);				\
+		  if (_FP_FRAC_GE_4 (X, Y)				\
+		      && _FP_FRAC_GT_4 (_FP_DIV_MEAT_4_udiv_m, X))	\
+		    {							\
+		      R##_f[_FP_DIV_MEAT_4_udiv_i]--;			\
+		      _FP_FRAC_ADD_4 (X, Y, X);				\
+		    }							\
+		}							\
+	      _FP_FRAC_DEC_4 (X, _FP_DIV_MEAT_4_udiv_m);		\
+	      if (!_FP_DIV_MEAT_4_udiv_i)				\
+		{							\
+		  if (!_FP_FRAC_EQ_4 (X, _FP_DIV_MEAT_4_udiv_m))	\
+		    R##_f[0] |= _FP_WORK_STICKY;			\
+		  break;						\
+		}							\
+	    }								\
+	}								\
+    }									\
+  while (0)
+
+
+/* Square root algorithms:
+   We have just one right now; a Newton approximation could be
+   added for those machines where division is fast.  */
+
+/* Bit-by-bit square root on four-word fractions.  Each of the four
+   loops produces the result bits of one word of R, from word 3 down to
+   word 0.  q is the trial bit: the first loop uses the value passed
+   in, and q is reloaded with the top bit of a word before each of the
+   later loops.  The first three loops run until q is zero; the last
+   one stops when q equals _FP_WORK_ROUND.  S and T are working
+   fractions, and X is shifted left by one bit after every trial.  If X
+   is still nonzero at the end, _FP_WORK_STICKY is ORed into R##_f[0],
+   together with _FP_WORK_ROUND when X is greater than S.  */
+#define _FP_SQRT_MEAT_4(R, S, T, X, q)					\
+  do									\
+    {									\
+      while (q)								\
+	{								\
+	  T##_f[3] = S##_f[3] + (q);					\
+	  if (T##_f[3] <= X##_f[3])					\
+	    {								\
+	      S##_f[3] = T##_f[3] + (q);				\
+	      X##_f[3] -= T##_f[3];					\
+	      R##_f[3] += (q);						\
+	    }								\
+	  _FP_FRAC_SLL_4 (X, 1);					\
+	  (q) >>= 1;							\
+	}								\
+      (q) = (_FP_W_TYPE) 1 << (_FP_W_TYPE_SIZE - 1);			\
+      while (q)								\
+	{								\
+	  T##_f[2] = S##_f[2] + (q);					\
+	  T##_f[3] = S##_f[3];						\
+	  if (T##_f[3] < X##_f[3]					\
+	      || (T##_f[3] == X##_f[3] && T##_f[2] <= X##_f[2]))	\
+	    {								\
+	      S##_f[2] = T##_f[2] + (q);				\
+	      S##_f[3] += (T##_f[2] > S##_f[2]);			\
+	      __FP_FRAC_DEC_2 (X##_f[3], X##_f[2],			\
+			       T##_f[3], T##_f[2]);			\
+	      R##_f[2] += (q);						\
+	    }								\
+	  _FP_FRAC_SLL_4 (X, 1);					\
+	  (q) >>= 1;							\
+	}								\
+      (q) = (_FP_W_TYPE) 1 << (_FP_W_TYPE_SIZE - 1);			\
+      while (q)								\
+	{								\
+	  T##_f[1] = S##_f[1] + (q);					\
+	  T##_f[2] = S##_f[2];						\
+	  T##_f[3] = S##_f[3];						\
+	  if (T##_f[3] < X##_f[3]					\
+	      || (T##_f[3] == X##_f[3]					\
+		  && (T##_f[2] < X##_f[2]				\
+		      || (T##_f[2] == X##_f[2]				\
+			  && T##_f[1] <= X##_f[1]))))			\
+	    {								\
+	      S##_f[1] = T##_f[1] + (q);				\
+	      S##_f[2] += (T##_f[1] > S##_f[1]);			\
+	      S##_f[3] += (T##_f[2] > S##_f[2]);			\
+	      __FP_FRAC_DEC_3 (X##_f[3], X##_f[2], X##_f[1],		\
+			       T##_f[3], T##_f[2], T##_f[1]);		\
+	      R##_f[1] += (q);						\
+	    }								\
+	  _FP_FRAC_SLL_4 (X, 1);					\
+	  (q) >>= 1;							\
+	}								\
+      (q) = (_FP_W_TYPE) 1 << (_FP_W_TYPE_SIZE - 1);			\
+      while ((q) != _FP_WORK_ROUND)					\
+	{								\
+	  T##_f[0] = S##_f[0] + (q);					\
+	  T##_f[1] = S##_f[1];						\
+	  T##_f[2] = S##_f[2];						\
+	  T##_f[3] = S##_f[3];						\
+	  if (_FP_FRAC_GE_4 (X, T))					\
+	    {								\
+	      S##_f[0] = T##_f[0] + (q);				\
+	      S##_f[1] += (T##_f[0] > S##_f[0]);			\
+	      S##_f[2] += (T##_f[1] > S##_f[1]);			\
+	      S##_f[3] += (T##_f[2] > S##_f[2]);			\
+	      _FP_FRAC_DEC_4 (X, T);					\
+	      R##_f[0] += (q);						\
+	    }								\
+	  _FP_FRAC_SLL_4 (X, 1);					\
+	  (q) >>= 1;							\
+	}								\
+      if (!_FP_FRAC_ZEROP_4 (X))					\
+	{								\
+	  if (_FP_FRAC_GT_4 (X, S))					\
+	    R##_f[0] |= _FP_WORK_ROUND;					\
+	  R##_f[0] |= _FP_WORK_STICKY;					\
+	}								\
+    }									\
+  while (0)
+
+
+/* Internals.  */
+/* Store four words into X: I3 goes to X_f[3] and I0 to X_f[0].
+   Expands to a single parenthesised comma expression.  */
+#define __FP_FRAC_SET_4(X, I3, I2, I1, I0)			\
+  (X##_f[3] = I3, X##_f[2] = I2, X##_f[1] = I1, X##_f[0] = I0)
+/* Three-word addition (r2:r1:r0) = (x2:x1:x0) + (y2:y1:y0), defined
+   only if nothing defined __FP_FRAC_ADD_3 earlier.  c1 is the carry out
+   of word 0.  c2 is set when x1 + y1 wraps and also when adding c1 to
+   that sum wraps.  A carry out of word 2 is dropped.  r0 and r1 are
+   written before x0 and x1 are re-read for the carry tests, so the r
+   arguments must not name the same objects as the x arguments.  */
+#ifndef __FP_FRAC_ADD_3
+# define __FP_FRAC_ADD_3(r2, r1, r0, x2, x1, x0, y2, y1, y0)	\
+  do								\
+    {								\
+      _FP_W_TYPE __FP_FRAC_ADD_3_c1, __FP_FRAC_ADD_3_c2;	\
+      r0 = x0 + y0;						\
+      __FP_FRAC_ADD_3_c1 = r0 < x0;				\
+      r1 = x1 + y1;						\
+      __FP_FRAC_ADD_3_c2 = r1 < x1;				\
+      r1 += __FP_FRAC_ADD_3_c1;					\
+      __FP_FRAC_ADD_3_c2 |= r1 < __FP_FRAC_ADD_3_c1;		\
+      r2 = x2 + y2 + __FP_FRAC_ADD_3_c2;			\
+    }								\
+  while (0)
+#endif
+/* Four-word addition (r3:r2:r1:r0) = (x3:..:x0) + (y3:..:y0), defined
+   only if nothing defined __FP_FRAC_ADD_4 earlier.  c1, c2 and c3 are
+   the carries out of words 0, 1 and 2; each of c2 and c3 is set either
+   by the word sum wrapping or by the incoming carry making it wrap.  A
+   carry out of word 3 is dropped.  Each r word is written before the
+   matching x word is re-read for the carry test, so the r arguments
+   must not name the same objects as the x arguments.  */
+#ifndef __FP_FRAC_ADD_4
+# define __FP_FRAC_ADD_4(r3, r2, r1, r0, x3, x2, x1, x0, y3, y2, y1, y0) \
+  do									\
+    {									\
+      _FP_W_TYPE __FP_FRAC_ADD_4_c1, __FP_FRAC_ADD_4_c2;		\
+      _FP_W_TYPE __FP_FRAC_ADD_4_c3;					\
+      r0 = x0 + y0;							\
+      __FP_FRAC_ADD_4_c1 = r0 < x0;					\
+      r1 = x1 + y1;							\
+      __FP_FRAC_ADD_4_c2 = r1 < x1;					\
+      r1 += __FP_FRAC_ADD_4_c1;						\
+      __FP_FRAC_ADD_4_c2 |= r1 < __FP_FRAC_ADD_4_c1;			\
+      r2 = x2 + y2;							\
+      __FP_FRAC_ADD_4_c3 = r2 < x2;					\
+      r2 += __FP_FRAC_ADD_4_c2;						\
+      __FP_FRAC_ADD_4_c3 |= r2 < __FP_FRAC_ADD_4_c2;			\
+      r3 = x3 + y3 + __FP_FRAC_ADD_4_c3;				\
+    }									\
+  while (0)
+#endif
+/* Three-word subtraction (r2:r1:r0) = (x2:x1:x0) - (y2:y1:y0), defined
+   only if nothing defined __FP_FRAC_SUB_3 earlier.  c1 is the borrow
+   out of word 0.  c2 is set when x1 - y1 wraps, and also when c1 is set
+   and x1 == y1 (subtracting the borrow from a zero difference wraps).
+   The two low result words are staged in tmp[] and no r argument is
+   assigned until every x and y word has been read, so the r arguments
+   may name the same objects as the inputs.  A borrow out of word 2 is
+   dropped.  */
+#ifndef __FP_FRAC_SUB_3
+# define __FP_FRAC_SUB_3(r2, r1, r0, x2, x1, x0, y2, y1, y0)    \
+  do                                                            \
+    {                                                           \
+      _FP_W_TYPE __FP_FRAC_SUB_3_tmp[2];                        \
+      _FP_W_TYPE __FP_FRAC_SUB_3_c1, __FP_FRAC_SUB_3_c2;        \
+      __FP_FRAC_SUB_3_tmp[0] = x0 - y0;                         \
+      __FP_FRAC_SUB_3_c1 = __FP_FRAC_SUB_3_tmp[0] > x0;         \
+      __FP_FRAC_SUB_3_tmp[1] = x1 - y1;                         \
+      __FP_FRAC_SUB_3_c2 = __FP_FRAC_SUB_3_tmp[1] > x1;         \
+      __FP_FRAC_SUB_3_tmp[1] -= __FP_FRAC_SUB_3_c1;             \
+      __FP_FRAC_SUB_3_c2 |= __FP_FRAC_SUB_3_c1 && (y1 == x1);   \
+      r2 = x2 - y2 - __FP_FRAC_SUB_3_c2;                        \
+      r1 = __FP_FRAC_SUB_3_tmp[1];                              \
+      r0 = __FP_FRAC_SUB_3_tmp[0];                              \
+    }                                                           \
+  while (0)
+#endif
+/* Four-word subtraction (r3:r2:r1:r0) = (x3:..:x0) - (y3:..:y0),
+   defined only if nothing defined __FP_FRAC_SUB_4 earlier.  c1, c2 and
+   c3 are the borrows out of words 0, 1 and 2; c2 and c3 are also set
+   when the incoming borrow is set and the two words are equal.  The
+   three low result words are staged in tmp[] and no r argument is
+   assigned until every x and y word has been read, so the r arguments
+   may name the same objects as the inputs.  A borrow out of word 3 is
+   dropped.  */
+#ifndef __FP_FRAC_SUB_4
+# define __FP_FRAC_SUB_4(r3, r2, r1, r0, x3, x2, x1, x0, y3, y2, y1, y0) \
+  do                                                                     \
+    {                                                                    \
+      _FP_W_TYPE __FP_FRAC_SUB_4_tmp[3];                                 \
+      _FP_W_TYPE __FP_FRAC_SUB_4_c1, __FP_FRAC_SUB_4_c2;                 \
+      _FP_W_TYPE __FP_FRAC_SUB_4_c3;                                     \
+      __FP_FRAC_SUB_4_tmp[0] = x0 - y0;                                  \
+      __FP_FRAC_SUB_4_c1 = __FP_FRAC_SUB_4_tmp[0] > x0;                  \
+      __FP_FRAC_SUB_4_tmp[1] = x1 - y1;                                  \
+      __FP_FRAC_SUB_4_c2 = __FP_FRAC_SUB_4_tmp[1] > x1;                  \
+      __FP_FRAC_SUB_4_tmp[1] -= __FP_FRAC_SUB_4_c1;                      \
+      __FP_FRAC_SUB_4_c2 |= __FP_FRAC_SUB_4_c1 && (y1 == x1);            \
+      __FP_FRAC_SUB_4_tmp[2] = x2 - y2;                                  \
+      __FP_FRAC_SUB_4_c3 = __FP_FRAC_SUB_4_tmp[2] > x2;                  \
+      __FP_FRAC_SUB_4_tmp[2] -= __FP_FRAC_SUB_4_c2;                      \
+      __FP_FRAC_SUB_4_c3 |= __FP_FRAC_SUB_4_c2 && (y2 == x2);            \
+      r3 = x3 - y3 - __FP_FRAC_SUB_4_c3;                                 \
+      r2 = __FP_FRAC_SUB_4_tmp[2];                                       \
+      r1 = __FP_FRAC_SUB_4_tmp[1];                                       \
+      r0 = __FP_FRAC_SUB_4_tmp[0];                                       \
+    }                                                                    \
+  while (0)
+#endif
+/* In-place three-word decrement (x2:x1:x0) -= (y2:y1:y0), defined only
+   if nothing defined __FP_FRAC_DEC_3 earlier.  The x words are copied
+   into temporaries, which are then passed to __FP_FRAC_SUB_3 as the
+   minuend with x itself as the destination.  The temporaries are
+   declared as UWtype, which is not defined in this part of the file;
+   presumably it has the same width as _FP_W_TYPE -- confirm.  */
+#ifndef __FP_FRAC_DEC_3
+# define __FP_FRAC_DEC_3(x2, x1, x0, y2, y1, y0)		\
+  do								\
+    {								\
+      UWtype __FP_FRAC_DEC_3_t0, __FP_FRAC_DEC_3_t1;		\
+      UWtype __FP_FRAC_DEC_3_t2;				\
+      __FP_FRAC_DEC_3_t0 = x0;					\
+      __FP_FRAC_DEC_3_t1 = x1;					\
+      __FP_FRAC_DEC_3_t2 = x2;					\
+      __FP_FRAC_SUB_3 (x2, x1, x0, __FP_FRAC_DEC_3_t2,		\
+		       __FP_FRAC_DEC_3_t1, __FP_FRAC_DEC_3_t0,	\
+		       y2, y1, y0);				\
+    }								\
+  while (0)
+#endif
+/* In-place four-word decrement (x3:..:x0) -= (y3:..:y0), defined only
+   if nothing defined __FP_FRAC_DEC_4 earlier.  The x words are copied
+   into temporaries, which are then passed to __FP_FRAC_SUB_4 as the
+   minuend with x itself as the destination.  The temporaries are
+   declared as UWtype, which is not defined in this part of the file;
+   presumably it has the same width as _FP_W_TYPE -- confirm.  */
+#ifndef __FP_FRAC_DEC_4
+# define __FP_FRAC_DEC_4(x3, x2, x1, x0, y3, y2, y1, y0)	\
+  do								\
+    {								\
+      UWtype __FP_FRAC_DEC_4_t0, __FP_FRAC_DEC_4_t1;		\
+      UWtype __FP_FRAC_DEC_4_t2, __FP_FRAC_DEC_4_t3;		\
+      __FP_FRAC_DEC_4_t0 = x0;					\
+      __FP_FRAC_DEC_4_t1 = x1;					\
+      __FP_FRAC_DEC_4_t2 = x2;					\
+      __FP_FRAC_DEC_4_t3 = x3;					\
+      __FP_FRAC_SUB_4 (x3, x2, x1, x0, __FP_FRAC_DEC_4_t3,	\
+		       __FP_FRAC_DEC_4_t2, __FP_FRAC_DEC_4_t1,	\
+		       __FP_FRAC_DEC_4_t0, y3, y2, y1, y0);	\
+    }								\
+  while (0)
+#endif
+/* Add the single word i to the four-word value (x3:x2:x1:x0) in place,
+   defined only if nothing defined __FP_FRAC_ADDI_4 earlier.  The
+   temporary holds the carry out of each word in turn and is added to
+   the next word up; a carry out of x3 is dropped.  i appears twice in
+   the expansion, so it is evaluated twice.  */
+#ifndef __FP_FRAC_ADDI_4
+# define __FP_FRAC_ADDI_4(x3, x2, x1, x0, i)		\
+  do							\
+    {							\
+      UWtype __FP_FRAC_ADDI_4_t;			\
+      __FP_FRAC_ADDI_4_t = ((x0 += i) < i);		\
+      x1 += __FP_FRAC_ADDI_4_t;				\
+      __FP_FRAC_ADDI_4_t = (x1 < __FP_FRAC_ADDI_4_t);	\
+      x2 += __FP_FRAC_ADDI_4_t;				\
+      __FP_FRAC_ADDI_4_t = (x2 < __FP_FRAC_ADDI_4_t);	\
+      x3 += __FP_FRAC_ADDI_4_t;				\
+    }							\
+  while (0)
+#endif
+
+/* Convert FP values between word sizes. This appears to be more
+   complicated than I'd have expected it to be, so these might be
+   wrong... These macros are in any case somewhat bogus because they
+   use information about what various FRAC_n variables look like
+   internally [eg, that 2 word vars are X_f0 and X_f1]. But so do
+   the ones in op-2.h and op-1.h.
+
+   _FP_FRAC_COPY_1_4 copies only word 0 of the four-word S into the
+   one-word D; words 1 to 3 of S are ignored.  */
+#define _FP_FRAC_COPY_1_4(D, S)		(D##_f = S##_f[0])
+/* Copy words 0 and 1 of the four-word S into the two-word D (D_f0 and
+   D_f1); words 2 and 3 of S are ignored.  */
+#define _FP_FRAC_COPY_2_4(D, S)			\
+  do						\
+    {						\
+      D##_f0 = S##_f[0];			\
+      D##_f1 = S##_f[1];			\
+    }						\
+  while (0)
+
+/* Assembly/disassembly for converting to/from integral types.
+   No shifting or overflow handled here.  */
+/* Put the FP value X into r, which is an integer of size rsize.
+   For rsize up to one word r gets X_f[0]; up to two words it gets
+   X_f[1]:X_f[0]; for anything larger all four words are combined,
+   most significant (X_f[3]) first.  The repeated
+   "(rsize) <= _FP_W_TYPE_SIZE ? 0 : ..." tests are always false inside
+   the branches where they appear; presumably they exist so the shift
+   by a full word is not seen unguarded when r is only one word wide
+   -- confirm.  */
+#define _FP_FRAC_ASSEMBLE_4(r, X, rsize)				\
+  do									\
+    {									\
+      if ((rsize) <= _FP_W_TYPE_SIZE)					\
+	(r) = X##_f[0];							\
+	else if ((rsize) <= 2*_FP_W_TYPE_SIZE)				\
+	{								\
+	  (r) = X##_f[1];						\
+	  (r) = ((rsize) <= _FP_W_TYPE_SIZE				\
+		 ? 0							\
+		 : (r) << _FP_W_TYPE_SIZE);				\
+	  (r) += X##_f[0];						\
+	}								\
+      else								\
+	{								\
+	  /* I'm feeling lazy so we deal with int == 3words		\
+	     (implausible) and int == 4words as a single case.  */	\
+	  (r) = X##_f[3];						\
+	  (r) = ((rsize) <= _FP_W_TYPE_SIZE				\
+		 ? 0							\
+		 : (r) << _FP_W_TYPE_SIZE);				\
+	  (r) += X##_f[2];						\
+	  (r) = ((rsize) <= _FP_W_TYPE_SIZE				\
+		 ? 0							\
+		 : (r) << _FP_W_TYPE_SIZE);				\
+	  (r) += X##_f[1];						\
+	  (r) = ((rsize) <= _FP_W_TYPE_SIZE				\
+		 ? 0							\
+		 : (r) << _FP_W_TYPE_SIZE);				\
+	  (r) += X##_f[0];						\
+	}								\
+    }									\
+  while (0)
+
+/* "No disassemble Number Five!" */
+/* Move an integer of size rsize into X's fractional part. We rely on
+   the _f[] array consisting of words of size _FP_W_TYPE_SIZE to avoid
+   having to mask the values we store into it.  Word k of X (k = 1..3)
+   is set to zero when rsize is at most k words; the shift by k words is
+   only evaluated when rsize is larger than that.  r appears several
+   times in the expansion.  */
+#define _FP_FRAC_DISASSEMBLE_4(X, r, rsize)	\
+  do						\
+    {						\
+      X##_f[0] = (r);				\
+      X##_f[1] = ((rsize) <= _FP_W_TYPE_SIZE	\
+		  ? 0				\
+		  : (r) >> _FP_W_TYPE_SIZE);	\
+      X##_f[2] = ((rsize) <= 2*_FP_W_TYPE_SIZE	\
+		  ? 0				\
+		  : (r) >> 2*_FP_W_TYPE_SIZE);	\
+      X##_f[3] = ((rsize) <= 3*_FP_W_TYPE_SIZE	\
+		  ? 0				\
+		  : (r) >> 3*_FP_W_TYPE_SIZE);	\
+    }						\
+  while (0)
+/* Widen the one-word S into the four-word D: word 0 takes S_f and
+   words 1 to 3 are cleared.  */
+#define _FP_FRAC_COPY_4_1(D, S)			\
+  do						\
+    {						\
+      D##_f[0] = S##_f;				\
+      D##_f[1] = D##_f[2] = D##_f[3] = 0;	\
+    }						\
+  while (0)
+/* Widen the two-word S into the four-word D: words 0 and 1 take S_f0
+   and S_f1, words 2 and 3 are cleared.  */
+#define _FP_FRAC_COPY_4_2(D, S)			\
+  do						\
+    {						\
+      D##_f[0] = S##_f0;			\
+      D##_f[1] = S##_f1;			\
+      D##_f[2] = D##_f[3] = 0;			\
+    }						\
+  while (0)
+/* Same-size copy; simply forwards to _FP_FRAC_COPY_4.  */
+#define _FP_FRAC_COPY_4_4(D, S)	_FP_FRAC_COPY_4 (D, S)
+
+#endif /* !SOFT_FP_OP_4_H */
--- a/src/linpack/soft-fp/op-8.h
+++ b/src/linpack/soft-fp/op-8.h
@ -0,0 +1,238 @@
+/* Software floating-point emulation.
+   Basic eight-word fraction declaration and manipulation.
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com),
+		  Jakub Jelinek (jj@ultra.linux.cz) and
+		  Peter Maydell (pmaydell@chiark.greenend.org.uk).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef SOFT_FP_OP_8_H
+#define SOFT_FP_OP_8_H	1
+
+/* We need just a few things from here for op-4, if we ever need some
+   other macros, they can be added.
+
+   An eight-word fraction X is the array X_f[8]; HIGH is word 7 and LOW
+   is word 0.  _FP_FRAC_SET_8 forwards its single I argument to
+   __FP_FRAC_SET_8, which takes eight word values, so I is expected to
+   be a macro such as _FP_MINFRAC_8 that expands to eight
+   comma-separated values.  */
+#define _FP_FRAC_DECL_8(X)	_FP_W_TYPE X##_f[8]
+#define _FP_FRAC_SET_8(X, I)    __FP_FRAC_SET_8 (X, I)
+#define _FP_FRAC_HIGH_8(X)	(X##_f[7])
+#define _FP_FRAC_LOW_8(X)	(X##_f[0])
+#define _FP_FRAC_WORD_8(X, w)	(X##_f[w])
+/* Shift the eight-word X left by N bits in place.  skip is the number
+   of whole words in N and up the remaining bit count.  When up is zero
+   words are moved whole (down would equal the word size and is not used
+   as a shift count); otherwise each destination word is built from two
+   neighbouring source words.  The final loop clears the vacated low
+   words and ends when the index goes below zero, so _FP_I_TYPE has to
+   be a signed type.  NOTE(review): with a nonzero bit remainder the
+   word at index 7 is still written from X_f[0] when skip is 8 or more,
+   so N is presumably always below 8 * _FP_W_TYPE_SIZE -- confirm
+   against callers.  */
+#define _FP_FRAC_SLL_8(X, N)						\
+  do									\
+    {									\
+      _FP_I_TYPE _FP_FRAC_SLL_8_up, _FP_FRAC_SLL_8_down;		\
+      _FP_I_TYPE _FP_FRAC_SLL_8_skip, _FP_FRAC_SLL_8_i;			\
+      _FP_FRAC_SLL_8_skip = (N) / _FP_W_TYPE_SIZE;			\
+      _FP_FRAC_SLL_8_up = (N) % _FP_W_TYPE_SIZE;			\
+      _FP_FRAC_SLL_8_down = _FP_W_TYPE_SIZE - _FP_FRAC_SLL_8_up;	\
+      if (!_FP_FRAC_SLL_8_up)						\
+	for (_FP_FRAC_SLL_8_i = 7;					\
+	     _FP_FRAC_SLL_8_i >= _FP_FRAC_SLL_8_skip;			\
+	     --_FP_FRAC_SLL_8_i)					\
+	  X##_f[_FP_FRAC_SLL_8_i]					\
+	    = X##_f[_FP_FRAC_SLL_8_i-_FP_FRAC_SLL_8_skip];		\
+      else								\
+	{								\
+	  for (_FP_FRAC_SLL_8_i = 7;					\
+	       _FP_FRAC_SLL_8_i > _FP_FRAC_SLL_8_skip;			\
+	       --_FP_FRAC_SLL_8_i)					\
+	    X##_f[_FP_FRAC_SLL_8_i]					\
+	      = ((X##_f[_FP_FRAC_SLL_8_i-_FP_FRAC_SLL_8_skip]		\
+		  << _FP_FRAC_SLL_8_up)					\
+		 | (X##_f[_FP_FRAC_SLL_8_i-_FP_FRAC_SLL_8_skip-1]	\
+		    >> _FP_FRAC_SLL_8_down));				\
+	  X##_f[_FP_FRAC_SLL_8_i--] = X##_f[0] << _FP_FRAC_SLL_8_up;	\
+	}								\
+      for (; _FP_FRAC_SLL_8_i >= 0; --_FP_FRAC_SLL_8_i)			\
+	X##_f[_FP_FRAC_SLL_8_i] = 0;					\
+    }									\
+  while (0)
+/* Shift the eight-word X right by N bits in place, discarding the bits
+   shifted out.  skip is the number of whole words in N and down the
+   remaining bit count.  When down is zero words are moved whole (up
+   would equal the word size and is not used as a shift count);
+   otherwise each destination word is built from two neighbouring source
+   words.  The final loop clears the vacated high words.
+   NOTE(review): with a nonzero bit remainder word 0 is still written
+   from X_f[7] when skip is 8 or more, so N is presumably always below
+   8 * _FP_W_TYPE_SIZE -- confirm against callers.  */
+#define _FP_FRAC_SRL_8(X, N)						\
+  do									\
+    {									\
+      _FP_I_TYPE _FP_FRAC_SRL_8_up, _FP_FRAC_SRL_8_down;		\
+      _FP_I_TYPE _FP_FRAC_SRL_8_skip, _FP_FRAC_SRL_8_i;			\
+      _FP_FRAC_SRL_8_skip = (N) / _FP_W_TYPE_SIZE;			\
+      _FP_FRAC_SRL_8_down = (N) % _FP_W_TYPE_SIZE;			\
+      _FP_FRAC_SRL_8_up = _FP_W_TYPE_SIZE - _FP_FRAC_SRL_8_down;	\
+      if (!_FP_FRAC_SRL_8_down)						\
+	for (_FP_FRAC_SRL_8_i = 0;					\
+	     _FP_FRAC_SRL_8_i <= 7-_FP_FRAC_SRL_8_skip;			\
+	     ++_FP_FRAC_SRL_8_i)					\
+	  X##_f[_FP_FRAC_SRL_8_i]					\
+	    = X##_f[_FP_FRAC_SRL_8_i+_FP_FRAC_SRL_8_skip];		\
+      else								\
+	{								\
+	  for (_FP_FRAC_SRL_8_i = 0;					\
+	       _FP_FRAC_SRL_8_i < 7-_FP_FRAC_SRL_8_skip;		\
+	       ++_FP_FRAC_SRL_8_i)					\
+	    X##_f[_FP_FRAC_SRL_8_i]					\
+	      = ((X##_f[_FP_FRAC_SRL_8_i+_FP_FRAC_SRL_8_skip]		\
+		  >> _FP_FRAC_SRL_8_down)				\
+		 | (X##_f[_FP_FRAC_SRL_8_i+_FP_FRAC_SRL_8_skip+1]	\
+		    << _FP_FRAC_SRL_8_up));				\
+	  X##_f[_FP_FRAC_SRL_8_i++] = X##_f[7] >> _FP_FRAC_SRL_8_down;	\
+	}								\
+      for (; _FP_FRAC_SRL_8_i < 8; ++_FP_FRAC_SRL_8_i)			\
+	X##_f[_FP_FRAC_SRL_8_i] = 0;					\
+    }									\
+  while (0)
+
+
+/* Right shift with sticky-lsb.
+   What this actually means is that we do a standard right-shift,
+   but that if any of the bits that fall off the right hand side
+   were one then we always set the LSbit.
+
+   The accumulator s first ORs together the skip whole words that are
+   dropped; when the bit remainder is nonzero it also ORs in the low
+   bits of word skip that fall off.  The shift itself is the same as
+   _FP_FRAC_SRL_8.  The size argument is not used by this
+   implementation.  */
+#define _FP_FRAC_SRS_8(X, N, size)					\
+  do									\
+    {									\
+      _FP_I_TYPE _FP_FRAC_SRS_8_up, _FP_FRAC_SRS_8_down;		\
+      _FP_I_TYPE _FP_FRAC_SRS_8_skip, _FP_FRAC_SRS_8_i;			\
+      _FP_W_TYPE _FP_FRAC_SRS_8_s;					\
+      _FP_FRAC_SRS_8_skip = (N) / _FP_W_TYPE_SIZE;			\
+      _FP_FRAC_SRS_8_down = (N) % _FP_W_TYPE_SIZE;			\
+      _FP_FRAC_SRS_8_up = _FP_W_TYPE_SIZE - _FP_FRAC_SRS_8_down;	\
+      for (_FP_FRAC_SRS_8_s = _FP_FRAC_SRS_8_i = 0;			\
+	   _FP_FRAC_SRS_8_i < _FP_FRAC_SRS_8_skip;			\
+	   ++_FP_FRAC_SRS_8_i)						\
+	_FP_FRAC_SRS_8_s |= X##_f[_FP_FRAC_SRS_8_i];			\
+      if (!_FP_FRAC_SRS_8_down)						\
+	for (_FP_FRAC_SRS_8_i = 0;					\
+	     _FP_FRAC_SRS_8_i <= 7-_FP_FRAC_SRS_8_skip;			\
+	     ++_FP_FRAC_SRS_8_i)					\
+	  X##_f[_FP_FRAC_SRS_8_i]					\
+	    = X##_f[_FP_FRAC_SRS_8_i+_FP_FRAC_SRS_8_skip];		\
+      else								\
+	{								\
+	  _FP_FRAC_SRS_8_s						\
+	    |= X##_f[_FP_FRAC_SRS_8_i] << _FP_FRAC_SRS_8_up;		\
+	  for (_FP_FRAC_SRS_8_i = 0;					\
+	       _FP_FRAC_SRS_8_i < 7-_FP_FRAC_SRS_8_skip;		\
+	       ++_FP_FRAC_SRS_8_i)					\
+	    X##_f[_FP_FRAC_SRS_8_i]					\
+	      = ((X##_f[_FP_FRAC_SRS_8_i+_FP_FRAC_SRS_8_skip]		\
+		  >> _FP_FRAC_SRS_8_down)				\
+		 | (X##_f[_FP_FRAC_SRS_8_i+_FP_FRAC_SRS_8_skip+1]	\
+		    << _FP_FRAC_SRS_8_up));				\
+	  X##_f[_FP_FRAC_SRS_8_i++] = X##_f[7] >> _FP_FRAC_SRS_8_down;	\
+	}								\
+      for (; _FP_FRAC_SRS_8_i < 8; ++_FP_FRAC_SRS_8_i)			\
+	X##_f[_FP_FRAC_SRS_8_i] = 0;					\
+      /* Don't fix the LSB until the very end when we're sure f[0] is	\
+	 stable.  */							\
+      X##_f[0] |= (_FP_FRAC_SRS_8_s != 0);				\
+    }									\
+  while (0)
+/* Eight-word addition R = X + Y, from word 0 upwards.  c is the carry
+   into the current word.  After a word is summed, the carry out is
+   detected by comparing the result with the X word: "<=" when a carry
+   came in, "<" otherwise.  Each R word is written before the matching
+   X word is re-read for that comparison, so R must not be the same
+   object as X.  A carry out of word 7 is dropped.  */
+#define _FP_FRAC_ADD_8(R, X, Y)                                             \
+  do                                                                        \
+    {                                                                       \
+      _FP_W_TYPE _FP_FRAC_ADD_8_c = 0;                                      \
+      _FP_I_TYPE _FP_FRAC_ADD_8_i;                                          \
+      for (_FP_FRAC_ADD_8_i = 0; _FP_FRAC_ADD_8_i < 8; ++_FP_FRAC_ADD_8_i)  \
+        {                                                                   \
+          R##_f[_FP_FRAC_ADD_8_i]                                           \
+            = (X##_f[_FP_FRAC_ADD_8_i] + Y##_f[_FP_FRAC_ADD_8_i]            \
+               + _FP_FRAC_ADD_8_c);                                         \
+          _FP_FRAC_ADD_8_c                                                  \
+            = (_FP_FRAC_ADD_8_c                                             \
+               ? R##_f[_FP_FRAC_ADD_8_i] <= X##_f[_FP_FRAC_ADD_8_i]         \
+               : R##_f[_FP_FRAC_ADD_8_i] < X##_f[_FP_FRAC_ADD_8_i]);        \
+        }                                                                   \
+    }                                                                       \
+  while (0)
+/* Eight-word subtraction R = X - Y, from word 0 upwards.  c is the
+   borrow into the current word.  The borrow out is detected by
+   comparing the difference with the X word: ">=" when a borrow came in,
+   ">" otherwise.  All eight result words are staged in tmp[] and copied
+   to R only after the whole subtraction, so R may be the same object as
+   X or Y.  A borrow out of word 7 is dropped.  */
+#define _FP_FRAC_SUB_8(R, X, Y)                                             \
+  do                                                                        \
+    {                                                                       \
+      _FP_W_TYPE _FP_FRAC_SUB_8_tmp[8];                                     \
+      _FP_W_TYPE _FP_FRAC_SUB_8_c = 0;                                      \
+      _FP_I_TYPE _FP_FRAC_SUB_8_i;                                          \
+      for (_FP_FRAC_SUB_8_i = 0; _FP_FRAC_SUB_8_i < 8; ++_FP_FRAC_SUB_8_i)  \
+        {                                                                   \
+          _FP_FRAC_SUB_8_tmp[_FP_FRAC_SUB_8_i]                              \
+            = (X##_f[_FP_FRAC_SUB_8_i] - Y##_f[_FP_FRAC_SUB_8_i]            \
+               - _FP_FRAC_SUB_8_c);                                         \
+          _FP_FRAC_SUB_8_c                                                  \
+            = (_FP_FRAC_SUB_8_c                                             \
+               ? (_FP_FRAC_SUB_8_tmp[_FP_FRAC_SUB_8_i]                      \
+                  >= X##_f[_FP_FRAC_SUB_8_i])                               \
+               : (_FP_FRAC_SUB_8_tmp[_FP_FRAC_SUB_8_i]                      \
+                  > X##_f[_FP_FRAC_SUB_8_i]));                              \
+        }                                                                   \
+      for (_FP_FRAC_SUB_8_i = 0; _FP_FRAC_SUB_8_i < 8; ++_FP_FRAC_SUB_8_i)  \
+        R##_f[_FP_FRAC_SUB_8_i] = _FP_FRAC_SUB_8_tmp[_FP_FRAC_SUB_8_i];     \
+    }                                                                       \
+  while (0)
+/* Count leading zero bits of the eight-word X into R.  The loop walks
+   down from word 7 and stops at the first nonzero word, or at word 0
+   whatever its value.  __FP_CLZ is applied to that word and one word
+   size is added for every higher word that was skipped.  When X is
+   entirely zero __FP_CLZ is applied to a zero word; what it returns
+   then depends on __FP_CLZ, which is not defined in this file.  */
+#define _FP_FRAC_CLZ_8(R, X)                                                \
+  do                                                                        \
+    {                                                                       \
+      _FP_I_TYPE _FP_FRAC_CLZ_8_i;                                          \
+      for (_FP_FRAC_CLZ_8_i = 7; _FP_FRAC_CLZ_8_i > 0; _FP_FRAC_CLZ_8_i--)  \
+        if (X##_f[_FP_FRAC_CLZ_8_i])                                        \
+          break;                                                            \
+      __FP_CLZ ((R), X##_f[_FP_FRAC_CLZ_8_i]);                              \
+      (R) += _FP_W_TYPE_SIZE * (7 - _FP_FRAC_CLZ_8_i);                      \
+    }                                                                       \
+  while (0)
+/* _FP_MINFRAC_8 is an eight-value list, in the I7..I0 order taken by
+   __FP_FRAC_SET_8, whose only nonzero entry is the last (word 0).
+   _FP_FRAC_NEGP_8 tests the sign of word 7 viewed as _FP_WS_TYPE.
+   _FP_FRAC_ZEROP_8 is true when all eight words are zero.
+   _FP_FRAC_HIGHBIT_DW_8 masks the format's double-width high word
+   (_FP_FRAC_HIGH_DW_<fs>) with _FP_HIGHBIT_DW_<fs>.  */
+#define _FP_MINFRAC_8   0, 0, 0, 0, 0, 0, 0, 1
+
+#define _FP_FRAC_NEGP_8(X)      ((_FP_WS_TYPE) X##_f[7] < 0)
+#define _FP_FRAC_ZEROP_8(X)                                             \
+  ((X##_f[0] | X##_f[1] | X##_f[2] | X##_f[3]                           \
+    | X##_f[4] | X##_f[5] | X##_f[6] | X##_f[7]) == 0)
+#define _FP_FRAC_HIGHBIT_DW_8(fs, X)                                    \
+  (_FP_FRAC_HIGH_DW_##fs (X) & _FP_HIGHBIT_DW_##fs)
+/* Narrow the eight-word S into the four-word D by copying words 0 to
+   3; words 4 to 7 of S are ignored.  */
+#define _FP_FRAC_COPY_4_8(D, S)                           \
+  do                                                      \
+    {                                                     \
+      D##_f[0] = S##_f[0];                                \
+      D##_f[1] = S##_f[1];                                \
+      D##_f[2] = S##_f[2];                                \
+      D##_f[3] = S##_f[3];                                \
+    }                                                     \
+  while (0)
+/* Widen the four-word S into the eight-word D: words 0 to 3 are
+   copied and words 4 to 7 are cleared.  */
+#define _FP_FRAC_COPY_8_4(D, S)                           \
+  do                                                      \
+    {                                                     \
+      D##_f[0] = S##_f[0];                                \
+      D##_f[1] = S##_f[1];                                \
+      D##_f[2] = S##_f[2];                                \
+      D##_f[3] = S##_f[3];                                \
+      D##_f[4] = D##_f[5] = D##_f[6] = D##_f[7]= 0;       \
+    }                                                     \
+  while (0)
+/* Store eight words into X: I7 goes to X_f[7] and I0 to X_f[0].
+   Expands to a single parenthesised comma expression.  */
+#define __FP_FRAC_SET_8(X, I7, I6, I5, I4, I3, I2, I1, I0)             \
+  (X##_f[7] = I7, X##_f[6] = I6, X##_f[5] = I5, X##_f[4] = I4,         \
+   X##_f[3] = I3, X##_f[2] = I2, X##_f[1] = I1, X##_f[0] = I0)
+
+#endif /* !SOFT_FP_OP_8_H */
--- a/src/linpack/soft-fp/op-common.h
+++ b/src/linpack/soft-fp/op-common.h
--- a/src/linpack/soft-fp/sfp-machine.h
+++ b/src/linpack/soft-fp/sfp-machine.h
@ -0,0 +1,117 @@
+/* Word-size dependent configuration.  When __riscv_xlen is 32 the
+   fraction word is declared as unsigned long with _FP_W_TYPE_SIZE 32;
+   double then uses the two-word and quad the four-word multiply/divide
+   variants, and their NaN fractions list two and four words.  Every
+   other case, including __riscv_xlen being undefined (an undefined
+   name is 0 in #if), takes the #else branch: a 64-bit unsigned long
+   long word, one word for double and two for quad.  */
+#if __riscv_xlen == 32
+
+#define _FP_W_TYPE_SIZE		32
+#define _FP_W_TYPE		unsigned long
+#define _FP_WS_TYPE		signed long
+#define _FP_I_TYPE		long
+
+#define _FP_MUL_MEAT_S(R,X,Y)				\
+  _FP_MUL_MEAT_1_wide(_FP_WFRACBITS_S,R,X,Y,umul_ppmm)
+#define _FP_MUL_MEAT_D(R,X,Y)				\
+  _FP_MUL_MEAT_2_wide(_FP_WFRACBITS_D,R,X,Y,umul_ppmm)
+#define _FP_MUL_MEAT_Q(R,X,Y)				\
+  _FP_MUL_MEAT_4_wide(_FP_WFRACBITS_Q,R,X,Y,umul_ppmm)
+
+#define _FP_DIV_MEAT_S(R,X,Y)	_FP_DIV_MEAT_1_udiv_norm(S,R,X,Y)
+#define _FP_DIV_MEAT_D(R,X,Y)	_FP_DIV_MEAT_2_udiv(D,R,X,Y)
+#define _FP_DIV_MEAT_Q(R,X,Y)	_FP_DIV_MEAT_4_udiv(Q,R,X,Y)
+
+#define _FP_NANFRAC_S		_FP_QNANBIT_S
+#define _FP_NANFRAC_D		_FP_QNANBIT_D, 0
+#define _FP_NANFRAC_Q		_FP_QNANBIT_Q, 0, 0, 0
+
+#else
+
+#define _FP_W_TYPE_SIZE		64
+#define _FP_W_TYPE		unsigned long long
+#define _FP_WS_TYPE		signed long long
+#define _FP_I_TYPE		long long
+
+#define _FP_MUL_MEAT_S(R,X,Y)					\
+  _FP_MUL_MEAT_1_imm(_FP_WFRACBITS_S,R,X,Y)
+#define _FP_MUL_MEAT_D(R,X,Y)					\
+  _FP_MUL_MEAT_1_wide(_FP_WFRACBITS_D,R,X,Y,umul_ppmm)
+#define _FP_MUL_MEAT_Q(R,X,Y)					\
+  _FP_MUL_MEAT_2_wide_3mul(_FP_WFRACBITS_Q,R,X,Y,umul_ppmm)
+
+#define _FP_DIV_MEAT_S(R,X,Y)	_FP_DIV_MEAT_1_imm(S,R,X,Y,_FP_DIV_HELP_imm)
+#define _FP_DIV_MEAT_D(R,X,Y)	_FP_DIV_MEAT_1_udiv_norm(D,R,X,Y)
+#define _FP_DIV_MEAT_Q(R,X,Y)	_FP_DIV_MEAT_2_udiv(Q,R,X,Y)
+
+#define _FP_NANFRAC_S		_FP_QNANBIT_S
+#define _FP_NANFRAC_D		_FP_QNANBIT_D
+#define _FP_NANFRAC_Q		_FP_QNANBIT_Q, 0
+
+#endif
+/* Signed and unsigned integer types in GCC machine mode TI, declared
+   only when __riscv_xlen is 64.  TI_BITS is the size of TItype in
+   bits, computed from sizeof.  */
+#if __riscv_xlen == 64
+typedef int TItype __attribute__ ((mode (TI)));
+typedef unsigned int UTItype __attribute__ ((mode (TI)));
+#define TI_BITS (__CHAR_BIT__ * (int)sizeof(TItype))
+#endif
+
+/* The type of the result of a floating point comparison.  This must
+   match __libgcc_cmp_return__ in GCC for the target.  The typedef takes
+   its width from that machine mode; CMPtype is a macro alias for the
+   typedef name.  */
+typedef int __gcc_CMPtype __attribute__ ((mode (__libgcc_cmp_return__)));
+#define CMPtype __gcc_CMPtype
+/* NaN policy.  Generated NaNs have sign 0 for every format.
+   _FP_CHOOSENAN does not look at X, Y or OP: it always sets R to the
+   default NaN (sign _FP_NANSIGN_<fs>, fraction _FP_NANFRAC_<fs>, class
+   FP_CLS_NAN).  _FP_KEEPNANFRACP and _FP_QNANNEGATEDP are both 0; they
+   are consumed outside this file.  */
+#define _FP_NANSIGN_S		0
+#define _FP_NANSIGN_D		0
+#define _FP_NANSIGN_Q		0
+
+#define _FP_KEEPNANFRACP 0
+#define _FP_QNANNEGATEDP 0
+
+#define _FP_CHOOSENAN(fs, wc, R, X, Y, OP)	\
+  do {						\
+    R##_s = _FP_NANSIGN_##fs;			\
+    _FP_FRAC_SET_##wc(R,_FP_NANFRAC_##fs);	\
+    R##_c = FP_CLS_NAN;				\
+  } while (0)
+/* Rounding mode and exception flags.  _FP_DECL_EX declares the local
+   _frm (its expansion already ends in a semicolon) and FP_ROUNDMODE
+   reads it.  With __riscv_flen defined, FP_INIT_ROUNDMODE loads _frm
+   with the "frrm" instruction and FP_HANDLE_EXCEPTIONS sets the bits of
+   _fex in the fflags CSR when _fex is nonzero; the FP_RND_* and FP_EX_*
+   values are therefore presumably the RISC-V frm and fflags encodings
+   -- confirm against the ISA manual.  Without __riscv_flen the mode is
+   fixed to FP_RND_NEAREST and FP_HANDLE_EXCEPTIONS is not defined
+   here.  */
+#define _FP_DECL_EX		int _frm __attribute__ ((unused));
+#define FP_ROUNDMODE		_frm
+
+#define FP_RND_NEAREST		0x0
+#define FP_RND_ZERO		0x1
+#define FP_RND_PINF		0x3
+#define FP_RND_MINF		0x2
+
+#define FP_EX_INVALID		0x10
+#define FP_EX_OVERFLOW		0x04
+#define FP_EX_UNDERFLOW		0x02
+#define FP_EX_DIVZERO		0x08
+#define FP_EX_INEXACT		0x01
+
+#define _FP_TININESS_AFTER_ROUNDING 1
+
+#ifdef __riscv_flen
+#define FP_INIT_ROUNDMODE			\
+do {						\
+  __asm__ volatile ("frrm %0" : "=r" (_frm));	\
+} while (0)
+
+#define FP_HANDLE_EXCEPTIONS					\
+do {								\
+  if (__builtin_expect (_fex, 0))				\
+    __asm__ volatile ("csrs fflags, %0" : : "rK" (_fex));	\
+} while (0)
+#else
+#define FP_INIT_ROUNDMODE	_frm = FP_RND_NEAREST
+#endif
+/* Byte-order constants, defined here unconditionally.  __BYTE_ORDER is
+   big-endian only when the compiler's __BYTE_ORDER__ equals
+   __ORDER_BIG_ENDIAN__; in every other case, including __BYTE_ORDER__
+   being undefined, it is little-endian.  */
+#define	__LITTLE_ENDIAN	1234
+#define	__BIG_ENDIAN	4321
+
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#define __BYTE_ORDER __BIG_ENDIAN
+#else
+#define __BYTE_ORDER __LITTLE_ENDIAN
+#endif
+
+
+/* Define ALIASNAME as a strong alias for NAME.  strong_alias forwards
+   to _strong_alias so that macro arguments are expanded before #name
+   turns NAME into a string.  The alias is declared with the same type
+   as NAME, and the expansion already ends in a semicolon.  */
+# define strong_alias(name, aliasname) _strong_alias(name, aliasname)
+# define _strong_alias(name, aliasname) \
+  extern __typeof (name) aliasname __attribute__ ((alias (#name)));
--- a/src/linpack/soft-fp/single.h
+++ b/src/linpack/soft-fp/single.h
@ -0,0 +1,199 @@
+/* Software floating-point emulation.
+   Definitions for IEEE Single Precision.
+   Copyright (C) 1997-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Richard Henderson (rth@cygnus.com),
+		  Jakub Jelinek (jj@ultra.linux.cz),
+		  David S. Miller (davem@redhat.com) and
+		  Peter Maydell (pmaydell@chiark.greenend.org.uk).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef SOFT_FP_SINGLE_H
+#define SOFT_FP_SINGLE_H	1
+/* Storage widths for single precision.  Words narrower than 32 bits
+   are rejected at compile time.  The fraction is held in one word; the
+   double-width form takes two words when the word is narrower than 64
+   bits and one word otherwise.  */
+#if _FP_W_TYPE_SIZE < 32
+# error "Here's a nickel kid.  Go buy yourself a real computer."
+#endif
+
+#define _FP_FRACTBITS_S		_FP_W_TYPE_SIZE
+
+#if _FP_W_TYPE_SIZE < 64
+# define _FP_FRACTBITS_DW_S	(2 * _FP_W_TYPE_SIZE)
+#else
+# define _FP_FRACTBITS_DW_S	_FP_W_TYPE_SIZE
+#endif
+/* Single-precision format constants: 24 fraction bits, of which the
+   top one (_FP_IMPLBIT_S, bit 23) is the implicit bit; 8 exponent bits,
+   bias 127, maximum biased exponent 255.  The W-prefixed widths add
+   _FP_WORKBITS working bits, and the _SH_ masks are the same bits moved
+   up by _FP_WORKBITS.  The X-suffixed values are the unused bits left
+   in the storage word.  The _DW_ values describe the double-width
+   fraction; _FP_HIGHBIT_DW_S is its top bit taken modulo the word
+   size.  */
+#define _FP_FRACBITS_S		24
+#define _FP_FRACXBITS_S		(_FP_FRACTBITS_S - _FP_FRACBITS_S)
+#define _FP_WFRACBITS_S		(_FP_WORKBITS + _FP_FRACBITS_S)
+#define _FP_WFRACXBITS_S	(_FP_FRACTBITS_S - _FP_WFRACBITS_S)
+#define _FP_EXPBITS_S		8
+#define _FP_EXPBIAS_S		127
+#define _FP_EXPMAX_S		255
+#define _FP_QNANBIT_S		((_FP_W_TYPE) 1 << (_FP_FRACBITS_S-2))
+#define _FP_QNANBIT_SH_S	((_FP_W_TYPE) 1 << (_FP_FRACBITS_S-2+_FP_WORKBITS))
+#define _FP_IMPLBIT_S		((_FP_W_TYPE) 1 << (_FP_FRACBITS_S-1))
+#define _FP_IMPLBIT_SH_S	((_FP_W_TYPE) 1 << (_FP_FRACBITS_S-1+_FP_WORKBITS))
+#define _FP_OVERFLOW_S		((_FP_W_TYPE) 1 << (_FP_WFRACBITS_S))
+
+#define _FP_WFRACBITS_DW_S	(2 * _FP_WFRACBITS_S)
+#define _FP_WFRACXBITS_DW_S	(_FP_FRACTBITS_DW_S - _FP_WFRACBITS_DW_S)
+#define _FP_HIGHBIT_DW_S	\
+  ((_FP_W_TYPE) 1 << (_FP_WFRACBITS_DW_S - 1) % _FP_W_TYPE_SIZE)
+
+/* The implementation of _FP_MUL_MEAT_S and _FP_DIV_MEAT_S should be
+   chosen by the target machine.  */
+/* The float type in GCC machine mode SF, used as the value member of
+   union _FP_UNION_S.  */
+typedef float SFtype __attribute__ ((mode (SF)));
+/* Overlay of an SFtype value with its sign, exponent and fraction
+   bit-fields.  The field order is reversed between the big-endian and
+   the other layout; this assumes the compiler allocates bit-fields in
+   the matching direction for the target -- confirm.  The stored
+   fraction omits the implicit bit, giving 23 bits.  _FP_STRUCT_LAYOUT
+   is not defined in this part of the file.  */
+union _FP_UNION_S
+{
+  SFtype flt;
+  struct _FP_STRUCT_LAYOUT
+  {
+#if __BYTE_ORDER == __BIG_ENDIAN
+    unsigned sign : 1;
+    unsigned exp  : _FP_EXPBITS_S;
+    unsigned frac : _FP_FRACBITS_S - (_FP_IMPLBIT_S != 0);
+#else
+    unsigned frac : _FP_FRACBITS_S - (_FP_IMPLBIT_S != 0);
+    unsigned exp  : _FP_EXPBITS_S;
+    unsigned sign : 1;
+#endif
+  } bits;
+};
+/* Single-precision declaration and raw (un)packing, all forwarding to
+   the one-word generic macros with format tag S.  The ..._SP forms use
+   the _P raw helpers (presumably the pointer-taking variants; they are
+   not visible here).  FP_PACK_RAW_SP stores nothing when
+   FP_INHIBIT_RESULTS is true.  */
+#define FP_DECL_S(X)		_FP_DECL (1, X)
+#define FP_UNPACK_RAW_S(X, val)	_FP_UNPACK_RAW_1 (S, X, (val))
+#define FP_UNPACK_RAW_SP(X, val)	_FP_UNPACK_RAW_1_P (S, X, (val))
+#define FP_PACK_RAW_S(val, X)	_FP_PACK_RAW_1 (S, (val), X)
+#define FP_PACK_RAW_SP(val, X)			\
+  do						\
+    {						\
+      if (!FP_INHIBIT_RESULTS)			\
+	_FP_PACK_RAW_1_P (S, (val), X);		\
+    }						\
+  while (0)
+/* Unpacking wrappers.  Each one performs a raw unpack of val into X
+   and then a second step on X: _FP_UNPACK_CANONICAL for FP_UNPACK_S and
+   FP_UNPACK_SP, _FP_UNPACK_SEMIRAW for the SEMIRAW pair.  The ..._SP
+   forms differ only in using _FP_UNPACK_RAW_1_P for the raw step.  */
+#define FP_UNPACK_S(X, val)			\
+  do						\
+    {						\
+      _FP_UNPACK_RAW_1 (S, X, (val));		\
+      _FP_UNPACK_CANONICAL (S, 1, X);		\
+    }						\
+  while (0)
+
+#define FP_UNPACK_SP(X, val)			\
+  do						\
+    {						\
+      _FP_UNPACK_RAW_1_P (S, X, (val));		\
+      _FP_UNPACK_CANONICAL (S, 1, X);		\
+    }						\
+  while (0)
+
+#define FP_UNPACK_SEMIRAW_S(X, val)		\
+  do						\
+    {						\
+      _FP_UNPACK_RAW_1 (S, X, (val));		\
+      _FP_UNPACK_SEMIRAW (S, 1, X);		\
+    }						\
+  while (0)
+
+#define FP_UNPACK_SEMIRAW_SP(X, val)		\
+  do						\
+    {						\
+      _FP_UNPACK_RAW_1_P (S, X, (val));		\
+      _FP_UNPACK_SEMIRAW (S, 1, X);		\
+    }						\
+  while (0)
+/* Packing wrappers, the mirror image of the unpacking ones: X is first
+   converted with _FP_PACK_CANONICAL or _FP_PACK_SEMIRAW and then stored
+   with a raw pack.  In the ..._SP forms the conversion step always
+   runs, but the raw store through _FP_PACK_RAW_1_P is skipped when
+   FP_INHIBIT_RESULTS is true.  */
+#define FP_PACK_S(val, X)			\
+  do						\
+    {						\
+      _FP_PACK_CANONICAL (S, 1, X);		\
+      _FP_PACK_RAW_1 (S, (val), X);		\
+    }						\
+  while (0)
+
+#define FP_PACK_SP(val, X)			\
+  do						\
+    {						\
+      _FP_PACK_CANONICAL (S, 1, X);		\
+      if (!FP_INHIBIT_RESULTS)			\
+	_FP_PACK_RAW_1_P (S, (val), X);		\
+    }						\
+  while (0)
+
+#define FP_PACK_SEMIRAW_S(val, X)		\
+  do						\
+    {						\
+      _FP_PACK_SEMIRAW (S, 1, X);		\
+      _FP_PACK_RAW_1 (S, (val), X);		\
+    }						\
+  while (0)
+
+#define FP_PACK_SEMIRAW_SP(val, X)		\
+  do						\
+    {						\
+      _FP_PACK_SEMIRAW (S, 1, X);		\
+      if (!FP_INHIBIT_RESULTS)			\
+	_FP_PACK_RAW_1_P (S, (val), X);		\
+    }						\
+  while (0)
+/* Single-precision arithmetic: each macro forwards to the generic
+   operation with format tag S and a word count of 1.  The square-root
+   core is the one-word _FP_SQRT_MEAT_1.  FP_FMA_S passes 2 as its third
+   argument when the word is narrower than 64 bits and 1 otherwise; the
+   same condition selects the two-word double-width fraction elsewhere
+   in this file.  */
+#define FP_ISSIGNAN_S(X)		_FP_ISSIGNAN (S, 1, X)
+#define FP_NEG_S(R, X)			_FP_NEG (S, 1, R, X)
+#define FP_ADD_S(R, X, Y)		_FP_ADD (S, 1, R, X, Y)
+#define FP_SUB_S(R, X, Y)		_FP_SUB (S, 1, R, X, Y)
+#define FP_MUL_S(R, X, Y)		_FP_MUL (S, 1, R, X, Y)
+#define FP_DIV_S(R, X, Y)		_FP_DIV (S, 1, R, X, Y)
+#define FP_SQRT_S(R, X)			_FP_SQRT (S, 1, R, X)
+#define _FP_SQRT_MEAT_S(R, S, T, X, Q)	_FP_SQRT_MEAT_1 (R, S, T, X, (Q))
+
+#if _FP_W_TYPE_SIZE < 64
+# define FP_FMA_S(R, X, Y, Z)	_FP_FMA (S, 1, 2, R, X, Y, Z)
+#else
+# define FP_FMA_S(R, X, Y, Z)	_FP_FMA (S, 1, 1, R, X, Y, Z)
+#endif
+/* Single-precision comparison and integer conversion wrappers; each
+   forwards to the generic macro with format tag S and a word count of
+   1, parenthesising the scalar arguments.  */
+#define FP_CMP_S(r, X, Y, un, ex)	_FP_CMP (S, 1, (r), X, Y, (un), (ex))
+#define FP_CMP_EQ_S(r, X, Y, ex)	_FP_CMP_EQ (S, 1, (r), X, Y, (ex))
+#define FP_CMP_UNORD_S(r, X, Y, ex)	_FP_CMP_UNORD (S, 1, (r), X, Y, (ex))
+
+#define FP_TO_INT_S(r, X, rsz, rsg)	_FP_TO_INT (S, 1, (r), X, (rsz), (rsg))
+#define FP_TO_INT_ROUND_S(r, X, rsz, rsg)	\
+  _FP_TO_INT_ROUND (S, 1, (r), X, (rsz), (rsg))
+#define FP_FROM_INT_S(X, r, rs, rt)	_FP_FROM_INT (S, 1, X, (r), (rs), rt)
+/* High-word accessors.  The normal and raw forms both use the one-word
+   accessor.  The double-width form uses the two-word accessor when the
+   word is narrower than 64 bits and the one-word accessor otherwise.  */
+#define _FP_FRAC_HIGH_S(X)	_FP_FRAC_HIGH_1 (X)
+#define _FP_FRAC_HIGH_RAW_S(X)	_FP_FRAC_HIGH_1 (X)
+
+#if _FP_W_TYPE_SIZE < 64
+# define _FP_FRAC_HIGH_DW_S(X)	_FP_FRAC_HIGH_2 (X)
+#else
+# define _FP_FRAC_HIGH_DW_S(X)	_FP_FRAC_HIGH_1 (X)
+#endif
+
+#endif /* !SOFT_FP_SINGLE_H */
--- a/src/linpack/soft-fp/soft-fp.h
+++ b/src/linpack/soft-fp/soft-fp.h
@ -0,0 +1,230 @@
+#ifndef __SOFT_FP_H__
+#define __SOFT_FP_H__
+
+#include "sfp-machine.h"
+/* abort () is redefined as an empty function-like macro, so the
+   non-GCC fallback for _FP_UNREACHABLE below expands to nothing.
+   NOTE(review): this also hides any real abort () from code that
+   includes this header -- confirm that is intended for this target.  */
+#define abort()   // 54
+/* For unreachable default cases in switch statements over bitwise OR
+   of FP_CLS_* values.  With GCC >= 4.5 this is
+   __builtin_unreachable (); otherwise it is abort ().  */
+#if (defined __GNUC__							\
+     && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)))
+# define _FP_UNREACHABLE	__builtin_unreachable ()
+#else
+# define _FP_UNREACHABLE	abort ()
+#endif
+/* _FP_STATIC_ASSERT (expr, msg): compile-time assertion.  With GCC >= 4.6
+   or a C11 compiler it is _Static_assert.  Otherwise it declares a
+   function whose return type contains a struct with a bit-field of
+   width -1 when EXPR is false, which fails to compile; MSG is unused
+   on that path.  */	// 63
+#if ((defined __GNUC__							\
+      && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))	\
+     || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 201112L))
+# define _FP_STATIC_ASSERT(expr, msg)		\
+  _Static_assert ((expr), msg)
+#else
+# define _FP_STATIC_ASSERT(expr, msg)					\
+  extern int (*__Static_assert_function (void))				\
+    [!!sizeof (struct { int __error_if_negative: (expr) ? 2 : -1; })]
+#endif
+
+/* Low fraction bits while a value is being worked on.  _FP_WORKBITS (3)
+   extra bits sit at positions 2..0 (round, guard, sticky); bit 3
+   (_FP_WORK_LSB) is presumably the least significant bit kept in the
+   result.  The _FP_ROUND* macros below test these bits with the masks
+   7 and 15.  _FP_ZERO_INIT is an "= 0" initializer fragment that is not
+   used inside this header.  */
+#define _FP_ZERO_INIT	  = 0		// 82
+#define _FP_WORKBITS		3			// 85
+#define _FP_WORK_LSB		((_FP_W_TYPE) 1 << 3)
+#define _FP_WORK_ROUND		((_FP_W_TYPE) 1 << 2)	// 87
+#define _FP_WORK_GUARD		((_FP_W_TYPE) 1 << 1)
+#define _FP_WORK_STICKY		((_FP_W_TYPE) 1 << 0)	// 89
+/* Rounding-mode codes and the active mode.  The four FP_RND_* values
+   are only defined here when FP_RND_NEAREST is not already defined
+   (e.g. by sfp-machine.h, included above).  FP_ROUNDMODE defaults to
+   FP_RND_NEAREST and selects the case taken in _FP_ROUND.  */
+#ifndef FP_RND_NEAREST
+# define FP_RND_NEAREST		0
+# define FP_RND_ZERO		1
+# define FP_RND_PINF		2
+# define FP_RND_MINF		3
+#endif
+#ifndef FP_ROUNDMODE
+# define FP_ROUNDMODE		FP_RND_NEAREST
+#endif
+
+/* By default don't care about exceptions: every FP_EX_* flag that is
+   not already defined (e.g. by sfp-machine.h) becomes 0, so passing it
+   to FP_SET_EXCEPTION ORs nothing into _fex.  */	// 101
+#ifndef FP_EX_INVALID
+# define FP_EX_INVALID		0
+#endif
+#ifndef FP_EX_OVERFLOW
+# define FP_EX_OVERFLOW		0
+#endif
+#ifndef FP_EX_UNDERFLOW
+# define FP_EX_UNDERFLOW	0
+#endif
+#ifndef FP_EX_DIVZERO
+# define FP_EX_DIVZERO		0
+#endif
+#ifndef FP_EX_INEXACT
+# define FP_EX_INEXACT		0
+#endif
+#ifndef FP_EX_DENORM
+# define FP_EX_DENORM		0
+#endif
+
+/* Sub-exceptions of "invalid".  Each one defaults to 0 unless it is
+   already defined, so by default none of them contributes a flag bit.  */		// 121
+/* Signaling NaN operand.  */
+#ifndef FP_EX_INVALID_SNAN
+# define FP_EX_INVALID_SNAN	0
+#endif
+/* Inf * 0.  */							// 126
+#ifndef FP_EX_INVALID_IMZ
+# define FP_EX_INVALID_IMZ	0
+#endif
+
+/* Inf - Inf.  */						// 134
+#ifndef FP_EX_INVALID_ISI
+# define FP_EX_INVALID_ISI	0
+#endif
+/* 0 / 0.  */
+#ifndef FP_EX_INVALID_ZDZ
+# define FP_EX_INVALID_ZDZ	0
+#endif
+/* Inf / Inf.  */
+#ifndef FP_EX_INVALID_IDI
+# define FP_EX_INVALID_IDI	0
+#endif
+
+/* Invalid conversion to integer.  */
+#ifndef FP_EX_INVALID_CVI
+# define FP_EX_INVALID_CVI	0
+#endif
+/* Invalid comparison.  */				// 154
+#ifndef FP_EX_INVALID_VC				
+# define FP_EX_INVALID_VC	0			
+#endif
+
+/* _FP_STRUCT_LAYOUT may be defined as an attribute to determine the
+   struct layout variant used for structures where bit-fields are used
+   to access specific parts of binary floating-point numbers.  This is
+   required for systems where the default ABI uses struct layout with
+   differences in how consecutive bit-fields are laid out from the
+   default expected by soft-fp.  When nothing defined it earlier, it
+   expands to nothing, i.e. no attribute is applied.  */
+#ifndef _FP_STRUCT_LAYOUT
+# define _FP_STRUCT_LAYOUT
+#endif
+/* FP_DECL_EX declares the per-call exception accumulator _fex
+   (initially 0), followed by the machine's own _FP_DECL_EX
+   declarations when that macro is defined.  The caller supplies the
+   terminating semicolon.  */	// 169
+#ifdef _FP_DECL_EX
+# define FP_DECL_EX					\
+  int _fex = 0;						\
+  _FP_DECL_EX
+#else
+# define FP_DECL_EX int _fex = 0
+#endif
+
+/* Initialize any machine-specific state used in FP_ROUNDMODE,
+   FP_TRAPPING_EXCEPTIONS or FP_HANDLE_EXCEPTIONS.  Defaults to an empty
+   statement when the machine header did not define it.  */
+#ifndef FP_INIT_ROUNDMODE
+# define FP_INIT_ROUNDMODE do {} while (0)
+#endif
+
+/* Initialize any machine-specific state used in
+   FP_TRAPPING_EXCEPTIONS or FP_HANDLE_EXCEPTIONS.  In this header it is
+   defined unconditionally as an alias of FP_INIT_ROUNDMODE.  */
+# define FP_INIT_TRAPPING_EXCEPTIONS FP_INIT_ROUNDMODE	// 186
+
+/* Initialize any machine-specific state used in
+   FP_HANDLE_EXCEPTIONS.  Defined unconditionally as an alias of
+   FP_INIT_TRAPPING_EXCEPTIONS.  */
+#define FP_INIT_EXCEPTIONS FP_INIT_TRAPPING_EXCEPTIONS	// 192
+/* Fixed exception plumbing for this build.  FP_HANDLE_EXCEPTIONS is an
+   empty statement, FP_DENORM_ZERO and FP_TRAPPING_EXCEPTIONS are 0, and
+   raised flags are ORed into the local _fex declared by FP_DECL_EX;
+   FP_CUR_EXCEPTIONS reads that variable back.  */
+#define FP_HANDLE_EXCEPTIONS do {} while (0)	// 196
+
+#define FP_DENORM_ZERO  0			// 201
+#define FP_SET_EXCEPTION(ex)	_fex |= (ex)		// 212
+#define FP_CUR_EXCEPTIONS		 (_fex)				// 215
+#define FP_TRAPPING_EXCEPTIONS 0					// 219
+ 
+/* _FP_ROUND_NEAREST (wc, X): round-to-nearest step.  Adds
+   _FP_WORK_ROUND to the fraction unless its low four bits are exactly
+   _FP_WORK_ROUND (binary 0100), i.e. an exact tie whose bit 3 is
+   already 0 is left alone.  This gives ties-to-even provided the
+   working bits are dropped afterwards (done elsewhere -- confirm).  */
+													// 259
+#define _FP_ROUND_NEAREST(wc, X)				\
+  do								\
+    {								\
+      if ((_FP_FRAC_LOW_##wc (X) & 15) != _FP_WORK_ROUND)	\
+	_FP_FRAC_ADDI_##wc (X, _FP_WORK_ROUND);			\
+    }								\
+  while (0)
+/* _FP_ROUND_ZERO (wc, X): round-toward-zero step; leaves X untouched.  */
+#define _FP_ROUND_ZERO(wc, X)		(void) 0
+/* _FP_ROUND_PINF (wc, X): round-toward-plus-infinity step.  When the
+   sign X_s is clear and any of the three low working bits is set, adds
+   _FP_WORK_LSB to the fraction; otherwise X is left as is.  */
+#define _FP_ROUND_PINF(wc, X)				\
+  do							\
+    {							\
+      if (!X##_s && (_FP_FRAC_LOW_##wc (X) & 7))	\
+	_FP_FRAC_ADDI_##wc (X, _FP_WORK_LSB);		\
+    }							\
+  while (0)
+/* _FP_ROUND_MINF (wc, X): round-toward-minus-infinity step.  When the
+   sign X_s is set and any of the three low working bits is set, adds
+   _FP_WORK_LSB to the fraction; otherwise X is left as is.  */
+#define _FP_ROUND_MINF(wc, X)			\
+  do						\
+    {						\
+      if (X##_s && (_FP_FRAC_LOW_##wc (X) & 7))	\
+	_FP_FRAC_ADDI_##wc (X, _FP_WORK_LSB);	\
+    }						\
+  while (0)
+/* _FP_ROUND (wc, X): if any of the three low working bits of X is set,
+   raise FP_EX_INEXACT and apply the rounding step chosen by
+   FP_ROUNDMODE.  The switch has no default case, so an FP_ROUNDMODE
+   value outside the four FP_RND_* codes leaves X unchanged.  */
+#define _FP_ROUND(wc, X)			\
+  do						\
+    {						\
+      if (_FP_FRAC_LOW_##wc (X) & 7)		\
+	{					\
+	  FP_SET_EXCEPTION (FP_EX_INEXACT);	\
+	  switch (FP_ROUNDMODE)			\
+	    {					\
+	    case FP_RND_NEAREST:		\
+	      _FP_ROUND_NEAREST (wc, X);	\
+	      break;				\
+	    case FP_RND_ZERO:			\
+	      _FP_ROUND_ZERO (wc, X);		\
+	      break;				\
+	    case FP_RND_PINF:			\
+	      _FP_ROUND_PINF (wc, X);		\
+	      break;				\
+	    case FP_RND_MINF:			\
+	      _FP_ROUND_MINF (wc, X);		\
+	      break;				\
+	    }					\
+	}					\
+    }						\
+  while (0)
+/* Operand classes.  Each code fits in two bits, so
+   _FP_CLS_COMBINE (x, y) = (x << 2) | y gives a distinct value for
+   every pair of classes.  */
+#define FP_CLS_NORMAL		0		// 310
+#define FP_CLS_ZERO		1
+#define FP_CLS_INF		2
+#define FP_CLS_NAN		3
+
+#define _FP_CLS_COMBINE(x, y)	(((x) << 2) | (y))	// 315
+
+#include "op-1.h"
+#include "op-2.h"
+#include "op-4.h"
+#include "op-8.h"
+#include "op-common.h"
+
+/* Sigh.  Silly things longlong.h needs: its word type and word size
+   are mapped onto soft-fp's _FP_W_TYPE and _FP_W_TYPE_SIZE.  */
+#define UWtype		_FP_W_TYPE
+#define W_TYPE_SIZE	_FP_W_TYPE_SIZE
+/* Integer types with fixed machine modes, selected through GCC's mode
+   attribute (QI, HI, SI, DI).  UHWtype is defined only when
+   _FP_W_TYPE_SIZE is 32 (HI mode) or 64 (USItype); any other size
+   leaves it undeclared.  */
+typedef int QItype __attribute__ ((mode (QI)));
+typedef int SItype __attribute__ ((mode (SI)));
+typedef int DItype __attribute__ ((mode (DI)));
+typedef unsigned int UQItype __attribute__ ((mode (QI)));
+typedef unsigned int USItype __attribute__ ((mode (SI)));
+typedef unsigned int UDItype __attribute__ ((mode (DI)));
+#if _FP_W_TYPE_SIZE == 32
+typedef unsigned int UHWtype __attribute__ ((mode (HI)));
+#elif _FP_W_TYPE_SIZE == 64
+typedef USItype UHWtype;
+#endif
+/* CMPtype defaults to int unless already defined.  SI_BITS and DI_BITS
+   are the widths in bits of SItype and DItype, computed from
+   __CHAR_BIT__ and sizeof and cast to int.  */
+#ifndef CMPtype
+# define CMPtype	int
+#endif
+
+#define SI_BITS		(__CHAR_BIT__ * (int) sizeof (SItype))
+#define DI_BITS		(__CHAR_BIT__ * (int) sizeof (DItype))
+
+#include "longlong.h"
+
+#endif
+
+
+
--- a/src/linpack/soft-fp/subdf3.c
+++ b/src/linpack/soft-fp/subdf3.c
@ -0,0 +1,21 @@
+#include "soft-fp.h"
+#include "double.h"
+/* __subdf3 (a, b): soft-float subtraction on DFtype; per the GCC
+   soft-float routine contract the result is a - b.  Both operands are
+   unpacked in semi-raw form, FP_SUB_D computes R from A and B, and R is
+   packed into r.  With the defaults in soft-fp.h, FP_HANDLE_EXCEPTIONS
+   is an empty statement, so flags gathered in _fex are not reported.  */
+DFtype
+__subdf3 (DFtype a, DFtype b)
+{
+  FP_DECL_EX;			/* declares the _fex flag accumulator */
+  FP_DECL_D (A);
+  FP_DECL_D (B);
+  FP_DECL_D (R);
+  DFtype r;
+
+  FP_INIT_ROUNDMODE;
+  FP_UNPACK_SEMIRAW_D (A, a);
+  FP_UNPACK_SEMIRAW_D (B, b);
+  FP_SUB_D (R, A, B);
+  FP_PACK_SEMIRAW_D (r, R);
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
--- a/Show more
+++ b/Show more