// builtin_fp16.h

/*
 * Copyright (c) 2009-2015 by llvm/compiler-rt contributors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.

 * Copyright (c) 2018 by Contributors
 * \file builtin_fp16.h
 * \brief Functions for conversion between fp32 and fp16, adapted from compiler-rt.
 */

#include <cstdint>

/*!
 * \brief Count the leading zero bits of a 32-bit word.
 * \param x The word to inspect.
 * \return The number of zero bits above the highest set bit; 32 when x is 0.
 */
static inline uint32_t __clz(uint32_t x) {
  // Binary search for the highest set bit: at each step look at the part of
  // x above the lowest `width` bits.  If anything is set there, the answer
  // lies in that upper part, so drop the low bits and credit their width.
  uint32_t zeros = 32;
  for (uint32_t width = 16; width != 0; width >>= 1) {
    const uint32_t upper = x >> width;
    if (upper != 0) {
      zeros -= width;
      x = upper;
    }
  }
  // x is now 1 if any bit was set in the input, and 0 otherwise.
  return zeros - x;
}

/*!
 * \brief Convert a floating-point value to a narrower floating-point format.
 *
 * The conversion works purely on bit patterns.  SRC_REP_T / DST_REP_T are the
 * integer types that hold the bits of SRC_T / DST_T, and SRC_SIG_BITS /
 * DST_SIG_BITS are the numbers of stored significand bits.  Exponent widths
 * and biases are derived from sizeof(SRC_T) and sizeof(DST_T).
 *
 * Results that are not exactly representable are rounded to nearest, ties to
 * even.  NaN inputs produce a quiet NaN carrying the truncated payload,
 * magnitudes at or above the overflow threshold produce infinity, and
 * magnitudes below the destination's normal range produce a denormal or
 * zero.  The sign bit is carried over unchanged.
 *
 * \param a The value to narrow.
 * \return The narrowed bit pattern, reinterpreted as DST_T.
 */
template <typename SRC_T, typename SRC_REP_T, int SRC_SIG_BITS,
          typename DST_T, typename DST_REP_T, int DST_SIG_BITS>
static inline DST_T __truncXfYf2__(SRC_T a) {
  // Shape of the source format (all compile-time constants).
  const int srcWidth = sizeof(SRC_T) * 8;
  const int srcExpWidth = srcWidth - SRC_SIG_BITS - 1;
  const int srcMaxExp = (1 << srcExpWidth) - 1;
  const int srcBias = srcMaxExp >> 1;

  // Shape of the destination format.
  const int dstWidth = sizeof(DST_T) * 8;
  const int dstExpWidth = dstWidth - DST_SIG_BITS - 1;
  const int dstMaxExp = (1 << dstExpWidth) - 1;
  const int dstBias = dstMaxExp >> 1;

  // Number of low significand bits that do not fit in the destination.
  const int droppedBits = SRC_SIG_BITS - DST_SIG_BITS;

  // Masks and special encodings in the source representation.
  const SRC_REP_T srcImplicitBit = SRC_REP_T(1) << SRC_SIG_BITS;
  const SRC_REP_T srcFractionMask = srcImplicitBit - 1;
  const SRC_REP_T srcInfBits = (SRC_REP_T)srcMaxExp << SRC_SIG_BITS;
  const SRC_REP_T srcSignBit = SRC_REP_T(1) << (SRC_SIG_BITS + srcExpWidth);
  const SRC_REP_T srcMagnitudeMask = srcSignBit - 1;
  const SRC_REP_T srcPayloadMask = (SRC_REP_T(1) << (SRC_SIG_BITS - 1)) - 1;
  const SRC_REP_T discardMask = (SRC_REP_T(1) << droppedBits) - 1;
  const SRC_REP_T tie = SRC_REP_T(1) << (droppedBits - 1);

  // Source encodings of the smallest destination normal and of the first
  // magnitude that no longer fits in the destination.
  const SRC_REP_T minNormalBits = (SRC_REP_T)(srcBias + 1 - dstBias) << SRC_SIG_BITS;
  const SRC_REP_T overflowBits = (SRC_REP_T)(srcBias + dstMaxExp - dstBias) << SRC_SIG_BITS;

  // Special encodings in the destination representation.
  const DST_REP_T dstInfBits = (DST_REP_T)dstMaxExp << DST_SIG_BITS;
  const DST_REP_T dstQuietBit = DST_REP_T(1) << (DST_SIG_BITS - 1);
  const DST_REP_T dstPayloadMask = dstQuietBit - 1;

  // Reinterpret the input as raw bits and split off the sign.
  union SrcPun { SRC_T value; SRC_REP_T bits; };
  SrcPun in;
  in.value = a;
  const SRC_REP_T bits = in.bits;
  const SRC_REP_T magnitude = bits & srcMagnitudeMask;
  const SRC_REP_T signBit = bits & srcSignBit;
  DST_REP_T narrowed;

  // Relies on wrap-around: true for minNormalBits <= magnitude < overflowBits.
  if (magnitude - minNormalBits < magnitude - overflowBits) {
    // Normal in the destination: drop the extra significand bits and rebias
    // the exponent, then round.  A carry out of the significand bumps the
    // exponent, which is the desired result.
    narrowed = magnitude >> droppedBits;
    narrowed -= (DST_REP_T)(srcBias - dstBias) << DST_SIG_BITS;

    const SRC_REP_T discarded = magnitude & discardMask;
    // Round up when above the halfway point, or exactly halfway and odd.
    if (discarded > tie || (discarded == tie && (narrowed & 1))) {
      narrowed++;
    }
  } else if (magnitude > srcInfBits) {
    // NaN: infinity exponent, quiet bit set, truncated payload preserved.
    narrowed = dstInfBits | dstQuietBit |
               (((magnitude & srcPayloadMask) >> droppedBits) & dstPayloadMask);
  } else if (magnitude >= overflowBits) {
    // Too large for the destination (or already infinite).
    narrowed = dstInfBits;
  } else {
    // Below the destination's normal range, or an exact zero.  The exponent
    // tells how far the significand must be shifted to become a denormal.
    const int exponent = magnitude >> SRC_SIG_BITS;
    const int denormShift = srcBias - dstBias - exponent + 1;

    if (denormShift <= SRC_SIG_BITS) {
      const SRC_REP_T significand = (bits & srcFractionMask) | srcImplicitBit;
      // Remember whether any set bit falls off the bottom (sticky bit) so
      // that the rounding below can still tell "exactly half" from "more".
      const bool sticky = (significand << (srcWidth - denormShift)) != 0;
      const SRC_REP_T shifted =
          (significand >> denormShift) | static_cast<SRC_REP_T>(sticky);

      narrowed = shifted >> droppedBits;
      const SRC_REP_T discarded = shifted & discardMask;
      // Round up when above the halfway point, or exactly halfway and odd.
      if (discarded > tie || (discarded == tie && (narrowed & 1))) {
        narrowed++;
      }
    } else {
      // Every significand bit is shifted out: the result is zero.
      narrowed = 0;
    }
  }

  // Move the sign bit to the top of the destination word and reinterpret.
  union DstPun { DST_T value; DST_REP_T bits; };
  DstPun out;
  out.bits = narrowed | (signBit >> (srcWidth - dstWidth));
  return out.value;
}

/*!
 * \brief Convert a floating-point value to a wider floating-point format.
 *
 * The conversion works purely on bit patterns.  SRC_REP_T / DST_REP_T are the
 * integer types that hold the bits of SRC_T / DST_T, and SRC_SIG_BITS /
 * DST_SIG_BITS are the numbers of stored significand bits.  Exponent widths
 * and biases are derived from sizeof(SRC_T) and sizeof(DST_T).
 *
 * No rounding is performed: source bits are shifted into their position in
 * the destination.  Source denormals are renormalized, infinity maps to
 * infinity, and NaN keeps its quiet bit and payload (left-aligned in the
 * wider significand).  The sign bit is carried over unchanged.
 *
 * NOTE(review): the denormal path calls __clz, which takes a uint32_t, so a
 * SRC_REP_T wider than 32 bits would be truncated there -- confirm callers
 * only use source representations of at most 32 bits.
 *
 * \param a The value to widen.
 * \return The widened bit pattern, reinterpreted as DST_T.
 */
template<typename SRC_T, typename SRC_REP_T, int SRC_SIG_BITS,
         typename DST_T, typename DST_REP_T, int DST_SIG_BITS>
static inline DST_T __extendXfYf2__(SRC_T a) {
  // Shape of the source format (all compile-time constants).
  const int srcWidth = sizeof(SRC_T) * 8;
  const int srcExpWidth = srcWidth - SRC_SIG_BITS - 1;
  const int srcMaxExp = (1 << srcExpWidth) - 1;
  const int srcBias = srcMaxExp >> 1;

  // Shape of the destination format.
  const int dstWidth = sizeof(DST_T) * 8;
  const int dstExpWidth = dstWidth - DST_SIG_BITS - 1;
  const int dstMaxExp = (1 << dstExpWidth) - 1;
  const int dstBias = dstMaxExp >> 1;

  // Number of extra low significand bits available in the destination.
  const int extraBits = DST_SIG_BITS - SRC_SIG_BITS;

  // Masks and special encodings in the source representation.
  const SRC_REP_T srcImplicitBit = SRC_REP_T(1) << SRC_SIG_BITS;
  const SRC_REP_T srcInfBits = (SRC_REP_T)srcMaxExp << SRC_SIG_BITS;
  const SRC_REP_T srcSignBit = SRC_REP_T(1) << (SRC_SIG_BITS + srcExpWidth);
  const SRC_REP_T srcMagnitudeMask = srcSignBit - 1;
  const SRC_REP_T srcQuietBit = SRC_REP_T(1) << (SRC_SIG_BITS - 1);
  const SRC_REP_T srcPayloadMask = srcQuietBit - 1;

  const DST_REP_T dstImplicitBit = DST_REP_T(1) << DST_SIG_BITS;

  // Reinterpret the input as raw bits and split off the sign.
  union SrcPun { SRC_T value; SRC_REP_T bits; };
  SrcPun in;
  in.value = a;
  const SRC_REP_T bits = in.bits;
  const SRC_REP_T magnitude = bits & srcMagnitudeMask;
  const SRC_REP_T signBit = bits & srcSignBit;
  DST_REP_T widened;

  // The cast back to SRC_REP_T matters when SRC_REP_T is narrower than int:
  // without it the subtraction would be evaluated as a signed int and the
  // wrap-around that excludes zero and denormals would be lost.
  if ((SRC_REP_T)(magnitude - srcImplicitBit) < srcInfBits - srcImplicitBit) {
    // Normal number: move exponent and significand into place, then add the
    // difference between the two exponent biases.
    widened = ((DST_REP_T)magnitude << extraBits) +
              ((DST_REP_T)(dstBias - srcBias) << DST_SIG_BITS);
  } else if (magnitude >= srcInfBits) {
    // Infinity or NaN: all-ones exponent, with the quiet bit and the payload
    // (both zero for infinity) shifted up to the top of the significand.
    widened = ((DST_REP_T)dstMaxExp << DST_SIG_BITS) |
              ((DST_REP_T)(magnitude & srcQuietBit) << extraBits) |
              ((DST_REP_T)(magnitude & srcPayloadMask) << extraBits);
  } else if (magnitude) {
    // Denormal: shift left until the leading set bit lands on the implicit
    // bit position, strip that bit, and encode the matching exponent.
    const int normShift = __clz(magnitude) - __clz(srcImplicitBit);
    const DST_REP_T fraction =
        ((DST_REP_T)magnitude << (extraBits + normShift)) ^ dstImplicitBit;
    const int exponent = dstBias - srcBias - normShift + 1;
    widened = fraction | ((DST_REP_T)exponent << DST_SIG_BITS);
  } else {
    // Zero stays zero; only the sign is added below.
    widened = 0;
  }

  // Move the sign bit to the top of the destination word and reinterpret.
  union DstPun { DST_T value; DST_REP_T bits; };
  DstPun out;
  out.bits = widened | ((DST_REP_T)signBit << (dstWidth - srcWidth));
  return out.value;
}