
/**********************************************************//**
 **
 ** @file util/bitops.h
 ** vector manipulation primitives
 **
 ** Copyright (C) 2008  Xpace, LLC.  All rights reserved.
 **
 **************************************************************/

#ifndef XPACE_BITOPS_H
#define XPACE_BITOPS_H

#include "base/types.h"

namespace Xpace
  {
  // how many bits needed to represent a quantity?
  template<typename T>
  size_t bitsNeeded
    (T n);

  // how many bytes?
  template<typename T> 
  size_t bytesNeeded
    (T n);

  // is a type signed
  template<typename T> 
  bool isSigned
    ();

  // how many bits are set
  template<typename T> 
  size_t bitsSet
    (T n);

  // what is the highest bit set?
  // @note equivalent to floor(log2())
  // @warning do NOT call with n==0
  // @return bit number
  template<typename T>
  size_t leftBit
    (T n);

  // what is the lowest bit set?
  // @warning do NOT call with n==0
  // @return bit number
  template<typename T>
  size_t rightBit
    (T n);

  // is the bit set?
  // @return nonzero if set
  template<typename T>
  uint testBit
    (T n, 
     uint bit);

  // is the bit set?
  // if not, set it
  // @return nonzero if it was originally set
  template<typename T>
  uint testAndSetBit
    (T* n, 
     uint bit);

  // return log2 (left bit number) of
  // a constant
  template <unsigned N>
  uint log2Const
    ();

  // return bits set of a constant
  template <unsigned N>
  size_t bitsSetConst
    ();

  // ============================================================
  // ============================================================
  // ============================================================

  #if defined _MSC_VER
  // temporarily disable warnings about excessive shifts
  # pragma warning(push)
  # pragma warning(disable: 4293 4333)
  #elif defined __GNUC__
  # pragma GCC system_header
  #endif

  template<>
  inline 
  uint log2Const<0>
    ()
  { 
    return 0; 
  }

  template<unsigned N>
  inline 
  uint log2Const
    ()
  {
    return 1 + log2Const<N >> 1>();
  }

  template<>
  inline 
  size_t bitsSetConst<0>
    ()
  { 
    return 0; 
  }

  template<unsigned N>
  inline 
  size_t bitsSetConst
    ()
  {
    return uint(N & 1) + bitsSetConst<N >> 1>();
  }

  template <unsigned BYTES>
  struct BytesToWordType
  {
    typedef uint type;
  };

  template <> struct BytesToWordType<1>
  {
    typedef uint8 type;
  };

  template <> struct BytesToWordType<2>
  {
    typedef uint16 type;
  };

  template <> struct BytesToWordType<4>
  {
    typedef uint32 type;
  };

  template <> struct BytesToWordType<8>
  {
    typedef uint64 type;
  };

  #define WORD_T(T) typename BytesToWordType<sizeof(T)>::type
  #define TO_WORD(T,n) static_cast<WORD_T(T)>(n)

  template <typename T>
  inline bool isSigned()
  {
    return false;

  }
  template <> inline bool isSigned< BytesToWordType<1>::type >() { return true; }
  template <> inline bool isSigned< BytesToWordType<2>::type >() { return true; }
  template <> inline bool isSigned< BytesToWordType<4>::type >() { return true; }
  template <> inline bool isSigned< BytesToWordType<8>::type >() { return true; }

  template <unsigned BYTES>
  inline uint bitsSetWord
    (typename BytesToWordType<BYTES>::type n)
  #if 1
  {
  #if defined __GNUC__
    if (BYTES == 4) return __builtin_popcount(n);
    if (BYTES == 8) return __builtin_popcountl(n);
  #endif
    // this implementation has no branches and
    // is somewhat pipeline friendly
    typedef typename BytesToWordType<BYTES>::type type;
    const type m1  = type(0x5555555555555555);
    const type m2  = type(0x3333333333333333);
    const type m4  = type(0x0f0f0f0f0f0f0f0f);
    const type m8  = type(0x00ff00ff00ff00ff);
    const type m16 = type(0x0000ffff0000ffff);
    const type m32 = type(0x00000000ffffffff);
    n = (n & m1 ) + ((n >>  1) & m1 ); 
    n = (n & m2 ) + ((n >>  2) & m2 );
    n = (n & m4 ) + ((n >>  4) & m4 );
    if (BYTES > 1) n = (n & m8 ) + ((n >>  8) & m8 );
    if (BYTES > 2) n = (n & m16) + ((n >> 16) & m16);
    if (BYTES > 4) n = (n & m32) + ((n >> 32) & m32);
    return uint(n);
  }
  #elif 0
  {
    // Parallelizes bit counting on a byte basis.
    // However this version suffers from three problems:
    // 1) it has to put the parameter in byte-addressable
    // storage, and 2) it does an external array lookup
    // for each byte, and 3) it is not particularly pipeline
    // friendly
    uint tot = 0;
    uint8 *p = reinterpret_cast<uint8*>(&n);
    for (uint i = 0; i < BYTES; i++)
    tot += BSS::bitsSet[*p++];
    return tot;
  }
  #elif 0
  {
    // this version is almost completely naive,
    // except that the bit counting is predicated
    uint b = 0;
    for (; n; n >>= 1) b += (n & 1);
    return b;
  }
  #else
  {
    // this version is quite naive
    uint b = 0;
    for (; n; n >>= 1)
    if (n & 1) ++b;
    return b;
  }
  #endif

  template <typename T>
  inline size_t bitsSet
    (T n)
  {
    return bitsSetWord<sizeof(T)>(TO_WORD(T, n));
  }

  #if defined _MSC_VER
  }
  # include <intrin.h>
  namespace Xpace
  {
  # pragma intrinsic(_BitScanForward, _BitScanReverse, _bittest)
  # if defined _WIN64
  #  pragma intrinsic(_BitScanForward64, _BitScanReverse64, _bittest64)
  # endif
  #endif

  template <unsigned BYTES>
  inline uint leftBitWord
    (typename BytesToWordType<BYTES>::type n)
  {
  #if defined _WIN64
    if (BYTES == 8)
    {
      unsigned long pos;
      _BitScanReverse64(&pos, n);
      return int(pos);
    }
  #endif
  #if defined _MSC_VER
  # if ! defined _WIN64
    if ((BYTES == 8) && (0 != (n >> 32)))
    {
      unsigned long pos;
      _BitScanReverse(&pos, uint32(n >> 32));
      return 32 + int(pos);
    }
  # endif
    if (BYTES >= 4)
    {
      unsigned long pos;
      _BitScanReverse(&pos, uint32(n));
      return int(pos);
    }
  #endif
  #if defined __GNUC__
    if (BYTES == 4) 
      return 31 - __builtin_clz(n);
    if (BYTES == 8) 
      return 63 - __builtin_clzll(n);
  #endif
    int bit = sizeof(n) * 8 - 1;
    const typename BytesToWordType<BYTES>::type UBIT =
    typename BytesToWordType<BYTES>::type(1) << (sizeof(n) * 8 - 1);
    for (;;)
    {
      if (n & UBIT) return bit;
      n <<= 1;
      --bit;
    }
  }

  template <unsigned BYTES>
  inline uint rightBitWord
    (typename BytesToWordType<BYTES>::type n)
  {
  #if defined _WIN64
    if (BYTES == 8)
    {
      unsigned long pos;
      _BitScanForward64(&pos, n);
      return uint(pos);
    }
  #endif
  #if defined _MSC_VER
  # if ! defined _WIN64
    if ((BYTES == 8) && (0 == (n & ~uint32(0))))
    {
      unsigned long pos;
      _BitScanForward(&pos, uint32(n >> 32));
      return 32 + uint(pos);
    }
  # endif
    if (BYTES >= 4)
    {
      unsigned long pos;
      _BitScanForward(&pos, uint32(n));
      return uint(pos);
    }
  #endif
  #if defined __GNUC__
    if (BYTES == 4)
      return __builtin_ctz(n);
    if (BYTES == 8)
      return __builtin_ctzll(n);
  #endif
    int bit = 0;
    for (;;)
    {
      if (n & 1) 
        return bit;
      n >>= 1;
      ++bit;
    }
  }

  template <unsigned BYTES>
  inline uint testBitWord
    (typename BytesToWordType<BYTES>::type n,
     uint bit)
  {
  #if defined _WIN64
    if (BYTES == 8)
      return _bittest64((int64*)&n, bit);
  #endif
  #if defined _MSC_VER
    if (BYTES == 4)
      return _bittest((long*)&n, bit);
    if (BYTES < 4)
    {
      long nl = n;
      return _bittest(&nl, bit);
    }
  #endif
    return (n & (typename BytesToWordType<BYTES>::type(1) << bit));
  }

  template <unsigned BYTES>
  inline uint testAndSetBitWord
    (typename BytesToWordType<BYTES>::type* n,
     uint bit)
  {
  #if defined _WIN64
    if (BYTES == 8)
      return _bittestandset64((int64*)n, bit);
  #endif
  #if defined _MSC_VER
    if (BYTES == 4)
      return _bittestandset((long*)n, bit);
  #endif
    typedef typename BytesToWordType<BYTES>::type Word;
    if (*n & (Word(1) << bit)) 
      return 1;
    *n |= (Word(1) << bit);
    return 0;
  }

  template <typename T>
  inline size_t leftBit
    (T n)
  {
    return leftBitWord<sizeof(T)>(TO_WORD(T, n));
  }

  template <typename T>
  inline size_t rightBit
    (T n)
  {
    return rightBitWord<sizeof(T)>(TO_WORD(T, n));
  }

  template<typename T>
  inline uint testBit
    (T n, 
     uint bit)
  {
    return testBitWord<sizeof(T)>(TO_WORD(T, n), bit);
  }

  template<typename T>
  inline 
  uint testAndSetBit
    (T* n, 
     uint bit)
  {
    return testAndSetBitWord<sizeof(T)>(static_cast<WORD_T(T)*>(n), bit);
  }

  template <typename T>
  inline 
  size_t bitsNeeded
    (T n)
  {
    return (n == 0) ? 0 : 1 + leftBit(n);
  }

  template <>
  inline 
  size_t bitsNeeded<int>
    (int n)
  {
    bool sign(n < 0);
    return sign + bitsNeeded<Xpace::uint>(Xpace::uint(sign ? -n : n));
  }

  template <>
  inline 
  size_t bitsNeeded<int64>
    (int64 n)
  {
    bool sign(n < 0);
    return bitsNeeded(Xpace::uint64(sign ? -n - 1 : n));
  }

  template <typename T>
  inline size_t bytesNeeded
    (T n)
  {
    return (bitsNeeded(n) + 7) / 8;
  }

  #if defined(_MSC_VER)
  # pragma warning(pop)
  #endif
}

#endif
