/* Do not edit this file. It is produced from the corresponding .m4 source */
/*
 *	Copyright 1996, University Corporation for Atmospheric Research
 *	See netcdf/COPYRIGHT file for copying and redistribution conditions.
 * 	
 * 	This file contains some routines derived from code
 *	which is copyrighted by Sun Microsystems, Inc.
 *	The "#ifdef vax" versions of
 *		 ncx_put_float_float()
 *		 ncx_get_float_float()
 *		 ncx_put_double_double()
 *		 ncx_get_double_double()
 *		 ncx_putn_float_float()
 *		 ncx_getn_float_float()
 *		 ncx_putn_double_double()
 *		 ncx_getn_double_double()
 * 	are derived from xdr_float() and xdr_double() routines
 *	in the freely available, copyrighted Sun RPCSRC 3.9
 *	distribution, xdr_float.c.
 * 	Our "value added" is that these are always memory to memory,
 *	they handle IEEE subnormals properly, and their "n" versions
 *	operate speedily on arrays.
 */
/* $Id: ncx.m4,v 2.58 2010/05/26 18:11:08 dmh Exp $ */

/*
 * An external data representation interface.
 */

#include "ncx.h"
#include "nc3dispatch.h"
#include <string.h>
#include <limits.h>

/* alias poorly named limits.h macros */
#define  SHORT_MAX  SHRT_MAX
#define  SHORT_MIN  SHRT_MIN
#define USHORT_MAX USHRT_MAX
#ifndef LLONG_MAX
#   define LLONG_MAX	9223372036854775807LL
#   define LLONG_MIN	(-LLONG_MAX - 1LL)
#   define ULLONG_MAX	18446744073709551615ULL
#endif
#define LONG_LONG_MAX LLONG_MAX
#define LONG_LONG_MIN LLONG_MIN
#define ULONG_LONG_MAX ULLONG_MAX
#include <float.h>
#ifndef FLT_MAX /* This POSIX macro missing on some systems */
# ifndef NO_IEEE_FLOAT
# define FLT_MAX 3.40282347e+38f
# else
# error "You will need to define FLT_MAX"
# endif
#endif
/* alias poorly named float.h macros */
#define FLOAT_MAX FLT_MAX
#define FLOAT_MIN (-FLT_MAX)
#define DOUBLE_MAX DBL_MAX
#define DOUBLE_MIN (-DBL_MAX)
#define FLOAT_MAX_EXP FLT_MAX_EXP
#define DOUBLE_MAX_EXP DBL_MAX_EXP
#include <assert.h>
#define UCHAR_MIN 0
#define Min(a,b) ((a) < (b) ? (a) : (b))
#define Max(a,b) ((a) > (b) ? (a) : (b))

/*
 * If the machine's float domain is "smaller" than the external one
 * use the machine domain
 */
#if defined(FLT_MAX_EXP) && FLT_MAX_EXP < 128 /* 128 is X_FLT_MAX_EXP */
#undef X_FLOAT_MAX
# define X_FLOAT_MAX FLT_MAX
#undef X_FLOAT_MIN
# define X_FLOAT_MIN (-X_FLOAT_MAX)
#endif

#if _SX /* NEC SUPER UX */
#define LOOPCNT 256    /* must be no longer than hardware vector length */
#if _INT64
#undef  INT_MAX /* workaround cpp bug */
#define INT_MAX  X_INT_MAX
#undef  INT_MIN /* workaround cpp bug */
#define INT_MIN  X_INT_MIN
#undef  LONG_MAX /* workaround cpp bug */
#define LONG_MAX  X_INT_MAX
#undef  LONG_MIN /* workaround cpp bug */
#define LONG_MIN  X_INT_MIN
#elif _LONG64
#undef  LONG_MAX /* workaround cpp bug */
#define LONG_MAX  4294967295L
#undef  LONG_MIN /* workaround cpp bug */
#define LONG_MIN -4294967295L
#endif
#if !_FLOAT0
#error "FLOAT1 and FLOAT2 not supported"
#endif
#endif /* _SX */

static const char nada[X_ALIGN] = {0, 0, 0, 0};

#ifndef WORDS_BIGENDIAN
/* LITTLE_ENDIAN: DEC and intel */
/*
 * Routines to convert to BIGENDIAN.
 * Optimize the swapn?b() and swap?b() routines aggressivly.
 */

#define SWAP2(a) ( (((a) & 0xff) << 8) | \
		(((a) >> 8) & 0xff) )

#define SWAP4(a) ( ((a) << 24) | \
		(((a) <<  8) & 0x00ff0000) | \
		(((a) >>  8) & 0x0000ff00) | \
		(((a) >> 24) & 0x000000ff) )


static void
swapn2b(void *dst, const void *src, size_t nn)
{
	char *op = dst;
	const char *ip = src;

/* unroll the following to reduce loop overhead
 *
 *	while(nn-- != 0)
 *	{
 *		*op++ = *(++ip);
 *		*op++ = *(ip++ -1);
 *	}                                       
 */
	while(nn > 3)
	{
		*op++ = *(++ip);
		*op++ = *(ip++ -1);
		*op++ = *(++ip);
		*op++ = *(ip++ -1);
		*op++ = *(++ip);
		*op++ = *(ip++ -1);
		*op++ = *(++ip);
		*op++ = *(ip++ -1);
		nn -= 4;
	}
	while(nn-- != 0)
	{
		*op++ = *(++ip);
		*op++ = *(ip++ -1);
	}
}

# ifndef vax
static void
swap4b(void *dst, const void *src)
{
	char *op = dst;
	const char *ip = src;
	op[0] = ip[3];
	op[1] = ip[2];
	op[2] = ip[1];
	op[3] = ip[0];
}
# endif /* !vax */

static void
swapn4b(void *dst, const void *src, size_t nn)
{
	char *op = dst;
	const char *ip = src;

/* unroll the following to reduce loop overhead
 *	while(nn-- != 0)
 *	{
 *		op[0] = ip[3];
 *		op[1] = ip[2];
 *		op[2] = ip[1];
 *		op[3] = ip[0];
 *		op += 4;
 *		ip += 4;
 *	}
 */
	while(nn > 3)
	{
		op[0] = ip[3];
		op[1] = ip[2];
		op[2] = ip[1];
		op[3] = ip[0];
		op[4] = ip[7];
		op[5] = ip[6];
		op[6] = ip[5];
		op[7] = ip[4];
		op[8] = ip[11];
		op[9] = ip[10];
		op[10] = ip[9];
		op[11] = ip[8];
		op[12] = ip[15];
		op[13] = ip[14];
		op[14] = ip[13];
		op[15] = ip[12];
		op += 16;
		ip += 16;
		nn -= 4;
	}
	while(nn-- != 0)
	{
		op[0] = ip[3];
		op[1] = ip[2];
		op[2] = ip[1];
		op[3] = ip[0];
		op += 4;
		ip += 4;
	}
}

# ifndef vax
static void
swap8b(void *dst, const void *src)
{
	char *op = dst;
	const char *ip = src;
#  ifndef FLOAT_WORDS_BIGENDIAN
	op[0] = ip[7];
	op[1] = ip[6];
	op[2] = ip[5];
	op[3] = ip[4];
	op[4] = ip[3];
	op[5] = ip[2];
	op[6] = ip[1];
	op[7] = ip[0];
#  else
	op[0] = ip[3];
	op[1] = ip[2];
	op[2] = ip[1];
	op[3] = ip[0];
	op[4] = ip[7];
	op[5] = ip[6];
	op[6] = ip[5];
	op[7] = ip[4];
#  endif
}
# endif /* !vax */

# ifndef vax
static void
swapn8b(void *dst, const void *src, size_t nn)
{
	char *op = dst;
	const char *ip = src;

/* unroll the following to reduce loop overhead
 *	while(nn-- != 0)
 *	{
 *		op[0] = ip[7];
 *		op[1] = ip[6];
 *		op[2] = ip[5];
 *		op[3] = ip[4];
 *		op[4] = ip[3];
 *		op[5] = ip[2];
 *		op[6] = ip[1];
 *		op[7] = ip[0];
 *		op += 8;
 *		ip += 8;
 *	}
 */
#  ifndef FLOAT_WORDS_BIGENDIAN
	while(nn > 1)
	{
		op[0] = ip[7];
		op[1] = ip[6];
		op[2] = ip[5];
		op[3] = ip[4];
		op[4] = ip[3];
		op[5] = ip[2];
		op[6] = ip[1];
		op[7] = ip[0];
		op[8] = ip[15];
		op[9] = ip[14];
		op[10] = ip[13];
		op[11] = ip[12];
		op[12] = ip[11];
		op[13] = ip[10];
		op[14] = ip[9];
		op[15] = ip[8];
		op += 16;
		ip += 16;
		nn -= 2;
	}
	while(nn-- != 0)
	{
		op[0] = ip[7];
		op[1] = ip[6];
		op[2] = ip[5];
		op[3] = ip[4];
		op[4] = ip[3];
		op[5] = ip[2];
		op[6] = ip[1];
		op[7] = ip[0];
		op += 8;
		ip += 8;
	}
#  else
	while(nn-- != 0)
	{
		op[0] = ip[3];
		op[1] = ip[2];
		op[2] = ip[1];
		op[3] = ip[0];
		op[4] = ip[7];
		op[5] = ip[6];
		op[6] = ip[5];
		op[7] = ip[4];
		op += 8;
		ip += 8;
	}
#  endif
}
# endif /* !vax */

#endif /* LITTLE_ENDIAN */


/*
 * Primitive numeric conversion functions.
 */

/* x_schar */

/* We don't implement any x_schar primitives. */


/* x_short */

#if SHORT_MAX == X_SHORT_MAX
typedef short ix_short;
#define SIZEOF_IX_SHORT SIZEOF_SHORT
#define IX_SHORT_MAX SHORT_MAX
#elif INT_MAX >= X_SHORT_MAX
typedef int ix_short;
#define SIZEOF_IX_SHORT SIZEOF_INT
#define IX_SHORT_MAX INT_MAX
#elif LONG_MAX >= X_SHORT_MAX
typedef long ix_short;
#define SIZEOF_IX_SHORT SIZEOF_LONG
#define IX_SHORT_MAX LONG_MAX
#elif LLONG_MAX >= X_SHORT_MAX
typedef long long ix_short;
#define SIZEOF_IX_SHORT SIZEOF_LONG_LONG
#define IX_SHORT_MAX LLONG_MAX
#else
#error "ix_short implementation"
#endif

static void
get_ix_short(const void *xp, ix_short *ip)
{
	const uchar *cp = (const uchar *) xp;
	*ip = *cp++ << 8;
#if SIZEOF_IX_SHORT > X_SIZEOF_SHORT
	if(*ip & 0x8000)
	{
		/* extern is negative */
		*ip |= (~(0xffff)); /* N.B. Assumes "twos complement" */
	}
#endif
	*ip |= *cp; 
}

static void
put_ix_short(void *xp, const ix_short *ip)
{
	uchar *cp = (uchar *) xp;
	*cp++ = (*ip) >> 8;
	*cp = (*ip) & 0xff;
}


int
ncx_get_short_schar(const void *xp, schar *ip)
{
	ix_short xx;
	get_ix_short(xp, &xx);
	*ip = xx;
	if(xx > SCHAR_MAX || xx < SCHAR_MIN)
		return NC_ERANGE;
	return ENOERR;
}

int
ncx_get_short_uchar(const void *xp, uchar *ip)
{
	ix_short xx;
	get_ix_short(xp, &xx);
	*ip = xx;
	if(xx > UCHAR_MAX || xx < 0)
		return NC_ERANGE;
	return ENOERR;
}

int
ncx_get_short_short(const void *xp, short *ip)
{
#if SIZEOF_IX_SHORT == SIZEOF_SHORT && IX_SHORT_MAX == SHORT_MAX
	get_ix_short(xp, (ix_short *)ip);
	return ENOERR;
#else
	ix_short xx;
	get_ix_short(xp, &xx);
	*ip = xx;
#   if IX_SHORT_MAX > SHORT_MAX
	if(xx > SHORT_MAX || xx < SHORT_MIN)
		return NC_ERANGE;
#   endif
	return ENOERR;
#endif
}

int
ncx_get_short_int(const void *xp, int *ip)
{
#if SIZEOF_IX_SHORT == SIZEOF_INT && IX_SHORT_MAX == INT_MAX
	get_ix_short(xp, (ix_short *)ip);
	return ENOERR;
#else
	ix_short xx;
	get_ix_short(xp, &xx);
	*ip = xx;
#   if IX_SHORT_MAX > INT_MAX
	if(xx > INT_MAX || xx < INT_MIN)
		return NC_ERANGE;
#   endif
	return ENOERR;
#endif
}

int
ncx_get_short_uint(const void *xp, unsigned int *ip)
{
#if SIZEOF_IX_SHORT == SIZEOF_INT && IX_SHORT_MAX == INT_MAX
	get_ix_short(xp, (ix_short *)ip);
	return ENOERR;
#else
	ix_short xx;
	get_ix_short(xp, &xx);
	*ip = xx;
#   if IX_SHORT_MAX > INT_MAX
	if(xx > UINT_MAX || xx < 0)
		return NC_ERANGE;
#   endif
	return ENOERR;
#endif
}

int
ncx_get_short_longlong(const void *xp, long long *ip)
{
#if SIZEOF_IX_SHORT == SIZEOF_LONG_LONG && IX_SHORT_MAX == LONG_LONG_MAX
	get_ix_short(xp, (ix_short *)ip);
	return ENOERR;
#else
	/* assert(LONG_LONG_MAX >= X_SHORT_MAX); */
	ix_short xx;
	get_ix_short(xp, &xx);
	*ip = xx;
	return ENOERR;
#endif
}

int
ncx_get_short_ulonglong(const void *xp, unsigned long long *ip)
{
#if SIZEOF_IX_SHORT == SIZEOF_LONG && IX_SHORT_MAX == LONG_MAX
	get_ix_short(xp, (ix_short *)ip);
	return ENOERR;
#else
	/* assert(LONG_LONG_MAX >= X_SHORT_MAX); */
	ix_short xx;
	get_ix_short(xp, &xx);
	*ip = xx;
	if(xx < 0)
		return NC_ERANGE;
	return ENOERR;
#endif
}

int
ncx_get_short_float(const void *xp, float *ip)
{
	ix_short xx;
	get_ix_short(xp, &xx);
	*ip = xx;
#if 0	/* TODO: determine when necessary */
	if(xx > FLT_MAX || xx < (-FLT_MAX))
		return NC_ERANGE;
#endif
	return ENOERR;
}

int
ncx_get_short_double(const void *xp, double *ip)
{
	/* assert(DBL_MAX >= X_SHORT_MAX); */
	ix_short xx;
	get_ix_short(xp, &xx);
	*ip = xx;
	return ENOERR;
}

int
ncx_put_short_schar(void *xp, const schar *ip)
{
	uchar *cp = (uchar *) xp;
	if(*ip & 0x80)
		*cp++ = 0xff;
	else
		*cp++ = 0;
	*cp = (uchar)*ip;
	return ENOERR;
}

int
ncx_put_short_uchar(void *xp, const uchar *ip)
{
	uchar *cp = (uchar *) xp;
	*cp++ = 0;
	*cp = *ip;
	return ENOERR;
}

int
ncx_put_short_short(void *xp, const short *ip)
{
#if SIZEOF_IX_SHORT == SIZEOF_SHORT && X_SHORT_MAX == SHORT_MAX
	put_ix_short(xp, (const ix_short *)ip);
	return ENOERR;
#else
	ix_short xx = (ix_short)*ip;
	put_ix_short(xp, &xx);
# if X_SHORT_MAX < SHORT_MAX
	if(*ip > X_SHORT_MAX || *ip < X_SHORT_MIN)
		return NC_ERANGE;
# endif
	return ENOERR;
#endif
}

int
ncx_put_short_int(void *xp, const int *ip)
{
#if SIZEOF_IX_SHORT == SIZEOF_INT && X_SHORT_MAX == INT_MAX
	put_ix_short(xp, (const ix_short *)ip);
	return ENOERR;
#else
	ix_short xx = (ix_short)*ip;
	put_ix_short(xp, &xx);
# if X_SHORT_MAX < INT_MAX
	if(*ip > X_SHORT_MAX || *ip < X_SHORT_MIN)
		return NC_ERANGE;
# endif
	return ENOERR;
#endif
}

int
ncx_put_short_uint(void *xp, const unsigned int *ip)
{
#if SIZEOF_IX_SHORT == SIZEOF_INT && X_SHORT_MAX == INT_MAX
	put_ix_short(xp, (const ix_short *)ip);
	return ENOERR;
#else
	ix_short xx = (ix_short)*ip;
	put_ix_short(xp, &xx);
# if X_SHORT_MAX < INT_MAX
	if(*ip > X_SHORT_MAX)
		return NC_ERANGE;
# endif
	return ENOERR;
#endif
}

int
ncx_put_short_longlong(void *xp, const long long *ip)
{
#if SIZEOF_IX_SHORT == SIZEOF_LONG_LONG && X_SHORT_MAX == LONG_LONG_MAX
	put_ix_short(xp, (const ix_short *)ip);
	return ENOERR;
#else
	ix_short xx = (ix_short)*ip;
	put_ix_short(xp, &xx);
# if X_SHORT_MAX < LONG_LONG_MAX
	if(*ip > X_SHORT_MAX || *ip < X_SHORT_MIN)
		return NC_ERANGE;
# endif
	return ENOERR;
#endif
}

int
ncx_put_short_ulonglong(void *xp, const unsigned long long *ip)
{
#if SIZEOF_IX_SHORT == SIZEOF_LONG_LONG && X_SHORT_MAX == LONG_LONG_MAX
	put_ix_short(xp, (const ix_short *)ip);
	return ENOERR;
#else
	ix_short xx = (ix_short)*ip;
	put_ix_short(xp, &xx);
# if X_SHORT_MAX < LONG_LONG_MAX
	if(*ip > X_SHORT_MAX)
		return NC_ERANGE;
# endif
	return ENOERR;
#endif
}

int
ncx_put_short_float(void *xp, const float *ip)
{
	ix_short xx = *ip;
	put_ix_short(xp, &xx);
	if(*ip > X_SHORT_MAX || *ip < X_SHORT_MIN)
		return NC_ERANGE;
	return ENOERR;
}

int
ncx_put_short_double(void *xp, const double *ip)
{
	ix_short xx = *ip;
	put_ix_short(xp, &xx);
	if(*ip > X_SHORT_MAX || *ip < X_SHORT_MIN)
		return NC_ERANGE;
	return ENOERR;
}

/* x_int */

#if SHORT_MAX == X_INT_MAX
typedef short ix_int;
#define SIZEOF_IX_INT SIZEOF_SHORT
#define IX_INT_MAX SHORT_MAX
#elif INT_MAX  >= X_INT_MAX
typedef int ix_int;
#define SIZEOF_IX_INT SIZEOF_INT
#define IX_INT_MAX INT_MAX
#elif LONG_MAX  >= X_INT_MAX
typedef long ix_int;
#define SIZEOF_IX_INT SIZEOF_LONG
#define IX_INT_MAX LONG_MAX
#else
#error "ix_int implementation"
#endif


static void
get_ix_int(const void *xp, ix_int *ip)
{
	const uchar *cp = (const uchar *) xp;

	*ip = *cp++ << 24;
#if SIZEOF_IX_INT > X_SIZEOF_INT
	if(*ip & 0x80000000)
	{
		/* extern is negative */
		*ip |= (~(0xffffffff)); /* N.B. Assumes "twos complement" */
	}
#endif
	*ip |= (*cp++ << 16);
	*ip |= (*cp++ << 8);
	*ip |= *cp; 
}

static void
put_ix_int(void *xp, const ix_int *ip)
{
	uchar *cp = (uchar *) xp;

	*cp++ = (*ip) >> 24;
	*cp++ = ((*ip) & 0x00ff0000) >> 16;
	*cp++ = ((*ip) & 0x0000ff00) >>  8;
	*cp   = ((*ip) & 0x000000ff);
}


int
ncx_get_int_schar(const void *xp, schar *ip)
{
	ix_int xx;
	get_ix_int(xp, &xx);
	*ip = xx;
	if(xx > SCHAR_MAX || xx < SCHAR_MIN)
		return NC_ERANGE;
	return ENOERR;
}

int
ncx_get_int_uchar(const void *xp, uchar *ip)
{
	ix_int xx;
	get_ix_int(xp, &xx);
	*ip = xx;
	if(xx > UCHAR_MAX || xx < 0)
		return NC_ERANGE;
	return ENOERR;
}

int
ncx_get_int_short(const void *xp, short *ip)
{
#if SIZEOF_IX_INT == SIZEOF_SHORT && IX_INT_MAX == SHORT_MAX
	get_ix_int(xp, (ix_int *)ip);
	return ENOERR;
#else
	ix_int xx;
	get_ix_int(xp, &xx);
	*ip = xx;
#  if IX_INT_MAX > SHORT_MAX
	if(xx > SHORT_MAX || xx < SHORT_MIN)
		return NC_ERANGE;
#  endif
	return ENOERR;
#endif
}

int
ncx_get_int_int(const void *xp, int *ip)
{
#if SIZEOF_IX_INT == SIZEOF_INT && IX_INT_MAX == INT_MAX
	get_ix_int(xp, (ix_int *)ip);
	return ENOERR;
#else
	ix_int xx;
	get_ix_int(xp, &xx);
	*ip = xx;
#  if IX_INT_MAX > INT_MAX
	if(xx > INT_MAX || xx < INT_MIN)
		return NC_ERANGE;
#  endif
	return ENOERR;
#endif
}

int
ncx_get_int_uint(const void *xp, unsigned int *ip)
{
	ix_int xx;
	get_ix_int(xp, &xx);
	*ip = xx;
	if(xx > UINT_MAX || xx < 0)
		return NC_ERANGE;
	return ENOERR;
}

int
ncx_get_int_longlong(const void *xp, long long *ip)
{
	ix_int xx;
	get_ix_int(xp, &xx);
	*ip = xx;
	return ENOERR;
}

int
ncx_get_int_ulonglong(const void *xp, unsigned long long *ip)
{
	ix_int xx;
	get_ix_int(xp, &xx);
	*ip = xx;
	if(xx < 0)
	      return NC_ERANGE;
	return ENOERR;
}

int
ncx_get_int_float(const void *xp, float *ip)
{
	ix_int xx;
	get_ix_int(xp, &xx);
	*ip = xx;
#if 0	/* TODO: determine when necessary */
	if(xx > FLT_MAX || xx < (-FLT_MAX))
		return NC_ERANGE;
#endif
	return ENOERR;
}

int
ncx_get_int_double(const void *xp, double *ip)
{
	/* assert((DBL_MAX >= X_INT_MAX); */
	ix_int xx;
	get_ix_int(xp, &xx);
	*ip = xx;
	return ENOERR;
}

int
ncx_put_int_schar(void *xp, const schar *ip)
{
	uchar *cp = (uchar *) xp;
	if(*ip & 0x80)
	{
		*cp++ = 0xff;
		*cp++ = 0xff;
		*cp++ = 0xff;
	}
	else
	{
		*cp++ = 0x00;
		*cp++ = 0x00;
		*cp++ = 0x00;
	}
	*cp = (uchar)*ip;
	return ENOERR;
}

int
ncx_put_int_uchar(void *xp, const uchar *ip)
{
	uchar *cp = (uchar *) xp;
	*cp++ = 0x00;
	*cp++ = 0x00;
	*cp++ = 0x00;
	*cp   = *ip;
	return ENOERR;
}

int
ncx_put_int_short(void *xp, const short *ip)
{
#if SIZEOF_IX_INT == SIZEOF_SHORT && IX_INT_MAX == SHORT_MAX
	put_ix_int(xp, (ix_int *)ip);
	return ENOERR;
#else
	ix_int xx = (ix_int)(*ip);
	put_ix_int(xp, &xx);
#   if IX_INT_MAX < SHORT_MAX
	if(*ip > X_INT_MAX || *ip < X_INT_MIN)
		return NC_ERANGE;
#   endif
	return ENOERR;
#endif
}

int
ncx_put_int_int(void *xp, const int *ip)
{
#if SIZEOF_IX_INT == SIZEOF_INT && IX_INT_MAX == INT_MAX
	put_ix_int(xp, (ix_int *)ip);
	return ENOERR;
#else
	ix_int xx = (ix_int)(*ip);
	put_ix_int(xp, &xx);
#   if IX_INT_MAX < INT_MAX
	if(*ip > X_INT_MAX || *ip < X_INT_MIN)
		return NC_ERANGE;
#   endif
	return ENOERR;
#endif
}

int
ncx_put_int_uint(void *xp, const unsigned int *ip)
{
#if SIZEOF_IX_INT == SIZEOF_INT && IX_INT_MAX == INT_MAX
	put_ix_int(xp, (ix_int *)ip);
	return ENOERR;
#else
	ix_int xx = (ix_int)(*ip);
	put_ix_int(xp, &xx);
	if(*ip > X_UINT_MAX)
		return NC_ERANGE;
	return ENOERR;
#endif
}

int
ncx_put_int_longlong(void *xp, const longlong *ip)
{
#if SIZEOF_IX_INT == SIZEOF_LONG && IX_INT_MAX == LONG_MAX
	put_ix_int(xp, (ix_int *)ip);
	return ENOERR;
#else
	ix_int xx = (ix_int)(*ip);
	put_ix_int(xp, &xx);
#   if IX_INT_MAX < LONG_LONG_MAX
	if(*ip > X_INT_MAX || *ip < X_INT_MIN)
		return NC_ERANGE;
#   endif
	return ENOERR;
#endif
}

int
ncx_put_int_ulonglong(void *xp, const unsigned long long *ip)
{
#if SIZEOF_IX_INT == SIZEOF_LONG && IX_INT_MAX == LONG_MAX
	put_ix_int(xp, (ix_int *)ip);
	return ENOERR;
#else
	ix_int xx = (ix_int)(*ip);
	put_ix_int(xp, &xx);
#   if IX_INT_MAX < LONG_MAX
	if(*ip > X_INT_MAX)
		return NC_ERANGE;
#   endif
	return ENOERR;
#endif
}

int
ncx_put_int_float(void *xp, const float *ip)
{
	ix_int xx = (ix_int)(*ip);
	put_ix_int(xp, &xx);
	if(*ip > (double)X_INT_MAX || *ip < (double)X_INT_MIN)
		return NC_ERANGE;
	return ENOERR;
}

int
ncx_put_int_double(void *xp, const double *ip)
{
	ix_int xx = (ix_int)(*ip);
	put_ix_int(xp, &xx);
	if(*ip > X_INT_MAX || *ip < X_INT_MIN)
		return NC_ERANGE;
	return ENOERR;
}
 

/* x_float */

#if X_SIZEOF_FLOAT == SIZEOF_FLOAT && !defined(NO_IEEE_FLOAT)

static void
get_ix_float(const void *xp, float *ip)
{
#ifdef WORDS_BIGENDIAN
	(void) memcpy(ip, xp, sizeof(float));
#else
	swap4b(ip, xp);
#endif
}

static void
put_ix_float(void *xp, const float *ip)
{
#ifdef WORDS_BIGENDIAN
	(void) memcpy(xp, ip, X_SIZEOF_FLOAT);
#else
	swap4b(xp, ip);
#endif
}

#elif vax

/* What IEEE single precision floating point looks like on a Vax */
struct	ieee_single {
	unsigned int	exp_hi       : 7;
	unsigned int	sign         : 1;
	unsigned int 	mant_hi      : 7;
	unsigned int	exp_lo       : 1;
	unsigned int	mant_lo_hi   : 8;
	unsigned int	mant_lo_lo   : 8;
};

/* Vax single precision floating point */
struct	vax_single {
	unsigned int	mantissa1 : 7;
	unsigned int	exp       : 8;
	unsigned int	sign      : 1;
	unsigned int	mantissa2 : 16;
};

#define VAX_SNG_BIAS	0x81
#define IEEE_SNG_BIAS	0x7f

static struct sgl_limits {
	struct vax_single s;
	struct ieee_single ieee;
} max = {
	{ 0x7f, 0xff, 0x0, 0xffff },	/* Max Vax */
	{ 0x7f, 0x0, 0x0, 0x1, 0x0, 0x0 }		/* Max IEEE */
};
static struct sgl_limits min = {
	{ 0x0, 0x0, 0x0, 0x0 },	/* Min Vax */
	{ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }		/* Min IEEE */
};

static void
get_ix_float(const void *xp, float *ip)
{
		struct vax_single *const vsp = (struct vax_single *) ip;
		const struct ieee_single *const isp =
			 (const struct ieee_single *) xp;
		unsigned exp = isp->exp_hi << 1 | isp->exp_lo;

		switch(exp) {
		case 0 :
			/* ieee subnormal */
			if(isp->mant_hi == min.ieee.mant_hi
				&& isp->mant_lo_hi == min.ieee.mant_lo_hi
				&& isp->mant_lo_lo == min.ieee.mant_lo_lo)
			{
				*vsp = min.s;
			}
			else
			{
				unsigned mantissa = (isp->mant_hi << 16)
					 | isp->mant_lo_hi << 8
					 | isp->mant_lo_lo;
				unsigned tmp = mantissa >> 20;
				if(tmp >= 4) {
					vsp->exp = 2;
				} else if (tmp >= 2) {
					vsp->exp = 1;
				} else {
					*vsp = min.s;
					break;
				} /* else */
				tmp = mantissa - (1 << (20 + vsp->exp ));
				tmp <<= 3 - vsp->exp;
				vsp->mantissa2 = tmp;
				vsp->mantissa1 = (tmp >> 16);
			}
			break;
		case 0xfe :
		case 0xff :
			*vsp = max.s;
			break;
		default :
			vsp->exp = exp - IEEE_SNG_BIAS + VAX_SNG_BIAS;
			vsp->mantissa2 = isp->mant_lo_hi << 8 | isp->mant_lo_lo;
			vsp->mantissa1 = isp->mant_hi;
		}

		vsp->sign = isp->sign;

}


static void
put_ix_float(void *xp, const float *ip)
{
		const struct vax_single *const vsp =
			 (const struct vax_single *)ip;
		struct ieee_single *const isp = (struct ieee_single *) xp;

		switch(vsp->exp){
		case 0 :
			/* all vax float with zero exponent map to zero */
			*isp = min.ieee;
			break;
		case 2 :
		case 1 :
		{
			/* These will map to subnormals */
			unsigned mantissa = (vsp->mantissa1 << 16)
					 | vsp->mantissa2;
			mantissa >>= 3 - vsp->exp;
			mantissa += (1 << (20 + vsp->exp));
			isp->mant_lo_lo = mantissa;
			isp->mant_lo_hi = mantissa >> 8;
			isp->mant_hi = mantissa >> 16;
			isp->exp_lo = 0;
			isp->exp_hi = 0;
		}
			break;
		case 0xff : /* max.s.exp */
			if( vsp->mantissa2 == max.s.mantissa2
				&& vsp->mantissa1 == max.s.mantissa1)
			{
				/* map largest vax float to ieee infinity */
				*isp = max.ieee;
				break;
			} /* else, fall thru */
		default :
		{
			unsigned exp = vsp->exp - VAX_SNG_BIAS + IEEE_SNG_BIAS;
			isp->exp_hi = exp >> 1;
			isp->exp_lo = exp;
			isp->mant_lo_lo = vsp->mantissa2;
			isp->mant_lo_hi = vsp->mantissa2 >> 8;
			isp->mant_hi = vsp->mantissa1;
		}
		}

		isp->sign = vsp->sign;

}

	/* vax */
#elif defined(_CRAY) && !defined(__crayx1)

/*
 * Return the number of bytes until the next "word" boundary
 * N.B. This is based on the very wierd YMP address structure,
 * which puts the address within a word in the leftmost 3 bits
 * of the address.
 */
static size_t
word_align(const void *vp)
{
	const size_t rem = ((size_t)vp >> (64 - 3)) & 0x7;
	return (rem != 0);
}

struct ieee_single_hi {
	unsigned int	sign	: 1;
	unsigned int	 exp	: 8;
	unsigned int	mant	:23;
	unsigned int	pad	:32;
};
typedef struct ieee_single_hi ieee_single_hi;

struct ieee_single_lo {
	unsigned int	pad	:32;
	unsigned int	sign	: 1;
	unsigned int	 exp	: 8;
	unsigned int	mant	:23;
};
typedef struct ieee_single_lo ieee_single_lo;

static const int ieee_single_bias = 0x7f;

struct ieee_double {
	unsigned int	sign	: 1;
	unsigned int	 exp	:11;
	unsigned int	mant	:52;
};
typedef struct ieee_double ieee_double;

static const int ieee_double_bias = 0x3ff;

#if defined(NO_IEEE_FLOAT)

struct cray_single {
	unsigned int	sign	: 1;
	unsigned int	 exp	:15;
	unsigned int	mant	:48;
};
typedef struct cray_single cray_single;

static const int cs_ieis_bias = 0x4000 - 0x7f;

static const int cs_id_bias = 0x4000 - 0x3ff;


static void
get_ix_float(const void *xp, float *ip)
{

	if(word_align(xp) == 0)
	{
		const ieee_single_hi *isp = (const ieee_single_hi *) xp;
		cray_single *csp = (cray_single *) ip;

		if(isp->exp == 0)
		{
			/* ieee subnormal */
			*ip = (double)isp->mant;
			if(isp->mant != 0)
			{
				csp->exp -= (ieee_single_bias + 22);
			}
		}
		else
		{
			csp->exp  = isp->exp + cs_ieis_bias + 1;
			csp->mant = isp->mant << (48 - 1 - 23);
			csp->mant |= (1 << (48 - 1));
		}
		csp->sign = isp->sign;


	}
	else
	{
		const ieee_single_lo *isp = (const ieee_single_lo *) xp;
		cray_single *csp = (cray_single *) ip;

		if(isp->exp == 0)
		{
			/* ieee subnormal */
			*ip = (double)isp->mant;
			if(isp->mant != 0)
			{
				csp->exp -= (ieee_single_bias + 22);
			}
		}
		else
		{
			csp->exp  = isp->exp + cs_ieis_bias + 1;
			csp->mant = isp->mant << (48 - 1 - 23);
			csp->mant |= (1 << (48 - 1));
		}
		csp->sign = isp->sign;


	}
}

static void
put_ix_float(void *xp, const float *ip)
{
	if(word_align(xp) == 0)
	{
		ieee_single_hi *isp = (ieee_single_hi*)xp;
	const cray_single *csp = (const cray_single *) ip;
	int ieee_exp = csp->exp - cs_ieis_bias -1;

	isp->sign = csp->sign;

	if(ieee_exp >= 0xff)
	{
		/* NC_ERANGE => ieee Inf */
		isp->exp = 0xff;
		isp->mant = 0x0;
	}
	else if(ieee_exp > 0)
	{
		/* normal ieee representation */
		isp->exp  = ieee_exp;
		/* assumes cray rep is in normal form */
		assert(csp->mant & 0x800000000000);
		isp->mant = (((csp->mant << 1) &
				0xffffffffffff) >> (48 - 23));
	}
	else if(ieee_exp > -23)
	{
		/* ieee subnormal, right shift */
		const int rshift = (48 - 23 - ieee_exp);

		isp->mant = csp->mant >> rshift;

#if 0
		if(csp->mant & (1 << (rshift -1)))
		{
			/* round up */
			isp->mant++;
		}
#endif

		isp->exp  = 0;
	}
	else
	{
		/* smaller than ieee can represent */
		isp->exp = 0;
		isp->mant = 0;
	}

	}
	else
	{
		ieee_single_lo *isp = (ieee_single_lo*)xp;
	const cray_single *csp = (const cray_single *) ip;
	int ieee_exp = csp->exp - cs_ieis_bias -1;

	isp->sign = csp->sign;

	if(ieee_exp >= 0xff)
	{
		/* NC_ERANGE => ieee Inf */
		isp->exp = 0xff;
		isp->mant = 0x0;
	}
	else if(ieee_exp > 0)
	{
		/* normal ieee representation */
		isp->exp  = ieee_exp;
		/* assumes cray rep is in normal form */
		assert(csp->mant & 0x800000000000);
		isp->mant = (((csp->mant << 1) &
				0xffffffffffff) >> (48 - 23));
	}
	else if(ieee_exp > -23)
	{
		/* ieee subnormal, right shift */
		const int rshift = (48 - 23 - ieee_exp);

		isp->mant = csp->mant >> rshift;

#if 0
		if(csp->mant & (1 << (rshift -1)))
		{
			/* round up */
			isp->mant++;
		}
#endif

		isp->exp  = 0;
	}
	else
	{
		/* smaller than ieee can represent */
		isp->exp = 0;
		isp->mant = 0;
	}

	}
}

#else
	/* IEEE Cray with only doubles */
static void
get_ix_float(const void *xp, float *ip)
{

	ieee_double *idp = (ieee_double *) ip;

	if(word_align(xp) == 0)
	{
		const ieee_single_hi *isp = (const ieee_single_hi *) xp;
		if(isp->exp == 0 && isp->mant == 0)
		{
			idp->exp = 0;
			idp->mant = 0;
		}
		else
		{
			idp->exp = isp->exp + (ieee_double_bias - ieee_single_bias);
			idp->mant = isp->mant << (52 - 23);
		}
		idp->sign = isp->sign;
	}
	else
	{
		const ieee_single_lo *isp = (const ieee_single_lo *) xp;
		if(isp->exp == 0 && isp->mant == 0)
		{
			idp->exp = 0;
			idp->mant = 0;
		}
		else
		{
			idp->exp = isp->exp + (ieee_double_bias - ieee_single_bias);
			idp->mant = isp->mant << (52 - 23);
		}
		idp->sign = isp->sign;
	}
}

static void
put_ix_float(void *xp, const float *ip)
{
	const ieee_double *idp = (const ieee_double *) ip;
	if(word_align(xp) == 0)
	{
		ieee_single_hi *isp = (ieee_single_hi*)xp;
		if(idp->exp > (ieee_double_bias - ieee_single_bias))
			isp->exp = idp->exp - (ieee_double_bias - ieee_single_bias);
		else
			isp->exp = 0;
		isp->mant = idp->mant >> (52 - 23);
		isp->sign = idp->sign;
	}
	else
	{
		ieee_single_lo *isp = (ieee_single_lo*)xp;
		if(idp->exp > (ieee_double_bias - ieee_single_bias))
			isp->exp = idp->exp - (ieee_double_bias - ieee_single_bias);
		else
			isp->exp = 0;
		isp->mant = idp->mant >> (52 - 23);
		isp->sign = idp->sign;
	}
}
#endif

#else
#error "ix_float implementation"
#endif


int
ncx_get_float_schar(const void *xp, schar *ip)
{
	float xx;
	get_ix_float(xp, &xx);
	*ip = (schar) xx;
	if(xx > SCHAR_MAX || xx < SCHAR_MIN)
		return NC_ERANGE;
	return ENOERR;
}

int
ncx_get_float_uchar(const void *xp, uchar *ip)
{
	float xx;
	get_ix_float(xp, &xx);
	*ip = (uchar) xx;
	if(xx > UCHAR_MAX || xx < 0)
		return NC_ERANGE;
	return ENOERR;
}

int
ncx_get_float_short(const void *xp, short *ip)
{
	float xx;
	get_ix_float(xp, &xx);
	*ip = (short) xx;
	if(xx > SHORT_MAX || xx < SHORT_MIN)
		return NC_ERANGE;
	return ENOERR;
}

int
ncx_get_float_int(const void *xp, int *ip)
{
	float xx;
	get_ix_float(xp, &xx);
	*ip = (int) xx;
	if(xx > (double)INT_MAX || xx < (double)INT_MIN)
		return NC_ERANGE;
	return ENOERR;
}

int
ncx_get_float_uint(const void *xp, unsigned int *ip)
{
	float xx;
	get_ix_float(xp, &xx);
	*ip = (unsigned int) xx;
	if(xx > (double)UINT_MAX || xx < 0)
		return NC_ERANGE;
	return ENOERR;
}

int
ncx_get_float_longlong(const void *xp, longlong *ip)
{
	float xx;
	get_ix_float(xp, &xx);
	*ip = (longlong) xx;
	if(xx > (double)LONG_LONG_MAX || xx < (double)LONG_LONG_MIN)
		return NC_ERANGE;
	return ENOERR;
}

int
ncx_get_float_ulonglong(const void *xp, unsigned long long *ip)
{
	float xx;
	get_ix_float(xp, &xx);
	*ip = (longlong) xx;
	if(xx > (double)ULONG_LONG_MAX || xx < 0)
		return NC_ERANGE;
	return ENOERR;
}

int
ncx_get_float_float(const void *xp, float *ip)
{
	/* TODO */
	get_ix_float(xp, ip);
	return ENOERR;
}

int
ncx_get_float_double(const void *xp, double *ip)
{
	/* TODO */
	float xx;
	get_ix_float(xp, &xx);
	*ip = xx;
	return ENOERR;
}


int
ncx_put_float_schar(void *xp, const schar *ip)
{
	float xx = (float) *ip;
	put_ix_float(xp, &xx);
	return ENOERR;
}

int
ncx_put_float_uchar(void *xp, const uchar *ip)
{
	float xx = (float) *ip;
	put_ix_float(xp, &xx);
	return ENOERR;
}

int
ncx_put_float_short(void *xp, const short *ip)
{
	float xx = (float) *ip;
	put_ix_float(xp, &xx);
#if 0	/* TODO: figure this out */
	if((float)(*ip) > X_FLOAT_MAX || (float)(*ip) < X_FLOAT_MIN)
		return NC_ERANGE;
#endif
	return ENOERR;
}

int
ncx_put_float_int(void *xp, const int *ip)
{
	float xx = (float) *ip;
	put_ix_float(xp, &xx);
#if 1	/* TODO: figure this out */
	if((float)(*ip) > X_FLOAT_MAX || (float)(*ip) < X_FLOAT_MIN)
		return NC_ERANGE;
#endif
	return ENOERR;
}

int
ncx_put_float_uint(void *xp, const unsigned int *ip)
{
	float xx = (float) *ip;
	put_ix_float(xp, &xx);
#if 1	/* TODO: figure this out */
	if((float)(*ip) > X_FLOAT_MAX)
		return NC_ERANGE;
#endif
	return ENOERR;
}

int
ncx_put_float_longlong(void *xp, const longlong *ip)
{
	float xx = (float) *ip;
	put_ix_float(xp, &xx);
#if 1	/* TODO: figure this out */
	if((float)(*ip) > X_FLOAT_MAX || (float)(*ip) < X_FLOAT_MIN)
		return NC_ERANGE;
#endif
	return ENOERR;
}

int
ncx_put_float_ulonglong(void *xp, const unsigned long long *ip)
{
	float xx = (float) *ip;
	put_ix_float(xp, &xx);
#if 1	/* TODO: figure this out */
	if((float)(*ip) > X_FLOAT_MAX)
		return NC_ERANGE;
#endif
	return ENOERR;
}

int
ncx_put_float_float(void *xp, const float *ip)
{
	put_ix_float(xp, ip);
#ifdef NO_IEEE_FLOAT
	if(*ip > X_FLOAT_MAX || *ip < X_FLOAT_MIN)
		return NC_ERANGE;
#endif
	return ENOERR;
}

int
ncx_put_float_double(void *xp, const double *ip)
{
	float xx = (float) *ip;
	put_ix_float(xp, &xx);
	if(*ip > X_FLOAT_MAX || *ip < X_FLOAT_MIN)
		return NC_ERANGE;
	return ENOERR;
}

/* x_double */

#if X_SIZEOF_DOUBLE == SIZEOF_DOUBLE  && !defined(NO_IEEE_FLOAT)

static void
get_ix_double(const void *xp, double *ip)
{
#ifdef WORDS_BIGENDIAN
	(void) memcpy(ip, xp, sizeof(double));
#else
	swap8b(ip, xp);
#endif
}

static void
put_ix_double(void *xp, const double *ip)
{
#ifdef WORDS_BIGENDIAN
	(void) memcpy(xp, ip, X_SIZEOF_DOUBLE);
#else
	swap8b(xp, ip);
#endif
}

#elif vax

/* What IEEE double precision floating point looks like on a Vax */
struct	ieee_double {
	unsigned int	exp_hi   : 7;
	unsigned int	sign     : 1;
	unsigned int 	mant_6   : 4;
	unsigned int	exp_lo   : 4;
	unsigned int	mant_5   : 8;
	unsigned int	mant_4   : 8;

	unsigned int	mant_lo  : 32;
};

/* Vax double precision floating point */
struct  vax_double {
	unsigned int	mantissa1 : 7;
	unsigned int	exp       : 8;
	unsigned int	sign      : 1;
	unsigned int	mantissa2 : 16;
	unsigned int	mantissa3 : 16;
	unsigned int	mantissa4 : 16;
};

#define VAX_DBL_BIAS	0x81
#define IEEE_DBL_BIAS	0x3ff
#define MASK(nbits)	((1 << nbits) - 1)

static const struct dbl_limits {
	struct	vax_double d;
	struct	ieee_double ieee;
} dbl_limits[2] = {
	{{ 0x7f, 0xff, 0x0, 0xffff, 0xffff, 0xffff },	/* Max Vax */
	{ 0x7f, 0x0, 0x0, 0xf, 0x0, 0x0, 0x0}}, /* Max IEEE */
	{{ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},		/* Min Vax */
	{ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}}, /* Min IEEE */
};


static void
get_ix_double(const void *xp, double *ip)
{
	struct vax_double *const vdp =
			 (struct vax_double *)ip;
	const struct ieee_double *const idp =
			 (const struct ieee_double *) xp;
	{
		const struct dbl_limits *lim;
		int ii;
		for (ii = 0, lim = dbl_limits;
			ii < sizeof(dbl_limits)/sizeof(struct dbl_limits);
			ii++, lim++)
		{
			if ((idp->mant_lo == lim->ieee.mant_lo)
				&& (idp->mant_4 == lim->ieee.mant_4)
				&& (idp->mant_5 == lim->ieee.mant_5)
				&& (idp->mant_6 == lim->ieee.mant_6)
				&& (idp->exp_lo == lim->ieee.exp_lo)
				&& (idp->exp_hi == lim->ieee.exp_hi)
				)
			{
				*vdp = lim->d;
				goto doneit;
			}
		}
	}
	{
		unsigned exp = idp->exp_hi << 4 | idp->exp_lo;
		vdp->exp = exp - IEEE_DBL_BIAS + VAX_DBL_BIAS;
	}
	{
		unsigned mant_hi = ((idp->mant_6 << 16)
				 | (idp->mant_5 << 8)
				 | idp->mant_4);
		unsigned mant_lo = SWAP4(idp->mant_lo);
		vdp->mantissa1 = (mant_hi >> 13);
		vdp->mantissa2 = ((mant_hi & MASK(13)) << 3)
				| (mant_lo >> 29);
		vdp->mantissa3 = (mant_lo >> 13);
		vdp->mantissa4 = (mant_lo << 3);
	}
	doneit:
		vdp->sign = idp->sign;

}


static void
put_ix_double(void *xp, const double *ip)
{
	const struct vax_double *const vdp = 
			(const struct vax_double *)ip;
	struct ieee_double *const idp =
			 (struct ieee_double *) xp;

	if ((vdp->mantissa4 > (dbl_limits[0].d.mantissa4 - 3)) &&
		(vdp->mantissa3 == dbl_limits[0].d.mantissa3) &&
		(vdp->mantissa2 == dbl_limits[0].d.mantissa2) &&
		(vdp->mantissa1 == dbl_limits[0].d.mantissa1) &&
		(vdp->exp == dbl_limits[0].d.exp))
	{
		*idp = dbl_limits[0].ieee;
		goto shipit;
	}
	if ((vdp->mantissa4 == dbl_limits[1].d.mantissa4) &&
		(vdp->mantissa3 == dbl_limits[1].d.mantissa3) &&
		(vdp->mantissa2 == dbl_limits[1].d.mantissa2) &&
		(vdp->mantissa1 == dbl_limits[1].d.mantissa1) &&
		(vdp->exp == dbl_limits[1].d.exp))
	{
		*idp = dbl_limits[1].ieee;
		goto shipit;
	}

	{
		unsigned exp = vdp->exp - VAX_DBL_BIAS + IEEE_DBL_BIAS;

		unsigned mant_lo = ((vdp->mantissa2 & MASK(3)) << 29) |
			(vdp->mantissa3 << 13) |
			((vdp->mantissa4 >> 3) & MASK(13));

		unsigned mant_hi = (vdp->mantissa1 << 13)
				 | (vdp->mantissa2 >> 3);

		if((vdp->mantissa4 & 7) > 4)
		{
			/* round up */
			mant_lo++;
			if(mant_lo == 0)
			{
				mant_hi++;
				if(mant_hi > 0xffffff)
				{
					mant_hi = 0;
					exp++;
				}
			}
		}

		idp->mant_lo = SWAP4(mant_lo);
		idp->mant_6 = mant_hi >> 16;
		idp->mant_5 = (mant_hi & 0xff00) >> 8;
		idp->mant_4 = mant_hi;
		idp->exp_hi = exp >> 4;
		idp->exp_lo = exp;
	}
		
	shipit:
		idp->sign = vdp->sign;

}

	/* vax */
#elif defined(_CRAY) && !defined(__crayx1)

static void
get_ix_double(const void *xp, double *ip)
{
	const ieee_double *idp = (const ieee_double *) xp;
	cray_single *csp = (cray_single *) ip;

	if(idp->exp == 0)
	{
		/* ieee subnormal */
		*ip = (double)idp->mant;
		if(idp->mant != 0)
		{
			csp->exp -= (ieee_double_bias + 51);
		}
	}
	else
	{
		csp->exp  = idp->exp + cs_id_bias + 1;
		csp->mant = idp->mant >> (52 - 48 + 1);
		csp->mant |= (1 << (48 - 1));
	}
	csp->sign = idp->sign;
}

static void
put_ix_double(void *xp, const double *ip)
{
	ieee_double *idp = (ieee_double *) xp;
	const cray_single *csp = (const cray_single *) ip;

	int ieee_exp = csp->exp - cs_id_bias -1;

	idp->sign = csp->sign;

	if(ieee_exp >= 0x7ff)
	{
		/* NC_ERANGE => ieee Inf */
		idp->exp = 0x7ff;
		idp->mant = 0x0;
	}
	else if(ieee_exp > 0)
	{
		/* normal ieee representation */
		idp->exp  = ieee_exp;
		/* assumes cray rep is in normal form */
		assert(csp->mant & 0x800000000000);
		idp->mant = (((csp->mant << 1) &
				0xffffffffffff) << (52 - 48));
	}
	else if(ieee_exp >= (-(52 -48)))
	{
		/* ieee subnormal, left shift */
		const int lshift = (52 - 48) + ieee_exp;
		idp->mant = csp->mant << lshift;
		idp->exp  = 0;
	}
	else if(ieee_exp >= -52)
	{
		/* ieee subnormal, right shift */
		const int rshift = (- (52 - 48) - ieee_exp);

		idp->mant = csp->mant >> rshift;

#if 0
		if(csp->mant & (1 << (rshift -1)))
		{
			/* round up */
			idp->mant++;
		}
#endif

		idp->exp  = 0;
	}
	else
	{
		/* smaller than ieee can represent */
		idp->exp = 0;
		idp->mant = 0;
	}
}
#else
#error "ix_double implementation"
#endif

int
ncx_get_double_schar(const void *xp, schar *ip)
{
	double xx;
	get_ix_double(xp, &xx);
	*ip = (schar) xx;
	if(xx > SCHAR_MAX || xx < SCHAR_MIN)
		return NC_ERANGE;
	return ENOERR;
}

int
ncx_get_double_uchar(const void *xp, uchar *ip)
{
	double xx;
	get_ix_double(xp, &xx);
	*ip = (uchar) xx;
	if(xx > UCHAR_MAX || xx < 0)
		return NC_ERANGE;
	return ENOERR;
}

int
ncx_get_double_short(const void *xp, short *ip)
{
	double xx;
	get_ix_double(xp, &xx);
	*ip = (short) xx;
	if(xx > SHORT_MAX || xx < SHORT_MIN)
		return NC_ERANGE;
	return ENOERR;
}

int
ncx_get_double_int(const void *xp, int *ip)
{
	double xx;
	get_ix_double(xp, &xx);
	*ip = (int) xx;
	if(xx > INT_MAX || xx < INT_MIN)
		return NC_ERANGE;
	return ENOERR;
}

int
ncx_get_double_uint(const void *xp, unsigned int *ip)
{
	double xx;
	get_ix_double(xp, &xx);
	*ip = (unsigned int) xx;
	if(xx > UINT_MAX || xx < 0)
		return NC_ERANGE;
	return ENOERR;
}

int
ncx_get_double_longlong(const void *xp, longlong *ip)
{
	double xx;
	get_ix_double(xp, &xx);
	*ip = (longlong) xx;
	if(xx > LONG_LONG_MAX || xx < LONG_LONG_MIN)
		return NC_ERANGE;
	return ENOERR;
}

int
ncx_get_double_ulonglong(const void *xp, unsigned long long *ip)
{
	double xx;
	get_ix_double(xp, &xx);
	*ip = (unsigned longlong) xx;
	if(xx > ULONG_LONG_MAX || xx < 0)
		return NC_ERANGE;
	return ENOERR;
}

int
ncx_get_double_float(const void *xp, float *ip)
{
	double xx;
	get_ix_double(xp, &xx);
	if(xx > FLT_MAX)
	{
		*ip = FLT_MAX;
		return NC_ERANGE;
	}
	if(xx < (-FLT_MAX))
	{
		*ip = (-FLT_MAX);
		return NC_ERANGE;
	}
	*ip = (float) xx;
	return ENOERR;
}

int
ncx_get_double_double(const void *xp, double *ip)
{
	/* TODO */
	get_ix_double(xp, ip);
	return ENOERR;
}


int
ncx_put_double_schar(void *xp, const schar *ip)
{
	double xx = (double) *ip;
	put_ix_double(xp, &xx);
	return ENOERR;
}

int
ncx_put_double_uchar(void *xp, const uchar *ip)
{
	double xx = (double) *ip;
	put_ix_double(xp, &xx);
	return ENOERR;
}

int
ncx_put_double_short(void *xp, const short *ip)
{
	double xx = (double) *ip;
	put_ix_double(xp, &xx);
#if 0	/* TODO: figure this out */
	if((double)(*ip) > X_DOUBLE_MAX || (double)(*ip) < X_DOUBLE_MIN)
		return NC_ERANGE;
#endif
	return ENOERR;
}

int
ncx_put_double_int(void *xp, const int *ip)
{
	double xx = (double) *ip;
	put_ix_double(xp, &xx);
#if 0	/* TODO: figure this out */
	if((double)(*ip) > X_DOUBLE_MAX || (double)(*ip) < X_DOUBLE_MIN)
		return NC_ERANGE;
#endif
	return ENOERR;
}

int
ncx_put_double_uint(void *xp, const unsigned int *ip)
{
	double xx = (double) *ip;
	put_ix_double(xp, &xx);
#if 0	/* TODO: figure this out */
	if((double)(*ip) > X_DOUBLE_MAX)
		return NC_ERANGE;
#endif
	return ENOERR;
}

int
ncx_put_double_longlong(void *xp, const longlong *ip)
{
	double xx = (double) *ip;
	put_ix_double(xp, &xx);
#if 1	/* TODO: figure this out */
	if((double)(*ip) > X_DOUBLE_MAX || (double)(*ip) < X_DOUBLE_MIN)
		return NC_ERANGE;
#endif
	return ENOERR;
}

int
ncx_put_double_ulonglong(void *xp, const unsigned long long *ip)
{
	double xx = (double) *ip;
	put_ix_double(xp, &xx);
#if 1	/* TODO: figure this out */
	if((double)(*ip) > X_DOUBLE_MAX)
		return NC_ERANGE;
#endif
	return ENOERR;
}

int
ncx_put_double_float(void *xp, const float *ip)
{
	double xx = (double) *ip;
	put_ix_double(xp, &xx);
#if 1	/* TODO: figure this out */
	if((double)(*ip) > X_DOUBLE_MAX || (double)(*ip) < X_DOUBLE_MIN)
		return NC_ERANGE;
#endif
	return ENOERR;
}

int
ncx_put_double_double(void *xp, const double *ip)
{
	put_ix_double(xp, ip);
#ifdef NO_IEEE_FLOAT
	if(*ip > X_DOUBLE_MAX || *ip < X_DOUBLE_MIN)
		return NC_ERANGE;
#endif
	return ENOERR;
}


/* x_size_t */

#if SIZEOF_SIZE_T < X_SIZEOF_SIZE_T
#error "x_size_t implementation"
/* netcdf requires size_t which can hold a values from 0 to 2^32 -1 */
#endif

int
ncx_put_size_t(void **xpp, const size_t *ulp)
{
	/* similar to put_ix_int() */
	uchar *cp = (uchar *) *xpp;
	assert(*ulp <= X_SIZE_MAX);

	*cp++ = (uchar)((*ulp) >> 24);
	*cp++ = (uchar)(((*ulp) & 0x00ff0000) >> 16);
	*cp++ = (uchar)(((*ulp) & 0x0000ff00) >>  8);
	*cp   = (uchar)((*ulp) & 0x000000ff);

	*xpp = (void *)((char *)(*xpp) + X_SIZEOF_SIZE_T);
	return ENOERR;
}

int
ncx_get_size_t(const void **xpp,  size_t *ulp)
{
	/* similar to get_ix_int */
	const uchar *cp = (const uchar *) *xpp;

	*ulp = (unsigned)(*cp++ << 24);
	*ulp |= (*cp++ << 16);
	*ulp |= (*cp++ << 8);
	*ulp |= *cp; 

	*xpp = (const void *)((const char *)(*xpp) + X_SIZEOF_SIZE_T);
	return ENOERR;
}

/* x_off_t */

int
ncx_put_off_t(void **xpp, const off_t *lp, size_t sizeof_off_t)
{
	/* similar to put_ix_int() */
	uchar *cp = (uchar *) *xpp;
		/* No negative offsets stored in netcdf */
	if (*lp < 0) {
	  /* Assume this is an overflow of a 32-bit int... */
	  return ERANGE;
	}
	  
	assert(sizeof_off_t == 4 || sizeof_off_t == 8);

	if (sizeof_off_t == 4) {
		*cp++ = (uchar) ((*lp)               >> 24);
		*cp++ = (uchar)(((*lp) & 0x00ff0000) >> 16);
		*cp++ = (uchar)(((*lp) & 0x0000ff00) >>  8);
		*cp   = (uchar)( (*lp) & 0x000000ff);
	} else {
#if SIZEOF_OFF_T == 4
/* Write a 64-bit offset on a system with only a 32-bit offset */
		*cp++ = (uchar)0;
		*cp++ = (uchar)0;
		*cp++ = (uchar)0;
		*cp++ = (uchar)0;

		*cp++ = (uchar)(((*lp) & 0xff000000) >> 24);
		*cp++ = (uchar)(((*lp) & 0x00ff0000) >> 16);
		*cp++ = (uchar)(((*lp) & 0x0000ff00) >>  8);
		*cp   = (uchar)( (*lp) & 0x000000ff);
#else
		*cp++ = (uchar) ((*lp)                          >> 56);
		*cp++ = (uchar)(((*lp) & 0x00ff000000000000ULL) >> 48);
		*cp++ = (uchar)(((*lp) & 0x0000ff0000000000ULL) >> 40);
		*cp++ = (uchar)(((*lp) & 0x000000ff00000000ULL) >> 32);
		*cp++ = (uchar)(((*lp) & 0x00000000ff000000ULL) >> 24);
		*cp++ = (uchar)(((*lp) & 0x0000000000ff0000ULL) >> 16);
		*cp++ = (uchar)(((*lp) & 0x000000000000ff00ULL) >>  8);
		*cp   = (uchar)( (*lp) & 0x00000000000000ffULL);
#endif
	}
	*xpp = (void *)((char *)(*xpp) + sizeof_off_t);
	return ENOERR;
}

int
ncx_get_off_t(const void **xpp, off_t *lp, size_t sizeof_off_t)
{
	/* similar to get_ix_int() */
	const uchar *cp = (const uchar *) *xpp;
	assert(sizeof_off_t == 4 || sizeof_off_t == 8);

 	if (sizeof_off_t == 4) {
		*lp = *cp++ << 24;
		*lp |= (*cp++ << 16);
		*lp |= (*cp++ <<  8);
		*lp |= *cp; 
	} else {
#if SIZEOF_OFF_T == 4
/* Read a 64-bit offset on a system with only a 32-bit offset */
/* If the offset overflows, set an error code and return */
		*lp =  ((off_t)(*cp++) << 24);
		*lp |= ((off_t)(*cp++) << 16);
		*lp |= ((off_t)(*cp++) <<  8);
		*lp |= ((off_t)(*cp++));
/*
 * lp now contains the upper 32-bits of the 64-bit offset.  if lp is
 * not zero, then the dataset is larger than can be represented
 * on this system.  Set an error code and return.
 */
		if (*lp != 0) {
		  return ERANGE;
		}

		*lp  = ((off_t)(*cp++) << 24);
		*lp |= ((off_t)(*cp++) << 16);
		*lp |= ((off_t)(*cp++) <<  8);
		*lp |=  (off_t)*cp;

		if (*lp < 0) {
		  /*
		   * If this fails, then the offset is >2^31, but less
		   * than 2^32 which is not allowed, but is not caught
		   * by the previous check
		   */
		  return ERANGE;
		}
#else
		*lp =  ((off_t)(*cp++) << 56);
		*lp |= ((off_t)(*cp++) << 48);
		*lp |= ((off_t)(*cp++) << 40);
		*lp |= ((off_t)(*cp++) << 32);
		*lp |= ((off_t)(*cp++) << 24);
		*lp |= ((off_t)(*cp++) << 16);
		*lp |= ((off_t)(*cp++) <<  8);
		*lp |=  (off_t)*cp;
#endif
	}
	*xpp = (const void *)((const char *)(*xpp) + sizeof_off_t);
	return ENOERR;
}


/*
 * Aggregate numeric conversion functions.
 */


/* schar */

int
ncx_getn_schar_schar(const void **xpp, size_t nelems, schar *tp)
{
		(void) memcpy(tp, *xpp, nelems);
	*xpp = (void *)((char *)(*xpp) + nelems);
	return ENOERR;

}
int
ncx_getn_schar_uchar(const void **xpp, size_t nelems, uchar *tp)
{
		(void) memcpy(tp, *xpp, nelems);
	*xpp = (void *)((char *)(*xpp) + nelems);
	return ENOERR;

}
int
ncx_getn_schar_short(const void **xpp, size_t nelems, short *tp)
{
	schar *xp = (schar *)(*xpp);

	while(nelems-- != 0)
	{
		*tp++ = *xp++;
	}

	*xpp = (const void *)xp;
	return ENOERR;
}

int
ncx_getn_schar_int(const void **xpp, size_t nelems, int *tp)
{
	schar *xp = (schar *)(*xpp);

	while(nelems-- != 0)
	{
		*tp++ = *xp++;
	}

	*xpp = (const void *)xp;
	return ENOERR;
}

int
ncx_getn_schar_float(const void **xpp, size_t nelems, float *tp)
{
	schar *xp = (schar *)(*xpp);

	while(nelems-- != 0)
	{
		*tp++ = *xp++;
	}

	*xpp = (const void *)xp;
	return ENOERR;
}

int
ncx_getn_schar_double(const void **xpp, size_t nelems, double *tp)
{
	schar *xp = (schar *)(*xpp);

	while(nelems-- != 0)
	{
		*tp++ = *xp++;
	}

	*xpp = (const void *)xp;
	return ENOERR;
}

int
ncx_getn_schar_uint(const void **xpp, size_t nelems, uint *tp)
{
	schar *xp = (schar *)(*xpp);

	while(nelems-- != 0)
	{
		*tp++ = *xp++;
	}

	*xpp = (const void *)xp;
	return ENOERR;
}

int
ncx_getn_schar_longlong(const void **xpp, size_t nelems, longlong *tp)
{
	schar *xp = (schar *)(*xpp);

	while(nelems-- != 0)
	{
		*tp++ = *xp++;
	}

	*xpp = (const void *)xp;
	return ENOERR;
}

int
ncx_getn_schar_ulonglong(const void **xpp, size_t nelems, ulonglong *tp)
{
	schar *xp = (schar *)(*xpp);

	while(nelems-- != 0)
	{
		*tp++ = *xp++;
	}

	*xpp = (const void *)xp;
	return ENOERR;
}


int
ncx_pad_getn_schar_schar(const void **xpp, size_t nelems, schar *tp)
{
		size_t rndup = nelems % X_ALIGN;

	if(rndup)
		rndup = X_ALIGN - rndup;

	(void) memcpy(tp, *xpp, nelems);
	*xpp = (void *)((char *)(*xpp) + nelems + rndup);

	return ENOERR;

}
int
ncx_pad_getn_schar_uchar(const void **xpp, size_t nelems, uchar *tp)
{
		size_t rndup = nelems % X_ALIGN;

	if(rndup)
		rndup = X_ALIGN - rndup;

	(void) memcpy(tp, *xpp, nelems);
	*xpp = (void *)((char *)(*xpp) + nelems + rndup);

	return ENOERR;

}
int
ncx_pad_getn_schar_short(const void **xpp, size_t nelems, short *tp)
{
	size_t rndup = nelems % X_ALIGN;
	schar *xp = (schar *) *xpp;

	if(rndup)
		rndup = X_ALIGN - rndup;

	while(nelems-- != 0)
	{
		*tp++ = *xp++;
	}

	*xpp = (void *)(xp + rndup);
	return ENOERR;
}

int
ncx_pad_getn_schar_int(const void **xpp, size_t nelems, int *tp)
{
	size_t rndup = nelems % X_ALIGN;
	schar *xp = (schar *) *xpp;

	if(rndup)
		rndup = X_ALIGN - rndup;

	while(nelems-- != 0)
	{
		*tp++ = *xp++;
	}

	*xpp = (void *)(xp + rndup);
	return ENOERR;
}

int
ncx_pad_getn_schar_float(const void **xpp, size_t nelems, float *tp)
{
	size_t rndup = nelems % X_ALIGN;
	schar *xp = (schar *) *xpp;

	if(rndup)
		rndup = X_ALIGN - rndup;

	while(nelems-- != 0)
	{
		*tp++ = *xp++;
	}

	*xpp = (void *)(xp + rndup);
	return ENOERR;
}

int
ncx_pad_getn_schar_double(const void **xpp, size_t nelems, double *tp)
{
	size_t rndup = nelems % X_ALIGN;
	schar *xp = (schar *) *xpp;

	if(rndup)
		rndup = X_ALIGN - rndup;

	while(nelems-- != 0)
	{
		*tp++ = *xp++;
	}

	*xpp = (void *)(xp + rndup);
	return ENOERR;
}

int
ncx_pad_getn_schar_uint(const void **xpp, size_t nelems, uint *tp)
{
	size_t rndup = nelems % X_ALIGN;
	schar *xp = (schar *) *xpp;

	if(rndup)
		rndup = X_ALIGN - rndup;

	while(nelems-- != 0)
	{
		*tp++ = *xp++;
	}

	*xpp = (void *)(xp + rndup);
	return ENOERR;
}

int
ncx_pad_getn_schar_longlong(const void **xpp, size_t nelems, longlong *tp)
{
	size_t rndup = nelems % X_ALIGN;
	schar *xp = (schar *) *xpp;

	if(rndup)
		rndup = X_ALIGN - rndup;

	while(nelems-- != 0)
	{
		*tp++ = *xp++;
	}

	*xpp = (void *)(xp + rndup);
	return ENOERR;
}

int
ncx_pad_getn_schar_ulonglong(const void **xpp, size_t nelems, ulonglong *tp)
{
	size_t rndup = nelems % X_ALIGN;
	schar *xp = (schar *) *xpp;

	if(rndup)
		rndup = X_ALIGN - rndup;

	while(nelems-- != 0)
	{
		*tp++ = *xp++;
	}

	*xpp = (void *)(xp + rndup);
	return ENOERR;
}


int
ncx_putn_schar_schar(void **xpp, size_t nelems, const schar *tp)
{
		(void) memcpy(*xpp, tp, nelems);
	*xpp = (void *)((char *)(*xpp) + nelems);

	return ENOERR;

}
int
ncx_putn_schar_uchar(void **xpp, size_t nelems, const uchar *tp)
{
		(void) memcpy(*xpp, tp, nelems);
	*xpp = (void *)((char *)(*xpp) + nelems);

	return ENOERR;

}
int
ncx_putn_schar_short(void **xpp, size_t nelems, const short *tp)
{
	int status = ENOERR;
	schar *xp = (schar *) *xpp;

	while(nelems-- != 0)
	{
		if(*tp > X_SCHAR_MAX || *tp < X_SCHAR_MIN)
			status = NC_ERANGE;
		*xp++ = (schar) *tp++;
	}

	*xpp = (void *)xp;
	return status;
}

int
ncx_putn_schar_int(void **xpp, size_t nelems, const int *tp)
{
	int status = ENOERR;
	schar *xp = (schar *) *xpp;

	while(nelems-- != 0)
	{
		if(*tp > X_SCHAR_MAX || *tp < X_SCHAR_MIN)
			status = NC_ERANGE;
		*xp++ = (schar) *tp++;
	}

	*xpp = (void *)xp;
	return status;
}

int
ncx_putn_schar_float(void **xpp, size_t nelems, const float *tp)
{
	int status = ENOERR;
	schar *xp = (schar *) *xpp;

	while(nelems-- != 0)
	{
		if(*tp > X_SCHAR_MAX || *tp < X_SCHAR_MIN)
			status = NC_ERANGE;
		*xp++ = (schar) *tp++;
	}

	*xpp = (void *)xp;
	return status;
}

int
ncx_putn_schar_double(void **xpp, size_t nelems, const double *tp)
{
	int status = ENOERR;
	schar *xp = (schar *) *xpp;

	while(nelems-- != 0)
	{
		if(*tp > X_SCHAR_MAX || *tp < X_SCHAR_MIN)
			status = NC_ERANGE;
		*xp++ = (schar) *tp++;
	}

	*xpp = (void *)xp;
	return status;
}

int
ncx_putn_schar_uint(void **xpp, size_t nelems, const uint *tp)
{
	int status = ENOERR;
	schar *xp = (schar *) *xpp;

	while(nelems-- != 0)
	{
		if(*tp > X_SCHAR_MAX || *tp < X_SCHAR_MIN)
			status = NC_ERANGE;
		*xp++ = (schar) *tp++;
	}

	*xpp = (void *)xp;
	return status;
}

int
ncx_putn_schar_longlong(void **xpp, size_t nelems, const longlong *tp)
{
	int status = ENOERR;
	schar *xp = (schar *) *xpp;

	while(nelems-- != 0)
	{
		if(*tp > X_SCHAR_MAX || *tp < X_SCHAR_MIN)
			status = NC_ERANGE;
		*xp++ = (schar) *tp++;
	}

	*xpp = (void *)xp;
	return status;
}

int
ncx_putn_schar_ulonglong(void **xpp, size_t nelems, const ulonglong *tp)
{
	int status = ENOERR;
	schar *xp = (schar *) *xpp;

	while(nelems-- != 0)
	{
		if(*tp > X_SCHAR_MAX || *tp < X_SCHAR_MIN)
			status = NC_ERANGE;
		*xp++ = (schar) *tp++;
	}

	*xpp = (void *)xp;
	return status;
}


int
ncx_pad_putn_schar_schar(void **xpp, size_t nelems, const schar *tp)
{
		size_t rndup = nelems % X_ALIGN;

	if(rndup)
		rndup = X_ALIGN - rndup;

	(void) memcpy(*xpp, tp, nelems);
	*xpp = (void *)((char *)(*xpp) + nelems);

	if(rndup)
	{
		(void) memcpy(*xpp, nada, rndup);
		*xpp = (void *)((char *)(*xpp) + rndup);
	}
	
	return ENOERR;

}
int
ncx_pad_putn_schar_uchar(void **xpp, size_t nelems, const uchar *tp)
{
		size_t rndup = nelems % X_ALIGN;

	if(rndup)
		rndup = X_ALIGN - rndup;

	(void) memcpy(*xpp, tp, nelems);
	*xpp = (void *)((char *)(*xpp) + nelems);

	if(rndup)
	{
		(void) memcpy(*xpp, nada, rndup);
		*xpp = (void *)((char *)(*xpp) + rndup);
	}
	
	return ENOERR;

}
int
ncx_pad_putn_schar_short(void **xpp, size_t nelems, const short *tp)
{
	int status = ENOERR;
	size_t rndup = nelems % X_ALIGN;
	schar *xp = (schar *) *xpp;

	if(rndup)
		rndup = X_ALIGN - rndup;

	while(nelems-- != 0)
	{
		/* N.B. schar as signed */
		if(*tp > X_SCHAR_MAX || *tp < X_SCHAR_MIN)
			status = NC_ERANGE;
		*xp++ = (schar) *tp++;
	}


	if(rndup)
	{
		(void) memcpy(xp, nada, rndup);
		xp += rndup;
	}

	*xpp = (void *)xp;
	return status;
}

int
ncx_pad_putn_schar_int(void **xpp, size_t nelems, const int *tp)
{
	int status = ENOERR;
	size_t rndup = nelems % X_ALIGN;
	schar *xp = (schar *) *xpp;

	if(rndup)
		rndup = X_ALIGN - rndup;

	while(nelems-- != 0)
	{
		/* N.B. schar as signed */
		if(*tp > X_SCHAR_MAX || *tp < X_SCHAR_MIN)
			status = NC_ERANGE;
		*xp++ = (schar) *tp++;
	}


	if(rndup)
	{
		(void) memcpy(xp, nada, rndup);
		xp += rndup;
	}

	*xpp = (void *)xp;
	return status;
}

int
ncx_pad_putn_schar_float(void **xpp, size_t nelems, const float *tp)
{
	int status = ENOERR;
	size_t rndup = nelems % X_ALIGN;
	schar *xp = (schar *) *xpp;

	if(rndup)
		rndup = X_ALIGN - rndup;

	while(nelems-- != 0)
	{
		/* N.B. schar as signed */
		if(*tp > X_SCHAR_MAX || *tp < X_SCHAR_MIN)
			status = NC_ERANGE;
		*xp++ = (schar) *tp++;
	}


	if(rndup)
	{
		(void) memcpy(xp, nada, rndup);
		xp += rndup;
	}

	*xpp = (void *)xp;
	return status;
}

int
ncx_pad_putn_schar_double(void **xpp, size_t nelems, const double *tp)
{
	int status = ENOERR;
	size_t rndup = nelems % X_ALIGN;
	schar *xp = (schar *) *xpp;

	if(rndup)
		rndup = X_ALIGN - rndup;

	while(nelems-- != 0)
	{
		/* N.B. schar as signed */
		if(*tp > X_SCHAR_MAX || *tp < X_SCHAR_MIN)
			status = NC_ERANGE;
		*xp++ = (schar) *tp++;
	}


	if(rndup)
	{
		(void) memcpy(xp, nada, rndup);
		xp += rndup;
	}

	*xpp = (void *)xp;
	return status;
}

int
ncx_pad_putn_schar_uint(void **xpp, size_t nelems, const uint *tp)
{
	int status = ENOERR;
	size_t rndup = nelems % X_ALIGN;
	schar *xp = (schar *) *xpp;

	if(rndup)
		rndup = X_ALIGN - rndup;

	while(nelems-- != 0)
	{
		/* N.B. schar as signed */
		if(*tp > X_SCHAR_MAX || *tp < X_SCHAR_MIN)
			status = NC_ERANGE;
		*xp++ = (schar) *tp++;
	}


	if(rndup)
	{
		(void) memcpy(xp, nada, rndup);
		xp += rndup;
	}

	*xpp = (void *)xp;
	return status;
}

int
ncx_pad_putn_schar_longlong(void **xpp, size_t nelems, const longlong *tp)
{
	int status = ENOERR;
	size_t rndup = nelems % X_ALIGN;
	schar *xp = (schar *) *xpp;

	if(rndup)
		rndup = X_ALIGN - rndup;

	while(nelems-- != 0)
	{
		/* N.B. schar as signed */
		if(*tp > X_SCHAR_MAX || *tp < X_SCHAR_MIN)
			status = NC_ERANGE;
		*xp++ = (schar) *tp++;
	}


	if(rndup)
	{
		(void) memcpy(xp, nada, rndup);
		xp += rndup;
	}

	*xpp = (void *)xp;
	return status;
}

int
ncx_pad_putn_schar_ulonglong(void **xpp, size_t nelems, const ulonglong *tp)
{
	int status = ENOERR;
	size_t rndup = nelems % X_ALIGN;
	schar *xp = (schar *) *xpp;

	if(rndup)
		rndup = X_ALIGN - rndup;

	while(nelems-- != 0)
	{
		/* N.B. schar as signed */
		if(*tp > X_SCHAR_MAX || *tp < X_SCHAR_MIN)
			status = NC_ERANGE;
		*xp++ = (schar) *tp++;
	}


	if(rndup)
	{
		(void) memcpy(xp, nada, rndup);
		xp += rndup;
	}

	*xpp = (void *)xp;
	return status;
}


/* short */

int
ncx_getn_short_schar(const void **xpp, size_t nelems, schar *tp)
{
#if _SX && \
           X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case input is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_SHORT);
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (schar) Max( SCHAR_MIN, Min(SCHAR_MAX, (schar) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < SCHAR_MIN || xp[i] > SCHAR_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (short *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		const int lstatus = ncx_get_short_schar(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

int
ncx_getn_short_uchar(const void **xpp, size_t nelems, uchar *tp)
{
#if _SX && \
           X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case input is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_SHORT);
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (uchar) Max( UCHAR_MIN, Min(UCHAR_MAX, (uchar) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < UCHAR_MIN || xp[i] > UCHAR_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (short *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		const int lstatus = ncx_get_short_uchar(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

#if X_SIZEOF_SHORT == SIZEOF_SHORT
/* optimized version */
int
ncx_getn_short_short(const void **xpp, size_t nelems, short *tp)
{
#ifdef WORDS_BIGENDIAN
	(void) memcpy(tp, *xpp, nelems * sizeof(short));
# else
	swapn2b(tp, *xpp, nelems);
# endif
	*xpp = (const void *)((const char *)(*xpp) + nelems * X_SIZEOF_SHORT);
	return ENOERR;
}
#else
int
ncx_getn_short_short(const void **xpp, size_t nelems, short *tp)
{
#if _SX && \
           X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case input is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_SHORT);
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (short) Max( SHORT_MIN, Min(SHORT_MAX, (short) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < SHORT_MIN || xp[i] > SHORT_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (short *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		const int lstatus = ncx_get_short_short(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

#endif
int
ncx_getn_short_int(const void **xpp, size_t nelems, int *tp)
{
#if _SX && \
           X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case input is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_SHORT);
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (int) Max( INT_MIN, Min(INT_MAX, (int) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < INT_MIN || xp[i] > INT_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (short *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		const int lstatus = ncx_get_short_int(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

int
ncx_getn_short_float(const void **xpp, size_t nelems, float *tp)
{
#if _SX && \
           X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case input is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_SHORT);
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (float) Max( FLOAT_MIN, Min(FLOAT_MAX, (float) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < FLOAT_MIN || xp[i] > FLOAT_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (short *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		const int lstatus = ncx_get_short_float(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

int
ncx_getn_short_double(const void **xpp, size_t nelems, double *tp)
{
#if _SX && \
           X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case input is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_SHORT);
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (double) Max( DOUBLE_MIN, Min(DOUBLE_MAX, (double) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < DOUBLE_MIN || xp[i] > DOUBLE_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (short *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		const int lstatus = ncx_get_short_double(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

int
ncx_getn_short_uint(const void **xpp, size_t nelems, uint *tp)
{
#if _SX && \
           X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case input is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_SHORT);
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (uint) Max( UINT_MIN, Min(UINT_MAX, (uint) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < UINT_MIN || xp[i] > UINT_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (short *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		const int lstatus = ncx_get_short_uint(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

int
ncx_getn_short_longlong(const void **xpp, size_t nelems, longlong *tp)
{
#if _SX && \
           X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case input is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_SHORT);
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (longlong) Max( LONGLONG_MIN, Min(LONGLONG_MAX, (longlong) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < LONGLONG_MIN || xp[i] > LONGLONG_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (short *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		const int lstatus = ncx_get_short_longlong(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

int
ncx_getn_short_ulonglong(const void **xpp, size_t nelems, ulonglong *tp)
{
#if _SX && \
           X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case input is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_SHORT);
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (ulonglong) Max( ULONGLONG_MIN, Min(ULONGLONG_MAX, (ulonglong) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < ULONGLONG_MIN || xp[i] > ULONGLONG_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (short *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		const int lstatus = ncx_get_short_ulonglong(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}


int
ncx_pad_getn_short_schar(const void **xpp, size_t nelems, schar *tp)
{
	const size_t rndup = nelems % 2;

	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		const int lstatus = ncx_get_short_schar(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	if(rndup != 0)
		xp += X_SIZEOF_SHORT;
		
	*xpp = (void *)xp;
	return status;
}

int
ncx_pad_getn_short_uchar(const void **xpp, size_t nelems, uchar *tp)
{
	const size_t rndup = nelems % 2;

	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		const int lstatus = ncx_get_short_uchar(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	if(rndup != 0)
		xp += X_SIZEOF_SHORT;
		
	*xpp = (void *)xp;
	return status;
}

int
ncx_pad_getn_short_short(const void **xpp, size_t nelems, short *tp)
{
	const size_t rndup = nelems % 2;

	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		const int lstatus = ncx_get_short_short(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	if(rndup != 0)
		xp += X_SIZEOF_SHORT;
		
	*xpp = (void *)xp;
	return status;
}

int
ncx_pad_getn_short_int(const void **xpp, size_t nelems, int *tp)
{
	const size_t rndup = nelems % 2;

	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		const int lstatus = ncx_get_short_int(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	if(rndup != 0)
		xp += X_SIZEOF_SHORT;
		
	*xpp = (void *)xp;
	return status;
}

int
ncx_pad_getn_short_float(const void **xpp, size_t nelems, float *tp)
{
	const size_t rndup = nelems % 2;

	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		const int lstatus = ncx_get_short_float(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	if(rndup != 0)
		xp += X_SIZEOF_SHORT;
		
	*xpp = (void *)xp;
	return status;
}

int
ncx_pad_getn_short_double(const void **xpp, size_t nelems, double *tp)
{
	const size_t rndup = nelems % 2;

	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		const int lstatus = ncx_get_short_double(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	if(rndup != 0)
		xp += X_SIZEOF_SHORT;
		
	*xpp = (void *)xp;
	return status;
}

int
ncx_pad_getn_short_uint(const void **xpp, size_t nelems, uint *tp)
{
	const size_t rndup = nelems % 2;

	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		const int lstatus = ncx_get_short_uint(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	if(rndup != 0)
		xp += X_SIZEOF_SHORT;
		
	*xpp = (void *)xp;
	return status;
}

int
ncx_pad_getn_short_longlong(const void **xpp, size_t nelems, longlong *tp)
{
	const size_t rndup = nelems % 2;

	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		const int lstatus = ncx_get_short_longlong(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	if(rndup != 0)
		xp += X_SIZEOF_SHORT;
		
	*xpp = (void *)xp;
	return status;
}

int
ncx_pad_getn_short_ulonglong(const void **xpp, size_t nelems, ulonglong *tp)
{
	const size_t rndup = nelems % 2;

	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		const int lstatus = ncx_get_short_ulonglong(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	if(rndup != 0)
		xp += X_SIZEOF_SHORT;
		
	*xpp = (void *)xp;
	return status;
}


int
ncx_putn_short_schar(void **xpp, size_t nelems, const schar *tp)
{
#if _SX && \
           X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case input is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (short) Max( X_SHORT_MIN, Min(X_SHORT_MAX, (short) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_SHORT_MIN || tp[i] > X_SHORT_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_SHORT);
      xp = (short *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		int lstatus = ncx_put_short_schar(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

int
ncx_putn_short_uchar(void **xpp, size_t nelems, const uchar *tp)
{
#if _SX && \
           X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case input is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (short) Max( X_SHORT_MIN, Min(X_SHORT_MAX, (short) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_SHORT_MIN || tp[i] > X_SHORT_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_SHORT);
      xp = (short *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		int lstatus = ncx_put_short_uchar(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

#if X_SIZEOF_SHORT == SIZEOF_SHORT
/* optimized version */
int
ncx_putn_short_short(void **xpp, size_t nelems, const short *tp)
{
#ifdef WORDS_BIGENDIAN
	(void) memcpy(*xpp, tp, nelems * X_SIZEOF_SHORT);
# else
	swapn2b(*xpp, tp, nelems);
# endif
	*xpp = (void *)((char *)(*xpp) + nelems * X_SIZEOF_SHORT);
	return ENOERR;
}
#else
int
ncx_putn_short_short(void **xpp, size_t nelems, const short *tp)
{
#if _SX && \
           X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case input is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (short) Max( X_SHORT_MIN, Min(X_SHORT_MAX, (short) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_SHORT_MIN || tp[i] > X_SHORT_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_SHORT);
      xp = (short *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		int lstatus = ncx_put_short_short(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

#endif
int
ncx_putn_short_int(void **xpp, size_t nelems, const int *tp)
{
#if _SX && \
           X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case input is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (short) Max( X_SHORT_MIN, Min(X_SHORT_MAX, (short) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_SHORT_MIN || tp[i] > X_SHORT_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_SHORT);
      xp = (short *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		int lstatus = ncx_put_short_int(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

int
ncx_putn_short_float(void **xpp, size_t nelems, const float *tp)
{
#if _SX && \
           X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case input is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (short) Max( X_SHORT_MIN, Min(X_SHORT_MAX, (short) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_SHORT_MIN || tp[i] > X_SHORT_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_SHORT);
      xp = (short *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		int lstatus = ncx_put_short_float(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

int
ncx_putn_short_double(void **xpp, size_t nelems, const double *tp)
{
#if _SX && \
           X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case input is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (short) Max( X_SHORT_MIN, Min(X_SHORT_MAX, (short) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_SHORT_MIN || tp[i] > X_SHORT_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_SHORT);
      xp = (short *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		int lstatus = ncx_put_short_double(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

int
ncx_putn_short_uint(void **xpp, size_t nelems, const uint *tp)
{
#if _SX && \
           X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case input is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (short) Max( X_SHORT_MIN, Min(X_SHORT_MAX, (short) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_SHORT_MIN || tp[i] > X_SHORT_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_SHORT);
      xp = (short *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		int lstatus = ncx_put_short_uint(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

int
ncx_putn_short_longlong(void **xpp, size_t nelems, const longlong *tp)
{
#if _SX && \
           X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case input is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (short) Max( X_SHORT_MIN, Min(X_SHORT_MAX, (short) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_SHORT_MIN || tp[i] > X_SHORT_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_SHORT);
      xp = (short *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		int lstatus = ncx_put_short_longlong(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

int
ncx_putn_short_ulonglong(void **xpp, size_t nelems, const ulonglong *tp)
{
#if _SX && \
           X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case input is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (short) Max( X_SHORT_MIN, Min(X_SHORT_MAX, (short) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_SHORT_MIN || tp[i] > X_SHORT_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_SHORT);
      xp = (short *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		int lstatus = ncx_put_short_ulonglong(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}


int
ncx_pad_putn_short_schar(void **xpp, size_t nelems, const schar *tp)
{
	const size_t rndup = nelems % 2;

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		int lstatus = ncx_put_short_schar(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	if(rndup != 0)
	{
		(void) memcpy(xp, nada, X_SIZEOF_SHORT);
		xp += X_SIZEOF_SHORT;	
	}
		
	*xpp = (void *)xp;
	return status;
}

int
ncx_pad_putn_short_uchar(void **xpp, size_t nelems, const uchar *tp)
{
	const size_t rndup = nelems % 2;

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		int lstatus = ncx_put_short_uchar(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	if(rndup != 0)
	{
		(void) memcpy(xp, nada, X_SIZEOF_SHORT);
		xp += X_SIZEOF_SHORT;	
	}
		
	*xpp = (void *)xp;
	return status;
}

int
ncx_pad_putn_short_short(void **xpp, size_t nelems, const short *tp)
{
	const size_t rndup = nelems % 2;

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		int lstatus = ncx_put_short_short(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	if(rndup != 0)
	{
		(void) memcpy(xp, nada, X_SIZEOF_SHORT);
		xp += X_SIZEOF_SHORT;	
	}
		
	*xpp = (void *)xp;
	return status;
}

int
ncx_pad_putn_short_int(void **xpp, size_t nelems, const int *tp)
{
	const size_t rndup = nelems % 2;

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		int lstatus = ncx_put_short_int(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	if(rndup != 0)
	{
		(void) memcpy(xp, nada, X_SIZEOF_SHORT);
		xp += X_SIZEOF_SHORT;	
	}
		
	*xpp = (void *)xp;
	return status;
}

int
ncx_pad_putn_short_float(void **xpp, size_t nelems, const float *tp)
{
	const size_t rndup = nelems % 2;

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		int lstatus = ncx_put_short_float(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	if(rndup != 0)
	{
		(void) memcpy(xp, nada, X_SIZEOF_SHORT);
		xp += X_SIZEOF_SHORT;	
	}
		
	*xpp = (void *)xp;
	return status;
}

int
ncx_pad_putn_short_double(void **xpp, size_t nelems, const double *tp)
{
	const size_t rndup = nelems % 2;

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		int lstatus = ncx_put_short_double(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	if(rndup != 0)
	{
		(void) memcpy(xp, nada, X_SIZEOF_SHORT);
		xp += X_SIZEOF_SHORT;	
	}
		
	*xpp = (void *)xp;
	return status;
}

int
ncx_pad_putn_short_uint(void **xpp, size_t nelems, const uint *tp)
{
	const size_t rndup = nelems % 2;

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		int lstatus = ncx_put_short_uint(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	if(rndup != 0)
	{
		(void) memcpy(xp, nada, X_SIZEOF_SHORT);
		xp += X_SIZEOF_SHORT;	
	}
		
	*xpp = (void *)xp;
	return status;
}

int
ncx_pad_putn_short_longlong(void **xpp, size_t nelems, const longlong *tp)
{
	const size_t rndup = nelems % 2;

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		int lstatus = ncx_put_short_longlong(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	if(rndup != 0)
	{
		(void) memcpy(xp, nada, X_SIZEOF_SHORT);
		xp += X_SIZEOF_SHORT;	
	}
		
	*xpp = (void *)xp;
	return status;
}

int
ncx_pad_putn_short_ulonglong(void **xpp, size_t nelems, const ulonglong *tp)
{
	const size_t rndup = nelems % 2;

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		int lstatus = ncx_put_short_ulonglong(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	if(rndup != 0)
	{
		(void) memcpy(xp, nada, X_SIZEOF_SHORT);
		xp += X_SIZEOF_SHORT;	
	}
		
	*xpp = (void *)xp;
	return status;
}


/* int */

int
ncx_getn_int_schar(const void **xpp, size_t nelems, schar *tp)
{
#if _SX && \
           X_SIZEOF_INT == SIZEOF_INT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int tmp[LOOPCNT];        /* in case input is misaligned */
  int *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_INT);
      xp = tmp;
    } else {
      xp = (int *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (schar) Max( SCHAR_MIN, Min(SCHAR_MAX, (schar) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < SCHAR_MIN || xp[i] > SCHAR_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (int *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		const int lstatus = ncx_get_int_schar(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

int
ncx_getn_int_uchar(const void **xpp, size_t nelems, uchar *tp)
{
#if _SX && \
           X_SIZEOF_INT == SIZEOF_INT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int tmp[LOOPCNT];        /* in case input is misaligned */
  int *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_INT);
      xp = tmp;
    } else {
      xp = (int *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (uchar) Max( UCHAR_MIN, Min(UCHAR_MAX, (uchar) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < UCHAR_MIN || xp[i] > UCHAR_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (int *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		const int lstatus = ncx_get_int_uchar(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

int
ncx_getn_int_short(const void **xpp, size_t nelems, short *tp)
{
#if _SX && \
           X_SIZEOF_INT == SIZEOF_INT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int tmp[LOOPCNT];        /* in case input is misaligned */
  int *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_INT);
      xp = tmp;
    } else {
      xp = (int *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (short) Max( SHORT_MIN, Min(SHORT_MAX, (short) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < SHORT_MIN || xp[i] > SHORT_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (int *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		const int lstatus = ncx_get_int_short(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

#if X_SIZEOF_INT == SIZEOF_INT
/* optimized version */
int
ncx_getn_int_int(const void **xpp, size_t nelems, int *tp)
{
#ifdef WORDS_BIGENDIAN
	(void) memcpy(tp, *xpp, nelems * sizeof(int));
# else
	swapn4b(tp, *xpp, nelems);
# endif
	*xpp = (const void *)((const char *)(*xpp) + nelems * X_SIZEOF_INT);
	return ENOERR;
}
int
ncx_getn_int_uint(const void **xpp, size_t nelems, unsigned int *tp)
{
#ifdef WORDS_BIGENDIAN
	(void) memcpy(tp, *xpp, nelems * sizeof(int));
# else
	swapn4b(tp, *xpp, nelems);
# endif
	*xpp = (const void *)((const char *)(*xpp) + nelems * X_SIZEOF_INT);
	return ENOERR;
}
#else
int
ncx_getn_int_int(const void **xpp, size_t nelems, int *tp)
{
#if _SX && \
           X_SIZEOF_INT == SIZEOF_INT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int tmp[LOOPCNT];        /* in case input is misaligned */
  int *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_INT);
      xp = tmp;
    } else {
      xp = (int *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (int) Max( INT_MIN, Min(INT_MAX, (int) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < INT_MIN || xp[i] > INT_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (int *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		const int lstatus = ncx_get_int_int(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

int
ncx_getn_int_uint(const void **xpp, size_t nelems, uint *tp)
{
#if _SX && \
           X_SIZEOF_INT == SIZEOF_INT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int tmp[LOOPCNT];        /* in case input is misaligned */
  int *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_INT);
      xp = tmp;
    } else {
      xp = (int *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (uint) Max( UINT_MIN, Min(UINT_MAX, (uint) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < UINT_MIN || xp[i] > UINT_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (int *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		const int lstatus = ncx_get_int_uint(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

#endif

int
ncx_getn_int_longlong(const void **xpp, size_t nelems, longlong *tp)
{
#if _SX && \
           X_SIZEOF_INT == SIZEOF_INT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int tmp[LOOPCNT];        /* in case input is misaligned */
  int *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_INT);
      xp = tmp;
    } else {
      xp = (int *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (longlong) Max( LONGLONG_MIN, Min(LONGLONG_MAX, (longlong) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < LONGLONG_MIN || xp[i] > LONGLONG_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (int *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		const int lstatus = ncx_get_int_longlong(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

int
ncx_getn_int_ulonglong(const void **xpp, size_t nelems, ulonglong *tp)
{
#if _SX && \
           X_SIZEOF_INT == SIZEOF_INT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int tmp[LOOPCNT];        /* in case input is misaligned */
  int *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_INT);
      xp = tmp;
    } else {
      xp = (int *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (ulonglong) Max( ULONGLONG_MIN, Min(ULONGLONG_MAX, (ulonglong) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < ULONGLONG_MIN || xp[i] > ULONGLONG_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (int *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		const int lstatus = ncx_get_int_ulonglong(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}


int
ncx_getn_int_float(const void **xpp, size_t nelems, float *tp)
{
#if _SX && \
           X_SIZEOF_INT == SIZEOF_INT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int tmp[LOOPCNT];        /* in case input is misaligned */
  int *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_INT);
      xp = tmp;
    } else {
      xp = (int *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (float) Max( FLOAT_MIN, Min(FLOAT_MAX, (float) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < FLOAT_MIN || xp[i] > FLOAT_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (int *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		const int lstatus = ncx_get_int_float(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

int
ncx_getn_int_double(const void **xpp, size_t nelems, double *tp)
{
#if _SX && \
           X_SIZEOF_INT == SIZEOF_INT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int tmp[LOOPCNT];        /* in case input is misaligned */
  int *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_INT);
      xp = tmp;
    } else {
      xp = (int *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (double) Max( DOUBLE_MIN, Min(DOUBLE_MAX, (double) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < DOUBLE_MIN || xp[i] > DOUBLE_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (int *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		const int lstatus = ncx_get_int_double(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}


int
ncx_putn_int_schar(void **xpp, size_t nelems, const schar *tp)
{
#if _SX && \
           X_SIZEOF_INT == SIZEOF_INT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int tmp[LOOPCNT];        /* in case input is misaligned */
  int *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (int) Max( X_INT_MIN, Min(X_INT_MAX, (int) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_INT_MIN || tp[i] > X_INT_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_INT);
      xp = (int *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		int lstatus = ncx_put_int_schar(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

int
ncx_putn_int_uchar(void **xpp, size_t nelems, const uchar *tp)
{
#if _SX && \
           X_SIZEOF_INT == SIZEOF_INT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int tmp[LOOPCNT];        /* in case input is misaligned */
  int *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (int) Max( X_INT_MIN, Min(X_INT_MAX, (int) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_INT_MIN || tp[i] > X_INT_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_INT);
      xp = (int *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		int lstatus = ncx_put_int_uchar(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

int
ncx_putn_int_short(void **xpp, size_t nelems, const short *tp)
{
#if _SX && \
           X_SIZEOF_INT == SIZEOF_INT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int tmp[LOOPCNT];        /* in case input is misaligned */
  int *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (int) Max( X_INT_MIN, Min(X_INT_MAX, (int) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_INT_MIN || tp[i] > X_INT_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_INT);
      xp = (int *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		int lstatus = ncx_put_int_short(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

#if X_SIZEOF_INT == SIZEOF_INT
/* optimized version */
int
ncx_putn_int_int(void **xpp, size_t nelems, const int *tp)
{
#ifdef WORDS_BIGENDIAN
	(void) memcpy(*xpp, tp, nelems * X_SIZEOF_INT);
# else
	swapn4b(*xpp, tp, nelems);
# endif
	*xpp = (void *)((char *)(*xpp) + nelems * X_SIZEOF_INT);
	return ENOERR;
}
int
ncx_putn_int_uint(void **xpp, size_t nelems, const unsigned int *tp)
{
#ifdef WORDS_BIGENDIAN
	(void) memcpy(*xpp, tp, nelems * X_SIZEOF_INT);
# else
	swapn4b(*xpp, tp, nelems);
# endif
	*xpp = (void *)((char *)(*xpp) + nelems * X_SIZEOF_INT);
	return ENOERR;
}
#else
int
ncx_putn_int_int(void **xpp, size_t nelems, const int *tp)
{
#if _SX && \
           X_SIZEOF_INT == SIZEOF_INT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int tmp[LOOPCNT];        /* in case input is misaligned */
  int *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (int) Max( X_INT_MIN, Min(X_INT_MAX, (int) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_INT_MIN || tp[i] > X_INT_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_INT);
      xp = (int *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		int lstatus = ncx_put_int_int(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

int
ncx_putn_int_uint(void **xpp, size_t nelems, const uint *tp)
{
#if _SX && \
           X_SIZEOF_INT == SIZEOF_INT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int tmp[LOOPCNT];        /* in case input is misaligned */
  int *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (int) Max( X_INT_MIN, Min(X_INT_MAX, (int) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_INT_MIN || tp[i] > X_INT_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_INT);
      xp = (int *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		int lstatus = ncx_put_int_uint(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

#endif

int
ncx_putn_int_longlong(void **xpp, size_t nelems, const longlong *tp)
{
#if _SX && \
           X_SIZEOF_INT == SIZEOF_INT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int tmp[LOOPCNT];        /* in case input is misaligned */
  int *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (int) Max( X_INT_MIN, Min(X_INT_MAX, (int) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_INT_MIN || tp[i] > X_INT_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_INT);
      xp = (int *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		int lstatus = ncx_put_int_longlong(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

int
ncx_putn_int_ulonglong(void **xpp, size_t nelems, const ulonglong *tp)
{
#if _SX && \
           X_SIZEOF_INT == SIZEOF_INT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int tmp[LOOPCNT];        /* in case input is misaligned */
  int *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (int) Max( X_INT_MIN, Min(X_INT_MAX, (int) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_INT_MIN || tp[i] > X_INT_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_INT);
      xp = (int *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		int lstatus = ncx_put_int_ulonglong(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

int
ncx_putn_int_float(void **xpp, size_t nelems, const float *tp)
{
#if _SX && \
           X_SIZEOF_INT == SIZEOF_INT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int tmp[LOOPCNT];        /* in case input is misaligned */
  int *xp;
  double d;               /* special case for ncx_putn_int_float */
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* for some reason int to float, for putn, requires a special case */ 
      d = tp[i];
      xp[i] = (int) Max( X_INT_MIN, Min(X_INT_MAX, (int) d));
      nrange += d < X_INT_MIN || d > X_INT_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_INT);
      xp = (int *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		int lstatus = ncx_put_int_float(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

int
ncx_putn_int_double(void **xpp, size_t nelems, const double *tp)
{
#if _SX && \
           X_SIZEOF_INT == SIZEOF_INT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int tmp[LOOPCNT];        /* in case input is misaligned */
  int *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (int) Max( X_INT_MIN, Min(X_INT_MAX, (int) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_INT_MIN || tp[i] > X_INT_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_INT);
      xp = (int *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		int lstatus = ncx_put_int_double(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}


/* float */

int
ncx_getn_float_schar(const void **xpp, size_t nelems, schar *tp)
{
#if _SX && \
           X_SIZEOF_FLOAT == SIZEOF_FLOAT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  float tmp[LOOPCNT];        /* in case input is misaligned */
  float *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_FLOAT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_FLOAT);
      xp = tmp;
    } else {
      xp = (float *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (schar) Max( SCHAR_MIN, Min(SCHAR_MAX, (schar) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < SCHAR_MIN || xp[i] > SCHAR_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (float *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++)
	{
		const int lstatus = ncx_get_float_schar(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

int
ncx_getn_float_uchar(const void **xpp, size_t nelems, uchar *tp)
{
#if _SX && \
           X_SIZEOF_FLOAT == SIZEOF_FLOAT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  float tmp[LOOPCNT];        /* in case input is misaligned */
  float *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_FLOAT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_FLOAT);
      xp = tmp;
    } else {
      xp = (float *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (uchar) Max( UCHAR_MIN, Min(UCHAR_MAX, (uchar) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < UCHAR_MIN || xp[i] > UCHAR_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (float *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++)
	{
		const int lstatus = ncx_get_float_uchar(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

int
ncx_getn_float_short(const void **xpp, size_t nelems, short *tp)
{
#if _SX && \
           X_SIZEOF_FLOAT == SIZEOF_FLOAT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  float tmp[LOOPCNT];        /* in case input is misaligned */
  float *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_FLOAT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_FLOAT);
      xp = tmp;
    } else {
      xp = (float *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (short) Max( SHORT_MIN, Min(SHORT_MAX, (short) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < SHORT_MIN || xp[i] > SHORT_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (float *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++)
	{
		const int lstatus = ncx_get_float_short(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

int
ncx_getn_float_int(const void **xpp, size_t nelems, int *tp)
{
#if _SX && \
           X_SIZEOF_FLOAT == SIZEOF_FLOAT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  float tmp[LOOPCNT];        /* in case input is misaligned */
  float *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_FLOAT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_FLOAT);
      xp = tmp;
    } else {
      xp = (float *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (int) Max( INT_MIN, Min(INT_MAX, (int) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < INT_MIN || xp[i] > INT_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (float *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++)
	{
		const int lstatus = ncx_get_float_int(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

#if X_SIZEOF_FLOAT == SIZEOF_FLOAT && !defined(NO_IEEE_FLOAT)
/* optimized version */
int
ncx_getn_float_float(const void **xpp, size_t nelems, float *tp)
{
#ifdef WORDS_BIGENDIAN
	(void) memcpy(tp, *xpp, nelems * sizeof(float));
# else
	swapn4b(tp, *xpp, nelems);
# endif
	*xpp = (const void *)((const char *)(*xpp) + nelems * X_SIZEOF_FLOAT);
	return ENOERR;
}
#elif vax
int
ncx_getn_float_float(const void **xpp, size_t nfloats, float *ip)
{
	float *const end = ip + nfloats;

	while(ip < end)
	{
		struct vax_single *const vsp = (struct vax_single *) ip;
		const struct ieee_single *const isp =
			 (const struct ieee_single *) (*xpp);
		unsigned exp = isp->exp_hi << 1 | isp->exp_lo;

		switch(exp) {
		case 0 :
			/* ieee subnormal */
			if(isp->mant_hi == min.ieee.mant_hi
				&& isp->mant_lo_hi == min.ieee.mant_lo_hi
				&& isp->mant_lo_lo == min.ieee.mant_lo_lo)
			{
				*vsp = min.s;
			}
			else
			{
				unsigned mantissa = (isp->mant_hi << 16)
					 | isp->mant_lo_hi << 8
					 | isp->mant_lo_lo;
				unsigned tmp = mantissa >> 20;
				if(tmp >= 4) {
					vsp->exp = 2;
				} else if (tmp >= 2) {
					vsp->exp = 1;
				} else {
					*vsp = min.s;
					break;
				} /* else */
				tmp = mantissa - (1 << (20 + vsp->exp ));
				tmp <<= 3 - vsp->exp;
				vsp->mantissa2 = tmp;
				vsp->mantissa1 = (tmp >> 16);
			}
			break;
		case 0xfe :
		case 0xff :
			*vsp = max.s;
			break;
		default :
			vsp->exp = exp - IEEE_SNG_BIAS + VAX_SNG_BIAS;
			vsp->mantissa2 = isp->mant_lo_hi << 8 | isp->mant_lo_lo;
			vsp->mantissa1 = isp->mant_hi;
		}

		vsp->sign = isp->sign;


		ip++;
		*xpp = (char *)(*xpp) + X_SIZEOF_FLOAT;
	}
	return ENOERR;
}
#else
int
ncx_getn_float_float(const void **xpp, size_t nelems, float *tp)
{
	const char *xp = *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++)
	{
		const int lstatus = ncx_get_float_float(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
}

#endif
int
ncx_getn_float_double(const void **xpp, size_t nelems, double *tp)
{
#if _SX && \
           X_SIZEOF_FLOAT == SIZEOF_FLOAT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  float tmp[LOOPCNT];        /* in case input is misaligned */
  float *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_FLOAT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_FLOAT);
      xp = tmp;
    } else {
      xp = (float *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (double) Max( DOUBLE_MIN, Min(DOUBLE_MAX, (double) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < DOUBLE_MIN || xp[i] > DOUBLE_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (float *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++)
	{
		const int lstatus = ncx_get_float_double(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

int
ncx_getn_float_uint(const void **xpp, size_t nelems, uint *tp)
{
#if _SX && \
           X_SIZEOF_FLOAT == SIZEOF_FLOAT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  float tmp[LOOPCNT];        /* in case input is misaligned */
  float *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_FLOAT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_FLOAT);
      xp = tmp;
    } else {
      xp = (float *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (uint) Max( UINT_MIN, Min(UINT_MAX, (uint) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < UINT_MIN || xp[i] > UINT_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (float *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++)
	{
		const int lstatus = ncx_get_float_uint(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

int
ncx_getn_float_longlong(const void **xpp, size_t nelems, longlong *tp)
{
#if _SX && \
           X_SIZEOF_FLOAT == SIZEOF_FLOAT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  float tmp[LOOPCNT];        /* in case input is misaligned */
  float *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_FLOAT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_FLOAT);
      xp = tmp;
    } else {
      xp = (float *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (longlong) Max( LONGLONG_MIN, Min(LONGLONG_MAX, (longlong) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < LONGLONG_MIN || xp[i] > LONGLONG_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (float *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++)
	{
		const int lstatus = ncx_get_float_longlong(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

int
ncx_getn_float_ulonglong(const void **xpp, size_t nelems, ulonglong *tp)
{
#if _SX && \
           X_SIZEOF_FLOAT == SIZEOF_FLOAT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  float tmp[LOOPCNT];        /* in case input is misaligned */
  float *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_FLOAT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_FLOAT);
      xp = tmp;
    } else {
      xp = (float *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (ulonglong) Max( ULONGLONG_MIN, Min(ULONGLONG_MAX, (ulonglong) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < ULONGLONG_MIN || xp[i] > ULONGLONG_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (float *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++)
	{
		const int lstatus = ncx_get_float_ulonglong(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}


int
ncx_putn_float_schar(void **xpp, size_t nelems, const schar *tp)
{
#if _SX && \
           X_SIZEOF_FLOAT == SIZEOF_FLOAT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  float tmp[LOOPCNT];        /* in case input is misaligned */
  float *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_FLOAT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (float *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (float) Max( X_FLOAT_MIN, Min(X_FLOAT_MAX, (float) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_FLOAT_MIN || tp[i] > X_FLOAT_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_FLOAT);
      xp = (float *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++)
	{
		int lstatus = ncx_put_float_schar(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

int
ncx_putn_float_uchar(void **xpp, size_t nelems, const uchar *tp)
{
#if _SX && \
           X_SIZEOF_FLOAT == SIZEOF_FLOAT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  float tmp[LOOPCNT];        /* in case input is misaligned */
  float *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_FLOAT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (float *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (float) Max( X_FLOAT_MIN, Min(X_FLOAT_MAX, (float) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_FLOAT_MIN || tp[i] > X_FLOAT_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_FLOAT);
      xp = (float *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++)
	{
		int lstatus = ncx_put_float_uchar(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

int
ncx_putn_float_short(void **xpp, size_t nelems, const short *tp)
{
#if _SX && \
           X_SIZEOF_FLOAT == SIZEOF_FLOAT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  float tmp[LOOPCNT];        /* in case input is misaligned */
  float *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_FLOAT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (float *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (float) Max( X_FLOAT_MIN, Min(X_FLOAT_MAX, (float) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_FLOAT_MIN || tp[i] > X_FLOAT_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_FLOAT);
      xp = (float *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++)
	{
		int lstatus = ncx_put_float_short(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

int
ncx_putn_float_int(void **xpp, size_t nelems, const int *tp)
{
#if _SX && \
           X_SIZEOF_FLOAT == SIZEOF_FLOAT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  float tmp[LOOPCNT];        /* in case input is misaligned */
  float *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_FLOAT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (float *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (float) Max( X_FLOAT_MIN, Min(X_FLOAT_MAX, (float) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_FLOAT_MIN || tp[i] > X_FLOAT_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_FLOAT);
      xp = (float *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++)
	{
		int lstatus = ncx_put_float_int(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

#if X_SIZEOF_FLOAT == SIZEOF_FLOAT && !defined(NO_IEEE_FLOAT)
/* optimized version */
int
ncx_putn_float_float(void **xpp, size_t nelems, const float *tp)
{
#ifdef WORDS_BIGENDIAN
	(void) memcpy(*xpp, tp, nelems * X_SIZEOF_FLOAT);
# else
	swapn4b(*xpp, tp, nelems);
# endif
	*xpp = (void *)((char *)(*xpp) + nelems * X_SIZEOF_FLOAT);
	return ENOERR;
}
#elif vax
int
ncx_putn_float_float(void **xpp, size_t nfloats, const float *ip)
{
	const float *const end = ip + nfloats;

	while(ip < end)
	{
		const struct vax_single *const vsp =
			 (const struct vax_single *)ip;
		struct ieee_single *const isp = (struct ieee_single *) (*xpp);

		switch(vsp->exp){
		case 0 :
			/* all vax float with zero exponent map to zero */
			*isp = min.ieee;
			break;
		case 2 :
		case 1 :
		{
			/* These will map to subnormals */
			unsigned mantissa = (vsp->mantissa1 << 16)
					 | vsp->mantissa2;
			mantissa >>= 3 - vsp->exp;
			mantissa += (1 << (20 + vsp->exp));
			isp->mant_lo_lo = mantissa;
			isp->mant_lo_hi = mantissa >> 8;
			isp->mant_hi = mantissa >> 16;
			isp->exp_lo = 0;
			isp->exp_hi = 0;
		}
			break;
		case 0xff : /* max.s.exp */
			if( vsp->mantissa2 == max.s.mantissa2
				&& vsp->mantissa1 == max.s.mantissa1)
			{
				/* map largest vax float to ieee infinity */
				*isp = max.ieee;
				break;
			} /* else, fall thru */
		default :
		{
			unsigned exp = vsp->exp - VAX_SNG_BIAS + IEEE_SNG_BIAS;
			isp->exp_hi = exp >> 1;
			isp->exp_lo = exp;
			isp->mant_lo_lo = vsp->mantissa2;
			isp->mant_lo_hi = vsp->mantissa2 >> 8;
			isp->mant_hi = vsp->mantissa1;
		}
		}

		isp->sign = vsp->sign;

	
		ip++;
		*xpp = (char *)(*xpp) + X_SIZEOF_FLOAT;
	}
	return ENOERR;
}
#else
int
ncx_putn_float_float(void **xpp, size_t nelems, const float *tp)
{
	char *xp = *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++)
	{
		int lstatus = ncx_put_float_float(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
}

#endif
int
ncx_putn_float_double(void **xpp, size_t nelems, const double *tp)
{
#if _SX && \
           X_SIZEOF_FLOAT == SIZEOF_FLOAT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  float tmp[LOOPCNT];        /* in case input is misaligned */
  float *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_FLOAT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (float *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (float) Max( X_FLOAT_MIN, Min(X_FLOAT_MAX, (float) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_FLOAT_MIN || tp[i] > X_FLOAT_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_FLOAT);
      xp = (float *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++)
	{
		int lstatus = ncx_put_float_double(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

int
ncx_putn_float_uint(void **xpp, size_t nelems, const uint *tp)
{
#if _SX && \
           X_SIZEOF_FLOAT == SIZEOF_FLOAT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  float tmp[LOOPCNT];        /* in case input is misaligned */
  float *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_FLOAT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (float *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (float) Max( X_FLOAT_MIN, Min(X_FLOAT_MAX, (float) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_FLOAT_MIN || tp[i] > X_FLOAT_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_FLOAT);
      xp = (float *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++)
	{
		int lstatus = ncx_put_float_uint(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

int
ncx_putn_float_longlong(void **xpp, size_t nelems, const longlong *tp)
{
#if _SX && \
           X_SIZEOF_FLOAT == SIZEOF_FLOAT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  float tmp[LOOPCNT];        /* in case input is misaligned */
  float *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_FLOAT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (float *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (float) Max( X_FLOAT_MIN, Min(X_FLOAT_MAX, (float) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_FLOAT_MIN || tp[i] > X_FLOAT_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_FLOAT);
      xp = (float *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++)
	{
		int lstatus = ncx_put_float_longlong(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

int
ncx_putn_float_ulonglong(void **xpp, size_t nelems, const ulonglong *tp)
{
#if _SX && \
           X_SIZEOF_FLOAT == SIZEOF_FLOAT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  float tmp[LOOPCNT];        /* in case input is misaligned */
  float *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_FLOAT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (float *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (float) Max( X_FLOAT_MIN, Min(X_FLOAT_MAX, (float) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_FLOAT_MIN || tp[i] > X_FLOAT_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_FLOAT);
      xp = (float *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++)
	{
		int lstatus = ncx_put_float_ulonglong(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}


/* double */

int
ncx_getn_double_schar(const void **xpp, size_t nelems, schar *tp)
{
#if _SX && \
           X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* in case input is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_DOUBLE);
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (schar) Max( SCHAR_MIN, Min(SCHAR_MAX, (schar) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < SCHAR_MIN || xp[i] > SCHAR_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (double *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		const int lstatus = ncx_get_double_schar(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

int
ncx_getn_double_uchar(const void **xpp, size_t nelems, uchar *tp)
{
#if _SX && \
           X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* in case input is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_DOUBLE);
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (uchar) Max( UCHAR_MIN, Min(UCHAR_MAX, (uchar) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < UCHAR_MIN || xp[i] > UCHAR_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (double *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		const int lstatus = ncx_get_double_uchar(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

int
ncx_getn_double_short(const void **xpp, size_t nelems, short *tp)
{
#if _SX && \
           X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* in case input is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_DOUBLE);
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (short) Max( SHORT_MIN, Min(SHORT_MAX, (short) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < SHORT_MIN || xp[i] > SHORT_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (double *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		const int lstatus = ncx_get_double_short(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

int
ncx_getn_double_int(const void **xpp, size_t nelems, int *tp)
{
#if _SX && \
           X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* in case input is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_DOUBLE);
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (int) Max( INT_MIN, Min(INT_MAX, (int) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < INT_MIN || xp[i] > INT_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (double *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		const int lstatus = ncx_get_double_int(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

int
ncx_getn_double_float(const void **xpp, size_t nelems, float *tp)
{
#if _SX && \
           X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* in case input is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_DOUBLE);
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (float) Max( FLOAT_MIN, Min(FLOAT_MAX, (float) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < FLOAT_MIN || xp[i] > FLOAT_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (double *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		const int lstatus = ncx_get_double_float(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

int
ncx_getn_double_uint(const void **xpp, size_t nelems, uint *tp)
{
#if _SX && \
           X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* in case input is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_DOUBLE);
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (uint) Max( UINT_MIN, Min(UINT_MAX, (uint) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < UINT_MIN || xp[i] > UINT_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (double *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		const int lstatus = ncx_get_double_uint(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

int
ncx_getn_double_longlong(const void **xpp, size_t nelems, longlong *tp)
{
#if _SX && \
           X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* in case input is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_DOUBLE);
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (longlong) Max( LONGLONG_MIN, Min(LONGLONG_MAX, (longlong) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < LONGLONG_MIN || xp[i] > LONGLONG_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (double *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		const int lstatus = ncx_get_double_longlong(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

int
ncx_getn_double_ulonglong(const void **xpp, size_t nelems, ulonglong *tp)
{
#if _SX && \
           X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* in case input is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, ni*SIZEOF_DOUBLE);
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (ulonglong) Max( ULONGLONG_MIN, Min(ULONGLONG_MAX, (ulonglong) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += xp[i] < ULONGLONG_MIN || xp[i] > ULONGLONG_MAX;
    }
   /* update xpp and tp */
    if (realign) xp = (double *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		const int lstatus = ncx_get_double_ulonglong(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#  endif
}

#if X_SIZEOF_DOUBLE == SIZEOF_DOUBLE && !defined(NO_IEEE_FLOAT)
/* optimized version */
int
ncx_getn_double_double(const void **xpp, size_t nelems, double *tp)
{
#ifdef WORDS_BIGENDIAN
	(void) memcpy(tp, *xpp, nelems * sizeof(double));
# else
	swapn8b(tp, *xpp, nelems);
# endif
	*xpp = (const void *)((const char *)(*xpp) + nelems * X_SIZEOF_DOUBLE);
	return ENOERR;
}
#elif vax
int
ncx_getn_double_double(const void **xpp, size_t ndoubles, double *ip)
{
	double *const end = ip + ndoubles;

	while(ip < end)
	{
	struct vax_double *const vdp =
			 (struct vax_double *)ip;
	const struct ieee_double *const idp =
			 (const struct ieee_double *) (*xpp);
	{
		const struct dbl_limits *lim;
		int ii;
		for (ii = 0, lim = dbl_limits;
			ii < sizeof(dbl_limits)/sizeof(struct dbl_limits);
			ii++, lim++)
		{
			if ((idp->mant_lo == lim->ieee.mant_lo)
				&& (idp->mant_4 == lim->ieee.mant_4)
				&& (idp->mant_5 == lim->ieee.mant_5)
				&& (idp->mant_6 == lim->ieee.mant_6)
				&& (idp->exp_lo == lim->ieee.exp_lo)
				&& (idp->exp_hi == lim->ieee.exp_hi)
				)
			{
				*vdp = lim->d;
				goto doneit;
			}
		}
	}
	{
		unsigned exp = idp->exp_hi << 4 | idp->exp_lo;
		vdp->exp = exp - IEEE_DBL_BIAS + VAX_DBL_BIAS;
	}
	{
		unsigned mant_hi = ((idp->mant_6 << 16)
				 | (idp->mant_5 << 8)
				 | idp->mant_4);
		unsigned mant_lo = SWAP4(idp->mant_lo);
		vdp->mantissa1 = (mant_hi >> 13);
		vdp->mantissa2 = ((mant_hi & MASK(13)) << 3)
				| (mant_lo >> 29);
		vdp->mantissa3 = (mant_lo >> 13);
		vdp->mantissa4 = (mant_lo << 3);
	}
	doneit:
		vdp->sign = idp->sign;

		ip++;
		*xpp = (char *)(*xpp) + X_SIZEOF_DOUBLE;
	}
	return ENOERR;
}
	/* vax */
#else
int
ncx_getn_double_double(const void **xpp, size_t nelems, double *tp)
{
	const char *xp = *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		const int lstatus = ncx_get_double_double(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
}

#endif

int
ncx_putn_double_schar(void **xpp, size_t nelems, const schar *tp)
{
#if _SX && \
           X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* in case input is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (double) Max( X_DOUBLE_MIN, Min(X_DOUBLE_MAX, (double) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_DOUBLE_MIN || tp[i] > X_DOUBLE_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_DOUBLE);
      xp = (double *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		int lstatus = ncx_put_double_schar(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

int
ncx_putn_double_uchar(void **xpp, size_t nelems, const uchar *tp)
{
#if _SX && \
           X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* in case input is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (double) Max( X_DOUBLE_MIN, Min(X_DOUBLE_MAX, (double) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_DOUBLE_MIN || tp[i] > X_DOUBLE_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_DOUBLE);
      xp = (double *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		int lstatus = ncx_put_double_uchar(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

int
ncx_putn_double_short(void **xpp, size_t nelems, const short *tp)
{
#if _SX && \
           X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* in case input is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (double) Max( X_DOUBLE_MIN, Min(X_DOUBLE_MAX, (double) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_DOUBLE_MIN || tp[i] > X_DOUBLE_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_DOUBLE);
      xp = (double *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		int lstatus = ncx_put_double_short(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

int
ncx_putn_double_int(void **xpp, size_t nelems, const int *tp)
{
#if _SX && \
           X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* in case input is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (double) Max( X_DOUBLE_MIN, Min(X_DOUBLE_MAX, (double) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_DOUBLE_MIN || tp[i] > X_DOUBLE_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_DOUBLE);
      xp = (double *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		int lstatus = ncx_put_double_int(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

int
ncx_putn_double_float(void **xpp, size_t nelems, const float *tp)
{
#if _SX && \
           X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* in case input is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (double) Max( X_DOUBLE_MIN, Min(X_DOUBLE_MAX, (double) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_DOUBLE_MIN || tp[i] > X_DOUBLE_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_DOUBLE);
      xp = (double *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		int lstatus = ncx_put_double_float(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

int
ncx_putn_double_uint(void **xpp, size_t nelems, const uint *tp)
{
#if _SX && \
           X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* in case input is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (double) Max( X_DOUBLE_MIN, Min(X_DOUBLE_MAX, (double) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_DOUBLE_MIN || tp[i] > X_DOUBLE_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_DOUBLE);
      xp = (double *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		int lstatus = ncx_put_double_uint(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

int
ncx_putn_double_longlong(void **xpp, size_t nelems, const longlong *tp)
{
#if _SX && \
           X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* in case input is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (double) Max( X_DOUBLE_MIN, Min(X_DOUBLE_MAX, (double) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_DOUBLE_MIN || tp[i] > X_DOUBLE_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_DOUBLE);
      xp = (double *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		int lstatus = ncx_put_double_longlong(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

int
ncx_putn_double_ulonglong(void **xpp, size_t nelems, const ulonglong *tp)
{
#if _SX && \
           X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* in case input is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (double) Max( X_DOUBLE_MIN, Min(X_DOUBLE_MAX, (double) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] < X_DOUBLE_MIN || tp[i] > X_DOUBLE_MAX;
    }
   /* copy workspace back if necessary */ 
    if (realign) {
      memcpy(*xpp, tmp, ni*X_SIZEOF_DOUBLE);
      xp = (double *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? ENOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		int lstatus = ncx_put_double_ulonglong(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

#if X_SIZEOF_DOUBLE == SIZEOF_DOUBLE && !defined(NO_IEEE_FLOAT)
/* optimized version */
int
ncx_putn_double_double(void **xpp, size_t nelems, const double *tp)
{
#ifdef WORDS_BIGENDIAN
	(void) memcpy(*xpp, tp, nelems * X_SIZEOF_DOUBLE);
# else
	swapn8b(*xpp, tp, nelems);
# endif
	*xpp = (void *)((char *)(*xpp) + nelems * X_SIZEOF_DOUBLE);
	return ENOERR;
}
#elif vax
int
ncx_putn_double_double(void **xpp, size_t ndoubles, const double *ip)
{
	const double *const end = ip + ndoubles;

	while(ip < end)
	{
	const struct vax_double *const vdp = 
			(const struct vax_double *)ip;
	struct ieee_double *const idp =
			 (struct ieee_double *) (*xpp);

	if ((vdp->mantissa4 > (dbl_limits[0].d.mantissa4 - 3)) &&
		(vdp->mantissa3 == dbl_limits[0].d.mantissa3) &&
		(vdp->mantissa2 == dbl_limits[0].d.mantissa2) &&
		(vdp->mantissa1 == dbl_limits[0].d.mantissa1) &&
		(vdp->exp == dbl_limits[0].d.exp))
	{
		*idp = dbl_limits[0].ieee;
		goto shipit;
	}
	if ((vdp->mantissa4 == dbl_limits[1].d.mantissa4) &&
		(vdp->mantissa3 == dbl_limits[1].d.mantissa3) &&
		(vdp->mantissa2 == dbl_limits[1].d.mantissa2) &&
		(vdp->mantissa1 == dbl_limits[1].d.mantissa1) &&
		(vdp->exp == dbl_limits[1].d.exp))
	{
		*idp = dbl_limits[1].ieee;
		goto shipit;
	}

	{
		unsigned exp = vdp->exp - VAX_DBL_BIAS + IEEE_DBL_BIAS;

		unsigned mant_lo = ((vdp->mantissa2 & MASK(3)) << 29) |
			(vdp->mantissa3 << 13) |
			((vdp->mantissa4 >> 3) & MASK(13));

		unsigned mant_hi = (vdp->mantissa1 << 13)
				 | (vdp->mantissa2 >> 3);

		if((vdp->mantissa4 & 7) > 4)
		{
			/* round up */
			mant_lo++;
			if(mant_lo == 0)
			{
				mant_hi++;
				if(mant_hi > 0xffffff)
				{
					mant_hi = 0;
					exp++;
				}
			}
		}

		idp->mant_lo = SWAP4(mant_lo);
		idp->mant_6 = mant_hi >> 16;
		idp->mant_5 = (mant_hi & 0xff00) >> 8;
		idp->mant_4 = mant_hi;
		idp->exp_hi = exp >> 4;
		idp->exp_lo = exp;
	}
		
	shipit:
		idp->sign = vdp->sign;

		ip++;
		*xpp = (char *)(*xpp) + X_SIZEOF_DOUBLE;
	}
	return ENOERR;
}
	/* vax */
#else
int
ncx_putn_double_double(void **xpp, size_t nelems, const double *tp)
{
	char *xp = *xpp;
	int status = ENOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		int lstatus = ncx_put_double_double(xp, tp);
		if(lstatus != ENOERR)
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
}

#endif


/*
 * Other aggregate conversion functions.
 */

/* text */

int
ncx_getn_text(const void **xpp, size_t nelems, char *tp)
{
	(void) memcpy(tp, *xpp, nelems);
	*xpp = (void *)((char *)(*xpp) + nelems);
	return ENOERR;

}

int
ncx_pad_getn_text(const void **xpp, size_t nelems, char *tp)
{
	size_t rndup = nelems % X_ALIGN;

	if(rndup)
		rndup = X_ALIGN - rndup;

	(void) memcpy(tp, *xpp, nelems);
	*xpp = (void *)((char *)(*xpp) + nelems + rndup);

	return ENOERR;

}

int
ncx_putn_text(void **xpp, size_t nelems, const char *tp)
{
	(void) memcpy(*xpp, tp, nelems);
	*xpp = (void *)((char *)(*xpp) + nelems);

	return ENOERR;

}

int
ncx_pad_putn_text(void **xpp, size_t nelems, const char *tp)
{
	size_t rndup = nelems % X_ALIGN;

	if(rndup)
		rndup = X_ALIGN - rndup;

	(void) memcpy(*xpp, tp, nelems);
	*xpp = (void *)((char *)(*xpp) + nelems);

	if(rndup)
	{
		(void) memcpy(*xpp, nada, rndup);
		*xpp = (void *)((char *)(*xpp) + rndup);
	}
	
	return ENOERR;

}


/* opaque */

int
ncx_getn_void(const void **xpp, size_t nelems, void *tp)
{
	(void) memcpy(tp, *xpp, nelems);
	*xpp = (void *)((char *)(*xpp) + nelems);
	return ENOERR;

}

int
ncx_pad_getn_void(const void **xpp, size_t nelems, void *tp)
{
	size_t rndup = nelems % X_ALIGN;

	if(rndup)
		rndup = X_ALIGN - rndup;

	(void) memcpy(tp, *xpp, nelems);
	*xpp = (void *)((char *)(*xpp) + nelems + rndup);

	return ENOERR;

}

int
ncx_putn_void(void **xpp, size_t nelems, const void *tp)
{
	(void) memcpy(*xpp, tp, nelems);
	*xpp = (void *)((char *)(*xpp) + nelems);

	return ENOERR;

}

int
ncx_pad_putn_void(void **xpp, size_t nelems, const void *tp)
{
	size_t rndup = nelems % X_ALIGN;

	if(rndup)
		rndup = X_ALIGN - rndup;

	(void) memcpy(*xpp, tp, nelems);
	*xpp = (void *)((char *)(*xpp) + nelems);

	if(rndup)
	{
		(void) memcpy(*xpp, nada, rndup);
		*xpp = (void *)((char *)(*xpp) + rndup);
	}
	
	return ENOERR;

}