/*
    libfame - Fast Assembly MPEG Encoder Library
    Copyright (C) 2000-2001 Vivien Chappelier
                            Damien Vincent

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Library General Public
    License as published by the Free Software Foundation; either
    version 2 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Library General Public License for more details.

    You should have received a copy of the GNU Library General Public
    License along with this library; if not, write to the Free
    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/**************************** half-pixel interpolation ***********************/

static short const _mmx_one[] = { 1, 1, 1, 1 };
static unsigned char const _mmx_one_byte[] = {1,1,1,1,1,1,1,1};


static void inline mmx_interpolate_signed(unsigned char **ref,
					  int pitch,
					  int rounding)
{
  int dummy = 0;
  short _mmx_rc[4]; /* rounding control */
  register short *rc = _mmx_rc;

  _mmx_rc[0] = 1 - rounding;
  _mmx_rc[1] = 1 - rounding;
  _mmx_rc[2] = 1 - rounding;
  _mmx_rc[3] = 1 - rounding;

  asm volatile ("movl (%0), %3\n"          /* %3 = ref[0] */
		"pxor %%mm7, %%mm7\n"      /* mm7 = 0 */
		"movq (%3), %%mm0\n"       /* mm0 = [ref] */
		"movq 1(%3), %%mm1\n"      /* mm1 = [ref+1] */
		"movq %%mm0, %%mm2\n"      /* mm2 = mm0 */
		"movq %%mm1, %%mm3\n"      /* mm3 = mm1 */
		"punpcklbw %%mm7, %%mm0\n" /* mm0 = ref[0-3]w */
		"punpcklbw %%mm7, %%mm1\n" /* mm1 = ref[1-4]w */
		"punpckhbw %%mm7, %%mm2\n" /* mm2 = ref[4-7]w */
		"punpckhbw %%mm7, %%mm3\n" /* mm3 = ref[5-8]w */
		"movq %%mm0, %%mm4\n"      /* save ref[0-3] */
		"movq %%mm2, %%mm5\n"      /* save ref[4-7] */
		"paddw %%mm1, %%mm0\n"     /* mm0 = ref[0-3] + ref[1-4] */
		"paddw %%mm3, %%mm2\n"     /* mm2 = ref[4-7] + ref[5-8] */
		"paddw (%2), %%mm0\n"      /* add 1 - residual */
		"paddw (%2), %%mm2\n"      /* add 1 - residual */
		"psrlw $1, %%mm2\n"        /* divide by 2 */
		"psrlw $1, %%mm0\n"        /* divide by 2 */
		"packuswb %%mm2, %%mm0\n"  /* pack to byte and saturate */
		"movq (%3, %1), %%mm1\n"   /* mm1 = [ref+1 line] */
		"movq 1(%3, %1), %%mm3\n"  /* mm3 = [ref+1 line+1] */
		"movq %%mm1, %%mm2\n"      /* mm2 = mm1 */
		"movq %%mm3, %%mm6\n"      /* mm6 = mm3 */
		"punpcklbw %%mm7, %%mm1\n" /* mm1 = ref[0-3+1line] */
		"punpcklbw %%mm7, %%mm3\n" /* mm3 = ref[1-4+1line] */
		"punpckhbw %%mm7, %%mm2\n" /* mm2 = ref[4-7+1line] */
		"punpckhbw %%mm7, %%mm6\n" /* mm6 = ref[5-8+1line] */
		"paddw %%mm1, %%mm4\n"     /* mm4 = ref[0-3]+ref[0-3+1l] */
		"paddw %%mm2, %%mm5\n"     /* mm5 = ref[4-7]+ref[4-7+1l] */
		"paddw (%2), %%mm4\n"      /* add 1 - residual */
		"paddw (%2), %%mm5\n"      /* add 1 - residual */
		"paddw %%mm4, %%mm3\n"     /* mm3 = ref00+ref10+ref11+1-r 0-3*/
		"paddw %%mm5, %%mm6\n"     /* mm6 = ref00+ref10+ref11+1-r 4-7*/
		"psrlw $1, %%mm4\n"        /* divide by 2 */
		"psrlw $1, %%mm5\n"        /* divide by 2 */
		"paddw " ASMSYM "_mmx_one, %%mm3\n"  /* add 1 */
		"paddw " ASMSYM "_mmx_one, %%mm6\n"  /* add 1 */
		"packuswb %%mm5, %%mm4\n"  /* pack to byte and saturate */
		"movq 1(%3), %%mm1\n"      /* mm1 = [ref+1] */
		"movq %%mm1, %%mm2\n"      /* mm2 = mm1 */
		"punpcklbw %%mm7, %%mm1\n" /* mm0 = ref[1-4] */
		"punpckhbw %%mm7, %%mm2\n" /* mm2 = ref[5-8] */
		"paddw %%mm1, %%mm3\n"     /* mm3 = all ref +2-r 0-3*/
		"paddw %%mm2, %%mm6\n"     /* mm6 = all ref +2-r 4-7*/
		"psrlw $2, %%mm3\n"        /* divide by 4 */
		"psrlw $2, %%mm6\n"        /* divide by 4 */
		"packuswb %%mm6, %%mm3\n"  /* pack to byte and saturate */
		"movl 4(%0), %3\n"         /* %3 = ref[1] */
		"movq %%mm0, (%3)\n"       /* store in frame */
		"movl 8(%0), %3\n"         /* %3 = ref[2] */
		"movq %%mm4, (%3)\n"       /* store in frame */
		"movl 12(%0), %3\n"        /* %3 = ref[3] */
		"movq %%mm3, (%3)\n"       /* store in frame */
		: "=r"(ref), "=r"(pitch), "=r"(rc), "=r"(dummy)
		: "0"(ref), "1"(pitch), "2"(rc), "3"(dummy)
		: "memory");
}


static void inline mmx_interpolate_unsigned(unsigned char **ref,
					    int pitch,
					    int rounding)
{
  int dummy = 0;
  short _mmx_rc[4]; /* rounding control */
  register short *rc = _mmx_rc;

  _mmx_rc[0] = 1 - rounding;
  _mmx_rc[1] = 1 - rounding;
  _mmx_rc[2] = 1 - rounding;
  _mmx_rc[3] = 1 - rounding;

  asm volatile (
		"movl (%0), %3\n"          /* %3 = ref[0] */

		"prefetcht0 8(%3)\n"
		"prefetcht0 8(%3, %1)\n"

		"movq (%3), %%mm0\n"       /* mm0 = [ref] */
		"movq 1(%3), %%mm1\n"      /* mm1 = [ref+1] */
		"movq %%mm0, %%mm5\n"
		"pavgb %%mm1, %%mm5\n"

		"movq (%3, %1), %%mm2\n"   /* mm2 = [ref+1 line] */
		"movq 1(%3, %1), %%mm3\n"  /* mm3 = [ref+1 line+1] */
		"movq %%mm2, %%mm6\n"
		"pavgb %%mm0, %%mm6\n"

		"movq %%mm3, %%mm7\n"
		"pavgb %%mm2, %%mm7\n"
		"pavgb %%mm5, %%mm7\n"
		"pand %%mm1, %%mm0\n"
		"pand %%mm3, %%mm2\n"
		"pxor %%mm2, %%mm0\n"
		"pand _mmx_one_byte, %%mm0\n"
		"paddb %%mm0, %%mm7\n"

		"movl 4(%0), %3\n"         /* %3 = ref[1] */
		"movq %%mm5, (%3)\n"       /* store in frame */
		"movl 8(%0), %3\n"         /* %3 = ref[2] */
		"movq %%mm6, (%3)\n"       /* store in frame */
		"movl 12(%0), %3\n"        /* %3 = ref[3] */
		"movq %%mm7, (%3)\n"       /* store in frame */
		: "=r"(ref), "=r"(pitch), "=r"(rc), "=r"(dummy)
		: "0"(ref), "1"(pitch), "2"(rc), "3"(dummy)
		: "memory");
}

/*  half_interpolate                                                         */
/*                                                                           */
/*  Description:                                                             */
/*    Compute half-pel resolution frames from reference frame.               */
/*                                                                           */
/*  Arguments:                                                               */
/*    fame_encoder_t *encoder: the encoder                                   */
/*                                                                           */
/*  Return value:                                                            */
/*    None.                                                                  */

static void inline half_interpolate(int width,
				    int height,
				    fame_yuv_t **ref,
				    int rounding)
{
  int x, y, w, h;
  int pitch;
  unsigned char *planes[4];

  /* Note: reference frame is allocated a little larger */
  /*       to allow overflow when doing interpolation   */
  /*       and thus avoid a special case for borders.   */
  /* Note: interpolation is done for Cr and Cb components   */
  /*       too, as it is used when computing the difference */
  /*       in motion compensation, even if it's not used    */
  /*       in motion estimation.                            */

  /* Y component */
  w = width >> 3;
  h = height;
  planes[0] = ref[0]->y;
  planes[1] = ref[1]->y;
  planes[2] = ref[2]->y;
  planes[3] = ref[3]->y;
  pitch = width;

  for(y = 0; y < h; y++) {
    for(x = 0; x < w; x++) {
      mmx_interpolate_unsigned(planes, pitch, rounding);
      planes[0]+=8;
      planes[1]+=8;
      planes[2]+=8;
      planes[3]+=8;
    }
  }

  /* U component */
  w = width >> 4;
  h = height >> 1;
  planes[0] = ref[0]->u;
  planes[1] = ref[1]->u;
  planes[2] = ref[2]->u;
  planes[3] = ref[3]->u;
  pitch = width >> 1;

  for(y = 0; y < h; y++) {
    for(x = 0; x < w; x++) {
      mmx_interpolate_signed(planes, pitch, rounding);
      planes[0]+=8;
      planes[1]+=8;
      planes[2]+=8;
      planes[3]+=8;
    }
  }

  /* V component */
  w = width >> 4;
  h = height >> 1;
  planes[0] = ref[0]->v;
  planes[1] = ref[1]->v;
  planes[2] = ref[2]->v;
  planes[3] = ref[3]->v;
  pitch = width >> 1;

  for(y = 0; y < h; y++) {
    for(x = 0; x < w; x++) {
      mmx_interpolate_signed(planes, pitch, rounding);
      planes[0]+=8;
      planes[1]+=8;
      planes[2]+=8;
      planes[3]+=8;
    }
  }
}
