From: dstamp@watserv1.waterloo.edu (Dave Stampe-Psy+Eng)
Subject: Fast VGA poly blitter code (Code included)
Date: Thu, 31 Oct 1991 05:54:20 GMT
Message-ID: <1991Oct31.055420.22279@watserv1.waterloo.edu>
Organization: University of Waterloo



I've had quite a few requests to look at the fast VGA poly blitter
code.  It's not done by any means, but this is what I have so far.
You'll notice that poly timing is done by subtracting a dummy call
time from that of the poly drawing call: this gives a better 
estimate of the poly code speed without the C call, procedure
and test parameter generation time.  Obviously a general poly
blitter with clipping will run a bit slower because of added
interface code, but right now fine timing is critical, as this
part of the blitter is called many times.

Timing as of now (on my Paradise VGA card (pretty slow one) and
a 486/25) is 6400 24x24 triangles or 4800 24x24 trapezoids per
second.  Thus, trapezoids are about 50% faster per pixel than
the triangles.  

The code is compiled with Borland C++ or Turbo C++ (others may
need rewrites).  Note the inline assembler: this will be moved
to a separate .asm file in the future, but this style seems to
work well for development.

Please contact me if you have any questions.  More later.

--------------------- fpoly.c --------------------------

#pragma inline

#include <bios.h>
#include <dos.h>
#include <stdio.h>
#include <conio.h>
#include <graphics.h>

union REGS regs;		/* register block passed to int86() in main() */


#define PUT 0			/* defines of write modes */
#define AND 1			/* (setup_hdwe() shifts the chosen value */
#define OR  2			/*  left 3 bits before sending it to the */
#define XOR 3			/*  controller; only PUT is used here)   */

/* NOTE: these two initialisers are evaluated BEFORE the #define VGA   */
/* below, so VGA here is still whatever <graphics.h> provides          */
/* (presumably the BGI driver id - confirm), not the port address.     */
int gdriver = VGA;
int gmode = VGAHI;

/* From this line on, VGA means the I/O port used by the asm blocks.   */
#define VGA 0x3CE		/* VGA controller port address */

int vmode = 0x0d;		/* 320x200x16 colors */

/* Per-pixel-column lookup tables filled in by make_data():            */
/*   stmask[x] = byte mask for a span that STARTS at pixel x           */
/*   fnmask[x] = byte mask for a span that ENDS at pixel x             */
/* 320 entries, one per x, so the drawing loop can index by x directly */
/* instead of masking x down to 0..7 first.                            */
unsigned char stmask[320];	/* start, end mask fast lookup arrays */
unsigned char fnmask[320];

/* The 8 distinct masks, indexed by (x & 7).  Bit 7 is the leftmost    */
/* pixel of a byte: smask keeps the pixel and everything to its right, */
/* emask keeps the pixel and everything to its left.                   */
unsigned char smask[] = { 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01 };
unsigned char emask[] = { 0x80, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE, 0xFF };


make_data()			/* build the stmask[] / fnmask[] tables */
{				/* (would be code-segment tables in a pure asm version) */
 int x = 0;			/* pixel column being filled in */

 /* Expand the two 8-entry masks to one entry per pixel column so the  */
 /* blitter can index them with the raw x coordinate.                  */
 while(x<320)
  {
   int bit = x&7;		/* pixel position inside its byte */

   stmask[x] = smask[bit];
   fnmask[x] = emask[bit];
   x++;
  }
}



/* Benchmark driver.                                                    */
/* Times 290*170 = 49300 calls of the dummy dpoly() first, then the    */
/* same number of trpoly() calls for triangles and for trapezoids, and */
/* prints each drawing time with the dummy time subtracted so only the */
/* blitter itself is measured.  biostime() ticks are converted to      */
/* seconds by dividing by 18.2.  Returns 0 to the caller.              */
main()
{
 long btime;			/* biostime() reading at the start of a run */
 float mtime;			/* seconds taken by the dummy dpoly() run   */
 int i,k;

 initgraph(&gdriver,&gmode,"");
 regs.h.ah = 0;			/* set video mode */
 regs.h.al = vmode;		/* as driver doesn't sppt 320x200 */
 int86(0x10,&regs,&regs);

 make_data();                   /* create dummy asm tables */

 btime = biostime(0,0L);        /* dummy timer to find interface time */
 for(i=0;i<290;i++)
  for(k=0;k<170;k++)
    dpoly(i+20, i+20, i, i+24, k, 24+k, (i+k)%16);
 mtime = (float)(biostime(0,0L)-btime)/18.2;

 setup_hdwe(PUT);               /* setup VGA hardware */
 btime = biostime(0,0L);        /* draw 49300 24x24 triangles */
 for(i=0;i<290;i++)             /* of 288 pixels ea.          */
  for(k=0;k<170;k++)            /* x1==x2 collapses the top edge to a point */
    trpoly(i+20, i+20, i, i+24, k, 24+k, (i+k)%16);
 reset_hdwe();                  /* reset VGA hardware */

 printf("Triangle blits: %f\n", (float)(biostime(0,0L)-btime)/18.2-mtime);

 setup_hdwe(PUT);
 btime = biostime(0,0L);        /* draw 49300 24x24 trapezoids */
 for(i=0;i<290;i++)             /* of 576 pixels each          */
  for(k=0;k<170;k++)
    trpoly(i+7, i+30, i, i+25, k, 24+k, (i+k)%16);
 reset_hdwe();

 printf("Trapezoidal blits: %f\n", (float)(biostime(0,0L)-btime)/18.2-mtime);

 getch();                       /* wait for a key before leaving graphics */
 textmode(-1);                  /* -1: presumably conio's LASTMODE - confirm */
 return 0;                      /* defined exit status instead of falling off the end */
}



/* Program the VGA controller at port VGA for polygon filling.          */
/*   mode: one of PUT / AND / OR / XOR (0..3).                          */
/* Every "out dx,ax" below sends a 16-bit word to port VGA with the     */
/* register index in AL and the value for that register in AH.          */
/* Registers written, in order: 3, 5, 7, 8, 1.                          */
/* Call once before a batch of trpoly() calls; reset_hdwe() undoes it.  */
setup_hdwe(int mode)       /* set VGA to draw in desired mode */
{                          /* do ONCE for all polys */
 asm {
	mov	dx,VGA
	mov	ah,BYTE PTR mode   /* ah = mode (low byte of the argument) */
	sal	ah,1               /* three single shifts = mode << 3      */
	sal	ah,1               /* (no .386 here, so no shift-by-3)     */
	sal	ah,1
	mov	al,03h	           /* index 3: set mode */
	out	dx,ax		   /* assumed PUT by BIOS */
	mov	ax,0B05h	   /* index 5: write mode 3, read mode 1 */
	out	dx,ax
	mov	ax,0007h	   /* index 7: 0 to CDC for 0xFF read */
	out	dx,ax
	mov	ax,0FF08h	   /* index 8: bit mask = all */
	out	dx,ax		   /* assumed 0xFF by BIOS */
	mov	ax,0FF01h	   /* index 1: ESR; byte sent is 0xFF (original */
	out	dx,ax              /* note said 0x0F - presumably only the low  */
                                   /* 4 bits are used by the hardware; confirm) */
     }
}



reset_hdwe()              /* reset VGA to expected state after drawing */
{
 asm {
	mov	dx,VGA
	mov	ax,0000
	out	dx,ax
	mov	ax,0001
	out	dx,ax
	mov	ax,0003
	out	dx,ax
	mov	ax,0005
	out	dx,ax
     }
}




/*  1  2 */  /* draw trapezoid: horizontal top, bottom            */
/*       */  /* do it as simply as possible: stack these to build */
/* 3  4  */  /* larger polys: quad is 50% faster per pixel for    */
	     /* 24x24 than triangles (author's timing).           */
	     /* just make 2 points the same for triangle draw.    */

/* trpoly: fill scan lines y1 .. y3-1 between a left and a right side.  */
/*   x1,x2 : left / right x on the first line (y1)                      */
/*   x3,x4 : left / right x the sides are heading for at line y3        */
/*   y1,y3 : first line and end line; nothing is drawn if y3-y1 < 1     */
/*   color : low byte is sent to controller index 0                     */
/* Both sides are stepped in 16.16 fixed point (x in the high word).    */
/* Relies on setup_hdwe() having been called and on the stmask[] /      */
/* fnmask[] tables built by make_data().                                */
/* NOTE(review): no clipping is done on the first scan line, and the    */
/* left x is never clamped, so a negative left x or an x >= 320 would   */
/* index outside the mask tables - callers appear to be expected to     */
/* pass on-screen coordinates; confirm.                                 */
trpoly(int x1,int x2, int x3, int x4, int y1, int y3, int color)
{
 unsigned int vline = y1*40;  /* video line: offset in buffer (40 bytes/line) */
 long l_incr, r_incr;         /* side slopes per line, 16.16 fixed point      */
 int lines = y3-y1;           /* line counter  */

 if(lines<1)return;

 asm {
      .386
      mov	dx,VGA
      xor	al,al                       /* al = index 0 */
      mov	ah,BYTE PTR color           /* set color */
      out	dx,ax

      cld                                   /* stosb must count di upwards */

      mov	ax,0a000h		    /* set segment */
      mov	es,ax
     }

 asm {
      xor	ecx,ecx		/* compute left incrementer */
      mov	ax,x3
      sub	ax,x1           /* ax = x3-x1 */
      cwd                       /* dx = sign of ax (0 or FFFFh) */
      movsx	eax,ax
      movsx	edx,dx          /* (x3-x1)/(y3-y1) */
      shl	eax,16          /* edx:eax = (x3-x1) << 16 as a 64-bit value */
      mov	cx,lines        /* high half of ecx is already 0 */
      idiv	ecx             /* eax = slope in 16.16 */
      cmp 	eax,0           /* round up if + ( - already done) */
      jle	rnd1
      inc	eax
     }
rnd1:
 asm {
      mov	l_incr,eax

      mov	ax,x4           /* compute right incrementer */
      sub	ax,x2
      cwd
      movsx	eax,ax          /* (x4-x2)/(y3-y1) */
      movsx	edx,dx
      shl	eax,16
      mov	cx,lines
      idiv	ecx
      cmp 	eax,0           /* round up */
      jle	rnd2
      inc	eax
     }
rnd2:
 asm {
      mov	r_incr,eax

      mov	dx,x1		/* set start of left/right */
      mov	si,x2
      shl	edx,16          /* add zero frac. part     */
      shl	esi,16          /* edx = left x, esi = right x, both 16.16 */
      add	edx,08000h	/* add 0.5 to left, so it rounds up   */

      mov	bx,x1           /* faster to load reg's than to shift */
      mov	cx,x2
     }

nextline:

 /* bx=left side, cx=right side, vline=line start */
 /* edx/esi keep the 16.16 left/right positions across the whole loop */

 asm {
      mov	al,[bx+stmask]     /* compute left side */
      shr	bx,3               /* bx = byte column of left x */
      mov	di,cx              /* compute right side */
      mov	ah,[di+fnmask]     /* lookup 350 nS faster than shift */
      shr	cx,3               /* cx = byte column of right x */

      mov	di,vline
      add	di,bx              /* compute start byte  */
      sub	cx,bx		   /* number of bytes - 1 */
      jz	onebyte            /* both ends fall in the same byte */
      jc	doneline	   /* skip if L>R     */

      and	es:[di],al	   /* mask start byte */
      inc	di
      dec	cx                 /* cx = count of whole bytes in between */
				   /* cx==0 test not worth it:     */
      mov	al,0ffh		   /* faster to let REP handle 0's */
      rep	stosb		   /* fill center bytes            */

      and	es:[di],ah	   /* mask end byte */
     }
 goto doneline;

onebyte:
 asm {
      and	al,ah              /* only 1 byte to mask     */
      and	es:[di],al         /* combine start, end mask */
     }

doneline:

 asm {
      dec	WORD PTR lines
      jz	donetri            /* all lines drawn */

      mov	ax,40
      add	vline,ax           /* next scan line */

      add	edx,l_incr      /* add in slope */
      add	esi,r_incr
      mov	ebx,edx         /* throw away fraction: lt rounded up */
      sar	ebx,16          /* bx = integer left x  */
      mov	ecx,esi
      sar	ecx,16          /* cx = integer right x */
      cmp	cx,0   		/* right x gone negative? clamp it to 0:   */
      jge	nextline	/* a negative bx then makes L>R above, so  */
                                /* the line is skipped ("auto-clip")       */
      xor	cx,cx
      jmp	nextline
     }
 donetri: ;
}



/* dpoly: timing dummy with the same signature as trpoly().              */
/* main() times a run of dpoly() calls and subtracts it from the        */
/* trpoly() timings to cancel out call and argument-setup cost.         */
/* It repeats trpoly()'s locals, early-out test and first asm block     */
/* (colour write, cld, es = 0A000h) and then returns without drawing.   */
/* vline, l_incr and r_incr are not used here; presumably kept so the   */
/* function entry matches trpoly()'s - do not "clean up" without        */
/* re-checking the benchmark.                                           */
dpoly(int x1,int x2, int x3, int x4, int y1, int y3, int color)
{
 unsigned int vline = y1*40;
 long l_incr, r_incr;
 int lines = y3-y1;

 if(lines<1)return;

 asm {
      .386
      mov	dx,VGA
      xor	al,al
      mov	ah,BYTE PTR color           /* set color */
      out	dx,ax

      cld

      mov	ax,0a000h		    /* set segment */
      mov	es,ax
     }
}

---------------------- ends -----------------------


--------------------------------------------------------------------------
| My life is Hardware,                    |                              | 
| my destiny is Software,                 |         Dave Stampe          |
| my CPU is Wetware...                    |                              | 
| Anybody got a SDB I can borrow?         | dstamp@watserv1.uwaterloo.ca |
__________________________________________________________________________
