0x1 Architecture

The SwiftShader documentation at the following link gives an introduction to its architecture.
https://github.com/google/swiftshader/blob/master/docs/Index.md

The API layer is an implementation of a graphics API, such as OpenGL (ES) or Direct3D, on top of the Renderer interface. It is responsible for managing API-level resources and rendering state, as well as compiling high-level shaders to bytecode form.

The Renderer layer generates specialized processing routines for draw calls and coordinates the execution of rendering tasks. It defines the data structures used and how the processing is performed.

Reactor is an embedded language for C++ to dynamically generate code in a WYSIWYG fashion. It allows to specialize the processing routines for the state and shaders used by each draw call. Its syntax closely resembles C and shading languages, to make the code generation easily readable.

The JIT layer is a run-time compiler, such as LLVM’s JIT, or Subzero. Reactor records its operations in an in-memory intermediate form which can be materialized by the JIT into a function which can be called directly.

To achieve exceptional performance, SwiftShader is built around two major optimizations that affect its architecture: dynamic code generation, and parallel processing. Generating code at run-time makes it possible to eliminate code branches and optimize register usage, specializing the processing routines for exactly the operations required by each draw call. Parallel processing means both utilizing the CPU’s multiple cores and processing multiple elements across the width of the SIMD vector units.

0x2 Graphics Pipeline

Here is the graphics pipeline diagram from the OpenGL ES specification linked below. SwiftShader implements this pipeline by JIT-compiling it through LLVM and then executing the generated code on the CPU.
https://www.khronos.org/registry/OpenGL/specs/es/2.0/es_full_spec_2.0.pdf

0x3 JIT through LLVM

0x31 GLSL compiler frontend

SwiftShader uses glslang as its GLSL compiler frontend. It consumes GLSL source code and produces an AST, then emits its own IR by recursively traversing that AST. Here is the IR definition.

// Opcodes of the shader intermediate representation (IR).
// The first part of the list follows the order of d3d9types.h; where an entry
// is named differently from its D3DSIO_* counterpart, the trailing comment
// gives the Direct3D name. Entries from OPCODE_NULL onward use a separate
// value range starting at 0x10000000.
enum Opcode
{
	// Matches order in d3d9types.h
	OPCODE_NOP = 0,
	OPCODE_MOV,
	OPCODE_ADD,
	OPCODE_SUB,
	OPCODE_MAD,
	OPCODE_MUL,
	OPCODE_RCPX,
	OPCODE_RSQX,
	OPCODE_DP3,
	OPCODE_DP4,
	OPCODE_MIN,
	OPCODE_MAX,
	OPCODE_SLT,
	OPCODE_SGE,
	OPCODE_EXP2X,   // D3DSIO_EXP
	OPCODE_LOG2X,   // D3DSIO_LOG
	OPCODE_LIT,
	OPCODE_ATT,   // D3DSIO_DST
	OPCODE_LRP,
	OPCODE_FRC,
	OPCODE_M4X4,
	OPCODE_M4X3,
	OPCODE_M3X4,
	OPCODE_M3X3,
	OPCODE_M3X2,
	OPCODE_CALL,
	OPCODE_CALLNZ,
	OPCODE_LOOP,
	OPCODE_RET,
	OPCODE_ENDLOOP,
	OPCODE_LABEL,
	OPCODE_DCL,
	OPCODE_POWX,
	OPCODE_CRS,
	OPCODE_SGN,
	OPCODE_ABS,
	OPCODE_NRM3,   // D3DSIO_NRM
	OPCODE_SINCOS,
	OPCODE_REP,
	OPCODE_ENDREP,
	OPCODE_IF,
	OPCODE_IFC,
	OPCODE_ELSE,
	OPCODE_ENDIF,
	OPCODE_BREAK,
	OPCODE_BREAKC,
	OPCODE_MOVA,
	OPCODE_DEFB,
	OPCODE_DEFI,
	// Numbering jumps to an explicit 64 here; the entries up to OPCODE_BREAKP
	// continue sequentially from that value.
	OPCODE_TEXCOORD = 64,
	OPCODE_TEXKILL,
	OPCODE_TEX,
	OPCODE_TEXBEM,
	OPCODE_TEXBEML,
	OPCODE_TEXREG2AR,
	OPCODE_TEXREG2GB,
	OPCODE_TEXM3X2PAD,
	OPCODE_TEXM3X2TEX,
	OPCODE_TEXM3X3PAD,
	OPCODE_TEXM3X3TEX,
	OPCODE_RESERVED0,
	OPCODE_TEXM3X3SPEC,
	OPCODE_TEXM3X3VSPEC,
	OPCODE_EXPP,
	OPCODE_LOGP,
	OPCODE_CND,
	OPCODE_DEF,
	OPCODE_TEXREG2RGB,
	OPCODE_TEXDP3TEX,
	OPCODE_TEXM3X2DEPTH,
	OPCODE_TEXDP3,
	OPCODE_TEXM3X3,
	OPCODE_TEXDEPTH,
	OPCODE_CMP0,   // D3DSIO_CMP
	OPCODE_BEM,
	OPCODE_DP2ADD,
	OPCODE_DFDX,   // D3DSIO_DSX
	OPCODE_DFDY,   // D3DSIO_DSY
	OPCODE_TEXLDD,
	OPCODE_CMP,   // D3DSIO_SETP
	OPCODE_TEXLDL,
	OPCODE_BREAKP,
	// Entries with fixed 16-bit values at the top of the 0xFFFF range.
	OPCODE_PHASE = 0xFFFD,
	OPCODE_COMMENT = 0xFFFE,
	OPCODE_END = 0xFFFF,
	// Fixed 32-bit values; going by the names these are pixel shader (PS) and
	// vertex shader (VS) version tokens - NOTE(review): confirm against d3d9types.h.
	OPCODE_PS_1_0 = 0xFFFF0100,
	OPCODE_PS_1_1 = 0xFFFF0101,
	OPCODE_PS_1_2 = 0xFFFF0102,
	OPCODE_PS_1_3 = 0xFFFF0103,
	OPCODE_PS_1_4 = 0xFFFF0104,
	OPCODE_PS_2_0 = 0xFFFF0200,
	OPCODE_PS_2_x = 0xFFFF0201,
	OPCODE_PS_3_0 = 0xFFFF0300,
	OPCODE_VS_1_0 = 0xFFFE0100,
	OPCODE_VS_1_1 = 0xFFFE0101,
	OPCODE_VS_2_0 = 0xFFFE0200,
	OPCODE_VS_2_x = 0xFFFE0201,
	OPCODE_VS_2_sw = 0xFFFE02FF,
	OPCODE_VS_3_0 = 0xFFFE0300,
	OPCODE_VS_3_sw = 0xFFFE03FF,
	// Everything below is numbered sequentially from 0x10000000; presumably
	// these are opcodes with no d3d9types.h counterpart - confirm.
	OPCODE_NULL = 0x10000000,   // Dead instruction, to be eliminated
	OPCODE_WHILE,
	OPCODE_ENDWHILE,
	OPCODE_COS,
	OPCODE_SIN,
	OPCODE_TAN,
	OPCODE_ACOS,
	OPCODE_ASIN,
	OPCODE_ATAN,
	OPCODE_ATAN2,
	OPCODE_COSH,
	OPCODE_SINH,
	OPCODE_TANH,
	OPCODE_ACOSH,
	OPCODE_ASINH,
	OPCODE_ATANH,
	OPCODE_DP1,
	OPCODE_DP2,
	OPCODE_TRUNC,
	OPCODE_FLOOR,
	OPCODE_ROUND,
	OPCODE_ROUNDEVEN,
	OPCODE_CEIL,
	OPCODE_SQRT,
	OPCODE_RSQ,
	OPCODE_LEN2,
	OPCODE_LEN3,
	OPCODE_LEN4,
	OPCODE_DIST1,
	OPCODE_DIST2,
	OPCODE_DIST3,
	OPCODE_DIST4,
	OPCODE_NRM2,
	OPCODE_NRM4,
	OPCODE_DIV,
	OPCODE_MOD,
	OPCODE_EXP2,
	OPCODE_LOG2,
	OPCODE_EXP,
	OPCODE_LOG,
	OPCODE_POW,
	OPCODE_F2B,   // Float to bool
	OPCODE_B2F,   // Bool to float
	OPCODE_F2I,   // Float to int
	OPCODE_I2F,   // Int to float
	OPCODE_F2U,   // Float to uint
	OPCODE_U2F,   // Uint to float
	OPCODE_I2B,   // Int to bool
	OPCODE_B2I,   // Bool to int
	OPCODE_DET2,
	OPCODE_DET3,
	OPCODE_DET4,
	OPCODE_ALL,
	OPCODE_ANY,
	OPCODE_NEG,
	OPCODE_NOT,
	OPCODE_OR,
	OPCODE_XOR,
	OPCODE_AND,
	OPCODE_EQ,
	OPCODE_NE,
	OPCODE_STEP,
	OPCODE_SMOOTH,
	OPCODE_ISNAN,
	OPCODE_ISINF,
	OPCODE_TEXOFFSET,
	OPCODE_TEXLODOFFSET,
	OPCODE_TEXELFETCH,
	OPCODE_TEXELFETCHOFFSET,
	OPCODE_TEXGRAD,
	OPCODE_TEXGRADOFFSET,
	OPCODE_TEXBIAS,
	OPCODE_TEXLOD,
	OPCODE_TEXOFFSETBIAS,
	OPCODE_TEXRECT,
	OPCODE_TEXSIZE,
	OPCODE_FLOATBITSTOINT,
	OPCODE_FLOATBITSTOUINT,
	OPCODE_INTBITSTOFLOAT,
	OPCODE_UINTBITSTOFLOAT,
	OPCODE_PACKSNORM2x16,
	OPCODE_PACKUNORM2x16,
	OPCODE_PACKHALF2x16,
	OPCODE_UNPACKSNORM2x16,
	OPCODE_UNPACKUNORM2x16,
	OPCODE_UNPACKHALF2x16,
	OPCODE_FORWARD1,
	OPCODE_FORWARD2,
	OPCODE_FORWARD3,
	OPCODE_FORWARD4,
	OPCODE_REFLECT1,
	OPCODE_REFLECT2,
	OPCODE_REFLECT3,
	OPCODE_REFLECT4,
	OPCODE_REFRACT1,
	OPCODE_REFRACT2,
	OPCODE_REFRACT3,
	OPCODE_REFRACT4,
	OPCODE_ICMP,
	OPCODE_UCMP,
	OPCODE_SELECT,
	OPCODE_EXTRACT,
	OPCODE_INSERT,
	OPCODE_DISCARD,
	OPCODE_FWIDTH,
	OPCODE_LEAVE,   // Return before the end of the function
	OPCODE_CONTINUE,
	OPCODE_TEST,   // Marks the end of the code that can be skipped by 'continue'
	OPCODE_SWITCH,
	OPCODE_ENDSWITCH,
	// Integer opcodes
	OPCODE_INEG,
	OPCODE_IABS,
	OPCODE_ISGN,
	OPCODE_IADD,
	OPCODE_ISUB,
	OPCODE_IMUL,
	OPCODE_IDIV,
	OPCODE_IMAD,
	OPCODE_IMOD,
	OPCODE_SHL,
	OPCODE_ISHR,
	OPCODE_IMIN,
	OPCODE_IMAX,
	// Unsigned integer opcodes
	OPCODE_UDIV,
	OPCODE_UMOD,
	OPCODE_USHR,
	OPCODE_UMIN,
	OPCODE_UMAX,
};

Here is the call stack showing how the IR is generated; it is produced by traversing the AST.

Once the IR is ready, SwiftShader consumes it and generates LLVM IR. Parts of the fixed-function pipeline and the graphics state are also encoded into the LLVM IR at the same time.

We will discuss the three processors: the vertex processor, the setup processor and the pixel processor. All three processors generate LLVM IR according to the graphics state.

0x31 Vertex processor

Here is a diagram of how the vertex processor produces LLVM IR.

Prepare the draw state

It prepares the draw state by updating the State structure according to the graphics state.

// Builds and returns the State describing the current draw.
//
// If the fixed-function path is in use, the transform is refreshed first and,
// when updateLighting is set, the view-space positions of the active lights.
// A fresh State is then filled from queries on `context` (shader ID, feature
// flags, per-input formats, texture-stage or sampler state, per-output write
// and clamp masks) and finally hashed.
//
// drawType: only its low 4 bits are used here, to derive verticesPerPrimitive.
// Returns the populated State by value, with state.hash already computed.
const VertexProcessor::State VertexProcessor::update(DrawType drawType)
{
		if(isFixedFunction())
		{
			updateTransform();
			// updateLighting acts as a dirty flag: it is cleared once the positions are refreshed.
			if(updateLighting)
			{
				for(int i = 0; i < 8; i++)
				{
					if(context->vertexLightActive(i))
					{
						// Light position in camera coordinates
						setLightViewPosition(i, B * V * context->getLightPosition(i));
					}
				}
				updateLighting = false;
			}
		}
		State state;
		// A shader ID of 0 stands for "no vertex shader bound".
		if(context->vertexShader)
		{
			state.shaderID = context->vertexShader->getSerialID();
		}
		else
		{
			state.shaderID = 0;
		}
		// Shader-dependent fields fall back to fixed defaults (false, Pos, Pts) when no vertex shader is bound.
		state.fixedFunction = !context->vertexShader && context->pixelShaderModel() < 0x0300;
		state.textureSampling = context->vertexShader ? context->vertexShader->containsTextureSampling() : false;
		state.positionRegister = context->vertexShader ? context->vertexShader->getPositionRegister() : Pos;
		state.pointSizeRegister = context->vertexShader ? context->vertexShader->getPointSizeRegister() : Pts;
		state.vertexBlendMatrixCount = context->vertexBlendMatrixCountActive();
		state.indexedVertexBlendEnable = context->indexedVertexBlendActive();
		state.vertexNormalActive = context->vertexNormalActive();
		state.normalizeNormals = context->normalizeNormalsActive();
		state.vertexLightingActive = context->vertexLightingActive();
		state.diffuseActive = context->diffuseActive();
		state.specularActive = context->specularActive();
		state.vertexSpecularActive = context->vertexSpecularActive();
		// Pack the active flags of lights 0..7 into a bitmask, bit i for light i.
		state.vertexLightActive = context->vertexLightActive(0) << 0 |
		                          context->vertexLightActive(1) << 1 |
		                          context->vertexLightActive(2) << 2 |
		                          context->vertexLightActive(3) << 3 |
		                          context->vertexLightActive(4) << 4 |
		                          context->vertexLightActive(5) << 5 |
		                          context->vertexLightActive(6) << 6 |
		                          context->vertexLightActive(7) << 7;
		state.vertexDiffuseMaterialSourceActive = context->vertexDiffuseMaterialSourceActive();
		state.vertexSpecularMaterialSourceActive = context->vertexSpecularMaterialSourceActive();
		state.vertexAmbientMaterialSourceActive = context->vertexAmbientMaterialSourceActive();
		state.vertexEmissiveMaterialSourceActive = context->vertexEmissiveMaterialSourceActive();
		state.fogActive = context->fogActive();
		state.vertexFogMode = context->vertexFogModeActive();
		state.rangeFogActive = context->rangeFogActive();
		state.localViewerActive = context->localViewerActive();
		state.pointSizeActive = context->pointSizeActive();
		state.pointScaleActive = context->pointScaleActive();
		state.preTransformed = context->preTransformed;
		state.superSampling = context->getSuperSampleCount() > 1;
		state.multiSampling = context->getMultiSampleCount() > 1;
		state.transformFeedbackQueryEnabled = context->transformFeedbackQueryEnabled;
		state.transformFeedbackEnabled = context->transformFeedbackEnabled;
		// Note: Quads aren't handled for verticesPerPrimitive, but verticesPerPrimitive is used for transform feedback,
		//       which is an OpenGL ES 3.0 feature, and OpenGL ES 3.0 doesn't support quads as a primitive type.
		// Yields 1, 2 or 3: one extra vertex for each of the two thresholds the masked type reaches.
		DrawType type = static_cast<DrawType>(static_cast<unsigned int>(drawType) & 0xF);
		state.verticesPerPrimitive = 1 + (type >= DRAW_LINELIST) + (type >= DRAW_TRIANGLELIST);
		// Copy the format of every vertex input; without a shader the attribute type defaults to float.
		for(int i = 0; i < MAX_VERTEX_INPUTS; i++)
		{
			state.input[i].type = context->input[i].type;
			state.input[i].count = context->input[i].count;
			state.input[i].normalized = context->input[i].normalized;
			state.input[i].attribType = context->vertexShader ? context->vertexShader->getAttribType(i) : VertexShader::ATTRIBTYPE_FLOAT;
		}
		// Without a shader, record the texture-stage state of 8 stages;
		// with a shader, record the sampler state of each sampler the shader uses.
		if(!context->vertexShader)
		{
			for(int i = 0; i < 8; i++)
			{
			//	state.textureState[i].vertexTextureActive = context->vertexTextureActive(i, 0);
				state.textureState[i].texGenActive = context->texGenActive(i);
				state.textureState[i].textureTransformCountActive = context->textureTransformCountActive(i);
				state.textureState[i].texCoordIndexActive = context->texCoordIndexActive(i);
			}
		}
		else
		{
			for(unsigned int i = 0; i < VERTEX_TEXTURE_IMAGE_UNITS; i++)
			{
				if(context->vertexShader->usesSampler(i))
				{
					// Vertex samplers are read from context->sampler at an offset of TEXTURE_IMAGE_UNITS.
					state.sampler[i] = context->sampler[TEXTURE_IMAGE_UNITS + i].samplerState();
				}
			}
		}
		// Output write masks, case 1: a vertex shader is bound, so each component's
		// write flag comes from whether the shader's corresponding output is active.
		if(context->vertexShader)   // FIXME: Also when pre-transformed?
		{
			for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
			{
				state.output[i].xWrite = context->vertexShader->getOutput(i, 0).active();
				state.output[i].yWrite = context->vertexShader->getOutput(i, 1).active();
				state.output[i].zWrite = context->vertexShader->getOutput(i, 2).active();
				state.output[i].wWrite = context->vertexShader->getOutput(i, 3).active();
			}
		}
		// Case 2: no shader, and either not pre-transformed or pixel shader model below 0x0300;
		// the masks are derived from the context's active-feature queries.
		else if(!context->preTransformed || context->pixelShaderModel() < 0x0300)
		{
			state.output[Pos].write = 0xF;
			if(context->diffuseActive() && (context->lightingEnable || context->input[Color0]))
			{
				state.output[C0].write = 0xF;
			}
			if(context->specularActive())
			{
				state.output[C1].write = 0xF;
			}
			for(int stage = 0; stage < 8; stage++)
			{
				// One write-mask bit per active texture coordinate component (0..3).
				if(context->texCoordActive(stage, 0)) state.output[T0 + stage].write |= 0x01;
				if(context->texCoordActive(stage, 1)) state.output[T0 + stage].write |= 0x02;
				if(context->texCoordActive(stage, 2)) state.output[T0 + stage].write |= 0x04;
				if(context->texCoordActive(stage, 3)) state.output[T0 + stage].write |= 0x08;
			}
			if(context->fogActive())
			{
				state.output[Fog].xWrite = true;
			}
			if(context->pointSizeActive())
			{
				state.output[Pts].yWrite = true;
			}
		}
		// Case 3: no shader, pre-transformed, and pixel shader model 0x0300 or above;
		// an output is written in full whenever the matching input stream is present.
		else
		{
			state.output[Pos].write = 0xF;
			for(int i = 0; i < 2; i++)
			{
				if(context->input[Color0 + i])
				{
					state.output[C0 + i].write = 0xF;
				}
			}
			for(int i = 0; i < 8; i++)
			{
				if(context->input[TexCoord0 + i])
				{
					state.output[T0 + i].write = 0xF;
				}
			}
			if(context->input[PointSize])
			{
				state.output[Pts].yWrite = true;
			}
		}
		// For vertex shader models below 0x0300, mark both colors and fog for clamping.
		if(context->vertexShaderModel() < 0x0300)
		{
			state.output[C0].clamp = 0xF;
			state.output[C1].clamp = 0xF;
			state.output[Fog].xClamp = true;
		}
		// The hash is computed last, after every field has been filled in.
		state.hash = state.computeHash();
		return state;
}

Generate LLVM IR

It then generates the LLVM IR through Reactor, based on the state that was updated in the previous step.

// Generates the vertex routine: a loop over the vertexCount indices read from
// `batch`, which runs each vertex through a tag-checked vertex cache and then
// writes it out to `vertex`.
//
// The capitalized Do/Until/If/Return and the UInt/Pointer types are taken to be
// Reactor constructs that become part of the generated routine, whereas the
// lowercase `if` on `state` is ordinary C++ decided while generating -
// NOTE(review): confirm against the Reactor documentation.
void VertexRoutine::generate()
{
		const bool textureSampling = state.textureSampling;
		// Locate the vertex cache inside the task, and its vertex and tag arrays.
		Pointer<Byte> cache = task + OFFSET(VertexTask,vertexCache);
		Pointer<Byte> vertexCache = cache + OFFSET(VertexCache,vertex);
		Pointer<Byte> tagCache = cache + OFFSET(VertexCache,tag);
		UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask,vertexCount));
		UInt primitiveNumber = *Pointer<UInt>(task + OFFSET(VertexTask, primitiveStart));
		UInt indexInPrimitive = 0;
		constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,constants));
		Do
		{
			UInt index = *Pointer<UInt>(batch);
			// tagIndex keeps bits 2..5 of the index (low two bits cleared).
			UInt tagIndex = index & 0x0000003C;
			// Without texture sampling the tag is the index with its low two bits cleared;
			// with texture sampling the full index is used.
			UInt indexQ = !textureSampling ? UInt(index & 0xFFFFFFFC) : index;   // FIXME: TEXLDL hack to have independent LODs, hurts performance.
			// Tag mismatch: record the new tag, process the vertex data and store the result in the cache.
			If(*Pointer<UInt>(tagCache + tagIndex) != indexQ)
			{
				*Pointer<UInt>(tagCache + tagIndex) = indexQ;
				readInput(indexQ);
				pipeline(indexQ);
				postTransform();
				computeClipFlags();
				Pointer<Byte> cacheLine0 = vertexCache + tagIndex * UInt((int)sizeof(Vertex));
				writeCache(cacheLine0);
			}
			// The low 6 bits of the index select the cache entry that is written to the output vertex.
			UInt cacheIndex = index & 0x0000003F;
			Pointer<Byte> cacheLine = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
			writeVertex(vertex, cacheLine);
			if(state.transformFeedbackEnabled != 0)
			{
				transformFeedback(vertex, primitiveNumber, indexInPrimitive);
				indexInPrimitive++;
				// NOTE(review): the wrap-around is hard-coded to 3 vertices per primitive, while
				// VertexProcessor::update() computes a verticesPerPrimitive of 1 to 3 - confirm intended.
				If(indexInPrimitive == 3)
				{
					primitiveNumber++;
					indexInPrimitive = 0;
				}
			}
			// Advance to the next output vertex and the next index in the batch.
			vertex += sizeof(Vertex);
			batch += sizeof(unsigned int);
			vertexCount--;
		}
		Until(vertexCount == 0)
		Return();
}

0x32 Setup processor

The LLVM IR generation process of the setup processor is similar to that of the vertex processor.

0x33 Pixel processor

The LLVM IR generation process of the pixel processor is similar to that of the vertex processor.

0x4 Multithread

Here is the timeline diagram showing how multithreading is supported in SwiftShader.
It splits the draw task into several batches, and each batch is executed in one thread. The vertex processor and the setup processor are executed together; the pixel processor is executed after them.

0x41 Task producer.

The task producer creates tasks in the main thread, pushes them onto the queue, and waits for a consumer to pick them up for processing.

0x42 Task consumer.

The task consumer finds a task in the queue, then executes it in a specific thread.
Here is the diagram showing how the vertex processor and the setup processor are executed in one thread.

Here is the diagram showing how the pixel processor is executed in one thread.