Optimise VBO updates.

Some drivers (at least the Intel drivers on Windows) are slow at
incrementally updating a VBO with hundreds of calls to glBufferSubData
every frame. Performance is significantly better if you use
glBufferData(NULL) to tell it to discard all the previous contents, and
then re-upload all the data at once.

Update CVertexBuffer so that GL_DYNAMIC_DRAW/GL_STREAM_DRAW buffers are
handled with the new mechanism. This requires the caller to hold onto
the backing store so it can be re-uploaded when necessary, and needs a
bit more signalling to indicate exactly what needs uploading.

I see an improvement from roughly 60 to 75 fps on Intel HD Graphics
3000, Windows, 1024x768, Siwa Oasis.

This was SVN commit r16241.
This commit is contained in:
Ykkrosh 2015-01-28 00:48:00 +00:00
parent c599f92875
commit 0ef6c7555e
12 changed files with 222 additions and 40 deletions

View file

@ -31,7 +31,7 @@
CParticleEmitter::CParticleEmitter(const CParticleEmitterTypePtr& type) :
m_Type(type), m_Active(true), m_NextParticleIdx(0), m_EmissionRoundingError(0.f),
m_LastUpdateTime(type->m_Manager.GetCurrentTime()),
m_IndexArray(GL_DYNAMIC_DRAW),
m_IndexArray(GL_STATIC_DRAW),
m_VertexArray(GL_DYNAMIC_DRAW),
m_LastFrameNumber(-1)
{
@ -171,6 +171,11 @@ void CParticleEmitter::UpdateArrayData(int frameNumber)
m_VertexArray.Upload();
}
// Make the emitter's vertex data available for subsequent binding and
// rendering by forwarding the request to m_VertexArray.
void CParticleEmitter::PrepareForRendering()
{
m_VertexArray.PrepareForRendering();
}
void CParticleEmitter::Bind(const CShaderProgramPtr& shader)
{
CLOSTexture& los = g_Renderer.GetScene().GetLOSTexture();

View file

@ -113,6 +113,11 @@ public:
*/
void UpdateArrayData(int frameNumber);
/**
* Make the vertex data available for subsequent binding and rendering.
*/
void PrepareForRendering();
/**
* Bind rendering state (textures and blend modes).
*/

View file

@ -580,6 +580,8 @@ void CMiniMap::Draw()
m_VertexArray.Upload();
}
m_VertexArray.PrepareForRendering();
if (m_EntitiesDrawn > 0)
{
#if !CONFIG2_GLES

View file

@ -226,6 +226,8 @@ void ShaderModelVertexRenderer::UpdateModelData(CModel* model, CModelRData* data
// upload everything to vertex buffer
shadermodel->m_Array.Upload();
}
shadermodel->m_Array.PrepareForRendering();
}

View file

@ -126,7 +126,7 @@ struct OverlayRendererInternals
const float OverlayRenderer::OVERLAY_VOFFSET = 0.2f;
OverlayRendererInternals::OverlayRendererInternals()
: quadVertices(GL_DYNAMIC_DRAW), quadIndices(GL_DYNAMIC_DRAW)
: quadVertices(GL_DYNAMIC_DRAW), quadIndices(GL_STATIC_DRAW)
{
quadAttributePos.elems = 3;
quadAttributePos.type = GL_FLOAT;
@ -351,6 +351,8 @@ void OverlayRenderer::PrepareForRendering()
m->quadVertices.Upload();
// don't free the backing store! we'll overwrite it on the next frame to save a reallocation.
m->quadVertices.PrepareForRendering();
}
void OverlayRenderer::RenderOverlaysBeforeWater()

View file

@ -104,6 +104,7 @@ void ParticleRenderer::PrepareForRendering(const CShaderDefines& context)
{
CParticleEmitter* emitter = m->emitters[cullGroup][i];
emitter->UpdateArrayData(m->frameNumber);
emitter->PrepareForRendering();
}
}

View file

@ -1,4 +1,4 @@
/* Copyright (C) 2012 Wildfire Games.
/* Copyright (C) 2015 Wildfire Games.
* This file is part of 0 A.D.
*
* 0 A.D. is free software: you can redistribute it and/or modify
@ -22,6 +22,7 @@
#include "lib/sysdep/rtl.h"
#include "maths/Vector3D.h"
#include "maths/Vector4D.h"
#include "ps/CLogger.h"
#include "graphics/Color.h"
#include "graphics/SColor.h"
#include "renderer/VertexArray.h"
@ -273,6 +274,10 @@ void VertexArray::Layout()
m_BackingStore = (char*)rtl_AllocateAligned(m_Stride * m_NumVertices, 16);
}
// Mark this array's chunk as needed for the next bind phase, so that the
// owning vertex buffer makes its data available on the following Bind().
void VertexArray::PrepareForRendering()
{
	// Upload() leaves m_VB null when the VBO allocation fails; there is
	// nothing to prepare in that case, so don't dereference it.
	if (!m_VB)
		return;

	m_VB->m_Owner->PrepareForRendering(m_VB);
}
// (Re-)Upload the attributes.
// Create the VBO if necessary.
@ -281,10 +286,13 @@ void VertexArray::Upload()
ENSURE(m_BackingStore);
if (!m_VB)
m_VB = g_VBMan.Allocate(m_Stride, m_NumVertices, m_Usage, m_Target);
m_VB = g_VBMan.Allocate(m_Stride, m_NumVertices, m_Usage, m_Target, m_BackingStore);
if (!m_VB) // failed to allocate VBO
if (!m_VB)
{
LOGERROR("Failed to allocate VBO for vertex array");
return;
}
m_VB->m_Owner->UpdateChunkVertices(m_VB, m_BackingStore);
}
@ -305,6 +313,9 @@ u8* VertexArray::Bind()
// Free the backing store to save some memory.
// Must not be called for arrays whose usage is a streaming mode (see
// CVertexBuffer::UseStreaming), which is enforced by the ENSURE below.
void VertexArray::FreeBackingStore()
{
// In streaming modes, the backing store must be retained
ENSURE(!CVertexBuffer::UseStreaming(m_Usage));
rtl_FreeAligned(m_BackingStore);
// Clear the pointer so the freed memory is not referenced again
m_BackingStore = 0;
}

View file

@ -176,6 +176,8 @@ public:
// (Re-)Upload the attributes of the vertex array from the backing store to
// the underlying VBO object.
void Upload();
// Make this vertex array's data available for the next series of calls to Bind
void PrepareForRendering();
// Bind this array, returns the base address for calls to glVertexPointer etc.
u8* Bind();

View file

@ -1,4 +1,4 @@
/* Copyright (C) 2013 Wildfire Games.
/* Copyright (C) 2015 Wildfire Games.
* This file is part of 0 A.D.
*
* 0 A.D. is free software: you can redistribute it and/or modify
@ -28,6 +28,12 @@
#include "VertexBufferManager.h"
#include "ps/CLogger.h"
// Absolute maximum (bytewise) size of each GL vertex buffer object.
// Make it large enough for the maximum feasible mesh size (64K vertexes,
// 32 bytes per vertex in ShaderModelRenderer).
// TODO: measure what influence this has on performance
#define MAX_VB_SIZE_BYTES (2*1024*1024)
CVertexBuffer::CVertexBuffer(size_t vertexSize, GLenum usage, GLenum target)
: m_VertexSize(vertexSize), m_Handle(0), m_SysMem(0), m_Usage(usage), m_Target(target)
{
@ -41,22 +47,22 @@ CVertexBuffer::CVertexBuffer(size_t vertexSize, GLenum usage, GLenum target)
size = std::min(size, vertexSize*65536);
}
// store max/free vertex counts
m_MaxVertices = m_FreeVertices = size / vertexSize;
// allocate raw buffer
if (g_Renderer.m_Caps.m_VBO)
{
pglGenBuffersARB(1, &m_Handle);
pglBindBufferARB(m_Target, m_Handle);
pglBufferDataARB(m_Target, size, 0, m_Usage);
pglBufferDataARB(m_Target, m_MaxVertices * m_VertexSize, 0, m_Usage);
pglBindBufferARB(m_Target, 0);
}
else
{
m_SysMem = new u8[size];
m_SysMem = new u8[m_MaxVertices * m_VertexSize];
}
// store max/free vertex counts
m_MaxVertices = m_FreeVertices = size/vertexSize;
// create sole free chunk
VBChunk* chunk = new VBChunk;
chunk->m_Owner = this;
@ -67,6 +73,9 @@ CVertexBuffer::CVertexBuffer(size_t vertexSize, GLenum usage, GLenum target)
CVertexBuffer::~CVertexBuffer()
{
// Must have released all chunks before destroying the buffer
ENSURE(m_AllocList.empty());
if (m_Handle)
pglDeleteBuffersARB(1, &m_Handle);
@ -90,12 +99,15 @@ bool CVertexBuffer::CompatibleVertexType(size_t vertexSize, GLenum usage, GLenum
// Allocate: try to allocate a buffer of given number of vertices (each of
// given size), with the given usage and target - return null
// if no free chunks available
CVertexBuffer::VBChunk* CVertexBuffer::Allocate(size_t vertexSize, size_t numVertices, GLenum usage, GLenum target)
CVertexBuffer::VBChunk* CVertexBuffer::Allocate(size_t vertexSize, size_t numVertices, GLenum usage, GLenum target, void* backingStore)
{
// check this is the right kind of buffer
if (!CompatibleVertexType(vertexSize, usage, target))
return 0;
if (UseStreaming(usage))
ENSURE(backingStore != NULL);
// quick check there's enough vertices spare to allocate
if (numVertices > m_FreeVertices)
return 0;
@ -119,6 +131,10 @@ CVertexBuffer::VBChunk* CVertexBuffer::Allocate(size_t vertexSize, size_t numVer
return 0;
}
chunk->m_BackingStore = backingStore;
chunk->m_Dirty = false;
chunk->m_Needed = false;
// split chunk into two; - allocate a new chunk using all unused vertices in the
// found chunk, and add it to the free list
if (chunk->m_Count > numVertices)
@ -135,6 +151,7 @@ CVertexBuffer::VBChunk* CVertexBuffer::Allocate(size_t vertexSize, size_t numVer
}
// return found chunk
m_AllocList.push_back(chunk);
return chunk;
}
@ -145,6 +162,8 @@ void CVertexBuffer::Release(VBChunk* chunk)
// Update total free count before potentially modifying this chunk's count
m_FreeVertices += chunk->m_Count;
m_AllocList.remove(chunk);
typedef std::list<VBChunk*>::iterator Iter;
// Coalesce with any free-list items that are adjacent to this chunk;
@ -180,9 +199,21 @@ void CVertexBuffer::UpdateChunkVertices(VBChunk* chunk, void* data)
if (g_Renderer.m_Caps.m_VBO)
{
ENSURE(m_Handle);
pglBindBufferARB(m_Target, m_Handle);
pglBufferSubDataARB(m_Target, chunk->m_Index * m_VertexSize, chunk->m_Count * m_VertexSize, data);
pglBindBufferARB(m_Target, 0);
if (UseStreaming(m_Usage))
{
// The VBO is now out of sync with the backing store
chunk->m_Dirty = true;
// Sanity check: Make sure the caller hasn't tried to reallocate
// their backing store
ENSURE(data == chunk->m_BackingStore);
}
else
{
pglBindBufferARB(m_Target, m_Handle);
pglBufferSubDataARB(m_Target, chunk->m_Index * m_VertexSize, chunk->m_Count * m_VertexSize, data);
pglBindBufferARB(m_Target, 0);
}
}
else
{
@ -196,15 +227,85 @@ void CVertexBuffer::UpdateChunkVertices(VBChunk* chunk, void* data)
// to glVertexPointer ( + etc) calls
u8* CVertexBuffer::Bind()
{
if (g_Renderer.m_Caps.m_VBO)
{
pglBindBufferARB(m_Target, m_Handle);
return (u8*)0;
}
else
{
// Without VBO support the vertex data lives in m_SysMem, so that pointer
// is the base address.
if (!g_Renderer.m_Caps.m_VBO)
return m_SysMem;
pglBindBufferARB(m_Target, m_Handle);
if (UseStreaming(m_Usage))
{
// If any chunks are out of sync with the current VBO, and are
// needed for rendering this frame, we'll need to re-upload the VBO
bool needUpload = false;
for (auto& chunk : m_AllocList)
{
if (chunk->m_Dirty && chunk->m_Needed)
{
needUpload = true;
break;
}
}
if (needUpload)
{
// Tell the driver that it can reallocate the whole VBO
pglBufferDataARB(m_Target, m_MaxVertices * m_VertexSize, NULL, m_Usage);
// (In theory, glMapBufferRange with GL_MAP_INVALIDATE_BUFFER_BIT could be used
// here instead of glBufferData(..., NULL, ...) plus glMapBuffer(), but with
// current Intel Windows GPU drivers (as of 2015-01) it's much faster if you do
// the explicit glBufferData.)
while (true)
{
void* p = pglMapBufferARB(m_Target, GL_WRITE_ONLY);
if (p == NULL)
{
// This shouldn't happen unless we run out of virtual address space
LOGERROR("glMapBuffer failed");
// NOTE(review): after this break nothing has been copied, yet the loop
// following the while still marks every needed chunk as clean, so those
// chunks are not re-uploaded by later Bind() calls until
// UpdateChunkVertices() marks them dirty again - confirm this is intended.
break;
}
#ifndef NDEBUG
// To help detect bugs where PrepareForRendering() was not called,
// force all not-needed data to 0, so things won't get rendered
// with undefined (but possibly still correct-looking) data.
memset(p, 0, m_MaxVertices * m_VertexSize);
#endif
// Copy only the chunks we need. (This condition is helpful when
// the VBO contains data for every unit in the world, but only a
// handful are visible on screen and we don't need to bother copying
// the rest.)
for (auto& chunk : m_AllocList)
if (chunk->m_Needed)
memcpy((u8 *)p + chunk->m_Index * m_VertexSize, chunk->m_BackingStore, chunk->m_Count * m_VertexSize);
if (pglUnmapBufferARB(m_Target) == GL_TRUE)
break;
// Unmap might fail on e.g. resolution switches, so just try again
// and hope it will eventually succeed
debug_printf(L"glUnmapBuffer failed, trying again...");
}
// Anything we just uploaded is clean; anything else is dirty
// since the rest of the VBO content is now undefined
for (auto& chunk : m_AllocList)
{
if (chunk->m_Needed)
chunk->m_Dirty = false;
else
chunk->m_Dirty = true;
}
}
// Reset the flags for the next phase
// (callers must call PrepareForRendering() again before the next bind phase)
for (auto& chunk : m_AllocList)
chunk->m_Needed = false;
}
// A VBO is bound at this point, so the base address handed back is 0
// (presumably callers add their attribute offsets to it - TODO confirm).
return (u8*)0;
}
u8* CVertexBuffer::GetBindAddress()
@ -247,3 +348,8 @@ void CVertexBuffer::DumpStatus()
}
debug_printf(L"max size = %d\n", (int)maxSize);
}
// Report whether buffers created with the given usage flag are handled in
// streaming mode: true for GL_DYNAMIC_DRAW and GL_STREAM_DRAW, false for
// every other usage value.
bool CVertexBuffer::UseStreaming(GLenum usage)
{
	switch (usage)
	{
	case GL_DYNAMIC_DRAW:
	case GL_STREAM_DRAW:
		return true;
	default:
		return false;
	}
}

View file

@ -1,4 +1,4 @@
/* Copyright (C) 2013 Wildfire Games.
/* Copyright (C) 2015 Wildfire Games.
* This file is part of 0 A.D.
*
* 0 A.D. is free software: you can redistribute it and/or modify
@ -27,15 +27,30 @@
#include <list>
#include <vector>
// Absolute maximum (bytewise) size of each GL vertex buffer object.
// Make it large enough for the maximum feasible mesh size (64K vertexes,
// 32 bytes per vertex in ShaderModelRenderer).
// TODO: measure what influence this has on performance
#define MAX_VB_SIZE_BYTES (4*1024*1024)
/**
* CVertexBuffer: encapsulation of ARB_vertex_buffer_object, also supplying
* some additional functionality for sharing buffers between multiple objects
* some additional functionality for sharing buffers between multiple objects.
*
* The class can be used in two modes, depending on the usage parameter:
*
* GL_STATIC_DRAW: Call Allocate() with backingStore = NULL. Then call
* UpdateChunkVertices() with any pointer - the data will be immediately copied
* to the VBO. This should be used for vertex data that rarely changes.
*
* GL_DYNAMIC_DRAW, GL_STREAM_DRAW: Call Allocate() with backingStore pointing
* at some memory that will remain valid for the lifetime of the VBChunk.
* This should be used for vertex data that may change every frame.
* Rendering is expected to occur in two phases:
* - "Prepare" phase:
* If this chunk is going to be used for rendering during the next Bind phase,
* you must call PrepareForRendering().
* If the vertex data in backingStore has been modified since the last Bind phase,
* you must call UpdateChunkVertices().
* - "Bind" phase:
* Bind() can be called (multiple times). The vertex data will be uploaded
* to the GPU if necessary.
* It is okay to have multiple prepare/bind cycles per frame (though slightly less
* efficient), but they must occur sequentially.
*/
class CVertexBuffer
{
@ -52,6 +67,16 @@ public:
size_t m_Index;
/// Number of vertices used by chunk
size_t m_Count;
/// If UseStreaming() is true, points at the data for this chunk
void* m_BackingStore;
/// If true, the VBO is not consistent with the chunk's backing store
/// (and will need to be re-uploaded before rendering with this chunk)
bool m_Dirty;
/// If true, we have been told this chunk is going to be used for
/// rendering in the next bind phase and will need to be uploaded
bool m_Needed;
private:
// Only CVertexBuffer can construct/delete these
@ -76,6 +101,9 @@ public:
/// Unbind any currently-bound buffer, so glVertexPointer etc calls will not attempt to use it
static void Unbind();
/// Make the vertex data available for the next call to Bind().
/// This only sets the chunk's m_Needed flag; any upload happens in Bind().
void PrepareForRendering(VBChunk* chunk) { chunk->m_Needed = true; }
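/// Update vertex data for given chunk. For non-streaming buffers the provided data is
/// transferred to the actual OpenGL vertex buffer immediately. For streaming buffers
/// (see UseStreaming()) the chunk is only marked dirty; its backing store is uploaded
/// by a later Bind() once PrepareForRendering() has been called for it.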
void UpdateChunkVertices(VBChunk* chunk, void* data);
@ -88,23 +116,38 @@ public:
void DumpStatus();
/**
* Given the usage flags of a buffer that has been (or will be) allocated:
*
* If true, we assume the buffer is going to be modified on every frame,
* so we will re-upload the entire buffer every frame using glMapBuffer.
* This requires the buffer's owner to hold onto its backing store.
*
* If false, we assume it will change rarely, and use glBufferSubData to
* update it incrementally. The backing store can be freed to save memory.
*/
static bool UseStreaming(GLenum usage);
protected:
friend class CVertexBufferManager; // allow allocate only via CVertexBufferManager
/// Try to allocate a buffer of given number of vertices (each of given size),
/// and with the given type - return null if no free chunks available
VBChunk* Allocate(size_t vertexSize, size_t numVertices, GLenum usage, GLenum target);
VBChunk* Allocate(size_t vertexSize, size_t numVertices, GLenum usage, GLenum target, void* backingStore);
/// Return given chunk to this buffer
void Release(VBChunk* chunk);
private:
private:
/// Vertex size of this vertex buffer
size_t m_VertexSize;
/// Number of vertices of above size in this buffer
size_t m_MaxVertices;
/// List of free chunks in this buffer
std::list<VBChunk*> m_FreeList;
/// List of allocated chunks
std::list<VBChunk*> m_AllocList;
/// Available free vertices - total of all free vertices in the free list
size_t m_FreeVertices;
/// Handle to the actual GL vertex buffer object

View file

@ -47,7 +47,7 @@ void CVertexBufferManager::Shutdown()
// Allocate: try to allocate a buffer of given number of vertices (each of
// given size), with the given usage and target - return null
// if no free chunks available
CVertexBuffer::VBChunk* CVertexBufferManager::Allocate(size_t vertexSize, size_t numVertices, GLenum usage, GLenum target)
CVertexBuffer::VBChunk* CVertexBufferManager::Allocate(size_t vertexSize, size_t numVertices, GLenum usage, GLenum target, void* backingStore)
{
CVertexBuffer::VBChunk* result=0;
@ -55,6 +55,9 @@ CVertexBuffer::VBChunk* CVertexBufferManager::Allocate(size_t vertexSize, size_t
ENSURE(target == GL_ARRAY_BUFFER || target == GL_ELEMENT_ARRAY_BUFFER);
if (CVertexBuffer::UseStreaming(usage))
ENSURE(backingStore != NULL);
// TODO, RC - run some sanity checks on allocation request
typedef std::list<CVertexBuffer*>::iterator Iter;
@ -75,7 +78,7 @@ CVertexBuffer::VBChunk* CVertexBufferManager::Allocate(size_t vertexSize, size_t
// satisfy the allocation
for (Iter iter = m_Buffers.begin(); iter != m_Buffers.end(); ++iter) {
CVertexBuffer* buffer = *iter;
result = buffer->Allocate(vertexSize, numVertices, usage, target);
result = buffer->Allocate(vertexSize, numVertices, usage, target, backingStore);
if (result)
return result;
}
@ -83,7 +86,7 @@ CVertexBuffer::VBChunk* CVertexBufferManager::Allocate(size_t vertexSize, size_t
// got this far; need to allocate a new buffer
CVertexBuffer* buffer = new CVertexBuffer(vertexSize, usage, target);
m_Buffers.push_front(buffer);
result = buffer->Allocate(vertexSize, numVertices, usage, target);
result = buffer->Allocate(vertexSize, numVertices, usage, target, backingStore);
if (!result)
{

View file

@ -36,11 +36,14 @@ public:
*
* @param vertexSize size of each vertex in the buffer
* @param numVertices number of vertices in the buffer
* @param usage typically GL_STATIC_DRAW or GL_DYNAMIC_DRAW
* @param usage GL_STATIC_DRAW, GL_DYNAMIC_DRAW, GL_STREAM_DRAW
* @param target typically GL_ARRAY_BUFFER or GL_ELEMENT_ARRAY_BUFFER
* @param backingStore if usage is STATIC, this is NULL; else for DYNAMIC/STREAM,
* this must be a copy of the vertex data that remains valid for the
* lifetime of the VBChunk
* @return chunk, or NULL if no free chunks available
*/
CVertexBuffer::VBChunk* Allocate(size_t vertexSize, size_t numVertices, GLenum usage, GLenum target);
CVertexBuffer::VBChunk* Allocate(size_t vertexSize, size_t numVertices, GLenum usage, GLenum target, void* backingStore = NULL);
/// Returns the given @p chunk to its owning buffer
void Release(CVertexBuffer::VBChunk* chunk);
@ -51,9 +54,6 @@ public:
size_t GetBytesReserved();
size_t GetBytesAllocated();
/// Returns the maximum possible size of a single vertex buffer
size_t GetMaxBufferSize() const { return MAX_VB_SIZE_BYTES; }
/// Explicit shutdown of the vertex buffer subsystem; releases all currently-allocated buffers.
void Shutdown();