From 53cc45e7e11cda2ef3efeca1ea5d26def576adc2 Mon Sep 17 00:00:00 2001
From: BotoX <botox@botox.bz>
Date: Sat, 24 Dec 2016 12:12:27 +0100
Subject: [PATCH] refactored and fixed crash bug, some memleaks

---
 AMBuildScript                 | 285 ++++++------
 README.md                     |  10 +
 extension/AMBuilder           |  14 +-
 extension/atomicops.h         | 664 +++++++++++++++++++++++++++
 extension/context.cpp         | 119 +++--
 extension/context.h           |  33 +-
 extension/extension.cpp       | 511 +++++++++++----------
 extension/extension.h         |  35 +-
 extension/libuv/.gitignore    |   2 +
 extension/queue.h             |  44 --
 extension/readerwriterqueue.h | 815 ++++++++++++++++++++++++++++++++++
 11 files changed, 2018 insertions(+), 514 deletions(-)
 create mode 100644 README.md
 create mode 100644 extension/atomicops.h
 create mode 100644 extension/libuv/.gitignore
 delete mode 100644 extension/queue.h
 create mode 100644 extension/readerwriterqueue.h

diff --git a/AMBuildScript b/AMBuildScript
index 3023b44..58dba67 100644
--- a/AMBuildScript
+++ b/AMBuildScript
@@ -26,8 +26,9 @@ class AsyncSocketConfig(object):
   def __init__(self):
     self.sdks = {}
     self.binaries = []
-    self.sm_root = None
     self.extensions = []
+    self.generated_headers = None
+    self.sm_root = None
 
   @property
   def tag(self):
@@ -39,160 +40,179 @@ class AsyncSocketConfig(object):
     if builder.options.sm_path:
       self.sm_root = builder.options.sm_path
     else:
-      self.sm_root = ResolveEnvPath('SOURCEMOD17', 'sourcemod-1.7')
+      self.sm_root = ResolveEnvPath('SOURCEMOD18', 'sourcemod-1.8')
       if not self.sm_root:
         self.sm_root = ResolveEnvPath('SOURCEMOD', 'sourcemod')
       if not self.sm_root:
-        self.sm_root = ResolveEnvPath('SMCENTRAL', 'sourcemod-central')
+        self.sm_root = ResolveEnvPath('SOURCEMOD_DEV', 'sourcemod-central')
 
     if not self.sm_root or not os.path.isdir(self.sm_root):
-      raise Exception('Could not find a source copy of Sourcemod')
+      raise Exception('Could not find a source copy of SourceMod')
     self.sm_root = Normalize(self.sm_root)
 
   def configure(self):
-    builder.AddConfigureFile('pushbuild.txt')
-
     cxx = builder.DetectCompilers()
 
     if cxx.like('gcc'):
-      cxx.defines += [
-        'stricmp=strcasecmp',
-        '_stricmp=strcasecmp',
-        '_snprintf=snprintf',
-        '_vsnprintf=vsnprintf',
-        'HAVE_STDINT_H',
-        'GNUC',
-      ]
-      cxx.cflags += [
-        '-pipe',
-        '-fno-strict-aliasing',
-        '-Wall',
-        '-Werror',
-        '-Wno-unused',
-        '-Wno-switch',
-        '-msse',
-        '-m32',
-      ]
-      cxx.cxxflags += [
-          '-std=c++11',
-      ]
-
-      have_gcc = cxx.vendor == 'gcc'
-      have_clang = cxx.vendor == 'clang'
-      if have_clang or (have_gcc and cxx.version >= '4'):
-        cxx.cflags += ['-fvisibility=hidden']
-        cxx.cxxflags += ['-fvisibility-inlines-hidden']
-        if have_clang or (have_gcc and cxx.version >= '4.6'):
-          cxx.cflags += ['-Wno-narrowing']
-        if (have_gcc and cxx.version >= '4.7') or (have_clang and cxx.version >= '3'):
-          cxx.cxxflags += ['-Wno-delete-non-virtual-dtor']
-        if have_gcc and cxx.version >= '4.8':
-          cxx.cflags += ['-Wno-unused-result']
-      if have_clang:
-        cxx.cxxflags += ['-Wno-implicit-exception-spec-mismatch']
-        if (builder.target_platform == 'mac' and cxx.version >= '5.1') or cxx.version >= '3.4':
-          cxx.cxxflags += ['-Wno-deprecated-register']
-        else:
-          cxx.cxxflags += ['-Wno-deprecated']
-        cxx.cflags += ['-Wno-sometimes-uninitialized']
-
-      cxx.linkflags += ['-m32']
-      cxx.cxxflags += [
-        '-fno-exceptions',
-        '-fno-threadsafe-statics',
-        '-Wno-non-virtual-dtor',
-        '-Wno-overloaded-virtual',
-      ]
-
-      if have_gcc:
-        cxx.cflags += ['-mfpmath=sse']
+      self.configure_gcc(cxx)
     elif cxx.vendor == 'msvc':
-      if builder.options.debug == '1':
-        cxx.cflags += ['/MTd']
-        cxx.linkflags += ['/NODEFAULTLIB:libcmt']
-      else:
-        cxx.cflags += ['/MT']
-      cxx.defines += [
-        '_CRT_SECURE_NO_DEPRECATE',
-        '_CRT_SECURE_NO_WARNINGS',
-        '_CRT_NONSTDC_NO_DEPRECATE',
-        '_ITERATOR_DEBUG_LEVEL=0',
-      ]
-      cxx.cflags += [
-        '/W3',
-      ]
-      cxx.cxxflags += [
-        '/EHsc',
-        '/GR-',
-        '/TP',
-      ]
-      cxx.linkflags += [
-        '/MACHINE:X86',
-        '/SUBSYSTEM:WINDOWS',
-        'kernel32.lib',
-        'user32.lib',
-        'gdi32.lib',
-        'winspool.lib',
-        'comdlg32.lib',
-        'advapi32.lib',
-        'shell32.lib',
-        'ole32.lib',
-        'oleaut32.lib',
-        'uuid.lib',
-        'odbc32.lib',
-        'odbccp32.lib',
-      ] 
+      self.configure_msvc(cxx)
 
-    # Optimization
+    # Optimizaiton
     if builder.options.opt == '1':
       cxx.defines += ['NDEBUG']
-      if cxx.like('gcc'):
-        cxx.cflags += ['-O3']
-      elif cxx.like('msvc'):
-        cxx.cflags += ['/Ox']
-        cxx.linkflags += ['/OPT:ICF', '/OPT:REF']
 
     # Debugging
     if builder.options.debug == '1':
       cxx.defines += ['DEBUG', '_DEBUG']
-      if cxx.like('msvc'):
-        cxx.cflags += ['/Od', '/RTC1']
-        if cxx.version >= 1600:
-          cxx.cflags += ['/d2Zi+']
-
-    # This needs to be after our optimization flags which could otherwise disable it.
-    if cxx.vendor == 'msvc':
-      # Don't omit the frame pointer.
-      cxx.cflags += ['/Oy-']
 
     # Platform-specifics
     if builder.target_platform == 'linux':
-      cxx.defines += ['_LINUX', 'POSIX']
-      if cxx.vendor == 'gcc':
-        cxx.linkflags += ['-static-libgcc']
-      elif cxx.vendor == 'clang':
-        cxx.linkflags += ['-lgcc_eh']
+      self.configure_linux(cxx)
     elif builder.target_platform == 'mac':
-      cxx.defines += ['OSX', '_OSX', 'POSIX']
-      cxx.cflags += ['-mmacosx-version-min=10.5']
-      cxx.linkflags += [
-        '-mmacosx-version-min=10.5',
-        '-arch', 'i386',
-        '-lstdc++',
-        '-stdlib=libstdc++',
-      ]
-      cxx.cxxflags += ['-stdlib=libstdc++']
+      self.configure_mac(cxx)
     elif builder.target_platform == 'windows':
-      cxx.defines += ['WIN32', '_WINDOWS']
-	  
+      self.configure_windows(cxx)
+
+    # Finish up.
+    cxx.includes += [
+      os.path.join(self.sm_root, 'public'),
+    ]
+
+  def configure_gcc(self, cxx):
+    cxx.defines += [
+      'stricmp=strcasecmp',
+      '_stricmp=strcasecmp',
+      '_snprintf=snprintf',
+      '_vsnprintf=vsnprintf',
+      'HAVE_STDINT_H',
+      'GNUC',
+    ]
+    cxx.cflags += [
+      '-pipe',
+      '-fno-strict-aliasing',
+#      '-Wall',
+      '-Werror',
+      '-Wno-unused',
+      '-Wno-switch',
+      '-Wno-array-bounds',
+      '-msse',
+      '-m32',
+      '-fvisibility=hidden',
+    ]
+    cxx.cxxflags += [
+      '-std=c++11',
+      '-fno-exceptions',
+      '-fno-threadsafe-statics',
+      '-Wno-non-virtual-dtor',
+      '-Wno-overloaded-virtual',
+      '-fvisibility-inlines-hidden',
+    ]
+    cxx.linkflags += ['-m32']
+
+    have_gcc = cxx.vendor == 'gcc'
+    have_clang = cxx.vendor == 'clang'
+    if cxx.version >= 'clang-3.6':
+      cxx.cxxflags += ['-Wno-inconsistent-missing-override']
+    if have_clang or (cxx.version >= 'gcc-4.6'):
+      cxx.cflags += ['-Wno-narrowing']
+    if have_clang or (cxx.version >= 'gcc-4.7'):
+      cxx.cxxflags += ['-Wno-delete-non-virtual-dtor']
+    if cxx.version >= 'gcc-4.8':
+      cxx.cflags += ['-Wno-unused-result']
+
+    if have_clang:
+      cxx.cxxflags += ['-Wno-implicit-exception-spec-mismatch']
+      if cxx.version >= 'apple-clang-5.1' or cxx.version >= 'clang-3.4':
+        cxx.cxxflags += ['-Wno-deprecated-register']
+      else:
+        cxx.cxxflags += ['-Wno-deprecated']
+      cxx.cflags += ['-Wno-sometimes-uninitialized']
+
+    if have_gcc:
+      cxx.cflags += ['-mfpmath=sse']
+
+    if builder.options.opt == '1':
+      cxx.cflags += ['-O3']
+
+  def configure_msvc(self, cxx):
+    if builder.options.debug == '1':
+      cxx.cflags += ['/MTd']
+      cxx.linkflags += ['/NODEFAULTLIB:libcmt']
+    else:
+      cxx.cflags += ['/MT']
+    cxx.defines += [
+      '_CRT_SECURE_NO_DEPRECATE',
+      '_CRT_SECURE_NO_WARNINGS',
+      '_CRT_NONSTDC_NO_DEPRECATE',
+      '_ITERATOR_DEBUG_LEVEL=0',
+    ]
+    cxx.cflags += [
+      '/W3',
+    ]
+    cxx.cxxflags += [
+      '/EHsc',
+      '/GR-',
+      '/TP',
+    ]
+    cxx.linkflags += [
+      '/MACHINE:X86',
+      'kernel32.lib',
+      'user32.lib',
+      'gdi32.lib',
+      'winspool.lib',
+      'comdlg32.lib',
+      'advapi32.lib',
+      'shell32.lib',
+      'ole32.lib',
+      'oleaut32.lib',
+      'uuid.lib',
+      'odbc32.lib',
+      'odbccp32.lib',
+    ]
+
+    if builder.options.opt == '1':
+      cxx.cflags += ['/Ox', '/Zo']
+      cxx.linkflags += ['/OPT:ICF', '/OPT:REF']
+
+    if builder.options.debug == '1':
+      cxx.cflags += ['/Od', '/RTC1']
+
+    # This needs to be after our optimization flags which could otherwise disable it.
+    # Don't omit the frame pointer.
+    cxx.cflags += ['/Oy-']
+
+  def configure_linux(self, cxx):
+    cxx.defines += ['_LINUX', 'POSIX']
+    cxx.linkflags += ['-Wl,--exclude-libs,ALL', '-lm']
+    if cxx.vendor == 'gcc':
+      cxx.linkflags += ['-static-libgcc']
+    elif cxx.vendor == 'clang':
+      cxx.linkflags += ['-lgcc_eh']
+
+  def configure_mac(self, cxx):
+    cxx.defines += ['OSX', '_OSX', 'POSIX']
+    cxx.cflags += ['-mmacosx-version-min=10.5']
+    cxx.linkflags += [
+      '-mmacosx-version-min=10.5',
+      '-arch', 'i386',
+      '-lstdc++',
+      '-stdlib=libstdc++',
+    ]
+    cxx.cxxflags += ['-stdlib=libstdc++']
+
+  def configure_windows(self, cxx):
+    cxx.defines += ['WIN32', '_WINDOWS']
+
   def ConfigureForExtension(self, context, compiler):
     compiler.cxxincludes += [
       os.path.join(context.currentSourcePath),
-	  os.path.join(context.currentSourcePath, 'libs', 'libuv-v1.5.0', 'include'),
+      os.path.join(context.currentSourcePath, 'sdk'),
       os.path.join(self.sm_root, 'public'),
-      os.path.join(self.sm_root, 'public', 'amtl'),
       os.path.join(self.sm_root, 'public', 'extensions'),
-      os.path.join(self.sm_root, 'public', 'sourcepawn')
+      os.path.join(self.sm_root, 'sourcepawn', 'include'),
+      os.path.join(self.sm_root, 'public', 'amtl', 'amtl'),
+      os.path.join(self.sm_root, 'public', 'amtl'),
     ]
     return compiler
 
@@ -200,5 +220,14 @@ AsyncSocket = AsyncSocketConfig()
 AsyncSocket.detectSDKs()
 AsyncSocket.configure()
 
-builder.RunBuildScripts(['extension/AMBuilder'], { 'AsyncSocket': AsyncSocket })
-builder.RunScript('PackageScript', { 'AsyncSocket': AsyncSocket })
\ No newline at end of file
+# Add additional buildscripts here
+BuildScripts = [
+  'extension/AMBuilder'
+]
+
+if builder.backend == 'amb2':
+  BuildScripts += [
+    'PackageScript',
+  ]
+
+builder.RunBuildScripts(BuildScripts, { 'AsyncSocket': AsyncSocket})
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..2f47522
--- /dev/null
+++ b/README.md
@@ -0,0 +1,10 @@
+# async_connect
+
+You need to compile https://github.com/libuv/libuv for 32bit like below if you have a 64bit OS.
+```
+sh autgen.sh
+./configure --build=i686-pc-linux-gnu "CFLAGS=-m32" "CXXFLAGS=-m32" "LDFLAGS=-m32" --disable-shared --enable-static
+make
+```
+
+Put the `libuv/include` folder and `libuv/.libs/libuv.a` in `extensions/libuv`.
diff --git a/extension/AMBuilder b/extension/AMBuilder
index f4adb41..8e19f18 100644
--- a/extension/AMBuilder
+++ b/extension/AMBuilder
@@ -5,7 +5,7 @@ binary = builder.compiler.Library('async_socket.ext')
 AsyncSocket.ConfigureForExtension(builder, binary.compiler)
 
 binary.compiler.includes += [
-  os.path.join(builder.sourcePath, 'extension', 'libs', 'libuv-v1.5.0', 'include')
+  os.path.join(builder.sourcePath, 'extension', 'libuv', 'include')
 ]
 
 binary.sources += [
@@ -14,11 +14,17 @@ binary.sources += [
   os.path.join(AsyncSocket.sm_root, 'public', 'smsdk_ext.cpp')
 ]
 
-binary.compiler.defines += ['SOURCEMOD_BUILD']
+binary.compiler.defines += [
+  'SOURCEMOD_BUILD'
+]
 
-binary.compiler.postlink += [os.path.join(builder.sourcePath, 'extension', 'libs', 'libuv-v1.5.0', '.libs', 'libuv.a')]
+binary.compiler.postlink += [
+  os.path.join(builder.sourcePath, 'extension', 'libuv', 'libuv.a')
+]
 
 if builder.target_platform == 'windows':
   binary.compiler.linkflags += ['ws2_32.lib']
 
-AsyncSocket.extensions += [builder.Add(binary)]
\ No newline at end of file
+AsyncSocket.extensions += [
+  builder.Add(binary)
+]
diff --git a/extension/atomicops.h b/extension/atomicops.h
new file mode 100644
index 0000000..c375710
--- /dev/null
+++ b/extension/atomicops.h
@@ -0,0 +1,664 @@
+﻿// ©2013-2016 Cameron Desrochers.
+// Distributed under the simplified BSD license (see the license file that
+// should have come with this header).
+// Uses Jeff Preshing's semaphore implementation (under the terms of its
+// separate zlib license, embedded below).
+
+#pragma once
+
+// Provides portable (VC++2010+, Intel ICC 13, GCC 4.7+, and anything C++11 compliant) implementation
+// of low-level memory barriers, plus a few semi-portable utility macros (for inlining and alignment).
+// Also has a basic atomic type (limited to hardware-supported atomics with no memory ordering guarantees).
+// Uses the AE_* prefix for macros (historical reasons), and the "moodycamel" namespace for symbols.
+
+#include <cassert>
+#include <type_traits>
+#include <cerrno>
+#include <cstdint>
+#include <ctime>
+
+// Platform detection
+#if defined(__INTEL_COMPILER)
+#define AE_ICC
+#elif defined(_MSC_VER)
+#define AE_VCPP
+#elif defined(__GNUC__)
+#define AE_GCC
+#endif
+
+#if defined(_M_IA64) || defined(__ia64__)
+#define AE_ARCH_IA64
+#elif defined(_WIN64) || defined(__amd64__) || defined(_M_X64) || defined(__x86_64__)
+#define AE_ARCH_X64
+#elif defined(_M_IX86) || defined(__i386__)
+#define AE_ARCH_X86
+#elif defined(_M_PPC) || defined(__powerpc__)
+#define AE_ARCH_PPC
+#else
+#define AE_ARCH_UNKNOWN
+#endif
+
+
+// AE_UNUSED
+#define AE_UNUSED(x) ((void)x)
+
+
+// AE_FORCEINLINE
+#if defined(AE_VCPP) || defined(AE_ICC)
+#define AE_FORCEINLINE __forceinline
+#elif defined(AE_GCC)
+//#define AE_FORCEINLINE __attribute__((always_inline)) 
+#define AE_FORCEINLINE inline
+#else
+#define AE_FORCEINLINE inline
+#endif
+
+
+// AE_ALIGN
+#if defined(AE_VCPP) || defined(AE_ICC)
+#define AE_ALIGN(x) __declspec(align(x))
+#elif defined(AE_GCC)
+#define AE_ALIGN(x) __attribute__((aligned(x)))
+#else
+// Assume GCC compliant syntax...
+#define AE_ALIGN(x) __attribute__((aligned(x)))
+#endif
+
+
+// Portable atomic fences implemented below:
+
+namespace moodycamel {
+
+enum memory_order {
+	memory_order_relaxed,
+	memory_order_acquire,
+	memory_order_release,
+	memory_order_acq_rel,
+	memory_order_seq_cst,
+
+	// memory_order_sync: Forces a full sync:
+	// #LoadLoad, #LoadStore, #StoreStore, and most significantly, #StoreLoad
+	memory_order_sync = memory_order_seq_cst
+};
+
+}    // end namespace moodycamel
+
+#if (defined(AE_VCPP) && (_MSC_VER < 1700 || defined(__cplusplus_cli))) || defined(AE_ICC)
+// VS2010 and ICC13 don't support std::atomic_*_fence, implement our own fences
+
+#include <intrin.h>
+
+#if defined(AE_ARCH_X64) || defined(AE_ARCH_X86)
+#define AeFullSync _mm_mfence
+#define AeLiteSync _mm_mfence
+#elif defined(AE_ARCH_IA64)
+#define AeFullSync __mf
+#define AeLiteSync __mf
+#elif defined(AE_ARCH_PPC)
+#include <ppcintrinsics.h>
+#define AeFullSync __sync
+#define AeLiteSync __lwsync
+#endif
+
+
+#ifdef AE_VCPP
+#pragma warning(push)
+#pragma warning(disable: 4365)		// Disable erroneous 'conversion from long to unsigned int, signed/unsigned mismatch' error when using `assert`
+#ifdef __cplusplus_cli
+#pragma managed(push, off)
+#endif
+#endif
+
+namespace moodycamel {
+
+AE_FORCEINLINE void compiler_fence(memory_order order)
+{
+	switch (order) {
+		case memory_order_relaxed: break;
+		case memory_order_acquire: _ReadBarrier(); break;
+		case memory_order_release: _WriteBarrier(); break;
+		case memory_order_acq_rel: _ReadWriteBarrier(); break;
+		case memory_order_seq_cst: _ReadWriteBarrier(); break;
+		default: assert(false);
+	}
+}
+
+// x86/x64 have a strong memory model -- all loads and stores have
+// acquire and release semantics automatically (so only need compiler
+// barriers for those).
+#if defined(AE_ARCH_X86) || defined(AE_ARCH_X64)
+AE_FORCEINLINE void fence(memory_order order)
+{
+	switch (order) {
+		case memory_order_relaxed: break;
+		case memory_order_acquire: _ReadBarrier(); break;
+		case memory_order_release: _WriteBarrier(); break;
+		case memory_order_acq_rel: _ReadWriteBarrier(); break;
+		case memory_order_seq_cst:
+			_ReadWriteBarrier();
+			AeFullSync();
+			_ReadWriteBarrier();
+			break;
+		default: assert(false);
+	}
+}
+#else
+AE_FORCEINLINE void fence(memory_order order)
+{
+	// Non-specialized arch, use heavier memory barriers everywhere just in case :-(
+	switch (order) {
+		case memory_order_relaxed:
+			break;
+		case memory_order_acquire:
+			_ReadBarrier();
+			AeLiteSync();
+			_ReadBarrier();
+			break;
+		case memory_order_release:
+			_WriteBarrier();
+			AeLiteSync();
+			_WriteBarrier();
+			break;
+		case memory_order_acq_rel:
+			_ReadWriteBarrier();
+			AeLiteSync();
+			_ReadWriteBarrier();
+			break;
+		case memory_order_seq_cst:
+			_ReadWriteBarrier();
+			AeFullSync();
+			_ReadWriteBarrier();
+			break;
+		default: assert(false);
+	}
+}
+#endif
+}    // end namespace moodycamel
+#else
+// Use standard library of atomics
+#include <atomic>
+
+namespace moodycamel {
+
+AE_FORCEINLINE void compiler_fence(memory_order order)
+{
+	switch (order) {
+		case memory_order_relaxed: break;
+		case memory_order_acquire: std::atomic_signal_fence(std::memory_order_acquire); break;
+		case memory_order_release: std::atomic_signal_fence(std::memory_order_release); break;
+		case memory_order_acq_rel: std::atomic_signal_fence(std::memory_order_acq_rel); break;
+		case memory_order_seq_cst: std::atomic_signal_fence(std::memory_order_seq_cst); break;
+		default: assert(false);
+	}
+}
+
+AE_FORCEINLINE void fence(memory_order order)
+{
+	switch (order) {
+		case memory_order_relaxed: break;
+		case memory_order_acquire: std::atomic_thread_fence(std::memory_order_acquire); break;
+		case memory_order_release: std::atomic_thread_fence(std::memory_order_release); break;
+		case memory_order_acq_rel: std::atomic_thread_fence(std::memory_order_acq_rel); break;
+		case memory_order_seq_cst: std::atomic_thread_fence(std::memory_order_seq_cst); break;
+		default: assert(false);
+	}
+}
+
+}    // end namespace moodycamel
+
+#endif
+
+
+#if !defined(AE_VCPP) || (_MSC_VER >= 1700 && !defined(__cplusplus_cli))
+#define AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
+#endif
+
+#ifdef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
+#include <atomic>
+#endif
+#include <utility>
+
+// WARNING: *NOT* A REPLACEMENT FOR std::atomic. READ CAREFULLY:
+// Provides basic support for atomic variables -- no memory ordering guarantees are provided.
+// The guarantee of atomicity is only made for types that already have atomic load and store guarantees
+// at the hardware level -- on most platforms this generally means aligned pointers and integers (only).
+namespace moodycamel {
+template<typename T>
+class weak_atomic
+{
+public:
+	weak_atomic() { }
+#ifdef AE_VCPP
+#pragma warning(disable: 4100)		// Get rid of (erroneous) 'unreferenced formal parameter' warning
+#endif
+	template<typename U> weak_atomic(U&& x) : value(std::forward<U>(x)) {  }
+#ifdef __cplusplus_cli
+	// Work around bug with universal reference/nullptr combination that only appears when /clr is on
+	weak_atomic(nullptr_t) : value(nullptr) {  }
+#endif
+	weak_atomic(weak_atomic const& other) : value(other.value) {  }
+	weak_atomic(weak_atomic&& other) : value(std::move(other.value)) {  }
+#ifdef AE_VCPP
+#pragma warning(default: 4100)
+#endif
+
+	AE_FORCEINLINE operator T() const { return load(); }
+
+	
+#ifndef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
+	template<typename U> AE_FORCEINLINE weak_atomic const& operator=(U&& x) { value = std::forward<U>(x); return *this; }
+	AE_FORCEINLINE weak_atomic const& operator=(weak_atomic const& other) { value = other.value; return *this; }
+	
+	AE_FORCEINLINE T load() const { return value; }
+	
+	AE_FORCEINLINE T fetch_add_acquire(T increment)
+	{
+#if defined(AE_ARCH_X64) || defined(AE_ARCH_X86)
+		if (sizeof(T) == 4) return _InterlockedExchangeAdd((long volatile*)&value, (long)increment);
+#if defined(_M_AMD64)
+		else if (sizeof(T) == 8) return _InterlockedExchangeAdd64((long long volatile*)&value, (long long)increment);
+#endif
+#else
+#error Unsupported platform
+#endif
+		assert(false && "T must be either a 32 or 64 bit type");
+		return value;
+	}
+	
+	AE_FORCEINLINE T fetch_add_release(T increment)
+	{
+#if defined(AE_ARCH_X64) || defined(AE_ARCH_X86)
+		if (sizeof(T) == 4) return _InterlockedExchangeAdd((long volatile*)&value, (long)increment);
+#if defined(_M_AMD64)
+		else if (sizeof(T) == 8) return _InterlockedExchangeAdd64((long long volatile*)&value, (long long)increment);
+#endif
+#else
+#error Unsupported platform
+#endif
+		assert(false && "T must be either a 32 or 64 bit type");
+		return value;
+	}
+#else
+	template<typename U>
+	AE_FORCEINLINE weak_atomic const& operator=(U&& x)
+	{
+		value.store(std::forward<U>(x), std::memory_order_relaxed);
+		return *this;
+	}
+	
+	AE_FORCEINLINE weak_atomic const& operator=(weak_atomic const& other)
+	{
+		value.store(other.value.load(std::memory_order_relaxed), std::memory_order_relaxed);
+		return *this;
+	}
+
+	AE_FORCEINLINE T load() const { return value.load(std::memory_order_relaxed); }
+	
+	AE_FORCEINLINE T fetch_add_acquire(T increment)
+	{
+		return value.fetch_add(increment, std::memory_order_acquire);
+	}
+	
+	AE_FORCEINLINE T fetch_add_release(T increment)
+	{
+		return value.fetch_add(increment, std::memory_order_release);
+	}
+#endif
+	
+
+private:
+#ifndef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
+	// No std::atomic support, but still need to circumvent compiler optimizations.
+	// `volatile` will make memory access slow, but is guaranteed to be reliable.
+	volatile T value;
+#else
+	std::atomic<T> value;
+#endif
+};
+
+}	// end namespace moodycamel
+
+
+
+// Portable single-producer, single-consumer semaphore below:
+
+#if defined(_WIN32)
+// Avoid including windows.h in a header; we only need a handful of
+// items, so we'll redeclare them here (this is relatively safe since
+// the API generally has to remain stable between Windows versions).
+// I know this is an ugly hack but it still beats polluting the global
+// namespace with thousands of generic names or adding a .cpp for nothing.
+extern "C" {
+	struct _SECURITY_ATTRIBUTES;
+	__declspec(dllimport) void* __stdcall CreateSemaphoreW(_SECURITY_ATTRIBUTES* lpSemaphoreAttributes, long lInitialCount, long lMaximumCount, const wchar_t* lpName);
+	__declspec(dllimport) int __stdcall CloseHandle(void* hObject);
+	__declspec(dllimport) unsigned long __stdcall WaitForSingleObject(void* hHandle, unsigned long dwMilliseconds);
+	__declspec(dllimport) int __stdcall ReleaseSemaphore(void* hSemaphore, long lReleaseCount, long* lpPreviousCount);
+}
+#elif defined(__MACH__)
+#include <mach/mach.h>
+#elif defined(__unix__)
+#include <semaphore.h>
+#endif
+
+namespace moodycamel
+{
+	// Code in the spsc_sema namespace below is an adaptation of Jeff Preshing's
+	// portable + lightweight semaphore implementations, originally from
+	// https://github.com/preshing/cpp11-on-multicore/blob/master/common/sema.h
+	// LICENSE:
+	// Copyright (c) 2015 Jeff Preshing
+	//
+	// This software is provided 'as-is', without any express or implied
+	// warranty. In no event will the authors be held liable for any damages
+	// arising from the use of this software.
+	//
+	// Permission is granted to anyone to use this software for any purpose,
+	// including commercial applications, and to alter it and redistribute it
+	// freely, subject to the following restrictions:
+	//
+	// 1. The origin of this software must not be misrepresented; you must not
+	//    claim that you wrote the original software. If you use this software
+	//    in a product, an acknowledgement in the product documentation would be
+	//    appreciated but is not required.
+	// 2. Altered source versions must be plainly marked as such, and must not be
+	//    misrepresented as being the original software.
+	// 3. This notice may not be removed or altered from any source distribution.
+	namespace spsc_sema
+	{
+#if defined(_WIN32)
+		class Semaphore
+		{
+		private:
+		    void* m_hSema;
+		    
+		    Semaphore(const Semaphore& other);
+		    Semaphore& operator=(const Semaphore& other);
+
+		public:
+		    Semaphore(int initialCount = 0)
+		    {
+		        assert(initialCount >= 0);
+		        const long maxLong = 0x7fffffff;
+		        m_hSema = CreateSemaphoreW(nullptr, initialCount, maxLong, nullptr);
+		    }
+
+		    ~Semaphore()
+		    {
+		        CloseHandle(m_hSema);
+		    }
+
+		    void wait()
+		    {
+		    	const unsigned long infinite = 0xffffffff;
+		        WaitForSingleObject(m_hSema, infinite);
+		    }
+
+			bool try_wait()
+			{
+				const unsigned long RC_WAIT_TIMEOUT = 0x00000102;
+				return WaitForSingleObject(m_hSema, 0) != RC_WAIT_TIMEOUT;
+			}
+
+			bool timed_wait(std::uint64_t usecs)
+			{
+				const unsigned long RC_WAIT_TIMEOUT = 0x00000102;
+				return WaitForSingleObject(m_hSema, (unsigned long)(usecs / 1000)) != RC_WAIT_TIMEOUT;
+			}
+
+		    void signal(int count = 1)
+		    {
+		        ReleaseSemaphore(m_hSema, count, nullptr);
+		    }
+		};
+#elif defined(__MACH__)
+		//---------------------------------------------------------
+		// Semaphore (Apple iOS and OSX)
+		// Can't use POSIX semaphores due to http://lists.apple.com/archives/darwin-kernel/2009/Apr/msg00010.html
+		//---------------------------------------------------------
+		class Semaphore
+		{
+		private:
+		    semaphore_t m_sema;
+
+		    Semaphore(const Semaphore& other);
+		    Semaphore& operator=(const Semaphore& other);
+
+		public:
+		    Semaphore(int initialCount = 0)
+		    {
+		        assert(initialCount >= 0);
+		        semaphore_create(mach_task_self(), &m_sema, SYNC_POLICY_FIFO, initialCount);
+		    }
+
+		    ~Semaphore()
+		    {
+		        semaphore_destroy(mach_task_self(), m_sema);
+		    }
+
+		    void wait()
+		    {
+		        semaphore_wait(m_sema);
+		    }
+
+			bool try_wait()
+			{
+				return timed_wait(0);
+			}
+
+			bool timed_wait(std::int64_t timeout_usecs)
+			{
+				mach_timespec_t ts;
+				ts.tv_sec = timeout_usecs / 1000000;
+				ts.tv_nsec = (timeout_usecs % 1000000) * 1000;
+
+				// added in OSX 10.10: https://developer.apple.com/library/prerelease/mac/documentation/General/Reference/APIDiffsMacOSX10_10SeedDiff/modules/Darwin.html
+				kern_return_t rc = semaphore_timedwait(m_sema, ts);
+
+				return rc != KERN_OPERATION_TIMED_OUT;
+			}
+
+		    void signal()
+		    {
+		        semaphore_signal(m_sema);
+		    }
+
+		    void signal(int count)
+		    {
+		        while (count-- > 0)
+		        {
+		            semaphore_signal(m_sema);
+		        }
+		    }
+		};
+#elif defined(__unix__)
+		//---------------------------------------------------------
+		// Semaphore (POSIX, Linux)
+		//---------------------------------------------------------
+		class Semaphore
+		{
+		private:
+		    sem_t m_sema;
+
+		    Semaphore(const Semaphore& other);
+		    Semaphore& operator=(const Semaphore& other);
+
+		public:
+		    Semaphore(int initialCount = 0)
+		    {
+		        assert(initialCount >= 0);
+		        sem_init(&m_sema, 0, initialCount);
+		    }
+
+		    ~Semaphore()
+		    {
+		        sem_destroy(&m_sema);
+		    }
+
+		    void wait()
+		    {
+		        // http://stackoverflow.com/questions/2013181/gdb-causes-sem-wait-to-fail-with-eintr-error
+		        int rc;
+		        do
+		        {
+		            rc = sem_wait(&m_sema);
+		        }
+		        while (rc == -1 && errno == EINTR);
+		    }
+
+			bool try_wait()
+			{
+				int rc;
+				do {
+					rc = sem_trywait(&m_sema);
+				} while (rc == -1 && errno == EINTR);
+				return !(rc == -1 && errno == EAGAIN);
+			}
+
+			bool timed_wait(std::uint64_t usecs)
+			{
+				struct timespec ts;
+				const int usecs_in_1_sec = 1000000;
+				const int nsecs_in_1_sec = 1000000000;
+				clock_gettime(CLOCK_REALTIME, &ts);
+				ts.tv_sec += usecs / usecs_in_1_sec;
+				ts.tv_nsec += (usecs % usecs_in_1_sec) * 1000;
+				// sem_timedwait bombs if you have more than 1e9 in tv_nsec
+				// so we have to clean things up before passing it in
+				if (ts.tv_nsec > nsecs_in_1_sec) {
+					ts.tv_nsec -= nsecs_in_1_sec;
+					++ts.tv_sec;
+				}
+
+				int rc;
+				do {
+					rc = sem_timedwait(&m_sema, &ts);
+				} while (rc == -1 && errno == EINTR);
+				return !(rc == -1 && errno == ETIMEDOUT);
+			}
+
+		    void signal()
+		    {
+		        sem_post(&m_sema);
+		    }
+
+		    void signal(int count)
+		    {
+		        while (count-- > 0)
+		        {
+		            sem_post(&m_sema);
+		        }
+		    }
+		};
+#else
+#error Unsupported platform! (No semaphore wrapper available)
+#endif
+
+		//---------------------------------------------------------
+		// LightweightSemaphore
+		//---------------------------------------------------------
+		class LightweightSemaphore
+		{
+		public:
+			typedef std::make_signed<std::size_t>::type ssize_t;
+			
+		private:
+		    weak_atomic<ssize_t> m_count;
+		    Semaphore m_sema;
+
+		    bool waitWithPartialSpinning(std::int64_t timeout_usecs = -1)
+		    {
+		        ssize_t oldCount;
+		        // Is there a better way to set the initial spin count?
+		        // If we lower it to 1000, testBenaphore becomes 15x slower on my Core i7-5930K Windows PC,
+		        // as threads start hitting the kernel semaphore.
+		        int spin = 10000;
+		        while (--spin >= 0)
+		        {
+		            if (m_count.load() > 0)
+		            {
+		                m_count.fetch_add_acquire(-1);
+		                return true;
+		            }
+		            compiler_fence(memory_order_acquire);     // Prevent the compiler from collapsing the loop.
+		        }
+		        oldCount = m_count.fetch_add_acquire(-1);
+				if (oldCount > 0)
+					return true;
+		        if (timeout_usecs < 0)
+				{
+					m_sema.wait();
+					return true;
+				}
+				if (m_sema.timed_wait(timeout_usecs))
+					return true;
+				// At this point, we've timed out waiting for the semaphore, but the
+				// count is still decremented indicating we may still be waiting on
+				// it. So we have to re-adjust the count, but only if the semaphore
+				// wasn't signaled enough times for us too since then. If it was, we
+				// need to release the semaphore too.
+				while (true)
+				{
+					oldCount = m_count.fetch_add_release(1);
+					if (oldCount < 0)
+						return false;    // successfully restored things to the way they were
+					// Oh, the producer thread just signaled the semaphore after all. Try again:
+					oldCount = m_count.fetch_add_acquire(-1);
+					if (oldCount > 0 && m_sema.try_wait())
+						return true;
+				}
+		    }
+
+		public:
+		    LightweightSemaphore(ssize_t initialCount = 0) : m_count(initialCount)
+		    {
+		        assert(initialCount >= 0);
+		    }
+
+		    bool tryWait()
+		    {
+		        if (m_count.load() > 0)
+		        {
+		        	m_count.fetch_add_acquire(-1);
+		        	return true;
+		        }
+		        return false;
+		    }
+
+		    void wait()
+		    {
+		        if (!tryWait())
+		            waitWithPartialSpinning();
+		    }
+
+			bool wait(std::int64_t timeout_usecs)
+			{
+				return tryWait() || waitWithPartialSpinning(timeout_usecs);
+			}
+
+		    void signal(ssize_t count = 1)
+		    {
+		    	assert(count >= 0);
+		        ssize_t oldCount = m_count.fetch_add_release(count);
+		        assert(oldCount >= -1);
+		        if (oldCount < 0)
+		        {
+		            m_sema.signal(1);
+		        }
+		    }
+		    
+		    ssize_t availableApprox() const
+		    {
+		    	ssize_t count = m_count.load();
+		    	return count > 0 ? count : 0;
+		    }
+		};
+	}	// end namespace spsc_sema
+}	// end namespace moodycamel
+
+#if defined(AE_VCPP) && (_MSC_VER < 1700 || defined(__cplusplus_cli))
+#pragma warning(pop)
+#ifdef __cplusplus_cli
+#pragma managed(pop)
+#endif
+#endif
diff --git a/extension/context.cpp b/extension/context.cpp
index 27bd98b..8b739e2 100644
--- a/extension/context.cpp
+++ b/extension/context.cpp
@@ -1,91 +1,86 @@
 #include "context.h"
 
-AsyncSocketContext::AsyncSocketContext(IPluginContext* pContext) {
-	this->pContext = pContext;
+CAsyncSocketContext::CAsyncSocketContext(IPluginContext *pContext)
+{
+	this->m_pContext = pContext;
 
+	socket = NULL;
 	stream = NULL;
 
-	connectCallback = NULL;
-	errorCallback = NULL;
-	dataCallback = NULL;
+	m_pConnectCallback = NULL;
+	m_pErrorCallback = NULL;
+	m_pDataCallback = NULL;
 }
 
-AsyncSocketContext::~AsyncSocketContext() {
-	if (connect_req != NULL) {
-		free(connect_req);
-	}
+CAsyncSocketContext::~CAsyncSocketContext()
+{
+	if(socket != NULL)
+		uv_close((uv_handle_t *)socket, NULL);
 
-	if (socket != NULL) {
-		uv_close((uv_handle_t *) socket, NULL);
-	}
+	if(m_pConnectCallback)
+		forwards->ReleaseForward(m_pConnectCallback);
 
-	if (connectCallback) {
-		forwards->ReleaseForward(connectCallback);
-	}
+	if(m_pErrorCallback)
+		forwards->ReleaseForward(m_pErrorCallback);
 
-	if (errorCallback) {
-		forwards->ReleaseForward(errorCallback);
-	}
-
-	if (dataCallback) {
-		forwards->ReleaseForward(dataCallback);
-	}
+	if(m_pDataCallback)
+		forwards->ReleaseForward(m_pDataCallback);
 }
 
-void AsyncSocketContext::Connected() {
-	if (!connectCallback) {
+void CAsyncSocketContext::Connected()
+{
+	if(!m_pConnectCallback)
 		return;
-	}
 
-	connectCallback->PushCell(hndl);
-    connectCallback->Execute(NULL);
+	m_pConnectCallback->PushCell(m_Handle);
+    m_pConnectCallback->Execute(NULL);
 }
 
-void AsyncSocketContext::OnError(int error) {
-	if (!errorCallback) {
+void CAsyncSocketContext::OnError(int error)
+{
+	if(!m_pErrorCallback)
 		return;
-	}
 
-	errorCallback->PushCell(hndl);
-	errorCallback->PushCell(error);
-	errorCallback->PushString(uv_err_name(error));
-	errorCallback->Execute(NULL);
+	m_pErrorCallback->PushCell(m_Handle);
+	m_pErrorCallback->PushCell(error);
+	m_pErrorCallback->PushString(uv_err_name(error));
+	m_pErrorCallback->Execute(NULL);
 }
 
-void AsyncSocketContext::OnData(char* data, ssize_t size) {
-	if (!dataCallback) {
+void CAsyncSocketContext::OnData(char* data, ssize_t size)
+{
+	if(!m_pDataCallback)
 		return;
-	}
 
-	dataCallback->PushCell(hndl);
-	dataCallback->PushString(data);
-	dataCallback->PushCell(size);
-    dataCallback->Execute(NULL);
+	m_pDataCallback->PushCell(m_Handle);
+	m_pDataCallback->PushString(data);
+	m_pDataCallback->PushCell(size);
+    m_pDataCallback->Execute(NULL);
 }
 
-bool AsyncSocketContext::SetConnectCallback(funcid_t function) {
-	if (connectCallback) {
-		forwards->ReleaseForward(connectCallback);
-	}
-	
-	connectCallback = forwards->CreateForwardEx(NULL, ET_Single, 1, NULL, Param_Cell);
-	return connectCallback->AddFunction(pContext, function);
+bool CAsyncSocketContext::SetConnectCallback(funcid_t function)
+{
+	if(m_pConnectCallback)
+		forwards->ReleaseForward(m_pConnectCallback);
+
+	m_pConnectCallback = forwards->CreateForwardEx(NULL, ET_Single, 1, NULL, Param_Cell);
+	return m_pConnectCallback->AddFunction(m_pContext, function);
 }
 
-bool AsyncSocketContext::SetErrorCallback(funcid_t function) {
-	if (connectCallback) {
-		forwards->ReleaseForward(errorCallback);
-	}
+bool CAsyncSocketContext::SetErrorCallback(funcid_t function)
+{
+	if(m_pConnectCallback)
+		forwards->ReleaseForward(m_pErrorCallback);
 
-	errorCallback = forwards->CreateForwardEx(NULL, ET_Single, 3, NULL, Param_Cell, Param_Cell, Param_String);
-	return errorCallback->AddFunction(pContext, function);
+	m_pErrorCallback = forwards->CreateForwardEx(NULL, ET_Single, 3, NULL, Param_Cell, Param_Cell, Param_String);
+	return m_pErrorCallback->AddFunction(m_pContext, function);
 }
 
-bool AsyncSocketContext::SetDataCallback(funcid_t function) {
-	if (dataCallback) {
-		forwards->ReleaseForward(dataCallback);
-	}
-	
-	dataCallback = forwards->CreateForwardEx(NULL, ET_Single, 3, NULL, Param_Cell, Param_String, Param_Cell);
-	return dataCallback->AddFunction(pContext, function);
-}
\ No newline at end of file
+bool CAsyncSocketContext::SetDataCallback(funcid_t function)
+{
+	if(m_pDataCallback)
+		forwards->ReleaseForward(m_pDataCallback);
+
+	m_pDataCallback = forwards->CreateForwardEx(NULL, ET_Single, 3, NULL, Param_Cell, Param_String, Param_Cell);
+	return m_pDataCallback->AddFunction(m_pContext, function);
+}
diff --git a/extension/context.h b/extension/context.h
index d382a3c..b8efd60 100644
--- a/extension/context.h
+++ b/extension/context.h
@@ -6,36 +6,35 @@
 
 #include "smsdk_ext.h"
 
-class AsyncSocketContext {
+class CAsyncSocketContext
+{
 public:
-    IPluginContext* pContext;
+    IPluginContext *m_pContext;
+	Handle_t m_Handle;
 
-	Handle_t hndl;
+	char *m_pHost;
+	int m_Port;
 
-	char* host;
-	int port;
-
-    IChangeableForward *connectCallback;
-	IChangeableForward *errorCallback;
-	IChangeableForward *dataCallback;
+    IChangeableForward *m_pConnectCallback;
+	IChangeableForward *m_pErrorCallback;
+	IChangeableForward *m_pDataCallback;
 
 	uv_getaddrinfo_t resolver;
-	uv_connect_t* connect_req;
-	uv_tcp_t* socket;
-	uv_stream_t* stream;
+	uv_tcp_t *socket;
+	uv_stream_t *stream;
+
+    CAsyncSocketContext(IPluginContext *plugin);
+    ~CAsyncSocketContext();
 
-    AsyncSocketContext(IPluginContext* plugin);
-    ~AsyncSocketContext();
-	
 	void Connected();
 
 	void OnError(int error);
 
-	void OnData(char* data, ssize_t size);
+	void OnData(char *data, ssize_t size);
 
 	bool SetConnectCallback(funcid_t function);
 	bool SetErrorCallback(funcid_t function);
 	bool SetDataCallback(funcid_t function);
 };
 
-#endif
\ No newline at end of file
+#endif
diff --git a/extension/extension.cpp b/extension/extension.cpp
index 5ed10a2..96d3dc8 100644
--- a/extension/extension.cpp
+++ b/extension/extension.cpp
@@ -8,7 +8,7 @@
  * This program is free software; you can redistribute it and/or modify it under
  * the terms of the GNU General Public License, version 3.0, as published by the
  * Free Software Foundation.
- * 
+ *
  * This program is distributed in the hope that it will be useful, but WITHOUT
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
@@ -30,8 +30,8 @@
  */
 
 #include "extension.h"
-#include "queue.h"
 #include "context.h"
+#include "readerwriterqueue.h"
 #include <uv.h>
 
 /**
@@ -39,277 +39,190 @@
  * @brief Implement extension code here.
  */
 
-LockedQueue<AsyncSocketContext*> g_connect_queue;
-LockedQueue<socket_data_t*> g_data_queue;
-LockedQueue<error_data_t*> g_error_queue;
+moodycamel::ReaderWriterQueue<CAsyncSocketContext *> g_ConnectQueue;
+moodycamel::ReaderWriterQueue<CSocketError *> g_ErrorQueue;
+moodycamel::ReaderWriterQueue<CSocketData *> g_DataQueue;
 
-uv_loop_t *loop;
+uv_loop_t *g_UV_Loop;
+uv_thread_t g_UV_LoopThread;
 
-uv_thread_t loop_thread;
-
-uv_async_t g_async_resolve;
-uv_async_t g_async_write;
+uv_async_t g_UV_AsyncAdded;
+moodycamel::ReaderWriterQueue<CAsyncAddJob> g_AsyncAddQueue;
 
 AsyncSocket g_AsyncSocket;		/**< Global singleton for extension's main interface */
 
 SMEXT_LINK(&g_AsyncSocket);
 
-void push_error(AsyncSocketContext *ctx, int error);
-
-AsyncSocketContext* AsyncSocket::GetSocketInstanceByHandle(Handle_t handle) {
+CAsyncSocketContext *AsyncSocket::GetSocketInstanceByHandle(Handle_t handle)
+{
 	HandleSecurity sec;
 	sec.pOwner = NULL;
 	sec.pIdentity = myself->GetIdentity();
-	
-	AsyncSocketContext *client;
 
-	if (handlesys->ReadHandle(handle, socketHandleType, &sec, (void**)&client) != HandleError_None)
+	CAsyncSocketContext *pContext;
+
+	if(handlesys->ReadHandle(handle, socketHandleType, &sec, (void **)&pContext) != HandleError_None)
 		return NULL;
 
-	return client;
+	return pContext;
 }
 
-void AsyncSocket::OnHandleDestroy(HandleType_t type, void *object) {
-	if(object != NULL) {
-		AsyncSocketContext *ctx = (AsyncSocketContext *) object;
-
-		delete ctx;
+void AsyncSocket::OnHandleDestroy(HandleType_t type, void *object)
+{
+	if(object != NULL)
+	{
+		CAsyncSocketContext *pContext = (CAsyncSocketContext *)object;
+		delete pContext;
 	}
 }
 
-void OnGameFrame(bool simulating) {
-	if (!g_connect_queue.Empty()) {
-		g_connect_queue.Lock();
-		while(!g_connect_queue.Empty()) {
-			g_connect_queue.Pop()->Connected();
-		}
-		g_connect_queue.Unlock();
+void OnGameFrame(bool simulating)
+{
+	CAsyncSocketContext *pContext;
+	while(g_ConnectQueue.try_dequeue(pContext))
+	{
+		pContext->Connected();
 	}
 
-	if (!g_error_queue.Empty()) {
-		g_error_queue.Lock();
-		while(!g_error_queue.Empty()) {
-			error_data_t *err = g_error_queue.Pop();
+	CSocketError *pError;
+	while(g_ErrorQueue.try_dequeue(pError))
+	{
+		pError->pAsyncContext->OnError(pError->Error);
 
-			err->ctx->OnError(err->err);
-
-			free(err);
-		}
-		g_error_queue.Unlock();
+		free(pError);
 	}
 
-	if (!g_data_queue.Empty()) {
-		g_data_queue.Lock();
-		while(!g_data_queue.Empty()) {
-			socket_data_t *data = g_data_queue.Pop();
+	CSocketData *pData;
+	while(g_DataQueue.try_dequeue(pData))
+	{
+		pData->pAsyncContext->OnData(pData->pBuffer, pData->BufferSize);
 
-			data->ctx->OnData(data->buf, data->size);
+		free(pData->pBuffer);
+		free(pData);
+	}
+}
 
-			free(data->buf);
-			free(data);
-		}
-		g_data_queue.Unlock();
+void UV_OnAsyncAdded(uv_async_t *pHandle)
+{
+	CAsyncAddJob Job;
+	while(g_AsyncAddQueue.try_dequeue(Job))
+	{
+		uv_async_t *pAsync = (uv_async_t *)malloc(sizeof(uv_async_t));
+		uv_async_init(g_UV_Loop, pAsync, Job.CallbackFn);
+		pAsync->data = Job.pData;
+		uv_async_send(pAsync);
 	}
 }
 
 // main event loop thread
-void EventLoop(void* data) {
-	uv_run(loop, UV_RUN_DEFAULT);
+void UV_EventLoop(void *data)
+{
+	uv_run(g_UV_Loop, UV_RUN_DEFAULT);
 }
 
-void alloc_buffer(uv_handle_t *handle, size_t suggested_size, uv_buf_t *buf) {
-	buf->base = (char*) malloc(suggested_size);
+void UV_AllocBuffer(uv_handle_t *handle, size_t suggested_size, uv_buf_t *buf)
+{
+	buf->base = (char *)malloc(suggested_size);
 	buf->len = suggested_size;
 }
 
-void on_read(uv_stream_t *client, ssize_t nread, const uv_buf_t *buf) {
-	if (nread < 0) {
-		push_error((AsyncSocketContext*) client->data, nread);
-		// Should we decide to close the socket? For now let's let the plugin handle errors, including EOF.
+void UV_HandleCleanup(uv_handle_t *handle)
+{
+	free(handle);
+}
+
+void UV_PushError(CAsyncSocketContext *pContext, int error)
+{
+	CSocketError *pError = (CSocketError *)malloc(sizeof(CSocketError));
+
+	pError->pAsyncContext = pContext;
+	pError->Error = error;
+
+	g_ErrorQueue.enqueue(pError);
+}
+
+void UV_OnRead(uv_stream_t *client, ssize_t nread, const uv_buf_t *buf)
+{
+	CAsyncSocketContext *pContext = (CAsyncSocketContext *)client->data;
+	if(nread < 0)
+	{
+		// Connection closed
+		uv_close((uv_handle_t *)client, NULL);
+		pContext->socket = NULL;
+
+		UV_PushError((CAsyncSocketContext *)client->data, nread);
 		return;
 	}
 
-	char *data = (char*) malloc(sizeof(char) * (nread+1));
-	data[nread] = '\0';
+	char *data = (char *)malloc(sizeof(char) * (nread + 1));
+	data[nread] = 0;
 	strncpy(data, buf->base, nread);
 
-	socket_data_t *s = (socket_data_t *) malloc(sizeof(socket_data_t));
+	CSocketData *pData = (CSocketData *)malloc(sizeof(CSocketData));
+	pData->pAsyncContext = pContext;
+	pData->pBuffer = data;
+	pData->BufferSize = nread;
 
-	s->ctx = static_cast<AsyncSocketContext*>(client->data);
-	s->buf = data;
-	s->size = nread;
-
-	g_data_queue.Lock();
-	g_data_queue.Push(s);
-	g_data_queue.Unlock();
+	g_DataQueue.enqueue(pData);
 
 	free(buf->base);
 }
 
-void on_connect(uv_connect_t *req, int status) {
-	AsyncSocketContext *ctx = (AsyncSocketContext*) req->data;
+void UV_OnConnect(uv_connect_t *req, int status)
+{
+	CAsyncSocketContext *pContext = (CAsyncSocketContext *)req->data;
 
-	if (status < 0) {
-		push_error(ctx, status);
+	if(status < 0)
+	{
+		UV_PushError(pContext, status);
 		return;
 	}
 
-	ctx->connect_req = NULL;
-	ctx->stream = req->handle;
+	pContext->stream = req->handle;
 
-	g_connect_queue.Lock();
-	g_connect_queue.Push(ctx);
-	g_connect_queue.Unlock();
+	g_ConnectQueue.enqueue(pContext);
 
 	req->handle->data = req->data;
+	free(req);
 
-	uv_read_start(ctx->stream, alloc_buffer, on_read);
+	uv_read_start(pContext->stream, UV_AllocBuffer, UV_OnRead);
 }
 
-void push_error(AsyncSocketContext *ctx, int error) {
-	error_data_t *err = (error_data_t*) malloc(sizeof(error_data_t));
+void UV_OnAsyncResolved(uv_getaddrinfo_t *resolver, int status, struct addrinfo *res)
+{
+	free(resolver->service);
+	CAsyncSocketContext *pContext = (CAsyncSocketContext *) resolver->data;
 
-	err->ctx = ctx;
-	err->err = error;
-
-	g_error_queue.Lock();
-	g_error_queue.Push(err);
-	g_error_queue.Unlock();
-}
-
-void on_resolved(uv_getaddrinfo_t *resolver, int status, struct addrinfo *res) {
-	AsyncSocketContext *ctx = (AsyncSocketContext *) resolver->data;
-
-	if (status < 0) {
-		push_error(ctx, status);
+	if(status < 0)
+	{
+		UV_PushError(pContext, status);
 		return;
 	}
 
-	uv_connect_t *connect_req = (uv_connect_t*) malloc(sizeof(uv_connect_t));
-	uv_tcp_t *socket = (uv_tcp_t*) malloc(sizeof(uv_tcp_t));
+	uv_connect_t *connect_req = (uv_connect_t *)malloc(sizeof(uv_connect_t));
+	uv_tcp_t *socket = (uv_tcp_t *)malloc(sizeof(uv_tcp_t));
 
-	ctx->connect_req = connect_req;
-	ctx->socket = socket;
+	pContext->socket = socket;
+	connect_req->data = pContext;
 
-	connect_req->data = ctx;
+	char addr[32] = {0};
+	uv_ip4_name((struct sockaddr_in *)res->ai_addr, addr, sizeof(addr));
 
-	char addr[17] = {'\0'};
-	uv_ip4_name((struct sockaddr_in*) res->ai_addr, addr, 16);
-
-	uv_tcp_init(loop, socket);
-
-	uv_tcp_connect(connect_req, socket, (const struct sockaddr*) res->ai_addr, on_connect);
+	uv_tcp_init(g_UV_Loop, socket);
+	uv_tcp_connect(connect_req, socket, (const struct sockaddr*) res->ai_addr, UV_OnConnect);
 
 	uv_freeaddrinfo(res);
 }
 
-cell_t Socket_Create(IPluginContext *pContext, const cell_t *params) {
-	AsyncSocketContext *ctx = new AsyncSocketContext(pContext);
+void UV_OnAsyncResolve(uv_async_t *handle)
+{
+	CAsyncSocketContext *pAsyncContext = (CAsyncSocketContext *)handle->data;
+	uv_close((uv_handle_t *)handle, UV_HandleCleanup);
 
-	ctx->hndl = handlesys->CreateHandle(g_AsyncSocket.socketHandleType, ctx, pContext->GetIdentity(), myself->GetIdentity(), NULL);
+	pAsyncContext->resolver.data = pAsyncContext;
 
-	return ctx->hndl;
-}
-
-cell_t Socket_Connect(IPluginContext *pContext, const cell_t *params) {
-	AsyncSocketContext *ctx = g_AsyncSocket.GetSocketInstanceByHandle(params[1]);
-
-	if (ctx == NULL) {
-		return pContext->ThrowNativeError("Invalid socket handle");
-	}
-
-	if (params[3] < 0 || params[3] > 65535) {
-		return pContext->ThrowNativeError("Invalid port specified");
-	}
-	
-	char *address = NULL;
-	pContext->LocalToString(params[2], &address);
-
-	ctx->host = address;
-	ctx->port = params[3];
-
-	g_async_resolve.data = ctx;
-	uv_async_send(&g_async_resolve);
-
-	return 1;
-}
-
-cell_t Socket_Write(IPluginContext *pContext, const cell_t *params) {
-	AsyncSocketContext *ctx = g_AsyncSocket.GetSocketInstanceByHandle(params[1]);
-
-	if (ctx == NULL) {
-		return pContext->ThrowNativeError("Invalid socket handle");
-	}
-
-	char *data = NULL;
-	pContext->LocalToString(params[2], &data);
-
-	uv_buf_t* buffer = (uv_buf_t *) malloc(sizeof(uv_buf_t));
-
-	buffer->base = strdup(data);
-	buffer->len = strlen(data);
-
-	socket_write_t *write = (socket_write_t *) malloc(sizeof(socket_write_t));
-
-	write->ctx = ctx;
-	write->buf = buffer;
-
-	g_async_write.data = write;
-	uv_async_send(&g_async_write);
-
-	return 1;
-}
-
-cell_t Socket_SetConnectCallback(IPluginContext *pContext, const cell_t *params) {
-	AsyncSocketContext *ctx = g_AsyncSocket.GetSocketInstanceByHandle(params[1]);
-
-	if (ctx == NULL) {
-		return pContext->ThrowNativeError("Invalid socket handle");
-	}
-
-	if (!ctx->SetConnectCallback(params[2])) {
-		return pContext->ThrowNativeError("Invalid callback");
-	}
-
-	return true;
-}
-
-cell_t Socket_SetErrorCallback(IPluginContext *pContext, const cell_t *params) {
-	AsyncSocketContext *ctx = g_AsyncSocket.GetSocketInstanceByHandle(params[1]);
-
-	if (ctx == NULL) {
-		return pContext->ThrowNativeError("Invalid socket handle");
-	}
-
-	if (!ctx->SetErrorCallback(params[2])) {
-		return pContext->ThrowNativeError("Invalid callback");
-	}
-
-	return true;
-}
-
-cell_t Socket_SetDataCallback(IPluginContext *pContext, const cell_t *params) {
-	AsyncSocketContext *ctx = g_AsyncSocket.GetSocketInstanceByHandle(params[1]);
-
-	if (ctx == NULL) {
-		return pContext->ThrowNativeError("Invalid socket handle");
-	}
-
-	if (!ctx->SetDataCallback(params[2])) {
-		return pContext->ThrowNativeError("Invalid callback");
-	}
-
-	return true;
-}
-
-void async_resolve(uv_async_t *handle) {
-	AsyncSocketContext *ctx = static_cast<AsyncSocketContext *>(handle->data);
-
-	ctx->resolver.data = ctx;
-	
-	char *service = (char *) malloc(sizeof(char) * 6);
-
-	sprintf(service, "%d", ctx->port);
+	char *service = (char *)malloc(8);
+	sprintf(service, "%d", pAsyncContext->m_Port);
 
 	struct addrinfo hints;
 	hints.ai_family = PF_INET;
@@ -317,47 +230,151 @@ void async_resolve(uv_async_t *handle) {
 	hints.ai_protocol = IPPROTO_TCP;
 	hints.ai_flags = 0;
 
-	int r = uv_getaddrinfo(loop, &ctx->resolver, on_resolved, ctx->host, service, &hints);
-
-	if (r) {
-		push_error(ctx, r);
-	}
+	int err = uv_getaddrinfo(g_UV_Loop, &pAsyncContext->resolver, UV_OnAsyncResolved, pAsyncContext->m_pHost, service, &hints);
+	if(err)
+		UV_PushError(pAsyncContext, err);
 }
 
-void async_write_cb(uv_write_t* req, int status) {
-	socket_write_t *data = (socket_write_t *) req->data;
-
-	if (data->buf->base) {
-		free(data->buf->base);
-	}
-
-	free(data->buf);
-
-	free(data);
+void UV_OnAsyncWriteCleanup(uv_write_t *req, int status)
+{
+	CAsyncWrite *pWrite = (CAsyncWrite *)req->data;
 
+	free(pWrite->pBuffer->base);
+	free(pWrite->pBuffer);
+	free(pWrite);
 	free(req);
 }
 
-void async_write(uv_async_t *handle) {
-	socket_write_t *data = (socket_write_t *) handle->data;
+void UV_OnAsyncWrite(uv_async_t *handle)
+{
+	CAsyncWrite *pWrite = (CAsyncWrite *)handle->data;
+	uv_close((uv_handle_t *)handle, UV_HandleCleanup);
 
-	if (data == NULL || data->buf == NULL) {
+	if(pWrite == NULL || pWrite->pBuffer == NULL)
+		return;
+
+	if(pWrite->pAsyncContext == NULL || pWrite->pAsyncContext->stream == NULL)
+	{
+		free(pWrite->pBuffer->base);
+		free(pWrite->pBuffer);
+		free(pWrite);
 		return;
 	}
 
-	if (data->ctx == NULL || data->ctx->stream == NULL) {
-		return;
-	}
+	uv_write_t *req = (uv_write_t *)malloc(sizeof(uv_write_t));
+	req->data = pWrite;
 
-	uv_write_t* req = (uv_write_t *) malloc(sizeof(uv_write_t));
+	uv_write(req, pWrite->pAsyncContext->stream, pWrite->pBuffer, 1, UV_OnAsyncWriteCleanup);
+}
 
-	req->data = data;
+cell_t Native_AsyncSocket_Create(IPluginContext *pContext, const cell_t *params)
+{
+	CAsyncSocketContext *pAsyncContext = new CAsyncSocketContext(pContext);
 
-	uv_write(req, data->ctx->stream, data->buf, 1, async_write_cb);
+	pAsyncContext->m_Handle = handlesys->CreateHandle(g_AsyncSocket.socketHandleType, pAsyncContext,
+		pContext->GetIdentity(), myself->GetIdentity(), NULL);
+
+	return pAsyncContext->m_Handle;
+}
+
+cell_t Native_AsyncSocket_Connect(IPluginContext *pContext, const cell_t *params)
+{
+	CAsyncSocketContext *pAsyncContext = g_AsyncSocket.GetSocketInstanceByHandle(params[1]);
+
+	if(pAsyncContext == NULL)
+		return pContext->ThrowNativeError("Invalid socket handle");
+
+	if(params[3] < 0 || params[3] > 65535)
+		return pContext->ThrowNativeError("Invalid port specified");
+
+	char *address = NULL;
+	pContext->LocalToString(params[2], &address);
+
+	pAsyncContext->m_pHost = address;
+	pAsyncContext->m_Port = params[3];
+
+	CAsyncAddJob Job;
+	Job.CallbackFn = UV_OnAsyncResolve;
+	Job.pData = pAsyncContext;
+	g_AsyncAddQueue.enqueue(Job);
+
+	uv_async_send(&g_UV_AsyncAdded);
+
+	return 1;
+}
+
+cell_t Native_AsyncSocket_Write(IPluginContext *pContext, const cell_t *params)
+{
+	CAsyncSocketContext *pAsyncContext = g_AsyncSocket.GetSocketInstanceByHandle(params[1]);
+
+	if(pAsyncContext == NULL)
+		return pContext->ThrowNativeError("Invalid socket handle");
+
+	char *data = NULL;
+	pContext->LocalToString(params[2], &data);
+
+	uv_buf_t* buffer = (uv_buf_t *)malloc(sizeof(uv_buf_t));
+
+	buffer->base = strdup(data);
+	buffer->len = strlen(data);
+
+	CAsyncWrite *pWrite = (CAsyncWrite *)malloc(sizeof(CAsyncWrite));
+
+	pWrite->pAsyncContext = pAsyncContext;
+	pWrite->pBuffer = buffer;
+
+	CAsyncAddJob Job;
+	Job.CallbackFn = UV_OnAsyncWrite;
+	Job.pData = pWrite;
+	g_AsyncAddQueue.enqueue(Job);
+
+	uv_async_send(&g_UV_AsyncAdded);
+
+	return 1;
+}
+
+cell_t Native_AsyncSocket_SetConnectCallback(IPluginContext *pContext, const cell_t *params)
+{
+	CAsyncSocketContext *pAsyncContext = g_AsyncSocket.GetSocketInstanceByHandle(params[1]);
+
+	if(pAsyncContext == NULL)
+		return pContext->ThrowNativeError("Invalid socket handle");
+
+	if(!pAsyncContext->SetConnectCallback(params[2]))
+		return pContext->ThrowNativeError("Invalid callback");
+
+	return true;
+}
+
+cell_t Native_AsyncSocket_SetErrorCallback(IPluginContext *pContext, const cell_t *params)
+{
+	CAsyncSocketContext *pAsyncContext = g_AsyncSocket.GetSocketInstanceByHandle(params[1]);
+
+	if(pAsyncContext == NULL)
+		return pContext->ThrowNativeError("Invalid socket handle");
+
+	if(!pAsyncContext->SetErrorCallback(params[2]))
+		return pContext->ThrowNativeError("Invalid callback");
+
+	return true;
+}
+
+cell_t Native_AsyncSocket_SetDataCallback(IPluginContext *pContext, const cell_t *params)
+{
+	CAsyncSocketContext *pAsyncContext = g_AsyncSocket.GetSocketInstanceByHandle(params[1]);
+
+	if(pAsyncContext == NULL)
+		return pContext->ThrowNativeError("Invalid socket handle");
+
+	if(!pAsyncContext->SetDataCallback(params[2]))
+		return pContext->ThrowNativeError("Invalid callback");
+
+	return true;
 }
 
 // Sourcemod Plugin Events
-bool AsyncSocket::SDK_OnLoad(char *error, size_t maxlength, bool late) {
+bool AsyncSocket::SDK_OnLoad(char *error, size_t maxlength, bool late)
+{
 	sharesys->AddNatives(myself, AsyncSocketNatives);
 	sharesys->RegisterLibrary(myself, "async_socket");
 
@@ -365,32 +382,34 @@ bool AsyncSocket::SDK_OnLoad(char *error, size_t maxlength, bool late) {
 
 	smutils->AddGameFrameHook(OnGameFrame);
 
-	loop = uv_default_loop();
+	g_UV_Loop = uv_default_loop();
 
-	uv_async_init(loop, &g_async_resolve, async_resolve);
-	uv_async_init(loop, &g_async_write, async_write);
+	uv_async_init(g_UV_Loop, &g_UV_AsyncAdded, UV_OnAsyncAdded);
+
+	uv_thread_create(&g_UV_LoopThread, UV_EventLoop, NULL);
 
-	uv_thread_create(&loop_thread, EventLoop, NULL);
-	
 	return true;
 }
 
-void AsyncSocket::SDK_OnUnload() {
+void AsyncSocket::SDK_OnUnload()
+{
 	handlesys->RemoveType(socketHandleType, NULL);
 
-	uv_thread_join(&loop_thread);
+	uv_close((uv_handle_t *)&g_UV_AsyncAdded, NULL);
 
-	uv_loop_close(loop);
+	uv_thread_join(&g_UV_LoopThread);
+
+	uv_loop_close(g_UV_Loop);
 
 	smutils->RemoveGameFrameHook(OnGameFrame);
 }
 
 const sp_nativeinfo_t AsyncSocketNatives[] = {
-	{"AsyncSocket.AsyncSocket", Socket_Create},
-	{"AsyncSocket.Connect", Socket_Connect},
-	{"AsyncSocket.Write", Socket_Write},
-	{"AsyncSocket.SetConnectCallback", Socket_SetConnectCallback},
-	{"AsyncSocket.SetErrorCallback", Socket_SetErrorCallback},
-	{"AsyncSocket.SetDataCallback", Socket_SetDataCallback},
+	{"AsyncSocket.AsyncSocket", Native_AsyncSocket_Create},
+	{"AsyncSocket.Connect", Native_AsyncSocket_Connect},
+	{"AsyncSocket.Write", Native_AsyncSocket_Write},
+	{"AsyncSocket.SetConnectCallback", Native_AsyncSocket_SetConnectCallback},
+	{"AsyncSocket.SetErrorCallback", Native_AsyncSocket_SetErrorCallback},
+	{"AsyncSocket.SetDataCallback", Native_AsyncSocket_SetDataCallback},
 	{NULL, NULL}
-};
\ No newline at end of file
+};
diff --git a/extension/extension.h b/extension/extension.h
index fb6a840..afa68d9 100644
--- a/extension/extension.h
+++ b/extension/extension.h
@@ -8,7 +8,7 @@
  * This program is free software; you can redistribute it and/or modify it under
  * the terms of the GNU General Public License, version 3.0, as published by the
  * Free Software Foundation.
- * 
+ *
  * This program is distributed in the hope that it will be useful, but WITHOUT
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
@@ -42,20 +42,29 @@
 #include "smsdk_ext.h"
 #include "context.h"
 
-struct socket_write_t {
-	AsyncSocketContext *ctx;
-	uv_buf_t* buf;
+struct CAsyncAddJob
+{
+	uv_async_cb CallbackFn;
+	void *pData;
 };
 
-struct socket_data_t {
-	AsyncSocketContext *ctx;
-	char* buf;
-	ssize_t size;
+struct CAsyncWrite
+{
+	CAsyncSocketContext *pAsyncContext;
+	uv_buf_t *pBuffer;
 };
 
-struct error_data_t {
-	AsyncSocketContext *ctx;
-	int err;
+struct CSocketData
+{
+	CAsyncSocketContext *pAsyncContext;
+	char *pBuffer;
+	ssize_t BufferSize;
+};
+
+struct CSocketError
+{
+	CAsyncSocketContext *pAsyncContext;
+	int Error;
 };
 
 /**
@@ -74,7 +83,7 @@ public:
 	 * @return			True to succeed loading, false to fail.
 	 */
 	virtual bool SDK_OnLoad(char *error, size_t maxlength, bool late);
-	
+
 	/**
 	 * @brief This is called right before the extension is unloaded.
 	 */
@@ -135,7 +144,7 @@ public:
 public:
 	HandleType_t socketHandleType;
 
-	AsyncSocketContext* GetSocketInstanceByHandle(Handle_t handle);
+	CAsyncSocketContext* GetSocketInstanceByHandle(Handle_t handle);
 public:
 	void OnHandleDestroy(HandleType_t type, void *object);
 };
diff --git a/extension/libuv/.gitignore b/extension/libuv/.gitignore
new file mode 100644
index 0000000..d6b7ef3
--- /dev/null
+++ b/extension/libuv/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
diff --git a/extension/queue.h b/extension/queue.h
deleted file mode 100644
index cc5cbbd..0000000
--- a/extension/queue.h
+++ /dev/null
@@ -1,44 +0,0 @@
-#ifndef ASYNC_QUEUE_H
-#define ASYNC_QUEUE_H
-
-#include <deque>
-#include <uv.h>
-
-template <class T> 
-class LockedQueue {
-    uv_mutex_t lock;
-    std::deque<T> queue;
-    
-public:    
-    LockedQueue() {
-        uv_mutex_init(&lock);
-    }
-    
-    ~LockedQueue() {
-        uv_mutex_destroy(&lock);
-    }
-    
-    void Lock() {
-        uv_mutex_lock(&lock);
-    }
-    
-    void Unlock() {
-        uv_mutex_unlock(&lock);
-    }
-    
-    T Pop() {
-        T output = queue.front();
-        queue.pop_front();
-        return output;
-    }
-    
-    void Push(T item) {
-        queue.push_back(item);
-    }
-    
-    bool Empty() {
-        return queue.empty();
-    }
-};
-
-#endif
\ No newline at end of file
diff --git a/extension/readerwriterqueue.h b/extension/readerwriterqueue.h
new file mode 100644
index 0000000..ec465d6
--- /dev/null
+++ b/extension/readerwriterqueue.h
@@ -0,0 +1,815 @@
+// ©2013-2016 Cameron Desrochers.
+// Distributed under the simplified BSD license (see the license file that
+// should have come with this header).
+
+#pragma once
+
+#include "atomicops.h"
+#include <type_traits>
+#include <utility>
+#include <cassert>
+#include <stdexcept>
+#include <new>
+#include <cstdint>
+#include <cstdlib>		// For malloc/free/abort & size_t
+#if __cplusplus > 199711L || _MSC_VER >= 1700 // C++11 or VS2012
+#include <chrono>
+#endif
+
+
+// A lock-free queue for a single-consumer, single-producer architecture.
+// The queue is also wait-free in the common path (except if more memory
+// needs to be allocated, in which case malloc is called).
+// Allocates memory sparingly (O(lg(n) times, amortized), and only once if
+// the original maximum size estimate is never exceeded.
+// Tested on x86/x64 processors, but semantics should be correct for all
+// architectures (given the right implementations in atomicops.h), provided
+// that aligned integer and pointer accesses are naturally atomic.
+// Note that there should only be one consumer thread and producer thread;
+// Switching roles of the threads, or using multiple consecutive threads for
+// one role, is not safe unless properly synchronized.
+// Using the queue exclusively from one thread is fine, though a bit silly.
+
+#ifndef MOODYCAMEL_CACHE_LINE_SIZE
+#define MOODYCAMEL_CACHE_LINE_SIZE 64
+#endif
+
+#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED
+#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__))
+#define MOODYCAMEL_EXCEPTIONS_ENABLED
+#endif
+#endif
+
+#ifdef AE_VCPP
+#pragma warning(push)
+#pragma warning(disable: 4324)	// structure was padded due to __declspec(align())
+#pragma warning(disable: 4820)	// padding was added
+#pragma warning(disable: 4127)	// conditional expression is constant
+#endif
+
+namespace moodycamel {
+
+template<typename T, size_t MAX_BLOCK_SIZE = 512>
+class ReaderWriterQueue
+{
+	// Design: Based on a queue-of-queues. The low-level queues are just
+	// circular buffers with front and tail indices indicating where the
+	// next element to dequeue is and where the next element can be enqueued,
+	// respectively. Each low-level queue is called a "block". Each block
+	// wastes exactly one element's worth of space to keep the design simple
+	// (if front == tail then the queue is empty, and can't be full).
+	// The high-level queue is a circular linked list of blocks; again there
+	// is a front and tail, but this time they are pointers to the blocks.
+	// The front block is where the next element to be dequeued is, provided
+	// the block is not empty. The back block is where elements are to be
+	// enqueued, provided the block is not full.
+	// The producer thread owns all the tail indices/pointers. The consumer
+	// thread owns all the front indices/pointers. Both threads read each
+	// other's variables, but only the owning thread updates them. E.g. After
+	// the consumer reads the producer's tail, the tail may change before the
+	// consumer is done dequeuing an object, but the consumer knows the tail
+	// will never go backwards, only forwards.
+	// If there is no room to enqueue an object, an additional block (of
+	// equal size to the last block) is added. Blocks are never removed.
+
+public:
+	// Constructs a queue that can hold maxSize elements without further
+	// allocations. If more than MAX_BLOCK_SIZE elements are requested,
+	// then several blocks of MAX_BLOCK_SIZE each are reserved (including
+	// at least one extra buffer block).
+	explicit ReaderWriterQueue(size_t maxSize = 15)
+#ifndef NDEBUG
+		: enqueuing(false)
+		,dequeuing(false)
+#endif
+	{
+		assert(maxSize > 0);
+		assert(MAX_BLOCK_SIZE == ceilToPow2(MAX_BLOCK_SIZE) && "MAX_BLOCK_SIZE must be a power of 2");
+		assert(MAX_BLOCK_SIZE >= 2 && "MAX_BLOCK_SIZE must be at least 2");
+		
+		Block* firstBlock = nullptr;
+		
+		largestBlockSize = ceilToPow2(maxSize + 1);		// We need a spare slot to fit maxSize elements in the block
+		if (largestBlockSize > MAX_BLOCK_SIZE * 2) {
+			// We need a spare block in case the producer is writing to a different block the consumer is reading from, and
+			// wants to enqueue the maximum number of elements. We also need a spare element in each block to avoid the ambiguity
+			// between front == tail meaning "empty" and "full".
+			// So the effective number of slots that are guaranteed to be usable at any time is the block size - 1 times the
+			// number of blocks - 1. Solving for maxSize and applying a ceiling to the division gives us (after simplifying):
+			size_t initialBlockCount = (maxSize + MAX_BLOCK_SIZE * 2 - 3) / (MAX_BLOCK_SIZE - 1);
+			largestBlockSize = MAX_BLOCK_SIZE;
+			Block* lastBlock = nullptr;
+			for (size_t i = 0; i != initialBlockCount; ++i) {
+				auto block = make_block(largestBlockSize);
+				if (block == nullptr) {
+#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED
+					throw std::bad_alloc();
+#else
+					abort();
+#endif
+				}
+				if (firstBlock == nullptr) {
+					firstBlock = block;
+				}
+				else {
+					lastBlock->next = block;
+				}
+				lastBlock = block;
+				block->next = firstBlock;
+			}
+		}
+		else {
+			firstBlock = make_block(largestBlockSize);
+			if (firstBlock == nullptr) {
+#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED
+				throw std::bad_alloc();
+#else
+				abort();
+#endif
+			}
+			firstBlock->next = firstBlock;
+		}
+		frontBlock = firstBlock;
+		tailBlock = firstBlock;
+		
+		// Make sure the reader/writer threads will have the initialized memory setup above:
+		fence(memory_order_sync);
+	}
+
+	// Note: The queue should not be accessed concurrently while it's
+	// being deleted. It's up to the user to synchronize this.
+	~ReaderWriterQueue()
+	{
+		// Make sure we get the latest version of all variables from other CPUs:
+		fence(memory_order_sync);
+
+		// Destroy any remaining objects in queue and free memory
+		Block* frontBlock_ = frontBlock;
+		Block* block = frontBlock_;
+		do {
+			Block* nextBlock = block->next;
+			size_t blockFront = block->front;
+			size_t blockTail = block->tail;
+
+			for (size_t i = blockFront; i != blockTail; i = (i + 1) & block->sizeMask) {
+				auto element = reinterpret_cast<T*>(block->data + i * sizeof(T));
+				element->~T();
+				(void)element;
+			}
+			
+			auto rawBlock = block->rawThis;
+			block->~Block();
+			std::free(rawBlock);
+			block = nextBlock;
+		} while (block != frontBlock_);
+	}
+
+
+	// Enqueues a copy of element if there is room in the queue.
+	// Returns true if the element was enqueued, false otherwise.
+	// Does not allocate memory.
+	AE_FORCEINLINE bool try_enqueue(T const& element)
+	{
+		return inner_enqueue<CannotAlloc>(element);
+	}
+
+	// Enqueues a moved copy of element if there is room in the queue.
+	// Returns true if the element was enqueued, false otherwise.
+	// Does not allocate memory.
+	AE_FORCEINLINE bool try_enqueue(T&& element)
+	{
+		return inner_enqueue<CannotAlloc>(std::forward<T>(element));
+	}
+
+
+	// Enqueues a copy of element on the queue.
+	// Allocates an additional block of memory if needed.
+	// Only fails (returns false) if memory allocation fails.
+	AE_FORCEINLINE bool enqueue(T const& element)
+	{
+		return inner_enqueue<CanAlloc>(element);
+	}
+
+	// Enqueues a moved copy of element on the queue.
+	// Allocates an additional block of memory if needed.
+	// Only fails (returns false) if memory allocation fails.
+	AE_FORCEINLINE bool enqueue(T&& element)
+	{
+		return inner_enqueue<CanAlloc>(std::forward<T>(element));
+	}
+
+
+	// Attempts to dequeue an element; if the queue is empty,
+	// returns false instead. If the queue has at least one element,
+	// moves front to result using operator=, then returns true.
+	template<typename U>
+	bool try_dequeue(U& result)
+	{
+#ifndef NDEBUG
+		ReentrantGuard guard(this->dequeuing);
+#endif
+
+		// High-level pseudocode:
+		// Remember where the tail block is
+		// If the front block has an element in it, dequeue it
+		// Else
+		//     If front block was the tail block when we entered the function, return false
+		//     Else advance to next block and dequeue the item there
+
+		// Note that we have to use the value of the tail block from before we check if the front
+		// block is full or not, in case the front block is empty and then, before we check if the
+		// tail block is at the front block or not, the producer fills up the front block *and
+		// moves on*, which would make us skip a filled block. Seems unlikely, but was consistently
+		// reproducible in practice.
+		// In order to avoid overhead in the common case, though, we do a double-checked pattern
+		// where we have the fast path if the front block is not empty, then read the tail block,
+		// then re-read the front block and check if it's not empty again, then check if the tail
+		// block has advanced.
+		
+		Block* frontBlock_ = frontBlock.load();
+		size_t blockTail = frontBlock_->localTail;
+		size_t blockFront = frontBlock_->front.load();
+		
+		if (blockFront != blockTail || blockFront != (frontBlock_->localTail = frontBlock_->tail.load())) {
+			fence(memory_order_acquire);
+			
+		non_empty_front_block:
+			// Front block not empty, dequeue from here
+			auto element = reinterpret_cast<T*>(frontBlock_->data + blockFront * sizeof(T));
+			result = std::move(*element);
+			element->~T();
+
+			blockFront = (blockFront + 1) & frontBlock_->sizeMask;
+
+			fence(memory_order_release);
+			frontBlock_->front = blockFront;
+		}
+		else if (frontBlock_ != tailBlock.load()) {
+			fence(memory_order_acquire);
+
+			frontBlock_ = frontBlock.load();
+			blockTail = frontBlock_->localTail = frontBlock_->tail.load();
+			blockFront = frontBlock_->front.load();
+			fence(memory_order_acquire);
+			
+			if (blockFront != blockTail) {
+				// Oh look, the front block isn't empty after all
+				goto non_empty_front_block;
+			}
+			
+			// Front block is empty but there's another block ahead, advance to it
+			Block* nextBlock = frontBlock_->next;
+			// Don't need an acquire fence here since next can only ever be set on the tailBlock,
+			// and we're not the tailBlock, and we did an acquire earlier after reading tailBlock which
+			// ensures next is up-to-date on this CPU in case we recently were at tailBlock.
+
+			size_t nextBlockFront = nextBlock->front.load();
+			size_t nextBlockTail = nextBlock->localTail = nextBlock->tail.load();
+			fence(memory_order_acquire);
+
+			// Since the tailBlock is only ever advanced after being written to,
+			// we know there's for sure an element to dequeue on it
+			assert(nextBlockFront != nextBlockTail);
+			AE_UNUSED(nextBlockTail);
+
+			// We're done with this block, let the producer use it if it needs
+			fence(memory_order_release);		// Expose possibly pending changes to frontBlock->front from last dequeue
+			frontBlock = frontBlock_ = nextBlock;
+
+			compiler_fence(memory_order_release);	// Not strictly needed
+
+			auto element = reinterpret_cast<T*>(frontBlock_->data + nextBlockFront * sizeof(T));
+			
+			result = std::move(*element);
+			element->~T();
+
+			nextBlockFront = (nextBlockFront + 1) & frontBlock_->sizeMask;
+			
+			fence(memory_order_release);
+			frontBlock_->front = nextBlockFront;
+		}
+		else {
+			// No elements in current block and no other block to advance to
+			return false;
+		}
+
+		return true;
+	}
+
+
+	// Returns a pointer to the front element in the queue (the one that
+	// would be removed next by a call to `try_dequeue` or `pop`). If the
+	// queue appears empty at the time the method is called, nullptr is
+	// returned instead.
+	// Must be called only from the consumer thread.
+	T* peek()
+	{
+#ifndef NDEBUG
+		ReentrantGuard guard(this->dequeuing);
+#endif
+		// See try_dequeue() for reasoning
+
+		Block* frontBlock_ = frontBlock.load();
+		size_t blockTail = frontBlock_->localTail;
+		size_t blockFront = frontBlock_->front.load();
+		
+		if (blockFront != blockTail || blockFront != (frontBlock_->localTail = frontBlock_->tail.load())) {
+			fence(memory_order_acquire);
+		non_empty_front_block:
+			return reinterpret_cast<T*>(frontBlock_->data + blockFront * sizeof(T));
+		}
+		else if (frontBlock_ != tailBlock.load()) {
+			fence(memory_order_acquire);
+			frontBlock_ = frontBlock.load();
+			blockTail = frontBlock_->localTail = frontBlock_->tail.load();
+			blockFront = frontBlock_->front.load();
+			fence(memory_order_acquire);
+			
+			if (blockFront != blockTail) {
+				goto non_empty_front_block;
+			}
+			
+			Block* nextBlock = frontBlock_->next;
+			
+			size_t nextBlockFront = nextBlock->front.load();
+			fence(memory_order_acquire);
+
+			assert(nextBlockFront != nextBlock->tail.load());
+			return reinterpret_cast<T*>(nextBlock->data + nextBlockFront * sizeof(T));
+		}
+		
+		return nullptr;
+	}
+	
+	// Removes the front element from the queue, if any, without returning it.
+	// Returns true on success, or false if the queue appeared empty at the time
+	// `pop` was called.
+	bool pop()
+	{
+#ifndef NDEBUG
+		ReentrantGuard guard(this->dequeuing);
+#endif
+		// See try_dequeue() for reasoning
+		
+		Block* frontBlock_ = frontBlock.load();
+		size_t blockTail = frontBlock_->localTail;
+		size_t blockFront = frontBlock_->front.load();
+		
+		if (blockFront != blockTail || blockFront != (frontBlock_->localTail = frontBlock_->tail.load())) {
+			fence(memory_order_acquire);
+			
+		non_empty_front_block:
+			auto element = reinterpret_cast<T*>(frontBlock_->data + blockFront * sizeof(T));
+			element->~T();
+
+			blockFront = (blockFront + 1) & frontBlock_->sizeMask;
+
+			fence(memory_order_release);
+			frontBlock_->front = blockFront;
+		}
+		else if (frontBlock_ != tailBlock.load()) {
+			fence(memory_order_acquire);
+			frontBlock_ = frontBlock.load();
+			blockTail = frontBlock_->localTail = frontBlock_->tail.load();
+			blockFront = frontBlock_->front.load();
+			fence(memory_order_acquire);
+			
+			if (blockFront != blockTail) {
+				goto non_empty_front_block;
+			}
+			
+			// Front block is empty but there's another block ahead, advance to it
+			Block* nextBlock = frontBlock_->next;
+			
+			size_t nextBlockFront = nextBlock->front.load();
+			size_t nextBlockTail = nextBlock->localTail = nextBlock->tail.load();
+			fence(memory_order_acquire);
+
+			assert(nextBlockFront != nextBlockTail);
+			AE_UNUSED(nextBlockTail);
+
+			fence(memory_order_release);
+			frontBlock = frontBlock_ = nextBlock;
+
+			compiler_fence(memory_order_release);
+
+			auto element = reinterpret_cast<T*>(frontBlock_->data + nextBlockFront * sizeof(T));
+			element->~T();
+
+			nextBlockFront = (nextBlockFront + 1) & frontBlock_->sizeMask;
+			
+			fence(memory_order_release);
+			frontBlock_->front = nextBlockFront;
+		}
+		else {
+			// No elements in current block and no other block to advance to
+			return false;
+		}
+
+		return true;
+	}
+	
+	// Returns the approximate number of items currently in the queue.
+	// Safe to call from both the producer and consumer threads.
+	inline size_t size_approx() const
+	{
+		size_t result = 0;
+		Block* frontBlock_ = frontBlock.load();
+		Block* block = frontBlock_;
+		do {
+			fence(memory_order_acquire);
+			size_t blockFront = block->front.load();
+			size_t blockTail = block->tail.load();
+			result += (blockTail - blockFront) & block->sizeMask;
+			block = block->next.load();
+		} while (block != frontBlock_);
+		return result;
+	}
+
+
+private:
+	enum AllocationMode { CanAlloc, CannotAlloc };
+
+	template<AllocationMode canAlloc, typename U>
+	bool inner_enqueue(U&& element)
+	{
+#ifndef NDEBUG
+		ReentrantGuard guard(this->enqueuing);
+#endif
+
+		// High-level pseudocode (assuming we're allowed to alloc a new block):
+		// If room in tail block, add to tail
+		// Else check next block
+		//     If next block is not the head block, enqueue on next block
+		//     Else create a new block and enqueue there
+		//     Advance tail to the block we just enqueued to
+
+		Block* tailBlock_ = tailBlock.load();
+		size_t blockFront = tailBlock_->localFront;
+		size_t blockTail = tailBlock_->tail.load();
+
+		size_t nextBlockTail = (blockTail + 1) & tailBlock_->sizeMask;
+		if (nextBlockTail != blockFront || nextBlockTail != (tailBlock_->localFront = tailBlock_->front.load())) {
+			fence(memory_order_acquire);
+			// This block has room for at least one more element
+			char* location = tailBlock_->data + blockTail * sizeof(T);
+			new (location) T(std::forward<U>(element));
+
+			fence(memory_order_release);
+			tailBlock_->tail = nextBlockTail;
+		}
+		else {
+			fence(memory_order_acquire);
+			if (tailBlock_->next.load() != frontBlock) {
+				// Note that the reason we can't advance to the frontBlock and start adding new entries there
+				// is because if we did, then dequeue would stay in that block, eventually reading the new values,
+				// instead of advancing to the next full block (whose values were enqueued first and so should be
+				// consumed first).
+				
+				fence(memory_order_acquire);		// Ensure we get latest writes if we got the latest frontBlock
+
+				// tailBlock is full, but there's a free block ahead, use it
+				Block* tailBlockNext = tailBlock_->next.load();
+				size_t nextBlockFront = tailBlockNext->localFront = tailBlockNext->front.load();
+				nextBlockTail = tailBlockNext->tail.load();
+				fence(memory_order_acquire);
+
+				// This block must be empty since it's not the head block and we
+				// go through the blocks in a circle
+				assert(nextBlockFront == nextBlockTail);
+				tailBlockNext->localFront = nextBlockFront;
+
+				char* location = tailBlockNext->data + nextBlockTail * sizeof(T);
+				new (location) T(std::forward<U>(element));
+
+				tailBlockNext->tail = (nextBlockTail + 1) & tailBlockNext->sizeMask;
+
+				fence(memory_order_release);
+				tailBlock = tailBlockNext;
+			}
+			else if (canAlloc == CanAlloc) {
+				// tailBlock is full and there's no free block ahead; create a new block
+				auto newBlockSize = largestBlockSize >= MAX_BLOCK_SIZE ? largestBlockSize : largestBlockSize * 2;
+				auto newBlock = make_block(newBlockSize);
+				if (newBlock == nullptr) {
+					// Could not allocate a block!
+					return false;
+				}
+				largestBlockSize = newBlockSize;
+
+				new (newBlock->data) T(std::forward<U>(element));
+
+				assert(newBlock->front == 0);
+				newBlock->tail = newBlock->localTail = 1;
+
+				newBlock->next = tailBlock_->next.load();
+				tailBlock_->next = newBlock;
+
+				// Might be possible for the dequeue thread to see the new tailBlock->next
+				// *without* seeing the new tailBlock value, but this is OK since it can't
+				// advance to the next block until tailBlock is set anyway (because the only
+				// case where it could try to read the next is if it's already at the tailBlock,
+				// and it won't advance past tailBlock in any circumstance).
+				
+				fence(memory_order_release);
+				tailBlock = newBlock;
+			}
+			else if (canAlloc == CannotAlloc) {
+				// Would have had to allocate a new block to enqueue, but not allowed
+				return false;
+			}
+			else {
+				assert(false && "Should be unreachable code");
+				return false;
+			}
+		}
+
+		return true;
+	}
+
+
+	// Disable copying
+	ReaderWriterQueue(ReaderWriterQueue const&) {  }
+
+	// Disable assignment
+	ReaderWriterQueue& operator=(ReaderWriterQueue const&) {  }
+
+
+
+	AE_FORCEINLINE static size_t ceilToPow2(size_t x)
+	{
+		// From http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+		--x;
+		x |= x >> 1;
+		x |= x >> 2;
+		x |= x >> 4;
+		for (size_t i = 1; i < sizeof(size_t); i <<= 1) {
+			x |= x >> (i << 3);
+		}
+		++x;
+		return x;
+	}
+	
+	template<typename U>
+	static AE_FORCEINLINE char* align_for(char* ptr)
+	{
+		const std::size_t alignment = std::alignment_of<U>::value;
+		return ptr + (alignment - (reinterpret_cast<std::uintptr_t>(ptr) % alignment)) % alignment;
+	}
+private:
+#ifndef NDEBUG
+	struct ReentrantGuard
+	{
+		ReentrantGuard(bool& _inSection)
+			: inSection(_inSection)
+		{
+			assert(!inSection && "ReaderWriterQueue does not support enqueuing or dequeuing elements from other elements' ctors and dtors");
+			inSection = true;
+		}
+
+		~ReentrantGuard() { inSection = false; }
+
+	private:
+		ReentrantGuard& operator=(ReentrantGuard const&);
+
+	private:
+		bool& inSection;
+	};
+#endif
+
+	struct Block
+	{
+		// Avoid false-sharing by putting highly contended variables on their own cache lines
+		weak_atomic<size_t> front;	// (Atomic) Elements are read from here
+		size_t localTail;			// An uncontended shadow copy of tail, owned by the consumer
+		
+		char cachelineFiller0[MOODYCAMEL_CACHE_LINE_SIZE - sizeof(weak_atomic<size_t>) - sizeof(size_t)];
+		weak_atomic<size_t> tail;	// (Atomic) Elements are enqueued here
+		size_t localFront;
+		
+		char cachelineFiller1[MOODYCAMEL_CACHE_LINE_SIZE - sizeof(weak_atomic<size_t>) - sizeof(size_t)];	// next isn't very contended, but we don't want it on the same cache line as tail (which is)
+		weak_atomic<Block*> next;	// (Atomic)
+		
+		char* data;		// Contents (on heap) are aligned to T's alignment
+
+		const size_t sizeMask;
+
+
+		// size must be a power of two (and greater than 0)
+		Block(size_t const& _size, char* _rawThis, char* _data)
+			: front(0), localTail(0), tail(0), localFront(0), next(nullptr), data(_data), sizeMask(_size - 1), rawThis(_rawThis)
+		{
+		}
+
+	private:
+		// C4512 - Assignment operator could not be generated
+		Block& operator=(Block const&);
+
+	public:
+		char* rawThis;
+	};
+	
+	
+	static Block* make_block(size_t capacity)
+	{
+		// Allocate enough memory for the block itself, as well as all the elements it will contain
+		auto size = sizeof(Block) + std::alignment_of<Block>::value - 1;
+		size += sizeof(T) * capacity + std::alignment_of<T>::value - 1;
+		auto newBlockRaw = static_cast<char*>(std::malloc(size));
+		if (newBlockRaw == nullptr) {
+			return nullptr;
+		}
+		
+		auto newBlockAligned = align_for<Block>(newBlockRaw);
+		auto newBlockData = align_for<T>(newBlockAligned + sizeof(Block));
+		return new (newBlockAligned) Block(capacity, newBlockRaw, newBlockData);
+	}
+
+private:
+	weak_atomic<Block*> frontBlock;		// (Atomic) Elements are enqueued to this block
+	
+	char cachelineFiller[MOODYCAMEL_CACHE_LINE_SIZE - sizeof(weak_atomic<Block*>)];
+	weak_atomic<Block*> tailBlock;		// (Atomic) Elements are dequeued from this block
+
+	size_t largestBlockSize;
+
+#ifndef NDEBUG
+	bool enqueuing;
+	bool dequeuing;
+#endif
+};
+
+// Like ReaderWriterQueue, but also providees blocking operations
+template<typename T, size_t MAX_BLOCK_SIZE = 512>
+class BlockingReaderWriterQueue
+{
+private:
+	typedef ::moodycamel::ReaderWriterQueue<T, MAX_BLOCK_SIZE> ReaderWriterQueue;
+	
+public:
+	explicit BlockingReaderWriterQueue(size_t maxSize = 15)
+		: inner(maxSize)
+	{ }
+
+	
+	// Enqueues a copy of element if there is room in the queue.
+	// Returns true if the element was enqueued, false otherwise.
+	// Does not allocate memory.
+	AE_FORCEINLINE bool try_enqueue(T const& element)
+	{
+		if (inner.try_enqueue(element)) {
+			sema.signal();
+			return true;
+		}
+		return false;
+	}
+
+	// Enqueues a moved copy of element if there is room in the queue.
+	// Returns true if the element was enqueued, false otherwise.
+	// Does not allocate memory.
+	AE_FORCEINLINE bool try_enqueue(T&& element)
+	{
+		if (inner.try_enqueue(std::forward<T>(element))) {
+			sema.signal();
+			return true;
+		}
+		return false;
+	}
+
+
+	// Enqueues a copy of element on the queue.
+	// Allocates an additional block of memory if needed.
+	// Only fails (returns false) if memory allocation fails.
+	AE_FORCEINLINE bool enqueue(T const& element)
+	{
+		if (inner.enqueue(element)) {
+			sema.signal();
+			return true;
+		}
+		return false;
+	}
+
+	// Enqueues a moved copy of element on the queue.
+	// Allocates an additional block of memory if needed.
+	// Only fails (returns false) if memory allocation fails.
+	AE_FORCEINLINE bool enqueue(T&& element)
+	{
+		if (inner.enqueue(std::forward<T>(element))) {
+			sema.signal();
+			return true;
+		}
+		return false;
+	}
+
+
+	// Attempts to dequeue an element; if the queue is empty,
+	// returns false instead. If the queue has at least one element,
+	// moves front to result using operator=, then returns true.
+	template<typename U>
+	bool try_dequeue(U& result)
+	{
+		if (sema.tryWait()) {
+			bool success = inner.try_dequeue(result);
+			assert(success);
+			AE_UNUSED(success);
+			return true;
+		}
+		return false;
+	}
+	
+	
+	// Attempts to dequeue an element; if the queue is empty,
+	// waits until an element is available, then dequeues it.
+	template<typename U>
+	void wait_dequeue(U& result)
+	{
+		sema.wait();
+		bool success = inner.try_dequeue(result);
+		AE_UNUSED(result);
+		assert(success);
+		AE_UNUSED(success);
+	}
+
+
+	// Attempts to dequeue an element; if the queue is empty,
+	// waits until an element is available up to the specified timeout,
+	// then dequeues it and returns true, or returns false if the timeout
+	// expires before an element can be dequeued.
+	// Using a negative timeout indicates an indefinite timeout,
+	// and is thus functionally equivalent to calling wait_dequeue.
+	template<typename U>
+	bool wait_dequeue_timed(U& result, std::int64_t timeout_usecs)
+	{
+		if (!sema.wait(timeout_usecs)) {
+			return false;
+		}
+		bool success = inner.try_dequeue(result);
+		AE_UNUSED(result);
+		assert(success);
+		AE_UNUSED(success);
+		return true;
+	}
+
+
+#if __cplusplus > 199711L || _MSC_VER >= 1700
+	// Attempts to dequeue an element; if the queue is empty,
+	// waits until an element is available up to the specified timeout,
+	// then dequeues it and returns true, or returns false if the timeout
+	// expires before an element can be dequeued.
+	// Using a negative timeout indicates an indefinite timeout,
+	// and is thus functionally equivalent to calling wait_dequeue.
+	template<typename U, typename Rep, typename Period>
+	inline bool wait_dequeue_timed(U& result, std::chrono::duration<Rep, Period> const& timeout)
+	{
+        return wait_dequeue_timed(result, std::chrono::duration_cast<std::chrono::microseconds>(timeout).count());
+	}
+#endif
+
+
+	// Returns a pointer to the front element in the queue (the one that
+	// would be removed next by a call to `try_dequeue` or `pop`). If the
+	// queue appears empty at the time the method is called, nullptr is
+	// returned instead.
+	// Must be called only from the consumer thread.
+	AE_FORCEINLINE T* peek()
+	{
+		return inner.peek();
+	}
+	
+	// Removes the front element from the queue, if any, without returning it.
+	// Returns true on success, or false if the queue appeared empty at the time
+	// `pop` was called.
+	AE_FORCEINLINE bool pop()
+	{
+		if (sema.tryWait()) {
+			bool result = inner.pop();
+			assert(result);
+			AE_UNUSED(result);
+			return true;
+		}
+		return false;
+	}
+	
+	// Returns the approximate number of items currently in the queue.
+	// Safe to call from both the producer and consumer threads.
+	AE_FORCEINLINE size_t size_approx() const
+	{
+		return sema.availableApprox();
+	}
+
+
+private:
+	// Disable copying & assignment
+	BlockingReaderWriterQueue(ReaderWriterQueue const&) {  }
+	BlockingReaderWriterQueue& operator=(ReaderWriterQueue const&) {  }
+	
+private:
+	ReaderWriterQueue inner;
+	spsc_sema::LightweightSemaphore sema;
+};
+
+}    // end namespace moodycamel
+
+#ifdef AE_VCPP
+#pragma warning(pop)
+#endif