shithub: openh264

Download patch

ref: 5ee3337dfd2c132837ac06d79f93fa1e54fe44e5
parent: ea2826e567399307e08c09d01966315086fd0c42
parent: a913cc853e517c2a5a0f79cc72cd5df590d82317
author: Cullen Jennings <[email protected]>
date: Fri Dec 13 05:10:40 EST 2013

Merge remote-tracking branch 'upstream/master'

--- /dev/null
+++ b/CODING_STYLE
@@ -1,0 +1,8 @@
+Code Guidelines
+
+Try to follow the style of the existing code.
+
+Please do not add tabs, trailing whitespace, or Windows-style line endings (CRLF).
+
+The C++ code was pretty-printed with astyle on 12/12/2013 using the configuration found in build/astyle.cfg
+
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -9,7 +9,10 @@
 Yi Guo
 Horace Huang
 Steven Huang
+Ethan Hugg
+Cullen Jennings
 Zhaofeng Jia
+Derrick Jin
 Jesse Li
 Jifei Li
 Kai Li
@@ -19,6 +22,7 @@
 Bourne Ling
 Alex Liu
 Wayne Liu
+Eric Rescorla
 Sawyer Shan
 Siping Tao
 James Wang
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,15 @@
 UNAME=$(shell uname | tr A-Z a-z)
 LIBPREFIX=lib
 LIBSUFFIX=a
+CP=cp
 ROOTDIR=$(PWD)
 
+ifneq ($(wildcard ./gtest),)
+HAVE_GTEST=Yes
+else
+HAVE_GTEST=No
+endif
+
 # Configurations
 ifeq ($(BUILDTYPE), Release)
 CFLAGS += -O3
@@ -12,21 +19,24 @@
 USE_ASM = No
 endif
 
+include build/platform-$(UNAME).mk
+
 ifeq ($(USE_ASM),Yes)
-  CFLAGS += -DX86_ASM 
+  CFLAGS += -DX86_ASM
 endif
 
-include build/platform-$(UNAME).mk
-
 CFLAGS += -DNO_DYNAMIC_VP -DHAVE_CACHE_LINE_ALIGN
 LDFLAGS +=
-ASMFLAGS += -DNO_DYNAMIC_VP -DNOPREFIX 
+ASMFLAGS += -DNO_DYNAMIC_VP -DNOPREFIX
 
 
 #### No user-serviceable parts below this line
-INCLUDES = -Icodec/api/svc
+INCLUDES = -Icodec/api/svc  -Icodec/common -Igtest/include
 ASM_INCLUDES = -Iprocessing/src/asm/
 
+COMMON_INCLUDES = \
+    -Icodec/decoder/core/inc
+
 DECODER_INCLUDES = \
     -Icodec/decoder/core/inc \
     -Icodec/decoder/plus/inc
@@ -41,22 +51,36 @@
     -Icodec/encoder/plus/inc
 
 H264DEC_INCLUDES = $(DECODER_INCLUDES) -Icodec/console/dec/inc
-H264DEC_LDFLAGS = -L. -ldecoder
+H264DEC_LDFLAGS = -L. -ldecoder -lcommon
 
 H264ENC_INCLUDES = $(ENCODER_INCLUDES) -Icodec/console/enc/inc
-H264ENC_LDFLAGS = -L. -lencoder -lprocessing
+H264ENC_LDFLAGS = -L. -lencoder -lprocessing -lcommon
 
+CODEC_UNITTEST_LDFLAGS = -L. -lgtest -ldecoder -lcommon
+
 all:	libraries binaries
 
 clean:
 	rm -f $(OBJS) $(LIBRARIES) $(BINARIES)
+	echo $(HAVE_GTEST)
 
+gtest-bootstrap:  # Fetch googletest into ./gtest, the directory the HAVE_GTEST check looks for; release-1.7.0 keeps src/ and include/ at the repository root.
+	git clone --depth 1 --branch release-1.7.0 https://github.com/google/googletest.git gtest
 
+include codec/common/targets.mk
 include codec/decoder/targets.mk
 include codec/encoder/targets.mk
 include processing/targets.mk
 include codec/console/dec/targets.mk
 include codec/console/enc/targets.mk
+
+ifeq ($(HAVE_GTEST),Yes)
+include build/gtest-targets.mk
+include test/targets.mk
+endif
+
+
+
 
 
 
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 OpenH264
 =======
 OpenH264 is a codec library which supports H.264 encoding and decoding. It is suitable for use in real time applications such as WebRTC. See http://www.openh264.org/ for more details.
- 
+
 Encoder Features
 ------------------------
 - Constrained Baseline Profile up to Level 5.2 (4096x2304)
@@ -17,10 +17,10 @@
 - Single reference frame for inter prediction
 - Multiple reference frames when using LTR and/or 3-4 temporal layers
 - Periodic and on-demand Instantaneous Decoder Refresh (IDR) frame insertion
-- Dynamic changes to bit rate, frame rate, and resolution 
+- Dynamic changes to bit rate, frame rate, and resolution
 - Annex B byte stream output
 - YUV 4:2:0 planar input
- 
+
 Decoder Features
 ------------------------
 - Constrained Baseline Profile up to Level 5.2 (4096x2304)
@@ -32,7 +32,7 @@
 - Multiple reference frames when specified in Sequence Parameter Set (SPS)
 - Annex B byte stream input
 - YUV 4:2:0 planar output
- 
+
 OS Support
 ----------------
 - Windows 64-bit and 32-bit (initial release is only 32-bit, 64-bit will follow soon)
@@ -40,7 +40,7 @@
 - Linux 64-bit and 32-bit (initial release is only 32-bit, 64-bit will follow soon)
 - Android 32-bit (initial release does not include this target, will follow soon)
 - iOS 64-bit and 32-bit (not supported yet, may be added in the future)
- 
+
 Processor Support
 -------------------------
 - Intel x86 optionally with MMX/SSE (no AVX yet, help is welcome)
@@ -53,30 +53,30 @@
     : build the decoder library and executable via codec/build/linux/dec/makefile
     : build the encoder library and executable via codec/build/linux/enc/makefile
     : build the encoder shared library via processing/build/linux/makefile
- 
+
 Windows Visual Studio 2008/2010/2012 projects are available:
     : build the decoder via the Visual Studio projects in codec/build/win32/dec
-    : build the encoder via the Visual Studio projects in codec/build/win32/dec
+    : build the encoder via the Visual Studio projects in codec/build/win32/enc
     : build the encoder shared library via the Visual Studio projects in processing/build/win32/
- 
+
 NASM needed to be installed for assembly code: workable version 2.07 or above, nasm can downloaded from http://www.nasm.us/
- 
+
 API details to be provided later.
- 
+
 Using the Test App
 -------------------------
 Linux shell scripts to build the test apps:
     : build via testbin/AutoBuild_Linux.sh
     : clean via testbin/AutoClean_Linux.sh
- 
+
 Windows batch files to build the test apps:
     : Visual Studio 2008 use testbin/AutoBuild_Windows_VS2008.bat
     : Visual Studio 2010 use testbin/AutoBuild_Windows_VS2010.bat
     : Visual Studio 2012 use testbin/AutoBuild_Windows_VS2012.bat
- 
+
 Usage information can be found in testbin/CmdLineReadMe
 Command line options and details to be provided later.
- 
+
 Using the Source
 -----------------------
 codec - encoder, decoder, console (test app), build (makefile, vcproj)
@@ -83,7 +83,7 @@
 processing - raw pixel processing (used by encoder)
 testbin - autobuild scripts, test app config files, yuv test files
 bin - binaries for library and test app
- 
+
 Known Issues
 -------------------
 See the issue tracker on https://github.com/cisco/openh264/issues
@@ -91,7 +91,7 @@
 - Encoder errors when compressed frame size exceeds half uncompressed size
 - Encoder console app only support multiple of 16 width/height for now
 - Decoder errors when compressed frame size exceeds 1MB
- 
+
 License
 ----------
 BSD, see LICENSE file for details.
--- /dev/null
+++ b/build/astyle.cfg
@@ -1,0 +1,9 @@
+--style=google
+--indent=spaces=2
+--max-code-length=120
+--pad-oper
+--align-pointer=type
+--align-reference=type
+--unpad-paren
+--pad-first-paren-out
+--lineend=linux
--- /dev/null
+++ b/build/gtest-targets.mk
@@ -1,0 +1,22 @@
+# Rules for compiling the googletest sources under gtest/ into a static library.
+GTEST_PREFIX=GTEST
+GTEST_SRCDIR=gtest
+GTEST_CPP_SRCS=$(GTEST_SRCDIR)/src/gtest-all.cc
+
+GTEST_OBJS += $(patsubst %.cc,%.o,$(GTEST_CPP_SRCS))
+
+GTEST_INCLUDES += -Igtest
+OBJS += $(GTEST_OBJS)
+
+# Compile the single gtest source file listed in GTEST_CPP_SRCS.
+$(GTEST_SRCDIR)/src/gtest-all.o: $(GTEST_SRCDIR)/src/gtest-all.cc
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(GTEST_CFLAGS) $(GTEST_INCLUDES) -c -o $@ $<
+
+# Drop any existing archive, then create a fresh one from the gtest objects.
+$(LIBPREFIX)gtest.$(LIBSUFFIX): $(GTEST_OBJS)
+	rm -f $@
+	ar cr $@ $(GTEST_OBJS)
+
+# Register the archive with the aggregate `libraries` target and the LIBRARIES list.
+LIBRARIES += $(LIBPREFIX)gtest.$(LIBSUFFIX)
+libraries: $(LIBPREFIX)gtest.$(LIBSUFFIX)
--- a/build/mktargets.py
+++ b/build/mktargets.py
@@ -19,7 +19,7 @@
 def write_cpp_rule(f, x):
     src = "$(%s_SRCDIR)/%s"%(PREFIX, x)
     dst = "$(%s_SRCDIR)/%s"%(PREFIX, make_o(x))
-    
+
     f.write("%s: %s\n"%(dst, src))
     f.write('\t$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(' + PREFIX + '_CFLAGS) $(' + PREFIX + '_INCLUDES) -c -o ' + dst + ' ' + src + '\n');
     f.write("\n")
@@ -27,7 +27,7 @@
 def write_asm_rule(f, x):
     src = "$(%s_SRCDIR)/%s"%(PREFIX, x)
     dst = "$(%s_SRCDIR)/%s"%(PREFIX, make_o(x))
-    
+
     f.write("%s: %s\n"%(dst, src))
     f.write('\t$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(' + PREFIX + '_ASMFLAGS) $(' + PREFIX + '_ASM_INCLUDES) -o ' + dst + ' ' + src + '\n');
     f.write("\n")
@@ -70,7 +70,7 @@
 f.write("%s_CPP_SRCS=\\\n"%(PREFIX))
 for c in cpp:
     f.write("\t$(%s_SRCDIR)/%s\\\n"%(PREFIX, c))
-f.write("\n")    
+f.write("\n")
 f.write("%s_OBJS += $(%s_CPP_SRCS:.cpp=.o)\n"%(PREFIX, PREFIX))
 
 f.write("ifeq ($(USE_ASM), Yes)\n");
--- a/build/mktargets.sh
+++ b/build/mktargets.sh
@@ -1,7 +1,10 @@
 #!/bin/sh
 (cd codec/decoder; python ../../build/mktargets.py --directory codec/decoder --library decoder --exclude StdAfx.cpp)
 (cd codec/encoder; python ../../build/mktargets.py --directory codec/encoder --library encoder --exclude DllEntry.cpp)
+(cd codec/common; python ../../build/mktargets.py --directory codec/common --library common)
 (cd processing; python ../build/mktargets.py --directory processing --library processing --exclude wels_process.cpp --exclude WelsVideoProcessor.cpp)
 
 (cd codec/console/dec; python ../../../build/mktargets.py --directory codec/console/dec --binary h264dec --exclude dec_console.h --exclude load_bundle_functions.cpp)
 (cd codec/console/enc; python ../../../build/mktargets.py --directory codec/console/enc --binary h264enc --exclude enc_console.h --exclude bundlewelsenc.cpp)
+(cd test; python ../build/mktargets.py --directory test --binary codec_unittest)
+
--- a/codec/WelsThreadLib/api/WelsThreadLib.h
+++ b/codec/WelsThreadLib/api/WelsThreadLib.h
@@ -78,12 +78,12 @@
 #include <fcntl.h>
 
 typedef   pthread_t    WELS_THREAD_HANDLE;
-typedef  void* (*LPWELS_THREAD_ROUTINE)  ( void * );
+typedef  void* (*LPWELS_THREAD_ROUTINE) (void*);
 
 typedef   pthread_mutex_t           WELS_MUTEX;
-typedef   sem_t                     WELS_EVENT; 
+typedef   sem_t                     WELS_EVENT;
 
-#define   WELS_THREAD_ROUTINE_TYPE         void * 
+#define   WELS_THREAD_ROUTINE_TYPE         void *
 #define   WELS_THREAD_ROUTINE_RETURN(rc)   return (void*)rc;
 
 #endif//__GNUC__
@@ -93,55 +93,56 @@
 typedef    int32_t        WELS_THREAD_ERROR_CODE;
 typedef    int32_t        WELS_THREAD_ATTR;
 
-typedef  struct _WelsLogicalProcessorInfo
-{
-	int32_t    ProcessorCount;
+typedef  struct _WelsLogicalProcessorInfo {
+  int32_t    ProcessorCount;
 } WelsLogicalProcessInfo;
 
 #define    WELS_THREAD_ERROR_OK					0
 #define    WELS_THREAD_ERROR_GENERIAL			((uint32_t)(-1))
 #define    WELS_THREAD_ERROR_WAIT_OBJECT_0		0
-#define	   WELS_THREAD_ERROR_WAIT_TIMEOUT		((uint32_t)0x00000102L)  
+#define	   WELS_THREAD_ERROR_WAIT_TIMEOUT		((uint32_t)0x00000102L)
 #define	   WELS_THREAD_ERROR_WAIT_FAILED		WELS_THREAD_ERROR_GENERIAL
 
-void WelsSleep( uint32_t dwMilliseconds );
-WELS_THREAD_ERROR_CODE    WelsMutexInit( WELS_MUTEX   * mutex );
-WELS_THREAD_ERROR_CODE    WelsMutexLock( WELS_MUTEX   * mutex );
-WELS_THREAD_ERROR_CODE    WelsMutexUnlock( WELS_MUTEX * mutex );
-WELS_THREAD_ERROR_CODE    WelsMutexDestroy( WELS_MUTEX * mutex );
+void WelsSleep (uint32_t dwMilliseconds);
+WELS_THREAD_ERROR_CODE    WelsMutexInit (WELS_MUTEX*    mutex);
+WELS_THREAD_ERROR_CODE    WelsMutexLock (WELS_MUTEX*    mutex);
+WELS_THREAD_ERROR_CODE    WelsMutexUnlock (WELS_MUTEX* mutex);
+WELS_THREAD_ERROR_CODE    WelsMutexDestroy (WELS_MUTEX* mutex);
 
 #ifdef __GNUC__
-WELS_THREAD_ERROR_CODE    WelsEventOpen( WELS_EVENT **p_event, str_t *event_name );
-WELS_THREAD_ERROR_CODE    WelsEventClose( WELS_EVENT *event, str_t *event_name );
+WELS_THREAD_ERROR_CODE    WelsEventOpen (WELS_EVENT** p_event, str_t* event_name);
+WELS_THREAD_ERROR_CODE    WelsEventClose (WELS_EVENT* event, str_t* event_name);
 #endif//__GNUC__
-WELS_THREAD_ERROR_CODE    WelsEventInit( WELS_EVENT *event );
-WELS_THREAD_ERROR_CODE    WelsEventDestroy( WELS_EVENT * event );
-WELS_THREAD_ERROR_CODE    WelsEventSignal( WELS_EVENT * event );
-WELS_THREAD_ERROR_CODE    WelsEventReset( WELS_EVENT * event );
-WELS_THREAD_ERROR_CODE    WelsEventWait( WELS_EVENT * event );
-WELS_THREAD_ERROR_CODE    WelsEventWaitWithTimeOut( WELS_EVENT * event, uint32_t dwMilliseconds );
+WELS_THREAD_ERROR_CODE    WelsEventInit (WELS_EVENT* event);
+WELS_THREAD_ERROR_CODE    WelsEventDestroy (WELS_EVENT* event);
+WELS_THREAD_ERROR_CODE    WelsEventSignal (WELS_EVENT* event);
+WELS_THREAD_ERROR_CODE    WelsEventReset (WELS_EVENT* event);
+WELS_THREAD_ERROR_CODE    WelsEventWait (WELS_EVENT* event);
+WELS_THREAD_ERROR_CODE    WelsEventWaitWithTimeOut (WELS_EVENT* event, uint32_t dwMilliseconds);
 #ifdef WIN32
-WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitSingleBlocking( uint32_t nCount, WELS_EVENT *event_list, uint32_t dwMilliseconds );
-WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitAllBlocking( uint32_t nCount, WELS_EVENT *event_list );
+WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitSingleBlocking (uint32_t nCount, WELS_EVENT* event_list,
+    uint32_t dwMilliseconds);
+WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitAllBlocking (uint32_t nCount, WELS_EVENT* event_list);
 #else
-WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitSingleBlocking( uint32_t nCount, WELS_EVENT **event_list, uint32_t dwMilliseconds );
-WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitAllBlocking( uint32_t nCount, WELS_EVENT **event_list );
+WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitSingleBlocking (uint32_t nCount, WELS_EVENT** event_list,
+    uint32_t dwMilliseconds);
+WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitAllBlocking (uint32_t nCount, WELS_EVENT** event_list);
 #endif//WIN32
 
-WELS_THREAD_ERROR_CODE    WelsThreadCreate( WELS_THREAD_HANDLE * thread,  LPWELS_THREAD_ROUTINE  routine, 
-										   void * arg, WELS_THREAD_ATTR attr);
+WELS_THREAD_ERROR_CODE    WelsThreadCreate (WELS_THREAD_HANDLE* thread,  LPWELS_THREAD_ROUTINE  routine,
+    void* arg, WELS_THREAD_ATTR attr);
 
 WELS_THREAD_ERROR_CODE	  WelsSetThreadCancelable();
 
-WELS_THREAD_ERROR_CODE    WelsThreadJoin( WELS_THREAD_HANDLE  thread );
+WELS_THREAD_ERROR_CODE    WelsThreadJoin (WELS_THREAD_HANDLE  thread);
 
-WELS_THREAD_ERROR_CODE    WelsThreadCancel( WELS_THREAD_HANDLE  thread );
+WELS_THREAD_ERROR_CODE    WelsThreadCancel (WELS_THREAD_HANDLE  thread);
 
-WELS_THREAD_ERROR_CODE    WelsThreadDestroy( WELS_THREAD_HANDLE *thread );
+WELS_THREAD_ERROR_CODE    WelsThreadDestroy (WELS_THREAD_HANDLE* thread);
 
 WELS_THREAD_HANDLE        WelsThreadSelf();
 
-WELS_THREAD_ERROR_CODE    WelsQueryLogicalProcessInfo(WelsLogicalProcessInfo * pInfo);
+WELS_THREAD_ERROR_CODE    WelsQueryLogicalProcessInfo (WelsLogicalProcessInfo* pInfo);
 
 
 #ifdef  __cplusplus
--- a/codec/WelsThreadLib/src/WelsThreadLib.cpp
+++ b/codec/WelsThreadLib/src/WelsThreadLib.cpp
@@ -1,567 +1,513 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	WelsThreadLib.c
- *
- * \brief	Interfaces introduced in thread programming
- *
- * \date	11/17/2009 Created
- *
- *************************************************************************************
- */
-
-
-#include "WelsThreadLib.h"
-#include <stdio.h>
-
-#ifdef  WIN32
-
-void WelsSleep( uint32_t dwMilliseconds )
-{
-	Sleep( dwMilliseconds );
-}
-
-WELS_THREAD_ERROR_CODE    WelsMutexInit( WELS_MUTEX   * mutex )
-{
-	InitializeCriticalSection(mutex);
-
-	return WELS_THREAD_ERROR_OK;
-}
-
-WELS_THREAD_ERROR_CODE    WelsMutexLock( WELS_MUTEX   * mutex )
-{
-	EnterCriticalSection(mutex);
-
-	return WELS_THREAD_ERROR_OK;
-}
-
-WELS_THREAD_ERROR_CODE    WelsMutexUnlock( WELS_MUTEX * mutex )
-{
-	LeaveCriticalSection(mutex);
-
-	return WELS_THREAD_ERROR_OK;
-}
-
-WELS_THREAD_ERROR_CODE    WelsMutexDestroy( WELS_MUTEX * mutex )
-{
-    DeleteCriticalSection(mutex);
-
-	return WELS_THREAD_ERROR_OK;
-}
-
-WELS_THREAD_ERROR_CODE    WelsEventInit( WELS_EVENT  *  event )
-{
-    WELS_EVENT   h = CreateEvent(NULL, FALSE, FALSE, NULL);
-
-	if( h == NULL ){
-		return WELS_THREAD_ERROR_GENERIAL;
-	}
-	*event = h;
-	return WELS_THREAD_ERROR_OK;
-}
-
-WELS_THREAD_ERROR_CODE    WelsEventSignal( WELS_EVENT * event )
-{
-	if( SetEvent( *event ) ){
-		return WELS_THREAD_ERROR_OK;
-	}
-	return WELS_THREAD_ERROR_GENERIAL;
-}
-
-WELS_THREAD_ERROR_CODE    WelsEventReset( WELS_EVENT * event )
-{
-	if ( ResetEvent( *event ) )
-		return WELS_THREAD_ERROR_OK;
-	return WELS_THREAD_ERROR_GENERIAL;
-}
-
-WELS_THREAD_ERROR_CODE    WelsEventWait( WELS_EVENT * event )
-{
-	return WaitForSingleObject(*event, INFINITE );
-}
-
-WELS_THREAD_ERROR_CODE    WelsEventWaitWithTimeOut( WELS_EVENT * event, uint32_t dwMilliseconds )
-{
-	return WaitForSingleObject(*event, dwMilliseconds );
-}
-
-WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitSingleBlocking(	uint32_t nCount,
-																WELS_EVENT *event_list,
-																uint32_t dwMilliseconds )
-{
-	return WaitForMultipleObjects( nCount, event_list, FALSE, dwMilliseconds );
-}
-
-WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitAllBlocking( uint32_t nCount, WELS_EVENT *event_list )
-{
-	return WaitForMultipleObjects( nCount, event_list, TRUE, (uint32_t)-1 );
-}
-
-WELS_THREAD_ERROR_CODE    WelsEventDestroy( WELS_EVENT * event )
-{
-	CloseHandle( *event );
-
-	*event = NULL;
-	return WELS_THREAD_ERROR_OK;
-}
-
-
-WELS_THREAD_ERROR_CODE    WelsThreadCreate( WELS_THREAD_HANDLE * thread,  LPWELS_THREAD_ROUTINE  routine, 
-										   void * arg, WELS_THREAD_ATTR attr)
-{
-    WELS_THREAD_HANDLE   h = CreateThread(NULL, 0, routine, arg, 0, NULL);
-
-	if( h == NULL ) {
-		return WELS_THREAD_ERROR_GENERIAL;
-	}
-	* thread = h;
-
-	return WELS_THREAD_ERROR_OK;
-}
-
-WELS_THREAD_ERROR_CODE	  WelsSetThreadCancelable()
-{
-	// nil implementation for WIN32
-	return WELS_THREAD_ERROR_OK;
-}
-
-WELS_THREAD_ERROR_CODE    WelsThreadJoin( WELS_THREAD_HANDLE  thread )
-{
-    WaitForSingleObject(thread, INFINITE);
-
-	return WELS_THREAD_ERROR_OK;
-}
-
-WELS_THREAD_ERROR_CODE    WelsThreadCancel( WELS_THREAD_HANDLE  thread )
-{
-	return WELS_THREAD_ERROR_OK;
-}
-
-
-WELS_THREAD_ERROR_CODE    WelsThreadDestroy( WELS_THREAD_HANDLE *thread )
-{
-	if ( thread != NULL )
-	{
-		CloseHandle(*thread);
-		*thread = NULL;
-	}	
-	return WELS_THREAD_ERROR_OK;
-}
-
-WELS_THREAD_HANDLE        WelsThreadSelf()
-{
-	return GetCurrentThread();
-}
-
-WELS_THREAD_ERROR_CODE    WelsQueryLogicalProcessInfo(WelsLogicalProcessInfo * pInfo)
-{
-	SYSTEM_INFO  si;	
-	
-	GetSystemInfo(&si);
-
-	pInfo->ProcessorCount = si.dwNumberOfProcessors;
-
-	return WELS_THREAD_ERROR_OK;
-}
-
-#elif   defined(__GNUC__)
-
-#ifdef MACOS
-#include <CoreServices/CoreServices.h>
-//#include <Gestalt.h>
-#endif//MACOS
-
-static int32_t  SystemCall(const str_t * pCmd, str_t * pRes, int32_t iSize)
-{
-    int32_t fd[2];
-    int32_t iPid;
-    int32_t iCount;
-    int32_t left;
-    str_t * p = NULL;
-    int32_t iMaxLen = iSize - 1;
-    memset(pRes, 0, iSize);
-
-    if( pipe(fd) ){
-        return -1;
-    }
-
-    if( (iPid = fork()) == 0 ){
-        int32_t  fd2[2];
-        if( pipe(fd2) ){
-            return -1;
-        }
-        close(STDOUT_FILENO);
-        dup2(fd2[1],STDOUT_FILENO);
-        close(fd[0]);
-        close(fd2[1]);
-        system(pCmd);
-        read(fd2[0], pRes, iMaxLen);
-        write(fd[1], pRes, strlen(pRes));	// confirmed_safe_unsafe_usage
-        close(fd2[0]);
-		close(fd[1]);
-        exit(0);
-    }
-    close(fd[1]);
-    p = pRes;
-    left = iMaxLen;
-    while( (iCount = read(fd[0], p, left)) ){
-        p += iCount;
-        left -= iCount;
-        if( left <=0 ) break;   
-    }
-    close(fd[0]);
-    return 0;
-}
-
-void WelsSleep( uint32_t dwMilliseconds )
-{
-	usleep( dwMilliseconds * 1000 );	// microseconds
-}
-
-WELS_THREAD_ERROR_CODE    WelsThreadCreate( WELS_THREAD_HANDLE * thread,  LPWELS_THREAD_ROUTINE  routine, 
-										   void * arg, WELS_THREAD_ATTR attr)
-{
-	WELS_THREAD_ERROR_CODE err = 0;
-
-	pthread_attr_t at;
-	err = pthread_attr_init(&at);
-	if ( err )
-		return err;
-	err = pthread_attr_setscope(&at, PTHREAD_SCOPE_SYSTEM);
-	if ( err )
-		return err;
-	err = pthread_attr_setschedpolicy(&at, SCHED_FIFO);
-	if ( err )
-		return err;
-	err = pthread_create( thread, &at, routine, arg );
-
-	pthread_attr_destroy(&at);
-
-	return err;
-
-//	return pthread_create(thread, NULL, routine, arg); 
-}
-
-WELS_THREAD_ERROR_CODE	  WelsSetThreadCancelable()
-{
-	WELS_THREAD_ERROR_CODE err = pthread_setcancelstate( PTHREAD_CANCEL_ENABLE, NULL );
-	if ( 0 == err )
-		err = pthread_setcanceltype( PTHREAD_CANCEL_DEFERRED, NULL );
-	return err;
-}
-
-WELS_THREAD_ERROR_CODE    WelsThreadJoin( WELS_THREAD_HANDLE  thread )
-{
-    return pthread_join(thread, NULL);
-}
-
-WELS_THREAD_ERROR_CODE    WelsThreadCancel( WELS_THREAD_HANDLE  thread )
-{
-	return pthread_cancel( thread );
-}
-
-WELS_THREAD_ERROR_CODE    WelsThreadDestroy( WELS_THREAD_HANDLE *thread )
-{	
-	return WELS_THREAD_ERROR_OK;
-}
-
-WELS_THREAD_HANDLE        WelsThreadSelf()
-{
-	return pthread_self();
-}
-
-WELS_THREAD_ERROR_CODE    WelsMutexInit( WELS_MUTEX   * mutex )
-{
-	return pthread_mutex_init(mutex, NULL);
-}
-
-WELS_THREAD_ERROR_CODE    WelsMutexLock( WELS_MUTEX   * mutex )
-{
-	return pthread_mutex_lock(mutex);
-}
-
-WELS_THREAD_ERROR_CODE    WelsMutexUnlock( WELS_MUTEX * mutex )
-{
-	return pthread_mutex_unlock(mutex);
-}
-
-WELS_THREAD_ERROR_CODE    WelsMutexDestroy( WELS_MUTEX * mutex )
-{
-    return pthread_mutex_destroy(mutex);
-}
-
-// unnamed semaphores can not work well for posix threading models under not root users
-
-WELS_THREAD_ERROR_CODE    WelsEventInit( WELS_EVENT *event )
-{
-	return sem_init(event, 0, 0);
-}
-
-WELS_THREAD_ERROR_CODE   WelsEventDestroy( WELS_EVENT * event )
-{
-	return sem_destroy( event );	// match with sem_init	
-}
-
-WELS_THREAD_ERROR_CODE    WelsEventOpen( WELS_EVENT **p_event, str_t *event_name )
-{
-	if ( p_event == NULL || event_name == NULL )
-		return WELS_THREAD_ERROR_GENERIAL;
-	*p_event = sem_open(event_name, O_CREAT,  (S_IRUSR | S_IWUSR)/*0600*/, 0);
-	if ( *p_event == (sem_t *)SEM_FAILED ) {
-		sem_unlink( event_name );
-		*p_event = NULL;
-		return WELS_THREAD_ERROR_GENERIAL;
-	} else {		
-		return WELS_THREAD_ERROR_OK;
-	}
-}
-WELS_THREAD_ERROR_CODE    WelsEventClose( WELS_EVENT *event, str_t *event_name )
-{
-	WELS_THREAD_ERROR_CODE err = sem_close( event );	// match with sem_open
-	if ( event_name )
-		sem_unlink( event_name );
-	return err;
-}
-
-WELS_THREAD_ERROR_CODE   WelsEventSignal( WELS_EVENT * event )
-{
-	WELS_THREAD_ERROR_CODE err = 0;
-//	int32_t val = 0;
-//	sem_getvalue(event, &val);
-//	fprintf( stderr, "before signal it, val= %d..\n",val );
-	err = sem_post(event);
-//	sem_getvalue(event, &val);
-//	fprintf( stderr, "after signal it, val= %d..\n",val );
-    return err;
-}
-WELS_THREAD_ERROR_CODE    WelsEventReset( WELS_EVENT * event )
-{
-	// FIXME for posix event reset, seems not be supported for pthread??
-	sem_close(event);
-	return sem_init(event, 0, 0);
-}
-
-WELS_THREAD_ERROR_CODE   WelsEventWait( WELS_EVENT * event )
-{
-	return sem_wait(event);	// blocking until signaled
-}
-
-WELS_THREAD_ERROR_CODE    WelsEventWaitWithTimeOut( WELS_EVENT * event, uint32_t dwMilliseconds )
-{	
-	if ( dwMilliseconds != (uint32_t)-1 )
-	{
-		return sem_wait(event);
-	}
-	else
-	{
-#if defined(MACOS)
-		int32_t err = 0;
-		int32_t wait_count = 0;
-		do{
-			err = sem_trywait(event);
-			if ( WELS_THREAD_ERROR_OK == err)
-				break;// WELS_THREAD_ERROR_OK;
-			else if ( wait_count > 0 )
-				break;
-			usleep( dwMilliseconds * 1000 );
-			++ wait_count;
-		}while(1);
-		return err;
-#else
-		struct timespec ts;
-		struct timeval tv;
-
-		gettimeofday(&tv,0);
-
-		ts.tv_sec = tv.tv_sec + dwMilliseconds /1000;
-		ts.tv_nsec = tv.tv_usec*1000 + (dwMilliseconds % 1000) * 1000000;
-
-		return sem_timedwait(event, &ts);
-#endif//MACOS
-	}
-}
-
-WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitSingleBlocking(	uint32_t nCount,
-																WELS_EVENT **event_list,
-																uint32_t dwMilliseconds )
-{
-	// bWaitAll = FALSE && blocking
-	uint32_t nIdx = 0;
-	const uint32_t kuiAccessTime = 2;	// 2 us once
-//	uint32_t uiSleepMs = 0;
-
-	if ( nCount == 0 )
-		return WELS_THREAD_ERROR_WAIT_FAILED;
-
-	while (1)
-	{
-		nIdx = 0;	// access each event by order
-		while ( nIdx < nCount )
-		{
-			int32_t err = 0;			
-//#if defined(MACOS)	// clock_gettime(CLOCK_REALTIME) & sem_timedwait not supported on mac, so have below impl
-			int32_t wait_count = 0;
-//			struct timespec ts;
-//			struct timeval tv;
-//			
-//			gettimeofday(&tv,0);
-//			ts.tv_sec = tv.tv_sec/*+ kuiAccessTime / 1000*/;		// second
-//			ts.tv_nsec = (tv.tv_usec + kuiAccessTime) * 1000;	// nano-second
-			
-			/*
-			 * although such interface is not used in __GNUC__ like platform, to use 
-			 * pthread_cond_timedwait() might be better choice if need
-			 */
-			do{
-				err = sem_trywait( event_list[nIdx] );
-				if ( WELS_THREAD_ERROR_OK == err )
-					return WELS_THREAD_ERROR_WAIT_OBJECT_0 + nIdx;
-				else if ( wait_count > 0 )
-					break;
-				usleep(kuiAccessTime);
-				++ wait_count;
-			}while( 1 );
-//#else
-//			struct timespec ts;
-//			
-//			if ( clock_gettime(CLOCK_REALTIME, &ts) == -1 )
-//				return WELS_THREAD_ERROR_WAIT_FAILED;
-//			ts.tv_nsec += kuiAccessTime/*(kuiAccessTime % 1000)*/ * 1000;
-//			
-////			fprintf( stderr, "sem_timedwait(): start to wait event %d..\n", nIdx );
-//			err = sem_timedwait(event_list[nIdx], &ts);
-////			if ( err == -1 )
-////			{
-////				sem_getvalue(&event_list[nIdx], &val);
-////				fprintf( stderr, "sem_timedwait() errno(%d) semaphore %d..\n", errno, val);
-////				return WELS_THREAD_ERROR_WAIT_FAILED;
-////			}			
-////			fprintf( stderr, "sem_timedwait(): wait event %d result %d errno %d..\n", nIdx, err, errno );
-//			if ( WELS_THREAD_ERROR_OK == err ) // non-blocking mode
-//			{	
-////				int32_t val = 0;
-////				sem_getvalue(&event_list[nIdx], &val);
-////				fprintf( stderr, "after sem_timedwait(), event_list[%d] semaphore value= %d..\n", nIdx, val);
-////				fprintf( stderr, "WelsMultipleEventsWaitSingleBlocking sleep %d us\n", uiSleepMs);
-//				return WELS_THREAD_ERROR_WAIT_OBJECT_0 + nIdx;
-//			}
-//#endif					
-			// we do need access next event next time
-			++ nIdx;
-//			uiSleepMs += kuiAccessTime;
-		}
-		usleep( 1 );	// switch to working threads
-//		++ uiSleepMs;
-	}	
-
-	return WELS_THREAD_ERROR_WAIT_FAILED;
-}
-
-WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitAllBlocking( uint32_t nCount, WELS_EVENT **event_list )
-{
-	// bWaitAll = TRUE && blocking
-	uint32_t nIdx = 0;
-//	const uint32_t kuiAccessTime = (uint32_t)-1;// 1 ms once
-	uint32_t uiCountSignals = 0;
-	uint32_t uiSignalFlag	= 0;	// UGLY: suppose maximal event number up to 32
-	
-	if ( nCount == 0 || nCount > (sizeof(uint32_t)<<3) )
-		return WELS_THREAD_ERROR_WAIT_FAILED;
-	
-	while (1)
-	{
-		nIdx = 0;	// access each event by order
-		while (nIdx < nCount)
-		{			
-			const uint32_t kuiBitwiseFlag = (1<<nIdx);
-			
-			if ( (uiSignalFlag & kuiBitwiseFlag) != kuiBitwiseFlag ) // non-blocking mode
-			{	
-				int32_t err = 0;
-//				fprintf( stderr, "sem_wait(): start to wait event %d..\n", nIdx );
-				err = sem_wait(event_list[nIdx]);
-//				fprintf( stderr, "sem_wait(): wait event %d result %d errno %d..\n", nIdx, err, errno );
-				if ( WELS_THREAD_ERROR_OK == err )
-				{
-//					int32_t val = 0;
-//					sem_getvalue(&event_list[nIdx], &val);
-//					fprintf( stderr, "after sem_timedwait(), event_list[%d] semaphore value= %d..\n", nIdx, val);
-
-					uiSignalFlag |= kuiBitwiseFlag;
-					++ uiCountSignals;
-					if ( uiCountSignals >= nCount )
-					{						
-						return WELS_THREAD_ERROR_OK;
-					}
-				}				
-			}			
-			// we do need access next event next time
-			++ nIdx;
-		}		
-	}	
-	
-	return WELS_THREAD_ERROR_WAIT_FAILED;
-}
-
-WELS_THREAD_ERROR_CODE    WelsQueryLogicalProcessInfo(WelsLogicalProcessInfo * pInfo)
-{
-#ifdef LINUX
-
-#define   CMD_RES_SIZE    2048
-    str_t pBuf[CMD_RES_SIZE];
-   
-    SystemCall("cat /proc/cpuinfo | grep \"processor\" | wc -l", pBuf, CMD_RES_SIZE);
-
-    pInfo->ProcessorCount = atoi(pBuf);
-
-    if( pInfo->ProcessorCount == 0 ){
-        pInfo->ProcessorCount = 1;
-    }   
- 
-	return WELS_THREAD_ERROR_OK;
-#undef   CMD_RES_SIZE
-
-#else
-
-	SInt32 cpunumber;
-	Gestalt(gestaltCountOfCPUs,&cpunumber);
-
-	pInfo->ProcessorCount	= cpunumber;
-
-	return WELS_THREAD_ERROR_OK;
-
-#endif//LINUX
-}
-
-#endif
-
-
-
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	WelsThreadLib.c
+ *
+ * \brief	Interfaces introduced in thread programming
+ *
+ * \date	11/17/2009 Created
+ *
+ *************************************************************************************
+ */
+
+
+#include "WelsThreadLib.h"
+#include <stdio.h>
+
+#ifdef  WIN32
+
+void WelsSleep (uint32_t dwMilliseconds) {  // WIN32 build: passes the millisecond count straight to Sleep()
+  Sleep (dwMilliseconds);
+}
+
+WELS_THREAD_ERROR_CODE    WelsMutexInit (WELS_MUTEX*    mutex) {  // WIN32 build: initializes *mutex as a critical section
+  InitializeCriticalSection (mutex);
+
+  return WELS_THREAD_ERROR_OK;  // no outcome is inspected; OK is reported unconditionally
+}
+
+WELS_THREAD_ERROR_CODE    WelsMutexLock (WELS_MUTEX*    mutex) {  // WIN32 build: acquires the lock via EnterCriticalSection()
+  EnterCriticalSection (mutex);
+
+  return WELS_THREAD_ERROR_OK;  // reached only after EnterCriticalSection() returns; always OK
+}
+
+WELS_THREAD_ERROR_CODE    WelsMutexUnlock (WELS_MUTEX* mutex) {  // WIN32 build: releases the lock via LeaveCriticalSection()
+  LeaveCriticalSection (mutex);
+
+  return WELS_THREAD_ERROR_OK;  // no outcome is inspected; OK is reported unconditionally
+}
+
+WELS_THREAD_ERROR_CODE    WelsMutexDestroy (WELS_MUTEX* mutex) {  // WIN32 build: counterpart of WelsMutexInit
+  DeleteCriticalSection (mutex);
+
+  return WELS_THREAD_ERROR_OK;  // no outcome is inspected; OK is reported unconditionally
+}
+
+WELS_THREAD_ERROR_CODE    WelsEventInit (WELS_EVENT*    event) {
+  // Create an unnamed event with bManualReset = FALSE and bInitialState = FALSE.
+  WELS_EVENT   hNewEvent = CreateEvent (NULL, FALSE, FALSE, NULL);
+  if (hNewEvent != NULL) {
+    *event = hNewEvent;   // *event is written only when creation succeeded
+    return WELS_THREAD_ERROR_OK;
+  }
+  return WELS_THREAD_ERROR_GENERIAL;
+}
+
+WELS_THREAD_ERROR_CODE    WelsEventSignal (WELS_EVENT* event) {
+  // Map SetEvent()'s result onto the library's error codes:
+  // non-zero -> WELS_THREAD_ERROR_OK, zero -> WELS_THREAD_ERROR_GENERIAL.
+  const BOOL kbSignaled = SetEvent (*event);
+  return kbSignaled ? WELS_THREAD_ERROR_OK : WELS_THREAD_ERROR_GENERIAL;
+}
+
+WELS_THREAD_ERROR_CODE    WelsEventReset (WELS_EVENT* event) {
+  // A zero result from ResetEvent() is reported as WELS_THREAD_ERROR_GENERIAL.
+  const BOOL kbReset = ResetEvent (*event);
+  return kbReset ? WELS_THREAD_ERROR_OK : WELS_THREAD_ERROR_GENERIAL;
+}
+
+WELS_THREAD_ERROR_CODE    WelsEventWait (WELS_EVENT* event) {
+  return WaitForSingleObject (*event, INFINITE); // waits without timeout; the raw wait result is returned
+}
+
+WELS_THREAD_ERROR_CODE    WelsEventWaitWithTimeOut (WELS_EVENT* event, uint32_t dwMilliseconds) {
+  return WaitForSingleObject (*event, dwMilliseconds); // timeout in milliseconds; the raw wait result is returned
+}
+
+WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitSingleBlocking (uint32_t nCount,
+    WELS_EVENT* event_list,
+    uint32_t dwMilliseconds) {
+  return WaitForMultipleObjects (nCount, event_list, FALSE, dwMilliseconds); // FALSE = bWaitAll off: wait for any one of the nCount events
+}
+
+WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitAllBlocking (uint32_t nCount, WELS_EVENT* event_list) {
+  return WaitForMultipleObjects (nCount, event_list, TRUE, (uint32_t) - 1); // TRUE = bWaitAll on; all-ones timeout (presumably the same value as INFINITE)
+}
+
+WELS_THREAD_ERROR_CODE    WelsEventDestroy (WELS_EVENT* event) {
+  CloseHandle (*event);
+  // CloseHandle()'s result is not checked; the caller's handle is cleared so it cannot be reused by accident.
+  *event = NULL;
+  return WELS_THREAD_ERROR_OK;
+}
+
+
+WELS_THREAD_ERROR_CODE    WelsThreadCreate (WELS_THREAD_HANDLE* thread,  LPWELS_THREAD_ROUTINE  routine,
+    void* arg, WELS_THREAD_ATTR attr) {
+  // attr is part of the shared interface but is not consulted in this branch.
+  WELS_THREAD_HANDLE   hNewThread = CreateThread (NULL, 0, routine, arg, 0, NULL);
+
+  if (hNewThread != NULL) {
+    *thread = hNewThread;   // the handle is handed back only when creation succeeded
+    return WELS_THREAD_ERROR_OK;
+  }
+  return WELS_THREAD_ERROR_GENERIAL;
+}
+
+WELS_THREAD_ERROR_CODE	  WelsSetThreadCancelable() {
+  // No-op in the WIN32 build: nothing is configured and success is always reported.
+  return WELS_THREAD_ERROR_OK;
+}
+
+WELS_THREAD_ERROR_CODE    WelsThreadJoin (WELS_THREAD_HANDLE  thread) {
+  WaitForSingleObject (thread, INFINITE);
+  // The wait result is discarded, so success is reported unconditionally.
+  return WELS_THREAD_ERROR_OK;
+}
+
+WELS_THREAD_ERROR_CODE    WelsThreadCancel (WELS_THREAD_HANDLE  thread) {
+  return WELS_THREAD_ERROR_OK; // no-op in the WIN32 build: thread is ignored
+}
+
+
+WELS_THREAD_ERROR_CODE    WelsThreadDestroy (WELS_THREAD_HANDLE* thread) {
+  if (thread == NULL)
+    return WELS_THREAD_ERROR_OK;   // nothing to release; still reported as success
+  CloseHandle (*thread);
+  *thread = NULL;                  // clear the caller's copy of the closed handle
+  return WELS_THREAD_ERROR_OK;
+}
+
+WELS_THREAD_HANDLE        WelsThreadSelf() {
+  return GetCurrentThread(); // handle value GetCurrentThread() yields for the calling thread
+}
+
+WELS_THREAD_ERROR_CODE    WelsQueryLogicalProcessInfo (WelsLogicalProcessInfo* pInfo) {
+  SYSTEM_INFO  si;
+  // Copies dwNumberOfProcessors, as filled in by GetSystemInfo(), into pInfo->ProcessorCount.
+  GetSystemInfo (&si);
+
+  pInfo->ProcessorCount = si.dwNumberOfProcessors;
+  // Always reports success; pInfo is dereferenced without a NULL check.
+  return WELS_THREAD_ERROR_OK;
+}
+
+#elif   defined(__GNUC__)
+
+#ifdef MACOS
+#include <CoreServices/CoreServices.h>
+//#include <Gestalt.h>
+#endif//MACOS
+
+static int32_t  SystemCall (const str_t* pCmd, str_t* pRes, int32_t iSize) {
+  // Runs pCmd through the shell and captures its standard output.
+  //
+  //   pCmd   command line handed to popen()
+  //   pRes   output buffer; zero-filled first, so the captured text is
+  //          always NUL-terminated
+  //   iSize  capacity of pRes in bytes; at most iSize - 1 bytes are stored
+  //
+  // Returns 0 once the command was started (its exit status is not
+  // inspected) and -1 for invalid arguments or when popen() fails.
+  //
+  // popen()/pclose() replace a hand-rolled fork()/pipe() sequence: the child
+  // is now always reaped and no code path lets a forked child return into
+  // the caller.
+  FILE* pPipe = NULL;
+  int32_t iTotal = 0;
+  int32_t iMaxLen = 0;
+
+  if (pCmd == NULL || pRes == NULL || iSize <= 0) {
+    return -1;
+  }
+  memset (pRes, 0, iSize);
+  iMaxLen = iSize - 1;
+  pPipe = popen (pCmd, "r");
+  if (pPipe == NULL) {
+    return -1;
+  }
+
+  // fread() may return short counts; keep reading until EOF, an error, or a
+  // full buffer.
+  while (iTotal < iMaxLen) {
+    size_t uiGot = fread (pRes + iTotal, 1, (size_t) (iMaxLen - iTotal), pPipe);
+    if (uiGot == 0)
+      break;
+    iTotal += (int32_t) uiGot;
+  }
+
+  pclose (pPipe);   // also waits for the command to terminate
+  return 0;
+}
+
+void WelsSleep (uint32_t dwMilliseconds) {
+  usleep (dwMilliseconds * 1000);	// usleep() takes microseconds. NOTE(review): the product looks 32-bit and would wrap above ~4294967 ms - confirm
+}
+
+WELS_THREAD_ERROR_CODE    WelsThreadCreate (WELS_THREAD_HANDLE* thread,  LPWELS_THREAD_ROUTINE  routine,
+    void* arg, WELS_THREAD_ATTR attr) {
+  // Creates a thread that runs routine(arg). The attribute object requests
+  // system contention scope and the SCHED_FIFO policy; attr is not consulted.
+  // Returns 0 on success, otherwise the error number of the first pthread
+  // call that failed.
+  WELS_THREAD_ERROR_CODE err = 0;
+  pthread_attr_t at;
+
+  err = pthread_attr_init (&at);
+  if (err)
+    return err;   // nothing was initialised, so there is nothing to destroy
+
+  err = pthread_attr_setscope (&at, PTHREAD_SCOPE_SYSTEM);
+  if (0 == err)
+    err = pthread_attr_setschedpolicy (&at, SCHED_FIFO);
+  if (0 == err)
+    err = pthread_create (thread, &at, routine, arg);
+
+  pthread_attr_destroy (&at);   // released on every path once init succeeded
+  return err;
+}
+
+WELS_THREAD_ERROR_CODE	  WelsSetThreadCancelable() {
+  // Enable cancellation first; select the deferred type only if that worked.
+  const WELS_THREAD_ERROR_CODE kiState = pthread_setcancelstate (PTHREAD_CANCEL_ENABLE, NULL);
+  return (kiState != 0) ? kiState
+         : pthread_setcanceltype (PTHREAD_CANCEL_DEFERRED, NULL);
+}
+
+WELS_THREAD_ERROR_CODE    WelsThreadJoin (WELS_THREAD_HANDLE  thread) {
+  return pthread_join (thread, NULL); // NULL: the thread's exit value is not collected; pthread_join()'s result is returned
+}
+
+WELS_THREAD_ERROR_CODE    WelsThreadCancel (WELS_THREAD_HANDLE  thread) {
+  return pthread_cancel (thread); // requests cancellation of thread; pthread_cancel()'s result is returned
+}
+
+WELS_THREAD_ERROR_CODE    WelsThreadDestroy (WELS_THREAD_HANDLE* thread) {
+  return WELS_THREAD_ERROR_OK; // no-op in this branch: thread is neither read nor modified
+}
+
+WELS_THREAD_HANDLE        WelsThreadSelf() {
+  return pthread_self(); // identifier of the calling thread as given by pthread_self()
+}
+
+WELS_THREAD_ERROR_CODE    WelsMutexInit (WELS_MUTEX*    mutex) {
+  return pthread_mutex_init (mutex, NULL); // NULL attributes = default mutex; the pthread result is returned as-is
+}
+
+WELS_THREAD_ERROR_CODE    WelsMutexLock (WELS_MUTEX*    mutex) {
+  return pthread_mutex_lock (mutex); // thin wrapper: the pthread result is returned as-is
+}
+
+WELS_THREAD_ERROR_CODE    WelsMutexUnlock (WELS_MUTEX* mutex) {
+  return pthread_mutex_unlock (mutex); // thin wrapper: the pthread result is returned as-is
+}
+
+WELS_THREAD_ERROR_CODE    WelsMutexDestroy (WELS_MUTEX* mutex) {
+  return pthread_mutex_destroy (mutex); // counterpart of WelsMutexInit(); the pthread result is returned as-is
+}
+
+// NOTE: unnamed semaphores may not work well for non-root users under some POSIX threading models
+
+WELS_THREAD_ERROR_CODE    WelsEventInit (WELS_EVENT* event) {
+  return sem_init (event, 0, 0); // unnamed semaphore: pshared = 0, initial value = 0; sem_init()'s result is returned
+}
+
+WELS_THREAD_ERROR_CODE   WelsEventDestroy (WELS_EVENT* event) {
+  return sem_destroy (event);	// counterpart of sem_init() as used by WelsEventInit(); sem_destroy()'s result is returned
+}
+
+WELS_THREAD_ERROR_CODE    WelsEventOpen (WELS_EVENT** p_event, str_t* event_name) {
+  // Opens the named semaphore event_name (O_CREAT, mode 0600, initial value 0).
+  if (NULL == p_event || NULL == event_name)
+    return WELS_THREAD_ERROR_GENERIAL;
+  *p_event = sem_open (event_name, O_CREAT, (S_IRUSR | S_IWUSR)/*0600*/, 0);
+  if (*p_event != (sem_t*)SEM_FAILED)
+    return WELS_THREAD_ERROR_OK;
+
+  sem_unlink (event_name);   // failure path: drop the name and hand back NULL
+  *p_event = NULL;
+  return WELS_THREAD_ERROR_GENERIAL;
+}
+WELS_THREAD_ERROR_CODE    WelsEventClose (WELS_EVENT* event, str_t* event_name) {
+  const WELS_THREAD_ERROR_CODE kiCloseRet = sem_close (event);  // pairs with sem_open()
+  if (event_name != NULL)
+    sem_unlink (event_name);   // the name is removed whether or not the close succeeded
+  return kiCloseRet;
+}
+
+WELS_THREAD_ERROR_CODE   WelsEventSignal (WELS_EVENT* event) {
+  // Signalling an event is a single post on its semaphore.
+  //
+  // The result of sem_post() is handed back to the caller unchanged, so the
+  // value follows that call's convention rather than being translated into
+  // a WELS_THREAD_ERROR_* constant.
+  // (To trace the counter while debugging, sem_getvalue() can be called
+  // before and after the post.)
+  return sem_post (event);
+}
+WELS_THREAD_ERROR_CODE    WelsEventReset (WELS_EVENT* event) {
+  // Reset = consume every pending post so that the next wait blocks again.
+  while (sem_trywait (event) == 0) {}   // valid for sem_init() and sem_open() semaphores alike
+  return WELS_THREAD_ERROR_OK;          // the old sem_close()+sem_init() pair matched neither creation path
+}
+
+WELS_THREAD_ERROR_CODE   WelsEventWait (WELS_EVENT* event) {
+  return sem_wait (event);	// blocks until the semaphore can be decremented; sem_wait()'s result is returned
+}
+
+WELS_THREAD_ERROR_CODE    WelsEventWaitWithTimeOut (WELS_EVENT* event, uint32_t dwMilliseconds) {
+  // Waits on event for at most dwMilliseconds. (uint32_t)-1 means "no timeout"
+  // and blocks in sem_wait(). The semaphore call's own result is returned.
+  if (dwMilliseconds == (uint32_t) - 1) {
+    return sem_wait (event);
+  } else {
+#if defined(MACOS)
+    // This branch does not use sem_timedwait(): it polls once, sleeps for the
+    // whole timeout, then polls a second and final time.
+    int32_t err = sem_trywait (event);
+    if (WELS_THREAD_ERROR_OK != err) {
+      usleep (dwMilliseconds * 1000);
+      err = sem_trywait (event);
+    }
+    return err;
+#else
+    struct timespec ts;
+    struct timeval tv;
+    gettimeofday (&tv, 0);
+    // Absolute deadline = current time + timeout.
+    ts.tv_sec = tv.tv_sec + dwMilliseconds / 1000;
+    ts.tv_nsec = tv.tv_usec * 1000 + (dwMilliseconds % 1000) * 1000000;
+    if (ts.tv_nsec >= 1000000000) {   // keep tv_nsec below 1e9, otherwise sem_timedwait() rejects it
+      ts.tv_sec += 1;
+      ts.tv_nsec -= 1000000000;
+    }
+    return sem_timedwait (event, &ts);
+#endif//MACOS
+  }
+}
+
+WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitSingleBlocking (uint32_t nCount,
+    WELS_EVENT** event_list,
+    uint32_t dwMilliseconds) {
+  // Wait-for-any (bWaitAll = FALSE): polls the events in index order until one sem_trywait() succeeds.
+  uint32_t nIdx = 0;
+  const uint32_t kuiAccessTime = 2;	// pause between the two polls of one event, in microseconds (passed to usleep)
+  // Returns WELS_THREAD_ERROR_WAIT_OBJECT_0 + index of the acquired event, or WELS_THREAD_ERROR_WAIT_FAILED if nCount is 0.
+  // NOTE(review): dwMilliseconds is never read below, so no timeout is applied.
+  if (nCount == 0)
+    return WELS_THREAD_ERROR_WAIT_FAILED;
+
+  while (1) {
+    nIdx = 0;	// access each event by order
+    while (nIdx < nCount) {
+      int32_t err = 0;
+//#if defined(MACOS)	// clock_gettime(CLOCK_REALTIME) & sem_timedwait not supported on mac, so have below impl
+      int32_t wait_count = 0;
+//			struct timespec ts;
+//			struct timeval tv;
+//
+//			gettimeofday(&tv,0);
+//			ts.tv_sec = tv.tv_sec/*+ kuiAccessTime / 1000*/;		// second
+//			ts.tv_nsec = (tv.tv_usec + kuiAccessTime) * 1000;	// nano-second
+
+      /*
+       * Each event is polled at most twice per round, with a kuiAccessTime pause in between.
+       * Although this interface is not used on __GNUC__-like platforms,
+       * pthread_cond_timedwait() might be a better choice if it is ever needed.
+       */
+      do {
+        err = sem_trywait (event_list[nIdx]);
+        if (WELS_THREAD_ERROR_OK == err)
+          return WELS_THREAD_ERROR_WAIT_OBJECT_0 + nIdx;
+        else if (wait_count > 0)
+          break;
+        usleep (kuiAccessTime);
+        ++ wait_count;
+      } while (1);
+//#else
+//			struct timespec ts;
+//
+//			if ( clock_gettime(CLOCK_REALTIME, &ts) == -1 )
+//				return WELS_THREAD_ERROR_WAIT_FAILED;
+//			ts.tv_nsec += kuiAccessTime/*(kuiAccessTime % 1000)*/ * 1000;
+//
+////			fprintf( stderr, "sem_timedwait(): start to wait event %d..\n", nIdx );
+//			err = sem_timedwait(event_list[nIdx], &ts);
+////			if ( err == -1 )
+////			{
+////				sem_getvalue(&event_list[nIdx], &val);
+////				fprintf( stderr, "sem_timedwait() errno(%d) semaphore %d..\n", errno, val);
+////				return WELS_THREAD_ERROR_WAIT_FAILED;
+////			}
+////			fprintf( stderr, "sem_timedwait(): wait event %d result %d errno %d..\n", nIdx, err, errno );
+//			if ( WELS_THREAD_ERROR_OK == err ) // non-blocking mode
+//			{
+////				int32_t val = 0;
+////				sem_getvalue(&event_list[nIdx], &val);
+////				fprintf( stderr, "after sem_timedwait(), event_list[%d] semaphore value= %d..\n", nIdx, val);
+////				fprintf( stderr, "WelsMultipleEventsWaitSingleBlocking sleep %d us\n", uiSleepMs);
+//				return WELS_THREAD_ERROR_WAIT_OBJECT_0 + nIdx;
+//			}
+//#endif
+      // this event was not signaled in this round; move on to the next one
+      ++ nIdx;
+//			uiSleepMs += kuiAccessTime;
+    }
+    usleep (1);	// switch to working threads
+//		++ uiSleepMs;
+  }
+  // Not reached: the loop above is only left through the return statement inside it.
+  return WELS_THREAD_ERROR_WAIT_FAILED;
+}
+
+WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitAllBlocking (uint32_t nCount, WELS_EVENT** event_list) {
+  // Blocks until every one of the nCount events in event_list has been
+  // acquired once with sem_wait() (bWaitAll = TRUE).
+  //
+  // Returns WELS_THREAD_ERROR_OK once all events were acquired, or
+  // WELS_THREAD_ERROR_WAIT_FAILED when nCount is 0 or exceeds the 32 events
+  // that the uiSignalFlag bitmask can track.
+  uint32_t nIdx = 0;
+  uint32_t uiCountSignals = 0;
+  uint32_t uiSignalFlag = 0;    // bit i set <=> event i already acquired
+
+  if (nCount == 0 || nCount > (sizeof (uint32_t) << 3))
+    return WELS_THREAD_ERROR_WAIT_FAILED;
+
+  while (1) {
+    nIdx = 0;   // access each event by order
+    while (nIdx < nCount) {
+      // Unsigned shift: with a plain int constant, 1 << 31 would overflow.
+      const uint32_t kuiBitwiseFlag = ((uint32_t)1 << nIdx);
+
+      if ((uiSignalFlag & kuiBitwiseFlag) != kuiBitwiseFlag) { // not acquired yet
+        int32_t err = sem_wait (event_list[nIdx]);
+        if (WELS_THREAD_ERROR_OK == err) {
+          uiSignalFlag |= kuiBitwiseFlag;
+          ++ uiCountSignals;
+          if (uiCountSignals >= nCount) {
+            return WELS_THREAD_ERROR_OK;
+          }
+        }
+        // A failed sem_wait() leaves the bit clear; the event is retried on
+        // the next pass of the outer loop.
+      }
+      ++ nIdx;
+    }
+  }
+
+  // Not reached: the loop above is only left through the return inside it.
+  return WELS_THREAD_ERROR_WAIT_FAILED;
+}
+
+WELS_THREAD_ERROR_CODE    WelsQueryLogicalProcessInfo (WelsLogicalProcessInfo* pInfo) {
+  // Stores the logical processor count in pInfo->ProcessorCount, using 1
+  // whenever the platform query fails or reports no processor at all.
+#ifdef LINUX
+#define   CMD_RES_SIZE    2048
+  str_t pBuf[CMD_RES_SIZE];
+
+  // Count the "processor" lines of /proc/cpuinfo; on failure keep the count at 0.
+  pInfo->ProcessorCount = 0;
+  if (SystemCall ("cat /proc/cpuinfo | grep \"processor\" | wc -l", pBuf, CMD_RES_SIZE) == 0)
+    pInfo->ProcessorCount = atoi (pBuf);
+
+  if (pInfo->ProcessorCount == 0) {
+    pInfo->ProcessorCount = 1;
+  }
+
+  return WELS_THREAD_ERROR_OK;
+#undef   CMD_RES_SIZE
+
+#else
+  SInt32 cpunumber = 1;   // defined value even if Gestalt() does not write it
+  if (Gestalt (gestaltCountOfCPUs, &cpunumber) != 0 || cpunumber < 1)
+    cpunumber = 1;
+  pInfo->ProcessorCount = cpunumber;
+
+  return WELS_THREAD_ERROR_OK;
+#endif//LINUX
+}
+
+#endif
+
+
+
--- a/codec/api/svc/codec_api.h
+++ b/codec/api/svc/codec_api.h
@@ -36,91 +36,89 @@
 #include "codec_app_def.h"
 #include "codec_def.h"
 
-class ISVCEncoder
-{
-public:
-	/*
-	 * return: CM_RETURN: 0 - success; otherwise - failed;
-	 */
-	virtual int Initialize(SVCEncodingParam* pParam, const INIT_TYPE kiInitType = INIT_TYPE_PARAMETER_BASED) = 0;
-	virtual int Initialize(void* pParam, const INIT_TYPE kiInitType = INIT_TYPE_CONFIG_BASED) = 0;	
-	  
-	virtual int Unintialize() = 0;
-	
-	/*
-	 * return: EVideoFrameType [IDR: videoFrameTypeIDR; P: videoFrameTypeP; ERROR: videoFrameTypeInvalid]
-	 */
-	virtual int EncodeFrame(const unsigned char* kpSrc, SFrameBSInfo* pBsInfo) = 0;	
-	virtual int EncodeFrame(const SSourcePicture  ** kppSrcPicList, int nSrcPicNum, SFrameBSInfo * pBsInfo) = 0;
-	
-	/*
-	 * return: 0 - success; otherwise - failed;
-	 */
-	virtual int PauseFrame(const unsigned char* kpSrc, SFrameBSInfo* pBsInfo) = 0;	
-	
-	/*
-	 * return: 0 - success; otherwise - failed;
-	 */
-	virtual int ForceIntraFrame(bool bIDR) = 0;		
-	
-	/************************************************************************
-	 * InDataFormat, IDRInterval, SVC Encode Param, Frame Rate, Bitrate,..
-	 ************************************************************************/
-	/*
-	 * return: CM_RETURN: 0 - success; otherwise - failed;
-	 */
-	virtual int SetOption(ENCODER_OPTION eOptionId, void* pOption) = 0;
-	virtual int GetOption(ENCODER_OPTION eOptionId, void* pOption) = 0;
+class ISVCEncoder {
+ public:
+  /*
+   * return: CM_RETURN: 0 - success; otherwise - failed;
+   */
+  virtual int Initialize (SVCEncodingParam* pParam, const INIT_TYPE kiInitType = INIT_TYPE_PARAMETER_BASED) = 0;
+  virtual int Initialize (void* pParam, const INIT_TYPE kiInitType = INIT_TYPE_CONFIG_BASED) = 0;
+
+  virtual int Uninitialize() = 0;
+
+  /*
+   * return: EVideoFrameType [IDR: videoFrameTypeIDR; P: videoFrameTypeP; ERROR: videoFrameTypeInvalid]
+   */
+  virtual int EncodeFrame (const unsigned char* kpSrc, SFrameBSInfo* pBsInfo) = 0;
+  virtual int EncodeFrame (const SSourcePicture**   kppSrcPicList, int nSrcPicNum, SFrameBSInfo* pBsInfo) = 0;
+
+  /*
+   * return: 0 - success; otherwise - failed;
+   */
+  virtual int PauseFrame (const unsigned char* kpSrc, SFrameBSInfo* pBsInfo) = 0;
+
+  /*
+   * return: 0 - success; otherwise - failed;
+   */
+  virtual int ForceIntraFrame (bool bIDR) = 0;
+
+  /************************************************************************
+   * InDataFormat, IDRInterval, SVC Encode Param, Frame Rate, Bitrate,..
+   ************************************************************************/
+  /*
+   * return: CM_RETURN: 0 - success; otherwise - failed;
+   */
+  virtual int SetOption (ENCODER_OPTION eOptionId, void* pOption) = 0;
+  virtual int GetOption (ENCODER_OPTION eOptionId, void* pOption) = 0;
 };
 
-class ISVCDecoder
-{
-public:
-	virtual long Initialize(void* pParam, const INIT_TYPE iInitType) = 0;
-	virtual long Unintialize() = 0;
+class ISVCDecoder {
+ public:
+  virtual long Initialize (void* pParam, const INIT_TYPE iInitType) = 0;
+  virtual long Uninitialize() = 0;
 
-	virtual DECODING_STATE DecodeFrame(	const unsigned char* pSrc,
-		                                const int iSrcLen,	
-                                        unsigned char** ppDst,
-		                                int* pStride,
-		                                int& iWidth,
-		                                int& iHeight	) = 0;
+  virtual DECODING_STATE DecodeFrame (const unsigned char* pSrc,
+                                      const int iSrcLen,
+                                      unsigned char** ppDst,
+                                      int* pStride,
+                                      int& iWidth,
+                                      int& iHeight) = 0;
 
-	/*
-	 *  src must be 4 byte aligned,   recommend 16 byte aligned.    the available src size must be multiple of 4.
-	 */
-	virtual DECODING_STATE DecodeFrame(	const unsigned char* pSrc,
-											const int iSrcLen,	
-											void ** ppDst,
-											SBufferInfo* pDstInfo) = 0;
+  /*
+   *  src must be 4 byte aligned,   recommend 16 byte aligned.    the available src size must be multiple of 4.
+   */
+  virtual DECODING_STATE DecodeFrame (const unsigned char* pSrc,
+                                      const int iSrcLen,
+                                      void** ppDst,
+                                      SBufferInfo* pDstInfo) = 0;
 
-	/*
-	 *  src must be 4 byte aligned,   recommend 16 byte aligned.    the available src size must be multiple of 4.
-	 */
-	virtual DECODING_STATE DecodeFrameEx( const unsigned char * pSrc,
-		                                  const int iSrcLen,
-		                                  unsigned char * pDst,
-										  int iDstStride,
-		                                  int & iDstLen,
-		                                  int & iWidth,
-		                                  int & iHeight,
-		                                  int & iColorFormat) = 0;
+  /*
+   *  src must be 4 byte aligned,   recommend 16 byte aligned.    the available src size must be multiple of 4.
+   */
+  virtual DECODING_STATE DecodeFrameEx (const unsigned char* pSrc,
+                                        const int iSrcLen,
+                                        unsigned char* pDst,
+                                        int iDstStride,
+                                        int& iDstLen,
+                                        int& iWidth,
+                                        int& iHeight,
+                                        int& iColorFormat) = 0;
 
-	/*************************************************************************
-	 * OutDataFormat
-	 *************************************************************************/
-	virtual long SetOption(DECODER_OPTION eOptionId, void* pOption) = 0;
-	virtual long GetOption(DECODER_OPTION eOptionId, void* pOption) = 0;
+  /*************************************************************************
+   * OutDataFormat
+   *************************************************************************/
+  virtual long SetOption (DECODER_OPTION eOptionId, void* pOption) = 0;
+  virtual long GetOption (DECODER_OPTION eOptionId, void* pOption) = 0;
 };
 
 
-extern "C" 
+extern "C"
 {
-int  CreateSVCEncoder(ISVCEncoder** ppEncoder);
-void DestroySVCEncoder(ISVCEncoder* pEncoder);
+  int  CreateSVCEncoder (ISVCEncoder** ppEncoder);
+  void DestroySVCEncoder (ISVCEncoder* pEncoder);
 
-long CreateDecoder(ISVCDecoder** ppDecoder);
-void DestroyDecoder(ISVCDecoder* pDecoder);
+  long CreateDecoder (ISVCDecoder** ppDecoder);
+  void DestroyDecoder (ISVCDecoder* pDecoder);
 }
 
 #endif//WELS_VIDEO_CODEC_SVC_API_H__
--- a/codec/api/svc/codec_app_def.h
+++ b/codec/api/svc/codec_app_def.h
@@ -50,241 +50,230 @@
 #define SAVED_NALUNIT_NUM_TMP		( (MAX_SPATIAL_LAYER_NUM*MAX_QUALITY_LAYER_NUM) + 1 + MAX_SPATIAL_LAYER_NUM ) //SPS/PPS + SEI/SSEI + PADDING_NAL
 #define MAX_SLICES_NUM_TMP			( ( MAX_NAL_UNITS_IN_LAYER - SAVED_NALUNIT_NUM_TMP ) / 3 )
 
-typedef enum
-{
-	/* Errors derived from bitstream parsing */
-	dsErrorFree			= 0x00,	/* Bitstream error-free */
-	dsFramePending		= 0x01,	/* Need more throughput to generate a frame output,  */
-	dsRefLost			= 0x02,	/* layer lost at reference frame with temporal id 0  */
-	dsBitstreamError	= 0x04,	/* Error bitstreams(maybe broken internal frame) the decoder cared */
-	dsDepLayerLost		= 0x08,	/* Dependented layer is ever lost */
-	dsNoParamSets		= 0x10, /* No parameter set NALs involved */
-	
-	/* Errors derived from logic level */
-	dsInvalidArgument	= 0x1000,	/* Invalid argument specified */
-	dsInitialOptExpected= 0x2000,	/* Initializing operation is expected */
-	dsOutOfMemory		= 0x4000,	/* Out of memory due to new request */
-		/* ANY OTHERS? */
-	dsDstBufNeedExpand	= 0x8000	/* Actual picture size exceeds size of dst pBuffer feed in decoder, so need expand its size */
-	
-}DECODING_STATE;
+typedef enum {
+  /* Errors derived from bitstream parsing */
+  dsErrorFree			= 0x00,	/* Bitstream error-free */
+  dsFramePending		= 0x01,	/* Need more throughput to generate a frame output,  */
+  dsRefLost			= 0x02,	/* layer lost at reference frame with temporal id 0  */
+  dsBitstreamError	= 0x04,	/* Error bitstreams(maybe broken internal frame) the decoder cared */
+  dsDepLayerLost		= 0x08,	/* Dependented layer is ever lost */
+  dsNoParamSets		= 0x10, /* No parameter set NALs involved */
 
+  /* Errors derived from logic level */
+  dsInvalidArgument	= 0x1000,	/* Invalid argument specified */
+  dsInitialOptExpected = 0x2000,	/* Initializing operation is expected */
+  dsOutOfMemory		= 0x4000,	/* Out of memory due to new request */
+  /* ANY OTHERS? */
+  dsDstBufNeedExpand	= 0x8000	/* Actual picture size exceeds size of dst pBuffer feed in decoder, so need expand its size */
+
+} DECODING_STATE;
+
 /* Option types introduced in SVC encoder application */
-typedef enum
-{
-	ENCODER_OPTION_DATAFORMAT = 0,
-	ENCODER_OPTION_IDR_INTERVAL,
-	ENCODER_OPTION_SVC_ENCODE_PARAM,
-	ENCODER_OPTION_FRAME_RATE,
-	ENCODER_OPTION_iBitRate,
-	ENCODER_OPTION_INTER_SPATIAL_PRED,
-	ENCODER_OPTION_RC_MODE,
-	ENCODER_PADDING_PADDING,
+typedef enum {
+  ENCODER_OPTION_DATAFORMAT = 0,
+  ENCODER_OPTION_IDR_INTERVAL,
+  ENCODER_OPTION_SVC_ENCODE_PARAM,
+  ENCODER_OPTION_FRAME_RATE,
+  ENCODER_OPTION_iBitRate,
+  ENCODER_OPTION_INTER_SPATIAL_PRED,
+  ENCODER_OPTION_RC_MODE,
+  ENCODER_PADDING_PADDING,
 
-	ENCODER_LTR_RECOVERY_REQUEST,
-	ENCODER_LTR_MARKING_FEEDBACK,
-	ENCOCER_LTR_MARKING_PERIOD,
-	ENCODER_OPTION_LTR,
-			
-	ENCODER_OPTION_ENABLE_SSEI,               //disable SSEI: true--disable ssei; false--enable ssei
-	ENCODER_OPTION_ENABLE_PREFIX_NAL_ADDING,   //enable prefix: true--enable prefix; false--disable prefix
-	ENCODER_OPTION_ENABLE_SPS_PPS_ID_ADDITION, //disable pSps/pPps id addition: true--disable pSps/pPps id; false--enable pSps/pPps id addistion
+  ENCODER_LTR_RECOVERY_REQUEST,
+  ENCODER_LTR_MARKING_FEEDBACK,
+  ENCOCER_LTR_MARKING_PERIOD,
+  ENCODER_OPTION_LTR,
 
-	ENCODER_OPTION_CURRENT_PATH
+  ENCODER_OPTION_ENABLE_SSEI,               //disable SSEI: true--disable ssei; false--enable ssei
+  ENCODER_OPTION_ENABLE_PREFIX_NAL_ADDING,   //enable prefix: true--enable prefix; false--disable prefix
+  ENCODER_OPTION_ENABLE_SPS_PPS_ID_ADDITION, //disable pSps/pPps id addition: true--disable pSps/pPps id; false--enable pSps/pPps id addistion
+
+  ENCODER_OPTION_CURRENT_PATH
 } ENCODER_OPTION;
 
 /* Option types introduced in SVC decoder application */
-typedef enum
-{
-	DECODER_OPTION_DATAFORMAT = 0,	/* Set color space of decoding output frame */
-	DECODER_OPTION_TRUNCATED_MODE,	/* Used in decoding bitstream of non integrated frame, only truncated working mode is supported by tune, so skip it */
-	DECODER_OPTION_END_OF_STREAM,	/* Indicate bitstream of the final frame to be decoded */
-	DECODER_OPTION_VCL_NAL,        //feedback whether or not have VCL NAL in current AU for application layer
-	DECODER_OPTION_TEMPORAL_ID,      //feedback temporal id for application layer
-	DECODER_OPTION_MODE,             // indicates the decoding mode
-	DECODER_OPTION_OUTPUT_PROPERTY,
-	DECODER_OPTION_FRAME_NUM,	//feedback current decoded frame number
-	DECODER_OPTION_IDR_PIC_ID,	// feedback current frame belong to which IDR period
-	DECODER_OPTION_LTR_MARKING_FLAG,	// feedback wether current frame mark a LTR
-	DECODER_OPTION_LTR_MARKED_FRAME_NUM,	// feedback frame num marked by current Frame
-	DECODER_OPTION_DEVICE_INFO,
+typedef enum {
+  DECODER_OPTION_DATAFORMAT = 0,	/* Set color space of decoding output frame */
+  DECODER_OPTION_TRUNCATED_MODE,	/* Used in decoding bitstream of non integrated frame, only truncated working mode is supported by tune, so skip it */
+  DECODER_OPTION_END_OF_STREAM,	/* Indicate bitstream of the final frame to be decoded */
+  DECODER_OPTION_VCL_NAL,        //feedback whether or not have VCL NAL in current AU for application layer
+  DECODER_OPTION_TEMPORAL_ID,      //feedback temporal id for application layer
+  DECODER_OPTION_MODE,             // indicates the decoding mode
+  DECODER_OPTION_OUTPUT_PROPERTY,
+  DECODER_OPTION_FRAME_NUM,	//feedback current decoded frame number
+  DECODER_OPTION_IDR_PIC_ID,	// feedback current frame belong to which IDR period
+  DECODER_OPTION_LTR_MARKING_FLAG,	// feedback wether current frame mark a LTR
+  DECODER_OPTION_LTR_MARKED_FRAME_NUM,	// feedback frame num marked by current Frame
+  DECODER_OPTION_DEVICE_INFO,
 
 } DECODER_OPTION;
-typedef enum //feedback that whether or not have VCL NAL in current AU
-{
-	FEEDBACK_NON_VCL_NAL = 0,
-	FEEDBACK_VCL_NAL,
-	FEEDBACK_UNKNOWN_NAL	
+typedef enum { //feedback that whether or not have VCL NAL in current AU
+  FEEDBACK_NON_VCL_NAL = 0,
+  FEEDBACK_VCL_NAL,
+  FEEDBACK_UNKNOWN_NAL
 } FEEDBACK_VCL_NAL_IN_AU;
-typedef enum //feedback the iTemporalId in current AU if have VCL NAL
-{
-	FEEDBACK_TEMPORAL_ID_0 = 0,
-	FEEDBACK_TEMPORAL_ID_1,
-	FEEDBACK_TEMPORAL_ID_2,
-	FEEDBACK_TEMPORAL_ID_3,
-	FEEDBACK_TEMPORAL_ID_4,
-	FEEDBACK_UNKNOWN_TEMPORAL_ID	
+typedef enum { //feedback the iTemporalId in current AU if have VCL NAL
+  FEEDBACK_TEMPORAL_ID_0 = 0,
+  FEEDBACK_TEMPORAL_ID_1,
+  FEEDBACK_TEMPORAL_ID_2,
+  FEEDBACK_TEMPORAL_ID_3,
+  FEEDBACK_TEMPORAL_ID_4,
+  FEEDBACK_UNKNOWN_TEMPORAL_ID
 } FEEDBACK_TEMPORAL_ID;
 
 /* Type of layer being encoded */
-typedef enum
-{
-	NON_VIDEO_CODING_LAYER = 0,
-	    VIDEO_CODING_LAYER = 1
+typedef enum {
+  NON_VIDEO_CODING_LAYER = 0,
+  VIDEO_CODING_LAYER = 1
 } LAYER_TYPE;
 
 /* SVC Encoder/Decoder Initializing Parameter Types */
-typedef enum
-{
-	INIT_TYPE_PARAMETER_BASED = 0,	// For SVC DEMO Application
-	INIT_TYPE_CONFIG_BASED,			// For SVC CONSOLE Application
-}INIT_TYPE;
+typedef enum {
+  INIT_TYPE_PARAMETER_BASED = 0,	// For SVC DEMO Application
+  INIT_TYPE_CONFIG_BASED,			// For SVC CONSOLE Application
+} INIT_TYPE;
 
 //enumerate the type of video bitstream which is provided to decoder
-typedef enum
-{
-	VIDEO_BITSTREAM_AVC               = 0,	
-	VIDEO_BITSTREAM_SVC               = 1,
-	VIDEO_BITSTREAM_DEFAULT           = VIDEO_BITSTREAM_SVC,
-}VIDEO_BITSTREAM_TYPE;
+typedef enum {
+  VIDEO_BITSTREAM_AVC               = 0,
+  VIDEO_BITSTREAM_SVC               = 1,
+  VIDEO_BITSTREAM_DEFAULT           = VIDEO_BITSTREAM_SVC,
+} VIDEO_BITSTREAM_TYPE;
 
-typedef enum
-{
-	NO_RECOVERY_REQUSET  = 0,
-	LTR_RECOVERY_REQUEST = 1,
-	IDR_RECOVERY_REQUEST = 2,
-	NO_LTR_MARKING_FEEDBACK =3,
-	LTR_MARKING_SUCCESS = 4,
-	LTR_MARKING_FAILED = 5,
-}KEY_FRAME_REQUEST_TYPE;
+typedef enum {
+  NO_RECOVERY_REQUSET  = 0,
+  LTR_RECOVERY_REQUEST = 1,
+  IDR_RECOVERY_REQUEST = 2,
+  NO_LTR_MARKING_FEEDBACK = 3,
+  LTR_MARKING_SUCCESS = 4,
+  LTR_MARKING_FAILED = 5,
+} KEY_FRAME_REQUEST_TYPE;
 
-typedef struct
-{
-	unsigned int uiFeedbackType; //IDR request or LTR recovery request
-	unsigned int uiIDRPicId; // distinguish request from different IDR
-	int		  iLastCorrectFrameNum;
-	int		  iCurrentFrameNum; //specify current decoder frame_num.
-}SLTRRecoverRequest;
+typedef struct {
+  unsigned int uiFeedbackType; //IDR request or LTR recovery request
+  unsigned int uiIDRPicId; // distinguish request from different IDR
+  int		  iLastCorrectFrameNum;
+  int		  iCurrentFrameNum; //specify current decoder frame_num.
+} SLTRRecoverRequest;
 
-typedef struct
-{
-	unsigned int  uiFeedbackType; //mark failed or successful
-	unsigned int  uiIDRPicId; // distinguish request from different IDR
-	int			  iLTRFrameNum; //specify current decoder frame_num
-}SLTRMarkingFeedback;
+typedef struct {
+  unsigned int  uiFeedbackType; //mark failed or successful
+  unsigned int  uiIDRPicId; // distinguish request from different IDR
+  int			  iLTRFrameNum; //specify current decoder frame_num
+} SLTRMarkingFeedback;
 #pragma pack(1)
 
-typedef struct 
-{
-	
-	//# 0 SM_SINGLE_SLICE			| SliceNum==1
-	//# 1 SM_FIXEDSLCNUM_SLICE	| according to SliceNum			| Enabled dynamic slicing for multi-thread
-	//# 2 SM_RASTER_SLICE			| according to SlicesAssign		| Need input of MB numbers each slice. In addition, if other constraint in SSliceArgument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
-	//# 3 SM_ROWMB_SLICE			| according to PictureMBHeight	|  Typical of single row of mbs each slice?+ slice size constraint which including re-encoding
-	//# 4 SM_DYN_SLICE			| according to SliceSize		| Dynamic slicing (have no idea about slice_nums until encoding current frame)
-	unsigned int uiSliceMode; //by default, uiSliceMode will be 0
-	struct {
-		unsigned int		uiSliceMbNum[MAX_SLICES_NUM_TMP];  //here we use a tmp fixed value since MAX_SLICES_NUM is not defined here and its definition may be changed; 
-		unsigned int		uiSliceNum;
-		unsigned int		uiSliceSizeConstraint;
-	} sSliceArgument;//not all the elements in this argument will be used, how it will be used depends on uiSliceMode; see below	
+typedef struct {
+
+  //# 0 SM_SINGLE_SLICE			| SliceNum==1
+  //# 1 SM_FIXEDSLCNUM_SLICE	| according to SliceNum			| Enabled dynamic slicing for multi-thread
+  //# 2 SM_RASTER_SLICE			| according to SlicesAssign		| Need input of MB numbers each slice. In addition, if other constraint in SSliceArgument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
+  //# 3 SM_ROWMB_SLICE			| according to PictureMBHeight	|  Typical of single row of mbs each slice?+ slice size constraint which including re-encoding
+  //# 4 SM_DYN_SLICE			| according to SliceSize		| Dynamic slicing (have no idea about slice_nums until encoding current frame)
+  unsigned int uiSliceMode; //by default, uiSliceMode will be 0
+  struct {
+    unsigned int
+    uiSliceMbNum[MAX_SLICES_NUM_TMP];  //here we use a tmp fixed value since MAX_SLICES_NUM is not defined here and its definition may be changed;
+    unsigned int		uiSliceNum;
+    unsigned int		uiSliceSizeConstraint;
+  } sSliceArgument;//not all the elements in this argument will be used, how it will be used depends on uiSliceMode; see below
 } SSliceConfig;
 
 typedef struct {
-	int	iVideoWidth;		// video size in cx specified for a layer
-	int	iVideoHeight;		// video size in cy specified for a layer
-	float	fFrameRate;		// frame rate specified for a layer
-	int	iQualityLayerNum;	// layer number at quality level
-	int	iSpatialBitrate;	// target bitrate for a spatial layer
-	int	iCgsSnrRefined;	// 0: SNR layers all MGS; 1: SNR layers all CGS
-	int	iInterSpatialLayerPredFlag;	// 0: diabled [independency spatial layer coding]; 1: enabled [base spatial layer dependency coding]
+  int	iVideoWidth;		// video size in cx specified for a layer
+  int	iVideoHeight;		// video size in cy specified for a layer
+  float	fFrameRate;		// frame rate specified for a layer
+  int	iQualityLayerNum;	// layer number at quality level
+  int	iSpatialBitrate;	// target bitrate for a spatial layer
+  int	iCgsSnrRefined;	// 0: SNR layers all MGS; 1: SNR layers all CGS
+  int	iInterSpatialLayerPredFlag;	// 0: diabled [independency spatial layer coding]; 1: enabled [base spatial layer dependency coding]
 
-	int	iQualityBitrate[MAX_QUALITY_LAYER_NUM];	// target bitrate for a quality layer
-	
-	SSliceConfig sSliceCfg;
+  int	iQualityBitrate[MAX_QUALITY_LAYER_NUM];	// target bitrate for a quality layer
+
+  SSliceConfig sSliceCfg;
 } SSpatialLayerConfig;
 
 /* SVC Encoding Parameters */
 typedef struct {
-	int		iPicWidth;			// width of picture in samples
-	int		iPicHeight;			// height of picture in samples
-	int		iTargetBitrate;		// target bitrate desired
-	int		iTemporalLayerNum;	// layer number at temporal level
-	int		iSpatialLayerNum;	// layer number at spatial level
+  int		iPicWidth;			// width of picture in samples
+  int		iPicHeight;			// height of picture in samples
+  int		iTargetBitrate;		// target bitrate desired
+  int		iTemporalLayerNum;	// layer number at temporal level
+  int		iSpatialLayerNum;	// layer number at spatial level
 
-	float	fFrameRate;			// input maximal frame rate
-	
-	int		iInputCsp;			// color space of input sequence
-	int		iKeyPicCodingMode;// mode of key picture coding
-	int		iIntraPeriod;		// period of Intra frame
-	bool    bEnableSpsPpsIdAddition;
-	bool    bPrefixNalAddingCtrl;
-	bool   	bEnableDenoise;	    // denoise control
-	bool    bEnableBackgroundDetection; 	// background detection control //VAA_BACKGROUND_DETECTION //BGD cmd
-	bool    bEnableAdaptiveQuant; // adaptive quantization control
-	bool	bEnableCropPic;	// enable cropping source picture.  8/25/2010
-								// FALSE: Streaming Video Sharing; TRUE: Video Conferencing Meeting;
-	bool     bEnableLongTermReference; // 0: on, 1: off
-	int     iLtrMarkPeriod;
+  float	fFrameRate;			// input maximal frame rate
 
-	int iRCMode;                 // RC mode
-	int	iTemporalBitrate[MAX_TEMPORAL_LAYER_NUM];	// target bitrate specified for a temporal level
-	int iPaddingFlag;            // 0:disable padding;1:padding	
+  int		iInputCsp;			// color space of input sequence
+  int		iKeyPicCodingMode;// mode of key picture coding
+  int		iIntraPeriod;		// period of Intra frame
+  bool    bEnableSpsPpsIdAddition;
+  bool    bPrefixNalAddingCtrl;
+  bool   	bEnableDenoise;	    // denoise control
+  bool    bEnableBackgroundDetection; 	// background detection control //VAA_BACKGROUND_DETECTION //BGD cmd
+  bool    bEnableAdaptiveQuant; // adaptive quantization control
+  bool	bEnableCropPic;	// enable cropping source picture.  8/25/2010
+  // FALSE: Streaming Video Sharing; TRUE: Video Conferencing Meeting;
+  bool     bEnableLongTermReference; // 0: off, 1: on
+  int     iLtrMarkPeriod;
 
-	SSpatialLayerConfig sSpatialLayers[MAX_SPATIAL_LAYER_NUM];
-	
+  int iRCMode;                 // RC mode
+  int	iTemporalBitrate[MAX_TEMPORAL_LAYER_NUM];	// target bitrate specified for a temporal level
+  int iPaddingFlag;            // 0:disable padding;1:padding
+
+  SSpatialLayerConfig sSpatialLayers[MAX_SPATIAL_LAYER_NUM];
+
 } SVCEncodingParam, *PSVCEncodingParam;
 
 //Define a new struct to show the property of video bitstream.
 typedef struct {
-	unsigned int          size; //size of the struct
-	VIDEO_BITSTREAM_TYPE  eVideoBsType;
+  unsigned int          size; //size of the struct
+  VIDEO_BITSTREAM_TYPE  eVideoBsType;
 } SVideoProperty;
 
 /* SVC Decoding Parameters, reserved here and potential applicable in the future */
-typedef struct TagSVCDecodingParam{
-	char		*pFileNameRestructed;	// File name of restructed frame used for PSNR calculation based debug
-	
-	int				iOutputColorFormat;	// color space format to be outputed, EVideoFormatType specified in codec_def.h
-	unsigned int	uiCpuLoad;		// CPU load
-	unsigned char	uiTargetDqLayer;	// Setting target dq layer id
+typedef struct TagSVCDecodingParam {
+  char*		pFileNameRestructed;	// File name of reconstructed frame, used for PSNR-calculation-based debugging
 
-	unsigned char	uiEcActiveFlag;		// Whether active error concealment feature in decoder
+  int				iOutputColorFormat;	// color space format of the output, an EVideoFormatType value specified in codec_def.h
+  unsigned int	uiCpuLoad;		// CPU load
+  unsigned char	uiTargetDqLayer;	// Setting target dq layer id
 
-	SVideoProperty   sVideoProperty;
+  unsigned char	uiEcActiveFlag;		// Whether the error concealment feature is active in the decoder
+
+  SVideoProperty   sVideoProperty;
 } SDecodingParam, *PDecodingParam;
 
 /* Bitstream inforamtion of a layer being encoded */
 typedef struct {
-	unsigned char uiTemporalId;
-	unsigned char uiSpatialId;
-	unsigned char uiQualityId;
+  unsigned char uiTemporalId;
+  unsigned char uiSpatialId;
+  unsigned char uiQualityId;
 
-	unsigned char uiPriorityId; //ignore it currently
+  unsigned char uiPriorityId; //ignore it currently
 
-	unsigned char uiLayerType;
+  unsigned char uiLayerType;
 
-	int	iNalCount;					// Count number of NAL coded already
-	int	iNalLengthInByte[MAX_NAL_UNITS_IN_LAYER];	// Length of NAL size in byte from 0 to iNalCount-1
-	unsigned char*	pBsBuf;		// Buffer of bitstream contained
+  int	iNalCount;					// Count number of NAL coded already
+  int	iNalLengthInByte[MAX_NAL_UNITS_IN_LAYER];	// Length of NAL size in byte from 0 to iNalCount-1
+  unsigned char*	pBsBuf;		// Buffer of bitstream contained
 } SLayerBSInfo, *PLayerBSInfo;
 
 
 typedef struct {
-	int		iTemporalId;	// Temporal ID
-	unsigned char	uiFrameType;
+  int		iTemporalId;	// Temporal ID
+  unsigned char	uiFrameType;
 
-	int		iLayerNum;
-	SLayerBSInfo	sLayerInfo[MAX_LAYER_NUM_OF_FRAME];
+  int		iLayerNum;
+  SLayerBSInfo	sLayerInfo[MAX_LAYER_NUM_OF_FRAME];
 
 } SFrameBSInfo, *PFrameBSInfo;
 
-typedef struct Source_Picture_s {	
-	int		    iColorFormat;	// color space type
-	int  		iStride[4];		// stride for each plane pData
-	unsigned char  *pData[4];		// plane pData
-	int  		iPicWidth;				// luma picture width in x coordinate
-	int 		iPicHeight;				// luma picture height in y coordinate
+typedef struct Source_Picture_s {
+  int		    iColorFormat;	// color space type
+  int  		iStride[4];		// stride for each plane pData
+  unsigned char*  pData[4];		// plane pData
+  int  		iPicWidth;				// luma picture width in x coordinate
+  int 		iPicHeight;				// luma picture height in y coordinate
 } SSourcePicture;
 
 
--- a/codec/api/svc/codec_def.h
+++ b/codec/api/svc/codec_def.h
@@ -37,75 +37,70 @@
 #pragma once
 #endif//WIN32
 
-typedef enum
-{
-	/*rgb color formats*/
-	videoFormatRGB        = 1,
-	videoFormatRGBA       = 2, 
-	videoFormatRGB555     = 3,
-	videoFormatRGB565     = 4,
-	videoFormatBGR        = 5,
-	videoFormatBGRA       = 6,
-	videoFormatABGR       = 7,
-	videoFormatARGB       = 8,
+typedef enum {
+  /*rgb color formats*/
+  videoFormatRGB        = 1,
+  videoFormatRGBA       = 2,
+  videoFormatRGB555     = 3,
+  videoFormatRGB565     = 4,
+  videoFormatBGR        = 5,
+  videoFormatBGRA       = 6,
+  videoFormatABGR       = 7,
+  videoFormatARGB       = 8,
 
-	/*yuv color formats*/
-	videoFormatYUY2       = 20,
-	videoFormatYVYU       = 21,
-	videoFormatUYVY       = 22,
-	videoFormatI420       = 23,                        //same as IYUV
-	videoFormatYV12       = 24,
-	videoFormatInternal   = 25,                        // Only Used for SVC decoder testbed
-	
-	videoFormatNV12		  = 26,						// new format for output by DXVA decoding
-	
-	videoFormatVFlip      = 0x80000000
-}EVideoFormatType;
+  /*yuv color formats*/
+  videoFormatYUY2       = 20,
+  videoFormatYVYU       = 21,
+  videoFormatUYVY       = 22,
+  videoFormatI420       = 23,                        //same as IYUV
+  videoFormatYV12       = 24,
+  videoFormatInternal   = 25,                        // Only Used for SVC decoder testbed
 
-typedef enum
-{
-	videoFrameTypeInvalid,		/* Encoder not ready or parameters are invalidate */
-	videoFrameTypeIDR,		/* This type is only available for H264 if this frame is key frame, then return this type */
-	videoFrameTypeI,		/* I frame type */
-	videoFrameTypeP,		/* P frame type */
-	videoFrameTypeSkip,		/* Skip the frame based encoder kernel */
-	videoFrameTypeIPMixed,		/* Frame type introduced I and P slices are mixing */
-}EVideoFrameType;
+  videoFormatNV12		  = 26,						// new format for output by DXVA decoding
 
-typedef enum
-{
-	cmResultSuccess,
-	cmInitParaError,                  /*Parameters are invalid */
-	cmMachPerfIsBad,                  /*The performance of machine is not enough to support 
-									    H264 CODEC, in this case, suggestion user use h263 
+  videoFormatVFlip      = 0x80000000
+} EVideoFormatType;
+
+typedef enum {
+  videoFrameTypeInvalid,		/* Encoder not ready or parameters are invalidate */
+  videoFrameTypeIDR,		/* This type is only available for H264 if this frame is key frame, then return this type */
+  videoFrameTypeI,		/* I frame type */
+  videoFrameTypeP,		/* P frame type */
+  videoFrameTypeSkip,		/* Skip the frame based encoder kernel */
+  videoFrameTypeIPMixed,		/* Frame type introduced I and P slices are mixing */
+} EVideoFrameType;
+
+typedef enum {
+  cmResultSuccess,
+  cmInitParaError,                  /*Parameters are invalid */
+  cmMachPerfIsBad,                  /*The performance of machine is not enough to support
+									    H264 CODEC, in this case, suggestion user use h263
 										or set fps to low like 5fps or more low*/
-	cmUnkonwReason,
-	cmMallocMemeError,                /*Malloc a memory error*/
-	cmInitExpected,			  /*Initial action is expected*/
-}CM_RETURN;
+  cmUnkonwReason,
+  cmMallocMemeError,                /*Malloc a memory error*/
+  cmInitExpected,			  /*Initial action is expected*/
+} CM_RETURN;
 
 
 /* nal unit type */
-enum ENalUnitType
-{
-    NAL_UNKNOWN = 0,
-	NAL_SLICE   = 1,
-	NAL_SLICE_DPA   = 2,
-	NAL_SLICE_DPB   = 3,
-	NAL_SLICE_DPC   = 4,
-	NAL_SLICE_IDR   = 5,    /* ref_idc != 0 */
-	NAL_SEI         = 6,    /* ref_idc == 0 */
-	NAL_SPS         = 7,
-	NAL_PPS         = 8
-	/* ref_idc == 0 for 6,9,10,11,12 */
+enum ENalUnitType {
+  NAL_UNKNOWN = 0,
+  NAL_SLICE   = 1,
+  NAL_SLICE_DPA   = 2,
+  NAL_SLICE_DPB   = 3,
+  NAL_SLICE_DPC   = 4,
+  NAL_SLICE_IDR   = 5,    /* ref_idc != 0 */
+  NAL_SEI         = 6,    /* ref_idc == 0 */
+  NAL_SPS         = 7,
+  NAL_PPS         = 8
+                    /* ref_idc == 0 for 6,9,10,11,12 */
 };
 /* NRI: eNalRefIdc */
-enum ENalPriority
-{
-    NAL_PRIORITY_DISPOSABLE = 0,
-	NAL_PRIORITY_LOW        = 1,
-	NAL_PRIORITY_HIGH       = 2,
-	NAL_PRIORITY_HIGHEST    = 3,
+enum ENalPriority {
+  NAL_PRIORITY_DISPOSABLE = 0,
+  NAL_PRIORITY_LOW        = 1,
+  NAL_PRIORITY_HIGH       = 2,
+  NAL_PRIORITY_HIGHEST    = 3,
 };
 
 #define IS_PARAMETER_SET_NAL(eNalRefIdc, eNalType) \
@@ -121,31 +116,31 @@
 
 /* Error Tools definition */
 typedef unsigned short ERR_TOOL;
-enum{
-	ET_NONE = 0x00,					// NONE Error Tools
-	ET_IP_SCALE = 0x01,				// IP Scalable
-	ET_FMO = 0x02,					// Flexible Macroblock Ordering
-	ET_IR_R1 = 0x04,				// Intra Refresh in predifined 2% MB
-	ET_IR_R2 = 0x08,				// Intra Refresh in predifined 5% MB
-	ET_IR_R3 = 0x10,				// Intra Refresh in predifined 10% MB
-	ET_FEC_HALF = 0x20,				// Forward Error Correction in 50% redundency mode
-	ET_FEC_FULL	= 0x40,				// Forward Error Correction in 100% redundency mode
-	ET_RFS = 0x80,					// Reference Frame Selection
+enum {
+  ET_NONE = 0x00,					// NONE Error Tools
+  ET_IP_SCALE = 0x01,				// IP Scalable
+  ET_FMO = 0x02,					// Flexible Macroblock Ordering
+  ET_IR_R1 = 0x04,				// Intra Refresh in predefined 2% MB
+  ET_IR_R2 = 0x08,				// Intra Refresh in predefined 5% MB
+  ET_IR_R3 = 0x10,				// Intra Refresh in predefined 10% MB
+  ET_FEC_HALF = 0x20,				// Forward Error Correction in 50% redundancy mode
+  ET_FEC_FULL	= 0x40,				// Forward Error Correction in 100% redundancy mode
+  ET_RFS = 0x80,					// Reference Frame Selection
 };
 
 /* information of coded Slice(=NAL)(s) */
-typedef struct SliceInformation
-{
-	unsigned char*	pBufferOfSlices;		// base buffer of coded slice(s)
-	int				iCodedSliceCount;	// number of coded slices
-	unsigned int*	pLengthOfSlices;		// array of slices length accordingly by number of slice
-	int				iFecType;			// FEC type[0, 50%FEC, 100%FEC]
-	unsigned char	uiSliceIdx;		// index of slice in frame [FMO: 0,..,uiSliceCount-1; No FMO: 0] 
-	unsigned char	uiSliceCount;		// count number of slice in frame [FMO: 2-8; No FMO: 1]
-	char			iFrameIndex;		// index of frame[-1, .., idr_interval-1]
-	unsigned char	uiNalRefIdc;		// NRI, priority level of slice(NAL)
-	unsigned char	uiNalType;			// NAL type
-	unsigned char	uiContainingFinalNal;	// whether final NAL is involved in buffer of coded slices, flag used in Pause feature in T27
+typedef struct SliceInformation {
+  unsigned char*	pBufferOfSlices;		// base buffer of coded slice(s)
+  int				iCodedSliceCount;	// number of coded slices
+  unsigned int*	pLengthOfSlices;		// array of slices length accordingly by number of slice
+  int				iFecType;			// FEC type[0, 50%FEC, 100%FEC]
+  unsigned char	uiSliceIdx;		// index of slice in frame [FMO: 0,..,uiSliceCount-1; No FMO: 0]
+  unsigned char	uiSliceCount;		// count number of slice in frame [FMO: 2-8; No FMO: 1]
+  char			iFrameIndex;		// index of frame[-1, .., idr_interval-1]
+  unsigned char	uiNalRefIdc;		// NRI, priority level of slice(NAL)
+  unsigned char	uiNalType;			// NAL type
+  unsigned char
+  uiContainingFinalNal;	// whether final NAL is involved in buffer of coded slices, flag used in Pause feature in T27
 } SliceInfo, *PSliceInfo;
 
 
@@ -161,90 +156,84 @@
 
 /* thresholds of the initial, maximal and minimal rate */
 typedef struct {
-	int	iWidth;			// frame width
-	int	iHeight;			// frame height
-	int	iThresholdOfInitRate;	// threshold of initial rate
-	int	iThresholdOfMaxRate;	// threshold of maximal rate
-	int	iThresholdOfMinRate;	// threshold of minimal rate
-	int iMinThresholdFrameRate;		//min frame rate min
-	int	iSkipFrameRate;	//skip to frame rate min
-	int iSkipFrameStep;	//how many frames to skip
-}SRateThresholds, *PRateThresholds;
+  int	iWidth;			// frame width
+  int	iHeight;			// frame height
+  int	iThresholdOfInitRate;	// threshold of initial rate
+  int	iThresholdOfMaxRate;	// threshold of maximal rate
+  int	iThresholdOfMinRate;	// threshold of minimal rate
+  int iMinThresholdFrameRate;		//min frame rate min
+  int	iSkipFrameRate;	//skip to frame rate min
+  int iSkipFrameStep;	//how many frames to skip
+} SRateThresholds, *PRateThresholds;
 
 /*new interface*/
-typedef struct WelsDeviceInfo
-{
-	int  bSupport;          /* a logic flag provided by decoder which indicates whether GPU decoder can work based on the following device info. */
-	char Vendor[128];   // vendor name
-	char Device[128];    // device name
-	char Driver[128];     // driver version
-	char DriverDate[128]; //  driver release date 
+typedef struct WelsDeviceInfo {
+  int  bSupport;          /* a logic flag provided by decoder which indicates whether GPU decoder can work based on the following device info. */
+  char Vendor[128];   // vendor name
+  char Device[128];    // device name
+  char Driver[128];     // driver version
+  char DriverDate[128]; //  driver release date
 } Device_Info;
 
-typedef enum TagBufferProperty
-{
-	BUFFER_HOST	   = 0,   // host memory
-	BUFFER_DEVICE  = 1,	  // device memory including surface and shared handle
-						  // for DXVA: shared handle
-						  // for VDA : iosurface
-						
-	//SURFACE_DEVICE ,	 // surface
-	//SHARED_HANDLE      // shared handle
-}EBufferProperty;
+typedef enum TagBufferProperty {
+  BUFFER_HOST	   = 0,   // host memory
+  BUFFER_DEVICE  = 1,	  // device memory including surface and shared handle
+  // for DXVA: shared handle
+  // for VDA : iosurface
 
-typedef enum TagDecodeMode
-{
-	AUTO_MODE = 0,   // decided by decoder itself, dynamic mode switch, delayed switch
-	SW_MODE = 1,		// decoded by CPU, instant switch
-	GPU_MODE = 2,	// decoded by GPU, instant switch 
-	SWITCH_MODE =3	// switch to the other mode, forced mode switch, delayed switch
-}EDecodeMode;
+  //SURFACE_DEVICE ,	 // surface
+  //SHARED_HANDLE      // shared handle
+} EBufferProperty;
 
-typedef struct TagSysMemBuffer
-{	
-	int	iWidth;			//width of decoded pic for display
-	int iHeight;			//height of decoded pic for display
-	int iFormat; 		// type is "EVideoFormatType"
-	int iStride[2];		//stride of 2 component	
-}SSysMEMBuffer;
+typedef enum TagDecodeMode {
+  AUTO_MODE = 0,   // decided by decoder itself, dynamic mode switch, delayed switch
+  SW_MODE = 1,		// decoded by CPU, instant switch
+  GPU_MODE = 2,	// decoded by GPU, instant switch
+  SWITCH_MODE = 3	// switch to the other mode, forced mode switch, delayed switch
+} EDecodeMode;
 
-typedef struct TagVideoMemBuffer
-{
-	int iSurfaceWidth;   // used for surface create
-	int iSurfaceHeight;
-	int D3Dformat;  //type is "D3DFORMAT"
-  	int D3DPool; // type is "D3DPOOL";
-	int iLeftTopX;
-	int iLeftTopY;
-	int iRightBottomX;
-	int iRightBottomY;
-}SVideoMemBuffer;
+typedef struct TagSysMemBuffer {
+  int	iWidth;			//width of decoded pic for display
+  int iHeight;			//height of decoded pic for display
+  int iFormat; 		// type is "EVideoFormatType"
+  int iStride[2];		//stride of 2 component
+} SSysMEMBuffer;
 
-typedef struct TagBufferInfo
-{
-	EBufferProperty eBufferProperty;	//0: host memory; 1: device memory;
-	int iBufferStatus;  // 0: one frame data is not ready; 1: one frame data is ready
-	EDecodeMode eWorkMode;				//indicate what the real working mode in decoder
-	union {
-		SSysMEMBuffer sSystemBuffer;
-		SVideoMemBuffer sVideoBuffer;
-	}UsrData;	
-}SBufferInfo;
+typedef struct TagVideoMemBuffer {
+  int iSurfaceWidth;   // used for surface create
+  int iSurfaceHeight;
+  int D3Dformat;  //type is "D3DFORMAT"
+  int D3DPool; // type is "D3DPOOL";
+  int iLeftTopX;
+  int iLeftTopY;
+  int iRightBottomX;
+  int iRightBottomY;
+} SVideoMemBuffer;
 
+typedef struct TagBufferInfo {
+  EBufferProperty eBufferProperty;	//0: host memory; 1: device memory;
+  int iBufferStatus;  // 0: one frame data is not ready; 1: one frame data is ready
+  EDecodeMode eWorkMode;				//indicate what the real working mode in decoder
+  union {
+    SSysMEMBuffer sSystemBuffer;
+    SVideoMemBuffer sVideoBuffer;
+  } UsrData;
+} SBufferInfo;
+
 /* Constants related to transmission rate at various resolutions */
 static const SRateThresholds ksRateThrMap[4] = {
-	// initial-maximal-minimal
-	{CIF_WIDTH, CIF_HEIGHT, 225000, 384000, 96000, 3, 1, 1},		// CIF
-	{QVGA_WIDTH, QVGA_HEIGHT, 192000, 320000, 80000, -1, -1, -1},	// QVGA
-	{QCIF_WIDTH, QCIF_HEIGHT, 150000, 256000, 64000, 8, 4, 2},		// QCIF
-	{SQCIF_WIDTH, SQCIF_HEIGHT, 120000, 192000, 48000, 5, 3, 1}	// SQCIF
+  // initial-maximal-minimal
+  {CIF_WIDTH, CIF_HEIGHT, 225000, 384000, 96000, 3, 1, 1},		// CIF
+  {QVGA_WIDTH, QVGA_HEIGHT, 192000, 320000, 80000, -1, -1, -1},	// QVGA
+  {QCIF_WIDTH, QCIF_HEIGHT, 150000, 256000, 64000, 8, 4, 2},		// QCIF
+  {SQCIF_WIDTH, SQCIF_HEIGHT, 120000, 192000, 48000, 5, 3, 1}	// SQCIF
 };
 
 
-// In a GOP, multiple of the key frame number, derived from 
+// In a GOP, multiple of the key frame number, derived from
 // the number of layers(index or array below)
 static const char kiKeyNumMultiple[] = {
-	1, 1, 2, 4, 8, 16,
+  1, 1, 2, 4, 8, 16,
 };
 
 #pragma pack()
--- a/codec/build/linux/dec/makefile
+++ b/codec/build/linux/dec/makefile
@@ -7,11 +7,12 @@
 
 BINDIR= 	../bin
 OUTDIR= 	../../../../bin/linux
-INCLUDE= 	-I../../../api/svc -I../../../decoder/core/inc -I../../../decoder/plus/inc -I../../../console/dec/inc
+INCLUDE= 	-I../../../api/svc -I../../../decoder/core/inc -I../../../decoder/plus/inc -I../../../console/dec/inc -I../../../common
 CORESRCDIR=	../../../decoder/core/src
 PLUSSRCDIR=	../../../decoder/plus/src
 ASMSRCDIR=	../../../decoder/core/asm
 MAINSRCDIR=	../../../console/dec/src
+COMMONSRCDIR=   ../../../common
 
 OBJMAINDIR= ../obj
 OBJDIR= ../obj/dec
@@ -24,7 +25,7 @@
 ASFLAGS= -f elf -DNOPREFIX -I ../../../decoder/core/asm/
 
 LIBS= -lstdc++ -ldl
-#-lm 
+#-lm
 CFLAGS=  $(INCLUDE) -fPIC -D__GCC__ -DLINUX -D__NO_CTYPE -DHAVE_CACHE_LINE_ALIGN
 
 ifeq ($(DBG),1)
@@ -63,7 +64,8 @@
 $(CORESRCDIR)/decoder_core.cpp \
 $(CORESRCDIR)/utils.cpp \
 $(PLUSSRCDIR)/welsDecoderExt.cpp \
-$(PLUSSRCDIR)/welsCodecTrace.cpp
+$(PLUSSRCDIR)/welsCodecTrace.cpp \
+$(COMMONSRCDIR)/logging.cpp
 
 ASMSRC= $(ASMSRCDIR)/block_add.asm \
 $(ASMSRCDIR)/cpuid.asm \
@@ -76,7 +78,7 @@
 $(ASMSRCDIR)/mc_luma.asm \
 $(ASMSRCDIR)/memzero.asm \
 $(ASMSRCDIR)/asm_inc.asm \
- 
+
 MAINSRC= $(MAINSRCDIR)/d3d9_utils.cpp \
 $(MAINSRCDIR)/h264dec.cpp \
 $(MAINSRCDIR)/read_config.cpp
@@ -103,7 +105,8 @@
 $(OBJDIR)/decoder_core.o \
 $(OBJDIR)/utils.o \
 $(OBJDIR)/welsDecoderExt.o \
-$(OBJDIR)/welsCodecTrace.o
+$(OBJDIR)/welsCodecTrace.o \
+$(OBJDIR)/logging.o
 
 ifeq ($(NASM), 1)
 OBJDEC+=$(OBJDIR)/block_add.o \
@@ -116,7 +119,7 @@
 $(OBJDIR)/mb_copy.o \
 $(OBJDIR)/mc_luma.o \
 $(OBJDIR)/memzero.o \
-$(OBJDIR)/asm_inc.o 
+$(OBJDIR)/asm_inc.o
 endif
 
 OBJBIN=	$(OBJDIR)/d3d9_utils.o \
@@ -131,7 +134,7 @@
 
 dependencies:
 	@echo "" >dependencies
-	
+
 checkdir:
 	@echo 'checkdir..'
 	@if test ! -d $(BINDIR) ; \
@@ -151,7 +154,7 @@
 		mkdir -p $(OBJDIR) ; \
 	fi
 	@echo
-	
+
 release:
 	@echo 'release..'
 	@echo 'cp -f $(SHAREDLIB) $(OUTDIR)'
@@ -166,14 +169,14 @@
 	@rm -f $(OBJBIN)
 	@rm -f $(BINLIB)
 	@rm -f $(SHAREDLIB)
-	@rm -f $(BIN)    
+	@rm -f $(BIN)
 
 tags:
 	@echo update tag table
 	@etags $(CORESRCDIR)/*.c $(CORESRCDIR)/*.cpp $(PLUSSRCDIR)/*.cpp $(MAINSRCDIR)/*.cpp
-	
-	
-lib:   	$(OBJDEC) 
+
+
+lib:   	$(OBJDEC)
 	@echo '$(OBJDEC)'
 	@echo
 	@echo 'ar cr $(BINLIB) $(OBJDEC)'
@@ -194,15 +197,15 @@
 	@$(CXX)  -shared -Wl,-Bsymbolic -o $(SHAREDLIB) $(OBJDEC)  $(LIBS)
 	@echo '... done'
 	@echo
-	
 
+
 exe:	$(OBJBIN)
-	@echo	
+	@echo
 	@echo '$(OBJBIN)'
 	@echo
 	@echo '$(CXX) $(LIBS) $(OBJBIN) $(BINLIB) -o $(BIN)'
 	@echo 'creating binary "$(BIN)"'
-	@$(CXX) $(OBJBIN) $(BINLIB) -o $(BIN) $(LIBS) 
+	@$(CXX) $(OBJBIN) $(BINLIB) -o $(BIN) $(LIBS)
 	@echo '... done'
 	@echo
 
@@ -220,27 +223,31 @@
 
 $(OBJDIR)/%.o$(SUFFIX): $(CORESRCDIR)/%.c
 	@echo 'compiling object file "$@" ...'
-	@$(CC) -m32 -c $(CFLAGS) -o $@ $<		
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
 
 $(OBJDIR)/%.o$(SUFFIX): $(CORESRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
 	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
-		
+
 $(OBJDIR)/%.o$(SUFFIX): $(PLUSSRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
-	@$(CC) -m32 -c $(CFLAGS) -o $@ $<		
-	
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
+
 $(OBJDIR)/%.o$(SUFFIX): $(ASMSRCDIR)/%.asm
 	@echo 'compiling object file "$@" ...'
-	@$(AS) $(ASFLAGS) -o $@ $<	
+	@$(AS) $(ASFLAGS) -o $@ $<
 
 #$(OBJDIR)/%.o$(SUFFIX): $(ASMCOMDIR)/%.asm
 #	@echo 'compiling object file "$@" ...'
 #	@$(AS) $(ASFLAGS) -o $@ $<
-	
+
 $(OBJDIR)/%.o$(SUFFIX): $(MAINSRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
-	@$(CC) -m32 -c $(CFLAGS) -o $@ $<		
-	
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
+
+$(OBJDIR)/%.o$(SUFFIX): $(COMMONSRCDIR)/%.cpp
+	@echo 'compiling object file "$@" ...'
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
+
 include $(DEPEND)
 
--- a/codec/build/linux/enc/makefile
+++ b/codec/build/linux/enc/makefile
@@ -7,12 +7,13 @@
 
 OUTDIR=		../../../../bin/linux
 BINDIR= 	../bin
-INCLUDE=  -I../../../encoder/core/inc -I../../../encoder/plus/inc -I../../../api/svc -I../../../WelsThreadLib/api -I../../../console/enc/inc
+INCLUDE=  -I../../../encoder/core/inc -I../../../encoder/plus/inc -I../../../api/svc -I../../../WelsThreadLib/api -I../../../console/enc/inc -I../../../common
 THREADLIBSRCDIR=../../../WelsThreadLib/src
 CORESRCDIR=	../../../encoder/core/src
 PLUSSRCDIR=	../../../encoder/plus/src
 ASMSRCDIR=	../../../encoder/core/asm
 MAINSRCDIR=	../../../console/enc/src
+COMMONSRCDIR=   ../../../common
 
 OBJMAINDIR= ../obj
 OBJDIR= ../obj/enc
@@ -24,9 +25,9 @@
 
 ASFLAGS= -f elf -DNOPREFIX -I ../../../encoder/core/asm/
 
-LIBS= -lstdc++ -ldl -lpthread
-#-lm 
-CFLAGS=  $(INCLUDE) -m32 -fPIC -D__GCC__ -DLINUX -D__NO_CTYPE -DWELS_SVC -DENCODER_CORE -DHAVE_CACHE_LINE_ALIGN -DWELS_TESTBED -DMT_ENABLED 
+LIBS= -lstdc++ -ldl -lpthread -lm
+#-lm
+CFLAGS=  $(INCLUDE) -m32 -fPIC -D__GCC__ -DLINUX -D__NO_CTYPE -DWELS_SVC -DENCODER_CORE -DHAVE_CACHE_LINE_ALIGN -DWELS_TESTBED -DMT_ENABLED
 
 ifeq ($(DBG),1)
 #SUFFIX= .dbg
@@ -73,7 +74,8 @@
 $(CORESRCDIR)/utils.cpp \
 $(THREADLIBSRCDIR)/WelsThreadLib.cpp \
 $(PLUSSRCDIR)/welsEncoderExt.cpp \
-$(PLUSSRCDIR)/welsCodecTrace.cpp
+$(PLUSSRCDIR)/welsCodecTrace.cpp \
+$(COMMONSRCDIR)/logging.cpp
 
 ASMSRC=	$(ASMSRCDIR)/coeff.asm \
 $(ASMSRCDIR)/cpuid.asm \
@@ -129,7 +131,8 @@
 $(OBJDIR)/utils.o \
 $(OBJDIR)/WelsThreadLib.o \
 $(OBJDIR)/welsEncoderExt.o \
-$(OBJDIR)/welsCodecTrace.o
+$(OBJDIR)/welsCodecTrace.o \
+$(OBJDIR)/logging.o
 
 ifeq ($(NASM), 1)
 OBJENC += $(OBJDIR)/cpuid.o \
@@ -147,7 +150,7 @@
 $(OBJDIR)/satd_sad.o \
 $(OBJDIR)/score.o \
 $(OBJDIR)/asm_inc.o \
-$(OBJDIR)/vaa.o 
+$(OBJDIR)/vaa.o
 endif
 OBJBIN=	$(OBJDIR)/read_config.o \
 $(OBJDIR)/welsenc.o
@@ -160,7 +163,7 @@
 
 dependencies:
 	@echo "" >dependencies
-	
+
 checkdir:
 	@echo 'checkdir..'
 	@if test ! -d $(OUTDIR) ; \
@@ -192,9 +195,9 @@
 tags:
 	@echo update tag table
 	@etags $(THREADLIBSRCDIR)/*.cpp $(COMMSRCDIR)/*.cpp $(CORESRCDIR)/*.cpp $(PLUSSRCDIR)/*.cpp $(MAINSRCDIR)/*.cpp
-	
-	
-lib:   	$(OBJENC) 
+
+
+lib:   	$(OBJENC)
 	@echo '$(OBJENC)'
 	@echo
 	@echo 'ar cr $(BINLIB) $(OBJENC)'
@@ -215,7 +218,7 @@
 	@$(GCC)  -shared -Wl,-Bsymbolic -m32 -o $(SHAREDLIB) $(OBJENC)  $(LIBS)
 	@echo '... done'
 	@echo
-	
+
 release:
 	@echo 'release..'
 	@echo 'cp -f $(SHAREDLIB) $(OUTDIR)'
@@ -225,7 +228,7 @@
 	@echo
 
 exe:	$(OBJBIN)
-	@echo	
+	@echo
 	@echo '$(OBJBIN)'
 	@echo
 	@echo '$(GCC) $(LIBS) $(OBJBIN) $(BINLIB) -m32 -o $(BIN)'
@@ -248,23 +251,27 @@
 
 $(OBJDIR)/%.o$(SUFFIX): $(CORESRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
-	@$(CC) -m32 -c $(CFLAGS) -o $@ $<		
-	
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
+
 $(OBJDIR)/%.o$(SUFFIX): $(PLUSSRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
-	@$(CC) -m32 -c $(CFLAGS) -o $@ $<		
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
 
 $(OBJDIR)/%.o$(SUFFIX): $(ASMSRCDIR)/%.asm
 	@echo 'compiling object file "$@" ...'
-	@$(AS) $(ASFLAGS) -o $@ $<	
-	
+	@$(AS) $(ASFLAGS) -o $@ $<
+
 $(OBJDIR)/%.o$(SUFFIX): $(MAINSRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
-	@$(CC) -m32 -c $(CFLAGS) -o $@ $<	
-	
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
+
 $(OBJDIR)/%.o$(SUFFIX): $(MAINSRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
-	@$(CC) -m32 -c $(CFLAGS) -o $@ $<	
-	
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
+
+$(OBJDIR)/%.o$(SUFFIX): $(COMMONSRCDIR)/%.cpp
+	@echo 'compiling object file "$@" ...'
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
+
 include $(DEPEND)
 
--- a/codec/build/win32/dec/WelsDecPlus.vcproj
+++ b/codec/build/win32/dec/WelsDecPlus.vcproj
@@ -50,7 +50,7 @@
 				Name="VCCLCompilerTool"
 				Optimization="2"
 				InlineFunctionExpansion="1"
-				AdditionalIncludeDirectories="..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common\inc;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc"
+				AdditionalIncludeDirectories="..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc"
 				PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSDECPLUS_EXPORTS;HAVE_CACHE_LINE_ALIGN"
 				StringPooling="true"
 				RuntimeLibrary="2"
@@ -149,7 +149,7 @@
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="0"
-				AdditionalIncludeDirectories="..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common\inc;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc"
+				AdditionalIncludeDirectories="..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc"
 				PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSDECPLUS_EXPORTS;HAVE_CACHE_LINE_ALIGN"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
--- a/codec/build/win32/dec/WelsDecPlus_2010.vcxproj
+++ b/codec/build/win32/dec/WelsDecPlus_2010.vcxproj
@@ -67,7 +67,7 @@
     <ClCompile>
       <Optimization>MaxSpeed</Optimization>
       <InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
-      <AdditionalIncludeDirectories>..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common\inc;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSDECPLUS_EXPORTS;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <StringPooling>true</StringPooling>
       <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
@@ -116,7 +116,7 @@
     </Midl>
     <ClCompile>
       <Optimization>Disabled</Optimization>
-      <AdditionalIncludeDirectories>..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common\inc;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSDECPLUS_EXPORTS;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <MinimalRebuild>true</MinimalRebuild>
       <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
--- a/codec/build/win32/dec/WelsDecPlus_2012.vcxproj
+++ b/codec/build/win32/dec/WelsDecPlus_2012.vcxproj
@@ -66,7 +66,7 @@
     <ClCompile>
       <Optimization>MaxSpeed</Optimization>
       <InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
-      <AdditionalIncludeDirectories>..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common\inc;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSDECPLUS_EXPORTS;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <StringPooling>true</StringPooling>
       <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
@@ -115,7 +115,7 @@
     </Midl>
     <ClCompile>
       <Optimization>Disabled</Optimization>
-      <AdditionalIncludeDirectories>..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common\inc;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSDECPLUS_EXPORTS;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <MinimalRebuild>true</MinimalRebuild>
       <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
@@ -171,7 +171,7 @@
     <ClInclude Include="..\..\..\decoder\plus\inc\welsDecoderExt.h" />
   </ItemGroup>
   <ItemGroup>
-    <ProjectReference Include="WelsDecCore.vcxproj">
+    <ProjectReference Include="WelsDecCore_2012.vcxproj">
       <Project>{01b4ae41-6ad6-4caf-aeb3-c42f7f9121d5}</Project>
       <ReferenceOutputAssembly>false</ReferenceOutputAssembly>
     </ProjectReference>
--- a/codec/build/win32/dec/decConsole.vcproj
+++ b/codec/build/win32/dec/decConsole.vcproj
@@ -45,7 +45,7 @@
 				Name="VCCLCompilerTool"
 				Optimization="2"
 				InlineFunctionExpansion="1"
-				AdditionalIncludeDirectories="..\..\..\console\dec\inc,..\..\..\api\svc,..\..\..\common\inc"
+				AdditionalIncludeDirectories="..\..\..\console\dec\inc,..\..\..\api\svc,..\..\..\common,..\..\..\encoder\core\inc"
 				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
 				StringPooling="true"
 				RuntimeLibrary="2"
@@ -135,7 +135,7 @@
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="0"
-				AdditionalIncludeDirectories="..\..\..\console\dec\inc,..\..\..\api\svc,..\..\..\common\inc"
+				AdditionalIncludeDirectories="..\..\..\console\dec\inc,..\..\..\api\svc,..\..\..\common,..\..\..\decoder\core\inc,..\..\..\encoder\core\inc"
 				PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
@@ -230,6 +230,10 @@
 						PreprocessorDefinitions=""
 					/>
 				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\common\logging.cpp"
+				>
 			</File>
 			<File
 				RelativePath="..\..\..\console\dec\src\read_config.cpp"
--- a/codec/build/win32/dec/decConsole_2010.vcxproj
+++ b/codec/build/win32/dec/decConsole_2010.vcxproj
@@ -62,7 +62,7 @@
     <ClCompile>
       <Optimization>MaxSpeed</Optimization>
       <InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
-      <AdditionalIncludeDirectories>..\..\..\console\dec\inc;..\..\..\api\svc;..\..\..\common\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\..\..\console\dec\inc;..\..\..\api\svc;..\..\..\common;..\..\..\encoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <StringPooling>true</StringPooling>
       <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
@@ -103,7 +103,7 @@
     </Midl>
     <ClCompile>
       <Optimization>Disabled</Optimization>
-      <AdditionalIncludeDirectories>..\..\..\console\dec\inc;..\..\..\api\svc;..\..\..\common\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\..\..\console\dec\inc;..\..\..\api\svc;..\..\..\common;..\..\..\encoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <MinimalRebuild>true</MinimalRebuild>
       <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
@@ -152,6 +152,7 @@
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
+    <ClCompile Include="..\..\..\common\logging.cpp" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\..\console\dec\inc\d3d9_utils.h" />
@@ -161,7 +162,7 @@
     <None Include="..\..\..\..\bin\Release\welsdec.cfg" />
   </ItemGroup>
   <ItemGroup>
-    <ProjectReference Include="WelsDecPlus.vcxproj">
+    <ProjectReference Include="WelsDecPlus_2010.vcxproj">
       <Project>{1131558a-9986-4f4b-a13f-8b7f4c8438bf}</Project>
       <ReferenceOutputAssembly>false</ReferenceOutputAssembly>
     </ProjectReference>
--- a/codec/build/win32/dec/decConsole_2012.vcxproj
+++ b/codec/build/win32/dec/decConsole_2012.vcxproj
@@ -61,7 +61,7 @@
     <ClCompile>
       <Optimization>MaxSpeed</Optimization>
       <InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
-      <AdditionalIncludeDirectories>..\..\..\console\dec\inc;..\..\..\api\svc;..\..\..\common\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\..\..\console\dec\inc;..\..\..\api\svc;..\..\..\common;..\..\..\encoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <StringPooling>true</StringPooling>
       <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
@@ -102,7 +102,7 @@
     </Midl>
     <ClCompile>
       <Optimization>Disabled</Optimization>
-      <AdditionalIncludeDirectories>..\..\..\console\dec\inc;..\..\..\api\svc;..\..\..\common\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\..\..\console\dec\inc;..\..\..\api\svc;..\..\..\common;..\..\..\encoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <MinimalRebuild>true</MinimalRebuild>
       <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
@@ -143,6 +143,7 @@
     <ClCompile Include="..\..\..\console\dec\src\d3d9_utils.cpp" />
     <ClCompile Include="..\..\..\console\dec\src\h264dec.cpp" />
     <ClCompile Include="..\..\..\console\dec\src\read_config.cpp" />
+    <ClCompile Include="..\..\..\common\logging.cpp" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\..\console\dec\inc\d3d9_utils.h" />
@@ -152,7 +153,7 @@
     <None Include="..\..\..\..\bin\Release\welsdec.cfg" />
   </ItemGroup>
   <ItemGroup>
-    <ProjectReference Include="WelsDecPlus.vcxproj">
+    <ProjectReference Include="WelsDecPlus_2012.vcxproj">
       <Project>{1131558a-9986-4f4b-a13f-8b7f4c8438bf}</Project>
       <ReferenceOutputAssembly>false</ReferenceOutputAssembly>
     </ProjectReference>
--- a/codec/build/win32/enc/WelsEncPlus.vcproj
+++ b/codec/build/win32/enc/WelsEncPlus.vcproj
@@ -49,7 +49,7 @@
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="0"
-				AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api"
+				AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\WelsThreadLib\api"
 				PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;ENCODER_CORE;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
@@ -152,7 +152,7 @@
 				FavorSizeOrSpeed="1"
 				EnableFiberSafeOptimizations="true"
 				WholeProgramOptimization="true"
-				AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api"
+				AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\WelsThreadLib\api"
 				PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;ENCODER_CORE;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;"
 				StringPooling="true"
 				RuntimeLibrary="2"
--- a/codec/build/win32/enc/WelsEncPlus_2010.vcxproj
+++ b/codec/build/win32/enc/WelsEncPlus_2010.vcxproj
@@ -66,7 +66,7 @@
     </Midl>
     <ClCompile>
       <Optimization>Disabled</Optimization>
-      <AdditionalIncludeDirectories>..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\WelsThreadLib\api;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;ENCODER_CORE;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <MinimalRebuild>true</MinimalRebuild>
       <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
@@ -117,7 +117,7 @@
       <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
       <EnableFiberSafeOptimizations>true</EnableFiberSafeOptimizations>
       <WholeProgramOptimization>true</WholeProgramOptimization>
-      <AdditionalIncludeDirectories>..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\WelsThreadLib\api;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;ENCODER_CORE;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <StringPooling>true</StringPooling>
       <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
@@ -183,7 +183,7 @@
     </ResourceCompile>
   </ItemGroup>
   <ItemGroup>
-    <ProjectReference Include="WelsEncCore.vcxproj">
+    <ProjectReference Include="WelsEncCore_2010.vcxproj">
       <Project>{59208004-1774-4816-ac24-31ff44c324b4}</Project>
       <ReferenceOutputAssembly>false</ReferenceOutputAssembly>
     </ProjectReference>
--- a/codec/build/win32/enc/WelsEncPlus_2012.vcxproj
+++ b/codec/build/win32/enc/WelsEncPlus_2012.vcxproj
@@ -65,7 +65,7 @@
     </Midl>
     <ClCompile>
       <Optimization>Disabled</Optimization>
-      <AdditionalIncludeDirectories>..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\WelsThreadLib\api;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;ENCODER_CORE;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <MinimalRebuild>true</MinimalRebuild>
       <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
@@ -116,7 +116,7 @@
       <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
       <EnableFiberSafeOptimizations>true</EnableFiberSafeOptimizations>
       <WholeProgramOptimization>true</WholeProgramOptimization>
-      <AdditionalIncludeDirectories>..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\WelsThreadLib\api;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;ENCODER_CORE;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <StringPooling>true</StringPooling>
       <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
--- a/codec/build/win32/enc/encConsole.vcproj
+++ b/codec/build/win32/enc/encConsole.vcproj
@@ -45,7 +45,7 @@
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="0"
-				AdditionalIncludeDirectories="..\..\..\console\enc\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api,..\..\..\encoder\core\inc,..\..\..\common\inc"
+				AdditionalIncludeDirectories="..\..\..\console\enc\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api,..\..\..\encoder\core\inc,..\..\..\common"
 				PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE;ENCODER_CORE;MT_ENABLED;"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
@@ -140,7 +140,7 @@
 				Name="VCCLCompilerTool"
 				Optimization="2"
 				InlineFunctionExpansion="1"
-				AdditionalIncludeDirectories="..\..\..\console\enc\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api,..\..\..\encoder\core\inc,..\..\..\common\inc"
+				AdditionalIncludeDirectories="..\..\..\console\enc\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api,..\..\..\encoder\core\inc,..\..\..\common"
 				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE;ENCODER_CORE;X86_ASM;MT_ENABLED;"
 				StringPooling="true"
 				RuntimeLibrary="2"
@@ -213,6 +213,10 @@
 			Name="Source Files"
 			Filter="cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
 			>
+			<File
+				RelativePath="..\..\..\common\logging.cpp"
+				>
+			</File>
 			<File
 				RelativePath="..\..\..\console\enc\src\read_config.cpp"
 				>
--- a/codec/build/win32/enc/encConsole_2010.vcxproj
+++ b/codec/build/win32/enc/encConsole_2010.vcxproj
@@ -62,7 +62,7 @@
     </Midl>
     <ClCompile>
       <Optimization>Disabled</Optimization>
-      <AdditionalIncludeDirectories>..\..\..\console\enc\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api;..\..\..\encoder\core\inc;..\..\..\common\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\..\..\console\enc\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api;..\..\..\encoder\core\inc;..\..\..\common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;ENCODER_CORE;X86_ASM;MT_ENABLED;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <MinimalRebuild>true</MinimalRebuild>
       <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
@@ -105,7 +105,7 @@
     <ClCompile>
       <Optimization>MaxSpeed</Optimization>
       <InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
-      <AdditionalIncludeDirectories>..\..\..\console\enc\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api;..\..\..\encoder\core\inc;..\..\..\common\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\..\..\console\enc\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api;..\..\..\encoder\core\inc;..\..\..\common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;ENCODER_CORE;X86_ASM;MT_ENABLED;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <StringPooling>true</StringPooling>
       <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
@@ -155,12 +155,13 @@
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
+    <ClCompile Include="..\..\..\common\logging.cpp" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\..\console\enc\inc\read_config.h" />
   </ItemGroup>
   <ItemGroup>
-    <ProjectReference Include="WelsEncPlus.vcxproj">
+    <ProjectReference Include="WelsEncPlus_2010.vcxproj">
       <Project>{1e7b4e9a-986e-4167-8c70-6e4f60eaee7f}</Project>
       <ReferenceOutputAssembly>false</ReferenceOutputAssembly>
     </ProjectReference>
--- a/codec/build/win32/enc/encConsole_2012.vcxproj
+++ b/codec/build/win32/enc/encConsole_2012.vcxproj
@@ -61,7 +61,7 @@
     </Midl>
     <ClCompile>
       <Optimization>Disabled</Optimization>
-      <AdditionalIncludeDirectories>..\..\..\console\enc\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api;..\..\..\encoder\core\inc;..\..\..\common\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\..\..\console\enc\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api;..\..\..\encoder\core\inc;..\..\..\common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;ENCODER_CORE;X86_ASM;MT_ENABLED;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <MinimalRebuild>true</MinimalRebuild>
       <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
@@ -105,7 +105,7 @@
     <ClCompile>
       <Optimization>MaxSpeed</Optimization>
       <InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
-      <AdditionalIncludeDirectories>..\..\..\console\enc\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api;..\..\..\encoder\core\inc;..\..\..\common\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\..\..\console\enc\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api;..\..\..\encoder\core\inc;..\..\..\common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;ENCODER_CORE;X86_ASM;MT_ENABLED;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <StringPooling>true</StringPooling>
       <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
@@ -147,12 +147,13 @@
   <ItemGroup>
     <ClCompile Include="..\..\..\console\enc\src\read_config.cpp" />
     <ClCompile Include="..\..\..\console\enc\src\welsenc.cpp" />
+    <ClCompile Include="..\..\..\common\logging.cpp" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\..\console\enc\inc\read_config.h" />
   </ItemGroup>
   <ItemGroup>
-    <ProjectReference Include="WelsEncPlus.vcxproj">
+    <ProjectReference Include="WelsEncPlus_2012.vcxproj">
       <Project>{1e7b4e9a-986e-4167-8c70-6e4f60eaee7f}</Project>
       <ReferenceOutputAssembly>false</ReferenceOutputAssembly>
     </ProjectReference>
--- /dev/null
+++ b/codec/common/logging.cpp
@@ -1,0 +1,49 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     Copyright (c)  2013, Mozilla
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "typedefs.h"
+
+static int32_t g_TraceLevel = 0;  // Threshold for stderr tracing; starts at 0 and is written only by WelsStderrSetTraceLevel().
+
+void WelsStderrSetTraceLevel (int32_t level) {  // Stores the new threshold; welsStderrLevelTrace() prints levels strictly below it.
+  g_TraceLevel = level;
+}
+
+int32_t welsStderrLevelTrace (int32_t level, const str_t* format, va_list ap) {  // printf-style sink: sends (format, ap) to stderr when enabled.
+  if (level < g_TraceLevel) {  // Strictly less: a message at exactly the configured level is suppressed.
+    vfprintf (stderr, format, ap);
+  }
+  return 0;  // Always 0, printed or not; the vfprintf result is discarded.
+}
--- /dev/null
+++ b/codec/common/logging.h
@@ -1,0 +1,70 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     Copyright (c)  2013, Mozilla
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+
+#ifndef WELS_LOGGING_H__
+#define WELS_LOGGING_H__
+
+#include <stdarg.h>    // va_list, va_start, va_end
+#include "typedefs.h"  // str_t, int32_t (same header logging.cpp relies on)
+
+// API surface.
+// Sets the threshold used by the stderr tracer: messages whose level is
+// strictly below |level| are printed, all others are dropped.
+void WelsStderrSetTraceLevel (int32_t level);
+
+
+// Internal details.
+// Writes the printf-style message (format, ap) to stderr when |level| is
+// strictly below the configured threshold.  Always returns 0.
+int32_t welsStderrLevelTrace (int32_t level, const str_t* format, va_list ap);
+
+// Variadic trace entry point whose message level is fixed at compile time by
+// the template argument.  On non-WIN32 builds it takes an extra leading
+// |dllname| argument, which is ignored.  Always returns 0.
+template<int level> int32_t welsStderrTrace (
+#ifndef WIN32
+  const str_t* dllname,
+#endif
+  const str_t* format, ...) {
+#ifndef WIN32
+  (void)dllname;  // Unused.
+#endif
+  va_list ap;
+  va_start (ap, format);
+  welsStderrLevelTrace (level, format, ap);
+  va_end (ap);
+  return 0;
+}
+
+#endif
--- /dev/null
+++ b/codec/common/targets.mk
@@ -1,0 +1,27 @@
+# Build rules for the static "common" library (currently only logging.cpp).
+COMMON_PREFIX=COMMON
+COMMON_SRCDIR=codec/common
+COMMON_CPP_SRCS=\
+	$(COMMON_SRCDIR)/./logging.cpp\
+
+COMMON_OBJS += $(COMMON_CPP_SRCS:.cpp=.o)
+ifeq ($(USE_ASM), Yes)
+# No assembly sources are listed for this library; the variable is empty.
+COMMON_ASM_SRCS=\
+
+COMMON_OBJS += $(COMMON_ASM_SRCS:.asm=.o)
+endif
+
+OBJS += $(COMMON_OBJS)
+$(COMMON_SRCDIR)/./logging.o: $(COMMON_SRCDIR)/./logging.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(COMMON_CFLAGS) $(COMMON_INCLUDES) -c -o $@ $(COMMON_SRCDIR)/./logging.cpp
+
+# Remove any previous archive first so it is recreated rather than updated in
+# place.  $(AR) defaults to "ar" and can be overridden for other toolchains.
+$(LIBPREFIX)common.$(LIBSUFFIX): $(COMMON_OBJS)
+	rm -f $@
+	$(AR) cr $@ $(COMMON_OBJS)
+
+# Make the archive a prerequisite of "libraries" and record it in LIBRARIES.
+libraries: $(LIBPREFIX)common.$(LIBSUFFIX)
+LIBRARIES += $(LIBPREFIX)common.$(LIBSUFFIX)
--- a/codec/console/dec/inc/d3d9_utils.h
+++ b/codec/console/dec/inc/d3d9_utils.h
@@ -1,143 +1,139 @@
-/*!
- * \copy
- *     Copyright (c)  2010-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	d3d9_utils.h
- *
- * \brief	interface of d3d9 render module
- *
- * \date	Created 12/14/2010
- *
- * \description : 1. Rendering in Vista and upper : D3D9Ex method, support host memory / shared surface input 
- *                2. Rendering in XP : D3D9 method w/o device lost handling, support host memory input  
- *                3. File Dump : support host memory / shared surface input 
- *
- *************************************************************************************
- */
-#ifndef WELS_D3D9_UTILS_H__
-#define WELS_D3D9_UTILS_H__
-
-//#pragma once	// do not use this due cross platform, esp for Solaris
-
-#include <stdio.h>
-#include "codec_def.h"
-
-#if defined(_MSC_VER) && (_MSC_VER>=1500) // vs2008 and upper
-#define ENABLE_DISPLAY_MODULE // enable/disable the render feature 
-#endif
-
-#ifdef ENABLE_DISPLAY_MODULE
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-#include <d3d9.h>
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-class CD3D9Utils
-{
-public:
-	CD3D9Utils();
-	~CD3D9Utils();
-
-public:
-	HRESULT Init(BOOL bWindowed);
-	HRESULT Uninit(void);
-	HRESULT Process(void *pDst[3], SBufferInfo *Info, FILE *pFile = NULL);
-
-private:
-	HRESULT InitResource(void *pSharedHandle, SBufferInfo *pInfo);
-	HRESULT Render(void *pDst[3], SBufferInfo *pInfo);
-	HRESULT Dump(void *pDst[3], SBufferInfo *pInfo, FILE *pFile);
-                  
-private:
-	HMODULE               m_hDll;
-	HWND                  m_hWnd;
-	unsigned char        *m_pDumpYUV;
-	BOOL                  m_bInitDone;
-
-	LPDIRECT3D9           m_lpD3D9;
-	LPDIRECT3DDEVICE9     m_lpD3D9Device;
-
-	D3DPRESENT_PARAMETERS m_d3dpp;
-	LPDIRECT3DSURFACE9    m_lpD3D9RawSurfaceShare;
-};
-
-class CD3D9ExUtils
-{
-public:
-	CD3D9ExUtils();
-	~CD3D9ExUtils();
-
-public:
-	HRESULT Init(BOOL bWindowed);
-	HRESULT Uninit(void);
-	HRESULT Process(void *dst[3], SBufferInfo *Info, FILE *fp = NULL);
-
-private:
-	HRESULT InitResource(void *pSharedHandle, SBufferInfo *Info);
-	HRESULT Render(void *pDst[3], SBufferInfo *Info);
-	HRESULT Dump(void *pDst[3], SBufferInfo *Info, FILE *fp);
-
-private:
-	HMODULE               m_hDll;
-	HWND                  m_hWnd;
-	unsigned char        *m_pDumpYUV;
-	BOOL                  m_bInitDone;
-
-	LPDIRECT3D9EX         m_lpD3D9;
-	LPDIRECT3DDEVICE9EX   m_lpD3D9Device;
-
-	D3DPRESENT_PARAMETERS m_d3dpp;
-	LPDIRECT3DSURFACE9    m_lpD3D9RawSurfaceShare;
-};
-#endif
-
-typedef enum
-{
-  OS_UNSUPPORTED = 0,
-  OS_XP,
-  OS_VISTA_UPPER
-};
-
-class CUtils
-{
-public:
-	CUtils();
-	~CUtils();
-
-	int Process(void *dst[3], SBufferInfo *Info, FILE *fp);
-
-private:
-	int CheckOS(void);
-
-private:
-	int iOSType;
-	void *hHandle;
-};
-
-#endif//WELS_D3D9_UTILS_H__
-
+/*!
+ * \copy
+ *     Copyright (c)  2010-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	d3d9_utils.h
+ *
+ * \brief	interface of d3d9 render module
+ *
+ * \date	Created 12/14/2010
+ *
+ * \description : 1. Rendering in Vista and upper : D3D9Ex method, support host memory / shared surface input
+ *                2. Rendering in XP : D3D9 method w/o device lost handling, support host memory input
+ *                3. File Dump : support host memory / shared surface input
+ *
+ *************************************************************************************
+ */
+#ifndef WELS_D3D9_UTILS_H__
+#define WELS_D3D9_UTILS_H__
+
+//#pragma once	// do not use this due cross platform, esp for Solaris
+
+#include <stdio.h>
+#include "codec_def.h"
+
+#if defined(_MSC_VER) && (_MSC_VER>=1500) // Visual Studio 2008 and newer
+#define ENABLE_DISPLAY_MODULE // turns on the D3D9 render classes guarded by this macro
+#endif
+
+#ifdef ENABLE_DISPLAY_MODULE
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+#include <d3d9.h>
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+class CD3D9Utils {  // Render helper built on the plain Direct3D 9 interfaces (LPDIRECT3D9 / LPDIRECT3DDEVICE9).
+ public:
+  CD3D9Utils();
+  ~CD3D9Utils();
+
+ public:
+  HRESULT Init (BOOL bWindowed);  // NOTE(review): bWindowed presumably selects windowed vs. full-screen presentation -- confirm in d3d9_utils.cpp
+  HRESULT Uninit (void);
+  HRESULT Process (void* pDst[3], SBufferInfo* Info, FILE* pFile = NULL);  // pFile is optional (NULL by default); presumably a dump target when set -- TODO confirm
+
+ private:
+  HRESULT InitResource (void* pSharedHandle, SBufferInfo* pInfo);
+  HRESULT Render (void* pDst[3], SBufferInfo* pInfo);  // pDst[3] presumably holds the three picture planes -- TODO confirm
+  HRESULT Dump (void* pDst[3], SBufferInfo* pInfo, FILE* pFile);
+
+ private:
+  HMODULE               m_hDll;
+  HWND                  m_hWnd;
+  unsigned char*        m_pDumpYUV;
+  BOOL                  m_bInitDone;
+
+  LPDIRECT3D9           m_lpD3D9;
+  LPDIRECT3DDEVICE9     m_lpD3D9Device;
+
+  D3DPRESENT_PARAMETERS m_d3dpp;
+  LPDIRECT3DSURFACE9    m_lpD3D9RawSurfaceShare;
+};
+
+class CD3D9ExUtils {  // Same public shape as CD3D9Utils, but built on the Direct3D 9Ex interfaces (LPDIRECT3D9EX / LPDIRECT3DDEVICE9EX).
+ public:
+  CD3D9ExUtils();
+  ~CD3D9ExUtils();
+
+ public:
+  HRESULT Init (BOOL bWindowed);  // NOTE(review): bWindowed presumably selects windowed vs. full-screen presentation -- confirm in d3d9_utils.cpp
+  HRESULT Uninit (void);
+  HRESULT Process (void* dst[3], SBufferInfo* Info, FILE* fp = NULL);  // fp is optional (NULL by default); presumably a dump target when set -- TODO confirm
+
+ private:
+  HRESULT InitResource (void* pSharedHandle, SBufferInfo* Info);
+  HRESULT Render (void* pDst[3], SBufferInfo* Info);  // pDst[3] presumably holds the three picture planes -- TODO confirm
+  HRESULT Dump (void* pDst[3], SBufferInfo* Info, FILE* fp);
+
+ private:
+  HMODULE               m_hDll;
+  HWND                  m_hWnd;
+  unsigned char*        m_pDumpYUV;
+  BOOL                  m_bInitDone;
+
+  LPDIRECT3D9EX         m_lpD3D9;
+  LPDIRECT3DDEVICE9EX   m_lpD3D9Device;
+
+  D3DPRESENT_PARAMETERS m_d3dpp;
+  LPDIRECT3DSURFACE9    m_lpD3D9RawSurfaceShare;
+};
+#endif
+
+enum {  // OS categories; unnamed, so the values are used as plain ints (presumably via CUtils::iOSType -- confirm)
+  OS_UNSUPPORTED = 0,  // presumably any OS not covered by the two entries below
+  OS_XP,               // Windows XP; the file header pairs XP with plain D3D9 rendering
+  OS_VISTA_UPPER       // Windows Vista and newer; the file header pairs these with D3D9Ex rendering
+};
+
+class CUtils {  // Declared outside the ENABLE_DISPLAY_MODULE guard, so it exists on every build of this header.
+ public:
+  CUtils();
+  ~CUtils();
+
+  int Process (void* dst[3], SBufferInfo* Info, FILE* fp);  // unlike the D3D9 classes' Process(), fp has no default value here
+
+ private:
+  int CheckOS (void);  // NOTE(review): presumably returns one of the OS_* constants -- confirm in d3d9_utils.cpp
+
+ private:
+  int iOSType;  // presumably caches the CheckOS() result -- TODO confirm
+  void* hHandle;  // opaque pointer; what it owns is not visible from this header
+};
+
+#endif//WELS_D3D9_UTILS_H__
+
--- a/codec/console/dec/inc/dec_console.h
+++ b/codec/console/dec/inc/dec_console.h
@@ -46,8 +46,8 @@
 
 bool load_bundle_welsdec();
 void free_bundle_welsdec();
-bool get_functions_address_free_decoder(ISVCDecoder* pDecoder);
-bool get_functions_address_create_decoder(ISVCDecoder** ppDecoder);
+bool get_functions_address_free_decoder (ISVCDecoder* pDecoder);
+bool get_functions_address_create_decoder (ISVCDecoder** ppDecoder);
 
 
 
--- a/codec/console/dec/inc/read_config.h
+++ b/codec/console/dec/inc/read_config.h
@@ -44,22 +44,21 @@
 #include <string>
 using namespace std;
 
-class CReadConfig
-{
-public:
-	CReadConfig( const char *kpConfigFileName );
-	virtual ~CReadConfig();
-	
-	long ReadLine( string* val, const int kiValSize = 4 );
-	const bool EndOfFile();
-	const int GetLines();
-	const bool ExistFile();
-	const string& GetFileName();
-	
-private:
-	FILE			*m_pCfgFile;
-	string			m_strCfgFileName;
-	unsigned long	m_ulLines;
+class CReadConfig {  // Reader for a configuration file that is identified by name at construction.
+ public:
+  CReadConfig (const char* kpConfigFileName);
+  virtual ~CReadConfig();
+
+  long ReadLine (string* val, const int kiValSize = 4);  // NOTE(review): val presumably points to kiValSize strings filled from one line -- confirm in read_config.cpp
+  const bool EndOfFile();
+  const int GetLines();  // presumably reports m_ulLines -- TODO confirm
+  const bool ExistFile();  // presumably true when m_pCfgFile was opened successfully -- TODO confirm
+  const string& GetFileName();
+
+ private:
+  FILE*			m_pCfgFile;
+  string			m_strCfgFileName;
+  unsigned long	m_ulLines;
 };
 
 #endif	// READ_CONFIG_H__
--- a/codec/console/dec/src/d3d9_utils.cpp
+++ b/codec/console/dec/src/d3d9_utils.cpp
@@ -1,778 +1,701 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "d3d9_utils.h"
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-void Write2File(FILE *pFp, unsigned char* pData[3], int iStride[2], int iWidth, int iHeight);
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-#ifdef ENABLE_DISPLAY_MODULE
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-#define IDM_ABOUT						104
-#define IDM_EXIT						105
-#define IDI_TESTSHARESURFACE	        107
-#define IDI_SMALL						108
-#define IDC_TESTSHARESURFACE	        109
-
-#define NV12_FORMAT  MAKEFOURCC('N','V','1','2')
-
-typedef struct
-{
-	UINT      uiWidth;
-	UINT      uiHeight;
-	D3DFORMAT D3Dformat;
-	D3DPOOL   D3DPool;
-} SHandleInfo;
-
-#define SAFE_RELEASE(p) if(p) { (p)->Release(); (p) = NULL; }
-#define SAFE_FREE(p)    if(p) { free (p); (p) = NULL; }
-
-HRESULT Dump2YUV(void *pDst[3], void *pSurface, int iWidth, int iHeight, int iStride[2]);
-HRESULT Dump2Surface(void *pDst[3], void *pSurface, int iWidth, int iHeight, int iStride[2]);
-HRESULT InitWindow(HWND *hWnd);
-LRESULT CALLBACK WndProc(HWND, UINT, WPARAM, LPARAM);
-
-typedef HRESULT (WINAPI *pFnCreateD3D9Ex) (UINT SDKVersion, IDirect3D9Ex** );
-typedef LPDIRECT3D9 (WINAPI *pFnCreateD3D9)(UINT SDKVersion);
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-CD3D9Utils::CD3D9Utils()
-{
-	m_hDll        = NULL;
-	m_hWnd        = NULL;
-	m_pDumpYUV    = NULL;
-
-	m_bInitDone   = FALSE;
-
-	m_lpD3D9                = NULL;
-	m_lpD3D9Device          = NULL;
-	m_lpD3D9RawSurfaceShare = NULL;
-
-	// coverity scan uninitial
-	ZeroMemory(&m_d3dpp, sizeof(m_d3dpp));
-}
-
-CD3D9Utils::~CD3D9Utils()
-{
-	Uninit();
-}
-
-HRESULT CD3D9Utils::Init(BOOL bWindowed)
-{
-	if (m_bInitDone)
-		return S_OK;
-
-	m_hDll = LoadLibrary(TEXT("d3d9.dll"));
-	pFnCreateD3D9 pCreateD3D9 = NULL;
-	if(m_hDll)
-		pCreateD3D9 = (pFnCreateD3D9) GetProcAddress(m_hDll, TEXT("Direct3DCreate9"));
-	else 
-		return E_FAIL;
-
-	m_lpD3D9 = pCreateD3D9(D3D_SDK_VERSION);
-
-	return bWindowed ? InitWindow(&m_hWnd) : S_OK;
-}
-
-HRESULT CD3D9Utils::Uninit()
-{
-	SAFE_RELEASE(m_lpD3D9RawSurfaceShare);
-    SAFE_RELEASE(m_lpD3D9Device);
-    SAFE_RELEASE(m_lpD3D9);
-	SAFE_FREE(m_pDumpYUV);
-
-	if(m_hDll)
-	{
-		FreeLibrary(m_hDll);
-		m_hDll = NULL;
-	}
-
-	return S_OK;
-}
-
-HRESULT CD3D9Utils::Process(void *pDst[3], SBufferInfo *pInfo, FILE *pFp)
-{
-	HRESULT hResult = E_FAIL;
-
-	if (pDst == NULL || pInfo == NULL)
-		return hResult;
-
-	BOOL bWindowed = pFp ? FALSE : TRUE;
-	BOOL bNeedD3D9 = !(!bWindowed && pInfo->eBufferProperty == BUFFER_HOST);
-	if (!m_bInitDone)
-		m_bInitDone = !bNeedD3D9;
-
-	if (!m_bInitDone)
-	{
-		hResult = Init(bWindowed);
-		if (SUCCEEDED(hResult))
-			m_bInitDone = TRUE;
-	}
-
-	if (m_bInitDone)
-	{
-		if (bWindowed)
-		{	
-			hResult = Render(pDst, pInfo);				
-			Sleep(30); 
-		}
-		else if (pFp)
-		{
-			hResult = Dump(pDst, pInfo, pFp);
-			Sleep(0);
-		}
-	}
-
-	return hResult;
-}
-
-HRESULT CD3D9Utils::Render(void *pDst[3], SBufferInfo *pInfo)
-{
-	HRESULT hResult = E_FAIL;
-	EBufferProperty eBufferProperty = pInfo->eBufferProperty;
-
-	if (eBufferProperty == BUFFER_HOST)
-	{
-		hResult = InitResource(NULL, pInfo);
-		if (SUCCEEDED(hResult))
-		 hResult = Dump2Surface(pDst, m_lpD3D9RawSurfaceShare, pInfo->UsrData.sSystemBuffer.iWidth, pInfo->UsrData.sSystemBuffer.iHeight, pInfo->UsrData.sSystemBuffer.iStride);
-	}
-	
-	if (SUCCEEDED(hResult))
-	{
-		IDirect3DSurface9 *pBackBuffer = NULL;
-		hResult = m_lpD3D9Device->GetBackBuffer(0, 0, D3DBACKBUFFER_TYPE_MONO, &pBackBuffer);
-		hResult = m_lpD3D9Device->StretchRect(m_lpD3D9RawSurfaceShare, NULL, pBackBuffer, NULL, D3DTEXF_NONE);
-		hResult = m_lpD3D9Device->Present(0, 0, NULL, NULL);
-	}
-
-	return hResult;
-}
-
-HRESULT CD3D9Utils::Dump(void *pDst[3], SBufferInfo *pInfo, FILE *pFp)
-{
-	HRESULT hResult = E_FAIL;
-	EBufferProperty eBufferProperty = pInfo->eBufferProperty;
-	int iStride[2];
-	int iWidth;
-	int iHeight;	
-
-	iWidth = pInfo->UsrData.sSystemBuffer.iWidth;
-	iHeight = pInfo->UsrData.sSystemBuffer.iHeight;
-	iStride[0] = pInfo->UsrData.sSystemBuffer.iStride[0];
-	iStride[1] = pInfo->UsrData.sSystemBuffer.iStride[1];
-	
-	if (pDst[0] && pDst[1] && pDst[2])
-		Write2File(pFp, (unsigned char **)pDst, iStride, iWidth, iHeight);
-
-	return hResult;
-}
-
-HRESULT CD3D9Utils::InitResource(void *pSharedHandle, SBufferInfo *pInfo)
-{
-	HRESULT hResult = S_OK;
-
-	// coverity scan uninitial
-	int iWidth = 0;
-	int iHeight = 0;
-	D3DFORMAT D3Dformat = (D3DFORMAT)D3DFMT_UNKNOWN;
-	D3DPOOL D3Dpool = (D3DPOOL)D3DPOOL_DEFAULT;
-
-	if (pInfo == NULL)
-		return E_FAIL;
-
-	if (m_lpD3D9Device == NULL && m_lpD3D9RawSurfaceShare == NULL)
-	{
-		HMONITOR hMonitorWnd = MonitorFromWindow(m_hWnd, MONITOR_DEFAULTTONULL);
-
-		UINT uiAdapter = D3DADAPTER_DEFAULT;
-		UINT uiCnt = m_lpD3D9->GetAdapterCount();
-		for(UINT i=0; i<uiCnt; i++)
-		{
-			HMONITOR hMonitor = m_lpD3D9->GetAdapterMonitor(i);
-			if(hMonitor == hMonitorWnd)
-			{
-				uiAdapter = i;
-				break;
-			}
-		}
-
-		D3DDISPLAYMODE D3DDisplayMode;
-		hResult = m_lpD3D9->GetAdapterDisplayMode(uiAdapter, &D3DDisplayMode);
-
-		D3DDEVTYPE D3DDevType = D3DDEVTYPE_HAL;
-		DWORD dwBehaviorFlags = D3DCREATE_SOFTWARE_VERTEXPROCESSING | D3DCREATE_MULTITHREADED;
-
-		ZeroMemory(&m_d3dpp, sizeof(m_d3dpp));
-		m_d3dpp.Flags = D3DPRESENTFLAG_VIDEO;
-		m_d3dpp.SwapEffect = D3DSWAPEFFECT_DISCARD;
-		m_d3dpp.BackBufferFormat = D3DDisplayMode.Format;
-		m_d3dpp.Windowed = TRUE;
-		m_d3dpp.hDeviceWindow = m_hWnd;
-		m_d3dpp.PresentationInterval = D3DPRESENT_INTERVAL_IMMEDIATE;
-		hResult = m_lpD3D9->CreateDevice(uiAdapter, D3DDevType, NULL, dwBehaviorFlags, &m_d3dpp, &m_lpD3D9Device);
-		if (pInfo->eBufferProperty == BUFFER_HOST)
-		{
-			iWidth = pInfo->UsrData.sSystemBuffer.iWidth;
-			iHeight = pInfo->UsrData.sSystemBuffer.iHeight;
-			D3Dformat = (D3DFORMAT)NV12_FORMAT;
-			D3Dpool = (D3DPOOL)D3DPOOL_DEFAULT;
-		}
-		
-		hResult = m_lpD3D9Device->CreateOffscreenPlainSurface(iWidth, iHeight, (D3DFORMAT)D3Dformat, (D3DPOOL)D3Dpool, &m_lpD3D9RawSurfaceShare, NULL);
-		
-	}
-
-	if (m_lpD3D9Device == NULL || m_lpD3D9RawSurfaceShare == NULL)
-		hResult = E_FAIL;
-
-	return hResult;
-}
-
-CD3D9ExUtils::CD3D9ExUtils()
-{
-	m_hDll        = NULL;
-	m_hWnd        = NULL;
-	m_pDumpYUV    = NULL;
-
-	m_bInitDone   = FALSE;
-
-	m_lpD3D9                = NULL;
-	m_lpD3D9Device          = NULL;
-	m_lpD3D9RawSurfaceShare = NULL;
-
-	// coverity scan uninitial
-	ZeroMemory(&m_d3dpp, sizeof(m_d3dpp));
-}
-
-CD3D9ExUtils::~CD3D9ExUtils()
-{
-	Uninit();
-}
-
-HRESULT CD3D9ExUtils::Init(BOOL bWindowed)
-{
-	if (m_bInitDone)
-		return S_OK;
-
-	m_hDll = LoadLibrary(TEXT("d3d9.dll"));
-	pFnCreateD3D9Ex pCreateD3D9Ex = NULL;
-	if(m_hDll)
-		pCreateD3D9Ex = (pFnCreateD3D9Ex) GetProcAddress(m_hDll, TEXT("Direct3DCreate9Ex"));
-	else 
-		return E_FAIL;
-
-	pCreateD3D9Ex(D3D_SDK_VERSION, &m_lpD3D9);
-
-	return bWindowed ? InitWindow(&m_hWnd) : S_OK;
-}
-
-HRESULT CD3D9ExUtils::Uninit()
-{
-	SAFE_RELEASE(m_lpD3D9RawSurfaceShare);
-	SAFE_RELEASE(m_lpD3D9Device);
-	SAFE_RELEASE(m_lpD3D9);
-	SAFE_FREE(m_pDumpYUV);
-
-	if(m_hDll)
-	{
-		FreeLibrary(m_hDll);
-		m_hDll = NULL;
-	}
-
-	return S_OK;
-}
-
-HRESULT CD3D9ExUtils::Process(void *pDst[3], SBufferInfo *pInfo, FILE *pFp)
-{
-	HRESULT hResult = E_FAIL;
-
-	if (pDst == NULL || pInfo == NULL)
-		return hResult;
-
-	BOOL bWindowed = pFp ? FALSE : TRUE;
-	BOOL bNeedD3D9 = !(!bWindowed && pInfo->eBufferProperty == BUFFER_HOST);
-	if (!m_bInitDone)
-		m_bInitDone = !bNeedD3D9;
-
-	if (!m_bInitDone)
-	{
-		hResult = Init(bWindowed);
-		if (SUCCEEDED(hResult))
-			m_bInitDone = TRUE;
-	}
-
-	if (m_bInitDone)
-	{
-		if (bWindowed)
-		{	
-			hResult = Render(pDst, pInfo);				
-			Sleep(30); // set a simple time controlling with default of 30fps
-		}
-		else if (pFp)
-		{
-			hResult = Dump(pDst, pInfo, pFp);
-			Sleep(0);
-		}
-	}
-
- 	return hResult;
-}
-
-HRESULT CD3D9ExUtils::Render(void *pDst[3], SBufferInfo *pInfo)
-{
-	HRESULT hResult = E_FAIL;
-	EBufferProperty eBufferProperty = pInfo->eBufferProperty;
-
-	if (eBufferProperty == BUFFER_HOST)
-	{
-		hResult = InitResource(NULL, pInfo);
-		if (SUCCEEDED(hResult))
-			hResult = Dump2Surface(pDst, m_lpD3D9RawSurfaceShare, pInfo->UsrData.sSystemBuffer.iWidth, pInfo->UsrData.sSystemBuffer.iHeight, pInfo->UsrData.sSystemBuffer.iStride);
-	}
-	else if (eBufferProperty == BUFFER_DEVICE)
-	{
-		VOID * pSharedHandle = pDst[0];	
-		hResult = InitResource(pSharedHandle, pInfo);
-	}
-
-	if (SUCCEEDED(hResult))
-	{
-		IDirect3DSurface9 *pBackBuffer = NULL;
-		hResult = m_lpD3D9Device->GetBackBuffer(0, 0, D3DBACKBUFFER_TYPE_MONO, &pBackBuffer);
-		hResult = m_lpD3D9Device->StretchRect(m_lpD3D9RawSurfaceShare, NULL, pBackBuffer, NULL, D3DTEXF_NONE);
-		hResult = m_lpD3D9Device->PresentEx(0, 0, NULL, NULL, 0);
-	}
-
-	return hResult;
-}
-
-HRESULT CD3D9ExUtils::Dump(void *pDst[3], SBufferInfo *pInfo, FILE *pFp)
-{
-	HRESULT hResult = E_FAIL;
-	EBufferProperty eBufferProperty = pInfo->eBufferProperty;
-	int iStride[2];
-	int iWidth;
-	int iHeight;	
-	
-	if (eBufferProperty != BUFFER_HOST)
-	{		
-		iWidth = pInfo->UsrData.sVideoBuffer.iSurfaceWidth;
-		iHeight = pInfo->UsrData.sVideoBuffer.iSurfaceHeight;
-		iStride[0] = iWidth;
-		iStride[1] = iWidth / 2;
-		
-		if (m_pDumpYUV == NULL)
-		{
-			m_pDumpYUV = (unsigned char *)malloc(iWidth * iHeight * 3 / 2 * sizeof(unsigned char));
-		}
-
-		if (m_pDumpYUV)
-		{
-			void *pSurface = pDst[1];
-			pDst[0] = m_pDumpYUV;
-			pDst[1] = m_pDumpYUV + iHeight * iStride[0] * sizeof(unsigned char);
-			pDst[2] = m_pDumpYUV + iHeight * iStride[0] * 5 / 4 * sizeof(unsigned char);
-			hResult = Dump2YUV(pDst, pSurface, iWidth, iHeight, iStride);
-		}
-	}
-	else
-	{
-		iWidth = pInfo->UsrData.sSystemBuffer.iWidth;
-		iHeight = pInfo->UsrData.sSystemBuffer.iHeight;
-		iStride[0] = pInfo->UsrData.sSystemBuffer.iStride[0];
-		iStride[1] = pInfo->UsrData.sSystemBuffer.iStride[1];
-	}
-	
-	if (pDst[0] && pDst[1] && pDst[2])
-		Write2File(pFp, (unsigned char **)pDst, iStride, iWidth, iHeight);
-
-	return hResult;
-}
-
-HRESULT CD3D9ExUtils::InitResource(void *pSharedHandle, SBufferInfo *pInfo)
-{
-	HRESULT hResult = S_OK;
-	int iWidth;
-	int iHeight;
-	D3DFORMAT D3Dformat;
-	D3DPOOL D3Dpool;
-
-	if (pInfo == NULL)
-		return E_FAIL;
-
-	if (m_lpD3D9Device == NULL && m_lpD3D9RawSurfaceShare == NULL)
-	{
-		HMONITOR hMonitorWnd = MonitorFromWindow(m_hWnd, MONITOR_DEFAULTTONULL);
-
-		UINT uiAdapter = D3DADAPTER_DEFAULT;
-		UINT uiCnt = m_lpD3D9->GetAdapterCount();
-		for(UINT i=0; i<uiCnt; i++)
-		{
-			HMONITOR hMonitor = m_lpD3D9->GetAdapterMonitor(i);
-			if(hMonitor == hMonitorWnd)
-			{
-				uiAdapter = i;
-				break;
-			}
-		}
-
-		D3DDISPLAYMODEEX D3DDisplayMode;
-		D3DDisplayMode.Size = sizeof(D3DDISPLAYMODEEX);
-		hResult = m_lpD3D9->GetAdapterDisplayModeEx(uiAdapter, &D3DDisplayMode, NULL);
-
-		D3DDEVTYPE D3DDevType = D3DDEVTYPE_HAL;
-		DWORD dwBehaviorFlags = D3DCREATE_SOFTWARE_VERTEXPROCESSING | D3DCREATE_MULTITHREADED;
-
-		ZeroMemory(&m_d3dpp, sizeof(m_d3dpp));
-		m_d3dpp.Flags = D3DPRESENTFLAG_VIDEO;
-		m_d3dpp.SwapEffect = D3DSWAPEFFECT_DISCARD;
-		m_d3dpp.BackBufferFormat = D3DDisplayMode.Format;
-		m_d3dpp.Windowed = TRUE;
-		m_d3dpp.hDeviceWindow = m_hWnd;
-		m_d3dpp.PresentationInterval = D3DPRESENT_INTERVAL_IMMEDIATE;
-		hResult = m_lpD3D9->CreateDeviceEx(uiAdapter, D3DDevType, NULL, dwBehaviorFlags, &m_d3dpp, NULL, &m_lpD3D9Device);
-		if (pInfo->eBufferProperty == BUFFER_HOST)
-		{
-			iWidth = pInfo->UsrData.sSystemBuffer.iWidth;
-			iHeight = pInfo->UsrData.sSystemBuffer.iHeight;
-			D3Dformat = (D3DFORMAT)NV12_FORMAT;
-			D3Dpool = (D3DPOOL)D3DPOOL_DEFAULT;
-		}
-		else
-		{
-			iWidth = pInfo->UsrData.sVideoBuffer.iSurfaceWidth;
-			iHeight = pInfo->UsrData.sVideoBuffer.iSurfaceHeight;
-			D3Dformat = (D3DFORMAT)pInfo->UsrData.sVideoBuffer.D3Dformat;
-			D3Dpool = (D3DPOOL)pInfo->UsrData.sVideoBuffer.D3DPool;
-		}
-		hResult = m_lpD3D9Device->CreateOffscreenPlainSurface(iWidth, iHeight, (D3DFORMAT)D3Dformat, (D3DPOOL)D3Dpool, &m_lpD3D9RawSurfaceShare, &pSharedHandle);
-	}
-
-	if (m_lpD3D9Device == NULL || m_lpD3D9RawSurfaceShare == NULL)
-		hResult = E_FAIL;
-
-	return hResult;
-}
-
-
-HRESULT Dump2YUV(void *pDst[3], void *pSurface, int iWidth, int iHeight, int iStride[2])
-{
-	HRESULT hResult = E_FAIL;
-
-	if (!pDst[0] || !pDst[1] || !pDst[2] || !pSurface)
-		return hResult;
-
-	IDirect3DSurface9 *pSurfaceData = (IDirect3DSurface9 *)pSurface;
-	D3DLOCKED_RECT sD3DLockedRect = {0};
-	hResult = pSurfaceData->LockRect(&sD3DLockedRect, NULL, 0);
-
-	unsigned char * pInY = (unsigned char *)sD3DLockedRect.pBits;
-	unsigned char * pOutY = (unsigned char *)pDst[0];
-	int iInStride = sD3DLockedRect.Pitch;
-	int iOutStride = iStride[0];
-
-	for (int j=0; j<iHeight; j++)
-		memcpy(pOutY+j*iOutStride, pInY+j*iInStride, iWidth);//confirmed_safe_unsafe_usage
-
-	unsigned char * pOutV = (unsigned char *)pDst[1];
-	unsigned char * pOutU = (unsigned char *)pDst[2];
-	unsigned char * pInC = pInY + iInStride * iHeight;
-	iOutStride = iStride[1];
-	for (int i=0; i<iHeight/2; i++)
-	{
-		for (int j=0; j<iWidth; j+=2)
-		{
-			pOutV[i*iOutStride+j/2] = pInC[i*iInStride+j  ];
-			pOutU[i*iOutStride+j/2] = pInC[i*iInStride+j+1];
-		}
-	}
-
-	pSurfaceData->UnlockRect();
-
-	return hResult;
-}
-
-HRESULT Dump2Surface(void *pDst[3], void *pSurface, int iWidth, int iHeight, int iStride[2])
-{
-	HRESULT hResult = E_FAIL;
-
-	if (!pDst[0] || !pDst[1] || !pDst[2] || !pSurface)
-		return hResult;
-
-	IDirect3DSurface9 *pSurfaceData = (IDirect3DSurface9 *)pSurface;
-	D3DLOCKED_RECT sD3DLockedRect = {0};
-	hResult = pSurfaceData->LockRect(&sD3DLockedRect, NULL, 0);
-
-	unsigned char * pInY = (unsigned char *)pDst[0];
-	unsigned char * pOutY = (unsigned char *)sD3DLockedRect.pBits;
-	int iOutStride = sD3DLockedRect.Pitch;
-
-	for (int j=0; j<iHeight; j++)
-		memcpy(pOutY+j*iOutStride, pInY+j*iStride[0], iWidth);//confirmed_safe_unsafe_usage
-	
-	unsigned char * pInV = (unsigned char *)pDst[1];
-	unsigned char * pInU = (unsigned char *)pDst[2];
-	unsigned char * pOutC = pOutY + iOutStride * iHeight;
-	for (int i=0; i<iHeight/2; i++)
-	{
-		for (int j=0; j<iWidth; j+=2)
-		{
-			pOutC[i*iOutStride+j  ] = pInV[i*iStride[1]+j/2];
-			pOutC[i*iOutStride+j+1] = pInU[i*iStride[1]+j/2];
-		}
-	}
-
-	pSurfaceData->UnlockRect();
-
-	return hResult;
-}
-
-HRESULT InitWindow(HWND *hWnd)
-{
-	const TCHAR kszWindowTitle[] = TEXT("Wels Decoder Application");
-	const TCHAR kszWindowClass[] = TEXT("Wels Decoder Class");
-
-	WNDCLASSEX sWndClassEx = {0};
-	sWndClassEx.cbSize          = sizeof(WNDCLASSEX); 
-	sWndClassEx.style			= CS_HREDRAW | CS_VREDRAW;
-	sWndClassEx.lpfnWndProc	    = (WNDPROC)WndProc;
-	sWndClassEx.cbClsExtra		= 0;
-	sWndClassEx.cbWndExtra		= 0;
-	sWndClassEx.hInstance		= GetModuleHandle(NULL);
-	sWndClassEx.hIcon			= LoadIcon(sWndClassEx.hInstance, (LPCTSTR)IDI_TESTSHARESURFACE);
-	sWndClassEx.hCursor		    = LoadCursor(NULL, IDC_ARROW);
-	sWndClassEx.hbrBackground	= (HBRUSH)(COLOR_WINDOW + 1);
-	sWndClassEx.lpszMenuName	= (LPCSTR)IDC_TESTSHARESURFACE;
-	sWndClassEx.lpszClassName	= kszWindowClass;
-	sWndClassEx.hIconSm		    = LoadIcon(sWndClassEx.hInstance, (LPCTSTR)IDI_SMALL);
-
-	if (!RegisterClassEx(&sWndClassEx))
-		return E_FAIL;
-
-	HWND hTmpWnd = CreateWindow(kszWindowClass, kszWindowTitle, WS_OVERLAPPEDWINDOW,
-		CW_USEDEFAULT, 0, CW_USEDEFAULT, 0, NULL, NULL, sWndClassEx.hInstance, NULL);
-
-    *hWnd = hTmpWnd;
-	if (!hTmpWnd)
-		return E_FAIL;
-
-	ShowWindow(hTmpWnd, SW_SHOWDEFAULT);
-	UpdateWindow(hTmpWnd);
-
-	return S_OK;
-}
-
-LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam)
-{
-	INT wmId, wmEvent;
-
-	switch (message) 
-	{
-	case WM_COMMAND:
-		wmId    = LOWORD(wParam); 
-		wmEvent = HIWORD(wParam); 
-		switch (wmId)
-		{
-		case IDM_ABOUT:
-			break;
-		case IDM_EXIT:
-			DestroyWindow(hWnd);
-			break;
-		default:
-			return DefWindowProc(hWnd, message, wParam, lParam);
-		}
-		break;
-	case WM_PAINT:
-		ValidateRect(hWnd , NULL);
-		break;
-	case WM_DESTROY:
-		PostQuitMessage(0);
-		break;
-	default:
-		return DefWindowProc(hWnd, message, wParam, lParam);
-	}
-	return 0;
-}
-
-#endif
-
-CUtils::CUtils()
-{
-	hHandle = NULL;
-	iOSType = CheckOS();
-
-#ifdef ENABLE_DISPLAY_MODULE
-	if (iOSType == OS_XP)
-		hHandle = (void *) new CD3D9Utils;
-
-	else if (iOSType == OS_VISTA_UPPER)
-		hHandle = (void *) new CD3D9ExUtils;
-#endif
-
-	if (hHandle == NULL)
-		iOSType = OS_UNSUPPORTED;
-}
-
-CUtils::~CUtils()
-{
-#ifdef ENABLE_DISPLAY_MODULE
-	if (hHandle)
-	{
-		if (iOSType == OS_XP)
-		{
-			CD3D9Utils *hTmp = (CD3D9Utils *) hHandle;
-		    delete hTmp;
-		}
-		else if (iOSType == OS_VISTA_UPPER)
-		{
-			CD3D9ExUtils *hTmp = (CD3D9ExUtils *) hHandle;
-			delete hTmp;
-		}
-		hHandle = NULL;
-	}
-#endif
-}
-
-int CUtils::Process(void *pDst[3], SBufferInfo *pInfo, FILE *pFp)
-{
-	
-	int iRet = 0;
-
-	if (iOSType == OS_UNSUPPORTED)
-	{
-		if (pFp && pDst[0] && pDst[1] && pDst[2] && pInfo)
-		{
-			int iStride[2];
-			int iWidth = pInfo->UsrData.sSystemBuffer.iWidth;
-			int iHeight= pInfo->UsrData.sSystemBuffer.iHeight;
-			iStride[0] = pInfo->UsrData.sSystemBuffer.iStride[0];
-			iStride[1] = pInfo->UsrData.sSystemBuffer.iStride[1];
-
-			Write2File(pFp, (unsigned char **)pDst, iStride, iWidth, iHeight);
-		}
-	}
-
-#ifdef ENABLE_DISPLAY_MODULE
-	else
-	{
-		MSG msg;
-		ZeroMemory( &msg, sizeof(msg) );
-		while( msg.message != WM_QUIT )
-		{
-			if( PeekMessage( &msg, NULL, 0U, 0U, PM_REMOVE ) )
-			{
-				TranslateMessage( &msg );
-				DispatchMessage( &msg );
-			}
-			else
-			{
-				HRESULT hResult = S_OK;
-				if (iOSType == OS_XP)
-					hResult = ((CD3D9Utils *)hHandle)->Process(pDst, pInfo, pFp);
-
-				else if (iOSType == OS_VISTA_UPPER)
-					hResult = ((CD3D9ExUtils *)hHandle)->Process(pDst, pInfo, pFp);
-              
-				iRet = !SUCCEEDED(hResult);
-				break;
-			}		
-		}
-	}	
-#endif
-
-	return iRet;
-}
-
-int CUtils::CheckOS()
-{
-	int iType = OS_UNSUPPORTED;
-
-#ifdef ENABLE_DISPLAY_MODULE
-	OSVERSIONINFOEX osvi;
-	ZeroMemory(&osvi, sizeof(OSVERSIONINFOEX));
-	osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX);
-
-	if( !GetVersionEx ((OSVERSIONINFO *) &osvi) )
-	{
-		osvi.dwOSVersionInfoSize = sizeof (OSVERSIONINFO);
-		if (! GetVersionEx ( (OSVERSIONINFO *) &osvi) ) 
-			return iType;
-	}
-
-	switch (osvi.dwPlatformId)
-	{
-	case VER_PLATFORM_WIN32_NT:	
-		if (osvi.dwMajorVersion >= 6)
-			iType = OS_VISTA_UPPER;
-		else if (osvi.dwMajorVersion == 5)
-			iType = OS_XP;
-		break;		
-
-	default:
-		break;
-	}
-#endif
-
-	return iType;
-}
-
-void Write2File(FILE *pFp, unsigned char* pData[3], int iStride[2], int iWidth, int iHeight)
-{
-	int   i;
-	unsigned char  *pPtr = NULL;
-
-	pPtr = pData[0];
-	for( i=0; i<iHeight; i++ )
-	{
-		fwrite(pPtr, 1, iWidth, pFp);
-		pPtr += iStride[0];
-	}
-
-	iHeight = iHeight/2;
-	iWidth = iWidth/2;
-	pPtr = pData[1];
-	for( i=0; i<iHeight; i++ )
-	{
-		fwrite(pPtr, 1, iWidth, pFp);
-		pPtr += iStride[1];
-	}
-
-	pPtr = pData[2];
-	for( i=0; i<iHeight; i++ )
-	{
-		fwrite(pPtr, 1, iWidth, pFp);
-		pPtr += iStride[1];
-	}
-}
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "d3d9_utils.h"
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+void Write2File (FILE* pFp, unsigned char* pData[3], int iStride[2], int iWidth, int iHeight);
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#ifdef ENABLE_DISPLAY_MODULE
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+#define IDM_ABOUT						104
+#define IDM_EXIT						105
+#define IDI_TESTSHARESURFACE	        107
+#define IDI_SMALL						108
+#define IDC_TESTSHARESURFACE	        109
+
+#define NV12_FORMAT  MAKEFOURCC('N','V','1','2')
+
+// Describes a Direct3D surface: width, height, pixel format and memory pool.
+// NOTE(review): no use of this struct is visible in the surrounding code -
+// confirm it is still needed.
+typedef struct {
+  UINT      uiWidth;
+  UINT      uiHeight;
+  D3DFORMAT D3Dformat;
+  D3DPOOL   D3DPool;
+} SHandleInfo;
+
+// SAFE_RELEASE: call Release() on a non-NULL interface pointer and reset it.
+// SAFE_FREE:    free() a non-NULL buffer and reset the pointer.
+// Both are wrapped in do { } while (0) so that `SAFE_xxx (p);` is exactly one
+// statement and cannot capture a following `else` when used in an unbraced if.
+#define SAFE_RELEASE(p) do { if (p) { (p)->Release(); (p) = NULL; } } while (0)
+#define SAFE_FREE(p)    do { if (p) { free (p); (p) = NULL; } } while (0)
+
+HRESULT Dump2YUV (void* pDst[3], void* pSurface, int iWidth, int iHeight, int iStride[2]);
+HRESULT Dump2Surface (void* pDst[3], void* pSurface, int iWidth, int iHeight, int iStride[2]);
+HRESULT InitWindow (HWND* hWnd);
+LRESULT CALLBACK WndProc (HWND, UINT, WPARAM, LPARAM);
+
+typedef HRESULT (WINAPI* pFnCreateD3D9Ex) (UINT SDKVersion, IDirect3D9Ex**);
+typedef LPDIRECT3D9 (WINAPI* pFnCreateD3D9) (UINT SDKVersion);
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Starts with nothing loaded or created; every handle and interface pointer is
+// NULL and the object is marked as not initialised.
+CD3D9Utils::CD3D9Utils() {
+  // Direct3D interfaces
+  m_lpD3D9RawSurfaceShare = NULL;
+  m_lpD3D9Device = NULL;
+  m_lpD3D9 = NULL;
+
+  // Module/window handles and the dump buffer
+  m_pDumpYUV = NULL;
+  m_hWnd = NULL;
+  m_hDll = NULL;
+
+  m_bInitDone = FALSE;
+
+  // Give the presentation parameters a defined (all-zero) value so no member
+  // is left uninitialised.
+  ZeroMemory (&m_d3dpp, sizeof m_d3dpp);
+}
+
+// Releases whatever the object still holds by delegating to Uninit().
+CD3D9Utils::~CD3D9Utils() {
+  Uninit();
+}
+
+// Loads d3d9.dll, creates the IDirect3D9 object through Direct3DCreate9 and,
+// when bWindowed is set, creates the output window.
+//
+// Returns S_OK when already initialised or on success, E_FAIL when the DLL,
+// the entry point or the Direct3D object is unavailable, otherwise the result
+// of InitWindow(). On any failure everything acquired here is released again
+// so that a later retry does not leak the module handle or the D3D object.
+HRESULT CD3D9Utils::Init (BOOL bWindowed) {
+  if (m_bInitDone)
+    return S_OK;
+
+  m_hDll = LoadLibrary (TEXT ("d3d9.dll"));
+  if (m_hDll == NULL)
+    return E_FAIL;
+
+  // GetProcAddress only accepts a narrow (ANSI) symbol name, so the name must
+  // not go through TEXT(), which yields a wide string in UNICODE builds.
+  pFnCreateD3D9 pCreateD3D9 = (pFnCreateD3D9) GetProcAddress (m_hDll, "Direct3DCreate9");
+  if (pCreateD3D9 == NULL) {
+    Uninit();
+    return E_FAIL;
+  }
+
+  m_lpD3D9 = pCreateD3D9 (D3D_SDK_VERSION);
+  if (m_lpD3D9 == NULL) {
+    Uninit();
+    return E_FAIL;
+  }
+
+  HRESULT hResult = bWindowed ? InitWindow (&m_hWnd) : S_OK;
+  if (FAILED (hResult))
+    Uninit();
+
+  return hResult;
+}
+
+// Releases the offscreen surface, the device and the Direct3D object (in that
+// order), frees the dump buffer and unloads d3d9.dll. Each member is reset to
+// NULL, so calling this repeatedly is harmless. Always returns S_OK.
+HRESULT CD3D9Utils::Uninit() {
+  if (m_lpD3D9RawSurfaceShare) {
+    m_lpD3D9RawSurfaceShare->Release();
+    m_lpD3D9RawSurfaceShare = NULL;
+  }
+  if (m_lpD3D9Device) {
+    m_lpD3D9Device->Release();
+    m_lpD3D9Device = NULL;
+  }
+  if (m_lpD3D9) {
+    m_lpD3D9->Release();
+    m_lpD3D9 = NULL;
+  }
+  if (m_pDumpYUV) {
+    free (m_pDumpYUV);
+    m_pDumpYUV = NULL;
+  }
+
+  // The DLL goes last, after every interface obtained from it is released.
+  if (m_hDll != NULL) {
+    FreeLibrary (m_hDll);
+    m_hDll = NULL;
+  }
+
+  return S_OK;
+}
+
+// Handles one decoded frame: renders it to the window when pFp is NULL,
+// otherwise dumps it to pFp. Initialisation is attempted on first use.
+// Returns E_FAIL for NULL pDst/pInfo, the Init() error when initialisation
+// fails, otherwise the result of Render() or Dump().
+HRESULT CD3D9Utils::Process (void* pDst[3], SBufferInfo* pInfo, FILE* pFp) {
+  if (pDst == NULL || pInfo == NULL)
+    return E_FAIL;
+
+  HRESULT hr = E_FAIL;
+  const BOOL bToWindow = (pFp == NULL) ? TRUE : FALSE;
+
+  if (!m_bInitDone) {
+    // Writing a host-memory frame to a file needs no Direct3D at all, so that
+    // combination is treated as already initialised.
+    const BOOL bNoD3D9Needed = (!bToWindow && pInfo->eBufferProperty == BUFFER_HOST);
+    m_bInitDone = bNoD3D9Needed;
+  }
+
+  if (!m_bInitDone) {
+    hr = Init (bToWindow);
+    if (SUCCEEDED (hr))
+      m_bInitDone = TRUE;
+  }
+
+  if (!m_bInitDone)
+    return hr;
+
+  if (bToWindow) {
+    hr = Render (pDst, pInfo);
+    Sleep (30);
+  } else {
+    // Not windowed means pFp is non-NULL here.
+    hr = Dump (pDst, pInfo, pFp);
+    Sleep (0);
+  }
+
+  return hr;
+}
+
+// Shows one decoded frame in the window: prepares the device and offscreen
+// surface via InitResource(), hands the planes in pDst to Dump2Surface(),
+// stretches the offscreen surface onto back buffer 0 and presents it.
+//
+// Only BUFFER_HOST frames are handled; any other buffer property returns
+// E_FAIL. Returns the first failing HRESULT, otherwise the result of Present().
+HRESULT CD3D9Utils::Render (void* pDst[3], SBufferInfo* pInfo) {
+  HRESULT hResult = E_FAIL;
+
+  if (pInfo->eBufferProperty == BUFFER_HOST) {
+    hResult = InitResource (NULL, pInfo);
+    if (SUCCEEDED (hResult))
+      hResult = Dump2Surface (pDst, m_lpD3D9RawSurfaceShare, pInfo->UsrData.sSystemBuffer.iWidth,
+                              pInfo->UsrData.sSystemBuffer.iHeight, pInfo->UsrData.sSystemBuffer.iStride);
+  }
+
+  if (FAILED (hResult))
+    return hResult;
+
+  IDirect3DSurface9* pBackBuffer = NULL;
+  hResult = m_lpD3D9Device->GetBackBuffer (0, 0, D3DBACKBUFFER_TYPE_MONO, &pBackBuffer);
+  if (SUCCEEDED (hResult)) {
+    hResult = m_lpD3D9Device->StretchRect (m_lpD3D9RawSurfaceShare, NULL, pBackBuffer, NULL, D3DTEXF_NONE);
+    // GetBackBuffer hands out an added reference; give it back every frame,
+    // otherwise the back buffer's reference count grows without bound.
+    pBackBuffer->Release();
+  }
+
+  if (SUCCEEDED (hResult))
+    hResult = m_lpD3D9Device->Present (0, 0, NULL, NULL);
+
+  return hResult;
+}
+
+// Writes one host-memory frame to pFp through Write2File(), using the width,
+// height and strides from pInfo->UsrData.sSystemBuffer.
+//
+// Returns S_OK once the frame has been handed to Write2File(), E_FAIL when
+// pFp or any of the three plane pointers is NULL (nothing is written then).
+HRESULT CD3D9Utils::Dump (void* pDst[3], SBufferInfo* pInfo, FILE* pFp) {
+  if (pFp == NULL || pDst[0] == NULL || pDst[1] == NULL || pDst[2] == NULL)
+    return E_FAIL;
+
+  int iStride[2];
+  int iWidth = pInfo->UsrData.sSystemBuffer.iWidth;
+  int iHeight = pInfo->UsrData.sSystemBuffer.iHeight;
+  iStride[0] = pInfo->UsrData.sSystemBuffer.iStride[0];
+  iStride[1] = pInfo->UsrData.sSystemBuffer.iStride[1];
+
+  Write2File (pFp, (unsigned char**)pDst, iStride, iWidth, iHeight);
+
+  // Write2File() reports no status, so reaching this point counts as success.
+  return S_OK;
+}
+
+// Creates, on first use, the Direct3D device for the adapter showing m_hWnd
+// and the offscreen surface that decoded frames are copied into.
+//
+// pSharedHandle is not used by this implementation.
+// For BUFFER_HOST frames the surface takes its size from
+// pInfo->UsrData.sSystemBuffer and uses the NV12 format in the default pool;
+// for any other buffer property the zero/unknown defaults are passed on.
+//
+// Returns S_OK when both device and surface exist afterwards, E_FAIL when
+// pInfo or the Direct3D object is missing or either resource is absent, and
+// otherwise the failing HRESULT of the Direct3D call that went wrong.
+HRESULT CD3D9Utils::InitResource (void* pSharedHandle, SBufferInfo* pInfo) {
+  if (pInfo == NULL)
+    return E_FAIL;
+
+  if (m_lpD3D9Device == NULL && m_lpD3D9RawSurfaceShare == NULL) {
+    // Init() may have been skipped or may have failed; never dereference NULL.
+    if (m_lpD3D9 == NULL)
+      return E_FAIL;
+
+    // Pick the adapter whose monitor hosts the window, else the default one.
+    HMONITOR hMonitorWnd = MonitorFromWindow (m_hWnd, MONITOR_DEFAULTTONULL);
+
+    UINT uiAdapter = D3DADAPTER_DEFAULT;
+    UINT uiCnt = m_lpD3D9->GetAdapterCount();
+    for (UINT i = 0; i < uiCnt; i++) {
+      HMONITOR hMonitor = m_lpD3D9->GetAdapterMonitor (i);
+      if (hMonitor == hMonitorWnd) {
+        uiAdapter = i;
+        break;
+      }
+    }
+
+    D3DDISPLAYMODE D3DDisplayMode;
+    HRESULT hResult = m_lpD3D9->GetAdapterDisplayMode (uiAdapter, &D3DDisplayMode);
+    if (FAILED (hResult))
+      return hResult;  // D3DDisplayMode.Format would be uninitialised
+
+    D3DDEVTYPE D3DDevType = D3DDEVTYPE_HAL;
+    DWORD dwBehaviorFlags = D3DCREATE_SOFTWARE_VERTEXPROCESSING | D3DCREATE_MULTITHREADED;
+
+    ZeroMemory (&m_d3dpp, sizeof (m_d3dpp));
+    m_d3dpp.Flags = D3DPRESENTFLAG_VIDEO;
+    m_d3dpp.SwapEffect = D3DSWAPEFFECT_DISCARD;
+    m_d3dpp.BackBufferFormat = D3DDisplayMode.Format;
+    m_d3dpp.Windowed = TRUE;
+    m_d3dpp.hDeviceWindow = m_hWnd;
+    m_d3dpp.PresentationInterval = D3DPRESENT_INTERVAL_IMMEDIATE;
+    hResult = m_lpD3D9->CreateDevice (uiAdapter, D3DDevType, NULL, dwBehaviorFlags, &m_d3dpp, &m_lpD3D9Device);
+    if (FAILED (hResult))
+      return hResult;
+    if (m_lpD3D9Device == NULL)
+      return E_FAIL;
+
+    // Defaults cover the non-host case so no argument is ever uninitialised.
+    int iWidth = 0;
+    int iHeight = 0;
+    D3DFORMAT D3Dformat = (D3DFORMAT)D3DFMT_UNKNOWN;
+    D3DPOOL D3Dpool = (D3DPOOL)D3DPOOL_DEFAULT;
+    if (pInfo->eBufferProperty == BUFFER_HOST) {
+      iWidth = pInfo->UsrData.sSystemBuffer.iWidth;
+      iHeight = pInfo->UsrData.sSystemBuffer.iHeight;
+      D3Dformat = (D3DFORMAT)NV12_FORMAT;
+      D3Dpool = (D3DPOOL)D3DPOOL_DEFAULT;
+    }
+
+    hResult = m_lpD3D9Device->CreateOffscreenPlainSurface (iWidth, iHeight, (D3DFORMAT)D3Dformat, (D3DPOOL)D3Dpool,
+              &m_lpD3D9RawSurfaceShare, NULL);
+    if (FAILED (hResult))
+      return hResult;
+  }
+
+  if (m_lpD3D9Device == NULL || m_lpD3D9RawSurfaceShare == NULL)
+    return E_FAIL;
+
+  return S_OK;
+}
+
+// Starts with nothing loaded or created; every handle and interface pointer is
+// NULL and the object is marked as not initialised.
+CD3D9ExUtils::CD3D9ExUtils() {
+  // Direct3D interfaces
+  m_lpD3D9RawSurfaceShare = NULL;
+  m_lpD3D9Device = NULL;
+  m_lpD3D9 = NULL;
+
+  // Module/window handles and the dump buffer
+  m_pDumpYUV = NULL;
+  m_hWnd = NULL;
+  m_hDll = NULL;
+
+  m_bInitDone = FALSE;
+
+  // Give the presentation parameters a defined (all-zero) value so no member
+  // is left uninitialised.
+  ZeroMemory (&m_d3dpp, sizeof m_d3dpp);
+}
+
+// Releases whatever the object still holds by delegating to Uninit().
+CD3D9ExUtils::~CD3D9ExUtils() {
+  Uninit();
+}
+
+// Loads d3d9.dll, creates the IDirect3D9Ex object through Direct3DCreate9Ex
+// and, when bWindowed is set, creates the output window.
+//
+// Returns S_OK when already initialised or on success, E_FAIL when the DLL or
+// the entry point is unavailable, the Direct3DCreate9Ex error when object
+// creation fails, otherwise the result of InitWindow(). On any failure
+// everything acquired here is released again so that a later retry does not
+// leak the module handle or the D3D object.
+HRESULT CD3D9ExUtils::Init (BOOL bWindowed) {
+  if (m_bInitDone)
+    return S_OK;
+
+  m_hDll = LoadLibrary (TEXT ("d3d9.dll"));
+  if (m_hDll == NULL)
+    return E_FAIL;
+
+  // GetProcAddress only accepts a narrow (ANSI) symbol name, so the name must
+  // not go through TEXT(), which yields a wide string in UNICODE builds.
+  pFnCreateD3D9Ex pCreateD3D9Ex = (pFnCreateD3D9Ex) GetProcAddress (m_hDll, "Direct3DCreate9Ex");
+  if (pCreateD3D9Ex == NULL) {
+    Uninit();
+    return E_FAIL;
+  }
+
+  HRESULT hResult = pCreateD3D9Ex (D3D_SDK_VERSION, &m_lpD3D9);
+  if (FAILED (hResult) || m_lpD3D9 == NULL) {
+    Uninit();
+    return FAILED (hResult) ? hResult : E_FAIL;
+  }
+
+  hResult = bWindowed ? InitWindow (&m_hWnd) : S_OK;
+  if (FAILED (hResult))
+    Uninit();
+
+  return hResult;
+}
+
+// Releases the offscreen surface, the device and the Direct3D object (in that
+// order), frees the dump buffer and unloads d3d9.dll. Each member is reset to
+// NULL, so calling this repeatedly is harmless. Always returns S_OK.
+HRESULT CD3D9ExUtils::Uninit() {
+  if (m_lpD3D9RawSurfaceShare) {
+    m_lpD3D9RawSurfaceShare->Release();
+    m_lpD3D9RawSurfaceShare = NULL;
+  }
+  if (m_lpD3D9Device) {
+    m_lpD3D9Device->Release();
+    m_lpD3D9Device = NULL;
+  }
+  if (m_lpD3D9) {
+    m_lpD3D9->Release();
+    m_lpD3D9 = NULL;
+  }
+  if (m_pDumpYUV) {
+    free (m_pDumpYUV);
+    m_pDumpYUV = NULL;
+  }
+
+  // The DLL goes last, after every interface obtained from it is released.
+  if (m_hDll != NULL) {
+    FreeLibrary (m_hDll);
+    m_hDll = NULL;
+  }
+
+  return S_OK;
+}
+
+// Handles one decoded frame: renders it to the window when pFp is NULL,
+// otherwise dumps it to pFp. Initialisation is attempted on first use.
+// Returns E_FAIL for NULL pDst/pInfo, the Init() error when initialisation
+// fails, otherwise the result of Render() or Dump().
+HRESULT CD3D9ExUtils::Process (void* pDst[3], SBufferInfo* pInfo, FILE* pFp) {
+  if (pDst == NULL || pInfo == NULL)
+    return E_FAIL;
+
+  HRESULT hr = E_FAIL;
+  const BOOL bToWindow = (pFp == NULL) ? TRUE : FALSE;
+
+  if (!m_bInitDone) {
+    // Writing a host-memory frame to a file needs no Direct3D at all, so that
+    // combination is treated as already initialised.
+    const BOOL bNoD3D9Needed = (!bToWindow && pInfo->eBufferProperty == BUFFER_HOST);
+    m_bInitDone = bNoD3D9Needed;
+  }
+
+  if (!m_bInitDone) {
+    hr = Init (bToWindow);
+    if (SUCCEEDED (hr))
+      m_bInitDone = TRUE;
+  }
+
+  if (!m_bInitDone)
+    return hr;
+
+  if (bToWindow) {
+    hr = Render (pDst, pInfo);
+    Sleep (30); // simple pacing, defaulting to roughly 30fps
+  } else {
+    // Not windowed means pFp is non-NULL here.
+    hr = Dump (pDst, pInfo, pFp);
+    Sleep (0);
+  }
+
+  return hr;
+}
+
+// Shows one decoded frame in the window.
+//  - BUFFER_HOST:   prepares device and surface via InitResource() and hands
+//                   the planes in pDst to Dump2Surface().
+//  - BUFFER_DEVICE: passes pDst[0] to InitResource() as the shared handle.
+// The offscreen surface is then stretched onto back buffer 0 and presented.
+//
+// Any other buffer property returns E_FAIL. Returns the first failing
+// HRESULT, otherwise the result of PresentEx().
+HRESULT CD3D9ExUtils::Render (void* pDst[3], SBufferInfo* pInfo) {
+  HRESULT hResult = E_FAIL;
+  EBufferProperty eBufferProperty = pInfo->eBufferProperty;
+
+  if (eBufferProperty == BUFFER_HOST) {
+    hResult = InitResource (NULL, pInfo);
+    if (SUCCEEDED (hResult))
+      hResult = Dump2Surface (pDst, m_lpD3D9RawSurfaceShare, pInfo->UsrData.sSystemBuffer.iWidth,
+                              pInfo->UsrData.sSystemBuffer.iHeight, pInfo->UsrData.sSystemBuffer.iStride);
+  } else if (eBufferProperty == BUFFER_DEVICE) {
+    VOID* pSharedHandle = pDst[0];
+    hResult = InitResource (pSharedHandle, pInfo);
+  }
+
+  if (FAILED (hResult))
+    return hResult;
+
+  IDirect3DSurface9* pBackBuffer = NULL;
+  hResult = m_lpD3D9Device->GetBackBuffer (0, 0, D3DBACKBUFFER_TYPE_MONO, &pBackBuffer);
+  if (SUCCEEDED (hResult)) {
+    hResult = m_lpD3D9Device->StretchRect (m_lpD3D9RawSurfaceShare, NULL, pBackBuffer, NULL, D3DTEXF_NONE);
+    // GetBackBuffer hands out an added reference; give it back every frame,
+    // otherwise the back buffer's reference count grows without bound.
+    pBackBuffer->Release();
+  }
+
+  if (SUCCEEDED (hResult))
+    hResult = m_lpD3D9Device->PresentEx (0, 0, NULL, NULL, 0);
+
+  return hResult;
+}
+
+// Writes one frame to pFp through Write2File().
+//  - BUFFER_HOST: the planes in pDst are written directly, with size and
+//    strides taken from pInfo->UsrData.sSystemBuffer.
+//  - otherwise:   pDst[1] is taken as the source surface; Dump2YUV() converts
+//    it into m_pDumpYUV, and pDst[0..2] are REPLACED with pointers into that
+//    buffer (the caller's array is modified) before writing.
+//
+// Returns S_OK (or the Dump2YUV() result) after the frame was handed to
+// Write2File(), E_OUTOFMEMORY when the staging buffer cannot be allocated,
+// the Dump2YUV() error when conversion fails, and E_FAIL when pFp or a plane
+// pointer is NULL. Nothing is written on any failure.
+HRESULT CD3D9ExUtils::Dump (void* pDst[3], SBufferInfo* pInfo, FILE* pFp) {
+  HRESULT hResult = E_FAIL;
+  int iStride[2];
+  int iWidth;
+  int iHeight;
+
+  if (pInfo->eBufferProperty != BUFFER_HOST) {
+    iWidth = pInfo->UsrData.sVideoBuffer.iSurfaceWidth;
+    iHeight = pInfo->UsrData.sVideoBuffer.iSurfaceHeight;
+    iStride[0] = iWidth;
+    iStride[1] = iWidth / 2;
+
+    // NOTE(review): the buffer is sized from the first frame only; a later
+    // frame with a larger surface would overrun it - confirm the surface size
+    // cannot change during a session.
+    if (m_pDumpYUV == NULL) {
+      m_pDumpYUV = (unsigned char*)malloc (iWidth * iHeight * 3 / 2 * sizeof (unsigned char));
+    }
+
+    // Without a staging buffer pDst still holds device-side values that must
+    // not be written out as pixel data.
+    if (m_pDumpYUV == NULL)
+      return E_OUTOFMEMORY;
+
+    void* pSurface = pDst[1];
+    pDst[0] = m_pDumpYUV;
+    pDst[1] = m_pDumpYUV + iHeight * iStride[0] * sizeof (unsigned char);
+    pDst[2] = m_pDumpYUV + iHeight * iStride[0] * 5 / 4 * sizeof (unsigned char);
+    hResult = Dump2YUV (pDst, pSurface, iWidth, iHeight, iStride);
+    if (FAILED (hResult))
+      return hResult;  // buffer contents are not a valid frame
+  } else {
+    iWidth = pInfo->UsrData.sSystemBuffer.iWidth;
+    iHeight = pInfo->UsrData.sSystemBuffer.iHeight;
+    iStride[0] = pInfo->UsrData.sSystemBuffer.iStride[0];
+    iStride[1] = pInfo->UsrData.sSystemBuffer.iStride[1];
+    hResult = S_OK;
+  }
+
+  if (pFp == NULL || pDst[0] == NULL || pDst[1] == NULL || pDst[2] == NULL)
+    return E_FAIL;
+
+  Write2File (pFp, (unsigned char**)pDst, iStride, iWidth, iHeight);
+
+  return hResult;
+}
+
+// Lazily creates the D3D9Ex device and the off-screen surface used for
+// rendering. Does nothing once either of them exists.
+//   pSharedHandle - handle passed (by address) to CreateOffscreenPlainSurface
+//   pInfo         - buffer description; selects surface size, format and pool
+// Returns E_FAIL when pInfo or the D3D9Ex object is NULL or when device /
+// surface are still missing afterwards; a failing CreateDeviceEx() code is
+// returned as is.
+HRESULT CD3D9ExUtils::InitResource (void* pSharedHandle, SBufferInfo* pInfo) {
+  HRESULT hResult = S_OK;
+  int iWidth;
+  int iHeight;
+  D3DFORMAT D3Dformat;
+  D3DPOOL D3Dpool;
+
+  if (pInfo == NULL)
+    return E_FAIL;
+
+  if (m_lpD3D9Device == NULL && m_lpD3D9RawSurfaceShare == NULL) {
+    if (m_lpD3D9 == NULL)
+      return E_FAIL;
+
+    // Pick the adapter that drives the monitor showing our window; keep the
+    // default adapter when no monitor matches.
+    HMONITOR hMonitorWnd = MonitorFromWindow (m_hWnd, MONITOR_DEFAULTTONULL);
+
+    UINT uiAdapter = D3DADAPTER_DEFAULT;
+    UINT uiCnt = m_lpD3D9->GetAdapterCount();
+    for (UINT i = 0; i < uiCnt; i++) {
+      HMONITOR hMonitor = m_lpD3D9->GetAdapterMonitor (i);
+      if (hMonitor == hMonitorWnd) {
+        uiAdapter = i;
+        break;
+      }
+    }
+
+    D3DDISPLAYMODEEX D3DDisplayMode;
+    ZeroMemory (&D3DDisplayMode, sizeof (D3DDisplayMode));
+    D3DDisplayMode.Size = sizeof (D3DDISPLAYMODEEX);
+    hResult = m_lpD3D9->GetAdapterDisplayModeEx (uiAdapter, &D3DDisplayMode, NULL);
+    // Never feed an unfilled format to the device; the device below is
+    // created windowed, so fall back to D3DFMT_UNKNOWN when the query fails.
+    if (FAILED (hResult))
+      D3DDisplayMode.Format = D3DFMT_UNKNOWN;
+
+    D3DDEVTYPE D3DDevType = D3DDEVTYPE_HAL;
+    DWORD dwBehaviorFlags = D3DCREATE_SOFTWARE_VERTEXPROCESSING | D3DCREATE_MULTITHREADED;
+
+    ZeroMemory (&m_d3dpp, sizeof (m_d3dpp));
+    m_d3dpp.Flags = D3DPRESENTFLAG_VIDEO;
+    m_d3dpp.SwapEffect = D3DSWAPEFFECT_DISCARD;
+    m_d3dpp.BackBufferFormat = D3DDisplayMode.Format;
+    m_d3dpp.Windowed = TRUE;
+    m_d3dpp.hDeviceWindow = m_hWnd;
+    m_d3dpp.PresentationInterval = D3DPRESENT_INTERVAL_IMMEDIATE;
+    hResult = m_lpD3D9->CreateDeviceEx (uiAdapter, D3DDevType, NULL, dwBehaviorFlags, &m_d3dpp, NULL, &m_lpD3D9Device);
+    // The device pointer is used right below; stop here when creation failed.
+    if (FAILED (hResult))
+      return hResult;
+    if (m_lpD3D9Device == NULL)
+      return E_FAIL;
+
+    if (pInfo->eBufferProperty == BUFFER_HOST) {
+      iWidth = pInfo->UsrData.sSystemBuffer.iWidth;
+      iHeight = pInfo->UsrData.sSystemBuffer.iHeight;
+      D3Dformat = (D3DFORMAT)NV12_FORMAT;
+      D3Dpool = (D3DPOOL)D3DPOOL_DEFAULT;
+    } else {
+      iWidth = pInfo->UsrData.sVideoBuffer.iSurfaceWidth;
+      iHeight = pInfo->UsrData.sVideoBuffer.iSurfaceHeight;
+      D3Dformat = (D3DFORMAT)pInfo->UsrData.sVideoBuffer.D3Dformat;
+      D3Dpool = (D3DPOOL)pInfo->UsrData.sVideoBuffer.D3DPool;
+    }
+    hResult = m_lpD3D9Device->CreateOffscreenPlainSurface (iWidth, iHeight, D3Dformat, D3Dpool,
+              &m_lpD3D9RawSurfaceShare, &pSharedHandle);
+  }
+
+  if (m_lpD3D9Device == NULL || m_lpD3D9RawSurfaceShare == NULL)
+    hResult = E_FAIL;
+
+  return hResult;
+}
+
+
+// Copies a locked D3D surface into three separate planes.
+// The surface is read as iHeight luma rows followed, at offset
+// Pitch * iHeight, by iHeight / 2 rows of byte-interleaved chroma pairs
+// (presumably NV12 - the surface format is not checked here).
+//   pDst     - destination planes; pDst[1] gets the first byte of every
+//              chroma pair, pDst[2] the second one
+//   pSurface - IDirect3DSurface9* to read from
+//   iWidth   - picture width in pixels
+//   iHeight  - picture height in pixels
+//   iStride  - destination strides: [0] for pDst[0], [1] for pDst[1]/pDst[2]
+// Returns the LockRect() result, or E_FAIL for missing arguments or a lock
+// that yields no memory.
+HRESULT Dump2YUV (void* pDst[3], void* pSurface, int iWidth, int iHeight, int iStride[2]) {
+  HRESULT hResult = E_FAIL;
+
+  if (!pDst || !iStride || !pDst[0] || !pDst[1] || !pDst[2] || !pSurface)
+    return hResult;
+
+  IDirect3DSurface9* pSurfaceData = (IDirect3DSurface9*)pSurface;
+  D3DLOCKED_RECT sD3DLockedRect = {0};
+  hResult = pSurfaceData->LockRect (&sD3DLockedRect, NULL, 0);
+  // Do not touch pBits unless the lock really succeeded.
+  if (FAILED (hResult))
+    return hResult;
+  if (sD3DLockedRect.pBits == NULL) {
+    pSurfaceData->UnlockRect();
+    return E_FAIL;
+  }
+
+  unsigned char* pInY = (unsigned char*)sD3DLockedRect.pBits;
+  unsigned char* pOutY = (unsigned char*)pDst[0];
+  int iInStride = sD3DLockedRect.Pitch;
+  int iOutStride = iStride[0];
+
+  for (int j = 0; j < iHeight; j++)
+    memcpy (pOutY + j * iOutStride, pInY + j * iInStride, iWidth); //confirmed_safe_unsafe_usage
+
+  // De-interleave the chroma rows that follow the luma rows.
+  unsigned char* pOutFirst = (unsigned char*)pDst[1];
+  unsigned char* pOutSecond = (unsigned char*)pDst[2];
+  unsigned char* pInC = pInY + iInStride * iHeight;
+  iOutStride = iStride[1];
+  for (int i = 0; i < iHeight / 2; i++) {
+    for (int j = 0; j < iWidth; j += 2) {
+      pOutFirst[i * iOutStride + j / 2] = pInC[i * iInStride + j  ];
+      pOutSecond[i * iOutStride + j / 2] = pInC[i * iInStride + j + 1];
+    }
+  }
+
+  pSurfaceData->UnlockRect();
+
+  return hResult;
+}
+
+// Copies three separate planes into a locked D3D surface.
+// The surface is written as iHeight luma rows followed, at offset
+// Pitch * iHeight, by iHeight / 2 rows of byte-interleaved chroma pairs.
+//   pDst     - source planes (despite the name); pDst[1] supplies the first
+//              byte of every chroma pair, pDst[2] the second one
+//   pSurface - IDirect3DSurface9* to write to
+//   iWidth   - picture width in pixels
+//   iHeight  - picture height in pixels
+//   iStride  - source strides: [0] for pDst[0], [1] for pDst[1]/pDst[2]
+// Returns the LockRect() result, or E_FAIL for missing arguments or a lock
+// that yields no memory.
+HRESULT Dump2Surface (void* pDst[3], void* pSurface, int iWidth, int iHeight, int iStride[2]) {
+  HRESULT hResult = E_FAIL;
+
+  if (!pDst || !iStride || !pDst[0] || !pDst[1] || !pDst[2] || !pSurface)
+    return hResult;
+
+  IDirect3DSurface9* pSurfaceData = (IDirect3DSurface9*)pSurface;
+  D3DLOCKED_RECT sD3DLockedRect = {0};
+  hResult = pSurfaceData->LockRect (&sD3DLockedRect, NULL, 0);
+  // Do not touch pBits unless the lock really succeeded.
+  if (FAILED (hResult))
+    return hResult;
+  if (sD3DLockedRect.pBits == NULL) {
+    pSurfaceData->UnlockRect();
+    return E_FAIL;
+  }
+
+  unsigned char* pInY = (unsigned char*)pDst[0];
+  unsigned char* pOutY = (unsigned char*)sD3DLockedRect.pBits;
+  int iOutStride = sD3DLockedRect.Pitch;
+
+  for (int j = 0; j < iHeight; j++)
+    memcpy (pOutY + j * iOutStride, pInY + j * iStride[0], iWidth); //confirmed_safe_unsafe_usage
+
+  // Interleave the two chroma planes behind the luma rows.
+  unsigned char* pInFirst = (unsigned char*)pDst[1];
+  unsigned char* pInSecond = (unsigned char*)pDst[2];
+  unsigned char* pOutC = pOutY + iOutStride * iHeight;
+  for (int i = 0; i < iHeight / 2; i++) {
+    for (int j = 0; j < iWidth; j += 2) {
+      pOutC[i * iOutStride + j  ] = pInFirst[i * iStride[1] + j / 2];
+      pOutC[i * iOutStride + j + 1] = pInSecond[i * iStride[1] + j / 2];
+    }
+  }
+
+  pSurfaceData->UnlockRect();
+
+  return hResult;
+}
+
+// Registers the window class and creates and shows the output window.
+//   hWnd - out: receives the created window handle (NULL when CreateWindow
+//          fails); left untouched when class registration fails
+// Returns S_OK on success, E_FAIL when hWnd is NULL or when registering the
+// class or creating the window fails.
+HRESULT InitWindow (HWND* hWnd) {
+  const TCHAR kszWindowTitle[] = TEXT ("Wels Decoder Application");
+  const TCHAR kszWindowClass[] = TEXT ("Wels Decoder Class");
+
+  if (hWnd == NULL)
+    return E_FAIL;
+
+  WNDCLASSEX sWndClassEx = {0};
+  sWndClassEx.cbSize        = sizeof (WNDCLASSEX);
+  sWndClassEx.style         = CS_HREDRAW | CS_VREDRAW;
+  sWndClassEx.lpfnWndProc   = (WNDPROC)WndProc;
+  sWndClassEx.cbClsExtra    = 0;
+  sWndClassEx.cbWndExtra    = 0;
+  sWndClassEx.hInstance     = GetModuleHandle (NULL);
+  sWndClassEx.hIcon         = LoadIcon (sWndClassEx.hInstance, (LPCTSTR)IDI_TESTSHARESURFACE);
+  sWndClassEx.hCursor       = LoadCursor (NULL, IDC_ARROW);
+  sWndClassEx.hbrBackground = (HBRUSH) (COLOR_WINDOW + 1);
+  // LPCTSTR (not LPCSTR) so the cast matches the field type in both ANSI and
+  // UNICODE builds, like the icon casts in this function.
+  sWndClassEx.lpszMenuName  = (LPCTSTR)IDC_TESTSHARESURFACE;
+  sWndClassEx.lpszClassName = kszWindowClass;
+  sWndClassEx.hIconSm       = LoadIcon (sWndClassEx.hInstance, (LPCTSTR)IDI_SMALL);
+
+  if (!RegisterClassEx (&sWndClassEx))
+    return E_FAIL;
+
+  HWND hTmpWnd = CreateWindow (kszWindowClass, kszWindowTitle, WS_OVERLAPPEDWINDOW,
+                               CW_USEDEFAULT, 0, CW_USEDEFAULT, 0, NULL, NULL, sWndClassEx.hInstance, NULL);
+
+  *hWnd = hTmpWnd;
+  if (!hTmpWnd)
+    return E_FAIL;
+
+  ShowWindow (hTmpWnd, SW_SHOWDEFAULT);
+  UpdateWindow (hTmpWnd);
+
+  return S_OK;
+}
+
+// Window procedure of the output window.
+// Handles the menu commands IDM_ABOUT (ignored) and IDM_EXIT (destroys the
+// window), validates the whole client area on WM_PAINT and posts the quit
+// message on WM_DESTROY. Everything else goes to DefWindowProc().
+// Returns 0 for messages handled here.
+LRESULT CALLBACK WndProc (HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam) {
+  switch (message) {
+  case WM_COMMAND: {
+    // Low word of wParam carries the command identifier.
+    const INT wmId = LOWORD (wParam);
+    switch (wmId) {
+    case IDM_ABOUT:
+      break;
+    case IDM_EXIT:
+      DestroyWindow (hWnd);
+      break;
+    default:
+      return DefWindowProc (hWnd, message, wParam, lParam);
+    }
+    break;
+  }
+  case WM_PAINT:
+    // The picture is presented elsewhere; just mark the window as painted.
+    ValidateRect (hWnd , NULL);
+    break;
+  case WM_DESTROY:
+    PostQuitMessage (0);
+    break;
+  default:
+    return DefWindowProc (hWnd, message, wParam, lParam);
+  }
+  return 0;
+}
+
+#endif
+
+// Detects the OS and, when the display module is compiled in, creates the
+// matching D3D9 helper. Without a helper the OS type is forced to
+// OS_UNSUPPORTED.
+CUtils::CUtils() {
+  hHandle = NULL;
+  iOSType = CheckOS();
+
+#ifdef ENABLE_DISPLAY_MODULE
+  switch (iOSType) {
+  case OS_XP:
+    hHandle = (void*) new CD3D9Utils;
+    break;
+  case OS_VISTA_UPPER:
+    hHandle = (void*) new CD3D9ExUtils;
+    break;
+  default:
+    break;
+  }
+#endif
+
+  if (!hHandle)
+    iOSType = OS_UNSUPPORTED;
+}
+
+// Destroys the D3D9 helper created by the constructor, using the concrete
+// type selected by iOSType.
+CUtils::~CUtils() {
+#ifdef ENABLE_DISPLAY_MODULE
+  if (hHandle == NULL)
+    return;
+
+  switch (iOSType) {
+  case OS_XP:
+    delete (CD3D9Utils*) hHandle;
+    break;
+  case OS_VISTA_UPPER:
+    delete (CD3D9ExUtils*) hHandle;
+    break;
+  default:
+    break;
+  }
+  hHandle = NULL;
+#endif
+}
+
+// Outputs one decoded picture.
+// With OS_UNSUPPORTED the planes are written to pFp (when pFp, all three
+// planes and pInfo are present). Otherwise pending window messages are
+// dispatched first and the picture is then passed to the D3D9 helper.
+// Returns 0, or 1 when the helper reports a failure.
+int CUtils::Process (void* pDst[3], SBufferInfo* pInfo, FILE* pFp) {
+  int iRet = 0;
+
+  if (iOSType == OS_UNSUPPORTED) {
+    const bool bCanWrite = pFp && pDst[0] && pDst[1] && pDst[2] && pInfo;
+    if (bCanWrite) {
+      int iStride[2] = {
+        pInfo->UsrData.sSystemBuffer.iStride[0],
+        pInfo->UsrData.sSystemBuffer.iStride[1]
+      };
+      Write2File (pFp, (unsigned char**)pDst, iStride,
+                  pInfo->UsrData.sSystemBuffer.iWidth, pInfo->UsrData.sSystemBuffer.iHeight);
+    }
+    return iRet;
+  }
+
+#ifdef ENABLE_DISPLAY_MODULE
+  MSG sMsg;
+  ZeroMemory (&sMsg, sizeof (sMsg));
+  while (sMsg.message != WM_QUIT) {
+    if (!PeekMessage (&sMsg, NULL, 0U, 0U, PM_REMOVE)) {
+      // Message queue is empty: hand the picture to the helper and leave.
+      HRESULT hStatus = S_OK;
+      if (iOSType == OS_XP)
+        hStatus = ((CD3D9Utils*)hHandle)->Process (pDst, pInfo, pFp);
+      else if (iOSType == OS_VISTA_UPPER)
+        hStatus = ((CD3D9ExUtils*)hHandle)->Process (pDst, pInfo, pFp);
+
+      iRet = FAILED (hStatus) ? 1 : 0;
+      break;
+    }
+
+    TranslateMessage (&sMsg);
+    DispatchMessage (&sMsg);
+  }
+#endif
+
+  return iRet;
+}
+
+// Classifies the running OS from GetVersionEx().
+// Returns OS_VISTA_UPPER for NT major version >= 6, OS_XP for NT major
+// version 5 and OS_UNSUPPORTED otherwise (also when the display module is
+// not compiled in or the version query fails).
+int CUtils::CheckOS() {
+#ifdef ENABLE_DISPLAY_MODULE
+  OSVERSIONINFOEX sVersion;
+  ZeroMemory (&sVersion, sizeof (OSVERSIONINFOEX));
+  sVersion.dwOSVersionInfoSize = sizeof (OSVERSIONINFOEX);
+
+  BOOL bQueried = GetVersionEx ((OSVERSIONINFO*) &sVersion);
+  if (!bQueried) {
+    // Retry with the size of the plain structure.
+    sVersion.dwOSVersionInfoSize = sizeof (OSVERSIONINFO);
+    bQueried = GetVersionEx ((OSVERSIONINFO*) &sVersion);
+  }
+  if (!bQueried)
+    return OS_UNSUPPORTED;
+
+  if (sVersion.dwPlatformId != VER_PLATFORM_WIN32_NT)
+    return OS_UNSUPPORTED;
+
+  if (sVersion.dwMajorVersion >= 6)
+    return OS_VISTA_UPPER;
+  if (sVersion.dwMajorVersion == 5)
+    return OS_XP;
+#endif
+
+  return OS_UNSUPPORTED;
+}
+
+// Writes three planes to pFp row by row, without stride padding.
+//   pFp     - destination file
+//   pData   - plane pointers; plane 0 is written with iWidth x iHeight,
+//             planes 1 and 2 with (iWidth / 2) x (iHeight / 2)
+//   iStride - row pitch in bytes: [0] for plane 0, [1] for planes 1 and 2
+//   iWidth  - width of plane 0 in bytes
+//   iHeight - number of rows of plane 0
+void Write2File (FILE* pFp, unsigned char* pData[3], int iStride[2], int iWidth, int iHeight) {
+  const int kiPlaneWidth[3]  = { iWidth,  iWidth / 2,  iWidth / 2 };
+  const int kiPlaneHeight[3] = { iHeight, iHeight / 2, iHeight / 2 };
+
+  for (int iPlane = 0; iPlane < 3; ++iPlane) {
+    const unsigned char* pRow = pData[iPlane];
+    for (int iRow = 0; iRow < kiPlaneHeight[iPlane]; ++iRow) {
+      fwrite (pRow, 1, kiPlaneWidth[iPlane], pFp);
+      pRow += iStride[iPlane == 0 ? 0 : 1];
+    }
+  }
+}
--- a/codec/console/dec/src/h264dec.cpp
+++ b/codec/console/dec/src/h264dec.cpp
@@ -39,6 +39,7 @@
 #endif
 #include <stdio.h>
 #include <stdlib.h>
+#include <stdarg.h>
 
 #include "codec_def.h"
 #include "codec_app_def.h"
@@ -47,9 +48,10 @@
 #include "../../decoder/core/inc/typedefs.h"
 #include "../../decoder/core/inc/measure_time.h"
 #include "d3d9_utils.h"
+#include "logging.h"
 
-typedef long   (*PCreateDecoderFunc) (ISVCDecoder** ppDecoder);
-typedef void_t (*PDestroyDecoderFunc)(ISVCDecoder* pDecoder);
+typedef long (*PCreateDecoderFunc) (ISVCDecoder** ppDecoder);
+typedef void_t (*PDestroyDecoderFunc) (ISVCDecoder* pDecoder);
 
 
 using namespace std;
@@ -58,469 +60,438 @@
 
 //#define STICK_STREAM_SIZE	// For Demo interfaces test with track file of integrated frames
 
-void_t H264DecodeInstance( ISVCDecoder* pDecoder, const char* kpH264FileName, const char* kpOuputFileName, int32_t& iWidth, int32_t& iHeight, void_t* pOptionFileName )
-{
-	FILE *pH264File	  = NULL;
-	FILE *pYuvFile	  = NULL;
-	FILE *pOptionFile = NULL;
-	int64_t iStart = 0, iEnd = 0, iTotal = 0;
-	int32_t iSliceSize;
-	int32_t iSliceIndex = 0;
-	uint8_t* pBuf = NULL;
-	uint8_t uiStartCode[4] = {0, 0, 0, 1};
+void_t H264DecodeInstance (ISVCDecoder* pDecoder, const char* kpH264FileName, const char* kpOuputFileName,
+                           int32_t& iWidth, int32_t& iHeight, void_t* pOptionFileName) {
+  FILE* pH264File	  = NULL;
+  FILE* pYuvFile	  = NULL;
+  FILE* pOptionFile = NULL;
+  int64_t iStart = 0, iEnd = 0, iTotal = 0;
+  int32_t iSliceSize;
+  int32_t iSliceIndex = 0;
+  uint8_t* pBuf = NULL;
+  uint8_t uiStartCode[4] = {0, 0, 0, 1};
 
-	void_t *pData[3] = {NULL};
-	uint8_t *pDst[3] = {NULL};
-	SBufferInfo sDstBufInfo;
+  void_t* pData[3] = {NULL};
+  uint8_t* pDst[3] = {NULL};
+  SBufferInfo sDstBufInfo;
 
-	int32_t iBufPos = 0;
-	int32_t iFileSize;
-	int32_t i = 0;
-	int32_t iLastWidth = 0, iLastHeight = 0;
-	int32_t iFrameCount = 0;
-	int32_t iEndOfStreamFlag = 0;
-	int32_t iColorFormat = videoFormatInternal;
-	static int32_t iFrameNum = 0;
+  int32_t iBufPos = 0;
+  int32_t iFileSize;
+  int32_t i = 0;
+  int32_t iLastWidth = 0, iLastHeight = 0;
+  int32_t iFrameCount = 0;
+  int32_t iEndOfStreamFlag = 0;
+  int32_t iColorFormat = videoFormatInternal;
+  static int32_t iFrameNum = 0;
 
-	EDecodeMode     eDecoderMode    = AUTO_MODE;
-	EBufferProperty	eOutputProperty = BUFFER_DEVICE;
-	
-	CUtils cOutputModule;
-	double dElapsed = 0;
+  EDecodeMode     eDecoderMode    = AUTO_MODE;
+  EBufferProperty	eOutputProperty = BUFFER_DEVICE;
 
-	if (pDecoder == NULL) return;	
-	if (kpH264FileName)
-	{
-		pH264File = fopen(kpH264FileName,"rb");
-		if (pH264File == NULL){
-			fprintf(stderr, "Can not open h264 source file, check its legal path related please..\n");
-			return;
-		}
-		fprintf(stderr, "H264 source file name: %s..\n",kpH264FileName);
-	}
-	else
-	{
-		fprintf(stderr, "Can not find any h264 bitstream file to read..\n");
-		fprintf(stderr, "----------------decoder return------------------------\n" );
-		return;
-	}
+  CUtils cOutputModule;
+  double dElapsed = 0;
 
-	if (kpOuputFileName){
-		pYuvFile = fopen(kpOuputFileName, "wb");
-		if (pYuvFile == NULL){
-			fprintf(stderr, "Can not open yuv file to output result of decoding..\n");
-			// any options
-			//return;	// can let decoder work in quiet mode, no writing any output
-		}
-		else
-			fprintf(stderr, "Sequence output file name: %s..\n", kpOuputFileName);
-	}
-	else{
-		fprintf(stderr, "Can not find any output file to write..\n");
-		// any options
-	}
-	
-	if (pOptionFileName){
-		pOptionFile = fopen((char*)pOptionFileName, "wb");
-		if ( pOptionFile == NULL ){
-			fprintf(stderr, "Can not open optional file for write..\n");
-		}
-		else
-			fprintf(stderr, "Extra optional file: %s..\n", (char*)pOptionFileName);
-	}
+  if (pDecoder == NULL) return;
+  if (kpH264FileName) {
+    pH264File = fopen (kpH264FileName, "rb");
+    if (pH264File == NULL) {
+      fprintf (stderr, "Can not open h264 source file, check its legal path related please..\n");
+      return;
+    }
+    fprintf (stderr, "H264 source file name: %s..\n", kpH264FileName);
+  } else {
+    fprintf (stderr, "Can not find any h264 bitstream file to read..\n");
+    fprintf (stderr, "----------------decoder return------------------------\n");
+    return;
+  }
 
-	printf( "------------------------------------------------------\n" );
+  if (kpOuputFileName) {
+    pYuvFile = fopen (kpOuputFileName, "wb");
+    if (pYuvFile == NULL) {
+      fprintf (stderr, "Can not open yuv file to output result of decoding..\n");
+      // any options
+      //return;	// can let decoder work in quiet mode, no writing any output
+    } else
+      fprintf (stderr, "Sequence output file name: %s..\n", kpOuputFileName);
+  } else {
+    fprintf (stderr, "Can not find any output file to write..\n");
+    // any options
+  }
 
-	fseek(pH264File, 0L, SEEK_END);
-	iFileSize = ftell(pH264File);
-	if (iFileSize<=0) {
-		fprintf(stderr, "Current Bit Stream File is too small, read error!!!!\n");
-		goto label_exit;
-	}
-	fseek(pH264File, 0L, SEEK_SET);
+  if (pOptionFileName) {
+    pOptionFile = fopen ((char*)pOptionFileName, "wb");
+    if (pOptionFile == NULL) {
+      fprintf (stderr, "Can not open optional file for write..\n");
+    } else
+      fprintf (stderr, "Extra optional file: %s..\n", (char*)pOptionFileName);
+  }
 
-	pBuf = new uint8_t[iFileSize+4];
-	if (pBuf == NULL){
-		fprintf(stderr, "new buffer failed!\n");
-		goto label_exit;
-	}
+  printf ("------------------------------------------------------\n");
 
-	fread(pBuf, 1, iFileSize, pH264File);
-	memcpy(pBuf+iFileSize, &uiStartCode[0], 4);//confirmed_safe_unsafe_usage
+  fseek (pH264File, 0L, SEEK_END);
+  iFileSize = ftell (pH264File);
+  if (iFileSize <= 0) {
+    fprintf (stderr, "Current Bit Stream File is too small, read error!!!!\n");
+    goto label_exit;
+  }
+  fseek (pH264File, 0L, SEEK_SET);
 
-	if( pDecoder->SetOption( DECODER_OPTION_DATAFORMAT,  &iColorFormat ) ){
-		fprintf(stderr, "SetOption() failed, opt_id : %d  ..\n", DECODER_OPTION_DATAFORMAT);
-		goto label_exit;
-	}
+  pBuf = new uint8_t[iFileSize + 4];
+  if (pBuf == NULL) {
+    fprintf (stderr, "new buffer failed!\n");
+    goto label_exit;
+  }
 
-	if( pDecoder->SetOption( DECODER_OPTION_MODE,  &eDecoderMode ) ){
-		fprintf(stderr, "SetOption() failed, opt_id : %d  ..\n", DECODER_OPTION_MODE);
-		goto label_exit;
-	}
+  fread (pBuf, 1, iFileSize, pH264File);
+  memcpy (pBuf + iFileSize, &uiStartCode[0], 4); //confirmed_safe_unsafe_usage
 
-	// set the output buffer property
-	if(pYuvFile)
-	{
-		pDecoder->SetOption( DECODER_OPTION_OUTPUT_PROPERTY,  &eOutputProperty );
-	}
+  if (pDecoder->SetOption (DECODER_OPTION_DATAFORMAT,  &iColorFormat)) {
+    fprintf (stderr, "SetOption() failed, opt_id : %d  ..\n", DECODER_OPTION_DATAFORMAT);
+    goto label_exit;
+  }
 
+  if (pDecoder->SetOption (DECODER_OPTION_MODE,  &eDecoderMode)) {
+    fprintf (stderr, "SetOption() failed, opt_id : %d  ..\n", DECODER_OPTION_MODE);
+    goto label_exit;
+  }
+
+  // set the output buffer property
+  if (pYuvFile) {
+    pDecoder->SetOption (DECODER_OPTION_OUTPUT_PROPERTY,  &eOutputProperty);
+  }
+
 #if defined ( STICK_STREAM_SIZE )
-	FILE *fpTrack = fopen("3.len", "rb");	
+  FILE* fpTrack = fopen ("3.len", "rb");
 
 #endif// STICK_STREAM_SIZE
-	
 
-	while ( true ) {
 
-		if ( iBufPos >= iFileSize ){
-			iEndOfStreamFlag = true;
-			if ( iEndOfStreamFlag )
-				pDecoder->SetOption( DECODER_OPTION_END_OF_STREAM, (void_t*)&iEndOfStreamFlag );
-			break;
-		}
+  while (true) {
 
+    if (iBufPos >= iFileSize) {
+      iEndOfStreamFlag = true;
+      if (iEndOfStreamFlag)
+        pDecoder->SetOption (DECODER_OPTION_END_OF_STREAM, (void_t*)&iEndOfStreamFlag);
+      break;
+    }
+
 #if defined ( STICK_STREAM_SIZE )
-		if ( fpTrack )
-			fread(&iSliceSize, 1, sizeof(int32_t), fpTrack);		
+    if (fpTrack)
+      fread (&iSliceSize, 1, sizeof (int32_t), fpTrack);
 #else
-		for (i=0; i<iFileSize; i++) {
-			if (pBuf[iBufPos+i]==0 && pBuf[iBufPos+i+1]==0 && pBuf[iBufPos+i+2]==0 && 
-				pBuf[iBufPos+i+3]==1 && i>0) {
-				break;
-			}
-		}
-		iSliceSize = i;
+    for (i = 0; i < iFileSize; i++) {
+      if (pBuf[iBufPos + i] == 0 && pBuf[iBufPos + i + 1] == 0 && pBuf[iBufPos + i + 2] == 0 &&
+          pBuf[iBufPos + i + 3] == 1 && i > 0) {
+        break;
+      }
+    }
+    iSliceSize = i;
 #endif
 
 //for coverage test purpose
-        int32_t iOutputColorFormat;
-        pDecoder->GetOption(DECODER_OPTION_DATAFORMAT, &iOutputColorFormat);
-        int32_t iEndOfStreamFlag;
-        pDecoder->GetOption(DECODER_OPTION_END_OF_STREAM, &iEndOfStreamFlag);
-        int32_t iCurIdrPicId;
-        pDecoder->GetOption(DECODER_OPTION_IDR_PIC_ID, &iCurIdrPicId);
-        int32_t iFrameNum;
-        pDecoder->GetOption(DECODER_OPTION_FRAME_NUM, &iFrameNum);
-        int32_t bCurAuContainLtrMarkSeFlag;
-        pDecoder->GetOption(DECODER_OPTION_LTR_MARKING_FLAG, &bCurAuContainLtrMarkSeFlag);
-        int32_t iFrameNumOfAuMarkedLtr;
-        pDecoder->GetOption(DECODER_OPTION_LTR_MARKED_FRAME_NUM, &iFrameNumOfAuMarkedLtr);
-        int32_t iFeedbackVclNalInAu;
-        pDecoder->GetOption(DECODER_OPTION_VCL_NAL, &iFeedbackVclNalInAu);        
-        int32_t iFeedbackTidInAu;
-        pDecoder->GetOption(DECODER_OPTION_TEMPORAL_ID, &iFeedbackTidInAu);
-        int32_t iSetMode;
-        pDecoder->GetOption(DECODER_OPTION_MODE, &iSetMode);
-        int32_t iDeviceInfo;
-        pDecoder->GetOption(DECODER_OPTION_DEVICE_INFO, &iDeviceInfo);
+    int32_t iOutputColorFormat;
+    pDecoder->GetOption (DECODER_OPTION_DATAFORMAT, &iOutputColorFormat);
+    int32_t iEndOfStreamFlag;
+    pDecoder->GetOption (DECODER_OPTION_END_OF_STREAM, &iEndOfStreamFlag);
+    int32_t iCurIdrPicId;
+    pDecoder->GetOption (DECODER_OPTION_IDR_PIC_ID, &iCurIdrPicId);
+    int32_t iFrameNum;
+    pDecoder->GetOption (DECODER_OPTION_FRAME_NUM, &iFrameNum);
+    int32_t bCurAuContainLtrMarkSeFlag;
+    pDecoder->GetOption (DECODER_OPTION_LTR_MARKING_FLAG, &bCurAuContainLtrMarkSeFlag);
+    int32_t iFrameNumOfAuMarkedLtr;
+    pDecoder->GetOption (DECODER_OPTION_LTR_MARKED_FRAME_NUM, &iFrameNumOfAuMarkedLtr);
+    int32_t iFeedbackVclNalInAu;
+    pDecoder->GetOption (DECODER_OPTION_VCL_NAL, &iFeedbackVclNalInAu);
+    int32_t iFeedbackTidInAu;
+    pDecoder->GetOption (DECODER_OPTION_TEMPORAL_ID, &iFeedbackTidInAu);
+    int32_t iSetMode;
+    pDecoder->GetOption (DECODER_OPTION_MODE, &iSetMode);
+    int32_t iDeviceInfo;
+    pDecoder->GetOption (DECODER_OPTION_DEVICE_INFO, &iDeviceInfo);
 //~end for
 
-		iStart = WelsTime();
-		pData[0] = NULL;
-		pData[1] = NULL;
-		pData[2] = NULL;
-		memset(&sDstBufInfo, 0, sizeof(SBufferInfo));
+    iStart = WelsTime();
+    pData[0] = NULL;
+    pData[1] = NULL;
+    pData[2] = NULL;
+    memset (&sDstBufInfo, 0, sizeof (SBufferInfo));
 
-		pDecoder->DecodeFrame( pBuf + iBufPos, iSliceSize, pData, &sDstBufInfo );
-		
-		if(sDstBufInfo.iBufferStatus == 1)
-		{
-			pDst[0] = (uint8_t *)pData[0];
-			pDst[1] = (uint8_t *)pData[1];
-			pDst[2] = (uint8_t *)pData[2];
-		}
-		iEnd	= WelsTime();
-		iTotal	+= iEnd - iStart;
-		if ( (sDstBufInfo.iBufferStatus==1) )
-		{
-				iFrameNum++;
-			cOutputModule.Process((void_t **)pDst, &sDstBufInfo, pYuvFile);
-			if (sDstBufInfo.eBufferProperty == BUFFER_HOST)
-			{
-				iWidth  = sDstBufInfo.UsrData.sSystemBuffer.iWidth;
-				iHeight = sDstBufInfo.UsrData.sSystemBuffer.iHeight;
-			}
-			else
-			{
-				iWidth  = sDstBufInfo.UsrData.sVideoBuffer.iSurfaceWidth;
-				iHeight = sDstBufInfo.UsrData.sVideoBuffer.iSurfaceHeight;
-			}
-					
-			if ( pOptionFile != NULL )
-			{
-				if ( iWidth != iLastWidth && iHeight != iLastHeight )
-				{
-					fwrite(&iFrameCount, sizeof(iFrameCount), 1, pOptionFile);
-					fwrite(&iWidth , sizeof(iWidth) , 1, pOptionFile);
-					fwrite(&iHeight, sizeof(iHeight), 1, pOptionFile);
-					iLastWidth  = iWidth;
-					iLastHeight = iHeight;
-				}
-			}
-			++ iFrameCount;
-		}
+    pDecoder->DecodeFrame (pBuf + iBufPos, iSliceSize, pData, &sDstBufInfo);
 
-		iBufPos += iSliceSize;
-		++ iSliceIndex;
-	}
+    if (sDstBufInfo.iBufferStatus == 1) {
+      pDst[0] = (uint8_t*)pData[0];
+      pDst[1] = (uint8_t*)pData[1];
+      pDst[2] = (uint8_t*)pData[2];
+    }
+    iEnd	= WelsTime();
+    iTotal	+= iEnd - iStart;
+    if ((sDstBufInfo.iBufferStatus == 1)) {
+      iFrameNum++;
+      cOutputModule.Process ((void_t**)pDst, &sDstBufInfo, pYuvFile);
+      if (sDstBufInfo.eBufferProperty == BUFFER_HOST) {
+        iWidth  = sDstBufInfo.UsrData.sSystemBuffer.iWidth;
+        iHeight = sDstBufInfo.UsrData.sSystemBuffer.iHeight;
+      } else {
+        iWidth  = sDstBufInfo.UsrData.sVideoBuffer.iSurfaceWidth;
+        iHeight = sDstBufInfo.UsrData.sVideoBuffer.iSurfaceHeight;
+      }
 
-	// Get pending last frame
-	pData[0] = NULL;
-	pData[1] = NULL;
-	pData[2] = NULL;
-	memset(&sDstBufInfo, 0, sizeof(SBufferInfo));
+      if (pOptionFile != NULL) {
+        if (iWidth != iLastWidth && iHeight != iLastHeight) {
+          fwrite (&iFrameCount, sizeof (iFrameCount), 1, pOptionFile);
+          fwrite (&iWidth , sizeof (iWidth) , 1, pOptionFile);
+          fwrite (&iHeight, sizeof (iHeight), 1, pOptionFile);
+          iLastWidth  = iWidth;
+          iLastHeight = iHeight;
+        }
+      }
+      ++ iFrameCount;
+    }
 
-	pDecoder->DecodeFrame( NULL, 0, pData, &sDstBufInfo );
-	if(sDstBufInfo.iBufferStatus == 1)
-	{
-		pDst[0] = (uint8_t *)pData[0];
-		pDst[1] = (uint8_t *)pData[1];
-		pDst[2] = (uint8_t *)pData[2];
-	}
+    iBufPos += iSliceSize;
+    ++ iSliceIndex;
+  }
 
-	if ((sDstBufInfo.iBufferStatus==1))
-	{
-		cOutputModule.Process((void_t **)pDst, &sDstBufInfo, pYuvFile);
-		if (sDstBufInfo.eBufferProperty == BUFFER_HOST)
-		{
-			iWidth  = sDstBufInfo.UsrData.sSystemBuffer.iWidth;
-			iHeight = sDstBufInfo.UsrData.sSystemBuffer.iHeight;
-		}
-		else
-		{
-			iWidth  = sDstBufInfo.UsrData.sVideoBuffer.iSurfaceWidth;
-			iHeight = sDstBufInfo.UsrData.sVideoBuffer.iSurfaceHeight;
-		}
-		
-		if ( pOptionFile != NULL )
-		{
-			/* Anyway, we need write in case of final frame decoding */
-			fwrite(&iFrameCount, sizeof(iFrameCount), 1, pOptionFile);
-			fwrite(&iWidth , sizeof(iWidth) , 1, pOptionFile);
-			fwrite(&iHeight, sizeof(iHeight), 1, pOptionFile);
-			iLastWidth	= iWidth;
-			iLastHeight	= iHeight;
-		}
-		++ iFrameCount;
-	}
+  // Get pending last frame
+  pData[0] = NULL;
+  pData[1] = NULL;
+  pData[2] = NULL;
+  memset (&sDstBufInfo, 0, sizeof (SBufferInfo));
 
+  pDecoder->DecodeFrame (NULL, 0, pData, &sDstBufInfo);
+  if (sDstBufInfo.iBufferStatus == 1) {
+    pDst[0] = (uint8_t*)pData[0];
+    pDst[1] = (uint8_t*)pData[1];
+    pDst[2] = (uint8_t*)pData[2];
+  }
 
+  if ((sDstBufInfo.iBufferStatus == 1)) {
+    cOutputModule.Process ((void_t**)pDst, &sDstBufInfo, pYuvFile);
+    if (sDstBufInfo.eBufferProperty == BUFFER_HOST) {
+      iWidth  = sDstBufInfo.UsrData.sSystemBuffer.iWidth;
+      iHeight = sDstBufInfo.UsrData.sSystemBuffer.iHeight;
+    } else {
+      iWidth  = sDstBufInfo.UsrData.sVideoBuffer.iSurfaceWidth;
+      iHeight = sDstBufInfo.UsrData.sVideoBuffer.iSurfaceHeight;
+    }
+
+    if (pOptionFile != NULL) {
+      /* Anyway, we need write in case of final frame decoding */
+      fwrite (&iFrameCount, sizeof (iFrameCount), 1, pOptionFile);
+      fwrite (&iWidth , sizeof (iWidth) , 1, pOptionFile);
+      fwrite (&iHeight, sizeof (iHeight), 1, pOptionFile);
+      iLastWidth	= iWidth;
+      iLastHeight	= iHeight;
+    }
+    ++ iFrameCount;
+  }
+
+
 #if defined ( STICK_STREAM_SIZE )
-	if ( fpTrack ){
-		fclose( fpTrack );
-		fpTrack = NULL;
-	}
+  if (fpTrack) {
+    fclose (fpTrack);
+    fpTrack = NULL;
+  }
 #endif// STICK_STREAM_SIZE
-	
-	dElapsed = iTotal / 1e6;
-	fprintf( stderr, "-------------------------------------------------------\n" );
-	fprintf( stderr, "iWidth:		%d\nheight:		%d\nFrames:		%d\ndecode time:	%f sec\nFPS:		%f fps\n",
-			 iWidth, iHeight, iFrameCount, dElapsed, (iFrameCount * 1.0)/dElapsed );
-	fprintf( stderr, "-------------------------------------------------------\n" );
 
-	// coverity scan uninitial
+  dElapsed = iTotal / 1e6;
+  fprintf (stderr, "-------------------------------------------------------\n");
+  fprintf (stderr, "iWidth:		%d\nheight:		%d\nFrames:		%d\ndecode time:	%f sec\nFPS:		%f fps\n",
+           iWidth, iHeight, iFrameCount, dElapsed, (iFrameCount * 1.0) / dElapsed);
+  fprintf (stderr, "-------------------------------------------------------\n");
+
+  // coverity scan uninitial
 label_exit:
-	if (pBuf) 
-	{
-		delete[] pBuf;
-		pBuf = NULL;
-	}	
-	if ( pH264File )
-	{
-		fclose(pH264File);
-		pH264File = NULL;
-	}
-	if ( pYuvFile )
-	{
-		fclose(pYuvFile);
-		pYuvFile = NULL;
-	}
-	if ( pOptionFile )
-	{
-		fclose(pOptionFile);
-		pOptionFile = NULL;
-	}
+  if (pBuf) {
+    delete[] pBuf;
+    pBuf = NULL;
+  }
+  if (pH264File) {
+    fclose (pH264File);
+    pH264File = NULL;
+  }
+  if (pYuvFile) {
+    fclose (pYuvFile);
+    pYuvFile = NULL;
+  }
+  if (pOptionFile) {
+    fclose (pOptionFile);
+    pOptionFile = NULL;
+  }
 }
 
 
-int32_t main(int32_t iArgC, char* pArgV[])
-{
-	ISVCDecoder *pDecoder = NULL;
+int32_t main (int32_t iArgC, char* pArgV[]) {
+  ISVCDecoder* pDecoder = NULL;
 
-	SDecodingParam sDecParam = {0};
-	string strInputFile(""), strOutputFile(""), strOptionFile("");
+  SDecodingParam sDecParam = {0};
+  string strInputFile (""), strOutputFile (""), strOptionFile ("");
 
-	sDecParam.sVideoProperty.size = sizeof( sDecParam.sVideoProperty );
+  sDecParam.sVideoProperty.size = sizeof (sDecParam.sVideoProperty);
 
-	if (iArgC < 2)
-	{
-		printf( "usage 1: h264dec.exe welsdec.cfg\n" );
-		printf( "usage 2: h264dec.exe welsdec.264 out.yuv\n" );
-		printf( "usage 3: h264dec.exe welsdec.264\n" );
-		return 1;
-	}
-	else if (iArgC == 2)
-	{
-		if (strstr(pArgV[1], ".cfg")) // read config file //confirmed_safe_unsafe_usage
-		{
-			CReadConfig cReadCfg(pArgV[1]);
-			string strTag[4];
-			string strReconFile("");
+  if (iArgC < 2) {
+    printf ("usage 1: h264dec.exe welsdec.cfg\n");
+    printf ("usage 2: h264dec.exe welsdec.264 out.yuv\n");
+    printf ("usage 3: h264dec.exe welsdec.264\n");
+    return 1;
+  } else if (iArgC == 2) {
+    if (strstr (pArgV[1], ".cfg")) { // read config file //confirmed_safe_unsafe_usage
+      CReadConfig cReadCfg (pArgV[1]);
+      string strTag[4];
+      string strReconFile ("");
 
-			if ( !cReadCfg.ExistFile() ){
-				printf("Specified file: %s not exist, maybe invalid path or parameter settting.\n", cReadCfg.GetFileName().c_str());
-				return 1;
-			}
-			memset(&sDecParam, 0, sizeof(sDecParam));
+      if (!cReadCfg.ExistFile()) {
+        printf ("Specified file: %s not exist, maybe invalid path or parameter settting.\n", cReadCfg.GetFileName().c_str());
+        return 1;
+      }
+      memset (&sDecParam, 0, sizeof (sDecParam));
 
-			while ( !cReadCfg.EndOfFile() ){
-				long nRd = cReadCfg.ReadLine(&strTag[0]);
-				if (nRd > 0){
-					if (strTag[0].compare("InputFile") == 0){
-						strInputFile	= strTag[1];
-					}
-					else if (strTag[0].compare("OutputFile") == 0){
-						strOutputFile	= strTag[1];
-					}
-					else if (strTag[0].compare("RestructionFile") == 0){
-						strReconFile	= strTag[1];
-						int32_t iLen = strReconFile.length();
-						sDecParam.pFileNameRestructed	= new char[iLen + 1];
-						if (sDecParam.pFileNameRestructed != NULL){
-							sDecParam.pFileNameRestructed[iLen] = 0;
-						}
-					
-						strncpy(sDecParam.pFileNameRestructed, strReconFile.c_str(), iLen);//confirmed_safe_unsafe_usage
-					}
-					else if (strTag[0].compare("TargetDQID") == 0){
-						sDecParam.uiTargetDqLayer	= (uint8_t)atol(strTag[1].c_str());
-					}
-					else if (strTag[0].compare("OutColorFormat") == 0){
-						sDecParam.iOutputColorFormat = atol(strTag[1].c_str());
-					}
-					else if (strTag[0].compare("ErrorConcealmentFlag") == 0){
-						sDecParam.uiEcActiveFlag	= (uint8_t)atol(strTag[1].c_str());
-					}
-					else if (strTag[0].compare("CPULoad") == 0){
-						sDecParam.uiCpuLoad	= (uint32_t)atol(strTag[1].c_str());
-					}
-					else if (strTag[0].compare("VideoBitstreamType") == 0){
-						sDecParam.sVideoProperty.eVideoBsType = (VIDEO_BITSTREAM_TYPE)atol(strTag[1].c_str());
-					}
-				}
-			}
-			if (strOutputFile.empty())
-			{
-				printf( "No output file specified in configuration file.\n" );
-				return 1;
-			}
-		}
-		else if (strstr(pArgV[1], ".264")) // no output dump yuv file, just try to render the decoded pictures //confirmed_safe_unsafe_usage
-		{
-			strInputFile	= pArgV[1];
-			memset(&sDecParam, 0, sizeof(sDecParam));
-			sDecParam.iOutputColorFormat          = videoFormatI420;
-			sDecParam.uiTargetDqLayer	          = (uint8_t)-1;
-			sDecParam.uiEcActiveFlag	          = 1;
-			sDecParam.sVideoProperty.eVideoBsType = VIDEO_BITSTREAM_DEFAULT;
-		}
-	}
-	else //iArgC > 2
-	{
-		strInputFile	= pArgV[1];
-		strOutputFile	= pArgV[2];
-		memset(&sDecParam, 0, sizeof(sDecParam));
-		sDecParam.iOutputColorFormat	= videoFormatI420;
-		sDecParam.uiTargetDqLayer	= (uint8_t)-1;
-		sDecParam.uiEcActiveFlag	= 1;
-		sDecParam.sVideoProperty.eVideoBsType = VIDEO_BITSTREAM_DEFAULT;
-		if (iArgC > 3)
-			strOptionFile	= pArgV[3];
+      while (!cReadCfg.EndOfFile()) {
+        long nRd = cReadCfg.ReadLine (&strTag[0]);
+        if (nRd > 0) {
+          if (strTag[0].compare ("InputFile") == 0) {
+            strInputFile	= strTag[1];
+          } else if (strTag[0].compare ("OutputFile") == 0) {
+            strOutputFile	= strTag[1];
+          } else if (strTag[0].compare ("RestructionFile") == 0) {
+            strReconFile	= strTag[1];
+            int32_t iLen = strReconFile.length();
+            sDecParam.pFileNameRestructed	= new char[iLen + 1];
+            if (sDecParam.pFileNameRestructed != NULL) {
+              sDecParam.pFileNameRestructed[iLen] = 0;
+            }
 
-		if (strOutputFile.empty())
-		{
-			printf( "No output file specified in configuration file.\n" );
-			return 1;
-		}
-	}
-	
-	if (strInputFile.empty())
-	{
-		printf( "No input file specified in configuration file.\n" );
-		return 1;
-	}
-	
+            strncpy (sDecParam.pFileNameRestructed, strReconFile.c_str(), iLen); //confirmed_safe_unsafe_usage
+          } else if (strTag[0].compare ("TargetDQID") == 0) {
+            sDecParam.uiTargetDqLayer	= (uint8_t)atol (strTag[1].c_str());
+          } else if (strTag[0].compare ("OutColorFormat") == 0) {
+            sDecParam.iOutputColorFormat = atol (strTag[1].c_str());
+          } else if (strTag[0].compare ("ErrorConcealmentFlag") == 0) {
+            sDecParam.uiEcActiveFlag	= (uint8_t)atol (strTag[1].c_str());
+          } else if (strTag[0].compare ("CPULoad") == 0) {
+            sDecParam.uiCpuLoad	= (uint32_t)atol (strTag[1].c_str());
+          } else if (strTag[0].compare ("VideoBitstreamType") == 0) {
+            sDecParam.sVideoProperty.eVideoBsType = (VIDEO_BITSTREAM_TYPE)atol (strTag[1].c_str());
+          }
+        }
+      }
+      if (strOutputFile.empty()) {
+        printf ("No output file specified in configuration file.\n");
+        return 1;
+      }
+    } else if (strstr (pArgV[1],
+                       ".264")) { // no output dump yuv file, just try to render the decoded pictures //confirmed_safe_unsafe_usage
+      strInputFile	= pArgV[1];
+      memset (&sDecParam, 0, sizeof (sDecParam));
+      sDecParam.iOutputColorFormat          = videoFormatI420;
+      sDecParam.uiTargetDqLayer	          = (uint8_t) - 1;
+      sDecParam.uiEcActiveFlag	          = 1;
+      sDecParam.sVideoProperty.eVideoBsType = VIDEO_BITSTREAM_DEFAULT;
+    }
+  } else { //iArgC > 2
+    strInputFile	= pArgV[1];
+    strOutputFile	= pArgV[2];
+    memset (&sDecParam, 0, sizeof (sDecParam));
+    sDecParam.iOutputColorFormat	= videoFormatI420;
+    sDecParam.uiTargetDqLayer	= (uint8_t) - 1;
+    sDecParam.uiEcActiveFlag	= 1;
+    sDecParam.sVideoProperty.eVideoBsType = VIDEO_BITSTREAM_DEFAULT;
+    if (iArgC > 3) {
+      // Basic option parser. Note that this is not safe about the
+      // number of remaining arguments.
+      // TODO: rewrite
+      for (int i = 3; i < iArgC; i++) {
+        char* cmd = pArgV[i];
 
+        if (!strcmp (cmd, "-options")) {
+          strOutputFile = pArgV[i + 1];
+          i += 2;
+        } else if (!strcmp (cmd, "-trace")) {
+          WelsStderrSetTraceLevel (atoi (pArgV[i + 1]));
+          i += 2;
+        } else {
+          i++;
+        }
+      }
+    }
 
+    if (strOutputFile.empty()) {
+      printf ("No output file specified in configuration file.\n");
+      return 1;
+    }
+  }
 
+  if (strInputFile.empty()) {
+    printf ("No input file specified in configuration file.\n");
+    return 1;
+  }
+
+
+
+
 #if defined(_MSC_VER)
 
-	HMODULE hModule = LoadLibraryA(".\\welsdec.dll");
+  HMODULE hModule = LoadLibraryA (".\\welsdec.dll");
 
-	PCreateDecoderFunc  pCreateDecoderFunc				= NULL;
-	PDestroyDecoderFunc pDestroyDecoderFunc				= NULL;
+  PCreateDecoderFunc  pCreateDecoderFunc				= NULL;
+  PDestroyDecoderFunc pDestroyDecoderFunc				= NULL;
 
 
-	pCreateDecoderFunc  = (PCreateDecoderFunc)::GetProcAddress(hModule, "CreateDecoder");
-	pDestroyDecoderFunc = (PDestroyDecoderFunc)::GetProcAddress(hModule, "DestroyDecoder");
+  pCreateDecoderFunc  = (PCreateDecoderFunc)::GetProcAddress (hModule, "CreateDecoder");
+  pDestroyDecoderFunc = (PDestroyDecoderFunc)::GetProcAddress (hModule, "DestroyDecoder");
 
-	if ((hModule != NULL) && (pCreateDecoderFunc != NULL) && (pDestroyDecoderFunc != NULL))
-	{
-		printf("load library sw function successfully\n");
+  if ((hModule != NULL) && (pCreateDecoderFunc != NULL) && (pDestroyDecoderFunc != NULL)) {
+    printf ("load library sw function successfully\n");
 
-		if ( pCreateDecoderFunc( &pDecoder )  || (NULL == pDecoder) )
-		{
-			printf( "Create Decoder failed.\n" );
-			return 1;
-		}
-	}
-	else 
-	{
-		printf("load library sw function failed\n");
-		return 1;
-	}
+    if (pCreateDecoderFunc (&pDecoder)  || (NULL == pDecoder)) {
+      printf ("Create Decoder failed.\n");
+      return 1;
+    }
+  } else {
+    printf ("load library sw function failed\n");
+    return 1;
+  }
 
 
 #else
 
 
-	if ( CreateDecoder( &pDecoder )  || (NULL == pDecoder) )
-	{
-		printf( "Create Decoder failed.\n" );
-		return 1;
-	}
-	
+  if (CreateDecoder (&pDecoder)  || (NULL == pDecoder)) {
+    printf ("Create Decoder failed.\n");
+    return 1;
+  }
+
 #endif
 
 
-	if ( pDecoder->Initialize( &sDecParam, INIT_TYPE_PARAMETER_BASED ) )
-	{
-		printf( "Decoder initialization failed.\n" );
-		return 1;
-	}
-	
-	
-	int32_t iWidth = 0;
-	int32_t iHeight= 0;
+  if (pDecoder->Initialize (&sDecParam, INIT_TYPE_PARAMETER_BASED)) {
+    printf ("Decoder initialization failed.\n");
+    return 1;
+  }
 
-	
-	H264DecodeInstance( pDecoder, strInputFile.c_str(), strOutputFile.c_str(), iWidth, iHeight, (!strOptionFile.empty() ? (void_t*)(const_cast<char*>(strOptionFile.c_str())) : NULL) );
-	
-	if (sDecParam.pFileNameRestructed != NULL){
-		delete []sDecParam.pFileNameRestructed;
-		sDecParam.pFileNameRestructed = NULL;
-	}
-		
-	if ( pDecoder ){
-		pDecoder->Unintialize();
-		
+
+  int32_t iWidth = 0;
+  int32_t iHeight = 0;
+
+
+  H264DecodeInstance (pDecoder, strInputFile.c_str(), strOutputFile.c_str(), iWidth, iHeight,
+                      (!strOptionFile.empty() ? (void_t*) (const_cast<char*> (strOptionFile.c_str())) : NULL));
+
+  if (sDecParam.pFileNameRestructed != NULL) {
+    delete []sDecParam.pFileNameRestructed;
+    sDecParam.pFileNameRestructed = NULL;
+  }
+
+  if (pDecoder) {
+    pDecoder->Uninitialize();
+
 #if defined(_MSC_VER)
-		pDestroyDecoderFunc( pDecoder );
+    pDestroyDecoderFunc (pDecoder);
 #else
-		DestroyDecoder(pDecoder);
+    DestroyDecoder (pDecoder);
 #endif
-	}
+  }
 
-	return 0;
+  return 0;
 }
 
--- a/codec/console/dec/src/load_bundle_functions.cpp
+++ b/codec/console/dec/src/load_bundle_functions.cpp
@@ -36,7 +36,7 @@
  * \date	Created on 03/15/2011
  *
  * \description : 1. Load bundle: welsdec.bundle
- *                2. Load address of function  
+ *                2. Load address of function
  *                3. Create or destroy decoder
  *
  *************************************************************************************
@@ -55,8 +55,8 @@
 #include "dec_console.h"
 #include "codec_api.h"
 
-typedef long (*LPCreateWelsCSDecoder)(ISVCDecoder** ppDecoder);
-typedef void (*LPDestroyWelsCSDecoder)(ISVCDecoder* pDecoder);
+typedef long (*LPCreateWelsCSDecoder) (ISVCDecoder** ppDecoder);
+typedef void (*LPDestroyWelsCSDecoder) (ISVCDecoder* pDecoder);
 
 
 typedef long (*LPCreateVHDController)();
@@ -70,200 +70,170 @@
 
 
 ////////////////////////////////////////////////////////////////////////////////////////
-int GetCurrentModulePath(char* lpModulePath, const int iPathMax)
-{
-	if(lpModulePath == NULL || iPathMax <= 0)
-	{
-		return -1;
-	}
-	
-	memset(lpModulePath, 0, iPathMax);
-	
-	char cCurrentPath[PATH_MAX];
-	memset(cCurrentPath, 0, PATH_MAX);
-	
-	Dl_info 	dlInfo;
-	static int  sDummy;
-	dladdr((void*)&sDummy, &dlInfo);
-	
-	strlcpy(cCurrentPath, dlInfo.dli_fname, PATH_MAX);
-	
+// Writes the directory that contains this module, followed by '/', into
+// lpModulePath (a caller-owned buffer of iPathMax bytes).
+// Returns 0 on success, -1 when the buffer arguments are invalid, and -2 when
+// the module's file name cannot be obtained or contains no '/' to cut at.
+// On every path except -1 the buffer is zero-filled first.
+int GetCurrentModulePath (char* lpModulePath, const int iPathMax) {
+  if (lpModulePath == NULL || iPathMax <= 0) {
+    return -1;
+  }
+
+  memset (lpModulePath, 0, iPathMax);
+
+  char cCurrentPath[PATH_MAX];
+  memset (cCurrentPath, 0, PATH_MAX);
+
+  // Ask the dynamic loader which image holds sDummy, i.e. this very module.
+  Dl_info dlInfo;
+  static int sDummy;
+  // dladdr() returns 0 on failure and then dlInfo must not be read; without
+  // this check strlcpy() below could be handed an indeterminate pointer.
+  if (dladdr ((void*)&sDummy, &dlInfo) == 0 || dlInfo.dli_fname == NULL) {
+    return -2;
+  }
+
+  strlcpy (cCurrentPath, dlInfo.dli_fname, PATH_MAX);
+
 #if defined(__apple__)
-	// whether is self a framework ? 
-	int locateNumber = 1;
-	struct FSRef currentPath;
-	OSStatus iStatus = FSPathMakeRef((unsigned char*)cCurrentPath, &currentPath, NULL);
-	if(noErr == iStatus)
-	{
-		LSItemInfoRecord  info;
-		iStatus = LSCopyItemInfoForRef(&currentPath, kLSRequestExtension, &info);
-		if(noErr == iStatus && NULL == info.extension)
-		{
-			locateNumber = 4;
-		}
-	}
+  // whether is self a framework ?
+  // NOTE(review): compilers predefine __APPLE__ (upper case); confirm whether
+  // the lower-case guard above is intentional.
+  // An item without an extension makes the code below strip four trailing
+  // path components instead of one.
+  int locateNumber = 1;
+  struct FSRef currentPath;
+  OSStatus iStatus = FSPathMakeRef ((unsigned char*)cCurrentPath, &currentPath, NULL);
+  if (noErr == iStatus) {
+    LSItemInfoRecord  info;
+    iStatus = LSCopyItemInfoForRef (&currentPath, kLSRequestExtension, &info);
+    if (noErr == iStatus && NULL == info.extension) {
+      locateNumber = 4;
+    }
+  }
 #else
-	int locateNumber = 1;
+  int locateNumber = 1;
 #endif
-	
-	std::string strPath(cCurrentPath);
-	int pos = std::string::npos;
-	for(int i = 0; i < locateNumber; i++)
-	{
-		pos = strPath.rfind('/');
-		if(std::string::npos == pos)
-		{
-			break;
-		}
-		strPath.erase(pos);
-	}
-	if(std::string::npos == pos)
-	{
-		return -2;
-	}
-	cCurrentPath[pos] = 0;
-	
-	strlcpy(lpModulePath, cCurrentPath, iPathMax);
-	strlcat(lpModulePath, "/", iPathMax);
-	
-	return 0;
+
+  // Remove locateNumber trailing components; pos ends up at the last '/' cut.
+  // size_type (not int) so that npos is stored and compared without narrowing.
+  std::string strPath (cCurrentPath);
+  std::string::size_type pos = std::string::npos;
+  for (int i = 0; i < locateNumber; i++) {
+    pos = strPath.rfind ('/');
+    if (std::string::npos == pos) {
+      break;
+    }
+    strPath.erase (pos);
+  }
+  if (std::string::npos == pos) {
+    return -2;
+  }
+  cCurrentPath[pos] = 0;
+
+  strlcpy (lpModulePath, cCurrentPath, iPathMax);
+  strlcat (lpModulePath, "/", iPathMax);
+
+  return 0;
 }
 
-CFBundleRef LoadBundle(const char* lpBundlePath)
-{
-	if(lpBundlePath == NULL)
-	{
-		return NULL;
-	}
-	
-	CFStringRef bundlePath = CFStringCreateWithCString(kCFAllocatorSystemDefault, lpBundlePath, CFStringGetSystemEncoding());
-	if(NULL == bundlePath)
-	{
-		return NULL;
-	}
-	
-	CFURLRef bundleURL = CFURLCreateWithString(kCFAllocatorSystemDefault, bundlePath, NULL);
-	if(NULL == bundleURL)
-	{
-		return NULL;
-	}
+// Creates a CFBundle object for the bundle at lpBundlePath.
+// Returns NULL when lpBundlePath is NULL or when the intermediate CFString /
+// CFURL cannot be created; otherwise returns the result of CFBundleCreate(),
+// which the caller owns.
+CFBundleRef LoadBundle (const char* lpBundlePath) {
+  if (lpBundlePath == NULL) {
+    return NULL;
+  }
+
+  CFStringRef bundlePath = CFStringCreateWithCString (kCFAllocatorSystemDefault, lpBundlePath,
+                           CFStringGetSystemEncoding());
+  if (NULL == bundlePath) {
+    return NULL;
+  }
+
+  CFURLRef bundleURL = CFURLCreateWithString (kCFAllocatorSystemDefault, bundlePath, NULL);
+  // bundlePath came from a Create function, so this code owns it.  It is not
+  // used past this point; release it on both the success and failure paths.
+  CFRelease (bundlePath);
+  if (NULL == bundleURL) {
+    return NULL;
+  }
 #endif
-	
-	// 2.get bundle ref
-	CFBundleRef bundleRef = CFBundleCreate(kCFAllocatorSystemDefault, bundleURL);
-	CFRelease(bundleURL);
-	
-	if(NULL != bundleRef)
-	{
-	}
 
-	return bundleRef;
+  // 2.get bundle ref
+  CFBundleRef bundleRef = CFBundleCreate (kCFAllocatorSystemDefault, bundleURL);
+  CFRelease (bundleURL);
+
+  return bundleRef;
 }
 
-void* GetProcessAddress(CFBundleRef bundleRef, const char* lpProcName)
-{
-	void *processAddress = NULL;
-	if(NULL != bundleRef)
-	{
-		CFStringRef cfProcName = CFStringCreateWithCString(kCFAllocatorSystemDefault, lpProcName, CFStringGetSystemEncoding());
-		processAddress = CFBundleGetFunctionPointerForName(bundleRef, cfProcName);
-		CFRelease(cfProcName);
-	}
-	return processAddress;
+// Looks up the function called lpProcName in bundleRef.
+// Returns the function's address, or NULL when either argument is NULL, the
+// name cannot be turned into a CFString, or the bundle lookup yields NULL.
+void* GetProcessAddress (CFBundleRef bundleRef, const char* lpProcName) {
+  if (NULL == bundleRef || NULL == lpProcName) {
+    return NULL;
+  }
+
+  CFStringRef cfProcName = CFStringCreateWithCString (kCFAllocatorSystemDefault, lpProcName, CFStringGetSystemEncoding());
+  if (NULL == cfProcName) {
+    // Nothing to look up, and CFRelease() must not be called with NULL.
+    return NULL;
+  }
+
+  void* processAddress = CFBundleGetFunctionPointerForName (bundleRef, cfProcName);
+  CFRelease (cfProcName);
+  return processAddress;
 }
 
 
 ////////////////////////
 
-bool load_bundle_welsdec()
-{
-	
-	char achPath[512] = {0};
-	
-	GetCurrentModulePath(achPath, 512);
-	strlcat(achPath, H264DecoderDLL, 512);
-	
-	g_at264Module = LoadBundle(achPath);
-	
-	if (g_at264Module == NULL)
-		return false;
+// Appends H264DecoderDLL to whatever GetCurrentModulePath() produced, loads
+// that bundle into g_at264Module and reports whether a bundle was obtained.
+bool load_bundle_welsdec() {
 
-	return true;
+  char achPath[512] = {0};
 
+  GetCurrentModulePath (achPath, sizeof (achPath));
+  strlcat (achPath, H264DecoderDLL, sizeof (achPath));
+
+  g_at264Module = LoadBundle (achPath);
+  return g_at264Module != NULL;
 }
 
-void free_bundle_welsdec()
-{
-	if(g_at264Module != NULL)
-	{
-		CFBundleUnloadExecutable(g_at264Module);
-	}
+// Unloads the executable of the bundle held in g_at264Module, if there is one.
+void free_bundle_welsdec() {
+  if (g_at264Module == NULL) {
+    return;
+  }
+  CFBundleUnloadExecutable (g_at264Module);
 }
 
-bool get_functions_address_create_decoder(ISVCDecoder** ppDecoder)
-{
-	if(!g_at264Module)
-		return false;
-	
-	LPCreateWelsCSDecoder pfuncCreateSWDec = 
-	(LPCreateWelsCSDecoder)GetProcessAddress(g_at264Module, "CreateSVCDecoder");
-	
-	LPCreateVHDController pfuncCreateHWDec = 
-	(LPCreateVHDController)GetProcessAddress(g_at264Module, "CreateSVCVHDController");
-	
+// Resolves "CreateSVCDecoder" and "CreateSVCVHDController" in g_at264Module
+// and calls them in that order.  Returns false when no bundle is loaded or
+// when either symbol is missing (the first one is still called if only the
+// second is missing); returns true once both have been called.
+bool get_functions_address_create_decoder (ISVCDecoder** ppDecoder) {
+  if (!g_at264Module)
+    return false;
 
-	if(pfuncCreateSWDec != NULL)
-	{
-		pfuncCreateSWDec( ppDecoder );
-	}
-	else
-	{
-		return false;
-	}
-	
-	if(pfuncCreateHWDec != NULL)
-	{
-		pfuncCreateHWDec();
-	}
-	else
-	{
-		return false;
-	}
-	
-	return true;
-	
+  // Both symbols are looked up before anything is invoked.
+  LPCreateWelsCSDecoder pfnCreateSw =
+    (LPCreateWelsCSDecoder)GetProcessAddress (g_at264Module, "CreateSVCDecoder");
+  LPCreateVHDController pfnCreateHw =
+    (LPCreateVHDController)GetProcessAddress (g_at264Module, "CreateSVCVHDController");
+
+  if (pfnCreateSw == NULL) {
+    return false;
+  }
+  pfnCreateSw (ppDecoder);  // the call's result is not inspected
+
+  if (pfnCreateHw == NULL) {
+    return false;
+  }
+  pfnCreateHw();
+
+  return true;
 }
 
-bool get_functions_address_free_decoder(ISVCDecoder* pDecoder)
-{
-	if(!g_at264Module)
-		return false;
-	
-	LPDestroyWelsCSDecoder pfuncDestroySWDec = 
-	(LPDestroyWelsCSDecoder)GetProcessAddress(g_at264Module, "DestroySVCDecoder");
-	
-	LPDestroyVHDController pfuncDestroyHWDec = 
-	(LPDestroyVHDController)GetProcessAddress(g_at264Module, "DestroySVCVHDController");
-	
-	if(pfuncDestroySWDec != NULL)
-	{
-		pfuncDestroySWDec( pDecoder );
-	}
-	else
-	{
-		return false;
-	}
-	
-	if(pfuncDestroyHWDec != NULL)
-	{
-		pfuncDestroyHWDec();
-	}
-	else
-	{
-		return false;
-	}
+// Resolves "DestroySVCDecoder" and "DestroySVCVHDController" in g_at264Module
+// and calls them in that order.  Returns false when no bundle is loaded or
+// when either symbol is missing (the first one is still called if only the
+// second is missing); returns true once both have been called.
+bool get_functions_address_free_decoder (ISVCDecoder* pDecoder) {
+  if (!g_at264Module)
+    return false;
 
-	return true;
+  // Both symbols are looked up before anything is invoked.
+  LPDestroyWelsCSDecoder pfnDestroySw =
+    (LPDestroyWelsCSDecoder)GetProcessAddress (g_at264Module, "DestroySVCDecoder");
+  LPDestroyVHDController pfnDestroyHw =
+    (LPDestroyVHDController)GetProcessAddress (g_at264Module, "DestroySVCVHDController");
+
+  if (pfnDestroySw == NULL) {
+    return false;
+  }
+  pfnDestroySw (pDecoder);
+
+  if (pfnDestroyHw == NULL) {
+    return false;
+  }
+  pfnDestroyHw();
+
+  return true;
 }
 
 
--- a/codec/console/dec/src/read_config.cpp
+++ b/codec/console/dec/src/read_config.cpp
@@ -46,83 +46,75 @@
 
 #include "read_config.h"
 
-CReadConfig::CReadConfig( const char *kpConfigFileName )
-: m_pCfgFile(0)
-, m_strCfgFileName(kpConfigFileName)
-, m_ulLines(0)
-{
-	if ( strlen(kpConfigFileName) > 0 ){	// FIXME: To check validation in configure file name
-		m_pCfgFile = fopen(kpConfigFileName, "r");
-	}
+// Remembers kpConfigFileName and, when it is a non-empty string, opens that
+// file for reading.  A NULL name is handled like an empty one (no file is
+// opened) instead of being passed to std::string / strlen.
+// m_pCfgFile stays NULL when no file is opened or fopen() fails.
+CReadConfig::CReadConfig (const char* kpConfigFileName)
+  : m_pCfgFile (0)
+  , m_strCfgFileName (kpConfigFileName != NULL ? kpConfigFileName : "")
+  , m_ulLines (0) {
+  // FIXME: To check validation in configure file name
+  if (kpConfigFileName != NULL && kpConfigFileName[0] != '\0') {
+    m_pCfgFile = fopen (kpConfigFileName, "r");
+  }
 }
 
-CReadConfig::~CReadConfig()
-{
-	if ( m_pCfgFile ){
-		fclose( m_pCfgFile );
-		m_pCfgFile = NULL;
-	}
+// Closes the configuration file if one is open.
+CReadConfig::~CReadConfig() {
+  if (m_pCfgFile == NULL) {
+    return;
+  }
+  fclose (m_pCfgFile);
+  m_pCfgFile = NULL;
 }
-	
-long CReadConfig::ReadLine( string* pStr, const int kiValSize/* = 4*/ )
-{
-	if ( m_pCfgFile == NULL || pStr == NULL || kiValSize <= 1)
-		return 0;
-	
-	string *strTags = &pStr[0];
-	int iTagNum = 0, iNum = 0;
-	bool bCommentFlag = false;	
-	
-	while (iNum < kiValSize) {
-		pStr[iNum]	= "";
-		++ iNum;
-	}	
 
-	do {
-		const char kChar = (char)fgetc(m_pCfgFile);
-		
-		if ( kChar == '\n' || feof(m_pCfgFile) ){
-			++ m_ulLines;
-			break;
-		}
-		if ( kChar == '#' )
-			bCommentFlag = true;
-		if ( !bCommentFlag ){
-			if ( kChar == '\t' || kChar == ' ' ){
-				if ( iTagNum >= kiValSize )
-					break;
-				if ( !(*strTags).empty() ){
-					++ iTagNum;
-					strTags	= &pStr[iTagNum];
-				}
-			}
-			else
-				*strTags += kChar;
-		}
-		
-	} while(true);
-	
-	return 1+iTagNum;
+// Reads one line of the configuration file and splits it into tags.
+//   pStr      - array of at least kiValSize strings; every entry is cleared,
+//               then pStr[0], pStr[1], ... receive the blank/tab separated
+//               tags of the line.
+//   kiValSize - number of entries in pStr; must be greater than 1.
+// Everything from '#' to the end of the line is ignored.  Tags beyond the
+// kiValSize-th are dropped rather than stored.
+// Returns 0 when no file is open or an argument is unusable; otherwise one
+// more than the number of tag separators consumed (so always > 0).
+long CReadConfig::ReadLine (string* pStr, const int kiValSize/* = 4*/) {
+  if (m_pCfgFile == NULL || pStr == NULL || kiValSize <= 1)
+    return 0;
+
+  for (int iNum = 0; iNum < kiValSize; ++ iNum) {
+    pStr[iNum] = "";
+  }
+
+  int iTagNum = 0;            // index of the tag currently being filled
+  bool bCommentFlag = false;  // set once '#' has been seen on this line
+
+  for (;;) {
+    // Keep fgetc()'s int result: EOF is also returned on a read error, in
+    // which case feof() stays false and a char-based test would never stop.
+    const int kiChar = fgetc (m_pCfgFile);
+
+    if (kiChar == '\n' || kiChar == EOF) {
+      ++ m_ulLines;
+      break;
+    }
+    if (kiChar == '#')
+      bCommentFlag = true;
+    if (bCommentFlag)
+      continue;
+
+    if (kiChar == '\t' || kiChar == ' ') {
+      // A separator only closes a tag that has content, so runs of blanks
+      // collapse into one.
+      if (iTagNum < kiValSize && !pStr[iTagNum].empty())
+        ++ iTagNum;
+    } else if (iTagNum < kiValSize) {
+      pStr[iTagNum] += (char)kiChar;
+    }
+    // Otherwise every slot is in use: skip the character so the rest of the
+    // line is consumed without writing past the end of pStr.
+  }
+
+  return 1 + iTagNum;
 }
 
-const bool CReadConfig::EndOfFile()
-{
-	if (m_pCfgFile == NULL)
-		return true;
-	return feof(m_pCfgFile) ? true : false;
+// True when no file is open or the stream's end-of-file indicator is set.
+const bool CReadConfig::EndOfFile() {
+  return (m_pCfgFile == NULL) || (feof (m_pCfgFile) != 0);
 }
 
-const int CReadConfig::GetLines()
-{
-	return m_ulLines;
+// Returns the line counter that ReadLine() increments at each line end or
+// end of input, converted to int.
+const int CReadConfig::GetLines() {
+  return static_cast<int> (m_ulLines);
 }
 
-const bool CReadConfig::ExistFile()
-{
-	return (m_pCfgFile != NULL);
+// Tells whether a configuration file is currently open.
+const bool CReadConfig::ExistFile() {
+  const bool kbOpened = (NULL != m_pCfgFile);
+  return kbOpened;
 }
 
-const string& CReadConfig::GetFileName()
-{
-	return m_strCfgFileName;
+// Returns a reference to the stored configuration file name.
+const string& CReadConfig::GetFileName() {
+  return this->m_strCfgFileName;
 }
--- a/codec/console/enc/inc/read_config.h
+++ b/codec/console/enc/inc/read_config.h
@@ -45,37 +45,34 @@
 #include "wels_const.h"
 using namespace std;
 
-typedef struct tagFilesSet
-{
-	string strBsFile;
-	string strSeqFile;	// for cmd lines
-	struct
-	{
-		string strLayerCfgFile;
-		string strSeqFile;
-	} sSpatialLayers[MAX_DEPENDENCY_LAYER];
+typedef struct tagFilesSet { // file names gathered for one encoder run
+  string strBsFile; // receives the "OutputFile" value in ParseConfig
+  string strSeqFile;	// for cmd lines
+  struct { // NOTE(review): presumably the per-layer config file and input sequence names - confirm against ParseConfig
+    string strLayerCfgFile;
+    string strSeqFile;
+  } sSpatialLayers[MAX_DEPENDENCY_LAYER]; // MAX_DEPENDENCY_LAYER entries
 } SFilesSet;
 
 
-class CReadConfig
-{
-public:
-	CReadConfig();
-	CReadConfig( const char *pConfigFileName );
-	CReadConfig( const string& pConfigFileName );
-	virtual ~CReadConfig();
-	
-	void Openf(const char * strFile);
-	long ReadLine( string* strVal, const int iValSize = 4 );
-	const bool EndOfFile();
-	const int GetLines();
-	const bool ExistFile();
-	const string& GetFileName();
-	
-private:
-	FILE			*m_pCfgFile;
-	string			m_strCfgFileName;
-	unsigned long	m_iLines;
+class CReadConfig { // reads a configuration file line by line, splitting each line into blank/tab separated tags
+ public:
+  CReadConfig(); // no file opened, empty name
+  CReadConfig (const char* pConfigFileName); // stores the name; opens the file for reading when the name is non-empty
+  CReadConfig (const string& pConfigFileName); // same, with the name given as a string
+  virtual ~CReadConfig(); // closes the file if one is open
+
+  void Openf (const char* strFile); // stores the name and opens the file when strFile is non-NULL and non-empty
+  long ReadLine (string* strVal, const int iValSize = 4); // fills strVal[] with the next line's tags; 0 if unusable, else 1 + separators consumed
+  const bool EndOfFile(); // true when no file is open or feof() reports end of input
+  const int GetLines(); // current value of the line counter
+  const bool ExistFile(); // true when a file is open
+  const string& GetFileName(); // name given at construction or to Openf()
+
+ private:
+  FILE*			m_pCfgFile; // NULL while no file is open
+  string			m_strCfgFileName;
+  unsigned long	m_iLines; // incremented by ReadLine() at each line end / end of input
 };
 
 #endif	// READ_CONFIG_H__
--- a/codec/console/enc/src/bundlewelsenc.cpp
+++ b/codec/console/enc/src/bundlewelsenc.cpp
@@ -39,111 +39,99 @@
 #include "bundleloader.h"
 #include "codec_api.h"
 
-typedef long (*LPCreateWelsCSEncoder)(ISVCEncoder** ppEncoder);
-typedef void (*LPDestroyWelsCSEncoder)(ISVCEncoder* pEncoder);
+typedef long (*LPCreateWelsCSEncoder) (ISVCEncoder** ppEncoder);
+typedef void (*LPDestroyWelsCSEncoder) (ISVCEncoder* pEncoder);
 
 CFBundleRef g_at264Module = nil;
 
 const char H264EncoderDLL[] = "welsenc.bundle";
 
-int WelsEncGetCurrentModulePath(char* lpModulePath, const int iPathMax)
-{
-	if(lpModulePath == NULL || iPathMax <= 0)
-	{
-		return -1;
-	}
-	
-	memset(lpModulePath, 0, iPathMax);
-	
-	char cCurrentPath[PATH_MAX];
-	memset(cCurrentPath, 0, PATH_MAX);
-	
-	Dl_info 	dlInfo;
-	static int  sDummy;
-	dladdr((void*)&sDummy, &dlInfo);
-	
-	strlcpy(cCurrentPath, dlInfo.dli_fname, PATH_MAX);
-	
-	int locateNumber = 1;
-	
-	std::string strPath(cCurrentPath);
-	int pos = std::string::npos;
-	for(int i = 0; i < locateNumber; i++)
-	{
-		pos = strPath.rfind('/');
-		if(std::string::npos == pos)
-		{
-			break;
-		}
-		strPath.erase(pos);
-	}
-	if(std::string::npos == pos)
-	{
-		return -2;
-	}
-	cCurrentPath[pos] = 0;
-	
-	strlcpy(lpModulePath, cCurrentPath, iPathMax);
-	strlcat(lpModulePath, "/", iPathMax);
-	
-	return 0;
-	
+// Writes the directory that contains this module, followed by '/', into
+// lpModulePath (a caller-owned buffer of iPathMax bytes).
+// Returns 0 on success, -1 when the buffer arguments are invalid, and -2 when
+// the module's file name cannot be obtained or contains no '/' to cut at.
+// On every path except -1 the buffer is zero-filled first.
+int WelsEncGetCurrentModulePath (char* lpModulePath, const int iPathMax) {
+  if (lpModulePath == NULL || iPathMax <= 0) {
+    return -1;
+  }
+
+  memset (lpModulePath, 0, iPathMax);
+
+  char cCurrentPath[PATH_MAX];
+  memset (cCurrentPath, 0, PATH_MAX);
+
+  // Ask the dynamic loader which image holds sDummy, i.e. this very module.
+  Dl_info dlInfo;
+  static int sDummy;
+  // dladdr() returns 0 on failure and then dlInfo must not be read; without
+  // this check strlcpy() below could be handed an indeterminate pointer.
+  if (dladdr ((void*)&sDummy, &dlInfo) == 0 || dlInfo.dli_fname == NULL) {
+    return -2;
+  }
+
+  strlcpy (cCurrentPath, dlInfo.dli_fname, PATH_MAX);
+
+  // Cut the file name off at the last '/'.  size_type (not int) so that npos
+  // is compared without narrowing.
+  const std::string strPath (cCurrentPath);
+  const std::string::size_type pos = strPath.rfind ('/');
+  if (std::string::npos == pos) {
+    return -2;
+  }
+  cCurrentPath[pos] = 0;
+
+  strlcpy (lpModulePath, cCurrentPath, iPathMax);
+  strlcat (lpModulePath, "/", iPathMax);
+
+  return 0;
 }
 
-int32_t WelsEncBundleLoad()
-{
-	
-	char achPath[512] = {0};
-	
-	WelsEncGetCurrentModulePath(achPath, 512);
-	strlcat(achPath, H264EncoderDLL, 512);
-	
-	g_at264Module = LoadBundle(achPath);
-	
-	if (g_at264Module == NULL)
-		return 1;
-	else
-		return 0;
+// Appends H264EncoderDLL to whatever WelsEncGetCurrentModulePath() produced
+// and loads that bundle into g_at264Module.
+// Returns 0 when a bundle was obtained, 1 otherwise.
+int32_t WelsEncBundleLoad() {
+  char achPath[512] = {0};
+
+  WelsEncGetCurrentModulePath (achPath, sizeof (achPath));
+  strlcat (achPath, H264EncoderDLL, sizeof (achPath));
+
+  g_at264Module = LoadBundle (achPath);
+
+  return (g_at264Module != NULL) ? 0 : 1;
 }
 
-void WelsEncBundleFree()
-{
-	if(g_at264Module != NULL)
-	{
-		CFBundleUnloadExecutable(g_at264Module);
-	}
+// Unloads the executable of the bundle held in g_at264Module, if there is one.
+void WelsEncBundleFree() {
+  if (g_at264Module == NULL) {
+    return;
+  }
+  CFBundleUnloadExecutable (g_at264Module);
 }
 
-int32_t WelsEncBundleCreateEncoder(ISVCEncoder** ppEncoder)
-{
-	if(!g_at264Module)
-		return 1;
-	
-	LPCreateWelsCSEncoder pfuncCreateCSEnc = 
-	(LPCreateWelsCSEncoder)GetProcessAddress(g_at264Module, "CreateSVCEncoder");
-	
-	if(pfuncCreateCSEnc != NULL)
-	{
-		return (pfuncCreateCSEnc( ppEncoder ));
-	}
-	
-	return 1;
+// Resolves "CreateSVCEncoder" in g_at264Module and forwards ppEncoder to it.
+// Returns that function's result, or 1 when no bundle is loaded or the
+// symbol cannot be found.
+int32_t WelsEncBundleCreateEncoder (ISVCEncoder** ppEncoder) {
+  if (!g_at264Module)
+    return 1;
+
+  LPCreateWelsCSEncoder pfnCreate =
+    (LPCreateWelsCSEncoder)GetProcessAddress (g_at264Module, "CreateSVCEncoder");
+  if (pfnCreate == NULL) {
+    return 1;
+  }
+
+  return pfnCreate (ppEncoder);
 }
 
-int32_t WelsEncBundleDestroyEncoder(ISVCEncoder* pEncoder)
-{
-	if(!g_at264Module)
-		return 1;
-	
-	LPDestroyWelsCSEncoder pfuncDestroyCSEnc = 
-	(LPDestroyWelsCSEncoder)GetProcessAddress(g_at264Module, "DestroySVCEncoder");
-	
-	if(pfuncDestroyCSEnc != NULL){
-		pfuncDestroyCSEnc( pEncoder );
-		return 0;
-	}
-	else
-		return 1;
+// Resolves "DestroySVCEncoder" in g_at264Module and calls it with pEncoder.
+// Returns 0 after the call, or 1 when no bundle is loaded or the symbol
+// cannot be found.
+int32_t WelsEncBundleDestroyEncoder (ISVCEncoder* pEncoder) {
+  if (!g_at264Module)
+    return 1;
+
+  LPDestroyWelsCSEncoder pfnDestroy =
+    (LPDestroyWelsCSEncoder)GetProcessAddress (g_at264Module, "DestroySVCEncoder");
+  if (pfnDestroy == NULL) {
+    return 1;
+  }
+
+  pfnDestroy (pEncoder);
+  return 0;
 }
 
 
--- a/codec/console/enc/src/read_config.cpp
+++ b/codec/console/enc/src/read_config.cpp
@@ -1,160 +1,147 @@
-/*!
- * \copy
- *     Copyright (c)  2008-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *  read_config.h
- *
- *  Abstract
- *      Class for reading parameter settings in a configure file.
- *
- *  History
- *      08/18/2008 Created
- *
- *****************************************************************************/
-
-#include <stdio.h>
-#include <string.h>
-#include "read_config.h"
-
-#if defined(_MSC_VER)
-#pragma warning(push)
-#pragma warning(disable:4996)
-#endif
-
-CReadConfig::CReadConfig()
-: m_pCfgFile( NULL )
-, m_strCfgFileName("")
-, m_iLines( 0 )
-{
-}
-
-CReadConfig::CReadConfig( const char *kpConfigFileName )
-: m_pCfgFile(0)
-, m_strCfgFileName(kpConfigFileName)
-, m_iLines(0)
-{
-	if ( strlen(kpConfigFileName) > 0 ){	// confirmed_safe_unsafe_usage
-		m_pCfgFile = fopen(kpConfigFileName, "r");
-	}
-}
-
-CReadConfig::CReadConfig( const string& kpConfigFileName )
-: m_pCfgFile(0)
-, m_strCfgFileName(kpConfigFileName)
-, m_iLines(0)
-{
-	if ( kpConfigFileName.length() > 0 )
-	{
-		m_pCfgFile = fopen(kpConfigFileName.c_str(), "r");
-	}
-}
-
-CReadConfig::~CReadConfig()
-{
-	if ( m_pCfgFile ){
-		fclose( m_pCfgFile );
-		m_pCfgFile = NULL;
-	}
-}
-
-void CReadConfig::Openf(const char *kpStrFile)
-{
-	if ( kpStrFile != NULL && strlen(kpStrFile) > 0 )	// confirmed_safe_unsafe_usage
-	{
-		m_strCfgFileName = kpStrFile;
-		m_pCfgFile = fopen(kpStrFile, "r");
-	}
-}
-
-long CReadConfig::ReadLine( string* pVal, const int kiValSize/* = 4*/ )
-{
-	if ( m_pCfgFile == NULL || pVal == NULL || kiValSize <= 1)
-		return 0;
-	
-	string *strTags = &pVal[0];
-	int nTagNum = 0, n = 0;
-	bool bCommentFlag = false;	
-	
-	while (n < kiValSize) {
-		pVal[n]	= "";
-		++ n;
-	}	
-
-	do {
-		const char kCh = (char)fgetc(m_pCfgFile);
-		
-		if ( kCh == '\n' || feof(m_pCfgFile) ){
-			++ m_iLines;
-			break;
-		}
-		if ( kCh == '#' )
-			bCommentFlag = true;
-		if ( !bCommentFlag ){
-			if ( kCh == '\t' || kCh == ' ' ){
-				if ( nTagNum >= kiValSize )
-					break;
-				if ( !(*strTags).empty() ){
-					++ nTagNum;
-					strTags	= &pVal[nTagNum];
-				}
-			}
-			else
-				*strTags += kCh;
-		}
-		
-	} while(true);
-	
-	return 1+nTagNum;
-}
-
-const bool CReadConfig::EndOfFile()
-{
-	if (m_pCfgFile == NULL)
-		return true;
-	return feof(m_pCfgFile) ? true : false;
-}
-
-const int CReadConfig::GetLines()
-{
-	return m_iLines;
-}
-
-const bool CReadConfig::ExistFile()
-{
-	return (m_pCfgFile != NULL);
-}
-
-const string& CReadConfig::GetFileName()
-{
-	return m_strCfgFileName;
-}
-
-#if defined(_MSC_VER)
-#pragma warning(pop)
-#endif
+/*!
+ * \copy
+ *     Copyright (c)  2008-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *  read_config.cpp
+ *
+ *  Abstract
+ *      Class for reading parameter settings in a configure file.
+ *
+ *  History
+ *      08/18/2008 Created
+ *
+ *****************************************************************************/
+
+#include <stdio.h>
+#include <string.h>
+#include "read_config.h"
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable:4996)
+#endif
+
+// Creates a reader with no file opened, an empty file name and a zero line count.
+CReadConfig::CReadConfig()
+  : m_pCfgFile (NULL)
+  , m_strCfgFileName()
+  , m_iLines (0) {
+}
+
+// Remembers kpConfigFileName and, when it is a non-empty string, opens that
+// file for reading.  A NULL name is handled like an empty one (no file is
+// opened) instead of being passed to std::string / strlen, matching the NULL
+// check Openf() already performs.
+// m_pCfgFile stays NULL when no file is opened or fopen() fails.
+CReadConfig::CReadConfig (const char* kpConfigFileName)
+  : m_pCfgFile (0)
+  , m_strCfgFileName (kpConfigFileName != NULL ? kpConfigFileName : "")
+  , m_iLines (0) {
+  if (kpConfigFileName != NULL && kpConfigFileName[0] != '\0') {
+    m_pCfgFile = fopen (kpConfigFileName, "r");
+  }
+}
+
+// Remembers kpConfigFileName and opens that file for reading unless the name
+// is empty.  m_pCfgFile stays NULL when no file is opened or fopen() fails.
+CReadConfig::CReadConfig (const string& kpConfigFileName)
+  : m_pCfgFile (0)
+  , m_strCfgFileName (kpConfigFileName)
+  , m_iLines (0) {
+  if (kpConfigFileName.empty()) {
+    return;
+  }
+  m_pCfgFile = fopen (kpConfigFileName.c_str(), "r");
+}
+
+// Closes the configuration file if one is open.
+CReadConfig::~CReadConfig() {
+  if (m_pCfgFile == NULL) {
+    return;
+  }
+  fclose (m_pCfgFile);
+  m_pCfgFile = NULL;
+}
+
+// Points the reader at kpStrFile.  A NULL or empty name is ignored and leaves
+// the object untouched; otherwise the name is stored and the file is opened
+// for reading (m_pCfgFile becomes NULL if fopen() fails).
+void CReadConfig::Openf (const char* kpStrFile) {
+  if (kpStrFile == NULL || kpStrFile[0] == '\0') {
+    return;
+  }
+
+  // Close a file left open by a constructor or an earlier Openf() call;
+  // simply overwriting the handle would leak it.
+  if (m_pCfgFile != NULL) {
+    fclose (m_pCfgFile);
+  }
+
+  m_strCfgFileName = kpStrFile;
+  m_pCfgFile = fopen (kpStrFile, "r");
+}
+
+// Reads one line of the configuration file and splits it into tags.
+//   pVal      - array of at least kiValSize strings; every entry is cleared,
+//               then pVal[0], pVal[1], ... receive the blank/tab separated
+//               tags of the line.
+//   kiValSize - number of entries in pVal; must be greater than 1.
+// Everything from '#' to the end of the line is ignored.  Tags beyond the
+// kiValSize-th are dropped rather than stored.
+// Returns 0 when no file is open or an argument is unusable; otherwise one
+// more than the number of tag separators consumed (so always > 0).
+long CReadConfig::ReadLine (string* pVal, const int kiValSize/* = 4*/) {
+  if (m_pCfgFile == NULL || pVal == NULL || kiValSize <= 1)
+    return 0;
+
+  for (int n = 0; n < kiValSize; ++ n) {
+    pVal[n] = "";
+  }
+
+  int nTagNum = 0;            // index of the tag currently being filled
+  bool bCommentFlag = false;  // set once '#' has been seen on this line
+
+  for (;;) {
+    // Keep fgetc()'s int result: EOF is also returned on a read error, in
+    // which case feof() stays false and a char-based test would never stop.
+    const int kiCh = fgetc (m_pCfgFile);
+
+    if (kiCh == '\n' || kiCh == EOF) {
+      ++ m_iLines;
+      break;
+    }
+    if (kiCh == '#')
+      bCommentFlag = true;
+    if (bCommentFlag)
+      continue;
+
+    if (kiCh == '\t' || kiCh == ' ') {
+      // A separator only closes a tag that has content, so runs of blanks
+      // collapse into one.
+      if (nTagNum < kiValSize && !pVal[nTagNum].empty())
+        ++ nTagNum;
+    } else if (nTagNum < kiValSize) {
+      pVal[nTagNum] += (char)kiCh;
+    }
+    // Otherwise every slot is in use: skip the character so the rest of the
+    // line is consumed without writing past the end of pVal.
+  }
+
+  return 1 + nTagNum;
+}
+
+// True when no file is open or the stream's end-of-file indicator is set.
+const bool CReadConfig::EndOfFile() {
+  return (m_pCfgFile == NULL) || (feof (m_pCfgFile) != 0);
+}
+
+// Returns the line counter that ReadLine() increments at each line end or
+// end of input, converted to int.
+const int CReadConfig::GetLines() {
+  return static_cast<int> (m_iLines);
+}
+
+// Tells whether a configuration file is currently open.
+const bool CReadConfig::ExistFile() {
+  const bool kbOpened = (NULL != m_pCfgFile);
+  return kbOpened;
+}
+
+// Returns a reference to the stored configuration file name.
+const string& CReadConfig::GetFileName() {
+  return this->m_strCfgFileName;
+}
+
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
--- a/codec/console/enc/src/welsenc.cpp
+++ b/codec/console/enc/src/welsenc.cpp
@@ -34,6 +34,7 @@
 #include <string.h>
 #include <assert.h>
 #include <signal.h>
+#include <stdarg.h>
 
 #ifdef ONLY_ENC_FRAMES_NUM
 #undef ONLY_ENC_FRAMES_NUM
@@ -75,6 +76,7 @@
 #include "extern.h"
 #include "macros.h"
 #include "wels_const.h"
+#include "logging.h"
 
 #ifdef MT_ENABLED
 #include "mt_defs.h"
@@ -89,8 +91,8 @@
  *	Layer Context
  */
 typedef struct LayerpEncCtx_s {
-	int32_t				iDLayerQp;
-	SMulSliceOption	sMso;
+  int32_t				iDLayerQp;	// QP taken from the "InitialQP" entry of a layer configuration file
+  SMulSliceOption	sMso;	// slice options taken from the "SliceMode"/"SliceSize"/"SliceNum"/"SlicesAssign*" entries
 } SLayerPEncCtx;
 
 
@@ -97,1397 +99,1289 @@
 
 /* Ctrl-C handler */
 static int     g_iCtrlC = 0;
-static void    SigIntHandler( int a )
-{
-    g_iCtrlC = 1;
+/* Signal handler: raises the g_iCtrlC flag; the signal number is not used. */
+static void    SigIntHandler (int a) {
+  (void) a;
+  g_iCtrlC = 1;
 }
 
-int ParseConfig(CReadConfig& cRdCfg, SWelsSvcCodingParam& pSvcParam, SFilesSet& sFileSet)
-{
-	string strTag[4];
-	int32_t iLeftTargetBitrate = 0;
-	int32_t	iLeftSpatialBitrate[MAX_DEPENDENCY_LAYER] = { 0 };
-	int32_t iRet = 0;
-	int8_t iLayerCount = 0;
-	string str_("SlicesAssign");
-	const int kiSize = str_.size();
-	
+/*
+ *	Fill pSvcParam and sFileSet from the encoder configuration files.
+ *
+ *	First the main configuration is read line by line through cRdCfg; every
+ *	recognised key updates the matching field, unknown keys are ignored.
+ *	Then, for each layer (the smaller of "NumLayers" and the number of
+ *	"LayerCfg" lines), the layer file named by "LayerCfg" is opened and parsed
+ *	the same way. A layer file that cannot be opened is reported on stderr and
+ *	skipped.
+ *
+ *	Returns 0 on success and 1 when a parameter is rejected.
+ */
+int ParseConfig (CReadConfig& cRdCfg, SWelsSvcCodingParam& pSvcParam, SFilesSet& sFileSet) {
+  string strTag[4];
+  int32_t iLeftTargetBitrate = 0;
+  int32_t	iLeftSpatialBitrate[MAX_DEPENDENCY_LAYER] = { 0 };
+  int32_t iRet = 0;
+  int8_t iLayerCount = 0;
+  string str_ ("SlicesAssign");
+  const int kiSize = str_.size();
+
 //	memset(&pSvcParam, 0, sizeof(WelsSVCParamConfig));
 
-	while ( !cRdCfg.EndOfFile() ){
-		long iRd = cRdCfg.ReadLine(&strTag[0]);
-		if (iRd > 0){
-			if ( strTag[0].empty() )
-				continue;
-			if (strTag[0].compare("OutputFile") == 0){			
-				sFileSet.strBsFile	= strTag[1];
-				continue;
-			}
-			else if (strTag[0].compare("MaxFrameRate") == 0){
-				pSvcParam.fMaxFrameRate	= (float)atof(strTag[1].c_str());
-				continue;
-			}
-			else if (strTag[0].compare("FramesToBeEncoded") == 0){
-				pSvcParam.uiFrameToBeCoded	= atoi(strTag[1].c_str());
-				continue;
-			}
-			else if ( strTag[0].compare("SourceSequenceInRGB24") == 0 ){
-				pSvcParam.iInputCsp	= atoi(strTag[1].c_str()) == 0 ? videoFormatI420 : videoFormatRGB;
-				continue;
-			}
-			else if (strTag[0].compare("GOPSize") == 0){
-				pSvcParam.uiGopSize	= atoi(strTag[1].c_str());
-				continue;
-			}
-			else if (strTag[0].compare("IntraPeriod") == 0){
-				pSvcParam.uiIntraPeriod	= atoi(strTag[1].c_str());
-				continue;
-			}
-			else if (strTag[0].compare("EnableSpsPpsIDAddition") == 0)
-			{
-				pSvcParam.bEnableSpsPpsIdAddition	= atoi(strTag[1].c_str())?true:false; 
-				continue;
-			}
-			else if (strTag[0].compare("EnableScalableSEI") == 0)
-			{
-				pSvcParam.bEnableSSEI	= atoi(strTag[1].c_str())?true:false;
-				continue;
-			}
-			else if (strTag[0].compare("EnableFrameCropping") == 0)
-			{
-				pSvcParam.bEnableFrameCroppingFlag = (atoi(strTag[1].c_str()) != 0);	
-				continue;
-			}
-			else if (strTag[0].compare("LoopFilterDisableIDC") == 0){
-				pSvcParam.iLoopFilterDisableIdc	= (int8_t)atoi(strTag[1].c_str());
-				if (pSvcParam.iLoopFilterDisableIdc > 6 || pSvcParam.iLoopFilterDisableIdc < 0){
-					fprintf(stderr, "Invalid parameter in iLoopFilterDisableIdc: %d.\n", pSvcParam.iLoopFilterDisableIdc);
-					iRet = 1;
-					break;
-				}
-				continue;
-			}
-			else if (strTag[0].compare("LoopFilterAlphaC0Offset") == 0){
-				pSvcParam.iLoopFilterAlphaC0Offset	= (int8_t)atoi(strTag[1].c_str());
-				if ( pSvcParam.iLoopFilterAlphaC0Offset < -6 )
-					pSvcParam.iLoopFilterAlphaC0Offset	= -6;
-				else if ( pSvcParam.iLoopFilterAlphaC0Offset > 6 )
-					pSvcParam.iLoopFilterAlphaC0Offset	= 6;
-				continue;
-			}
-			else if (strTag[0].compare("LoopFilterBetaOffset") == 0){
-				pSvcParam.iLoopFilterBetaOffset	= (int8_t)atoi(strTag[1].c_str());
-				if ( pSvcParam.iLoopFilterBetaOffset < -6 )
-					pSvcParam.iLoopFilterBetaOffset	= -6;
-				else if ( pSvcParam.iLoopFilterBetaOffset > 6 )
-					pSvcParam.iLoopFilterBetaOffset	= 6;
-				continue;
-			}
-			else if (strTag[0].compare("InterLayerLoopFilterDisableIDC") == 0){
-				pSvcParam.iInterLayerLoopFilterDisableIdc = (int8_t)atoi(strTag[1].c_str());
-				if (pSvcParam.iInterLayerLoopFilterDisableIdc > 6 || pSvcParam.iInterLayerLoopFilterDisableIdc < 0){
-					fprintf(stderr, "Invalid parameter in iInterLayerLoopFilterDisableIdc: %d.\n", pSvcParam.iInterLayerLoopFilterDisableIdc);
-					iRet = 1;
-					break;
-				}
-				continue;
-			}
-			else if (strTag[0].compare("InterLayerLoopFilterAlphaC0Offset") == 0){
-				pSvcParam.iInterLayerLoopFilterAlphaC0Offset	= (int8_t)atoi(strTag[1].c_str());
-				if ( pSvcParam.iInterLayerLoopFilterAlphaC0Offset < -6 )
-					pSvcParam.iInterLayerLoopFilterAlphaC0Offset	= -6;
-				else if ( pSvcParam.iInterLayerLoopFilterAlphaC0Offset > 6 )
-					pSvcParam.iInterLayerLoopFilterAlphaC0Offset	= 6;
-				continue;
-			}
-			else if (strTag[0].compare("InterLayerLoopFilterBetaOffset") == 0){
-				pSvcParam.iInterLayerLoopFilterBetaOffset	= (int8_t)atoi(strTag[1].c_str());
-				if ( pSvcParam.iInterLayerLoopFilterBetaOffset < -6 )
-					pSvcParam.iInterLayerLoopFilterBetaOffset	= -6;
-				else if ( pSvcParam.iInterLayerLoopFilterBetaOffset > 6 )
-					pSvcParam.iInterLayerLoopFilterBetaOffset	= 6;
-				continue;
-			}			
-			else if ( strTag[0].compare("MultipleThreadIdc") == 0 )
-			{
-				// # 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
-				pSvcParam.iMultipleThreadIdc	= atoi( strTag[1].c_str() );
-				if ( pSvcParam.iMultipleThreadIdc < 0 )
-					pSvcParam.iMultipleThreadIdc = 0;
-				else if ( pSvcParam.iMultipleThreadIdc > MAX_THREADS_NUM )
-					 pSvcParam.iMultipleThreadIdc = MAX_THREADS_NUM;
-				continue;
-			}
-			else if (strTag[0].compare("EnableRC") == 0){
-				pSvcParam.bEnableRc	= atoi(strTag[1].c_str())?true:false;
-				continue;
-			}
-			else if (strTag[0].compare("RCMode") == 0){
-				pSvcParam.iRCMode	= atoi(strTag[1].c_str());
-				continue;
-			}
-			else if (strTag[0].compare("TargetBitrate") == 0){
-				pSvcParam.iTargetBitrate	= 1000 * atoi(strTag[1].c_str());
-				if ( pSvcParam.bEnableRc && pSvcParam.iTargetBitrate <= 0 ){
-					fprintf(stderr, "Invalid target bitrate setting due to RC enabled. Check TargetBitrate field please!\n");
-					return 1;
-				}
-				if ( pSvcParam.bEnableRc ){
-					iLeftTargetBitrate	= pSvcParam.iTargetBitrate;
-				}
-				continue;
-			}
-			else if (strTag[0].compare("EnableDenoise") == 0){
-				pSvcParam.bEnableDenoise	= atoi(strTag[1].c_str())?true:false;
-				continue;
-			}
-			else if (strTag[0].compare("EnableSceneChangeDetection") == 0){
-				pSvcParam.bEnableSceneChangeDetect	= atoi(strTag[1].c_str())?true:false;
-				continue;
-			}
-			else if (strTag[0].compare("EnableBackgroundDetection") == 0)
-			{
-				pSvcParam.bEnableBackgroundDetection	= atoi(strTag[1].c_str())?true:false;
-				continue;
-			}
-			else if (strTag[0].compare("EnableAdaptiveQuantization") == 0){
-				pSvcParam.bEnableAdaptiveQuant	= atoi(strTag[1].c_str())?true:false;
-				continue;
-			}
-			else if (strTag[0].compare("EnableLongTermReference") == 0){
-				pSvcParam.bEnableLongTermReference	= atoi(strTag[1].c_str())?true:false;
-				continue;
-			}
-			else if (strTag[0].compare("LtrMarkPeriod") == 0){
-				pSvcParam.uiLtrMarkPeriod	= (uint32_t)atoi(strTag[1].c_str());
-				continue;
-			}
-			else if (strTag[0].compare("NumLayers") == 0){
-				pSvcParam.iNumDependencyLayer	= (int8_t)atoi(strTag[1].c_str());
-				if (pSvcParam.iNumDependencyLayer > MAX_DEPENDENCY_LAYER || pSvcParam.iNumDependencyLayer <= 0){
-					fprintf(stderr, "Invalid parameter in iNumDependencyLayer: %d.\n", pSvcParam.iNumDependencyLayer);
-					iRet = 1;
-					break;
-				}
-				continue;
-			}
-			else if (strTag[0].compare("LayerCfg") == 0){		
-				if ( strTag[1].length() > 0 )
-					sFileSet.sSpatialLayers[iLayerCount].strLayerCfgFile	= strTag[1];
+  // Pass 1: main configuration file, one "Key Value" pair per line
+  while (!cRdCfg.EndOfFile()) {
+    long iRd = cRdCfg.ReadLine (&strTag[0]);
+    if (iRd > 0) {
+      if (strTag[0].empty())
+        continue;
+      if (strTag[0].compare ("OutputFile") == 0) {
+        sFileSet.strBsFile	= strTag[1];
+        continue;
+      } else if (strTag[0].compare ("MaxFrameRate") == 0) {
+        pSvcParam.fMaxFrameRate	= (float)atof (strTag[1].c_str());
+        continue;
+      } else if (strTag[0].compare ("FramesToBeEncoded") == 0) {
+        pSvcParam.uiFrameToBeCoded	= atoi (strTag[1].c_str());
+        continue;
+      } else if (strTag[0].compare ("SourceSequenceInRGB24") == 0) {
+        pSvcParam.iInputCsp	= atoi (strTag[1].c_str()) == 0 ? videoFormatI420 : videoFormatRGB;
+        continue;
+      } else if (strTag[0].compare ("GOPSize") == 0) {
+        pSvcParam.uiGopSize	= atoi (strTag[1].c_str());
+        continue;
+      } else if (strTag[0].compare ("IntraPeriod") == 0) {
+        pSvcParam.uiIntraPeriod	= atoi (strTag[1].c_str());
+        continue;
+      } else if (strTag[0].compare ("EnableSpsPpsIDAddition") == 0) {
+        pSvcParam.bEnableSpsPpsIdAddition	= atoi (strTag[1].c_str()) ? true : false;
+        continue;
+      } else if (strTag[0].compare ("EnableScalableSEI") == 0) {
+        pSvcParam.bEnableSSEI	= atoi (strTag[1].c_str()) ? true : false;
+        continue;
+      } else if (strTag[0].compare ("EnableFrameCropping") == 0) {
+        pSvcParam.bEnableFrameCroppingFlag = (atoi (strTag[1].c_str()) != 0);
+        continue;
+      } else if (strTag[0].compare ("LoopFilterDisableIDC") == 0) {
+        pSvcParam.iLoopFilterDisableIdc	= (int8_t)atoi (strTag[1].c_str());
+        if (pSvcParam.iLoopFilterDisableIdc > 6 || pSvcParam.iLoopFilterDisableIdc < 0) {
+          fprintf (stderr, "Invalid parameter in iLoopFilterDisableIdc: %d.\n", pSvcParam.iLoopFilterDisableIdc);
+          iRet = 1;
+          break;
+        }
+        continue;
+      } else if (strTag[0].compare ("LoopFilterAlphaC0Offset") == 0) {
+        pSvcParam.iLoopFilterAlphaC0Offset	= (int8_t)atoi (strTag[1].c_str());
+        if (pSvcParam.iLoopFilterAlphaC0Offset < -6)
+          pSvcParam.iLoopFilterAlphaC0Offset	= -6;
+        else if (pSvcParam.iLoopFilterAlphaC0Offset > 6)
+          pSvcParam.iLoopFilterAlphaC0Offset	= 6;
+        continue;
+      } else if (strTag[0].compare ("LoopFilterBetaOffset") == 0) {
+        pSvcParam.iLoopFilterBetaOffset	= (int8_t)atoi (strTag[1].c_str());
+        if (pSvcParam.iLoopFilterBetaOffset < -6)
+          pSvcParam.iLoopFilterBetaOffset	= -6;
+        else if (pSvcParam.iLoopFilterBetaOffset > 6)
+          pSvcParam.iLoopFilterBetaOffset	= 6;
+        continue;
+      } else if (strTag[0].compare ("InterLayerLoopFilterDisableIDC") == 0) {
+        pSvcParam.iInterLayerLoopFilterDisableIdc = (int8_t)atoi (strTag[1].c_str());
+        if (pSvcParam.iInterLayerLoopFilterDisableIdc > 6 || pSvcParam.iInterLayerLoopFilterDisableIdc < 0) {
+          fprintf (stderr, "Invalid parameter in iInterLayerLoopFilterDisableIdc: %d.\n",
+                   pSvcParam.iInterLayerLoopFilterDisableIdc);
+          iRet = 1;
+          break;
+        }
+        continue;
+      } else if (strTag[0].compare ("InterLayerLoopFilterAlphaC0Offset") == 0) {
+        pSvcParam.iInterLayerLoopFilterAlphaC0Offset	= (int8_t)atoi (strTag[1].c_str());
+        if (pSvcParam.iInterLayerLoopFilterAlphaC0Offset < -6)
+          pSvcParam.iInterLayerLoopFilterAlphaC0Offset	= -6;
+        else if (pSvcParam.iInterLayerLoopFilterAlphaC0Offset > 6)
+          pSvcParam.iInterLayerLoopFilterAlphaC0Offset	= 6;
+        continue;
+      } else if (strTag[0].compare ("InterLayerLoopFilterBetaOffset") == 0) {
+        pSvcParam.iInterLayerLoopFilterBetaOffset	= (int8_t)atoi (strTag[1].c_str());
+        if (pSvcParam.iInterLayerLoopFilterBetaOffset < -6)
+          pSvcParam.iInterLayerLoopFilterBetaOffset	= -6;
+        else if (pSvcParam.iInterLayerLoopFilterBetaOffset > 6)
+          pSvcParam.iInterLayerLoopFilterBetaOffset	= 6;
+        continue;
+      } else if (strTag[0].compare ("MultipleThreadIdc") == 0) {
+        // # 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
+        pSvcParam.iMultipleThreadIdc	= atoi (strTag[1].c_str());
+        if (pSvcParam.iMultipleThreadIdc < 0)
+          pSvcParam.iMultipleThreadIdc = 0;
+        else if (pSvcParam.iMultipleThreadIdc > MAX_THREADS_NUM)
+          pSvcParam.iMultipleThreadIdc = MAX_THREADS_NUM;
+        continue;
+      } else if (strTag[0].compare ("EnableRC") == 0) {
+        pSvcParam.bEnableRc	= atoi (strTag[1].c_str()) ? true : false;
+        continue;
+      } else if (strTag[0].compare ("RCMode") == 0) {
+        pSvcParam.iRCMode	= atoi (strTag[1].c_str());
+        continue;
+      } else if (strTag[0].compare ("TargetBitrate") == 0) {
+        pSvcParam.iTargetBitrate	= 1000 * atoi (strTag[1].c_str());
+        if (pSvcParam.bEnableRc && pSvcParam.iTargetBitrate <= 0) {
+          fprintf (stderr, "Invalid target bitrate setting due to RC enabled. Check TargetBitrate field please!\n");
+          return 1;
+        }
+        if (pSvcParam.bEnableRc) {
+          iLeftTargetBitrate	= pSvcParam.iTargetBitrate;
+        }
+        continue;
+      } else if (strTag[0].compare ("EnableDenoise") == 0) {
+        pSvcParam.bEnableDenoise	= atoi (strTag[1].c_str()) ? true : false;
+        continue;
+      } else if (strTag[0].compare ("EnableSceneChangeDetection") == 0) {
+        pSvcParam.bEnableSceneChangeDetect	= atoi (strTag[1].c_str()) ? true : false;
+        continue;
+      } else if (strTag[0].compare ("EnableBackgroundDetection") == 0) {
+        pSvcParam.bEnableBackgroundDetection	= atoi (strTag[1].c_str()) ? true : false;
+        continue;
+      } else if (strTag[0].compare ("EnableAdaptiveQuantization") == 0) {
+        pSvcParam.bEnableAdaptiveQuant	= atoi (strTag[1].c_str()) ? true : false;
+        continue;
+      } else if (strTag[0].compare ("EnableLongTermReference") == 0) {
+        pSvcParam.bEnableLongTermReference	= atoi (strTag[1].c_str()) ? true : false;
+        continue;
+      } else if (strTag[0].compare ("LtrMarkPeriod") == 0) {
+        pSvcParam.uiLtrMarkPeriod	= (uint32_t)atoi (strTag[1].c_str());
+        continue;
+      } else if (strTag[0].compare ("NumLayers") == 0) {
+        pSvcParam.iNumDependencyLayer	= (int8_t)atoi (strTag[1].c_str());
+        if (pSvcParam.iNumDependencyLayer > MAX_DEPENDENCY_LAYER || pSvcParam.iNumDependencyLayer <= 0) {
+          fprintf (stderr, "Invalid parameter in iNumDependencyLayer: %d.\n", pSvcParam.iNumDependencyLayer);
+          iRet = 1;
+          break;
+        }
+        continue;
+      } else if (strTag[0].compare ("LayerCfg") == 0) {
+        // Refuse surplus LayerCfg lines instead of indexing past the per-layer tables.
+        // NOTE(review): assumes sFileSet.sSpatialLayers holds MAX_DEPENDENCY_LAYER entries - confirm
+        if (iLayerCount >= MAX_DEPENDENCY_LAYER) {
+          fprintf (stderr, "Too many LayerCfg entries, at most %d are supported.\n", (int)MAX_DEPENDENCY_LAYER);
+          iRet = 1;
+          break;
+        }
+        if (strTag[1].length() > 0)
+          sFileSet.sSpatialLayers[iLayerCount].strLayerCfgFile	= strTag[1];
 //				pSvcParam.sDependencyLayers[iLayerCount].uiDependencyId	= iLayerCount;
-				++ iLayerCount;
-				continue;
-			}
-			else if (strTag[0].compare("PrefixNALAddingCtrl") == 0){
-				int ctrl_flag = atoi(strTag[1].c_str());
-				if (ctrl_flag > 1)
-					ctrl_flag	= 1;
-				else if (ctrl_flag < 0)
-					ctrl_flag	= 0;
-				pSvcParam.bPrefixNalAddingCtrl	= ctrl_flag?true:false;
-				continue;
-			}
-		}
-	}
+        ++ iLayerCount;
+        continue;
+      } else if (strTag[0].compare ("PrefixNALAddingCtrl") == 0) {
+        int ctrl_flag = atoi (strTag[1].c_str());
+        if (ctrl_flag > 1)
+          ctrl_flag	= 1;
+        else if (ctrl_flag < 0)
+          ctrl_flag	= 0;
+        pSvcParam.bPrefixNalAddingCtrl	= ctrl_flag ? true : false;
+        continue;
+      }
+    }
+  }
 
-	const int8_t kiActualLayerNum = WELS_MIN(pSvcParam.iNumDependencyLayer, iLayerCount);
-	if (pSvcParam.iNumDependencyLayer > kiActualLayerNum){	// fixed number of dependency layer due to parameter error in settings
-		pSvcParam.iNumDependencyLayer	= kiActualLayerNum;
-	}
-	
-	assert( kiActualLayerNum <= MAX_DEPENDENCY_LAYER );
+  const int8_t kiActualLayerNum = WELS_MIN (pSvcParam.iNumDependencyLayer, iLayerCount);
+  if (pSvcParam.iNumDependencyLayer >
+      kiActualLayerNum) {	// fixed number of dependency layer due to parameter error in settings
+    pSvcParam.iNumDependencyLayer	= kiActualLayerNum;
+  }
 
-	for (int8_t iLayer = 0; iLayer < kiActualLayerNum; ++ iLayer){
-		SLayerPEncCtx sLayerCtx;
-		int32_t iLayerArg = -2;
-		int32_t iNumQualityBitrateLayerSet = 0;
+  assert (kiActualLayerNum <= MAX_DEPENDENCY_LAYER);
 
-		SDLayerParam *pDLayer = &pSvcParam.sDependencyLayers[iLayer];
-		CReadConfig cRdLayerCfg( sFileSet.sSpatialLayers[iLayer].strLayerCfgFile );
+  // Pass 2: one configuration file per layer
+  for (int8_t iLayer = 0; iLayer < kiActualLayerNum; ++ iLayer) {
+    SLayerPEncCtx sLayerCtx;
+    int32_t iLayerArg = -2;
+    int32_t iNumQualityBitrateLayerSet = 0;
 
-		memset(&sLayerCtx, 0, sizeof(SLayerPEncCtx));
+    SDLayerParam* pDLayer = &pSvcParam.sDependencyLayers[iLayer];
+    CReadConfig cRdLayerCfg (sFileSet.sSpatialLayers[iLayer].strLayerCfgFile);
 
-		if ( !cRdLayerCfg.ExistFile() ){
-			fprintf(stderr, "Unabled to open layer #%d configuration file: %s.\n", iLayer, cRdLayerCfg.GetFileName().c_str());
-			continue;
-		}
-		
-		while ( !cRdLayerCfg.EndOfFile() ){
-			long iLayerRd = cRdLayerCfg.ReadLine(&strTag[0]);
-			bool_t bFound = false;
-			if (iLayerRd > 0){
-				if ( strTag[0].empty() )
-					continue;
-				if (strTag[0].compare("SourceWidth") == 0){
-					pDLayer->iFrameWidth	= atoi(strTag[1].c_str());
-					pDLayer->iActualWidth= pDLayer->iFrameWidth;
-					continue;
-				}
-				else if (strTag[0].compare("SourceHeight") == 0){
-					pDLayer->iFrameHeight	= atoi(strTag[1].c_str());
-					pDLayer->iActualHeight	= pDLayer->iFrameHeight;
-					continue;
-				}
-				else if (strTag[0].compare("FrameRateIn") == 0){
-					pDLayer->fInputFrameRate	= (float)atof(strTag[1].c_str());
-					continue;
-				}
-				else if (strTag[0].compare("FrameRateOut") == 0){
-					pDLayer->fOutputFrameRate = (float)atof(strTag[1].c_str());
-					continue;
-				}
-				else if (strTag[0].compare("InputFile") == 0){		
-					if ( strTag[1].length() > 0 )
-						sFileSet.sSpatialLayers[iLayer].strSeqFile	= strTag[1];
-					continue;
-				}
-				else if (strTag[0].compare("ReconFile") == 0){
-					const int kiLen = strTag[1].length();
-					if (kiLen >= MAX_FNAME_LEN)
-						return 1;
+    memset (&sLayerCtx, 0, sizeof (SLayerPEncCtx));
+
+    if (!cRdLayerCfg.ExistFile()) {
+      fprintf (stderr, "Unabled to open layer #%d configuration file: %s.\n", iLayer, cRdLayerCfg.GetFileName().c_str());
+      continue;
+    }
+
+    while (!cRdLayerCfg.EndOfFile()) {
+      long iLayerRd = cRdLayerCfg.ReadLine (&strTag[0]);
+      bool_t bFound = false;
+      if (iLayerRd > 0) {
+        if (strTag[0].empty())
+          continue;
+        if (strTag[0].compare ("SourceWidth") == 0) {
+          pDLayer->iFrameWidth	= atoi (strTag[1].c_str());
+          pDLayer->iActualWidth = pDLayer->iFrameWidth;
+          continue;
+        } else if (strTag[0].compare ("SourceHeight") == 0) {
+          pDLayer->iFrameHeight	= atoi (strTag[1].c_str());
+          pDLayer->iActualHeight	= pDLayer->iFrameHeight;
+          continue;
+        } else if (strTag[0].compare ("FrameRateIn") == 0) {
+          pDLayer->fInputFrameRate	= (float)atof (strTag[1].c_str());
+          continue;
+        } else if (strTag[0].compare ("FrameRateOut") == 0) {
+          pDLayer->fOutputFrameRate = (float)atof (strTag[1].c_str());
+          continue;
+        } else if (strTag[0].compare ("InputFile") == 0) {
+          if (strTag[1].length() > 0)
+            sFileSet.sSpatialLayers[iLayer].strSeqFile	= strTag[1];
+          continue;
+        } else if (strTag[0].compare ("ReconFile") == 0) {
+          const int kiLen = strTag[1].length();
+          if (kiLen >= MAX_FNAME_LEN)
+            return 1;
 #ifdef ENABLE_FRAME_DUMP
-					pDLayer->sRecFileName[kiLen] = '\0';
-					strncpy(pDLayer->sRecFileName, strTag[1].c_str(), kiLen);	// confirmed_safe_unsafe_usage
+          pDLayer->sRecFileName[kiLen] = '\0';
+          strncpy (pDLayer->sRecFileName, strTag[1].c_str(), kiLen);	// confirmed_safe_unsafe_usage
 #endif//ENABLE_FRAME_DUMP
-					continue;
-				}
-				else if (strTag[0].compare("ProfileIdc") == 0){
-					pDLayer->uiProfileIdc	= atoi(strTag[1].c_str());
-					continue;
-				}
-				else if (strTag[0].compare("FRExt") == 0){
+          continue;
+        } else if (strTag[0].compare ("ProfileIdc") == 0) {
+          pDLayer->uiProfileIdc	= atoi (strTag[1].c_str());
+          continue;
+        } else if (strTag[0].compare ("FRExt") == 0) {
 //					pDLayer->frext_mode	= (bool_t)atoi(strTag[1].c_str());
-					continue;
-				}
+          continue;
+        }
 
-				if (strTag[0].compare("SpatialBitrate") == 0){
-					pDLayer->iSpatialBitrate	= 1000 * atoi(strTag[1].c_str());
-					if ( pSvcParam.bEnableRc && pDLayer->iSpatialBitrate <= 0 ){
-						fprintf(stderr, "Invalid spatial bitrate(%d) in dependency layer #%d.\n", pDLayer->iSpatialBitrate, iLayer);
-						return 1;
-					}
-					if ( pSvcParam.bEnableRc &&pDLayer->iSpatialBitrate > iLeftTargetBitrate ){ 
-						fprintf(stderr, "Invalid spatial(#%d) bitrate(%d) setting due to unavailable left(%d)!\n", iLayer, pDLayer->iSpatialBitrate, iLeftTargetBitrate);
-						return 1;
-					}
-					iLeftSpatialBitrate[iLayer]	= pDLayer->iSpatialBitrate;
-					continue;
-				}
-				if (strTag[0].compare("InitialQP") == 0){
-					sLayerCtx.iDLayerQp	= atoi(strTag[1].c_str());
-					continue;
-				}
-				if (strTag[0].compare("SliceMode") == 0){
-					sLayerCtx.sMso.uiSliceMode	= (SliceMode)atoi(strTag[1].c_str());
-					continue;
-				}
-				else if (strTag[0].compare("SliceSize") == 0){//SM_DYN_SLICE
-					sLayerCtx.sMso.sSliceArgument.uiSliceSizeConstraint	= (SliceMode)atoi(strTag[1].c_str());
-					continue;
-				}
-				else if (strTag[0].compare("SliceNum") == 0){
-					sLayerCtx.sMso.sSliceArgument.iSliceNum = atoi(strTag[1].c_str());
-					continue;
-				}
-				else if ( strTag[0].compare(0, kiSize, str_ ) == 0 )
-				{
-					const char* kpString = strTag[0].c_str();
-					int uiSliceIdx = atoi(&kpString[kiSize]);
-					assert( uiSliceIdx < MAX_SLICES_NUM );
-					sLayerCtx.sMso.sSliceArgument.uiSliceMbNum[uiSliceIdx] = atoi( strTag[1].c_str() );
-					continue;
-				}
-			}
-		}
-		pDLayer->iDLayerQp	= sLayerCtx.iDLayerQp;
-		pDLayer->sMso.uiSliceMode		= sLayerCtx.sMso.uiSliceMode;		
+        if (strTag[0].compare ("SpatialBitrate") == 0) {
+          pDLayer->iSpatialBitrate	= 1000 * atoi (strTag[1].c_str());
+          if (pSvcParam.bEnableRc && pDLayer->iSpatialBitrate <= 0) {
+            fprintf (stderr, "Invalid spatial bitrate(%d) in dependency layer #%d.\n", pDLayer->iSpatialBitrate, iLayer);
+            return 1;
+          }
+          if (pSvcParam.bEnableRc && pDLayer->iSpatialBitrate > iLeftTargetBitrate) {
+            fprintf (stderr, "Invalid spatial(#%d) bitrate(%d) setting due to unavailable left(%d)!\n", iLayer,
+                     pDLayer->iSpatialBitrate, iLeftTargetBitrate);
+            return 1;
+          }
+          iLeftSpatialBitrate[iLayer]	= pDLayer->iSpatialBitrate;
+          continue;
+        }
+        if (strTag[0].compare ("InitialQP") == 0) {
+          sLayerCtx.iDLayerQp	= atoi (strTag[1].c_str());
+          continue;
+        }
+        if (strTag[0].compare ("SliceMode") == 0) {
+          sLayerCtx.sMso.uiSliceMode	= (SliceMode)atoi (strTag[1].c_str());
+          continue;
+        } else if (strTag[0].compare ("SliceSize") == 0) { //SM_DYN_SLICE
+          sLayerCtx.sMso.sSliceArgument.uiSliceSizeConstraint	= (SliceMode)atoi (strTag[1].c_str());
+          continue;
+        } else if (strTag[0].compare ("SliceNum") == 0) {
+          sLayerCtx.sMso.sSliceArgument.iSliceNum = atoi (strTag[1].c_str());
+          continue;
+        } else if (strTag[0].compare (0, kiSize, str_) == 0) {
+          // Key is "SlicesAssign<N>": the digits after the prefix select the slice entry
+          const char* kpString = strTag[0].c_str();
+          int uiSliceIdx = atoi (&kpString[kiSize]);
+          // Checked at run time: an assert is compiled out with NDEBUG and did not reject negative indices
+          if (uiSliceIdx < 0 || uiSliceIdx >= MAX_SLICES_NUM) {
+            fprintf (stderr, "Invalid slice index(%d) in dependency layer #%d.\n", uiSliceIdx, iLayer);
+            return 1;
+          }
+          sLayerCtx.sMso.sSliceArgument.uiSliceMbNum[uiSliceIdx] = atoi (strTag[1].c_str());
+          continue;
+        }
+      }
+    }
+    pDLayer->iDLayerQp	= sLayerCtx.iDLayerQp;
+    pDLayer->sMso.uiSliceMode		= sLayerCtx.sMso.uiSliceMode;
 
-		memcpy( &pDLayer->sMso, &sLayerCtx.sMso, sizeof(SMulSliceOption) );	// confirmed_safe_unsafe_usage
-		memcpy( &pDLayer->sMso.sSliceArgument.uiSliceMbNum[0], &sLayerCtx.sMso.sSliceArgument.uiSliceMbNum[0], sizeof(sLayerCtx.sMso.sSliceArgument.uiSliceMbNum) );	// confirmed_safe_unsafe_usage
-	}
+    memcpy (&pDLayer->sMso, &sLayerCtx.sMso, sizeof (SMulSliceOption));	// confirmed_safe_unsafe_usage
+    memcpy (&pDLayer->sMso.sSliceArgument.uiSliceMbNum[0], &sLayerCtx.sMso.sSliceArgument.uiSliceMbNum[0],
+            sizeof (sLayerCtx.sMso.sSliceArgument.uiSliceMbNum));	// confirmed_safe_unsafe_usage
+  }
 
-	return iRet;
+  return iRet;
 }
 
-int ParseCommandLine( int argc, char ** argv, SVCEncodingParam & sParam)
-{
-	char * pCmd;
-	int i = 0;
+/*
+ *	Apply command line options to sParam.
+ *
+ *	argc/argv	option tokens to scan, starting at argv[0]
+ *	sParam		encoding parameters updated in place
+ *
+ *	Every option takes one value, except "-ltarb" which takes a layer index
+ *	and a bitrate. An option whose value(s) are missing at the end of the
+ *	list is skipped like any unknown token, so argv is never read past
+ *	argc - 1. Always returns 0.
+ */
+int ParseCommandLine (int argc, char** argv, SVCEncodingParam& sParam) {
+  char* pCmd;
+  int i = 0;
 
-	if (argc <= 0) // no additional pCmd parameters 
-		return 0;
+  if (argc <= 0) // no additional pCmd parameters
+    return 0;
 
-	while ( i < argc )
-	{
-		pCmd = argv[i];
+  while (i < argc) {
+    pCmd = argv[i];
+    // Tokens available after the current one; guards every argv[i + k] access below
+    const int kiArgsLeft = argc - i - 1;
 
-		if( !strcmp(pCmd, "-numl") ) {	// confirmed_safe_unsafe_usage
-			int  iNumSpatial = atoi(argv[i+1]);
-			sParam.iSpatialLayerNum = iNumSpatial;
-			i += 2;
-		} else if( !strcmp(pCmd, "-numt") ) {	// confirmed_safe_unsafe_usage
-			int  iNumTemporal = atoi(argv[i+1]);
-			sParam.iTemporalLayerNum = iNumTemporal;
-			i += 2;
-		} else if( !strcmp(pCmd,"-iper") ) {	// confirmed_safe_unsafe_usage
-			int iPeriod = atoi(argv[i+1]);
-			sParam.iIntraPeriod = iPeriod;
-			i += 2;
-		}
-		else if( !strcmp(pCmd,"-spsid") ) {	// confirmed_safe_unsafe_usage
-			int iSpsPpsId = atoi(argv[i+1]);
-			sParam.bEnableSpsPpsIdAddition = iSpsPpsId?true:false;
-			i += 2;
-		} 
-		else if( !strcmp(pCmd,"-denois") ) {	// confirmed_safe_unsafe_usage
-			int iDenois = atoi(argv[i+1]);
-			sParam.bEnableDenoise = iDenois?true:false;
-			i += 2;
-		} else if( !strcmp(pCmd,"-bgd") ) {	// confirmed_safe_unsafe_usage
-			int iBgd = atoi(argv[i+1]);
-			sParam.bEnableBackgroundDetection = iBgd?true:false;
-			i += 2;
-		} else if( !strcmp(pCmd,"-aq") ) {	// confirmed_safe_unsafe_usage
-			int iAq = atoi(argv[i+1]);
-			sParam.bEnableAdaptiveQuant = iAq?true:false;
-			i += 2;
-		} else if( !strcmp(pCmd,"-ltr") ) {	// confirmed_safe_unsafe_usage
-			int iLtr = atoi(argv[i+1]);
-			sParam.bEnableLongTermReference = iLtr?true:false;
-			i += 2;
-		} else if( !strcmp(pCmd,"-ltrper") ) {	// confirmed_safe_unsafe_usage
-			int iLtrPer = atoi(argv[i+1]);
-			sParam.iLtrMarkPeriod = iLtrPer;
-			i += 2;	
-		} else if( !strcmp(pCmd,"-rcm") ) {	// confirmed_safe_unsafe_usage
-			int iRcMode = atoi(argv[i+1]);
-			sParam.iRCMode = iRcMode;
-			i += 2;
-		} else if( !strcmp(pCmd,"-tarb") ) {	// confirmed_safe_unsafe_usage
-			int iTarB = atoi(argv[i+1]);
-			sParam.iTargetBitrate = iTarB;
-			i += 2;
-		} else if( !strcmp(pCmd,"-ltarb") )	// confirmed_safe_unsafe_usage
-		{
-			int	iLayer = atoi( argv[i+1] );
-			int iSpatialBitrate = atoi( argv[i+2] );
-			sParam.sSpatialLayers[iLayer].iSpatialBitrate	= iSpatialBitrate;
-			i += 3;
-		} else {
-			i ++;
-		}		
-	}
+    if (kiArgsLeft >= 1 && !strcmp (pCmd, "-numl")) {	// confirmed_safe_unsafe_usage
+      int  iNumSpatial = atoi (argv[i + 1]);
+      sParam.iSpatialLayerNum = iNumSpatial;
+      i += 2;
+    } else if (kiArgsLeft >= 1 && !strcmp (pCmd, "-numt")) {	// confirmed_safe_unsafe_usage
+      int  iNumTemporal = atoi (argv[i + 1]);
+      sParam.iTemporalLayerNum = iNumTemporal;
+      i += 2;
+    } else if (kiArgsLeft >= 1 && !strcmp (pCmd, "-iper")) {	// confirmed_safe_unsafe_usage
+      int iPeriod = atoi (argv[i + 1]);
+      sParam.iIntraPeriod = iPeriod;
+      i += 2;
+    } else if (kiArgsLeft >= 1 && !strcmp (pCmd, "-spsid")) {	// confirmed_safe_unsafe_usage
+      int iSpsPpsId = atoi (argv[i + 1]);
+      sParam.bEnableSpsPpsIdAddition = iSpsPpsId ? true : false;
+      i += 2;
+    } else if (kiArgsLeft >= 1 && !strcmp (pCmd, "-denois")) {	// confirmed_safe_unsafe_usage
+      int iDenois = atoi (argv[i + 1]);
+      sParam.bEnableDenoise = iDenois ? true : false;
+      i += 2;
+    } else if (kiArgsLeft >= 1 && !strcmp (pCmd, "-bgd")) {	// confirmed_safe_unsafe_usage
+      int iBgd = atoi (argv[i + 1]);
+      sParam.bEnableBackgroundDetection = iBgd ? true : false;
+      i += 2;
+    } else if (kiArgsLeft >= 1 && !strcmp (pCmd, "-aq")) {	// confirmed_safe_unsafe_usage
+      int iAq = atoi (argv[i + 1]);
+      sParam.bEnableAdaptiveQuant = iAq ? true : false;
+      i += 2;
+    } else if (kiArgsLeft >= 1 && !strcmp (pCmd, "-ltr")) {	// confirmed_safe_unsafe_usage
+      int iLtr = atoi (argv[i + 1]);
+      sParam.bEnableLongTermReference = iLtr ? true : false;
+      i += 2;
+    } else if (kiArgsLeft >= 1 && !strcmp (pCmd, "-ltrper")) {	// confirmed_safe_unsafe_usage
+      int iLtrPer = atoi (argv[i + 1]);
+      sParam.iLtrMarkPeriod = iLtrPer;
+      i += 2;
+    } else if (kiArgsLeft >= 1 && !strcmp (pCmd, "-rcm")) {	// confirmed_safe_unsafe_usage
+      int iRcMode = atoi (argv[i + 1]);
+      sParam.iRCMode = iRcMode;
+      i += 2;
+    } else if (kiArgsLeft >= 1 && !strcmp (pCmd, "-tarb")) {	// confirmed_safe_unsafe_usage
+      int iTarB = atoi (argv[i + 1]);
+      sParam.iTargetBitrate = iTarB;
+      i += 2;
+    } else if (kiArgsLeft >= 2 && !strcmp (pCmd, "-ltarb")) {	// confirmed_safe_unsafe_usage
+      int	iLayer = atoi (argv[i + 1]);
+      int iSpatialBitrate = atoi (argv[i + 2]);
+      // NOTE(review): assumes sSpatialLayers is a fixed-size array member of SVCEncodingParam - confirm
+      const int kiMaxLayers = (int) (sizeof (sParam.sSpatialLayers) / sizeof (sParam.sSpatialLayers[0]));
+      if (iLayer >= 0 && iLayer < kiMaxLayers)
+        sParam.sSpatialLayers[iLayer].iSpatialBitrate	= iSpatialBitrate;
+      else
+        fprintf (stderr, "Invalid layer index(%d) given to -ltarb, option ignored.\n", iLayer);
+      i += 3;
+    } else if (kiArgsLeft >= 1 && !strcmp (pCmd, "-trace")) {
+      int32_t iLog = atoi (argv[i + 1]);
+      WelsStderrSetTraceLevel (iLog);
+      i += 2;
+    } else if (kiArgsLeft >= 1 && !strcmp (pCmd, "-sw")) {
+      int iWidth = atoi (argv[i + 1]);
+      sParam.iPicWidth = iWidth;
+      i += 2;
+    } else if (kiArgsLeft >= 1 && !strcmp (pCmd, "-sh")) {
+      int iHeight = atoi (argv[i + 1]);
+      sParam.iPicHeight = iHeight;
+      i += 2;
+    } else {
+      i ++;
+    }
+  }
 
-    return 0;
+  return 0;
 }
 
-void PrintHelp()
-{
-	printf("\n Wels SVC Encoder Usage:\n\n");
-	printf(" Syntax: welsenc.exe welsenc.cfg\n");
-	printf(" Syntax: welsenc.exe welsenc.cfg [options]\n");
+void PrintHelp() {  // Print the welsenc usage syntax and the supported command-line options via printf
+  printf ("\n Wels SVC Encoder Usage:\n\n");
+  printf (" Syntax: welsenc.exe welsenc.cfg\n");
+  printf (" Syntax: welsenc.exe welsenc.cfg [options]\n");
 
-	printf("\n Supported Options:\n");
-	printf("  -h      Print Help\n");
-	printf("  -bf     Bit Stream File\n");
-	printf("  -frms   Number of total frames to be encoded\n");
-	printf("  -gop    GOPSize - GOP size (2,4,8,16,32,64, default: 1)\n");
-	printf("  -iper   Intra period (default: -1) : must be a power of 2 of GOP size (or -1)\n");
-	printf("  -spsid   Enable id adding in SPS/PPS per IDR \n");
-	printf("  -denois Control denoising  (default: 0)\n");
-	printf("  -scene  Control scene change detection (default: 0)\n");
-	printf("  -bgd    Control background detection (default: 0)\n");
-	printf("  -aq     Control adaptive quantization (default: 0)\n");
-	printf("  -ltr    Control long term reference (default: 0)\n");
-	printf("  -rc	  Control rate control: 0-disable; 1-enable \n");
-	printf("  -tarb	  Overall target bitrate\n");
-	printf("  -numl   Number Of Layers: Must exist with layer_cfg file and the number of input layer_cfg file must equal to the value set by this command\n");
-	printf("  The options below are layer-based: (need to be set with layer id)\n");
-	printf("  -org		(Layer) (original file); example: -org 0 src.yuv\n");
-	printf("  -drec		(Layer) (reconstruction file); Setting the reconstruction file, this will only functioning when dumping reconstruction is enabled\n");
-	printf("  -sw		(Layer) (source width)\n");
-	printf("  -sh		(Layer) (source height)\n");
-	printf("  -frin		(Layer) (input frame rate)\n");
-	printf("  -frout  	(Layer) (output frame rate)\n");
-	printf("  -lqp		(Layer) (base quality layer qp : must work with -ldeltaqp or -lqparr)\n");
-	printf("  -ltarb	    (Layer) (spatial layer target bitrate)\n");
-	printf("  -slcmd   (Layer) (spatial layer slice mode): pls refer to layerX.cfg for details ( -slcnum: set target slice num; -slcsize: set target slice size constraint ) \n");
-	printf("\n");
+  printf ("\n Supported Options:\n");  // NOTE(review): -ltrper is accepted by ParseCommandLine but is not listed below
+  printf ("  -h      Print Help\n");
+  printf ("  -bf     Bit Stream File\n");
+  printf ("  -frms   Number of total frames to be encoded\n");
+  printf ("  -gop    GOPSize - GOP size (2,4,8,16,32,64, default: 1)\n");
+  printf ("  -iper   Intra period (default: -1) : must be a power of 2 of GOP size (or -1)\n");
+  printf ("  -spsid   Enable id adding in SPS/PPS per IDR \n");
+  printf ("  -denois Control denoising  (default: 0)\n");
+  printf ("  -scene  Control scene change detection (default: 0)\n");
+  printf ("  -bgd    Control background detection (default: 0)\n");
+  printf ("  -aq     Control adaptive quantization (default: 0)\n");
+  printf ("  -ltr    Control long term reference (default: 0)\n");
+  printf ("  -rc	  Control rate control: 0-disable; 1-enable \n");
+  printf ("  -tarb	  Overall target bitrate\n");
+  printf ("  -numl   Number Of Layers: Must exist with layer_cfg file and the number of input layer_cfg file must equal to the value set by this command\n");
+  printf ("  The options below are layer-based: (need to be set with layer id)\n");  // every option after this line takes a layer id before its value
+  printf ("  -org		(Layer) (original file); example: -org 0 src.yuv\n");
+  printf ("  -drec		(Layer) (reconstruction file); Setting the reconstruction file, this will only functioning when dumping reconstruction is enabled\n");
+  printf ("  -sw		(Layer) (source width)\n");
+  printf ("  -sh		(Layer) (source height)\n");
+  printf ("  -frin		(Layer) (input frame rate)\n");
+  printf ("  -frout  	(Layer) (output frame rate)\n");
+  printf ("  -lqp		(Layer) (base quality layer qp : must work with -ldeltaqp or -lqparr)\n");
+  printf ("  -ltarb	    (Layer) (spatial layer target bitrate)\n");
+  printf ("  -slcmd   (Layer) (spatial layer slice mode): pls refer to layerX.cfg for details ( -slcnum: set target slice num; -slcsize: set target slice size constraint ) \n");
+  printf ("\n");
 }
 
-int ParseCommandLine(int argc, char** argv, SWelsSvcCodingParam & pSvcParam, SFilesSet& sFileSet) 
-{
-	char* pCommand = NULL;
-	char* pTemp = NULL;
-	unsigned int uiQpChangeFlag[4] = {0};
-	unsigned int uiQlPredModeChangeFlag[4] = {0};
-	SLayerPEncCtx sLayerCtx[3];
-	int n = 0;
-	string str_("SlicesAssign");
-	const int kiSize = str_.size();
+int ParseCommandLine (int argc, char** argv, SWelsSvcCodingParam& pSvcParam, SFilesSet& sFileSet) {  // Apply the options in argv[0..argc) to pSvcParam/sFileSet; returns 0, or 1 when a layer cfg ReconFile name is too long (ENABLE_FRAME_DUMP builds only)
+  char* pCommand = NULL;
+  char* pTemp = NULL;  // NOTE(review): never referenced in this function
+  unsigned int uiQpChangeFlag[4] = {0};
+  unsigned int uiQlPredModeChangeFlag[4] = {0};  // NOTE(review): never referenced in this function
+  SLayerPEncCtx sLayerCtx[3];  // only used by -lqp; NOTE(review): indexed there by an unchecked user-supplied layer id
+  int n = 0;
+  string str_ ("SlicesAssign");
+  const int kiSize = str_.size();  // length of the "SlicesAssign" prefix; the text after it in a cfg tag is parsed as the slice index
 
-	if (argc <= 0) // no additional pCmd parameters 
-		return 0;
+  if (argc <= 0) // no additional pCmd parameters
+    return 0;
 
-	while(n < argc)
-	{
-		pCommand = argv[n++];
-		if (!(strcmp(pCommand,"-h")))	// confirmed_safe_unsafe_usage
-		{
-			PrintHelp();
-			continue;
-		}
-		if (!(strcmp(pCommand,"-bf")))	// confirmed_safe_unsafe_usage
-		{			
-			sFileSet.strBsFile.assign(argv[n]);
-			++ n;
-			continue;
-		}
-		if( !(strcmp(pCommand,"-frms")) )	// confirmed_safe_unsafe_usage
-		{
-			pSvcParam.uiFrameToBeCoded = atoi(argv[n ]);
-			++ n;
-			continue;
-		}
-		if( !(strcmp(pCommand,"-gop")) )	// confirmed_safe_unsafe_usage
-		{
-			pSvcParam.uiGopSize = atoi(argv[n ]);
-			++ n;
-			continue;
-		}
-		if( !(strcmp(pCommand,"-iper")) )	// confirmed_safe_unsafe_usage
-		{
-			pSvcParam.uiIntraPeriod = atoi(argv[n ]);
-			++ n;
-			continue;
-		}
-		if( !(strcmp(pCommand,"-spsid")) )	// confirmed_safe_unsafe_usage
-		{
-			pSvcParam.bEnableSpsPpsIdAddition = atoi(argv[n ])?true:false;
-			++ n;
-			continue;
-		}
-		if( !(strcmp(pCommand,"-denois")) )	// confirmed_safe_unsafe_usage
-		{
-			pSvcParam.bEnableDenoise = atoi(argv[n ])?true:false;
-			++ n;
-			continue;
-		}
-		if( !(strcmp(pCommand,"-scene")) )	// confirmed_safe_unsafe_usage
-		{
-			pSvcParam.bEnableSceneChangeDetect = atoi(argv[n ])?true:false;
-			++ n;
-			continue;
-		}
-		if ( !(strcmp(pCommand,"-bgd")) )	// confirmed_safe_unsafe_usage
-		{
-			pSvcParam.bEnableBackgroundDetection = atoi(argv[n ])?true:false;
-			++ n;
-			continue;
-		}
-		if( !(strcmp(pCommand,"-aq")) )	// confirmed_safe_unsafe_usage
-		{
-			pSvcParam.bEnableAdaptiveQuant = atoi(argv[n ])?true:false;
-			++ n;
-			continue;
-		}
-		if( !(strcmp(pCommand,"-ltr")) )	// confirmed_safe_unsafe_usage
-		{
-			pSvcParam.bEnableLongTermReference = atoi(argv[n ])?true:false;
-			++ n;
-			continue;
-		}
-		if( !(strcmp(pCommand,"-ltrper")) )	// confirmed_safe_unsafe_usage
-		{
-			pSvcParam.uiLtrMarkPeriod = atoi(argv[n ]);
-			++ n;
-			continue;
-		}
-		if( !(strcmp(pCommand,"-rc")) )	// confirmed_safe_unsafe_usage
-		{
-			pSvcParam.bEnableRc = atoi(argv[n ])?true:false;
-			++ n;
-			continue;
-		}
-		if( !(strcmp(pCommand,"-tarb")) )	// confirmed_safe_unsafe_usage
-		{
-			pSvcParam.iTargetBitrate = atoi(argv[n ]);
-			++ n;
-			continue;
-		}
-		if( !(strcmp(pCommand,"-numl")) )	// confirmed_safe_unsafe_usage
-		{
-			bool_t bFound = false;
-			pSvcParam.iNumDependencyLayer = atoi(argv[n++]);
-			for (int ln = 0 ; ln < pSvcParam.iNumDependencyLayer ; ln++)
-			{
-//				pSvcParam.sDependencyLayers[ln].uiDependencyId = ln;				
-				sFileSet.sSpatialLayers[ln].strLayerCfgFile.assign( argv[n] );
-				++ n;
-			}
+  while (n < argc) {
+    pCommand = argv[n++];  // option name; n now indexes the option's first value (if any)
+    if (! (strcmp (pCommand, "-h"))) {	// confirmed_safe_unsafe_usage
+      PrintHelp();
+      continue;
+    }
+    if (! (strcmp (pCommand, "-bf"))) {	// confirmed_safe_unsafe_usage
+      sFileSet.strBsFile.assign (argv[n]);  // NOTE(review): argv[n] is read without checking n < argc; the same pattern repeats for every option below
+      ++ n;
+      continue;
+    }
+    if (! (strcmp (pCommand, "-frms"))) {	// confirmed_safe_unsafe_usage
+      pSvcParam.uiFrameToBeCoded = atoi (argv[n ]);
+      ++ n;
+      continue;
+    }
+    if (! (strcmp (pCommand, "-gop"))) {	// confirmed_safe_unsafe_usage
+      pSvcParam.uiGopSize = atoi (argv[n ]);
+      ++ n;
+      continue;
+    }
+    if (! (strcmp (pCommand, "-iper"))) {	// confirmed_safe_unsafe_usage
+      pSvcParam.uiIntraPeriod = atoi (argv[n ]);
+      ++ n;
+      continue;
+    }
+    if (! (strcmp (pCommand, "-spsid"))) {	// confirmed_safe_unsafe_usage
+      pSvcParam.bEnableSpsPpsIdAddition = atoi (argv[n ]) ? true : false;
+      ++ n;
+      continue;
+    }
+    if (! (strcmp (pCommand, "-denois"))) {	// confirmed_safe_unsafe_usage
+      pSvcParam.bEnableDenoise = atoi (argv[n ]) ? true : false;
+      ++ n;
+      continue;
+    }
+    if (! (strcmp (pCommand, "-scene"))) {	// confirmed_safe_unsafe_usage
+      pSvcParam.bEnableSceneChangeDetect = atoi (argv[n ]) ? true : false;
+      ++ n;
+      continue;
+    }
+    if (! (strcmp (pCommand, "-bgd"))) {	// confirmed_safe_unsafe_usage
+      pSvcParam.bEnableBackgroundDetection = atoi (argv[n ]) ? true : false;
+      ++ n;
+      continue;
+    }
+    if (! (strcmp (pCommand, "-aq"))) {	// confirmed_safe_unsafe_usage
+      pSvcParam.bEnableAdaptiveQuant = atoi (argv[n ]) ? true : false;
+      ++ n;
+      continue;
+    }
+    if (! (strcmp (pCommand, "-ltr"))) {	// confirmed_safe_unsafe_usage
+      pSvcParam.bEnableLongTermReference = atoi (argv[n ]) ? true : false;
+      ++ n;
+      continue;
+    }
+    if (! (strcmp (pCommand, "-ltrper"))) {	// confirmed_safe_unsafe_usage
+      pSvcParam.uiLtrMarkPeriod = atoi (argv[n ]);
+      ++ n;
+      continue;
+    }
+    if (! (strcmp (pCommand, "-rc"))) {	// confirmed_safe_unsafe_usage
+      pSvcParam.bEnableRc = atoi (argv[n ]) ? true : false;
+      ++ n;
+      continue;
+    }
+    if (! (strcmp (pCommand, "-tarb"))) {	// confirmed_safe_unsafe_usage
+      pSvcParam.iTargetBitrate = atoi (argv[n ]);
+      ++ n;
+      continue;
+    }
+    if (! (strcmp (pCommand, "-numl"))) {	// confirmed_safe_unsafe_usage
+      bool_t bFound = false;  // NOTE(review): never referenced after this declaration
+      pSvcParam.iNumDependencyLayer = atoi (argv[n++]);  // layer count; the loop below consumes that many layer cfg file names from argv
+      for (int ln = 0 ; ln < pSvcParam.iNumDependencyLayer ; ln++) {
+//				pSvcParam.sDependencyLayers[ln].uiDependencyId = ln;
+        sFileSet.sSpatialLayers[ln].strLayerCfgFile.assign (argv[n]);
+        ++ n;
+      }
 
-			for (int8_t iLayer = 0; iLayer < pSvcParam.iNumDependencyLayer; ++ iLayer){
-				SLayerPEncCtx sLayerCtx;	
-				string strTag[4];
-				int32_t iLayerArg = -2;
-				int32_t iNumQualityBitrateLayerSet = 0;
+      for (int8_t iLayer = 0; iLayer < pSvcParam.iNumDependencyLayer; ++ iLayer) {
+        SLayerPEncCtx sLayerCtx;  // per-layer scratch; shadows the function-level sLayerCtx[3]
+        string strTag[4];
+        int32_t iLayerArg = -2;
+        int32_t iNumQualityBitrateLayerSet = 0;
 
-				SDLayerParam *pDLayer = &pSvcParam.sDependencyLayers[iLayer];
-				CReadConfig cRdLayerCfg( sFileSet.sSpatialLayers[iLayer].strLayerCfgFile );
+        SDLayerParam* pDLayer = &pSvcParam.sDependencyLayers[iLayer];
+        CReadConfig cRdLayerCfg (sFileSet.sSpatialLayers[iLayer].strLayerCfgFile);
 
-				memset(&sLayerCtx, 0, sizeof(SLayerPEncCtx));
+        memset (&sLayerCtx, 0, sizeof (SLayerPEncCtx));
 
 //				pDLayer->frext_mode = 0;
-				if ( !cRdLayerCfg.ExistFile() ){
-					fprintf(stderr, "Unabled to open layer #%d configuration file: %s.\n", iLayer, cRdLayerCfg.GetFileName().c_str());
-					continue;
-				}
-				
-				while ( !cRdLayerCfg.EndOfFile() ){
-					long iLayerRd = cRdLayerCfg.ReadLine(&strTag[0]);
-					if (iLayerRd > 0){
-						if ( strTag[0].empty() )
-							continue;
-						if (strTag[0].compare("SourceWidth") == 0){
-							pDLayer->iFrameWidth	= atoi(strTag[1].c_str());
-							pDLayer->iActualWidth= pDLayer->iFrameWidth;
-							continue;
-						}
-						else if (strTag[0].compare("SourceHeight") == 0){
-							pDLayer->iFrameHeight	= atoi(strTag[1].c_str());
-							pDLayer->iActualHeight	= pDLayer->iFrameHeight;
-							continue;
-						}
-						else if (strTag[0].compare("FrameRateIn") == 0){
-							pDLayer->fInputFrameRate	= (float)atof(strTag[1].c_str());
-							continue;
-						}
-						else if (strTag[0].compare("FrameRateOut") == 0){
-							pDLayer->fOutputFrameRate = (float)atof(strTag[1].c_str());
-							continue;
-						}
-						else if (strTag[0].compare("InputFile") == 0){							
-							if ( strTag[1].length() > 0 )
-								sFileSet.sSpatialLayers[iLayer].strSeqFile = strTag[1];
-							continue;
-						}
-						else if (strTag[0].compare("ReconFile") == 0){
+        if (!cRdLayerCfg.ExistFile()) {
+          fprintf (stderr, "Unabled to open layer #%d configuration file: %s.\n", iLayer, cRdLayerCfg.GetFileName().c_str());
+          continue;  // skip this layer only; the remaining layers are still processed
+        }
+
+        while (!cRdLayerCfg.EndOfFile()) {
+          long iLayerRd = cRdLayerCfg.ReadLine (&strTag[0]);  // presumably fills strTag[0] with the tag and strTag[1] with its value — confirm against CReadConfig
+          if (iLayerRd > 0) {
+            if (strTag[0].empty())
+              continue;
+            if (strTag[0].compare ("SourceWidth") == 0) {
+              pDLayer->iFrameWidth	= atoi (strTag[1].c_str());
+              pDLayer->iActualWidth = pDLayer->iFrameWidth;
+              continue;
+            } else if (strTag[0].compare ("SourceHeight") == 0) {
+              pDLayer->iFrameHeight	= atoi (strTag[1].c_str());
+              pDLayer->iActualHeight	= pDLayer->iFrameHeight;
+              continue;
+            } else if (strTag[0].compare ("FrameRateIn") == 0) {
+              pDLayer->fInputFrameRate	= (float)atof (strTag[1].c_str());
+              continue;
+            } else if (strTag[0].compare ("FrameRateOut") == 0) {
+              pDLayer->fOutputFrameRate = (float)atof (strTag[1].c_str());
+              continue;
+            } else if (strTag[0].compare ("InputFile") == 0) {
+              if (strTag[1].length() > 0)
+                sFileSet.sSpatialLayers[iLayer].strSeqFile = strTag[1];
+              continue;
+            } else if (strTag[0].compare ("ReconFile") == 0) {
 #ifdef ENABLE_FRAME_DUMP
-							const int kiLen = strTag[1].length();
-							if (kiLen >= MAX_FNAME_LEN)
-								return 1;
-							pDLayer->sRecFileName[kiLen] = '\0';
-							strncpy(pDLayer->sRecFileName, strTag[1].c_str(), kiLen);	// confirmed_safe_unsafe_usage
+              const int kiLen = strTag[1].length();
+              if (kiLen >= MAX_FNAME_LEN)
+                return 1;  // the only non-zero return in this function: file name does not fit
+              pDLayer->sRecFileName[kiLen] = '\0';
+              strncpy (pDLayer->sRecFileName, strTag[1].c_str(), kiLen);	// confirmed_safe_unsafe_usage
 #endif//ENABLE_FRAME_DUMP
-							continue;
-						}
-						else if (strTag[0].compare("ProfileIdc") == 0){
-							pDLayer->uiProfileIdc	= atoi(strTag[1].c_str());
-							continue;
-						}
-						else if (strTag[0].compare("FRExt") == 0){
+              continue;
+            } else if (strTag[0].compare ("ProfileIdc") == 0) {
+              pDLayer->uiProfileIdc	= atoi (strTag[1].c_str());
+              continue;
+            } else if (strTag[0].compare ("FRExt") == 0) {
 //							pDLayer->frext_mode	= (bool_t)atoi(strTag[1].c_str());
-							continue;
-						}	
-						if (strTag[0].compare("SpatialBitrate") == 0){
-							pDLayer->iSpatialBitrate	= 1000 * atoi(strTag[1].c_str());
-							continue;
-						}
+              continue;
+            }
+            if (strTag[0].compare ("SpatialBitrate") == 0) {
+              pDLayer->iSpatialBitrate	= 1000 * atoi (strTag[1].c_str());  // cfg value is multiplied by 1000 before being stored
+              continue;
+            }
 
-						if (strTag[0].compare("InitialQP") == 0){
-							sLayerCtx.iDLayerQp	= atoi(strTag[1].c_str());
-							continue;
-						}
+            if (strTag[0].compare ("InitialQP") == 0) {
+              sLayerCtx.iDLayerQp	= atoi (strTag[1].c_str());
+              continue;
+            }
 
-						if (strTag[0].compare("SliceMode") == 0){
-							sLayerCtx.sMso.uiSliceMode	= (SliceMode)atoi(strTag[1].c_str());
-							continue;
-						}
-						else if (strTag[0].compare("SliceSize") == 0){//SM_DYN_SLICE
-							sLayerCtx.sMso.sSliceArgument.uiSliceSizeConstraint	= (SliceMode)atoi(strTag[1].c_str());
-							continue;
-						}
-						else if (strTag[0].compare("SliceNum") == 0){
-							sLayerCtx.sMso.sSliceArgument.iSliceNum = atoi(strTag[1].c_str());
-							continue;
-						}
-						else if ( strTag[0].compare(0, kiSize, str_ ) == 0 )
-						{
-							const char* kpString = strTag[0].c_str();
-							int uiSliceIdx = atoi(&kpString[kiSize]);
-							assert( uiSliceIdx < MAX_SLICES_NUM );
-							sLayerCtx.sMso.sSliceArgument.uiSliceMbNum[uiSliceIdx] = atoi( strTag[1].c_str() );
-							continue;
-						}
-					}
-				}
-				pDLayer->iDLayerQp		= sLayerCtx.iDLayerQp;
-				pDLayer->sMso.uiSliceMode		= sLayerCtx.sMso.uiSliceMode;		
-	memcpy( &pDLayer->sMso, &sLayerCtx.sMso, sizeof(SMulSliceOption) );	// confirmed_safe_unsafe_usage
-		memcpy( &pDLayer->sMso.sSliceArgument.uiSliceMbNum[0], &sLayerCtx.sMso.sSliceArgument.uiSliceMbNum[0], sizeof(sLayerCtx.sMso.sSliceArgument.uiSliceMbNum) );	// confirmed_safe_unsafe_usage
+            if (strTag[0].compare ("SliceMode") == 0) {
+              sLayerCtx.sMso.uiSliceMode	= (SliceMode)atoi (strTag[1].c_str());
+              continue;
+            } else if (strTag[0].compare ("SliceSize") == 0) { //SM_DYN_SLICE
+              sLayerCtx.sMso.sSliceArgument.uiSliceSizeConstraint	= (SliceMode)atoi (strTag[1].c_str());  // NOTE(review): the (SliceMode) cast on a size value looks copied from the SliceMode branch — confirm
+              continue;
+            } else if (strTag[0].compare ("SliceNum") == 0) {
+              sLayerCtx.sMso.sSliceArgument.iSliceNum = atoi (strTag[1].c_str());
+              continue;
+            } else if (strTag[0].compare (0, kiSize, str_) == 0) {
+              const char* kpString = strTag[0].c_str();
+              int uiSliceIdx = atoi (&kpString[kiSize]);  // numeric suffix after "SlicesAssign" selects the slice
+              assert (uiSliceIdx < MAX_SLICES_NUM);  // NOTE(review): only the upper bound is asserted; a negative index is not rejected
+              sLayerCtx.sMso.sSliceArgument.uiSliceMbNum[uiSliceIdx] = atoi (strTag[1].c_str());
+              continue;
+            }
+          }
+        }
+        pDLayer->iDLayerQp		= sLayerCtx.iDLayerQp;
+        pDLayer->sMso.uiSliceMode		= sLayerCtx.sMso.uiSliceMode;  // NOTE(review): appears redundant, the memcpy of the whole sMso on the next line copies this field as well
+        memcpy (&pDLayer->sMso, &sLayerCtx.sMso, sizeof (SMulSliceOption));	// confirmed_safe_unsafe_usage
+        memcpy (&pDLayer->sMso.sSliceArgument.uiSliceMbNum[0], &sLayerCtx.sMso.sSliceArgument.uiSliceMbNum[0],
+                sizeof (sLayerCtx.sMso.sSliceArgument.uiSliceMbNum));	// confirmed_safe_unsafe_usage
 
-			}
-			//n += 1;
-			continue;
-		}
-		if( !(strcmp(pCommand,"-org")) )	// confirmed_safe_unsafe_usage
-		{
-			unsigned int	iLayer = atoi( argv[n++] );
-			sFileSet.sSpatialLayers[iLayer].strSeqFile.assign( argv[n] );
-			++ n;
-			continue;
-		}
-		if( !(strcmp(pCommand,"-drec")) )	// confirmed_safe_unsafe_usage
-		{
-			unsigned int	iLayer = atoi( argv[n++] );
-			const int iLen = strlen(argv[n]);	// confirmed_safe_unsafe_usage
+      }
+      //n += 1;
+      continue;
+    }
+    if (! (strcmp (pCommand, "-org"))) {	// confirmed_safe_unsafe_usage
+      unsigned int	iLayer = atoi (argv[n++]);  // NOTE(review): layer id comes straight from argv and is used as an array index without a range check (same for all layer-based options below)
+      sFileSet.sSpatialLayers[iLayer].strSeqFile.assign (argv[n]);
+      ++ n;
+      continue;
+    }
+    if (! (strcmp (pCommand, "-drec"))) {	// confirmed_safe_unsafe_usage
+      unsigned int	iLayer = atoi (argv[n++]);
+      const int iLen = strlen (argv[n]);	// confirmed_safe_unsafe_usage
 #ifdef ENABLE_FRAME_DUMP
-			SDLayerParam *pDLayer = &pSvcParam.sDependencyLayers[iLayer];
-			pDLayer->sRecFileName[iLen] = '\0';
-			strncpy(pDLayer->sRecFileName, argv[n], iLen);	// confirmed_safe_unsafe_usage
+      SDLayerParam* pDLayer = &pSvcParam.sDependencyLayers[iLayer];
+      pDLayer->sRecFileName[iLen] = '\0';  // NOTE(review): unlike the ReconFile cfg path, iLen is not checked against MAX_FNAME_LEN before this write
+      strncpy (pDLayer->sRecFileName, argv[n], iLen);	// confirmed_safe_unsafe_usage
 #endif//ENABLE_FRAME_DUMP
-			++ n;
-			continue;
-		}
-		if( !(strcmp(pCommand,"-sw")) )	// confirmed_safe_unsafe_usage
-		{
-			unsigned int	iLayer = atoi( argv[n++] );
-			SDLayerParam *pDLayer = &pSvcParam.sDependencyLayers[iLayer];
-			pDLayer->iFrameWidth =  atoi(argv[n ]);
-			pDLayer->iActualWidth= pDLayer->iFrameWidth;
-			++ n;
-			continue;
-		}
-		if( !(strcmp(pCommand,"-sh")) )	// confirmed_safe_unsafe_usage
-		{
-			unsigned int	iLayer = atoi( argv[n++] );
-			SDLayerParam *pDLayer = &pSvcParam.sDependencyLayers[iLayer];
-			pDLayer->iFrameHeight =  atoi(argv[n ]);
-			pDLayer->iActualHeight= pDLayer->iFrameHeight;
-			++ n;
-			continue;
-		}
-		if( !(strcmp(pCommand,"-frin")) )	// confirmed_safe_unsafe_usage
-		{
-			unsigned int	iLayer = atoi( argv[n++] );
-			SDLayerParam *pDLayer = &pSvcParam.sDependencyLayers[iLayer];
-			pDLayer->fInputFrameRate =  (float)atof(argv[n ]);
-			++ n;
-			continue;
-		}
-		if( !(strcmp(pCommand,"-frout")) )	// confirmed_safe_unsafe_usage
-		{
-			unsigned int	iLayer = atoi( argv[n++] );
-			SDLayerParam *pDLayer = &pSvcParam.sDependencyLayers[iLayer];
-			pDLayer->fOutputFrameRate =  (float)atof(argv[n ]);
-			++ n;
-			continue;
-		}	
+      ++ n;
+      continue;
+    }
+    if (! (strcmp (pCommand, "-sw"))) {	// confirmed_safe_unsafe_usage
+      unsigned int	iLayer = atoi (argv[n++]);
+      SDLayerParam* pDLayer = &pSvcParam.sDependencyLayers[iLayer];
+      pDLayer->iFrameWidth =  atoi (argv[n ]);
+      pDLayer->iActualWidth = pDLayer->iFrameWidth;
+      ++ n;
+      continue;
+    }
+    if (! (strcmp (pCommand, "-sh"))) {	// confirmed_safe_unsafe_usage
+      unsigned int	iLayer = atoi (argv[n++]);
+      SDLayerParam* pDLayer = &pSvcParam.sDependencyLayers[iLayer];
+      pDLayer->iFrameHeight =  atoi (argv[n ]);
+      pDLayer->iActualHeight = pDLayer->iFrameHeight;
+      ++ n;
+      continue;
+    }
+    if (! (strcmp (pCommand, "-frin"))) {	// confirmed_safe_unsafe_usage
+      unsigned int	iLayer = atoi (argv[n++]);
+      SDLayerParam* pDLayer = &pSvcParam.sDependencyLayers[iLayer];
+      pDLayer->fInputFrameRate = (float)atof (argv[n ]);
+      ++ n;
+      continue;
+    }
+    if (! (strcmp (pCommand, "-frout"))) {	// confirmed_safe_unsafe_usage
+      unsigned int	iLayer = atoi (argv[n++]);
+      SDLayerParam* pDLayer = &pSvcParam.sDependencyLayers[iLayer];
+      pDLayer->fOutputFrameRate = (float)atof (argv[n ]);
+      ++ n;
+      continue;
+    }
 
-		if( !(strcmp(pCommand,"-lqp")) )	// confirmed_safe_unsafe_usage
-		{
-			unsigned int	iLayer = atoi( argv[n++] );
-			SDLayerParam *pDLayer = &pSvcParam.sDependencyLayers[iLayer];
-			uiQpChangeFlag[iLayer] = 1;
-			pDLayer->iDLayerQp = sLayerCtx[iLayer].iDLayerQp=  atoi(argv[n ]);
-			n += 1;
-			continue;
-		}
-		//sLayerCtx[iLayer].num_quality_layers = pDLayer->num_quality_layers = 1;
+    if (! (strcmp (pCommand, "-lqp"))) {	// confirmed_safe_unsafe_usage
+      unsigned int	iLayer = atoi (argv[n++]);
+      SDLayerParam* pDLayer = &pSvcParam.sDependencyLayers[iLayer];
+      uiQpChangeFlag[iLayer] = 1;  // NOTE(review): this flag array is written here but never read in this function
+      pDLayer->iDLayerQp = sLayerCtx[iLayer].iDLayerQp =  atoi (argv[n ]);
+      n += 1;
+      continue;
+    }
+    //sLayerCtx[iLayer].num_quality_layers = pDLayer->num_quality_layers = 1;
 
-		if( !(strcmp(pCommand,"-ltarb")) )	// confirmed_safe_unsafe_usage
-		{
-			unsigned int	iLayer = atoi( argv[n++] );
-			SDLayerParam *pDLayer = &pSvcParam.sDependencyLayers[iLayer];
-			pDLayer->iSpatialBitrate	= 1000 * atoi(argv[n ]);
-			++ n;
-			continue;
-		}
+    if (! (strcmp (pCommand, "-ltarb"))) {	// confirmed_safe_unsafe_usage
+      unsigned int	iLayer = atoi (argv[n++]);
+      SDLayerParam* pDLayer = &pSvcParam.sDependencyLayers[iLayer];
+      pDLayer->iSpatialBitrate	= 1000 * atoi (argv[n ]);
+      ++ n;
+      continue;
+    }
 
-		if( !(strcmp(pCommand,"-slcmd")) )	// confirmed_safe_unsafe_usage
-		{
-			unsigned int	iLayer = atoi( argv[n++] );
-			SDLayerParam *pDLayer = &pSvcParam.sDependencyLayers[iLayer];
+    if (! (strcmp (pCommand, "-slcmd"))) {	// confirmed_safe_unsafe_usage
+      unsigned int	iLayer = atoi (argv[n++]);
+      SDLayerParam* pDLayer = &pSvcParam.sDependencyLayers[iLayer];
 
-			switch ( atoi(argv[n] ) )
-			{
-			case 0: 
-				pDLayer->sMso.uiSliceMode = SM_SINGLE_SLICE;
-				break;
-			case 1: 
-				pDLayer->sMso.uiSliceMode = SM_FIXEDSLCNUM_SLICE;
-				break;
-			case 2: 
-				pDLayer->sMso.uiSliceMode = SM_RASTER_SLICE;
-				break;
-			case 3: 
-				pDLayer->sMso.uiSliceMode = SM_ROWMB_SLICE;
-				break;
-			case 4: 
-				pDLayer->sMso.uiSliceMode = SM_DYN_SLICE;
-				break;
-			default: 
-				pDLayer->sMso.uiSliceMode = SM_RESERVED;
-				break;
-			}
-			++ n;
-			continue;
-		}
-		if( !(strcmp(pCommand,"-slcsize")) )//confirmed_safe_unsafe_usage
-		{
-			unsigned int	iLayer = atoi( argv[n++] );
-			SDLayerParam *pDLayer = &pSvcParam.sDependencyLayers[iLayer];
-			pDLayer->sMso.sSliceArgument.uiSliceSizeConstraint = atoi(argv[n ]);
-			++ n;
-			continue;
-		}
-		if( !(strcmp(pCommand,"-slcnum")) )// confirmed_safe_unsafe_usage
-		{
-			unsigned int	iLayer = atoi( argv[n++] );
-			SDLayerParam *pDLayer = &pSvcParam.sDependencyLayers[iLayer];
-			pDLayer->sMso.sSliceArgument.iSliceNum = atoi(argv[n ]);
-			++ n;
-			continue;
-		}
-	}
-	return 0;
+      switch (atoi (argv[n])) {  // map the numeric argument 0..4 onto the slice-mode constants; any other value becomes SM_RESERVED
+      case 0:
+        pDLayer->sMso.uiSliceMode = SM_SINGLE_SLICE;
+        break;
+      case 1:
+        pDLayer->sMso.uiSliceMode = SM_FIXEDSLCNUM_SLICE;
+        break;
+      case 2:
+        pDLayer->sMso.uiSliceMode = SM_RASTER_SLICE;
+        break;
+      case 3:
+        pDLayer->sMso.uiSliceMode = SM_ROWMB_SLICE;
+        break;
+      case 4:
+        pDLayer->sMso.uiSliceMode = SM_DYN_SLICE;
+        break;
+      default:
+        pDLayer->sMso.uiSliceMode = SM_RESERVED;
+        break;
+      }
+      ++ n;
+      continue;
+    }
+    if (! (strcmp (pCommand, "-slcsize"))) { //confirmed_safe_unsafe_usage
+      unsigned int	iLayer = atoi (argv[n++]);
+      SDLayerParam* pDLayer = &pSvcParam.sDependencyLayers[iLayer];
+      pDLayer->sMso.sSliceArgument.uiSliceSizeConstraint = atoi (argv[n ]);
+      ++ n;
+      continue;
+    }
+    if (! (strcmp (pCommand, "-slcnum"))) { // confirmed_safe_unsafe_usage
+      unsigned int	iLayer = atoi (argv[n++]);
+      SDLayerParam* pDLayer = &pSvcParam.sDependencyLayers[iLayer];
+      pDLayer->sMso.sSliceArgument.iSliceNum = atoi (argv[n ]);
+      ++ n;
+      continue;
+    }
+  }
+  return 0;  // unrecognized arguments are consumed one at a time by the loop above and ignored
 }
 
 
 
-int FillSpecificParameters( SVCEncodingParam &sParam )
-{
-	/* Test for temporal, spatial, SNR scalability */
-	sParam.fFrameRate	= 30.0f;		// input frame rate  
-	sParam.iPicWidth		= 1280;			// width of picture in samples
-	sParam.iPicHeight	= 720;			// height of picture in samples
-	sParam.iTargetBitrate= 2500000;		// target bitrate desired
-	sParam.iRCMode       = 0;            //  rc mode control
-	sParam.iTemporalLayerNum= 3;	// layer number at temporal level
-	sParam.iSpatialLayerNum	= 4;	// layer number at spatial level
-	sParam.bEnableDenoise    = 0;    // denoise control
-	sParam.bEnableBackgroundDetection = 1; // background detection control	
-	sParam.bEnableAdaptiveQuant       = 1; // adaptive quantization control
-	sParam.bEnableLongTermReference  = 0; // long term reference control
-	sParam.iLtrMarkPeriod = 30;
+int FillSpecificParameters (SVCEncodingParam& sParam) {  // Fill sParam with hard-coded test settings for four spatial layers (160x90, 320x180, 640x360, 1280x720); always returns 0
+  /* Test for temporal, spatial, SNR scalability */
+  sParam.fFrameRate	= 30.0f;		// input frame rate
+  sParam.iPicWidth		= 1280;			// width of picture in samples
+  sParam.iPicHeight	= 720;			// height of picture in samples
+  sParam.iTargetBitrate = 2500000;		// target bitrate desired
+  sParam.iRCMode       = 0;            //  rc mode control
+  sParam.iTemporalLayerNum = 3;	// layer number at temporal level
+  sParam.iSpatialLayerNum	= 4;	// layer number at spatial level
+  sParam.bEnableDenoise    = 0;    // denoise control
+  sParam.bEnableBackgroundDetection = 1; // background detection control
+  sParam.bEnableAdaptiveQuant       = 1; // adaptive quantization control
+  sParam.bEnableLongTermReference  = 0; // long term reference control
+  sParam.iLtrMarkPeriod = 30;
 
-	sParam.iInputCsp			= videoFormatI420;			// color space of input sequence
-	sParam.iKeyPicCodingMode= 1;// mode of key picture coding
-	sParam.iIntraPeriod		= 320;		// period of Intra frame
-	sParam.bEnableSpsPpsIdAddition = 1;
-	sParam.bPrefixNalAddingCtrl = 1;
+  sParam.iInputCsp			= videoFormatI420;			// color space of input sequence
+  sParam.iKeyPicCodingMode = 1; // mode of key picture coding
+  sParam.iIntraPeriod		= 320;		// period of Intra frame
+  sParam.bEnableSpsPpsIdAddition = 1;
+  sParam.bPrefixNalAddingCtrl = 1;
 
-	int iIndexLayer = 0;
-	sParam.sSpatialLayers[iIndexLayer].iVideoWidth	= 160;
-	sParam.sSpatialLayers[iIndexLayer].iVideoHeight	= 90;
-	sParam.sSpatialLayers[iIndexLayer].fFrameRate	= 7.5f;
-	sParam.sSpatialLayers[iIndexLayer].iQualityLayerNum	    = 1;
-	sParam.sSpatialLayers[iIndexLayer].iSpatialBitrate		= 64000;
-	sParam.sSpatialLayers[iIndexLayer].iCgsSnrRefined		= 0;
+  int iIndexLayer = 0;  // index of the spatial layer being filled; incremented before each of the following layer groups
+  sParam.sSpatialLayers[iIndexLayer].iVideoWidth	= 160;
+  sParam.sSpatialLayers[iIndexLayer].iVideoHeight	= 90;
+  sParam.sSpatialLayers[iIndexLayer].fFrameRate	= 7.5f;
+  sParam.sSpatialLayers[iIndexLayer].iQualityLayerNum	    = 1;
+  sParam.sSpatialLayers[iIndexLayer].iSpatialBitrate		= 64000;
+  sParam.sSpatialLayers[iIndexLayer].iCgsSnrRefined		= 0;
 //	sParam.sSpatialLayers[iIndexLayer].iQualityBitrate[0]	= 0;
 //	memset(sParam.iTemporalBitrate, 0, sizeof(sParam.iTemporalBitrate));
-	sParam.sSpatialLayers[iIndexLayer].iInterSpatialLayerPredFlag	= 0;
+  sParam.sSpatialLayers[iIndexLayer].iInterSpatialLayerPredFlag	= 0;
 #ifdef MT_ENABLED
-	sParam.sSpatialLayers[iIndexLayer].sSliceCfg.uiSliceMode = 0;  
+  sParam.sSpatialLayers[iIndexLayer].sSliceCfg.uiSliceMode = 0;
 #endif
 
-	++ iIndexLayer;
-	sParam.sSpatialLayers[iIndexLayer].iVideoWidth	= 320;
-	sParam.sSpatialLayers[iIndexLayer].iVideoHeight	= 180;
-	sParam.sSpatialLayers[iIndexLayer].fFrameRate	= 15.0f;
-	sParam.sSpatialLayers[iIndexLayer].iQualityLayerNum	    = 1;
-	sParam.sSpatialLayers[iIndexLayer].iSpatialBitrate		= 160000;
-	sParam.sSpatialLayers[iIndexLayer].iCgsSnrRefined		= 0;
-//	sParam.sSpatialLayers[iIndexLayer].iQualityBitrate[0]	= 0;	
-//	sParam.sSpatialLayers[iIndexLayer].iQualityBitrate[1]	= 0;	
+  ++ iIndexLayer;
+  sParam.sSpatialLayers[iIndexLayer].iVideoWidth	= 320;
+  sParam.sSpatialLayers[iIndexLayer].iVideoHeight	= 180;
+  sParam.sSpatialLayers[iIndexLayer].fFrameRate	= 15.0f;
+  sParam.sSpatialLayers[iIndexLayer].iQualityLayerNum	    = 1;
+  sParam.sSpatialLayers[iIndexLayer].iSpatialBitrate		= 160000;
+  sParam.sSpatialLayers[iIndexLayer].iCgsSnrRefined		= 0;
+//	sParam.sSpatialLayers[iIndexLayer].iQualityBitrate[0]	= 0;
+//	sParam.sSpatialLayers[iIndexLayer].iQualityBitrate[1]	= 0;
 //	sParam.sSpatialLayers[iIndexLayer].iQualityBitrate[2]	= 0;
-	sParam.sSpatialLayers[iIndexLayer].iInterSpatialLayerPredFlag	= 0;	
+  sParam.sSpatialLayers[iIndexLayer].iInterSpatialLayerPredFlag	= 0;
 #ifdef MT_ENABLED
-	sParam.sSpatialLayers[iIndexLayer].sSliceCfg.uiSliceMode = 0; 
+  sParam.sSpatialLayers[iIndexLayer].sSliceCfg.uiSliceMode = 0;
 #endif
 
-	++ iIndexLayer;
-	sParam.sSpatialLayers[iIndexLayer].iVideoWidth	= 640;
-	sParam.sSpatialLayers[iIndexLayer].iVideoHeight	= 360;
-	sParam.sSpatialLayers[iIndexLayer].fFrameRate	= 30.0f;
-	sParam.sSpatialLayers[iIndexLayer].iQualityLayerNum	    = 1;
-	sParam.sSpatialLayers[iIndexLayer].iSpatialBitrate		= 512000;
-	sParam.sSpatialLayers[iIndexLayer].iCgsSnrRefined		= 0;
-//	sParam.sSpatialLayers[iIndexLayer].iQualityBitrate[0]	= 0;	
-//	sParam.sSpatialLayers[iIndexLayer].iQualityBitrate[1]	= 0;	
+  ++ iIndexLayer;
+  sParam.sSpatialLayers[iIndexLayer].iVideoWidth	= 640;
+  sParam.sSpatialLayers[iIndexLayer].iVideoHeight	= 360;
+  sParam.sSpatialLayers[iIndexLayer].fFrameRate	= 30.0f;
+  sParam.sSpatialLayers[iIndexLayer].iQualityLayerNum	    = 1;
+  sParam.sSpatialLayers[iIndexLayer].iSpatialBitrate		= 512000;
+  sParam.sSpatialLayers[iIndexLayer].iCgsSnrRefined		= 0;
+//	sParam.sSpatialLayers[iIndexLayer].iQualityBitrate[0]	= 0;
+//	sParam.sSpatialLayers[iIndexLayer].iQualityBitrate[1]	= 0;
 //	sParam.sSpatialLayers[iIndexLayer].iQualityBitrate[2]	= 0;
-	sParam.sSpatialLayers[iIndexLayer].iInterSpatialLayerPredFlag	= 0;
+  sParam.sSpatialLayers[iIndexLayer].iInterSpatialLayerPredFlag	= 0;
 #ifdef MT_ENABLED
-	sParam.sSpatialLayers[iIndexLayer].sSliceCfg.uiSliceMode = 0;                  
-    sParam.sSpatialLayers[iIndexLayer].sSliceCfg.sSliceArgument.uiSliceNum = 1;    
+  sParam.sSpatialLayers[iIndexLayer].sSliceCfg.uiSliceMode = 0;
+  sParam.sSpatialLayers[iIndexLayer].sSliceCfg.sSliceArgument.uiSliceNum = 1;
 #endif
 
-	++ iIndexLayer;
-	sParam.sSpatialLayers[iIndexLayer].iVideoWidth	= 1280;
-	sParam.sSpatialLayers[iIndexLayer].iVideoHeight	= 720;
-	sParam.sSpatialLayers[iIndexLayer].fFrameRate	= 30.0f;
-	sParam.sSpatialLayers[iIndexLayer].iQualityLayerNum	    = 1;
-	sParam.sSpatialLayers[iIndexLayer].iSpatialBitrate		= 1500000;
-	sParam.sSpatialLayers[iIndexLayer].iCgsSnrRefined		= 0;
-//	sParam.sSpatialLayers[iIndexLayer].iQualityBitrate[0]	= 0;	
-//	sParam.sSpatialLayers[iIndexLayer].iQualityBitrate[1]	= 0;	
+  ++ iIndexLayer;
+  sParam.sSpatialLayers[iIndexLayer].iVideoWidth	= 1280;
+  sParam.sSpatialLayers[iIndexLayer].iVideoHeight	= 720;
+  sParam.sSpatialLayers[iIndexLayer].fFrameRate	= 30.0f;
+  sParam.sSpatialLayers[iIndexLayer].iQualityLayerNum	    = 1;
+  sParam.sSpatialLayers[iIndexLayer].iSpatialBitrate		= 1500000;
+  sParam.sSpatialLayers[iIndexLayer].iCgsSnrRefined		= 0;
+//	sParam.sSpatialLayers[iIndexLayer].iQualityBitrate[0]	= 0;
+//	sParam.sSpatialLayers[iIndexLayer].iQualityBitrate[1]	= 0;
 //	sParam.sSpatialLayers[iIndexLayer].iQualityBitrate[2]	= 0;
-	sParam.sSpatialLayers[iIndexLayer].iInterSpatialLayerPredFlag	= 0;
+  sParam.sSpatialLayers[iIndexLayer].iInterSpatialLayerPredFlag	= 0;
 #ifdef MT_ENABLED
-	sParam.sSpatialLayers[iIndexLayer].sSliceCfg.uiSliceMode = 0;  
-	sParam.sSpatialLayers[iIndexLayer].sSliceCfg.sSliceArgument.uiSliceNum = 1; 
+  sParam.sSpatialLayers[iIndexLayer].sSliceCfg.uiSliceMode = 0;
+  sParam.sSpatialLayers[iIndexLayer].sSliceCfg.sSliceArgument.uiSliceNum = 1;
 #endif
 
-	float fMaxFr = sParam.sSpatialLayers[sParam.iSpatialLayerNum-1].fFrameRate;
-	for (int32_t i = sParam.iSpatialLayerNum-2; i >= 0; -- i)
-	{
-		if (sParam.sSpatialLayers[i].fFrameRate > fMaxFr+EPSN)
-			fMaxFr = sParam.sSpatialLayers[i].fFrameRate;
-	}
-	sParam.fFrameRate = fMaxFr;
+  float fMaxFr = sParam.sSpatialLayers[sParam.iSpatialLayerNum - 1].fFrameRate;  // start from the last layer, then scan the lower layers for a larger rate
+  for (int32_t i = sParam.iSpatialLayerNum - 2; i >= 0; -- i) {
+    if (sParam.sSpatialLayers[i].fFrameRate > fMaxFr + EPSN)
+      fMaxFr = sParam.sSpatialLayers[i].fFrameRate;
+  }
+  sParam.fFrameRate = fMaxFr;  // replaces the 30.0f assigned at the top with the maximum frame rate found across the spatial layers
 
-	return 0;
+  return 0;
 }
 
 /* For SVC Demo test */
-int ProcessEncodingSvcWithParam ( ISVCEncoder *pPtrEnc, int argc, char ** argv )
-{
-    const char * kpSrcFile = argv[1];
-	const char * kpStrBsFile = argv[2];
+int ProcessEncodingSvcWithParam (ISVCEncoder* pPtrEnc, int argc, char** argv) {
+  const char* kpSrcFile = argv[1];
+  const char* kpStrBsFile = argv[2];
 
-	if ( pPtrEnc == NULL || kpSrcFile == NULL || kpStrBsFile == NULL )
-		return 1;
+  if (pPtrEnc == NULL || kpSrcFile == NULL || kpStrBsFile == NULL)
+    return 1;
 
-	FILE *pFpBs = NULL;
-	FILE *pFpSrc= NULL;
-	SFrameBSInfo sFbi;
-	SVCEncodingParam sSvcParam;
-	int64_t iStart = 0, iTotal = 0;
+  FILE* pFpBs = NULL;
+  FILE* pFpSrc = NULL;
+  SFrameBSInfo sFbi;
+  SVCEncodingParam sSvcParam;
+  int64_t iStart = 0, iTotal = 0;
 #if defined ( STICK_STREAM_SIZE )
-	FILE *fTrackStream = fopen("coding_size.stream", "wb");;
+  FILE* fTrackStream = fopen ("coding_size.stream", "wb");;
 #endif
 
-	pFpSrc	= fopen(kpSrcFile, "rb");
-	if ( NULL == pFpSrc )
-		return 1;
-	pFpBs	= fopen(kpStrBsFile, "wb");
-	if ( NULL == pFpBs){
-		fclose( pFpSrc );
-		pFpSrc = NULL;
-		return 1;
-	}
+  pFpSrc	= fopen (kpSrcFile, "rb");
+  if (NULL == pFpSrc)
+    return 1;
+  pFpBs	= fopen (kpStrBsFile, "wb");
+  if (NULL == pFpBs) {
+    fclose (pFpSrc);
+    pFpSrc = NULL;
+    return 1;
+  }
 
-	memset( &sFbi, 0, sizeof(SFrameBSInfo) );
-	memset( &sSvcParam, 0, sizeof(SVCEncodingParam) );
+  memset (&sFbi, 0, sizeof (SFrameBSInfo));
+  memset (&sSvcParam, 0, sizeof (SVCEncodingParam));
 
-	FillSpecificParameters(sSvcParam);
+  FillSpecificParameters (sSvcParam);
 
-	int iParsedNum = 3;
-	if( ParseCommandLine(argc-iParsedNum, argv+iParsedNum, sSvcParam) != 0 )
-	{
-		printf("parse pCommand line failed\n");
-		return 1;
-	}
+  int iParsedNum = 3;
+  if (ParseCommandLine (argc - iParsedNum, argv + iParsedNum, sSvcParam) != 0) {
+    printf ("parse pCommand line failed\n");
+    return 1;
+  }
 
-	if ( cmResultSuccess != pPtrEnc->Initialize( &sSvcParam, INIT_TYPE_PARAMETER_BASED ) )
-	{
-		fprintf(stderr, "Encoder Initialization failed!\n");
-		return 1;
-	}
+  if (cmResultSuccess != pPtrEnc->Initialize (&sSvcParam, INIT_TYPE_PARAMETER_BASED)) {
+    fprintf (stderr, "Encoder Initialization failed!\n");
+    return 1;
+  }
 
-	const int32_t iPicLumaSize = sSvcParam.iPicWidth * sSvcParam.iPicHeight;
-	int32_t iFrameSize = 0;
-	uint8_t *pPlanes[3] = { 0 };
+  const int32_t iPicLumaSize = sSvcParam.iPicWidth * sSvcParam.iPicHeight;
+  int32_t iFrameSize = 0;
+  uint8_t* pPlanes[3] = { 0 };
 
-	switch( sSvcParam.iInputCsp ) {
-		int iStride;
-	case videoFormatI420:
-	case videoFormatYV12:
-		iFrameSize  = (3 * iPicLumaSize)>>1;
-		pPlanes[0]	= new uint8_t[iFrameSize];
-		pPlanes[1]	= pPlanes[0] + iPicLumaSize;
-		pPlanes[2]	= pPlanes[1]	+ (iPicLumaSize>>2);
-		break;	
-	case videoFormatYUY2:
-	case videoFormatYVYU:
-	case videoFormatUYVY:
-		iStride      = CALC_BI_STRIDE(sSvcParam.iPicWidth,  16);
-		iFrameSize  = iStride * sSvcParam.iPicHeight;
-		pPlanes[0]   = new uint8_t[iFrameSize];
-		break;
-	case videoFormatRGB:
-	case videoFormatBGR:
-		iStride      = CALC_BI_STRIDE(sSvcParam.iPicWidth,  24);
-		iFrameSize  = iStride * sSvcParam.iPicHeight;
-		pPlanes[0]	= new uint8_t[iFrameSize];
-		break;
-	case videoFormatBGRA:
-	case videoFormatRGBA:
-	case videoFormatARGB:
-	case videoFormatABGR:
-		iStride = 4 * sSvcParam.iPicWidth;
-		iFrameSize  = iStride * sSvcParam.iPicHeight;
-		pPlanes[0]	= new uint8_t[iFrameSize];
-		break;
-	default:
-		return 1;
-	}
-	
-	int32_t iFrame = 0;
-	while (true) 
-	{
-		if ( feof(pFpSrc) )
-			break;
+  switch (sSvcParam.iInputCsp) {
+    int iStride;
+  case videoFormatI420:
+  case videoFormatYV12:
+    iFrameSize  = (3 * iPicLumaSize) >> 1;
+    pPlanes[0]	= new uint8_t[iFrameSize];
+    pPlanes[1]	= pPlanes[0] + iPicLumaSize;
+    pPlanes[2]	= pPlanes[1]	+ (iPicLumaSize >> 2);
+    break;
+  case videoFormatYUY2:
+  case videoFormatYVYU:
+  case videoFormatUYVY:
+    iStride      = CALC_BI_STRIDE (sSvcParam.iPicWidth,  16);
+    iFrameSize  = iStride * sSvcParam.iPicHeight;
+    pPlanes[0]   = new uint8_t[iFrameSize];
+    break;
+  case videoFormatRGB:
+  case videoFormatBGR:
+    iStride      = CALC_BI_STRIDE (sSvcParam.iPicWidth,  24);
+    iFrameSize  = iStride * sSvcParam.iPicHeight;
+    pPlanes[0]	= new uint8_t[iFrameSize];
+    break;
+  case videoFormatBGRA:
+  case videoFormatRGBA:
+  case videoFormatARGB:
+  case videoFormatABGR:
+    iStride = 4 * sSvcParam.iPicWidth;
+    iFrameSize  = iStride * sSvcParam.iPicHeight;
+    pPlanes[0]	= new uint8_t[iFrameSize];
+    break;
+  default:
+    return 1;
+  }
+
+  int32_t iFrame = 0;
+  while (true) {
+    if (feof (pFpSrc))
+      break;
 #ifdef ONLY_ENC_FRAMES_NUM
-		if ( iFrame >= ONLY_ENC_FRAMES_NUM )
-			break;
+    if (iFrame >= ONLY_ENC_FRAMES_NUM)
+      break;
 #endif//ONLY_ENC_FRAMES_NUM
-		if ( fread(pPlanes[0], sizeof(uint8_t), iFrameSize, pFpSrc) <= 0 )
-				break;
+    if (fread (pPlanes[0], sizeof (uint8_t), iFrameSize, pFpSrc) <= 0)
+      break;
 
-		iStart	= WelsTime();
-		long iEncode = pPtrEnc->EncodeFrame( pPlanes[0], &sFbi);
-		iTotal += WelsTime() - iStart;
-		if ( videoFrameTypeInvalid == iEncode ){
-			fprintf(stderr, "EncodeFrame() failed: %d.\n", iEncode);
-			break;
-		}
+    iStart	= WelsTime();
+    long iEncode = pPtrEnc->EncodeFrame (pPlanes[0], &sFbi);
+    iTotal += WelsTime() - iStart;
+    if (videoFrameTypeInvalid == iEncode) {
+      fprintf (stderr, "EncodeFrame() failed: %d.\n", iEncode);
+      break;
+    }
 
-		/* Write bit-stream */
-		if ( pFpBs != NULL && videoFrameTypeSkip != iEncode ){	// file handler to write bit stream
-			int iLayer = 0;
-			while ( iLayer < sFbi.iLayerNum ){
-				SLayerBSInfo *pLayerBsInfo = &sFbi.sLayerInfo[iLayer];
-				if ( pLayerBsInfo != NULL ){
-					int iLayerSize = 0;
-					int iNalIdx = pLayerBsInfo->iNalCount -1;
-					do {
-						iLayerSize += pLayerBsInfo->iNalLengthInByte[iNalIdx];
-						-- iNalIdx;
-					} while(iNalIdx >= 0);
-					fwrite(pLayerBsInfo->pBsBuf, 1, iLayerSize, pFpBs);	// write pure bit stream into file
-				}
-				++ iLayer;
-			}
-			++ iFrame;
-		}		
-	}
+    /* Write bit-stream */
+    if (pFpBs != NULL && videoFrameTypeSkip != iEncode) {	// file handler to write bit stream
+      int iLayer = 0;
+      while (iLayer < sFbi.iLayerNum) {
+        SLayerBSInfo* pLayerBsInfo = &sFbi.sLayerInfo[iLayer];
+        if (pLayerBsInfo != NULL) {
+          int iLayerSize = 0;
+          int iNalIdx = pLayerBsInfo->iNalCount - 1;
+          do {
+            iLayerSize += pLayerBsInfo->iNalLengthInByte[iNalIdx];
+            -- iNalIdx;
+          } while (iNalIdx >= 0);
+          fwrite (pLayerBsInfo->pBsBuf, 1, iLayerSize, pFpBs);	// write pure bit stream into file
+        }
+        ++ iLayer;
+      }
+      ++ iFrame;
+    }
+  }
 
-	if (iFrame > 0){
-		double dElapsed = iTotal / 1e6;
-		printf( "Frames:		%d\nencode time:	%f sec\nFPS:		%f fps\n", iFrame, dElapsed, (iFrame * 1.0)/dElapsed );
-	}
+  if (iFrame > 0) {
+    double dElapsed = iTotal / 1e6;
+    printf ("Frames:		%d\nencode time:	%f sec\nFPS:		%f fps\n", iFrame, dElapsed, (iFrame * 1.0) / dElapsed);
+  }
 
-	if ( NULL != pPlanes[0] )
-	{
-        delete [] pPlanes[0];
-		pPlanes[0] = NULL;
-	}
+  if (NULL != pPlanes[0]) {
+    delete [] pPlanes[0];
+    pPlanes[0] = NULL;
+  }
 
-	if ( pFpBs ){
-		fclose( pFpBs );
-		pFpBs = NULL;
-	}
-	if ( pFpSrc ){
-		fclose( pFpSrc );
-		pFpSrc= NULL;
-	}
+  if (pFpBs) {
+    fclose (pFpBs);
+    pFpBs = NULL;
+  }
+  if (pFpSrc) {
+    fclose (pFpSrc);
+    pFpSrc = NULL;
+  }
 
-	return 0;
+  return 0;
 }
 
 
-int ProcessEncodingSvcWithConfig ( ISVCEncoder *pPtrEnc, int argc, char **argv )
-{
-	int iRet				= 0;	 
+int ProcessEncodingSvcWithConfig (ISVCEncoder* pPtrEnc, int argc, char** argv) {
+  int iRet				= 0;
 
-	if ( pPtrEnc == NULL )	
-		return 1;
-	
-	SFrameBSInfo sFbi;
-	SWelsSvcCodingParam sSvcParam;
-	int64_t iStart = 0, iTotal = 0;
+  if (pPtrEnc == NULL)
+    return 1;
 
-	// Preparing encoding process
-	FILE* pFileYUV[MAX_DEPENDENCY_LAYER] = {0};
-	int32_t iActualFrameEncodedCount = 0;
-	int32_t iFrameIdx = 0;
-	int32_t	iTotalFrameMax = -1;
-	int8_t  iDlayerIdx = 0;
-	uint8_t * pYUV[MAX_DEPENDENCY_LAYER] = { 0 };
-	SSourcePicture  **  pSrcPicList = NULL;
+  SFrameBSInfo sFbi;
+  SWelsSvcCodingParam sSvcParam;
+  int64_t iStart = 0, iTotal = 0;
+
+  // Preparing encoding process
+  FILE* pFileYUV[MAX_DEPENDENCY_LAYER] = {0};
+  int32_t iActualFrameEncodedCount = 0;
+  int32_t iFrameIdx = 0;
+  int32_t	iTotalFrameMax = -1;
+  int8_t  iDlayerIdx = 0;
+  uint8_t* pYUV[MAX_DEPENDENCY_LAYER] = { 0 };
+  SSourcePicture**    pSrcPicList = NULL;
 #if (defined(RUN_SIMULATOR) || defined(WIN32)||defined(_MACH_PLATFORM) || (defined(__GNUC__)))
-	// Inactive with sink with output file handler
-	FILE *pFpBs = NULL;
+  // Inactive with sink with output file handler
+  FILE* pFpBs = NULL;
 #endif
 #if defined(COMPARE_DATA)
-	//For getting the golden file handle
-	FILE *fpGolden = NULL;
+  //For getting the golden file handle
+  FILE* fpGolden = NULL;
 #endif
 #if defined ( STICK_STREAM_SIZE )
-	FILE *fTrackStream = fopen("coding_size.stream", "wb");;
+  FILE* fTrackStream = fopen ("coding_size.stream", "wb");;
 #endif
-	SFilesSet fs;
-	// for configuration file
-	CReadConfig cRdCfg;
-	int iParsedNum = 2;
+  SFilesSet fs;
+  // for configuration file
+  CReadConfig cRdCfg;
+  int iParsedNum = 2;
 
-	memset(&sFbi, 0, sizeof(SFrameBSInfo));
-	memset(&sSvcParam, 0, sizeof(SWelsSvcCodingParam));	
+  memset (&sFbi, 0, sizeof (SFrameBSInfo));
+  memset (&sSvcParam, 0, sizeof (SWelsSvcCodingParam));
 
-	sSvcParam.iInputCsp	= videoFormatI420;	// I420 in default
-	sSvcParam.sDependencyLayers[0].uiProfileIdc	= PRO_BASELINE;
+  sSvcParam.iInputCsp	= videoFormatI420;	// I420 in default
+  sSvcParam.sDependencyLayers[0].uiProfileIdc	= PRO_BASELINE;
 //	svc_cfg->sDependencyLayers[0].frext_mode	= 0;
 
-	// for configuration file
-	cRdCfg.Openf(argv[1]);
-	if ( !cRdCfg.ExistFile() ){
-		fprintf(stderr, "Specified file: %s not exist, maybe invalid path or parameter settting.\n", cRdCfg.GetFileName().c_str());
-		iRet = 1;
-		goto INSIDE_MEM_FREE;
-	}	
+  // for configuration file
+  cRdCfg.Openf (argv[1]);
+  if (!cRdCfg.ExistFile()) {
+    fprintf (stderr, "Specified file: %s not exist, maybe invalid path or parameter settting.\n",
+             cRdCfg.GetFileName().c_str());
+    iRet = 1;
+    goto INSIDE_MEM_FREE;
+  }
 
-	iRet = ParseConfig(cRdCfg, sSvcParam, fs);	
-	if ( iRet ){
-		fprintf(stderr, "parse svc parameter config file failed.\n");
-		iRet = 1;
-		goto INSIDE_MEM_FREE;
-	}
-	
-	if ( ParseCommandLine(argc-iParsedNum, argv+iParsedNum, sSvcParam, fs) != 0 )
-	{
-		printf("parse pCommand line failed\n");
-		iRet = 1;
-		goto INSIDE_MEM_FREE;
-	}	
+  iRet = ParseConfig (cRdCfg, sSvcParam, fs);
+  if (iRet) {
+    fprintf (stderr, "parse svc parameter config file failed.\n");
+    iRet = 1;
+    goto INSIDE_MEM_FREE;
+  }
 
-	iTotalFrameMax = (int32_t)sSvcParam.uiFrameToBeCoded;
-	sSvcParam.SUsedPicRect.iLeft = 0;
-	sSvcParam.SUsedPicRect.iTop = 0;
-//	sSvcParam.max_pic_width	= 
-	sSvcParam.iActualPicWidth =
-	sSvcParam.SUsedPicRect.iWidth = sSvcParam.sDependencyLayers[sSvcParam.iNumDependencyLayer-1].iFrameWidth;
-//	pSvcParam.max_pic_height	= 
-	sSvcParam.iActualPicHeight =
-	sSvcParam.SUsedPicRect.iHeight = sSvcParam.sDependencyLayers[sSvcParam.iNumDependencyLayer-1].iFrameHeight;	
-	
-	if ( cmResultSuccess != pPtrEnc->Initialize((void *)&sSvcParam, INIT_TYPE_CONFIG_BASED) )	// SVC encoder initialization
-	{
-		fprintf( stderr, "SVC encoder Initialize failed\n");
-		iRet = 1;
-		goto INSIDE_MEM_FREE;
-	}
+  if (ParseCommandLine (argc - iParsedNum, argv + iParsedNum, sSvcParam, fs) != 0) {
+    printf ("parse pCommand line failed\n");
+    iRet = 1;
+    goto INSIDE_MEM_FREE;
+  }
+
+  iTotalFrameMax = (int32_t)sSvcParam.uiFrameToBeCoded;
+  sSvcParam.SUsedPicRect.iLeft = 0;
+  sSvcParam.SUsedPicRect.iTop = 0;
+//	sSvcParam.max_pic_width	=
+  sSvcParam.iActualPicWidth =
+    sSvcParam.SUsedPicRect.iWidth = sSvcParam.sDependencyLayers[sSvcParam.iNumDependencyLayer - 1].iFrameWidth;
+//	pSvcParam.max_pic_height	=
+  sSvcParam.iActualPicHeight =
+    sSvcParam.SUsedPicRect.iHeight = sSvcParam.sDependencyLayers[sSvcParam.iNumDependencyLayer - 1].iFrameHeight;
+
+  if (cmResultSuccess != pPtrEnc->Initialize ((void*)&sSvcParam, INIT_TYPE_CONFIG_BASED)) {	// SVC encoder initialization
+    fprintf (stderr, "SVC encoder Initialize failed\n");
+    iRet = 1;
+    goto INSIDE_MEM_FREE;
+  }
 #if (defined(RUN_SIMULATOR) || defined(WIN32)||defined(_MACH_PLATFORM) || (defined(__GNUC__)))
-	// Inactive with sink with output file handler	
-	if ( fs.strBsFile.length() > 0 ){
-		pFpBs = fopen (fs.strBsFile.c_str(), "wb");
-		if (pFpBs == NULL){
-			fprintf( stderr, "Can not open file (%s) to write bitstream!\n", fs.strBsFile.c_str() );
-			iRet = 1;
-			goto INSIDE_MEM_FREE;
-		}
-	}
-#endif	
-	
+  // Inactive with sink with output file handler
+  if (fs.strBsFile.length() > 0) {
+    pFpBs = fopen (fs.strBsFile.c_str(), "wb");
+    if (pFpBs == NULL) {
+      fprintf (stderr, "Can not open file (%s) to write bitstream!\n", fs.strBsFile.c_str());
+      iRet = 1;
+      goto INSIDE_MEM_FREE;
+    }
+  }
+#endif
+
 #if defined(COMPARE_DATA)
-	//For getting the golden file handle	
-	if((fpGolden = fopen(argv[3], "rb")) == NULL) 
-	{
-		fprintf(stderr, "Unable to open golden sequence file, check corresponding path!\n");
-		iRet = 1;
-		goto INSIDE_MEM_FREE;
-	}
+  //For getting the golden file handle
+  if ((fpGolden = fopen (argv[3], "rb")) == NULL) {
+    fprintf (stderr, "Unable to open golden sequence file, check corresponding path!\n");
+    iRet = 1;
+    goto INSIDE_MEM_FREE;
+  }
 #endif
 
-	pSrcPicList = new SSourcePicture * [sSvcParam.iNumDependencyLayer];		
-	while (iDlayerIdx < sSvcParam.iNumDependencyLayer) {
-		SDLayerParam *pDLayer = &sSvcParam.sDependencyLayers[iDlayerIdx];			
-		const int kiPicResSize = pDLayer->iFrameWidth * pDLayer->iFrameHeight;
-		SSourcePicture * pSrcPic = new SSourcePicture;
-		if( pSrcPic == NULL ){
-			iRet = 1;
-			goto INSIDE_MEM_FREE;
-		}
-		memset(pSrcPic, 0, sizeof(SSourcePicture));
-		
-		pYUV[iDlayerIdx] = new uint8_t [(3*kiPicResSize)>>1];
-		if (pYUV[iDlayerIdx] == NULL)
-		{
-			iRet = 1;
-			goto INSIDE_MEM_FREE;
-		}
+  pSrcPicList = new SSourcePicture * [sSvcParam.iNumDependencyLayer];
+  while (iDlayerIdx < sSvcParam.iNumDependencyLayer) {
+    SDLayerParam* pDLayer = &sSvcParam.sDependencyLayers[iDlayerIdx];
+    const int kiPicResSize = pDLayer->iFrameWidth * pDLayer->iFrameHeight;
+    SSourcePicture* pSrcPic = new SSourcePicture;
+    if (pSrcPic == NULL) {
+      iRet = 1;
+      goto INSIDE_MEM_FREE;
+    }
+    memset (pSrcPic, 0, sizeof (SSourcePicture));
 
-		pSrcPic->iColorFormat = videoFormatI420;
-		pSrcPic->iPicWidth = pDLayer->iFrameWidth;
-		pSrcPic->iPicHeight = pDLayer->iFrameHeight;
-		pSrcPic->iStride[0] = pDLayer->iFrameWidth;
-		pSrcPic->iStride[1] = pSrcPic->iStride[2] = pDLayer->iFrameWidth >> 1;
+    pYUV[iDlayerIdx] = new uint8_t [ (3 * kiPicResSize) >> 1];
+    if (pYUV[iDlayerIdx] == NULL) {
+      iRet = 1;
+      goto INSIDE_MEM_FREE;
+    }
 
-		pSrcPicList[iDlayerIdx] = pSrcPic;		
+    pSrcPic->iColorFormat = videoFormatI420;
+    pSrcPic->iPicWidth = pDLayer->iFrameWidth;
+    pSrcPic->iPicHeight = pDLayer->iFrameHeight;
+    pSrcPic->iStride[0] = pDLayer->iFrameWidth;
+    pSrcPic->iStride[1] = pSrcPic->iStride[2] = pDLayer->iFrameWidth >> 1;
 
-		pFileYUV[iDlayerIdx]	= fopen( fs.sSpatialLayers[iDlayerIdx].strSeqFile.c_str(), "rb");
-		if (pFileYUV[iDlayerIdx] != NULL){
-			if( !fseek( pFileYUV[iDlayerIdx], 0, SEEK_END ) )
-			{
-				int64_t i_size = ftell( pFileYUV[iDlayerIdx] );
-				fseek( pFileYUV[iDlayerIdx], 0, SEEK_SET );
-				iTotalFrameMax = WELS_MAX( (int32_t)(i_size / ((3*kiPicResSize)>>1) ), iTotalFrameMax );
-			}
-		}
-		else{
-			fprintf(stderr, "Unable to open source sequence file (%s), check corresponding path!\n", fs.sSpatialLayers[iDlayerIdx].strSeqFile.c_str());
-			iRet = 1;
-			goto INSIDE_MEM_FREE;
-		}			
+    pSrcPicList[iDlayerIdx] = pSrcPic;
 
-		++ iDlayerIdx;
-	}
-	
-	iFrameIdx = 0;
-	while (iFrameIdx < iTotalFrameMax && (((int32_t)sSvcParam.uiFrameToBeCoded <= 0) || (iFrameIdx < (int32_t)sSvcParam.uiFrameToBeCoded)) ) {
-		bool_t bOnePicAvailableAtLeast = false;
-		bool_t bSomeSpatialUnavailable	  = false;
+    pFileYUV[iDlayerIdx]	= fopen (fs.sSpatialLayers[iDlayerIdx].strSeqFile.c_str(), "rb");
+    if (pFileYUV[iDlayerIdx] != NULL) {
+      if (!fseek (pFileYUV[iDlayerIdx], 0, SEEK_END)) {
+        int64_t i_size = ftell (pFileYUV[iDlayerIdx]);
+        fseek (pFileYUV[iDlayerIdx], 0, SEEK_SET);
+        iTotalFrameMax = WELS_MAX ((int32_t) (i_size / ((3 * kiPicResSize) >> 1)), iTotalFrameMax);
+      }
+    } else {
+      fprintf (stderr, "Unable to open source sequence file (%s), check corresponding path!\n",
+               fs.sSpatialLayers[iDlayerIdx].strSeqFile.c_str());
+      iRet = 1;
+      goto INSIDE_MEM_FREE;
+    }
 
+    ++ iDlayerIdx;
+  }
+
+  iFrameIdx = 0;
+  while (iFrameIdx < iTotalFrameMax && (((int32_t)sSvcParam.uiFrameToBeCoded <= 0)
+                                        || (iFrameIdx < (int32_t)sSvcParam.uiFrameToBeCoded))) {
+    bool_t bOnePicAvailableAtLeast = false;
+    bool_t bSomeSpatialUnavailable	  = false;
+
 #ifdef ONLY_ENC_FRAMES_NUM
-		// Only encoded some limited frames here
-		if ( iActualFrameEncodedCount >= ONLY_ENC_FRAMES_NUM )
-		{
-			break;
-		}
+    // Only encode a limited number of frames here
+    if (iActualFrameEncodedCount >= ONLY_ENC_FRAMES_NUM) {
+      break;
+    }
 #endif//ONLY_ENC_FRAMES_NUM
 
-		iDlayerIdx = 0;
-        int  nSpatialLayerNum = 0;
-		while (iDlayerIdx < sSvcParam.iNumDependencyLayer) {
-			SDLayerParam * pDLayer = &sSvcParam.sDependencyLayers[iDlayerIdx];
-			const int kiPicResSize = ((pDLayer->iFrameWidth * pDLayer->iFrameHeight)*3)>>1;			
-			uint32_t uiSkipIdx = (1 << pDLayer->iTemporalResolution);
-			
-			bool_t bCanBeRead= false;
+    iDlayerIdx = 0;
+    int  nSpatialLayerNum = 0;
+    while (iDlayerIdx < sSvcParam.iNumDependencyLayer) {
+      SDLayerParam* pDLayer = &sSvcParam.sDependencyLayers[iDlayerIdx];
+      const int kiPicResSize = ((pDLayer->iFrameWidth * pDLayer->iFrameHeight) * 3) >> 1;
+      uint32_t uiSkipIdx = (1 << pDLayer->iTemporalResolution);
 
-			if ( iFrameIdx % uiSkipIdx == 0 )	// such layer is enabled to encode indeed
-			{				
-				bCanBeRead = (fread(pYUV[iDlayerIdx], 1, kiPicResSize, pFileYUV[iDlayerIdx]) == kiPicResSize);
-				
-				if ( bCanBeRead )
-				{										
-					bOnePicAvailableAtLeast	= true;					
+      bool_t bCanBeRead = false;
 
-					pSrcPicList[nSpatialLayerNum]->pData[0] = pYUV[iDlayerIdx];
-					pSrcPicList[nSpatialLayerNum]->pData[1] = pSrcPicList[nSpatialLayerNum]->pData[0] +
-						(pDLayer->iFrameWidth * pDLayer->iFrameHeight);
-					pSrcPicList[nSpatialLayerNum]->pData[2] = pSrcPicList[nSpatialLayerNum]->pData[1] + 
-						((pDLayer->iFrameWidth * pDLayer->iFrameHeight)>>2);
+      if (iFrameIdx % uiSkipIdx == 0) {	// such layer is enabled to encode indeed
+        bCanBeRead = (fread (pYUV[iDlayerIdx], 1, kiPicResSize, pFileYUV[iDlayerIdx]) == kiPicResSize);
 
-					pSrcPicList[nSpatialLayerNum]->iPicWidth = pDLayer->iFrameWidth;
-					pSrcPicList[nSpatialLayerNum]->iPicHeight = pDLayer->iFrameHeight;
-					pSrcPicList[nSpatialLayerNum]->iStride[0] = pDLayer->iFrameWidth;
-					pSrcPicList[nSpatialLayerNum]->iStride[1] = pSrcPicList[nSpatialLayerNum]->iStride[2]
-					  = pDLayer->iFrameWidth >> 1;
+        if (bCanBeRead) {
+          bOnePicAvailableAtLeast	= true;
 
-					++ nSpatialLayerNum;
-				}
-				else	// file end while reading
-				{
-					bSomeSpatialUnavailable = true;
-					break;
-				}
-			}
-			else
-			{					
-				
-			}		
-			
-			++ iDlayerIdx;			
-		}
+          pSrcPicList[nSpatialLayerNum]->pData[0] = pYUV[iDlayerIdx];
+          pSrcPicList[nSpatialLayerNum]->pData[1] = pSrcPicList[nSpatialLayerNum]->pData[0] +
+              (pDLayer->iFrameWidth * pDLayer->iFrameHeight);
+          pSrcPicList[nSpatialLayerNum]->pData[2] = pSrcPicList[nSpatialLayerNum]->pData[1] +
+              ((pDLayer->iFrameWidth * pDLayer->iFrameHeight) >> 2);
 
-		if ( bSomeSpatialUnavailable )
-			break;
+          pSrcPicList[nSpatialLayerNum]->iPicWidth = pDLayer->iFrameWidth;
+          pSrcPicList[nSpatialLayerNum]->iPicHeight = pDLayer->iFrameHeight;
+          pSrcPicList[nSpatialLayerNum]->iStride[0] = pDLayer->iFrameWidth;
+          pSrcPicList[nSpatialLayerNum]->iStride[1] = pSrcPicList[nSpatialLayerNum]->iStride[2]
+              = pDLayer->iFrameWidth >> 1;
 
-		if ( !bOnePicAvailableAtLeast ){
-			++ iFrameIdx;
-			continue;
-		}		
-		
-		// To encoder this frame
-		iStart	= WelsTime();			
-		int iEncFrames = pPtrEnc->EncodeFrame(const_cast<const SSourcePicture**>(pSrcPicList), nSpatialLayerNum, &sFbi);
-		iTotal += WelsTime() - iStart;		
+          ++ nSpatialLayerNum;
+        } else {	// file end while reading
+          bSomeSpatialUnavailable = true;
+          break;
+        }
+      } else {
 
-		// fixed issue in case dismatch source picture introduced by frame skipped, 1/12/2010
-		if ( videoFrameTypeSkip == iEncFrames )
-		{
-			continue;
-		}
+      }
 
-		if ( iEncFrames != videoFrameTypeInvalid && iEncFrames != videoFrameTypeSkip )
-		{
-			int iLayer = 0;
-			int iFrameSize = 0;
-			while ( iLayer < sFbi.iLayerNum ){
-				SLayerBSInfo *pLayerBsInfo = &sFbi.sLayerInfo[iLayer];
-				if ( pLayerBsInfo != NULL ){
-					int iLayerSize = 0;
-					int iNalIdx = pLayerBsInfo->iNalCount -1;
-					do {
-						iLayerSize += pLayerBsInfo->iNalLengthInByte[iNalIdx];
-						-- iNalIdx;
-					} while(iNalIdx >= 0);
+      ++ iDlayerIdx;
+    }
+
+    if (bSomeSpatialUnavailable)
+      break;
+
+    if (!bOnePicAvailableAtLeast) {
+      ++ iFrameIdx;
+      continue;
+    }
+
+    // Encode this frame
+    iStart	= WelsTime();
+    int iEncFrames = pPtrEnc->EncodeFrame (const_cast<const SSourcePicture**> (pSrcPicList), nSpatialLayerNum, &sFbi);
+    iTotal += WelsTime() - iStart;
+
+    // fixed issue in case of mismatched source picture introduced by skipped frame, 1/12/2010
+    if (videoFrameTypeSkip == iEncFrames) {
+      continue;
+    }
+
+    if (iEncFrames != videoFrameTypeInvalid && iEncFrames != videoFrameTypeSkip) {
+      int iLayer = 0;
+      int iFrameSize = 0;
+      while (iLayer < sFbi.iLayerNum) {
+        SLayerBSInfo* pLayerBsInfo = &sFbi.sLayerInfo[iLayer];
+        if (pLayerBsInfo != NULL) {
+          int iLayerSize = 0;
+          int iNalIdx = pLayerBsInfo->iNalCount - 1;
+          do {
+            iLayerSize += pLayerBsInfo->iNalLengthInByte[iNalIdx];
+            -- iNalIdx;
+          } while (iNalIdx >= 0);
 #if defined(COMPARE_DATA)
-						//Comparing the result of encoder with golden pData
-                        {
-							unsigned char *pUCArry = new unsigned char [iLayerSize];
-							
-							fread(pUCArry, 1, iLayerSize, fpGolden);
+          //Comparing the result of encoder with golden pData
+          {
+            unsigned char* pUCArry = new unsigned char [iLayerSize];
 
-							for (int w=0; w<iLayerSize; w++) {
-								if (pUCArry[w] != pLayerBsInfo->pBsBuf[w]) {
-									fprintf(stderr, "error @frame%d/layer%d/byte%d!!!!!!!!!!!!!!!!!!!!!!!!\n", iFrameIdx, iLayer, w);
-									//fprintf(stderr, "%x - %x\n", pUCArry[w], pLayerBsInfo->pBsBuf[w]);									
-									break;
-								}
-							}
-							fprintf( stderr, "frame%d/layer%d comparation completed!\n", iFrameIdx, iLayer);
-							
-							delete [] pUCArry;
-						} 
+            fread (pUCArry, 1, iLayerSize, fpGolden);
+
+            for (int w = 0; w < iLayerSize; w++) {
+              if (pUCArry[w] != pLayerBsInfo->pBsBuf[w]) {
+                fprintf (stderr, "error @frame%d/layer%d/byte%d!!!!!!!!!!!!!!!!!!!!!!!!\n", iFrameIdx, iLayer, w);
+                //fprintf(stderr, "%x - %x\n", pUCArry[w], pLayerBsInfo->pBsBuf[w]);
+                break;
+              }
+            }
+            fprintf (stderr, "frame%d/layer%d comparation completed!\n", iFrameIdx, iLayer);
+
+            delete [] pUCArry;
+          }
 #endif
 #if (defined(RUN_SIMULATOR) || defined(WIN32)||defined(_MACH_PLATFORM) || (defined(__GNUC__)))
-					fwrite(pLayerBsInfo->pBsBuf, 1, iLayerSize, pFpBs);	// write pure bit stream into file
-#endif					
-					iFrameSize += iLayerSize;
-				}
-				++ iLayer;
-			}
+          fwrite (pLayerBsInfo->pBsBuf, 1, iLayerSize, pFpBs);	// write pure bit stream into file
+#endif
+          iFrameSize += iLayerSize;
+        }
+        ++ iLayer;
+      }
 #if defined (STICK_STREAM_SIZE)
-			if ( fTrackStream ){
-				fwrite( &iFrameSize, 1, sizeof(int), fTrackStream );
-			}
+      if (fTrackStream) {
+        fwrite (&iFrameSize, 1, sizeof (int), fTrackStream);
+      }
 #endif//STICK_STREAM_SIZE
-			++ iActualFrameEncodedCount;	// excluding skipped frame time
-		}
-		else{
-			fprintf(stderr, "EncodeFrame(), ret: %d, frame index: %d.\n", iEncFrames, iFrameIdx);
-		}
+      ++ iActualFrameEncodedCount;	// excluding skipped frame time
+    } else {
+      fprintf (stderr, "EncodeFrame(), ret: %d, frame index: %d.\n", iEncFrames, iFrameIdx);
+    }
 
-		++ iFrameIdx;
-	}
+    ++ iFrameIdx;
+  }
 
-	if (iActualFrameEncodedCount > 0){
-		double dElapsed = iTotal / 1e6;
-		printf( "Width:		%d\nHeight:		%d\nFrames:		%d\nencode time:	%f sec\nFPS:		%f fps\n",
-			sSvcParam.iActualPicWidth, sSvcParam.iActualPicHeight,
-			iActualFrameEncodedCount, dElapsed, (iActualFrameEncodedCount * 1.0)/dElapsed );
-	}	
+  if (iActualFrameEncodedCount > 0) {
+    double dElapsed = iTotal / 1e6;
+    printf ("Width:		%d\nHeight:		%d\nFrames:		%d\nencode time:	%f sec\nFPS:		%f fps\n",
+            sSvcParam.iActualPicWidth, sSvcParam.iActualPicHeight,
+            iActualFrameEncodedCount, dElapsed, (iActualFrameEncodedCount * 1.0) / dElapsed);
+  }
 
-INSIDE_MEM_FREE:
-	{
+INSIDE_MEM_FREE: {
 #if (defined(RUN_SIMULATOR) || defined(WIN32)||defined(_MACH_PLATFORM) || (defined(__GNUC__)))
-	if (pFpBs)
-	{
-		fclose(pFpBs);
-		pFpBs = NULL;
-	}
+    if (pFpBs) {
+      fclose (pFpBs);
+      pFpBs = NULL;
+    }
 #endif
 #if defined (STICK_STREAM_SIZE)
-	if ( fTrackStream ){
-		fclose( fTrackStream );
-		fTrackStream = NULL;
-	}
+    if (fTrackStream) {
+      fclose (fTrackStream);
+      fTrackStream = NULL;
+    }
 #endif
-#if defined (COMPARE_DATA)	
-	if ( fpGolden ){
-		fclose(fpGolden);
-		fpGolden = NULL;
-	}  
+#if defined (COMPARE_DATA)
+    if (fpGolden) {
+      fclose (fpGolden);
+      fpGolden = NULL;
+    }
 #endif
-	// Destruction memory introduced in this routine
-	iDlayerIdx = 0;	
-	while (iDlayerIdx < sSvcParam.iNumDependencyLayer)
-	{
-		if (pFileYUV[iDlayerIdx] != NULL){
-			fclose(pFileYUV[iDlayerIdx]);
-			pFileYUV[iDlayerIdx] = NULL;
-		}
-		++ iDlayerIdx;		
-	}	
+    // Release resources (files, memory) acquired in this routine
+    iDlayerIdx = 0;
+    while (iDlayerIdx < sSvcParam.iNumDependencyLayer) {
+      if (pFileYUV[iDlayerIdx] != NULL) {
+        fclose (pFileYUV[iDlayerIdx]);
+        pFileYUV[iDlayerIdx] = NULL;
+      }
+      ++ iDlayerIdx;
+    }
 
-	if( pSrcPicList ){
-		for( int32_t i=0;i<sSvcParam.iNumDependencyLayer;i++ )
-		{
-			if( pSrcPicList[i] ){
-				delete pSrcPicList[i];
-				pSrcPicList[i] = NULL;
-			}
-		}
-		delete pSrcPicList;
-		pSrcPicList = NULL;
-	}
+    if (pSrcPicList) {
+      for (int32_t i = 0; i < sSvcParam.iNumDependencyLayer; i++) {
+        if (pSrcPicList[i]) {
+          delete pSrcPicList[i];
+          pSrcPicList[i] = NULL;
+        }
+      }
+      delete pSrcPicList;
+      pSrcPicList = NULL;
+    }
 
-	for( int32_t i=0;i<MAX_DEPENDENCY_LAYER;i++ ){
-		if( pYUV[i] ){
-			delete [] pYUV[i];
-			pYUV[i] = NULL;
-		}
-	}
-	}
+    for (int32_t i = 0; i < MAX_DEPENDENCY_LAYER; i++) {
+      if (pYUV[i]) {
+        delete [] pYUV[i];
+        pYUV[i] = NULL;
+      }
+    }
+  }
 
-	return iRet;
+  return iRet;
 }
 
 //  Merge from Heifei's Wonder.  Lock process to a single core
-void LockToSingleCore()
-{  
+void LockToSingleCore() {
 #ifdef _MSC_VER
-	//for 2005 compiler, change "DWORD" to "DWORD_PTR"
-	DWORD ProcessAffMask = 0, SystemAffMask = 0;
-	HANDLE hProcess = GetCurrentProcess();
+  //for 2005 compiler, change "DWORD" to "DWORD_PTR"
+  DWORD ProcessAffMask = 0, SystemAffMask = 0;
+  HANDLE hProcess = GetCurrentProcess();
 
-	GetProcessAffinityMask(hProcess, &ProcessAffMask, &SystemAffMask);
-	if (ProcessAffMask > 1)
-	{
-		// more than one CPU core available. Fix to only one:
-		if (ProcessAffMask & 2) 
-		{
-			ProcessAffMask = 2;
-		}
-		else 
-		{
-			ProcessAffMask = 1;
-		}
-		// Lock process to a single CPU core
-		SetProcessAffinityMask(hProcess, ProcessAffMask);
-	}
+  GetProcessAffinityMask (hProcess, &ProcessAffMask, &SystemAffMask);
+  if (ProcessAffMask > 1) {
+    // more than one CPU core available. Fix to only one:
+    if (ProcessAffMask & 2) {
+      ProcessAffMask = 2;
+    } else {
+      ProcessAffMask = 1;
+    }
+    // Lock process to a single CPU core
+    SetProcessAffinityMask (hProcess, ProcessAffMask);
+  }
 
-	// set high priority to avoid interrupts during test
-	SetPriorityClass(hProcess, REALTIME_PRIORITY_CLASS);
+  // set high priority to avoid interrupts during test
+  SetPriorityClass (hProcess, REALTIME_PRIORITY_CLASS);
 #endif
-	return ;
+  return ;
 }
 
-long CreateSVCEncHandle(ISVCEncoder** ppEncoder)
-{
-	long ret = 0;
+long CreateSVCEncHandle (ISVCEncoder** ppEncoder) {
+  long ret = 0;
 #if defined(MACOS)
-	ret = WelsEncBundleLoad();
-	WelsEncBundleCreateEncoder(ppEncoder);
+  ret = WelsEncBundleLoad();
+  WelsEncBundleCreateEncoder (ppEncoder);
 #else
-	ret = CreateSVCEncoder( ppEncoder );
+  ret = CreateSVCEncoder (ppEncoder);
 #endif//MACOS
-	return ret;
+  return ret;
 }
 
-void DestroySVCEncHanlde(ISVCEncoder* pEncoder)
-{
-	if (pEncoder)
-	{
+void DestroySVCEncHanlde (ISVCEncoder* pEncoder) {
+  if (pEncoder) {
 #if defined(MACOS)
-		WelsEncBundleDestroyEncoder(pEncoder);
+    WelsEncBundleDestroyEncoder (pEncoder);
 #else
-		DestroySVCEncoder( pEncoder );
+    DestroySVCEncoder (pEncoder);
 #endif//MACOS
 
-	}
+  }
 }
 
 /****************************************************************************
@@ -1494,71 +1388,61 @@
  * main:
  ****************************************************************************/
 #if (defined(MACOS))
-int main_demo( int argc, char **argv )
+int main_demo (int argc, char** argv)
 #else
-int main( int argc, char **argv )
+int main (int argc, char** argv)
 #endif
-{	
-	ISVCEncoder* pSVCEncoder	= NULL;
-    FILE *pFileOut					= NULL; 
-    FILE *pFileIn					= NULL;
-	int iRet					= 0;
-	
+{
+  ISVCEncoder* pSVCEncoder	= NULL;
+  FILE* pFileOut					= NULL;
+  FILE* pFileIn					= NULL;
+  int iRet					= 0;
+
 #ifdef _MSC_VER
-	_setmode(_fileno(stdin), _O_BINARY);    /* thanks to Marcoss Morais <morais at dee.ufcg.edu.br> */
-	_setmode(_fileno(stdout), _O_BINARY);
+  _setmode (_fileno (stdin), _O_BINARY);  /* thanks to Marcoss Morais <morais at dee.ufcg.edu.br> */
+  _setmode (_fileno (stdout), _O_BINARY);
 
-	// remove the LOCK_TO_SINGLE_CORE micro, user need to enable it with manual  
-	// LockToSingleCore();
+  // the LOCK_TO_SINGLE_CORE macro was removed; uncomment the call below to lock to a single core manually
+  // LockToSingleCore();
 #endif
 
-	/* Control-C handler */
-	signal( SIGINT, SigIntHandler );
+  /* Control-C handler */
+  signal (SIGINT, SigIntHandler);
 
-	iRet = CreateSVCEncHandle( &pSVCEncoder );
-	if ( iRet )
-	{
-		cout << "CreateSVCEncoder() failed!!" << endl;		
-		goto exit;
-	}
+  iRet = CreateSVCEncHandle (&pSVCEncoder);
+  if (iRet) {
+    cout << "CreateSVCEncoder() failed!!" << endl;
+    goto exit;
+  }
 
-	if (argc < 2)
-	{
-		goto exit;
-	}
-	else
-	{
-		string	strCfgFileName = argv[1];
-		basic_string <char>::size_type index;
-		static const basic_string <char>::size_type npos = size_t(-1);
-		index = strCfgFileName.rfind(".cfg");	// check configuration type (like .cfg?)
-		if ( index == npos )
-		{
-			if (argc > 2)
-			{
-				iRet = ProcessEncodingSvcWithParam( pSVCEncoder, argc, argv );
-				if ( iRet != 0 )
-					goto exit;
-			}
-			else
-			{
-				cout << "You specified pCommand is invalid!!" << endl;
-				goto exit;
-			}
-		}
-		else
-		{
-			iRet = ProcessEncodingSvcWithConfig( pSVCEncoder, argc, argv);
-			if (iRet > 0)
-				goto exit;
-		}
-	}
+  if (argc < 2) {
+    goto exit;
+  } else {
+    string	strCfgFileName = argv[1];
+    basic_string <char>::size_type index;
+    static const basic_string <char>::size_type npos = size_t (-1);
+    index = strCfgFileName.rfind (".cfg");	// check configuration type (like .cfg?)
+    if (index == npos) {
+      if (argc > 2) {
+        iRet = ProcessEncodingSvcWithParam (pSVCEncoder, argc, argv);
+        if (iRet != 0)
+          goto exit;
+      } else {
+        cout << "You specified pCommand is invalid!!" << endl;
+        goto exit;
+      }
+    } else {
+      iRet = ProcessEncodingSvcWithConfig (pSVCEncoder, argc, argv);
+      if (iRet > 0)
+        goto exit;
+    }
+  }
 
-	DestroySVCEncHanlde( pSVCEncoder );
-	return 0;
+  DestroySVCEncHanlde (pSVCEncoder);
+  return 0;
 
 exit:
-	DestroySVCEncHanlde( pSVCEncoder );
-	PrintHelp();
-	return 1;
+  DestroySVCEncHanlde (pSVCEncoder);
+  PrintHelp();
+  return 1;
 }
--- a/codec/decoder/core/asm/asm_inc.asm
+++ b/codec/decoder/core/asm/asm_inc.asm
@@ -43,7 +43,7 @@
 ; Options, for DEBUG
 ;***********************************************************************
 
-%if 1 
+%if 1
 	%define MOVDQ movdqa
 %else
 	%define MOVDQ movdqu
@@ -58,7 +58,7 @@
 BITS 32
 
 ;***********************************************************************
-; Macros 
+; Macros
 ;***********************************************************************
 
 %macro WELS_EXTERN 1
@@ -74,7 +74,7 @@
 	pxor        %2, %2
     psubw       %2, %1
     pmaxsw      %1, %2
-%endmacro 	
+%endmacro
 
 %macro MMX_XSwap  4
     movq		%4, %2
@@ -105,7 +105,7 @@
     SSE2_XSawp qdq, %5, %2, %3
 %endmacro
 
-;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4 
+;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4
 %macro SSE2_TransTwo4x4W 5
     SSE2_XSawp wd,  %1, %2, %5
     SSE2_XSawp wd,  %3, %4, %2
@@ -125,26 +125,26 @@
 	movdqa	%6, %9
 	movdqa	%9, %4
 	SSE2_XSawp bw,  %7, %6, %4
-	
-	SSE2_XSawp wd,  %1, %3, %6	
+
+	SSE2_XSawp wd,  %1, %3, %6
 	SSE2_XSawp wd,  %8, %2, %3
 	SSE2_XSawp wd,  %5, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %3	
+	movdqa	%9, %3
 	SSE2_XSawp wd,  %7, %4, %3
-	
-	SSE2_XSawp dq,  %1, %5, %4	
+
+	SSE2_XSawp dq,  %1, %5, %4
 	SSE2_XSawp dq,  %6, %2, %5
 	SSE2_XSawp dq,  %8, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %5		
+	movdqa	%9, %5
 	SSE2_XSawp dq,  %7, %3, %5
-	
+
 	SSE2_XSawp qdq,  %1, %8, %3
 	SSE2_XSawp qdq,  %4, %2, %8
 	SSE2_XSawp qdq,  %6, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %1		
+	movdqa	%9, %1
 	SSE2_XSawp qdq,  %7, %5, %1
 	movdqa	%5, %9
 %endmacro
@@ -170,9 +170,9 @@
 %macro butterfly_1to16_sse	3	; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
 	mov %3h, %3l
 	movd %1, e%3x		; i.e, 1% = eax (=b0)
-	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0	
-	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0	
-%endmacro  
+	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0
+	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
+%endmacro
 
 ;copy a dw into a xmm for 8 times
 %macro  SSE2_Copy8Times 2
--- a/codec/decoder/core/asm/block_add.asm
+++ b/codec/decoder/core/asm/block_add.asm
@@ -48,7 +48,7 @@
 ; Macros and other preprocessor constants
 ;*******************************************************************************
 
-%macro   BLOCK_ADD_16_SSE2   4 
+%macro   BLOCK_ADD_16_SSE2   4
 	movdqa    xmm0,       [%2]
 	movdqa    xmm1,       [%3]
     movdqa    xmm2,       [%3+10h]
@@ -65,7 +65,7 @@
 
 	lea          %2,      [%2+%4]
 	lea          %3,      [%3+%4*2]
-	lea          %1,      [%1+%4] 
+	lea          %1,      [%1+%4]
 %endmacro
 
 %macro    BLOCK_ADD_8_MMXEXT   4
@@ -106,7 +106,7 @@
 
 	lea          %2,      [%2+%4]
 	lea          %3,      [%3+%5*2]
-	lea          %1,      [%1+%4] 
+	lea          %1,      [%1+%4]
 %endmacro
 
 
@@ -130,24 +130,24 @@
 	lea          %1,      [%1+%4]
 %endmacro
 
-%macro    BLOCK_ADD_8_STRIDE_2_LINES_SSE2   5    
+%macro    BLOCK_ADD_8_STRIDE_2_LINES_SSE2   5
 	movdqa xmm1, [%3]
 	movq xmm0, [%2]
 	punpcklbw xmm0, xmm7
 	paddw xmm0, xmm1
 	packuswb xmm0, xmm7
-	movq [%1], xmm0	
-	
+	movq [%1], xmm0
+
 	movdqa xmm3, [%3+%5*2]
 	movq xmm2, [%2+%4]
 	punpcklbw xmm2, xmm7
 	paddw xmm2, xmm3
-	packuswb xmm2, xmm7	
-	movq [%1+%4], xmm2	
-	
+	packuswb xmm2, xmm7
+	movq [%1+%4], xmm2
+
 	lea %1, [%1+%4*2]
 	lea %2, [%2+%4*2]
-	lea %3, [%3+%5*4]	
+	lea %3, [%3+%5*4]
 %endmacro
 
 %macro   CHECK_DATA_16_ZERO_SSE4     3
@@ -159,7 +159,7 @@
 	por		   xmm0,	 xmm1
 	ptest      xmm7,     xmm0
 	cmovae     eax,      %3
-	
+
 	add        %1,       20h
 	add        ecx,      04h
 	mov        byte [%2+ebx],  al
@@ -170,12 +170,12 @@
     movdqa     xmm1,      [%1+%3]
     movdqa     xmm2,      [%1+%3*2]
     movdqa     xmm3,      [%1+%4]
-    
+
     mov        eax,       0h
     mov        ebx,       0h
     movdqa     xmm4,      xmm0
     movdqa     xmm5,      xmm2
-    
+
     punpcklqdq  xmm0,     xmm1
     punpckhqdq  xmm4,     xmm1
     punpcklqdq  xmm2,     xmm3
@@ -183,12 +183,12 @@
 
 	por			xmm0,	  xmm2
 	por			xmm4,	  xmm5
-    
+
     ptest       xmm7,     xmm0
     cmovae      eax,      %5
     ptest       xmm7,     xmm4
-    cmovae      ebx,      %5    
-    
+    cmovae      ebx,      %5
+
     mov     byte [%2],    al
     mov     byte [%2+1],  bl
 %endmacro
@@ -230,45 +230,45 @@
     movdqa     xmm0,      [%1]
     movdqa     xmm1,      [%1+10h]
     mov        ebx,       [ecx]
-    
+
     pcmpeqw    xmm0,      xmm7
     pcmpeqw    xmm1,      xmm7
     packsswb   xmm0,      xmm1
-    pmovmskb   edx,       xmm0    
+    pmovmskb   edx,       xmm0
     sub        edx,       0ffffh
-    
-    cmovb      eax,       ebp   
+
+    cmovb      eax,       ebp
     add        ecx,       4
     add        %1,        20h
     mov      byte [%2+ebx],    al
 %endmacro
-    
 
 
+
 %macro   CHECK_RS_4x4_BLOCK_2_ZERO_SSE2    5
     movdqa    xmm0,      [%1]
     movdqa    xmm1,      [%1 + %3]
     movdqa    xmm2,      [%1 + %3*2]
-    movdqa    xmm3,      [%1 + %4]    
-    
+    movdqa    xmm3,      [%1 + %4]
+
     movdqa    xmm4,       xmm0
     movdqa    xmm5,       xmm2
-    
+
     punpcklqdq   xmm0,    xmm1
     punpckhqdq   xmm4,    xmm1
     punpcklqdq   xmm2,    xmm3
     punpckhqdq   xmm5,    xmm3
-    
+
     pcmpeqw      xmm0,    xmm7
     pcmpeqw      xmm2,    xmm7
     pcmpeqw      xmm4,    xmm7
     pcmpeqw      xmm5,    xmm7
-    
+
     packsswb     xmm0,    xmm2
     packsswb     xmm4,    xmm5
     pmovmskb     eax,     xmm0
     pmovmskb     ebx,     xmm4
-    
+
     sub          eax,     0ffffh
     mov          eax,     0
     cmovb        eax,     %5
@@ -276,7 +276,7 @@
     mov          ebx,     0
     cmovb        ebx,     %5
     mov       byte [%2],    al
-    mov       byte [%2+1],  bl        
+    mov       byte [%2+1],  bl
 %endmacro
 
 ;*******************************************************************************
@@ -291,12 +291,12 @@
 
 ALIGN  16
 SubMbScanIdx:
-     dd    0x0,  0x1,  0x4,  0x5, 
+     dd    0x0,  0x1,  0x4,  0x5,
 	 dd    0x2,  0x3,  0x6,  0x7,
 	 dd    0x8,  0x9,  0xc,  0xd,
 	 dd    0xa,  0xb,  0xe,  0xf,
 	 dd    0x10, 0x11, 0x14, 0x15,
-	 dd    0x12, 0x13, 0x16, 0x17,     
+	 dd    0x12, 0x13, 0x16, 0x17,
 
 ;*******************************************************************************
 ; Code
@@ -312,10 +312,10 @@
 ;  void_t WelsResBlockZero16x16_sse2(int16_t* pBlock,int32_t iStride)
 ;*******************************************************************************
 WelsResBlockZero16x16_sse2:
-    push     esi	
+    push     esi
 
 	mov      esi,        [esp+08h]
-	mov      ecx,        [esp+0ch]	
+	mov      ecx,        [esp+0ch]
 	lea      ecx,        [ecx*2]
 	lea      eax,        [ecx*3]
 
@@ -375,7 +375,7 @@
 
 	movdqa   [esi+eax],     xmm7
 	movdqa   [esi+eax+10h],     xmm7
-    
+
     pop      esi
 	ret
 
@@ -386,7 +386,7 @@
 ;*******************************************************************************
 ;  void_t WelsResBlockZero8x8_sse2(int16_t * pBlock, int32_t iStride)
 ;*******************************************************************************
-WelsResBlockZero8x8_sse2: 
+WelsResBlockZero8x8_sse2:
 	  push      esi
 
       mov       esi,     [esp+08h]
@@ -407,7 +407,7 @@
 	  movdqa    [esi+ecx*2],   xmm7
 	  movdqa    [esi+eax],     xmm7
 
-	  
+
 	  pop       esi
 	  ret
 
--- a/codec/decoder/core/asm/cpuid.asm
+++ b/codec/decoder/core/asm/cpuid.asm
@@ -84,12 +84,12 @@
 ;   void WelsCPUId( int32_t index, int32_t *uiFeatureA, int32_t *uiFeatureB, int32_t *uiFeatureC, int32_t *uiFeatureD )
 ;****************************************************************************************************
 WelsCPUId:
-	push	ebx	
+	push	ebx
 	push	edi
-	
+
 	mov     eax, [esp+12]	; operating index
     cpuid					; cpuid
-	
+
 	; processing various information return
 	mov     edi, [esp+16]
     mov     [edi], eax
@@ -100,10 +100,10 @@
     mov     edi, [esp+28]
     mov     [edi], edx
 
-	pop		edi	
+	pop		edi
     pop     ebx
 	ret
-	
+
 WELS_EXTERN WelsCPUSupportAVX
 ; need call after cpuid=1 and eax, ecx flag got then
 ALIGN 16
@@ -139,7 +139,7 @@
 WelsCPUSupportFMA:
 	mov eax, [esp+4]
 	mov ecx, [esp+8]
-	
+
 	; refer to detection of FMA addressed in INTEL AVX manual document
 	and ecx, 018001000H
 	cmp ecx, 018001000H		; check OSXSAVE, AVX, FMA feature flags
@@ -153,7 +153,7 @@
 	mov eax, 1
 	ret
 fma_not_supported:
-	mov eax, 0	
+	mov eax, 0
 	ret
 
 WELS_EXTERN WelsEmms
--- a/codec/decoder/core/asm/dct.asm
+++ b/codec/decoder/core/asm/dct.asm
@@ -1,129 +1,129 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        ?Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        ?Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  dct.asm
-;*
-;*  Abstract
-;*      WelsDctFourT4_sse2
-;*
-;*  History
-;*      8/4/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-BITS 32
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-%macro MMX_SumSubDiv2 3
-    movq    %3, %2
-    psraw   %3, $1
-    paddw   %3, %1
-    psraw   %1, $1
-    psubw   %1, %2
-%endmacro
-
-%macro MMX_SumSub 3
-	movq    %3, %2
-    psubw   %2, %1
-    paddw   %1, %3
-%endmacro
-
-%macro MMX_IDCT 6
-    MMX_SumSub      %4, %5, %6
-    MMX_SumSubDiv2  %3, %2, %1
-    MMX_SumSub		%1, %4, %6
-	MMX_SumSub		%3, %5, %6
-%endmacro
-
-
-%macro MMX_StoreDiff4P 5
-    movd       %2, %5
-    punpcklbw  %2, %4
-    paddw      %1, %3
-    psraw      %1, $6
-    paddsw     %1, %2
-    packuswb   %1, %2
-    movd       %5, %1
-%endmacro
-
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-
-WELS_EXTERN IdctResAddPred_mmx
-
-ALIGN 16
-;*******************************************************************************
-;   void_t __cdecl IdctResAddPred_mmx( uint8_t *pPred, const int32_t kiStride, int16_t *pRs )
-;*******************************************************************************
-
-IdctResAddPred_mmx:
-
-%define	pushsize	0
-%define pPred       esp+pushsize+4
-%define kiStride     esp+pushsize+8
-%define pRs         esp+pushsize+12
-
-	mov     eax, [pRs   ] 
-    mov     edx, [pPred ]   
-    mov     ecx, [kiStride]   
-    movq    mm0, [eax+ 0]
-    movq    mm1, [eax+ 8]
-    movq    mm2, [eax+16]
-    movq    mm3, [eax+24]
-
-	MMX_Trans4x4W        mm0, mm1, mm2, mm3, mm4
-	MMX_IDCT			mm1, mm2, mm3, mm4, mm0, mm6
-    MMX_Trans4x4W        mm1, mm3, mm0, mm4, mm2
-	MMX_IDCT			mm3, mm0, mm4, mm2, mm1, mm6
-
-    WELS_Zero			mm7
-    WELS_DW32			mm6
-    
-    MMX_StoreDiff4P    mm3, mm0, mm6, mm7, [edx]
-    MMX_StoreDiff4P    mm4, mm0, mm6, mm7, [edx+ecx]
-    lea     edx, [edx+2*ecx]
-    MMX_StoreDiff4P    mm1, mm0, mm6, mm7, [edx]
-    MMX_StoreDiff4P    mm2, mm0, mm6, mm7, [edx+ecx]
-    
-%undef	pushsize
-%undef  pPred
-%undef  kiStride
-%undef  pRs
-	emms
-    ret
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  dct.asm
+;*
+;*  Abstract
+;*      IdctResAddPred_mmx
+;*
+;*  History
+;*      8/4/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+BITS 32
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+%macro MMX_SumSubDiv2 3
+    movq    %3, %2
+    psraw   %3, $1
+    paddw   %3, %1
+    psraw   %1, $1
+    psubw   %1, %2
+%endmacro
+
+%macro MMX_SumSub 3
+	movq    %3, %2
+    psubw   %2, %1
+    paddw   %1, %3
+%endmacro
+
+%macro MMX_IDCT 6
+    MMX_SumSub      %4, %5, %6
+    MMX_SumSubDiv2  %3, %2, %1
+    MMX_SumSub		%1, %4, %6
+	MMX_SumSub		%3, %5, %6
+%endmacro
+
+
+%macro MMX_StoreDiff4P 5
+    movd       %2, %5
+    punpcklbw  %2, %4
+    paddw      %1, %3
+    psraw      %1, $6
+    paddsw     %1, %2
+    packuswb   %1, %2
+    movd       %5, %1
+%endmacro
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+
+WELS_EXTERN IdctResAddPred_mmx
+
+ALIGN 16
+;*******************************************************************************
+;   void_t __cdecl IdctResAddPred_mmx( uint8_t *pPred, const int32_t kiStride, int16_t *pRs )
+;*******************************************************************************
+
+IdctResAddPred_mmx:
+
+%define	pushsize	0
+%define pPred       esp+pushsize+4
+%define kiStride     esp+pushsize+8
+%define pRs         esp+pushsize+12
+
+	mov     eax, [pRs   ]
+    mov     edx, [pPred ]
+    mov     ecx, [kiStride]
+    movq    mm0, [eax+ 0]
+    movq    mm1, [eax+ 8]
+    movq    mm2, [eax+16]
+    movq    mm3, [eax+24]
+
+	MMX_Trans4x4W        mm0, mm1, mm2, mm3, mm4
+	MMX_IDCT			mm1, mm2, mm3, mm4, mm0, mm6
+    MMX_Trans4x4W        mm1, mm3, mm0, mm4, mm2
+	MMX_IDCT			mm3, mm0, mm4, mm2, mm1, mm6
+
+    WELS_Zero			mm7
+    WELS_DW32			mm6
+
+    MMX_StoreDiff4P    mm3, mm0, mm6, mm7, [edx]
+    MMX_StoreDiff4P    mm4, mm0, mm6, mm7, [edx+ecx]
+    lea     edx, [edx+2*ecx]
+    MMX_StoreDiff4P    mm1, mm0, mm6, mm7, [edx]
+    MMX_StoreDiff4P    mm2, mm0, mm6, mm7, [edx+ecx]
+
+%undef	pushsize
+%undef  pPred
+%undef  kiStride
+%undef  pRs
+	emms
+    ret
--- a/codec/decoder/core/asm/deblock.asm
+++ b/codec/decoder/core/asm/deblock.asm
@@ -1,2113 +1,2113 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  deblock.asm
-;*
-;*  Abstract
-;*      edge loop
-;*
-;*  History
-;*      08/07/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-%ifdef FORMAT_COFF
-SECTION .rodata pData
-%else
-SECTION .rodata align=16
-%endif
-
-SECTION .text
-
-;********************************************************************************
-;  void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-;                             int32_t iAlpha, int32_t iBeta)
-;********************************************************************************
-WELS_EXTERN   DeblockChromaEq4V_sse2
-
-ALIGN  16
-DeblockChromaEq4V_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h 
-  sub         esp,68h 
-  mov         edx,[ebp+10h]      ;  iStride
-  mov         eax,[ebp+8]        ;  pPixCb
-  mov         ecx,[ebp+0Ch]      ;  pPixCr
-  movq        xmm4,[ecx] 
-  movq        xmm5,[edx+ecx] 
-  push        esi  
-  push        edi  
-  lea         esi,[edx+edx] 
-  mov         edi,eax 
-  sub         edi,esi 
-  movq        xmm1,[edi] 
-  mov         edi,ecx 
-  sub         edi,esi 
-  movq        xmm2,[edi] 
-  punpcklqdq  xmm1,xmm2 
-  mov         esi,eax 
-  sub         esi,edx 
-  movq        xmm2,[esi] 
-  mov         edi,ecx 
-  sub         edi,edx 
-  movq        xmm3,[edi] 
-  punpcklqdq  xmm2,xmm3 
-  movq        xmm3,[eax] 
-  punpcklqdq  xmm3,xmm4 
-  movq        xmm4,[edx+eax] 
-  mov       edx, [ebp + 14h] 
-  punpcklqdq  xmm4,xmm5 
-  movd        xmm5,edx 
-  mov       edx, [ebp + 18h] 
-  pxor        xmm0,xmm0 
-  movdqa      xmm6,xmm5 
-  punpcklwd   xmm6,xmm5 
-  pshufd      xmm5,xmm6,0 
-  movd        xmm6,edx 
-  movdqa      xmm7,xmm6 
-  punpcklwd   xmm7,xmm6 
-  pshufd      xmm6,xmm7,0 
-  movdqa      xmm7,xmm1 
-  punpckhbw   xmm1,xmm0 
-  punpcklbw   xmm7,xmm0 
-  movdqa      [esp+40h],xmm1 
-  movdqa      [esp+60h],xmm7 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm7,xmm0 
-  movdqa      [esp+10h],xmm7 
-  movdqa      xmm7,xmm3 
-  punpcklbw   xmm7,xmm0 
-  punpckhbw   xmm3,xmm0 
-  movdqa      [esp+50h],xmm7 
-  movdqa      xmm7,xmm4 
-  punpckhbw   xmm4,xmm0 
-  punpckhbw   xmm2,xmm0 
-  punpcklbw   xmm7,xmm0 
-  movdqa      [esp+30h],xmm3 
-  movdqa      xmm3,[esp+10h] 
-  movdqa      xmm1,xmm3 
-  psubw       xmm1,[esp+50h] 
-  pabsw       xmm1,xmm1 
-  movdqa      [esp+20h],xmm4 
-  movdqa      xmm0,xmm5 
-  pcmpgtw     xmm0,xmm1 
-  movdqa      xmm1,[esp+60h] 
-  psubw       xmm1,xmm3 
-  pabsw       xmm1,xmm1 
-  movdqa      xmm4,xmm6 
-  pcmpgtw     xmm4,xmm1 
-  pand        xmm0,xmm4 
-  movdqa      xmm1,xmm7 
-  psubw       xmm1,[esp+50h] 
-  pabsw       xmm1,xmm1 
-  movdqa      xmm4,xmm6 
-  pcmpgtw     xmm4,xmm1 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,[esp+30h] 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm5,xmm1 
-  movdqa      xmm1,[esp+40h] 
-  pand        xmm0,xmm4 
-  psubw       xmm1,xmm2 
-  pabsw       xmm1,xmm1 
-  movdqa      xmm4,xmm6 
-  pcmpgtw     xmm4,xmm1 
-  movdqa      xmm1,[esp+20h] 
-  psubw       xmm1,[esp+30h] 
-  pand        xmm5,xmm4 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm6,xmm1 
-  pand        xmm5,xmm6 
-  mov         edx,2 
-  movsx       edx,dx 
-  movd        xmm1,edx 
-  movdqa      xmm4,xmm1 
-  punpcklwd   xmm4,xmm1 
-  pshufd      xmm1,xmm4,0 
-  movdqa      xmm4,[esp+60h] 
-  movdqa      xmm6,xmm4 
-  paddw       xmm6,xmm4 
-  paddw       xmm6,xmm3 
-  paddw       xmm6,xmm7 
-  movdqa      [esp+10h],xmm1 
-  paddw       xmm6,[esp+10h] 
-  psraw       xmm6,2 
-  movdqa      xmm4,xmm0 
-  pandn       xmm4,xmm3 
-  movdqa      xmm3,[esp+40h] 
-  movdqa      xmm1,xmm0 
-  pand        xmm1,xmm6 
-  por         xmm1,xmm4 
-  movdqa      xmm6,xmm3 
-  paddw       xmm6,xmm3 
-  movdqa      xmm3,[esp+10h] 
-  paddw       xmm6,xmm2 
-  paddw       xmm6,[esp+20h] 
-  paddw       xmm6,xmm3 
-  psraw       xmm6,2 
-  movdqa      xmm4,xmm5 
-  pand        xmm4,xmm6 
-  movdqa      xmm6,xmm5 
-  pandn       xmm6,xmm2 
-  por         xmm4,xmm6 
-  packuswb    xmm1,xmm4 
-  movdqa      xmm4,[esp+50h] 
-  movdqa      xmm6,xmm7 
-  paddw       xmm6,xmm7 
-  paddw       xmm6,xmm4 
-  paddw       xmm6,[esp+60h] 
-  paddw       xmm6,xmm3 
-  psraw       xmm6,2 
-  movdqa      xmm2,xmm0 
-  pand        xmm2,xmm6 
-  pandn       xmm0,xmm4 
-  por         xmm2,xmm0 
-  movdqa      xmm0,[esp+20h] 
-  movdqa      xmm6,xmm0 
-  paddw       xmm6,xmm0 
-  movdqa      xmm0,[esp+30h] 
-  paddw       xmm6,xmm0 
-  paddw       xmm6,[esp+40h] 
-  movdqa      xmm4,xmm5 
-  paddw       xmm6,xmm3 
-  movq        [esi],xmm1 
-  psraw       xmm6,2 
-  pand        xmm4,xmm6 
-  pandn       xmm5,xmm0 
-  por         xmm4,xmm5 
-  packuswb    xmm2,xmm4 
-  movq        [eax],xmm2 
-  psrldq      xmm1,8 
-  movq        [edi],xmm1 
-  pop         edi  
-  psrldq      xmm2,8 
-  movq        [ecx],xmm2 
-  pop         esi  
-  mov         esp,ebp 
-  pop         ebp  
-  ret              
-
-;******************************************************************************
-; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
-;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-
-WELS_EXTERN  DeblockChromaLt4V_sse2
-
-DeblockChromaLt4V_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h 
-  sub         esp,0E4h 
-  push        ebx  
-  push        esi  
-  mov         esi, [ebp+1Ch]      ;  pTC
-  movsx       ebx, byte [esi+2] 
-  push        edi  
-  movsx       di,byte [esi+3] 
-  mov         word [esp+0Ch],bx 
-  movsx       bx,byte  [esi+1] 
-  movsx       esi,byte  [esi] 
-  mov         word  [esp+0Eh],si 
-  movzx       esi,di 
-  movd        xmm1,esi 
-  movzx       esi,di 
-  movd        xmm2,esi 
-  mov         si,word  [esp+0Ch] 
-  mov         edx, [ebp + 10h] 
-  mov         eax, [ebp + 08h] 
-  movzx       edi,si 
-  movzx       esi,si 
-  mov         ecx, [ebp + 0Ch] 
-  movd        xmm4,esi 
-  movzx       esi,bx 
-  movd        xmm5,esi 
-  movd        xmm3,edi 
-  movzx       esi,bx 
-  movd        xmm6,esi 
-  mov         si,word [esp+0Eh] 
-  movzx       edi,si 
-  movzx       esi,si 
-  punpcklwd   xmm6,xmm2 
-  pxor        xmm0,xmm0 
-  movdqa      [esp+40h],xmm0 
-  movd        xmm7,edi 
-  movd        xmm0,esi 
-  lea         esi,[edx+edx] 
-  mov         edi,eax 
-  sub         edi,esi 
-  punpcklwd   xmm5,xmm1 
-  movdqa      xmm1,[esp+40h] 
-  punpcklwd   xmm0,xmm4 
-  movq        xmm4,[edx+ecx] 
-  punpcklwd   xmm7,xmm3 
-  movq        xmm3,[eax] 
-  punpcklwd   xmm0,xmm6 
-  movq        xmm6,[edi] 
-  punpcklwd   xmm7,xmm5 
-  punpcklwd   xmm0,xmm7 
-  mov         edi,ecx 
-  sub         edi,esi 
-  movdqa      xmm2,xmm1 
-  psubw       xmm2,xmm0 
-  movdqa      [esp+60h],xmm2 
-  movq        xmm2, [edi] 
-  punpcklqdq  xmm6,xmm2 
-  mov         esi,eax 
-  sub         esi,edx 
-  movq        xmm7,[esi] 
-  mov         edi,ecx 
-  sub         edi,edx 
-  movq        xmm2,[edi] 
-  punpcklqdq  xmm7,xmm2 
-  movq        xmm2,[ecx] 
-  punpcklqdq  xmm3,xmm2 
-  movq        xmm2,[edx+eax] 
-  movsx       edx,word [ebp + 14h] 
-  punpcklqdq  xmm2,xmm4 
-  movdqa      [esp+0E0h],xmm2 
-  movd        xmm2,edx 
-  movsx       edx,word [ebp + 18h] 
-  movdqa      xmm4,xmm2 
-  punpcklwd   xmm4,xmm2 
-  movd        xmm2,edx 
-  movdqa      xmm5,xmm2 
-  punpcklwd   xmm5,xmm2 
-  pshufd      xmm2,xmm5,0 
-  movdqa      [esp+50h],xmm2 
-  movdqa      xmm2,xmm6 
-  punpcklbw   xmm2,xmm1 
-  movdqa      [esp+0D0h],xmm3 
-  pshufd      xmm4,xmm4,0 
-  movdqa      [esp+30h],xmm2 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+80h],xmm6 
-  movdqa      xmm6,[esp+0D0h] 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+70h],xmm6 
-  movdqa      xmm6, [esp+0E0h] 
-  punpckhbw   xmm6,xmm1 
-  movdqa     [esp+90h],xmm6 
-  movdqa      xmm5, [esp+0E0h] 
-  movdqa      xmm2,xmm7 
-  punpckhbw   xmm7,xmm1 
-  punpcklbw   xmm5,xmm1 
-  movdqa       [esp+0A0h],xmm7 
-  punpcklbw   xmm3,xmm1 
-  mov         edx,4 
-  punpcklbw   xmm2,xmm1 
-  movsx       edx,dx 
-  movd        xmm6,edx 
-  movdqa      xmm7,xmm6 
-  punpcklwd   xmm7,xmm6 
-  pshufd      xmm6,xmm7,0 
-  movdqa      xmm7,[esp+30h] 
-  movdqa      [esp+20h],xmm6 
-  psubw       xmm7,xmm5 
-  movdqa      xmm6,xmm0 
-  pcmpgtw     xmm6,xmm1 
-  movdqa      xmm1,[esp+60h] 
-  movdqa      [esp+40h],xmm6 
-  movdqa      xmm6,xmm3 
-  psubw       xmm6,xmm2 
-  psllw       xmm6,2 
-  paddw       xmm6,xmm7 
-  paddw       xmm6, [esp+20h] 
-  movdqa      xmm7, [esp+50h] 
-  psraw       xmm6,3 
-  pmaxsw      xmm1,xmm6 
-  movdqa      [esp+10h],xmm0 
-  movdqa      xmm6, [esp+10h] 
-  pminsw      xmm6,xmm1 
-  movdqa      [esp+10h],xmm6 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,xmm3 
-  pabsw       xmm1,xmm1 
-  movdqa      xmm6,xmm4 
-  pcmpgtw     xmm6,xmm1 
-  movdqa      xmm1, [esp+30h] 
-  psubw       xmm1,xmm2 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm7,xmm1 
-  movdqa      xmm1,[esp+50h] 
-  pand        xmm6,xmm7 
-  movdqa      xmm7,[esp+50h] 
-  psubw       xmm5,xmm3 
-  pabsw       xmm5,xmm5 
-  pcmpgtw     xmm1,xmm5 
-  movdqa      xmm5,[esp+80h] 
-  psubw       xmm5,[esp+90h] 
-  pand        xmm6,xmm1 
-  pand        xmm6,[esp+40h] 
-  movdqa      xmm1,[esp+10h] 
-  pand        xmm1,xmm6 
-  movdqa      xmm6,[esp+70h] 
-  movdqa      [esp+30h],xmm1 
-  movdqa      xmm1,[esp+0A0h] 
-  psubw       xmm6,xmm1 
-  psllw       xmm6,2 
-  paddw       xmm6,xmm5 
-  paddw       xmm6,[esp+20h] 
-  movdqa      xmm5,[esp+60h] 
-  psraw       xmm6,3 
-  pmaxsw      xmm5,xmm6 
-  pminsw      xmm0,xmm5 
-  movdqa      xmm5,[esp+70h] 
-  movdqa      xmm6,xmm1 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm4,xmm6 
-  movdqa      xmm6,[esp+80h] 
-  psubw       xmm6,xmm1 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6,[esp+90h] 
-  pand        xmm4,xmm7 
-  movdqa      xmm7,[esp+50h] 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6 
-  pand        xmm4,xmm7 
-  pand        xmm4,[esp+40h] 
-  pand        xmm0,xmm4 
-  movdqa      xmm4,[esp+30h] 
-  paddw       xmm2,xmm4 
-  paddw       xmm1,xmm0 
-  packuswb    xmm2,xmm1 
-  movq        [esi],xmm2 
-  psubw       xmm3,xmm4 
-  psubw       xmm5,xmm0 
-  packuswb    xmm3,xmm5 
-  movq        [eax],xmm3 
-  psrldq      xmm2,8 
-  movq        [edi],xmm2 
-  pop         edi  
-  pop         esi  
-  psrldq      xmm3,8 
-  movq        [ecx],xmm3 
-  pop         ebx  
-  mov         esp,ebp 
-  pop         ebp  
-  ret    
-  
-;***************************************************************************
-;  void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
-;          int32_t iAlpha, int32_t iBeta)
-;***************************************************************************
-
-WELS_EXTERN     DeblockChromaEq4H_sse2
-
-ALIGN  16
-  
-DeblockChromaEq4H_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h 
-  sub         esp,0C8h  
-  mov         ecx,dword [ebp+8] 
-  mov         edx,dword [ebp+0Ch] 
-  mov         eax,dword [ebp+10h] 
-  sub         ecx,2 
-  sub         edx,2 
-  push        esi  
-  lea         esi,[eax+eax*2] 
-  mov         dword [esp+18h],ecx 
-  mov         dword [esp+4],edx 
-  lea         ecx,[ecx+eax*4] 
-  lea         edx,[edx+eax*4] 
-  lea         eax,[esp+7Ch] 
-  push        edi  
-  mov         dword [esp+14h],esi 
-  mov         dword [esp+18h],ecx 
-  mov         dword [esp+0Ch],edx 
-  mov         dword [esp+10h],eax 
-  mov         esi,dword [esp+1Ch] 
-  mov         ecx,dword [ebp+10h] 
-  mov         edx,dword [esp+14h] 
-  movd        xmm0,dword [esi] 
-  movd        xmm1,dword [esi+ecx] 
-  movd        xmm2,dword [esi+ecx*2] 
-  movd        xmm3,dword [esi+edx] 
-  mov         esi,dword  [esp+8] 
-  movd        xmm4,dword [esi] 
-  movd        xmm5,dword [esi+ecx] 
-  movd        xmm6,dword [esi+ecx*2] 
-  movd        xmm7,dword [esi+edx] 
-  punpckldq   xmm0,xmm4 
-  punpckldq   xmm1,xmm5 
-  punpckldq   xmm2,xmm6 
-  punpckldq   xmm3,xmm7 
-  mov         esi,dword [esp+18h] 
-  mov         edi,dword [esp+0Ch] 
-  movd        xmm4,dword [esi] 
-  movd        xmm5,dword [edi] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm0,xmm4 
-  movd        xmm4,dword [esi+ecx] 
-  movd        xmm5,dword [edi+ecx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm1,xmm4 
-  movd        xmm4,dword [esi+ecx*2] 
-  movd        xmm5,dword [edi+ecx*2] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm2,xmm4 
-  movd        xmm4,dword [esi+edx] 
-  movd        xmm5,dword [edi+edx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm3,xmm4 
-  movdqa      xmm6,xmm0 
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm5,xmm4 
-  punpcklqdq  xmm1,xmm2 
-  punpckhqdq  xmm6,xmm2 
-  mov         edi,dword [esp+10h] 
-  movdqa      [edi],xmm0 
-  movdqa      [edi+10h],xmm5 
-  movdqa      [edi+20h],xmm1 
-  movdqa      [edi+30h],xmm6 
-  movsx       ecx,word [ebp+14h] 
-  movsx       edx,word [ebp+18h] 
-  movdqa      xmm6,[esp+80h] 
-  movdqa      xmm4,[esp+90h] 
-  movdqa      xmm5,[esp+0A0h] 
-  movdqa      xmm7,[esp+0B0h] 
-  pxor        xmm0,xmm0 
-  movd        xmm1,ecx 
-  movdqa      xmm2,xmm1 
-  punpcklwd   xmm2,xmm1 
-  pshufd      xmm1,xmm2,0 
-  movd        xmm2,edx 
-  movdqa      xmm3,xmm2 
-  punpcklwd   xmm3,xmm2 
-  pshufd      xmm2,xmm3,0 
-  movdqa      xmm3,xmm6 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+60h],xmm6 
-  movdqa      xmm6,[esp+90h] 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+30h],xmm6 
-  movdqa      xmm6,[esp+0A0h] 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+40h],xmm6 
-  movdqa      xmm6,[esp+0B0h] 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+70h],xmm6 
-  punpcklbw   xmm7,xmm0 
-  punpcklbw   xmm4,xmm0 
-  punpcklbw   xmm5,xmm0 
-  punpcklbw   xmm3,xmm0 
-  movdqa      [esp+50h],xmm7 
-  movdqa      xmm6,xmm4 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  movdqa      xmm0,xmm1 
-  pcmpgtw     xmm0,xmm6 
-  movdqa      xmm6,xmm3 
-  psubw       xmm6,xmm4 
-  pabsw       xmm6,xmm6 
-  movdqa      xmm7,xmm2 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6,[esp+50h] 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pand        xmm0,xmm7 
-  movdqa      xmm7,xmm2 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6,[esp+30h] 
-  psubw       xmm6,[esp+40h] 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm1,xmm6 
-  movdqa      xmm6,[esp+60h] 
-  psubw       xmm6,[esp+30h] 
-  pabsw       xmm6,xmm6 
-  pand        xmm0,xmm7 
-  movdqa      xmm7,xmm2 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6,[esp+70h] 
-  psubw       xmm6,[esp+40h] 
-  pabsw       xmm6,xmm6 
-  pand        xmm1,xmm7 
-  pcmpgtw     xmm2,xmm6 
-  pand        xmm1,xmm2 
-  mov         eax,2 
-  movsx       ecx,ax 
-  movd        xmm2,ecx 
-  movdqa      xmm6,xmm2 
-  punpcklwd   xmm6,xmm2 
-  pshufd      xmm2,xmm6,0 
-  movdqa      [esp+20h],xmm2 
-  movdqa      xmm2,xmm3 
-  paddw       xmm2,xmm3 
-  paddw       xmm2,xmm4 
-  paddw       xmm2,[esp+50h] 
-  paddw       xmm2,[esp+20h] 
-  psraw       xmm2,2 
-  movdqa      xmm6,xmm0 
-  pand        xmm6,xmm2 
-  movdqa      xmm2,xmm0 
-  pandn       xmm2,xmm4 
-  por         xmm6,xmm2 
-  movdqa      xmm2,[esp+60h] 
-  movdqa      xmm7,xmm2 
-  paddw       xmm7,xmm2 
-  paddw       xmm7,[esp+30h] 
-  paddw       xmm7,[esp+70h] 
-  paddw       xmm7,[esp+20h] 
-  movdqa      xmm4,xmm1 
-  movdqa      xmm2,xmm1 
-  pandn       xmm2,[esp+30h] 
-  psraw       xmm7,2 
-  pand        xmm4,xmm7 
-  por         xmm4,xmm2 
-  movdqa      xmm2,[esp+50h] 
-  packuswb    xmm6,xmm4 
-  movdqa      [esp+90h],xmm6 
-  movdqa      xmm6,xmm2 
-  paddw       xmm6,xmm2 
-  movdqa      xmm2,[esp+20h] 
-  paddw       xmm6,xmm5 
-  paddw       xmm6,xmm3 
-  movdqa      xmm4,xmm0 
-  pandn       xmm0,xmm5 
-  paddw       xmm6,xmm2 
-  psraw       xmm6,2 
-  pand        xmm4,xmm6 
-  por         xmm4,xmm0 
-  movdqa      xmm0,[esp+70h] 
-  movdqa      xmm5,xmm0 
-  paddw       xmm5,xmm0 
-  movdqa      xmm0,[esp+40h] 
-  paddw       xmm5,xmm0 
-  paddw       xmm5,[esp+60h] 
-  movdqa      xmm3,xmm1 
-  paddw       xmm5,xmm2 
-  psraw       xmm5,2 
-  pand        xmm3,xmm5 
-  pandn       xmm1,xmm0 
-  por         xmm3,xmm1 
-  packuswb    xmm4,xmm3 
-  movdqa      [esp+0A0h],xmm4 
-  mov         esi,dword [esp+10h] 
-  movdqa      xmm0,[esi] 
-  movdqa      xmm1,[esi+10h] 
-  movdqa      xmm2,[esi+20h] 
-  movdqa      xmm3,[esi+30h] 
-  movdqa      xmm6,xmm0 
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm5,xmm4 
-  punpcklqdq  xmm1,xmm2 
-  punpckhqdq  xmm6,xmm2 
-  mov         esi,dword [esp+1Ch] 
-  mov         ecx,dword [ebp+10h] 
-  mov         edx,dword [esp+14h] 
-  mov         edi,dword [esp+8] 
-  movd        dword [esi],xmm0 
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         esi,dword [esp+18h] 
-  movd        dword [edi],xmm0 
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  movd        dword [esi],xmm0 
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         edi,dword [esp+0Ch] 
-  movd        dword [edi],xmm0 
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6 
-  pop         edi  
-  pop         esi  
-  mov         esp,ebp 
-  pop         ebp  
-  ret              
-  
-;*******************************************************************************
-;    void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
-;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-  
-WELS_EXTERN  DeblockChromaLt4H_sse2
-  
-ALIGN  16
-
-DeblockChromaLt4H_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h 
-  sub         esp,108h   
-  mov         ecx,dword [ebp+8] 
-  mov         edx,dword [ebp+0Ch] 
-  mov         eax,dword [ebp+10h] 
-  sub         ecx,2 
-  sub         edx,2 
-  push        esi  
-  lea         esi,[eax+eax*2] 
-  mov         dword [esp+10h],ecx 
-  mov         dword [esp+4],edx 
-  lea         ecx,[ecx+eax*4] 
-  lea         edx,[edx+eax*4] 
-  lea         eax,[esp+6Ch] 
-  push        edi  
-  mov         dword [esp+0Ch],esi 
-  mov         dword [esp+18h],ecx 
-  mov         dword [esp+10h],edx 
-  mov         dword [esp+1Ch],eax 
-  mov         esi,dword [esp+14h] 
-  mov         ecx,dword [ebp+10h] 
-  mov         edx,dword [esp+0Ch] 
-  movd        xmm0,dword [esi] 
-  movd        xmm1,dword [esi+ecx] 
-  movd        xmm2,dword [esi+ecx*2] 
-  movd        xmm3,dword [esi+edx] 
-  mov         esi,dword [esp+8] 
-  movd        xmm4,dword [esi] 
-  movd        xmm5,dword [esi+ecx] 
-  movd        xmm6,dword [esi+ecx*2] 
-  movd        xmm7,dword [esi+edx] 
-  punpckldq   xmm0,xmm4 
-  punpckldq   xmm1,xmm5 
-  punpckldq   xmm2,xmm6 
-  punpckldq   xmm3,xmm7 
-  mov         esi,dword [esp+18h] 
-  mov         edi,dword [esp+10h] 
-  movd        xmm4,dword [esi] 
-  movd        xmm5,dword [edi] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm0,xmm4 
-  movd        xmm4,dword [esi+ecx] 
-  movd        xmm5,dword [edi+ecx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm1,xmm4 
-  movd        xmm4,dword [esi+ecx*2] 
-  movd        xmm5,dword [edi+ecx*2] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm2,xmm4 
-  movd        xmm4,dword [esi+edx] 
-  movd        xmm5,dword [edi+edx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm3,xmm4 
-  movdqa      xmm6,xmm0 
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm5,xmm4 
-  punpcklqdq  xmm1,xmm2 
-  punpckhqdq  xmm6,xmm2 
-  mov         edi,dword [esp+1Ch] 
-  movdqa      [edi],xmm0 
-  movdqa      [edi+10h],xmm5 
-  movdqa      [edi+20h],xmm1 
-  movdqa      [edi+30h],xmm6 
-  mov         eax,dword [ebp+1Ch] 
-  movsx       cx,byte [eax+3] 
-  movsx       dx,byte [eax+2] 
-  movsx       si,byte [eax+1] 
-  movsx       ax,byte [eax] 
-  movzx       edi,cx 
-  movzx       ecx,cx 
-  movd        xmm2,ecx 
-  movzx       ecx,dx 
-  movzx       edx,dx 
-  movd        xmm3,ecx 
-  movd        xmm4,edx 
-  movzx       ecx,si 
-  movzx       edx,si 
-  movd        xmm5,ecx 
-  pxor        xmm0,xmm0 
-  movd        xmm6,edx 
-  movzx       ecx,ax 
-  movdqa      [esp+60h],xmm0 
-  movzx       edx,ax 
-  movsx       eax,word [ebp+14h] 
-  punpcklwd   xmm6,xmm2 
-  movd        xmm1,edi 
-  movd        xmm7,ecx 
-  movsx       ecx,word [ebp+18h] 
-  movd        xmm0,edx 
-  punpcklwd   xmm7,xmm3 
-  punpcklwd   xmm5,xmm1 
-  movdqa      xmm1,[esp+60h] 
-  punpcklwd   xmm7,xmm5 
-  movdqa      xmm5,[esp+0A0h] 
-  punpcklwd   xmm0,xmm4 
-  punpcklwd   xmm0,xmm6 
-  movdqa      xmm6, [esp+70h] 
-  punpcklwd   xmm0,xmm7 
-  movdqa      xmm7,[esp+80h] 
-  movdqa      xmm2,xmm1 
-  psubw       xmm2,xmm0 
-  movdqa      [esp+0D0h],xmm2 
-  movd        xmm2,eax 
-  movdqa      xmm3,xmm2 
-  punpcklwd   xmm3,xmm2 
-  pshufd      xmm4,xmm3,0 
-  movd        xmm2,ecx 
-  movdqa      xmm3,xmm2 
-  punpcklwd   xmm3,xmm2 
-  pshufd      xmm2,xmm3,0 
-  movdqa      xmm3, [esp+90h] 
-  movdqa      [esp+50h],xmm2 
-  movdqa      xmm2,xmm6 
-  punpcklbw   xmm2,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+40h],xmm2 
-  movdqa      [esp+0B0h],xmm6 
-  movdqa      xmm6,[esp+90h] 
-  movdqa      xmm2,xmm7 
-  punpckhbw   xmm7,xmm1 
-  punpckhbw   xmm6,xmm1 
-  punpcklbw   xmm2,xmm1 
-  punpcklbw   xmm3,xmm1 
-  punpcklbw   xmm5,xmm1 
-  movdqa      [esp+0F0h],xmm7 
-  movdqa      [esp+0C0h],xmm6 
-  movdqa      xmm6, [esp+0A0h] 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+0E0h],xmm6 
-  mov         edx,4 
-  movsx       eax,dx 
-  movd        xmm6,eax 
-  movdqa      xmm7,xmm6 
-  punpcklwd   xmm7,xmm6 
-  pshufd      xmm6,xmm7,0 
-  movdqa      [esp+30h],xmm6 
-  movdqa      xmm7, [esp+40h] 
-  psubw       xmm7,xmm5 
-  movdqa      xmm6,xmm0 
-  pcmpgtw     xmm6,xmm1 
-  movdqa      [esp+60h],xmm6 
-  movdqa      xmm1, [esp+0D0h] 
-  movdqa      xmm6,xmm3 
-  psubw       xmm6,xmm2 
-  psllw       xmm6,2 
-  paddw       xmm6,xmm7 
-  paddw       xmm6,[esp+30h] 
-  psraw       xmm6,3 
-  pmaxsw      xmm1,xmm6 
-  movdqa      xmm7,[esp+50h] 
-  movdqa      [esp+20h],xmm0 
-  movdqa      xmm6, [esp+20h] 
-  pminsw      xmm6,xmm1 
-  movdqa      [esp+20h],xmm6 
-  movdqa      xmm6,xmm4 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,xmm3 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm6,xmm1 
-  movdqa      xmm1, [esp+40h] 
-  psubw       xmm1,xmm2 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm7,xmm1 
-  movdqa      xmm1, [esp+50h] 
-  pand        xmm6,xmm7 
-  movdqa      xmm7, [esp+50h] 
-  psubw       xmm5,xmm3 
-  pabsw       xmm5,xmm5 
-  pcmpgtw     xmm1,xmm5 
-  movdqa      xmm5, [esp+0B0h] 
-  psubw       xmm5,[esp+0E0h] 
-  pand        xmm6,xmm1 
-  pand        xmm6, [esp+60h] 
-  movdqa      xmm1, [esp+20h] 
-  pand        xmm1,xmm6 
-  movdqa      xmm6, [esp+0C0h] 
-  movdqa      [esp+40h],xmm1 
-  movdqa      xmm1, [esp+0F0h] 
-  psubw       xmm6,xmm1 
-  psllw       xmm6,2 
-  paddw       xmm6,xmm5 
-  paddw       xmm6, [esp+30h] 
-  movdqa      xmm5, [esp+0D0h] 
-  psraw       xmm6,3 
-  pmaxsw      xmm5,xmm6 
-  pminsw      xmm0,xmm5 
-  movdqa      xmm5,[esp+0C0h] 
-  movdqa      xmm6,xmm1 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm4,xmm6 
-  movdqa      xmm6,[esp+0B0h] 
-  psubw       xmm6,xmm1 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6, [esp+0E0h] 
-  pand        xmm4,xmm7 
-  movdqa      xmm7, [esp+50h] 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6 
-  pand        xmm4,xmm7 
-  pand        xmm4,[esp+60h] 
-  pand        xmm0,xmm4 
-  movdqa      xmm4, [esp+40h] 
-  paddw       xmm2,xmm4 
-  paddw       xmm1,xmm0 
-  psubw       xmm3,xmm4 
-  psubw       xmm5,xmm0 
-  packuswb    xmm2,xmm1 
-  packuswb    xmm3,xmm5 
-  movdqa      [esp+80h],xmm2 
-  movdqa      [esp+90h],xmm3 
-  mov         esi,dword [esp+1Ch] 
-  movdqa      xmm0, [esi] 
-  movdqa      xmm1, [esi+10h] 
-  movdqa      xmm2, [esi+20h] 
-  movdqa      xmm3, [esi+30h] 
-  movdqa      xmm6,xmm0 
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm5,xmm4 
-  punpcklqdq  xmm1,xmm2 
-  punpckhqdq  xmm6,xmm2 
-  mov         esi,dword [esp+14h] 
-  mov         ecx,dword [ebp+10h] 
-  mov         edx,dword [esp+0Ch] 
-  mov         edi,dword [esp+8] 
-  movd        dword [esi],xmm0 
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         esi,dword [esp+18h] 
-  movd        dword [edi],xmm0 
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  movd        dword [esi],xmm0 
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         edi,dword [esp+10h] 
-  movd        dword [edi],xmm0 
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6  
-  pop         edi  
-  pop         esi   
-  mov         esp,ebp 
-  pop         ebp  
-  ret     
-  
-  
-  
-;*******************************************************************************
-;    void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, 
-;                                 int32_t iBeta, int8_t * pTC)
-;*******************************************************************************
-  
-
-WELS_EXTERN  DeblockLumaLt4V_sse2
-  
-ALIGN  16
-
-DeblockLumaLt4V_sse2:
-    push	ebp
-	mov	ebp, esp
-	and	esp, -16				; fffffff0H
-	sub	esp, 420				; 000001a4H
-	mov	eax, dword [ebp+8]
-	mov	ecx, dword [ebp+12]
-
-	pxor	xmm0, xmm0
-	push	ebx
-	mov	edx, dword [ebp+24]
-	movdqa	[esp+424-384], xmm0
-	push	esi
-
-	lea	esi, [ecx+ecx*2]
-	push	edi
-	mov	edi, eax
-	sub	edi, esi
-	movdqa	xmm0, [edi]
-
-	lea	esi, [ecx+ecx]
-	movdqa	[esp+432-208], xmm0
-	mov	edi, eax
-	sub	edi, esi
-	movdqa	xmm0, [edi]
-	movdqa	[esp+448-208], xmm0
-
-	mov	ebx, eax
-	sub	ebx, ecx
-	movdqa	xmm0, [ebx]
-	movdqa	[esp+464-208], xmm0
-
-	movdqa	xmm0, [eax]
-
-	add	ecx, eax
-	movdqa	[esp+480-208], xmm0
-	movdqa	xmm0, [ecx]
-	mov	dword [esp+432-404], ecx
-
-	movsx	ecx, word [ebp+16]
-	movdqa	[esp+496-208], xmm0
-	movdqa	xmm0, [esi+eax]
-
-	movsx	si, byte [edx]
-	movdqa	[esp+512-208], xmm0
-	movd	xmm0, ecx
-	movsx	ecx, word [ebp+20]
-	movdqa	xmm1, xmm0
-	punpcklwd xmm1, xmm0
-	pshufd	xmm0, xmm1, 0
-	movdqa	[esp+432-112], xmm0
-	movd	xmm0, ecx
-	movsx	cx, byte [edx+1]
-	movdqa	xmm1, xmm0
-	punpcklwd xmm1, xmm0
-	mov	dword [esp+432-408], ebx
-	movzx	ebx, cx
-	pshufd	xmm0, xmm1, 0
-	movd	xmm1, ebx
-	movzx	ebx, cx
-	movd	xmm2, ebx
-	movzx	ebx, cx
-	movzx	ecx, cx
-	movd	xmm4, ecx
-	movzx	ecx, si
-	movd	xmm5, ecx
-	movzx	ecx, si
-	movd	xmm6, ecx
-	movzx	ecx, si
-	movd	xmm7, ecx
-	movzx	ecx, si
-	movdqa	[esp+432-336], xmm0
-	movd	xmm0, ecx
-
-	movsx	cx, byte [edx+3]
-	movsx	dx, byte [edx+2]
-	movd	xmm3, ebx
-	punpcklwd xmm0, xmm4
-	movzx	esi, cx
-	punpcklwd xmm6, xmm2
-	punpcklwd xmm5, xmm1
-	punpcklwd xmm0, xmm6
-	punpcklwd xmm7, xmm3
-	punpcklwd xmm7, xmm5
-	punpcklwd xmm0, xmm7
-	movdqa	[esp+432-400], xmm0
-	movd	xmm0, esi
-	movzx	esi, cx
-	movd	xmm2, esi
-	movzx	esi, cx
-	movzx	ecx, cx
-	movd	xmm4, ecx
-	movzx	ecx, dx
-	movd	xmm3, esi
-	movd	xmm5, ecx
-	punpcklwd xmm5, xmm0
-
-	movdqa	xmm0, [esp+432-384]
-	movzx	ecx, dx
-	movd	xmm6, ecx
-	movzx	ecx, dx
-	movzx	edx, dx
-	punpcklwd xmm6, xmm2
-	movd	xmm7, ecx
-	movd	xmm1, edx
-
-	movdqa	xmm2, [esp+448-208]
-	punpcklbw xmm2, xmm0
-
-	mov	ecx, 4
-	movsx	edx, cx
-	punpcklwd xmm7, xmm3
-	punpcklwd xmm7, xmm5
-	movdqa	xmm5, [esp+496-208]
-	movdqa	xmm3, [esp+464-208]
-	punpcklbw xmm5, xmm0
-	movdqa	[esp+432-240], xmm5
-	movdqa	xmm5, [esp+512-208]
-	punpcklbw xmm5, xmm0
-	movdqa	[esp+432-352], xmm5
-	punpcklwd xmm1, xmm4
-	movdqa	xmm4, [esp+432-208]
-	punpcklwd xmm1, xmm6
-	movdqa	xmm6, [esp+480-208]
-	punpcklwd xmm1, xmm7
-	punpcklbw xmm6, xmm0
-	punpcklbw xmm3, xmm0
-	punpcklbw xmm4, xmm0
-	movdqa	xmm7, xmm3
-	psubw	xmm7, xmm4
-	pabsw	xmm7, xmm7
-	movdqa	[esp+432-272], xmm4
-	movdqa	xmm4, [esp+432-336]
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-288], xmm5
-	movdqa	xmm7, xmm6
-	psubw	xmm7, [esp+432-352]
-	pabsw	xmm7, xmm7
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-256], xmm5
-	movdqa	xmm5, xmm3
-	pavgw	xmm5, xmm6
-	movdqa	[esp+432-304], xmm5
-	movdqa	xmm5, [esp+432-400]
-	psubw	xmm5, [esp+432-288]
-	psubw	xmm5, [esp+432-256]
-	movdqa	[esp+432-224], xmm5
-	movdqa	xmm5, xmm6
-	psubw	xmm5, xmm3
-	movdqa	[esp+432-32], xmm6
-	psubw	xmm6, [esp+432-240]
-	movdqa	xmm7, xmm5
-	movdqa	[esp+432-384], xmm5
-	movdqa	xmm5, [esp+432-112]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm5, xmm7
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm4
-	pcmpgtw	xmm7, xmm6
-
-	pand	xmm5, xmm7
-	movdqa	xmm6, xmm3
-	psubw	xmm6, xmm2
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm4
-	pcmpgtw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-400]
-	pand	xmm5, xmm7
-	movdqa	xmm7, xmm6
-	pcmpeqw	xmm6, xmm0
-	pcmpgtw	xmm7, xmm0
-	por	xmm7, xmm6
-	pand	xmm5, xmm7
-	movdqa	[esp+432-320], xmm5
-	movd	xmm5, edx
-	movdqa	xmm6, xmm5
-	punpcklwd xmm6, xmm5
-	pshufd	xmm5, xmm6, 0
-	movdqa	[esp+432-336], xmm5
-	movdqa	xmm5, [esp+432-224]
-	movdqa	[esp+432-368], xmm5
-	movdqa	xmm6, xmm0
-	psubw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-384]
-	psllw	xmm5, 2
-	movdqa	xmm7, xmm2
-	psubw	xmm7, [esp+432-240]
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+432-336]
-	movdqa	xmm5, [esp+432-368]
-	psraw	xmm7, 3
-	pmaxsw	xmm6, xmm7
-	pminsw	xmm5, xmm6
-
-	pand	xmm5, [esp+432-320]
-	movdqa	xmm6, [esp+432-400]
-	movdqa	[esp+432-64], xmm5
-	movdqa	[esp+432-384], xmm6
-	movdqa	xmm5, xmm0
-	psubw	xmm5, xmm6
-	movdqa	[esp+432-368], xmm5
-	movdqa	xmm6, xmm5
-	movdqa	xmm5, [esp+432-272]
-	paddw	xmm5, [esp+432-304]
-	movdqa	xmm7, xmm2
-	paddw	xmm7, xmm2
-	psubw	xmm5, xmm7
-	psraw	xmm5, 1
-	pmaxsw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-384]
-	pminsw	xmm5, xmm6
-
-	pand	xmm5, [esp+432-320]
-	pand	xmm5, [esp+432-288]
-	movdqa	xmm6, [esp+432-240]
-	movdqa	[esp+432-96], xmm5
-	movdqa	xmm5, [esp+432-352]
-	paddw	xmm5, [esp+432-304]
-	movdqa	xmm7, xmm6
-	paddw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-368]
-	psubw	xmm5, xmm7
-
-	movdqa	xmm7, [esp+496-208]
-	psraw	xmm5, 1
-	pmaxsw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-400]
-	pminsw	xmm5, xmm6
-	pand	xmm5, [esp+432-320]
-	pand	xmm5, [esp+432-256]
-	movdqa	xmm6, [esp+448-208]
-	punpckhbw xmm7, xmm0
-	movdqa	[esp+432-352], xmm7
-
-	movdqa	xmm7, [esp+512-208]
-	punpckhbw xmm6, xmm0
-	movdqa	[esp+432-48], xmm5
-	movdqa	xmm5, [esp+432-208]
-	movdqa	[esp+432-368], xmm6
-	movdqa	xmm6, [esp+464-208]
-	punpckhbw xmm7, xmm0
-	punpckhbw xmm5, xmm0
-	movdqa	[esp+432-384], xmm7
-	punpckhbw xmm6, xmm0
-	movdqa	[esp+432-400], xmm6
-
-	movdqa	xmm7, [esp+432-400]
-	movdqa	xmm6, [esp+480-208]
-	psubw	xmm7, xmm5
-	movdqa	[esp+432-16], xmm5
-	pabsw	xmm7, xmm7
-	punpckhbw xmm6, xmm0
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-288], xmm5
-
-	movdqa	xmm7, xmm6
-	psubw	xmm7, [esp+432-384]
-	pabsw	xmm7, xmm7
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-256], xmm5
-
-	movdqa	xmm5, [esp+432-400]
-	movdqa	[esp+432-80], xmm6
-	pavgw	xmm5, xmm6
-	movdqa	[esp+432-304], xmm5
-
-	movdqa	xmm5, xmm1
-	psubw	xmm5, [esp+432-288]
-	psubw	xmm5, [esp+432-256]
-	movdqa	[esp+432-224], xmm5
-	movdqa	xmm5, xmm6
-	psubw	xmm5, [esp+432-400]
-	psubw	xmm6, [esp+432-352]
-	movdqa	[esp+432-272], xmm5
-	movdqa	xmm7, xmm5
-	movdqa	xmm5, [esp+432-112]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm5, xmm7
-	movdqa	xmm7, xmm4
-	pabsw	xmm6, xmm6
-	pcmpgtw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-368]
-
-	pand	xmm5, xmm7
-	movdqa	xmm7, [esp+432-400]
-	psubw	xmm7, xmm6
-	psubw	xmm6, [esp+432-352]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7
-	pand	xmm5, xmm4
-
-	paddw	xmm2, [esp+432-96]
-	movdqa	xmm4, xmm1
-	pcmpgtw	xmm4, xmm0
-	movdqa	xmm7, xmm1
-	pcmpeqw	xmm7, xmm0
-	por	xmm4, xmm7
-	pand	xmm5, xmm4
-	movdqa	xmm4, [esp+432-224]
-	movdqa	[esp+432-320], xmm5
-	movdqa	xmm5, [esp+432-272]
-	movdqa	xmm7, xmm0
-	psubw	xmm7, xmm4
-	psubw	xmm0, xmm1
-	psllw	xmm5, 2
-	paddw	xmm6, xmm5
-	paddw	xmm6, [esp+432-336]
-	movdqa	xmm5, [esp+432-368]
-	movdqa	[esp+432-336], xmm0
-	psraw	xmm6, 3
-	pmaxsw	xmm7, xmm6
-	pminsw	xmm4, xmm7
-	pand	xmm4, [esp+432-320]
-	movdqa	xmm6, xmm0
-	movdqa	xmm0, [esp+432-16]
-	paddw	xmm0, [esp+432-304]
-	movdqa	[esp+432-272], xmm4
-	movdqa	xmm4, [esp+432-368]
-	paddw	xmm4, xmm4
-	psubw	xmm0, xmm4
-
-	movdqa	xmm4, [esp+432-64]
-	psraw	xmm0, 1
-	pmaxsw	xmm6, xmm0
-	movdqa	xmm0, [esp+432-400]
-	movdqa	xmm7, xmm1
-	pminsw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-320]
-	pand	xmm7, xmm6
-	pand	xmm7, [esp+432-288]
-	paddw	xmm5, xmm7
-	packuswb xmm2, xmm5
-	movdqa	xmm5, [esp+432-272]
-	paddw	xmm0, xmm5
-	paddw	xmm3, xmm4
-	packuswb xmm3, xmm0
-
-	movdqa	xmm0, [esp+432-32]
-	psubw	xmm0, xmm4
-	movdqa	xmm4, [esp+432-80]
-	psubw	xmm4, xmm5
-
-	movdqa	xmm5, [esp+432-240]
-	paddw	xmm5, [esp+432-48]
-	packuswb xmm0, xmm4
-	movdqa	xmm4, [esp+432-384]
-	paddw	xmm4, [esp+432-304]
-	movdqa	[esp+480-208], xmm0
-	movdqa	xmm0, [esp+432-352]
-	movdqa	xmm7, xmm0
-	paddw	xmm0, xmm0
-
-	mov	ecx, dword [esp+432-408]
-
-	mov	edx, dword [esp+432-404]
-	psubw	xmm4, xmm0
-	movdqa	xmm0, [esp+432-336]
-	movdqa	[edi], xmm2
-	psraw	xmm4, 1
-	pmaxsw	xmm0, xmm4
-	pminsw	xmm1, xmm0
-	movdqa	xmm0, [esp+480-208]
-
-	pop	edi
-	pand	xmm1, xmm6
-	pand	xmm1, [esp+428-256]
-	movdqa	[ecx], xmm3
-	paddw	xmm7, xmm1
-	pop	esi
-	packuswb xmm5, xmm7
-	movdqa	[eax], xmm0
-	movdqa	[edx], xmm5
-	pop	ebx
-	mov	esp, ebp
-	pop	ebp
-	ret
-
-
-;*******************************************************************************
-;    void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, 
-;                                 int32_t iBeta)
-;*******************************************************************************
-
-WELS_EXTERN  DeblockLumaEq4V_sse2
-  
-ALIGN  16
-
-DeblockLumaEq4V_sse2:
-
-	push	ebp
-	mov	ebp, esp
-	and	esp, -16				; fffffff0H
-	sub	esp, 628				; 00000274H
-	mov	eax, dword [ebp+8]
-	mov	ecx, dword [ebp+12]
-	push	ebx
-	push	esi
-
-	lea	edx, [ecx*4]
-	pxor	xmm0, xmm0
-	movdqa	xmm2, xmm0
-
-	movdqa	xmm0, [ecx+eax]
-	mov	esi, eax
-	sub	esi, edx
-	movdqa	xmm3, [esi]
-	movdqa	xmm5, [eax]
-	push	edi
-	lea	edi, [ecx+ecx]
-	lea	ebx, [ecx+ecx*2]
-	mov	dword [esp+640-600], edi
-	mov	esi, eax
-	sub	esi, edi
-	movdqa	xmm1, [esi]
-	movdqa	 [esp+720-272], xmm0
-	mov	edi, eax
-	sub	edi, ecx
-	movdqa	xmm4, [edi]
-	add	ecx, eax
-	mov	dword [esp+640-596], ecx
-
-	mov	ecx, dword [esp+640-600]
-	movdqa	xmm0, [ecx+eax]
-	movdqa	 [esp+736-272], xmm0
-
-	movdqa	xmm0, [eax+ebx]
-	mov	edx, eax
-	sub	edx, ebx
-
-	movsx	ebx, word [ebp+16]
-	movdqa	xmm6, [edx]
-	add	ecx, eax
-	movdqa	 [esp+752-272], xmm0
-	movd	xmm0, ebx
-
-	movsx	ebx, word [ebp+20]
-	movdqa	xmm7, xmm0
-	punpcklwd xmm7, xmm0
-	pshufd	xmm0, xmm7, 0
-	movdqa	 [esp+640-320], xmm0
-	movd	xmm0, ebx
-	movdqa	xmm7, xmm0
-	punpcklwd xmm7, xmm0
-	pshufd	xmm0, xmm7, 0
-
-	movdqa	xmm7, [esp+736-272]
-	punpcklbw xmm7, xmm2
-	movdqa	 [esp+640-416], xmm7
-	movdqa	 [esp+640-512], xmm0
-	movdqa	xmm0, xmm1
-	movdqa	 [esp+672-272], xmm1
-	movdqa	xmm1, xmm4
-	movdqa	 [esp+704-272], xmm5
-	punpcklbw xmm5, xmm2
-	punpcklbw xmm1, xmm2
-
-	movdqa	xmm7, xmm5
-	psubw	xmm7, xmm1
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-560], xmm7
-	punpcklbw xmm0, xmm2
-	movdqa	 [esp+688-272], xmm4
-	movdqa	xmm4, [esp+720-272]
-	movdqa	 [esp+640-480], xmm0
-
-	movdqa	xmm7, xmm1
-	psubw	xmm7, xmm0
-
-	movdqa	xmm0, [esp+640-512]
-	pabsw	xmm7, xmm7
-	punpcklbw xmm4, xmm2
-	pcmpgtw	xmm0, xmm7
-	movdqa	 [esp+640-384], xmm4
-	movdqa	xmm7, xmm5
-	psubw	xmm7, xmm4
-	movdqa	xmm4, [esp+640-512]
-	movdqa	 [esp+656-272], xmm6
-	punpcklbw xmm6, xmm2
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-48], xmm2
-	movdqa	 [esp+640-368], xmm6
-	movdqa	 [esp+640-144], xmm1
-	movdqa	 [esp+640-400], xmm5
-	pcmpgtw	xmm4, xmm7
-	pand	xmm0, xmm4
-	movdqa	xmm4, [esp+640-320]
-	pcmpgtw	xmm4, [esp+640-560]
-	pand	xmm0, xmm4
-
-	mov	ebx, 2
-	movsx	ebx, bx
-	movd	xmm4, ebx
-	movdqa	xmm7, xmm4
-	punpcklwd xmm7, xmm4
-	movdqa	xmm4, [esp+640-320]
-	psraw	xmm4, 2
-	pshufd	xmm7, xmm7, 0
-	paddw	xmm4, xmm7
-	movdqa	 [esp+640-576], xmm4
-	pcmpgtw	xmm4, [esp+640-560]
-	movdqa	 [esp+640-560], xmm4
-
-	movdqa	xmm4, [esp+640-512]
-	movdqa	 [esp+640-624], xmm7
-	movdqa	xmm7, xmm1
-	psubw	xmm7, xmm6
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7
-
-	pand	xmm4, [esp+640-560]
-	movdqa	 [esp+640-544], xmm4
-	movdqa	xmm4, [esp+640-512]
-	movdqa	xmm7, xmm5
-	psubw	xmm7, [esp+640-416]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7
-
-	pand	xmm4, [esp+640-560]
-	movdqa	 [esp+640-560], xmm4
-
-	movdqa	xmm4, [esp+640-544]
-	pandn	xmm4, xmm6
-	movdqa	 [esp+640-16], xmm4
-	mov	ebx, 4
-	movsx	ebx, bx
-	movd	xmm4, ebx
-	movdqa	xmm7, xmm4
-	punpcklwd xmm7, xmm4
-	movdqa	xmm4, xmm3
-	punpcklbw xmm4, xmm2
-	psllw	xmm4, 1
-	paddw	xmm4, xmm6
-	paddw	xmm4, xmm6
-	paddw	xmm4, xmm6
-	paddw	xmm4, [esp+640-480]
-
-	movdqa	xmm6, [esp+640-560]
-	pshufd	xmm7, xmm7, 0
-	paddw	xmm4, xmm1
-	movdqa	 [esp+640-592], xmm7
-	paddw	xmm4, xmm5
-	paddw	xmm4, xmm7
-	movdqa	xmm7, [esp+640-416]
-	pandn	xmm6, xmm7
-	movdqa	 [esp+640-80], xmm6
-	movdqa	xmm6, [esp+752-272]
-	punpcklbw xmm6, xmm2
-	psllw	xmm6, 1
-	paddw	xmm6, xmm7
-	paddw	xmm6, xmm7
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-384]
-
-	movdqa	xmm7, [esp+640-480]
-	paddw	xmm6, xmm5
-	paddw	xmm6, xmm1
-	paddw	xmm6, [esp+640-592]
-	psraw	xmm6, 3
-	pand	xmm6, [esp+640-560]
-	movdqa	 [esp+640-112], xmm6
-	movdqa	xmm6, [esp+640-544]
-	pandn	xmm6, xmm7
-	movdqa	 [esp+640-336], xmm6
-	movdqa	xmm6, [esp+640-544]
-	movdqa	 [esp+640-528], xmm6
-	movdqa	xmm6, [esp+640-368]
-	paddw	xmm6, xmm7
-	movdqa	xmm7, xmm1
-	psraw	xmm4, 3
-	pand	xmm4, [esp+640-544]
-	paddw	xmm7, xmm5
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-624]
-	movdqa	xmm7, [esp+640-528]
-
-	paddw	xmm5, xmm1
-	psraw	xmm6, 2
-	pand	xmm7, xmm6
-
-	movdqa	xmm6, [esp+640-384]
-	movdqa	 [esp+640-64], xmm7
-	movdqa	xmm7, [esp+640-560]
-	pandn	xmm7, xmm6
-	movdqa	 [esp+640-304], xmm7
-	movdqa	xmm7, [esp+640-560]
-	movdqa	 [esp+640-528], xmm7
-	movdqa	xmm7, [esp+640-416]
-	paddw	xmm7, xmm6
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+640-624]
-	movdqa	xmm5, [esp+640-528]
-	psraw	xmm7, 2
-	pand	xmm5, xmm7
-	movdqa	 [esp+640-32], xmm5
-
-	movdqa	xmm5, [esp+640-544]
-	movdqa	 [esp+640-528], xmm5
-	movdqa	xmm5, [esp+640-480]
-	movdqa	xmm7, xmm5
-	paddw	xmm7, xmm5
-	movdqa	xmm5, xmm1
-	paddw	xmm5, xmm6
-	paddw	xmm6, [esp+640-592]
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+640-624]
-	movdqa	xmm5, [esp+640-528]
-	psraw	xmm7, 2
-	pandn	xmm5, xmm7
-	movdqa	xmm7, [esp+640-480]
-	paddw	xmm7, xmm1
-	paddw	xmm7, [esp+640-400]
-	movdqa	xmm1, [esp+640-544]
-	movdqa	 [esp+640-352], xmm5
-	movdqa	xmm5, [esp+640-368]
-	psllw	xmm7, 1
-	paddw	xmm7, xmm6
-	paddw	xmm5, xmm7
-
-	movdqa	xmm7, [esp+640-400]
-	psraw	xmm5, 3
-	pand	xmm1, xmm5
-	movdqa	xmm5, [esp+640-480]
-	movdqa	 [esp+640-96], xmm1
-	movdqa	xmm1, [esp+640-560]
-	movdqa	 [esp+640-528], xmm1
-	movdqa	xmm1, [esp+640-384]
-	movdqa	xmm6, xmm1
-	paddw	xmm6, xmm1
-	paddw	xmm1, [esp+640-400]
-	paddw	xmm1, [esp+640-144]
-	paddw	xmm7, xmm5
-	paddw	xmm5, [esp+640-592]
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-624]
-	movdqa	xmm7, [esp+640-528]
-	psraw	xmm6, 2
-	psllw	xmm1, 1
-	paddw	xmm1, xmm5
-
-	movdqa	xmm5, [esp+656-272]
-	pandn	xmm7, xmm6
-	movdqa	xmm6, [esp+640-416]
-	paddw	xmm6, xmm1
-	movdqa	xmm1, [esp+640-560]
-	psraw	xmm6, 3
-	pand	xmm1, xmm6
-
-	movdqa	xmm6, [esp+704-272]
-	movdqa	 [esp+640-128], xmm1
-	movdqa	xmm1, [esp+672-272]
-	punpckhbw xmm1, xmm2
-	movdqa	 [esp+640-448], xmm1
-	movdqa	xmm1, [esp+688-272]
-	punpckhbw xmm1, xmm2
-	punpckhbw xmm6, xmm2
-	movdqa	 [esp+640-288], xmm7
-	punpckhbw xmm5, xmm2
-	movdqa	 [esp+640-496], xmm1
-	movdqa	 [esp+640-432], xmm6
-
-	movdqa	xmm7, [esp+720-272]
-	punpckhbw xmm7, xmm2
-	movdqa	 [esp+640-464], xmm7
-
-	movdqa	xmm7, [esp+736-272]
-	punpckhbw xmm7, xmm2
-	movdqa	 [esp+640-528], xmm7
-
-	movdqa	xmm7, xmm6
-
-	psubw	xmm6, [esp+640-464]
-	psubw	xmm7, xmm1
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-560], xmm7
-	por	xmm4, [esp+640-16]
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm1
-	psubw	xmm7, [esp+640-448]
-
-	movdqa	xmm1, [esp+640-512]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm1, xmm7
-	movdqa	xmm7, [esp+640-512]
-	pcmpgtw	xmm7, xmm6
-	movdqa	xmm6, [esp+640-320]
-	pand	xmm1, xmm7
-	movdqa	xmm7, [esp+640-560]
-	pcmpgtw	xmm6, xmm7
-	pand	xmm1, xmm6
-
-	movdqa	xmm6, [esp+640-576]
-	pcmpgtw	xmm6, xmm7
-
-	movdqa	xmm7, [esp+640-496]
-	punpckhbw xmm3, xmm2
-	movdqa	 [esp+640-560], xmm6
-	movdqa	xmm6, [esp+640-512]
-	psubw	xmm7, xmm5
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm6, xmm7
-
-	pand	xmm6, [esp+640-560]
-	movdqa	xmm7, [esp+640-432]
-	psubw	xmm7, [esp+640-528]
-
-	psllw	xmm3, 1
-	movdqa	 [esp+640-544], xmm6
-	movdqa	xmm6, [esp+640-512]
-
-	movdqa	xmm2, [esp+640-544]
-	paddw	xmm3, xmm5
-	paddw	xmm3, xmm5
-	paddw	xmm3, xmm5
-	paddw	xmm3, [esp+640-448]
-	paddw	xmm3, [esp+640-496]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm6, xmm7
-	pand	xmm6, [esp+640-560]
-	movdqa	 [esp+640-560], xmm6
-
-	movdqa	xmm6, xmm0
-	pand	xmm6, xmm4
-	movdqa	xmm4, xmm0
-	pandn	xmm4, [esp+640-368]
-	por	xmm6, xmm4
-	movdqa	xmm4, [esp+640-432]
-	paddw	xmm3, xmm4
-	paddw	xmm3, [esp+640-592]
-	psraw	xmm3, 3
-	pand	xmm3, xmm2
-	pandn	xmm2, xmm5
-	por	xmm3, xmm2
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm3
-	movdqa	xmm3, [esp+640-64]
-	por	xmm3, [esp+640-336]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm5
-	por	xmm7, xmm2
-
-	movdqa	xmm2, xmm0
-	pand	xmm2, xmm3
-	movdqa	xmm3, xmm0
-	pandn	xmm3, [esp+640-480]
-	por	xmm2, xmm3
-	packuswb xmm6, xmm7
-	movdqa	 [esp+640-336], xmm2
-	movdqa	 [esp+656-272], xmm6
-	movdqa	xmm6, [esp+640-544]
-	movdqa	xmm2, xmm5
-	paddw	xmm2, [esp+640-448]
-	movdqa	xmm3, xmm1
-	movdqa	xmm7, [esp+640-496]
-	paddw	xmm7, xmm4
-	paddw	xmm2, xmm7
-	paddw	xmm2, [esp+640-624]
-	movdqa	xmm7, [esp+640-544]
-	psraw	xmm2, 2
-	pand	xmm6, xmm2
-	movdqa	xmm2, [esp+640-448]
-	pandn	xmm7, xmm2
-	por	xmm6, xmm7
-	pand	xmm3, xmm6
-	movdqa	xmm6, xmm1
-	pandn	xmm6, xmm2
-	paddw	xmm2, [esp+640-496]
-	paddw	xmm2, xmm4
-	por	xmm3, xmm6
-	movdqa	xmm6, [esp+640-336]
-	packuswb xmm6, xmm3
-	psllw	xmm2, 1
-	movdqa	 [esp+672-272], xmm6
-	movdqa	xmm6, [esp+640-96]
-	por	xmm6, [esp+640-352]
-
-	movdqa	xmm3, xmm0
-	pand	xmm3, xmm6
-	movdqa	xmm6, xmm0
-	pandn	xmm6, [esp+640-144]
-	por	xmm3, xmm6
-	movdqa	xmm6, [esp+640-544]
-	movdqa	 [esp+640-352], xmm3
-	movdqa	xmm3, [esp+640-464]
-	paddw	xmm3, [esp+640-592]
-	paddw	xmm2, xmm3
-	movdqa	xmm3, [esp+640-448]
-	paddw	xmm5, xmm2
-	movdqa	xmm2, [esp+640-496]
-	psraw	xmm5, 3
-	pand	xmm6, xmm5
-	movdqa	xmm5, [esp+640-464]
-	paddw	xmm2, xmm5
-	paddw	xmm5, [esp+640-432]
-	movdqa	xmm4, xmm3
-	paddw	xmm4, xmm3
-	paddw	xmm4, xmm2
-	paddw	xmm4, [esp+640-624]
-	movdqa	xmm2, [esp+640-544]
-	paddw	xmm3, [esp+640-592]
-	psraw	xmm4, 2
-	pandn	xmm2, xmm4
-	por	xmm6, xmm2
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm6
-	movdqa	xmm6, [esp+640-496]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm6
-	por	xmm7, xmm2
-	movdqa	xmm2, [esp+640-352]
-	packuswb xmm2, xmm7
-	movdqa	 [esp+688-272], xmm2
-	movdqa	xmm2, [esp+640-128]
-	por	xmm2, [esp+640-288]
-
-	movdqa	xmm4, xmm0
-	pand	xmm4, xmm2
-	paddw	xmm5, xmm6
-	movdqa	xmm2, xmm0
-	pandn	xmm2, [esp+640-400]
-	por	xmm4, xmm2
-	movdqa	xmm2, [esp+640-528]
-	psllw	xmm5, 1
-	paddw	xmm5, xmm3
-	movdqa	xmm3, [esp+640-560]
-	paddw	xmm2, xmm5
-	psraw	xmm2, 3
-	movdqa	 [esp+640-288], xmm4
-	movdqa	xmm4, [esp+640-560]
-	pand	xmm4, xmm2
-	movdqa	xmm2, [esp+640-464]
-	movdqa	xmm5, xmm2
-	paddw	xmm5, xmm2
-	movdqa	xmm2, [esp+640-432]
-	paddw	xmm2, [esp+640-448]
-	movdqa	xmm7, xmm1
-	paddw	xmm5, xmm2
-	paddw	xmm5, [esp+640-624]
-	movdqa	xmm6, [esp+640-560]
-	psraw	xmm5, 2
-	pandn	xmm3, xmm5
-	por	xmm4, xmm3
-	movdqa	xmm3, [esp+640-32]
-	por	xmm3, [esp+640-304]
-	pand	xmm7, xmm4
-	movdqa	xmm4, [esp+640-432]
-	movdqa	xmm5, [esp+640-464]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm4
-	paddw	xmm4, [esp+640-496]
-	por	xmm7, xmm2
-	movdqa	xmm2, [esp+640-288]
-	packuswb xmm2, xmm7
-	movdqa	 [esp+704-272], xmm2
-
-	movdqa	xmm2, xmm0
-	pand	xmm2, xmm3
-	movdqa	xmm3, xmm0
-	pandn	xmm3, [esp+640-384]
-	por	xmm2, xmm3
-	movdqa	 [esp+640-304], xmm2
-	movdqa	xmm2, [esp+640-528]
-	movdqa	xmm3, xmm2
-	paddw	xmm3, [esp+640-464]
-	paddw	xmm3, xmm4
-	paddw	xmm3, [esp+640-624]
-	psraw	xmm3, 2
-	pand	xmm6, xmm3
-	movdqa	xmm3, [esp+640-560]
-	movdqa	xmm4, xmm3
-	pandn	xmm4, xmm5
-	por	xmm6, xmm4
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm6
-	movdqa	xmm6, [esp+640-304]
-	movdqa	xmm4, xmm1
-	pandn	xmm4, xmm5
-	por	xmm7, xmm4
-
-	movdqa	xmm4, xmm0
-	pandn	xmm0, [esp+640-416]
-	packuswb xmm6, xmm7
-	movdqa	xmm7, [esp+640-112]
-	por	xmm7, [esp+640-80]
-	pand	xmm4, xmm7
-	por	xmm4, xmm0
-	movdqa	xmm0, [esp+752-272]
-	punpckhbw xmm0, [esp+640-48]
-	psllw	xmm0, 1
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm5
-	paddw	xmm0, [esp+640-432]
-	paddw	xmm0, [esp+640-496]
-	paddw	xmm0, [esp+640-592]
-	psraw	xmm0, 3
-	pand	xmm0, xmm3
-	movdqa	xmm7, xmm1
-	pandn	xmm3, xmm2
-	por	xmm0, xmm3
-	pand	xmm7, xmm0
-
-	movdqa	xmm0, [esp+656-272]
-	movdqa	 [edx], xmm0
-
-	movdqa	xmm0, [esp+672-272]
-
-	mov	edx, dword [esp+640-596]
-	movdqa	 [esi], xmm0
-	movdqa	xmm0, [esp+688-272]
-	movdqa	 [edi], xmm0
-	movdqa	xmm0, [esp+704-272]
-
-	pop	edi
-	pandn	xmm1, xmm2
-	movdqa	 [eax], xmm0
-	por	xmm7, xmm1
-	pop	esi
-	packuswb xmm4, xmm7
-	movdqa	 [edx], xmm6
-	movdqa	 [ecx], xmm4
-	pop	ebx
-	mov	esp, ebp
-	pop	ebp
-	ret
-  
-    
-;********************************************************************************
-;
-;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);     
-;
-;********************************************************************************
-
-WELS_EXTERN  DeblockLumaTransposeH2V_sse2   ; export the entry point (WELS_EXTERN is not defined in this chunk)
-
-ALIGN  16
-
-DeblockLumaTransposeH2V_sse2:   ; gathers 8 bytes from each of 16 rows of pPixY, transposes them, writes 8 x 16 bytes to pDst
-    push    ebp
-    push    ebx
-    mov     ebp,   esp   ; two registers were pushed first, so the first stack argument sits at [ebp+0Ch]
-    and     esp,0FFFFFFF0h   ; 16-byte align esp so [esp] can be used with movdqa
-    sub     esp,   10h    ; one 16-byte scratch slot at [esp]
-    
-    mov     eax,   [ebp + 0Ch]  ; pPixY (per the prototype comment): rows 0..7 are addressed from eax
-    mov     ecx,   [ebp + 10h]   ; iStride
-    lea     edx,   [eax + ecx * 8]   ; edx = pPixY + 8*iStride: rows 8..15
-    lea     ebx,   [ecx*3]   ; ebx = 3*iStride
-    
-    movq    xmm0,  [eax] 
-    movq    xmm7,  [edx]
-    punpcklqdq   xmm0,  xmm7  ; xmm0 = row 0 (low qword) | row 8 (high qword)
-    movq    xmm1,  [eax + ecx]
-    movq    xmm7,  [edx + ecx]
-    punpcklqdq   xmm1,  xmm7   ; xmm1 = row 1 | row 9
-    movq    xmm2,  [eax + ecx*2] 
-    movq    xmm7,  [edx + ecx*2]
-    punpcklqdq   xmm2,  xmm7   ; xmm2 = row 2 | row 10
-    movq    xmm3,  [eax + ebx]
-    movq    xmm7,  [edx + ebx]
-    punpcklqdq   xmm3,  xmm7   ; xmm3 = row 3 | row 11
-    
-    lea     eax,   [eax + ecx * 4]   ; advance both row pointers by 4 rows
-    lea     edx,   [edx + ecx * 4]
-    movq    xmm4,  [eax] 
-    movq    xmm7,  [edx]
-    punpcklqdq   xmm4,  xmm7  ; xmm4 = row 4 | row 12
-    movq    xmm5,  [eax + ecx]
-    movq    xmm7,  [edx + ecx]
-    punpcklqdq   xmm5,  xmm7   ; xmm5 = row 5 | row 13
-    movq    xmm6,  [eax + ecx*2] 
-    movq    xmm7,  [edx + ecx*2]
-    punpcklqdq   xmm6,  xmm7   ; xmm6 = row 6 | row 14
-    
-    movdqa  [esp],   xmm0   ; park xmm0: all eight xmm registers are live, one is needed as a temporary
-    movq    xmm7,  [eax + ebx]
-    movq    xmm0,  [edx + ebx]
-    punpcklqdq   xmm7,  xmm0   ; xmm7 = row 7 | row 15
-    movdqa  xmm0,   [esp]   ; restore xmm0
-    
-    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
-    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1  (1-based register numbers, i.e. xmm4, xmm2, xmm3, xmm7, xmm5, xmm1, xmm6, xmm0 -- the order the stores below use; the macro itself is defined outside this chunk)
-    
-    mov    eax,   [ebp + 14h]   ; pDst; written with movdqa, so it must be 16-byte aligned
-    movdqa  [eax],    xmm4 
-    movdqa  [eax + 10h],  xmm2
-    movdqa  [eax + 20h],  xmm3
-    movdqa  [eax + 30h],  xmm7
-    movdqa  [eax + 40h],  xmm5
-    movdqa  [eax + 50h],  xmm1
-    movdqa  [eax + 60h],  xmm6
-    movdqa  [eax + 70h],  xmm0   
-    
-    mov     esp,   ebp   ; drop the aligned scratch area
-    pop     ebx
-    pop     ebp
-    ret
-    
-    
-    
-;*******************************************************************************************
-;
-;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
-;
-;*******************************************************************************************
-
-WELS_EXTERN   DeblockLumaTransposeV2H_sse2   ; export the entry point (WELS_EXTERN is not defined in this chunk)
-
-ALIGN  16
-
-DeblockLumaTransposeV2H_sse2:   ; reads 8 x 16 bytes from pSrc, transposes them, writes 8 bytes to each of 16 rows of pPixY
-    push     ebp
-    mov      ebp,   esp
-    
-    and     esp,  0FFFFFFF0h   ; 16-byte align esp so [esp] can be handed to the macro as scratch
-    sub     esp,   10h  
-    
-    mov      eax,   [ebp + 10h]  ; pSrc (third argument per the prototype comment); read with movdqa, so it must be 16-byte aligned
-    mov      ecx,   [ebp + 0Ch]   ; iStride
-    mov      edx,   [ebp + 08h]   ; pPixY
-      
-    movdqa   xmm0,  [eax]
-    movdqa   xmm1,  [eax + 10h]
-    movdqa   xmm2,  [eax + 20h]
-    movdqa   xmm3,	[eax + 30h]
-    movdqa   xmm4,	[eax + 40h]
-    movdqa   xmm5,	[eax + 50h]
-    movdqa   xmm6,	[eax + 60h]
-    movdqa   xmm7,	[eax + 70h]
-    
-    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
-    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1  (1-based register numbers, i.e. xmm4, xmm2, xmm3, xmm7, xmm5, xmm1, xmm6, xmm0 -- the order the stores below use; the macro itself is defined outside this chunk)
-    
-    lea      eax,   [ecx * 3]   ; eax = 3*iStride
-    
-    movq     [edx],  xmm4 ; rows 0..3 take the low qwords
-    movq     [edx + ecx],  xmm2
-    movq     [edx + ecx*2],  xmm3
-    movq     [edx + eax],  xmm7
-    
-    lea      edx,   [edx + ecx*4]   ; rows 4..7
-    movq     [edx],  xmm5 
-    movq     [edx + ecx],  xmm1
-    movq     [edx + ecx*2],  xmm6
-    movq     [edx + eax],  xmm0    
-    
-    psrldq    xmm4,   8   ; move each high qword down so movq can store it
-    psrldq    xmm2,   8
-    psrldq    xmm3,   8
-    psrldq    xmm7,   8
-    psrldq    xmm5,   8
-    psrldq    xmm1,   8
-    psrldq    xmm6,   8
-    psrldq    xmm0,   8
-    
-    lea       edx,  [edx + ecx*4]   ; rows 8..11
-    movq     [edx],  xmm4 
-    movq     [edx + ecx],  xmm2
-    movq     [edx + ecx*2],  xmm3
-    movq     [edx + eax],  xmm7
-    
-    lea      edx,   [edx + ecx*4]   ; rows 12..15
-    movq     [edx],  xmm5 
-    movq     [edx + ecx],  xmm1
-    movq     [edx + ecx*2],  xmm6
-    movq     [edx + eax],  xmm0   
-    
-    
-    mov      esp,   ebp   ; drop the aligned scratch area
-    pop      ebp
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  deblock.asm
+;*
+;*  Abstract
+;*      edge loop
+;*
+;*  History
+;*      08/07/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+BITS 32   ; everything below is 32-bit code: arguments are read from the stack via ebp
+
+;*******************************************************************************
+; Read-only data section (opened here, but nothing is emitted into it before SECTION .text)
+;*******************************************************************************
+
+%ifdef FORMAT_COFF
+SECTION .rodata pData
+%else
+SECTION .rodata align=16   ; 16-byte alignment for any constants added later
+%endif
+
+SECTION .text   ; code starts here
+
+;********************************************************************************
+;  void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;                             int32_t iAlpha, int32_t iBeta)
+;********************************************************************************
+WELS_EXTERN   DeblockChromaEq4V_sse2   ; export the entry point (WELS_EXTERN is not defined in this file; presumably it comes from asm_inc.asm)
+
+ALIGN  16
+DeblockChromaEq4V_sse2:   ; rewrites rows pPix-iStride (p0) and pPix (q0) of both chroma planes, 8 pixels each; p1/q1 are the rows one further out
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h   ; align the frame for the movdqa spills
+  sub         esp,68h   ; 68h plus the two pushes below is a multiple of 16, so esp stays aligned for [esp+xx] accesses
+  mov         edx,[ebp+10h]      ;  iStride
+  mov         eax,[ebp+8]        ;  pPixCb
+  mov         ecx,[ebp+0Ch]      ;  pPixCr
+  movq        xmm4,[ecx]   ; Cr q0 (8 pixels)
+  movq        xmm5,[edx+ecx]   ; Cr q1
+  push        esi
+  push        edi
+  lea         esi,[edx+edx]   ; esi = 2*iStride
+  mov         edi,eax
+  sub         edi,esi
+  movq        xmm1,[edi]   ; Cb p1
+  mov         edi,ecx
+  sub         edi,esi
+  movq        xmm2,[edi]   ; Cr p1
+  punpcklqdq  xmm1,xmm2   ; xmm1 = p1: Cb in the low qword, Cr in the high qword (same layout for p0/q0/q1)
+  mov         esi,eax
+  sub         esi,edx   ; esi = address of Cb p0, kept until the final store
+  movq        xmm2,[esi]
+  mov         edi,ecx
+  sub         edi,edx   ; edi = address of Cr p0, kept until the final store
+  movq        xmm3,[edi]
+  punpcklqdq  xmm2,xmm3   ; xmm2 = p0
+  movq        xmm3,[eax]
+  punpcklqdq  xmm3,xmm4   ; xmm3 = q0
+  movq        xmm4,[edx+eax]
+  mov       edx, [ebp + 14h]   ; iAlpha
+  punpcklqdq  xmm4,xmm5   ; xmm4 = q1
+  movd        xmm5,edx
+  mov       edx, [ebp + 18h]   ; iBeta
+  pxor        xmm0,xmm0   ; zero, used to widen bytes to words
+  movdqa      xmm6,xmm5
+  punpcklwd   xmm6,xmm5
+  pshufd      xmm5,xmm6,0   ; xmm5 = low 16 bits of iAlpha in all 8 words
+  movd        xmm6,edx
+  movdqa      xmm7,xmm6
+  punpcklwd   xmm7,xmm6
+  pshufd      xmm6,xmm7,0   ; xmm6 = low 16 bits of iBeta in all 8 words
+  movdqa      xmm7,xmm1
+  punpckhbw   xmm1,xmm0
+  punpcklbw   xmm7,xmm0
+  movdqa      [esp+40h],xmm1   ; [esp+40h] = p1 Cr as words
+  movdqa      [esp+60h],xmm7   ; [esp+60h] = p1 Cb as words
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm7,xmm0
+  movdqa      [esp+10h],xmm7   ; [esp+10h] = p0 Cb (slot is reused later for the rounding constant)
+  movdqa      xmm7,xmm3
+  punpcklbw   xmm7,xmm0
+  punpckhbw   xmm3,xmm0
+  movdqa      [esp+50h],xmm7   ; [esp+50h] = q0 Cb
+  movdqa      xmm7,xmm4
+  punpckhbw   xmm4,xmm0   ; xmm4 = q1 Cr
+  punpckhbw   xmm2,xmm0   ; xmm2 = p0 Cr
+  punpcklbw   xmm7,xmm0   ; xmm7 = q1 Cb
+  movdqa      [esp+30h],xmm3   ; [esp+30h] = q0 Cr
+  movdqa      xmm3,[esp+10h]   ; xmm3 = p0 Cb
+  movdqa      xmm1,xmm3
+  psubw       xmm1,[esp+50h]
+  pabsw       xmm1,xmm1   ; |p0 - q0| (Cb).  NOTE(review): pabsw is an SSSE3 instruction, so this routine needs SSSE3 despite its _sse2 suffix
+  movdqa      [esp+20h],xmm4   ; [esp+20h] = q1 Cr
+  movdqa      xmm0,xmm5
+  pcmpgtw     xmm0,xmm1   ; Cb mask: alpha > |p0 - q0|
+  movdqa      xmm1,[esp+60h]
+  psubw       xmm1,xmm3
+  pabsw       xmm1,xmm1
+  movdqa      xmm4,xmm6
+  pcmpgtw     xmm4,xmm1   ; beta > |p1 - p0|
+  pand        xmm0,xmm4
+  movdqa      xmm1,xmm7
+  psubw       xmm1,[esp+50h]
+  pabsw       xmm1,xmm1
+  movdqa      xmm4,xmm6
+  pcmpgtw     xmm4,xmm1   ; beta > |q1 - q0| (and-ed into xmm0 three lines below)
+  movdqa      xmm1,xmm2
+  psubw       xmm1,[esp+30h]
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm5,xmm1   ; Cr mask starts here: alpha > |p0 - q0|
+  movdqa      xmm1,[esp+40h]
+  pand        xmm0,xmm4   ; xmm0 = complete Cb mask
+  psubw       xmm1,xmm2
+  pabsw       xmm1,xmm1
+  movdqa      xmm4,xmm6
+  pcmpgtw     xmm4,xmm1   ; beta > |p1 - p0| (Cr)
+  movdqa      xmm1,[esp+20h]
+  psubw       xmm1,[esp+30h]
+  pand        xmm5,xmm4
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm6,xmm1   ; beta > |q1 - q0| (Cr)
+  pand        xmm5,xmm6   ; xmm5 = complete Cr mask
+  mov         edx,2
+  movsx       edx,dx
+  movd        xmm1,edx
+  movdqa      xmm4,xmm1
+  punpcklwd   xmm4,xmm1
+  pshufd      xmm1,xmm4,0   ; xmm1 = rounding constant 2 in all 8 words
+  movdqa      xmm4,[esp+60h]
+  movdqa      xmm6,xmm4
+  paddw       xmm6,xmm4
+  paddw       xmm6,xmm3
+  paddw       xmm6,xmm7
+  movdqa      [esp+10h],xmm1   ; [esp+10h] now holds the constant 2
+  paddw       xmm6,[esp+10h]
+  psraw       xmm6,2   ; new Cb p0 = (2*p1 + p0 + q1 + 2) >> 2
+  movdqa      xmm4,xmm0
+  pandn       xmm4,xmm3   ; keep the original p0 where the mask is clear
+  movdqa      xmm3,[esp+40h]
+  movdqa      xmm1,xmm0
+  pand        xmm1,xmm6
+  por         xmm1,xmm4   ; xmm1 = selected Cb p0
+  movdqa      xmm6,xmm3
+  paddw       xmm6,xmm3
+  movdqa      xmm3,[esp+10h]   ; xmm3 = constant 2 from here on
+  paddw       xmm6,xmm2
+  paddw       xmm6,[esp+20h]
+  paddw       xmm6,xmm3
+  psraw       xmm6,2   ; new Cr p0, same formula
+  movdqa      xmm4,xmm5
+  pand        xmm4,xmm6
+  movdqa      xmm6,xmm5
+  pandn       xmm6,xmm2
+  por         xmm4,xmm6
+  packuswb    xmm1,xmm4   ; xmm1 = output p0 row: Cb low qword, Cr high qword
+  movdqa      xmm4,[esp+50h]
+  movdqa      xmm6,xmm7
+  paddw       xmm6,xmm7
+  paddw       xmm6,xmm4
+  paddw       xmm6,[esp+60h]
+  paddw       xmm6,xmm3
+  psraw       xmm6,2   ; new Cb q0 = (2*q1 + q0 + p1 + 2) >> 2
+  movdqa      xmm2,xmm0
+  pand        xmm2,xmm6
+  pandn       xmm0,xmm4
+  por         xmm2,xmm0   ; xmm2 = selected Cb q0
+  movdqa      xmm0,[esp+20h]
+  movdqa      xmm6,xmm0
+  paddw       xmm6,xmm0
+  movdqa      xmm0,[esp+30h]
+  paddw       xmm6,xmm0
+  paddw       xmm6,[esp+40h]
+  movdqa      xmm4,xmm5
+  paddw       xmm6,xmm3
+  movq        [esi],xmm1   ; store Cb p0
+  psraw       xmm6,2   ; new Cr q0, same formula
+  pand        xmm4,xmm6
+  pandn       xmm5,xmm0
+  por         xmm4,xmm5
+  packuswb    xmm2,xmm4   ; xmm2 = output q0 row: Cb low qword, Cr high qword
+  movq        [eax],xmm2   ; store Cb q0
+  psrldq      xmm1,8
+  movq        [edi],xmm1   ; store Cr p0
+  pop         edi
+  psrldq      xmm2,8
+  movq        [ecx],xmm2   ; store Cr q0
+  pop         esi
+  mov         esp,ebp
+  pop         ebp
+  ret
+
+;******************************************************************************
+; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;*******************************************************************************
+
+WELS_EXTERN  DeblockChromaLt4V_sse2   ; export the entry point (WELS_EXTERN is not defined in this file; presumably it comes from asm_inc.asm)
+
+DeblockChromaLt4V_sse2:   ; adds a tc-clamped delta to row pPix-iStride (p0) and subtracts it from row pPix (q0), both chroma planes.  NOTE(review): no ALIGN 16 before this label, unlike the sibling routines
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h   ; align the frame for the movdqa spills
+  sub         esp,0E4h   ; 0E4h plus the three pushes below is a multiple of 16, so esp stays aligned for [esp+xx] accesses
+  push        ebx
+  push        esi
+  mov         esi, [ebp+1Ch]      ;  pTC
+  movsx       ebx, byte [esi+2]   ; pTC[2]
+  push        edi
+  movsx       di,byte [esi+3]   ; pTC[3]
+  mov         word [esp+0Ch],bx   ; spill pTC[2]
+  movsx       bx,byte  [esi+1]   ; pTC[1]
+  movsx       esi,byte  [esi]   ; pTC[0]
+  mov         word  [esp+0Eh],si   ; spill pTC[0]
+  movzx       esi,di
+  movd        xmm1,esi   ; pTC[3]
+  movzx       esi,di
+  movd        xmm2,esi   ; pTC[3] (second copy)
+  mov         si,word  [esp+0Ch]
+  mov         edx, [ebp + 10h]   ; iStride
+  mov         eax, [ebp + 08h]   ; pPixCb
+  movzx       edi,si
+  movzx       esi,si
+  mov         ecx, [ebp + 0Ch]   ; pPixCr
+  movd        xmm4,esi   ; pTC[2]
+  movzx       esi,bx
+  movd        xmm5,esi   ; pTC[1]
+  movd        xmm3,edi   ; pTC[2] (second copy)
+  movzx       esi,bx
+  movd        xmm6,esi   ; pTC[1] (second copy)
+  mov         si,word [esp+0Eh]
+  movzx       edi,si
+  movzx       esi,si
+  punpcklwd   xmm6,xmm2
+  pxor        xmm0,xmm0
+  movdqa      [esp+40h],xmm0   ; [esp+40h] = zero for now (later reused for the tc > 0 mask)
+  movd        xmm7,edi   ; pTC[0]
+  movd        xmm0,esi   ; pTC[0] (second copy)
+  lea         esi,[edx+edx]   ; esi = 2*iStride
+  mov         edi,eax
+  sub         edi,esi   ; edi = address of Cb p1 (row pPixCb - 2*iStride)
+  punpcklwd   xmm5,xmm1
+  movdqa      xmm1,[esp+40h]   ; xmm1 = zero, used to widen bytes to words
+  punpcklwd   xmm0,xmm4
+  movq        xmm4,[edx+ecx]   ; Cr q1
+  punpcklwd   xmm7,xmm3
+  movq        xmm3,[eax]   ; Cb q0
+  punpcklwd   xmm0,xmm6
+  movq        xmm6,[edi]   ; Cb p1
+  punpcklwd   xmm7,xmm5
+  punpcklwd   xmm0,xmm7   ; xmm0 = tc words: pTC[0],pTC[0],pTC[1],pTC[1],pTC[2],pTC[2],pTC[3],pTC[3]; the same vector serves Cb and Cr
+  mov         edi,ecx
+  sub         edi,esi
+  movdqa      xmm2,xmm1
+  psubw       xmm2,xmm0
+  movdqa      [esp+60h],xmm2   ; [esp+60h] = -tc
+  movq        xmm2, [edi]   ; Cr p1
+  punpcklqdq  xmm6,xmm2   ; xmm6 = p1: Cb in the low qword, Cr in the high qword (same layout for p0/q0/q1)
+  mov         esi,eax
+  sub         esi,edx   ; esi = address of Cb p0, kept until the final store
+  movq        xmm7,[esi]
+  mov         edi,ecx
+  sub         edi,edx   ; edi = address of Cr p0, kept until the final store
+  movq        xmm2,[edi]
+  punpcklqdq  xmm7,xmm2   ; xmm7 = p0
+  movq        xmm2,[ecx]
+  punpcklqdq  xmm3,xmm2   ; xmm3 = q0
+  movq        xmm2,[edx+eax]
+  movsx       edx,word [ebp + 14h]   ; low 16 bits of iAlpha
+  punpcklqdq  xmm2,xmm4   ; xmm2 = q1
+  movdqa      [esp+0E0h],xmm2
+  movd        xmm2,edx
+  movsx       edx,word [ebp + 18h]   ; low 16 bits of iBeta
+  movdqa      xmm4,xmm2
+  punpcklwd   xmm4,xmm2
+  movd        xmm2,edx
+  movdqa      xmm5,xmm2
+  punpcklwd   xmm5,xmm2
+  pshufd      xmm2,xmm5,0
+  movdqa      [esp+50h],xmm2   ; [esp+50h] = beta in all 8 words
+  movdqa      xmm2,xmm6
+  punpcklbw   xmm2,xmm1
+  movdqa      [esp+0D0h],xmm3
+  pshufd      xmm4,xmm4,0   ; xmm4 = alpha in all 8 words
+  movdqa      [esp+30h],xmm2   ; [esp+30h] = p1 Cb as words (later reused for the masked Cb delta)
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+80h],xmm6   ; [esp+80h] = p1 Cr
+  movdqa      xmm6,[esp+0D0h]
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+70h],xmm6   ; [esp+70h] = q0 Cr
+  movdqa      xmm6, [esp+0E0h]
+  punpckhbw   xmm6,xmm1
+  movdqa     [esp+90h],xmm6   ; [esp+90h] = q1 Cr
+  movdqa      xmm5, [esp+0E0h]
+  movdqa      xmm2,xmm7
+  punpckhbw   xmm7,xmm1
+  punpcklbw   xmm5,xmm1   ; xmm5 = q1 Cb
+  movdqa       [esp+0A0h],xmm7   ; [esp+0A0h] = p0 Cr
+  punpcklbw   xmm3,xmm1   ; xmm3 = q0 Cb
+  mov         edx,4
+  punpcklbw   xmm2,xmm1   ; xmm2 = p0 Cb
+  movsx       edx,dx
+  movd        xmm6,edx
+  movdqa      xmm7,xmm6
+  punpcklwd   xmm7,xmm6
+  pshufd      xmm6,xmm7,0
+  movdqa      xmm7,[esp+30h]
+  movdqa      [esp+20h],xmm6   ; [esp+20h] = rounding constant 4 in all 8 words
+  psubw       xmm7,xmm5   ; p1 - q1 (Cb)
+  movdqa      xmm6,xmm0
+  pcmpgtw     xmm6,xmm1   ; tc > 0
+  movdqa      xmm1,[esp+60h]
+  movdqa      [esp+40h],xmm6   ; [esp+40h] = tc > 0 mask
+  movdqa      xmm6,xmm3
+  psubw       xmm6,xmm2
+  psllw       xmm6,2
+  paddw       xmm6,xmm7
+  paddw       xmm6, [esp+20h]
+  movdqa      xmm7, [esp+50h]
+  psraw       xmm6,3   ; delta = (4*(q0 - p0) + (p1 - q1) + 4) >> 3
+  pmaxsw      xmm1,xmm6
+  movdqa      [esp+10h],xmm0
+  movdqa      xmm6, [esp+10h]
+  pminsw      xmm6,xmm1   ; clamp delta to [-tc, tc]
+  movdqa      [esp+10h],xmm6
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm3
+  pabsw       xmm1,xmm1   ; NOTE(review): pabsw is an SSSE3 instruction, so this routine needs SSSE3 despite its _sse2 suffix
+  movdqa      xmm6,xmm4
+  pcmpgtw     xmm6,xmm1   ; Cb mask: alpha > |p0 - q0|
+  movdqa      xmm1, [esp+30h]
+  psubw       xmm1,xmm2
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm7,xmm1   ; beta > |p1 - p0|
+  movdqa      xmm1,[esp+50h]
+  pand        xmm6,xmm7
+  movdqa      xmm7,[esp+50h]
+  psubw       xmm5,xmm3
+  pabsw       xmm5,xmm5
+  pcmpgtw     xmm1,xmm5   ; beta > |q1 - q0|
+  movdqa      xmm5,[esp+80h]
+  psubw       xmm5,[esp+90h]   ; p1 - q1 (Cr)
+  pand        xmm6,xmm1
+  pand        xmm6,[esp+40h]   ; and tc > 0
+  movdqa      xmm1,[esp+10h]
+  pand        xmm1,xmm6
+  movdqa      xmm6,[esp+70h]
+  movdqa      [esp+30h],xmm1   ; [esp+30h] = Cb delta, zero where the mask is clear
+  movdqa      xmm1,[esp+0A0h]
+  psubw       xmm6,xmm1
+  psllw       xmm6,2
+  paddw       xmm6,xmm5
+  paddw       xmm6,[esp+20h]
+  movdqa      xmm5,[esp+60h]
+  psraw       xmm6,3   ; Cr delta, same formula
+  pmaxsw      xmm5,xmm6
+  pminsw      xmm0,xmm5   ; xmm0 = Cr delta clamped to [-tc, tc]
+  movdqa      xmm5,[esp+70h]
+  movdqa      xmm6,xmm1
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm4,xmm6   ; Cr mask: alpha > |p0 - q0|
+  movdqa      xmm6,[esp+80h]
+  psubw       xmm6,xmm1
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6   ; beta > |p1 - p0|
+  movdqa      xmm6,[esp+90h]
+  pand        xmm4,xmm7
+  movdqa      xmm7,[esp+50h]
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6   ; beta > |q1 - q0|
+  pand        xmm4,xmm7
+  pand        xmm4,[esp+40h]   ; and tc > 0
+  pand        xmm0,xmm4   ; xmm0 = Cr delta, zero where the mask is clear
+  movdqa      xmm4,[esp+30h]
+  paddw       xmm2,xmm4   ; Cb p0 + delta
+  paddw       xmm1,xmm0   ; Cr p0 + delta
+  packuswb    xmm2,xmm1   ; saturate to bytes: Cb low qword, Cr high qword
+  movq        [esi],xmm2   ; store Cb p0
+  psubw       xmm3,xmm4   ; Cb q0 - delta
+  psubw       xmm5,xmm0   ; Cr q0 - delta
+  packuswb    xmm3,xmm5
+  movq        [eax],xmm3   ; store Cb q0
+  psrldq      xmm2,8
+  movq        [edi],xmm2   ; store Cr p0
+  pop         edi
+  pop         esi
+  psrldq      xmm3,8
+  movq        [ecx],xmm3   ; store Cr q0
+  pop         ebx
+  mov         esp,ebp
+  pop         ebp
+  ret
+
+;***************************************************************************
+;  void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;          int32_t iAlpha, int32_t iBeta)
+;***************************************************************************
+
+WELS_EXTERN     DeblockChromaEq4H_sse2   ; export the entry point (WELS_EXTERN is not defined in this file; presumably it comes from asm_inc.asm)
+
+ALIGN  16
+
+DeblockChromaEq4H_sse2:   ; for 8 rows of each chroma plane: reads the 4 bytes at pPix-2 .. pPix+1, rewrites the middle two (p0 at pPix-1, q0 at pPix), writes all 4 back
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h   ; align the frame for the movdqa spills
+  sub         esp,0C8h   ; 0C8h plus the two pushes below is a multiple of 16, so esp ends up aligned for [esp+xx] accesses
+  mov         ecx,dword [ebp+8]   ; pPixCb
+  mov         edx,dword [ebp+0Ch]   ; pPixCr
+  mov         eax,dword [ebp+10h]   ; iStride
+  sub         ecx,2   ; start two pixels to the left of the addressed column
+  sub         edx,2
+  push        esi
+  lea         esi,[eax+eax*2]   ; esi = 3*iStride
+  mov         dword [esp+18h],ecx   ; Cb rows 0..3 base (becomes [esp+1Ch] after push edi)
+  mov         dword [esp+4],edx   ; Cr rows 0..3 base (becomes [esp+8] after push edi)
+  lea         ecx,[ecx+eax*4]
+  lea         edx,[edx+eax*4]
+  lea         eax,[esp+7Ch]   ; address of the 64-byte column buffer (equals esp+80h after push edi)
+  push        edi
+  mov         dword [esp+14h],esi   ; [esp+14h] = 3*iStride
+  mov         dword [esp+18h],ecx   ; [esp+18h] = Cb rows 4..7 base
+  mov         dword [esp+0Ch],edx   ; [esp+0Ch] = Cr rows 4..7 base
+  mov         dword [esp+10h],eax   ; [esp+10h] = column buffer pointer
+  mov         esi,dword [esp+1Ch]
+  mov         ecx,dword [ebp+10h]   ; ecx = iStride
+  mov         edx,dword [esp+14h]   ; edx = 3*iStride
+  movd        xmm0,dword [esi]   ; Cb rows 0..3, 4 bytes each
+  movd        xmm1,dword [esi+ecx]
+  movd        xmm2,dword [esi+ecx*2]
+  movd        xmm3,dword [esi+edx]
+  mov         esi,dword  [esp+8]
+  movd        xmm4,dword [esi]   ; Cr rows 0..3
+  movd        xmm5,dword [esi+ecx]
+  movd        xmm6,dword [esi+ecx*2]
+  movd        xmm7,dword [esi+edx]
+  punpckldq   xmm0,xmm4   ; xmm(i) = Cb row i | Cr row i
+  punpckldq   xmm1,xmm5
+  punpckldq   xmm2,xmm6
+  punpckldq   xmm3,xmm7
+  mov         esi,dword [esp+18h]
+  mov         edi,dword [esp+0Ch]
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [edi]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm0,xmm4   ; xmm0 = Cb row 0 | Cr row 0 | Cb row 4 | Cr row 4
+  movd        xmm4,dword [esi+ecx]
+  movd        xmm5,dword [edi+ecx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm1,xmm4   ; rows 1 and 5
+  movd        xmm4,dword [esi+ecx*2]
+  movd        xmm5,dword [edi+ecx*2]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm2,xmm4   ; rows 2 and 6
+  movd        xmm4,dword [esi+edx]
+  movd        xmm5,dword [edi+edx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm3,xmm4   ; rows 3 and 7
+  movdqa      xmm6,xmm0   ; transpose: byte, word, dword, then qword interleaves turn the 4 row registers into 4 column registers
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm5,xmm4
+  punpcklqdq  xmm1,xmm2
+  punpckhqdq  xmm6,xmm2
+  mov         edi,dword [esp+10h]
+  movdqa      [edi],xmm0   ; buffer+00h = column pPix-2 (p1): 8 Cb pixels in the low qword, 8 Cr pixels in the high qword
+  movdqa      [edi+10h],xmm5   ; buffer+10h = column pPix-1 (p0)
+  movdqa      [edi+20h],xmm1   ; buffer+20h = column pPix   (q0)
+  movdqa      [edi+30h],xmm6   ; buffer+30h = column pPix+1 (q1)
+  movsx       ecx,word [ebp+14h]   ; low 16 bits of iAlpha
+  movsx       edx,word [ebp+18h]   ; low 16 bits of iBeta
+  movdqa      xmm6,[esp+80h]   ; the buffer again, now addressed relative to esp: p1
+  movdqa      xmm4,[esp+90h]   ; p0
+  movdqa      xmm5,[esp+0A0h]   ; q0
+  movdqa      xmm7,[esp+0B0h]   ; q1
+  pxor        xmm0,xmm0   ; zero, used to widen bytes to words
+  movd        xmm1,ecx
+  movdqa      xmm2,xmm1
+  punpcklwd   xmm2,xmm1
+  pshufd      xmm1,xmm2,0   ; xmm1 = alpha in all 8 words
+  movd        xmm2,edx
+  movdqa      xmm3,xmm2
+  punpcklwd   xmm3,xmm2
+  pshufd      xmm2,xmm3,0   ; xmm2 = beta in all 8 words
+  movdqa      xmm3,xmm6
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+60h],xmm6   ; [esp+60h] = p1 Cr as words
+  movdqa      xmm6,[esp+90h]
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+30h],xmm6   ; [esp+30h] = p0 Cr
+  movdqa      xmm6,[esp+0A0h]
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+40h],xmm6   ; [esp+40h] = q0 Cr
+  movdqa      xmm6,[esp+0B0h]
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+70h],xmm6   ; [esp+70h] = q1 Cr
+  punpcklbw   xmm7,xmm0   ; xmm7 = q1 Cb
+  punpcklbw   xmm4,xmm0   ; xmm4 = p0 Cb
+  punpcklbw   xmm5,xmm0   ; xmm5 = q0 Cb
+  punpcklbw   xmm3,xmm0   ; xmm3 = p1 Cb
+  movdqa      [esp+50h],xmm7   ; [esp+50h] = q1 Cb
+  movdqa      xmm6,xmm4
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6   ; NOTE(review): pabsw is an SSSE3 instruction, so this routine needs SSSE3 despite its _sse2 suffix
+  movdqa      xmm0,xmm1
+  pcmpgtw     xmm0,xmm6   ; Cb mask: alpha > |p0 - q0|
+  movdqa      xmm6,xmm3
+  psubw       xmm6,xmm4
+  pabsw       xmm6,xmm6
+  movdqa      xmm7,xmm2
+  pcmpgtw     xmm7,xmm6   ; beta > |p1 - p0|
+  movdqa      xmm6,[esp+50h]
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pand        xmm0,xmm7
+  movdqa      xmm7,xmm2
+  pcmpgtw     xmm7,xmm6   ; beta > |q1 - q0| (and-ed into xmm0 a few lines below)
+  movdqa      xmm6,[esp+30h]
+  psubw       xmm6,[esp+40h]
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm1,xmm6   ; Cr mask starts here: alpha > |p0 - q0|
+  movdqa      xmm6,[esp+60h]
+  psubw       xmm6,[esp+30h]
+  pabsw       xmm6,xmm6
+  pand        xmm0,xmm7   ; xmm0 = complete Cb mask
+  movdqa      xmm7,xmm2
+  pcmpgtw     xmm7,xmm6   ; beta > |p1 - p0| (Cr)
+  movdqa      xmm6,[esp+70h]
+  psubw       xmm6,[esp+40h]
+  pabsw       xmm6,xmm6
+  pand        xmm1,xmm7
+  pcmpgtw     xmm2,xmm6   ; beta > |q1 - q0| (Cr)
+  pand        xmm1,xmm2   ; xmm1 = complete Cr mask
+  mov         eax,2
+  movsx       ecx,ax
+  movd        xmm2,ecx
+  movdqa      xmm6,xmm2
+  punpcklwd   xmm6,xmm2
+  pshufd      xmm2,xmm6,0
+  movdqa      [esp+20h],xmm2   ; [esp+20h] = rounding constant 2 in all 8 words
+  movdqa      xmm2,xmm3
+  paddw       xmm2,xmm3
+  paddw       xmm2,xmm4
+  paddw       xmm2,[esp+50h]
+  paddw       xmm2,[esp+20h]
+  psraw       xmm2,2   ; new Cb p0 = (2*p1 + p0 + q1 + 2) >> 2
+  movdqa      xmm6,xmm0
+  pand        xmm6,xmm2
+  movdqa      xmm2,xmm0
+  pandn       xmm2,xmm4   ; keep the original p0 where the mask is clear
+  por         xmm6,xmm2
+  movdqa      xmm2,[esp+60h]
+  movdqa      xmm7,xmm2
+  paddw       xmm7,xmm2
+  paddw       xmm7,[esp+30h]
+  paddw       xmm7,[esp+70h]
+  paddw       xmm7,[esp+20h]
+  movdqa      xmm4,xmm1
+  movdqa      xmm2,xmm1
+  pandn       xmm2,[esp+30h]
+  psraw       xmm7,2   ; new Cr p0, same formula
+  pand        xmm4,xmm7
+  por         xmm4,xmm2
+  movdqa      xmm2,[esp+50h]
+  packuswb    xmm6,xmm4
+  movdqa      [esp+90h],xmm6   ; overwrite the p0 column in the buffer
+  movdqa      xmm6,xmm2
+  paddw       xmm6,xmm2
+  movdqa      xmm2,[esp+20h]
+  paddw       xmm6,xmm5
+  paddw       xmm6,xmm3
+  movdqa      xmm4,xmm0
+  pandn       xmm0,xmm5
+  paddw       xmm6,xmm2
+  psraw       xmm6,2   ; new Cb q0 = (2*q1 + q0 + p1 + 2) >> 2
+  pand        xmm4,xmm6
+  por         xmm4,xmm0
+  movdqa      xmm0,[esp+70h]
+  movdqa      xmm5,xmm0
+  paddw       xmm5,xmm0
+  movdqa      xmm0,[esp+40h]
+  paddw       xmm5,xmm0
+  paddw       xmm5,[esp+60h]
+  movdqa      xmm3,xmm1
+  paddw       xmm5,xmm2
+  psraw       xmm5,2   ; new Cr q0, same formula
+  pand        xmm3,xmm5
+  pandn       xmm1,xmm0
+  por         xmm3,xmm1
+  packuswb    xmm4,xmm3
+  movdqa      [esp+0A0h],xmm4   ; overwrite the q0 column in the buffer; the p1 and q1 columns are left as loaded
+  mov         esi,dword [esp+10h]
+  movdqa      xmm0,[esi]   ; reload the four columns
+  movdqa      xmm1,[esi+10h]
+  movdqa      xmm2,[esi+20h]
+  movdqa      xmm3,[esi+30h]
+  movdqa      xmm6,xmm0   ; same interleave sequence as above, now turning columns back into rows
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4   ; xmm0 = Cb row 0 | Cr row 0 | Cb row 4 | Cr row 4
+  punpckhqdq  xmm5,xmm4   ; xmm5 = rows 1 and 5
+  punpcklqdq  xmm1,xmm2   ; xmm1 = rows 2 and 6
+  punpckhqdq  xmm6,xmm2   ; xmm6 = rows 3 and 7
+  mov         esi,dword [esp+1Ch]   ; Cb rows 0..3 base
+  mov         ecx,dword [ebp+10h]   ; iStride
+  mov         edx,dword [esp+14h]   ; 3*iStride
+  mov         edi,dword [esp+8]   ; Cr rows 0..3 base
+  movd        dword [esi],xmm0   ; store Cb rows 0..3
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4   ; bring the next dword down for movd
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         esi,dword [esp+18h]   ; Cb rows 4..7 base
+  movd        dword [edi],xmm0   ; store Cr rows 0..3
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  movd        dword [esi],xmm0   ; store Cb rows 4..7
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         edi,dword [esp+0Ch]   ; Cr rows 4..7 base
+  movd        dword [edi],xmm0   ; store Cr rows 4..7
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  pop         edi
+  pop         esi
+  mov         esp,ebp
+  pop         ebp
+  ret
+
+;*******************************************************************************
+;    void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;*******************************************************************************
+
+WELS_EXTERN  DeblockChromaLt4H_sse2
+
+ALIGN  16
+
+DeblockChromaLt4H_sse2:
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h
+  sub         esp,108h
+  mov         ecx,dword [ebp+8]
+  mov         edx,dword [ebp+0Ch]
+  mov         eax,dword [ebp+10h]
+  sub         ecx,2
+  sub         edx,2
+  push        esi
+  lea         esi,[eax+eax*2]
+  mov         dword [esp+10h],ecx
+  mov         dword [esp+4],edx
+  lea         ecx,[ecx+eax*4]
+  lea         edx,[edx+eax*4]
+  lea         eax,[esp+6Ch]
+  push        edi
+  mov         dword [esp+0Ch],esi
+  mov         dword [esp+18h],ecx
+  mov         dword [esp+10h],edx
+  mov         dword [esp+1Ch],eax
+  mov         esi,dword [esp+14h]
+  mov         ecx,dword [ebp+10h]
+  mov         edx,dword [esp+0Ch]
+  movd        xmm0,dword [esi]
+  movd        xmm1,dword [esi+ecx]
+  movd        xmm2,dword [esi+ecx*2]
+  movd        xmm3,dword [esi+edx]
+  mov         esi,dword [esp+8]
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [esi+ecx]
+  movd        xmm6,dword [esi+ecx*2]
+  movd        xmm7,dword [esi+edx]
+  punpckldq   xmm0,xmm4
+  punpckldq   xmm1,xmm5
+  punpckldq   xmm2,xmm6
+  punpckldq   xmm3,xmm7
+  mov         esi,dword [esp+18h]
+  mov         edi,dword [esp+10h]
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [edi]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm0,xmm4
+  movd        xmm4,dword [esi+ecx]
+  movd        xmm5,dword [edi+ecx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm1,xmm4
+  movd        xmm4,dword [esi+ecx*2]
+  movd        xmm5,dword [edi+ecx*2]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm2,xmm4
+  movd        xmm4,dword [esi+edx]
+  movd        xmm5,dword [edi+edx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm3,xmm4
+  movdqa      xmm6,xmm0
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm5,xmm4
+  punpcklqdq  xmm1,xmm2
+  punpckhqdq  xmm6,xmm2
+  mov         edi,dword [esp+1Ch]
+  movdqa      [edi],xmm0
+  movdqa      [edi+10h],xmm5
+  movdqa      [edi+20h],xmm1
+  movdqa      [edi+30h],xmm6
+  mov         eax,dword [ebp+1Ch]
+  movsx       cx,byte [eax+3]
+  movsx       dx,byte [eax+2]
+  movsx       si,byte [eax+1]
+  movsx       ax,byte [eax]
+  movzx       edi,cx
+  movzx       ecx,cx
+  movd        xmm2,ecx
+  movzx       ecx,dx
+  movzx       edx,dx
+  movd        xmm3,ecx
+  movd        xmm4,edx
+  movzx       ecx,si
+  movzx       edx,si
+  movd        xmm5,ecx
+  pxor        xmm0,xmm0
+  movd        xmm6,edx
+  movzx       ecx,ax
+  movdqa      [esp+60h],xmm0
+  movzx       edx,ax
+  movsx       eax,word [ebp+14h]
+  punpcklwd   xmm6,xmm2
+  movd        xmm1,edi
+  movd        xmm7,ecx
+  movsx       ecx,word [ebp+18h]
+  movd        xmm0,edx
+  punpcklwd   xmm7,xmm3
+  punpcklwd   xmm5,xmm1
+  movdqa      xmm1,[esp+60h]
+  punpcklwd   xmm7,xmm5
+  movdqa      xmm5,[esp+0A0h]
+  punpcklwd   xmm0,xmm4
+  punpcklwd   xmm0,xmm6
+  movdqa      xmm6, [esp+70h]
+  punpcklwd   xmm0,xmm7
+  movdqa      xmm7,[esp+80h]
+  movdqa      xmm2,xmm1
+  psubw       xmm2,xmm0
+  movdqa      [esp+0D0h],xmm2
+  movd        xmm2,eax
+  movdqa      xmm3,xmm2
+  punpcklwd   xmm3,xmm2
+  pshufd      xmm4,xmm3,0
+  movd        xmm2,ecx
+  movdqa      xmm3,xmm2
+  punpcklwd   xmm3,xmm2
+  pshufd      xmm2,xmm3,0
+  movdqa      xmm3, [esp+90h]
+  movdqa      [esp+50h],xmm2
+  movdqa      xmm2,xmm6
+  punpcklbw   xmm2,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+40h],xmm2
+  movdqa      [esp+0B0h],xmm6
+  movdqa      xmm6,[esp+90h]
+  movdqa      xmm2,xmm7
+  punpckhbw   xmm7,xmm1
+  punpckhbw   xmm6,xmm1
+  punpcklbw   xmm2,xmm1
+  punpcklbw   xmm3,xmm1
+  punpcklbw   xmm5,xmm1
+  movdqa      [esp+0F0h],xmm7
+  movdqa      [esp+0C0h],xmm6
+  movdqa      xmm6, [esp+0A0h]
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+0E0h],xmm6
+  mov         edx,4
+  movsx       eax,dx
+  movd        xmm6,eax
+  movdqa      xmm7,xmm6
+  punpcklwd   xmm7,xmm6
+  pshufd      xmm6,xmm7,0
+  movdqa      [esp+30h],xmm6
+  movdqa      xmm7, [esp+40h]
+  psubw       xmm7,xmm5
+  movdqa      xmm6,xmm0
+  pcmpgtw     xmm6,xmm1
+  movdqa      [esp+60h],xmm6
+  movdqa      xmm1, [esp+0D0h]
+  movdqa      xmm6,xmm3
+  psubw       xmm6,xmm2
+  psllw       xmm6,2
+  paddw       xmm6,xmm7
+  paddw       xmm6,[esp+30h]
+  psraw       xmm6,3
+  pmaxsw      xmm1,xmm6
+  movdqa      xmm7,[esp+50h]
+  movdqa      [esp+20h],xmm0
+  movdqa      xmm6, [esp+20h]
+  pminsw      xmm6,xmm1
+  movdqa      [esp+20h],xmm6
+  movdqa      xmm6,xmm4
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm3
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm6,xmm1
+  movdqa      xmm1, [esp+40h]
+  psubw       xmm1,xmm2
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm7,xmm1
+  movdqa      xmm1, [esp+50h]
+  pand        xmm6,xmm7
+  movdqa      xmm7, [esp+50h]
+  psubw       xmm5,xmm3
+  pabsw       xmm5,xmm5
+  pcmpgtw     xmm1,xmm5
+  movdqa      xmm5, [esp+0B0h]
+  psubw       xmm5,[esp+0E0h]
+  pand        xmm6,xmm1
+  pand        xmm6, [esp+60h]
+  movdqa      xmm1, [esp+20h]
+  pand        xmm1,xmm6
+  movdqa      xmm6, [esp+0C0h]
+  movdqa      [esp+40h],xmm1
+  movdqa      xmm1, [esp+0F0h]
+  psubw       xmm6,xmm1
+  psllw       xmm6,2
+  paddw       xmm6,xmm5
+  paddw       xmm6, [esp+30h]
+  movdqa      xmm5, [esp+0D0h]
+  psraw       xmm6,3
+  pmaxsw      xmm5,xmm6
+  pminsw      xmm0,xmm5
+  movdqa      xmm5,[esp+0C0h]
+  movdqa      xmm6,xmm1
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm4,xmm6
+  movdqa      xmm6,[esp+0B0h]
+  psubw       xmm6,xmm1
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6, [esp+0E0h]
+  pand        xmm4,xmm7
+  movdqa      xmm7, [esp+50h]
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6
+  pand        xmm4,xmm7
+  pand        xmm4,[esp+60h]
+  pand        xmm0,xmm4
+  movdqa      xmm4, [esp+40h]
+  paddw       xmm2,xmm4
+  paddw       xmm1,xmm0
+  psubw       xmm3,xmm4
+  psubw       xmm5,xmm0
+  packuswb    xmm2,xmm1
+  packuswb    xmm3,xmm5
+  movdqa      [esp+80h],xmm2
+  movdqa      [esp+90h],xmm3
+  mov         esi,dword [esp+1Ch]
+  movdqa      xmm0, [esi]
+  movdqa      xmm1, [esi+10h]
+  movdqa      xmm2, [esi+20h]
+  movdqa      xmm3, [esi+30h]
+  movdqa      xmm6,xmm0
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm5,xmm4
+  punpcklqdq  xmm1,xmm2
+  punpckhqdq  xmm6,xmm2
+  mov         esi,dword [esp+14h]
+  mov         ecx,dword [ebp+10h]
+  mov         edx,dword [esp+0Ch]
+  mov         edi,dword [esp+8]
+  movd        dword [esi],xmm0
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         esi,dword [esp+18h]
+  movd        dword [edi],xmm0
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  movd        dword [esi],xmm0
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         edi,dword [esp+10h]
+  movd        dword [edi],xmm0
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  pop         edi
+  pop         esi
+  mov         esp,ebp
+  pop         ebp
+  ret
+
+
+
+;*******************************************************************************
+;    void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+;                                 int32_t iBeta, int8_t * pTC)
+;*******************************************************************************
+; Filters 16 pixel columns in place. Rows pPix-3*iStride .. pPix+2*iStride are read (called p2,p1,p0,q0,q1,q2 in the comments below); p1,p0,q0,q1 are written back.
+; NOTE: pabsw is an SSSE3 instruction, so despite the _sse2 suffix this routine needs SSSE3. All row accesses use movdqa, so every row address must be 16-byte aligned.
+WELS_EXTERN  DeblockLumaLt4V_sse2
+; Stack arguments: [ebp+8]=pPix, [ebp+12]=iStride, [ebp+16]=iAlpha, [ebp+20]=iBeta (only their low 16 bits are read), [ebp+24]=pTC (4 signed bytes, each applied to 4 adjacent columns).
+ALIGN  16
+; Scratch slots are written [esp+432-K]: 432 = 420 reserved bytes + three 4-byte pushes, so K is the distance below the 16-byte aligned frame top.
+DeblockLumaLt4V_sse2:
+    push	ebp
+	mov	ebp, esp
+	and	esp, -16				; fffffff0H - round esp down to a 16-byte boundary
+	sub	esp, 420				; 000001a4H - scratch area (432 bytes once ebx/esi/edi are pushed)
+	mov	eax, dword [ebp+8]	; eax = pPix (address of the q0 row)
+	mov	ecx, dword [ebp+12]	; ecx = iStride
+
+	pxor	xmm0, xmm0	; xmm0 = 0
+	push	ebx
+	mov	edx, dword [ebp+24]	; edx = pTC
+	movdqa	[esp+424-384], xmm0	; zero constant; only ebx is pushed so far, so this is the slot later addressed as [esp+432-384]
+	push	esi
+
+	lea	esi, [ecx+ecx*2]	; esi = 3*iStride
+	push	edi
+	mov	edi, eax
+	sub	edi, esi
+	movdqa	xmm0, [edi]	; p2 row = [pPix - 3*iStride]
+
+	lea	esi, [ecx+ecx]	; esi = 2*iStride
+	movdqa	[esp+432-208], xmm0	; raw p2 row
+	mov	edi, eax
+	sub	edi, esi	; edi = pPix - 2*iStride; kept until the final store of the p1 row
+	movdqa	xmm0, [edi]
+	movdqa	[esp+448-208], xmm0	; raw p1 row
+
+	mov	ebx, eax
+	sub	ebx, ecx	; ebx = pPix - iStride
+	movdqa	xmm0, [ebx]
+	movdqa	[esp+464-208], xmm0	; raw p0 row
+
+	movdqa	xmm0, [eax]
+
+	add	ecx, eax	; ecx = pPix + iStride
+	movdqa	[esp+480-208], xmm0	; raw q0 row
+	movdqa	xmm0, [ecx]
+	mov	dword [esp+432-404], ecx	; save address of the q1 row
+
+	movsx	ecx, word [ebp+16]	; ecx = low 16 bits of iAlpha, sign-extended
+	movdqa	[esp+496-208], xmm0	; raw q1 row
+	movdqa	xmm0, [esi+eax]	; q2 row = [pPix + 2*iStride]
+
+	movsx	si, byte [edx]	; si = pTC[0] sign-extended to 16 bits
+	movdqa	[esp+512-208], xmm0	; raw q2 row
+	movd	xmm0, ecx
+	movsx	ecx, word [ebp+20]	; ecx = low 16 bits of iBeta, sign-extended
+	movdqa	xmm1, xmm0
+	punpcklwd xmm1, xmm0
+	pshufd	xmm0, xmm1, 0	; alpha replicated into all 8 words
+	movdqa	[esp+432-112], xmm0	; alpha x8
+	movd	xmm0, ecx
+	movsx	cx, byte [edx+1]	; cx = pTC[1] sign-extended to 16 bits
+	movdqa	xmm1, xmm0
+	punpcklwd xmm1, xmm0
+	mov	dword [esp+432-408], ebx	; save address of the p0 row
+	movzx	ebx, cx
+	pshufd	xmm0, xmm1, 0	; beta replicated into all 8 words
+	movd	xmm1, ebx	; xmm1,xmm2,xmm3,xmm4 each receive pTC[1] in the low word
+	movzx	ebx, cx
+	movd	xmm2, ebx
+	movzx	ebx, cx	; ebx keeps pTC[1] for xmm3 below
+	movzx	ecx, cx
+	movd	xmm4, ecx
+	movzx	ecx, si	; xmm5,xmm6,xmm7,xmm0 each receive pTC[0] in the low word
+	movd	xmm5, ecx
+	movzx	ecx, si
+	movd	xmm6, ecx
+	movzx	ecx, si
+	movd	xmm7, ecx
+	movzx	ecx, si
+	movdqa	[esp+432-336], xmm0	; beta x8
+	movd	xmm0, ecx
+
+	movsx	cx, byte [edx+3]	; cx = pTC[3]
+	movsx	dx, byte [edx+2]	; dx = pTC[2] (edx no longer points at pTC)
+	movd	xmm3, ebx
+	punpcklwd xmm0, xmm4	; the interleaves below build {tc0 x4, tc1 x4} in xmm0
+	movzx	esi, cx
+	punpcklwd xmm6, xmm2
+	punpcklwd xmm5, xmm1
+	punpcklwd xmm0, xmm6
+	punpcklwd xmm7, xmm3
+	punpcklwd xmm7, xmm5
+	punpcklwd xmm0, xmm7
+	movdqa	[esp+432-400], xmm0	; tc words for columns 0-7: pTC[0] x4, pTC[1] x4
+	movd	xmm0, esi	; same construction for pTC[3] (xmm0,xmm2,xmm3,xmm4) and pTC[2] (xmm5,xmm6,xmm7,xmm1)
+	movzx	esi, cx
+	movd	xmm2, esi
+	movzx	esi, cx
+	movzx	ecx, cx
+	movd	xmm4, ecx
+	movzx	ecx, dx
+	movd	xmm3, esi
+	movd	xmm5, ecx
+	punpcklwd xmm5, xmm0
+
+	movdqa	xmm0, [esp+432-384]	; xmm0 = 0 again; used as the zero half when widening bytes to words
+	movzx	ecx, dx
+	movd	xmm6, ecx
+	movzx	ecx, dx
+	movzx	edx, dx
+	punpcklwd xmm6, xmm2
+	movd	xmm7, ecx
+	movd	xmm1, edx
+
+	movdqa	xmm2, [esp+448-208]
+	punpcklbw xmm2, xmm0	; xmm2 = p1, columns 0-7, as words
+
+	mov	ecx, 4
+	movsx	edx, cx	; edx = 4, broadcast further down as the rounding constant
+	punpcklwd xmm7, xmm3
+	punpcklwd xmm7, xmm5
+	movdqa	xmm5, [esp+496-208]
+	movdqa	xmm3, [esp+464-208]
+	punpcklbw xmm5, xmm0
+	movdqa	[esp+432-240], xmm5	; q1 low words
+	movdqa	xmm5, [esp+512-208]
+	punpcklbw xmm5, xmm0
+	movdqa	[esp+432-352], xmm5	; q2 low words
+	punpcklwd xmm1, xmm4
+	movdqa	xmm4, [esp+432-208]
+	punpcklwd xmm1, xmm6
+	movdqa	xmm6, [esp+480-208]
+	punpcklwd xmm1, xmm7	; xmm1 = tc words for columns 8-15: pTC[2] x4, pTC[3] x4 (kept in xmm1 until the end)
+	punpcklbw xmm6, xmm0	; xmm6 = q0 low words
+	punpcklbw xmm3, xmm0	; xmm3 = p0 low words
+	punpcklbw xmm4, xmm0	; xmm4 = p2 low words
+	movdqa	xmm7, xmm3
+	psubw	xmm7, xmm4
+	pabsw	xmm7, xmm7	; |p0 - p2|
+	movdqa	[esp+432-272], xmm4	; p2 low words
+	movdqa	xmm4, [esp+432-336]	; xmm4 = beta x8
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-288], xmm5	; mask: beta > |p0 - p2|
+	movdqa	xmm7, xmm6
+	psubw	xmm7, [esp+432-352]
+	pabsw	xmm7, xmm7	; |q0 - q2|
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-256], xmm5	; mask: beta > |q0 - q2|
+	movdqa	xmm5, xmm3
+	pavgw	xmm5, xmm6
+	movdqa	[esp+432-304], xmm5	; (p0 + q0 + 1) >> 1
+	movdqa	xmm5, [esp+432-400]
+	psubw	xmm5, [esp+432-288]	; masks are 0 or -1, so each true lane adds 1 to tc
+	psubw	xmm5, [esp+432-256]
+	movdqa	[esp+432-224], xmm5	; tc' = tc + (beta > |p0-p2|) + (beta > |q0-q2|)
+	movdqa	xmm5, xmm6
+	psubw	xmm5, xmm3	; q0 - p0
+	movdqa	[esp+432-32], xmm6	; q0 low words
+	psubw	xmm6, [esp+432-240]	; q0 - q1
+	movdqa	xmm7, xmm5
+	movdqa	[esp+432-384], xmm5	; q0 - p0 (the zero constant in this slot is no longer needed; xmm0 holds it)
+	movdqa	xmm5, [esp+432-112]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm5, xmm7	; alpha > |q0 - p0|
+	pabsw	xmm6, xmm6
+	movdqa	xmm7, xmm4
+	pcmpgtw	xmm7, xmm6	; beta > |q0 - q1|
+
+	pand	xmm5, xmm7
+	movdqa	xmm6, xmm3
+	psubw	xmm6, xmm2
+	pabsw	xmm6, xmm6
+	movdqa	xmm7, xmm4
+	pcmpgtw	xmm7, xmm6	; beta > |p0 - p1|
+	movdqa	xmm6, [esp+432-400]
+	pand	xmm5, xmm7
+	movdqa	xmm7, xmm6
+	pcmpeqw	xmm6, xmm0
+	pcmpgtw	xmm7, xmm0
+	por	xmm7, xmm6	; tc >= 0
+	pand	xmm5, xmm7
+	movdqa	[esp+432-320], xmm5	; filter mask for columns 0-7 (all four conditions above)
+	movd	xmm5, edx
+	movdqa	xmm6, xmm5
+	punpcklwd xmm6, xmm5
+	pshufd	xmm5, xmm6, 0
+	movdqa	[esp+432-336], xmm5	; slot reused: constant 4 x8 (beta stays in xmm4)
+	movdqa	xmm5, [esp+432-224]
+	movdqa	[esp+432-368], xmm5	; tc'
+	movdqa	xmm6, xmm0
+	psubw	xmm6, xmm5	; -tc'
+	movdqa	xmm5, [esp+432-384]
+	psllw	xmm5, 2	; 4*(q0 - p0)
+	movdqa	xmm7, xmm2
+	psubw	xmm7, [esp+432-240]	; p1 - q1
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+432-336]
+	movdqa	xmm5, [esp+432-368]
+	psraw	xmm7, 3	; (4*(q0-p0) + (p1-q1) + 4) >> 3
+	pmaxsw	xmm6, xmm7
+	pminsw	xmm5, xmm6	; clamped to [-tc', tc']
+
+	pand	xmm5, [esp+432-320]
+	movdqa	xmm6, [esp+432-400]
+	movdqa	[esp+432-64], xmm5	; delta for p0/q0, columns 0-7
+	movdqa	[esp+432-384], xmm6	; slot reused: tc (low)
+	movdqa	xmm5, xmm0
+	psubw	xmm5, xmm6
+	movdqa	[esp+432-368], xmm5	; slot reused: -tc (low)
+	movdqa	xmm6, xmm5
+	movdqa	xmm5, [esp+432-272]
+	paddw	xmm5, [esp+432-304]	; p2 + avg(p0,q0)
+	movdqa	xmm7, xmm2
+	paddw	xmm7, xmm2
+	psubw	xmm5, xmm7	; - 2*p1
+	psraw	xmm5, 1
+	pmaxsw	xmm6, xmm5
+	movdqa	xmm5, [esp+432-384]
+	pminsw	xmm5, xmm6	; clamped to [-tc, tc]
+
+	pand	xmm5, [esp+432-320]
+	pand	xmm5, [esp+432-288]	; applied only where beta > |p0 - p2|
+	movdqa	xmm6, [esp+432-240]
+	movdqa	[esp+432-96], xmm5	; p1 adjustment, columns 0-7
+	movdqa	xmm5, [esp+432-352]
+	paddw	xmm5, [esp+432-304]	; q2 + avg(p0,q0)
+	movdqa	xmm7, xmm6
+	paddw	xmm7, xmm6
+	movdqa	xmm6, [esp+432-368]
+	psubw	xmm5, xmm7	; - 2*q1
+
+	movdqa	xmm7, [esp+496-208]
+	psraw	xmm5, 1
+	pmaxsw	xmm6, xmm5
+	movdqa	xmm5, [esp+432-400]
+	pminsw	xmm5, xmm6	; clamped to [-tc, tc]
+	pand	xmm5, [esp+432-320]
+	pand	xmm5, [esp+432-256]	; applied only where beta > |q0 - q2|
+	movdqa	xmm6, [esp+448-208]
+	punpckhbw xmm7, xmm0
+	movdqa	[esp+432-352], xmm7	; slot reused: q1 high words (columns 8-15)
+
+	movdqa	xmm7, [esp+512-208]
+	punpckhbw xmm6, xmm0
+	movdqa	[esp+432-48], xmm5	; q1 adjustment, columns 0-7
+	movdqa	xmm5, [esp+432-208]
+	movdqa	[esp+432-368], xmm6	; slot reused: p1 high words
+	movdqa	xmm6, [esp+464-208]
+	punpckhbw xmm7, xmm0
+	punpckhbw xmm5, xmm0	; xmm5 = p2 high words
+	movdqa	[esp+432-384], xmm7	; slot reused: q2 high words
+	punpckhbw xmm6, xmm0
+	movdqa	[esp+432-400], xmm6	; slot reused: p0 high words
+	; ---- columns 8-15: same conditions and formulas as above, with tc taken from xmm1 ----
+	movdqa	xmm7, [esp+432-400]
+	movdqa	xmm6, [esp+480-208]
+	psubw	xmm7, xmm5
+	movdqa	[esp+432-16], xmm5	; p2 high words
+	pabsw	xmm7, xmm7
+	punpckhbw xmm6, xmm0	; xmm6 = q0 high words
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-288], xmm5	; mask: beta > |p0 - p2| (high)
+
+	movdqa	xmm7, xmm6
+	psubw	xmm7, [esp+432-384]
+	pabsw	xmm7, xmm7
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-256], xmm5	; mask: beta > |q0 - q2| (high)
+
+	movdqa	xmm5, [esp+432-400]
+	movdqa	[esp+432-80], xmm6	; q0 high words
+	pavgw	xmm5, xmm6
+	movdqa	[esp+432-304], xmm5	; (p0 + q0 + 1) >> 1 (high)
+
+	movdqa	xmm5, xmm1
+	psubw	xmm5, [esp+432-288]
+	psubw	xmm5, [esp+432-256]
+	movdqa	[esp+432-224], xmm5	; tc' (high)
+	movdqa	xmm5, xmm6
+	psubw	xmm5, [esp+432-400]	; q0 - p0
+	psubw	xmm6, [esp+432-352]	; q0 - q1
+	movdqa	[esp+432-272], xmm5
+	movdqa	xmm7, xmm5
+	movdqa	xmm5, [esp+432-112]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm5, xmm7	; alpha > |q0 - p0|
+	movdqa	xmm7, xmm4
+	pabsw	xmm6, xmm6
+	pcmpgtw	xmm7, xmm6	; beta > |q0 - q1|
+	movdqa	xmm6, [esp+432-368]
+
+	pand	xmm5, xmm7
+	movdqa	xmm7, [esp+432-400]
+	psubw	xmm7, xmm6	; p0 - p1
+	psubw	xmm6, [esp+432-352]	; xmm6 = p1 - q1
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm4, xmm7	; beta > |p0 - p1| (last use of beta; xmm4 is overwritten)
+	pand	xmm5, xmm4
+
+	paddw	xmm2, [esp+432-96]	; xmm2 = filtered p1, columns 0-7
+	movdqa	xmm4, xmm1
+	pcmpgtw	xmm4, xmm0
+	movdqa	xmm7, xmm1
+	pcmpeqw	xmm7, xmm0
+	por	xmm4, xmm7	; tc >= 0 (high)
+	pand	xmm5, xmm4
+	movdqa	xmm4, [esp+432-224]
+	movdqa	[esp+432-320], xmm5	; filter mask for columns 8-15
+	movdqa	xmm5, [esp+432-272]
+	movdqa	xmm7, xmm0
+	psubw	xmm7, xmm4	; -tc'
+	psubw	xmm0, xmm1	; xmm0 = -tc (high); xmm0 is no longer zero from here on
+	psllw	xmm5, 2
+	paddw	xmm6, xmm5
+	paddw	xmm6, [esp+432-336]	; + 4
+	movdqa	xmm5, [esp+432-368]	; xmm5 = p1 high words
+	movdqa	[esp+432-336], xmm0	; slot reused: -tc (high)
+	psraw	xmm6, 3
+	pmaxsw	xmm7, xmm6
+	pminsw	xmm4, xmm7
+	pand	xmm4, [esp+432-320]
+	movdqa	xmm6, xmm0
+	movdqa	xmm0, [esp+432-16]
+	paddw	xmm0, [esp+432-304]	; p2 + avg(p0,q0)
+	movdqa	[esp+432-272], xmm4	; delta for p0/q0, columns 8-15
+	movdqa	xmm4, [esp+432-368]
+	paddw	xmm4, xmm4
+	psubw	xmm0, xmm4	; - 2*p1
+
+	movdqa	xmm4, [esp+432-64]	; xmm4 = delta, columns 0-7
+	psraw	xmm0, 1
+	pmaxsw	xmm6, xmm0
+	movdqa	xmm0, [esp+432-400]	; xmm0 = p0 high words
+	movdqa	xmm7, xmm1
+	pminsw	xmm7, xmm6
+	movdqa	xmm6, [esp+432-320]	; xmm6 = high filter mask, kept for the q1 adjustment below
+	pand	xmm7, xmm6
+	pand	xmm7, [esp+432-288]
+	paddw	xmm5, xmm7
+	packuswb xmm2, xmm5	; xmm2 = new p1 row (saturated back to bytes)
+	movdqa	xmm5, [esp+432-272]
+	paddw	xmm0, xmm5
+	paddw	xmm3, xmm4
+	packuswb xmm3, xmm0	; xmm3 = new p0 row = p0 + delta
+
+	movdqa	xmm0, [esp+432-32]
+	psubw	xmm0, xmm4
+	movdqa	xmm4, [esp+432-80]
+	psubw	xmm4, xmm5
+
+	movdqa	xmm5, [esp+432-240]
+	paddw	xmm5, [esp+432-48]	; xmm5 = filtered q1, columns 0-7
+	packuswb xmm0, xmm4	; new q0 row = q0 - delta
+	movdqa	xmm4, [esp+432-384]
+	paddw	xmm4, [esp+432-304]	; q2 + avg(p0,q0) (high)
+	movdqa	[esp+480-208], xmm0	; park the new q0 row while xmm0 is reused
+	movdqa	xmm0, [esp+432-352]
+	movdqa	xmm7, xmm0	; xmm7 = q1 high words
+	paddw	xmm0, xmm0
+
+	mov	ecx, dword [esp+432-408]	; ecx = address of the p0 row
+
+	mov	edx, dword [esp+432-404]	; edx = address of the q1 row
+	psubw	xmm4, xmm0	; - 2*q1
+	movdqa	xmm0, [esp+432-336]	; -tc (high)
+	movdqa	[edi], xmm2	; store new p1 row at pPix - 2*iStride
+	psraw	xmm4, 1
+	pmaxsw	xmm0, xmm4
+	pminsw	xmm1, xmm0	; clamped to [-tc, tc]
+	movdqa	xmm0, [esp+480-208]	; reload the new q0 row
+
+	pop	edi
+	pand	xmm1, xmm6
+	pand	xmm1, [esp+428-256]	; same slot as [esp+432-256]; esp is 4 higher after the pop
+	movdqa	[ecx], xmm3	; store new p0 row
+	paddw	xmm7, xmm1
+	pop	esi
+	packuswb xmm5, xmm7	; xmm5 = new q1 row
+	movdqa	[eax], xmm0	; store new q0 row at pPix
+	movdqa	[edx], xmm5	; store new q1 row at pPix + iStride
+	pop	ebx
+	mov	esp, ebp
+	pop	ebp
+	ret
+
+
+;*******************************************************************************
+;    void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+;                                 int32_t iBeta)
+;*******************************************************************************
+; Filters 16 pixel columns in place. Rows pPix-4*iStride .. pPix+3*iStride are read (called p3..p0, q0..q3 in the comments below); p2,p1,p0,q0,q1,q2 are written back.
+WELS_EXTERN  DeblockLumaEq4V_sse2
+; NOTE: pabsw is an SSSE3 instruction, so despite the _sse2 suffix this routine needs SSSE3. All row accesses use movdqa, so every row address must be 16-byte aligned.
+ALIGN  16
+; Stack arguments: [ebp+8]=pPix, [ebp+12]=iStride, [ebp+16]=iAlpha, [ebp+20]=iBeta (only the low 16 bits of iAlpha/iBeta are read).
+DeblockLumaEq4V_sse2:
+; Scratch slots are written [esp+640-K] (640 = 628 reserved bytes + three 4-byte pushes); [esp+656-272]..[esp+752-272] hold the raw rows p2..q3 and later the new rows p2..q0.
+	push	ebp
+	mov	ebp, esp
+	and	esp, -16				; fffffff0H - round esp down to a 16-byte boundary
+	sub	esp, 628				; 00000274H - scratch area (640 bytes once ebx/esi/edi are pushed)
+	mov	eax, dword [ebp+8]	; eax = pPix (address of the q0 row)
+	mov	ecx, dword [ebp+12]	; ecx = iStride
+	push	ebx
+	push	esi
+
+	lea	edx, [ecx*4]	; edx = 4*iStride
+	pxor	xmm0, xmm0
+	movdqa	xmm2, xmm0	; xmm2 = 0, the zero half for byte->word widening of columns 0-7
+
+	movdqa	xmm0, [ecx+eax]	; raw q1 row
+	mov	esi, eax
+	sub	esi, edx
+	movdqa	xmm3, [esi]	; xmm3 = raw p3 row
+	movdqa	xmm5, [eax]	; xmm5 = raw q0 row
+	push	edi
+	lea	edi, [ecx+ecx]	; edi = 2*iStride
+	lea	ebx, [ecx+ecx*2]	; ebx = 3*iStride
+	mov	dword [esp+640-600], edi
+	mov	esi, eax
+	sub	esi, edi	; esi = pPix - 2*iStride; kept until the final store of the p1 row
+	movdqa	xmm1, [esi]	; xmm1 = raw p1 row
+	movdqa	 [esp+720-272], xmm0	; raw q1 row
+	mov	edi, eax
+	sub	edi, ecx	; edi = pPix - iStride; kept until the final store of the p0 row
+	movdqa	xmm4, [edi]	; xmm4 = raw p0 row
+	add	ecx, eax
+	mov	dword [esp+640-596], ecx	; save address of the q1 row
+
+	mov	ecx, dword [esp+640-600]
+	movdqa	xmm0, [ecx+eax]
+	movdqa	 [esp+736-272], xmm0	; raw q2 row
+
+	movdqa	xmm0, [eax+ebx]
+	mov	edx, eax
+	sub	edx, ebx	; edx = pPix - 3*iStride; kept until the final store of the p2 row
+
+	movsx	ebx, word [ebp+16]	; ebx = low 16 bits of iAlpha, sign-extended
+	movdqa	xmm6, [edx]	; xmm6 = raw p2 row
+	add	ecx, eax	; ecx = pPix + 2*iStride; kept until the final store of the q2 row
+	movdqa	 [esp+752-272], xmm0	; raw q3 row
+	movd	xmm0, ebx
+
+	movsx	ebx, word [ebp+20]	; ebx = low 16 bits of iBeta, sign-extended
+	movdqa	xmm7, xmm0
+	punpcklwd xmm7, xmm0
+	pshufd	xmm0, xmm7, 0
+	movdqa	 [esp+640-320], xmm0	; alpha x8
+	movd	xmm0, ebx
+	movdqa	xmm7, xmm0
+	punpcklwd xmm7, xmm0
+	pshufd	xmm0, xmm7, 0	; beta x8
+
+	movdqa	xmm7, [esp+736-272]
+	punpcklbw xmm7, xmm2
+	movdqa	 [esp+640-416], xmm7	; q2 low words (columns 0-7)
+	movdqa	 [esp+640-512], xmm0	; beta x8
+	movdqa	xmm0, xmm1
+	movdqa	 [esp+672-272], xmm1	; raw p1 row
+	movdqa	xmm1, xmm4
+	movdqa	 [esp+704-272], xmm5	; raw q0 row
+	punpcklbw xmm5, xmm2	; xmm5 = q0 low words
+	punpcklbw xmm1, xmm2	; xmm1 = p0 low words
+
+	movdqa	xmm7, xmm5
+	psubw	xmm7, xmm1
+	pabsw	xmm7, xmm7
+	movdqa	 [esp+640-560], xmm7	; |q0 - p0|
+	punpcklbw xmm0, xmm2
+	movdqa	 [esp+688-272], xmm4	; raw p0 row
+	movdqa	xmm4, [esp+720-272]
+	movdqa	 [esp+640-480], xmm0	; p1 low words
+
+	movdqa	xmm7, xmm1
+	psubw	xmm7, xmm0
+
+	movdqa	xmm0, [esp+640-512]
+	pabsw	xmm7, xmm7
+	punpcklbw xmm4, xmm2
+	pcmpgtw	xmm0, xmm7	; beta > |p0 - p1|
+	movdqa	 [esp+640-384], xmm4	; q1 low words
+	movdqa	xmm7, xmm5
+	psubw	xmm7, xmm4
+	movdqa	xmm4, [esp+640-512]
+	movdqa	 [esp+656-272], xmm6	; raw p2 row
+	punpcklbw xmm6, xmm2	; xmm6 = p2 low words
+	pabsw	xmm7, xmm7
+	movdqa	 [esp+640-48], xmm2	; zero constant (needed after xmm2 is reused)
+	movdqa	 [esp+640-368], xmm6	; p2 low words
+	movdqa	 [esp+640-144], xmm1	; p0 low words
+	movdqa	 [esp+640-400], xmm5	; q0 low words
+	pcmpgtw	xmm4, xmm7	; beta > |q0 - q1|
+	pand	xmm0, xmm4
+	movdqa	xmm4, [esp+640-320]
+	pcmpgtw	xmm4, [esp+640-560]	; alpha > |q0 - p0|
+	pand	xmm0, xmm4	; xmm0 = "enable" mask for columns 0-7, kept until the end
+
+	mov	ebx, 2
+	movsx	ebx, bx
+	movd	xmm4, ebx
+	movdqa	xmm7, xmm4
+	punpcklwd xmm7, xmm4
+	movdqa	xmm4, [esp+640-320]
+	psraw	xmm4, 2
+	pshufd	xmm7, xmm7, 0	; constant 2 x8
+	paddw	xmm4, xmm7
+	movdqa	 [esp+640-576], xmm4	; (alpha >> 2) + 2
+	pcmpgtw	xmm4, [esp+640-560]
+	movdqa	 [esp+640-560], xmm4	; slot reused: (alpha >> 2) + 2 > |q0 - p0|
+
+	movdqa	xmm4, [esp+640-512]
+	movdqa	 [esp+640-624], xmm7	; constant 2 x8
+	movdqa	xmm7, xmm1
+	psubw	xmm7, xmm6
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm4, xmm7	; beta > |p0 - p2|
+
+	pand	xmm4, [esp+640-560]
+	movdqa	 [esp+640-544], xmm4	; "pmask": ((alpha>>2)+2 > |q0-p0|) & (beta > |p0-p2|)
+	movdqa	xmm4, [esp+640-512]
+	movdqa	xmm7, xmm5
+	psubw	xmm7, [esp+640-416]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm4, xmm7	; beta > |q0 - q2|
+
+	pand	xmm4, [esp+640-560]
+	movdqa	 [esp+640-560], xmm4	; "qmask": ((alpha>>2)+2 > |q0-p0|) & (beta > |q0-q2|)
+
+	movdqa	xmm4, [esp+640-544]
+	pandn	xmm4, xmm6
+	movdqa	 [esp+640-16], xmm4	; ~pmask & p2 (p2 is left unchanged where pmask is false)
+	mov	ebx, 4
+	movsx	ebx, bx
+	movd	xmm4, ebx
+	movdqa	xmm7, xmm4
+	punpcklwd xmm7, xmm4
+	movdqa	xmm4, xmm3
+	punpcklbw xmm4, xmm2
+	psllw	xmm4, 1	; 2*p3
+	paddw	xmm4, xmm6
+	paddw	xmm4, xmm6
+	paddw	xmm4, xmm6	; + 3*p2
+	paddw	xmm4, [esp+640-480]	; + p1
+
+	movdqa	xmm6, [esp+640-560]
+	pshufd	xmm7, xmm7, 0
+	paddw	xmm4, xmm1
+	movdqa	 [esp+640-592], xmm7	; constant 4 x8
+	paddw	xmm4, xmm5
+	paddw	xmm4, xmm7	; xmm4 = 2*p3 + 3*p2 + p1 + p0 + q0 + 4 (shifted and masked further down)
+	movdqa	xmm7, [esp+640-416]
+	pandn	xmm6, xmm7
+	movdqa	 [esp+640-80], xmm6	; ~qmask & q2
+	movdqa	xmm6, [esp+752-272]
+	punpcklbw xmm6, xmm2
+	psllw	xmm6, 1	; 2*q3
+	paddw	xmm6, xmm7
+	paddw	xmm6, xmm7
+	paddw	xmm6, xmm7	; + 3*q2
+	paddw	xmm6, [esp+640-384]	; + q1
+
+	movdqa	xmm7, [esp+640-480]
+	paddw	xmm6, xmm5
+	paddw	xmm6, xmm1
+	paddw	xmm6, [esp+640-592]
+	psraw	xmm6, 3
+	pand	xmm6, [esp+640-560]
+	movdqa	 [esp+640-112], xmm6	; qmask & ((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3)
+	movdqa	xmm6, [esp+640-544]
+	pandn	xmm6, xmm7
+	movdqa	 [esp+640-336], xmm6	; ~pmask & p1
+	movdqa	xmm6, [esp+640-544]
+	movdqa	 [esp+640-528], xmm6	; temporary copy of the mask currently in use
+	movdqa	xmm6, [esp+640-368]
+	paddw	xmm6, xmm7
+	movdqa	xmm7, xmm1
+	psraw	xmm4, 3
+	pand	xmm4, [esp+640-544]	; xmm4 = pmask & ((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3)
+	paddw	xmm7, xmm5
+	paddw	xmm6, xmm7
+	paddw	xmm6, [esp+640-624]
+	movdqa	xmm7, [esp+640-528]
+
+	paddw	xmm5, xmm1	; xmm5 = q0 + p0
+	psraw	xmm6, 2
+	pand	xmm7, xmm6
+
+	movdqa	xmm6, [esp+640-384]
+	movdqa	 [esp+640-64], xmm7	; pmask & ((p2 + p1 + p0 + q0 + 2) >> 2)
+	movdqa	xmm7, [esp+640-560]
+	pandn	xmm7, xmm6
+	movdqa	 [esp+640-304], xmm7	; ~qmask & q1
+	movdqa	xmm7, [esp+640-560]
+	movdqa	 [esp+640-528], xmm7
+	movdqa	xmm7, [esp+640-416]
+	paddw	xmm7, xmm6
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+640-624]
+	movdqa	xmm5, [esp+640-528]
+	psraw	xmm7, 2
+	pand	xmm5, xmm7
+	movdqa	 [esp+640-32], xmm5	; qmask & ((q2 + q1 + q0 + p0 + 2) >> 2)
+
+	movdqa	xmm5, [esp+640-544]
+	movdqa	 [esp+640-528], xmm5
+	movdqa	xmm5, [esp+640-480]
+	movdqa	xmm7, xmm5
+	paddw	xmm7, xmm5	; 2*p1
+	movdqa	xmm5, xmm1
+	paddw	xmm5, xmm6	; p0 + q1
+	paddw	xmm6, [esp+640-592]	; xmm6 = q1 + 4
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+640-624]
+	movdqa	xmm5, [esp+640-528]
+	psraw	xmm7, 2
+	pandn	xmm5, xmm7	; ~pmask & ((2*p1 + p0 + q1 + 2) >> 2)
+	movdqa	xmm7, [esp+640-480]
+	paddw	xmm7, xmm1
+	paddw	xmm7, [esp+640-400]	; p1 + p0 + q0
+	movdqa	xmm1, [esp+640-544]
+	movdqa	 [esp+640-352], xmm5
+	movdqa	xmm5, [esp+640-368]
+	psllw	xmm7, 1
+	paddw	xmm7, xmm6
+	paddw	xmm5, xmm7	; p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4
+
+	movdqa	xmm7, [esp+640-400]
+	psraw	xmm5, 3
+	pand	xmm1, xmm5
+	movdqa	xmm5, [esp+640-480]
+	movdqa	 [esp+640-96], xmm1	; pmask & ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3)
+	movdqa	xmm1, [esp+640-560]
+	movdqa	 [esp+640-528], xmm1
+	movdqa	xmm1, [esp+640-384]
+	movdqa	xmm6, xmm1
+	paddw	xmm6, xmm1	; 2*q1
+	paddw	xmm1, [esp+640-400]
+	paddw	xmm1, [esp+640-144]	; q1 + q0 + p0
+	paddw	xmm7, xmm5	; q0 + p1
+	paddw	xmm5, [esp+640-592]	; p1 + 4
+	paddw	xmm6, xmm7
+	paddw	xmm6, [esp+640-624]
+	movdqa	xmm7, [esp+640-528]
+	psraw	xmm6, 2
+	psllw	xmm1, 1
+	paddw	xmm1, xmm5
+
+	movdqa	xmm5, [esp+656-272]
+	pandn	xmm7, xmm6	; ~qmask & ((2*q1 + q0 + p1 + 2) >> 2)
+	movdqa	xmm6, [esp+640-416]
+	paddw	xmm6, xmm1	; q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4
+	movdqa	xmm1, [esp+640-560]
+	psraw	xmm6, 3
+	pand	xmm1, xmm6
+
+	movdqa	xmm6, [esp+704-272]
+	movdqa	 [esp+640-128], xmm1	; qmask & ((q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4) >> 3)
+	movdqa	xmm1, [esp+672-272]
+	punpckhbw xmm1, xmm2
+	movdqa	 [esp+640-448], xmm1	; p1 high words (columns 8-15)
+	movdqa	xmm1, [esp+688-272]
+	punpckhbw xmm1, xmm2
+	punpckhbw xmm6, xmm2
+	movdqa	 [esp+640-288], xmm7	; the ~qmask q0 candidate computed above
+	punpckhbw xmm5, xmm2	; xmm5 = p2 high words
+	movdqa	 [esp+640-496], xmm1	; p0 high words
+	movdqa	 [esp+640-432], xmm6	; q0 high words
+
+	movdqa	xmm7, [esp+720-272]
+	punpckhbw xmm7, xmm2
+	movdqa	 [esp+640-464], xmm7	; q1 high words
+
+	movdqa	xmm7, [esp+736-272]
+	punpckhbw xmm7, xmm2
+	movdqa	 [esp+640-528], xmm7	; q2 high words
+	; ---- columns 8-15: same masks as above; the low-half results are merged in as registers free up ----
+	movdqa	xmm7, xmm6
+
+	psubw	xmm6, [esp+640-464]
+	psubw	xmm7, xmm1
+	pabsw	xmm7, xmm7
+	movdqa	 [esp+640-560], xmm7	; slot reused: |q0 - p0| (high)
+	por	xmm4, [esp+640-16]	; low half: p2 candidate selected by pmask
+	pabsw	xmm6, xmm6
+	movdqa	xmm7, xmm1
+	psubw	xmm7, [esp+640-448]
+
+	movdqa	xmm1, [esp+640-512]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm1, xmm7	; beta > |p0 - p1|
+	movdqa	xmm7, [esp+640-512]
+	pcmpgtw	xmm7, xmm6	; beta > |q0 - q1|
+	movdqa	xmm6, [esp+640-320]
+	pand	xmm1, xmm7
+	movdqa	xmm7, [esp+640-560]
+	pcmpgtw	xmm6, xmm7	; alpha > |q0 - p0|
+	pand	xmm1, xmm6	; xmm1 = "enable" mask for columns 8-15, kept until the end
+
+	movdqa	xmm6, [esp+640-576]
+	pcmpgtw	xmm6, xmm7
+
+	movdqa	xmm7, [esp+640-496]
+	punpckhbw xmm3, xmm2	; xmm3 = p3 high words
+	movdqa	 [esp+640-560], xmm6	; (alpha >> 2) + 2 > |q0 - p0| (high)
+	movdqa	xmm6, [esp+640-512]
+	psubw	xmm7, xmm5
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm6, xmm7	; beta > |p0 - p2|
+
+	pand	xmm6, [esp+640-560]
+	movdqa	xmm7, [esp+640-432]
+	psubw	xmm7, [esp+640-528]
+
+	psllw	xmm3, 1
+	movdqa	 [esp+640-544], xmm6	; slot reused: pmask (high)
+	movdqa	xmm6, [esp+640-512]
+
+	movdqa	xmm2, [esp+640-544]	; xmm2 is no longer zero from here on
+	paddw	xmm3, xmm5
+	paddw	xmm3, xmm5
+	paddw	xmm3, xmm5
+	paddw	xmm3, [esp+640-448]
+	paddw	xmm3, [esp+640-496]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm6, xmm7	; beta > |q0 - q2|
+	pand	xmm6, [esp+640-560]
+	movdqa	 [esp+640-560], xmm6	; slot reused: qmask (high)
+
+	movdqa	xmm6, xmm0
+	pand	xmm6, xmm4
+	movdqa	xmm4, xmm0
+	pandn	xmm4, [esp+640-368]
+	por	xmm6, xmm4	; xmm6 = final p2 low words: filtered where enabled, original p2 elsewhere
+	movdqa	xmm4, [esp+640-432]
+	paddw	xmm3, xmm4
+	paddw	xmm3, [esp+640-592]
+	psraw	xmm3, 3	; (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3 (high)
+	pand	xmm3, xmm2
+	pandn	xmm2, xmm5
+	por	xmm3, xmm2
+	movdqa	xmm7, xmm1
+	pand	xmm7, xmm3
+	movdqa	xmm3, [esp+640-64]
+	por	xmm3, [esp+640-336]	; low half: p1 candidate selected by pmask
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm5
+	por	xmm7, xmm2	; xmm7 = final p2 high words
+
+	movdqa	xmm2, xmm0
+	pand	xmm2, xmm3
+	movdqa	xmm3, xmm0
+	pandn	xmm3, [esp+640-480]
+	por	xmm2, xmm3	; final p1 low words
+	packuswb xmm6, xmm7
+	movdqa	 [esp+640-336], xmm2
+	movdqa	 [esp+656-272], xmm6	; slot reused: new p2 row
+	movdqa	xmm6, [esp+640-544]
+	movdqa	xmm2, xmm5
+	paddw	xmm2, [esp+640-448]
+	movdqa	xmm3, xmm1
+	movdqa	xmm7, [esp+640-496]
+	paddw	xmm7, xmm4
+	paddw	xmm2, xmm7
+	paddw	xmm2, [esp+640-624]
+	movdqa	xmm7, [esp+640-544]
+	psraw	xmm2, 2	; (p2 + p1 + p0 + q0 + 2) >> 2 (high)
+	pand	xmm6, xmm2
+	movdqa	xmm2, [esp+640-448]
+	pandn	xmm7, xmm2
+	por	xmm6, xmm7
+	pand	xmm3, xmm6
+	movdqa	xmm6, xmm1
+	pandn	xmm6, xmm2
+	paddw	xmm2, [esp+640-496]
+	paddw	xmm2, xmm4	; xmm2 = p1 + p0 + q0 (high), reused for the p0 formula
+	por	xmm3, xmm6	; final p1 high words
+	movdqa	xmm6, [esp+640-336]
+	packuswb xmm6, xmm3
+	psllw	xmm2, 1
+	movdqa	 [esp+672-272], xmm6	; slot reused: new p1 row
+	movdqa	xmm6, [esp+640-96]
+	por	xmm6, [esp+640-352]	; low half: p0 candidate selected by pmask
+
+	movdqa	xmm3, xmm0
+	pand	xmm3, xmm6
+	movdqa	xmm6, xmm0
+	pandn	xmm6, [esp+640-144]
+	por	xmm3, xmm6
+	movdqa	xmm6, [esp+640-544]
+	movdqa	 [esp+640-352], xmm3	; final p0 low words
+	movdqa	xmm3, [esp+640-464]
+	paddw	xmm3, [esp+640-592]
+	paddw	xmm2, xmm3
+	movdqa	xmm3, [esp+640-448]
+	paddw	xmm5, xmm2	; p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 (high)
+	movdqa	xmm2, [esp+640-496]
+	psraw	xmm5, 3
+	pand	xmm6, xmm5
+	movdqa	xmm5, [esp+640-464]
+	paddw	xmm2, xmm5
+	paddw	xmm5, [esp+640-432]	; xmm5 = q1 + q0 (high), reused for the q0 formula
+	movdqa	xmm4, xmm3
+	paddw	xmm4, xmm3
+	paddw	xmm4, xmm2
+	paddw	xmm4, [esp+640-624]
+	movdqa	xmm2, [esp+640-544]
+	paddw	xmm3, [esp+640-592]	; xmm3 = p1 + 4 (high)
+	psraw	xmm4, 2	; (2*p1 + p0 + q1 + 2) >> 2 (high)
+	pandn	xmm2, xmm4
+	por	xmm6, xmm2
+	movdqa	xmm7, xmm1
+	pand	xmm7, xmm6
+	movdqa	xmm6, [esp+640-496]
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm6
+	por	xmm7, xmm2	; final p0 high words
+	movdqa	xmm2, [esp+640-352]
+	packuswb xmm2, xmm7
+	movdqa	 [esp+688-272], xmm2	; slot reused: new p0 row
+	movdqa	xmm2, [esp+640-128]
+	por	xmm2, [esp+640-288]	; low half: q0 candidate selected by qmask
+
+	movdqa	xmm4, xmm0
+	pand	xmm4, xmm2
+	paddw	xmm5, xmm6	; q1 + q0 + p0 (high)
+	movdqa	xmm2, xmm0
+	pandn	xmm2, [esp+640-400]
+	por	xmm4, xmm2	; final q0 low words
+	movdqa	xmm2, [esp+640-528]
+	psllw	xmm5, 1
+	paddw	xmm5, xmm3
+	movdqa	xmm3, [esp+640-560]
+	paddw	xmm2, xmm5	; q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4 (high)
+	psraw	xmm2, 3
+	movdqa	 [esp+640-288], xmm4
+	movdqa	xmm4, [esp+640-560]
+	pand	xmm4, xmm2
+	movdqa	xmm2, [esp+640-464]
+	movdqa	xmm5, xmm2
+	paddw	xmm5, xmm2
+	movdqa	xmm2, [esp+640-432]
+	paddw	xmm2, [esp+640-448]
+	movdqa	xmm7, xmm1
+	paddw	xmm5, xmm2
+	paddw	xmm5, [esp+640-624]
+	movdqa	xmm6, [esp+640-560]
+	psraw	xmm5, 2	; (2*q1 + q0 + p1 + 2) >> 2 (high)
+	pandn	xmm3, xmm5
+	por	xmm4, xmm3
+	movdqa	xmm3, [esp+640-32]
+	por	xmm3, [esp+640-304]	; low half: q1 candidate selected by qmask
+	pand	xmm7, xmm4
+	movdqa	xmm4, [esp+640-432]
+	movdqa	xmm5, [esp+640-464]	; xmm5 = q1 high words
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm4
+	paddw	xmm4, [esp+640-496]	; xmm4 = q0 + p0 (high)
+	por	xmm7, xmm2	; final q0 high words
+	movdqa	xmm2, [esp+640-288]
+	packuswb xmm2, xmm7
+	movdqa	 [esp+704-272], xmm2	; slot reused: new q0 row
+
+	movdqa	xmm2, xmm0
+	pand	xmm2, xmm3
+	movdqa	xmm3, xmm0
+	pandn	xmm3, [esp+640-384]
+	por	xmm2, xmm3
+	movdqa	 [esp+640-304], xmm2	; final q1 low words
+	movdqa	xmm2, [esp+640-528]	; xmm2 = q2 high words
+	movdqa	xmm3, xmm2
+	paddw	xmm3, [esp+640-464]
+	paddw	xmm3, xmm4
+	paddw	xmm3, [esp+640-624]
+	psraw	xmm3, 2	; (q2 + q1 + q0 + p0 + 2) >> 2 (high)
+	pand	xmm6, xmm3
+	movdqa	xmm3, [esp+640-560]
+	movdqa	xmm4, xmm3
+	pandn	xmm4, xmm5
+	por	xmm6, xmm4
+	movdqa	xmm7, xmm1
+	pand	xmm7, xmm6
+	movdqa	xmm6, [esp+640-304]
+	movdqa	xmm4, xmm1
+	pandn	xmm4, xmm5
+	por	xmm7, xmm4	; final q1 high words
+
+	movdqa	xmm4, xmm0
+	pandn	xmm0, [esp+640-416]
+	packuswb xmm6, xmm7	; xmm6 = new q1 row
+	movdqa	xmm7, [esp+640-112]
+	por	xmm7, [esp+640-80]	; low half: q2 candidate selected by qmask
+	pand	xmm4, xmm7
+	por	xmm4, xmm0	; final q2 low words
+	movdqa	xmm0, [esp+752-272]
+	punpckhbw xmm0, [esp+640-48]	; q3 high words; the stored zero is used because xmm2 was reused
+	psllw	xmm0, 1
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm5
+	paddw	xmm0, [esp+640-432]
+	paddw	xmm0, [esp+640-496]
+	paddw	xmm0, [esp+640-592]
+	psraw	xmm0, 3	; (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3 (high)
+	pand	xmm0, xmm3
+	movdqa	xmm7, xmm1
+	pandn	xmm3, xmm2
+	por	xmm0, xmm3
+	pand	xmm7, xmm0
+
+	movdqa	xmm0, [esp+656-272]
+	movdqa	 [edx], xmm0	; store new p2 row at pPix - 3*iStride
+
+	movdqa	xmm0, [esp+672-272]
+
+	mov	edx, dword [esp+640-596]	; edx = address of the q1 row
+	movdqa	 [esi], xmm0	; store new p1 row at pPix - 2*iStride
+	movdqa	xmm0, [esp+688-272]
+	movdqa	 [edi], xmm0	; store new p0 row at pPix - iStride
+	movdqa	xmm0, [esp+704-272]
+
+	pop	edi
+	pandn	xmm1, xmm2
+	movdqa	 [eax], xmm0	; store new q0 row at pPix
+	por	xmm7, xmm1	; final q2 high words
+	pop	esi
+	packuswb xmm4, xmm7
+	movdqa	 [edx], xmm6	; store new q1 row at pPix + iStride
+	movdqa	 [ecx], xmm4	; store new q2 row at pPix + 2*iStride
+	pop	ebx
+	mov	esp, ebp
+	pop	ebp
+	ret
+
+
+;********************************************************************************
+;
+;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
+;
+;********************************************************************************
+; Reads 8 bytes from each of 16 consecutive rows at pPixY (row pitch iStride), runs SSE2_TransTwo8x8B on them, and writes eight 16-byte rows to pDst.
+WELS_EXTERN  DeblockLumaTransposeH2V_sse2
+; pDst is written with movdqa, so it must be 16-byte aligned; the source rows are read with movq.
+ALIGN  16
+; ebp is set after ebx is pushed, so the arguments are at [ebp+0Ch]=pPixY, [ebp+10h]=iStride, [ebp+14h]=pDst.
+DeblockLumaTransposeH2V_sse2:
+    push    ebp
+    push    ebx
+    mov     ebp,   esp
+    and     esp,0FFFFFFF0h  ; round esp down to a 16-byte boundary
+    sub     esp,   10h  ; one aligned 16-byte scratch slot at [esp]
+
+    mov     eax,   [ebp + 0Ch]  ; eax = pPixY (row 0)
+    mov     ecx,   [ebp + 10h]  ; ecx = iStride
+    lea     edx,   [eax + ecx * 8]  ; edx = row 8
+    lea     ebx,   [ecx*3]  ; ebx = 3*iStride
+    ; each xmmN below gets row N in its low qword and row N+8 in its high qword
+    movq    xmm0,  [eax]
+    movq    xmm7,  [edx]
+    punpcklqdq   xmm0,  xmm7
+    movq    xmm1,  [eax + ecx]
+    movq    xmm7,  [edx + ecx]
+    punpcklqdq   xmm1,  xmm7
+    movq    xmm2,  [eax + ecx*2]
+    movq    xmm7,  [edx + ecx*2]
+    punpcklqdq   xmm2,  xmm7
+    movq    xmm3,  [eax + ebx]
+    movq    xmm7,  [edx + ebx]
+    punpcklqdq   xmm3,  xmm7
+
+    lea     eax,   [eax + ecx * 4]  ; eax = row 4
+    lea     edx,   [edx + ecx * 4]  ; edx = row 12
+    movq    xmm4,  [eax]
+    movq    xmm7,  [edx]
+    punpcklqdq   xmm4,  xmm7
+    movq    xmm5,  [eax + ecx]
+    movq    xmm7,  [edx + ecx]
+    punpcklqdq   xmm5,  xmm7
+    movq    xmm6,  [eax + ecx*2]
+    movq    xmm7,  [edx + ecx*2]
+    punpcklqdq   xmm6,  xmm7
+    ; all eight xmm registers are live, so xmm0 is spilled to free a temporary for rows 7 and 15
+    movdqa  [esp],   xmm0
+    movq    xmm7,  [eax + ebx]
+    movq    xmm0,  [edx + ebx]
+    punpcklqdq   xmm7,  xmm0
+    movdqa  xmm0,   [esp]
+    ; SSE2_TransTwo8x8B is defined outside this routine; presumably it transposes two 8x8 byte blocks using [esp] as scratch - confirm against the macro
+    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
+    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
+    ; the store order below follows the pOut register order (m5 = xmm4, m3 = xmm2, ..., m1 = xmm0)
+    mov    eax,   [ebp + 14h]  ; eax = pDst
+    movdqa  [eax],    xmm4
+    movdqa  [eax + 10h],  xmm2
+    movdqa  [eax + 20h],  xmm3
+    movdqa  [eax + 30h],  xmm7
+    movdqa  [eax + 40h],  xmm5
+    movdqa  [eax + 50h],  xmm1
+    movdqa  [eax + 60h],  xmm6
+    movdqa  [eax + 70h],  xmm0
+
+    mov     esp,   ebp  ; drop the aligned scratch area
+    pop     ebx
+    pop     ebp
+    ret
+
+
+
+;*******************************************************************************************
+;
+;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
+;
+;*******************************************************************************************
+; Loads eight 16-byte rows from pSrc, runs SSE2_TransTwo8x8B on them, and writes 8 bytes to each of 16 consecutive rows at pPixY (row pitch iStride).
+WELS_EXTERN   DeblockLumaTransposeV2H_sse2
+; pSrc is read with movdqa, so it must be 16-byte aligned; the destination rows are written with movq.
+ALIGN  16
+; Stack arguments: [ebp+08h]=pPixY, [ebp+0Ch]=iStride, [ebp+10h]=pSrc. Only eax, ecx and edx are modified.
+DeblockLumaTransposeV2H_sse2:
+    push     ebp
+    mov      ebp,   esp
+
+    and     esp,  0FFFFFFF0h  ; round esp down to a 16-byte boundary
+    sub     esp,   10h  ; one aligned 16-byte scratch slot at [esp] for the macro
+
+    mov      eax,   [ebp + 10h]  ; eax = pSrc
+    mov      ecx,   [ebp + 0Ch]  ; ecx = iStride
+    mov      edx,   [ebp + 08h]  ; edx = pPixY (row 0)
+
+    movdqa   xmm0,  [eax]
+    movdqa   xmm1,  [eax + 10h]
+    movdqa   xmm2,  [eax + 20h]
+    movdqa   xmm3,	[eax + 30h]
+    movdqa   xmm4,	[eax + 40h]
+    movdqa   xmm5,	[eax + 50h]
+    movdqa   xmm6,	[eax + 60h]
+    movdqa   xmm7,	[eax + 70h]
+    ; SSE2_TransTwo8x8B is defined outside this routine; presumably it transposes two 8x8 byte blocks using [esp] as scratch - confirm against the macro
+    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
+    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
+    ; the store order below follows the pOut register order (m5 = xmm4, m3 = xmm2, ..., m1 = xmm0)
+    lea      eax,   [ecx * 3]  ; eax = 3*iStride
+    ; rows 0-3: low 8 bytes of the first four output registers
+    movq     [edx],  xmm4
+    movq     [edx + ecx],  xmm2
+    movq     [edx + ecx*2],  xmm3
+    movq     [edx + eax],  xmm7
+    ; rows 4-7: low 8 bytes of the remaining four
+    lea      edx,   [edx + ecx*4]
+    movq     [edx],  xmm5
+    movq     [edx + ecx],  xmm1
+    movq     [edx + ecx*2],  xmm6
+    movq     [edx + eax],  xmm0
+    ; move the high 8 bytes of every register down into the low half
+    psrldq    xmm4,   8
+    psrldq    xmm2,   8
+    psrldq    xmm3,   8
+    psrldq    xmm7,   8
+    psrldq    xmm5,   8
+    psrldq    xmm1,   8
+    psrldq    xmm6,   8
+    psrldq    xmm0,   8
+    ; rows 8-11
+    lea       edx,  [edx + ecx*4]
+    movq     [edx],  xmm4
+    movq     [edx + ecx],  xmm2
+    movq     [edx + ecx*2],  xmm3
+    movq     [edx + eax],  xmm7
+    ; rows 12-15
+    lea      edx,   [edx + ecx*4]
+    movq     [edx],  xmm5
+    movq     [edx + ecx],  xmm1
+    movq     [edx + ecx*2],  xmm6
+    movq     [edx + eax],  xmm0
+
+
+    mov      esp,   ebp  ; drop the aligned scratch area
+    pop      ebp
     ret
\ No newline at end of file
--- a/codec/decoder/core/asm/expand_picture.asm
+++ b/codec/decoder/core/asm/expand_picture.asm
@@ -155,11 +155,11 @@
 	lea %1, [%1+%2]
 %endmacro
 
-%macro exp_top_bottom_sse2	1	; iPaddingSize [luma(32)/chroma(16)]		
+%macro exp_top_bottom_sse2	1	; iPaddingSize [luma(32)/chroma(16)]
 	; ebx [width/16(8)]
 	; esi [pSrc+0], edi [pSrc-1], ecx [-stride], 32(16)		; top
 	; eax [pSrc+(h-1)*stride], ebp [pSrc+(h+31)*stride], 32(16)	; bottom
-		
+
 %if %1 == 32		; for luma
 	sar ebx, 04h 	; width / 16(8) pixels
 .top_bottom_loops:
@@ -173,7 +173,7 @@
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
 	mov_line_end16x4_sse2 edi, ecx, xmm0, a
-	
+
 	; bottom
 	movdqa xmm1, [eax] 		; last line of picture pData
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a	; dst, stride, xmm?
@@ -184,15 +184,15 @@
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
 	mov_line_end16x4_sse2 ebp, ecx, xmm1, a
-		
+
 	lea esi, [esi+16]		; top pSrc
 	lea edi, [edi+16]		; top dst
 	lea eax, [eax+16]		; bottom pSrc
 	lea ebp, [ebp+16]		; bottom dst
-	neg ecx 			; positive/negative stride need for next loop?	
-	
+	neg ecx 			; positive/negative stride need for next loop?
+
 	dec ebx
-	jnz near .top_bottom_loops		
+	jnz near .top_bottom_loops
 %elif %1 == 16	; for chroma ??
 	mov edx, ebx
 	sar ebx, 04h 	; (width / 16) pixels
@@ -202,21 +202,21 @@
 	mov_line_16x4_sse2 edi, ecx, xmm0, a	; dst, stride, xmm?
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
-	mov_line_end16x4_sse2 edi, ecx, xmm0, a	
-	
+	mov_line_end16x4_sse2 edi, ecx, xmm0, a
+
 	; bottom
 	movdqa xmm1, [eax] 		; last line of picture pData
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a	; dst, stride, xmm?
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
-	mov_line_end16x4_sse2 ebp, ecx, xmm1, a	
-		
+	mov_line_end16x4_sse2 ebp, ecx, xmm1, a
+
 	lea esi, [esi+16]		; top pSrc
 	lea edi, [edi+16]		; top dst
 	lea eax, [eax+16]		; bottom pSrc
 	lea ebp, [ebp+16]		; bottom dst
-	neg ecx 			; positive/negative stride need for next loop?	
-	
+	neg ecx 			; positive/negative stride need for next loop?
+
 	dec ebx
 	jnz near .top_bottom_loops
 
@@ -243,13 +243,13 @@
 %endif
 %endmacro
 
-%macro exp_left_right_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a	
+%macro exp_left_right_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a
 	; ecx [height]
 	; esi [pSrc+0], 	   edi [pSrc-32], edx [stride], 32(16)	; left
 	; ebx [pSrc+(w-1)], ebp [pSrc+w], 32(16)			; right
 ;	xor eax, eax 	; for pixel pData (uint8_t)		; make sure eax=0 at least high 24 bits of eax = 0
-	
-%if %1 == 32		; for luma	
+
+%if %1 == 32		; for luma
 .left_right_loops:
 	; left
 	mov al, byte [esi]		; pixel pData for left border
@@ -256,37 +256,37 @@
 	butterfly_1to16_sse	xmm0, xmm1, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
 	movdqa [edi], xmm0
 	movdqa [edi+16], xmm0
-	
+
 	; right
 	mov al, byte [ebx]
 	butterfly_1to16_sse	xmm1, xmm2, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
 	movdqa [ebp], xmm1
 	movdqa [ebp+16], xmm1
-	
+
 	lea esi, [esi+edx]		; left pSrc
 	lea edi, [edi+edx]		; left dst
 	lea ebx, [ebx+edx]		; right pSrc
-	lea ebp, [ebp+edx]		; right dst	
-	
+	lea ebp, [ebp+edx]		; right dst
+
 	dec ecx
-	jnz near .left_right_loops		
-%elif %1 == 16	; for chroma ??	
+	jnz near .left_right_loops
+%elif %1 == 16	; for chroma ??
 .left_right_loops:
 	; left
 	mov al, byte [esi]		; pixel pData for left border
 	butterfly_1to16_sse	xmm0, xmm1, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
-	movdqa [edi], xmm0	
-	
+	movdqa [edi], xmm0
+
 	; right
 	mov al, byte [ebx]
 	butterfly_1to16_sse	xmm1, xmm2, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
 	movdq%2 [ebp], xmm1								; might not be aligned 16 bytes in case chroma planes
-	
+
 	lea esi, [esi+edx]		; left pSrc
 	lea edi, [edi+edx]		; left dst
 	lea ebx, [ebx+edx]		; right pSrc
-	lea ebp, [ebp+edx]		; right dst	
-	
+	lea ebp, [ebp+edx]		; right dst
+
 	dec ecx
 	jnz near .left_right_loops
 %endif
@@ -339,25 +339,25 @@
 	; TL
 	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
 	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?	
+	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
 	mov_line_end16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
 
 	; TR
 	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
 	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?	
+	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
 	mov_line_end16x4_sse2 ebp, ecx, xmm4, %2	; dst, stride, xmm?
 
 	; BL
 	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
 	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?	
+	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
 	mov_line_end16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
 
 	; BR
 	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
 	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?	
+	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
 	mov_line_end16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
 %endif
 %endmacro
@@ -375,7 +375,7 @@
 	push esi
 	push edi
 	push ebp
-	
+
 	; for both top and bottom border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; pDst
@@ -387,10 +387,10 @@
 	mov cl, byte [esi]
 	butterfly_1to16_sse xmm3, xmm4, c		; pDst, tmp, pSrc [generic register name: a/b/c/d]
 	; load top border
-	mov ecx, edx							; kiStride	
+	mov ecx, edx							; kiStride
 	neg ecx 								; -kiStride
 	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border 
+	; load bottom border
 	dec eax									; h-1
 	imul eax, edx 							; (h-1)*kiStride
 	lea eax, [esi+eax]						; last line of picture pData
@@ -398,16 +398,16 @@
 	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*stride + 32 * stride
 	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
 	dec ebx									; kiWidth-1
-	lea ebx, [eax+ebx]						; dst[w-1][h-1]	
+	lea ebx, [eax+ebx]						; dst[w-1][h-1]
 ;	xor edx, edx
 	mov dl, byte [eax]						; bottom-left
 	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	mov dl, byte [ebx]						; bottom-right
 	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding	
+	; for top & bottom expanding
 	mov ebx, [esp+32]						; kiWidth
-	exp_top_bottom_sse2	32	
-	
+	exp_top_bottom_sse2	32
+
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst: left border pSrc
@@ -419,7 +419,7 @@
 	lea edi, [esi+eax]						; left border dst
 	dec ebx
 	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst	
+	lea ebp, [ebx+1]						; right border dst
 	; prepare for cross border pData: top-right with xmm4
 ;	xor eax, eax
 	mov al, byte [ebx]						; top-right
@@ -426,7 +426,7 @@
 	butterfly_1to16_sse xmm4, xmm0, a		; pDst, tmp, pSrc [generic register name: a/b/c/d]
 	; for left & right border expanding
 	exp_left_right_sse2	32, a
-	
+
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; pDst
@@ -436,7 +436,7 @@
 	; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
 	mov eax, -32							; luma=-32, chroma=-16
 	neg ecx										; -stride
-	lea edi, [esi+eax]						
+	lea edi, [esi+eax]
 	lea edi, [edi+ecx]				; last line of top-left border
 	lea ebp, [esi+ebx]
 	lea ebp, [ebp+ecx]				; last line of top-right border
@@ -444,19 +444,19 @@
 	mov ecx, [esp+28]					; kiStride
 	imul edx, ecx							; (height+32(16)) * stride
 	lea eax, [edi+edx]						; last line of bottom-left border
-	lea ebx, [ebp+edx]						; last line of bottom-right border	
+	lea ebx, [ebp+edx]						; last line of bottom-right border
 	neg ecx										; -kiStride
 	; for left & right border expanding
-	exp_cross_sse2		32, a	
-	
+	exp_cross_sse2		32, a
+
 ;	sfence									; commit cache write back memory
-	
+
 	pop ebp
 	pop edi
 	pop esi
 	pop edx
 	pop ebx
-	
+
 	ret
 
 ALIGN 16
@@ -472,7 +472,7 @@
 	push esi
 	push edi
 	push ebp
-	
+
 	; for both top and bottom border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; pDst
@@ -484,10 +484,10 @@
 	mov cl, byte [esi]
 	butterfly_1to16_sse xmm3, xmm4, c		; pDst, tmp, pSrc [generic register name: a/b/c/d]
 	; load top border
-	mov ecx, edx							; kiStride	
+	mov ecx, edx							; kiStride
 	neg ecx 								; -kiStride
 	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border 
+	; load bottom border
 	dec eax									; h-1
 	imul eax, edx 							; (h-1)*kiStride
 	lea eax, [esi+eax]						; last line of picture pData
@@ -495,16 +495,16 @@
 	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*kiStride + 16 * kiStride
 	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
 	dec ebx									; kiWidth-1
-	lea ebx, [eax+ebx]						; pDst[w-1][h-1]	
+	lea ebx, [eax+ebx]						; pDst[w-1][h-1]
 ;	xor edx, edx
 	mov dl, byte [eax]						; bottom-left
 	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	mov dl, byte [ebx]						; bottom-right
 	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding	
+	; for top & bottom expanding
 	mov ebx, [esp+32]						; kiWidth
-	exp_top_bottom_sse2	16	
-	
+	exp_top_bottom_sse2	16
+
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; pDst: left border pSrc
@@ -516,7 +516,7 @@
 	lea edi, [esi+eax]						; left border dst
 	dec ebx
 	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst	
+	lea ebp, [ebx+1]						; right border dst
 	; prepare for cross border pData: top-right with xmm4
 ;	xor eax, eax
 	mov al, byte [ebx]						; top-right
@@ -523,7 +523,7 @@
 	butterfly_1to16_sse xmm4, xmm0, a		; pDst, tmp, pSrc [generic register name: a/b/c/d]
 	; for left & right border expanding
 	exp_left_right_sse2	16, a
-	
+
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; pDst
@@ -533,9 +533,9 @@
 	; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
 	mov eax, -16							; chroma=-16
 	neg ecx										; -stride
-	lea edi, [esi+eax]						
+	lea edi, [esi+eax]
 	lea edi, [edi+ecx]				; last line of top-left border
-	lea ebp, [esi+ebx]				
+	lea ebp, [esi+ebx]
 	lea ebp, [ebp+ecx]				; last line of top-right border
 	mov ecx, [esp+28]						; kiStride
 	add edx, 16							; height+16, luma=32, chroma=16
@@ -545,15 +545,15 @@
 	neg ecx										; -kiStride
 	; for left & right border expanding
 	exp_cross_sse2		16, a
-	
+
 ;	sfence									; commit cache write back memory
-	
+
 	pop ebp
 	pop edi
 	pop esi
 	pop edx
 	pop ebx
-	
+
 	ret
 
 ALIGN 16
@@ -569,7 +569,7 @@
 	push esi
 	push edi
 	push ebp
-	
+
 	; for both top and bottom border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; pDst
@@ -581,10 +581,10 @@
 	mov cl, byte [esi]
 	butterfly_1to16_sse xmm3, xmm4, c		; pDst, tmp, pSrc [generic register name: a/b/c/d]
 	; load top border
-	mov ecx, edx							; kiStride	
+	mov ecx, edx							; kiStride
 	neg ecx 								; -kiStride
 	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border 
+	; load bottom border
 	dec eax									; h-1
 	imul eax, edx 							; (h-1)*kiStride
 	lea eax, [esi+eax]						; last line of picture pData
@@ -592,16 +592,16 @@
 	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*kiStride + 16 * kiStride
 	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
 	dec ebx									; kiWidth-1
-	lea ebx, [eax+ebx]						; dst[w-1][h-1]	
+	lea ebx, [eax+ebx]						; dst[w-1][h-1]
 ;	xor edx, edx
 	mov dl, byte [eax]						; bottom-left
 	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	mov dl, byte [ebx]						; bottom-right
 	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding	
+	; for top & bottom expanding
 	mov ebx, [esp+32]						; kiWidth
-	exp_top_bottom_sse2	16	
-	
+	exp_top_bottom_sse2	16
+
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst: left border pSrc
@@ -613,7 +613,7 @@
 	lea edi, [esi+eax]						; left border dst
 	dec ebx
 	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst	
+	lea ebp, [ebx+1]						; right border dst
 	; prepare for cross border pData: top-right with xmm4
 ;	xor eax, eax
 	mov al, byte [ebx]						; top-right
@@ -620,7 +620,7 @@
 	butterfly_1to16_sse xmm4, xmm0, a		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	; for left & right border expanding
 	exp_left_right_sse2	16, u
-	
+
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst
@@ -630,9 +630,9 @@
 	; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
 	neg ecx									; -kiStride
 	mov eax, -16							; chroma=-16
-	lea edi, [esi+eax]						
+	lea edi, [esi+eax]
 	lea edi, [edi+ecx]				; last line of top-left border
-	lea ebp, [esi+ebx]						
+	lea ebp, [esi+ebx]
 	lea ebp, [ebp+ecx]				; last line of top-right border
 	mov ecx, [esp+28]						; kiStride
 	add edx, 16							; kiHeight+16, luma=32, chroma=16
@@ -642,14 +642,14 @@
 	neg ecx									; -kiStride
 	; for left & right border expanding
 	exp_cross_sse2		16, u
-	
+
 ;	sfence									; commit cache write back memory
-	
+
 	pop ebp
 	pop edi
 	pop esi
 	pop edx
 	pop ebx
-	
+
 	ret
 
--- a/codec/decoder/core/asm/intra_pred.asm
+++ b/codec/decoder/core/asm/intra_pred.asm
@@ -38,7 +38,7 @@
 ;*      18/09/2009 Created
 ;*		19/11/2010 Added
 ;*					WelsI16x16LumaPredDcTop_sse2, WelsI16x16LumaPredDcNA_sse2,
-;*					WelsIChromaPredDcLeft_mmx, WelsIChromaPredDcTop_sse2 
+;*					WelsIChromaPredDcLeft_mmx, WelsIChromaPredDcTop_sse2
 ;*					and WelsIChromaPredDcNA_mmx
 ;*
 ;*
@@ -96,13 +96,13 @@
 	punpcklbw	%1,	%3
 	movdqa		%3,	%1
 	punpcklbw	%1,	%3
-	
+
 	;add			%4,	%5
 	movd		%2,	[%4+%5-1]
 	movdqa		%3,	%2
 	punpcklbw	%2,	%3
 	movdqa		%3,	%2
-	punpcklbw	%2,	%3	
+	punpcklbw	%2,	%3
 	punpckldq	%1,	%2
 %endmacro
 
@@ -116,24 +116,24 @@
 		movd	%2,	[%5+%6]
 		punpcklbw %3,	%2
 		punpcklwd %1,	%3
-		lea		%5,	[%5+2*%6]	
+		lea		%5,	[%5+2*%6]
 		movd	%4,	[%5]
 		movd	%2,	[%5+%6]
 		punpcklbw %4,	%2
-		lea		%5,	[%5+2*%6]	
+		lea		%5,	[%5+2*%6]
 		movd	%3,	[%5]
 		movd	%2,	[%5+%6]
 		lea		%5,	[%5+2*%6]
 		punpcklbw %3,	%2
 		punpcklwd %4,	%3
-		punpckhdq %1,	%4	
-%endmacro	
+		punpckhdq %1,	%4
+%endmacro
 
 %macro  SUMW_HORIZON 3
 	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
 	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
-	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04 
-	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26 
+	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04
+	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
 	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
 	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
 	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
@@ -162,7 +162,7 @@
 		movd	%2,	[%5+%6]
 		punpcklbw %3,	%2
 		punpckhwd %1,	%3
-		lea		%5,	[%5+2*%6]			
+		lea		%5,	[%5+2*%6]
 %endmacro
 
 %macro LOAD_2_LEFT_AND_ADD 0
@@ -186,7 +186,7 @@
 ALIGN 16
 ;*******************************************************************************
 ;   void_t __cdecl WelsI4x4LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride)
-;   
+;
 ;	pPred must align to 16
 ;*******************************************************************************
 WelsI4x4LumaPredH_sse2:
@@ -196,7 +196,7 @@
 	movzx		edx,	byte [eax-1]
 	movd		xmm0,	edx
 	pmuludq		xmm0,	[mmx_01bytes]
-	
+
 	movzx		edx,	byte [eax+ecx-1]
 	movd		xmm1,	edx
 	pmuludq		xmm1,	[mmx_01bytes]
@@ -205,11 +205,11 @@
 	movzx		edx,	byte [eax+ecx-1]
 	movd		xmm2,	edx
 	pmuludq		xmm2,	[mmx_01bytes]
-	
+
 	movzx		edx,	byte [eax+2*ecx-1]
-	movd		xmm3,	edx	
+	movd		xmm3,	edx
 	pmuludq		xmm3,	[mmx_01bytes]
-	
+
 	sub         eax,    ecx
 	movd        [eax], xmm0
 	movd        [eax+ecx], xmm1
@@ -216,9 +216,9 @@
 	lea         eax, [eax+2*ecx]
 	movd        [eax], xmm2
 	movd        [eax+ecx], xmm3
-	
+
 	ret
-	
+
 ;*******************************************************************************
 ; void_t WelsI16x16LumaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
 ;*******************************************************************************
@@ -229,9 +229,9 @@
 		mov		ecx,	[esp + pushsize + 8]
 		sub		esi,	1
 		sub		esi,	ecx
-		
+
 		;for H
-		pxor	xmm7,	xmm7	
+		pxor	xmm7,	xmm7
 		movq	xmm0,	[esi]
 		movdqa	xmm5,	[sse2_plane_dec]
 		punpcklbw xmm0,	xmm7
@@ -241,7 +241,7 @@
 		punpcklbw xmm1,	xmm7
 		pmullw	xmm1,	xmm6
 		psubw	xmm1,	xmm0
-		
+
 		SUMW_HORIZON	xmm1,xmm0,xmm2
 		movd    eax,	xmm1		; H += (i + 1) * (top[8 + i] - top[6 - i]);
 		movsx	eax,	ax
@@ -249,26 +249,26 @@
 		add		eax,	32
 		sar		eax,	6			; b = (5 * H + 32) >> 6;
 		SSE2_Copy8Times	xmm1, eax	; xmm1 = b,b,b,b,b,b,b,b
-		
-		movzx	edx,	BYTE [esi+16]	
+
+		movzx	edx,	BYTE [esi+16]
 		sub	esi, 3
 		LOAD_COLUMN		xmm0, xmm2, xmm3, xmm4, esi, ecx
-			
+
 		add		esi,	3
 		movzx	eax,	BYTE [esi+8*ecx]
 		add		edx,	eax
 		shl		edx,	4			;	a = (left[15*kiStride] + top[15]) << 4;
-		
+
 		sub	esi, 3
 		add		esi,	ecx
 		LOAD_COLUMN		xmm7, xmm2, xmm3, xmm4, esi, ecx
-		pxor	xmm4,	xmm4	
+		pxor	xmm4,	xmm4
 		punpckhbw xmm0,	xmm4
 		pmullw	xmm0,	xmm5
 		punpckhbw xmm7,	xmm4
 		pmullw	xmm7,	xmm6
 		psubw	xmm7,	xmm0
-		
+
 		SUMW_HORIZON   xmm7,xmm0,xmm2
 		movd    eax,   xmm7			; V
 		movsx	eax,	ax
@@ -276,17 +276,17 @@
 		imul	eax,	5
 		add		eax,	32
 		sar		eax,	6				; c = (5 * V + 32) >> 6;
-		SSE2_Copy8Times	xmm4, eax		; xmm4 = c,c,c,c,c,c,c,c		
-		
+		SSE2_Copy8Times	xmm4, eax		; xmm4 = c,c,c,c,c,c,c,c
+
 		mov		esi,	[esp + pushsize + 4]
 		add		edx,	16
 		imul	eax,	-7
-		add		edx,	eax				; s = a + 16 + (-7)*c		
-		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s		
-		
+		add		edx,	eax				; s = a + 16 + (-7)*c
+		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s
+
 		xor		eax,	eax
 		movdqa	xmm5,	[sse2_plane_inc_minus]
-		
+
 get_i16x16_luma_pred_plane_sse2_1:
 		movdqa	xmm2,	xmm1
 		pmullw	xmm2,	xmm5
@@ -295,7 +295,7 @@
 		movdqa	xmm3,	xmm1
 		pmullw	xmm3,	xmm6
 		paddw	xmm3,	xmm0
-		psraw	xmm3,	5	
+		psraw	xmm3,	5
 		packuswb xmm2,	xmm3
 		movdqa	[esi],	xmm2
 		paddw	xmm0,	xmm4
@@ -302,13 +302,13 @@
 		add		esi,	ecx
 		inc		eax
 		cmp		eax,	16
-		jnz get_i16x16_luma_pred_plane_sse2_1					
-		
+		jnz get_i16x16_luma_pred_plane_sse2_1
+
 		pop		esi
 		ret
-		
-		
-		
+
+
+
 ;*******************************************************************************
 ; void_t WelsI16x16LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride);
 ;*******************************************************************************
@@ -315,7 +315,7 @@
 
 %macro SSE2_PRED_H_16X16_TWO_LINE_DEC 0
     lea     eax,	[eax+ecx*2]
-    
+
     COPY_16_TIMES eax,	xmm0
     movdqa  [eax],	xmm0
     COPY_16_TIMESS eax,	xmm0,	ecx
@@ -326,13 +326,12 @@
 WelsI16x16LumaPredH_sse2:
     mov     eax, [esp+4]    ; pPred
     mov     ecx, [esp+8]    ; kiStride
-    
+
     COPY_16_TIMES eax,	xmm0
     movdqa  [eax],		xmm0
     COPY_16_TIMESS eax,	xmm0,	ecx
     movdqa  [eax+ecx],	xmm0
-    
-	SSE2_PRED_H_16X16_TWO_LINE_DEC 
+
 	SSE2_PRED_H_16X16_TWO_LINE_DEC
 	SSE2_PRED_H_16X16_TWO_LINE_DEC
 	SSE2_PRED_H_16X16_TWO_LINE_DEC
@@ -339,9 +338,10 @@
 	SSE2_PRED_H_16X16_TWO_LINE_DEC
 	SSE2_PRED_H_16X16_TWO_LINE_DEC
 	SSE2_PRED_H_16X16_TWO_LINE_DEC
-   
+	SSE2_PRED_H_16X16_TWO_LINE_DEC
+
     ret
-    
+
 ;*******************************************************************************
 ; void_t WelsI16x16LumaPredV_sse2(uint8_t *pPred, const int32_t kiStride);
 ;*******************************************************************************
@@ -349,10 +349,10 @@
 WelsI16x16LumaPredV_sse2:
     mov     edx, [esp+4]    ; pPred
     mov     ecx, [esp+8]    ; kiStride
-    
+
     sub     edx, ecx
     movdqa  xmm0, [edx]
-    
+
     movdqa  [edx+ecx], xmm0
     lea     edx, [edx+2*ecx]
     movdqa  [edx],     xmm0
@@ -377,9 +377,9 @@
     movdqa  [edx+ecx], xmm0
     lea     edx, [edx+2*ecx]
     movdqa  [edx],     xmm0
-        
+
     ret
-    
+
 ;*******************************************************************************
 ; void_t WelsIChromaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
 ;*******************************************************************************
@@ -391,8 +391,8 @@
 		mov		ecx,	[esp + pushsize + 8]	;kiStride
 		sub		esi,	1
 		sub		esi,	ecx
-		
-		pxor	mm7,	mm7	
+
+		pxor	mm7,	mm7
 		movq	mm0,	[esi]
 		movq	mm5,	[sse2_plane_dec_c]
 		punpcklbw mm0,	mm7
@@ -402,7 +402,7 @@
 		punpcklbw mm1,	mm7
 		pmullw	mm1,	mm6
 		psubw	mm1,	mm0
-		
+
 		movq2dq xmm1,   mm1
 		pxor    xmm2,   xmm2
 		SUMW_HORIZON	xmm1,xmm0,xmm2
@@ -412,7 +412,7 @@
 		add		eax,	16
 		sar		eax,	5			; b = (17 * H + 16) >> 5;
 		SSE2_Copy8Times	xmm1, eax	; mm1 = b,b,b,b,b,b,b,b
-		
+
 		movzx	edx,	BYTE [esi+8]
 		sub	esi, 3
 		LOAD_COLUMN_C	mm0, mm2, mm3, mm4, esi, ecx
@@ -421,17 +421,17 @@
 		movzx	eax,	BYTE [esi+4*ecx]
 		add		edx,	eax
 		shl		edx,	4			; a = (left[7*kiStride] + top[7]) << 4;
-		
+
 		sub	esi, 3
 		add		esi,	ecx
 		LOAD_COLUMN_C	mm7, mm2, mm3, mm4, esi, ecx
-		pxor	mm4,	mm4	
+		pxor	mm4,	mm4
 		punpckhbw mm0,	mm4
 		pmullw	mm0,	mm5
 		punpckhbw mm7,	mm4
 		pmullw	mm7,	mm6
 		psubw	mm7,	mm0
-		
+
 		movq2dq xmm7,   mm7
 		pxor    xmm2,   xmm2
 		SUMW_HORIZON	xmm7,xmm0,xmm2
@@ -441,17 +441,17 @@
 		imul	eax,	17
 		add		eax,	16
 		sar		eax,	5				; c = (17 * V + 16) >> 5;
-		SSE2_Copy8Times	xmm4, eax		; mm4 = c,c,c,c,c,c,c,c		
-		
+		SSE2_Copy8Times	xmm4, eax		; mm4 = c,c,c,c,c,c,c,c
+
 		mov		esi,	[esp + pushsize + 4]
 		add		edx,	16
 		imul	eax,	-3
-		add		edx,	eax				; s = a + 16 + (-3)*c		
-		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s		
-		
+		add		edx,	eax				; s = a + 16 + (-3)*c
+		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s
+
 		xor		eax,	eax
 		movdqa	xmm5,	[sse2_plane_mul_b_c]
-		
+
 get_i_chroma_pred_plane_sse2_1:
 		movdqa	xmm2,	xmm1
 		pmullw	xmm2,	xmm5
@@ -463,12 +463,12 @@
 		add		esi,	ecx
 		inc		eax
 		cmp		eax,	8
-		jnz get_i_chroma_pred_plane_sse2_1					
-		
+		jnz get_i_chroma_pred_plane_sse2_1
+
 		pop		esi
 		WELSEMMS
-		ret	
-		
+		ret
+
 ALIGN 16
 ;*******************************************************************************
 ;	0 |1 |2 |3 |4 |
@@ -480,13 +480,13 @@
 ;	pPred[7] = ([6]+[0]*2+[1]+2)/4
 ;
 ;   void_t __cdecl WelsI4x4LumaPredDDR_mmx(uint8_t *pPred, const int32_t kiStride)
-;   
+;
 ;*******************************************************************************
-WelsI4x4LumaPredDDR_mmx:	
+WelsI4x4LumaPredDDR_mmx:
 	mov			edx,[esp+4]			;pPred
 	mov         eax,edx
 	mov			ecx,[esp+8]		;kiStride
-	
+
 	movq        mm1,[eax+ecx-8]		;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
 	movq        mm2,[eax-8]			;get value of 6 mm2[8] = 6
 	sub			eax, ecx			;mov eax to above line of current block(postion of 1)
@@ -513,19 +513,19 @@
 	pand        mm1,[mmx_01bytes]	;set the odd bit
 	psubusb     mm3,mm1				;decrease 1 from odd bytes
 	pavgb       mm2,mm3				;mm2=(([11]+[21]+1)/2+1+[16])/2
-	
+
 	lea         edx,[edx+ecx]
-	movd        [edx+2*ecx],mm2 
+	movd        [edx+2*ecx],mm2
 	sub         edx,ecx
-	psrlq       mm2,8 
-	movd        [edx+2*ecx],mm2 
-	psrlq       mm2,8 
-	movd        [edx+ecx],mm2 
-	psrlq       mm2,8 
+	psrlq       mm2,8
+	movd        [edx+2*ecx],mm2
+	psrlq       mm2,8
+	movd        [edx+ecx],mm2
+	psrlq       mm2,8
 	movd        [edx],mm2
 	WELSEMMS
 	ret
-	
+
 ALIGN 16
 ;*******************************************************************************
 ;	0 |1 |2 |3 |4 |
@@ -537,36 +537,36 @@
 ;	pPred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
 ;
 ;   void_t __cdecl WelsI4x4LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
-;   
+;
 ;*******************************************************************************
-WelsI4x4LumaPredDc_sse2:	
+WelsI4x4LumaPredDc_sse2:
 	mov         eax,[esp+4]			;pPred
 	mov			ecx,[esp+8]			;kiStride
 	push		ebx
-		
+
 	movzx		edx,	byte [eax-1h]
-	
+
 	sub			eax,	ecx
 	movd		xmm0,	[eax]
 	pxor		xmm1,	xmm1
 	psadbw		xmm0,	xmm1
-	
+
 	movd		ebx,	xmm0
 	add			ebx,	edx
-	
+
 	movzx		edx,	byte [eax+ecx*2-1h]
 	add			ebx,	edx
-	
+
 	lea			eax,	[eax+ecx*2-1]
 	movzx		edx,	byte [eax+ecx]
 	add			ebx,	edx
-	
+
 	movzx		edx,	byte [eax+ecx*2]
 	add			ebx,	edx
 	add			ebx,	4
 	sar			ebx,	3
 	imul		ebx,	0x01010101
-	
+
 	mov			edx,	[esp+8]			;pPred
 	mov         [edx],       ebx
 	mov         [edx+ecx],   ebx
@@ -575,8 +575,8 @@
 	mov         [edx+ecx],   ebx
 
 	pop ebx
-	ret	
-	
+	ret
+
 ALIGN 16
 ;*******************************************************************************
 ;	void_t __cdecl WelsIChromaPredH_mmx(uint8_t *pPred, const int32_t kiStride)
@@ -585,7 +585,7 @@
 %macro MMX_PRED_H_8X8_ONE_LINE 4
 	movq		%1,		[%3-8]
 	psrlq		%1,		38h
-	
+
 	pmullw		%1,		[mmx_01bytes]
 	pshufw		%1,		%1,	0
 	movq		[%4],	%1
@@ -594,7 +594,7 @@
 %macro MMX_PRED_H_8X8_ONE_LINEE 4
 	movq		%1,		[%3+ecx-8]
 	psrlq		%1,		38h
-	
+
 	pmullw		%1,		[mmx_01bytes]
 	pshufw		%1,		%1,	0
 	movq		[%4],	%1
@@ -605,37 +605,37 @@
 	mov			edx,	[esp+4]			;pPred
 	mov         eax,	edx
 	mov			ecx,	[esp+8]			;kiStride
-	
+
 	movq		mm0,	[eax-8]
 	psrlq		mm0,	38h
-	
+
 	pmullw		mm0,		[mmx_01bytes]
 	pshufw		mm0,	mm0,	0
 	movq		[edx],	mm0
-	
+
 	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
-	
+
 	lea			eax, [eax+ecx*2]
 	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax, edx+2*ecx
-	
+
 	lea         edx, [edx+2*ecx]
 	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
-	
+
 	lea			eax, [eax+ecx*2]
 	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax, edx+2*ecx
-	
+
 	lea         edx, [edx+2*ecx]
 	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
-	
+
 	lea			eax, [eax+ecx*2]
 	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax, edx+2*ecx
 
     lea         edx, [edx+2*ecx]
 	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
-		
+
 	WELSEMMS
-	ret	
-	
+	ret
+
 ALIGN 16
 ;*******************************************************************************
 ;	void_t __cdecl get_i4x4_luma_pred_v_asm(uint8_t *pPred, const int32_t kiStride)
@@ -645,7 +645,7 @@
 get_i4x4_luma_pred_v_asm:
 	mov			eax,	[esp+4]        ;pPred
 	mov			ecx,	[esp+8]        ;kiStride
-	
+
 	sub			eax,	ecx
 	mov         edx,    [eax]
 	mov		    [eax+ecx],	 edx
@@ -653,9 +653,9 @@
 	lea         eax, [eax+2*ecx]
 	mov			[eax+ecx],	 edx
 	mov			[eax+2*ecx], edx
-	
-	ret	
 
+	ret
+
 ALIGN 16
 ;*******************************************************************************
 ;	void_t __cdecl WelsIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride)
@@ -665,7 +665,7 @@
 WelsIChromaPredV_mmx:
 	mov			eax,		[esp+4]    ;pPred
 	mov			ecx,		[esp+8]    ;kiStride
-	
+
 	sub			eax,		ecx
 	movq		mm0,		[eax]
 
@@ -680,11 +680,11 @@
 	lea         eax, [eax+2*ecx]
 	movq		[eax+ecx],      mm0
 	movq		[eax+2*ecx],    mm0
-	
+
 	WELSEMMS
 	ret
-	
-	
+
+
 	ALIGN 16
 ;*******************************************************************************
 ;	lt|t0|t1|t2|t3|
@@ -710,13 +710,13 @@
 
 ;   f = (2 + l1 + (l0<<1) + lt)>>2
 ;   h = (2 + l2 + (l1<<1) + l0)>>2
-;   j = (2 + l3 + (l2<<1) + l1)>>2   
+;   j = (2 + l3 + (l2<<1) + l1)>>2
 ;   [b a f e h g j i] + [d c b a] --> mov to memory
-;   
+;
 ;   void_t WelsI4x4LumaPredHD_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsI4x4LumaPredHD_mmx
-WelsI4x4LumaPredHD_mmx:	
+WelsI4x4LumaPredHD_mmx:
 	mov			edx, [esp+4]			; pPred
 	mov         eax, edx
 	mov			ecx, [esp+8]            ; kiStride
@@ -723,16 +723,16 @@
 	sub         eax, ecx
 	movd        mm0, [eax-1]            ; mm0 = [xx xx xx xx t2 t1 t0 lt]
 	psllq       mm0, 20h                ; mm0 = [t2 t1 t0 lt xx xx xx xx]
-	
-	movd        mm1, [eax+2*ecx-4]        
-	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1	
+
+	movd        mm1, [eax+2*ecx-4]
+	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1
 	lea         eax, [eax+2*ecx]
-	movd        mm2, [eax+2*ecx-4]        
+	movd        mm2, [eax+2*ecx-4]
 	punpcklbw   mm2, [eax+ecx-4]        ; mm2[7] = l2, mm2[6] = l3
 	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
 	psrlq       mm2, 20h
 	pxor        mm0, mm2                ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
-	
+
 	movq        mm1, mm0
 	psrlq       mm1, 10h                ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
 	movq        mm2, mm0
@@ -740,17 +740,17 @@
 	movq        mm3, mm2
 	movq        mm4, mm1
 	pavgb       mm1, mm0
-	
+
 	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
 	pand        mm4, [mmx_01bytes]	    ; set the odd bit
 	psubusb     mm1, mm4				; decrease 1 from odd bytes
-	
+
 	pavgb       mm2, mm1                ; mm2 = [xx xx d  c  b  f  h  j]
-	
+
 	movq        mm4, mm0
 	pavgb       mm3, mm4                ; mm3 = [xx xx xx xx a  e  g  i]
 	punpcklbw   mm3, mm2                ; mm3 = [b  a  f  e  h  g  j  i]
-	
+
 	psrlq       mm2, 20h
 	psllq       mm2, 30h                ; mm2 = [d  c  0  0  0  0  0  0]
 	movq        mm4, mm3
@@ -757,7 +757,7 @@
 	psrlq       mm4, 10h                ; mm4 = [0  0  b  a  f  e  h  j]
 	pxor        mm2, mm4                ; mm2 = [d  c  b  a  xx xx xx xx]
 	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx  d  c  b  a]
-	
+
 	movd        [edx], mm2
 	lea         edx, [edx+ecx]
 	movd        [edx+2*ecx], mm3
@@ -768,9 +768,9 @@
 	movd        [edx+ecx], mm3
 	WELSEMMS
 	ret
-	
-	
-	
+
+
+
 ALIGN 16
 ;*******************************************************************************
 ;	lt|t0|t1|t2|t3|
@@ -793,17 +793,17 @@
 ;   b = (2 + l0 + (l1<<1) + l2)>>2
 ;   d = (2 + l1 + (l2<<1) + l3)>>2
 ;   f = (2 + l2 + (l3<<1) + l3)>>2
- 
+
 ;   [g g f e d c b a] + [g g g g] --> mov to memory
-;   
+;
 ;   void_t WelsI4x4LumaPredHU_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsI4x4LumaPredHU_mmx
-WelsI4x4LumaPredHU_mmx:	
+WelsI4x4LumaPredHU_mmx:
 	mov			edx, [esp+4]			; pPred
 	mov         eax, edx
 	mov			ecx, [esp+8]            ; kiStride
-	
+
 	movd        mm0, [eax-4]            ; mm0[3] = l0
 	punpcklbw   mm0, [eax+ecx-4]        ; mm0[7] = l1, mm0[6] = l0
 	lea         eax, [eax+2*ecx]
@@ -811,39 +811,39 @@
 	movd        mm4, [eax+ecx-4]        ; mm4[3] = l3
 	punpcklbw   mm2, mm4
 	punpckhwd   mm0, mm2                ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
-	
+
 	psrlq       mm4, 18h
 	psllq       mm4, 38h                ; mm4 = [l3 xx xx xx xx xx xx xx]
 	psrlq       mm0, 8h
 	pxor        mm0, mm4                ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
-	
+
 	movq        mm1, mm0
 	psllq       mm1, 8h                 ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
 	movq        mm3, mm1                ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
 	pavgb       mm1, mm0                ; mm1 = [g  e  c  a  xx xx xx xx]
-	
+
 	movq        mm2, mm0
 	psllq       mm2, 10h                ; mm2 = [l2 l1 l0 xx xx xx xx xx]
 	movq        mm5, mm2
 	pavgb       mm2, mm0
-	
+
 	pxor        mm5, mm0				; find odd value in the lowest bit of each byte
 	pand        mm5, [mmx_01bytes]	    ; set the odd bit
 	psubusb     mm2, mm5				; decrease 1 from odd bytes
-	
+
 	pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]
-	
+
 	psrlq       mm2, 8h
 	pxor        mm2, mm4                ; mm2 = [g  f  d  b  xx xx xx xx]
-	
+
 	punpckhbw   mm1, mm2                ; mm1 = [g  g  f  e  d  c  b  a]
 	punpckhbw   mm4, mm4                ; mm4 = [g  g  xx xx xx xx xx xx]
 	punpckhbw   mm4, mm4                ; mm4 = [g  g  g  g  xx xx xx xx]
-	
+
 	psrlq       mm4, 20h
 	lea         edx, [edx+ecx]
 	movd        [edx+2*ecx], mm4
-	
+
 	sub         edx, ecx
 	movd        [edx], mm1
 	psrlq       mm1, 10h
@@ -852,9 +852,9 @@
 	movd        [edx+2*ecx], mm1
 	WELSEMMS
 	ret
-	
-	
-	
+
+
+
 ALIGN 16
 ;*******************************************************************************
 ;	lt|t0|t1|t2|t3|
@@ -880,12 +880,12 @@
 
 ;   h = (2 + t1 + (t2<<1) + t3)>>2
 ;   i = (2 + lt + (l0<<1) + l1)>>2
-;   j = (2 + l0 + (l1<<1) + l2)>>2   
-;   
+;   j = (2 + l0 + (l1<<1) + l2)>>2
+;
 ;   void_t WelsI4x4LumaPredVR_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsI4x4LumaPredVR_mmx
-WelsI4x4LumaPredVR_mmx:	
+WelsI4x4LumaPredVR_mmx:
 	mov			edx, [esp+4]			; pPred
 	mov         eax, edx
 	mov			ecx, [esp+8]            ; kiStride
@@ -892,51 +892,51 @@
 	sub         eax, ecx
 	movq        mm0, [eax-1]            ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
 	psllq       mm0, 18h                ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
-	
-	movd        mm1, [eax+2*ecx-4]        
-	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1	
+
+	movd        mm1, [eax+2*ecx-4]
+	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1
 	lea         eax, [eax+2*ecx]
 	movq        mm2, [eax+ecx-8]        ; mm2[7] = l2
 	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 xx xx xx xx xx]
 	psrlq       mm2, 28h
 	pxor        mm0, mm2                ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
-	
+
 	movq        mm1, mm0
 	psllq       mm1, 8h                 ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
 	pavgb       mm1, mm0                ; mm1 = [d  c  b  a  xx xx xx xx]
-	
+
 	movq        mm2, mm0
 	psllq       mm2, 10h                ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
 	movq        mm3, mm2
 	pavgb       mm2, mm0
-	
+
 	pxor        mm3, mm0				; find odd value in the lowest bit of each byte
 	pand        mm3, [mmx_01bytes]	    ; set the odd bit
 	psubusb     mm2, mm3				; decrease 1 from odd bytes
-	
+
 	movq        mm3, mm0
 	psllq       mm3, 8h                 ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
 	pavgb       mm3, mm2                ; mm3 = [h  g  f  e  i  j  xx xx]
 	movq        mm2, mm3
-	
+
 	psrlq       mm1, 20h                ; mm1 = [xx xx xx xx d  c  b  a]
 	movd        [edx], mm1
-	
+
 	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx h  g  f  e]
 	movd        [edx+ecx], mm2
-	
+
 	movq        mm4, mm3
 	psllq       mm4, 20h
 	psrlq       mm4, 38h                ; mm4 = [xx xx xx xx xx xx xx i]
-	
+
 	movq        mm5, mm3
 	psllq       mm5, 28h
 	psrlq       mm5, 38h                ; mm5 = [xx xx xx xx xx xx xx j]
-	
+
 	psllq       mm1, 8h
 	pxor        mm4, mm1                ; mm4 = [xx xx xx xx c  b  a  i]
 	movd        [edx+2*ecx], mm4
-	
+
 	psllq       mm2, 8h
 	pxor        mm5, mm2                ; mm5 = [xx xx xx xx g  f  e  j]
 	lea         edx, [edx+2*ecx]
@@ -943,7 +943,7 @@
 	movd        [edx+ecx], mm5
 	WELSEMMS
 	ret
-	
+
 ALIGN 16
 ;*******************************************************************************
 ;	lt|t0|t1|t2|t3|t4|t5|t6|t7
@@ -966,13 +966,13 @@
 ;   e = (2 + t4 + t6 + (t5<<1))>>2
 ;   f = (2 + t5 + t7 + (t6<<1))>>2
 ;   g = (2 + t6 + t7 + (t7<<1))>>2
- 
+
 ;   [g f e d c b a] --> mov to memory
-;   
+;
 ;   void_t WelsI4x4LumaPredDDL_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsI4x4LumaPredDDL_mmx
-WelsI4x4LumaPredDDL_mmx:	
+WelsI4x4LumaPredDDL_mmx:
 	mov			edx, [esp+4]			; pPred
 	mov         eax, edx
 	mov			ecx, [esp+8]            ; kiStride
@@ -980,11 +980,11 @@
 	movq        mm0, [eax]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
 	movq        mm1, mm0
 	movq        mm2, mm0
-	
+
 	movq        mm3, mm0
 	psrlq       mm3, 38h
 	psllq       mm3, 38h                ; mm3 = [t7 xx xx xx xx xx xx xx]
-	
+
 	psllq       mm1, 8h                 ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
 	psrlq       mm2, 8h
 	pxor        mm2, mm3                ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
@@ -994,9 +994,9 @@
 	pxor        mm3, mm2				; find odd value in the lowest bit of each byte
 	pand        mm3, [mmx_01bytes]	    ; set the odd bit
 	psubusb     mm1, mm3				; decrease 1 from odd bytes
-	
+
 	pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
-	
+
 	psrlq       mm0, 8h
 	movd        [edx], mm0
 	psrlq       mm0, 8h
@@ -1008,8 +1008,8 @@
 	movd        [edx+ecx], mm0
 	WELSEMMS
 	ret
-	
-	
+
+
 ALIGN 16
 ;*******************************************************************************
 ;	lt|t0|t1|t2|t3|t4|t5|t6|t7
@@ -1035,40 +1035,40 @@
 ;   g = (2 + t2 + (t3<<1) + t4)>>2
 ;   h = (2 + t3 + (t4<<1) + t5)>>2
 ;   j = (2 + t4 + (t5<<1) + t6)>>2
- 
+
 ;   [i d c b a] + [j h g f e] --> mov to memory
-;   
+;
 ;   void_t WelsI4x4LumaPredVL_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsI4x4LumaPredVL_mmx
-WelsI4x4LumaPredVL_mmx:	
+WelsI4x4LumaPredVL_mmx:
 	mov			edx, [esp+4]			; pPred
 	mov         eax, edx
 	mov			ecx, [esp+8]            ; kiStride
-	
+
 	sub         eax, ecx
 	movq        mm0, [eax]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
 	movq        mm1, mm0
 	movq        mm2, mm0
-	
+
 	psrlq       mm1, 8h                 ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
 	psrlq       mm2, 10h                ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
 
 	movq        mm3, mm1
 	pavgb       mm3, mm0                ; mm3 = [xx xx xx i  d  c  b  a]
-	
+
 	movq        mm4, mm2
-	pavgb       mm2, mm0	
+	pavgb       mm2, mm0
 	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
 	pand        mm4, [mmx_01bytes]	    ; set the odd bit
 	psubusb     mm2, mm4				; decrease 1 from odd bytes
-	
+
 	pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]
-	
+
 	movd        [edx], mm3
 	psrlq       mm3, 8h
 	movd        [edx+2*ecx], mm3
-	
+
 	movd        [edx+ecx], mm2
 	psrlq       mm2, 8h
 	lea         edx, [edx+2*ecx]
@@ -1075,7 +1075,7 @@
 	movd        [edx+ecx], mm2
 	WELSEMMS
 	ret
-	
+
 ALIGN 16
 ;*******************************************************************************
 ;
@@ -1082,11 +1082,11 @@
 ;   void_t WelsIChromaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsIChromaPredDc_sse2
-WelsIChromaPredDc_sse2:	
+WelsIChromaPredDc_sse2:
 	push        ebx
 	mov         eax, [esp+8]			; pPred
 	mov			ecx, [esp+12]           ; kiStride
-	
+
 	sub         eax, ecx
 	movq        mm0, [eax]
 
@@ -1100,7 +1100,7 @@
 	movzx		edx, byte [eax-0x01]     ; l4
 	add			ebx, edx
 	movd        mm1, ebx                 ; mm1 = l1+l2+l3+l4
-	
+
 	movzx		ebx, byte [eax+ecx-0x01] ; l5
 	lea         eax, [eax+2*ecx]
 	movzx		edx, byte [eax-0x01]     ; l6
@@ -1111,7 +1111,7 @@
 	movzx		edx, byte [eax-0x01]     ; l8
 	add			ebx, edx
 	movd        mm2, ebx                 ; mm2 = l5+l6+l7+l8
-	
+
 	movq        mm3, mm0
 	psrlq       mm0, 0x20
 	psllq       mm3, 0x20
@@ -1118,46 +1118,46 @@
 	psrlq       mm3, 0x20
 	pxor		mm4, mm4
 	psadbw		mm0, mm4
-	psadbw		mm3, mm4                 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2	
-	
+	psadbw		mm3, mm4                 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
+
 	paddq       mm3, mm1
 	movq        mm1, mm2
 	paddq       mm1, mm0;                ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
-	
+
 	movq        mm4, [mmx_0x02]
-	
+
 	paddq       mm0, mm4
 	psrlq       mm0, 0x02
-	
+
 	paddq       mm2, mm4
 	psrlq       mm2, 0x02
-	
+
 	paddq       mm3, mm4
 	paddq       mm3, mm4
 	psrlq       mm3, 0x03
-	
+
 	paddq       mm1, mm4
 	paddq       mm1, mm4
 	psrlq       mm1, 0x03
-	
+
 	pmuludq     mm0, [mmx_01bytes]
 	pmuludq     mm3, [mmx_01bytes]
 	psllq       mm0, 0x20
 	pxor        mm0, mm3                 ; mm0 = m_up
-	
+
 	pmuludq     mm2, [mmx_01bytes]
 	pmuludq     mm1, [mmx_01bytes]
 	psllq       mm1, 0x20
 	pxor        mm1, mm2                 ; mm2 = m_down
-	
+
 	mov         edx, [esp+8]			 ; pPred
-	
+
 	movq        [edx],       mm0
 	movq        [edx+ecx],   mm0
 	movq        [edx+2*ecx], mm0
 	lea         edx, [edx+2*ecx]
 	movq        [edx+ecx],   mm0
-	
+
 	movq        [edx+2*ecx], mm1
 	lea         edx, [edx+2*ecx]
 	movq        [edx+ecx],   mm1
@@ -1164,13 +1164,13 @@
 	movq        [edx+2*ecx], mm1
 	lea         edx, [edx+2*ecx]
 	movq        [edx+ecx],   mm1
-	
+
 	pop         ebx
 	WELSEMMS
 	ret
-	
-	
-	
+
+
+
 ALIGN 16
 ;*******************************************************************************
 ;
@@ -1177,11 +1177,11 @@
 ;   void_t WelsI16x16LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsI16x16LumaPredDc_sse2
-WelsI16x16LumaPredDc_sse2:	
+WelsI16x16LumaPredDc_sse2:
 	push        ebx
 	mov         eax, [esp+8]			; pPred
 	mov			ecx, [esp+12]           ; kiStride
-	
+
 	sub         eax, ecx
 	movdqa      xmm0, [eax]             ; read one row
 	pxor		xmm1, xmm1
@@ -1191,7 +1191,7 @@
 	pslldq      xmm0, 0x08
 	psrldq      xmm0, 0x08
 	paddw       xmm0, xmm1
-	
+
 	movzx		ebx, byte [eax+ecx-0x01]
 	movzx		edx, byte [eax+2*ecx-0x01]
 	add			ebx, edx
@@ -1209,44 +1209,44 @@
 	psrld       xmm0, 0x05
 	pmuludq     xmm0, [mmx_01bytes]
 	pshufd      xmm0, xmm0, 0
-	
+
 	mov         edx, [esp+8]			; pPred
-	
+
 	movdqa      [edx],       xmm0
 	movdqa      [edx+ecx],   xmm0
 	movdqa      [edx+2*ecx], xmm0
 	lea         edx,         [edx+2*ecx]
-	
+
 	movdqa      [edx+ecx],   xmm0
 	movdqa      [edx+2*ecx], xmm0
 	lea         edx,         [edx+2*ecx]
-	
+
 	movdqa      [edx+ecx],   xmm0
 	movdqa      [edx+2*ecx], xmm0
 	lea         edx,         [edx+2*ecx]
-	
+
 	movdqa      [edx+ecx],   xmm0
 	movdqa      [edx+2*ecx], xmm0
 	lea         edx,         [edx+2*ecx]
-	
+
 	movdqa      [edx+ecx],   xmm0
 	movdqa      [edx+2*ecx], xmm0
 	lea         edx,         [edx+2*ecx]
-	
+
 	movdqa      [edx+ecx],   xmm0
 	movdqa      [edx+2*ecx], xmm0
 	lea         edx,         [edx+2*ecx]
-	
+
 	movdqa      [edx+ecx],   xmm0
 	movdqa      [edx+2*ecx], xmm0
 	lea         edx,         [edx+2*ecx]
-	
+
 	movdqa      [edx+ecx],   xmm0
 
 	pop         ebx
 
 	ret
-	
+
 ;*******************************************************************************
 ; for intra prediction as follows, 11/19/2010
 ;*******************************************************************************
@@ -1258,12 +1258,12 @@
 WELS_EXTERN WelsI16x16LumaPredDcTop_sse2
 WelsI16x16LumaPredDcTop_sse2:
 	push ebx
-	
+
 	%define PUSH_SIZE 4
-	
+
 	mov eax, [esp+PUSH_SIZE+4]	; pPred
 	mov ebx, [esp+PUSH_SIZE+8]	; kiStride
-	
+
 	mov ecx, ebx
 	neg ecx
 	movdqa xmm0, [eax+ecx]		; pPred-kiStride, top line
@@ -1278,10 +1278,10 @@
 	pshufd xmm1, xmm0, 0b1h		; 10110001, w1+5 w0+4 w3+7 w2+6 w1+5 w0+4 w3+7 w2+6
 	paddw xmm0, xmm1			; w_o w_e w_o w_e w_o w_e w_o w_e (w_o=1+3+5+7, w_e=0+2+4+6)
 	pshuflw xmm1, xmm0, 0b1h	; 10110001
-	paddw xmm0, xmm1			; sum in word unit (x8)	
+	paddw xmm0, xmm1			; sum in word unit (x8)
 	movd edx, xmm0
 	and edx, 0ffffh
-	
+
 	add edx, 08h
 	sar edx, 04h
 	mov dh, dl
@@ -1288,35 +1288,35 @@
 	mov ecx, edx
 	shl ecx, 010h
 	or edx, ecx
-	movd xmm1, edx	
+	movd xmm1, edx
 	pshufd xmm0, xmm1, 00h
 	movdqa xmm1, xmm0
-	
+
 	lea ecx, [2*ebx+ebx]		; 3*kiStride
-	
+
 	movdqa [eax], xmm0
 	movdqa [eax+ebx], xmm1
 	movdqa [eax+2*ebx], xmm0
 	movdqa [eax+ecx], xmm1
-	
+
 	lea eax, [eax+4*ebx]
 	movdqa [eax], xmm0
 	movdqa [eax+ebx], xmm1
 	movdqa [eax+2*ebx], xmm0
 	movdqa [eax+ecx], xmm1
-	
+
 	lea eax, [eax+4*ebx]
 	movdqa [eax], xmm0
 	movdqa [eax+ebx], xmm1
 	movdqa [eax+2*ebx], xmm0
 	movdqa [eax+ecx], xmm1
-	
+
 	lea eax, [eax+4*ebx]
 	movdqa [eax], xmm0
 	movdqa [eax+ebx], xmm1
 	movdqa [eax+2*ebx], xmm0
 	movdqa [eax+ecx], xmm1
-	
+
 	%undef PUSH_SIZE
 	pop ebx
 	ret
@@ -1328,41 +1328,41 @@
 WELS_EXTERN WelsI16x16LumaPredDcNA_sse2
 WelsI16x16LumaPredDcNA_sse2:
 	push ebx
-	
+
 	%define PUSH_SIZE	4
-	
+
 	mov eax, [esp+PUSH_SIZE+4]	; pPred
-	mov ebx, [esp+PUSH_SIZE+8]	; kiStride	
-	
+	mov ebx, [esp+PUSH_SIZE+8]	; kiStride
+
 	lea ecx, [2*ebx+ebx]		; 3*kiStride
-	
+
 	movdqa xmm0, [sse2_dc_0x80]
-	movdqa xmm1, xmm0	
+	movdqa xmm1, xmm0
 	movdqa [eax], xmm0
 	movdqa [eax+ebx], xmm1
 	movdqa [eax+2*ebx], xmm0
-	movdqa [eax+ecx], xmm1	
+	movdqa [eax+ecx], xmm1
 	lea eax, [eax+4*ebx]
 	movdqa [eax], xmm0
 	movdqa [eax+ebx], xmm1
 	movdqa [eax+2*ebx], xmm0
-	movdqa [eax+ecx], xmm1	
+	movdqa [eax+ecx], xmm1
 	lea eax, [eax+4*ebx]
 	movdqa [eax], xmm0
 	movdqa [eax+ebx], xmm1
 	movdqa [eax+2*ebx], xmm0
-	movdqa [eax+ecx], xmm1	
+	movdqa [eax+ecx], xmm1
 	lea eax, [eax+4*ebx]
 	movdqa [eax], xmm0
 	movdqa [eax+ebx], xmm1
 	movdqa [eax+2*ebx], xmm0
 	movdqa [eax+ecx], xmm1
-	
+
 	%undef PUSH_SIZE
-	
+
 	pop ebx
 	ret
-	
+
 ALIGN 16
 ;*******************************************************************************
 ;	void_t WelsIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride)
@@ -1370,12 +1370,12 @@
 WELS_EXTERN WelsIChromaPredDcLeft_mmx
 WelsIChromaPredDcLeft_mmx:
 	push ebx
-	push esi	
+	push esi
 	%define PUSH_SIZE 8
 	mov esi, [esp+PUSH_SIZE+4]	; pPred
 	mov ecx, [esp+PUSH_SIZE+8]	; kiStride
 	mov eax, esi
-	; for left	
+	; for left
 	dec eax
 	xor ebx, ebx
 	xor edx, edx
@@ -1384,7 +1384,7 @@
 	add ebx, edx
 	lea eax, [eax+2*ecx]
 	mov dl, [eax]
-	add ebx, edx	
+	add ebx, edx
 	mov dl, [eax+ecx]
 	add ebx, edx
 	add ebx, 02h
@@ -1451,7 +1451,7 @@
 	movdqa xmm6, [sse2_wd_0x02]
 	paddw xmm0, xmm6
 	psraw xmm0, 02h
-	packuswb xmm0, xmm7	
+	packuswb xmm0, xmm7
 	lea ebx, [2*ecx+ecx]
 	movq [eax], xmm0
 	movq [eax+ecx], xmm0
@@ -1463,10 +1463,10 @@
 	movq [eax+2*ecx], xmm0
 	movq [eax+ebx], xmm0
 	%undef PUSH_SIZE
-	pop ebx	
+	pop ebx
 	ret
 
-	
+
 ALIGN 16
 ;*******************************************************************************
 ;	void_t WelsIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride)
@@ -1495,4 +1495,4 @@
 	ret
 
 
-	
+
--- a/codec/decoder/core/asm/mb_copy.asm
+++ b/codec/decoder/core/asm/mb_copy.asm
@@ -37,7 +37,7 @@
 ;*  History
 ;*      15/09/2009 Created
 ;*		12/28/2009 Modified with larger throughput
-;*		12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2, 
+;*		12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
 ;*				   WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
 ;*
 ;*
@@ -84,7 +84,7 @@
 ;                           int iHeight );
 ;*******************************************************************************
 PixelAvgWidthEq4_mmx:
-   
+
     push        esi
     push        edi
     push        ebp
@@ -102,7 +102,7 @@
 	movd        mm0, [ebp]
     pavgb       mm0, [esi]
     movd        [edi], mm0
-   
+
     dec         ebx
     lea         edi, [edi+eax]
     lea         esi, [esi+ecx]
@@ -115,7 +115,7 @@
     pop         edi
     pop         esi
     ret
-                          
+
 ALIGN 16
 ;*******************************************************************************
 ; void_t PixelAvgWidthEq8_mmx( uint8_t *pDst,  int iDstStride,
@@ -124,7 +124,7 @@
 ;                           int iHeight );
 ;*******************************************************************************
 PixelAvgWidthEq8_mmx:
-    
+
     push        esi
     push        edi
     push        ebp
@@ -145,14 +145,14 @@
     movq        mm0, [esi+ecx]
     pavgb       mm0, [ebp+edx]
     movq		[edi+eax], mm0
-    
+
     lea			esi,  [esi+2*ecx]
     lea			ebp, [ebp+2*edx]
     lea			edi,  [edi+2*eax]
-    
+
     sub           ebx, 2
     jnz         .height_loop
-	
+
 	WELSEMMS
     pop         ebx
     pop         ebp
@@ -174,8 +174,8 @@
     push        edi
     push        ebp
     push        ebx
-    
 
+
     mov         edi, [esp+20]       ; pDst
     mov         eax, [esp+24]       ; iDstStride
     mov         esi, [esp+28]       ; pSrcA
@@ -188,28 +188,28 @@
 	movdqu      xmm0, [esi]
 	pavgb         xmm0, [ebp]
     movdqu      [edi], xmm0
-    
+
 	movdqu      xmm0, [esi+ecx]
 	pavgb         xmm0, [ebp+edx]
     movdqu      [edi+eax], xmm0
-	
+
 	movdqu      xmm0, [esi+2*ecx]
 	pavgb         xmm0, [ebp+2*edx]
     movdqu      [edi+2*eax], xmm0
-    
+
     lea              esi,  [esi+2*ecx]
     lea			   ebp, [ebp+2*edx]
     lea			   edi,  [edi+2*eax]
-     
+
 	movdqu      xmm0, [esi+ecx]
 	pavgb         xmm0, [ebp+edx]
     movdqu      [edi+eax], xmm0
-    
+
     lea              esi,  [esi+2*ecx]
     lea			   ebp, [ebp+2*edx]
     lea			   edi,  [edi+2*eax]
-	    
-    
+
+
     sub         ebx, 4
     jne         .height_loop
 
@@ -232,7 +232,7 @@
     push    edi
     push    ebx
 
-    
+
     mov esi,  [esp+16]
     mov eax, [esp+20]
     mov edi,  [esp+24]
@@ -242,12 +242,12 @@
 .height_loop:
 	mov ebx, [esi]
 	mov [edi], ebx
-	
+
 	add esi, eax
 	add edi, ecx
 	dec edx
 	jnz .height_loop
-	WELSEMMS   
+	WELSEMMS
 	pop	   ebx
     pop     edi
     pop     esi
@@ -275,12 +275,11 @@
 	add edi, ecx
 	dec edx
 	jnz .height_loop
-	
-	WELSEMMS   
+
+	WELSEMMS
     pop     edi
     pop     esi
     ret
-	
 
 
 
@@ -288,6 +287,7 @@
 
 
 
+
 ALIGN 16
 ;*******************************************************************************
 ;   void_t McCopyWidthEq16_sse2( uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight )
@@ -308,7 +308,7 @@
     push    edi
 
     mov     esi, [esp+12]       ; pSrc
-    mov     eax, [esp+16]       ; iSrcStride    
+    mov     eax, [esp+16]       ; iSrcStride
     mov     edi, [esp+20]       ; pDst
     mov     edx, [esp+24]       ; iDstStride
     mov     ecx, [esp+28]       ; iHeight
@@ -324,7 +324,7 @@
     lea     esi, [esi+eax*2]
     lea     edi, [edi+edx*2]
     jnz     .height_loop
-  
+
     pop     edi
     pop     esi
     ret
--- a/codec/decoder/core/asm/mc_chroma.asm
+++ b/codec/decoder/core/asm/mc_chroma.asm
@@ -1,317 +1,317 @@
-;*!
-;* \copy
-;*     Copyright (c)  2004-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  mc_chroma.asm
-;*
-;*  Abstract
-;*      mmx motion compensation for chroma
-;*
-;*  History
-;*      10/13/2004 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-ALIGN 16
-h264_d0x20_sse2:
-	dw 32,32,32,32,32,32,32,32
-ALIGN 16
-h264_d0x20_mmx:
-	dw 32,32,32,32
-
-
-;=============================================================================
-; Code
-;=============================================================================
-
-SECTION .text
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq4_mmx( uint8_t *src, 
-;							int32_t iSrcStride, 
-;							uint8_t *pDst, 
-;							int32_t iDstStride, 
-;							uint8_t *pABCD, 
-;							int32_t iHeigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq4_mmx
-McChromaWidthEq4_mmx:
-	push esi
-	push edi
-	push ebx
-	
-	mov eax, [esp +12 + 20]
-	movd mm3, [eax]
-	WELS_Zero mm7
-	punpcklbw mm3, mm3
-	movq      mm4, mm3
-	punpcklwd mm3, mm3       
-	punpckhwd mm4, mm4		 
-	
-	movq	  mm5, mm3
-	punpcklbw mm3, mm7
-	punpckhbw mm5, mm7
-	
-	movq	  mm6, mm4
-	punpcklbw mm4, mm7
-	punpckhbw mm6, mm7
-	
-	mov esi, [esp +12+ 4]   
-	mov eax, [esp + 12 + 8]   
-	mov edi, [esp + 12 + 12]  
-	mov edx, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-		
-	lea ebx, [esi + eax]
-	movd mm0, [esi]
-	movd mm1, [esi+1]
-	punpcklbw mm0, mm7
-	punpcklbw mm1, mm7
-.xloop:
-	
-	pmullw mm0, mm3
-	pmullw mm1, mm5
-	paddw  mm0, mm1
-	
-	movd  mm1, [ebx]
-	punpcklbw mm1, mm7
-	movq mm2, mm1
-	pmullw mm1, mm4
-	paddw mm0, mm1
-	
-	movd mm1, [ebx+1]
-	punpcklbw mm1, mm7
-	movq mm7, mm1
-	pmullw mm1,mm6
-	paddw mm0, mm1
-	movq mm1,mm7
-
-	paddw mm0, [h264_d0x20_mmx]
-	psrlw mm0, 6
-	
-	WELS_Zero mm7
-	packuswb mm0, mm7
-	movd [edi], mm0	
-
-	movq mm0, mm2
-	
-	lea edi, [edi +edx  ]
-	lea ebx, [ebx + eax]
-
-	dec ecx
-	jnz near .xloop
-	WELSEMMS
-	pop ebx
-	pop edi
-	pop esi
-	ret
-
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq8_sse2( uint8_t *pSrc, 
-;						int32_t iSrcStride, 
-;						uint8_t *pDst, 
-;						int32_t iDstStride, 
-;						uint8_t *pABCD, 
-;						int32_t iheigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq8_sse2
-McChromaWidthEq8_sse2:
-	push esi
-	push edi
-	push ebx
-	
-	mov eax, [esp +12 + 20]
-	movd xmm3, [eax]
-	WELS_Zero xmm7
-	punpcklbw  xmm3, xmm3
-	punpcklwd  xmm3, xmm3
-	
-	movdqa	   xmm4, xmm3
-	punpckldq  xmm3, xmm3
-	punpckhdq  xmm4, xmm4
-	movdqa     xmm5, xmm3
-	movdqa	   xmm6, xmm4
-	
-	punpcklbw  xmm3, xmm7
-	punpckhbw  xmm5, xmm7
-	punpcklbw  xmm4, xmm7
-	punpckhbw  xmm6, xmm7
-	
-	mov esi, [esp +12+ 4]   
-	mov eax, [esp + 12 + 8]   
-	mov edi, [esp + 12 + 12]  
-	mov edx, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-		
-	lea ebx, [esi + eax]
-	movq xmm0, [esi]
-	movq xmm1, [esi+1]
-	punpcklbw xmm0, xmm7
-	punpcklbw xmm1, xmm7
-.xloop:
-	
-	pmullw xmm0, xmm3
-	pmullw xmm1, xmm5
-	paddw  xmm0, xmm1
-	
-	movq  xmm1, [ebx]
-	punpcklbw xmm1, xmm7
-	movdqa xmm2, xmm1
-	pmullw xmm1, xmm4
-	paddw xmm0, xmm1
-	
-	movq xmm1, [ebx+1]
-	punpcklbw xmm1, xmm7
-	movdqa xmm7, xmm1
-	pmullw xmm1, xmm6
-	paddw xmm0, xmm1
-	movdqa xmm1,xmm7
-
-	paddw xmm0, [h264_d0x20_sse2]
-	psrlw xmm0, 6
-	
-	WELS_Zero xmm7
-	packuswb xmm0, xmm7
-	movq [edi], xmm0	
-
-	movdqa xmm0, xmm2
-	
-	lea edi, [edi +edx  ]
-	lea ebx, [ebx + eax]
-
-	dec ecx
-	jnz near .xloop
-	
-	pop ebx
-	pop edi
-	pop esi
-	ret
-
-
-
-
-ALIGN 16
-;***********************************************************************
-; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
-;						 int32_t iSrcStride, 
-;                        uint8_t *pDst,  
-;                        int32_t iDstStride,
-;                        uint8_t *pABCD,
-;					     int32_t iHeigh);
-;***********************************************************************
-WELS_EXTERN McChromaWidthEq8_ssse3
-McChromaWidthEq8_ssse3:
-	push ebx
-	push esi
-	push edi
-		
-	mov eax, [esp + 12 + 20]
-
-    pxor      xmm7, xmm7
-    movd   xmm5, [eax]   
-    punpcklwd xmm5, xmm5  
-    punpckldq xmm5, xmm5 
-    movdqa    xmm6, xmm5
-    punpcklqdq xmm5, xmm5
-    punpckhqdq xmm6, xmm6    
-    
-	mov eax, [esp + 12 + 4]   
-	mov edx, [esp + 12 + 8]   
-	mov esi, [esp + 12 + 12]  
-	mov edi, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-    
-    sub esi, edi
-    sub esi, edi
-	movdqa xmm7, [h264_d0x20_sse2]
-
-	movdqu xmm0, [eax]
-	movdqa xmm1, xmm0
-	psrldq xmm1, 1
-	punpcklbw xmm0, xmm1
-	
-.hloop_chroma:	
-	lea	esi, [esi+2*edi]
-	
-	movdqu xmm2, [eax+edx]
-	movdqa xmm3, xmm2
-	psrldq xmm3, 1
-	punpcklbw xmm2, xmm3
-	movdqa      xmm4, xmm2
-	
-    pmaddubsw  xmm0, xmm5
-    pmaddubsw  xmm2, xmm6
-    paddw      xmm0, xmm2
-    paddw      xmm0, xmm7
-	psrlw      xmm0, 6
-    packuswb   xmm0, xmm0
-    movq       [esi],xmm0	
-    
-    lea eax, [eax+2*edx]
-    movdqu xmm2, [eax]
-    movdqa xmm3, xmm2
-    psrldq xmm3, 1
-    punpcklbw xmm2, xmm3
-    movdqa      xmm0, xmm2
-    
-    pmaddubsw  xmm4, xmm5
-    pmaddubsw  xmm2, xmm6
-    paddw      xmm4, xmm2
-    paddw      xmm4, xmm7
-	psrlw      xmm4, 6
-    packuswb   xmm4, xmm4
-    movq       [esi+edi],xmm4	
-	
-	sub ecx, 2
-	jnz .hloop_chroma
-	pop edi
-	pop esi
-	pop ebx
-
-	ret
-
-
+;*!
+;* \copy
+;*     Copyright (c)  2004-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  mc_chroma.asm
+;*
+;*  Abstract
+;*      mmx motion compensation for chroma
+;*
+;*  History
+;*      10/13/2004 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+BITS 32
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Rounding constants: 32 (0x20) in every 16-bit lane, added before the >>6 shift
+;***********************************************************************
+
+ALIGN 16
+h264_d0x20_sse2:
+	dw 32,32,32,32,32,32,32,32
+ALIGN 16
+h264_d0x20_mmx:
+	dw 32,32,32,32
+
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq4_mmx( uint8_t *src,
+;							int32_t iSrcStride,
+;							uint8_t *pDst,
+;							int32_t iDstStride,
+;							uint8_t *pABCD,
+;							int32_t iHeight );
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq4_mmx
+McChromaWidthEq4_mmx:
+	push esi
+	push edi
+	push ebx
+; 4 pixels per row: out[x] = (A*p[x] + B*p[x+1] + C*q[x] + D*q[x+1] + 32) >> 6; p/q = current/next source row, A..D = the 4 bytes at pABCD
+	mov eax, [esp +12 + 20]    ; eax = pABCD (after three pushes the stack arguments start at esp+12+4)
+	movd mm3, [eax]            ; low 4 bytes of mm3 = A B C D (memory order)
+	WELS_Zero mm7              ; macro from asm_inc.asm; presumably clears mm7, which serves as the zero operand for byte->word unpacking
+	punpcklbw mm3, mm3         ; bytes, low to high: A A B B C C D D
+	movq      mm4, mm3
+	punpcklwd mm3, mm3         ; bytes: A A A A B B B B
+	punpckhwd mm4, mm4         ; bytes: C C C C D D D D
+
+	movq	  mm5, mm3
+	punpcklbw mm3, mm7         ; mm3 = A in each of the 4 words
+	punpckhbw mm5, mm7         ; mm5 = B in each of the 4 words
+
+	movq	  mm6, mm4
+	punpcklbw mm4, mm7         ; mm4 = C in each of the 4 words
+	punpckhbw mm6, mm7         ; mm6 = D in each of the 4 words
+
+	mov esi, [esp +12+ 4]      ; esi = src
+	mov eax, [esp + 12 + 8]    ; eax = iSrcStride
+	mov edi, [esp + 12 + 12]   ; edi = pDst
+	mov edx, [esp + 12 + 16]   ; edx = iDstStride
+    mov ecx, [esp + 12 + 24]   ; ecx = iHeight (row counter)
+
+	lea ebx, [esi + eax]       ; ebx = address of the next source row
+	movd mm0, [esi]            ; first row, pixels x..x+3
+	movd mm1, [esi+1]          ; first row, pixels x+1..x+4
+	punpcklbw mm0, mm7         ; widen both to 16-bit words
+	punpcklbw mm1, mm7
+.xloop:
+; On entry to each iteration mm0/mm1 hold the current row (words) at offsets 0 and 1, and mm7 is zero.
+	pmullw mm0, mm3            ; A * p[x]
+	pmullw mm1, mm5            ; B * p[x+1]
+	paddw  mm0, mm1
+
+	movd  mm1, [ebx]           ; next row, offset 0
+	punpcklbw mm1, mm7
+	movq mm2, mm1              ; keep the unscaled next row: it becomes mm0 for the following iteration
+	pmullw mm1, mm4            ; C * q[x]
+	paddw mm0, mm1
+
+	movd mm1, [ebx+1]          ; next row, offset 1
+	punpcklbw mm1, mm7
+	movq mm7, mm1              ; mm7 temporarily holds the unscaled copy (no longer zero here)
+	pmullw mm1,mm6             ; D * q[x+1]
+	paddw mm0, mm1
+	movq mm1,mm7               ; restore the unscaled copy: it becomes mm1 for the following iteration
+
+	paddw mm0, [h264_d0x20_mmx] ; + 32 rounding term
+	psrlw mm0, 6               ; >> 6
+
+	WELS_Zero mm7              ; mm7 was used as scratch above; make it the zero operand again
+	packuswb mm0, mm7          ; words -> bytes with unsigned saturation
+	movd [edi], mm0            ; store 4 output pixels
+
+	movq mm0, mm2              ; next row becomes the current row
+
+	lea edi, [edi +edx  ]      ; advance destination by one row
+	lea ebx, [ebx + eax]       ; advance next-row source pointer by one row
+
+	dec ecx
+	jnz near .xloop            ; counter is tested only after the body, so iHeight must be >= 1
+	WELSEMMS                   ; macro from asm_inc.asm; presumably issues emms to leave MMX state
+	pop ebx
+	pop edi
+	pop esi
+	ret
+
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq8_sse2( uint8_t *pSrc,
+;						int32_t iSrcStride,
+;						uint8_t *pDst,
+;						int32_t iDstStride,
+;						uint8_t *pABCD,
+;						int32_t iHeight );
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq8_sse2
+McChromaWidthEq8_sse2:
+	push esi
+	push edi
+	push ebx
+; 8 pixels per row: out[x] = (A*p[x] + B*p[x+1] + C*q[x] + D*q[x+1] + 32) >> 6; p/q = current/next source row, A..D = the 4 bytes at pABCD
+	mov eax, [esp +12 + 20]    ; eax = pABCD (after three pushes the stack arguments start at esp+12+4)
+	movd xmm3, [eax]           ; low 4 bytes of xmm3 = A B C D (memory order)
+	WELS_Zero xmm7             ; macro from asm_inc.asm; presumably clears xmm7, which serves as the zero operand for byte->word unpacking
+	punpcklbw  xmm3, xmm3      ; low 8 bytes: A A B B C C D D
+	punpcklwd  xmm3, xmm3      ; 16 bytes: A x4, B x4, C x4, D x4
+
+	movdqa	   xmm4, xmm3
+	punpckldq  xmm3, xmm3      ; 16 bytes: A x8, B x8
+	punpckhdq  xmm4, xmm4      ; 16 bytes: C x8, D x8
+	movdqa     xmm5, xmm3
+	movdqa	   xmm6, xmm4
+
+	punpcklbw  xmm3, xmm7      ; xmm3 = A in each of the 8 words
+	punpckhbw  xmm5, xmm7      ; xmm5 = B in each of the 8 words
+	punpcklbw  xmm4, xmm7      ; xmm4 = C in each of the 8 words
+	punpckhbw  xmm6, xmm7      ; xmm6 = D in each of the 8 words
+
+	mov esi, [esp +12+ 4]      ; esi = pSrc
+	mov eax, [esp + 12 + 8]    ; eax = iSrcStride
+	mov edi, [esp + 12 + 12]   ; edi = pDst
+	mov edx, [esp + 12 + 16]   ; edx = iDstStride
+    mov ecx, [esp + 12 + 24]   ; ecx = iHeight (row counter)
+
+	lea ebx, [esi + eax]       ; ebx = address of the next source row
+	movq xmm0, [esi]           ; first row, pixels x..x+7
+	movq xmm1, [esi+1]         ; first row, pixels x+1..x+8
+	punpcklbw xmm0, xmm7       ; widen both to 16-bit words
+	punpcklbw xmm1, xmm7
+.xloop:
+; On entry to each iteration xmm0/xmm1 hold the current row (words) at offsets 0 and 1, and xmm7 is zero.
+	pmullw xmm0, xmm3          ; A * p[x]
+	pmullw xmm1, xmm5          ; B * p[x+1]
+	paddw  xmm0, xmm1
+
+	movq  xmm1, [ebx]          ; next row, offset 0
+	punpcklbw xmm1, xmm7
+	movdqa xmm2, xmm1          ; keep the unscaled next row: it becomes xmm0 for the following iteration
+	pmullw xmm1, xmm4          ; C * q[x]
+	paddw xmm0, xmm1
+
+	movq xmm1, [ebx+1]         ; next row, offset 1
+	punpcklbw xmm1, xmm7
+	movdqa xmm7, xmm1          ; xmm7 temporarily holds the unscaled copy (no longer zero here)
+	pmullw xmm1, xmm6          ; D * q[x+1]
+	paddw xmm0, xmm1
+	movdqa xmm1,xmm7           ; restore the unscaled copy: it becomes xmm1 for the following iteration
+
+	paddw xmm0, [h264_d0x20_sse2] ; + 32 rounding term
+	psrlw xmm0, 6              ; >> 6
+
+	WELS_Zero xmm7             ; xmm7 was used as scratch above; make it the zero operand again
+	packuswb xmm0, xmm7        ; words -> bytes with unsigned saturation
+	movq [edi], xmm0           ; store 8 output pixels
+
+	movdqa xmm0, xmm2          ; next row becomes the current row
+
+	lea edi, [edi +edx  ]      ; advance destination by one row
+	lea ebx, [ebx + eax]       ; advance next-row source pointer by one row
+
+	dec ecx
+	jnz near .xloop            ; counter is tested only after the body, so iHeight must be >= 1
+
+	pop ebx
+	pop edi
+	pop esi
+	ret
+
+
+
+
+ALIGN 16
+;***********************************************************************
+; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
+;						 int32_t iSrcStride,
+;                        uint8_t *pDst,
+;                        int32_t iDstStride,
+;                        uint8_t *pABCD,
+;					     int32_t iHeight);
+;***********************************************************************
+WELS_EXTERN McChromaWidthEq8_ssse3
+McChromaWidthEq8_ssse3:
+	push ebx                   ; ebx is saved and restored but not otherwise used in this routine
+	push esi
+	push edi
+; 8 pixels per row, two rows per loop pass: out[x] = (A*p[x] + B*p[x+1] + C*q[x] + D*q[x+1] + 32) >> 6; A..D = the 4 bytes at pABCD
+	mov eax, [esp + 12 + 20]   ; eax = pABCD (after three pushes the stack arguments start at esp+12+4)
+
+    pxor      xmm7, xmm7       ; has no effect on the result: xmm7 is reloaded with the rounding constant below before any read
+    movd   xmm5, [eax]         ; low 4 bytes of xmm5 = A B C D (memory order)
+    punpcklwd xmm5, xmm5       ; low 8 bytes: A B A B C D C D
+    punpckldq xmm5, xmm5       ; 16 bytes: (A B) x4, (C D) x4
+    movdqa    xmm6, xmm5
+    punpcklqdq xmm5, xmm5      ; xmm5 = (A B) byte pair repeated 8 times
+    punpckhqdq xmm6, xmm6      ; xmm6 = (C D) byte pair repeated 8 times
+
+	mov eax, [esp + 12 + 4]    ; eax = pSrc
+	mov edx, [esp + 12 + 8]    ; edx = iSrcStride
+	mov esi, [esp + 12 + 12]   ; esi = pDst
+	mov edi, [esp + 12 + 16]   ; edi = iDstStride
+    mov ecx, [esp + 12 + 24]   ; ecx = iHeight (row counter)
+
+    sub esi, edi               ; pre-bias pDst by -2*iDstStride because the loop
+    sub esi, edi               ; adds 2*iDstStride at its top, before the first store
+	movdqa xmm7, [h264_d0x20_sse2] ; xmm7 = 32 in each word (rounding term)
+
+	movdqu xmm0, [eax]         ; unaligned 16-byte load of row 0; only bytes 0..8 reach the output
+	movdqa xmm1, xmm0
+	psrldq xmm1, 1             ; row 0 shifted down by one byte
+	punpcklbw xmm0, xmm1       ; xmm0 = byte pairs (p[x], p[x+1]) for x = 0..7
+
+.hloop_chroma:
+	lea	esi, [esi+2*edi]       ; esi = destination of the first row written in this pass
+
+	movdqu xmm2, [eax+edx]     ; source row below the current one
+	movdqa xmm3, xmm2
+	psrldq xmm3, 1
+	punpcklbw xmm2, xmm3       ; pairs (q[x], q[x+1])
+	movdqa      xmm4, xmm2     ; unscaled copy: this row is the upper row of the second output
+
+    pmaddubsw  xmm0, xmm5      ; A*p[x] + B*p[x+1] per word (pixels taken as unsigned, weights as signed bytes)
+    pmaddubsw  xmm2, xmm6      ; C*q[x] + D*q[x+1] per word
+    paddw      xmm0, xmm2
+    paddw      xmm0, xmm7      ; + 32
+	psrlw      xmm0, 6
+    packuswb   xmm0, xmm0      ; words -> bytes with unsigned saturation
+    movq       [esi],xmm0      ; store first output row (8 pixels)
+
+    lea eax, [eax+2*edx]       ; advance source by two rows
+    movdqu xmm2, [eax]         ; the row two below the pass's starting row
+    movdqa xmm3, xmm2
+    psrldq xmm3, 1
+    punpcklbw xmm2, xmm3
+    movdqa      xmm0, xmm2     ; unscaled copy carried into the next pass as its upper row
+
+    pmaddubsw  xmm4, xmm5      ; upper row of second output * (A, B)
+    pmaddubsw  xmm2, xmm6      ; lower row of second output * (C, D)
+    paddw      xmm4, xmm2
+    paddw      xmm4, xmm7      ; + 32
+	psrlw      xmm4, 6
+    packuswb   xmm4, xmm4
+    movq       [esi+edi],xmm4  ; store second output row (8 pixels)
+
+	sub ecx, 2                 ; two rows done per pass
+	jnz .hloop_chroma          ; exits only when the count reaches exactly 0, so iHeight must be even and >= 2
+	pop edi
+	pop esi
+	pop ebx
+
+	ret
+
+
--- a/codec/decoder/core/asm/mc_luma.asm
+++ b/codec/decoder/core/asm/mc_luma.asm
@@ -69,16 +69,16 @@
 
 ALIGN 16
 ;*******************************************************************************
-; void_t McHorVer20WidthEq4_mmx( uint8_t *pSrc, 
-;                       int iSrcStride, 
-;						uint8_t *pDst, 
-;						int iDstStride, 
+; void_t McHorVer20WidthEq4_mmx( uint8_t *pSrc,
+;                       int iSrcStride,
+;						uint8_t *pDst,
+;						int iDstStride,
 ;						int iHeight)
 ;*******************************************************************************
 McHorVer20WidthEq4_mmx:
 	push esi
 	push edi
-	
+
 	mov  esi, [esp+12]
 	mov eax, [esp+16]
 	mov edi, [esp+20]
@@ -100,7 +100,7 @@
 	punpcklbw mm4, mm7
 	movd mm5, [esi+3]
 	punpcklbw mm5, mm7
-	
+
 	paddw mm2, mm3
 	paddw mm4, mm5
 	psllw mm4, 2
@@ -113,12 +113,12 @@
 	psraw mm0, 5
 	packuswb mm0, mm7
 	movd [edi], mm0
-	
+
 	add esi, eax
 	add edi, ecx
 	dec edx
 	jnz .height_loop
-	
+
 	WELSEMMS
 	pop edi
 	pop esi
@@ -181,8 +181,8 @@
 
 ALIGN 16
 ;***********************************************************************
-; void_t McHorVer22Width8HorFirst_sse2(int16_t *pSrc, 
-;                       int16_t iSrcStride, 
+; void_t McHorVer22Width8HorFirst_sse2(int16_t *pSrc,
+;                       int16_t iSrcStride,
 ;						uint8_t *pDst,
 ;						int32_t iDstStride
 ;						int32_t iHeight
@@ -197,11 +197,11 @@
 	mov edi, [esp+24]		;pDst
 	mov edx, [esp+28]	;iDstStride
 	mov ebx, [esp+32]	;iHeight
-	pxor xmm7, xmm7	
-	
+	pxor xmm7, xmm7
+
 	sub esi, eax				;;;;;;;;need more 5 lines.
 	sub esi, eax
-		
+
 .yloop_width_8:
 	movq xmm0, [esi]
 	punpcklbw xmm0, xmm7
@@ -215,7 +215,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@@ -225,7 +225,7 @@
 	psllw xmm4, 2
 	paddw xmm0, xmm4
 	movdqa [edi], xmm0
-		
+
 	add esi, eax
 	add edi, edx
 	dec ebx
@@ -238,8 +238,8 @@
 ALIGN 16
 ;***********************************************************************
 ;void_t McHorVer22VerLast_sse2(
-;											uint8_t *pSrc, 
-;											int32_t pSrcStride, 
+;											uint8_t *pSrc,
+;											int32_t pSrcStride,
 ;											uint8_t * pDst,
 ;											int32_t iDstStride,
 ;											int32_t iWidth,
@@ -250,17 +250,17 @@
 	paddw  %1, %6
 	movdqa %7, %2
 	movdqa %8, %3
-	
-	
+
+
 	paddw %7, %5
 	paddw %8, %4
-	
-	psubw  %1, %7   
-	psraw   %1, 2	  
-	paddw  %1, %8   
-	psubw  %1, %7 
-	psraw   %1, 2	
-	paddw  %8, %1   
+
+	psubw  %1, %7
+	psraw   %1, 2
+	paddw  %1, %8
+	psubw  %1, %7
+	psraw   %1, 2
+	paddw  %8, %1
 	paddw  %8, [h264_mc_hc_32]
 	psraw   %8, 6
 	packuswb %8, %8
@@ -272,15 +272,15 @@
 	push edi
 	push ebx
 	push ebp
-	
+
 	mov esi, [esp+20]
 	mov eax, [esp+24]
 	mov edi, [esp+28]
 	mov edx, [esp+32]
 	mov ebx, [esp+36]
-	mov ecx, [esp+40]	
-	shr ebx, 3	
-	
+	mov ecx, [esp+40]
+	shr ebx, 3
+
 .width_loop:
 	movdqa xmm0, [esi]
 	movdqa xmm1, [esi+eax]
@@ -290,12 +290,12 @@
 	lea esi, [esi+2*eax]
 	movdqa xmm4, [esi]
 	movdqa xmm5, [esi+eax]
-	
+
 	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	lea esi, [esi+2*eax]
 	movdqa xmm6, [esi]
-	
+
 	movdqa xmm0, xmm1
 	movdqa xmm1, xmm2
 	movdqa xmm2, xmm3
@@ -302,61 +302,61 @@
 	movdqa xmm3, xmm4
 	movdqa xmm4, xmm5
 	movdqa xmm5, xmm6
-	
+
 	add edi, edx
-	sub esi, eax		
-	
+	sub esi, eax
+
 .start:
 	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm6, [esi]
 	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm7, [esi+eax]
 	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm0, [esi]
 	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm1, [esi+eax]
 	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm2, [esi]
 	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm3, [esi+eax]
 	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm4, [esi]
 	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm5, [esi+eax]
 	jmp near .start
-	
+
 .x_loop_dec:
 	dec ebx
 	jz near .exit
@@ -366,9 +366,9 @@
 	add esi, 16
 	add edi, 8
 	jmp .width_loop
-	
-	
-	
+
+
+
 .exit:
 	pop ebp
 	pop ebx
@@ -379,10 +379,10 @@
 
 ALIGN 16
 ;*******************************************************************************
-; void_t McHorVer20WidthEq8_sse2(  uint8_t *pSrc, 
-;                       int iSrcStride, 
-;												uint8_t *pDst, 
-;												int iDstStride, 
+; void_t McHorVer20WidthEq8_sse2(  uint8_t *pSrc,
+;                       int iSrcStride,
+;												uint8_t *pDst,
+;												int iDstStride,
 ;												int iHeight,
 ;                      );
 ;*******************************************************************************
@@ -389,18 +389,18 @@
 McHorVer20WidthEq8_sse2:
 	push	esi
 	push	edi
-	
+
 	mov esi, [esp + 12]         ;pSrc
 	mov eax, [esp + 16]         ;iSrcStride
 	mov edi, [esp + 20]         ;pDst
 	mov ecx, [esp + 28]         ;iHeight
 	mov edx, [esp + 24]			;iDstStride
-	
+
 	lea esi, [esi-2]            ;pSrc -= 2;
-	
+
 	pxor xmm7, xmm7
 	movdqa xmm6, [h264_w0x10_1]
-.y_loop:	
+.y_loop:
 	movq xmm0, [esi]
 	punpcklbw xmm0, xmm7
 	movq xmm1, [esi+5]
@@ -413,7 +413,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@@ -424,7 +424,7 @@
 	paddw xmm0, xmm4
 	paddw xmm0, xmm6
 	psraw xmm0, 5
-	
+
 	packuswb xmm0, xmm7
 	movq [edi], xmm0
 
@@ -432,17 +432,17 @@
 	lea esi, [esi+eax]
 	dec ecx
 	jnz near .y_loop
-	
+
 	pop edi
 	pop esi
 	ret
-	
+
 ALIGN 16
 ;*******************************************************************************
-; void_t McHorVer20WidthEq16_sse2(  uint8_t *pSrc, 
-;                       int iSrcStride, 
-;												uint8_t *pDst, 
-;												int iDstStride, 
+; void_t McHorVer20WidthEq16_sse2(  uint8_t *pSrc,
+;                       int iSrcStride,
+;												uint8_t *pDst,
+;												int iDstStride,
 ;												int iHeight,
 ;                      );
 ;*******************************************************************************
@@ -449,20 +449,20 @@
 McHorVer20WidthEq16_sse2:
 	push	esi
 	push	edi
-	
 
+
 	mov esi, [esp + 12]         ;pSrc
 	mov eax, [esp + 16]         ;iSrcStride
 	mov edi, [esp + 20]         ;pDst
 	mov ecx, [esp + 28]         ;iHeight
 	mov edx, [esp + 24]			;iDstStride
-	
+
 	lea esi, [esi-2]            ;pSrc -= 2;
-	
+
 	pxor xmm7, xmm7
 	movdqa xmm6, [h264_w0x10_1]
 .y_loop:
-	
+
 	movq xmm0, [esi]
 	punpcklbw xmm0, xmm7
 	movq xmm1, [esi+5]
@@ -475,7 +475,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@@ -501,7 +501,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3+8]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@@ -514,9 +514,9 @@
 	psraw xmm0, 5
 	packuswb xmm0, xmm7
 	movq [edi+8], xmm0
-	
-	lea edi, [edi+edx]	
-	lea esi, [esi+eax]	
+
+	lea edi, [edi+edx]
+	lea esi, [esi+eax]
 	dec ecx
 	jnz near .y_loop
 	pop edi
@@ -525,10 +525,10 @@
 
 
 ;*******************************************************************************
-; void_t McHorVer02WidthEq8_sse2( uint8_t *pSrc, 
-;                       int iSrcStride, 
-;                       uint8_t *pDst, 
-;                       int iDstStride, 
+; void_t McHorVer02WidthEq8_sse2( uint8_t *pSrc,
+;                       int iSrcStride,
+;                       uint8_t *pDst,
+;                       int iDstStride,
 ;                       int iHeight )
 ;*******************************************************************************
 ALIGN 16
@@ -535,7 +535,7 @@
 McHorVer02WidthEq8_sse2:
 	push esi
 	push edi
-	
+
 	mov esi, [esp + 12]           ;pSrc
 	mov edx, [esp + 16]	          ;iSrcStride
 	mov edi, [esp + 20]           ;pDst
@@ -546,7 +546,7 @@
 	sub esi, edx
 
 	WELS_Zero xmm7
-			
+
 	SSE_LOAD_8P xmm0, xmm7, [esi]
 	SSE_LOAD_8P xmm1, xmm7, [esi+edx]
 	lea esi, [esi+2*edx]
@@ -555,8 +555,8 @@
 	lea esi, [esi+2*edx]
 	SSE_LOAD_8P xmm4, xmm7, [esi]
 	SSE_LOAD_8P xmm5, xmm7, [esi+edx]
-	
-.start:	
+
+.start:
 	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	jz near .xx_exit
@@ -566,7 +566,7 @@
 	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [edi+eax]
 	dec ecx
 	jz near .xx_exit
-	
+
 	lea edi, [edi+2*eax]
 	SSE_LOAD_8P xmm7, xmm0, [esi+edx]
 	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
--- a/codec/decoder/core/asm/memzero.asm
+++ b/codec/decoder/core/asm/memzero.asm
@@ -32,8 +32,8 @@
 ;*  memzero.asm
 ;*
 ;*  Abstract
-;*      
 ;*
+;*
 ;*  History
 ;*      9/16/2009 Created
 ;*
@@ -47,8 +47,8 @@
 ; Code
 ;***********************************************************************
 
-SECTION .text			
-		
+SECTION .text
+
 ALIGN 16
 ;***********************************************************************
 ;_inline void __cdecl WelsPrefetchZero_mmx(int8_t const*_A);
@@ -57,7 +57,7 @@
 WelsPrefetchZero_mmx:
 	mov  eax,[esp+4]
 	prefetchnta [eax]
-	ret 			
+	ret
 
 
 ALIGN 16
@@ -69,7 +69,7 @@
 		mov		eax,	[esp + 4]          ; dst
 		mov		ecx,	[esp + 8]
 		neg		ecx
-			
+
 		pxor	xmm0,		xmm0
 .memzeroa64_sse2_loops:
 		movdqa	[eax],		xmm0
@@ -77,12 +77,12 @@
 		movdqa	[eax+32],	xmm0
 		movdqa	[eax+48],	xmm0
 		add		eax, 0x40
-		
+
 		add ecx, 0x40
 		jnz near .memzeroa64_sse2_loops
-			
-		ret	
 
+		ret
+
 ALIGN 16
 ;***********************************************************************
 ;   void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
@@ -92,7 +92,7 @@
 		mov		eax,	[esp + 4]          ; dst
 		mov		ecx,	[esp + 8]
 		neg		ecx
-			
+
 		pxor	mm0,		mm0
 .memzero64_mmx_loops:
 		movq	[eax],		mm0
@@ -102,16 +102,16 @@
 		movq	[eax+32],	mm0
 		movq	[eax+40],	mm0
 		movq	[eax+48],	mm0
-		movq	[eax+56],	mm0		
+		movq	[eax+56],	mm0
 		add		eax,		0x40
-		
+
 		add ecx, 0x40
 		jnz near .memzero64_mmx_loops
-			
-		WELSEMMS	
-		ret	
-	
-ALIGN 16		
+
+		WELSEMMS
+		ret
+
+ALIGN 16
 ;***********************************************************************
 ;   void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
 ;***********************************************************************
@@ -119,17 +119,17 @@
 WelsSetMemZeroSize8_mmx:
 		mov		eax,	[esp + 4]		; dst
 		mov		ecx,	[esp + 8]		; size
-		neg		ecx			
+		neg		ecx
 		pxor	mm0,		mm0
-		
+
 .memzero8_mmx_loops:
 		movq	[eax],		mm0
 		add		eax,		0x08
-	
+
 		add		ecx,		0x08
 		jnz near .memzero8_mmx_loops
-		
-		WELSEMMS	
-		ret	
 
-							
+		WELSEMMS
+		ret
+
+
--- a/codec/decoder/core/inc/as264_common.h
+++ b/codec/decoder/core/inc/as264_common.h
@@ -33,7 +33,7 @@
  *
  * \brief	common flag definitions
  *
- * \date	7/6/2009 Created 
+ * \date	7/6/2009 Created
  *************************************************************************************
  */
 #ifndef WELS_AS264_COMMON_H__
@@ -47,15 +47,15 @@
 
 #ifdef X86_ASM
 
-void MemZeroUnalign32Bytes(void *pSrc);
-void MemZeroAlign32Bytes(void *pSrc);
-void MemZeroUnalign16Bytes(void *pSrc);
-void MemZeroAlign16Bytes(void *pSrc);
-void MemZeroAlign128Bytes(void *pSrc);
-void MemZeroUnalign128Bytes(void *pSrc);
-void MemZeroAlign256Bytes(void *pSrc);
-void MemZeroAlign240Bytes(void *pSrc);
-void MmPrefetch0(char const *kpA);
+void MemZeroUnalign32Bytes (void* pSrc);
+void MemZeroAlign32Bytes (void* pSrc);
+void MemZeroUnalign16Bytes (void* pSrc);
+void MemZeroAlign16Bytes (void* pSrc);
+void MemZeroAlign128Bytes (void* pSrc);
+void MemZeroUnalign128Bytes (void* pSrc);
+void MemZeroAlign256Bytes (void* pSrc);
+void MemZeroAlign240Bytes (void* pSrc);
+void MmPrefetch0 (char const* kpA);
 
 #endif// X86_ASM
 
--- a/codec/decoder/core/inc/au_parser.h
+++ b/codec/decoder/core/inc/au_parser.h
@@ -48,13 +48,13 @@
 
 namespace WelsDec {
 
-/*! 
+/*!
  *************************************************************************************
  * \brief	Start Code Prefix (0x 00 00 00 01) detection
  *
  * \param 	pBuf		bitstream payload buffer
  * \param	pOffset		offset between NAL rbsp and original bitsteam that
- * 				start code prefix is seperated from. 
+ * 				start code prefix is separated from.
  * \param	iBufSize	count size of buffer
  *
  * \return	RBSP buffer of start code prefix exclusive
@@ -62,9 +62,9 @@
  * \note	N/A
  *************************************************************************************
  */
-uint8_t* DetectStartCodePrefix( const uint8_t *kpBuf, int32_t *pOffset, int32_t iBufSize );
+uint8_t* DetectStartCodePrefix (const uint8_t* kpBuf, int32_t* pOffset, int32_t iBufSize);
 
-/*! 
+/*!
  *************************************************************************************
  * \brief	to parse network abstraction layer unit,
  *			escape emulation_prevention_three_byte within it
@@ -74,31 +74,33 @@
  * \param 	pNalUnitHeader	parsed result of NAL Unit Header to output
  * \param   pSrcRbsp        bitstream buffer to input
  * \param   iSrcRbspLen     length size of bitstream buffer payload
- * \param	pSrcNal		    
- * \param	iSrcNalLen		
+ * \param	pSrcNal
+ * \param	iSrcNalLen
  * \param	pConsumedBytes	consumed bytes during parsing
  *
- * \return	decoded bytes payload, might be (pSrcRbsp+1) if no escapes 
+ * \return	decoded bytes payload, might be (pSrcRbsp+1) if no escapes
  *
  * \note	N/A
  *************************************************************************************
  */
-uint8_t* ParseNalHeader( PWelsDecoderContext pCtx, SNalUnitHeader *pNalUnitHeader, uint8_t *pSrcRbsp, int32_t iSrcRbspLen, uint8_t *pSrcNal, int32_t iSrcNalLen, int32_t* pConsumedBytes );		 
+uint8_t* ParseNalHeader (PWelsDecoderContext pCtx, SNalUnitHeader* pNalUnitHeader, uint8_t* pSrcRbsp,
+                         int32_t iSrcRbspLen, uint8_t* pSrcNal, int32_t iSrcNalLen, int32_t* pConsumedBytes);
 
-int32_t ParseNonVclNal( PWelsDecoderContext pCtx, uint8_t *pRbsp, const int32_t kiSrcLen );
+int32_t ParseNonVclNal (PWelsDecoderContext pCtx, uint8_t* pRbsp, const int32_t kiSrcLen);
 
-void_t ParseRefBasePicMarking ( PBitStringAux pBs, PRefBasePicMarking pRefBasePicMarking );
+void_t ParseRefBasePicMarking (PBitStringAux pBs, PRefBasePicMarking pRefBasePicMarking);
 
-void_t ParsePrefixNalUnit ( PWelsDecoderContext pCtx, PBitStringAux pBs );
+void_t ParsePrefixNalUnit (PWelsDecoderContext pCtx, PBitStringAux pBs);
 
-bool_t CheckAccessUnitBoundary( const PNalUnit kpCurNal, const PNalUnit kpLastNal, const PSps kpSps );
-bool_t CheckAccessUnitBoundaryExt( PNalUnitHeaderExt pLastNalHdrExt, PNalUnitHeaderExt pCurNalHeaderExt, PSliceHeader pLastSliceHeader, PSliceHeader pCurSliceHeader );
-/*! 
+bool_t CheckAccessUnitBoundary (const PNalUnit kpCurNal, const PNalUnit kpLastNal, const PSps kpSps);
+bool_t CheckAccessUnitBoundaryExt (PNalUnitHeaderExt pLastNalHdrExt, PNalUnitHeaderExt pCurNalHeaderExt,
+                                   PSliceHeader pLastSliceHeader, PSliceHeader pCurSliceHeader);
+/*!
  *************************************************************************************
  * \brief	to parse Sequence Parameter Set (SPS)
  *
  * \param	pCtx		Decoder context
- * \param	pBsAux		bitstream reader auxiliary 
+ * \param	pBsAux		bitstream reader auxiliary
  * \param	pPicWidth	picture width current Sps represented
  * \param	pPicHeight	picture height current Sps represented
  *
@@ -108,15 +110,15 @@
  * \note	Call it in case eNalUnitType is SPS.
  *************************************************************************************
  */
-int32_t ParseSps( PWelsDecoderContext pCtx, PBitStringAux pBsAux, int32_t *pPicWidth, int32_t *pPicHeight );
+int32_t ParseSps (PWelsDecoderContext pCtx, PBitStringAux pBsAux, int32_t* pPicWidth, int32_t* pPicHeight);
 
-/*! 
+/*!
  *************************************************************************************
  * \brief	to parse Picture Parameter Set (PPS)
  *
  * \param	pCtx		Decoder context
  * \param 	pPpsList	pps list
- * \param	pBsAux		bitstream reader auxiliary 
+ * \param	pBsAux		bitstream reader auxiliary
  *
  * \return	0 - successed
  *		1 - failed
@@ -124,14 +126,14 @@
  * \note	Call it in case eNalUnitType is PPS.
  *************************************************************************************
  */
-int32_t ParsePps( PWelsDecoderContext pCtx, PPps pPpsList, PBitStringAux pBsAux );
+int32_t ParsePps (PWelsDecoderContext pCtx, PPps pPpsList, PBitStringAux pBsAux);
 
-/*! 
+/*!
  *************************************************************************************
  * \brief	to parse SEI message payload
  *
  * \param 	pSei		sei message to be parsed output
- * \param	pBsAux		bitstream reader auxiliary 
+ * \param	pBsAux		bitstream reader auxiliary
  *
  * \return	0 - successed
  *		1 - failed
@@ -139,7 +141,7 @@
  * \note	Call it in case eNalUnitType is NAL_UNIT_SEI.
  *************************************************************************************
  */
-int32_t ParseSei( void_t *pSei, PBitStringAux pBsAux );	// reserved Sei_Msg type
+int32_t ParseSei (void_t* pSei, PBitStringAux pBsAux);	// reserved Sei_Msg type
 
 /*!
  *************************************************************************************
@@ -150,7 +152,7 @@
  * \return	count number of fmo context units are reset
  *************************************************************************************
  */
-int32_t ResetFmoList( PWelsDecoderContext pCtx );
+int32_t ResetFmoList (PWelsDecoderContext pCtx);
 
 } // namespace WelsDec
 
--- a/codec/decoder/core/inc/bit_stream.h
+++ b/codec/decoder/core/inc/bit_stream.h
@@ -30,7 +30,7 @@
  *
  */
 
- //bit_stream.h	-	bit-stream reading and / writing auxiliary data
+//bit_stream.h	-	bit-stream reading and / writing auxiliary data
 #ifndef WELS_BIT_STREAM_H__
 #define WELS_BIT_STREAM_H__
 
@@ -42,16 +42,16 @@
  *	Bit-stream auxiliary reading / writing
  */
 typedef struct TagBitStringAux {
-	uint8_t		*pStartBuf;	// buffer to start position
-	uint8_t		*pEndBuf;	// buffer + length
-	int32_t     iBits;       // count bits of overall bitstreaming input
+uint8_t*		pStartBuf;	// buffer to start position
+uint8_t*		pEndBuf;	// buffer + length
+int32_t     iBits;       // count bits of overall bitstreaming input
 
-	int32_t     iIndex;      //only for cavlc usage
-	uint8_t		*pCurBuf;	// current reading position	
-	uint32_t    uiCurBits;  
-	int32_t		iLeftBits;	// count number of available bits left ([1, 8]),
-	                        // need pointer to next byte start position in case 0 bit left then 8 instead
-}SBitStringAux, *PBitStringAux;
+int32_t     iIndex;      //only for cavlc usage
+uint8_t*		pCurBuf;	// current reading position
+uint32_t    uiCurBits;
+int32_t		iLeftBits;	// count number of available bits left ([1, 8]),
+// need pointer to next byte start position in case 0 bit left then 8 instead
+} SBitStringAux, *PBitStringAux;
 
 //#pragma pack()
 
@@ -64,11 +64,11 @@
  *
  * \return	size of buffer data in byte; failed in -1 return
  */
-int32_t InitBits( PBitStringAux pBitString, const uint8_t *kpBuf, const int32_t kiSize );
+int32_t InitBits (PBitStringAux pBitString, const uint8_t* kpBuf, const int32_t kiSize);
 
-void_t InitReadBits( PBitStringAux pBitString );
+void_t InitReadBits (PBitStringAux pBitString);
 
-uint32_t EndianFix(uint32_t uiX);
+uint32_t EndianFix (uint32_t uiX);
 
 
 
--- a/codec/decoder/core/inc/cpu.h
+++ b/codec/decoder/core/inc/cpu.h
@@ -55,19 +55,19 @@
  */
 int32_t  WelsCPUIdVerify();
 
-void_t WelsCPUId( uint32_t uiIndex, uint32_t *pFeatureA, uint32_t *pFeatureB, uint32_t *pFeatureC, uint32_t *pFeatureD );
+void_t WelsCPUId (uint32_t uiIndex, uint32_t* pFeatureA, uint32_t* pFeatureB, uint32_t* pFeatureC, uint32_t* pFeatureD);
 
-int32_t WelsCPUSupportAVX( uint32_t eax, uint32_t ecx );
-int32_t WelsCPUSupportFMA( uint32_t eax, uint32_t ecx );
+int32_t WelsCPUSupportAVX (uint32_t eax, uint32_t ecx);
+int32_t WelsCPUSupportFMA (uint32_t eax, uint32_t ecx);
 
 void_t WelsEmms();
 
-uint32_t WelsCPUFeatureDetect( int32_t *pNumberOfLogicProcessors );
+uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors);
 
 /*
  *	clear FPU registers states for potential float based calculation if support
  */
-void     WelsCPURestore( const uint32_t kuiCPU );
+void     WelsCPURestore (const uint32_t kuiCPU);
 
 #endif
 
--- a/codec/decoder/core/inc/cpu_core.h
+++ b/codec/decoder/core/inc/cpu_core.h
@@ -42,7 +42,7 @@
 
 /*
  *	WELS CPU feature flags
- */ 
+ */
 #define WELS_CPU_MMX        0x00000001    /* mmx */
 #define WELS_CPU_MMXEXT     0x00000002    /* mmx-ext*/
 #define WELS_CPU_SSE        0x00000004    /* sse */
--- a/codec/decoder/core/inc/crt_util_safe_x.h
+++ b/codec/decoder/core/inc/crt_util_safe_x.h
@@ -73,26 +73,26 @@
 
 #ifdef WIN32
 typedef      struct _timeb     SWelsTime;
-#else 
+#else
 typedef      struct timeb      SWelsTime;
 #endif
 
-int32_t   WelsSnprintf( str_t * buffer,  int32_t sizeOfBuffer,  const str_t * format, ... );
-str_t *  WelsStrncpy(str_t * dest, int32_t sizeInBytes, const str_t * src, int32_t count);
-str_t *  WelsStrcat(str_t * dest, int32_t sizeInBytes, str_t * src);
-int32_t   WelsStrnlen(const str_t * str,  int32_t maxlen);
-int32_t   WelsVsprintf(str_t * buffer, int32_t sizeOfBuffer, const str_t * format, va_list argptr);
+int32_t   WelsSnprintf (str_t* buffer,  int32_t sizeOfBuffer,  const str_t* format, ...);
+str_t*   WelsStrncpy (str_t* dest, int32_t sizeInBytes, const str_t* src, int32_t count);
+str_t*   WelsStrcat (str_t* dest, int32_t sizeInBytes, str_t* src);
+int32_t   WelsStrnlen (const str_t* str,  int32_t maxlen);
+int32_t   WelsVsprintf (str_t* buffer, int32_t sizeOfBuffer, const str_t* format, va_list argptr);
 
-WelsFileHandle      *  WelsFopen(const str_t * filename,  const str_t * mode);
-int32_t                WelsFclose(WelsFileHandle  * fp);
-int32_t                WelsFread(void * buffer, int32_t size, int32_t count, WelsFileHandle * fp);
-int32_t                WelsFwrite(const void * buffer, int32_t size, int32_t count, WelsFileHandle * fp);
-int32_t                WelsFseek(WelsFileHandle * fp, int32_t offset, int32_t origin);
-int32_t                WelsFflush(WelsFileHandle * fp);
+WelsFileHandle*        WelsFopen (const str_t* filename,  const str_t* mode);
+int32_t                WelsFclose (WelsFileHandle*   fp);
+int32_t                WelsFread (void* buffer, int32_t size, int32_t count, WelsFileHandle* fp);
+int32_t                WelsFwrite (const void* buffer, int32_t size, int32_t count, WelsFileHandle* fp);
+int32_t                WelsFseek (WelsFileHandle* fp, int32_t offset, int32_t origin);
+int32_t                WelsFflush (WelsFileHandle* fp);
 
-int32_t                WelsGetTimeOfDay(SWelsTime * tp);
-int32_t                WelsStrftime(str_t * buffer, int32_t size, const str_t * format, const SWelsTime * tp);
-uint16_t               WelsGetMillsecond(const SWelsTime * tp);
+int32_t                WelsGetTimeOfDay (SWelsTime* tp);
+int32_t                WelsStrftime (str_t* buffer, int32_t size, const str_t* format, const SWelsTime* tp);
+uint16_t               WelsGetMillsecond (const SWelsTime* tp);
 
 
 #ifdef __cplusplus
--- a/codec/decoder/core/inc/deblocking.h
+++ b/codec/decoder/core/inc/deblocking.h
@@ -48,7 +48,7 @@
 namespace WelsDec {
 
 /*!
- * \brief	deblocking module initialize 
+ * \brief	deblocking module initialize
  *
  * \param	pf
  *          cpu
@@ -56,7 +56,7 @@
  * \return	NONE
  */
 
-void_t  DeblockingInit( PDeblockingFunc pDeblockingFunc,  int32_t iCpu );
+void_t  DeblockingInit (PDeblockingFunc pDeblockingFunc,  int32_t iCpu);
 
 
 /*!
@@ -66,7 +66,7 @@
  *
  * \return	NONE
  */
-void_t WelsDeblockingFilterSlice( PWelsDecoderContext pCtx, PDeblockingFilterMbFunc pDeblockMb );
+void_t WelsDeblockingFilterSlice (PWelsDecoderContext pCtx, PDeblockingFilterMbFunc pDeblockMb);
 
 /*!
  * \brief	pixel deblocking filtering
@@ -79,26 +79,28 @@
  * \return	NONE
  */
 
-uint32_t DeblockingBsMarginalMBAvcbase( PDqLayer pCurDqLayer, int32_t iEdge, int32_t iNeighMb, int32_t iMbXy);
+uint32_t DeblockingBsMarginalMBAvcbase (PDqLayer pCurDqLayer, int32_t iEdge, int32_t iNeighMb, int32_t iMbXy);
 
-int32_t DeblockingAvailableNoInterlayer( PDqLayer pCurDqLayer, int32_t iFilterIdc );
+int32_t DeblockingAvailableNoInterlayer (PDqLayer pCurDqLayer, int32_t iFilterIdc);
 
-void_t DeblockingIntraMb( PDqLayer pCurDqLayer, PDeblockingFilter  pFilter, int32_t iBoundryFlag );
-void_t DeblockingInterMb( PDqLayer pCurDqLayer, PDeblockingFilter  pFilter, uint8_t nBS[2][4][4], int32_t iBoundryFlag );
+void_t DeblockingIntraMb (PDqLayer pCurDqLayer, PDeblockingFilter  pFilter, int32_t iBoundryFlag);
+void_t DeblockingInterMb (PDqLayer pCurDqLayer, PDeblockingFilter  pFilter, uint8_t nBS[2][4][4], int32_t iBoundryFlag);
 
-void_t WelsDeblockingMb( PDqLayer pCurDqLayer, PDeblockingFilter  pFilter, int32_t iBoundryFlag );
+void_t WelsDeblockingMb (PDqLayer pCurDqLayer, PDeblockingFilter  pFilter, int32_t iBoundryFlag);
 
-void_t DeblockLumaLt4V_c( uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTc );
-void_t DeblockLumaEq4V_c( uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta );
+void_t DeblockLumaLt4V_c (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void_t DeblockLumaEq4V_c (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
 
-void_t DeblockLumaLt4H_c( uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTc );
-void_t DeblockLumaEq4H_c( uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta );
+void_t DeblockLumaLt4H_c (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void_t DeblockLumaEq4H_c (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
 
-void_t DeblockChromaLt4V_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTc );
-void_t DeblockChromaEq4V_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta );
+void_t DeblockChromaLt4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+                            int8_t* pTc);
+void_t DeblockChromaEq4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
 
-void_t DeblockChromaLt4H_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTc );
-void_t DeblockChromaEq4H_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta );
+void_t DeblockChromaLt4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+                            int8_t* pTc);
+void_t DeblockChromaEq4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
 
 #if defined(__cplusplus)
 extern "C" {
@@ -105,16 +107,18 @@
 #endif//__cplusplus
 
 #ifdef  X86_ASM
-void DeblockLumaLt4V_sse2( uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTc );
-void DeblockLumaEq4V_sse2( uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta );
-void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
-void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
-void DeblockLumaLt4H_sse2(uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTc);
-void DeblockLumaEq4H_sse2(uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+void DeblockLumaLt4V_sse2 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4V_sse2 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockLumaTransposeH2V_sse2 (uint8_t* pPixY, int32_t iStride, uint8_t* pDst);
+void DeblockLumaTransposeV2H_sse2 (uint8_t* pPixY, int32_t iStride, uint8_t* pSrc);
+void DeblockLumaLt4H_sse2 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4H_sse2 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaEq4V_sse2 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaLt4V_sse2 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+                             int8_t* pTC);
+void DeblockChromaEq4H_sse2 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaLt4H_sse2 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+                             int8_t* pTC);
 #endif
 #if defined(__cplusplus)
 }
--- a/codec/decoder/core/inc/dec_frame.h
+++ b/codec/decoder/core/inc/dec_frame.h
@@ -49,92 +49,92 @@
 ///////////////////////////////////DQ Layer level///////////////////////////////////
 typedef struct TagDqLayer	SDqLayer;
 typedef SDqLayer*			PDqLayer;
-typedef struct TagLayerInfo{
-	SNalUnitHeaderExt		sNalHeaderExt;
-	SSlice					sSliceInLayer;	// Here Slice identify to Frame on concept	
-	PSubsetSps				pSubsetSps;	// current pSubsetSps used, memory alloc in external
-	PSps					pSps;		// current sps based avc used, memory alloc in external
-	PPps					pPps;		// current pps used
+typedef struct TagLayerInfo {
+  SNalUnitHeaderExt		sNalHeaderExt;
+  SSlice					sSliceInLayer;	// Here Slice identify to Frame on concept
+  PSubsetSps				pSubsetSps;	// current pSubsetSps used, memory alloc in external
+  PSps					pSps;		// current sps based avc used, memory alloc in external
+  PPps					pPps;		// current pps used
 } SLayerInfo, *PLayerInfo;
 /* Layer Representation */
 
-struct TagDqLayer{
-	SLayerInfo			sLayerInfo;	
-		
-	uint8_t				*pCsData[3];	// pointer to reconstructed picture data
-	int32_t				iCsStride[3];	// Cs stride
-	PBitStringAux		pBitStringAux;	// pointer to SBitStringAux
-	PFmo				pFmo;		// Current fmo context pointer used
-	int8_t  *pMbType;
-	int32_t *pSliceIdc;				// using int32_t for slice_idc
-	int16_t	(*pMv[LIST_A])[MB_BLOCK4x4_NUM][MV_A];
-	int8_t	(*pRefIndex[LIST_A])[MB_BLOCK4x4_NUM]; 
-	int8_t  *pLumaQp;
-	int8_t  *pChromaQp;
-	int8_t  *pCbp;
-	int8_t  (*pNzc)[24];
-	int8_t  (*pNzcRs)[24];
-	int8_t  *pResidualPredFlag;
-	int8_t  *pInterPredictionDoneFlag;
-	int16_t (*pScaledTCoeff)[MB_COEFF_LIST_SIZE];
-	int8_t  (*pIntraPredMode)[8]; //0~3 top4x4 ; 4~6 left 4x4; 7 intra16x16
-	int8_t  (*pIntra4x4FinalMode)[MB_BLOCK4x4_NUM];
-	int8_t  *pChromaPredMode;
-	//uint8_t (*motion_pred_flag[LIST_A])[MB_PARTITION_SIZE]; // 8x8
-	int8_t  (*pSubMbType)[MB_SUB_PARTITION_SIZE];
-	int32_t iLumaStride;
-	int32_t iChromaStride;
-	uint8_t *pPred[3];
-	int32_t iMbX;
-	int32_t iMbY;
-	int32_t iMbXyIndex;
-	int32_t	iMbWidth;		// MB width of this picture, equal to sSps.iMbWidth
-	int32_t	iMbHeight;		// MB height of this picture, equal to sSps.iMbHeight;
+struct TagDqLayer {
+  SLayerInfo			sLayerInfo;
 
-	/* Common syntax elements across all slices of a DQLayer */
-	int32_t					iSliceIdcBackup;
-	uint32_t				uiSpsId;
-	uint32_t				uiPpsId;
-	uint32_t				uiDisableInterLayerDeblockingFilterIdc;
-	int32_t					iInterLayerSliceAlphaC0Offset;
-	int32_t					iInterLayerSliceBetaOffset;	
-	//SPosOffset			sScaledRefLayer;
-	int32_t					iSliceGroupChangeCycle;
-	PRefPicListReorderSyn	pRefPicListReordering;
-	PRefPicMarking          pRefPicMarking; // Decoded reference picture marking syntaxs
-	PRefBasePicMarking	    pRefPicBaseMarking;
+  uint8_t*				pCsData[3];	// pointer to reconstructed picture data
+  int32_t				iCsStride[3];	// Cs stride
+  PBitStringAux		pBitStringAux;	// pointer to SBitStringAux
+  PFmo				pFmo;		// Current fmo context pointer used
+  int8_t*  pMbType;
+  int32_t* pSliceIdc;				// using int32_t for slice_idc
+  int16_t	(*pMv[LIST_A])[MB_BLOCK4x4_NUM][MV_A];
+  int8_t	(*pRefIndex[LIST_A])[MB_BLOCK4x4_NUM];
+  int8_t*  pLumaQp;
+  int8_t*  pChromaQp;
+  int8_t*  pCbp;
+  int8_t (*pNzc)[24];
+  int8_t (*pNzcRs)[24];
+  int8_t*  pResidualPredFlag;
+  int8_t*  pInterPredictionDoneFlag;
+  int16_t (*pScaledTCoeff)[MB_COEFF_LIST_SIZE];
+  int8_t (*pIntraPredMode)[8];  //0~3 top4x4 ; 4~6 left 4x4; 7 intra16x16
+  int8_t (*pIntra4x4FinalMode)[MB_BLOCK4x4_NUM];
+  int8_t*  pChromaPredMode;
+  //uint8_t (*motion_pred_flag[LIST_A])[MB_PARTITION_SIZE]; // 8x8
+  int8_t (*pSubMbType)[MB_SUB_PARTITION_SIZE];
+  int32_t iLumaStride;
+  int32_t iChromaStride;
+  uint8_t* pPred[3];
+  int32_t iMbX;
+  int32_t iMbY;
+  int32_t iMbXyIndex;
+  int32_t	iMbWidth;		// MB width of this picture, equal to sSps.iMbWidth
+  int32_t	iMbHeight;		// MB height of this picture, equal to sSps.iMbHeight;
 
-	PPicture				pRef;			// reference picture pointer
-	PPicture				pDec;			// reconstruction picture pointer for layer
+  /* Common syntax elements across all slices of a DQLayer */
+  int32_t					iSliceIdcBackup;
+  uint32_t				uiSpsId;
+  uint32_t				uiPpsId;
+  uint32_t				uiDisableInterLayerDeblockingFilterIdc;
+  int32_t					iInterLayerSliceAlphaC0Offset;
+  int32_t					iInterLayerSliceBetaOffset;
+  //SPosOffset			sScaledRefLayer;
+  int32_t					iSliceGroupChangeCycle;
+  PRefPicListReorderSyn	pRefPicListReordering;
+  PRefPicMarking          pRefPicMarking; // Decoded reference picture marking syntax
+  PRefBasePicMarking	    pRefPicBaseMarking;
 
-	bool_t					bStoreRefBasePicFlag;				// iCurTid == 0 && iCurQid = 0 && bEncodeKeyPic = 1
-	bool_t					bTCoeffLevelPredFlag;
-	bool_t					bConstrainedIntraResamplingFlag;
-	uint8_t					uiRefLayerDqId;
-	uint8_t					uiRefLayerChromaPhaseXPlus1Flag;
-	uint8_t					uiRefLayerChromaPhaseYPlus1;
-	uint8_t					uiLayerDqId;			// dq_id of current layer
-	bool_t					bUseRefBasePicFlag;	// whether reference pic or reference base pic is referred?
+  PPicture				pRef;			// reference picture pointer
+  PPicture				pDec;			// reconstruction picture pointer for layer
+
+  bool_t					bStoreRefBasePicFlag;				// iCurTid == 0 && iCurQid == 0 && bEncodeKeyPic == 1
+  bool_t					bTCoeffLevelPredFlag;
+  bool_t					bConstrainedIntraResamplingFlag;
+  uint8_t					uiRefLayerDqId;
+  uint8_t					uiRefLayerChromaPhaseXPlus1Flag;
+  uint8_t					uiRefLayerChromaPhaseYPlus1;
+  uint8_t					uiLayerDqId;			// dq_id of current layer
+  bool_t					bUseRefBasePicFlag;	// whether reference pic or reference base pic is referred?
 };
 
-typedef struct TagGpuAvcLayer{
-	SLayerInfo				sLayerInfo;	
-	PBitStringAux			pBitStringAux;	// pointer to SBitStringAux
+typedef struct TagGpuAvcLayer {
+  SLayerInfo				sLayerInfo;
+  PBitStringAux			pBitStringAux;	// pointer to SBitStringAux
 
-	int8_t					*pMbType;
-	int32_t					*pSliceIdc;	// using int32_t for slice_idc
-	int8_t					*pLumaQp;
-	int8_t					*pCbp;
-	int8_t					(*pNzc)[24];
-	int8_t					(*pIntraPredMode)[8]; //0~3 top4x4 ; 4~6 left 4x4; 7 intra16x16
+  int8_t*					pMbType;
+  int32_t*					pSliceIdc;	// using int32_t for slice_idc
+  int8_t*					pLumaQp;
+  int8_t*					pCbp;
+  int8_t	(*pNzc)[24];
+  int8_t	(*pIntraPredMode)[8];     //0~3 top4x4 ; 4~6 left 4x4; 7 intra16x16
 
-	int32_t					iMbX;
-	int32_t					iMbY;
-	int32_t					iMbXyIndex;
-	int32_t					iMbWidth;		// MB width of this picture, equal to sSps.iMbWidth
-	int32_t					iMbHeight;		// MB height of this picture, equal to sSps.iMbHeight;
+  int32_t					iMbX;
+  int32_t					iMbY;
+  int32_t					iMbXyIndex;
+  int32_t					iMbWidth;		// MB width of this picture, equal to sSps.iMbWidth
+  int32_t					iMbHeight;		// MB height of this picture, equal to sSps.iMbHeight;
 
-}SGpuAvcDqLayer, *PGpuAvcDqLayer;
+} SGpuAvcDqLayer, *PGpuAvcDqLayer;
 
 ///////////////////////////////////////////////////////////////////////
 
--- a/codec/decoder/core/inc/dec_golomb.h
+++ b/codec/decoder/core/inc/dec_golomb.h
@@ -52,33 +52,30 @@
 	iCurBits |= ((pBufPtr[0] << 8) | pBufPtr[1]) << (iLeftBits); \
 	iLeftBits -= 16; \
 	pBufPtr +=2; \
-} 
+}
 #define NEED_BITS(iCurBits, pBufPtr, iLeftBits) { \
 	if( iLeftBits > 0 ) { \
 	GET_WORD(iCurBits, pBufPtr, iLeftBits); \
 	} \
-} 
-#define UBITS(iCurBits, iNumBits) (iCurBits>>(32-(iNumBits)))  
+}
+#define UBITS(iCurBits, iNumBits) (iCurBits>>(32-(iNumBits)))
 #define DUMP_BITS(iCurBits, pBufPtr, iLeftBits, iNumBits) { \
 	iCurBits <<= (iNumBits); \
 	iLeftBits += (iNumBits); \
 	NEED_BITS(iCurBits, pBufPtr, iLeftBits); \
-}  
+}
 
-static inline int32_t ShowBits( PBitStringAux pBs, int32_t iNumBits )
-{
-	return UBITS( pBs->uiCurBits, iNumBits );
-} 
-static inline void_t FlushBits( PBitStringAux pBs, int32_t iNumBits )
-{
-	DUMP_BITS( pBs->uiCurBits, pBs->pCurBuf, pBs->iLeftBits, iNumBits );
-} 
-static inline int32_t BsGetBits( PBitStringAux pBs, int32_t iNumBits )
-{
-	int32_t iRc = UBITS( pBs->uiCurBits, iNumBits );
-	DUMP_BITS( pBs->uiCurBits, pBs->pCurBuf, pBs->iLeftBits, iNumBits );
-	return iRc;
-}   
+static inline int32_t ShowBits (PBitStringAux pBs, int32_t iNumBits) {
+return UBITS (pBs->uiCurBits, iNumBits);
+}
+static inline void_t FlushBits (PBitStringAux pBs, int32_t iNumBits) {
+DUMP_BITS (pBs->uiCurBits, pBs->pCurBuf, pBs->iLeftBits, iNumBits);
+}
+static inline int32_t BsGetBits (PBitStringAux pBs, int32_t iNumBits) {
+int32_t iRc = UBITS (pBs->uiCurBits, iNumBits);
+DUMP_BITS (pBs->uiCurBits, pBs->pCurBuf, pBs->iLeftBits, iNumBits);
+return iRc;
+}
 
 /*
  *	Exponential Golomb codes decoding routines
@@ -90,152 +87,129 @@
 
 extern const uint8_t g_kuiLeadingZeroTable[256];
 
-static const uint32_t g_kuiPrefix8BitsTable[16] =
-{
-	0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3
+static const uint32_t g_kuiPrefix8BitsTable[16] = {
+0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3
 };
 
 
-static inline uint32_t GetPrefixBits(uint32_t uiValue)
-{
-	uint32_t iNumBit = 0;	
+static inline uint32_t GetPrefixBits (uint32_t uiValue) {
+uint32_t iNumBit = 0;
 
-	if (uiValue & 0xffff0000) 
-	{
-		uiValue >>= 16;
-		iNumBit += 16;
-	}
-	if (uiValue & 0xff00) 
-	{
-		uiValue >>= 8;
-		iNumBit += 8;
-	}
+if (uiValue & 0xffff0000) {
+  uiValue >>= 16;
+  iNumBit += 16;
+}
+if (uiValue & 0xff00) {
+  uiValue >>= 8;
+  iNumBit += 8;
+}
 
-	if (uiValue & 0xf0)
-	{
-		uiValue >>= 4;
-		iNumBit += 4;
-	}
-	iNumBit += g_kuiPrefix8BitsTable[uiValue];
+if (uiValue & 0xf0) {
+  uiValue >>= 4;
+  iNumBit += 4;
+}
+iNumBit += g_kuiPrefix8BitsTable[uiValue];
 
-	return (32-iNumBit);
+return (32 - iNumBit);
 }
 
 /*
  *	Read one bit from bit stream followed
  */
-static inline uint32_t BsGetOneBit(PBitStringAux pBs)
-{
-	return ( BsGetBits(pBs, 1) );
+static inline uint32_t BsGetOneBit (PBitStringAux pBs) {
+return (BsGetBits (pBs, 1));
 }
 
-static inline int32_t GetLeadingZeroBits( uint32_t iCurBits ) //<=16 bits 
-{
-	int32_t  iValue; 
+static inline int32_t GetLeadingZeroBits (uint32_t iCurBits) { //<=16 bits
+int32_t  iValue;
 
-	iValue = UBITS( iCurBits, 8 );//ShowBits( bs, 8 );
-	if( iValue )
-	{
-		return g_kuiLeadingZeroTable[iValue];
-	}
+iValue = UBITS (iCurBits, 8); //ShowBits( bs, 8 );
+if (iValue) {
+  return g_kuiLeadingZeroTable[iValue];
+}
 
-	iValue = UBITS( iCurBits, 16 );//ShowBits( bs, 16 );
-	if( iValue )
-	{
-		return (g_kuiLeadingZeroTable[iValue] + 8);
-	}
+iValue = UBITS (iCurBits, 16); //ShowBits( bs, 16 );
+if (iValue) {
+  return (g_kuiLeadingZeroTable[iValue] + 8);
+}
 
-	//ASSERT(FALSE);  // should not go here
-	return -1;
+//ASSERT(FALSE);  // should not go here
+return -1;
 }
 
-static inline uint32_t BsGetUe( PBitStringAux pBs )
-{
-	uint32_t iValue = 0;
-	int32_t  iLeadingZeroBits = GetLeadingZeroBits( pBs->uiCurBits );
+static inline uint32_t BsGetUe (PBitStringAux pBs) {
+uint32_t iValue = 0;
+int32_t  iLeadingZeroBits = GetLeadingZeroBits (pBs->uiCurBits);
 
-	if ( iLeadingZeroBits == -1 ) //bistream error
-	{
-		return 0xffffffff;//-1
-	}	
-	
-	DUMP_BITS( pBs->uiCurBits, pBs->pCurBuf, pBs->iLeftBits, iLeadingZeroBits + 1 );
+if (iLeadingZeroBits == -1) { //bitstream error
+  return 0xffffffff;//-1
+}
 
-	if( iLeadingZeroBits )
-	{
-		iValue = UBITS( pBs->uiCurBits, iLeadingZeroBits );
-		DUMP_BITS( pBs->uiCurBits, pBs->pCurBuf, pBs->iLeftBits, iLeadingZeroBits );
-	}
+DUMP_BITS (pBs->uiCurBits, pBs->pCurBuf, pBs->iLeftBits, iLeadingZeroBits + 1);
 
-	return ((1<<iLeadingZeroBits) - 1 + iValue);		
+if (iLeadingZeroBits) {
+  iValue = UBITS (pBs->uiCurBits, iLeadingZeroBits);
+  DUMP_BITS (pBs->uiCurBits, pBs->pCurBuf, pBs->iLeftBits, iLeadingZeroBits);
 }
 
+return ((1 << iLeadingZeroBits) - 1 + iValue);
+}
 
+
 /*
  *	Read signed exp golomb codes
  */
-static inline int32_t BsGetSe(PBitStringAux pBs)
-{
-	uint32_t uiCodeNum;
-	
-	uiCodeNum = BsGetUe( pBs );	
+static inline int32_t BsGetSe (PBitStringAux pBs) {
+uint32_t uiCodeNum;
 
-	if(uiCodeNum&0x01)							
-	{
-		return (int32_t)((uiCodeNum+1)>>1);		
-	}
-	else      
-	{
-		return NEG_NUM( (int32_t)(uiCodeNum>>1) );
-	}
+uiCodeNum = BsGetUe (pBs);
+
+if (uiCodeNum & 0x01) {
+  return (int32_t) ((uiCodeNum + 1) >> 1);
+} else {
+  return NEG_NUM ((int32_t) (uiCodeNum >> 1));
 }
+}
 
 /*
  *	Read truncated exp golomb codes
  */
-static inline uint32_t BsGetTe(PBitStringAux pBs, uint8_t uiRange)
-{
-	if ( 1 == uiRange )
-	{
-		return BsGetOneBit(pBs)^1;
-	}
-	else
-	{	
-		return BsGetUe(pBs);
-	}
+static inline uint32_t BsGetTe (PBitStringAux pBs, uint8_t uiRange) {
+if (1 == uiRange) {
+  return BsGetOneBit (pBs) ^ 1;
+} else {
+  return BsGetUe (pBs);
 }
+}
 
 /*
  * Get unsigned truncated exp golomb code.
  */
-static inline int32_t BsGetTe0(PBitStringAux pBs, int32_t iRange)
-{
-	if(iRange==1)
-		return 0;
-	else if(iRange==2)
-		return BsGetOneBit(pBs)^1;
-	else
-		return BsGetUe(pBs);
+static inline int32_t BsGetTe0 (PBitStringAux pBs, int32_t iRange) {
+if (iRange == 1)
+  return 0;
+else if (iRange == 2)
+  return BsGetOneBit (pBs) ^ 1;
+else
+  return BsGetUe (pBs);
 }
 
 /*
  *	Get number of trailing bits
  */
-static inline int32_t BsGetTrailingBits( uint8_t *pBuf )
-{
-	// TODO
-	uint32_t uiValue = *pBuf;
-    int32_t iRetNum = 1;
-	
-	do 
-	{
-		if (uiValue&1)
-			return iRetNum;
-		uiValue >>= 1;
-		++ iRetNum;
-	} while(iRetNum < 9);
-	
-	return 0;
+static inline int32_t BsGetTrailingBits (uint8_t* pBuf) {
+// TODO
+uint32_t uiValue = *pBuf;
+int32_t iRetNum = 1;
+
+do {
+  if (uiValue & 1)
+    return iRetNum;
+  uiValue >>= 1;
+  ++ iRetNum;
+} while (iRetNum < 9);
+
+return 0;
 }
 
 } // namespace WelsDec
--- a/codec/decoder/core/inc/decode_mb_aux.h
+++ b/codec/decoder/core/inc/decode_mb_aux.h
@@ -38,9 +38,9 @@
 
 namespace WelsDec {
 
-void_t InitDctClipTable(void_t);
+void_t InitDctClipTable (void_t);
 
-void_t IdctResAddPred_c(uint8_t *pPred, const int32_t kiStride, int16_t *pRs);
+void_t IdctResAddPred_c (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
 
 #if defined(__cplusplus)
 extern "C" {
@@ -47,7 +47,7 @@
 #endif//__cplusplus
 
 #if defined(X86_ASM)
-    void_t IdctResAddPred_mmx(uint8_t *pPred, const int32_t kiStride, int16_t *pRs);
+void_t IdctResAddPred_mmx (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
 #endif//X86_ASM
 
 #if defined(__cplusplus)
@@ -54,7 +54,7 @@
 }
 #endif//__cplusplus
 
-void_t GetI4LumaIChromaAddrTable(int32_t *pBlockOffset, const int32_t kiYStride, const int32_t kiUVStride);
+void_t GetI4LumaIChromaAddrTable (int32_t* pBlockOffset, const int32_t kiYStride, const int32_t kiUVStride);
 
 } // namespace WelsDec
 
--- a/codec/decoder/core/inc/decode_slice.h
+++ b/codec/decoder/core/inc/decode_slice.h
@@ -37,32 +37,32 @@
 
 namespace WelsDec {
 
-void_t WelsBlockInit(int16_t* pBlock, int32_t iWidth, int32_t iHeight, int32_t iStride, uint8_t uiVal);
+void_t WelsBlockInit (int16_t* pBlock, int32_t iWidth, int32_t iHeight, int32_t iStride, uint8_t uiVal);
 
-int32_t WelsActualDecodeMbCavlcISlice  (PWelsDecoderContext pCtx);
-int32_t WelsDecodeMbCavlcISlice        (PWelsDecoderContext pCtx, PNalUnit pNalCur);
+int32_t WelsActualDecodeMbCavlcISlice (PWelsDecoderContext pCtx);
+int32_t WelsDecodeMbCavlcISlice (PWelsDecoderContext pCtx, PNalUnit pNalCur);
 
-int32_t WelsActualDecodeMbCavlcPSlice  (PWelsDecoderContext pCtx);
-int32_t WelsDecodeMbCavlcPSlice        (PWelsDecoderContext pCtx, PNalUnit pNalCur);
+int32_t WelsActualDecodeMbCavlcPSlice (PWelsDecoderContext pCtx);
+int32_t WelsDecodeMbCavlcPSlice (PWelsDecoderContext pCtx, PNalUnit pNalCur);
 typedef int32_t (*PWelsDecMbCavlcFunc) (PWelsDecoderContext pCtx, PNalUnit pNalCur);
 
-int32_t WelsTargetSliceConstruction(PWelsDecoderContext pCtx); //construction based on slice
+int32_t WelsTargetSliceConstruction (PWelsDecoderContext pCtx); //construction based on slice
 
-int32_t WelsDecodeSlice(PWelsDecoderContext pCtx, bool_t bFirstSliceInLayer, PNalUnit pNalCur);
+int32_t WelsDecodeSlice (PWelsDecoderContext pCtx, bool_t bFirstSliceInLayer, PNalUnit pNalCur);
 
 
-int32_t WelsTargetMbConstruction(PWelsDecoderContext pCtx);
+int32_t WelsTargetMbConstruction (PWelsDecoderContext pCtx);
 
-int32_t WelsMbIntraPredictionConstruction(PWelsDecoderContext pCtx, PDqLayer pCurLayer, bool_t bOutput);
-int32_t WelsMbInterSampleConstruction( PWelsDecoderContext pCtx, PDqLayer pCurLayer, 
-											  uint8_t* pDstY, uint8_t* pDstU, uint8_t* pDstV, int32_t iStrideL, int32_t iStrideC );
-int32_t WelsMbInterConstruction(PWelsDecoderContext pCtx, PDqLayer pCurLayer);
-void_t WelsLumaDcDequantIdct(int16_t *pBlock, int32_t iQp);
-int32_t WelsMbInterPrediction  (PWelsDecoderContext pCtx, PDqLayer pCurLayer);
-void_t WelsMbCopy( uint8_t *pDst, int32_t iStrideDst, uint8_t *pSrc, int32_t iStrideSrc, 
-				 int32_t iHeight, int32_t iWidth );
+int32_t WelsMbIntraPredictionConstruction (PWelsDecoderContext pCtx, PDqLayer pCurLayer, bool_t bOutput);
+int32_t WelsMbInterSampleConstruction (PWelsDecoderContext pCtx, PDqLayer pCurLayer,
+                                       uint8_t* pDstY, uint8_t* pDstU, uint8_t* pDstV, int32_t iStrideL, int32_t iStrideC);
+int32_t WelsMbInterConstruction (PWelsDecoderContext pCtx, PDqLayer pCurLayer);
+void_t WelsLumaDcDequantIdct (int16_t* pBlock, int32_t iQp);
+int32_t WelsMbInterPrediction (PWelsDecoderContext pCtx, PDqLayer pCurLayer);
+void_t WelsMbCopy (uint8_t* pDst, int32_t iStrideDst, uint8_t* pSrc, int32_t iStrideSrc,
+                   int32_t iHeight, int32_t iWidth);
 
-void_t WelsChromaDcIdct( int16_t *pBlock );
+void_t WelsChromaDcIdct (int16_t* pBlock);
 
 #ifdef __cplusplus
 extern "C" {
@@ -70,7 +70,7 @@
 
 #ifdef  X86_ASM
 void_t WelsResBlockZero16x16_sse2 (int16_t* pBlock, int32_t iStride);
-void_t WelsResBlockZero8x8_sse2   (int16_t* pBlock, int32_t iStride);
+void_t WelsResBlockZero8x8_sse2 (int16_t* pBlock, int32_t iStride);
 #endif
 
 #ifdef __cplusplus
@@ -77,11 +77,11 @@
 }
 #endif//__cplusplus
 
-void_t WelsBlockZero16x16_c(int16_t * pBlock, int32_t iStride);
-void_t WelsBlockZero8x8_c  (int16_t * pBlock, int32_t iStride);
-void_t SetNonZeroCount_c   (int16_t * pBlock, int8_t * pNonZeroCount);
+void_t WelsBlockZero16x16_c (int16_t* pBlock, int32_t iStride);
+void_t WelsBlockZero8x8_c (int16_t* pBlock, int32_t iStride);
+void_t SetNonZeroCount_c (int16_t* pBlock, int8_t* pNonZeroCount);
 
-void_t WelsBlockFuncInit(SBlockFunc *pFunc,  int32_t iCpu);
+void_t WelsBlockFuncInit (SBlockFunc* pFunc,  int32_t iCpu);
 
 } // namespace WelsDec
 
--- a/codec/decoder/core/inc/decoder.h
+++ b/codec/decoder/core/inc/decoder.h
@@ -50,15 +50,15 @@
 #endif//__cplusplus
 
 /*!
- * \brief	configure decoder parameters	
+ * \brief	configure decoder parameters
  */
-int32_t DecoderConfigParam ( PWelsDecoderContext pCtx, const void_t* kpParam );
+int32_t DecoderConfigParam (PWelsDecoderContext pCtx, const void_t* kpParam);
 
-/*! 
+/*!
  *************************************************************************************
  * \brief	Initialize Wels decoder parameters and memory
  *
- * \param 	pCtx	        input context to be initialized at first stage 
+ * \param 	pCtx	        input context to be initialized at first stage
  * \param   pTraceHandle    handle for trace
  * \param   pLo             log info pointer
  *
@@ -68,13 +68,13 @@
  * \note	N/A
  *************************************************************************************
  */
-int32_t WelsInitDecoder( PWelsDecoderContext pCtx,  void_t * pTraceHandle, PWelsLogCallbackFunc pLog );
+int32_t WelsInitDecoder (PWelsDecoderContext pCtx,  void_t* pTraceHandle, PWelsLogCallbackFunc pLog);
 
-/*! 
+/*!
  *************************************************************************************
  * \brief	Uninitialize Wels decoder parameters and memory
  *
- * \param 	pCtx	input context to be uninitialized at release stage 
+ * \param 	pCtx	input context to be uninitialized at release stage
  *
  * \return	NONE
  *
@@ -81,9 +81,9 @@
  * \note	N/A
  *************************************************************************************
  */
-void_t WelsEndDecoder( PWelsDecoderContext pCtx );
+void_t WelsEndDecoder (PWelsDecoderContext pCtx);
 
-/*! 
+/*!
  *************************************************************************************
  * \brief	First entrance to decoding core interface.
  *
@@ -100,24 +100,24 @@
  *************************************************************************************
  */
 
-int32_t WelsDecodeBs( PWelsDecoderContext pCtx, const uint8_t *kpBsBuf, const int32_t kiBsLen, 
-					   uint8_t **ppDst, SBufferInfo* pDstBufInfo);
+int32_t WelsDecodeBs (PWelsDecoderContext pCtx, const uint8_t* kpBsBuf, const int32_t kiBsLen,
+                      uint8_t** ppDst, SBufferInfo* pDstBufInfo);
 
 /*
  *	request memory blocks for decoder avc part
  */
-int32_t WelsRequestMem( PWelsDecoderContext pCtx, const int32_t kiMbWidth, const int32_t kiMbHeight );
+int32_t WelsRequestMem (PWelsDecoderContext pCtx, const int32_t kiMbWidth, const int32_t kiMbHeight);
 
 
 /*
  *	free memory blocks in avc
  */
-void_t WelsFreeMem( PWelsDecoderContext pCtx );
+void_t WelsFreeMem (PWelsDecoderContext pCtx);
 
 /*
  * set colorspace format in decoder
  */
-int32_t DecoderSetCsp(PWelsDecoderContext pCtx, const int32_t kiColorFormat);
+int32_t DecoderSetCsp (PWelsDecoderContext pCtx, const int32_t kiColorFormat);
 
 /*!
  * \brief	make sure synchonozization picture resolution (get from slice header) among different parts (i.e, memory related and so on)
@@ -125,22 +125,22 @@
  * ( MB coordinate and parts of data within decoder context structure )
  * \param	pCtx		Wels decoder context
  * \param	iMbWidth	MB width
- * \pram	iMbHeight	MB height 
+ * \param	iMbHeight	MB height
  * \return	0 - successful; none 0 - something wrong
  */
-int32_t SyncPictureResolutionExt( PWelsDecoderContext pCtx, const int32_t kiMbWidth, const int32_t kiMbHeight );
+int32_t SyncPictureResolutionExt (PWelsDecoderContext pCtx, const int32_t kiMbWidth, const int32_t kiMbHeight);
 
 /*!
  * \brief	update maximal picture width and height if applicable when receiving a SPS NAL
  */
-void_t UpdateMaxPictureResolution( PWelsDecoderContext pCtx, const int32_t kiCurWidth, const int32_t kiCurHeight );
+void_t UpdateMaxPictureResolution (PWelsDecoderContext pCtx, const int32_t kiCurWidth, const int32_t kiCurHeight);
 
-void_t AssignFuncPointerForRec( PWelsDecoderContext pCtx );
+void_t AssignFuncPointerForRec (PWelsDecoderContext pCtx);
 
-void_t ResetParameterSetsState( PWelsDecoderContext pCtx );
+void_t ResetParameterSetsState (PWelsDecoderContext pCtx);
 
-void_t GetVclNalTemporalId( PWelsDecoderContext pCtx );//get the info that whether or not have VCL NAL in current AU,
-                                                            //and if YES, get the temporal ID
+void_t GetVclNalTemporalId (PWelsDecoderContext pCtx); //get the info on whether or not there is a VCL NAL in the current AU,
+//and if YES, get the temporal ID
 
 #ifdef __cplusplus
 }
--- a/codec/decoder/core/inc/decoder_context.h
+++ b/codec/decoder/core/inc/decoder_context.h
@@ -60,14 +60,13 @@
 //#define MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
 #endif //MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
 
-typedef struct TagDataBuffer
-{
-	uint8_t* pHead;
-	uint8_t* pEnd;
+typedef struct TagDataBuffer {
+  uint8_t* pHead;
+  uint8_t* pEnd;
 
-	uint8_t* pStartPos;
-	uint8_t* pCurPos;
-}SDataBuffer;
+  uint8_t* pStartPos;
+  uint8_t* pCurPos;
+} SDataBuffer;
 
 //#ifdef __cplusplus
 //extern "C" {
@@ -76,258 +75,266 @@
 //#pragma pack(1)
 
 /*
- *	Need move below structures to function pointer to seperate module/file later  
+ *	Need move below structures to function pointer to separate module/file later
  */
 
 //typedef int32_t (*rec_mb) (Mb *cur_mb, PWelsDecoderContext pCtx);
 
 /*typedef for get intra predictor func pointer*/
-typedef void_t (*PGetIntraPredFunc)(uint8_t *pPred, const int32_t kiLumaStride);
-typedef void_t (*PIdctResAddPredFunc)(uint8_t *pPred, const int32_t kiStride, int16_t *pRs);
-typedef void_t (*PExpandPictureFunc)( uint8_t *pDst, const int32_t kiStride, const int32_t kiPicWidth, const int32_t kiPicHeight );
+typedef void_t (*PGetIntraPredFunc) (uint8_t* pPred, const int32_t kiLumaStride);
+typedef void_t (*PIdctResAddPredFunc) (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
+typedef void_t (*PExpandPictureFunc) (uint8_t* pDst, const int32_t kiStride, const int32_t kiPicWidth,
+                                      const int32_t kiPicHeight);
 
 /**/
 typedef struct TagRefPic {
-	PPicture			pRefList[LIST_A][MAX_REF_PIC_COUNT];	// reference picture marking plus FIFO scheme
-	PPicture			pShortRefList[LIST_A][MAX_SHORT_REF_COUNT];
-	PPicture			pLongRefList[LIST_A][MAX_LONG_REF_COUNT];
-	uint8_t				uiRefCount[LIST_A]; 
-	uint8_t				uiShortRefCount[LIST_A];
-	uint8_t				uiLongRefCount[LIST_A];	// dependend on ref pic module
-	int32_t				iMaxLongTermFrameIdx;
+  PPicture			pRefList[LIST_A][MAX_REF_PIC_COUNT];	// reference picture marking plus FIFO scheme
+  PPicture			pShortRefList[LIST_A][MAX_SHORT_REF_COUNT];
+  PPicture			pLongRefList[LIST_A][MAX_LONG_REF_COUNT];
+  uint8_t				uiRefCount[LIST_A];
+  uint8_t				uiShortRefCount[LIST_A];
+  uint8_t				uiLongRefCount[LIST_A];	// dependent on ref pic module
+  int32_t				iMaxLongTermFrameIdx;
 } SRefPic, *PRefPic;
 
 typedef void_t (*PWelsMcFunc) (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-						      int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight);
-typedef struct TagMcFunc{
-	PWelsMcFunc pMcLumaFunc;
-	PWelsMcFunc pMcChromaFunc;
-}SMcFunc;
+                               int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight);
+typedef struct TagMcFunc {
+  PWelsMcFunc pMcLumaFunc;
+  PWelsMcFunc pMcChromaFunc;
+} SMcFunc;
 
 //deblock module defination
 struct TagDeblockingFunc;
 
 typedef struct tagDeblockingFilter {
-	uint8_t	*pCsData[3];	// pointer to reconstructed picture data
-	int32_t	iCsStride[2];	// Cs stride
-	ESliceType  eSliceType;
-	int8_t	iSliceAlphaC0Offset;
-	int8_t	iSliceBetaOffset;
-	int8_t  iChromaQP;
-	int8_t  iLumaQP;
-	struct TagDeblockingFunc  *pLoopf;
-}SDeblockingFilter, *PDeblockingFilter;
+  uint8_t*	pCsData[3];	// pointer to reconstructed picture data
+  int32_t	iCsStride[2];	// Cs stride
+  ESliceType  eSliceType;
+  int8_t	iSliceAlphaC0Offset;
+  int8_t	iSliceBetaOffset;
+  int8_t  iChromaQP;
+  int8_t  iLumaQP;
+  struct TagDeblockingFunc*  pLoopf;
+} SDeblockingFilter, *PDeblockingFilter;
 
-typedef void_t (*PDeblockingFilterMbFunc)( PDqLayer pCurDqLayer, PDeblockingFilter  filter, int32_t boundry_flag );
-typedef void_t (*PLumaDeblockingLT4Func)( uint8_t *iSampleY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *iTc );
-typedef void_t (*PLumaDeblockingEQ4Func)(  uint8_t *iSampleY, int32_t iStride, int32_t iAlpha, int32_t iBeta );
-typedef void_t (*PChromaDeblockingLT4Func)( uint8_t *iSampleCb, uint8_t *iSampleCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *iTc );
-typedef void_t (*PChromaDeblockingEQ4Func)(  uint8_t *iSampleCb, uint8_t *iSampleCr, int32_t iStride, int32_t iAlpha, int32_t iBeta  );
+typedef void_t (*PDeblockingFilterMbFunc) (PDqLayer pCurDqLayer, PDeblockingFilter  filter, int32_t boundry_flag);
+typedef void_t (*PLumaDeblockingLT4Func) (uint8_t* iSampleY, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+    int8_t* iTc);
+typedef void_t (*PLumaDeblockingEQ4Func) (uint8_t* iSampleY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+typedef void_t (*PChromaDeblockingLT4Func) (uint8_t* iSampleCb, uint8_t* iSampleCr, int32_t iStride, int32_t iAlpha,
+    int32_t iBeta, int8_t* iTc);
+typedef void_t (*PChromaDeblockingEQ4Func) (uint8_t* iSampleCb, uint8_t* iSampleCr, int32_t iStride, int32_t iAlpha,
+    int32_t iBeta);
 
 typedef struct TagDeblockingFunc {
-	PLumaDeblockingLT4Func    pfLumaDeblockingLT4Ver;
-	PLumaDeblockingEQ4Func    pfLumaDeblockingEQ4Ver;
-	PLumaDeblockingLT4Func    pfLumaDeblockingLT4Hor;
-	PLumaDeblockingEQ4Func    pfLumaDeblockingEQ4Hor;
+  PLumaDeblockingLT4Func    pfLumaDeblockingLT4Ver;
+  PLumaDeblockingEQ4Func    pfLumaDeblockingEQ4Ver;
+  PLumaDeblockingLT4Func    pfLumaDeblockingLT4Hor;
+  PLumaDeblockingEQ4Func    pfLumaDeblockingEQ4Hor;
 
-	PChromaDeblockingLT4Func  pfChromaDeblockingLT4Ver;
-	PChromaDeblockingEQ4Func  pfChromaDeblockingEQ4Ver;
-	PChromaDeblockingLT4Func  pfChromaDeblockingLT4Hor;
-	PChromaDeblockingEQ4Func  pfChromaDeblockinEQ4Hor;
+  PChromaDeblockingLT4Func  pfChromaDeblockingLT4Ver;
+  PChromaDeblockingEQ4Func  pfChromaDeblockingEQ4Ver;
+  PChromaDeblockingLT4Func  pfChromaDeblockingLT4Hor;
+  PChromaDeblockingEQ4Func  pfChromaDeblockinEQ4Hor;
 } SDeblockingFunc, *PDeblockingFunc;
 
-typedef void_t (*PWelsBlockAddStrideFunc)(uint8_t *pDest, uint8_t *pPred, int16_t *pRes, int32_t iPredStride, int32_t iResStride);
+typedef void_t (*PWelsBlockAddStrideFunc) (uint8_t* pDest, uint8_t* pPred, int16_t* pRes, int32_t iPredStride,
+    int32_t iResStride);
 typedef void_t (*PWelsBlockZeroFunc) (int16_t* pBlock, int32_t iStride);
-typedef void_t (*PWelsNonZeroCountFunc) (int16_t *pBlock, int8_t *pNonZeroCount);
-typedef void_t (*PWelsSimpleIdct4x4AddFunc) (int16_t *pDest, int16_t *pSrc, int32_t iStride);
+typedef void_t (*PWelsNonZeroCountFunc) (int16_t* pBlock, int8_t* pNonZeroCount);
+typedef void_t (*PWelsSimpleIdct4x4AddFunc) (int16_t* pDest, int16_t* pSrc, int32_t iStride);
 
 typedef  struct  TagBlockFunc {
-	PWelsBlockZeroFunc			pWelsBlockZero16x16Func;
-	PWelsBlockZeroFunc			pWelsBlockZero8x8Func;
-	PWelsNonZeroCountFunc		pWelsSetNonZeroCountFunc;  
+  PWelsBlockZeroFunc			pWelsBlockZero16x16Func;
+  PWelsBlockZeroFunc			pWelsBlockZero8x8Func;
+  PWelsNonZeroCountFunc		pWelsSetNonZeroCountFunc;
 } SBlockFunc;
 
-typedef void_t ( *PWelsFillNeighborMbInfoIntra4x4Func )( PNeighAvail pNeighAvail, uint8_t* pNonZeroCount, int8_t* pIntraPredMode, PDqLayer pCurLayer );
-typedef int32_t (*PWelsParseIntra4x4ModeFunc)          ( PNeighAvail pNeighAvail, int8_t* pIntraPredMode, PBitStringAux pBs, PDqLayer pCurDqLayer);
-typedef int32_t (*PWelsParseIntra16x16ModeFunc)        ( PNeighAvail pNeighAvail, PBitStringAux pBs, PDqLayer pCurDqLayer);
+typedef void_t (*PWelsFillNeighborMbInfoIntra4x4Func) (PNeighAvail pNeighAvail, uint8_t* pNonZeroCount,
+    int8_t* pIntraPredMode, PDqLayer pCurLayer);
+typedef int32_t (*PWelsParseIntra4x4ModeFunc) (PNeighAvail pNeighAvail, int8_t* pIntraPredMode, PBitStringAux pBs,
+    PDqLayer pCurDqLayer);
+typedef int32_t (*PWelsParseIntra16x16ModeFunc) (PNeighAvail pNeighAvail, PBitStringAux pBs, PDqLayer pCurDqLayer);
 
-typedef struct TagExpandPicFunc{
-	PExpandPictureFunc pExpandLumaPicture;
-	PExpandPictureFunc pExpandChromaPicture[2];
-}SExpandPicFunc;
+typedef struct TagExpandPicFunc {
+  PExpandPictureFunc pExpandLumaPicture;
+  PExpandPictureFunc pExpandChromaPicture[2];
+} SExpandPicFunc;
 
 /*
  *	SWelsDecoderContext: to maintail all modules data over decoder@framework
  */
 
-typedef struct TagWelsDecoderContext {	
-	// Input
-	void_t				*pArgDec;			// structured arguments for decoder, reserved here for extension in the future
+typedef struct TagWelsDecoderContext {
+  // Input
+  void_t*				pArgDec;			// structured arguments for decoder, reserved here for extension in the future
 
-	SDataBuffer       	sRawData;
+  SDataBuffer       	sRawData;
 
-	// Configuration
-	SDecodingParam	    *pParam;
-	uint32_t			uiCpuFlag;			// CPU compatibility detected
-	int32_t 	   		iDecoderMode;		// indicate decoder running mode
-	int32_t				iSetMode;			// indicate decoder mode set from upper layer, this is read-only for decoder internal
-	int32_t 			iDecoderOutputProperty; // indicate the output buffer property
-	int32_t				iModeSwitchType;	// 1: optimal decision; 2: forced switch to the other mode; 0: no switch
-	
-	int32_t				iOutputColorFormat;		// color space format to be outputed
-	VIDEO_BITSTREAM_TYPE eVideoType; //indicate the type of video to decide whether or not to do qp_delta error detection.
-	bool_t				bErrorResilienceFlag;		// error resilience flag
-	bool_t				bHaveGotMemory;	// global memory for decoder context related ever requested?	
-	
-	int32_t				iImgWidthInPixel;	// width of image in pixel reconstruction picture to be output
-	int32_t				iImgHeightInPixel;// height of image in pixel reconstruction picture to be output
-	int32_t				iMaxWidthInSps;	// maximal width of pixel in SPS sets
-	int32_t				iMaxHeightInSps;	// maximal height of pixel in SPS sets
+  // Configuration
+  SDecodingParam*    	pParam;
+  uint32_t			uiCpuFlag;			// CPU compatibility detected
+  int32_t 	   		iDecoderMode;		// indicate decoder running mode
+  int32_t				iSetMode;			// indicate decoder mode set from upper layer, this is read-only for decoder internal
+  int32_t 			iDecoderOutputProperty; // indicate the output buffer property
+  int32_t				iModeSwitchType;	// 1: optimal decision; 2: forced switch to the other mode; 0: no switch
 
-	// Derived common elements
-	SNalUnitHeader		sCurNalHead;
-	ESliceType			eSliceType;			// Slice type
-	int32_t				iFrameNum;
-	int32_t				iPrevFrameNum;		// frame number of previous frame well decoded for non-truncated mode yet
-    bool_t              bLastHasMmco5;      //
-	int32_t				iErrorCode;			// error code return while decoding in case packets lost
-	SFmo				sFmoList[MAX_PPS_COUNT];	// list for FMO storage
-	PFmo				pFmo;				// current fmo context after parsed slice_header
-	int32_t				iActiveFmoNum;		// active count number of fmo context in list
+  int32_t				iOutputColorFormat;		// color space format to be output
+  VIDEO_BITSTREAM_TYPE eVideoType; //indicate the type of video to decide whether or not to do qp_delta error detection.
+  bool_t				bErrorResilienceFlag;		// error resilience flag
+  bool_t				bHaveGotMemory;	// global memory for decoder context related ever requested?
 
-	/*needed info by decode slice level and mb level*/
-	int32_t				iDecBlockOffsetArray[24];	// address talbe for sub 4x4 block in intra4x4_mb, so no need to caculta the address every time.
+  int32_t				iImgWidthInPixel;	// width of image in pixel reconstruction picture to be output
+  int32_t				iImgHeightInPixel;// height of image in pixel reconstruction picture to be output
+  int32_t				iMaxWidthInSps;	// maximal width of pixel in SPS sets
+  int32_t				iMaxHeightInSps;	// maximal height of pixel in SPS sets
 
-	struct
-	{
-		int8_t  *pMbType[LAYER_NUM_EXCHANGEABLE];                      /* mb type */
-		int16_t	(*pMv[LAYER_NUM_EXCHANGEABLE][LIST_A])[MB_BLOCK4x4_NUM][MV_A]; //[LAYER_NUM_EXCHANGEABLE   MB_BLOCK4x4_NUM*]
-		int8_t	(*pRefIndex[LAYER_NUM_EXCHANGEABLE][LIST_A])[MB_BLOCK4x4_NUM]; 
-		int8_t	*pLumaQp[LAYER_NUM_EXCHANGEABLE];	/*mb luma_qp*/
-		int8_t	*pChromaQp[LAYER_NUM_EXCHANGEABLE];					/*mb chroma_qp*/
-		int8_t	(*pNzc[LAYER_NUM_EXCHANGEABLE])[24];
-		int8_t	(*pNzcRs[LAYER_NUM_EXCHANGEABLE])[24];	
-		int16_t (*pScaledTCoeff[LAYER_NUM_EXCHANGEABLE])[MB_COEFF_LIST_SIZE]; /*need be aligned*/
-		int8_t	(*pIntraPredMode[LAYER_NUM_EXCHANGEABLE])[8]; //0~3 top4x4 ; 4~6 left 4x4; 7 intra16x16
-		int8_t  (*pIntra4x4FinalMode[LAYER_NUM_EXCHANGEABLE])[MB_BLOCK4x4_NUM];
-		int8_t  *pChromaPredMode[LAYER_NUM_EXCHANGEABLE];
-		int8_t  *pCbp[LAYER_NUM_EXCHANGEABLE];
-		uint8_t (*pMotionPredFlag[LAYER_NUM_EXCHANGEABLE][LIST_A])[MB_PARTITION_SIZE]; // 8x8
-		int8_t  (*pSubMbType[LAYER_NUM_EXCHANGEABLE])[MB_SUB_PARTITION_SIZE];
-		int32_t *pSliceIdc[LAYER_NUM_EXCHANGEABLE];		// using int32_t for slice_idc
-		int8_t  *pResidualPredFlag[LAYER_NUM_EXCHANGEABLE];	
-		int8_t  *pInterPredictionDoneFlag[LAYER_NUM_EXCHANGEABLE];
-		int16_t iMbWidth;
-		int16_t iMbHeight;
-	}sMb;
+  // Derived common elements
+  SNalUnitHeader		sCurNalHead;
+  ESliceType			eSliceType;			// Slice type
+  int32_t				iFrameNum;
+  int32_t				iPrevFrameNum;		// frame number of previous frame well decoded for non-truncated mode yet
+  bool_t              bLastHasMmco5;      //
+  int32_t				iErrorCode;			// error code return while decoding in case packets lost
+  SFmo				sFmoList[MAX_PPS_COUNT];	// list for FMO storage
+  PFmo				pFmo;				// current fmo context after parsed slice_header
+  int32_t				iActiveFmoNum;		// active count number of fmo context in list
 
+  /*needed info by decode slice level and mb level*/
+  int32_t
+  iDecBlockOffsetArray[24];	// address table for sub 4x4 block in intra4x4_mb, so no need to calculate the address every time.
 
-	// reconstruction picture	
-	PPicture			pDec;			//pointer to current picture being reconstructed
+  struct {
+    int8_t*  pMbType[LAYER_NUM_EXCHANGEABLE];                      /* mb type */
+    int16_t	(*pMv[LAYER_NUM_EXCHANGEABLE][LIST_A])[MB_BLOCK4x4_NUM][MV_A]; //[LAYER_NUM_EXCHANGEABLE   MB_BLOCK4x4_NUM*]
+    int8_t	(*pRefIndex[LAYER_NUM_EXCHANGEABLE][LIST_A])[MB_BLOCK4x4_NUM];
+    int8_t*	pLumaQp[LAYER_NUM_EXCHANGEABLE];	/*mb luma_qp*/
+    int8_t*	pChromaQp[LAYER_NUM_EXCHANGEABLE];					/*mb chroma_qp*/
+    int8_t	(*pNzc[LAYER_NUM_EXCHANGEABLE])[24];
+    int8_t	(*pNzcRs[LAYER_NUM_EXCHANGEABLE])[24];
+    int16_t (*pScaledTCoeff[LAYER_NUM_EXCHANGEABLE])[MB_COEFF_LIST_SIZE]; /*need be aligned*/
+    int8_t	(*pIntraPredMode[LAYER_NUM_EXCHANGEABLE])[8]; //0~3 top4x4 ; 4~6 left 4x4; 7 intra16x16
+    int8_t (*pIntra4x4FinalMode[LAYER_NUM_EXCHANGEABLE])[MB_BLOCK4x4_NUM];
+    int8_t*  pChromaPredMode[LAYER_NUM_EXCHANGEABLE];
+    int8_t*  pCbp[LAYER_NUM_EXCHANGEABLE];
+    uint8_t (*pMotionPredFlag[LAYER_NUM_EXCHANGEABLE][LIST_A])[MB_PARTITION_SIZE]; // 8x8
+    int8_t (*pSubMbType[LAYER_NUM_EXCHANGEABLE])[MB_SUB_PARTITION_SIZE];
+    int32_t* pSliceIdc[LAYER_NUM_EXCHANGEABLE];		// using int32_t for slice_idc
+    int8_t*  pResidualPredFlag[LAYER_NUM_EXCHANGEABLE];
+    int8_t*  pInterPredictionDoneFlag[LAYER_NUM_EXCHANGEABLE];
+    int16_t iMbWidth;
+    int16_t iMbHeight;
+  } sMb;
 
-	// reference pictures
-	SRefPic				sRefPic;
 
-	SVlcTable			sVlcTable;		 // vlc table
-	
-	SBitStringAux		sBs;
+  // reconstruction picture
+  PPicture			pDec;			//pointer to current picture being reconstructed
 
-	/* Global memory external */
+  // reference pictures
+  SRefPic				sRefPic;
 
-	SPosOffset	sFrameCrop;
+  SVlcTable			sVlcTable;		 // vlc table
 
+  SBitStringAux		sBs;
+
+  /* Global memory external */
+
+  SPosOffset	sFrameCrop;
+
 #ifdef MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
-	int32_t             iSpsTotalNum;  //the number of SPS in current IDR interval
-	int32_t             iSubspsTotalNum; //the number of subsps in current IDR interval
-	int32_t             iPpsTotalNum; //the number of PPS in current IDR interval.
+  int32_t             iSpsTotalNum;  //the number of SPS in current IDR interval
+  int32_t             iSubspsTotalNum; //the number of subsps in current IDR interval
+  int32_t             iPpsTotalNum; //the number of PPS in current IDR interval.
 #endif //MOSAIC_AVOID_BASED_ON_SPS_PPS_ID	
 
-	SSps				sSpsBuffer[MAX_SPS_COUNT];
-	SPps				sPpsBuffer[MAX_PPS_COUNT];
-	PSliceHeader		pSliceHeader;
+  SSps				sSpsBuffer[MAX_SPS_COUNT];
+  SPps				sPpsBuffer[MAX_PPS_COUNT];
+  PSliceHeader		pSliceHeader;
 
-	PPicBuff	        pPicBuff[LIST_A];	// Initially allocated memory for pictures which are used in decoding.
-	int32_t				iPicQueueNumber;
-	
-	SSubsetSps			sSubsetSpsBuffer[MAX_SPS_COUNT];
-	SNalUnit            sPrefixNal;
-	
-	PAccessUnit			pAccessUnitList;	// current access unit list to be performed
-	PSps				pSps;	// used by current AU
-	PPps				pPps;	// used by current AU
-	// Memory for pAccessUnitList is dynamically held till decoder destruction.
-	PDqLayer			pCurDqLayer;		// current DQ layer representation, also carry reference base layer if applicable
-	PDqLayer			pDqLayersList[LAYER_NUM_EXCHANGEABLE];	// DQ layers list with memory allocated
-	uint8_t				*pCsListXchg[LAYER_NUM_EXCHANGEABLE][3];	// Constructed picture buffer: 0- cur layer, 1- ref layer;
-	int16_t				*pRsListXchg[LAYER_NUM_EXCHANGEABLE][3];// Residual picture buffer: 0- cur layer, 1- ref layer;
+  PPicBuff	        pPicBuff[LIST_A];	// Initially allocated memory for pictures which are used in decoding.
+  int32_t				iPicQueueNumber;
 
-	int32_t				iCsStride[3];		// strides for Cs
-	int32_t				iRsStride[3];		// strides for Rs
+  SSubsetSps			sSubsetSpsBuffer[MAX_SPS_COUNT];
+  SNalUnit            sPrefixNal;
 
-	int32_t             iPicWidthReq;		// picture width have requested the memory
-	int32_t             iPicHeightReq;		// picture height have requested the memory
+  PAccessUnit			pAccessUnitList;	// current access unit list to be performed
+  PSps				pSps;	// used by current AU
+  PPps				pPps;	// used by current AU
+  // Memory for pAccessUnitList is dynamically held till decoder destruction.
+  PDqLayer			pCurDqLayer;		// current DQ layer representation, also carry reference base layer if applicable
+  PDqLayer			pDqLayersList[LAYER_NUM_EXCHANGEABLE];	// DQ layers list with memory allocated
+  uint8_t*				pCsListXchg[LAYER_NUM_EXCHANGEABLE][3];	// Constructed picture buffer: 0- cur layer, 1- ref layer;
+  int16_t*				pRsListXchg[LAYER_NUM_EXCHANGEABLE][3];// Residual picture buffer: 0- cur layer, 1- ref layer;
 
-	uint8_t				uiTargetDqId;		// maximal DQ ID in current access unit, meaning target layer ID	
-	bool_t				bAvcBasedFlag;		// For decoding bitstream:
-	bool_t				bEndOfStreamFlag;	// Flag on end of stream requested by external application layer
-	bool_t				bInitialDqLayersMem;	// dq layers related memory is available?
+  int32_t				iCsStride[3];		// strides for Cs
+  int32_t				iRsStride[3];		// strides for Rs
 
-	bool_t              bOnlyOneLayerInCurAuFlag; //only one layer in current AU: 1
-	
-	// for EC parameter sets
-	bool_t				bSpsExistAheadFlag;	// whether does SPS NAL exist ahead of sequence?
-	bool_t				bSubspsExistAheadFlag;// whether does Subset SPS NAL exist ahead of sequence?
-	bool_t				bPpsExistAheadFlag;	// whether does PPS NAL exist ahead of sequence?
+  int32_t             iPicWidthReq;		// picture width have requested the memory
+  int32_t             iPicHeightReq;		// picture height have requested the memory
 
-	bool_t				bSpsAvailFlags[MAX_SPS_COUNT];
-	bool_t				bSubspsAvailFlags[MAX_SPS_COUNT];
-	bool_t				bPpsAvailFlags[MAX_PPS_COUNT];
-	bool_t				bReferenceLostAtT0Flag;
+  uint8_t				uiTargetDqId;		// maximal DQ ID in current access unit, meaning target layer ID
+  bool_t				bAvcBasedFlag;		// For decoding bitstream:
+  bool_t				bEndOfStreamFlag;	// Flag on end of stream requested by external application layer
+  bool_t				bInitialDqLayersMem;	// dq layers related memory is available?
+
+  bool_t              bOnlyOneLayerInCurAuFlag; //only one layer in current AU: 1
+
+  // for EC parameter sets
+  bool_t				bSpsExistAheadFlag;	// whether does SPS NAL exist ahead of sequence?
+  bool_t				bSubspsExistAheadFlag;// whether does Subset SPS NAL exist ahead of sequence?
+  bool_t				bPpsExistAheadFlag;	// whether does PPS NAL exist ahead of sequence?
+
+  bool_t				bSpsAvailFlags[MAX_SPS_COUNT];
+  bool_t				bSubspsAvailFlags[MAX_SPS_COUNT];
+  bool_t				bPpsAvailFlags[MAX_PPS_COUNT];
+  bool_t				bReferenceLostAtT0Flag;
 #ifdef LONG_TERM_REF
-	bool_t				bParamSetsLostFlag;	//sps or pps do not exist or not correct
+  bool_t				bParamSetsLostFlag;	//sps or pps do not exist or not correct
 
-	bool_t              bCurAuContainLtrMarkSeFlag; //current AU has the LTR marking syntax element, mark the previous frame or self
-	int32_t             iFrameNumOfAuMarkedLtr; //if bCurAuContainLtrMarkSeFlag==true, SHOULD set this variable
+  bool_t
+  bCurAuContainLtrMarkSeFlag; //current AU has the LTR marking syntax element, mark the previous frame or self
+  int32_t             iFrameNumOfAuMarkedLtr; //if bCurAuContainLtrMarkSeFlag==true, SHOULD set this variable
 
-	uint16_t            uiCurIdrPicId;
+  uint16_t            uiCurIdrPicId;
 #endif
 
-	PGetIntraPredFunc 	pGetI16x16LumaPredFunc[7];		//h264_predict_copy_16x16;
-	PGetIntraPredFunc 	pGetI4x4LumaPredFunc[14];		// h264_predict_4x4_t
-	PGetIntraPredFunc 	pGetIChromaPredFunc[7];		// h264_predict_8x8_t
-	PIdctResAddPredFunc	pIdctResAddPredFunc;
-	SMcFunc				sMcFunc;
-	/* For Deblocking */
-	SDeblockingFunc     sDeblockingFunc;
-    SExpandPicFunc	    sExpandPicFunc;
+  PGetIntraPredFunc 	pGetI16x16LumaPredFunc[7];		//h264_predict_copy_16x16;
+  PGetIntraPredFunc 	pGetI4x4LumaPredFunc[14];		// h264_predict_4x4_t
+  PGetIntraPredFunc 	pGetIChromaPredFunc[7];		// h264_predict_8x8_t
+  PIdctResAddPredFunc	pIdctResAddPredFunc;
+  SMcFunc				sMcFunc;
+  /* For Deblocking */
+  SDeblockingFunc     sDeblockingFunc;
+  SExpandPicFunc	    sExpandPicFunc;
 
-	/* For Block */
-	SBlockFunc          sBlockFunc;
-	/* For EC */
-	int32_t iCurSeqIntervalTargetDependId;
-	int32_t iCurSeqIntervalMaxPicWidth;
-	int32_t iCurSeqIntervalMaxPicHeight;
-	
-	PWelsFillNeighborMbInfoIntra4x4Func  pFillInfoCacheIntra4x4Func;
-	PWelsParseIntra4x4ModeFunc           pParseIntra4x4ModeFunc;
-	PWelsParseIntra16x16ModeFunc         pParseIntra16x16ModeFunc;
+  /* For Block */
+  SBlockFunc          sBlockFunc;
+  /* For EC */
+  int32_t iCurSeqIntervalTargetDependId;
+  int32_t iCurSeqIntervalMaxPicWidth;
+  int32_t iCurSeqIntervalMaxPicHeight;
 
-	//feedback whether or not have VCL in current AU, and the temporal ID
-	int32_t iFeedbackVclNalInAu;
-	int32_t iFeedbackTidInAu;	
+  PWelsFillNeighborMbInfoIntra4x4Func  pFillInfoCacheIntra4x4Func;
+  PWelsParseIntra4x4ModeFunc           pParseIntra4x4ModeFunc;
+  PWelsParseIntra16x16ModeFunc         pParseIntra16x16ModeFunc;
 
-	bool_t bAuReadyFlag;   // TRUE: one au is ready for decoding; FALSE: default value
-	
-	//trace handle
-	void_t   *   pTraceHandle;
-	
+  //feedback whether or not have VCL in current AU, and the temporal ID
+  int32_t iFeedbackVclNalInAu;
+  int32_t iFeedbackTidInAu;
+
+  bool_t bAuReadyFlag;   // TRUE: one au is ready for decoding; FALSE: default value
+
+  //trace handle
+  void_t*      pTraceHandle;
+
 #ifdef NO_WAITING_AU
-	//Save the last nal header info
-	SNalUnitHeaderExt sLastNalHdrExt;
-	SSliceHeader      sLastSliceHeader;
+  //Save the last nal header info
+  SNalUnitHeaderExt sLastNalHdrExt;
+  SSliceHeader      sLastSliceHeader;
 #endif
 
-}SWelsDecoderContext, *PWelsDecoderContext;
+} SWelsDecoderContext, *PWelsDecoderContext;
 
 //#pragma pack()
 
--- a/codec/decoder/core/inc/decoder_core.h
+++ b/codec/decoder/core/inc/decoder_core.h
@@ -32,7 +32,7 @@
  *  decoder_core.h
  *
  *  Abstract
- *      Encapsulative core interfaces 
+ *      Encapsulative core interfaces
  *
  *  History
  *      07/10/2008 Created
@@ -57,24 +57,24 @@
  * return:
  *	0 - success; otherwise returned error_no defined in error_no.h.
 */
-int32_t WelsInitMemory( PWelsDecoderContext pCtx );
+int32_t WelsInitMemory (PWelsDecoderContext pCtx);
 
 /*
  * WelsFreeMemory
  * Free memory introduced in WelsInitMemory at destruction of decoder.
- * 
+ *
  */
-void_t WelsFreeMemory( PWelsDecoderContext pCtx );
+void_t WelsFreeMemory (PWelsDecoderContext pCtx);
 
 /*!
- * \brief	request memory when maximal picture width and height are available	
+ * \brief	request memory when maximal picture width and height are available
  */
-int32_t InitialDqLayersContext ( PWelsDecoderContext pCtx, const int32_t kiMaxWidth, const int32_t kiMaxHeight );
+int32_t InitialDqLayersContext (PWelsDecoderContext pCtx, const int32_t kiMaxWidth, const int32_t kiMaxHeight);
 
 /*!
- * \brief	free dq layer context memory related		
+ * \brief	free dq layer context memory related
  */
-void_t UninitialDqLayersContext ( PWelsDecoderContext pCtx );
+void_t UninitialDqLayersContext (PWelsDecoderContext pCtx);
 
 /*
  *	DecodeNalHeaderExt
@@ -83,19 +83,19 @@
  *	pNal:	target NALUnit ptr
  *	pSrc:	NAL Unit bitstream
  */
-void_t DecodeNalHeaderExt( PNalUnit pNal, uint8_t* pSrc );
+void_t DecodeNalHeaderExt (PNalUnit pNal, uint8_t* pSrc);
 
 /*
  *	ParseSliceHeaderSyntaxs
  *	Parse slice header of bitstream
  */
-int32_t ParseSliceHeaderSyntaxs ( PWelsDecoderContext pCtx, PBitStringAux pBs, const bool_t kbExtensionFlag );
+int32_t ParseSliceHeaderSyntaxs (PWelsDecoderContext pCtx, PBitStringAux pBs, const bool_t kbExtensionFlag);
 /*
  *	Copy relative syntax elements of NALUnitHeaderExt, sRefPicBaseMarking and bStoreRefBasePicFlag in prefix nal unit.
  *	pSrc:	mark as decoded prefix NAL
  *	pDst:	succeeded VCL NAL based AVC (I/P Slice)
  */
-bool_t PrefetchNalHeaderExtSyntax ( PWelsDecoderContext pCtx, PNalUnit const kpDst, PNalUnit const kpSrc);
+bool_t PrefetchNalHeaderExtSyntax (PWelsDecoderContext pCtx, PNalUnit const kpDst, PNalUnit const kpSrc);
 
 
 /*
@@ -110,7 +110,7 @@
  * return:
  *	0 - success; otherwise returned error_no defined in error_no.h
  */
-int32_t ConstructAccessUnit( PWelsDecoderContext pCtx, uint8_t** ppDst, SBufferInfo *pDstInfo);
+int32_t ConstructAccessUnit (PWelsDecoderContext pCtx, uint8_t** ppDst, SBufferInfo* pDstInfo);
 
 
 /*
@@ -117,19 +117,20 @@
  * DecodeCurrentAccessUnit
  * Decode current access unit when current AU is completed.
  */
-int32_t DecodeCurrentAccessUnit( PWelsDecoderContext pCtx, uint8_t **ppDst, int32_t *iDstLen, int32_t *pWidth, int32_t *pHeight, SBufferInfo *pDstInfo );
+int32_t DecodeCurrentAccessUnit (PWelsDecoderContext pCtx, uint8_t** ppDst, int32_t* iDstLen, int32_t* pWidth,
+                                 int32_t* pHeight, SBufferInfo* pDstInfo);
 
 /*
  *	Prepare current dq layer context initialization.
  */
-void_t WelsDqLayerDecodeStart ( PWelsDecoderContext pCtx, PNalUnit pCurNal, PSps pSps, PPps pPps );
+void_t WelsDqLayerDecodeStart (PWelsDecoderContext pCtx, PNalUnit pCurNal, PSps pSps, PPps pPps);
 
 
-int32_t WelsDecodeAccessUnitStart ( PWelsDecoderContext pCtx );
-void_t WelsDecodeAccessUnitEnd ( PWelsDecoderContext pCtx );
+int32_t WelsDecodeAccessUnitStart (PWelsDecoderContext pCtx);
+void_t WelsDecodeAccessUnitEnd (PWelsDecoderContext pCtx);
 
-void_t ForceResetCurrentAccessUnit( PAccessUnit pAu );
-void_t ForceClearCurrentNal( PAccessUnit pAu );
+void_t ForceResetCurrentAccessUnit (PAccessUnit pAu);
+void_t ForceClearCurrentNal (PAccessUnit pAu);
 
 } // namespace WelsDec
 
--- a/codec/decoder/core/inc/error_code.h
+++ b/codec/decoder/core/inc/error_code.h
@@ -42,16 +42,15 @@
 
 namespace WelsDec {
 
-typedef enum TagWelsErr
-{
-	ERR_NONE				= 0,
-	ERR_INVALID_PARAMETERS	= 1,
-	ERR_MALLOC_FAILED		= 2,
-	ERR_API_FAILED			= 3,
-	
-	ERR_BOUND				= 31,
-}EWelsErr;
+typedef enum TagWelsErr {
+ERR_NONE				= 0,
+ERR_INVALID_PARAMETERS	= 1,
+ERR_MALLOC_FAILED		= 2,
+ERR_API_FAILED			= 3,
 
+ERR_BOUND				= 31,
+} EWelsErr;
+
 /*
  * Specified error format:
  * ERR_NO = (ERR_LEVEL_FROM (HIGH WORD) << 16) | (ERR_INFO_FROM (LOW WORD))
@@ -62,14 +61,14 @@
 
 /* ERR_LEVEL */
 //-----------------------------------------------------------------------------------------------------------
-enum{
-	ERR_LEVEL_ACCESS_UNIT = 1,
-	ERR_LEVEL_NAL_UNIT_HEADER,
-	ERR_LEVEL_PREFIX_NAL,
-	ERR_LEVEL_PARAM_SETS,
-	ERR_LEVEL_SLICE_HEADER,
-	ERR_LEVEL_SLICE_DATA,
-	ERR_LEVEL_MB_DATA,
+enum {
+ERR_LEVEL_ACCESS_UNIT = 1,
+ERR_LEVEL_NAL_UNIT_HEADER,
+ERR_LEVEL_PREFIX_NAL,
+ERR_LEVEL_PARAM_SETS,
+ERR_LEVEL_SLICE_HEADER,
+ERR_LEVEL_SLICE_DATA,
+ERR_LEVEL_MB_DATA,
 };
 
 //-----------------------------------------------------------------------------------------------------------
@@ -79,88 +78,88 @@
 #define ERR_INFO_COMMON_BASE		1
 #define ERR_INFO_SYNTAX_BASE		1001
 #define ERR_INFO_LOGIC_BASE		10001
-enum{
-	/* Error from common system level: 1-1000 */	
-	ERR_INFO_OUT_OF_MEMORY		= ERR_INFO_COMMON_BASE,
-	ERR_INFO_INVALID_ACCESS,
-	ERR_INFO_INVALID_PTR,
-	ERR_INFO_INVALID_PARAM,
-	ERR_INFO_FILE_NO_FOUND,
-	ERR_INFO_PATH_NO_FOUND,
-	ERR_INFO_ACCESS_DENIED,
-	ERR_INFO_NOT_READY,
-	ERR_INFO_WRITE_FAULT,
-	ERR_INFO_READ_FAULT,	
-	/* Error from H.264 syntax elements parser: 1001-10000 */
-	ERR_INFO_NO_PREFIX_CODE		= ERR_INFO_SYNTAX_BASE,	// No start prefix code indication
-	ERR_INFO_NO_PARAM_SETS, 					// No SPS and/ PPS before sequence header
-	ERR_INFO_PARAM_SETS_NOT_INTEGRATED,			// Parameters sets (sps/pps) are not integrated at all before to decode VCL nal
-	ERR_INFO_SPS_ID_OVERFLOW,
-	ERR_INFO_PPS_ID_OVERFLOW,
-	ERR_INFO_INVALID_PROFILE_IDC, 
-	ERR_INFO_UNMATCHED_LEVEL_IDC, 
-	ERR_INFO_INVALID_POC_TYPE,
-	ERR_INFO_REF_COUNT_OVERFLOW,
-	ERR_INFO_CROPPING_NO_SUPPORTED,
-	ERR_INFO_INVALID_SLICEGROUP,
-	ERR_INFO_INVALID_SLICEGROUP_MAP_TYPE,
-	ERR_INFO_INVALID_FRAME_NUM,
-	ERR_INFO_FMO_INIT_FAIL,
-	ERR_INFO_SLICE_TYPE_OVERFLOW,
-	ERR_INFO_INVALID_QP,
-	ERR_INFO_INVALID_DBLOCKING_IDC,
-	ERR_INFO_INVALID_MB_TYPE,
-	ERR_INFO_INVALID_SUB_MB_TYPE,
-	ERR_INFO_UNAVAILABLE_TOP_BLOCK_FOR_INTRA,
-	ERR_INFO_UNAVAILABLE_LEFT_BLOCK_FOR_INTRA,
-	ERR_INFO_INVALID_REF_INDEX,
-	ERR_INFO_INVALID_CBP,
-	ERR_INFO_DQUANT_OUT_OF_RANGE,
-	ERR_INFO_CAVLC_INVALID_PREFIX,
-	ERR_INFO_CAVLC_INVALID_TOTAL_COEFF,
-	ERR_INFO_CAVLC_INVALID_ZERO_LEFT,
-	ERR_INFO_MV_OUT_OF_RANGE,
+enum {
+/* Error from common system level: 1-1000 */
+ERR_INFO_OUT_OF_MEMORY		= ERR_INFO_COMMON_BASE,
+ERR_INFO_INVALID_ACCESS,
+ERR_INFO_INVALID_PTR,
+ERR_INFO_INVALID_PARAM,
+ERR_INFO_FILE_NO_FOUND,
+ERR_INFO_PATH_NO_FOUND,
+ERR_INFO_ACCESS_DENIED,
+ERR_INFO_NOT_READY,
+ERR_INFO_WRITE_FAULT,
+ERR_INFO_READ_FAULT,
+/* Error from H.264 syntax elements parser: 1001-10000 */
+ERR_INFO_NO_PREFIX_CODE		= ERR_INFO_SYNTAX_BASE,	// No start prefix code indication
+ERR_INFO_NO_PARAM_SETS, 					// No SPS and/or PPS before sequence header
+ERR_INFO_PARAM_SETS_NOT_INTEGRATED,			// Parameters sets (sps/pps) are not integrated at all before to decode VCL nal
+ERR_INFO_SPS_ID_OVERFLOW,
+ERR_INFO_PPS_ID_OVERFLOW,
+ERR_INFO_INVALID_PROFILE_IDC,
+ERR_INFO_UNMATCHED_LEVEL_IDC,
+ERR_INFO_INVALID_POC_TYPE,
+ERR_INFO_REF_COUNT_OVERFLOW,
+ERR_INFO_CROPPING_NO_SUPPORTED,
+ERR_INFO_INVALID_SLICEGROUP,
+ERR_INFO_INVALID_SLICEGROUP_MAP_TYPE,
+ERR_INFO_INVALID_FRAME_NUM,
+ERR_INFO_FMO_INIT_FAIL,
+ERR_INFO_SLICE_TYPE_OVERFLOW,
+ERR_INFO_INVALID_QP,
+ERR_INFO_INVALID_DBLOCKING_IDC,
+ERR_INFO_INVALID_MB_TYPE,
+ERR_INFO_INVALID_SUB_MB_TYPE,
+ERR_INFO_UNAVAILABLE_TOP_BLOCK_FOR_INTRA,
+ERR_INFO_UNAVAILABLE_LEFT_BLOCK_FOR_INTRA,
+ERR_INFO_INVALID_REF_INDEX,
+ERR_INFO_INVALID_CBP,
+ERR_INFO_DQUANT_OUT_OF_RANGE,
+ERR_INFO_CAVLC_INVALID_PREFIX,
+ERR_INFO_CAVLC_INVALID_TOTAL_COEFF,
+ERR_INFO_CAVLC_INVALID_ZERO_LEFT,
+ERR_INFO_MV_OUT_OF_RANGE,
 
-	ERR_INFO_INVALID_I4x4_PRED_MODE, 
-	ERR_INFO_INVALID_I16x16_PRED_MODE,
-	ERR_INFO_INVALID_I_CHROMA_PRED_MODE,
+ERR_INFO_INVALID_I4x4_PRED_MODE,
+ERR_INFO_INVALID_I16x16_PRED_MODE,
+ERR_INFO_INVALID_I_CHROMA_PRED_MODE,
 
-    ERR_INFO_UNSUPPORTED_NON_BASELINE,
-    ERR_INFO_UNSUPPORTED_FMOTYPE,
-    ERR_INFO_UNSUPPORTED_MBAFF,
-    ERR_INFO_UNSUPPORTED_ILP,
-    ERR_INFO_UNSUPPORTED_CABAC_EL,
-    ERR_INFO_UNSUPPORTED_SPSI,
-    ERR_INFO_UNSUPPORTED_MGS,
-    ERR_INFO_UNSUPPORTED_BIPRED,
-    ERR_INFO_UNSUPPORTED_WP,
+ERR_INFO_UNSUPPORTED_NON_BASELINE,
+ERR_INFO_UNSUPPORTED_FMOTYPE,
+ERR_INFO_UNSUPPORTED_MBAFF,
+ERR_INFO_UNSUPPORTED_ILP,
+ERR_INFO_UNSUPPORTED_CABAC_EL,
+ERR_INFO_UNSUPPORTED_SPSI,
+ERR_INFO_UNSUPPORTED_MGS,
+ERR_INFO_UNSUPPORTED_BIPRED,
+ERR_INFO_UNSUPPORTED_WP,
 
-    ERR_INFO_FRAMES_LOST,
-	ERR_INFO_DEPENDENCY_SPATIAL_LAYER_LOST,
-	ERR_INFO_DEPENDENCY_QUALIT_LAYER_LOST,
-	ERR_INFO_REFERENCE_PIC_LOST,
-	ERR_INFO_INVALID_REORDERING,
-	ERR_INFO_INVALID_MARKING,
+ERR_INFO_FRAMES_LOST,
+ERR_INFO_DEPENDENCY_SPATIAL_LAYER_LOST,
+ERR_INFO_DEPENDENCY_QUALIT_LAYER_LOST,
+ERR_INFO_REFERENCE_PIC_LOST,
+ERR_INFO_INVALID_REORDERING,
+ERR_INFO_INVALID_MARKING,
 
-	ERR_INFO_FMO_NOT_SUPPORTED_IN_BASE_LAYER,
-	ERR_INFO_INVALID_ESS,
-	ERR_INFO_INVALID_SLICE_TYPE,
-	ERR_INFO_INVALID_REF_MARKING,
-	ERR_INFO_INVALID_REF_REORDERING,
-	
-	/* Error from corresponding logic, 10001-65535 */
-	ERR_INFO_NO_IDR_PIC		= ERR_INFO_LOGIC_BASE,	// NO IDR picture available before sequence header
-	ERR_INFO_EC_NO_NEIGHBOUR_MBS,
-	ERR_INFO_EC_UNEXPECTED_MB_TYPE,
-	ERR_INFO_EC_NO_ENOUGH_NEIGHBOUR_MBS,
-	//for LTR
-	ERR_INFO_INVALID_MMCO_OPCODE_BASE,
-	ERR_INFO_INVALID_MMCO_SHORT2UNUSED,
-	EER_INFO_INVALID_MMCO_LONG2UNUSED,
-	ERR_INFO_INVALID_MMCO_SHOART2LONG,
-	ERR_INFO_INVALID_MMCO_REF_NUM_OVERFLOW,
-	ERR_INFO_INVALID_MMCO_REF_NUM_NOT_ENOUGH,
-	ERR_INFO_INVALID_MMCO_LONG_TERM_IDX_EXCEED_MAX,
+ERR_INFO_FMO_NOT_SUPPORTED_IN_BASE_LAYER,
+ERR_INFO_INVALID_ESS,
+ERR_INFO_INVALID_SLICE_TYPE,
+ERR_INFO_INVALID_REF_MARKING,
+ERR_INFO_INVALID_REF_REORDERING,
+
+/* Error from corresponding logic, 10001-65535 */
+ERR_INFO_NO_IDR_PIC		= ERR_INFO_LOGIC_BASE,	// NO IDR picture available before sequence header
+ERR_INFO_EC_NO_NEIGHBOUR_MBS,
+ERR_INFO_EC_UNEXPECTED_MB_TYPE,
+ERR_INFO_EC_NO_ENOUGH_NEIGHBOUR_MBS,
+//for LTR
+ERR_INFO_INVALID_MMCO_OPCODE_BASE,
+ERR_INFO_INVALID_MMCO_SHORT2UNUSED,
+EER_INFO_INVALID_MMCO_LONG2UNUSED,
+ERR_INFO_INVALID_MMCO_SHOART2LONG,
+ERR_INFO_INVALID_MMCO_REF_NUM_OVERFLOW,
+ERR_INFO_INVALID_MMCO_REF_NUM_NOT_ENOUGH,
+ERR_INFO_INVALID_MMCO_LONG_TERM_IDX_EXCEED_MAX,
 };
 //-----------------------------------------------------------------------------------------------------------
 
--- a/codec/decoder/core/inc/expand_pic.h
+++ b/codec/decoder/core/inc/expand_pic.h
@@ -45,7 +45,8 @@
 
 namespace WelsDec {
 
-void_t ExpandReferencingPicture(PPicture pPic, PExpandPictureFunc pExpandPictureLuma, PExpandPictureFunc pExpandPictureChroma[2]);
+void_t ExpandReferencingPicture (PPicture pPic, PExpandPictureFunc pExpandPictureLuma,
+                                 PExpandPictureFunc pExpandPictureChroma[2]); // one luma expander plus a two-entry table of chroma expanders; NOTE(review): which index serves which case is not visible in this header - confirm at the definition
 
 #if defined(__cplusplus)
 extern "C" {
@@ -52,18 +53,18 @@
 #endif//__cplusplus
 
 #if defined(X86_ASM)
-void_t ExpandPictureLuma_sse2(	uint8_t *pDst,
-								const int32_t kiStride,
-								const int32_t kiPicWidth,
-								const int32_t kiPicHeight	);
-void_t ExpandPictureChromaAlign_sse2(	uint8_t *pDst,
-									const int32_t kiStride,
-									const int32_t kiPicWidth,
-									const int32_t kiPicHeight	);
-void_t ExpandPictureChromaUnalign_sse2(	uint8_t *pDst,
-									const int32_t kiStride,
-									const int32_t kiPicWidth,
-									const int32_t kiPicHeight	);
+void_t ExpandPictureLuma_sse2 (uint8_t* pDst, // declared only when X86_ASM is defined, inside the extern "C" block opened above
+                               const int32_t kiStride,
+                               const int32_t kiPicWidth,
+                               const int32_t kiPicHeight);
+void_t ExpandPictureChromaAlign_sse2 (uint8_t* pDst, // same parameter list as the luma entry point
+                                      const int32_t kiStride,
+                                      const int32_t kiPicWidth,
+                                      const int32_t kiPicHeight);
+void_t ExpandPictureChromaUnalign_sse2 (uint8_t* pDst, // NOTE(review): the alignment precondition separating Align/Unalign is not stated here - confirm in the asm source
+                                        const int32_t kiStride,
+                                        const int32_t kiPicWidth,
+                                        const int32_t kiPicHeight);
 #endif//X86_ASM
 
 #if defined(__cplusplus)
@@ -71,7 +72,7 @@
 #endif//__cplusplus
 
 //
-void_t InitExpandPictureFunc( SExpandPicFunc *pExpandPicFunc, const uint32_t kuiCpuFlags );
+void_t InitExpandPictureFunc (SExpandPicFunc* pExpandPicFunc, const uint32_t kuiCpuFlags); // declared after the extern "C" block is closed, so it is a C++-linkage member of namespace WelsDec; NOTE(review): presumably fills pExpandPicFunc according to kuiCpuFlags - confirm at the definition
 
 } // namespace WelsDec
 
--- a/codec/decoder/core/inc/fmo.h
+++ b/codec/decoder/core/inc/fmo.h
@@ -50,16 +50,16 @@
 #define MB_XY_T	int16_t
 #endif//MB_XY_T
 
-/*! 
- * \brief	Wels Flexible Macroblock Ordering (FMO) 
+/*!
+ * \brief	Wels Flexible Macroblock Ordering (FMO)
  */
-typedef struct TagFmo{
-	uint8_t		*pMbAllocMap;
-	int32_t		iCountMbNum;
-	int32_t		iSliceGroupCount;
-	int32_t		iSliceGroupType;	
-	bool_t		bActiveFlag;
-	uint8_t		uiReserved[3];		// reserved padding bytes
+typedef struct TagFmo { // FMO state record; exported below as SFmo (value type) and PFmo (pointer type)
+uint8_t*		pMbAllocMap; // NOTE(review): presumably one allocation entry per macroblock - confirm against InitFmo
+int32_t		iCountMbNum;
+int32_t		iSliceGroupCount;
+int32_t		iSliceGroupType;
+bool_t		bActiveFlag;
+uint8_t		uiReserved[3];		// reserved padding bytes
 } SFmo, *PFmo;
 
 
@@ -73,7 +73,7 @@
  *
  * \return	0 - successful; none 0 - failed;
  */
-int32_t	InitFmo( PFmo pFmo, PPps pPps, const int32_t kiMbWidth, const int32_t kiMbHeight );
+int32_t	InitFmo (PFmo pFmo, PPps pPps, const int32_t kiMbWidth, const int32_t kiMbHeight);
 
 /*!
  * \brief	Uninitialize Wels Flexible Macroblock Ordering (FMO) list
@@ -84,7 +84,7 @@
  *
  * \return	NONE
  */
-void_t UninitFmoList( PFmo pFmo, const int32_t kiCnt, const int32_t kiAvail );
+void_t UninitFmoList (PFmo pFmo, const int32_t kiCnt, const int32_t kiAvail);
 
 /*!
  * \brief	update/insert FMO parameter unit
@@ -96,7 +96,7 @@
  *
  * \return	true - update/insert successfully; false - failed;
  */
-bool_t FmoParamUpdate( PFmo pFmo, PSps pSps, PPps pPps, int32_t *pActiveFmoNum );
+bool_t FmoParamUpdate (PFmo pFmo, PSps pSps, PPps pPps, int32_t* pActiveFmoNum);
 
 /*!
  * \brief	Get successive mb to be processed with given current mb_xy
@@ -106,7 +106,7 @@
  *
  * \return	iNextMb - successful; -1 - failed;
  */
-MB_XY_T FmoNextMb( PFmo pFmo, const MB_XY_T kiMbXy );
+MB_XY_T FmoNextMb (PFmo pFmo, const MB_XY_T kiMbXy); // MB_XY_T is int16_t per the #define above (signed), so the documented -1 failure value is representable
 
 } // namespace WelsDec
 
--- a/codec/decoder/core/inc/get_intra_predictor.h
+++ b/codec/decoder/core/inc/get_intra_predictor.h
@@ -47,36 +47,36 @@
 
 namespace WelsDec {
 
-void_t WelsI4x4LumaPredV_c     (uint8_t *pPred, const int32_t kiStride);
-void_t WelsI4x4LumaPredH_c     (uint8_t *pPred, const int32_t kiStride);
-void_t WelsI4x4LumaPredDc_c    (uint8_t *pPred, const int32_t kiStride);
-void_t WelsI4x4LumaPredDcLeft_c(uint8_t *pPred, const int32_t kiStride);
-void_t WelsI4x4LumaPredDcTop_c (uint8_t *pPred, const int32_t kiStride);
-void_t WelsI4x4LumaPredDcNA_c  (uint8_t *pPred, const int32_t kiStride);
-void_t WelsI4x4LumaPredDDL_c   (uint8_t *pPred, const int32_t kiStride);
-void_t WelsI4x4LumaPredDDLTop_c(uint8_t *pPred, const int32_t kiStride);
-void_t WelsI4x4LumaPredDDR_c   (uint8_t *pPred, const int32_t kiStride);
-void_t WelsI4x4LumaPredVL_c    (uint8_t *pPred, const int32_t kiStride);
-void_t WelsI4x4LumaPredVLTop_c (uint8_t *pPred, const int32_t kiStride);
-void_t WelsI4x4LumaPredVR_c    (uint8_t *pPred, const int32_t kiStride);
-void_t WelsI4x4LumaPredHU_c    (uint8_t *pPred, const int32_t kiStride);
-void_t WelsI4x4LumaPredHD_c    (uint8_t *pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredV_c (uint8_t* pPred, const int32_t kiStride); // every I4x4 luma predictor in this run shares the (pPred, kiStride) signature; NOTE(review): the "_c" suffix presumably marks the portable C versions - confirm
+void_t WelsI4x4LumaPredH_c (uint8_t* pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredDc_c (uint8_t* pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredDcLeft_c (uint8_t* pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredDcTop_c (uint8_t* pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredDcNA_c (uint8_t* pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredDDL_c (uint8_t* pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredDDLTop_c (uint8_t* pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredDDR_c (uint8_t* pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredVL_c (uint8_t* pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredVLTop_c (uint8_t* pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredVR_c (uint8_t* pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredHU_c (uint8_t* pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredHD_c (uint8_t* pPred, const int32_t kiStride);
 
-void_t WelsIChromaPredV_c      (uint8_t *pPred, const int32_t kiStride);
-void_t WelsIChromaPredH_c      (uint8_t *pPred, const int32_t kiStride);
-void_t WelsIChromaPredPlane_c  (uint8_t *pPred, const int32_t kiStride);
-void_t WelsIChromaPredDc_c     (uint8_t *pPred, const int32_t kiStride);
-void_t WelsIChromaPredDcLeft_c (uint8_t *pPred, const int32_t kiStride);
-void_t WelsIChromaPredDcTop_c  (uint8_t *pPred, const int32_t kiStride);
-void_t WelsIChromaPredDcNA_c   (uint8_t *pPred, const int32_t kiStride);
+void_t WelsIChromaPredV_c (uint8_t* pPred, const int32_t kiStride); // chroma predictors: same (pPred, kiStride) signature for every entry in this run
+void_t WelsIChromaPredH_c (uint8_t* pPred, const int32_t kiStride);
+void_t WelsIChromaPredPlane_c (uint8_t* pPred, const int32_t kiStride);
+void_t WelsIChromaPredDc_c (uint8_t* pPred, const int32_t kiStride);
+void_t WelsIChromaPredDcLeft_c (uint8_t* pPred, const int32_t kiStride);
+void_t WelsIChromaPredDcTop_c (uint8_t* pPred, const int32_t kiStride);
+void_t WelsIChromaPredDcNA_c (uint8_t* pPred, const int32_t kiStride);
 
-void_t WelsI16x16LumaPredV_c     (uint8_t *pPred, const int32_t kiStride);
-void_t WelsI16x16LumaPredH_c     (uint8_t *pPred, const int32_t kiStride);
-void_t WelsI16x16LumaPredPlane_c (uint8_t *pPred, const int32_t kiStride);
-void_t WelsI16x16LumaPredDc_c    (uint8_t *pPred, const int32_t kiStride);
-void_t WelsI16x16LumaPredDcTop_c (uint8_t *pPred, const int32_t kiStride);
-void_t WelsI16x16LumaPredDcLeft_c(uint8_t *pPred, const int32_t kiStride);
-void_t WelsI16x16LumaPredDcNA_c  (uint8_t *pPred, const int32_t kiStride);
+void_t WelsI16x16LumaPredV_c (uint8_t* pPred, const int32_t kiStride); // I16x16 luma predictors: same (pPred, kiStride) signature for every entry in this run
+void_t WelsI16x16LumaPredH_c (uint8_t* pPred, const int32_t kiStride);
+void_t WelsI16x16LumaPredPlane_c (uint8_t* pPred, const int32_t kiStride);
+void_t WelsI16x16LumaPredDc_c (uint8_t* pPred, const int32_t kiStride);
+void_t WelsI16x16LumaPredDcTop_c (uint8_t* pPred, const int32_t kiStride);
+void_t WelsI16x16LumaPredDcLeft_c (uint8_t* pPred, const int32_t kiStride);
+void_t WelsI16x16LumaPredDcNA_c (uint8_t* pPred, const int32_t kiStride);
 
 #if defined(__cplusplus)
 extern "C" {
@@ -83,29 +83,29 @@
 #endif//__cplusplus
 
 #if defined(X86_ASM)
-void_t WelsI16x16LumaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
-void_t WelsI16x16LumaPredH_sse2    (uint8_t *pPred, const int32_t kiStride);
-void_t WelsI16x16LumaPredV_sse2    (uint8_t *pPred, const int32_t kiStride);
-void_t WelsI16x16LumaPredDc_sse2   (uint8_t *pPred, const int32_t kiStride);
-void_t WelsI16x16LumaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride);
-void_t WelsI16x16LumaPredDcNA_sse2 (uint8_t *pPred, const int32_t kiStride);
+void_t WelsI16x16LumaPredPlane_sse2 (uint8_t* pPred, const int32_t kiStride); // X86_ASM-only prototypes inside the extern "C" block opened above; parameter list matches the _c declarations
+void_t WelsI16x16LumaPredH_sse2 (uint8_t* pPred, const int32_t kiStride);
+void_t WelsI16x16LumaPredV_sse2 (uint8_t* pPred, const int32_t kiStride);
+void_t WelsI16x16LumaPredDc_sse2 (uint8_t* pPred, const int32_t kiStride);
+void_t WelsI16x16LumaPredDcTop_sse2 (uint8_t* pPred, const int32_t kiStride);
+void_t WelsI16x16LumaPredDcNA_sse2 (uint8_t* pPred, const int32_t kiStride);
 
-void_t WelsIChromaPredDcTop_sse2   (uint8_t *pPred, const int32_t kiStride);
-void_t WelsIChromaPredPlane_sse2   (uint8_t *pPred, const int32_t kiStride);
-void_t WelsIChromaPredDc_sse2      (uint8_t *pPred, const int32_t kiStride);
-void_t WelsIChromaPredH_mmx        (uint8_t *pPred, const int32_t kiStride);
-void_t WelsIChromaPredV_mmx        (uint8_t *pPred, const int32_t kiStride);
-void_t WelsIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride);
-void_t WelsIChromaPredDcNA_mmx  (uint8_t *pPred, const int32_t kiStride);
+void_t WelsIChromaPredDcTop_sse2 (uint8_t* pPred, const int32_t kiStride); // chroma prototypes under the same X86_ASM guard; this run mixes _sse2 and _mmx entry points
+void_t WelsIChromaPredPlane_sse2 (uint8_t* pPred, const int32_t kiStride);
+void_t WelsIChromaPredDc_sse2 (uint8_t* pPred, const int32_t kiStride);
+void_t WelsIChromaPredH_mmx (uint8_t* pPred, const int32_t kiStride);
+void_t WelsIChromaPredV_mmx (uint8_t* pPred, const int32_t kiStride);
+void_t WelsIChromaPredDcLeft_mmx (uint8_t* pPred, const int32_t kiStride);
+void_t WelsIChromaPredDcNA_mmx (uint8_t* pPred, const int32_t kiStride);
 
-void_t WelsI4x4LumaPredH_sse2 (uint8_t *pPred, const int32_t kiStride);
-void_t WelsI4x4LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride);
-void_t WelsI4x4LumaPredDDR_mmx(uint8_t *pPred, const int32_t kiStride);
-void_t WelsI4x4LumaPredHD_mmx (uint8_t *pPred, const int32_t kiStride);
-void_t WelsI4x4LumaPredHU_mmx (uint8_t *pPred, const int32_t kiStride);
-void_t WelsI4x4LumaPredVR_mmx (uint8_t *pPred, const int32_t kiStride);
-void_t WelsI4x4LumaPredDDL_mmx(uint8_t *pPred, const int32_t kiStride);
-void_t WelsI4x4LumaPredVL_mmx (uint8_t *pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredH_sse2 (uint8_t* pPred, const int32_t kiStride); // I4x4 luma prototypes under the same X86_ASM guard; no DcLeft/DcTop/DcNA/DDLTop/VLTop/V counterparts are declared in this run
+void_t WelsI4x4LumaPredDc_sse2 (uint8_t* pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredDDR_mmx (uint8_t* pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredHD_mmx (uint8_t* pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredHU_mmx (uint8_t* pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredVR_mmx (uint8_t* pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredDDL_mmx (uint8_t* pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredVL_mmx (uint8_t* pPred, const int32_t kiStride);
 #endif//X86_ASM
 
 #if defined(__cplusplus)
--- a/codec/decoder/core/inc/ls_defines.h
+++ b/codec/decoder/core/inc/ls_defines.h
@@ -37,26 +37,32 @@
 
 #ifdef __GNUC__
 
-	struct tagUnaligned_64 { uint64_t l; } __attribute__((packed));
-	struct tagUnaligned_32 { uint32_t l; } __attribute__((packed));
-	struct tagUnaligned_16 { uint16_t l; } __attribute__((packed));
-	
-	#define LD16(a) (((struct tagUnaligned_16 *) (a))->l)
-	#define LD32(a) (((struct tagUnaligned_32 *) (a))->l)
-	#define LD64(a) (((struct tagUnaligned_64 *) (a))->l)
-	//#define _USE_STRUCT_INT_CVT
+struct tagUnaligned_64 { // __GNUC__ branch only: packed one-member wrappers that the LD*/ST* macros cast through
+  uint64_t l;
+} __attribute__ ((packed)); // packed drops the member's natural alignment requirement
+struct tagUnaligned_32 {
+  uint32_t l;
+} __attribute__ ((packed));
+struct tagUnaligned_16 {
+  uint16_t l;
+} __attribute__ ((packed));
+
+#define LD16(a) (((struct tagUnaligned_16 *) (a))->l) // read 16 bits at address a through the packed wrapper
+#define LD32(a) (((struct tagUnaligned_32 *) (a))->l) // read 32 bits at address a through the packed wrapper
+#define LD64(a) (((struct tagUnaligned_64 *) (a))->l) // read 64 bits at address a through the packed wrapper
+//#define _USE_STRUCT_INT_CVT
 //	#ifdef _USE_STRUCT_INT_CVT
-		#define ST16(a, b) (((struct tagUnaligned_16 *) (a))->l) = (b)
-		#define ST32(a, b) (((struct tagUnaligned_32 *) (a))->l) = (b)
-		#define ST64(a, b) (((struct tagUnaligned_64 *) (a))->l) = (b)
+#define ST16(a, b) (((struct tagUnaligned_16 *) (a))->l) = (b) // store b at address a; NOTE(review): expands to an unparenthesized assignment, so use it only as a full statement
+#define ST32(a, b) (((struct tagUnaligned_32 *) (a))->l) = (b) // 32-bit counterpart of ST16
+#define ST64(a, b) (((struct tagUnaligned_64 *) (a))->l) = (b) // 64-bit counterpart of ST16
 //	#else
 //		inline void_t __ST16(void_t *dst, uint16_t v) { memcpy(dst, &v, 2); }
 //		inline void_t __ST32(void_t *dst, uint32_t v) { memcpy(dst, &v, 4); }
-		//inline void_t __ST64(void_t *dst, uint64_t v) { memcpy(dst, &v, 8); }
+//inline void_t __ST64(void_t *dst, uint64_t v) { memcpy(dst, &v, 8); }
 //	#endif
 
 #else
-	
+
 //#define INTD16(a) (*((int16_t*)(a)))
 //#define INTD32(a) (*((int32_t*)(a)))
 //#define INTD64(a) (*((int64_t*)(a)))
--- a/codec/decoder/core/inc/macros.h
+++ b/codec/decoder/core/inc/macros.h
@@ -42,7 +42,7 @@
 
 #include <math.h>
 #include <assert.h>
-#include "typedefs.h"
+#include "typedefs.h"
 
 
 namespace WelsDec {
@@ -59,32 +59,32 @@
 	_tp _nm ## _tEmP[(_sz)+(_al)-1]; \
 	_tp *_nm = _nm ## _tEmP + ((_al)-1) - (((int32_t)(_nm ## _tEmP + ((_al)-1)) & ((_al)-1))/sizeof(_tp))
 
-
-#define ENFORCE_STACK_ALIGN_2D(_tp, _nm, _cx, _cy, _al) \
-	assert( ((_al) && !((_al) & ((_al) - 1))) && ((_al) >= sizeof(_tp)) ); /*_al should be power-of-2 and >= sizeof(_tp)*/\
-	_tp _nm ## _tEmP[(_cx)*(_cy)+(_al)/sizeof(_tp)-1]; \
-	_tp *_nm ## _tEmP_al = _nm ## _tEmP + ((_al)/sizeof(_tp)-1); \
-	_nm ## _tEmP_al -= (((int32_t)_nm ## _tEmP_al & ((_al)-1))/sizeof(_tp)); \
+
+#define ENFORCE_STACK_ALIGN_2D(_tp, _nm, _cx, _cy, _al) \
+	assert( ((_al) && !((_al) & ((_al) - 1))) && ((_al) >= sizeof(_tp)) ); /*_al should be power-of-2 and >= sizeof(_tp)*/\
+	_tp _nm ## _tEmP[(_cx)*(_cy)+(_al)/sizeof(_tp)-1]; /* raw buffer: _cx*_cy elements plus (_al/sizeof(_tp) - 1) spare elements for alignment */ \
+	_tp *_nm ## _tEmP_al = _nm ## _tEmP + ((_al)/sizeof(_tp)-1); /* start from the highest candidate position */ \
+	_nm ## _tEmP_al -= (((int32_t)_nm ## _tEmP_al & ((_al)-1))/sizeof(_tp)); /* step back to an _al-byte boundary; NOTE(review): the int32_t cast narrows the pointer where pointers are wider than 32 bits (only the low bits feed the mask) - confirm this builds on 64-bit targets */ \
 	_tp (*_nm)[(_cy)] = (_tp (*)[(_cy)])_nm ## _tEmP_al; /* expose the aligned storage as rows of _cy elements, i.e. _nm[x][y] */
 
 
 ///////////// from encoder
 #if defined(_MSC_VER)
-	#define inline	__inline
-    #define __FASTCALL   __fastcall
+#define inline	__inline
+#define __FASTCALL   __fastcall
 //	#define __align8(t,v) __declspec(align(8)) t v
-	#define __align16(t,v) __declspec(align(16)) t v
+#define __align16(t,v) __declspec(align(16)) t v
 #elif defined(__GNUC__)
 #if !defined(MAC_POWERPC) && !defined(UNIX) && !defined(ANDROID_NDK) && !defined(APPLE_IOS)
-    #define __FASTCALL    __attribute__ ((fastcall))// linux, centos, mac_x86 can be used
+#define __FASTCALL    __attribute__ ((fastcall))// linux, centos, mac_x86 can be used
 #else
-	#define __FASTCALL	// mean NULL for mac_ppc, solaris(sparc/x86)
+#define __FASTCALL	// expands to nothing (no calling-convention attribute) for mac_ppc, solaris(sparc/x86) and the other targets excluded by the #if above
 #endif//MAC_POWERPC
 //	#define __align8(t,v) t v __attribute__ ((aligned (8)))
-	#define __align16(t,v) t v __attribute__ ((aligned (16)))
+#define __align16(t,v) t v __attribute__ ((aligned (16)))
 
-#if defined(APPLE_IOS)  
-    #define inline  //For iOS platform
+#if defined(APPLE_IOS)
+#define inline  //For iOS platform
 #endif
 
 #endif//_MSC_VER
@@ -143,45 +143,42 @@
 	nC += (uint8_t)(nA == -1 && nB == -1);           \
 }
 
-static __inline int32_t CeilLog2( int32_t i )
-{
-	int32_t s = 0; i--;
-	while( i > 0 )
-	{
-		s++;
-		i >>= 1;
-	}
-	return s;
+static __inline int32_t CeilLog2 (int32_t i) { // smallest s such that (1 << s) >= i; returns 0 for any i <= 1
+int32_t s = 0;
+i--; // work on i-1 so exact powers of two are not rounded up
+while (i > 0) { // count how many bits are needed to represent i-1
+  s++;
+  i >>= 1;
 }
+return s;
+}
 /*
 the second path will degrades the performance
 */
 #if 1
-static inline int32_t WelsMedian(int32_t iX,  int32_t iY, int32_t iZ)
-{
-	int32_t iMin = iX, iMax = iX;	
-	
-	if ( iY < iMin )
-		iMin	= iY;
-	else
-		iMax = iY;
+static inline int32_t WelsMedian (int32_t iX,  int32_t iY, int32_t iZ) { // median of three, computed as (sum of all three) - (min + max); this is the version selected by the "#if 1" above
+int32_t iMin = iX, iMax = iX; // running min/max, both seeded with iX
 
-	if ( iZ < iMin )
-		iMin	= iZ;
-	else if ( iZ > iMax )
-		iMax	= iZ;
+if (iY < iMin)
+  iMin	= iY;
+else
+  iMax = iY;
 
-	return (iX + iY + iZ) - (iMin + iMax);
+if (iZ < iMin) // fold iZ into the running min/max
+  iMin	= iZ;
+else if (iZ > iMax)
+  iMax	= iZ;
+
+return (iX + iY + iZ) - (iMin + iMax); // what remains after removing min and max is the median; NOTE(review): the sums can overflow int32_t for large magnitudes - confirm callers stay in range
 }
 #else
-static inline int32_t WelsMedian(int32_t iX,  int32_t iY, int32_t iZ)
-{
-	int32_t iTmp = (iX-iY)&((iX-iY)>>31);
-	iX -= iTmp;
-	iY += iTmp;
-	iY -= (iY-iZ)&((iY-iZ)>>31);
-	iY += (iX-iY)&((iX-iY)>>31);
-	return iY;
+static inline int32_t WelsMedian (int32_t iX,  int32_t iY, int32_t iZ) { // branch-free variant; compiled out while the "#if 1" above selects the first version
+int32_t iTmp = (iX - iY) & ((iX - iY) >> 31); // (d >> 31) is an all-ones mask when d < 0 (assumes arithmetic right shift), so iTmp = min(iX - iY, 0)
+iX -= iTmp; // iX = max of the original iX, iY
+iY += iTmp; // iY = min of the original iX, iY
+iY -= (iY - iZ) & ((iY - iZ) >> 31); // iY = max(iY, iZ)
+iY += (iX - iY) & ((iX - iY) >> 31); // iY = min(iX, iY), which is the median
+return iY;
 }
 
 #endif
@@ -222,7 +219,7 @@
 #endif//#if WELS_VERIFY_RETURN_IF
 
 /*
- *	Description: to check variable validation and return the specified result 
+ *	Description: to check variable validation and return the specified result
  *		with correspoinding process advance.
  *	 result:	value to be return
  *	 case_if:	negative condition to be verified
@@ -281,7 +278,7 @@
  * Description: to safe free an array ptr with free function pointer
  *	arr:		pointer to an array, something like "**p";
  *	num:		number of elements in array
- *  free_fn:	free function pointer	
+ *  free_fn:	free function pointer
  */
 #ifndef WELS_SAFE_FREE_ARR
 #define WELS_SAFE_FREE_ARR(pArray, iNum, fFreeFunc) \
--- a/codec/decoder/core/inc/manage_dec_ref.h
+++ b/codec/decoder/core/inc/manage_dec_ref.h
@@ -47,32 +47,32 @@
 
 namespace WelsDec {
 
-typedef enum TagRemoveFlag{
-	REMOVE_TARGET = 0,
-	REMOVE_BASE = 1,	
-	REMOVE_BASE_FIRST = 2
-}ERemoveFlag;
+typedef enum TagRemoveFlag { // type of the eRemoveFlag argument taken by the WelsDel*FromList* helpers declared below
+REMOVE_TARGET = 0,
+REMOVE_BASE = 1,
+REMOVE_BASE_FIRST = 2 // NOTE(review): the meaning of each selector is not visible in this header - confirm at the helper definitions
+} ERemoveFlag;
 
-void_t  WelsResetRefPic   (PWelsDecoderContext pCtx);
-int32_t WelsInitRefList   (PWelsDecoderContext pCtx, int32_t iPoc);
-int32_t WelsReorderRefList(PWelsDecoderContext pCtx);
-int32_t WelsMarkAsRef     (PWelsDecoderContext pCtx, const bool_t kbRefBaseMarkingFlag);
+void_t  WelsResetRefPic (PWelsDecoderContext pCtx);
+int32_t WelsInitRefList (PWelsDecoderContext pCtx, int32_t iPoc);
+int32_t WelsReorderRefList (PWelsDecoderContext pCtx);
+int32_t WelsMarkAsRef (PWelsDecoderContext pCtx, const bool_t kbRefBaseMarkingFlag);
 
-static PPicture WelsDelShortFromList        (PRefPic pRefPic, int32_t iFrameNum,           ERemoveFlag eRemoveFlag);
-static PPicture WelsDelLongFromList         (PRefPic pRefPic, uint32_t uiLongTermFrameIdx, ERemoveFlag eRemoveFlag);
-static PPicture WelsDelShortFromListSetUnref(PRefPic pRefPic, int32_t iFrameNum,           ERemoveFlag eRemoveFlag);
+static PPicture WelsDelShortFromList (PRefPic pRefPic, int32_t iFrameNum,           ERemoveFlag eRemoveFlag); // NOTE(review): "static" prototypes in a header give every includer its own internal-linkage declaration; a translation unit that includes this without defining them will typically draw "declared static but never defined" warnings
+static PPicture WelsDelLongFromList (PRefPic pRefPic, uint32_t uiLongTermFrameIdx, ERemoveFlag eRemoveFlag);
+static PPicture WelsDelShortFromListSetUnref (PRefPic pRefPic, int32_t iFrameNum,           ERemoveFlag eRemoveFlag);
 static PPicture WelsDelLongFromListSetUnref (PRefPic pRefPic, uint32_t uiLongTermFrameIdx, ERemoveFlag eRemoveFlag);
 
-static int32_t MMCOBase     (PWelsDecoderContext pCtx, PRefBasePicMarking pRefPicBaseMarking);
-static int32_t MMCO         (PWelsDecoderContext pCtx, PRefPicMarking pRefPicMarking);
-static int32_t MMCOProcess  (PWelsDecoderContext pCtx, uint32_t uiMmcoType, bool_t bRefBasePic,
-                               int32_t iShortFrameNum, uint32_t uiLongTermPicNum, int32_t iLongTermFrameIdx, int32_t iMaxLongTermFrameIdx);
-static int32_t SlidingWindow(PWelsDecoderContext pCtx);
+static int32_t MMCOBase (PWelsDecoderContext pCtx, PRefBasePicMarking pRefPicBaseMarking); // static prototypes only: no definitions appear in the visible part of this header
+static int32_t MMCO (PWelsDecoderContext pCtx, PRefPicMarking pRefPicMarking);
+static int32_t MMCOProcess (PWelsDecoderContext pCtx, uint32_t uiMmcoType, bool_t bRefBasePic,
+                            int32_t iShortFrameNum, uint32_t uiLongTermPicNum, int32_t iLongTermFrameIdx, int32_t iMaxLongTermFrameIdx); // NOTE(review): which arguments apply to which uiMmcoType is not visible here - confirm at the definition
+static int32_t SlidingWindow (PWelsDecoderContext pCtx);
 
-static int32_t AddShortTermToList(PRefPic pRefPic, PPicture pPic);
+static int32_t AddShortTermToList (PRefPic pRefPic, PPicture pPic);
 static int32_t AddLongTermToList (PRefPic pRefPic, PPicture pPic, int32_t iLongTermFrameIdx);
 static int32_t AssignLongTermIdx (PRefPic pRefPic, int32_t iFrameNum, int32_t iLongTermFrameIdx);
-static int32_t MarkAsLongTerm    (PRefPic pRefPic, int32_t iFrameNum, int32_t iLongTermFrameIdx);
+static int32_t MarkAsLongTerm (PRefPic pRefPic, int32_t iFrameNum, int32_t iLongTermFrameIdx);
 
 } // namespace WelsDec
 
--- a/codec/decoder/core/inc/mb_cache.h
+++ b/codec/decoder/core/inc/mb_cache.h
@@ -48,7 +48,7 @@
  */
 /*
  * Cache for Luma				Cache for Chroma(Cb, Cr)
- *	
+ *
  *	TL T T T T					TL T T
  *	 L - - - -					 L - -
  *	 L - - - -					 L - - TR
@@ -66,18 +66,17 @@
 
 extern const uint8_t g_kuiScan4[16];
 
-typedef struct TagNeighborAvail
-{
-	int32_t iTopAvail;
-	int32_t iLeftAvail;
-	int32_t iRightTopAvail;
-	int32_t iLeftTopAvail;  //used for check intra_pred_mode avail or not   //1: avail; 0: unavail
+typedef struct TagNeighborAvail { // exported below as SNeighAvail (value type) and PNeighAvail (pointer type)
+int32_t iTopAvail; // NOTE(review): presumably uses the same 1 = avail / 0 = unavail convention stated for iLeftTopAvail - confirm
+int32_t iLeftAvail;
+int32_t iRightTopAvail;
+int32_t iLeftTopAvail;  //used for check intra_pred_mode avail or not   //1: avail; 0: unavail
 
-	int32_t iLeftType;
-	int32_t iTopType;
-	int32_t iLeftTopType;
-	int32_t iRightTopType; 
-}SNeighAvail, *PNeighAvail;
+int32_t iLeftType;
+int32_t iTopType;
+int32_t iLeftTopType;
+int32_t iRightTopType;
+} SNeighAvail, *PNeighAvail;
 
 } // namespace WelsDec
 
--- a/codec/decoder/core/inc/mc.h
+++ b/codec/decoder/core/inc/mc.h
@@ -39,7 +39,7 @@
 
 namespace WelsDec {
 
-void_t InitMcFunc(SMcFunc *pMcFunc, int32_t iCpu);
+void_t InitMcFunc (SMcFunc* pMcFunc, int32_t iCpu); // declared before the extern "C" block below, so it is a C++-linkage member of namespace WelsDec; NOTE(review): presumably fills pMcFunc according to iCpu - confirm at the definition
 
 #if defined(__cplusplus)
 extern "C" {
@@ -49,24 +49,39 @@
 //                       MMXEXT definition                          //
 //***************************************************************************//
 #if defined(X86_ASM)
-typedef void_t (*PMcChromaWidthExtFunc)( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, const uint8_t *kpABCD, int32_t iHeight );
-extern void_t McHorVer20WidthEq4_mmx (uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight);
-extern void_t McChromaWidthEq4_mmx   (uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, const uint8_t *kpABCD, int32_t iHeight );
-extern void_t McCopyWidthEq4_mmx     (uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight);
-extern void_t McCopyWidthEq8_mmx     (uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight);
-extern void_t PixelAvgWidthEq4_mmx   (uint8_t *pDst, int32_t iDstStride, uint8_t *pSrcA, int32_t iSrcAStride, uint8_t *pSrcB, int32_t iSrcBStride, int32_t iHeight);
-extern void_t PixelAvgWidthEq8_mmx   (uint8_t *pDst, int32_t iDstStride, uint8_t *pSrcA, int32_t iSrcAStride, uint8_t *pSrcB, int32_t iSrcBStride, int32_t iHeight);
+typedef void_t (*PMcChromaWidthExtFunc) (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    const uint8_t* kpABCD, int32_t iHeight); // function-pointer type whose parameter list matches McChromaWidthEq4_mmx and McChromaWidthEq8_sse2 declared in this header
+extern void_t McHorVer20WidthEq4_mmx (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight); // X86_ASM-only prototype inside the extern "C" block opened above (applies to every declaration in this run)
+extern void_t McChromaWidthEq4_mmx (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    const uint8_t* kpABCD, int32_t iHeight);
+extern void_t McCopyWidthEq4_mmx (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                  int32_t iHeight);
+extern void_t McCopyWidthEq8_mmx (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                  int32_t iHeight);
+extern void_t PixelAvgWidthEq4_mmx (uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, int32_t iSrcAStride,
+                                    uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight); // note the argument order: destination first, then the two sources (unlike the Mc* functions, which take the source first)
+extern void_t PixelAvgWidthEq8_mmx (uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, int32_t iSrcAStride,
+                                    uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
 //***************************************************************************//
 //                       SSE2 definition                          //
 //***************************************************************************//
-extern void_t McChromaWidthEq8_sse2   (uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, const uint8_t* kpABCD, int32_t iHeight );
-extern void_t McCopyWidthEq16_sse2    (uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight);
-extern void_t McHorVer20WidthEq8_sse2 (uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight);
-extern void_t McHorVer20WidthEq16_sse2(uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight);
-extern void_t McHorVer02WidthEq8_sse2 (uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight);
-extern void_t McHorVer22Width8HorFirst_sse2(uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight);
-extern void_t McHorVer22VerLast_sse2(uint8_t * pTap, int32_t iTapStride, uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight);
-extern void_t PixelAvgWidthEq16_sse2  (uint8_t *pDst, int32_t iDstStride, uint8_t *pSrcA, int32_t iSrcAStride, uint8_t *pSrcB, int32_t iSrcBStride, int32_t iHeight);
+extern void_t McChromaWidthEq8_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                     const uint8_t* kpABCD, int32_t iHeight); // parameter list matches the PMcChromaWidthExtFunc typedef declared earlier in this header
+extern void_t McCopyWidthEq16_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t iHeight);
+extern void_t McHorVer20WidthEq8_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                       int32_t iHeight);
+extern void_t McHorVer20WidthEq16_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iHeight);
+extern void_t McHorVer02WidthEq8_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                       int32_t iHeight);
+extern void_t McHorVer22Width8HorFirst_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int32_t iHeight); // NOTE(review): pDst is typed uint8_t* here; whether it receives wider intermediate taps (as the HorFirst/VerLast pairing suggests) is not visible - confirm in the asm source
+extern void_t McHorVer22VerLast_sse2 (uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight); // the only entry point in this run that takes an explicit iWidth
+extern void_t PixelAvgWidthEq16_sse2 (uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, int32_t iSrcAStride,
+                                      uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight); // destination first, then the two sources
 
 #endif //X86_ASM
 
--- a/codec/decoder/core/inc/measure_time.h
+++ b/codec/decoder/core/inc/measure_time.h
@@ -64,31 +64,30 @@
  * \return	time elapsed since run (unit: microsecond)
  */
 
-int64_t WelsTime( void_t )
-{
+int64_t WelsTime (void_t) { // NOTE(review): non-inline, non-static function definition in a header - including it from more than one translation unit risks duplicate-symbol link errors; confirm it has a single includer
 #if !(defined(_MSC_VER) || defined(__MINGW32__)) // neither MSVC nor MinGW: gettimeofday path
-	struct timeval tv_date;
-	
-	gettimeofday( &tv_date, NULL );
-	return( (int64_t) tv_date.tv_sec * 1000000 + (int64_t) tv_date.tv_usec );
+  struct timeval tv_date;
+
+  gettimeofday (&tv_date, NULL); // return value is not checked
+  return ((int64_t) tv_date.tv_sec * 1000000 + (int64_t) tv_date.tv_usec); // whole seconds scaled to microseconds plus the sub-second microseconds
 #else
 #if defined (WIN32)
-	static int64_t iMtimeFreq = 0;
-	int64_t iMtimeCur = 0;
-	int64_t iResult = 0;
-	if ( !iMtimeFreq ){
-		QueryPerformanceFrequency((LARGE_INTEGER *)&iMtimeFreq);
-		if ( !iMtimeFreq )
-			iMtimeFreq = 1;
-	}
-	QueryPerformanceCounter((LARGE_INTEGER *)&iMtimeCur);
-	iResult = (int64_t)((double)iMtimeCur * 1e6 / (double)iMtimeFreq + 0.5);
-	return iResult;
+  static int64_t iMtimeFreq = 0; // counter frequency, queried on the first call and cached; NOTE(review): the lazy init is unsynchronized - confirm callers do not race on the first call
+  int64_t iMtimeCur = 0;
+  int64_t iResult = 0;
+  if (!iMtimeFreq) {
+    QueryPerformanceFrequency ((LARGE_INTEGER*)&iMtimeFreq); // int64_t storage is reinterpreted as LARGE_INTEGER
+    if (!iMtimeFreq)
+      iMtimeFreq = 1; // avoid dividing by zero below if no frequency was reported
+  }
+  QueryPerformanceCounter ((LARGE_INTEGER*)&iMtimeCur);
+  iResult = (int64_t) ((double)iMtimeCur * 1e6 / (double)iMtimeFreq + 0.5); // ticks -> microseconds, rounded to nearest via +0.5
+  return iResult;
 #else
-	struct _timeb sTime;
-	
-	_ftime(&sTime);
-	return ((int64_t)sTime.time * (1000) + (int64_t)sTime.millitm) * (1000);
+  struct _timeb sTime;
+
+  _ftime (&sTime);
+  return ((int64_t)sTime.time * (1000) + (int64_t)sTime.millitm) * (1000); // builds a millisecond count, then scales to microseconds (so this path only has millisecond resolution)
 #endif//#if WIN32
 #endif//!(defined(_MSC_VER) || defined(__MINGW32__))
 }
--- a/codec/decoder/core/inc/mem_align.h
+++ b/codec/decoder/core/inc/mem_align.h
@@ -50,7 +50,7 @@
 
 
 
-/*! 
+/*!
 *************************************************************************************
 * \brief	malloc with zero filled utilization in Wels
 *
@@ -61,13 +61,13 @@
 * \note	N/A
 *************************************************************************************
 */
-void_t * WelsMalloc( const uint32_t kuiSize, const str_t *kpTag );
+void_t* WelsMalloc (const uint32_t kuiSize, const str_t* kpTag);
 
-/*! 
+/*!
 *************************************************************************************
 * \brief	free utilization in Wels
 *
-* \param 	pPtr	data pointer to be free. 
+* \param 	pPtr	data pointer to be free.
 *			i.e, uint8_t *pPtr = actual data to be free, argv = &pPtr.
 *
 * \return	NONE
@@ -75,7 +75,7 @@
 * \note	N/A
 *************************************************************************************
 */
-void_t WelsFree( void_t * pPtr, const str_t *kpTag );
+void_t WelsFree (void_t* pPtr, const str_t* kpTag); // NOTE(review): the doc block above says "argv = &pPtr", but WELS_SAFE_FREE below passes the data pointer itself - confirm which is intended and correct the comment
 
 #define WELS_SAFE_FREE(pPtr, pTag)		if (pPtr) { WelsFree(pPtr, pTag); pPtr = NULL; }
 
--- a/codec/decoder/core/inc/memmgr_nal_unit.h
+++ b/codec/decoder/core/inc/memmgr_nal_unit.h
@@ -47,9 +47,9 @@
 
 namespace WelsDec {
 
-int32_t MemInitNalList(PAccessUnit *ppAu, const uint32_t kuiSize);
+int32_t MemInitNalList (PAccessUnit* ppAu, const uint32_t kuiSize);
 
-int32_t MemFreeNalList(PAccessUnit *ppAu);
+int32_t MemFreeNalList (PAccessUnit* ppAu);
 
 /*
  *	MemGetNextNal
@@ -56,7 +56,7 @@
  *	Get next NAL Unit for using.
  *	Need expand NAL Unit list if exceeding count number of available NAL Units withing an Access Unit
  */
-PNalUnit MemGetNextNal(PAccessUnit *ppAu);
+PNalUnit MemGetNextNal (PAccessUnit* ppAu); // takes the access unit by address; NOTE(review): presumably so the list can be replaced when it is expanded (see the comment above) - confirm at the definition
 
 } // namespace WelsDec
 
--- a/codec/decoder/core/inc/mv_pred.h
+++ b/codec/decoder/core/inc/mv_pred.h
@@ -47,51 +47,53 @@
 
 /*!
 * \brief	 update mv and ref_index cache for current MB, only for P_16x16 (SKIP inclusive)
-* \param	 
-* \param	 
+* \param
+* \param
 */
-void_t UpdateP16x16MotionInfo(PDqLayer pCurDqLayer, int8_t iRef, int16_t iMVs[2]);
+void_t UpdateP16x16MotionInfo (PDqLayer pCurDqLayer, int8_t iRef, int16_t iMVs[2]);
 
- /*!
- * \brief   update mv and ref_index cache for current MB, only for P_16x8
- * \param 	
- * \param 	
- */
-void_t UpdateP16x8MotionInfo(PDqLayer pCurDqLayer, int16_t iMotionVector[LIST_A][30][MV_A], int8_t iRefIndex[LIST_A][30], 
-							  int32_t iPartIdx, int8_t iRef, int16_t iMVs[2]);
+/*!
+* \brief   update mv and ref_index cache for current MB, only for P_16x8
+* \param
+* \param
+*/
+void_t UpdateP16x8MotionInfo (PDqLayer pCurDqLayer, int16_t iMotionVector[LIST_A][30][MV_A],
+                              int8_t iRefIndex[LIST_A][30],
+                              int32_t iPartIdx, int8_t iRef, int16_t iMVs[2]);
 
 
- /*!
-  * \brief	 update mv and ref_index cache for current MB, only for P_8x16
-  * \param	 
-  * \param	 
-  */
-void_t UpdateP8x16MotionInfo(PDqLayer pCurDqLayer, int16_t iMotionVector[LIST_A][30][MV_A], int8_t iRefIndex[LIST_A][30], 
-							  int32_t iPartIdx, int8_t iRef, int16_t iMVs[2]);
- 
 /*!
+ * \brief	 update mv and ref_index cache for current MB, only for P_8x16
+ * \param
+ * \param
+ */
+void_t UpdateP8x16MotionInfo (PDqLayer pCurDqLayer, int16_t iMotionVector[LIST_A][30][MV_A],
+                              int8_t iRefIndex[LIST_A][30],
+                              int32_t iPartIdx, int8_t iRef, int16_t iMVs[2]);
+
+/*!
  * \brief   get the motion predictor for 4*4 or 8*8 or 16*16 block
- * \param 	
+ * \param
  * \param 	output mvp_x and mvp_y
  */
-void_t PredMv(int16_t iMotionVector[LIST_A][30][MV_A], int8_t iRefIndex[LIST_A][30], 
-			 int32_t iPartIdx, int32_t iPartWidth, int8_t iRef, int16_t iMVP[2]);
+void_t PredMv (int16_t iMotionVector[LIST_A][30][MV_A], int8_t iRefIndex[LIST_A][30],
+               int32_t iPartIdx, int32_t iPartWidth, int8_t iRef, int16_t iMVP[2]);
 
 /*!
  * \brief   get the motion predictor for inter16x8 MB
- * \param 	
+ * \param
  * \param 	output mvp_x and mvp_y
  */
-void_t PredInter16x8Mv(int16_t iMotionVector[LIST_A][30][MV_A], int8_t iRefIndex[LIST_A][30], 
-						int32_t iPartIdx, int8_t iRef, int16_t iMVP[2]);
+void_t PredInter16x8Mv (int16_t iMotionVector[LIST_A][30][MV_A], int8_t iRefIndex[LIST_A][30],
+                        int32_t iPartIdx, int8_t iRef, int16_t iMVP[2]);
 
 /*!
  * \brief   get the motion predictor for inter8x16 MB
- * \param 	
+ * \param
  * \param 	output mvp_x and mvp_y
  */
-void_t PredInter8x16Mv(int16_t iMotionVector[LIST_A][30][MV_A], int8_t iRefIndex[LIST_A][30], 
-						int32_t iPartIdx, int8_t iRef, int16_t iMVP[2]);
+void_t PredInter8x16Mv (int16_t iMotionVector[LIST_A][30][MV_A], int8_t iRefIndex[LIST_A][30],
+                        int32_t iPartIdx, int8_t iRef, int16_t iMVP[2]);
 
 } // namespace WelsDec
 
--- a/codec/decoder/core/inc/nal_prefix.h
+++ b/codec/decoder/core/inc/nal_prefix.h
@@ -45,42 +45,42 @@
 ///////////////////////////////////NAL Unit prefix/headers///////////////////////////////////
 
 /* NAL Unix Header in AVC, refer to Page 56 in JVT X201wcm */
-typedef struct TagNalUnitHeader{
-	uint8_t		    uiForbiddenZeroBit;
-	uint8_t		    uiNalRefIdc;
-	ENalUnitType    eNalUnitType;
-	uint8_t		    uiReservedOneByte;		// only padding usage
-}SNalUnitHeader, *PNalUnitHeader;
+typedef struct TagNalUnitHeader {
+  uint8_t		    uiForbiddenZeroBit;
+  uint8_t		    uiNalRefIdc;
+  ENalUnitType    eNalUnitType;
+  uint8_t		    uiReservedOneByte;		// only padding usage
+} SNalUnitHeader, *PNalUnitHeader;
 
 /* NAL Unit Header in scalable extension syntax, refer to Page 390 in JVT X201wcm */
-typedef struct TagNalUnitHeaderExt{
-	SNalUnitHeader	sNalUnitHeader;
-	
+typedef struct TagNalUnitHeaderExt {
+  SNalUnitHeader	sNalUnitHeader;
+
 //	uint8_t		reserved_one_bit;
-	bool_t		bIdrFlag;
-	uint8_t		uiPriorityId;
-	int8_t		iNoInterLayerPredFlag;	// change as int8_t to support 3 values probably in encoder	
-	uint8_t		uiDependencyId;
+  bool_t		bIdrFlag;
+  uint8_t		uiPriorityId;
+  int8_t		iNoInterLayerPredFlag;	// change as int8_t to support 3 values probably in encoder
+  uint8_t		uiDependencyId;
 
-	uint8_t		uiQualityId;
-	uint8_t		uiTemporalId;
-	bool_t		bUseRefBasePicFlag;
-	bool_t		bDiscardableFlag;
-	
-	bool_t		bOutputFlag;
-	uint8_t		uiReservedThree2Bits;
-	// Derived variable(s)
-	uint8_t		uiLayerDqId;
-	bool_t		bNalExtFlag;
-}SNalUnitHeaderExt, *PNalUnitHeaderExt;
+  uint8_t		uiQualityId;
+  uint8_t		uiTemporalId;
+  bool_t		bUseRefBasePicFlag;
+  bool_t		bDiscardableFlag;
 
+  bool_t		bOutputFlag;
+  uint8_t		uiReservedThree2Bits;
+  // Derived variable(s)
+  uint8_t		uiLayerDqId;
+  bool_t		bNalExtFlag;
+} SNalUnitHeaderExt, *PNalUnitHeaderExt;
+
 /* Prefix NAL Unix syntax, refer to Page 392 in JVT X201wcm */
-typedef struct TagPrefixNalUnit{
-	SRefBasePicMarking	sRefPicBaseMarking;	
-	bool_t		bStoreRefBasePicFlag;		
-	bool_t		bPrefixNalUnitAdditionalExtFlag;
-	bool_t		bPrefixNalUnitExtFlag;
-}SPrefixNalUnit, *PPrefixNalUnit;
+typedef struct TagPrefixNalUnit {
+  SRefBasePicMarking	sRefPicBaseMarking;
+  bool_t		bStoreRefBasePicFlag;
+  bool_t		bPrefixNalUnitAdditionalExtFlag;
+  bool_t		bPrefixNalUnitExtFlag;
+} SPrefixNalUnit, *PPrefixNalUnit;
 
 //#pragma pack()
 
--- a/codec/decoder/core/inc/nalu.h
+++ b/codec/decoder/core/inc/nalu.h
@@ -44,35 +44,35 @@
 ///////////////////////////////////NAL UNIT level///////////////////////////////////
 
 /* NAL Unit Structure */
-typedef struct TagNalUnit{
-	SNalUnitHeaderExt	sNalHeaderExt;
-	
-	union{
-		struct SVclNal{
-			SSliceHeaderExt	sSliceHeaderExt;
-			SBitStringAux	sSliceBitsRead;
-			uint8_t 		*pNalPos;	  // save the address of slice nal for GPU function
-			int32_t 		iNalLength;   // save the nal length for GPU function
-			bool_t			bSliceHeaderExtFlag;
-		} sVclNal;
-		SPrefixNalUnit	sPrefixNal;
-	} sNalData;		
-	
-}SNalUnit, *PNalUnit;
+typedef struct TagNalUnit {
+SNalUnitHeaderExt	sNalHeaderExt;
 
+union {
+  struct SVclNal {
+    SSliceHeaderExt	sSliceHeaderExt;
+    SBitStringAux	sSliceBitsRead;
+    uint8_t*		 pNalPos;	  // save the address of slice nal for GPU function
+    int32_t 		iNalLength;   // save the nal length for GPU function
+    bool_t			bSliceHeaderExtFlag;
+  } sVclNal;
+  SPrefixNalUnit	sPrefixNal;
+} sNalData;
+
+} SNalUnit, *PNalUnit;
+
 ///////////////////////////////////ACCESS Unit level///////////////////////////////////
 
 /* Access Unit structure */
-typedef struct TagAccessUnits{
-	PNalUnit		*pNalUnitsList;	// list of NAL Units pointer in this AU
-	uint32_t		uiAvailUnitsNum;	// Number of NAL Units available in each AU list based current bitstream,
-	uint32_t		uiActualUnitsNum;	// actual number of NAL units belong to current au
-	// While available number exceeds count size below, need realloc extra NAL Units for list space.
-	uint32_t		uiCountUnitsNum;	// Count size number of malloced NAL Units in each AU list
-	uint32_t		uiStartPos;
-	uint32_t		uiEndPos;
-	bool_t			bCompletedAuFlag;	// Indicate whether it is a completed AU
-}SAccessUnit, *PAccessUnit;
+typedef struct TagAccessUnits {
+PNalUnit*		pNalUnitsList;	// list of pointers to the NAL Units in this AU
+uint32_t		uiAvailUnitsNum;	// number of NAL Units available in each AU list, based on the current bitstream
+uint32_t		uiActualUnitsNum;	// actual number of NAL Units belonging to the current AU
+// When the available number exceeds the count size below, extra NAL Units need to be reallocated for list space.
+uint32_t		uiCountUnitsNum;	// count of malloced NAL Units in each AU list
+uint32_t		uiStartPos;
+uint32_t		uiEndPos;
+bool_t			bCompletedAuFlag;	// indicates whether this is a completed AU
+} SAccessUnit, *PAccessUnit;
 
 } // namespace WelsDec
 
--- a/codec/decoder/core/inc/parameter_sets.h
+++ b/codec/decoder/core/inc/parameter_sets.h
@@ -42,54 +42,54 @@
 //#pragma pack(1)
 
 /* Sequence Parameter Set, refer to Page 57 in JVT X201wcm */
-typedef struct TagSps{
-	int32_t	    iSpsId;
-	uint32_t	iMbWidth;
-	uint32_t	iMbHeight;
-	uint32_t	uiTotalMbCount;	//used in decode_slice_data()
-	
-	uint32_t	uiLog2MaxFrameNum;
-	uint32_t	uiPocType;
-	/* POC type 0 */
-	int32_t		iLog2MaxPocLsb;
-	/* POC type 1 */
-	int32_t		iOffsetForNonRefPic;
+typedef struct TagSps {
+int32_t	    iSpsId;
+uint32_t	iMbWidth;
+uint32_t	iMbHeight;
+uint32_t	uiTotalMbCount;	//used in decode_slice_data()
 
-	int32_t		iOffsetForTopToBottomField;
-	int32_t		iNumRefFramesInPocCycle;
-	int8_t		iOffsetForRefFrame[256];
-	int32_t		iNumRefFrames;
-	
-	SPosOffset	sFrameCrop;
-	
-	ProfileIdc	uiProfileIdc;
-	uint8_t		uiLevelIdc;
-	uint8_t		uiChromaFormatIdc;
-	uint8_t		uiChromaArrayType;
-	
-	uint8_t		uiBitDepthLuma;
-	uint8_t		uiBitDepthChroma;
-	/* TO BE CONTINUE: POC type 1 */
-	bool_t		bDeltaPicOrderAlwaysZeroFlag;	
-	bool_t		bGapsInFrameNumValueAllowedFlag;
+uint32_t	uiLog2MaxFrameNum;
+uint32_t	uiPocType;
+/* POC type 0 */
+int32_t		iLog2MaxPocLsb;
+/* POC type 1 */
+int32_t		iOffsetForNonRefPic;
 
-	bool_t		bFrameMbsOnlyFlag;
-	bool_t		bMbaffFlag;	// MB Adapative Frame Field
-	bool_t		bDirect8x8InferenceFlag;
-	bool_t		bFrameCroppingFlag;
+int32_t		iOffsetForTopToBottomField;
+int32_t		iNumRefFramesInPocCycle;
+int8_t		iOffsetForRefFrame[256];
+int32_t		iNumRefFrames;
 
-	bool_t		bVuiParamPresentFlag;
+SPosOffset	sFrameCrop;
+
+ProfileIdc	uiProfileIdc;
+uint8_t		uiLevelIdc;
+uint8_t		uiChromaFormatIdc;
+uint8_t		uiChromaArrayType;
+
+uint8_t		uiBitDepthLuma;
+uint8_t		uiBitDepthChroma;
+/* TO BE CONTINUED: POC type 1 */
+bool_t		bDeltaPicOrderAlwaysZeroFlag;
+bool_t		bGapsInFrameNumValueAllowedFlag;
+
+bool_t		bFrameMbsOnlyFlag;
+bool_t		bMbaffFlag;	// MB Adaptive Frame Field
+bool_t		bDirect8x8InferenceFlag;
+bool_t		bFrameCroppingFlag;
+
+bool_t		bVuiParamPresentFlag;
 //	bool_t		bTimingInfoPresentFlag;
 //	bool_t		bFixedFrameRateFlag;
-	bool_t		bConstraintSet0Flag;
-	bool_t		bConstraintSet1Flag;
-	bool_t		bConstraintSet2Flag;
-	bool_t		bConstraintSet3Flag;
-	bool_t		bSeparateColorPlaneFlag;
-	bool_t		bQpPrimeYZeroTransfBypassFlag;
-	bool_t		bSeqScalingMatrixPresentFlag;
-	bool_t		bSeqScalingListPresentFlag[12];	
-}SSps, *PSps;
+bool_t		bConstraintSet0Flag;
+bool_t		bConstraintSet1Flag;
+bool_t		bConstraintSet2Flag;
+bool_t		bConstraintSet3Flag;
+bool_t		bSeparateColorPlaneFlag;
+bool_t		bQpPrimeYZeroTransfBypassFlag;
+bool_t		bSeqScalingMatrixPresentFlag;
+bool_t		bSeqScalingListPresentFlag[12];
+} SSps, *PSps;
 
 
 /* Sequence Parameter Set extension syntax, refer to Page 58 in JVT X201wcm */
@@ -98,7 +98,7 @@
 //	uint32_t	uiAuxFormatIdc;
 //	int32_t		iAlphaOpaqueValue;
 //	int32_t		iAlphaTransparentValue;
-	
+
 //	uint8_t		uiBitDepthAux;
 //	bool_t		bAlphaIncrFlag;
 //	bool_t		bAdditionalExtFlag;
@@ -105,65 +105,65 @@
 //}SSpsExt, *PSpsExt;
 
 /* Sequence Parameter Set extension syntax, refer to Page 391 in JVT X201wcm */
-typedef struct TagSpsSvcExt{
-	SPosOffset	sSeqScaledRefLayer;
-	
-	uint8_t		uiExtendedSpatialScalability;	// ESS
-	uint8_t		uiChromaPhaseXPlus1Flag;
-	uint8_t		uiChromaPhaseYPlus1;
-	uint8_t		uiSeqRefLayerChromaPhaseXPlus1Flag;
-	uint8_t		uiSeqRefLayerChromaPhaseYPlus1;
-	bool_t		bInterLayerDeblockingFilterCtrlPresentFlag;
-	bool_t		bSeqTCoeffLevelPredFlag;
-	bool_t		bAdaptiveTCoeffLevelPredFlag;
-	bool_t		bSliceHeaderRestrictionFlag;	
-}SSpsSvcExt, *PSpsSvcExt;
+typedef struct TagSpsSvcExt {
+SPosOffset	sSeqScaledRefLayer;
 
+uint8_t		uiExtendedSpatialScalability;	// ESS
+uint8_t		uiChromaPhaseXPlus1Flag;
+uint8_t		uiChromaPhaseYPlus1;
+uint8_t		uiSeqRefLayerChromaPhaseXPlus1Flag;
+uint8_t		uiSeqRefLayerChromaPhaseYPlus1;
+bool_t		bInterLayerDeblockingFilterCtrlPresentFlag;
+bool_t		bSeqTCoeffLevelPredFlag;
+bool_t		bAdaptiveTCoeffLevelPredFlag;
+bool_t		bSliceHeaderRestrictionFlag;
+} SSpsSvcExt, *PSpsSvcExt;
+
 /* Subset sequence parameter set syntax, refer to Page 391 in JVT X201wcm */
-typedef struct TagSubsetSps{	
-	SSps		sSps;
-	SSpsSvcExt	sSpsSvcExt;
-	bool_t		bSvcVuiParamPresentFlag;	
-	bool_t		bAdditionalExtension2Flag;
-	bool_t		bAdditionalExtension2DataFlag;
-}SSubsetSps, *PSubsetSps;
+typedef struct TagSubsetSps {
+SSps		sSps;
+SSpsSvcExt	sSpsSvcExt;
+bool_t		bSvcVuiParamPresentFlag;
+bool_t		bAdditionalExtension2Flag;
+bool_t		bAdditionalExtension2DataFlag;
+} SSubsetSps, *PSubsetSps;
 
 /* Picture parameter set syntax, refer to Page 59 in JVT X201wcm */
-typedef struct TagPps{
-	int32_t	iSpsId;
-	int32_t	iPpsId;
-	
-	uint32_t	uiNumSliceGroups;
-	uint32_t	uiSliceGroupMapType;
-	/* slice_group_map_type = 0 */
-	uint32_t	uiRunLength[MAX_SLICEGROUP_IDS];
-	/* slice_group_map_type = 2 */
-	uint32_t	uiTopLeft[MAX_SLICEGROUP_IDS];
-	uint32_t	uiBottomRight[MAX_SLICEGROUP_IDS];
-	/* slice_group_map_type = 3, 4 or 5 */
-	uint32_t	uiSliceGroupChangeRate;
-	/* slice_group_map_type = 6 */
-	uint32_t	uiPicSizeInMapUnits;
-	uint32_t	uiSliceGroupId[MAX_SLICEGROUP_IDS];
-	
-	uint32_t	uiNumRefIdxL0Active;
-	uint32_t	uiNumRefIdxL1Active;
-	
-	int32_t		iPicInitQp;
-	int32_t		iPicInitQs;
-	int32_t		iChromaQpIndexOffset;	
+typedef struct TagPps {
+int32_t	iSpsId;
+int32_t	iPpsId;
 
-	bool_t		bEntropyCodingModeFlag;
-	bool_t		bPicOrderPresentFlag;
-	/* slice_group_map_type = 3, 4 or 5 */
-	bool_t		bSliceGroupChangeDirectionFlag;
-	bool_t		bDeblockingFilterControlPresentFlag;
-	
-	bool_t		bConstainedIntraPredFlag;
-	bool_t		bRedundantPicCntPresentFlag;
-	bool_t		bWeightedPredFlag;
-	uint8_t		uiWeightedBipredIdc;
-	
+uint32_t	uiNumSliceGroups;
+uint32_t	uiSliceGroupMapType;
+/* slice_group_map_type = 0 */
+uint32_t	uiRunLength[MAX_SLICEGROUP_IDS];
+/* slice_group_map_type = 2 */
+uint32_t	uiTopLeft[MAX_SLICEGROUP_IDS];
+uint32_t	uiBottomRight[MAX_SLICEGROUP_IDS];
+/* slice_group_map_type = 3, 4 or 5 */
+uint32_t	uiSliceGroupChangeRate;
+/* slice_group_map_type = 6 */
+uint32_t	uiPicSizeInMapUnits;
+uint32_t	uiSliceGroupId[MAX_SLICEGROUP_IDS];
+
+uint32_t	uiNumRefIdxL0Active;
+uint32_t	uiNumRefIdxL1Active;
+
+int32_t		iPicInitQp;
+int32_t		iPicInitQs;
+int32_t		iChromaQpIndexOffset;
+
+bool_t		bEntropyCodingModeFlag;
+bool_t		bPicOrderPresentFlag;
+/* slice_group_map_type = 3, 4 or 5 */
+bool_t		bSliceGroupChangeDirectionFlag;
+bool_t		bDeblockingFilterControlPresentFlag;
+
+bool_t		bConstainedIntraPredFlag;
+bool_t		bRedundantPicCntPresentFlag;
+bool_t		bWeightedPredFlag;
+uint8_t		uiWeightedBipredIdc;
+
 } SPps, *PPps;
 
 //#pragma pack()
--- a/codec/decoder/core/inc/parse_mb_syn_cavlc.h
+++ b/codec/decoder/core/inc/parse_mb_syn_cavlc.h
@@ -38,7 +38,7 @@
  *************************************************************************************
  */
 
- 
+
 #ifndef WELS_PARSE_MB_SYN_CAVLC_H__
 #define WELS_PARSE_MB_SYN_CAVLC_H__
 
@@ -50,78 +50,77 @@
 namespace WelsDec {
 
 #define I16_LUMA_DC  1
-#define I16_LUMA_AC  2 
+#define I16_LUMA_AC  2
 #define LUMA_DC_AC   3
 #define CHROMA_DC    4
 #define CHROMA_AC    5
 
-typedef struct TagReadBitsCache
-{
-    uint32_t uiCache32Bit;
-    uint8_t  uiRemainBits;
-    uint8_t  *pBuf;
-}SReadBitsCache;
+typedef struct TagReadBitsCache {
+uint32_t uiCache32Bit;
+uint8_t  uiRemainBits;
+uint8_t*  pBuf;
+} SReadBitsCache;
 
 #define SHIFT_BUFFER(pBitsCache)	{	pBitsCache->pBuf+=2; pBitsCache->uiRemainBits += 16; pBitsCache->uiCache32Bit |= (((pBitsCache->pBuf[2] << 8) | pBitsCache->pBuf[3]) << (32 - pBitsCache->uiRemainBits));	}
 #define POP_BUFFER(pBitsCache, iCount)	{ pBitsCache->uiCache32Bit <<= iCount;	pBitsCache->uiRemainBits -= iCount;	}
 
-static const uint8_t g_kuiZigzagScan[16]={//4*4block residual zig-zag scan order
-	0,  1,  4,  8,	
-	5,  2,  3,  6,		
-	9, 12, 13, 10,	
-	7, 11, 14, 15,	
+static const uint8_t g_kuiZigzagScan[16] = { //4*4block residual zig-zag scan order
+0,  1,  4,  8,
+5,  2,  3,  6,
+9, 12, 13, 10,
+7, 11, 14, 15,
 };
 
 
-typedef struct TagI16PredInfo{
-	int8_t iPredMode;
-	int8_t iLeftAvail;
-	int8_t iTopAvail;
-	int8_t iLeftTopAvail;
+typedef struct TagI16PredInfo {
+int8_t iPredMode;
+int8_t iLeftAvail;
+int8_t iTopAvail;
+int8_t iLeftTopAvail;
 } SI16PredInfo;
 static const SI16PredInfo g_ksI16PredInfo[4] = {
-	{I16_PRED_V, 0, 1, 0},
-	{I16_PRED_H, 1, 0, 0},
-	{         0, 0, 0, 0},
-	{I16_PRED_P, 1, 1, 1},
+{I16_PRED_V, 0, 1, 0},
+{I16_PRED_H, 1, 0, 0},
+{         0, 0, 0, 0},
+{I16_PRED_P, 1, 1, 1},
 };
 
 static const SI16PredInfo g_ksChromaPredInfo[4] = {
-	{       0, 0, 0, 0},
-	{C_PRED_H, 1, 0, 0},
-	{C_PRED_V, 0, 1, 0},
-	{C_PRED_P, 1, 1, 1},
+{       0, 0, 0, 0},
+{C_PRED_H, 1, 0, 0},
+{C_PRED_V, 0, 1, 0},
+{C_PRED_P, 1, 1, 1},
 };
 
 
 typedef struct TagI4PredInfo {
-	int8_t iPredMode;
-	int8_t iLeftAvail;
-	int8_t iTopAvail;
-	int8_t iLeftTopAvail;
+int8_t iPredMode;
+int8_t iLeftAvail;
+int8_t iTopAvail;
+int8_t iLeftTopAvail;
 //	int8_t right_top_avail; //when right_top unavailable but top avail, we can pad the right-top with the rightmost pixel of top
 } SI4PredInfo;
 static const SI4PredInfo g_ksI4PredInfo[9] = {
-	{  I4_PRED_V, 0, 1, 0},
-	{  I4_PRED_H, 1, 0, 0},
-	{          0, 0, 0, 0},
-	{I4_PRED_DDL, 0, 1, 0},
-	{I4_PRED_DDR, 1, 1, 1},
-	{ I4_PRED_VR, 1, 1, 1},
-	{ I4_PRED_HD, 1, 1, 1},
-	{ I4_PRED_VL, 0, 1, 0},
-	{ I4_PRED_HU, 1, 0, 0},
+{  I4_PRED_V, 0, 1, 0},
+{  I4_PRED_H, 1, 0, 0},
+{          0, 0, 0, 0},
+{I4_PRED_DDL, 0, 1, 0},
+{I4_PRED_DDR, 1, 1, 1},
+{ I4_PRED_VR, 1, 1, 1},
+{ I4_PRED_HD, 1, 1, 1},
+{ I4_PRED_VL, 0, 1, 0},
+{ I4_PRED_HU, 1, 0, 0},
 };
 
 static const uint8_t g_kuiI16CbpTable[6] = {0, 16, 32, 15, 31, 47}; //reference to JM
 
 
-typedef struct TagPartMbInfo{
-    MbType iType;
-    int8_t iPartCount; //P_16*16, P_16*8, P_8*16, P_8*8 based on 8*8 block; P_8*4, P_4*8, P_4*4 based on 4*4 block
-	int8_t iPartWidth; //based on 4*4 block
-} SPartMbInfo; 
-static const SPartMbInfo g_ksInterMbTypeInfo[5]={
+typedef struct TagPartMbInfo {
+MbType iType;
+int8_t iPartCount; //P_16*16, P_16*8, P_8*16, P_8*8 based on 8*8 block; P_8*4, P_4*8, P_4*4 based on 4*4 block
+int8_t iPartWidth; //based on 4*4 block
+} SPartMbInfo;
+static const SPartMbInfo g_ksInterMbTypeInfo[5] = {
 {MB_TYPE_16x16,    1, 4},
 {MB_TYPE_16x8,     2, 4},
 {MB_TYPE_8x16,     2, 2},
@@ -128,7 +127,7 @@
 {MB_TYPE_8x8,      4, 4},
 {MB_TYPE_8x8_REF0, 4, 4}, //ref0--ref_idx not present in bit-stream and default as 0
 };
-static const SPartMbInfo g_ksInterSubMbTypeInfo[4]={
+static const SPartMbInfo g_ksInterSubMbTypeInfo[4] = {
 {SUB_MB_TYPE_8x8, 1, 2},
 {SUB_MB_TYPE_8x4, 2, 2},
 {SUB_MB_TYPE_4x8, 2, 1},
@@ -135,14 +134,16 @@
 {SUB_MB_TYPE_4x4, 4, 1},
 };
 
-void_t GetNeighborAvailMbType         (PNeighAvail pNeighAvail, PDqLayer pCurLayer);
-void_t WelsFillCacheNonZeroCount      (PNeighAvail pNeighAvail, uint8_t* pNonZeroCount, PDqLayer pCurLayer);
-void_t WelsFillCacheConstrain0Intra4x4(PNeighAvail pNeighAvail, uint8_t* pNonZeroCount, int8_t* pIntraPredMode, PDqLayer pCurLayer);
-void_t WelsFillCacheConstrain1Intra4x4(PNeighAvail pNeighAvail, uint8_t* pNonZeroCount, int8_t* pIntraPredMode, PDqLayer pCurLayer);
-void_t WelsFillCacheInter             (PNeighAvail pNeighAvail, uint8_t* pNonZeroCount, 
-						              int16_t iMvArray[LIST_A][30][MV_A], int8_t iRefIdxArray[LIST_A][30], PDqLayer pCurLayer);
+void_t GetNeighborAvailMbType (PNeighAvail pNeighAvail, PDqLayer pCurLayer);
+void_t WelsFillCacheNonZeroCount (PNeighAvail pNeighAvail, uint8_t* pNonZeroCount, PDqLayer pCurLayer);
+void_t WelsFillCacheConstrain0Intra4x4 (PNeighAvail pNeighAvail, uint8_t* pNonZeroCount, int8_t* pIntraPredMode,
+                                        PDqLayer pCurLayer);
+void_t WelsFillCacheConstrain1Intra4x4 (PNeighAvail pNeighAvail, uint8_t* pNonZeroCount, int8_t* pIntraPredMode,
+                                        PDqLayer pCurLayer);
+void_t WelsFillCacheInter (PNeighAvail pNeighAvail, uint8_t* pNonZeroCount,
+                           int16_t iMvArray[LIST_A][30][MV_A], int8_t iRefIdxArray[LIST_A][30], PDqLayer pCurLayer);
 
-void_t PredPSkipMvFromNeighbor       (PDqLayer pCurLayer, int16_t iMvp[2]);
+void_t PredPSkipMvFromNeighbor (PDqLayer pCurLayer, int16_t iMvp[2]);
 
 /*!
  * \brief   check iPredMode for intra16x16 eligible or not
@@ -149,7 +150,7 @@
  * \param 	input : current iPredMode
  * \param 	output: 0 indicating decoding correctly; -1 means error occurence
  */
- int32_t CheckIntra16x16PredMode(uint8_t uiSampleAvail, int8_t* pMode);
+int32_t CheckIntra16x16PredMode (uint8_t uiSampleAvail, int8_t* pMode);
 
 /*!
  * \brief   check iPredMode for intra4x4 eligible or not
@@ -156,7 +157,7 @@
  * \param 	input : current iPredMode
  * \param 	output: 0 indicating decoding correctly; -1 means error occurence
  */
- int32_t CheckIntra4x4PredMode(int32_t* pSampleAvail, int8_t* pMode, int32_t iIndex);
+int32_t CheckIntra4x4PredMode (int32_t* pSampleAvail, int8_t* pMode, int32_t iIndex);
 
 /*!
  * \brief   check iPredMode for chroma eligible or not
@@ -163,7 +164,7 @@
  * \param 	input : current iPredMode
  * \param 	output: 0 indicating decoding correctly; -1 means error occurence
  */
- int32_t CheckIntraChromaPredMode(uint8_t uiSampleAvail, int8_t* pMode);
+int32_t CheckIntraChromaPredMode (uint8_t uiSampleAvail, int8_t* pMode);
 
 /*!
  * \brief   predict the mode of intra4x4
@@ -170,42 +171,45 @@
  * \param 	input : current intra4x4 block index
  * \param 	output: mode index
  */
-int32_t PredIntra4x4Mode(int8_t* pIntraPredMode, int32_t iIdx4);
+int32_t PredIntra4x4Mode (int8_t* pIntraPredMode, int32_t iIdx4);
 
 
-void_t BsStartCavlc( PBitStringAux pBs );
-void_t BsEndCavlc( PBitStringAux pBs );
+void_t BsStartCavlc (PBitStringAux pBs);
+void_t BsEndCavlc (PBitStringAux pBs);
 
-int32_t WelsResidualBlockCavlc(	SVlcTable* pVlcTable,
-										uint8_t* pNonZeroCountCache,
-										PBitStringAux pBs,
-										/*int16_t* coeff_level,*/
-										int32_t iIndex,
-										int32_t iMaxNumCoeff,
-										const uint8_t *kpZigzagTable,
-										int32_t iResidualProperty,
-										/*short *tCoeffLevel,*/
-										int16_t *pTCoeff,
-										int32_t iMbMode,
-										uint8_t uiQp,
-										PWelsDecoderContext pCtx);
+int32_t WelsResidualBlockCavlc (SVlcTable* pVlcTable,
+                                uint8_t* pNonZeroCountCache,
+                                PBitStringAux pBs,
+                                /*int16_t* coeff_level,*/
+                                int32_t iIndex,
+                                int32_t iMaxNumCoeff,
+                                const uint8_t* kpZigzagTable,
+                                int32_t iResidualProperty,
+                                /*short *tCoeffLevel,*/
+                                int16_t* pTCoeff,
+                                int32_t iMbMode,
+                                uint8_t uiQp,
+                                PWelsDecoderContext pCtx);
 
 /*!
- * \brief   parsing intra mode 
+ * \brief   parsing intra mode
  * \param 	input : current mb, bit-stream
  * \param 	output: 0 indicating decoding correctly; -1 means error
  */
-int32_t ParseIntra4x4ModeConstrain0  (PNeighAvail pNeighAvail, int8_t* pIntraPredMode, PBitStringAux pBs, PDqLayer pCurDqLayer);
-int32_t ParseIntra4x4ModeConstrain1  (PNeighAvail pNeighAvail, int8_t* pIntraPredMode, PBitStringAux pBs, PDqLayer pCurDqLayer);
-int32_t ParseIntra16x16ModeConstrain0(PNeighAvail pNeighAvail, PBitStringAux pBs, PDqLayer pCurDqLayer);
-int32_t ParseIntra16x16ModeConstrain1(PNeighAvail pNeighAvail, PBitStringAux pBs, PDqLayer pCurDqLayer);
+int32_t ParseIntra4x4ModeConstrain0 (PNeighAvail pNeighAvail, int8_t* pIntraPredMode, PBitStringAux pBs,
+                                     PDqLayer pCurDqLayer);
+int32_t ParseIntra4x4ModeConstrain1 (PNeighAvail pNeighAvail, int8_t* pIntraPredMode, PBitStringAux pBs,
+                                     PDqLayer pCurDqLayer);
+int32_t ParseIntra16x16ModeConstrain0 (PNeighAvail pNeighAvail, PBitStringAux pBs, PDqLayer pCurDqLayer);
+int32_t ParseIntra16x16ModeConstrain1 (PNeighAvail pNeighAvail, PBitStringAux pBs, PDqLayer pCurDqLayer);
 
 /*!
- * \brief   parsing inter info (including ref_index and mvd) 
+ * \brief   parsing inter info (including ref_index and mvd)
  * \param 	input : decoding context, current mb, bit-stream
  * \param 	output: 0 indicating decoding correctly; -1 means error
  */
-int32_t ParseInterInfo(PWelsDecoderContext pCtx, int16_t iMvArray[LIST_A][30][MV_A], int8_t iRefIdxArray[LIST_A][30], PBitStringAux pBs);
+int32_t ParseInterInfo (PWelsDecoderContext pCtx, int16_t iMvArray[LIST_A][30][MV_A], int8_t iRefIdxArray[LIST_A][30],
+                        PBitStringAux pBs);
 
 //#pragma pack()
 
--- a/codec/decoder/core/inc/pic_queue.h
+++ b/codec/decoder/core/inc/pic_queue.h
@@ -44,17 +44,17 @@
 #define   PICTURE_RESOLUTION_ALIGNMENT      32
 
 
-typedef struct TagPicBuff{
-	PPicture*      ppPic;   
-	int32_t        iCapacity;  // capacity size of queue
-	int32_t        iCurrentIdx;
-}SPicBuff, *PPicBuff;
+typedef struct TagPicBuff {
+PPicture*      ppPic;
+int32_t        iCapacity;  // capacity size of queue
+int32_t        iCurrentIdx;
+} SPicBuff, *PPicBuff;
 
 /*
  *	Interfaces
  */
 
-PPicture PrefetchPic( PPicBuff pPicBuff ); // To get current node applicable
+PPicture PrefetchPic (PPicBuff pPicBuff);  // To get current node applicable
 
 } // namespace WelsDec
 
--- a/codec/decoder/core/inc/picture.h
+++ b/codec/decoder/core/inc/picture.h
@@ -44,41 +44,41 @@
  *	Reconstructed Picture definition
  *	It is used to express reference picture, also consequent reconstruction picture for output
  */
-typedef struct TagPicture{
-	/************************************payload data*********************************/
-	uint8_t		*pBuffer[4];		// pointer to the first allocated byte, basical offset of buffer, dimension:
-	uint8_t		*pData[4];		// pointer to picture planes respectively
-	int32_t		iLinesize[4];// linesize of picture planes respectively used currently
-	int32_t		iPlanes;			// How many planes are introduced due to color space format?
-	// picture information
-	
-	/*******************************from other standard syntax****************************/
-	/*from sps*/
-	int32_t		iWidthInPixel;	// picture width in pixel
-	int32_t		iHeightInPixel;// picture height in pixel
-	/*from slice header*/
-	int32_t		iFramePoc;		// frame POC
+typedef struct TagPicture {
+/************************************payload data*********************************/
+uint8_t*		pBuffer[4];		// pointer to the first allocated byte, basical offset of buffer, dimension:
+uint8_t*		pData[4];		// pointer to picture planes respectively
+int32_t		iLinesize[4];// linesize of picture planes respectively used currently
+int32_t		iPlanes;			// How many planes are introduced due to color space format?
+// picture information
 
-	/*******************************sef_definition for misc use****************************/
-	bool_t		bUsedAsRef;							//for ref pic management
-	bool_t		bIsLongRef;	// long term reference frame flag	//for ref pic management
-	uint8_t		uiRefCount;
-	bool_t		bAvailableFlag;	// indicate whether it is available in this picture memory block.
+/*******************************from other standard syntax****************************/
+/*from sps*/
+int32_t		iWidthInPixel;	// picture width in pixel
+int32_t		iHeightInPixel;// picture height in pixel
+/*from slice header*/
+int32_t		iFramePoc;		// frame POC
 
-	/*******************************for future use****************************/
-	uint8_t		uiTemporalId;
-	uint8_t		uiSpatialId;
-	uint8_t		uiQualityId;
-	bool_t		bRefBaseFlag;
-	
-	int32_t		iFrameNum;		// frame number			//for ref pic management
-	int32_t		iLongTermFrameIdx;					//id for long term ref pic
+/*******************************self-defined fields for misc use****************************/
+bool_t		bUsedAsRef;							//for ref pic management
+bool_t		bIsLongRef;	// long term reference frame flag	//for ref pic management
+uint8_t		uiRefCount;
+bool_t		bAvailableFlag;	// indicate whether it is available in this picture memory block.
 
-	int32_t     iTotalNumMbRec; //show how many MB constructed
+/*******************************for future use****************************/
+uint8_t		uiTemporalId;
+uint8_t		uiSpatialId;
+uint8_t		uiQualityId;
+bool_t		bRefBaseFlag;
 
-	int32_t     iSpsId; //against mosaic caused by cross-IDR interval reference.
-	int32_t     iPpsId;
-}SPicture, *PPicture;	// "Picture" declaration is comflict with Mac system
+int32_t		iFrameNum;		// frame number			//for ref pic management
+int32_t		iLongTermFrameIdx;					//id for long term ref pic
+
+int32_t     iTotalNumMbRec; //show how many MB constructed
+
+int32_t     iSpsId; //against mosaic caused by cross-IDR interval reference.
+int32_t     iPpsId;
+} SPicture, *PPicture;	// "Picture" declaration conflicts with the Mac system
 
 } // namespace WelsDec
 
--- a/codec/decoder/core/inc/rec_mb.h
+++ b/codec/decoder/core/inc/rec_mb.h
@@ -51,22 +51,22 @@
 
 namespace WelsDec {
 
-void_t WelsFillRecNeededMbInfo(PWelsDecoderContext pCtx, bool_t bOutput, PDqLayer pCurLayer);
+void_t WelsFillRecNeededMbInfo (PWelsDecoderContext pCtx, bool_t bOutput, PDqLayer pCurLayer);
 
-int32_t RecI4x4Mb    (int32_t iMBXY, PWelsDecoderContext pCtx, int16_t *pScoeffLevel, PDqLayer pDqLayer);
+int32_t RecI4x4Mb (int32_t iMBXY, PWelsDecoderContext pCtx, int16_t* pScoeffLevel, PDqLayer pDqLayer);
 
-int32_t RecI4x4Luma  (int32_t iMBXY, PWelsDecoderContext pCtx, int16_t *pScoeffLevel, PDqLayer pDqLayer);
+int32_t RecI4x4Luma (int32_t iMBXY, PWelsDecoderContext pCtx, int16_t* pScoeffLevel, PDqLayer pDqLayer);
 
-int32_t RecI4x4Chroma(int32_t iMBXY, PWelsDecoderContext pCtx, int16_t *pScoeffLevel, PDqLayer pDqLayer);
+int32_t RecI4x4Chroma (int32_t iMBXY, PWelsDecoderContext pCtx, int16_t* pScoeffLevel, PDqLayer pDqLayer);
 
-int32_t RecI16x16Mb  (int32_t iMBXY, PWelsDecoderContext pCtx, int16_t *pScoeffLevel, PDqLayer pDqLayer);
+int32_t RecI16x16Mb (int32_t iMBXY, PWelsDecoderContext pCtx, int16_t* pScoeffLevel, PDqLayer pDqLayer);
 
-int32_t RecChroma    (int32_t iMBXY, PWelsDecoderContext pCtx, int16_t *pScoeffLevel, PDqLayer pDqLayer);
+int32_t RecChroma (int32_t iMBXY, PWelsDecoderContext pCtx, int16_t* pScoeffLevel, PDqLayer pDqLayer);
 
-void_t GetInterPred (uint8_t *pPredY, uint8_t *pPredCb, uint8_t *pPredCr, PWelsDecoderContext pCtx);
+void_t GetInterPred (uint8_t* pPredY, uint8_t* pPredCb, uint8_t* pPredCr, PWelsDecoderContext pCtx);
 
-void_t FillBufForMc(uint8_t *pBuf, int32_t iBufStride, uint8_t *pSrc, int32_t iSrcStride, int32_t iSrcOffset, 
-					 int32_t iBlockWidth, int32_t iBlockHeight, int32_t iSrcX, int32_t iSrcY, int32_t iPicWidth, int32_t iPicHeight);
+void_t FillBufForMc (uint8_t* pBuf, int32_t iBufStride, uint8_t* pSrc, int32_t iSrcStride, int32_t iSrcOffset,
+                     int32_t iBlockWidth, int32_t iBlockHeight, int32_t iSrcX, int32_t iSrcY, int32_t iPicWidth, int32_t iPicHeight);
 
 } // namespace WelsDec
 
--- a/codec/decoder/core/inc/slice.h
+++ b/codec/decoder/core/inc/slice.h
@@ -48,158 +48,158 @@
  *	Reference picture list reordering syntax, refer to page 64 in JVT X201wcm
  */
 typedef struct TagRefPicListReorderSyntax {
-	struct {
-		uint32_t    uiAbsDiffPicNumMinus1;
-		uint16_t    uiLongTermPicNum;
-		uint16_t    uiReorderingOfPicNumsIdc;
-	} sReorderingSyn[LIST_A][MAX_REF_PIC_COUNT];
-	bool_t		bRefPicListReorderingFlag[LIST_A];
-}SRefPicListReorderSyn, *PRefPicListReorderSyn;
+  struct {
+    uint32_t    uiAbsDiffPicNumMinus1;
+    uint16_t    uiLongTermPicNum;
+    uint16_t    uiReorderingOfPicNumsIdc;
+  } sReorderingSyn[LIST_A][MAX_REF_PIC_COUNT];
+  bool_t		bRefPicListReorderingFlag[LIST_A];
+} SRefPicListReorderSyn, *PRefPicListReorderSyn;
 
 /*
  *	Prediction weight table syntax, refer to page 65 in JVT X201wcm
  */
-typedef struct TagPredWeightTabSyntax{
-	uint32_t	uiLumaLog2WeightDenom;
-	uint32_t	uiChromaLog2WeightDenom;
-	struct{
-		int32_t	iLumaWeight[MAX_REF_PIC_COUNT];
-		int32_t iLumaOffset[MAX_REF_PIC_COUNT];
-		int32_t	iChromaWeight[MAX_REF_PIC_COUNT][2];
-		int32_t iChromaOffset[MAX_REF_PIC_COUNT][2];
-		bool_t	bLumaWeightFlag;
-		bool_t	bChromaWeightFlag;		
-	}sPredList[LIST_A];
-}SPredWeightTabSyn;
+typedef struct TagPredWeightTabSyntax {
+  uint32_t	uiLumaLog2WeightDenom;
+  uint32_t	uiChromaLog2WeightDenom;
+  struct {
+    int32_t	iLumaWeight[MAX_REF_PIC_COUNT];
+    int32_t iLumaOffset[MAX_REF_PIC_COUNT];
+    int32_t	iChromaWeight[MAX_REF_PIC_COUNT][2];
+    int32_t iChromaOffset[MAX_REF_PIC_COUNT][2];
+    bool_t	bLumaWeightFlag;
+    bool_t	bChromaWeightFlag;
+  } sPredList[LIST_A];
+} SPredWeightTabSyn;
 
 /* Decoded reference picture marking syntax, refer to Page 66 in JVT X201wcm */
 typedef struct TagRefPicMarking {
-	struct {
-		uint32_t    uiMmcoType;
-		int32_t     iShortFrameNum;
-		int32_t	    iDiffOfPicNum;
-		uint32_t    uiLongTermPicNum;
-		int32_t	    iLongTermFrameIdx;
-		int32_t	    iMaxLongTermFrameIdx;
-	} sMmcoRef[MAX_MMCO_COUNT];
+  struct {
+    uint32_t    uiMmcoType;
+    int32_t     iShortFrameNum;
+    int32_t	    iDiffOfPicNum;
+    uint32_t    uiLongTermPicNum;
+    int32_t	    iLongTermFrameIdx;
+    int32_t	    iMaxLongTermFrameIdx;
+  } sMmcoRef[MAX_MMCO_COUNT];
 
-    bool_t		bNoOutputOfPriorPicsFlag;
-	bool_t		bLongTermRefFlag;
-	bool_t		bAdaptiveRefPicMarkingModeFlag;	
+  bool_t		bNoOutputOfPriorPicsFlag;
+  bool_t		bLongTermRefFlag;
+  bool_t		bAdaptiveRefPicMarkingModeFlag;
 } SRefPicMarking, *PRefPicMarking;
 
 /* Decode reference base picture marking syntax in Page 396 of JVT X201wcm */
 typedef struct TagRefBasePicMarkingSyn {
-	struct {
-		uint32_t	uiMmcoType;
-		int32_t	    iShortFrameNum;
-		uint32_t	uiDiffOfPicNums;
-		uint32_t	uiLongTermPicNum; //should uint32_t, cover larger range of iFrameNum.
-	} mmco_base[MAX_MMCO_COUNT];	// MAX_REF_PIC for reference picture based on frame
+  struct {
+    uint32_t	uiMmcoType;
+    int32_t	    iShortFrameNum;
+    uint32_t	uiDiffOfPicNums;
+    uint32_t	uiLongTermPicNum; //should uint32_t, cover larger range of iFrameNum.
+  } mmco_base[MAX_MMCO_COUNT];	// MAX_REF_PIC for reference picture based on frame
 
-    bool_t		bAdaptiveRefBasePicMarkingModeFlag;
+  bool_t		bAdaptiveRefBasePicMarkingModeFlag;
 } SRefBasePicMarking, *PRefBasePicMarking;
 
 /* Header of slice syntax elements, refer to Page 63 in JVT X201wcm */
-typedef struct TagSliceHeaders{	
-	/*****************************slice header syntax and generated****************************/
-	int32_t		iFirstMbInSlice;		
-	int32_t		iFrameNum;
-	int32_t		iPicOrderCntLsb;
-	int32_t		iDeltaPicOrderCntBottom;
-	int32_t		iDeltaPicOrderCnt[2];
-	int32_t		iRedundantPicCnt;
-	int32_t		uiRefCount[LIST_A];
-	int32_t		iSliceQpDelta;	//no use for iSliceQp is used directly
-	int32_t		iSliceQp;	
-	int32_t		iSliceQsDelta;	// For SP/SI slices
-	uint32_t	uiDisableDeblockingFilterIdc;
-	int32_t		iSliceAlphaC0Offset;
-	int32_t		iSliceBetaOffset;
-	int32_t		iSliceGroupChangeCycle;
+typedef struct TagSliceHeaders {
+  /*****************************slice header syntax and generated****************************/
+  int32_t		iFirstMbInSlice;
+  int32_t		iFrameNum;
+  int32_t		iPicOrderCntLsb;
+  int32_t		iDeltaPicOrderCntBottom;
+  int32_t		iDeltaPicOrderCnt[2];
+  int32_t		iRedundantPicCnt;
+  int32_t		uiRefCount[LIST_A];
+  int32_t		iSliceQpDelta;	//no use for iSliceQp is used directly
+  int32_t		iSliceQp;
+  int32_t		iSliceQsDelta;	// For SP/SI slices
+  uint32_t	uiDisableDeblockingFilterIdc;
+  int32_t		iSliceAlphaC0Offset;
+  int32_t		iSliceBetaOffset;
+  int32_t		iSliceGroupChangeCycle;
 
-	PSps		pSps;
-	PPps		pPps;
-	int32_t	    iSpsId;
-	int32_t	    iPpsId;
+  PSps		pSps;
+  PPps		pPps;
+  int32_t	    iSpsId;
+  int32_t	    iPpsId;
 
-	/*********************got from other layer for efficency if possible*********************/
-	SRefPicListReorderSyn	pRefPicListReordering;	// Reference picture list reordering syntaxs
-	SPredWeightTabSyn		sPredWeightTable;
-	int32_t		iCabacInitIdc;
-	int32_t		iMbWidth;	//from?
-	int32_t		iMbHeight; //from?
-	SRefPicMarking		sRefMarking;	// Decoded reference picture marking syntaxs
+  /*********************got from other layer for efficiency if possible*********************/
+  SRefPicListReorderSyn	pRefPicListReordering;	// Reference picture list reordering syntaxs
+  SPredWeightTabSyn		sPredWeightTable;
+  int32_t		iCabacInitIdc;
+  int32_t		iMbWidth;	//from?
+  int32_t		iMbHeight; //from?
+  SRefPicMarking		sRefMarking;	// Decoded reference picture marking syntaxs
 
-	uint16_t    uiIdrPicId;
-	ESliceType	eSliceType;
-	bool_t		bNumRefIdxActiveOverrideFlag;
-	bool_t		bFieldPicFlag;		//not supported in base profile
-	bool_t		bBottomFiledFlag;		//not supported in base profile
-	uint8_t		uiPadding1Byte;
-	bool_t		bSpForSwitchFlag;			// For SP/SI slices
-	int16_t		iPadding2Bytes;
-}SSliceHeader, *PSliceHeader;
+  uint16_t    uiIdrPicId;
+  ESliceType	eSliceType;
+  bool_t		bNumRefIdxActiveOverrideFlag;
+  bool_t		bFieldPicFlag;		//not supported in base profile
+  bool_t		bBottomFiledFlag;		//not supported in base profile
+  uint8_t		uiPadding1Byte;
+  bool_t		bSpForSwitchFlag;			// For SP/SI slices
+  int16_t		iPadding2Bytes;
+} SSliceHeader, *PSliceHeader;
 
 
 /* Slice header in scalable extension syntax, refer to Page 394 in JVT X201wcm */
-typedef struct TagSliceHeaderExt{	
-	SSliceHeader	sSliceHeader;
-	PSubsetSps	pSubsetSps;
-	
-	uint32_t	uiNumMbsInSlice;
-	uint32_t	uiDisableInterLayerDeblockingFilterIdc;
-	int32_t		iInterLayerSliceAlphaC0Offset;
-	int32_t		iInterLayerSliceBetaOffset;	
-	
-	//SPosOffset sScaledRefLayer;
-	int32_t		iScaledRefLayerPicWidthInSampleLuma;
-	int32_t		iScaledRefLayerPicHeightInSampleLuma;
+typedef struct TagSliceHeaderExt {
+  SSliceHeader	sSliceHeader;
+  PSubsetSps	pSubsetSps;
 
-	SRefBasePicMarking	sRefBasePicMarking;
-	bool_t		bBasePredWeightTableFlag;
-	bool_t		bStoreRefBasePicFlag;	
-	bool_t		bConstrainedIntraResamplingFlag;	
-	bool_t		bSliceSkipFlag;
-	
-	bool_t		bAdaptiveBaseModeFlag;
-	bool_t		bDefaultBaseModeFlag;
-	bool_t		bAdaptiveMotionPredFlag;
-	bool_t		bDefaultMotionPredFlag;
-	bool_t		bAdaptiveResidualPredFlag;
-	bool_t		bDefaultResidualPredFlag;
-	bool_t		bTCoeffLevelPredFlag;		
-	uint8_t		uiRefLayerChromaPhaseXPlus1Flag;
-	
-	uint8_t		uiRefLayerChromaPhaseYPlus1;
-	uint8_t		uiRefLayerDqId;
-	uint8_t		uiScanIdxStart;
-	uint8_t		uiScanIdxEnd;
-}SSliceHeaderExt, *PSliceHeaderExt;
+  uint32_t	uiNumMbsInSlice;
+  uint32_t	uiDisableInterLayerDeblockingFilterIdc;
+  int32_t		iInterLayerSliceAlphaC0Offset;
+  int32_t		iInterLayerSliceBetaOffset;
 
+  //SPosOffset sScaledRefLayer;
+  int32_t		iScaledRefLayerPicWidthInSampleLuma;
+  int32_t		iScaledRefLayerPicHeightInSampleLuma;
 
-typedef struct TagSlice{	
-	/*******************************slice_header****************************/
-	SSliceHeaderExt	sSliceHeaderExt;		
-	
-	/*******************************use for future****************************/
-	// for Macroblock coding within slice
-	int32_t		iLastMbQp;		// stored qp for last mb coded, maybe more efficient for mb skip detection etc.
+  SRefBasePicMarking	sRefBasePicMarking;
+  bool_t		bBasePredWeightTableFlag;
+  bool_t		bStoreRefBasePicFlag;
+  bool_t		bConstrainedIntraResamplingFlag;
+  bool_t		bSliceSkipFlag;
 
-	/*******************************slice_data****************************/
-	/*slice_data_ext()*/
-	int32_t		iMbSkipRun;
-	int32_t     iTotalMbInCurSlice; //record the total number of MB in current slice.
-	
-	/*slice_data_ext() generate*/
-		
-	/*******************************misc use****************************/
-	bool_t		bSliceHeaderExtFlag; // Indicate which slice header is used, avc or ext?
-	/*************got from other layer for effiency if possible***************/
-	/*from lower layer: slice header*/
-	uint8_t		eSliceType;	
-	uint8_t		uiPadding[2];	
-}SSlice, *PSlice;
+  bool_t		bAdaptiveBaseModeFlag;
+  bool_t		bDefaultBaseModeFlag;
+  bool_t		bAdaptiveMotionPredFlag;
+  bool_t		bDefaultMotionPredFlag;
+  bool_t		bAdaptiveResidualPredFlag;
+  bool_t		bDefaultResidualPredFlag;
+  bool_t		bTCoeffLevelPredFlag;
+  uint8_t		uiRefLayerChromaPhaseXPlus1Flag;
+
+  uint8_t		uiRefLayerChromaPhaseYPlus1;
+  uint8_t		uiRefLayerDqId;
+  uint8_t		uiScanIdxStart;
+  uint8_t		uiScanIdxEnd;
+} SSliceHeaderExt, *PSliceHeaderExt;
+
+
+typedef struct TagSlice {
+  /*******************************slice_header****************************/
+  SSliceHeaderExt	sSliceHeaderExt;
+
+  /*******************************reserved for future use****************************/
+  // for macroblock coding within the slice
+  int32_t		iLastMbQp;		// QP stored for the last coded MB; may make MB skip detection etc. more efficient
+
+  /*******************************slice_data****************************/
+  /*slice_data_ext()*/
+  int32_t		iMbSkipRun;
+  int32_t     iTotalMbInCurSlice; // records the total number of MBs in the current slice
+
+  /*slice_data_ext() generate*/
+
+  /*******************************misc use****************************/
+  bool_t		bSliceHeaderExtFlag; // indicates which slice header is used: AVC or ext
+  /*************obtained from another layer for efficiency if possible***************/
+  /*from lower layer: slice header*/
+  uint8_t		eSliceType;
+  uint8_t		uiPadding[2];
+} SSlice, *PSlice;
 
 } // namespace WelsDec
 
--- a/codec/decoder/core/inc/typedefs.h
+++ b/codec/decoder/core/inc/typedefs.h
@@ -47,7 +47,7 @@
 
 #else
 
-// FIXME:     all singed type should be declared explicit,  for example,  int8_t should be declared as signed char.  
+// FIXME:     all signed types should be declared explicitly, for example, int8_t should be declared as signed char.
 typedef signed char      int8_t  ;
 typedef unsigned char    uint8_t ;
 typedef short            int16_t ;
@@ -59,7 +59,7 @@
 
 #endif // _MSC_VER defined
 
-// FIXME:     all string type should be declared explicit as char. 
+// FIXME:     all string types should be declared explicitly as char.
 typedef char      str_t;
 typedef float     real32_t;
 
--- a/codec/decoder/core/inc/utils.h
+++ b/codec/decoder/core/inc/utils.h
@@ -59,16 +59,17 @@
  *	Function pointer declaration for various tool sets
  */
 // wels log output
-typedef void_t (*PWelsLogCallbackFunc)(void_t *pPtr, const int32_t kiLevel, const char *kpFmt, va_list pArgv);
+typedef void_t (*PWelsLogCallbackFunc) (void_t* pPtr, const int32_t kiLevel, const char* kpFmt, va_list pArgv);
 
 extern PWelsLogCallbackFunc	g_pLog;
 
 #ifdef __GNUC__
-extern void_t WelsLog(void_t *pPtr, int32_t iLevel, const char *kpFmt, ...) __attribute__ ((__format__ (__printf__, 3, 4)));
+extern void_t WelsLog (void_t* pPtr, int32_t iLevel, const char* kpFmt, ...) __attribute__ ((__format__ (__printf__, 3,
+    4)));
 #else
-extern void_t WelsLog(void_t *pPtr, int32_t iLevel, const char *kpFmt, ...);
+extern void_t WelsLog (void_t* pPtr, int32_t iLevel, const char* kpFmt, ...);
 #endif
-	
+
 #define DECODER_MODE_NAME(a) ((a == SW_MODE)?"SW_MODE":((a == GPU_MODE)?"GPU_MODE":((a == AUTO_MODE)?"AUTO_MODE":"SWITCH_MODE")))
 #define OUTPUT_PROPERTY_NAME(a) ((a == 0)?"system_memory":"video_memory")
 #define BUFFER_STATUS_NAME(a) ((a == 0)?"unvalid":"valid")
@@ -79,15 +80,15 @@
  */
 
 typedef int32_t	WelsLogLevel;
-enum{
-	WELS_LOG_QUIET		= 0x00,		// Quiet mode
-	WELS_LOG_ERROR		= 1 << 0,	// Error log level
-	WELS_LOG_WARNING	= 1 << 1,	// Warning log level
-	WELS_LOG_INFO		= 1 << 2,	// Information log level
-	WELS_LOG_DEBUG		= 1 << 3,	// Debug log level
-	WELS_LOG_RESV		= 1 << 4,	// Resversed log level
-	WELS_LOG_LEVEL_COUNT= 5,
-	WELS_LOG_DEFAULT	= WELS_LOG_ERROR | WELS_LOG_WARNING | WELS_LOG_INFO | WELS_LOG_DEBUG	// Default log level in Wels codec
+enum {
+  WELS_LOG_QUIET		= 0x00,		// Quiet mode
+  WELS_LOG_ERROR		= 1 << 0,	// Error log level
+  WELS_LOG_WARNING	= 1 << 1,	// Warning log level
+  WELS_LOG_INFO		= 1 << 2,	// Information log level
+  WELS_LOG_DEBUG		= 1 << 3,	// Debug log level
+  WELS_LOG_RESV		= 1 << 4,	// Reserved log level
+  WELS_LOG_LEVEL_COUNT = 5,	// Number of single-bit levels above (bits 0..4)
+  WELS_LOG_DEFAULT	= WELS_LOG_ERROR | WELS_LOG_WARNING | WELS_LOG_INFO | WELS_LOG_DEBUG	// Default log level in Wels codec
 };
 
 #ifdef __cplusplus
--- a/codec/decoder/core/inc/vlc_decoder.h
+++ b/codec/decoder/core/inc/vlc_decoder.h
@@ -38,13 +38,12 @@
 
 namespace WelsDec {
 
-typedef struct TagVlcTable
-{
-	const uint8_t (*kpCoeffTokenVlcTable[4][8])[2];
-	const uint8_t (*kpChromaCoeffTokenVlcTable)[2];
-	const uint8_t (*kpZeroTable[7])[2];
-	const uint8_t (*kpTotalZerosTable[2][15])[2];
-}SVlcTable;
+typedef struct TagVlcTable {
+const uint8_t (*kpCoeffTokenVlcTable[4][8])[2];
+const uint8_t (*kpChromaCoeffTokenVlcTable)[2];
+const uint8_t (*kpZeroTable[7])[2];
+const uint8_t (*kpTotalZerosTable[2][15])[2];
+} SVlcTable;
 
 // for data sharing cross modules and try to reduce size of binary generated
 extern const uint8_t g_kuiVlcChromaTable[256][2];
@@ -114,60 +113,59 @@
 }
 #endif
 
-static inline void_t InitVlcTable(SVlcTable * pVlcTable)
-{
-	pVlcTable->kpChromaCoeffTokenVlcTable = g_kuiVlcChromaTable;
-	
-	pVlcTable->kpCoeffTokenVlcTable[0][0] = g_kuiVlcTable_0;
-	pVlcTable->kpCoeffTokenVlcTable[0][1] = g_kuiVlcTable_1;
-	pVlcTable->kpCoeffTokenVlcTable[0][2] = g_kuiVlcTable_2;
-	pVlcTable->kpCoeffTokenVlcTable[0][3] = g_kuiVlcTable_3;
-	
-	pVlcTable->kpCoeffTokenVlcTable[1][0] = g_kuiVlcTable_0_0;
-	pVlcTable->kpCoeffTokenVlcTable[1][1] = g_kuiVlcTable_0_1;
-	pVlcTable->kpCoeffTokenVlcTable[1][2] = g_kuiVlcTable_0_2;
-	pVlcTable->kpCoeffTokenVlcTable[1][3] = g_kuiVlcTable_0_3;
-	
-	pVlcTable->kpCoeffTokenVlcTable[2][0] = g_kuiVlcTable_1_0;
-	pVlcTable->kpCoeffTokenVlcTable[2][1] = g_kuiVlcTable_1_1;
-	pVlcTable->kpCoeffTokenVlcTable[2][2] = g_kuiVlcTable_1_2;
-	pVlcTable->kpCoeffTokenVlcTable[2][3] = g_kuiVlcTable_1_3;
-	
-	pVlcTable->kpCoeffTokenVlcTable[3][0] = g_kuiVlcTable_2_0;
-	pVlcTable->kpCoeffTokenVlcTable[3][1] = g_kuiVlcTable_2_1;
-	pVlcTable->kpCoeffTokenVlcTable[3][2] = g_kuiVlcTable_2_2;
-	pVlcTable->kpCoeffTokenVlcTable[3][3] = g_kuiVlcTable_2_3;
-	pVlcTable->kpCoeffTokenVlcTable[3][4] = g_kuiVlcTable_2_4;
-	pVlcTable->kpCoeffTokenVlcTable[3][5] = g_kuiVlcTable_2_5;
-	pVlcTable->kpCoeffTokenVlcTable[3][6] = g_kuiVlcTable_2_6;
-	pVlcTable->kpCoeffTokenVlcTable[3][7] = g_kuiVlcTable_2_7;
-	
-	pVlcTable->kpZeroTable[0] = g_kuiZeroLeftTable0;
-	pVlcTable->kpZeroTable[1] = g_kuiZeroLeftTable1;
-	pVlcTable->kpZeroTable[2] = g_kuiZeroLeftTable2;
-	pVlcTable->kpZeroTable[3] = g_kuiZeroLeftTable3;
-	pVlcTable->kpZeroTable[4] = g_kuiZeroLeftTable4;
-	pVlcTable->kpZeroTable[5] = g_kuiZeroLeftTable5;
-	pVlcTable->kpZeroTable[6] = g_kuiZeroLeftTable6;
+static inline void_t InitVlcTable (SVlcTable* pVlcTable) {
+pVlcTable->kpChromaCoeffTokenVlcTable = g_kuiVlcChromaTable;
 
-	pVlcTable->kpTotalZerosTable[0][0] = g_kuiTotalZerosTable0;
-	pVlcTable->kpTotalZerosTable[0][1] = g_kuiTotalZerosTable1;
-	pVlcTable->kpTotalZerosTable[0][2] = g_kuiTotalZerosTable2;
-	pVlcTable->kpTotalZerosTable[0][3] = g_kuiTotalZerosTable3;
-	pVlcTable->kpTotalZerosTable[0][4] = g_kuiTotalZerosTable4;
-	pVlcTable->kpTotalZerosTable[0][5] = g_kuiTotalZerosTable5;
-	pVlcTable->kpTotalZerosTable[0][6] = g_kuiTotalZerosTable6;
-	pVlcTable->kpTotalZerosTable[0][7] = g_kuiTotalZerosTable7;
-	pVlcTable->kpTotalZerosTable[0][8] = g_kuiTotalZerosTable8;
-	pVlcTable->kpTotalZerosTable[0][9] = g_kuiTotalZerosTable9;
-	pVlcTable->kpTotalZerosTable[0][10] = g_kuiTotalZerosTable10;
-	pVlcTable->kpTotalZerosTable[0][11] = g_kuiTotalZerosTable11;
-	pVlcTable->kpTotalZerosTable[0][12] = g_kuiTotalZerosTable12;
-	pVlcTable->kpTotalZerosTable[0][13] = g_kuiTotalZerosTable13;
-	pVlcTable->kpTotalZerosTable[0][14] = g_kuiTotalZerosTable14;
-	pVlcTable->kpTotalZerosTable[1][0] = g_kuiTotalZerosChromaTable0;
-	pVlcTable->kpTotalZerosTable[1][1] = g_kuiTotalZerosChromaTable1;
-	pVlcTable->kpTotalZerosTable[1][2] = g_kuiTotalZerosChromaTable2;
+pVlcTable->kpCoeffTokenVlcTable[0][0] = g_kuiVlcTable_0;
+pVlcTable->kpCoeffTokenVlcTable[0][1] = g_kuiVlcTable_1;
+pVlcTable->kpCoeffTokenVlcTable[0][2] = g_kuiVlcTable_2;
+pVlcTable->kpCoeffTokenVlcTable[0][3] = g_kuiVlcTable_3;
+
+pVlcTable->kpCoeffTokenVlcTable[1][0] = g_kuiVlcTable_0_0;
+pVlcTable->kpCoeffTokenVlcTable[1][1] = g_kuiVlcTable_0_1;
+pVlcTable->kpCoeffTokenVlcTable[1][2] = g_kuiVlcTable_0_2;
+pVlcTable->kpCoeffTokenVlcTable[1][3] = g_kuiVlcTable_0_3;
+
+pVlcTable->kpCoeffTokenVlcTable[2][0] = g_kuiVlcTable_1_0;
+pVlcTable->kpCoeffTokenVlcTable[2][1] = g_kuiVlcTable_1_1;
+pVlcTable->kpCoeffTokenVlcTable[2][2] = g_kuiVlcTable_1_2;
+pVlcTable->kpCoeffTokenVlcTable[2][3] = g_kuiVlcTable_1_3;
+
+pVlcTable->kpCoeffTokenVlcTable[3][0] = g_kuiVlcTable_2_0;
+pVlcTable->kpCoeffTokenVlcTable[3][1] = g_kuiVlcTable_2_1;
+pVlcTable->kpCoeffTokenVlcTable[3][2] = g_kuiVlcTable_2_2;
+pVlcTable->kpCoeffTokenVlcTable[3][3] = g_kuiVlcTable_2_3;
+pVlcTable->kpCoeffTokenVlcTable[3][4] = g_kuiVlcTable_2_4;
+pVlcTable->kpCoeffTokenVlcTable[3][5] = g_kuiVlcTable_2_5;
+pVlcTable->kpCoeffTokenVlcTable[3][6] = g_kuiVlcTable_2_6;
+pVlcTable->kpCoeffTokenVlcTable[3][7] = g_kuiVlcTable_2_7;
+
+pVlcTable->kpZeroTable[0] = g_kuiZeroLeftTable0;
+pVlcTable->kpZeroTable[1] = g_kuiZeroLeftTable1;
+pVlcTable->kpZeroTable[2] = g_kuiZeroLeftTable2;
+pVlcTable->kpZeroTable[3] = g_kuiZeroLeftTable3;
+pVlcTable->kpZeroTable[4] = g_kuiZeroLeftTable4;
+pVlcTable->kpZeroTable[5] = g_kuiZeroLeftTable5;
+pVlcTable->kpZeroTable[6] = g_kuiZeroLeftTable6;
+
+pVlcTable->kpTotalZerosTable[0][0] = g_kuiTotalZerosTable0;
+pVlcTable->kpTotalZerosTable[0][1] = g_kuiTotalZerosTable1;
+pVlcTable->kpTotalZerosTable[0][2] = g_kuiTotalZerosTable2;
+pVlcTable->kpTotalZerosTable[0][3] = g_kuiTotalZerosTable3;
+pVlcTable->kpTotalZerosTable[0][4] = g_kuiTotalZerosTable4;
+pVlcTable->kpTotalZerosTable[0][5] = g_kuiTotalZerosTable5;
+pVlcTable->kpTotalZerosTable[0][6] = g_kuiTotalZerosTable6;
+pVlcTable->kpTotalZerosTable[0][7] = g_kuiTotalZerosTable7;
+pVlcTable->kpTotalZerosTable[0][8] = g_kuiTotalZerosTable8;
+pVlcTable->kpTotalZerosTable[0][9] = g_kuiTotalZerosTable9;
+pVlcTable->kpTotalZerosTable[0][10] = g_kuiTotalZerosTable10;
+pVlcTable->kpTotalZerosTable[0][11] = g_kuiTotalZerosTable11;
+pVlcTable->kpTotalZerosTable[0][12] = g_kuiTotalZerosTable12;
+pVlcTable->kpTotalZerosTable[0][13] = g_kuiTotalZerosTable13;
+pVlcTable->kpTotalZerosTable[0][14] = g_kuiTotalZerosTable14;
+pVlcTable->kpTotalZerosTable[1][0] = g_kuiTotalZerosChromaTable0;
+pVlcTable->kpTotalZerosTable[1][1] = g_kuiTotalZerosChromaTable1;
+pVlcTable->kpTotalZerosTable[1][2] = g_kuiTotalZerosChromaTable2;
 
 }
 
--- a/codec/decoder/core/inc/wels_common_basis.h
+++ b/codec/decoder/core/inc/wels_common_basis.h
@@ -47,61 +47,60 @@
 extern const uint8_t g_kuiScan8[24];
 extern const uint8_t g_kuiLumaDcZigzagScan[16];
 extern const uint8_t g_kuiChromaDcScan[4];
-extern __align16( const uint16_t, g_kuiDequantCoeff[52][8]);
+extern __align16 (const uint16_t, g_kuiDequantCoeff[52][8]);
 /* Profile IDC */
 typedef uint8_t		ProfileIdc;
-enum{
-	PRO_BASELINE	= 66,
-	PRO_MAIN		= 77,
-	PRO_EXTENDED	= 88,
-	PRO_HIGH		= 100,
-	PRO_HIGH10		= 110,
-	PRO_HIGH422		= 122,
-	PRO_HIGH444		= 144,
-	PRO_CAVLC444	= 244,
-	
-	PRO_SCALABLE_BASELINE	= 83,
-	PRO_SCALABLE_HIGH		= 86,
+enum {
+PRO_BASELINE	= 66,
+PRO_MAIN		= 77,
+PRO_EXTENDED	= 88,
+PRO_HIGH		= 100,
+PRO_HIGH10		= 110,
+PRO_HIGH422		= 122,
+PRO_HIGH444		= 144,
+PRO_CAVLC444	= 244,
+
+PRO_SCALABLE_BASELINE	= 83,
+PRO_SCALABLE_HIGH		= 86,
 };
 
 /*
  *	NAL Unit Type (5 Bits)
  */
-typedef enum TagNalUnitType
-{
-	NAL_UNIT_UNSPEC_0			= 0,
-	NAL_UNIT_CODED_SLICE		= 1,
-	NAL_UNIT_CODED_SLICE_DPA	= 2,
-	NAL_UNIT_CODED_SLICE_DPB	= 3,
-	NAL_UNIT_CODED_SLICE_DPC	= 4,
-	NAL_UNIT_CODED_SLICE_IDR	= 5,
-	NAL_UNIT_SEI				= 6,
-	NAL_UNIT_SPS				= 7,
-	NAL_UNIT_PPS				= 8,
-	NAL_UNIT_AU_DELIMITER		= 9,
-	NAL_UNIT_END_OF_SEQ			= 10,
-	NAL_UNIT_END_OF_STR			= 11,
-	NAL_UNIT_FILLER_DATA		= 12,
-	NAL_UNIT_SPS_EXT			= 13,
-	NAL_UNIT_PREFIX				= 14,
-	NAL_UNIT_SUBSET_SPS			= 15,
-	NAL_UNIT_RESV_16			= 16,
-	NAL_UNIT_RESV_17			= 17,
-	NAL_UNIT_RESV_18			= 18,
-	NAL_UNIT_AUX_CODED_SLICE	= 19,
-	NAL_UNIT_CODED_SLICE_EXT	= 20,
-	NAL_UNIT_RESV_21			= 21,
-	NAL_UNIT_RESV_22			= 22,
-	NAL_UNIT_RESV_23			= 23,
-	NAL_UNIT_UNSPEC_24			= 24,
-	NAL_UNIT_UNSPEC_25			= 25,
-	NAL_UNIT_UNSPEC_26			= 26,
-	NAL_UNIT_UNSPEC_27			= 27,
-	NAL_UNIT_UNSPEC_28			= 28,
-	NAL_UNIT_UNSPEC_29			= 29,
-	NAL_UNIT_UNSPEC_30			= 30,
-	NAL_UNIT_UNSPEC_31			= 31
-}ENalUnitType;
+typedef enum TagNalUnitType {
+NAL_UNIT_UNSPEC_0			= 0,
+NAL_UNIT_CODED_SLICE		= 1,
+NAL_UNIT_CODED_SLICE_DPA	= 2,
+NAL_UNIT_CODED_SLICE_DPB	= 3,
+NAL_UNIT_CODED_SLICE_DPC	= 4,
+NAL_UNIT_CODED_SLICE_IDR	= 5,
+NAL_UNIT_SEI				= 6,
+NAL_UNIT_SPS				= 7,
+NAL_UNIT_PPS				= 8,
+NAL_UNIT_AU_DELIMITER		= 9,
+NAL_UNIT_END_OF_SEQ			= 10,
+NAL_UNIT_END_OF_STR			= 11,
+NAL_UNIT_FILLER_DATA		= 12,
+NAL_UNIT_SPS_EXT			= 13,
+NAL_UNIT_PREFIX				= 14,
+NAL_UNIT_SUBSET_SPS			= 15,
+NAL_UNIT_RESV_16			= 16,
+NAL_UNIT_RESV_17			= 17,
+NAL_UNIT_RESV_18			= 18,
+NAL_UNIT_AUX_CODED_SLICE	= 19,
+NAL_UNIT_CODED_SLICE_EXT	= 20,
+NAL_UNIT_RESV_21			= 21,
+NAL_UNIT_RESV_22			= 22,
+NAL_UNIT_RESV_23			= 23,
+NAL_UNIT_UNSPEC_24			= 24,
+NAL_UNIT_UNSPEC_25			= 25,
+NAL_UNIT_UNSPEC_26			= 26,
+NAL_UNIT_UNSPEC_27			= 27,
+NAL_UNIT_UNSPEC_28			= 28,
+NAL_UNIT_UNSPEC_29			= 29,
+NAL_UNIT_UNSPEC_30			= 30,
+NAL_UNIT_UNSPEC_31			= 31
+} ENalUnitType;
 
 static const uint8_t g_kuiEmulationPreventionThreeByte	= 0x03;
 
@@ -109,27 +108,27 @@
  *	NAL Reference IDC (2 Bits)
  */
 typedef uint8_t		NalRefIdc;
-enum{
-	NRI_PRI_LOWEST	= 0,
-	NRI_PRI_LOW		= 1,
-	NRI_PRI_HIGH	= 2,
-	NRI_PRI_HIGHEST	= 3
+enum {
+NRI_PRI_LOWEST	= 0,
+NRI_PRI_LOW		= 1,
+NRI_PRI_HIGH	= 2,
+NRI_PRI_HIGHEST	= 3
 };
 
 /*
- * VCL TYPE	
+ * VCL TYPE
  */
 typedef uint8_t		VclType;
-enum{
-	NON_VCL			= 0,
-	VCL				= 1,
-	NOT_APP			= 2
+enum {
+NON_VCL			= 0,
+VCL				= 1,
+NOT_APP			= 2
 };
 
 /*
  *	vcl type map for given NAL unit type and corresponding H264 type
  */
-extern const VclType g_kuiVclTypeMap[32][2];  
+extern const VclType g_kuiVclTypeMap[32][2];
 
 #define IS_VCL_NAL(t, ext_idx)			(g_kuiVclTypeMap[t][ext_idx] == VCL)
 #define IS_PARAM_SETS_NALS(t)			( (t) == NAL_UNIT_SPS || (t) == NAL_UNIT_PPS || (t) == NAL_UNIT_SUBSET_SPS )
@@ -147,68 +146,67 @@
  * Need trim when eSliceType > 4 as fixed SliceType(eSliceType-4),
  * meaning mapped version after eSliceType minus 4.
  */
-typedef enum TagSliceType{
-	P_SLICE	= 0,
-	B_SLICE	= 1,
-	I_SLICE	= 2,
-	SP_SLICE= 3,
-	SI_SLICE= 4,
-	UNKNOWN_SLICE= 5
-}ESliceType;
+typedef enum TagSliceType {
+P_SLICE	= 0,
+B_SLICE	= 1,
+I_SLICE	= 2,
+SP_SLICE = 3,
+SI_SLICE = 4,
+UNKNOWN_SLICE = 5
+} ESliceType;
 
 /* Slice Types in scalable extension */
 typedef uint8_t		SliceTypeExt;
-enum{
-	EP_SLICE = 0,	// EP_SLICE: 0, 5
-	EB_SLICE = 1,	// EB_SLICE: 1, 6
-	EI_SLICE = 2	// EI_SLICE: 2, 7
+enum {
+EP_SLICE = 0,	// EP_SLICE: 0, 5
+EB_SLICE = 1,	// EB_SLICE: 1, 6
+EI_SLICE = 2	// EI_SLICE: 2, 7
 };
 
 /* List Index */
 typedef uint8_t		ListIndex;
-enum{
-	LIST_0	= 0,
-	LIST_1	= 1,
-	LIST_A	= 2
+enum {
+LIST_0	= 0,
+LIST_1	= 1,
+LIST_A	= 2
 };
 
 /* Picture Size */
-typedef struct TagPictureSize{
-	int32_t	iWidth;
-	int32_t iHeight;
-}SPictureSize;
+typedef struct TagPictureSize {
+int32_t	iWidth;
+int32_t iHeight;
+} SPictureSize;
 
 /* Motion Vector components */
 typedef uint8_t		MvComp;
-enum{
-	MV_X	= 0,
-	MV_Y	= 1,
-	MV_A	= 2
+enum {
+MV_X	= 0,
+MV_Y	= 1,
+MV_A	= 2
 };
 
 /* Chroma Components */
 typedef uint8_t		ChromaComp;
-enum{
-	CHROMA_CB	= 0,
-	CHROMA_CR	= 1,
-	CHROMA_A	= 2
+enum {
+CHROMA_CB	= 0,
+CHROMA_CR	= 1,
+CHROMA_A	= 2
 };
 
 /* Position Offset structure */
-typedef struct TagPosOffset{
-	int32_t	iLeftOffset;
-	int32_t	iTopOffset;
-	int32_t	iRightOffset;
-	int32_t	iBottomOffset;
-}SPosOffset;
+typedef struct TagPosOffset {
+int32_t	iLeftOffset;
+int32_t	iTopOffset;
+int32_t	iRightOffset;
+int32_t	iBottomOffset;
+} SPosOffset;
 
-enum EMbPosition //
-{
-    MB_LEFT     = 0x01,	// A
-    MB_TOP      = 0x02,	// B
-    MB_TOPRIGHT = 0x04,	// C
-	MB_TOPLEFT	= 0x08,	// D,
-    MB_PRIVATE  = 0x10,
+enum EMbPosition { //
+MB_LEFT     = 0x01,	// A
+MB_TOP      = 0x02,	// B
+MB_TOPRIGHT = 0x04,	// C
+MB_TOPLEFT	= 0x08,	// D,
+MB_PRIVATE  = 0x10,
 };
 /* MB Type & Sub-MB Type */
 typedef int32_t MbType;
@@ -246,14 +244,14 @@
 /*
  *	Memory Management Control Operation (MMCO) code
  */
-enum{
-	MMCO_END			=0,
-	MMCO_SHORT2UNUSED	=1,
-	MMCO_LONG2UNUSED	=2,
-	MMCO_SHORT2LONG		=3,
-	MMCO_SET_MAX_LONG	=4,
-	MMCO_RESET			=5,
-	MMCO_LONG			=6
+enum {
+MMCO_END			= 0,
+MMCO_SHORT2UNUSED	= 1,
+MMCO_LONG2UNUSED	= 2,
+MMCO_SHORT2LONG		= 3,
+MMCO_SET_MAX_LONG	= 4,
+MMCO_RESET			= 5,
+MMCO_LONG			= 6
 };
 
 /////////intra16x16  Luma
@@ -291,7 +289,7 @@
 
 #define C_PRED_DC_L      4
 #define C_PRED_DC_T      5
-#define C_PRED_DC_128    6 
+#define C_PRED_DC_128    6
 
 } // namespace WelsDec
 
--- a/codec/decoder/core/inc/wels_const.h
+++ b/codec/decoder/core/inc/wels_const.h
@@ -34,7 +34,7 @@
 #ifndef WELS_CONSTANCE_H__
 #define WELS_CONSTANCE_H__
 
-// Miscellaneous sizing infos  
+// Miscellaneous sizing infos
 #ifndef MAX_FNAME_LEN
 #define MAX_FNAME_LEN		256	// maximal length of file name in char size
 #endif//MAX_FNAME_LEN
@@ -95,10 +95,10 @@
 #define MAX_ACCESS_UINT_CAPACITY	1048576	// Maximal AU capacity in bytes: (1<<20) = 1024 KB predefined
 
 enum {
-	BASE_MB = 0,
-	NON_AVC_REWRITE_ENHANCE_MB =1,
-	AVC_REWRITE_ENHANCE_MB = 2
-		
+  BASE_MB = 0,
+  NON_AVC_REWRITE_ENHANCE_MB = 1,
+  AVC_REWRITE_ENHANCE_MB = 2
+
 };
 
 #endif//WELS_CONSTANCE_H__
--- a/codec/decoder/core/src/au_parser.cpp
+++ b/codec/decoder/core/src/au_parser.cpp
@@ -1,1027 +1,973 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	au_parser.c
- *
- * \brief	Interfaces introduced in Access Unit level based parser
- *
- * \date	03/10/2009 Created
- *
- *************************************************************************************
- */
-#include <string.h>
-#include "codec_def.h"
-#include "ls_defines.h"
-#include "macros.h"
-#include "au_parser.h"
-#include "decoder.h"
-#include "error_code.h"
-#include "dec_frame.h"
-#include "dec_golomb.h"
-#include "bit_stream.h"
-#include "utils.h"
-#include "codec_app_def.h"
-#include "memmgr_nal_unit.h"
-#include "decoder_core.h"
-#include "wels_common_basis.h"
-#include "decoder_core.h"
-#include "manage_dec_ref.h"
-#include "mem_align.h"
-
-namespace WelsDec {
-/*! 
- *************************************************************************************
- * \brief	Start Code Prefix (0x 00 00 00 01) detection
- *
- * \param 	pBuf		bitstream payload buffer
- * \param	pOffset		offset between NAL rbsp and original bitsteam that
- * 				start code prefix is seperated from. 
- * \param	iBufSize	count size of buffer
- *
- * \return	RBSP buffer of start code prefix exclusive
- *
- * \note	N/A
- *************************************************************************************
- */
-uint8_t* DetectStartCodePrefix( const uint8_t *kpBuf, int32_t *pOffset, int32_t iBufSize )
-{
-	uint8_t *pBits = (uint8_t *)kpBuf;	
-
-	do {
-		int32_t iIdx = 0;
-	    while( (iIdx<iBufSize) && (!(*pBits)) ){
-		    ++ pBits; 
-		    ++ iIdx;
-	    }
-		if( iIdx >= iBufSize )  break;
-
-		++ iIdx;
-		++ pBits;		
-		
-		if( (iIdx>=3) && ((*(pBits-1)) == 0x1) ){		
-			*pOffset = ((uint32_t)pBits) - ((uint32_t)kpBuf);
-            return pBits;
-		}
-		
-		iBufSize -= iIdx;	
-	}  while (1);
-
-	return NULL;
-}
-
-/*! 
- *************************************************************************************
- * \brief	to parse nal unit
- *
- * \param	pCtx		    decoder context
- * \param 	pNalUnitHeader	parsed result of NAL Unit Header to output
- * \param   pSrcRbsp        bitstream buffer to input
- * \param   iSrcRbspLen     length size of bitstream buffer payload
- * \param	pSrcNal		    
- * \param	iSrcNalLen		
- * \param	pConsumedBytes	consumed bytes during parsing
- *
- * \return	decoded bytes payload, might be (pSrcRbsp+1) if no escapes 
- *
- * \note	N/A
- *************************************************************************************
- */
-uint8_t* ParseNalHeader( PWelsDecoderContext pCtx, SNalUnitHeader *pNalUnitHeader, uint8_t *pSrcRbsp, int32_t iSrcRbspLen, uint8_t *pSrcNal, int32_t iSrcNalLen, int32_t* pConsumedBytes )
-{
-	PNalUnit pCurNal = NULL;
-	uint8_t* pNal     = pSrcRbsp;
-	int32_t iNalSize  = iSrcRbspLen;
-	PBitStringAux pBs = NULL;
-	bool_t bExtensionFlag = false;
-	int32_t iErr	= ERR_NONE;	
-	int32_t iBitSize = 0;
-	
-	pNalUnitHeader->eNalUnitType = NAL_UNIT_UNSPEC_0;//SHOULD init it. because pCtx->sCurNalHead is common variable.
-
-	//remove the consecutive ZERO at the end of current NAL in the reverse order.--2011.6.1
-	{
-		int32_t iIndex = iSrcRbspLen - 1;
-		uint8_t uiBsZero = 0; 
-		while ( iIndex >= 0 )
-		{
-			uiBsZero = pSrcRbsp[iIndex];
-			if ( 0 == uiBsZero )
-			{
-				--iNalSize;
-				--iIndex;
-			}
-			else
-			{
-				break;
-			}
-		}	
-	}
-	
-	pNalUnitHeader->uiForbiddenZeroBit	= (uint8_t)(pNal[0] >> 7);			// uiForbiddenZeroBit	
-	if ( pNalUnitHeader->uiForbiddenZeroBit )//2010.4.14
-	{
-		return NULL; //uiForbiddenZeroBit should always equal to 0
-	}
-
-	pNalUnitHeader->uiNalRefIdc		= (uint8_t)(pNal[0] >> 5);			// uiNalRefIdc
-	pNalUnitHeader->eNalUnitType		= (ENalUnitType)(pNal[0] & 0x1f);	// eNalUnitType	
-	
-	++pNal;
-	--iNalSize;
-	++(*pConsumedBytes);
-	
-#ifdef DEBUG_PARSE_INFO
-	WelsLog(pCtx, WELS_LOG_INFO, "nal type: %d \n", pNalUnitHeader->eNalUnitType);
-#endif
-	
-	if ( !(IS_SEI_NAL(pNalUnitHeader->eNalUnitType) || IS_SPS_NAL(pNalUnitHeader->eNalUnitType) || pCtx->bSpsExistAheadFlag) )
-	{
-		WelsLog( pCtx, WELS_LOG_WARNING, "parse_nal(), no exist Sequence Parameter Sets ahead of sequence when try to decode NAL(type:%d).\n", pNalUnitHeader->eNalUnitType);
-		pCtx->iErrorCode	= dsNoParamSets;
-		return NULL;
-	}
-	if ( !(IS_SEI_NAL(pNalUnitHeader->eNalUnitType) || IS_PARAM_SETS_NALS(pNalUnitHeader->eNalUnitType) || pCtx->bPpsExistAheadFlag) )
-	{
-		WelsLog( pCtx, WELS_LOG_WARNING, "parse_nal(), no exist Picture Parameter Sets ahead of sequence when try to decode NAL(type:%d).\n", pNalUnitHeader->eNalUnitType);
-		pCtx->iErrorCode	= dsNoParamSets;
-		return NULL;
-	}
-	if ( (IS_VCL_NAL_AVC_BASE(pNalUnitHeader->eNalUnitType) && !(pCtx->bSpsExistAheadFlag || pCtx->bPpsExistAheadFlag)) || 
-		(IS_NEW_INTRODUCED_NAL(pNalUnitHeader->eNalUnitType) && !(pCtx->bSpsExistAheadFlag || pCtx->bSubspsExistAheadFlag || pCtx->bPpsExistAheadFlag) ) )
-	{
-		WelsLog( pCtx, WELS_LOG_WARNING, "ParseNalHeader(), no exist Parameter Sets ahead of sequence when try to decode slice(type:%d).\n", pNalUnitHeader->eNalUnitType);
-		pCtx->iErrorCode	|= dsNoParamSets;
-		return NULL;
-	}
-	
-
-	switch(pNalUnitHeader->eNalUnitType){
-	case NAL_UNIT_SEI:
-			
-		if ( pCtx->pAccessUnitList->uiAvailUnitsNum > 0 )
-		{
-			pCtx->pAccessUnitList->uiEndPos = pCtx->pAccessUnitList->uiAvailUnitsNum - 1;
-			pCtx->bAuReadyFlag = true;
-		}
-
-		break;
-	
-	case NAL_UNIT_SPS:	
-		
-		if ( pCtx->pAccessUnitList->uiAvailUnitsNum > 0 )
-		{
-			pCtx->pAccessUnitList->uiEndPos = pCtx->pAccessUnitList->uiAvailUnitsNum - 1;
-			pCtx->bAuReadyFlag = true;				
-		}
-			
-		break;
-
-	case NAL_UNIT_PREFIX:
-		pCurNal = &pCtx->sPrefixNal;
-
-		if ( iNalSize < NAL_UNIT_HEADER_EXT_SIZE )
-		{
-			return NULL;
-		}
-
-		DecodeNalHeaderExt( pCurNal, pNal );
-		
-		pNal            += NAL_UNIT_HEADER_EXT_SIZE;
-		iNalSize        -= NAL_UNIT_HEADER_EXT_SIZE;
-		*pConsumedBytes += NAL_UNIT_HEADER_EXT_SIZE;
-
-		pCurNal->sNalHeaderExt.sNalUnitHeader.uiForbiddenZeroBit = pNalUnitHeader->uiForbiddenZeroBit;
-		pCurNal->sNalHeaderExt.sNalUnitHeader.uiNalRefIdc		  = pNalUnitHeader->uiNalRefIdc;
-		pCurNal->sNalHeaderExt.sNalUnitHeader.eNalUnitType	      = pNalUnitHeader->eNalUnitType;
-
-		pBs = &pCtx->sBs;
-		
-		iBitSize = (iNalSize<<3) - BsGetTrailingBits( pNal + iNalSize - 1 ); // convert into bit
-		
-		InitBits( pBs, pNal, iBitSize);
-
-		ParsePrefixNalUnit( pCtx, pBs );
-		
-		break;
-	case NAL_UNIT_CODED_SLICE_EXT:
-		bExtensionFlag = true;
-	case NAL_UNIT_CODED_SLICE:
-	case NAL_UNIT_CODED_SLICE_IDR:
-		{
-			PAccessUnit pCurAu		= NULL;
-			uint32_t uiAvailNalNum;
-			pCurNal = MemGetNextNal( &pCtx->pAccessUnitList );
-			if( NULL == pCurNal )
-			{
-				WelsLog( pCtx, WELS_LOG_WARNING, "MemGetNextNal() fail due out of memory.\n");
-				pCtx->iErrorCode	|= dsOutOfMemory;
-				return NULL;
-			}
-
-			pCurNal->sNalHeaderExt.sNalUnitHeader.uiForbiddenZeroBit = pNalUnitHeader->uiForbiddenZeroBit;
-			pCurNal->sNalHeaderExt.sNalUnitHeader.uiNalRefIdc		  = pNalUnitHeader->uiNalRefIdc;
-			pCurNal->sNalHeaderExt.sNalUnitHeader.eNalUnitType	  = pNalUnitHeader->eNalUnitType;
-			pCurAu	      = pCtx->pAccessUnitList;
-			uiAvailNalNum = pCurAu->uiAvailUnitsNum;
-			
-
-			if( pNalUnitHeader->eNalUnitType == NAL_UNIT_CODED_SLICE_EXT )
-			{	
-				if ( iNalSize < NAL_UNIT_HEADER_EXT_SIZE )
-				{
-					return NULL;
-				}
-
-				DecodeNalHeaderExt( pCurNal, pNal );
-                if( pCurNal->sNalHeaderExt.uiQualityId != 0 ||
-                    pCurNal->sNalHeaderExt.bUseRefBasePicFlag )
-                {
-                    if( pCurNal->sNalHeaderExt.uiQualityId != 0 )
-                        WelsLog(pCtx, WELS_LOG_WARNING, "ParseNalHeader():uiQualityId (%d) != 0, MGS not supported!\n", pCurNal->sNalHeaderExt.uiQualityId);
-                    if( pCurNal->sNalHeaderExt.bUseRefBasePicFlag != 0 )
-                        WelsLog(pCtx, WELS_LOG_WARNING, "ParseNalHeader():bUseRefBasePicFlag (%d) != 0, MGS not supported!\n", pCurNal->sNalHeaderExt.bUseRefBasePicFlag);
-
-                    pCtx->iErrorCode |= dsInvalidArgument;
-				    ForceClearCurrentNal( pCurAu );
-
-				    if ( uiAvailNalNum > 1 )
-				    {
-					    pCurAu->uiEndPos = uiAvailNalNum - 2;
-					    pCtx->bAuReadyFlag = true;
-				    }
-                    return NULL;
-                }
-				pNal            += NAL_UNIT_HEADER_EXT_SIZE;
-				iNalSize        -= NAL_UNIT_HEADER_EXT_SIZE;
-				*pConsumedBytes += NAL_UNIT_HEADER_EXT_SIZE;
-				
-			}
-			else
-			{	
-
-				
-				if ( NAL_UNIT_PREFIX == pCtx->sPrefixNal.sNalHeaderExt.sNalUnitHeader.eNalUnitType )
-				{
-					PrefetchNalHeaderExtSyntax( pCtx, pCurNal, &pCtx->sPrefixNal );
-				}	
-
-				pCurNal->sNalHeaderExt.bIdrFlag = ( NAL_UNIT_CODED_SLICE_IDR == pNalUnitHeader->eNalUnitType ) ? true : false; //SHOULD update this flag for AVC if no prefix NAL
-				pCurNal->sNalHeaderExt.iNoInterLayerPredFlag = 1;
-			}		
-						
-			pBs = &pCurAu->pNalUnitsList[uiAvailNalNum-1]->sNalData.sVclNal.sSliceBitsRead;
-			iBitSize = (iNalSize<<3) - BsGetTrailingBits( pNal+ iNalSize - 1 ); // convert into bit
-			InitBits( pBs, pNal, iBitSize);
-			iErr = ParseSliceHeaderSyntaxs( pCtx, pBs, bExtensionFlag );
-			if ( iErr != ERR_NONE )
-			{
-				//if current NAL occur error when parsing, should clean it from pNalUnitsList
-				//otherwise, when Next good NAL decoding, this corrupt NAL is considered as normal NAL and lead to decoder crash		
-				ForceClearCurrentNal( pCurAu );
-
-				if ( uiAvailNalNum > 1 )
-				{
-					pCurAu->uiEndPos = uiAvailNalNum - 2;
-					pCtx->bAuReadyFlag = true;
-					
-
-				}
-#ifdef MOSAIC_AVOID_BASED_ON_SPS_PPS_ID			
-				if (  dsNoParamSets & pCtx->iErrorCode )
-				{
-					if ( uiAvailNalNum <= 1 ) //no any data to decode and SPS/PPS ID mismatch, SHOULD request IDR
-					{
-#ifdef LONG_TERM_REF
-						pCtx->bParamSetsLostFlag = true;
-#else
-						pCtx->bReferenceLostAtT0Flag = true;
-#endif
-						ResetParameterSetsState( pCtx );
-					}
-					return NULL;
-				}
-				else
-				{
-					return NULL;
-				}
-#else
-				return NULL;
-#endif //MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
-			}
-
-			if ( (uiAvailNalNum > 1) &&
-                CheckAccessUnitBoundary(	pCurAu->pNalUnitsList[uiAvailNalNum-1], pCurAu->pNalUnitsList[uiAvailNalNum-2], 
-				pCurAu->pNalUnitsList[uiAvailNalNum-1]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.pSps) )
-			{
-				pCurAu->uiEndPos = uiAvailNalNum - 2;
-				pCtx->bAuReadyFlag = true;
-				
-
-			}	
-		}
-		break;
-	default:
-		break;
-	}  
-	
-	return pNal;
-}	
-
-
-bool_t CheckAccessUnitBoundaryExt( PNalUnitHeaderExt pLastNalHdrExt, PNalUnitHeaderExt pCurNalHeaderExt, PSliceHeader pLastSliceHeader, PSliceHeader pCurSliceHeader )
-{
-	const PSps kpSps = pCurSliceHeader->pSps;
-
-	//Sub-clause 7.1.4.1.1 temporal_id  
-	if (pLastNalHdrExt->uiTemporalId != pCurNalHeaderExt->uiTemporalId) {
-		return TRUE;
-	}
-
-	// Subclause 7.4.1.2.5
-	if ( pLastSliceHeader->iRedundantPicCnt < pCurSliceHeader->iRedundantPicCnt )
-		return FALSE;
-	else if ( pLastSliceHeader->iRedundantPicCnt > pCurSliceHeader->iRedundantPicCnt )
-		return TRUE;
-
-	// Subclause G7.4.1.2.4
-	if ( pLastNalHdrExt->uiDependencyId < pCurNalHeaderExt->uiDependencyId )
-		return FALSE;
-	else if ( pLastNalHdrExt->uiDependencyId > pCurNalHeaderExt->uiDependencyId )
-		return TRUE;
-	if ( pLastNalHdrExt->uiQualityId < pCurNalHeaderExt->uiQualityId )
-		return FALSE;
-	else if ( pLastNalHdrExt->uiQualityId > pCurNalHeaderExt->uiQualityId )
-		return TRUE;
-
-	// Subclause 7.4.1.2.4
-	if ( pLastSliceHeader->iFrameNum != pCurSliceHeader->iFrameNum )
-		return TRUE;
-	if ( pLastSliceHeader->iPpsId != pCurSliceHeader->iPpsId )
-		return TRUE;
-	if ( pLastSliceHeader->bFieldPicFlag != pCurSliceHeader->bFieldPicFlag )
-		return TRUE;
-	if ( pLastSliceHeader->bBottomFiledFlag != pCurSliceHeader->bBottomFiledFlag )
-		return TRUE;
-	if ( (pLastNalHdrExt->sNalUnitHeader.uiNalRefIdc != NRI_PRI_LOWEST) !=  (pCurNalHeaderExt->sNalUnitHeader.uiNalRefIdc != NRI_PRI_LOWEST) )
-		return TRUE;
-	if ( pLastNalHdrExt->bIdrFlag != pCurNalHeaderExt->bIdrFlag )
-		return TRUE;
-	if ( pCurNalHeaderExt->bIdrFlag ){
-		if ( pLastSliceHeader->uiIdrPicId != pCurSliceHeader->uiIdrPicId )
-			return TRUE;
-	}
-	if ( kpSps->uiPocType == 0 ){
-		if ( pLastSliceHeader->iPicOrderCntLsb != pCurSliceHeader->iPicOrderCntLsb )
-			return TRUE;
-		if ( pLastSliceHeader->iDeltaPicOrderCntBottom != pCurSliceHeader->iDeltaPicOrderCntBottom )
-			return TRUE;
-	}
-	else if ( kpSps->uiPocType == 1 ){
-		if ( pLastSliceHeader->iDeltaPicOrderCnt[0] != pCurSliceHeader->iDeltaPicOrderCnt[0] )
-			return TRUE;
-		if ( pLastSliceHeader->iDeltaPicOrderCnt[1] != pCurSliceHeader->iDeltaPicOrderCnt[1] )
-			return TRUE;
-	}
-
-	return FALSE;
-}	 
-
-
-bool_t CheckAccessUnitBoundary( const PNalUnit kpCurNal, const PNalUnit kpLastNal, const PSps kpSps )
-{
-	const PNalUnitHeaderExt kpLastNalHeaderExt = &kpLastNal->sNalHeaderExt;
-	const PNalUnitHeaderExt kpCurNalHeaderExt = &kpCurNal->sNalHeaderExt;
-	const SSliceHeader *kpLastSliceHeader = &kpLastNal->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader;
-	const SSliceHeader *kpCurSliceHeader = &kpCurNal->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader;
-
-	//Sub-clause 7.1.4.1.1 temporal_id  
-	if (kpLastNalHeaderExt->uiTemporalId != kpCurNalHeaderExt->uiTemporalId) {
-		return TRUE;
-	}
-
-	// Subclause 7.4.1.2.5
-	if ( kpLastSliceHeader->iRedundantPicCnt < kpCurSliceHeader->iRedundantPicCnt )
-		return FALSE;
-	else if ( kpLastSliceHeader->iRedundantPicCnt > kpCurSliceHeader->iRedundantPicCnt )
-		return TRUE;
-
-	// Subclause G7.4.1.2.4
-	if ( kpLastNalHeaderExt->uiDependencyId < kpCurNalHeaderExt->uiDependencyId )
-		return FALSE;
-	else if ( kpLastNalHeaderExt->uiDependencyId > kpCurNalHeaderExt->uiDependencyId )
-		return TRUE;
-	if ( kpLastNalHeaderExt->uiQualityId < kpCurNalHeaderExt->uiQualityId )
-		return FALSE;
-	else if ( kpLastNalHeaderExt->uiQualityId > kpCurNalHeaderExt->uiQualityId )
-		return TRUE;
-
-	// Subclause 7.4.1.2.4
-	if ( kpLastSliceHeader->iFrameNum != kpCurSliceHeader->iFrameNum )
-		return TRUE;
-	if ( kpLastSliceHeader->iPpsId != kpCurSliceHeader->iPpsId )
-		return TRUE;
-	if ( kpLastSliceHeader->bFieldPicFlag != kpCurSliceHeader->bFieldPicFlag )
-		return TRUE;
-	if ( kpLastSliceHeader->bBottomFiledFlag != kpCurSliceHeader->bBottomFiledFlag )
-		return TRUE;
-	if ( (kpLastNalHeaderExt->sNalUnitHeader.uiNalRefIdc != NRI_PRI_LOWEST) !=  (kpCurNalHeaderExt->sNalUnitHeader.uiNalRefIdc != NRI_PRI_LOWEST) )
-		return TRUE;
-	if ( kpLastNalHeaderExt->bIdrFlag != kpCurNalHeaderExt->bIdrFlag )
-		return TRUE;
-	if ( kpCurNalHeaderExt->bIdrFlag ){
-		if ( kpLastSliceHeader->uiIdrPicId != kpCurSliceHeader->uiIdrPicId )
-			return TRUE;
-	}
-	if ( kpSps->uiPocType == 0 ){
-		if ( kpLastSliceHeader->iPicOrderCntLsb != kpCurSliceHeader->iPicOrderCntLsb )
-			return TRUE;
-		if ( kpLastSliceHeader->iDeltaPicOrderCntBottom != kpCurSliceHeader->iDeltaPicOrderCntBottom )
-			return TRUE;
-	}
-	else if ( kpSps->uiPocType == 1 ){
-		if ( kpLastSliceHeader->iDeltaPicOrderCnt[0] != kpCurSliceHeader->iDeltaPicOrderCnt[0] )
-			return TRUE;
-		if ( kpLastSliceHeader->iDeltaPicOrderCnt[1] != kpCurSliceHeader->iDeltaPicOrderCnt[1] )
-			return TRUE;
-	}
-
-	return FALSE;
-}	 
-
-/*! 
- *************************************************************************************
- * \brief	to parse NON VCL NAL Units
- *
- * \param 	pCtx		decoder context
- * \param	rbsp		rbsp buffer of NAL Unit
- * \param	src_len		length of rbsp buffer
- *
- * \return	0 - successed
- *	    	1 - failed
- *
- *************************************************************************************
- */
-int32_t ParseNonVclNal( PWelsDecoderContext pCtx, uint8_t *pRbsp, const int32_t kiSrcLen )
-{
-	PBitStringAux	pBs = NULL;	
-	ENalUnitType eNalType	= NAL_UNIT_UNSPEC_0; // make initial value as unspecified
-	int32_t iPicWidth		= 0;
-	int32_t iPicHeight		= 0;
-	int32_t iBitSize		= 0;
-	int32_t iErr				= ERR_NONE;	
-
-	pBs	     = &pCtx->sBs;	// SBitStringAux instance for non VCL NALs decoding
-	iBitSize = (kiSrcLen<<3) - BsGetTrailingBits( pRbsp + kiSrcLen - 1 ); // convert into bit		
-	eNalType = pCtx->sCurNalHead.eNalUnitType;
-
-	switch( eNalType ) {	
-		case NAL_UNIT_SPS:
-		case NAL_UNIT_SUBSET_SPS:
-			if ( iBitSize > 0 )
-				InitBits( pBs, pRbsp, iBitSize );
-#ifdef DEBUG_PARSE_INFO
-			WelsLog(pCtx, WELS_LOG_INFO, "parsing nal: %d \n", eNalType);
-#endif
-			iErr = ParseSps( pCtx, pBs, &iPicWidth, &iPicHeight );
-			if ( ERR_NONE != iErr )	// modified for pSps/pSubsetSps invalid, 12/1/2009 
-			{
-				pCtx->iErrorCode |= dsNoParamSets;
-				return iErr;
-			}
-
-			if ( ERR_NONE == iErr )
-				UpdateMaxPictureResolution( pCtx, iPicWidth, iPicHeight );
-			
-			break;		
-
-		case NAL_UNIT_PPS:
-			if ( iBitSize > 0 )
-				InitBits( pBs, pRbsp, iBitSize );
-#ifdef DEBUG_PARSE_INFO
-			WelsLog(pCtx, WELS_LOG_INFO, "parsing nal: %d \n", eNalType);
-#endif
-			iErr = ParsePps( pCtx, &pCtx->sPpsBuffer[0], pBs );
-			if ( ERR_NONE != iErr )	// modified for pps invalid, 12/1/2009 
-			{
-				pCtx->iErrorCode |= dsNoParamSets;
-				return iErr;
-			}
-
-			pCtx->bPpsExistAheadFlag	= true;
-
-			break;
-
-		case NAL_UNIT_SEI:
-
-			break;
-
-		case NAL_UNIT_PREFIX:
-			break;		
-		case NAL_UNIT_CODED_SLICE_DPA:
-		case NAL_UNIT_CODED_SLICE_DPB:
-		case NAL_UNIT_CODED_SLICE_DPC:
-
-			break;
-
-		default:
-			break;		
-	}
-
-	return iErr;
-}
-
-void_t ParseRefBasePicMarking ( PBitStringAux pBs, PRefBasePicMarking pRefBasePicMarking )
-{	
-	const bool_t kbAdaptiveMarkingModeFlag = !!BsGetOneBit( pBs );
-	pRefBasePicMarking->bAdaptiveRefBasePicMarkingModeFlag = kbAdaptiveMarkingModeFlag;
-	if ( kbAdaptiveMarkingModeFlag ){
-		int32_t iIdx = 0;
-		do {
-			const uint32_t kuiMmco = BsGetUe( pBs );
-
-			pRefBasePicMarking->mmco_base[iIdx].uiMmcoType	= kuiMmco;
-
-			if (kuiMmco == MMCO_END)
-				break;
-
-			if (kuiMmco == MMCO_SHORT2UNUSED){
-				pRefBasePicMarking->mmco_base[iIdx].uiDiffOfPicNums	= 1 + BsGetUe( pBs );
-				pRefBasePicMarking->mmco_base[iIdx].iShortFrameNum	= 0;
-			}
-			else if (kuiMmco == MMCO_LONG2UNUSED){
-				pRefBasePicMarking->mmco_base[iIdx].uiLongTermPicNum	= BsGetUe( pBs );
-			}
-			++ iIdx;
-		} while(iIdx < MAX_MMCO_COUNT);
-	}
-}
-
-void_t ParsePrefixNalUnit ( PWelsDecoderContext pCtx, PBitStringAux pBs )
-{
-	PNalUnit pCurNal = &pCtx->sPrefixNal;
-
-	if ( pCurNal->sNalHeaderExt.sNalUnitHeader.uiNalRefIdc != 0 ){
-		PNalUnitHeaderExt head_ext = &pCurNal->sNalHeaderExt;
-		PPrefixNalUnit sPrefixNal = &pCurNal->sNalData.sPrefixNal;
-		sPrefixNal->bStoreRefBasePicFlag	= !!BsGetOneBit( pBs );
-		if ( (head_ext->bUseRefBasePicFlag || sPrefixNal->bStoreRefBasePicFlag) && !head_ext->bIdrFlag )
-		{
-			ParseRefBasePicMarking ( pBs, &sPrefixNal->sRefPicBaseMarking );
-		}
-		sPrefixNal->bPrefixNalUnitAdditionalExtFlag	= !!BsGetOneBit( pBs );
-		if ( sPrefixNal->bPrefixNalUnitAdditionalExtFlag ){
-			sPrefixNal->bPrefixNalUnitExtFlag	= !!BsGetOneBit( pBs );
-		}
-	}	
-}
-
-
-int32_t DecodeSpsSvcExt( PWelsDecoderContext pCtx, PSubsetSps pSpsExt, PBitStringAux pBs )
-{	
-	PSpsSvcExt  pExt			= NULL;
-	uint8_t uiChromaArrayType	= 1;
-
-	pExt	= &pSpsExt->sSpsSvcExt;
-	
-	pExt->bInterLayerDeblockingFilterCtrlPresentFlag	= !!BsGetOneBit( pBs );
-	pExt->uiExtendedSpatialScalability						= BsGetBits( pBs, 2 );
-	if ( pExt->uiExtendedSpatialScalability > 2 )
-    {
-        WelsLog(pCtx, WELS_LOG_WARNING, "DecodeSpsSvcExt():extended_spatial_scalability (%d) != 0, ESS not supported!\n", pExt->uiExtendedSpatialScalability);
-		return GENERATE_ERROR_NO(ERR_LEVEL_PARAM_SETS, ERR_INFO_INVALID_ESS);
-    }
-	
-	pExt->uiChromaPhaseXPlus1Flag	= 0;	// FIXME: Incoherent with JVT X201 standard (= 1), but conformance to JSVM (= 0) implementation.
-	pExt->uiChromaPhaseYPlus1		= 1;
-	uiChromaArrayType = pSpsExt->sSps.uiChromaArrayType;
-
-    pExt->uiChromaPhaseXPlus1Flag	= BsGetOneBit( pBs );
-    pExt->uiChromaPhaseYPlus1		= BsGetBits( pBs, 2 );
-	
-	pExt->uiSeqRefLayerChromaPhaseXPlus1Flag	= pExt->uiChromaPhaseXPlus1Flag;
-	pExt->uiSeqRefLayerChromaPhaseYPlus1		= pExt->uiChromaPhaseYPlus1;
-	memset(&pExt->sSeqScaledRefLayer, 0, sizeof(SPosOffset));
-
-    if ( pExt->uiExtendedSpatialScalability == 1 ){
-		SPosOffset* const kpPos = &pExt->sSeqScaledRefLayer;
-		pExt->uiSeqRefLayerChromaPhaseXPlus1Flag	= BsGetOneBit( pBs );
-		pExt->uiSeqRefLayerChromaPhaseYPlus1		= BsGetBits( pBs, 2 );
-
-        kpPos->iLeftOffset	= BsGetSe( pBs );
-		kpPos->iTopOffset	= BsGetSe( pBs );
-		kpPos->iRightOffset	= BsGetSe( pBs );
-		kpPos->iBottomOffset= BsGetSe( pBs );
-	}
-	
-	pExt->bSeqTCoeffLevelPredFlag	= !!BsGetOneBit( pBs );
-	pExt->bAdaptiveTCoeffLevelPredFlag	= false;
-	if ( pExt->bSeqTCoeffLevelPredFlag )
-		pExt->bAdaptiveTCoeffLevelPredFlag	= !!BsGetOneBit( pBs );
-	pExt->bSliceHeaderRestrictionFlag	= !!BsGetOneBit( pBs );
-
-
-	
-	return 0;
-}
-
-/*! 
- *************************************************************************************
- * \brief	to parse Sequence Parameter Set (SPS)
- *
- * \param	pCtx		Decoder context
- * \param	pBsAux		bitstream reader auxiliary 
- * \param	pPicWidth	picture width current Sps represented
- * \param	pPicHeight	picture height current Sps represented
- *
- * \return	0 - successed
- *		1 - failed
- *
- * \note	Call it in case eNalUnitType is SPS.
- *************************************************************************************
- */
-
-
-int32_t ParseSps( PWelsDecoderContext pCtx, PBitStringAux pBsAux, int32_t *pPicWidth, int32_t *pPicHeight  )
-{
-	PBitStringAux pBs		= pBsAux;
-	PSps pSps				= NULL;
-	PSubsetSps pSubsetSps	= NULL;
-	SNalUnitHeader *pNalHead= &pCtx->sCurNalHead;
-	ProfileIdc	uiProfileIdc;
-	uint8_t	uiLevelIdc;
-	int32_t iSpsId;
-	bool_t bConstraintSetFlags[6] = { false };
-	const bool_t kbUseSubsetFlag   = IS_SUBSET_SPS_NAL(pNalHead->eNalUnitType);
-
-	
-	if ( kbUseSubsetFlag )	// SubsetSps
-	{
-		pCtx->bSubspsExistAheadFlag	= true;
-	}
-	else	// Sps
-	{
-		pCtx->bSpsExistAheadFlag		= true;
-
-		// added for EC, 10/28/2009		
-		// for safe
-		memset( &pCtx->bSpsAvailFlags[0], 0, sizeof(pCtx->bSpsAvailFlags) );
-		memset( &pCtx->bSubspsAvailFlags[0], 0, sizeof(pCtx->bSubspsAvailFlags) );
-		memset( &pCtx->bPpsAvailFlags[0], 0, sizeof(pCtx->bPpsAvailFlags) );
-
-#ifdef MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
-		pCtx->iSpsTotalNum    = 0;
-		pCtx->iSubspsTotalNum = 0;
-		pCtx->iPpsTotalNum    = 0;
-#endif //MOSAIC_AVOID_BASED_ON_SPS_PPS_ID		
-	}
-
-	uiProfileIdc	= BsGetBits( pBs, 8 );
-	bConstraintSetFlags[0]	= !!BsGetOneBit( pBs );	// constraint_set0_flag
-	bConstraintSetFlags[1]	= !!BsGetOneBit( pBs );	// constraint_set1_flag
-	bConstraintSetFlags[2]	= !!BsGetOneBit( pBs );	// constraint_set2_flag
-	bConstraintSetFlags[3]	= !!BsGetOneBit( pBs );	// constraint_set3_flag
-	bConstraintSetFlags[4]	= !!BsGetOneBit( pBs );	// constraint_set4_flag
-	bConstraintSetFlags[5]	= !!BsGetOneBit( pBs );	// constraint_set5_flag
-	BsGetBits( pBs, 2 );							// reserved_zero_2bits, equal to 0
-	uiLevelIdc	= BsGetBits( pBs, 8  );				// level_idc
-
-	iSpsId		= BsGetUe( pBs  );					// seq_parameter_set_id
-	
-		
-	if ( iSpsId >= MAX_SPS_COUNT || iSpsId < 0 )	// Modified to check invalid negative iSpsId, 12/1/2009
-	{
-		WelsLog( pCtx, WELS_LOG_WARNING, " iSpsId is out of range! \n");
-		return GENERATE_ERROR_NO(ERR_LEVEL_PARAM_SETS, ERR_INFO_SPS_ID_OVERFLOW);
-	}
-
-	if ( kbUseSubsetFlag )
-	{
-#ifdef MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
-		pSubsetSps = &pCtx->sSubsetSpsBuffer[pCtx->iSubspsTotalNum];
-		pCtx->bSubspsAvailFlags[pCtx->iSubspsTotalNum] = true;
-		
-		pSubsetSps->sSps.iSpsId = iSpsId;
-		pSps = &pSubsetSps->sSps;
-		++pCtx->iSubspsTotalNum;
-#else
-		pSubsetSps	= &pCtx->sSubsetSpsBuffer[iSpsId];
-		pSps		= &pSubsetSps->sSps;		
-		pCtx->bSubspsAvailFlags[iSpsId]	= true; // added for EC, 10/28/2009
-#endif //MOSAIC_AVOID_BASED_ON_SPS_PPS_ID			
-	}
-	else
-	{
-#ifdef MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
-		pSps = &pCtx->sSpsBuffer[pCtx->iSpsTotalNum];
-		pCtx->bSpsAvailFlags[pCtx->iSpsTotalNum] = true;
-		
-		pSps->iSpsId = iSpsId;
-		++pCtx->iSpsTotalNum;
-#else
-		pSps = &pCtx->sSpsBuffer[iSpsId];		
-		pCtx->bSpsAvailFlags[iSpsId] = true; // added for EC, 10/28/2009
-#endif //MOSAIC_AVOID_BASED_ON_SPS_PPS_ID		
-	}
-
-	// syntax elements in default
-	pSps->uiChromaFormatIdc	= 1;
-	pSps->uiBitDepthLuma		=
-	pSps->uiBitDepthChroma	= 8; 
-	
-	pSps->uiProfileIdc	= uiProfileIdc;
-	pSps->uiLevelIdc	= uiLevelIdc;
-	pSps->iSpsId		= iSpsId;
-
-	if ( PRO_SCALABLE_BASELINE == uiProfileIdc || PRO_SCALABLE_HIGH == uiProfileIdc ||
-		PRO_HIGH == uiProfileIdc || PRO_HIGH10 == uiProfileIdc ||
-		PRO_HIGH422 == uiProfileIdc || PRO_HIGH444 == uiProfileIdc ||
-		PRO_CAVLC444 == uiProfileIdc || 44 == uiProfileIdc ){
-		
-		pSps->uiChromaFormatIdc = BsGetUe( pBs );	
-        if( pSps->uiChromaFormatIdc != 1 )
-        {
-            WelsLog( pCtx, WELS_LOG_WARNING, "ParseSps(): chroma_format_idc (%d) = 1 supported.\n", pSps->uiChromaFormatIdc);
-            return GENERATE_ERROR_NO(ERR_LEVEL_PARAM_SETS, ERR_INFO_UNSUPPORTED_NON_BASELINE);       
-        }
-        pSps->uiChromaArrayType = pSps->uiChromaFormatIdc;
-		pSps->uiBitDepthLuma		= 8 + BsGetUe( pBs );
-        if( pSps->uiBitDepthLuma != 8 )
-        {
-            WelsLog( pCtx, WELS_LOG_WARNING, "ParseSps(): bit_depth_luma (%d) Only 8 bit supported.\n", pSps->uiBitDepthLuma);
-            return GENERATE_ERROR_NO(ERR_LEVEL_PARAM_SETS, ERR_INFO_UNSUPPORTED_NON_BASELINE);       
-        }
-		
-		pSps->uiBitDepthChroma	= 8 + BsGetUe( pBs );
-        if( pSps->uiBitDepthChroma != 8 )
-        {
-            WelsLog( pCtx, WELS_LOG_WARNING, "ParseSps(): bit_depth_chroma (%d). Only 8 bit supported.\n", pSps->uiBitDepthChroma);
-            return GENERATE_ERROR_NO(ERR_LEVEL_PARAM_SETS, ERR_INFO_UNSUPPORTED_NON_BASELINE);       
-        }
-		pSps->bQpPrimeYZeroTransfBypassFlag	= !!BsGetOneBit( pBs );
-		pSps->bSeqScalingMatrixPresentFlag	= !!BsGetOneBit( pBs );
-		
-		if ( pSps->bSeqScalingMatrixPresentFlag ){	// For high profile, it is not used in current application. FIXME
-            WelsLog( pCtx, WELS_LOG_WARNING, "ParseSps(): seq_scaling_matrix_present_flag (%d). Feature not supported.\n", pSps->bSeqScalingMatrixPresentFlag);
-            return GENERATE_ERROR_NO(ERR_LEVEL_PARAM_SETS, ERR_INFO_UNSUPPORTED_NON_BASELINE);       
-		}
-	}
-
-	pSps->uiLog2MaxFrameNum	= 4 + BsGetUe( pBs );	// log2_max_frame_num_minus4
-	pSps->uiPocType			= BsGetUe( pBs );		// pic_order_cnt_type
-	
-	if ( 0 == pSps->uiPocType )
-	{
-		pSps->iLog2MaxPocLsb	= 4 + BsGetUe( pBs );	// log2_max_pic_order_cnt_lsb_minus4
-		
-	}
-	else if ( 1 == pSps->uiPocType )
-	{
-		int32_t i;
-		pSps->bDeltaPicOrderAlwaysZeroFlag	= !!BsGetOneBit( pBs );	// bDeltaPicOrderAlwaysZeroFlag
-		pSps->iOffsetForNonRefPic			= BsGetSe( pBs );		// iOffsetForNonRefPic
-		pSps->iOffsetForTopToBottomField	= BsGetSe( pBs );		// iOffsetForTopToBottomField
-		pSps->iNumRefFramesInPocCycle		= BsGetUe( pBs );	// num_ref_frames_in_pic_order_cnt_cycle
-		for( i = 0; i < pSps->iNumRefFramesInPocCycle; i++ )
-			pSps->iOffsetForRefFrame[ i ]	= BsGetSe( pBs );		// iOffsetForRefFrame[ i ]
-	}
-	if ( pSps->uiPocType > 2 )
-	{
-		WelsLog( pCtx, WELS_LOG_WARNING, " illegal pic_order_cnt_type: %d ! \n", pSps->uiPocType );
-		return GENERATE_ERROR_NO(ERR_LEVEL_PARAM_SETS, ERR_INFO_INVALID_POC_TYPE);
-	}
-
-	pSps->iNumRefFrames	= BsGetUe( pBs );		// max_num_ref_frames
-	pSps->bGapsInFrameNumValueAllowedFlag	= !!BsGetOneBit( pBs );	// bGapsInFrameNumValueAllowedFlag
-	pSps->iMbWidth		= 1 + BsGetUe( pBs );		// pic_width_in_mbs_minus1
-	pSps->iMbHeight		= 1 + BsGetUe( pBs );		// pic_height_in_map_units_minus1
-	pSps->uiTotalMbCount	= pSps->iMbWidth * pSps->iMbHeight;
-	pSps->bFrameMbsOnlyFlag	= !!BsGetOneBit( pBs );	// frame_mbs_only_flag
-	
-	if ( !pSps->bFrameMbsOnlyFlag )
-    {
-        WelsLog( pCtx, WELS_LOG_WARNING, "ParseSps(): frame_mbs_only_flag (%d) not supported.\n", pSps->bFrameMbsOnlyFlag);
-		return GENERATE_ERROR_NO(ERR_LEVEL_PARAM_SETS, ERR_INFO_UNSUPPORTED_MBAFF);
-    }
-	pSps->bDirect8x8InferenceFlag	= !!BsGetOneBit( pBs );	// direct_8x8_inference_flag
-	pSps->bFrameCroppingFlag		= !!BsGetOneBit( pBs );	// frame_cropping_flag
-	if ( pSps->bFrameCroppingFlag )
-	{
-		pSps->sFrameCrop.iLeftOffset	= BsGetUe( pBs );	// frame_crop_left_offset
-		pSps->sFrameCrop.iRightOffset	= BsGetUe( pBs );	// frame_crop_right_offset
-		pSps->sFrameCrop.iTopOffset		= BsGetUe( pBs );	// frame_crop_top_offset
-        pSps->sFrameCrop.iBottomOffset	= BsGetUe( pBs );	// frame_crop_bottom_offset
-	}
-	else
-	{
-		pSps->sFrameCrop.iLeftOffset	= 0;				// frame_crop_left_offset
-		pSps->sFrameCrop.iRightOffset	= 0;				// frame_crop_right_offset
-		pSps->sFrameCrop.iTopOffset		= 0;				// frame_crop_top_offset
-		pSps->sFrameCrop.iBottomOffset	= 0;				// frame_crop_bottom_offset
-	}
-	pSps->bVuiParamPresentFlag			= !!BsGetOneBit( pBs );	// vui_parameters_present_flag
-	
-	// Check if SPS SVC extension applicated
-	if ( kbUseSubsetFlag && ( PRO_SCALABLE_BASELINE == uiProfileIdc || PRO_SCALABLE_HIGH == uiProfileIdc ) )
-	{
-		if ( DecodeSpsSvcExt( pCtx, pSubsetSps, pBs ) != ERR_NONE ){
-			return -1;
-		}
-		
-		pSubsetSps->bSvcVuiParamPresentFlag = !!BsGetOneBit( pBs );
-		if ( pSubsetSps->bSvcVuiParamPresentFlag ){
-		}
-	}
-
-
-	if ( PRO_SCALABLE_BASELINE == uiProfileIdc || PRO_SCALABLE_HIGH == uiProfileIdc )
-		pCtx->bAvcBasedFlag	= false;
-	else
-		pCtx->bAvcBasedFlag	= true;	// added for avc base pBs
-
-	*pPicWidth	= pSps->iMbWidth << 4;
-	*pPicHeight	= pSps->iMbHeight << 4;
-	
-	return 0;
-}
-
-/*! 
- *************************************************************************************
- * \brief	to parse Picture Parameter Set (PPS)
- *
- * \param	pCtx		Decoder context
- * \param 	pPpsList	pps list
- * \param	pBsAux		bitstream reader auxiliary 
- *
- * \return	0 - successed
- *		1 - failed
- *
- * \note	Call it in case eNalUnitType is PPS.
- *************************************************************************************
- */
-int32_t ParsePps( PWelsDecoderContext pCtx, PPps pPpsList, PBitStringAux pBsAux )
-{
-
-	PPps pPps = NULL;	
-	uint32_t uiPpsId = 0;
-    uint32_t iTmp;
-
-	uiPpsId = BsGetUe(pBsAux);	
-	if ( uiPpsId >= MAX_PPS_COUNT )
-	{
-		return ERR_INFO_PPS_ID_OVERFLOW;
-	}
-
-#ifdef MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
-	pPps = &pPpsList[pCtx->iPpsTotalNum];
-#else
-	pPps = &pPpsList[uiPpsId];
-#endif //MOSAIC_AVOID_BASED_ON_SPS_PPS_ID	
-	
-	
-	pPps->iPpsId = uiPpsId;
-	pPps->iSpsId = BsGetUe(pBsAux);
-	
-	if (pPps->iSpsId >= MAX_SPS_COUNT)
-	{
-		return ERR_INFO_SPS_ID_OVERFLOW;
-	}
-
-	pPps->bEntropyCodingModeFlag = !!BsGetOneBit(pBsAux);
-	pPps->bPicOrderPresentFlag   = !!BsGetOneBit(pBsAux);
-
-	pPps->uiNumSliceGroups = 1 + BsGetUe(pBsAux);
-
-	if (pPps->uiNumSliceGroups > MAX_SLICEGROUP_IDS)
-	{
-		return ERR_INFO_INVALID_SLICEGROUP;
-	}
-
-	if (pPps->uiNumSliceGroups > 1)
-	{
-		pPps->uiSliceGroupMapType = BsGetUe(pBsAux);
-        if( pPps->uiSliceGroupMapType > 1)
-        {
-            WelsLog( pCtx, WELS_LOG_WARNING, "ParsePps(): slice_group_map_type (%d): support only 0,1.\n", pPps->uiSliceGroupMapType);
-		    return GENERATE_ERROR_NO(ERR_LEVEL_PARAM_SETS, ERR_INFO_UNSUPPORTED_FMOTYPE);
-        }
-		
-		switch(pPps->uiSliceGroupMapType)
-		{
-		case 0:
-			for (iTmp = 0; iTmp < pPps->uiNumSliceGroups; iTmp++)
-			{
-				pPps->uiRunLength[iTmp] = 1 + BsGetUe(pBsAux);
-			}
-			break;
-		default:
-			break;
-		}
-	}
-
-	pPps->uiNumRefIdxL0Active = 1 + BsGetUe(pBsAux);
-	pPps->uiNumRefIdxL1Active = 1 + BsGetUe(pBsAux);
-
-	if (pPps->uiNumRefIdxL0Active > MAX_REF_PIC_COUNT ||
-		pPps->uiNumRefIdxL1Active > MAX_REF_PIC_COUNT) 
-	{
-		return ERR_INFO_REF_COUNT_OVERFLOW;
-	}
-	
-	pPps->bWeightedPredFlag  = !!BsGetOneBit(pBsAux);
-	pPps->uiWeightedBipredIdc = BsGetBits(pBsAux, 2);
-    if( pPps->bWeightedPredFlag || pPps->uiWeightedBipredIdc != 0 )
-    {
-        WelsLog( pCtx, WELS_LOG_WARNING, "ParsePps(): weighted_pred_flag (%d) weighted_bipred_idc (%d) neither supported.\n", pPps->bWeightedPredFlag, pPps->uiWeightedBipredIdc);
-        return GENERATE_ERROR_NO(ERR_LEVEL_PARAM_SETS, ERR_INFO_UNSUPPORTED_WP);
-    }
-
-	pPps->iPicInitQp = 26 + BsGetSe(pBsAux);
-	pPps->iPicInitQs = 26 + BsGetSe(pBsAux);
-
-	pPps->iChromaQpIndexOffset                  = BsGetSe(pBsAux);
-	pPps->bDeblockingFilterControlPresentFlag   = !!BsGetOneBit(pBsAux);
-	pPps->bConstainedIntraPredFlag              = !!BsGetOneBit(pBsAux);
-	pPps->bRedundantPicCntPresentFlag           = !!BsGetOneBit(pBsAux);	
-
-
-#ifdef MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
-	pCtx->bPpsAvailFlags[pCtx->iPpsTotalNum] = true;
-	++pCtx->iPpsTotalNum;
-#else	
-	pCtx->bPpsAvailFlags[uiPpsId] = true; // added for EC, 10/28/2009
-#endif //MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
-
-	return ERR_NONE;
-}
-
-/*! 
- *************************************************************************************
- * \brief	to parse SEI message payload
- *
- * \param 	pSei		sei message to be parsed output
- * \param	pBsAux		bitstream reader auxiliary 
- *
- * \return	0 - successed
- *		1 - failed
- *
- * \note	Call it in case eNalUnitType is NAL_UNIT_SEI.
- *************************************************************************************
- */
-int32_t ParseSei( void_t *pSei, PBitStringAux pBsAux )	// reserved Sei_Msg type
-{
-	
-
-	return ERR_NONE;
-}
-
-/*!
- *************************************************************************************
- * \brief	reset fmo list due to got Sps now
- *
- * \param	pCtx	decoder context
- *
- * \return	count number of fmo context units are reset
- *************************************************************************************
- */
-int32_t ResetFmoList( PWelsDecoderContext pCtx )
-{
-	int32_t iCountNum = 0;
-	if ( NULL != pCtx )
-	{
-		// Fixed memory leak due to PPS_ID might not be continuous sometimes, 1/5/2010
-		UninitFmoList( &pCtx->sFmoList[0], MAX_PPS_COUNT, pCtx->iActiveFmoNum );
-		iCountNum	= pCtx->iActiveFmoNum;
-		pCtx->iActiveFmoNum	= 0;
-	}
-	return iCountNum;
-}
-
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	au_parser.c
+ *
+ * \brief	Interfaces introduced in Access Unit level based parser
+ *
+ * \date	03/10/2009 Created
+ *
+ *************************************************************************************
+ */
+#include <string.h>
+#include "codec_def.h"
+#include "ls_defines.h"
+#include "macros.h"
+#include "au_parser.h"
+#include "decoder.h"
+#include "error_code.h"
+#include "dec_frame.h"
+#include "dec_golomb.h"
+#include "bit_stream.h"
+#include "utils.h"
+#include "codec_app_def.h"
+#include "memmgr_nal_unit.h"
+#include "decoder_core.h"
+#include "wels_common_basis.h"
+#include "decoder_core.h"
+#include "manage_dec_ref.h"
+#include "mem_align.h"
+
+namespace WelsDec {
+/*!
+ *************************************************************************************
+ * \brief	Start code prefix detection: a run of at least two 0x00 bytes that is
+ *		immediately followed by 0x01 (covers both 00 00 01 and 00 00 00 01).
+ *
+ * \param	kpBuf		bitstream payload buffer
+ * \param	pOffset		[out] byte offset from kpBuf to the first byte after the
+ *				start code prefix; written only when a prefix is found
+ * \param	iBufSize	size of kpBuf in bytes
+ *
+ * \return	pointer to the first byte after the start code prefix,
+ *		or NULL if no prefix exists within iBufSize bytes
+ *
+ * \note	N/A
+ *************************************************************************************
+ */
+uint8_t* DetectStartCodePrefix (const uint8_t* kpBuf, int32_t* pOffset, int32_t iBufSize) {
+  uint8_t* pBits = (uint8_t*)kpBuf;
+
+  do {
+    int32_t iIdx = 0;
+
+    // Skip the current run of zero bytes; iIdx becomes the length of that run.
+    while ((iIdx < iBufSize) && (! (*pBits))) {
+      ++ pBits;
+      ++ iIdx;
+    }
+    if (iIdx >= iBufSize)  break;	// only zeros (or nothing) left: no prefix
+
+    // Step over the first non-zero byte that terminated the run.
+    ++ iIdx;
+    ++ pBits;
+
+    // iIdx >= 3 means the run held at least two zeros; the terminator must be 0x01.
+    if ((iIdx >= 3) && ((* (pBits - 1)) == 0x1)) {
+      // Use a pointer difference: casting the pointers themselves to a 32-bit
+      // integer truncates them on 64-bit targets.
+      *pOffset = (int32_t) (pBits - kpBuf);
+      return pBits;
+    }
+
+    // Not a prefix: discard what was scanned and restart the zero count.
+    iBufSize -= iIdx;
+  }  while (1);
+
+  return NULL;
+}
+
+/*!
+ * \brief	Discard the NAL slot most recently taken from the access unit list and,
+ *		when earlier NALs are still queued, mark them as a complete access unit.
+ *
+ * \param	pCtx		decoder context (bAuReadyFlag may be set)
+ * \param	pCurAu		access unit list holding the slot to discard
+ * \param	uiAvailNalNum	value of pCurAu->uiAvailUnitsNum sampled right after the
+ *				slot was obtained (i.e. it counts the slot being discarded)
+ */
+static void_t DropCurrentNalAndCloseAu (PWelsDecoderContext pCtx, PAccessUnit pCurAu, uint32_t uiAvailNalNum) {
+  //if current NAL occur error when parsing, should clean it from pNalUnitsList
+  //otherwise, when Next good NAL decoding, this corrupt NAL is considered as normal NAL and lead to decoder crash
+  ForceClearCurrentNal (pCurAu);
+
+  if (uiAvailNalNum > 1) {
+    pCurAu->uiEndPos = uiAvailNalNum - 2;
+    pCtx->bAuReadyFlag = true;
+  }
+}
+
+/*!
+ *************************************************************************************
+ * \brief	to parse nal unit
+ *
+ * \param	pCtx		decoder context
+ * \param	pNalUnitHeader	parsed result of NAL Unit Header to output
+ * \param	pSrcRbsp	bitstream buffer to input; its first byte is the NAL header byte
+ * \param	iSrcRbspLen	size in bytes of pSrcRbsp
+ * \param	pSrcNal		not referenced by this function
+ * \param	iSrcNalLen	not referenced by this function
+ * \param	pConsumedBytes	incremented by the number of header bytes consumed
+ *				(1, plus NAL_UNIT_HEADER_EXT_SIZE for prefix / slice-extension NALs)
+ *
+ * \return	pointer just past the consumed header bytes, or NULL on failure
+ *		(empty input, forbidden_zero_bit set, missing parameter sets,
+ *		truncated header extension, unsupported extension flags,
+ *		no free NAL slot, or slice header parse error)
+ *
+ * \note	For slice NALs the slice header is parsed here as well and
+ *		pCtx->bAuReadyFlag is set once an access unit boundary is detected.
+ *************************************************************************************
+ */
+uint8_t* ParseNalHeader (PWelsDecoderContext pCtx, SNalUnitHeader* pNalUnitHeader, uint8_t* pSrcRbsp,
+                         int32_t iSrcRbspLen, uint8_t* pSrcNal, int32_t iSrcNalLen, int32_t* pConsumedBytes) {
+  PNalUnit pCurNal = NULL;
+  uint8_t* pNal     = pSrcRbsp;
+  int32_t iNalSize  = iSrcRbspLen;
+  PBitStringAux pBs = NULL;
+  bool_t bExtensionFlag = false;
+  int32_t iErr = ERR_NONE;
+  int32_t iBitSize = 0;
+
+  pNalUnitHeader->eNalUnitType = NAL_UNIT_UNSPEC_0;//SHOULD init it. because pCtx->sCurNalHead is common variable.
+
+  // Nothing to parse: the header byte below must not be read from an empty buffer.
+  if (NULL == pSrcRbsp || iSrcRbspLen <= 0) {
+    return NULL;
+  }
+
+  //remove the consecutive ZERO at the end of current NAL in the reverse order.--2011.6.1
+  {
+    int32_t iIndex = iSrcRbspLen - 1;
+    while (iIndex >= 0 && 0 == pSrcRbsp[iIndex]) {
+      --iNalSize;
+      --iIndex;
+    }
+  }
+
+  pNalUnitHeader->uiForbiddenZeroBit = (uint8_t) (pNal[0] >> 7);	// uiForbiddenZeroBit
+  if (pNalUnitHeader->uiForbiddenZeroBit) { //2010.4.14
+    return NULL; //uiForbiddenZeroBit should always equal to 0
+  }
+
+  pNalUnitHeader->uiNalRefIdc  = (uint8_t) (pNal[0] >> 5);		// uiNalRefIdc
+  pNalUnitHeader->eNalUnitType = (ENalUnitType) (pNal[0] & 0x1f);	// eNalUnitType
+
+  ++pNal;
+  --iNalSize;
+  ++ (*pConsumedBytes);
+
+#ifdef DEBUG_PARSE_INFO
+  WelsLog (pCtx, WELS_LOG_INFO, "nal type: %d \n", pNalUnitHeader->eNalUnitType);
+#endif
+
+  // Everything except SEI / SPS needs an SPS to have been seen already.
+  if (! (IS_SEI_NAL (pNalUnitHeader->eNalUnitType) || IS_SPS_NAL (pNalUnitHeader->eNalUnitType)
+         || pCtx->bSpsExistAheadFlag)) {
+    WelsLog (pCtx, WELS_LOG_WARNING,
+             "parse_nal(), no exist Sequence Parameter Sets ahead of sequence when try to decode NAL(type:%d).\n",
+             pNalUnitHeader->eNalUnitType);
+    pCtx->iErrorCode = dsNoParamSets;
+    return NULL;
+  }
+  // Everything except SEI / parameter sets needs a PPS to have been seen already.
+  if (! (IS_SEI_NAL (pNalUnitHeader->eNalUnitType) || IS_PARAM_SETS_NALS (pNalUnitHeader->eNalUnitType)
+         || pCtx->bPpsExistAheadFlag)) {
+    WelsLog (pCtx, WELS_LOG_WARNING,
+             "parse_nal(), no exist Picture Parameter Sets ahead of sequence when try to decode NAL(type:%d).\n",
+             pNalUnitHeader->eNalUnitType);
+    pCtx->iErrorCode = dsNoParamSets;
+    return NULL;
+  }
+  if ((IS_VCL_NAL_AVC_BASE (pNalUnitHeader->eNalUnitType) && ! (pCtx->bSpsExistAheadFlag || pCtx->bPpsExistAheadFlag)) ||
+      (IS_NEW_INTRODUCED_NAL (pNalUnitHeader->eNalUnitType) && ! (pCtx->bSpsExistAheadFlag || pCtx->bSubspsExistAheadFlag
+          || pCtx->bPpsExistAheadFlag))) {
+    WelsLog (pCtx, WELS_LOG_WARNING,
+             "ParseNalHeader(), no exist Parameter Sets ahead of sequence when try to decode slice(type:%d).\n",
+             pNalUnitHeader->eNalUnitType);
+    pCtx->iErrorCode |= dsNoParamSets;
+    return NULL;
+  }
+
+  switch (pNalUnitHeader->eNalUnitType) {
+  case NAL_UNIT_SEI:
+  case NAL_UNIT_SPS:
+    // Any NALs already queued are closed off as a complete access unit.
+    if (pCtx->pAccessUnitList->uiAvailUnitsNum > 0) {
+      pCtx->pAccessUnitList->uiEndPos = pCtx->pAccessUnitList->uiAvailUnitsNum - 1;
+      pCtx->bAuReadyFlag = true;
+    }
+    break;
+
+  case NAL_UNIT_PREFIX:
+    pCurNal = &pCtx->sPrefixNal;
+
+    if (iNalSize < NAL_UNIT_HEADER_EXT_SIZE) {
+      return NULL;
+    }
+
+    DecodeNalHeaderExt (pCurNal, pNal);
+
+    pNal            += NAL_UNIT_HEADER_EXT_SIZE;
+    iNalSize        -= NAL_UNIT_HEADER_EXT_SIZE;
+    *pConsumedBytes += NAL_UNIT_HEADER_EXT_SIZE;
+
+    pCurNal->sNalHeaderExt.sNalUnitHeader.uiForbiddenZeroBit = pNalUnitHeader->uiForbiddenZeroBit;
+    pCurNal->sNalHeaderExt.sNalUnitHeader.uiNalRefIdc        = pNalUnitHeader->uiNalRefIdc;
+    pCurNal->sNalHeaderExt.sNalUnitHeader.eNalUnitType       = pNalUnitHeader->eNalUnitType;
+
+    pBs = &pCtx->sBs;
+
+    iBitSize = (iNalSize << 3) - BsGetTrailingBits (pNal + iNalSize - 1); // convert into bit
+
+    InitBits (pBs, pNal, iBitSize);
+
+    ParsePrefixNalUnit (pCtx, pBs);
+
+    break;
+
+  case NAL_UNIT_CODED_SLICE_EXT:
+    bExtensionFlag = true;
+    // intentional fall-through: extension slices share the slice handling below
+  case NAL_UNIT_CODED_SLICE:
+  case NAL_UNIT_CODED_SLICE_IDR: {
+    PAccessUnit pCurAu = NULL;
+    uint32_t uiAvailNalNum;
+    pCurNal = MemGetNextNal (&pCtx->pAccessUnitList);
+    if (NULL == pCurNal) {
+      WelsLog (pCtx, WELS_LOG_WARNING, "MemGetNextNal() fail due out of memory.\n");
+      pCtx->iErrorCode |= dsOutOfMemory;
+      return NULL;
+    }
+
+    pCurNal->sNalHeaderExt.sNalUnitHeader.uiForbiddenZeroBit = pNalUnitHeader->uiForbiddenZeroBit;
+    pCurNal->sNalHeaderExt.sNalUnitHeader.uiNalRefIdc        = pNalUnitHeader->uiNalRefIdc;
+    pCurNal->sNalHeaderExt.sNalUnitHeader.eNalUnitType       = pNalUnitHeader->eNalUnitType;
+    pCurAu        = pCtx->pAccessUnitList;
+    uiAvailNalNum = pCurAu->uiAvailUnitsNum;
+
+    if (pNalUnitHeader->eNalUnitType == NAL_UNIT_CODED_SLICE_EXT) {
+      if (iNalSize < NAL_UNIT_HEADER_EXT_SIZE) {
+        // The slot obtained above must not stay queued as if it were a valid NAL.
+        DropCurrentNalAndCloseAu (pCtx, pCurAu, uiAvailNalNum);
+        return NULL;
+      }
+
+      DecodeNalHeaderExt (pCurNal, pNal);
+      if (pCurNal->sNalHeaderExt.uiQualityId != 0 ||
+          pCurNal->sNalHeaderExt.bUseRefBasePicFlag) {
+        if (pCurNal->sNalHeaderExt.uiQualityId != 0)
+          WelsLog (pCtx, WELS_LOG_WARNING, "ParseNalHeader():uiQualityId (%d) != 0, MGS not supported!\n",
+                   pCurNal->sNalHeaderExt.uiQualityId);
+        if (pCurNal->sNalHeaderExt.bUseRefBasePicFlag != 0)
+          WelsLog (pCtx, WELS_LOG_WARNING, "ParseNalHeader():bUseRefBasePicFlag (%d) != 0, MGS not supported!\n",
+                   pCurNal->sNalHeaderExt.bUseRefBasePicFlag);
+
+        pCtx->iErrorCode |= dsInvalidArgument;
+        DropCurrentNalAndCloseAu (pCtx, pCurAu, uiAvailNalNum);
+        return NULL;
+      }
+      pNal            += NAL_UNIT_HEADER_EXT_SIZE;
+      iNalSize        -= NAL_UNIT_HEADER_EXT_SIZE;
+      *pConsumedBytes += NAL_UNIT_HEADER_EXT_SIZE;
+
+    } else {
+      // Plain AVC slice: inherit header-extension syntax from a preceding prefix NAL, if any.
+      if (NAL_UNIT_PREFIX == pCtx->sPrefixNal.sNalHeaderExt.sNalUnitHeader.eNalUnitType) {
+        PrefetchNalHeaderExtSyntax (pCtx, pCurNal, &pCtx->sPrefixNal);
+      }
+
+      pCurNal->sNalHeaderExt.bIdrFlag = (NAL_UNIT_CODED_SLICE_IDR == pNalUnitHeader->eNalUnitType) ? true :
+                                        false;   //SHOULD update this flag for AVC if no prefix NAL
+      pCurNal->sNalHeaderExt.iNoInterLayerPredFlag = 1;
+    }
+
+    pBs = &pCurAu->pNalUnitsList[uiAvailNalNum - 1]->sNalData.sVclNal.sSliceBitsRead;
+    iBitSize = (iNalSize << 3) - BsGetTrailingBits (pNal + iNalSize - 1); // convert into bit
+    InitBits (pBs, pNal, iBitSize);
+    iErr = ParseSliceHeaderSyntaxs (pCtx, pBs, bExtensionFlag);
+    if (iErr != ERR_NONE) {
+      DropCurrentNalAndCloseAu (pCtx, pCurAu, uiAvailNalNum);
+
+#ifdef MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
+      if ((dsNoParamSets & pCtx->iErrorCode)
+          && (uiAvailNalNum <= 1)) { //no any data to decode and SPS/PPS ID mismatch, SHOULD request IDR
+#ifdef LONG_TERM_REF
+        pCtx->bParamSetsLostFlag = true;
+#else
+        pCtx->bReferenceLostAtT0Flag = true;
+#endif
+        ResetParameterSetsState (pCtx);
+      }
+#endif //MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
+      return NULL;
+    }
+
+    // The new slice opens another access unit: everything queued before it is ready.
+    if ((uiAvailNalNum > 1) &&
+        CheckAccessUnitBoundary (pCurAu->pNalUnitsList[uiAvailNalNum - 1], pCurAu->pNalUnitsList[uiAvailNalNum - 2],
+                                 pCurAu->pNalUnitsList[uiAvailNalNum - 1]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.pSps)) {
+      pCurAu->uiEndPos = uiAvailNalNum - 2;
+      pCtx->bAuReadyFlag = true;
+    }
+  }
+  break;
+  default:
+    break;
+  }
+
+  return pNal;
+}
+
+
+/*!
+ * \brief	Compare the NAL header extensions and slice headers of two consecutive
+ *		slice NALs and report whether an access unit boundary lies between them.
+ *
+ * \param	pLastNalHdrExt		NAL header extension of the previous NAL
+ * \param	pCurNalHeaderExt	NAL header extension of the current NAL
+ * \param	pLastSliceHeader	slice header of the previous NAL
+ * \param	pCurSliceHeader		slice header of the current NAL; its pSps selects
+ *					which picture-order-count fields are compared
+ *
+ * \return	TRUE if a boundary is detected, FALSE otherwise
+ */
+bool_t CheckAccessUnitBoundaryExt (PNalUnitHeaderExt pLastNalHdrExt, PNalUnitHeaderExt pCurNalHeaderExt,
+                                   PSliceHeader pLastSliceHeader, PSliceHeader pCurSliceHeader) {
+  const PSps kpActiveSps = pCurSliceHeader->pSps;
+
+  //Sub-clause 7.1.4.1.1 temporal_id
+  if (pLastNalHdrExt->uiTemporalId != pCurNalHeaderExt->uiTemporalId)
+    return TRUE;
+
+  // The next three fields are ordered: growing keeps the same access unit,
+  // shrinking starts a new one, equal defers to the following check.
+
+  // Subclause 7.4.1.2.5
+  if (pLastSliceHeader->iRedundantPicCnt != pCurSliceHeader->iRedundantPicCnt)
+    return (pLastSliceHeader->iRedundantPicCnt > pCurSliceHeader->iRedundantPicCnt) ? TRUE : FALSE;
+
+  // Subclause G7.4.1.2.4
+  if (pLastNalHdrExt->uiDependencyId != pCurNalHeaderExt->uiDependencyId)
+    return (pLastNalHdrExt->uiDependencyId > pCurNalHeaderExt->uiDependencyId) ? TRUE : FALSE;
+  if (pLastNalHdrExt->uiQualityId != pCurNalHeaderExt->uiQualityId)
+    return (pLastNalHdrExt->uiQualityId > pCurNalHeaderExt->uiQualityId) ? TRUE : FALSE;
+
+  // Subclause 7.4.1.2.4: any mismatch below is a boundary
+  const bool_t kbLastIsRef = (pLastNalHdrExt->sNalUnitHeader.uiNalRefIdc != NRI_PRI_LOWEST);
+  const bool_t kbCurIsRef  = (pCurNalHeaderExt->sNalUnitHeader.uiNalRefIdc != NRI_PRI_LOWEST);
+
+  if (pLastSliceHeader->iFrameNum != pCurSliceHeader->iFrameNum
+      || pLastSliceHeader->iPpsId != pCurSliceHeader->iPpsId
+      || pLastSliceHeader->bFieldPicFlag != pCurSliceHeader->bFieldPicFlag
+      || pLastSliceHeader->bBottomFiledFlag != pCurSliceHeader->bBottomFiledFlag
+      || kbLastIsRef != kbCurIsRef
+      || pLastNalHdrExt->bIdrFlag != pCurNalHeaderExt->bIdrFlag) {
+    return TRUE;
+  }
+
+  // idr_pic_id only matters when the current NAL is IDR
+  if (pCurNalHeaderExt->bIdrFlag && (pLastSliceHeader->uiIdrPicId != pCurSliceHeader->uiIdrPicId))
+    return TRUE;
+
+  // Picture order count fields depend on the POC type of the SPS
+  switch (kpActiveSps->uiPocType) {
+  case 0:
+    if (pLastSliceHeader->iPicOrderCntLsb != pCurSliceHeader->iPicOrderCntLsb
+        || pLastSliceHeader->iDeltaPicOrderCntBottom != pCurSliceHeader->iDeltaPicOrderCntBottom)
+      return TRUE;
+    break;
+  case 1:
+    if (pLastSliceHeader->iDeltaPicOrderCnt[0] != pCurSliceHeader->iDeltaPicOrderCnt[0]
+        || pLastSliceHeader->iDeltaPicOrderCnt[1] != pCurSliceHeader->iDeltaPicOrderCnt[1])
+      return TRUE;
+    break;
+  default:
+    break;
+  }
+
+  return FALSE;
+}
+
+
+/*!
+ * \brief	Report whether an access unit boundary lies between two consecutive
+ *		slice NAL units.
+ *
+ * \param	kpCurNal	current (newer) NAL unit
+ * \param	kpLastNal	previous NAL unit
+ * \param	kpSps		SPS whose uiPocType selects the picture-order-count fields compared
+ *
+ * \return	TRUE if a boundary is detected, FALSE otherwise
+ */
+bool_t CheckAccessUnitBoundary (const PNalUnit kpCurNal, const PNalUnit kpLastNal, const PSps kpSps) {
+  const PNalUnitHeaderExt kpPrevExt = &kpLastNal->sNalHeaderExt;
+  const PNalUnitHeaderExt kpCurExt  = &kpCurNal->sNalHeaderExt;
+  const SSliceHeader* kpPrevSh = &kpLastNal->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader;
+  const SSliceHeader* kpCurSh  = &kpCurNal->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader;
+
+  //Sub-clause 7.1.4.1.1 temporal_id
+  if (kpPrevExt->uiTemporalId != kpCurExt->uiTemporalId)
+    return TRUE;
+
+  // Ordered fields: an increase stays inside the access unit, a decrease
+  // starts a new one, equality falls through to the next check.
+
+  // Subclause 7.4.1.2.5
+  if (kpPrevSh->iRedundantPicCnt != kpCurSh->iRedundantPicCnt)
+    return (kpPrevSh->iRedundantPicCnt > kpCurSh->iRedundantPicCnt) ? TRUE : FALSE;
+
+  // Subclause G7.4.1.2.4
+  if (kpPrevExt->uiDependencyId != kpCurExt->uiDependencyId)
+    return (kpPrevExt->uiDependencyId > kpCurExt->uiDependencyId) ? TRUE : FALSE;
+  if (kpPrevExt->uiQualityId != kpCurExt->uiQualityId)
+    return (kpPrevExt->uiQualityId > kpCurExt->uiQualityId) ? TRUE : FALSE;
+
+  // Subclause 7.4.1.2.4: any mismatch below is a boundary
+  const bool_t kbPrevIsRef = (kpPrevExt->sNalUnitHeader.uiNalRefIdc != NRI_PRI_LOWEST);
+  const bool_t kbCurIsRef  = (kpCurExt->sNalUnitHeader.uiNalRefIdc != NRI_PRI_LOWEST);
+
+  if (kpPrevSh->iFrameNum != kpCurSh->iFrameNum
+      || kpPrevSh->iPpsId != kpCurSh->iPpsId
+      || kpPrevSh->bFieldPicFlag != kpCurSh->bFieldPicFlag
+      || kpPrevSh->bBottomFiledFlag != kpCurSh->bBottomFiledFlag
+      || kbPrevIsRef != kbCurIsRef
+      || kpPrevExt->bIdrFlag != kpCurExt->bIdrFlag) {
+    return TRUE;
+  }
+
+  // idr_pic_id only matters when the current NAL is IDR
+  if (kpCurExt->bIdrFlag && (kpPrevSh->uiIdrPicId != kpCurSh->uiIdrPicId))
+    return TRUE;
+
+  // Picture order count fields depend on the POC type of the SPS
+  switch (kpSps->uiPocType) {
+  case 0:
+    if (kpPrevSh->iPicOrderCntLsb != kpCurSh->iPicOrderCntLsb
+        || kpPrevSh->iDeltaPicOrderCntBottom != kpCurSh->iDeltaPicOrderCntBottom)
+      return TRUE;
+    break;
+  case 1:
+    if (kpPrevSh->iDeltaPicOrderCnt[0] != kpCurSh->iDeltaPicOrderCnt[0]
+        || kpPrevSh->iDeltaPicOrderCnt[1] != kpCurSh->iDeltaPicOrderCnt[1])
+      return TRUE;
+    break;
+  default:
+    break;
+  }
+
+  return FALSE;
+}
+
+/*!
+ *************************************************************************************
+ * \brief	to parse NON VCL NAL Units
+ *
+ * \param	pCtx		decoder context; pCtx->sCurNalHead.eNalUnitType selects the parser
+ * \param	pRbsp		rbsp buffer of NAL Unit
+ * \param	kiSrcLen	length of rbsp buffer in bytes
+ *
+ * \return	ERR_NONE on success (also for NAL types that need no parsing here),
+ *		otherwise the error value reported by ParseSps() / ParsePps()
+ *
+ * \note	On a parameter-set parse failure dsNoParamSets is OR-ed into pCtx->iErrorCode.
+ *************************************************************************************
+ */
+int32_t ParseNonVclNal (PWelsDecoderContext pCtx, uint8_t* pRbsp, const int32_t kiSrcLen) {
+  PBitStringAux pBs     = &pCtx->sBs;	// SBitStringAux instance for non VCL NALs decoding
+  ENalUnitType eNalType = pCtx->sCurNalHead.eNalUnitType;
+  int32_t iPicWidth     = 0;
+  int32_t iPicHeight    = 0;
+  int32_t iBitSize      = 0;
+  int32_t iErr          = ERR_NONE;
+
+  // The trailing-bit count is read from the last payload byte, which only exists
+  // for a non-empty buffer; an empty buffer simply yields a bit size of 0.
+  if (kiSrcLen > 0)
+    iBitSize = (kiSrcLen << 3) - BsGetTrailingBits (pRbsp + kiSrcLen - 1); // convert into bit
+
+  switch (eNalType) {
+  case NAL_UNIT_SPS:
+  case NAL_UNIT_SUBSET_SPS:
+    // NOTE(review): when iBitSize <= 0 the reader is not re-initialised and
+    // ParseSps() still runs on whatever state pBs holds - confirm this is intended.
+    if (iBitSize > 0)
+      InitBits (pBs, pRbsp, iBitSize);
+#ifdef DEBUG_PARSE_INFO
+    WelsLog (pCtx, WELS_LOG_INFO, "parsing nal: %d \n", eNalType);
+#endif
+    iErr = ParseSps (pCtx, pBs, &iPicWidth, &iPicHeight);
+    if (ERR_NONE != iErr) {	// modified for pSps/pSubsetSps invalid, 12/1/2009
+      pCtx->iErrorCode |= dsNoParamSets;
+      return iErr;
+    }
+
+    UpdateMaxPictureResolution (pCtx, iPicWidth, iPicHeight);
+
+    break;
+
+  case NAL_UNIT_PPS:
+    // NOTE(review): same remark as for SPS regarding iBitSize <= 0.
+    if (iBitSize > 0)
+      InitBits (pBs, pRbsp, iBitSize);
+#ifdef DEBUG_PARSE_INFO
+    WelsLog (pCtx, WELS_LOG_INFO, "parsing nal: %d \n", eNalType);
+#endif
+    iErr = ParsePps (pCtx, &pCtx->sPpsBuffer[0], pBs);
+    if (ERR_NONE != iErr) {	// modified for pps invalid, 12/1/2009
+      pCtx->iErrorCode |= dsNoParamSets;
+      return iErr;
+    }
+
+    pCtx->bPpsExistAheadFlag = true;
+
+    break;
+
+  // The remaining types are accepted but carry nothing that is parsed here.
+  case NAL_UNIT_SEI:
+  case NAL_UNIT_PREFIX:
+  case NAL_UNIT_CODED_SLICE_DPA:
+  case NAL_UNIT_CODED_SLICE_DPB:
+  case NAL_UNIT_CODED_SLICE_DPC:
+  default:
+    break;
+  }
+
+  return iErr;
+}
+
+/*!
+ * \brief	Read the reference base picture marking syntax from the bitstream.
+ *
+ * \param	pBs			bitstream reader
+ * \param	pRefBasePicMarking	[out] receives the adaptive-mode flag and, when that
+ *					flag is set, up to MAX_MMCO_COUNT marking operations
+ *
+ * \note	Reading stops after the MMCO_END operation has been stored, or once
+ *		MAX_MMCO_COUNT operations have been read.
+ */
+void_t ParseRefBasePicMarking (PBitStringAux pBs, PRefBasePicMarking pRefBasePicMarking) {
+  const bool_t kbAdaptive = !!BsGetOneBit (pBs);
+  int32_t iOpIdx = 0;
+
+  pRefBasePicMarking->bAdaptiveRefBasePicMarkingModeFlag = kbAdaptive;
+  if (!kbAdaptive)
+    return;	// no operation list follows
+
+  do {
+    const uint32_t kuiOp = BsGetUe (pBs);
+
+    pRefBasePicMarking->mmco_base[iOpIdx].uiMmcoType = kuiOp;
+
+    if (MMCO_END == kuiOp)
+      break;	// terminator is stored, nothing else to read
+
+    if (MMCO_SHORT2UNUSED == kuiOp) {
+      pRefBasePicMarking->mmco_base[iOpIdx].uiDiffOfPicNums = 1 + BsGetUe (pBs);
+      pRefBasePicMarking->mmco_base[iOpIdx].iShortFrameNum  = 0;
+    } else if (MMCO_LONG2UNUSED == kuiOp) {
+      pRefBasePicMarking->mmco_base[iOpIdx].uiLongTermPicNum = BsGetUe (pBs);
+    }
+  } while (++iOpIdx < MAX_MMCO_COUNT);
+}
+
+/*!
+ * \brief	Parse the payload of the prefix NAL unit held in pCtx->sPrefixNal.
+ *
+ * \param	pCtx	decoder context; the header extension of pCtx->sPrefixNal is read
+ *			and its prefix payload fields are filled in
+ * \param	pBs	bitstream reader positioned at the prefix NAL payload
+ *
+ * \note	Nothing is read when uiNalRefIdc of the prefix NAL is 0.
+ */
+void_t ParsePrefixNalUnit (PWelsDecoderContext pCtx, PBitStringAux pBs) {
+  PNalUnit pPrefix = &pCtx->sPrefixNal;
+
+  if (0 == pPrefix->sNalHeaderExt.sNalUnitHeader.uiNalRefIdc)
+    return;
+
+  PNalUnitHeaderExt pHdrExt = &pPrefix->sNalHeaderExt;
+  PPrefixNalUnit pPayload   = &pPrefix->sNalData.sPrefixNal;
+
+  pPayload->bStoreRefBasePicFlag = !!BsGetOneBit (pBs);
+
+  // Base picture marking follows only for non-IDR NALs that use or store a base picture.
+  if (!pHdrExt->bIdrFlag && (pHdrExt->bUseRefBasePicFlag || pPayload->bStoreRefBasePicFlag))
+    ParseRefBasePicMarking (pBs, &pPayload->sRefPicBaseMarking);
+
+  pPayload->bPrefixNalUnitAdditionalExtFlag = !!BsGetOneBit (pBs);
+  if (pPayload->bPrefixNalUnitAdditionalExtFlag)
+    pPayload->bPrefixNalUnitExtFlag = !!BsGetOneBit (pBs);
+}
+
+
+/*!
+ * \brief	Parse the SVC extension part of a subset SPS.
+ *
+ * \param	pCtx	decoder context (used for logging)
+ * \param	pSpsExt	subset SPS whose sSpsSvcExt member is filled in
+ * \param	pBs	bitstream reader positioned at the SVC extension
+ *
+ * \return	0 on success, or
+ *		GENERATE_ERROR_NO (ERR_LEVEL_PARAM_SETS, ERR_INFO_INVALID_ESS) when
+ *		extended_spatial_scalability is greater than 2
+ */
+int32_t DecodeSpsSvcExt (PWelsDecoderContext pCtx, PSubsetSps pSpsExt, PBitStringAux pBs) {
+  PSpsSvcExt pExt = &pSpsExt->sSpsSvcExt;
+
+  pExt->bInterLayerDeblockingFilterCtrlPresentFlag = !!BsGetOneBit (pBs);
+  pExt->uiExtendedSpatialScalability               = BsGetBits (pBs, 2);
+  // NOTE(review): the log text says "!= 0" while the condition only rejects values
+  // above 2 - confirm which of the two reflects the intended support level.
+  if (pExt->uiExtendedSpatialScalability > 2) {
+    WelsLog (pCtx, WELS_LOG_WARNING, "DecodeSpsSvcExt():extended_spatial_scalability (%d) != 0, ESS not supported!\n",
+             pExt->uiExtendedSpatialScalability);
+    return GENERATE_ERROR_NO (ERR_LEVEL_PARAM_SETS, ERR_INFO_INVALID_ESS);
+  }
+
+  // Both chroma phase fields are always taken from the bitstream here, so no
+  // default value is assigned beforehand.
+  pExt->uiChromaPhaseXPlus1Flag = BsGetOneBit (pBs);
+  pExt->uiChromaPhaseYPlus1     = BsGetBits (pBs, 2);
+
+  // Reference-layer values default to the current layer's values and a zero offset.
+  pExt->uiSeqRefLayerChromaPhaseXPlus1Flag = pExt->uiChromaPhaseXPlus1Flag;
+  pExt->uiSeqRefLayerChromaPhaseYPlus1     = pExt->uiChromaPhaseYPlus1;
+  memset (&pExt->sSeqScaledRefLayer, 0, sizeof (SPosOffset));
+
+  if (pExt->uiExtendedSpatialScalability == 1) {
+    // Explicit reference-layer chroma phase and scaled offsets are present.
+    SPosOffset* const kpPos = &pExt->sSeqScaledRefLayer;
+    pExt->uiSeqRefLayerChromaPhaseXPlus1Flag = BsGetOneBit (pBs);
+    pExt->uiSeqRefLayerChromaPhaseYPlus1     = BsGetBits (pBs, 2);
+
+    kpPos->iLeftOffset   = BsGetSe (pBs);
+    kpPos->iTopOffset    = BsGetSe (pBs);
+    kpPos->iRightOffset  = BsGetSe (pBs);
+    kpPos->iBottomOffset = BsGetSe (pBs);
+  }
+
+  pExt->bSeqTCoeffLevelPredFlag      = !!BsGetOneBit (pBs);
+  pExt->bAdaptiveTCoeffLevelPredFlag = false;
+  if (pExt->bSeqTCoeffLevelPredFlag)
+    pExt->bAdaptiveTCoeffLevelPredFlag = !!BsGetOneBit (pBs);
+  pExt->bSliceHeaderRestrictionFlag  = !!BsGetOneBit (pBs);
+
+  return 0;
+}
+
+/*!
+ *************************************************************************************
+ * \brief	to parse Sequence Parameter Set (SPS)
+ *
+ * \param	pCtx		Decoder context
+ * \param	pBsAux		bitstream reader auxiliary
+ * \param	pPicWidth	picture width current Sps represented
+ * \param	pPicHeight	picture height current Sps represented
+ *
+ * \return	0 - succeeded
+ *		1 - failed
+ *
+ * \note	Call it in case eNalUnitType is SPS.
+ *************************************************************************************
+ */
+
+
+int32_t ParseSps (PWelsDecoderContext pCtx, PBitStringAux pBsAux, int32_t* pPicWidth, int32_t* pPicHeight) {
+  PBitStringAux pBs		= pBsAux;
+  PSps pSps				= NULL;
+  PSubsetSps pSubsetSps	= NULL;
+  SNalUnitHeader* pNalHead = &pCtx->sCurNalHead;
+  ProfileIdc	uiProfileIdc;
+  uint8_t	uiLevelIdc;
+  int32_t iSpsId;
+  bool_t bConstraintSetFlags[6] = { false };
+  const bool_t kbUseSubsetFlag   = IS_SUBSET_SPS_NAL (pNalHead->eNalUnitType);
+
+
+  if (kbUseSubsetFlag) {	// SubsetSps
+    pCtx->bSubspsExistAheadFlag	= true;
+  } else {	// Sps
+    pCtx->bSpsExistAheadFlag		= true;
+
+    // added for EC, 10/28/2009
+    // for safe
+    memset (&pCtx->bSpsAvailFlags[0], 0, sizeof (pCtx->bSpsAvailFlags));
+    memset (&pCtx->bSubspsAvailFlags[0], 0, sizeof (pCtx->bSubspsAvailFlags));
+    memset (&pCtx->bPpsAvailFlags[0], 0, sizeof (pCtx->bPpsAvailFlags));
+
+#ifdef MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
+    pCtx->iSpsTotalNum    = 0;
+    pCtx->iSubspsTotalNum = 0;
+    pCtx->iPpsTotalNum    = 0;
+#endif //MOSAIC_AVOID_BASED_ON_SPS_PPS_ID		
+  }
+
+  uiProfileIdc	= BsGetBits (pBs, 8);
+  bConstraintSetFlags[0]	= !!BsGetOneBit (pBs);	// constraint_set0_flag
+  bConstraintSetFlags[1]	= !!BsGetOneBit (pBs);	// constraint_set1_flag
+  bConstraintSetFlags[2]	= !!BsGetOneBit (pBs);	// constraint_set2_flag
+  bConstraintSetFlags[3]	= !!BsGetOneBit (pBs);	// constraint_set3_flag
+  bConstraintSetFlags[4]	= !!BsGetOneBit (pBs);	// constraint_set4_flag
+  bConstraintSetFlags[5]	= !!BsGetOneBit (pBs);	// constraint_set5_flag
+  BsGetBits (pBs, 2);							// reserved_zero_2bits, equal to 0
+  uiLevelIdc	= BsGetBits (pBs, 8);				// level_idc
+
+  iSpsId		= BsGetUe (pBs);					// seq_parameter_set_id
+
+
+  if (iSpsId >= MAX_SPS_COUNT || iSpsId < 0) {	// Modified to check invalid negative iSpsId, 12/1/2009
+    WelsLog (pCtx, WELS_LOG_WARNING, " iSpsId is out of range! \n");
+    return GENERATE_ERROR_NO (ERR_LEVEL_PARAM_SETS, ERR_INFO_SPS_ID_OVERFLOW);
+  }
+
+  if (kbUseSubsetFlag) {
+#ifdef MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
+    pSubsetSps = &pCtx->sSubsetSpsBuffer[pCtx->iSubspsTotalNum];
+    pCtx->bSubspsAvailFlags[pCtx->iSubspsTotalNum] = true;
+
+    pSubsetSps->sSps.iSpsId = iSpsId;
+    pSps = &pSubsetSps->sSps;
+    ++pCtx->iSubspsTotalNum;
+#else
+    pSubsetSps	= &pCtx->sSubsetSpsBuffer[iSpsId];
+    pSps		= &pSubsetSps->sSps;
+    pCtx->bSubspsAvailFlags[iSpsId]	= true; // added for EC, 10/28/2009
+#endif //MOSAIC_AVOID_BASED_ON_SPS_PPS_ID			
+  } else {
+#ifdef MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
+    pSps = &pCtx->sSpsBuffer[pCtx->iSpsTotalNum];
+    pCtx->bSpsAvailFlags[pCtx->iSpsTotalNum] = true;
+
+    pSps->iSpsId = iSpsId;
+    ++pCtx->iSpsTotalNum;
+#else
+    pSps = &pCtx->sSpsBuffer[iSpsId];
+    pCtx->bSpsAvailFlags[iSpsId] = true; // added for EC, 10/28/2009
+#endif //MOSAIC_AVOID_BASED_ON_SPS_PPS_ID		
+  }
+
+  // syntax elements in default
+  pSps->uiChromaFormatIdc	= 1;
+  pSps->uiBitDepthLuma		=
+    pSps->uiBitDepthChroma	= 8;
+
+  pSps->uiProfileIdc	= uiProfileIdc;
+  pSps->uiLevelIdc	= uiLevelIdc;
+  pSps->iSpsId		= iSpsId;
+
+  if (PRO_SCALABLE_BASELINE == uiProfileIdc || PRO_SCALABLE_HIGH == uiProfileIdc ||
+      PRO_HIGH == uiProfileIdc || PRO_HIGH10 == uiProfileIdc ||
+      PRO_HIGH422 == uiProfileIdc || PRO_HIGH444 == uiProfileIdc ||
+      PRO_CAVLC444 == uiProfileIdc || 44 == uiProfileIdc) {
+
+    pSps->uiChromaFormatIdc = BsGetUe (pBs);
+    if (pSps->uiChromaFormatIdc != 1) {
+      WelsLog (pCtx, WELS_LOG_WARNING, "ParseSps(): chroma_format_idc (%d) = 1 supported.\n", pSps->uiChromaFormatIdc);
+      return GENERATE_ERROR_NO (ERR_LEVEL_PARAM_SETS, ERR_INFO_UNSUPPORTED_NON_BASELINE);
+    }
+    pSps->uiChromaArrayType = pSps->uiChromaFormatIdc;
+    pSps->uiBitDepthLuma		= 8 + BsGetUe (pBs);
+    if (pSps->uiBitDepthLuma != 8) {
+      WelsLog (pCtx, WELS_LOG_WARNING, "ParseSps(): bit_depth_luma (%d) Only 8 bit supported.\n", pSps->uiBitDepthLuma);
+      return GENERATE_ERROR_NO (ERR_LEVEL_PARAM_SETS, ERR_INFO_UNSUPPORTED_NON_BASELINE);
+    }
+
+    pSps->uiBitDepthChroma	= 8 + BsGetUe (pBs);
+    if (pSps->uiBitDepthChroma != 8) {
+      WelsLog (pCtx, WELS_LOG_WARNING, "ParseSps(): bit_depth_chroma (%d). Only 8 bit supported.\n", pSps->uiBitDepthChroma);
+      return GENERATE_ERROR_NO (ERR_LEVEL_PARAM_SETS, ERR_INFO_UNSUPPORTED_NON_BASELINE);
+    }
+    pSps->bQpPrimeYZeroTransfBypassFlag	= !!BsGetOneBit (pBs);
+    pSps->bSeqScalingMatrixPresentFlag	= !!BsGetOneBit (pBs);
+
+    if (pSps->bSeqScalingMatrixPresentFlag) {	// For high profile, it is not used in current application. FIXME
+      WelsLog (pCtx, WELS_LOG_WARNING, "ParseSps(): seq_scaling_matrix_present_flag (%d). Feature not supported.\n",
+               pSps->bSeqScalingMatrixPresentFlag);
+      return GENERATE_ERROR_NO (ERR_LEVEL_PARAM_SETS, ERR_INFO_UNSUPPORTED_NON_BASELINE);
+    }
+  }
+
+  pSps->uiLog2MaxFrameNum	= 4 + BsGetUe (pBs);	// log2_max_frame_num_minus4
+  pSps->uiPocType			= BsGetUe (pBs);		// pic_order_cnt_type
+
+  if (0 == pSps->uiPocType) {
+    pSps->iLog2MaxPocLsb	= 4 + BsGetUe (pBs);	// log2_max_pic_order_cnt_lsb_minus4
+
+  } else if (1 == pSps->uiPocType) {
+    int32_t i;
+    pSps->bDeltaPicOrderAlwaysZeroFlag	= !!BsGetOneBit (pBs);	// bDeltaPicOrderAlwaysZeroFlag
+    pSps->iOffsetForNonRefPic			= BsGetSe (pBs);		// iOffsetForNonRefPic
+    pSps->iOffsetForTopToBottomField	= BsGetSe (pBs);		// iOffsetForTopToBottomField
+    pSps->iNumRefFramesInPocCycle		= BsGetUe (pBs);	// num_ref_frames_in_pic_order_cnt_cycle
+    for (i = 0; i < pSps->iNumRefFramesInPocCycle; i++)
+      pSps->iOffsetForRefFrame[ i ]	= BsGetSe (pBs);		// iOffsetForRefFrame[ i ]
+  }
+  if (pSps->uiPocType > 2) {
+    WelsLog (pCtx, WELS_LOG_WARNING, " illegal pic_order_cnt_type: %d ! \n", pSps->uiPocType);
+    return GENERATE_ERROR_NO (ERR_LEVEL_PARAM_SETS, ERR_INFO_INVALID_POC_TYPE);
+  }
+
+  pSps->iNumRefFrames	= BsGetUe (pBs);		// max_num_ref_frames
+  pSps->bGapsInFrameNumValueAllowedFlag	= !!BsGetOneBit (pBs);	// bGapsInFrameNumValueAllowedFlag
+  pSps->iMbWidth		= 1 + BsGetUe (pBs);		// pic_width_in_mbs_minus1
+  pSps->iMbHeight		= 1 + BsGetUe (pBs);		// pic_height_in_map_units_minus1
+  pSps->uiTotalMbCount	= pSps->iMbWidth * pSps->iMbHeight;
+  pSps->bFrameMbsOnlyFlag	= !!BsGetOneBit (pBs);	// frame_mbs_only_flag
+
+  if (!pSps->bFrameMbsOnlyFlag) {
+    WelsLog (pCtx, WELS_LOG_WARNING, "ParseSps(): frame_mbs_only_flag (%d) not supported.\n", pSps->bFrameMbsOnlyFlag);
+    return GENERATE_ERROR_NO (ERR_LEVEL_PARAM_SETS, ERR_INFO_UNSUPPORTED_MBAFF);
+  }
+  pSps->bDirect8x8InferenceFlag	= !!BsGetOneBit (pBs);	// direct_8x8_inference_flag
+  pSps->bFrameCroppingFlag		= !!BsGetOneBit (pBs);	// frame_cropping_flag
+  if (pSps->bFrameCroppingFlag) {
+    pSps->sFrameCrop.iLeftOffset	= BsGetUe (pBs);	// frame_crop_left_offset
+    pSps->sFrameCrop.iRightOffset	= BsGetUe (pBs);	// frame_crop_right_offset
+    pSps->sFrameCrop.iTopOffset		= BsGetUe (pBs);	// frame_crop_top_offset
+    pSps->sFrameCrop.iBottomOffset	= BsGetUe (pBs);	// frame_crop_bottom_offset
+  } else {
+    pSps->sFrameCrop.iLeftOffset	= 0;				// frame_crop_left_offset
+    pSps->sFrameCrop.iRightOffset	= 0;				// frame_crop_right_offset
+    pSps->sFrameCrop.iTopOffset		= 0;				// frame_crop_top_offset
+    pSps->sFrameCrop.iBottomOffset	= 0;				// frame_crop_bottom_offset
+  }
+  pSps->bVuiParamPresentFlag			= !!BsGetOneBit (pBs);	// vui_parameters_present_flag
+
+  // Check if SPS SVC extension applicated
+  if (kbUseSubsetFlag && (PRO_SCALABLE_BASELINE == uiProfileIdc || PRO_SCALABLE_HIGH == uiProfileIdc)) {
+    if (DecodeSpsSvcExt (pCtx, pSubsetSps, pBs) != ERR_NONE) {
+      return -1;
+    }
+
+    pSubsetSps->bSvcVuiParamPresentFlag = !!BsGetOneBit (pBs);
+    if (pSubsetSps->bSvcVuiParamPresentFlag) {
+    }
+  }
+
+
+  if (PRO_SCALABLE_BASELINE == uiProfileIdc || PRO_SCALABLE_HIGH == uiProfileIdc)
+    pCtx->bAvcBasedFlag	= false;
+  else
+    pCtx->bAvcBasedFlag	= true;	// added for avc base pBs
+
+  *pPicWidth	= pSps->iMbWidth << 4;
+  *pPicHeight	= pSps->iMbHeight << 4;
+
+  return 0;
+}
+
+/*!
+ *************************************************************************************
+ * \brief   Parse a Picture Parameter Set (PPS) from the bitstream into an entry of pPpsList.
+ *
+ * \param   pCtx        decoder context; its PPS availability flag is set on success
+ * \param   pPpsList    array of PPS entries; the selected entry is filled in place
+ * \param   pBsAux      bitstream reader; the first value read from it is the PPS id
+ *
+ * \return  ERR_NONE on success; otherwise an ERR_INFO_* code, returned either directly
+ *          or wrapped by GENERATE_ERROR_NO (ERR_LEVEL_PARAM_SETS, ...)
+ *
+ * \note    Call it in case eNalUnitType is PPS. On error the selected entry may be partially written.
+ *************************************************************************************
+ */
+int32_t ParsePps (PWelsDecoderContext pCtx, PPps pPpsList, PBitStringAux pBsAux) {
+
+  PPps pPps = NULL;
+  uint32_t uiPpsId = 0;
+  uint32_t iTmp;  // loop index over slice groups
+
+  uiPpsId = BsGetUe (pBsAux);  // pic_parameter_set_id
+  if (uiPpsId >= MAX_PPS_COUNT) {
+    return ERR_INFO_PPS_ID_OVERFLOW;
+  }
+  // Select the destination entry: slot iPpsTotalNum in the mosaic-avoid build, else the slot indexed by the PPS id.
+#ifdef MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
+  pPps = &pPpsList[pCtx->iPpsTotalNum];  // NOTE(review): iPpsTotalNum is not range-checked in this function - confirm it is bounded elsewhere
+#else
+  pPps = &pPpsList[uiPpsId];
+#endif //MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
+
+  // From here on the entry is modified in place, so an early error return leaves it partially updated.
+  pPps->iPpsId = uiPpsId;
+  pPps->iSpsId = BsGetUe (pBsAux);  // seq_parameter_set_id
+
+  if (pPps->iSpsId >= MAX_SPS_COUNT) {
+    return ERR_INFO_SPS_ID_OVERFLOW;
+  }
+
+  pPps->bEntropyCodingModeFlag = !!BsGetOneBit (pBsAux);  // entropy_coding_mode_flag
+  pPps->bPicOrderPresentFlag   = !!BsGetOneBit (pBsAux);  // pic_order_present_flag
+
+  pPps->uiNumSliceGroups = 1 + BsGetUe (pBsAux);  // num_slice_groups_minus1
+
+  if (pPps->uiNumSliceGroups > MAX_SLICEGROUP_IDS) {
+    return ERR_INFO_INVALID_SLICEGROUP;
+  }
+
+  if (pPps->uiNumSliceGroups > 1) {
+    pPps->uiSliceGroupMapType = BsGetUe (pBsAux);  // slice_group_map_type
+    if (pPps->uiSliceGroupMapType > 1) {
+      WelsLog (pCtx, WELS_LOG_WARNING, "ParsePps(): slice_group_map_type (%d): support only 0,1.\n",
+               pPps->uiSliceGroupMapType);
+      return GENERATE_ERROR_NO (ERR_LEVEL_PARAM_SETS, ERR_INFO_UNSUPPORTED_FMOTYPE);
+    }
+
+    switch (pPps->uiSliceGroupMapType) {
+    case 0:  // one run length is read per slice group
+      for (iTmp = 0; iTmp < pPps->uiNumSliceGroups; iTmp++) {
+        pPps->uiRunLength[iTmp] = 1 + BsGetUe (pBsAux);  // run_length_minus1[ iTmp ]
+      }
+      break;
+    default:  // map type 1: no additional fields are read here
+      break;
+    }
+  }
+
+  pPps->uiNumRefIdxL0Active = 1 + BsGetUe (pBsAux);  // num_ref_idx_l0_default_active_minus1
+  pPps->uiNumRefIdxL1Active = 1 + BsGetUe (pBsAux);  // num_ref_idx_l1_default_active_minus1
+
+  if (pPps->uiNumRefIdxL0Active > MAX_REF_PIC_COUNT ||
+      pPps->uiNumRefIdxL1Active > MAX_REF_PIC_COUNT) {
+    return ERR_INFO_REF_COUNT_OVERFLOW;
+  }
+
+  pPps->bWeightedPredFlag  = !!BsGetOneBit (pBsAux);  // weighted_pred_flag
+  pPps->uiWeightedBipredIdc = BsGetBits (pBsAux, 2);  // weighted_bipred_idc
+  if (pPps->bWeightedPredFlag || pPps->uiWeightedBipredIdc != 0) {
+    WelsLog (pCtx, WELS_LOG_WARNING, "ParsePps(): weighted_pred_flag (%d) weighted_bipred_idc (%d) neither supported.\n",
+             pPps->bWeightedPredFlag, pPps->uiWeightedBipredIdc);
+    return GENERATE_ERROR_NO (ERR_LEVEL_PARAM_SETS, ERR_INFO_UNSUPPORTED_WP);
+  }
+
+  pPps->iPicInitQp = 26 + BsGetSe (pBsAux);  // pic_init_qp_minus26
+  pPps->iPicInitQs = 26 + BsGetSe (pBsAux);  // pic_init_qs_minus26
+
+  pPps->iChromaQpIndexOffset                  = BsGetSe (pBsAux);  // chroma_qp_index_offset
+  pPps->bDeblockingFilterControlPresentFlag   = !!BsGetOneBit (pBsAux);  // deblocking_filter_control_present_flag
+  pPps->bConstainedIntraPredFlag              = !!BsGetOneBit (pBsAux);  // constrained_intra_pred_flag
+  pPps->bRedundantPicCntPresentFlag           = !!BsGetOneBit (pBsAux);  // redundant_pic_cnt_present_flag
+
+  // All fields parsed: mark the entry that was just written as available.
+#ifdef MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
+  pCtx->bPpsAvailFlags[pCtx->iPpsTotalNum] = true;
+  ++pCtx->iPpsTotalNum;
+#else
+  pCtx->bPpsAvailFlags[uiPpsId] = true; // added for EC, 10/28/2009
+#endif //MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
+
+  return ERR_NONE;
+}
+
+/*!
+ *************************************************************************************
+ * \brief   Placeholder for parsing an SEI message payload; no parsing is implemented.
+ *
+ * \param   pSei        destination for the parsed SEI message (currently unused)
+ * \param   pBsAux      bitstream reader (currently unused; nothing is read from it)
+ *
+ * \return  ERR_NONE in all cases
+ *          (the stub has no failure path)
+ *
+ * \note    Call it in case eNalUnitType is NAL_UNIT_SEI.
+ *************************************************************************************
+ */
+int32_t ParseSei (void_t* pSei, PBitStringAux pBsAux) {	// stub reserved for SEI message parsing
+
+
+  return ERR_NONE;
+}
+
+/*!
+ *************************************************************************************
+ * \brief   Release the decoder's FMO list and reset its active count to zero.
+ *
+ * \param   pCtx    decoder context; when NULL nothing is done
+ *
+ * \return  value of pCtx->iActiveFmoNum before the reset, or 0 when pCtx is NULL
+ *************************************************************************************
+ */
+int32_t ResetFmoList (PWelsDecoderContext pCtx) {
+  int32_t iCountNum = 0;
+  if (NULL != pCtx) {
+    // The whole list (MAX_PPS_COUNT slots) is passed because PPS ids may not be contiguous (leak fix, 1/5/2010).
+    UninitFmoList (&pCtx->sFmoList[0], MAX_PPS_COUNT, pCtx->iActiveFmoNum);
+    iCountNum	= pCtx->iActiveFmoNum;
+    pCtx->iActiveFmoNum	= 0;
+  }
+  return iCountNum;
+}
+
 } // namespace WelsDec
\ No newline at end of file
--- a/codec/decoder/core/src/bit_stream.cpp
+++ b/codec/decoder/core/src/bit_stream.cpp
@@ -43,53 +43,47 @@
 namespace WelsDec {
 
 #ifdef WORDS_BIGENDIAN
-inline uint32_t EndianFix(uint32_t uiX)
-{
-	return uiX;
+inline uint32_t EndianFix (uint32_t uiX) { // WORDS_BIGENDIAN build: no byte swap is performed
+  return uiX; // value is returned unchanged
 }
 #else //WORDS_BIGENDIAN
 
 #ifdef _MSC_VER
-inline uint32_t EndianFix(uint32_t uiX)
-{
-	__asm
-	{
-		mov   eax,  uiX
-		bswap   eax
-		mov   uiX,    eax
-	}
-	return uiX;
+inline uint32_t EndianFix (uint32_t uiX) { // MSVC inline-assembly variant: reverses the byte order of uiX
+  __asm {
+    mov   eax,  uiX
+    bswap   eax
+    mov   uiX,    eax
+  }
+  return uiX; // byte-swapped value
 }
 #else  //_MSC_VER
 
-inline uint32_t EndianFix(uint32_t uiX)
-{
+inline uint32_t EndianFix (uint32_t uiX) { // non-MSVC variant: reverses the byte order of uiX
 #ifdef ARM_ARCHv7
-	__asm__ __volatile__("rev %0, %0":"+r"(uiX)); //Just for the ARMv7 
+  __asm__ __volatile__ ("rev %0, %0":"+r" (uiX)); // ARMv7 only: rev reverses the four bytes in place
 #elif defined (X86_ARCH)
-	__asm__ __volatile__("bswap %0":"+r"(uiX));
+  __asm__ __volatile__ ("bswap %0":"+r" (uiX)); // x86: bswap reverses the four bytes in place
 #else
-    uiX = ((uiX & 0xff000000)>> 24) | ((uiX & 0xff0000) >> 8) |
-        ((uiX & 0xff00) << 8) | ((uiX&0xff) << 24);
-#endif	
-	return uiX;
+  uiX = ((uiX & 0xff000000) >> 24) | ((uiX & 0xff0000) >> 8) |
+        ((uiX & 0xff00) << 8) | ((uiX & 0xff) << 24); // portable fallback: swap bytes with masks and shifts
+#endif
+  return uiX;
 }
 #endif //_MSC_VER
 
 #endif //WORDS_BIGENDIAN
 
-inline uint32_t GetValue4Bytes( uint8_t* pDstNal )
-{
-	uint32_t uiValue = 0;
-	uiValue = (pDstNal[0]<<24) | (pDstNal[1]<<16) | (pDstNal[2]<<8) | (pDstNal[3]);
-	return uiValue;
+inline uint32_t GetValue4Bytes (uint8_t* pDstNal) { // combine pDstNal[0..3] into one word, pDstNal[0] most significant
+  uint32_t uiValue = 0;
+  uiValue = (pDstNal[0] << 24) | (pDstNal[1] << 16) | (pDstNal[2] << 8) | (pDstNal[3]); // NOTE(review): operands are promoted to int, so a first byte >= 0x80 shifts into the sign bit - consider uint32_t casts
+  return uiValue;
 }
 
-void_t InitReadBits( PBitStringAux pBitString )
-{
-	pBitString->uiCurBits  = GetValue4Bytes( pBitString->pCurBuf );
-	pBitString->pCurBuf  += 4;
-	pBitString->iLeftBits = -16;
+void_t InitReadBits (PBitStringAux pBitString) { // prime the reader state from the current buffer position
+  pBitString->uiCurBits  = GetValue4Bytes (pBitString->pCurBuf); // load 4 bytes; NOTE(review): no check here that 4 bytes are available
+  pBitString->pCurBuf  += 4; // advance past the bytes just loaded
+  pBitString->iLeftBits = -16; // NOTE(review): meaning of -16 depends on the bit-reading code, which is not shown here
 }
 
 /*!
@@ -101,22 +95,21 @@
  *
  * \return	size of buffer data in byte; failed in -1 return
  */
-int32_t InitBits( PBitStringAux pBitString, const uint8_t *kpBuf, const int32_t kiSize )
-{	
-	const int32_t kiSizeBuf = (kiSize + 7) >> 3;
-	uint8_t *pTmp = (uint8_t *)kpBuf;
+int32_t InitBits (PBitStringAux pBitString, const uint8_t* kpBuf, const int32_t kiSize) {
+  const int32_t kiSizeBuf = (kiSize + 7) >> 3; // kiSize is a bit count: round up to whole bytes
+  uint8_t* pTmp = (uint8_t*)kpBuf; // const is cast away; this function itself only reads through the pointer
 
-	if ( NULL == pTmp )
-		return -1;
+  if (NULL == pTmp)
+    return -1;
 
-	pBitString->pStartBuf   = pTmp;				// buffer to start position
-	pBitString->pEndBuf	    = pTmp + kiSizeBuf;	// buffer + length
-	pBitString->iBits	    = kiSize;				// count bits of overall bitstreaming inputindex;
+  pBitString->pStartBuf   = pTmp;				// start of the buffer
+  pBitString->pEndBuf	    = pTmp + kiSizeBuf;	// one past the last byte (start + length in bytes)
+  pBitString->iBits	    = kiSize;				// total number of bits in the input bitstream
 
-	pBitString->pCurBuf   = pBitString->pStartBuf;
-	InitReadBits( pBitString );
+  pBitString->pCurBuf   = pBitString->pStartBuf;
+  InitReadBits (pBitString); // loads the first 4 bytes and advances pCurBuf
 
-	return kiSizeBuf;
+  return kiSizeBuf;
 }
 
 } // namespace WelsDec
--- a/codec/decoder/core/src/cpu.cpp
+++ b/codec/decoder/core/src/cpu.cpp
@@ -50,160 +50,143 @@
 
 #if defined(X86_ASM)
 
-uint32_t WelsCPUFeatureDetect( int32_t *pNumberOfLogicProcessors )
-{
-    uint32_t uiCPU = 0;	
-    uint32_t uiFeatureA = 0, uiFeatureB = 0, uiFeatureC = 0, uiFeatureD = 0;
-	int32_t  CacheLineSize = 0;
-	int8_t   chVenderName[16] = { 0 };	
-	
-    if( !WelsCPUIdVerify() )
-    {
-        /* cpuid is not supported in cpu */
-        return 0;
-    }
-	
-	WelsCPUId( 0, &uiFeatureA, (uint32_t*)&chVenderName[0],(uint32_t*)&chVenderName[8],(uint32_t*)&chVenderName[4] );
-    if( uiFeatureA == 0 )
-    {
-		/* maximum input value for basic cpuid information */
-        return 0;
-    }
-	
-	WelsCPUId( 1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD );
-    if( (uiFeatureD & 0x00800000) == 0 )
-    {
-        /* Basic MMX technology is not support in cpu, mean nothing for us so return here */
-        return 0;
-    }
-	
-    uiCPU = WELS_CPU_MMX;
-    if( uiFeatureD & 0x02000000 )
-    {
-        /* SSE technology is identical to AMD MMX extensions */
-        uiCPU |= WELS_CPU_MMXEXT|WELS_CPU_SSE;
-    }
-    if( uiFeatureD & 0x04000000 )
-    {
-        /* SSE2 support here */
-        uiCPU |= WELS_CPU_SSE2;
-    }
-	if ( uiFeatureD & 0x00000001 )
-	{
-		/* x87 FPU on-chip checking */
-		uiCPU |= WELS_CPU_FPU;
-	}
-	if ( uiFeatureD & 0x00008000 )
-	{
-		/* CMOV instruction checking */
-		uiCPU |= WELS_CPU_CMOV;
-	}
-	if ( !strcmp((const str_t*)chVenderName,CPU_Vender_INTEL) )	// confirmed_safe_unsafe_usage
-	{
-		if ( uiFeatureD & 0x10000000 )
-		{
-			/* Multi-Threading checking: contains of multiple logic processors */
-			uiCPU |= WELS_CPU_HTT;
-		}
-	}	
-
-	if( uiFeatureC & 0x00000001 ){
-		/* SSE3 support here */
-		uiCPU |= WELS_CPU_SSE3;
-	}
-	if( uiFeatureC & 0x00000200 ){
-		/* SSSE3 support here */
-		uiCPU |= WELS_CPU_SSSE3;
-	}
-	if( uiFeatureC & 0x00080000 ){
-		/* SSE4.1 support here, 45nm Penryn processor */
-		uiCPU |= WELS_CPU_SSE41; 
-	}
-	if( uiFeatureC & 0x00100000 ){
-		/* SSE4.2 support here, next generation Nehalem processor */
-		uiCPU |= WELS_CPU_SSE42;
-	}
-	if ( WelsCPUSupportAVX( uiFeatureA, uiFeatureC ) )
-	{
-		/* AVX supported */
-		uiCPU |= WELS_CPU_AVX;
-	}
-	if ( WelsCPUSupportFMA( uiFeatureA, uiFeatureC ) )
-	{
-		/* AVX FMA supported */
-		uiCPU |= WELS_CPU_FMA;
-	}
-	if ( uiFeatureC & 0x02000000 )
-	{
-		/* AES checking */
-		uiCPU |= WELS_CPU_AES;
-	}
-	if ( uiFeatureC & 0x00400000 )
-	{
-		/* MOVBE checking */
-		uiCPU |= WELS_CPU_MOVBE;
-	}
-
-	if ( pNumberOfLogicProcessors != NULL )
-	{
-		// HTT enabled on chip
-		*pNumberOfLogicProcessors = (uiFeatureB & 0x00ff0000) >> 16; // feature bits: 23-16 on returned EBX		
-	}	
-	
-    WelsCPUId( 0x80000000, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD );
-
-	if( (!strcmp((const str_t*)chVenderName,CPU_Vender_AMD)) && (uiFeatureA>=0x80000001) ){	// confirmed_safe_unsafe_usage
-		WelsCPUId(0x80000001, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD );
-		if( uiFeatureD&0x00400000 ){
-			uiCPU |= WELS_CPU_MMXEXT;
-		}
-		if( uiFeatureD&0x80000000 ){
-			uiCPU |= WELS_CPU_3DNOW;
-		}
-	}
-
-	if( !strcmp((const str_t*)chVenderName,CPU_Vender_INTEL) ){	// confirmed_safe_unsafe_usage
-		int32_t  family, model;
-
-		WelsCPUId(1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
-		family = ((uiFeatureA>>8)&0xf) + ((uiFeatureA>>20)&0xff);
-        model  = ((uiFeatureA>>4)&0xf) + ((uiFeatureA>>12)&0xf0);
-
-		if( (family==6) && (model==9 || model==13 || model==14) ){
-			uiCPU &= ~(WELS_CPU_SSE2|WELS_CPU_SSE3);
-		}
-	}
-
-	// get cache line size
-	if( (!strcmp((const str_t*)chVenderName,CPU_Vender_INTEL)) || !(strcmp((const str_t*)chVenderName,CPU_Vender_CYRIX)) ){	// confirmed_safe_unsafe_usage
-		WelsCPUId(1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
-
-		CacheLineSize = (uiFeatureB&0xff00)>>5;	// ((clflush_line_size >> 8) << 3), CLFLUSH_line_size * 8 = CacheLineSize_in_byte
-
-		if( CacheLineSize == 128 ){
-			uiCPU |= WELS_CPU_CACHELINE_128;
-		}
-		else if( CacheLineSize == 64 ){
-			uiCPU |= WELS_CPU_CACHELINE_64;
-		}
-		else if( CacheLineSize == 32 ){
-			uiCPU |= WELS_CPU_CACHELINE_32;
-		}
-		else if( CacheLineSize == 16 ){
-			uiCPU |= WELS_CPU_CACHELINE_16;
-		}
-	}
-	
-    return uiCPU;
-}
-
-
-void WelsCPURestore( const uint32_t kuiCPU )
-{
-    if( kuiCPU & (WELS_CPU_MMX|WELS_CPU_MMXEXT|WELS_CPU_3DNOW|WELS_CPU_3DNOWEXT) )
-    {
-        WelsEmms();
-    }
+uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) { // returns a bit mask of WELS_CPU_* flags built from cpuid results
+  uint32_t uiCPU = 0;
+  uint32_t uiFeatureA = 0, uiFeatureB = 0, uiFeatureC = 0, uiFeatureD = 0; // the four cpuid outputs, in the order WelsCPUId takes them
+  int32_t  CacheLineSize = 0;
+  int8_t   chVenderName[16] = { 0 }; // vendor string: 12 bytes are written below via uint32_t* casts (NOTE(review): confirm alignment); zero-init keeps it NUL-terminated
+  // Every early "return 0" below leaves *pNumberOfLogicProcessors unwritten.
+  if (!WelsCPUIdVerify()) {
+    /* cpuid is not supported by the cpu */
+    return 0;
+  }
+  // Leaf 0: uiFeatureA receives the highest basic leaf; the other three outputs are stored at offsets 0, 8 and 4 of the vendor string.
+  WelsCPUId (0, &uiFeatureA, (uint32_t*)&chVenderName[0], (uint32_t*)&chVenderName[8], (uint32_t*)&chVenderName[4]);
+  if (uiFeatureA == 0) {
+    /* highest basic cpuid leaf is 0, so the feature leaf (1) cannot be queried */
+    return 0;
+  }
+  // Leaf 1: the feature bits tested below arrive in uiFeatureC and uiFeatureD.
+  WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+  if ((uiFeatureD & 0x00800000) == 0) {
+    /* Basic MMX technology is not supported by the cpu; nothing else is useful to us, so return here */
+    return 0;
+  }
+
+  uiCPU = WELS_CPU_MMX;
+  if (uiFeatureD & 0x02000000) {
+    /* SSE present; SSE is treated as including the AMD MMX extensions, so both flags are set */
+    uiCPU |= WELS_CPU_MMXEXT | WELS_CPU_SSE;
+  }
+  if (uiFeatureD & 0x04000000) {
+    /* SSE2 support here */
+    uiCPU |= WELS_CPU_SSE2;
+  }
+  if (uiFeatureD & 0x00000001) {
+    /* x87 FPU on-chip checking */
+    uiCPU |= WELS_CPU_FPU;
+  }
+  if (uiFeatureD & 0x00008000) {
+    /* CMOV instruction checking */
+    uiCPU |= WELS_CPU_CMOV;
+  }
+  if (!strcmp ((const str_t*)chVenderName, CPU_Vender_INTEL)) {	// HTT flag is only honoured for the Intel vendor string
+    if (uiFeatureD & 0x10000000) {
+      /* Multi-Threading checking: package contains multiple logical processors */
+      uiCPU |= WELS_CPU_HTT;
+    }
+  }
+
+  if (uiFeatureC & 0x00000001) {
+    /* SSE3 support here */
+    uiCPU |= WELS_CPU_SSE3;
+  }
+  if (uiFeatureC & 0x00000200) {
+    /* SSSE3 support here */
+    uiCPU |= WELS_CPU_SSSE3;
+  }
+  if (uiFeatureC & 0x00080000) {
+    /* SSE4.1 support here, 45nm Penryn processor */
+    uiCPU |= WELS_CPU_SSE41;
+  }
+  if (uiFeatureC & 0x00100000) {
+    /* SSE4.2 support here, next generation Nehalem processor */
+    uiCPU |= WELS_CPU_SSE42;
+  }
+  if (WelsCPUSupportAVX (uiFeatureA, uiFeatureC)) {
+    /* AVX supported (decided by the helper from the leaf-1 outputs) */
+    uiCPU |= WELS_CPU_AVX;
+  }
+  if (WelsCPUSupportFMA (uiFeatureA, uiFeatureC)) {
+    /* AVX FMA supported (decided by the helper from the leaf-1 outputs) */
+    uiCPU |= WELS_CPU_FMA;
+  }
+  if (uiFeatureC & 0x02000000) {
+    /* AES checking */
+    uiCPU |= WELS_CPU_AES;
+  }
+  if (uiFeatureC & 0x00400000) {
+    /* MOVBE checking */
+    uiCPU |= WELS_CPU_MOVBE;
+  }
+
+  if (pNumberOfLogicProcessors != NULL) {
+    // written whenever the pointer is non-NULL, regardless of the HTT flag
+    *pNumberOfLogicProcessors = (uiFeatureB & 0x00ff0000) >> 16; // feature bits: 23-16 on returned EBX
+  }
+  // Leaf 0x80000000: uiFeatureA receives the highest extended leaf; B, C and D are overwritten as well.
+  WelsCPUId (0x80000000, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+
+  if ((!strcmp ((const str_t*)chVenderName, CPU_Vender_AMD))
+      && (uiFeatureA >= 0x80000001)) {	// AMD only, and only when extended leaf 0x80000001 exists
+    WelsCPUId (0x80000001, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+    if (uiFeatureD & 0x00400000) {
+      uiCPU |= WELS_CPU_MMXEXT;
+    }
+    if (uiFeatureD & 0x80000000) {
+      uiCPU |= WELS_CPU_3DNOW;
+    }
+  }
+
+  if (!strcmp ((const str_t*)chVenderName, CPU_Vender_INTEL)) {	// Intel only: model-specific feature masking
+    int32_t  family, model;
+
+    WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD); // re-query leaf 1: the outputs were overwritten by the extended-leaf call
+    family = ((uiFeatureA >> 8) & 0xf) + ((uiFeatureA >> 20) & 0xff); // bits 11-8 plus bits 27-20
+    model  = ((uiFeatureA >> 4) & 0xf) + ((uiFeatureA >> 12) & 0xf0); // bits 7-4 plus bits 19-16 moved up into bits 7-4
+    // NOTE(review): family 6, models 9/13/14 get SSE2 and SSE3 masked out - presumably parts where those paths are slow; confirm
+    if ((family == 6) && (model == 9 || model == 13 || model == 14)) {
+      uiCPU &= ~ (WELS_CPU_SSE2 | WELS_CPU_SSE3);
+    }
+  }
+
+  // get cache line size (Intel and Cyrix vendor strings only)
+  if ((!strcmp ((const str_t*)chVenderName, CPU_Vender_INTEL))
+      || ! (strcmp ((const str_t*)chVenderName, CPU_Vender_CYRIX))) {	// strcmp returns 0 on a match
+    WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+
+    CacheLineSize = (uiFeatureB & 0xff00) >>
+                    5;	// equals ((uiFeatureB >> 8) & 0xff) * 8: CLFLUSH line size * 8 = cache line size in bytes
+
+    if (CacheLineSize == 128) {
+      uiCPU |= WELS_CPU_CACHELINE_128;
+    } else if (CacheLineSize == 64) {
+      uiCPU |= WELS_CPU_CACHELINE_64;
+    } else if (CacheLineSize == 32) {
+      uiCPU |= WELS_CPU_CACHELINE_32;
+    } else if (CacheLineSize == 16) {
+      uiCPU |= WELS_CPU_CACHELINE_16;
+    }
+  }
+
+  return uiCPU;
+}
+
+
+void WelsCPURestore (const uint32_t kuiCPU) { // kuiCPU: WELS_CPU_* bit mask
+  if (kuiCPU & (WELS_CPU_MMX | WELS_CPU_MMXEXT | WELS_CPU_3DNOW | WELS_CPU_3DNOWEXT)) { // any MMX or 3DNow! flag set
+    WelsEmms(); // defined elsewhere; presumably issues EMMS to reset MMX state - confirm
+  }
 }
 
 #endif
--- a/codec/decoder/core/src/deblocking.cpp
+++ b/codec/decoder/core/src/deblocking.cpp
@@ -34,7 +34,7 @@
  * \brief	Interfaces introduced in frame deblocking filtering
  *
  * \date	08/02/2010
- *           
+ *
  *************************************************************************************
  */
 
@@ -86,51 +86,54 @@
 	iBeta  = g_kiBetaTable((iQp + iBetaOffset));\
 }
 
-static const uint8_t g_kuiAlphaTable[52+24] = { //this table refers to Table 8-16 in H.264/AVC standard
-	0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0,  0,
-	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-	0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
-	7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
-	25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
-	80, 90,101,113,127,144,162,182,203,226,
-	255, 255
-	,255, 255,255, 255,255, 255,255, 255,255, 255,255, 255
+static const uint8_t g_kuiAlphaTable[52 + 24] = { // alpha values of Table 8-16 in the H.264/AVC standard: 52 entries plus 12 pad entries at each end
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0,  0, // 12 leading pad entries (same value as the first table entry)
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // the 52 table entries start here
+  0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
+  7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
+  25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
+  80, 90, 101, 113, 127, 144, 162, 182, 203, 226,
+  255, 255
+  , 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 // 12 trailing pad entries repeating the last value; presumably the lookup adds an offset of 12 - confirm
 };
 
-static const int8_t g_kiBetaTable[52+24] = { //this table refers to Table 8-16 in H.264/AVC standard
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0,  0,
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
-     3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
-     8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
-    13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
-    18, 18
-    ,18, 18,18, 18,18, 18,18, 18,18, 18,18, 18
+static const int8_t g_kiBetaTable[52 + 24] = { // beta values of Table 8-16 in the H.264/AVC standard: 52 entries plus 12 pad entries at each end
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0,  0, // 12 leading pad entries (same value as the first table entry)
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // the 52 table entries start here
+  0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
+  3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
+  8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
+  13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
+  18, 18
+  , 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18 // 12 trailing pad entries repeating the last value; presumably the lookup adds an offset of 12 - confirm
 };
 
-static const int8_t g_kiTc0Table[52+24][4] = { //this table refers Table 8-17 in H.264/AVC standard
-    { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 },
-    { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 },
-    { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 },
-    { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 },
-    { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 1 },
-    { -1, 0, 0, 1 }, { -1, 0, 0, 1 }, { -1, 0, 0, 1 }, { -1, 0, 1, 1 }, { -1, 0, 1, 1 }, { -1, 1, 1, 1 },
-    { -1, 1, 1, 1 }, { -1, 1, 1, 1 }, { -1, 1, 1, 1 }, { -1, 1, 1, 2 }, { -1, 1, 1, 2 }, { -1, 1, 1, 2 },
-    { -1, 1, 1, 2 }, { -1, 1, 2, 3 }, { -1, 1, 2, 3 }, { -1, 2, 2, 3 }, { -1, 2, 2, 4 }, { -1, 2, 3, 4 },
-    { -1, 2, 3, 4 }, { -1, 3, 3, 5 }, { -1, 3, 4, 6 }, { -1, 3, 4, 6 }, { -1, 4, 5, 7 }, { -1, 4, 5, 8 },
-    { -1, 4, 6, 9 }, { -1, 5, 7,10 }, { -1, 6, 8,11 }, { -1, 6, 8,13 }, { -1, 7,10,14 }, { -1, 8,11,16 },
-    { -1, 9,12,18 }, { -1, 10,13,20 }, {-1,11,15,23 }, { -1,13,17,25 }
-    ,{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 }
-	,{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 }
+static const int8_t g_kiTc0Table[52 + 24][4] = { // tc0 values of Table 8-17 in the H.264/AVC standard; second index is the pBS value used by TC0_TBL_LOOKUP, slot 0 is always -1
+  { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, // first two rows: 12 leading pad entries
+  { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 },
+  { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, // the 52 table entries start here
+  { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 },
+  { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 1 },
+  { -1, 0, 0, 1 }, { -1, 0, 0, 1 }, { -1, 0, 0, 1 }, { -1, 0, 1, 1 }, { -1, 0, 1, 1 }, { -1, 1, 1, 1 },
+  { -1, 1, 1, 1 }, { -1, 1, 1, 1 }, { -1, 1, 1, 1 }, { -1, 1, 1, 2 }, { -1, 1, 1, 2 }, { -1, 1, 1, 2 },
+  { -1, 1, 1, 2 }, { -1, 1, 2, 3 }, { -1, 1, 2, 3 }, { -1, 2, 2, 3 }, { -1, 2, 2, 4 }, { -1, 2, 3, 4 },
+  { -1, 2, 3, 4 }, { -1, 3, 3, 5 }, { -1, 3, 4, 6 }, { -1, 3, 4, 6 }, { -1, 4, 5, 7 }, { -1, 4, 5, 8 },
+  { -1, 4, 6, 9 }, { -1, 5, 7, 10 }, { -1, 6, 8, 11 }, { -1, 6, 8, 13 }, { -1, 7, 10, 14 }, { -1, 8, 11, 16 },
+  { -1, 9, 12, 18 }, { -1, 10, 13, 20 }, { -1, 11, 15, 23 }, { -1, 13, 17, 25 }
+  , { -1, 13, 17, 25 }, { -1, 13, 17, 25 }, { -1, 13, 17, 25 }, { -1, 13, 17, 25 }, { -1, 13, 17, 25 }, { -1, 13, 17, 25 } // last two rows: 12 trailing pad entries repeating the last entry
+  , { -1, 13, 17, 25 }, { -1, 13, 17, 25 }, { -1, 13, 17, 25 }, { -1, 13, 17, 25 }, { -1, 13, 17, 25 }, { -1, 13, 17, 25 }
 };
 
-static const uint8_t g_kuiTableBIdx[2][8] =   
-{     
-	{0,  4,  8,  12, 
-	3,  7,  11, 15}, 
+static const uint8_t g_kuiTableBIdx[2][8] = { // two rows of eight indices in the range 0..15
+  { // row 0 - presumably the first and last column of a 4x4 raster-ordered grid; TODO confirm against the users of this table
+    0,  4,  8,  12,
+    3,  7,  11, 15
+  },
 
-	{0,  1,  2,  3 , 
-	12, 13, 14, 15}, 
+  { // row 1 - presumably the first and last row of the same grid; TODO confirm
+    0,  1,  2,  3 ,
+    12, 13, 14, 15
+  },
 };
 
 #define TC0_TBL_LOOKUP(tc, iIndexA, pBS, bChroma) \
@@ -141,761 +144,681 @@
 	tc[3] = g_kiTc0Table(iIndexA)[pBS[3]] + bChroma;\
 }
 
-void_t inline DeblockingBSInsideMBAvsbase( int8_t* pNnzTab, uint8_t nBS[2][4][4], int32_t iLShiftFactor )
-{
-	uint32_t uiNnz32b0, uiNnz32b1, uiNnz32b2, uiNnz32b3;
-	FORCE_STACK_ALIGN_1D( uint8_t, uiBsx3, 4, 4 );
+void_t inline DeblockingBSInsideMBAvsbase (int8_t* pNnzTab, uint8_t nBS[2][4][4], int32_t iLShiftFactor) {
+  uint32_t uiNnz32b0, uiNnz32b1, uiNnz32b2, uiNnz32b3;
+  FORCE_STACK_ALIGN_1D (uint8_t, uiBsx3, 4, 4);
 
-	uiNnz32b0 = *(uint32_t *)(pNnzTab+0);
-	uiNnz32b1 = *(uint32_t *)(pNnzTab+4);
-	uiNnz32b2 = *(uint32_t *)(pNnzTab+8);
-	uiNnz32b3 = *(uint32_t *)(pNnzTab+12);
+  uiNnz32b0 = * (uint32_t*) (pNnzTab + 0);
+  uiNnz32b1 = * (uint32_t*) (pNnzTab + 4);
+  uiNnz32b2 = * (uint32_t*) (pNnzTab + 8);
+  uiNnz32b3 = * (uint32_t*) (pNnzTab + 12);
 
-	*(uint32_t *)uiBsx3 = (uiNnz32b0|(uiNnz32b0>>8))<<iLShiftFactor;
-	nBS[0][1][0] = uiBsx3[0];
-	nBS[0][2][0] = uiBsx3[1];
-	nBS[0][3][0] = uiBsx3[2];
+  * (uint32_t*)uiBsx3 = (uiNnz32b0 | (uiNnz32b0 >> 8)) << iLShiftFactor;
+  nBS[0][1][0] = uiBsx3[0];
+  nBS[0][2][0] = uiBsx3[1];
+  nBS[0][3][0] = uiBsx3[2];
 
-	*(uint32_t *)uiBsx3 = (uiNnz32b1|(uiNnz32b1>>8))<<iLShiftFactor;
-	nBS[0][1][1] = uiBsx3[0];
-	nBS[0][2][1] = uiBsx3[1];
-	nBS[0][3][1] = uiBsx3[2];
-	*(uint32_t *)nBS[1][1] = (uiNnz32b0|uiNnz32b1)<<iLShiftFactor;
+  * (uint32_t*)uiBsx3 = (uiNnz32b1 | (uiNnz32b1 >> 8)) << iLShiftFactor;
+  nBS[0][1][1] = uiBsx3[0];
+  nBS[0][2][1] = uiBsx3[1];
+  nBS[0][3][1] = uiBsx3[2];
+  * (uint32_t*)nBS[1][1] = (uiNnz32b0 | uiNnz32b1) << iLShiftFactor;
 
-	*(uint32_t *)uiBsx3 = (uiNnz32b2|(uiNnz32b2>>8))<<iLShiftFactor;
-	nBS[0][1][2] = uiBsx3[0];
-	nBS[0][2][2] = uiBsx3[1];
-	nBS[0][3][2] = uiBsx3[2];
-	*(uint32_t *)nBS[1][2] = (uiNnz32b1|uiNnz32b2)<<iLShiftFactor;
+  * (uint32_t*)uiBsx3 = (uiNnz32b2 | (uiNnz32b2 >> 8)) << iLShiftFactor;
+  nBS[0][1][2] = uiBsx3[0];
+  nBS[0][2][2] = uiBsx3[1];
+  nBS[0][3][2] = uiBsx3[2];
+  * (uint32_t*)nBS[1][2] = (uiNnz32b1 | uiNnz32b2) << iLShiftFactor;
 
-	*(uint32_t *)uiBsx3 = (uiNnz32b3|(uiNnz32b3>>8))<<iLShiftFactor;
-	nBS[0][1][3] = uiBsx3[0];
-	nBS[0][2][3] = uiBsx3[1];
-	nBS[0][3][3] = uiBsx3[2];	
-	*(uint32_t *)nBS[1][3] = (uiNnz32b2|uiNnz32b3)<<iLShiftFactor;
+  * (uint32_t*)uiBsx3 = (uiNnz32b3 | (uiNnz32b3 >> 8)) << iLShiftFactor;
+  nBS[0][1][3] = uiBsx3[0];
+  nBS[0][2][3] = uiBsx3[1];
+  nBS[0][3][3] = uiBsx3[2];
+  * (uint32_t*)nBS[1][3] = (uiNnz32b2 | uiNnz32b3) << iLShiftFactor;
 
 }
 
-void_t static inline DeblockingBSInsideMBNormal( PDqLayer pCurDqLayer, uint8_t nBS[2][4][4], int8_t* pNnzTab, int32_t iMbXy )
-{
-	uint32_t uiNnz32b0, uiNnz32b1, uiNnz32b2, uiNnz32b3;
-    int8_t* iRefIndex = pCurDqLayer->pRefIndex[LIST_0][iMbXy];
-	FORCE_STACK_ALIGN_1D( uint8_t, uiBsx4, 4, 4 );
+void_t static inline DeblockingBSInsideMBNormal (PDqLayer pCurDqLayer, uint8_t nBS[2][4][4], int8_t* pNnzTab,
+    int32_t iMbXy) {
+  uint32_t uiNnz32b0, uiNnz32b1, uiNnz32b2, uiNnz32b3;
+  int8_t* iRefIndex = pCurDqLayer->pRefIndex[LIST_0][iMbXy];
+  FORCE_STACK_ALIGN_1D (uint8_t, uiBsx4, 4, 4);
 
-	uiNnz32b0 = *(uint32_t *)(pNnzTab+0);
-	uiNnz32b1 = *(uint32_t *)(pNnzTab+4);
-	uiNnz32b2 = *(uint32_t *)(pNnzTab+8);
-	uiNnz32b3 = *(uint32_t *)(pNnzTab+12);
+  uiNnz32b0 = * (uint32_t*) (pNnzTab + 0);
+  uiNnz32b1 = * (uint32_t*) (pNnzTab + 4);
+  uiNnz32b2 = * (uint32_t*) (pNnzTab + 8);
+  uiNnz32b3 = * (uint32_t*) (pNnzTab + 12);
 
-	*(uint32_t *)uiBsx4 = (uiNnz32b0|(uiNnz32b0>>8));
-	nBS[0][1][0] = BS_EDGE(uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 1, 0);
-	nBS[0][2][0] = BS_EDGE(uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 2, 1);
-	nBS[0][3][0] = BS_EDGE(uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 3, 2);
+  * (uint32_t*)uiBsx4 = (uiNnz32b0 | (uiNnz32b0 >> 8));
+  nBS[0][1][0] = BS_EDGE (uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 1, 0);
+  nBS[0][2][0] = BS_EDGE (uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 2, 1);
+  nBS[0][3][0] = BS_EDGE (uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 3, 2);
 
-	*(uint32_t *)uiBsx4 = (uiNnz32b1|(uiNnz32b1>>8));
-	nBS[0][1][1] = BS_EDGE(uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 5, 4);
-	nBS[0][2][1] = BS_EDGE(uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 6, 5);
-	nBS[0][3][1] = BS_EDGE(uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 7, 6);
+  * (uint32_t*)uiBsx4 = (uiNnz32b1 | (uiNnz32b1 >> 8));
+  nBS[0][1][1] = BS_EDGE (uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 5, 4);
+  nBS[0][2][1] = BS_EDGE (uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 6, 5);
+  nBS[0][3][1] = BS_EDGE (uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 7, 6);
 
-	*(uint32_t *)uiBsx4 = (uiNnz32b2|(uiNnz32b2>>8));
-	nBS[0][1][2] = BS_EDGE(uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 9, 8);
-	nBS[0][2][2] = BS_EDGE(uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 10,9);
-	nBS[0][3][2] = BS_EDGE(uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 11,10);
+  * (uint32_t*)uiBsx4 = (uiNnz32b2 | (uiNnz32b2 >> 8));
+  nBS[0][1][2] = BS_EDGE (uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 9, 8);
+  nBS[0][2][2] = BS_EDGE (uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 10, 9);
+  nBS[0][3][2] = BS_EDGE (uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 11, 10);
 
-	*(uint32_t *)uiBsx4 = (uiNnz32b3|(uiNnz32b3>>8));
-	nBS[0][1][3] = BS_EDGE(uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 13,12);
-	nBS[0][2][3] = BS_EDGE(uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 14,13);
-	nBS[0][3][3] = BS_EDGE(uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 15,14);
+  * (uint32_t*)uiBsx4 = (uiNnz32b3 | (uiNnz32b3 >> 8));
+  nBS[0][1][3] = BS_EDGE (uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 13, 12);
+  nBS[0][2][3] = BS_EDGE (uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 14, 13);
+  nBS[0][3][3] = BS_EDGE (uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 15, 14);
 
-	// horizontal
-	*(uint32_t *)uiBsx4 = (uiNnz32b0|uiNnz32b1);
-	nBS[1][1][0] = BS_EDGE(uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 4, 0);
-	nBS[1][1][1] = BS_EDGE(uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 5, 1);
-	nBS[1][1][2] = BS_EDGE(uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 6, 2);
-	nBS[1][1][3] = BS_EDGE(uiBsx4[3], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 7, 3);
+  // horizontal
+  * (uint32_t*)uiBsx4 = (uiNnz32b0 | uiNnz32b1);
+  nBS[1][1][0] = BS_EDGE (uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 4, 0);
+  nBS[1][1][1] = BS_EDGE (uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 5, 1);
+  nBS[1][1][2] = BS_EDGE (uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 6, 2);
+  nBS[1][1][3] = BS_EDGE (uiBsx4[3], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 7, 3);
 
-	*(uint32_t *)uiBsx4 = (uiNnz32b1|uiNnz32b2);
-	nBS[1][2][0] = BS_EDGE(uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 8, 4);
-	nBS[1][2][1] = BS_EDGE(uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 9, 5);
-	nBS[1][2][2] = BS_EDGE(uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 10, 6);
-	nBS[1][2][3] = BS_EDGE(uiBsx4[3], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 11, 7);
+  * (uint32_t*)uiBsx4 = (uiNnz32b1 | uiNnz32b2);
+  nBS[1][2][0] = BS_EDGE (uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 8, 4);
+  nBS[1][2][1] = BS_EDGE (uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 9, 5);
+  nBS[1][2][2] = BS_EDGE (uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 10, 6);
+  nBS[1][2][3] = BS_EDGE (uiBsx4[3], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 11, 7);
 
-	*(uint32_t *)uiBsx4 = (uiNnz32b2|uiNnz32b3);
-	nBS[1][3][0] = BS_EDGE(uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 12, 8);
-	nBS[1][3][1] = BS_EDGE(uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 13, 9);
-	nBS[1][3][2] = BS_EDGE(uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 14, 10);
-	nBS[1][3][3] = BS_EDGE(uiBsx4[3], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 15, 11);
+  * (uint32_t*)uiBsx4 = (uiNnz32b2 | uiNnz32b3);
+  nBS[1][3][0] = BS_EDGE (uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 12, 8);
+  nBS[1][3][1] = BS_EDGE (uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 13, 9);
+  nBS[1][3][2] = BS_EDGE (uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 14, 10);
+  nBS[1][3][3] = BS_EDGE (uiBsx4[3], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 15, 11);
 }
 
-uint32_t DeblockingBsMarginalMBAvcbase( PDqLayer pCurDqLayer, int32_t iEdge, int32_t iNeighMb, int32_t iMbXy)
-{
-	int32_t i;
-	uint32_t uiBSx4;    
-    //uint8_t* bS = static_cast<uint8_t*>(&uiBSx4);
-    uint8_t* pBS = (uint8_t*)(&uiBSx4);
-	uint32_t uiBIdx  = *(uint32_t *)(&g_kuiTableBIdx[iEdge][0]); 
-	uint32_t uiBnIdx = *(uint32_t *)(&g_kuiTableBIdx[iEdge][4]);
+uint32_t DeblockingBsMarginalMBAvcbase (PDqLayer pCurDqLayer, int32_t iEdge, int32_t iNeighMb, int32_t iMbXy) {
+  int32_t i;
+  uint32_t uiBSx4;
+  //uint8_t* bS = static_cast<uint8_t*>(&uiBSx4);
+  uint8_t* pBS = (uint8_t*) (&uiBSx4);
+  uint32_t uiBIdx  = * (uint32_t*) (&g_kuiTableBIdx[iEdge][0]);
+  uint32_t uiBnIdx = * (uint32_t*) (&g_kuiTableBIdx[iEdge][4]);
 
-	for( i = 0; i < 4; i++ )
-	{
-		if( pCurDqLayer->pNzc[iMbXy][uiBIdx&0xff] | pCurDqLayer->pNzc[iNeighMb][uiBnIdx&0xff] )
-		{
-			pBS[i] = 2;
-		} 
-		else 
-		{
-			pBS[i] = MB_BS_MV(pCurDqLayer->pRefIndex[LIST_0], pCurDqLayer->pMv[LIST_0], iMbXy, iNeighMb, (uiBIdx&0xff), (uiBnIdx&0xff));
-		}
-		uiBIdx  = uiBIdx  >> 8;
-		uiBnIdx = uiBnIdx >> 8;
-	}
-    return uiBSx4;
+  for (i = 0; i < 4; i++) {
+    if (pCurDqLayer->pNzc[iMbXy][uiBIdx & 0xff] | pCurDqLayer->pNzc[iNeighMb][uiBnIdx & 0xff]) {
+      pBS[i] = 2;
+    } else {
+      pBS[i] = MB_BS_MV (pCurDqLayer->pRefIndex[LIST_0], pCurDqLayer->pMv[LIST_0], iMbXy, iNeighMb, (uiBIdx & 0xff),
+                         (uiBnIdx & 0xff));
+    }
+    uiBIdx  = uiBIdx  >> 8;
+    uiBnIdx = uiBnIdx >> 8;
+  }
+  return uiBSx4;
 }
-int32_t DeblockingAvailableNoInterlayer( PDqLayer pCurDqLayer, int32_t iFilterIdc )
-{
- 	int32_t iMbY = pCurDqLayer->iMbY;
- 	int32_t iMbX = pCurDqLayer->iMbX;
-	int32_t iMbXy = pCurDqLayer->iMbXyIndex;
-	BOOL_T bLeftFlag = FALSE;
-	BOOL_T bTopFlag  = FALSE;
-   
-	if ( 2 == iFilterIdc )
-	{
-		bLeftFlag = ( iMbX > 0 ) && ( pCurDqLayer->pSliceIdc[iMbXy] == pCurDqLayer->pSliceIdc[iMbXy-1] );
-		bTopFlag  = ( iMbY > 0 ) && ( pCurDqLayer->pSliceIdc[iMbXy] == pCurDqLayer->pSliceIdc[iMbXy-pCurDqLayer->iMbWidth] );
-	}
-	else //if ( 0 == iFilterIdc )
-	{
-		bLeftFlag = ( iMbX > 0 );
-		bTopFlag  = ( iMbY > 0 );
-	}
-	return (bLeftFlag<<LEFT_FLAG_BIT)|(bTopFlag<<TOP_FLAG_BIT);
+int32_t DeblockingAvailableNoInterlayer (PDqLayer pCurDqLayer, int32_t iFilterIdc) {
+  int32_t iMbY = pCurDqLayer->iMbY;
+  int32_t iMbX = pCurDqLayer->iMbX;
+  int32_t iMbXy = pCurDqLayer->iMbXyIndex;
+  BOOL_T bLeftFlag = FALSE;
+  BOOL_T bTopFlag  = FALSE;
+
+  if (2 == iFilterIdc) {
+    bLeftFlag = (iMbX > 0) && (pCurDqLayer->pSliceIdc[iMbXy] == pCurDqLayer->pSliceIdc[iMbXy - 1]);
+    bTopFlag  = (iMbY > 0) && (pCurDqLayer->pSliceIdc[iMbXy] == pCurDqLayer->pSliceIdc[iMbXy - pCurDqLayer->iMbWidth]);
+  } else { //if ( 0 == iFilterIdc )
+    bLeftFlag = (iMbX > 0);
+    bTopFlag  = (iMbY > 0);
+  }
+  return (bLeftFlag << LEFT_FLAG_BIT) | (bTopFlag << TOP_FLAG_BIT);
 }
 
-void_t FilteringEdgeLumaH(SDeblockingFilter* pFilter, uint8_t* pPix, int32_t iStride, uint8_t* pBS )
-{
-	int32_t iIndexA; 
-	int32_t iAlpha; 
-	int32_t iBeta;  
-	FORCE_STACK_ALIGN_1D( int8_t, tc, 4, 16 );
+void_t FilteringEdgeLumaH (SDeblockingFilter* pFilter, uint8_t* pPix, int32_t iStride, uint8_t* pBS) {
+  int32_t iIndexA;
+  int32_t iAlpha;
+  int32_t iBeta;
+  FORCE_STACK_ALIGN_1D (int8_t, tc, 4, 16);
 
-	GET_ALPHA_BETA_FROM_QP(pFilter->iLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIndexA, iAlpha, iBeta);
+  GET_ALPHA_BETA_FROM_QP (pFilter->iLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIndexA, iAlpha,
+                          iBeta);
 
-	if( iAlpha | iBeta )
-	{
-		TC0_TBL_LOOKUP(tc, iIndexA, pBS, 0);
-		pFilter->pLoopf->pfLumaDeblockingLT4Ver(pPix, iStride, iAlpha, iBeta, tc);
-	}
-	return;
+  if (iAlpha | iBeta) {
+    TC0_TBL_LOOKUP (tc, iIndexA, pBS, 0);
+    pFilter->pLoopf->pfLumaDeblockingLT4Ver (pPix, iStride, iAlpha, iBeta, tc);
+  }
+  return;
 }
 
 
-void_t FilteringEdgeLumaV(SDeblockingFilter* pFilter, uint8_t* pPix, int32_t iStride, uint8_t* pBS )
-{
-	int32_t  iIndexA;
-	int32_t  iAlpha;
-	int32_t  iBeta; 
-	FORCE_STACK_ALIGN_1D( int8_t, tc, 4, 16 );
+void_t FilteringEdgeLumaV (SDeblockingFilter* pFilter, uint8_t* pPix, int32_t iStride, uint8_t* pBS) {
+  int32_t  iIndexA;
+  int32_t  iAlpha;
+  int32_t  iBeta;
+  FORCE_STACK_ALIGN_1D (int8_t, tc, 4, 16);
 
-	GET_ALPHA_BETA_FROM_QP(pFilter->iLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIndexA, iAlpha, iBeta);
+  GET_ALPHA_BETA_FROM_QP (pFilter->iLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIndexA, iAlpha,
+                          iBeta);
 
-	if( iAlpha | iBeta )
-	{
-		TC0_TBL_LOOKUP(tc, iIndexA, pBS, 0);
-		pFilter->pLoopf->pfLumaDeblockingLT4Hor(pPix, iStride, iAlpha, iBeta, tc);
-	}
-	return;
+  if (iAlpha | iBeta) {
+    TC0_TBL_LOOKUP (tc, iIndexA, pBS, 0);
+    pFilter->pLoopf->pfLumaDeblockingLT4Hor (pPix, iStride, iAlpha, iBeta, tc);
+  }
+  return;
 }
 
 
-void_t FilteringEdgeLumaIntraH( SDeblockingFilter* pFilter, uint8_t* pPix, int32_t iStride, uint8_t* pBS )
-{
-	int32_t iIndexA; 
-	int32_t iAlpha; 
-	int32_t iBeta;  	
+void_t FilteringEdgeLumaIntraH (SDeblockingFilter* pFilter, uint8_t* pPix, int32_t iStride, uint8_t* pBS) {
+  int32_t iIndexA;
+  int32_t iAlpha;
+  int32_t iBeta;
 
-	GET_ALPHA_BETA_FROM_QP(pFilter->iLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIndexA, iAlpha, iBeta);
+  GET_ALPHA_BETA_FROM_QP (pFilter->iLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIndexA, iAlpha,
+                          iBeta);
 
-	if( iAlpha | iBeta )
-	{
-		pFilter->pLoopf->pfLumaDeblockingEQ4Ver(pPix, iStride, iAlpha, iBeta);
-	}
-	return;
+  if (iAlpha | iBeta) {
+    pFilter->pLoopf->pfLumaDeblockingEQ4Ver (pPix, iStride, iAlpha, iBeta);
+  }
+  return;
 }
 
-void_t FilteringEdgeLumaIntraV( SDeblockingFilter* pFilter, uint8_t* pPix, int32_t iStride, uint8_t* pBS )
-{
-	int32_t iIndexA; 
-	int32_t iAlpha; 
-	int32_t iBeta;  
+void_t FilteringEdgeLumaIntraV (SDeblockingFilter* pFilter, uint8_t* pPix, int32_t iStride, uint8_t* pBS) {
+  int32_t iIndexA;
+  int32_t iAlpha;
+  int32_t iBeta;
 
-	GET_ALPHA_BETA_FROM_QP(pFilter->iLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIndexA, iAlpha, iBeta);
+  GET_ALPHA_BETA_FROM_QP (pFilter->iLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIndexA, iAlpha,
+                          iBeta);
 
-	if( iAlpha | iBeta )
-	{	
-		pFilter->pLoopf->pfLumaDeblockingEQ4Hor(pPix, iStride, iAlpha, iBeta);
-	}
-	return;
+  if (iAlpha | iBeta) {
+    pFilter->pLoopf->pfLumaDeblockingEQ4Hor (pPix, iStride, iAlpha, iBeta);
+  }
+  return;
 }
-void_t FilteringEdgeChromaH( SDeblockingFilter* pFilter, uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, uint8_t* pBS )
-{	
-	int32_t iIndexA; 
-	int32_t iAlpha; 
-	int32_t iBeta;  
-	FORCE_STACK_ALIGN_1D( int8_t, tc, 4, 16 );
+void_t FilteringEdgeChromaH (SDeblockingFilter* pFilter, uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride,
+                             uint8_t* pBS) {
+  int32_t iIndexA;
+  int32_t iAlpha;
+  int32_t iBeta;
+  FORCE_STACK_ALIGN_1D (int8_t, tc, 4, 16);
 
-	GET_ALPHA_BETA_FROM_QP(pFilter->iChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIndexA, iAlpha, iBeta);
+  GET_ALPHA_BETA_FROM_QP (pFilter->iChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIndexA, iAlpha,
+                          iBeta);
 
-	if( iAlpha | iBeta )
-	{
-		TC0_TBL_LOOKUP(tc, iIndexA, pBS, 1);
-		pFilter->pLoopf->pfChromaDeblockingLT4Ver(pPixCb, pPixCr, iStride,iAlpha, iBeta, tc);
-	}
-	return;
-} 
-void_t FilteringEdgeChromaV( SDeblockingFilter* pFilter, uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, uint8_t* pBS )
-{	  
-	int32_t iIndexA; 
-	int32_t iAlpha; 
-	int32_t iBeta;  
-	FORCE_STACK_ALIGN_1D( int8_t, tc, 4, 16 );
+  if (iAlpha | iBeta) {
+    TC0_TBL_LOOKUP (tc, iIndexA, pBS, 1);
+    pFilter->pLoopf->pfChromaDeblockingLT4Ver (pPixCb, pPixCr, iStride, iAlpha, iBeta, tc);
+  }
+  return;
+}
+void_t FilteringEdgeChromaV (SDeblockingFilter* pFilter, uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride,
+                             uint8_t* pBS) {
+  int32_t iIndexA;
+  int32_t iAlpha;
+  int32_t iBeta;
+  FORCE_STACK_ALIGN_1D (int8_t, tc, 4, 16);
 
-	GET_ALPHA_BETA_FROM_QP(pFilter->iChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIndexA, iAlpha, iBeta);
+  GET_ALPHA_BETA_FROM_QP (pFilter->iChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIndexA, iAlpha,
+                          iBeta);
 
-	if( iAlpha | iBeta )
-	{
-		TC0_TBL_LOOKUP(tc, iIndexA, pBS, 1);
-		pFilter->pLoopf->pfChromaDeblockingLT4Hor(pPixCb, pPixCr, iStride, iAlpha, iBeta, tc);
-	}
-	return;
+  if (iAlpha | iBeta) {
+    TC0_TBL_LOOKUP (tc, iIndexA, pBS, 1);
+    pFilter->pLoopf->pfChromaDeblockingLT4Hor (pPixCb, pPixCr, iStride, iAlpha, iBeta, tc);
+  }
+  return;
 }
 
-void_t FilteringEdgeChromaIntraH( SDeblockingFilter* pFilter, uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, uint8_t* pBS )
-{
-	int32_t iIndexA; 
-	int32_t iAlpha; 
-	int32_t iBeta;  
+void_t FilteringEdgeChromaIntraH (SDeblockingFilter* pFilter, uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride,
+                                  uint8_t* pBS) {
+  int32_t iIndexA;
+  int32_t iAlpha;
+  int32_t iBeta;
 
-	GET_ALPHA_BETA_FROM_QP(pFilter->iChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIndexA, iAlpha, iBeta);
+  GET_ALPHA_BETA_FROM_QP (pFilter->iChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIndexA, iAlpha,
+                          iBeta);
 
-	if( iAlpha | iBeta )
-	{
-		pFilter->pLoopf->pfChromaDeblockingEQ4Ver(pPixCb, pPixCr, iStride, iAlpha, iBeta);
-	}
-	return;
+  if (iAlpha | iBeta) {
+    pFilter->pLoopf->pfChromaDeblockingEQ4Ver (pPixCb, pPixCr, iStride, iAlpha, iBeta);
+  }
+  return;
 }
 
-void_t FilteringEdgeChromaIntraV( SDeblockingFilter* pFilter, uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, uint8_t* pBS )
-{
-	int32_t iIndexA; 
-	int32_t iAlpha; 
-	int32_t iBeta;  
+void_t FilteringEdgeChromaIntraV (SDeblockingFilter* pFilter, uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride,
+                                  uint8_t* pBS) {
+  int32_t iIndexA;
+  int32_t iAlpha;
+  int32_t iBeta;
 
-	GET_ALPHA_BETA_FROM_QP(pFilter->iChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIndexA, iAlpha, iBeta);
+  GET_ALPHA_BETA_FROM_QP (pFilter->iChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIndexA, iAlpha,
+                          iBeta);
 
-	if( iAlpha | iBeta )
-	{
-		pFilter->pLoopf->pfChromaDeblockinEQ4Hor(pPixCb, pPixCr, iStride, iAlpha, iBeta);
-	}
-	return;
+  if (iAlpha | iBeta) {
+    pFilter->pLoopf->pfChromaDeblockinEQ4Hor (pPixCb, pPixCr, iStride, iAlpha, iBeta);
+  }
+  return;
 }
 
 
-void_t DeblockingInterMb( PDqLayer pCurDqLayer, PDeblockingFilter  pFilter, uint8_t nBS[2][4][4], int32_t iBoundryFlag )
-{
-	int32_t iMbXyIndex = pCurDqLayer->iMbXyIndex;
-	int32_t iMbX = pCurDqLayer->iMbX;
-	int32_t iMbY = pCurDqLayer->iMbY;
-    
-	int32_t iCurLumaQp = pCurDqLayer->pLumaQp[iMbXyIndex];
-	int32_t iCurChromaQp = pCurDqLayer->pChromaQp[iMbXyIndex];
-	int32_t iLineSize   = pFilter->iCsStride[0];
-	int32_t iLineSizeUV = pFilter->iCsStride[1];
+void_t DeblockingInterMb (PDqLayer pCurDqLayer, PDeblockingFilter  pFilter, uint8_t nBS[2][4][4],
+                          int32_t iBoundryFlag) {
+  int32_t iMbXyIndex = pCurDqLayer->iMbXyIndex;
+  int32_t iMbX = pCurDqLayer->iMbX;
+  int32_t iMbY = pCurDqLayer->iMbY;
 
-    uint8_t * pDestY, * pDestCb, * pDestCr;
-	pDestY  = pFilter->pCsData[0] + ((iMbY * iLineSize + iMbX) << 4);
-	pDestCb = pFilter->pCsData[1] + ((iMbY * iLineSizeUV + iMbX) << 3);				
-	pDestCr = pFilter->pCsData[2] + ((iMbY * iLineSizeUV + iMbX) << 3);
+  int32_t iCurLumaQp = pCurDqLayer->pLumaQp[iMbXyIndex];
+  int32_t iCurChromaQp = pCurDqLayer->pChromaQp[iMbXyIndex];
+  int32_t iLineSize   = pFilter->iCsStride[0];
+  int32_t iLineSizeUV = pFilter->iCsStride[1];
 
-	if( iBoundryFlag & LEFT_FLAG_MASK)
-	{
-		int32_t iLeftXyIndex = iMbXyIndex - 1;
-		pFilter->iLumaQP   = (iCurLumaQp + pCurDqLayer->pLumaQp[iLeftXyIndex] + 1) >> 1;
-		pFilter->iChromaQP = (iCurChromaQp + pCurDqLayer->pChromaQp[iLeftXyIndex]+ 1) >> 1;
+  uint8_t* pDestY, * pDestCb, * pDestCr;
+  pDestY  = pFilter->pCsData[0] + ((iMbY * iLineSize + iMbX) << 4);
+  pDestCb = pFilter->pCsData[1] + ((iMbY * iLineSizeUV + iMbX) << 3);
+  pDestCr = pFilter->pCsData[2] + ((iMbY * iLineSizeUV + iMbX) << 3);
 
-		if( nBS[0][0][0] == 0x04 )
-		{
-			FilteringEdgeLumaIntraV( pFilter, pDestY, iLineSize, NULL );
-			FilteringEdgeChromaIntraV( pFilter, pDestCb, pDestCr, iLineSizeUV, NULL );
-		} 
-		else
-		{
-			if(*(uint32_t *)nBS[0][0] != 0)
-			{
-				FilteringEdgeLumaV( pFilter, pDestY, iLineSize, nBS[0][0] );
-				FilteringEdgeChromaV( pFilter, pDestCb, pDestCr, iLineSizeUV, nBS[0][0] );
-			}
-		}
-	}
-	
-	pFilter->iLumaQP = iCurLumaQp;
-	pFilter->iChromaQP = iCurChromaQp;
-    
-	if(*(uint32_t *)nBS[0][1] != 0)
-	{
-		FilteringEdgeLumaV( pFilter, &pDestY[1<<2], iLineSize, nBS[0][1]);
-	}
+  if (iBoundryFlag & LEFT_FLAG_MASK) {
+    int32_t iLeftXyIndex = iMbXyIndex - 1;
+    pFilter->iLumaQP   = (iCurLumaQp + pCurDqLayer->pLumaQp[iLeftXyIndex] + 1) >> 1;
+    pFilter->iChromaQP = (iCurChromaQp + pCurDqLayer->pChromaQp[iLeftXyIndex] + 1) >> 1;
 
-	if(*(uint32_t *)nBS[0][2] != 0)
-	{
-		FilteringEdgeLumaV( pFilter, &pDestY[2<<2], iLineSize, nBS[0][2]);
-		FilteringEdgeChromaV( pFilter, &pDestCb[2<<1], &pDestCr[2<<1], iLineSizeUV, nBS[0][2] );
-	}
+    if (nBS[0][0][0] == 0x04) {
+      FilteringEdgeLumaIntraV (pFilter, pDestY, iLineSize, NULL);
+      FilteringEdgeChromaIntraV (pFilter, pDestCb, pDestCr, iLineSizeUV, NULL);
+    } else {
+      if (* (uint32_t*)nBS[0][0] != 0) {
+        FilteringEdgeLumaV (pFilter, pDestY, iLineSize, nBS[0][0]);
+        FilteringEdgeChromaV (pFilter, pDestCb, pDestCr, iLineSizeUV, nBS[0][0]);
+      }
+    }
+  }
 
-	if(*(uint32_t *)nBS[0][3] != 0)
-	{
-		FilteringEdgeLumaV( pFilter, &pDestY[3<<2], iLineSize, nBS[0][3] );
-	}
-	
-	if( iBoundryFlag & TOP_FLAG_MASK)
-	{	
-		int32_t iTopXyIndex = iMbXyIndex - pCurDqLayer->iMbWidth;
-        pFilter->iLumaQP = (iCurLumaQp + pCurDqLayer->pLumaQp[iTopXyIndex] + 1) >> 1;
-        pFilter->iChromaQP = (iCurChromaQp + pCurDqLayer->pChromaQp[iTopXyIndex] + 1) >> 1;
-		
-		if(  nBS[1][0][0] == 0x04)
-		{
-			FilteringEdgeLumaIntraH( pFilter, pDestY, iLineSize, NULL );
-			FilteringEdgeChromaIntraH( pFilter, pDestCb, pDestCr, iLineSizeUV, NULL );
-		} 
-		else 
-		{
-			if(*(uint32_t *)nBS[1][0] != 0)
-			{
-				FilteringEdgeLumaH( pFilter, pDestY, iLineSize, nBS[1][0] );
-				FilteringEdgeChromaH( pFilter, pDestCb, pDestCr, iLineSizeUV, nBS[1][0] );
-			}
-		}  
-	}
-	
-	pFilter->iLumaQP = iCurLumaQp;
-	pFilter->iChromaQP = iCurChromaQp;
+  pFilter->iLumaQP = iCurLumaQp;
+  pFilter->iChromaQP = iCurChromaQp;
 
-	if(*(uint32_t *)nBS[1][1] != 0)
-	{
-		FilteringEdgeLumaH( pFilter, &pDestY[(1<<2)*iLineSize], iLineSize, nBS[1][1] );
-	}
+  if (* (uint32_t*)nBS[0][1] != 0) {
+    FilteringEdgeLumaV (pFilter, &pDestY[1 << 2], iLineSize, nBS[0][1]);
+  }
 
-	if(*(uint32_t *)nBS[1][2] != 0)
-	{
-		FilteringEdgeLumaH( pFilter, &pDestY[(2<<2)*iLineSize], iLineSize, nBS[1][2] );
-		FilteringEdgeChromaH( pFilter, &pDestCb[(2<<1)*iLineSizeUV], &pDestCr[(2<<1)*iLineSizeUV], iLineSizeUV, nBS[1][2] );
-	}
+  if (* (uint32_t*)nBS[0][2] != 0) {
+    FilteringEdgeLumaV (pFilter, &pDestY[2 << 2], iLineSize, nBS[0][2]);
+    FilteringEdgeChromaV (pFilter, &pDestCb[2 << 1], &pDestCr[2 << 1], iLineSizeUV, nBS[0][2]);
+  }
 
-	if(*(uint32_t *)nBS[1][3] != 0)
-	{
-		FilteringEdgeLumaH( pFilter, &pDestY[(3<<2)*iLineSize], iLineSize, nBS[1][3] );
-	}
+  if (* (uint32_t*)nBS[0][3] != 0) {
+    FilteringEdgeLumaV (pFilter, &pDestY[3 << 2], iLineSize, nBS[0][3]);
+  }
+
+  if (iBoundryFlag & TOP_FLAG_MASK) {
+    int32_t iTopXyIndex = iMbXyIndex - pCurDqLayer->iMbWidth;
+    pFilter->iLumaQP = (iCurLumaQp + pCurDqLayer->pLumaQp[iTopXyIndex] + 1) >> 1;
+    pFilter->iChromaQP = (iCurChromaQp + pCurDqLayer->pChromaQp[iTopXyIndex] + 1) >> 1;
+
+    if (nBS[1][0][0] == 0x04) {
+      FilteringEdgeLumaIntraH (pFilter, pDestY, iLineSize, NULL);
+      FilteringEdgeChromaIntraH (pFilter, pDestCb, pDestCr, iLineSizeUV, NULL);
+    } else {
+      if (* (uint32_t*)nBS[1][0] != 0) {
+        FilteringEdgeLumaH (pFilter, pDestY, iLineSize, nBS[1][0]);
+        FilteringEdgeChromaH (pFilter, pDestCb, pDestCr, iLineSizeUV, nBS[1][0]);
+      }
+    }
+  }
+
+  pFilter->iLumaQP = iCurLumaQp;
+  pFilter->iChromaQP = iCurChromaQp;
+
+  if (* (uint32_t*)nBS[1][1] != 0) {
+    FilteringEdgeLumaH (pFilter, &pDestY[ (1 << 2)*iLineSize], iLineSize, nBS[1][1]);
+  }
+
+  if (* (uint32_t*)nBS[1][2] != 0) {
+    FilteringEdgeLumaH (pFilter, &pDestY[ (2 << 2)*iLineSize], iLineSize, nBS[1][2]);
+    FilteringEdgeChromaH (pFilter, &pDestCb[ (2 << 1)*iLineSizeUV], &pDestCr[ (2 << 1)*iLineSizeUV], iLineSizeUV,
+                          nBS[1][2]);
+  }
+
+  if (* (uint32_t*)nBS[1][3] != 0) {
+    FilteringEdgeLumaH (pFilter, &pDestY[ (3 << 2)*iLineSize], iLineSize, nBS[1][3]);
+  }
 }
 
-void_t /*__FASTCALL*/ FilteringEdgeLumaHV( PDqLayer pCurDqLayer, PDeblockingFilter  pFilter, int32_t iBoundryFlag )
-{
-	int32_t iMbXyIndex = pCurDqLayer->iMbXyIndex;
-	int32_t iMbX      = pCurDqLayer->iMbX;
-	int32_t iMbY      = pCurDqLayer->iMbY;
-	int32_t iMbWidth  = pCurDqLayer->iMbWidth;
-	int32_t iLineSize  = pFilter->iCsStride[0];
+void_t /*__FASTCALL*/ FilteringEdgeLumaHV (PDqLayer pCurDqLayer, PDeblockingFilter  pFilter, int32_t iBoundryFlag) {
+  int32_t iMbXyIndex = pCurDqLayer->iMbXyIndex;
+  int32_t iMbX      = pCurDqLayer->iMbX;
+  int32_t iMbY      = pCurDqLayer->iMbY;
+  int32_t iMbWidth  = pCurDqLayer->iMbWidth;
+  int32_t iLineSize  = pFilter->iCsStride[0];
 
-	uint8_t  *pDestY;	
-	int32_t  iCurQp;
-	int32_t  iIndexA, iAlpha, iBeta;
+  uint8_t*  pDestY;
+  int32_t  iCurQp;
+  int32_t  iIndexA, iAlpha, iBeta;
 
-	FORCE_STACK_ALIGN_1D(int8_t,  iTc,   4, 16 );
-	FORCE_STACK_ALIGN_1D(uint8_t, uiBSx4, 4, 4  );
+  FORCE_STACK_ALIGN_1D (int8_t,  iTc,   4, 16);
+  FORCE_STACK_ALIGN_1D (uint8_t, uiBSx4, 4, 4);
 
-	pDestY  = pFilter->pCsData[0] + ((iMbY * iLineSize + iMbX) << 4);
-	iCurQp  = pCurDqLayer->pLumaQp[iMbXyIndex];
-	
-	*(uint32_t*)uiBSx4 = 0x03030303;
+  pDestY  = pFilter->pCsData[0] + ((iMbY * iLineSize + iMbX) << 4);
+  iCurQp  = pCurDqLayer->pLumaQp[iMbXyIndex];
 
-	// luma v
-	if( iBoundryFlag & LEFT_FLAG_MASK)
-	{
-		pFilter->iLumaQP   = ( iCurQp   + pCurDqLayer->pLumaQp[iMbXyIndex-1] + 1 ) >> 1;		
-		FilteringEdgeLumaIntraV( pFilter, pDestY, iLineSize, NULL );
-	}
+  * (uint32_t*)uiBSx4 = 0x03030303;
 
-	pFilter->iLumaQP   = iCurQp;	
-	GET_ALPHA_BETA_FROM_QP(pFilter->iLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIndexA, iAlpha, iBeta);
-	if( iAlpha | iBeta )
-	{
-		TC0_TBL_LOOKUP(iTc, iIndexA, uiBSx4, 0);
-		pFilter->pLoopf->pfLumaDeblockingLT4Hor( &pDestY[1 << 2],iLineSize,iAlpha,iBeta,iTc );
-		pFilter->pLoopf->pfLumaDeblockingLT4Hor( &pDestY[2 << 2],iLineSize,iAlpha,iBeta,iTc );
-		pFilter->pLoopf->pfLumaDeblockingLT4Hor( &pDestY[3 << 2],iLineSize,iAlpha,iBeta,iTc );
-	}
+  // luma v
+  if (iBoundryFlag & LEFT_FLAG_MASK) {
+    pFilter->iLumaQP   = (iCurQp   + pCurDqLayer->pLumaQp[iMbXyIndex - 1] + 1) >> 1;
+    FilteringEdgeLumaIntraV (pFilter, pDestY, iLineSize, NULL);
+  }
 
-	// luma h
-	if( iBoundryFlag & TOP_FLAG_MASK)
-	{
-		pFilter->iLumaQP   = ( iCurQp   + pCurDqLayer->pLumaQp[iMbXyIndex-iMbWidth] + 1 ) >> 1;	
-		FilteringEdgeLumaIntraH( pFilter, pDestY, iLineSize, NULL );
-	}   
+  pFilter->iLumaQP   = iCurQp;
+  GET_ALPHA_BETA_FROM_QP (pFilter->iLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIndexA, iAlpha,
+                          iBeta);
+  if (iAlpha | iBeta) {
+    TC0_TBL_LOOKUP (iTc, iIndexA, uiBSx4, 0);
+    pFilter->pLoopf->pfLumaDeblockingLT4Hor (&pDestY[1 << 2], iLineSize, iAlpha, iBeta, iTc);
+    pFilter->pLoopf->pfLumaDeblockingLT4Hor (&pDestY[2 << 2], iLineSize, iAlpha, iBeta, iTc);
+    pFilter->pLoopf->pfLumaDeblockingLT4Hor (&pDestY[3 << 2], iLineSize, iAlpha, iBeta, iTc);
+  }
 
-	pFilter->iLumaQP   = iCurQp;	
-	if( iAlpha | iBeta )
-	{
-		pFilter->pLoopf->pfLumaDeblockingLT4Ver( &pDestY[(1<<2)*iLineSize],iLineSize,iAlpha,iBeta,iTc );
-		pFilter->pLoopf->pfLumaDeblockingLT4Ver( &pDestY[(2<<2)*iLineSize],iLineSize,iAlpha,iBeta,iTc );
-		pFilter->pLoopf->pfLumaDeblockingLT4Ver( &pDestY[(3<<2)*iLineSize],iLineSize,iAlpha,iBeta,iTc );
-	}
+  // luma h
+  if (iBoundryFlag & TOP_FLAG_MASK) {
+    pFilter->iLumaQP   = (iCurQp   + pCurDqLayer->pLumaQp[iMbXyIndex - iMbWidth] + 1) >> 1;
+    FilteringEdgeLumaIntraH (pFilter, pDestY, iLineSize, NULL);
+  }
+
+  pFilter->iLumaQP   = iCurQp;
+  if (iAlpha | iBeta) {
+    pFilter->pLoopf->pfLumaDeblockingLT4Ver (&pDestY[ (1 << 2)*iLineSize], iLineSize, iAlpha, iBeta, iTc);
+    pFilter->pLoopf->pfLumaDeblockingLT4Ver (&pDestY[ (2 << 2)*iLineSize], iLineSize, iAlpha, iBeta, iTc);
+    pFilter->pLoopf->pfLumaDeblockingLT4Ver (&pDestY[ (3 << 2)*iLineSize], iLineSize, iAlpha, iBeta, iTc);
+  }
 }
-void_t /*__FASTCALL*/ FilteringEdgeChromaHV( PDqLayer pCurDqLayer, PDeblockingFilter  pFilter, int32_t iBoundryFlag )
-{
-	int32_t iMbXyIndex     = pCurDqLayer->iMbXyIndex;
-	int32_t iMbX      = pCurDqLayer->iMbX;
-	int32_t iMbY      = pCurDqLayer->iMbY;
-	int32_t iMbWidth  = pCurDqLayer->iMbWidth;
-	int32_t iLineSize  = pFilter->iCsStride[1];
+void_t /*__FASTCALL*/ FilteringEdgeChromaHV (PDqLayer pCurDqLayer, PDeblockingFilter  pFilter, int32_t iBoundryFlag) {
+  int32_t iMbXyIndex     = pCurDqLayer->iMbXyIndex;
+  int32_t iMbX      = pCurDqLayer->iMbX;
+  int32_t iMbY      = pCurDqLayer->iMbY;
+  int32_t iMbWidth  = pCurDqLayer->iMbWidth;
+  int32_t iLineSize  = pFilter->iCsStride[1];
 
-	uint8_t  *pDestCb, *pDestCr;	
-	int32_t  iCurQp;
-	int32_t  iIndexA, iAlpha, iBeta;
-	
-	FORCE_STACK_ALIGN_1D( int8_t,  iTc,   4, 16 );
-	FORCE_STACK_ALIGN_1D( uint8_t, uiBSx4, 4, 4  );
+  uint8_t*  pDestCb, *pDestCr;
+  int32_t  iCurQp;
+  int32_t  iIndexA, iAlpha, iBeta;
 
-	pDestCb = pFilter->pCsData[1] + ((iMbY * iLineSize + iMbX) << 3);				
-	pDestCr = pFilter->pCsData[2] + ((iMbY * iLineSize + iMbX) << 3);	
-	iCurQp  = pCurDqLayer->pChromaQp[iMbXyIndex];
-	*(uint32_t*)uiBSx4 = 0x03030303;
+  FORCE_STACK_ALIGN_1D (int8_t,  iTc,   4, 16);
+  FORCE_STACK_ALIGN_1D (uint8_t, uiBSx4, 4, 4);
 
-	// chroma v
-	if( iBoundryFlag & LEFT_FLAG_MASK)
-	{
-		pFilter->iChromaQP = ( iCurQp + pCurDqLayer->pChromaQp[iMbXyIndex-1] + 1 ) >> 1;	
-		FilteringEdgeChromaIntraV( pFilter, pDestCb, pDestCr, iLineSize, NULL);
-	}
+  pDestCb = pFilter->pCsData[1] + ((iMbY * iLineSize + iMbX) << 3);
+  pDestCr = pFilter->pCsData[2] + ((iMbY * iLineSize + iMbX) << 3);
+  iCurQp  = pCurDqLayer->pChromaQp[iMbXyIndex];
+  * (uint32_t*)uiBSx4 = 0x03030303;
 
-	pFilter->iChromaQP   = iCurQp;	
-	GET_ALPHA_BETA_FROM_QP(pFilter->iChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIndexA, iAlpha, iBeta);
-	if( iAlpha | iBeta )
-	{
-		TC0_TBL_LOOKUP(iTc, iIndexA, uiBSx4, 1);
-		pFilter->pLoopf->pfChromaDeblockingLT4Hor( &pDestCb[2 << 1],&pDestCr[2 << 1],iLineSize,iAlpha,iBeta,iTc );
-	}
+  // chroma v
+  if (iBoundryFlag & LEFT_FLAG_MASK) {
+    pFilter->iChromaQP = (iCurQp + pCurDqLayer->pChromaQp[iMbXyIndex - 1] + 1) >> 1;
+    FilteringEdgeChromaIntraV (pFilter, pDestCb, pDestCr, iLineSize, NULL);
+  }
 
-	// chroma h
-	if( iBoundryFlag & TOP_FLAG_MASK)
-	{
-		pFilter->iChromaQP = ( iCurQp + pCurDqLayer->pChromaQp[iMbXyIndex-iMbWidth] + 1 ) >> 1;		
-		FilteringEdgeChromaIntraH( pFilter, pDestCb, pDestCr, iLineSize, NULL);
-	}   
+  pFilter->iChromaQP   = iCurQp;
+  GET_ALPHA_BETA_FROM_QP (pFilter->iChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIndexA, iAlpha,
+                          iBeta);
+  if (iAlpha | iBeta) {
+    TC0_TBL_LOOKUP (iTc, iIndexA, uiBSx4, 1);
+    pFilter->pLoopf->pfChromaDeblockingLT4Hor (&pDestCb[2 << 1], &pDestCr[2 << 1], iLineSize, iAlpha, iBeta, iTc);
+  }
 
-	pFilter->iChromaQP   = iCurQp;	
-	if( iAlpha | iBeta )
-	{
-		pFilter->pLoopf->pfChromaDeblockingLT4Ver( &pDestCb[(2<<1)*iLineSize],&pDestCr[(2<<1)*iLineSize],iLineSize,iAlpha,iBeta,iTc );
-	}
+  // chroma h
+  if (iBoundryFlag & TOP_FLAG_MASK) {
+    pFilter->iChromaQP = (iCurQp + pCurDqLayer->pChromaQp[iMbXyIndex - iMbWidth] + 1) >> 1;
+    FilteringEdgeChromaIntraH (pFilter, pDestCb, pDestCr, iLineSize, NULL);
+  }
+
+  pFilter->iChromaQP   = iCurQp;
+  if (iAlpha | iBeta) {
+    pFilter->pLoopf->pfChromaDeblockingLT4Ver (&pDestCb[ (2 << 1)*iLineSize], &pDestCr[ (2 << 1)*iLineSize], iLineSize,
+        iAlpha, iBeta, iTc);
+  }
 }
 
 // merge h&v lookup table operation to save performance
-void_t DeblockingIntraMb( PDqLayer pCurDqLayer, PDeblockingFilter  pFilter, int32_t iBoundryFlag )
-{
-	FilteringEdgeLumaHV(pCurDqLayer, pFilter, iBoundryFlag);
-	FilteringEdgeChromaHV(pCurDqLayer, pFilter, iBoundryFlag);
+void_t DeblockingIntraMb (PDqLayer pCurDqLayer, PDeblockingFilter  pFilter, int32_t iBoundryFlag) {
+  FilteringEdgeLumaHV (pCurDqLayer, pFilter, iBoundryFlag);  // luma helper first, same three arguments forwarded unchanged
+  FilteringEdgeChromaHV (pCurDqLayer, pFilter, iBoundryFlag);  // then the chroma helper (it filters both the Cb and Cr planes)
 }
 
-void_t WelsDeblockingMb( PDqLayer pCurDqLayer, PDeblockingFilter  pFilter, int32_t iBoundryFlag )
-{
-	uint8_t nBS[2][4][4] = { 0 };
+void_t WelsDeblockingMb (PDqLayer pCurDqLayer, PDeblockingFilter  pFilter, int32_t iBoundryFlag) {
+  uint8_t nBS[2][4][4] = { 0 };
 
-	int32_t iMbXyIndex	= pCurDqLayer->iMbXyIndex;
-	int32_t iCurMbType  = pCurDqLayer->pMbType[iMbXyIndex];
-    int32_t iMbNb;
-	
-	switch( iCurMbType )
-    {
-	case MB_TYPE_INTRA4x4:
-	case MB_TYPE_INTRA16x16:
-	case MB_TYPE_INTRA_PCM:
-		DeblockingIntraMb( pCurDqLayer, pFilter, iBoundryFlag );
-		break;
-	default:
+  int32_t iMbXyIndex	= pCurDqLayer->iMbXyIndex;
+  int32_t iCurMbType  = pCurDqLayer->pMbType[iMbXyIndex];
+  int32_t iMbNb;
 
-        if(iBoundryFlag & LEFT_FLAG_MASK)
-        {
-            iMbNb = iMbXyIndex - 1;
-            *(uint32_t*)nBS[0][0] = IS_INTRA(pCurDqLayer->pMbType[iMbNb])?0x04040404:DeblockingBsMarginalMBAvcbase( pCurDqLayer, 0, iMbNb, iMbXyIndex);
-        }
-		else
-		{
-			*(uint32_t*)nBS[0][0] = 0;
-		}
-        if(iBoundryFlag & TOP_FLAG_MASK)
-        {
-             iMbNb = iMbXyIndex - pCurDqLayer->iMbWidth;
-           *(uint32_t*)nBS[1][0] = IS_INTRA(pCurDqLayer->pMbType[iMbNb])?0x04040404:DeblockingBsMarginalMBAvcbase( pCurDqLayer, 1, iMbNb, iMbXyIndex);
-        }
-		else
-		{
-			*(uint32_t*)nBS[1][0] = 0;
-		}
-		//SKIP MB_16x16 or others
-		if( iCurMbType != MB_TYPE_SKIP )
-		{
-			if( iCurMbType == MB_TYPE_16x16 )
-			{
-				DeblockingBSInsideMBAvsbase( pCurDqLayer->pNzc[iMbXyIndex], nBS, 1 );
-			} 
-			else 
-			{
-				DeblockingBSInsideMBNormal(pCurDqLayer, nBS, pCurDqLayer->pNzc[iMbXyIndex], iMbXyIndex);
-			}
-		}
-		else
-		{
-			*(uint32_t*)nBS[0][1] = *(uint32_t*)nBS[0][2] = *(uint32_t*)nBS[0][3] = 
-			*(uint32_t*)nBS[1][1] = *(uint32_t*)nBS[1][2] = *(uint32_t*)nBS[1][3] = 0;
-		}
-		DeblockingInterMb( pCurDqLayer, pFilter, nBS, iBoundryFlag );
-		break;
-   }
+  switch (iCurMbType) {
+  case MB_TYPE_INTRA4x4:
+  case MB_TYPE_INTRA16x16:
+  case MB_TYPE_INTRA_PCM:
+    DeblockingIntraMb (pCurDqLayer, pFilter, iBoundryFlag);
+    break;
+  default:
+
+    if (iBoundryFlag & LEFT_FLAG_MASK) {
+      iMbNb = iMbXyIndex - 1;
+      * (uint32_t*)nBS[0][0] = IS_INTRA (pCurDqLayer->pMbType[iMbNb]) ? 0x04040404 : DeblockingBsMarginalMBAvcbase (
+                                 pCurDqLayer, 0, iMbNb, iMbXyIndex);
+    } else {
+      * (uint32_t*)nBS[0][0] = 0;
+    }
+    if (iBoundryFlag & TOP_FLAG_MASK) {
+      iMbNb = iMbXyIndex - pCurDqLayer->iMbWidth;
+      * (uint32_t*)nBS[1][0] = IS_INTRA (pCurDqLayer->pMbType[iMbNb]) ? 0x04040404 : DeblockingBsMarginalMBAvcbase (
+                                 pCurDqLayer, 1, iMbNb, iMbXyIndex);
+    } else {
+      * (uint32_t*)nBS[1][0] = 0;
+    }
+    //SKIP MB_16x16 or others
+    if (iCurMbType != MB_TYPE_SKIP) {
+      if (iCurMbType == MB_TYPE_16x16) {
+        DeblockingBSInsideMBAvsbase (pCurDqLayer->pNzc[iMbXyIndex], nBS, 1);
+      } else {
+        DeblockingBSInsideMBNormal (pCurDqLayer, nBS, pCurDqLayer->pNzc[iMbXyIndex], iMbXyIndex);
+      }
+    } else {
+      * (uint32_t*)nBS[0][1] = * (uint32_t*)nBS[0][2] = * (uint32_t*)nBS[0][3] =
+                                 * (uint32_t*)nBS[1][1] = * (uint32_t*)nBS[1][2] = * (uint32_t*)nBS[1][3] = 0;
+    }
+    DeblockingInterMb (pCurDqLayer, pFilter, nBS, iBoundryFlag);
+    break;
+  }
 }
 
 //  C code only
-void_t DeblockLumaLt4_c( uint8_t *pPix, int32_t iStrideX,int32_t iStrideY, int32_t iAlpha, int32_t iBeta, int8_t *pTc )
-{
-	for( int32_t i = 0;i<16;i++)
-	{
-		int32_t iTc0 = pTc[i>>2];
-		if(iTc0>=0)
-		{
-				int32_t p0 = pPix[-iStrideX];	
-				int32_t p1 = pPix[-2*iStrideX];	
-				int32_t p2 = pPix[-3*iStrideX];	
-				int32_t q0 = pPix[0];	
-				int32_t q1 = pPix[iStrideX];	
-				int32_t q2 = pPix[2*iStrideX];	
-				bool_t bDetaP0Q0= WELS_ABS( p0 - q0 )<iAlpha;
-				bool_t bDetaP1P0 = WELS_ABS( p1 - p0 ) < iBeta;
-				bool_t bDetaQ1Q0 = WELS_ABS( q1 - q0 ) < iBeta;
-				int32_t iTc = iTc0;
-				if ( bDetaP0Q0&& bDetaP1P0 && bDetaQ1Q0 )
-				{	
-					bool_t bDetaP2P0 =  WELS_ABS( p2 - p0 ) < iBeta;
-					bool_t bDetaQ2Q0 =  WELS_ABS( q2 - q0 ) < iBeta;
-					if ( bDetaP2P0) 
-					{
-						pPix[-2*iStrideX] = p1 + WELS_CLIP3( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -iTc0, iTc0 );
-						iTc++;
-					}
-					if (bDetaQ2Q0)
-					{
-						pPix[iStrideX] = q1 + WELS_CLIP3( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -iTc0, iTc0 );
-						iTc++;
-					}
-					int32_t iDeta = WELS_CLIP3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -iTc, iTc );
-					pPix[-iStrideX] = WELS_CLIP1( p0 + iDeta );    /* p0' */
-					pPix[0]  = WELS_CLIP1( q0 - iDeta );    /* q0' */
-			}
-		}
-		pPix +=iStrideY;
-	}
+void_t DeblockLumaLt4_c (uint8_t* pPix, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta,
+                         int8_t* pTc) {  // clipped luma edge filter: 16 positions iStrideY apart, taps iStrideX apart
+  for (int32_t i = 0; i < 16; i++) {
+    int32_t iTc0 = pTc[i >> 2];  // one pTc entry is shared by 4 consecutive positions
+    if (iTc0 >= 0) {  // a negative entry leaves this position unfiltered
+      int32_t p0 = pPix[-iStrideX];  // p-side samples sit at negative offsets from pPix
+      int32_t p1 = pPix[-2 * iStrideX];
+      int32_t p2 = pPix[-3 * iStrideX];
+      int32_t q0 = pPix[0];  // q-side starts at pPix itself
+      int32_t q1 = pPix[iStrideX];
+      int32_t q2 = pPix[2 * iStrideX];
+      bool_t bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
+      bool_t bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
+      bool_t bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
+      int32_t iTc = iTc0;  // bound used for p0/q0; incremented once per side whose second sample is also rewritten
+      if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {  // filter only when all three gradient tests pass
+        bool_t bDetaP2P0 =  WELS_ABS (p2 - p0) < iBeta;
+        bool_t bDetaQ2Q0 =  WELS_ABS (q2 - q0) < iBeta;
+        if (bDetaP2P0) {
+          pPix[-2 * iStrideX] = p1 + WELS_CLIP3 ((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1, -iTc0, iTc0);  /* p1' */
+          iTc++;
+        }
+        if (bDetaQ2Q0) {
+          pPix[iStrideX] = q1 + WELS_CLIP3 ((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1, -iTc0, iTc0);  /* q1' */
+          iTc++;
+        }
+        int32_t iDeta = WELS_CLIP3 ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -iTc, iTc);  // computed from the original p/q values read above
+        pPix[-iStrideX] = WELS_CLIP1 (p0 + iDeta);     /* p0' */
+        pPix[0]  = WELS_CLIP1 (q0 - iDeta);     /* q0' */
+      }
+    }
+    pPix += iStrideY;  // advance to the next position along the edge, filtered or not
+  }
 }
-void_t DeblockLumaEq4_c( uint8_t *pPix, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta )
-{
-	int32_t p0,p1,p2,q0,q1,q2;
-	int32_t iDetaP0Q0;
-	bool_t bDetaP1P0,bDetaQ1Q0;
-	for (int32_t i = 0;i<16;i++)
-	{
-		p0 = pPix[-iStrideX];
-		p1 = pPix[-2*iStrideX];
-		p2 = pPix[-3*iStrideX];							
-		q0 = pPix[0];
-		q1 = pPix[iStrideX];
-		q2 = pPix[2*iStrideX];
-		iDetaP0Q0 = WELS_ABS( p0 - q0 );
-		bDetaP1P0 = WELS_ABS( p1 - p0 ) < iBeta;
-		bDetaQ1Q0 = WELS_ABS( q1 - q0 ) < iBeta;
-		if ((iDetaP0Q0<iAlpha) && bDetaP1P0 &&bDetaQ1Q0)
-		{
-			if (iDetaP0Q0< (( iAlpha >> 2 ) + 2 ) )
-			{
-				bool_t bDetaP2P0 = WELS_ABS( p2 - p0 ) < iBeta;
-				bool_t bDetaQ2Q0 =  WELS_ABS( q2 - q0 ) < iBeta;
-				if(bDetaP2P0)
-				{	
-					const int32_t p3 = pPix[-4*iStrideX];	
-					pPix[-iStrideX] = ( p2 + (p1 << 1) + (p0 << 1) + (q0 << 1) + q1 + 4 ) >> 3;	 //p0
-					pPix[-2*iStrideX] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;	 //p1
-					pPix[-3*iStrideX] = ( (p3 << 1) + p2 + (p2 << 1) + p1 + p0 + q0 + 4 ) >> 3;//p2
-				 } 
-				 else 
-				 {
-					pPix[-1*iStrideX] = ( (p1 << 1) + p0 + q1 + 2 ) >> 2;	//p0
-			     }	
-				 if (bDetaQ2Q0)	
-				 {	
-					const int32_t q3 = pPix[3*iStrideX];		
-					pPix[0] = ( p1 + (p0 << 1) + (q0 << 1) + (q1 << 1) + q2 + 4 ) >> 3; //q0
-					pPix[iStrideX] = ( p0 + q0 + q1 + q2 + 2 ) >> 2; //q1
-					pPix[2*iStrideX] = ( (q3 << 1) + q2 + (q2 << 1) + q1 + q0 + p0 + 4 ) >> 3;//q2
-				  } 
-				  else 
-				  {	
-					pPix[0] = ( (q1 << 1) + q0 + p1 + 2 ) >> 2; //q0
-				  }
-			}
-			else
-			{
-			 	pPix[-iStrideX] = ( (p1 << 1) + p0 + q1 + 2 ) >> 2; //p0
-				pPix[ 0] = ( (q1 << 1) + q0 + p1 + 2 ) >> 2; //q0
-			}
-		}
-	 pPix += iStrideY;
-	} 
+void_t DeblockLumaEq4_c (uint8_t* pPix, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta) {
+  int32_t p0, p1, p2, q0, q1, q2;  // samples on both sides of the edge; pPix[0] is q0, p-side at negative iStrideX offsets
+  int32_t iDetaP0Q0;
+  bool_t bDetaP1P0, bDetaQ1Q0;
+  for (int32_t i = 0; i < 16; i++) {  // 16 positions along the edge, iStrideY apart
+    p0 = pPix[-iStrideX];
+    p1 = pPix[-2 * iStrideX];
+    p2 = pPix[-3 * iStrideX];
+    q0 = pPix[0];
+    q1 = pPix[iStrideX];
+    q2 = pPix[2 * iStrideX];
+    iDetaP0Q0 = WELS_ABS (p0 - q0);  // kept as a value because it is compared against two thresholds below
+    bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
+    bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
+    if ((iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0) {  // no write happens unless all three tests pass
+      if (iDetaP0Q0 < ((iAlpha >> 2) + 2)) {  // tighter threshold: each side may rewrite up to three samples
+        bool_t bDetaP2P0 = WELS_ABS (p2 - p0) < iBeta;
+        bool_t bDetaQ2Q0 =  WELS_ABS (q2 - q0) < iBeta;
+        if (bDetaP2P0) {
+          const int32_t p3 = pPix[-4 * iStrideX];  // fourth p-side sample, read only in this branch
+          pPix[-iStrideX] = (p2 + (p1 << 1) + (p0 << 1) + (q0 << 1) + q1 + 4) >> 3;	   //p0
+          pPix[-2 * iStrideX] = (p2 + p1 + p0 + q0 + 2) >> 2;	 //p1
+          pPix[-3 * iStrideX] = ((p3 << 1) + p2 + (p2 << 1) + p1 + p0 + q0 + 4) >> 3;//p2
+        } else {
+          pPix[-1 * iStrideX] = ((p1 << 1) + p0 + q1 + 2) >> 2;	//p0
+        }
+        if (bDetaQ2Q0) {
+          const int32_t q3 = pPix[3 * iStrideX];  // fourth q-side sample, read only in this branch
+          pPix[0] = (p1 + (p0 << 1) + (q0 << 1) + (q1 << 1) + q2 + 4) >> 3;   //q0
+          pPix[iStrideX] = (p0 + q0 + q1 + q2 + 2) >> 2;   //q1
+          pPix[2 * iStrideX] = ((q3 << 1) + q2 + (q2 << 1) + q1 + q0 + p0 + 4) >> 3;//q2
+        } else {
+          pPix[0] = ((q1 << 1) + q0 + p1 + 2) >> 2;   //q0
+        }
+      } else {  // looser threshold only: just p0 and q0 are replaced
+        pPix[-iStrideX] = ((p1 << 1) + p0 + q1 + 2) >> 2;   //p0
+        pPix[ 0] = ((q1 << 1) + q0 + p1 + 2) >> 2;   //q0
+      }
+    }
+    pPix += iStrideY;  // next position along the edge
+  }
 }
-void_t DeblockLumaLt4V_c( uint8_t *pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *tc )
-{
-	DeblockLumaLt4_c( pPix, iStride, 1, iAlpha, iBeta, tc );
+void_t DeblockLumaLt4V_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc) {
+  DeblockLumaLt4_c (pPix, iStride, 1, iAlpha, iBeta, tc);  // iStrideX = iStride, iStrideY = 1: taps one row apart, positions one byte apart
 }
-void_t DeblockLumaLt4H_c( uint8_t *pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *tc )
-{
-	DeblockLumaLt4_c( pPix, 1, iStride, iAlpha, iBeta, tc );
+void_t DeblockLumaLt4H_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc) {
+  DeblockLumaLt4_c (pPix, 1, iStride, iAlpha, iBeta, tc);  // iStrideX = 1, iStrideY = iStride: taps one byte apart, positions one row apart
 }
-void_t DeblockLumaEq4V_c( uint8_t *pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta )
-{
-	DeblockLumaEq4_c( pPix, iStride, 1, iAlpha, iBeta);
+void_t DeblockLumaEq4V_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
+  DeblockLumaEq4_c (pPix, iStride, 1, iAlpha, iBeta);  // iStrideX = iStride, iStrideY = 1: taps one row apart, positions one byte apart
 }
-void_t DeblockLumaEq4H_c( uint8_t *pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta )
-{
-	DeblockLumaEq4_c( pPix, 1, iStride, iAlpha, iBeta );
+void_t DeblockLumaEq4H_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
+  DeblockLumaEq4_c (pPix, 1, iStride, iAlpha, iBeta);  // iStrideX = 1, iStrideY = iStride: taps one byte apart, positions one row apart
 }
-void_t DeblockChromaLt4_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta, int8_t *pTc )
-{
-	int32_t p0, p1, q0, q1,iDeta;
-	bool_t bDetaP0Q0,bDetaP1P0,bDetaQ1Q0;
+void_t DeblockChromaLt4_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha,
+                           int32_t iBeta, int8_t* pTc) {
+  int32_t p0, p1, q0, q1, iDeta;
+  bool_t bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
 
-	for(int32_t i = 0;i<8;i++)
-	{
-		int32_t iTc0 = pTc[i>>1];
-		if(iTc0 >0)
-		{
-		p0 = pPixCb[-iStrideX];
-		p1 = pPixCb[-2*iStrideX];
-		q0 = pPixCb[0];
-		q1 = pPixCb[iStrideX];		
+  for (int32_t i = 0; i < 8; i++) {
+    int32_t iTc0 = pTc[i >> 1];
+    if (iTc0 > 0) {
+      p0 = pPixCb[-iStrideX];
+      p1 = pPixCb[-2 * iStrideX];
+      q0 = pPixCb[0];
+      q1 = pPixCb[iStrideX];
 
-		bDetaP0Q0 =  WELS_ABS( p0 - q0 ) < iAlpha;
-		bDetaP1P0 =  WELS_ABS( p1 - p0 ) < iBeta;
-		bDetaQ1Q0 = WELS_ABS( q1 - q0 ) < iBeta;
-		if( bDetaP0Q0&&bDetaP1P0 &&	bDetaQ1Q0) 
-		{
-			iDeta = WELS_CLIP3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -iTc0, iTc0 );
-			pPixCb[-iStrideX] = WELS_CLIP1( p0 + iDeta );    /* p0' */
-			pPixCb[0]  = WELS_CLIP1( q0 - iDeta );    /* q0' */
-		}
-	
+      bDetaP0Q0 =  WELS_ABS (p0 - q0) < iAlpha;
+      bDetaP1P0 =  WELS_ABS (p1 - p0) < iBeta;
+      bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
+      if (bDetaP0Q0 && bDetaP1P0 &&	bDetaQ1Q0) {
+        iDeta = WELS_CLIP3 ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -iTc0, iTc0);
+        pPixCb[-iStrideX] = WELS_CLIP1 (p0 + iDeta);     /* p0' */
+        pPixCb[0]  = WELS_CLIP1 (q0 - iDeta);     /* q0' */
+      }
 
-		p0 = pPixCr[-iStrideX];
-		p1 = pPixCr[-2*iStrideX];
-		q0 = pPixCr[0];
-		q1 = pPixCr[iStrideX];	
 
-		bDetaP0Q0 =  WELS_ABS( p0 - q0 ) < iAlpha;
-		bDetaP1P0 =  WELS_ABS( p1 - p0 ) < iBeta;
-		bDetaQ1Q0 = WELS_ABS( q1 - q0 ) < iBeta;
+      p0 = pPixCr[-iStrideX];
+      p1 = pPixCr[-2 * iStrideX];
+      q0 = pPixCr[0];
+      q1 = pPixCr[iStrideX];
 
-		if( bDetaP0Q0&&bDetaP1P0 &&	bDetaQ1Q0) 
-		{
-			iDeta = WELS_CLIP3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -iTc0, iTc0 );
-			pPixCr[-iStrideX] = WELS_CLIP1( p0 + iDeta );    /* p0' */
-			pPixCr[0]  = WELS_CLIP1( q0 - iDeta );    /* q0' */
-		}
-		}
-		pPixCb += iStrideY;
-		pPixCr += iStrideY;
-	}
+      bDetaP0Q0 =  WELS_ABS (p0 - q0) < iAlpha;
+      bDetaP1P0 =  WELS_ABS (p1 - p0) < iBeta;
+      bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
+
+      if (bDetaP0Q0 && bDetaP1P0 &&	bDetaQ1Q0) {
+        iDeta = WELS_CLIP3 ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -iTc0, iTc0);
+        pPixCr[-iStrideX] = WELS_CLIP1 (p0 + iDeta);     /* p0' */
+        pPixCr[0]  = WELS_CLIP1 (q0 - iDeta);     /* q0' */
+      }
+    }
+    pPixCb += iStrideY;
+    pPixCr += iStrideY;
+  }
 }
-void_t DeblockChromaEq4_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta )
-{
-	int32_t i = 0, d = 0;
-	int32_t p0, p1, q0, q1;
-	bool_t bDetaP0Q0,bDetaP1P0,bDetaQ1Q0;
-	for(int32_t i =0;i<8;i++)
-	{
-		    //cb
-			p0 = pPixCb[-iStrideX];
-			p1 = pPixCb[-2*iStrideX];
-			q0 = pPixCb[0];
-			q1 = pPixCb[iStrideX];
-			bDetaP0Q0 = WELS_ABS( p0 - q0 ) < iAlpha;
-			bDetaP1P0 = WELS_ABS( p1 - p0 ) < iBeta;
-			bDetaQ1Q0 = WELS_ABS( q1 - q0 ) < iBeta;
-			if(bDetaP0Q0&&bDetaP1P0&&bDetaQ1Q0)
-			{
-				pPixCb[-iStrideX] = ( (p1 << 1) + p0 + q1 + 2 ) >> 2;   /* p0' */
-				pPixCb[0]  = ( (q1 << 1) + q0 + p1 + 2 ) >> 2;   /* q0' */
-			}
-			
-			//cr
-			p0 = pPixCr[-iStrideX];
-			p1 = pPixCr[-2*iStrideX];
-			q0 = pPixCr[0];
-			q1 = pPixCr[iStrideX];
-			bDetaP0Q0 = WELS_ABS( p0 - q0 ) < iAlpha;
-			bDetaP1P0 = WELS_ABS( p1 - p0 ) < iBeta;
-			bDetaQ1Q0 = WELS_ABS( q1 - q0 ) < iBeta;
-			if(bDetaP0Q0&&bDetaP1P0&&bDetaQ1Q0)
-			{
-				pPixCr[-iStrideX] = ( (p1 << 1) + p0 + q1 + 2 ) >> 2;   /* p0' */
-				pPixCr[0]  = ( (q1 << 1) + q0 + p1 + 2 ) >> 2;   /* q0' */
-			}
-			pPixCr += iStrideY;	
-			pPixCb += iStrideY;	
-	}
+void_t DeblockChromaEq4_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha,
+                           int32_t iBeta) {
+  // Unclipped chroma edge filter: visits 8 positions iStrideY apart on both planes; taps are iStrideX apart.
+  int32_t p0, p1, q0, q1;
+  bool_t bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
+  for (int32_t i = 0; i < 8; i++) {
+    // Cb plane: p1, p0 lie before the edge, q0 (= pPixCb[0]) and q1 after it
+    p0 = pPixCb[-iStrideX];
+    p1 = pPixCb[-2 * iStrideX];
+    q0 = pPixCb[0];
+    q1 = pPixCb[iStrideX];
+    bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
+    bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
+    bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
+    if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {  // rewrite p0/q0 only when all three gradient tests pass
+      pPixCb[-iStrideX] = ((p1 << 1) + p0 + q1 + 2) >> 2;     /* p0' */
+      pPixCb[0]  = ((q1 << 1) + q0 + p1 + 2) >> 2;     /* q0' */
+    }
+
+    // Cr plane: identical filter, tested and applied independently of Cb
+    p0 = pPixCr[-iStrideX];
+    p1 = pPixCr[-2 * iStrideX];
+    q0 = pPixCr[0];
+    q1 = pPixCr[iStrideX];
+    bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
+    bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
+    bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
+    if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
+      pPixCr[-iStrideX] = ((p1 << 1) + p0 + q1 + 2) >> 2;     /* p0' */
+      pPixCr[0]  = ((q1 << 1) + q0 + p1 + 2) >> 2;     /* q0' */
+    }
+    pPixCr += iStrideY;  // both plane pointers advance together to the next position
+    pPixCb += iStrideY;
+  }
 }
-void_t DeblockChromaLt4V_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *tc )
-{
-	DeblockChromaLt4_c( pPixCb, pPixCr, iStride, 1, iAlpha, iBeta, tc );
+void_t DeblockChromaLt4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+                            int8_t* tc) {
+  DeblockChromaLt4_c (pPixCb, pPixCr, iStride, 1, iAlpha, iBeta, tc);  // iStrideX = iStride, iStrideY = 1: taps one row apart, positions one byte apart
 }
-void_t DeblockChromaLt4H_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *tc )
-{
-	DeblockChromaLt4_c( pPixCb, pPixCr, 1, iStride, iAlpha, iBeta, tc );
+void_t DeblockChromaLt4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+                            int8_t* tc) {
+  DeblockChromaLt4_c (pPixCb, pPixCr, 1, iStride, iAlpha, iBeta, tc);  // iStrideX = 1, iStrideY = iStride: taps one byte apart, positions one row apart
 }
-void_t DeblockChromaEq4V_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta )
-{
-	DeblockChromaEq4_c( pPixCb, pPixCr, iStride, 1, iAlpha, iBeta );
+void_t DeblockChromaEq4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
+  DeblockChromaEq4_c (pPixCb, pPixCr, iStride, 1, iAlpha, iBeta);  // iStrideX = iStride, iStrideY = 1: taps one row apart, positions one byte apart
 }
-void_t DeblockChromaEq4H_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta )
-{
-	DeblockChromaEq4_c( pPixCb, pPixCr, 1, iStride, iAlpha, iBeta );
+void_t DeblockChromaEq4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
+  DeblockChromaEq4_c (pPixCb, pPixCr, 1, iStride, iAlpha, iBeta);  // iStrideX = 1, iStrideY = iStride: taps one byte apart, positions one row apart
 }
 
-#ifdef X86_ASM
-extern "C" {
-void DeblockLumaLt4H_sse2(uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTc)
-{
-    FORCE_STACK_ALIGN_1D(uint8_t,  uiBuf,   16*8, 16);
-    
-    DeblockLumaTransposeH2V_sse2(pPixY - 4, iStride, &uiBuf[0]);
-	DeblockLumaLt4V_sse2(&uiBuf[4*16], 16, iAlpha, iBeta, pTc);
-	DeblockLumaTransposeV2H_sse2(pPixY - 4, iStride, &uiBuf[0]);
-}
-
-void DeblockLumaEq4H_sse2(uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta)
-{
-	FORCE_STACK_ALIGN_1D(uint8_t,  uiBuf,   16*8, 16);
-    
-    DeblockLumaTransposeH2V_sse2(pPixY - 4, iStride, &uiBuf[0]);
-	DeblockLumaEq4V_sse2(&uiBuf[4*16], 16, iAlpha, iBeta);
-	DeblockLumaTransposeV2H_sse2(pPixY - 4, iStride, &uiBuf[0]);
-}
-
-}
-
+#ifdef X86_ASM
+extern "C" {
+  void DeblockLumaLt4H_sse2 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc) {
+    FORCE_STACK_ALIGN_1D (uint8_t,  uiBuf,   16 * 8, 16);  // local uiBuf declared through the alignment macro (16 * 8 elements)
+
+    DeblockLumaTransposeH2V_sse2 (pPixY - 4, iStride, &uiBuf[0]);  // presumably transposes samples starting 4 bytes before pPixY into uiBuf -- confirm in the asm
+    DeblockLumaLt4V_sse2 (&uiBuf[4 * 16], 16, iAlpha, iBeta, pTc);  // V variant run on the buffer: stride 16, starting 4 * 16 bytes in
+    DeblockLumaTransposeV2H_sse2 (pPixY - 4, iStride, &uiBuf[0]);  // presumably writes the filtered buffer back to the picture -- confirm in the asm
+  }
+
+  void DeblockLumaEq4H_sse2 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
+    FORCE_STACK_ALIGN_1D (uint8_t,  uiBuf,   16 * 8, 16);  // local uiBuf declared through the alignment macro (16 * 8 elements)
+
+    DeblockLumaTransposeH2V_sse2 (pPixY - 4, iStride, &uiBuf[0]);  // same three-step sequence as DeblockLumaLt4H_sse2
+    DeblockLumaEq4V_sse2 (&uiBuf[4 * 16], 16, iAlpha, iBeta);  // Eq4 V variant: no pTc argument
+    DeblockLumaTransposeV2H_sse2 (pPixY - 4, iStride, &uiBuf[0]);
+  }
+
+}
+
 #endif
 /*!
  * \brief	AVC slice deblocking filtering target layer
@@ -904,78 +827,70 @@
  *
  * \return	NONE
  */
-void_t WelsDeblockingFilterSlice( PWelsDecoderContext pCtx, PDeblockingFilterMbFunc pDeblockMb )
-{
-	PDqLayer pCurDqLayer = pCtx->pCurDqLayer;
-	PSliceHeaderExt pSliceHeaderExt = &pCurDqLayer->sLayerInfo.sSliceInLayer.sSliceHeaderExt;
-	int32_t iMbWidth  = pCurDqLayer->iMbWidth;
-	int32_t iTotalMbCount = pSliceHeaderExt->sSliceHeader.pSps->uiTotalMbCount;
+void_t WelsDeblockingFilterSlice (PWelsDecoderContext pCtx, PDeblockingFilterMbFunc pDeblockMb) {
+  PDqLayer pCurDqLayer = pCtx->pCurDqLayer;
+  PSliceHeaderExt pSliceHeaderExt = &pCurDqLayer->sLayerInfo.sSliceInLayer.sSliceHeaderExt;
+  int32_t iMbWidth  = pCurDqLayer->iMbWidth;
+  int32_t iTotalMbCount = pSliceHeaderExt->sSliceHeader.pSps->uiTotalMbCount;
 
-	SDeblockingFilter pFilter = {0};
+  SDeblockingFilter pFilter = {0};
 
-	PFmo pFmo = pCtx->pFmo;
-	int32_t iNextMbXyIndex = 0;
-	int32_t iTotalNumMb = pCurDqLayer->sLayerInfo.sSliceInLayer.iTotalMbInCurSlice;
-	int32_t iCountNumMb = 0;
-	int32_t iBoundryFlag;
-	int32_t iFilterIdc = pCurDqLayer->sLayerInfo.sSliceInLayer.sSliceHeaderExt.sSliceHeader.uiDisableDeblockingFilterIdc;
+  PFmo pFmo = pCtx->pFmo;
+  int32_t iNextMbXyIndex = 0;
+  int32_t iTotalNumMb = pCurDqLayer->sLayerInfo.sSliceInLayer.iTotalMbInCurSlice;
+  int32_t iCountNumMb = 0;
+  int32_t iBoundryFlag;
+  int32_t iFilterIdc = pCurDqLayer->sLayerInfo.sSliceInLayer.sSliceHeaderExt.sSliceHeader.uiDisableDeblockingFilterIdc;
 
-	/* Step1: parameters set */	
-	pFilter.pCsData[0] = pCtx->pDec->pData[0];
-	pFilter.pCsData[1] = pCtx->pDec->pData[1];
-	pFilter.pCsData[2] = pCtx->pDec->pData[2];
-	
-	pFilter.iCsStride[0] = pCtx->pDec->iLinesize[0];
-	pFilter.iCsStride[1] = pCtx->pDec->iLinesize[1];
-	
-	pFilter.eSliceType = (ESliceType) pCurDqLayer->sLayerInfo.sSliceInLayer.eSliceType;
-	
-	pFilter.iSliceAlphaC0Offset = pSliceHeaderExt->sSliceHeader.iSliceAlphaC0Offset;
-	pFilter.iSliceBetaOffset     = pSliceHeaderExt->sSliceHeader.iSliceBetaOffset;
+  /* Step1: parameters set */
+  pFilter.pCsData[0] = pCtx->pDec->pData[0];
+  pFilter.pCsData[1] = pCtx->pDec->pData[1];
+  pFilter.pCsData[2] = pCtx->pDec->pData[2];
 
-	pFilter.pLoopf = &pCtx->sDeblockingFunc;
+  pFilter.iCsStride[0] = pCtx->pDec->iLinesize[0];
+  pFilter.iCsStride[1] = pCtx->pDec->iLinesize[1];
 
-	/* Step2: macroblock deblocking */
-    if( 0 == iFilterIdc || 2 == iFilterIdc )
-    {
-		iNextMbXyIndex = pSliceHeaderExt->sSliceHeader.iFirstMbInSlice;
-		pCurDqLayer->iMbX  = iNextMbXyIndex % iMbWidth;
-		pCurDqLayer->iMbY  = iNextMbXyIndex / iMbWidth; 
-		pCurDqLayer->iMbXyIndex = iNextMbXyIndex;
+  pFilter.eSliceType = (ESliceType) pCurDqLayer->sLayerInfo.sSliceInLayer.eSliceType;
 
-		do 
-		{
-			iBoundryFlag = DeblockingAvailableNoInterlayer(pCurDqLayer, iFilterIdc);
+  pFilter.iSliceAlphaC0Offset = pSliceHeaderExt->sSliceHeader.iSliceAlphaC0Offset;
+  pFilter.iSliceBetaOffset     = pSliceHeaderExt->sSliceHeader.iSliceBetaOffset;
 
-			pDeblockMb( pCurDqLayer, &pFilter, iBoundryFlag );
+  pFilter.pLoopf = &pCtx->sDeblockingFunc;
 
-			++iCountNumMb;
-			if ( iCountNumMb >= iTotalNumMb )
-			{
-				break;
-			}
+  /* Step2: macroblock deblocking */
+  if (0 == iFilterIdc || 2 == iFilterIdc) {
+    iNextMbXyIndex = pSliceHeaderExt->sSliceHeader.iFirstMbInSlice;
+    pCurDqLayer->iMbX  = iNextMbXyIndex % iMbWidth;
+    pCurDqLayer->iMbY  = iNextMbXyIndex / iMbWidth;
+    pCurDqLayer->iMbXyIndex = iNextMbXyIndex;
 
-			if ( pSliceHeaderExt->sSliceHeader.pPps->uiNumSliceGroups > 1 )
-			{
-				iNextMbXyIndex = FmoNextMb( pFmo, iNextMbXyIndex );
-			}
-			else
-			{
-				++iNextMbXyIndex;
-			}
-			if ( -1 == iNextMbXyIndex || iNextMbXyIndex >= iTotalMbCount )	// slice group boundary or end of a frame
-			{
-				break;
-			}
+    do {
+      iBoundryFlag = DeblockingAvailableNoInterlayer (pCurDqLayer, iFilterIdc);
 
-			pCurDqLayer->iMbX  = iNextMbXyIndex % iMbWidth;
-			pCurDqLayer->iMbY  = iNextMbXyIndex / iMbWidth;
-			pCurDqLayer->iMbXyIndex = iNextMbXyIndex;
-		} while ( 1 );
-	}
+      pDeblockMb (pCurDqLayer, &pFilter, iBoundryFlag);
+
+      ++iCountNumMb;
+      if (iCountNumMb >= iTotalNumMb) {
+        break;
+      }
+
+      if (pSliceHeaderExt->sSliceHeader.pPps->uiNumSliceGroups > 1) {
+        iNextMbXyIndex = FmoNextMb (pFmo, iNextMbXyIndex);
+      } else {
+        ++iNextMbXyIndex;
+      }
+      if (-1 == iNextMbXyIndex || iNextMbXyIndex >= iTotalMbCount) {	// slice group boundary or end of a frame
+        break;
+      }
+
+      pCurDqLayer->iMbX  = iNextMbXyIndex % iMbWidth;
+      pCurDqLayer->iMbY  = iNextMbXyIndex / iMbWidth;
+      pCurDqLayer->iMbXyIndex = iNextMbXyIndex;
+    } while (1);
+  }
 }
 /*!
- * \brief	deblocking module initialize 
+ * \brief	deblocking module initialize
  *
  * \param	pf
  *          cpu
@@ -982,30 +897,29 @@
  *
  * \return	NONE
  */
- 
-void_t  DeblockingInit( SDeblockingFunc  *pFunc,  int32_t iCpu )
-{
-	pFunc->pfLumaDeblockingLT4Ver		= DeblockLumaLt4V_c;
-	pFunc->pfLumaDeblockingEQ4Ver		= DeblockLumaEq4V_c;
-	pFunc->pfLumaDeblockingLT4Hor		= DeblockLumaLt4H_c;
-	pFunc->pfLumaDeblockingEQ4Hor		= DeblockLumaEq4H_c;
 
-	pFunc->pfChromaDeblockingLT4Ver	    = DeblockChromaLt4V_c;
-	pFunc->pfChromaDeblockingEQ4Ver	    = DeblockChromaEq4V_c;
-	pFunc->pfChromaDeblockingLT4Hor	    = DeblockChromaLt4H_c;
-	pFunc->pfChromaDeblockinEQ4Hor	    = DeblockChromaEq4H_c;
+void_t  DeblockingInit (SDeblockingFunc*  pFunc,  int32_t iCpu) {
+  pFunc->pfLumaDeblockingLT4Ver		= DeblockLumaLt4V_c;
+  pFunc->pfLumaDeblockingEQ4Ver		= DeblockLumaEq4V_c;
+  pFunc->pfLumaDeblockingLT4Hor		= DeblockLumaLt4H_c;
+  pFunc->pfLumaDeblockingEQ4Hor		= DeblockLumaEq4H_c;
 
-#ifdef X86_ASM
-	if( iCpu & WELS_CPU_SSE2 ){
-	    pFunc->pfLumaDeblockingLT4Ver	= DeblockLumaLt4V_sse2;
-	    pFunc->pfLumaDeblockingEQ4Ver	= DeblockLumaEq4V_sse2;
-		pFunc->pfLumaDeblockingLT4Hor   = DeblockLumaLt4H_sse2;
-		pFunc->pfLumaDeblockingEQ4Hor   = DeblockLumaEq4H_sse2;
-	    pFunc->pfChromaDeblockingLT4Ver	= DeblockChromaLt4V_sse2;
-	    pFunc->pfChromaDeblockingEQ4Ver	= DeblockChromaEq4V_sse2;
-	    pFunc->pfChromaDeblockingLT4Hor	= DeblockChromaLt4H_sse2;
-	    pFunc->pfChromaDeblockinEQ4Hor	= DeblockChromaEq4H_sse2;
-	}
+  pFunc->pfChromaDeblockingLT4Ver	    = DeblockChromaLt4V_c;
+  pFunc->pfChromaDeblockingEQ4Ver	    = DeblockChromaEq4V_c;
+  pFunc->pfChromaDeblockingLT4Hor	    = DeblockChromaLt4H_c;
+  pFunc->pfChromaDeblockinEQ4Hor	    = DeblockChromaEq4H_c;
+
+#ifdef X86_ASM
+  if (iCpu & WELS_CPU_SSE2) {
+    pFunc->pfLumaDeblockingLT4Ver	= DeblockLumaLt4V_sse2;
+    pFunc->pfLumaDeblockingEQ4Ver	= DeblockLumaEq4V_sse2;
+    pFunc->pfLumaDeblockingLT4Hor   = DeblockLumaLt4H_sse2;
+    pFunc->pfLumaDeblockingEQ4Hor   = DeblockLumaEq4H_sse2;
+    pFunc->pfChromaDeblockingLT4Ver	= DeblockChromaLt4V_sse2;
+    pFunc->pfChromaDeblockingEQ4Ver	= DeblockChromaEq4V_sse2;
+    pFunc->pfChromaDeblockingLT4Hor	= DeblockChromaLt4H_sse2;
+    pFunc->pfChromaDeblockinEQ4Hor	= DeblockChromaEq4H_sse2;
+  }
 #endif
 
 }
--- a/codec/decoder/core/src/decode_mb_aux.cpp
+++ b/codec/decoder/core/src/decode_mb_aux.cpp
@@ -38,97 +38,90 @@
 namespace WelsDec {
 
 #define MAX_NEG_CROP 1024
-uint8_t g_ClipTable[256 + 2 * MAX_NEG_CROP];	//the front 1024 is 0, the back 1024 is 255, the middle 256 elements is 0-255
+uint8_t g_ClipTable[256 + 2 *
+                    MAX_NEG_CROP];	// layout: the first MAX_NEG_CROP (1024) entries are 0, the middle 256 entries hold 0..255, the last 1024 entries are 255
 
 
 /* init pClip table to pClip the final dct data */
-void_t InitDctClipTable(void_t)
-{
-	uint8_t *p		        = &g_ClipTable[0];
-	const int32_t kiLength	= MAX_NEG_CROP * sizeof(uint8_t);
-	int32_t i               = 0;
-	
-	do
-    {
-		const int32_t kiIdx = MAX_NEG_CROP + i;
+void_t InitDctClipTable (void_t) {
+  uint8_t* p		        = &g_ClipTable[0];
+  const int32_t kiLength	= MAX_NEG_CROP * sizeof (uint8_t);
+  int32_t i               = 0;
 
-		p[kiIdx]	= i;
-		p[1+kiIdx]	= 1+i;
-		p[2+kiIdx]	= 2+i;
-		p[3+kiIdx]	= 3+i;
+  do {
+    const int32_t kiIdx = MAX_NEG_CROP + i;
 
-		i += 4;
-	} while(i < 256);
+    p[kiIdx]	= i;
+    p[1 + kiIdx]	= 1 + i;
+    p[2 + kiIdx]	= 2 + i;
+    p[3 + kiIdx]	= 3 + i;
 
-	memset( p, 0, kiLength);
-	memset( p + MAX_NEG_CROP + 256, 0xFF, kiLength);
+    i += 4;
+  } while (i < 256);
+
+  memset (p, 0, kiLength);
+  memset (p + MAX_NEG_CROP + 256, 0xFF, kiLength);
 }
 
 //NOTE::: p_RS should NOT be modified and it will lead to mismatch with JSVM.
 //        so should allocate kA array to store the temporary value (idct).
-void_t IdctResAddPred_c(uint8_t *pPred, const int32_t kiStride, int16_t *pRs)
-{
-	int16_t iSrc[16];
+void_t IdctResAddPred_c (uint8_t* pPred, const int32_t kiStride, int16_t* pRs) {
+  int16_t iSrc[16];
 
-	uint8_t *pDst			= pPred;
-	const int32_t kiStride2	= kiStride<<1;
-	const int32_t kiStride3	= kiStride + kiStride2;
-	uint8_t *pClip			= &g_ClipTable[MAX_NEG_CROP];	
-	int32_t i;
+  uint8_t* pDst			= pPred;
+  const int32_t kiStride2	= kiStride << 1;
+  const int32_t kiStride3	= kiStride + kiStride2;
+  uint8_t* pClip			= &g_ClipTable[MAX_NEG_CROP];
+  int32_t i;
 
-	for(i=0; i<4; i++)
-	{
- 		const int32_t kiY  = i<<2;
-		const int32_t kiT0 = pRs[kiY] + pRs[kiY+2];
-		const int32_t kiT1 = pRs[kiY] - pRs[kiY+2];
-		const int32_t kiT2 = (pRs[kiY+1]>>1) - pRs[kiY+3];
-		const int32_t kiT3 = pRs[kiY+1] + (pRs[kiY+3]>>1);
+  for (i = 0; i < 4; i++) {
+    const int32_t kiY  = i << 2;
+    const int32_t kiT0 = pRs[kiY] + pRs[kiY + 2];
+    const int32_t kiT1 = pRs[kiY] - pRs[kiY + 2];
+    const int32_t kiT2 = (pRs[kiY + 1] >> 1) - pRs[kiY + 3];
+    const int32_t kiT3 = pRs[kiY + 1] + (pRs[kiY + 3] >> 1);
 
-		iSrc[kiY] = kiT0 + kiT3;
-		iSrc[kiY+1] = kiT1 + kiT2;
-		iSrc[kiY+2] = kiT1 - kiT2;
-		iSrc[kiY+3] = kiT0 - kiT3;
-	}
+    iSrc[kiY] = kiT0 + kiT3;
+    iSrc[kiY + 1] = kiT1 + kiT2;
+    iSrc[kiY + 2] = kiT1 - kiT2;
+    iSrc[kiY + 3] = kiT0 - kiT3;
+  }
 
-	for(i=0; i<4; i++)
-	{
-		int32_t kT1	= iSrc[i]	+ iSrc[i+8];
-		int32_t kT2	= iSrc[i+4] + (iSrc[i+12]>>1);
-		int32_t kT3	= (32 + kT1 + kT2) >> 6;
-		int32_t kT4	= (32 + kT1 - kT2) >> 6;
-		
-		pDst[i] = pClip[ kT3 + pPred[i] ];
-		pDst[i+kiStride3] = pClip[ kT4 + pPred[i+kiStride3] ];
+  for (i = 0; i < 4; i++) {
+    int32_t kT1	= iSrc[i]	+ iSrc[i + 8];
+    int32_t kT2	= iSrc[i + 4] + (iSrc[i + 12] >> 1);
+    int32_t kT3	= (32 + kT1 + kT2) >> 6;
+    int32_t kT4	= (32 + kT1 - kT2) >> 6;
 
-		kT1	= iSrc[i] - iSrc[i+8];
-		kT2	= (iSrc[i+4]>>1) - iSrc[i+12];
-		pDst[i+kiStride] = pClip[ ((32 + kT1 + kT2) >> 6) + pDst[i+kiStride] ];
-		pDst[i+kiStride2] = pClip[ ((32 + kT1 - kT2) >> 6) + pDst[i+kiStride2] ];
-	}
+    pDst[i] = pClip[ kT3 + pPred[i] ];
+    pDst[i + kiStride3] = pClip[ kT4 + pPred[i + kiStride3] ];
+
+    kT1	= iSrc[i] - iSrc[i + 8];
+    kT2	= (iSrc[i + 4] >> 1) - iSrc[i + 12];
+    pDst[i + kiStride] = pClip[ ((32 + kT1 + kT2) >> 6) + pDst[i + kiStride] ];
+    pDst[i + kiStride2] = pClip[ ((32 + kT1 - kT2) >> 6) + pDst[i + kiStride2] ];
+  }
 }
 
-void_t GetI4LumaIChromaAddrTable(int32_t *pBlockOffset, const int32_t kiYStride, const int32_t kiUVStride)
-{
-	int32_t *pOffset	   = pBlockOffset;
-	int32_t i;
-	const uint8_t kuiScan0 = g_kuiScan8[0];
+void_t GetI4LumaIChromaAddrTable (int32_t* pBlockOffset, const int32_t kiYStride, const int32_t kiUVStride) {
+  int32_t* pOffset	   = pBlockOffset;
+  int32_t i;
+  const uint8_t kuiScan0 = g_kuiScan8[0];
 
-	for(i=0; i<16; i++)
-	{
-		const uint32_t kuiA = g_kuiScan8[i] - kuiScan0;
-		const uint32_t kuiX = kuiA & 0x07;
-		const uint32_t kuiY = kuiA >> 3;
+  for (i = 0; i < 16; i++) {
+    const uint32_t kuiA = g_kuiScan8[i] - kuiScan0;
+    const uint32_t kuiX = kuiA & 0x07;
+    const uint32_t kuiY = kuiA >> 3;
 
-		pOffset[i]= (kuiX + kiYStride* kuiY) << 2;
-	}
+    pOffset[i] = (kuiX + kiYStride * kuiY) << 2;
+  }
 
-	for(i=0; i<4; i++)
-	{
-		const uint32_t kuiA = g_kuiScan8[i] - kuiScan0;
+  for (i = 0; i < 4; i++) {
+    const uint32_t kuiA = g_kuiScan8[i] - kuiScan0;
 
-		pOffset[16+i]=
-		pOffset[20+i]= ((kuiA & 0x07) + (kiUVStride/*>>1*/) * (kuiA >> 3)) << 2;
-	}
+    pOffset[16 + i] =
+      pOffset[20 + i] = ((kuiA & 0x07) + (kiUVStride/*>>1*/) * (kuiA >> 3)) << 2;
+  }
 }
 
 } // namespace WelsDec
\ No newline at end of file
--- a/codec/decoder/core/src/decode_slice.cpp
+++ b/codec/decoder/core/src/decode_slice.cpp
@@ -60,1302 +60,1114 @@
 
 namespace WelsDec {
 
-int32_t WelsTargetSliceConstruction( PWelsDecoderContext pCtx )
-{
-	int32_t iPreQP = 0;
+int32_t WelsTargetSliceConstruction (PWelsDecoderContext pCtx) { // rebuilds each MB of the current slice, then deblocks I/P slices; returns 0, or -1 on error
+  int32_t iPreQP = 0;
 
-	PDqLayer pCurLayer = pCtx->pCurDqLayer;
-	PSlice pCurSlice = &pCurLayer->sLayerInfo.sSliceInLayer;
-	PSliceHeader pSliceHeader = &pCurSlice->sSliceHeaderExt.sSliceHeader;
+  PDqLayer pCurLayer = pCtx->pCurDqLayer;
+  PSlice pCurSlice = &pCurLayer->sLayerInfo.sSliceInLayer;
+  PSliceHeader pSliceHeader = &pCurSlice->sSliceHeaderExt.sSliceHeader;
 
-	int32_t iTotalMbTargetLayer = pSliceHeader->pSps->uiTotalMbCount;
+  int32_t iTotalMbTargetLayer = pSliceHeader->pSps->uiTotalMbCount;
 
-	int32_t iCurLayerWidth  = pCurLayer->iMbWidth << 4; 
-	int32_t iCurLayerHeight = pCurLayer->iMbHeight << 4;
+  int32_t iCurLayerWidth  = pCurLayer->iMbWidth << 4; // MB width * 16
+  int32_t iCurLayerHeight = pCurLayer->iMbHeight << 4; // MB height * 16
 
-	int32_t iNextMbXyIndex = 0;
-	PFmo pFmo = pCtx->pFmo;
+  int32_t iNextMbXyIndex = 0;
+  PFmo pFmo = pCtx->pFmo;
 
-	int32_t iTotalNumMb = pCurSlice->iTotalMbInCurSlice;
-	int32_t iCountNumMb = 0;
-	PDeblockingFilterMbFunc pDeblockMb;
+  int32_t iTotalNumMb = pCurSlice->iTotalMbInCurSlice;
+  int32_t iCountNumMb = 0;
+  PDeblockingFilterMbFunc pDeblockMb;
 
-	if ( !pCtx->bAvcBasedFlag && iCurLayerWidth != pCtx->iCurSeqIntervalMaxPicWidth ) 
-	{
-		return -1;
-	}
+  if (!pCtx->bAvcBasedFlag && iCurLayerWidth != pCtx->iCurSeqIntervalMaxPicWidth) { // width check applies only when bAvcBasedFlag is unset
+    return -1;
+  }
 
-	iNextMbXyIndex   = pSliceHeader->iFirstMbInSlice;
-	pCurLayer->iMbX  = iNextMbXyIndex % pCurLayer->iMbWidth;
-	pCurLayer->iMbY  = iNextMbXyIndex / pCurLayer->iMbWidth; 
-	pCurLayer->iMbXyIndex = iNextMbXyIndex;	
+  iNextMbXyIndex   = pSliceHeader->iFirstMbInSlice; // start at the first MB of the slice
+  pCurLayer->iMbX  = iNextMbXyIndex % pCurLayer->iMbWidth;
+  pCurLayer->iMbY  = iNextMbXyIndex / pCurLayer->iMbWidth;
+  pCurLayer->iMbXyIndex = iNextMbXyIndex;
 
-	if ( 0 == iNextMbXyIndex )
-	{
-		pCurLayer->pDec->iSpsId = pSliceHeader->iSpsId;
-		pCurLayer->pDec->iPpsId = pSliceHeader->iPpsId;
+  if (0 == iNextMbXyIndex) { // slice starts at MB 0: record SPS/PPS ids and quality id on pDec
+    pCurLayer->pDec->iSpsId = pSliceHeader->iSpsId;
+    pCurLayer->pDec->iPpsId = pSliceHeader->iPpsId;
 
-		pCurLayer->pDec->uiQualityId = pCurLayer->sLayerInfo.sNalHeaderExt.uiQualityId;
-	}
+    pCurLayer->pDec->uiQualityId = pCurLayer->sLayerInfo.sNalHeaderExt.uiQualityId;
+  }
 
-	do 
-	{
-		iPreQP = pCurLayer->pLumaQp[pCurLayer->iMbXyIndex];
-		
-		if ( WelsTargetMbConstruction( pCtx ) )
-		{
-			WelsLog( pCtx, WELS_LOG_WARNING, "WelsTargetSliceConstruction():::MB(%d, %d) construction error. pCurSlice_type:%d\n",
-				pCurLayer->iMbX, pCurLayer->iMbY, pCurSlice->eSliceType );
+  do {
+    iPreQP = pCurLayer->pLumaQp[pCurLayer->iMbXyIndex]; // NOTE(review): iPreQP is written here but never read in this function
 
-			return -1;
-		}
+    if (WelsTargetMbConstruction (pCtx)) { // non-zero return means the MB could not be reconstructed
+      WelsLog (pCtx, WELS_LOG_WARNING, "WelsTargetSliceConstruction():::MB(%d, %d) construction error. pCurSlice_type:%d\n",
+               pCurLayer->iMbX, pCurLayer->iMbY, pCurSlice->eSliceType);
 
-		++iCountNumMb;
-		++pCurLayer->pDec->iTotalNumMbRec;
-		if ( iCountNumMb >= iTotalNumMb )
-		{
-			break;
-		}		
-		if ( pCurLayer->pDec->iTotalNumMbRec > iTotalMbTargetLayer )
-		{
-			WelsLog( pCtx, WELS_LOG_WARNING, "WelsTargetSliceConstruction():::fdec->iTotalNumMbRec:%d, iTotalMbTargetLayer:%d\n",
-				pCurLayer->pDec->iTotalNumMbRec, iTotalMbTargetLayer );
+      return -1;
+    }
 
-			return -1;
-		}		
-		
-		if ( pSliceHeader->pPps->uiNumSliceGroups > 1 )
-		{
-			iNextMbXyIndex = FmoNextMb( pFmo, iNextMbXyIndex );
-		}
-		else
-		{
-			++iNextMbXyIndex;
-		}
-		if ( -1 == iNextMbXyIndex || iNextMbXyIndex >= iTotalMbTargetLayer )	// slice group boundary or end of a frame
-		{
-			break;
-		}
-		pCurLayer->iMbX  = iNextMbXyIndex % pCurLayer->iMbWidth;
-		pCurLayer->iMbY  = iNextMbXyIndex / pCurLayer->iMbWidth;
-		pCurLayer->iMbXyIndex = iNextMbXyIndex;
-	} while (1);
-	
-	pCtx->pDec->iWidthInPixel  = iCurLayerWidth;
-	pCtx->pDec->iHeightInPixel = iCurLayerHeight;
+    ++iCountNumMb;
+    ++pCurLayer->pDec->iTotalNumMbRec;
+    if (iCountNumMb >= iTotalNumMb) { // processed iTotalMbInCurSlice MBs: slice complete
+      break;
+    }
+    if (pCurLayer->pDec->iTotalNumMbRec > iTotalMbTargetLayer) { // more MBs reconstructed than the SPS total
+      WelsLog (pCtx, WELS_LOG_WARNING, "WelsTargetSliceConstruction():::fdec->iTotalNumMbRec:%d, iTotalMbTargetLayer:%d\n",
+               pCurLayer->pDec->iTotalNumMbRec, iTotalMbTargetLayer);
 
-	if((pCurSlice->eSliceType != I_SLICE)&&(pCurSlice->eSliceType != P_SLICE))
-		return 0;
+      return -1;
+    }
 
-    pDeblockMb = WelsDeblockingMb; 
+    if (pSliceHeader->pPps->uiNumSliceGroups > 1) { // several slice groups: FmoNextMb supplies the next index
+      iNextMbXyIndex = FmoNextMb (pFmo, iNextMbXyIndex);
+    } else {
+      ++iNextMbXyIndex;
+    }
+    if (-1 == iNextMbXyIndex || iNextMbXyIndex >= iTotalMbTargetLayer) {	// slice group boundary or end of a frame
+      break;
+    }
+    pCurLayer->iMbX  = iNextMbXyIndex % pCurLayer->iMbWidth;
+    pCurLayer->iMbY  = iNextMbXyIndex / pCurLayer->iMbWidth;
+    pCurLayer->iMbXyIndex = iNextMbXyIndex;
+  } while (1);
 
-	if ( 1 == pSliceHeader->uiDisableDeblockingFilterIdc )
-	{
-		return 0;//NO_SUPPORTED_FILTER_IDX
-	}
-	else
-	{
-		WelsDeblockingFilterSlice( pCtx, pDeblockMb );
+  pCtx->pDec->iWidthInPixel  = iCurLayerWidth;
+  pCtx->pDec->iHeightInPixel = iCurLayerHeight;
 
-	}
-	// any other filter_idc not supported here, 7/22/2010
+  if ((pCurSlice->eSliceType != I_SLICE) && (pCurSlice->eSliceType != P_SLICE)) // only I and P slices go on to deblocking
+    return 0;
 
-	return 0;
+  pDeblockMb = WelsDeblockingMb;
+
+  if (1 == pSliceHeader->uiDisableDeblockingFilterIdc) { // idc 1: return without filtering
+    return 0;//NO_SUPPORTED_FILTER_IDX
+  } else {
+    WelsDeblockingFilterSlice (pCtx, pDeblockMb);
+
+  }
+  // any other filter_idc not supported here, 7/22/2010
+
+  return 0;
 }
 
-int32_t WelsMbInterSampleConstruction( PWelsDecoderContext pCtx, PDqLayer pCurLayer, 
-											  uint8_t* pDstY, uint8_t* pDstU, uint8_t* pDstV, int32_t iStrideL, int32_t iStrideC )
-{
-	int32_t iMbXy = pCurLayer->iMbXyIndex;
-	int32_t i, iIndex, iOffset;
+int32_t WelsMbInterSampleConstruction (PWelsDecoderContext pCtx, PDqLayer pCurLayer,
+                                       uint8_t* pDstY, uint8_t* pDstU, uint8_t* pDstV, int32_t iStrideL, int32_t iStrideC) { // calls pIdctResAddPredFunc on every flagged 16-coefficient block of the MB; always returns 0
+  int32_t iMbXy = pCurLayer->iMbXyIndex;
+  int32_t i, iIndex, iOffset;
 
-	WelsChromaDcIdct( pCurLayer->pScaledTCoeff[iMbXy] + 256 );	// 256 = 16*16
-	WelsChromaDcIdct( pCurLayer->pScaledTCoeff[iMbXy] + 320 );	// 320 = 16*16 + 16*4
+  WelsChromaDcIdct (pCurLayer->pScaledTCoeff[iMbXy] + 256);	// 256 = 16*16
+  WelsChromaDcIdct (pCurLayer->pScaledTCoeff[iMbXy] + 320);	// 320 = 16*16 + 16*4
 
-	for(i=0; i<16; i++) //luma
-	{
-		iIndex = g_kuiMbNonZeroCountIdx[i];
-		if( pCurLayer->pNzc[iMbXy][iIndex] )
-		{
-			iOffset = ((iIndex>>2)<<2) * iStrideL + ((iIndex%4)<<2);			
-			pCtx->pIdctResAddPredFunc( pDstY+iOffset, iStrideL, pCurLayer->pScaledTCoeff[iMbXy]+(i<<4) );
-		}
-	}
+  for (i = 0; i < 16; i++) { //luma
+    iIndex = g_kuiMbNonZeroCountIdx[i];
+    if (pCurLayer->pNzc[iMbXy][iIndex]) { // blocks whose nzc entry is zero are left untouched
+      iOffset = ((iIndex >> 2) << 2) * iStrideL + ((iIndex % 4) << 2); // 4 * (iIndex / 4) lines down, 4 * (iIndex % 4) pixels across
+      pCtx->pIdctResAddPredFunc (pDstY + iOffset, iStrideL, pCurLayer->pScaledTCoeff[iMbXy] + (i << 4)); // 16 coefficients per block
+    }
+  }
 
-	for ( i = 0; i < 4; i++ ) //chroma
-	{
-		iIndex = g_kuiMbNonZeroCountIdx[i+16]; //Cb
-		if ( pCurLayer->pNzc[iMbXy][iIndex] || *(pCurLayer->pScaledTCoeff[iMbXy]+((i+16)<<4)) )
-		{
-			iOffset = (((iIndex-16)>>2)<<2) * iStrideC + (((iIndex-16)%4)<<2);			
-			pCtx->pIdctResAddPredFunc( pDstU+iOffset, iStrideC, pCurLayer->pScaledTCoeff[iMbXy]+((i+16)<<4) );
-		}
+  for (i = 0; i < 4; i++) { //chroma
+    iIndex = g_kuiMbNonZeroCountIdx[i + 16]; //Cb
+    if (pCurLayer->pNzc[iMbXy][iIndex] || * (pCurLayer->pScaledTCoeff[iMbXy] + ((i + 16) << 4))) { // nzc flag set, or the block's first coefficient is non-zero
+      iOffset = (((iIndex - 16) >> 2) << 2) * iStrideC + (((iIndex - 16) % 4) << 2);
+      pCtx->pIdctResAddPredFunc (pDstU + iOffset, iStrideC, pCurLayer->pScaledTCoeff[iMbXy] + ((i + 16) << 4));
+    }
 
-		iIndex = g_kuiMbNonZeroCountIdx[i+20]; //Cr
-		if ( pCurLayer->pNzc[iMbXy][iIndex] || *(pCurLayer->pScaledTCoeff[iMbXy]+((i+20)<<4)) )
-		{
-			iOffset = (((iIndex-18)>>2)<<2) * iStrideC + (((iIndex-18)%4)<<2);			
-			pCtx->pIdctResAddPredFunc( pDstV+iOffset, iStrideC , pCurLayer->pScaledTCoeff[iMbXy]+((i+20)<<4));
-		}
-	}
+    iIndex = g_kuiMbNonZeroCountIdx[i + 20]; //Cr
+    if (pCurLayer->pNzc[iMbXy][iIndex] || * (pCurLayer->pScaledTCoeff[iMbXy] + ((i + 20) << 4))) { // same test as for Cb
+      iOffset = (((iIndex - 18) >> 2) << 2) * iStrideC + (((iIndex - 18) % 4) << 2); // NOTE(review): Cr subtracts 18 where Cb subtracts 16 - presumably follows the g_kuiMbNonZeroCountIdx layout; confirm
+      pCtx->pIdctResAddPredFunc (pDstV + iOffset, iStrideC , pCurLayer->pScaledTCoeff[iMbXy] + ((i + 20) << 4));
+    }
+  }
 
-	return 0;
+  return 0;
 }
-int32_t WelsMbInterConstruction(PWelsDecoderContext pCtx, PDqLayer pCurLayer)
-{
-	int32_t iMbX = pCurLayer->iMbX;
-	int32_t iMbY = pCurLayer->iMbY;
-	uint8_t  *pDstY, *pDstCb, *pDstCr;
+int32_t WelsMbInterConstruction (PWelsDecoderContext pCtx, PDqLayer pCurLayer) { // inter prediction followed by residual reconstruction for the current MB; always returns 0
+  int32_t iMbX = pCurLayer->iMbX;
+  int32_t iMbY = pCurLayer->iMbY;
+  uint8_t*  pDstY, *pDstCb, *pDstCr;
 
-	int32_t iLumaStride   = pCtx->pDec->iLinesize[0];
-	int32_t iChromaStride = pCtx->pDec->iLinesize[1];
+  int32_t iLumaStride   = pCtx->pDec->iLinesize[0]; // NOTE(review): strides come from pCtx->pDec, data pointers from pCurLayer->pDec - presumably the same picture; confirm
+  int32_t iChromaStride = pCtx->pDec->iLinesize[1];
 
-	pDstY  = pCurLayer->pDec->pData[0] + ((iMbY * iLumaStride + iMbX)<<4);
-	pDstCb = pCurLayer->pDec->pData[1] + ((iMbY * iChromaStride + iMbX)<<3);
-	pDstCr = pCurLayer->pDec->pData[2] + ((iMbY * iChromaStride + iMbX)<<3);
+  pDstY  = pCurLayer->pDec->pData[0] + ((iMbY * iLumaStride + iMbX) << 4); // 16 * (iMbY * stride + iMbX)
+  pDstCb = pCurLayer->pDec->pData[1] + ((iMbY * iChromaStride + iMbX) << 3); // 8 * (iMbY * stride + iMbX)
+  pDstCr = pCurLayer->pDec->pData[2] + ((iMbY * iChromaStride + iMbX) << 3);
 
-	GetInterPred(pDstY, pDstCb, pDstCr, pCtx);
-	WelsMbInterSampleConstruction( pCtx, pCurLayer, pDstY, pDstCb, pDstCr, iLumaStride, iChromaStride );
+  GetInterPred (pDstY, pDstCb, pDstCr, pCtx);
+  WelsMbInterSampleConstruction (pCtx, pCurLayer, pDstY, pDstCb, pDstCr, iLumaStride, iChromaStride); // return value is not checked
 
-	pCtx->sBlockFunc.pWelsSetNonZeroCountFunc(NULL, pCurLayer->pNzc[pCurLayer->iMbXyIndex]);// set all none-zero nzc to 1; dbk can be opti!
-	return 0;
+  pCtx->sBlockFunc.pWelsSetNonZeroCountFunc (NULL,
+      pCurLayer->pNzc[pCurLayer->iMbXyIndex]); // set all non-zero nzc to 1; dbk can be opti!
+  return 0;
 }
 
-void_t WelsLumaDcDequantIdct(int16_t *pBlock, int32_t iQp){
-    const int32_t kiQMul= g_kuiDequantCoeff[iQp][0];
+void_t WelsLumaDcDequantIdct (int16_t* pBlock, int32_t iQp) { // in place: two add/subtract passes over 16 strided entries of pBlock, then scaling by kiQMul
+  const int32_t kiQMul = g_kuiDequantCoeff[iQp][0]; // multiplier taken from column 0 of the table row for iQp
 #define STRIDE 16
-    int32_t i;
-    int32_t iTemp[16]; //FIXME check if this is a good idea
-	int16_t* pBlk = pBlock;
-    static const int32_t kiXOffset[4]={0, STRIDE, STRIDE<<2,  5*STRIDE};
-    static const int32_t kiYOffset[4]={0, STRIDE<<1, STRIDE<<3, 10*STRIDE};
+  int32_t i;
+  int32_t iTemp[16]; //FIXME check if this is a good idea
+  int16_t* pBlk = pBlock;
+  static const int32_t kiXOffset[4] = {0, STRIDE, STRIDE << 2,  5 * STRIDE}; // {0, 16, 64, 80}
+  static const int32_t kiYOffset[4] = {0, STRIDE << 1, STRIDE << 3, 10 * STRIDE}; // {0, 32, 128, 160}
 
-    for(i=0; i<4; i++){
-        const int32_t kiOffset= kiYOffset[i];
-		const int32_t kiX1 = kiOffset + kiXOffset[2];
-		const int32_t kiX2 = STRIDE + kiOffset;
-		const int32_t kiX3 = kiOffset + kiXOffset[3];
-		const int32_t kiI4 = i << 2;	// 4*i
-        const int32_t kiZ0= pBlk[kiOffset] + pBlk[kiX1];
-        const int32_t kiZ1= pBlk[kiOffset] - pBlk[kiX1];
-        const int32_t kiZ2= pBlk[kiX2] - pBlk[kiX3];
-        const int32_t kiZ3= pBlk[kiX2] + pBlk[kiX3];
+  for (i = 0; i < 4; i++) { // first pass: reads four entries of pBlk per iteration, writes iTemp[4*i .. 4*i+3]
+    const int32_t kiOffset = kiYOffset[i];
+    const int32_t kiX1 = kiOffset + kiXOffset[2];
+    const int32_t kiX2 = STRIDE + kiOffset;
+    const int32_t kiX3 = kiOffset + kiXOffset[3];
+    const int32_t kiI4 = i << 2;	// 4*i
+    const int32_t kiZ0 = pBlk[kiOffset] + pBlk[kiX1];
+    const int32_t kiZ1 = pBlk[kiOffset] - pBlk[kiX1];
+    const int32_t kiZ2 = pBlk[kiX2] - pBlk[kiX3];
+    const int32_t kiZ3 = pBlk[kiX2] + pBlk[kiX3];
 
-        iTemp[kiI4]  = kiZ0+kiZ3;
-        iTemp[1+kiI4]= kiZ1+kiZ2;
-        iTemp[2+kiI4]= kiZ1-kiZ2;
-        iTemp[3+kiI4]= kiZ0-kiZ3;
-    }
+    iTemp[kiI4]  = kiZ0 + kiZ3;
+    iTemp[1 + kiI4] = kiZ1 + kiZ2;
+    iTemp[2 + kiI4] = kiZ1 - kiZ2;
+    iTemp[3 + kiI4] = kiZ0 - kiZ3;
+  }
 
-    for(i=0; i<4; i++){
-        const int32_t kiOffset= kiXOffset[i];
-		const int32_t kiI4 = 4 + i;
-        const int32_t kiZ0= iTemp[i] + iTemp[4+kiI4];
-        const int32_t kiZ1= iTemp[i] - iTemp[4+kiI4];
-        const int32_t kiZ2= iTemp[kiI4] - iTemp[8+kiI4];
-        const int32_t kiZ3= iTemp[kiI4] + iTemp[8+kiI4];
+  for (i = 0; i < 4; i++) { // second pass: combines iTemp[i], [4+i], [8+i], [12+i] and writes back to pBlk
+    const int32_t kiOffset = kiXOffset[i];
+    const int32_t kiI4 = 4 + i; // note: 4 + i here, unlike 4 * i in the first pass
+    const int32_t kiZ0 = iTemp[i] + iTemp[4 + kiI4];
+    const int32_t kiZ1 = iTemp[i] - iTemp[4 + kiI4];
+    const int32_t kiZ2 = iTemp[kiI4] - iTemp[8 + kiI4];
+    const int32_t kiZ3 = iTemp[kiI4] + iTemp[8 + kiI4];
 
-        pBlk[kiOffset]= ((kiZ0 + kiZ3)*kiQMul + 2)>>2; //FIXME think about merging this into decode_resdual
-        pBlk[kiYOffset[1] +kiOffset]= ((kiZ1 + kiZ2)*kiQMul + 2)>>2;
-        pBlk[kiYOffset[2] +kiOffset]= ((kiZ1 - kiZ2)*kiQMul + 2)>>2;
-        pBlk[kiYOffset[3] +kiOffset]= ((kiZ0 - kiZ3)*kiQMul + 2)>>2;
-    }
-	#undef STRIDE
+    pBlk[kiOffset] = ((kiZ0 + kiZ3) * kiQMul + 2) >> 2; //FIXME think about merging this into decode_resdual
+    pBlk[kiYOffset[1] + kiOffset] = ((kiZ1 + kiZ2) * kiQMul + 2) >> 2; // + 2 before >> 2 rounds the scaled value
+    pBlk[kiYOffset[2] + kiOffset] = ((kiZ1 - kiZ2) * kiQMul + 2) >> 2;
+    pBlk[kiYOffset[3] + kiOffset] = ((kiZ0 - kiZ3) * kiQMul + 2) >> 2;
+  }
+#undef STRIDE
 }
 
-int32_t WelsMbIntraPredictionConstruction(PWelsDecoderContext pCtx, PDqLayer pCurLayer, bool_t bOutput)
-{
+int32_t WelsMbIntraPredictionConstruction (PWelsDecoderContext pCtx, PDqLayer pCurLayer, bool_t bOutput) { // reconstructs an intra 16x16 or intra 4x4 MB from a local copy of its coefficients; always returns 0
 //seems IPCM should not enter this path
-	int32_t iMbXy = pCurLayer->iMbXyIndex;
+  int32_t iMbXy = pCurLayer->iMbXyIndex;
 
-	FORCE_STACK_ALIGN_1D( int16_t, pTempScaledTCoeff, MB_COEFF_LIST_SIZE, 16 );
+  FORCE_STACK_ALIGN_1D (int16_t, pTempScaledTCoeff, MB_COEFF_LIST_SIZE, 16);
 
-	memcpy(pTempScaledTCoeff, pCurLayer->pScaledTCoeff[iMbXy], 384*sizeof(pCurLayer->pScaledTCoeff[iMbXy][0]));
+  memcpy (pTempScaledTCoeff, pCurLayer->pScaledTCoeff[iMbXy], 384 * sizeof (pCurLayer->pScaledTCoeff[iMbXy][0])); // copies 384 coefficients; assumes MB_COEFF_LIST_SIZE >= 384 - TODO confirm
 
-	WelsFillRecNeededMbInfo(pCtx, bOutput, pCurLayer);
-	
-	if(IS_INTRA16x16(pCurLayer->pMbType[iMbXy]))
-	{
-		int32_t i,j;
-		// really need?
-		for(i=0; i<16; i++)
-		{
-			j = g_kuiLumaDcZigzagScan[i];
-			pTempScaledTCoeff[j] = pCurLayer->pScaledTCoeff[iMbXy][j];
-		}
-		WelsLumaDcDequantIdct(pTempScaledTCoeff, pCurLayer->pLumaQp[iMbXy]);
-		RecI16x16Mb(iMbXy, pCtx,pTempScaledTCoeff,pCurLayer);
+  WelsFillRecNeededMbInfo (pCtx, bOutput, pCurLayer);
 
-		return 0;
-	}
-		
-	if(IS_INTRA4x4(pCurLayer->pMbType[iMbXy]))
-		RecI4x4Mb(iMbXy, pCtx,pTempScaledTCoeff,pCurLayer);
-		
-	return 0;
+  if (IS_INTRA16x16 (pCurLayer->pMbType[iMbXy])) {
+    int32_t i, j;
+    // really need?
+    for (i = 0; i < 16; i++) {
+      j = g_kuiLumaDcZigzagScan[i];
+      pTempScaledTCoeff[j] = pCurLayer->pScaledTCoeff[iMbXy][j]; // NOTE(review): appears redundant with the memcpy above if every j < 384; confirm
+    }
+    WelsLumaDcDequantIdct (pTempScaledTCoeff, pCurLayer->pLumaQp[iMbXy]); // operates on the local copy only
+    RecI16x16Mb (iMbXy, pCtx, pTempScaledTCoeff, pCurLayer);
+
+    return 0;
+  }
+
+  if (IS_INTRA4x4 (pCurLayer->pMbType[iMbXy])) // any other MB type falls through without reconstruction
+    RecI4x4Mb (iMbXy, pCtx, pTempScaledTCoeff, pCurLayer);
+
+  return 0;
 }
 
-int32_t WelsMbInterPrediction(PWelsDecoderContext pCtx, PDqLayer pCurLayer)
-{
-	int32_t iMbX = pCurLayer->iMbX;
-	int32_t iMbY = pCurLayer->iMbY;
-	uint8_t  *pDstY, *pDstCb, *pDstCr;
+int32_t WelsMbInterPrediction (PWelsDecoderContext pCtx, PDqLayer pCurLayer) { // runs GetInterPred at the current MB's position in pDec, no residual step; always returns 0
+  int32_t iMbX = pCurLayer->iMbX;
+  int32_t iMbY = pCurLayer->iMbY;
+  uint8_t*  pDstY, *pDstCb, *pDstCr;
 
-	int32_t iLumaStride   = pCtx->pDec->iLinesize[0];
-	int32_t iChromaStride = pCtx->pDec->iLinesize[1];
+  int32_t iLumaStride   = pCtx->pDec->iLinesize[0]; // NOTE(review): strides come from pCtx->pDec, data pointers from pCurLayer->pDec - presumably the same picture; confirm
+  int32_t iChromaStride = pCtx->pDec->iLinesize[1];
 
-	pDstY  = pCurLayer->pDec->pData[0] + ((iMbY * iLumaStride + iMbX)<<4);
-	pDstCb = pCurLayer->pDec->pData[1] + ((iMbY * iChromaStride + iMbX)<<3);
-	pDstCr = pCurLayer->pDec->pData[2] + ((iMbY * iChromaStride + iMbX)<<3);
+  pDstY  = pCurLayer->pDec->pData[0] + ((iMbY * iLumaStride + iMbX) << 4); // 16 * (iMbY * stride + iMbX)
+  pDstCb = pCurLayer->pDec->pData[1] + ((iMbY * iChromaStride + iMbX) << 3); // 8 * (iMbY * stride + iMbX)
+  pDstCr = pCurLayer->pDec->pData[2] + ((iMbY * iChromaStride + iMbX) << 3);
 
-	GetInterPred(pDstY, pDstCb, pDstCr, pCtx);
+  GetInterPred (pDstY, pDstCb, pDstCr, pCtx);
 
-	return 0;
+  return 0;
 }
 
-void_t WelsMbCopy( uint8_t *pDst, int32_t iStrideDst, uint8_t *pSrc, int32_t iStrideSrc, 
-				 int32_t iHeight, int32_t iWidth )
-{
-	int32_t i;
-	int32_t iOffsetDst = 0, iOffsetSrc = 0;
-	for ( i = 0; i < iHeight; i++ )
-	{
-		memcpy( pDst+iOffsetDst, pSrc+iOffsetSrc, iWidth );
-		iOffsetDst += iStrideDst;
-		iOffsetSrc += iStrideSrc;
-	}
+void_t WelsMbCopy (uint8_t* pDst, int32_t iStrideDst, uint8_t* pSrc, int32_t iStrideSrc,
+                   int32_t iHeight, int32_t iWidth) { // copies iHeight rows of iWidth bytes from pSrc to pDst
+  int32_t i;
+  int32_t iOffsetDst = 0, iOffsetSrc = 0;
+  for (i = 0; i < iHeight; i++) { // one memcpy per row
+    memcpy (pDst + iOffsetDst, pSrc + iOffsetSrc, iWidth);
+    iOffsetDst += iStrideDst; // each buffer advances by its own stride
+    iOffsetSrc += iStrideSrc;
+  }
 }
 
 
-int32_t WelsTargetMbConstruction(PWelsDecoderContext pCtx)
-{
-	PDqLayer pCurLayer = pCtx->pCurDqLayer;	
-	if ( MB_TYPE_INTRA_PCM == pCurLayer->pMbType[pCurLayer->iMbXyIndex] )
-	{		
-		//copy cs into fdec
-		int32_t iCsStrideL = pCurLayer->iCsStride[0];
-		int32_t iCsStrideC = pCurLayer->iCsStride[1];
+int32_t WelsTargetMbConstruction (PWelsDecoderContext pCtx) { // dispatches reconstruction of the current MB by its type; returns 0, or -1 for an unknown type
+  PDqLayer pCurLayer = pCtx->pCurDqLayer;
+  if (MB_TYPE_INTRA_PCM == pCurLayer->pMbType[pCurLayer->iMbXyIndex]) { // PCM: samples are copied, nothing is predicted or transformed here
+    //copy cs into fdec
+    int32_t iCsStrideL = pCurLayer->iCsStride[0];
+    int32_t iCsStrideC = pCurLayer->iCsStride[1];
 
-		int32_t iDecStrideL = pCurLayer->pDec->iLinesize[0]; 
-		int32_t iDecStrideC = pCurLayer->pDec->iLinesize[1]; 
+    int32_t iDecStrideL = pCurLayer->pDec->iLinesize[0];
+    int32_t iDecStrideC = pCurLayer->pDec->iLinesize[1];
 
-		int32_t iCsOffsetL = ( pCurLayer->iMbX + pCurLayer->iMbY * iCsStrideL ) << 4;
-		int32_t iCsOffsetC = ( pCurLayer->iMbX + pCurLayer->iMbY * iCsStrideC ) << 3;
-		
-		int32_t iDecOffsetL = ( pCurLayer->iMbX + pCurLayer->iMbY * iDecStrideL ) << 4;
-		int32_t iDecOffsetC = ( pCurLayer->iMbX + pCurLayer->iMbY * iDecStrideC ) << 3;
-		
-		uint8_t* pSrcY = pCurLayer->pCsData[0] + iCsOffsetL;
-		uint8_t* pSrcU = pCurLayer->pCsData[1] + iCsOffsetC;
-		uint8_t* pSrcV = pCurLayer->pCsData[2] + iCsOffsetC;
-		
-		uint8_t* pDecY = pCurLayer->pDec->pData[0] + iDecOffsetL;
-		uint8_t* pDecU = pCurLayer->pDec->pData[1] + iDecOffsetC;
-		uint8_t* pDecV = pCurLayer->pDec->pData[2] + iDecOffsetC;
+    int32_t iCsOffsetL = (pCurLayer->iMbX + pCurLayer->iMbY * iCsStrideL) << 4; // 16 * (iMbX + iMbY * stride)
+    int32_t iCsOffsetC = (pCurLayer->iMbX + pCurLayer->iMbY * iCsStrideC) << 3; // 8 * (iMbX + iMbY * stride)
 
-		WelsMbCopy( pDecY, iDecStrideL, pSrcY, iCsStrideL, 16, 16 );
-		WelsMbCopy( pDecU, iDecStrideC, pSrcU, iCsStrideC, 8, 8 );
-		WelsMbCopy( pDecV, iDecStrideC, pSrcV, iCsStrideC, 8, 8 );
-		
-		return 0;
-	}
-	else if(IS_INTRA(pCurLayer->pMbType[pCurLayer->iMbXyIndex]))
-	{
-		WelsMbIntraPredictionConstruction(pCtx, pCurLayer, 1);
-	}
-	else if ( IS_INTER( pCurLayer->pMbType[pCurLayer->iMbXyIndex] ) ) //InterMB
-	{
-		if ( 0 == pCurLayer->pCbp[pCurLayer->iMbXyIndex] ) //uiCbp==0 include SKIP
-		{
-			WelsMbInterPrediction( pCtx, pCurLayer );
-		}
-		else
-		{
-			WelsMbInterConstruction( pCtx, pCurLayer );
-		}			
-	}
-	else
-	{
-		WelsLog( pCtx, WELS_LOG_WARNING, "WelsTargetMbConstruction():::::Unknown MB type: %d\n", pCurLayer->pMbType[pCurLayer->iMbXyIndex] );
-		return -1;
-	}
-	
-	return 0;
+    int32_t iDecOffsetL = (pCurLayer->iMbX + pCurLayer->iMbY * iDecStrideL) << 4;
+    int32_t iDecOffsetC = (pCurLayer->iMbX + pCurLayer->iMbY * iDecStrideC) << 3;
+
+    uint8_t* pSrcY = pCurLayer->pCsData[0] + iCsOffsetL;
+    uint8_t* pSrcU = pCurLayer->pCsData[1] + iCsOffsetC;
+    uint8_t* pSrcV = pCurLayer->pCsData[2] + iCsOffsetC;
+
+    uint8_t* pDecY = pCurLayer->pDec->pData[0] + iDecOffsetL;
+    uint8_t* pDecU = pCurLayer->pDec->pData[1] + iDecOffsetC;
+    uint8_t* pDecV = pCurLayer->pDec->pData[2] + iDecOffsetC;
+
+    WelsMbCopy (pDecY, iDecStrideL, pSrcY, iCsStrideL, 16, 16); // 16 rows of 16 bytes
+    WelsMbCopy (pDecU, iDecStrideC, pSrcU, iCsStrideC, 8, 8); // 8 rows of 8 bytes
+    WelsMbCopy (pDecV, iDecStrideC, pSrcV, iCsStrideC, 8, 8);
+
+    return 0;
+  } else if (IS_INTRA (pCurLayer->pMbType[pCurLayer->iMbXyIndex])) {
+    WelsMbIntraPredictionConstruction (pCtx, pCurLayer, 1); // bOutput = 1; return value is not checked
+  } else if (IS_INTER (pCurLayer->pMbType[pCurLayer->iMbXyIndex])) { //InterMB
+    if (0 == pCurLayer->pCbp[pCurLayer->iMbXyIndex]) { //uiCbp==0 include SKIP
+      WelsMbInterPrediction (pCtx, pCurLayer); // prediction only
+    } else {
+      WelsMbInterConstruction (pCtx, pCurLayer); // prediction plus residual
+    }
+  } else {
+    WelsLog (pCtx, WELS_LOG_WARNING, "WelsTargetMbConstruction():::::Unknown MB type: %d\n",
+             pCurLayer->pMbType[pCurLayer->iMbXyIndex]);
+    return -1;
+  }
+
+  return 0;
 }
 
-void_t WelsChromaDcIdct( int16_t *pBlock )
-{
-    int32_t iStride= 32;
-    int32_t iXStride= 16;
-	int32_t iStride1 = iXStride + iStride;
-	int16_t* pBlk = pBlock;
-    int32_t iA,iB,iC,iD,iE;
-	
-    iA= pBlk[0];
-    iB= pBlk[iXStride];
-    iC= pBlk[iStride];
-    iD= pBlk[iStride1];
-	
-    iE = iA-iB;
-    iA += iB;
-    iB = iC-iD;
-    iC += iD;
-	
-	pBlk[0]= (iA+iC) >> 1;
-    pBlk[iXStride]= (iE+iB) >> 1;
-    pBlk[iStride]= (iA-iC) >> 1;
-    pBlk[iStride1]= (iE-iB) >> 1;
+void_t WelsChromaDcIdct (int16_t* pBlock) { // in place: add/subtract combination of pBlock[0], [16], [32], [48], each result halved
+  int32_t iStride = 32;
+  int32_t iXStride = 16;
+  int32_t iStride1 = iXStride + iStride; // 48
+  int16_t* pBlk = pBlock;
+  int32_t iA, iB, iC, iD, iE;
+
+  iA = pBlk[0];
+  iB = pBlk[iXStride];
+  iC = pBlk[iStride];
+  iD = pBlk[iStride1];
+
+  iE = iA - iB; // difference of the first pair, taken before iA is overwritten
+  iA += iB; // sum of the first pair
+  iB = iC - iD; // difference of the second pair
+  iC += iD; // sum of the second pair
+
+  pBlk[0] = (iA + iC) >> 1;
+  pBlk[iXStride] = (iE + iB) >> 1;
+  pBlk[iStride] = (iA - iC) >> 1;
+  pBlk[iStride1] = (iE - iB) >> 1;
 }
 
-int32_t WelsDecodeSlice(PWelsDecoderContext pCtx, bool_t bFirstSliceInLayer, PNalUnit pNalCur)
-{
-	PDqLayer pCurLayer = pCtx->pCurDqLayer;
-	PFmo pFmo = pCtx->pFmo;
-	int32_t i, iRet;
-	int32_t iNextMbXyIndex, iSliceIdc; 
+int32_t WelsDecodeSlice (PWelsDecoderContext pCtx, bool_t bFirstSliceInLayer, PNalUnit pNalCur) {
+  PDqLayer pCurLayer = pCtx->pCurDqLayer;
+  PFmo pFmo = pCtx->pFmo;
+  int32_t i, iRet;
+  int32_t iNextMbXyIndex, iSliceIdc;
 
-	PSlice pSlice = &pCurLayer->sLayerInfo.sSliceInLayer;
-	PSliceHeaderExt pSliceHeaderExt = &pSlice->sSliceHeaderExt;
-	PSliceHeader pSliceHeader = &pSliceHeaderExt->sSliceHeader;
-	int32_t iMbX, iMbY;
-	const int32_t kiCountNumMb = pSliceHeader->pSps->uiTotalMbCount; //need to be correct when fmo or multi slice
-	PBitStringAux pBs = pCurLayer->pBitStringAux; 
-	int32_t iUsedBits  = 0;
-	
-	PWelsDecMbCavlcFunc pDecMbCavlcFunc; 
+  PSlice pSlice = &pCurLayer->sLayerInfo.sSliceInLayer;
+  PSliceHeaderExt pSliceHeaderExt = &pSlice->sSliceHeaderExt;
+  PSliceHeader pSliceHeader = &pSliceHeaderExt->sSliceHeader;
+  int32_t iMbX, iMbY;
+  const int32_t kiCountNumMb = pSliceHeader->pSps->uiTotalMbCount; //need to be correct when fmo or multi slice
+  PBitStringAux pBs = pCurLayer->pBitStringAux;
+  int32_t iUsedBits  = 0;
 
-	pSlice->iTotalMbInCurSlice = 0; //initialize at the starting of slice decoding.
+  PWelsDecMbCavlcFunc pDecMbCavlcFunc;
 
-	if ( P_SLICE == pSliceHeader->eSliceType ) 
-	{
-		pDecMbCavlcFunc = WelsDecodeMbCavlcPSlice;
-	}
-	else //I_SLICE
-	{
-		pDecMbCavlcFunc = WelsDecodeMbCavlcISlice;
-	}
+  pSlice->iTotalMbInCurSlice = 0; //initialize at the starting of slice decoding.
 
-	if ( pSliceHeader->pPps->bConstainedIntraPredFlag )
-	{
-		pCtx->pFillInfoCacheIntra4x4Func = WelsFillCacheConstrain1Intra4x4;
-		pCtx->pParseIntra4x4ModeFunc      = ParseIntra4x4ModeConstrain1;
-		pCtx->pParseIntra16x16ModeFunc    = ParseIntra16x16ModeConstrain1;
-	}
-	else
-	{
-		pCtx->pFillInfoCacheIntra4x4Func = WelsFillCacheConstrain0Intra4x4;
-		pCtx->pParseIntra4x4ModeFunc      = ParseIntra4x4ModeConstrain0;
-		pCtx->pParseIntra16x16ModeFunc    = ParseIntra16x16ModeConstrain0;
-	}
-	
-	pCtx->eSliceType = pSliceHeader->eSliceType;
+  if (P_SLICE == pSliceHeader->eSliceType) {
+    pDecMbCavlcFunc = WelsDecodeMbCavlcPSlice;
+  } else { //I_SLICE
+    pDecMbCavlcFunc = WelsDecodeMbCavlcISlice;
+  }
 
-	if (pCurLayer->sLayerInfo.pPps->bEntropyCodingModeFlag == 1)
-	{
-		//CABAC encoding is unsupported yet!
-		return -1;
-	}
-	
-	iNextMbXyIndex = pSliceHeader->iFirstMbInSlice;
+  if (pSliceHeader->pPps->bConstainedIntraPredFlag) {
+    pCtx->pFillInfoCacheIntra4x4Func = WelsFillCacheConstrain1Intra4x4;
+    pCtx->pParseIntra4x4ModeFunc      = ParseIntra4x4ModeConstrain1;
+    pCtx->pParseIntra16x16ModeFunc    = ParseIntra16x16ModeConstrain1;
+  } else {
+    pCtx->pFillInfoCacheIntra4x4Func = WelsFillCacheConstrain0Intra4x4;
+    pCtx->pParseIntra4x4ModeFunc      = ParseIntra4x4ModeConstrain0;
+    pCtx->pParseIntra16x16ModeFunc    = ParseIntra16x16ModeConstrain0;
+  }
 
-	if ( iNextMbXyIndex >= kiCountNumMb )
-	{
-		WelsLog(pCtx, WELS_LOG_ERROR, "WelsDecodeSlice()::iFirstMbInSlice(%d) > pSps->kiTotalMb(%d). ERROR!!! resolution change....\n", 
-			iNextMbXyIndex, kiCountNumMb);
-		pCtx->iErrorCode |= dsNoParamSets;
-		return dsNoParamSets;
-	}	
+  pCtx->eSliceType = pSliceHeader->eSliceType;
 
-	iMbX = iNextMbXyIndex % pCurLayer->iMbWidth;
-	iMbY = iNextMbXyIndex / pCurLayer->iMbWidth; // error is introduced by multiple slices case, 11/23/2009
-	pSlice->iMbSkipRun = -1;
-	iSliceIdc = (pSliceHeader->iFirstMbInSlice<<7)+pCurLayer->uiLayerDqId;
-	
-	pCurLayer->iMbX =  iMbX;
-	pCurLayer->iMbY = iMbY;
-	pCurLayer->iMbXyIndex = iNextMbXyIndex;
+  if (pCurLayer->sLayerInfo.pPps->bEntropyCodingModeFlag == 1) {
+    //CABAC encoding is unsupported yet!
+    return -1;
+  }
 
-	if(pSliceHeaderExt->bSliceSkipFlag == 1)
-	{
-		for(i=0; i<(int32_t)pSliceHeaderExt->uiNumMbsInSlice; i++)
-		{
-			pCurLayer->pSliceIdc[iNextMbXyIndex] = iSliceIdc;
-		
-			
-			pCurLayer->pResidualPredFlag[iNextMbXyIndex] = 1;		
-			
-			if ( pSliceHeaderExt->sSliceHeader.pPps->uiNumSliceGroups > 1 )
-			{
-				iNextMbXyIndex = FmoNextMb( pFmo, iNextMbXyIndex );
-			}
-			else
-			{
-				++iNextMbXyIndex;
-			}
+  iNextMbXyIndex = pSliceHeader->iFirstMbInSlice;
 
-			iMbX = iNextMbXyIndex%pCurLayer->iMbWidth;
-			iMbY = iNextMbXyIndex%pCurLayer->iMbHeight;
-	
-			pCurLayer->iMbX =  iMbX;
-			pCurLayer->iMbY = iMbY;
-			pCurLayer->iMbXyIndex = iNextMbXyIndex;
-		}
-		return 0;
-	}
+  if (iNextMbXyIndex >= kiCountNumMb) {
+    WelsLog (pCtx, WELS_LOG_ERROR,
+             "WelsDecodeSlice()::iFirstMbInSlice(%d) > pSps->kiTotalMb(%d). ERROR!!! resolution change....\n",
+             iNextMbXyIndex, kiCountNumMb);
+    pCtx->iErrorCode |= dsNoParamSets;
+    return dsNoParamSets;
+  }
 
-	do{
-		pCurLayer->pSliceIdc[iNextMbXyIndex] = iSliceIdc;
-		iRet = pDecMbCavlcFunc( pCtx,  pNalCur );
+  iMbX = iNextMbXyIndex % pCurLayer->iMbWidth;
+  iMbY = iNextMbXyIndex / pCurLayer->iMbWidth; // error is introduced by multiple slices case, 11/23/2009
+  pSlice->iMbSkipRun = -1;
+  iSliceIdc = (pSliceHeader->iFirstMbInSlice << 7) + pCurLayer->uiLayerDqId;
 
-		if (iRet != ERR_NONE){
-			return iRet;
-		}
+  pCurLayer->iMbX =  iMbX;
+  pCurLayer->iMbY = iMbY;
+  pCurLayer->iMbXyIndex = iNextMbXyIndex;
 
-		++pSlice->iTotalMbInCurSlice;
+  if (pSliceHeaderExt->bSliceSkipFlag == 1) {
+    for (i = 0; i < (int32_t)pSliceHeaderExt->uiNumMbsInSlice; i++) {
+      pCurLayer->pSliceIdc[iNextMbXyIndex] = iSliceIdc;
 
-		if ( pSliceHeader->pPps->uiNumSliceGroups > 1 )
-		{
-			iNextMbXyIndex = FmoNextMb( pFmo, iNextMbXyIndex );
-		}
-		else
-		{
-			++iNextMbXyIndex;
-		}
-		if ( (-1 == iNextMbXyIndex) || (iNextMbXyIndex >= kiCountNumMb) )	// slice group boundary or end of a frame
-		{
-			break;
-		}
 
-		// check whether there is left bits to read next time in case multiple slices
-		iUsedBits = ((pBs->pCurBuf-pBs->pStartBuf)<<3) - (16-pBs->iLeftBits);
-		if ( iUsedBits == pBs->iBits && 0 >= pCurLayer->sLayerInfo.sSliceInLayer.iMbSkipRun )	// slice boundary
-		{
-			break;
-		}		
-		if ( iUsedBits > pBs->iBits )//When BS incomplete, as long as find it, SHOULD stop decoding to avoid mosaic or crash.
-		{
-			WelsLog( pCtx, WELS_LOG_WARNING, "WelsDecodeSlice()::::pBs incomplete, iUsedBits:%d > pBs->iBits:%d, MUST stop decoding.\n", 
-				iUsedBits, pBs->iBits );
-			return -1;
-		}
-		iMbX = iNextMbXyIndex % pCurLayer->iMbWidth;
-		iMbY = iNextMbXyIndex / pCurLayer->iMbWidth;
-		pCurLayer->iMbX =  iMbX;
-		pCurLayer->iMbY = iMbY;
-		pCurLayer->iMbXyIndex = iNextMbXyIndex;
-	}while(1);
-	
-	return ERR_NONE;
-}
+      pCurLayer->pResidualPredFlag[iNextMbXyIndex] = 1;
 
-int32_t WelsActualDecodeMbCavlcISlice(PWelsDecoderContext pCtx)
-{	
-	SVlcTable* pVlcTable     = &pCtx->sVlcTable;
-	PDqLayer pCurLayer		 = pCtx->pCurDqLayer;
-	PBitStringAux pBs		 = pCurLayer->pBitStringAux;
-	PSlice pSlice			 = &pCurLayer->sLayerInfo.sSliceInLayer;
-	PSliceHeader pSliceHeader		     = &pSlice->sSliceHeaderExt.sSliceHeader;
+      if (pSliceHeaderExt->sSliceHeader.pPps->uiNumSliceGroups > 1) {
+        iNextMbXyIndex = FmoNextMb (pFmo, iNextMbXyIndex);
+      } else {
+        ++iNextMbXyIndex;
+      }
 
-	SNeighAvail sNeighAvail;
+      iMbX = iNextMbXyIndex % pCurLayer->iMbWidth;
+      iMbY = iNextMbXyIndex % pCurLayer->iMbHeight;
 
-	int32_t iScanIdxStart = pSlice->sSliceHeaderExt.uiScanIdxStart;
-	int32_t iScanIdxEnd   = pSlice->sSliceHeaderExt.uiScanIdxEnd;	
+      pCurLayer->iMbX =  iMbX;
+      pCurLayer->iMbY = iMbY;
+      pCurLayer->iMbXyIndex = iNextMbXyIndex;
+    }
+    return 0;
+  }
 
-	int32_t iMbX = pCurLayer->iMbX;
-	int32_t iMbY = pCurLayer->iMbY;
-	int32_t iMbXy = pCurLayer->iMbXyIndex;
-	int32_t iNMbMode, i;
-	uint32_t uiMbType = 0, uiCbp = 0, uiCbpL = 0, uiCbpC = 0;
+  do {
+    pCurLayer->pSliceIdc[iNextMbXyIndex] = iSliceIdc;
+    iRet = pDecMbCavlcFunc (pCtx,  pNalCur);
 
-	FORCE_STACK_ALIGN_1D( uint8_t, pNonZeroCount, 48, 16 );
-	
-	pCurLayer->pInterPredictionDoneFlag[iMbXy] = 0;
-	pCurLayer->pResidualPredFlag[iMbXy] = pSlice->sSliceHeaderExt.bDefaultResidualPredFlag;
-	
-	uiMbType = BsGetUe(pBs);
-	if ( uiMbType > 25 )
-	{
-		return ERR_INFO_INVALID_MB_TYPE;
-	}	
+    if (iRet != ERR_NONE) {
+      return iRet;
+    }
 
-	if ( 25 == uiMbType )
-	{
-		int32_t iDecStrideL = pCurLayer->pDec->iLinesize[0]; 
-		int32_t iDecStrideC = pCurLayer->pDec->iLinesize[1]; 
-		
-		int32_t iOffsetL = ( iMbX + iMbY * iDecStrideL ) << 4;
-		int32_t iOffsetC = ( iMbX + iMbY * iDecStrideC ) << 3;
-		
-		uint8_t* pDecY = pCurLayer->pCsData[0] + iOffsetL;
-		uint8_t* pDecU = pCurLayer->pCsData[1] + iOffsetC;
-		uint8_t* pDecV = pCurLayer->pCsData[2] + iOffsetC;
-		
-		uint8_t *pTmpBsBuf;
-		
-		int32_t i;
-		int32_t iCopySizeY  = ( sizeof( uint8_t ) << 4 );
-		int32_t iCopySizeUV = ( sizeof( uint8_t ) << 3 );
+    ++pSlice->iTotalMbInCurSlice;
 
-		int32_t iIndex = ((-pBs->iLeftBits)>>3) + 2;
-		
-		pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA_PCM;
-		
-		//step 1: locating bit-stream pointer [must align into integer byte]	
-		pBs->pCurBuf -= iIndex;
-		
-		//step 2: copy pixel from bit-stream into fdec [reconstruction]		
-		pTmpBsBuf = pBs->pCurBuf;
-		for ( i = 0; i < 16; i++ ) //luma
-		{
-			memcpy( pDecY , pTmpBsBuf, iCopySizeY );
-			pDecY += iDecStrideL;				
-			pTmpBsBuf += 16;
-		}
-		for ( i = 0; i < 8; i++ ) //cb
-		{				
-			memcpy( pDecU, pTmpBsBuf, iCopySizeUV );
-			pDecU += iDecStrideC;				
-			pTmpBsBuf += 8;
-		}
-		for ( i = 0; i < 8; i++ ) //cr
-		{				
-			memcpy( pDecV, pTmpBsBuf, iCopySizeUV );
-			pDecV += iDecStrideC;
-			pTmpBsBuf += 8;
-		}	
+    if (pSliceHeader->pPps->uiNumSliceGroups > 1) {
+      iNextMbXyIndex = FmoNextMb (pFmo, iNextMbXyIndex);
+    } else {
+      ++iNextMbXyIndex;
+    }
+    if ((-1 == iNextMbXyIndex) || (iNextMbXyIndex >= kiCountNumMb)) {	// slice group boundary or end of a frame
+      break;
+    }
 
-		pBs->pCurBuf += 384;
-		InitReadBits( pBs );
-		
-		//step 3: update QP and pNonZeroCount
-		pCurLayer->pLumaQp[iMbXy] = 0;
-		pCurLayer->pChromaQp[iMbXy] = 0;
-		memset( pCurLayer->pNzc[iMbXy], 16, sizeof( pCurLayer->pNzc[iMbXy] ) ); //JVT-x201wcm1.doc, page229, 2009.10.23		
-		return 0;				
-	}
-	else if (0 == uiMbType) //reference to JM
-	{
-		FORCE_STACK_ALIGN_1D( int8_t, pIntraPredMode, 48, 16 );
-		pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA4x4;
-		pCtx->pFillInfoCacheIntra4x4Func( &sNeighAvail, pNonZeroCount, pIntraPredMode, pCurLayer );
-		if ( pCtx->pParseIntra4x4ModeFunc( &sNeighAvail, pIntraPredMode, pBs, pCurLayer ) )
-		{
-			return -1;
-		}
+    // check whether any bits are left to read next time, in case of multiple slices
+    iUsedBits = ((pBs->pCurBuf - pBs->pStartBuf) << 3) - (16 - pBs->iLeftBits);
+    if (iUsedBits == pBs->iBits && 0 >= pCurLayer->sLayerInfo.sSliceInLayer.iMbSkipRun) {	// slice boundary
+      break;
+    }
+    if (iUsedBits > pBs->iBits) { //If the bitstream is incomplete, decoding SHOULD stop as soon as this is detected, to avoid mosaic artifacts or a crash.
+      WelsLog (pCtx, WELS_LOG_WARNING,
+               "WelsDecodeSlice()::::pBs incomplete, iUsedBits:%d > pBs->iBits:%d, MUST stop decoding.\n",
+               iUsedBits, pBs->iBits);
+      return -1;
+    }
+    iMbX = iNextMbXyIndex % pCurLayer->iMbWidth;
+    iMbY = iNextMbXyIndex / pCurLayer->iMbWidth;
+    pCurLayer->iMbX =  iMbX;
+    pCurLayer->iMbY = iMbY;
+    pCurLayer->iMbXyIndex = iNextMbXyIndex;
+  } while (1);
 
-		//uiCbp
-		uiCbp = BsGetUe(pBs);
-		//G.9.1 Alternative parsing process for coded pBlock pattern
-		if ( uiCbp > 47 ) 
-			return ERR_INFO_INVALID_CBP;
+  return ERR_NONE;
+}
 
-		uiCbp = g_kuiIntra4x4CbpTable[uiCbp];
+int32_t WelsActualDecodeMbCavlcISlice (PWelsDecoderContext pCtx) {
+  SVlcTable* pVlcTable     = &pCtx->sVlcTable;
+  PDqLayer pCurLayer		 = pCtx->pCurDqLayer;
+  PBitStringAux pBs		 = pCurLayer->pBitStringAux;
+  PSlice pSlice			 = &pCurLayer->sLayerInfo.sSliceInLayer;
+  PSliceHeader pSliceHeader		     = &pSlice->sSliceHeaderExt.sSliceHeader;
 
-		pCurLayer->pCbp[iMbXy] = uiCbp;
-		uiCbpC = uiCbp >> 4;
-		uiCbpL = uiCbp & 15;
-	}
-	else //I_PCM exclude, we can ignore it
-	{
-		pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA16x16;
-		pCurLayer->pIntraPredMode[iMbXy][7] = (uiMbType-1) & 3;
-		pCurLayer->pCbp[iMbXy] = g_kuiI16CbpTable[(uiMbType-1)>>2];
-		uiCbpC = pCurLayer->pCbp[iMbXy] >> 4;
-		uiCbpL = pCurLayer->pCbp[iMbXy] & 15;
-		WelsFillCacheNonZeroCount( &sNeighAvail, pNonZeroCount, pCurLayer );
-		if ( pCtx->pParseIntra16x16ModeFunc( &sNeighAvail, pBs, pCurLayer ) )
-		{
-			return -1;
-		}
-	}
-	
-	iNMbMode = BASE_MB;					
+  SNeighAvail sNeighAvail;
 
-	memset(pCurLayer->pScaledTCoeff[iMbXy], 0, 384*sizeof(pCurLayer->pScaledTCoeff[iMbXy][0]));
-	ST32(&pCurLayer->pNzc[iMbXy][0], 0);
-	ST32(&pCurLayer->pNzc[iMbXy][4], 0);
-	ST32(&pCurLayer->pNzc[iMbXy][8], 0);
-	ST32(&pCurLayer->pNzc[iMbXy][12], 0);
-	ST32(&pCurLayer->pNzc[iMbXy][16], 0);
-	ST32(&pCurLayer->pNzc[iMbXy][20], 0);
+  int32_t iScanIdxStart = pSlice->sSliceHeaderExt.uiScanIdxStart;
+  int32_t iScanIdxEnd   = pSlice->sSliceHeaderExt.uiScanIdxEnd;
 
-	if( pCurLayer->pCbp[iMbXy] == 0 && IS_INTRA4x4(pCurLayer->pMbType[iMbXy]))
-	{
-		pCurLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp;
-		pCurLayer->pChromaQp[iMbXy] = g_kuiChromaQp[WELS_CLIP3(pCurLayer->pLumaQp[iMbXy] + 
-											pSliceHeader->pPps->iChromaQpIndexOffset, 0, 51)];
+  int32_t iMbX = pCurLayer->iMbX;
+  int32_t iMbY = pCurLayer->iMbY;
+  int32_t iMbXy = pCurLayer->iMbXyIndex;
+  int32_t iNMbMode, i;
+  uint32_t uiMbType = 0, uiCbp = 0, uiCbpL = 0, uiCbpC = 0;
 
-	}
+  FORCE_STACK_ALIGN_1D (uint8_t, pNonZeroCount, 48, 16);
 
-	if ( pCurLayer->pCbp[iMbXy] || MB_TYPE_INTRA16x16 == pCurLayer->pMbType[iMbXy] ) 
-	{
-		int32_t iQpDelta, iId8x8, iId4x4;		
+  pCurLayer->pInterPredictionDoneFlag[iMbXy] = 0;
+  pCurLayer->pResidualPredFlag[iMbXy] = pSlice->sSliceHeaderExt.bDefaultResidualPredFlag;
 
-		iQpDelta = BsGetSe(pBs);
+  uiMbType = BsGetUe (pBs);
+  if (uiMbType > 25) {
+    return ERR_INFO_INVALID_MB_TYPE;
+  }
 
-        if (iQpDelta > 25 || iQpDelta < -26) //out of iQpDelta range
-		{
-			return ERR_INFO_INVALID_QP;
-		}
+  if (25 == uiMbType) {
+    int32_t iDecStrideL = pCurLayer->pDec->iLinesize[0];
+    int32_t iDecStrideC = pCurLayer->pDec->iLinesize[1];
 
-		pCurLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp + iQpDelta; //update iLastMbQp
-		//refer to JVT-X201wcm1.doc equation(7-35)
-		if ( (unsigned)(pCurLayer->pLumaQp[iMbXy]) > 51 )
-		{
-			if ( pCurLayer->pLumaQp[iMbXy] < 0 )
-			{
-				pCurLayer->pLumaQp[iMbXy] += 52;
-			} 
-			else
-			{
-				pCurLayer->pLumaQp[iMbXy] -= 52;
-			}
-		}
-		//QP should be in the range of [0, 51]
-		if ( pCurLayer->pLumaQp[iMbXy] < 0 || pCurLayer->pLumaQp[iMbXy] > 51 )
-		{
-			return ERR_INFO_INVALID_QP;
-		}
-		pSlice->iLastMbQp = pCurLayer->pLumaQp[iMbXy];
-		pCurLayer->pChromaQp[iMbXy] = g_kuiChromaQp[WELS_CLIP3(pSlice->iLastMbQp + pSliceHeader->pPps->iChromaQpIndexOffset, 0, 51)];
+    int32_t iOffsetL = (iMbX + iMbY * iDecStrideL) << 4;
+    int32_t iOffsetC = (iMbX + iMbY * iDecStrideC) << 3;
 
+    uint8_t* pDecY = pCurLayer->pCsData[0] + iOffsetL;
+    uint8_t* pDecU = pCurLayer->pCsData[1] + iOffsetC;
+    uint8_t* pDecV = pCurLayer->pCsData[2] + iOffsetC;
 
-		BsStartCavlc( pBs );
+    uint8_t* pTmpBsBuf;
 
-		if ( MB_TYPE_INTRA16x16 == pCurLayer->pMbType[iMbXy] ) 
-		{
-			//step1: Luma DC
-			if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount, pBs, 0, 16,
-				g_kuiLumaDcZigzagScan, I16_LUMA_DC, pCurLayer->pScaledTCoeff[iMbXy], iNMbMode, pCurLayer->pLumaQp[iMbXy], pCtx) )
-			{
-				return -1;//abnormal
-			}
-			//step2: Luma AC
-			if (uiCbpL)
-			{
-				for (i = 0; i < 16; i++) 
-				{	
-                    if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount, pBs, i,
-								iScanIdxEnd - WELS_MAX(iScanIdxStart, 1) + 1, g_kuiZigzagScan+ WELS_MAX(iScanIdxStart,1),
-								I16_LUMA_AC, pCurLayer->pScaledTCoeff[iMbXy] + (i<<4), iNMbMode, pCurLayer->pLumaQp[iMbXy], pCtx) ) 
-					{
-							return -1;//abnormal
-					}					
-				}
-				ST32(&pCurLayer->pNzc[iMbXy][0], LD32(&pNonZeroCount[1+8*1]));
-				ST32(&pCurLayer->pNzc[iMbXy][4], LD32(&pNonZeroCount[1+8*2]));
-				ST32(&pCurLayer->pNzc[iMbXy][8], LD32(&pNonZeroCount[1+8*3]));
-				ST32(&pCurLayer->pNzc[iMbXy][12], LD32(&pNonZeroCount[1+8*4]));
-			}
-			else //pNonZeroCount = 0
-			{
-				ST32(&pCurLayer->pNzc[iMbXy][0], 0);
-				ST32(&pCurLayer->pNzc[iMbXy][4], 0);	
-				ST32(&pCurLayer->pNzc[iMbXy][8], 0);
-				ST32(&pCurLayer->pNzc[iMbXy][12], 0);
-			}
-		}
-		else //non-MB_TYPE_INTRA16x16
-		{	
-			for (iId8x8 = 0; iId8x8 < 4; iId8x8++) 
-			{
-				if (uiCbpL & (1 << iId8x8)) 
-				{
-					int32_t iIndex = (iId8x8 << 2);
-					for (iId4x4 = 0; iId4x4 < 4; iId4x4++) 
-					{
-						//Luma (DC and AC decoding together)
-						if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount, pBs, iIndex,
-							iScanIdxEnd - iScanIdxStart + 1, g_kuiZigzagScan+iScanIdxStart, 
-							LUMA_DC_AC, pCurLayer->pScaledTCoeff[iMbXy] + (iIndex<<4), iNMbMode, pCurLayer->pLumaQp[iMbXy], pCtx) )
-						{
-							return -1;//abnormal
-						}
-						iIndex++;
-					}
-				}
-				else
-				{
-					ST16(&pNonZeroCount[g_kuiCacheNzcScanIdx[(iId8x8<<2)]], 0);
-					ST16(&pNonZeroCount[g_kuiCacheNzcScanIdx[(iId8x8<<2)+2]], 0);
-				}
-			}	
-			ST32(&pCurLayer->pNzc[iMbXy][0], LD32(&pNonZeroCount[1+8*1]));
-			ST32(&pCurLayer->pNzc[iMbXy][4], LD32(&pNonZeroCount[1+8*2]));
-			ST32(&pCurLayer->pNzc[iMbXy][8], LD32(&pNonZeroCount[1+8*3]));
-			ST32(&pCurLayer->pNzc[iMbXy][12], LD32(&pNonZeroCount[1+8*4]));
-		}
+    int32_t i;
+    int32_t iCopySizeY  = (sizeof (uint8_t) << 4);
+    int32_t iCopySizeUV = (sizeof (uint8_t) << 3);
 
-		//chroma 
-		//step1: DC
-		if ( 1 == uiCbpC || 2 == uiCbpC )
-		{	
-			for (i = 0; i < 2; i++) //Cb Cr
-			{			
-				if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount, pBs,
-					16 + (i << 2), 4, g_kuiChromaDcScan, CHROMA_DC, pCurLayer->pScaledTCoeff[iMbXy] + 256 + (i<<6),
-					iNMbMode, pCurLayer->pChromaQp[iMbXy], pCtx) ) 
-				{
-					return -1;//abnormal
-				}
-			}
-		}
+    int32_t iIndex = ((-pBs->iLeftBits) >> 3) + 2;
 
-		//step2: AC
-		if (2 == uiCbpC)
-		{
-			for (i = 0; i < 2; i++) //Cb Cr
-			{
-				int32_t iIndex = 16 + (i<<2);
-				for (iId4x4 = 0; iId4x4 < 4; iId4x4++) 
-				{
-					if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount, pBs, iIndex, 
-						iScanIdxEnd - WELS_MAX(iScanIdxStart, 1) + 1, g_kuiZigzagScan + WELS_MAX(iScanIdxStart,1), 
-						CHROMA_AC, pCurLayer->pScaledTCoeff[iMbXy]+(iIndex<<4), iNMbMode, pCurLayer->pChromaQp[iMbXy], pCtx) )
-					{
-						return -1;//abnormal
-					}
-					iIndex++;
-				}
-			}
-			ST16(&pCurLayer->pNzc[iMbXy][16], LD16(&pNonZeroCount[6+8*1]));
-			ST16(&pCurLayer->pNzc[iMbXy][20], LD16(&pNonZeroCount[6+8*2]));
-			ST16(&pCurLayer->pNzc[iMbXy][18], LD16(&pNonZeroCount[6+8*4]));
-			ST16(&pCurLayer->pNzc[iMbXy][22], LD16(&pNonZeroCount[6+8*5]));
-		}
-		else 
-		{
-			ST16(&pCurLayer->pNzc[iMbXy][16], 0);
-			ST16(&pCurLayer->pNzc[iMbXy][20], 0);
-			ST16(&pCurLayer->pNzc[iMbXy][18], 0);
-			ST16(&pCurLayer->pNzc[iMbXy][22], 0);
-		}
-		BsEndCavlc( pBs ); 
-	}
-	else
-	{
-		ST32(&pCurLayer->pNzc[iMbXy][0], 0);
-		ST32(&pCurLayer->pNzc[iMbXy][4], 0);
-		ST32(&pCurLayer->pNzc[iMbXy][8], 0);
-		ST32(&pCurLayer->pNzc[iMbXy][12], 0);
-		ST32(&pCurLayer->pNzc[iMbXy][16], 0);
-		ST32(&pCurLayer->pNzc[iMbXy][20], 0);
-	}	
+    pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA_PCM;
 
-	return 0;
-}
+    //step 1: locate the bit-stream pointer [must be aligned to a whole byte]
+    pBs->pCurBuf -= iIndex;
 
-int32_t WelsDecodeMbCavlcISlice(PWelsDecoderContext pCtx, PNalUnit pNalCur)
-{
-	PDqLayer pCurLayer = pCtx->pCurDqLayer;
-	PBitStringAux pBs = pCurLayer->pBitStringAux;	
-	PSliceHeaderExt pSliceHeaderExt = &pCurLayer->sLayerInfo.sSliceInLayer.sSliceHeaderExt;
-	int32_t iBaseModeFlag;	
-	int32_t iRet = 0; //should have the return value to indicate decoding error or not, It's NECESSARY--2010.4.15
+    //step 2: copy pixels from the bit-stream into fdec [reconstruction]
+    pTmpBsBuf = pBs->pCurBuf;
+    for (i = 0; i < 16; i++) { //luma
+      memcpy (pDecY , pTmpBsBuf, iCopySizeY);
+      pDecY += iDecStrideL;
+      pTmpBsBuf += 16;
+    }
+    for (i = 0; i < 8; i++) { //cb
+      memcpy (pDecU, pTmpBsBuf, iCopySizeUV);
+      pDecU += iDecStrideC;
+      pTmpBsBuf += 8;
+    }
+    for (i = 0; i < 8; i++) { //cr
+      memcpy (pDecV, pTmpBsBuf, iCopySizeUV);
+      pDecV += iDecStrideC;
+      pTmpBsBuf += 8;
+    }
 
-	if( pSliceHeaderExt->bAdaptiveBaseModeFlag == 1)
-	{
-		iBaseModeFlag = BsGetOneBit(pBs);
-	}
-	else
-	{
-		iBaseModeFlag = pSliceHeaderExt->bDefaultBaseModeFlag;
-	}
-    if( !iBaseModeFlag )
-    {
-        iRet = WelsActualDecodeMbCavlcISlice( pCtx);
+    pBs->pCurBuf += 384;
+    InitReadBits (pBs);
+
+    //step 3: update QP and pNonZeroCount
+    pCurLayer->pLumaQp[iMbXy] = 0;
+    pCurLayer->pChromaQp[iMbXy] = 0;
+    memset (pCurLayer->pNzc[iMbXy], 16, sizeof (pCurLayer->pNzc[iMbXy]));   //JVT-x201wcm1.doc, page229, 2009.10.23
+    return 0;
+  } else if (0 == uiMbType) { //reference to JM
+    FORCE_STACK_ALIGN_1D (int8_t, pIntraPredMode, 48, 16);
+    pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA4x4;
+    pCtx->pFillInfoCacheIntra4x4Func (&sNeighAvail, pNonZeroCount, pIntraPredMode, pCurLayer);
+    if (pCtx->pParseIntra4x4ModeFunc (&sNeighAvail, pIntraPredMode, pBs, pCurLayer)) {
+      return -1;
     }
-    else
-    {
-        WelsLog( pCtx, WELS_LOG_WARNING, "iBaseModeFlag (%d) != 0, inter-layer prediction not supported.\n", iBaseModeFlag);
-        return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_ILP);
+
+    //uiCbp
+    uiCbp = BsGetUe (pBs);
+    //G.9.1 Alternative parsing process for coded block pattern
+    if (uiCbp > 47)
+      return ERR_INFO_INVALID_CBP;
+
+    uiCbp = g_kuiIntra4x4CbpTable[uiCbp];
+
+    pCurLayer->pCbp[iMbXy] = uiCbp;
+    uiCbpC = uiCbp >> 4;
+    uiCbpL = uiCbp & 15;
+  } else { //I_PCM exclude, we can ignore it
+    pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA16x16;
+    pCurLayer->pIntraPredMode[iMbXy][7] = (uiMbType - 1) & 3;
+    pCurLayer->pCbp[iMbXy] = g_kuiI16CbpTable[ (uiMbType - 1) >> 2];
+    uiCbpC = pCurLayer->pCbp[iMbXy] >> 4;
+    uiCbpL = pCurLayer->pCbp[iMbXy] & 15;
+    WelsFillCacheNonZeroCount (&sNeighAvail, pNonZeroCount, pCurLayer);
+    if (pCtx->pParseIntra16x16ModeFunc (&sNeighAvail, pBs, pCurLayer)) {
+      return -1;
     }
-	if ( iRet ) //occur error when parsing, MUST STOP decoding
-	{
-		return iRet;
-	}
+  }
 
-	return 0;
-}
+  iNMbMode = BASE_MB;
 
-int32_t WelsActualDecodeMbCavlcPSlice(PWelsDecoderContext pCtx)
-{
-	SVlcTable* pVlcTable     = &pCtx->sVlcTable;
-	PDqLayer pCurLayer		 = pCtx->pCurDqLayer;
-	PBitStringAux pBs		 = pCurLayer->pBitStringAux;
-	PSlice pSlice			 = &pCurLayer->sLayerInfo.sSliceInLayer;
-	PSliceHeader pSliceHeader		     = &pSlice->sSliceHeaderExt.sSliceHeader;
+  memset (pCurLayer->pScaledTCoeff[iMbXy], 0, 384 * sizeof (pCurLayer->pScaledTCoeff[iMbXy][0]));
+  ST32 (&pCurLayer->pNzc[iMbXy][0], 0);
+  ST32 (&pCurLayer->pNzc[iMbXy][4], 0);
+  ST32 (&pCurLayer->pNzc[iMbXy][8], 0);
+  ST32 (&pCurLayer->pNzc[iMbXy][12], 0);
+  ST32 (&pCurLayer->pNzc[iMbXy][16], 0);
+  ST32 (&pCurLayer->pNzc[iMbXy][20], 0);
 
-	SNeighAvail sNeighAvail;
+  if (pCurLayer->pCbp[iMbXy] == 0 && IS_INTRA4x4 (pCurLayer->pMbType[iMbXy])) {
+    pCurLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp;
+    pCurLayer->pChromaQp[iMbXy] = g_kuiChromaQp[WELS_CLIP3 (pCurLayer->pLumaQp[iMbXy] +
+                                  pSliceHeader->pPps->iChromaQpIndexOffset, 0, 51)];
 
-	int32_t iScanIdxStart = pSlice->sSliceHeaderExt.uiScanIdxStart;
-	int32_t iScanIdxEnd   = pSlice->sSliceHeaderExt.uiScanIdxEnd;	
+  }
 
-	int32_t iMbX = pCurLayer->iMbX;
-	int32_t iMbY = pCurLayer->iMbY;
-	int32_t iMbXy = pCurLayer->iMbXyIndex;
+  if (pCurLayer->pCbp[iMbXy] || MB_TYPE_INTRA16x16 == pCurLayer->pMbType[iMbXy]) {
+    int32_t iQpDelta, iId8x8, iId4x4;
 
-	int32_t iNMbMode, i;
-	uint32_t uiMbType = 0, uiCbp = 0, uiCbpL = 0, uiCbpC = 0;
+    iQpDelta = BsGetSe (pBs);
+
+    if (iQpDelta > 25 || iQpDelta < -26) { //out of iQpDelta range
+      return ERR_INFO_INVALID_QP;
+    }
+
+    pCurLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp + iQpDelta; //update iLastMbQp
+    //refer to JVT-X201wcm1.doc equation(7-35)
+    if ((unsigned) (pCurLayer->pLumaQp[iMbXy]) > 51) {
+      if (pCurLayer->pLumaQp[iMbXy] < 0) {
+        pCurLayer->pLumaQp[iMbXy] += 52;
+      } else {
+        pCurLayer->pLumaQp[iMbXy] -= 52;
+      }
+    }
+    //QP should be in the range of [0, 51]
+    if (pCurLayer->pLumaQp[iMbXy] < 0 || pCurLayer->pLumaQp[iMbXy] > 51) {
+      return ERR_INFO_INVALID_QP;
+    }
+    pSlice->iLastMbQp = pCurLayer->pLumaQp[iMbXy];
+    pCurLayer->pChromaQp[iMbXy] = g_kuiChromaQp[WELS_CLIP3 (pSlice->iLastMbQp + pSliceHeader->pPps->iChromaQpIndexOffset, 0,
+                                  51)];
 
-	FORCE_STACK_ALIGN_1D( uint8_t, pNonZeroCount, 48, 16 );
-	pCurLayer->pInterPredictionDoneFlag[iMbXy] = 0;//2009.10.23
-	
-	uiMbType = BsGetUe(pBs);	
-	if (uiMbType < 5) //inter MB type
-	{
-		int16_t iMotionVector[LIST_A][30][MV_A];	
 
-		int8_t	iRefIndex[LIST_A][30];
-		pCurLayer->pMbType[iMbXy] = g_ksInterMbTypeInfo[uiMbType].iType;
-		WelsFillCacheInter( &sNeighAvail, pNonZeroCount, iMotionVector, iRefIndex, pCurLayer );
-		if ( ParseInterInfo(pCtx, iMotionVector, iRefIndex, pBs) )
-		{
-			return -1;//abnormal
-		}
+    BsStartCavlc (pBs);
 
-		if( pSlice->sSliceHeaderExt.bAdaptiveResidualPredFlag ==1 )
-		{
-			pCurLayer->pResidualPredFlag[iMbXy] =  BsGetOneBit(pBs);
-		}
-		else
-		{
-			pCurLayer->pResidualPredFlag[iMbXy] = pSlice->sSliceHeaderExt.bDefaultResidualPredFlag;
-		}
+    if (MB_TYPE_INTRA16x16 == pCurLayer->pMbType[iMbXy]) {
+      //step1: Luma DC
+      if (WelsResidualBlockCavlc (pVlcTable, pNonZeroCount, pBs, 0, 16,
+                                  g_kuiLumaDcZigzagScan, I16_LUMA_DC, pCurLayer->pScaledTCoeff[iMbXy], iNMbMode, pCurLayer->pLumaQp[iMbXy], pCtx)) {
+        return -1;//abnormal
+      }
+      //step2: Luma AC
+      if (uiCbpL) {
+        for (i = 0; i < 16; i++) {
+          if (WelsResidualBlockCavlc (pVlcTable, pNonZeroCount, pBs, i,
+                                      iScanIdxEnd - WELS_MAX (iScanIdxStart, 1) + 1, g_kuiZigzagScan + WELS_MAX (iScanIdxStart, 1),
+                                      I16_LUMA_AC, pCurLayer->pScaledTCoeff[iMbXy] + (i << 4), iNMbMode, pCurLayer->pLumaQp[iMbXy], pCtx)) {
+            return -1;//abnormal
+          }
+        }
+        ST32 (&pCurLayer->pNzc[iMbXy][0], LD32 (&pNonZeroCount[1 + 8 * 1]));
+        ST32 (&pCurLayer->pNzc[iMbXy][4], LD32 (&pNonZeroCount[1 + 8 * 2]));
+        ST32 (&pCurLayer->pNzc[iMbXy][8], LD32 (&pNonZeroCount[1 + 8 * 3]));
+        ST32 (&pCurLayer->pNzc[iMbXy][12], LD32 (&pNonZeroCount[1 + 8 * 4]));
+      } else { //pNonZeroCount = 0
+        ST32 (&pCurLayer->pNzc[iMbXy][0], 0);
+        ST32 (&pCurLayer->pNzc[iMbXy][4], 0);
+        ST32 (&pCurLayer->pNzc[iMbXy][8], 0);
+        ST32 (&pCurLayer->pNzc[iMbXy][12], 0);
+      }
+    } else { //non-MB_TYPE_INTRA16x16
+      for (iId8x8 = 0; iId8x8 < 4; iId8x8++) {
+        if (uiCbpL & (1 << iId8x8)) {
+          int32_t iIndex = (iId8x8 << 2);
+          for (iId4x4 = 0; iId4x4 < 4; iId4x4++) {
+            //Luma (DC and AC decoding together)
+            if (WelsResidualBlockCavlc (pVlcTable, pNonZeroCount, pBs, iIndex,
+                                        iScanIdxEnd - iScanIdxStart + 1, g_kuiZigzagScan + iScanIdxStart,
+                                        LUMA_DC_AC, pCurLayer->pScaledTCoeff[iMbXy] + (iIndex << 4), iNMbMode, pCurLayer->pLumaQp[iMbXy], pCtx)) {
+              return -1;//abnormal
+            }
+            iIndex++;
+          }
+        } else {
+          ST16 (&pNonZeroCount[g_kuiCacheNzcScanIdx[ (iId8x8 << 2)]], 0);
+          ST16 (&pNonZeroCount[g_kuiCacheNzcScanIdx[ (iId8x8 << 2) + 2]], 0);
+        }
+      }
+      ST32 (&pCurLayer->pNzc[iMbXy][0], LD32 (&pNonZeroCount[1 + 8 * 1]));
+      ST32 (&pCurLayer->pNzc[iMbXy][4], LD32 (&pNonZeroCount[1 + 8 * 2]));
+      ST32 (&pCurLayer->pNzc[iMbXy][8], LD32 (&pNonZeroCount[1 + 8 * 3]));
+      ST32 (&pCurLayer->pNzc[iMbXy][12], LD32 (&pNonZeroCount[1 + 8 * 4]));
+    }
 
-		if(pCurLayer->pResidualPredFlag[iMbXy] == 0)
-		{
-			iNMbMode = BASE_MB;
-			pCurLayer->pInterPredictionDoneFlag[iMbXy] = 0;
-		}
-		else 
-		{
-            WelsLog(pCtx, WELS_LOG_WARNING, "residual_pred_flag = 1 not supported.\n");
-            return -1;
-		}
-	}
-	else //intra MB type
-	{
-		uiMbType -= 5;
-		if ( uiMbType > 25 )
-		{
-			return ERR_INFO_INVALID_MB_TYPE;
-		}
-		
-		if ( 25 == uiMbType )
-		{
-			int32_t iDecStrideL = pCurLayer->pDec->iLinesize[0]; 
-			int32_t iDecStrideC = pCurLayer->pDec->iLinesize[1]; 
-		
-			int32_t iOffsetL = ( iMbX + iMbY * iDecStrideL ) << 4;
-			int32_t iOffsetC = ( iMbX + iMbY * iDecStrideC ) << 3;
-		
-			uint8_t* pDecY = pCurLayer->pCsData[0] + iOffsetL;
-			uint8_t* pDecU = pCurLayer->pCsData[1] + iOffsetC;
-			uint8_t* pDecV = pCurLayer->pCsData[2] + iOffsetC;
-		
-			uint8_t *pTmpBsBuf;
-		
-			int32_t i;
-			int32_t iCopySizeY  = ( sizeof( uint8_t ) << 4 );
-			int32_t iCopySizeUV = ( sizeof( uint8_t ) << 3 );
+    //chroma
+    //step1: DC
+    if (1 == uiCbpC || 2 == uiCbpC) {
+      for (i = 0; i < 2; i++) { //Cb Cr
+        if (WelsResidualBlockCavlc (pVlcTable, pNonZeroCount, pBs,
+                                    16 + (i << 2), 4, g_kuiChromaDcScan, CHROMA_DC, pCurLayer->pScaledTCoeff[iMbXy] + 256 + (i << 6),
+                                    iNMbMode, pCurLayer->pChromaQp[iMbXy], pCtx)) {
+          return -1;//abnormal
+        }
+      }
+    }
 
-			int32_t iIndex = ((-pBs->iLeftBits)>>3) + 2;
-			
-			pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA_PCM;
+    //step2: AC
+    if (2 == uiCbpC) {
+      for (i = 0; i < 2; i++) { //Cb Cr
+        int32_t iIndex = 16 + (i << 2);
+        for (iId4x4 = 0; iId4x4 < 4; iId4x4++) {
+          if (WelsResidualBlockCavlc (pVlcTable, pNonZeroCount, pBs, iIndex,
+                                      iScanIdxEnd - WELS_MAX (iScanIdxStart, 1) + 1, g_kuiZigzagScan + WELS_MAX (iScanIdxStart, 1),
+                                      CHROMA_AC, pCurLayer->pScaledTCoeff[iMbXy] + (iIndex << 4), iNMbMode, pCurLayer->pChromaQp[iMbXy], pCtx)) {
+            return -1;//abnormal
+          }
+          iIndex++;
+        }
+      }
+      ST16 (&pCurLayer->pNzc[iMbXy][16], LD16 (&pNonZeroCount[6 + 8 * 1]));
+      ST16 (&pCurLayer->pNzc[iMbXy][20], LD16 (&pNonZeroCount[6 + 8 * 2]));
+      ST16 (&pCurLayer->pNzc[iMbXy][18], LD16 (&pNonZeroCount[6 + 8 * 4]));
+      ST16 (&pCurLayer->pNzc[iMbXy][22], LD16 (&pNonZeroCount[6 + 8 * 5]));
+    } else {
+      ST16 (&pCurLayer->pNzc[iMbXy][16], 0);
+      ST16 (&pCurLayer->pNzc[iMbXy][20], 0);
+      ST16 (&pCurLayer->pNzc[iMbXy][18], 0);
+      ST16 (&pCurLayer->pNzc[iMbXy][22], 0);
+    }
+    BsEndCavlc (pBs);
+  } else {
+    ST32 (&pCurLayer->pNzc[iMbXy][0], 0);
+    ST32 (&pCurLayer->pNzc[iMbXy][4], 0);
+    ST32 (&pCurLayer->pNzc[iMbXy][8], 0);
+    ST32 (&pCurLayer->pNzc[iMbXy][12], 0);
+    ST32 (&pCurLayer->pNzc[iMbXy][16], 0);
+    ST32 (&pCurLayer->pNzc[iMbXy][20], 0);
+  }
 
-			//step 1: locating bit-stream pointer [must align into integer byte]
-			pBs->pCurBuf -= iIndex;
-		
-		    //step 2: copy pixel from bit-stream into fdec [reconstruction]			
-			pTmpBsBuf = pBs->pCurBuf;
-			for ( i = 0; i < 16; i++ ) //luma
-			{
-				memcpy( pDecY , pTmpBsBuf, iCopySizeY );
-				pDecY += iDecStrideL;				
-				pTmpBsBuf += 16;
-			}
-			
-			for ( i = 0; i < 8; i++ ) //cb
-			{				
-				memcpy( pDecU, pTmpBsBuf, iCopySizeUV );
-				pDecU += iDecStrideC;				
-				pTmpBsBuf += 8;
-			}
-			for ( i = 0; i < 8; i++ ) //cr
-			{				
-				memcpy( pDecV, pTmpBsBuf, iCopySizeUV );
-				pDecV += iDecStrideC;
-				pTmpBsBuf += 8;
-			}		
+  return 0;
+}
 
-			pBs->pCurBuf += 384;
-			InitReadBits( pBs );
-		
-		    //step 3: update QP and pNonZeroCount
-			pCurLayer->pLumaQp[iMbXy] = 0;
-			pCurLayer->pChromaQp[iMbXy] = 0;		
-			ST32(&pCurLayer->pNzc[iMbXy][0], 0);
-			ST32(&pCurLayer->pNzc[iMbXy][4], 0);
-			ST32(&pCurLayer->pNzc[iMbXy][8], 0);
-			ST32(&pCurLayer->pNzc[iMbXy][12], 0);
-			return 0;
-		}
-		else
-		{
-			if (0 == uiMbType) 
-			{
-				FORCE_STACK_ALIGN_1D( int8_t, pIntraPredMode, 48, 16 );
-				pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA4x4;
-				pCtx->pFillInfoCacheIntra4x4Func( &sNeighAvail, pNonZeroCount, pIntraPredMode, pCurLayer );
-				if ( pCtx->pParseIntra4x4ModeFunc( &sNeighAvail, pIntraPredMode, pBs, pCurLayer ) )
-				{
-					return -1;
-				}
-				iNMbMode = BASE_MB;
-			}
-			else //I_PCM exclude, we can ignore it
-			{
-				pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA16x16;
-				pCurLayer->pIntraPredMode[iMbXy][7] = (uiMbType-1) & 3;
-				pCurLayer->pCbp[iMbXy] = g_kuiI16CbpTable[(uiMbType-1)>>2];
-				uiCbpC = pCurLayer->pCbp[iMbXy] >> 4;
-				uiCbpL = pCurLayer->pCbp[iMbXy] & 15;
-				WelsFillCacheNonZeroCount( &sNeighAvail, pNonZeroCount, pCurLayer );
-				if ( pCtx->pParseIntra16x16ModeFunc( &sNeighAvail, pBs, pCurLayer ) )
-				{
-					return -1;
-				}
-				iNMbMode = BASE_MB;
-			}
-		}
-	}	
-	
-	if ( MB_TYPE_INTRA16x16 != pCurLayer->pMbType[iMbXy] ) 
-	{
-		uiCbp = BsGetUe(pBs);	
-		{
-			if ( uiCbp > 47 ) 
-				return ERR_INFO_INVALID_CBP;
+int32_t WelsDecodeMbCavlcISlice (PWelsDecoderContext pCtx, PNalUnit pNalCur) {
+  PDqLayer pCurLayer = pCtx->pCurDqLayer;
+  PBitStringAux pBs = pCurLayer->pBitStringAux;
+  PSliceHeaderExt pSliceHeaderExt = &pCurLayer->sLayerInfo.sSliceInLayer.sSliceHeaderExt;
+  int32_t iBaseModeFlag;
+  int32_t iRet = 0; //should have the return value to indicate decoding error or not, It's NECESSARY--2010.4.15
 
-			if (MB_TYPE_INTRA4x4 == pCurLayer->pMbType[iMbXy]) 
-			{
-				uiCbp = g_kuiIntra4x4CbpTable[uiCbp];
-			}
-			else //inter
-				uiCbp = g_kuiInterCbpTable[uiCbp];
-		}
+  if (pSliceHeaderExt->bAdaptiveBaseModeFlag == 1) {
+    iBaseModeFlag = BsGetOneBit (pBs);
+  } else {
+    iBaseModeFlag = pSliceHeaderExt->bDefaultBaseModeFlag;
+  }
+  if (!iBaseModeFlag) {
+    iRet = WelsActualDecodeMbCavlcISlice (pCtx);
+  } else {
+    WelsLog (pCtx, WELS_LOG_WARNING, "iBaseModeFlag (%d) != 0, inter-layer prediction not supported.\n", iBaseModeFlag);
+    return GENERATE_ERROR_NO (ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_ILP);
+  }
+  if (iRet) { //occur error when parsing, MUST STOP decoding
+    return iRet;
+  }
 
-		pCurLayer->pCbp[iMbXy] = uiCbp;
-		uiCbpC = pCurLayer->pCbp[iMbXy] >> 4;
-		uiCbpL = pCurLayer->pCbp[iMbXy] & 15;
-	}		
+  return 0;
+}
 
-	if(iNMbMode == BASE_MB)
-	{
-		pCtx->sBlockFunc.pWelsBlockZero16x16Func(pCurLayer->pScaledTCoeff[iMbXy], 16);
-		pCtx->sBlockFunc.pWelsBlockZero8x8Func(pCurLayer->pScaledTCoeff[iMbXy]+256, 8);
-		pCtx->sBlockFunc.pWelsBlockZero8x8Func(pCurLayer->pScaledTCoeff[iMbXy]+256+64, 8);
+int32_t WelsActualDecodeMbCavlcPSlice (PWelsDecoderContext pCtx) {
+  SVlcTable* pVlcTable     = &pCtx->sVlcTable;
+  PDqLayer pCurLayer		 = pCtx->pCurDqLayer;
+  PBitStringAux pBs		 = pCurLayer->pBitStringAux;
+  PSlice pSlice			 = &pCurLayer->sLayerInfo.sSliceInLayer;
+  PSliceHeader pSliceHeader		     = &pSlice->sSliceHeaderExt.sSliceHeader;
 
-		ST32(&pCurLayer->pNzc[iMbXy][0], 0);
-		ST32(&pCurLayer->pNzc[iMbXy][4], 0);
-		ST32(&pCurLayer->pNzc[iMbXy][8], 0);
-		ST32(&pCurLayer->pNzc[iMbXy][12], 0);
-		ST32(&pCurLayer->pNzc[iMbXy][20], 0);
-		if( pCurLayer->pCbp[iMbXy] == 0 && !IS_INTRA16x16(pCurLayer->pMbType[iMbXy]) && !IS_I_BL(pCurLayer->pMbType[iMbXy]))
-		{
-			pCurLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp;
-			pCurLayer->pChromaQp[iMbXy] = g_kuiChromaQp[WELS_CLIP3(pCurLayer->pLumaQp[iMbXy] + pSliceHeader->pPps->iChromaQpIndexOffset, 0, 51)];
-		}
-	}	
+  SNeighAvail sNeighAvail;
 
-	if ( pCurLayer->pCbp[iMbXy] || MB_TYPE_INTRA16x16 == pCurLayer->pMbType[iMbXy] )
-	{
-		int32_t iQpDelta, iId8x8, iId4x4;	
-		
-		iQpDelta = BsGetSe(pBs);
+  int32_t iScanIdxStart = pSlice->sSliceHeaderExt.uiScanIdxStart;
+  int32_t iScanIdxEnd   = pSlice->sSliceHeaderExt.uiScanIdxEnd;
 
-        if (iQpDelta > 25 || iQpDelta < -26) //out of iQpDelta range
-		{
-			return ERR_INFO_INVALID_QP;
-		}
+  int32_t iMbX = pCurLayer->iMbX;
+  int32_t iMbY = pCurLayer->iMbY;
+  int32_t iMbXy = pCurLayer->iMbXyIndex;
 
-		pCurLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp + iQpDelta; //update iLastMbQp
-		//refer to JVT-X201wcm1.doc equation(7-35)	
-		if ( (unsigned)(pCurLayer->pLumaQp[iMbXy]) > 51 )
-		{
-			if ( pCurLayer->pLumaQp[iMbXy] < 0 )
-			{
-				pCurLayer->pLumaQp[iMbXy] += 52;
-			} 
-			else
-			{
-				pCurLayer->pLumaQp[iMbXy] -= 52;
-			}
-		}
-		//QP should be in the range of [0, 51]
-		if ( pCurLayer->pLumaQp[iMbXy] < 0 || pCurLayer->pLumaQp[iMbXy] > 51 )
-		{
-			return ERR_INFO_INVALID_QP;
-		}
-		pSlice->iLastMbQp = pCurLayer->pLumaQp[iMbXy];
-		pCurLayer->pChromaQp[iMbXy] = g_kuiChromaQp[WELS_CLIP3(pSlice->iLastMbQp + pSliceHeader->pPps->iChromaQpIndexOffset, 0, 51)];
+  int32_t iNMbMode, i;
+  uint32_t uiMbType = 0, uiCbp = 0, uiCbpL = 0, uiCbpC = 0;
 
-		BsStartCavlc( pBs );
+  FORCE_STACK_ALIGN_1D (uint8_t, pNonZeroCount, 48, 16);
+  pCurLayer->pInterPredictionDoneFlag[iMbXy] = 0;//2009.10.23
 
-		if ( MB_TYPE_INTRA16x16 == pCurLayer->pMbType[iMbXy] ) 
-		{
-			//step1: Luma DC
-			if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount, pBs, 0, 16, g_kuiLumaDcZigzagScan, 
-				I16_LUMA_DC, pCurLayer->pScaledTCoeff[iMbXy], iNMbMode, pCurLayer->pLumaQp[iMbXy], pCtx) )
-			{
-				return -1;//abnormal
-			}
-			//step2: Luma AC
-			if (uiCbpL)
-			{
-				for (i = 0; i < 16; i++) 
-				{
-                    if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount,pBs, i,
-							iScanIdxEnd - WELS_MAX(iScanIdxStart, 1) + 1, g_kuiZigzagScan + WELS_MAX(iScanIdxStart,1),
-							I16_LUMA_AC, pCurLayer->pScaledTCoeff[iMbXy] + (i<<4), iNMbMode, pCurLayer->pLumaQp[iMbXy], pCtx) )
-					{
-						return -1;//abnormal
-					}
-				}
-				ST32(&pCurLayer->pNzc[iMbXy][0], LD32(&pNonZeroCount[1+8*1]));
-				ST32(&pCurLayer->pNzc[iMbXy][4], LD32(&pNonZeroCount[1+8*2]));
-				ST32(&pCurLayer->pNzc[iMbXy][8], LD32(&pNonZeroCount[1+8*3]));
-				ST32(&pCurLayer->pNzc[iMbXy][12], LD32(&pNonZeroCount[1+8*4]));
-			}
-			else //pNonZeroCount = 0
-			{
-				ST32(&pCurLayer->pNzc[iMbXy][0], 0);
-				ST32(&pCurLayer->pNzc[iMbXy][4], 0);
-				ST32(&pCurLayer->pNzc[iMbXy][8], 0);
-				ST32(&pCurLayer->pNzc[iMbXy][12], 0);
-			}
-		}
-		else //non-MB_TYPE_INTRA16x16
-		{	
-			for (iId8x8 = 0; iId8x8 < 4; iId8x8++) 
-			{
-				if (uiCbpL & (1 << iId8x8)) 
-				{
-					int32_t iIndex = (iId8x8 << 2);
-					for (iId4x4 = 0; iId4x4 < 4; iId4x4++) 
-					{
-						//Luma (DC and AC decoding together)
-						if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount, pBs, iIndex,
-							iScanIdxEnd - iScanIdxStart + 1, g_kuiZigzagScan+iScanIdxStart, LUMA_DC_AC,
-							pCurLayer->pScaledTCoeff[iMbXy] + (iIndex<<4), iNMbMode, pCurLayer->pLumaQp[iMbXy], pCtx) )
-						{
-							return -1;//abnormal
-						}
-						iIndex++;
-					}
-				}
-				else
-				{					
-					ST16(&pNonZeroCount[g_kuiCacheNzcScanIdx[iId8x8<<2]],0);
-					ST16(&pNonZeroCount[g_kuiCacheNzcScanIdx[(iId8x8<<2)+2]],0);
-				}
-			}	
-			ST32(&pCurLayer->pNzc[iMbXy][0], LD32(&pNonZeroCount[1+8*1]));
-			ST32(&pCurLayer->pNzc[iMbXy][4], LD32(&pNonZeroCount[1+8*2]));
-			ST32(&pCurLayer->pNzc[iMbXy][8], LD32(&pNonZeroCount[1+8*3]));
-			ST32(&pCurLayer->pNzc[iMbXy][12], LD32(&pNonZeroCount[1+8*4]));
+  uiMbType = BsGetUe (pBs);
+  if (uiMbType < 5) { //inter MB type
+    int16_t iMotionVector[LIST_A][30][MV_A];
+
+    int8_t	iRefIndex[LIST_A][30];
+    pCurLayer->pMbType[iMbXy] = g_ksInterMbTypeInfo[uiMbType].iType;
+    WelsFillCacheInter (&sNeighAvail, pNonZeroCount, iMotionVector, iRefIndex, pCurLayer);
+    if (ParseInterInfo (pCtx, iMotionVector, iRefIndex, pBs)) {
+      return -1;//abnormal
+    }
+
+    if (pSlice->sSliceHeaderExt.bAdaptiveResidualPredFlag == 1) {
+      pCurLayer->pResidualPredFlag[iMbXy] =  BsGetOneBit (pBs);
+    } else {
+      pCurLayer->pResidualPredFlag[iMbXy] = pSlice->sSliceHeaderExt.bDefaultResidualPredFlag;
+    }
+
+    if (pCurLayer->pResidualPredFlag[iMbXy] == 0) {
+      iNMbMode = BASE_MB;
+      pCurLayer->pInterPredictionDoneFlag[iMbXy] = 0;
+    } else {
+      WelsLog (pCtx, WELS_LOG_WARNING, "residual_pred_flag = 1 not supported.\n");
+      return -1;
+    }
+  } else { //intra MB type
+    uiMbType -= 5;
+    if (uiMbType > 25) {
+      return ERR_INFO_INVALID_MB_TYPE;
+    }
+
+    if (25 == uiMbType) {
+      int32_t iDecStrideL = pCurLayer->pDec->iLinesize[0];
+      int32_t iDecStrideC = pCurLayer->pDec->iLinesize[1];
+
+      int32_t iOffsetL = (iMbX + iMbY * iDecStrideL) << 4;
+      int32_t iOffsetC = (iMbX + iMbY * iDecStrideC) << 3;
+
+      uint8_t* pDecY = pCurLayer->pCsData[0] + iOffsetL;
+      uint8_t* pDecU = pCurLayer->pCsData[1] + iOffsetC;
+      uint8_t* pDecV = pCurLayer->pCsData[2] + iOffsetC;
+
+      uint8_t* pTmpBsBuf;
+
+      int32_t i;
+      int32_t iCopySizeY  = (sizeof (uint8_t) << 4);
+      int32_t iCopySizeUV = (sizeof (uint8_t) << 3);
+
+      int32_t iIndex = ((-pBs->iLeftBits) >> 3) + 2;
+
+      pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA_PCM;
+
+      //step 1: locate the bit-stream pointer [must be aligned to a whole byte]
+      pBs->pCurBuf -= iIndex;
+
+      //step 2: copy pixels from the bit-stream into fdec [reconstruction]
+      pTmpBsBuf = pBs->pCurBuf;
+      for (i = 0; i < 16; i++) { //luma
+        memcpy (pDecY , pTmpBsBuf, iCopySizeY);
+        pDecY += iDecStrideL;
+        pTmpBsBuf += 16;
+      }
+
+      for (i = 0; i < 8; i++) { //cb
+        memcpy (pDecU, pTmpBsBuf, iCopySizeUV);
+        pDecU += iDecStrideC;
+        pTmpBsBuf += 8;
+      }
+      for (i = 0; i < 8; i++) { //cr
+        memcpy (pDecV, pTmpBsBuf, iCopySizeUV);
+        pDecV += iDecStrideC;
+        pTmpBsBuf += 8;
+      }
+
+      pBs->pCurBuf += 384;
+      InitReadBits (pBs);
+
+      //step 3: update QP and pNonZeroCount
+      pCurLayer->pLumaQp[iMbXy] = 0;
+      pCurLayer->pChromaQp[iMbXy] = 0;
+      ST32 (&pCurLayer->pNzc[iMbXy][0], 0);
+      ST32 (&pCurLayer->pNzc[iMbXy][4], 0);
+      ST32 (&pCurLayer->pNzc[iMbXy][8], 0);
+      ST32 (&pCurLayer->pNzc[iMbXy][12], 0);
+      return 0;
+    } else {
+      if (0 == uiMbType) {
+        FORCE_STACK_ALIGN_1D (int8_t, pIntraPredMode, 48, 16);
+        pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA4x4;
+        pCtx->pFillInfoCacheIntra4x4Func (&sNeighAvail, pNonZeroCount, pIntraPredMode, pCurLayer);
+        if (pCtx->pParseIntra4x4ModeFunc (&sNeighAvail, pIntraPredMode, pBs, pCurLayer)) {
+          return -1;
         }
+        iNMbMode = BASE_MB;
+      } else { //I_PCM exclude, we can ignore it
+        pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA16x16;
+        pCurLayer->pIntraPredMode[iMbXy][7] = (uiMbType - 1) & 3;
+        pCurLayer->pCbp[iMbXy] = g_kuiI16CbpTable[ (uiMbType - 1) >> 2];
+        uiCbpC = pCurLayer->pCbp[iMbXy] >> 4;
+        uiCbpL = pCurLayer->pCbp[iMbXy] & 15;
+        WelsFillCacheNonZeroCount (&sNeighAvail, pNonZeroCount, pCurLayer);
+        if (pCtx->pParseIntra16x16ModeFunc (&sNeighAvail, pBs, pCurLayer)) {
+          return -1;
+        }
+        iNMbMode = BASE_MB;
+      }
+    }
+  }
 
-		
-		//chroma 
-		//step1: DC
-		if ( 1 == uiCbpC || 2 == uiCbpC )
-		{	
-			for (i = 0; i < 2; i++) //Cb Cr
-			{	
-				if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount, pBs,
-					16 + (i << 2), 4, g_kuiChromaDcScan, CHROMA_DC, pCurLayer->pScaledTCoeff[iMbXy] + 256 + (i<<6),
-					iNMbMode, pCurLayer->pChromaQp[iMbXy], pCtx) ) 
-				{
-					return -1;//abnormal
-				}
-			}
-		}
-		else
-		{
-		}
-		//step2: AC
-		if (2 == uiCbpC)
-		{
-			for (i = 0; i < 2; i++) //Cb Cr
-			{
-				int32_t iIndex= 16 + (i<<2);
-				for (iId4x4 = 0; iId4x4 < 4; iId4x4++) 
-				{
-					if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount, pBs, iIndex,
-						iScanIdxEnd - WELS_MAX(iScanIdxStart, 1) + 1, g_kuiZigzagScan + WELS_MAX(iScanIdxStart,1),
-						CHROMA_AC, pCurLayer->pScaledTCoeff[iMbXy]+(iIndex<<4), iNMbMode, pCurLayer->pChromaQp[iMbXy], pCtx) )
-					{
-						return -1;//abnormal
-					}
-					iIndex++;
-				}
-			}
-			ST16(&pCurLayer->pNzc[iMbXy][16], LD16(&pNonZeroCount[6+8*1]));
-			ST16(&pCurLayer->pNzc[iMbXy][20], LD16(&pNonZeroCount[6+8*2]));
-			ST16(&pCurLayer->pNzc[iMbXy][18], LD16(&pNonZeroCount[6+8*4]));
-			ST16(&pCurLayer->pNzc[iMbXy][22], LD16(&pNonZeroCount[6+8*5]));
-		}
-		else 
-		{
-			ST32(&pCurLayer->pNzc[iMbXy][16], 0);
-			ST32(&pCurLayer->pNzc[iMbXy][20], 0); 
-		}
-		BsEndCavlc( pBs );
-	}
-	else
-	{
-		ST32(&pCurLayer->pNzc[iMbXy][0], 0);
-		ST32(&pCurLayer->pNzc[iMbXy][4], 0);
-		ST32(&pCurLayer->pNzc[iMbXy][8], 0);
-		ST32(&pCurLayer->pNzc[iMbXy][12], 0);
-		ST32(&pCurLayer->pNzc[iMbXy][16], 0);
-		ST32(&pCurLayer->pNzc[iMbXy][20], 0);
-	}	
+  if (MB_TYPE_INTRA16x16 != pCurLayer->pMbType[iMbXy]) {
+    uiCbp = BsGetUe (pBs);
+    {
+      if (uiCbp > 47)
+        return ERR_INFO_INVALID_CBP;
+
+      if (MB_TYPE_INTRA4x4 == pCurLayer->pMbType[iMbXy]) {
+        uiCbp = g_kuiIntra4x4CbpTable[uiCbp];
+      } else //inter
+        uiCbp = g_kuiInterCbpTable[uiCbp];
+    }
+
+    pCurLayer->pCbp[iMbXy] = uiCbp;
+    uiCbpC = pCurLayer->pCbp[iMbXy] >> 4;
+    uiCbpL = pCurLayer->pCbp[iMbXy] & 15;
+  }
+
+  if (iNMbMode == BASE_MB) {
+    pCtx->sBlockFunc.pWelsBlockZero16x16Func (pCurLayer->pScaledTCoeff[iMbXy], 16);
+    pCtx->sBlockFunc.pWelsBlockZero8x8Func (pCurLayer->pScaledTCoeff[iMbXy] + 256, 8);
+    pCtx->sBlockFunc.pWelsBlockZero8x8Func (pCurLayer->pScaledTCoeff[iMbXy] + 256 + 64, 8);
+
+    ST32 (&pCurLayer->pNzc[iMbXy][0], 0);
+    ST32 (&pCurLayer->pNzc[iMbXy][4], 0);
+    ST32 (&pCurLayer->pNzc[iMbXy][8], 0);
+    ST32 (&pCurLayer->pNzc[iMbXy][12], 0);
+    ST32 (&pCurLayer->pNzc[iMbXy][20], 0);
+    if (pCurLayer->pCbp[iMbXy] == 0 && !IS_INTRA16x16 (pCurLayer->pMbType[iMbXy]) && !IS_I_BL (pCurLayer->pMbType[iMbXy])) {
+      pCurLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp;
+      pCurLayer->pChromaQp[iMbXy] = g_kuiChromaQp[WELS_CLIP3 (pCurLayer->pLumaQp[iMbXy] +
+                                    pSliceHeader->pPps->iChromaQpIndexOffset, 0, 51)];
+    }
+  }
+
+  if (pCurLayer->pCbp[iMbXy] || MB_TYPE_INTRA16x16 == pCurLayer->pMbType[iMbXy]) {
+    int32_t iQpDelta, iId8x8, iId4x4;
+
+    iQpDelta = BsGetSe (pBs);
+
+    if (iQpDelta > 25 || iQpDelta < -26) { //out of iQpDelta range
+      return ERR_INFO_INVALID_QP;
+    }
+
+    pCurLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp + iQpDelta; //update iLastMbQp
+    //refer to JVT-X201wcm1.doc equation(7-35)
+    if ((unsigned) (pCurLayer->pLumaQp[iMbXy]) > 51) {
+      if (pCurLayer->pLumaQp[iMbXy] < 0) {
+        pCurLayer->pLumaQp[iMbXy] += 52;
+      } else {
+        pCurLayer->pLumaQp[iMbXy] -= 52;
+      }
+    }
+    //QP should be in the range of [0, 51]
+    if (pCurLayer->pLumaQp[iMbXy] < 0 || pCurLayer->pLumaQp[iMbXy] > 51) {
+      return ERR_INFO_INVALID_QP;
+    }
+    pSlice->iLastMbQp = pCurLayer->pLumaQp[iMbXy];
+    pCurLayer->pChromaQp[iMbXy] = g_kuiChromaQp[WELS_CLIP3 (pSlice->iLastMbQp + pSliceHeader->pPps->iChromaQpIndexOffset, 0,
+                                  51)];
+
+    BsStartCavlc (pBs);
+
+    if (MB_TYPE_INTRA16x16 == pCurLayer->pMbType[iMbXy]) {
+      //step1: Luma DC
+      if (WelsResidualBlockCavlc (pVlcTable, pNonZeroCount, pBs, 0, 16, g_kuiLumaDcZigzagScan,
+                                  I16_LUMA_DC, pCurLayer->pScaledTCoeff[iMbXy], iNMbMode, pCurLayer->pLumaQp[iMbXy], pCtx)) {
+        return -1;//abnormal
+      }
+      //step2: Luma AC
+      if (uiCbpL) {
+        for (i = 0; i < 16; i++) {
+          if (WelsResidualBlockCavlc (pVlcTable, pNonZeroCount, pBs, i,
+                                      iScanIdxEnd - WELS_MAX (iScanIdxStart, 1) + 1, g_kuiZigzagScan + WELS_MAX (iScanIdxStart, 1),
+                                      I16_LUMA_AC, pCurLayer->pScaledTCoeff[iMbXy] + (i << 4), iNMbMode, pCurLayer->pLumaQp[iMbXy], pCtx)) {
+            return -1;//abnormal
+          }
+        }
+        ST32 (&pCurLayer->pNzc[iMbXy][0], LD32 (&pNonZeroCount[1 + 8 * 1]));
+        ST32 (&pCurLayer->pNzc[iMbXy][4], LD32 (&pNonZeroCount[1 + 8 * 2]));
+        ST32 (&pCurLayer->pNzc[iMbXy][8], LD32 (&pNonZeroCount[1 + 8 * 3]));
+        ST32 (&pCurLayer->pNzc[iMbXy][12], LD32 (&pNonZeroCount[1 + 8 * 4]));
+      } else { //pNonZeroCount = 0
+        ST32 (&pCurLayer->pNzc[iMbXy][0], 0);
+        ST32 (&pCurLayer->pNzc[iMbXy][4], 0);
+        ST32 (&pCurLayer->pNzc[iMbXy][8], 0);
+        ST32 (&pCurLayer->pNzc[iMbXy][12], 0);
+      }
+    } else { //non-MB_TYPE_INTRA16x16
+      for (iId8x8 = 0; iId8x8 < 4; iId8x8++) {
+        if (uiCbpL & (1 << iId8x8)) {
+          int32_t iIndex = (iId8x8 << 2);
+          for (iId4x4 = 0; iId4x4 < 4; iId4x4++) {
+            //Luma (DC and AC decoding together)
+            if (WelsResidualBlockCavlc (pVlcTable, pNonZeroCount, pBs, iIndex,
+                                        iScanIdxEnd - iScanIdxStart + 1, g_kuiZigzagScan + iScanIdxStart, LUMA_DC_AC,
+                                        pCurLayer->pScaledTCoeff[iMbXy] + (iIndex << 4), iNMbMode, pCurLayer->pLumaQp[iMbXy], pCtx)) {
+              return -1;//abnormal
+            }
+            iIndex++;
+          }
+        } else {
+          ST16 (&pNonZeroCount[g_kuiCacheNzcScanIdx[iId8x8 << 2]], 0);
+          ST16 (&pNonZeroCount[g_kuiCacheNzcScanIdx[ (iId8x8 << 2) + 2]], 0);
+        }
+      }
+      ST32 (&pCurLayer->pNzc[iMbXy][0], LD32 (&pNonZeroCount[1 + 8 * 1]));
+      ST32 (&pCurLayer->pNzc[iMbXy][4], LD32 (&pNonZeroCount[1 + 8 * 2]));
+      ST32 (&pCurLayer->pNzc[iMbXy][8], LD32 (&pNonZeroCount[1 + 8 * 3]));
+      ST32 (&pCurLayer->pNzc[iMbXy][12], LD32 (&pNonZeroCount[1 + 8 * 4]));
+    }
 
-	return 0;
+
+    //chroma
+    //step1: DC
+    if (1 == uiCbpC || 2 == uiCbpC) {
+      for (i = 0; i < 2; i++) { //Cb Cr
+        if (WelsResidualBlockCavlc (pVlcTable, pNonZeroCount, pBs,
+                                    16 + (i << 2), 4, g_kuiChromaDcScan, CHROMA_DC, pCurLayer->pScaledTCoeff[iMbXy] + 256 + (i << 6),
+                                    iNMbMode, pCurLayer->pChromaQp[iMbXy], pCtx)) {
+          return -1;//abnormal
+        }
+      }
+    } else {
+    }
+    //step2: AC
+    if (2 == uiCbpC) {
+      for (i = 0; i < 2; i++) { //Cb Cr
+        int32_t iIndex = 16 + (i << 2);
+        for (iId4x4 = 0; iId4x4 < 4; iId4x4++) {
+          if (WelsResidualBlockCavlc (pVlcTable, pNonZeroCount, pBs, iIndex,
+                                      iScanIdxEnd - WELS_MAX (iScanIdxStart, 1) + 1, g_kuiZigzagScan + WELS_MAX (iScanIdxStart, 1),
+                                      CHROMA_AC, pCurLayer->pScaledTCoeff[iMbXy] + (iIndex << 4), iNMbMode, pCurLayer->pChromaQp[iMbXy], pCtx)) {
+            return -1;//abnormal
+          }
+          iIndex++;
+        }
+      }
+      ST16 (&pCurLayer->pNzc[iMbXy][16], LD16 (&pNonZeroCount[6 + 8 * 1]));
+      ST16 (&pCurLayer->pNzc[iMbXy][20], LD16 (&pNonZeroCount[6 + 8 * 2]));
+      ST16 (&pCurLayer->pNzc[iMbXy][18], LD16 (&pNonZeroCount[6 + 8 * 4]));
+      ST16 (&pCurLayer->pNzc[iMbXy][22], LD16 (&pNonZeroCount[6 + 8 * 5]));
+    } else {
+      ST32 (&pCurLayer->pNzc[iMbXy][16], 0);
+      ST32 (&pCurLayer->pNzc[iMbXy][20], 0);
+    }
+    BsEndCavlc (pBs);
+  } else {
+    ST32 (&pCurLayer->pNzc[iMbXy][0], 0);
+    ST32 (&pCurLayer->pNzc[iMbXy][4], 0);
+    ST32 (&pCurLayer->pNzc[iMbXy][8], 0);
+    ST32 (&pCurLayer->pNzc[iMbXy][12], 0);
+    ST32 (&pCurLayer->pNzc[iMbXy][16], 0);
+    ST32 (&pCurLayer->pNzc[iMbXy][20], 0);
+  }
+
+  return 0;
 }
 
-int32_t WelsDecodeMbCavlcPSlice(PWelsDecoderContext pCtx, PNalUnit pNalCur)
-{
-	PDqLayer pCurLayer		 = pCtx->pCurDqLayer;
-	PBitStringAux pBs		 = pCurLayer->pBitStringAux;
-	PSlice pSlice			 = &pCurLayer->sLayerInfo.sSliceInLayer;
-	PSliceHeader pSliceHeader		    = &pSlice->sSliceHeaderExt.sSliceHeader;
+int32_t WelsDecodeMbCavlcPSlice (PWelsDecoderContext pCtx, PNalUnit pNalCur) { // Decode one macroblock of a CAVLC P slice: consume a pending skip run, otherwise parse the MB; returns 0 on success, non-zero on error
+  PDqLayer pCurLayer		 = pCtx->pCurDqLayer;
+  PBitStringAux pBs		 = pCurLayer->pBitStringAux;
+  PSlice pSlice			 = &pCurLayer->sLayerInfo.sSliceInLayer;
+  PSliceHeader pSliceHeader		    = &pSlice->sSliceHeaderExt.sSliceHeader;
 
-	int32_t iMbXy = pCurLayer->iMbXyIndex;
-	int32_t iBaseModeFlag, i;
-	int32_t iRet = 0; //should have the return value to indicate decoding error or not, It's NECESSARY--2010.4.15
+  int32_t iMbXy = pCurLayer->iMbXyIndex; // index of the current MB into the per-MB arrays of the layer
+  int32_t iBaseModeFlag, i;
+  int32_t iRet = 0; // result of the actual MB parse; a non-zero value is propagated to the caller as a decoding error
 
-	if (-1 == pSlice->iMbSkipRun) 
-	{
-		pSlice->iMbSkipRun = BsGetUe(pBs);
-		if ( -1 == pSlice->iMbSkipRun )
-		{
-			return -1;
-		}
-		
-	}
-	if (pSlice->iMbSkipRun--)
-	{
-		int16_t iMv[2] = {0};
-		
-		pCurLayer->pMbType[iMbXy] = MB_TYPE_SKIP;
-		ST32(&pCurLayer->pNzc[iMbXy][0], 0);
-		ST32(&pCurLayer->pNzc[iMbXy][4], 0);
-		ST32(&pCurLayer->pNzc[iMbXy][8], 0);
-		ST32(&pCurLayer->pNzc[iMbXy][12], 0);
-		ST32(&pCurLayer->pNzc[iMbXy][16], 0);
-		ST32(&pCurLayer->pNzc[iMbXy][20], 0);
-		
-		pCurLayer->pInterPredictionDoneFlag[iMbXy] = 0;
-		memset(pCurLayer->pRefIndex[0][iMbXy], 0, sizeof(int8_t) * 16);
-		
-		//predict iMv			
-		PredPSkipMvFromNeighbor( pCurLayer, iMv );
-		for (i = 0; i < 16; i++)
-		{
-			ST32( pCurLayer->pMv[0][iMbXy][i], *(uint32_t*)iMv );
-		}
-		
-		if(!pSlice->sSliceHeaderExt.bDefaultResidualPredFlag)
-		{
-			memset(pCurLayer->pScaledTCoeff[iMbXy], 0, 384*sizeof(int16_t));
-		}
+  if (-1 == pSlice->iMbSkipRun) { // a stored run of -1 triggers reading a new skip-run value from the bitstream
+    pSlice->iMbSkipRun = BsGetUe (pBs);
+    if (-1 == pSlice->iMbSkipRun) { // a freshly read value of -1 is rejected as an error
+      return -1;
+    }
 
-		//reset rS
-		if(!pSlice->sSliceHeaderExt.bDefaultResidualPredFlag ||
-			(pNalCur->sNalHeaderExt.uiQualityId==0 && pNalCur->sNalHeaderExt.uiDependencyId==0))
-		{
-			pCurLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp;
-			pCurLayer->pChromaQp[iMbXy] = g_kuiChromaQp[WELS_CLIP3(pCurLayer->pLumaQp[iMbXy] + pSliceHeader->pPps->iChromaQpIndexOffset, 0, 51)];
-		}
+  }
+  if (pSlice->iMbSkipRun--) { // non-zero run: this MB is skipped; the post-decrement also turns 0 into -1, which forces a re-read on the next call
+    int16_t iMv[2] = {0};
 
-		pCurLayer->pCbp[iMbXy] = 0;
+    pCurLayer->pMbType[iMbXy] = MB_TYPE_SKIP;
+    ST32 (&pCurLayer->pNzc[iMbXy][0], 0); // this and the next five stores zero the non-zero-count entries at offsets 0..20 of this MB
+    ST32 (&pCurLayer->pNzc[iMbXy][4], 0);
+    ST32 (&pCurLayer->pNzc[iMbXy][8], 0);
+    ST32 (&pCurLayer->pNzc[iMbXy][12], 0);
+    ST32 (&pCurLayer->pNzc[iMbXy][16], 0);
+    ST32 (&pCurLayer->pNzc[iMbXy][20], 0);
 
-		return 0;
-	}
+    pCurLayer->pInterPredictionDoneFlag[iMbXy] = 0;
+    memset (pCurLayer->pRefIndex[0][iMbXy], 0, sizeof (int8_t) * 16); // all 16 list-0 reference indices of a skipped MB are set to 0
 
-	if(	pSlice->sSliceHeaderExt.bAdaptiveBaseModeFlag == 1)
-	{
-		iBaseModeFlag = BsGetOneBit(pBs);
-	}
-	else
-	{
-		iBaseModeFlag = pSlice->sSliceHeaderExt.bDefaultBaseModeFlag;
-	}
-    if( !iBaseModeFlag )
-    {
-        iRet = WelsActualDecodeMbCavlcPSlice( pCtx );
+    // predict the motion vector of the skipped MB from its neighbors
+    PredPSkipMvFromNeighbor (pCurLayer, iMv);
+    for (i = 0; i < 16; i++) { // replicate the predicted MV into all 16 MV slots of this MB
+      ST32 (pCurLayer->pMv[0][iMbXy][i], * (uint32_t*)iMv); // presumably one 32-bit store covering both int16_t components — confirm against the ST32 macro
     }
-    else
-    {
-        WelsLog( pCtx, WELS_LOG_WARNING, "iBaseModeFlag (%d) != 0, inter-layer prediction not supported.\n", iBaseModeFlag);
-        return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_ILP);
+
+    if (!pSlice->sSliceHeaderExt.bDefaultResidualPredFlag) { // without default residual prediction the 384 stored coefficients of this MB are cleared
+      memset (pCurLayer->pScaledTCoeff[iMbXy], 0, 384 * sizeof (int16_t));
     }
-	if ( iRet ) //occur error when parsing, MUST STOP decoding
-	{
-		return iRet;
-	}
 
-	return 0;
+    //reset rS
+    if (!pSlice->sSliceHeaderExt.bDefaultResidualPredFlag ||
+        (pNalCur->sNalHeaderExt.uiQualityId == 0 && pNalCur->sNalHeaderExt.uiDependencyId == 0)) {
+      pCurLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp; // a skipped MB inherits the QP of the last MB in the slice
+      pCurLayer->pChromaQp[iMbXy] = g_kuiChromaQp[WELS_CLIP3 (pCurLayer->pLumaQp[iMbXy] +
+                                    pSliceHeader->pPps->iChromaQpIndexOffset, 0, 51)];
+    }
+
+    pCurLayer->pCbp[iMbXy] = 0;
+
+    return 0; // skipped MB fully handled, nothing more to parse
+  }
+
+  if (pSlice->sSliceHeaderExt.bAdaptiveBaseModeFlag == 1) { // the base-mode flag is either read per MB or taken from the slice-header default
+    iBaseModeFlag = BsGetOneBit (pBs);
+  } else {
+    iBaseModeFlag = pSlice->sSliceHeaderExt.bDefaultBaseModeFlag;
+  }
+  if (!iBaseModeFlag) {
+    iRet = WelsActualDecodeMbCavlcPSlice (pCtx);
+  } else { // a set base-mode flag means inter-layer prediction, which is logged and refused
+    WelsLog (pCtx, WELS_LOG_WARNING, "iBaseModeFlag (%d) != 0, inter-layer prediction not supported.\n", iBaseModeFlag);
+    return GENERATE_ERROR_NO (ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_ILP);
+  }
+  if (iRet) { //occur error when parsing, MUST STOP decoding
+    return iRet;
+  }
+
+  return 0;
 }
 
-void_t WelsBlockInit(int16_t* pBlock, int32_t iWidth, int32_t iHeight, int32_t iStride, uint8_t uiVal)
-{
-	int32_t i;
-	int16_t* pDst = pBlock;
-	
-	for(i=0; i<iHeight; i++)
-	{
-		memset(pDst, uiVal, iWidth*sizeof(int16_t));
-		pDst += iStride;
-	}
+void_t WelsBlockInit (int16_t* pBlock, int32_t iWidth, int32_t iHeight, int32_t iStride, uint8_t uiVal) { // Fill iHeight rows of iWidth int16_t elements, rows iStride elements apart, using memset with uiVal
+  int32_t i;
+  int16_t* pDst = pBlock;
+
+  for (i = 0; i < iHeight; i++) {
+    memset (pDst, uiVal, iWidth * sizeof (int16_t)); // NOTE: memset fills BYTES, so each int16_t ends up with uiVal in both bytes rather than holding the value uiVal; exact only for 0, the value passed by the callers in this file
+    pDst += iStride; // stride is counted in int16_t elements (pointer arithmetic on int16_t*), not bytes
+  }
 }
 
-void_t WelsBlockFuncInit(SBlockFunc  * pFunc,  int32_t iCpu)
-{
-	pFunc->pWelsBlockZero16x16Func		= WelsBlockZero16x16_c;
-	pFunc->pWelsBlockZero8x8Func	    = WelsBlockZero8x8_c;
-	pFunc->pWelsSetNonZeroCountFunc	    = SetNonZeroCount_c;
+void_t WelsBlockFuncInit (SBlockFunc*   pFunc,  int32_t iCpu) { // Install the block helper function pointers in *pFunc: C versions first, SSE2 versions when available
+  pFunc->pWelsBlockZero16x16Func		= WelsBlockZero16x16_c; // portable defaults, always assigned
+  pFunc->pWelsBlockZero8x8Func	    = WelsBlockZero8x8_c;
+  pFunc->pWelsSetNonZeroCountFunc	    = SetNonZeroCount_c; // no SIMD replacement is installed for this one below
 
 #ifdef  X86_ASM
-	if( iCpu & WELS_CPU_SSE2 ){
-		pFunc->pWelsBlockZero16x16Func		= WelsResBlockZero16x16_sse2;
-		pFunc->pWelsBlockZero8x8Func	    = WelsResBlockZero8x8_sse2;
-	}
+  if (iCpu & WELS_CPU_SSE2) { // iCpu is tested as a bit mask of CPU feature flags
+    pFunc->pWelsBlockZero16x16Func		= WelsResBlockZero16x16_sse2; // override the C defaults with the SSE2 routines
+    pFunc->pWelsBlockZero8x8Func	    = WelsResBlockZero8x8_sse2;
+  }
 #endif
 }
-void_t WelsBlockZero16x16_c(int16_t * pBlock, int32_t iStride)
-{
-    WelsBlockInit(pBlock,16,16,iStride,0);
+void_t WelsBlockZero16x16_c (int16_t* pBlock, int32_t iStride) { // C fallback: zero a 16x16 block of int16_t whose rows are iStride elements apart
+  WelsBlockInit (pBlock, 16, 16, iStride, 0); // width 16, height 16, fill value 0
 }
 
-void_t WelsBlockZero8x8_c(int16_t * pBlock, int32_t iStride)
-{
-	WelsBlockInit(pBlock,8,8,iStride,0);
+void_t WelsBlockZero8x8_c (int16_t* pBlock, int32_t iStride) { // C fallback: zero an 8x8 block of int16_t whose rows are iStride elements apart
+  WelsBlockInit (pBlock, 8, 8, iStride, 0); // width 8, height 8, fill value 0
 }
 
-void_t SetNonZeroCount_c(int16_t* pBlock, int8_t* pNonZeroCount)
-{
-    int32_t i;
-    int32_t iIndex;
+void_t SetNonZeroCount_c (int16_t* pBlock, int8_t* pNonZeroCount) { // Collapse 24 non-zero-count entries to 0/1 flags in place; pBlock is not used by this implementation
+  int32_t i;
+  int32_t iIndex;
 
-	for( i=0;i<24;i++ ){
-        iIndex = g_kuiMbNonZeroCountIdx[i];
-	    pNonZeroCount[iIndex] = !!pNonZeroCount[iIndex];
-	}
+  for (i = 0; i < 24; i++) {
+    iIndex = g_kuiMbNonZeroCountIdx[i]; // table maps the loop counter to the position inside pNonZeroCount
+    pNonZeroCount[iIndex] = !!pNonZeroCount[iIndex]; // any non-zero count becomes 1, zero stays 0
+  }
 }
 
 } // namespace WelsDec
--- a/codec/decoder/core/src/decoder.cpp
+++ b/codec/decoder/core/src/decoder.cpp
@@ -65,152 +65,137 @@
 
 namespace WelsDec {
 
-extern PPicture AllocPicture( PWelsDecoderContext pCtx,const int32_t kiPicWidth, const int32_t kiPicHeight );
+extern PPicture AllocPicture (PWelsDecoderContext pCtx, const int32_t kiPicWidth, const int32_t kiPicHeight);
 
-extern void_t FreePicture( PPicture pPic );
+extern void_t FreePicture (PPicture pPic);
 
 #ifdef WORDS_BIGENDIAN
-inline void_t GetValueOf4Bytes( uint8_t* pDstNal, int32_t iDdstIdx )
-{
-	pDstNal[0] = (iDdstIdx & 0xff000000) >> 24;
-	pDstNal[1] = (iDdstIdx & 0xff0000) >> 16;
-	pDstNal[2] = (iDdstIdx & 0xff00) >> 8;
-	pDstNal[3] = (iDdstIdx & 0xff);
+inline void_t GetValueOf4Bytes (uint8_t* pDstNal, int32_t iDdstIdx) { // WORDS_BIGENDIAN variant: despite the "Get" name this WRITES the 4 bytes of iDdstIdx into pDstNal[0..3], most significant byte first
+  pDstNal[0] = (iDdstIdx & 0xff000000) >> 24; // bits 31..24
+  pDstNal[1] = (iDdstIdx & 0xff0000) >> 16; // bits 23..16
+  pDstNal[2] = (iDdstIdx & 0xff00) >> 8; // bits 15..8
+  pDstNal[3] = (iDdstIdx & 0xff); // bits 7..0
 }
 #else //WORDS_BIGENDIAN
-inline void_t GetValueOf4Bytes( uint8_t* pDstNal, int32_t iDdstIdx )
-{
-	pDstNal[0] = (iDdstIdx & 0xff);
-	pDstNal[1] = (iDdstIdx & 0xff00) >> 8;
-	pDstNal[2] = (iDdstIdx & 0xff0000) >> 16;
-	pDstNal[3] = (iDdstIdx & 0xff000000) >> 24;
+inline void_t GetValueOf4Bytes (uint8_t* pDstNal, int32_t iDdstIdx) { // variant used when WORDS_BIGENDIAN is not defined: despite the "Get" name this WRITES the 4 bytes of iDdstIdx into pDstNal[0..3], least significant byte first
+  pDstNal[0] = (iDdstIdx & 0xff); // bits 7..0
+  pDstNal[1] = (iDdstIdx & 0xff00) >> 8; // bits 15..8
+  pDstNal[2] = (iDdstIdx & 0xff0000) >> 16; // bits 23..16
+  pDstNal[3] = (iDdstIdx & 0xff000000) >> 24; // bits 31..24
 }
 #endif //WORDS_BIGENDIAN
 
-static int32_t CreatePicBuff(PWelsDecoderContext pCtx, PPicBuff *ppPicBuf, const int32_t kiSize, const int32_t kiPicWidth, const int32_t kiPicHeight)
-{
-	PPicBuff pPicBuf = NULL;
-	int32_t iPicIdx = 0;
-	if (kiSize <= 0 || kiPicWidth <= 0 || kiPicHeight <= 0)
-	{
-		return 1;
-	}
+static int32_t CreatePicBuff (PWelsDecoderContext pCtx, PPicBuff* ppPicBuf, const int32_t kiSize,
+                              const int32_t kiPicWidth, const int32_t kiPicHeight) { // Allocate a picture buffer with kiSize pictures of kiPicWidth x kiPicHeight; returns 0 and stores it in *ppPicBuf on success, 1 on any failure
+  PPicBuff pPicBuf = NULL;
+  int32_t iPicIdx = 0;
+  if (kiSize <= 0 || kiPicWidth <= 0 || kiPicHeight <= 0) { // reject non-positive counts and dimensions before allocating anything
+    return 1;
+  }
 
-	pPicBuf	= (PPicBuff)WelsMalloc( sizeof(SPicBuff), "PPicBuff" );
+  pPicBuf	= (PPicBuff)WelsMalloc (sizeof (SPicBuff), "PPicBuff"); // the buffer descriptor itself
 
-	if ( NULL == pPicBuf )
-	{
-		return 1;
-	}
+  if (NULL == pPicBuf) {
+    return 1;
+  }
 
-	pPicBuf->ppPic = (PPicture *)WelsMalloc(kiSize * sizeof(PPicture), "PPicture*");
+  pPicBuf->ppPic = (PPicture*)WelsMalloc (kiSize * sizeof (PPicture), "PPicture*"); // array of kiSize picture pointers
 
-	if ( NULL == pPicBuf->ppPic )
-	{
-		return 1;
-	}
-	for (iPicIdx = 0; iPicIdx < kiSize; ++ iPicIdx)
-	{	
-		PPicture pPic = AllocPicture( pCtx, kiPicWidth, kiPicHeight );
-		if ( NULL == pPic )
-		{
-			return 1;
-		}
-		pPicBuf->ppPic[iPicIdx] = pPic;
-	}
+  if (NULL == pPicBuf->ppPic) {
+    return 1; // NOTE(review): pPicBuf allocated above is not freed on this path — looks like a leak; confirm
+  }
+  for (iPicIdx = 0; iPicIdx < kiSize; ++ iPicIdx) {
+    PPicture pPic = AllocPicture (pCtx, kiPicWidth, kiPicHeight);
+    if (NULL == pPic) {
+      return 1; // NOTE(review): pPicBuf, its ppPic array and the pictures allocated so far are not released here — looks like a leak; confirm
+    }
+    pPicBuf->ppPic[iPicIdx] = pPic;
+  }
 
-	// initialize context in queue
-	pPicBuf->iCapacity	 = kiSize;	
-	pPicBuf->iCurrentIdx = 0;
-	*ppPicBuf			 = pPicBuf;
+  // initialize context in queue
+  pPicBuf->iCapacity	 = kiSize;
+  pPicBuf->iCurrentIdx = 0;
+  *ppPicBuf			 = pPicBuf; // the out-parameter is written only after every picture was allocated
 
-	return 0;
+  return 0;
 }
 
-static void_t DestroyPicBuff( PPicBuff *ppPicBuf )
-{
-	PPicBuff pPicBuf = NULL;
+static void_t DestroyPicBuff (PPicBuff* ppPicBuf) { // Free every picture, the pointer array and the buffer descriptor, then set *ppPicBuf to NULL; a NULL argument or NULL *ppPicBuf is a no-op
+  PPicBuff pPicBuf = NULL;
 
-	if ( NULL == ppPicBuf || NULL == *ppPicBuf )
-		return;
+  if (NULL == ppPicBuf || NULL == *ppPicBuf)
+    return;
 
-	pPicBuf = *ppPicBuf;
-	while(pPicBuf->ppPic != NULL)
-	{
-		int32_t iPicIdx = 0;
-		while (iPicIdx < pPicBuf->iCapacity)
-		{
-			PPicture pPic = pPicBuf->ppPic[iPicIdx];
-			if(pPic != NULL)
-			{
-				FreePicture( pPic );
-			}	
-			pPic = NULL;
-			++ iPicIdx;
-		}
+  pPicBuf = *ppPicBuf;
+  while (pPicBuf->ppPic != NULL) { // runs at most once: ppPic is set to NULL at the end of the body
+    int32_t iPicIdx = 0;
+    while (iPicIdx < pPicBuf->iCapacity) { // visits iCapacity slots, so iCapacity must describe how many slots are valid
+      PPicture pPic = pPicBuf->ppPic[iPicIdx];
+      if (pPic != NULL) {
+        FreePicture (pPic);
+      }
+      pPic = NULL; // clears only the local copy; the array slot is not reset (the array is freed right after the loop)
+      ++ iPicIdx;
+    }
 
-		WelsFree(pPicBuf->ppPic, "pPicBuf->queue");
+    WelsFree (pPicBuf->ppPic, "pPicBuf->queue");
 
-		pPicBuf->ppPic	= NULL;
-	}
-	pPicBuf->iCapacity	= 0;
-	pPicBuf->iCurrentIdx= 0;
+    pPicBuf->ppPic	= NULL;
+  }
+  pPicBuf->iCapacity	= 0;
+  pPicBuf->iCurrentIdx = 0;
 
-	WelsFree( pPicBuf, "pPicBuf" );
+  WelsFree (pPicBuf, "pPicBuf");
 
-	pPicBuf = NULL;
-	*ppPicBuf = NULL;
+  pPicBuf = NULL;
+  *ppPicBuf = NULL; // leave the caller's pointer NULL so a second call returns early
 }
 /*
  * fill data fields in default for decoder context
  */
-void_t WelsDecoderDefaults( PWelsDecoderContext pCtx )
-{
-    int32_t iCpuCores               = 1;
-	memset( pCtx, 0, sizeof(SWelsDecoderContext) );	// fill zero first
+void_t WelsDecoderDefaults (PWelsDecoderContext pCtx) { // Reset *pCtx to default values; also writes the global g_uiCacheLineSize. pCtx is dereferenced without a NULL check
+  int32_t iCpuCores               = 1; // only passed to WelsCPUFeatureDetect, i.e. only used when X86_ASM is defined
+  memset (pCtx, 0, sizeof (SWelsDecoderContext));	// zero the whole context first; the assignments below set the non-zero or explicit defaults
 
-	pCtx->pArgDec                   = NULL;
+  pCtx->pArgDec                   = NULL;
 
-	pCtx->iOutputColorFormat		= videoFormatI420;	// yuv in default
-	pCtx->bHaveGotMemory			= false;	// not ever request memory blocks for decoder context related
-	pCtx->uiCpuFlag					= 0;
-	
-	pCtx->bAuReadyFlag				= 0; // au data is not ready
-	
+  pCtx->iOutputColorFormat		= videoFormatI420;	// yuv in default
+  pCtx->bHaveGotMemory			= false;	// no memory blocks have been requested for this context yet
+  pCtx->uiCpuFlag					= 0;
 
-	g_uiCacheLineSize				= 16;
+  pCtx->bAuReadyFlag				= 0; // au data is not ready
+
+
+  g_uiCacheLineSize				= 16; // default; raised below when CPU detection reports a 32- or 64-byte cache line
 #if defined(X86_ASM)
-	pCtx->uiCpuFlag = WelsCPUFeatureDetect(&iCpuCores);
+  pCtx->uiCpuFlag = WelsCPUFeatureDetect (&iCpuCores);
 #ifdef HAVE_CACHE_LINE_ALIGN
-	if ( pCtx->uiCpuFlag & WELS_CPU_CACHELINE_64 )
-	{
-		g_uiCacheLineSize	= 64;
-	}
-	else if ( pCtx->uiCpuFlag & WELS_CPU_CACHELINE_32 )
-	{
-		g_uiCacheLineSize	= 32;
-	}
+  if (pCtx->uiCpuFlag & WELS_CPU_CACHELINE_64) {
+    g_uiCacheLineSize	= 64;
+  } else if (pCtx->uiCpuFlag & WELS_CPU_CACHELINE_32) {
+    g_uiCacheLineSize	= 32;
+  }
 #endif//HAVE_CACHE_LINE_ALIGN
 #endif//X86_ASM	
 
-	pCtx->iImgWidthInPixel		= 0;
-	pCtx->iImgHeightInPixel		= 0;		// alloc picture data when picture size is available
+  pCtx->iImgWidthInPixel		= 0;
+  pCtx->iImgHeightInPixel		= 0;		// alloc picture data when picture size is available
 
-	pCtx->iFrameNum				= -1;
-	pCtx->iPrevFrameNum			= -1;
-	pCtx->iErrorCode			= ERR_NONE;
-	
-	pCtx->pDec					= NULL;
+  pCtx->iFrameNum				= -1; // -1 rather than the memset zero; presumably means "no frame seen yet" — confirm with the readers of this field
+  pCtx->iPrevFrameNum			= -1;
+  pCtx->iErrorCode			= ERR_NONE;
 
-	WelsResetRefPic(pCtx);
-	
-	pCtx->iActiveFmoNum			= 0;
+  pCtx->pDec					= NULL;
 
-	pCtx->pPicBuff[LIST_0]		= NULL;
-	pCtx->pPicBuff[LIST_1]		= NULL;
+  WelsResetRefPic (pCtx);
 
-	pCtx->bAvcBasedFlag			= true;
+  pCtx->iActiveFmoNum			= 0;
 
+  pCtx->pPicBuff[LIST_0]		= NULL; // no picture buffers exist yet for either list
+  pCtx->pPicBuff[LIST_1]		= NULL;
+
+  pCtx->bAvcBasedFlag			= true;
+
 }
 
 /*
@@ -221,231 +206,212 @@
 /*
  *	get size of reference picture list in target layer incoming, = (iNumRefFrames x 2)
  */
-static inline int32_t GetTargetRefListSize( PWelsDecoderContext pCtx )
-{	
-	bool_t  *pSubsetSpsAvail= &pCtx->bSubspsAvailFlags[0];
-	bool_t  *pSpsAvail		= &pCtx->bSpsAvailFlags[0];
-	int32_t iSubsetIdx		= -1;
-	int32_t iSpsIdx			= -1;
-	bool_t  bExistSubsetSps = false;
-	int32_t bExistSps		= false;
-	int32_t iPos			= MAX_SPS_COUNT - 1;
-	int32_t iNumRefFrames	= 0;
+static inline int32_t GetTargetRefListSize (PWelsDecoderContext pCtx) { // Return the picture-queue size to allocate: iNumRefFrames + 1 of the highest-index available subset SPS (else SPS), or MAX_REF_PIC_COUNT when none is available
+  bool_t*  pSubsetSpsAvail = &pCtx->bSubspsAvailFlags[0];
+  bool_t*  pSpsAvail		= &pCtx->bSpsAvailFlags[0];
+  int32_t iSubsetIdx		= -1;
+  int32_t iSpsIdx			= -1;
+  bool_t  bExistSubsetSps = false;
+  int32_t bExistSps		= false;
+  int32_t iPos			= MAX_SPS_COUNT - 1;
+  int32_t iNumRefFrames	= 0;
 
-	while (iPos >= 0)
-	{
-		if ( pSubsetSpsAvail[iPos] )
-		{
-			bExistSubsetSps	= true;
-			iSubsetIdx		= iPos;
-			break;
-		}
-		-- iPos;
-	}
+  while (iPos >= 0) { // scan from the highest index down and stop at the first available subset SPS
+    if (pSubsetSpsAvail[iPos]) {
+      bExistSubsetSps	= true;
+      iSubsetIdx		= iPos;
+      break;
+    }
+    -- iPos;
+  }
 
-	if ( !bExistSubsetSps )
-	{
-		iPos = MAX_SPS_COUNT - 1;
-		while (iPos >= 0)
-		{
-			if ( pSpsAvail[iPos] )
-			{
-				bExistSps	= true;
-				iSpsIdx		= iPos;
-				break;
-			}
-			-- iPos;
-		}
-	}
+  if (!bExistSubsetSps) { // plain SPSs are consulted only when no subset SPS was found
+    iPos = MAX_SPS_COUNT - 1;
+    while (iPos >= 0) {
+      if (pSpsAvail[iPos]) {
+        bExistSps	= true;
+        iSpsIdx		= iPos;
+        break;
+      }
+      -- iPos;
+    }
+  }
 
-	if ( !(bExistSubsetSps || bExistSps) )
-	{
-		iNumRefFrames = MAX_REF_PIC_COUNT;
-	}
-	else
-	{
-		PSps pSps = bExistSubsetSps ? (&pCtx->sSubsetSpsBuffer[iSubsetIdx].sSps) : (&pCtx->sSpsBuffer[iSpsIdx]);
-		
-        iNumRefFrames	= (pSps->iNumRefFrames ) + 1;
-	}
+  if (! (bExistSubsetSps || bExistSps)) { // no parameter set available at all: fall back to the maximum
+    iNumRefFrames = MAX_REF_PIC_COUNT;
+  } else {
+    PSps pSps = bExistSubsetSps ? (&pCtx->sSubsetSpsBuffer[iSubsetIdx].sSps) : (&pCtx->sSpsBuffer[iSpsIdx]);
 
-	if ( 0 == iNumRefFrames )
-        iNumRefFrames	= (MIN_REF_PIC_COUNT);
-	
+    iNumRefFrames	= (pSps->iNumRefFrames) + 1; // NOTE(review): nearby comments describe the size as "iNumRefFrames x 2", but the code adds 1
+  }
+
+  if (0 == iNumRefFrames) // NOTE(review): after the "+ 1" above this holds only if pSps->iNumRefFrames is -1 or MAX_REF_PIC_COUNT is 0 — possibly dead code; confirm
+    iNumRefFrames	= (MIN_REF_PIC_COUNT);
+
 #ifdef LONG_TERM_REF
-	//pic_queue size minimum set 2
-	if (iNumRefFrames <2)
-	{
-		iNumRefFrames = 2;
-	}
+  //pic_queue size minimum set 2
+  if (iNumRefFrames < 2) {
+    iNumRefFrames = 2;
+  }
 #endif
 
-	return iNumRefFrames;
+  return iNumRefFrames;
 }
 
 /*
  *	request memory blocks for decoder avc part
  */
-int32_t WelsRequestMem( PWelsDecoderContext pCtx, const int32_t kiMbWidth, const int32_t kiMbHeight )
-{
-	const int32_t kiPicWidth	= kiMbWidth << 4;
-	const int32_t kiPicHeight	= kiMbHeight << 4;
-	int32_t iErr = ERR_NONE;
+int32_t WelsRequestMem (PWelsDecoderContext pCtx, const int32_t kiMbWidth, const int32_t kiMbHeight) { // (Re)allocate the LIST_0 picture buffer for a picture of kiMbWidth x kiMbHeight macroblocks; returns ERR_NONE on success or when the existing buffers already fit
+  const int32_t kiPicWidth	= kiMbWidth << 4; // macroblock count times 16 gives the size in pixels
+  const int32_t kiPicHeight	= kiMbHeight << 4;
+  int32_t iErr = ERR_NONE;
 
-	int32_t iListIdx			= 0;	//, mb_blocks	= 0;
-	int32_t	iPicQueueSize		= 0;	// adaptive size of picture queue, = (pSps->iNumRefFrames x 2)
-	bool_t  bNeedChangePicQueue	= true;
-	
-	WELS_VERIFY_RETURN_IF( ERR_INFO_INVALID_PARAM, ( NULL == pCtx || kiPicWidth <= 0 || kiPicHeight <= 0 ) )	
+  int32_t iListIdx			= 0;	//, mb_blocks	= 0;
+  int32_t	iPicQueueSize		= 0;	// adaptive size of picture queue, set from GetTargetRefListSize() below
+  bool_t  bNeedChangePicQueue	= true;
 
-	// Fixed the issue about different gop size over last, 5/17/2010
-	// get picture queue size currently
-	iPicQueueSize	= GetTargetRefListSize( pCtx );	// adaptive size of picture queue, = (pSps->iNumRefFrames x 2)
-	pCtx->iPicQueueNumber = iPicQueueSize;
-	if ( pCtx->pPicBuff[LIST_0] != NULL && pCtx->pPicBuff[LIST_0]->iCapacity == iPicQueueSize )	// comparing current picture queue size requested and previous allocation picture queue
-		bNeedChangePicQueue	= false;
-	// HD based pic buffer need consider memory size consumed when switch from 720p to other lower size
-	WELS_VERIFY_RETURN_IF( ERR_NONE, pCtx->bHaveGotMemory && ( kiPicWidth == pCtx->iImgWidthInPixel && kiPicHeight == pCtx->iImgHeightInPixel ) && (!bNeedChangePicQueue) )	// have same scaled buffer
+  WELS_VERIFY_RETURN_IF (ERR_INFO_INVALID_PARAM, (NULL == pCtx || kiPicWidth <= 0 || kiPicHeight <= 0)) // presumably returns the first argument when the condition holds — confirm against the macro definition
 
-	// sync update pRefList
-	WelsResetRefPic( pCtx );	// added to sync update ref list due to pictures are free
-	
-	// for Recycled_Pic_Queue
-	for ( iListIdx = LIST_0; iListIdx < LIST_A; ++ iListIdx )
-	{
- 		PPicBuff *ppPic = &pCtx->pPicBuff[iListIdx];
- 		if ( NULL != ppPic && NULL != *ppPic )
- 		{
- 			DestroyPicBuff( ppPic );			
- 		}
-	}
-	
-	// currently only active for LIST_0 due to have no B frames
-	iErr = CreatePicBuff( pCtx, &pCtx->pPicBuff[LIST_0], iPicQueueSize, kiPicWidth, kiPicHeight );
-	if ( iErr != ERR_NONE )
-		return iErr;	
-	
-	
-	pCtx->iImgWidthInPixel	= kiPicWidth;	// target width of image to be reconstruted while decoding
-	pCtx->iImgHeightInPixel	= kiPicHeight;	// target height of image to be reconstruted while decoding
+  // Fixed the issue about different gop size over last, 5/17/2010
+  // get picture queue size currently
+  iPicQueueSize	= GetTargetRefListSize (pCtx);	// adaptive size of picture queue, as computed by GetTargetRefListSize()
+  pCtx->iPicQueueNumber = iPicQueueSize;
+  if (pCtx->pPicBuff[LIST_0] != NULL
+      && pCtx->pPicBuff[LIST_0]->iCapacity ==
+      iPicQueueSize)	// the existing LIST_0 buffer already has the requested capacity
+    bNeedChangePicQueue	= false;
+  // HD based pic buffer need consider memory size consumed when switch from 720p to other lower size
+  WELS_VERIFY_RETURN_IF (ERR_NONE, pCtx->bHaveGotMemory && (kiPicWidth == pCtx->iImgWidthInPixel
+                         && kiPicHeight == pCtx->iImgHeightInPixel) && (!bNeedChangePicQueue))	// same dimensions and same queue size: keep the existing buffers
 
-	pCtx->bHaveGotMemory	= true;			// global memory for decoder context related is requested
-	pCtx->pDec		        = NULL;			// need prefetch a new pic due to spatial size changed
-	return ERR_NONE;
+  // sync update pRefList
+  WelsResetRefPic (pCtx);	// reset the reference lists because the pictures are about to be freed
+
+  // for Recycled_Pic_Queue
+  for (iListIdx = LIST_0; iListIdx < LIST_A; ++ iListIdx) {
+    PPicBuff* ppPic = &pCtx->pPicBuff[iListIdx];
+    if (NULL != ppPic && NULL != *ppPic) {
+      DestroyPicBuff (ppPic);
+    }
+  }
+
+  // currently only active for LIST_0 due to have no B frames
+  iErr = CreatePicBuff (pCtx, &pCtx->pPicBuff[LIST_0], iPicQueueSize, kiPicWidth, kiPicHeight);
+  if (iErr != ERR_NONE)
+    return iErr; // CreatePicBuff reports failure as the literal 1, so that value is what gets returned here
+
+
+  pCtx->iImgWidthInPixel	= kiPicWidth;	// target width of image to be reconstructed while decoding
+  pCtx->iImgHeightInPixel	= kiPicHeight;	// target height of image to be reconstructed while decoding
+
+  pCtx->bHaveGotMemory	= true;			// global memory for decoder context related is requested
+  pCtx->pDec		        = NULL;			// need prefetch a new pic due to spatial size changed
+  return ERR_NONE;
 }
 
 /*
  *	free memory blocks in avc
  */
-void_t WelsFreeMem( PWelsDecoderContext pCtx )
-{
-	int32_t iListIdx = 0;
-	
-	/* TODO: free memory blocks introduced in avc */
-	ResetFmoList( pCtx );
+void_t WelsFreeMem (PWelsDecoderContext pCtx) { // Release the picture buffers of pCtx and reset the size and memory bookkeeping fields; pCtx is dereferenced without a NULL check
+  int32_t iListIdx = 0;
 
-	WelsResetRefPic( pCtx );
+  /* TODO: free memory blocks introduced in avc */
+  ResetFmoList (pCtx);
 
-	// for sPicBuff
-	for ( iListIdx = LIST_0; iListIdx < LIST_A; ++ iListIdx )
-	{
-		PPicBuff *pPicBuff = &pCtx->pPicBuff[iListIdx];
-		if ( NULL != pPicBuff && NULL != *pPicBuff )
-		{
-			DestroyPicBuff( pPicBuff );			
-		}
-	}	
+  WelsResetRefPic (pCtx); // reset the reference lists before the pictures are destroyed below
 
-	// added for safe memory
-	pCtx->iImgWidthInPixel	= 0;
-	pCtx->iImgHeightInPixel = 0;
-	pCtx->bHaveGotMemory	= false;
-	
+  // for sPicBuff
+  for (iListIdx = LIST_0; iListIdx < LIST_A; ++ iListIdx) {
+    PPicBuff* pPicBuff = &pCtx->pPicBuff[iListIdx];
+    if (NULL != pPicBuff && NULL != *pPicBuff) { // the address of an array element is never NULL, so the second test is the effective one
+      DestroyPicBuff (pPicBuff);
+    }
+  }
+
+  // added for safe memory
+  pCtx->iImgWidthInPixel	= 0;
+  pCtx->iImgHeightInPixel = 0;
+  pCtx->bHaveGotMemory	= false; // WelsRequestMem tests this flag before it reuses existing buffers
+
 }
 
 /*!
- * \brief	Open decoder	
+ * \brief	Open decoder
  */
-void_t WelsOpenDecoder( PWelsDecoderContext pCtx )
-{
-	// function pointers
-	//initial MC function pointer--
-	InitMcFunc(&(pCtx->sMcFunc), pCtx->uiCpuFlag);
+void_t WelsOpenDecoder (PWelsDecoderContext pCtx) {
+  // function pointers
+  //initial MC function pointer--
+  InitMcFunc (& (pCtx->sMcFunc), pCtx->uiCpuFlag);
 
-    InitExpandPictureFunc(&(pCtx->sExpandPicFunc), pCtx->uiCpuFlag);
-	AssignFuncPointerForRec(pCtx);
-	
-	// vlc tables
-	InitVlcTable(&pCtx->sVlcTable);
+  InitExpandPictureFunc (& (pCtx->sExpandPicFunc), pCtx->uiCpuFlag);
+  AssignFuncPointerForRec (pCtx);
 
-	// startup memory
-	if ( ERR_NONE != WelsInitMemory( pCtx ) )
-		return;	
+  // vlc tables
+  InitVlcTable (&pCtx->sVlcTable);
 
-	pCtx->iMaxWidthInSps	= 0;
-	pCtx->iMaxHeightInSps	= 0;
+  // startup memory
+  if (ERR_NONE != WelsInitMemory (pCtx))
+    return;
+
+  pCtx->iMaxWidthInSps	= 0;
+  pCtx->iMaxHeightInSps	= 0;
 #ifdef LONG_TERM_REF
-	pCtx->bParamSetsLostFlag = true;
+  pCtx->bParamSetsLostFlag = true;
 #else
-	pCtx->bReferenceLostAtT0Flag	= true;	// should be true to waiting IDR at incoming AU bits following, 6/4/2010
+  pCtx->bReferenceLostAtT0Flag	= true;	// should be true to waiting IDR at incoming AU bits following, 6/4/2010
 #endif //LONG_TERM_REF
 }
 
 /*!
- * \brief	Close decoder	
+ * \brief	Close decoder
  */
-void_t WelsCloseDecoder( PWelsDecoderContext pCtx )
-{
-	WelsFreeMem( pCtx );
-	
-	WelsFreeMemory( pCtx );
+void_t WelsCloseDecoder (PWelsDecoderContext pCtx) {
+  WelsFreeMem (pCtx);
 
-	UninitialDqLayersContext( pCtx );
+  WelsFreeMemory (pCtx);
 
+  UninitialDqLayersContext (pCtx);
+
 #ifdef LONG_TERM_REF
-	pCtx->bParamSetsLostFlag       = false;
+  pCtx->bParamSetsLostFlag       = false;
 #else
-	pCtx->bReferenceLostAtT0Flag = false;
+  pCtx->bReferenceLostAtT0Flag = false;
 #endif
 }
 
 /*!
- * \brief	configure decoder parameters	
+ * \brief	configure decoder parameters
  */
-int32_t DecoderConfigParam ( PWelsDecoderContext pCtx, const void_t* kpParam )
-{
-	if ( NULL == pCtx || NULL == kpParam )
-		return 1;
+int32_t DecoderConfigParam (PWelsDecoderContext pCtx, const void_t* kpParam) {
+  if (NULL == pCtx || NULL == kpParam)
+    return 1;
 
-	pCtx->pParam	= (SDecodingParam *)WelsMalloc( sizeof(SDecodingParam), "SDecodingParam" );
+  pCtx->pParam	= (SDecodingParam*)WelsMalloc (sizeof (SDecodingParam), "SDecodingParam");
 
-	if ( NULL == pCtx->pParam )
-		return 1;
+  if (NULL == pCtx->pParam)
+    return 1;
 
-	memcpy( pCtx->pParam, kpParam, sizeof(SDecodingParam) );
-	pCtx->iOutputColorFormat	= pCtx->pParam->iOutputColorFormat;
-	pCtx->bErrorResilienceFlag	= pCtx->pParam->uiEcActiveFlag ? true : false;
+  memcpy (pCtx->pParam, kpParam, sizeof (SDecodingParam));
+  pCtx->iOutputColorFormat	= pCtx->pParam->iOutputColorFormat;
+  pCtx->bErrorResilienceFlag	= pCtx->pParam->uiEcActiveFlag ? true : false;
 
-	if ( VIDEO_BITSTREAM_SVC == pCtx->pParam->sVideoProperty.eVideoBsType ||
-		 VIDEO_BITSTREAM_AVC == pCtx->pParam->sVideoProperty.eVideoBsType )
-	{
-		pCtx->eVideoType = pCtx->pParam->sVideoProperty.eVideoBsType;
-	}
-	else
-	{
-		pCtx->eVideoType = VIDEO_BITSTREAM_DEFAULT;
-	}
+  if (VIDEO_BITSTREAM_SVC == pCtx->pParam->sVideoProperty.eVideoBsType ||
+      VIDEO_BITSTREAM_AVC == pCtx->pParam->sVideoProperty.eVideoBsType) {
+    pCtx->eVideoType = pCtx->pParam->sVideoProperty.eVideoBsType;
+  } else {
+    pCtx->eVideoType = VIDEO_BITSTREAM_DEFAULT;
+  }
 
-	WelsLog(pCtx, WELS_LOG_INFO, "eVideoType: %d\n", pCtx->eVideoType);
+  WelsLog (pCtx, WELS_LOG_INFO, "eVideoType: %d\n", pCtx->eVideoType);
 
-	return 0;
+  return 0;
 }
 
-/*! 
+/*!
  *************************************************************************************
- * \brief	Initialize Wels decoder parameters and memory 
+ * \brief	Initialize Wels decoder parameters and memory
  *
- * \param 	pCtx input context to be initialized at first stage 
+ * \param 	pCtx input context to be initialized at first stage
  *
  * \return	0 - successed
  * \return	1 - failed
@@ -453,37 +419,36 @@
  * \note	N/A
  *************************************************************************************
  */
-int32_t WelsInitDecoder( PWelsDecoderContext pCtx, void_t * pTraceHandle, PWelsLogCallbackFunc pLog )
-{
-	if ( pCtx == NULL ){
-		return ERR_INFO_INVALID_PTR;
-	}
+int32_t WelsInitDecoder (PWelsDecoderContext pCtx, void_t* pTraceHandle, PWelsLogCallbackFunc pLog) {
+  if (pCtx == NULL) {
+    return ERR_INFO_INVALID_PTR;
+  }
 
-	// default
-	WelsDecoderDefaults( pCtx );	
+  // default
+  WelsDecoderDefaults (pCtx);
 
-	pCtx->pTraceHandle = pTraceHandle;
+  pCtx->pTraceHandle = pTraceHandle;
 
-	g_pLog = pLog;
+  g_pLog = pLog;
 
-	// open decoder
-	WelsOpenDecoder( pCtx );
-	
-	// decode mode setting 
-	pCtx->iDecoderMode = SW_MODE;
-	pCtx->iSetMode = AUTO_MODE;
-	pCtx->iDecoderOutputProperty = BUFFER_HOST;
-	pCtx->iModeSwitchType = 0; // 0: do not do mode switch
+  // open decoder
+  WelsOpenDecoder (pCtx);
 
+  // decode mode setting
+  pCtx->iDecoderMode = SW_MODE;
+  pCtx->iSetMode = AUTO_MODE;
+  pCtx->iDecoderOutputProperty = BUFFER_HOST;
+  pCtx->iModeSwitchType = 0; // 0: do not do mode switch
 
-	return ERR_NONE;
+
+  return ERR_NONE;
 }
 
-/*! 
+/*!
  *************************************************************************************
  * \brief	Uninitialize Wels decoder parameters and memory
  *
- * \param 	pCtx input context to be uninitialized at release stage 
+ * \param 	pCtx input context to be uninitialized at release stage
  *
  * \return	NONE
  *
@@ -490,22 +455,20 @@
  * \note	N/A
  *************************************************************************************
  */
-void_t WelsEndDecoder( PWelsDecoderContext pCtx )
-{
-	// close decoder
-	WelsCloseDecoder( pCtx );	
+void_t WelsEndDecoder (PWelsDecoderContext pCtx) {
+  // close decoder
+  WelsCloseDecoder (pCtx);
 }
 
-void_t GetVclNalTemporalId( PWelsDecoderContext pCtx )
-{
-	PAccessUnit pAccessUnit = pCtx->pAccessUnitList;
-	int32_t idx = pAccessUnit->uiStartPos;
+void_t GetVclNalTemporalId (PWelsDecoderContext pCtx) {
+  PAccessUnit pAccessUnit = pCtx->pAccessUnitList;
+  int32_t idx = pAccessUnit->uiStartPos;
 
-	pCtx->iFeedbackVclNalInAu = FEEDBACK_VCL_NAL;
-	pCtx->iFeedbackTidInAu    = pAccessUnit->pNalUnitsList[idx]->sNalHeaderExt.uiTemporalId;
+  pCtx->iFeedbackVclNalInAu = FEEDBACK_VCL_NAL;
+  pCtx->iFeedbackTidInAu    = pAccessUnit->pNalUnitsList[idx]->sNalHeaderExt.uiTemporalId;
 }
 
-/*! 
+/*!
  *************************************************************************************
  * \brief	First entrance to decoding core interface.
  *
@@ -521,210 +484,183 @@
  * \note	N/A
  *************************************************************************************
  */
-int32_t WelsDecodeBs( PWelsDecoderContext pCtx, const uint8_t *kpBsBuf, const int32_t kiBsLen, 
-			   uint8_t **ppDst, SBufferInfo* pDstBufInfo)
-{	
-	if ( !pCtx->bEndOfStreamFlag)
-	{
-		SDataBuffer* pRawData   = &pCtx->sRawData;
+int32_t WelsDecodeBs (PWelsDecoderContext pCtx, const uint8_t* kpBsBuf, const int32_t kiBsLen,
+                      uint8_t** ppDst, SBufferInfo* pDstBufInfo) {
+  if (!pCtx->bEndOfStreamFlag) {
+    SDataBuffer* pRawData   = &pCtx->sRawData;
 
-		int32_t iSrcIdx        = 0; //the index of source bit-stream till now after parsing one or more NALs
-		int32_t iSrcConsumed   = 0; // consumed bit count of source bs
-		int32_t iDstIdx        = 0; //the size of current NAL after 0x03 removal and 00 00 01 removal
-		int32_t iSrcLength     = 0;	//the total size of current AU or NAL
+    int32_t iSrcIdx        = 0; //the index of source bit-stream till now after parsing one or more NALs
+    int32_t iSrcConsumed   = 0; // consumed byte count of source bitstream
+    int32_t iDstIdx        = 0; //the size of current NAL after 0x03 removal and 00 00 01 removal
+    int32_t iSrcLength     = 0;	//the total size of current AU or NAL
 
-		int32_t iConsumedBytes = 0;	
-		int32_t iOffset        = 0;	
+    int32_t iConsumedBytes = 0;
+    int32_t iOffset        = 0;
 
-		uint8_t* pSrcNal       = NULL;
-		uint8_t* pDstNal       = NULL;
-		uint8_t *pNalPayload   = NULL;	
-		
-		
-		if ( NULL == DetectStartCodePrefix( kpBsBuf, &iOffset, kiBsLen ) ) //CAN'T find the 00 00 01 start prefix from the source buffer
-		{
-			return dsBitstreamError;
-		}
+    uint8_t* pSrcNal       = NULL;
+    uint8_t* pDstNal       = NULL;
+    uint8_t* pNalPayload   = NULL;
 
-		pSrcNal    = const_cast<uint8_t*> (kpBsBuf) + iOffset;
-		iSrcLength = kiBsLen - iOffset;
 
-		if ( (kiBsLen + 4) > ( pRawData->pEnd - pRawData->pCurPos ) )
-		{
-			pRawData->pCurPos = pRawData->pHead;
-		}
+    if (NULL == DetectStartCodePrefix (kpBsBuf, &iOffset,
+                                       kiBsLen)) {  //CAN'T find the 00 00 01 start prefix from the source buffer
+      return dsBitstreamError;
+    }
 
+    pSrcNal    = const_cast<uint8_t*> (kpBsBuf) + iOffset;
+    iSrcLength = kiBsLen - iOffset;
 
-		//copy raw data from source buffer (application) to raw data buffer (codec inside)
-		//0x03 removal and extract all of NAL Unit from current raw data
-		pDstNal = pRawData->pCurPos + 4; //4-bytes used to write the length of current NAL rbsp
+    if ((kiBsLen + 4) > (pRawData->pEnd - pRawData->pCurPos)) {
+      pRawData->pCurPos = pRawData->pHead;
+    }
 
-		while ( iSrcConsumed < iSrcLength )
-		{
-			if ( ( 2 + iSrcConsumed < iSrcLength ) && 
-				( 0 == LD16(pSrcNal+iSrcIdx) ) &&
-				( (pSrcNal[2+iSrcIdx]==0x03) || (pSrcNal[2+iSrcIdx]==0x01) ) )
-			{
-				if ( pSrcNal[2+iSrcIdx] == 0x03 )
-				{
-					ST16(pDstNal+iDstIdx, 0);
-					iDstIdx	+= 2;
-					iSrcIdx	+= 3;	
-					iSrcConsumed += 3;
-				}
-				else
-				{
-					GetValueOf4Bytes( pDstNal-4, iDstIdx );   //pDstNal-4 (non-aligned by 4) in Solaris10(SPARC). Given value by byte.
 
-					iConsumedBytes = 0;
-					pNalPayload	= ParseNalHeader( pCtx, &pCtx->sCurNalHead, pDstNal, iDstIdx, pSrcNal-3, iSrcIdx+3, &iConsumedBytes );
-					
-					if (pCtx->bAuReadyFlag)
-					{	
-						ConstructAccessUnit( pCtx, ppDst, pDstBufInfo );	
+    //copy raw data from source buffer (application) to raw data buffer (codec inside)
+    //0x03 removal and extract all of NAL Unit from current raw data
+    pDstNal = pRawData->pCurPos + 4; //4-bytes used to write the length of current NAL rbsp
 
-						if ( (dsOutOfMemory | dsNoParamSets) & pCtx->iErrorCode)
-						{							
+    while (iSrcConsumed < iSrcLength) {
+      if ((2 + iSrcConsumed < iSrcLength) &&
+          (0 == LD16 (pSrcNal + iSrcIdx)) &&
+          ((pSrcNal[2 + iSrcIdx] == 0x03) || (pSrcNal[2 + iSrcIdx] == 0x01))) {
+        if (pSrcNal[2 + iSrcIdx] == 0x03) {
+          ST16 (pDstNal + iDstIdx, 0);
+          iDstIdx	+= 2;
+          iSrcIdx	+= 3;
+          iSrcConsumed += 3;
+        } else {
+          GetValueOf4Bytes (pDstNal - 4, iDstIdx);  //pDstNal-4 (non-aligned by 4) in Solaris10(SPARC). Given value by byte.
+
+          iConsumedBytes = 0;
+          pNalPayload	= ParseNalHeader (pCtx, &pCtx->sCurNalHead, pDstNal, iDstIdx, pSrcNal - 3, iSrcIdx + 3, &iConsumedBytes);
+
+          if (pCtx->bAuReadyFlag) {
+            ConstructAccessUnit (pCtx, ppDst, pDstBufInfo);
+
+            if ((dsOutOfMemory | dsNoParamSets) & pCtx->iErrorCode) {
 #ifdef LONG_TERM_REF
-							pCtx->bParamSetsLostFlag = true;
+              pCtx->bParamSetsLostFlag = true;
 #else
-							pCtx->bReferenceLostAtT0Flag = true;
+              pCtx->bReferenceLostAtT0Flag = true;
 #endif
-							ResetParameterSetsState( pCtx );
+              ResetParameterSetsState (pCtx);
 
-                            if( dsOutOfMemory & pCtx->iErrorCode){
-  							   return pCtx->iErrorCode;
-                            }
-						}
-					}
-					
-					if( (IS_PARAM_SETS_NALS(pCtx->sCurNalHead.eNalUnitType) || IS_SEI_NAL(pCtx->sCurNalHead.eNalUnitType)) &&
-						pNalPayload )
-					{	
-						if ( ParseNonVclNal( pCtx, pNalPayload, iDstIdx-iConsumedBytes ) )
-						{
-							if ( dsNoParamSets & pCtx->iErrorCode )
-							{
+              if (dsOutOfMemory & pCtx->iErrorCode) {
+                return pCtx->iErrorCode;
+              }
+            }
+          }
+
+          if ((IS_PARAM_SETS_NALS (pCtx->sCurNalHead.eNalUnitType) || IS_SEI_NAL (pCtx->sCurNalHead.eNalUnitType)) &&
+              pNalPayload) {
+            if (ParseNonVclNal (pCtx, pNalPayload, iDstIdx - iConsumedBytes)) {
+              if (dsNoParamSets & pCtx->iErrorCode) {
 #ifdef LONG_TERM_REF
-								pCtx->bParamSetsLostFlag = true;
+                pCtx->bParamSetsLostFlag = true;
 #else
-								pCtx->bReferenceLostAtT0Flag = true;
+                pCtx->bReferenceLostAtT0Flag = true;
 #endif
-								ResetParameterSetsState( pCtx );
-							}
-							return pCtx->iErrorCode;
-						}
-					}
+                ResetParameterSetsState (pCtx);
+              }
+              return pCtx->iErrorCode;
+            }
+          }
 
-					pDstNal += iDstIdx; //update current position
-					if ( (iSrcLength - iSrcConsumed + 4) > (pRawData->pEnd - pDstNal) )
-					{
-						pRawData->pCurPos = pRawData->pHead;
-					}
-					else
-					{
-						pRawData->pCurPos = pDstNal;
-					}
-					pDstNal = pRawData->pCurPos + 4; //init, 4 bytes used to store the next NAL
+          pDstNal += iDstIdx; //update current position
+          if ((iSrcLength - iSrcConsumed + 4) > (pRawData->pEnd - pDstNal)) {
+            pRawData->pCurPos = pRawData->pHead;
+          } else {
+            pRawData->pCurPos = pDstNal;
+          }
+          pDstNal = pRawData->pCurPos + 4; //init, 4 bytes used to store the next NAL
 
-					pSrcNal += iSrcIdx+3;
-					iSrcConsumed += 3;						
-					iSrcIdx = 0;	
-					iDstIdx  = 0; //reset 0, used to statistic the length of next NAL					
-				}
-				continue;
-			}
-			pDstNal[iDstIdx++] = pSrcNal[iSrcIdx++];
-			iSrcConsumed++;
-		}
-		
-		//last NAL decoding
-		GetValueOf4Bytes( pDstNal-4, iDstIdx ); //pDstNal-4 (non-aligned by 4) in Solaris10(SPARC). Given value by byte.
+          pSrcNal += iSrcIdx + 3;
+          iSrcConsumed += 3;
+          iSrcIdx = 0;
+          iDstIdx  = 0; //reset to 0, used to count the length of the next NAL
+        }
+        continue;
+      }
+      pDstNal[iDstIdx++] = pSrcNal[iSrcIdx++];
+      iSrcConsumed++;
+    }
 
-		iConsumedBytes = 0;
-		pNalPayload = ParseNalHeader( pCtx, &pCtx->sCurNalHead, pDstNal, iDstIdx, pSrcNal-3, iSrcIdx+3, &iConsumedBytes );
+    //last NAL decoding
+    GetValueOf4Bytes (pDstNal - 4, iDstIdx); //pDstNal-4 (non-aligned by 4) in Solaris10(SPARC). Given value by byte.
 
-		if (pCtx->bAuReadyFlag)
-		{	
-			ConstructAccessUnit( pCtx, ppDst, pDstBufInfo );
+    iConsumedBytes = 0;
+    pNalPayload = ParseNalHeader (pCtx, &pCtx->sCurNalHead, pDstNal, iDstIdx, pSrcNal - 3, iSrcIdx + 3, &iConsumedBytes);
 
-			if ( (dsOutOfMemory | dsNoParamSets) & pCtx->iErrorCode)
-			{				
+    if (pCtx->bAuReadyFlag) {
+      ConstructAccessUnit (pCtx, ppDst, pDstBufInfo);
+
+      if ((dsOutOfMemory | dsNoParamSets) & pCtx->iErrorCode) {
 #ifdef LONG_TERM_REF
-				pCtx->bParamSetsLostFlag = true;
+        pCtx->bParamSetsLostFlag = true;
 #else
-				pCtx->bReferenceLostAtT0Flag = true;
+        pCtx->bReferenceLostAtT0Flag = true;
 #endif
-				ResetParameterSetsState( pCtx );
-				return pCtx->iErrorCode;
-			}			
-		}
+        ResetParameterSetsState (pCtx);
+        return pCtx->iErrorCode;
+      }
+    }
 
-		if( (IS_PARAM_SETS_NALS(pCtx->sCurNalHead.eNalUnitType) || IS_SEI_NAL(pCtx->sCurNalHead.eNalUnitType)) && pNalPayload )
-		{
-			if ( ParseNonVclNal( pCtx, pNalPayload, iDstIdx-iConsumedBytes ) )
-			{
-				if ( dsNoParamSets & pCtx->iErrorCode )
-				{
+    if ((IS_PARAM_SETS_NALS (pCtx->sCurNalHead.eNalUnitType) || IS_SEI_NAL (pCtx->sCurNalHead.eNalUnitType))
+        && pNalPayload) {
+      if (ParseNonVclNal (pCtx, pNalPayload, iDstIdx - iConsumedBytes)) {
+        if (dsNoParamSets & pCtx->iErrorCode) {
 #ifdef LONG_TERM_REF
-					pCtx->bParamSetsLostFlag = true;
+          pCtx->bParamSetsLostFlag = true;
 #else
-					pCtx->bReferenceLostAtT0Flag = true;
+          pCtx->bReferenceLostAtT0Flag = true;
 #endif
-					ResetParameterSetsState( pCtx );
-				}
-				return pCtx->iErrorCode;
-			}
-		}	
+          ResetParameterSetsState (pCtx);
+        }
+        return pCtx->iErrorCode;
+      }
+    }
 
-		pDstNal += iDstIdx;
-		pRawData->pCurPos = pDstNal; //init the pCurPos for next NAL(s) storage
-	}	
-	else  /* no supplementary picture payload input, but stored a picture */
-	{
-		PAccessUnit pCurAu	= pCtx->pAccessUnitList;	// current access unit, it will never point to NULL after decode's successful initialization
-		
-		if ( pCurAu->uiAvailUnitsNum == 0 )
-		{
-			return pCtx->iErrorCode;
-		}
-		else
-		{			
-			pCtx->pAccessUnitList->uiEndPos = pCtx->pAccessUnitList->uiAvailUnitsNum - 1;
-			
-			ConstructAccessUnit( pCtx, ppDst, pDstBufInfo );
+    pDstNal += iDstIdx;
+    pRawData->pCurPos = pDstNal; //init the pCurPos for next NAL(s) storage
+  } else { /* no supplementary picture payload input, but stored a picture */
+    PAccessUnit pCurAu	=
+      pCtx->pAccessUnitList;	// current access unit, it will never point to NULL after decode's successful initialization
 
-			if ( (dsOutOfMemory | dsNoParamSets) & pCtx->iErrorCode)
-			{				
+    if (pCurAu->uiAvailUnitsNum == 0) {
+      return pCtx->iErrorCode;
+    } else {
+      pCtx->pAccessUnitList->uiEndPos = pCtx->pAccessUnitList->uiAvailUnitsNum - 1;
+
+      ConstructAccessUnit (pCtx, ppDst, pDstBufInfo);
+
+      if ((dsOutOfMemory | dsNoParamSets) & pCtx->iErrorCode) {
 #ifdef LONG_TERM_REF
-				pCtx->bParamSetsLostFlag = true;
+        pCtx->bParamSetsLostFlag = true;
 #else
-				pCtx->bReferenceLostAtT0Flag = true;
+        pCtx->bReferenceLostAtT0Flag = true;
 #endif
-				ResetParameterSetsState( pCtx );
-				return pCtx->iErrorCode;
-			}
-			
-		}
-	}
+        ResetParameterSetsState (pCtx);
+        return pCtx->iErrorCode;
+      }
 
-	return pCtx->iErrorCode;
+    }
+  }
+
+  return pCtx->iErrorCode;
 }
 
 /*
  * set colorspace format in decoder
  */
-int32_t DecoderSetCsp(PWelsDecoderContext pCtx, const int32_t kiColorFormat)
-{
-	WELS_VERIFY_RETURN_IF( 1, (NULL == pCtx) );
+int32_t DecoderSetCsp (PWelsDecoderContext pCtx, const int32_t kiColorFormat) {
+  WELS_VERIFY_RETURN_IF (1, (NULL == pCtx));
 
-	pCtx->iOutputColorFormat	= kiColorFormat;
-	if ( pCtx->pParam != NULL )
-	{
-		pCtx->pParam->iOutputColorFormat	= kiColorFormat;
-	}
+  pCtx->iOutputColorFormat	= kiColorFormat;
+  if (pCtx->pParam != NULL) {
+    pCtx->pParam->iOutputColorFormat	= kiColorFormat;
+  }
 
-	return 0;
+  return 0;
 }
 
 /*!
@@ -733,118 +669,110 @@
  * ( MB coordinate and parts of data within decoder context structure )
  * \param	pCtx		Wels decoder context
  * \param	iMbWidth	MB width
- * \pram	iMbHeight	MB height 
+ * \param	iMbHeight	MB height
  * \return	0 - successful; none 0 - something wrong
  */
-int32_t SyncPictureResolutionExt( PWelsDecoderContext pCtx, const int32_t kiMbWidth, const int32_t kiMbHeight )
-{
-	int32_t iErr = ERR_NONE;
-	const int32_t kiPicWidth	= kiMbWidth << 4;
-	const int32_t kiPicHeight   = kiMbHeight<< 4;
-	
-	iErr = WelsRequestMem( pCtx, kiMbWidth, kiMbHeight );	// common memory used
-	if ( ERR_NONE != iErr )
-	{
-		WelsLog( pCtx, WELS_LOG_WARNING, "SyncPictureResolutionExt()::WelsRequestMem--buffer allocated failure.\n" );
-		pCtx->iErrorCode = dsOutOfMemory;
-		return iErr;	
-	}
+int32_t SyncPictureResolutionExt (PWelsDecoderContext pCtx, const int32_t kiMbWidth, const int32_t kiMbHeight) {
+  int32_t iErr = ERR_NONE;
+  const int32_t kiPicWidth	= kiMbWidth << 4;
+  const int32_t kiPicHeight   = kiMbHeight << 4;
 
-	iErr = InitialDqLayersContext( pCtx, kiPicWidth, kiPicHeight );
-	if ( ERR_NONE != iErr )
-	{
-		WelsLog( pCtx, WELS_LOG_WARNING, "SyncPictureResolutionExt()::InitialDqLayersContext--buffer allocated failure.\n" );
-		pCtx->iErrorCode = dsOutOfMemory;
-	}	
+  iErr = WelsRequestMem (pCtx, kiMbWidth, kiMbHeight);	// common memory used
+  if (ERR_NONE != iErr) {
+    WelsLog (pCtx, WELS_LOG_WARNING, "SyncPictureResolutionExt()::WelsRequestMem--buffer allocated failure.\n");
+    pCtx->iErrorCode = dsOutOfMemory;
+    return iErr;
+  }
 
-	return iErr;
+  iErr = InitialDqLayersContext (pCtx, kiPicWidth, kiPicHeight);
+  if (ERR_NONE != iErr) {
+    WelsLog (pCtx, WELS_LOG_WARNING, "SyncPictureResolutionExt()::InitialDqLayersContext--buffer allocated failure.\n");
+    pCtx->iErrorCode = dsOutOfMemory;
+  }
+
+  return iErr;
 }
 
 /*!
  * \brief	update maximal picture width and height if applicable when receiving a SPS NAL
  */
-void_t UpdateMaxPictureResolution( PWelsDecoderContext pCtx, const int32_t kiCurWidth, const int32_t kiCurHeight )
-{
-	//any dimension larger than that of current dimension, should modify the max-dimension
-	if ( kiCurWidth > pCtx->iMaxWidthInSps || kiCurHeight > pCtx->iMaxHeightInSps)		
-	{
-		pCtx->iMaxWidthInSps	= kiCurWidth;
-		pCtx->iMaxHeightInSps	= kiCurHeight;
-	}
+void_t UpdateMaxPictureResolution (PWelsDecoderContext pCtx, const int32_t kiCurWidth, const int32_t kiCurHeight) {
+  // if either dimension exceeds the current maximum, update both max dimensions
+  if (kiCurWidth > pCtx->iMaxWidthInSps || kiCurHeight > pCtx->iMaxHeightInSps) {
+    pCtx->iMaxWidthInSps	= kiCurWidth;
+    pCtx->iMaxHeightInSps	= kiCurHeight;
+  }
 
-	return;
+  return;
 }
 
-void_t AssignFuncPointerForRec(PWelsDecoderContext pCtx)
-{
-	pCtx->pGetI16x16LumaPredFunc[I16_PRED_V     ] = WelsI16x16LumaPredV_c;
-	pCtx->pGetI16x16LumaPredFunc[I16_PRED_H     ] = WelsI16x16LumaPredH_c;
-	pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC    ] = WelsI16x16LumaPredDc_c;
-	pCtx->pGetI16x16LumaPredFunc[I16_PRED_P     ] = WelsI16x16LumaPredPlane_c;
-	pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC_L  ] = WelsI16x16LumaPredDcLeft_c;
-	pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC_T  ] = WelsI16x16LumaPredDcTop_c;
-	pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC_128] = WelsI16x16LumaPredDcNA_c;
+void_t AssignFuncPointerForRec (PWelsDecoderContext pCtx) {
+  pCtx->pGetI16x16LumaPredFunc[I16_PRED_V     ] = WelsI16x16LumaPredV_c;
+  pCtx->pGetI16x16LumaPredFunc[I16_PRED_H     ] = WelsI16x16LumaPredH_c;
+  pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC    ] = WelsI16x16LumaPredDc_c;
+  pCtx->pGetI16x16LumaPredFunc[I16_PRED_P     ] = WelsI16x16LumaPredPlane_c;
+  pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC_L  ] = WelsI16x16LumaPredDcLeft_c;
+  pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC_T  ] = WelsI16x16LumaPredDcTop_c;
+  pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC_128] = WelsI16x16LumaPredDcNA_c;
 
-	pCtx->pGetI4x4LumaPredFunc[I4_PRED_V     ] = WelsI4x4LumaPredV_c;
-	pCtx->pGetI4x4LumaPredFunc[I4_PRED_H     ] = WelsI4x4LumaPredH_c;
-	pCtx->pGetI4x4LumaPredFunc[I4_PRED_DC    ] = WelsI4x4LumaPredDc_c;
-	pCtx->pGetI4x4LumaPredFunc[I4_PRED_DC_L  ] = WelsI4x4LumaPredDcLeft_c;
-	pCtx->pGetI4x4LumaPredFunc[I4_PRED_DC_T  ] = WelsI4x4LumaPredDcTop_c;
-	pCtx->pGetI4x4LumaPredFunc[I4_PRED_DC_128] = WelsI4x4LumaPredDcNA_c;
-	pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDL    ] = WelsI4x4LumaPredDDL_c;
-	pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDL_TOP] = WelsI4x4LumaPredDDLTop_c;
-	pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDR    ] = WelsI4x4LumaPredDDR_c;
-	pCtx->pGetI4x4LumaPredFunc[I4_PRED_VL    ] = WelsI4x4LumaPredVL_c;
-	pCtx->pGetI4x4LumaPredFunc[I4_PRED_VL_TOP] = WelsI4x4LumaPredVLTop_c;
-	pCtx->pGetI4x4LumaPredFunc[I4_PRED_VR    ] = WelsI4x4LumaPredVR_c;
-	pCtx->pGetI4x4LumaPredFunc[I4_PRED_HU    ] = WelsI4x4LumaPredHU_c;
-	pCtx->pGetI4x4LumaPredFunc[I4_PRED_HD    ] = WelsI4x4LumaPredHD_c;
-		
-	pCtx->pGetIChromaPredFunc[C_PRED_DC    ] = WelsIChromaPredDc_c;
-	pCtx->pGetIChromaPredFunc[C_PRED_H     ] = WelsIChromaPredH_c;
-	pCtx->pGetIChromaPredFunc[C_PRED_V     ] = WelsIChromaPredV_c;
-	pCtx->pGetIChromaPredFunc[C_PRED_P     ] = WelsIChromaPredPlane_c;
-	pCtx->pGetIChromaPredFunc[C_PRED_DC_L  ] = WelsIChromaPredDcLeft_c;
-	pCtx->pGetIChromaPredFunc[C_PRED_DC_T  ] = WelsIChromaPredDcTop_c;
-	pCtx->pGetIChromaPredFunc[C_PRED_DC_128] = WelsIChromaPredDcNA_c;
+  pCtx->pGetI4x4LumaPredFunc[I4_PRED_V     ] = WelsI4x4LumaPredV_c;
+  pCtx->pGetI4x4LumaPredFunc[I4_PRED_H     ] = WelsI4x4LumaPredH_c;
+  pCtx->pGetI4x4LumaPredFunc[I4_PRED_DC    ] = WelsI4x4LumaPredDc_c;
+  pCtx->pGetI4x4LumaPredFunc[I4_PRED_DC_L  ] = WelsI4x4LumaPredDcLeft_c;
+  pCtx->pGetI4x4LumaPredFunc[I4_PRED_DC_T  ] = WelsI4x4LumaPredDcTop_c;
+  pCtx->pGetI4x4LumaPredFunc[I4_PRED_DC_128] = WelsI4x4LumaPredDcNA_c;
+  pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDL    ] = WelsI4x4LumaPredDDL_c;
+  pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDL_TOP] = WelsI4x4LumaPredDDLTop_c;
+  pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDR    ] = WelsI4x4LumaPredDDR_c;
+  pCtx->pGetI4x4LumaPredFunc[I4_PRED_VL    ] = WelsI4x4LumaPredVL_c;
+  pCtx->pGetI4x4LumaPredFunc[I4_PRED_VL_TOP] = WelsI4x4LumaPredVLTop_c;
+  pCtx->pGetI4x4LumaPredFunc[I4_PRED_VR    ] = WelsI4x4LumaPredVR_c;
+  pCtx->pGetI4x4LumaPredFunc[I4_PRED_HU    ] = WelsI4x4LumaPredHU_c;
+  pCtx->pGetI4x4LumaPredFunc[I4_PRED_HD    ] = WelsI4x4LumaPredHD_c;
 
-	InitDctClipTable();
-	pCtx->pIdctResAddPredFunc	= IdctResAddPred_c;
+  pCtx->pGetIChromaPredFunc[C_PRED_DC    ] = WelsIChromaPredDc_c;
+  pCtx->pGetIChromaPredFunc[C_PRED_H     ] = WelsIChromaPredH_c;
+  pCtx->pGetIChromaPredFunc[C_PRED_V     ] = WelsIChromaPredV_c;
+  pCtx->pGetIChromaPredFunc[C_PRED_P     ] = WelsIChromaPredPlane_c;
+  pCtx->pGetIChromaPredFunc[C_PRED_DC_L  ] = WelsIChromaPredDcLeft_c;
+  pCtx->pGetIChromaPredFunc[C_PRED_DC_T  ] = WelsIChromaPredDcTop_c;
+  pCtx->pGetIChromaPredFunc[C_PRED_DC_128] = WelsIChromaPredDcNA_c;
 
+  InitDctClipTable();
+  pCtx->pIdctResAddPredFunc	= IdctResAddPred_c;
+
 #if defined(X86_ASM)
-	if ( pCtx->uiCpuFlag & WELS_CPU_MMXEXT )
-	{		
-		pCtx->pIdctResAddPredFunc	= IdctResAddPred_mmx;	
+  if (pCtx->uiCpuFlag & WELS_CPU_MMXEXT) {
+    pCtx->pIdctResAddPredFunc	= IdctResAddPred_mmx;
 
-		/////////mmx code opt---
-		pCtx->pGetIChromaPredFunc[C_PRED_H]      = WelsIChromaPredH_mmx;
-		pCtx->pGetIChromaPredFunc[C_PRED_V]      = WelsIChromaPredV_mmx;
-		pCtx->pGetIChromaPredFunc[C_PRED_DC_L  ] = WelsIChromaPredDcLeft_mmx;		
-		pCtx->pGetIChromaPredFunc[C_PRED_DC_128] = WelsIChromaPredDcNA_mmx;
-		pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDR]  = WelsI4x4LumaPredDDR_mmx;
-		pCtx->pGetI4x4LumaPredFunc[I4_PRED_HD ]  = WelsI4x4LumaPredHD_mmx;
-		pCtx->pGetI4x4LumaPredFunc[I4_PRED_HU ]  = WelsI4x4LumaPredHU_mmx;
-		pCtx->pGetI4x4LumaPredFunc[I4_PRED_VR ]  = WelsI4x4LumaPredVR_mmx;
-		pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDL]  = WelsI4x4LumaPredDDL_mmx;
-		pCtx->pGetI4x4LumaPredFunc[I4_PRED_VL ]  = WelsI4x4LumaPredVL_mmx;
-	}
-	if ( pCtx->uiCpuFlag & WELS_CPU_SSE2 )
-	{
-		/////////sse2 code opt---
-		pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsI16x16LumaPredDc_sse2;
-		pCtx->pGetI16x16LumaPredFunc[I16_PRED_P]  = WelsI16x16LumaPredPlane_sse2;
-		pCtx->pGetI16x16LumaPredFunc[I16_PRED_H]  = WelsI16x16LumaPredH_sse2;
-		pCtx->pGetI16x16LumaPredFunc[I16_PRED_V]  = WelsI16x16LumaPredV_sse2;
-		pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC_T  ] = WelsI16x16LumaPredDcTop_sse2;
-		pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC_128] = WelsI16x16LumaPredDcNA_sse2;
-		pCtx->pGetIChromaPredFunc[C_PRED_P ]      = WelsIChromaPredPlane_sse2;
-		pCtx->pGetIChromaPredFunc[C_PRED_DC]      = WelsIChromaPredDc_sse2;
-		pCtx->pGetIChromaPredFunc[C_PRED_DC_T]    = WelsIChromaPredDcTop_sse2;
-	}
+    /////////mmx code opt---
+    pCtx->pGetIChromaPredFunc[C_PRED_H]      = WelsIChromaPredH_mmx;
+    pCtx->pGetIChromaPredFunc[C_PRED_V]      = WelsIChromaPredV_mmx;
+    pCtx->pGetIChromaPredFunc[C_PRED_DC_L  ] = WelsIChromaPredDcLeft_mmx;
+    pCtx->pGetIChromaPredFunc[C_PRED_DC_128] = WelsIChromaPredDcNA_mmx;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDR]  = WelsI4x4LumaPredDDR_mmx;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_HD ]  = WelsI4x4LumaPredHD_mmx;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_HU ]  = WelsI4x4LumaPredHU_mmx;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_VR ]  = WelsI4x4LumaPredVR_mmx;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDL]  = WelsI4x4LumaPredDDL_mmx;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_VL ]  = WelsI4x4LumaPredVL_mmx;
+  }
+  if (pCtx->uiCpuFlag & WELS_CPU_SSE2) {
+    /////////sse2 code opt---
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsI16x16LumaPredDc_sse2;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_P]  = WelsI16x16LumaPredPlane_sse2;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_H]  = WelsI16x16LumaPredH_sse2;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_V]  = WelsI16x16LumaPredV_sse2;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC_T  ] = WelsI16x16LumaPredDcTop_sse2;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC_128] = WelsI16x16LumaPredDcNA_sse2;
+    pCtx->pGetIChromaPredFunc[C_PRED_P ]      = WelsIChromaPredPlane_sse2;
+    pCtx->pGetIChromaPredFunc[C_PRED_DC]      = WelsIChromaPredDc_sse2;
+    pCtx->pGetIChromaPredFunc[C_PRED_DC_T]    = WelsIChromaPredDcTop_sse2;
+  }
 #endif
-	DeblockingInit(&pCtx->sDeblockingFunc, pCtx->uiCpuFlag);
+  DeblockingInit (&pCtx->sDeblockingFunc, pCtx->uiCpuFlag);
 
-	WelsBlockFuncInit(&pCtx->sBlockFunc, pCtx->uiCpuFlag);
+  WelsBlockFuncInit (&pCtx->sBlockFunc, pCtx->uiCpuFlag);
 }
 
 } // namespace WelsDec
\ No newline at end of file
--- a/codec/decoder/core/src/decoder_core.cpp
+++ b/codec/decoder/core/src/decoder_core.cpp
@@ -59,242 +59,232 @@
 
 namespace WelsDec {
 
-static inline int32_t DecodeFrameConstruction( PWelsDecoderContext pCtx, uint8_t **ppDst, int32_t *pDstLen, int32_t *pWidth, int32_t *pHeight, SBufferInfo *pDstInfo )
-{
-	PDqLayer pCurDq = pCtx->pCurDqLayer;	
-	PPicture pPic = pCtx->pDec;
+static inline int32_t DecodeFrameConstruction (PWelsDecoderContext pCtx, uint8_t** ppDst, int32_t* pDstLen,
+    int32_t* pWidth, int32_t* pHeight, SBufferInfo* pDstInfo) {
+  PDqLayer pCurDq = pCtx->pCurDqLayer;
+  PPicture pPic = pCtx->pDec;
 
-	const int32_t kiWidth = pCurDq->iMbWidth << 4;
-	const int32_t kiHeight= pCurDq->iMbHeight << 4;
+  const int32_t kiWidth = pCurDq->iMbWidth << 4;
+  const int32_t kiHeight = pCurDq->iMbHeight << 4;
 
-	const int32_t kiTotalNumMbInCurLayer = pCurDq->iMbWidth * pCurDq->iMbHeight;
+  const int32_t kiTotalNumMbInCurLayer = pCurDq->iMbWidth * pCurDq->iMbHeight;
 
-	if ( pPic->iTotalNumMbRec != kiTotalNumMbInCurLayer )
-	{
-		WelsLog( pCtx, WELS_LOG_WARNING, "DecodeFrameConstruction():::iTotalNumMbRec:%d, total_num_mb_sps:%d, cur_layer_mb_width:%d, cur_layer_mb_height:%d \n",
-			pPic->iTotalNumMbRec, kiTotalNumMbInCurLayer, pCurDq->iMbWidth, pCurDq->iMbHeight );
-		return -1;
-	}
+  if (pPic->iTotalNumMbRec != kiTotalNumMbInCurLayer) {
+    WelsLog (pCtx, WELS_LOG_WARNING,
+             "DecodeFrameConstruction():::iTotalNumMbRec:%d, total_num_mb_sps:%d, cur_layer_mb_width:%d, cur_layer_mb_height:%d \n",
+             pPic->iTotalNumMbRec, kiTotalNumMbInCurLayer, pCurDq->iMbWidth, pCurDq->iMbHeight);
+    return -1;
+  }
 #ifdef NO_WAITING_AU
-    pPic->iTotalNumMbRec = 0;
+  pPic->iTotalNumMbRec = 0;
 #endif
 
-	if ( I_SLICE == pCurDq->sLayerInfo.sSliceInLayer.eSliceType )
-	{
-		memcpy( &(pCtx->sFrameCrop), &(pCurDq->sLayerInfo.sSliceInLayer.sSliceHeaderExt.sSliceHeader.pSps->sFrameCrop), sizeof(SPosOffset) );//confirmed_safe_unsafe_usage
+  if (I_SLICE == pCurDq->sLayerInfo.sSliceInLayer.eSliceType) {
+    memcpy (& (pCtx->sFrameCrop), & (pCurDq->sLayerInfo.sSliceInLayer.sSliceHeaderExt.sSliceHeader.pSps->sFrameCrop),
+            sizeof (SPosOffset)); //confirmed_safe_unsafe_usage
 #ifdef LONG_TERM_REF
-		pCtx->bParamSetsLostFlag      = false;
+    pCtx->bParamSetsLostFlag      = false;
 #else
-		pCtx->bReferenceLostAtT0Flag = false;	// need initialize it due I_SLICE, 6/4/2010
+    pCtx->bReferenceLostAtT0Flag = false;	// need initialize it due I_SLICE, 6/4/2010
 #endif //LONG_TERM_REF
-		WelsLog( pCtx, WELS_LOG_INFO, "DecodeFrameConstruction()::::output good I frame, %d x %d, crop_left:%d, crop_right:%d, crop_top:%d, crop_bottom:%d.\n",
-			kiWidth, kiHeight, pCtx->sFrameCrop.iLeftOffset, pCtx->sFrameCrop.iRightOffset, pCtx->sFrameCrop.iTopOffset, pCtx->sFrameCrop.iBottomOffset );
-		WelsLog( pCtx, WELS_LOG_INFO, "After decoding, set_mode:[%s], eWorkMode:[%s], eBufferProperty:[%s]\n",
-			DECODER_MODE_NAME(pCtx->iSetMode), DECODER_MODE_NAME(pCtx->iDecoderMode), OUTPUT_PROPERTY_NAME(pDstInfo->eBufferProperty));
-	}
-	
-	//////output:::normal path
-	ppDst[0]      = pPic->pData[0];
-	ppDst[1]      = pPic->pData[1];
-	ppDst[2]      = pPic->pData[2];
-	*pDstLen     = pPic->iLinesize[0]; 
-	*(pDstLen+1) = pPic->iLinesize[1];
-	*pWidth      = kiWidth;
-	*pHeight     = kiHeight;
+    WelsLog (pCtx, WELS_LOG_INFO,
+             "DecodeFrameConstruction()::::output good I frame, %d x %d, crop_left:%d, crop_right:%d, crop_top:%d, crop_bottom:%d.\n",
+             kiWidth, kiHeight, pCtx->sFrameCrop.iLeftOffset, pCtx->sFrameCrop.iRightOffset, pCtx->sFrameCrop.iTopOffset,
+             pCtx->sFrameCrop.iBottomOffset);
+    WelsLog (pCtx, WELS_LOG_INFO, "After decoding, set_mode:[%s], eWorkMode:[%s], eBufferProperty:[%s]\n",
+             DECODER_MODE_NAME (pCtx->iSetMode), DECODER_MODE_NAME (pCtx->iDecoderMode),
+             OUTPUT_PROPERTY_NAME (pDstInfo->eBufferProperty));
+  }
 
-	pDstInfo->UsrData.sSystemBuffer.iFormat = videoFormatI420;
+  //////output:::normal path
+  ppDst[0]      = pPic->pData[0];
+  ppDst[1]      = pPic->pData[1];
+  ppDst[2]      = pPic->pData[2];
+  *pDstLen     = pPic->iLinesize[0];
+  * (pDstLen + 1) = pPic->iLinesize[1];
+  *pWidth      = kiWidth;
+  *pHeight     = kiHeight;
 
-	pDstInfo->UsrData.sSystemBuffer.iWidth = kiWidth - (pCtx->sFrameCrop.iLeftOffset + pCtx->sFrameCrop.iRightOffset)*2;
-	pDstInfo->UsrData.sSystemBuffer.iHeight = kiHeight - (pCtx->sFrameCrop.iTopOffset + pCtx->sFrameCrop.iBottomOffset)*2;
-	pDstInfo->UsrData.sSystemBuffer.iStride[0] = pPic->iLinesize[0];
-	pDstInfo->UsrData.sSystemBuffer.iStride[1] = pPic->iLinesize[1];
-	ppDst[0] = ppDst[0] + pCtx->sFrameCrop.iTopOffset*2*pPic->iLinesize[0] + pCtx->sFrameCrop.iLeftOffset*2;
-	ppDst[1] = ppDst[1] + pCtx->sFrameCrop.iTopOffset  *pPic->iLinesize[1] + pCtx->sFrameCrop.iLeftOffset;
-	ppDst[2] = ppDst[2] + pCtx->sFrameCrop.iTopOffset  *pPic->iLinesize[1] + pCtx->sFrameCrop.iLeftOffset;
-	pDstInfo->eBufferProperty = BUFFER_HOST;
-	pDstInfo->iBufferStatus = 1;
+  pDstInfo->UsrData.sSystemBuffer.iFormat = videoFormatI420;
 
-	return 0;
+  pDstInfo->UsrData.sSystemBuffer.iWidth = kiWidth - (pCtx->sFrameCrop.iLeftOffset + pCtx->sFrameCrop.iRightOffset) * 2;
+  pDstInfo->UsrData.sSystemBuffer.iHeight = kiHeight - (pCtx->sFrameCrop.iTopOffset + pCtx->sFrameCrop.iBottomOffset) * 2;
+  pDstInfo->UsrData.sSystemBuffer.iStride[0] = pPic->iLinesize[0];
+  pDstInfo->UsrData.sSystemBuffer.iStride[1] = pPic->iLinesize[1];
+  ppDst[0] = ppDst[0] + pCtx->sFrameCrop.iTopOffset * 2 * pPic->iLinesize[0] + pCtx->sFrameCrop.iLeftOffset * 2;
+  ppDst[1] = ppDst[1] + pCtx->sFrameCrop.iTopOffset  * pPic->iLinesize[1] + pCtx->sFrameCrop.iLeftOffset;
+  ppDst[2] = ppDst[2] + pCtx->sFrameCrop.iTopOffset  * pPic->iLinesize[1] + pCtx->sFrameCrop.iLeftOffset;
+  pDstInfo->eBufferProperty = BUFFER_HOST;
+  pDstInfo->iBufferStatus = 1;
+
+  return 0;
 }
 
-inline BOOL_T    CheckSliceNeedReconstruct(int16_t iCurDid, int16_t iCurQid, bool_t bStoreRefBasePicFlag, 
-	uint8_t uiDidMax, uint8_t uiLayerDqId, uint8_t uiTargetDqId)
-{
-    return ( (iCurDid == uiDidMax) && (iCurQid == BASE_QUALITY_ID) && (bStoreRefBasePicFlag) ) // store base
-       || (uiLayerDqId == uiTargetDqId); // target layer
+inline BOOL_T    CheckSliceNeedReconstruct (int16_t iCurDid, int16_t iCurQid, bool_t bStoreRefBasePicFlag,
+    uint8_t uiDidMax, uint8_t uiLayerDqId, uint8_t uiTargetDqId) {
+  return ((iCurDid == uiDidMax) && (iCurQid == BASE_QUALITY_ID) && (bStoreRefBasePicFlag))   // store base
+         || (uiLayerDqId == uiTargetDqId); // target layer
 }
 
-inline uint8_t GetTargetDqId(uint8_t uiTargetDqId,  SDecodingParam * psParam)
-{
-    uint8_t  uiRequiredDqId = psParam ? psParam->uiTargetDqLayer : (uint8_t)255;
+inline uint8_t GetTargetDqId (uint8_t uiTargetDqId,  SDecodingParam* psParam) {
+  uint8_t  uiRequiredDqId = psParam ? psParam->uiTargetDqLayer : (uint8_t)255;
 
-	return WELS_MIN(uiTargetDqId, uiRequiredDqId);
+  return WELS_MIN (uiTargetDqId, uiRequiredDqId);
 }
-	
 
-inline void_t    HandleReferenceLostL0(PWelsDecoderContext pCtx, PNalUnit pCurNal)
-{
-    if( 0 == pCurNal->sNalHeaderExt.uiTemporalId ){
-		pCtx->bReferenceLostAtT0Flag = true;
-    }
+
+inline void_t    HandleReferenceLostL0 (PWelsDecoderContext pCtx, PNalUnit pCurNal) {
+  if (0 == pCurNal->sNalHeaderExt.uiTemporalId) {
+    pCtx->bReferenceLostAtT0Flag = true;
+  }
 #ifndef LONG_TERM_REF
-	if( pCtx->bReferenceLostAtT0Flag ){
-		ResetParameterSetsState(pCtx);
-	}	
+  if (pCtx->bReferenceLostAtT0Flag) {
+    ResetParameterSetsState (pCtx);
+  }
 #endif
-	pCtx->iErrorCode |= dsBitstreamError;
+  pCtx->iErrorCode |= dsBitstreamError;
 }
 
-inline void_t    HandleReferenceLost(PWelsDecoderContext pCtx, PNalUnit pCurNal)
-{
-    if( (0 == pCurNal->sNalHeaderExt.uiTemporalId) || (1 == pCurNal->sNalHeaderExt.uiTemporalId) ){
-		pCtx->bReferenceLostAtT0Flag = true;
-    }
+inline void_t    HandleReferenceLost (PWelsDecoderContext pCtx, PNalUnit pCurNal) {
+  if ((0 == pCurNal->sNalHeaderExt.uiTemporalId) || (1 == pCurNal->sNalHeaderExt.uiTemporalId)) {
+    pCtx->bReferenceLostAtT0Flag = true;
+  }
 #ifndef LONG_TERM_REF
-	if( pCtx->bReferenceLostAtT0Flag ){
-		ResetParameterSetsState(pCtx);
-    }
+  if (pCtx->bReferenceLostAtT0Flag) {
+    ResetParameterSetsState (pCtx);
+  }
 #endif
-	pCtx->iErrorCode |= dsRefLost;
+  pCtx->iErrorCode |= dsRefLost;
 }
 
-inline int32_t  WelsDecodeConstructSlice(PWelsDecoderContext pCtx, PNalUnit pCurNal)
-{
-    int32_t  iRet = WelsTargetSliceConstruction(pCtx);
+inline int32_t  WelsDecodeConstructSlice (PWelsDecoderContext pCtx, PNalUnit pCurNal) {
+  int32_t  iRet = WelsTargetSliceConstruction (pCtx);
 
-	if( iRet ){
-		HandleReferenceLostL0(pCtx, pCurNal);
-	}
+  if (iRet) {
+    HandleReferenceLostL0 (pCtx, pCurNal);
+  }
 
-	return iRet;
+  return iRet;
 }
 
 /*
  *	Predeclared function routines ..
  */
-int32_t ParseRefPicListReordering ( PBitStringAux pBs, PSliceHeader pSh )
-{
-	int32_t iList = 0;
-	const ESliceType keSt = pSh->eSliceType;
-	PRefPicListReorderSyn pRefPicListReordering = &pSh->pRefPicListReordering;
-	
-	if ( keSt == I_SLICE || keSt == SI_SLICE )
-		return ERR_NONE;
+int32_t ParseRefPicListReordering (PBitStringAux pBs, PSliceHeader pSh) {
+  int32_t iList = 0;
+  const ESliceType keSt = pSh->eSliceType;
+  PRefPicListReorderSyn pRefPicListReordering = &pSh->pRefPicListReordering;
 
-	// Common syntaxs for P or B slices: list0, list1 followed if B slices used.
-	do {
-		pRefPicListReordering->bRefPicListReorderingFlag[iList]	= !!BsGetOneBit( pBs);
+  if (keSt == I_SLICE || keSt == SI_SLICE)
+    return ERR_NONE;
 
-		if ( pRefPicListReordering->bRefPicListReorderingFlag[iList] ){
-			int32_t iIdx = 0;
-			do {
-				const uint8_t kuiIdc = BsGetUe( pBs );	
-				
-				//Fixed the referrence list reordering crash issue.(fault kIdc value > 3 case)---
-				if ((iIdx >= MAX_REF_PIC_COUNT )||(kuiIdc > 3))
-				{
-					return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_INVALID_REF_REORDERING);
-				}
-				pRefPicListReordering->sReorderingSyn[iList][iIdx].uiReorderingOfPicNumsIdc	= kuiIdc;
-				if ( kuiIdc == 3 )
-					break;
+  // Common syntax for P or B slices: list0 first, followed by list1 only when the slice is a B slice.
+  do {
+    pRefPicListReordering->bRefPicListReorderingFlag[iList]	= !!BsGetOneBit (pBs);
 
-				if ( iIdx >= pSh->uiRefCount[iList] || iIdx >= MAX_REF_PIC_COUNT )
-					return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_INVALID_REF_REORDERING);
-				
-				if (kuiIdc == 0 || kuiIdc == 1){
-					pRefPicListReordering->sReorderingSyn[iList][iIdx].uiAbsDiffPicNumMinus1 = BsGetUe( pBs );	// uiAbsDiffPicNumMinus1
-				}
-				else if (kuiIdc == 2){				
-					pRefPicListReordering->sReorderingSyn[iList][iIdx].uiLongTermPicNum= BsGetUe( pBs );			
-				}
-				
-				++ iIdx;
-			} while(true);
-		}
-		if (keSt != B_SLICE)
-			break;
-		++ iList;
-	} while(iList < LIST_A);
-	
-	return ERR_NONE;
+    if (pRefPicListReordering->bRefPicListReorderingFlag[iList]) {
+      int32_t iIdx = 0;
+      do {
+        const uint8_t kuiIdc = BsGetUe (pBs);
+
+        // Fix for the reference list reordering crash: reject an out-of-range index or an invalid idc value (> 3).
+        if ((iIdx >= MAX_REF_PIC_COUNT) || (kuiIdc > 3)) {
+          return GENERATE_ERROR_NO (ERR_LEVEL_SLICE_HEADER, ERR_INFO_INVALID_REF_REORDERING);
+        }
+        pRefPicListReordering->sReorderingSyn[iList][iIdx].uiReorderingOfPicNumsIdc	= kuiIdc;
+        if (kuiIdc == 3)
+          break;
+
+        if (iIdx >= pSh->uiRefCount[iList] || iIdx >= MAX_REF_PIC_COUNT)
+          return GENERATE_ERROR_NO (ERR_LEVEL_SLICE_HEADER, ERR_INFO_INVALID_REF_REORDERING);
+
+        if (kuiIdc == 0 || kuiIdc == 1) {
+          pRefPicListReordering->sReorderingSyn[iList][iIdx].uiAbsDiffPicNumMinus1 = BsGetUe (pBs);	// uiAbsDiffPicNumMinus1
+        } else if (kuiIdc == 2) {
+          pRefPicListReordering->sReorderingSyn[iList][iIdx].uiLongTermPicNum = BsGetUe (pBs);
+        }
+
+        ++ iIdx;
+      } while (true);
+    }
+    if (keSt != B_SLICE)
+      break;
+    ++ iList;
+  } while (iList < LIST_A);
+
+  return ERR_NONE;
 }
 
-int32_t ParseDecRefPicMarking ( PWelsDecoderContext pCtx, PBitStringAux pBs, PSliceHeader pSh, PSps pSps, const bool_t kbIdrFlag)
-{
-	PRefPicMarking const kpRefMarking = &pSh->sRefMarking;
+int32_t ParseDecRefPicMarking (PWelsDecoderContext pCtx, PBitStringAux pBs, PSliceHeader pSh, PSps pSps,
+                               const bool_t kbIdrFlag) {
+  PRefPicMarking const kpRefMarking = &pSh->sRefMarking;
 
-	if ( kbIdrFlag ){
-		kpRefMarking->bNoOutputOfPriorPicsFlag	= !!BsGetOneBit( pBs );
-		kpRefMarking->bLongTermRefFlag			= !!BsGetOneBit( pBs );
-	}
-	else{
-		kpRefMarking->bAdaptiveRefPicMarkingModeFlag	= !!BsGetOneBit( pBs );
-		if (kpRefMarking->bAdaptiveRefPicMarkingModeFlag){
-			int32_t iIdx = 0;
-			do {
-				const int32_t kiMmco = BsGetUe( pBs );
+  if (kbIdrFlag) {
+    kpRefMarking->bNoOutputOfPriorPicsFlag	= !!BsGetOneBit (pBs);
+    kpRefMarking->bLongTermRefFlag			= !!BsGetOneBit (pBs);
+  } else {
+    kpRefMarking->bAdaptiveRefPicMarkingModeFlag	= !!BsGetOneBit (pBs);
+    if (kpRefMarking->bAdaptiveRefPicMarkingModeFlag) {
+      int32_t iIdx = 0;
+      do {
+        const int32_t kiMmco = BsGetUe (pBs);
 
-				kpRefMarking->sMmcoRef[iIdx].uiMmcoType = kiMmco;
-				if (kiMmco == MMCO_END)
-					break;
+        kpRefMarking->sMmcoRef[iIdx].uiMmcoType = kiMmco;
+        if (kiMmco == MMCO_END)
+          break;
 
-				if (kiMmco == MMCO_SHORT2UNUSED || kiMmco == MMCO_SHORT2LONG)
-				{
-					kpRefMarking->sMmcoRef[iIdx].iDiffOfPicNum = 1 + BsGetUe( pBs );
-					kpRefMarking->sMmcoRef[iIdx].iShortFrameNum = (pSh->iFrameNum - kpRefMarking->sMmcoRef[iIdx].iDiffOfPicNum) & ((1<<pSps->uiLog2MaxFrameNum)-1);
-				}
-				else if (kiMmco == MMCO_LONG2UNUSED)
-					kpRefMarking->sMmcoRef[iIdx].uiLongTermPicNum = BsGetUe( pBs );
+        if (kiMmco == MMCO_SHORT2UNUSED || kiMmco == MMCO_SHORT2LONG) {
+          kpRefMarking->sMmcoRef[iIdx].iDiffOfPicNum = 1 + BsGetUe (pBs);
+          kpRefMarking->sMmcoRef[iIdx].iShortFrameNum = (pSh->iFrameNum - kpRefMarking->sMmcoRef[iIdx].iDiffOfPicNum) & ((
+                1 << pSps->uiLog2MaxFrameNum) - 1);
+        } else if (kiMmco == MMCO_LONG2UNUSED)
+          kpRefMarking->sMmcoRef[iIdx].uiLongTermPicNum = BsGetUe (pBs);
 
-				if (kiMmco == MMCO_SHORT2LONG || kiMmco == MMCO_LONG)
-				{
-					kpRefMarking->sMmcoRef[iIdx].iLongTermFrameIdx = BsGetUe( pBs );
-				}
-				else if (kiMmco == MMCO_SET_MAX_LONG)
-					kpRefMarking->sMmcoRef[iIdx].iMaxLongTermFrameIdx = -1 + BsGetUe( pBs );
-				++ iIdx;
+        if (kiMmco == MMCO_SHORT2LONG || kiMmco == MMCO_LONG) {
+          kpRefMarking->sMmcoRef[iIdx].iLongTermFrameIdx = BsGetUe (pBs);
+        } else if (kiMmco == MMCO_SET_MAX_LONG)
+          kpRefMarking->sMmcoRef[iIdx].iMaxLongTermFrameIdx = -1 + BsGetUe (pBs);
+        ++ iIdx;
 
-			} while(iIdx < MAX_MMCO_COUNT);
-		}	
-	}
-	
-	return ERR_NONE;
+      } while (iIdx < MAX_MMCO_COUNT);
+    }
+  }
+
+  return ERR_NONE;
 }
 
-bool_t FillDefaultSliceHeaderExt ( PSliceHeaderExt pShExt, PNalUnitHeaderExt pNalExt )
-{
-	if ( pShExt == NULL || pNalExt == NULL )
-		return false;
+bool_t FillDefaultSliceHeaderExt (PSliceHeaderExt pShExt, PNalUnitHeaderExt pNalExt) {
+  if (pShExt == NULL || pNalExt == NULL)
+    return false;
 
-	if ( pNalExt->iNoInterLayerPredFlag || pNalExt->uiQualityId > 0 )
-		pShExt->bBasePredWeightTableFlag	= false;
-	else
-		pShExt->bBasePredWeightTableFlag	= true;
-    pShExt->uiRefLayerDqId = (uint8_t)-1;
-	pShExt->uiDisableInterLayerDeblockingFilterIdc	= 0;
-	pShExt->iInterLayerSliceAlphaC0Offset			= 0;
-	pShExt->iInterLayerSliceBetaOffset				= 0;
-	pShExt->bConstrainedIntraResamplingFlag			= false;
-	pShExt->uiRefLayerChromaPhaseXPlus1Flag			= 0;
-	pShExt->uiRefLayerChromaPhaseYPlus1				= 1;
-	//memset(&pShExt->sScaledRefLayer, 0, sizeof(SPosOffset));
+  if (pNalExt->iNoInterLayerPredFlag || pNalExt->uiQualityId > 0)
+    pShExt->bBasePredWeightTableFlag	= false;
+  else
+    pShExt->bBasePredWeightTableFlag	= true;
+  pShExt->uiRefLayerDqId = (uint8_t) - 1;
+  pShExt->uiDisableInterLayerDeblockingFilterIdc	= 0;
+  pShExt->iInterLayerSliceAlphaC0Offset			= 0;
+  pShExt->iInterLayerSliceBetaOffset				= 0;
+  pShExt->bConstrainedIntraResamplingFlag			= false;
+  pShExt->uiRefLayerChromaPhaseXPlus1Flag			= 0;
+  pShExt->uiRefLayerChromaPhaseYPlus1				= 1;
+  //memset(&pShExt->sScaledRefLayer, 0, sizeof(SPosOffset));
 
-	pShExt->iScaledRefLayerPicWidthInSampleLuma	= pShExt->sSliceHeader.iMbWidth << 4;
-	pShExt->iScaledRefLayerPicHeightInSampleLuma	= pShExt->sSliceHeader.iMbHeight << 4;
+  pShExt->iScaledRefLayerPicWidthInSampleLuma	= pShExt->sSliceHeader.iMbWidth << 4;
+  pShExt->iScaledRefLayerPicHeightInSampleLuma	= pShExt->sSliceHeader.iMbHeight << 4;
 
-	pShExt->bSliceSkipFlag	= false;
-	pShExt->bAdaptiveBaseModeFlag	= false;
-	pShExt->bDefaultBaseModeFlag	= false;
-	pShExt->bAdaptiveMotionPredFlag	= false;
-	pShExt->bDefaultMotionPredFlag	= false;
-	pShExt->bAdaptiveResidualPredFlag	= false;
-	pShExt->bDefaultResidualPredFlag	= false;
-	pShExt->bTCoeffLevelPredFlag		= false;
-	pShExt->uiScanIdxStart				= 0;
-	pShExt->uiScanIdxEnd				= 15;
+  pShExt->bSliceSkipFlag	= false;
+  pShExt->bAdaptiveBaseModeFlag	= false;
+  pShExt->bDefaultBaseModeFlag	= false;
+  pShExt->bAdaptiveMotionPredFlag	= false;
+  pShExt->bDefaultMotionPredFlag	= false;
+  pShExt->bAdaptiveResidualPredFlag	= false;
+  pShExt->bDefaultResidualPredFlag	= false;
+  pShExt->bTCoeffLevelPredFlag		= false;
+  pShExt->uiScanIdxStart				= 0;
+  pShExt->uiScanIdxEnd				= 15;
 
-	return true;
+  return true;
 }
 
 /*
@@ -305,58 +295,54 @@
  * return:
  *	0 - success; otherwise returned error_no defined in error_no.h.
 */
-int32_t WelsInitMemory( PWelsDecoderContext pCtx )
-{
-	if (pCtx == NULL){
-		return ERR_INFO_INVALID_PTR;
-	}
+int32_t WelsInitMemory (PWelsDecoderContext pCtx) {
+  if (pCtx == NULL) {
+    return ERR_INFO_INVALID_PTR;
+  }
 
-	if ( MemInitNalList( &pCtx->pAccessUnitList, MAX_NAL_UNIT_NUM_IN_AU ) != 0 )
-		return ERR_INFO_OUT_OF_MEMORY;	
+  if (MemInitNalList (&pCtx->pAccessUnitList, MAX_NAL_UNIT_NUM_IN_AU) != 0)
+    return ERR_INFO_OUT_OF_MEMORY;
 
-	if ( ( pCtx->sRawData.pHead = static_cast<uint8_t*> (WelsMalloc( MAX_ACCESS_UINT_CAPACITY, "pCtx->sRawData->pHead" )) ) == NULL )
-	{
-		return ERR_INFO_OUT_OF_MEMORY;
-	}
-	pCtx->sRawData.pStartPos               =
-	pCtx->sRawData.pCurPos                 = pCtx->sRawData.pHead;
-	pCtx->sRawData.pEnd                     = pCtx->sRawData.pHead + MAX_ACCESS_UINT_CAPACITY;	
-	
-	pCtx->uiTargetDqId			= (uint8_t)-1;
-	pCtx->bEndOfStreamFlag	= false;
-	pCtx->iImgWidthInPixel	= 0;
-	pCtx->iImgHeightInPixel	= 0;	
-	
-	return ERR_NONE;
+  if ((pCtx->sRawData.pHead = static_cast<uint8_t*> (WelsMalloc (MAX_ACCESS_UINT_CAPACITY,
+                              "pCtx->sRawData->pHead"))) == NULL) {
+    return ERR_INFO_OUT_OF_MEMORY;
+  }
+  pCtx->sRawData.pStartPos               =
+    pCtx->sRawData.pCurPos                 = pCtx->sRawData.pHead;
+  pCtx->sRawData.pEnd                     = pCtx->sRawData.pHead + MAX_ACCESS_UINT_CAPACITY;
+
+  pCtx->uiTargetDqId			= (uint8_t) - 1;
+  pCtx->bEndOfStreamFlag	= false;
+  pCtx->iImgWidthInPixel	= 0;
+  pCtx->iImgHeightInPixel	= 0;
+
+  return ERR_NONE;
 }
 
 /*
  * WelsFreeMemory
  * Free memory introduced in WelsInitMemory at destruction of decoder.
- * 
+ *
  */
-void_t WelsFreeMemory( PWelsDecoderContext pCtx )
-{
-	if ( pCtx == NULL )
-		return;
+void_t WelsFreeMemory (PWelsDecoderContext pCtx) {
+  if (pCtx == NULL)
+    return;
 
-	if ( NULL != pCtx->pParam )
-	{
-		WelsFree( pCtx->pParam, "pCtx->pParam" );
+  if (NULL != pCtx->pParam) {
+    WelsFree (pCtx->pParam, "pCtx->pParam");
 
-		pCtx->pParam = NULL;
-	}
+    pCtx->pParam = NULL;
+  }
 
-	MemFreeNalList( &pCtx->pAccessUnitList );
-			
-	if ( pCtx->sRawData.pHead )
-	{
-		WelsFree(pCtx->sRawData.pHead, "pCtx->sRawData->pHead");		
-	}
-	pCtx->sRawData.pHead                = NULL;
-	pCtx->sRawData.pEnd                 = NULL;
-	pCtx->sRawData.pStartPos	        = NULL;
-	pCtx->sRawData.pCurPos             = NULL;	
+  MemFreeNalList (&pCtx->pAccessUnitList);
+
+  if (pCtx->sRawData.pHead) {
+    WelsFree (pCtx->sRawData.pHead, "pCtx->sRawData->pHead");
+  }
+  pCtx->sRawData.pHead                = NULL;
+  pCtx->sRawData.pEnd                 = NULL;
+  pCtx->sRawData.pStartPos	        = NULL;
+  pCtx->sRawData.pCurPos             = NULL;
 }
 
 /*
@@ -366,138 +352,117 @@
  *	pNal:	target NALUnit ptr
  *	pSrc:	NAL Unit bitstream
  */
-void_t DecodeNalHeaderExt( PNalUnit pNal, uint8_t* pSrc )
-{
-	PNalUnitHeaderExt pHeaderExt = &pNal->sNalHeaderExt;	
+void_t DecodeNalHeaderExt (PNalUnit pNal, uint8_t* pSrc) {
+  PNalUnitHeaderExt pHeaderExt = &pNal->sNalHeaderExt;
 
-	uint8_t uiCurByte = *pSrc;	
-	pHeaderExt->bIdrFlag				 = !!(uiCurByte & 0x40);
-	pHeaderExt->uiPriorityId			 = uiCurByte & 0x3F;
+  uint8_t uiCurByte = *pSrc;
+  pHeaderExt->bIdrFlag				 = !! (uiCurByte & 0x40);
+  pHeaderExt->uiPriorityId			 = uiCurByte & 0x3F;
 
-	uiCurByte = *(++pSrc);
-	pHeaderExt->iNoInterLayerPredFlag = uiCurByte >> 7;
-	pHeaderExt->uiDependencyId			 = (uiCurByte & 0x70) >> 4;
-	pHeaderExt->uiQualityId				 = uiCurByte & 0x0F;
-	uiCurByte = *(++pSrc);
-	pHeaderExt->uiTemporalId			 = uiCurByte >> 5;
-	pHeaderExt->bUseRefBasePicFlag	     = !!(uiCurByte & 0x10);
-	pHeaderExt->bDiscardableFlag		 = !!(uiCurByte & 0x08);
-	pHeaderExt->bOutputFlag				 = !!(uiCurByte & 0x04);
-	pHeaderExt->uiReservedThree2Bits	 = uiCurByte & 0x03;	
-	pHeaderExt->uiLayerDqId				 = (pHeaderExt->uiDependencyId << 4) | pHeaderExt->uiQualityId;
+  uiCurByte = * (++pSrc);
+  pHeaderExt->iNoInterLayerPredFlag = uiCurByte >> 7;
+  pHeaderExt->uiDependencyId			 = (uiCurByte & 0x70) >> 4;
+  pHeaderExt->uiQualityId				 = uiCurByte & 0x0F;
+  uiCurByte = * (++pSrc);
+  pHeaderExt->uiTemporalId			 = uiCurByte >> 5;
+  pHeaderExt->bUseRefBasePicFlag	     = !! (uiCurByte & 0x10);
+  pHeaderExt->bDiscardableFlag		 = !! (uiCurByte & 0x08);
+  pHeaderExt->bOutputFlag				 = !! (uiCurByte & 0x04);
+  pHeaderExt->uiReservedThree2Bits	 = uiCurByte & 0x03;
+  pHeaderExt->uiLayerDqId				 = (pHeaderExt->uiDependencyId << 4) | pHeaderExt->uiQualityId;
 }
 
 
 #ifdef MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
-int32_t CheckPpsId( PWelsDecoderContext pCtx, PPps* ppPps, uint32_t uiPpsId )
-{
-	PPps pPpsList = pCtx->sPpsBuffer;
-	int32_t iPpsNum = pCtx->iPpsTotalNum;
-	int32_t i = 0;
+int32_t CheckPpsId (PWelsDecoderContext pCtx, PPps* ppPps, uint32_t uiPpsId) {
+  PPps pPpsList = pCtx->sPpsBuffer;
+  int32_t iPpsNum = pCtx->iPpsTotalNum;
+  int32_t i = 0;
 
-	if ( iPpsNum <= 0 )
-	{
-		pCtx->iErrorCode |= dsNoParamSets;
-		
-		WelsLog( pCtx, WELS_LOG_WARNING, "CheckPpsId():::::PPS list is empty...NO PPS!!!\n" );
-		return dsNoParamSets;
-	}
+  if (iPpsNum <= 0) {
+    pCtx->iErrorCode |= dsNoParamSets;
 
-	while ( i < iPpsNum )
-	{
-		if ( uiPpsId == pPpsList[i].iPpsId )
-		{
-			*ppPps = &pPpsList[i];
-			break;
-		}
-		else
-		{
-			++i;
-		}
-	}
+    WelsLog (pCtx, WELS_LOG_WARNING, "CheckPpsId():::::PPS list is empty...NO PPS!!!\n");
+    return dsNoParamSets;
+  }
 
-	if ( i == iPpsNum )
-	{
-		pCtx->iErrorCode |= dsNoParamSets;
-		
-		WelsLog( pCtx, WELS_LOG_WARNING, "CheckPpsId()::::::CAN NOT find the matching from the PPS List.  iPpsId:%d\n", uiPpsId );
-		return dsNoParamSets;
-	}
+  while (i < iPpsNum) {
+    if (uiPpsId == pPpsList[i].iPpsId) {
+      *ppPps = &pPpsList[i];
+      break;
+    } else {
+      ++i;
+    }
+  }
 
-	return 0;
+  if (i == iPpsNum) {
+    pCtx->iErrorCode |= dsNoParamSets;
+
+    WelsLog (pCtx, WELS_LOG_WARNING, "CheckPpsId()::::::CAN NOT find the matching from the PPS List.  iPpsId:%d\n",
+             uiPpsId);
+    return dsNoParamSets;
+  }
+
+  return 0;
 }
 
-int32_t CheckSpsId( PWelsDecoderContext pCtx, PSubsetSps* ppSubsetSps, PSps* ppSps, int32_t iSpsId, bool_t bExtensionFlag )
-{
-	PSps pSpsList = pCtx->sSpsBuffer;
-	PSubsetSps pSubspsList = pCtx->sSubsetSpsBuffer;
+int32_t CheckSpsId (PWelsDecoderContext pCtx, PSubsetSps* ppSubsetSps, PSps* ppSps, int32_t iSpsId,
+                    bool_t bExtensionFlag) {
+  PSps pSpsList = pCtx->sSpsBuffer;
+  PSubsetSps pSubspsList = pCtx->sSubsetSpsBuffer;
 
-	int32_t iSpsNum    = pCtx->iSpsTotalNum;
-	int32_t iSubspsNum = pCtx->iSubspsTotalNum;
-	int32_t i = 0;
+  int32_t iSpsNum    = pCtx->iSpsTotalNum;
+  int32_t iSubspsNum = pCtx->iSubspsTotalNum;
+  int32_t i = 0;
 
-	if ( bExtensionFlag )
-	{
-		if ( iSubspsNum <= 0 )
-		{
-			pCtx->iErrorCode |= dsNoParamSets;
-			
-			WelsLog( pCtx, WELS_LOG_WARNING, "CheckSpsId()::::SUBSPS list is empty....NO SUBSPS\n" );
-			return dsNoParamSets;
-		}
-		while ( i < iSubspsNum )
-		{
-			if ( iSpsId == pSubspsList[i].sSps.iSpsId )
-			{
-				*ppSubsetSps = &pSubspsList[i];
-				*ppSps       = &pSubspsList[i].sSps;
-				break;
-			}
-			else
-			{
-				++i;
-			}
-		}
-		if ( i == iSubspsNum )
-		{
-			pCtx->iErrorCode |= dsNoParamSets;
-			
-			WelsLog( pCtx, WELS_LOG_WARNING, "CheckSpsId()::::::CAN NOT find the matching from the SUBSPS List.  iSpsId:%d\n", iSpsId );
-			return dsNoParamSets;
-		}
-	}
-	else
-	{
-		if ( iSpsNum <= 0 )
-		{
-			pCtx->iErrorCode |= dsNoParamSets;
-			
-			WelsLog( pCtx, WELS_LOG_WARNING, "CheckSpsId()::::SPS list is empty....NO SPS\n" );
-			return dsNoParamSets;
-		}
-		while ( i < iSpsNum )
-		{
-			if ( iSpsId == pSpsList[i].iSpsId )
-			{
-				*ppSubsetSps = NULL;
-				*ppSps       = &pSpsList[i];
-				break;
-			}
-			else
-			{
-				++i;
-			}
-		}
-		if ( i == iSpsNum )
-		{
-			pCtx->iErrorCode |= dsNoParamSets;
-			
-			WelsLog( pCtx, WELS_LOG_WARNING, "CheckSpsId()::::::CAN NOT find the matching from the SPS List.  iSpsId:%d\n", iSpsId );
-			return dsNoParamSets;
-		}
-	}
-	
-	return 0;
+  if (bExtensionFlag) {
+    if (iSubspsNum <= 0) {
+      pCtx->iErrorCode |= dsNoParamSets;
+
+      WelsLog (pCtx, WELS_LOG_WARNING, "CheckSpsId()::::SUBSPS list is empty....NO SUBSPS\n");
+      return dsNoParamSets;
+    }
+    while (i < iSubspsNum) {
+      if (iSpsId == pSubspsList[i].sSps.iSpsId) {
+        *ppSubsetSps = &pSubspsList[i];
+        *ppSps       = &pSubspsList[i].sSps;
+        break;
+      } else {
+        ++i;
+      }
+    }
+    if (i == iSubspsNum) {
+      pCtx->iErrorCode |= dsNoParamSets;
+
+      WelsLog (pCtx, WELS_LOG_WARNING, "CheckSpsId()::::::CAN NOT find the matching from the SUBSPS List.  iSpsId:%d\n",
+               iSpsId);
+      return dsNoParamSets;
+    }
+  } else {
+    if (iSpsNum <= 0) {
+      pCtx->iErrorCode |= dsNoParamSets;
+
+      WelsLog (pCtx, WELS_LOG_WARNING, "CheckSpsId()::::SPS list is empty....NO SPS\n");
+      return dsNoParamSets;
+    }
+    while (i < iSpsNum) {
+      if (iSpsId == pSpsList[i].iSpsId) {
+        *ppSubsetSps = NULL;
+        *ppSps       = &pSpsList[i];
+        break;
+      } else {
+        ++i;
+      }
+    }
+    if (i == iSpsNum) {
+      pCtx->iErrorCode |= dsNoParamSets;
+
+      WelsLog (pCtx, WELS_LOG_WARNING, "CheckSpsId()::::::CAN NOT find the matching from the SPS List.  iSpsId:%d\n", iSpsId);
+      return dsNoParamSets;
+    }
+  }
+
+  return 0;
 }
 
 #endif
@@ -505,384 +470,371 @@
  *	decode_slice_header_avc
  *	Parse slice header of bitstream in avc for storing data structure
  */
-int32_t ParseSliceHeaderSyntaxs ( PWelsDecoderContext pCtx, PBitStringAux pBs, const bool_t kbExtensionFlag )
-{
-	PNalUnit const kpCurNal				= pCtx->pAccessUnitList->pNalUnitsList[pCtx->pAccessUnitList->uiAvailUnitsNum-1];
-	
-	PNalUnitHeaderExt pNalHeaderExt	= NULL;
-	PSliceHeader pSliceHead			= NULL;
-	PSliceHeaderExt pSliceHeadExt	= NULL;
-	PSubsetSps pSubsetSps				= NULL;
-	PSps pSps							= NULL;
-	PPps pPps							= NULL;
-	ENalUnitType eNalType				= static_cast<ENalUnitType> (0);
-	int32_t iPpsId						= 0;
-	int32_t iRet						= ERR_NONE;
-	uint8_t uiSliceType				= 0;
-	uint8_t uiQualityId					= BASE_QUALITY_ID;
-	bool_t	bIdrFlag					= false;
-	bool_t	bSgChangeCycleInvolved	= false;	// involved slice group change cycle ?
-		
-	if (kpCurNal == NULL)
-	{	
-		return ERR_INFO_OUT_OF_MEMORY;
-	}
-	
-	pNalHeaderExt	= &kpCurNal->sNalHeaderExt;
-	pSliceHead		= &kpCurNal->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader;
-	eNalType		= pNalHeaderExt->sNalUnitHeader.eNalUnitType;
-	
-	pSliceHeadExt	= &kpCurNal->sNalData.sVclNal.sSliceHeaderExt;
+int32_t ParseSliceHeaderSyntaxs (PWelsDecoderContext pCtx, PBitStringAux pBs, const bool_t kbExtensionFlag) {
+  PNalUnit const kpCurNal				= pCtx->pAccessUnitList->pNalUnitsList[pCtx->pAccessUnitList->uiAvailUnitsNum - 1];
 
-	if ( pSliceHeadExt ){
-		SRefBasePicMarking sBaseMarking;
-		const bool_t kbStoreRefBaseFlag = pSliceHeadExt->bStoreRefBasePicFlag;
-		memcpy(&sBaseMarking, &pSliceHeadExt->sRefBasePicMarking, sizeof(SRefBasePicMarking));//confirmed_safe_unsafe_usage
-		memset(pSliceHeadExt, 0, sizeof(SSliceHeaderExt));
-		pSliceHeadExt->bStoreRefBasePicFlag	= kbStoreRefBaseFlag;
-		memcpy(&pSliceHeadExt->sRefBasePicMarking, &sBaseMarking, sizeof(SRefBasePicMarking));//confirmed_safe_unsafe_usage
-	}
-	
-	kpCurNal->sNalData.sVclNal.bSliceHeaderExtFlag	= kbExtensionFlag;
+  PNalUnitHeaderExt pNalHeaderExt	= NULL;
+  PSliceHeader pSliceHead			= NULL;
+  PSliceHeaderExt pSliceHeadExt	= NULL;
+  PSubsetSps pSubsetSps				= NULL;
+  PSps pSps							= NULL;
+  PPps pPps							= NULL;
+  ENalUnitType eNalType				= static_cast<ENalUnitType> (0);
+  int32_t iPpsId						= 0;
+  int32_t iRet						= ERR_NONE;
+  uint8_t uiSliceType				= 0;
+  uint8_t uiQualityId					= BASE_QUALITY_ID;
+  bool_t	bIdrFlag					= false;
+  bool_t	bSgChangeCycleInvolved	= false;	// involved slice group change cycle ?
 
-	pSliceHead->iFirstMbInSlice	= BsGetUe( pBs );
+  if (kpCurNal == NULL) {
+    return ERR_INFO_OUT_OF_MEMORY;
+  }
 
-    uiSliceType= BsGetUe( pBs );
-    if(uiSliceType > 9)
-	{
-		WelsLog( pCtx, WELS_LOG_WARNING, "slice type too large (%d) at first_mb(%d)\n", uiSliceType, pSliceHead->iFirstMbInSlice);
-		return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_INVALID_SLICE_TYPE);
-    }
-    if(uiSliceType > 4)
-		uiSliceType -= 5;
-	
-	if ( kbExtensionFlag ){	
-		if (uiSliceType > 2){
-			WelsLog( pCtx, WELS_LOG_WARNING, "Invalid slice type(%d).\n", uiSliceType);
-			return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_INVALID_SLICE_TYPE);
-		}
-	}
-	
-	pSliceHead->eSliceType	= static_cast <ESliceType> (uiSliceType);
+  pNalHeaderExt	= &kpCurNal->sNalHeaderExt;
+  pSliceHead		= &kpCurNal->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader;
+  eNalType		= pNalHeaderExt->sNalUnitHeader.eNalUnitType;
 
-    iPpsId= BsGetUe( pBs );
-    
-	if(iPpsId >= MAX_PPS_COUNT){
-		WelsLog( pCtx, WELS_LOG_WARNING, "iPpsId out of range\n");
-        return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_PPS_ID_OVERFLOW);
+  pSliceHeadExt	= &kpCurNal->sNalData.sVclNal.sSliceHeaderExt;
+
+  if (pSliceHeadExt) {
+    SRefBasePicMarking sBaseMarking;
+    const bool_t kbStoreRefBaseFlag = pSliceHeadExt->bStoreRefBasePicFlag;
+    memcpy (&sBaseMarking, &pSliceHeadExt->sRefBasePicMarking, sizeof (SRefBasePicMarking)); //confirmed_safe_unsafe_usage
+    memset (pSliceHeadExt, 0, sizeof (SSliceHeaderExt));
+    pSliceHeadExt->bStoreRefBasePicFlag	= kbStoreRefBaseFlag;
+    memcpy (&pSliceHeadExt->sRefBasePicMarking, &sBaseMarking, sizeof (SRefBasePicMarking)); //confirmed_safe_unsafe_usage
+  }
+
+  kpCurNal->sNalData.sVclNal.bSliceHeaderExtFlag	= kbExtensionFlag;
+
+  pSliceHead->iFirstMbInSlice	= BsGetUe (pBs);
+
+  uiSliceType = BsGetUe (pBs);
+  if (uiSliceType > 9) {
+    WelsLog (pCtx, WELS_LOG_WARNING, "slice type too large (%d) at first_mb(%d)\n", uiSliceType,
+             pSliceHead->iFirstMbInSlice);
+    return GENERATE_ERROR_NO (ERR_LEVEL_SLICE_HEADER, ERR_INFO_INVALID_SLICE_TYPE);
+  }
+  if (uiSliceType > 4)
+    uiSliceType -= 5;
+
+  if (kbExtensionFlag) {
+    if (uiSliceType > 2) {
+      WelsLog (pCtx, WELS_LOG_WARNING, "Invalid slice type(%d).\n", uiSliceType);
+      return GENERATE_ERROR_NO (ERR_LEVEL_SLICE_HEADER, ERR_INFO_INVALID_SLICE_TYPE);
     }
+  }
 
+  pSliceHead->eSliceType	= static_cast <ESliceType> (uiSliceType);
+
+  iPpsId = BsGetUe (pBs);
+
+  if (iPpsId >= MAX_PPS_COUNT) {
+    WelsLog (pCtx, WELS_LOG_WARNING, "iPpsId out of range\n");
+    return GENERATE_ERROR_NO (ERR_LEVEL_SLICE_HEADER, ERR_INFO_PPS_ID_OVERFLOW);
+  }
+
 #ifdef MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
-	if ( CheckPpsId( pCtx, &pPps, iPpsId ) )
-	{
-		return dsNoParamSets;
-	}
+  if (CheckPpsId (pCtx, &pPps, iPpsId)) {
+    return dsNoParamSets;
+  }
 #else
-	pPps    = &pCtx->sPpsBuffer[iPpsId];
+  pPps    = &pCtx->sPpsBuffer[iPpsId];
 #endif //MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
 
-	if (pPps->uiNumSliceGroups == 0){
-		WelsLog( pCtx, WELS_LOG_WARNING, "non existing PPS referenced\n");
-        return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_NO_PARAM_SETS);
-    }
+  if (pPps->uiNumSliceGroups == 0) {
+    WelsLog (pCtx, WELS_LOG_WARNING, "non existing PPS referenced\n");
+    return GENERATE_ERROR_NO (ERR_LEVEL_SLICE_HEADER, ERR_INFO_NO_PARAM_SETS);
+  }
 
-	if (pPps->iSpsId >= MAX_SPS_COUNT){
-		WelsLog( pCtx, WELS_LOG_WARNING, "iSpsId out of range\n");
-        return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_SPS_ID_OVERFLOW);
-	}
+  if (pPps->iSpsId >= MAX_SPS_COUNT) {
+    WelsLog (pCtx, WELS_LOG_WARNING, "iSpsId out of range\n");
+    return GENERATE_ERROR_NO (ERR_LEVEL_SLICE_HEADER, ERR_INFO_SPS_ID_OVERFLOW);
+  }
 
-	
+
 #ifdef MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
-	if ( CheckSpsId( pCtx, &pSubsetSps, &pSps, pPps->iSpsId, kExtensionFlag ) )
-	{
-		return dsNoParamSets;
-	}
-#else	
-	if ( kbExtensionFlag )
-	{
-		pSubsetSps	= &pCtx->sSubsetSpsBuffer[pPps->iSpsId];
-		pSps		= &pSubsetSps->sSps;
-	}
-	else
-	{
-		pSps		= &pCtx->sSpsBuffer[pPps->iSpsId];
-	}
-	pCtx->pSps			= pSps;
+  if (CheckSpsId (pCtx, &pSubsetSps, &pSps, pPps->iSpsId, kExtensionFlag)) {
+    return dsNoParamSets;
+  }
+#else
+  if (kbExtensionFlag) {
+    pSubsetSps	= &pCtx->sSubsetSpsBuffer[pPps->iSpsId];
+    pSps		= &pSubsetSps->sSps;
+  } else {
+    pSps		= &pCtx->sSpsBuffer[pPps->iSpsId];
+  }
+  pCtx->pSps			= pSps;
 #endif //MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
-	pSliceHead->iPpsId = iPpsId;
-	pSliceHead->iSpsId = pPps->iSpsId; 
-	pSliceHead->pPps   = pPps;
-	pSliceHead->pSps   = pSps;
+  pSliceHead->iPpsId = iPpsId;
+  pSliceHead->iSpsId = pPps->iSpsId;
+  pSliceHead->pPps   = pPps;
+  pSliceHead->pSps   = pSps;
 
-	pSliceHeadExt->pSubsetSps = pSubsetSps;
-	
-	bIdrFlag = (!kbExtensionFlag && eNalType == NAL_UNIT_CODED_SLICE_IDR) || (kbExtensionFlag && pNalHeaderExt->bIdrFlag);
+  pSliceHeadExt->pSubsetSps = pSubsetSps;
 
-    if(pSps->uiLog2MaxFrameNum == 0){
-		WelsLog( pCtx, WELS_LOG_WARNING, "non existing SPS referenced\n");
-        return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_NO_PARAM_SETS);
-    }
-	pSliceHead->iFrameNum = BsGetBits(pBs, pSps->uiLog2MaxFrameNum);	
+  bIdrFlag = (!kbExtensionFlag && eNalType == NAL_UNIT_CODED_SLICE_IDR) || (kbExtensionFlag && pNalHeaderExt->bIdrFlag);
 
-	pSliceHead->bFieldPicFlag		= false;
-	pSliceHead->bBottomFiledFlag	= false;
-	if( !pSps->bFrameMbsOnlyFlag ){
-        WelsLog( pCtx, WELS_LOG_WARNING, "ParseSliceHeaderSyntaxs(): frame_mbs_only_flag = %d not supported. \n", pSps->bFrameMbsOnlyFlag );
-        return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_MBAFF);
-	}
-	pSliceHead->iMbWidth	= pSps->iMbWidth;
-	pSliceHead->iMbHeight	= pSps->iMbHeight / (1 + pSliceHead->bFieldPicFlag);
-	
-	if ( bIdrFlag ){
-		if ( pSliceHead->iFrameNum != 0 ){
-			WelsLog( pCtx, WELS_LOG_WARNING, "ParseSliceHeaderSyntaxs(), invaild frame number: %d due to IDR frame introduced!\n", pSliceHead->iFrameNum);
-			return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_INVALID_FRAME_NUM);
-		}
-		pSliceHead->uiIdrPicId	= BsGetUe(pBs); /* uiIdrPicId */
+  if (pSps->uiLog2MaxFrameNum == 0) {
+    WelsLog (pCtx, WELS_LOG_WARNING, "non existing SPS referenced\n");
+    return GENERATE_ERROR_NO (ERR_LEVEL_SLICE_HEADER, ERR_INFO_NO_PARAM_SETS);
+  }
+  pSliceHead->iFrameNum = BsGetBits (pBs, pSps->uiLog2MaxFrameNum);
+
+  pSliceHead->bFieldPicFlag		= false;
+  pSliceHead->bBottomFiledFlag	= false;
+  if (!pSps->bFrameMbsOnlyFlag) {
+    WelsLog (pCtx, WELS_LOG_WARNING, "ParseSliceHeaderSyntaxs(): frame_mbs_only_flag = %d not supported. \n",
+             pSps->bFrameMbsOnlyFlag);
+    return GENERATE_ERROR_NO (ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_MBAFF);
+  }
+  pSliceHead->iMbWidth	= pSps->iMbWidth;
+  pSliceHead->iMbHeight	= pSps->iMbHeight / (1 + pSliceHead->bFieldPicFlag);
+
+  if (bIdrFlag) {
+    if (pSliceHead->iFrameNum != 0) {
+      WelsLog (pCtx, WELS_LOG_WARNING, "ParseSliceHeaderSyntaxs(), invaild frame number: %d due to IDR frame introduced!\n",
+               pSliceHead->iFrameNum);
+      return GENERATE_ERROR_NO (ERR_LEVEL_SLICE_HEADER, ERR_INFO_INVALID_FRAME_NUM);
+    }
+    pSliceHead->uiIdrPicId	= BsGetUe (pBs); /* uiIdrPicId */
 #ifdef LONG_TERM_REF
-		pCtx->uiCurIdrPicId      = pSliceHead->uiIdrPicId;
+    pCtx->uiCurIdrPicId      = pSliceHead->uiIdrPicId;
 #endif
-	}
-	
-	pSliceHead->iDeltaPicOrderCntBottom	= 0;
-	pSliceHead->iDeltaPicOrderCnt[0]		=
-	pSliceHead->iDeltaPicOrderCnt[1]		= 0;
-	if(pSps->uiPocType == 0){
-		pSliceHead->iPicOrderCntLsb	= BsGetBits(pBs, pSps->iLog2MaxPocLsb);	
-		if( pPps->bPicOrderPresentFlag && !pSliceHead->bFieldPicFlag ){
-			pSliceHead->iDeltaPicOrderCntBottom	= BsGetSe(pBs);
-		}
-	}
-	else if(pSps->uiPocType == 1 && !pSps->bDeltaPicOrderAlwaysZeroFlag ){
-		pSliceHead->iDeltaPicOrderCnt[0]	= BsGetSe(pBs);
-		if( pPps->bPicOrderPresentFlag && !pSliceHead->bFieldPicFlag )
-			pSliceHead->iDeltaPicOrderCnt[1]= BsGetSe(pBs);
-	}
-	
-	pSliceHead->iRedundantPicCnt	= 0;
-	if( pPps->bRedundantPicCntPresentFlag ){
-		pSliceHead->iRedundantPicCnt = BsGetUe(pBs);
-	}
+  }
 
-    //set defaults, might be overriden a few line later
-	pSliceHead->uiRefCount[0]	= pPps->uiNumRefIdxL0Active;
-	pSliceHead->uiRefCount[1]	= pPps->uiNumRefIdxL1Active;
-	if ( kbExtensionFlag ){
-		uiQualityId = pNalHeaderExt->uiQualityId;
-		if ( BASE_QUALITY_ID == uiQualityId && (EP_SLICE == uiSliceType || EB_SLICE == uiSliceType) ){
-			const bool_t kbBipredFlag = (EB_SLICE == uiSliceType);
-			if ( kbBipredFlag )
-            {
-                WelsLog ( pCtx, WELS_LOG_WARNING, "ParseSliceHeaderSyntaxs(): kbBipredFlag = 1 not supported.\n");
-                return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_BIPRED);
-            }
-			pSliceHead->bNumRefIdxActiveOverrideFlag	= !!BsGetOneBit(pBs);
-			if ( pSliceHead->bNumRefIdxActiveOverrideFlag ){
-				pSliceHead->uiRefCount[0]	= 1 + BsGetUe(pBs);
-			}
-		}
-	}
-	else if(uiSliceType == P_SLICE || uiSliceType == SP_SLICE || uiSliceType == B_SLICE){
-		const bool_t kbBipredFlag = (B_SLICE == uiSliceType);
-        if( kbBipredFlag ){
-            WelsLog ( pCtx, WELS_LOG_WARNING, "ParseSliceHeaderSyntaxs(): kbBipredFlag = 1 not supported.\n");
-            return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_BIPRED);
-        }
-		pSliceHead->bNumRefIdxActiveOverrideFlag	= !!BsGetOneBit(pBs);
-		if( pSliceHead->bNumRefIdxActiveOverrideFlag ){
-			pSliceHead->uiRefCount[0]	= 1 + BsGetUe(pBs);
-		}
+  pSliceHead->iDeltaPicOrderCntBottom	= 0;
+  pSliceHead->iDeltaPicOrderCnt[0]		=
+    pSliceHead->iDeltaPicOrderCnt[1]		= 0;
+  if (pSps->uiPocType == 0) {
+    pSliceHead->iPicOrderCntLsb	= BsGetBits (pBs, pSps->iLog2MaxPocLsb);
+    if (pPps->bPicOrderPresentFlag && !pSliceHead->bFieldPicFlag) {
+      pSliceHead->iDeltaPicOrderCntBottom	= BsGetSe (pBs);
     }
+  } else if (pSps->uiPocType == 1 && !pSps->bDeltaPicOrderAlwaysZeroFlag) {
+    pSliceHead->iDeltaPicOrderCnt[0]	= BsGetSe (pBs);
+    if (pPps->bPicOrderPresentFlag && !pSliceHead->bFieldPicFlag)
+      pSliceHead->iDeltaPicOrderCnt[1] = BsGetSe (pBs);
+  }
 
-	if( pSliceHead->uiRefCount[0] > MAX_REF_PIC_COUNT || pSliceHead->uiRefCount[1] > MAX_REF_PIC_COUNT){
-		WelsLog( pCtx, WELS_LOG_WARNING, "reference overflow\n");
-		return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_REF_COUNT_OVERFLOW);
-	}
+  pSliceHead->iRedundantPicCnt	= 0;
+  if (pPps->bRedundantPicCntPresentFlag) {
+    pSliceHead->iRedundantPicCnt = BsGetUe (pBs);
+  }
 
-	if ( BASE_QUALITY_ID == uiQualityId ){
-		iRet = ParseRefPicListReordering(pBs, pSliceHead);
-		if (iRet != ERR_NONE){
-			WelsLog( pCtx, WELS_LOG_WARNING, "invalid ref pPic list reordering syntaxs!\n");
-			return iRet;
-		}
+  //set defaults, might be overriden a few line later
+  pSliceHead->uiRefCount[0]	= pPps->uiNumRefIdxL0Active;
+  pSliceHead->uiRefCount[1]	= pPps->uiNumRefIdxL1Active;
+  if (kbExtensionFlag) {
+    uiQualityId = pNalHeaderExt->uiQualityId;
+    if (BASE_QUALITY_ID == uiQualityId && (EP_SLICE == uiSliceType || EB_SLICE == uiSliceType)) {
+      const bool_t kbBipredFlag = (EB_SLICE == uiSliceType);
+      if (kbBipredFlag) {
+        WelsLog (pCtx, WELS_LOG_WARNING, "ParseSliceHeaderSyntaxs(): kbBipredFlag = 1 not supported.\n");
+        return GENERATE_ERROR_NO (ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_BIPRED);
+      }
+      pSliceHead->bNumRefIdxActiveOverrideFlag	= !!BsGetOneBit (pBs);
+      if (pSliceHead->bNumRefIdxActiveOverrideFlag) {
+        pSliceHead->uiRefCount[0]	= 1 + BsGetUe (pBs);
+      }
+    }
+  } else if (uiSliceType == P_SLICE || uiSliceType == SP_SLICE || uiSliceType == B_SLICE) {
+    const bool_t kbBipredFlag = (B_SLICE == uiSliceType);
+    if (kbBipredFlag) {
+      WelsLog (pCtx, WELS_LOG_WARNING, "ParseSliceHeaderSyntaxs(): kbBipredFlag = 1 not supported.\n");
+      return GENERATE_ERROR_NO (ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_BIPRED);
+    }
+    pSliceHead->bNumRefIdxActiveOverrideFlag	= !!BsGetOneBit (pBs);
+    if (pSliceHead->bNumRefIdxActiveOverrideFlag) {
+      pSliceHead->uiRefCount[0]	= 1 + BsGetUe (pBs);
+    }
+  }
 
-		if ( kbExtensionFlag ){
-			if ( pNalHeaderExt->iNoInterLayerPredFlag || pNalHeaderExt->uiQualityId > 0 )
-				pSliceHeadExt->bBasePredWeightTableFlag	= false;
-			else
-				pSliceHeadExt->bBasePredWeightTableFlag	= true;
-		}
+  if (pSliceHead->uiRefCount[0] > MAX_REF_PIC_COUNT || pSliceHead->uiRefCount[1] > MAX_REF_PIC_COUNT) {
+    WelsLog (pCtx, WELS_LOG_WARNING, "reference overflow\n");
+    return GENERATE_ERROR_NO (ERR_LEVEL_SLICE_HEADER, ERR_INFO_REF_COUNT_OVERFLOW);
+  }
 
-		if( kpCurNal->sNalHeaderExt.sNalUnitHeader.uiNalRefIdc != 0 ){
-			iRet = ParseDecRefPicMarking(pCtx, pBs, pSliceHead, pSps, bIdrFlag );
-			if (iRet != ERR_NONE){
-				return iRet;
-			}
+  if (BASE_QUALITY_ID == uiQualityId) {
+    iRet = ParseRefPicListReordering (pBs, pSliceHead);
+    if (iRet != ERR_NONE) {
+      WelsLog (pCtx, WELS_LOG_WARNING, "invalid ref pPic list reordering syntaxs!\n");
+      return iRet;
+    }
 
-			if ( kbExtensionFlag && !pSubsetSps->sSpsSvcExt.bSliceHeaderRestrictionFlag ){
-				pSliceHeadExt->bStoreRefBasePicFlag	= !!BsGetOneBit(pBs);
-				if ( (pNalHeaderExt->bUseRefBasePicFlag || pSliceHeadExt->bStoreRefBasePicFlag) && !bIdrFlag ){
-                    WelsLog( pCtx, WELS_LOG_WARNING, "ParseSliceHeaderSyntaxs(): bUseRefBasePicFlag or bStoreRefBasePicFlag = 1 not supported.\n" );
-                    return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_ILP);
-				}
-			}
-		}
-	}
-	
-	if ( pPps->bEntropyCodingModeFlag ){
-        WelsLog( pCtx, WELS_LOG_WARNING, "ParseSliceHeaderSyntaxs(): CABAC in Enhancement layer not supported.\n" );
-        return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_CABAC_EL);
-	}	
-	
-	pSliceHead->iSliceQpDelta	= BsGetSe(pBs);
-	pSliceHead->iSliceQp		= pPps->iPicInitQp + pSliceHead->iSliceQpDelta;	
-    if( pSliceHead->iSliceQp < 0 || pSliceHead->iSliceQp > 51 ){
-        WelsLog( pCtx, WELS_LOG_WARNING, "QP %d out of range\n", pSliceHead->iSliceQp);
-        return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_INVALID_QP);
+    if (kbExtensionFlag) {
+      if (pNalHeaderExt->iNoInterLayerPredFlag || pNalHeaderExt->uiQualityId > 0)
+        pSliceHeadExt->bBasePredWeightTableFlag	= false;
+      else
+        pSliceHeadExt->bBasePredWeightTableFlag	= true;
     }
-	
-    //FIXME qscale / qp ... stuff
-	if ( !kbExtensionFlag ){
-        if( uiSliceType == SP_SLICE || uiSliceType == SI_SLICE )
-        {
-            WelsLog( pCtx, WELS_LOG_WARNING, "SP/SI not supported\n");
-            return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_SPSI);
+
+    if (kpCurNal->sNalHeaderExt.sNalUnitHeader.uiNalRefIdc != 0) {
+      iRet = ParseDecRefPicMarking (pCtx, pBs, pSliceHead, pSps, bIdrFlag);
+      if (iRet != ERR_NONE) {
+        return iRet;
+      }
+
+      if (kbExtensionFlag && !pSubsetSps->sSpsSvcExt.bSliceHeaderRestrictionFlag) {
+        pSliceHeadExt->bStoreRefBasePicFlag	= !!BsGetOneBit (pBs);
+        if ((pNalHeaderExt->bUseRefBasePicFlag || pSliceHeadExt->bStoreRefBasePicFlag) && !bIdrFlag) {
+          WelsLog (pCtx, WELS_LOG_WARNING,
+                   "ParseSliceHeaderSyntaxs(): bUseRefBasePicFlag or bStoreRefBasePicFlag = 1 not supported.\n");
+          return GENERATE_ERROR_NO (ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_ILP);
         }
-	}
+      }
+    }
+  }
 
-	pSliceHead->uiDisableDeblockingFilterIdc	= 0;
-	pSliceHead->iSliceAlphaC0Offset			= 0;
-	pSliceHead->iSliceBetaOffset				= 0;
-	if ( pPps->bDeblockingFilterControlPresentFlag ){
-		pSliceHead->uiDisableDeblockingFilterIdc	= BsGetUe(pBs);
-		//refer to JVT-X201wcm1.doc G.7.4.3.4--2010.4.20
-		if ( pSliceHead->uiDisableDeblockingFilterIdc > 6 )
-		{
-			WelsLog( pCtx, WELS_LOG_WARNING, "disable_deblock_filter_idc (%d) out of range [0, 6]\n", pSliceHead->uiDisableDeblockingFilterIdc );
-			return ERR_INFO_INVALID_DBLOCKING_IDC;
-		}
-		if (pSliceHead->uiDisableDeblockingFilterIdc != 1){
-			pSliceHead->iSliceAlphaC0Offset	= BsGetSe(pBs) << 1;	// slice_alpha_c0_offset_div2
-			pSliceHead->iSliceBetaOffset		= BsGetSe(pBs) << 1;	// iSliceBetaOffset
-		}
-	}
-	
-	bSgChangeCycleInvolved	= (pPps->uiNumSliceGroups > 1 && pPps->uiSliceGroupMapType >= 3 && pPps->uiSliceGroupMapType <= 5);
-	if ( kbExtensionFlag && bSgChangeCycleInvolved )
-		bSgChangeCycleInvolved= (bSgChangeCycleInvolved && (uiQualityId == BASE_QUALITY_ID));
-	if ( bSgChangeCycleInvolved ){
-		if ( pPps->uiSliceGroupChangeRate > 0 ){
-			const int32_t kiNumBits = (int32_t)WELS_CEIL(log(static_cast<double>(1 + pPps->uiPicSizeInMapUnits / pPps->uiSliceGroupChangeRate)));
-			pSliceHead->iSliceGroupChangeCycle	= BsGetBits(pBs, kiNumBits);	// For FMO extra types
-		}
-		else
-			pSliceHead->iSliceGroupChangeCycle	= 0;
-	}   
+  if (pPps->bEntropyCodingModeFlag) {
+    WelsLog (pCtx, WELS_LOG_WARNING, "ParseSliceHeaderSyntaxs(): CABAC in Enhancement layer not supported.\n");
+    return GENERATE_ERROR_NO (ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_CABAC_EL);
+  }
 
-	if ( !kbExtensionFlag ){
-		FillDefaultSliceHeaderExt ( pSliceHeadExt, pNalHeaderExt );
-	}
-	else{
-		/* Extra syntax elements newly introduced */
-		pSliceHeadExt->pSubsetSps	= pSubsetSps;
-		
-		if ( !pNalHeaderExt->iNoInterLayerPredFlag && BASE_QUALITY_ID == uiQualityId ){
-            //the following should be deleted for CODE_CLEAN
-			pSliceHeadExt->uiRefLayerDqId	= BsGetUe(pBs);
-			if ( pSubsetSps->sSpsSvcExt.bInterLayerDeblockingFilterCtrlPresentFlag )
-			{
-				pSliceHeadExt->uiDisableInterLayerDeblockingFilterIdc	= BsGetUe(pBs);
-				//refer to JVT-X201wcm1.doc G.7.4.3.4--2010.4.20
-				if ( pSliceHeadExt->uiDisableInterLayerDeblockingFilterIdc > 6 ) 
-				{
-					WelsLog( pCtx, WELS_LOG_WARNING, "disable_inter_layer_deblock_filter_idc (%d) out of range [0, 6]\n", pSliceHeadExt->uiDisableInterLayerDeblockingFilterIdc );
-					return ERR_INFO_INVALID_DBLOCKING_IDC;
-				}
-				if ( pSliceHeadExt->uiDisableInterLayerDeblockingFilterIdc != 1 ){
-					pSliceHeadExt->iInterLayerSliceAlphaC0Offset	= BsGetSe(pBs) << 1;
-					pSliceHeadExt->iInterLayerSliceBetaOffset		= BsGetSe(pBs) << 1;
-				}
-			}
+  pSliceHead->iSliceQpDelta	= BsGetSe (pBs);
+  pSliceHead->iSliceQp		= pPps->iPicInitQp + pSliceHead->iSliceQpDelta;
+  if (pSliceHead->iSliceQp < 0 || pSliceHead->iSliceQp > 51) {
+    WelsLog (pCtx, WELS_LOG_WARNING, "QP %d out of range\n", pSliceHead->iSliceQp);
+    return GENERATE_ERROR_NO (ERR_LEVEL_SLICE_HEADER, ERR_INFO_INVALID_QP);
+  }
 
-			pSliceHeadExt->uiRefLayerChromaPhaseXPlus1Flag	= pSubsetSps->sSpsSvcExt.uiSeqRefLayerChromaPhaseXPlus1Flag;
-			pSliceHeadExt->uiRefLayerChromaPhaseYPlus1		= pSubsetSps->sSpsSvcExt.uiSeqRefLayerChromaPhaseYPlus1;
+  //FIXME qscale / qp ... stuff
+  if (!kbExtensionFlag) {
+    if (uiSliceType == SP_SLICE || uiSliceType == SI_SLICE) {
+      WelsLog (pCtx, WELS_LOG_WARNING, "SP/SI not supported\n");
+      return GENERATE_ERROR_NO (ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_SPSI);
+    }
+  }
 
-			pSliceHeadExt->bConstrainedIntraResamplingFlag	= !!BsGetOneBit(pBs);
+  pSliceHead->uiDisableDeblockingFilterIdc	= 0;
+  pSliceHead->iSliceAlphaC0Offset			= 0;
+  pSliceHead->iSliceBetaOffset				= 0;
+  if (pPps->bDeblockingFilterControlPresentFlag) {
+    pSliceHead->uiDisableDeblockingFilterIdc	= BsGetUe (pBs);
+    //refer to JVT-X201wcm1.doc G.7.4.3.4--2010.4.20
+    if (pSliceHead->uiDisableDeblockingFilterIdc > 6) {
+      WelsLog (pCtx, WELS_LOG_WARNING, "disable_deblock_filter_idc (%d) out of range [0, 6]\n",
+               pSliceHead->uiDisableDeblockingFilterIdc);
+      return ERR_INFO_INVALID_DBLOCKING_IDC;
+    }
+    if (pSliceHead->uiDisableDeblockingFilterIdc != 1) {
+      pSliceHead->iSliceAlphaC0Offset	= BsGetSe (pBs) << 1;	// slice_alpha_c0_offset_div2
+      pSliceHead->iSliceBetaOffset		= BsGetSe (pBs) << 1;	// iSliceBetaOffset
+    }
+  }
 
-            {
-                SPosOffset pos;
-			    pos.iLeftOffset	= pSubsetSps->sSpsSvcExt.sSeqScaledRefLayer.iLeftOffset;
-			    pos.iTopOffset	= pSubsetSps->sSpsSvcExt.sSeqScaledRefLayer.iTopOffset * (2 - pSps->bFrameMbsOnlyFlag);
-			    pos.iRightOffset= pSubsetSps->sSpsSvcExt.sSeqScaledRefLayer.iRightOffset;
-			    pos.iBottomOffset=pSubsetSps->sSpsSvcExt.sSeqScaledRefLayer.iBottomOffset * (2 - pSps->bFrameMbsOnlyFlag);				
-			    //memcpy(&pSliceHeadExt->sScaledRefLayer, &pos, sizeof(SPosOffset));//confirmed_safe_unsafe_usage
-			    pSliceHeadExt->iScaledRefLayerPicWidthInSampleLuma	= (pSliceHead->iMbWidth << 4) - (pos.iLeftOffset+pos.iRightOffset);
-			    pSliceHeadExt->iScaledRefLayerPicHeightInSampleLuma	= (pSliceHead->iMbHeight << 4) - (pos.iTopOffset+pos.iBottomOffset) / (1 + pSliceHead->bFieldPicFlag);
-            }
-		}
-		else if (uiQualityId > BASE_QUALITY_ID){
-            WelsLog( pCtx, WELS_LOG_WARNING, "MGS not supported.\n" );
-            return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_MGS);
-			pSliceHeadExt->uiRefLayerDqId	= pNalHeaderExt->uiLayerDqId - 1;
-		}
-		else{
-			pSliceHeadExt->uiRefLayerDqId	= (uint8_t)-1;
-		}
+  bSgChangeCycleInvolved	= (pPps->uiNumSliceGroups > 1 && pPps->uiSliceGroupMapType >= 3
+                             && pPps->uiSliceGroupMapType <= 5);
+  if (kbExtensionFlag && bSgChangeCycleInvolved)
+    bSgChangeCycleInvolved = (bSgChangeCycleInvolved && (uiQualityId == BASE_QUALITY_ID));
+  if (bSgChangeCycleInvolved) {
+    if (pPps->uiSliceGroupChangeRate > 0) {
+      const int32_t kiNumBits = (int32_t)WELS_CEIL (log (static_cast<double> (1 + pPps->uiPicSizeInMapUnits /
+                                pPps->uiSliceGroupChangeRate)));
+      pSliceHead->iSliceGroupChangeCycle	= BsGetBits (pBs, kiNumBits);	// For FMO extra types
+    } else
+      pSliceHead->iSliceGroupChangeCycle	= 0;
+  }
 
-		pSliceHeadExt->bSliceSkipFlag	= false;
-		pSliceHeadExt->bAdaptiveBaseModeFlag	= false;
-		pSliceHeadExt->bDefaultBaseModeFlag	= false;
-		pSliceHeadExt->bAdaptiveMotionPredFlag	= false;
-		pSliceHeadExt->bDefaultMotionPredFlag	= false;
-		pSliceHeadExt->bAdaptiveResidualPredFlag	= false;
-		pSliceHeadExt->bDefaultResidualPredFlag	= false;
-		if ( pNalHeaderExt->iNoInterLayerPredFlag )
-			pSliceHeadExt->bTCoeffLevelPredFlag	= false;
-		else
-			pSliceHeadExt->bTCoeffLevelPredFlag	= pSubsetSps->sSpsSvcExt.bSeqTCoeffLevelPredFlag;
+  if (!kbExtensionFlag) {
+    FillDefaultSliceHeaderExt (pSliceHeadExt, pNalHeaderExt);
+  } else {
+    /* Extra syntax elements newly introduced */
+    pSliceHeadExt->pSubsetSps	= pSubsetSps;
 
-		if ( !pNalHeaderExt->iNoInterLayerPredFlag ){
-			pSliceHeadExt->bSliceSkipFlag	= !!BsGetOneBit(pBs);
-			if ( pSliceHeadExt->bSliceSkipFlag ){
-				pSliceHeadExt->uiNumMbsInSlice	= 1 + BsGetUe(pBs);
-			}
-			else{
-				pSliceHeadExt->bAdaptiveBaseModeFlag	= !!BsGetOneBit(pBs);
-				if ( !pSliceHeadExt->bAdaptiveBaseModeFlag ){
-					pSliceHeadExt->bDefaultBaseModeFlag	= !!BsGetOneBit(pBs);
-				}
-				if ( !pSliceHeadExt->bDefaultBaseModeFlag ){
-					pSliceHeadExt->bAdaptiveMotionPredFlag	= !!BsGetOneBit(pBs);
-					if ( !pSliceHeadExt->bAdaptiveMotionPredFlag )
-						pSliceHeadExt->bDefaultMotionPredFlag	= !!BsGetOneBit(pBs);
-				}
+    if (!pNalHeaderExt->iNoInterLayerPredFlag && BASE_QUALITY_ID == uiQualityId) {
+      //the following should be deleted for CODE_CLEAN
+      pSliceHeadExt->uiRefLayerDqId	= BsGetUe (pBs);
+      if (pSubsetSps->sSpsSvcExt.bInterLayerDeblockingFilterCtrlPresentFlag) {
+        pSliceHeadExt->uiDisableInterLayerDeblockingFilterIdc	= BsGetUe (pBs);
+        //refer to JVT-X201wcm1.doc G.7.4.3.4--2010.4.20
+        if (pSliceHeadExt->uiDisableInterLayerDeblockingFilterIdc > 6) {
+          WelsLog (pCtx, WELS_LOG_WARNING, "disable_inter_layer_deblock_filter_idc (%d) out of range [0, 6]\n",
+                   pSliceHeadExt->uiDisableInterLayerDeblockingFilterIdc);
+          return ERR_INFO_INVALID_DBLOCKING_IDC;
+        }
+        if (pSliceHeadExt->uiDisableInterLayerDeblockingFilterIdc != 1) {
+          pSliceHeadExt->iInterLayerSliceAlphaC0Offset	= BsGetSe (pBs) << 1;
+          pSliceHeadExt->iInterLayerSliceBetaOffset		= BsGetSe (pBs) << 1;
+        }
+      }
 
-				pSliceHeadExt->bAdaptiveResidualPredFlag	= !!BsGetOneBit(pBs);
-				if ( !pSliceHeadExt->bAdaptiveResidualPredFlag ){
-					pSliceHeadExt->bDefaultResidualPredFlag = !!BsGetOneBit(pBs);
-				}
-			}
-			if ( pSubsetSps->sSpsSvcExt.bAdaptiveTCoeffLevelPredFlag )
-				pSliceHeadExt->bTCoeffLevelPredFlag	= !!BsGetOneBit(pBs);
-		}
+      pSliceHeadExt->uiRefLayerChromaPhaseXPlus1Flag	= pSubsetSps->sSpsSvcExt.uiSeqRefLayerChromaPhaseXPlus1Flag;
+      pSliceHeadExt->uiRefLayerChromaPhaseYPlus1		= pSubsetSps->sSpsSvcExt.uiSeqRefLayerChromaPhaseYPlus1;
 
-		if ( !pSubsetSps->sSpsSvcExt.bSliceHeaderRestrictionFlag )
-		{
-			pSliceHeadExt->uiScanIdxStart	= BsGetBits(pBs, 4);
-			pSliceHeadExt->uiScanIdxEnd	= BsGetBits(pBs, 4);
-            if( pSliceHeadExt->uiScanIdxStart != 0 || pSliceHeadExt->uiScanIdxEnd != 15 )
-            {
-                WelsLog( pCtx, WELS_LOG_WARNING, "uiScanIdxStart (%d) != 0 and uiScanIdxEnd (%d) !=15 not supported here\n", pSliceHeadExt->uiScanIdxStart, pSliceHeadExt->uiScanIdxEnd );
-                return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_MGS);
-            }
-		}
-		else{
-			pSliceHeadExt->uiScanIdxStart	= 0;
-			pSliceHeadExt->uiScanIdxEnd	= 15;
-		}
-	}
+      pSliceHeadExt->bConstrainedIntraResamplingFlag	= !!BsGetOneBit (pBs);
 
-	return ERR_NONE;
+      {
+        SPosOffset pos;
+        pos.iLeftOffset	= pSubsetSps->sSpsSvcExt.sSeqScaledRefLayer.iLeftOffset;
+        pos.iTopOffset	= pSubsetSps->sSpsSvcExt.sSeqScaledRefLayer.iTopOffset * (2 - pSps->bFrameMbsOnlyFlag);
+        pos.iRightOffset = pSubsetSps->sSpsSvcExt.sSeqScaledRefLayer.iRightOffset;
+        pos.iBottomOffset = pSubsetSps->sSpsSvcExt.sSeqScaledRefLayer.iBottomOffset * (2 - pSps->bFrameMbsOnlyFlag);
+        //memcpy(&pSliceHeadExt->sScaledRefLayer, &pos, sizeof(SPosOffset));//confirmed_safe_unsafe_usage
+        pSliceHeadExt->iScaledRefLayerPicWidthInSampleLuma	= (pSliceHead->iMbWidth << 4) - (pos.iLeftOffset + pos.iRightOffset);
+        pSliceHeadExt->iScaledRefLayerPicHeightInSampleLuma	= (pSliceHead->iMbHeight << 4) -
+            (pos.iTopOffset + pos.iBottomOffset) / (1 + pSliceHead->bFieldPicFlag);
+      }
+    } else if (uiQualityId > BASE_QUALITY_ID) {
+      WelsLog (pCtx, WELS_LOG_WARNING, "MGS not supported.\n");
+      return GENERATE_ERROR_NO (ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_MGS);
+      pSliceHeadExt->uiRefLayerDqId	= pNalHeaderExt->uiLayerDqId - 1;
+    } else {
+      pSliceHeadExt->uiRefLayerDqId	= (uint8_t) - 1;
+    }
+
+    pSliceHeadExt->bSliceSkipFlag	= false;
+    pSliceHeadExt->bAdaptiveBaseModeFlag	= false;
+    pSliceHeadExt->bDefaultBaseModeFlag	= false;
+    pSliceHeadExt->bAdaptiveMotionPredFlag	= false;
+    pSliceHeadExt->bDefaultMotionPredFlag	= false;
+    pSliceHeadExt->bAdaptiveResidualPredFlag	= false;
+    pSliceHeadExt->bDefaultResidualPredFlag	= false;
+    if (pNalHeaderExt->iNoInterLayerPredFlag)
+      pSliceHeadExt->bTCoeffLevelPredFlag	= false;
+    else
+      pSliceHeadExt->bTCoeffLevelPredFlag	= pSubsetSps->sSpsSvcExt.bSeqTCoeffLevelPredFlag;
+
+    if (!pNalHeaderExt->iNoInterLayerPredFlag) {
+      pSliceHeadExt->bSliceSkipFlag	= !!BsGetOneBit (pBs);
+      if (pSliceHeadExt->bSliceSkipFlag) {
+        pSliceHeadExt->uiNumMbsInSlice	= 1 + BsGetUe (pBs);
+      } else {
+        pSliceHeadExt->bAdaptiveBaseModeFlag	= !!BsGetOneBit (pBs);
+        if (!pSliceHeadExt->bAdaptiveBaseModeFlag) {
+          pSliceHeadExt->bDefaultBaseModeFlag	= !!BsGetOneBit (pBs);
+        }
+        if (!pSliceHeadExt->bDefaultBaseModeFlag) {
+          pSliceHeadExt->bAdaptiveMotionPredFlag	= !!BsGetOneBit (pBs);
+          if (!pSliceHeadExt->bAdaptiveMotionPredFlag)
+            pSliceHeadExt->bDefaultMotionPredFlag	= !!BsGetOneBit (pBs);
+        }
+
+        pSliceHeadExt->bAdaptiveResidualPredFlag	= !!BsGetOneBit (pBs);
+        if (!pSliceHeadExt->bAdaptiveResidualPredFlag) {
+          pSliceHeadExt->bDefaultResidualPredFlag = !!BsGetOneBit (pBs);
+        }
+      }
+      if (pSubsetSps->sSpsSvcExt.bAdaptiveTCoeffLevelPredFlag)
+        pSliceHeadExt->bTCoeffLevelPredFlag	= !!BsGetOneBit (pBs);
+    }
+
+    if (!pSubsetSps->sSpsSvcExt.bSliceHeaderRestrictionFlag) {
+      pSliceHeadExt->uiScanIdxStart	= BsGetBits (pBs, 4);
+      pSliceHeadExt->uiScanIdxEnd	= BsGetBits (pBs, 4);
+      if (pSliceHeadExt->uiScanIdxStart != 0 || pSliceHeadExt->uiScanIdxEnd != 15) {
+        WelsLog (pCtx, WELS_LOG_WARNING, "uiScanIdxStart (%d) != 0 and uiScanIdxEnd (%d) !=15 not supported here\n",
+                 pSliceHeadExt->uiScanIdxStart, pSliceHeadExt->uiScanIdxEnd);
+        return GENERATE_ERROR_NO (ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_MGS);
+      }
+    } else {
+      pSliceHeadExt->uiScanIdxStart	= 0;
+      pSliceHeadExt->uiScanIdxEnd	= 15;
+    }
+  }
+
+  return ERR_NONE;
 }
 
 /*
@@ -890,397 +842,390 @@
  *	pSrc:	mark as decoded prefix NAL
  *	ppDst:	succeeded VCL NAL based AVC (I/P Slice)
  */
-bool_t PrefetchNalHeaderExtSyntax ( PWelsDecoderContext pCtx, PNalUnit const kppDst, PNalUnit const kpSrc)
-{
-	PNalUnitHeaderExt pNalHdrExtD	= NULL, pNalHdrExtS = NULL;
-	PSliceHeaderExt pShExtD = NULL;
-	PPrefixNalUnit pPrefixS = NULL;
-	PSps pSps = NULL;
-	int32_t iIdx = 0;
-	
-	if ( kppDst == NULL || kpSrc == NULL )
-		return false;	
-	
-	pNalHdrExtD	= &kppDst->sNalHeaderExt;
-	pNalHdrExtS	= &kpSrc->sNalHeaderExt;
-	pShExtD		= &kppDst->sNalData.sVclNal.sSliceHeaderExt;
-	pPrefixS		= &kpSrc->sNalData.sPrefixNal;
-	pSps			= &pCtx->sSpsBuffer[pCtx->sPpsBuffer[pShExtD->sSliceHeader.iPpsId].iSpsId];
-	
-	pNalHdrExtD->uiDependencyId	    = pNalHdrExtS->uiDependencyId;
-	pNalHdrExtD->uiQualityId		= pNalHdrExtS->uiQualityId;
-	pNalHdrExtD->uiTemporalId		= pNalHdrExtS->uiTemporalId;
-	pNalHdrExtD->uiPriorityId		= pNalHdrExtS->uiPriorityId;
-	pNalHdrExtD->bIdrFlag			= pNalHdrExtS->bIdrFlag;
-	pNalHdrExtD->iNoInterLayerPredFlag	= pNalHdrExtS->iNoInterLayerPredFlag;
-	pNalHdrExtD->bDiscardableFlag			= pNalHdrExtS->bDiscardableFlag;
-	pNalHdrExtD->bOutputFlag				= pNalHdrExtS->bOutputFlag;
-	pNalHdrExtD->bUseRefBasePicFlag	= pNalHdrExtS->bUseRefBasePicFlag;
-	pNalHdrExtD->uiLayerDqId				= pNalHdrExtS->uiLayerDqId;
-	
-	pShExtD->bStoreRefBasePicFlag		= pPrefixS->bStoreRefBasePicFlag;
-	memcpy(&pShExtD->sRefBasePicMarking, &pPrefixS->sRefPicBaseMarking, sizeof(SRefBasePicMarking));//confirmed_safe_unsafe_usage
-	if (pShExtD->sRefBasePicMarking.bAdaptiveRefBasePicMarkingModeFlag){
-		PRefBasePicMarking pRefBasePicMarking = &pShExtD->sRefBasePicMarking;
-		iIdx = 0;
-		do {
-			if (pRefBasePicMarking->mmco_base[iIdx].uiMmcoType == MMCO_END)
-				break;
-			if (pRefBasePicMarking->mmco_base[iIdx].uiMmcoType == MMCO_SHORT2UNUSED)
-				pRefBasePicMarking->mmco_base[iIdx].iShortFrameNum = (pShExtD->sSliceHeader.iFrameNum-pRefBasePicMarking->mmco_base[iIdx].uiDiffOfPicNums) & ((1<<pSps->uiLog2MaxFrameNum)-1);
-			++ iIdx;
-		} while(iIdx < MAX_MMCO_COUNT);
-	}
-	
-	return true;
+bool_t PrefetchNalHeaderExtSyntax (PWelsDecoderContext pCtx, PNalUnit const kppDst, PNalUnit const kpSrc) {
+  // Propagates the header extension and base-picture marking of a decoded prefix NAL (kpSrc)
+  // to the VCL NAL that follows it (kppDst). Returns false if either NAL pointer is NULL.
+  if (NULL == kppDst || NULL == kpSrc)
+    return false;
+
+  PNalUnitHeaderExt pDstHdrExt = &kppDst->sNalHeaderExt;
+  PNalUnitHeaderExt pSrcHdrExt = &kpSrc->sNalHeaderExt;
+  PSliceHeaderExt pDstShExt = &kppDst->sNalData.sVclNal.sSliceHeaderExt;
+  PPrefixNalUnit pSrcPrefix = &kpSrc->sNalData.sPrefixNal;
+  // SPS referenced through the PPS id of the destination slice header.
+  PSps pRefSps = &pCtx->sSpsBuffer[pCtx->sPpsBuffer[pDstShExt->sSliceHeader.iPpsId].iSpsId];
+
+  // Field-by-field copy of the NAL unit header extension.
+  pDstHdrExt->uiDependencyId = pSrcHdrExt->uiDependencyId;
+  pDstHdrExt->uiQualityId = pSrcHdrExt->uiQualityId;
+  pDstHdrExt->uiTemporalId = pSrcHdrExt->uiTemporalId;
+  pDstHdrExt->uiPriorityId = pSrcHdrExt->uiPriorityId;
+  pDstHdrExt->bIdrFlag = pSrcHdrExt->bIdrFlag;
+  pDstHdrExt->iNoInterLayerPredFlag = pSrcHdrExt->iNoInterLayerPredFlag;
+  pDstHdrExt->bDiscardableFlag = pSrcHdrExt->bDiscardableFlag;
+  pDstHdrExt->bOutputFlag = pSrcHdrExt->bOutputFlag;
+  pDstHdrExt->bUseRefBasePicFlag = pSrcHdrExt->bUseRefBasePicFlag;
+  pDstHdrExt->uiLayerDqId = pSrcHdrExt->uiLayerDqId;
+
+  pDstShExt->bStoreRefBasePicFlag = pSrcPrefix->bStoreRefBasePicFlag;
+  memcpy (&pDstShExt->sRefBasePicMarking, &pSrcPrefix->sRefPicBaseMarking,
+          sizeof (SRefBasePicMarking)); //confirmed_safe_unsafe_usage
+  PRefBasePicMarking pMarking = &pDstShExt->sRefBasePicMarking;
+  if (!pMarking->bAdaptiveRefBasePicMarkingModeFlag)
+    return true;
+
+  // Walk the MMCO list up to MMCO_END, resolving the frame number of each MMCO_SHORT2UNUSED entry.
+  int32_t iMmco = 0;
+  do {
+    if (pMarking->mmco_base[iMmco].uiMmcoType == MMCO_END)
+      break;
+    if (pMarking->mmco_base[iMmco].uiMmcoType == MMCO_SHORT2UNUSED)
+      pMarking->mmco_base[iMmco].iShortFrameNum = (pDstShExt->sSliceHeader.iFrameNum -
+          pMarking->mmco_base[iMmco].uiDiffOfPicNums) & ((1 << pRefSps->uiLog2MaxFrameNum) - 1);
+    ++ iMmco;
+  } while (iMmco < MAX_MMCO_COUNT);
+
+  return true;
 }
 
 
 
-int32_t UpdateAccessUnit ( PWelsDecoderContext pCtx )
-{
-	PAccessUnit pCurAu	= pCtx->pAccessUnitList;	
-	int32_t iIdx         = pCurAu->uiEndPos;
-	
-	// Conversed iterator
-	pCtx->uiTargetDqId = pCurAu->pNalUnitsList[iIdx]->sNalHeaderExt.uiLayerDqId;
-	pCurAu->uiActualUnitsNum  = iIdx + 1;
-	pCurAu->bCompletedAuFlag = true;	
+int32_t UpdateAccessUnit (PWelsDecoderContext pCtx) {
+  PAccessUnit pCurAu	= pCtx->pAccessUnitList;
+  int32_t iIdx         = pCurAu->uiEndPos;
 
-	// Added for mosaic avoidance, 11/19/2009
+  // The NAL at uiEndPos closes this AU: it supplies the target DQ id, and the AU holds iIdx + 1 units.
+  pCtx->uiTargetDqId = pCurAu->pNalUnitsList[iIdx]->sNalHeaderExt.uiLayerDqId;
+  pCurAu->uiActualUnitsNum  = iIdx + 1;
+  pCurAu->bCompletedAuFlag = true;
+
+  // Added for mosaic avoidance, 11/19/2009: while the loss flag below is set, the AU must contain an IDR NAL.
 #ifdef LONG_TERM_REF
-	if ( pCtx->bParamSetsLostFlag )
+  if (pCtx->bParamSetsLostFlag)
 #else
-	if ( pCtx->bReferenceLostAtT0Flag )
-#endif 
-	{
-		uint32_t uiActualIdx = 0;
-		while ( uiActualIdx < pCurAu->uiActualUnitsNum ) {
-			PNalUnit nal = pCurAu->pNalUnitsList[uiActualIdx];
+  if (pCtx->bReferenceLostAtT0Flag)
+#endif
+  {
+    uint32_t uiActualIdx = 0;
+    while (uiActualIdx < pCurAu->uiActualUnitsNum) {
+      PNalUnit nal = pCurAu->pNalUnitsList[uiActualIdx];
 
-			if ( nal->sNalHeaderExt.sNalUnitHeader.eNalUnitType == NAL_UNIT_CODED_SLICE_IDR || nal->sNalHeaderExt.bIdrFlag )
-			{
-				break;
-			}
-			++ uiActualIdx;
-		}
-		if ( uiActualIdx == pCurAu->uiActualUnitsNum )	// no found IDR nal within incoming AU, need exit to avoid mosaic issue, 11/19/2009
-		{
-			WelsLog( pCtx, WELS_LOG_WARNING, "UpdateAccessUnit():::::Key frame lost.....CAN NOT find IDR from current AU.\n" );
+      if (nal->sNalHeaderExt.sNalUnitHeader.eNalUnitType == NAL_UNIT_CODED_SLICE_IDR || nal->sNalHeaderExt.bIdrFlag) {
+        break;
+      }
+      ++ uiActualIdx;
+    }
+    // No IDR NAL found within the incoming AU: bail out to avoid mosaic artifacts, 11/19/2009
+    if (uiActualIdx == pCurAu->uiActualUnitsNum) {
+      WelsLog (pCtx, WELS_LOG_WARNING, "UpdateAccessUnit():::::Key frame lost.....CAN NOT find IDR from current AU.\n");
 #ifdef LONG_TERM_REF
-			pCtx->iErrorCode |= dsNoParamSets;
-			return dsNoParamSets;
+      pCtx->iErrorCode |= dsNoParamSets;
+      return dsNoParamSets;
 #else
-			pCtx->iErrorCode |= dsRefLost;
-			return ERR_INFO_REFERENCE_PIC_LOST;
-#endif			
-		}
-	}		
-	
-	return ERR_NONE;
+      pCtx->iErrorCode |= dsRefLost;
+      return ERR_INFO_REFERENCE_PIC_LOST;
+#endif
+    }
+  }
+
+  return ERR_NONE;
 }
 
-int32_t InitialDqLayersContext ( PWelsDecoderContext pCtx, const int32_t kiMaxWidth, const int32_t kiMaxHeight )
-{
-	const int32_t kiPicStride		= ((kiMaxWidth + 15) & 0xfffff0) + (PADDING_LENGTH<<1);
-	const int32_t kiPicLines		= ((kiMaxHeight + 15) & 0xfffff0);
-		
-	int32_t i = 0;
+int32_t InitialDqLayersContext (PWelsDecoderContext pCtx, const int32_t kiMaxWidth, const int32_t kiMaxHeight) {
+  const int32_t kiPicStride		= ((kiMaxWidth + 15) & 0xfffff0) + (PADDING_LENGTH << 1);
+  const int32_t kiPicLines		= ((kiMaxHeight + 15) & 0xfffff0);
 
-	WELS_VERIFY_RETURN_IF( ERR_INFO_INVALID_PARAM, ( NULL == pCtx || kiMaxWidth <= 0 || kiMaxHeight <= 0 ) )
-	pCtx->sMb.iMbWidth		= (kiMaxWidth + 15) >> 4;
-	pCtx->sMb.iMbHeight		= (kiMaxHeight + 15) >> 4;
+  int32_t i = 0;
 
-	if ( pCtx->bInitialDqLayersMem && kiMaxWidth <= pCtx->iPicWidthReq && kiMaxHeight <= pCtx->iPicHeightReq )	// have same dimension memory, skipped
-		return ERR_NONE;
+  WELS_VERIFY_RETURN_IF (ERR_INFO_INVALID_PARAM, (NULL == pCtx || kiMaxWidth <= 0 || kiMaxHeight <= 0))
+  pCtx->sMb.iMbWidth		= (kiMaxWidth + 15) >> 4;
+  pCtx->sMb.iMbHeight		= (kiMaxHeight + 15) >> 4;
 
-	
-		UninitialDqLayersContext( pCtx );
-	
-		do {
-		PDqLayer pDq = (PDqLayer )WelsMalloc(sizeof(SDqLayer), "PDqLayer");
+  if (pCtx->bInitialDqLayersMem && kiMaxWidth <= pCtx->iPicWidthReq
+      && kiMaxHeight <= pCtx->iPicHeightReq)	// buffers already allocated for at least this size: keep them
+    return ERR_NONE;
 
-		int32_t iPlaneIdx = 0;
 
-		if ( pDq == NULL )
-			return ERR_INFO_OUT_OF_MEMORY;
-		
-		memset(pDq, 0, sizeof(SDqLayer));
-		if(pCtx->iDecoderMode == SW_MODE)
-		{
-		
-		do {
-			const int32_t kiHshift	= iPlaneIdx ? 1 : 0;
-			const int32_t kiVshift	= kiHshift;
-			const int32_t kiStride	= WELS_ALIGN( (kiPicStride >> kiHshift), (16 << (1-kiHshift)) );
-			const int32_t kiLine	= (kiPicLines + (PADDING_LENGTH<<1)) >> kiVshift;
-			const int32_t kiSize	= kiStride * kiLine;
+  UninitialDqLayersContext (pCtx);
 
-			pCtx->pCsListXchg[i][iPlaneIdx]	= (uint8_t *)WelsMalloc( kiSize * sizeof(uint8_t), "pCtx->pCsListXchg[][]" );
+  do {
+    PDqLayer pDq = (PDqLayer)WelsMalloc (sizeof (SDqLayer), "PDqLayer");
 
-			WELS_VERIFY_RETURN_IF( ERR_INFO_OUT_OF_MEMORY, (NULL == pCtx->pCsListXchg[i][iPlaneIdx]) )
-			pCtx->iCsStride[iPlaneIdx]	= kiStride;
+    int32_t iPlaneIdx = 0;
 
+    if (pDq == NULL) return ERR_INFO_OUT_OF_MEMORY;
+    pCtx->pDqLayersList[i] = pDq; // register early: cleanup skips layers whose entry is NULL
 
-			pCtx->pRsListXchg[i][iPlaneIdx]	= (int16_t *)WelsMalloc( kiSize * sizeof(int16_t), "pCtx->pRsListXchg[][]" );
+    memset (pDq, 0, sizeof (SDqLayer));
+    if (pCtx->iDecoderMode == SW_MODE) {
 
-			WELS_VERIFY_RETURN_IF( ERR_INFO_OUT_OF_MEMORY , (NULL == pCtx->pRsListXchg[i][iPlaneIdx]) )
-			pCtx->iRsStride[iPlaneIdx]	= kiStride;
-	
-			++ iPlaneIdx;
-		} while(iPlaneIdx < 3);
+      do {
+        const int32_t kiHshift	= iPlaneIdx ? 1 : 0;
+        const int32_t kiVshift	= kiHshift;
+        const int32_t kiStride	= WELS_ALIGN ((kiPicStride >> kiHshift), (16 << (1 - kiHshift)));
+        const int32_t kiLine	= (kiPicLines + (PADDING_LENGTH << 1)) >> kiVshift;
+        const int32_t kiSize	= kiStride * kiLine;
 
+        pCtx->pCsListXchg[i][iPlaneIdx]	= (uint8_t*)WelsMalloc (kiSize * sizeof (uint8_t), "pCtx->pCsListXchg[][]");
 
-		pCtx->sMb.pMbType[i] = (int8_t *)WelsMalloc( pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight *sizeof(int8_t), "pCtx->sMb.pMbType[]" );
-		pCtx->sMb.pMv[i][0] = (int16_t (*)[16][2])WelsMalloc( pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof(int16_t) * MV_A * MB_BLOCK4x4_NUM, "pCtx->sMb.pMv[][]"); 
-		pCtx->sMb.pRefIndex[i][0] = (int8_t (*)[MB_BLOCK4x4_NUM])WelsMalloc( pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof(int8_t) * MB_BLOCK4x4_NUM, "pCtx->sMb.pRefIndex[][]");
-		pCtx->sMb.pLumaQp[i] = (int8_t *)WelsMalloc( pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof(int8_t), "pCtx->sMb.pLumaQp[]");
-		pCtx->sMb.pChromaQp[i] = (int8_t *)WelsMalloc(pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof(int8_t), "pCtx->sMb.pChromaQp[]");
-        pCtx->sMb.pNzc[i] = (int8_t (*)[24])WelsMalloc(pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof(int8_t) * 24, "pCtx->sMb.pNzc[]");
-		pCtx->sMb.pNzcRs[i] = (int8_t (*)[24])WelsMalloc(pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof(int8_t) * 24, "pCtx->sMb.pNzcRs[]");
-		pCtx->sMb.pScaledTCoeff[i] = (int16_t (*)[MB_COEFF_LIST_SIZE])WelsMalloc(pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof(int16_t) * MB_COEFF_LIST_SIZE, "pCtx->sMb.pScaledTCoeff[]"); 
-		pCtx->sMb.pIntraPredMode[i] = (int8_t (*)[8])WelsMalloc(pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof(int8_t) * 8, "pCtx->sMb.pIntraPredMode[]");
-		pCtx->sMb.pIntra4x4FinalMode[i] = (int8_t (*)[MB_BLOCK4x4_NUM])WelsMalloc(pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof(int8_t) * MB_BLOCK4x4_NUM, "pCtx->sMb.pIntra4x4FinalMode[]");	
-		pCtx->sMb.pChromaPredMode[i] = (int8_t *)WelsMalloc(pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof(int8_t), "pCtx->sMb.pChromaPredMode[]");
-		pCtx->sMb.pCbp[i] = (int8_t *)WelsMalloc(pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight *sizeof(int8_t), "pCtx->sMb.pCbp[]");
-		pCtx->sMb.pSubMbType[i] = (int8_t (*)[MB_PARTITION_SIZE])WelsMalloc(pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof(int8_t) * MB_PARTITION_SIZE, "pCtx->sMb.pSubMbType[]");
-		pCtx->sMb.pSliceIdc[i] = (int32_t *) WelsMalloc(pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof(int32_t), "pCtx->sMb.pSliceIdc[]");	// using int32_t for slice_idc, 4/21/2010
-		if ( pCtx->sMb.pSliceIdc[i] != NULL )
-			memset(pCtx->sMb.pSliceIdc[i], 0xff, (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof(int32_t)) );
-		pCtx->sMb.pResidualPredFlag[i] = (int8_t *) WelsMalloc(pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof(int8_t), "pCtx->sMb.pResidualPredFlag[]");
-		//pCtx->sMb.pMotionPredFlag[i] = (uint8_t *) WelsMalloc(pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof(uint8_t), "pCtx->sMb.pMotionPredFlag[]");
-		pCtx->sMb.pInterPredictionDoneFlag[i] = (int8_t *) WelsMalloc(pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof(int8_t), "pCtx->sMb.pInterPredictionDoneFlag[]");
+        WELS_VERIFY_RETURN_IF (ERR_INFO_OUT_OF_MEMORY, (NULL == pCtx->pCsListXchg[i][iPlaneIdx]))
+        pCtx->iCsStride[iPlaneIdx]	= kiStride;
 
-		// check memory block valid due above allocated..
-		WELS_VERIFY_RETURN_IF	(	ERR_INFO_OUT_OF_MEMORY,
-                   			       ( (NULL == pCtx->sMb.pMbType[i]) ||
-								     (NULL == pCtx->sMb.pMv[i][0]) ||
-									 (NULL == pCtx->sMb.pRefIndex[i][0]) ||
-									 (NULL == pCtx->sMb.pLumaQp[i]) ||
-									 (NULL == pCtx->sMb.pChromaQp[i]) ||
-									 (NULL == pCtx->sMb.pNzc[i]) ||
-									 (NULL == pCtx->sMb.pNzcRs[i]) ||
-									 (NULL == pCtx->sMb.pScaledTCoeff[i]) ||
-									 (NULL == pCtx->sMb.pIntraPredMode[i]) ||
-									 (NULL == pCtx->sMb.pIntra4x4FinalMode[i]) ||
-									 (NULL == pCtx->sMb.pChromaPredMode[i]) ||
-									 (NULL == pCtx->sMb.pCbp[i]) ||
-									 (NULL == pCtx->sMb.pSubMbType[i]) ||
-									 (NULL == pCtx->sMb.pSliceIdc[i]) ||
-									 (NULL == pCtx->sMb.pResidualPredFlag[i]) ||
-									 (NULL == pCtx->sMb.pInterPredictionDoneFlag[i])
-									)
-								)
-		} // end of if(pCtx->iDecoderMode == SW_MODE)
-		
-		pCtx->pDqLayersList[i] = pDq;
-		++ i;
-	} while( i < LAYER_NUM_EXCHANGEABLE );
-	
 
-	pCtx->bInitialDqLayersMem	= true;
-	pCtx->iPicWidthReq			= kiMaxWidth;
-	pCtx->iPicHeightReq			= kiMaxHeight;
-	
-	return ERR_NONE;
+        pCtx->pRsListXchg[i][iPlaneIdx]	= (int16_t*)WelsMalloc (kiSize * sizeof (int16_t), "pCtx->pRsListXchg[][]");
+
+        WELS_VERIFY_RETURN_IF (ERR_INFO_OUT_OF_MEMORY , (NULL == pCtx->pRsListXchg[i][iPlaneIdx]))
+        pCtx->iRsStride[iPlaneIdx]	= kiStride;
+
+        ++ iPlaneIdx;
+      } while (iPlaneIdx < 3);
+
+
+      pCtx->sMb.pMbType[i] = (int8_t*)WelsMalloc (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t),
+                             "pCtx->sMb.pMbType[]");
+      pCtx->sMb.pMv[i][0] = (int16_t (*)[16][2])WelsMalloc (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
+                              int16_t) * MV_A * MB_BLOCK4x4_NUM, "pCtx->sMb.pMv[][]");
+      pCtx->sMb.pRefIndex[i][0] = (int8_t (*)[MB_BLOCK4x4_NUM])WelsMalloc (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
+                                    int8_t) * MB_BLOCK4x4_NUM, "pCtx->sMb.pRefIndex[][]");
+      pCtx->sMb.pLumaQp[i] = (int8_t*)WelsMalloc (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t),
+                             "pCtx->sMb.pLumaQp[]");
+      pCtx->sMb.pChromaQp[i] = (int8_t*)WelsMalloc (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t),
+                               "pCtx->sMb.pChromaQp[]");
+      pCtx->sMb.pNzc[i] = (int8_t (*)[24])WelsMalloc (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t) * 24,
+                          "pCtx->sMb.pNzc[]");
+      pCtx->sMb.pNzcRs[i] = (int8_t (*)[24])WelsMalloc (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t) * 24,
+                            "pCtx->sMb.pNzcRs[]");
+      pCtx->sMb.pScaledTCoeff[i] = (int16_t (*)[MB_COEFF_LIST_SIZE])WelsMalloc (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight *
+                                   sizeof (int16_t) * MB_COEFF_LIST_SIZE, "pCtx->sMb.pScaledTCoeff[]");
+      pCtx->sMb.pIntraPredMode[i] = (int8_t (*)[8])WelsMalloc (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t) * 8,
+                                    "pCtx->sMb.pIntraPredMode[]");
+      pCtx->sMb.pIntra4x4FinalMode[i] = (int8_t (*)[MB_BLOCK4x4_NUM])WelsMalloc (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight *
+                                        sizeof (int8_t) * MB_BLOCK4x4_NUM, "pCtx->sMb.pIntra4x4FinalMode[]");
+      pCtx->sMb.pChromaPredMode[i] = (int8_t*)WelsMalloc (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t),
+                                     "pCtx->sMb.pChromaPredMode[]");
+      pCtx->sMb.pCbp[i] = (int8_t*)WelsMalloc (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t),
+                          "pCtx->sMb.pCbp[]");
+      pCtx->sMb.pSubMbType[i] = (int8_t (*)[MB_PARTITION_SIZE])WelsMalloc (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
+                                  int8_t) * MB_PARTITION_SIZE, "pCtx->sMb.pSubMbType[]");
+      pCtx->sMb.pSliceIdc[i] = (int32_t*) WelsMalloc (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int32_t),
+                               "pCtx->sMb.pSliceIdc[]");	// using int32_t for slice_idc, 4/21/2010
+      if (pCtx->sMb.pSliceIdc[i] != NULL)
+        memset (pCtx->sMb.pSliceIdc[i], 0xff, (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int32_t)));
+      pCtx->sMb.pResidualPredFlag[i] = (int8_t*) WelsMalloc (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t),
+                                       "pCtx->sMb.pResidualPredFlag[]");
+      //pCtx->sMb.pMotionPredFlag[i] = (uint8_t *) WelsMalloc(pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof(uint8_t), "pCtx->sMb.pMotionPredFlag[]");
+      pCtx->sMb.pInterPredictionDoneFlag[i] = (int8_t*) WelsMalloc (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
+          int8_t), "pCtx->sMb.pInterPredictionDoneFlag[]");
+
+      // check memory block valid due above allocated..
+      WELS_VERIFY_RETURN_IF (ERR_INFO_OUT_OF_MEMORY,
+                             ((NULL == pCtx->sMb.pMbType[i]) ||
+                              (NULL == pCtx->sMb.pMv[i][0]) ||
+                              (NULL == pCtx->sMb.pRefIndex[i][0]) ||
+                              (NULL == pCtx->sMb.pLumaQp[i]) ||
+                              (NULL == pCtx->sMb.pChromaQp[i]) ||
+                              (NULL == pCtx->sMb.pNzc[i]) ||
+                              (NULL == pCtx->sMb.pNzcRs[i]) ||
+                              (NULL == pCtx->sMb.pScaledTCoeff[i]) ||
+                              (NULL == pCtx->sMb.pIntraPredMode[i]) ||
+                              (NULL == pCtx->sMb.pIntra4x4FinalMode[i]) ||
+                              (NULL == pCtx->sMb.pChromaPredMode[i]) ||
+                              (NULL == pCtx->sMb.pCbp[i]) ||
+                              (NULL == pCtx->sMb.pSubMbType[i]) ||
+                              (NULL == pCtx->sMb.pSliceIdc[i]) ||
+                              (NULL == pCtx->sMb.pResidualPredFlag[i]) ||
+                              (NULL == pCtx->sMb.pInterPredictionDoneFlag[i])
+                             )
+                            )
+    } // end of if(pCtx->iDecoderMode == SW_MODE)
+
+    pCtx->pDqLayersList[i] = pDq;
+    ++ i;
+  } while (i < LAYER_NUM_EXCHANGEABLE);
+
+
+  pCtx->bInitialDqLayersMem	= true;
+  pCtx->iPicWidthReq			= kiMaxWidth;
+  pCtx->iPicHeightReq			= kiMaxHeight;
+
+  return ERR_NONE;
 }
 
-void_t UninitialDqLayersContext ( PWelsDecoderContext pCtx )
-{
-	int32_t i = 0;
-	int32_t j = 0;
-	
-	do {
-		PDqLayer pDq = pCtx->pDqLayersList[i];		
-		if ( pDq == NULL ){
-			++ i;
-			continue;
-		}
-		
-		if ( pCtx->pCsListXchg[i] ){	// cs picture
-			j = 0;
-			do {
-				if ( NULL != pCtx->pCsListXchg[i][j] )
-				{
-					WelsFree( pCtx->pCsListXchg[i][j], "pCtx->pCsListXchg[][]" );
+void_t UninitialDqLayersContext (PWelsDecoderContext pCtx) {	// releases all DQ layers and their per-layer buffers
+  int32_t i = 0;  // layer index, bounded by LAYER_NUM_EXCHANGEABLE
+  int32_t j = 0;  // plane index, bounded by 3
 
-					pCtx->pCsListXchg[i][j] = NULL;
-				}
-				pCtx->iCsStride[j]	= 0;
-				++ j;
-			} while( j < 3 );			
-			
-			pDq->pCsData[i]		= NULL;	// for safe
-			pDq->iCsStride[i]	= 0;
-		}
-		if ( pCtx->pRsListXchg[i] ){			
-			j = 0;
-			do {
-				if ( NULL != pCtx->pRsListXchg[i][j] )
-				{
-					WelsFree( pCtx->pRsListXchg[i][j], "pCtx->pRsListXchg[][]" );
+  do {
+    PDqLayer pDq = pCtx->pDqLayersList[i];
+    if (pDq == NULL) {  // layer not registered: nothing is freed for index i
+      ++ i;  // advance by hand, 'continue' jumps straight to the loop condition
+      continue;
+    }
 
-					pCtx->pRsListXchg[i][j]	= NULL;
-				}
-				pCtx->iRsStride[j]	= 0;
-				++ j;
-			} while(j < 3);
-		}
+    if (pCtx->pCsListXchg[i]) {	// cs picture
+      j = 0;
+      do {
+        if (NULL != pCtx->pCsListXchg[i][j]) {
+          WelsFree (pCtx->pCsListXchg[i][j], "pCtx->pCsListXchg[][]");
 
-		if (pCtx->sMb.pMbType[i])
-		{	
-			WelsFree( pCtx->sMb.pMbType[i], "pCtx->sMb.pMbType[]");
+          pCtx->pCsListXchg[i][j] = NULL;
+        }
+        pCtx->iCsStride[j]	= 0;
+        ++ j;
+      } while (j < 3);
 
-			pCtx->sMb.pMbType[i] = NULL;	
-		}
-		
-		if (pCtx->sMb.pMv[i][0])
-		{
-			WelsFree( pCtx->sMb.pMv[i][0], "pCtx->sMb.pMv[][]" );
+      pDq->pCsData[i]		= NULL;	// for safe
+      pDq->iCsStride[i]	= 0;
+    }
+    if (pCtx->pRsListXchg[i]) {
+      j = 0;
+      do {
+        if (NULL != pCtx->pRsListXchg[i][j]) {
+          WelsFree (pCtx->pRsListXchg[i][j], "pCtx->pRsListXchg[][]");
 
-			pCtx->sMb.pMv[i][0] = NULL;
-		}
+          pCtx->pRsListXchg[i][j]	= NULL;
+        }
+        pCtx->iRsStride[j]	= 0;
+        ++ j;
+      } while (j < 3);
+    }
 
-		if (pCtx->sMb.pRefIndex[i][0])
-		{
-			WelsFree( pCtx->sMb.pRefIndex[i][0], "pCtx->sMb.pRefIndex[][]" );
+    if (pCtx->sMb.pMbType[i]) {
+      WelsFree (pCtx->sMb.pMbType[i], "pCtx->sMb.pMbType[]");
 
-			pCtx->sMb.pRefIndex[i][0] = NULL;
-		}
+      pCtx->sMb.pMbType[i] = NULL;
+    }
 
-		if (pCtx->sMb.pLumaQp[i])
-		{
-			WelsFree(pCtx->sMb.pLumaQp[i], "pCtx->sMb.pLumaQp[]");
+    if (pCtx->sMb.pMv[i][0]) {
+      WelsFree (pCtx->sMb.pMv[i][0], "pCtx->sMb.pMv[][]");
 
-			pCtx->sMb.pLumaQp[i] = NULL;
-		}
+      pCtx->sMb.pMv[i][0] = NULL;
+    }
 
-		if (pCtx->sMb.pChromaQp[i])
-		{
-			WelsFree(pCtx->sMb.pChromaQp[i], "pCtx->sMb.pChromaQp[]");
+    if (pCtx->sMb.pRefIndex[i][0]) {
+      WelsFree (pCtx->sMb.pRefIndex[i][0], "pCtx->sMb.pRefIndex[][]");
 
-			pCtx->sMb.pChromaQp[i] = NULL;
-		}
-		
-		if (pCtx->sMb.pNzc[i])
-		{
-			WelsFree(pCtx->sMb.pNzc[i], "pCtx->sMb.pNzc[]");
+      pCtx->sMb.pRefIndex[i][0] = NULL;
+    }
 
-			pCtx->sMb.pNzc[i] = NULL;
-		}
+    if (pCtx->sMb.pLumaQp[i]) {
+      WelsFree (pCtx->sMb.pLumaQp[i], "pCtx->sMb.pLumaQp[]");
 
-		if (pCtx->sMb.pNzcRs[i])
-		{
-			WelsFree(pCtx->sMb.pNzcRs[i], "pCtx->sMb.pNzcRs[]");
+      pCtx->sMb.pLumaQp[i] = NULL;
+    }
 
-			pCtx->sMb.pNzcRs[i] = NULL;
-		}		
+    if (pCtx->sMb.pChromaQp[i]) {
+      WelsFree (pCtx->sMb.pChromaQp[i], "pCtx->sMb.pChromaQp[]");
 
-		if (pCtx->sMb.pScaledTCoeff[i])
-		{
-			WelsFree(pCtx->sMb.pScaledTCoeff[i], "pCtx->sMb.pScaledTCoeff[]");
+      pCtx->sMb.pChromaQp[i] = NULL;
+    }
 
-			pCtx->sMb.pScaledTCoeff[i] = NULL;
-		}
+    if (pCtx->sMb.pNzc[i]) {
+      WelsFree (pCtx->sMb.pNzc[i], "pCtx->sMb.pNzc[]");
 
-		if (pCtx->sMb.pIntraPredMode[i])
-		{
-			WelsFree(pCtx->sMb.pIntraPredMode[i], "pCtx->sMb.pIntraPredMode[]");
+      pCtx->sMb.pNzc[i] = NULL;
+    }
 
-			pCtx->sMb.pIntraPredMode[i] = NULL;
-		}
+    if (pCtx->sMb.pNzcRs[i]) {
+      WelsFree (pCtx->sMb.pNzcRs[i], "pCtx->sMb.pNzcRs[]");
 
-		if (pCtx->sMb.pIntra4x4FinalMode[i])
-		{
-			WelsFree(pCtx->sMb.pIntra4x4FinalMode[i], "pCtx->sMb.pIntra4x4FinalMode[]");
+      pCtx->sMb.pNzcRs[i] = NULL;
+    }
 
-			pCtx->sMb.pIntra4x4FinalMode[i] = NULL;
-		}			
+    if (pCtx->sMb.pScaledTCoeff[i]) {
+      WelsFree (pCtx->sMb.pScaledTCoeff[i], "pCtx->sMb.pScaledTCoeff[]");
 
-		if (pCtx->sMb.pChromaPredMode[i])
-		{
-			WelsFree(pCtx->sMb.pChromaPredMode[i], "pCtx->sMb.pChromaPredMode[]");
+      pCtx->sMb.pScaledTCoeff[i] = NULL;
+    }
 
-			pCtx->sMb.pChromaPredMode[i] = NULL;
-		}
-	
-		if (pCtx->sMb.pCbp[i])
-		{
-			WelsFree( pCtx->sMb.pCbp[i], "pCtx->sMb.pCbp[]" );
+    if (pCtx->sMb.pIntraPredMode[i]) {
+      WelsFree (pCtx->sMb.pIntraPredMode[i], "pCtx->sMb.pIntraPredMode[]");
 
-			pCtx->sMb.pCbp[i] = NULL;
-		}
+      pCtx->sMb.pIntraPredMode[i] = NULL;
+    }
 
-  //      if (pCtx->sMb.pMotionPredFlag[i])
-		//{
-		//	WelsFree( pCtx->sMb.pMotionPredFlag[i], "pCtx->sMb.pMotionPredFlag[]" );
+    if (pCtx->sMb.pIntra4x4FinalMode[i]) {
+      WelsFree (pCtx->sMb.pIntra4x4FinalMode[i], "pCtx->sMb.pIntra4x4FinalMode[]");
 
-		//	pCtx->sMb.pMotionPredFlag[i] = NULL;
-		//}
+      pCtx->sMb.pIntra4x4FinalMode[i] = NULL;
+    }
 
-		if (pCtx->sMb.pSubMbType[i])
-		{
-			WelsFree(pCtx->sMb.pSubMbType[i], "pCtx->sMb.pSubMbType[]");
+    if (pCtx->sMb.pChromaPredMode[i]) {
+      WelsFree (pCtx->sMb.pChromaPredMode[i], "pCtx->sMb.pChromaPredMode[]");
 
-			pCtx->sMb.pSubMbType[i] = NULL;
-		}
+      pCtx->sMb.pChromaPredMode[i] = NULL;
+    }
 
-		if (pCtx->sMb.pSliceIdc[i])
-		{
-			WelsFree( pCtx->sMb.pSliceIdc[i], "pCtx->sMb.pSliceIdc[]" );
+    if (pCtx->sMb.pCbp[i]) {
+      WelsFree (pCtx->sMb.pCbp[i], "pCtx->sMb.pCbp[]");
 
-			pCtx->sMb.pSliceIdc[i] = NULL;
-		}
+      pCtx->sMb.pCbp[i] = NULL;
+    }
 
-       if (pCtx->sMb.pResidualPredFlag[i])
-		{
-			WelsFree( pCtx->sMb.pResidualPredFlag[i], "pCtx->sMb.pResidualPredFlag[]" );
+    //      if (pCtx->sMb.pMotionPredFlag[i])
+    //{
+    //	WelsFree( pCtx->sMb.pMotionPredFlag[i], "pCtx->sMb.pMotionPredFlag[]" );
 
-			pCtx->sMb.pResidualPredFlag[i] = NULL;
-		}
+    //	pCtx->sMb.pMotionPredFlag[i] = NULL;
+    //}
 
-		if (pCtx->sMb.pInterPredictionDoneFlag[i])
-		{
-			WelsFree( pCtx->sMb.pInterPredictionDoneFlag[i], "pCtx->sMb.pInterPredictionDoneFlag[]" );
+    if (pCtx->sMb.pSubMbType[i]) {
+      WelsFree (pCtx->sMb.pSubMbType[i], "pCtx->sMb.pSubMbType[]");
 
-			pCtx->sMb.pInterPredictionDoneFlag[i] = NULL;
-		}
-		WelsFree( pDq, "pDq" );
+      pCtx->sMb.pSubMbType[i] = NULL;
+    }
 
-		pDq = NULL;
-		pCtx->pDqLayersList[i] = NULL;
-		
-		++ i;
-	} while( i < LAYER_NUM_EXCHANGEABLE );
-	
-	pCtx->iPicWidthReq			= 0;
-	pCtx->iPicHeightReq			= 0;
-	pCtx->bInitialDqLayersMem	= false;
+    if (pCtx->sMb.pSliceIdc[i]) {
+      WelsFree (pCtx->sMb.pSliceIdc[i], "pCtx->sMb.pSliceIdc[]");
+
+      pCtx->sMb.pSliceIdc[i] = NULL;
+    }
+
+    if (pCtx->sMb.pResidualPredFlag[i]) {
+      WelsFree (pCtx->sMb.pResidualPredFlag[i], "pCtx->sMb.pResidualPredFlag[]");
+
+      pCtx->sMb.pResidualPredFlag[i] = NULL;
+    }
+
+    if (pCtx->sMb.pInterPredictionDoneFlag[i]) {
+      WelsFree (pCtx->sMb.pInterPredictionDoneFlag[i], "pCtx->sMb.pInterPredictionDoneFlag[]");
+
+      pCtx->sMb.pInterPredictionDoneFlag[i] = NULL;
+    }
+    WelsFree (pDq, "pDq");
+
+    pDq = NULL;
+    pCtx->pDqLayersList[i] = NULL;
+
+    ++ i;
+  } while (i < LAYER_NUM_EXCHANGEABLE);
+
+  pCtx->iPicWidthReq			= 0;
+  pCtx->iPicHeightReq			= 0;
+  pCtx->bInitialDqLayersMem	= false;
 }
 
-void_t ResetCurrentAccessUnit ( PWelsDecoderContext pCtx )
-{
-	PAccessUnit pCurAu = pCtx->pAccessUnitList;
-	
-	pCurAu->uiEndPos		= 0;
-	pCurAu->bCompletedAuFlag	= false;
-	if (pCurAu->uiActualUnitsNum > 0){
-		uint32_t iIdx = 0;
-		const uint32_t kuiActualNum = pCurAu->uiActualUnitsNum;		
-		// a more simpler method to do nal units list management prefered here		
-		const uint32_t kuiAvailNum	= pCurAu->uiAvailUnitsNum;
-		const uint32_t kuiLeftNum	= kuiAvailNum - kuiActualNum;		
-		
-		// Swapping active nal unit nodes of succeeding AU with leading of list
-		while (iIdx < kuiLeftNum)
-		{
-			PNalUnit t = pCurAu->pNalUnitsList[kuiActualNum+iIdx];
-			pCurAu->pNalUnitsList[kuiActualNum+iIdx] = pCurAu->pNalUnitsList[iIdx];
-			pCurAu->pNalUnitsList[iIdx] = t;
-			++ iIdx;
-		}
-		pCurAu->uiActualUnitsNum = pCurAu->uiAvailUnitsNum	= kuiLeftNum;
-	}
+void_t ResetCurrentAccessUnit (PWelsDecoderContext pCtx) {
+  // Retires the NAL units counted in the current AU and keeps the units queued behind them.
+  PAccessUnit pAu = pCtx->pAccessUnitList;
+
+  pAu->uiEndPos = 0;
+  pAu->bCompletedAuFlag = false;
+  if (pAu->uiActualUnitsNum == 0)
+    return;
+
+  const uint32_t kuiDoneNum = pAu->uiActualUnitsNum;
+  // Units beyond the current AU stay available; a simple swap keeps the list management cheap.
+  const uint32_t kuiPendingNum = pAu->uiAvailUnitsNum - kuiDoneNum;
+
+  // Exchange each pending node with the node at the head of the list, in order.
+  for (uint32_t uiPos = 0; uiPos < kuiPendingNum; ++ uiPos) {
+    PNalUnit pSwap = pAu->pNalUnitsList[kuiDoneNum + uiPos];
+    pAu->pNalUnitsList[kuiDoneNum + uiPos] = pAu->pNalUnitsList[uiPos];
+    pAu->pNalUnitsList[uiPos] = pSwap;
+  }
+
+  pAu->uiActualUnitsNum = pAu->uiAvailUnitsNum = kuiPendingNum;
 }
 
 /*!
@@ -1288,396 +1233,341 @@
  * \author
  * \history	11/16/2009
  */
-void_t ForceResetCurrentAccessUnit( PAccessUnit pAu )
-{
-	uint32_t uiSucAuIdx	= pAu->uiEndPos + 1;
-	uint32_t uiCurAuIdx	= 0;
+void_t ForceResetCurrentAccessUnit (PAccessUnit pAu) {
+  uint32_t uiSucAuIdx	= pAu->uiEndPos + 1;
+  uint32_t uiCurAuIdx	= 0;
 
-	// swap the succeeding AU's nal units to the front
-	while(uiSucAuIdx < pAu->uiAvailUnitsNum)
-	{
-		PNalUnit t = pAu->pNalUnitsList[uiSucAuIdx];
-		pAu->pNalUnitsList[uiSucAuIdx]	= pAu->pNalUnitsList[uiCurAuIdx];
-		pAu->pNalUnitsList[uiCurAuIdx]	= t;
-		++ uiSucAuIdx;
-		++ uiCurAuIdx;
-	}
+  // swap the succeeding AU's nal units to the front
+  while (uiSucAuIdx < pAu->uiAvailUnitsNum) {
+    PNalUnit t = pAu->pNalUnitsList[uiSucAuIdx];
+    pAu->pNalUnitsList[uiSucAuIdx]	= pAu->pNalUnitsList[uiCurAuIdx];
+    pAu->pNalUnitsList[uiCurAuIdx]	= t;
+    ++ uiSucAuIdx;
+    ++ uiCurAuIdx;
+  }
 
-	// Update avail/actual units num accordingly for next AU parsing
-	if ( pAu->uiAvailUnitsNum > pAu->uiEndPos )
-		pAu->uiAvailUnitsNum	-= (pAu->uiEndPos+1);
-	else
-		pAu->uiAvailUnitsNum	= 0;
-	pAu->uiActualUnitsNum	= 0;
-	pAu->uiEndPos		= 0;
-	pAu->bCompletedAuFlag	= false;	
+  // Update avail/actual units num accordingly for next AU parsing
+  if (pAu->uiAvailUnitsNum > pAu->uiEndPos)
+    pAu->uiAvailUnitsNum	-= (pAu->uiEndPos + 1);
+  else
+    pAu->uiAvailUnitsNum	= 0;
+  pAu->uiActualUnitsNum	= 0;
+  pAu->uiEndPos		= 0;
+  pAu->bCompletedAuFlag	= false;
 }
 
 //clear current corrupted NAL from pNalUnitsList
-void_t ForceClearCurrentNal( PAccessUnit pAu )
-{	
-	if (pAu->uiAvailUnitsNum > 0)
-		-- pAu->uiAvailUnitsNum;
+void_t ForceClearCurrentNal (PAccessUnit pAu) {
+  if (pAu->uiAvailUnitsNum > 0)
+    -- pAu->uiAvailUnitsNum;
 }
 
 
-void_t CheckAvailNalUnitsListContinuity( PWelsDecoderContext pCtx, int32_t iStartIdx, int32_t iEndIdx )
-{
-	PAccessUnit pCurAu = pCtx->pAccessUnitList;
+void_t CheckAvailNalUnitsListContinuity (PWelsDecoderContext pCtx, int32_t iStartIdx, int32_t iEndIdx) {
+  PAccessUnit pCurAu = pCtx->pAccessUnitList;
 
-	uint8_t uiLastNuDependencyId, uiLastNuLayerDqId;
-	uint8_t uiCurNuDependencyId, uiCurNuQualityId, uiCurNuLayerDqId, uiCurNuRefLayerDqId;
+  uint8_t uiLastNuDependencyId, uiLastNuLayerDqId;
+  uint8_t uiCurNuDependencyId, uiCurNuQualityId, uiCurNuLayerDqId, uiCurNuRefLayerDqId;
 
-	int32_t iCurNalUnitIdx = 0;	
-	
-	//check the continuity of pNalUnitsList forwards (from pIdxNoInterLayerPred to end_postion)	
-	uiLastNuDependencyId = pCurAu->pNalUnitsList[iStartIdx]->sNalHeaderExt.uiDependencyId;//starting nal unit
-	uiLastNuLayerDqId   = pCurAu->pNalUnitsList[iStartIdx]->sNalHeaderExt.uiLayerDqId;//starting nal unit
-	iCurNalUnitIdx = iStartIdx + 1;//current nal unit
-	while ( iCurNalUnitIdx <= iEndIdx )
-	{
-		uiCurNuDependencyId   = pCurAu->pNalUnitsList[iCurNalUnitIdx]->sNalHeaderExt.uiDependencyId;
-		uiCurNuQualityId      = pCurAu->pNalUnitsList[iCurNalUnitIdx]->sNalHeaderExt.uiQualityId;
-		uiCurNuLayerDqId     = pCurAu->pNalUnitsList[iCurNalUnitIdx]->sNalHeaderExt.uiLayerDqId;
-		uiCurNuRefLayerDqId = pCurAu->pNalUnitsList[iCurNalUnitIdx]->sNalData.sVclNal.sSliceHeaderExt.uiRefLayerDqId;
-		
-		if ( uiCurNuDependencyId == uiLastNuDependencyId ) 
-		{
-			uiLastNuLayerDqId = uiCurNuLayerDqId;
-			++ iCurNalUnitIdx;
-		}
-		else //uiCurNuDependencyId != uiLastNuDependencyId, new dependency arrive
-		{
-			if ( uiCurNuQualityId == 0 ) 
-			{
-				uiLastNuDependencyId = uiCurNuDependencyId;
-				if ( uiCurNuRefLayerDqId == uiLastNuLayerDqId )					
-				{
-					uiLastNuLayerDqId = uiCurNuLayerDqId;
-					++ iCurNalUnitIdx;
-				}
-				else //cur_nu_layer_id != next_nu_ref_layer_dq_id, the chain is broken at this point
-				{
-					break;
-				}
-			}
-			else //new dependency arrive, but no base quality layer, so we must stop in this point
-			{
-				break;
-			}
-		}
-	}
-	
-	-- iCurNalUnitIdx;
-	pCurAu->uiEndPos = iCurNalUnitIdx;
-	pCtx->uiTargetDqId = pCurAu->pNalUnitsList[iCurNalUnitIdx]->sNalHeaderExt.uiLayerDqId;	
+  int32_t iCurNalUnitIdx = 0;
+
+  //check the continuity of pNalUnitsList forwards (from pIdxNoInterLayerPred to end_position)
+  uiLastNuDependencyId = pCurAu->pNalUnitsList[iStartIdx]->sNalHeaderExt.uiDependencyId;//starting nal unit
+  uiLastNuLayerDqId   = pCurAu->pNalUnitsList[iStartIdx]->sNalHeaderExt.uiLayerDqId;//starting nal unit
+  iCurNalUnitIdx = iStartIdx + 1;//current nal unit
+  while (iCurNalUnitIdx <= iEndIdx) {
+    uiCurNuDependencyId   = pCurAu->pNalUnitsList[iCurNalUnitIdx]->sNalHeaderExt.uiDependencyId;
+    uiCurNuQualityId      = pCurAu->pNalUnitsList[iCurNalUnitIdx]->sNalHeaderExt.uiQualityId;
+    uiCurNuLayerDqId     = pCurAu->pNalUnitsList[iCurNalUnitIdx]->sNalHeaderExt.uiLayerDqId;
+    uiCurNuRefLayerDqId = pCurAu->pNalUnitsList[iCurNalUnitIdx]->sNalData.sVclNal.sSliceHeaderExt.uiRefLayerDqId;
+
+    if (uiCurNuDependencyId == uiLastNuDependencyId) {
+      uiLastNuLayerDqId = uiCurNuLayerDqId;
+      ++ iCurNalUnitIdx;
+    } else { //uiCurNuDependencyId != uiLastNuDependencyId, new dependency arrive
+      if (uiCurNuQualityId == 0) {
+        uiLastNuDependencyId = uiCurNuDependencyId;
+        if (uiCurNuRefLayerDqId == uiLastNuLayerDqId) {
+          uiLastNuLayerDqId = uiCurNuLayerDqId;
+          ++ iCurNalUnitIdx;
+        } else { //cur_nu_layer_id != next_nu_ref_layer_dq_id, the chain is broken at this point
+          break;
+        }
+      } else { //new dependency arrive, but no base quality layer, so we must stop in this point
+        break;
+      }
+    }
+  }
+
+  -- iCurNalUnitIdx;
+  pCurAu->uiEndPos = iCurNalUnitIdx;
+  pCtx->uiTargetDqId = pCurAu->pNalUnitsList[iCurNalUnitIdx]->sNalHeaderExt.uiLayerDqId;
 }
 
 //main purpose: to support multi-slice and to include all slice which have the same uiDependencyId, uiQualityId and frame_num
 //for single slice, pIdxNoInterLayerPred SHOULD NOT be modified
-void_t RefineIdxNoInterLayerPred( PAccessUnit pCurAu, int32_t* pIdxNoInterLayerPred )
-{
-	int32_t iLastNalDependId  = pCurAu->pNalUnitsList[*pIdxNoInterLayerPred]->sNalHeaderExt.uiDependencyId;
-	int32_t iLastNalQualityId = pCurAu->pNalUnitsList[*pIdxNoInterLayerPred]->sNalHeaderExt.uiQualityId;
-	uint8_t uiLastNalTId       = pCurAu->pNalUnitsList[*pIdxNoInterLayerPred]->sNalHeaderExt.uiTemporalId;
-	int32_t iLastNalFrameNum  = pCurAu->pNalUnitsList[*pIdxNoInterLayerPred]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.iFrameNum;
-	int32_t iLastNalPoc        = pCurAu->pNalUnitsList[*pIdxNoInterLayerPred]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.iPicOrderCntLsb;
-	int32_t iLastNalFirstMb   = pCurAu->pNalUnitsList[*pIdxNoInterLayerPred]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.iFirstMbInSlice;
-	int32_t iCurNalDependId, iCurNalQualityId, iCurNalTId, iCurNalFrameNum, iCurNalPoc, iCurNalFirstMb, iCurIdx, iFinalIdxNoInterLayerPred;
+void_t RefineIdxNoInterLayerPred (PAccessUnit pCurAu, int32_t* pIdxNoInterLayerPred) {
+  int32_t iLastNalDependId  = pCurAu->pNalUnitsList[*pIdxNoInterLayerPred]->sNalHeaderExt.uiDependencyId;
+  int32_t iLastNalQualityId = pCurAu->pNalUnitsList[*pIdxNoInterLayerPred]->sNalHeaderExt.uiQualityId;
+  uint8_t uiLastNalTId       = pCurAu->pNalUnitsList[*pIdxNoInterLayerPred]->sNalHeaderExt.uiTemporalId;
+  int32_t iLastNalFrameNum  =
+    pCurAu->pNalUnitsList[*pIdxNoInterLayerPred]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.iFrameNum;
+  int32_t iLastNalPoc        =
+    pCurAu->pNalUnitsList[*pIdxNoInterLayerPred]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.iPicOrderCntLsb;
+  int32_t iLastNalFirstMb   =
+    pCurAu->pNalUnitsList[*pIdxNoInterLayerPred]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.iFirstMbInSlice;
+  int32_t iCurNalDependId, iCurNalQualityId, iCurNalTId, iCurNalFrameNum, iCurNalPoc, iCurNalFirstMb, iCurIdx,
+          iFinalIdxNoInterLayerPred;
 
-	bool_t  bMultiSliceFind = false;
-	
-	iFinalIdxNoInterLayerPred = 0;
-	iCurIdx = *pIdxNoInterLayerPred - 1;
-	while ( iCurIdx >= 0 )
-	{
-		if ( pCurAu->pNalUnitsList[iCurIdx]->sNalHeaderExt.iNoInterLayerPredFlag )
-		{
-			iCurNalDependId  = pCurAu->pNalUnitsList[iCurIdx]->sNalHeaderExt.uiDependencyId;
-			iCurNalQualityId = pCurAu->pNalUnitsList[iCurIdx]->sNalHeaderExt.uiQualityId;
-			iCurNalTId       = pCurAu->pNalUnitsList[iCurIdx]->sNalHeaderExt.uiTemporalId;
-			iCurNalFrameNum  = pCurAu->pNalUnitsList[iCurIdx]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.iFrameNum;
-			iCurNalPoc        = pCurAu->pNalUnitsList[iCurIdx]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.iPicOrderCntLsb;
-			iCurNalFirstMb   = pCurAu->pNalUnitsList[iCurIdx]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.iFirstMbInSlice;
-			
-			if ( iCurNalDependId == iLastNalDependId  && 
-				iCurNalQualityId == iLastNalQualityId && 
-				iCurNalTId       == uiLastNalTId       &&
-				iCurNalFrameNum  == iLastNalFrameNum  && 
-				iCurNalPoc        == iLastNalPoc        &&
-				iCurNalFirstMb   != iLastNalFirstMb ) 
-			{
-				bMultiSliceFind = true;
-				iFinalIdxNoInterLayerPred = iCurIdx;
-				--iCurIdx;
-				continue;
-			}
-			else
-			{
-				break;
-			}
-		}
-		--iCurIdx;
-	}
+  bool_t  bMultiSliceFind = false;
 
-	if ( bMultiSliceFind && *pIdxNoInterLayerPred != iFinalIdxNoInterLayerPred )
-	{
-		*pIdxNoInterLayerPred = iFinalIdxNoInterLayerPred;
-	}
+  iFinalIdxNoInterLayerPred = 0;
+  iCurIdx = *pIdxNoInterLayerPred - 1;
+  while (iCurIdx >= 0) {
+    if (pCurAu->pNalUnitsList[iCurIdx]->sNalHeaderExt.iNoInterLayerPredFlag) {
+      iCurNalDependId  = pCurAu->pNalUnitsList[iCurIdx]->sNalHeaderExt.uiDependencyId;
+      iCurNalQualityId = pCurAu->pNalUnitsList[iCurIdx]->sNalHeaderExt.uiQualityId;
+      iCurNalTId       = pCurAu->pNalUnitsList[iCurIdx]->sNalHeaderExt.uiTemporalId;
+      iCurNalFrameNum  = pCurAu->pNalUnitsList[iCurIdx]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.iFrameNum;
+      iCurNalPoc        = pCurAu->pNalUnitsList[iCurIdx]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.iPicOrderCntLsb;
+      iCurNalFirstMb   = pCurAu->pNalUnitsList[iCurIdx]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.iFirstMbInSlice;
+
+      if (iCurNalDependId == iLastNalDependId  &&
+          iCurNalQualityId == iLastNalQualityId &&
+          iCurNalTId       == uiLastNalTId       &&
+          iCurNalFrameNum  == iLastNalFrameNum  &&
+          iCurNalPoc        == iLastNalPoc        &&
+          iCurNalFirstMb   != iLastNalFirstMb) {
+        bMultiSliceFind = true;
+        iFinalIdxNoInterLayerPred = iCurIdx;
+        --iCurIdx;
+        continue;
+      } else {
+        break;
+      }
+    }
+    --iCurIdx;
+  }
+
+  if (bMultiSliceFind && *pIdxNoInterLayerPred != iFinalIdxNoInterLayerPred) {
+    *pIdxNoInterLayerPred = iFinalIdxNoInterLayerPred;
+  }
 }
 
-bool_t CheckPocOfCurValidNalUnits( PAccessUnit pCurAu, int32_t pIdxNoInterLayerPred )
-{	 
-	int32_t iEndIdx    = pCurAu->uiEndPos;
-	int32_t iCurAuPoc = pCurAu->pNalUnitsList[pIdxNoInterLayerPred]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.iPicOrderCntLsb;
-	int32_t iTmpPoc, i;
-	for ( i = pIdxNoInterLayerPred+1; i < iEndIdx; i++ )
-	{
-		iTmpPoc = pCurAu->pNalUnitsList[i]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.iPicOrderCntLsb;
-		if ( iTmpPoc != iCurAuPoc )
-		{
-			return false;
-		}
-	}
+bool_t CheckPocOfCurValidNalUnits (PAccessUnit pCurAu, int32_t pIdxNoInterLayerPred) {
+  int32_t iEndIdx    = pCurAu->uiEndPos;
+  int32_t iCurAuPoc =
+    pCurAu->pNalUnitsList[pIdxNoInterLayerPred]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.iPicOrderCntLsb;
+  int32_t iTmpPoc, i;
+  for (i = pIdxNoInterLayerPred + 1; i < iEndIdx; i++) {
+    iTmpPoc = pCurAu->pNalUnitsList[i]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.iPicOrderCntLsb;
+    if (iTmpPoc != iCurAuPoc) {
+      return false;
+    }
+  }
 
-	return true;
+  return true;
 }
 
-bool_t CheckIntegrityNalUnitsList( PWelsDecoderContext pCtx )
-{
-	PAccessUnit pCurAu = pCtx->pAccessUnitList;
-	const int32_t kiEndPos = pCurAu->uiEndPos;
-	int32_t iIdxNoInterLayerPred = 0;
-	int32_t iCurNalUnitIdx = kiEndPos;
+bool_t CheckIntegrityNalUnitsList (PWelsDecoderContext pCtx) {
+  PAccessUnit pCurAu = pCtx->pAccessUnitList;
+  const int32_t kiEndPos = pCurAu->uiEndPos;
+  int32_t iIdxNoInterLayerPred = 0;
+  int32_t iCurNalUnitIdx = kiEndPos;
 
-	ESliceType eSliceType = static_cast<ESliceType> (0);//EC 2009.11.12
-	
-	if ( !pCurAu->bCompletedAuFlag )
-		return false;
+  ESliceType eSliceType = static_cast<ESliceType> (0);//EC 2009.11.12
 
-	eSliceType = pCurAu->pNalUnitsList[iCurNalUnitIdx]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.eSliceType;
+  if (!pCurAu->bCompletedAuFlag)
+    return false;
 
-	if ( I_SLICE == eSliceType )
-	{
-		pCurAu->uiStartPos = 0;
-		//step1: search the pNalUnit whose iNoInterLayerPredFlag equal to 1 backwards (from uiEndPos to 0)
-		iIdxNoInterLayerPred = kiEndPos;
-		while ( iIdxNoInterLayerPred >= 0 ) 
-		{
-			if ( pCurAu->pNalUnitsList[iIdxNoInterLayerPred]->sNalHeaderExt.iNoInterLayerPredFlag ) 
-			{
-				break;
-			}
-			--iIdxNoInterLayerPred;
-		}
-		if ( iIdxNoInterLayerPred < 0 )
-		{
-			//can not find the Nal Unit whose no_inter_pred_falg equal to 1, MUST STOP decode
-			return false;
-		}		
-		
-		//step2: support multi-slice, to include all base layer slice
-		RefineIdxNoInterLayerPred( pCurAu, &iIdxNoInterLayerPred );		
-		pCurAu->uiStartPos = iIdxNoInterLayerPred;
-		CheckAvailNalUnitsListContinuity( pCtx, iIdxNoInterLayerPred, kiEndPos );
+  eSliceType = pCurAu->pNalUnitsList[iCurNalUnitIdx]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.eSliceType;
 
-		if ( !CheckPocOfCurValidNalUnits( pCurAu, iIdxNoInterLayerPred ) ) 
-		{
-			return false;
-		}
-		
-		pCtx->iCurSeqIntervalTargetDependId = pCurAu->pNalUnitsList[pCurAu->uiEndPos]->sNalHeaderExt.uiDependencyId;
-		pCtx->iCurSeqIntervalMaxPicWidth  = pCurAu->pNalUnitsList[pCurAu->uiEndPos]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.iMbWidth << 4;
-		pCtx->iCurSeqIntervalMaxPicHeight = pCurAu->pNalUnitsList[pCurAu->uiEndPos]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.iMbHeight << 4;
-	}
-	else //P_SLICE
-	{
-		//step 1: search uiDependencyId equal to pCtx->cur_seq_interval_target_dependency_id
-		bool_t bGetDependId = false;
-		int32_t iIdxDependId = 0;
+  if (I_SLICE == eSliceType) {
+    pCurAu->uiStartPos = 0;
+    //step1: search the pNalUnit whose iNoInterLayerPredFlag equal to 1 backwards (from uiEndPos to 0)
+    iIdxNoInterLayerPred = kiEndPos;
+    while (iIdxNoInterLayerPred >= 0) {
+      if (pCurAu->pNalUnitsList[iIdxNoInterLayerPred]->sNalHeaderExt.iNoInterLayerPredFlag) {
+        break;
+      }
+      --iIdxNoInterLayerPred;
+    }
+    if (iIdxNoInterLayerPred < 0) {
+      //cannot find the NAL unit whose no_inter_pred_flag equals 1, MUST STOP decoding
+      return false;
+    }
 
-		iIdxDependId = kiEndPos;
-		while ( iIdxDependId >= 0 ) 
-		{
-			if ( pCtx->iCurSeqIntervalTargetDependId == pCurAu->pNalUnitsList[iIdxDependId]->sNalHeaderExt.uiDependencyId )
-			{
-				bGetDependId = true;
-				break;
-			}
-			else
-			{
-				--iIdxDependId;
-			}
-		}
-		
-		//step 2: switch according to whether or not find the index of pNalUnit whose uiDependencyId equal to iCurSeqIntervalTargetDependId
-		if ( bGetDependId ) //get the index of pNalUnit whose uiDependencyId equal to iCurSeqIntervalTargetDependId
-		{
-			bool_t bGetNoInterPredFront = false;
-			//step 2a: search iNoInterLayerPredFlag [0....iIdxDependId]
-			iIdxNoInterLayerPred = iIdxDependId;
-			while ( iIdxNoInterLayerPred >= 0 )
-			{
-				if ( pCurAu->pNalUnitsList[iIdxNoInterLayerPred]->sNalHeaderExt.iNoInterLayerPredFlag )
-				{
-					bGetNoInterPredFront = true;
-					break;
-				}
-				--iIdxNoInterLayerPred;
-			}
-			//step 2b: switch, whether or not find the NAL unit whose no_inter_pred_flag equal to 1 among [0....iIdxDependId] 
-			if ( bGetNoInterPredFront ) //YES
-			{
-				RefineIdxNoInterLayerPred( pCurAu, &iIdxNoInterLayerPred );
-				pCurAu->uiStartPos = iIdxNoInterLayerPred;
-				CheckAvailNalUnitsListContinuity( pCtx, iIdxNoInterLayerPred, iIdxDependId );
-				
-				if ( !CheckPocOfCurValidNalUnits( pCurAu, iIdxNoInterLayerPred ) ) 
-				{
-					return false;
-				}
-			}
-			else //NO, should find the NAL unit whose no_inter_pred_flag equal to 1 among [iIdxDependId....uiEndPos]
-			{
-				iIdxNoInterLayerPred = iIdxDependId;
-				while ( iIdxNoInterLayerPred <= kiEndPos )
-				{
-					if ( pCurAu->pNalUnitsList[iIdxNoInterLayerPred]->sNalHeaderExt.iNoInterLayerPredFlag )
-					{
-						break;
-					}					
-					++iIdxNoInterLayerPred;
-				}
+    //step2: support multi-slice, to include all base layer slice
+    RefineIdxNoInterLayerPred (pCurAu, &iIdxNoInterLayerPred);
+    pCurAu->uiStartPos = iIdxNoInterLayerPred;
+    CheckAvailNalUnitsListContinuity (pCtx, iIdxNoInterLayerPred, kiEndPos);
 
-				if ( iIdxNoInterLayerPred > kiEndPos )
-				{
-					return false; //cann't find the index of pNalUnit whose no_inter_pred_flag = 1
-				}
+    if (!CheckPocOfCurValidNalUnits (pCurAu, iIdxNoInterLayerPred)) {
+      return false;
+    }
 
-				RefineIdxNoInterLayerPred( pCurAu, &iIdxNoInterLayerPred );			
-				pCurAu->uiStartPos = iIdxNoInterLayerPred;
-				CheckAvailNalUnitsListContinuity( pCtx, iIdxNoInterLayerPred, kiEndPos );
-				
-				if ( !CheckPocOfCurValidNalUnits( pCurAu, iIdxNoInterLayerPred ) ) 
-				{
-					return false;
-				}				
-			}
-		}
-		else //without the index of pNalUnit, should process this AU as common case
-		{
-			iIdxNoInterLayerPred = kiEndPos;
-			while (iIdxNoInterLayerPred >= 0)
-			{
-				if ( pCurAu->pNalUnitsList[iIdxNoInterLayerPred]->sNalHeaderExt.iNoInterLayerPredFlag ) 
-				{
-					break;
-				}
-				--iIdxNoInterLayerPred;
-			}
-			if (iIdxNoInterLayerPred < 0) 
-			{
-				return false; //cann't find the index of pNalUnit whose iNoInterLayerPredFlag = 1
-			}
+    pCtx->iCurSeqIntervalTargetDependId = pCurAu->pNalUnitsList[pCurAu->uiEndPos]->sNalHeaderExt.uiDependencyId;
+    pCtx->iCurSeqIntervalMaxPicWidth  =
+      pCurAu->pNalUnitsList[pCurAu->uiEndPos]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.iMbWidth << 4;
+    pCtx->iCurSeqIntervalMaxPicHeight =
+      pCurAu->pNalUnitsList[pCurAu->uiEndPos]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.iMbHeight << 4;
+  } else { //P_SLICE
+    //step 1: search uiDependencyId equal to pCtx->cur_seq_interval_target_dependency_id
+    bool_t bGetDependId = false;
+    int32_t iIdxDependId = 0;
 
-			RefineIdxNoInterLayerPred( pCurAu, &iIdxNoInterLayerPred );	
-			pCurAu->uiStartPos = iIdxNoInterLayerPred;
-			CheckAvailNalUnitsListContinuity( pCtx, iIdxNoInterLayerPred, kiEndPos );
+    iIdxDependId = kiEndPos;
+    while (iIdxDependId >= 0) {
+      if (pCtx->iCurSeqIntervalTargetDependId == pCurAu->pNalUnitsList[iIdxDependId]->sNalHeaderExt.uiDependencyId) {
+        bGetDependId = true;
+        break;
+      } else {
+        --iIdxDependId;
+      }
+    }
 
-			if ( !CheckPocOfCurValidNalUnits( pCurAu, iIdxNoInterLayerPred ) ) 
-			{
-				return false;
-			}
-		}
-	}	
+    //step 2: switch according to whether or not find the index of pNalUnit whose uiDependencyId equal to iCurSeqIntervalTargetDependId
+    if (bGetDependId) { //get the index of pNalUnit whose uiDependencyId equal to iCurSeqIntervalTargetDependId
+      bool_t bGetNoInterPredFront = false;
+      //step 2a: search iNoInterLayerPredFlag [0....iIdxDependId]
+      iIdxNoInterLayerPred = iIdxDependId;
+      while (iIdxNoInterLayerPred >= 0) {
+        if (pCurAu->pNalUnitsList[iIdxNoInterLayerPred]->sNalHeaderExt.iNoInterLayerPredFlag) {
+          bGetNoInterPredFront = true;
+          break;
+        }
+        --iIdxNoInterLayerPred;
+      }
+      //step 2b: switch, whether or not find the NAL unit whose no_inter_pred_flag equal to 1 among [0....iIdxDependId]
+      if (bGetNoInterPredFront) { //YES
+        RefineIdxNoInterLayerPred (pCurAu, &iIdxNoInterLayerPred);
+        pCurAu->uiStartPos = iIdxNoInterLayerPred;
+        CheckAvailNalUnitsListContinuity (pCtx, iIdxNoInterLayerPred, iIdxDependId);
 
-	return true;	
+        if (!CheckPocOfCurValidNalUnits (pCurAu, iIdxNoInterLayerPred)) {
+          return false;
+        }
+      } else { //NO, should find the NAL unit whose no_inter_pred_flag equal to 1 among [iIdxDependId....uiEndPos]
+        iIdxNoInterLayerPred = iIdxDependId;
+        while (iIdxNoInterLayerPred <= kiEndPos) {
+          if (pCurAu->pNalUnitsList[iIdxNoInterLayerPred]->sNalHeaderExt.iNoInterLayerPredFlag) {
+            break;
+          }
+          ++iIdxNoInterLayerPred;
+        }
+
+        if (iIdxNoInterLayerPred > kiEndPos) {
+          return false; //cann't find the index of pNalUnit whose no_inter_pred_flag = 1
+        }
+
+        RefineIdxNoInterLayerPred (pCurAu, &iIdxNoInterLayerPred);
+        pCurAu->uiStartPos = iIdxNoInterLayerPred;
+        CheckAvailNalUnitsListContinuity (pCtx, iIdxNoInterLayerPred, kiEndPos);
+
+        if (!CheckPocOfCurValidNalUnits (pCurAu, iIdxNoInterLayerPred)) {
+          return false;
+        }
+      }
+    } else { //without the index of pNalUnit, should process this AU as common case
+      iIdxNoInterLayerPred = kiEndPos;
+      while (iIdxNoInterLayerPred >= 0) {
+        if (pCurAu->pNalUnitsList[iIdxNoInterLayerPred]->sNalHeaderExt.iNoInterLayerPredFlag) {
+          break;
+        }
+        --iIdxNoInterLayerPred;
+      }
+      if (iIdxNoInterLayerPred < 0) {
+        return false; //cann't find the index of pNalUnit whose iNoInterLayerPredFlag = 1
+      }
+
+      RefineIdxNoInterLayerPred (pCurAu, &iIdxNoInterLayerPred);
+      pCurAu->uiStartPos = iIdxNoInterLayerPred;
+      CheckAvailNalUnitsListContinuity (pCtx, iIdxNoInterLayerPred, kiEndPos);
+
+      if (!CheckPocOfCurValidNalUnits (pCurAu, iIdxNoInterLayerPred)) {
+        return false;
+      }
+    }
+  }
+
+  return true;
 }
 
-void_t CheckOnlyOneLayerInAu( PWelsDecoderContext pCtx )
-{
-	PAccessUnit pCurAu = pCtx->pAccessUnitList;
+void_t CheckOnlyOneLayerInAu (PWelsDecoderContext pCtx) {
+  PAccessUnit pCurAu = pCtx->pAccessUnitList;
 
-	int32_t iEndIdx = pCurAu->uiEndPos;	
-	int32_t iCurIdx = pCurAu->uiStartPos;
-	uint8_t uiDId = pCurAu->pNalUnitsList[iCurIdx]->sNalHeaderExt.uiDependencyId;
-	uint8_t uiQId = pCurAu->pNalUnitsList[iCurIdx]->sNalHeaderExt.uiQualityId;
-	uint8_t uiTId = pCurAu->pNalUnitsList[iCurIdx]->sNalHeaderExt.uiTemporalId;
+  int32_t iEndIdx = pCurAu->uiEndPos;
+  int32_t iCurIdx = pCurAu->uiStartPos;
+  uint8_t uiDId = pCurAu->pNalUnitsList[iCurIdx]->sNalHeaderExt.uiDependencyId;
+  uint8_t uiQId = pCurAu->pNalUnitsList[iCurIdx]->sNalHeaderExt.uiQualityId;
+  uint8_t uiTId = pCurAu->pNalUnitsList[iCurIdx]->sNalHeaderExt.uiTemporalId;
 
-	uint8_t uiCurDId, uiCurQId, uiCurTId;	
+  uint8_t uiCurDId, uiCurQId, uiCurTId;
 
-	pCtx->bOnlyOneLayerInCurAuFlag = true;
+  pCtx->bOnlyOneLayerInCurAuFlag = true;
 
-	if ( iEndIdx == iCurIdx ) //only one NAL in pNalUnitsList
-	{
-		return;
-	}
+  if (iEndIdx == iCurIdx) { //only one NAL in pNalUnitsList
+    return;
+  }
 
-	++iCurIdx;
-	while ( iCurIdx <= iEndIdx )
-	{
-		uiCurDId = pCurAu->pNalUnitsList[iCurIdx]->sNalHeaderExt.uiDependencyId;
-		uiCurQId = pCurAu->pNalUnitsList[iCurIdx]->sNalHeaderExt.uiQualityId;
-		uiCurTId = pCurAu->pNalUnitsList[iCurIdx]->sNalHeaderExt.uiTemporalId;
+  ++iCurIdx;
+  while (iCurIdx <= iEndIdx) {
+    uiCurDId = pCurAu->pNalUnitsList[iCurIdx]->sNalHeaderExt.uiDependencyId;
+    uiCurQId = pCurAu->pNalUnitsList[iCurIdx]->sNalHeaderExt.uiQualityId;
+    uiCurTId = pCurAu->pNalUnitsList[iCurIdx]->sNalHeaderExt.uiTemporalId;
 
-		if ( uiDId != uiCurDId || uiQId != uiCurQId || uiTId != uiCurTId )
-		{
-			pCtx->bOnlyOneLayerInCurAuFlag = false;
-			return;
-		}
+    if (uiDId != uiCurDId || uiQId != uiCurQId || uiTId != uiCurTId) {
+      pCtx->bOnlyOneLayerInCurAuFlag = false;
+      return;
+    }
 
-		++iCurIdx;
-	}
+    ++iCurIdx;
+  }
 }
 
-int32_t WelsDecodeAccessUnitStart ( PWelsDecoderContext pCtx )
-{
-	// Roll back NAL units not being belong to current access unit list for proceeded access unit
-	int32_t iRet = UpdateAccessUnit ( pCtx );
-	if ( iRet != ERR_NONE )
-		return iRet;
+int32_t WelsDecodeAccessUnitStart (PWelsDecoderContext pCtx) {
+  // Roll back NAL units not being belong to current access unit list for proceeded access unit
+  int32_t iRet = UpdateAccessUnit (pCtx);
+  if (iRet != ERR_NONE)
+    return iRet;
 
-	pCtx->pAccessUnitList->uiStartPos = 0;
-	if ( !pCtx->bAvcBasedFlag && !CheckIntegrityNalUnitsList( pCtx ) ) 
-	{
-		pCtx->iErrorCode |= dsBitstreamError;
-		return dsBitstreamError;
-	}
+  pCtx->pAccessUnitList->uiStartPos = 0;
+  if (!pCtx->bAvcBasedFlag && !CheckIntegrityNalUnitsList (pCtx)) {
+    pCtx->iErrorCode |= dsBitstreamError;
+    return dsBitstreamError;
+  }
 
-	//check current AU has only one layer or not
-	//If YES, can use deblocking based on AVC
-	if ( !pCtx->bAvcBasedFlag )
-	{
-		CheckOnlyOneLayerInAu( pCtx );
-	}
+  //check current AU has only one layer or not
+  //If YES, can use deblocking based on AVC
+  if (!pCtx->bAvcBasedFlag) {
+    CheckOnlyOneLayerInAu (pCtx);
+  }
 
-	return ERR_NONE;
+  return ERR_NONE;
 }
 
-void_t WelsDecodeAccessUnitEnd ( PWelsDecoderContext pCtx )
-{
-	// uninitialize context of current access unit and rbsp buffer clean
-	ResetCurrentAccessUnit ( pCtx );	
+void_t WelsDecodeAccessUnitEnd (PWelsDecoderContext pCtx) {
+  // uninitialize context of current access unit and rbsp buffer clean
+  ResetCurrentAccessUnit (pCtx);
 }
 
 
-int32_t CheckBSBound(int32_t iWidth, int32_t iHeight, int32_t sliceNum, int32_t ppsId)
-{
-	int32_t iRet = 0;
-	
-#if defined(WIN32)	
-	iRet = ((iWidth == 80) && (iHeight = 45) && (sliceNum < 60));
-	
-#elif defined(MACOS)	
-	iRet = ((iWidth == 80) && (iHeight = 45) && (ppsId < 57));
-	
+int32_t CheckBSBound (int32_t iWidth, int32_t iHeight, int32_t sliceNum, int32_t ppsId) {
+  int32_t iRet = 0; // stays 0 when none of the platform macros below is defined
+  // Returns 1 only when width and height equal the platform's fixed size and any extra slice/PPS limit holds.
+#if defined(WIN32)
+  iRet = ((iWidth == 80) && (iHeight == 45) && (sliceNum < 60)); // '==' needed: '=' overwrote iHeight, always true
+
+#elif defined(MACOS)
+  iRet = ((iWidth == 80) && (iHeight == 45) && (ppsId < 57)); // height is compared, not assigned
+
 #elif defined(ANDROID)
-	iRet = ((iWidth == 40) && (iHeight = 22));
-	
+  iRet = ((iWidth == 40) && (iHeight == 22)); // height is compared, not assigned
+
 #endif
-	
-	return iRet;
-	
+
+  return iRet;
+
 }
 
 
@@ -1694,168 +1584,157 @@
  * return:
  *	0 - success; otherwise returned error_no defined in error_no.h
  */
-int32_t ConstructAccessUnit( PWelsDecoderContext pCtx, uint8_t** ppDst, SBufferInfo *pDstInfo)
-{
-	int32_t iErr;
-	int32_t iWidth;
-	int32_t iHeight;
-	int32_t iStride[2] = { 0 };
+int32_t ConstructAccessUnit (PWelsDecoderContext pCtx, uint8_t** ppDst, SBufferInfo* pDstInfo) {
+  int32_t iErr;
+  int32_t iWidth;
+  int32_t iHeight;
+  int32_t iStride[2] = { 0 };
 
-	PAccessUnit pCurAu = pCtx->pAccessUnitList;
+  PAccessUnit pCurAu = pCtx->pAccessUnitList;
 
-	pCtx->bAuReadyFlag = false;
-    pCtx->bLastHasMmco5 = false;
+  pCtx->bAuReadyFlag = false;
+  pCtx->bLastHasMmco5 = false;
 
-	iErr = WelsDecodeAccessUnitStart( pCtx );
-	GetVclNalTemporalId( pCtx );
-	
-	if ( ERR_NONE != iErr )
-	{
-		ForceResetCurrentAccessUnit( pCtx->pAccessUnitList );
-		pDstInfo->iBufferStatus = 0;
-		return iErr;
-	}
-	
-	pCtx->pSps = pCurAu->pNalUnitsList[pCurAu->uiStartPos]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.pSps;
-	pCtx->pPps = pCurAu->pNalUnitsList[pCurAu->uiStartPos]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.pPps;
-	
-	//try to allocate or relocate DPB memory only when IDR arrival.
-	if ( NAL_UNIT_CODED_SLICE_IDR == pCurAu->pNalUnitsList[pCurAu->uiStartPos]->sNalHeaderExt.sNalUnitHeader.eNalUnitType ||
-		pCurAu->pNalUnitsList[pCurAu->uiStartPos]->sNalHeaderExt.bIdrFlag )
-	{
-		WelsResetRefPic(pCtx); //clear ref pPic when IDR NAL
-		iErr = SyncPictureResolutionExt( pCtx, (pCtx->iMaxWidthInSps+15)>>4, (pCtx->iMaxHeightInSps+15)>>4 );		
+  iErr = WelsDecodeAccessUnitStart (pCtx);
+  GetVclNalTemporalId (pCtx);
 
-		if( ERR_NONE != iErr ){
-            WelsLog(pCtx, WELS_LOG_WARNING, "sync picture resolution ext failed,  the error is %d", iErr);
-			return iErr;
-		}		
-	}
-	
-	
-	pDstInfo->eBufferProperty = (EBufferProperty)pCtx->iDecoderOutputProperty;
-	
-	iErr = DecodeCurrentAccessUnit( pCtx, ppDst, iStride, &iWidth, &iHeight, pDstInfo );
-	
-	WelsDecodeAccessUnitEnd( pCtx );
-	
-	if ( ERR_NONE != iErr )
-	{
-		WelsLog( pCtx, WELS_LOG_INFO, "returned error from decoding:[0x%x]\n", iErr);
-		
-		pDstInfo->iBufferStatus = 0;
-		return iErr;
-	}
-	
-	return 0;
+  if (ERR_NONE != iErr) {
+    ForceResetCurrentAccessUnit (pCtx->pAccessUnitList);
+    pDstInfo->iBufferStatus = 0;
+    return iErr;
+  }
+
+  pCtx->pSps = pCurAu->pNalUnitsList[pCurAu->uiStartPos]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.pSps;
+  pCtx->pPps = pCurAu->pNalUnitsList[pCurAu->uiStartPos]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.pPps;
+
+  // Try to allocate or relocate DPB memory only when an IDR arrives.
+  if (NAL_UNIT_CODED_SLICE_IDR == pCurAu->pNalUnitsList[pCurAu->uiStartPos]->sNalHeaderExt.sNalUnitHeader.eNalUnitType ||
+      pCurAu->pNalUnitsList[pCurAu->uiStartPos]->sNalHeaderExt.bIdrFlag) {
+    WelsResetRefPic (pCtx); //clear ref pPic when IDR NAL
+    iErr = SyncPictureResolutionExt (pCtx, (pCtx->iMaxWidthInSps + 15) >> 4, (pCtx->iMaxHeightInSps + 15) >> 4);
+
+    if (ERR_NONE != iErr) {
+      WelsLog (pCtx, WELS_LOG_WARNING, "sync picture resolution ext failed,  the error is %d", iErr);
+      return iErr;
+    }
+  }
+
+
+  pDstInfo->eBufferProperty = (EBufferProperty)pCtx->iDecoderOutputProperty;
+
+  iErr = DecodeCurrentAccessUnit (pCtx, ppDst, iStride, &iWidth, &iHeight, pDstInfo);
+
+  WelsDecodeAccessUnitEnd (pCtx);
+
+  if (ERR_NONE != iErr) {
+    WelsLog (pCtx, WELS_LOG_INFO, "returned error from decoding:[0x%x]\n", iErr);
+
+    pDstInfo->iBufferStatus = 0;
+    return iErr;
+  }
+
+  return 0;
 }
 
-static inline void_t InitDqLayerInfo( PDqLayer pDqLayer, PLayerInfo pLayerInfo, PNalUnit pNalUnit, PPicture pPicDec )
-{
-	PNalUnitHeaderExt pNalHdrExt    = &pNalUnit->sNalHeaderExt;
-	PSliceHeaderExt pShExt			= &pNalUnit->sNalData.sVclNal.sSliceHeaderExt;
-	PSliceHeader        pSh			= &pShExt->sSliceHeader;
-	const uint8_t kuiQualityId		= pNalHdrExt->uiQualityId;
-	
-	memcpy(&pDqLayer->sLayerInfo, pLayerInfo, sizeof(SLayerInfo));//confirmed_safe_unsafe_usage
-	
-	pDqLayer->pDec		= pPicDec;
-	pDqLayer->iMbWidth	= pSh->iMbWidth;	// MB width of this picture
-	pDqLayer->iMbHeight	= pSh->iMbHeight;// MB height of this picture			
+static inline void_t InitDqLayerInfo (PDqLayer pDqLayer, PLayerInfo pLayerInfo, PNalUnit pNalUnit, PPicture pPicDec) {
+  PNalUnitHeaderExt pNalHdrExt    = &pNalUnit->sNalHeaderExt;
+  PSliceHeaderExt pShExt			= &pNalUnit->sNalData.sVclNal.sSliceHeaderExt;
+  PSliceHeader        pSh			= &pShExt->sSliceHeader;
+  const uint8_t kuiQualityId		= pNalHdrExt->uiQualityId;
 
-	pDqLayer->iSliceIdcBackup = (pSh->iFirstMbInSlice << 7) | (pNalHdrExt->uiDependencyId << 4) | (pNalHdrExt->uiQualityId);
-	
-	/* Common syntax elements across all slices of a DQLayer */			
-	pDqLayer->uiPpsId									= pLayerInfo->pPps->iPpsId;
-	pDqLayer->uiDisableInterLayerDeblockingFilterIdc	= pShExt->uiDisableInterLayerDeblockingFilterIdc;
-	pDqLayer->iInterLayerSliceAlphaC0Offset			    = pShExt->iInterLayerSliceAlphaC0Offset;
-	pDqLayer->iInterLayerSliceBetaOffset				= pShExt->iInterLayerSliceBetaOffset;	
-	pDqLayer->iSliceGroupChangeCycle					= pSh->iSliceGroupChangeCycle;
-	pDqLayer->bStoreRefBasePicFlag					    = pShExt->bStoreRefBasePicFlag;
-	pDqLayer->bTCoeffLevelPredFlag					    = pShExt->bTCoeffLevelPredFlag;
-	pDqLayer->bConstrainedIntraResamplingFlag			= pShExt->bConstrainedIntraResamplingFlag;
-	pDqLayer->uiRefLayerDqId							= pShExt->uiRefLayerDqId;
-	pDqLayer->uiRefLayerChromaPhaseXPlus1Flag		    = pShExt->uiRefLayerChromaPhaseXPlus1Flag;
-	pDqLayer->uiRefLayerChromaPhaseYPlus1				= pShExt->uiRefLayerChromaPhaseYPlus1;
-	//memcpy(&pDqLayer->sScaledRefLayer, &pShExt->sScaledRefLayer, sizeof(SPosOffset));//confirmed_safe_unsafe_usage
-	
-	if ( kuiQualityId == BASE_QUALITY_ID ){
-		pDqLayer->pRefPicListReordering		= &pSh->pRefPicListReordering;
-		pDqLayer->pRefPicMarking		= &pSh->sRefMarking;
-		pDqLayer->pRefPicBaseMarking	= &pShExt->sRefBasePicMarking;
-	}	
-	
-	pDqLayer->uiLayerDqId			= pNalHdrExt->uiLayerDqId;	// dq_id of current layer
-	pDqLayer->bUseRefBasePicFlag	= pNalHdrExt->bUseRefBasePicFlag;
+  memcpy (&pDqLayer->sLayerInfo, pLayerInfo, sizeof (SLayerInfo)); //confirmed_safe_unsafe_usage
+
+  pDqLayer->pDec		= pPicDec;
+  pDqLayer->iMbWidth	= pSh->iMbWidth;	// MB width of this picture
+  pDqLayer->iMbHeight	= pSh->iMbHeight;// MB height of this picture
+
+  pDqLayer->iSliceIdcBackup = (pSh->iFirstMbInSlice << 7) | (pNalHdrExt->uiDependencyId << 4) | (pNalHdrExt->uiQualityId);
+
+  /* Common syntax elements across all slices of a DQLayer */
+  pDqLayer->uiPpsId									= pLayerInfo->pPps->iPpsId;
+  pDqLayer->uiDisableInterLayerDeblockingFilterIdc	= pShExt->uiDisableInterLayerDeblockingFilterIdc;
+  pDqLayer->iInterLayerSliceAlphaC0Offset			    = pShExt->iInterLayerSliceAlphaC0Offset;
+  pDqLayer->iInterLayerSliceBetaOffset				= pShExt->iInterLayerSliceBetaOffset;
+  pDqLayer->iSliceGroupChangeCycle					= pSh->iSliceGroupChangeCycle;
+  pDqLayer->bStoreRefBasePicFlag					    = pShExt->bStoreRefBasePicFlag;
+  pDqLayer->bTCoeffLevelPredFlag					    = pShExt->bTCoeffLevelPredFlag;
+  pDqLayer->bConstrainedIntraResamplingFlag			= pShExt->bConstrainedIntraResamplingFlag;
+  pDqLayer->uiRefLayerDqId							= pShExt->uiRefLayerDqId;
+  pDqLayer->uiRefLayerChromaPhaseXPlus1Flag		    = pShExt->uiRefLayerChromaPhaseXPlus1Flag;
+  pDqLayer->uiRefLayerChromaPhaseYPlus1				= pShExt->uiRefLayerChromaPhaseYPlus1;
+  //memcpy(&pDqLayer->sScaledRefLayer, &pShExt->sScaledRefLayer, sizeof(SPosOffset));//confirmed_safe_unsafe_usage
+
+  if (kuiQualityId == BASE_QUALITY_ID) {
+    pDqLayer->pRefPicListReordering		= &pSh->pRefPicListReordering;
+    pDqLayer->pRefPicMarking		= &pSh->sRefMarking;
+    pDqLayer->pRefPicBaseMarking	= &pShExt->sRefBasePicMarking;
+  }
+
+  pDqLayer->uiLayerDqId			= pNalHdrExt->uiLayerDqId;	// dq_id of current layer
+  pDqLayer->bUseRefBasePicFlag	= pNalHdrExt->bUseRefBasePicFlag;
 }
 
-void_t WelsDqLayerDecodeStart ( PWelsDecoderContext pCtx, PNalUnit pCurNal, PSps pSps, PPps pPps )
-{		
-	SNalUnitHeader *pNalHdr = &pCurNal->sNalHeaderExt.sNalUnitHeader;
-	PSliceHeader pSh = &pCurNal->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader;
-			
-	pCtx->eSliceType			= pSh->eSliceType;	
-	pCtx->pSliceHeader			= pSh;
+void_t WelsDqLayerDecodeStart (PWelsDecoderContext pCtx, PNalUnit pCurNal, PSps pSps, PPps pPps) {
+  SNalUnitHeader* pNalHdr = &pCurNal->sNalHeaderExt.sNalUnitHeader;
+  PSliceHeader pSh = &pCurNal->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader;
 
-	pCtx->iFrameNum			= pSh->iFrameNum;
+  pCtx->eSliceType			= pSh->eSliceType;
+  pCtx->pSliceHeader			= pSh;
 
-	if ((pNalHdr->eNalUnitType == NAL_UNIT_CODED_SLICE_IDR || 
-		(pCurNal->sNalHeaderExt.uiQualityId == BASE_QUALITY_ID && pCurNal->sNalHeaderExt.bIdrFlag)) &&
-		 pSh->iFrameNum == 0) //pSh->iFrameNum == 0 this condition can deleted??????
-	{
-		WelsResetRefPic ( pCtx );	// Reset decoded picture buffer lists due to an IDR frame incomes
-	}
+  pCtx->iFrameNum			= pSh->iFrameNum;
+
+  if ((pNalHdr->eNalUnitType == NAL_UNIT_CODED_SLICE_IDR ||
+       (pCurNal->sNalHeaderExt.uiQualityId == BASE_QUALITY_ID && pCurNal->sNalHeaderExt.bIdrFlag)) &&
+      pSh->iFrameNum == 0) { //pSh->iFrameNum == 0 this condition can deleted??????
+    WelsResetRefPic (pCtx);	// Reset decoded picture buffer lists due to an IDR frame incomes
+  }
 }
 
-int32_t InitRefPicList ( PWelsDecoderContext pCtx, const uint8_t kuiNRi, const bool_t kbFirstSlice, int32_t iPoc)
-{
-	int32_t iRet = ERR_NONE;
-    if( kbFirstSlice)
-        iRet = WelsInitRefList( pCtx, iPoc );
-	if ( (pCtx->eSliceType!=I_SLICE && pCtx->eSliceType!=SI_SLICE) && kbFirstSlice ){
-		iRet = WelsReorderRefList ( pCtx );
-	}
-	
-	return iRet;
+int32_t InitRefPicList (PWelsDecoderContext pCtx, const uint8_t kuiNRi, const bool_t kbFirstSlice, int32_t iPoc) {
+  int32_t iRet = ERR_NONE; // returned unchanged when kbFirstSlice is false
+  if (kbFirstSlice)
+    iRet = WelsInitRefList (pCtx, iPoc);
+  if ((pCtx->eSliceType != I_SLICE && pCtx->eSliceType != SI_SLICE) && kbFirstSlice) { // first non-I/SI slice only
+    iRet = WelsReorderRefList (pCtx);
+  }
+  // NOTE(review): a WelsInitRefList() result is overwritten when the reorder above runs; kuiNRi is unused - confirm.
+  return iRet;
 }
 
-void_t InitCurDqLayerData( PWelsDecoderContext pCtx, PDqLayer pCurDq )
-{
-	if ( NULL != pCtx && NULL != pCurDq )
-	{
-		pCurDq->pCsData[0]		= pCtx->pCsListXchg[0][0];
-		pCurDq->pCsData[1]		= pCtx->pCsListXchg[0][1];
-		pCurDq->pCsData[2]		= pCtx->pCsListXchg[0][2];
-		pCurDq->iCsStride[0]	= pCtx->iCsStride[0];
-		pCurDq->iCsStride[1]	= pCtx->iCsStride[1];
-		pCurDq->iCsStride[2]	= pCtx->iCsStride[2];
-	
-		pCurDq->pMbType			= pCtx->sMb.pMbType[0];
-		pCurDq->pSliceIdc		= pCtx->sMb.pSliceIdc[0];
-		pCurDq->pMv[0]			= pCtx->sMb.pMv[0][0];
-		pCurDq->pRefIndex[0]    = pCtx->sMb.pRefIndex[0][0];
-		pCurDq->pLumaQp         = pCtx->sMb.pLumaQp[0];
-		pCurDq->pChromaQp       = pCtx->sMb.pChromaQp[0];
-		pCurDq->pNzc			= pCtx->sMb.pNzc[0];
-		pCurDq->pNzcRs			= pCtx->sMb.pNzcRs[0];
-		pCurDq->pScaledTCoeff   = pCtx->sMb.pScaledTCoeff[0];
-		pCurDq->pIntraPredMode  = pCtx->sMb.pIntraPredMode[0];
-		pCurDq->pIntra4x4FinalMode = pCtx->sMb.pIntra4x4FinalMode[0];
-		pCurDq->pChromaPredMode = pCtx->sMb.pChromaPredMode[0];
-		pCurDq->pCbp            = pCtx->sMb.pCbp[0];
-		pCurDq->pSubMbType      = pCtx->sMb.pSubMbType[0];
-		pCurDq->pInterPredictionDoneFlag = pCtx->sMb.pInterPredictionDoneFlag[0];
-		pCurDq->pResidualPredFlag = pCtx->sMb.pResidualPredFlag[0];
-	}
+void_t InitCurDqLayerData (PWelsDecoderContext pCtx, PDqLayer pCurDq) {
+  if (NULL != pCtx && NULL != pCurDq) {
+    pCurDq->pCsData[0]		= pCtx->pCsListXchg[0][0];
+    pCurDq->pCsData[1]		= pCtx->pCsListXchg[0][1];
+    pCurDq->pCsData[2]		= pCtx->pCsListXchg[0][2];
+    pCurDq->iCsStride[0]	= pCtx->iCsStride[0];
+    pCurDq->iCsStride[1]	= pCtx->iCsStride[1];
+    pCurDq->iCsStride[2]	= pCtx->iCsStride[2];
+
+    pCurDq->pMbType			= pCtx->sMb.pMbType[0];
+    pCurDq->pSliceIdc		= pCtx->sMb.pSliceIdc[0];
+    pCurDq->pMv[0]			= pCtx->sMb.pMv[0][0];
+    pCurDq->pRefIndex[0]    = pCtx->sMb.pRefIndex[0][0];
+    pCurDq->pLumaQp         = pCtx->sMb.pLumaQp[0];
+    pCurDq->pChromaQp       = pCtx->sMb.pChromaQp[0];
+    pCurDq->pNzc			= pCtx->sMb.pNzc[0];
+    pCurDq->pNzcRs			= pCtx->sMb.pNzcRs[0];
+    pCurDq->pScaledTCoeff   = pCtx->sMb.pScaledTCoeff[0];
+    pCurDq->pIntraPredMode  = pCtx->sMb.pIntraPredMode[0];
+    pCurDq->pIntra4x4FinalMode = pCtx->sMb.pIntra4x4FinalMode[0];
+    pCurDq->pChromaPredMode = pCtx->sMb.pChromaPredMode[0];
+    pCurDq->pCbp            = pCtx->sMb.pCbp[0];
+    pCurDq->pSubMbType      = pCtx->sMb.pSubMbType[0];
+    pCurDq->pInterPredictionDoneFlag = pCtx->sMb.pInterPredictionDoneFlag[0];
+    pCurDq->pResidualPredFlag = pCtx->sMb.pResidualPredFlag[0];
+  }
 }
 
 // added to reset state of parameter sets to waiting successive incoming IDR, 6/4/2010
 // It will be called in case packets lost/ broken and decoded failed at temporal level 0
-void_t ResetParameterSetsState( PWelsDecoderContext pCtx )
-{
-	pCtx->bSpsExistAheadFlag	   = false;
-	pCtx->bSubspsExistAheadFlag = false;
-	pCtx->bPpsExistAheadFlag	   = false;
+void_t ResetParameterSetsState (PWelsDecoderContext pCtx) {
+  pCtx->bSpsExistAheadFlag	   = false;
+  pCtx->bSubspsExistAheadFlag = false;
+  pCtx->bPpsExistAheadFlag	   = false;
 }
 
 /*
@@ -1862,245 +1741,242 @@
  * DecodeCurrentAccessUnit
  * Decode current access unit when current AU is completed.
  */
-int32_t DecodeCurrentAccessUnit( PWelsDecoderContext pCtx, uint8_t **ppDst, int32_t *pDstLen, int32_t *pWidth, int32_t *pHeight, SBufferInfo *pDstInfo )
-{	
-	int32_t iRefCount[LIST_A];	
-	PNalUnit pNalCur = NULL;
-	PAccessUnit pCurAu = pCtx->pAccessUnitList;
+int32_t DecodeCurrentAccessUnit (PWelsDecoderContext pCtx, uint8_t** ppDst, int32_t* pDstLen, int32_t* pWidth,
+                                 int32_t* pHeight, SBufferInfo* pDstInfo) {
+  int32_t iRefCount[LIST_A];
+  PNalUnit pNalCur = NULL;
+  PAccessUnit pCurAu = pCtx->pAccessUnitList;
 
-	int32_t iIdx = pCurAu->uiStartPos;
-	int32_t iEndIdx = pCurAu->uiEndPos;
-	
-	int32_t iPpsId = 0;
-	int32_t iRet = ERR_NONE;
+  int32_t iIdx = pCurAu->uiStartPos;
+  int32_t iEndIdx = pCurAu->uiEndPos;
 
-	const uint8_t kuiTargetLayerDqId = GetTargetDqId(pCtx->uiTargetDqId, pCtx->pParam); 
-	const uint8_t kuiDependencyIdMax = (kuiTargetLayerDqId & 0x7F) >> 4;
-	int16_t iLastIdD = -1, iLastIdQ = -1;
-	int16_t iCurrIdD = 0, iCurrIdQ = 0;
-	uint8_t uiNalRefIdc = 0;
-	bool_t	bFreshSliceAvailable = true;	// Another fresh slice comingup for given dq layer, for multiple slices in case of header parts of slices sometimes loss over error-prone channels, 8/14/2008
-	PPicture  pStoreBasePic = NULL;	
+  int32_t iPpsId = 0;
+  int32_t iRet = ERR_NONE;
 
-	//update pCurDqLayer at the starting of AU decoding
-	if ( pCtx->bInitialDqLayersMem )
-	{		
-		pCtx->pCurDqLayer				= pCtx->pDqLayersList[0];
-	}
+  const uint8_t kuiTargetLayerDqId = GetTargetDqId (pCtx->uiTargetDqId, pCtx->pParam);
+  const uint8_t kuiDependencyIdMax = (kuiTargetLayerDqId & 0x7F) >> 4;
+  int16_t iLastIdD = -1, iLastIdQ = -1;
+  int16_t iCurrIdD = 0, iCurrIdQ = 0;
+  uint8_t uiNalRefIdc = 0;
+  bool_t	bFreshSliceAvailable =
+    true;	// Another fresh slice comingup for given dq layer, for multiple slices in case of header parts of slices sometimes loss over error-prone channels, 8/14/2008
+  PPicture  pStoreBasePic = NULL;
 
-	InitCurDqLayerData( pCtx, pCtx->pCurDqLayer );
+  // Update pCurDqLayer at the start of AU decoding
+  if (pCtx->bInitialDqLayersMem) {
+    pCtx->pCurDqLayer				= pCtx->pDqLayersList[0];
+  }
 
-	pNalCur = pCurAu->pNalUnitsList[iIdx];	
-	while ( iIdx <= iEndIdx )
-	{
-		PDqLayer dq_cur							= pCtx->pCurDqLayer;
-		SLayerInfo pLayerInfo;
-		PSliceHeaderExt pShExt					= NULL;
-		PSliceHeader pSh							= NULL;		
-	
-		if( pCtx->pDec == NULL ){
-			pCtx->pDec = PrefetchPic(pCtx->pPicBuff[0]);
+  InitCurDqLayerData (pCtx, pCtx->pCurDqLayer);
 
-			if( NULL == pCtx->pDec ){
-				WelsLog( pCtx, WELS_LOG_ERROR, "DecodeCurrentAccessUnit()::::::PrefetchPic ERROR, pSps->iNumRefFrames:%d.\n", 
-					pCtx->pSps->iNumRefFrames );
-				pCtx->iErrorCode |= dsOutOfMemory;
-				return ERR_INFO_REF_COUNT_OVERFLOW;
-			}
-		}
+  pNalCur = pCurAu->pNalUnitsList[iIdx];
+  while (iIdx <= iEndIdx) {
+    PDqLayer dq_cur							= pCtx->pCurDqLayer;
+    SLayerInfo pLayerInfo;
+    PSliceHeaderExt pShExt					= NULL;
+    PSliceHeader pSh							= NULL;
 
+    if (pCtx->pDec == NULL) {
+      pCtx->pDec = PrefetchPic (pCtx->pPicBuff[0]);
+
+      if (NULL == pCtx->pDec) {
+        WelsLog (pCtx, WELS_LOG_ERROR, "DecodeCurrentAccessUnit()::::::PrefetchPic ERROR, pSps->iNumRefFrames:%d.\n",
+                 pCtx->pSps->iNumRefFrames);
+        pCtx->iErrorCode |= dsOutOfMemory;
+        return ERR_INFO_REF_COUNT_OVERFLOW;
+      }
+    }
+
 #ifdef NO_WAITING_AU
-		//For fixing the nal lossing issue
-		if ((pCtx->pDec->iTotalNumMbRec != 0)&&
-			(CheckAccessUnitBoundaryExt(&pCtx->sLastNalHdrExt, &pNalCur->sNalHeaderExt, &pCtx->sLastSliceHeader,  &pNalCur->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader)))
-		{
-             pCtx->pDec->iTotalNumMbRec = 0;	
-        }
+    // Fix for the NAL loss issue
+    if ((pCtx->pDec->iTotalNumMbRec != 0) &&
+        (CheckAccessUnitBoundaryExt (&pCtx->sLastNalHdrExt, &pNalCur->sNalHeaderExt, &pCtx->sLastSliceHeader,
+                                     &pNalCur->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader))) {
+      pCtx->pDec->iTotalNumMbRec = 0;
+    }
 #else
-		//initialize at the starting of AU.
-		pCtx->pDec->iTotalNumMbRec = 0;			
+    // Initialize at the start of the AU.
+    pCtx->pDec->iTotalNumMbRec = 0;
 #endif
-        if(pCtx->pDec->iTotalNumMbRec == 0) //Picture start to decode
-        {
-            for( int32_t i = 0; i < LAYER_NUM_EXCHANGEABLE; ++ i)
-                memset(pCtx->sMb.pSliceIdc[i], 0xff, (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof(int32_t)) );
-        }
-		GetI4LumaIChromaAddrTable(pCtx->iDecBlockOffsetArray, pCtx->pDec->iLinesize[0], pCtx->pDec->iLinesize[1]);
+    if (pCtx->pDec->iTotalNumMbRec == 0) { //Picture start to decode
+      for (int32_t i = 0; i < LAYER_NUM_EXCHANGEABLE; ++ i)
+        memset (pCtx->sMb.pSliceIdc[i], 0xff, (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int32_t)));
+    }
+    GetI4LumaIChromaAddrTable (pCtx->iDecBlockOffsetArray, pCtx->pDec->iLinesize[0], pCtx->pDec->iLinesize[1]);
 
-		if ( pNalCur->sNalHeaderExt.uiLayerDqId > kuiTargetLayerDqId ) {
-			break;	// Per formance it need not to decode the remaining bits any more due to given uiLayerDqId required, 9/2/2009
-		}
+    if (pNalCur->sNalHeaderExt.uiLayerDqId > kuiTargetLayerDqId) {
+      break;	// For performance, the remaining bits need not be decoded because only the given uiLayerDqId is required, 9/2/2009
+    }
 
-		memset(&pLayerInfo, 0, sizeof(SLayerInfo));
-		
-		/*
-		 *	Loop decoding for slices (even FMO and/ multiple slices) within a dq layer
-		 */
-		while ( iIdx <= iEndIdx )
-		{	
-		    BOOL_T         bReconstructSlice;
-			iCurrIdQ	= pNalCur->sNalHeaderExt.uiQualityId;
-			iCurrIdD	= pNalCur->sNalHeaderExt.uiDependencyId;
-			pSh		= &pNalCur->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader;
-			pShExt	= &pNalCur->sNalData.sVclNal.sSliceHeaderExt;
+    memset (&pLayerInfo, 0, sizeof (SLayerInfo));
 
-			bReconstructSlice = CheckSliceNeedReconstruct(iCurrIdD, iCurrIdQ, pShExt->bStoreRefBasePicFlag, 
-				kuiDependencyIdMax, pNalCur->sNalHeaderExt.uiLayerDqId, kuiTargetLayerDqId);
+    /*
+     *	Loop decoding for slices (even FMO and/or multiple slices) within a dq layer
+     */
+    while (iIdx <= iEndIdx) {
+      BOOL_T         bReconstructSlice;
+      iCurrIdQ	= pNalCur->sNalHeaderExt.uiQualityId;
+      iCurrIdD	= pNalCur->sNalHeaderExt.uiDependencyId;
+      pSh		= &pNalCur->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader;
+      pShExt	= &pNalCur->sNalData.sVclNal.sSliceHeaderExt;
 
-			memcpy(&pLayerInfo.sNalHeaderExt, &pNalCur->sNalHeaderExt, sizeof(SNalUnitHeaderExt));//confirmed_safe_unsafe_usage
+      bReconstructSlice = CheckSliceNeedReconstruct (iCurrIdD, iCurrIdQ, pShExt->bStoreRefBasePicFlag,
+                          kuiDependencyIdMax, pNalCur->sNalHeaderExt.uiLayerDqId, kuiTargetLayerDqId);
 
-			pCtx->pDec->iFrameNum = pSh->iFrameNum;		
+      memcpy (&pLayerInfo.sNalHeaderExt, &pNalCur->sNalHeaderExt, sizeof (SNalUnitHeaderExt)); //confirmed_safe_unsafe_usage
 
-			memcpy(&pLayerInfo.sSliceInLayer.sSliceHeaderExt, pShExt, sizeof(SSliceHeaderExt));//confirmed_safe_unsafe_usage
-			pLayerInfo.sSliceInLayer.bSliceHeaderExtFlag	= pNalCur->sNalData.sVclNal.bSliceHeaderExtFlag;
-			pLayerInfo.sSliceInLayer.eSliceType			= pSh->eSliceType;
-			pLayerInfo.sSliceInLayer.iLastMbQp			= pSh->iSliceQp;
-				dq_cur->pBitStringAux	= &pNalCur->sNalData.sVclNal.sSliceBitsRead;
-			
-			uiNalRefIdc	= pNalCur->sNalHeaderExt.sNalUnitHeader.uiNalRefIdc;	
+      pCtx->pDec->iFrameNum = pSh->iFrameNum;
 
-			iPpsId	= pSh->iPpsId;
+      memcpy (&pLayerInfo.sSliceInLayer.sSliceHeaderExt, pShExt, sizeof (SSliceHeaderExt)); //confirmed_safe_unsafe_usage
+      pLayerInfo.sSliceInLayer.bSliceHeaderExtFlag	= pNalCur->sNalData.sVclNal.bSliceHeaderExtFlag;
+      pLayerInfo.sSliceInLayer.eSliceType			= pSh->eSliceType;
+      pLayerInfo.sSliceInLayer.iLastMbQp			= pSh->iSliceQp;
+      dq_cur->pBitStringAux	= &pNalCur->sNalData.sVclNal.sSliceBitsRead;
 
-			pLayerInfo.pPps = pSh->pPps;
-			pLayerInfo.pSps = pSh->pSps;
-			pLayerInfo.pSubsetSps = pShExt->pSubsetSps;				
+      uiNalRefIdc	= pNalCur->sNalHeaderExt.sNalUnitHeader.uiNalRefIdc;
 
-			pCtx->pFmo = &pCtx->sFmoList[iPpsId];
-			if ( !FmoParamUpdate( pCtx->pFmo, pLayerInfo.pSps, pLayerInfo.pPps, &pCtx->iActiveFmoNum ) ) {
-				pCtx->iErrorCode |= dsBitstreamError;
-				WelsLog( pCtx, WELS_LOG_WARNING, "DecodeCurrentAccessUnit(), FmoParamUpdate failed, eSliceType: %d.\n", pSh->eSliceType);
-				return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_FMO_INIT_FAIL);
-			}
+      iPpsId	= pSh->iPpsId;
 
-			bFreshSliceAvailable	= (iCurrIdD != iLastIdD || iCurrIdQ != iLastIdQ);	// do not need condition of (first_mb == 0) due multiple slices might be disorder
-			
-			WelsDqLayerDecodeStart ( pCtx, pNalCur, pLayerInfo.pSps, pLayerInfo.pPps );
+      pLayerInfo.pPps = pSh->pPps;
+      pLayerInfo.pSps = pSh->pSps;
+      pLayerInfo.pSubsetSps = pShExt->pSubsetSps;
 
-			if ( iCurrIdQ == BASE_QUALITY_ID )
-			{
-				ST64(iRefCount, LD64(pLayerInfo.sSliceInLayer.sSliceHeaderExt.sSliceHeader.uiRefCount));
-			}
-			
-			if ( (iLastIdD < 0) || //case 1: first layer
-				( iLastIdD == iCurrIdD) ) //case 2: same uiDId
-			{
-				InitDqLayerInfo( dq_cur, &pLayerInfo, pNalCur, pCtx->pDec );
+      pCtx->pFmo = &pCtx->sFmoList[iPpsId];
+      if (!FmoParamUpdate (pCtx->pFmo, pLayerInfo.pSps, pLayerInfo.pPps, &pCtx->iActiveFmoNum)) {
+        pCtx->iErrorCode |= dsBitstreamError;
+        WelsLog (pCtx, WELS_LOG_WARNING, "DecodeCurrentAccessUnit(), FmoParamUpdate failed, eSliceType: %d.\n",
+                 pSh->eSliceType);
+        return GENERATE_ERROR_NO (ERR_LEVEL_SLICE_HEADER, ERR_INFO_FMO_INIT_FAIL);
+      }
 
-				if ( !dq_cur->sLayerInfo.pSps->bGapsInFrameNumValueAllowedFlag )
-				{
-					const bool_t kbIdrFlag = dq_cur->sLayerInfo.sNalHeaderExt.bIdrFlag || (dq_cur->sLayerInfo.sNalHeaderExt.sNalUnitHeader.eNalUnitType == NAL_UNIT_CODED_SLICE_IDR);
-					// Subclause 8.2.5.2 Decoding process for gaps in frame_num
-					if (	!kbIdrFlag  && 
-						pSh->iFrameNum != pCtx->iPrevFrameNum &&
-						pSh->iFrameNum != ((pCtx->iPrevFrameNum+1) & ((1<<dq_cur->sLayerInfo.pSps->uiLog2MaxFrameNum)-1))	)
-					{
-						WelsLog( pCtx, WELS_LOG_WARNING, "referencing pictures lost due frame gaps exist, prev_frame_num: %d, curr_frame_num: %d\n", pCtx->iPrevFrameNum, pSh->iFrameNum);
+      bFreshSliceAvailable	= (iCurrIdD != iLastIdD
+                               || iCurrIdQ != iLastIdQ);	// do not need condition of (first_mb == 0) due multiple slices might be disorder
 
+      WelsDqLayerDecodeStart (pCtx, pNalCur, pLayerInfo.pSps, pLayerInfo.pPps);
+
+      if (iCurrIdQ == BASE_QUALITY_ID) {
+        ST64 (iRefCount, LD64 (pLayerInfo.sSliceInLayer.sSliceHeaderExt.sSliceHeader.uiRefCount));
+      }
+
+      if ((iLastIdD < 0) ||  //case 1: first layer
+          (iLastIdD == iCurrIdD)) { //case 2: same uiDId
+        InitDqLayerInfo (dq_cur, &pLayerInfo, pNalCur, pCtx->pDec);
+
+        if (!dq_cur->sLayerInfo.pSps->bGapsInFrameNumValueAllowedFlag) {
+          const bool_t kbIdrFlag = dq_cur->sLayerInfo.sNalHeaderExt.bIdrFlag
+                                   || (dq_cur->sLayerInfo.sNalHeaderExt.sNalUnitHeader.eNalUnitType == NAL_UNIT_CODED_SLICE_IDR);
+          // Subclause 8.2.5.2 Decoding process for gaps in frame_num
+          if (!kbIdrFlag  &&
+              pSh->iFrameNum != pCtx->iPrevFrameNum &&
+              pSh->iFrameNum != ((pCtx->iPrevFrameNum + 1) & ((1 << dq_cur->sLayerInfo.pSps->uiLog2MaxFrameNum) - 1))) {
+            WelsLog (pCtx, WELS_LOG_WARNING,
+                     "referencing pictures lost due frame gaps exist, prev_frame_num: %d, curr_frame_num: %d\n", pCtx->iPrevFrameNum,
+                     pSh->iFrameNum);
+
 #ifdef LONG_TERM_REF
-						pCtx->bParamSetsLostFlag = true;
+            pCtx->bParamSetsLostFlag = true;
 #else
-						pCtx->bReferenceLostAtT0Flag = true;
+            pCtx->bReferenceLostAtT0Flag = true;
 #endif
-						ResetParameterSetsState( pCtx );				
+            ResetParameterSetsState (pCtx);
 
-						pCtx->iErrorCode |= dsRefLost;
-						return ERR_INFO_REFERENCE_PIC_LOST;
-					}
-				}
+            pCtx->iErrorCode |= dsRefLost;
+            return ERR_INFO_REFERENCE_PIC_LOST;
+          }
+        }
 
-				if ( iCurrIdD == kuiDependencyIdMax && iCurrIdQ == BASE_QUALITY_ID )
-				{
-					iRet = InitRefPicList ( pCtx, uiNalRefIdc, bFreshSliceAvailable, pSh->iPicOrderCntLsb);
-					if ( iRet )
-					{
-						HandleReferenceLost(pCtx, pNalCur);
-						WelsLog( pCtx, WELS_LOG_WARNING, "reference picture introduced by this frame is lost during transmission! uiTId: %d\n", pNalCur->sNalHeaderExt.uiTemporalId );
-						return iRet;
-					}
-				}
+        if (iCurrIdD == kuiDependencyIdMax && iCurrIdQ == BASE_QUALITY_ID) {
+          iRet = InitRefPicList (pCtx, uiNalRefIdc, bFreshSliceAvailable, pSh->iPicOrderCntLsb);
+          if (iRet) {
+            HandleReferenceLost (pCtx, pNalCur);
+            WelsLog (pCtx, WELS_LOG_WARNING, "reference picture introduced by this frame is lost during transmission! uiTId: %d\n",
+                     pNalCur->sNalHeaderExt.uiTemporalId);
+            return iRet;
+          }
+        }
 
-				iRet = WelsDecodeSlice ( pCtx, bFreshSliceAvailable, pNalCur );
+        iRet = WelsDecodeSlice (pCtx, bFreshSliceAvailable, pNalCur);
 
-				//Output good store_base reconstruction when enhancement quality layer occurred error for MGS key picture case
-				if ( iRet != ERR_NONE )
-				{
-					WelsLog( pCtx, WELS_LOG_WARNING, "DecodeCurrentAccessUnit() failed (%d) in frame: %d uiDId: %d uiQId: %d\n",
-						iRet, pSh->iFrameNum, iCurrIdD, iCurrIdQ);
-					HandleReferenceLostL0(pCtx, pNalCur);
-					return iRet;
-				}
-				if( bReconstructSlice )	{					
-					if( WelsDecodeConstructSlice(pCtx, pNalCur) ){
-						return -1;
-					}
-				}				
-			}
+        // Output the good store_base reconstruction when an error occurred in the enhancement quality layer (MGS key picture case)
+        if (iRet != ERR_NONE) {
+          WelsLog (pCtx, WELS_LOG_WARNING, "DecodeCurrentAccessUnit() failed (%d) in frame: %d uiDId: %d uiQId: %d\n",
+                   iRet, pSh->iFrameNum, iCurrIdD, iCurrIdQ);
+          HandleReferenceLostL0 (pCtx, pNalCur);
+          return iRet;
+        }
+        if (bReconstructSlice)	{
+          if (WelsDecodeConstructSlice (pCtx, pNalCur)) {
+            return -1;
+          }
+        }
+      }
 #if defined (_DEBUG) &&  !defined (CODEC_FOR_TESTBED)
-			fprintf( stderr, "cur_frame : %d	iCurrIdD : %d\n ", 
-				dq_cur->sLayerInfo.sSliceInLayer.sSliceHeaderExt.sSliceHeader.iFrameNum, iCurrIdD );
+      fprintf (stderr, "cur_frame : %d	iCurrIdD : %d\n ",
+               dq_cur->sLayerInfo.sSliceInLayer.sSliceHeaderExt.sSliceHeader.iFrameNum, iCurrIdD);
 #endif//#if !CODEC_FOR_TESTBED
-			iLastIdD	= iCurrIdD;
-			iLastIdQ	= iCurrIdQ;		
-	
-			//pNalUnitsList overflow.
-			++ iIdx;
-			if (iIdx <= iEndIdx)
-			{				
-				pNalCur	= pCurAu->pNalUnitsList[iIdx];
-			}
-			else
-			{
-				pNalCur	= NULL;
-			}
+      iLastIdD	= iCurrIdD;
+      iLastIdQ	= iCurrIdQ;
 
-			if ( pNalCur == NULL ||
-				iLastIdD != pNalCur->sNalHeaderExt.uiDependencyId || 
-				iLastIdQ != pNalCur->sNalHeaderExt.uiQualityId )
-				break;
-		} 
+      // Advance to the next NAL unit without reading past the end of pNalUnitsList.
+      ++ iIdx;
+      if (iIdx <= iEndIdx) {
+        pNalCur	= pCurAu->pNalUnitsList[iIdx];
+      } else {
+        pNalCur	= NULL;
+      }
 
-		// A dq layer decoded here
+      if (pNalCur == NULL ||
+          iLastIdD != pNalCur->sNalHeaderExt.uiDependencyId ||
+          iLastIdQ != pNalCur->sNalHeaderExt.uiQualityId)
+        break;
+    }
+
+    // A dq layer decoded here
 #if defined (_DEBUG) &&  !defined (CODEC_FOR_TESTBED)
 #undef fprintf
-		fprintf(stderr, "POC: #%d, FRAME: #%d, D: %d, Q: %d, T: %d, P: %d,	%d\n",
-			pSh->iPicOrderCntLsb, pSh->iFrameNum, iCurrIdD, iCurrIdQ, dq_cur->sLayerInfo.sNalHeaderExt.uiTemporalId, dq_cur->sLayerInfo.sNalHeaderExt.uiPriorityId,dq_cur->sLayerInfo.sSliceInLayer.sSliceHeaderExt.sSliceHeader.iSliceQp);
+    fprintf (stderr, "POC: #%d, FRAME: #%d, D: %d, Q: %d, T: %d, P: %d,	%d\n",
+             pSh->iPicOrderCntLsb, pSh->iFrameNum, iCurrIdD, iCurrIdQ, dq_cur->sLayerInfo.sNalHeaderExt.uiTemporalId,
+             dq_cur->sLayerInfo.sNalHeaderExt.uiPriorityId, dq_cur->sLayerInfo.sSliceInLayer.sSliceHeaderExt.sSliceHeader.iSliceQp);
 #endif//#if !CODEC_FOR_TESTBED
 
-        if( dq_cur->uiLayerDqId == kuiTargetLayerDqId ){
-		    if( DecodeFrameConstruction( pCtx, ppDst, pDstLen, pWidth, pHeight, pDstInfo) ){
+    if (dq_cur->uiLayerDqId == kuiTargetLayerDqId) {
+      if (DecodeFrameConstruction (pCtx, ppDst, pDstLen, pWidth, pHeight, pDstInfo)) {
 #ifdef NO_WAITING_AU
-                memcpy(&pCtx->sLastNalHdrExt, &pCurAu->pNalUnitsList[iIdx-1]->sNalHeaderExt, sizeof(SNalUnitHeaderExt));
-                memcpy(&pCtx->sLastSliceHeader, &pCurAu->pNalUnitsList[iIdx-1]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader, sizeof(SSliceHeader));
-				return ERR_NONE;
+        memcpy (&pCtx->sLastNalHdrExt, &pCurAu->pNalUnitsList[iIdx - 1]->sNalHeaderExt, sizeof (SNalUnitHeaderExt));
+        memcpy (&pCtx->sLastSliceHeader, &pCurAu->pNalUnitsList[iIdx - 1]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader,
+                sizeof (SSliceHeader));
+        return ERR_NONE;
 #else
-				pCtx->iErrorCode |= dsBitstreamError;
-				return -1;
+        pCtx->iErrorCode |= dsBitstreamError;
+        return -1;
 #endif
-				
-		    }
-			if( (uiNalRefIdc > 0) && ( iCurrIdQ || (!dq_cur->bStoreRefBasePicFlag) ) ){
-				WelsMarkAsRef(pCtx, false);
-                ExpandReferencingPicture(pCtx->pDec, pCtx->sExpandPicFunc.pExpandLumaPicture, pCtx->sExpandPicFunc.pExpandChromaPicture);
-				pCtx->pDec = NULL;
-			}
-        }
 
-		if( (iCurrIdD == kuiDependencyIdMax) && (iCurrIdQ == BASE_QUALITY_ID) && (dq_cur->bStoreRefBasePicFlag) ){
-			pStoreBasePic = pCtx->pDec;
+      }
+      if ((uiNalRefIdc > 0) && (iCurrIdQ || (!dq_cur->bStoreRefBasePicFlag))) {
+        WelsMarkAsRef (pCtx, false);
+        ExpandReferencingPicture (pCtx->pDec, pCtx->sExpandPicFunc.pExpandLumaPicture,
+                                  pCtx->sExpandPicFunc.pExpandChromaPicture);
+        pCtx->pDec = NULL;
+      }
+    }
 
-			if( uiNalRefIdc > 0 ){
-				WelsMarkAsRef(pCtx, true);
-                ExpandReferencingPicture(pCtx->pDec, pCtx->sExpandPicFunc.pExpandLumaPicture, pCtx->sExpandPicFunc.pExpandChromaPicture);
-				pCtx->pDec = NULL;
-			}
-		}		
-		// need update frame_num due current frame is well decoded
-		pCtx->iPrevFrameNum	= pSh->iFrameNum;
-        if( pCtx->bLastHasMmco5 )   
-            pCtx->iPrevFrameNum = 0;
-	} 
+    if ((iCurrIdD == kuiDependencyIdMax) && (iCurrIdQ == BASE_QUALITY_ID) && (dq_cur->bStoreRefBasePicFlag)) {
+      pStoreBasePic = pCtx->pDec;
 
-	return ERR_NONE;
+      if (uiNalRefIdc > 0) {
+        WelsMarkAsRef (pCtx, true);
+        ExpandReferencingPicture (pCtx->pDec, pCtx->sExpandPicFunc.pExpandLumaPicture,
+                                  pCtx->sExpandPicFunc.pExpandChromaPicture);
+        pCtx->pDec = NULL;
+      }
+    }
+    // Update the previous frame_num now that the current frame has been decoded successfully
+    pCtx->iPrevFrameNum	= pSh->iFrameNum;
+    if (pCtx->bLastHasMmco5)
+      pCtx->iPrevFrameNum = 0;
+  }
+
+  return ERR_NONE;
 }
 
 } // namespace WelsDec
\ No newline at end of file
--- a/codec/decoder/core/src/decoder_data_tables.cpp
+++ b/codec/decoder/core/src/decoder_data_tables.cpp
@@ -43,186 +43,180 @@
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
 //////non_zero_count[16+8] mapping scan index
-const uint8_t g_kuiMbNonZeroCountIdx[24] =
-{                     //  0   1 | 4  5      luma 8*8 block           non_zero_count[16+8] 
-	0,  1,  4,  5,   //  2   3 | 6  7        0  |  1                  0   1   2   3 
-	2,  3,  6,  7,   //---------------      ---------                 4   5   6   7 
-	8,  9, 12, 13,   //  8   9 | 12 13       2  |  3                  8   9  10  11 
-	10, 11, 14, 15,   // 10  11 | 14 15-----------------------------> 12  13  14  15 
-	16, 17, 20, 21,   //----------------    chroma 8*8 block          16  17  18  19  
-	18, 19, 22, 23   // 16  17 | 20 21        0    1                 20  21  22  23 
+const uint8_t g_kuiMbNonZeroCountIdx[24] = {
+  //  0   1 | 4  5      luma 8*8 block           non_zero_count[16+8]
+  0,  1,  4,  5,   //  2   3 | 6  7        0  |  1                  0   1   2   3
+  2,  3,  6,  7,   //---------------      ---------                 4   5   6   7
+  8,  9, 12, 13,   //  8   9 | 12 13       2  |  3                  8   9  10  11
+  10, 11, 14, 15,   // 10  11 | 14 15-----------------------------> 12  13  14  15
+  16, 17, 20, 21,   //----------------    chroma 8*8 block          16  17  18  19
+  18, 19, 22, 23   // 16  17 | 20 21        0    1                 20  21  22  23
 };
 //cache element equal to 26
 
-const uint8_t g_kuiCacheNzcScanIdx[24] = 
-{
-	/* Luma */
-	9, 10, 17, 18,	// 1+1*8, 2+1*8, 1+2*8, 2+2*8,
-	11, 12, 19, 20,	// 3+1*8, 4+1*8, 3+2*8, 4+2*8,
-	25, 26, 33, 34,	// 1+3*8, 2+3*8, 1+4*8, 2+4*8,
-	27, 28, 35, 36,	// 3+3*8, 4+3*8, 3+4*8, 4+4*8,
-    /* Cb */
-	14, 15,			// 6+1*8, 7+1*8,
-	22, 23,			// 6+2*8, 7+2*8,
+const uint8_t g_kuiCacheNzcScanIdx[24] = {
+  /* Luma */
+  9, 10, 17, 18,	// 1+1*8, 2+1*8, 1+2*8, 2+2*8,
+  11, 12, 19, 20,	// 3+1*8, 4+1*8, 3+2*8, 4+2*8,
+  25, 26, 33, 34,	// 1+3*8, 2+3*8, 1+4*8, 2+4*8,
+  27, 28, 35, 36,	// 3+3*8, 4+3*8, 3+4*8, 4+4*8,
+  /* Cb */
+  14, 15,			// 6+1*8, 7+1*8,
+  22, 23,			// 6+2*8, 7+2*8,
 
-    /* Cr */
-	38, 39,			// 6+4*8, 7+4*8,
-	46, 47,			// 6+5*8, 7+5*8,
+  /* Cr */
+  38, 39,			// 6+4*8, 7+4*8,
+  46, 47,			// 6+5*8, 7+5*8,
 };
 
 //cache element equal to 30
-const uint8_t g_kuiCache30ScanIdx[16] = //mv or ref_index cache scan index, 4*4 block as basic unit
-{
-	7,  8, 13, 14,
-	9, 10, 15, 16,
-	19, 20, 25, 26,
-	21, 22, 27, 28
+const uint8_t g_kuiCache30ScanIdx[16] = { //mv or ref_index cache scan index, 4*4 block as basic unit
+  7,  8, 13, 14,
+  9, 10, 15, 16,
+  19, 20, 25, 26,
+  21, 22, 27, 28
 };
 
-const uint8_t g_kuiScan4[16] = //for mb cache in sMb (only current element, without neighbor) 
-{                         // 4*4block scan    mb cache order
-	0,  1,  4,  5,        // 0  1 | 4  5      0  1 | 2  3
-	2,  3,  6,  7,        // 2  3 | 6  7      4  5 | 6  7
-	8,  9, 12, 13,        //----------------->----------- 
-	10, 11, 14, 15        // 8  9 |12 13      8  9 |10 11
-};                        //10 11 |14 15     12 13 |14 15 
+const uint8_t g_kuiScan4[16] = { //for mb cache in sMb (only current element, without neighbor)
+  // 4*4block scan    mb cache order
+  0,  1,  4,  5,        // 0  1 | 4  5      0  1 | 2  3
+  2,  3,  6,  7,        // 2  3 | 6  7      4  5 | 6  7
+  8,  9, 12, 13,        //----------------->-----------
+  10, 11, 14, 15        // 8  9 |12 13      8  9 |10 11
+};                        //10 11 |14 15     12 13 |14 15
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
 // extern at wels_common_basis.h
 
-const uint8_t g_kuiChromaQp[52]={
-	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,
-	12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,
-	28,29,29,30,31,32,32,33,34,34,35,35,36,36,37,37,
-	37,38,38,38,39,39,39,39
+const uint8_t g_kuiChromaQp[52] = {
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+  12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
+  28, 29, 29, 30, 31, 32, 32, 33, 34, 34, 35, 35, 36, 36, 37, 37,
+  37, 38, 38, 38, 39, 39, 39, 39
 };
 
 /*
  *	vcl type map for given NAL unit type and corresponding H264 type
  */
-const VclType g_kuiVclTypeMap[32][2] =   
-{
-	{ NON_VCL,	NON_VCL },	// 0: NAL_UNIT_UNSPEC_0
-	{ VCL,		VCL,	},	// 1: NAL_UNIT_CODED_SLICE
-	{ VCL,		NOT_APP },	// 2: NAL_UNIT_CODED_SLICE_DPA
-	{ VCL,		NOT_APP },	// 3: NAL_UNIT_CODED_SLICE_DPB
-	{ VCL,		NOT_APP },	// 4: NAL_UNIT_CODED_SLICE_DPC
-	{ VCL,		VCL		},	// 5: NAL_UNIT_CODED_SLICE_IDR
-	{ NON_VCL,	NON_VCL },	// 6: NAL_UNIT_SEI
-	{ NON_VCL,	NON_VCL },	// 7: NAL_UNIT_SPS
-	{ NON_VCL,	NON_VCL },	// 8: NAL_UNIT_PPS
-	{ NON_VCL,	NON_VCL },	// 9: NAL_UNIT_AU_DELIMITER
-	{ NON_VCL,	NON_VCL },	// 10: NAL_UNIT_END_OF_SEQ
-	{ NON_VCL,	NON_VCL },	// 11: NAL_UNIT_END_OF_STR
-	{ NON_VCL,	NON_VCL	},	// 12: NAL_UNIT_FILLER_DATA
-	{ NON_VCL,	NON_VCL },	// 13: NAL_UNIT_SPS_EXT
-	{ NON_VCL,	NON_VCL },	// 14: NAL_UNIT_PREFIX, NEED associate succeeded NAL to make a VCL
-	{ NON_VCL,	NON_VCL },	// 15: NAL_UNIT_SUBSET_SPS
-	{ NON_VCL,	NON_VCL },	// 16: NAL_UNIT_RESV_16
-	{ NON_VCL,	NON_VCL },	// 17: NAL_UNIT_RESV_17
-	{ NON_VCL,	NON_VCL },	// 18: NAL_UNIT_RESV_18
-	{ NON_VCL,	NON_VCL },	// 19: NAL_UNIT_AUX_CODED_SLICE
-	{ NON_VCL,	VCL		},	// 20: NAL_UNIT_CODED_SLICE_EXT
-	{ NON_VCL,	NON_VCL },	// 21: NAL_UNIT_RESV_21
-	{ NON_VCL,	NON_VCL },	// 22: NAL_UNIT_RESV_22
-	{ NON_VCL,	NON_VCL },	// 23: NAL_UNIT_RESV_23
-	{ NON_VCL,	NON_VCL },	// 24: NAL_UNIT_UNSPEC_24
-	{ NON_VCL,	NON_VCL },	// 25: NAL_UNIT_UNSPEC_25
-	{ NON_VCL,	NON_VCL },	// 26: NAL_UNIT_UNSPEC_26
-	{ NON_VCL,	NON_VCL	},	// 27: NAL_UNIT_UNSPEC_27
-	{ NON_VCL,	NON_VCL },	// 28: NAL_UNIT_UNSPEC_28
-	{ NON_VCL,	NON_VCL },	// 29: NAL_UNIT_UNSPEC_29
-	{ NON_VCL,	NON_VCL },	// 30: NAL_UNIT_UNSPEC_30
-	{ NON_VCL,	NON_VCL }	// 31: NAL_UNIT_UNSPEC_31
+const VclType g_kuiVclTypeMap[32][2] = {
+  { NON_VCL,	NON_VCL },	// 0: NAL_UNIT_UNSPEC_0
+  { VCL,		VCL,	},	// 1: NAL_UNIT_CODED_SLICE
+  { VCL,		NOT_APP },	// 2: NAL_UNIT_CODED_SLICE_DPA
+  { VCL,		NOT_APP },	// 3: NAL_UNIT_CODED_SLICE_DPB
+  { VCL,		NOT_APP },	// 4: NAL_UNIT_CODED_SLICE_DPC
+  { VCL,		VCL		},	// 5: NAL_UNIT_CODED_SLICE_IDR
+  { NON_VCL,	NON_VCL },	// 6: NAL_UNIT_SEI
+  { NON_VCL,	NON_VCL },	// 7: NAL_UNIT_SPS
+  { NON_VCL,	NON_VCL },	// 8: NAL_UNIT_PPS
+  { NON_VCL,	NON_VCL },	// 9: NAL_UNIT_AU_DELIMITER
+  { NON_VCL,	NON_VCL },	// 10: NAL_UNIT_END_OF_SEQ
+  { NON_VCL,	NON_VCL },	// 11: NAL_UNIT_END_OF_STR
+  { NON_VCL,	NON_VCL	},	// 12: NAL_UNIT_FILLER_DATA
+  { NON_VCL,	NON_VCL },	// 13: NAL_UNIT_SPS_EXT
+  { NON_VCL,	NON_VCL },	// 14: NAL_UNIT_PREFIX, must be associated with the succeeding NAL to form a VCL
+  { NON_VCL,	NON_VCL },	// 15: NAL_UNIT_SUBSET_SPS
+  { NON_VCL,	NON_VCL },	// 16: NAL_UNIT_RESV_16
+  { NON_VCL,	NON_VCL },	// 17: NAL_UNIT_RESV_17
+  { NON_VCL,	NON_VCL },	// 18: NAL_UNIT_RESV_18
+  { NON_VCL,	NON_VCL },	// 19: NAL_UNIT_AUX_CODED_SLICE
+  { NON_VCL,	VCL		},	// 20: NAL_UNIT_CODED_SLICE_EXT
+  { NON_VCL,	NON_VCL },	// 21: NAL_UNIT_RESV_21
+  { NON_VCL,	NON_VCL },	// 22: NAL_UNIT_RESV_22
+  { NON_VCL,	NON_VCL },	// 23: NAL_UNIT_RESV_23
+  { NON_VCL,	NON_VCL },	// 24: NAL_UNIT_UNSPEC_24
+  { NON_VCL,	NON_VCL },	// 25: NAL_UNIT_UNSPEC_25
+  { NON_VCL,	NON_VCL },	// 26: NAL_UNIT_UNSPEC_26
+  { NON_VCL,	NON_VCL	},	// 27: NAL_UNIT_UNSPEC_27
+  { NON_VCL,	NON_VCL },	// 28: NAL_UNIT_UNSPEC_28
+  { NON_VCL,	NON_VCL },	// 29: NAL_UNIT_UNSPEC_29
+  { NON_VCL,	NON_VCL },	// 30: NAL_UNIT_UNSPEC_30
+  { NON_VCL,	NON_VCL }	// 31: NAL_UNIT_UNSPEC_31
 };
 
 /*common use table*/
-const uint8_t g_kuiScan8[24]={	// [16 + 2*4]
-    9, 10, 17, 18,	// 1+1*8, 2+1*8, 1+2*8, 2+2*8,
-	11, 12, 19, 20,	// 3+1*8, 4+1*8, 3+2*8, 4+2*8,
-	25, 26, 33, 34,	// 1+3*8, 2+3*8, 1+4*8, 2+4*8,
-	27, 28, 35, 36,	// 3+3*8, 4+3*8, 3+4*8, 4+4*8,
-	14, 15,			// 6+1*8, 7+1*8,
-	22, 23,			// 6+2*8, 7+2*8,
-	38, 39,			// 6+4*8, 7+4*8,
-	46, 47,			// 6+5*8, 7+5*8,
+const uint8_t g_kuiScan8[24] = {	// [16 + 2*4]
+  9, 10, 17, 18,	// 1+1*8, 2+1*8, 1+2*8, 2+2*8,
+  11, 12, 19, 20,	// 3+1*8, 4+1*8, 3+2*8, 4+2*8,
+  25, 26, 33, 34,	// 1+3*8, 2+3*8, 1+4*8, 2+4*8,
+  27, 28, 35, 36,	// 3+3*8, 4+3*8, 3+4*8, 4+4*8,
+  14, 15,			// 6+1*8, 7+1*8,
+  22, 23,			// 6+2*8, 7+2*8,
+  38, 39,			// 6+4*8, 7+4*8,
+  46, 47,			// 6+5*8, 7+5*8,
 };
 
-const uint8_t g_kuiLumaDcZigzagScan[16]={
-	0, 16, 32, 128,			// 0*16 + 0*64, 1*16 + 0*64, 2*16 + 0*64, 0*16 + 2*64,
-	48, 64, 80, 96,			// 3*16 + 0*64, 0*16 + 1*64, 1*16 + 1*64, 2*16 + 1*64,
-	144, 160, 176, 192,		// 1*16 + 2*64, 2*16 + 2*64, 3*16 + 2*64, 0*16 + 3*64,
-	112, 208, 224, 240		// 3*16 + 1*64, 1*16 + 3*64, 2*16 + 3*64, 3*16 + 3*64,
+const uint8_t g_kuiLumaDcZigzagScan[16] = {
+  0, 16, 32, 128,			// 0*16 + 0*64, 1*16 + 0*64, 2*16 + 0*64, 0*16 + 2*64,
+  48, 64, 80, 96,			// 3*16 + 0*64, 0*16 + 1*64, 1*16 + 1*64, 2*16 + 1*64,
+  144, 160, 176, 192,		// 1*16 + 2*64, 2*16 + 2*64, 3*16 + 2*64, 0*16 + 3*64,
+  112, 208, 224, 240		// 3*16 + 1*64, 1*16 + 3*64, 2*16 + 3*64, 3*16 + 3*64,
 };
 
-const uint8_t g_kuiChromaDcScan[4]={
-	0, 16, 32, 48
+const uint8_t g_kuiChromaDcScan[4] = {
+  0, 16, 32, 48
 };
 
-__align16( const uint16_t, g_kuiDequantCoeff[52][8]) = {
-	/* 0*/{   10,   13,   10,   13,   13,   16,   13,   16 },	/* 1*/{   11,   14,   11,   14,   14,   18,   14,   18 },
-	/* 2*/{   13,   16,   13,   16,   16,   20,   16,   20 },	/* 3*/{   14,   18,   14,   18,   18,   23,   18,   23 },
-	/* 4*/{   16,   20,   16,   20,   20,   25,   20,   25 },	/* 5*/{   18,   23,   18,   23,   23,   29,   23,   29 },
-	/* 6*/{   20,   26,   20,   26,   26,   32,   26,   32 },	/* 7*/{   22,   28,   22,   28,   28,   36,   28,   36 },
-	/* 8*/{   26,   32,   26,   32,   32,   40,   32,   40 },	/* 9*/{   28,   36,   28,   36,   36,   46,   36,   46 },
-	/*10*/{   32,   40,   32,   40,   40,   50,   40,   50 },	/*11*/{   36,   46,   36,   46,   46,   58,   46,   58 },
-	/*12*/{   40,   52,   40,   52,   52,   64,   52,   64 },	/*13*/{   44,   56,   44,   56,   56,   72,   56,   72 },
-	/*14*/{   52,   64,   52,   64,   64,   80,   64,   80 },	/*15*/{   56,   72,   56,   72,   72,   92,   72,   92 },
-	/*16*/{   64,   80,   64,   80,   80,  100,   80,  100 },	/*17*/{   72,   92,   72,   92,   92,  116,   92,  116 },
-	/*18*/{   80,  104,   80,  104,  104,  128,  104,  128 },	/*19*/{   88,  112,   88,  112,  112,  144,  112,  144 },
-	/*20*/{  104,  128,  104,  128,  128,  160,  128,  160 },	/*21*/{  112,  144,  112,  144,  144,  184,  144,  184 },
-	/*22*/{  128,  160,  128,  160,  160,  200,  160,  200 },	/*23*/{  144,  184,  144,  184,  184,  232,  184,  232 },
-	/*24*/{  160,  208,  160,  208,  208,  256,  208,  256 },	/*25*/{  176,  224,  176,  224,  224,  288,  224,  288 },
-	/*26*/{  208,  256,  208,  256,  256,  320,  256,  320 },	/*27*/{  224,  288,  224,  288,  288,  368,  288,  368 },
-	/*28*/{  256,  320,  256,  320,  320,  400,  320,  400 },	/*29*/{  288,  368,  288,  368,  368,  464,  368,  464 },
-	/*30*/{  320,  416,  320,  416,  416,  512,  416,  512 },	/*31*/{  352,  448,  352,  448,  448,  576,  448,  576 },
-	/*32*/{  416,  512,  416,  512,  512,  640,  512,  640 },	/*33*/{  448,  576,  448,  576,  576,  736,  576,  736 },
-	/*34*/{  512,  640,  512,  640,  640,  800,  640,  800 },	/*35*/{  576,  736,  576,  736,  736,  928,  736,  928 },
-	/*36*/{  640,  832,  640,  832,  832, 1024,  832, 1024 },	/*37*/{  704,  896,  704,  896,  896, 1152,  896, 1152 },
-	/*38*/{  832, 1024,  832, 1024, 1024, 1280, 1024, 1280 },	/*39*/{  896, 1152,  896, 1152, 1152, 1472, 1152, 1472 },
-	/*40*/{ 1024, 1280, 1024, 1280, 1280, 1600, 1280, 1600 },	/*41*/{ 1152, 1472, 1152, 1472, 1472, 1856, 1472, 1856 },
-	/*42*/{ 1280, 1664, 1280, 1664, 1664, 2048, 1664, 2048 },	/*43*/{ 1408, 1792, 1408, 1792, 1792, 2304, 1792, 2304 },
-	/*44*/{ 1664, 2048, 1664, 2048, 2048, 2560, 2048, 2560 },	/*45*/{ 1792, 2304, 1792, 2304, 2304, 2944, 2304, 2944 },
-	/*46*/{ 2048, 2560, 2048, 2560, 2560, 3200, 2560, 3200 },	/*47*/{ 2304, 2944, 2304, 2944, 2944, 3712, 2944, 3712 },
-	/*48*/{ 2560, 3328, 2560, 3328, 3328, 4096, 3328, 4096 },	/*49*/{ 2816, 3584, 2816, 3584, 3584, 4608, 3584, 4608 },
-	/*50*/{ 3328, 4096, 3328, 4096, 4096, 5120, 4096, 5120 },	/*51*/{ 3584, 4608, 3584, 4608, 4608, 5888, 4608, 5888 },
+__align16 (const uint16_t, g_kuiDequantCoeff[52][8]) = {
+  /* 0*/{   10,   13,   10,   13,   13,   16,   13,   16 },	/* 1*/{   11,   14,   11,   14,   14,   18,   14,   18 },
+  /* 2*/{   13,   16,   13,   16,   16,   20,   16,   20 },	/* 3*/{   14,   18,   14,   18,   18,   23,   18,   23 },
+  /* 4*/{   16,   20,   16,   20,   20,   25,   20,   25 },	/* 5*/{   18,   23,   18,   23,   23,   29,   23,   29 },
+  /* 6*/{   20,   26,   20,   26,   26,   32,   26,   32 },	/* 7*/{   22,   28,   22,   28,   28,   36,   28,   36 },
+  /* 8*/{   26,   32,   26,   32,   32,   40,   32,   40 },	/* 9*/{   28,   36,   28,   36,   36,   46,   36,   46 },
+  /*10*/{   32,   40,   32,   40,   40,   50,   40,   50 },	/*11*/{   36,   46,   36,   46,   46,   58,   46,   58 },
+  /*12*/{   40,   52,   40,   52,   52,   64,   52,   64 },	/*13*/{   44,   56,   44,   56,   56,   72,   56,   72 },
+  /*14*/{   52,   64,   52,   64,   64,   80,   64,   80 },	/*15*/{   56,   72,   56,   72,   72,   92,   72,   92 },
+  /*16*/{   64,   80,   64,   80,   80,  100,   80,  100 },	/*17*/{   72,   92,   72,   92,   92,  116,   92,  116 },
+  /*18*/{   80,  104,   80,  104,  104,  128,  104,  128 },	/*19*/{   88,  112,   88,  112,  112,  144,  112,  144 },
+  /*20*/{  104,  128,  104,  128,  128,  160,  128,  160 },	/*21*/{  112,  144,  112,  144,  144,  184,  144,  184 },
+  /*22*/{  128,  160,  128,  160,  160,  200,  160,  200 },	/*23*/{  144,  184,  144,  184,  184,  232,  184,  232 },
+  /*24*/{  160,  208,  160,  208,  208,  256,  208,  256 },	/*25*/{  176,  224,  176,  224,  224,  288,  224,  288 },
+  /*26*/{  208,  256,  208,  256,  256,  320,  256,  320 },	/*27*/{  224,  288,  224,  288,  288,  368,  288,  368 },
+  /*28*/{  256,  320,  256,  320,  320,  400,  320,  400 },	/*29*/{  288,  368,  288,  368,  368,  464,  368,  464 },
+  /*30*/{  320,  416,  320,  416,  416,  512,  416,  512 },	/*31*/{  352,  448,  352,  448,  448,  576,  448,  576 },
+  /*32*/{  416,  512,  416,  512,  512,  640,  512,  640 },	/*33*/{  448,  576,  448,  576,  576,  736,  576,  736 },
+  /*34*/{  512,  640,  512,  640,  640,  800,  640,  800 },	/*35*/{  576,  736,  576,  736,  736,  928,  736,  928 },
+  /*36*/{  640,  832,  640,  832,  832, 1024,  832, 1024 },	/*37*/{  704,  896,  704,  896,  896, 1152,  896, 1152 },
+  /*38*/{  832, 1024,  832, 1024, 1024, 1280, 1024, 1280 },	/*39*/{  896, 1152,  896, 1152, 1152, 1472, 1152, 1472 },
+  /*40*/{ 1024, 1280, 1024, 1280, 1280, 1600, 1280, 1600 },	/*41*/{ 1152, 1472, 1152, 1472, 1472, 1856, 1472, 1856 },
+  /*42*/{ 1280, 1664, 1280, 1664, 1664, 2048, 1664, 2048 },	/*43*/{ 1408, 1792, 1408, 1792, 1792, 2304, 1792, 2304 },
+  /*44*/{ 1664, 2048, 1664, 2048, 2048, 2560, 2048, 2560 },	/*45*/{ 1792, 2304, 1792, 2304, 2304, 2944, 2304, 2944 },
+  /*46*/{ 2048, 2560, 2048, 2560, 2560, 3200, 2560, 3200 },	/*47*/{ 2304, 2944, 2304, 2944, 2944, 3712, 2944, 3712 },
+  /*48*/{ 2560, 3328, 2560, 3328, 3328, 4096, 3328, 4096 },	/*49*/{ 2816, 3584, 2816, 3584, 3584, 4608, 3584, 4608 },
+  /*50*/{ 3328, 4096, 3328, 4096, 4096, 5120, 4096, 5120 },	/*51*/{ 3584, 4608, 3584, 4608, 4608, 5888, 4608, 5888 },
 };
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-const uint8_t g_kuiIntra4x4CbpTable[48] =
-{
-	47, 31, 15,  0, 23, 27, 29, 30,  7, 11, 13, 14, 39, 43, 45, 46, //15
-	16,  3,  5, 10, 12, 19, 21, 26, 28, 35, 37, 42, 44,  1,  2,  4, //31
-	8, 17, 18, 20, 24,  6,  9, 22, 25, 32, 33, 34, 36, 40, 38, 41  //47 
+const uint8_t g_kuiIntra4x4CbpTable[48] = {
+  47, 31, 15,  0, 23, 27, 29, 30,  7, 11, 13, 14, 39, 43, 45, 46, //15
+  16,  3,  5, 10, 12, 19, 21, 26, 28, 35, 37, 42, 44,  1,  2,  4, //31
+  8, 17, 18, 20, 24,  6,  9, 22, 25, 32, 33, 34, 36, 40, 38, 41  //47
 };
 
-const uint8_t g_kuiInterCbpTable[48] =
-{
-	0, 16,  1,  2,  4,  8, 32,  3,  5, 10, 12, 15, 47,  7, 11, 13, //15
-	14,  6,  9, 31, 35, 37, 42, 44, 33, 34, 36, 40, 39, 43, 45, 46, //31
-	17, 18, 20, 24, 19, 21, 26, 28, 23, 27, 29, 30, 22, 25, 38, 41  //47 
+const uint8_t g_kuiInterCbpTable[48] = {
+  0, 16,  1,  2,  4,  8, 32,  3,  5, 10, 12, 15, 47,  7, 11, 13, //15
+  14,  6,  9, 31, 35, 37, 42, 44, 33, 34, 36, 40, 39, 43, 45, 46, //31
+  17, 18, 20, 24, 19, 21, 26, 28, 23, 27, 29, 30, 22, 25, 38, 41  //47
 };
 
-const uint8_t g_kuiLeadingZeroTable[256] = 
-{
-	8,  7,  6,  6,  5,  5,  5,  5,  4,  4,  4,  4,  4,  4,  4,  4,
-	3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
-	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
-	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
-	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
-	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
-	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+const uint8_t g_kuiLeadingZeroTable[256] = {
+  8,  7,  6,  6,  5,  5,  5,  5,  4,  4,  4,  4,  4,  4,  4,  4,
+  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
+  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
 };
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -229,433 +223,397 @@
 
 // extern at vlc_decoder.h
 
-const uint8_t g_kuiVlcChromaTable[256][2] =
-{
-	{13, 7}, {13, 7}, {12, 8}, {11, 8}, {8, 7}, {8, 7}, {7, 7}, {7, 7}, {10, 6}, {10, 6}, {10, 6}, {10, 6}, {6, 6}, {6, 6}, {6, 6}, {6, 6}, //15
-	{ 3, 6}, { 3, 6}, { 3, 6}, { 3, 6}, {9, 6}, {9, 6}, {9, 6}, {9, 6}, { 4, 6}, { 4, 6}, { 4, 6}, { 4, 6}, {1, 6}, {1, 6}, {1, 6}, {1, 6}, //31
-	{ 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, {5, 3}, {5, 3}, {5, 3}, {5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, {5, 3}, {5, 3}, {5, 3}, {5, 3}, //47
-	{ 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, {5, 3}, {5, 3}, {5, 3}, {5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, {5, 3}, {5, 3}, {5, 3}, {5, 3}, //63
-	{ 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, //79
-	{ 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, //95
-	{ 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, //111
-	{ 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, //127
-	{ 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, //143
-	{ 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, //159
-	{ 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, //175
-	{ 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, //191
-	{ 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, //207
-	{ 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, //223
-	{ 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, //239
-	{ 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}  //255 
+const uint8_t g_kuiVlcChromaTable[256][2] = {
+  {13, 7}, {13, 7}, {12, 8}, {11, 8}, {8, 7}, {8, 7}, {7, 7}, {7, 7}, {10, 6}, {10, 6}, {10, 6}, {10, 6}, {6, 6}, {6, 6}, {6, 6}, {6, 6}, //15
+  { 3, 6}, { 3, 6}, { 3, 6}, { 3, 6}, {9, 6}, {9, 6}, {9, 6}, {9, 6}, { 4, 6}, { 4, 6}, { 4, 6}, { 4, 6}, {1, 6}, {1, 6}, {1, 6}, {1, 6}, //31
+  { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, {5, 3}, {5, 3}, {5, 3}, {5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, {5, 3}, {5, 3}, {5, 3}, {5, 3}, //47
+  { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, {5, 3}, {5, 3}, {5, 3}, {5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, {5, 3}, {5, 3}, {5, 3}, {5, 3}, //63
+  { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, //79
+  { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, //95
+  { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, //111
+  { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, //127
+  { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, //143
+  { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, //159
+  { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, //175
+  { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, //191
+  { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, //207
+  { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, //223
+  { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, //239
+  { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}  //255
 };
 
-const uint8_t g_kuiVlcTable_0[256][2] = //[0] means the index of vlc table, [1] means the length of vlc code  [256] value means the value of 8bits  
-{
-	{ 0, 0}, { 0, 0}, { 0, 0}, {0, 0}, {21, 8}, {12, 8}, {7, 8}, {3, 8}, {17, 7}, {17, 7}, {8, 7}, {8, 7}, {13, 6}, {13, 6}, {13, 6}, {13, 6}, //15
-	{ 4, 6}, { 4, 6}, { 4, 6}, {4, 6}, { 1, 6}, { 1, 6}, {1, 6}, {1, 6}, { 9, 5}, { 9, 5}, {9, 5}, {9, 5}, { 9, 5}, { 9, 5}, { 9, 5}, { 9, 5}, //31
-	{ 5, 3}, { 5, 3}, { 5, 3}, {5, 3}, { 5, 3}, { 5, 3}, {5, 3}, {5, 3}, { 5, 3}, { 5, 3}, {5, 3}, {5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, //47
-	{ 5, 3}, { 5, 3}, { 5, 3}, {5, 3}, { 5, 3}, { 5, 3}, {5, 3}, {5, 3}, { 5, 3}, { 5, 3}, {5, 3}, {5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, //63
-	{ 2, 2}, { 2, 2}, { 2, 2}, {2, 2}, { 2, 2}, { 2, 2}, {2, 2}, {2, 2}, { 2, 2}, { 2, 2}, {2, 2}, {2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, //79
-	{ 2, 2}, { 2, 2}, { 2, 2}, {2, 2}, { 2, 2}, { 2, 2}, {2, 2}, {2, 2}, { 2, 2}, { 2, 2}, {2, 2}, {2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, //95
-	{ 2, 2}, { 2, 2}, { 2, 2}, {2, 2}, { 2, 2}, { 2, 2}, {2, 2}, {2, 2}, { 2, 2}, { 2, 2}, {2, 2}, {2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, //111
-	{ 2, 2}, { 2, 2}, { 2, 2}, {2, 2}, { 2, 2}, { 2, 2}, {2, 2}, {2, 2}, { 2, 2}, { 2, 2}, {2, 2}, {2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, //127
-	{ 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, //143
-	{ 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, //159
-	{ 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, //175
-	{ 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, //191
-	{ 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, //207
-	{ 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, //223
-	{ 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, //239
-	{ 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1} //255  
+const uint8_t g_kuiVlcTable_0[256][2]
+= { // in each entry, [0] is the index of the vlc table and [1] is the length of the vlc code; the [256] index is the value of the 8 bits
+  { 0, 0}, { 0, 0}, { 0, 0}, {0, 0}, {21, 8}, {12, 8}, {7, 8}, {3, 8}, {17, 7}, {17, 7}, {8, 7}, {8, 7}, {13, 6}, {13, 6}, {13, 6}, {13, 6}, //15
+  { 4, 6}, { 4, 6}, { 4, 6}, {4, 6}, { 1, 6}, { 1, 6}, {1, 6}, {1, 6}, { 9, 5}, { 9, 5}, {9, 5}, {9, 5}, { 9, 5}, { 9, 5}, { 9, 5}, { 9, 5}, //31
+  { 5, 3}, { 5, 3}, { 5, 3}, {5, 3}, { 5, 3}, { 5, 3}, {5, 3}, {5, 3}, { 5, 3}, { 5, 3}, {5, 3}, {5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, //47
+  { 5, 3}, { 5, 3}, { 5, 3}, {5, 3}, { 5, 3}, { 5, 3}, {5, 3}, {5, 3}, { 5, 3}, { 5, 3}, {5, 3}, {5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, //63
+  { 2, 2}, { 2, 2}, { 2, 2}, {2, 2}, { 2, 2}, { 2, 2}, {2, 2}, {2, 2}, { 2, 2}, { 2, 2}, {2, 2}, {2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, //79
+  { 2, 2}, { 2, 2}, { 2, 2}, {2, 2}, { 2, 2}, { 2, 2}, {2, 2}, {2, 2}, { 2, 2}, { 2, 2}, {2, 2}, {2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, //95
+  { 2, 2}, { 2, 2}, { 2, 2}, {2, 2}, { 2, 2}, { 2, 2}, {2, 2}, {2, 2}, { 2, 2}, { 2, 2}, {2, 2}, {2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, //111
+  { 2, 2}, { 2, 2}, { 2, 2}, {2, 2}, { 2, 2}, { 2, 2}, {2, 2}, {2, 2}, { 2, 2}, { 2, 2}, {2, 2}, {2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, //127
+  { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, //143
+  { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, //159
+  { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, //175
+  { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, //191
+  { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, //207
+  { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, //223
+  { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, //239
+  { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1} //255
 };
 
-const uint8_t g_kuiVlcTable_0_0[256][2] = // read 8 bits  // for g_kuiVlcTable_0[0] //checked no error--
-{
-	{ 0, 0}, { 0, 0}, {47, 7}, {47, 7}, {58, 8}, {60, 8}, {59, 8}, {54, 8}, {61, 8}, {56, 8}, {55, 8}, {50, 8}, {57, 8}, {52, 8}, {51, 8}, {46, 8}, //15
-	{53, 7}, {53, 7}, {48, 7}, {48, 7}, {43, 7}, {43, 7}, {42, 7}, {42, 7}, {49, 7}, {49, 7}, {44, 7}, {44, 7}, {39, 7}, {39, 7}, {38, 7}, {38, 7}, //31
-	{45, 6}, {45, 6}, {45, 6}, {45, 6}, {40, 6}, {40, 6}, {40, 6}, {40, 6}, {35, 6}, {35, 6}, {35, 6}, {35, 6}, {34, 6}, {34, 6}, {34, 6}, {34, 6}, //47
-	{41, 6}, {41, 6}, {41, 6}, {41, 6}, {36, 6}, {36, 6}, {36, 6}, {36, 6}, {31, 6}, {31, 6}, {31, 6}, {31, 6}, {30, 6}, {30, 6}, {30, 6}, {30, 6}, //63
-	{26, 5}, {26, 5}, {26, 5}, {26, 5}, {26, 5}, {26, 5}, {26, 5}, {26, 5}, {32, 5}, {32, 5}, {32, 5}, {32, 5}, {32, 5}, {32, 5}, {32, 5}, {32, 5}, //79
-	{27, 5}, {27, 5}, {27, 5}, {27, 5}, {27, 5}, {27, 5}, {27, 5}, {27, 5}, {22, 5}, {22, 5}, {22, 5}, {22, 5}, {22, 5}, {22, 5}, {22, 5}, {22, 5}, //95
-	{37, 5}, {37, 5}, {37, 5}, {37, 5}, {37, 5}, {37, 5}, {37, 5}, {37, 5}, {28, 5}, {28, 5}, {28, 5}, {28, 5}, {28, 5}, {28, 5}, {28, 5}, {28, 5}, //111
-	{23, 5}, {23, 5}, {23, 5}, {23, 5}, {23, 5}, {23, 5}, {23, 5}, {23, 5}, {18, 5}, {18, 5}, {18, 5}, {18, 5}, {18, 5}, {18, 5}, {18, 5}, {18, 5}, //127
-	{33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, //143
-	{33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, //159
-	{24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, //175
-	{24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, //191
-	{19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, //207
-	{19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, //223
-	{14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, //239
-	{14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3} //255		
+const uint8_t g_kuiVlcTable_0_0[256][2] = { // read 8 bits  // for g_kuiVlcTable_0[0] //checked no error--
+  { 0, 0}, { 0, 0}, {47, 7}, {47, 7}, {58, 8}, {60, 8}, {59, 8}, {54, 8}, {61, 8}, {56, 8}, {55, 8}, {50, 8}, {57, 8}, {52, 8}, {51, 8}, {46, 8}, //15
+  {53, 7}, {53, 7}, {48, 7}, {48, 7}, {43, 7}, {43, 7}, {42, 7}, {42, 7}, {49, 7}, {49, 7}, {44, 7}, {44, 7}, {39, 7}, {39, 7}, {38, 7}, {38, 7}, //31
+  {45, 6}, {45, 6}, {45, 6}, {45, 6}, {40, 6}, {40, 6}, {40, 6}, {40, 6}, {35, 6}, {35, 6}, {35, 6}, {35, 6}, {34, 6}, {34, 6}, {34, 6}, {34, 6}, //47
+  {41, 6}, {41, 6}, {41, 6}, {41, 6}, {36, 6}, {36, 6}, {36, 6}, {36, 6}, {31, 6}, {31, 6}, {31, 6}, {31, 6}, {30, 6}, {30, 6}, {30, 6}, {30, 6}, //63
+  {26, 5}, {26, 5}, {26, 5}, {26, 5}, {26, 5}, {26, 5}, {26, 5}, {26, 5}, {32, 5}, {32, 5}, {32, 5}, {32, 5}, {32, 5}, {32, 5}, {32, 5}, {32, 5}, //79
+  {27, 5}, {27, 5}, {27, 5}, {27, 5}, {27, 5}, {27, 5}, {27, 5}, {27, 5}, {22, 5}, {22, 5}, {22, 5}, {22, 5}, {22, 5}, {22, 5}, {22, 5}, {22, 5}, //95
+  {37, 5}, {37, 5}, {37, 5}, {37, 5}, {37, 5}, {37, 5}, {37, 5}, {37, 5}, {28, 5}, {28, 5}, {28, 5}, {28, 5}, {28, 5}, {28, 5}, {28, 5}, {28, 5}, //111
+  {23, 5}, {23, 5}, {23, 5}, {23, 5}, {23, 5}, {23, 5}, {23, 5}, {23, 5}, {18, 5}, {18, 5}, {18, 5}, {18, 5}, {18, 5}, {18, 5}, {18, 5}, {18, 5}, //127
+  {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, //143
+  {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, //159
+  {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, //175
+  {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, //191
+  {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, //207
+  {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, //223
+  {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, //239
+  {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3} //255
 };
 
-const uint8_t g_kuiVlcTable_0_1[4][2] = // read 2 bits // for g_kuiVlcTable_0[1] //checked no error--
-{
-	{29, 2}, {20, 2}, {15, 2}, {10, 2}	
+const uint8_t g_kuiVlcTable_0_1[4][2] = { // read 2 bits // for g_kuiVlcTable_0[1] //checked no error--
+  {29, 2}, {20, 2}, {15, 2}, {10, 2}
 };
 
-const uint8_t g_kuiVlcTable_0_2[2][2] = // read 1 bit // for g_kuiVlcTable_0[2] //checked no error--
-{
-	{25, 1}, {16, 1}	
+const uint8_t g_kuiVlcTable_0_2[2][2] = { // read 1 bit // for g_kuiVlcTable_0[2] //checked no error--
+  {25, 1}, {16, 1}
 };
 
-const uint8_t g_kuiVlcTable_0_3[2][2] = // read 1 bit // for g_kuiVlcTable_0[3] //checked no error--
-{
-	{11, 1}, {6, 1}	
+const uint8_t g_kuiVlcTable_0_3[2][2] = { // read 1 bit // for g_kuiVlcTable_0[3] //checked no error--
+  {11, 1}, {6, 1}
 };
 
-const uint8_t g_kuiVlcTable_1[256][2] = //checked no error--
-{
-	{ 0, 0}, { 0, 0}, { 0, 0}, { 0, 0}, {14, 8}, {20, 8}, {19, 8}, {10, 8}, {29, 7}, {29, 7}, {16, 7}, {16, 7}, {15, 7}, {15, 7}, { 6, 7}, { 6, 7}, //15
-	{25, 6}, {25, 6}, {25, 6}, {25, 6}, {12, 6}, {12, 6}, {12, 6}, {12, 6}, {11, 6}, {11, 6}, {11, 6}, {11, 6}, { 3, 6}, { 3, 6}, { 3, 6}, { 3, 6}, //31
-	{21, 6}, {21, 6}, {21, 6}, {21, 6}, { 8, 6}, { 8, 6}, { 8, 6}, { 8, 6}, { 7, 6}, { 7, 6}, { 7, 6}, { 7, 6}, { 1, 6}, { 1, 6}, { 1, 6}, { 1, 6}, //47
-	{17, 5}, {17, 5}, {17, 5}, {17, 5}, {17, 5}, {17, 5}, {17, 5}, {17, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, //63
-	{13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, //79
-	{ 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, //95
-	{ 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, //111
-	{ 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, //127
-	{ 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, //143
-	{ 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, //159
-	{ 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, //175
-	{ 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, //191
-	{ 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, //207
-	{ 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, //223
-	{ 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, //239
-	{ 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2} //255
-	
+const uint8_t g_kuiVlcTable_1[256][2] = { //checked no error--
+  { 0, 0}, { 0, 0}, { 0, 0}, { 0, 0}, {14, 8}, {20, 8}, {19, 8}, {10, 8}, {29, 7}, {29, 7}, {16, 7}, {16, 7}, {15, 7}, {15, 7}, { 6, 7}, { 6, 7}, //15
+  {25, 6}, {25, 6}, {25, 6}, {25, 6}, {12, 6}, {12, 6}, {12, 6}, {12, 6}, {11, 6}, {11, 6}, {11, 6}, {11, 6}, { 3, 6}, { 3, 6}, { 3, 6}, { 3, 6}, //31
+  {21, 6}, {21, 6}, {21, 6}, {21, 6}, { 8, 6}, { 8, 6}, { 8, 6}, { 8, 6}, { 7, 6}, { 7, 6}, { 7, 6}, { 7, 6}, { 1, 6}, { 1, 6}, { 1, 6}, { 1, 6}, //47
+  {17, 5}, {17, 5}, {17, 5}, {17, 5}, {17, 5}, {17, 5}, {17, 5}, {17, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, //63
+  {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, //79
+  { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, //95
+  { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, //111
+  { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, //127
+  { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, //143
+  { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, //159
+  { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, //175
+  { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, //191
+  { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, //207
+  { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, //223
+  { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, //239
+  { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2} //255
+
 };
 
-const uint8_t g_kuiVlcTable_1_0[64][2] = // read 6 bits  // for g_kuiVlcTable_1[0] //checked no error--
-{
-	{ 0, 0}, { 0, 0}, {57, 5}, {57, 5}, {61, 6}, {60, 6}, {59, 6}, {58, 6}, {55, 6}, {54, 6}, {56, 6}, {51, 6}, {52, 5}, {52, 5}, {50, 5}, {50, 5}, //15
-	{53, 5}, {53, 5}, {48, 5}, {48, 5}, {47, 5}, {47, 5}, {46, 5}, {46, 5}, {49, 5}, {49, 5}, {44, 5}, {44, 5}, {43, 5}, {43, 5}, {42, 5}, {42, 5}, //31
-	{38, 4}, {38, 4}, {38, 4}, {38, 4}, {40, 4}, {40, 4}, {40, 4}, {40, 4}, {39, 4}, {39, 4}, {39, 4}, {39, 4}, {34, 4}, {34, 4}, {34, 4}, {34, 4}, //47
-	{45, 4}, {45, 4}, {45, 4}, {45, 4}, {36, 4}, {36, 4}, {36, 4}, {36, 4}, {35, 4}, {35, 4}, {35, 4}, {35, 4}, {30, 4}, {30, 4}, {30, 4}, {30, 4} //63 
+const uint8_t g_kuiVlcTable_1_0[64][2] = { // read 6 bits  // for g_kuiVlcTable_1[0] //checked no error--
+  { 0, 0}, { 0, 0}, {57, 5}, {57, 5}, {61, 6}, {60, 6}, {59, 6}, {58, 6}, {55, 6}, {54, 6}, {56, 6}, {51, 6}, {52, 5}, {52, 5}, {50, 5}, {50, 5}, //15
+  {53, 5}, {53, 5}, {48, 5}, {48, 5}, {47, 5}, {47, 5}, {46, 5}, {46, 5}, {49, 5}, {49, 5}, {44, 5}, {44, 5}, {43, 5}, {43, 5}, {42, 5}, {42, 5}, //31
+  {38, 4}, {38, 4}, {38, 4}, {38, 4}, {40, 4}, {40, 4}, {40, 4}, {40, 4}, {39, 4}, {39, 4}, {39, 4}, {39, 4}, {34, 4}, {34, 4}, {34, 4}, {34, 4}, //47
+  {45, 4}, {45, 4}, {45, 4}, {45, 4}, {36, 4}, {36, 4}, {36, 4}, {36, 4}, {35, 4}, {35, 4}, {35, 4}, {35, 4}, {30, 4}, {30, 4}, {30, 4}, {30, 4} //63
 };
 
-const uint8_t g_kuiVlcTable_1_1[8][2] = // read 3 bits // for g_kuiVlcTable_1[1] //checked no error--
-{
-	{41, 3}, {32, 3}, {31, 3}, {26, 3}, {37, 3}, {28, 3}, {27, 3}, {22, 3}	
+const uint8_t g_kuiVlcTable_1_1[8][2] = { // read 3 bits // for g_kuiVlcTable_1[1] //checked no error--
+  {41, 3}, {32, 3}, {31, 3}, {26, 3}, {37, 3}, {28, 3}, {27, 3}, {22, 3}
 };
 
-const uint8_t g_kuiVlcTable_1_2[2][2] = // read 1 bit // for g_kuiVlcTable_1[2] //checked no error--
-{
-	{33, 1}, {24, 1}	
+const uint8_t g_kuiVlcTable_1_2[2][2] = { // read 1 bit // for g_kuiVlcTable_1[2] //checked no error--
+  {33, 1}, {24, 1}
 };
 
-const uint8_t g_kuiVlcTable_1_3[2][2] = // read 1 bit // for g_kuiVlcTable_1[3] //checked no error--
-{
-	{23, 1}, {18, 1}	
+const uint8_t g_kuiVlcTable_1_3[2][2] = { // read 1 bit // for g_kuiVlcTable_1[3] //checked no error--
+  {23, 1}, {18, 1}
 };
 
-const uint8_t g_kuiVlcTable_2[256][2] = //checked no error--
-{
-	{ 0, 0}, { 0, 0}, { 0, 0}, { 0, 0}, { 0, 0}, { 0, 0}, { 0, 0}, { 0, 0}, {45, 8}, {40, 8}, {35, 8}, {30, 8}, {41, 8}, {36, 8}, {31, 8}, {26, 8}, //15
-	{22, 7}, {22, 7}, {18, 7}, {18, 7}, {32, 7}, {32, 7}, {14, 7}, {14, 7}, {37, 7}, {37, 7}, {28, 7}, {28, 7}, {27, 7}, {27, 7}, {10, 7}, {10, 7}, //31
-	{ 6, 6}, { 6, 6}, { 6, 6}, { 6, 6}, {24, 6}, {24, 6}, {24, 6}, {24, 6}, {23, 6}, {23, 6}, {23, 6}, {23, 6}, { 3, 6}, { 3, 6}, { 3, 6}, { 3, 6}, //47
-	{33, 6}, {33, 6}, {33, 6}, {33, 6}, {20, 6}, {20, 6}, {20, 6}, {20, 6}, {19, 6}, {19, 6}, {19, 6}, {19, 6}, { 1, 6}, { 1, 6}, { 1, 6}, { 1, 6}, //63
-	{15, 5}, {15, 5}, {15, 5}, {15, 5}, {15, 5}, {15, 5}, {15, 5}, {15, 5}, {16, 5}, {16, 5}, {16, 5}, {16, 5}, {16, 5}, {16, 5}, {16, 5}, {16, 5}, //79
-	{11, 5}, {11, 5}, {11, 5}, {11, 5}, {11, 5}, {11, 5}, {11, 5}, {11, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, //95
-	{ 7, 5}, { 7, 5}, { 7, 5}, { 7, 5}, { 7, 5}, { 7, 5}, { 7, 5}, { 7, 5}, {29, 5}, {29, 5}, {29, 5}, {29, 5}, {29, 5}, {29, 5}, {29, 5}, {29, 5}, //111
-	{ 8, 5}, { 8, 5}, { 8, 5}, { 8, 5}, { 8, 5}, { 8, 5}, { 8, 5}, { 8, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, //127
-	{25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, //143
-	{21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, //159
-	{17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, //175
-	{13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, //191
-	{ 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, //207
-	{ 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, //223
-	{ 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, //239
-	{ 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4} //255
+const uint8_t g_kuiVlcTable_2[256][2] = { //checked no error--
+  { 0, 0}, { 0, 0}, { 0, 0}, { 0, 0}, { 0, 0}, { 0, 0}, { 0, 0}, { 0, 0}, {45, 8}, {40, 8}, {35, 8}, {30, 8}, {41, 8}, {36, 8}, {31, 8}, {26, 8}, //15
+  {22, 7}, {22, 7}, {18, 7}, {18, 7}, {32, 7}, {32, 7}, {14, 7}, {14, 7}, {37, 7}, {37, 7}, {28, 7}, {28, 7}, {27, 7}, {27, 7}, {10, 7}, {10, 7}, //31
+  { 6, 6}, { 6, 6}, { 6, 6}, { 6, 6}, {24, 6}, {24, 6}, {24, 6}, {24, 6}, {23, 6}, {23, 6}, {23, 6}, {23, 6}, { 3, 6}, { 3, 6}, { 3, 6}, { 3, 6}, //47
+  {33, 6}, {33, 6}, {33, 6}, {33, 6}, {20, 6}, {20, 6}, {20, 6}, {20, 6}, {19, 6}, {19, 6}, {19, 6}, {19, 6}, { 1, 6}, { 1, 6}, { 1, 6}, { 1, 6}, //63
+  {15, 5}, {15, 5}, {15, 5}, {15, 5}, {15, 5}, {15, 5}, {15, 5}, {15, 5}, {16, 5}, {16, 5}, {16, 5}, {16, 5}, {16, 5}, {16, 5}, {16, 5}, {16, 5}, //79
+  {11, 5}, {11, 5}, {11, 5}, {11, 5}, {11, 5}, {11, 5}, {11, 5}, {11, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, //95
+  { 7, 5}, { 7, 5}, { 7, 5}, { 7, 5}, { 7, 5}, { 7, 5}, { 7, 5}, { 7, 5}, {29, 5}, {29, 5}, {29, 5}, {29, 5}, {29, 5}, {29, 5}, {29, 5}, {29, 5}, //111
+  { 8, 5}, { 8, 5}, { 8, 5}, { 8, 5}, { 8, 5}, { 8, 5}, { 8, 5}, { 8, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, //127
+  {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, //143
+  {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, //159
+  {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, //175
+  {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, //191
+  { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, //207
+  { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, //223
+  { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, //239
+  { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4} //255
 };
 
-const uint8_t g_kuiVlcTable_2_0[4][2] = // read 2 bits // for g_kuiVlcTable_2[0] //checked
-{
-	{0, 0}, {58, 2}, {61, 2}, {60, 2}	
+const uint8_t g_kuiVlcTable_2_0[4][2] = { // read 2 bits // for g_kuiVlcTable_2[0] //checked
+  {0, 0}, {58, 2}, {61, 2}, {60, 2}
 };
 
 
-const uint8_t g_kuiVlcTable_2_1[4][2] = // read 2 bits // for g_kuiVlcTable_2[1] //checked
-{
-	{59, 2}, {54, 2}, {57, 2}, {56, 2}	
+const uint8_t g_kuiVlcTable_2_1[4][2] = { // read 2 bits // for g_kuiVlcTable_2[1] //checked
+  {59, 2}, {54, 2}, {57, 2}, {56, 2}
 };
 
-const uint8_t g_kuiVlcTable_2_2[4][2] = // read 2 bits // for g_kuiVlcTable_2[2] //checked
-{
-	{55, 2}, {50, 2}, {53, 2}, {52, 2}	
+const uint8_t g_kuiVlcTable_2_2[4][2] = { // read 2 bits // for g_kuiVlcTable_2[2] //checked
+  {55, 2}, {50, 2}, {53, 2}, {52, 2}
 };
 
-const uint8_t g_kuiVlcTable_2_3[4][2] = // read 2 bits // for g_kuiVlcTable_2[3] //checked
-{
-	{51, 2}, {46, 2}, {47, 1}, {47, 1}	
+const uint8_t g_kuiVlcTable_2_3[4][2] = { // read 2 bits // for g_kuiVlcTable_2[3] //checked
+  {51, 2}, {46, 2}, {47, 1}, {47, 1}
 };
 
-const uint8_t g_kuiVlcTable_2_4[2][2] = // read 1 bit // for g_kuiVlcTable_2[4] //checked
-{
-	{42, 1}, {48, 1}	
+const uint8_t g_kuiVlcTable_2_4[2][2] = { // read 1 bit // for g_kuiVlcTable_2[4] //checked
+  {42, 1}, {48, 1}
 };
 
-const uint8_t g_kuiVlcTable_2_5[2][2] = // read 1 bit // for g_kuiVlcTable_2[5] //checked
-{
-	{43, 1}, {38, 1}	
+const uint8_t g_kuiVlcTable_2_5[2][2] = { // read 1 bit // for g_kuiVlcTable_2[5] //checked
+  {43, 1}, {38, 1}
 };
 
-const uint8_t g_kuiVlcTable_2_6[2][2] = // read 1 bit // for g_kuiVlcTable_2[6] //checked no error--
-{
-	{49, 1}, {44, 1}	
+const uint8_t g_kuiVlcTable_2_6[2][2] = { // read 1 bit // for g_kuiVlcTable_2[6] //checked no error--
+  {49, 1}, {44, 1}
 };
 
-const uint8_t g_kuiVlcTable_2_7[2][2] = // read 1 bit // for g_kuiVlcTable_2[7] //checked no error--
-{
-	{39, 1}, {34, 1}	
+const uint8_t g_kuiVlcTable_2_7[2][2] = { // read 1 bit // for g_kuiVlcTable_2[7] //checked no error--
+  {39, 1}, {34, 1}
 };
 
-const uint8_t g_kuiVlcTable_3[64][2] = // read 6 bits //corrected
-{
-	{ 1, 6}, { 2, 6}, { 0, 0}, { 0, 6}, { 3, 6}, { 4, 6}, { 5, 6}, { 0, 0}, { 6, 6}, { 7, 6}, { 8, 6}, { 9, 6}, {10, 6}, {11, 6}, {12, 6}, {13, 6}, //15 
-	{14, 6}, {15, 6}, {16, 6}, {17, 6}, {18, 6}, {19, 6}, {20, 6}, {21, 6}, {22, 6}, {23, 6}, {24, 6}, {25, 6}, {26, 6}, {27, 6}, {28, 6}, {29, 6}, //31 
-	{30, 6}, {31, 6}, {32, 6}, {33, 6}, {34, 6}, {35, 6}, {36, 6}, {37, 6}, {38, 6}, {39, 6}, {40, 6}, {41, 6}, {42, 6}, {43, 6}, {44, 6}, {45, 6}, //47
-	{46, 6}, {47, 6}, {48, 6}, {49, 6}, {50, 6}, {51, 6}, {52, 6}, {53, 6}, {54, 6}, {55, 6}, {56, 6}, {57, 6}, {58, 6}, {59, 6}, {60, 6}, {61, 6}, //63
+const uint8_t g_kuiVlcTable_3[64][2] = { // read 6 bits //corrected
+  { 1, 6}, { 2, 6}, { 0, 0}, { 0, 6}, { 3, 6}, { 4, 6}, { 5, 6}, { 0, 0}, { 6, 6}, { 7, 6}, { 8, 6}, { 9, 6}, {10, 6}, {11, 6}, {12, 6}, {13, 6}, //15
+  {14, 6}, {15, 6}, {16, 6}, {17, 6}, {18, 6}, {19, 6}, {20, 6}, {21, 6}, {22, 6}, {23, 6}, {24, 6}, {25, 6}, {26, 6}, {27, 6}, {28, 6}, {29, 6}, //31
+  {30, 6}, {31, 6}, {32, 6}, {33, 6}, {34, 6}, {35, 6}, {36, 6}, {37, 6}, {38, 6}, {39, 6}, {40, 6}, {41, 6}, {42, 6}, {43, 6}, {44, 6}, {45, 6}, //47
+  {46, 6}, {47, 6}, {48, 6}, {49, 6}, {50, 6}, {51, 6}, {52, 6}, {53, 6}, {54, 6}, {55, 6}, {56, 6}, {57, 6}, {58, 6}, {59, 6}, {60, 6}, {61, 6}, //63
 };
 
 
-const uint8_t g_kuiVlcTableNeedMoreBitsThread[3] = 
-{
-	4, 4, 8
+const uint8_t g_kuiVlcTableNeedMoreBitsThread[3] = {
+  4, 4, 8
 };
 
-const uint8_t g_kuiVlcTableMoreBitsCount0[4] = 
-{
-	8, 2, 1, 1
+const uint8_t g_kuiVlcTableMoreBitsCount0[4] = {
+  8, 2, 1, 1
 };
 
-const uint8_t g_kuiVlcTableMoreBitsCount1[4] = 
-{
-	6, 3, 1, 1
+const uint8_t g_kuiVlcTableMoreBitsCount1[4] = {
+  6, 3, 1, 1
 };
 
-const uint8_t g_kuiVlcTableMoreBitsCount2[8] = 
-{
-	2, 2, 2, 2, 1, 1, 1, 1
+const uint8_t g_kuiVlcTableMoreBitsCount2[8] = {
+  2, 2, 2, 2, 1, 1, 1, 1
 };
 
-const uint8_t g_kuiNcMapTable[17] = 
-{
-	0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3
+const uint8_t g_kuiNcMapTable[17] = {
+  0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3
 };
 
 
-const uint8_t g_kuiVlcTrailingOneTotalCoeffTable[62][2] = 
-{
-	{0, 0}, 
-	{0, 1}, {1, 1}, 
-	{0, 2}, {1, 2}, {2, 2}, 
-	{0, 3}, {1, 3}, {2, 3}, {3, 3},
-	{0, 4}, {1, 4}, {2, 4}, {3, 4},
-	{0, 5}, {1, 5}, {2, 5}, {3, 5},
-	{0, 6}, {1, 6}, {2, 6}, {3, 6},
-	{0, 7}, {1, 7}, {2, 7}, {3, 7},
-	{0, 8}, {1, 8}, {2, 8}, {3, 8},
-	{0, 9}, {1, 9}, {2, 9}, {3, 9},
-	{0, 10}, {1, 10}, {2, 10}, {3, 10},
-	{0, 11}, {1, 11}, {2, 11}, {3, 11},
-	{0, 12}, {1, 12}, {2, 12}, {3, 12},
-	{0, 13}, {1, 13}, {2, 13}, {3, 13},
-	{0, 14}, {1, 14}, {2, 14}, {3, 14},
-	{0, 15}, {1, 15}, {2, 15}, {3, 15},
-	{0, 16}, {1, 16}, {2, 16}, {3, 16}  
+const uint8_t g_kuiVlcTrailingOneTotalCoeffTable[62][2] = {
+  {0, 0},
+  {0, 1}, {1, 1},
+  {0, 2}, {1, 2}, {2, 2},
+  {0, 3}, {1, 3}, {2, 3}, {3, 3},
+  {0, 4}, {1, 4}, {2, 4}, {3, 4},
+  {0, 5}, {1, 5}, {2, 5}, {3, 5},
+  {0, 6}, {1, 6}, {2, 6}, {3, 6},
+  {0, 7}, {1, 7}, {2, 7}, {3, 7},
+  {0, 8}, {1, 8}, {2, 8}, {3, 8},
+  {0, 9}, {1, 9}, {2, 9}, {3, 9},
+  {0, 10}, {1, 10}, {2, 10}, {3, 10},
+  {0, 11}, {1, 11}, {2, 11}, {3, 11},
+  {0, 12}, {1, 12}, {2, 12}, {3, 12},
+  {0, 13}, {1, 13}, {2, 13}, {3, 13},
+  {0, 14}, {1, 14}, {2, 14}, {3, 14},
+  {0, 15}, {1, 15}, {2, 15}, {3, 15},
+  {0, 16}, {1, 16}, {2, 16}, {3, 16}
 };
 
-const uint8_t g_kuiTotalZerosTable0[512][2] = //read 9 bits, generated by tzVlcIndex=1 in Table 9-7 in H.264/AVC standard
-{
-	{0, 0}, {15, 9}, {14, 9}, {13, 9}, {12, 8}, {12, 8}, {11, 8}, {11, 8}, {10, 7}, {10, 7}, {10, 7}, {10, 7}, {9, 7}, {9, 7}, {9, 7}, {9, 7}, //15
-	{8, 6}, { 8, 6}, { 8, 6}, { 8, 6}, { 8, 6}, { 8, 6}, { 8, 6}, { 8, 6}, { 7, 6}, { 7, 6}, { 7, 6}, { 7, 6}, {7, 6}, {7, 6}, {7, 6}, {7, 6}, //31
-	{6, 5}, { 6, 5}, { 6, 5}, { 6, 5}, { 6, 5}, { 6, 5}, { 6, 5}, { 6, 5}, { 6, 5}, { 6, 5}, { 6, 5}, { 6, 5}, {6, 5}, {6, 5}, {6, 5}, {6, 5}, //47
-	{5, 5}, { 5, 5}, { 5, 5}, { 5, 5}, { 5, 5}, { 5, 5}, { 5, 5}, { 5, 5}, { 5, 5}, { 5, 5}, { 5, 5}, { 5, 5}, {5, 5}, {5, 5}, {5, 5}, {5, 5}, //63
-	{4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, {4, 4}, {4, 4}, {4, 4}, {4, 4}, //79
-	{4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, {4, 4}, {4, 4}, {4, 4}, {4, 4}, //95
-	{3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, {3, 4}, {3, 4}, {3, 4}, {3, 4}, //111
-	{3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, {3, 4}, {3, 4}, {3, 4}, {3, 4}, //127
-	{2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, //143
-	{2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, //159
-	{2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, //175
-	{2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, //191
-	{1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3}, //207
-	{1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3}, //223
-	{1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3}, //239
-	{1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3}, //255
-	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //271
-	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //287
-	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //303
-	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //319
-	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //335
-	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //351
-	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //367
-	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //383
-	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //399
-	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //415
-	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //431
-	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //447
-	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //463
-	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //479
-	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //495
-	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1} //511
+const uint8_t g_kuiTotalZerosTable0[512][2]
+= { //read 9 bits, generated by tzVlcIndex=1 in Table 9-7 in H.264/AVC standard
+  {0, 0}, {15, 9}, {14, 9}, {13, 9}, {12, 8}, {12, 8}, {11, 8}, {11, 8}, {10, 7}, {10, 7}, {10, 7}, {10, 7}, {9, 7}, {9, 7}, {9, 7}, {9, 7}, //15
+  {8, 6}, { 8, 6}, { 8, 6}, { 8, 6}, { 8, 6}, { 8, 6}, { 8, 6}, { 8, 6}, { 7, 6}, { 7, 6}, { 7, 6}, { 7, 6}, {7, 6}, {7, 6}, {7, 6}, {7, 6}, //31
+  {6, 5}, { 6, 5}, { 6, 5}, { 6, 5}, { 6, 5}, { 6, 5}, { 6, 5}, { 6, 5}, { 6, 5}, { 6, 5}, { 6, 5}, { 6, 5}, {6, 5}, {6, 5}, {6, 5}, {6, 5}, //47
+  {5, 5}, { 5, 5}, { 5, 5}, { 5, 5}, { 5, 5}, { 5, 5}, { 5, 5}, { 5, 5}, { 5, 5}, { 5, 5}, { 5, 5}, { 5, 5}, {5, 5}, {5, 5}, {5, 5}, {5, 5}, //63
+  {4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, {4, 4}, {4, 4}, {4, 4}, {4, 4}, //79
+  {4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, {4, 4}, {4, 4}, {4, 4}, {4, 4}, //95
+  {3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, {3, 4}, {3, 4}, {3, 4}, {3, 4}, //111
+  {3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, {3, 4}, {3, 4}, {3, 4}, {3, 4}, //127
+  {2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, //143
+  {2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, //159
+  {2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, //175
+  {2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, //191
+  {1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3}, //207
+  {1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3}, //223
+  {1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3}, //239
+  {1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3}, //255
+  {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //271
+  {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //287
+  {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //303
+  {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //319
+  {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //335
+  {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //351
+  {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //367
+  {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //383
+  {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //399
+  {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //415
+  {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //431
+  {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //447
+  {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //463
+  {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //479
+  {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //495
+  {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1} //511
 };
 
-const uint8_t g_kuiTotalZerosTable1[64][2] = //read 6 bits, generated by tzVlcIndex=2 in Table 9-7 in H.264/AVC standard
-{
-	{14, 6}, {13, 6}, {12, 6}, {11, 6}, {10, 5}, {10, 5}, {9, 5}, {9, 5}, {8, 4}, {8, 4}, {8, 4}, {8, 4}, {7, 4}, {7, 4}, {7, 4}, {7, 4}, //15
-	{ 6, 4}, { 6, 4}, { 6, 4}, { 6, 4}, { 5, 4}, { 5, 4}, {5, 4}, {5, 4}, {4, 3}, {4, 3}, {4, 3}, {4, 3}, {4, 3}, {4, 3}, {4, 3}, {4, 3}, //31
-	{ 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, {3, 3}, {3, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, //47
-	{ 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, {1, 3}, {1, 3}, {0, 3}, {0, 3}, {0, 3}, {0, 3}, {0, 3}, {0, 3}, {0, 3}, {0, 3} //63
+const uint8_t g_kuiTotalZerosTable1[64][2]
+= { //read 6 bits, generated by tzVlcIndex=2 in Table 9-7 in H.264/AVC standard
+  {14, 6}, {13, 6}, {12, 6}, {11, 6}, {10, 5}, {10, 5}, {9, 5}, {9, 5}, {8, 4}, {8, 4}, {8, 4}, {8, 4}, {7, 4}, {7, 4}, {7, 4}, {7, 4}, //15
+  { 6, 4}, { 6, 4}, { 6, 4}, { 6, 4}, { 5, 4}, { 5, 4}, {5, 4}, {5, 4}, {4, 3}, {4, 3}, {4, 3}, {4, 3}, {4, 3}, {4, 3}, {4, 3}, {4, 3}, //31
+  { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, {3, 3}, {3, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, //47
+  { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, {1, 3}, {1, 3}, {0, 3}, {0, 3}, {0, 3}, {0, 3}, {0, 3}, {0, 3}, {0, 3}, {0, 3} //63
 };
 
-const uint8_t g_kuiTotalZerosTable2[64][2] = //read 6 bits, generated by tzVlcIndex=3 in Table 9-7 in H.264/AVC standard
-{
-	{13, 6}, {11, 6}, {12, 5}, {12, 5}, {10, 5}, {10, 5}, {9, 5}, {9, 5}, {8, 4}, {8, 4}, {8, 4}, {8, 4}, {5, 4}, {5, 4}, {5, 4}, {5, 4}, //15
-	{ 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 0, 4}, { 0, 4}, {0, 4}, {0, 4}, {7, 3}, {7, 3}, {7, 3}, {7, 3}, {7, 3}, {7, 3}, {7, 3}, {7, 3}, //31
-	{ 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, {6, 3}, {6, 3}, {3, 3}, {3, 3}, {3, 3}, {3, 3}, {3, 3}, {3, 3}, {3, 3}, {3, 3}, //47
-	{ 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, {2, 3}, {2, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3} //63
+const uint8_t g_kuiTotalZerosTable2[64][2]
+= { //read 6 bits, generated by tzVlcIndex=3 in Table 9-7 in H.264/AVC standard
+  {13, 6}, {11, 6}, {12, 5}, {12, 5}, {10, 5}, {10, 5}, {9, 5}, {9, 5}, {8, 4}, {8, 4}, {8, 4}, {8, 4}, {5, 4}, {5, 4}, {5, 4}, {5, 4}, //15
+  { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 0, 4}, { 0, 4}, {0, 4}, {0, 4}, {7, 3}, {7, 3}, {7, 3}, {7, 3}, {7, 3}, {7, 3}, {7, 3}, {7, 3}, //31
+  { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, {6, 3}, {6, 3}, {3, 3}, {3, 3}, {3, 3}, {3, 3}, {3, 3}, {3, 3}, {3, 3}, {3, 3}, //47
+  { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, {2, 3}, {2, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3} //63
 };
 
-const uint8_t g_kuiTotalZerosTable3[32][2] = //read 5 bits, generated by tzVlcIndex=4 in Table 9-7 in H.264/AVC standard
-{
-	{12, 5}, {11, 5}, {10, 5}, {0, 5}, {9, 4}, {9, 4}, {7, 4}, {7, 4}, {3, 4}, {3, 4}, {2, 4}, {2, 4}, {8, 3}, {8, 3}, {8, 3}, {8, 3}, //15
-	{ 6, 3}, { 6, 3}, { 6, 3}, {6, 3}, {5, 3}, {5, 3}, {5, 3}, {5, 3}, {4, 3}, {4, 3}, {4, 3}, {4, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3}, //31
+const uint8_t g_kuiTotalZerosTable3[32][2]
+= { //read 5 bits, generated by tzVlcIndex=4 in Table 9-7 in H.264/AVC standard
+  {12, 5}, {11, 5}, {10, 5}, {0, 5}, {9, 4}, {9, 4}, {7, 4}, {7, 4}, {3, 4}, {3, 4}, {2, 4}, {2, 4}, {8, 3}, {8, 3}, {8, 3}, {8, 3}, //15
+  { 6, 3}, { 6, 3}, { 6, 3}, {6, 3}, {5, 3}, {5, 3}, {5, 3}, {5, 3}, {4, 3}, {4, 3}, {4, 3}, {4, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3}, //31
 };
- 
-const uint8_t g_kuiTotalZerosTable4[32][2] = //read 5 bits, generated by tzVlcIndex=5 in Table 9-7 in H.264/AVC standard
-{
-	{11, 5}, { 9, 5}, {10, 4}, {10, 4}, { 8, 4}, { 8, 4}, { 2, 4}, { 2, 4}, { 1, 4}, { 1, 4}, { 0, 4}, { 0, 4}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, //15
-	{ 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3} //31
+
+const uint8_t g_kuiTotalZerosTable4[32][2]
+= { //read 5 bits, generated by tzVlcIndex=5 in Table 9-7 in H.264/AVC standard
+  {11, 5}, { 9, 5}, {10, 4}, {10, 4}, { 8, 4}, { 8, 4}, { 2, 4}, { 2, 4}, { 1, 4}, { 1, 4}, { 0, 4}, { 0, 4}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, //15
+  { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3} //31
 };
 
-const uint8_t g_kuiTotalZerosTable5[64][2] = //read 6 bits, generated by tzVlcIndex=6 in Table 9-7 in H.264/AVC standard
-{
-	{10, 6}, { 0, 6}, { 1, 5}, { 1, 5}, { 8, 4}, { 8, 4}, { 8, 4}, { 8, 4}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, //15
-	{ 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, //31
-	{ 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, //47
-	{ 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3} //63
+const uint8_t g_kuiTotalZerosTable5[64][2]
+= { //read 6 bits, generated by tzVlcIndex=6 in Table 9-7 in H.264/AVC standard
+  {10, 6}, { 0, 6}, { 1, 5}, { 1, 5}, { 8, 4}, { 8, 4}, { 8, 4}, { 8, 4}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, //15
+  { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, //31
+  { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, //47
+  { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3} //63
 };
 
-const uint8_t g_kuiTotalZerosTable6[64][2] = //read 6 bits, generated by tzVlcIndex=7 in Table 9-7 in H.264/AVC standard
-{
-	{ 9, 6}, { 0, 6}, { 1, 5}, { 1, 5}, { 7, 4}, { 7, 4}, { 7, 4}, { 7, 4}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, //15
-	{ 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, //31
-	{ 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, //47
-	{ 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2} //63
+const uint8_t g_kuiTotalZerosTable6[64][2]
+= { //read 6 bits, generated by tzVlcIndex=7 in Table 9-7 in H.264/AVC standard
+  { 9, 6}, { 0, 6}, { 1, 5}, { 1, 5}, { 7, 4}, { 7, 4}, { 7, 4}, { 7, 4}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, //15
+  { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, //31
+  { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, //47
+  { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2} //63
 };
 
-const uint8_t g_kuiTotalZerosTable7[64][2] = //read 6 bits, generated by tzVlcIndex=8 in Table 9-7 in H.264/AVC standard
-{
-	{ 8, 6}, { 0, 6}, { 2, 5}, { 2, 5}, { 1, 4}, { 1, 4}, { 1, 4}, { 1, 4}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, //15
-	{ 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, //31
-	{ 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, //47
-	{ 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2} //63
+const uint8_t g_kuiTotalZerosTable7[64][2]
+= { //read 6 bits, generated by tzVlcIndex=8 in Table 9-7 in H.264/AVC standard
+  { 8, 6}, { 0, 6}, { 2, 5}, { 2, 5}, { 1, 4}, { 1, 4}, { 1, 4}, { 1, 4}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, //15
+  { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, //31
+  { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, //47
+  { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2} //63
 };
 
-const uint8_t g_kuiTotalZerosTable8[64][2] = //read 6 bits, generated by tzVlcIndex=9 in Table 9-7 in H.264/AVC standard
-{
-	{ 1, 6}, { 0, 6}, { 7, 5}, { 7, 5}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, //15
-	{ 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, //31
-	{ 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, //47
-	{ 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2} //63
+const uint8_t g_kuiTotalZerosTable8[64][2]
+= { //read 6 bits, generated by tzVlcIndex=9 in Table 9-7 in H.264/AVC standard
+  { 1, 6}, { 0, 6}, { 7, 5}, { 7, 5}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, //15
+  { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, //31
+  { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, //47
+  { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2} //63
 };
 
-const uint8_t g_kuiTotalZerosTable9[32][2] = //read 5 bits, generated by tzVlcIndex=10 in Table 9-7 in H.264/AVC standard
-{
-	{ 1, 5}, { 0, 5}, { 6, 4}, { 6, 4}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, //15
-	{ 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2} //31
+const uint8_t g_kuiTotalZerosTable9[32][2]
+= { //read 5 bits, generated by tzVlcIndex=10 in Table 9-7 in H.264/AVC standard
+  { 1, 5}, { 0, 5}, { 6, 4}, { 6, 4}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, //15
+  { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2} //31
 };
 
-const uint8_t g_kuiTotalZerosTable10[16][2] = //read 4 bits, generated by tzVlcIndex=11 in Table 9-7 in H.264/AVC standard
-{
-	{ 0, 4}, { 1, 4}, { 2, 3}, { 2, 3}, { 3, 3}, { 3, 3}, { 5, 3}, { 5, 3}, { 4, 1}, { 4, 1}, { 4, 1}, { 4, 1}, { 4, 1}, { 4, 1}, { 4, 1}, { 4, 1} //15
+const uint8_t g_kuiTotalZerosTable10[16][2]
+= { //read 4 bits, generated by tzVlcIndex=11 in Table 9-7 in H.264/AVC standard
+  { 0, 4}, { 1, 4}, { 2, 3}, { 2, 3}, { 3, 3}, { 3, 3}, { 5, 3}, { 5, 3}, { 4, 1}, { 4, 1}, { 4, 1}, { 4, 1}, { 4, 1}, { 4, 1}, { 4, 1}, { 4, 1} //15
 };
 
-const uint8_t g_kuiTotalZerosTable11[16][2] = //read 4 bits, generated by tzVlcIndex=12 in Table 9-7 in H.264/AVC standard
-{
-	{ 0, 4}, { 1, 4}, { 4, 3}, { 4, 3}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 3, 1}, { 3, 1}, { 3, 1}, { 3, 1}, { 3, 1}, { 3, 1}, { 3, 1}, { 3, 1} //15
+const uint8_t g_kuiTotalZerosTable11[16][2]
+= { //read 4 bits, generated by tzVlcIndex=12 in Table 9-7 in H.264/AVC standard
+  { 0, 4}, { 1, 4}, { 4, 3}, { 4, 3}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 3, 1}, { 3, 1}, { 3, 1}, { 3, 1}, { 3, 1}, { 3, 1}, { 3, 1}, { 3, 1} //15
 };
 
-const uint8_t g_kuiTotalZerosTable12[8][2] = //read 3 bits, generated by tzVlcIndex=13 in Table 9-7 in H.264/AVC standard
-{
-	{ 0, 3}, { 1, 3}, { 3, 2}, { 3, 2}, { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1} //8
+const uint8_t g_kuiTotalZerosTable12[8][2]
+= { //read 3 bits, generated by tzVlcIndex=13 in Table 9-7 in H.264/AVC standard
+  { 0, 3}, { 1, 3}, { 3, 2}, { 3, 2}, { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1} //8
 };
 
-const uint8_t g_kuiTotalZerosTable13[4][2] = //read 2 bits, generated by tzVlcIndex=14 in Table 9-7 in H.264/AVC standard
-{
-	{ 0, 2}, { 1, 2}, { 2, 1}, { 2, 1}
+const uint8_t g_kuiTotalZerosTable13[4][2]
+= { //read 2 bits, generated by tzVlcIndex=14 in Table 9-7 in H.264/AVC standard
+  { 0, 2}, { 1, 2}, { 2, 1}, { 2, 1}
 };
 
-const uint8_t g_kuiTotalZerosTable14[2][2] = //read 1 bits generated by tzVlcIndex=15 in Table 9-7 in H.264/AVC standard
-{
-	{ 0, 1}, { 1, 1} 
+const uint8_t g_kuiTotalZerosTable14[2][2]
+= { //read 1 bits generated by tzVlcIndex=15 in Table 9-7 in H.264/AVC standard
+  { 0, 1}, { 1, 1}
 };
 
-const uint8_t g_kuiTotalZerosBitNumMap[15] = 
-{
-	9, 6, 6, 5, 5, 6, 6, 6, 6, 5, 4, 4, 3, 2, 1	
+const uint8_t g_kuiTotalZerosBitNumMap[15] = {
+  9, 6, 6, 5, 5, 6, 6, 6, 6, 5, 4, 4, 3, 2, 1
 };
 
 
-const uint8_t g_kuiTotalZerosChromaTable0[8][2] = //read 3 bits, generated by tzVlcIndex=1 in Table 9-9(a) in H.264/AVC standard
-{
-	{ 3, 3}, { 2, 3}, { 1, 2}, { 1, 2}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}
+const uint8_t g_kuiTotalZerosChromaTable0[8][2]
+= { //read 3 bits, generated by tzVlcIndex=1 in Table 9-9(a) in H.264/AVC standard
+  { 3, 3}, { 2, 3}, { 1, 2}, { 1, 2}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}
 };
 
-const uint8_t g_kuiTotalZerosChromaTable1[4][2] = //read 2 bits, generated by tzVlcIndex=2 in Table 9-9(a) in H.264/AVC standard
-{
-	{ 2, 2}, { 1, 2}, { 0, 1}, { 0, 1}
+const uint8_t g_kuiTotalZerosChromaTable1[4][2]
+= { //read 2 bits, generated by tzVlcIndex=2 in Table 9-9(a) in H.264/AVC standard
+  { 2, 2}, { 1, 2}, { 0, 1}, { 0, 1}
 };
 
-const uint8_t g_kuiTotalZerosChromaTable2[2][2] = //read 1 bits, generated by tzVlcIndex=3 in Table 9-9(a) in H.264/AVC standard
-{
-	{ 1, 1}, { 0, 1}
+const uint8_t g_kuiTotalZerosChromaTable2[2][2]
+= { //read 1 bits, generated by tzVlcIndex=3 in Table 9-9(a) in H.264/AVC standard
+  { 1, 1}, { 0, 1}
 };
 
-const uint8_t g_kuiTotalZerosBitNumChromaMap[3] = 
-{
-	3, 2, 1
+const uint8_t g_kuiTotalZerosBitNumChromaMap[3] = {
+  3, 2, 1
 };
 
-const uint8_t g_kuiZeroLeftTable0[2][2] = //read 1 bits
-{
-	{1, 1}, {0, 1}
+const uint8_t g_kuiZeroLeftTable0[2][2] = { //read 1 bits
+  {1, 1}, {0, 1}
 };
 
-const uint8_t g_kuiZeroLeftTable1[4][2] = //read 2 bits
-{
-	{2, 2}, {1, 2}, {0, 1}, {0, 1}
+const uint8_t g_kuiZeroLeftTable1[4][2] = { //read 2 bits
+  {2, 2}, {1, 2}, {0, 1}, {0, 1}
 };
 
-const uint8_t g_kuiZeroLeftTable2[4][2] = //read 2 bits
-{
-	{3, 2}, {2, 2}, {1, 2}, {0, 2}
+const uint8_t g_kuiZeroLeftTable2[4][2] = { //read 2 bits
+  {3, 2}, {2, 2}, {1, 2}, {0, 2}
 };
 
-const uint8_t g_kuiZeroLeftTable3[8][2] = //read 3 bits
-{
-	{4, 3}, {3, 3}, {2, 2}, {2, 2}, {1, 2}, {1, 2}, {0, 2}, {0, 2}
+const uint8_t g_kuiZeroLeftTable3[8][2] = { //read 3 bits
+  {4, 3}, {3, 3}, {2, 2}, {2, 2}, {1, 2}, {1, 2}, {0, 2}, {0, 2}
 };
 
-const uint8_t g_kuiZeroLeftTable4[8][2] = //read 3 bits
-{
-	{5, 3}, {4, 3}, {3, 3}, {2, 3}, {1, 2}, {1, 2}, {0, 2}, {0, 2}
+const uint8_t g_kuiZeroLeftTable4[8][2] = { //read 3 bits
+  {5, 3}, {4, 3}, {3, 3}, {2, 3}, {1, 2}, {1, 2}, {0, 2}, {0, 2}
 };
 
-const uint8_t g_kuiZeroLeftTable5[8][2] = //read 3 bits
-{
-	{1, 3}, {2, 3}, {4, 3}, {3, 3}, {6, 3}, {5, 3}, {0, 2}, {0, 2}
+const uint8_t g_kuiZeroLeftTable5[8][2] = { //read 3 bits
+  {1, 3}, {2, 3}, {4, 3}, {3, 3}, {6, 3}, {5, 3}, {0, 2}, {0, 2}
 };
 
-const uint8_t g_kuiZeroLeftTable6[8][2] = //read 3 bits
-{
-	{7, 3}, {6, 3}, {5, 3}, {4, 3}, {3, 3}, {2, 3}, {1, 3}, {0, 3}
+const uint8_t g_kuiZeroLeftTable6[8][2] = { //read 3 bits
+  {7, 3}, {6, 3}, {5, 3}, {4, 3}, {3, 3}, {2, 3}, {1, 3}, {0, 3}
 };
 
-const uint8_t g_kuiZeroLeftBitNumMap[16] = 
-{
-	0, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3	
+const uint8_t g_kuiZeroLeftBitNumMap[16] = {
+  0, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
 };
 
 } // namespace WelsDec
--- a/codec/decoder/core/src/expand_pic.cpp
+++ b/codec/decoder/core/src/expand_pic.cpp
@@ -39,127 +39,121 @@
 namespace WelsDec {
 
 // rewrite it (split into luma & chroma) that is helpful for mmx/sse2 optimization perform, 9/27/2009
-static inline void_t ExpandPictureLuma_c( uint8_t *pDst, const int32_t kiStride, const int32_t kiPicWidth, const int32_t kiPicHeight )
-{
-	uint8_t *pTmp				= pDst;
-	uint8_t *pDstLastLine		= pTmp + (kiPicHeight-1) * kiStride;	
-	const int32_t kiPaddingLen	= PADDING_LENGTH;	
-	const uint8_t kuiTopLeft	= pTmp[0];
-	const uint8_t kuiTopRight	= pTmp[kiPicWidth-1];
-	const uint8_t kuiBottomLeft	= pDstLastLine[0];
-	const uint8_t kuiBottomRight= pDstLastLine[kiPicWidth-1];
-	int32_t i					= 0;
+static inline void_t ExpandPictureLuma_c (uint8_t* pDst, const int32_t kiStride, const int32_t kiPicWidth,
+    const int32_t kiPicHeight) {
+  uint8_t* pTmp				= pDst;
+  uint8_t* pDstLastLine		= pTmp + (kiPicHeight - 1) * kiStride;
+  const int32_t kiPaddingLen	= PADDING_LENGTH;
+  const uint8_t kuiTopLeft	= pTmp[0];
+  const uint8_t kuiTopRight	= pTmp[kiPicWidth - 1];
+  const uint8_t kuiBottomLeft	= pDstLastLine[0];
+  const uint8_t kuiBottomRight = pDstLastLine[kiPicWidth - 1];
+  int32_t i					= 0;
 
-	do {
-		const int32_t kiStrides	= (1+i) * kiStride;
-		uint8_t* pTop			= pTmp - kiStrides;
-		uint8_t* pBottom		= pDstLastLine + kiStrides;
-		
-		// pad pTop and pBottom
-		memcpy(pTop, pTmp, kiPicWidth);
-		memcpy(pBottom, pDstLastLine, kiPicWidth);
-		
-		// pad corners
-		memset(pTop-kiPaddingLen,    kuiTopLeft,     kiPaddingLen); //pTop left
-		memset(pTop+kiPicWidth,      kuiTopRight,    kiPaddingLen); //pTop right
-		memset(pBottom-kiPaddingLen, kuiBottomLeft,  kiPaddingLen); //pBottom left
-		memset(pBottom+kiPicWidth,   kuiBottomRight, kiPaddingLen); //pBottom right
-		
-		++ i;
-	} while( i < kiPaddingLen );
+  do {
+    const int32_t kiStrides	= (1 + i) * kiStride;
+    uint8_t* pTop			= pTmp - kiStrides;
+    uint8_t* pBottom		= pDstLastLine + kiStrides;
 
-	// pad left and right
-	i = 0;
-	do {
-		memset(pTmp-kiPaddingLen, pTmp[0], kiPaddingLen);
-		memset(pTmp+kiPicWidth, pTmp[kiPicWidth-1], kiPaddingLen);
+    // pad pTop and pBottom
+    memcpy (pTop, pTmp, kiPicWidth);
+    memcpy (pBottom, pDstLastLine, kiPicWidth);
 
-		pTmp += kiStride;
-		++ i;
-	} while( i < kiPicHeight );
+    // pad corners
+    memset (pTop - kiPaddingLen,    kuiTopLeft,     kiPaddingLen); //pTop left
+    memset (pTop + kiPicWidth,      kuiTopRight,    kiPaddingLen); //pTop right
+    memset (pBottom - kiPaddingLen, kuiBottomLeft,  kiPaddingLen); //pBottom left
+    memset (pBottom + kiPicWidth,   kuiBottomRight, kiPaddingLen); //pBottom right
+
+    ++ i;
+  } while (i < kiPaddingLen);
+
+  // pad left and right
+  i = 0;
+  do {
+    memset (pTmp - kiPaddingLen, pTmp[0], kiPaddingLen);
+    memset (pTmp + kiPicWidth, pTmp[kiPicWidth - 1], kiPaddingLen);
+
+    pTmp += kiStride;
+    ++ i;
+  } while (i < kiPicHeight);
 }
 
-static inline void_t ExpandPictureChroma_c( uint8_t *pDst, const int32_t kiStride, const int32_t kiPicWidth, const int32_t kiPicHeight )
-{
-	uint8_t *pTmp				= pDst;
-	uint8_t *pDstLastLine		= pTmp + (kiPicHeight-1) * kiStride;	
-	const int32_t kiPaddingLen	= (PADDING_LENGTH>>1);	
-	const uint8_t kuiTopLeft	= pTmp[0];
-	const uint8_t kuiTopRight	= pTmp[kiPicWidth-1];
-	const uint8_t kuiBottomLeft	= pDstLastLine[0];
-	const uint8_t kuiBottomRight= pDstLastLine[kiPicWidth-1];
-	int32_t i					= 0;
-	
-	do {
-		const int32_t kiStrides	= (1+i) * kiStride;
-		uint8_t* pTop			= pTmp - kiStrides;
-		uint8_t* pBottom		= pDstLastLine + kiStrides;
-		
-		// pad pTop and pBottom
-		memcpy(pTop, pTmp, kiPicWidth);
-		memcpy(pBottom, pDstLastLine, kiPicWidth);
-		
-		// pad corners
-		memset(pTop-kiPaddingLen,    kuiTopLeft,     kiPaddingLen); //pTop left
-		memset(pTop+kiPicWidth,      kuiTopRight,    kiPaddingLen); //pTop right
-		memset(pBottom-kiPaddingLen, kuiBottomLeft,  kiPaddingLen); //pBottom left
-		memset(pBottom+kiPicWidth,   kuiBottomRight, kiPaddingLen); //pBottom right
-		
-		++ i;
-	} while( i < kiPaddingLen );
-	
-	// pad left and right
-	i = 0;
-	do {
-		memset(pTmp-kiPaddingLen, pTmp[0], kiPaddingLen);
-		memset(pTmp+kiPicWidth, pTmp[kiPicWidth-1], kiPaddingLen);
-		
-		pTmp += kiStride;
-		++ i;
-	} while( i < kiPicHeight );
+static inline void_t ExpandPictureChroma_c (uint8_t* pDst, const int32_t kiStride, const int32_t kiPicWidth,
+    const int32_t kiPicHeight) {
+  uint8_t* pTmp				= pDst;
+  uint8_t* pDstLastLine		= pTmp + (kiPicHeight - 1) * kiStride;
+  const int32_t kiPaddingLen	= (PADDING_LENGTH >> 1);
+  const uint8_t kuiTopLeft	= pTmp[0];
+  const uint8_t kuiTopRight	= pTmp[kiPicWidth - 1];
+  const uint8_t kuiBottomLeft	= pDstLastLine[0];
+  const uint8_t kuiBottomRight = pDstLastLine[kiPicWidth - 1];
+  int32_t i					= 0;
+  // Replicate the plane's border pixels outwards: kiPaddingLen rows above/below, kiPaddingLen bytes left/right.
+  do {
+    const int32_t kiStrides	= (1 + i) * kiStride;
+    uint8_t* pTop			= pTmp - kiStrides;
+    uint8_t* pBottom		= pDstLastLine + kiStrides;
+
+    // copy the first picture row into the row above, and the last picture row into the row below
+    memcpy (pTop, pTmp, kiPicWidth);
+    memcpy (pBottom, pDstLastLine, kiPicWidth);
+
+    // pad corners: each corner region is filled with the value of the nearest corner pixel
+    memset (pTop - kiPaddingLen,    kuiTopLeft,     kiPaddingLen); //pTop left
+    memset (pTop + kiPicWidth,      kuiTopRight,    kiPaddingLen); //pTop right
+    memset (pBottom - kiPaddingLen, kuiBottomLeft,  kiPaddingLen); //pBottom left
+    memset (pBottom + kiPicWidth,   kuiBottomRight, kiPaddingLen); //pBottom right
+
+    ++ i;
+  } while (i < kiPaddingLen);
+
+  // pad left and right: every picture row gets its first/last pixel repeated kiPaddingLen times
+  i = 0;
+  do {
+    memset (pTmp - kiPaddingLen, pTmp[0], kiPaddingLen);
+    memset (pTmp + kiPicWidth, pTmp[kiPicWidth - 1], kiPaddingLen);
+
+    pTmp += kiStride;
+    ++ i;
+  } while (i < kiPicHeight);	// do/while: the body runs at least once, so kiPicHeight >= 1 is assumed
 }
 
-void_t InitExpandPictureFunc( SExpandPicFunc *pExpandPicFunc, const uint32_t kuiCpuFlags )
-{
-	pExpandPicFunc->pExpandLumaPicture	= ExpandPictureLuma_c;
-	pExpandPicFunc->pExpandChromaPicture[0] = ExpandPictureChroma_c;
-	pExpandPicFunc->pExpandChromaPicture[1] = ExpandPictureChroma_c;
+void_t InitExpandPictureFunc (SExpandPicFunc* pExpandPicFunc, const uint32_t kuiCpuFlags) {
+  pExpandPicFunc->pExpandLumaPicture	= ExpandPictureLuma_c;
+  pExpandPicFunc->pExpandChromaPicture[0] = ExpandPictureChroma_c;
+  pExpandPicFunc->pExpandChromaPicture[1] = ExpandPictureChroma_c;
 
 #if defined(X86_ASM)
-	if ( (kuiCpuFlags & WELS_CPU_SSE2) == WELS_CPU_SSE2 )
-	{
-		pExpandPicFunc->pExpandLumaPicture	   = ExpandPictureLuma_sse2;
-		pExpandPicFunc->pExpandChromaPicture[0] = ExpandPictureChromaUnalign_sse2;
-		pExpandPicFunc->pExpandChromaPicture[1] = ExpandPictureChromaAlign_sse2;
-	}
+  if ((kuiCpuFlags & WELS_CPU_SSE2) == WELS_CPU_SSE2) {
+    pExpandPicFunc->pExpandLumaPicture	   = ExpandPictureLuma_sse2;
+    pExpandPicFunc->pExpandChromaPicture[0] = ExpandPictureChromaUnalign_sse2;
+    pExpandPicFunc->pExpandChromaPicture[1] = ExpandPictureChromaAlign_sse2;
+  }
 #endif//X86_ASM
 }
 
-void_t ExpandReferencingPicture(PPicture pPic, PExpandPictureFunc pExpLuma, PExpandPictureFunc pExpChroma[2])
-{
-	/*local variable*/
-	uint8_t *pPicY = pPic->pData[0];
-	uint8_t *pPicCb = pPic->pData[1];
-	uint8_t *pPicCr = pPic->pData[2];
-	const int32_t kiWidthY	= pPic->iWidthInPixel;
-	const int32_t kiHeightY	= pPic->iHeightInPixel;
-	const int32_t kiWidthUV	= kiWidthY >> 1;
-	const int32_t kiHeightUV= kiHeightY >> 1;	
-	
-    pExpLuma(pPicY, pPic->iLinesize[0], kiWidthY, kiHeightY);	
-	if ( kiWidthUV >= 16 )
-	{
-		// fix coding picture size as 16x16 issues 7/27/2010
-		const bool_t kbChrAligned= /*(kiWidthUV >= 16) && */((kiWidthUV & 0x0F) == 0);	// chroma planes: (16+kiWidthUV) & 15
-		pExpChroma[kbChrAligned](pPicCb, pPic->iLinesize[1], kiWidthUV, kiHeightUV);
-		pExpChroma[kbChrAligned](pPicCr, pPic->iLinesize[2], kiWidthUV, kiHeightUV);
-	}
-	else
-	{
-		// fix coding picture size as 16x16 issues 7/27/2010
-		ExpandPictureChroma_c(pPicCb, pPic->iLinesize[1], kiWidthUV, kiHeightUV);
-		ExpandPictureChroma_c(pPicCr, pPic->iLinesize[2], kiWidthUV, kiHeightUV);
-	}
+void_t ExpandReferencingPicture (PPicture pPic, PExpandPictureFunc pExpLuma, PExpandPictureFunc pExpChroma[2]) {
+  /* Pads the borders of all three planes of pPic: luma through pExpLuma, chroma through pExpChroma or the C routine. */
+  uint8_t* pPicY = pPic->pData[0];
+  uint8_t* pPicCb = pPic->pData[1];
+  uint8_t* pPicCr = pPic->pData[2];
+  const int32_t kiWidthY	= pPic->iWidthInPixel;
+  const int32_t kiHeightY	= pPic->iHeightInPixel;
+  const int32_t kiWidthUV	= kiWidthY >> 1;	// chroma planes are treated as half the luma width
+  const int32_t kiHeightUV = kiHeightY >> 1;	// ... and half the luma height
+
+  pExpLuma (pPicY, pPic->iLinesize[0], kiWidthY, kiHeightY);
+  if (kiWidthUV >= 16) {
+    // fix coding picture size as 16x16 issues 7/27/2010
+    const bool_t kbChrAligned = /*(kiWidthUV >= 16) && */ ((kiWidthUV & 0x0F) == 0);	// index 1 when the chroma width is a multiple of 16, else 0
+    pExpChroma[kbChrAligned] (pPicCb, pPic->iLinesize[1], kiWidthUV, kiHeightUV);
+    pExpChroma[kbChrAligned] (pPicCr, pPic->iLinesize[2], kiWidthUV, kiHeightUV);
+  } else {
+    // chroma narrower than 16 pixels: always use the C routine, whatever pExpChroma holds (16x16 picture fix, 7/27/2010)
+    ExpandPictureChroma_c (pPicCb, pPic->iLinesize[1], kiWidthUV, kiHeightUV);
+    ExpandPictureChroma_c (pPicCr, pPic->iLinesize[2], kiWidthUV, kiHeightUV);
+  }
 }
 
 } // namespace WelsDec
--- a/codec/decoder/core/src/fmo.cpp
+++ b/codec/decoder/core/src/fmo.cpp
@@ -39,7 +39,7 @@
  */
 #include <string.h>
 
-#include "fmo.h" 
+#include "fmo.h"
 #include "macros.h"
 #include "utils.h"
 #include "mem_align.h"
@@ -50,41 +50,39 @@
  * \brief	Generate MB allocated map for interleaved slice group (TYPE 0)
  *
  * \param	pFmo	fmo context
- * \param	pPps	pps context 
+ * \param	pPps	pps context
  *
  * \return	0 - successful; none 0 - failed
  */
-static inline int32_t FmoGenerateMbAllocMapType0( PFmo pFmo, PPps pPps )
-{
-	uint32_t uiNumSliceGroups = 0;
-	int32_t iMbNum = 0;
-	int32_t i = 0;
+static inline int32_t FmoGenerateMbAllocMapType0 (PFmo pFmo, PPps pPps) {
+  uint32_t uiNumSliceGroups = 0;
+  int32_t iMbNum = 0;
+  int32_t i = 0;
 
-	WELS_VERIFY_RETURN_IF( 1, ( NULL == pFmo || NULL == pPps ) )		
-	uiNumSliceGroups = pPps->uiNumSliceGroups;
-	iMbNum = pFmo->iCountMbNum;
-	WELS_VERIFY_RETURN_IF( 1, ( NULL == pFmo->pMbAllocMap || iMbNum <= 0 || uiNumSliceGroups >= MAX_SLICEGROUP_IDS ) )
-	
-	do
-	{
-		uint8_t uiGroup = 0;
-		do {
-			const int32_t kiRunIdx = pPps->uiRunLength[uiGroup];
-			int32_t j = 0;
-			do {
-				pFmo->pMbAllocMap[i+j] = uiGroup;
-				++ j;
-			} while(j < kiRunIdx && i + j < iMbNum);
-			i += kiRunIdx;
-			++ uiGroup;
-		} while(uiGroup < uiNumSliceGroups && i < iMbNum);
-	}while(i < iMbNum);
-	
-	return 0; // well here
+  WELS_VERIFY_RETURN_IF (1, (NULL == pFmo || NULL == pPps))
+  uiNumSliceGroups = pPps->uiNumSliceGroups;
+  iMbNum = pFmo->iCountMbNum;
+  WELS_VERIFY_RETURN_IF (1, (NULL == pFmo->pMbAllocMap || iMbNum <= 0 || uiNumSliceGroups >= MAX_SLICEGROUP_IDS))
+
+  do {
+    uint8_t uiGroup = 0;
+    do {
+      const int32_t kiRunIdx = pPps->uiRunLength[uiGroup];
+      int32_t j = 0;
+      do {
+        pFmo->pMbAllocMap[i + j] = uiGroup;
+        ++ j;
+      } while (j < kiRunIdx && i + j < iMbNum);
+      i += kiRunIdx;
+      ++ uiGroup;
+    } while (uiGroup < uiNumSliceGroups && i < iMbNum);
+  } while (i < iMbNum);
+
+  return 0; // well here
 }
 
 /*!
- * \brief	Generate MB allocated map for dispersed slice group (TYPE 1)	
+ * \brief	Generate MB allocated map for dispersed slice group (TYPE 1)
  *
  * \param	pFmo	fmo context
  * \param	pPps	pps context
@@ -92,98 +90,93 @@
  *
  * \return	0 - successful; none 0 - failed
  */
-static inline int32_t FmoGenerateMbAllocMapType1( PFmo pFmo, PPps pPps, const int32_t kiMbWidth )
-{
-	uint32_t uiNumSliceGroups = 0;
-	int32_t iMbNum = 0;
-	int16_t i = 0;
-	WELS_VERIFY_RETURN_IF( 1, ( NULL == pFmo || NULL == pPps ) )
-	uiNumSliceGroups = pPps->uiNumSliceGroups;
-	iMbNum			 = pFmo->iCountMbNum;
-	WELS_VERIFY_RETURN_IF( 1, ( NULL == pFmo->pMbAllocMap || iMbNum <= 0 || kiMbWidth == 0  || uiNumSliceGroups >= MAX_SLICEGROUP_IDS ) )
-		
-	do	
-	{
-		pFmo->pMbAllocMap[i] = (uint8_t)(((i % kiMbWidth)+(((i / kiMbWidth)*uiNumSliceGroups)>>1)) % uiNumSliceGroups);
-		++ i;
-	}while (i < iMbNum);
-	
-	return 0; // well here
+static inline int32_t FmoGenerateMbAllocMapType1 (PFmo pFmo, PPps pPps, const int32_t kiMbWidth) {
+  uint32_t uiNumSliceGroups = 0;
+  int32_t iMbNum = 0;
+  int32_t i = 0;	// MB index; same width as iMbNum, since a 16-bit counter wraps once a picture exceeds 32767 MBs
+  WELS_VERIFY_RETURN_IF (1, (NULL == pFmo || NULL == pPps))
+  uiNumSliceGroups = pPps->uiNumSliceGroups;
+  iMbNum			 = pFmo->iCountMbNum;
+  WELS_VERIFY_RETURN_IF (1, (NULL == pFmo->pMbAllocMap || iMbNum <= 0 || kiMbWidth == 0
+                             || uiNumSliceGroups == 0 || uiNumSliceGroups >= MAX_SLICEGROUP_IDS))
+  // Dispersed layout: group = (column + ((row * groups) >> 1)) % groups, for every MB in raster order.
+  do {
+    pFmo->pMbAllocMap[i] = (uint8_t) (((i % kiMbWidth) + (((i / kiMbWidth) * uiNumSliceGroups) >> 1)) % uiNumSliceGroups);
+    ++ i;
+  } while (i < iMbNum);
+
+  return 0; // well here
 }
 
 /*!
  * \brief	Generate MB allocated map for various type of slice group cases (TYPE 0, .., 6)
  *
- * \param	pFmo		fmo context 
+ * \param	pFmo		fmo context
  * \param	pPps		pps context
  * \param	kiMbWidth	MB width
  * \param	kiMbHeight	MB height
  *
- * \return	0 - successful; none 0 - failed	
+ * \return	0 - successful; non-zero - failed
  */
-static inline int32_t FmoGenerateSliceGroup( PFmo pFmo, const PPps kpPps, const int32_t kiMbWidth, const int32_t kiMbHeight )
-{
-	int32_t iNumMb	= 0;
-	int32_t iErr		= 0;
-	bool_t	bResolutionChanged = false;
+static inline int32_t FmoGenerateSliceGroup (PFmo pFmo, const PPps kpPps, const int32_t kiMbWidth,
+    const int32_t kiMbHeight) {
+  int32_t iNumMb	= 0;
+  int32_t iErr		= 0;
+  bool_t	bResolutionChanged = false;
 
-	// the cases we would not like
-	WELS_VERIFY_RETURN_IF( 1, ( NULL == pFmo || NULL == kpPps ) )
-	
-	iNumMb	= pFmo->iCountMbNum;
+  // the cases we would not like
+  WELS_VERIFY_RETURN_IF (1, (NULL == pFmo || NULL == kpPps))
 
-	iNumMb = kiMbWidth * kiMbHeight;
-	
-	if ( 0 == iNumMb )
-		return 1;		
+  iNumMb	= pFmo->iCountMbNum;
 
+  iNumMb = kiMbWidth * kiMbHeight;
 
-    WelsFree(pFmo->pMbAllocMap, "_fmo->pMbAllocMap");
-	pFmo->pMbAllocMap	= (uint8_t *)WelsMalloc( iNumMb * sizeof(uint8_t), "_fmo->pMbAllocMap" );		
-	WELS_VERIFY_RETURN_IF( 1, (NULL == pFmo->pMbAllocMap) )	// out of memory		
-	
-	pFmo->iCountMbNum	= iNumMb;		
+  if (0 == iNumMb)
+    return 1;
 
-	if ( kpPps->uiNumSliceGroups < 2 && iNumMb > 0) // only one slice group, exactly it is single slice based
-	{		
-		memset ( pFmo->pMbAllocMap, 0,  iNumMb * sizeof(int8_t));	// for safe
-		
-		pFmo->iSliceGroupCount		= 1;
-		
-		return 0;
-	}	
-		
-	if ( bResolutionChanged || ((int32_t)kpPps->uiSliceGroupMapType != pFmo->iSliceGroupType) 
-			|| ((int32_t)kpPps->uiNumSliceGroups != pFmo->iSliceGroupCount)	)
-	{
-		switch ( kpPps->uiSliceGroupMapType )
-		{
-		case 0:
-			iErr	= FmoGenerateMbAllocMapType0( pFmo, kpPps );			
-			break;
-		case 1:			
-			iErr = FmoGenerateMbAllocMapType1( pFmo, kpPps, kiMbWidth );
-			break;
-		case 2:
-		case 3:
-		case 4:
-		case 5:
-		case 6:
-			// Reserve for others slice group type
-			iErr	= 1;
-			break;
-		default:
-			return 1;
-		}
-	}
-	
-	if ( 0 == iErr )	// well now
-	{
-		pFmo->iSliceGroupCount	= kpPps->uiNumSliceGroups;
-		pFmo->iSliceGroupType	= kpPps->uiSliceGroupMapType;
-	}
 
-	return iErr;
+  WelsFree (pFmo->pMbAllocMap, "_fmo->pMbAllocMap");
+  pFmo->pMbAllocMap	= (uint8_t*)WelsMalloc (iNumMb * sizeof (uint8_t), "_fmo->pMbAllocMap");
+  WELS_VERIFY_RETURN_IF (1, (NULL == pFmo->pMbAllocMap))	// out of memory
+
+  pFmo->iCountMbNum	= iNumMb;
+
+  if (kpPps->uiNumSliceGroups < 2 && iNumMb > 0) { // only one slice group, exactly it is single slice based
+    memset (pFmo->pMbAllocMap, 0,  iNumMb * sizeof (int8_t));	// for safe
+
+    pFmo->iSliceGroupCount		= 1;
+
+    return 0;
+  }
+  bResolutionChanged = true;	// the map buffer was replaced above and holds no layout yet, so always rebuild it
+  if (bResolutionChanged || ((int32_t)kpPps->uiSliceGroupMapType != pFmo->iSliceGroupType)
+      || ((int32_t)kpPps->uiNumSliceGroups != pFmo->iSliceGroupCount)) {
+    switch (kpPps->uiSliceGroupMapType) {
+    case 0:
+      iErr	= FmoGenerateMbAllocMapType0 (pFmo, kpPps);
+      break;
+    case 1:
+      iErr = FmoGenerateMbAllocMapType1 (pFmo, kpPps, kiMbWidth);
+      break;
+    case 2:
+    case 3:
+    case 4:
+    case 5:
+    case 6:
+      // Reserve for others slice group type: not implemented, reported as failure
+      iErr	= 1;
+      break;
+    default:
+      return 1;
+    }
+  }
+
+  if (0 == iErr) {	// map built: remember which layout it describes
+    pFmo->iSliceGroupCount	= kpPps->uiNumSliceGroups;
+    pFmo->iSliceGroupType	= kpPps->uiSliceGroupMapType;
+  }
+
+  return iErr;
 }
 
 /*!
@@ -196,9 +189,8 @@
  *
  * \return	0 - successful; none 0 - failed;
  */
-int32_t	InitFmo( PFmo pFmo, PPps pPps, const int32_t kiMbWidth, const int32_t kiMbHeight )
-{
-	return FmoGenerateSliceGroup( pFmo, pPps, kiMbWidth, kiMbHeight );
+int32_t	InitFmo (PFmo pFmo, PPps pPps, const int32_t kiMbWidth, const int32_t kiMbHeight) {
+  return FmoGenerateSliceGroup (pFmo, pPps, kiMbWidth, kiMbHeight);	// thin wrapper: arguments and result pass through unchanged
 }
 
 
@@ -211,35 +203,32 @@
  *
  * \return	NONE
  */
-void_t UninitFmoList( PFmo pFmo, const int32_t kiCnt, const int32_t kiAvail )
-{
-	PFmo pIter = pFmo;
-	int32_t i = 0;
-	int32_t iFreeNodes = 0;
+void_t UninitFmoList (PFmo pFmo, const int32_t kiCnt, const int32_t kiAvail) {
+  PFmo pIter = pFmo;
+  int32_t i = 0;
+  int32_t iFreeNodes = 0;
 
-	if ( NULL == pIter || kiAvail <= 0 || kiCnt < kiAvail )
-		return;
+  if (NULL == pIter || kiAvail <= 0 || kiCnt < kiAvail)
+    return;
 
-	while ( i < kiCnt ) {
-		if ( pIter != NULL && pIter->bActiveFlag )
-		{
-			if ( NULL != pIter->pMbAllocMap )
-			{
-				WelsFree( pIter->pMbAllocMap, "pIter->pMbAllocMap" );
+  while (i < kiCnt) {
+    if (pIter != NULL && pIter->bActiveFlag) {
+      if (NULL != pIter->pMbAllocMap) {
+        WelsFree (pIter->pMbAllocMap, "pIter->pMbAllocMap");
 
-				pIter->pMbAllocMap	= NULL;
-			}
-			pIter->iSliceGroupCount	= 0;
-			pIter->iSliceGroupType	= -1;
-			pIter->iCountMbNum		= 0;
-			pIter->bActiveFlag		= false;
-			++ iFreeNodes;
-			if ( iFreeNodes >= kiAvail )
-				break;
-		}
-		++ pIter;
-		++ i;
-	}
+        pIter->pMbAllocMap	= NULL;
+      }
+      pIter->iSliceGroupCount	= 0;
+      pIter->iSliceGroupType	= -1;
+      pIter->iCountMbNum		= 0;
+      pIter->bActiveFlag		= false;
+      ++ iFreeNodes;
+      if (iFreeNodes >= kiAvail)
+        break;
+    }
+    ++ pIter;
+    ++ i;
+  }
 }
 
 /*!
@@ -252,14 +241,14 @@
  *
  * \return	true - changed or not initialized yet; false - not change at all
  */
-bool_t FmoParamSetsChanged( PFmo pFmo, const int32_t kiCountNumMb, const int32_t kiSliceGroupType, const int32_t kiSliceGroupCount )
-{
-	WELS_VERIFY_RETURN_IF( false, (NULL == pFmo) )
-	
-	return  ( (!pFmo->bActiveFlag)
-			|| (kiCountNumMb != pFmo->iCountMbNum)
-			|| (kiSliceGroupType != pFmo->iSliceGroupType)
-			|| (kiSliceGroupCount != pFmo->iSliceGroupCount) );
+bool_t FmoParamSetsChanged (PFmo pFmo, const int32_t kiCountNumMb, const int32_t kiSliceGroupType,
+                            const int32_t kiSliceGroupCount) {
+  WELS_VERIFY_RETURN_IF (false, (NULL == pFmo))	// presumably returns false for a NULL pFmo; macro is defined elsewhere
+  // "Changed" means: the entry is not active yet, or any of the three stored values differs from the argument.
+  return ((!pFmo->bActiveFlag)
+          || (kiCountNumMb != pFmo->iCountMbNum)
+          || (kiSliceGroupType != pFmo->iSliceGroupType)
+          || (kiSliceGroupCount != pFmo->iSliceGroupCount));
 }
 
 /*!
@@ -272,32 +261,26 @@
  *
  * \return	true - update/insert successfully; false - failed;
  */
-bool_t FmoParamUpdate( PFmo pFmo, PSps pSps, PPps pPps, int32_t *pActiveFmoNum )
-{
-	const uint32_t kuiMbWidth = pSps->iMbWidth;
-	const uint32_t kuiMbHeight= pSps->iMbHeight;
+bool_t FmoParamUpdate (PFmo pFmo, PSps pSps, PPps pPps, int32_t* pActiveFmoNum) {
+  const uint32_t kuiMbWidth = pSps->iMbWidth;
+  const uint32_t kuiMbHeight = pSps->iMbHeight;
 
-	if ( FmoParamSetsChanged(	pFmo,
-									kuiMbWidth * kuiMbHeight,
-									pPps->uiSliceGroupMapType,
-									pPps->uiNumSliceGroups	) )
-	{
+  if (FmoParamSetsChanged (pFmo,
+                           kuiMbWidth * kuiMbHeight,
+                           pPps->uiSliceGroupMapType,
+                           pPps->uiNumSliceGroups)) {
 
-		if ( InitFmo( pFmo, pPps, kuiMbWidth, kuiMbHeight ) )
-		{
-			return false;
-		}
-		else
-		{
-			if ( !pFmo->bActiveFlag && *pActiveFmoNum < MAX_PPS_COUNT )
-			{
-				++ (*pActiveFmoNum);
-				pFmo->bActiveFlag	= true;
-			}
-		}
-	}
+    if (InitFmo (pFmo, pPps, kuiMbWidth, kuiMbHeight)) {
+      return false;
+    } else {
+      if (!pFmo->bActiveFlag && *pActiveFmoNum < MAX_PPS_COUNT) {
+        ++ (*pActiveFmoNum);
+        pFmo->bActiveFlag	= true;
+      }
+    }
+  }
 
-	return true;
+  return true;
 }
 
 /*!
@@ -304,19 +287,18 @@
  * \brief	Convert kMbXy to slice group idc correspondingly
  *
  * \param	pFmo		Wels fmo context
- * \param	kMbXy		kMbXy to be converted 
+ * \param	kiMbXy		MB index (kMbXy) to be converted
  *
  * \return	slice group idc - successful; -1 - failed;
  */
-int32_t FmoMbToSliceGroup( PFmo pFmo, const MB_XY_T kiMbXy )
-{
-	const int32_t kiMbNum	= pFmo->iCountMbNum;
-	const uint8_t* kpMbMap	= pFmo->pMbAllocMap;
-	
-	if ( kiMbXy < 0 || kiMbXy >= kiMbNum || kpMbMap == NULL)
-		return -1;
-	
-	return kpMbMap[ kiMbXy ];
+int32_t FmoMbToSliceGroup (PFmo pFmo, const MB_XY_T kiMbXy) {
+  const int32_t kiMbNum	= pFmo->iCountMbNum;	// note: pFmo itself is dereferenced without a NULL check
+  const uint8_t* kpMbMap	= pFmo->pMbAllocMap;
+  // Reject indices outside [0, kiMbNum) and a map that has not been allocated.
+  if (kiMbXy < 0 || kiMbXy >= kiMbNum || kpMbMap == NULL)
+    return -1;
+  // Plain table lookup: the value stored in the map for this MB.
+  return kpMbMap[ kiMbXy ];
 }
 
 /*!
@@ -327,29 +309,28 @@
  *
  * \return	iNextMb - successful; -1 - failed;
  */
-MB_XY_T FmoNextMb( PFmo pFmo, const MB_XY_T kiMbXy )
-{
-	const int32_t kiTotalMb			= pFmo->iCountMbNum;
-	const uint8_t* kpMbMap			= pFmo->pMbAllocMap;
-	MB_XY_T iNextMb					= kiMbXy;
-	const uint8_t kuiSliceGroupIdc	= (uint8_t)FmoMbToSliceGroup( pFmo, kiMbXy );
-	
-	if (kuiSliceGroupIdc == (uint8_t)(-1))
-		return -1;
-	
-	do {
-		++ iNextMb;
-		if (iNextMb >= kiTotalMb){
-			iNextMb	= -1;
-			break;
-		}
-		if (kpMbMap[iNextMb] == kuiSliceGroupIdc){
-			break;
-		}
-	} while( 1 );
-	
-	// -1: No further MB in this slice (could be end of picture)
-	return iNextMb;
+MB_XY_T FmoNextMb (PFmo pFmo, const MB_XY_T kiMbXy) {
+  const int32_t kiTotalMb			= pFmo->iCountMbNum;
+  const uint8_t* kpMbMap			= pFmo->pMbAllocMap;
+  MB_XY_T iNextMb					= kiMbXy;
+  const uint8_t kuiSliceGroupIdc	= (uint8_t)FmoMbToSliceGroup (pFmo, kiMbXy);
+  // FmoMbToSliceGroup signals failure with -1; after the uint8_t cast that is the value tested below.
+  if (kuiSliceGroupIdc == (uint8_t) (-1))
+    return -1;
+  // Scan forward from kiMbXy + 1 for the next MB whose map entry equals this MB's slice group.
+  do {
+    ++ iNextMb;
+    if (iNextMb >= kiTotalMb) {
+      iNextMb	= -1;	// ran past the last MB without finding a match
+      break;
+    }
+    if (kpMbMap[iNextMb] == kuiSliceGroupIdc) {
+      break;
+    }
+  } while (1);
+
+  // -1: No further MB in this slice (could be end of picture)
+  return iNextMb;
 }
 
 } // namespace WelsDec
\ No newline at end of file
--- a/codec/decoder/core/src/get_intra_predictor.cpp
+++ b/codec/decoder/core/src/get_intra_predictor.cpp
@@ -51,650 +51,606 @@
 #define I8x8_COUNT 8
 #define I16x16_COUNT 16
 
-void_t WelsI4x4LumaPredV_c(uint8_t *pPred, const int32_t kiStride)
-{
-	const uint32_t kuiVal = LD32(pPred-kiStride);
+void_t WelsI4x4LumaPredV_c (uint8_t* pPred, const int32_t kiStride) {
+  const uint32_t kuiVal = LD32 (pPred - kiStride);
 
-	ST32( pPred						    , kuiVal );
-	ST32( pPred+kiStride				, kuiVal );
-	ST32( pPred+(kiStride<<1)			, kuiVal );
-	ST32( pPred+(kiStride<<1)+kiStride	, kuiVal );	
+  ST32 (pPred						    , kuiVal);
+  ST32 (pPred + kiStride				, kuiVal);
+  ST32 (pPred + (kiStride << 1)			, kuiVal);
+  ST32 (pPred + (kiStride << 1) + kiStride	, kuiVal);
 }
 
-void_t WelsI4x4LumaPredH_c(uint8_t *pPred, const int32_t kiStride)
-{
-	const int32_t kiStride2 = kiStride << 1;
-	const int32_t kiStride3 = kiStride2 + kiStride;
-	const uint32_t kuiL0 = 0x01010101U * pPred[-1          ];
-	const uint32_t kuiL1 = 0x01010101U * pPred[-1+kiStride ];
-	const uint32_t kuiL2 = 0x01010101U * pPred[-1+kiStride2];
-	const uint32_t kuiL3 = 0x01010101U * pPred[-1+kiStride3];
+void_t WelsI4x4LumaPredH_c (uint8_t* pPred, const int32_t kiStride) {
+  const int32_t kiStride2 = kiStride << 1;
+  const int32_t kiStride3 = kiStride2 + kiStride;
+  const uint32_t kuiL0 = 0x01010101U * pPred[-1          ];
+  const uint32_t kuiL1 = 0x01010101U * pPred[-1 + kiStride ];
+  const uint32_t kuiL2 = 0x01010101U * pPred[-1 + kiStride2];
+  const uint32_t kuiL3 = 0x01010101U * pPred[-1 + kiStride3];
 
-	ST32( pPred          , kuiL0 );
-	ST32( pPred+kiStride , kuiL1 );
-	ST32( pPred+kiStride2, kuiL2 );
-	ST32( pPred+kiStride3, kuiL3 );	
+  ST32 (pPred          , kuiL0);
+  ST32 (pPred + kiStride , kuiL1);
+  ST32 (pPred + kiStride2, kuiL2);
+  ST32 (pPred + kiStride3, kuiL3);
 }
 
-void_t WelsI4x4LumaPredDc_c(uint8_t *pPred, const int32_t kiStride)
-{
-	const int32_t kiStride2	= kiStride << 1;
-	const int32_t kiStride3	= kiStride2 + kiStride;
-	const uint8_t kuiMean	= (	pPred[-1] + pPred[-1+kiStride] + pPred[-1+kiStride2] + pPred[-1+kiStride3] +
-								pPred[-kiStride] + pPred[-kiStride+1] + pPred[-kiStride+2] + pPred[-kiStride+3] + 4 ) >> 3;
-	const uint32_t kuiMean32= 0x01010101U * kuiMean;
+void_t WelsI4x4LumaPredDc_c (uint8_t* pPred, const int32_t kiStride) {
+  const int32_t kiStride2	= kiStride << 1;
+  const int32_t kiStride3	= kiStride2 + kiStride;
+  const uint8_t kuiMean	= (pPred[-1] + pPred[-1 + kiStride] + pPred[-1 + kiStride2] + pPred[-1 + kiStride3] +
+                           pPred[-kiStride] + pPred[-kiStride + 1] + pPred[-kiStride + 2] + pPred[-kiStride + 3] + 4) >> 3;
+  const uint32_t kuiMean32 = 0x01010101U * kuiMean;
 
-	ST32( pPred          , kuiMean32 );
-	ST32( pPred+kiStride , kuiMean32 );
-	ST32( pPred+kiStride2, kuiMean32 );
-	ST32( pPred+kiStride3, kuiMean32 );	
+  ST32 (pPred          , kuiMean32);
+  ST32 (pPred + kiStride , kuiMean32);
+  ST32 (pPred + kiStride2, kuiMean32);
+  ST32 (pPred + kiStride3, kuiMean32);
 }
 
-void_t WelsI4x4LumaPredDcLeft_c(uint8_t *pPred, const int32_t kiStride)
-{
-	const int32_t kiStride2	= kiStride << 1;
-	const int32_t kiStride3	= kiStride2 + kiStride;
-	const uint8_t kuiMean	= ( pPred[-1] + pPred[-1+kiStride] + pPred[-1+kiStride2] + pPred[-1+kiStride3] + 2 ) >> 2;
-	const uint32_t kuiMean32= 0x01010101U * kuiMean;
+void_t WelsI4x4LumaPredDcLeft_c (uint8_t* pPred, const int32_t kiStride) {
+  const int32_t kiStride2	= kiStride << 1;
+  const int32_t kiStride3	= kiStride2 + kiStride;
+  const uint8_t kuiMean	= (pPred[-1] + pPred[-1 + kiStride] + pPred[-1 + kiStride2] + pPred[-1 + kiStride3] + 2) >> 2;
+  const uint32_t kuiMean32 = 0x01010101U * kuiMean;
 
-	ST32( pPred          , kuiMean32 );
-	ST32( pPred+kiStride , kuiMean32 );
-	ST32( pPred+kiStride2, kuiMean32 );
-	ST32( pPred+kiStride3, kuiMean32 );	
+  ST32 (pPred          , kuiMean32);
+  ST32 (pPred + kiStride , kuiMean32);
+  ST32 (pPred + kiStride2, kuiMean32);
+  ST32 (pPred + kiStride3, kuiMean32);
 }
 
-void_t WelsI4x4LumaPredDcTop_c(uint8_t *pPred, const int32_t kiStride)
-{
-	const int32_t kiStride2	= kiStride << 1;
-	const int32_t kiStride3	= kiStride2 + kiStride;
-	const uint8_t kuiMean	= (pPred[-kiStride] + pPred[-kiStride+1] + pPred[-kiStride+2] + pPred[-kiStride+3] + 2) >> 2;
-	const uint32_t kuiMean32= 0x01010101U * kuiMean;
+void_t WelsI4x4LumaPredDcTop_c (uint8_t* pPred, const int32_t kiStride) {
+  const int32_t kiStride2	= kiStride << 1;
+  const int32_t kiStride3	= kiStride2 + kiStride;
+  const uint8_t kuiMean	= (pPred[-kiStride] + pPred[-kiStride + 1] + pPred[-kiStride + 2] + pPred[-kiStride + 3] + 2) >>
+                          2;
+  const uint32_t kuiMean32 = 0x01010101U * kuiMean;
 
-	ST32( pPred          , kuiMean32 );
-	ST32( pPred+kiStride , kuiMean32 );
-	ST32( pPred+kiStride2, kuiMean32 );
-	ST32( pPred+kiStride3, kuiMean32 );	
+  ST32 (pPred          , kuiMean32);
+  ST32 (pPred + kiStride , kuiMean32);
+  ST32 (pPred + kiStride2, kuiMean32);
+  ST32 (pPred + kiStride3, kuiMean32);
 }
 
-void_t WelsI4x4LumaPredDcNA_c(uint8_t *pPred, const int32_t kiStride)
-{
-	const uint32_t kuiDC32		= 0x80808080U;
+void_t WelsI4x4LumaPredDcNA_c (uint8_t* pPred, const int32_t kiStride) {
+  const uint32_t kuiDC32		= 0x80808080U;
 
-	ST32( pPred                       , kuiDC32 );
-	ST32( pPred+kiStride              , kuiDC32 );
-	ST32( pPred+(kiStride<<1)         , kuiDC32 );
-	ST32( pPred+(kiStride<<1)+kiStride, kuiDC32 );
+  ST32 (pPred                       , kuiDC32);
+  ST32 (pPred + kiStride              , kuiDC32);
+  ST32 (pPred + (kiStride << 1)         , kuiDC32);
+  ST32 (pPred + (kiStride << 1) + kiStride, kuiDC32);
 }
 
 /*down pLeft*/
-void_t WelsI4x4LumaPredDDL_c(uint8_t *pPred, const int32_t kiStride)
-{
-	const int32_t kiStride2	= kiStride<<1;
-	const int32_t kiStride3	= kiStride + kiStride2;
-	/*get pTop*/
-	uint8_t *ptop			= &pPred[-kiStride];
-	const uint8_t kuiT0		= *ptop;
-	const uint8_t kuiT1		= *(ptop+1);
-	const uint8_t kuiT2		= *(ptop+2);
-	const uint8_t kuiT3		= *(ptop+3);
-	const uint8_t kuiT4		= *(ptop+4);
-	const uint8_t kuiT5		= *(ptop+5);
-	const uint8_t kuiT6		= *(ptop+6);
-	const uint8_t kuiT7		= *(ptop+7);
-	const uint8_t kuiDDL0	= (2 + kuiT0 + kuiT2 + (kuiT1<<1))>>2;	// kDDL0
-	const uint8_t kuiDDL1	= (2 + kuiT1 + kuiT3 + (kuiT2<<1))>>2;	// kDDL1
-	const uint8_t kuiDDL2	= (2 + kuiT2 + kuiT4 + (kuiT3<<1))>>2;	// kDDL2
-	const uint8_t kuiDDL3	= (2 + kuiT3 + kuiT5 + (kuiT4<<1))>>2;	// kDDL3
-	const uint8_t kuiDDL4	= (2 + kuiT4 + kuiT6 + (kuiT5<<1))>>2;	// kDDL4
-	const uint8_t kuiDDL5	= (2 + kuiT5 + kuiT7 + (kuiT6<<1))>>2;	// kDDL5
-	const uint8_t kuiDDL6	= (2 + kuiT6 + kuiT7 + (kuiT7<<1))>>2;	// kDDL6
-	const uint8_t kuiList[8]= { kuiDDL0, kuiDDL1, kuiDDL2, kuiDDL3, kuiDDL4, kuiDDL5, kuiDDL6, 0 };
+void_t WelsI4x4LumaPredDDL_c (uint8_t* pPred, const int32_t kiStride) {
+  const int32_t kiStride2	= kiStride << 1;
+  const int32_t kiStride3	= kiStride + kiStride2;
+  /*get pTop*/
+  uint8_t* ptop			= &pPred[-kiStride];
+  const uint8_t kuiT0		= *ptop;
+  const uint8_t kuiT1		= * (ptop + 1);
+  const uint8_t kuiT2		= * (ptop + 2);
+  const uint8_t kuiT3		= * (ptop + 3);
+  const uint8_t kuiT4		= * (ptop + 4);
+  const uint8_t kuiT5		= * (ptop + 5);
+  const uint8_t kuiT6		= * (ptop + 6);
+  const uint8_t kuiT7		= * (ptop + 7);
+  const uint8_t kuiDDL0	= (2 + kuiT0 + kuiT2 + (kuiT1 << 1)) >> 2;	// kDDL0
+  const uint8_t kuiDDL1	= (2 + kuiT1 + kuiT3 + (kuiT2 << 1)) >> 2;	// kDDL1
+  const uint8_t kuiDDL2	= (2 + kuiT2 + kuiT4 + (kuiT3 << 1)) >> 2;	// kDDL2
+  const uint8_t kuiDDL3	= (2 + kuiT3 + kuiT5 + (kuiT4 << 1)) >> 2;	// kDDL3
+  const uint8_t kuiDDL4	= (2 + kuiT4 + kuiT6 + (kuiT5 << 1)) >> 2;	// kDDL4
+  const uint8_t kuiDDL5	= (2 + kuiT5 + kuiT7 + (kuiT6 << 1)) >> 2;	// kDDL5
+  const uint8_t kuiDDL6	= (2 + kuiT6 + kuiT7 + (kuiT7 << 1)) >> 2;	// kDDL6
+  const uint8_t kuiList[8] = { kuiDDL0, kuiDDL1, kuiDDL2, kuiDDL3, kuiDDL4, kuiDDL5, kuiDDL6, 0 };
 
-	ST32( pPred          , LD32(kuiList  ) );
-	ST32( pPred+kiStride , LD32(kuiList+1) );
-	ST32( pPred+kiStride2, LD32(kuiList+2) );
-	ST32( pPred+kiStride3, LD32(kuiList+3) );
+  ST32 (pPred          , LD32 (kuiList));
+  ST32 (pPred + kiStride , LD32 (kuiList + 1));
+  ST32 (pPred + kiStride2, LD32 (kuiList + 2));
+  ST32 (pPred + kiStride3, LD32 (kuiList + 3));
 }
 
 /*down pLeft*/
-void_t WelsI4x4LumaPredDDLTop_c(uint8_t *pPred, const int32_t kiStride)
-{
-	const int32_t kiStride2	= kiStride<<1;
-	const int32_t kiStride3	= kiStride + kiStride2;
-	/*get pTop*/
-	uint8_t *ptop			= &pPred[-kiStride];
-	const uint8_t kuiT0		= *ptop;
-	const uint8_t kuiT1		= *(ptop+1);
-	const uint8_t kuiT2		= *(ptop+2);
-	const uint8_t kuiT3		= *(ptop+3);
-	const uint16_t kuiT01	= 1 + kuiT0 + kuiT1;
-	const uint16_t kuiT12	= 1 + kuiT1 + kuiT2;
-	const uint16_t kuiT23	= 1 + kuiT2 + kuiT3;
-	const uint16_t kuiT33	= 1 + (kuiT3 << 1);
-	const uint8_t kuiDLT0	= (kuiT01 + kuiT12) >> 2;	// kDLT0
-	const uint8_t kuiDLT1	= (kuiT12 + kuiT23) >> 2;	// kDLT1
-	const uint8_t kuiDLT2	= (kuiT23 + kuiT33) >> 2;	// kDLT2
-	const uint8_t kuiDLT3	= kuiT33 >> 1;			// kDLT3
-	const uint8_t kuiList[8]= { kuiDLT0, kuiDLT1, kuiDLT2, kuiDLT3, kuiDLT3, kuiDLT3, kuiDLT3 ,kuiDLT3 };
+void_t WelsI4x4LumaPredDDLTop_c (uint8_t* pPred, const int32_t kiStride) {
+  const int32_t kiStride2	= kiStride << 1;
+  const int32_t kiStride3	= kiStride + kiStride2;
+  /*get pTop*/
+  uint8_t* ptop			= &pPred[-kiStride];
+  const uint8_t kuiT0		= *ptop;
+  const uint8_t kuiT1		= * (ptop + 1);
+  const uint8_t kuiT2		= * (ptop + 2);
+  const uint8_t kuiT3		= * (ptop + 3);
+  const uint16_t kuiT01	= 1 + kuiT0 + kuiT1;
+  const uint16_t kuiT12	= 1 + kuiT1 + kuiT2;
+  const uint16_t kuiT23	= 1 + kuiT2 + kuiT3;
+  const uint16_t kuiT33	= 1 + (kuiT3 << 1);
+  const uint8_t kuiDLT0	= (kuiT01 + kuiT12) >> 2;	// kDLT0
+  const uint8_t kuiDLT1	= (kuiT12 + kuiT23) >> 2;	// kDLT1
+  const uint8_t kuiDLT2	= (kuiT23 + kuiT33) >> 2;	// kDLT2
+  const uint8_t kuiDLT3	= kuiT33 >> 1;			// kDLT3
+  const uint8_t kuiList[8] = { kuiDLT0, kuiDLT1, kuiDLT2, kuiDLT3, kuiDLT3, kuiDLT3, kuiDLT3 , kuiDLT3 };
 
-	ST32( pPred,           LD32(kuiList  ) );
-	ST32( pPred+kiStride,  LD32(kuiList+1) );
-	ST32( pPred+kiStride2, LD32(kuiList+2) );
-	ST32( pPred+kiStride3, LD32(kuiList+3) );	
+  ST32 (pPred,           LD32 (kuiList));
+  ST32 (pPred + kiStride,  LD32 (kuiList + 1));
+  ST32 (pPred + kiStride2, LD32 (kuiList + 2));
+  ST32 (pPred + kiStride3, LD32 (kuiList + 3));
 }
 
 
 /*down right*/
-void_t WelsI4x4LumaPredDDR_c(uint8_t *pPred, const int32_t kiStride)
-{
-	const int32_t kiStride2	= kiStride<<1;
-	const int32_t kiStride3	= kiStride + kiStride2;
-	uint8_t *ptopleft		= &pPred[-(kiStride+1)];
-	uint8_t *pleft			= &pPred[-1];
-	const uint8_t kuiLT		= *ptopleft;
-	/*get pLeft and pTop*/
-	const uint8_t kuiL0		= *pleft;
-	const uint8_t kuiL1		= *(pleft+kiStride );
-	const uint8_t kuiL2		= *(pleft+kiStride2);
-	const uint8_t kuiL3		= *(pleft+kiStride3);
-	const uint8_t kuiT0		= *(ptopleft+1);
-	const uint8_t kuiT1		= *(ptopleft+2);
-	const uint8_t kuiT2		= *(ptopleft+3);
-	const uint8_t kuiT3		= *(ptopleft+4);
-	const uint16_t kuiTL0	= 1 + kuiLT + kuiL0;
-	const uint16_t kuiLT0	= 1 + kuiLT + kuiT0;
-	const uint16_t kuiT01	= 1 + kuiT0 + kuiT1;
-	const uint16_t kuiT12	= 1 + kuiT1 + kuiT2;
-	const uint16_t kuiT23	= 1 + kuiT2 + kuiT3;
-	const uint16_t kuiL01	= 1 + kuiL0 + kuiL1;
-	const uint16_t kuiL12	= 1 + kuiL1 + kuiL2;
-	const uint16_t kuiL23	= 1 + kuiL2 + kuiL3;
-	const uint8_t kuiDDR0	= (kuiTL0 + kuiLT0) >> 2;	// kuiDDR0
-	const uint8_t kuiDDR1	= (kuiLT0 + kuiT01) >> 2;	// kuiDDR1
-	const uint8_t kuiDDR2	= (kuiT01 + kuiT12) >> 2;	// kuiDDR2
-	const uint8_t kuiDDR3	= (kuiT12 + kuiT23) >> 2;	// kuiDDR3
-	const uint8_t kuiDDR4	= (kuiTL0 + kuiL01) >> 2;	// kuiDDR4
-	const uint8_t kuiDDR5	= (kuiL01 + kuiL12) >> 2;	// kuiDDR5
-	const uint8_t kuiDDR6	= (kuiL12 + kuiL23) >> 2;	// kuiDDR6
-	const uint8_t kuiList[8]= { kuiDDR6, kuiDDR5, kuiDDR4, kuiDDR0, kuiDDR1, kuiDDR2, kuiDDR3, 0	};
+void_t WelsI4x4LumaPredDDR_c (uint8_t* pPred, const int32_t kiStride) {
+  const int32_t kiStride2	= kiStride << 1;
+  const int32_t kiStride3	= kiStride + kiStride2;
+  uint8_t* ptopleft		= &pPred[- (kiStride + 1)];
+  uint8_t* pleft			= &pPred[-1];
+  const uint8_t kuiLT		= *ptopleft;
+  /*get pLeft and pTop*/
+  const uint8_t kuiL0		= *pleft;
+  const uint8_t kuiL1		= * (pleft + kiStride);
+  const uint8_t kuiL2		= * (pleft + kiStride2);
+  const uint8_t kuiL3		= * (pleft + kiStride3);
+  const uint8_t kuiT0		= * (ptopleft + 1);
+  const uint8_t kuiT1		= * (ptopleft + 2);
+  const uint8_t kuiT2		= * (ptopleft + 3);
+  const uint8_t kuiT3		= * (ptopleft + 4);
+  const uint16_t kuiTL0	= 1 + kuiLT + kuiL0;
+  const uint16_t kuiLT0	= 1 + kuiLT + kuiT0;
+  const uint16_t kuiT01	= 1 + kuiT0 + kuiT1;
+  const uint16_t kuiT12	= 1 + kuiT1 + kuiT2;
+  const uint16_t kuiT23	= 1 + kuiT2 + kuiT3;
+  const uint16_t kuiL01	= 1 + kuiL0 + kuiL1;
+  const uint16_t kuiL12	= 1 + kuiL1 + kuiL2;
+  const uint16_t kuiL23	= 1 + kuiL2 + kuiL3;
+  const uint8_t kuiDDR0	= (kuiTL0 + kuiLT0) >> 2;	// kuiDDR0
+  const uint8_t kuiDDR1	= (kuiLT0 + kuiT01) >> 2;	// kuiDDR1
+  const uint8_t kuiDDR2	= (kuiT01 + kuiT12) >> 2;	// kuiDDR2
+  const uint8_t kuiDDR3	= (kuiT12 + kuiT23) >> 2;	// kuiDDR3
+  const uint8_t kuiDDR4	= (kuiTL0 + kuiL01) >> 2;	// kuiDDR4
+  const uint8_t kuiDDR5	= (kuiL01 + kuiL12) >> 2;	// kuiDDR5
+  const uint8_t kuiDDR6	= (kuiL12 + kuiL23) >> 2;	// kuiDDR6
+  const uint8_t kuiList[8] = { kuiDDR6, kuiDDR5, kuiDDR4, kuiDDR0, kuiDDR1, kuiDDR2, kuiDDR3, 0	};
 
-	ST32( pPred          , LD32(kuiList+3) );
-	ST32( pPred+kiStride , LD32(kuiList+2) );
-	ST32( pPred+kiStride2, LD32(kuiList+1) );
-	ST32( pPred+kiStride3, LD32(kuiList  ) );
+  ST32 (pPred          , LD32 (kuiList + 3));
+  ST32 (pPred + kiStride , LD32 (kuiList + 2));
+  ST32 (pPred + kiStride2, LD32 (kuiList + 1));
+  ST32 (pPred + kiStride3, LD32 (kuiList));
 }
 
 
 /*vertical pLeft*/
-void_t WelsI4x4LumaPredVL_c(uint8_t *pPred, const int32_t kiStride)
-{
-	const int32_t kiStride2	= kiStride<<1;
-	const int32_t kiStride3	= kiStride + kiStride2;
-	uint8_t *ptopleft		= &pPred[-(kiStride+1)];
-	/*get pTop*/
-	const uint8_t kuiT0		    = *(ptopleft+1);
-	const uint8_t kuiT1		    = *(ptopleft+2);
-	const uint8_t kuiT2		    = *(ptopleft+3);
-	const uint8_t kuiT3		    = *(ptopleft+4);
-	const uint8_t kuiT4		    = *(ptopleft+5);
-	const uint8_t kuiT5		    = *(ptopleft+6);
-	const uint8_t kuiT6		    = *(ptopleft+7);
-	const uint16_t kuiT01		= 1 + kuiT0 + kuiT1;
-	const uint16_t kuiT12		= 1 + kuiT1 + kuiT2;
-	const uint16_t kuiT23		= 1 + kuiT2 + kuiT3;
-	const uint16_t kuiT34		= 1 + kuiT3 + kuiT4;
-	const uint16_t kuiT45		= 1 + kuiT4 + kuiT5;
-	const uint16_t kuiT56		= 1 + kuiT5 + kuiT6;
-	const uint8_t kuiVL0		= kuiT01 >> 1;			// kuiVL0
-	const uint8_t kuiVL1		= kuiT12 >> 1;			// kuiVL1
-	const uint8_t kuiVL2		= kuiT23 >> 1;			// kuiVL2
-	const uint8_t kuiVL3		= kuiT34 >> 1;			// kuiVL3
-	const uint8_t kuiVL4		= kuiT45 >> 1;			// kuiVL4
-	const uint8_t kuiVL5		= (kuiT01 + kuiT12) >> 2;	// kuiVL5
-	const uint8_t kuiVL6		= (kuiT12 + kuiT23) >> 2;	// kuiVL6
-	const uint8_t kuiVL7		= (kuiT23 + kuiT34) >> 2;	// kuiVL7
-	const uint8_t kuiVL8		= (kuiT34 + kuiT45) >> 2;	// kuiVL8
-	const uint8_t kuiVL9		= (kuiT45 + kuiT56) >> 2;	// kuiVL9
-	const uint8_t kuiList[10]	= { kuiVL0, kuiVL1, kuiVL2, kuiVL3, kuiVL4, kuiVL5, kuiVL6, kuiVL7, kuiVL8, kuiVL9 };
+void_t WelsI4x4LumaPredVL_c (uint8_t* pPred, const int32_t kiStride) {
+  const int32_t kiStride2	= kiStride << 1;
+  const int32_t kiStride3	= kiStride + kiStride2;
+  uint8_t* ptopleft		= &pPred[- (kiStride + 1)];
+  /*get pTop*/
+  const uint8_t kuiT0		    = * (ptopleft + 1);
+  const uint8_t kuiT1		    = * (ptopleft + 2);
+  const uint8_t kuiT2		    = * (ptopleft + 3);
+  const uint8_t kuiT3		    = * (ptopleft + 4);
+  const uint8_t kuiT4		    = * (ptopleft + 5);
+  const uint8_t kuiT5		    = * (ptopleft + 6);
+  const uint8_t kuiT6		    = * (ptopleft + 7);
+  const uint16_t kuiT01		= 1 + kuiT0 + kuiT1;
+  const uint16_t kuiT12		= 1 + kuiT1 + kuiT2;
+  const uint16_t kuiT23		= 1 + kuiT2 + kuiT3;
+  const uint16_t kuiT34		= 1 + kuiT3 + kuiT4;
+  const uint16_t kuiT45		= 1 + kuiT4 + kuiT5;
+  const uint16_t kuiT56		= 1 + kuiT5 + kuiT6;
+  const uint8_t kuiVL0		= kuiT01 >> 1;			// kuiVL0
+  const uint8_t kuiVL1		= kuiT12 >> 1;			// kuiVL1
+  const uint8_t kuiVL2		= kuiT23 >> 1;			// kuiVL2
+  const uint8_t kuiVL3		= kuiT34 >> 1;			// kuiVL3
+  const uint8_t kuiVL4		= kuiT45 >> 1;			// kuiVL4
+  const uint8_t kuiVL5		= (kuiT01 + kuiT12) >> 2;	// kuiVL5
+  const uint8_t kuiVL6		= (kuiT12 + kuiT23) >> 2;	// kuiVL6
+  const uint8_t kuiVL7		= (kuiT23 + kuiT34) >> 2;	// kuiVL7
+  const uint8_t kuiVL8		= (kuiT34 + kuiT45) >> 2;	// kuiVL8
+  const uint8_t kuiVL9		= (kuiT45 + kuiT56) >> 2;	// kuiVL9
+  const uint8_t kuiList[10]	= { kuiVL0, kuiVL1, kuiVL2, kuiVL3, kuiVL4, kuiVL5, kuiVL6, kuiVL7, kuiVL8, kuiVL9 };
 
-	ST32( pPred,           LD32(kuiList  ) );
-	ST32( pPred+kiStride,  LD32(kuiList+5) );
-	ST32( pPred+kiStride2, LD32(kuiList+1) );
-	ST32( pPred+kiStride3, LD32(kuiList+6) );	
+  ST32 (pPred,           LD32 (kuiList));
+  ST32 (pPred + kiStride,  LD32 (kuiList + 5));
+  ST32 (pPred + kiStride2, LD32 (kuiList + 1));
+  ST32 (pPred + kiStride3, LD32 (kuiList + 6));
 }
 
 /*vertical pLeft*/
-void_t WelsI4x4LumaPredVLTop_c(uint8_t *pPred, const int32_t kiStride)
-{
-	const int32_t kiStride2	    = kiStride<<1;
-	const int32_t kiStride3	    = kiStride + kiStride2;
-	uint8_t *ptopleft		    = &pPred[-(kiStride+1)];
-	/*get pTop*/
-	const uint8_t kuiT0		    = *(ptopleft+1);
-	const uint8_t kuiT1		    = *(ptopleft+2);
-	const uint8_t kuiT2		    = *(ptopleft+3);
-	const uint8_t kuiT3		    = *(ptopleft+4);
-	const uint16_t kuiT01		= 1 + kuiT0 + kuiT1;
-	const uint16_t kuiT12		= 1 + kuiT1 + kuiT2;
-	const uint16_t kuiT23		= 1 + kuiT2 + kuiT3;
-	const uint16_t kuiT33		= 1 + (kuiT3 << 1);
-	const uint8_t kuiVL0		= kuiT01 >> 1;
-	const uint8_t kuiVL1		= kuiT12 >> 1;
-	const uint8_t kuiVL2		= kuiT23 >> 1;
-	const uint8_t kuiVL3		= kuiT33 >> 1;
-	const uint8_t kuiVL4		= (kuiT01 + kuiT12) >> 2;
-	const uint8_t kuiVL5		= (kuiT12 + kuiT23) >> 2;
-	const uint8_t kuiVL6		= (kuiT23 + kuiT33) >> 2;
-	const uint8_t kuiVL7		= kuiVL3;
-	const uint8_t kuiList[10]	= { kuiVL0, kuiVL1, kuiVL2, kuiVL3, kuiVL3, kuiVL4, kuiVL5, kuiVL6, kuiVL7, kuiVL7 };
+void_t WelsI4x4LumaPredVLTop_c (uint8_t* pPred, const int32_t kiStride) {
+  const int32_t kiStride2	    = kiStride << 1;
+  const int32_t kiStride3	    = kiStride + kiStride2;
+  uint8_t* ptopleft		    = &pPred[- (kiStride + 1)];
+  /*get pTop*/
+  const uint8_t kuiT0		    = * (ptopleft + 1);
+  const uint8_t kuiT1		    = * (ptopleft + 2);
+  const uint8_t kuiT2		    = * (ptopleft + 3);
+  const uint8_t kuiT3		    = * (ptopleft + 4);
+  const uint16_t kuiT01		= 1 + kuiT0 + kuiT1;
+  const uint16_t kuiT12		= 1 + kuiT1 + kuiT2;
+  const uint16_t kuiT23		= 1 + kuiT2 + kuiT3;
+  const uint16_t kuiT33		= 1 + (kuiT3 << 1);
+  const uint8_t kuiVL0		= kuiT01 >> 1;
+  const uint8_t kuiVL1		= kuiT12 >> 1;
+  const uint8_t kuiVL2		= kuiT23 >> 1;
+  const uint8_t kuiVL3		= kuiT33 >> 1;
+  const uint8_t kuiVL4		= (kuiT01 + kuiT12) >> 2;
+  const uint8_t kuiVL5		= (kuiT12 + kuiT23) >> 2;
+  const uint8_t kuiVL6		= (kuiT23 + kuiT33) >> 2;
+  const uint8_t kuiVL7		= kuiVL3;
+  const uint8_t kuiList[10]	= { kuiVL0, kuiVL1, kuiVL2, kuiVL3, kuiVL3, kuiVL4, kuiVL5, kuiVL6, kuiVL7, kuiVL7 };
 
-	ST32( pPred          , LD32(kuiList  ) );
-	ST32( pPred+kiStride , LD32(kuiList+5) );
-	ST32( pPred+kiStride2, LD32(kuiList+1) );
-	ST32( pPred+kiStride3, LD32(kuiList+6) );	
+  ST32 (pPred          , LD32 (kuiList));
+  ST32 (pPred + kiStride , LD32 (kuiList + 5));
+  ST32 (pPred + kiStride2, LD32 (kuiList + 1));
+  ST32 (pPred + kiStride3, LD32 (kuiList + 6));
 }
 
 
 /*vertical right*/
-void_t WelsI4x4LumaPredVR_c(uint8_t *pPred, const int32_t kiStride)
-{
-	const int32_t kiStride2	    = kiStride<<1;
-	const int32_t kiStride3	    = kiStride + kiStride2;
-	const uint8_t kuiLT		    = pPred[-kiStride-1];
-	/*get pLeft and pTop*/
-	const uint8_t kuiL0		    = pPred[         -1];
-	const uint8_t kuiL1		    = pPred[kiStride -1];
-	const uint8_t kuiL2		    = pPred[kiStride2-1];
-	const uint8_t kuiT0		    = pPred[ -kiStride];
-	const uint8_t kuiT1		    = pPred[1-kiStride];
-	const uint8_t kuiT2		    = pPred[2-kiStride];
-	const uint8_t kuiT3		    = pPred[3-kiStride];
-	const uint8_t kuiVR0		= (1 + kuiLT + kuiT0)>>1;	// kuiVR0
-	const uint8_t kuiVR1		= (1 + kuiT0 + kuiT1)>>1;	// kuiVR1
-	const uint8_t kuiVR2		= (1 + kuiT1 + kuiT2)>>1;	// kuiVR2
-	const uint8_t kuiVR3		= (1 + kuiT2 + kuiT3)>>1;	// kuiVR3
-	const uint8_t kuiVR4		= (2 + kuiL0 + (kuiLT<<1) + kuiT0)>>2;	// kuiVR4
-	const uint8_t kuiVR5		= (2 + kuiLT + (kuiT0<<1) + kuiT1)>>2;	// kuiVR5
-	const uint8_t kuiVR6		= (2 + kuiT0 + (kuiT1<<1) + kuiT2)>>2;	// kuiVR6
-	const uint8_t kuiVR7		= (2 + kuiT1 + (kuiT2<<1) + kuiT3)>>2;	// kuiVR7
-	const uint8_t kuiVR8		= (2 + kuiLT + (kuiL0<<1) + kuiL1)>>2;	// kuiVR8
-	const uint8_t kuiVR9		= (2 + kuiL0 + (kuiL1<<1) + kuiL2)>>2;	// kuiVR9
-	const uint8_t kuiList[10]	= { kuiVR8, kuiVR0, kuiVR1, kuiVR2, kuiVR3, kuiVR9, kuiVR4, kuiVR5, kuiVR6, kuiVR7 };
+void_t WelsI4x4LumaPredVR_c (uint8_t* pPred, const int32_t kiStride) {
+  const int32_t kiStride2	    = kiStride << 1;
+  const int32_t kiStride3	    = kiStride + kiStride2;
+  const uint8_t kuiLT		    = pPred[-kiStride - 1];
+  /*get pLeft and pTop*/
+  const uint8_t kuiL0		    = pPred[         -1];
+  const uint8_t kuiL1		    = pPred[kiStride - 1];
+  const uint8_t kuiL2		    = pPred[kiStride2 - 1];
+  const uint8_t kuiT0		    = pPred[ -kiStride];
+  const uint8_t kuiT1		    = pPred[1 - kiStride];
+  const uint8_t kuiT2		    = pPred[2 - kiStride];
+  const uint8_t kuiT3		    = pPred[3 - kiStride];
+  const uint8_t kuiVR0		= (1 + kuiLT + kuiT0) >> 1;	// kuiVR0
+  const uint8_t kuiVR1		= (1 + kuiT0 + kuiT1) >> 1;	// kuiVR1
+  const uint8_t kuiVR2		= (1 + kuiT1 + kuiT2) >> 1;	// kuiVR2
+  const uint8_t kuiVR3		= (1 + kuiT2 + kuiT3) >> 1;	// kuiVR3
+  const uint8_t kuiVR4		= (2 + kuiL0 + (kuiLT << 1) + kuiT0) >> 2;	// kuiVR4
+  const uint8_t kuiVR5		= (2 + kuiLT + (kuiT0 << 1) + kuiT1) >> 2;	// kuiVR5
+  const uint8_t kuiVR6		= (2 + kuiT0 + (kuiT1 << 1) + kuiT2) >> 2;	// kuiVR6
+  const uint8_t kuiVR7		= (2 + kuiT1 + (kuiT2 << 1) + kuiT3) >> 2;	// kuiVR7
+  const uint8_t kuiVR8		= (2 + kuiLT + (kuiL0 << 1) + kuiL1) >> 2;	// kuiVR8
+  const uint8_t kuiVR9		= (2 + kuiL0 + (kuiL1 << 1) + kuiL2) >> 2;	// kuiVR9
+  const uint8_t kuiList[10]	= { kuiVR8, kuiVR0, kuiVR1, kuiVR2, kuiVR3, kuiVR9, kuiVR4, kuiVR5, kuiVR6, kuiVR7 };
 
-	ST32( pPred          , LD32(kuiList+1) );
-	ST32( pPred+kiStride , LD32(kuiList+6) );
-	ST32( pPred+kiStride2, LD32(kuiList  ) );
-	ST32( pPred+kiStride3, LD32(kuiList+5) );	
+  ST32 (pPred          , LD32 (kuiList + 1));
+  ST32 (pPred + kiStride , LD32 (kuiList + 6));
+  ST32 (pPred + kiStride2, LD32 (kuiList));
+  ST32 (pPred + kiStride3, LD32 (kuiList + 5));
 }
 
 /*horizontal up*/
-void_t WelsI4x4LumaPredHU_c(uint8_t *pPred, const int32_t kiStride)
-{
-	const int32_t kiStride2	    = kiStride<<1;
-	const int32_t kiStride3	    = kiStride + kiStride2;
-	/*get pLeft*/
-	const uint8_t kuiL0		    = pPred[         -1];
-	const uint8_t kuiL1		    = pPred[kiStride -1];
-	const uint8_t kuiL2		    = pPred[kiStride2-1];
-	const uint8_t kuiL3		    = pPred[kiStride3-1];
-	const uint16_t kuiL01		= 1 + kuiL0 + kuiL1;
-	const uint16_t kuiL12		= 1 + kuiL1 + kuiL2;
-	const uint16_t kuiL23		= 1 + kuiL2 + kuiL3;
-	const uint8_t kuiHU0		= kuiL01 >> 1;
-	const uint8_t kuiHU1		= (kuiL01 + kuiL12) >> 2;
-	const uint8_t kuiHU2		= kuiL12 >> 1;
-	const uint8_t kuiHU3		= (kuiL12 + kuiL23) >> 2;
-	const uint8_t kuiHU4		= kuiL23 >> 1;
-	const uint8_t kuiHU5		= (1 + kuiL23 + (kuiL3<<1)) >> 2;
-	const uint8_t kuiList[10]	= { kuiHU0, kuiHU1, kuiHU2, kuiHU3, kuiHU4, kuiHU5, kuiL3, kuiL3, kuiL3, kuiL3 };
+void_t WelsI4x4LumaPredHU_c (uint8_t* pPred, const int32_t kiStride) {
+  const int32_t kiStride2	    = kiStride << 1;
+  const int32_t kiStride3	    = kiStride + kiStride2;
+  /*get pLeft*/
+  const uint8_t kuiL0		    = pPred[         -1];
+  const uint8_t kuiL1		    = pPred[kiStride - 1];
+  const uint8_t kuiL2		    = pPred[kiStride2 - 1];
+  const uint8_t kuiL3		    = pPred[kiStride3 - 1];
+  const uint16_t kuiL01		= 1 + kuiL0 + kuiL1;
+  const uint16_t kuiL12		= 1 + kuiL1 + kuiL2;
+  const uint16_t kuiL23		= 1 + kuiL2 + kuiL3;
+  const uint8_t kuiHU0		= kuiL01 >> 1;
+  const uint8_t kuiHU1		= (kuiL01 + kuiL12) >> 2;
+  const uint8_t kuiHU2		= kuiL12 >> 1;
+  const uint8_t kuiHU3		= (kuiL12 + kuiL23) >> 2;
+  const uint8_t kuiHU4		= kuiL23 >> 1;
+  const uint8_t kuiHU5		= (1 + kuiL23 + (kuiL3 << 1)) >> 2;
+  const uint8_t kuiList[10]	= { kuiHU0, kuiHU1, kuiHU2, kuiHU3, kuiHU4, kuiHU5, kuiL3, kuiL3, kuiL3, kuiL3 };
 
-	ST32( pPred          , LD32(kuiList  ) );
-	ST32( pPred+kiStride , LD32(kuiList+2) );
-	ST32( pPred+kiStride2, LD32(kuiList+4) );
-	ST32( pPred+kiStride3, LD32(kuiList+6) );	
+  ST32 (pPred          , LD32 (kuiList));
+  ST32 (pPred + kiStride , LD32 (kuiList + 2));
+  ST32 (pPred + kiStride2, LD32 (kuiList + 4));
+  ST32 (pPred + kiStride3, LD32 (kuiList + 6));
 }
 
 /*horizontal down*/
-void_t WelsI4x4LumaPredHD_c(uint8_t *pPred, const int32_t kiStride)
-{
-	const int32_t kiStride2 	= kiStride<<1;
-	const int32_t kiStride3	    = kiStride + kiStride2;
-	const uint8_t kuiLT		    = pPred[-(kiStride+1)];
-	/*get pLeft and pTop*/
-	const uint8_t kuiL0		    = pPred[-1          ];
-	const uint8_t kuiL1		    = pPred[-1+kiStride ];
-	const uint8_t kuiL2		    = pPred[-1+kiStride2];
-	const uint8_t kuiL3		    = pPred[-1+kiStride3];
-	const uint8_t kuiT0		    = pPred[-kiStride   ];
-	const uint8_t kuiT1		    = pPred[-kiStride+1 ];
-	const uint8_t kuiT2		    = pPred[-kiStride+2 ];
-	const uint16_t kuiTL0		= 1 + kuiLT + kuiL0;
-	const uint16_t kuiLT0		= 1 + kuiLT + kuiT0;
-	const uint16_t kuiT01		= 1 + kuiT0 + kuiT1;
-	const uint16_t kuiT12		= 1 + kuiT1 + kuiT2;
-	const uint16_t kuiL01		= 1 + kuiL0 + kuiL1;
-	const uint16_t kuiL12		= 1 + kuiL1 + kuiL2;
-	const uint16_t kuiL23		= 1 + kuiL2 + kuiL3;
-	const uint8_t kuiHD0		= kuiTL0 >> 1;
-	const uint8_t kuiHD1		= (kuiTL0 + kuiLT0) >> 2;
-	const uint8_t kuiHD2		= (kuiLT0 + kuiT01) >> 2;
-	const uint8_t kuiHD3		= (kuiT01 + kuiT12) >> 2;
-	const uint8_t kuiHD4		= kuiL01 >> 1;
-	const uint8_t kuiHD5		= (kuiTL0 + kuiL01) >> 2;
-	const uint8_t kuiHD6		= kuiL12 >> 1;
-	const uint8_t kuiHD7		= (kuiL01 + kuiL12) >> 2;
-	const uint8_t kuiHD8		= kuiL23 >> 1;
-	const uint8_t kuiHD9	    = (kuiL12 + kuiL23) >> 2;
-	const uint8_t kuiList[10]	= { kuiHD8, kuiHD9, kuiHD6, kuiHD7, kuiHD4, kuiHD5, kuiHD0, kuiHD1, kuiHD2, kuiHD3 };
+void_t WelsI4x4LumaPredHD_c (uint8_t* pPred, const int32_t kiStride) {
+  const int32_t kiStride2 	= kiStride << 1;
+  const int32_t kiStride3	    = kiStride + kiStride2;
+  const uint8_t kuiLT		    = pPred[- (kiStride + 1)];
+  /*get pLeft and pTop*/
+  const uint8_t kuiL0		    = pPred[-1          ];
+  const uint8_t kuiL1		    = pPred[-1 + kiStride ];
+  const uint8_t kuiL2		    = pPred[-1 + kiStride2];
+  const uint8_t kuiL3		    = pPred[-1 + kiStride3];
+  const uint8_t kuiT0		    = pPred[-kiStride   ];
+  const uint8_t kuiT1		    = pPred[-kiStride + 1 ];
+  const uint8_t kuiT2		    = pPred[-kiStride + 2 ];
+  const uint16_t kuiTL0		= 1 + kuiLT + kuiL0;
+  const uint16_t kuiLT0		= 1 + kuiLT + kuiT0;
+  const uint16_t kuiT01		= 1 + kuiT0 + kuiT1;
+  const uint16_t kuiT12		= 1 + kuiT1 + kuiT2;
+  const uint16_t kuiL01		= 1 + kuiL0 + kuiL1;
+  const uint16_t kuiL12		= 1 + kuiL1 + kuiL2;
+  const uint16_t kuiL23		= 1 + kuiL2 + kuiL3;
+  const uint8_t kuiHD0		= kuiTL0 >> 1;
+  const uint8_t kuiHD1		= (kuiTL0 + kuiLT0) >> 2;
+  const uint8_t kuiHD2		= (kuiLT0 + kuiT01) >> 2;
+  const uint8_t kuiHD3		= (kuiT01 + kuiT12) >> 2;
+  const uint8_t kuiHD4		= kuiL01 >> 1;
+  const uint8_t kuiHD5		= (kuiTL0 + kuiL01) >> 2;
+  const uint8_t kuiHD6		= kuiL12 >> 1;
+  const uint8_t kuiHD7		= (kuiL01 + kuiL12) >> 2;
+  const uint8_t kuiHD8		= kuiL23 >> 1;
+  const uint8_t kuiHD9	    = (kuiL12 + kuiL23) >> 2;
+  const uint8_t kuiList[10]	= { kuiHD8, kuiHD9, kuiHD6, kuiHD7, kuiHD4, kuiHD5, kuiHD0, kuiHD1, kuiHD2, kuiHD3 };
 
-	ST32( pPred          , LD32(kuiList+6) );
-	ST32( pPred+kiStride , LD32(kuiList+4) );
-	ST32( pPred+kiStride2, LD32(kuiList+2) );
-	ST32( pPred+kiStride3, LD32(kuiList  ) );	
+  ST32 (pPred          , LD32 (kuiList + 6));
+  ST32 (pPred + kiStride , LD32 (kuiList + 4));
+  ST32 (pPred + kiStride2, LD32 (kuiList + 2));
+  ST32 (pPred + kiStride3, LD32 (kuiList));
 }
 
-void_t WelsIChromaPredV_c(uint8_t *pPred, const int32_t kiStride)
-{
-	const uint64_t kuiVal64	= LD64(&pPred[-kiStride]);
-	const int32_t kiStride2	= kiStride  << 1;
-	const int32_t kiStride4 = kiStride2 << 1;
+void_t WelsIChromaPredV_c (uint8_t* pPred, const int32_t kiStride) {
+  const uint64_t kuiVal64	= LD64 (&pPred[-kiStride]);
+  const int32_t kiStride2	= kiStride  << 1;
+  const int32_t kiStride4 = kiStride2 << 1;
 
-	ST64( pPred                        , kuiVal64 );
-	ST64( pPred+kiStride               , kuiVal64 );
-	ST64( pPred+kiStride2              , kuiVal64 );
-	ST64( pPred+kiStride2+kiStride     , kuiVal64 );
-	ST64( pPred+kiStride4              , kuiVal64 );
-	ST64( pPred+kiStride4+kiStride     , kuiVal64 );
-	ST64( pPred+kiStride4+kiStride2    , kuiVal64 );
-	ST64( pPred+(kiStride<<3)-kiStride , kuiVal64 );
+  ST64 (pPred                        , kuiVal64);
+  ST64 (pPred + kiStride               , kuiVal64);
+  ST64 (pPred + kiStride2              , kuiVal64);
+  ST64 (pPred + kiStride2 + kiStride     , kuiVal64);
+  ST64 (pPred + kiStride4              , kuiVal64);
+  ST64 (pPred + kiStride4 + kiStride     , kuiVal64);
+  ST64 (pPred + kiStride4 + kiStride2    , kuiVal64);
+  ST64 (pPred + (kiStride << 3) - kiStride , kuiVal64);
 }
 
-void_t WelsIChromaPredH_c(uint8_t *pPred, const int32_t kiStride)
-{
-	int32_t iTmp = (kiStride<<3)-kiStride;
-	uint8_t i = 7;
-	
-	do
-	{
-		const uint8_t kuiVal8	= pPred[iTmp-1];
-		const uint64_t kuiVal64	= 0x0101010101010101ULL * kuiVal8;
+void_t WelsIChromaPredH_c (uint8_t* pPred, const int32_t kiStride) {
+  int32_t iTmp = (kiStride << 3) - kiStride;
+  uint8_t i = 7;
 
-		ST64( pPred+iTmp, kuiVal64 );
+  do {
+    const uint8_t kuiVal8	= pPred[iTmp - 1];
+    const uint64_t kuiVal64	= 0x0101010101010101ULL * kuiVal8;
 
-		iTmp -= kiStride;
-	}while(i-->0);
+    ST64 (pPred + iTmp, kuiVal64);
+
+    iTmp -= kiStride;
+  } while (i-- > 0);
 }
 
 
-void_t WelsIChromaPredPlane_c(uint8_t *pPred, const int32_t kiStride)
-{
-	int32_t a=0, b=0, c=0, H=0, V=0;
-	int32_t i, j;
-	uint8_t *pTop = &pPred[-kiStride];
-	uint8_t *pLeft = &pPred[-1];
+void_t WelsIChromaPredPlane_c (uint8_t* pPred, const int32_t kiStride) {
+  int32_t a = 0, b = 0, c = 0, H = 0, V = 0;
+  int32_t i, j;
+  uint8_t* pTop = &pPred[-kiStride];
+  uint8_t* pLeft = &pPred[-1];
 
-	for(i = 0 ; i < 4 ; i ++)
-	{
-		H += (i + 1) * (pTop[4 + i] - pTop[2 - i]);
-		V += (i + 1) * (pLeft[(4 + i)*kiStride] - pLeft[(2 - i)*kiStride]);
-	}
+  for (i = 0 ; i < 4 ; i ++) {
+    H += (i + 1) * (pTop[4 + i] - pTop[2 - i]);
+    V += (i + 1) * (pLeft[ (4 + i) * kiStride] - pLeft[ (2 - i) * kiStride]);
+  }
 
-	a = (pLeft[7*kiStride] + pTop[7]) << 4;
-	b = (17 * H + 16) >> 5;
-	c = (17 * V + 16) >> 5;
+  a = (pLeft[7 * kiStride] + pTop[7]) << 4;
+  b = (17 * H + 16) >> 5;
+  c = (17 * V + 16) >> 5;
 
-	for(i = 0 ; i < 8 ; i ++)
-	{
-		for(j = 0 ; j < 8 ; j ++)
-		{
-			int32_t iTmp = (a + b * (j - 3) + c * (i - 3) + 16) >> 5;
-			iTmp = WELS_CLIP1(iTmp);
-			pPred[j] = iTmp;
-		}
-		pPred += kiStride;
-	}
+  for (i = 0 ; i < 8 ; i ++) {
+    for (j = 0 ; j < 8 ; j ++) {
+      int32_t iTmp = (a + b * (j - 3) + c * (i - 3) + 16) >> 5;
+      iTmp = WELS_CLIP1 (iTmp);
+      pPred[j] = iTmp;
+    }
+    pPred += kiStride;
+  }
 }
 
 
-void_t WelsIChromaPredDc_c(uint8_t *pPred, const int32_t kiStride)
-{
-	const int32_t kiL1		= kiStride-1;
-	const int32_t kiL2		= kiL1 + kiStride;
-	const int32_t kiL3		= kiL2 + kiStride;
-	const int32_t kiL4		= kiL3 + kiStride;
-	const int32_t kiL5		= kiL4 + kiStride;
-	const int32_t kiL6		= kiL5 + kiStride;
-	const int32_t kiL7		= kiL6 + kiStride;	
-	/*caculate the kMean value*/
-	const uint8_t kuiM1		= ( pPred[-kiStride] + pPred[1-kiStride] + pPred[2-kiStride] + pPred[3-kiStride] +
-								pPred[-1] + pPred[kiL1] + pPred[kiL2] + pPred[kiL3] + 4) >> 3 ;
-	const uint32_t kuiSum2	= pPred[4-kiStride] + pPred[5-kiStride] + pPred[6-kiStride] + pPred[7-kiStride];
-	const uint32_t kuiSum3	= pPred[kiL4] + pPred[kiL5] + pPred[kiL6] + pPred[kiL7];
-	const uint8_t kuiM2		= (kuiSum2 + 2) >> 2;
-	const uint8_t kuiM3		= (kuiSum3 + 2) >> 2;
-	const uint8_t kuiM4		= (kuiSum2 + kuiSum3 + 4) >> 3;
-	const uint8_t kuiMUP[8]	= {kuiM1, kuiM1, kuiM1, kuiM1, kuiM2, kuiM2, kuiM2, kuiM2};
-	const uint8_t kuiMDown[8]	= {kuiM3, kuiM3, kuiM3, kuiM3, kuiM4, kuiM4, kuiM4, kuiM4};
-	const uint64_t kuiUP64		= LD64(kuiMUP);
-	const uint64_t kuiDN64		= LD64(kuiMDown);
+void_t WelsIChromaPredDc_c (uint8_t* pPred, const int32_t kiStride) {
+  const int32_t kiL1		= kiStride - 1;
+  const int32_t kiL2		= kiL1 + kiStride;
+  const int32_t kiL3		= kiL2 + kiStride;
+  const int32_t kiL4		= kiL3 + kiStride;
+  const int32_t kiL5		= kiL4 + kiStride;
+  const int32_t kiL6		= kiL5 + kiStride;
+  const int32_t kiL7		= kiL6 + kiStride;
+  /*caculate the kMean value*/
+  const uint8_t kuiM1		= (pPred[-kiStride] + pPred[1 - kiStride] + pPred[2 - kiStride] + pPred[3 - kiStride] +
+                           pPred[-1] + pPred[kiL1] + pPred[kiL2] + pPred[kiL3] + 4) >> 3 ;
+  const uint32_t kuiSum2	= pPred[4 - kiStride] + pPred[5 - kiStride] + pPred[6 - kiStride] + pPred[7 - kiStride];
+  const uint32_t kuiSum3	= pPred[kiL4] + pPred[kiL5] + pPred[kiL6] + pPred[kiL7];
+  const uint8_t kuiM2		= (kuiSum2 + 2) >> 2;
+  const uint8_t kuiM3		= (kuiSum3 + 2) >> 2;
+  const uint8_t kuiM4		= (kuiSum2 + kuiSum3 + 4) >> 3;
+  const uint8_t kuiMUP[8]	= {kuiM1, kuiM1, kuiM1, kuiM1, kuiM2, kuiM2, kuiM2, kuiM2};
+  const uint8_t kuiMDown[8]	= {kuiM3, kuiM3, kuiM3, kuiM3, kuiM4, kuiM4, kuiM4, kuiM4};
+  const uint64_t kuiUP64		= LD64 (kuiMUP);
+  const uint64_t kuiDN64		= LD64 (kuiMDown);
 
-	ST64( pPred       , kuiUP64 );
-	ST64( pPred+kiL1+1, kuiUP64 );
-	ST64( pPred+kiL2+1, kuiUP64 );
-	ST64( pPred+kiL3+1, kuiUP64 );
-	ST64( pPred+kiL4+1, kuiDN64 );
-	ST64( pPred+kiL5+1, kuiDN64 );
-	ST64( pPred+kiL6+1, kuiDN64 );
-	ST64( pPred+kiL7+1, kuiDN64 );
+  ST64 (pPred       , kuiUP64);
+  ST64 (pPred + kiL1 + 1, kuiUP64);
+  ST64 (pPred + kiL2 + 1, kuiUP64);
+  ST64 (pPred + kiL3 + 1, kuiUP64);
+  ST64 (pPred + kiL4 + 1, kuiDN64);
+  ST64 (pPred + kiL5 + 1, kuiDN64);
+  ST64 (pPred + kiL6 + 1, kuiDN64);
+  ST64 (pPred + kiL7 + 1, kuiDN64);
 }
 
-void_t WelsIChromaPredDcLeft_c(uint8_t *pPred, const int32_t kiStride)
-{
-	const int32_t kiL1	=   -1 + kiStride;
-	const int32_t kiL2	= kiL1 + kiStride;
-	const int32_t kiL3	= kiL2 + kiStride;
-	const int32_t kiL4	= kiL3 + kiStride;
-	const int32_t kiL5	= kiL4 + kiStride;
-	const int32_t kiL6	= kiL5 + kiStride;
-	const int32_t kiL7	= kiL6 + kiStride;	
-	/*caculate the kMean value*/
-	const uint8_t kuiMUP   = (pPred[-1] + pPred[kiL1] + pPred[kiL2] + pPred[kiL3] + 2)>>2 ;
-	const uint8_t kuiMDown = (pPred[kiL4] + pPred[kiL5] + pPred[kiL6] + pPred[kiL7] + 2)>>2;
-	const uint64_t kuiUP64 = 0x0101010101010101ULL * kuiMUP;
-	const uint64_t kuiDN64 = 0x0101010101010101ULL * kuiMDown;
+void_t WelsIChromaPredDcLeft_c (uint8_t* pPred, const int32_t kiStride) {
+  const int32_t kiL1	=   -1 + kiStride;
+  const int32_t kiL2	= kiL1 + kiStride;
+  const int32_t kiL3	= kiL2 + kiStride;
+  const int32_t kiL4	= kiL3 + kiStride;
+  const int32_t kiL5	= kiL4 + kiStride;
+  const int32_t kiL6	= kiL5 + kiStride;
+  const int32_t kiL7	= kiL6 + kiStride;
+  /*caculate the kMean value*/
+  const uint8_t kuiMUP   = (pPred[-1] + pPred[kiL1] + pPred[kiL2] + pPred[kiL3] + 2) >> 2 ;
+  const uint8_t kuiMDown = (pPred[kiL4] + pPred[kiL5] + pPred[kiL6] + pPred[kiL7] + 2) >> 2;
+  const uint64_t kuiUP64 = 0x0101010101010101ULL * kuiMUP;
+  const uint64_t kuiDN64 = 0x0101010101010101ULL * kuiMDown;
 
-	ST64( pPred       , kuiUP64 );
-	ST64( pPred+kiL1+1, kuiUP64 );
-	ST64( pPred+kiL2+1, kuiUP64 );
-	ST64( pPred+kiL3+1, kuiUP64 );
-	ST64( pPred+kiL4+1, kuiDN64 );
-	ST64( pPred+kiL5+1, kuiDN64 );
-	ST64( pPred+kiL6+1, kuiDN64 );
-	ST64( pPred+kiL7+1, kuiDN64 );
+  ST64 (pPred       , kuiUP64);
+  ST64 (pPred + kiL1 + 1, kuiUP64);
+  ST64 (pPred + kiL2 + 1, kuiUP64);
+  ST64 (pPred + kiL3 + 1, kuiUP64);
+  ST64 (pPred + kiL4 + 1, kuiDN64);
+  ST64 (pPred + kiL5 + 1, kuiDN64);
+  ST64 (pPred + kiL6 + 1, kuiDN64);
+  ST64 (pPred + kiL7 + 1, kuiDN64);
 }
 
-void_t WelsIChromaPredDcTop_c(uint8_t *pPred, const int32_t kiStride)
-{
-	int32_t iTmp			= (kiStride<<3)-kiStride;
-	/*caculate the kMean value*/
-	const uint8_t kuiM1	    = (pPred[-kiStride] + pPred[1-kiStride] + pPred[2-kiStride] + pPred[3-kiStride]+2)>>2;
-	const uint8_t kuiM2	    = (pPred[4-kiStride] + pPred[5-kiStride] + pPred[6-kiStride] + pPred[7-kiStride] + 2)>>2;
-	const uint8_t kuiM[8]	= {kuiM1, kuiM1, kuiM1, kuiM1, kuiM2, kuiM2, kuiM2, kuiM2};
+void_t WelsIChromaPredDcTop_c (uint8_t* pPred, const int32_t kiStride) {
+  int32_t iTmp			= (kiStride << 3) - kiStride;
+  /*caculate the kMean value*/
+  const uint8_t kuiM1	    = (pPred[-kiStride] + pPred[1 - kiStride] + pPred[2 - kiStride] + pPred[3 - kiStride] + 2) >> 2;
+  const uint8_t kuiM2	    = (pPred[4 - kiStride] + pPred[5 - kiStride] + pPred[6 - kiStride] + pPred[7 - kiStride] + 2) >>
+                            2;
+  const uint8_t kuiM[8]	= {kuiM1, kuiM1, kuiM1, kuiM1, kuiM2, kuiM2, kuiM2, kuiM2};
 
-	uint8_t i = 7;
-	
-	do
-	{
-		ST64( pPred+iTmp, LD64(kuiM) );
+  uint8_t i = 7;
 
-		iTmp -= kiStride;
-	}while(i-->0);
+  do {
+    ST64 (pPred + iTmp, LD64 (kuiM));
+
+    iTmp -= kiStride;
+  } while (i-- > 0);
 }
 
-void_t WelsIChromaPredDcNA_c(uint8_t *pPred, const int32_t kiStride)
-{
-	int32_t iTmp = (kiStride<<3)-kiStride;
-	const uint64_t kuiDC64 = 0x8080808080808080ULL;
-	uint8_t i = 7;
-	
-	do
-	{
-		ST64( pPred+iTmp, kuiDC64 );
+void_t WelsIChromaPredDcNA_c (uint8_t* pPred, const int32_t kiStride) {
+  int32_t iTmp = (kiStride << 3) - kiStride;
+  const uint64_t kuiDC64 = 0x8080808080808080ULL;
+  uint8_t i = 7;
 
-		iTmp -= kiStride;
-	}while(i-->0);
+  do {
+    ST64 (pPred + iTmp, kuiDC64);
+
+    iTmp -= kiStride;
+  } while (i-- > 0);
 }
 
-void_t WelsI16x16LumaPredV_c(uint8_t *pPred, const int32_t kiStride)
-{
-	int32_t iTmp			= (kiStride<<4)-kiStride;
-	const uint64_t kuiTop1	= LD64(pPred-kiStride);
-	const uint64_t kuiTop2  = LD64(pPred-kiStride+8);
-	uint8_t i = 15;	
-	
-	do
-	{
-		ST64( pPred+iTmp  , kuiTop1 );
-		ST64( pPred+iTmp+8, kuiTop2 );
+void_t WelsI16x16LumaPredV_c (uint8_t* pPred, const int32_t kiStride) {
+  int32_t iTmp			= (kiStride << 4) - kiStride;
+  const uint64_t kuiTop1	= LD64 (pPred - kiStride);
+  const uint64_t kuiTop2  = LD64 (pPred - kiStride + 8);
+  uint8_t i = 15;
 
-		iTmp -= kiStride;
-	}while(i-->0);
+  do {
+    ST64 (pPred + iTmp  , kuiTop1);
+    ST64 (pPred + iTmp + 8, kuiTop2);
+
+    iTmp -= kiStride;
+  } while (i-- > 0);
 }
 
-void_t WelsI16x16LumaPredH_c(uint8_t *pPred, const int32_t kiStride)
-{
-	int32_t iTmp = (kiStride<<4)-kiStride;
-	uint8_t i = 15;
-	
-	do
-	{
-		const uint8_t kuiVal8	= pPred[iTmp-1];
-		const uint64_t kuiVal64	= 0x0101010101010101ULL * kuiVal8;
+void_t WelsI16x16LumaPredH_c (uint8_t* pPred, const int32_t kiStride) {
+  int32_t iTmp = (kiStride << 4) - kiStride;
+  uint8_t i = 15;
 
-		ST64( pPred+iTmp  , kuiVal64 );
-		ST64( pPred+iTmp+8, kuiVal64 );
+  do {
+    const uint8_t kuiVal8	= pPred[iTmp - 1];
+    const uint64_t kuiVal64	= 0x0101010101010101ULL * kuiVal8;
 
-		iTmp -= kiStride;
-	}while(i-->0);
+    ST64 (pPred + iTmp  , kuiVal64);
+    ST64 (pPred + iTmp + 8, kuiVal64);
+
+    iTmp -= kiStride;
+  } while (i-- > 0);
 }
 
-void_t WelsI16x16LumaPredPlane_c(uint8_t *pPred, const int32_t kiStride)
-{
-	int32_t a=0, b=0, c=0, H=0, V=0;
-	int32_t i, j;
-	uint8_t *pTop = &pPred[-kiStride];
-	uint8_t *pLeft = &pPred[-1];
+void_t WelsI16x16LumaPredPlane_c (uint8_t* pPred, const int32_t kiStride) {
+  int32_t a = 0, b = 0, c = 0, H = 0, V = 0;
+  int32_t i, j;
+  uint8_t* pTop = &pPred[-kiStride];
+  uint8_t* pLeft = &pPred[-1];
 
-	for(i = 0 ; i < 8 ; i ++)
-	{
-		H += (i + 1) * (pTop[8 + i] - pTop[6 - i]);
-		V += (i + 1) * (pLeft[(8 + i)*kiStride] - pLeft[(6 - i)*kiStride]);
-	}
+  for (i = 0 ; i < 8 ; i ++) {
+    H += (i + 1) * (pTop[8 + i] - pTop[6 - i]);
+    V += (i + 1) * (pLeft[ (8 + i) * kiStride] - pLeft[ (6 - i) * kiStride]);
+  }
 
-	a = (pLeft[15*kiStride] + pTop[15]) << 4;
-	b = (5 * H + 32) >> 6;
-	c = (5 * V + 32) >> 6;
+  a = (pLeft[15 * kiStride] + pTop[15]) << 4;
+  b = (5 * H + 32) >> 6;
+  c = (5 * V + 32) >> 6;
 
-	for(i = 0 ; i < 16 ; i ++)
-	{
-		for(j = 0 ; j < 16 ; j ++)
-		{
-			int32_t iTmp = (a + b * (j - 7) + c * (i - 7) + 16) >> 5;
-			iTmp = WELS_CLIP1(iTmp);
-			pPred[j] = iTmp;
-		}
-		pPred += kiStride;
-	}
+  for (i = 0 ; i < 16 ; i ++) {
+    for (j = 0 ; j < 16 ; j ++) {
+      int32_t iTmp = (a + b * (j - 7) + c * (i - 7) + 16) >> 5;
+      iTmp = WELS_CLIP1 (iTmp);
+      pPred[j] = iTmp;
+    }
+    pPred += kiStride;
+  }
 }
 
-void_t WelsI16x16LumaPredDc_c(uint8_t *pPred, const int32_t kiStride)
-{
-	int32_t iTmp = (kiStride<<4)-kiStride;
-	int32_t iSum = 0;
-	uint8_t i = 15;
-	uint8_t uiMean = 0;
+void_t WelsI16x16LumaPredDc_c (uint8_t* pPred, const int32_t kiStride) {
+  int32_t iTmp = (kiStride << 4) - kiStride;
+  int32_t iSum = 0;
+  uint8_t i = 15;
+  uint8_t uiMean = 0;
 
-	/*caculate the kMean value*/
-	do
-	{
-		iSum += pPred[-1+iTmp] + pPred[-kiStride+i];
-		iTmp -= kiStride;
-	}while(i-->0);
-	uiMean = ( 16 + iSum ) >> 5;
+  /*caculate the kMean value*/
+  do {
+    iSum += pPred[-1 + iTmp] + pPred[-kiStride + i];
+    iTmp -= kiStride;
+  } while (i-- > 0);
+  uiMean = (16 + iSum) >> 5;
 
-	iTmp = (kiStride<<4)-kiStride;
-	i = 15;
-	do
-	{
-		memset(&pPred[iTmp], uiMean, I16x16_COUNT);
-		iTmp -= kiStride;
-	}while(i-->0);
+  iTmp = (kiStride << 4) - kiStride;
+  i = 15;
+  do {
+    memset (&pPred[iTmp], uiMean, I16x16_COUNT);
+    iTmp -= kiStride;
+  } while (i-- > 0);
 }
 
 
-void_t WelsI16x16LumaPredDcTop_c(uint8_t *pPred, const int32_t kiStride)
-{
-	int32_t iTmp = (kiStride<<4)-kiStride;
-	int32_t iSum = 0;
-	uint8_t i = 15;
-	uint8_t uiMean = 0;
-	
-	/*caculate the kMean value*/
-	do
-	{
-		iSum += pPred[-kiStride+i];
-	}while(i-->0);
-	uiMean = ( 8 + iSum ) >> 4;
+void_t WelsI16x16LumaPredDcTop_c (uint8_t* pPred, const int32_t kiStride) {
+  int32_t iTmp = (kiStride << 4) - kiStride;
+  int32_t iSum = 0;
+  uint8_t i = 15;
+  uint8_t uiMean = 0;
 
-	i = 15;
-	do
-	{
-		memset(&pPred[iTmp], uiMean, I16x16_COUNT);
-		iTmp -= kiStride;
-	}while(i-->0);
+  /*caculate the kMean value*/
+  do {
+    iSum += pPred[-kiStride + i];
+  } while (i-- > 0);
+  uiMean = (8 + iSum) >> 4;
+
+  i = 15;
+  do {
+    memset (&pPred[iTmp], uiMean, I16x16_COUNT);
+    iTmp -= kiStride;
+  } while (i-- > 0);
 }
 
-void_t WelsI16x16LumaPredDcLeft_c(uint8_t *pPred, const int32_t kiStride)
-{
-	int32_t iTmp = (kiStride<<4)-kiStride;
-	int32_t iSum = 0;
-	uint64_t uiMean64 = 0;
-	uint8_t uiMean = 0;
-	uint8_t i = 15;	
+void_t WelsI16x16LumaPredDcLeft_c (uint8_t* pPred, const int32_t kiStride) {
+  int32_t iTmp = (kiStride << 4) - kiStride;
+  int32_t iSum = 0;
+  uint64_t uiMean64 = 0;
+  uint8_t uiMean = 0;
+  uint8_t i = 15;
 
-	/*caculate the kMean value*/
-	do
-	{
-		iSum += pPred[-1+iTmp];
-		iTmp -= kiStride;
-	}while(i-->0);
-	uiMean	= ( 8 + iSum ) >> 4;
-	uiMean64	= 0x0101010101010101ULL * uiMean;
+  /*caculate the kMean value*/
+  do {
+    iSum += pPred[-1 + iTmp];
+    iTmp -= kiStride;
+  } while (i-- > 0);
+  uiMean	= (8 + iSum) >> 4;
+  uiMean64	= 0x0101010101010101ULL * uiMean;
 
-	iTmp = (kiStride<<4)-kiStride;
-	i = 15;
-	do
-	{
-		ST64( pPred+iTmp  , uiMean64 );
-		ST64( pPred+iTmp+8, uiMean64 );
+  iTmp = (kiStride << 4) - kiStride;
+  i = 15;
+  do {
+    ST64 (pPred + iTmp  , uiMean64);
+    ST64 (pPred + iTmp + 8, uiMean64);
 
-		iTmp -= kiStride;
-	}while(i-->0);
+    iTmp -= kiStride;
+  } while (i-- > 0);
 }
 
-void_t WelsI16x16LumaPredDcNA_c(uint8_t *pPred, const int32_t kiStride)
-{
-	const uint64_t kuiDC64 = 0x8080808080808080ULL;
-	int32_t iTmp = (kiStride<<4)-kiStride;
-	uint8_t i = 15;	
-	
-	do
-	{
-		ST64( pPred+iTmp, kuiDC64 );
-		ST64( pPred+iTmp+8, kuiDC64 );
+void_t WelsI16x16LumaPredDcNA_c (uint8_t* pPred, const int32_t kiStride) {
+  const uint64_t kuiDC64 = 0x8080808080808080ULL;
+  int32_t iTmp = (kiStride << 4) - kiStride;
+  uint8_t i = 15;
 
-		iTmp -= kiStride;
-	}while(i-->0);
+  do {
+    ST64 (pPred + iTmp, kuiDC64);
+    ST64 (pPred + iTmp + 8, kuiDC64);
+
+    iTmp -= kiStride;
+  } while (i-- > 0);
 }
 
 } // namespace WelsDec
\ No newline at end of file
--- a/codec/decoder/core/src/manage_dec_ref.cpp
+++ b/codec/decoder/core/src/manage_dec_ref.cpp
@@ -47,21 +47,19 @@
 
 namespace WelsDec {
 
-static void_t SetUnRef(PPicture pRef)
-{
-    if( NULL != pRef)
-    {
-	    pRef->bUsedAsRef = false;
-	    pRef->bIsLongRef = false;
-	    pRef->iFrameNum = -1;
-	    pRef->iFramePoc = 0;
-	    pRef->iLongTermFrameIdx = -1;
-	    pRef->bRefBaseFlag = 0;
-	    pRef->uiQualityId = -1;	
-	    pRef->uiTemporalId = -1;
-	    pRef->uiSpatialId = -1;
-	    pRef->iSpsId = -1;
-    }
+static void_t SetUnRef (PPicture pRef) {
+  if (NULL != pRef) {
+    pRef->bUsedAsRef = false;
+    pRef->bIsLongRef = false;
+    pRef->iFrameNum = -1;
+    pRef->iFramePoc = 0;
+    pRef->iLongTermFrameIdx = -1;
+    pRef->bRefBaseFlag = 0;
+    pRef->uiQualityId = -1;
+    pRef->uiTemporalId = -1;
+    pRef->uiSpatialId = -1;
+    pRef->iSpsId = -1;
+  }
 }
 
 //reset pRefList when
@@ -68,508 +66,492 @@
 // 1.sps arrived that is new sequence starting
 // 2.IDR NAL i.e. 1st layer in IDR AU
 
-void_t WelsResetRefPic(PWelsDecoderContext pCtx)
-{
-	int32_t i = 0;
-	PRefPic pRefPic = &pCtx->sRefPic;
-	pCtx->sRefPic.uiLongRefCount[0] = pCtx->sRefPic.uiShortRefCount[0] = 0;
+void_t WelsResetRefPic (PWelsDecoderContext pCtx) {
+  int32_t i = 0;
+  PRefPic pRefPic = &pCtx->sRefPic;
+  pCtx->sRefPic.uiLongRefCount[0] = pCtx->sRefPic.uiShortRefCount[0] = 0;
 
-	pRefPic->uiRefCount[LIST_0]	= 0;
-	
-	for(i=0; i < MAX_SHORT_REF_COUNT; i++)	{
-		if ( pRefPic->pShortRefList[LIST_0][i] != NULL){	
-			SetUnRef(pRefPic->pShortRefList[LIST_0][i]);
-			pRefPic->pShortRefList[LIST_0][i] = NULL;
-		}
-	}
-	pRefPic->uiShortRefCount[LIST_0] = 0;
+  pRefPic->uiRefCount[LIST_0]	= 0;
 
-	for(i=0; i < MAX_LONG_REF_COUNT; i++){
-		if (pRefPic->pLongRefList[LIST_0][i] != NULL)	{	
-			SetUnRef(pRefPic->pLongRefList[LIST_0][i]);
-			pRefPic->pLongRefList[LIST_0][i] = NULL;
-		}
-	}
-	pRefPic->uiLongRefCount[LIST_0] = 0;
+  for (i = 0; i < MAX_SHORT_REF_COUNT; i++)	{
+    if (pRefPic->pShortRefList[LIST_0][i] != NULL) {
+      SetUnRef (pRefPic->pShortRefList[LIST_0][i]);
+      pRefPic->pShortRefList[LIST_0][i] = NULL;
+    }
+  }
+  pRefPic->uiShortRefCount[LIST_0] = 0;
+
+  for (i = 0; i < MAX_LONG_REF_COUNT; i++) {
+    if (pRefPic->pLongRefList[LIST_0][i] != NULL)	{
+      SetUnRef (pRefPic->pLongRefList[LIST_0][i]);
+      pRefPic->pLongRefList[LIST_0][i] = NULL;
+    }
+  }
+  pRefPic->uiLongRefCount[LIST_0] = 0;
 }
 
 /**
  * fills the pRefPic.pRefList.
  */
-int32_t WelsInitRefList(PWelsDecoderContext pCtx, int32_t iPoc)
-{
-	int32_t i,j, iCount=0;
-	const bool_t kbUseRefBasePicFlag = pCtx->pCurDqLayer->bUseRefBasePicFlag;
-	PPicture* ppShoreRefList = pCtx->sRefPic.pShortRefList[LIST_0];
-	PPicture* ppLongRefList  = pCtx->sRefPic.pLongRefList[LIST_0];
-	memset(pCtx->sRefPic.pRefList[LIST_0],0,MAX_REF_PIC_COUNT*sizeof(PPicture));
-	//short
-	for(i=0; i<pCtx->sRefPic.uiShortRefCount[LIST_0]; ++i){	
-		if( kbUseRefBasePicFlag == ppShoreRefList[i]->bRefBaseFlag ) {
-			pCtx->sRefPic.pRefList[LIST_0][iCount++ ]= ppShoreRefList[i];	
-		}else{
-			for ( j = 0;j<pCtx->sRefPic.uiShortRefCount[LIST_0];++j)
-			{
-				if (ppShoreRefList[j]->iFrameNum == ppShoreRefList[i]->iFrameNum && ppShoreRefList[j]->bRefBaseFlag == kbUseRefBasePicFlag)
-				{
-					break;
-				}
-			}
-			if (j == pCtx->sRefPic.uiShortRefCount[LIST_0])
-			{
-				pCtx->sRefPic.pRefList[LIST_0][iCount++] = ppShoreRefList[i];
-			}
-		}
-	}
-				
-	//long
-	j = 0;
-	for(i=0; i< pCtx->sRefPic.uiLongRefCount[LIST_0] ; ++i){
-		if(kbUseRefBasePicFlag == ppLongRefList[i]->bRefBaseFlag){
-			pCtx->sRefPic.pRefList[LIST_0][iCount++  ]= ppLongRefList[i];
-		}else{
-			for ( j = 0;j<pCtx->sRefPic.uiLongRefCount[LIST_0];++j)
-			{
-				if (ppLongRefList[j]->iLongTermFrameIdx == ppLongRefList[i]->iLongTermFrameIdx && ppLongRefList[j]->bRefBaseFlag == kbUseRefBasePicFlag)
-				{
-					break;
-				}
-			}
-			if (j == pCtx->sRefPic.uiLongRefCount[LIST_0])
-			{
-				pCtx->sRefPic.pRefList[LIST_0][iCount++] = ppLongRefList[i];
-			}
-		}
-	}
-	pCtx->sRefPic.uiRefCount[LIST_0] = iCount;	
+int32_t WelsInitRefList (PWelsDecoderContext pCtx, int32_t iPoc) {
+  int32_t i, j, iCount = 0;
+  const bool_t kbUseRefBasePicFlag = pCtx->pCurDqLayer->bUseRefBasePicFlag;
+  PPicture* ppShoreRefList = pCtx->sRefPic.pShortRefList[LIST_0];
+  PPicture* ppLongRefList  = pCtx->sRefPic.pLongRefList[LIST_0];
+  memset (pCtx->sRefPic.pRefList[LIST_0], 0, MAX_REF_PIC_COUNT * sizeof (PPicture));
+  //short
+  for (i = 0; i < pCtx->sRefPic.uiShortRefCount[LIST_0]; ++i) {
+    if (kbUseRefBasePicFlag == ppShoreRefList[i]->bRefBaseFlag) {
+      pCtx->sRefPic.pRefList[LIST_0][iCount++ ] = ppShoreRefList[i];
+    } else {
+      for (j = 0; j < pCtx->sRefPic.uiShortRefCount[LIST_0]; ++j) {
+        if (ppShoreRefList[j]->iFrameNum == ppShoreRefList[i]->iFrameNum
+            && ppShoreRefList[j]->bRefBaseFlag == kbUseRefBasePicFlag) {
+          break;
+        }
+      }
+      if (j == pCtx->sRefPic.uiShortRefCount[LIST_0]) {
+        pCtx->sRefPic.pRefList[LIST_0][iCount++] = ppShoreRefList[i];
+      }
+    }
+  }
 
-   return ERR_NONE;
+  //long
+  j = 0;
+  for (i = 0; i < pCtx->sRefPic.uiLongRefCount[LIST_0] ; ++i) {
+    if (kbUseRefBasePicFlag == ppLongRefList[i]->bRefBaseFlag) {
+      pCtx->sRefPic.pRefList[LIST_0][iCount++  ] = ppLongRefList[i];
+    } else {
+      for (j = 0; j < pCtx->sRefPic.uiLongRefCount[LIST_0]; ++j) {
+        if (ppLongRefList[j]->iLongTermFrameIdx == ppLongRefList[i]->iLongTermFrameIdx
+            && ppLongRefList[j]->bRefBaseFlag == kbUseRefBasePicFlag) {
+          break;
+        }
+      }
+      if (j == pCtx->sRefPic.uiLongRefCount[LIST_0]) {
+        pCtx->sRefPic.pRefList[LIST_0][iCount++] = ppLongRefList[i];
+      }
+    }
+  }
+  pCtx->sRefPic.uiRefCount[LIST_0] = iCount;
+
+  return ERR_NONE;
 }
 
-int32_t WelsReorderRefList(PWelsDecoderContext pCtx)
-{
-	PRefPicListReorderSyn pRefPicListReorderSyn = pCtx->pCurDqLayer->pRefPicListReordering;
-	PNalUnitHeaderExt pNalHeaderExt = &pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt;
-	PSliceHeader pSliceHeader = &pCtx->pCurDqLayer->sLayerInfo.sSliceInLayer.sSliceHeaderExt.sSliceHeader;
-	PPicture pPic = NULL;
-	PPicture* ppRefList = pCtx->sRefPic.pRefList[LIST_0];
-	int32_t iRefCount = pCtx->sRefPic.uiRefCount[LIST_0];
-	int32_t iPredFrameNum = pSliceHeader->iFrameNum;
-	int32_t iMaxPicNum = 1<<pSliceHeader->pSps->uiLog2MaxFrameNum;
-	int32_t iAbsDiffPicNum = -1;
-	int32_t iReorderingIndex = 0;
-	int32_t i = 0;
+int32_t WelsReorderRefList (PWelsDecoderContext pCtx) {
+  PRefPicListReorderSyn pRefPicListReorderSyn = pCtx->pCurDqLayer->pRefPicListReordering;
+  PNalUnitHeaderExt pNalHeaderExt = &pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt;
+  PSliceHeader pSliceHeader = &pCtx->pCurDqLayer->sLayerInfo.sSliceInLayer.sSliceHeaderExt.sSliceHeader;
+  PPicture pPic = NULL;
+  PPicture* ppRefList = pCtx->sRefPic.pRefList[LIST_0];
+  int32_t iRefCount = pCtx->sRefPic.uiRefCount[LIST_0];
+  int32_t iPredFrameNum = pSliceHeader->iFrameNum;
+  int32_t iMaxPicNum = 1 << pSliceHeader->pSps->uiLog2MaxFrameNum;
+  int32_t iAbsDiffPicNum = -1;
+  int32_t iReorderingIndex = 0;
+  int32_t i = 0;
 
-	if(pCtx->eSliceType == I_SLICE || pCtx->eSliceType == SI_SLICE)	{	
-		return ERR_NONE;	
-	}
+  if (pCtx->eSliceType == I_SLICE || pCtx->eSliceType == SI_SLICE)	{
+    return ERR_NONE;
+  }
 
-	if ( iRefCount <= 0 )
-	{
-		pCtx->iErrorCode = dsNoParamSets; //No any reference for decoding, SHOULD request IDR
-		return ERR_INFO_REFERENCE_PIC_LOST;
-	}
+  if (iRefCount <= 0) {
+    pCtx->iErrorCode = dsNoParamSets; //No any reference for decoding, SHOULD request IDR
+    return ERR_INFO_REFERENCE_PIC_LOST;
+  }
 
-	if (pRefPicListReorderSyn->bRefPicListReorderingFlag[LIST_0]){
-		while (pRefPicListReorderSyn->sReorderingSyn[LIST_0][iReorderingIndex].uiReorderingOfPicNumsIdc != 3)
-		{
-			uint16_t uiReorderingOfPicNumsIdc = pRefPicListReorderSyn->sReorderingSyn[LIST_0][iReorderingIndex].uiReorderingOfPicNumsIdc;
-			if (uiReorderingOfPicNumsIdc <2){
-				iAbsDiffPicNum = pRefPicListReorderSyn->sReorderingSyn[LIST_0][iReorderingIndex].uiAbsDiffPicNumMinus1 + 1;
+  if (pRefPicListReorderSyn->bRefPicListReorderingFlag[LIST_0]) {
+    while (pRefPicListReorderSyn->sReorderingSyn[LIST_0][iReorderingIndex].uiReorderingOfPicNumsIdc != 3) {
+      uint16_t uiReorderingOfPicNumsIdc =
+        pRefPicListReorderSyn->sReorderingSyn[LIST_0][iReorderingIndex].uiReorderingOfPicNumsIdc;
+      if (uiReorderingOfPicNumsIdc < 2) {
+        iAbsDiffPicNum = pRefPicListReorderSyn->sReorderingSyn[LIST_0][iReorderingIndex].uiAbsDiffPicNumMinus1 + 1;
 
-				if (uiReorderingOfPicNumsIdc == 0){	
-					iPredFrameNum -= iAbsDiffPicNum;
-				}else{	
-					iPredFrameNum += iAbsDiffPicNum;	
-				}
-				iPredFrameNum &= iMaxPicNum-1;
+        if (uiReorderingOfPicNumsIdc == 0) {
+          iPredFrameNum -= iAbsDiffPicNum;
+        } else {
+          iPredFrameNum += iAbsDiffPicNum;
+        }
+        iPredFrameNum &= iMaxPicNum - 1;
 
-				for( i= iRefCount-1; i>=iReorderingIndex; i--){
-					if (ppRefList[i]->iFrameNum == iPredFrameNum && !ppRefList[i]->bIsLongRef)
-					{
-						if( ( pNalHeaderExt->uiQualityId == ppRefList[i]->uiQualityId ) && ( pSliceHeader->iSpsId != ppRefList[i]->iSpsId ) )//check;
-						{
-							WelsLog( pCtx, WELS_LOG_WARNING, "WelsReorderRefList()::::BASE LAYER::::iSpsId:%d, ref_sps_id:%d\n",pSliceHeader->iSpsId, ppRefList[i]->iSpsId );						
-							pCtx->iErrorCode = dsNoParamSets;	//cross-IDR reference frame selection, SHOULD request IDR.--
-							return ERR_INFO_REFERENCE_PIC_LOST;
-						}else{
-							break;
-						}
-					}
-				}
-		
-			}else if (uiReorderingOfPicNumsIdc == 2){
-				for(  i = iRefCount -1; i>=iReorderingIndex; i--){
-					if( ppRefList[i]->bIsLongRef && ppRefList[i]->iLongTermFrameIdx == pRefPicListReorderSyn->sReorderingSyn[LIST_0][iReorderingIndex].uiLongTermPicNum )
-					{
-						if ( ( pNalHeaderExt->uiQualityId == ppRefList[i]->uiQualityId ) && ( pSliceHeader->iSpsId != ppRefList[i]->iSpsId ) )//check;
-						{
-							WelsLog( pCtx, WELS_LOG_WARNING, "WelsReorderRefList()::::BASE LAYER::::iSpsId:%d, ref_sps_id:%d\n",pSliceHeader->iSpsId, ppRefList[i]->iSpsId );						
-							pCtx->iErrorCode = dsNoParamSets;	//cross-IDR reference frame selection, SHOULD request IDR.--
-							return ERR_INFO_REFERENCE_PIC_LOST;
-						}else{
-							break;
-						}
-					}
-				}
-			}
-			if (i < 0)	{	
-				return ERR_INFO_REFERENCE_PIC_LOST;
-			}
-			pPic = ppRefList[i];
-			memmove(&ppRefList[1+iReorderingIndex], &ppRefList[iReorderingIndex], (i-iReorderingIndex)*sizeof(PPicture));//confirmed_safe_unsafe_usage
-			ppRefList[iReorderingIndex]= pPic;
-			iReorderingIndex++;
-		}
-	}
-	return ERR_NONE;
+        for (i = iRefCount - 1; i >= iReorderingIndex; i--) {
+          if (ppRefList[i]->iFrameNum == iPredFrameNum && !ppRefList[i]->bIsLongRef) {
+            if ((pNalHeaderExt->uiQualityId == ppRefList[i]->uiQualityId)
+                && (pSliceHeader->iSpsId != ppRefList[i]->iSpsId)) {   //check;
+              WelsLog (pCtx, WELS_LOG_WARNING, "WelsReorderRefList()::::BASE LAYER::::iSpsId:%d, ref_sps_id:%d\n",
+                       pSliceHeader->iSpsId, ppRefList[i]->iSpsId);
+              pCtx->iErrorCode = dsNoParamSets;	//cross-IDR reference frame selection, SHOULD request IDR.--
+              return ERR_INFO_REFERENCE_PIC_LOST;
+            } else {
+              break;
+            }
+          }
+        }
+
+      } else if (uiReorderingOfPicNumsIdc == 2) {
+        for (i = iRefCount - 1; i >= iReorderingIndex; i--) {
+          if (ppRefList[i]->bIsLongRef
+              && ppRefList[i]->iLongTermFrameIdx ==
+              pRefPicListReorderSyn->sReorderingSyn[LIST_0][iReorderingIndex].uiLongTermPicNum) {
+            if ((pNalHeaderExt->uiQualityId == ppRefList[i]->uiQualityId)
+                && (pSliceHeader->iSpsId != ppRefList[i]->iSpsId)) {    //check;
+              WelsLog (pCtx, WELS_LOG_WARNING, "WelsReorderRefList()::::BASE LAYER::::iSpsId:%d, ref_sps_id:%d\n",
+                       pSliceHeader->iSpsId, ppRefList[i]->iSpsId);
+              pCtx->iErrorCode = dsNoParamSets;	//cross-IDR reference frame selection, SHOULD request IDR.--
+              return ERR_INFO_REFERENCE_PIC_LOST;
+            } else {
+              break;
+            }
+          }
+        }
+      }
+      if (i < 0)	{
+        return ERR_INFO_REFERENCE_PIC_LOST;
+      }
+      pPic = ppRefList[i];
+      memmove (&ppRefList[1 + iReorderingIndex], &ppRefList[iReorderingIndex],
+               (i - iReorderingIndex)*sizeof (PPicture)); //confirmed_safe_unsafe_usage
+      ppRefList[iReorderingIndex] = pPic;
+      iReorderingIndex++;
+    }
+  }
+  return ERR_NONE;
 }
 
-int32_t WelsMarkAsRef(PWelsDecoderContext pCtx, const bool_t kbRefBaseMarkingFlag)
-{
-	PRefPic pRefPic = &pCtx->sRefPic;
-	PRefPicMarking pRefPicMarking = pCtx->pCurDqLayer->pRefPicMarking;
-	PRefBasePicMarking pRefPicBaseMarking =pCtx->pCurDqLayer->pRefPicBaseMarking;
-	PAccessUnit pCurAU = pCtx->pAccessUnitList;
-	bool_t bIsIDRAU = FALSE;
-	uint32_t j;
+int32_t WelsMarkAsRef (PWelsDecoderContext pCtx, const bool_t kbRefBaseMarkingFlag) {
+  PRefPic pRefPic = &pCtx->sRefPic;
+  PRefPicMarking pRefPicMarking = pCtx->pCurDqLayer->pRefPicMarking;
+  PRefBasePicMarking pRefPicBaseMarking = pCtx->pCurDqLayer->pRefPicBaseMarking;
+  PAccessUnit pCurAU = pCtx->pAccessUnitList;
+  bool_t bIsIDRAU = FALSE;
+  uint32_t j;
 
-	int32_t iRet = ERR_NONE;
-	if(pCtx->pCurDqLayer->bStoreRefBasePicFlag && (pCtx->pSps->iNumRefFrames<2)){
-		return ERR_INFO_INVALID_MMCO_REF_NUM_NOT_ENOUGH;
-	}
-	
-	pCtx->pDec->bUsedAsRef = TRUE;
-	pCtx->pDec->uiQualityId = pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt.uiQualityId;
-	pCtx->pDec->uiTemporalId = pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt.uiTemporalId;
-	pCtx->pDec->bRefBaseFlag = kbRefBaseMarkingFlag;
+  int32_t iRet = ERR_NONE;
+  if (pCtx->pCurDqLayer->bStoreRefBasePicFlag && (pCtx->pSps->iNumRefFrames < 2)) {
+    return ERR_INFO_INVALID_MMCO_REF_NUM_NOT_ENOUGH;
+  }
 
-	for( j = pCurAU->uiStartPos; j <= pCurAU->uiEndPos; j++ ) {
-		if (pCurAU->pNalUnitsList[j]->sNalHeaderExt.sNalUnitHeader.eNalUnitType== NAL_UNIT_CODED_SLICE_IDR||	pCurAU->pNalUnitsList[j]->sNalHeaderExt.bIdrFlag) {
-			bIsIDRAU = TRUE;
-			break;
-		}
-	}
-	if(bIsIDRAU){
-		if (pRefPicMarking->bLongTermRefFlag){
-			pCtx->sRefPic.iMaxLongTermFrameIdx = 0;
-			AddLongTermToList(pRefPic,pCtx->pDec,0);
-		}else{	
-			pCtx->sRefPic.iMaxLongTermFrameIdx = -1;
-		}
-	}else{
-		if (pRefPicBaseMarking->bAdaptiveRefBasePicMarkingModeFlag){
-			iRet = MMCOBase(pCtx,pRefPicBaseMarking);
-		}
+  pCtx->pDec->bUsedAsRef = TRUE;
+  pCtx->pDec->uiQualityId = pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt.uiQualityId;
+  pCtx->pDec->uiTemporalId = pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt.uiTemporalId;
+  pCtx->pDec->bRefBaseFlag = kbRefBaseMarkingFlag;
 
-		if (iRet != ERR_NONE){
-			return iRet;
-		}
+  for (j = pCurAU->uiStartPos; j <= pCurAU->uiEndPos; j++) {
+    if (pCurAU->pNalUnitsList[j]->sNalHeaderExt.sNalUnitHeader.eNalUnitType == NAL_UNIT_CODED_SLICE_IDR
+        ||	pCurAU->pNalUnitsList[j]->sNalHeaderExt.bIdrFlag) {
+      bIsIDRAU = TRUE;
+      break;
+    }
+  }
+  if (bIsIDRAU) {
+    if (pRefPicMarking->bLongTermRefFlag) {
+      pCtx->sRefPic.iMaxLongTermFrameIdx = 0;
+      AddLongTermToList (pRefPic, pCtx->pDec, 0);
+    } else {
+      pCtx->sRefPic.iMaxLongTermFrameIdx = -1;
+    }
+  } else {
+    if (pRefPicBaseMarking->bAdaptiveRefBasePicMarkingModeFlag) {
+      iRet = MMCOBase (pCtx, pRefPicBaseMarking);
+    }
 
-		if (pRefPicMarking->bAdaptiveRefPicMarkingModeFlag){
-			iRet = MMCO(pCtx,pRefPicMarking);
-            if( pCtx->bLastHasMmco5 )
-            {
-                pCtx->pDec->iFrameNum = 0;
-                pCtx->pDec->iFramePoc = 0;
-            }
-			if (pRefPic->uiLongRefCount[LIST_0]+pRefPic->uiShortRefCount[LIST_0] > pCtx->pSps->iNumRefFrames){
-				return ERR_INFO_INVALID_MMCO_REF_NUM_OVERFLOW;
-			}
-		}else{	
-			iRet = SlidingWindow(pCtx);
-		}
-	}
-	
-	if (!pCtx->pDec->bIsLongRef){
-		AddShortTermToList(pRefPic,pCtx->pDec);
-	}
+    if (iRet != ERR_NONE) {
+      return iRet;
+    }
 
-	return iRet;
+    if (pRefPicMarking->bAdaptiveRefPicMarkingModeFlag) {
+      iRet = MMCO (pCtx, pRefPicMarking);
+      if (pCtx->bLastHasMmco5) {
+        pCtx->pDec->iFrameNum = 0;
+        pCtx->pDec->iFramePoc = 0;
+      }
+      if (pRefPic->uiLongRefCount[LIST_0] + pRefPic->uiShortRefCount[LIST_0] > pCtx->pSps->iNumRefFrames) {
+        return ERR_INFO_INVALID_MMCO_REF_NUM_OVERFLOW;
+      }
+    } else {
+      iRet = SlidingWindow (pCtx);
+    }
+  }
+
+  if (!pCtx->pDec->bIsLongRef) {
+    AddShortTermToList (pRefPic, pCtx->pDec);
+  }
+
+  return iRet;
 }
 
-static int32_t MMCOBase(PWelsDecoderContext pCtx,PRefBasePicMarking pRefPicBaseMarking)
-{
-	PSps pSps = pCtx->pCurDqLayer->sLayerInfo.pSps;
-	int32_t i = 0;
-	int32_t iRet = ERR_NONE;
+static int32_t MMCOBase (PWelsDecoderContext pCtx, PRefBasePicMarking pRefPicBaseMarking) {
+  PSps pSps = pCtx->pCurDqLayer->sLayerInfo.pSps;
+  int32_t i = 0;
+  int32_t iRet = ERR_NONE;
 
-	for ( i = 0 ; pRefPicBaseMarking->mmco_base[i].uiMmcoType != MMCO_END; i++){
-		uint32_t uiMmcoType = pRefPicBaseMarking->mmco_base[i].uiMmcoType;			
-		int32_t iShortFrameNum = (pCtx->iFrameNum - pRefPicBaseMarking->mmco_base[i].uiDiffOfPicNums) &((1<<pSps->uiLog2MaxFrameNum)-1);
-		uint32_t uiLongTermPicNum = pRefPicBaseMarking->mmco_base[i].uiLongTermPicNum;
-		if ( uiMmcoType > MMCO_LONG2UNUSED)	{
-			return ERR_INFO_INVALID_MMCO_OPCODE_BASE;
-		}
-		iRet = MMCOProcess(pCtx,uiMmcoType,TRUE,iShortFrameNum,uiLongTermPicNum,0,0);
+  for (i = 0 ; pRefPicBaseMarking->mmco_base[i].uiMmcoType != MMCO_END; i++) {
+    uint32_t uiMmcoType = pRefPicBaseMarking->mmco_base[i].uiMmcoType;
+    int32_t iShortFrameNum = (pCtx->iFrameNum - pRefPicBaseMarking->mmco_base[i].uiDiffOfPicNums) & ((
+                               1 << pSps->uiLog2MaxFrameNum) - 1);
+    uint32_t uiLongTermPicNum = pRefPicBaseMarking->mmco_base[i].uiLongTermPicNum;
+    if (uiMmcoType > MMCO_LONG2UNUSED)	{
+      return ERR_INFO_INVALID_MMCO_OPCODE_BASE;
+    }
+    iRet = MMCOProcess (pCtx, uiMmcoType, TRUE, iShortFrameNum, uiLongTermPicNum, 0, 0);
 
-		if (iRet != ERR_NONE){
-			return iRet;
-		}
-	}
+    if (iRet != ERR_NONE) {
+      return iRet;
+    }
+  }
 
-	return ERR_NONE;
+  return ERR_NONE;
 }
 
-static int32_t MMCO(PWelsDecoderContext pCtx,PRefPicMarking pRefPicMarking)
-{
-	PSps pSps = pCtx->pCurDqLayer->sLayerInfo.pSps;
-	int32_t i = 0;
-	int32_t iRet = ERR_NONE;
-	for ( i = 0; pRefPicMarking->sMmcoRef[i].uiMmcoType != MMCO_END; i++){
-		uint32_t uiMmcoType = pRefPicMarking->sMmcoRef[i].uiMmcoType;
-		int32_t iShortFrameNum = (pCtx->iFrameNum - pRefPicMarking->sMmcoRef[i].iDiffOfPicNum) & ((1<<pSps->uiLog2MaxFrameNum)-1);
-		uint32_t uiLongTermPicNum = pRefPicMarking->sMmcoRef[i].uiLongTermPicNum;
-		int32_t iLongTermFrameIdx = pRefPicMarking->sMmcoRef[i].iLongTermFrameIdx;
-		int32_t iMaxLongTermFrameIdx = pRefPicMarking->sMmcoRef[i].iMaxLongTermFrameIdx;
-		if ( uiMmcoType > MMCO_LONG)	{
-			return ERR_INFO_INVALID_MMCO_OPCODE_BASE;
-		}
-		iRet = MMCOProcess(pCtx,uiMmcoType,FALSE,iShortFrameNum,uiLongTermPicNum,iLongTermFrameIdx,iMaxLongTermFrameIdx);
-		if (iRet != ERR_NONE){
-			return iRet;
-		}
-	}
+static int32_t MMCO (PWelsDecoderContext pCtx, PRefPicMarking pRefPicMarking) {
+  PSps pSps = pCtx->pCurDqLayer->sLayerInfo.pSps;
+  int32_t i = 0;
+  int32_t iRet = ERR_NONE;
+  for (i = 0; pRefPicMarking->sMmcoRef[i].uiMmcoType != MMCO_END; i++) {
+    uint32_t uiMmcoType = pRefPicMarking->sMmcoRef[i].uiMmcoType;
+    int32_t iShortFrameNum = (pCtx->iFrameNum - pRefPicMarking->sMmcoRef[i].iDiffOfPicNum) & ((
+                               1 << pSps->uiLog2MaxFrameNum) - 1);
+    uint32_t uiLongTermPicNum = pRefPicMarking->sMmcoRef[i].uiLongTermPicNum;
+    int32_t iLongTermFrameIdx = pRefPicMarking->sMmcoRef[i].iLongTermFrameIdx;
+    int32_t iMaxLongTermFrameIdx = pRefPicMarking->sMmcoRef[i].iMaxLongTermFrameIdx;
+    if (uiMmcoType > MMCO_LONG)	{
+      return ERR_INFO_INVALID_MMCO_OPCODE_BASE;
+    }
+    iRet = MMCOProcess (pCtx, uiMmcoType, FALSE, iShortFrameNum, uiLongTermPicNum, iLongTermFrameIdx, iMaxLongTermFrameIdx);
+    if (iRet != ERR_NONE) {
+      return iRet;
+    }
+  }
 
-	return ERR_NONE;
+  return ERR_NONE;
 }
-static int32_t MMCOProcess( PWelsDecoderContext pCtx,uint32_t uiMmcoType,bool_t bRefBasePic,
-                           int32_t iShortFrameNum,uint32_t uiLongTermPicNum ,int32_t iLongTermFrameIdx,int32_t iMaxLongTermFrameIdx )
-{
-	PRefPic pRefPic = &pCtx->sRefPic;
-	PPicture pPic = NULL;
-	int32_t i = 0;
-	int32_t iRet = ERR_NONE;
+static int32_t MMCOProcess (PWelsDecoderContext pCtx, uint32_t uiMmcoType, bool_t bRefBasePic,
+                            int32_t iShortFrameNum, uint32_t uiLongTermPicNum , int32_t iLongTermFrameIdx, int32_t iMaxLongTermFrameIdx) {
+  PRefPic pRefPic = &pCtx->sRefPic;
+  PPicture pPic = NULL;
+  int32_t i = 0;
+  int32_t iRet = ERR_NONE;
 
-	switch (uiMmcoType)
-	{
-	case MMCO_SHORT2UNUSED:
-		pPic = WelsDelShortFromListSetUnref(pRefPic,iShortFrameNum,(ERemoveFlag) bRefBasePic);
-		break;
-	case MMCO_LONG2UNUSED:
-		pPic = WelsDelLongFromListSetUnref(pRefPic,uiLongTermPicNum,(ERemoveFlag) bRefBasePic);
-		break;
-	case MMCO_SHORT2LONG:
-		if(iLongTermFrameIdx > pRefPic->iMaxLongTermFrameIdx){	
-			return ERR_INFO_INVALID_MMCO_LONG_TERM_IDX_EXCEED_MAX;
-		}
-		pPic = WelsDelShortFromList(pRefPic,iShortFrameNum,REMOVE_TARGET); 
-		WelsDelLongFromListSetUnref(pRefPic,iLongTermFrameIdx,REMOVE_TARGET);
+  switch (uiMmcoType) {
+  case MMCO_SHORT2UNUSED:
+    pPic = WelsDelShortFromListSetUnref (pRefPic, iShortFrameNum, (ERemoveFlag) bRefBasePic);
+    break;
+  case MMCO_LONG2UNUSED:
+    pPic = WelsDelLongFromListSetUnref (pRefPic, uiLongTermPicNum, (ERemoveFlag) bRefBasePic);
+    break;
+  case MMCO_SHORT2LONG:
+    if (iLongTermFrameIdx > pRefPic->iMaxLongTermFrameIdx) {
+      return ERR_INFO_INVALID_MMCO_LONG_TERM_IDX_EXCEED_MAX;
+    }
+    pPic = WelsDelShortFromList (pRefPic, iShortFrameNum, REMOVE_TARGET);
+    WelsDelLongFromListSetUnref (pRefPic, iLongTermFrameIdx, REMOVE_TARGET);
 
-		WelsDelShortFromList(pRefPic,iShortFrameNum,REMOVE_BASE); 			
-		WelsDelLongFromListSetUnref(pRefPic,iLongTermFrameIdx,REMOVE_BASE);
+    WelsDelShortFromList (pRefPic, iShortFrameNum, REMOVE_BASE);
+    WelsDelLongFromListSetUnref (pRefPic, iLongTermFrameIdx, REMOVE_BASE);
 #ifdef LONG_TERM_REF
-		pCtx->bCurAuContainLtrMarkSeFlag = true;
-		pCtx->iFrameNumOfAuMarkedLtr      = iShortFrameNum;
-		WelsLog( pCtx, WELS_LOG_INFO, "ex_mark_avc():::MMCO_SHORT2LONG:::LTR marking....iFrameNum: %d\n", pCtx->iFrameNumOfAuMarkedLtr );
+    pCtx->bCurAuContainLtrMarkSeFlag = true;
+    pCtx->iFrameNumOfAuMarkedLtr      = iShortFrameNum;
+    WelsLog (pCtx, WELS_LOG_INFO, "ex_mark_avc():::MMCO_SHORT2LONG:::LTR marking....iFrameNum: %d\n",
+             pCtx->iFrameNumOfAuMarkedLtr);
 #endif
 
-		MarkAsLongTerm(pRefPic,iShortFrameNum,iLongTermFrameIdx);
-		break;
-	case MMCO_SET_MAX_LONG:
-		pRefPic->iMaxLongTermFrameIdx = iMaxLongTermFrameIdx;
-		for (i = 0 ;i <pRefPic->uiLongRefCount[LIST_0];i++) {
-			if (pRefPic->pLongRefList[LIST_0][i]->iLongTermFrameIdx > pRefPic->iMaxLongTermFrameIdx) {
-				WelsDelLongFromListSetUnref(pRefPic,pRefPic->pLongRefList[LIST_0][i]->iLongTermFrameIdx,REMOVE_BASE_FIRST);		
-			}
-		}
-		break;
-	case MMCO_RESET:
-		WelsResetRefPic(pCtx);
-        pCtx->bLastHasMmco5 = true;
-		break;
-	case MMCO_LONG:
-		if(iLongTermFrameIdx > pRefPic->iMaxLongTermFrameIdx){	
-			return ERR_INFO_INVALID_MMCO_LONG_TERM_IDX_EXCEED_MAX; 
-		}
+    MarkAsLongTerm (pRefPic, iShortFrameNum, iLongTermFrameIdx);
+    break;
+  case MMCO_SET_MAX_LONG:
+    pRefPic->iMaxLongTermFrameIdx = iMaxLongTermFrameIdx;
+    for (i = 0 ; i < pRefPic->uiLongRefCount[LIST_0]; i++) {
+      if (pRefPic->pLongRefList[LIST_0][i]->iLongTermFrameIdx > pRefPic->iMaxLongTermFrameIdx) {
+        WelsDelLongFromListSetUnref (pRefPic, pRefPic->pLongRefList[LIST_0][i]->iLongTermFrameIdx, REMOVE_BASE_FIRST);
+      }
+    }
+    break;
+  case MMCO_RESET:
+    WelsResetRefPic (pCtx);
+    pCtx->bLastHasMmco5 = true;
+    break;
+  case MMCO_LONG:
+    if (iLongTermFrameIdx > pRefPic->iMaxLongTermFrameIdx) {
+      return ERR_INFO_INVALID_MMCO_LONG_TERM_IDX_EXCEED_MAX;
+    }
 #ifdef LONG_TERM_REF
-		pCtx->bCurAuContainLtrMarkSeFlag = true;
-		pCtx->iFrameNumOfAuMarkedLtr      = pCtx->iFrameNum;
-		WelsLog( pCtx, WELS_LOG_INFO, "ex_mark_avc():::MMCO_LONG:::LTR marking....iFrameNum: %d\n", pCtx->iFrameNum );
+    pCtx->bCurAuContainLtrMarkSeFlag = true;
+    pCtx->iFrameNumOfAuMarkedLtr      = pCtx->iFrameNum;
+    WelsLog (pCtx, WELS_LOG_INFO, "ex_mark_avc():::MMCO_LONG:::LTR marking....iFrameNum: %d\n", pCtx->iFrameNum);
 #endif
-		WelsDelLongFromListSetUnref(pRefPic,iLongTermFrameIdx,REMOVE_TARGET);
-		WelsDelLongFromListSetUnref(pRefPic,iLongTermFrameIdx,REMOVE_BASE);
-		iRet = AddLongTermToList(pRefPic,pCtx->pDec,iLongTermFrameIdx);
-		break;
-	default :
-		break;
-	}
+    WelsDelLongFromListSetUnref (pRefPic, iLongTermFrameIdx, REMOVE_TARGET);
+    WelsDelLongFromListSetUnref (pRefPic, iLongTermFrameIdx, REMOVE_BASE);
+    iRet = AddLongTermToList (pRefPic, pCtx->pDec, iLongTermFrameIdx);
+    break;
+  default :
+    break;
+  }
 
-	return iRet;
+  return iRet;
 }
 
-static int32_t SlidingWindow( PWelsDecoderContext pCtx )
-{
-	PRefPic pRefPic = &pCtx->sRefPic;
-	PPicture pPic = NULL;
-	int32_t i = 0;
+static int32_t SlidingWindow (PWelsDecoderContext pCtx) {
+  PRefPic pRefPic = &pCtx->sRefPic;
+  PPicture pPic = NULL;
+  int32_t i = 0;
 
-	if (pCtx->sRefPic.uiShortRefCount[LIST_0] +pCtx->sRefPic.uiLongRefCount[LIST_0] >= pCtx->pSps->iNumRefFrames){	
-		for ( i = pRefPic->uiShortRefCount[LIST_0] -1;i>=0;i--){
-			pPic = WelsDelShortFromList(pRefPic,pRefPic->pShortRefList[LIST_0][i]->iFrameNum,REMOVE_BASE_FIRST);
-			if (pPic){	
-				SetUnRef(pPic);
-				break;
-			}else{
-				return ERR_INFO_INVALID_MMCO_REF_NUM_OVERFLOW;
-			}
-		}
-	}
-	return ERR_NONE;
+  if (pCtx->sRefPic.uiShortRefCount[LIST_0] + pCtx->sRefPic.uiLongRefCount[LIST_0] >= pCtx->pSps->iNumRefFrames) {
+    for (i = pRefPic->uiShortRefCount[LIST_0] - 1; i >= 0; i--) {
+      pPic = WelsDelShortFromList (pRefPic, pRefPic->pShortRefList[LIST_0][i]->iFrameNum, REMOVE_BASE_FIRST);
+      if (pPic) {
+        SetUnRef (pPic);
+        break;
+      } else {
+        return ERR_INFO_INVALID_MMCO_REF_NUM_OVERFLOW;
+      }
+    }
+  }
+  return ERR_NONE;
 }
 
-static PPicture WelsDelShortFromList(PRefPic pRefPic, int32_t iFrameNum, ERemoveFlag eRemoveFlag)
-{
-	int32_t i = 0;
-	int32_t iMoveSize = 0;
-	PPicture pPic = NULL;
+static PPicture WelsDelShortFromList (PRefPic pRefPic, int32_t iFrameNum, ERemoveFlag eRemoveFlag) {
+  int32_t i = 0;
+  int32_t iMoveSize = 0;
+  PPicture pPic = NULL;
 
-	for(i=0; i<pRefPic->uiShortRefCount[LIST_0]; i++){
-		if( pRefPic->pShortRefList[LIST_0][i]->iFrameNum == iFrameNum)
-		{
-			if(   ( eRemoveFlag == REMOVE_TARGET && !pRefPic->pShortRefList[LIST_0][i]->bRefBaseFlag )	
-				||( eRemoveFlag == REMOVE_BASE && pRefPic->pShortRefList[LIST_0][i]->bRefBaseFlag) 
-				||(eRemoveFlag == REMOVE_BASE_FIRST ) )
-			{
-				iMoveSize = pRefPic->uiShortRefCount[LIST_0] - i - 1;
-				pRefPic->pShortRefList[LIST_0][i]->bUsedAsRef = false;
-				pPic = pRefPic->pShortRefList[LIST_0][i];
-				pRefPic->pShortRefList[LIST_0][i]= NULL;
-				if (iMoveSize > 0){	
-					memmove(&pRefPic->pShortRefList[LIST_0][i], &pRefPic->pShortRefList[LIST_0][i+1], iMoveSize * sizeof(PPicture));//confirmed_safe_unsafe_usage
-				}
-				pRefPic->uiShortRefCount[LIST_0]--;
-				pRefPic->pShortRefList[LIST_0][pRefPic->uiShortRefCount[0]] = NULL;
-				break;
-			}
-		}
-	}
+  for (i = 0; i < pRefPic->uiShortRefCount[LIST_0]; i++) {
+    if (pRefPic->pShortRefList[LIST_0][i]->iFrameNum == iFrameNum) {
+      if ((eRemoveFlag == REMOVE_TARGET && !pRefPic->pShortRefList[LIST_0][i]->bRefBaseFlag)
+          || (eRemoveFlag == REMOVE_BASE && pRefPic->pShortRefList[LIST_0][i]->bRefBaseFlag)
+          || (eRemoveFlag == REMOVE_BASE_FIRST)) {
+        iMoveSize = pRefPic->uiShortRefCount[LIST_0] - i - 1;
+        pRefPic->pShortRefList[LIST_0][i]->bUsedAsRef = false;
+        pPic = pRefPic->pShortRefList[LIST_0][i];
+        pRefPic->pShortRefList[LIST_0][i] = NULL;
+        if (iMoveSize > 0) {
+          memmove (&pRefPic->pShortRefList[LIST_0][i], &pRefPic->pShortRefList[LIST_0][i + 1],
+                   iMoveSize * sizeof (PPicture)); //confirmed_safe_unsafe_usage
+        }
+        pRefPic->uiShortRefCount[LIST_0]--;
+        pRefPic->pShortRefList[LIST_0][pRefPic->uiShortRefCount[0]] = NULL;
+        break;
+      }
+    }
+  }
 
-	return pPic;
+  return pPic;
 }
 
-static PPicture WelsDelShortFromListSetUnref(PRefPic pRefPic, int32_t iFrameNum, ERemoveFlag eRemoveFlag)
-{
-	PPicture pPic = WelsDelShortFromList(pRefPic,iFrameNum,eRemoveFlag);
-	if (pPic){	
-		SetUnRef(pPic);
-	}
-	return pPic;
+static PPicture WelsDelShortFromListSetUnref (PRefPic pRefPic, int32_t iFrameNum, ERemoveFlag eRemoveFlag) {
+  PPicture pPic = WelsDelShortFromList (pRefPic, iFrameNum, eRemoveFlag);
+  if (pPic) {
+    SetUnRef (pPic);
+  }
+  return pPic;
 }
 
-static PPicture WelsDelLongFromList(PRefPic pRefPic, uint32_t uiLongTermFrameIdx, ERemoveFlag eRemoveFlag)
-{
-	PPicture pPic = NULL;
-	int32_t i = 0;
-	for ( i = 0;i<pRefPic->uiLongRefCount[LIST_0];i++)
-	{
-		pPic = pRefPic->pLongRefList[LIST_0][i];
-		if ( pPic->iLongTermFrameIdx == (int32_t)uiLongTermFrameIdx)
-		{
-			if( ((eRemoveFlag == REMOVE_TARGET) && !(pPic->bRefBaseFlag)) || ((eRemoveFlag == REMOVE_BASE) && pPic->bRefBaseFlag) )
-			{
-				int32_t iMoveSize = pRefPic->uiLongRefCount[LIST_0] - i - 1;
-				pPic->bUsedAsRef = FALSE;
-				pPic->bIsLongRef = FALSE;
-				if (iMoveSize > 0){	
-					memmove(&pRefPic->pLongRefList[LIST_0][i], &pRefPic->pLongRefList[LIST_0][i+1], iMoveSize * sizeof(PPicture));//confirmed_safe_unsafe_usage
-				}
-				pRefPic->uiLongRefCount[LIST_0]--;
-				pRefPic->pLongRefList[LIST_0][pRefPic->uiLongRefCount[LIST_0]] = NULL;
-				return pPic;
-			}
-		}
-	}
-	return NULL;
+static PPicture WelsDelLongFromList (PRefPic pRefPic, uint32_t uiLongTermFrameIdx, ERemoveFlag eRemoveFlag) {
+  PPicture pPic = NULL;
+  int32_t i = 0;
+  for (i = 0; i < pRefPic->uiLongRefCount[LIST_0]; i++) {
+    pPic = pRefPic->pLongRefList[LIST_0][i];
+    if (pPic->iLongTermFrameIdx == (int32_t)uiLongTermFrameIdx) {
+      if (((eRemoveFlag == REMOVE_TARGET) && ! (pPic->bRefBaseFlag)) || ((eRemoveFlag == REMOVE_BASE)
+          && pPic->bRefBaseFlag)) {
+        int32_t iMoveSize = pRefPic->uiLongRefCount[LIST_0] - i - 1;
+        pPic->bUsedAsRef = FALSE;
+        pPic->bIsLongRef = FALSE;
+        if (iMoveSize > 0) {
+          memmove (&pRefPic->pLongRefList[LIST_0][i], &pRefPic->pLongRefList[LIST_0][i + 1],
+                   iMoveSize * sizeof (PPicture)); //confirmed_safe_unsafe_usage
+        }
+        pRefPic->uiLongRefCount[LIST_0]--;
+        pRefPic->pLongRefList[LIST_0][pRefPic->uiLongRefCount[LIST_0]] = NULL;
+        return pPic;
+      }
+    }
+  }
+  return NULL;
 }
 
-static PPicture WelsDelLongFromListSetUnref(PRefPic pRefPic, uint32_t uiLongTermFrameIdx, ERemoveFlag eRemoveFlag)
-{
-	PPicture pPic = WelsDelLongFromList(pRefPic,uiLongTermFrameIdx,eRemoveFlag);
-	if (pPic){
-		SetUnRef(pPic);
-	}
-	return pPic;
+static PPicture WelsDelLongFromListSetUnref (PRefPic pRefPic, uint32_t uiLongTermFrameIdx, ERemoveFlag eRemoveFlag) {
+  PPicture pPic = WelsDelLongFromList (pRefPic, uiLongTermFrameIdx, eRemoveFlag);
+  if (pPic) {
+    SetUnRef (pPic);
+  }
+  return pPic;
 }
 
-static int32_t AddShortTermToList(PRefPic pRefPic,PPicture pPic)
-{
-	pPic->bUsedAsRef = TRUE;
-	pPic->bIsLongRef = FALSE;
-	pPic->iLongTermFrameIdx = -1;
-	if (pRefPic->uiShortRefCount[LIST_0]>0)	{
-		memmove(&pRefPic->pShortRefList[LIST_0][1],&pRefPic->pShortRefList[LIST_0][0],pRefPic->uiShortRefCount[LIST_0]*sizeof(PPicture));//confirmed_safe_unsafe_usage
-	}
-	pRefPic->pShortRefList[LIST_0][0] = pPic;
-	pRefPic->uiShortRefCount[LIST_0]++;
-	return ERR_NONE;
+static int32_t AddShortTermToList (PRefPic pRefPic, PPicture pPic) {
+  pPic->bUsedAsRef = TRUE;
+  pPic->bIsLongRef = FALSE;
+  pPic->iLongTermFrameIdx = -1;
+  if (pRefPic->uiShortRefCount[LIST_0] > 0)	{
+    memmove (&pRefPic->pShortRefList[LIST_0][1], &pRefPic->pShortRefList[LIST_0][0],
+             pRefPic->uiShortRefCount[LIST_0]*sizeof (PPicture));//confirmed_safe_unsafe_usage
+  }
+  pRefPic->pShortRefList[LIST_0][0] = pPic;
+  pRefPic->uiShortRefCount[LIST_0]++;
+  return ERR_NONE;
 }
 
-static int32_t AddLongTermToList(PRefPic pRefPic,PPicture pPic, int32_t iLongTermFrameIdx)
-{
-	int32_t i = 0;
+static int32_t AddLongTermToList (PRefPic pRefPic, PPicture pPic, int32_t iLongTermFrameIdx) {
+  int32_t i = 0; // insertion position in the LIST_0 long-term list
 
-	pPic->bUsedAsRef = TRUE;
-	pPic->bIsLongRef = TRUE;
-	pPic->iLongTermFrameIdx = iLongTermFrameIdx;
-	if (pRefPic->uiLongRefCount[LIST_0] == 0){
-		pRefPic->pLongRefList[LIST_0][pRefPic->uiLongRefCount[LIST_0]] = pPic;
-	}else if (pRefPic->uiLongRefCount[LIST_0] >0){
-		for ( i = 0; i<pRefPic->uiLongRefCount[LIST_0];i++){
-			if (pRefPic->pLongRefList[LIST_0][i]->iLongTermFrameIdx > pPic->iLongTermFrameIdx)	{	
-				break;
-			}
-		}
-		memmove(&pRefPic->pLongRefList[LIST_0][i+1],&pRefPic->pLongRefList[LIST_0][i],(pRefPic->uiLongRefCount[LIST_0]-i)*sizeof(PPicture));//confirmed_safe_unsafe_usage
-		pRefPic->pLongRefList[LIST_0][i] = pPic;	
-	}else{
-		return ERR_INFO_REF_COUNT_OVERFLOW;
-	}
+  pPic->bUsedAsRef = TRUE;
+  pPic->bIsLongRef = TRUE;
+  pPic->iLongTermFrameIdx = iLongTermFrameIdx;
+  if (pRefPic->uiLongRefCount[LIST_0] == 0) { // empty list: pPic becomes entry 0
+    pRefPic->pLongRefList[LIST_0][pRefPic->uiLongRefCount[LIST_0]] = pPic;
+  } else if (pRefPic->uiLongRefCount[LIST_0] > 0) { // otherwise insert so that iLongTermFrameIdx stays in ascending order
+    for (i = 0; i < pRefPic->uiLongRefCount[LIST_0]; i++) {
+      if (pRefPic->pLongRefList[LIST_0][i]->iLongTermFrameIdx > pPic->iLongTermFrameIdx)	{
+        break; // first entry with a larger index marks the insertion point
+      }
+    }
+    memmove (&pRefPic->pLongRefList[LIST_0][i + 1], &pRefPic->pLongRefList[LIST_0][i],
+             (pRefPic->uiLongRefCount[LIST_0] - i)*sizeof (PPicture)); //confirmed_safe_unsafe_usage
+    pRefPic->pLongRefList[LIST_0][i] = pPic;
+  } else { // NOTE(review): looks unreachable if uiLongRefCount is unsigned (it is then either 0 or > 0) - confirm
+    return ERR_INFO_REF_COUNT_OVERFLOW;
+  }
 
 
-	pRefPic->uiLongRefCount[LIST_0]++;
-	return ERR_NONE;
+  pRefPic->uiLongRefCount[LIST_0]++; // NOTE(review): no upper bound on the list length is checked in this function
+  return ERR_NONE;
 }
 
-static int32_t AssignLongTermIdx(PRefPic pRefPic,int32_t iFrameNum,int32_t iLongTermFrameIdx )
-{
-	PPicture pPic = NULL;
-	int32_t iRet = ERR_NONE;
-	WelsDelLongFromListSetUnref(pRefPic,iLongTermFrameIdx,REMOVE_TARGET);
-	WelsDelLongFromListSetUnref(pRefPic,iLongTermFrameIdx,REMOVE_BASE);
+static int32_t AssignLongTermIdx (PRefPic pRefPic, int32_t iFrameNum, int32_t iLongTermFrameIdx) {
+  PPicture pPic = NULL;
+  int32_t iRet = ERR_NONE;
+  WelsDelLongFromListSetUnref (pRefPic, iLongTermFrameIdx, REMOVE_TARGET); // drop any long-term entry removed for this index (target)...
+  WelsDelLongFromListSetUnref (pRefPic, iLongTermFrameIdx, REMOVE_BASE); // ...and likewise with REMOVE_BASE
 
-	pPic = WelsDelShortFromList(pRefPic,iFrameNum,REMOVE_TARGET);
-	if (pPic){
-		iRet = AddLongTermToList(pRefPic,pPic,iLongTermFrameIdx);
-	}else{	
-		return ERR_INFO_INVALID_REF_MARKING;	
-	}
-	
-	pPic = NULL;
-	pPic = WelsDelShortFromList(pRefPic,iFrameNum,REMOVE_BASE);
-	if (pPic){	
-		iRet = AddLongTermToList(pRefPic,pPic,iLongTermFrameIdx);	
-	}
+  pPic = WelsDelShortFromList (pRefPic, iFrameNum, REMOVE_TARGET); // take the picture out of the short-term list
+  if (pPic) {
+    iRet = AddLongTermToList (pRefPic, pPic, iLongTermFrameIdx); // and re-insert it as a long-term reference
+  } else {
+    return ERR_INFO_INVALID_REF_MARKING; // nothing was removed for iFrameNum with REMOVE_TARGET
+  }
 
-	return iRet;
+  pPic = NULL; // redundant: overwritten on the next line
+  pPic = WelsDelShortFromList (pRefPic, iFrameNum, REMOVE_BASE); // same again with REMOVE_BASE
+  if (pPic) { // unlike above, a NULL result here is not treated as an error
+    iRet = AddLongTermToList (pRefPic, pPic, iLongTermFrameIdx);
+  }
+
+  return iRet; // status of the last AddLongTermToList call
 }
 
-static int32_t MarkAsLongTerm( PRefPic pRefPic,int32_t iFrameNum, int32_t iLongTermFrameIdx )
-{
-	PPicture pPic = NULL;
-	int32_t i = 0;
-	int32_t iRet = ERR_NONE;
-	WelsDelLongFromListSetUnref(pRefPic,iLongTermFrameIdx,REMOVE_TARGET);
-	WelsDelLongFromListSetUnref(pRefPic,iLongTermFrameIdx,REMOVE_BASE);
+static int32_t MarkAsLongTerm (PRefPic pRefPic, int32_t iFrameNum, int32_t iLongTermFrameIdx) {
+  PPicture pPic = NULL;
+  int32_t i = 0;
+  int32_t iRet = ERR_NONE; // stays ERR_NONE when no picture matches
+  WelsDelLongFromListSetUnref (pRefPic, iLongTermFrameIdx, REMOVE_TARGET); // drop any long-term entry removed for this index (target)...
+  WelsDelLongFromListSetUnref (pRefPic, iLongTermFrameIdx, REMOVE_BASE); // ...and likewise with REMOVE_BASE
 
-	for (i = 0; i<pRefPic->uiRefCount[LIST_0];i++)	{
-		pPic = pRefPic->pRefList[LIST_0][i];
-		if ( pPic->iFrameNum == iFrameNum && !pPic->bIsLongRef){
-			iRet = AddLongTermToList(pRefPic,pPic,iLongTermFrameIdx);
-		}
-	}
-	
-	return iRet;
+  for (i = 0; i < pRefPic->uiRefCount[LIST_0]; i++)	{ // scan the LIST_0 reference list
+    pPic = pRefPic->pRefList[LIST_0][i];
+    if (pPic->iFrameNum == iFrameNum && !pPic->bIsLongRef) { // every non-long-term picture with this frame number
+      iRet = AddLongTermToList (pRefPic, pPic, iLongTermFrameIdx); // NOTE(review): the picture is not removed from the short-term list here - confirm intended
+    }
+  }
+
+  return iRet; // status of the last AddLongTermToList call, if any
 }
 
 } // namespace WelsDec
\ No newline at end of file
--- a/codec/decoder/core/src/mc.cpp
+++ b/codec/decoder/core/src/mc.cpp
@@ -46,331 +46,312 @@
 namespace WelsDec {
 
 /*------------------weight for chroma fraction pixel interpolation------------------*/
-//iA = (8 - dx) * (8 - dy);   
-//iB = dx * (8 - dy);   
+//iA = (8 - dx) * (8 - dy);
+//iB = dx * (8 - dy);
 //iC = (8 - dx) * dy;
 //iD = dx * dy
-static const uint8_t g_kuiABCD[8][8][4] =	//g_kA[dy][dx], g_kB[dy][dx], g_kC[dy][dx], g_kD[dy][dx]
-{
-	{	
-		{64, 0, 0, 0},{56, 8, 0, 0},{48, 16, 0, 0},{40, 24, 0, 0},
-		{32, 32, 0, 0},{24, 40, 0, 0},{16, 48, 0, 0},{8, 56, 0, 0}
-	},
-	{	
-		{56, 0, 8, 0},{49, 7, 7, 1},{42, 14, 6, 2},{35, 21, 5, 3},
-		{28, 28, 4, 4},{21, 35, 3, 5},{14, 42, 2, 6},{7, 49, 1, 7}
-	},
-	{	
-		{48, 0, 16, 0},{42, 6, 14, 2},{36, 12, 12, 4},{30, 18, 10, 6},
-		{24, 24, 8, 8},{18, 30, 6, 10},{12, 36, 4, 12},{6, 42, 2, 14}
-	},
-	{	
-		{40, 0, 24, 0},{35, 5, 21, 3},{30, 10, 18, 6},{25, 15, 15, 9},
-		{20, 20, 12, 12},{15, 25, 9, 15},{10, 30, 6, 18},{5, 35, 3, 21}
-	},
-	{	
-		{32, 0, 32, 0},{28, 4, 28, 4},{24, 8, 24, 8},{20, 12, 20, 12},
-		{16, 16, 16, 16},{12, 20, 12, 20},{8, 24, 8, 24},{4, 28, 4, 28}
-	},
-	{	
-		{24, 0, 40, 0},{21, 3, 35, 5},{18, 6, 30, 10},{15, 9, 25, 15},
-		{12, 12, 20, 20},{9, 15, 15, 25},{6, 18, 10, 30},{3, 21, 5, 35}
-	},
-	{	
-		{16, 0, 48, 0},{14, 2, 42, 6},{12, 4, 36, 12},{10, 6, 30, 18},
-		{8, 8, 24, 24},{6, 10, 18, 30},{4, 12, 12, 36},{2, 14, 6, 42}
-	},
-	{	
-		{8, 0, 56, 0},{7, 1, 49, 7},{6, 2, 42, 14},{5, 3, 35, 21},
-		{4, 4, 28, 28},{3, 5, 21, 35},{2, 6, 14, 42},{1, 7, 7, 49}
-	}
+static const uint8_t g_kuiABCD[8][8][4] = {	// indexed [dy][dx]; each entry is {iA, iB, iC, iD} from the formulas above (the four always sum to 64)
+  {
+    {64, 0, 0, 0}, {56, 8, 0, 0}, {48, 16, 0, 0}, {40, 24, 0, 0},
+    {32, 32, 0, 0}, {24, 40, 0, 0}, {16, 48, 0, 0}, {8, 56, 0, 0}
+  },
+  {
+    {56, 0, 8, 0}, {49, 7, 7, 1}, {42, 14, 6, 2}, {35, 21, 5, 3},
+    {28, 28, 4, 4}, {21, 35, 3, 5}, {14, 42, 2, 6}, {7, 49, 1, 7}
+  },
+  {
+    {48, 0, 16, 0}, {42, 6, 14, 2}, {36, 12, 12, 4}, {30, 18, 10, 6},
+    {24, 24, 8, 8}, {18, 30, 6, 10}, {12, 36, 4, 12}, {6, 42, 2, 14}
+  },
+  {
+    {40, 0, 24, 0}, {35, 5, 21, 3}, {30, 10, 18, 6}, {25, 15, 15, 9},
+    {20, 20, 12, 12}, {15, 25, 9, 15}, {10, 30, 6, 18}, {5, 35, 3, 21}
+  },
+  {
+    {32, 0, 32, 0}, {28, 4, 28, 4}, {24, 8, 24, 8}, {20, 12, 20, 12},
+    {16, 16, 16, 16}, {12, 20, 12, 20}, {8, 24, 8, 24}, {4, 28, 4, 28}
+  },
+  {
+    {24, 0, 40, 0}, {21, 3, 35, 5}, {18, 6, 30, 10}, {15, 9, 25, 15},
+    {12, 12, 20, 20}, {9, 15, 15, 25}, {6, 18, 10, 30}, {3, 21, 5, 35}
+  },
+  {
+    {16, 0, 48, 0}, {14, 2, 42, 6}, {12, 4, 36, 12}, {10, 6, 30, 18},
+    {8, 8, 24, 24}, {6, 10, 18, 30}, {4, 12, 12, 36}, {2, 14, 6, 42}
+  },
+  {
+    {8, 0, 56, 0}, {7, 1, 49, 7}, {6, 2, 42, 14}, {5, 3, 35, 21},
+    {4, 4, 28, 28}, {3, 5, 21, 35}, {2, 6, 14, 42}, {1, 7, 7, 49}
+  }
 };
 
-typedef void_t (*PWelsMcWidthHeightFunc)(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
+typedef void_t (*PWelsMcWidthHeightFunc) (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int32_t iWidth, int32_t iHeight); // common signature of the block routines that McLuma_c dispatches to
 
 //***************************************************************************//
 //                          C code implementation                            //
 //***************************************************************************//
-static inline void_t McCopyWidthEq2_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
-{
-	int32_t i;
-	for (i = 0; i < iHeight; i++)// iWidth == 2 only for chroma
-	{
-		ST16(pDst, LD16(pSrc));
-		pDst += iDstStride;
-		pSrc += iSrcStride;
-	}
+static inline void_t McCopyWidthEq2_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                       int32_t iHeight) { // copy iHeight rows, one ST16/LD16 pair per row
+  int32_t i;
+  for (i = 0; i < iHeight; i++) { // iWidth == 2 only for chroma
+    ST16 (pDst, LD16 (pSrc)); // project macros; presumably a 16-bit (2-byte) load and store - confirm
+    pDst += iDstStride;
+    pSrc += iSrcStride;
+  }
 }
 
-static inline void_t McCopyWidthEq4_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
-{
-	int32_t i;
-	for (i = 0; i < iHeight; i++)
-	{
-		ST32(pDst, LD32(pSrc));
-		pDst += iDstStride;
-		pSrc += iSrcStride;
-	}
+static inline void_t McCopyWidthEq4_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                       int32_t iHeight) { // copy iHeight rows, one ST32/LD32 pair per row
+  int32_t i;
+  for (i = 0; i < iHeight; i++) {
+    ST32 (pDst, LD32 (pSrc)); // project macros; presumably a 32-bit (4-byte) load and store - confirm
+    pDst += iDstStride;
+    pSrc += iSrcStride;
+  }
 }
 
-static inline void_t McCopyWidthEq8_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
-{
-	int32_t i;
-	for (i = 0; i < iHeight; i++)
-	{
-		ST64(pDst, LD64(pSrc));
-		pDst += iDstStride;
-		pSrc += iSrcStride;
-	}
+static inline void_t McCopyWidthEq8_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                       int32_t iHeight) { // copy iHeight rows, one ST64/LD64 pair per row
+  int32_t i;
+  for (i = 0; i < iHeight; i++) {
+    ST64 (pDst, LD64 (pSrc)); // project macros; presumably a 64-bit (8-byte) load and store - confirm
+    pDst += iDstStride;
+    pSrc += iSrcStride;
+  }
 }
 
-static inline void_t McCopyWidthEq16_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
-{
-	int32_t i;
-	for (i = 0; i < iHeight; i++)
-	{
-		ST64(pDst  , LD64(pSrc));
-		ST64(pDst+8, LD64(pSrc+8));
-		pDst += iDstStride;
-		pSrc += iSrcStride;
-	}
+static inline void_t McCopyWidthEq16_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iHeight) { // copy iHeight rows, two ST64/LD64 pairs per row
+  int32_t i;
+  for (i = 0; i < iHeight; i++) {
+    ST64 (pDst  , LD64 (pSrc)); // bytes 0..7 of the row (assuming ST64/LD64 move 8 bytes - confirm)
+    ST64 (pDst + 8, LD64 (pSrc + 8)); // bytes 8..15 of the row
+    pDst += iDstStride;
+    pSrc += iSrcStride;
+  }
 }
 
 //--------------------Luma sample MC------------------//
 
-static inline int32_t HorFilterInput16bit_c(int16_t* pSrc)
-{
-	int32_t iPix05 = pSrc[-2] + pSrc[3];
-	int32_t iPix14 = pSrc[-1] + pSrc[2];
-	int32_t iPix23 = pSrc[ 0] + pSrc[1];
-	
-	return (iPix05 - ((iPix14<<2)+iPix14) + (iPix23<<4) + (iPix23<<2));
+static inline int32_t HorFilterInput16bit_c (int16_t* pSrc) { // 6-tap filter, weights (1, -5, 20, 20, -5, 1) on pSrc[-2..3]
+  int32_t iPix05 = pSrc[-2] + pSrc[3];
+  int32_t iPix14 = pSrc[-1] + pSrc[2];
+  int32_t iPix23 = pSrc[ 0] + pSrc[1];
+  // Multiply rather than shift: the sums can be negative, and left-shifting a negative value is undefined before C++20.
+  return (iPix05 - 5 * iPix14 + 20 * iPix23);
 }
 // h: iOffset=1 / v: iOffset=iSrcStride
-static inline int32_t FilterInput8bitWithStride_c(uint8_t* pSrc, const int32_t kiOffset)
-{
-	const int32_t kiOffset1 = kiOffset;
-	const int32_t kiOffset2 = (kiOffset << 1);
-	const int32_t kiOffset3 = kiOffset + kiOffset2;
-	const uint32_t kuiPix05   = *(pSrc - kiOffset2) + *(pSrc + kiOffset3);
-	const uint32_t kuiPix14   = *(pSrc - kiOffset1) + *(pSrc + kiOffset2);
-	const uint32_t kuiPix23   = *(pSrc           ) + *(pSrc + kiOffset1);
+static inline int32_t FilterInput8bitWithStride_c (uint8_t* pSrc, const int32_t kiOffset) { // 6-tap filter over samples spaced kiOffset apart
+  const int32_t kiOffset1 = kiOffset;
+  const int32_t kiOffset2 = (kiOffset << 1);
+  const int32_t kiOffset3 = kiOffset + kiOffset2;
+  const uint32_t kuiPix05   = * (pSrc - kiOffset2) + * (pSrc + kiOffset3); // outer pair, weight 1
+  const uint32_t kuiPix14   = * (pSrc - kiOffset1) + * (pSrc + kiOffset2); // middle pair, weight -5
+  const uint32_t kuiPix23   = * (pSrc) + * (pSrc + kiOffset1); // inner pair, weight 20
 
-	return (kuiPix05 - ((kuiPix14<<2)+kuiPix14) + (kuiPix23<<4) + (kuiPix23<<2));
+  return (kuiPix05 - ((kuiPix14 << 2) + kuiPix14) + (kuiPix23 << 4) + (kuiPix23 << 2)); // computed in uint32_t, then converted to the int32_t return type
 }
 
-static inline void_t PixelAvg_c(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, int32_t iSrcAStride,
-										uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight)
-{
-	int32_t i, j;
-	for (i = 0; i < iHeight; i++)
-	{
-		for (j = 0; j < iWidth; j++) 
-		{
-			pDst[j] = (pSrcA[j] + pSrcB[j] + 1) >> 1;
-		}
-		pDst  += iDstStride;
-		pSrcA += iSrcAStride;
-		pSrcB += iSrcBStride;
-	}
+static inline void_t PixelAvg_c (uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, int32_t iSrcAStride,
+                                 uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) { // pDst = per-pixel average of blocks A and B
+  int32_t i, j;
+  for (i = 0; i < iHeight; i++) {
+    for (j = 0; j < iWidth; j++) {
+      pDst[j] = (pSrcA[j] + pSrcB[j] + 1) >> 1; // +1 rounds halves upwards
+    }
+    pDst  += iDstStride; // each of the three buffers advances by its own stride
+    pSrcA += iSrcAStride;
+    pSrcB += iSrcBStride;
+  }
 }
-static inline void_t McCopy_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	if (iWidth == 16)
-		McCopyWidthEq16_c(pSrc,iSrcStride,pDst,iDstStride,iHeight);
-	else if(iWidth == 8)
-		McCopyWidthEq8_c(pSrc,iSrcStride,pDst,iDstStride,iHeight);
-	else if(iWidth == 4)
-		McCopyWidthEq4_c(pSrc,iSrcStride,pDst,iDstStride,iHeight);	
-	else //here iWidth == 2
-		McCopyWidthEq2_c(pSrc,iSrcStride,pDst,iDstStride,iHeight);	
+static inline void_t McCopy_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                               int32_t iHeight) { // dispatch to the fixed-width copy routine matching iWidth
+  if (iWidth == 16)
+    McCopyWidthEq16_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 8)
+    McCopyWidthEq8_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 4)
+    McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else //here iWidth == 2 is assumed: any width other than 16, 8 or 4 also takes this path
+    McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
 }
 
-static inline void_t McHorVer20_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	int32_t i, j;
-	for (i = 0; i < iHeight; i++) 
-	{
-		for (j = 0; j < iWidth; j++)
-		{
-			pDst[j] = WELS_CLIP1((FilterInput8bitWithStride_c(pSrc+j,1)+16)>>5);
-		}
-		pDst += iDstStride;
-		pSrc += iSrcStride;
-	}
+static inline void_t McHorVer20_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // 6-tap filter along each row (sample spacing 1)
+  int32_t i, j;
+  for (i = 0; i < iHeight; i++) {
+    for (j = 0; j < iWidth; j++) {
+      pDst[j] = WELS_CLIP1 ((FilterInput8bitWithStride_c (pSrc + j, 1) + 16) >> 5); // round and divide by 32; WELS_CLIP1 presumably clamps to the 8-bit range - confirm
+    }
+    pDst += iDstStride;
+    pSrc += iSrcStride;
+  }
 }
 
-static inline void_t McHorVer02_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	int32_t i, j;
-	for (i = 0; i < iHeight; i++)
-	{
-		for (j = 0; j < iWidth; j++) 
-		{
-			pDst[j] = WELS_CLIP1((FilterInput8bitWithStride_c(pSrc+j, iSrcStride)+16)>>5);
-		}
-		pDst += iDstStride;
-		pSrc += iSrcStride;
-	}
+static inline void_t McHorVer02_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // 6-tap filter along each column (sample spacing iSrcStride)
+  int32_t i, j;
+  for (i = 0; i < iHeight; i++) {
+    for (j = 0; j < iWidth; j++) {
+      pDst[j] = WELS_CLIP1 ((FilterInput8bitWithStride_c (pSrc + j, iSrcStride) + 16) >> 5); // round and divide by 32; WELS_CLIP1 presumably clamps to the 8-bit range - confirm
+    }
+    pDst += iDstStride;
+    pSrc += iSrcStride;
+  }
 }
 
-static inline void_t McHorVer22_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	int16_t iTmp[16+5] = {0}; //16
-	int32_t i, j, k;
+static inline void_t McHorVer22_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // column filter followed by row filter, rounded once at the end
+  int16_t iTmp[16 + 5] = {0}; // one row of unscaled column-filter results; iWidth + 5 entries are written, so iWidth must be <= 16 (not checked)
+  int32_t i, j, k;
 
-	for (i = 0; i < iHeight; i++)
-	{
-		for (j = 0; j < iWidth + 5; j++)
-		{
-			iTmp[j] = FilterInput8bitWithStride_c(pSrc-2+j, iSrcStride);
-		}
-		for (k = 0; k < iWidth; k++)
-		{
-			pDst[k] = WELS_CLIP1((HorFilterInput16bit_c(&iTmp[2+k])+512)>>10);
-		}		
-		pSrc += iSrcStride;
-		pDst += iDstStride;
-	}
+  for (i = 0; i < iHeight; i++) {
+    for (j = 0; j < iWidth + 5; j++) { // columns pSrc[-2] .. pSrc[iWidth + 2]
+      iTmp[j] = FilterInput8bitWithStride_c (pSrc - 2 + j, iSrcStride);
+    }
+    for (k = 0; k < iWidth; k++) {
+      pDst[k] = WELS_CLIP1 ((HorFilterInput16bit_c (&iTmp[2 + k]) + 512) >> 10); // round and divide by 1024 (32 * 32)
+    }
+    pSrc += iSrcStride;
+    pDst += iDstStride;
+  }
 }
 
-/////////////////////luma MC////////////////////////// 
-static inline void_t McHorVer01_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	uint8_t uiTmp[256] = { 0 };
-	McHorVer02_c(pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);	
-	PixelAvg_c(pDst, iDstStride, pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
+/////////////////////luma MC//////////////////////////
+static inline void_t McHorVer01_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // average of the source block and its McHorVer02_c (column-filtered) version
+  uint8_t uiTmp[256] = { 0 }; // 16x16 scratch used with stride 16: iWidth and iHeight must be <= 16 (not checked)
+  McHorVer02_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
 }
-static inline void_t McHorVer03_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	uint8_t uiTmp[256] = { 0 };
-	McHorVer02_c(pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);	
-	PixelAvg_c(pDst, iDstStride, pSrc+iSrcStride, iSrcStride, uiTmp, 16, iWidth, iHeight);
+static inline void_t McHorVer03_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // average of the source block one row down and the McHorVer02_c result
+  uint8_t uiTmp[256] = { 0 }; // 16x16 scratch used with stride 16: iWidth and iHeight must be <= 16 (not checked)
+  McHorVer02_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, uiTmp, 16, iWidth, iHeight);
 }
-static inline void_t McHorVer10_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	uint8_t uiTmp[256] = { 0 };
-	McHorVer20_c(pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);	
-	PixelAvg_c(pDst, iDstStride, pSrc, iSrcStride, uiTmp, 16,iWidth, iHeight);
+static inline void_t McHorVer10_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // average of the source block and its McHorVer20_c (row-filtered) version
+  uint8_t uiTmp[256] = { 0 }; // 16x16 scratch used with stride 16: iWidth and iHeight must be <= 16 (not checked)
+  McHorVer20_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
 }
-static inline void_t McHorVer11_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	uint8_t uiHorTmp[256] = { 0 };
-	uint8_t uiVerTmp[256] = { 0 };
-	McHorVer20_c(pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
-	McHorVer02_c(pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
-	PixelAvg_c(pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
+static inline void_t McHorVer11_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // average of the McHorVer20_c and McHorVer02_c results at pSrc
+  uint8_t uiHorTmp[256] = { 0 }; // 16x16 scratch buffers used with stride 16: iWidth and iHeight must be <= 16 (not checked)
+  uint8_t uiVerTmp[256] = { 0 };
+  McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
+  McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
 }
-static inline void_t McHorVer12_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	uint8_t uiVerTmp[256] = { 0 };
-	uint8_t uiCtrTmp[256] = { 0 };
-	McHorVer02_c(pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
-	McHorVer22_c(pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
-	PixelAvg_c(pDst, iDstStride, uiVerTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
+static inline void_t McHorVer12_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // average of the McHorVer02_c and McHorVer22_c results at pSrc
+  uint8_t uiVerTmp[256] = { 0 }; // 16x16 scratch buffers used with stride 16: iWidth and iHeight must be <= 16 (not checked)
+  uint8_t uiCtrTmp[256] = { 0 };
+  McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
+  McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, uiVerTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
 }
-static inline void_t McHorVer13_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	uint8_t uiHorTmp[256] = { 0 };
-	uint8_t uiVerTmp[256] = { 0 };
-	McHorVer20_c(pSrc+iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
-	McHorVer02_c(pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
-	PixelAvg_c(pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
+static inline void_t McHorVer13_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // average of McHorVer20_c taken one row down and McHorVer02_c at pSrc
+  uint8_t uiHorTmp[256] = { 0 }; // 16x16 scratch buffers used with stride 16: iWidth and iHeight must be <= 16 (not checked)
+  uint8_t uiVerTmp[256] = { 0 };
+  McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
+  McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
 }
-static inline void_t McHorVer21_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{	
-	uint8_t uiHorTmp[256] = { 0 };
-	uint8_t uiCtrTmp[256] = { 0 };
-	McHorVer20_c(pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
-	McHorVer22_c(pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
-	PixelAvg_c(pDst, iDstStride, uiHorTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
+static inline void_t McHorVer21_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // average of the McHorVer20_c and McHorVer22_c results at pSrc
+  uint8_t uiHorTmp[256] = { 0 }; // 16x16 scratch buffers used with stride 16: iWidth and iHeight must be <= 16 (not checked)
+  uint8_t uiCtrTmp[256] = { 0 };
+  McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
+  McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
 }
-static inline void_t McHorVer23_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{	
-	uint8_t uiHorTmp[256] = { 0 };
-	uint8_t uiCtrTmp[256] = { 0 };
-	McHorVer20_c(pSrc+iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
-	McHorVer22_c(pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
-	PixelAvg_c(pDst, iDstStride, uiHorTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
+static inline void_t McHorVer23_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // average of McHorVer20_c taken one row down and McHorVer22_c at pSrc
+  uint8_t uiHorTmp[256] = { 0 }; // 16x16 scratch buffers used with stride 16: iWidth and iHeight must be <= 16 (not checked)
+  uint8_t uiCtrTmp[256] = { 0 };
+  McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
+  McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
 }
-static inline void_t McHorVer30_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	uint8_t uiHorTmp[256] = { 0 };
-	McHorVer20_c(pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
-	PixelAvg_c(pDst, iDstStride, pSrc+1, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
+static inline void_t McHorVer30_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // average of the source block one column right and the McHorVer20_c result
+  uint8_t uiHorTmp[256] = { 0 }; // 16x16 scratch used with stride 16: iWidth and iHeight must be <= 16 (not checked)
+  McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, pSrc + 1, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
 }
-static inline void_t McHorVer31_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	uint8_t uiHorTmp[256] = { 0 };
-	uint8_t uiVerTmp[256] = { 0 };
-	McHorVer20_c(pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
-	McHorVer02_c(pSrc+1, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
-	PixelAvg_c(pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
+static inline void_t McHorVer31_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // average of McHorVer20_c at pSrc and McHorVer02_c taken one column right
+  uint8_t uiHorTmp[256] = { 0 }; // 16x16 scratch buffers used with stride 16: iWidth and iHeight must be <= 16 (not checked)
+  uint8_t uiVerTmp[256] = { 0 };
+  McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
+  McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
 }
-static inline void_t McHorVer32_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	uint8_t uiVerTmp[256] = { 0 };
-	uint8_t uiCtrTmp[256] = { 0 };
-	McHorVer02_c(pSrc+1, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
-	McHorVer22_c(pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
-	PixelAvg_c(pDst, iDstStride, uiVerTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
+static inline void_t McHorVer32_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // average of McHorVer02_c taken one column right and McHorVer22_c at pSrc
+  uint8_t uiVerTmp[256] = { 0 }; // 16x16 scratch buffers used with stride 16: iWidth and iHeight must be <= 16 (not checked)
+  uint8_t uiCtrTmp[256] = { 0 };
+  McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
+  McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, uiVerTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
 }
-static inline void_t McHorVer33_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	uint8_t uiHorTmp[256] = { 0 };
-	uint8_t uiVerTmp[256] = { 0 };
-	McHorVer20_c(pSrc+iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
-	McHorVer02_c(pSrc+1, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
-	PixelAvg_c(pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
+static inline void_t McHorVer33_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // average of McHorVer20_c one row down and McHorVer02_c one column right
+  uint8_t uiHorTmp[256] = { 0 }; // 16x16 scratch buffers used with stride 16: iWidth and iHeight must be <= 16 (not checked)
+  uint8_t uiVerTmp[256] = { 0 };
+  McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
+  McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
 }
 
-void_t McLuma_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-			      int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
-				  //pSrc has been added the offset of mv
+void_t McLuma_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                 int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
+// pSrc must already include the motion-vector offset; only the low two bits of iMvX/iMvY are used here
 {
-    PWelsMcWidthHeightFunc pWelsMcFunc[4][4] =  //[x][y]   
-    {
-		{McCopy_c,      McHorVer01_c, McHorVer02_c, McHorVer03_c},
-        {McHorVer10_c,  McHorVer11_c, McHorVer12_c, McHorVer13_c},
-        {McHorVer20_c,  McHorVer21_c, McHorVer22_c, McHorVer23_c},
-        {McHorVer30_c,  McHorVer31_c, McHorVer32_c, McHorVer33_c},
-    };
+  static const PWelsMcWidthHeightFunc kpWelsMcFunc[4][4] = { // [iMvX & 3][iMvY & 3]; static const so it is not rebuilt per call
+    {McCopy_c,      McHorVer01_c, McHorVer02_c, McHorVer03_c},
+    {McHorVer10_c,  McHorVer11_c, McHorVer12_c, McHorVer13_c},
+    {McHorVer20_c,  McHorVer21_c, McHorVer22_c, McHorVer23_c},
+    {McHorVer30_c,  McHorVer31_c, McHorVer32_c, McHorVer33_c},
+  };
 
-    pWelsMcFunc[iMvX&0x03][iMvY&0x03](pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+  kpWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); // run the routine selected by the two fractions
 }
 
-static inline void_t McChromaWithFragMv_c( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight )
-{
-	int32_t i, j;
-	int32_t iA, iB, iC, iD;
-	uint8_t* pSrcNext = pSrc + iSrcStride;
-	const uint32_t kuiABCD = *((uint32_t *)g_kuiABCD[iMvY&0x07][iMvX&0x07]);
-	iA = (kuiABCD      ) & 0xff;
-	iB = (kuiABCD >>  8) & 0xff;
-	iC = (kuiABCD >> 16) & 0xff;
-	iD = (kuiABCD >> 24) & 0xff;
-	for (i = 0; i < iHeight; i++)
-	{
-		for (j = 0; j < iWidth; j++)
-		{
-			pDst[j] = (iA * pSrc[j] + iB * pSrc[j+1] + iC * pSrcNext[j] + iD * pSrcNext[j+1] + 32) >> 6;
-		}
-		pDst     += iDstStride;
-		pSrc      = pSrcNext;
-		pSrcNext += iSrcStride;
-	}
+// Weighted 2x2 chroma interpolation; the four weights come from g_kuiABCD[iMvY & 7][iMvX & 7].
+static inline void_t McChromaWithFragMv_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
+  // Read the weights byte by byte rather than through a uint32_t cast of the uint8_t table:
+  // the cast depends on host byte order (weights come out reversed on big-endian) and on table alignment.
+  const uint8_t* pABCD = g_kuiABCD[iMvY & 0x07][iMvX & 0x07];
+  const int32_t iA = pABCD[0], iB = pABCD[1], iC = pABCD[2], iD = pABCD[3];
+  uint8_t* pSrcNext = pSrc + iSrcStride; // source row directly below the current one
+  int32_t i, j;
+  // Reads iWidth + 1 columns and iHeight + 1 rows of the source.
+  for (i = 0; i < iHeight; i++) {
+    for (j = 0; j < iWidth; j++) {
+      pDst[j] = (iA * pSrc[j] + iB * pSrc[j + 1] + iC * pSrcNext[j] + iD * pSrcNext[j + 1] + 32) >> 6; // +32: round before dividing by 64
+    }
+    pDst     += iDstStride;
+    pSrc      = pSrcNext;
+    pSrcNext += iSrcStride;
+  }
 }
 
-void_t McChroma_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-			        int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
-					//pSrc has been added the offset of mv
+void_t McChroma_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                   int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
+// pSrc must already include the motion-vector offset; only the low three bits of iMvX/iMvY are used here
 {
-	const int32_t kiD8x = iMvX&0x07;
-	const int32_t kiD8y = iMvY&0x07;
-	if (0 == kiD8x && 0 == kiD8y)
-		McCopy_c(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
-	else
-		McChromaWithFragMv_c(pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
+  const int32_t kiD8x = iMvX & 0x07; // fractional part of the x component, 0..7
+  const int32_t kiD8y = iMvY & 0x07; // fractional part of the y component, 0..7
+  if (0 == kiD8x && 0 == kiD8y) // no fractional part: plain block copy
+    McCopy_c (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+  else // otherwise interpolate with the g_kuiABCD weights
+    McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
 }
 
 #if defined(X86_ASM)
@@ -377,380 +358,312 @@
 //***************************************************************************//
 //                       SSE2 implement                          //
 //***************************************************************************//
-static inline void_t McHorVer22WidthEq8_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight )
-{
-	ENFORCE_STACK_ALIGN_2D(int16_t, iTap, 21, 8, 16)
-	McHorVer22Width8HorFirst_sse2(pSrc-2, iSrcStride, (uint8_t *)iTap,16,iHeight+5);
-	McHorVer22VerLast_sse2((uint8_t *)iTap,16, pDst, iDstStride, 8, iHeight);
+static inline void_t McHorVer22WidthEq8_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int32_t iHeight) { // 8-wide "22" filter: two assembly passes chained through an int16_t scratch buffer
+  ENFORCE_STACK_ALIGN_2D (int16_t, iTap, 21, 8, 16) // scratch iTap; presumably 21 x 8 int16_t, 16-byte aligned - TODO confirm macro
+  McHorVer22Width8HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)iTap, 16, iHeight + 5); // pass 1: starts 2 bytes before pSrc, emits iHeight + 5 rows (16-byte stride)
+  McHorVer22VerLast_sse2 ((uint8_t*)iTap, 16, pDst, iDstStride, 8, iHeight); // pass 2: reduces the scratch rows to 8 x iHeight output; NOTE(review): 21 rows would cap iHeight at 16
 }
 
-static inline void_t McHorVer02WidthEq16_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight )
-{
-	McHorVer02WidthEq8_sse2( pSrc,     iSrcStride, pDst,     iDstStride, iHeight );
-	McHorVer02WidthEq8_sse2( &pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight );
+static inline void_t McHorVer02WidthEq16_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int32_t iHeight) { // 16-wide "02" filter expressed as two independent 8-wide halves
+  McHorVer02WidthEq8_sse2 (pSrc,     iSrcStride, pDst,     iDstStride, iHeight); // columns 0..7
+  McHorVer02WidthEq8_sse2 (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight); // columns 8..15
 }
 
-static inline void_t McHorVer22WidthEq16_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight )
-{
-	McHorVer22WidthEq8_sse2( pSrc,     iSrcStride, pDst,     iDstStride, iHeight );
-	McHorVer22WidthEq8_sse2( &pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight );
+static inline void_t McHorVer22WidthEq16_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int32_t iHeight) { // 16-wide "22" filter expressed as two independent 8-wide halves
+  McHorVer22WidthEq8_sse2 (pSrc,     iSrcStride, pDst,     iDstStride, iHeight); // columns 0..7
+  McHorVer22WidthEq8_sse2 (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight); // columns 8..15
 }
 
-static inline void_t McCopy_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	if (iWidth == 16)
-		McCopyWidthEq16_sse2(pSrc,iSrcStride,pDst,iDstStride,iHeight);
-	else if(iWidth == 8)
-		McCopyWidthEq8_mmx(pSrc,iSrcStride,pDst,iDstStride,iHeight);
-	else if(iWidth ==4)
-		McCopyWidthEq4_mmx(pSrc,iSrcStride,pDst,iDstStride,iHeight);	
-	else
-		McCopyWidthEq2_c(pSrc,iSrcStride,pDst,iDstStride,iHeight);	
+static inline void_t McCopy_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                  int32_t iHeight) { // block copy; picks a width-specific routine
+  if (iWidth == 16)
+    McCopyWidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 8)
+    McCopyWidthEq8_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 4)
+    McCopyWidthEq4_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else // every remaining width is routed to the 2-wide C routine
+    McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
 }
 
-static inline void_t McHorVer20_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	if (iWidth == 16)
-		McHorVer20WidthEq16_sse2(pSrc,iSrcStride,pDst,iDstStride,iHeight);
-	else if(iWidth == 8)
-		McHorVer20WidthEq8_sse2(pSrc,iSrcStride,pDst,iDstStride,iHeight);
-	else
-		McHorVer20WidthEq4_mmx(pSrc,iSrcStride,pDst,iDstStride,iHeight);
+static inline void_t McHorVer20_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) { // "20" filter; picks a width-specific assembly routine
+  if (iWidth == 16)
+    McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 8)
+    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else // any other width is handled as 4-wide
+    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
 }
 
-static inline void_t McHorVer02_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	if (iWidth == 16)
-		McHorVer02WidthEq16_sse2(pSrc,iSrcStride,pDst,iDstStride,iHeight);
-	else if(iWidth == 8)
-		McHorVer02WidthEq8_sse2(pSrc,iSrcStride,pDst,iDstStride,iHeight);
-	else
-		McHorVer02_c(pSrc,iSrcStride,pDst,iDstStride, 4, iHeight);
+static inline void_t McHorVer02_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) { // "02" filter; picks a width-specific routine
+  if (iWidth == 16)
+    McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 8)
+    McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else // any other width falls back to the C routine with width fixed at 4
+    McHorVer02_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight);
 }
 
-static inline void_t McHorVer22_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	if (iWidth == 16)
-		McHorVer22WidthEq16_sse2(pSrc,iSrcStride,pDst,iDstStride,iHeight);
-	else if(iWidth == 8)
-		McHorVer22WidthEq8_sse2(pSrc,iSrcStride,pDst,iDstStride,iHeight);
-	else
-		McHorVer22_c(pSrc,iSrcStride,pDst,iDstStride,4, iHeight);
+static inline void_t McHorVer22_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) { // "22" filter; picks a width-specific routine
+  if (iWidth == 16)
+    McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 8)
+    McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else // any other width falls back to the C routine with width fixed at 4
+    McHorVer22_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight);
 }
 
-static inline void_t McHorVer01_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	FORCE_STACK_ALIGN_1D( uint8_t, pTmp, 256, 16 );
-	if (iWidth == 16)
-	{
-		McHorVer02WidthEq16_sse2(pSrc, iSrcStride, pTmp, 16, iHeight);	
-		PixelAvgWidthEq16_sse2(pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
-	}
-	else if(iWidth == 8)
-	{
-		McHorVer02WidthEq8_sse2(pSrc, iSrcStride, pTmp, 16, iHeight);	
-		PixelAvgWidthEq8_mmx(pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
-	}
-	else
-	{
-		McHorVer02_c(pSrc, iSrcStride, pTmp, 16, 4, iHeight);	
-		PixelAvgWidthEq4_mmx(pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
-	}
+static inline void_t McHorVer01_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) { // "01": average of the source block and its "02" filter result
+  FORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16); // scratch block written with a 16-byte row stride; presumably 256 bytes, 16-aligned
+  if (iWidth == 16) {
+    McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight); // "02" result into scratch
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight); // pDst = average of pSrc and scratch
+  } else if (iWidth == 8) {
+    McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+  } else { // any other width is handled as 4-wide
+    McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight);
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+  }
 }
-static inline void_t McHorVer03_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	FORCE_STACK_ALIGN_1D( uint8_t, pTmp, 256, 16 );
-	if (iWidth == 16)
-	{
-		McHorVer02WidthEq16_sse2(pSrc, iSrcStride, pTmp, 16, iHeight);	
-		PixelAvgWidthEq16_sse2(pDst, iDstStride, pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
-	}
-	else if(iWidth == 8)
-	{
-		McHorVer02WidthEq8_sse2(pSrc, iSrcStride, pTmp, 16, iHeight);	
-		PixelAvgWidthEq8_mmx(pDst, iDstStride, pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
-	}
-	else
-	{
-		McHorVer02_c(pSrc, iSrcStride, pTmp, 16, 4, iHeight);	
-		PixelAvgWidthEq4_mmx(pDst, iDstStride, pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
-	}	
+static inline void_t McHorVer03_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) { // "03": average of the "02" filter result and the source one row down
+  FORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16); // scratch block written with a 16-byte row stride; presumably 256 bytes, 16-aligned
+  if (iWidth == 16) {
+    McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight); // "02" result into scratch
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); // averaged with the row below pSrc
+  } else if (iWidth == 8) {
+    McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
+  } else { // any other width is handled as 4-wide
+    McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight);
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
+  }
 }
-static inline void_t McHorVer10_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	FORCE_STACK_ALIGN_1D( uint8_t, pTmp, 256, 16 );
-	if (iWidth == 16)
-	{
-		McHorVer20WidthEq16_sse2(pSrc, iSrcStride, pTmp, 16, iHeight);	
-		PixelAvgWidthEq16_sse2(pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
-	}
-	else if(iWidth == 8)
-	{
-		McHorVer20WidthEq8_sse2(pSrc, iSrcStride, pTmp, 16, iHeight);	
-		PixelAvgWidthEq8_mmx(pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
-	}
-	else
-	{
-		McHorVer20WidthEq4_mmx(pSrc, iSrcStride, pTmp, 16, iHeight);	
-		PixelAvgWidthEq4_mmx(pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
-	}
+static inline void_t McHorVer10_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) { // "10": average of the source block and its "20" filter result
+  FORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16); // scratch block written with a 16-byte row stride; presumably 256 bytes, 16-aligned
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight); // "20" result into scratch
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight); // pDst = average of pSrc and scratch
+  } else if (iWidth == 8) {
+    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+  } else { // any other width is handled as 4-wide
+    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pTmp, 16, iHeight);
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+  }
 }
-static inline void_t McHorVer11_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	FORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
-	FORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
-	if (iWidth == 16)
-	{
-		McHorVer20WidthEq16_sse2(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-		McHorVer02WidthEq16_sse2(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-		PixelAvgWidthEq16_sse2  (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-	}
-	else if(iWidth == 8)
-	{
-		McHorVer20WidthEq8_sse2(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-		McHorVer02WidthEq8_sse2(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-		PixelAvgWidthEq8_mmx(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-	}
-	else
-	{
-		McHorVer20WidthEq4_mmx(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-		McHorVer02_c     (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
-		PixelAvgWidthEq4_mmx  (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-	}
+static inline void_t McHorVer11_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) { // "11": average of the "20" and "02" filter results at pSrc
+  FORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); // scratch for the "20" result (16-byte row stride)
+  FORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); // scratch for the "02" result (16-byte row stride)
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); // pDst = average of the two scratch blocks
+  } else if (iWidth == 8) {
+    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  } else { // any other width is handled as 4-wide
+    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  }
 }
-static inline void_t McHorVer12_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	FORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
-	FORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
-	if (iWidth == 16)
-	{
-		McHorVer02WidthEq16_sse2(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-		McHorVer22WidthEq16_sse2(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-		PixelAvgWidthEq16_sse2  (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-	}
-	else if(iWidth == 8)
-	{
-		McHorVer02WidthEq8_sse2(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-		McHorVer22WidthEq8_sse2(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-		PixelAvgWidthEq8_mmx(pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-	}
-	else
-	{
-		McHorVer02_c   (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
-		McHorVer22_c   (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
-		PixelAvgWidthEq4_mmx(pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-	}
+static inline void_t McHorVer12_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) { // "12": average of the "02" and "22" filter results at pSrc
+  FORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); // scratch for the "02" result (16-byte row stride)
+  FORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); // scratch for the "22" result (16-byte row stride)
+  if (iWidth == 16) {
+    McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+    McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); // pDst = average of the two scratch blocks
+  } else if (iWidth == 8) {
+    McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+    McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
+  } else { // any other width is handled as 4-wide
+    McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
+    McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
+  }
 }
-static inline void_t McHorVer13_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	FORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
-	FORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
-	if (iWidth ==16)
-	{
-		McHorVer20WidthEq16_sse2(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-		McHorVer02WidthEq16_sse2(pSrc,            iSrcStride, pVerTmp, 16, iHeight);
-		PixelAvgWidthEq16_sse2  (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-	}
-	else if(iWidth == 8)
-	{
-		McHorVer20WidthEq8_sse2(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-		McHorVer02WidthEq8_sse2(pSrc,            iSrcStride, pVerTmp, 16, iHeight);
-		PixelAvgWidthEq8_mmx(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);		
-	}
-	else
-	{
-		McHorVer20WidthEq4_mmx(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-		McHorVer02_c     (pSrc,            iSrcStride, pVerTmp, 16, 4 ,iHeight);
-		PixelAvgWidthEq4_mmx(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);		
-	}
+static inline void_t McHorVer13_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) { // "13": average of "20" taken one row down and "02" at pSrc
+  FORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); // scratch for the "20" result (16-byte row stride)
+  FORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); // scratch for the "02" result (16-byte row stride)
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); // "20" starts on the row below pSrc
+    McHorVer02WidthEq16_sse2 (pSrc,            iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); // pDst = average of the two scratch blocks
+  } else if (iWidth == 8) {
+    McHorVer20WidthEq8_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq8_sse2 (pSrc,            iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  } else { // any other width is handled as 4-wide
+    McHorVer20WidthEq4_mmx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02_c (pSrc,            iSrcStride, pVerTmp, 16, 4 , iHeight);
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  }
 }
-static inline void_t McHorVer21_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	FORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
-	FORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
-	if (iWidth == 16)
-	{
-		McHorVer20WidthEq16_sse2(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-		McHorVer22WidthEq16_sse2(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-		PixelAvgWidthEq16_sse2(pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-	}
-	else if(iWidth == 8)
-	{
-		McHorVer20WidthEq8_sse2(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-		McHorVer22WidthEq8_sse2(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-		PixelAvgWidthEq8_mmx(pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-	}
-	else
-	{
-		McHorVer20WidthEq4_mmx(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-		McHorVer22_c     (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
-		PixelAvgWidthEq4_mmx(pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-	}
+static inline void_t McHorVer21_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) { // "21": average of the "20" and "22" filter results at pSrc
+  FORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); // scratch for the "20" result (16-byte row stride)
+  FORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); // scratch for the "22" result (16-byte row stride)
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); // pDst = average of the two scratch blocks
+  } else if (iWidth == 8) {
+    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
+  } else { // any other width is handled as 4-wide
+    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
+  }
 }
-static inline void_t McHorVer23_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{	
-	FORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
-	FORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
-	if (iWidth == 16)
-	{
-		McHorVer20WidthEq16_sse2(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-		McHorVer22WidthEq16_sse2(pSrc,            iSrcStride, pCtrTmp, 16, iHeight);
-		PixelAvgWidthEq16_sse2(pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-	}
-	else if(iWidth == 8)
-	{
-		McHorVer20WidthEq8_sse2(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-		McHorVer22WidthEq8_sse2(pSrc,            iSrcStride, pCtrTmp, 16, iHeight);
-		PixelAvgWidthEq8_mmx(pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-	}
-	else
-	{
-		McHorVer20WidthEq4_mmx(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-		McHorVer22_c     (pSrc,            iSrcStride, pCtrTmp, 16, 4, iHeight);
-		PixelAvgWidthEq4_mmx(pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-	}
+static inline void_t McHorVer23_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) { // "23": average of "20" taken one row down and "22" at pSrc
+  FORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); // scratch for the "20" result (16-byte row stride)
+  FORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); // scratch for the "22" result (16-byte row stride)
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); // "20" starts on the row below pSrc
+    McHorVer22WidthEq16_sse2 (pSrc,            iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); // pDst = average of the two scratch blocks
+  } else if (iWidth == 8) {
+    McHorVer20WidthEq8_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer22WidthEq8_sse2 (pSrc,            iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
+  } else { // any other width is handled as 4-wide
+    McHorVer20WidthEq4_mmx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer22_c (pSrc,            iSrcStride, pCtrTmp, 16, 4, iHeight);
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
+  }
 }
-static inline void_t McHorVer30_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	FORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
-	if (iWidth == 16)
-	{
-		McHorVer20WidthEq16_sse2(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-		PixelAvgWidthEq16_sse2(pDst, iDstStride, pSrc+1, iSrcStride, pHorTmp, 16, iHeight);
-	}
-	else if(iWidth == 8)
-	{
-		McHorVer20WidthEq8_sse2(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-		PixelAvgWidthEq8_mmx(pDst, iDstStride, pSrc+1, iSrcStride, pHorTmp, 16, iHeight);
-	}
-	else
-	{
-		McHorVer20WidthEq4_mmx(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-		PixelAvgWidthEq4_mmx(pDst, iDstStride, pSrc+1, iSrcStride, pHorTmp, 16, iHeight);
-	}
+static inline void_t McHorVer30_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) { // "30": average of the "20" filter result and the source one column right
+  FORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); // scratch for the "20" result (16-byte row stride)
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight); // "20" result into scratch
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight); // averaged with the source shifted by one byte
+  } else if (iWidth == 8) {
+    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
+  } else { // any other width is handled as 4-wide
+    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
+  }
 }
-static inline void_t McHorVer31_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	FORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
-	FORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
-	if (iWidth == 16)
-	{
-		McHorVer20WidthEq16_sse2(pSrc,   iSrcStride, pHorTmp, 16, iHeight);
-		McHorVer02WidthEq16_sse2(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-		PixelAvgWidthEq16_sse2(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-	}
-	else if(iWidth == 8)
-	{
-		McHorVer20WidthEq8_sse2(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-		McHorVer02WidthEq8_sse2(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-		PixelAvgWidthEq8_mmx(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-	}
-	else
-	{
-		McHorVer20WidthEq4_mmx(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-		McHorVer02_c(pSrc+1, iSrcStride, pVerTmp, 16, 4, iHeight);
-		PixelAvgWidthEq4_mmx(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-	}
+static inline void_t McHorVer31_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) { // "31": average of "20" at pSrc and "02" taken one column right
+  FORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); // scratch for the "20" result (16-byte row stride)
+  FORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); // scratch for the "02" result (16-byte row stride)
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_sse2 (pSrc,   iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq16_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); // "02" starts one byte to the right of pSrc
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); // pDst = average of the two scratch blocks
+  } else if (iWidth == 8) {
+    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq8_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  } else { // any other width is handled as 4-wide
+    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight);
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  }
 }
-static inline void_t McHorVer32_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	FORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
-	FORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
-	if (iWidth ==16)
-	{
-		McHorVer02WidthEq16_sse2(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-		McHorVer22WidthEq16_sse2(pSrc,   iSrcStride, pCtrTmp, 16, iHeight);
-		PixelAvgWidthEq16_sse2(pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-	}
-	else if(iWidth == 8)
-	{
-		McHorVer02WidthEq8_sse2(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-		McHorVer22WidthEq8_sse2(pSrc,   iSrcStride, pCtrTmp, 16, iHeight);
-		PixelAvgWidthEq8_mmx(pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-	}
-	else
-	{
-		McHorVer02_c(pSrc+1, iSrcStride, pVerTmp, 16, 4, iHeight);
-		McHorVer22_c(pSrc,   iSrcStride, pCtrTmp, 16, 4, iHeight);
-		PixelAvgWidthEq4_mmx(pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-	}
+static inline void_t McHorVer32_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) { // "32": average of "02" taken one column right and "22" at pSrc
+  FORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); // scratch for the "02" result (16-byte row stride)
+  FORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); // scratch for the "22" result (16-byte row stride)
+  if (iWidth == 16) {
+    McHorVer02WidthEq16_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); // "02" starts one byte to the right of pSrc
+    McHorVer22WidthEq16_sse2 (pSrc,   iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); // pDst = average of the two scratch blocks
+  } else if (iWidth == 8) {
+    McHorVer02WidthEq8_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
+    McHorVer22WidthEq8_sse2 (pSrc,   iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
+  } else { // any other width is handled as 4-wide
+    McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight);
+    McHorVer22_c (pSrc,   iSrcStride, pCtrTmp, 16, 4, iHeight);
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
+  }
 }
-static inline void_t McHorVer33_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	FORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
-	FORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
-	if (iWidth == 16)
-	{
-		McHorVer20WidthEq16_sse2(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-		McHorVer02WidthEq16_sse2(pSrc+1,          iSrcStride, pVerTmp, 16, iHeight);
-		PixelAvgWidthEq16_sse2(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-	}
-	else if(iWidth == 8)
-	{
-		McHorVer20WidthEq8_sse2(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-		McHorVer02WidthEq8_sse2(pSrc+1,          iSrcStride, pVerTmp, 16, iHeight);
-		PixelAvgWidthEq8_mmx(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-	}
-	else
-	{
-		McHorVer20WidthEq4_mmx(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-		McHorVer02_c     (pSrc+1,          iSrcStride, pVerTmp, 16, 4, iHeight);
-		PixelAvgWidthEq4_mmx(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-	}
+static inline void_t McHorVer33_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) { // "33": average of "20" one row down and "02" one column right
+  FORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); // scratch for the "20" result (16-byte row stride)
+  FORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); // scratch for the "02" result (16-byte row stride)
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); // "20" starts on the row below pSrc
+    McHorVer02WidthEq16_sse2 (pSrc + 1,          iSrcStride, pVerTmp, 16, iHeight); // "02" starts one byte to the right of pSrc
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); // pDst = average of the two scratch blocks
+  } else if (iWidth == 8) {
+    McHorVer20WidthEq8_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq8_sse2 (pSrc + 1,          iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  } else { // any other width is handled as 4-wide
+    McHorVer20WidthEq4_mmx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02_c (pSrc + 1,          iSrcStride, pVerTmp, 16, 4, iHeight);
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  }
 }
 
-void_t McLuma_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-				  int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
-				  //pSrc has been added the offset of mv
+void_t McLuma_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                    int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
+// SSE2 luma MC entry point: dispatches on the low 2 bits of each mv component. Per the original note, pSrc already includes the mv offset.
 {
-	PWelsMcWidthHeightFunc pWelsMcFunc[4][4] =  //[x][y]   
-	{
-		{McCopy_sse2,     McHorVer01_sse2, McHorVer02_sse2, McHorVer03_sse2},
-		{McHorVer10_sse2, McHorVer11_sse2, McHorVer12_sse2, McHorVer13_sse2},
-		{McHorVer20_sse2, McHorVer21_sse2, McHorVer22_sse2, McHorVer23_sse2},
-		{McHorVer30_sse2, McHorVer31_sse2, McHorVer32_sse2, McHorVer33_sse2},
-	};
+  static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { // [x][y]; static const so the table is built once, as in McChroma_sse2
+    {McCopy_sse2,     McHorVer01_sse2, McHorVer02_sse2, McHorVer03_sse2}, // x = 0
+    {McHorVer10_sse2, McHorVer11_sse2, McHorVer12_sse2, McHorVer13_sse2}, // x = 1
+    {McHorVer20_sse2, McHorVer21_sse2, McHorVer22_sse2, McHorVer23_sse2}, // x = 2
+    {McHorVer30_sse2, McHorVer31_sse2, McHorVer32_sse2, McHorVer33_sse2}, // x = 3
+  };
 
-	pWelsMcFunc[iMvX&0x03][iMvY&0x03](pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+  pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); // the masks keep both indices within 0..3
 }
 
-void_t McChroma_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride,
-					   int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight )
-{
-	static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] =
-	{
-		McChromaWidthEq4_mmx,
-		McChromaWidthEq8_sse2
-	};
-	const int32_t kiD8x = iMvX&0x07;
-	const int32_t kiD8y = iMvY&0x07;
-	if (kiD8x ==0 && kiD8y ==0)
-	{
-		McCopy_sse2(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
-		return;
-	}
-	if (iWidth != 2)
-	{
-		kpMcChromaWidthFuncs[iWidth>>3](pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiD8y][kiD8x], iHeight);
-	}
-	else
-		McChromaWithFragMv_c(pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
+void_t McChroma_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) { // SIMD chroma MC entry point
+  static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = { // indexed by iWidth >> 3
+    McChromaWidthEq4_mmx, // index 0
+    McChromaWidthEq8_sse2 // index 1
+  };
+  const int32_t kiD8x = iMvX & 0x07; // low 3 bits of the horizontal mv
+  const int32_t kiD8y = iMvY & 0x07; // low 3 bits of the vertical mv
+  if (kiD8x == 0 && kiD8y == 0) { // both fields zero: a straight block copy is enough
+    McCopy_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+    return;
+  }
+  if (iWidth != 2) { // NOTE(review): only widths 4..15 index inside the 2-entry table; presumably callers pass 4 or 8 - confirm
+    kpMcChromaWidthFuncs[iWidth >> 3] (pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiD8y][kiD8x], iHeight);
+  } else // width 2 uses the plain-C routine
+    McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
 }
 
 
 #endif //X86_ASM
 
-void_t InitMcFunc(SMcFunc *pMcFunc, int32_t iCpu)
-{
-	pMcFunc->pMcLumaFunc   = McLuma_c;
-	pMcFunc->pMcChromaFunc = McChroma_c; 
-	
+void_t InitMcFunc (SMcFunc* pMcFunc, int32_t iCpu) { // fills pMcFunc with the luma/chroma MC entry points to use
+  pMcFunc->pMcLumaFunc   = McLuma_c; // plain-C defaults, always installed first
+  pMcFunc->pMcChromaFunc = McChroma_c;
+
 #if defined (X86_ASM)
-	if ( iCpu & WELS_CPU_SSE2 )
-	{
-		pMcFunc->pMcLumaFunc   = McLuma_sse2;
-		pMcFunc->pMcChromaFunc = McChroma_sse2;
-	}
+  if (iCpu & WELS_CPU_SSE2) { // the SSE2 flag is set in iCpu: override both defaults
+    pMcFunc->pMcLumaFunc   = McLuma_sse2;
+    pMcFunc->pMcChromaFunc = McChroma_sse2;
+  }
 #endif //(X86_ASM)	
 }
 
--- a/codec/decoder/core/src/mem_align.cpp
+++ b/codec/decoder/core/src/mem_align.cpp
@@ -1,115 +1,109 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "mem_align.h"
-#include "crt_util_safe_x.h"
-
-namespace WelsDec {
-
-//#define MEMORY_CHECK
-#ifdef MEMORY_CHECK
-
-WelsFileHandle * pMemCheckMalloc = NULL;
-WelsFileHandle * pMemCheckFree = NULL; 
-
-int32_t iCountMalloc = 0;
-#endif
-//
-
-/////////////////////////////////////////////////////////////////////////////////
-#define ALIGNBYTES (16)
-/////////////////////////////////////////////////////////////////////////////////
-
-void_t * WelsMalloc( const uint32_t kuiSize, const str_t *kpTag )
-{
-	const int32_t kiSizeVoidPtr	= sizeof( void_t ** );
-	const int32_t kiSizeInt		= sizeof( int32_t );
-#ifdef HAVE_CACHE_LINE_ALIGN
-	const int32_t kiAlignBytes	= ALIGNBYTES - 1;
-#else
-	const int32_t kiAlignBytes	= 15;
-#endif// HAVE_CACHE_LINE_ALIGN
-	uint8_t* pBuf		= (uint8_t *) malloc( kuiSize + kiAlignBytes + kiSizeVoidPtr + kiSizeInt );
-	uint8_t* pAlignBuf;
-
-#ifdef MEMORY_CHECK	
-	if( pMemCheckMalloc == NULL ){
-		pMemCheckMalloc = WelsFopen(".\\mem_check_malloc.txt", "at+");
-		pMemCheckFree   = WelsFopen(".\\mem_check_free.txt", "at+");
-	}
-
-	if ( kpTag != NULL )
-	{
-		if ( pMemCheckMalloc != NULL )
-		{
-			fprintf( pMemCheckMalloc, "0x%x, size: %d       , malloc %s\n", (void_t *)pBuf, (kuiSize + kiAlignBytes + kiSizeVoidPtr + kiSizeInt), kpTag );			
-		}
-		if ( pMemCheckMalloc != NULL )
-		{
-			fflush( pMemCheckMalloc );
-		}
-	}
-#endif	
-
-	if ( NULL == pBuf )
-		return NULL;
-
-	// to fill zero values
-	memset( pBuf, 0, kuiSize + kiAlignBytes + kiSizeVoidPtr + kiSizeInt );
-
-	pAlignBuf = pBuf + kiAlignBytes + kiSizeVoidPtr + kiSizeInt;
-	pAlignBuf -= (int32_t) pAlignBuf & kiAlignBytes;
-	*( (void_t **) ( pAlignBuf - kiSizeVoidPtr ) ) = pBuf;
-	*( (int32_t *) ( pAlignBuf - (kiSizeVoidPtr + kiSizeInt) ) ) = kuiSize;
-
-	return (pAlignBuf);
-}
-
-/////////////////////////////////////////////////////////////////////////////
-
-void_t WelsFree( void_t* pPtr, const str_t *kpTag )
-{
-	if( pPtr )
-	{
-#ifdef MEMORY_CHECK			
-		if ( NULL != pMemCheckFree && kpTag != NULL )
-		{				
-			fprintf( pMemCheckFree, "0x%x, free %s\n", (void_t *)(*( ( ( void_t **) pPtr ) - 1 )), kpTag );
-			fflush( pMemCheckFree );
-		}	
-#endif
-		free( *( ( ( void_t **) pPtr ) - 1 ) );
-	}
-}
-
-/////////////////////////////////////////////////////////////////////////////
-} // namespace WelsDec
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "mem_align.h"
+#include "crt_util_safe_x.h"
+
+namespace WelsDec {
+
+//#define MEMORY_CHECK
+#ifdef MEMORY_CHECK
+
+WelsFileHandle* pMemCheckMalloc = NULL;
+WelsFileHandle* pMemCheckFree = NULL;
+
+int32_t iCountMalloc = 0;
+#endif
+//
+
+/////////////////////////////////////////////////////////////////////////////////
+#define ALIGNBYTES (16)
+/////////////////////////////////////////////////////////////////////////////////
+
+// Returns kuiSize zero-filled bytes on a 16-byte boundary, or NULL on failure; release with WelsFree().
+// Layout: [pad][int32 size][raw malloc pointer][payload]; kpTag only labels MEMORY_CHECK log lines.
+void_t* WelsMalloc (const uint32_t kuiSize, const str_t* kpTag) {
+  const int32_t kiSizeVoidPtr	= sizeof (void_t**);
+  const int32_t kiSizeInt		= sizeof (int32_t);
+#ifdef HAVE_CACHE_LINE_ALIGN
+  const int32_t kiAlignBytes	= ALIGNBYTES - 1;
+#else
+  const int32_t kiAlignBytes	= 15;
+#endif// HAVE_CACHE_LINE_ALIGN
+  const uint32_t kuiAllocSize = kuiSize + kiAlignBytes + kiSizeVoidPtr + kiSizeInt;
+  uint8_t* pBuf = NULL;
+  uint8_t* pAlignBuf;
+
+  if (kuiAllocSize < kuiSize)  // bookkeeping overhead wrapped the 32-bit size: refuse instead of under-allocating
+    return NULL;
+  pBuf = (uint8_t*) malloc (kuiAllocSize);
+#ifdef MEMORY_CHECK
+  if (pMemCheckMalloc == NULL) {
+    pMemCheckMalloc = WelsFopen (".\\mem_check_malloc.txt", "at+");
+    pMemCheckFree   = WelsFopen (".\\mem_check_free.txt", "at+");
+  }
+
+  if (kpTag != NULL && pMemCheckMalloc != NULL) {
+    // %p and %u match the pointer / unsigned arguments (the former 0x%x and %d did not)
+    fprintf (pMemCheckMalloc, "%p, size: %u       , malloc %s\n", (void_t*)pBuf, kuiAllocSize, kpTag);
+    fflush (pMemCheckMalloc);
+  }
+#endif
+
+  if (NULL == pBuf)
+    return NULL;
+
+  memset (pBuf, 0, kuiAllocSize);  // to fill zero values
+  pAlignBuf = pBuf + kiAlignBytes + kiSizeVoidPtr + kiSizeInt;
+  // round down to the alignment boundary; the former (int32_t) pointer cast is rejected by 64-bit C++ compilers
+  pAlignBuf -= (size_t) pAlignBuf & kiAlignBytes;
+  * ((void_t**) (pAlignBuf - kiSizeVoidPtr)) = pBuf;  // WelsFree() reads the raw pointer back from this slot
+  * ((int32_t*) (pAlignBuf - (kiSizeVoidPtr + kiSizeInt))) = kuiSize;
+  return (pAlignBuf);
+}
+
+/////////////////////////////////////////////////////////////////////////////
+
+void_t WelsFree (void_t* pPtr, const str_t* kpTag) {  // releases a WelsMalloc() block; NULL is ignored, kpTag only labels the log line
+  if (pPtr) {
+#ifdef MEMORY_CHECK
+    if (NULL != pMemCheckFree && kpTag != NULL) {
+      fprintf (pMemCheckFree, "%p, free %s\n", (void_t*) (* (((void_t**) pPtr) - 1)), kpTag);  // %p: the argument is a pointer, %x was a type mismatch
+      fflush (pMemCheckFree);
+    }
+#endif
+    free (* (((void_t**) pPtr) - 1));  // the raw malloc() pointer is stored in the slot just below pPtr
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////////
+} // namespace WelsDec
--- a/codec/decoder/core/src/memmgr_nal_unit.cpp
+++ b/codec/decoder/core/src/memmgr_nal_unit.cpp
@@ -44,86 +44,81 @@
 
 namespace WelsDec {
 
-int32_t MemInitNalList(PAccessUnit *ppAu, const uint32_t kuiSize){
-	uint32_t uiIdx = 0;
-	uint8_t *pBase = NULL, *pPtr = NULL;
-	const uint32_t kuiSizeAu = sizeof(SAccessUnit);
-	const uint32_t kuiSizeNalUnitPtr= kuiSize*sizeof(PNalUnit);
-	const uint32_t kuiSizeNalUnit = sizeof(SNalUnit);
-	const uint32_t kuiCountSize = (kuiSizeAu + kuiSizeNalUnitPtr + kuiSize * kuiSizeNalUnit) * sizeof(uint8_t);
-	
-	if (kuiSize == 0)
-		return 1;
+int32_t MemInitNalList (PAccessUnit* ppAu, const uint32_t kuiSize) { // builds an access unit with kuiSize NAL slots; returns 0 on success, 1 on failure
+  uint32_t uiIdx = 0;
+  uint8_t* pBase = NULL, *pPtr = NULL;
+  const uint32_t kuiSizeAu = sizeof (SAccessUnit);
+  const uint32_t kuiSizeNalUnitPtr = kuiSize * sizeof (PNalUnit);
+  const uint32_t kuiSizeNalUnit = sizeof (SNalUnit);
+  const uint32_t kuiCountSize = (kuiSizeAu + kuiSizeNalUnitPtr + kuiSize * kuiSizeNalUnit) * sizeof (uint8_t); // header + pointer table + units; NOTE(review): 32-bit sum, unchecked for wrap-around
 
-	if ( *ppAu != NULL ){
-		MemFreeNalList(ppAu);
-	}
+  if (kuiSize == 0)
+    return 1;
 
-	pBase = (uint8_t *)WelsMalloc( kuiCountSize, "Access Unit" );
-	if ( pBase == NULL )
-		return 1;
-	pPtr = pBase;
-	*ppAu = (PAccessUnit)pPtr;
-	pPtr += kuiSizeAu;
-	(*ppAu)->pNalUnitsList	= (PNalUnit*)pPtr;	
-	pPtr += kuiSizeNalUnitPtr;
-	do {
-		(*ppAu)->pNalUnitsList[uiIdx] = (PNalUnit)pPtr;
-		pPtr += kuiSizeNalUnit;
-		++ uiIdx;
-	} while(uiIdx < kuiSize);
+  if (*ppAu != NULL) { // release any list already attached to *ppAu
+    MemFreeNalList (ppAu);
+  }
 
-	(*ppAu)->uiCountUnitsNum	= kuiSize;
-	(*ppAu)->uiAvailUnitsNum	= 0;
-	(*ppAu)->uiActualUnitsNum	= 0;
-	(*ppAu)->uiEndPos		    = 0;
-	(*ppAu)->bCompletedAuFlag	= false;	
+  pBase = (uint8_t*)WelsMalloc (kuiCountSize, "Access Unit"); // one allocation holds everything
+  if (pBase == NULL)
+    return 1;
+  pPtr = pBase;
+  *ppAu = (PAccessUnit)pPtr; // SAccessUnit header sits at the start of the buffer
+  pPtr += kuiSizeAu;
+  (*ppAu)->pNalUnitsList	= (PNalUnit*)pPtr; // pointer table follows the header
+  pPtr += kuiSizeNalUnitPtr;
+  do { // point each table entry at its own SNalUnit slot
+    (*ppAu)->pNalUnitsList[uiIdx] = (PNalUnit)pPtr;
+    pPtr += kuiSizeNalUnit;
+    ++ uiIdx;
+  } while (uiIdx < kuiSize);
 
-	return 0;
+  (*ppAu)->uiCountUnitsNum	= kuiSize;
+  (*ppAu)->uiAvailUnitsNum	= 0;
+  (*ppAu)->uiActualUnitsNum	= 0;
+  (*ppAu)->uiEndPos		    = 0;
+  (*ppAu)->bCompletedAuFlag	= false;
+
+  return 0;
 }
 
-int32_t MemFreeNalList(PAccessUnit *ppAu)
-{
-	if (ppAu != NULL){
-		PAccessUnit pAu = *ppAu;
-		if (pAu != NULL)
-		{			
-			WelsFree(pAu, "Access Unit");
-			*ppAu = NULL;
-		}		
-	}
-	return 0;
+int32_t MemFreeNalList (PAccessUnit* ppAu) { // frees the access unit behind *ppAu; always returns 0
+  if (ppAu != NULL) {
+    PAccessUnit pAu = *ppAu;
+    if (pAu != NULL) {
+      WelsFree (pAu, "Access Unit"); // header, pointer table and units share this single allocation
+      *ppAu = NULL; // caller's handle is cleared after the free
+    }
+  }
+  return 0;
 }
 
 
-int32_t ExpandNalUnitList(PAccessUnit *ppAu, const int32_t kiOrgSize, const int32_t kiExpSize)
-{
-	if ( kiExpSize <= kiOrgSize )
-		return 1;
-	else
-	{
-		PAccessUnit pTmp = NULL;
-		int32_t iIdx = 0;
+int32_t ExpandNalUnitList (PAccessUnit* ppAu, const int32_t kiOrgSize, const int32_t kiExpSize) { // swaps *ppAu for a list of kiExpSize slots; 0 = ok, 1 = failure
+  if (kiExpSize <= kiOrgSize) // the new size must be strictly larger
+    return 1;
+  else {
+    PAccessUnit pTmp = NULL;
+    int32_t iIdx = 0;
 
-		if ( MemInitNalList( &pTmp, kiExpSize ) )	// request new list with expanding
-			return 1;
+    if (MemInitNalList (&pTmp, kiExpSize))	// request new list with expanding
+      return 1;
 
-		do
-		{
-			memcpy(pTmp->pNalUnitsList[iIdx], (*ppAu)->pNalUnitsList[iIdx], sizeof(SNalUnit) );//confirmed_safe_unsafe_usage
-			++ iIdx;
-		}while(iIdx < kiOrgSize);		
+    do { // NOTE(review): do-while copies slot 0 even when kiOrgSize <= 0 -- confirm callers never pass that
+      memcpy (pTmp->pNalUnitsList[iIdx], (*ppAu)->pNalUnitsList[iIdx], sizeof (SNalUnit)); //confirmed_safe_unsafe_usage
+      ++ iIdx;
+    } while (iIdx < kiOrgSize);
 
-		pTmp->uiCountUnitsNum	= kiExpSize;
-		pTmp->uiAvailUnitsNum	= (*ppAu)->uiAvailUnitsNum;
-		pTmp->uiActualUnitsNum	= (*ppAu)->uiActualUnitsNum;
-		pTmp->uiEndPos		    = (*ppAu)->uiEndPos;
-		pTmp->bCompletedAuFlag	= (*ppAu)->bCompletedAuFlag;
+    pTmp->uiCountUnitsNum	= kiExpSize;
+    pTmp->uiAvailUnitsNum	= (*ppAu)->uiAvailUnitsNum; // remaining bookkeeping fields are carried over from the old list
+    pTmp->uiActualUnitsNum	= (*ppAu)->uiActualUnitsNum;
+    pTmp->uiEndPos		    = (*ppAu)->uiEndPos;
+    pTmp->bCompletedAuFlag	= (*ppAu)->bCompletedAuFlag;
 
-		MemFreeNalList( ppAu );	// free old list
-		*ppAu = pTmp;
-		return 0;
-	}
+    MemFreeNalList (ppAu);	// free old list
+    *ppAu = pTmp;
+    return 0;
+  }
 }
 
 /*
@@ -131,23 +126,22 @@
  *	Get next NAL Unit for using.
  *	Need expand NAL Unit list if exceeding count number of available NAL Units withing an Access Unit
  */
-PNalUnit MemGetNextNal(PAccessUnit *ppAu){	
-	PAccessUnit pAu = *ppAu;
-	PNalUnit pNu = NULL;
-	
-	if (pAu->uiAvailUnitsNum >= pAu->uiCountUnitsNum)	// need expand list
-	{
-		const uint32_t kuiExpandingSize = pAu->uiCountUnitsNum + (MAX_NAL_UNIT_NUM_IN_AU>>1);
-		if ( ExpandNalUnitList(ppAu, pAu->uiCountUnitsNum, kuiExpandingSize) )
-			return NULL;	// out of memory
-		pAu = *ppAu;
-	}
+PNalUnit MemGetNextNal (PAccessUnit* ppAu) {
+  PAccessUnit pAu = *ppAu;
+  PNalUnit pNu = NULL;
 
-	pNu = pAu->pNalUnitsList[pAu->uiAvailUnitsNum++];	// ready for next nal position
+  if (pAu->uiAvailUnitsNum >= pAu->uiCountUnitsNum) {	// need expand list
+    const uint32_t kuiExpandingSize = pAu->uiCountUnitsNum + (MAX_NAL_UNIT_NUM_IN_AU >> 1); // grow by half of MAX_NAL_UNIT_NUM_IN_AU
+    if (ExpandNalUnitList (ppAu, pAu->uiCountUnitsNum, kuiExpandingSize))
+      return NULL;	// out of memory
+    pAu = *ppAu; // ExpandNalUnitList replaced *ppAu, so reload the local copy
+  }
 
-	memset(pNu, 0, sizeof(SNalUnit));	// Please do not remove this for cache intend!!
-	
-	return pNu;
+  pNu = pAu->pNalUnitsList[pAu->uiAvailUnitsNum++];	// ready for next nal position
+
+  memset (pNu, 0, sizeof (SNalUnit));	// Please do not remove this for cache intend!!
+
+  return pNu;
 }
 
 } // namespace WelsDec
\ No newline at end of file
--- a/codec/decoder/core/src/mv_pred.cpp
+++ b/codec/decoder/core/src/mv_pred.cpp
@@ -45,207 +45,179 @@
 namespace WelsDec {
 
 //basic iMVs prediction unit for iMVs partition width (4, 2, 1)
-void_t PredMv(int16_t iMotionVector[LIST_A][30][MV_A], int8_t iRefIndex[LIST_A][30], 
-			 int32_t iPartIdx, int32_t iPartWidth, int8_t iRef, int16_t iMVP[2])
-{
-	const uint8_t kuiLeftIdx	= g_kuiCache30ScanIdx[iPartIdx] - 1;
-	const uint8_t kuiTopIdx		= g_kuiCache30ScanIdx[iPartIdx] - 6;
-	const uint8_t kuiRightTopIdx= kuiTopIdx + iPartWidth;
-	const uint8_t kuiLeftTopIdx	= kuiTopIdx - 1;	
+void_t PredMv (int16_t iMotionVector[LIST_A][30][MV_A], int8_t iRefIndex[LIST_A][30],
+               int32_t iPartIdx, int32_t iPartWidth, int8_t iRef, int16_t iMVP[2]) { // writes the predicted MV for one partition into iMVP
+  const uint8_t kuiLeftIdx	= g_kuiCache30ScanIdx[iPartIdx] - 1; // cache slot just before the partition's own slot
+  const uint8_t kuiTopIdx		= g_kuiCache30ScanIdx[iPartIdx] - 6; // six slots back; presumably the cache row above -- confirm layout
+  const uint8_t kuiRightTopIdx = kuiTopIdx + iPartWidth; // iPartWidth slots to the right of the top slot
+  const uint8_t kuiLeftTopIdx	= kuiTopIdx - 1;
 
-	const int8_t kiLeftRef      = iRefIndex[0][kuiLeftIdx];
-	const int8_t kiTopRef       = iRefIndex[0][ kuiTopIdx];
-	const int8_t kiRightTopRef = iRefIndex[0][kuiRightTopIdx];
-	const int8_t kiLeftTopRef  = iRefIndex[0][ kuiLeftTopIdx];
-	int8_t iDiagonalRef  = kiRightTopRef;
+  const int8_t kiLeftRef      = iRefIndex[0][kuiLeftIdx];
+  const int8_t kiTopRef       = iRefIndex[0][ kuiTopIdx];
+  const int8_t kiRightTopRef = iRefIndex[0][kuiRightTopIdx];
+  const int8_t kiLeftTopRef  = iRefIndex[0][ kuiLeftTopIdx];
+  int8_t iDiagonalRef  = kiRightTopRef; // diagonal candidate starts as the right-top neighbour
 
-	int8_t iMatchRef = 0;
+  int8_t iMatchRef = 0;
 
 
-	int16_t iAMV[2], iBMV[2], iCMV[2];
+  int16_t iAMV[2], iBMV[2], iCMV[2];
 
-	*(int32_t*)iAMV = INTD32(iMotionVector[0][     kuiLeftIdx]);
-	*(int32_t*)iBMV = INTD32(iMotionVector[0][      kuiTopIdx]);
-	*(int32_t*)iCMV = INTD32(iMotionVector[0][kuiRightTopIdx]);
+  * (int32_t*)iAMV = INTD32 (iMotionVector[0][     kuiLeftIdx]); // A = left neighbour MV, both components in one 32-bit access
+  * (int32_t*)iBMV = INTD32 (iMotionVector[0][      kuiTopIdx]); // B = top neighbour MV
+  * (int32_t*)iCMV = INTD32 (iMotionVector[0][kuiRightTopIdx]); // C = right-top neighbour MV
 
-	if (REF_NOT_AVAIL == iDiagonalRef) 
-	{
-		iDiagonalRef = kiLeftTopRef;
-		*(int32_t*)iCMV = INTD32(iMotionVector[0][kuiLeftTopIdx]);
-	}
+  if (REF_NOT_AVAIL == iDiagonalRef) { // right-top unavailable: use left-top for both ref and MV
+    iDiagonalRef = kiLeftTopRef;
+    * (int32_t*)iCMV = INTD32 (iMotionVector[0][kuiLeftTopIdx]);
+  }
 
-	iMatchRef = (iRef == kiLeftRef) + (iRef == kiTopRef) + (iRef == iDiagonalRef);	
+  iMatchRef = (iRef == kiLeftRef) + (iRef == kiTopRef) + (iRef == iDiagonalRef); // number of neighbours whose ref equals iRef
 
-	if (REF_NOT_AVAIL == kiTopRef && REF_NOT_AVAIL == iDiagonalRef && kiLeftRef >= REF_NOT_IN_LIST) 
-	{
-		ST32(iMVP, LD32(iAMV));
-		return;
-	}
+  if (REF_NOT_AVAIL == kiTopRef && REF_NOT_AVAIL == iDiagonalRef && kiLeftRef >= REF_NOT_IN_LIST) { // top and diagonal unavailable: take the left MV as is
+    ST32 (iMVP, LD32 (iAMV));
+    return;
+  }
 
-	if (1 == iMatchRef) 
-	{
-		if (iRef == kiLeftRef) 
-		{
-			ST32(iMVP, LD32(iAMV));
-		}
-		else if (iRef == kiTopRef) 
-		{
-			ST32(iMVP, LD32(iBMV));
-		}
-		else
-		{
-			ST32(iMVP, LD32(iCMV));
-		}
-	}
-	else
-	{
-		iMVP[0] = WelsMedian(iAMV[0], iBMV[0], iCMV[0]);
-		iMVP[1] = WelsMedian(iAMV[1], iBMV[1], iCMV[1]);
-	}	
+  if (1 == iMatchRef) { // exactly one neighbour matches: copy that neighbour's MV
+    if (iRef == kiLeftRef) {
+      ST32 (iMVP, LD32 (iAMV));
+    } else if (iRef == kiTopRef) {
+      ST32 (iMVP, LD32 (iBMV));
+    } else {
+      ST32 (iMVP, LD32 (iCMV));
+    }
+  } else { // otherwise the per-component WelsMedian of A, B and C
+    iMVP[0] = WelsMedian (iAMV[0], iBMV[0], iCMV[0]);
+    iMVP[1] = WelsMedian (iAMV[1], iBMV[1], iCMV[1]);
+  }
 }
-void_t PredInter8x16Mv(int16_t iMotionVector[LIST_A][30][MV_A], int8_t iRefIndex[LIST_A][30], 
-						int32_t iPartIdx, int8_t iRef, int16_t iMVP[2])
-{
-	if (0 == iPartIdx) 
-	{
-		const int8_t kiLeftRef = iRefIndex[0][6];
-		if (iRef == kiLeftRef)
-		{
-			ST32( iMVP, LD32(&iMotionVector[0][6][0]) );
-			return;
-		}		
-	}
-	else // 1 == iPartIdx
-	{
-		int8_t iDiagonalRef = iRefIndex[0][5]; //top-right
-		int8_t index = 5;
-		if (REF_NOT_AVAIL == iDiagonalRef)
-		{
-			iDiagonalRef = iRefIndex[0][2]; //top-left for 8*8 block(index 1)
-			index = 2;
-		}
-		if (iRef == iDiagonalRef) 
-		{
-			ST32( iMVP, LD32(&iMotionVector[0][index][0]) );
-			return;
-		}	
-	}
+void_t PredInter8x16Mv (int16_t iMotionVector[LIST_A][30][MV_A], int8_t iRefIndex[LIST_A][30],
+                        int32_t iPartIdx, int8_t iRef, int16_t iMVP[2]) { // 8x16 shortcut: copy one neighbour's MV on a ref match, else defer to PredMv
+  if (0 == iPartIdx) {
+    const int8_t kiLeftRef = iRefIndex[0][6]; // cache slot 6 is read as the left neighbour here
+    if (iRef == kiLeftRef) {
+      ST32 (iMVP, LD32 (&iMotionVector[0][6][0]));
+      return;
+    }
+  } else { // 1 == iPartIdx
+    int8_t iDiagonalRef = iRefIndex[0][5]; //top-right
+    int8_t index = 5;
+    if (REF_NOT_AVAIL == iDiagonalRef) {
+      iDiagonalRef = iRefIndex[0][2]; //top-left for 8*8 block(index 1)
+      index = 2;
+    }
+    if (iRef == iDiagonalRef) { // index names whichever diagonal slot supplied iDiagonalRef
+      ST32 (iMVP, LD32 (&iMotionVector[0][index][0]));
+      return;
+    }
+  }
 
-	PredMv(iMotionVector, iRefIndex, iPartIdx, 2, iRef, iMVP);
+  PredMv (iMotionVector, iRefIndex, iPartIdx, 2, iRef, iMVP); // no shortcut applied: generic prediction with partition width 2
 }
-void_t PredInter16x8Mv(int16_t iMotionVector[LIST_A][30][MV_A], int8_t iRefIndex[LIST_A][30], 
-						int32_t iPartIdx, int8_t iRef, int16_t iMVP[2])
-{
-	if (0 == iPartIdx) 
-	{
-		const int8_t kiTopRef = iRefIndex[0][1];
-		if (iRef == kiTopRef)
-		{
-			ST32(iMVP, LD32(&iMotionVector[0][1][0]));
-			return;
-		}
-	}
-	else // 8 == iPartIdx
-	{
-		const int8_t kiLeftRef = iRefIndex[0][18];
-		if (iRef == kiLeftRef) 
-		{
-			ST32(iMVP, LD32(&iMotionVector[0][18][0]));
-			return;
-		}
-	}
+void_t PredInter16x8Mv (int16_t iMotionVector[LIST_A][30][MV_A], int8_t iRefIndex[LIST_A][30],
+                        int32_t iPartIdx, int8_t iRef, int16_t iMVP[2]) { // 16x8 shortcut: copy one neighbour's MV on a ref match, else defer to PredMv
+  if (0 == iPartIdx) {
+    const int8_t kiTopRef = iRefIndex[0][1]; // cache slot 1 is read as the top neighbour here
+    if (iRef == kiTopRef) {
+      ST32 (iMVP, LD32 (&iMotionVector[0][1][0]));
+      return;
+    }
+  } else { // 8 == iPartIdx
+    const int8_t kiLeftRef = iRefIndex[0][18]; // cache slot 18 is read as the left neighbour here
+    if (iRef == kiLeftRef) {
+      ST32 (iMVP, LD32 (&iMotionVector[0][18][0]));
+      return;
+    }
+  }
 
-	PredMv(iMotionVector, iRefIndex, iPartIdx, 4, iRef, iMVP);
+  PredMv (iMotionVector, iRefIndex, iPartIdx, 4, iRef, iMVP); // no shortcut applied: generic prediction with partition width 4
 }
 
 //update iMVs and iRefIndex cache for current MB, only for P_16*16 (SKIP inclusive)
 /* can be further optimized */
-void_t UpdateP16x16MotionInfo( PDqLayer pCurDqLayer, int8_t iRef, int16_t iMVs[2])
-{
-	const int16_t kiRef2		= (iRef << 8) | iRef;
-	const int32_t kiMV32		= LD32(iMVs);	
-	int32_t i;	
-	int32_t iMbXy = pCurDqLayer->iMbXyIndex;
-	
-	for (i = 0; i < 16; i+=4) 
-	{
-		//mb
-		const uint8_t kuiScan4Idx = g_kuiScan4[i];
-		const uint8_t kuiScan4IdxPlus4= 4 + kuiScan4Idx;
+void_t UpdateP16x16MotionInfo (PDqLayer pCurDqLayer, int8_t iRef, int16_t iMVs[2]) {
+  const int16_t kiRef2		= (iRef << 8) | iRef; // iRef in both bytes; NOTE(review): a negative iRef sign-extends into the high byte -- confirm callers pass iRef >= 0
+  const int32_t kiMV32		= LD32 (iMVs); // both MV components loaded as one 32-bit value
+  int32_t i;
+  int32_t iMbXy = pCurDqLayer->iMbXyIndex;
 
- 		ST16( &pCurDqLayer->pRefIndex[0][iMbXy][kuiScan4Idx ], kiRef2 );
-		ST16( &pCurDqLayer->pRefIndex[0][iMbXy][kuiScan4IdxPlus4], kiRef2 );
-	
-		ST32( pCurDqLayer->pMv[0][iMbXy][  kuiScan4Idx ], kiMV32 );
-		ST32( pCurDqLayer->pMv[0][iMbXy][1+kuiScan4Idx ], kiMV32 );
-		ST32( pCurDqLayer->pMv[0][iMbXy][  kuiScan4IdxPlus4], kiMV32 );
-		ST32( pCurDqLayer->pMv[0][iMbXy][1+kuiScan4IdxPlus4], kiMV32 );
-	}
+  for (i = 0; i < 16; i += 4) { // i = 0, 4, 8, 12
+    //mb
+    const uint8_t kuiScan4Idx = g_kuiScan4[i];
+    const uint8_t kuiScan4IdxPlus4 = 4 + kuiScan4Idx;
+
+    ST16 (&pCurDqLayer->pRefIndex[0][iMbXy][kuiScan4Idx ], kiRef2); // presumably a 16-bit store covering two adjacent ref entries
+    ST16 (&pCurDqLayer->pRefIndex[0][iMbXy][kuiScan4IdxPlus4], kiRef2);
+
+    ST32 (pCurDqLayer->pMv[0][iMbXy][  kuiScan4Idx ], kiMV32); // same MV written to four entries per pass
+    ST32 (pCurDqLayer->pMv[0][iMbXy][1 + kuiScan4Idx ], kiMV32);
+    ST32 (pCurDqLayer->pMv[0][iMbXy][  kuiScan4IdxPlus4], kiMV32);
+    ST32 (pCurDqLayer->pMv[0][iMbXy][1 + kuiScan4IdxPlus4], kiMV32);
+  }
 }
 
-//update iRefIndex and iMVs of Mb, only for P16x8 
+//update iRefIndex and iMVs of Mb, only for P16x8
 /*need further optimization, mb_cache not work */
-void_t UpdateP16x8MotionInfo(PDqLayer pCurDqLayer, int16_t iMotionVector[LIST_A][30][MV_A], int8_t iRefIndex[LIST_A][30], 
-							  int32_t iPartIdx, int8_t iRef, int16_t iMVs[2])
-{
-	const int16_t kiRef2 = (iRef << 8) | iRef;
-	const int32_t kiMV32 = LD32(iMVs);
-	int32_t i;	
-	int32_t iMbXy = pCurDqLayer->iMbXyIndex;
-	for (i = 0; i < 2; i++, iPartIdx+=4) 
-	{
-		const uint8_t kuiScan4Idx      = g_kuiScan4[iPartIdx];
-		const uint8_t kuiScan4IdxPlus4 = 4 + kuiScan4Idx;
-		const uint8_t kuiCacheIdx      = g_kuiCache30ScanIdx[iPartIdx];
-		const uint8_t kuiCacheIdxPlus6 = 6 + kuiCacheIdx;
+void_t UpdateP16x8MotionInfo (PDqLayer pCurDqLayer, int16_t iMotionVector[LIST_A][30][MV_A],
+                              int8_t iRefIndex[LIST_A][30],
+                              int32_t iPartIdx, int8_t iRef, int16_t iMVs[2]) {
+  const int16_t kiRef2 = (iRef << 8) | iRef; // iRef in both bytes; NOTE(review): a negative iRef sign-extends into the high byte -- confirm callers pass iRef >= 0
+  const int32_t kiMV32 = LD32 (iMVs); // both MV components loaded as one 32-bit value
+  int32_t i;
+  int32_t iMbXy = pCurDqLayer->iMbXyIndex;
+  for (i = 0; i < 2; i++, iPartIdx += 4) { // two passes: iPartIdx, then iPartIdx + 4
+    const uint8_t kuiScan4Idx      = g_kuiScan4[iPartIdx];
+    const uint8_t kuiScan4IdxPlus4 = 4 + kuiScan4Idx;
+    const uint8_t kuiCacheIdx      = g_kuiCache30ScanIdx[iPartIdx];
+    const uint8_t kuiCacheIdxPlus6 = 6 + kuiCacheIdx;
 
-		//mb
-		ST16( &pCurDqLayer->pRefIndex[0][iMbXy][kuiScan4Idx ], kiRef2 );
-		ST16( &pCurDqLayer->pRefIndex[0][iMbXy][kuiScan4IdxPlus4], kiRef2 );
-		ST32( pCurDqLayer->pMv[0][iMbXy][  kuiScan4Idx ], kiMV32 );
-		ST32( pCurDqLayer->pMv[0][iMbXy][1+kuiScan4Idx ], kiMV32 );
-		ST32( pCurDqLayer->pMv[0][iMbXy][  kuiScan4IdxPlus4], kiMV32 );
-		ST32( pCurDqLayer->pMv[0][iMbXy][1+kuiScan4IdxPlus4], kiMV32 );
-		//cache
-		ST16( &iRefIndex[0][kuiCacheIdx ], kiRef2 );
-		ST16( &iRefIndex[0][kuiCacheIdxPlus6], kiRef2 );
-		ST32( iMotionVector[0][  kuiCacheIdx ], kiMV32 );
-		ST32( iMotionVector[0][1+kuiCacheIdx ], kiMV32 );
-		ST32( iMotionVector[0][  kuiCacheIdxPlus6], kiMV32 );
-		ST32( iMotionVector[0][1+kuiCacheIdxPlus6], kiMV32 );
-	}	
+    //mb
+    ST16 (&pCurDqLayer->pRefIndex[0][iMbXy][kuiScan4Idx ], kiRef2);
+    ST16 (&pCurDqLayer->pRefIndex[0][iMbXy][kuiScan4IdxPlus4], kiRef2);
+    ST32 (pCurDqLayer->pMv[0][iMbXy][  kuiScan4Idx ], kiMV32);
+    ST32 (pCurDqLayer->pMv[0][iMbXy][1 + kuiScan4Idx ], kiMV32);
+    ST32 (pCurDqLayer->pMv[0][iMbXy][  kuiScan4IdxPlus4], kiMV32);
+    ST32 (pCurDqLayer->pMv[0][iMbXy][1 + kuiScan4IdxPlus4], kiMV32);
+    //cache
+    ST16 (&iRefIndex[0][kuiCacheIdx ], kiRef2); // the caller's cache arrays get the same values, at +6 offsets instead of +4
+    ST16 (&iRefIndex[0][kuiCacheIdxPlus6], kiRef2);
+    ST32 (iMotionVector[0][  kuiCacheIdx ], kiMV32);
+    ST32 (iMotionVector[0][1 + kuiCacheIdx ], kiMV32);
+    ST32 (iMotionVector[0][  kuiCacheIdxPlus6], kiMV32);
+    ST32 (iMotionVector[0][1 + kuiCacheIdxPlus6], kiMV32);
+  }
 }
 //update iRefIndex and iMVs of both Mb and Mb_cache, only for P8x16
-void_t UpdateP8x16MotionInfo(PDqLayer pCurDqLayer, int16_t iMotionVector[LIST_A][30][MV_A], int8_t iRefIndex[LIST_A][30], 
-							  int32_t iPartIdx, int8_t iRef, int16_t iMVs[2])
-{
-	const int16_t kiRef2 = (iRef << 8) | iRef;
-	const int32_t kiMV32 = LD32(iMVs);
-	int32_t i;
-	int32_t iMbXy = pCurDqLayer->iMbXyIndex;
-	
-	for (i = 0; i < 2; i++, iPartIdx+=8) 
-	{
-		const uint8_t kuiScan4Idx = g_kuiScan4[iPartIdx];
-		const uint8_t kuiCacheIdx = g_kuiCache30ScanIdx[iPartIdx];
-		const uint8_t kuiScan4IdxPlus4= 4 + kuiScan4Idx;
-		const uint8_t kuiCacheIdxPlus6= 6 + kuiCacheIdx;
+void_t UpdateP8x16MotionInfo (PDqLayer pCurDqLayer, int16_t iMotionVector[LIST_A][30][MV_A],
+                              int8_t iRefIndex[LIST_A][30],
+                              int32_t iPartIdx, int8_t iRef, int16_t iMVs[2]) {
+  const int16_t kiRef2 = (iRef << 8) | iRef; // iRef in both bytes; NOTE(review): a negative iRef sign-extends into the high byte -- confirm callers pass iRef >= 0
+  const int32_t kiMV32 = LD32 (iMVs); // both MV components loaded as one 32-bit value
+  int32_t i;
+  int32_t iMbXy = pCurDqLayer->iMbXyIndex;
 
-		//mb
-		ST16( &pCurDqLayer->pRefIndex[0][iMbXy][kuiScan4Idx ], kiRef2 );
-		ST16( &pCurDqLayer->pRefIndex[0][iMbXy][kuiScan4IdxPlus4], kiRef2 );
-		ST32( pCurDqLayer->pMv[0][iMbXy][  kuiScan4Idx ], kiMV32 );
-		ST32( pCurDqLayer->pMv[0][iMbXy][1+kuiScan4Idx ], kiMV32 );
-		ST32( pCurDqLayer->pMv[0][iMbXy][  kuiScan4IdxPlus4], kiMV32 );
-		ST32( pCurDqLayer->pMv[0][iMbXy][1+kuiScan4IdxPlus4], kiMV32 );
-		//cache
-		ST16( &iRefIndex[0][kuiCacheIdx ], kiRef2 );
-		ST16( &iRefIndex[0][kuiCacheIdxPlus6], kiRef2 );
-		ST32( iMotionVector[0][  kuiCacheIdx ], kiMV32 );
-		ST32( iMotionVector[0][1+kuiCacheIdx ], kiMV32 );
-		ST32( iMotionVector[0][  kuiCacheIdxPlus6], kiMV32 );
-		ST32( iMotionVector[0][1+kuiCacheIdxPlus6], kiMV32 );
-	}	
+  for (i = 0; i < 2; i++, iPartIdx += 8) { // two passes: iPartIdx, then iPartIdx + 8
+    const uint8_t kuiScan4Idx = g_kuiScan4[iPartIdx];
+    const uint8_t kuiCacheIdx = g_kuiCache30ScanIdx[iPartIdx];
+    const uint8_t kuiScan4IdxPlus4 = 4 + kuiScan4Idx;
+    const uint8_t kuiCacheIdxPlus6 = 6 + kuiCacheIdx;
+
+    //mb
+    ST16 (&pCurDqLayer->pRefIndex[0][iMbXy][kuiScan4Idx ], kiRef2);
+    ST16 (&pCurDqLayer->pRefIndex[0][iMbXy][kuiScan4IdxPlus4], kiRef2);
+    ST32 (pCurDqLayer->pMv[0][iMbXy][  kuiScan4Idx ], kiMV32);
+    ST32 (pCurDqLayer->pMv[0][iMbXy][1 + kuiScan4Idx ], kiMV32);
+    ST32 (pCurDqLayer->pMv[0][iMbXy][  kuiScan4IdxPlus4], kiMV32);
+    ST32 (pCurDqLayer->pMv[0][iMbXy][1 + kuiScan4IdxPlus4], kiMV32);
+    //cache
+    ST16 (&iRefIndex[0][kuiCacheIdx ], kiRef2); // the caller's cache arrays get the same values, at +6 offsets instead of +4
+    ST16 (&iRefIndex[0][kuiCacheIdxPlus6], kiRef2);
+    ST32 (iMotionVector[0][  kuiCacheIdx ], kiMV32);
+    ST32 (iMotionVector[0][1 + kuiCacheIdx ], kiMV32);
+    ST32 (iMotionVector[0][  kuiCacheIdxPlus6], kiMV32);
+    ST32 (iMotionVector[0][1 + kuiCacheIdxPlus6], kiMV32);
+  }
 }
 
 } // namespace WelsDec
\ No newline at end of file
--- a/codec/decoder/core/src/parse_mb_syn_cavlc.cpp
+++ b/codec/decoder/core/src/parse_mb_syn_cavlc.cpp
@@ -47,590 +47,471 @@
 #include "vlc_decoder.h"
 #include "bit_stream.h"
 #include "ls_defines.h"
-#include "mv_pred.h" 
+#include "mv_pred.h"
 #include "decode_slice.h"
 
 namespace WelsDec {
 
-void_t PredPSkipMvFromNeighbor( PDqLayer pCurLayer, int16_t iMvp[2] )
-{
-	bool_t bTopAvail, bLeftTopAvail, bRightTopAvail, bLeftAvail;
+void_t PredPSkipMvFromNeighbor (PDqLayer pCurLayer, int16_t iMvp[2]) {
+  bool_t bTopAvail, bLeftTopAvail, bRightTopAvail, bLeftAvail;
 
-	int32_t iCurSliceIdc, iTopSliceIdc, iLeftTopSliceIdc, iRightTopSliceIdc, iLeftSliceIdc; 
-	int32_t iLeftTopType, iRightTopType, iTopType, iLeftType;
-	int32_t iCurX, iCurY, iCurXy, iLeftXy, iTopXy, iLeftTopXy, iRightTopXy;
+  int32_t iCurSliceIdc, iTopSliceIdc, iLeftTopSliceIdc, iRightTopSliceIdc, iLeftSliceIdc;
+  int32_t iLeftTopType, iRightTopType, iTopType, iLeftType;
+  int32_t iCurX, iCurY, iCurXy, iLeftXy, iTopXy, iLeftTopXy, iRightTopXy;
 
-	int8_t iLeftRef;
-	int8_t iTopRef;
-	int8_t iRightTopRef;
-	int8_t iLeftTopRef;
-	int8_t iDiagonalRef;
-	int8_t iMatchRef;
-	int16_t iMvA[2], iMvB[2], iMvC[2], iMvD[2];
+  int8_t iLeftRef;
+  int8_t iTopRef;
+  int8_t iRightTopRef;
+  int8_t iLeftTopRef;
+  int8_t iDiagonalRef;
+  int8_t iMatchRef;
+  int16_t iMvA[2], iMvB[2], iMvC[2], iMvD[2];
 
-	iCurXy = pCurLayer->iMbXyIndex;
-	iCurX  = pCurLayer->iMbX;
-	iCurY  = pCurLayer->iMbY;
-	iCurSliceIdc = pCurLayer->pSliceIdc[iCurXy];
+  iCurXy = pCurLayer->iMbXyIndex;
+  iCurX  = pCurLayer->iMbX;
+  iCurY  = pCurLayer->iMbY;
+  iCurSliceIdc = pCurLayer->pSliceIdc[iCurXy];
 
-	if( iCurX != 0)
-	{
-		iLeftXy = iCurXy- 1;
-		iLeftSliceIdc = pCurLayer->pSliceIdc[iLeftXy];
-		bLeftAvail = (iLeftSliceIdc == iCurSliceIdc);
-	}	
-	else
-	{
-		bLeftAvail = 0;
-		bLeftTopAvail = 0;
-	}
-	
-	if( iCurY != 0)
-	{
-		iTopXy = iCurXy - pCurLayer->iMbWidth;
-		iTopSliceIdc = pCurLayer->pSliceIdc[iTopXy];
-		bTopAvail = (iTopSliceIdc == iCurSliceIdc);
-		if (iCurX != 0)
-		{
-			iLeftTopXy = iTopXy - 1;
-			iLeftTopSliceIdc = pCurLayer->pSliceIdc[iLeftTopXy];
-			bLeftTopAvail = (iLeftTopSliceIdc  == iCurSliceIdc);
-		}
-		else
-		{
-			bLeftTopAvail = 0;
-		}
-		if (iCurX != (pCurLayer->iMbWidth-1))
-		{
-			iRightTopXy = iTopXy + 1;
-			iRightTopSliceIdc = pCurLayer->pSliceIdc[iRightTopXy];
-			bRightTopAvail = (iRightTopSliceIdc == iCurSliceIdc);
-		}
-		else
-		{
-			bRightTopAvail = 0;
-		}
-	}
-	else
-	{
-		bTopAvail = 0;
-		bLeftTopAvail = 0;
-		bRightTopAvail = 0;
-	}
+  if (iCurX != 0) {
+    iLeftXy = iCurXy - 1;
+    iLeftSliceIdc = pCurLayer->pSliceIdc[iLeftXy];
+    bLeftAvail = (iLeftSliceIdc == iCurSliceIdc);
+  } else {
+    bLeftAvail = 0;
+    bLeftTopAvail = 0;
+  }
 
-	iLeftType = ((iCurX!=0 && bLeftAvail) ? pCurLayer->pMbType[iLeftXy]: 0);
-	iTopType = ((iCurY!=0 && bTopAvail) ? pCurLayer->pMbType[iTopXy]: 0);
-	iLeftTopType = ((iCurX!=0 &&iCurY!=0 && bLeftTopAvail)
-					? pCurLayer->pMbType[iLeftTopXy]: 0);
-	iRightTopType = ((iCurX!=pCurLayer->iMbWidth-1 &&iCurY!=0 && bRightTopAvail)
-					? pCurLayer->pMbType[iRightTopXy]: 0);
+  if (iCurY != 0) {
+    iTopXy = iCurXy - pCurLayer->iMbWidth;
+    iTopSliceIdc = pCurLayer->pSliceIdc[iTopXy];
+    bTopAvail = (iTopSliceIdc == iCurSliceIdc);
+    if (iCurX != 0) {
+      iLeftTopXy = iTopXy - 1;
+      iLeftTopSliceIdc = pCurLayer->pSliceIdc[iLeftTopXy];
+      bLeftTopAvail = (iLeftTopSliceIdc  == iCurSliceIdc);
+    } else {
+      bLeftTopAvail = 0;
+    }
+    if (iCurX != (pCurLayer->iMbWidth - 1)) {
+      iRightTopXy = iTopXy + 1;
+      iRightTopSliceIdc = pCurLayer->pSliceIdc[iRightTopXy];
+      bRightTopAvail = (iRightTopSliceIdc == iCurSliceIdc);
+    } else {
+      bRightTopAvail = 0;
+    }
+  } else {
+    bTopAvail = 0;
+    bLeftTopAvail = 0;
+    bRightTopAvail = 0;
+  }
 
-	/*get neb mv&iRefIdxArray*/
-	/*left*/
-	if (bLeftAvail && IS_INTER(iLeftType)) 
-	{
-		ST32(iMvA, LD32(pCurLayer->pMv[0][iLeftXy][3]));
-		iLeftRef = pCurLayer->pRefIndex[0][iLeftXy][3];
-	}
-	else
-	{
-		ST32(iMvA, 0);
-		if (0 == bLeftAvail) //not available
-		{
-			iLeftRef = REF_NOT_AVAIL; 
-		}
-		else //available but is intra mb type
-		{
-			iLeftRef = REF_NOT_IN_LIST; 
-		}
-	}
-	if (REF_NOT_AVAIL == iLeftRef ||
-		(0 == iLeftRef && 0 == *(int32_t*)iMvA)) 
-	{
-		ST32( iMvp, 0 );
-		return;
-	}
-	
-	/*top*/
-	if (bTopAvail && IS_INTER(iTopType))
-	{
-		ST32( iMvB, LD32(pCurLayer->pMv[0][iTopXy][12]) );
-		iTopRef = pCurLayer->pRefIndex[0][iTopXy][12];
-	}
-	else
-	{
-		ST32( iMvB, 0 );
-		if (0 == bTopAvail) //not available
-		{
-		    iTopRef = REF_NOT_AVAIL;
-		}
-		else //available but is intra mb type
-		{
-			iTopRef = REF_NOT_IN_LIST;
-		}
-	}
-	if (REF_NOT_AVAIL == iTopRef ||
-		(0 == iTopRef  && 0 == *(int32_t*)iMvB)) 
-	{
-		ST32( iMvp, 0 );
-		return;
-	}
+  iLeftType = ((iCurX != 0 && bLeftAvail) ? pCurLayer->pMbType[iLeftXy] : 0);
+  iTopType = ((iCurY != 0 && bTopAvail) ? pCurLayer->pMbType[iTopXy] : 0);
+  iLeftTopType = ((iCurX != 0 && iCurY != 0 && bLeftTopAvail)
+                  ? pCurLayer->pMbType[iLeftTopXy] : 0);
+  iRightTopType = ((iCurX != pCurLayer->iMbWidth - 1 && iCurY != 0 && bRightTopAvail)
+                   ? pCurLayer->pMbType[iRightTopXy] : 0);
 
-	/*right_top*/
-	if (bRightTopAvail && IS_INTER(iRightTopType))
-	{
-		ST32(iMvC, LD32(pCurLayer->pMv[0][iRightTopXy][12]));
-		iRightTopRef = pCurLayer->pRefIndex[0][iRightTopXy][12];
-	}
-	else
-	{
-		ST32(iMvC, 0);
-		if (0 == bRightTopAvail) //not available
-		{
-			iRightTopRef = REF_NOT_AVAIL;
-		}
-		else //available but is intra mb type
-		{
-			iRightTopRef = REF_NOT_IN_LIST;
-		}			
-	}
+  /*get neb mv&iRefIdxArray*/
+  /*left*/
+  if (bLeftAvail && IS_INTER (iLeftType)) {
+    ST32 (iMvA, LD32 (pCurLayer->pMv[0][iLeftXy][3]));
+    iLeftRef = pCurLayer->pRefIndex[0][iLeftXy][3];
+  } else {
+    ST32 (iMvA, 0);
+    if (0 == bLeftAvail) { //not available
+      iLeftRef = REF_NOT_AVAIL;
+    } else { //available but is intra mb type
+      iLeftRef = REF_NOT_IN_LIST;
+    }
+  }
+  if (REF_NOT_AVAIL == iLeftRef ||
+      (0 == iLeftRef && 0 == * (int32_t*)iMvA)) {
+    ST32 (iMvp, 0);
+    return;
+  }
 
-	/*left_top*/
-	if (bLeftTopAvail && IS_INTER(iLeftTopType))
-	{
-		ST32(iMvD, LD32(pCurLayer->pMv[0][iLeftTopXy][15]));
-		iLeftTopRef = pCurLayer->pRefIndex[0][iLeftTopXy][15];
-	}
-	else
-	{
-		ST32(iMvD, 0);
-		if (0 == bLeftTopAvail) //not available
-		{
-			iLeftTopRef = REF_NOT_AVAIL;
-		}
-		else //available but is intra mb type
-		{
-			iLeftTopRef = REF_NOT_IN_LIST;
-		}			 
-	}
-		
-	iDiagonalRef = iRightTopRef;
-	if (REF_NOT_AVAIL == iDiagonalRef) 
-	{
-		iDiagonalRef = iLeftTopRef;
-		*(int32_t*)iMvC = *(int32_t*)iMvD;
-	}
+  /*top*/
+  if (bTopAvail && IS_INTER (iTopType)) {
+    ST32 (iMvB, LD32 (pCurLayer->pMv[0][iTopXy][12]));
+    iTopRef = pCurLayer->pRefIndex[0][iTopXy][12];
+  } else {
+    ST32 (iMvB, 0);
+    if (0 == bTopAvail) { //not available
+      iTopRef = REF_NOT_AVAIL;
+    } else { //available but is intra mb type
+      iTopRef = REF_NOT_IN_LIST;
+    }
+  }
+  if (REF_NOT_AVAIL == iTopRef ||
+      (0 == iTopRef  && 0 == * (int32_t*)iMvB)) {
+    ST32 (iMvp, 0);
+    return;
+  }
 
-	if (REF_NOT_AVAIL == iTopRef && REF_NOT_AVAIL == iDiagonalRef && iLeftRef >= REF_NOT_IN_LIST) 
-	{
-		ST32(iMvp, LD32(iMvA));
-		return;
-	}
+  /*right_top*/
+  if (bRightTopAvail && IS_INTER (iRightTopType)) {
+    ST32 (iMvC, LD32 (pCurLayer->pMv[0][iRightTopXy][12]));
+    iRightTopRef = pCurLayer->pRefIndex[0][iRightTopXy][12];
+  } else {
+    ST32 (iMvC, 0);
+    if (0 == bRightTopAvail) { //not available
+      iRightTopRef = REF_NOT_AVAIL;
+    } else { //available but is intra mb type
+      iRightTopRef = REF_NOT_IN_LIST;
+    }
+  }
 
-	iMatchRef = (0 == iLeftRef) + (0 == iTopRef) + (0 == iDiagonalRef);	
-	if (1 == iMatchRef) 
-	{
-		if (0 == iLeftRef) 
-		{
-			ST32(iMvp, LD32(iMvA));
-		}
-		else if (0 == iTopRef) 
-		{
-			ST32(iMvp, LD32(iMvB));
-		}
-		else
-		{
-			ST32(iMvp, LD32(iMvC));
-		}
-	}
-	else
-	{
-		iMvp[0] = WelsMedian(iMvA[0], iMvB[0], iMvC[0]);
-		iMvp[1] = WelsMedian(iMvA[1], iMvB[1], iMvC[1]);
-	}
+  /*left_top*/
+  if (bLeftTopAvail && IS_INTER (iLeftTopType)) {
+    ST32 (iMvD, LD32 (pCurLayer->pMv[0][iLeftTopXy][15]));
+    iLeftTopRef = pCurLayer->pRefIndex[0][iLeftTopXy][15];
+  } else {
+    ST32 (iMvD, 0);
+    if (0 == bLeftTopAvail) { //not available
+      iLeftTopRef = REF_NOT_AVAIL;
+    } else { //available but is intra mb type
+      iLeftTopRef = REF_NOT_IN_LIST;
+    }
+  }
+
+  iDiagonalRef = iRightTopRef;
+  if (REF_NOT_AVAIL == iDiagonalRef) {
+    iDiagonalRef = iLeftTopRef;
+    * (int32_t*)iMvC = * (int32_t*)iMvD;
+  }
+
+  if (REF_NOT_AVAIL == iTopRef && REF_NOT_AVAIL == iDiagonalRef && iLeftRef >= REF_NOT_IN_LIST) {
+    ST32 (iMvp, LD32 (iMvA));
+    return;
+  }
+
+  iMatchRef = (0 == iLeftRef) + (0 == iTopRef) + (0 == iDiagonalRef);
+  if (1 == iMatchRef) {
+    if (0 == iLeftRef) {
+      ST32 (iMvp, LD32 (iMvA));
+    } else if (0 == iTopRef) {
+      ST32 (iMvp, LD32 (iMvB));
+    } else {
+      ST32 (iMvp, LD32 (iMvC));
+    }
+  } else {
+    iMvp[0] = WelsMedian (iMvA[0], iMvB[0], iMvC[0]);
+    iMvp[1] = WelsMedian (iMvA[1], iMvB[1], iMvC[1]);
+  }
 }
 
-void_t GetNeighborAvailMbType( PNeighAvail pNeighAvail, PDqLayer pCurLayer )
-{
-	int32_t iCurSliceIdc, iTopSliceIdc, iLeftTopSliceIdc, iRightTopSliceIdc, iLeftSliceIdc;
-	int32_t iCurXy, iTopXy, iLeftXy, iLeftTopXy, iRightTopXy;
-	int32_t iCurX, iCurY;
+void_t GetNeighborAvailMbType (PNeighAvail pNeighAvail, PDqLayer pCurLayer) {
+  int32_t iCurSliceIdc, iTopSliceIdc, iLeftTopSliceIdc, iRightTopSliceIdc, iLeftSliceIdc;
+  int32_t iCurXy, iTopXy, iLeftXy, iLeftTopXy, iRightTopXy;
+  int32_t iCurX, iCurY;
 
-	iCurXy = pCurLayer->iMbXyIndex;
-	iCurX  = pCurLayer->iMbX;
-	iCurY  = pCurLayer->iMbY;
-	iCurSliceIdc = pCurLayer->pSliceIdc[iCurXy];
-	if( iCurX != 0)
-	{
-		iLeftXy = iCurXy- 1;
-		iLeftSliceIdc = pCurLayer->pSliceIdc[iLeftXy];
-		pNeighAvail->iLeftAvail = (iLeftSliceIdc == iCurSliceIdc);
-	}	
-	else
-	{
-		pNeighAvail->iLeftAvail = 0;
-		pNeighAvail->iLeftTopAvail = 0;
-	}
+  iCurXy = pCurLayer->iMbXyIndex;
+  iCurX  = pCurLayer->iMbX;
+  iCurY  = pCurLayer->iMbY;
+  iCurSliceIdc = pCurLayer->pSliceIdc[iCurXy];
+  if (iCurX != 0) {
+    iLeftXy = iCurXy - 1;
+    iLeftSliceIdc = pCurLayer->pSliceIdc[iLeftXy];
+    pNeighAvail->iLeftAvail = (iLeftSliceIdc == iCurSliceIdc);
+  } else {
+    pNeighAvail->iLeftAvail = 0;
+    pNeighAvail->iLeftTopAvail = 0;
+  }
 
-	if( iCurY != 0)
-	{
-		iTopXy = iCurXy - pCurLayer->iMbWidth;
-		iTopSliceIdc = pCurLayer->pSliceIdc[iTopXy];
-		pNeighAvail->iTopAvail = (iTopSliceIdc == iCurSliceIdc);
-		if (iCurX != 0)
-		{
-			iLeftTopXy = iTopXy - 1;
-			iLeftTopSliceIdc = pCurLayer->pSliceIdc[iLeftTopXy];
-			pNeighAvail->iLeftTopAvail = (iLeftTopSliceIdc == iCurSliceIdc);
-		}
-		else
-		{
-			pNeighAvail->iLeftTopAvail = 0;
-		}
-		if (iCurX != (pCurLayer->iMbWidth-1))
-		{
-			iRightTopXy = iTopXy + 1;
-			iRightTopSliceIdc = pCurLayer->pSliceIdc[iRightTopXy];
-			pNeighAvail->iRightTopAvail = (iRightTopSliceIdc == iCurSliceIdc);
-		}
-		else
-		{
-			pNeighAvail->iRightTopAvail = 0;
-		}
-	}
-	else
-	{
-		pNeighAvail->iTopAvail = 0;
-		pNeighAvail->iLeftTopAvail = 0;
-		pNeighAvail->iRightTopAvail = 0;
-	}
+  if (iCurY != 0) {
+    iTopXy = iCurXy - pCurLayer->iMbWidth;
+    iTopSliceIdc = pCurLayer->pSliceIdc[iTopXy];
+    pNeighAvail->iTopAvail = (iTopSliceIdc == iCurSliceIdc);
+    if (iCurX != 0) {
+      iLeftTopXy = iTopXy - 1;
+      iLeftTopSliceIdc = pCurLayer->pSliceIdc[iLeftTopXy];
+      pNeighAvail->iLeftTopAvail = (iLeftTopSliceIdc == iCurSliceIdc);
+    } else {
+      pNeighAvail->iLeftTopAvail = 0;
+    }
+    if (iCurX != (pCurLayer->iMbWidth - 1)) {
+      iRightTopXy = iTopXy + 1;
+      iRightTopSliceIdc = pCurLayer->pSliceIdc[iRightTopXy];
+      pNeighAvail->iRightTopAvail = (iRightTopSliceIdc == iCurSliceIdc);
+    } else {
+      pNeighAvail->iRightTopAvail = 0;
+    }
+  } else {
+    pNeighAvail->iTopAvail = 0;
+    pNeighAvail->iLeftTopAvail = 0;
+    pNeighAvail->iRightTopAvail = 0;
+  }
 
-	pNeighAvail->iLeftType     = ( pNeighAvail->iLeftAvail     ? pCurLayer->pMbType[iLeftXy]     : 0 );
-	pNeighAvail->iTopType      = ( pNeighAvail->iTopAvail      ? pCurLayer->pMbType[iTopXy]      : 0 );
-	pNeighAvail->iLeftTopType  = ( pNeighAvail->iLeftTopAvail  ? pCurLayer->pMbType[iLeftTopXy]  : 0 );
-	pNeighAvail->iRightTopType = ( pNeighAvail->iRightTopAvail ? pCurLayer->pMbType[iRightTopXy] : 0 );
+  pNeighAvail->iLeftType     = (pNeighAvail->iLeftAvail     ? pCurLayer->pMbType[iLeftXy]     : 0);
+  pNeighAvail->iTopType      = (pNeighAvail->iTopAvail      ? pCurLayer->pMbType[iTopXy]      : 0);
+  pNeighAvail->iLeftTopType  = (pNeighAvail->iLeftTopAvail  ? pCurLayer->pMbType[iLeftTopXy]  : 0);
+  pNeighAvail->iRightTopType = (pNeighAvail->iRightTopAvail ? pCurLayer->pMbType[iRightTopXy] : 0);
 }
-void_t WelsFillCacheNonZeroCount(PNeighAvail pNeighAvail, uint8_t* pNonZeroCount, PDqLayer pCurLayer) //no matter slice type, intra_pred_constrained_flag
-{
-	int32_t iCurXy  = pCurLayer->iMbXyIndex;
-	int32_t iTopXy  = 0;
-	int32_t iLeftXy = 0;
+void_t WelsFillCacheNonZeroCount (PNeighAvail pNeighAvail, uint8_t* pNonZeroCount,
+                                  PDqLayer pCurLayer) { //no matter slice type, intra_pred_constrained_flag
+  int32_t iCurXy  = pCurLayer->iMbXyIndex;
+  int32_t iTopXy  = 0;
+  int32_t iLeftXy = 0;
 
-	GetNeighborAvailMbType( pNeighAvail, pCurLayer );
+  GetNeighborAvailMbType (pNeighAvail, pCurLayer);
 
-	if ( pNeighAvail->iTopAvail )
-	{
-		iTopXy = iCurXy - pCurLayer->iMbWidth;
-	}
-	if ( pNeighAvail->iLeftAvail )
-	{
-		iLeftXy = iCurXy - 1;
-	}
+  if (pNeighAvail->iTopAvail) {
+    iTopXy = iCurXy - pCurLayer->iMbWidth;
+  }
+  if (pNeighAvail->iLeftAvail) {
+    iLeftXy = iCurXy - 1;
+  }
 
-	//stuff non_zero_coeff_count from pNeighAvail(left and top)
-	if (pNeighAvail->iTopAvail)
-	{
-		ST32(&pNonZeroCount[1], LD32(&pCurLayer->pNzc[iTopXy][12]));
-        pNonZeroCount[0] = pNonZeroCount[5] = pNonZeroCount[29] = 0;
-		ST16(&pNonZeroCount[6], LD16(&pCurLayer->pNzc[iTopXy][20]));
-		ST16(&pNonZeroCount[30], LD16(&pCurLayer->pNzc[iTopXy][22]));
-	}
-	else
-	{
-		ST32(&pNonZeroCount[1], 0xFFFFFFFFU);
-        pNonZeroCount[0] = pNonZeroCount[5] = pNonZeroCount[29] = 0xFF;
-		ST16(&pNonZeroCount[6], 0xFFFF);
-		ST16(&pNonZeroCount[30], 0xFFFF);
-	}
+  //stuff non_zero_coeff_count from pNeighAvail(left and top)
+  if (pNeighAvail->iTopAvail) {
+    ST32 (&pNonZeroCount[1], LD32 (&pCurLayer->pNzc[iTopXy][12]));
+    pNonZeroCount[0] = pNonZeroCount[5] = pNonZeroCount[29] = 0;
+    ST16 (&pNonZeroCount[6], LD16 (&pCurLayer->pNzc[iTopXy][20]));
+    ST16 (&pNonZeroCount[30], LD16 (&pCurLayer->pNzc[iTopXy][22]));
+  } else {
+    ST32 (&pNonZeroCount[1], 0xFFFFFFFFU);
+    pNonZeroCount[0] = pNonZeroCount[5] = pNonZeroCount[29] = 0xFF;
+    ST16 (&pNonZeroCount[6], 0xFFFF);
+    ST16 (&pNonZeroCount[30], 0xFFFF);
+  }
 
-	if (pNeighAvail->iLeftAvail)
-	{
-		pNonZeroCount[8 * 1] = pCurLayer->pNzc[iLeftXy][3];
-		pNonZeroCount[8 * 2] = pCurLayer->pNzc[iLeftXy][7];
-		pNonZeroCount[8 * 3] = pCurLayer->pNzc[iLeftXy][11];
-		pNonZeroCount[8 * 4] = pCurLayer->pNzc[iLeftXy][15];
+  if (pNeighAvail->iLeftAvail) {
+    pNonZeroCount[8 * 1] = pCurLayer->pNzc[iLeftXy][3];
+    pNonZeroCount[8 * 2] = pCurLayer->pNzc[iLeftXy][7];
+    pNonZeroCount[8 * 3] = pCurLayer->pNzc[iLeftXy][11];
+    pNonZeroCount[8 * 4] = pCurLayer->pNzc[iLeftXy][15];
 
-		pNonZeroCount[5 + 8 * 1] = pCurLayer->pNzc[iLeftXy][17];
-		pNonZeroCount[5 + 8 * 2] = pCurLayer->pNzc[iLeftXy][21];
-		pNonZeroCount[5 + 8 * 4] = pCurLayer->pNzc[iLeftXy][19]; 
-		pNonZeroCount[5 + 8 * 5] = pCurLayer->pNzc[iLeftXy][23];
-	}
-	else 
-	{
-		pNonZeroCount[8 * 1] = 
-		pNonZeroCount[8 * 2] = 
-		pNonZeroCount[8 * 3] = 
-		pNonZeroCount[8 * 4] = -1;//unavailable
+    pNonZeroCount[5 + 8 * 1] = pCurLayer->pNzc[iLeftXy][17];
+    pNonZeroCount[5 + 8 * 2] = pCurLayer->pNzc[iLeftXy][21];
+    pNonZeroCount[5 + 8 * 4] = pCurLayer->pNzc[iLeftXy][19];
+    pNonZeroCount[5 + 8 * 5] = pCurLayer->pNzc[iLeftXy][23];
+  } else {
+    pNonZeroCount[8 * 1] =
+      pNonZeroCount[8 * 2] =
+        pNonZeroCount[8 * 3] =
+          pNonZeroCount[8 * 4] = -1;//unavailable
 
-		pNonZeroCount[5 + 8 * 1] = 
-		pNonZeroCount[5 + 8 * 2] = -1;//unavailable
+    pNonZeroCount[5 + 8 * 1] =
+      pNonZeroCount[5 + 8 * 2] = -1;//unavailable
 
-		pNonZeroCount[5 + 8 * 4] = 
-		pNonZeroCount[5 + 8 * 5] = -1;//unavailable
-	}
-}  
-void_t WelsFillCacheConstrain1Intra4x4(PNeighAvail pNeighAvail, uint8_t* pNonZeroCount, int8_t* pIntraPredMode, PDqLayer pCurLayer) //no matter slice type
-{
-	int32_t iCurXy  = pCurLayer->iMbXyIndex;
-	int32_t iTopXy  = 0;
-	int32_t iLeftXy = 0;
+    pNonZeroCount[5 + 8 * 4] =
+      pNonZeroCount[5 + 8 * 5] = -1;//unavailable
+  }
+}
+void_t WelsFillCacheConstrain1Intra4x4 (PNeighAvail pNeighAvail, uint8_t* pNonZeroCount, int8_t* pIntraPredMode,
+                                        PDqLayer pCurLayer) { //no matter slice type
+  int32_t iCurXy  = pCurLayer->iMbXyIndex;
+  int32_t iTopXy  = 0;
+  int32_t iLeftXy = 0;
 
-	//stuff non_zero_coeff_count from pNeighAvail(left and top)
-	WelsFillCacheNonZeroCount( pNeighAvail, pNonZeroCount, pCurLayer );
+  //stuff non_zero_coeff_count from pNeighAvail(left and top)
+  WelsFillCacheNonZeroCount (pNeighAvail, pNonZeroCount, pCurLayer);
 
-	if ( pNeighAvail->iTopAvail )
-	{
-		iTopXy = iCurXy - pCurLayer->iMbWidth;
-	}
-	if ( pNeighAvail->iLeftAvail )
-	{
-		iLeftXy = iCurXy - 1;
-	}	
+  if (pNeighAvail->iTopAvail) {
+    iTopXy = iCurXy - pCurLayer->iMbWidth;
+  }
+  if (pNeighAvail->iLeftAvail) {
+    iLeftXy = iCurXy - 1;
+  }
 
-	//intra4x4_pred_mode			
-	if (pNeighAvail->iTopAvail && IS_INTRA4x4(pNeighAvail->iTopType)) //top
-	{		
-        ST32(pIntraPredMode+1, LD32(&pCurLayer->pIntraPredMode[iTopXy][0]));
-	}
-	else 
-	{
-		int32_t iPred;
-		if( IS_INTRA16x16( pNeighAvail->iTopType ) || ( MB_TYPE_INTRA_PCM == pNeighAvail->iTopType ) )
-			iPred= 0x02020202;
-		else
-			iPred= 0xffffffff;
-        ST32(pIntraPredMode+1, iPred);
-	}
-
-	if (pNeighAvail->iLeftAvail && IS_INTRA4x4(pNeighAvail->iLeftType)) //left
-	{
-		pIntraPredMode[ 0 + 8    ] = pCurLayer->pIntraPredMode[iLeftXy][4];
-		pIntraPredMode[ 0 + 8 * 2] = pCurLayer->pIntraPredMode[iLeftXy][5];
-		pIntraPredMode[ 0 + 8 * 3] = pCurLayer->pIntraPredMode[iLeftXy][6];
-		pIntraPredMode[ 0 + 8 * 4] = pCurLayer->pIntraPredMode[iLeftXy][3];
-	}
-	else 
-	{	
-		int8_t iPred;
-		if( IS_INTRA16x16( pNeighAvail->iLeftType ) || ( MB_TYPE_INTRA_PCM == pNeighAvail->iLeftType ) )
-			iPred= 2;
-		else
-			iPred= -1;
-		pIntraPredMode[ 0 + 8    ] = 
-		pIntraPredMode[ 0 + 8 * 2] = 
-		pIntraPredMode[ 0 + 8 * 3] = 
-		pIntraPredMode[ 0 + 8 * 4] = iPred;
-	}	
-} 
+  //intra4x4_pred_mode
+  if (pNeighAvail->iTopAvail && IS_INTRA4x4 (pNeighAvail->iTopType)) { //top
+    ST32 (pIntraPredMode + 1, LD32 (&pCurLayer->pIntraPredMode[iTopXy][0]));
+  } else {
+    int32_t iPred;
+    if (IS_INTRA16x16 (pNeighAvail->iTopType) || (MB_TYPE_INTRA_PCM == pNeighAvail->iTopType))
+      iPred = 0x02020202;
+    else
+      iPred = 0xffffffff;
+    ST32 (pIntraPredMode + 1, iPred);
+  }
 
-void_t WelsFillCacheConstrain0Intra4x4(PNeighAvail pNeighAvail, uint8_t* pNonZeroCount, int8_t* pIntraPredMode, PDqLayer pCurLayer) //no matter slice type
-{
-	int32_t iCurXy  = pCurLayer->iMbXyIndex;
-	int32_t iTopXy  = 0;
-	int32_t iLeftXy = 0;
+  if (pNeighAvail->iLeftAvail && IS_INTRA4x4 (pNeighAvail->iLeftType)) { //left
+    pIntraPredMode[ 0 + 8    ] = pCurLayer->pIntraPredMode[iLeftXy][4];
+    pIntraPredMode[ 0 + 8 * 2] = pCurLayer->pIntraPredMode[iLeftXy][5];
+    pIntraPredMode[ 0 + 8 * 3] = pCurLayer->pIntraPredMode[iLeftXy][6];
+    pIntraPredMode[ 0 + 8 * 4] = pCurLayer->pIntraPredMode[iLeftXy][3];
+  } else {
+    int8_t iPred;
+    if (IS_INTRA16x16 (pNeighAvail->iLeftType) || (MB_TYPE_INTRA_PCM == pNeighAvail->iLeftType))
+      iPred = 2;
+    else
+      iPred = -1;
+    pIntraPredMode[ 0 + 8    ] =
+      pIntraPredMode[ 0 + 8 * 2] =
+        pIntraPredMode[ 0 + 8 * 3] =
+          pIntraPredMode[ 0 + 8 * 4] = iPred;
+  }
+}
 
-	//stuff non_zero_coeff_count from pNeighAvail(left and top)
-	WelsFillCacheNonZeroCount( pNeighAvail, pNonZeroCount, pCurLayer );
+void_t WelsFillCacheConstrain0Intra4x4 (PNeighAvail pNeighAvail, uint8_t* pNonZeroCount, int8_t* pIntraPredMode,
+                                        PDqLayer pCurLayer) { //no matter slice type
+  int32_t iCurXy  = pCurLayer->iMbXyIndex;
+  int32_t iTopXy  = 0;
+  int32_t iLeftXy = 0;
 
-	if ( pNeighAvail->iTopAvail )
-	{
-		iTopXy = iCurXy - pCurLayer->iMbWidth;
-	}
-	if ( pNeighAvail->iLeftAvail )
-	{
-		iLeftXy = iCurXy - 1;
-	}	
+  //stuff non_zero_coeff_count from pNeighAvail(left and top)
+  WelsFillCacheNonZeroCount (pNeighAvail, pNonZeroCount, pCurLayer);
 
-	//intra4x4_pred_mode		
-	if (pNeighAvail->iTopAvail && IS_INTRA4x4(pNeighAvail->iTopType)) //top
-	{
-        ST32(pIntraPredMode + 1, LD32(&pCurLayer->pIntraPredMode[iTopXy][0]));
-	}
-	else 
-	{
-		int32_t iPred;
-		if( pNeighAvail->iTopAvail )
-			iPred= 0x02020202;
-		else
-			iPred= 0xffffffff;
-        ST32(pIntraPredMode + 1, iPred);
-	}
+  if (pNeighAvail->iTopAvail) {
+    iTopXy = iCurXy - pCurLayer->iMbWidth;
+  }
+  if (pNeighAvail->iLeftAvail) {
+    iLeftXy = iCurXy - 1;
+  }
 
-	if (pNeighAvail->iLeftAvail && IS_INTRA4x4(pNeighAvail->iLeftType)) //left
-	{
-		pIntraPredMode[ 0 + 8 * 1] = pCurLayer->pIntraPredMode[iLeftXy][4];
-		pIntraPredMode[ 0 + 8 * 2] = pCurLayer->pIntraPredMode[iLeftXy][5];
-		pIntraPredMode[ 0 + 8 * 3] = pCurLayer->pIntraPredMode[iLeftXy][6];
-		pIntraPredMode[ 0 + 8 * 4] = pCurLayer->pIntraPredMode[iLeftXy][3];
-	}
-	else 
-	{	
-		int8_t iPred;
-		if( pNeighAvail->iLeftAvail )
-			iPred= 2;
-		else
-			iPred= -1;
-		pIntraPredMode[ 0 + 8 * 1] = 
-		pIntraPredMode[ 0 + 8 * 2] = 
-		pIntraPredMode[ 0 + 8 * 3] = 
-		pIntraPredMode[ 0 + 8 * 4] = iPred;
-	}
-} 
+  //intra4x4_pred_mode
+  if (pNeighAvail->iTopAvail && IS_INTRA4x4 (pNeighAvail->iTopType)) { //top
+    ST32 (pIntraPredMode + 1, LD32 (&pCurLayer->pIntraPredMode[iTopXy][0]));
+  } else {
+    int32_t iPred;
+    if (pNeighAvail->iTopAvail)
+      iPred = 0x02020202;
+    else
+      iPred = 0xffffffff;
+    ST32 (pIntraPredMode + 1, iPred);
+  }
 
-void_t WelsFillCacheInter(PNeighAvail pNeighAvail, uint8_t* pNonZeroCount, 
-						  int16_t iMvArray[LIST_A][30][MV_A], int8_t iRefIdxArray[LIST_A][30], PDqLayer pCurLayer)
-{
-	int32_t iCurXy      = pCurLayer->iMbXyIndex;
-	int32_t iTopXy      = 0;
-	int32_t iLeftXy     = 0;
-	int32_t iLeftTopXy  = 0;
-	int32_t iRightTopXy = 0;
+  if (pNeighAvail->iLeftAvail && IS_INTRA4x4 (pNeighAvail->iLeftType)) { //left
+    pIntraPredMode[ 0 + 8 * 1] = pCurLayer->pIntraPredMode[iLeftXy][4];
+    pIntraPredMode[ 0 + 8 * 2] = pCurLayer->pIntraPredMode[iLeftXy][5];
+    pIntraPredMode[ 0 + 8 * 3] = pCurLayer->pIntraPredMode[iLeftXy][6];
+    pIntraPredMode[ 0 + 8 * 4] = pCurLayer->pIntraPredMode[iLeftXy][3];
+  } else {
+    int8_t iPred;
+    if (pNeighAvail->iLeftAvail)
+      iPred = 2;
+    else
+      iPred = -1;
+    pIntraPredMode[ 0 + 8 * 1] =
+      pIntraPredMode[ 0 + 8 * 2] =
+        pIntraPredMode[ 0 + 8 * 3] =
+          pIntraPredMode[ 0 + 8 * 4] = iPred;
+  }
+}
 
-	//stuff non_zero_coeff_count from pNeighAvail(left and top)
-	WelsFillCacheNonZeroCount( pNeighAvail, pNonZeroCount, pCurLayer );
+void_t WelsFillCacheInter (PNeighAvail pNeighAvail, uint8_t* pNonZeroCount,
+                           int16_t iMvArray[LIST_A][30][MV_A], int8_t iRefIdxArray[LIST_A][30], PDqLayer pCurLayer) {
+  int32_t iCurXy      = pCurLayer->iMbXyIndex;
+  int32_t iTopXy      = 0;
+  int32_t iLeftXy     = 0;
+  int32_t iLeftTopXy  = 0;
+  int32_t iRightTopXy = 0;
 
-	if ( pNeighAvail->iTopAvail )
-	{
-		iTopXy = iCurXy - pCurLayer->iMbWidth;
-	}
-	if ( pNeighAvail->iLeftAvail )
-	{
-		iLeftXy = iCurXy - 1;
-	}
-	if ( pNeighAvail->iLeftTopAvail )
-	{
-		iLeftTopXy = iCurXy - 1 - pCurLayer->iMbWidth;
-	}
-	if ( pNeighAvail->iRightTopAvail )
-	{
-		iRightTopXy = iCurXy + 1- pCurLayer->iMbWidth;
-	}
+  //stuff non_zero_coeff_count from pNeighAvail(left and top)
+  WelsFillCacheNonZeroCount (pNeighAvail, pNonZeroCount, pCurLayer);
 
-	//stuff mv_cache and iRefIdxArray from left and top (inter)
-	if (pNeighAvail->iLeftAvail && IS_INTER(pNeighAvail->iLeftType)) 
-	{
-		ST32(iMvArray[0][ 6], LD32(pCurLayer->pMv[0][iLeftXy][ 3]));
-		ST32(iMvArray[0][12], LD32(pCurLayer->pMv[0][iLeftXy][ 7]));
-		ST32(iMvArray[0][18], LD32(pCurLayer->pMv[0][iLeftXy][11]));
-		ST32(iMvArray[0][24], LD32(pCurLayer->pMv[0][iLeftXy][15]));
-		iRefIdxArray[0][ 6] = pCurLayer->pRefIndex[0][iLeftXy][ 3];
-		iRefIdxArray[0][12] = pCurLayer->pRefIndex[0][iLeftXy][ 7];
-		iRefIdxArray[0][18] = pCurLayer->pRefIndex[0][iLeftXy][11];
-		iRefIdxArray[0][24] = pCurLayer->pRefIndex[0][iLeftXy][15];
-	}
-	else
-	{
-		ST32(iMvArray[0][ 6], 0);
-		ST32(iMvArray[0][12], 0);
-		ST32(iMvArray[0][18], 0);
-		ST32(iMvArray[0][24], 0);
+  if (pNeighAvail->iTopAvail) {
+    iTopXy = iCurXy - pCurLayer->iMbWidth;
+  }
+  if (pNeighAvail->iLeftAvail) {
+    iLeftXy = iCurXy - 1;
+  }
+  if (pNeighAvail->iLeftTopAvail) {
+    iLeftTopXy = iCurXy - 1 - pCurLayer->iMbWidth;
+  }
+  if (pNeighAvail->iRightTopAvail) {
+    iRightTopXy = iCurXy + 1 - pCurLayer->iMbWidth;
+  }
 
-		if (0 == pNeighAvail->iLeftAvail) //not available
-		{
-			iRefIdxArray[0][ 6] = 
-			iRefIdxArray[0][12] = 
-			iRefIdxArray[0][18] = 
-			iRefIdxArray[0][24] = REF_NOT_AVAIL; 
-		}
-		else //available but is intra mb type
-		{
-			iRefIdxArray[0][ 6] = 
-			iRefIdxArray[0][12] = 
-			iRefIdxArray[0][18] = 
-			iRefIdxArray[0][24] = REF_NOT_IN_LIST;
-		}
-	}
-	if (pNeighAvail->iLeftTopAvail && IS_INTER(pNeighAvail->iLeftTopType))
-	{
-		ST32(iMvArray[0][0], LD32(pCurLayer->pMv[0][iLeftTopXy][15]));
-        iRefIdxArray[0][0] = pCurLayer->pRefIndex[0][iLeftTopXy][15];
-	}
-	else
-	{
-		ST32(iMvArray[0][0], 0);
-		if (0 == pNeighAvail->iLeftTopAvail) //not available
-		{
-			iRefIdxArray[0][0] = REF_NOT_AVAIL;
-		}
-		else //available but is intra mb type
-		{
-			iRefIdxArray[0][0] = REF_NOT_IN_LIST;
-		}			 
-	}
+  //stuff mv_cache and iRefIdxArray from left and top (inter)
+  if (pNeighAvail->iLeftAvail && IS_INTER (pNeighAvail->iLeftType)) {
+    ST32 (iMvArray[0][ 6], LD32 (pCurLayer->pMv[0][iLeftXy][ 3]));
+    ST32 (iMvArray[0][12], LD32 (pCurLayer->pMv[0][iLeftXy][ 7]));
+    ST32 (iMvArray[0][18], LD32 (pCurLayer->pMv[0][iLeftXy][11]));
+    ST32 (iMvArray[0][24], LD32 (pCurLayer->pMv[0][iLeftXy][15]));
+    iRefIdxArray[0][ 6] = pCurLayer->pRefIndex[0][iLeftXy][ 3];
+    iRefIdxArray[0][12] = pCurLayer->pRefIndex[0][iLeftXy][ 7];
+    iRefIdxArray[0][18] = pCurLayer->pRefIndex[0][iLeftXy][11];
+    iRefIdxArray[0][24] = pCurLayer->pRefIndex[0][iLeftXy][15];
+  } else {
+    ST32 (iMvArray[0][ 6], 0);
+    ST32 (iMvArray[0][12], 0);
+    ST32 (iMvArray[0][18], 0);
+    ST32 (iMvArray[0][24], 0);
 
-	if (pNeighAvail->iTopAvail && IS_INTER(pNeighAvail->iTopType))
-	{
-		ST64(iMvArray[0][1], LD64(pCurLayer->pMv[0][iTopXy][12]));
-		ST64(iMvArray[0][3], LD64(pCurLayer->pMv[0][iTopXy][14]));
-        ST32(&iRefIdxArray[0][1], LD32(&pCurLayer->pRefIndex[0][iTopXy][12]));
-	}
-	else
-	{
-		ST64(iMvArray[0][1], 0);
-		ST64(iMvArray[0][3], 0);
+    if (0 == pNeighAvail->iLeftAvail) { //not available
+      iRefIdxArray[0][ 6] =
+        iRefIdxArray[0][12] =
+          iRefIdxArray[0][18] =
+            iRefIdxArray[0][24] = REF_NOT_AVAIL;
+    } else { //available but is intra mb type
+      iRefIdxArray[0][ 6] =
+        iRefIdxArray[0][12] =
+          iRefIdxArray[0][18] =
+            iRefIdxArray[0][24] = REF_NOT_IN_LIST;
+    }
+  }
+  if (pNeighAvail->iLeftTopAvail && IS_INTER (pNeighAvail->iLeftTopType)) {
+    ST32 (iMvArray[0][0], LD32 (pCurLayer->pMv[0][iLeftTopXy][15]));
+    iRefIdxArray[0][0] = pCurLayer->pRefIndex[0][iLeftTopXy][15];
+  } else {
+    ST32 (iMvArray[0][0], 0);
+    if (0 == pNeighAvail->iLeftTopAvail) { //not available
+      iRefIdxArray[0][0] = REF_NOT_AVAIL;
+    } else { //available but is intra mb type
+      iRefIdxArray[0][0] = REF_NOT_IN_LIST;
+    }
+  }
 
-		if (0 == pNeighAvail->iTopAvail) //not available
-		{
-			iRefIdxArray[0][1] = 
-			iRefIdxArray[0][2] = 
-			iRefIdxArray[0][3] = 
-			iRefIdxArray[0][4] = REF_NOT_AVAIL;
-		}
-		else //available but is intra mb type
-		{
-			iRefIdxArray[0][1] = 
-			iRefIdxArray[0][2] = 
-			iRefIdxArray[0][3] = 
-			iRefIdxArray[0][4] = REF_NOT_IN_LIST;
-		}
-	}
+  if (pNeighAvail->iTopAvail && IS_INTER (pNeighAvail->iTopType)) {
+    ST64 (iMvArray[0][1], LD64 (pCurLayer->pMv[0][iTopXy][12]));
+    ST64 (iMvArray[0][3], LD64 (pCurLayer->pMv[0][iTopXy][14]));
+    ST32 (&iRefIdxArray[0][1], LD32 (&pCurLayer->pRefIndex[0][iTopXy][12]));
+  } else {
+    ST64 (iMvArray[0][1], 0);
+    ST64 (iMvArray[0][3], 0);
 
-	if (pNeighAvail->iRightTopAvail && IS_INTER(pNeighAvail->iRightTopType))
-	{
-		ST32(iMvArray[0][5], LD32(pCurLayer->pMv[0][iRightTopXy][12]));
-		iRefIdxArray[0][5] = pCurLayer->pRefIndex[0][iRightTopXy][12];
-	}
-	else
-	{
-		ST32(iMvArray[0][5], 0);
-		if (0 == pNeighAvail->iRightTopAvail) //not available
-		{
-			iRefIdxArray[0][5] = REF_NOT_AVAIL;
-		}
-		else //available but is intra mb type
-		{
-			iRefIdxArray[0][5] = REF_NOT_IN_LIST;
-		}			
-	}
+    if (0 == pNeighAvail->iTopAvail) { //not available
+      iRefIdxArray[0][1] =
+        iRefIdxArray[0][2] =
+          iRefIdxArray[0][3] =
+            iRefIdxArray[0][4] = REF_NOT_AVAIL;
+    } else { //available but is intra mb type
+      iRefIdxArray[0][1] =
+        iRefIdxArray[0][2] =
+          iRefIdxArray[0][3] =
+            iRefIdxArray[0][4] = REF_NOT_IN_LIST;
+    }
+  }
 
-	//right-top 4*4 block unavailable
-	ST32(iMvArray[0][ 9], 0);
-	ST32(iMvArray[0][21], 0);
-	ST32(iMvArray[0][11], 0);
-	ST32(iMvArray[0][17], 0);
-	ST32(iMvArray[0][23], 0);
-	iRefIdxArray[0][ 9] = 
-	iRefIdxArray[0][21] = 
-	iRefIdxArray[0][11] =
-	iRefIdxArray[0][17] =
-	iRefIdxArray[0][23] = REF_NOT_AVAIL;
-} 
+  if (pNeighAvail->iRightTopAvail && IS_INTER (pNeighAvail->iRightTopType)) {
+    ST32 (iMvArray[0][5], LD32 (pCurLayer->pMv[0][iRightTopXy][12]));
+    iRefIdxArray[0][5] = pCurLayer->pRefIndex[0][iRightTopXy][12];
+  } else {
+    ST32 (iMvArray[0][5], 0);
+    if (0 == pNeighAvail->iRightTopAvail) { //not available
+      iRefIdxArray[0][5] = REF_NOT_AVAIL;
+    } else { //available but is intra mb type
+      iRefIdxArray[0][5] = REF_NOT_IN_LIST;
+    }
+  }
 
-int32_t PredIntra4x4Mode(int8_t* pIntraPredMode, int32_t iIdx4)
-{
-	int8_t iTopMode  = pIntraPredMode[g_kuiScan8[iIdx4] - 8];
-	int8_t iLeftMode = pIntraPredMode[g_kuiScan8[iIdx4] - 1];
-	int8_t iBestMode;
+  //right-top 4*4 block unavailable
+  ST32 (iMvArray[0][ 9], 0);
+  ST32 (iMvArray[0][21], 0);
+  ST32 (iMvArray[0][11], 0);
+  ST32 (iMvArray[0][17], 0);
+  ST32 (iMvArray[0][23], 0);
+  iRefIdxArray[0][ 9] =
+    iRefIdxArray[0][21] =
+      iRefIdxArray[0][11] =
+        iRefIdxArray[0][17] =
+          iRefIdxArray[0][23] = REF_NOT_AVAIL;
+}
 
-	if (-1 == iLeftMode || -1 == iTopMode)
-	{
-		iBestMode = 2;
-	}
-	else
-	{	
-		iBestMode = WELS_MIN(iLeftMode, iTopMode);
-	}
-	return iBestMode;
+int32_t PredIntra4x4Mode (int8_t* pIntraPredMode, int32_t iIdx4) {
+  int8_t iTopMode  = pIntraPredMode[g_kuiScan8[iIdx4] - 8];
+  int8_t iLeftMode = pIntraPredMode[g_kuiScan8[iIdx4] - 1];
+  int8_t iBestMode;
+
+  if (-1 == iLeftMode || -1 == iTopMode) {
+    iBestMode = 2;
+  } else {
+    iBestMode = WELS_MIN (iLeftMode, iTopMode);
+  }
+  return iBestMode;
 }
 
 #define MAX_PRED_MODE_ID_I16x16  3
@@ -653,975 +534,815 @@
                       (d >= g_ksI4PredInfo[a].iLeftTopAvail));
 
 
-int32_t CheckIntra16x16PredMode(uint8_t uiSampleAvail, int8_t* pMode)
-{
-	int32_t iLeftAvail     = uiSampleAvail & 0x04;
-	int32_t bLeftTopAvail  = uiSampleAvail & 0x02;
-	int32_t iTopAvail      = uiSampleAvail & 0x01;
+int32_t CheckIntra16x16PredMode (uint8_t uiSampleAvail, int8_t* pMode) {
+  int32_t iLeftAvail     = uiSampleAvail & 0x04;
+  int32_t bLeftTopAvail  = uiSampleAvail & 0x02;
+  int32_t iTopAvail      = uiSampleAvail & 0x01;
 
-	if (*pMode > MAX_PRED_MODE_ID_I16x16)
-	{
-		return ERR_INFO_INVALID_I16x16_PRED_MODE;
-	}
+  if (*pMode > MAX_PRED_MODE_ID_I16x16) {
+    return ERR_INFO_INVALID_I16x16_PRED_MODE;
+  }
 
-	if (I16_PRED_DC == *pMode) 
-	{
-		if (iLeftAvail && iTopAvail) 
-		{
-			return 0;
-		}
-		else if (iLeftAvail) 
-		{
-			*pMode = I16_PRED_DC_L;
-		}
-		else if (iTopAvail) 
-		{
-			*pMode = I16_PRED_DC_T;
-		}
-		else
-		{
-			*pMode = I16_PRED_DC_128;
-		}
-	}
-	else 
-	{
-		bool_t bModeAvail = CHECK_I16_MODE(*pMode, iLeftAvail, iTopAvail, bLeftTopAvail);
-		if (0 == bModeAvail) 
-		{
-			return ERR_INFO_INVALID_I16x16_PRED_MODE;
-		}
-	}
-	return 0;
+  if (I16_PRED_DC == *pMode) {
+    if (iLeftAvail && iTopAvail) {
+      return 0;
+    } else if (iLeftAvail) {
+      *pMode = I16_PRED_DC_L;
+    } else if (iTopAvail) {
+      *pMode = I16_PRED_DC_T;
+    } else {
+      *pMode = I16_PRED_DC_128;
+    }
+  } else {
+    bool_t bModeAvail = CHECK_I16_MODE (*pMode, iLeftAvail, iTopAvail, bLeftTopAvail);
+    if (0 == bModeAvail) {
+      return ERR_INFO_INVALID_I16x16_PRED_MODE;
+    }
+  }
+  return 0;
 }
 
 
-int32_t CheckIntraChromaPredMode(uint8_t uiSampleAvail, int8_t* pMode)
-{
-	int32_t iLeftAvail     = uiSampleAvail & 0x04;
-	int32_t bLeftTopAvail  = uiSampleAvail & 0x02;
-	int32_t iTopAvail      = uiSampleAvail & 0x01;
+int32_t CheckIntraChromaPredMode (uint8_t uiSampleAvail, int8_t* pMode) {
+  int32_t iLeftAvail     = uiSampleAvail & 0x04;
+  int32_t bLeftTopAvail  = uiSampleAvail & 0x02;
+  int32_t iTopAvail      = uiSampleAvail & 0x01;
 
-	if (*pMode > MAX_PRED_MODE_ID_CHROMA)
-	{
-		return ERR_INFO_INVALID_I_CHROMA_PRED_MODE;
-	}
+  if (*pMode > MAX_PRED_MODE_ID_CHROMA) {
+    return ERR_INFO_INVALID_I_CHROMA_PRED_MODE;
+  }
 
-	if (C_PRED_DC == *pMode) 
-	{
-		if (iLeftAvail && iTopAvail) 
-		{
-			return 0;
-		}
-		else if (iLeftAvail) 
-		{
-			*pMode = C_PRED_DC_L;
-		}
-		else if (iTopAvail) 
-		{
-			*pMode = C_PRED_DC_T;
-		}
-		else
-		{
-			*pMode = C_PRED_DC_128;
-		}
-	}
-	else 
-	{
-		bool_t bModeAvail = CHECK_CHROMA_MODE(*pMode, iLeftAvail, iTopAvail, bLeftTopAvail);
-		if (0 == bModeAvail) 
-		{
-			return ERR_INFO_INVALID_I_CHROMA_PRED_MODE;
-		}
-	}
-	return 0;
+  if (C_PRED_DC == *pMode) {
+    if (iLeftAvail && iTopAvail) {
+      return 0;
+    } else if (iLeftAvail) {
+      *pMode = C_PRED_DC_L;
+    } else if (iTopAvail) {
+      *pMode = C_PRED_DC_T;
+    } else {
+      *pMode = C_PRED_DC_128;
+    }
+  } else {
+    bool_t bModeAvail = CHECK_CHROMA_MODE (*pMode, iLeftAvail, iTopAvail, bLeftTopAvail);
+    if (0 == bModeAvail) {
+      return ERR_INFO_INVALID_I_CHROMA_PRED_MODE;
+    }
+  }
+  return 0;
 }
 
-int32_t CheckIntra4x4PredMode(int32_t* pSampleAvail, int8_t* pMode, int32_t iIndex)
-{
-	int8_t iIdx = g_kuiCache30ScanIdx[iIndex];
-	int32_t iLeftAvail     = pSampleAvail[iIdx-1];
-	int32_t iTopAvail      = pSampleAvail[iIdx-6];
-	int32_t bLeftTopAvail  = pSampleAvail[iIdx-7];	
-	int32_t bRightTopAvail = pSampleAvail[iIdx-5];
+int32_t CheckIntra4x4PredMode (int32_t* pSampleAvail, int8_t* pMode, int32_t iIndex) {
+  int8_t iIdx = g_kuiCache30ScanIdx[iIndex];
+  int32_t iLeftAvail     = pSampleAvail[iIdx - 1];
+  int32_t iTopAvail      = pSampleAvail[iIdx - 6];
+  int32_t bLeftTopAvail  = pSampleAvail[iIdx - 7];
+  int32_t bRightTopAvail = pSampleAvail[iIdx - 5];
 
-	int8_t iFinalMode;
+  int8_t iFinalMode;
 
-	if (*pMode > MAX_PRED_MODE_ID_I4x4) 
-	{
-		return -1;
-	}
+  if (*pMode > MAX_PRED_MODE_ID_I4x4) {
+    return -1;
+  }
 
-	if (I4_PRED_DC == *pMode) 
-	{
-		if (iLeftAvail && iTopAvail) 
-		{
-			return *pMode;
-		}
-		else if (iLeftAvail) 
-		{
-			iFinalMode = I4_PRED_DC_L;
-		}
-		else if (iTopAvail) 
-		{
-			iFinalMode = I4_PRED_DC_T;
-		}
-		else
-		{
-			iFinalMode = I4_PRED_DC_128;
-		}
-	}
-	else
-	{
-		bool_t bModeAvail = CHECK_I4_MODE(*pMode, iLeftAvail, iTopAvail, bLeftTopAvail);
-		if (0 == bModeAvail) 
-		{
-			return -1;
-		}
+  if (I4_PRED_DC == *pMode) {
+    if (iLeftAvail && iTopAvail) {
+      return *pMode;
+    } else if (iLeftAvail) {
+      iFinalMode = I4_PRED_DC_L;
+    } else if (iTopAvail) {
+      iFinalMode = I4_PRED_DC_T;
+    } else {
+      iFinalMode = I4_PRED_DC_128;
+    }
+  } else {
+    bool_t bModeAvail = CHECK_I4_MODE (*pMode, iLeftAvail, iTopAvail, bLeftTopAvail);
+    if (0 == bModeAvail) {
+      return -1;
+    }
 
-		iFinalMode = *pMode;
+    iFinalMode = *pMode;
 
-		//if right-top unavailable, modify mode DDL and VL (padding rightmost pixel of top)  
-		if (I4_PRED_DDL == iFinalMode && 0 == bRightTopAvail)
-		{
-			iFinalMode = I4_PRED_DDL_TOP;
-		}
-		else if (I4_PRED_VL == iFinalMode && 0 == bRightTopAvail)
-		{
-			iFinalMode = I4_PRED_VL_TOP;
-		}		
-	}		
-	return iFinalMode;
+    //if the right-top neighbor is unavailable, remap DDL and VL to their *_TOP variants (padding with the rightmost pixel of top)
+    if (I4_PRED_DDL == iFinalMode && 0 == bRightTopAvail) {
+      iFinalMode = I4_PRED_DDL_TOP;
+    } else if (I4_PRED_VL == iFinalMode && 0 == bRightTopAvail) {
+      iFinalMode = I4_PRED_VL_TOP;
+    }
+  }
+  return iFinalMode;
 }
 
-void_t BsStartCavlc( PBitStringAux pBs )
-{
-	pBs->iIndex = ((pBs->pCurBuf - pBs->pStartBuf)<<3) - (16 - pBs->iLeftBits);
+void_t BsStartCavlc (PBitStringAux pBs) {
+  pBs->iIndex = ((pBs->pCurBuf - pBs->pStartBuf) << 3) - (16 - pBs->iLeftBits);
 }
-void_t BsEndCavlc( PBitStringAux pBs )
-{
-	pBs->pCurBuf   = pBs->pStartBuf + (pBs->iIndex>>3);
-	pBs->uiCurBits = ((((pBs->pCurBuf[0] << 8) | pBs->pCurBuf[1]) << 16) | (pBs->pCurBuf[2] << 8) | pBs->pCurBuf[3]) << (pBs->iIndex & 0x07);
-	pBs->pCurBuf  += 4;
-	pBs->iLeftBits = -16 + (pBs->iIndex&0x07);
+void_t BsEndCavlc (PBitStringAux pBs) {
+  pBs->pCurBuf   = pBs->pStartBuf + (pBs->iIndex >> 3);
+  pBs->uiCurBits = ((((pBs->pCurBuf[0] << 8) | pBs->pCurBuf[1]) << 16) | (pBs->pCurBuf[2] << 8) | pBs->pCurBuf[3]) <<
+                   (pBs->iIndex & 0x07);
+  pBs->pCurBuf  += 4;
+  pBs->iLeftBits = -16 + (pBs->iIndex & 0x07);
 }
 
 
-// return: used bits	
-static int32_t CavlcGetTrailingOnesAndTotalCoeff(uint8_t &uiTotalCoeff, uint8_t &uiTrailingOnes, SReadBitsCache *pBitsCache, SVlcTable* pVlcTable, bool_t bChromaDc, int8_t nC)
-{
-	const uint8_t *kpVlcTableMoreBitsCountList[3] = {g_kuiVlcTableMoreBitsCount0, g_kuiVlcTableMoreBitsCount1, g_kuiVlcTableMoreBitsCount2}; 
-    int32_t iUsedBits = 0;
-	int32_t iIndexVlc, iIndexValue, iNcMapIdx;
-	uint32_t uiCount;
-	uint32_t uiValue;
+// returns the number of bits consumed from pBitsCache
+static int32_t CavlcGetTrailingOnesAndTotalCoeff (uint8_t& uiTotalCoeff, uint8_t& uiTrailingOnes,
+    SReadBitsCache* pBitsCache, SVlcTable* pVlcTable, bool_t bChromaDc, int8_t nC) {
+  const uint8_t* kpVlcTableMoreBitsCountList[3] = {g_kuiVlcTableMoreBitsCount0, g_kuiVlcTableMoreBitsCount1, g_kuiVlcTableMoreBitsCount2};
+  int32_t iUsedBits = 0;
+  int32_t iIndexVlc, iIndexValue, iNcMapIdx;
+  uint32_t uiCount;
+  uint32_t uiValue;
 
-    if (bChromaDc)
-	{		
-		uiValue        = pBitsCache->uiCache32Bit >> 24;
-		iIndexVlc      = pVlcTable->kpChromaCoeffTokenVlcTable[uiValue][0];
-		uiCount        = pVlcTable->kpChromaCoeffTokenVlcTable[uiValue][1];
-		POP_BUFFER(pBitsCache, uiCount);
-		iUsedBits     += uiCount;
-		uiTrailingOnes = g_kuiVlcTrailingOneTotalCoeffTable[iIndexVlc][0];
-		uiTotalCoeff   = g_kuiVlcTrailingOneTotalCoeffTable[iIndexVlc][1];
-	}
-	else //luma
-	{
-		iNcMapIdx = g_kuiNcMapTable[nC];
-		if ( iNcMapIdx<= 2 )
-		{
-			uiValue = pBitsCache->uiCache32Bit >> 24;
-			if ( uiValue < g_kuiVlcTableNeedMoreBitsThread[iNcMapIdx] )
-			{					
-				POP_BUFFER(pBitsCache, 8);
-				iUsedBits  += 8;
-				iIndexValue = pBitsCache->uiCache32Bit >> ( 32 - kpVlcTableMoreBitsCountList[iNcMapIdx][uiValue]);
-				iIndexVlc   = pVlcTable->kpCoeffTokenVlcTable[iNcMapIdx+1][uiValue][iIndexValue][0];
-				uiCount     = pVlcTable->kpCoeffTokenVlcTable[iNcMapIdx+1][uiValue][iIndexValue][1];						
-				POP_BUFFER(pBitsCache, uiCount);
-				iUsedBits  += uiCount;
-			}
-			else
-			{
-				iIndexVlc  = pVlcTable->kpCoeffTokenVlcTable[0][iNcMapIdx][uiValue][0];
-				uiCount    = pVlcTable->kpCoeffTokenVlcTable[0][iNcMapIdx][uiValue][1];						
-				uiValue    = pBitsCache->uiCache32Bit >> (32 - uiCount);
-				POP_BUFFER(pBitsCache, uiCount);
-				iUsedBits += uiCount;
-			}
-		}
-		else
-		{
-			uiValue    = pBitsCache->uiCache32Bit >> (32 - 6);			
-			POP_BUFFER(pBitsCache, 6);
-			iUsedBits += 6;
-			iIndexVlc  = pVlcTable->kpCoeffTokenVlcTable[0][3][uiValue][0];  //differ
-		}		
-		uiTrailingOnes= g_kuiVlcTrailingOneTotalCoeffTable[iIndexVlc][0];
-		uiTotalCoeff  = g_kuiVlcTrailingOneTotalCoeffTable[iIndexVlc][1];
-	}
+  if (bChromaDc) {
+    uiValue        = pBitsCache->uiCache32Bit >> 24;
+    iIndexVlc      = pVlcTable->kpChromaCoeffTokenVlcTable[uiValue][0];
+    uiCount        = pVlcTable->kpChromaCoeffTokenVlcTable[uiValue][1];
+    POP_BUFFER (pBitsCache, uiCount);
+    iUsedBits     += uiCount;
+    uiTrailingOnes = g_kuiVlcTrailingOneTotalCoeffTable[iIndexVlc][0];
+    uiTotalCoeff   = g_kuiVlcTrailingOneTotalCoeffTable[iIndexVlc][1];
+  } else { //not chroma DC (luma or chroma AC)
+    iNcMapIdx = g_kuiNcMapTable[nC];
+    if (iNcMapIdx <= 2) {
+      uiValue = pBitsCache->uiCache32Bit >> 24;
+      if (uiValue < g_kuiVlcTableNeedMoreBitsThread[iNcMapIdx]) {
+        POP_BUFFER (pBitsCache, 8);
+        iUsedBits  += 8;
+        iIndexValue = pBitsCache->uiCache32Bit >> (32 - kpVlcTableMoreBitsCountList[iNcMapIdx][uiValue]);
+        iIndexVlc   = pVlcTable->kpCoeffTokenVlcTable[iNcMapIdx + 1][uiValue][iIndexValue][0];
+        uiCount     = pVlcTable->kpCoeffTokenVlcTable[iNcMapIdx + 1][uiValue][iIndexValue][1];
+        POP_BUFFER (pBitsCache, uiCount);
+        iUsedBits  += uiCount;
+      } else {
+        iIndexVlc  = pVlcTable->kpCoeffTokenVlcTable[0][iNcMapIdx][uiValue][0];
+        uiCount    = pVlcTable->kpCoeffTokenVlcTable[0][iNcMapIdx][uiValue][1];
+        uiValue    = pBitsCache->uiCache32Bit >> (32 - uiCount);
+        POP_BUFFER (pBitsCache, uiCount);
+        iUsedBits += uiCount;
+      }
+    } else {
+      uiValue    = pBitsCache->uiCache32Bit >> (32 - 6);
+      POP_BUFFER (pBitsCache, 6);
+      iUsedBits += 6;
+      iIndexVlc  = pVlcTable->kpCoeffTokenVlcTable[0][3][uiValue][0];  //differ
+    }
+    uiTrailingOnes = g_kuiVlcTrailingOneTotalCoeffTable[iIndexVlc][0];
+    uiTotalCoeff  = g_kuiVlcTrailingOneTotalCoeffTable[iIndexVlc][1];
+  }
 
-	return iUsedBits;
+  return iUsedBits;
 }
 
-static int32_t CavlcGetLevelVal(int32_t iLevel[16], SReadBitsCache *pBitsCache, uint8_t uiTotalCoeff, uint8_t uiTrailingOnes)
-{
-    int32_t i, iUsedBits = 0;
-    int32_t iSuffixLength, iSuffixLengthSize, iLevelPrefix, iPrefixBits, iLevelCode, iThreshold;
-    uint32_t uiCache32Bit;
-	for (i = 0; i < uiTrailingOnes; i++) 
-	{		
-		iLevel[i] = 1 - ((pBitsCache->uiCache32Bit >> (30 - i)) & 0x02);
-	}		
-	POP_BUFFER(pBitsCache, uiTrailingOnes);
-	iUsedBits += uiTrailingOnes;
-		
-	iSuffixLength = (uiTotalCoeff > 10 && uiTrailingOnes < 3);
-	
-	for (; i < uiTotalCoeff; i++) 
-	{		
-		if(pBitsCache->uiRemainBits <= 16)		SHIFT_BUFFER(pBitsCache);
+static int32_t CavlcGetLevelVal (int32_t iLevel[16], SReadBitsCache* pBitsCache, uint8_t uiTotalCoeff,
+                                 uint8_t uiTrailingOnes) {
+  int32_t i, iUsedBits = 0;
+  int32_t iSuffixLength, iSuffixLengthSize, iLevelPrefix, iPrefixBits, iLevelCode, iThreshold;
+  uint32_t uiCache32Bit;
+  for (i = 0; i < uiTrailingOnes; i++) {
+    iLevel[i] = 1 - ((pBitsCache->uiCache32Bit >> (30 - i)) & 0x02);
+  }
+  POP_BUFFER (pBitsCache, uiTrailingOnes);
+  iUsedBits += uiTrailingOnes;
+
+  iSuffixLength = (uiTotalCoeff > 10 && uiTrailingOnes < 3);
+
+  for (; i < uiTotalCoeff; i++) {
+    if (pBitsCache->uiRemainBits <= 16)		SHIFT_BUFFER (pBitsCache);
 #ifdef WIN32
-        uiCache32Bit = pBitsCache->uiCache32Bit;
-		WELS_GET_PREFIX_BITS(uiCache32Bit,iPrefixBits);
+    uiCache32Bit = pBitsCache->uiCache32Bit;
+    WELS_GET_PREFIX_BITS (uiCache32Bit, iPrefixBits);
 #else
-		iPrefixBits = GetPrefixBits(pBitsCache->uiCache32Bit);
+    iPrefixBits = GetPrefixBits (pBitsCache->uiCache32Bit);
 #endif
-		POP_BUFFER(pBitsCache, iPrefixBits);
-		iUsedBits   += iPrefixBits;
-		iLevelPrefix = iPrefixBits - 1;
+    POP_BUFFER (pBitsCache, iPrefixBits);
+    iUsedBits   += iPrefixBits;
+    iLevelPrefix = iPrefixBits - 1;
 
-		iLevelCode = (WELS_MIN(15, iLevelPrefix)) << iSuffixLength; //differ
-		iSuffixLengthSize = iSuffixLength;	
+    iLevelCode = (WELS_MIN (15, iLevelPrefix)) << iSuffixLength; //differ
+    iSuffixLengthSize = iSuffixLength;
 
-		if (iLevelPrefix >= 14) 
-		{	
-			if (14 == iLevelPrefix && 0 == iSuffixLength)
-				iSuffixLengthSize = 4;
-			else if (15 == iLevelPrefix)
-				iSuffixLengthSize = 12;
-			else if(iLevelPrefix > 15)
-				iLevelCode += (1 << (iLevelPrefix - 3)) - 4096;
+    if (iLevelPrefix >= 14) {
+      if (14 == iLevelPrefix && 0 == iSuffixLength)
+        iSuffixLengthSize = 4;
+      else if (15 == iLevelPrefix)
+        iSuffixLengthSize = 12;
+      else if (iLevelPrefix > 15)
+        iLevelCode += (1 << (iLevelPrefix - 3)) - 4096;
 
-			if (iLevelPrefix >= 15 && iSuffixLength == 0) 
-				iLevelCode += 15;
-		}
+      if (iLevelPrefix >= 15 && iSuffixLength == 0)
+        iLevelCode += 15;
+    }
 
-		if(iSuffixLengthSize > 0) 
-		{
-			if(pBitsCache->uiRemainBits <= iSuffixLengthSize) SHIFT_BUFFER(pBitsCache);	
-			if(pBitsCache->uiRemainBits <= iSuffixLengthSize) 
-			return 0;
-			iLevelCode += (pBitsCache->uiCache32Bit >> (32 - iSuffixLengthSize)); 
-			POP_BUFFER(pBitsCache, iSuffixLengthSize);
-			iUsedBits  += iSuffixLengthSize;
-		}
+    if (iSuffixLengthSize > 0) {
+      if (pBitsCache->uiRemainBits <= iSuffixLengthSize) SHIFT_BUFFER (pBitsCache);
+      if (pBitsCache->uiRemainBits <= iSuffixLengthSize)
+        return 0;
+      iLevelCode += (pBitsCache->uiCache32Bit >> (32 - iSuffixLengthSize));
+      POP_BUFFER (pBitsCache, iSuffixLengthSize);
+      iUsedBits  += iSuffixLengthSize;
+    }
 
-		iLevelCode += ((i == uiTrailingOnes) && (uiTrailingOnes < 3)) << 1;
-		iLevel[i]   = ((iLevelCode + 2) >> 1);
-		iLevel[i]  -= (iLevel[i] << 1) & (-(iLevelCode & 0x01));
+    iLevelCode += ((i == uiTrailingOnes) && (uiTrailingOnes < 3)) << 1;
+    iLevel[i]   = ((iLevelCode + 2) >> 1);
+    iLevel[i]  -= (iLevel[i] << 1) & (- (iLevelCode & 0x01));
 
-		iSuffixLength += !iSuffixLength;
-		iThreshold     = 3 << ( iSuffixLength - 1 );
-		iSuffixLength += ((iLevel[i] > iThreshold) || (iLevel[i] < -iThreshold)) && (iSuffixLength < 6);	
-	}
+    iSuffixLength += !iSuffixLength;
+    iThreshold     = 3 << (iSuffixLength - 1);
+    iSuffixLength += ((iLevel[i] > iThreshold) || (iLevel[i] < -iThreshold)) && (iSuffixLength < 6);
+  }
 
-	return iUsedBits;
+  return iUsedBits;
 }
 
-static int32_t CavlcGetTotalZeros(int32_t &iZerosLeft, SReadBitsCache *pBitsCache, uint8_t uiTotalCoeff, SVlcTable* pVlcTable, bool_t bChromaDc)
-{
-	int32_t iCount, iUsedBits = 0;
-	const uint8_t *kpBitNumMap;
-	uint32_t uiValue;
+static int32_t CavlcGetTotalZeros (int32_t& iZerosLeft, SReadBitsCache* pBitsCache, uint8_t uiTotalCoeff,
+                                   SVlcTable* pVlcTable, bool_t bChromaDc) {
+  int32_t iCount, iUsedBits = 0;
+  const uint8_t* kpBitNumMap;
+  uint32_t uiValue;
 
-	int32_t iTotalZeroVlcIdx;
-	uint8_t uiTableType;
-	//chroma_dc (0 < uiTotalCoeff < 4); others (chroma_ac or luma: 0 < uiTotalCoeff < 16)
+  int32_t iTotalZeroVlcIdx;
+  uint8_t uiTableType;
+  //chroma_dc (0 < uiTotalCoeff < 4); others (chroma_ac or luma: 0 < uiTotalCoeff < 16)
 
-	if ( bChromaDc )
-	{
-		iTotalZeroVlcIdx = uiTotalCoeff;
-		kpBitNumMap = g_kuiTotalZerosBitNumChromaMap;
-		uiTableType = bChromaDc;
-	} 
-	else
-	{
-		iTotalZeroVlcIdx = uiTotalCoeff;
-		kpBitNumMap = g_kuiTotalZerosBitNumMap;
-		uiTableType = 0;
-	}
+  if (bChromaDc) {
+    iTotalZeroVlcIdx = uiTotalCoeff;
+    kpBitNumMap = g_kuiTotalZerosBitNumChromaMap;
+    uiTableType = bChromaDc;
+  } else {
+    iTotalZeroVlcIdx = uiTotalCoeff;
+    kpBitNumMap = g_kuiTotalZerosBitNumMap;
+    uiTableType = 0;
+  }
 
-	iCount = kpBitNumMap[iTotalZeroVlcIdx-1];
-	if(pBitsCache->uiRemainBits < iCount) SHIFT_BUFFER(pBitsCache);// if uiRemainBits+16 still smaller than iCount?? potential bug
-	if(pBitsCache->uiRemainBits < iCount) 
-		return 0;
-	uiValue    = pBitsCache->uiCache32Bit >> ( 32 - iCount );
-	iCount     = pVlcTable->kpTotalZerosTable[uiTableType][iTotalZeroVlcIdx-1][uiValue][1];
-	POP_BUFFER(pBitsCache, iCount);
-	iUsedBits += iCount;
-	iZerosLeft = pVlcTable->kpTotalZerosTable[uiTableType][iTotalZeroVlcIdx-1][uiValue][0];
+  iCount = kpBitNumMap[iTotalZeroVlcIdx - 1];
+  if (pBitsCache->uiRemainBits < iCount) SHIFT_BUFFER (
+      pBitsCache); // FIXME: what if uiRemainBits+16 is still smaller than iCount? potential bug
+  if (pBitsCache->uiRemainBits < iCount)
+    return 0;
+  uiValue    = pBitsCache->uiCache32Bit >> (32 - iCount);
+  iCount     = pVlcTable->kpTotalZerosTable[uiTableType][iTotalZeroVlcIdx - 1][uiValue][1];
+  POP_BUFFER (pBitsCache, iCount);
+  iUsedBits += iCount;
+  iZerosLeft = pVlcTable->kpTotalZerosTable[uiTableType][iTotalZeroVlcIdx - 1][uiValue][0];
 
-	return iUsedBits;
+  return iUsedBits;
 }
-static int32_t	CavlcGetRunBefore(int32_t iRun[16], SReadBitsCache *pBitsCache, uint8_t uiTotalCoeff, SVlcTable* pVlcTable, int32_t iZerosLeft)
-{
-    int32_t i, iUsedBits = 0;
-	uint32_t uiCount, uiValue, uiCache32Bit, iPrefixBits;
-	
-	for (i = 0; i < uiTotalCoeff-1; i++) 
-	{
-		if (iZerosLeft > 0) 
-		{			
-			uiCount = g_kuiZeroLeftBitNumMap[iZerosLeft];
-			if(pBitsCache->uiRemainBits < uiCount ) SHIFT_BUFFER(pBitsCache);
-			if(pBitsCache->uiRemainBits < uiCount) 
-			return 0;
-			uiValue = pBitsCache->uiCache32Bit >> ( 32 - uiCount );
-			if ( iZerosLeft < 7 )
-			{
-				uiCount = pVlcTable->kpZeroTable[iZerosLeft-1][uiValue][1];
-				POP_BUFFER(pBitsCache, uiCount);
-				iUsedBits += uiCount;
-				iRun[i] = pVlcTable->kpZeroTable[iZerosLeft-1][uiValue][0];
-			}
-			else
-			{
-				POP_BUFFER(pBitsCache, uiCount);
-				iUsedBits += uiCount;
-				if ( pVlcTable->kpZeroTable[6][uiValue][0] < 7 )
-				{		
-					iRun[i] = pVlcTable->kpZeroTable[6][uiValue][0];
-				}
-				else
-				{
-					if(pBitsCache->uiRemainBits < 16) SHIFT_BUFFER(pBitsCache);
+static int32_t	CavlcGetRunBefore (int32_t iRun[16], SReadBitsCache* pBitsCache, uint8_t uiTotalCoeff,
+                                   SVlcTable* pVlcTable, int32_t iZerosLeft) {
+  int32_t i, iUsedBits = 0;
+  uint32_t uiCount, uiValue, uiCache32Bit, iPrefixBits;
+
+  for (i = 0; i < uiTotalCoeff - 1; i++) {
+    if (iZerosLeft > 0) {
+      uiCount = g_kuiZeroLeftBitNumMap[iZerosLeft];
+      if (pBitsCache->uiRemainBits < uiCount) SHIFT_BUFFER (pBitsCache);
+      if (pBitsCache->uiRemainBits < uiCount)
+        return 0;
+      uiValue = pBitsCache->uiCache32Bit >> (32 - uiCount);
+      if (iZerosLeft < 7) {
+        uiCount = pVlcTable->kpZeroTable[iZerosLeft - 1][uiValue][1];
+        POP_BUFFER (pBitsCache, uiCount);
+        iUsedBits += uiCount;
+        iRun[i] = pVlcTable->kpZeroTable[iZerosLeft - 1][uiValue][0];
+      } else {
+        POP_BUFFER (pBitsCache, uiCount);
+        iUsedBits += uiCount;
+        if (pVlcTable->kpZeroTable[6][uiValue][0] < 7) {
+          iRun[i] = pVlcTable->kpZeroTable[6][uiValue][0];
+        } else {
+          if (pBitsCache->uiRemainBits < 16) SHIFT_BUFFER (pBitsCache);
 #ifdef WIN32
-					uiCache32Bit = pBitsCache->uiCache32Bit;
-					WELS_GET_PREFIX_BITS(uiCache32Bit, iPrefixBits);
+          uiCache32Bit = pBitsCache->uiCache32Bit;
+          WELS_GET_PREFIX_BITS (uiCache32Bit, iPrefixBits);
 #else
-					iPrefixBits = GetPrefixBits(pBitsCache->uiCache32Bit);
+          iPrefixBits = GetPrefixBits (pBitsCache->uiCache32Bit);
 #endif
-					iRun[i] = iPrefixBits + 6;
-					POP_BUFFER(pBitsCache, iPrefixBits);
-					iUsedBits += iPrefixBits;
-				}
-			}			
-		}
-		else
-		{
-			return iUsedBits;
-		}
-		
-		iZerosLeft -= iRun[i];
-	}
+          iRun[i] = iPrefixBits + 6;
+          POP_BUFFER (pBitsCache, iPrefixBits);
+          iUsedBits += iPrefixBits;
+        }
+      }
+    } else {
+      return iUsedBits;
+    }
 
-	iRun[uiTotalCoeff-1] = iZerosLeft;
+    iZerosLeft -= iRun[i];
+  }
 
-	return iUsedBits;
+  iRun[uiTotalCoeff - 1] = iZerosLeft;
+
+  return iUsedBits;
 }
 
-int32_t WelsResidualBlockCavlc(SVlcTable* pVlcTable, uint8_t* pNonZeroCountCache, PBitStringAux pBs, int32_t iIndex, int32_t iMaxNumCoeff, 
-									 const uint8_t *kpZigzagTable, int32_t iResidualProperty, int16_t *pTCoeff, int32_t iMbMode, uint8_t uiQp, PWelsDecoderContext pCtx)
-{
-	int32_t iLevel[16], iZerosLeft, iCoeffNum;
-	int32_t  iRun[16] = {0};
-	const uint8_t *kpBitNumMap;
-	int32_t iCurNonZeroCacheIdx, i;
-	const uint16_t *kpDequantCoeff = g_kuiDequantCoeff[uiQp];
-	int8_t nA, nB, nC;
-	uint8_t uiTotalCoeff, uiTrailingOnes;
-	int32_t iUsedBits = 0;	
-	int32_t iCurIdx   = pBs->iIndex;
-	uint8_t *pBuf     = ((uint8_t *)pBs->pStartBuf) + (iCurIdx >> 3);
-	bool_t  bChromaDc = (CHROMA_DC == iResidualProperty);
-	uint8_t bChroma   = (bChromaDc || CHROMA_AC == iResidualProperty);
-	SReadBitsCache sReadBitsCache;
+int32_t WelsResidualBlockCavlc (SVlcTable* pVlcTable, uint8_t* pNonZeroCountCache, PBitStringAux pBs, int32_t iIndex,
+                                int32_t iMaxNumCoeff,
+                                const uint8_t* kpZigzagTable, int32_t iResidualProperty, int16_t* pTCoeff, int32_t iMbMode, uint8_t uiQp,
+                                PWelsDecoderContext pCtx) {
+  int32_t iLevel[16], iZerosLeft, iCoeffNum;
+  int32_t  iRun[16] = {0};
+  const uint8_t* kpBitNumMap;
+  int32_t iCurNonZeroCacheIdx, i;
+  const uint16_t* kpDequantCoeff = g_kuiDequantCoeff[uiQp];
+  int8_t nA, nB, nC;
+  uint8_t uiTotalCoeff, uiTrailingOnes;
+  int32_t iUsedBits = 0;
+  int32_t iCurIdx   = pBs->iIndex;
+  uint8_t* pBuf     = ((uint8_t*)pBs->pStartBuf) + (iCurIdx >> 3);
+  bool_t  bChromaDc = (CHROMA_DC == iResidualProperty);
+  uint8_t bChroma   = (bChromaDc || CHROMA_AC == iResidualProperty);
+  SReadBitsCache sReadBitsCache;
 
-	sReadBitsCache.uiCache32Bit =  ((((pBuf[0]<<8) | pBuf[1]) << 16) | (pBuf[2]<<8) | pBuf[3]) << (iCurIdx&0x07);
-	sReadBitsCache.uiRemainBits = 32 - (iCurIdx & 0x07);
-    sReadBitsCache.pBuf = pBuf;
-	//////////////////////////////////////////////////////////////////////////
-	
-	if (bChroma) 
-	{
-		iCurNonZeroCacheIdx = g_kuiCacheNzcScanIdx[iIndex];
-		nA = pNonZeroCountCache[iCurNonZeroCacheIdx-1];
-		nB = pNonZeroCountCache[iCurNonZeroCacheIdx-8];
+  sReadBitsCache.uiCache32Bit = ((((pBuf[0] << 8) | pBuf[1]) << 16) | (pBuf[2] << 8) | pBuf[3]) << (iCurIdx & 0x07);
+  sReadBitsCache.uiRemainBits = 32 - (iCurIdx & 0x07);
+  sReadBitsCache.pBuf = pBuf;
+  //////////////////////////////////////////////////////////////////////////
 
-		if (bChromaDc)
-		{
-			kpBitNumMap = g_kuiTotalZerosBitNumChromaMap;
-		}
-		else
-		{
-			kpBitNumMap = g_kuiTotalZerosBitNumMap;
-		}
-	}
-	else //luma
-	{
-		iCurNonZeroCacheIdx = g_kuiCacheNzcScanIdx[iIndex];
-		nA = pNonZeroCountCache[iCurNonZeroCacheIdx-1];
-		nB = pNonZeroCountCache[iCurNonZeroCacheIdx-8];
+  if (bChroma) {
+    iCurNonZeroCacheIdx = g_kuiCacheNzcScanIdx[iIndex];
+    nA = pNonZeroCountCache[iCurNonZeroCacheIdx - 1];
+    nB = pNonZeroCountCache[iCurNonZeroCacheIdx - 8];
 
-		kpBitNumMap = g_kuiTotalZerosBitNumMap;
-	}
+    if (bChromaDc) {
+      kpBitNumMap = g_kuiTotalZerosBitNumChromaMap;
+    } else {
+      kpBitNumMap = g_kuiTotalZerosBitNumMap;
+    }
+  } else { //luma
+    iCurNonZeroCacheIdx = g_kuiCacheNzcScanIdx[iIndex];
+    nA = pNonZeroCountCache[iCurNonZeroCacheIdx - 1];
+    nB = pNonZeroCountCache[iCurNonZeroCacheIdx - 8];
 
-	WELS_NON_ZERO_COUNT_AVERAGE( nC, nA, nB );
+    kpBitNumMap = g_kuiTotalZerosBitNumMap;
+  }
 
-	iUsedBits += CavlcGetTrailingOnesAndTotalCoeff(uiTotalCoeff, uiTrailingOnes, &sReadBitsCache, pVlcTable, bChromaDc, nC);
+  WELS_NON_ZERO_COUNT_AVERAGE (nC, nA, nB);
 
-	if ( iResidualProperty != CHROMA_DC && iResidualProperty != I16_LUMA_DC)
-	{
-		pNonZeroCountCache[iCurNonZeroCacheIdx] = uiTotalCoeff;
-		//////////////////////////////////////////////////////////////////////////
-	}
-	if (0 == uiTotalCoeff) 	
-	{
-		pBs->iIndex += iUsedBits;
-		return 0;
-	}	
-	if ( uiTrailingOnes > 3 || uiTotalCoeff > 16 ) /////////////////check uiTrailingOnes and uiTotalCoeff
-	{
-		return -1;
-	}
-	iUsedBits += CavlcGetLevelVal(iLevel, &sReadBitsCache, uiTotalCoeff, uiTrailingOnes);
+  iUsedBits += CavlcGetTrailingOnesAndTotalCoeff (uiTotalCoeff, uiTrailingOnes, &sReadBitsCache, pVlcTable, bChromaDc,
+               nC);
 
-	if (uiTotalCoeff < iMaxNumCoeff) 
-	{
-	    iUsedBits += CavlcGetTotalZeros(iZerosLeft, &sReadBitsCache, uiTotalCoeff, pVlcTable, bChromaDc);
-	}
-	else
-	{
-		iZerosLeft = 0;
-	}
+  if (iResidualProperty != CHROMA_DC && iResidualProperty != I16_LUMA_DC) {
+    pNonZeroCountCache[iCurNonZeroCacheIdx] = uiTotalCoeff;
+    //////////////////////////////////////////////////////////////////////////
+  }
+  if (0 == uiTotalCoeff) {
+    pBs->iIndex += iUsedBits;
+    return 0;
+  }
+  if (uiTrailingOnes > 3 || uiTotalCoeff > 16) { //reject out-of-range uiTrailingOnes / uiTotalCoeff
+    return -1;
+  }
+  iUsedBits += CavlcGetLevelVal (iLevel, &sReadBitsCache, uiTotalCoeff, uiTrailingOnes);
 
-	if (iZerosLeft < 0)
-	{
-		return ERR_INFO_CAVLC_INVALID_ZERO_LEFT;
-	}
-	iUsedBits += CavlcGetRunBefore(iRun, &sReadBitsCache, uiTotalCoeff, pVlcTable, iZerosLeft);
+  if (uiTotalCoeff < iMaxNumCoeff) {
+    iUsedBits += CavlcGetTotalZeros (iZerosLeft, &sReadBitsCache, uiTotalCoeff, pVlcTable, bChromaDc);
+  } else {
+    iZerosLeft = 0;
+  }
 
-	pBs->iIndex += iUsedBits;
-	iCoeffNum = -1;
+  if (iZerosLeft < 0) {
+    return ERR_INFO_CAVLC_INVALID_ZERO_LEFT;
+  }
+  iUsedBits += CavlcGetRunBefore (iRun, &sReadBitsCache, uiTotalCoeff, pVlcTable, iZerosLeft);
 
-	if(iResidualProperty == CHROMA_DC){
-		//chroma dc scaling process, is kpDequantCoeff[0]? LevelScale(qPdc%6,0,0))<<(qPdc/6-6), the transform is done at construction.
-			switch(iMbMode)
-			{
-			case BASE_MB:
-				for(i=uiTotalCoeff-1; i>=0; --i)
-				{ //FIXME merge into rundecode?
-					int32_t j;
-					iCoeffNum += iRun[i] + 1; //FIXME add 1 earlier ?
-					j          = kpZigzagTable[ iCoeffNum ];
-					pTCoeff[j] = iLevel[i]*kpDequantCoeff[0];
-				}
-				break;
-			default:
-				break;
-			}
-	}	
-	else if(iResidualProperty == I16_LUMA_DC){ //DC coefficent, only call in Intra_16x16, base_mode_flag = 0
-		for(i=uiTotalCoeff-1; i>=0; --i){ //FIXME merge into rundecode?
-			int32_t j;
-			iCoeffNum += iRun[i] + 1; //FIXME add 1 earlier ?
-			j          = kpZigzagTable[ iCoeffNum ];
-			pTCoeff[j] = iLevel[i];
-		}
-	}
-    else{
-		switch(iMbMode)
-		{
-		case BASE_MB:
-			for(i=uiTotalCoeff-1; i>=0; --i){ //FIXME merge into  rundecode?
-				int32_t j;
-				iCoeffNum += iRun[i] + 1; //FIXME add 1 earlier ?
-				j          = kpZigzagTable[ iCoeffNum ];
-				pTCoeff[j] = iLevel[i]*kpDequantCoeff[j & 0x07];
-			}
-			break;
-		default:
-			break;
-		}
-	}
+  pBs->iIndex += iUsedBits;
+  iCoeffNum = -1;
 
-	return 0;		
+  if (iResidualProperty == CHROMA_DC) {
+    //chroma dc scaling process, is kpDequantCoeff[0]? LevelScale(qPdc%6,0,0))<<(qPdc/6-6), the transform is done at construction.
+    switch (iMbMode) {
+    case BASE_MB:
+      for (i = uiTotalCoeff - 1; i >= 0; --i) {
+        //FIXME merge into rundecode?
+        int32_t j;
+        iCoeffNum += iRun[i] + 1; //FIXME add 1 earlier ?
+        j          = kpZigzagTable[ iCoeffNum ];
+        pTCoeff[j] = iLevel[i] * kpDequantCoeff[0];
+      }
+      break;
+    default:
+      break;
+    }
+  } else if (iResidualProperty == I16_LUMA_DC) { //DC coefficient, only called in Intra_16x16, base_mode_flag = 0
+    for (i = uiTotalCoeff - 1; i >= 0; --i) { //FIXME merge into rundecode?
+      int32_t j;
+      iCoeffNum += iRun[i] + 1; //FIXME add 1 earlier ?
+      j          = kpZigzagTable[ iCoeffNum ];
+      pTCoeff[j] = iLevel[i];
+    }
+  } else {
+    switch (iMbMode) {
+    case BASE_MB:
+      for (i = uiTotalCoeff - 1; i >= 0; --i) { //FIXME merge into  rundecode?
+        int32_t j;
+        iCoeffNum += iRun[i] + 1; //FIXME add 1 earlier ?
+        j          = kpZigzagTable[ iCoeffNum ];
+        pTCoeff[j] = iLevel[i] * kpDequantCoeff[j & 0x07];
+      }
+      break;
+    default:
+      break;
+    }
+  }
+
+  return 0;
 }
 
-int32_t ParseIntra4x4ModeConstrain0(PNeighAvail pNeighAvail, int8_t* pIntraPredMode, PBitStringAux pBs, PDqLayer pCurDqLayer)
-{
-	int32_t iSampleAvail[5*6] = { 0 }; //initialize as 0
-	int32_t iMbXy = pCurDqLayer->iMbXyIndex;
-	int32_t iFinalMode, i;	
+int32_t ParseIntra4x4ModeConstrain0 (PNeighAvail pNeighAvail, int8_t* pIntraPredMode, PBitStringAux pBs,
+                                     PDqLayer pCurDqLayer) {
+  int32_t iSampleAvail[5 * 6] = { 0 }; //initialize as 0
+  int32_t iMbXy = pCurDqLayer->iMbXyIndex;
+  int32_t iFinalMode, i;
 
-	uint8_t uiNeighAvail = 0;
+  uint8_t uiNeighAvail = 0;
 
-	if ( pNeighAvail->iLeftAvail )  //left
-	{
-		iSampleAvail[ 6] = 
-		iSampleAvail[12] =
-		iSampleAvail[18] =
-		iSampleAvail[24] = 1;
-	}
-	if ( pNeighAvail->iLeftTopAvail ) //top_left
-	{
-		iSampleAvail[0] = 1;
-	}
-	if ( pNeighAvail->iTopAvail ) //top
-	{
-		iSampleAvail[1] = 
-		iSampleAvail[2] = 
-		iSampleAvail[3] = 
-		iSampleAvail[4] = 1;
-	}
-	if ( pNeighAvail->iRightTopAvail ) //top_right
-	{
-		iSampleAvail[5] = 1;
-	}
+  if (pNeighAvail->iLeftAvail) {  //left
+    iSampleAvail[ 6] =
+      iSampleAvail[12] =
+        iSampleAvail[18] =
+          iSampleAvail[24] = 1;
+  }
+  if (pNeighAvail->iLeftTopAvail) { //top_left
+    iSampleAvail[0] = 1;
+  }
+  if (pNeighAvail->iTopAvail) { //top
+    iSampleAvail[1] =
+      iSampleAvail[2] =
+        iSampleAvail[3] =
+          iSampleAvail[4] = 1;
+  }
+  if (pNeighAvail->iRightTopAvail) { //top_right
+    iSampleAvail[5] = 1;
+  }
 
-	uiNeighAvail = (iSampleAvail[6]<<2) | (iSampleAvail[0]<<1) | (iSampleAvail[1]);
+  uiNeighAvail = (iSampleAvail[6] << 2) | (iSampleAvail[0] << 1) | (iSampleAvail[1]);
 
-	for(i = 0; i < 16; i++)
-	{
-		const int32_t kiPrevIntra4x4PredMode = BsGetOneBit(pBs);//1bit
-		const int32_t kiPredMode = PredIntra4x4Mode(pIntraPredMode, i);
+  for (i = 0; i < 16; i++) {
+    const int32_t kiPrevIntra4x4PredMode = BsGetOneBit (pBs); //1bit
+    const int32_t kiPredMode = PredIntra4x4Mode (pIntraPredMode, i);
 
-		int8_t iBestMode;
-		if (kiPrevIntra4x4PredMode) 
-		{
-			iBestMode = kiPredMode;
-		}
-		else //kPrevIntra4x4PredMode == 0
-		{
-			const int32_t kiRemIntra4x4PredMode = BsGetBits(pBs, 3);//3bits				
-			if (kiRemIntra4x4PredMode < kiPredMode) 
-			{
-				iBestMode = kiRemIntra4x4PredMode;
-			}
-			else 
-			{
-				iBestMode = kiRemIntra4x4PredMode + 1;
-			}
-		}
+    int8_t iBestMode;
+    if (kiPrevIntra4x4PredMode) {
+      iBestMode = kiPredMode;
+    } else { //kiPrevIntra4x4PredMode == 0
+      const int32_t kiRemIntra4x4PredMode = BsGetBits (pBs, 3); //3bits
+      if (kiRemIntra4x4PredMode < kiPredMode) {
+        iBestMode = kiRemIntra4x4PredMode;
+      } else {
+        iBestMode = kiRemIntra4x4PredMode + 1;
+      }
+    }
 
-		iFinalMode = CheckIntra4x4PredMode(&iSampleAvail[0], &iBestMode, i);
-		if (iFinalMode < 0)
-		{
-			return ERR_INFO_INVALID_I4x4_PRED_MODE;
-		}
+    iFinalMode = CheckIntra4x4PredMode (&iSampleAvail[0], &iBestMode, i);
+    if (iFinalMode < 0) {
+      return ERR_INFO_INVALID_I4x4_PRED_MODE;
+    }
 
-		pCurDqLayer->pIntra4x4FinalMode[iMbXy][g_kuiScan4[i]] = iFinalMode;
+    pCurDqLayer->pIntra4x4FinalMode[iMbXy][g_kuiScan4[i]] = iFinalMode;
 
-		pIntraPredMode[g_kuiScan8[i]] = iBestMode;
+    pIntraPredMode[g_kuiScan8[i]] = iBestMode;
 
-		iSampleAvail[g_kuiCache30ScanIdx[i]] = 1;
-	}
-	ST32(&pCurDqLayer->pIntraPredMode[iMbXy][0], LD32(&pIntraPredMode[1 + 8 * 4]));
-	pCurDqLayer->pIntraPredMode[iMbXy][4] = pIntraPredMode[4 + 8 * 1];
-	pCurDqLayer->pIntraPredMode[iMbXy][5] = pIntraPredMode[4 + 8 * 2];
-	pCurDqLayer->pIntraPredMode[iMbXy][6] = pIntraPredMode[4 + 8 * 3];
-	pCurDqLayer->pChromaPredMode[iMbXy] = BsGetUe(pBs);
-	if (-1 == pCurDqLayer->pChromaPredMode[iMbXy] || CheckIntraChromaPredMode(uiNeighAvail, &pCurDqLayer->pChromaPredMode[iMbXy]))
-	{
-		return ERR_INFO_INVALID_I_CHROMA_PRED_MODE;
-	}
+    iSampleAvail[g_kuiCache30ScanIdx[i]] = 1;
+  }
+  ST32 (&pCurDqLayer->pIntraPredMode[iMbXy][0], LD32 (&pIntraPredMode[1 + 8 * 4]));
+  pCurDqLayer->pIntraPredMode[iMbXy][4] = pIntraPredMode[4 + 8 * 1];
+  pCurDqLayer->pIntraPredMode[iMbXy][5] = pIntraPredMode[4 + 8 * 2];
+  pCurDqLayer->pIntraPredMode[iMbXy][6] = pIntraPredMode[4 + 8 * 3];
+  pCurDqLayer->pChromaPredMode[iMbXy] = BsGetUe (pBs);
+  if (-1 == pCurDqLayer->pChromaPredMode[iMbXy]
+      || CheckIntraChromaPredMode (uiNeighAvail, &pCurDqLayer->pChromaPredMode[iMbXy])) {
+    return ERR_INFO_INVALID_I_CHROMA_PRED_MODE;
+  }
 
-	return 0;
+  return 0;
 }
 
-int32_t ParseIntra4x4ModeConstrain1(PNeighAvail pNeighAvail, int8_t* pIntraPredMode, PBitStringAux pBs, PDqLayer pCurDqLayer)
-{
-	int32_t iSampleAvail[5*6] = { 0 }; //initialize as 0
-	int32_t iMbXy = pCurDqLayer->iMbXyIndex;
-	int32_t iFinalMode, i;	
+int32_t ParseIntra4x4ModeConstrain1 (PNeighAvail pNeighAvail, int8_t* pIntraPredMode, PBitStringAux pBs,
+                                     PDqLayer pCurDqLayer) { // parses 16 intra 4x4 modes plus the chroma mode of the current MB; a neighbour counts only if it is available AND intra; returns 0 on success
+  int32_t iSampleAvail[5 * 6] = { 0 }; //initialize as 0: every one of the 30 entries starts out unavailable
+  int32_t iMbXy = pCurDqLayer->iMbXyIndex;
+  int32_t iFinalMode, i;
 
-	uint8_t uiNeighAvail = 0;
+  uint8_t uiNeighAvail = 0; // bit 2: left, bit 1: top-left, bit 0: top (packed further down)
 
-	if ( pNeighAvail->iLeftAvail && IS_INTRA( pNeighAvail->iLeftType ) )  //left
-	{
-		iSampleAvail[ 6] = 
-		iSampleAvail[12] =
-		iSampleAvail[18] =
-		iSampleAvail[24] = 1;
-	}
-	if ( pNeighAvail->iLeftTopAvail && IS_INTRA( pNeighAvail->iLeftTopType ) ) //top_left
-	{
-		iSampleAvail[0] = 1;
-	}
-	if ( pNeighAvail->iTopAvail && IS_INTRA( pNeighAvail->iTopType ) ) //top
-	{
-		iSampleAvail[1] = 
-		iSampleAvail[2] = 
-		iSampleAvail[3] = 
-		iSampleAvail[4] = 1;
-	}
-	if ( pNeighAvail->iRightTopAvail && IS_INTRA( pNeighAvail->iRightTopType ) ) //top_right
-	{
-		iSampleAvail[5] = 1;
-	}
+  if (pNeighAvail->iLeftAvail && IS_INTRA (pNeighAvail->iLeftType)) {   //left: four entries, 6 apart
+    iSampleAvail[ 6] =
+      iSampleAvail[12] =
+        iSampleAvail[18] =
+          iSampleAvail[24] = 1;
+  }
+  if (pNeighAvail->iLeftTopAvail && IS_INTRA (pNeighAvail->iLeftTopType)) {  //top_left: entry 0
+    iSampleAvail[0] = 1;
+  }
+  if (pNeighAvail->iTopAvail && IS_INTRA (pNeighAvail->iTopType)) {  //top: entries 1..4
+    iSampleAvail[1] =
+      iSampleAvail[2] =
+        iSampleAvail[3] =
+          iSampleAvail[4] = 1;
+  }
+  if (pNeighAvail->iRightTopAvail && IS_INTRA (pNeighAvail->iRightTopType)) {  //top_right: entry 5
+    iSampleAvail[5] = 1;
+  }
 
-	uiNeighAvail = (iSampleAvail[6]<<2) | (iSampleAvail[0]<<1) | (iSampleAvail[1]);
+  uiNeighAvail = (iSampleAvail[6] << 2) | (iSampleAvail[0] << 1) | (iSampleAvail[1]); // pack left/top-left/top flags into bits 2/1/0
 
-	for(i = 0; i < 16; i++)
-	{
-		const int32_t kiPrevIntra4x4PredMode = BsGetOneBit(pBs);//1bit
-		const int32_t kiPredMode = PredIntra4x4Mode(pIntraPredMode, i);
+  for (i = 0; i < 16; i++) { // one iteration per intra 4x4 prediction mode
+    const int32_t kiPrevIntra4x4PredMode = BsGetOneBit (pBs); //1bit: nonzero means "use the predicted mode"
+    const int32_t kiPredMode = PredIntra4x4Mode (pIntraPredMode, i); // predicted mode for entry i (helper defined elsewhere)
 
-		int8_t iBestMode;
-		if (kiPrevIntra4x4PredMode) 
-		{
-			iBestMode = kiPredMode;
-		}
-		else //kPrevIntra4x4PredMode == 0
-		{
-			const int32_t kiRemIntra4x4PredMode = BsGetBits(pBs, 3);//3bits				
-			if (kiRemIntra4x4PredMode < kiPredMode) 
-			{
-				iBestMode = kiRemIntra4x4PredMode;
-			}
-			else 
-			{
-				iBestMode = kiRemIntra4x4PredMode + 1;
-			}
-		}
+    int8_t iBestMode;
+    if (kiPrevIntra4x4PredMode) {
+      iBestMode = kiPredMode;
+    } else { //kiPrevIntra4x4PredMode == 0: the mode is coded explicitly
+      const int32_t kiRemIntra4x4PredMode = BsGetBits (pBs, 3); //3bits
+      if (kiRemIntra4x4PredMode < kiPredMode) { // a value below the predicted mode is used as-is
+        iBestMode = kiRemIntra4x4PredMode;
+      } else { // otherwise add 1, so the coded value never lands on the predicted mode
+        iBestMode = kiRemIntra4x4PredMode + 1;
+      }
+    }
 
-		iFinalMode = CheckIntra4x4PredMode(&iSampleAvail[0], &iBestMode, i);
-		if (iFinalMode < 0)
-		{
-			return ERR_INFO_INVALID_I4x4_PRED_MODE;
-		}
+    iFinalMode = CheckIntra4x4PredMode (&iSampleAvail[0], &iBestMode, i); // iBestMode goes in by pointer, so the helper may modify it - TODO confirm
+    if (iFinalMode < 0) { // a negative result is treated as an invalid mode
+      return ERR_INFO_INVALID_I4x4_PRED_MODE;
+    }
 
-		pCurDqLayer->pIntra4x4FinalMode[iMbXy][g_kuiScan4[i]] = iFinalMode;
+    pCurDqLayer->pIntra4x4FinalMode[iMbXy][g_kuiScan4[i]] = iFinalMode; // per-MB final mode, indexed through g_kuiScan4
 
-		pIntraPredMode[g_kuiScan8[i]] = iBestMode;
+    pIntraPredMode[g_kuiScan8[i]] = iBestMode; // update the caller's mode cache, indexed through g_kuiScan8
 
-		iSampleAvail[g_kuiCache30ScanIdx[i]] = 1;
-	}
-	ST32(&pCurDqLayer->pIntraPredMode[iMbXy][0], LD32(&pIntraPredMode[1 + 8 * 4]));
-	pCurDqLayer->pIntraPredMode[iMbXy][4] = pIntraPredMode[4 + 8 * 1];
-	pCurDqLayer->pIntraPredMode[iMbXy][5] = pIntraPredMode[4 + 8 * 2];
-	pCurDqLayer->pIntraPredMode[iMbXy][6] = pIntraPredMode[4 + 8 * 3];
+    iSampleAvail[g_kuiCache30ScanIdx[i]] = 1; // this entry is available to the following iterations
+  }
+  ST32 (&pCurDqLayer->pIntraPredMode[iMbXy][0], LD32 (&pIntraPredMode[1 + 8 * 4])); // slots 0.. from cache index 1 + 8 * 4 (ST32/LD32 presumably move 32 bits = four int8_t - TODO confirm)
+  pCurDqLayer->pIntraPredMode[iMbXy][4] = pIntraPredMode[4 + 8 * 1]; // slots 4..6 from cache indices 4 + 8 * {1, 2, 3}
+  pCurDqLayer->pIntraPredMode[iMbXy][5] = pIntraPredMode[4 + 8 * 2];
+  pCurDqLayer->pIntraPredMode[iMbXy][6] = pIntraPredMode[4 + 8 * 3];
 
-	pCurDqLayer->pChromaPredMode[iMbXy] = BsGetUe(pBs);
-	if (-1 == pCurDqLayer->pChromaPredMode[iMbXy] || CheckIntraChromaPredMode(uiNeighAvail, &pCurDqLayer->pChromaPredMode[iMbXy]))
-	{
-		return ERR_INFO_INVALID_I_CHROMA_PRED_MODE;
-	}
+  pCurDqLayer->pChromaPredMode[iMbXy] = BsGetUe (pBs); // chroma prediction mode, read via BsGetUe
+  if (-1 == pCurDqLayer->pChromaPredMode[iMbXy]
+      || CheckIntraChromaPredMode (uiNeighAvail, &pCurDqLayer->pChromaPredMode[iMbXy])) { // -1 or a nonzero check result is rejected
+    return ERR_INFO_INVALID_I_CHROMA_PRED_MODE;
+  }
 
-	return 0;
+  return 0;
 }
 
-int32_t ParseIntra16x16ModeConstrain0(PNeighAvail pNeighAvail, PBitStringAux pBs, PDqLayer pCurDqLayer)
-{
-	int32_t iMbXy = pCurDqLayer->iMbXyIndex;
-	uint8_t uiNeighAvail = 0; //0x07 = 0 1 1 1, means left, top-left, top avail or not. (1: avail, 0: unavail)
+int32_t ParseIntra16x16ModeConstrain0 (PNeighAvail pNeighAvail, PBitStringAux pBs, PDqLayer pCurDqLayer) { // checks the MB's intra 16x16 mode, then parses and checks its chroma mode; neighbours only need to be available; returns 0 on success
+  int32_t iMbXy = pCurDqLayer->iMbXyIndex;
+  uint8_t uiNeighAvail = 0; //0x07 = 0 1 1 1, means left, top-left, top avail or not. (1: avail, 0: unavail)
 
-	if ( pNeighAvail->iLeftAvail ) 
-	{
-		uiNeighAvail = (1<<2);
-	}
-	if ( pNeighAvail->iLeftTopAvail ) 
-	{
-		uiNeighAvail |= (1<<1);
-	}
-	if ( pNeighAvail->iTopAvail ) 
-	{
-		uiNeighAvail |= 1;
-	}
+  if (pNeighAvail->iLeftAvail) { // left -> bit 2
+    uiNeighAvail = (1 << 2); // plain '=' is enough: uiNeighAvail is still 0 here
+  }
+  if (pNeighAvail->iLeftTopAvail) { // top-left -> bit 1
+    uiNeighAvail |= (1 << 1);
+  }
+  if (pNeighAvail->iTopAvail) { // top -> bit 0
+    uiNeighAvail |= 1;
+  }
 
-	if (CheckIntra16x16PredMode(uiNeighAvail, &pCurDqLayer->pIntraPredMode[iMbXy][7])) //invalid iPredMode, must stop decoding
-	{
-		return ERR_INFO_INVALID_I16x16_PRED_MODE;
-	}
-	pCurDqLayer->pChromaPredMode[iMbXy] = BsGetUe(pBs);
+  if (CheckIntra16x16PredMode (uiNeighAvail, // the mode under test lives in slot 7 of this MB's pIntraPredMode row
+                               &pCurDqLayer->pIntraPredMode[iMbXy][7])) { //invalid iPredMode, must stop decoding
+    return ERR_INFO_INVALID_I16x16_PRED_MODE;
+  }
+  pCurDqLayer->pChromaPredMode[iMbXy] = BsGetUe (pBs); // chroma prediction mode, read via BsGetUe
 
-	if (-1 == pCurDqLayer->pChromaPredMode[iMbXy] || CheckIntraChromaPredMode(uiNeighAvail, &pCurDqLayer->pChromaPredMode[iMbXy]))
-	{
-		return ERR_INFO_INVALID_I_CHROMA_PRED_MODE;
-	}
+  if (-1 == pCurDqLayer->pChromaPredMode[iMbXy]
+      || CheckIntraChromaPredMode (uiNeighAvail, &pCurDqLayer->pChromaPredMode[iMbXy])) { // -1 or a nonzero check result is rejected
+    return ERR_INFO_INVALID_I_CHROMA_PRED_MODE;
+  }
 
-	return 0;
+  return 0;
 }
 
-int32_t ParseIntra16x16ModeConstrain1(PNeighAvail pNeighAvail, PBitStringAux pBs, PDqLayer pCurDqLayer)
-{
-	int32_t iMbXy = pCurDqLayer->iMbXyIndex;
-	uint8_t uiNeighAvail = 0; //0x07 = 0 1 1 1, means left, top-left, top avail or not. (1: avail, 0: unavail)
+int32_t ParseIntra16x16ModeConstrain1 (PNeighAvail pNeighAvail, PBitStringAux pBs, PDqLayer pCurDqLayer) { // checks the MB's intra 16x16 mode, then parses and checks its chroma mode; a neighbour counts only if it is available AND intra; returns 0 on success
+  int32_t iMbXy = pCurDqLayer->iMbXyIndex;
+  uint8_t uiNeighAvail = 0; //0x07 = 0 1 1 1, means left, top-left, top avail or not. (1: avail, 0: unavail)
 
-	if ( pNeighAvail->iLeftAvail && IS_INTRA( pNeighAvail->iLeftType ) ) 
-	{
-		uiNeighAvail = (1<<2);
-	}
-	if ( pNeighAvail->iLeftTopAvail && IS_INTRA( pNeighAvail->iLeftTopType ) ) 
-	{
-		uiNeighAvail |= (1<<1);
-	}
-	if ( pNeighAvail->iTopAvail && IS_INTRA( pNeighAvail->iTopType ) ) 
-	{
-		uiNeighAvail |= 1;
-	}
+  if (pNeighAvail->iLeftAvail && IS_INTRA (pNeighAvail->iLeftType)) { // left -> bit 2
+    uiNeighAvail = (1 << 2); // plain '=' is enough: uiNeighAvail is still 0 here
+  }
+  if (pNeighAvail->iLeftTopAvail && IS_INTRA (pNeighAvail->iLeftTopType)) { // top-left -> bit 1
+    uiNeighAvail |= (1 << 1);
+  }
+  if (pNeighAvail->iTopAvail && IS_INTRA (pNeighAvail->iTopType)) { // top -> bit 0
+    uiNeighAvail |= 1;
+  }
 
-	if (CheckIntra16x16PredMode(uiNeighAvail, &pCurDqLayer->pIntraPredMode[iMbXy][7])) //invalid iPredMode, must stop decoding
-	{
-		return ERR_INFO_INVALID_I16x16_PRED_MODE;
-	}
-	pCurDqLayer->pChromaPredMode[iMbXy] = BsGetUe(pBs);
+  if (CheckIntra16x16PredMode (uiNeighAvail, // the mode under test lives in slot 7 of this MB's pIntraPredMode row
+                               &pCurDqLayer->pIntraPredMode[iMbXy][7])) { //invalid iPredMode, must stop decoding
+    return ERR_INFO_INVALID_I16x16_PRED_MODE;
+  }
+  pCurDqLayer->pChromaPredMode[iMbXy] = BsGetUe (pBs); // chroma prediction mode, read via BsGetUe
 
-	if (-1 == pCurDqLayer->pChromaPredMode[iMbXy] || CheckIntraChromaPredMode(uiNeighAvail, &pCurDqLayer->pChromaPredMode[iMbXy]))
-	{
-		return ERR_INFO_INVALID_I_CHROMA_PRED_MODE;
-	}
+  if (-1 == pCurDqLayer->pChromaPredMode[iMbXy]
+      || CheckIntraChromaPredMode (uiNeighAvail, &pCurDqLayer->pChromaPredMode[iMbXy])) { // -1 or a nonzero check result is rejected
+    return ERR_INFO_INVALID_I_CHROMA_PRED_MODE;
+  }
 
-	return 0;
+  return 0;
 }
 
-int32_t ParseInterInfo(PWelsDecoderContext pCtx, int16_t iMvArray[LIST_A][30][MV_A], int8_t iRefIdxArray[LIST_A][30], PBitStringAux pBs)
-{
-	PSlice pSlice				= &pCtx->pCurDqLayer->sLayerInfo.sSliceInLayer;
-	PSliceHeader pSliceHeader	= &pSlice->sSliceHeaderExt.sSliceHeader;
-	int32_t iNumRefFrames		= pSliceHeader->pSps->iNumRefFrames; 
-	int32_t iRefCount[2];
-	PDqLayer pCurDqLayer = pCtx->pCurDqLayer;
-	int32_t i, j;
-	int32_t iMbXy = pCurDqLayer->iMbXyIndex;
-	int32_t iMotionPredFlag[4];
-	int16_t iMv[2] = {0};
+int32_t ParseInterInfo (PWelsDecoderContext pCtx, int16_t iMvArray[LIST_A][30][MV_A], int8_t iRefIdxArray[LIST_A][30],
+                        PBitStringAux pBs) { // parses reference indices and motion vectors of the current inter MB according to its MB type; only index 0 of the list dimension is used; returns 0 on success
+  PSlice pSlice				= &pCtx->pCurDqLayer->sLayerInfo.sSliceInLayer;
+  PSliceHeader pSliceHeader	= &pSlice->sSliceHeaderExt.sSliceHeader;
+  int32_t iNumRefFrames		= pSliceHeader->pSps->iNumRefFrames; // exclusive upper bound for every parsed reference index
+  int32_t iRefCount[2];
+  PDqLayer pCurDqLayer = pCtx->pCurDqLayer;
+  int32_t i, j;
+  int32_t iMbXy = pCurDqLayer->iMbXyIndex;
+  int32_t iMotionPredFlag[4]; // one flag per partition; nonzero is rejected as unsupported wherever it is checked below
+  int16_t iMv[2] = {0};
 
-	iMotionPredFlag[0] = iMotionPredFlag[1] = iMotionPredFlag[2] = iMotionPredFlag[3] = pSlice->sSliceHeaderExt.bDefaultMotionPredFlag;
-	iRefCount[0] = pSliceHeader->uiRefCount[0];
-	iRefCount[1] = pSliceHeader->uiRefCount[1];
+  iMotionPredFlag[0] = iMotionPredFlag[1] = iMotionPredFlag[2] = iMotionPredFlag[3] =
+                         pSlice->sSliceHeaderExt.bDefaultMotionPredFlag; // slice-level default for all four flags
+  iRefCount[0] = pSliceHeader->uiRefCount[0];
+  iRefCount[1] = pSliceHeader->uiRefCount[1]; // NOTE(review): iRefCount[1] is written but never read in this function
 
-	switch( pCurDqLayer->pMbType[iMbXy] )
-	{
-	case MB_TYPE_16x16:
-		{
-			int8_t iRefIdx = 0;
-			if(pSlice->sSliceHeaderExt.bAdaptiveMotionPredFlag)
-			{
-				iMotionPredFlag[0] = BsGetOneBit(pBs);			
-			}
-			if (iMotionPredFlag[0] == 0)
-			{
-				iRefIdx = BsGetTe0(pBs, iRefCount[0]);
-				if (iRefIdx < 0 || iRefIdx >= iNumRefFrames) //error ref_idx
-				{ 
-					return ERR_INFO_INVALID_REF_INDEX;
-				}
-			}
-			else
-            {
-                WelsLog( pCtx, WELS_LOG_WARNING, "inter parse: iMotionPredFlag = 1 not supported. \n" );
-                return GENERATE_ERROR_NO(ERR_LEVEL_MB_DATA, ERR_INFO_UNSUPPORTED_ILP);
-            }
-			PredMv(iMvArray, iRefIdxArray, 0, 4, iRefIdx, iMv);
+  switch (pCurDqLayer->pMbType[iMbXy]) {
+  case MB_TYPE_16x16: { // one reference index and one motion vector
+    int8_t iRefIdx = 0;
+    if (pSlice->sSliceHeaderExt.bAdaptiveMotionPredFlag) { // the flag is read from the bitstream only in adaptive mode
+      iMotionPredFlag[0] = BsGetOneBit (pBs);
+    }
+    if (iMotionPredFlag[0] == 0) {
+      iRefIdx = BsGetTe0 (pBs, iRefCount[0]);
+      if (iRefIdx < 0 || iRefIdx >= iNumRefFrames) { //error ref_idx
+        return ERR_INFO_INVALID_REF_INDEX;
+      }
+    } else {
+      WelsLog (pCtx, WELS_LOG_WARNING, "inter parse: iMotionPredFlag = 1 not supported. \n");
+      return GENERATE_ERROR_NO (ERR_LEVEL_MB_DATA, ERR_INFO_UNSUPPORTED_ILP);
+    }
+    PredMv (iMvArray, iRefIdxArray, 0, 4, iRefIdx, iMv); // assumed to fill iMv with the prediction (out-parameter) - TODO confirm
 
-			iMv[0] += BsGetSe(pBs);
-			iMv[1] += BsGetSe(pBs);
-			
-			UpdateP16x16MotionInfo(pCurDqLayer, iRefIdx, iMv);
-		}
-		break;
-	case MB_TYPE_16x8:
-        {
-            int32_t iRefIdx[2];
-		    for (i = 0; i < 2; i++) 
-		    {
-			    if(pSlice->sSliceHeaderExt.bAdaptiveMotionPredFlag)
-			    {		
-				    iMotionPredFlag[i] = BsGetOneBit(pBs);		
-			    }
-		    }
-    		
-		    for (i = 0; i < 2; i++) 
-		    {
-                if( iMotionPredFlag[i] )
-                {
-                    WelsLog( pCtx, WELS_LOG_WARNING, "inter parse: iMotionPredFlag = 1 not supported. \n" );
-                    return GENERATE_ERROR_NO(ERR_LEVEL_MB_DATA, ERR_INFO_UNSUPPORTED_ILP);
-                }
-			    iRefIdx[i] = BsGetTe0(pBs, iRefCount[0]);
-			    if (iRefIdx[i] < 0 || iRefIdx[i] >= iNumRefFrames) //error ref_idx
-			    { 
-				    return ERR_INFO_INVALID_REF_INDEX;
-			    }
-		    }
-		    for (i = 0; i < 2; i++) 
-		    {
-			    PredInter16x8Mv(iMvArray, iRefIdxArray, i<<3, iRefIdx[i], iMv);
+    iMv[0] += BsGetSe (pBs); // add the two parsed signed values (BsGetSe) on top of what PredMv produced
+    iMv[1] += BsGetSe (pBs);
 
-			    iMv[0] += BsGetSe(pBs);
-			    iMv[1] += BsGetSe(pBs);
+    UpdateP16x16MotionInfo (pCurDqLayer, iRefIdx, iMv);
+  }
+  break;
+  case MB_TYPE_16x8: { // two partitions: both flags first, then both ref indices, then both mvs
+    int32_t iRefIdx[2];
+    for (i = 0; i < 2; i++) {
+      if (pSlice->sSliceHeaderExt.bAdaptiveMotionPredFlag) {
+        iMotionPredFlag[i] = BsGetOneBit (pBs);
+      }
+    }
 
-			    UpdateP16x8MotionInfo(pCurDqLayer, iMvArray, iRefIdxArray, i<<3, iRefIdx[i], iMv);
-		    }
-        }
-		break;
-	case MB_TYPE_8x16:
-        {
-            int32_t iRefIdx[2];
-		    for (i = 0; i < 2; i++) 
-		    {
-			    if(pSlice->sSliceHeaderExt.bAdaptiveMotionPredFlag)
-			    {
-				    iMotionPredFlag[i] = BsGetOneBit(pBs);	
-			    }
-		    }
-    		
-		    for (i = 0; i < 2; i++) 
-		    {
-			    if (iMotionPredFlag[i] == 0)
-			    {
-				    iRefIdx[i] = BsGetTe0(pBs, iRefCount[0]);
-				    if (iRefIdx[i] < 0 || iRefIdx[i] >= iNumRefFrames) //error ref_idx
-				    { 
-					    return ERR_INFO_INVALID_REF_INDEX;
-				    }
-			    }
-			    else
-			    {
-                    WelsLog( pCtx, WELS_LOG_WARNING, "inter parse: iMotionPredFlag = 1 not supported. \n" );
-                    return GENERATE_ERROR_NO(ERR_LEVEL_MB_DATA, ERR_INFO_UNSUPPORTED_ILP);
-			    }
-    			
-		    }
-		    for (i = 0; i < 2; i++) 
-		    {
-			    PredInter8x16Mv( iMvArray, iRefIdxArray, i<<2, iRefIdx[i], iMv);
-    			
-			    iMv[0] += BsGetSe(pBs); 
-			    iMv[1] += BsGetSe(pBs);
+    for (i = 0; i < 2; i++) {
+      if (iMotionPredFlag[i]) { // a set flag aborts before the ref index is read
+        WelsLog (pCtx, WELS_LOG_WARNING, "inter parse: iMotionPredFlag = 1 not supported. \n");
+        return GENERATE_ERROR_NO (ERR_LEVEL_MB_DATA, ERR_INFO_UNSUPPORTED_ILP);
+      }
+      iRefIdx[i] = BsGetTe0 (pBs, iRefCount[0]);
+      if (iRefIdx[i] < 0 || iRefIdx[i] >= iNumRefFrames) { //error ref_idx
+        return ERR_INFO_INVALID_REF_INDEX;
+      }
+    }
+    for (i = 0; i < 2; i++) {
+      PredInter16x8Mv (iMvArray, iRefIdxArray, i << 3, iRefIdx[i], iMv); // partition start index: 0, then 8
 
-			    UpdateP8x16MotionInfo(pCurDqLayer, iMvArray, iRefIdxArray, i<<2, iRefIdx[i], iMv);
-		    }
+      iMv[0] += BsGetSe (pBs);
+      iMv[1] += BsGetSe (pBs);
+
+      UpdateP16x8MotionInfo (pCurDqLayer, iMvArray, iRefIdxArray, i << 3, iRefIdx[i], iMv);
+    }
+  }
+  break;
+  case MB_TYPE_8x16: { // two partitions, same parse order as the 16x8 case
+    int32_t iRefIdx[2];
+    for (i = 0; i < 2; i++) {
+      if (pSlice->sSliceHeaderExt.bAdaptiveMotionPredFlag) {
+        iMotionPredFlag[i] = BsGetOneBit (pBs);
+      }
+    }
+
+    for (i = 0; i < 2; i++) {
+      if (iMotionPredFlag[i] == 0) {
+        iRefIdx[i] = BsGetTe0 (pBs, iRefCount[0]);
+        if (iRefIdx[i] < 0 || iRefIdx[i] >= iNumRefFrames) { //error ref_idx
+          return ERR_INFO_INVALID_REF_INDEX;
         }
-		break;
-	case MB_TYPE_8x8:
-	case MB_TYPE_8x8_REF0:
-		{
-			int8_t iRefIdx[4] = {0}, iSubPartCount[4], iPartWidth[4];
-			uint32_t uiSubMbType;
+      } else {
+        WelsLog (pCtx, WELS_LOG_WARNING, "inter parse: iMotionPredFlag = 1 not supported. \n");
+        return GENERATE_ERROR_NO (ERR_LEVEL_MB_DATA, ERR_INFO_UNSUPPORTED_ILP);
+      }
 
-			if ( MB_TYPE_8x8_REF0 == pCurDqLayer->pMbType[iMbXy])
-			{
-				iRefCount[0]	=
-				iRefCount[1]	= 1;
-			}
+    }
+    for (i = 0; i < 2; i++) {
+      PredInter8x16Mv (iMvArray, iRefIdxArray, i << 2, iRefIdx[i], iMv); // partition start index: 0, then 4
 
-			//uiSubMbType, partition
-			for (i = 0; i < 4; i++) 
-			{
-				uiSubMbType = BsGetUe(pBs);
-				if (uiSubMbType >= 4) //invalid uiSubMbType
-				{
-					return ERR_INFO_INVALID_SUB_MB_TYPE;
-				}
-				pCurDqLayer->pSubMbType[iMbXy][i] = g_ksInterSubMbTypeInfo[uiSubMbType].iType;
-				iSubPartCount[i] = g_ksInterSubMbTypeInfo[uiSubMbType].iPartCount;
-				iPartWidth[i] = g_ksInterSubMbTypeInfo[uiSubMbType].iPartWidth;
-			}
+      iMv[0] += BsGetSe (pBs);
+      iMv[1] += BsGetSe (pBs);
 
-			if(pSlice->sSliceHeaderExt.bAdaptiveMotionPredFlag)
-			{
-				for(i=0; i<4; i++)
-				{			
-					iMotionPredFlag[i] = BsGetOneBit(pBs);				
-				}
-			}
-			
-			//iRefIdxArray
-			if (MB_TYPE_8x8_REF0 == pCurDqLayer->pMbType[iMbXy])
-			{
-				memset(pCurDqLayer->pRefIndex[0][iMbXy], 0, 16);
-			}
-			else
-			{
-				for (i = 0; i < 4; i++) 
-				{
-					int16_t iIndex8 = i << 2;
-					uint8_t uiScan4Idx = g_kuiScan4[iIndex8];
+      UpdateP8x16MotionInfo (pCurDqLayer, iMvArray, iRefIdxArray, i << 2, iRefIdx[i], iMv);
+    }
+  }
+  break;
+  case MB_TYPE_8x8:
+  case MB_TYPE_8x8_REF0: {
+    int8_t iRefIdx[4] = {0}, iSubPartCount[4], iPartWidth[4]; // iRefIdx stays all-zero on the MB_TYPE_8x8_REF0 path
+    uint32_t uiSubMbType;
 
-					if (iMotionPredFlag[i] == 0)
-					{
-						iRefIdx[i] = BsGetTe0(pBs, iRefCount[0]);
-						if (iRefIdx[i] < 0 || iRefIdx[i] >= iNumRefFrames) //error ref_idx 
-						{
-							return ERR_INFO_INVALID_REF_INDEX;
-						}
+    if (MB_TYPE_8x8_REF0 == pCurDqLayer->pMbType[iMbXy]) { // REF0 variant: reference count forced to 1
+      iRefCount[0]	=
+        iRefCount[1]	= 1;
+    }
 
-						pCurDqLayer->pRefIndex[0][iMbXy][uiScan4Idx  ] = pCurDqLayer->pRefIndex[0][iMbXy][uiScan4Idx+1] =
-						pCurDqLayer->pRefIndex[0][iMbXy][uiScan4Idx+4] = pCurDqLayer->pRefIndex[0][iMbXy][uiScan4Idx+5] = iRefIdx[i];
-					}
-					else
-                    {
-                        WelsLog( pCtx, WELS_LOG_WARNING, "inter parse: iMotionPredFlag = 1 not supported. \n" );
-                        return GENERATE_ERROR_NO(ERR_LEVEL_MB_DATA, ERR_INFO_UNSUPPORTED_ILP);
-                    }
-				}
-			}
-			
-			//gain mv and update mv cache
-			for (i = 0; i < 4; i++) 
-			{
-				int8_t iPartCount = iSubPartCount[i];
-				uint32_t uiSubMbType = pCurDqLayer->pSubMbType[iMbXy][i];
-				int16_t iMv[2], iPartIdx, iBlockWidth = iPartWidth[i], iIdx = i << 2;				
-				uint8_t uiScan4Idx, uiCacheIdx;				
-				
-				uint8_t uiIdx4Cache = g_kuiCache30ScanIdx[iIdx];
-				
-				iRefIdxArray[0][uiIdx4Cache  ] = iRefIdxArray[0][uiIdx4Cache+1] =
-				iRefIdxArray[0][uiIdx4Cache+6] = iRefIdxArray[0][uiIdx4Cache+7] = iRefIdx[i];
-					
-				for (j = 0; j < iPartCount; j++) 
-				{
-					iPartIdx = iIdx + j * iBlockWidth;
-					uiScan4Idx = g_kuiScan4[iPartIdx];
-					uiCacheIdx = g_kuiCache30ScanIdx[iPartIdx];
-					PredMv(iMvArray, iRefIdxArray, iPartIdx, iBlockWidth, iRefIdx[i], iMv);
+    //uiSubMbType, partition
+    for (i = 0; i < 4; i++) {
+      uiSubMbType = BsGetUe (pBs);
+      if (uiSubMbType >= 4) { //invalid uiSubMbType
+        return ERR_INFO_INVALID_SUB_MB_TYPE;
+      }
+      pCurDqLayer->pSubMbType[iMbXy][i] = g_ksInterSubMbTypeInfo[uiSubMbType].iType; // table lookup (g_ksInterSubMbTypeInfo is defined elsewhere)
+      iSubPartCount[i] = g_ksInterSubMbTypeInfo[uiSubMbType].iPartCount;
+      iPartWidth[i] = g_ksInterSubMbTypeInfo[uiSubMbType].iPartWidth;
+    }
 
-					iMv[0] += BsGetSe(pBs); 
-					iMv[1] += BsGetSe(pBs);
-					
-					if (SUB_MB_TYPE_8x8 == uiSubMbType) 
-					{
-						ST32(pCurDqLayer->pMv[0][iMbXy][uiScan4Idx], LD32(iMv));
-						ST32(pCurDqLayer->pMv[0][iMbXy][uiScan4Idx+1], LD32(iMv));
-						ST32(pCurDqLayer->pMv[0][iMbXy][uiScan4Idx+4], LD32(iMv));
-						ST32(pCurDqLayer->pMv[0][iMbXy][uiScan4Idx+5], LD32(iMv));
-						ST32(iMvArray[0][uiCacheIdx  ], LD32(iMv));
-						ST32(iMvArray[0][uiCacheIdx+1], LD32(iMv));
-						ST32(iMvArray[0][uiCacheIdx+6], LD32(iMv));
-						ST32(iMvArray[0][uiCacheIdx+7], LD32(iMv));
-					}
-					else if (SUB_MB_TYPE_8x4 == uiSubMbType) 
-					{
-						ST32(pCurDqLayer->pMv[0][iMbXy][uiScan4Idx  ], LD32(iMv));
-						ST32(pCurDqLayer->pMv[0][iMbXy][uiScan4Idx+1], LD32(iMv));
-						ST32(iMvArray[0][uiCacheIdx  ], LD32(iMv));
-						ST32(iMvArray[0][uiCacheIdx+1], LD32(iMv));
-					}
-					else if (SUB_MB_TYPE_4x8 == uiSubMbType) 
-					{
-						ST32(pCurDqLayer->pMv[0][iMbXy][uiScan4Idx  ], LD32(iMv));
-						ST32(pCurDqLayer->pMv[0][iMbXy][uiScan4Idx+4], LD32(iMv));
-						ST32(iMvArray[0][uiCacheIdx  ], LD32(iMv));
-						ST32(iMvArray[0][uiCacheIdx+6], LD32(iMv));
-					}
-					else //SUB_MB_TYPE_4x4 == uiSubMbType
-					{
-						ST32(pCurDqLayer->pMv[0][iMbXy][uiScan4Idx  ], LD32(iMv));
-						ST32(iMvArray[0][uiCacheIdx  ], LD32(iMv));
-					}
-				}
-			}
-		}
-		break;
-	default:
-		break;
-	}	
+    if (pSlice->sSliceHeaderExt.bAdaptiveMotionPredFlag) { // adaptive mode: four flags are read, one per sub-macroblock
+      for (i = 0; i < 4; i++) {
+        iMotionPredFlag[i] = BsGetOneBit (pBs);
+      }
+    }
 
-	return 0;
+    //iRefIdxArray
+    if (MB_TYPE_8x8_REF0 == pCurDqLayer->pMbType[iMbXy]) { // NOTE(review): on this path the flags read above are never checked
+      memset (pCurDqLayer->pRefIndex[0][iMbXy], 0, 16); // 16 bytes set to 0; no ref index is read from the bitstream
+    } else {
+      for (i = 0; i < 4; i++) {
+        int16_t iIndex8 = i << 2;
+        uint8_t uiScan4Idx = g_kuiScan4[iIndex8];
+
+        if (iMotionPredFlag[i] == 0) {
+          iRefIdx[i] = BsGetTe0 (pBs, iRefCount[0]);
+          if (iRefIdx[i] < 0 || iRefIdx[i] >= iNumRefFrames) { //error ref_idx
+            return ERR_INFO_INVALID_REF_INDEX;
+          }
+
+          pCurDqLayer->pRefIndex[0][iMbXy][uiScan4Idx  ] = pCurDqLayer->pRefIndex[0][iMbXy][uiScan4Idx + 1] =
+                pCurDqLayer->pRefIndex[0][iMbXy][uiScan4Idx + 4] = pCurDqLayer->pRefIndex[0][iMbXy][uiScan4Idx + 5] = iRefIdx[i]; // same ref idx written to entries +0, +1, +4, +5
+        } else {
+          WelsLog (pCtx, WELS_LOG_WARNING, "inter parse: iMotionPredFlag = 1 not supported. \n");
+          return GENERATE_ERROR_NO (ERR_LEVEL_MB_DATA, ERR_INFO_UNSUPPORTED_ILP);
+        }
+      }
+    }
+
+    //get the mv of every sub-partition and update the mv cache
+    for (i = 0; i < 4; i++) {
+      int8_t iPartCount = iSubPartCount[i];
+      uint32_t uiSubMbType = pCurDqLayer->pSubMbType[iMbXy][i]; // NOTE: shadows the case-level uiSubMbType; holds the stored iType, not the raw parsed value
+      int16_t iMv[2], iPartIdx, iBlockWidth = iPartWidth[i], iIdx = i << 2; // NOTE: this iMv shadows the function-level iMv
+      uint8_t uiScan4Idx, uiCacheIdx;
+
+      uint8_t uiIdx4Cache = g_kuiCache30ScanIdx[iIdx];
+
+      iRefIdxArray[0][uiIdx4Cache  ] = iRefIdxArray[0][uiIdx4Cache + 1] =
+                                         iRefIdxArray[0][uiIdx4Cache + 6] = iRefIdxArray[0][uiIdx4Cache + 7] = iRefIdx[i]; // cache entries +0, +1, +6, +7 get this sub-macroblock's ref idx
+
+      for (j = 0; j < iPartCount; j++) {
+        iPartIdx = iIdx + j * iBlockWidth;
+        uiScan4Idx = g_kuiScan4[iPartIdx];
+        uiCacheIdx = g_kuiCache30ScanIdx[iPartIdx];
+        PredMv (iMvArray, iRefIdxArray, iPartIdx, iBlockWidth, iRefIdx[i], iMv); // assumed to fill iMv with the prediction - TODO confirm
+
+        iMv[0] += BsGetSe (pBs);
+        iMv[1] += BsGetSe (pBs);
+
+        if (SUB_MB_TYPE_8x8 == uiSubMbType) { // four entries: +0, +1, +4, +5 (cache: +0, +1, +6, +7)
+          ST32 (pCurDqLayer->pMv[0][iMbXy][uiScan4Idx], LD32 (iMv));
+          ST32 (pCurDqLayer->pMv[0][iMbXy][uiScan4Idx + 1], LD32 (iMv));
+          ST32 (pCurDqLayer->pMv[0][iMbXy][uiScan4Idx + 4], LD32 (iMv));
+          ST32 (pCurDqLayer->pMv[0][iMbXy][uiScan4Idx + 5], LD32 (iMv));
+          ST32 (iMvArray[0][uiCacheIdx  ], LD32 (iMv));
+          ST32 (iMvArray[0][uiCacheIdx + 1], LD32 (iMv));
+          ST32 (iMvArray[0][uiCacheIdx + 6], LD32 (iMv));
+          ST32 (iMvArray[0][uiCacheIdx + 7], LD32 (iMv));
+        } else if (SUB_MB_TYPE_8x4 == uiSubMbType) { // two entries: +0, +1
+          ST32 (pCurDqLayer->pMv[0][iMbXy][uiScan4Idx  ], LD32 (iMv));
+          ST32 (pCurDqLayer->pMv[0][iMbXy][uiScan4Idx + 1], LD32 (iMv));
+          ST32 (iMvArray[0][uiCacheIdx  ], LD32 (iMv));
+          ST32 (iMvArray[0][uiCacheIdx + 1], LD32 (iMv));
+        } else if (SUB_MB_TYPE_4x8 == uiSubMbType) { // two entries: +0, +4 (cache: +0, +6)
+          ST32 (pCurDqLayer->pMv[0][iMbXy][uiScan4Idx  ], LD32 (iMv));
+          ST32 (pCurDqLayer->pMv[0][iMbXy][uiScan4Idx + 4], LD32 (iMv));
+          ST32 (iMvArray[0][uiCacheIdx  ], LD32 (iMv));
+          ST32 (iMvArray[0][uiCacheIdx + 6], LD32 (iMv));
+        } else { //SUB_MB_TYPE_4x4 == uiSubMbType: a single entry
+          ST32 (pCurDqLayer->pMv[0][iMbXy][uiScan4Idx  ], LD32 (iMv));
+          ST32 (iMvArray[0][uiCacheIdx  ], LD32 (iMv));
+        }
+      }
+    }
+  }
+  break;
+  default: // any other MB type: nothing is parsed here
+    break;
+  }
+
+  return 0;
 }
 
 } // namespace WelsDec
\ No newline at end of file
--- a/codec/decoder/core/src/pic_queue.cpp
+++ b/codec/decoder/core/src/pic_queue.cpp
@@ -48,13 +48,13 @@
 
 namespace WelsDec {
 
-void_t FreePicture( PPicture pPic );
+void_t FreePicture (PPicture pPic);
 
 
 ///////////////////////////////////Recycled queue management for pictures///////////////////////////////////
 /*	 ______________________________________
   -->| P0 | P1 | P2 | P3 | P4 | .. | Pn-1 |-->
-	 -------------------------------------- 
+	 --------------------------------------
  *
  *	How does it work?
  *	node <- next; ++ next;
@@ -63,104 +63,94 @@
 
 
 
-PPicture AllocPicture( PWelsDecoderContext pCtx, const int32_t kiPicWidth, const int32_t kiPicHeight )
-{
-	PPicture pPic = NULL;
-	int32_t iPicWidth = 0;
-	int32_t iPicHeight= 0;
+PPicture AllocPicture (PWelsDecoderContext pCtx, const int32_t kiPicWidth, const int32_t kiPicHeight) { // allocates and initializes one SPicture for a kiPicWidth x kiPicHeight frame; the caller releases it with FreePicture
+  PPicture pPic = NULL;
+  int32_t iPicWidth = 0;
+  int32_t iPicHeight = 0;
 
-	int32_t iPicChromaWidth	= 0;
-	int32_t iPicChromaHeight	= 0;
-	int32_t iLumaSize			= 0;
-	int32_t iChromaSize			= 0;	
+  int32_t iPicChromaWidth	= 0;
+  int32_t iPicChromaHeight	= 0;
+  int32_t iLumaSize			= 0;
+  int32_t iChromaSize			= 0;
 
-	pPic	= (PPicture) WelsMalloc( sizeof(SPicture), "PPicture" );	
-	WELS_VERIFY_RETURN_IF( NULL, NULL == pPic );
-	
-	memset(pPic, 0, sizeof(SPicture) );
-	
-	iPicWidth = WELS_ALIGN(kiPicWidth + (PADDING_LENGTH<<1), PICTURE_RESOLUTION_ALIGNMENT);
-	iPicHeight = WELS_ALIGN(kiPicHeight + (PADDING_LENGTH<<1), PICTURE_RESOLUTION_ALIGNMENT);
-	iPicChromaWidth	= iPicWidth >> 1;
-	iPicChromaHeight	= iPicHeight >> 1;
-	
-	iLumaSize	= iPicWidth * iPicHeight;
-	iChromaSize	= iPicChromaWidth * iPicChromaHeight;
-	if(pCtx->iDecoderMode == SW_MODE)
-	{
-		pPic->pBuffer[0]	= static_cast<uint8_t*> (WelsMalloc(	iLumaSize /* luma */
-								  + (iChromaSize << 1) /* Cb,Cr */, "_pic->buffer[0]" ) );
+  pPic	= (PPicture) WelsMalloc (sizeof (SPicture), "PPicture");
+  WELS_VERIFY_RETURN_IF (NULL, NULL == pPic); // presumably returns NULL from this function when the allocation failed - TODO confirm against the macro
 
-		WELS_VERIFY_RETURN_PROC_IF( NULL, NULL == pPic->pBuffer[0], FreePicture(pPic) );
-		pPic->iLinesize[0] = iPicWidth;
-		pPic->iLinesize[1] = pPic->iLinesize[2] = iPicChromaWidth;
-		pPic->pBuffer[1]	= pPic->pBuffer[0] + iLumaSize;
-		pPic->pBuffer[2]	= pPic->pBuffer[1] + iChromaSize;
-		pPic->pData[0]	= pPic->pBuffer[0] + (1+pPic->iLinesize[0]) * PADDING_LENGTH;
-		pPic->pData[1]	= pPic->pBuffer[1] + /*WELS_ALIGN*/( ((1+pPic->iLinesize[1]) * PADDING_LENGTH) >> 1 );
-		pPic->pData[2]	= pPic->pBuffer[2] + /*WELS_ALIGN*/( ((1+pPic->iLinesize[2]) * PADDING_LENGTH) >> 1 );
-	}	
+  memset (pPic, 0, sizeof (SPicture)); // every field, including pBuffer[] and pData[], starts out zeroed
 
+  iPicWidth = WELS_ALIGN (kiPicWidth + (PADDING_LENGTH << 1), PICTURE_RESOLUTION_ALIGNMENT); // requested size plus 2 * PADDING_LENGTH, then aligned
+  iPicHeight = WELS_ALIGN (kiPicHeight + (PADDING_LENGTH << 1), PICTURE_RESOLUTION_ALIGNMENT);
+  iPicChromaWidth	= iPicWidth >> 1; // chroma planes are half the luma size in each dimension
+  iPicChromaHeight	= iPicHeight >> 1;
 
+  iLumaSize	= iPicWidth * iPicHeight;
+  iChromaSize	= iPicChromaWidth * iPicChromaHeight;
+  if (pCtx->iDecoderMode == SW_MODE) { // NOTE: in any other mode no pixel buffer is allocated and pBuffer[]/pData[]/iLinesize[] stay zero
+    pPic->pBuffer[0]	= static_cast<uint8_t*> (WelsMalloc (iLumaSize /* luma */
+                        + (iChromaSize << 1) /* Cb,Cr */, "_pic->buffer[0]"));
 
-	pPic->iPlanes		= 3;	// yv12 in default
-	pPic->iWidthInPixel	= kiPicWidth;
-	pPic->iHeightInPixel= kiPicHeight;
-	pPic->iFrameNum		= -1;
-	pPic->bAvailableFlag= true;
+    WELS_VERIFY_RETURN_PROC_IF (NULL, NULL == pPic->pBuffer[0], FreePicture (pPic)); // presumably runs FreePicture (pPic) and returns NULL on failure - TODO confirm against the macro
+    pPic->iLinesize[0] = iPicWidth;
+    pPic->iLinesize[1] = pPic->iLinesize[2] = iPicChromaWidth;
+    pPic->pBuffer[1]	= pPic->pBuffer[0] + iLumaSize; // the three planes are laid out back to back in the single allocation
+    pPic->pBuffer[2]	= pPic->pBuffer[1] + iChromaSize;
+    pPic->pData[0]	= pPic->pBuffer[0] + (1 + pPic->iLinesize[0]) * PADDING_LENGTH; // skip PADDING_LENGTH rows and PADDING_LENGTH columns
+    pPic->pData[1]	= pPic->pBuffer[1] + /*WELS_ALIGN*/ (((1 + pPic->iLinesize[1]) * PADDING_LENGTH) >> 1); // chroma offset: half the padding
+    pPic->pData[2]	= pPic->pBuffer[2] + /*WELS_ALIGN*/ (((1 + pPic->iLinesize[2]) * PADDING_LENGTH) >> 1);
+  }
 
-	return pPic;
+
+
+  pPic->iPlanes		= 3;	// yv12 by default
+  pPic->iWidthInPixel	= kiPicWidth; // the unpadded, caller-requested dimensions are what gets recorded
+  pPic->iHeightInPixel = kiPicHeight;
+  pPic->iFrameNum		= -1;
+  pPic->bAvailableFlag = true;
+
+  return pPic;
 }
 
-void_t FreePicture( PPicture pPic )
-{
-	if ( NULL != pPic )
-	{
+void_t FreePicture (PPicture pPic) { // releases a picture and its pixel buffer; a NULL argument is a no-op
+  if (NULL != pPic) {
 
-		if ( pPic->pBuffer[0] )
-		{
-			WelsFree( pPic->pBuffer[0], "pPic->pBuffer[0]" );
-		}		
+    if (pPic->pBuffer[0]) { // only pBuffer[0] is freed here
+      WelsFree (pPic->pBuffer[0], "pPic->pBuffer[0]");
+    }
 
-		WelsFree( pPic, "pPic" );
+    WelsFree (pPic, "pPic");
 
-		pPic = NULL;
-	}
+    pPic = NULL; // NOTE: pPic is a by-value parameter, so this clears only the local copy; the caller's pointer is left dangling
+  }
 }
-PPicture PrefetchPic( PPicBuff pPicBuf )
-{
-	int32_t iPicIdx = 0;
-	PPicture pPic  = NULL;
+PPicture PrefetchPic (PPicBuff pPicBuf) { // returns the next picture that is available and not used as reference, searching circularly after iCurrentIdx; NULL if there is none
+  int32_t iPicIdx = 0;
+  PPicture pPic  = NULL;
 
-	if (pPicBuf->iCapacity == 0)
-	{
-		return NULL;
-	}
+  if (pPicBuf->iCapacity == 0) {
+    return NULL;
+  }
 
-	for ( iPicIdx = pPicBuf->iCurrentIdx+1; iPicIdx<pPicBuf->iCapacity ;++iPicIdx)
-	{
-		if (pPicBuf->ppPic[iPicIdx] !=NULL && pPicBuf->ppPic[iPicIdx]->bAvailableFlag && !pPicBuf->ppPic[iPicIdx]->bUsedAsRef)
-		{
-			pPic = pPicBuf->ppPic[iPicIdx];
-			break;
-		}
-	}
-	if (pPic !=NULL)
-	{
-		pPicBuf->iCurrentIdx = iPicIdx;
-		return pPic;
-	}
-	for ( iPicIdx = 0 ; iPicIdx<pPicBuf->iCurrentIdx ;++iPicIdx)
-	{
-		if (pPicBuf->ppPic[iPicIdx] !=NULL && pPicBuf->ppPic[iPicIdx]->bAvailableFlag && !pPicBuf->ppPic[iPicIdx]->bUsedAsRef)
-		{
-			pPic = pPicBuf->ppPic[iPicIdx];
-			break;
-		}
-	}
-	
-	pPicBuf->iCurrentIdx = iPicIdx;
-	return pPic;
+  for (iPicIdx = pPicBuf->iCurrentIdx + 1; iPicIdx < pPicBuf->iCapacity ; ++iPicIdx) { // first pass: slots after the current one, up to the end
+    if (pPicBuf->ppPic[iPicIdx] != NULL && pPicBuf->ppPic[iPicIdx]->bAvailableFlag
+        && !pPicBuf->ppPic[iPicIdx]->bUsedAsRef) {
+      pPic = pPicBuf->ppPic[iPicIdx];
+      break;
+    }
+  }
+  if (pPic != NULL) { // found in the first pass: remember where and stop
+    pPicBuf->iCurrentIdx = iPicIdx;
+    return pPic;
+  }
+  for (iPicIdx = 0 ; iPicIdx < pPicBuf->iCurrentIdx ; ++iPicIdx) { // second pass: wrap around, slots before the current one; slot iCurrentIdx itself is never examined
+    if (pPicBuf->ppPic[iPicIdx] != NULL && pPicBuf->ppPic[iPicIdx]->bAvailableFlag
+        && !pPicBuf->ppPic[iPicIdx]->bUsedAsRef) {
+      pPic = pPicBuf->ppPic[iPicIdx];
+      break;
+    }
+  }
+
+  pPicBuf->iCurrentIdx = iPicIdx; // index of the hit, or where the second pass stopped when nothing was found
+  return pPic; // still NULL when neither pass found a usable picture
 }
 
 } // namespace WelsDec
\ No newline at end of file
--- a/codec/decoder/core/src/rec_mb.cpp
+++ b/codec/decoder/core/src/rec_mb.cpp
@@ -49,168 +49,158 @@
 
 namespace WelsDec {
 
-void_t WelsFillRecNeededMbInfo(PWelsDecoderContext pCtx, bool_t bOutput, PDqLayer pCurLayer)
-{
-	PPicture pCurPic = pCtx->pDec;
-	int32_t iLumaStride   = pCurPic->iLinesize[0];
-	int32_t iChromaStride = pCurPic->iLinesize[1];
-	int32_t iMbX = pCurLayer->iMbX;
-	int32_t iMbY = pCurLayer->iMbY;
-	
-	pCurLayer->iLumaStride= iLumaStride;
-	pCurLayer->iChromaStride= iChromaStride;
-	
-	if(bOutput)
-	{
-		pCurLayer->pPred[0] = pCurPic->pData[0] + ((iMbY * iLumaStride + iMbX)<<4);
-		pCurLayer->pPred[1] = pCurPic->pData[1] + ((iMbY * iChromaStride + iMbX)<<3);
-		pCurLayer->pPred[2] = pCurPic->pData[2] + ((iMbY * iChromaStride + iMbX)<<3);
-	}
+void_t WelsFillRecNeededMbInfo (PWelsDecoderContext pCtx, bool_t bOutput, PDqLayer pCurLayer) {
+  PPicture pCurPic = pCtx->pDec;
+  int32_t iLumaStride   = pCurPic->iLinesize[0];
+  int32_t iChromaStride = pCurPic->iLinesize[1];
+  int32_t iMbX = pCurLayer->iMbX;
+  int32_t iMbY = pCurLayer->iMbY;
+
+  pCurLayer->iLumaStride = iLumaStride;
+  pCurLayer->iChromaStride = iChromaStride;
+
+  if (bOutput) {
+    pCurLayer->pPred[0] = pCurPic->pData[0] + ((iMbY * iLumaStride + iMbX) << 4);
+    pCurLayer->pPred[1] = pCurPic->pData[1] + ((iMbY * iChromaStride + iMbX) << 3);
+    pCurLayer->pPred[2] = pCurPic->pData[2] + ((iMbY * iChromaStride + iMbX) << 3);
+  }
 }
 
-int32_t RecI4x4Mb(int32_t iMBXY, PWelsDecoderContext pCtx, int16_t *pScoeffLevel, PDqLayer pDqLayer)
-{
-	RecI4x4Luma(iMBXY, pCtx, pScoeffLevel, pDqLayer);
-	RecI4x4Chroma( iMBXY, pCtx, pScoeffLevel, pDqLayer);
-	return ERR_NONE;
+int32_t RecI4x4Mb (int32_t iMBXY, PWelsDecoderContext pCtx, int16_t* pScoeffLevel, PDqLayer pDqLayer) {
+  RecI4x4Luma (iMBXY, pCtx, pScoeffLevel, pDqLayer);
+  RecI4x4Chroma (iMBXY, pCtx, pScoeffLevel, pDqLayer);
+  return ERR_NONE;
 }
 
-int32_t RecI4x4Luma(int32_t iMBXY, PWelsDecoderContext pCtx, int16_t *pScoeffLevel, PDqLayer pDqLayer)
-{
-	/*****get local variable from outer variable********/
-	/*prediction info*/
-	uint8_t *pPred = pDqLayer->pPred[0];
-	
-	int32_t iLumaStride = pDqLayer->iLumaStride;
-	int32_t *pBlockOffset = pCtx->iDecBlockOffsetArray;
-	PGetIntraPredFunc *pGetI4x4LumaPredFunc = pCtx->pGetI4x4LumaPredFunc;	
-	
-	int8_t *pIntra4x4PredMode = pDqLayer->pIntra4x4FinalMode[iMBXY];
-	int16_t *pRS = pScoeffLevel;
-	/*itransform info*/
-	PIdctResAddPredFunc	pIdctResAddPredFunc = pCtx->pIdctResAddPredFunc;
-	
-	
-	/*************local variable********************/
-	uint8_t i = 0;
-	
-	/*************real process*********************/
-	for(i=0; i<16; i++)
-	{
-		
-		uint8_t *pPredI4x4 = pPred + pBlockOffset[i];
-		uint8_t uiMode= pIntra4x4PredMode[g_kuiScan4[i]];
-		
-		pGetI4x4LumaPredFunc[uiMode](pPredI4x4, iLumaStride);
-	
-		if ( pDqLayer->pNzc[iMBXY][g_kuiMbNonZeroCountIdx[i]] )
-		{	
-			int16_t *pRSI4x4 = &pRS[i<<4];
-			pIdctResAddPredFunc(pPredI4x4, iLumaStride, pRSI4x4);
-		}
-	}	
-	
-	return ERR_NONE;
+int32_t RecI4x4Luma (int32_t iMBXY, PWelsDecoderContext pCtx, int16_t* pScoeffLevel, PDqLayer pDqLayer) {
+  /*****get local variable from outer variable********/
+  /*prediction info*/
+  uint8_t* pPred = pDqLayer->pPred[0];
+
+  int32_t iLumaStride = pDqLayer->iLumaStride;
+  int32_t* pBlockOffset = pCtx->iDecBlockOffsetArray;
+  PGetIntraPredFunc* pGetI4x4LumaPredFunc = pCtx->pGetI4x4LumaPredFunc;
+
+  int8_t* pIntra4x4PredMode = pDqLayer->pIntra4x4FinalMode[iMBXY];
+  int16_t* pRS = pScoeffLevel;
+  /*itransform info*/
+  PIdctResAddPredFunc	pIdctResAddPredFunc = pCtx->pIdctResAddPredFunc;
+
+
+  /*************local variable********************/
+  uint8_t i = 0;
+
+  /*************real process*********************/
+  for (i = 0; i < 16; i++) {
+
+    uint8_t* pPredI4x4 = pPred + pBlockOffset[i];
+    uint8_t uiMode = pIntra4x4PredMode[g_kuiScan4[i]];
+
+    pGetI4x4LumaPredFunc[uiMode] (pPredI4x4, iLumaStride);
+
+    if (pDqLayer->pNzc[iMBXY][g_kuiMbNonZeroCountIdx[i]]) {
+      int16_t* pRSI4x4 = &pRS[i << 4];
+      pIdctResAddPredFunc (pPredI4x4, iLumaStride, pRSI4x4);
+    }
+  }
+
+  return ERR_NONE;
 }
 
 
-int32_t RecI4x4Chroma(int32_t iMBXY, PWelsDecoderContext pCtx, int16_t *pScoeffLevel, PDqLayer pDqLayer)
-{
-	int32_t iChromaStride = pCtx->pCurDqLayer->iCsStride[1];
+int32_t RecI4x4Chroma (int32_t iMBXY, PWelsDecoderContext pCtx, int16_t* pScoeffLevel, PDqLayer pDqLayer) {
+  int32_t iChromaStride = pCtx->pCurDqLayer->iCsStride[1];
 
-	int8_t iChromaPredMode = pDqLayer->pChromaPredMode[iMBXY];
-	
-	PGetIntraPredFunc *pGetIChromaPredFunc = pCtx->pGetIChromaPredFunc;
+  int8_t iChromaPredMode = pDqLayer->pChromaPredMode[iMBXY];
 
-	uint8_t *pPred = pDqLayer->pPred[1];
+  PGetIntraPredFunc* pGetIChromaPredFunc = pCtx->pGetIChromaPredFunc;
 
-	pGetIChromaPredFunc[iChromaPredMode](pPred, iChromaStride);
-	pPred = pDqLayer->pPred[2];
-	pGetIChromaPredFunc[iChromaPredMode](pPred, iChromaStride);
-	
-	RecChroma(iMBXY, pCtx, pScoeffLevel, pDqLayer);
+  uint8_t* pPred = pDqLayer->pPred[1];
 
-	return ERR_NONE;
+  pGetIChromaPredFunc[iChromaPredMode] (pPred, iChromaStride);
+  pPred = pDqLayer->pPred[2];
+  pGetIChromaPredFunc[iChromaPredMode] (pPred, iChromaStride);
+
+  RecChroma (iMBXY, pCtx, pScoeffLevel, pDqLayer);
+
+  return ERR_NONE;
 }
 
 
-int32_t RecI16x16Mb(int32_t iMBXY, PWelsDecoderContext pCtx, int16_t *pScoeffLevel, PDqLayer pDqLayer)
-{
-	/*decoder use, encoder no use*/
-	int8_t iI16x16PredMode = pDqLayer->pIntraPredMode[iMBXY][7];
-	int8_t iChromaPredMode = pDqLayer->pChromaPredMode[iMBXY];
-	PGetIntraPredFunc *pGetIChromaPredFunc = pCtx->pGetIChromaPredFunc;
-	PGetIntraPredFunc *pGetI16x16LumaPredFunc = pCtx->pGetI16x16LumaPredFunc;
-	int32_t iUVStride = pCtx->pCurDqLayer->iCsStride[1];
-	
-	/*common use by decoder&encoder*/
-	int32_t iYStride = pDqLayer->iLumaStride;
-	int32_t *pBlockOffset = pCtx->iDecBlockOffsetArray;
-	int16_t *pRS = pScoeffLevel;
-	
-	uint8_t *pPred = pDqLayer->pPred[0];
-	
-	PIdctResAddPredFunc pIdctResAddPredFunc = pCtx->pIdctResAddPredFunc;
-		
-	uint8_t i = 0;
-	
-	/*decode i16x16 y*/
-	pGetI16x16LumaPredFunc[iI16x16PredMode](pPred, iYStride);
-	
-	/*1 mb is divided 16 4x4_block to idct*/
-	for(i=0; i<16; i++)
-	{
-		int16_t *pRSI4x4 = pRS + (i<<4);
-		uint8_t *pPredI4x4 = pPred + pBlockOffset[i];
-		
-		if ( pDqLayer->pNzc[iMBXY][g_kuiMbNonZeroCountIdx[i]] || pRSI4x4[0] )
-		{
-			pIdctResAddPredFunc(pPredI4x4, iYStride, pRSI4x4);
-		}
-	}
-	
-	/*decode intra mb cb&cr*/
-	pPred = pDqLayer->pPred[1];
-	pGetIChromaPredFunc[iChromaPredMode](pPred, iUVStride);
-	pPred = pDqLayer->pPred[2];
-	pGetIChromaPredFunc[iChromaPredMode](pPred, iUVStride);
-	RecChroma(iMBXY, pCtx, pScoeffLevel,pDqLayer);
-	
-	return ERR_NONE;
+int32_t RecI16x16Mb (int32_t iMBXY, PWelsDecoderContext pCtx, int16_t* pScoeffLevel, PDqLayer pDqLayer) {
+  /*used by the decoder only, not by the encoder*/
+  int8_t iI16x16PredMode = pDqLayer->pIntraPredMode[iMBXY][7];
+  int8_t iChromaPredMode = pDqLayer->pChromaPredMode[iMBXY];
+  PGetIntraPredFunc* pGetIChromaPredFunc = pCtx->pGetIChromaPredFunc;
+  PGetIntraPredFunc* pGetI16x16LumaPredFunc = pCtx->pGetI16x16LumaPredFunc;
+  int32_t iUVStride = pCtx->pCurDqLayer->iCsStride[1];
+
+  /*shared by decoder and encoder*/
+  int32_t iYStride = pDqLayer->iLumaStride;
+  int32_t* pBlockOffset = pCtx->iDecBlockOffsetArray;
+  int16_t* pRS = pScoeffLevel;
+
+  uint8_t* pPred = pDqLayer->pPred[0];
+
+  PIdctResAddPredFunc pIdctResAddPredFunc = pCtx->pIdctResAddPredFunc;
+
+  uint8_t i = 0;
+
+  /*decode i16x16 y*/
+  pGetI16x16LumaPredFunc[iI16x16PredMode] (pPred, iYStride);
+
+  /*one MB is divided into 16 4x4 blocks for IDCT*/
+  for (i = 0; i < 16; i++) {
+    int16_t* pRSI4x4 = pRS + (i << 4);
+    uint8_t* pPredI4x4 = pPred + pBlockOffset[i];
+
+    if (pDqLayer->pNzc[iMBXY][g_kuiMbNonZeroCountIdx[i]] || pRSI4x4[0]) {
+      pIdctResAddPredFunc (pPredI4x4, iYStride, pRSI4x4);
+    }
+  }
+
+  /*decode intra mb cb&cr*/
+  pPred = pDqLayer->pPred[1];
+  pGetIChromaPredFunc[iChromaPredMode] (pPred, iUVStride);
+  pPred = pDqLayer->pPred[2];
+  pGetIChromaPredFunc[iChromaPredMode] (pPred, iUVStride);
+  RecChroma (iMBXY, pCtx, pScoeffLevel, pDqLayer);
+
+  return ERR_NONE;
 }
 
 typedef struct TagMCRefMember {
-	uint8_t* pDstY;
-	uint8_t* pDstU;
-	uint8_t* pDstV;
+  uint8_t* pDstY;
+  uint8_t* pDstU;
+  uint8_t* pDstV;
 
-	uint8_t* pSrcY;
-	uint8_t* pSrcU;
-	uint8_t* pSrcV;
+  uint8_t* pSrcY;
+  uint8_t* pSrcU;
+  uint8_t* pSrcV;
 
-	int32_t iSrcLineLuma;
-	int32_t iSrcLineChroma;
+  int32_t iSrcLineLuma;
+  int32_t iSrcLineChroma;
 
-	int32_t iDstLineLuma;
-	int32_t iDstLineChroma;
+  int32_t iDstLineLuma;
+  int32_t iDstLineChroma;
 
-	int32_t iPicWidth;
-	int32_t iPicHeight;
-}sMCRefMember;
+  int32_t iPicWidth;
+  int32_t iPicHeight;
+} sMCRefMember;
 //according to current 8*8 block ref_index to gain reference picture
-static inline void_t GetRefPic(sMCRefMember* pMCRefMem, PWelsDecoderContext pCtx, int8_t* pRefIdxList, int32_t iIndex)
-{
-	PPicture pRefPic;
+static inline void_t GetRefPic (sMCRefMember* pMCRefMem, PWelsDecoderContext pCtx, int8_t* pRefIdxList,
+                                int32_t iIndex) {
+  PPicture pRefPic;
 
-	int8_t iRefIdx = pRefIdxList[iIndex];
-	pRefPic = pCtx->sRefPic.pRefList[LIST_0][iRefIdx];
+  int8_t iRefIdx = pRefIdxList[iIndex];
+  pRefPic = pCtx->sRefPic.pRefList[LIST_0][iRefIdx];
 
-	pMCRefMem->iSrcLineLuma   = pRefPic->iLinesize[0];
-	pMCRefMem->iSrcLineChroma = pRefPic->iLinesize[1];	
+  pMCRefMem->iSrcLineLuma   = pRefPic->iLinesize[0];
+  pMCRefMem->iSrcLineChroma = pRefPic->iLinesize[1];
 
-	pMCRefMem->pSrcY = pRefPic->pData[0];
-	pMCRefMem->pSrcU = pRefPic->pData[1];
-	pMCRefMem->pSrcV = pRefPic->pData[2];
+  pMCRefMem->pSrcY = pRefPic->pData[0];
+  pMCRefMem->pSrcU = pRefPic->pData[1];
+  pMCRefMem->pSrcV = pRefPic->pData[2];
 }
 
 
@@ -217,373 +207,350 @@
 #ifndef MC_FLOW_SIMPLE_JUDGE
 #define MC_FLOW_SIMPLE_JUDGE 1
 #endif //MC_FLOW_SIMPLE_JUDGE
-static inline void_t BaseMC(sMCRefMember* pMCRefMem, int32_t iXOffset, int32_t iYOffset, SMcFunc* pMCFunc,
-						   int32_t iBlkWidth, int32_t iBlkHeight, int16_t iMVs[2])
-{		
-	int32_t iExpandWidth = PADDING_LENGTH;
-	int32_t	iExpandHeight = PADDING_LENGTH;
-	
+static inline void_t BaseMC (sMCRefMember* pMCRefMem, int32_t iXOffset, int32_t iYOffset, SMcFunc* pMCFunc,
+                             int32_t iBlkWidth, int32_t iBlkHeight, int16_t iMVs[2]) {
+  int32_t iExpandWidth = PADDING_LENGTH;
+  int32_t	iExpandHeight = PADDING_LENGTH;
 
-	int16_t iMVX = iMVs[0] >> 2;
-	int16_t iMVY = iMVs[1] >> 2;
-	int32_t iMVOffsetLuma = iMVX + iMVY * pMCRefMem->iSrcLineLuma;
-	int32_t iMVOffsetChroma = (iMVX>>1) + (iMVY>>1) * pMCRefMem->iSrcLineChroma;
 
-	int32_t iFullMVx = (iXOffset << 2) + iMVs[0]; //quarter pixel
-	int32_t iFullMVy = (iYOffset << 2) + iMVs[1];
-	int32_t iIntMVx = iFullMVx >> 2;//integer pixel
-	int32_t iIntMVy = iFullMVy >> 2;
+  int16_t iMVX = iMVs[0] >> 2;
+  int16_t iMVY = iMVs[1] >> 2;
+  int32_t iMVOffsetLuma = iMVX + iMVY * pMCRefMem->iSrcLineLuma;
+  int32_t iMVOffsetChroma = (iMVX >> 1) + (iMVY >> 1) * pMCRefMem->iSrcLineChroma;
 
-	int32_t iSrcPixOffsetLuma = iXOffset + iYOffset * pMCRefMem->iSrcLineLuma;
-	int32_t iSrcPixOffsetChroma = (iXOffset>>1) + (iYOffset>>1) * pMCRefMem->iSrcLineChroma;
+  int32_t iFullMVx = (iXOffset << 2) + iMVs[0]; //quarter pixel
+  int32_t iFullMVy = (iYOffset << 2) + iMVs[1];
+  int32_t iIntMVx = iFullMVx >> 2;//integer pixel
+  int32_t iIntMVy = iFullMVy >> 2;
 
-	int32_t iBlkWidthChroma = iBlkWidth >> 1;
-	int32_t iBlkHeightChroma = iBlkHeight >> 1;
-	int32_t iPicWidthChroma = pMCRefMem->iPicWidth >> 1;
-	int32_t iPicHeightChroma = pMCRefMem->iPicHeight >> 1;
+  int32_t iSrcPixOffsetLuma = iXOffset + iYOffset * pMCRefMem->iSrcLineLuma;
+  int32_t iSrcPixOffsetChroma = (iXOffset >> 1) + (iYOffset >> 1) * pMCRefMem->iSrcLineChroma;
 
-	//the offset only for luma padding if MV violation as there was 5-tap (-2, -1, 0, 1, 2) filter for luma (horizon and vertical)
-	int32_t iPadOffset = 2 + (pMCRefMem->iSrcLineLuma << 1); //(-2, -2) pixel location as the starting point
+  int32_t iBlkWidthChroma = iBlkWidth >> 1;
+  int32_t iBlkHeightChroma = iBlkHeight >> 1;
+  int32_t iPicWidthChroma = pMCRefMem->iPicWidth >> 1;
+  int32_t iPicHeightChroma = pMCRefMem->iPicHeight >> 1;
 
-    uint8_t* pSrcY = pMCRefMem->pSrcY + iSrcPixOffsetLuma;
-	uint8_t* pSrcU = pMCRefMem->pSrcU + iSrcPixOffsetChroma;
-    uint8_t* pSrcV = pMCRefMem->pSrcV + iSrcPixOffsetChroma;
-	uint8_t* pDstY = pMCRefMem->pDstY;
-	uint8_t* pDstU = pMCRefMem->pDstU;
-	uint8_t* pDstV = pMCRefMem->pDstV;
-	bool_t bExpand = false;
+  //offset applied only to luma padding on MV violation, since the luma filter is 5-tap (-2, -1, 0, 1, 2) both horizontally and vertically
+  int32_t iPadOffset = 2 + (pMCRefMem->iSrcLineLuma << 1); //(-2, -2) pixel location as the starting point
 
-	FORCE_STACK_ALIGN_1D( uint8_t, uiExpandBuf, (PADDING_LENGTH+6)*(PADDING_LENGTH+6), 16 );
-	
-	if (iFullMVx & 0x07)
-	{
-		iExpandWidth -= 3;
-	}		
-	if (iFullMVy & 0x07)
-	{
-		iExpandHeight -= 3;
-	}
+  uint8_t* pSrcY = pMCRefMem->pSrcY + iSrcPixOffsetLuma;
+  uint8_t* pSrcU = pMCRefMem->pSrcU + iSrcPixOffsetChroma;
+  uint8_t* pSrcV = pMCRefMem->pSrcV + iSrcPixOffsetChroma;
+  uint8_t* pDstY = pMCRefMem->pDstY;
+  uint8_t* pDstU = pMCRefMem->pDstU;
+  uint8_t* pDstV = pMCRefMem->pDstV;
+  bool_t bExpand = false;
 
+  FORCE_STACK_ALIGN_1D (uint8_t, uiExpandBuf, (PADDING_LENGTH + 6) * (PADDING_LENGTH + 6), 16);
+
+  if (iFullMVx & 0x07) {
+    iExpandWidth -= 3;
+  }
+  if (iFullMVy & 0x07) {
+    iExpandHeight -= 3;
+  }
+
 #ifdef MC_FLOW_SIMPLE_JUDGE
-	if (iIntMVx < -iExpandWidth || 
-		iIntMVy < -iExpandHeight || 
-		iIntMVx + iBlkWidth > pMCRefMem->iPicWidth - 1 + iExpandWidth || 
-		iIntMVy + iBlkHeight > pMCRefMem->iPicHeight - 1 + iExpandHeight)
+  if (iIntMVx < -iExpandWidth ||
+      iIntMVy < -iExpandHeight ||
+      iIntMVx + iBlkWidth > pMCRefMem->iPicWidth - 1 + iExpandWidth ||
+      iIntMVy + iBlkHeight > pMCRefMem->iPicHeight - 1 + iExpandHeight)
 #else
-	if (iIntMVx < -iExpandWidth || 
-		iIntMVy < -iExpandHeight || 
-		iIntMVx + PADDING_LENGTH > pMCRefMem->iPicWidth + iExpandWidth || 
-		iIntMVy + PADDING_LENGTH > pMCRefMem->iPicHeight + iExpandHeight)
+  if (iIntMVx < -iExpandWidth ||
+      iIntMVy < -iExpandHeight ||
+      iIntMVx + PADDING_LENGTH > pMCRefMem->iPicWidth + iExpandWidth ||
+      iIntMVy + PADDING_LENGTH > pMCRefMem->iPicHeight + iExpandHeight)
 #endif
-	{
-		FillBufForMc(uiExpandBuf, 21, pSrcY, pMCRefMem->iSrcLineLuma, iMVOffsetLuma-iPadOffset, 
-			            iBlkWidth+5, iBlkHeight+5, iIntMVx-2, iIntMVy-2, pMCRefMem->iPicWidth, pMCRefMem->iPicHeight);
-		pMCFunc->pMcLumaFunc(uiExpandBuf+44, 21, pDstY, pMCRefMem->iDstLineLuma, iFullMVx, iFullMVy, iBlkWidth, iBlkHeight);//44=2+2*21
-		bExpand = true;
-	}
-	else
-	{
-		pSrcY += iMVOffsetLuma;
-		pMCFunc->pMcLumaFunc(pSrcY, pMCRefMem->iSrcLineLuma, pDstY, pMCRefMem->iDstLineLuma, iFullMVx, iFullMVy, iBlkWidth, iBlkHeight);
-	}
+  {
+    FillBufForMc (uiExpandBuf, 21, pSrcY, pMCRefMem->iSrcLineLuma, iMVOffsetLuma - iPadOffset,
+                  iBlkWidth + 5, iBlkHeight + 5, iIntMVx - 2, iIntMVy - 2, pMCRefMem->iPicWidth, pMCRefMem->iPicHeight);
+    pMCFunc->pMcLumaFunc (uiExpandBuf + 44, 21, pDstY, pMCRefMem->iDstLineLuma, iFullMVx, iFullMVy, iBlkWidth,
+                          iBlkHeight); //44=2+2*21
+    bExpand = true;
+  } else {
+    pSrcY += iMVOffsetLuma;
+    pMCFunc->pMcLumaFunc (pSrcY, pMCRefMem->iSrcLineLuma, pDstY, pMCRefMem->iDstLineLuma, iFullMVx, iFullMVy, iBlkWidth,
+                          iBlkHeight);
+  }
 
-	if (bExpand)
-	{
-		FillBufForMc(uiExpandBuf, 21, pSrcU, pMCRefMem->iSrcLineChroma, iMVOffsetChroma, iBlkWidthChroma+1, iBlkHeightChroma+1, iFullMVx>>3, iFullMVy>>3, iPicWidthChroma, iPicHeightChroma);
-		pMCFunc->pMcChromaFunc(uiExpandBuf, 21, pDstU, pMCRefMem->iDstLineChroma, iFullMVx, iFullMVy, iBlkWidthChroma, iBlkHeightChroma);
-		
-		FillBufForMc(uiExpandBuf, 21, pSrcV, pMCRefMem->iSrcLineChroma, iMVOffsetChroma, iBlkWidthChroma+1, iBlkHeightChroma+1, iFullMVx>>3, iFullMVy>>3, iPicWidthChroma, iPicHeightChroma);
-		pMCFunc->pMcChromaFunc(uiExpandBuf, 21, pDstV, pMCRefMem->iDstLineChroma, iFullMVx, iFullMVy, iBlkWidthChroma, iBlkHeightChroma);
-	}
-	else
-	{
-		pSrcU += iMVOffsetChroma;
-		pSrcV += iMVOffsetChroma;
-		pMCFunc->pMcChromaFunc(pSrcU, pMCRefMem->iSrcLineChroma, pDstU, pMCRefMem->iDstLineChroma, iFullMVx, iFullMVy, iBlkWidthChroma, iBlkHeightChroma);
-		pMCFunc->pMcChromaFunc(pSrcV, pMCRefMem->iSrcLineChroma, pDstV, pMCRefMem->iDstLineChroma, iFullMVx, iFullMVy, iBlkWidthChroma, iBlkHeightChroma);
-	}
+  if (bExpand) {
+    FillBufForMc (uiExpandBuf, 21, pSrcU, pMCRefMem->iSrcLineChroma, iMVOffsetChroma, iBlkWidthChroma + 1,
+                  iBlkHeightChroma + 1, iFullMVx >> 3, iFullMVy >> 3, iPicWidthChroma, iPicHeightChroma);
+    pMCFunc->pMcChromaFunc (uiExpandBuf, 21, pDstU, pMCRefMem->iDstLineChroma, iFullMVx, iFullMVy, iBlkWidthChroma,
+                            iBlkHeightChroma);
+
+    FillBufForMc (uiExpandBuf, 21, pSrcV, pMCRefMem->iSrcLineChroma, iMVOffsetChroma, iBlkWidthChroma + 1,
+                  iBlkHeightChroma + 1, iFullMVx >> 3, iFullMVy >> 3, iPicWidthChroma, iPicHeightChroma);
+    pMCFunc->pMcChromaFunc (uiExpandBuf, 21, pDstV, pMCRefMem->iDstLineChroma, iFullMVx, iFullMVy, iBlkWidthChroma,
+                            iBlkHeightChroma);
+  } else {
+    pSrcU += iMVOffsetChroma;
+    pSrcV += iMVOffsetChroma;
+    pMCFunc->pMcChromaFunc (pSrcU, pMCRefMem->iSrcLineChroma, pDstU, pMCRefMem->iDstLineChroma, iFullMVx, iFullMVy,
+                            iBlkWidthChroma, iBlkHeightChroma);
+    pMCFunc->pMcChromaFunc (pSrcV, pMCRefMem->iSrcLineChroma, pDstV, pMCRefMem->iDstLineChroma, iFullMVx, iFullMVy,
+                            iBlkWidthChroma, iBlkHeightChroma);
+  }
 }
 
-void_t GetInterPred(uint8_t *pPredY, uint8_t *pPredCb, uint8_t *pPredCr, PWelsDecoderContext pCtx)
-{
-	sMCRefMember pMCRefMem;
-	PDqLayer pCurDqLayer = pCtx->pCurDqLayer;
-	SMcFunc* pMCFunc = &pCtx->sMcFunc;
+void_t GetInterPred (uint8_t* pPredY, uint8_t* pPredCb, uint8_t* pPredCr, PWelsDecoderContext pCtx) {
+  sMCRefMember pMCRefMem;
+  PDqLayer pCurDqLayer = pCtx->pCurDqLayer;
+  SMcFunc* pMCFunc = &pCtx->sMcFunc;
 
-	int32_t iMBXY = pCurDqLayer->iMbXyIndex;
+  int32_t iMBXY = pCurDqLayer->iMbXyIndex;
 
-	int16_t iMVs[2] = {0};
- 	
-	int32_t iMBType = pCurDqLayer->pMbType[iMBXY];
+  int16_t iMVs[2] = {0};
 
-	int32_t iMBOffsetX = pCurDqLayer->iMbX << 4;
- 	int32_t iMBOffsetY = pCurDqLayer->iMbY << 4;
+  int32_t iMBType = pCurDqLayer->pMbType[iMBXY];
 
-	int32_t iDstLineLuma   = pCtx->pDec->iLinesize[0];
-	int32_t iDstLineChroma = pCtx->pDec->iLinesize[1];
-	
-	int32_t iBlk8X, iBlk8Y, iBlk4X, iBlk4Y, i, j, iIIdx, iJIdx;
+  int32_t iMBOffsetX = pCurDqLayer->iMbX << 4;
+  int32_t iMBOffsetY = pCurDqLayer->iMbY << 4;
 
-	pMCRefMem.iPicWidth = (pCurDqLayer->sLayerInfo.sSliceInLayer.sSliceHeaderExt.sSliceHeader.iMbWidth<<4);
-	pMCRefMem.iPicHeight = (pCurDqLayer->sLayerInfo.sSliceInLayer.sSliceHeaderExt.sSliceHeader.iMbHeight<<4);
+  int32_t iDstLineLuma   = pCtx->pDec->iLinesize[0];
+  int32_t iDstLineChroma = pCtx->pDec->iLinesize[1];
 
-	pMCRefMem.pDstY = pPredY;
-	pMCRefMem.pDstU = pPredCb;
-	pMCRefMem.pDstV = pPredCr;
+  int32_t iBlk8X, iBlk8Y, iBlk4X, iBlk4Y, i, j, iIIdx, iJIdx;
 
-	pMCRefMem.iDstLineLuma   = iDstLineLuma;
-	pMCRefMem.iDstLineChroma = iDstLineChroma;
-	switch(iMBType)
-	{
-	case MB_TYPE_SKIP:
- 	case MB_TYPE_16x16:
-		iMVs[0] = pCurDqLayer->pMv[0][iMBXY][0][0];
-		iMVs[1] = pCurDqLayer->pMv[0][iMBXY][0][1];
-		GetRefPic( &pMCRefMem, pCtx, pCurDqLayer->pRefIndex[0][iMBXY], 0 );
-		BaseMC(&pMCRefMem, iMBOffsetX, iMBOffsetY, pMCFunc, 16, 16, iMVs);
-		break;
-	case MB_TYPE_16x8:
-		iMVs[0] = pCurDqLayer->pMv[0][iMBXY][0][0];
-		iMVs[1] = pCurDqLayer->pMv[0][iMBXY][0][1];
-		GetRefPic( &pMCRefMem, pCtx, pCurDqLayer->pRefIndex[0][iMBXY], 0 );
-		BaseMC(&pMCRefMem, iMBOffsetX, iMBOffsetY, pMCFunc, 16, 8, iMVs);
+  pMCRefMem.iPicWidth = (pCurDqLayer->sLayerInfo.sSliceInLayer.sSliceHeaderExt.sSliceHeader.iMbWidth << 4);
+  pMCRefMem.iPicHeight = (pCurDqLayer->sLayerInfo.sSliceInLayer.sSliceHeaderExt.sSliceHeader.iMbHeight << 4);
 
-		iMVs[0] = pCurDqLayer->pMv[0][iMBXY][8][0];
-		iMVs[1] = pCurDqLayer->pMv[0][iMBXY][8][1];
-		GetRefPic( &pMCRefMem, pCtx, pCurDqLayer->pRefIndex[0][iMBXY], 8 );
-		pMCRefMem.pDstY = pPredY  + (iDstLineLuma << 3);
-		pMCRefMem.pDstU = pPredCb + (iDstLineChroma << 2);
-		pMCRefMem.pDstV = pPredCr + (iDstLineChroma << 2);
-		BaseMC(&pMCRefMem, iMBOffsetX, iMBOffsetY+8, pMCFunc, 16, 8, iMVs);
-		break;
-	case MB_TYPE_8x16:
-		iMVs[0] = pCurDqLayer->pMv[0][iMBXY][0][0];
-		iMVs[1] = pCurDqLayer->pMv[0][iMBXY][0][1];
-		GetRefPic( &pMCRefMem, pCtx, pCurDqLayer->pRefIndex[0][iMBXY], 0 );
-		BaseMC(&pMCRefMem, iMBOffsetX, iMBOffsetY, pMCFunc, 8, 16, iMVs);
+  pMCRefMem.pDstY = pPredY;
+  pMCRefMem.pDstU = pPredCb;
+  pMCRefMem.pDstV = pPredCr;
 
-		iMVs[0] = pCurDqLayer->pMv[0][iMBXY][2][0];
-		iMVs[1] = pCurDqLayer->pMv[0][iMBXY][2][1];
-		GetRefPic( &pMCRefMem, pCtx, pCurDqLayer->pRefIndex[0][iMBXY], 2 );
-		pMCRefMem.pDstY = pPredY + 8;
-		pMCRefMem.pDstU = pPredCb + 4;
-		pMCRefMem.pDstV = pPredCr + 4;
-		BaseMC(&pMCRefMem, iMBOffsetX+8, iMBOffsetY, pMCFunc, 8, 16, iMVs);
-		break;
-	case MB_TYPE_8x8:
-	case MB_TYPE_8x8_REF0:
-		{
-			uint32_t iSubMBType;
-			int32_t iXOffset, iYOffset;
-			uint8_t *pDstY, *pDstU, *pDstV;
-			for (i = 0; i < 4; i++)
-			{
-				iSubMBType = pCurDqLayer->pSubMbType[iMBXY][i];
-				iBlk8X = (i&1) << 3;
-				iBlk8Y = (i>>1) << 3;				
-				iXOffset = iMBOffsetX + iBlk8X;
-				iYOffset = iMBOffsetY + iBlk8Y;
+  pMCRefMem.iDstLineLuma   = iDstLineLuma;
+  pMCRefMem.iDstLineChroma = iDstLineChroma;
+  switch (iMBType) {
+  case MB_TYPE_SKIP:
+  case MB_TYPE_16x16:
+    iMVs[0] = pCurDqLayer->pMv[0][iMBXY][0][0];
+    iMVs[1] = pCurDqLayer->pMv[0][iMBXY][0][1];
+    GetRefPic (&pMCRefMem, pCtx, pCurDqLayer->pRefIndex[0][iMBXY], 0);
+    BaseMC (&pMCRefMem, iMBOffsetX, iMBOffsetY, pMCFunc, 16, 16, iMVs);
+    break;
+  case MB_TYPE_16x8:
+    iMVs[0] = pCurDqLayer->pMv[0][iMBXY][0][0];
+    iMVs[1] = pCurDqLayer->pMv[0][iMBXY][0][1];
+    GetRefPic (&pMCRefMem, pCtx, pCurDqLayer->pRefIndex[0][iMBXY], 0);
+    BaseMC (&pMCRefMem, iMBOffsetX, iMBOffsetY, pMCFunc, 16, 8, iMVs);
 
-				iIIdx = ((i>>1)<<3) +((i&1)<<1);
-				GetRefPic( &pMCRefMem, pCtx, pCurDqLayer->pRefIndex[0][iMBXY], iIIdx );
+    iMVs[0] = pCurDqLayer->pMv[0][iMBXY][8][0];
+    iMVs[1] = pCurDqLayer->pMv[0][iMBXY][8][1];
+    GetRefPic (&pMCRefMem, pCtx, pCurDqLayer->pRefIndex[0][iMBXY], 8);
+    pMCRefMem.pDstY = pPredY  + (iDstLineLuma << 3);
+    pMCRefMem.pDstU = pPredCb + (iDstLineChroma << 2);
+    pMCRefMem.pDstV = pPredCr + (iDstLineChroma << 2);
+    BaseMC (&pMCRefMem, iMBOffsetX, iMBOffsetY + 8, pMCFunc, 16, 8, iMVs);
+    break;
+  case MB_TYPE_8x16:
+    iMVs[0] = pCurDqLayer->pMv[0][iMBXY][0][0];
+    iMVs[1] = pCurDqLayer->pMv[0][iMBXY][0][1];
+    GetRefPic (&pMCRefMem, pCtx, pCurDqLayer->pRefIndex[0][iMBXY], 0);
+    BaseMC (&pMCRefMem, iMBOffsetX, iMBOffsetY, pMCFunc, 8, 16, iMVs);
 
-				pDstY = pPredY + iBlk8X + iBlk8Y * iDstLineLuma;
-				pDstU = pPredCb + (iBlk8X >> 1) + (iBlk8Y >> 1) * iDstLineChroma;
-				pDstV = pPredCr + (iBlk8X >> 1) + (iBlk8Y >> 1) * iDstLineChroma;
-				pMCRefMem.pDstY = pDstY;
-				pMCRefMem.pDstU = pDstU;
-				pMCRefMem.pDstV = pDstV;
-				switch(iSubMBType)
-				{
-				case SUB_MB_TYPE_8x8:
-					iMVs[0] = pCurDqLayer->pMv[0][iMBXY][iIIdx][0];
-					iMVs[1] = pCurDqLayer->pMv[0][iMBXY][iIIdx][1];
-					BaseMC( &pMCRefMem, iXOffset, iYOffset, pMCFunc, 8, 8, iMVs );					
-					break;					
-				case SUB_MB_TYPE_8x4:
-					iMVs[0] = pCurDqLayer->pMv[0][iMBXY][iIIdx][0];
-					iMVs[1] = pCurDqLayer->pMv[0][iMBXY][iIIdx][1];
-					BaseMC(&pMCRefMem, iXOffset, iYOffset, pMCFunc, 8, 4, iMVs);
+    iMVs[0] = pCurDqLayer->pMv[0][iMBXY][2][0];
+    iMVs[1] = pCurDqLayer->pMv[0][iMBXY][2][1];
+    GetRefPic (&pMCRefMem, pCtx, pCurDqLayer->pRefIndex[0][iMBXY], 2);
+    pMCRefMem.pDstY = pPredY + 8;
+    pMCRefMem.pDstU = pPredCb + 4;
+    pMCRefMem.pDstV = pPredCr + 4;
+    BaseMC (&pMCRefMem, iMBOffsetX + 8, iMBOffsetY, pMCFunc, 8, 16, iMVs);
+    break;
+  case MB_TYPE_8x8:
+  case MB_TYPE_8x8_REF0: {
+    uint32_t iSubMBType;
+    int32_t iXOffset, iYOffset;
+    uint8_t* pDstY, *pDstU, *pDstV;
+    for (i = 0; i < 4; i++) {
+      iSubMBType = pCurDqLayer->pSubMbType[iMBXY][i];
+      iBlk8X = (i & 1) << 3;
+      iBlk8Y = (i >> 1) << 3;
+      iXOffset = iMBOffsetX + iBlk8X;
+      iYOffset = iMBOffsetY + iBlk8Y;
 
-					iMVs[0] = pCurDqLayer->pMv[0][iMBXY][iIIdx+4][0];
-					iMVs[1] = pCurDqLayer->pMv[0][iMBXY][iIIdx+4][1];
-					pMCRefMem.pDstY += (iDstLineLuma << 2);
-					pMCRefMem.pDstU += (iDstLineChroma << 1);
-					pMCRefMem.pDstV += (iDstLineChroma << 1);
-					BaseMC(&pMCRefMem, iXOffset, iYOffset+4, pMCFunc, 8, 4, iMVs);
-					break;
-				case SUB_MB_TYPE_4x8:
-					iMVs[0] = pCurDqLayer->pMv[0][iMBXY][iIIdx][0];
-					iMVs[1] = pCurDqLayer->pMv[0][iMBXY][iIIdx][1];
-					BaseMC(&pMCRefMem, iXOffset, iYOffset, pMCFunc, 4, 8, iMVs);
+      iIIdx = ((i >> 1) << 3) + ((i & 1) << 1);
+      GetRefPic (&pMCRefMem, pCtx, pCurDqLayer->pRefIndex[0][iMBXY], iIIdx);
 
-					iMVs[0] = pCurDqLayer->pMv[0][iMBXY][iIIdx+1][0];
-					iMVs[1] = pCurDqLayer->pMv[0][iMBXY][iIIdx+1][1];
-					pMCRefMem.pDstY += 4;
-					pMCRefMem.pDstU += 2;
-					pMCRefMem.pDstV += 2;
-					BaseMC(&pMCRefMem, iXOffset+4, iYOffset, pMCFunc, 4, 8, iMVs);
-					break;
-				case SUB_MB_TYPE_4x4:
-					{
-						for (j = 0; j < 4; j++)
-						{
-							int32_t iUVLineStride;
-							iJIdx = ((j>>1)<<2) + (j&1);
+      pDstY = pPredY + iBlk8X + iBlk8Y * iDstLineLuma;
+      pDstU = pPredCb + (iBlk8X >> 1) + (iBlk8Y >> 1) * iDstLineChroma;
+      pDstV = pPredCr + (iBlk8X >> 1) + (iBlk8Y >> 1) * iDstLineChroma;
+      pMCRefMem.pDstY = pDstY;
+      pMCRefMem.pDstU = pDstU;
+      pMCRefMem.pDstV = pDstV;
+      switch (iSubMBType) {
+      case SUB_MB_TYPE_8x8:
+        iMVs[0] = pCurDqLayer->pMv[0][iMBXY][iIIdx][0];
+        iMVs[1] = pCurDqLayer->pMv[0][iMBXY][iIIdx][1];
+        BaseMC (&pMCRefMem, iXOffset, iYOffset, pMCFunc, 8, 8, iMVs);
+        break;
+      case SUB_MB_TYPE_8x4:
+        iMVs[0] = pCurDqLayer->pMv[0][iMBXY][iIIdx][0];
+        iMVs[1] = pCurDqLayer->pMv[0][iMBXY][iIIdx][1];
+        BaseMC (&pMCRefMem, iXOffset, iYOffset, pMCFunc, 8, 4, iMVs);
 
-							iBlk4X = (j&1) << 2;
-							iBlk4Y = (j>>1) << 2;
+        iMVs[0] = pCurDqLayer->pMv[0][iMBXY][iIIdx + 4][0];
+        iMVs[1] = pCurDqLayer->pMv[0][iMBXY][iIIdx + 4][1];
+        pMCRefMem.pDstY += (iDstLineLuma << 2);
+        pMCRefMem.pDstU += (iDstLineChroma << 1);
+        pMCRefMem.pDstV += (iDstLineChroma << 1);
+        BaseMC (&pMCRefMem, iXOffset, iYOffset + 4, pMCFunc, 8, 4, iMVs);
+        break;
+      case SUB_MB_TYPE_4x8:
+        iMVs[0] = pCurDqLayer->pMv[0][iMBXY][iIIdx][0];
+        iMVs[1] = pCurDqLayer->pMv[0][iMBXY][iIIdx][1];
+        BaseMC (&pMCRefMem, iXOffset, iYOffset, pMCFunc, 4, 8, iMVs);
 
-							iUVLineStride = (iBlk4X >> 1) + (iBlk4Y >> 1) * iDstLineChroma; 
-							pMCRefMem.pDstY = pDstY + iBlk4X + iBlk4Y * iDstLineLuma;							
-							pMCRefMem.pDstU = pDstU + iUVLineStride;  
-							pMCRefMem.pDstV = pDstV + iUVLineStride;
+        iMVs[0] = pCurDqLayer->pMv[0][iMBXY][iIIdx + 1][0];
+        iMVs[1] = pCurDqLayer->pMv[0][iMBXY][iIIdx + 1][1];
+        pMCRefMem.pDstY += 4;
+        pMCRefMem.pDstU += 2;
+        pMCRefMem.pDstV += 2;
+        BaseMC (&pMCRefMem, iXOffset + 4, iYOffset, pMCFunc, 4, 8, iMVs);
+        break;
+      case SUB_MB_TYPE_4x4: {
+        for (j = 0; j < 4; j++) {
+          int32_t iUVLineStride;
+          iJIdx = ((j >> 1) << 2) + (j & 1);
 
-							iMVs[0] = pCurDqLayer->pMv[0][iMBXY][iIIdx+iJIdx][0];
-							iMVs[1] = pCurDqLayer->pMv[0][iMBXY][iIIdx+iJIdx][1];
-							BaseMC(&pMCRefMem, iXOffset+iBlk4X, iYOffset+iBlk4Y, pMCFunc, 4, 4, iMVs);
-						}
-					}
-					break;
-				default:
-					break;
-				}
-			}
-		}
-		break;
-	default:
-		break;
-	}
-}
+          iBlk4X = (j & 1) << 2;
+          iBlk4Y = (j >> 1) << 2;
 
-int32_t RecChroma(int32_t iMBXY, PWelsDecoderContext pCtx, int16_t *pScoeffLevel, PDqLayer pDqLayer)
-{
-	int32_t iChromaStride = pCtx->pCurDqLayer->iCsStride[1];
-	PIdctResAddPredFunc pIdctResAddPredFunc = pCtx->pIdctResAddPredFunc;	
+          iUVLineStride = (iBlk4X >> 1) + (iBlk4Y >> 1) * iDstLineChroma;
+          pMCRefMem.pDstY = pDstY + iBlk4X + iBlk4Y * iDstLineLuma;
+          pMCRefMem.pDstU = pDstU + iUVLineStride;
+          pMCRefMem.pDstV = pDstV + iUVLineStride;
 
-	uint8_t i=0, j=0;
-	uint8_t uiCbpC = pDqLayer->pCbp[iMBXY] >> 4;
-	
-	if ( 1 == uiCbpC || 2 == uiCbpC )
-	{
-		WelsChromaDcIdct( pScoeffLevel + 256 );	// 256 = 16*16
-		WelsChromaDcIdct( pScoeffLevel + 320 );	// 256 = 16*16
-		for(i=0; i<2; i++)
-		{
-			int16_t *pRS = pScoeffLevel + 256 + (i << 6);	
-			uint8_t *pPred = pDqLayer->pPred[i+1];
-			int32_t *pBlockOffset = i==0 ? &pCtx->iDecBlockOffsetArray[16] : &pCtx->iDecBlockOffsetArray[20];
-			
-			/*1 chroma is divided 4 4x4_block to idct*/
-			for(j=0; j<4; j++)
-			{
-				int16_t *pRSI4x4 = &pRS[j<<4];
-				uint8_t *pPredI4x4 = pPred + pBlockOffset[j];
-				
-				if ( pDqLayer->pNzc[iMBXY][g_kuiMbNonZeroCountIdx[16+(i<<2)+j]] || pRSI4x4[0] )
-				{
-					pIdctResAddPredFunc(pPredI4x4, iChromaStride, pRSI4x4);
-				}
-			}
-		}
-	}
-	
-	return ERR_NONE;
+          iMVs[0] = pCurDqLayer->pMv[0][iMBXY][iIIdx + iJIdx][0];
+          iMVs[1] = pCurDqLayer->pMv[0][iMBXY][iIIdx + iJIdx][1];
+          BaseMC (&pMCRefMem, iXOffset + iBlk4X, iYOffset + iBlk4Y, pMCFunc, 4, 4, iMVs);
+        }
+      }
+      break;
+      default:
+        break;
+      }
+    }
+  }
+  break;
+  default:
+    break;
+  }
 }
 
-void_t FillBufForMc(uint8_t *pBuf, int32_t iBufStride, uint8_t *pSrc, int32_t iSrcStride, int32_t iSrcOffset, 
-					 int32_t iBlockWidth, int32_t iBlockHeight, int32_t iSrcX, int32_t iSrcY, int32_t iPicWidth, int32_t iPicHeight)
-{
-    int32_t iY;
-    int32_t iStartY, iStartX, iEndY, iEndX;
-	int32_t iOffsetAdj = 0;
-	int32_t iAddrSrc, iAddrBuf;
-	int32_t iNum, iNum1;
-	uint8_t *pBufSrc, *pBufDst;
-	uint8_t *pBufSrc1, *pBufDst1;
+int32_t RecChroma (int32_t iMBXY, PWelsDecoderContext pCtx, int16_t* pScoeffLevel, PDqLayer pDqLayer) {
+  int32_t iChromaStride = pCtx->pCurDqLayer->iCsStride[1];
+  PIdctResAddPredFunc pIdctResAddPredFunc = pCtx->pIdctResAddPredFunc;
 
-    if( iSrcY >= iPicHeight )
-	{
-        iOffsetAdj += ( iPicHeight - 1 - iSrcY ) * iSrcStride;
-        iSrcY = iPicHeight - 1;
+  uint8_t i = 0, j = 0;
+  uint8_t uiCbpC = pDqLayer->pCbp[iMBXY] >> 4;
+
+  if (1 == uiCbpC || 2 == uiCbpC) {
+    WelsChromaDcIdct (pScoeffLevel + 256);	// 256 = 16*16
+    WelsChromaDcIdct (pScoeffLevel + 320);	// 320 = 256 + 64
+    for (i = 0; i < 2; i++) {
+      int16_t* pRS = pScoeffLevel + 256 + (i << 6);
+      uint8_t* pPred = pDqLayer->pPred[i + 1];
+      int32_t* pBlockOffset = i == 0 ? &pCtx->iDecBlockOffsetArray[16] : &pCtx->iDecBlockOffsetArray[20];
+
+      /*each chroma plane is divided into four 4x4 blocks for IDCT*/
+      for (j = 0; j < 4; j++) {
+        int16_t* pRSI4x4 = &pRS[j << 4];
+        uint8_t* pPredI4x4 = pPred + pBlockOffset[j];
+
+        if (pDqLayer->pNzc[iMBXY][g_kuiMbNonZeroCountIdx[16 + (i << 2) + j]] || pRSI4x4[0]) {
+          pIdctResAddPredFunc (pPredI4x4, iChromaStride, pRSI4x4);
+        }
+      }
     }
-	else if( iSrcY <= -iBlockHeight )
-	{
-        iOffsetAdj += ( 1 - iBlockHeight - iSrcY ) * iSrcStride;
-        iSrcY = 1 - iBlockHeight;
-    }
-    if( iSrcX >= iPicWidth )
-	{
-        iOffsetAdj += ( iPicWidth - 1 - iSrcX );
-        iSrcX = iPicWidth - 1;
-    }
-	else if( iSrcX <= -iBlockWidth )
-	{
-        iOffsetAdj +=  ( 1 - iBlockWidth - iSrcX );
-        iSrcX = 1 - iBlockWidth;
-    }
+  }
 
-	iOffsetAdj += iSrcOffset;
+  return ERR_NONE;
+}
 
-#define MAX(a,b) ((a) > (b) ? (a) : (b))	
+void_t FillBufForMc (uint8_t* pBuf, int32_t iBufStride, uint8_t* pSrc, int32_t iSrcStride, int32_t iSrcOffset,
+                     int32_t iBlockWidth, int32_t iBlockHeight, int32_t iSrcX, int32_t iSrcY, int32_t iPicWidth, int32_t iPicHeight) {
+  int32_t iY;
+  int32_t iStartY, iStartX, iEndY, iEndX;
+  int32_t iOffsetAdj = 0;
+  int32_t iAddrSrc, iAddrBuf;
+  int32_t iNum, iNum1;
+  uint8_t* pBufSrc, *pBufDst;
+  uint8_t* pBufSrc1, *pBufDst1;
+
+  if (iSrcY >= iPicHeight) {
+    iOffsetAdj += (iPicHeight - 1 - iSrcY) * iSrcStride;
+    iSrcY = iPicHeight - 1;
+  } else if (iSrcY <= -iBlockHeight) {
+    iOffsetAdj += (1 - iBlockHeight - iSrcY) * iSrcStride;
+    iSrcY = 1 - iBlockHeight;
+  }
+  if (iSrcX >= iPicWidth) {
+    iOffsetAdj += (iPicWidth - 1 - iSrcX);
+    iSrcX = iPicWidth - 1;
+  } else if (iSrcX <= -iBlockWidth) {
+    iOffsetAdj += (1 - iBlockWidth - iSrcX);
+    iSrcX = 1 - iBlockWidth;
+  }
+
+  iOffsetAdj += iSrcOffset;
+
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
 #define MIN(a,b) ((a) > (b) ? (b) : (a))
 
-    iStartY = MAX(0, -iSrcY);
-    iStartX = MAX(0, -iSrcX);
-    iEndY = MIN(iBlockHeight, iPicHeight - iSrcY);
-    iEndX = MIN(iBlockWidth, iPicWidth - iSrcX);
-	
-    // copy existing part
-	iAddrSrc = iStartX + iStartY * iSrcStride;
-	iAddrBuf = iStartX + iStartY * iBufStride;
-	iNum = iEndX - iStartX;
-    for( iY = iStartY; iY < iEndY; iY++ )
-	{
-		memcpy( pBuf + iAddrBuf, pSrc + iOffsetAdj + iAddrSrc, iNum );
-		iAddrSrc += iSrcStride;
-		iAddrBuf += iBufStride;
-    }
-	
-    //top
-	pBufSrc = pBuf + iStartX + iStartY * iBufStride;
-	pBufDst = pBuf + iStartX;
-	iNum = iEndX - iStartX;
-    for( iY = 0; iY < iStartY; iY++ )
-	{
-		memcpy( pBufDst, pBufSrc, iNum );
-		pBufDst += iBufStride;
-    }
-	
-    //bottom
-	pBufSrc = pBuf + iStartX + ( iEndY - 1 ) * iBufStride;
-	pBufDst = pBuf + iStartX + iEndY * iBufStride;
-    iNum = iEndX - iStartX;
-    for( iY = iEndY; iY < iBlockHeight; iY++ )
-	{
-		memcpy( pBufDst, pBufSrc, iNum );
-		pBufDst += iBufStride;
-    }
-	
-	
-	pBufSrc = pBuf + iStartX;
-	pBufDst = pBuf;
-	iNum = iStartX;
+  iStartY = MAX (0, -iSrcY);
+  iStartX = MAX (0, -iSrcX);
+  iEndY = MIN (iBlockHeight, iPicHeight - iSrcY);
+  iEndX = MIN (iBlockWidth, iPicWidth - iSrcX);
 
-	pBufSrc1 = pBuf + iEndX - 1;
-	pBufDst1 = pBuf + iEndX;
-	iNum1 = iBlockWidth - iEndX;
-    for( iY=0; iY<iBlockHeight; iY++ )
-	{
-		//left
-		memset( pBufDst, pBufSrc[0], iNum );
-		pBufDst += iBufStride;
-		pBufSrc += iBufStride;
-		
-		//right
-		memset( pBufDst1, pBufSrc1[0], iNum1 );
-		pBufDst1 += iBufStride;
-		pBufSrc1 += iBufStride;
-    }
+  // copy existing part
+  iAddrSrc = iStartX + iStartY * iSrcStride;
+  iAddrBuf = iStartX + iStartY * iBufStride;
+  iNum = iEndX - iStartX;
+  for (iY = iStartY; iY < iEndY; iY++) {
+    memcpy (pBuf + iAddrBuf, pSrc + iOffsetAdj + iAddrSrc, iNum);
+    iAddrSrc += iSrcStride;
+    iAddrBuf += iBufStride;
+  }
+
+  //top
+  pBufSrc = pBuf + iStartX + iStartY * iBufStride;
+  pBufDst = pBuf + iStartX;
+  iNum = iEndX - iStartX;
+  for (iY = 0; iY < iStartY; iY++) {
+    memcpy (pBufDst, pBufSrc, iNum);
+    pBufDst += iBufStride;
+  }
+
+  //bottom
+  pBufSrc = pBuf + iStartX + (iEndY - 1) * iBufStride;
+  pBufDst = pBuf + iStartX + iEndY * iBufStride;
+  iNum = iEndX - iStartX;
+  for (iY = iEndY; iY < iBlockHeight; iY++) {
+    memcpy (pBufDst, pBufSrc, iNum);
+    pBufDst += iBufStride;
+  }
+
+
+  pBufSrc = pBuf + iStartX;
+  pBufDst = pBuf;
+  iNum = iStartX;
+
+  pBufSrc1 = pBuf + iEndX - 1;
+  pBufDst1 = pBuf + iEndX;
+  iNum1 = iBlockWidth - iEndX;
+  for (iY = 0; iY < iBlockHeight; iY++) {
+    //left
+    memset (pBufDst, pBufSrc[0], iNum);
+    pBufDst += iBufStride;
+    pBufSrc += iBufStride;
+
+    //right
+    memset (pBufDst1, pBufSrc1[0], iNum1);
+    pBufDst1 += iBufStride;
+    pBufSrc1 += iBufStride;
+  }
 }
 
 } // namespace WelsDec
--- a/codec/decoder/core/src/utils.cpp
+++ b/codec/decoder/core/src/utils.cpp
@@ -37,7 +37,7 @@
  *
  *************************************************************************************
  */
- 
+
 #include <string.h>
 #include <stdlib.h>
 #include <math.h>
@@ -68,15 +68,14 @@
 
 
 
-void_t WelsLog(void_t *pPtr, int32_t iLevel, const char *kpFmt, ...)
-{
-    va_list pVl;
+void_t WelsLog (void_t* pPtr, int32_t iLevel, const char* kpFmt, ...) {
+  va_list pVl;
 
-	PWelsDecoderContext pCtx  = (PWelsDecoderContext)pPtr;
+  PWelsDecoderContext pCtx  = (PWelsDecoderContext)pPtr;
 
-    va_start(pVl, kpFmt);
-    g_pLog(pCtx->pTraceHandle, iLevel, kpFmt, pVl);
-    va_end(pVl);
+  va_start (pVl, kpFmt);
+  g_pLog (pCtx->pTraceHandle, iLevel, kpFmt, pVl);
+  va_end (pVl);
 }
 
 
@@ -84,122 +83,106 @@
 
 #if  defined(_MSC_VER) && (_MSC_VER>=1500)
 
-int32_t WelsSnprintf(str_t * pBuffer,  int32_t iSizeOfBuffer, const str_t * kpFormat, ...)
-{
-	va_list  pArgPtr; 
-	int32_t  iRc;
+int32_t WelsSnprintf (str_t* pBuffer,  int32_t iSizeOfBuffer, const str_t* kpFormat, ...) {
+  va_list  pArgPtr;
+  int32_t  iRc;
 
-	va_start(pArgPtr, kpFormat);
+  va_start (pArgPtr, kpFormat);
 
-	iRc = vsnprintf_s(pBuffer, iSizeOfBuffer, _TRUNCATE, kpFormat, pArgPtr);
+  iRc = vsnprintf_s (pBuffer, iSizeOfBuffer, _TRUNCATE, kpFormat, pArgPtr);
 
-	va_end(pArgPtr);
+  va_end (pArgPtr);
 
-	return iRc;
+  return iRc;
 }
 
-str_t* WelsStrncpy(str_t * pDest, int32_t iSizeInBytes, const str_t * kpSrc, int32_t iCount)
-{
-    strncpy_s(pDest, iSizeInBytes, kpSrc, iCount);
+str_t* WelsStrncpy (str_t* pDest, int32_t iSizeInBytes, const str_t* kpSrc, int32_t iCount) {
+  strncpy_s (pDest, iSizeInBytes, kpSrc, iCount);
 
-	return pDest;
+  return pDest;
 }
 
-int32_t WelsStrnlen(const str_t * kpStr,  int32_t iMaxlen)
-{
-	return strnlen_s(kpStr, iMaxlen);
+int32_t WelsStrnlen (const str_t* kpStr,  int32_t iMaxlen) {
+  return strnlen_s (kpStr, iMaxlen);
 }
 
-int32_t WelsVsprintf(str_t * pBuffer, int32_t iSizeOfBuffer, const str_t * kpFormat, va_list pArgPtr)
-{
-	return vsprintf_s(pBuffer, iSizeOfBuffer, kpFormat, pArgPtr);
+int32_t WelsVsprintf (str_t* pBuffer, int32_t iSizeOfBuffer, const str_t* kpFormat, va_list pArgPtr) {
+  return vsprintf_s (pBuffer, iSizeOfBuffer, kpFormat, pArgPtr);
 }
 
-WelsFileHandle* WelsFopen(const str_t * kpFilename,  const str_t * kpMode)
-{
-	WelsFileHandle* pFp = NULL;
-	if( fopen_s(&pFp, kpFilename, kpMode) != 0 ){
-		return NULL;
-	}
+WelsFileHandle* WelsFopen (const str_t* kpFilename,  const str_t* kpMode) {
+  WelsFileHandle* pFp = NULL;
+  if (fopen_s (&pFp, kpFilename, kpMode) != 0) {
+    return NULL;
+  }
 
-	return pFp;
+  return pFp;
 }
 
-int32_t WelsFclose(WelsFileHandle* pFp)
-{
-	return fclose(pFp);
+int32_t WelsFclose (WelsFileHandle* pFp) {
+  return fclose (pFp);
 }
 
-int32_t WelsGetTimeOfDay(SWelsTime * pTp)
-{
-	return _ftime_s(pTp);
+int32_t WelsGetTimeOfDay (SWelsTime* pTp) {
+  return _ftime_s (pTp);
 }
 
-int32_t WelsStrftime(str_t * pBuffer, int32_t iSize, const str_t * kpFormat, const SWelsTime * kpTp)
-{
-	struct tm   sTimeNow;
+int32_t WelsStrftime (str_t* pBuffer, int32_t iSize, const str_t* kpFormat, const SWelsTime* kpTp) {
+  struct tm   sTimeNow;
 
-	localtime_s(&sTimeNow, &kpTp->time);
+  localtime_s (&sTimeNow, &kpTp->time);
 
-	return strftime(pBuffer, iSize, kpFormat, &sTimeNow);
+  return strftime (pBuffer, iSize, kpFormat, &sTimeNow);
 }
 
-#else 
+#else
 
-int32_t WelsSnprintf(str_t * pBuffer,  int32_t iSizeOfBuffer, const str_t * kpFormat, ...)
-{
-	va_list pArgPtr;
-	int32_t iRc;
+int32_t WelsSnprintf (str_t* pBuffer,  int32_t iSizeOfBuffer, const str_t* kpFormat, ...) {
+  va_list pArgPtr;
+  int32_t iRc;
 
-	va_start(pArgPtr, kpFormat);
+  va_start (pArgPtr, kpFormat);
 
-    iRc = vsprintf(pBuffer, kpFormat, pArgPtr);//confirmed_safe_unsafe_usage
+  iRc = vsprintf (pBuffer, kpFormat, pArgPtr); //confirmed_safe_unsafe_usage
 
-	va_end(pArgPtr);
+  va_end (pArgPtr);
 
-	return iRc;
+  return iRc;
 }
 
-str_t* WelsStrncpy(str_t * pDest, int32_t iSizeInBytes, const str_t * kpSrc, int32_t iCount)
-{
-	strncpy(pDest, kpSrc, iCount);//confirmed_safe_unsafe_usage
+str_t* WelsStrncpy (str_t* pDest, int32_t iSizeInBytes, const str_t* kpSrc, int32_t iCount) {
+  strncpy (pDest, kpSrc, iCount); //confirmed_safe_unsafe_usage
 
-	return pDest;
+  return pDest;
 }
 
-int32_t WelsStrnlen(const str_t * kpStr,  int32_t iMaxlen)
-{
-	return strlen(kpStr);//confirmed_safe_unsafe_usage
+int32_t WelsStrnlen (const str_t* kpStr,  int32_t iMaxlen) {
+  return strlen (kpStr); //confirmed_safe_unsafe_usage
 }
 
-int32_t WelsVsprintf(str_t * pBuffer, int32_t iSizeOfBuffer, const str_t * kpFormat, va_list pArgPtr)
-{
-	return vsprintf(pBuffer, kpFormat, pArgPtr);//confirmed_safe_unsafe_usage
+int32_t WelsVsprintf (str_t* pBuffer, int32_t iSizeOfBuffer, const str_t* kpFormat, va_list pArgPtr) {
+  return vsprintf (pBuffer, kpFormat, pArgPtr); //confirmed_safe_unsafe_usage
 }
 
 
-WelsFileHandle* WelsFopen(const str_t * kpFilename,  const str_t * kpMode)
-{
-	return fopen(kpFilename, kpMode);
+WelsFileHandle* WelsFopen (const str_t* kpFilename,  const str_t* kpMode) {
+  return fopen (kpFilename, kpMode);
 }
 
-int32_t WelsFclose(WelsFileHandle* pFp)
-{
-	return fclose(pFp);
+int32_t WelsFclose (WelsFileHandle* pFp) {
+  return fclose (pFp);
 }
 
-int32_t WelsGetTimeOfDay(SWelsTime * pTp)
-{
-	return _ftime(pTp);
+int32_t WelsGetTimeOfDay (SWelsTime* pTp) {
+  return _ftime (pTp);
 }
 
-int32_t WelsStrftime(str_t * pBuffer, int32_t iSize, const str_t * kpFormat, const SWelsTime * kpTp)
-{
-	struct tm  * pTnow;
+int32_t WelsStrftime (str_t* pBuffer, int32_t iSize, const str_t* kpFormat, const SWelsTime* kpTp) {
+  struct tm*   pTnow;
 
-	pTnow = localtime(&kpTp->time);
+  pTnow = localtime (&kpTp->time);
 
-	return strftime(pBuffer, iSize, kpFormat, pTnow);
+  return strftime (pBuffer, iSize, kpFormat, pTnow);
 }
 
 
@@ -207,101 +190,89 @@
 
 #else  //GCC
 
-int32_t WelsSnprintf(str_t * pBuffer,  int32_t iSizeOfBuffer, const str_t * kpFormat, ...)
-{
-	va_list pArgPtr;
-	int32_t iRc;
+int32_t WelsSnprintf (str_t* pBuffer,  int32_t iSizeOfBuffer, const str_t* kpFormat, ...) {
+  va_list pArgPtr;
+  int32_t iRc;
 
-	va_start(pArgPtr, kpFormat);
+  va_start (pArgPtr, kpFormat);
 
-    iRc = vsnprintf(pBuffer, iSizeOfBuffer, kpFormat, pArgPtr);
+  iRc = vsnprintf (pBuffer, iSizeOfBuffer, kpFormat, pArgPtr);
 
-	va_end(pArgPtr);
+  va_end (pArgPtr);
 
-	return iRc;
+  return iRc;
 }
 
-str_t* WelsStrncpy(str_t * pDest, int32_t iSizeInBytes, const str_t * kpSrc, int32_t iCount)
-{
-    return strncpy(pDest, kpSrc, iCount);//confirmed_safe_unsafe_usage	
+str_t* WelsStrncpy (str_t* pDest, int32_t iSizeInBytes, const str_t* kpSrc, int32_t iCount) {
+  return strncpy (pDest, kpSrc, iCount); //confirmed_safe_unsafe_usage
 }
 
 #if !defined(MACOS) && !defined(UNIX) && !defined(APPLE_IOS)
-int32_t WelsStrnlen(const str_t * kpStr,  int32_t iMaxlen)
-{
-	return strnlen(kpStr, iMaxlen);//confirmed_safe_unsafe_usage
+int32_t WelsStrnlen (const str_t* kpStr,  int32_t iMaxlen) {
+  return strnlen (kpStr, iMaxlen); //confirmed_safe_unsafe_usage
 }
 #else
-int32_t WelsStrnlen(const str_t *kpString, int32_t iMaxlen)
-{
-	// In mac os, there is no strnlen in string.h, we can only use strlen instead of strnlen or
-	// implement strnlen by ourself
-	
+int32_t WelsStrnlen (const str_t* kpString, int32_t iMaxlen) {
+  // In mac os, there is no strnlen in string.h, we can only use strlen instead of strnlen or
+  // implement strnlen ourselves
+
 #if 1
-	return strlen(pString);//confirmed_safe_unsafe_usage
-#else	
-	const str_t *kpSrc;
-	for (kpSrc = kpString; iMaxlen-- && *kpSrc != '\0'; ++kpSrc)
-		return kpSrc - kpString;
+  return strlen (kpString); //confirmed_safe_unsafe_usage
+#else
+  const str_t* kpSrc;
+  for (kpSrc = kpString; iMaxlen-- && *kpSrc != '\0'; ++kpSrc) {} // empty body: just advance to the terminator or the iMaxlen limit
+  return kpSrc - kpString;
 #endif
-	
+
 }
 #endif
 
-int32_t WelsVsprintf(str_t * pBuffer, int32_t iSizeOfBuffer, const str_t * kpFormat, va_list pArgPtr)
-{
-	return vsprintf(pBuffer, kpFormat, pArgPtr);//confirmed_safe_unsafe_usage
+int32_t WelsVsprintf (str_t* pBuffer, int32_t iSizeOfBuffer, const str_t* kpFormat, va_list pArgPtr) {
+  return vsnprintf (pBuffer, iSizeOfBuffer, kpFormat, pArgPtr); // bounded by iSizeOfBuffer so an over-long message cannot overrun pBuffer
 }
 
-WelsFileHandle* WelsFopen(const str_t * kpFilename,  const str_t * kpMode)
-{
-	return fopen(kpFilename, kpMode);
+WelsFileHandle* WelsFopen (const str_t* kpFilename,  const str_t* kpMode) {
+  return fopen (kpFilename, kpMode);
 }
 
-int32_t WelsFclose(WelsFileHandle  * pFp)
-{
-	return fclose(pFp);
+int32_t WelsFclose (WelsFileHandle*   pFp) {
+  return fclose (pFp);
 }
 
-int32_t WelsGetTimeOfDay(SWelsTime * pTp)
-{
-        struct timeval  sTv;
+int32_t WelsGetTimeOfDay (SWelsTime* pTp) {
+  struct timeval  sTv;
 
-        if( gettimeofday(&sTv, NULL) ){
-             return -1;
-        }
+  if (gettimeofday (&sTv, NULL)) {
+    return -1;
+  }
 
-        pTp->time = sTv.tv_sec;
-        pTp->millitm = (uint16_t)sTv.tv_usec/1000;
+  pTp->time = sTv.tv_sec;
+  pTp->millitm = (uint16_t) (sTv.tv_usec / 1000); // divide first: casting tv_usec to uint16_t before dividing truncated the microseconds
 
-        return 0;
+  return 0;
 }
 
-int32_t WelsStrftime(str_t * pBuffer, int32_t iSize, const str_t * kpFormat, const SWelsTime * kpTp)
-{
-	struct tm  * pTnow;
-        
-	pTnow = localtime(&kpTp->time);
+int32_t WelsStrftime (str_t* pBuffer, int32_t iSize, const str_t* kpFormat, const SWelsTime* kpTp) {
+  struct tm*   pTnow;
 
-	return strftime(pBuffer, iSize, kpFormat, pTnow);
+  pTnow = localtime (&kpTp->time);
+
+  return strftime (pBuffer, iSize, kpFormat, pTnow);
 }
 
 #endif
 
 
-int32_t WelsFwrite(const void_t * kpBuffer, int32_t iSize, int32_t iCount, WelsFileHandle* pFp)
-{
-	return fwrite(kpBuffer, iSize, iCount, pFp);
+int32_t WelsFwrite (const void_t* kpBuffer, int32_t iSize, int32_t iCount, WelsFileHandle* pFp) {
+  return fwrite (kpBuffer, iSize, iCount, pFp);
 }
 
-uint16_t WelsGetMillsecond(const SWelsTime * kpTp)
-{
-	return kpTp->millitm;
+uint16_t WelsGetMillsecond (const SWelsTime* kpTp) {
+  return kpTp->millitm;
 }
 
-int32_t WelsFflush(WelsFileHandle* pFp)
-{
-	return fflush(pFp);
+int32_t WelsFflush (WelsFileHandle* pFp) {
+  return fflush (pFp);
 }
 
 } // namespace WelsDec
\ No newline at end of file
--- a/codec/decoder/plus/inc/welsCodecTrace.h
+++ b/codec/decoder/plus/inc/welsCodecTrace.h
@@ -40,132 +40,124 @@
 namespace WelsDec {
 
 #ifdef WIN32
-typedef int ( *CM_WELS_TRACE)( const char* kpFormat, ...);
+typedef int (*CM_WELS_TRACE) (const char* kpFormat, ...);
 #else
-typedef int ( *CM_WELS_TRACE)( const char* kpDllName, const char* kpFormat, ...);
+typedef int (*CM_WELS_TRACE) (const char* kpDllName, const char* kpFormat, ...);
 #endif
 
 
 typedef  enum {
-	Wels_Trace_Type     = 0,
-	Wels_Trace_Type_File    = 1,
-	Wels_Trace_Type_WinDgb  = 2,
+Wels_Trace_Type     = 0,
+Wels_Trace_Type_File    = 1,
+Wels_Trace_Type_WinDgb  = 2,
 } EWelsTraceType;
 
-class  IWelsTrace 
-{
-public:
-	enum {
-		WELS_LOG_QUIET     = 0,
-		WELS_LOG_ERROR     = 1 << 0,
-		WELS_LOG_WARNING   = 1 << 1,
-		WELS_LOG_INFO      = 1 << 2,
-		WELS_LOG_DEBUG     = 1 << 3,
-		WELS_LOG_RESV      = 1 << 4,
-	    WELS_LOG_DEFAULT   = WELS_LOG_ERROR | WELS_LOG_WARNING | WELS_LOG_INFO | WELS_LOG_DEBUG,
+class  IWelsTrace {
+ public:
+enum {
+  WELS_LOG_QUIET     = 0,
+  WELS_LOG_ERROR     = 1 << 0,
+  WELS_LOG_WARNING   = 1 << 1,
+  WELS_LOG_INFO      = 1 << 2,
+  WELS_LOG_DEBUG     = 1 << 3,
+  WELS_LOG_RESV      = 1 << 4,
+  WELS_LOG_DEFAULT   = WELS_LOG_ERROR | WELS_LOG_WARNING | WELS_LOG_INFO | WELS_LOG_DEBUG,
 
 
-		MAX_LOG_SIZE       = 1024,
-	};
+  MAX_LOG_SIZE       = 1024,
+};
 
-	virtual ~IWelsTrace() {};
+virtual ~IWelsTrace() {};
 
-	virtual int32_t  SetTraceLevel(int32_t iLevel) = 0;
-	virtual int32_t  Trace(const int32_t kLevel, const str_t * kpFormat,  va_list pVl) = 0;
+virtual int32_t  SetTraceLevel (int32_t iLevel) = 0;
+virtual int32_t  Trace (const int32_t kLevel, const str_t* kpFormat,  va_list pVl) = 0;
 
-	static void_t  WelsTrace(void_t* pObject, const int32_t kLevel, const str_t * kpFormat, va_list pVl)
-	{
-		IWelsTrace  * pThis = (IWelsTrace*)(pObject);
+static void_t  WelsTrace (void_t* pObject, const int32_t kLevel, const str_t* kpFormat, va_list pVl) {
+  IWelsTrace*   pThis = (IWelsTrace*) (pObject);
 
-		if( pThis ){
-			pThis->Trace(kLevel, kpFormat, pVl);
-		}
-	}
+  if (pThis) {
+    pThis->Trace (kLevel, kpFormat, pVl);
+  }
+}
 
-	static void_t WelsVTrace(void_t *pObject, const int32_t kLevel, const str_t *kpFormat, ...)
-	{
-		IWelsTrace * pThis = (IWelsTrace *)(pObject);
+static void_t WelsVTrace (void_t* pObject, const int32_t kLevel, const str_t* kpFormat, ...) {
+  IWelsTrace* pThis = (IWelsTrace*) (pObject);
 
-		va_list  argptr;	
+  va_list  argptr;
 
-		va_start(argptr, kpFormat);	
+  va_start (argptr, kpFormat);
 
-		if( pThis ){
-			pThis->Trace(kLevel, kpFormat, argptr);		
-		}
+  if (pThis) {
+    pThis->Trace (kLevel, kpFormat, argptr);
+  }
 
-		va_end(argptr);
-	}
+  va_end (argptr);
+}
 
 
 };
 
-class CWelsTraceBase : public IWelsTrace
-{
-public:
-	virtual int32_t  SetTraceLevel(int32_t iLevel);
-	virtual int32_t  Trace(const int32_t kLevel, const str_t * kpFormat,  va_list pVl);
+class CWelsTraceBase : public IWelsTrace {
+ public:
+virtual int32_t  SetTraceLevel (int32_t iLevel);
+virtual int32_t  Trace (const int32_t kLevel, const str_t* kpFormat,  va_list pVl);
 
-    virtual int32_t  WriteString(int32_t iLevel, const str_t * pStr) = 0;
-protected:
-	CWelsTraceBase() 
-	{
-		m_iLevel = WELS_LOG_DEFAULT;
-	};
+virtual int32_t  WriteString (int32_t iLevel, const str_t* pStr) = 0;
+ protected:
+CWelsTraceBase() {
+  m_iLevel = WELS_LOG_DEFAULT;
+};
 
-private:
-	int32_t   m_iLevel;
+ private:
+int32_t   m_iLevel;
 };
 
-class CWelsTraceFile : public CWelsTraceBase
-{
-public:
-	CWelsTraceFile(const str_t  * filename = (const str_t *)"wels_decoder_trace.txt");
-	virtual ~CWelsTraceFile();
+class CWelsTraceFile : public CWelsTraceBase {
+ public:
+CWelsTraceFile (const str_t*   filename = (const str_t*)"wels_decoder_trace.txt");
+virtual ~CWelsTraceFile();
 
-public:
-	virtual int32_t  WriteString(int32_t iLevel, const str_t * pStr);
+ public:
+virtual int32_t  WriteString (int32_t iLevel, const str_t* pStr);
 
-private:
-    WelsFileHandle* m_pTraceFile;
+ private:
+WelsFileHandle* m_pTraceFile;
 };
 
 #ifdef  WIN32
-class CWelsTraceWinDgb : public CWelsTraceBase
-{
-public:
-	CWelsTraceWinDgb() {};
-	virtual ~CWelsTraceWinDgb() {};
+class CWelsTraceWinDgb : public CWelsTraceBase {
+ public:
+CWelsTraceWinDgb() {};
+virtual ~CWelsTraceWinDgb() {};
 
-public:
-	virtual int32_t  WriteString(int32_t iLevel, const str_t * pStr);
+ public:
+virtual int32_t  WriteString (int32_t iLevel, const str_t* pStr);
 };
 #endif
 
-class CWelsCodecTrace : public CWelsTraceBase
-{
-public:
-	CWelsCodecTrace() ;
-	virtual ~CWelsCodecTrace();
+class CWelsCodecTrace : public CWelsTraceBase {
+ public:
+CWelsCodecTrace() ;
+virtual ~CWelsCodecTrace();
 
-public:
-	virtual int32_t  WriteString(int32_t iLevel, const str_t * pStr);
+ public:
+virtual int32_t  WriteString (int32_t iLevel, const str_t* pStr);
 
-protected:
-	int32_t  LoadWelsTraceModule();
-	int32_t  UnloadWelsTraceModule();
+ protected:
+int32_t  LoadWelsTraceModule();
+int32_t  UnloadWelsTraceModule();
 
-private:
-    void_t  * m_hTraceHandle;
+ private:
+void_t*   m_hTraceHandle;
 
-    CM_WELS_TRACE m_fpDebugTrace;
-	CM_WELS_TRACE m_fpInfoTrace;
-	CM_WELS_TRACE m_fpWarnTrace;
-	CM_WELS_TRACE m_fpErrorTrace;
+CM_WELS_TRACE m_fpDebugTrace;
+CM_WELS_TRACE m_fpInfoTrace;
+CM_WELS_TRACE m_fpWarnTrace;
+CM_WELS_TRACE m_fpErrorTrace;
 };
 
 
-IWelsTrace  * CreateWelsTrace(EWelsTraceType  eType,  void_t * pParam = NULL);
+IWelsTrace*   CreateWelsTrace (EWelsTraceType  eType,  void_t* pParam = NULL);
 
 } // namespace WelsDec
 
--- a/codec/decoder/plus/inc/welsDecoderExt.h
+++ b/codec/decoder/plus/inc/welsDecoderExt.h
@@ -58,62 +58,61 @@
 
 //#define OUTPUT_BIT_STREAM  ////for test to output bitstream
 
-class CWelsDecoder : public ISVCDecoder  
-{
-public:
-	CWelsDecoder(void_t);
-	virtual ~CWelsDecoder();
+class CWelsDecoder : public ISVCDecoder {
+ public:
+CWelsDecoder (void_t);
+virtual ~CWelsDecoder();
 
-	virtual long Initialize(void_t* pParam, const INIT_TYPE keInitType);
-	virtual long Unintialize();		
-	
-	/***************************************************************************
-	*	Description:
-	*		Decompress one frame, and output RGB24 or YV12 decoded stream and its length.
-	*	Input parameters:
-	*       Parameter		TYPE			       Description
-	*       pSrc             unsigned char*         the h264 stream to decode
-	*       srcLength       int                    the length of h264 steam
-	*       pDst             unsigned char*         buffer pointer of decoded data
-	*       pDstInfo        SBufferInfo&           information provided to API including width, height, SW/HW option, etc
-	*
-	*	return: if decode frame success return 0, otherwise corresponding error returned.
-	/***************************************************************************/
-	virtual DECODING_STATE DecodeFrame(	const unsigned char* kpSrc,
-		                                const int kiSrcLen,	
-		                                unsigned char** ppDst,
-		                                int* pStride,
-		                                int& iWidth,
-		                                int& iHeight	);
+virtual long Initialize (void_t* pParam, const INIT_TYPE keInitType);
+virtual long Uninitialize();
 
-	virtual DECODING_STATE DecodeFrame(	const unsigned char* kpSrc,
-											const int kiSrcLen,	
-											void_t ** ppDst,
-											SBufferInfo* pDstInfo);
-	virtual DECODING_STATE DecodeFrameEx( const unsigned char * kpSrc,
-		                                  const int kiSrcLen,
-		                                  unsigned char * pDst,
-										  int iDstStride,
-		                                  int & iDstLen,
-		                                  int & iWidth,
-		                                  int & iHeight,
-		                                  int & color_format);
+/***************************************************************************
+*	Description:
+*		Decompress one frame, and output RGB24 or YV12 decoded stream and its length.
+*	Input parameters:
+*       Parameter		TYPE			       Description
+*       pSrc             unsigned char*         the h264 stream to decode
+*       srcLength       int                    the length of h264 stream
+*       pDst             unsigned char*         buffer pointer of decoded data
+*       pDstInfo        SBufferInfo&           information provided to API including width, height, SW/HW option, etc
+*
+*	return: if decode frame success return 0, otherwise corresponding error returned.
+****************************************************************************/
+virtual DECODING_STATE DecodeFrame (const unsigned char* kpSrc,
+                                    const int kiSrcLen,
+                                    unsigned char** ppDst,
+                                    int* pStride,
+                                    int& iWidth,
+                                    int& iHeight);
 
-    virtual long SetOption(DECODER_OPTION eOptID, void_t* pOption);
-	virtual long GetOption(DECODER_OPTION eOptID, void_t* pOption);
+virtual DECODING_STATE DecodeFrame (const unsigned char* kpSrc,
+                                    const int kiSrcLen,
+                                    void_t** ppDst,
+                                    SBufferInfo* pDstInfo);
+virtual DECODING_STATE DecodeFrameEx (const unsigned char* kpSrc,
+                                      const int kiSrcLen,
+                                      unsigned char* pDst,
+                                      int iDstStride,
+                                      int& iDstLen,
+                                      int& iWidth,
+                                      int& iHeight,
+                                      int& color_format);
 
-private:	
-	PWelsDecoderContext 				m_pDecContext;
-	IWelsTrace							*m_pTrace;
-	
-	void_t InitDecoder( void_t );
-	void_t UninitDecoder( void_t );
-	
+virtual long SetOption (DECODER_OPTION eOptID, void_t* pOption);
+virtual long GetOption (DECODER_OPTION eOptID, void_t* pOption);
+
+ private:
+PWelsDecoderContext 				m_pDecContext;
+IWelsTrace*							m_pTrace;
+
+void_t InitDecoder (void_t);
+void_t UninitDecoder (void_t);
+
 #ifdef OUTPUT_BIT_STREAM
-	WelsFileHandle* m_pFBS;
-	WelsFileHandle* m_pFBSSize;
+WelsFileHandle* m_pFBS;
+WelsFileHandle* m_pFBSSize;
 #endif//OUTPUT_BIT_STREAM
-	
+
 };
 
 } // namespace WelsDec
--- a/codec/decoder/plus/res/resource.h
+++ b/codec/decoder/plus/res/resource.h
@@ -4,7 +4,7 @@
 //
 
 // Next default values for new objects
-// 
+//
 #ifdef APSTUDIO_INVOKED
 #ifndef APSTUDIO_READONLY_SYMBOLS
 #define _APS_NEXT_RESOURCE_VALUE        101
--- a/codec/decoder/plus/res/welsdec.rc
+++ b/codec/decoder/plus/res/welsdec.rc
@@ -7,7 +7,7 @@
 //
 // Generated from the TEXTINCLUDE 2 resource.
 //
-#include "afxres.h"
+#include "windows.h"
 
 /////////////////////////////////////////////////////////////////////////////
 #undef APSTUDIO_READONLY_SYMBOLS
@@ -27,18 +27,18 @@
 // TEXTINCLUDE
 //
 
-1 TEXTINCLUDE 
+1 TEXTINCLUDE
 BEGIN
     "resource.h\0"
 END
 
-2 TEXTINCLUDE 
+2 TEXTINCLUDE
 BEGIN
-    "#include ""afxres.h""\r\n"
+    "#include ""windows.h""\r\n"
     "\0"
 END
 
-3 TEXTINCLUDE 
+3 TEXTINCLUDE
 BEGIN
     "\r\n"
     "\0"
--- a/codec/decoder/plus/src/welsCodecTrace.cpp
+++ b/codec/decoder/plus/src/welsCodecTrace.cpp
@@ -44,6 +44,7 @@
 
 #include "welsCodecTrace.h"
 #include "utils.h"
+#include "logging.h"
 #if defined LINUX || defined SOLARIS || defined UNIX || defined MACOS //LINUX/SOLARIS/UNIX
 #include <dlfcn.h>
 #endif
@@ -58,362 +59,339 @@
 namespace WelsDec {
 
 #ifdef MACOS
-static CFBundleRef LoadLibrary(const char* lpszbundle)
-{
-	// 1.get bundle path
-	char cBundlePath[PATH_MAX];
-	memset(cBundlePath, 0, PATH_MAX);
-	
-	Dl_info 	dlInfo;
-	static int  sDummy;
-	dladdr((void_t*)&sDummy, &dlInfo);
-	
-	strlcpy(cBundlePath, dlInfo.dli_fname, PATH_MAX);
-	
-	char * pPath = NULL;
-	for(int i = 4; i > 0; i--)
-	{
-		pPath = strrchr(cBundlePath,'/');//confirmed_safe_unsafe_usage
-		if(pPath)
-		{
-			*pPath = 0;
-		}
-		else
-		{
-			break;
-		}
-	}
-	if(pPath)
-	{
-		strlcat(cBundlePath, "/", PATH_MAX);
-	}
-	else
-	{
-		return NULL;
-	}
-	
-	strlcat(cBundlePath, lpszbundle, PATH_MAX);
-	
-	FSRef bundlePath;
-	OSStatus iStatus = FSPathMakeRef((unsigned char*)cBundlePath, &bundlePath, NULL);
-	if(noErr != iStatus)
-		return NULL;
-	
-	CFURLRef bundleURL = CFURLCreateFromFSRef(kCFAllocatorSystemDefault, &bundlePath);
-	if(NULL == bundleURL)
-		return NULL;
-	
-	// 2.get bundle ref
-	CFBundleRef bundleRef = CFBundleCreate(kCFAllocatorSystemDefault, bundleURL);
-	CFRelease(bundleURL);
-	
+static CFBundleRef LoadLibrary (const char* lpszbundle) {
+  // 1.get bundle path
+  char cBundlePath[PATH_MAX];
+  memset (cBundlePath, 0, PATH_MAX);
+
+  Dl_info 	dlInfo;
+  static int  sDummy;
+  dladdr ((void_t*)&sDummy, &dlInfo);
+
+  strlcpy (cBundlePath, dlInfo.dli_fname, PATH_MAX);
+
+  char* pPath = NULL;
+  for (int i = 4; i > 0; i--) {
+    pPath = strrchr (cBundlePath, '/'); //confirmed_safe_unsafe_usage
+    if (pPath) {
+      *pPath = 0;
+    } else {
+      break;
+    }
+  }
+  if (pPath) {
+    strlcat (cBundlePath, "/", PATH_MAX);
+  } else {
+    return NULL;
+  }
+
+  strlcat (cBundlePath, lpszbundle, PATH_MAX);
+
+  FSRef bundlePath;
+  OSStatus iStatus = FSPathMakeRef ((unsigned char*)cBundlePath, &bundlePath, NULL);
+  if (noErr != iStatus)
+    return NULL;
+
+  CFURLRef bundleURL = CFURLCreateFromFSRef (kCFAllocatorSystemDefault, &bundlePath);
+  if (NULL == bundleURL)
+    return NULL;
+
+  // 2.get bundle ref
+  CFBundleRef bundleRef = CFBundleCreate (kCFAllocatorSystemDefault, bundleURL);
+  CFRelease (bundleURL);
+
 //	Boolean bReturn = FALSE;
-	if(NULL != bundleRef)
-	{
-		//	bReturn = CFBundleLoadExecutable(bundleRef);
-	}
-	
-	return bundleRef;
+  if (NULL != bundleRef) {
+    //	bReturn = CFBundleLoadExecutable(bundleRef);
+  }
+
+  return bundleRef;
 }
 
-static Boolean FreeLibrary(CFBundleRef bundle)
-{	
-	if(NULL != bundle)
-	{
-		//	CFBundleUnloadExecutable(bundle);
-		CFRelease(bundle);
-	}
-	
-	return TRUE;
+static Boolean FreeLibrary (CFBundleRef bundle) {
+  if (NULL != bundle) {
+    //	CFBundleUnloadExecutable(bundle);
+    CFRelease (bundle);
+  }
+
+  return TRUE;
 }
 
-static void_t* GetProcessAddress(CFBundleRef bundle, const char* lpszprocname)
-{
-	if(NULL == bundle)
-		return NULL;
-	
-	CFStringRef cfprocname = CFStringCreateWithCString(NULL,lpszprocname,CFStringGetSystemEncoding());
-	void_t *processAddress = CFBundleGetFunctionPointerForName(bundle,cfprocname);
-	CFRelease(cfprocname);
-	
-	return processAddress;
+static void_t* GetProcessAddress (CFBundleRef bundle, const char* lpszprocname) {
+  if (NULL == bundle)
+    return NULL;
+
+  CFStringRef cfprocname = CFStringCreateWithCString (NULL, lpszprocname, CFStringGetSystemEncoding());
+  void_t* processAddress = CFBundleGetFunctionPointerForName (bundle, cfprocname);
+  CFRelease (cfprocname);
+
+  return processAddress;
 }
 #endif
 
 
 
-int32_t  CWelsTraceBase::SetTraceLevel(int iLevel)
-{
-	m_iLevel = iLevel;
+int32_t  CWelsTraceBase::SetTraceLevel (int iLevel) {
+  m_iLevel = iLevel;
 
-	return 0;
+  return 0;
 }
 
-int32_t  CWelsTraceBase::Trace(const int kLevel, const str_t *kpFormat, va_list pVl)
-{
-	if( kLevel & m_iLevel ){
-		str_t chWStrFormat[MAX_LOG_SIZE] = {0};
-		str_t chBuf[MAX_LOG_SIZE] = {0};
-		str_t chResult[MAX_LOG_SIZE] = {0};
-		const int32_t kLen	= WelsStrnlen((const str_t *)"[DECODER]: ", MAX_LOG_SIZE);
+int32_t  CWelsTraceBase::Trace (const int kLevel, const str_t* kpFormat, va_list pVl) {
+  if (kLevel & m_iLevel) {
+    str_t chWStrFormat[MAX_LOG_SIZE] = {0};
+    str_t chBuf[MAX_LOG_SIZE] = {0};
+    str_t chResult[MAX_LOG_SIZE] = {0};
+    const int32_t kLen	= WelsStrnlen ((const str_t*)"[DECODER]: ", MAX_LOG_SIZE);
 
-		WelsStrncpy(chWStrFormat, MAX_LOG_SIZE, (const str_t *)kpFormat, WelsStrnlen((const str_t *)kpFormat, MAX_LOG_SIZE));	
+    WelsStrncpy (chWStrFormat, MAX_LOG_SIZE, (const str_t*)kpFormat, WelsStrnlen ((const str_t*)kpFormat, MAX_LOG_SIZE));
 
-		WelsStrncpy(chBuf, MAX_LOG_SIZE, (const str_t *)"[DECODER]: ", kLen);
+    WelsStrncpy (chBuf, MAX_LOG_SIZE, (const str_t*)"[DECODER]: ", kLen);
 
-		WelsVsprintf((chBuf + kLen),  MAX_LOG_SIZE - kLen, (const str_t *)kpFormat, pVl);
-		WelsStrncpy(chResult, MAX_LOG_SIZE, (const str_t *)chBuf, WelsStrnlen((const str_t *)chBuf, MAX_LOG_SIZE));
+    WelsVsprintf ((chBuf + kLen),  MAX_LOG_SIZE - kLen, (const str_t*)kpFormat, pVl);
+    WelsStrncpy (chResult, MAX_LOG_SIZE, (const str_t*)chBuf, WelsStrnlen ((const str_t*)chBuf, MAX_LOG_SIZE));
 
-		WriteString(kLevel, chResult);
-	}
+    WriteString (kLevel, chResult);
+  }
 
-	return 0;
+  return 0;
 }
 
-CWelsTraceFile::CWelsTraceFile(const str_t * pFileName)
-{
-	m_pTraceFile = WelsFopen(pFileName, (const str_t *)"wt");
+CWelsTraceFile::CWelsTraceFile (const str_t* pFileName) {
+  m_pTraceFile = WelsFopen (pFileName, (const str_t*)"wt");
 }
 
-CWelsTraceFile::~CWelsTraceFile()
-{
-	if( m_pTraceFile ){
-		WelsFclose(m_pTraceFile);
-		m_pTraceFile = NULL;
-	}
+CWelsTraceFile::~CWelsTraceFile() {
+  if (m_pTraceFile) {
+    WelsFclose (m_pTraceFile);
+    m_pTraceFile = NULL;
+  }
 }
 
-int32_t CWelsTraceFile::WriteString(int32_t iLevel, const str_t * pStr)
-{
-	int  iRC = 0;
-	const static str_t chEnter[16] = "\n";
-	if( m_pTraceFile ){
-		iRC += WelsFwrite(pStr, 1, WelsStrnlen(pStr, MAX_LOG_SIZE), m_pTraceFile);
-		iRC += WelsFwrite(chEnter, 1, WelsStrnlen(chEnter,  16), m_pTraceFile);
-		WelsFflush(m_pTraceFile);
-	}
-	return iRC;
+int32_t CWelsTraceFile::WriteString (int32_t iLevel, const str_t* pStr) {
+  int  iRC = 0;
+  const static str_t chEnter[16] = "\n";
+  if (m_pTraceFile) {
+    iRC += WelsFwrite (pStr, 1, WelsStrnlen (pStr, MAX_LOG_SIZE), m_pTraceFile);
+    iRC += WelsFwrite (chEnter, 1, WelsStrnlen (chEnter,  16), m_pTraceFile);
+    WelsFflush (m_pTraceFile);
+  }
+  return iRC;
 }
 
 
 #ifdef WIN32
 
-int32_t CWelsTraceWinDgb::WriteString(int32_t iLevel, const str_t * pStr)
-{
-	OutputDebugStringA(pStr);
+int32_t CWelsTraceWinDgb::WriteString (int32_t iLevel, const str_t* pStr) {
+  OutputDebugStringA (pStr);
 
-	return WelsStrnlen(pStr, MAX_LOG_SIZE);//strnlen(pStr, MAX_LOG_SIZE);
+  return WelsStrnlen (pStr, MAX_LOG_SIZE); //strnlen(pStr, MAX_LOG_SIZE);
 }
 
 #endif
 
-CWelsCodecTrace::CWelsCodecTrace()
-{
-	m_hTraceHandle = NULL;
-    m_fpDebugTrace = NULL;
-	m_fpInfoTrace = NULL;
-	m_fpWarnTrace = NULL;
-	m_fpErrorTrace = NULL;
+CWelsCodecTrace::CWelsCodecTrace() {
+  m_hTraceHandle = NULL;
+  m_fpDebugTrace = NULL;
+  m_fpInfoTrace = NULL;
+  m_fpWarnTrace = NULL;
+  m_fpErrorTrace = NULL;
 
-	LoadWelsTraceModule();
+  LoadWelsTraceModule();
 }
 
-CWelsCodecTrace::~CWelsCodecTrace()
-{
-	UnloadWelsTraceModule();
+CWelsCodecTrace::~CWelsCodecTrace() {
+  UnloadWelsTraceModule();
 }
 
-int32_t  CWelsCodecTrace::LoadWelsTraceModule()
-{	
-#if defined WIN32	
-	HMODULE hHandle = ::LoadLibrary("welstrace.dll");
+int32_t  CWelsCodecTrace::LoadWelsTraceModule() {
+#ifdef NO_DYNAMIC_VP
+  m_fpDebugTrace = welsStderrTrace<WELS_LOG_DEBUG>;
+  m_fpInfoTrace = welsStderrTrace<WELS_LOG_INFO>;
+  m_fpWarnTrace = welsStderrTrace<WELS_LOG_WARNING>;
+  m_fpErrorTrace = welsStderrTrace<WELS_LOG_ERROR>;
+#else
+#if defined WIN32
+  HMODULE hHandle = ::LoadLibrary ("welstrace.dll");
 //	HMODULE handle = ::LoadLibrary("contrace.dll");  // for c7 trace
-	if ( NULL == hHandle )
-		return -1;
+  if (NULL == hHandle)
+    return -1;
 
-	CHAR chPath[ _MAX_PATH]= {0};
-	GetModuleFileName( (HMODULE)hHandle, chPath, _MAX_PATH);
+  CHAR chPath[ _MAX_PATH] = {0};
+  GetModuleFileName ((HMODULE)hHandle, chPath, _MAX_PATH);
 
-	m_hTraceHandle = ::LoadLibrary(chPath);
-	
-	OutputDebugStringA(chPath);
-	if( m_hTraceHandle) {
-		m_fpDebugTrace = ( CM_WELS_TRACE)::GetProcAddress( ( HMODULE)m_hTraceHandle, "WELSDEBUGA");
-		m_fpInfoTrace = ( CM_WELS_TRACE)::GetProcAddress( ( HMODULE)m_hTraceHandle, "WELSINFOA");
-		m_fpWarnTrace = ( CM_WELS_TRACE)::GetProcAddress( ( HMODULE)m_hTraceHandle, "WELSWARNA");
-		m_fpErrorTrace = ( CM_WELS_TRACE)::GetProcAddress( ( HMODULE)m_hTraceHandle, "WELSERRORA");
-	}
+  m_hTraceHandle = ::LoadLibrary (chPath);
 
-	// coverity scan uninitial
-	if (hHandle != NULL)
-	{
-		::FreeLibrary(hHandle);
-		hHandle = NULL;
-	}
+  OutputDebugStringA (chPath);
+  if (m_hTraceHandle) {
+    m_fpDebugTrace = (CM_WELS_TRACE)::GetProcAddress ((HMODULE)m_hTraceHandle, "WELSDEBUGA");
+    m_fpInfoTrace = (CM_WELS_TRACE)::GetProcAddress ((HMODULE)m_hTraceHandle, "WELSINFOA");
+    m_fpWarnTrace = (CM_WELS_TRACE)::GetProcAddress ((HMODULE)m_hTraceHandle, "WELSWARNA");
+    m_fpErrorTrace = (CM_WELS_TRACE)::GetProcAddress ((HMODULE)m_hTraceHandle, "WELSERRORA");
+  }
+
+  // coverity scan uninitial
+  if (hHandle != NULL) {
+    ::FreeLibrary (hHandle);
+    hHandle = NULL;
+  }
 #elif defined MACOS
-	m_hTraceHandle = LoadLibrary("welstrace.bundle");
-	if(m_hTraceHandle) {
-		m_fpDebugTrace = ( CM_WELS_TRACE)GetProcessAddress( (CFBundleRef)m_hTraceHandle, "WELSDEBUG2");
-		m_fpInfoTrace = ( CM_WELS_TRACE)GetProcessAddress( (CFBundleRef)m_hTraceHandle, "WELSINFO2");
-		m_fpWarnTrace = ( CM_WELS_TRACE)GetProcessAddress( (CFBundleRef)m_hTraceHandle, "WELSWARN2");
-		m_fpErrorTrace = ( CM_WELS_TRACE)GetProcessAddress( (CFBundleRef)m_hTraceHandle, "WELSERROR2");
-	}
+  m_hTraceHandle = LoadLibrary ("welstrace.bundle");
+  if (m_hTraceHandle) {
+    m_fpDebugTrace = (CM_WELS_TRACE)GetProcessAddress ((CFBundleRef)m_hTraceHandle, "WELSDEBUG2");
+    m_fpInfoTrace = (CM_WELS_TRACE)GetProcessAddress ((CFBundleRef)m_hTraceHandle, "WELSINFO2");
+    m_fpWarnTrace = (CM_WELS_TRACE)GetProcessAddress ((CFBundleRef)m_hTraceHandle, "WELSWARN2");
+    m_fpErrorTrace = (CM_WELS_TRACE)GetProcessAddress ((CFBundleRef)m_hTraceHandle, "WELSERROR2");
+  }
 #elif defined LINUX || defined SOLARIS || defined UNIX
 //#else
 //	CCmString	cmPath;
-	str_t chPath[255]= {0};
-	Dl_info		sDlInfo;
-	static int	iMmTPAddress;
-    dladdr( &iMmTPAddress, &sDlInfo);
+  str_t chPath[255] = {0};
+  Dl_info		sDlInfo;
+  static int	iMmTPAddress;
+  if (0 == dladdr (&iMmTPAddress, &sDlInfo)) return -1; // dladdr returns 0 on failure and leaves sDlInfo unset
 
-	if (NULL == sDlInfo.dli_fname)
-		return -1;
-	WelsStrncpy(chPath, 255, (const str_t*)sDlInfo.dli_fname, WelsStrnlen((const str_t*)sDlInfo.dli_fname, 255));
-	str_t* p = strrchr(chPath, '/');//confirmed_safe_unsafe_usage
-	if ( NULL == p )
-		return -1;
-	const int iLenTraceName = WelsStrnlen((const str_t*)"/libwelstrace.so", 15);
-	const int iCurPos = p - chPath;
-	if ( iCurPos + iLenTraceName < 255 )
-		WelsStrncpy(p, 254-iCurPos, (const str_t*)"/libwelstrace.so", iLenTraceName );
-	else
-		return -1;
+  if (NULL == sDlInfo.dli_fname)
+    return -1;
+  WelsStrncpy (chPath, 255, (const str_t*)sDlInfo.dli_fname, WelsStrnlen ((const str_t*)sDlInfo.dli_fname, 255));
+  str_t* p = strrchr (chPath, '/'); //confirmed_safe_unsafe_usage
+  if (NULL == p)
+    return -1;
+  const int iLenTraceName = WelsStrnlen ((const str_t*)"/libwelstrace.so", 16); // literal is 16 chars; a bound of 15 could drop the final 'o' if the helper clamps
+  const int iCurPos = p - chPath;
+  if (iCurPos + iLenTraceName < 255)
+    WelsStrncpy (p, 254 - iCurPos, (const str_t*)"/libwelstrace.so", iLenTraceName);
+  else
+    return -1;
 
-	m_hTraceHandle = dlopen( chPath, RTLD_LAZY);
-	if (m_hTraceHandle == NULL)
-	{
-		WelsFileHandle* fp = WelsFopen((const str_t*)"/tmp/trace.txt", (const str_t*)"a");
-		if(fp)
-		{
-			fprintf(fp, "welsCodecTrace::welsCodecTrace ===> dlopen %s fail, %s\n", chPath, dlerror());
-			WelsFclose(fp);
-		}
-		return -1;
-	}
-	if (m_hTraceHandle) {
-		m_fpDebugTrace = ( CM_WELS_TRACE)dlsym( m_hTraceHandle, "WELSDEBUG2");
-		m_fpInfoTrace = ( CM_WELS_TRACE)dlsym( m_hTraceHandle, "WELSINFO2");
-		m_fpWarnTrace = ( CM_WELS_TRACE)dlsym( m_hTraceHandle, "WELSWARN2");
-		m_fpErrorTrace = ( CM_WELS_TRACE)dlsym( m_hTraceHandle, "WELSERROR2");
-		if(m_fpDebugTrace == NULL)
-		{
-			WelsFileHandle* fp = WelsFopen((const str_t*)"/tmp/trace.txt", (const str_t*)"a");
-			if(fp)
-			{
-				printf("welsCodecTrace::welsCodecTrace ===> dlsym failed (WELSDEBUG2) , dlerror = %s\n", dlerror());
-				WelsFclose(fp);
-			}
-			return -1;
-		}
-	}
+  m_hTraceHandle = dlopen (chPath, RTLD_LAZY);
+  if (m_hTraceHandle == NULL) {
+    WelsFileHandle* fp = WelsFopen ((const str_t*)"/tmp/trace.txt", (const str_t*)"a");
+    if (fp) {
+      fprintf (fp, "welsCodecTrace::welsCodecTrace ===> dlopen %s fail, %s\n", chPath, dlerror());
+      WelsFclose (fp);
+    }
+    return -1;
+  }
+  if (m_hTraceHandle) {
+    m_fpDebugTrace = (CM_WELS_TRACE)dlsym (m_hTraceHandle, "WELSDEBUG2");
+    m_fpInfoTrace = (CM_WELS_TRACE)dlsym (m_hTraceHandle, "WELSINFO2");
+    m_fpWarnTrace = (CM_WELS_TRACE)dlsym (m_hTraceHandle, "WELSWARN2");
+    m_fpErrorTrace = (CM_WELS_TRACE)dlsym (m_hTraceHandle, "WELSERROR2");
+    if (m_fpDebugTrace == NULL) {
+      WelsFileHandle* fp = WelsFopen ((const str_t*)"/tmp/trace.txt", (const str_t*)"a");
+      if (fp) {
+        fprintf (fp, "welsCodecTrace::welsCodecTrace ===> dlsym failed (WELSDEBUG2) , dlerror = %s\n", dlerror()); // log to the opened file, as the dlopen failure path does
+        WelsFclose (fp);
+      }
+      return -1;
+    }
+  }
 #endif
-	return 0;
+#endif  // NO_DYNAMIC_VP
+  return 0;
 }
 
-int32_t  CWelsCodecTrace::UnloadWelsTraceModule()
-{
+int32_t  CWelsCodecTrace::UnloadWelsTraceModule() {
 #if defined WIN32
-	if( m_hTraceHandle) {
-		::FreeLibrary( ( HMODULE)m_hTraceHandle);
-	}
+  if (m_hTraceHandle) {
+    ::FreeLibrary ((HMODULE)m_hTraceHandle);
+  }
 #elif defined MACOS
-	if (m_hTraceHandle) {
-		FreeLibrary( (CFBundleRef)m_hTraceHandle);
-	}
+  if (m_hTraceHandle) {
+    FreeLibrary ((CFBundleRef)m_hTraceHandle);
+  }
 #elif defined LINUX || defined SOLARIS || defined UNIX
-	if (m_hTraceHandle) {
-		::dlclose( m_hTraceHandle);
-	}
+  if (m_hTraceHandle) {
+    ::dlclose (m_hTraceHandle);
+  }
 #endif
 
-	m_hTraceHandle = NULL;
-	m_fpDebugTrace = NULL;
-	m_fpInfoTrace = NULL;
-	m_fpWarnTrace = NULL;
-	m_fpErrorTrace = NULL;
-	return 0;
+  m_hTraceHandle = NULL;
+  m_fpDebugTrace = NULL;
+  m_fpInfoTrace = NULL;
+  m_fpWarnTrace = NULL;
+  m_fpErrorTrace = NULL;
+  return 0;
 }
 
-int32_t  CWelsCodecTrace::WriteString(int32_t iLevel, const str_t * pStr)
-{
-	if( m_hTraceHandle )
-	{
+int32_t  CWelsCodecTrace::WriteString (int32_t iLevel, const str_t* pStr) {
+#ifndef NO_DYNAMIC_VP
+  if (m_hTraceHandle)
+#endif
+  {
 #ifdef WIN32
-		switch(iLevel)
-		{
-		case WELS_LOG_ERROR:
-			if(m_fpErrorTrace)
-				m_fpErrorTrace("%s", pStr);
-			break;
-		case WELS_LOG_WARNING:
-			if(m_fpWarnTrace)
-				m_fpWarnTrace("%s", pStr);
-			break;
-		case WELS_LOG_INFO:
-			if(m_fpInfoTrace)
-				m_fpInfoTrace("%s", pStr);
-			break;
-		case WELS_LOG_DEBUG:
-			if(m_fpDebugTrace)
-				m_fpDebugTrace("%s", pStr);
-			break;
-		default:
-			if(m_fpDebugTrace)
-				m_fpInfoTrace("%s", pStr);
-			break;
-		}
+    switch (iLevel) {
+    case WELS_LOG_ERROR:
+      if (m_fpErrorTrace)
+        m_fpErrorTrace ("%s", pStr);
+      break;
+    case WELS_LOG_WARNING:
+      if (m_fpWarnTrace)
+        m_fpWarnTrace ("%s", pStr);
+      break;
+    case WELS_LOG_INFO:
+      if (m_fpInfoTrace)
+        m_fpInfoTrace ("%s", pStr);
+      break;
+    case WELS_LOG_DEBUG:
+      if (m_fpDebugTrace)
+        m_fpDebugTrace ("%s", pStr);
+      break;
+    default:
+      if (m_fpInfoTrace) // guard the pointer that is actually called (unknown levels go to the info sink)
+        m_fpInfoTrace ("%s", pStr);
+      break;
+    }
 #else
-		switch(iLevel)
-		{
-		case WELS_LOG_ERROR:
-			if(m_fpErrorTrace)
-				m_fpErrorTrace("CODEC", "%s", pStr);
-			break;
-		case WELS_LOG_WARNING:
-			if(m_fpWarnTrace)
-				m_fpWarnTrace("CODEC", "%s",  pStr);
-			break;
-		case WELS_LOG_INFO:
-			if(m_fpInfoTrace)
-				m_fpInfoTrace("CODEC", "%s",  pStr);
-			break;
-		case WELS_LOG_DEBUG:
-			if(m_fpInfoTrace)
-				m_fpInfoTrace("CODEC", "%s",  pStr);
-			break;
-		default:
-			if(m_fpInfoTrace)
-				m_fpInfoTrace("CODEC", "%s",  pStr);
-			break;
-		}
+    switch (iLevel) {
+    case WELS_LOG_ERROR:
+      if (m_fpErrorTrace)
+        m_fpErrorTrace ("CODEC", "%s", pStr);
+      break;
+    case WELS_LOG_WARNING:
+      if (m_fpWarnTrace)
+        m_fpWarnTrace ("CODEC", "%s",  pStr);
+      break;
+    case WELS_LOG_INFO:
+      if (m_fpInfoTrace)
+        m_fpInfoTrace ("CODEC", "%s",  pStr);
+      break;
+    case WELS_LOG_DEBUG:
+      if (m_fpInfoTrace)
+        m_fpInfoTrace ("CODEC", "%s",  pStr);
+      break;
+    default:
+      if (m_fpInfoTrace)
+        m_fpInfoTrace ("CODEC", "%s",  pStr);
+      break;
+    }
 #endif
-	}
+  }
 
-	return 0;
+  return 0;
 }
 
 
-IWelsTrace  * CreateWelsTrace(EWelsTraceType  eType,  void_t * pParam)
-{
-	IWelsTrace  * pTrace = NULL;
-	switch(eType)
-	{
-	case Wels_Trace_Type:
-		pTrace = new CWelsCodecTrace();
-		break;
-	case Wels_Trace_Type_File:
-		pTrace = new CWelsTraceFile();
-		break;
+IWelsTrace*   CreateWelsTrace (EWelsTraceType  eType,  void_t* pParam) {
+  IWelsTrace*   pTrace = NULL;
+  switch (eType) {
+  case Wels_Trace_Type:
+    pTrace = new CWelsCodecTrace();
+    break;
+  case Wels_Trace_Type_File:
+    pTrace = new CWelsTraceFile();
+    break;
 #ifdef WIN32
-	case Wels_Trace_Type_WinDgb:
-		pTrace = new CWelsTraceWinDgb();
-		break;
+  case Wels_Trace_Type_WinDgb:
+    pTrace = new CWelsTraceWinDgb();
+    break;
 #endif
-	default:
-		break;
-	}
+  default:
+    break;
+  }
 
-	return pTrace;
+  return pTrace;
 }
 
-} // namespace WelsDec
\ No newline at end of file
+} // namespace WelsDec
--- a/codec/decoder/plus/src/welsDecoderExt.cpp
+++ b/codec/decoder/plus/src/welsDecoderExt.cpp
@@ -76,7 +76,7 @@
 
 /***************************************************************************
 *	Description:
-*			class CWelsDecoder constructor function, do initialization	and    
+*			class CWelsDecoder constructor function, do initialization	and
 *       alloc memory required
 *
 *	Input parameters: none
@@ -83,413 +83,368 @@
 *
 *	return: none
 /***************************************************************************/
-CWelsDecoder::CWelsDecoder(void_t)
-:	m_pDecContext( NULL ),
-	m_pTrace( NULL )
-{
+CWelsDecoder::CWelsDecoder (void_t)
+  :	m_pDecContext (NULL),
+    m_pTrace (NULL) {
 #ifdef OUTPUT_BIT_STREAM
-	str_t chFileName[1024] = { 0 };  //for .264
-	int iBufUsed = 0;
-	int iBufLeft = 1023;
+  str_t chFileName[1024] = { 0 };  //for .264
+  int iBufUsed = 0;
+  int iBufLeft = 1023;
 
-	str_t chFileNameSize[1024] = { 0 }; //for .len
-	int iBufUsedSize = 0;
-	int iBufLeftSize = 1023;
+  str_t chFileNameSize[1024] = { 0 }; //for .len
+  int iBufUsedSize = 0;
+  int iBufLeftSize = 1023;
 #endif//OUTPUT_BIT_STREAM 
 
-	m_pTrace = CreateWelsTrace(Wels_Trace_Type);	
+  m_pTrace = CreateWelsTrace (Wels_Trace_Type);
 
-	IWelsTrace::WelsVTrace(m_pTrace, IWelsTrace::WELS_LOG_INFO,"CWelsDecoder::CWelsDecoder() entry");
-   
-	
+  IWelsTrace::WelsVTrace (m_pTrace, IWelsTrace::WELS_LOG_INFO, "CWelsDecoder::CWelsDecoder() entry");
+
+
 #ifdef OUTPUT_BIT_STREAM
-    SWelsTime sCurTime;
+  SWelsTime sCurTime;
 
-	WelsGetTimeOfDay(&sCurTime);	
-	
-	iBufUsed      += WelsSnprintf(chFileName,  iBufLeft,  "bs_0x%p_", (void_t*)this);
-	iBufUsedSize += WelsSnprintf(chFileNameSize, iBufLeftSize, "size_0x%p_", (void_t*)this);
+  WelsGetTimeOfDay (&sCurTime);
 
-	iBufLeft -= iBufUsed;
-	if ( iBufLeft > iBufUsed )
-	{
-		iBufUsed += WelsStrftime(&chFileName[iBufUsed], iBufLeft, "%y%m%d%H%M%S", &sCurTime);
-		iBufLeft -= iBufUsed;
-	}
+  iBufUsed      += WelsSnprintf (chFileName,  iBufLeft,  "bs_0x%p_", (void_t*)this);
+  iBufUsedSize += WelsSnprintf (chFileNameSize, iBufLeftSize, "size_0x%p_", (void_t*)this);
 
-	iBufLeftSize -= iBufUsedSize;
-	if ( iBufLeftSize> iBufUsedSize )
-	{	
-		iBufUsedSize += WelsStrftime(&chFileNameSize[iBufUsedSize], iBufLeftSize, "%y%m%d%H%M%S", &sCurTime);
-		iBufLeftSize -= iBufUsedSize;
-	}
+  iBufLeft -= iBufUsed;
+  if (iBufLeft > iBufUsed) {
+    iBufUsed += WelsStrftime (&chFileName[iBufUsed], iBufLeft, "%y%m%d%H%M%S", &sCurTime);
+    iBufLeft -= iBufUsed;
+  }
 
-	if ( iBufLeft > iBufUsed )
-	{
-		iBufUsed += WelsSnprintf(&chFileName[iBufUsed], iBufLeft, ".%03.3u.264", WelsGetMillsecond(&sCurTime));
-		iBufLeft -= iBufUsed;
-	}
+  iBufLeftSize -= iBufUsedSize;
+  if (iBufLeftSize > iBufUsedSize) {
+    iBufUsedSize += WelsStrftime (&chFileNameSize[iBufUsedSize], iBufLeftSize, "%y%m%d%H%M%S", &sCurTime);
+    iBufLeftSize -= iBufUsedSize;
+  }
 
-	if ( iBufLeftSize > iBufUsedSize )
-	{
-        iBufUsedSize += WelsSnprintf(&chFileNameSize[iBufUsedSize], iBufLeftSize, ".%03.3u.len", WelsGetMillsecond(&sCurTime));
-		iBufLeftSize -= iBufUsedSize;
-	}
-	
+  if (iBufLeft > iBufUsed) {
+    iBufUsed += WelsSnprintf (&chFileName[iBufUsed], iBufLeft, ".%03.3u.264", WelsGetMillsecond (&sCurTime));
+    iBufLeft -= iBufUsed;
+  }
 
-	m_pFBS = WelsFopen(chFileName, "wb");
-	m_pFBSSize = WelsFopen(chFileNameSize, "wb");	
+  if (iBufLeftSize > iBufUsedSize) {
+    iBufUsedSize += WelsSnprintf (&chFileNameSize[iBufUsedSize], iBufLeftSize, ".%03.3u.len",
+                                  WelsGetMillsecond (&sCurTime));
+    iBufLeftSize -= iBufUsedSize;
+  }
+
+
+  m_pFBS = WelsFopen (chFileName, "wb");
+  m_pFBSSize = WelsFopen (chFileNameSize, "wb");
 #endif//OUTPUT_BIT_STREAM
-		
+
 }
 
 /***************************************************************************
 *	Description:
 *			class CWelsDecoder destructor function, destroy allocced memory
-*       
+*
 *	Input parameters: none
 *
 *	return: none
 /***************************************************************************/
-CWelsDecoder::~CWelsDecoder()
-{		
-	IWelsTrace::WelsVTrace(m_pTrace, IWelsTrace::WELS_LOG_INFO, "CWelsDecoder::~CWelsDecoder()");
+CWelsDecoder::~CWelsDecoder() {
+  IWelsTrace::WelsVTrace (m_pTrace, IWelsTrace::WELS_LOG_INFO, "CWelsDecoder::~CWelsDecoder()");
 
-	UninitDecoder();
+  UninitDecoder();
 
 #ifdef OUTPUT_BIT_STREAM
-	if ( m_pFBS )
-	{
-		WelsFclose( m_pFBS );
-		m_pFBS = NULL;
-	}
-	if ( m_pFBSSize )
-	{
-		WelsFclose( m_pFBSSize );
-		m_pFBSSize = NULL;
-	}
+  if (m_pFBS) {
+    WelsFclose (m_pFBS);
+    m_pFBS = NULL;
+  }
+  if (m_pFBSSize) {
+    WelsFclose (m_pFBSSize);
+    m_pFBSSize = NULL;
+  }
 #endif//OUTPUT_BIT_STREAM
 
-	if( NULL != m_pTrace ){
-		delete m_pTrace;
-		m_pTrace = NULL;
-	}	
+  if (NULL != m_pTrace) {
+    delete m_pTrace;
+    m_pTrace = NULL;
+  }
 }
 
-long CWelsDecoder::Initialize(void_t* pParam, const INIT_TYPE keInitType)
-{
-	if ( pParam == NULL || keInitType != INIT_TYPE_PARAMETER_BASED ){
-		IWelsTrace::WelsVTrace(m_pTrace, IWelsTrace::WELS_LOG_INFO, "CWelsDecoder::Initialize(), invalid input argument.");
-		return cmInitParaError;
-	}
+long CWelsDecoder::Initialize (void_t* pParam, const INIT_TYPE keInitType) {
+  if (pParam == NULL || keInitType != INIT_TYPE_PARAMETER_BASED) {
+    IWelsTrace::WelsVTrace (m_pTrace, IWelsTrace::WELS_LOG_INFO, "CWelsDecoder::Initialize(), invalid input argument.");
+    return cmInitParaError;
+  }
 
-	// H.264 decoder initialization,including memory allocation,then open it ready to decode
-	InitDecoder();
+  // H.264 decoder initialization,including memory allocation,then open it ready to decode
+  InitDecoder();
 
-	DecoderConfigParam( m_pDecContext, pParam );
-	
-	return cmResultSuccess;
+  DecoderConfigParam (m_pDecContext, pParam);
+
+  return cmResultSuccess;
 }
 
-long CWelsDecoder::Unintialize()
-{
-	UninitDecoder();
-	
-	return ERR_NONE;
+long CWelsDecoder::Uninitialize() {
+  UninitDecoder();
+
+  return ERR_NONE;
 }
 
-void_t CWelsDecoder::UninitDecoder( void_t )
-{
-	if ( NULL == m_pDecContext )
-		return;
-	
-	IWelsTrace::WelsVTrace(m_pTrace, IWelsTrace::WELS_LOG_INFO, "into CWelsDecoder::uninit_decoder()..");
+void_t CWelsDecoder::UninitDecoder (void_t) {
+  if (NULL == m_pDecContext)
+    return;
 
-	WelsEndDecoder( m_pDecContext );
+  IWelsTrace::WelsVTrace (m_pTrace, IWelsTrace::WELS_LOG_INFO, "into CWelsDecoder::uninit_decoder()..");
 
-	if ( NULL != m_pDecContext )
-	{
-		WelsFree( m_pDecContext, "m_pDecContext" );
+  WelsEndDecoder (m_pDecContext);
 
-		m_pDecContext	= NULL;
-	}
+  if (NULL != m_pDecContext) {
+    WelsFree (m_pDecContext, "m_pDecContext");
 
-	IWelsTrace::WelsVTrace(m_pTrace, IWelsTrace::WELS_LOG_INFO, "left CWelsDecoder::uninit_decoder()..");
+    m_pDecContext	= NULL;
+  }
+
+  IWelsTrace::WelsVTrace (m_pTrace, IWelsTrace::WELS_LOG_INFO, "left CWelsDecoder::uninit_decoder()..");
 }
 
 // the return value of this function is not suitable, it need report failure info to upper layer.
-void_t CWelsDecoder::InitDecoder( void_t )
-{
-	IWelsTrace::WelsVTrace(m_pTrace, IWelsTrace::WELS_LOG_INFO, "CWelsDecoder::init_decoder()..");	
+void_t CWelsDecoder::InitDecoder (void_t) {
+  IWelsTrace::WelsVTrace (m_pTrace, IWelsTrace::WELS_LOG_INFO, "CWelsDecoder::init_decoder()..");
 
-	m_pDecContext	= (PWelsDecoderContext)WelsMalloc( sizeof(SWelsDecoderContext), "m_pDecContext" );
-	
-	WelsInitDecoder( m_pDecContext, m_pTrace, IWelsTrace::WelsTrace );
+  m_pDecContext	= (PWelsDecoderContext)WelsMalloc (sizeof (SWelsDecoderContext), "m_pDecContext");
 
-	IWelsTrace::WelsVTrace(m_pTrace, IWelsTrace::WELS_LOG_INFO, "CWelsDecoder::init_decoder().. left");
+  WelsInitDecoder (m_pDecContext, m_pTrace, IWelsTrace::WelsTrace);
+
+  IWelsTrace::WelsVTrace (m_pTrace, IWelsTrace::WELS_LOG_INFO, "CWelsDecoder::init_decoder().. left");
 }
 
 /*
- * Set Option	
+ * Set Option
  */
-long CWelsDecoder::SetOption(DECODER_OPTION eOptID, void_t* pOption)
-{
-	int iVal = 0;
-	
-	if ( m_pDecContext == NULL )
-		return dsInitialOptExpected;
-	
-	if ( eOptID == DECODER_OPTION_DATAFORMAT ) // Set color space of decoding output frame
-	{		
-		if ( pOption == NULL )
-			return cmInitParaError;
-		
-		iVal = *((int*)pOption);	// is_rgb
-		
-		return DecoderSetCsp( m_pDecContext, iVal );
-	}
-	else if ( eOptID == DECODER_OPTION_END_OF_STREAM ) // Indicate bit-stream of the final frame to be decoded
-	{
-		if ( pOption == NULL )
-			return cmInitParaError;
-		
-		iVal	= *((int*)pOption);	// boolean value for whether enabled End Of Stream flag
+long CWelsDecoder::SetOption (DECODER_OPTION eOptID, void_t* pOption) {
+  int iVal = 0;
 
-		m_pDecContext->bEndOfStreamFlag	= iVal ? true : false;
-		
-		return cmResultSuccess;
-	}
-	else if ( eOptID == DECODER_OPTION_MODE)
-	{
-		if ( pOption == NULL )
-			return cmInitParaError;
+  if (m_pDecContext == NULL)
+    return dsInitialOptExpected;
 
-		iVal = *((int *)pOption);
+  if (eOptID == DECODER_OPTION_DATAFORMAT) { // Set color space of decoding output frame
+    if (pOption == NULL)
+      return cmInitParaError;
 
-		m_pDecContext->iSetMode = iVal;
-		if(iVal == SW_MODE)
-		{
-			m_pDecContext->iDecoderOutputProperty = BUFFER_HOST;
-		}
-		else
-		{
+    iVal = * ((int*)pOption);	// is_rgb
+
+    return DecoderSetCsp (m_pDecContext, iVal);
+  } else if (eOptID == DECODER_OPTION_END_OF_STREAM) { // Indicate bit-stream of the final frame to be decoded
+    if (pOption == NULL)
+      return cmInitParaError;
+
+    iVal	= * ((int*)pOption);	// boolean value for whether enabled End Of Stream flag
+
+    m_pDecContext->bEndOfStreamFlag	= iVal ? true : false;
+
+    return cmResultSuccess;
+  } else if (eOptID == DECODER_OPTION_MODE) {
+    if (pOption == NULL)
+      return cmInitParaError;
+
+    iVal = * ((int*)pOption);
+
+    m_pDecContext->iSetMode = iVal;
+    if (iVal == SW_MODE) {
+      m_pDecContext->iDecoderOutputProperty = BUFFER_HOST;
+    } else {
 #if !defined(__APPLE__)
-			m_pDecContext->iDecoderOutputProperty = BUFFER_DEVICE;
+      m_pDecContext->iDecoderOutputProperty = BUFFER_DEVICE;
 #else
-			m_pDecContext->iDecoderOutputProperty = BUFFER_HOST;//BUFFER_HOST;//BUFFER_DEVICE;
+      m_pDecContext->iDecoderOutputProperty = BUFFER_HOST;//BUFFER_HOST;//BUFFER_DEVICE;
 #endif
-			
-		}
-		
-		return cmResultSuccess;
-	}
-	else if ( eOptID == DECODER_OPTION_OUTPUT_PROPERTY)
-	{
-		if ( pOption == NULL)
-			return cmInitParaError;
 
-		iVal = *((int *)pOption);
-		if( m_pDecContext->iSetMode != SW_MODE)	
-			m_pDecContext->iDecoderOutputProperty = iVal;
-	}
+    }
 
+    return cmResultSuccess;
+  } else if (eOptID == DECODER_OPTION_OUTPUT_PROPERTY) {
+    if (pOption == NULL)
+      return cmInitParaError;
 
-	return cmInitParaError;
+    iVal = * ((int*)pOption);
+    if (m_pDecContext->iSetMode != SW_MODE)
+      m_pDecContext->iDecoderOutputProperty = iVal;
+  }
+
+
+  return cmInitParaError;
 }
 
 /*
  *	Get Option
  */
-long CWelsDecoder::GetOption(DECODER_OPTION eOptID, void_t* pOption)
-{
-	int iVal = 0;
-	
-	if ( m_pDecContext == NULL )
-		return cmInitExpected;
-	
-	if ( pOption == NULL )
-		return cmInitParaError;
-	
-	if ( DECODER_OPTION_DATAFORMAT == eOptID ){
-		iVal = m_pDecContext->iOutputColorFormat;
-		*((int*)pOption)	= iVal;
-		return cmResultSuccess;
-	}
-	else if ( DECODER_OPTION_END_OF_STREAM == eOptID ){
-		iVal	= m_pDecContext->bEndOfStreamFlag;
-		*((int*)pOption)	= iVal;
-		return cmResultSuccess;
-	}
+long CWelsDecoder::GetOption (DECODER_OPTION eOptID, void_t* pOption) {
+  int iVal = 0;
+
+  if (m_pDecContext == NULL)
+    return cmInitExpected;
+
+  if (pOption == NULL)
+    return cmInitParaError;
+
+  if (DECODER_OPTION_DATAFORMAT == eOptID) {
+    iVal = m_pDecContext->iOutputColorFormat;
+    * ((int*)pOption)	= iVal;
+    return cmResultSuccess;
+  } else if (DECODER_OPTION_END_OF_STREAM == eOptID) {
+    iVal	= m_pDecContext->bEndOfStreamFlag;
+    * ((int*)pOption)	= iVal;
+    return cmResultSuccess;
+  }
 #ifdef LONG_TERM_REF
-	else if ( DECODER_OPTION_IDR_PIC_ID == eOptID ){
-		iVal = m_pDecContext->uiCurIdrPicId;
-		*((int*)pOption) = iVal;
-		return cmResultSuccess;
-	}
-	else if ( DECODER_OPTION_FRAME_NUM == eOptID)
-	{
-		iVal = m_pDecContext->iFrameNum;
-		*((int*)pOption) = iVal;
-		return cmResultSuccess;
-	}
-	else if ( DECODER_OPTION_LTR_MARKING_FLAG == eOptID )
-	{
-		iVal = m_pDecContext->bCurAuContainLtrMarkSeFlag;
-		*((int*)pOption) = iVal;
-		return cmResultSuccess;
-	}
-	else if ( DECODER_OPTION_LTR_MARKED_FRAME_NUM == eOptID )
-	{
-		iVal = m_pDecContext->iFrameNumOfAuMarkedLtr;
-		*((int*)pOption) = iVal;
-		return cmResultSuccess;
-	}
+  else if (DECODER_OPTION_IDR_PIC_ID == eOptID) {
+    iVal = m_pDecContext->uiCurIdrPicId;
+    * ((int*)pOption) = iVal;
+    return cmResultSuccess;
+  } else if (DECODER_OPTION_FRAME_NUM == eOptID) {
+    iVal = m_pDecContext->iFrameNum;
+    * ((int*)pOption) = iVal;
+    return cmResultSuccess;
+  } else if (DECODER_OPTION_LTR_MARKING_FLAG == eOptID) {
+    iVal = m_pDecContext->bCurAuContainLtrMarkSeFlag;
+    * ((int*)pOption) = iVal;
+    return cmResultSuccess;
+  } else if (DECODER_OPTION_LTR_MARKED_FRAME_NUM == eOptID) {
+    iVal = m_pDecContext->iFrameNumOfAuMarkedLtr;
+    * ((int*)pOption) = iVal;
+    return cmResultSuccess;
+  }
 #endif
-	else if ( DECODER_OPTION_VCL_NAL == eOptID ) //feedback whether or not have VCL NAL in current AU
-	{
-		iVal = m_pDecContext->iFeedbackVclNalInAu;
-		*((int*)pOption) = iVal;
-		return cmResultSuccess;
-	}
-	else if ( DECODER_OPTION_TEMPORAL_ID == eOptID ) //if have VCL NAL in current AU, then feedback the temporal ID
-	{
-		iVal = m_pDecContext->iFeedbackTidInAu;
-		*((int*)pOption) = iVal;
-		return cmResultSuccess;
-	}
-	else if ( DECODER_OPTION_MODE == eOptID )
-	{
-		if ( pOption == NULL )
-			return cmInitParaError;
-		
-		iVal = m_pDecContext->iSetMode;
-		
-		*((int *)pOption) = iVal;
-		return cmResultSuccess;
-	}
-	else if ( DECODER_OPTION_DEVICE_INFO == eOptID )
-	{
-		if ( pOption == NULL )
-			return cmInitParaError;
+  else if (DECODER_OPTION_VCL_NAL == eOptID) { //feedback whether or not have VCL NAL in current AU
+    iVal = m_pDecContext->iFeedbackVclNalInAu;
+    * ((int*)pOption) = iVal;
+    return cmResultSuccess;
+  } else if (DECODER_OPTION_TEMPORAL_ID == eOptID) { //if have VCL NAL in current AU, then feedback the temporal ID
+    iVal = m_pDecContext->iFeedbackTidInAu;
+    * ((int*)pOption) = iVal;
+    return cmResultSuccess;
+  } else if (DECODER_OPTION_MODE == eOptID) {
+    if (pOption == NULL)
+      return cmInitParaError;
 
-		return cmResultSuccess;
-	}
-	
-	return cmInitParaError;
+    iVal = m_pDecContext->iSetMode;
+
+    * ((int*)pOption) = iVal;
+    return cmResultSuccess;
+  } else if (DECODER_OPTION_DEVICE_INFO == eOptID) {
+    if (pOption == NULL)
+      return cmInitParaError;
+
+    return cmResultSuccess;
+  }
+
+  return cmInitParaError;
 }
 
-DECODING_STATE CWelsDecoder::DecodeFrame(	const unsigned char* kpSrc,
-											const int kiSrcLen,	
-											void_t ** ppDst,
-											SBufferInfo* pDstInfo)
-{
-	if ( kiSrcLen > 0 && kpSrc != NULL )
-	{		
+DECODING_STATE CWelsDecoder::DecodeFrame (const unsigned char* kpSrc,
+    const int kiSrcLen,
+    void_t** ppDst,
+    SBufferInfo* pDstInfo) {
+  if (kiSrcLen > 0 && kpSrc != NULL) {
 #ifdef OUTPUT_BIT_STREAM
-		if ( m_pFBS )
-		{
-			WelsFwrite( kpSrc, sizeof(unsigned char), kiSrcLen, m_pFBS );
-			WelsFflush( m_pFBS );
-		}
-		if ( m_pFBSSize )
-		{
-			WelsFwrite( &kiSrcLen, sizeof(int), 1, m_pFBSSize );
-			WelsFflush( m_pFBSSize );
-		}
+    if (m_pFBS) {
+      WelsFwrite (kpSrc, sizeof (unsigned char), kiSrcLen, m_pFBS);
+      WelsFflush (m_pFBS);
+    }
+    if (m_pFBSSize) {
+      WelsFwrite (&kiSrcLen, sizeof (int), 1, m_pFBSSize);
+      WelsFflush (m_pFBSSize);
+    }
 #endif//OUTPUT_BIT_STREAM
-		m_pDecContext->bEndOfStreamFlag = false;
-	}
-	else  
-	{   //For application MODE, the error detection should be added for safe.
-		//But for CONSOLE MODE, when decoding LAST AU, kiSrcLen==0 && kpSrc==NULL. 
-		m_pDecContext->bEndOfStreamFlag = true;
-	}
-		
-	ppDst[0] = ppDst[1] = ppDst[2] = NULL;
-	m_pDecContext->iErrorCode             = dsErrorFree; //initialize at the starting of AU decoding.
-	m_pDecContext->iFeedbackVclNalInAu = FEEDBACK_UNKNOWN_NAL; //initialize
-	memset(pDstInfo,0,sizeof(SBufferInfo));
-	pDstInfo->eBufferProperty = (EBufferProperty)m_pDecContext->iDecoderOutputProperty;
+    m_pDecContext->bEndOfStreamFlag = false;
+  } else {
+    //For application MODE, the error detection should be added for safe.
+    //But for CONSOLE MODE, when decoding LAST AU, kiSrcLen==0 && kpSrc==NULL.
+    m_pDecContext->bEndOfStreamFlag = true;
+  }
 
+  ppDst[0] = ppDst[1] = ppDst[2] = NULL;
+  m_pDecContext->iErrorCode             = dsErrorFree; //initialize at the starting of AU decoding.
+  m_pDecContext->iFeedbackVclNalInAu = FEEDBACK_UNKNOWN_NAL; //initialize
+  memset (pDstInfo, 0, sizeof (SBufferInfo));
+  pDstInfo->eBufferProperty = (EBufferProperty)m_pDecContext->iDecoderOutputProperty;
+
 #ifdef LONG_TERM_REF
-	m_pDecContext->bReferenceLostAtT0Flag       = false; //initialize for LTR
-	m_pDecContext->bCurAuContainLtrMarkSeFlag = false;
-	m_pDecContext->iFrameNumOfAuMarkedLtr      = 0;
-	m_pDecContext->iFrameNum                       = -1; //initialize
+  m_pDecContext->bReferenceLostAtT0Flag       = false; //initialize for LTR
+  m_pDecContext->bCurAuContainLtrMarkSeFlag = false;
+  m_pDecContext->iFrameNumOfAuMarkedLtr      = 0;
+  m_pDecContext->iFrameNum                       = -1; //initialize
 #endif
 
-	m_pDecContext->iFeedbackTidInAu             = -1; //initialize
-	
-	WelsDecodeBs( m_pDecContext, kpSrc, kiSrcLen, (unsigned char**)ppDst, pDstInfo); //iErrorCode has been modified in this function
-	
-	pDstInfo->eWorkMode = (EDecodeMode)m_pDecContext->iDecoderMode;
+  m_pDecContext->iFeedbackTidInAu             = -1; //initialize
 
-	if ( m_pDecContext->iErrorCode )
-	{		
-		ENalUnitType eNalType = NAL_UNIT_UNSPEC_0;	//for NBR, IDR frames are expected to decode as followed if error decoding an IDR currently		
+  WelsDecodeBs (m_pDecContext, kpSrc, kiSrcLen, (unsigned char**)ppDst,
+                pDstInfo); //iErrorCode has been modified in this function
 
-		eNalType	= m_pDecContext->sCurNalHead.eNalUnitType;
-		
-		//for AVC bitstream (excluding AVC with temporal scalability, including TP), as long as error occur, SHOULD notify upper layer key frame loss.
-		if ( (IS_PARAM_SETS_NALS(eNalType) || NAL_UNIT_CODED_SLICE_IDR == eNalType) ||
-			(VIDEO_BITSTREAM_AVC == m_pDecContext->eVideoType) )
-		{
+  pDstInfo->eWorkMode = (EDecodeMode)m_pDecContext->iDecoderMode;
+
+  if (m_pDecContext->iErrorCode) {
+    ENalUnitType eNalType =
+      NAL_UNIT_UNSPEC_0;	//for NBR, IDR frames are expected to decode as followed if error decoding an IDR currently
+
+    eNalType	= m_pDecContext->sCurNalHead.eNalUnitType;
+
+    //for AVC bitstream (excluding AVC with temporal scalability, including TP), as long as error occur, SHOULD notify upper layer key frame loss.
+    if ((IS_PARAM_SETS_NALS (eNalType) || NAL_UNIT_CODED_SLICE_IDR == eNalType) ||
+        (VIDEO_BITSTREAM_AVC == m_pDecContext->eVideoType)) {
 #ifdef LONG_TERM_REF
-			m_pDecContext->bParamSetsLostFlag = true;
+      m_pDecContext->bParamSetsLostFlag = true;
 #else
-			m_pDecContext->bReferenceLostAtT0Flag = true;
+      m_pDecContext->bReferenceLostAtT0Flag = true;
 #endif
-			ResetParameterSetsState( m_pDecContext ); //initial SPS&PPS ready flag
-		}		
+      ResetParameterSetsState (m_pDecContext);  //initial SPS&PPS ready flag
+    }
 
-		IWelsTrace::WelsVTrace(m_pTrace, IWelsTrace::WELS_LOG_INFO, "decode failed, failure type:%d \n", m_pDecContext->iErrorCode);
-		return (DECODING_STATE)m_pDecContext->iErrorCode;
-	}
+    IWelsTrace::WelsVTrace (m_pTrace, IWelsTrace::WELS_LOG_INFO, "decode failed, failure type:%d \n",
+                            m_pDecContext->iErrorCode);
+    return (DECODING_STATE)m_pDecContext->iErrorCode;
+  }
 
-	return dsErrorFree;
+  return dsErrorFree;
 }
 
-DECODING_STATE CWelsDecoder::DecodeFrame(	const unsigned char* kpSrc,
-										   const int kiSrcLen,	
-										   unsigned char** ppDst,
-										   int* pStride,
-										   int& iWidth,
-										   int& iHeight )
-{
-	DECODING_STATE eDecState = dsErrorFree;
-	SBufferInfo    DstInfo;
+DECODING_STATE CWelsDecoder::DecodeFrame (const unsigned char* kpSrc,
+    const int kiSrcLen,
+    unsigned char** ppDst,
+    int* pStride,
+    int& iWidth,
+    int& iHeight) {
+  DECODING_STATE eDecState = dsErrorFree;
+  SBufferInfo    DstInfo;
 
-	memset(&DstInfo, 0, sizeof(SBufferInfo));
-	DstInfo.UsrData.sSystemBuffer.iStride[0] = pStride[0];
-	DstInfo.UsrData.sSystemBuffer.iStride[1] = pStride[1];
-	DstInfo.UsrData.sSystemBuffer.iWidth = iWidth;
-	DstInfo.UsrData.sSystemBuffer.iHeight = iHeight;
-	DstInfo.eBufferProperty = BUFFER_HOST;
+  memset (&DstInfo, 0, sizeof (SBufferInfo));
+  DstInfo.UsrData.sSystemBuffer.iStride[0] = pStride[0];
+  DstInfo.UsrData.sSystemBuffer.iStride[1] = pStride[1];
+  DstInfo.UsrData.sSystemBuffer.iWidth = iWidth;
+  DstInfo.UsrData.sSystemBuffer.iHeight = iHeight;
+  DstInfo.eBufferProperty = BUFFER_HOST;
 
-	eDecState = DecodeFrame(kpSrc, kiSrcLen, (void_t **)ppDst, &DstInfo);
-	if (eDecState == dsErrorFree)
-	{
-		pStride[0] = DstInfo.UsrData.sSystemBuffer.iStride[0];
-		pStride[1] = DstInfo.UsrData.sSystemBuffer.iStride[1];
-		iWidth     = DstInfo.UsrData.sSystemBuffer.iWidth;
-		iHeight    = DstInfo.UsrData.sSystemBuffer.iHeight;
-	}
+  eDecState = DecodeFrame (kpSrc, kiSrcLen, (void_t**)ppDst, &DstInfo);
+  if (eDecState == dsErrorFree) {
+    pStride[0] = DstInfo.UsrData.sSystemBuffer.iStride[0];
+    pStride[1] = DstInfo.UsrData.sSystemBuffer.iStride[1];
+    iWidth     = DstInfo.UsrData.sSystemBuffer.iWidth;
+    iHeight    = DstInfo.UsrData.sSystemBuffer.iHeight;
+  }
 
-	return eDecState;
+  return eDecState;
 }
 
-DECODING_STATE CWelsDecoder::DecodeFrameEx(const unsigned char * kpSrc,
-		                                  const int kiSrcLen,
-		                                  unsigned char * pDst,
-										  int iDstStride,
-		                                  int & iDstLen,
-		                                  int & iWidth,
-		                                  int & iHeight,
-		                                  int & iColorFormat	)
-{
-	DECODING_STATE	 state = dsErrorFree;
+DECODING_STATE CWelsDecoder::DecodeFrameEx (const unsigned char* kpSrc,
+    const int kiSrcLen,
+    unsigned char* pDst,
+    int iDstStride,
+    int& iDstLen,
+    int& iWidth,
+    int& iHeight,
+    int& iColorFormat) {
+  DECODING_STATE	 state = dsErrorFree;
 
-    return state;
+  return state;
 }
 
 
@@ -504,29 +459,27 @@
 *	CreateDecoder
 *	@return:	success in return 0, otherwise failed.
 */
-long CreateDecoder( ISVCDecoder** ppDecoder )
-{
+long CreateDecoder (ISVCDecoder** ppDecoder) {
 
-	if ( NULL == ppDecoder ){		
-		return ERR_INVALID_PARAMETERS;
-	}
+  if (NULL == ppDecoder) {
+    return ERR_INVALID_PARAMETERS;
+  }
 
-	*ppDecoder	= new CWelsDecoder();
+  *ppDecoder	= new CWelsDecoder();
 
-	if ( NULL == *ppDecoder ){		
-		return ERR_MALLOC_FAILED;
-	}	
+  if (NULL == *ppDecoder) {
+    return ERR_MALLOC_FAILED;
+  }
 
-	return ERR_NONE;
+  return ERR_NONE;
 }
 
 /*
 *	DestroyDecoder
 */
-void_t DestroyDecoder( ISVCDecoder* pDecoder )
-{	
-	if ( NULL != pDecoder ){
-		delete (CWelsDecoder *)pDecoder;
-		pDecoder = NULL;
-	}
+void_t DestroyDecoder (ISVCDecoder* pDecoder) {
+  if (NULL != pDecoder) {
+    delete (CWelsDecoder*)pDecoder;
+    pDecoder = NULL;
+  }
 }
--- a/codec/encoder/core/asm/asm_inc.asm
+++ b/codec/encoder/core/asm/asm_inc.asm
@@ -43,7 +43,7 @@
 ; Options, for DEBUG
 ;***********************************************************************
 
-%if 1 
+%if 1
 	%define MOVDQ movdqa
 %else
 	%define MOVDQ movdqu
@@ -58,7 +58,7 @@
 BITS 32
 
 ;***********************************************************************
-; Macros 
+; Macros
 ;***********************************************************************
 
 %macro WELS_EXTERN 1
@@ -74,7 +74,7 @@
 	pxor        %2, %2
     psubw       %2, %1
     pmaxsw      %1, %2
-%endmacro 	
+%endmacro
 
 %macro MMX_XSwap  4
     movq		%4, %2
@@ -105,7 +105,7 @@
     SSE2_XSawp qdq, %5, %2, %3
 %endmacro
 
-;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4 
+;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4
 %macro SSE2_TransTwo4x4W 5
     SSE2_XSawp wd,  %1, %2, %5
     SSE2_XSawp wd,  %3, %4, %2
@@ -125,26 +125,26 @@
 	movdqa	%6, %9
 	movdqa	%9, %4
 	SSE2_XSawp bw,  %7, %6, %4
-	
-	SSE2_XSawp wd,  %1, %3, %6	
+
+	SSE2_XSawp wd,  %1, %3, %6
 	SSE2_XSawp wd,  %8, %2, %3
 	SSE2_XSawp wd,  %5, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %3	
+	movdqa	%9, %3
 	SSE2_XSawp wd,  %7, %4, %3
-	
-	SSE2_XSawp dq,  %1, %5, %4	
+
+	SSE2_XSawp dq,  %1, %5, %4
 	SSE2_XSawp dq,  %6, %2, %5
 	SSE2_XSawp dq,  %8, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %5		
+	movdqa	%9, %5
 	SSE2_XSawp dq,  %7, %3, %5
-	
+
 	SSE2_XSawp qdq,  %1, %8, %3
 	SSE2_XSawp qdq,  %4, %2, %8
 	SSE2_XSawp qdq,  %6, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %1		
+	movdqa	%9, %1
 	SSE2_XSawp qdq,  %7, %5, %1
 	movdqa	%5, %9
 %endmacro
@@ -170,9 +170,9 @@
 %macro butterfly_1to16_sse	3	; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
 	mov %3h, %3l
 	movd %1, e%3x		; i.e, 1% = eax (=b0)
-	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0	
-	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0	
-%endmacro  
+	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0
+	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
+%endmacro
 
 ;copy a dw into a xmm for 8 times
 %macro  SSE2_Copy8Times 2
--- a/codec/encoder/core/asm/coeff.asm
+++ b/codec/encoder/core/asm/coeff.asm
@@ -318,9 +318,9 @@
 SECTION .text
 
 
-	
+
 ;***********************************************************************
-;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx); 
+;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
 ;***********************************************************************
 WELS_EXTERN CavlcParamCal_sse2
 CavlcParamCal_sse2:
@@ -327,16 +327,16 @@
 	push ebx
 	push edi
 	push esi
-	
+
 	mov			eax,	[esp+16]	;coffLevel
 	mov			edi,	[esp+24]	;Level
 	mov			ebx,	[esp+32]	;endIdx
 	cmp			ebx,	3
-	jne			.Level16	
+	jne			.Level16
 	pxor		xmm1,	xmm1
 	movq		xmm0,	[eax]	; removed QWORD
-	jmp			.Cal_begin		
-.Level16:	
+	jmp			.Cal_begin
+.Level16:
 	movdqa		xmm0,	[eax]
 	movdqa		xmm1,	[eax+16]
 .Cal_begin:
@@ -354,7 +354,7 @@
 	pcmpeqw		xmm7,	xmm7	;generate -1
     mov			ebx,	0xff
     ;pinsrw		xmm6,	ebx,	3
-   
+
     mov       bl,   dh
 
 	lea       ebx,  [byte_1pos_table+8*ebx]
@@ -362,7 +362,7 @@
 	pextrw    ecx,  xmm0, 3
 	shr       ecx,  8
     mov       dh,   cl
- 
+
 .loopHighFind0:
     cmp       ecx,   0
     je        .loopHighFind0End
@@ -372,7 +372,7 @@
     add       esi, 8
     mov       esi, [eax+2*esi]
     mov       [edi], si
-    add       edi,   2 
+    add       edi,   2
     ;add       ebx,   1
     inc		  ebx
     dec       ecx
@@ -403,8 +403,8 @@
 	;and       edx, 0xff
 	movzx	  edx,	byte [ebx]
 	mov       edx, [eax+2*edx]
-	mov       [edi], dx 
-	add       edi,   2 
+	mov       [edi], dx
+	add       edi,   2
 	;add       ebx,   1
 	inc		  ebx
     dec       esi
@@ -436,8 +436,8 @@
     psllq    xmm0, xmm3
     psrlq    xmm0, xmm3
     movdqa   xmm4, xmm1
-    psllq    xmm1, xmm2 
-    psrlq    xmm4, xmm3 
+    psllq    xmm1, xmm2
+    psrlq    xmm4, xmm3
     punpcklqdq xmm1, xmm4
     por      xmm0,  xmm1
 
--- a/codec/encoder/core/asm/cpuid.asm
+++ b/codec/encoder/core/asm/cpuid.asm
@@ -84,12 +84,12 @@
 ;   void WelsCPUId( int32_t uiIndex, int32_t *pFeatureA, int32_t *pFeatureB, int32_t *pFeatureC, int32_t *pFeatureD )
 ;****************************************************************************************************
 WelsCPUId:
-	push	ebx	
+	push	ebx
 	push	edi
-	
+
 	mov     eax, [esp+12]	; operating index
     cpuid					; cpuid
-	
+
 	; processing various information return
 	mov     edi, [esp+16]
     mov     [edi], eax
@@ -100,10 +100,10 @@
     mov     edi, [esp+28]
     mov     [edi], edx
 
-	pop		edi	
+	pop		edi
     pop     ebx
 	ret
-	
+
 WELS_EXTERN WelsCPUSupportAVX
 ; need call after cpuid=1 and eax, ecx flag got then
 ALIGN 16
@@ -139,7 +139,7 @@
 WelsCPUSupportFMA:
 	mov eax, [esp+4]
 	mov ecx, [esp+8]
-	
+
 	; refer to detection of FMA addressed in INTEL AVX manual document
 	and ecx, 018001000H
 	cmp ecx, 018001000H		; check OSXSAVE, AVX, FMA feature flags
@@ -153,7 +153,7 @@
 	mov eax, 1
 	ret
 fma_not_supported:
-	mov eax, 0	
+	mov eax, 0
 	ret
 
 WELS_EXTERN WelsEmms
--- a/codec/encoder/core/asm/dct.asm
+++ b/codec/encoder/core/asm/dct.asm
@@ -48,26 +48,26 @@
 
 ;***********************************************************************
 ; Constant
-;***********************************************************************		
-			
+;***********************************************************************
+
 align 16
-SSE2_DeQuant8 dw  10, 13, 10, 13, 13, 16, 13, 16, 
+SSE2_DeQuant8 dw  10, 13, 10, 13, 13, 16, 13, 16,
 			dw	10, 13, 10, 13, 13, 16, 13, 16,
-            dw  11, 14, 11, 14, 14, 18, 14, 18, 
+            dw  11, 14, 11, 14, 14, 18, 14, 18,
 			dw  11, 14, 11, 14, 14, 18, 14, 18,
-			dw  13, 16, 13, 16, 16, 20, 16, 20, 
 			dw  13, 16, 13, 16, 16, 20, 16, 20,
-            dw  14, 18, 14, 18, 18, 23, 18, 23, 
+			dw  13, 16, 13, 16, 16, 20, 16, 20,
+            dw  14, 18, 14, 18, 18, 23, 18, 23,
 			dw  14, 18, 14, 18, 18, 23, 18, 23,
-			dw  16, 20, 16, 20, 20, 25, 20, 25, 
 			dw  16, 20, 16, 20, 20, 25, 20, 25,
-            dw  18, 23, 18, 23, 23, 29, 23, 29, 
+			dw  16, 20, 16, 20, 20, 25, 20, 25,
+            dw  18, 23, 18, 23, 23, 29, 23, 29,
 			dw  18, 23, 18, 23, 23, 29, 23, 29
-			
 
+
 ;***********************************************************************
 ; MMX functions
-;***********************************************************************			
+;***********************************************************************
 
 %macro MMX_LoadDiff4P 5
 	movd        %1, [%3]
@@ -112,7 +112,7 @@
     MMX_SumSub		%4, %1, %6
     MMX_SumSub		%3, %2, %6
     MMX_SumSub		%3, %4, %6
-    MMX_SumSubMul2  %1, %2, %5  
+    MMX_SumSubMul2  %1, %2, %5
 %endmacro
 
 %macro MMX_IDCT 6
@@ -145,13 +145,13 @@
     mov     edx, [esp+24]   ; i_pix2
 
     WELS_Zero    mm7
-    
+
     MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, eax, ebx, ecx, edx, mm0, mm7
 
-    MMX_DCT			mm1, mm2, mm3 ,mm4, mm5, mm6           
+    MMX_DCT			mm1, mm2, mm3 ,mm4, mm5, mm6
     MMX_Trans4x4W	mm3, mm1, mm4, mm5, mm2
-    
-    MMX_DCT			mm3, mm5, mm2 ,mm4, mm1, mm6                    
+
+    MMX_DCT			mm3, mm5, mm2 ,mm4, mm1, mm6
     MMX_Trans4x4W	mm2, mm3, mm4, mm1, mm5
 
     mov     eax, [esp+ 8]   ; pDct
@@ -178,15 +178,15 @@
 %define     i_pred      esp+pushsize+16
 %define     pDct        esp+pushsize+20
 
-	mov     eax, [pDct   ] 
+	mov     eax, [pDct   ]
     movq    mm0, [eax+ 0]
     movq    mm1, [eax+ 8]
     movq    mm2, [eax+16]
     movq    mm3, [eax+24]
-    mov     edx, [p_dst ]   
-    mov     ecx, [i_dst ]   
+    mov     edx, [p_dst ]
+    mov     ecx, [i_dst ]
     mov     eax, [p_pred]
-    mov     ebx, [i_pred]     
+    mov     ebx, [i_pred]
 
 	MMX_Trans4x4W		mm0, mm1, mm2, mm3, mm4
 	MMX_IDCT			mm1, mm2, mm3, mm4, mm0, mm6
@@ -195,7 +195,7 @@
 
     WELS_Zero			mm7
     WELS_DW32			mm6
-    
+
     MMX_StoreDiff4P		mm3, mm0, mm6, mm7, [edx], [eax]
     MMX_StoreDiff4P		mm4, mm0, mm6, mm7, [edx+ecx], [eax+ebx]
     lea     edx, [edx+2*ecx]
@@ -202,7 +202,7 @@
     lea     eax, [eax+2*ebx]
     MMX_StoreDiff4P		mm1, mm0, mm6, mm7, [edx], [eax]
     MMX_StoreDiff4P		mm2, mm0, mm6, mm7, [edx+ecx], [eax+ebx]
-    
+
 	WELSEMMS
 %undef	pushsize
 %undef  p_dst
@@ -220,17 +220,17 @@
 %macro SSE2_Store4x8p 6
 	SSE2_XSawp qdq, %2, %3, %6
 	SSE2_XSawp qdq, %4, %5, %3
-	MOVDQ    [%1+0x00], %2 
-	MOVDQ    [%1+0x10], %4 
-	MOVDQ    [%1+0x20], %6 
-	MOVDQ    [%1+0x30], %3 
+	MOVDQ    [%1+0x00], %2
+	MOVDQ    [%1+0x10], %4
+	MOVDQ    [%1+0x20], %6
+	MOVDQ    [%1+0x30], %3
 %endmacro
 
 %macro SSE2_Load4x8p 6
 	MOVDQ    %2,	[%1+0x00]
-	MOVDQ    %4,	[%1+0x10]  
-	MOVDQ    %6,	[%1+0x20]  
-	MOVDQ    %3,	[%1+0x30]  
+	MOVDQ    %4,	[%1+0x10]
+	MOVDQ    %6,	[%1+0x20]
+	MOVDQ    %3,	[%1+0x30]
 	SSE2_XSawp qdq, %4, %3, %5
 	SSE2_XSawp qdq, %2, %6, %3
 %endmacro
@@ -271,40 +271,40 @@
 %endmacro
 
 %macro SSE2_Load8DC	6
-	movdqa		%1,		%6		; %1 = dc0 dc1	
+	movdqa		%1,		%6		; %1 = dc0 dc1
 	paddw       %1,		%5
-    psraw       %1,		$6		; (dc + 32) >> 6	
-    
+    psraw       %1,		$6		; (dc + 32) >> 6
+
     movdqa		%2,		%1
     psrldq		%2,		4
  	punpcklwd	%2,		%2
-	punpckldq	%2,		%2		; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3	   
+	punpckldq	%2,		%2		; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
 
     movdqa		%3,		%1
     psrldq		%3,		8
  	punpcklwd	%3,		%3
 	punpckldq	%3,		%3		; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5
-	
+
 	movdqa		%4,		%1
     psrldq		%4,		12
  	punpcklwd	%4,		%4
 	punpckldq	%4,		%4		; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7
-	    	
+
 	punpcklwd	%1,		%1
-	punpckldq	%1,		%1		; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1	
+	punpckldq	%1,		%1		; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
 %endmacro
 
 %macro SSE2_DCT 6
-    SSE2_SumSub		%6, %3,	%5						
-	SSE2_SumSub		%1, %2, %5																		
-	SSE2_SumSub		%3, %2, %5					
-	SSE2_SumSubMul2		%6, %1, %4               	
+    SSE2_SumSub		%6, %3,	%5
+	SSE2_SumSub		%1, %2, %5
+	SSE2_SumSub		%3, %2, %5
+	SSE2_SumSubMul2		%6, %1, %4
 %endmacro
 
 %macro SSE2_IDCT 7
-    SSE2_SumSub       %7, %2, %6					
-    SSE2_SumSubDiv2     %1, %3, %5, %4              
-    SSE2_SumSub	     %2, %1, %5 
+    SSE2_SumSub       %7, %2, %6
+    SSE2_SumSubDiv2     %1, %3, %5, %4
+    SSE2_SumSub	     %2, %1, %5
     SSE2_SumSub		 %7, %4, %5
 %endmacro
 
@@ -316,12 +316,12 @@
 WelsDctFourT4_sse2:
     push    ebx
     push	esi
-    mov		esi, [esp+12] 
+    mov		esi, [esp+12]
     mov     eax, [esp+16]   ; pix1
     mov     ebx, [esp+20]   ; i_pix1
     mov     ecx, [esp+24]   ; pix2
-    mov     edx, [esp+28]   ; i_pix2    
-    
+    mov     edx, [esp+28]   ; i_pix2
+
     pxor    xmm7, xmm7
 
 	;Load 4x8
@@ -331,33 +331,33 @@
 	lea		ecx, [ecx + 2 * edx]
 	SSE2_LoadDiff8P    xmm2, xmm6, xmm7, [eax], [ecx]
     SSE2_LoadDiff8P    xmm3, xmm6, xmm7, [eax+ebx], [ecx+edx]
-	
+
 	SSE2_DCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
 	SSE2_TransTwo4x4W	xmm2, xmm0, xmm3, xmm4, xmm1
-	SSE2_DCT			xmm0, xmm4, xmm1, xmm3, xmm5, xmm2             		
+	SSE2_DCT			xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
 	SSE2_TransTwo4x4W	xmm4, xmm2, xmm1, xmm3, xmm0
-	
-	SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5  
-	
+
+	SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5
+
 	lea		eax, [eax + 2 * ebx]
 	lea		ecx, [ecx + 2 * edx]
-    
+
 	;Load 4x8
 	SSE2_LoadDiff8P    xmm0, xmm6, xmm7, [eax      ], [ecx    ]
     SSE2_LoadDiff8P    xmm1, xmm6, xmm7, [eax+ebx  ], [ecx+edx]
 	lea		eax, [eax + 2 * ebx]
-	lea		ecx, [ecx + 2 * edx]	
+	lea		ecx, [ecx + 2 * edx]
     SSE2_LoadDiff8P    xmm2, xmm6, xmm7, [eax], [ecx]
     SSE2_LoadDiff8P    xmm3, xmm6, xmm7, [eax+ebx], [ecx+edx]
-	
+
 	SSE2_DCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
-	SSE2_TransTwo4x4W	xmm2, xmm0, xmm3, xmm4, xmm1		
-    SSE2_DCT			xmm0, xmm4, xmm1, xmm3, xmm5, xmm2              		
+	SSE2_TransTwo4x4W	xmm2, xmm0, xmm3, xmm4, xmm1
+    SSE2_DCT			xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
 	SSE2_TransTwo4x4W	xmm4, xmm2, xmm1, xmm3, xmm0
-	
+
 	lea		esi, [esi+64]
-	SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5 
-	
+	SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5
+
     pop esi
     pop ebx
     ret
@@ -377,21 +377,21 @@
 %define	pushsize	8
     push		ebx
     push		esi
-    
-    mov			eax,		[rec]   
-    mov			ebx,		[stride]   
-    mov			ecx,		[pred]  
-    mov			edx,		[pred_stride]   
-    mov			esi,		[rs]  
 
+    mov			eax,		[rec]
+    mov			ebx,		[stride]
+    mov			ecx,		[pred]
+    mov			edx,		[pred_stride]
+    mov			esi,		[rs]
+
 	;Load 4x8
-	SSE2_Load4x8p  esi, xmm0, xmm1, xmm4, xmm2, xmm5 	
-	
+	SSE2_Load4x8p  esi, xmm0, xmm1, xmm4, xmm2, xmm5
+
 	SSE2_TransTwo4x4W	xmm0, xmm1, xmm4, xmm2, xmm3
   	SSE2_IDCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
     SSE2_TransTwo4x4W	xmm1, xmm4, xmm0, xmm2, xmm3
     SSE2_IDCT			xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
-    
+
 	WELS_Zero			xmm7
     WELS_DW32			xmm6
 
@@ -398,41 +398,41 @@
 	SSE2_StoreDiff8p   xmm4, xmm5, xmm6, xmm7, [eax		],	[ecx]
 	SSE2_StoreDiff8p   xmm0, xmm5, xmm6, xmm7, [eax + ebx	],	[ecx + edx]
 	lea		eax, [eax + 2 * ebx]
-	lea		ecx, [ecx + 2 * edx]	
+	lea		ecx, [ecx + 2 * edx]
 	SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [eax],			[ecx]
 	SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [eax + ebx	],	[ecx + edx]
-   
+
     add		esi, 64
 	lea		eax, [eax + 2 * ebx]
 	lea		ecx, [ecx + 2 * edx]
-   	SSE2_Load4x8p  esi, xmm0, xmm1, xmm4, xmm2, xmm5 	
-	
+   	SSE2_Load4x8p  esi, xmm0, xmm1, xmm4, xmm2, xmm5
+
 	SSE2_TransTwo4x4W   xmm0, xmm1, xmm4, xmm2, xmm3
-	SSE2_IDCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0           
+	SSE2_IDCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
     SSE2_TransTwo4x4W   xmm1, xmm4, xmm0, xmm2, xmm3
 	SSE2_IDCT			xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
 
 	WELS_Zero			xmm7
     WELS_DW32			xmm6
-    
+
 	SSE2_StoreDiff8p   xmm4, xmm5, xmm6, xmm7, [eax		],	[ecx]
 	SSE2_StoreDiff8p   xmm0, xmm5, xmm6, xmm7, [eax + ebx	],	[ecx + edx]
 	lea		eax, [eax + 2 * ebx]
-	lea		ecx, [ecx + 2 * edx]	
+	lea		ecx, [ecx + 2 * edx]
 	SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [eax],			[ecx]
-	SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [eax + ebx],	[ecx + edx] 
+	SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [eax + ebx],	[ecx + edx]
 
     pop		esi
     pop		ebx
     ret
-    
+
   %macro SSE2_StoreDiff4x8p 8
    	SSE2_StoreDiff8p    %1, %3, %4, [%5],			[%6]
-	SSE2_StoreDiff8p    %1, %3, %4, [%5 + %7],		[%6 + %8]	
+	SSE2_StoreDiff8p    %1, %3, %4, [%5 + %7],		[%6 + %8]
 	SSE2_StoreDiff8p    %2, %3, %4, [%5 + 8],		[%6 + 8]
-	SSE2_StoreDiff8p    %2, %3, %4, [%5 + %7 + 8],	[%6 + %8 + 8]	
+	SSE2_StoreDiff8p    %2, %3, %4, [%5 + %7 + 8],	[%6 + %8 + 8]
  %endmacro
- 
+
  ;***********************************************************************
 ; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc)
 ;***********************************************************************
@@ -443,47 +443,47 @@
 WelsIDctRecI16x16Dc_sse2:
     push		esi
     push		edi
-    
+
 	mov			ecx,		[luma_dc]
-    mov			eax,		[rec]	
-    mov			edx,		[stride]	
-    mov			esi,		[pred]	
-    mov			edi,		[pred_stride]	    	
+    mov			eax,		[rec]
+    mov			edx,		[stride]
+    mov			esi,		[pred]
+    mov			edi,		[pred_stride]
 	pxor		xmm7,		xmm7
     WELS_DW32	xmm6
-    
+
 	SSE2_Load8DC			xmm0, xmm1, xmm2, xmm3, xmm6, [ecx]
 	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
-	
+
 	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]	
-	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi	
-	  
+	lea			esi,		[esi + 2 * edi]
+	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
+
 	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]	
+	lea			esi,		[esi + 2 * edi]
 	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
-	
+
 	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]		
+	lea			esi,		[esi + 2 * edi]
 	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
-	
-	SSE2_Load8DC			xmm0, xmm1, xmm2, xmm3, xmm6, [ecx + 16]	
+
+	SSE2_Load8DC			xmm0, xmm1, xmm2, xmm3, xmm6, [ecx + 16]
 	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]		
+	lea			esi,		[esi + 2 * edi]
 	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
-	
+
 	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]	
-	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi	
-	  
+	lea			esi,		[esi + 2 * edi]
+	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
+
 	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]	 
+	lea			esi,		[esi + 2 * edi]
 	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
-	
+
 	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]		
+	lea			esi,		[esi + 2 * edi]
 	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
-		
+
     pop		edi
     pop		esi
     ret
@@ -517,7 +517,7 @@
 	punpckldq	%3,			%4
 	punpcklqdq	%1,			%3
  %endmacro
- 
+
 ;***********************************************************************
 ;void WelsHadamardT4Dc_sse2( int16_t *luma_dc, int16_t *pDct)
 ;***********************************************************************
@@ -525,23 +525,23 @@
 WelsHadamardT4Dc_sse2:
 		mov			eax,		[esp + 4]	; luma_dc
 		mov			ecx,		[esp + 8]	; pDct
-		
+
 		SSE2_Load4Col	    xmm1, xmm5, xmm6, xmm0, ecx
 		SSE2_Load4Col	    xmm2, xmm5, xmm6, xmm0, ecx + 0x40
 		SSE2_Load4Col	    xmm3, xmm5, xmm6, xmm0, ecx + 0x100
 		SSE2_Load4Col	    xmm4, xmm5, xmm6, xmm0, ecx + 0x140
-		
+
 		SSE2_SumSubD		xmm1, xmm2, xmm7
 		SSE2_SumSubD		xmm3, xmm4, xmm7
 		SSE2_SumSubD		xmm2, xmm4, xmm7
-		SSE2_SumSubD		xmm1, xmm3, xmm7	
+		SSE2_SumSubD		xmm1, xmm3, xmm7
 
 		SSE2_Trans4x4D		xmm4, xmm2, xmm1, xmm3, xmm5	; pOut: xmm4,xmm3,xmm5,xmm1
-	
+
 		SSE2_SumSubD		xmm4, xmm3, xmm7
 		SSE2_SumSubD		xmm5, xmm1, xmm7
 
-		WELS_DD1 xmm6      
+		WELS_DD1 xmm6
 		SSE2_SumSubDiv2D	xmm3, xmm1, xmm6, xmm0			; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2
 		SSE2_SumSubDiv2D	xmm4, xmm5, xmm6, xmm1			; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2
         SSE2_Trans4x4D		xmm3, xmm0, xmm1, xmm4, xmm2	; pOut: xmm3,xmm4,xmm2,xmm1
@@ -550,7 +550,7 @@
 		packssdw	xmm2,	xmm1
 		movdqa	[eax+ 0],   xmm3
 		movdqa	[eax+16],   xmm2
-		
-		ret	
+
+		ret
 
 
--- a/codec/encoder/core/asm/deblock.asm
+++ b/codec/encoder/core/asm/deblock.asm
@@ -1,2113 +1,2113 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  deblock.asm
-;*
-;*  Abstract
-;*      edge loop
-;*
-;*  History
-;*      08/07/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-%ifdef FORMAT_COFF
-SECTION .rodata pData
-%else
-SECTION .rodata align=16
-%endif
-
-SECTION .text
-
-;********************************************************************************
-;  void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-;                             int32_t iAlpha, int32_t iBeta)
-;********************************************************************************
-WELS_EXTERN   DeblockChromaEq4V_sse2
-
-ALIGN  16
-DeblockChromaEq4V_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h 
-  sub         esp,68h 
-  mov         edx,[ebp+10h]      ;  iStride
-  mov         eax,[ebp+8]        ;  pPixCb
-  mov         ecx,[ebp+0Ch]      ;  pPixCr
-  movq        xmm4,[ecx] 
-  movq        xmm5,[edx+ecx] 
-  push        esi  
-  push        edi  
-  lea         esi,[edx+edx] 
-  mov         edi,eax 
-  sub         edi,esi 
-  movq        xmm1,[edi] 
-  mov         edi,ecx 
-  sub         edi,esi 
-  movq        xmm2,[edi] 
-  punpcklqdq  xmm1,xmm2 
-  mov         esi,eax 
-  sub         esi,edx 
-  movq        xmm2,[esi] 
-  mov         edi,ecx 
-  sub         edi,edx 
-  movq        xmm3,[edi] 
-  punpcklqdq  xmm2,xmm3 
-  movq        xmm3,[eax] 
-  punpcklqdq  xmm3,xmm4 
-  movq        xmm4,[edx+eax] 
-  mov       edx, [ebp + 14h] 
-  punpcklqdq  xmm4,xmm5 
-  movd        xmm5,edx 
-  mov       edx, [ebp + 18h] 
-  pxor        xmm0,xmm0 
-  movdqa      xmm6,xmm5 
-  punpcklwd   xmm6,xmm5 
-  pshufd      xmm5,xmm6,0 
-  movd        xmm6,edx 
-  movdqa      xmm7,xmm6 
-  punpcklwd   xmm7,xmm6 
-  pshufd      xmm6,xmm7,0 
-  movdqa      xmm7,xmm1 
-  punpckhbw   xmm1,xmm0 
-  punpcklbw   xmm7,xmm0 
-  movdqa      [esp+40h],xmm1 
-  movdqa      [esp+60h],xmm7 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm7,xmm0 
-  movdqa      [esp+10h],xmm7 
-  movdqa      xmm7,xmm3 
-  punpcklbw   xmm7,xmm0 
-  punpckhbw   xmm3,xmm0 
-  movdqa      [esp+50h],xmm7 
-  movdqa      xmm7,xmm4 
-  punpckhbw   xmm4,xmm0 
-  punpckhbw   xmm2,xmm0 
-  punpcklbw   xmm7,xmm0 
-  movdqa      [esp+30h],xmm3 
-  movdqa      xmm3,[esp+10h] 
-  movdqa      xmm1,xmm3 
-  psubw       xmm1,[esp+50h] 
-  pabsw       xmm1,xmm1 
-  movdqa      [esp+20h],xmm4 
-  movdqa      xmm0,xmm5 
-  pcmpgtw     xmm0,xmm1 
-  movdqa      xmm1,[esp+60h] 
-  psubw       xmm1,xmm3 
-  pabsw       xmm1,xmm1 
-  movdqa      xmm4,xmm6 
-  pcmpgtw     xmm4,xmm1 
-  pand        xmm0,xmm4 
-  movdqa      xmm1,xmm7 
-  psubw       xmm1,[esp+50h] 
-  pabsw       xmm1,xmm1 
-  movdqa      xmm4,xmm6 
-  pcmpgtw     xmm4,xmm1 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,[esp+30h] 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm5,xmm1 
-  movdqa      xmm1,[esp+40h] 
-  pand        xmm0,xmm4 
-  psubw       xmm1,xmm2 
-  pabsw       xmm1,xmm1 
-  movdqa      xmm4,xmm6 
-  pcmpgtw     xmm4,xmm1 
-  movdqa      xmm1,[esp+20h] 
-  psubw       xmm1,[esp+30h] 
-  pand        xmm5,xmm4 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm6,xmm1 
-  pand        xmm5,xmm6 
-  mov         edx,2 
-  movsx       edx,dx 
-  movd        xmm1,edx 
-  movdqa      xmm4,xmm1 
-  punpcklwd   xmm4,xmm1 
-  pshufd      xmm1,xmm4,0 
-  movdqa      xmm4,[esp+60h] 
-  movdqa      xmm6,xmm4 
-  paddw       xmm6,xmm4 
-  paddw       xmm6,xmm3 
-  paddw       xmm6,xmm7 
-  movdqa      [esp+10h],xmm1 
-  paddw       xmm6,[esp+10h] 
-  psraw       xmm6,2 
-  movdqa      xmm4,xmm0 
-  pandn       xmm4,xmm3 
-  movdqa      xmm3,[esp+40h] 
-  movdqa      xmm1,xmm0 
-  pand        xmm1,xmm6 
-  por         xmm1,xmm4 
-  movdqa      xmm6,xmm3 
-  paddw       xmm6,xmm3 
-  movdqa      xmm3,[esp+10h] 
-  paddw       xmm6,xmm2 
-  paddw       xmm6,[esp+20h] 
-  paddw       xmm6,xmm3 
-  psraw       xmm6,2 
-  movdqa      xmm4,xmm5 
-  pand        xmm4,xmm6 
-  movdqa      xmm6,xmm5 
-  pandn       xmm6,xmm2 
-  por         xmm4,xmm6 
-  packuswb    xmm1,xmm4 
-  movdqa      xmm4,[esp+50h] 
-  movdqa      xmm6,xmm7 
-  paddw       xmm6,xmm7 
-  paddw       xmm6,xmm4 
-  paddw       xmm6,[esp+60h] 
-  paddw       xmm6,xmm3 
-  psraw       xmm6,2 
-  movdqa      xmm2,xmm0 
-  pand        xmm2,xmm6 
-  pandn       xmm0,xmm4 
-  por         xmm2,xmm0 
-  movdqa      xmm0,[esp+20h] 
-  movdqa      xmm6,xmm0 
-  paddw       xmm6,xmm0 
-  movdqa      xmm0,[esp+30h] 
-  paddw       xmm6,xmm0 
-  paddw       xmm6,[esp+40h] 
-  movdqa      xmm4,xmm5 
-  paddw       xmm6,xmm3 
-  movq        [esi],xmm1 
-  psraw       xmm6,2 
-  pand        xmm4,xmm6 
-  pandn       xmm5,xmm0 
-  por         xmm4,xmm5 
-  packuswb    xmm2,xmm4 
-  movq        [eax],xmm2 
-  psrldq      xmm1,8 
-  movq        [edi],xmm1 
-  pop         edi  
-  psrldq      xmm2,8 
-  movq        [ecx],xmm2 
-  pop         esi  
-  mov         esp,ebp 
-  pop         ebp  
-  ret              
-
-;******************************************************************************
-; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
-;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-
-WELS_EXTERN  DeblockChromaLt4V_sse2
-
-DeblockChromaLt4V_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h 
-  sub         esp,0E4h 
-  push        ebx  
-  push        esi  
-  mov         esi, [ebp+1Ch]      ;  pTC
-  movsx       ebx, byte [esi+2] 
-  push        edi  
-  movsx       di,byte [esi+3] 
-  mov         word [esp+0Ch],bx 
-  movsx       bx,byte  [esi+1] 
-  movsx       esi,byte  [esi] 
-  mov         word  [esp+0Eh],si 
-  movzx       esi,di 
-  movd        xmm1,esi 
-  movzx       esi,di 
-  movd        xmm2,esi 
-  mov         si,word  [esp+0Ch] 
-  mov         edx, [ebp + 10h] 
-  mov         eax, [ebp + 08h] 
-  movzx       edi,si 
-  movzx       esi,si 
-  mov         ecx, [ebp + 0Ch] 
-  movd        xmm4,esi 
-  movzx       esi,bx 
-  movd        xmm5,esi 
-  movd        xmm3,edi 
-  movzx       esi,bx 
-  movd        xmm6,esi 
-  mov         si,word [esp+0Eh] 
-  movzx       edi,si 
-  movzx       esi,si 
-  punpcklwd   xmm6,xmm2 
-  pxor        xmm0,xmm0 
-  movdqa      [esp+40h],xmm0 
-  movd        xmm7,edi 
-  movd        xmm0,esi 
-  lea         esi,[edx+edx] 
-  mov         edi,eax 
-  sub         edi,esi 
-  punpcklwd   xmm5,xmm1 
-  movdqa      xmm1,[esp+40h] 
-  punpcklwd   xmm0,xmm4 
-  movq        xmm4,[edx+ecx] 
-  punpcklwd   xmm7,xmm3 
-  movq        xmm3,[eax] 
-  punpcklwd   xmm0,xmm6 
-  movq        xmm6,[edi] 
-  punpcklwd   xmm7,xmm5 
-  punpcklwd   xmm0,xmm7 
-  mov         edi,ecx 
-  sub         edi,esi 
-  movdqa      xmm2,xmm1 
-  psubw       xmm2,xmm0 
-  movdqa      [esp+60h],xmm2 
-  movq        xmm2, [edi] 
-  punpcklqdq  xmm6,xmm2 
-  mov         esi,eax 
-  sub         esi,edx 
-  movq        xmm7,[esi] 
-  mov         edi,ecx 
-  sub         edi,edx 
-  movq        xmm2,[edi] 
-  punpcklqdq  xmm7,xmm2 
-  movq        xmm2,[ecx] 
-  punpcklqdq  xmm3,xmm2 
-  movq        xmm2,[edx+eax] 
-  movsx       edx,word [ebp + 14h] 
-  punpcklqdq  xmm2,xmm4 
-  movdqa      [esp+0E0h],xmm2 
-  movd        xmm2,edx 
-  movsx       edx,word [ebp + 18h] 
-  movdqa      xmm4,xmm2 
-  punpcklwd   xmm4,xmm2 
-  movd        xmm2,edx 
-  movdqa      xmm5,xmm2 
-  punpcklwd   xmm5,xmm2 
-  pshufd      xmm2,xmm5,0 
-  movdqa      [esp+50h],xmm2 
-  movdqa      xmm2,xmm6 
-  punpcklbw   xmm2,xmm1 
-  movdqa      [esp+0D0h],xmm3 
-  pshufd      xmm4,xmm4,0 
-  movdqa      [esp+30h],xmm2 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+80h],xmm6 
-  movdqa      xmm6,[esp+0D0h] 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+70h],xmm6 
-  movdqa      xmm6, [esp+0E0h] 
-  punpckhbw   xmm6,xmm1 
-  movdqa     [esp+90h],xmm6 
-  movdqa      xmm5, [esp+0E0h] 
-  movdqa      xmm2,xmm7 
-  punpckhbw   xmm7,xmm1 
-  punpcklbw   xmm5,xmm1 
-  movdqa       [esp+0A0h],xmm7 
-  punpcklbw   xmm3,xmm1 
-  mov         edx,4 
-  punpcklbw   xmm2,xmm1 
-  movsx       edx,dx 
-  movd        xmm6,edx 
-  movdqa      xmm7,xmm6 
-  punpcklwd   xmm7,xmm6 
-  pshufd      xmm6,xmm7,0 
-  movdqa      xmm7,[esp+30h] 
-  movdqa      [esp+20h],xmm6 
-  psubw       xmm7,xmm5 
-  movdqa      xmm6,xmm0 
-  pcmpgtw     xmm6,xmm1 
-  movdqa      xmm1,[esp+60h] 
-  movdqa      [esp+40h],xmm6 
-  movdqa      xmm6,xmm3 
-  psubw       xmm6,xmm2 
-  psllw       xmm6,2 
-  paddw       xmm6,xmm7 
-  paddw       xmm6, [esp+20h] 
-  movdqa      xmm7, [esp+50h] 
-  psraw       xmm6,3 
-  pmaxsw      xmm1,xmm6 
-  movdqa      [esp+10h],xmm0 
-  movdqa      xmm6, [esp+10h] 
-  pminsw      xmm6,xmm1 
-  movdqa      [esp+10h],xmm6 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,xmm3 
-  pabsw       xmm1,xmm1 
-  movdqa      xmm6,xmm4 
-  pcmpgtw     xmm6,xmm1 
-  movdqa      xmm1, [esp+30h] 
-  psubw       xmm1,xmm2 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm7,xmm1 
-  movdqa      xmm1,[esp+50h] 
-  pand        xmm6,xmm7 
-  movdqa      xmm7,[esp+50h] 
-  psubw       xmm5,xmm3 
-  pabsw       xmm5,xmm5 
-  pcmpgtw     xmm1,xmm5 
-  movdqa      xmm5,[esp+80h] 
-  psubw       xmm5,[esp+90h] 
-  pand        xmm6,xmm1 
-  pand        xmm6,[esp+40h] 
-  movdqa      xmm1,[esp+10h] 
-  pand        xmm1,xmm6 
-  movdqa      xmm6,[esp+70h] 
-  movdqa      [esp+30h],xmm1 
-  movdqa      xmm1,[esp+0A0h] 
-  psubw       xmm6,xmm1 
-  psllw       xmm6,2 
-  paddw       xmm6,xmm5 
-  paddw       xmm6,[esp+20h] 
-  movdqa      xmm5,[esp+60h] 
-  psraw       xmm6,3 
-  pmaxsw      xmm5,xmm6 
-  pminsw      xmm0,xmm5 
-  movdqa      xmm5,[esp+70h] 
-  movdqa      xmm6,xmm1 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm4,xmm6 
-  movdqa      xmm6,[esp+80h] 
-  psubw       xmm6,xmm1 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6,[esp+90h] 
-  pand        xmm4,xmm7 
-  movdqa      xmm7,[esp+50h] 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6 
-  pand        xmm4,xmm7 
-  pand        xmm4,[esp+40h] 
-  pand        xmm0,xmm4 
-  movdqa      xmm4,[esp+30h] 
-  paddw       xmm2,xmm4 
-  paddw       xmm1,xmm0 
-  packuswb    xmm2,xmm1 
-  movq        [esi],xmm2 
-  psubw       xmm3,xmm4 
-  psubw       xmm5,xmm0 
-  packuswb    xmm3,xmm5 
-  movq        [eax],xmm3 
-  psrldq      xmm2,8 
-  movq        [edi],xmm2 
-  pop         edi  
-  pop         esi  
-  psrldq      xmm3,8 
-  movq        [ecx],xmm3 
-  pop         ebx  
-  mov         esp,ebp 
-  pop         ebp  
-  ret    
-  
-;***************************************************************************
-;  void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
-;          int32_t iAlpha, int32_t iBeta)
-;***************************************************************************
-;  Reads 4 bytes (pPix-2 .. pPix+1) from each of 8 rows of Cb and of Cr, conditionally replaces the two middle bytes, and writes all 4 bytes of every row back.
-WELS_EXTERN     DeblockChromaEq4H_sse2
-;  Stack args: [ebp+8]=pPixCb, [ebp+0Ch]=pPixCr, [ebp+10h]=iStride, [ebp+14h]=iAlpha, [ebp+18h]=iBeta (iAlpha/iBeta are read as 16-bit words). No return value is set.
-ALIGN  16
-  ;  Names used in the comments below: p1,p0,q0,q1 = byte columns at offsets -2,-1,0,+1; "lo" = the 8 Cb rows, "hi" = the 8 Cr rows. NOTE(review): pabsw is an SSSE3 instruction, so this needs more than SSE2 despite the name - confirm the caller's CPU check.
-DeblockChromaEq4H_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h ; round esp down to a 16-byte boundary
-  sub         esp,0C8h  ; 200 bytes of locals; with the two pushes below esp ends up 16-byte aligned again
-  mov         ecx,dword [ebp+8] ; ecx = pPixCb
-  mov         edx,dword [ebp+0Ch] ; edx = pPixCr
-  mov         eax,dword [ebp+10h] ; eax = iStride
-  sub         ecx,2 ; ecx = pPixCb - 2 (start of the 4-byte group in row 0)
-  sub         edx,2 ; edx = pPixCr - 2
-  push        esi  
-  lea         esi,[eax+eax*2] ; esi = 3 * iStride
-  mov         dword [esp+18h],ecx ; save pPixCb-2 (this slot is [esp+1Ch] after "push edi")
-  mov         dword [esp+4],edx ; save pPixCr-2 (this slot is [esp+8] after "push edi")
-  lea         ecx,[ecx+eax*4] ; ecx = Cb row 4
-  lea         edx,[edx+eax*4] ; edx = Cr row 4
-  lea         eax,[esp+7Ch] ; eax = address of the 64-byte scratch area (equals esp+80h after "push edi")
-  push        edi  
-  mov         dword [esp+14h],esi ; [esp+14h] = 3 * iStride
-  mov         dword [esp+18h],ecx ; [esp+18h] = Cb row 4 pointer
-  mov         dword [esp+0Ch],edx ; [esp+0Ch] = Cr row 4 pointer
-  mov         dword [esp+10h],eax ; [esp+10h] = scratch pointer
-  mov         esi,dword [esp+1Ch] ; esi = pPixCb - 2
-  mov         ecx,dword [ebp+10h] ; ecx = iStride
-  mov         edx,dword [esp+14h] ; edx = 3 * iStride
-  movd        xmm0,dword [esi] ; Cb rows 0..3, 4 bytes each, into xmm0..xmm3
-  movd        xmm1,dword [esi+ecx] 
-  movd        xmm2,dword [esi+ecx*2] 
-  movd        xmm3,dword [esi+edx] 
-  mov         esi,dword  [esp+8] ; esi = pPixCr - 2
-  movd        xmm4,dword [esi] ; Cr rows 0..3 into xmm4..xmm7
-  movd        xmm5,dword [esi+ecx] 
-  movd        xmm6,dword [esi+ecx*2] 
-  movd        xmm7,dword [esi+edx] 
-  punpckldq   xmm0,xmm4 ; xmmN low qword = Cb row N | Cr row N
-  punpckldq   xmm1,xmm5 
-  punpckldq   xmm2,xmm6 
-  punpckldq   xmm3,xmm7 
-  mov         esi,dword [esp+18h] ; esi = Cb row 4
-  mov         edi,dword [esp+0Ch] ; edi = Cr row 4
-  movd        xmm4,dword [esi] 
-  movd        xmm5,dword [edi] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm0,xmm4 ; xmm0 = Cb0 | Cr0 | Cb4 | Cr4
-  movd        xmm4,dword [esi+ecx] 
-  movd        xmm5,dword [edi+ecx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm1,xmm4 ; xmm1 = Cb1 | Cr1 | Cb5 | Cr5
-  movd        xmm4,dword [esi+ecx*2] 
-  movd        xmm5,dword [edi+ecx*2] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm2,xmm4 ; xmm2 = Cb2 | Cr2 | Cb6 | Cr6
-  movd        xmm4,dword [esi+edx] 
-  movd        xmm5,dword [edi+edx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm3,xmm4 ; xmm3 = Cb3 | Cr3 | Cb7 | Cr7
-  movdqa      xmm6,xmm0 ; byte/word/dword/qword interleave: transposes rows into the 4 byte columns
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm5,xmm4 
-  punpcklqdq  xmm1,xmm2 
-  punpckhqdq  xmm6,xmm2 
-  mov         edi,dword [esp+10h] ; edi = scratch pointer (esp+80h)
-  movdqa      [edi],xmm0 ; [esp+80h]  = p1 column (8 Cb bytes, then 8 Cr bytes)
-  movdqa      [edi+10h],xmm5 ; [esp+90h]  = p0 column
-  movdqa      [edi+20h],xmm1 ; [esp+0A0h] = q0 column
-  movdqa      [edi+30h],xmm6 ; [esp+0B0h] = q1 column
-  movsx       ecx,word [ebp+14h] ; ecx = low 16 bits of iAlpha, sign-extended
-  movsx       edx,word [ebp+18h] ; edx = low 16 bits of iBeta, sign-extended
-  movdqa      xmm6,[esp+80h] ; xmm6 = p1
-  movdqa      xmm4,[esp+90h] ; xmm4 = p0
-  movdqa      xmm5,[esp+0A0h] ; xmm5 = q0
-  movdqa      xmm7,[esp+0B0h] ; xmm7 = q1
-  pxor        xmm0,xmm0 ; zero, used to widen bytes to words
-  movd        xmm1,ecx 
-  movdqa      xmm2,xmm1 
-  punpcklwd   xmm2,xmm1 
-  pshufd      xmm1,xmm2,0 ; xmm1 = alpha in all 8 words
-  movd        xmm2,edx 
-  movdqa      xmm3,xmm2 
-  punpcklwd   xmm3,xmm2 
-  pshufd      xmm2,xmm3,0 ; xmm2 = beta in all 8 words
-  movdqa      xmm3,xmm6 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+60h],xmm6 ; [esp+60h] = p1 hi as words
-  movdqa      xmm6,[esp+90h] 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+30h],xmm6 ; [esp+30h] = p0 hi
-  movdqa      xmm6,[esp+0A0h] 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+40h],xmm6 ; [esp+40h] = q0 hi
-  movdqa      xmm6,[esp+0B0h] 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+70h],xmm6 ; [esp+70h] = q1 hi
-  punpcklbw   xmm7,xmm0 ; xmm7 = q1 lo
-  punpcklbw   xmm4,xmm0 ; xmm4 = p0 lo
-  punpcklbw   xmm5,xmm0 ; xmm5 = q0 lo
-  punpcklbw   xmm3,xmm0 ; xmm3 = p1 lo
-  movdqa      [esp+50h],xmm7 ; [esp+50h] = q1 lo
-  movdqa      xmm6,xmm4 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  movdqa      xmm0,xmm1 
-  pcmpgtw     xmm0,xmm6 ; xmm0 = alpha > |p0 - q0|   (lo)
-  movdqa      xmm6,xmm3 
-  psubw       xmm6,xmm4 
-  pabsw       xmm6,xmm6 
-  movdqa      xmm7,xmm2 
-  pcmpgtw     xmm7,xmm6 ; xmm7 = beta > |p1 - p0|   (lo)
-  movdqa      xmm6,[esp+50h] 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pand        xmm0,xmm7 
-  movdqa      xmm7,xmm2 
-  pcmpgtw     xmm7,xmm6 ; xmm7 = beta > |q1 - q0|   (lo)
-  movdqa      xmm6,[esp+30h] 
-  psubw       xmm6,[esp+40h] 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm1,xmm6 ; xmm1 = alpha > |p0 - q0|   (hi)
-  movdqa      xmm6,[esp+60h] 
-  psubw       xmm6,[esp+30h] 
-  pabsw       xmm6,xmm6 
-  pand        xmm0,xmm7 ; xmm0 = complete filter mask for the lo half
-  movdqa      xmm7,xmm2 
-  pcmpgtw     xmm7,xmm6 ; beta > |p1 - p0|   (hi)
-  movdqa      xmm6,[esp+70h] 
-  psubw       xmm6,[esp+40h] 
-  pabsw       xmm6,xmm6 
-  pand        xmm1,xmm7 
-  pcmpgtw     xmm2,xmm6 ; beta > |q1 - q0|   (hi)
-  pand        xmm1,xmm2 ; xmm1 = complete filter mask for the hi half
-  mov         eax,2 
-  movsx       ecx,ax 
-  movd        xmm2,ecx 
-  movdqa      xmm6,xmm2 
-  punpcklwd   xmm6,xmm2 
-  pshufd      xmm2,xmm6,0 
-  movdqa      [esp+20h],xmm2 ; [esp+20h] = 2 in all 8 words (rounding term)
-  movdqa      xmm2,xmm3 
-  paddw       xmm2,xmm3 
-  paddw       xmm2,xmm4 
-  paddw       xmm2,[esp+50h] 
-  paddw       xmm2,[esp+20h] 
-  psraw       xmm2,2 ; xmm2 = (2*p1 + p0 + q1 + 2) >> 2   (lo)
-  movdqa      xmm6,xmm0 
-  pand        xmm6,xmm2 
-  movdqa      xmm2,xmm0 
-  pandn       xmm2,xmm4 
-  por         xmm6,xmm2 ; xmm6 = mask ? filtered : original p0   (lo)
-  movdqa      xmm2,[esp+60h] 
-  movdqa      xmm7,xmm2 
-  paddw       xmm7,xmm2 
-  paddw       xmm7,[esp+30h] 
-  paddw       xmm7,[esp+70h] 
-  paddw       xmm7,[esp+20h] 
-  movdqa      xmm4,xmm1 
-  movdqa      xmm2,xmm1 
-  pandn       xmm2,[esp+30h] 
-  psraw       xmm7,2 ; (2*p1 + p0 + q1 + 2) >> 2   (hi)
-  pand        xmm4,xmm7 
-  por         xmm4,xmm2 ; xmm4 = mask ? filtered : original p0   (hi)
-  movdqa      xmm2,[esp+50h] 
-  packuswb    xmm6,xmm4 
-  movdqa      [esp+90h],xmm6 ; overwrite the p0 column in scratch with the result
-  movdqa      xmm6,xmm2 
-  paddw       xmm6,xmm2 
-  movdqa      xmm2,[esp+20h] 
-  paddw       xmm6,xmm5 
-  paddw       xmm6,xmm3 
-  movdqa      xmm4,xmm0 
-  pandn       xmm0,xmm5 
-  paddw       xmm6,xmm2 
-  psraw       xmm6,2 ; (2*q1 + q0 + p1 + 2) >> 2   (lo)
-  pand        xmm4,xmm6 
-  por         xmm4,xmm0 ; xmm4 = mask ? filtered : original q0   (lo)
-  movdqa      xmm0,[esp+70h] 
-  movdqa      xmm5,xmm0 
-  paddw       xmm5,xmm0 
-  movdqa      xmm0,[esp+40h] 
-  paddw       xmm5,xmm0 
-  paddw       xmm5,[esp+60h] 
-  movdqa      xmm3,xmm1 
-  paddw       xmm5,xmm2 
-  psraw       xmm5,2 ; (2*q1 + q0 + p1 + 2) >> 2   (hi)
-  pand        xmm3,xmm5 
-  pandn       xmm1,xmm0 
-  por         xmm3,xmm1 ; xmm3 = mask ? filtered : original q0   (hi)
-  packuswb    xmm4,xmm3 
-  movdqa      [esp+0A0h],xmm4 ; overwrite the q0 column in scratch with the result
-  mov         esi,dword [esp+10h] ; esi = scratch pointer
-  movdqa      xmm0,[esi] ; reload p1, new p0, new q0, q1 columns
-  movdqa      xmm1,[esi+10h] 
-  movdqa      xmm2,[esi+20h] 
-  movdqa      xmm3,[esi+30h] 
-  movdqa      xmm6,xmm0 ; same interleave sequence as above: turns the 4 columns back into 4-byte rows
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm5,xmm4 
-  punpcklqdq  xmm1,xmm2 
-  punpckhqdq  xmm6,xmm2 
-  mov         esi,dword [esp+1Ch] ; esi = pPixCb - 2
-  mov         ecx,dword [ebp+10h] ; ecx = iStride
-  mov         edx,dword [esp+14h] ; edx = 3 * iStride
-  mov         edi,dword [esp+8] ; edi = pPixCr - 2
-  movd        dword [esi],xmm0 ; store Cb rows 0..3
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 ; shift the next dword of each register into the low position
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         esi,dword [esp+18h] ; esi = Cb row 4
-  movd        dword [edi],xmm0 ; store Cr rows 0..3
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  movd        dword [esi],xmm0 ; store Cb rows 4..7
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         edi,dword [esp+0Ch] ; edi = Cr row 4
-  movd        dword [edi],xmm0 ; store Cr rows 4..7
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6 
-  pop         edi  
-  pop         esi  
-  mov         esp,ebp ; drop the aligned local area
-  pop         ebp  
-  ret              
-  
-;*******************************************************************************
-;    void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
-;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-  ;  Reads 4 bytes (pPix-2 .. pPix+1) from each of 8 rows of Cb and of Cr, adds/subtracts a delta clipped to [-tc, tc] on the two middle bytes where the mask holds, and writes all 4 bytes of every row back.
-WELS_EXTERN  DeblockChromaLt4H_sse2
-  ;  Stack args: [ebp+8]=pPixCb, [ebp+0Ch]=pPixCr, [ebp+10h]=iStride, [ebp+14h]=iAlpha, [ebp+18h]=iBeta (both read as 16-bit words), [ebp+1Ch]=pTC (4 signed bytes are read). No return value is set.
-ALIGN  16
-;  Names used below: p1,p0,q0,q1 = byte columns at offsets -2,-1,0,+1; "lo" = the 8 Cb rows, "hi" = the 8 Cr rows. NOTE(review): pabsw is an SSSE3 instruction, so this needs more than SSE2 despite the name - confirm the caller's CPU check.
-DeblockChromaLt4H_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h ; round esp down to a 16-byte boundary
-  sub         esp,108h   ; 264 bytes of locals; with the two pushes below esp ends up 16-byte aligned again
-  mov         ecx,dword [ebp+8] ; ecx = pPixCb
-  mov         edx,dword [ebp+0Ch] ; edx = pPixCr
-  mov         eax,dword [ebp+10h] ; eax = iStride
-  sub         ecx,2 ; ecx = pPixCb - 2
-  sub         edx,2 ; edx = pPixCr - 2
-  push        esi  
-  lea         esi,[eax+eax*2] ; esi = 3 * iStride
-  mov         dword [esp+10h],ecx ; save pPixCb-2 (this slot is [esp+14h] after "push edi")
-  mov         dword [esp+4],edx ; save pPixCr-2 (this slot is [esp+8] after "push edi")
-  lea         ecx,[ecx+eax*4] ; ecx = Cb row 4
-  lea         edx,[edx+eax*4] ; edx = Cr row 4
-  lea         eax,[esp+6Ch] ; eax = address of the 64-byte scratch area (equals esp+70h after "push edi")
-  push        edi  
-  mov         dword [esp+0Ch],esi ; [esp+0Ch] = 3 * iStride
-  mov         dword [esp+18h],ecx ; [esp+18h] = Cb row 4 pointer
-  mov         dword [esp+10h],edx ; [esp+10h] = Cr row 4 pointer
-  mov         dword [esp+1Ch],eax ; [esp+1Ch] = scratch pointer
-  mov         esi,dword [esp+14h] ; esi = pPixCb - 2
-  mov         ecx,dword [ebp+10h] ; ecx = iStride
-  mov         edx,dword [esp+0Ch] ; edx = 3 * iStride
-  movd        xmm0,dword [esi] ; Cb rows 0..3, 4 bytes each, into xmm0..xmm3
-  movd        xmm1,dword [esi+ecx] 
-  movd        xmm2,dword [esi+ecx*2] 
-  movd        xmm3,dword [esi+edx] 
-  mov         esi,dword [esp+8] ; esi = pPixCr - 2
-  movd        xmm4,dword [esi] ; Cr rows 0..3 into xmm4..xmm7
-  movd        xmm5,dword [esi+ecx] 
-  movd        xmm6,dword [esi+ecx*2] 
-  movd        xmm7,dword [esi+edx] 
-  punpckldq   xmm0,xmm4 ; xmmN low qword = Cb row N | Cr row N
-  punpckldq   xmm1,xmm5 
-  punpckldq   xmm2,xmm6 
-  punpckldq   xmm3,xmm7 
-  mov         esi,dword [esp+18h] ; esi = Cb row 4
-  mov         edi,dword [esp+10h] ; edi = Cr row 4
-  movd        xmm4,dword [esi] 
-  movd        xmm5,dword [edi] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm0,xmm4 ; xmm0 = Cb0 | Cr0 | Cb4 | Cr4
-  movd        xmm4,dword [esi+ecx] 
-  movd        xmm5,dword [edi+ecx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm1,xmm4 ; xmm1 = Cb1 | Cr1 | Cb5 | Cr5
-  movd        xmm4,dword [esi+ecx*2] 
-  movd        xmm5,dword [edi+ecx*2] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm2,xmm4 ; xmm2 = Cb2 | Cr2 | Cb6 | Cr6
-  movd        xmm4,dword [esi+edx] 
-  movd        xmm5,dword [edi+edx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm3,xmm4 ; xmm3 = Cb3 | Cr3 | Cb7 | Cr7
-  movdqa      xmm6,xmm0 ; byte/word/dword/qword interleave: transposes rows into the 4 byte columns
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm5,xmm4 
-  punpcklqdq  xmm1,xmm2 
-  punpckhqdq  xmm6,xmm2 
-  mov         edi,dword [esp+1Ch] ; edi = scratch pointer (esp+70h)
-  movdqa      [edi],xmm0 ; [esp+70h]  = p1 column (8 Cb bytes, then 8 Cr bytes)
-  movdqa      [edi+10h],xmm5 ; [esp+80h]  = p0 column
-  movdqa      [edi+20h],xmm1 ; [esp+90h]  = q0 column
-  movdqa      [edi+30h],xmm6 ; [esp+0A0h] = q1 column
-  mov         eax,dword [ebp+1Ch] ; eax = pTC
-  movsx       cx,byte [eax+3] ; cx = pTC[3]
-  movsx       dx,byte [eax+2] ; dx = pTC[2]
-  movsx       si,byte [eax+1] ; si = pTC[1]
-  movsx       ax,byte [eax] ; ax = pTC[0]
-  movzx       edi,cx 
-  movzx       ecx,cx 
-  movd        xmm2,ecx 
-  movzx       ecx,dx 
-  movzx       edx,dx 
-  movd        xmm3,ecx 
-  movd        xmm4,edx 
-  movzx       ecx,si 
-  movzx       edx,si 
-  movd        xmm5,ecx 
-  pxor        xmm0,xmm0 
-  movd        xmm6,edx 
-  movzx       ecx,ax 
-  movdqa      [esp+60h],xmm0 ; [esp+60h] = zero
-  movzx       edx,ax 
-  movsx       eax,word [ebp+14h] ; eax = low 16 bits of iAlpha, sign-extended
-  punpcklwd   xmm6,xmm2 
-  movd        xmm1,edi 
-  movd        xmm7,ecx 
-  movsx       ecx,word [ebp+18h] ; ecx = low 16 bits of iBeta, sign-extended
-  movd        xmm0,edx 
-  punpcklwd   xmm7,xmm3 
-  punpcklwd   xmm5,xmm1 
-  movdqa      xmm1,[esp+60h] ; xmm1 = zero
-  punpcklwd   xmm7,xmm5 
-  movdqa      xmm5,[esp+0A0h] ; xmm5 = q1 column
-  punpcklwd   xmm0,xmm4 
-  punpcklwd   xmm0,xmm6 
-  movdqa      xmm6, [esp+70h] ; xmm6 = p1 column
-  punpcklwd   xmm0,xmm7 ; xmm0 = tc words: pTC[0],pTC[0],pTC[1],pTC[1],pTC[2],pTC[2],pTC[3],pTC[3] (one tc per two rows)
-  movdqa      xmm7,[esp+80h] ; xmm7 = p0 column
-  movdqa      xmm2,xmm1 
-  psubw       xmm2,xmm0 
-  movdqa      [esp+0D0h],xmm2 ; [esp+0D0h] = -tc
-  movd        xmm2,eax 
-  movdqa      xmm3,xmm2 
-  punpcklwd   xmm3,xmm2 
-  pshufd      xmm4,xmm3,0 ; xmm4 = alpha in all 8 words
-  movd        xmm2,ecx 
-  movdqa      xmm3,xmm2 
-  punpcklwd   xmm3,xmm2 
-  pshufd      xmm2,xmm3,0 
-  movdqa      xmm3, [esp+90h] ; xmm3 = q0 column
-  movdqa      [esp+50h],xmm2 ; [esp+50h] = beta in all 8 words
-  movdqa      xmm2,xmm6 
-  punpcklbw   xmm2,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+40h],xmm2 ; [esp+40h] = p1 lo as words
-  movdqa      [esp+0B0h],xmm6 ; [esp+0B0h] = p1 hi
-  movdqa      xmm6,[esp+90h] 
-  movdqa      xmm2,xmm7 
-  punpckhbw   xmm7,xmm1 
-  punpckhbw   xmm6,xmm1 
-  punpcklbw   xmm2,xmm1 ; xmm2 = p0 lo
-  punpcklbw   xmm3,xmm1 ; xmm3 = q0 lo
-  punpcklbw   xmm5,xmm1 ; xmm5 = q1 lo
-  movdqa      [esp+0F0h],xmm7 ; [esp+0F0h] = p0 hi
-  movdqa      [esp+0C0h],xmm6 ; [esp+0C0h] = q0 hi
-  movdqa      xmm6, [esp+0A0h] 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+0E0h],xmm6 ; [esp+0E0h] = q1 hi
-  mov         edx,4 
-  movsx       eax,dx 
-  movd        xmm6,eax 
-  movdqa      xmm7,xmm6 
-  punpcklwd   xmm7,xmm6 
-  pshufd      xmm6,xmm7,0 
-  movdqa      [esp+30h],xmm6 ; [esp+30h] = 4 in all 8 words (rounding term)
-  movdqa      xmm7, [esp+40h] 
-  psubw       xmm7,xmm5 ; xmm7 = p1 - q1   (lo)
-  movdqa      xmm6,xmm0 
-  pcmpgtw     xmm6,xmm1 
-  movdqa      [esp+60h],xmm6 ; [esp+60h] = mask (tc > 0)
-  movdqa      xmm1, [esp+0D0h] 
-  movdqa      xmm6,xmm3 
-  psubw       xmm6,xmm2 
-  psllw       xmm6,2 
-  paddw       xmm6,xmm7 
-  paddw       xmm6,[esp+30h] 
-  psraw       xmm6,3 ; xmm6 = (4*(q0 - p0) + (p1 - q1) + 4) >> 3   (lo)
-  pmaxsw      xmm1,xmm6 
-  movdqa      xmm7,[esp+50h] 
-  movdqa      [esp+20h],xmm0 
-  movdqa      xmm6, [esp+20h] 
-  pminsw      xmm6,xmm1 
-  movdqa      [esp+20h],xmm6 ; [esp+20h] = delta clipped to [-tc, tc]   (lo)
-  movdqa      xmm6,xmm4 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,xmm3 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm6,xmm1 ; alpha > |p0 - q0|   (lo)
-  movdqa      xmm1, [esp+40h] 
-  psubw       xmm1,xmm2 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm7,xmm1 ; beta > |p1 - p0|   (lo)
-  movdqa      xmm1, [esp+50h] 
-  pand        xmm6,xmm7 
-  movdqa      xmm7, [esp+50h] 
-  psubw       xmm5,xmm3 
-  pabsw       xmm5,xmm5 
-  pcmpgtw     xmm1,xmm5 ; beta > |q1 - q0|   (lo)
-  movdqa      xmm5, [esp+0B0h] 
-  psubw       xmm5,[esp+0E0h] ; xmm5 = p1 - q1   (hi)
-  pand        xmm6,xmm1 
-  pand        xmm6, [esp+60h] ; also require tc > 0
-  movdqa      xmm1, [esp+20h] 
-  pand        xmm1,xmm6 
-  movdqa      xmm6, [esp+0C0h] 
-  movdqa      [esp+40h],xmm1 ; [esp+40h] = masked delta   (lo); slot no longer holds p1 lo
-  movdqa      xmm1, [esp+0F0h] 
-  psubw       xmm6,xmm1 
-  psllw       xmm6,2 
-  paddw       xmm6,xmm5 
-  paddw       xmm6, [esp+30h] 
-  movdqa      xmm5, [esp+0D0h] 
-  psraw       xmm6,3 ; (4*(q0 - p0) + (p1 - q1) + 4) >> 3   (hi)
-  pmaxsw      xmm5,xmm6 
-  pminsw      xmm0,xmm5 ; xmm0 = delta clipped to [-tc, tc]   (hi); the same tc vector is used for both halves
-  movdqa      xmm5,[esp+0C0h] 
-  movdqa      xmm6,xmm1 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm4,xmm6 ; alpha > |p0 - q0|   (hi)
-  movdqa      xmm6,[esp+0B0h] 
-  psubw       xmm6,xmm1 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6 ; beta > |p1 - p0|   (hi)
-  movdqa      xmm6, [esp+0E0h] 
-  pand        xmm4,xmm7 
-  movdqa      xmm7, [esp+50h] 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6 ; beta > |q1 - q0|   (hi)
-  pand        xmm4,xmm7 
-  pand        xmm4,[esp+60h] ; also require tc > 0
-  pand        xmm0,xmm4 ; xmm0 = masked delta   (hi)
-  movdqa      xmm4, [esp+40h] 
-  paddw       xmm2,xmm4 ; p0 lo += delta
-  paddw       xmm1,xmm0 ; p0 hi += delta
-  psubw       xmm3,xmm4 ; q0 lo -= delta
-  psubw       xmm5,xmm0 ; q0 hi -= delta
-  packuswb    xmm2,xmm1 ; saturate back to unsigned bytes
-  packuswb    xmm3,xmm5 
-  movdqa      [esp+80h],xmm2 ; overwrite the p0 column in scratch
-  movdqa      [esp+90h],xmm3 ; overwrite the q0 column in scratch
-  mov         esi,dword [esp+1Ch] ; esi = scratch pointer
-  movdqa      xmm0, [esi] ; reload p1, new p0, new q0, q1 columns
-  movdqa      xmm1, [esi+10h] 
-  movdqa      xmm2, [esi+20h] 
-  movdqa      xmm3, [esi+30h] 
-  movdqa      xmm6,xmm0 ; same interleave sequence as above: turns the 4 columns back into 4-byte rows
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm5,xmm4 
-  punpcklqdq  xmm1,xmm2 
-  punpckhqdq  xmm6,xmm2 
-  mov         esi,dword [esp+14h] ; esi = pPixCb - 2
-  mov         ecx,dword [ebp+10h] ; ecx = iStride
-  mov         edx,dword [esp+0Ch] ; edx = 3 * iStride
-  mov         edi,dword [esp+8] ; edi = pPixCr - 2
-  movd        dword [esi],xmm0 ; store Cb rows 0..3
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 ; shift the next dword of each register into the low position
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         esi,dword [esp+18h] ; esi = Cb row 4
-  movd        dword [edi],xmm0 ; store Cr rows 0..3
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  movd        dword [esi],xmm0 ; store Cb rows 4..7
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         edi,dword [esp+10h] ; edi = Cr row 4
-  movd        dword [edi],xmm0 ; store Cr rows 4..7
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6  
-  pop         edi  
-  pop         esi   
-  mov         esp,ebp ; drop the aligned local area
-  pop         ebp  
-  ret     
-  
-  
-  
-;*******************************************************************************
-;    void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, 
-;                                 int32_t iBeta, int8_t * pTC)
-;*******************************************************************************
-  ; Reads six 16-byte rows (pPix-3*iStride .. pPix+2*iStride) and rewrites the four rows pPix-2*iStride .. pPix+iStride with clipped, masked corrections. Rows are named p2,p1,p0,q0,q1,q2 in the comments below; "lo"/"hi" = pixels 0..7 / 8..15 widened to words.
-; Stack args: [ebp+8]=pPix, [ebp+12]=iStride, [ebp+16]=iAlpha, [ebp+20]=iBeta (both read as 16-bit words), [ebp+24]=pTC (4 signed bytes, one per 4 pixels). All row accesses use movdqa, so every row address must be 16-byte aligned.
-WELS_EXTERN  DeblockLumaLt4V_sse2
-  ; NOTE(review): pabsw is an SSSE3 instruction, so this needs more than SSE2 despite the name - confirm the caller's CPU check.
-ALIGN  16
-; Local slots are written as [esp+432-K]; 432 = 420 bytes of locals + the three 4-byte pushes. Offsets written with 424 or 428 address the same slots while fewer registers are pushed.
-DeblockLumaLt4V_sse2:
-    push	ebp
-	mov	ebp, esp
-	and	esp, -16				; fffffff0H - round esp down to a 16-byte boundary
-	sub	esp, 420				; 000001a4H - with the three pushes below esp is 16-byte aligned again
-	mov	eax, dword [ebp+8]	; eax = pPix (q0 row)
-	mov	ecx, dword [ebp+12]	; ecx = iStride
-
-	pxor	xmm0, xmm0
-	push	ebx
-	mov	edx, dword [ebp+24]	; edx = pTC
-	movdqa	[esp+424-384], xmm0	; slot 384 = zero (same slot as [esp+432-384] after the next two pushes)
-	push	esi
-
-	lea	esi, [ecx+ecx*2]	; esi = 3 * iStride
-	push	edi
-	mov	edi, eax
-	sub	edi, esi
-	movdqa	xmm0, [edi]	; p2 row
-
-	lea	esi, [ecx+ecx]	; esi = 2 * iStride
-	movdqa	[esp+432-208], xmm0	; raw p2
-	mov	edi, eax
-	sub	edi, esi	; edi = p1 row pointer; kept until the p1 store near the end
-	movdqa	xmm0, [edi]
-	movdqa	[esp+448-208], xmm0	; raw p1
-
-	mov	ebx, eax
-	sub	ebx, ecx	; ebx = p0 row pointer
-	movdqa	xmm0, [ebx]
-	movdqa	[esp+464-208], xmm0	; raw p0
-
-	movdqa	xmm0, [eax]
-
-	add	ecx, eax	; ecx = q1 row pointer
-	movdqa	[esp+480-208], xmm0	; raw q0
-	movdqa	xmm0, [ecx]
-	mov	dword [esp+432-404], ecx	; save q1 row pointer
-
-	movsx	ecx, word [ebp+16]	; ecx = low 16 bits of iAlpha
-	movdqa	[esp+496-208], xmm0	; raw q1
-	movdqa	xmm0, [esi+eax]	; q2 row (pPix + 2*iStride)
-
-	movsx	si, byte [edx]	; si = pTC[0]
-	movdqa	[esp+512-208], xmm0	; raw q2
-	movd	xmm0, ecx
-	movsx	ecx, word [ebp+20]	; ecx = low 16 bits of iBeta
-	movdqa	xmm1, xmm0
-	punpcklwd xmm1, xmm0
-	pshufd	xmm0, xmm1, 0
-	movdqa	[esp+432-112], xmm0	; slot 112 = alpha in all 8 words
-	movd	xmm0, ecx
-	movsx	cx, byte [edx+1]	; cx = pTC[1]
-	movdqa	xmm1, xmm0
-	punpcklwd xmm1, xmm0
-	mov	dword [esp+432-408], ebx	; save p0 row pointer
-	movzx	ebx, cx
-	pshufd	xmm0, xmm1, 0	; xmm0 = beta in all 8 words
-	movd	xmm1, ebx
-	movzx	ebx, cx
-	movd	xmm2, ebx
-	movzx	ebx, cx
-	movzx	ecx, cx
-	movd	xmm4, ecx
-	movzx	ecx, si
-	movd	xmm5, ecx
-	movzx	ecx, si
-	movd	xmm6, ecx
-	movzx	ecx, si
-	movd	xmm7, ecx
-	movzx	ecx, si
-	movdqa	[esp+432-336], xmm0	; slot 336 = beta (later reused for the constant 4, then for -tc hi)
-	movd	xmm0, ecx
-
-	movsx	cx, byte [edx+3]	; cx = pTC[3]
-	movsx	dx, byte [edx+2]	; dx = pTC[2]; the pTC pointer in edx is not needed afterwards
-	movd	xmm3, ebx
-	punpcklwd xmm0, xmm4
-	movzx	esi, cx
-	punpcklwd xmm6, xmm2
-	punpcklwd xmm5, xmm1
-	punpcklwd xmm0, xmm6
-	punpcklwd xmm7, xmm3
-	punpcklwd xmm7, xmm5
-	punpcklwd xmm0, xmm7
-	movdqa	[esp+432-400], xmm0	; slot 400 = tc lo: pTC[0] x4, pTC[1] x4
-	movd	xmm0, esi
-	movzx	esi, cx
-	movd	xmm2, esi
-	movzx	esi, cx
-	movzx	ecx, cx
-	movd	xmm4, ecx
-	movzx	ecx, dx
-	movd	xmm3, esi
-	movd	xmm5, ecx
-	punpcklwd xmm5, xmm0
-
-	movdqa	xmm0, [esp+432-384]	; xmm0 = zero, used for widening and as the zero operand from here on
-	movzx	ecx, dx
-	movd	xmm6, ecx
-	movzx	ecx, dx
-	movzx	edx, dx
-	punpcklwd xmm6, xmm2
-	movd	xmm7, ecx
-	movd	xmm1, edx
-
-	movdqa	xmm2, [esp+448-208]
-	punpcklbw xmm2, xmm0	; xmm2 = p1 lo
-
-	mov	ecx, 4
-	movsx	edx, cx	; edx = 4, broadcast further down as the rounding term
-	punpcklwd xmm7, xmm3
-	punpcklwd xmm7, xmm5
-	movdqa	xmm5, [esp+496-208]
-	movdqa	xmm3, [esp+464-208]
-	punpcklbw xmm5, xmm0
-	movdqa	[esp+432-240], xmm5	; slot 240 = q1 lo
-	movdqa	xmm5, [esp+512-208]
-	punpcklbw xmm5, xmm0
-	movdqa	[esp+432-352], xmm5	; slot 352 = q2 lo
-	punpcklwd xmm1, xmm4
-	movdqa	xmm4, [esp+432-208]
-	punpcklwd xmm1, xmm6
-	movdqa	xmm6, [esp+480-208]
-	punpcklwd xmm1, xmm7	; xmm1 = tc hi: pTC[2] x4, pTC[3] x4
-	punpcklbw xmm6, xmm0	; xmm6 = q0 lo
-	punpcklbw xmm3, xmm0	; xmm3 = p0 lo
-	punpcklbw xmm4, xmm0	; xmm4 = p2 lo
-	movdqa	xmm7, xmm3
-	psubw	xmm7, xmm4
-	pabsw	xmm7, xmm7
-	movdqa	[esp+432-272], xmm4	; slot 272 = p2 lo
-	movdqa	xmm4, [esp+432-336]	; xmm4 = beta, kept in xmm4 through both halves
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-288], xmm5	; slot 288 = mask (beta > |p0 - p2|)   lo
-	movdqa	xmm7, xmm6
-	psubw	xmm7, [esp+432-352]
-	pabsw	xmm7, xmm7
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-256], xmm5	; slot 256 = mask (beta > |q0 - q2|)   lo
-	movdqa	xmm5, xmm3
-	pavgw	xmm5, xmm6
-	movdqa	[esp+432-304], xmm5	; slot 304 = (p0 + q0 + 1) >> 1   lo
-	movdqa	xmm5, [esp+432-400]
-	psubw	xmm5, [esp+432-288]	; subtracting an all-ones mask adds 1
-	psubw	xmm5, [esp+432-256]
-	movdqa	[esp+432-224], xmm5	; slot 224 = tc + 1 per satisfied side test above (clip bound for p0/q0)
-	movdqa	xmm5, xmm6
-	psubw	xmm5, xmm3
-	movdqa	[esp+432-32], xmm6	; slot 32 = q0 lo
-	psubw	xmm6, [esp+432-240]
-	movdqa	xmm7, xmm5
-	movdqa	[esp+432-384], xmm5	; slot 384 = q0 - p0   lo (zero now lives only in xmm0)
-	movdqa	xmm5, [esp+432-112]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm5, xmm7	; alpha > |q0 - p0|
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm4
-	pcmpgtw	xmm7, xmm6	; beta > |q0 - q1|
-
-	pand	xmm5, xmm7
-	movdqa	xmm6, xmm3
-	psubw	xmm6, xmm2
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm4
-	pcmpgtw	xmm7, xmm6	; beta > |p0 - p1|
-	movdqa	xmm6, [esp+432-400]
-	pand	xmm5, xmm7
-	movdqa	xmm7, xmm6
-	pcmpeqw	xmm6, xmm0
-	pcmpgtw	xmm7, xmm0
-	por	xmm7, xmm6	; tc >= 0
-	pand	xmm5, xmm7
-	movdqa	[esp+432-320], xmm5	; slot 320 = overall filter mask   lo
-	movd	xmm5, edx
-	movdqa	xmm6, xmm5
-	punpcklwd xmm6, xmm5
-	pshufd	xmm5, xmm6, 0
-	movdqa	[esp+432-336], xmm5	; slot 336 = 4 in all 8 words
-	movdqa	xmm5, [esp+432-224]
-	movdqa	[esp+432-368], xmm5
-	movdqa	xmm6, xmm0
-	psubw	xmm6, xmm5	; xmm6 = negated clip bound
-	movdqa	xmm5, [esp+432-384]
-	psllw	xmm5, 2
-	movdqa	xmm7, xmm2
-	psubw	xmm7, [esp+432-240]
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+432-336]
-	movdqa	xmm5, [esp+432-368]
-	psraw	xmm7, 3	; (4*(q0 - p0) + (p1 - q1) + 4) >> 3
-	pmaxsw	xmm6, xmm7
-	pminsw	xmm5, xmm6	; clipped to +/- the slot-224 bound
-
-	pand	xmm5, [esp+432-320]
-	movdqa	xmm6, [esp+432-400]
-	movdqa	[esp+432-64], xmm5	; slot 64 = masked p0/q0 delta   lo
-	movdqa	[esp+432-384], xmm6	; slot 384 = tc lo
-	movdqa	xmm5, xmm0
-	psubw	xmm5, xmm6
-	movdqa	[esp+432-368], xmm5	; slot 368 = -tc lo
-	movdqa	xmm6, xmm5
-	movdqa	xmm5, [esp+432-272]
-	paddw	xmm5, [esp+432-304]
-	movdqa	xmm7, xmm2
-	paddw	xmm7, xmm2
-	psubw	xmm5, xmm7
-	psraw	xmm5, 1	; (p2 + avg(p0,q0) - 2*p1) >> 1
-	pmaxsw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-384]
-	pminsw	xmm5, xmm6	; clipped to [-tc, tc]
-
-	pand	xmm5, [esp+432-320]
-	pand	xmm5, [esp+432-288]	; p1 is only changed where beta > |p0 - p2|
-	movdqa	xmm6, [esp+432-240]
-	movdqa	[esp+432-96], xmm5	; slot 96 = masked p1 delta   lo
-	movdqa	xmm5, [esp+432-352]
-	paddw	xmm5, [esp+432-304]
-	movdqa	xmm7, xmm6
-	paddw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-368]
-	psubw	xmm5, xmm7
-
-	movdqa	xmm7, [esp+496-208]
-	psraw	xmm5, 1	; (q2 + avg(p0,q0) - 2*q1) >> 1
-	pmaxsw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-400]
-	pminsw	xmm5, xmm6	; clipped to [-tc, tc]
-	pand	xmm5, [esp+432-320]
-	pand	xmm5, [esp+432-256]	; q1 is only changed where beta > |q0 - q2|
-	movdqa	xmm6, [esp+448-208]
-	punpckhbw xmm7, xmm0
-	movdqa	[esp+432-352], xmm7	; slot 352 = q1 hi (hi-half values reuse the lo-half slots from here on)
-
-	movdqa	xmm7, [esp+512-208]
-	punpckhbw xmm6, xmm0
-	movdqa	[esp+432-48], xmm5	; slot 48 = masked q1 delta   lo
-	movdqa	xmm5, [esp+432-208]
-	movdqa	[esp+432-368], xmm6	; slot 368 = p1 hi
-	movdqa	xmm6, [esp+464-208]
-	punpckhbw xmm7, xmm0
-	punpckhbw xmm5, xmm0
-	movdqa	[esp+432-384], xmm7	; slot 384 = q2 hi
-	punpckhbw xmm6, xmm0
-	movdqa	[esp+432-400], xmm6	; slot 400 = p0 hi
-
-	movdqa	xmm7, [esp+432-400]
-	movdqa	xmm6, [esp+480-208]
-	psubw	xmm7, xmm5
-	movdqa	[esp+432-16], xmm5	; slot 16 = p2 hi
-	pabsw	xmm7, xmm7
-	punpckhbw xmm6, xmm0	; xmm6 = q0 hi
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-288], xmm5	; slot 288 = mask (beta > |p0 - p2|)   hi
-
-	movdqa	xmm7, xmm6
-	psubw	xmm7, [esp+432-384]
-	pabsw	xmm7, xmm7
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-256], xmm5	; slot 256 = mask (beta > |q0 - q2|)   hi
-
-	movdqa	xmm5, [esp+432-400]
-	movdqa	[esp+432-80], xmm6	; slot 80 = q0 hi
-	pavgw	xmm5, xmm6
-	movdqa	[esp+432-304], xmm5	; slot 304 = (p0 + q0 + 1) >> 1   hi
-
-	movdqa	xmm5, xmm1
-	psubw	xmm5, [esp+432-288]
-	psubw	xmm5, [esp+432-256]
-	movdqa	[esp+432-224], xmm5	; slot 224 = p0/q0 clip bound   hi
-	movdqa	xmm5, xmm6
-	psubw	xmm5, [esp+432-400]
-	psubw	xmm6, [esp+432-352]
-	movdqa	[esp+432-272], xmm5	; slot 272 = q0 - p0   hi
-	movdqa	xmm7, xmm5
-	movdqa	xmm5, [esp+432-112]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm5, xmm7	; alpha > |q0 - p0|
-	movdqa	xmm7, xmm4
-	pabsw	xmm6, xmm6
-	pcmpgtw	xmm7, xmm6	; beta > |q0 - q1|
-	movdqa	xmm6, [esp+432-368]
-
-	pand	xmm5, xmm7
-	movdqa	xmm7, [esp+432-400]
-	psubw	xmm7, xmm6
-	psubw	xmm6, [esp+432-352]	; xmm6 = p1 - q1   hi
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7	; beta > |p0 - p1|
-	pand	xmm5, xmm4
-
-	paddw	xmm2, [esp+432-96]	; xmm2 = new p1 lo
-	movdqa	xmm4, xmm1
-	pcmpgtw	xmm4, xmm0
-	movdqa	xmm7, xmm1
-	pcmpeqw	xmm7, xmm0
-	por	xmm4, xmm7	; tc >= 0
-	pand	xmm5, xmm4
-	movdqa	xmm4, [esp+432-224]
-	movdqa	[esp+432-320], xmm5	; slot 320 = overall filter mask   hi
-	movdqa	xmm5, [esp+432-272]
-	movdqa	xmm7, xmm0
-	psubw	xmm7, xmm4	; xmm7 = negated clip bound
-	psubw	xmm0, xmm1	; xmm0 = -tc hi (xmm0 is no longer zero)
-	psllw	xmm5, 2
-	paddw	xmm6, xmm5
-	paddw	xmm6, [esp+432-336]
-	movdqa	xmm5, [esp+432-368]
-	movdqa	[esp+432-336], xmm0	; slot 336 = -tc hi
-	psraw	xmm6, 3	; (4*(q0 - p0) + (p1 - q1) + 4) >> 3
-	pmaxsw	xmm7, xmm6
-	pminsw	xmm4, xmm7
-	pand	xmm4, [esp+432-320]
-	movdqa	xmm6, xmm0
-	movdqa	xmm0, [esp+432-16]
-	paddw	xmm0, [esp+432-304]
-	movdqa	[esp+432-272], xmm4	; slot 272 = masked p0/q0 delta   hi
-	movdqa	xmm4, [esp+432-368]
-	paddw	xmm4, xmm4
-	psubw	xmm0, xmm4
-
-	movdqa	xmm4, [esp+432-64]
-	psraw	xmm0, 1	; (p2 + avg(p0,q0) - 2*p1) >> 1   hi
-	pmaxsw	xmm6, xmm0
-	movdqa	xmm0, [esp+432-400]
-	movdqa	xmm7, xmm1
-	pminsw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-320]	; xmm6 = hi filter mask, reused for the q1 delta below
-	pand	xmm7, xmm6
-	pand	xmm7, [esp+432-288]
-	paddw	xmm5, xmm7	; xmm5 = new p1 hi
-	packuswb xmm2, xmm5	; xmm2 = new p1 row, saturated to bytes
-	movdqa	xmm5, [esp+432-272]
-	paddw	xmm0, xmm5	; p0 hi + delta
-	paddw	xmm3, xmm4	; p0 lo + delta
-	packuswb xmm3, xmm0	; xmm3 = new p0 row
-
-	movdqa	xmm0, [esp+432-32]
-	psubw	xmm0, xmm4	; q0 lo - delta
-	movdqa	xmm4, [esp+432-80]
-	psubw	xmm4, xmm5	; q0 hi - delta
-
-	movdqa	xmm5, [esp+432-240]
-	paddw	xmm5, [esp+432-48]	; xmm5 = new q1 lo
-	packuswb xmm0, xmm4
-	movdqa	xmm4, [esp+432-384]
-	paddw	xmm4, [esp+432-304]
-	movdqa	[esp+480-208], xmm0	; park the new q0 row in the raw-q0 slot
-	movdqa	xmm0, [esp+432-352]
-	movdqa	xmm7, xmm0
-	paddw	xmm0, xmm0
-
-	mov	ecx, dword [esp+432-408]	; ecx = p0 row pointer
-
-	mov	edx, dword [esp+432-404]	; edx = q1 row pointer
-	psubw	xmm4, xmm0
-	movdqa	xmm0, [esp+432-336]
-	movdqa	[edi], xmm2	; store new p1 row (edi still holds pPix - 2*iStride)
-	psraw	xmm4, 1	; (q2 + avg(p0,q0) - 2*q1) >> 1   hi
-	pmaxsw	xmm0, xmm4
-	pminsw	xmm1, xmm0
-	movdqa	xmm0, [esp+480-208]	; xmm0 = new q0 row
-
-	pop	edi
-	pand	xmm1, xmm6
-	pand	xmm1, [esp+428-256]	; slot 256 again; the base is 428 because edi has been popped
-	movdqa	[ecx], xmm3	; store new p0 row
-	paddw	xmm7, xmm1	; xmm7 = new q1 hi
-	pop	esi
-	packuswb xmm5, xmm7
-	movdqa	[eax], xmm0	; store new q0 row
-	movdqa	[edx], xmm5	; store new q1 row
-	pop	ebx
-	mov	esp, ebp
-	pop	ebp
-	ret
-
-
-;*******************************************************************************
-;    void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, 
-;                                 int32_t iBeta)
-;*******************************************************************************
-
-WELS_EXTERN  DeblockLumaEq4V_sse2
-  
-ALIGN  16
-
-DeblockLumaEq4V_sse2:
-
-	push	ebp
-	mov	ebp, esp
-	and	esp, -16				; fffffff0H
-	sub	esp, 628				; 00000274H
-	mov	eax, dword [ebp+8]
-	mov	ecx, dword [ebp+12]
-	push	ebx
-	push	esi
-
-	lea	edx, [ecx*4]
-	pxor	xmm0, xmm0
-	movdqa	xmm2, xmm0
-
-	movdqa	xmm0, [ecx+eax]
-	mov	esi, eax
-	sub	esi, edx
-	movdqa	xmm3, [esi]
-	movdqa	xmm5, [eax]
-	push	edi
-	lea	edi, [ecx+ecx]
-	lea	ebx, [ecx+ecx*2]
-	mov	dword [esp+640-600], edi
-	mov	esi, eax
-	sub	esi, edi
-	movdqa	xmm1, [esi]
-	movdqa	 [esp+720-272], xmm0
-	mov	edi, eax
-	sub	edi, ecx
-	movdqa	xmm4, [edi]
-	add	ecx, eax
-	mov	dword [esp+640-596], ecx
-
-	mov	ecx, dword [esp+640-600]
-	movdqa	xmm0, [ecx+eax]
-	movdqa	 [esp+736-272], xmm0
-
-	movdqa	xmm0, [eax+ebx]
-	mov	edx, eax
-	sub	edx, ebx
-
-	movsx	ebx, word [ebp+16]
-	movdqa	xmm6, [edx]
-	add	ecx, eax
-	movdqa	 [esp+752-272], xmm0
-	movd	xmm0, ebx
-
-	movsx	ebx, word [ebp+20]
-	movdqa	xmm7, xmm0
-	punpcklwd xmm7, xmm0
-	pshufd	xmm0, xmm7, 0
-	movdqa	 [esp+640-320], xmm0
-	movd	xmm0, ebx
-	movdqa	xmm7, xmm0
-	punpcklwd xmm7, xmm0
-	pshufd	xmm0, xmm7, 0
-
-	movdqa	xmm7, [esp+736-272]
-	punpcklbw xmm7, xmm2
-	movdqa	 [esp+640-416], xmm7
-	movdqa	 [esp+640-512], xmm0
-	movdqa	xmm0, xmm1
-	movdqa	 [esp+672-272], xmm1
-	movdqa	xmm1, xmm4
-	movdqa	 [esp+704-272], xmm5
-	punpcklbw xmm5, xmm2
-	punpcklbw xmm1, xmm2
-
-	movdqa	xmm7, xmm5
-	psubw	xmm7, xmm1
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-560], xmm7
-	punpcklbw xmm0, xmm2
-	movdqa	 [esp+688-272], xmm4
-	movdqa	xmm4, [esp+720-272]
-	movdqa	 [esp+640-480], xmm0
-
-	movdqa	xmm7, xmm1
-	psubw	xmm7, xmm0
-
-	movdqa	xmm0, [esp+640-512]
-	pabsw	xmm7, xmm7
-	punpcklbw xmm4, xmm2
-	pcmpgtw	xmm0, xmm7
-	movdqa	 [esp+640-384], xmm4
-	movdqa	xmm7, xmm5
-	psubw	xmm7, xmm4
-	movdqa	xmm4, [esp+640-512]
-	movdqa	 [esp+656-272], xmm6
-	punpcklbw xmm6, xmm2
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-48], xmm2
-	movdqa	 [esp+640-368], xmm6
-	movdqa	 [esp+640-144], xmm1
-	movdqa	 [esp+640-400], xmm5
-	pcmpgtw	xmm4, xmm7
-	pand	xmm0, xmm4
-	movdqa	xmm4, [esp+640-320]
-	pcmpgtw	xmm4, [esp+640-560]
-	pand	xmm0, xmm4
-
-	mov	ebx, 2
-	movsx	ebx, bx
-	movd	xmm4, ebx
-	movdqa	xmm7, xmm4
-	punpcklwd xmm7, xmm4
-	movdqa	xmm4, [esp+640-320]
-	psraw	xmm4, 2
-	pshufd	xmm7, xmm7, 0
-	paddw	xmm4, xmm7
-	movdqa	 [esp+640-576], xmm4
-	pcmpgtw	xmm4, [esp+640-560]
-	movdqa	 [esp+640-560], xmm4
-
-	movdqa	xmm4, [esp+640-512]
-	movdqa	 [esp+640-624], xmm7
-	movdqa	xmm7, xmm1
-	psubw	xmm7, xmm6
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7
-
-	pand	xmm4, [esp+640-560]
-	movdqa	 [esp+640-544], xmm4
-	movdqa	xmm4, [esp+640-512]
-	movdqa	xmm7, xmm5
-	psubw	xmm7, [esp+640-416]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7
-
-	pand	xmm4, [esp+640-560]
-	movdqa	 [esp+640-560], xmm4
-
-	movdqa	xmm4, [esp+640-544]
-	pandn	xmm4, xmm6
-	movdqa	 [esp+640-16], xmm4
-	mov	ebx, 4
-	movsx	ebx, bx
-	movd	xmm4, ebx
-	movdqa	xmm7, xmm4
-	punpcklwd xmm7, xmm4
-	movdqa	xmm4, xmm3
-	punpcklbw xmm4, xmm2
-	psllw	xmm4, 1
-	paddw	xmm4, xmm6
-	paddw	xmm4, xmm6
-	paddw	xmm4, xmm6
-	paddw	xmm4, [esp+640-480]
-
-	movdqa	xmm6, [esp+640-560]
-	pshufd	xmm7, xmm7, 0
-	paddw	xmm4, xmm1
-	movdqa	 [esp+640-592], xmm7
-	paddw	xmm4, xmm5
-	paddw	xmm4, xmm7
-	movdqa	xmm7, [esp+640-416]
-	pandn	xmm6, xmm7
-	movdqa	 [esp+640-80], xmm6
-	movdqa	xmm6, [esp+752-272]
-	punpcklbw xmm6, xmm2
-	psllw	xmm6, 1
-	paddw	xmm6, xmm7
-	paddw	xmm6, xmm7
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-384]
-
-	movdqa	xmm7, [esp+640-480]
-	paddw	xmm6, xmm5
-	paddw	xmm6, xmm1
-	paddw	xmm6, [esp+640-592]
-	psraw	xmm6, 3
-	pand	xmm6, [esp+640-560]
-	movdqa	 [esp+640-112], xmm6
-	movdqa	xmm6, [esp+640-544]
-	pandn	xmm6, xmm7
-	movdqa	 [esp+640-336], xmm6
-	movdqa	xmm6, [esp+640-544]
-	movdqa	 [esp+640-528], xmm6
-	movdqa	xmm6, [esp+640-368]
-	paddw	xmm6, xmm7
-	movdqa	xmm7, xmm1
-	psraw	xmm4, 3
-	pand	xmm4, [esp+640-544]
-	paddw	xmm7, xmm5
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-624]
-	movdqa	xmm7, [esp+640-528]
-
-	paddw	xmm5, xmm1
-	psraw	xmm6, 2
-	pand	xmm7, xmm6
-
-	movdqa	xmm6, [esp+640-384]
-	movdqa	 [esp+640-64], xmm7
-	movdqa	xmm7, [esp+640-560]
-	pandn	xmm7, xmm6
-	movdqa	 [esp+640-304], xmm7
-	movdqa	xmm7, [esp+640-560]
-	movdqa	 [esp+640-528], xmm7
-	movdqa	xmm7, [esp+640-416]
-	paddw	xmm7, xmm6
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+640-624]
-	movdqa	xmm5, [esp+640-528]
-	psraw	xmm7, 2
-	pand	xmm5, xmm7
-	movdqa	 [esp+640-32], xmm5
-
-	movdqa	xmm5, [esp+640-544]
-	movdqa	 [esp+640-528], xmm5
-	movdqa	xmm5, [esp+640-480]
-	movdqa	xmm7, xmm5
-	paddw	xmm7, xmm5
-	movdqa	xmm5, xmm1
-	paddw	xmm5, xmm6
-	paddw	xmm6, [esp+640-592]
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+640-624]
-	movdqa	xmm5, [esp+640-528]
-	psraw	xmm7, 2
-	pandn	xmm5, xmm7
-	movdqa	xmm7, [esp+640-480]
-	paddw	xmm7, xmm1
-	paddw	xmm7, [esp+640-400]
-	movdqa	xmm1, [esp+640-544]
-	movdqa	 [esp+640-352], xmm5
-	movdqa	xmm5, [esp+640-368]
-	psllw	xmm7, 1
-	paddw	xmm7, xmm6
-	paddw	xmm5, xmm7
-
-	movdqa	xmm7, [esp+640-400]
-	psraw	xmm5, 3
-	pand	xmm1, xmm5
-	movdqa	xmm5, [esp+640-480]
-	movdqa	 [esp+640-96], xmm1
-	movdqa	xmm1, [esp+640-560]
-	movdqa	 [esp+640-528], xmm1
-	movdqa	xmm1, [esp+640-384]
-	movdqa	xmm6, xmm1
-	paddw	xmm6, xmm1
-	paddw	xmm1, [esp+640-400]
-	paddw	xmm1, [esp+640-144]
-	paddw	xmm7, xmm5
-	paddw	xmm5, [esp+640-592]
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-624]
-	movdqa	xmm7, [esp+640-528]
-	psraw	xmm6, 2
-	psllw	xmm1, 1
-	paddw	xmm1, xmm5
-
-	movdqa	xmm5, [esp+656-272]
-	pandn	xmm7, xmm6
-	movdqa	xmm6, [esp+640-416]
-	paddw	xmm6, xmm1
-	movdqa	xmm1, [esp+640-560]
-	psraw	xmm6, 3
-	pand	xmm1, xmm6
-
-	movdqa	xmm6, [esp+704-272]
-	movdqa	 [esp+640-128], xmm1
-	movdqa	xmm1, [esp+672-272]
-	punpckhbw xmm1, xmm2
-	movdqa	 [esp+640-448], xmm1
-	movdqa	xmm1, [esp+688-272]
-	punpckhbw xmm1, xmm2
-	punpckhbw xmm6, xmm2
-	movdqa	 [esp+640-288], xmm7
-	punpckhbw xmm5, xmm2
-	movdqa	 [esp+640-496], xmm1
-	movdqa	 [esp+640-432], xmm6
-
-	movdqa	xmm7, [esp+720-272]
-	punpckhbw xmm7, xmm2
-	movdqa	 [esp+640-464], xmm7
-
-	movdqa	xmm7, [esp+736-272]
-	punpckhbw xmm7, xmm2
-	movdqa	 [esp+640-528], xmm7
-
-	movdqa	xmm7, xmm6
-
-	psubw	xmm6, [esp+640-464]
-	psubw	xmm7, xmm1
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-560], xmm7
-	por	xmm4, [esp+640-16]
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm1
-	psubw	xmm7, [esp+640-448]
-
-	movdqa	xmm1, [esp+640-512]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm1, xmm7
-	movdqa	xmm7, [esp+640-512]
-	pcmpgtw	xmm7, xmm6
-	movdqa	xmm6, [esp+640-320]
-	pand	xmm1, xmm7
-	movdqa	xmm7, [esp+640-560]
-	pcmpgtw	xmm6, xmm7
-	pand	xmm1, xmm6
-
-	movdqa	xmm6, [esp+640-576]
-	pcmpgtw	xmm6, xmm7
-
-	movdqa	xmm7, [esp+640-496]
-	punpckhbw xmm3, xmm2
-	movdqa	 [esp+640-560], xmm6
-	movdqa	xmm6, [esp+640-512]
-	psubw	xmm7, xmm5
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm6, xmm7
-
-	pand	xmm6, [esp+640-560]
-	movdqa	xmm7, [esp+640-432]
-	psubw	xmm7, [esp+640-528]
-
-	psllw	xmm3, 1
-	movdqa	 [esp+640-544], xmm6
-	movdqa	xmm6, [esp+640-512]
-
-	movdqa	xmm2, [esp+640-544]
-	paddw	xmm3, xmm5
-	paddw	xmm3, xmm5
-	paddw	xmm3, xmm5
-	paddw	xmm3, [esp+640-448]
-	paddw	xmm3, [esp+640-496]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm6, xmm7
-	pand	xmm6, [esp+640-560]
-	movdqa	 [esp+640-560], xmm6
-
-	movdqa	xmm6, xmm0
-	pand	xmm6, xmm4
-	movdqa	xmm4, xmm0
-	pandn	xmm4, [esp+640-368]
-	por	xmm6, xmm4
-	movdqa	xmm4, [esp+640-432]
-	paddw	xmm3, xmm4
-	paddw	xmm3, [esp+640-592]
-	psraw	xmm3, 3
-	pand	xmm3, xmm2
-	pandn	xmm2, xmm5
-	por	xmm3, xmm2
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm3
-	movdqa	xmm3, [esp+640-64]
-	por	xmm3, [esp+640-336]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm5
-	por	xmm7, xmm2
-
-	movdqa	xmm2, xmm0
-	pand	xmm2, xmm3
-	movdqa	xmm3, xmm0
-	pandn	xmm3, [esp+640-480]
-	por	xmm2, xmm3
-	packuswb xmm6, xmm7
-	movdqa	 [esp+640-336], xmm2
-	movdqa	 [esp+656-272], xmm6
-	movdqa	xmm6, [esp+640-544]
-	movdqa	xmm2, xmm5
-	paddw	xmm2, [esp+640-448]
-	movdqa	xmm3, xmm1
-	movdqa	xmm7, [esp+640-496]
-	paddw	xmm7, xmm4
-	paddw	xmm2, xmm7
-	paddw	xmm2, [esp+640-624]
-	movdqa	xmm7, [esp+640-544]
-	psraw	xmm2, 2
-	pand	xmm6, xmm2
-	movdqa	xmm2, [esp+640-448]
-	pandn	xmm7, xmm2
-	por	xmm6, xmm7
-	pand	xmm3, xmm6
-	movdqa	xmm6, xmm1
-	pandn	xmm6, xmm2
-	paddw	xmm2, [esp+640-496]
-	paddw	xmm2, xmm4
-	por	xmm3, xmm6
-	movdqa	xmm6, [esp+640-336]
-	packuswb xmm6, xmm3
-	psllw	xmm2, 1
-	movdqa	 [esp+672-272], xmm6
-	movdqa	xmm6, [esp+640-96]
-	por	xmm6, [esp+640-352]
-
-	movdqa	xmm3, xmm0
-	pand	xmm3, xmm6
-	movdqa	xmm6, xmm0
-	pandn	xmm6, [esp+640-144]
-	por	xmm3, xmm6
-	movdqa	xmm6, [esp+640-544]
-	movdqa	 [esp+640-352], xmm3
-	movdqa	xmm3, [esp+640-464]
-	paddw	xmm3, [esp+640-592]
-	paddw	xmm2, xmm3
-	movdqa	xmm3, [esp+640-448]
-	paddw	xmm5, xmm2
-	movdqa	xmm2, [esp+640-496]
-	psraw	xmm5, 3
-	pand	xmm6, xmm5
-	movdqa	xmm5, [esp+640-464]
-	paddw	xmm2, xmm5
-	paddw	xmm5, [esp+640-432]
-	movdqa	xmm4, xmm3
-	paddw	xmm4, xmm3
-	paddw	xmm4, xmm2
-	paddw	xmm4, [esp+640-624]
-	movdqa	xmm2, [esp+640-544]
-	paddw	xmm3, [esp+640-592]
-	psraw	xmm4, 2
-	pandn	xmm2, xmm4
-	por	xmm6, xmm2
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm6
-	movdqa	xmm6, [esp+640-496]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm6
-	por	xmm7, xmm2
-	movdqa	xmm2, [esp+640-352]
-	packuswb xmm2, xmm7
-	movdqa	 [esp+688-272], xmm2
-	movdqa	xmm2, [esp+640-128]
-	por	xmm2, [esp+640-288]
-
-	movdqa	xmm4, xmm0
-	pand	xmm4, xmm2
-	paddw	xmm5, xmm6
-	movdqa	xmm2, xmm0
-	pandn	xmm2, [esp+640-400]
-	por	xmm4, xmm2
-	movdqa	xmm2, [esp+640-528]
-	psllw	xmm5, 1
-	paddw	xmm5, xmm3
-	movdqa	xmm3, [esp+640-560]
-	paddw	xmm2, xmm5
-	psraw	xmm2, 3
-	movdqa	 [esp+640-288], xmm4
-	movdqa	xmm4, [esp+640-560]
-	pand	xmm4, xmm2
-	movdqa	xmm2, [esp+640-464]
-	movdqa	xmm5, xmm2
-	paddw	xmm5, xmm2
-	movdqa	xmm2, [esp+640-432]
-	paddw	xmm2, [esp+640-448]
-	movdqa	xmm7, xmm1
-	paddw	xmm5, xmm2
-	paddw	xmm5, [esp+640-624]
-	movdqa	xmm6, [esp+640-560]
-	psraw	xmm5, 2
-	pandn	xmm3, xmm5
-	por	xmm4, xmm3
-	movdqa	xmm3, [esp+640-32]
-	por	xmm3, [esp+640-304]
-	pand	xmm7, xmm4
-	movdqa	xmm4, [esp+640-432]
-	movdqa	xmm5, [esp+640-464]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm4
-	paddw	xmm4, [esp+640-496]
-	por	xmm7, xmm2
-	movdqa	xmm2, [esp+640-288]
-	packuswb xmm2, xmm7
-	movdqa	 [esp+704-272], xmm2
-
-	movdqa	xmm2, xmm0
-	pand	xmm2, xmm3
-	movdqa	xmm3, xmm0
-	pandn	xmm3, [esp+640-384]
-	por	xmm2, xmm3
-	movdqa	 [esp+640-304], xmm2
-	movdqa	xmm2, [esp+640-528]
-	movdqa	xmm3, xmm2
-	paddw	xmm3, [esp+640-464]
-	paddw	xmm3, xmm4
-	paddw	xmm3, [esp+640-624]
-	psraw	xmm3, 2
-	pand	xmm6, xmm3
-	movdqa	xmm3, [esp+640-560]
-	movdqa	xmm4, xmm3
-	pandn	xmm4, xmm5
-	por	xmm6, xmm4
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm6
-	movdqa	xmm6, [esp+640-304]
-	movdqa	xmm4, xmm1
-	pandn	xmm4, xmm5
-	por	xmm7, xmm4
-
-	movdqa	xmm4, xmm0
-	pandn	xmm0, [esp+640-416]
-	packuswb xmm6, xmm7
-	movdqa	xmm7, [esp+640-112]
-	por	xmm7, [esp+640-80]
-	pand	xmm4, xmm7
-	por	xmm4, xmm0
-	movdqa	xmm0, [esp+752-272]
-	punpckhbw xmm0, [esp+640-48]
-	psllw	xmm0, 1
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm5
-	paddw	xmm0, [esp+640-432]
-	paddw	xmm0, [esp+640-496]
-	paddw	xmm0, [esp+640-592]
-	psraw	xmm0, 3
-	pand	xmm0, xmm3
-	movdqa	xmm7, xmm1
-	pandn	xmm3, xmm2
-	por	xmm0, xmm3
-	pand	xmm7, xmm0
-
-	movdqa	xmm0, [esp+656-272]
-	movdqa	 [edx], xmm0
-
-	movdqa	xmm0, [esp+672-272]
-
-	mov	edx, dword [esp+640-596]
-	movdqa	 [esi], xmm0
-	movdqa	xmm0, [esp+688-272]
-	movdqa	 [edi], xmm0
-	movdqa	xmm0, [esp+704-272]
-
-	pop	edi
-	pandn	xmm1, xmm2
-	movdqa	 [eax], xmm0
-	por	xmm7, xmm1
-	pop	esi
-	packuswb xmm4, xmm7
-	movdqa	 [edx], xmm6
-	movdqa	 [ecx], xmm4
-	pop	ebx
-	mov	esp, ebp
-	pop	ebp
-	ret
-  
-    
-;********************************************************************************
-;
-;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
-;
-;   Gathers a tile of 16 rows x 8 bytes starting at pPixY, runs it through the
-;   SSE2_TransTwo8x8B macro and stores the eight resulting 16-byte vectors
-;   (80h bytes in total) to pDst.
-;
-;   pPixY   - address of row 0; 8 bytes are read from each of rows 0..15
-;   iStride - byte distance between consecutive rows
-;   pDst    - output buffer; it is written with movdqa, so it has to be
-;             16-byte aligned
-;
-;   Arguments are read from the stack. eax, ecx, edx and xmm0-xmm7 are
-;   overwritten; ebx, ebp and esp are restored before returning.
-;
-;********************************************************************************
-
-WELS_EXTERN  DeblockLumaTransposeH2V_sse2
-
-ALIGN  16
-
-DeblockLumaTransposeH2V_sse2:
-    push    ebp
-    push    ebx
-    mov     ebp,   esp
-    ; carve out one 16-byte aligned scratch slot at [esp]; it parks xmm0 below
-    ; and is handed to SSE2_TransTwo8x8B as its memory operand
-    and     esp,0FFFFFFF0h
-    sub     esp,   10h    
-    
-    ; two registers were pushed, so the first argument sits at [ebp + 0Ch]:
-    ; eax = pPixY (row 0), ecx = iStride, edx = row 8, ebx = 3 * iStride
-    mov     eax,   [ebp + 0Ch]  
-    mov     ecx,   [ebp + 10h]
-    lea     edx,   [eax + ecx * 8]
-    lea     ebx,   [ecx*3]
-    
-    ; xmmN = row N in the low qword, row N+8 in the high qword (N = 0..3);
-    ; xmm7 is only a temporary here
-    movq    xmm0,  [eax] 
-    movq    xmm7,  [edx]
-    punpcklqdq   xmm0,  xmm7  
-    movq    xmm1,  [eax + ecx]
-    movq    xmm7,  [edx + ecx]
-    punpcklqdq   xmm1,  xmm7
-    movq    xmm2,  [eax + ecx*2] 
-    movq    xmm7,  [edx + ecx*2]
-    punpcklqdq   xmm2,  xmm7
-    movq    xmm3,  [eax + ebx]
-    movq    xmm7,  [edx + ebx]
-    punpcklqdq   xmm3,  xmm7
-    
-    ; advance both row pointers by four rows and repeat for rows 4..6 / 12..14
-    lea     eax,   [eax + ecx * 4]
-    lea     edx,   [edx + ecx * 4]
-    movq    xmm4,  [eax] 
-    movq    xmm7,  [edx]
-    punpcklqdq   xmm4,  xmm7  
-    movq    xmm5,  [eax + ecx]
-    movq    xmm7,  [edx + ecx]
-    punpcklqdq   xmm5,  xmm7
-    movq    xmm6,  [eax + ecx*2] 
-    movq    xmm7,  [edx + ecx*2]
-    punpcklqdq   xmm6,  xmm7
-    
-    ; xmm7 must now receive rows 7 / 15 itself, so xmm0 is spilled to [esp]
-    ; and borrowed as the temporary for this last pair
-    movdqa  [esp],   xmm0
-    movq    xmm7,  [eax + ebx]
-    movq    xmm0,  [edx + ebx]
-    punpcklqdq   xmm7,  xmm0
-    movdqa  xmm0,   [esp]
-    
-    ; macro is defined outside this file section; NOTE(review): presumably it
-    ; transposes the two stacked 8x8 byte tiles in place - confirm in asm_inc.asm
-    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
-    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-    
-    ; store in the pOut order listed above (mK is xmm(K-1)): 16 bytes per vector
-    mov    eax,   [ebp + 14h]
-    movdqa  [eax],    xmm4 
-    movdqa  [eax + 10h],  xmm2
-    movdqa  [eax + 20h],  xmm3
-    movdqa  [eax + 30h],  xmm7
-    movdqa  [eax + 40h],  xmm5
-    movdqa  [eax + 50h],  xmm1
-    movdqa  [eax + 60h],  xmm6
-    movdqa  [eax + 70h],  xmm0   
-    
-    ; drop the aligned scratch area and restore the saved registers
-    mov     esp,   ebp
-    pop     ebx
-    pop     ebp
-    ret
-    
-    
-    
-;*******************************************************************************************
-;
-;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
-;
-;   Loads eight 16-byte vectors (80h bytes) from pSrc, runs them through the
-;   SSE2_TransTwo8x8B macro and scatters the result as 16 rows x 8 bytes
-;   starting at pPixY.
-;
-;   pPixY   - address of destination row 0; 8 bytes are written to each of rows 0..15
-;   iStride - byte distance between consecutive destination rows
-;   pSrc    - source buffer; it is read with movdqa, so it has to be 16-byte aligned
-;
-;   Arguments are read from the stack. eax, ecx, edx and xmm0-xmm7 are overwritten.
-;
-;*******************************************************************************************
-
-WELS_EXTERN   DeblockLumaTransposeV2H_sse2
-
-ALIGN  16
-
-DeblockLumaTransposeV2H_sse2:
-    push     ebp
-    mov      ebp,   esp
-    
-    ; 16-byte aligned scratch slot at [esp] for the transpose macro
-    and     esp,  0FFFFFFF0h
-    sub     esp,   10h  
-    
-    ; eax = pSrc, ecx = iStride, edx = pPixY
-    mov      eax,   [ebp + 10h]  
-    mov      ecx,   [ebp + 0Ch]
-    mov      edx,   [ebp + 08h]
-      
-    movdqa   xmm0,  [eax]
-    movdqa   xmm1,  [eax + 10h]
-    movdqa   xmm2,  [eax + 20h]
-    movdqa   xmm3,	[eax + 30h]
-    movdqa   xmm4,	[eax + 40h]
-    movdqa   xmm5,	[eax + 50h]
-    movdqa   xmm6,	[eax + 60h]
-    movdqa   xmm7,	[eax + 70h]
-    
-    ; macro is defined outside this file section; NOTE(review): presumably it
-    ; transposes the two stacked 8x8 byte tiles in place - confirm in asm_inc.asm
-    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
-    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-    
-    ; eax = 3 * iStride (pSrc is no longer needed)
-    lea      eax,   [ecx * 3]
-    
-    ; low qwords of the outputs, in pOut order, go to rows 0..3 ...
-    movq     [edx],  xmm4 
-    movq     [edx + ecx],  xmm2
-    movq     [edx + ecx*2],  xmm3
-    movq     [edx + eax],  xmm7
-    
-    ; ... and rows 4..7
-    lea      edx,   [edx + ecx*4]
-    movq     [edx],  xmm5 
-    movq     [edx + ecx],  xmm1
-    movq     [edx + ecx*2],  xmm6
-    movq     [edx + eax],  xmm0    
-    
-    ; bring the high qword of every output down into the low qword
-    psrldq    xmm4,   8
-    psrldq    xmm2,   8
-    psrldq    xmm3,   8
-    psrldq    xmm7,   8
-    psrldq    xmm5,   8
-    psrldq    xmm1,   8
-    psrldq    xmm6,   8
-    psrldq    xmm0,   8
-    
-    ; high qwords go to rows 8..11 ...
-    lea       edx,  [edx + ecx*4]
-    movq     [edx],  xmm4 
-    movq     [edx + ecx],  xmm2
-    movq     [edx + ecx*2],  xmm3
-    movq     [edx + eax],  xmm7
-    
-    ; ... and rows 12..15
-    lea      edx,   [edx + ecx*4]
-    movq     [edx],  xmm5 
-    movq     [edx + ecx],  xmm1
-    movq     [edx + ecx*2],  xmm6
-    movq     [edx + eax],  xmm0   
-    
-    
-    ; drop the aligned scratch area and restore ebp
-    ; NOTE(review): no `ret` appears among the removed lines after `pop ebp`;
-    ; presumably the final `ret` is carried as a context line of the patch - confirm
-    mov      esp,   ebp
-    pop      ebp
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  deblock.asm
+;*
+;*  Abstract
+;*      edge loop
+;*
+;*  History
+;*      08/07/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+BITS 32
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+
+%ifdef FORMAT_COFF
+SECTION .rodata pData
+%else
+SECTION .rodata align=16
+%endif
+
+SECTION .text
+
+;********************************************************************************
+;  void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;                             int32_t iAlpha, int32_t iBeta)
+;
+;  Filters the two rows on either side of a horizontal boundary in a Cb and a
+;  Cr plane at once, 8 pixels per plane. With row[k] meaning the 8 bytes at
+;  pPix + k*iStride, every 16-bit lane computes
+;      row[-1]' = (2*row[-2] + row[-1] + row[+1] + 2) >> 2
+;      row[ 0]' = (2*row[+1] + row[ 0] + row[-2] + 2) >> 2
+;  and the new value replaces the old one only in lanes where
+;      |row[-1] - row[0]| < iAlpha, |row[-2] - row[-1]| < iBeta and
+;      |row[+1] - row[0]| < iBeta.
+;  Only the low 16 bits of iAlpha / iBeta take part in the comparisons.
+;  Rows -1 and 0 of both planes are written back; rows -2 and +1 are only read.
+;  eax, ecx, edx and xmm0-xmm7 are overwritten; esi and edi are saved/restored.
+;********************************************************************************
+WELS_EXTERN   DeblockChromaEq4V_sse2
+
+ALIGN  16
+DeblockChromaEq4V_sse2:
+  ; 68h bytes of locals plus the two 4-byte pushes further down add up to 70h,
+  ; so esp is 16-byte aligned again by the time the first movdqa [esp+..] runs.
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h
+  sub         esp,68h
+  mov         edx,[ebp+10h]      ;  iStride
+  mov         eax,[ebp+8]        ;  pPixCb
+  mov         ecx,[ebp+0Ch]      ;  pPixCr
+  ; Cr rows 0 and +1 (8 bytes each); merged with the matching Cb rows below.
+  movq        xmm4,[ecx]
+  movq        xmm5,[edx+ecx]
+  push        esi
+  push        edi
+  ; xmm1 = row -2: Cb in the low qword, Cr in the high qword.
+  lea         esi,[edx+edx]
+  mov         edi,eax
+  sub         edi,esi
+  movq        xmm1,[edi]
+  mov         edi,ecx
+  sub         edi,esi
+  movq        xmm2,[edi]
+  punpcklqdq  xmm1,xmm2
+  ; xmm2 = row -1 (Cb | Cr). esi = pPixCb - iStride and edi = pPixCr - iStride
+  ; are not modified again and serve as store addresses at the end.
+  mov         esi,eax
+  sub         esi,edx
+  movq        xmm2,[esi]
+  mov         edi,ecx
+  sub         edi,edx
+  movq        xmm3,[edi]
+  punpcklqdq  xmm2,xmm3
+  ; xmm3 = row 0 (Cb | Cr), xmm4 = row +1 (Cb | Cr).
+  movq        xmm3,[eax]
+  punpcklqdq  xmm3,xmm4
+  movq        xmm4,[edx+eax]
+  mov       edx, [ebp + 14h]     ;  iAlpha (iStride is no longer needed)
+  punpcklqdq  xmm4,xmm5
+  ; Broadcast the low word of iAlpha to all eight words of xmm5 ...
+  movd        xmm5,edx
+  mov       edx, [ebp + 18h]     ;  iBeta
+  pxor        xmm0,xmm0          ;  zero, used to widen bytes to words
+  movdqa      xmm6,xmm5
+  punpcklwd   xmm6,xmm5
+  pshufd      xmm5,xmm6,0
+  ; ... and the low word of iBeta to all eight words of xmm6.
+  movd        xmm6,edx
+  movdqa      xmm7,xmm6
+  punpcklwd   xmm7,xmm6
+  pshufd      xmm6,xmm7,0
+  ; Zero-extend every row to 16-bit words (low half = Cb, high half = Cr):
+  ;   row -2: Cb -> [esp+60h], Cr -> [esp+40h]
+  ;   row -1: Cb -> [esp+10h], Cr -> xmm2
+  ;   row  0: Cb -> [esp+50h], Cr -> [esp+30h]
+  ;   row +1: Cb -> xmm7,      Cr -> [esp+20h]
+  movdqa      xmm7,xmm1
+  punpckhbw   xmm1,xmm0
+  punpcklbw   xmm7,xmm0
+  movdqa      [esp+40h],xmm1
+  movdqa      [esp+60h],xmm7
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm7,xmm0
+  movdqa      [esp+10h],xmm7
+  movdqa      xmm7,xmm3
+  punpcklbw   xmm7,xmm0
+  punpckhbw   xmm3,xmm0
+  movdqa      [esp+50h],xmm7
+  movdqa      xmm7,xmm4
+  punpckhbw   xmm4,xmm0
+  punpckhbw   xmm2,xmm0
+  punpcklbw   xmm7,xmm0
+  movdqa      [esp+30h],xmm3
+  ; Cb filter mask -> xmm0 (xmm3 = Cb row -1 from here on):
+  ;   alpha > |row[-1] - row[0]|
+  movdqa      xmm3,[esp+10h]
+  movdqa      xmm1,xmm3
+  psubw       xmm1,[esp+50h]
+  pabsw       xmm1,xmm1
+  movdqa      [esp+20h],xmm4
+  movdqa      xmm0,xmm5
+  pcmpgtw     xmm0,xmm1
+  ;   beta > |row[-2] - row[-1]|
+  movdqa      xmm1,[esp+60h]
+  psubw       xmm1,xmm3
+  pabsw       xmm1,xmm1
+  movdqa      xmm4,xmm6
+  pcmpgtw     xmm4,xmm1
+  pand        xmm0,xmm4
+  ;   beta > |row[+1] - row[0]|  (and-ed into xmm0 a few lines further down)
+  movdqa      xmm1,xmm7
+  psubw       xmm1,[esp+50h]
+  pabsw       xmm1,xmm1
+  movdqa      xmm4,xmm6
+  pcmpgtw     xmm4,xmm1
+  ; Cr filter mask -> xmm5, same three tests on the Cr words.
+  movdqa      xmm1,xmm2
+  psubw       xmm1,[esp+30h]
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm5,xmm1
+  movdqa      xmm1,[esp+40h]
+  pand        xmm0,xmm4          ;  completes the Cb mask
+  psubw       xmm1,xmm2
+  pabsw       xmm1,xmm1
+  movdqa      xmm4,xmm6
+  pcmpgtw     xmm4,xmm1
+  movdqa      xmm1,[esp+20h]
+  psubw       xmm1,[esp+30h]
+  pand        xmm5,xmm4
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm6,xmm1
+  pand        xmm5,xmm6
+  ; Rounding constant 2 in every word; it is stored at [esp+10h], which
+  ; overwrites the spilled Cb row -1 (still available in xmm3).
+  mov         edx,2
+  movsx       edx,dx
+  movd        xmm1,edx
+  movdqa      xmm4,xmm1
+  punpcklwd   xmm4,xmm1
+  pshufd      xmm1,xmm4,0
+  ; Cb row[-1]' = (2*row[-2] + row[-1] + row[+1] + 2) >> 2, blended by mask xmm0 -> xmm1
+  movdqa      xmm4,[esp+60h]
+  movdqa      xmm6,xmm4
+  paddw       xmm6,xmm4
+  paddw       xmm6,xmm3
+  paddw       xmm6,xmm7
+  movdqa      [esp+10h],xmm1
+  paddw       xmm6,[esp+10h]
+  psraw       xmm6,2
+  movdqa      xmm4,xmm0
+  pandn       xmm4,xmm3
+  movdqa      xmm3,[esp+40h]
+  movdqa      xmm1,xmm0
+  pand        xmm1,xmm6
+  por         xmm1,xmm4
+  ; Cr row[-1]' with the same formula, blended by mask xmm5 -> xmm4
+  ; (xmm3 becomes the rounding constant along the way).
+  movdqa      xmm6,xmm3
+  paddw       xmm6,xmm3
+  movdqa      xmm3,[esp+10h]
+  paddw       xmm6,xmm2
+  paddw       xmm6,[esp+20h]
+  paddw       xmm6,xmm3
+  psraw       xmm6,2
+  movdqa      xmm4,xmm5
+  pand        xmm4,xmm6
+  movdqa      xmm6,xmm5
+  pandn       xmm6,xmm2
+  por         xmm4,xmm6
+  ; xmm1 = new row -1 as bytes: Cb in the low qword, Cr in the high qword.
+  packuswb    xmm1,xmm4
+  ; Cb row[0]' = (2*row[+1] + row[0] + row[-2] + 2) >> 2, blended by mask xmm0 -> xmm2
+  movdqa      xmm4,[esp+50h]
+  movdqa      xmm6,xmm7
+  paddw       xmm6,xmm7
+  paddw       xmm6,xmm4
+  paddw       xmm6,[esp+60h]
+  paddw       xmm6,xmm3
+  psraw       xmm6,2
+  movdqa      xmm2,xmm0
+  pand        xmm2,xmm6
+  pandn       xmm0,xmm4
+  por         xmm2,xmm0
+  ; Cr row[0]' with the same formula, blended by mask xmm5 -> xmm4
+  movdqa      xmm0,[esp+20h]
+  movdqa      xmm6,xmm0
+  paddw       xmm6,xmm0
+  movdqa      xmm0,[esp+30h]
+  paddw       xmm6,xmm0
+  paddw       xmm6,[esp+40h]
+  movdqa      xmm4,xmm5
+  paddw       xmm6,xmm3
+  movq        [esi],xmm1         ;  store Cb row -1
+  psraw       xmm6,2
+  pand        xmm4,xmm6
+  pandn       xmm5,xmm0
+  por         xmm4,xmm5
+  ; xmm2 = new row 0 as bytes (Cb | Cr); write it and the Cr half of row -1.
+  packuswb    xmm2,xmm4
+  movq        [eax],xmm2
+  psrldq      xmm1,8
+  movq        [edi],xmm1
+  pop         edi
+  psrldq      xmm2,8
+  movq        [ecx],xmm2
+  pop         esi
+  mov         esp,ebp
+  pop         ebp
+  ret
+
+;******************************************************************************
+; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;
+; Filters the two rows on either side of a horizontal boundary in a Cb and a
+; Cr plane at once, 8 pixels per plane. With row[k] meaning the 8 bytes at
+; pPix + k*iStride and tc = pTC[lane / 2] (each of the four pTC bytes covers
+; two neighbouring pixels, the same values are used for Cb and Cr), every
+; 16-bit lane computes
+;     delta    = clamp((((row[0] - row[-1]) << 2) + (row[-2] - row[+1]) + 4) >> 3, -tc, tc)
+;     row[-1]' = row[-1] + delta
+;     row[ 0]' = row[ 0] - delta
+; with the results saturated to 0..255. delta is forced to 0 unless
+;     |row[-1] - row[0]| < iAlpha, |row[-2] - row[-1]| < iBeta,
+;     |row[+1] - row[0]| < iBeta and tc > 0.
+; Only the low 16 bits of iAlpha / iBeta are read.
+; Rows -1 and 0 of both planes are written back; rows -2 and +1 are only read.
+; eax, ecx, edx and xmm0-xmm7 are overwritten; ebx, esi and edi are saved/restored.
+;*******************************************************************************
+
+WELS_EXTERN  DeblockChromaLt4V_sse2
+
+ALIGN  16
+
+DeblockChromaLt4V_sse2:
+  ; 0E4h bytes of locals plus the three 4-byte pushes add up to 0F0h, so esp is
+  ; 16-byte aligned again after `push edi`; every movdqa [esp+..] comes later.
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h
+  sub         esp,0E4h
+  push        ebx
+  push        esi
+  ; Sign-extend pTC[0..3]. pTC[2] and pTC[0] are parked in the words at
+  ; [esp+0Ch] / [esp+0Eh]; pTC[3] lives in di and pTC[1] in bx.
+  mov         esi, [ebp+1Ch]      ;  pTC
+  movsx       ebx, byte [esi+2]
+  push        edi
+  movsx       di,byte [esi+3]
+  mov         word [esp+0Ch],bx
+  movsx       bx,byte  [esi+1]
+  movsx       esi,byte  [esi]
+  mov         word  [esp+0Eh],si
+  ; Move the four values into xmm registers (two copies of each) and interleave
+  ; them step by step; the loads of the pixel rows are scheduled in between.
+  movzx       esi,di
+  movd        xmm1,esi
+  movzx       esi,di
+  movd        xmm2,esi
+  mov         si,word  [esp+0Ch]
+  mov         edx, [ebp + 10h]    ;  iStride
+  mov         eax, [ebp + 08h]    ;  pPixCb
+  movzx       edi,si
+  movzx       esi,si
+  mov         ecx, [ebp + 0Ch]    ;  pPixCr
+  movd        xmm4,esi
+  movzx       esi,bx
+  movd        xmm5,esi
+  movd        xmm3,edi
+  movzx       esi,bx
+  movd        xmm6,esi
+  mov         si,word [esp+0Eh]
+  movzx       edi,si
+  movzx       esi,si
+  punpcklwd   xmm6,xmm2
+  pxor        xmm0,xmm0
+  movdqa      [esp+40h],xmm0      ;  all-zero vector, reloaded into xmm1 below
+  movd        xmm7,edi
+  movd        xmm0,esi
+  lea         esi,[edx+edx]
+  mov         edi,eax
+  sub         edi,esi             ;  edi = pPixCb - 2*iStride
+  punpcklwd   xmm5,xmm1
+  movdqa      xmm1,[esp+40h]
+  punpcklwd   xmm0,xmm4
+  movq        xmm4,[edx+ecx]      ;  Cr row +1
+  punpcklwd   xmm7,xmm3
+  movq        xmm3,[eax]          ;  Cb row 0
+  punpcklwd   xmm0,xmm6
+  movq        xmm6,[edi]          ;  Cb row -2
+  punpcklwd   xmm7,xmm5
+  ; xmm0 = { tc0, tc0, tc1, tc1, tc2, tc2, tc3, tc3 } as words
+  punpcklwd   xmm0,xmm7
+  mov         edi,ecx
+  sub         edi,esi
+  ; [esp+60h] = -tc (lower clamp bound)
+  movdqa      xmm2,xmm1
+  psubw       xmm2,xmm0
+  movdqa      [esp+60h],xmm2
+  ; xmm6 = row -2 (Cb in the low qword, Cr in the high qword)
+  movq        xmm2, [edi]
+  punpcklqdq  xmm6,xmm2
+  ; xmm7 = row -1 (Cb | Cr). esi = pPixCb - iStride and edi = pPixCr - iStride
+  ; are not modified again and serve as store addresses at the end.
+  mov         esi,eax
+  sub         esi,edx
+  movq        xmm7,[esi]
+  mov         edi,ecx
+  sub         edi,edx
+  movq        xmm2,[edi]
+  punpcklqdq  xmm7,xmm2
+  ; xmm3 = row 0 (Cb | Cr); row +1 (Cb | Cr) is kept at [esp+0E0h]
+  movq        xmm2,[ecx]
+  punpcklqdq  xmm3,xmm2
+  movq        xmm2,[edx+eax]
+  movsx       edx,word [ebp + 14h]   ;  low word of iAlpha
+  punpcklqdq  xmm2,xmm4
+  movdqa      [esp+0E0h],xmm2
+  ; Broadcast alpha to every word of xmm4 (finished by the pshufd below) and
+  ; beta to every word of [esp+50h].
+  movd        xmm2,edx
+  movsx       edx,word [ebp + 18h]   ;  low word of iBeta
+  movdqa      xmm4,xmm2
+  punpcklwd   xmm4,xmm2
+  movd        xmm2,edx
+  movdqa      xmm5,xmm2
+  punpcklwd   xmm5,xmm2
+  pshufd      xmm2,xmm5,0
+  movdqa      [esp+50h],xmm2
+  ; Zero-extend every row to 16-bit words (xmm1 is zero; low half = Cb, high half = Cr):
+  ;   row -2: Cb -> [esp+30h], Cr -> [esp+80h]
+  ;   row -1: Cb -> xmm2,      Cr -> [esp+0A0h]
+  ;   row  0: Cb -> xmm3,      Cr -> [esp+70h]   (packed copy at [esp+0D0h])
+  ;   row +1: Cb -> xmm5,      Cr -> [esp+90h]
+  movdqa      xmm2,xmm6
+  punpcklbw   xmm2,xmm1
+  movdqa      [esp+0D0h],xmm3
+  pshufd      xmm4,xmm4,0
+  movdqa      [esp+30h],xmm2
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+80h],xmm6
+  movdqa      xmm6,[esp+0D0h]
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+70h],xmm6
+  movdqa      xmm6, [esp+0E0h]
+  punpckhbw   xmm6,xmm1
+  movdqa     [esp+90h],xmm6
+  movdqa      xmm5, [esp+0E0h]
+  movdqa      xmm2,xmm7
+  punpckhbw   xmm7,xmm1
+  punpcklbw   xmm5,xmm1
+  movdqa       [esp+0A0h],xmm7
+  punpcklbw   xmm3,xmm1
+  ; Rounding constant 4 in every word -> [esp+20h]
+  mov         edx,4
+  punpcklbw   xmm2,xmm1
+  movsx       edx,dx
+  movd        xmm6,edx
+  movdqa      xmm7,xmm6
+  punpcklwd   xmm7,xmm6
+  pshufd      xmm6,xmm7,0
+  movdqa      xmm7,[esp+30h]
+  movdqa      [esp+20h],xmm6
+  psubw       xmm7,xmm5           ;  Cb: row[-2] - row[+1]
+  ; [esp+40h] = (tc > 0) lane mask
+  movdqa      xmm6,xmm0
+  pcmpgtw     xmm6,xmm1
+  movdqa      xmm1,[esp+60h]
+  movdqa      [esp+40h],xmm6
+  ; Cb delta, clamped to [-tc, tc] -> [esp+10h]
+  movdqa      xmm6,xmm3
+  psubw       xmm6,xmm2
+  psllw       xmm6,2
+  paddw       xmm6,xmm7
+  paddw       xmm6, [esp+20h]
+  movdqa      xmm7, [esp+50h]
+  psraw       xmm6,3
+  pmaxsw      xmm1,xmm6
+  movdqa      [esp+10h],xmm0
+  movdqa      xmm6, [esp+10h]
+  pminsw      xmm6,xmm1
+  movdqa      [esp+10h],xmm6
+  ; Cb filter mask -> xmm6: the alpha test, the two beta tests and tc > 0
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm3
+  pabsw       xmm1,xmm1
+  movdqa      xmm6,xmm4
+  pcmpgtw     xmm6,xmm1
+  movdqa      xmm1, [esp+30h]
+  psubw       xmm1,xmm2
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm7,xmm1
+  movdqa      xmm1,[esp+50h]
+  pand        xmm6,xmm7
+  movdqa      xmm7,[esp+50h]
+  psubw       xmm5,xmm3
+  pabsw       xmm5,xmm5
+  pcmpgtw     xmm1,xmm5
+  movdqa      xmm5,[esp+80h]
+  psubw       xmm5,[esp+90h]      ;  Cr: row[-2] - row[+1]
+  pand        xmm6,xmm1
+  pand        xmm6,[esp+40h]
+  ; masked Cb delta -> [esp+30h]
+  movdqa      xmm1,[esp+10h]
+  pand        xmm1,xmm6
+  movdqa      xmm6,[esp+70h]
+  movdqa      [esp+30h],xmm1
+  ; Cr delta, clamped to [-tc, tc] -> xmm0 (xmm1 = Cr row -1)
+  movdqa      xmm1,[esp+0A0h]
+  psubw       xmm6,xmm1
+  psllw       xmm6,2
+  paddw       xmm6,xmm5
+  paddw       xmm6,[esp+20h]
+  movdqa      xmm5,[esp+60h]
+  psraw       xmm6,3
+  pmaxsw      xmm5,xmm6
+  pminsw      xmm0,xmm5
+  ; Cr filter mask -> xmm4 (xmm5 = Cr row 0), then mask the Cr delta
+  movdqa      xmm5,[esp+70h]
+  movdqa      xmm6,xmm1
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm4,xmm6
+  movdqa      xmm6,[esp+80h]
+  psubw       xmm6,xmm1
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6,[esp+90h]
+  pand        xmm4,xmm7
+  movdqa      xmm7,[esp+50h]
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6
+  pand        xmm4,xmm7
+  pand        xmm4,[esp+40h]
+  pand        xmm0,xmm4
+  ; row[-1] += delta, pack to bytes (Cb | Cr) and store the Cb half
+  movdqa      xmm4,[esp+30h]
+  paddw       xmm2,xmm4
+  paddw       xmm1,xmm0
+  packuswb    xmm2,xmm1
+  movq        [esi],xmm2
+  ; row[0] -= delta, pack to bytes (Cb | Cr) and store the Cb half
+  psubw       xmm3,xmm4
+  psubw       xmm5,xmm0
+  packuswb    xmm3,xmm5
+  movq        [eax],xmm3
+  ; Cr halves of row -1 and row 0
+  psrldq      xmm2,8
+  movq        [edi],xmm2
+  pop         edi
+  pop         esi
+  psrldq      xmm3,8
+  movq        [ecx],xmm3
+  pop         ebx
+  mov         esp,ebp
+  pop         ebp
+  ret
+
+;***************************************************************************
+;  void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;          int32_t iAlpha, int32_t iBeta)
+;
+;  Filters the two columns on either side of a vertical boundary in a Cb and
+;  a Cr plane at once, 8 rows per plane. Four bytes (columns -2..+1 relative
+;  to pPixCb / pPixCr) are read from each row and transposed into column
+;  vectors; with col[k] meaning the byte at pPix + k, every 16-bit lane computes
+;      col[-1]' = (2*col[-2] + col[-1] + col[+1] + 2) >> 2
+;      col[ 0]' = (2*col[+1] + col[ 0] + col[-2] + 2) >> 2
+;  and the new value replaces the old one only in lanes where
+;      |col[-1] - col[0]| < iAlpha, |col[-2] - col[-1]| < iBeta and
+;      |col[+1] - col[0]| < iBeta.
+;  Only the low 16 bits of iAlpha / iBeta are read.
+;  The result is transposed back and all four bytes of each row are stored
+;  again (columns -2 and +1 with their original values).
+;  eax, ecx, edx and xmm0-xmm7 are overwritten; esi and edi are saved/restored.
+;***************************************************************************
+
+WELS_EXTERN     DeblockChromaEq4H_sse2
+
+ALIGN  16
+
+DeblockChromaEq4H_sse2:
+  ; 0C8h bytes of locals plus the two 4-byte pushes add up to 0D0h, so esp is
+  ; 16-byte aligned again after `push edi`.
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h
+  sub         esp,0C8h
+  mov         ecx,dword [ebp+8]       ;  pPixCb
+  mov         edx,dword [ebp+0Ch]     ;  pPixCr
+  mov         eax,dword [ebp+10h]     ;  iStride
+  sub         ecx,2
+  sub         edx,2
+  ; The stores between the two pushes use offsets that are 4 lower than the
+  ; final ones. Slot layout once edi has been pushed:
+  ;   [esp+08h] = pPixCr - 2              [esp+0Ch] = pPixCr - 2 + 4*iStride
+  ;   [esp+1Ch] = pPixCb - 2              [esp+18h] = pPixCb - 2 + 4*iStride
+  ;   [esp+14h] = 3*iStride               [esp+10h] = esp+80h (40h-byte column buffer)
+  push        esi
+  lea         esi,[eax+eax*2]
+  mov         dword [esp+18h],ecx
+  mov         dword [esp+4],edx
+  lea         ecx,[ecx+eax*4]
+  lea         edx,[edx+eax*4]
+  lea         eax,[esp+7Ch]
+  push        edi
+  mov         dword [esp+14h],esi
+  mov         dword [esp+18h],ecx
+  mov         dword [esp+0Ch],edx
+  mov         dword [esp+10h],eax
+  ; Gather 4 bytes per row: ecx = iStride, edx = 3*iStride.
+  ;   xmm0 = Cb row 0 | Cr row 0 | Cb row 4 | Cr row 4   (one dword each)
+  ;   xmm1 = rows 1 / 5, xmm2 = rows 2 / 6, xmm3 = rows 3 / 7, same layout
+  mov         esi,dword [esp+1Ch]
+  mov         ecx,dword [ebp+10h]
+  mov         edx,dword [esp+14h]
+  movd        xmm0,dword [esi]
+  movd        xmm1,dword [esi+ecx]
+  movd        xmm2,dword [esi+ecx*2]
+  movd        xmm3,dword [esi+edx]
+  mov         esi,dword  [esp+8]
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [esi+ecx]
+  movd        xmm6,dword [esi+ecx*2]
+  movd        xmm7,dword [esi+edx]
+  punpckldq   xmm0,xmm4
+  punpckldq   xmm1,xmm5
+  punpckldq   xmm2,xmm6
+  punpckldq   xmm3,xmm7
+  mov         esi,dword [esp+18h]
+  mov         edi,dword [esp+0Ch]
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [edi]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm0,xmm4
+  movd        xmm4,dword [esi+ecx]
+  movd        xmm5,dword [edi+ecx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm1,xmm4
+  movd        xmm4,dword [esi+ecx*2]
+  movd        xmm5,dword [edi+ecx*2]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm2,xmm4
+  movd        xmm4,dword [esi+edx]
+  movd        xmm5,dword [edi+edx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm3,xmm4
+  ; Byte transpose via byte/word/dword/qword unpacks. Afterwards xmm0, xmm5,
+  ; xmm1, xmm6 hold columns -2, -1, 0, +1; in each of them bytes 0-7 are
+  ; Cb rows 0-7 and bytes 8-15 are Cr rows 0-7.
+  movdqa      xmm6,xmm0
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm5,xmm4
+  punpcklqdq  xmm1,xmm2
+  punpckhqdq  xmm6,xmm2
+  ; Park the four columns in the buffer: [esp+80h] = col -2, [esp+90h] = col -1,
+  ; [esp+0A0h] = col 0, [esp+0B0h] = col +1.
+  mov         edi,dword [esp+10h]
+  movdqa      [edi],xmm0
+  movdqa      [edi+10h],xmm5
+  movdqa      [edi+20h],xmm1
+  movdqa      [edi+30h],xmm6
+  movsx       ecx,word [ebp+14h]      ;  low word of iAlpha
+  movsx       edx,word [ebp+18h]      ;  low word of iBeta
+  movdqa      xmm6,[esp+80h]
+  movdqa      xmm4,[esp+90h]
+  movdqa      xmm5,[esp+0A0h]
+  movdqa      xmm7,[esp+0B0h]
+  pxor        xmm0,xmm0               ;  zero, used to widen bytes to words
+  ; Broadcast alpha to every word of xmm1 and beta to every word of xmm2.
+  movd        xmm1,ecx
+  movdqa      xmm2,xmm1
+  punpcklwd   xmm2,xmm1
+  pshufd      xmm1,xmm2,0
+  movd        xmm2,edx
+  movdqa      xmm3,xmm2
+  punpcklwd   xmm3,xmm2
+  pshufd      xmm2,xmm3,0
+  ; Zero-extend every column to 16-bit words (low half = Cb, high half = Cr):
+  ;   col -2: Cb -> xmm3,      Cr -> [esp+60h]
+  ;   col -1: Cb -> xmm4,      Cr -> [esp+30h]
+  ;   col  0: Cb -> xmm5,      Cr -> [esp+40h]
+  ;   col +1: Cb -> [esp+50h], Cr -> [esp+70h]
+  movdqa      xmm3,xmm6
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+60h],xmm6
+  movdqa      xmm6,[esp+90h]
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+30h],xmm6
+  movdqa      xmm6,[esp+0A0h]
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+40h],xmm6
+  movdqa      xmm6,[esp+0B0h]
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+70h],xmm6
+  punpcklbw   xmm7,xmm0
+  punpcklbw   xmm4,xmm0
+  punpcklbw   xmm5,xmm0
+  punpcklbw   xmm3,xmm0
+  movdqa      [esp+50h],xmm7
+  ; Cb filter mask -> xmm0: alpha > |col[-1] - col[0]|,
+  ; beta > |col[-2] - col[-1]|, beta > |col[+1] - col[0]|
+  movdqa      xmm6,xmm4
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  movdqa      xmm0,xmm1
+  pcmpgtw     xmm0,xmm6
+  movdqa      xmm6,xmm3
+  psubw       xmm6,xmm4
+  pabsw       xmm6,xmm6
+  movdqa      xmm7,xmm2
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6,[esp+50h]
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pand        xmm0,xmm7
+  movdqa      xmm7,xmm2
+  pcmpgtw     xmm7,xmm6
+  ; Cr filter mask -> xmm1, same three tests on the Cr words
+  movdqa      xmm6,[esp+30h]
+  psubw       xmm6,[esp+40h]
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm1,xmm6
+  movdqa      xmm6,[esp+60h]
+  psubw       xmm6,[esp+30h]
+  pabsw       xmm6,xmm6
+  pand        xmm0,xmm7               ;  completes the Cb mask
+  movdqa      xmm7,xmm2
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6,[esp+70h]
+  psubw       xmm6,[esp+40h]
+  pabsw       xmm6,xmm6
+  pand        xmm1,xmm7
+  pcmpgtw     xmm2,xmm6
+  pand        xmm1,xmm2
+  ; Rounding constant 2 in every word -> [esp+20h]
+  mov         eax,2
+  movsx       ecx,ax
+  movd        xmm2,ecx
+  movdqa      xmm6,xmm2
+  punpcklwd   xmm6,xmm2
+  pshufd      xmm2,xmm6,0
+  movdqa      [esp+20h],xmm2
+  ; Cb col[-1]' = (2*col[-2] + col[-1] + col[+1] + 2) >> 2, blended by mask xmm0 -> xmm6
+  movdqa      xmm2,xmm3
+  paddw       xmm2,xmm3
+  paddw       xmm2,xmm4
+  paddw       xmm2,[esp+50h]
+  paddw       xmm2,[esp+20h]
+  psraw       xmm2,2
+  movdqa      xmm6,xmm0
+  pand        xmm6,xmm2
+  movdqa      xmm2,xmm0
+  pandn       xmm2,xmm4
+  por         xmm6,xmm2
+  ; Cr col[-1]' with the same formula, blended by mask xmm1 -> xmm4
+  movdqa      xmm2,[esp+60h]
+  movdqa      xmm7,xmm2
+  paddw       xmm7,xmm2
+  paddw       xmm7,[esp+30h]
+  paddw       xmm7,[esp+70h]
+  paddw       xmm7,[esp+20h]
+  movdqa      xmm4,xmm1
+  movdqa      xmm2,xmm1
+  pandn       xmm2,[esp+30h]
+  psraw       xmm7,2
+  pand        xmm4,xmm7
+  por         xmm4,xmm2
+  movdqa      xmm2,[esp+50h]
+  ; new column -1 as bytes (Cb | Cr) replaces the old one in the buffer
+  packuswb    xmm6,xmm4
+  movdqa      [esp+90h],xmm6
+  ; Cb col[0]' = (2*col[+1] + col[0] + col[-2] + 2) >> 2, blended by mask xmm0 -> xmm4
+  movdqa      xmm6,xmm2
+  paddw       xmm6,xmm2
+  movdqa      xmm2,[esp+20h]
+  paddw       xmm6,xmm5
+  paddw       xmm6,xmm3
+  movdqa      xmm4,xmm0
+  pandn       xmm0,xmm5
+  paddw       xmm6,xmm2
+  psraw       xmm6,2
+  pand        xmm4,xmm6
+  por         xmm4,xmm0
+  ; Cr col[0]' with the same formula, blended by mask xmm1 -> xmm3
+  movdqa      xmm0,[esp+70h]
+  movdqa      xmm5,xmm0
+  paddw       xmm5,xmm0
+  movdqa      xmm0,[esp+40h]
+  paddw       xmm5,xmm0
+  paddw       xmm5,[esp+60h]
+  movdqa      xmm3,xmm1
+  paddw       xmm5,xmm2
+  psraw       xmm5,2
+  pand        xmm3,xmm5
+  pandn       xmm1,xmm0
+  por         xmm3,xmm1
+  ; new column 0 as bytes (Cb | Cr) replaces the old one in the buffer
+  packuswb    xmm4,xmm3
+  movdqa      [esp+0A0h],xmm4
+  ; Reload the four columns and run the same unpack network again to get
+  ; back to rows. Afterwards each register holds four 4-byte rows:
+  ;   xmm0 = Cb row 0 | Cr row 0 | Cb row 4 | Cr row 4
+  ;   xmm5 = rows 1 / 5, xmm1 = rows 2 / 6, xmm6 = rows 3 / 7, same layout
+  mov         esi,dword [esp+10h]
+  movdqa      xmm0,[esi]
+  movdqa      xmm1,[esi+10h]
+  movdqa      xmm2,[esi+20h]
+  movdqa      xmm3,[esi+30h]
+  movdqa      xmm6,xmm0
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm5,xmm4
+  punpcklqdq  xmm1,xmm2
+  punpckhqdq  xmm6,xmm2
+  ; Scatter: the low dword of each register is stored, then the register is
+  ; shifted right by 4 bytes to expose the next row group.
+  mov         esi,dword [esp+1Ch]
+  mov         ecx,dword [ebp+10h]
+  mov         edx,dword [esp+14h]
+  mov         edi,dword [esp+8]
+  ; Cb rows 0-3
+  movd        dword [esi],xmm0
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         esi,dword [esp+18h]
+  ; Cr rows 0-3
+  movd        dword [edi],xmm0
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  ; Cb rows 4-7
+  movd        dword [esi],xmm0
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  ; Cr rows 4-7
+  mov         edi,dword [esp+0Ch]
+  movd        dword [edi],xmm0
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  pop         edi
+  pop         esi
+  mov         esp,ebp
+  pop         ebp
+  ret
+
+;*******************************************************************************
+;    void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;*******************************************************************************
+
+WELS_EXTERN  DeblockChromaLt4H_sse2
+
+ALIGN  16
+
+DeblockChromaLt4H_sse2:
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h
+  sub         esp,108h
+  mov         ecx,dword [ebp+8]
+  mov         edx,dword [ebp+0Ch]
+  mov         eax,dword [ebp+10h]
+  sub         ecx,2
+  sub         edx,2
+  push        esi
+  lea         esi,[eax+eax*2]
+  mov         dword [esp+10h],ecx
+  mov         dword [esp+4],edx
+  lea         ecx,[ecx+eax*4]
+  lea         edx,[edx+eax*4]
+  lea         eax,[esp+6Ch]
+  push        edi
+  mov         dword [esp+0Ch],esi
+  mov         dword [esp+18h],ecx
+  mov         dword [esp+10h],edx
+  mov         dword [esp+1Ch],eax
+  mov         esi,dword [esp+14h]
+  mov         ecx,dword [ebp+10h]
+  mov         edx,dword [esp+0Ch]
+  movd        xmm0,dword [esi]
+  movd        xmm1,dword [esi+ecx]
+  movd        xmm2,dword [esi+ecx*2]
+  movd        xmm3,dword [esi+edx]
+  mov         esi,dword [esp+8]
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [esi+ecx]
+  movd        xmm6,dword [esi+ecx*2]
+  movd        xmm7,dword [esi+edx]
+  punpckldq   xmm0,xmm4
+  punpckldq   xmm1,xmm5
+  punpckldq   xmm2,xmm6
+  punpckldq   xmm3,xmm7
+  mov         esi,dword [esp+18h]
+  mov         edi,dword [esp+10h]
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [edi]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm0,xmm4
+  movd        xmm4,dword [esi+ecx]
+  movd        xmm5,dword [edi+ecx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm1,xmm4
+  movd        xmm4,dword [esi+ecx*2]
+  movd        xmm5,dword [edi+ecx*2]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm2,xmm4
+  movd        xmm4,dword [esi+edx]
+  movd        xmm5,dword [edi+edx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm3,xmm4
+  movdqa      xmm6,xmm0
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm5,xmm4
+  punpcklqdq  xmm1,xmm2
+  punpckhqdq  xmm6,xmm2
+  mov         edi,dword [esp+1Ch]
+  movdqa      [edi],xmm0
+  movdqa      [edi+10h],xmm5
+  movdqa      [edi+20h],xmm1
+  movdqa      [edi+30h],xmm6
+  mov         eax,dword [ebp+1Ch]
+  movsx       cx,byte [eax+3]
+  movsx       dx,byte [eax+2]
+  movsx       si,byte [eax+1]
+  movsx       ax,byte [eax]
+  movzx       edi,cx
+  movzx       ecx,cx
+  movd        xmm2,ecx
+  movzx       ecx,dx
+  movzx       edx,dx
+  movd        xmm3,ecx
+  movd        xmm4,edx
+  movzx       ecx,si
+  movzx       edx,si
+  movd        xmm5,ecx
+  pxor        xmm0,xmm0
+  movd        xmm6,edx
+  movzx       ecx,ax
+  movdqa      [esp+60h],xmm0
+  movzx       edx,ax
+  movsx       eax,word [ebp+14h]
+  punpcklwd   xmm6,xmm2
+  movd        xmm1,edi
+  movd        xmm7,ecx
+  movsx       ecx,word [ebp+18h]
+  movd        xmm0,edx
+  punpcklwd   xmm7,xmm3
+  punpcklwd   xmm5,xmm1
+  movdqa      xmm1,[esp+60h]
+  punpcklwd   xmm7,xmm5
+  movdqa      xmm5,[esp+0A0h]
+  punpcklwd   xmm0,xmm4
+  punpcklwd   xmm0,xmm6
+  movdqa      xmm6, [esp+70h]
+  punpcklwd   xmm0,xmm7
+  movdqa      xmm7,[esp+80h]
+  movdqa      xmm2,xmm1
+  psubw       xmm2,xmm0
+  movdqa      [esp+0D0h],xmm2
+  movd        xmm2,eax
+  movdqa      xmm3,xmm2
+  punpcklwd   xmm3,xmm2
+  pshufd      xmm4,xmm3,0
+  movd        xmm2,ecx
+  movdqa      xmm3,xmm2
+  punpcklwd   xmm3,xmm2
+  pshufd      xmm2,xmm3,0
+  movdqa      xmm3, [esp+90h]
+  movdqa      [esp+50h],xmm2
+  movdqa      xmm2,xmm6
+  punpcklbw   xmm2,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+40h],xmm2
+  movdqa      [esp+0B0h],xmm6
+  movdqa      xmm6,[esp+90h]
+  movdqa      xmm2,xmm7
+  punpckhbw   xmm7,xmm1
+  punpckhbw   xmm6,xmm1
+  punpcklbw   xmm2,xmm1
+  punpcklbw   xmm3,xmm1
+  punpcklbw   xmm5,xmm1
+  movdqa      [esp+0F0h],xmm7
+  movdqa      [esp+0C0h],xmm6
+  movdqa      xmm6, [esp+0A0h]
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+0E0h],xmm6
+  mov         edx,4
+  movsx       eax,dx
+  movd        xmm6,eax
+  movdqa      xmm7,xmm6
+  punpcklwd   xmm7,xmm6
+  pshufd      xmm6,xmm7,0
+  movdqa      [esp+30h],xmm6
+  movdqa      xmm7, [esp+40h]
+  psubw       xmm7,xmm5
+  movdqa      xmm6,xmm0
+  pcmpgtw     xmm6,xmm1
+  movdqa      [esp+60h],xmm6
+  movdqa      xmm1, [esp+0D0h]
+  movdqa      xmm6,xmm3
+  psubw       xmm6,xmm2
+  psllw       xmm6,2
+  paddw       xmm6,xmm7
+  paddw       xmm6,[esp+30h]
+  psraw       xmm6,3
+  pmaxsw      xmm1,xmm6
+  movdqa      xmm7,[esp+50h]
+  movdqa      [esp+20h],xmm0
+  movdqa      xmm6, [esp+20h]
+  pminsw      xmm6,xmm1
+  movdqa      [esp+20h],xmm6
+  movdqa      xmm6,xmm4
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm3
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm6,xmm1
+  movdqa      xmm1, [esp+40h]
+  psubw       xmm1,xmm2
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm7,xmm1
+  movdqa      xmm1, [esp+50h]
+  pand        xmm6,xmm7
+  movdqa      xmm7, [esp+50h]
+  psubw       xmm5,xmm3
+  pabsw       xmm5,xmm5
+  pcmpgtw     xmm1,xmm5
+  movdqa      xmm5, [esp+0B0h]
+  psubw       xmm5,[esp+0E0h]
+  pand        xmm6,xmm1
+  pand        xmm6, [esp+60h]
+  movdqa      xmm1, [esp+20h]
+  pand        xmm1,xmm6
+  movdqa      xmm6, [esp+0C0h]
+  movdqa      [esp+40h],xmm1
+  movdqa      xmm1, [esp+0F0h]
+  psubw       xmm6,xmm1
+  psllw       xmm6,2
+  paddw       xmm6,xmm5
+  paddw       xmm6, [esp+30h]
+  movdqa      xmm5, [esp+0D0h]
+  psraw       xmm6,3
+  pmaxsw      xmm5,xmm6
+  pminsw      xmm0,xmm5
+  movdqa      xmm5,[esp+0C0h]
+  movdqa      xmm6,xmm1
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm4,xmm6
+  movdqa      xmm6,[esp+0B0h]
+  psubw       xmm6,xmm1
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6, [esp+0E0h]
+  pand        xmm4,xmm7
+  movdqa      xmm7, [esp+50h]
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6
+  pand        xmm4,xmm7
+  pand        xmm4,[esp+60h]
+  pand        xmm0,xmm4
+  movdqa      xmm4, [esp+40h]
+  paddw       xmm2,xmm4
+  paddw       xmm1,xmm0
+  psubw       xmm3,xmm4
+  psubw       xmm5,xmm0
+  packuswb    xmm2,xmm1
+  packuswb    xmm3,xmm5
+  movdqa      [esp+80h],xmm2
+  movdqa      [esp+90h],xmm3
+  mov         esi,dword [esp+1Ch]
+  movdqa      xmm0, [esi]
+  movdqa      xmm1, [esi+10h]
+  movdqa      xmm2, [esi+20h]
+  movdqa      xmm3, [esi+30h]
+  movdqa      xmm6,xmm0
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm5,xmm4
+  punpcklqdq  xmm1,xmm2
+  punpckhqdq  xmm6,xmm2
+  mov         esi,dword [esp+14h]
+  mov         ecx,dword [ebp+10h]
+  mov         edx,dword [esp+0Ch]
+  mov         edi,dword [esp+8]
+  movd        dword [esi],xmm0
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         esi,dword [esp+18h]
+  movd        dword [edi],xmm0
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  movd        dword [esi],xmm0
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         edi,dword [esp+10h]
+  movd        dword [edi],xmm0
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  pop         edi
+  pop         esi
+  mov         esp,ebp
+  pop         ebp
+  ret
+
+
+
+;*******************************************************************************
+;    void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+;                                 int32_t iBeta, int8_t * pTC)
+;*******************************************************************************
+; Notation used below: p2/p1/p0 = 16-byte rows at pPix-3/-2/-1*iStride, q0/q1/q2 = rows at pPix+0/+1/+2*iStride.
+; Rewrites rows p1, p0, q0 and q1 in place. Every row access is movdqa, so each row address must be 16-byte aligned.
+WELS_EXTERN  DeblockLumaLt4V_sse2
+; Arguments are read from the stack ([ebp+8]..[ebp+24]); only the low 16 bits of iAlpha/iBeta are used. pTC[k] applies to byte columns 4k..4k+3.
+ALIGN  16
+
+DeblockLumaLt4V_sse2:
+    push	ebp
+	mov	ebp, esp
+	and	esp, -16				; fffffff0H
+	sub	esp, 420				; 000001a4H
+	mov	eax, dword [ebp+8]	; eax = pPix (never modified; used for the final q0 store)
+	mov	ecx, dword [ebp+12]	; ecx = iStride
+
+	pxor	xmm0, xmm0
+	push	ebx
+	mov	edx, dword [ebp+24]	; edx = pTC
+	movdqa	[esp+424-384], xmm0	; zero vector; this slot is [esp+432-384] once esi and edi are pushed
+	push	esi
+
+	lea	esi, [ecx+ecx*2]	; esi = 3*iStride
+	push	edi
+	mov	edi, eax
+	sub	edi, esi
+	movdqa	xmm0, [edi]	; p2 row
+
+	lea	esi, [ecx+ecx]	; esi = 2*iStride
+	movdqa	[esp+432-208], xmm0	; spill p2 bytes
+	mov	edi, eax
+	sub	edi, esi	; edi = pPix - 2*iStride, kept for the final p1 store
+	movdqa	xmm0, [edi]	; p1 row
+	movdqa	[esp+448-208], xmm0	; spill p1 bytes
+
+	mov	ebx, eax
+	sub	ebx, ecx	; ebx = pPix - iStride
+	movdqa	xmm0, [ebx]	; p0 row
+	movdqa	[esp+464-208], xmm0	; spill p0 bytes
+
+	movdqa	xmm0, [eax]	; q0 row
+
+	add	ecx, eax	; ecx = pPix + iStride
+	movdqa	[esp+480-208], xmm0	; spill q0 bytes
+	movdqa	xmm0, [ecx]	; q1 row
+	mov	dword [esp+432-404], ecx	; save q1 row address for the final store
+
+	movsx	ecx, word [ebp+16]	; low word of iAlpha, sign-extended
+	movdqa	[esp+496-208], xmm0	; spill q1 bytes
+	movdqa	xmm0, [esi+eax]	; q2 row (pPix + 2*iStride)
+
+	movsx	si, byte [edx]	; pTC[0]
+	movdqa	[esp+512-208], xmm0	; spill q2 bytes
+	movd	xmm0, ecx
+	movsx	ecx, word [ebp+20]	; low word of iBeta, sign-extended
+	movdqa	xmm1, xmm0
+	punpcklwd xmm1, xmm0
+	pshufd	xmm0, xmm1, 0
+	movdqa	[esp+432-112], xmm0	; alpha replicated into all 8 words
+	movd	xmm0, ecx
+	movsx	cx, byte [edx+1]	; pTC[1]
+	movdqa	xmm1, xmm0
+	punpcklwd xmm1, xmm0
+	mov	dword [esp+432-408], ebx	; save p0 row address for the final store
+	movzx	ebx, cx
+	pshufd	xmm0, xmm1, 0	; beta replicated into all 8 words
+	movd	xmm1, ebx
+	movzx	ebx, cx
+	movd	xmm2, ebx
+	movzx	ebx, cx
+	movzx	ecx, cx
+	movd	xmm4, ecx
+	movzx	ecx, si
+	movd	xmm5, ecx
+	movzx	ecx, si
+	movd	xmm6, ecx
+	movzx	ecx, si
+	movd	xmm7, ecx
+	movzx	ecx, si
+	movdqa	[esp+432-336], xmm0	; beta vector
+	movd	xmm0, ecx
+
+	movsx	cx, byte [edx+3]	; pTC[3]
+	movsx	dx, byte [edx+2]	; pTC[2] (clobbers the pTC pointer, which is not needed afterwards)
+	movd	xmm3, ebx
+	punpcklwd xmm0, xmm4
+	movzx	esi, cx
+	punpcklwd xmm6, xmm2
+	punpcklwd xmm5, xmm1
+	punpcklwd xmm0, xmm6
+	punpcklwd xmm7, xmm3
+	punpcklwd xmm7, xmm5
+	punpcklwd xmm0, xmm7
+	movdqa	[esp+432-400], xmm0	; tc for columns 0-7: words 0-3 = pTC[0], words 4-7 = pTC[1]
+	movd	xmm0, esi
+	movzx	esi, cx
+	movd	xmm2, esi
+	movzx	esi, cx
+	movzx	ecx, cx
+	movd	xmm4, ecx
+	movzx	ecx, dx
+	movd	xmm3, esi
+	movd	xmm5, ecx
+	punpcklwd xmm5, xmm0
+
+	movdqa	xmm0, [esp+432-384]	; xmm0 = zero vector (used for byte->word unpacking and compares)
+	movzx	ecx, dx
+	movd	xmm6, ecx
+	movzx	ecx, dx
+	movzx	edx, dx
+	punpcklwd xmm6, xmm2
+	movd	xmm7, ecx
+	movd	xmm1, edx
+
+	movdqa	xmm2, [esp+448-208]
+	punpcklbw xmm2, xmm0	; xmm2 = p1, columns 0-7 as words
+
+	mov	ecx, 4
+	movsx	edx, cx	; edx = 4 (rounding constant, broadcast later)
+	punpcklwd xmm7, xmm3
+	punpcklwd xmm7, xmm5
+	movdqa	xmm5, [esp+496-208]
+	movdqa	xmm3, [esp+464-208]
+	punpcklbw xmm5, xmm0
+	movdqa	[esp+432-240], xmm5	; q1 low words
+	movdqa	xmm5, [esp+512-208]
+	punpcklbw xmm5, xmm0
+	movdqa	[esp+432-352], xmm5	; q2 low words
+	punpcklwd xmm1, xmm4
+	movdqa	xmm4, [esp+432-208]
+	punpcklwd xmm1, xmm6
+	movdqa	xmm6, [esp+480-208]
+	punpcklwd xmm1, xmm7	; xmm1 = tc for columns 8-15: words 0-3 = pTC[2], words 4-7 = pTC[3]
+	punpcklbw xmm6, xmm0	; xmm6 = q0 low words
+	punpcklbw xmm3, xmm0	; xmm3 = p0 low words
+	punpcklbw xmm4, xmm0	; xmm4 = p2 low words
+	movdqa	xmm7, xmm3
+	psubw	xmm7, xmm4
+	pabsw	xmm7, xmm7	; |p0 - p2|
+	movdqa	[esp+432-272], xmm4	; keep p2 low words
+	movdqa	xmm4, [esp+432-336]	; xmm4 = beta
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-288], xmm5	; mask: beta > |p0 - p2|
+	movdqa	xmm7, xmm6
+	psubw	xmm7, [esp+432-352]
+	pabsw	xmm7, xmm7	; |q0 - q2|
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-256], xmm5	; mask: beta > |q0 - q2|
+	movdqa	xmm5, xmm3
+	pavgw	xmm5, xmm6
+	movdqa	[esp+432-304], xmm5	; (p0 + q0 + 1) >> 1
+	movdqa	xmm5, [esp+432-400]
+	psubw	xmm5, [esp+432-288]	; true mask lanes are -1, so each satisfied test adds 1 to tc
+	psubw	xmm5, [esp+432-256]
+	movdqa	[esp+432-224], xmm5	; adjusted tc used to clamp the p0/q0 delta
+	movdqa	xmm5, xmm6
+	psubw	xmm5, xmm3	; q0 - p0
+	movdqa	[esp+432-32], xmm6	; keep q0 low words
+	psubw	xmm6, [esp+432-240]	; q0 - q1
+	movdqa	xmm7, xmm5
+	movdqa	[esp+432-384], xmm5	; slot reused: now holds q0 - p0 (zero stays in xmm0)
+	movdqa	xmm5, [esp+432-112]	; alpha
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm5, xmm7	; alpha > |q0 - p0|
+	pabsw	xmm6, xmm6
+	movdqa	xmm7, xmm4
+	pcmpgtw	xmm7, xmm6	; beta > |q0 - q1|
+
+	pand	xmm5, xmm7
+	movdqa	xmm6, xmm3
+	psubw	xmm6, xmm2
+	pabsw	xmm6, xmm6
+	movdqa	xmm7, xmm4
+	pcmpgtw	xmm7, xmm6	; beta > |p0 - p1|
+	movdqa	xmm6, [esp+432-400]
+	pand	xmm5, xmm7
+	movdqa	xmm7, xmm6
+	pcmpeqw	xmm6, xmm0
+	pcmpgtw	xmm7, xmm0
+	por	xmm7, xmm6	; lanes where tc >= 0
+	pand	xmm5, xmm7
+	movdqa	[esp+432-320], xmm5	; filter-enable mask for columns 0-7
+	movd	xmm5, edx
+	movdqa	xmm6, xmm5
+	punpcklwd xmm6, xmm5
+	pshufd	xmm5, xmm6, 0
+	movdqa	[esp+432-336], xmm5	; slot reused: constant 4 in all words (beta remains in xmm4)
+	movdqa	xmm5, [esp+432-224]
+	movdqa	[esp+432-368], xmm5
+	movdqa	xmm6, xmm0
+	psubw	xmm6, xmm5	; negated adjusted tc (lower clamp bound)
+	movdqa	xmm5, [esp+432-384]
+	psllw	xmm5, 2	; 4*(q0 - p0)
+	movdqa	xmm7, xmm2
+	psubw	xmm7, [esp+432-240]	; p1 - q1
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+432-336]
+	movdqa	xmm5, [esp+432-368]
+	psraw	xmm7, 3	; (4*(q0-p0) + (p1-q1) + 4) >> 3
+	pmaxsw	xmm6, xmm7
+	pminsw	xmm5, xmm6	; clamp to [-adjusted tc, +adjusted tc]
+
+	pand	xmm5, [esp+432-320]
+	movdqa	xmm6, [esp+432-400]
+	movdqa	[esp+432-64], xmm5	; masked p0/q0 delta, columns 0-7
+	movdqa	[esp+432-384], xmm6	; slot reused: tc (columns 0-7)
+	movdqa	xmm5, xmm0
+	psubw	xmm5, xmm6
+	movdqa	[esp+432-368], xmm5	; -tc
+	movdqa	xmm6, xmm5
+	movdqa	xmm5, [esp+432-272]
+	paddw	xmm5, [esp+432-304]
+	movdqa	xmm7, xmm2
+	paddw	xmm7, xmm2
+	psubw	xmm5, xmm7
+	psraw	xmm5, 1	; (p2 + avg(p0,q0) - 2*p1) >> 1
+	pmaxsw	xmm6, xmm5
+	movdqa	xmm5, [esp+432-384]
+	pminsw	xmm5, xmm6	; clamp to [-tc, +tc]
+
+	pand	xmm5, [esp+432-320]
+	pand	xmm5, [esp+432-288]	; p1 changes only where beta > |p0 - p2|
+	movdqa	xmm6, [esp+432-240]
+	movdqa	[esp+432-96], xmm5	; p1 adjustment, columns 0-7
+	movdqa	xmm5, [esp+432-352]
+	paddw	xmm5, [esp+432-304]
+	movdqa	xmm7, xmm6
+	paddw	xmm7, xmm6
+	movdqa	xmm6, [esp+432-368]
+	psubw	xmm5, xmm7
+
+	movdqa	xmm7, [esp+496-208]
+	psraw	xmm5, 1	; (q2 + avg(p0,q0) - 2*q1) >> 1
+	pmaxsw	xmm6, xmm5
+	movdqa	xmm5, [esp+432-400]
+	pminsw	xmm5, xmm6	; clamp to [-tc, +tc]
+	pand	xmm5, [esp+432-320]
+	pand	xmm5, [esp+432-256]	; q1 changes only where beta > |q0 - q2|
+	movdqa	xmm6, [esp+448-208]
+	punpckhbw xmm7, xmm0
+	movdqa	[esp+432-352], xmm7	; slot reused: q1 high words (columns 8-15)
+
+	movdqa	xmm7, [esp+512-208]
+	punpckhbw xmm6, xmm0
+	movdqa	[esp+432-48], xmm5	; q1 adjustment, columns 0-7
+	movdqa	xmm5, [esp+432-208]
+	movdqa	[esp+432-368], xmm6	; slot reused: p1 high words
+	movdqa	xmm6, [esp+464-208]
+	punpckhbw xmm7, xmm0
+	punpckhbw xmm5, xmm0
+	movdqa	[esp+432-384], xmm7	; slot reused: q2 high words
+	punpckhbw xmm6, xmm0
+	movdqa	[esp+432-400], xmm6	; slot reused: p0 high words
+
+	movdqa	xmm7, [esp+432-400]
+	movdqa	xmm6, [esp+480-208]
+	psubw	xmm7, xmm5
+	movdqa	[esp+432-16], xmm5	; p2 high words
+	pabsw	xmm7, xmm7
+	punpckhbw xmm6, xmm0	; xmm6 = q0 high words
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-288], xmm5	; mask (high half): beta > |p0 - p2|
+
+	movdqa	xmm7, xmm6
+	psubw	xmm7, [esp+432-384]
+	pabsw	xmm7, xmm7
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-256], xmm5	; mask (high half): beta > |q0 - q2|
+
+	movdqa	xmm5, [esp+432-400]
+	movdqa	[esp+432-80], xmm6	; keep q0 high words
+	pavgw	xmm5, xmm6
+	movdqa	[esp+432-304], xmm5	; (p0 + q0 + 1) >> 1, high half
+
+	movdqa	xmm5, xmm1
+	psubw	xmm5, [esp+432-288]
+	psubw	xmm5, [esp+432-256]
+	movdqa	[esp+432-224], xmm5	; adjusted tc, high half
+	movdqa	xmm5, xmm6
+	psubw	xmm5, [esp+432-400]	; q0 - p0
+	psubw	xmm6, [esp+432-352]	; q0 - q1
+	movdqa	[esp+432-272], xmm5
+	movdqa	xmm7, xmm5
+	movdqa	xmm5, [esp+432-112]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm5, xmm7	; alpha > |q0 - p0|
+	movdqa	xmm7, xmm4
+	pabsw	xmm6, xmm6
+	pcmpgtw	xmm7, xmm6	; beta > |q0 - q1|
+	movdqa	xmm6, [esp+432-368]
+
+	pand	xmm5, xmm7
+	movdqa	xmm7, [esp+432-400]
+	psubw	xmm7, xmm6
+	psubw	xmm6, [esp+432-352]	; p1 - q1
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm4, xmm7	; beta > |p0 - p1|
+	pand	xmm5, xmm4
+
+	paddw	xmm2, [esp+432-96]	; new p1, columns 0-7
+	movdqa	xmm4, xmm1
+	pcmpgtw	xmm4, xmm0
+	movdqa	xmm7, xmm1
+	pcmpeqw	xmm7, xmm0
+	por	xmm4, xmm7	; lanes where tc >= 0
+	pand	xmm5, xmm4
+	movdqa	xmm4, [esp+432-224]
+	movdqa	[esp+432-320], xmm5	; filter-enable mask for columns 8-15
+	movdqa	xmm5, [esp+432-272]
+	movdqa	xmm7, xmm0
+	psubw	xmm7, xmm4
+	psubw	xmm0, xmm1	; xmm0 = -tc (high half); the zero register is gone from here on
+	psllw	xmm5, 2
+	paddw	xmm6, xmm5
+	paddw	xmm6, [esp+432-336]	; + 4 (read before the slot is overwritten below)
+	movdqa	xmm5, [esp+432-368]
+	movdqa	[esp+432-336], xmm0	; slot reused: -tc (high half)
+	psraw	xmm6, 3
+	pmaxsw	xmm7, xmm6
+	pminsw	xmm4, xmm7
+	pand	xmm4, [esp+432-320]
+	movdqa	xmm6, xmm0
+	movdqa	xmm0, [esp+432-16]
+	paddw	xmm0, [esp+432-304]
+	movdqa	[esp+432-272], xmm4	; masked p0/q0 delta, columns 8-15
+	movdqa	xmm4, [esp+432-368]
+	paddw	xmm4, xmm4
+	psubw	xmm0, xmm4
+
+	movdqa	xmm4, [esp+432-64]
+	psraw	xmm0, 1	; (p2 + avg(p0,q0) - 2*p1) >> 1, high half
+	pmaxsw	xmm6, xmm0
+	movdqa	xmm0, [esp+432-400]
+	movdqa	xmm7, xmm1
+	pminsw	xmm7, xmm6
+	movdqa	xmm6, [esp+432-320]
+	pand	xmm7, xmm6
+	pand	xmm7, [esp+432-288]
+	paddw	xmm5, xmm7	; new p1, columns 8-15
+	packuswb xmm2, xmm5	; xmm2 = new p1 row (saturated to bytes)
+	movdqa	xmm5, [esp+432-272]
+	paddw	xmm0, xmm5
+	paddw	xmm3, xmm4
+	packuswb xmm3, xmm0	; xmm3 = new p0 row (p0 + delta)
+
+	movdqa	xmm0, [esp+432-32]
+	psubw	xmm0, xmm4
+	movdqa	xmm4, [esp+432-80]
+	psubw	xmm4, xmm5
+
+	movdqa	xmm5, [esp+432-240]
+	paddw	xmm5, [esp+432-48]	; new q1, columns 0-7
+	packuswb xmm0, xmm4	; new q0 row (q0 - delta)
+	movdqa	xmm4, [esp+432-384]
+	paddw	xmm4, [esp+432-304]
+	movdqa	[esp+480-208], xmm0
+	movdqa	xmm0, [esp+432-352]
+	movdqa	xmm7, xmm0
+	paddw	xmm0, xmm0
+
+	mov	ecx, dword [esp+432-408]	; ecx = p0 row address
+
+	mov	edx, dword [esp+432-404]	; edx = q1 row address
+	psubw	xmm4, xmm0
+	movdqa	xmm0, [esp+432-336]
+	movdqa	[edi], xmm2	; store new p1 row
+	psraw	xmm4, 1	; (q2 + avg(p0,q0) - 2*q1) >> 1, high half
+	pmaxsw	xmm0, xmm4
+	pminsw	xmm1, xmm0	; clamp to [-tc, +tc]
+	movdqa	xmm0, [esp+480-208]
+
+	pop	edi
+	pand	xmm1, xmm6
+	pand	xmm1, [esp+428-256]	; same slot as [esp+432-256] before the pop: beta > |q0 - q2| (high half)
+	movdqa	[ecx], xmm3	; store new p0 row
+	paddw	xmm7, xmm1
+	pop	esi
+	packuswb xmm5, xmm7
+	movdqa	[eax], xmm0	; store new q0 row
+	movdqa	[edx], xmm5	; store new q1 row
+	pop	ebx
+	mov	esp, ebp
+	pop	ebp
+	ret
+
+
+;*******************************************************************************
+;    void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+;                                 int32_t iBeta)
+;*******************************************************************************
+; Notation used below: p3..p0 = 16-byte rows at pPix-4..-1*iStride, q0..q3 = rows at pPix+0..+3*iStride. Rewrites p2, p1, p0, q0, q1, q2 in place.
+WELS_EXTERN  DeblockLumaEq4V_sse2
+; Arguments are read from the stack; only the low 16 bits of iAlpha/iBeta are used. All row accesses are movdqa (16-byte aligned addresses required).
+ALIGN  16
+
+DeblockLumaEq4V_sse2:
+
+	push	ebp
+	mov	ebp, esp
+	and	esp, -16				; fffffff0H
+	sub	esp, 628				; 00000274H
+	mov	eax, dword [ebp+8]	; eax = pPix (never modified; used for the final q0 store)
+	mov	ecx, dword [ebp+12]	; ecx = iStride
+	push	ebx
+	push	esi
+
+	lea	edx, [ecx*4]	; edx = 4*iStride
+	pxor	xmm0, xmm0
+	movdqa	xmm2, xmm0	; xmm2 = zero vector for byte->word unpacking
+
+	movdqa	xmm0, [ecx+eax]	; q1 row
+	mov	esi, eax
+	sub	esi, edx
+	movdqa	xmm3, [esi]	; p3 row
+	movdqa	xmm5, [eax]	; q0 row
+	push	edi
+	lea	edi, [ecx+ecx]	; 2*iStride
+	lea	ebx, [ecx+ecx*2]	; 3*iStride
+	mov	dword [esp+640-600], edi
+	mov	esi, eax
+	sub	esi, edi	; esi = pPix - 2*iStride, kept for the final p1 store
+	movdqa	xmm1, [esi]	; p1 row
+	movdqa	 [esp+720-272], xmm0	; spill q1 bytes
+	mov	edi, eax
+	sub	edi, ecx	; edi = pPix - iStride, kept for the final p0 store
+	movdqa	xmm4, [edi]	; p0 row
+	add	ecx, eax
+	mov	dword [esp+640-596], ecx	; save q1 row address (pPix + iStride)
+
+	mov	ecx, dword [esp+640-600]
+	movdqa	xmm0, [ecx+eax]	; q2 row
+	movdqa	 [esp+736-272], xmm0	; spill q2 bytes
+
+	movdqa	xmm0, [eax+ebx]	; q3 row
+	mov	edx, eax
+	sub	edx, ebx	; edx = pPix - 3*iStride, kept for the final p2 store
+
+	movsx	ebx, word [ebp+16]	; low word of iAlpha, sign-extended
+	movdqa	xmm6, [edx]	; p2 row
+	add	ecx, eax	; ecx = pPix + 2*iStride, kept for the final q2 store
+	movdqa	 [esp+752-272], xmm0	; spill q3 bytes
+	movd	xmm0, ebx
+
+	movsx	ebx, word [ebp+20]	; low word of iBeta, sign-extended
+	movdqa	xmm7, xmm0
+	punpcklwd xmm7, xmm0
+	pshufd	xmm0, xmm7, 0
+	movdqa	 [esp+640-320], xmm0	; alpha replicated into all 8 words
+	movd	xmm0, ebx
+	movdqa	xmm7, xmm0
+	punpcklwd xmm7, xmm0
+	pshufd	xmm0, xmm7, 0	; beta replicated into all 8 words
+
+	movdqa	xmm7, [esp+736-272]
+	punpcklbw xmm7, xmm2
+	movdqa	 [esp+640-416], xmm7	; q2 low words (columns 0-7)
+	movdqa	 [esp+640-512], xmm0	; beta vector
+	movdqa	xmm0, xmm1
+	movdqa	 [esp+672-272], xmm1	; spill p1 bytes
+	movdqa	xmm1, xmm4
+	movdqa	 [esp+704-272], xmm5	; spill q0 bytes
+	punpcklbw xmm5, xmm2	; xmm5 = q0 low words
+	punpcklbw xmm1, xmm2	; xmm1 = p0 low words
+
+	movdqa	xmm7, xmm5
+	psubw	xmm7, xmm1
+	pabsw	xmm7, xmm7
+	movdqa	 [esp+640-560], xmm7	; |q0 - p0|
+	punpcklbw xmm0, xmm2
+	movdqa	 [esp+688-272], xmm4	; spill p0 bytes
+	movdqa	xmm4, [esp+720-272]
+	movdqa	 [esp+640-480], xmm0	; p1 low words
+
+	movdqa	xmm7, xmm1
+	psubw	xmm7, xmm0
+
+	movdqa	xmm0, [esp+640-512]
+	pabsw	xmm7, xmm7
+	punpcklbw xmm4, xmm2
+	pcmpgtw	xmm0, xmm7	; beta > |p0 - p1|
+	movdqa	 [esp+640-384], xmm4	; q1 low words
+	movdqa	xmm7, xmm5
+	psubw	xmm7, xmm4
+	movdqa	xmm4, [esp+640-512]
+	movdqa	 [esp+656-272], xmm6	; spill p2 bytes
+	punpcklbw xmm6, xmm2
+	pabsw	xmm7, xmm7
+	movdqa	 [esp+640-48], xmm2	; keep a zero vector on the stack for later
+	movdqa	 [esp+640-368], xmm6	; p2 low words
+	movdqa	 [esp+640-144], xmm1	; p0 low words
+	movdqa	 [esp+640-400], xmm5	; q0 low words
+	pcmpgtw	xmm4, xmm7	; beta > |q0 - q1|
+	pand	xmm0, xmm4
+	movdqa	xmm4, [esp+640-320]
+	pcmpgtw	xmm4, [esp+640-560]	; alpha > |q0 - p0|
+	pand	xmm0, xmm4	; xmm0 = filter-enable mask for columns 0-7
+
+	mov	ebx, 2
+	movsx	ebx, bx
+	movd	xmm4, ebx
+	movdqa	xmm7, xmm4
+	punpcklwd xmm7, xmm4
+	movdqa	xmm4, [esp+640-320]
+	psraw	xmm4, 2
+	pshufd	xmm7, xmm7, 0	; constant 2 in all words
+	paddw	xmm4, xmm7
+	movdqa	 [esp+640-576], xmm4	; (alpha >> 2) + 2
+	pcmpgtw	xmm4, [esp+640-560]
+	movdqa	 [esp+640-560], xmm4	; slot reused: mask ((alpha >> 2) + 2) > |q0 - p0|
+
+	movdqa	xmm4, [esp+640-512]
+	movdqa	 [esp+640-624], xmm7	; constant 2 (rounding for >> 2)
+	movdqa	xmm7, xmm1
+	psubw	xmm7, xmm6
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm4, xmm7	; beta > |p0 - p2|
+
+	pand	xmm4, [esp+640-560]
+	movdqa	 [esp+640-544], xmm4	; p-side strong-filter mask
+	movdqa	xmm4, [esp+640-512]
+	movdqa	xmm7, xmm5
+	psubw	xmm7, [esp+640-416]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm4, xmm7	; beta > |q0 - q2|
+
+	pand	xmm4, [esp+640-560]
+	movdqa	 [esp+640-560], xmm4	; slot reused: q-side strong-filter mask
+
+	movdqa	xmm4, [esp+640-544]
+	pandn	xmm4, xmm6
+	movdqa	 [esp+640-16], xmm4	; original p2 where the p-side strong mask is clear
+	mov	ebx, 4
+	movsx	ebx, bx
+	movd	xmm4, ebx
+	movdqa	xmm7, xmm4
+	punpcklwd xmm7, xmm4
+	movdqa	xmm4, xmm3
+	punpcklbw xmm4, xmm2
+	psllw	xmm4, 1	; 2*p3
+	paddw	xmm4, xmm6
+	paddw	xmm4, xmm6
+	paddw	xmm4, xmm6	; + 3*p2
+	paddw	xmm4, [esp+640-480]
+
+	movdqa	xmm6, [esp+640-560]
+	pshufd	xmm7, xmm7, 0
+	paddw	xmm4, xmm1
+	movdqa	 [esp+640-592], xmm7	; constant 4 (rounding for >> 3)
+	paddw	xmm4, xmm5
+	paddw	xmm4, xmm7	; xmm4 = 2*p3 + 3*p2 + p1 + p0 + q0 + 4
+	movdqa	xmm7, [esp+640-416]
+	pandn	xmm6, xmm7
+	movdqa	 [esp+640-80], xmm6	; original q2 where the q-side strong mask is clear
+	movdqa	xmm6, [esp+752-272]
+	punpcklbw xmm6, xmm2
+	psllw	xmm6, 1	; 2*q3
+	paddw	xmm6, xmm7
+	paddw	xmm6, xmm7
+	paddw	xmm6, xmm7	; + 3*q2
+	paddw	xmm6, [esp+640-384]
+
+	movdqa	xmm7, [esp+640-480]
+	paddw	xmm6, xmm5
+	paddw	xmm6, xmm1
+	paddw	xmm6, [esp+640-592]
+	psraw	xmm6, 3	; (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
+	pand	xmm6, [esp+640-560]
+	movdqa	 [esp+640-112], xmm6	; strong q2 candidate, columns 0-7
+	movdqa	xmm6, [esp+640-544]
+	pandn	xmm6, xmm7
+	movdqa	 [esp+640-336], xmm6	; original p1 where the p-side strong mask is clear
+	movdqa	xmm6, [esp+640-544]
+	movdqa	 [esp+640-528], xmm6
+	movdqa	xmm6, [esp+640-368]
+	paddw	xmm6, xmm7
+	movdqa	xmm7, xmm1
+	psraw	xmm4, 3	; (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
+	pand	xmm4, [esp+640-544]	; strong p2 candidate, columns 0-7
+	paddw	xmm7, xmm5
+	paddw	xmm6, xmm7
+	paddw	xmm6, [esp+640-624]
+	movdqa	xmm7, [esp+640-528]
+
+	paddw	xmm5, xmm1	; xmm5 = q0 + p0
+	psraw	xmm6, 2	; (p2 + p1 + p0 + q0 + 2) >> 2
+	pand	xmm7, xmm6
+
+	movdqa	xmm6, [esp+640-384]
+	movdqa	 [esp+640-64], xmm7	; strong p1 candidate, columns 0-7
+	movdqa	xmm7, [esp+640-560]
+	pandn	xmm7, xmm6
+	movdqa	 [esp+640-304], xmm7	; original q1 where the q-side strong mask is clear
+	movdqa	xmm7, [esp+640-560]
+	movdqa	 [esp+640-528], xmm7
+	movdqa	xmm7, [esp+640-416]
+	paddw	xmm7, xmm6
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+640-624]
+	movdqa	xmm5, [esp+640-528]
+	psraw	xmm7, 2	; (q2 + q1 + q0 + p0 + 2) >> 2
+	pand	xmm5, xmm7
+	movdqa	 [esp+640-32], xmm5	; strong q1 candidate, columns 0-7
+
+	movdqa	xmm5, [esp+640-544]
+	movdqa	 [esp+640-528], xmm5
+	movdqa	xmm5, [esp+640-480]
+	movdqa	xmm7, xmm5
+	paddw	xmm7, xmm5
+	movdqa	xmm5, xmm1
+	paddw	xmm5, xmm6
+	paddw	xmm6, [esp+640-592]	; xmm6 = q1 + 4
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+640-624]
+	movdqa	xmm5, [esp+640-528]
+	psraw	xmm7, 2	; (2*p1 + p0 + q1 + 2) >> 2
+	pandn	xmm5, xmm7
+	movdqa	xmm7, [esp+640-480]
+	paddw	xmm7, xmm1
+	paddw	xmm7, [esp+640-400]
+	movdqa	xmm1, [esp+640-544]
+	movdqa	 [esp+640-352], xmm5	; weak p0 candidate (where the p-side strong mask is clear)
+	movdqa	xmm5, [esp+640-368]
+	psllw	xmm7, 1	; 2*(p1 + p0 + q0)
+	paddw	xmm7, xmm6
+	paddw	xmm5, xmm7
+
+	movdqa	xmm7, [esp+640-400]
+	psraw	xmm5, 3	; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
+	pand	xmm1, xmm5
+	movdqa	xmm5, [esp+640-480]
+	movdqa	 [esp+640-96], xmm1	; strong p0 candidate, columns 0-7
+	movdqa	xmm1, [esp+640-560]
+	movdqa	 [esp+640-528], xmm1
+	movdqa	xmm1, [esp+640-384]
+	movdqa	xmm6, xmm1
+	paddw	xmm6, xmm1
+	paddw	xmm1, [esp+640-400]
+	paddw	xmm1, [esp+640-144]	; q1 + q0 + p0
+	paddw	xmm7, xmm5
+	paddw	xmm5, [esp+640-592]	; p1 + 4
+	paddw	xmm6, xmm7
+	paddw	xmm6, [esp+640-624]
+	movdqa	xmm7, [esp+640-528]
+	psraw	xmm6, 2	; (2*q1 + q0 + p1 + 2) >> 2
+	psllw	xmm1, 1
+	paddw	xmm1, xmm5
+
+	movdqa	xmm5, [esp+656-272]
+	pandn	xmm7, xmm6	; weak q0 candidate (where the q-side strong mask is clear)
+	movdqa	xmm6, [esp+640-416]
+	paddw	xmm6, xmm1
+	movdqa	xmm1, [esp+640-560]
+	psraw	xmm6, 3	; (q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4) >> 3
+	pand	xmm1, xmm6
+
+	movdqa	xmm6, [esp+704-272]
+	movdqa	 [esp+640-128], xmm1	; strong q0 candidate, columns 0-7
+	movdqa	xmm1, [esp+672-272]
+	punpckhbw xmm1, xmm2
+	movdqa	 [esp+640-448], xmm1	; p1 high words (columns 8-15)
+	movdqa	xmm1, [esp+688-272]
+	punpckhbw xmm1, xmm2
+	punpckhbw xmm6, xmm2
+	movdqa	 [esp+640-288], xmm7	; weak q0 candidate, columns 0-7
+	punpckhbw xmm5, xmm2	; xmm5 = p2 high words
+	movdqa	 [esp+640-496], xmm1	; p0 high words
+	movdqa	 [esp+640-432], xmm6	; q0 high words
+
+	movdqa	xmm7, [esp+720-272]
+	punpckhbw xmm7, xmm2
+	movdqa	 [esp+640-464], xmm7	; q1 high words
+
+	movdqa	xmm7, [esp+736-272]
+	punpckhbw xmm7, xmm2
+	movdqa	 [esp+640-528], xmm7	; slot reused: q2 high words
+
+	movdqa	xmm7, xmm6
+
+	psubw	xmm6, [esp+640-464]
+	psubw	xmm7, xmm1
+	pabsw	xmm7, xmm7
+	movdqa	 [esp+640-560], xmm7	; slot reused: |q0 - p0|, high half
+	por	xmm4, [esp+640-16]	; p2 low: strong value or original, selected by the p-side strong mask
+	pabsw	xmm6, xmm6
+	movdqa	xmm7, xmm1
+	psubw	xmm7, [esp+640-448]
+
+	movdqa	xmm1, [esp+640-512]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm1, xmm7	; beta > |p0 - p1|
+	movdqa	xmm7, [esp+640-512]
+	pcmpgtw	xmm7, xmm6	; beta > |q0 - q1|
+	movdqa	xmm6, [esp+640-320]
+	pand	xmm1, xmm7
+	movdqa	xmm7, [esp+640-560]
+	pcmpgtw	xmm6, xmm7	; alpha > |q0 - p0|
+	pand	xmm1, xmm6	; xmm1 = filter-enable mask for columns 8-15
+
+	movdqa	xmm6, [esp+640-576]
+	pcmpgtw	xmm6, xmm7	; ((alpha >> 2) + 2) > |q0 - p0|
+
+	movdqa	xmm7, [esp+640-496]
+	punpckhbw xmm3, xmm2	; xmm3 = p3 high words
+	movdqa	 [esp+640-560], xmm6
+	movdqa	xmm6, [esp+640-512]
+	psubw	xmm7, xmm5
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm6, xmm7	; beta > |p0 - p2|
+
+	pand	xmm6, [esp+640-560]
+	movdqa	xmm7, [esp+640-432]
+	psubw	xmm7, [esp+640-528]
+
+	psllw	xmm3, 1
+	movdqa	 [esp+640-544], xmm6	; slot reused: p-side strong mask, high half
+	movdqa	xmm6, [esp+640-512]
+
+	movdqa	xmm2, [esp+640-544]	; xmm2 no longer holds zero from here on
+	paddw	xmm3, xmm5
+	paddw	xmm3, xmm5
+	paddw	xmm3, xmm5
+	paddw	xmm3, [esp+640-448]
+	paddw	xmm3, [esp+640-496]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm6, xmm7	; beta > |q0 - q2|
+	pand	xmm6, [esp+640-560]
+	movdqa	 [esp+640-560], xmm6	; slot reused: q-side strong mask, high half
+
+	movdqa	xmm6, xmm0
+	pand	xmm6, xmm4
+	movdqa	xmm4, xmm0
+	pandn	xmm4, [esp+640-368]
+	por	xmm6, xmm4	; final p2, columns 0-7 (original where filtering is disabled)
+	movdqa	xmm4, [esp+640-432]
+	paddw	xmm3, xmm4
+	paddw	xmm3, [esp+640-592]
+	psraw	xmm3, 3	; (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3, high half
+	pand	xmm3, xmm2
+	pandn	xmm2, xmm5
+	por	xmm3, xmm2
+	movdqa	xmm7, xmm1
+	pand	xmm7, xmm3
+	movdqa	xmm3, [esp+640-64]
+	por	xmm3, [esp+640-336]
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm5
+	por	xmm7, xmm2	; final p2, columns 8-15
+
+	movdqa	xmm2, xmm0
+	pand	xmm2, xmm3
+	movdqa	xmm3, xmm0
+	pandn	xmm3, [esp+640-480]
+	por	xmm2, xmm3
+	packuswb xmm6, xmm7
+	movdqa	 [esp+640-336], xmm2	; final p1, columns 0-7
+	movdqa	 [esp+656-272], xmm6	; new p2 row (packed bytes)
+	movdqa	xmm6, [esp+640-544]
+	movdqa	xmm2, xmm5
+	paddw	xmm2, [esp+640-448]
+	movdqa	xmm3, xmm1
+	movdqa	xmm7, [esp+640-496]
+	paddw	xmm7, xmm4
+	paddw	xmm2, xmm7
+	paddw	xmm2, [esp+640-624]
+	movdqa	xmm7, [esp+640-544]
+	psraw	xmm2, 2	; (p2 + p1 + p0 + q0 + 2) >> 2, high half
+	pand	xmm6, xmm2
+	movdqa	xmm2, [esp+640-448]
+	pandn	xmm7, xmm2
+	por	xmm6, xmm7
+	pand	xmm3, xmm6
+	movdqa	xmm6, xmm1
+	pandn	xmm6, xmm2
+	paddw	xmm2, [esp+640-496]
+	paddw	xmm2, xmm4
+	por	xmm3, xmm6
+	movdqa	xmm6, [esp+640-336]
+	packuswb xmm6, xmm3
+	psllw	xmm2, 1	; 2*(p1 + p0 + q0), high half
+	movdqa	 [esp+672-272], xmm6	; new p1 row (packed bytes)
+	movdqa	xmm6, [esp+640-96]
+	por	xmm6, [esp+640-352]	; p0 low: strong or weak candidate
+
+	movdqa	xmm3, xmm0
+	pand	xmm3, xmm6
+	movdqa	xmm6, xmm0
+	pandn	xmm6, [esp+640-144]
+	por	xmm3, xmm6
+	movdqa	xmm6, [esp+640-544]
+	movdqa	 [esp+640-352], xmm3	; final p0, columns 0-7
+	movdqa	xmm3, [esp+640-464]
+	paddw	xmm3, [esp+640-592]
+	paddw	xmm2, xmm3
+	movdqa	xmm3, [esp+640-448]
+	paddw	xmm5, xmm2
+	movdqa	xmm2, [esp+640-496]
+	psraw	xmm5, 3	; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3, high half
+	pand	xmm6, xmm5
+	movdqa	xmm5, [esp+640-464]
+	paddw	xmm2, xmm5
+	paddw	xmm5, [esp+640-432]
+	movdqa	xmm4, xmm3
+	paddw	xmm4, xmm3
+	paddw	xmm4, xmm2
+	paddw	xmm4, [esp+640-624]
+	movdqa	xmm2, [esp+640-544]
+	paddw	xmm3, [esp+640-592]
+	psraw	xmm4, 2	; (2*p1 + p0 + q1 + 2) >> 2, high half
+	pandn	xmm2, xmm4
+	por	xmm6, xmm2
+	movdqa	xmm7, xmm1
+	pand	xmm7, xmm6
+	movdqa	xmm6, [esp+640-496]
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm6
+	por	xmm7, xmm2
+	movdqa	xmm2, [esp+640-352]
+	packuswb xmm2, xmm7
+	movdqa	 [esp+688-272], xmm2	; new p0 row (packed bytes)
+	movdqa	xmm2, [esp+640-128]
+	por	xmm2, [esp+640-288]	; q0 low: strong or weak candidate
+
+	movdqa	xmm4, xmm0
+	pand	xmm4, xmm2
+	paddw	xmm5, xmm6
+	movdqa	xmm2, xmm0
+	pandn	xmm2, [esp+640-400]
+	por	xmm4, xmm2
+	movdqa	xmm2, [esp+640-528]
+	psllw	xmm5, 1	; 2*(q1 + q0 + p0), high half
+	paddw	xmm5, xmm3
+	movdqa	xmm3, [esp+640-560]
+	paddw	xmm2, xmm5
+	psraw	xmm2, 3	; (q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4) >> 3, high half
+	movdqa	 [esp+640-288], xmm4	; final q0, columns 0-7
+	movdqa	xmm4, [esp+640-560]
+	pand	xmm4, xmm2
+	movdqa	xmm2, [esp+640-464]
+	movdqa	xmm5, xmm2
+	paddw	xmm5, xmm2
+	movdqa	xmm2, [esp+640-432]
+	paddw	xmm2, [esp+640-448]
+	movdqa	xmm7, xmm1
+	paddw	xmm5, xmm2
+	paddw	xmm5, [esp+640-624]
+	movdqa	xmm6, [esp+640-560]
+	psraw	xmm5, 2	; (2*q1 + q0 + p1 + 2) >> 2, high half
+	pandn	xmm3, xmm5
+	por	xmm4, xmm3
+	movdqa	xmm3, [esp+640-32]
+	por	xmm3, [esp+640-304]	; q1 low: strong value or original
+	pand	xmm7, xmm4
+	movdqa	xmm4, [esp+640-432]
+	movdqa	xmm5, [esp+640-464]
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm4
+	paddw	xmm4, [esp+640-496]
+	por	xmm7, xmm2
+	movdqa	xmm2, [esp+640-288]
+	packuswb xmm2, xmm7
+	movdqa	 [esp+704-272], xmm2	; new q0 row (packed bytes)
+
+	movdqa	xmm2, xmm0
+	pand	xmm2, xmm3
+	movdqa	xmm3, xmm0
+	pandn	xmm3, [esp+640-384]
+	por	xmm2, xmm3
+	movdqa	 [esp+640-304], xmm2	; final q1, columns 0-7
+	movdqa	xmm2, [esp+640-528]
+	movdqa	xmm3, xmm2
+	paddw	xmm3, [esp+640-464]
+	paddw	xmm3, xmm4
+	paddw	xmm3, [esp+640-624]
+	psraw	xmm3, 2	; (q2 + q1 + q0 + p0 + 2) >> 2, high half
+	pand	xmm6, xmm3
+	movdqa	xmm3, [esp+640-560]
+	movdqa	xmm4, xmm3
+	pandn	xmm4, xmm5
+	por	xmm6, xmm4
+	movdqa	xmm7, xmm1
+	pand	xmm7, xmm6
+	movdqa	xmm6, [esp+640-304]
+	movdqa	xmm4, xmm1
+	pandn	xmm4, xmm5
+	por	xmm7, xmm4
+
+	movdqa	xmm4, xmm0
+	pandn	xmm0, [esp+640-416]
+	packuswb xmm6, xmm7	; xmm6 = new q1 row (packed bytes)
+	movdqa	xmm7, [esp+640-112]
+	por	xmm7, [esp+640-80]	; q2 low: strong value or original
+	pand	xmm4, xmm7
+	por	xmm4, xmm0	; final q2, columns 0-7
+	movdqa	xmm0, [esp+752-272]
+	punpckhbw xmm0, [esp+640-48]	; q3 high words, using the zero vector saved on the stack
+	psllw	xmm0, 1
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm5
+	paddw	xmm0, [esp+640-432]
+	paddw	xmm0, [esp+640-496]
+	paddw	xmm0, [esp+640-592]
+	psraw	xmm0, 3	; (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3, high half
+	pand	xmm0, xmm3
+	movdqa	xmm7, xmm1
+	pandn	xmm3, xmm2
+	por	xmm0, xmm3
+	pand	xmm7, xmm0
+
+	movdqa	xmm0, [esp+656-272]
+	movdqa	 [edx], xmm0	; store new p2 row (pPix - 3*iStride)
+
+	movdqa	xmm0, [esp+672-272]
+
+	mov	edx, dword [esp+640-596]	; edx = q1 row address (pPix + iStride)
+	movdqa	 [esi], xmm0	; store new p1 row
+	movdqa	xmm0, [esp+688-272]
+	movdqa	 [edi], xmm0	; store new p0 row
+	movdqa	xmm0, [esp+704-272]
+
+	pop	edi
+	pandn	xmm1, xmm2
+	movdqa	 [eax], xmm0	; store new q0 row
+	por	xmm7, xmm1	; final q2, columns 8-15
+	pop	esi
+	packuswb xmm4, xmm7
+	movdqa	 [edx], xmm6	; store new q1 row
+	movdqa	 [ecx], xmm4	; store new q2 row (pPix + 2*iStride)
+	pop	ebx
+	mov	esp, ebp
+	pop	ebp
+	ret
+
+
+;********************************************************************************
+; Reads 8 bytes from each of 16 consecutive rows of pPixY (row pitch iStride) and writes 8 x 16 bytes to pDst.
+;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
+; pDst is written with movdqa, so it must be 16-byte aligned; pPixY rows are read with movq (no alignment needed).
+;********************************************************************************
+
+WELS_EXTERN  DeblockLumaTransposeH2V_sse2
+
+ALIGN  16
+
+DeblockLumaTransposeH2V_sse2:
+    push    ebp
+    push    ebx
+    mov     ebp,   esp         ; two registers pushed, so the first argument is at [ebp + 0Ch]
+    and     esp,0FFFFFFF0h
+    sub     esp,   10h         ; 16-byte aligned scratch slot at [esp]
+
+    mov     eax,   [ebp + 0Ch] ; eax = pPixY (rows 0..7)
+    mov     ecx,   [ebp + 10h] ; ecx = iStride
+    lea     edx,   [eax + ecx * 8] ; edx = row 8 (rows 8..15)
+    lea     ebx,   [ecx*3]     ; ebx = 3*iStride
+
+    movq    xmm0,  [eax]
+    movq    xmm7,  [edx]
+    punpcklqdq   xmm0,  xmm7   ; xmm0 = row 0 (low qword) | row 8 (high qword)
+    movq    xmm1,  [eax + ecx]
+    movq    xmm7,  [edx + ecx]
+    punpcklqdq   xmm1,  xmm7   ; xmm1 = row 1 | row 9
+    movq    xmm2,  [eax + ecx*2]
+    movq    xmm7,  [edx + ecx*2]
+    punpcklqdq   xmm2,  xmm7   ; xmm2 = row 2 | row 10
+    movq    xmm3,  [eax + ebx]
+    movq    xmm7,  [edx + ebx]
+    punpcklqdq   xmm3,  xmm7   ; xmm3 = row 3 | row 11
+
+    lea     eax,   [eax + ecx * 4] ; advance both pointers by four rows
+    lea     edx,   [edx + ecx * 4]
+    movq    xmm4,  [eax]
+    movq    xmm7,  [edx]
+    punpcklqdq   xmm4,  xmm7   ; xmm4 = row 4 | row 12
+    movq    xmm5,  [eax + ecx]
+    movq    xmm7,  [edx + ecx]
+    punpcklqdq   xmm5,  xmm7   ; xmm5 = row 5 | row 13
+    movq    xmm6,  [eax + ecx*2]
+    movq    xmm7,  [edx + ecx*2]
+    punpcklqdq   xmm6,  xmm7   ; xmm6 = row 6 | row 14
+
+    movdqa  [esp],   xmm0      ; spill xmm0: it is needed as a temporary for the last pair
+    movq    xmm7,  [eax + ebx]
+    movq    xmm0,  [edx + ebx]
+    punpcklqdq   xmm7,  xmm0   ; xmm7 = row 7 | row 15
+    movdqa  xmm0,   [esp]      ; restore row 0 | row 8
+
+    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
+    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
+    ; NOTE(review): macro is defined outside this chunk; presumably transposes both 8x8 byte blocks, with [esp] as scratch - confirm
+    mov    eax,   [ebp + 14h]  ; eax = pDst
+    movdqa  [eax],    xmm4     ; stores follow the macro's output register order listed above
+    movdqa  [eax + 10h],  xmm2
+    movdqa  [eax + 20h],  xmm3
+    movdqa  [eax + 30h],  xmm7
+    movdqa  [eax + 40h],  xmm5
+    movdqa  [eax + 50h],  xmm1
+    movdqa  [eax + 60h],  xmm6
+    movdqa  [eax + 70h],  xmm0
+
+    mov     esp,   ebp
+    pop     ebx
+    pop     ebp
+    ret
+
+
+
+;*******************************************************************************************
+; Reads 8 x 16 bytes from pSrc and writes 8 bytes to each of 16 consecutive rows of pPixY (row pitch iStride).
+;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
+; pSrc is read with movdqa, so it must be 16-byte aligned; pPixY rows are written with movq (no alignment needed).
+;*******************************************************************************************
+
+WELS_EXTERN   DeblockLumaTransposeV2H_sse2
+
+ALIGN  16
+
+DeblockLumaTransposeV2H_sse2:
+    push     ebp
+    mov      ebp,   esp
+
+    and     esp,  0FFFFFFF0h
+    sub     esp,   10h          ; 16-byte aligned scratch slot at [esp] for the macro
+
+    mov      eax,   [ebp + 10h] ; eax = pSrc
+    mov      ecx,   [ebp + 0Ch] ; ecx = iStride
+    mov      edx,   [ebp + 08h] ; edx = pPixY
+
+    movdqa   xmm0,  [eax]
+    movdqa   xmm1,  [eax + 10h]
+    movdqa   xmm2,  [eax + 20h]
+    movdqa   xmm3,	[eax + 30h]
+    movdqa   xmm4,	[eax + 40h]
+    movdqa   xmm5,	[eax + 50h]
+    movdqa   xmm6,	[eax + 60h]
+    movdqa   xmm7,	[eax + 70h]
+
+    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
+    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
+    ; NOTE(review): macro is defined outside this chunk; presumably transposes both 8x8 byte blocks - confirm
+    lea      eax,   [ecx * 3]   ; eax = 3*iStride
+
+    movq     [edx],  xmm4       ; rows 0..3: low 8 bytes, in the macro's output register order
+    movq     [edx + ecx],  xmm2
+    movq     [edx + ecx*2],  xmm3
+    movq     [edx + eax],  xmm7
+
+    lea      edx,   [edx + ecx*4]
+    movq     [edx],  xmm5       ; rows 4..7
+    movq     [edx + ecx],  xmm1
+    movq     [edx + ecx*2],  xmm6
+    movq     [edx + eax],  xmm0
+
+    psrldq    xmm4,   8         ; move the high 8 bytes of every register down for rows 8..15
+    psrldq    xmm2,   8
+    psrldq    xmm3,   8
+    psrldq    xmm7,   8
+    psrldq    xmm5,   8
+    psrldq    xmm1,   8
+    psrldq    xmm6,   8
+    psrldq    xmm0,   8
+
+    lea       edx,  [edx + ecx*4]
+    movq     [edx],  xmm4       ; rows 8..11
+    movq     [edx + ecx],  xmm2
+    movq     [edx + ecx*2],  xmm3
+    movq     [edx + eax],  xmm7
+
+    lea      edx,   [edx + ecx*4]
+    movq     [edx],  xmm5       ; rows 12..15
+    movq     [edx + ecx],  xmm1
+    movq     [edx + ecx*2],  xmm6
+    movq     [edx + eax],  xmm0
+
+
+    mov      esp,   ebp
+    pop      ebp
     ret
\ No newline at end of file
--- a/codec/encoder/core/asm/expand_picture.asm
+++ b/codec/encoder/core/asm/expand_picture.asm
@@ -153,11 +153,11 @@
 	lea %1, [%1+%2]
 %endmacro
 
-%macro exp_top_bottom_sse2	1	; iPaddingSize [luma(32)/chroma(16)]		
+%macro exp_top_bottom_sse2	1	; iPaddingSize [luma(32)/chroma(16)]
 	; ebx [width/16(8)]
 	; esi [pSrc+0], edi [pSrc-1], ecx [-stride], 32(16)		; top
 	; eax [pSrc+(h-1)*stride], ebp [pSrc+(h+31)*stride], 32(16)	; bottom
-		
+
 %if %1 == 32		; for luma
 	sar ebx, 04h 	; width / 16(8) pixels
 .top_bottom_loops:
@@ -171,7 +171,7 @@
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
 	mov_line_end16x4_sse2 edi, ecx, xmm0, a
-	
+
 	; bottom
 	movdqa xmm1, [eax] 		; last line of picture pData
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a	; dst, stride, xmm?
@@ -182,15 +182,15 @@
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
 	mov_line_end16x4_sse2 ebp, ecx, xmm1, a
-		
+
 	lea esi, [esi+16]		; top pSrc
 	lea edi, [edi+16]		; top dst
 	lea eax, [eax+16]		; bottom pSrc
 	lea ebp, [ebp+16]		; bottom dst
-	neg ecx 			; positive/negative stride need for next loop?	
-	
+	neg ecx 			; positive/negative stride need for next loop?
+
 	dec ebx
-	jnz near .top_bottom_loops		
+	jnz near .top_bottom_loops
 %elif %1 == 16	; for chroma ??
 	mov edx, ebx
 	sar ebx, 04h 	; (width / 16) pixels
@@ -200,21 +200,21 @@
 	mov_line_16x4_sse2 edi, ecx, xmm0, a	; dst, stride, xmm?
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
-	mov_line_end16x4_sse2 edi, ecx, xmm0, a	
-	
+	mov_line_end16x4_sse2 edi, ecx, xmm0, a
+
 	; bottom
 	movdqa xmm1, [eax] 		; last line of picture pData
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a	; dst, stride, xmm?
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
-	mov_line_end16x4_sse2 ebp, ecx, xmm1, a	
-		
+	mov_line_end16x4_sse2 ebp, ecx, xmm1, a
+
 	lea esi, [esi+16]		; top pSrc
 	lea edi, [edi+16]		; top dst
 	lea eax, [eax+16]		; bottom pSrc
 	lea ebp, [ebp+16]		; bottom dst
-	neg ecx 			; positive/negative stride need for next loop?	
-	
+	neg ecx 			; positive/negative stride need for next loop?
+
 	dec ebx
 	jnz near .top_bottom_loops
 
@@ -241,13 +241,13 @@
 %endif
 %endmacro
 
-%macro exp_left_right_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a	
+%macro exp_left_right_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a
 	; ecx [height]
 	; esi [pSrc+0], 	   edi [pSrc-32], edx [stride], 32(16)	; left
 	; ebx [pSrc+(w-1)], ebp [pSrc+w], 32(16)			; right
 ;	xor eax, eax 	; for pixel pData (uint8_t)		; make sure eax=0 at least high 24 bits of eax = 0
-	
-%if %1 == 32		; for luma	
+
+%if %1 == 32		; for luma
 .left_right_loops:
 	; left
 	mov al, byte [esi]		; pixel pData for left border
@@ -254,37 +254,37 @@
 	butterfly_1to16_sse	xmm0, xmm1, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
 	movdqa [edi], xmm0
 	movdqa [edi+16], xmm0
-	
+
 	; right
 	mov al, byte [ebx]
 	butterfly_1to16_sse	xmm1, xmm2, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
 	movdqa [ebp], xmm1
 	movdqa [ebp+16], xmm1
-	
+
 	lea esi, [esi+edx]		; left pSrc
 	lea edi, [edi+edx]		; left dst
 	lea ebx, [ebx+edx]		; right pSrc
-	lea ebp, [ebp+edx]		; right dst	
-	
+	lea ebp, [ebp+edx]		; right dst
+
 	dec ecx
-	jnz near .left_right_loops		
-%elif %1 == 16	; for chroma ??	
+	jnz near .left_right_loops
+%elif %1 == 16	; for chroma ??
 .left_right_loops:
 	; left
 	mov al, byte [esi]		; pixel pData for left border
 	butterfly_1to16_sse	xmm0, xmm1, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
-	movdqa [edi], xmm0	
-	
+	movdqa [edi], xmm0
+
 	; right
 	mov al, byte [ebx]
 	butterfly_1to16_sse	xmm1, xmm2, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
 	movdq%2 [ebp], xmm1								; might not be aligned 16 bytes in case chroma planes
-	
+
 	lea esi, [esi+edx]		; left pSrc
 	lea edi, [edi+edx]		; left dst
 	lea ebx, [ebx+edx]		; right pSrc
-	lea ebp, [ebp+edx]		; right dst	
-	
+	lea ebp, [ebp+edx]		; right dst
+
 	dec ecx
 	jnz near .left_right_loops
 %endif
@@ -337,25 +337,25 @@
 	; TL
 	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
 	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?	
+	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
 	mov_line_end16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
 
 	; TR
 	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
 	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?	
+	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
 	mov_line_end16x4_sse2 ebp, ecx, xmm4, %2	; dst, stride, xmm?
 
 	; BL
 	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
 	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?	
+	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
 	mov_line_end16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
 
 	; BR
 	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
 	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?	
+	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
 	mov_line_end16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
 %endif
 %endmacro
@@ -373,7 +373,7 @@
 	push esi
 	push edi
 	push ebp
-	
+
 	; for both top and bottom border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst
@@ -385,10 +385,10 @@
 	mov cl, byte [esi]
 	butterfly_1to16_sse xmm3, xmm4, c		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	; load top border
-	mov ecx, edx							; stride	
+	mov ecx, edx							; stride
 	neg ecx 								; -stride
 	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border 
+	; load bottom border
 	dec eax									; h-1
 	imul eax, edx 							; (h-1)*stride
 	lea eax, [esi+eax]						; last line of picture pData
@@ -396,16 +396,16 @@
 	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*stride + 32 * stride
 	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
 	dec ebx									; width-1
-	lea ebx, [eax+ebx]						; dst[w-1][h-1]	
+	lea ebx, [eax+ebx]						; dst[w-1][h-1]
 ;	xor edx, edx
 	mov dl, byte [eax]						; bottom-left
 	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	mov dl, byte [ebx]						; bottom-right
 	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding	
+	; for top & bottom expanding
 	mov ebx, [esp+32]						; width
-	exp_top_bottom_sse2	32	
-	
+	exp_top_bottom_sse2	32
+
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst: left border pSrc
@@ -417,7 +417,7 @@
 	lea edi, [esi+eax]						; left border dst
 	dec ebx
 	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst	
+	lea ebp, [ebx+1]						; right border dst
 	; prepare for cross border pData: top-right with xmm4
 ;	xor eax, eax
 	mov al, byte [ebx]						; top-right
@@ -424,7 +424,7 @@
 	butterfly_1to16_sse xmm4, xmm0, a		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	; for left & right border expanding
 	exp_left_right_sse2	32, a
-	
+
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst
@@ -434,7 +434,7 @@
 	; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
 	mov eax, -32							; luma=-32, chroma=-16
 	neg ecx										; -stride
-	lea edi, [esi+eax]						
+	lea edi, [esi+eax]
 	lea edi, [edi+ecx]				; last line of top-left border
 	lea ebp, [esi+ebx]
 	lea ebp, [ebp+ecx]				; last line of top-right border
@@ -442,19 +442,19 @@
 	mov ecx, [esp+28]					; stride
 	imul edx, ecx							; (height+32(16)) * stride
 	lea eax, [edi+edx]						; last line of bottom-left border
-	lea ebx, [ebp+edx]						; last line of bottom-right border	
+	lea ebx, [ebp+edx]						; last line of bottom-right border
 	neg ecx										; -stride
 	; for left & right border expanding
-	exp_cross_sse2		32, a	
-	
+	exp_cross_sse2		32, a
+
 ;	sfence									; commit cache write back memory
-	
+
 	pop ebp
 	pop edi
 	pop esi
 	pop edx
 	pop ebx
-	
+
 	ret
 
 ALIGN 16
@@ -470,7 +470,7 @@
 	push esi
 	push edi
 	push ebp
-	
+
 	; for both top and bottom border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst
@@ -482,10 +482,10 @@
 	mov cl, byte [esi]
 	butterfly_1to16_sse xmm3, xmm4, c		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	; load top border
-	mov ecx, edx							; stride	
+	mov ecx, edx							; stride
 	neg ecx 								; -stride
 	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border 
+	; load bottom border
 	dec eax									; h-1
 	imul eax, edx 							; (h-1)*stride
 	lea eax, [esi+eax]						; last line of picture pData
@@ -493,16 +493,16 @@
 	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*stride + 16 * stride
 	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
 	dec ebx									; width-1
-	lea ebx, [eax+ebx]						; dst[w-1][h-1]	
+	lea ebx, [eax+ebx]						; dst[w-1][h-1]
 ;	xor edx, edx
 	mov dl, byte [eax]						; bottom-left
 	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	mov dl, byte [ebx]						; bottom-right
 	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding	
+	; for top & bottom expanding
 	mov ebx, [esp+32]						; width
-	exp_top_bottom_sse2	16	
-	
+	exp_top_bottom_sse2	16
+
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst: left border pSrc
@@ -514,7 +514,7 @@
 	lea edi, [esi+eax]						; left border dst
 	dec ebx
 	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst	
+	lea ebp, [ebx+1]						; right border dst
 	; prepare for cross border pData: top-right with xmm4
 ;	xor eax, eax
 	mov al, byte [ebx]						; top-right
@@ -521,7 +521,7 @@
 	butterfly_1to16_sse xmm4, xmm0, a		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	; for left & right border expanding
 	exp_left_right_sse2	16, a
-	
+
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst
@@ -531,9 +531,9 @@
 	; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
 	mov eax, -16							; chroma=-16
 	neg ecx										; -stride
-	lea edi, [esi+eax]						
+	lea edi, [esi+eax]
 	lea edi, [edi+ecx]				; last line of top-left border
-	lea ebp, [esi+ebx]				
+	lea ebp, [esi+ebx]
 	lea ebp, [ebp+ecx]				; last line of top-right border
 	mov ecx, [esp+28]						; stride
 	add edx, 16							; height+16, luma=32, chroma=16
@@ -543,15 +543,15 @@
 	neg ecx										; -stride
 	; for left & right border expanding
 	exp_cross_sse2		16, a
-	
+
 ;	sfence									; commit cache write back memory
-	
+
 	pop ebp
 	pop edi
 	pop esi
 	pop edx
 	pop ebx
-	
+
 	ret
 
 ALIGN 16
@@ -567,7 +567,7 @@
 	push esi
 	push edi
 	push ebp
-	
+
 	; for both top and bottom border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst
@@ -579,10 +579,10 @@
 	mov cl, byte [esi]
 	butterfly_1to16_sse xmm3, xmm4, c		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	; load top border
-	mov ecx, edx							; stride	
+	mov ecx, edx							; stride
 	neg ecx 								; -stride
 	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border 
+	; load bottom border
 	dec eax									; h-1
 	imul eax, edx 							; (h-1)*stride
 	lea eax, [esi+eax]						; last line of picture pData
@@ -590,16 +590,16 @@
 	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*stride + 16 * stride
 	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
 	dec ebx									; width-1
-	lea ebx, [eax+ebx]						; dst[w-1][h-1]	
+	lea ebx, [eax+ebx]						; dst[w-1][h-1]
 ;	xor edx, edx
 	mov dl, byte [eax]						; bottom-left
 	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	mov dl, byte [ebx]						; bottom-right
 	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding	
+	; for top & bottom expanding
 	mov ebx, [esp+32]						; width
-	exp_top_bottom_sse2	16	
-	
+	exp_top_bottom_sse2	16
+
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst: left border pSrc
@@ -611,7 +611,7 @@
 	lea edi, [esi+eax]						; left border dst
 	dec ebx
 	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst	
+	lea ebp, [ebx+1]						; right border dst
 	; prepare for cross border pData: top-right with xmm4
 ;	xor eax, eax
 	mov al, byte [ebx]						; top-right
@@ -618,7 +618,7 @@
 	butterfly_1to16_sse xmm4, xmm0, a		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	; for left & right border expanding
 	exp_left_right_sse2	16, u
-	
+
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst
@@ -628,9 +628,9 @@
 	; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
 	neg ecx									; -stride
 	mov eax, -16							; chroma=-16
-	lea edi, [esi+eax]						
+	lea edi, [esi+eax]
 	lea edi, [edi+ecx]				; last line of top-left border
-	lea ebp, [esi+ebx]						
+	lea ebp, [esi+ebx]
 	lea ebp, [ebp+ecx]				; last line of top-right border
 	mov ecx, [esp+28]						; stride
 	add edx, 16							; height+16, luma=32, chroma=16
@@ -640,14 +640,14 @@
 	neg ecx									; -stride
 	; for left & right border expanding
 	exp_cross_sse2		16, u
-	
+
 ;	sfence									; commit cache write back memory
-	
+
 	pop ebp
 	pop edi
 	pop esi
 	pop edx
 	pop ebx
-	
+
 	ret
 
--- a/codec/encoder/core/asm/intra_pred.asm
+++ b/codec/encoder/core/asm/intra_pred.asm
@@ -95,13 +95,13 @@
 	punpcklbw	%1,	%3
 	movdqa		%3,	%1
 	punpcklbw	%1,	%3
-	
+
 	;add			%4,	%5
 	movd		%2,	[%4+%5-1]
 	movdqa		%3,	%2
 	punpcklbw	%2,	%3
 	movdqa		%3,	%2
-	punpcklbw	%2,	%3	
+	punpcklbw	%2,	%3
 	punpckldq	%1,	%2
 %endmacro
 
@@ -126,24 +126,24 @@
 		movd	%2,	[%5+%6]
 		punpcklbw %3,	%2
 		punpcklwd %1,	%3
-		lea		%5,	[%5+2*%6]	
+		lea		%5,	[%5+2*%6]
 		movd	%4,	[%5]
 		movd	%2,	[%5+%6]
 		punpcklbw %4,	%2
-		lea		%5,	[%5+2*%6]	
+		lea		%5,	[%5+2*%6]
 		movd	%3,	[%5]
 		movd	%2,	[%5+%6]
 		lea		%5,	[%5+2*%6]
 		punpcklbw %3,	%2
 		punpcklwd %4,	%3
-		punpckhdq %1,	%4	
-%endmacro	
+		punpckhdq %1,	%4
+%endmacro
 
 %macro  SUMW_HORIZON 3
 	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
 	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
-	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04 
-	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26 
+	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04
+	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
 	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
 	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
 	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
@@ -173,7 +173,7 @@
 		movd	%2,	[%5+%6]
 		punpcklbw %3,	%2
 		punpckhwd %1,	%3
-		lea		%5,	[%5+2*%6]			
+		lea		%5,	[%5+2*%6]
 %endmacro
 
 %macro LOAD_2_LEFT_AND_ADD 0
@@ -197,7 +197,7 @@
 ALIGN 16
 ;***********************************************************************
 ;   void __cdecl WelsI4x4LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-;   
+;
 ;	pred must align to 16
 ;***********************************************************************
 WelsI4x4LumaPredH_sse2:
@@ -207,11 +207,11 @@
 	movzx		edx,	byte [eax-1]
 	movd		xmm0,	edx
 	pmuludq		xmm0,	[mmx_01bytes]
-	
+
 	movzx		edx,	byte [eax+ecx-1]
 	movd		xmm1,	edx
 	pmuludq		xmm1,	[mmx_01bytes]
-	
+
 	unpcklps	xmm0,	xmm1
 
 	lea			eax,	[eax+ecx*2]
@@ -218,19 +218,19 @@
 	movzx		edx,	byte [eax-1]
 	movd		xmm2,	edx
 	pmuludq		xmm2,	[mmx_01bytes]
-	
+
 	movzx		edx,	byte [eax+ecx-1]
-	movd		xmm3,	edx	
+	movd		xmm3,	edx
 	pmuludq		xmm3,	[mmx_01bytes]
-	
+
 	unpcklps	xmm2,	xmm3
 	unpcklpd	xmm0,	xmm2
-	
+
 	mov			edx,	[esp+4]			;pred
 	movdqa		[edx],	xmm0
-	
+
 	ret
-	
+
 ;***********************************************************************
 ; void WelsI16x16LumaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
 ;***********************************************************************
@@ -241,9 +241,9 @@
 		mov		ecx,	[esp + pushsize + 12]
 		sub		esi,	1
 		sub		esi,	ecx
-		
+
 		;for H
-		pxor	xmm7,	xmm7	
+		pxor	xmm7,	xmm7
 		movq	xmm0,	[esi]
 		movdqa	xmm5,	[sse2_plane_dec]
 		punpcklbw xmm0,	xmm7
@@ -253,7 +253,7 @@
 		punpcklbw xmm1,	xmm7
 		pmullw	xmm1,	xmm6
 		psubw	xmm1,	xmm0
-		
+
 		SUMW_HORIZON	xmm1,xmm0,xmm2
 		movd    eax,	xmm1		; H += (i + 1) * (top[8 + i] - top[6 - i]);
 		movsx	eax,	ax
@@ -261,26 +261,26 @@
 		add		eax,	32
 		sar		eax,	6			; b = (5 * H + 32) >> 6;
 		SSE2_Copy8Times	xmm1, eax	; xmm1 = b,b,b,b,b,b,b,b
-		
-		movzx	edx,	BYTE [esi+16]	
+
+		movzx	edx,	BYTE [esi+16]
 		sub	esi, 3
 		LOAD_COLUMN		xmm0, xmm2, xmm3, xmm4, esi, ecx
-			
+
 		add		esi,	3
 		movzx	eax,	BYTE [esi+8*ecx]
 		add		edx,	eax
 		shl		edx,	4			;	a = (left[15*stride] + top[15]) << 4;
-		
+
 		sub	esi, 3
 		add		esi,	ecx
 		LOAD_COLUMN		xmm7, xmm2, xmm3, xmm4, esi, ecx
-		pxor	xmm4,	xmm4	
+		pxor	xmm4,	xmm4
 		punpckhbw xmm0,	xmm4
 		pmullw	xmm0,	xmm5
 		punpckhbw xmm7,	xmm4
 		pmullw	xmm7,	xmm6
 		psubw	xmm7,	xmm0
-		
+
 		SUMW_HORIZON   xmm7,xmm0,xmm2
 		movd    eax,   xmm7			; V
 		movsx	eax,	ax
@@ -288,17 +288,17 @@
 		imul	eax,	5
 		add		eax,	32
 		sar		eax,	6				; c = (5 * V + 32) >> 6;
-		SSE2_Copy8Times	xmm4, eax		; xmm4 = c,c,c,c,c,c,c,c		
-		
+		SSE2_Copy8Times	xmm4, eax		; xmm4 = c,c,c,c,c,c,c,c
+
 		mov		esi,	[esp + pushsize + 4]
 		add		edx,	16
 		imul	eax,	-7
-		add		edx,	eax				; s = a + 16 + (-7)*c		
-		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s		
-		
+		add		edx,	eax				; s = a + 16 + (-7)*c
+		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s
+
 		xor		eax,	eax
 		movdqa	xmm5,	[sse2_plane_inc_minus]
-		
+
 get_i16x16_luma_pred_plane_sse2_1:
 		movdqa	xmm2,	xmm1
 		pmullw	xmm2,	xmm5
@@ -307,7 +307,7 @@
 		movdqa	xmm3,	xmm1
 		pmullw	xmm3,	xmm6
 		paddw	xmm3,	xmm0
-		psraw	xmm3,	5	
+		psraw	xmm3,	5
 		packuswb xmm2,	xmm3
 		movdqa	[esi],	xmm2
 		paddw	xmm0,	xmm4
@@ -314,13 +314,13 @@
 		add		esi,	16
 		inc		eax
 		cmp		eax,	16
-		jnz get_i16x16_luma_pred_plane_sse2_1					
-		
+		jnz get_i16x16_luma_pred_plane_sse2_1
+
 		pop		esi
 		ret
-		
-		
-		
+
+
+
 ;***********************************************************************
 ; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
 ;***********************************************************************
@@ -327,7 +327,7 @@
 
 %macro SSE2_PRED_H_16X16_TWO_LINE 1
     lea     eax,	[eax+ecx*2]
-    
+
     COPY_16_TIMES	eax,	xmm0
     movdqa			[edx+%1],	xmm0
    COPY_16_TIMESS eax,	xmm0,	ecx
@@ -340,13 +340,13 @@
     mov     edx, [esp+4]    ; pred
     mov     eax, [esp+8]	; pRef
     mov     ecx, [esp+12]   ; stride
-    
+
     COPY_16_TIMES eax,	xmm0
     movdqa  [edx],		xmm0
     COPY_16_TIMESS eax,	xmm0,	ecx
     movdqa  [edx+0x10],	xmm0
-    
-	SSE2_PRED_H_16X16_TWO_LINE   0x20 
+
+	SSE2_PRED_H_16X16_TWO_LINE   0x20
 	SSE2_PRED_H_16X16_TWO_LINE   0x40
 	SSE2_PRED_H_16X16_TWO_LINE   0x60
 	SSE2_PRED_H_16X16_TWO_LINE   0x80
@@ -353,9 +353,9 @@
 	SSE2_PRED_H_16X16_TWO_LINE   0xa0
 	SSE2_PRED_H_16X16_TWO_LINE   0xc0
 	SSE2_PRED_H_16X16_TWO_LINE   0xe0
-   
+
     ret
-    
+
 ;***********************************************************************
 ; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
 ;***********************************************************************
@@ -364,10 +364,10 @@
     mov     edx, [esp+4]    ; pred
     mov     eax, [esp+8]	; pRef
     mov     ecx, [esp+12]   ; stride
-    
+
     sub     eax, ecx
     movdqa  xmm0, [eax]
-    
+
     movdqa  [edx], xmm0
     movdqa  [edx+10h], xmm0
     movdqa  [edx+20h], xmm0
@@ -378,15 +378,15 @@
     movdqa  [edx+70h], xmm0
     movdqa  [edx+80h], xmm0
     movdqa  [edx+90h], xmm0
-    movdqa  [edx+160], xmm0 
+    movdqa  [edx+160], xmm0
 	movdqa  [edx+176], xmm0
     movdqa  [edx+192], xmm0
     movdqa  [edx+208], xmm0
     movdqa  [edx+224], xmm0
     movdqa  [edx+240], xmm0
-    
+
     ret
-    
+
 ;***********************************************************************
 ; void WelsIChromaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
 ;***********************************************************************
@@ -398,8 +398,8 @@
 		mov		ecx,	[esp + pushsize + 12]	;stride
 		sub		esi,	1
 		sub		esi,	ecx
-		
-		pxor	mm7,	mm7	
+
+		pxor	mm7,	mm7
 		movq	mm0,	[esi]
 		movq	mm5,	[sse2_plane_dec_c]
 		punpcklbw mm0,	mm7
@@ -409,7 +409,7 @@
 		punpcklbw mm1,	mm7
 		pmullw	mm1,	mm6
 		psubw	mm1,	mm0
-		
+
 		movq2dq xmm1,   mm1
 		pxor    xmm2,   xmm2
 		SUMW_HORIZON	xmm1,xmm0,xmm2
@@ -419,7 +419,7 @@
 		add		eax,	16
 		sar		eax,	5			; b = (17 * H + 16) >> 5;
 		SSE2_Copy8Times	xmm1, eax	; mm1 = b,b,b,b,b,b,b,b
-		
+
 		movzx	edx,	BYTE [esi+8]
 		sub	esi, 3
 		LOAD_COLUMN_C	mm0, mm2, mm3, mm4, esi, ecx
@@ -428,17 +428,17 @@
 		movzx	eax,	BYTE [esi+4*ecx]
 		add		edx,	eax
 		shl		edx,	4			; a = (left[7*stride] + top[7]) << 4;
-		
+
 		sub	esi, 3
 		add		esi,	ecx
 		LOAD_COLUMN_C	mm7, mm2, mm3, mm4, esi, ecx
-		pxor	mm4,	mm4	
+		pxor	mm4,	mm4
 		punpckhbw mm0,	mm4
 		pmullw	mm0,	mm5
 		punpckhbw mm7,	mm4
 		pmullw	mm7,	mm6
 		psubw	mm7,	mm0
-		
+
 		movq2dq xmm7,   mm7
 		pxor    xmm2,   xmm2
 		SUMW_HORIZON	xmm7,xmm0,xmm2
@@ -448,17 +448,17 @@
 		imul	eax,	17
 		add		eax,	16
 		sar		eax,	5				; c = (17 * V + 16) >> 5;
-		SSE2_Copy8Times	xmm4, eax		; mm4 = c,c,c,c,c,c,c,c		
-		
+		SSE2_Copy8Times	xmm4, eax		; mm4 = c,c,c,c,c,c,c,c
+
 		mov		esi,	[esp + pushsize + 4]
 		add		edx,	16
 		imul	eax,	-3
-		add		edx,	eax				; s = a + 16 + (-3)*c		
-		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s		
-		
+		add		edx,	eax				; s = a + 16 + (-3)*c
+		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s
+
 		xor		eax,	eax
 		movdqa	xmm5,	[sse2_plane_mul_b_c]
-		
+
 get_i_chroma_pred_plane_sse2_1:
 		movdqa	xmm2,	xmm1
 		pmullw	xmm2,	xmm5
@@ -470,12 +470,12 @@
 		add		esi,	8
 		inc		eax
 		cmp		eax,	8
-		jnz get_i_chroma_pred_plane_sse2_1					
-		
+		jnz get_i_chroma_pred_plane_sse2_1
+
 		pop		esi
 		WELSEMMS
-		ret	
-		
+		ret
+
 ALIGN 16
 ;***********************************************************************
 ;	0 |1 |2 |3 |4 |
@@ -487,13 +487,13 @@
 ;	pred[7] = ([6]+[0]*2+[1]+2)/4
 ;
 ;   void __cdecl WelsI4x4LumaPredDDR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;   
+;
 ;***********************************************************************
-WelsI4x4LumaPredDDR_mmx:	
+WelsI4x4LumaPredDDR_mmx:
 	mov			edx,[esp+4]			;pred
 	mov         eax,[esp+8]			;pRef
 	mov			ecx,[esp+12]		;stride
-	
+
 	movq        mm1,[eax+ecx-8]		;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
 	movq        mm2,[eax-8]			;get value of 6 mm2[8] = 6
 	sub			eax, ecx			;mov eax to above line of current block(postion of 1)
@@ -520,17 +520,17 @@
 	pand        mm1,[mmx_01bytes]	;set the odd bit
 	psubusb     mm3,mm1				;decrease 1 from odd bytes
 	pavgb       mm2,mm3				;mm2=(([11]+[21]+1)/2+1+[16])/2
-	
-	movd        [edx+12],mm2 
-	psrlq       mm2,8 
-	movd        [edx+8],mm2 
-	psrlq       mm2,8 
-	movd        [edx+4],mm2 
-	psrlq       mm2,8 
+
+	movd        [edx+12],mm2
+	psrlq       mm2,8
+	movd        [edx+8],mm2
+	psrlq       mm2,8
+	movd        [edx+4],mm2
+	psrlq       mm2,8
 	movd        [edx],mm2
 	WELSEMMS
 	ret
-	
+
 ALIGN 16
 ;***********************************************************************
 ;	0 |1 |2 |3 |4 |
@@ -542,44 +542,44 @@
 ;	pred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
 ;
 ;   void __cdecl WelsI4x4LumaPredDc_sse2(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;   
+;
 ;***********************************************************************
-WelsI4x4LumaPredDc_sse2:	
+WelsI4x4LumaPredDc_sse2:
 	mov         eax,[esp+8]			;pRef
 	mov			ecx,[esp+12]		;stride
 	push		ebx
-		
+
 	movzx		edx,	byte [eax-1h]
-	
+
 	sub			eax,	ecx
 	movd		xmm0,	[eax]
 	pxor		xmm1,	xmm1
 	psadbw		xmm0,	xmm1
-	
+
 	movd		ebx,	xmm0
 	add			ebx,	edx
-	
+
 	movzx		edx,	byte [eax+ecx*2-1h]
 	add			ebx,	edx
-	
+
 	lea			eax,	[eax+ecx*2-1]
 	movzx		edx,	byte [eax+ecx]
 	add			ebx,	edx
-	
+
 	movzx		edx,	byte [eax+ecx*2]
 	add			ebx,	edx
 	add			ebx,	4
 	sar			ebx,	3
 	imul		ebx,	0x01010101
-	
+
 	mov			edx,	[esp+8]			;pred
 	movd		xmm0,	ebx
 	pshufd		xmm0,	xmm0,	0
 	movdqa		[edx],	xmm0
-				
+
 	pop ebx
-	ret	
-	
+	ret
+
 ALIGN 16
 ;***********************************************************************
 ;	void __cdecl WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
@@ -588,7 +588,7 @@
 %macro MMX_PRED_H_8X8_ONE_LINE 4
 	movq		%1,		[%3-8]
 	psrlq		%1,		38h
-	
+
 	;pmuludq		%1,		[mmx_01bytes]		;extend to 4 bytes
 	pmullw		%1,		[mmx_01bytes]
 	pshufw		%1,		%1,	0
@@ -598,7 +598,7 @@
 %macro MMX_PRED_H_8X8_ONE_LINEE 4
 	movq		%1,		[%3+ecx-8]
 	psrlq		%1,		38h
-	
+
 	;pmuludq		%1,		[mmx_01bytes]		;extend to 4 bytes
 	pmullw		%1,		[mmx_01bytes]
 	pshufw		%1,		%1,	0
@@ -610,34 +610,34 @@
 	mov			edx,	[esp+4]			;pred
 	mov         eax,	[esp+8]			;pRef
 	mov			ecx,	[esp+12]		;stride
-	
+
 	movq		mm0,	[eax-8]
 	psrlq		mm0,	38h
-	
+
 	;pmuludq		mm0,	[mmx_01bytes]		;extend to 4 bytes
 	pmullw		mm0,		[mmx_01bytes]
 	pshufw		mm0,	mm0,	0
 	movq		[edx],	mm0
-	
+
 	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, eax,edx+8
-	
+
 	lea			eax,[eax+ecx*2]
 	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax,edx+16
-	
+
 	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, eax,edx+24
-	
+
 	lea			eax,[eax+ecx*2]
 	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax,edx+32
-	
+
 	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, eax,edx+40
-	
+
 	lea			eax,[eax+ecx*2]
 	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax,edx+48
 
-	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, eax,edx+56		
+	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, eax,edx+56
 	WELSEMMS
-	ret	
-	
+	ret
+
 ALIGN 16
 ;***********************************************************************
 ;	void __cdecl WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
@@ -648,12 +648,12 @@
 	mov			edx,	[esp+4]			;pred
 	mov         eax,	[esp+8]			;pRef
 	mov			ecx,	[esp+12]		;stride
-	
+
 	sub			eax,	ecx
 	movd		xmm0,	[eax]
 	pshufd		xmm0,	xmm0,	0
 	movdqa		[edx],	xmm0
-	ret	
+	ret
 
 ALIGN 16
 ;***********************************************************************
@@ -665,7 +665,7 @@
 	mov			edx,		[esp+4]			;pred
 	mov         eax,		[esp+8]			;pRef
 	mov			ecx,		[esp+12]		;stride
-	
+
 	sub			eax,		ecx
 	movq		xmm0,		[eax]
 	movdqa		xmm1,		xmm0
@@ -676,8 +676,8 @@
 	movdqa		[edx+32],	xmm0
 	movdqa		[edx+48],	xmm0
 	ret
-	
-	
+
+
 	ALIGN 16
 ;***********************************************************************
 ;	lt|t0|t1|t2|t3|
@@ -703,13 +703,13 @@
 
 ;   f = (2 + l1 + (l0<<1) + lt)>>2
 ;   h = (2 + l2 + (l1<<1) + l0)>>2
-;   j = (2 + l3 + (l2<<1) + l1)>>2   
+;   j = (2 + l3 + (l2<<1) + l1)>>2
 ;   [b a f e h g j i] + [d c b a] --> mov to memory
-;   
+;
 ;   void WelsI4x4LumaPredHD_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredHD_mmx
-WelsI4x4LumaPredHD_mmx:	
+WelsI4x4LumaPredHD_mmx:
 	mov			edx, [esp+4]			; pred
 	mov         eax, [esp+8]			; pRef
 	mov			ecx, [esp+12]           ; stride
@@ -716,16 +716,16 @@
 	sub         eax, ecx
 	movd        mm0, [eax-1]            ; mm0 = [xx xx xx xx t2 t1 t0 lt]
 	psllq       mm0, 20h                ; mm0 = [t2 t1 t0 lt xx xx xx xx]
-	
-	movd        mm1, [eax+2*ecx-4]        
-	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1	
+
+	movd        mm1, [eax+2*ecx-4]
+	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1
 	lea         eax, [eax+2*ecx]
-	movd        mm2, [eax+2*ecx-4]        
+	movd        mm2, [eax+2*ecx-4]
 	punpcklbw   mm2, [eax+ecx-4]        ; mm2[7] = l2, mm2[6] = l3
 	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
 	psrlq       mm2, 20h
 	pxor        mm0, mm2                ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
-	
+
 	movq        mm1, mm0
 	psrlq       mm1, 10h                ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
 	movq        mm2, mm0
@@ -733,17 +733,17 @@
 	movq        mm3, mm2
 	movq        mm4, mm1
 	pavgb       mm1, mm0
-	
+
 	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
 	pand        mm4, [mmx_01bytes]	    ; set the odd bit
 	psubusb     mm1, mm4				; decrease 1 from odd bytes
-	
+
 	pavgb       mm2, mm1                ; mm2 = [xx xx d  c  b  f  h  j]
-	
+
 	movq        mm4, mm0
 	pavgb       mm3, mm4                ; mm3 = [xx xx xx xx a  e  g  i]
 	punpcklbw   mm3, mm2                ; mm3 = [b  a  f  e  h  g  j  i]
-	
+
 	psrlq       mm2, 20h
 	psllq       mm2, 30h                ; mm2 = [d  c  0  0  0  0  0  0]
 	movq        mm4, mm3
@@ -750,7 +750,7 @@
 	psrlq       mm4, 10h                ; mm4 = [0  0  b  a  f  e  h  j]
 	pxor        mm2, mm4                ; mm2 = [d  c  b  a  xx xx xx xx]
 	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx  d  c  b  a]
-	
+
 	movd        [edx], mm2
 	movd        [edx+12], mm3
 	psrlq       mm3, 10h
@@ -759,9 +759,9 @@
 	movd        [edx+4], mm3
 	WELSEMMS
 	ret
-	
-	
-	
+
+
+
 ALIGN 16
 ;***********************************************************************
 ;	lt|t0|t1|t2|t3|
@@ -784,17 +784,17 @@
 ;   b = (2 + l0 + (l1<<1) + l2)>>2
 ;   d = (2 + l1 + (l2<<1) + l3)>>2
 ;   f = (2 + l2 + (l3<<1) + l3)>>2
- 
+
 ;   [g g f e d c b a] + [g g g g] --> mov to memory
-;   
+;
 ;   void WelsI4x4LumaPredHU_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredHU_mmx
-WelsI4x4LumaPredHU_mmx:	
+WelsI4x4LumaPredHU_mmx:
 	mov			edx, [esp+4]			; pred
 	mov         eax, [esp+8]			; pRef
 	mov			ecx, [esp+12]           ; stride
-	
+
 	movd        mm0, [eax-4]            ; mm0[3] = l0
 	punpcklbw   mm0, [eax+ecx-4]        ; mm0[7] = l1, mm0[6] = l0
 	lea         eax, [eax+2*ecx]
@@ -802,38 +802,38 @@
 	movd        mm4, [eax+ecx-4]        ; mm4[3] = l3
 	punpcklbw   mm2, mm4
 	punpckhwd   mm0, mm2                ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
-	
+
 	psrlq       mm4, 18h
 	psllq       mm4, 38h                ; mm4 = [l3 xx xx xx xx xx xx xx]
 	psrlq       mm0, 8h
 	pxor        mm0, mm4                ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
-	
+
 	movq        mm1, mm0
 	psllq       mm1, 8h                 ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
 	movq        mm3, mm1                ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
 	pavgb       mm1, mm0                ; mm1 = [g  e  c  a  xx xx xx xx]
-	
+
 	movq        mm2, mm0
 	psllq       mm2, 10h                ; mm2 = [l2 l1 l0 xx xx xx xx xx]
 	movq        mm5, mm2
 	pavgb       mm2, mm0
-	
+
 	pxor        mm5, mm0				; find odd value in the lowest bit of each byte
 	pand        mm5, [mmx_01bytes]	    ; set the odd bit
 	psubusb     mm2, mm5				; decrease 1 from odd bytes
-	
+
 	pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]
-	
+
 	psrlq       mm2, 8h
 	pxor        mm2, mm4                ; mm2 = [g  f  d  b  xx xx xx xx]
-	
+
 	punpckhbw   mm1, mm2                ; mm1 = [g  g  f  e  d  c  b  a]
 	punpckhbw   mm4, mm4                ; mm4 = [g  g  xx xx xx xx xx xx]
 	punpckhbw   mm4, mm4                ; mm4 = [g  g  g  g  xx xx xx xx]
-	
+
 	psrlq       mm4, 20h
 	movd        [edx+12], mm4
-	
+
 	movd        [edx], mm1
 	psrlq       mm1, 10h
 	movd        [edx+4], mm1
@@ -841,9 +841,9 @@
 	movd        [edx+8], mm1
 	WELSEMMS
 	ret
-	
-	
-	
+
+
+
 ALIGN 16
 ;***********************************************************************
 ;	lt|t0|t1|t2|t3|
@@ -869,12 +869,12 @@
 
 ;   h = (2 + t1 + (t2<<1) + t3)>>2
 ;   i = (2 + lt + (l0<<1) + l1)>>2
-;   j = (2 + l0 + (l1<<1) + l2)>>2   
-;   
+;   j = (2 + l0 + (l1<<1) + l2)>>2
+;
 ;   void WelsI4x4LumaPredVR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredVR_mmx
-WelsI4x4LumaPredVR_mmx:	
+WelsI4x4LumaPredVR_mmx:
 	mov			edx, [esp+4]			; pred
 	mov         eax, [esp+8]			; pRef
 	mov			ecx, [esp+12]           ; stride
@@ -881,57 +881,57 @@
 	sub         eax, ecx
 	movq        mm0, [eax-1]            ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
 	psllq       mm0, 18h                ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
-	
-	movd        mm1, [eax+2*ecx-4]        
-	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1	
+
+	movd        mm1, [eax+2*ecx-4]
+	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1
 	lea         eax, [eax+2*ecx]
 	movq        mm2, [eax+ecx-8]        ; mm2[7] = l2
 	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 xx xx xx xx xx]
 	psrlq       mm2, 28h
 	pxor        mm0, mm2                ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
-	
+
 	movq        mm1, mm0
 	psllq       mm1, 8h                 ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
 	pavgb       mm1, mm0                ; mm1 = [d  c  b  a  xx xx xx xx]
-	
+
 	movq        mm2, mm0
 	psllq       mm2, 10h                ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
 	movq        mm3, mm2
 	pavgb       mm2, mm0
-	
+
 	pxor        mm3, mm0				; find odd value in the lowest bit of each byte
 	pand        mm3, [mmx_01bytes]	    ; set the odd bit
 	psubusb     mm2, mm3				; decrease 1 from odd bytes
-	
+
 	movq        mm3, mm0
 	psllq       mm3, 8h                 ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
 	pavgb       mm3, mm2                ; mm3 = [h  g  f  e  i  j  xx xx]
 	movq        mm2, mm3
-	
+
 	psrlq       mm1, 20h                ; mm1 = [xx xx xx xx d  c  b  a]
 	movd        [edx], mm1
-	
+
 	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx h  g  f  e]
 	movd        [edx+4], mm2
-	
+
 	movq        mm4, mm3
 	psllq       mm4, 20h
 	psrlq       mm4, 38h                ; mm4 = [xx xx xx xx xx xx xx i]
-	
+
 	movq        mm5, mm3
 	psllq       mm5, 28h
 	psrlq       mm5, 38h                ; mm5 = [xx xx xx xx xx xx xx j]
-	
+
 	psllq       mm1, 8h
 	pxor        mm4, mm1                ; mm4 = [xx xx xx xx c  b  a  i]
 	movd        [edx+8], mm4
-	
+
 	psllq       mm2, 8h
 	pxor        mm5, mm2                ; mm5 = [xx xx xx xx g  f  e  j]
 	movd        [edx+12], mm5
 	WELSEMMS
 	ret
-	
+
 ALIGN 16
 ;***********************************************************************
 ;	lt|t0|t1|t2|t3|t4|t5|t6|t7
@@ -954,13 +954,13 @@
 ;   e = (2 + t4 + t6 + (t5<<1))>>2
 ;   f = (2 + t5 + t7 + (t6<<1))>>2
 ;   g = (2 + t6 + t7 + (t7<<1))>>2
- 
+
 ;   [g f e d c b a] --> mov to memory
-;   
+;
 ;   void WelsI4x4LumaPredDDL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredDDL_mmx
-WelsI4x4LumaPredDDL_mmx:	
+WelsI4x4LumaPredDDL_mmx:
 	mov			edx, [esp+4]			; pred
 	mov         eax, [esp+8]			; pRef
 	mov			ecx, [esp+12]           ; stride
@@ -968,11 +968,11 @@
 	movq        mm0, [eax]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
 	movq        mm1, mm0
 	movq        mm2, mm0
-	
+
 	movq        mm3, mm0
 	psrlq       mm3, 38h
 	psllq       mm3, 38h                ; mm3 = [t7 xx xx xx xx xx xx xx]
-	
+
 	psllq       mm1, 8h                 ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
 	psrlq       mm2, 8h
 	pxor        mm2, mm3                ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
@@ -982,9 +982,9 @@
 	pxor        mm3, mm2				; find odd value in the lowest bit of each byte
 	pand        mm3, [mmx_01bytes]	    ; set the odd bit
 	psubusb     mm1, mm3				; decrease 1 from odd bytes
-	
+
 	pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
-	
+
 	psrlq       mm0, 8h
 	movd        [edx], mm0
 	psrlq       mm0, 8h
@@ -995,8 +995,8 @@
 	movd        [edx+12], mm0
 	WELSEMMS
 	ret
-	
-	
+
+
 ALIGN 16
 ;***********************************************************************
 ;	lt|t0|t1|t2|t3|t4|t5|t6|t7
@@ -1022,46 +1022,46 @@
 ;   g = (2 + t2 + (t3<<1) + t4)>>2
 ;   h = (2 + t3 + (t4<<1) + t5)>>2
 ;   j = (2 + t4 + (t5<<1) + t6)>>2
- 
+
 ;   [i d c b a] + [j h g f e] --> mov to memory
-;   
+;
 ;   void WelsI4x4LumaPredVL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredVL_mmx
-WelsI4x4LumaPredVL_mmx:	
+WelsI4x4LumaPredVL_mmx:
 	mov			edx, [esp+4]			; pred
 	mov         eax, [esp+8]			; pRef
 	mov			ecx, [esp+12]           ; stride
-	
+
 	sub         eax, ecx
 	movq        mm0, [eax]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
 	movq        mm1, mm0
 	movq        mm2, mm0
-	
+
 	psrlq       mm1, 8h                 ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
 	psrlq       mm2, 10h                ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
 
 	movq        mm3, mm1
 	pavgb       mm3, mm0                ; mm3 = [xx xx xx i  d  c  b  a]
-	
+
 	movq        mm4, mm2
-	pavgb       mm2, mm0	
+	pavgb       mm2, mm0
 	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
 	pand        mm4, [mmx_01bytes]	    ; set the odd bit
 	psubusb     mm2, mm4				; decrease 1 from odd bytes
-	
+
 	pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]
-	
+
 	movd        [edx], mm3
 	psrlq       mm3, 8h
 	movd        [edx+8], mm3
-	
+
 	movd        [edx+4], mm2
 	psrlq       mm2, 8h
 	movd        [edx+12], mm2
 	WELSEMMS
 	ret
-	
+
 ALIGN 16
 ;***********************************************************************
 ;
@@ -1068,14 +1068,14 @@
 ;   void WelsIChromaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsIChromaPredDc_sse2
-WelsIChromaPredDc_sse2:	
+WelsIChromaPredDc_sse2:
 	push        ebx
 	mov         eax, [esp+12]			; pRef
 	mov			ecx, [esp+16]           ; stride
-	
+
 	sub         eax, ecx
 	movq        mm0, [eax]
-	
+
 	;xor         ebx, ebx
 	;movzx		edx, byte [eax+ecx-0x01] ; l1
 	movzx		ebx, byte [eax+ecx-0x01] ; l1
@@ -1089,7 +1089,7 @@
 	movzx		edx, byte [eax-0x01]     ; l4
 	add			ebx, edx
 	movd        mm1, ebx                 ; mm1 = l1+l2+l3+l4
-	
+
 	;xor         ebx, ebx
 	;movzx		edx, byte [eax+ecx-0x01] ; l5
 	movzx		ebx, byte [eax+ecx-0x01] ; l5
@@ -1103,7 +1103,7 @@
 	movzx		edx, byte [eax-0x01]     ; l8
 	add			ebx, edx
 	movd        mm2, ebx                 ; mm2 = l5+l6+l7+l8
-	
+
 	movq        mm3, mm0
 	psrlq       mm0, 0x20
 	psllq       mm3, 0x20
@@ -1110,56 +1110,56 @@
 	psrlq       mm3, 0x20
 	pxor		mm4, mm4
 	psadbw		mm0, mm4
-	psadbw		mm3, mm4                 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2	
-	
+	psadbw		mm3, mm4                 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
+
 	paddq       mm3, mm1
 	movq        mm1, mm2
 	paddq       mm1, mm0;                ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
-	
+
 	movq        mm4, [mmx_0x02]
-	
+
 	paddq       mm0, mm4
 	psrlq       mm0, 0x02
-	
+
 	paddq       mm2, mm4
 	psrlq       mm2, 0x02
-	
+
 	paddq       mm3, mm4
 	paddq       mm3, mm4
 	psrlq       mm3, 0x03
-	
+
 	paddq       mm1, mm4
 	paddq       mm1, mm4
 	psrlq       mm1, 0x03
-	
+
 	pmuludq     mm0, [mmx_01bytes]
 	pmuludq     mm3, [mmx_01bytes]
 	psllq       mm0, 0x20
 	pxor        mm0, mm3                 ; mm0 = m_up
-	
+
 	pmuludq     mm2, [mmx_01bytes]
 	pmuludq     mm1, [mmx_01bytes]
 	psllq       mm1, 0x20
 	pxor        mm1, mm2                 ; mm2 = m_down
-	
+
 	mov         edx, [esp+8]			 ; pRef
-	
+
 	movq        [edx], mm0
 	movq        [edx+0x08], mm0
 	movq        [edx+0x10], mm0
 	movq        [edx+0x18], mm0
-	
+
 	movq        [edx+0x20], mm1
 	movq        [edx+0x28], mm1
 	movq        [edx+0x30], mm1
 	movq        [edx+0x38], mm1
-	
+
 	pop         ebx
 	WELSEMMS
 	ret
-	
-	
-	
+
+
+
 ALIGN 16
 ;***********************************************************************
 ;
@@ -1166,11 +1166,11 @@
 ;   void WelsI16x16LumaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI16x16LumaPredDc_sse2
-WelsI16x16LumaPredDc_sse2:	
+WelsI16x16LumaPredDc_sse2:
 	push        ebx
 	mov         eax, [esp+12]			; pRef
 	mov			ecx, [esp+16]           ; stride
-	
+
 	sub         eax, ecx
 	movdqa      xmm0, [eax]             ; read one row
 	pxor		xmm1, xmm1
@@ -1180,7 +1180,7 @@
 	pslldq      xmm0, 0x08
 	psrldq      xmm0, 0x08
 	paddw       xmm0, xmm1
-	
+
 	;xor         ebx, ebx
 	;movzx		edx, byte [eax+ecx-0x01]
 	movzx		ebx, byte [eax+ecx-0x01]
@@ -1201,7 +1201,7 @@
 	psrld       xmm0, 0x05
 	pmuludq     xmm0, [mmx_01bytes]
 	pshufd      xmm0, xmm0, 0
-	
+
 	mov         edx, [esp+8]			; pred
 	movdqa      [edx], xmm0
 	movdqa      [edx+0x10], xmm0
@@ -1219,7 +1219,7 @@
 	movdqa      [edx+0xd0], xmm0
 	movdqa      [edx+0xe0], xmm0
 	movdqa      [edx+0xf0], xmm0
-	
+
 	pop         ebx
 
 	ret
@@ -1226,7 +1226,7 @@
 
 ;***********************************************************************
 ;
-;int32_t WelsSmpleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc, 
+;int32_t WelsSmpleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc,
 ;                             uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t);
 ;
 ;***********************************************************************
@@ -1238,7 +1238,7 @@
 	push      edi
 	mov       eax,  [esp+24];p_enc
 	mov       ebx,  [esp+28];linesize_enc
-	
+
 	; load source 4x4 samples and Hadamard transform
     movd      xmm0, [eax]
     movd      xmm1, [eax+ebx]
@@ -1247,16 +1247,16 @@
     movd      xmm3, [eax+ebx]
     punpckldq xmm0, xmm2
     punpckldq xmm1, xmm3
-       
+
     pxor      xmm6, xmm6
     punpcklbw xmm0, xmm6
     punpcklbw xmm1, xmm6
-    
+
     movdqa    xmm2, xmm0
     paddw     xmm0, xmm1
     psubw     xmm2, xmm1
     SSE2_XSawp  qdq, xmm0, xmm2, xmm3
-    
+
     movdqa    xmm4, xmm0
     paddw     xmm0, xmm3
     psubw     xmm4, xmm3
@@ -1264,7 +1264,7 @@
     movdqa    xmm2, xmm0
     punpcklwd xmm0, xmm4
     punpckhwd xmm4, xmm2
-    
+
 	SSE2_XSawp  dq,  xmm0, xmm4, xmm3
 	SSE2_XSawp  qdq, xmm0, xmm3, xmm5
 
@@ -1271,14 +1271,14 @@
     movdqa    xmm7, xmm0
     paddw     xmm0, xmm5
     psubw     xmm7, xmm5
-    
+
 	SSE2_XSawp  qdq,  xmm0, xmm7, xmm1
-    
+
     ; Hadamard transform results are saved in xmm0 and xmm2
     movdqa    xmm2, xmm0
     paddw     xmm0, xmm1
     psubw     xmm2, xmm1
-  	
+
 	; load top boundary samples: [a b c d]
     mov       eax,  [esp+16];p_dec
 	sub		  eax,	[esp+20];linesize_dec
@@ -1286,7 +1286,7 @@
 	movzx     edx,  byte [eax+1]
 	movzx     esi,  byte [eax+2]
 	movzx     edi,  byte [eax+3]
-	
+
 	; get the transform results of top boundary samples: [a b c d]
 	add       edx, ecx ; edx = a + b
 	add       edi, esi ; edi = c + d
@@ -1300,7 +1300,7 @@
 	add       esi, ecx ; esi = (a - b) + (c - d)
 	add       ecx, ecx
 	sub       ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]
-	
+
 	movdqa    xmm6, xmm0
 	movdqa    xmm7, xmm2
 	movd      xmm5, edi ; store the edi for DC mode
@@ -1312,16 +1312,16 @@
 	pinsrw    xmm4, edx, 0
 	pinsrw    xmm4, ecx, 4
 	psllw     xmm4, 2
-	
+
 	; get the satd of H
 	psubw     xmm0, xmm3
 	psubw     xmm2, xmm4
-	
+
 	WELS_AbsW  xmm0, xmm1
 	WELS_AbsW  xmm2, xmm1
     paddusw        xmm0, xmm2
     SUMW_HORIZON1  xmm0, xmm1 ; satd of V is stored in xmm0
-	
+
 	; load left boundary samples: [a b c d]'
     mov       eax,  [esp+16]
 	mov       ebx,  [esp+20]
@@ -1330,7 +1330,7 @@
 	lea       eax , [eax+2*ebx]
 	movzx     esi,  byte [eax-1]
 	movzx     edi,  byte [eax+ebx-1]
-	
+
 	; get the transform results of left boundary samples: [a b c d]'
 	add       edx, ecx ; edx = a + b
 	add       edi, esi ; edi = c + d
@@ -1344,14 +1344,14 @@
 	add       esi, ecx ; esi = (a - b) + (c - d)
 	add       ecx, ecx
 	sub       ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]'
-	
-	; store the transform results in xmm3	
+
+	; store the transform results in xmm3
     movd      xmm3, edi
 	pinsrw    xmm3, edx, 1
 	pinsrw    xmm3, ecx, 2
 	pinsrw    xmm3, esi, 3
 	psllw     xmm3, 2
-	
+
 	; get the satd of V
 	movdqa    xmm2, xmm6
 	movdqa    xmm4, xmm7
@@ -1368,7 +1368,7 @@
 	psrlw     xmm1, 3
 	movdqa    xmm5, xmm1
 	psllw     xmm1, 4
-	
+
     ; get the satd of DC
     psubw          xmm6, xmm1
     WELS_AbsW  xmm6, xmm1
@@ -1375,7 +1375,7 @@
 	WELS_AbsW  xmm7, xmm1
     paddusw        xmm6, xmm7
     SUMW_HORIZON1  xmm6, xmm1 ; satd of DC is stored in xmm6
-    
+
     ; comparing order: DC H V
     mov       edx, [esp+32]
     movd      eax, xmm6
@@ -1394,9 +1394,9 @@
     jg near   not_dc
     cmp       ax, si
     jg near   not_dc_h
-    
+
     ; for DC mode
-    movd      ebx, xmm5 
+    movd      ebx, xmm5
     imul      ebx, 0x01010101
     movd	  xmm5, ebx
 	pshufd    xmm5, xmm5, 0
@@ -1407,11 +1407,11 @@
     pop       esi
     pop       ebx
     ret
-    
+
 not_dc:
     cmp       di, si
     jg near   not_dc_h
-    
+
     ; for H mode
     SSE_DB_1_2REG  xmm6, xmm7
     mov       eax,  [esp+16]
@@ -1422,20 +1422,20 @@
 
 	movzx     ecx,  byte [eax+ebx-1]
 	movd      xmm1, ecx
-    pmuludq   xmm1, xmm6 
+    pmuludq   xmm1, xmm6
 %if 1
     punpckldq xmm0, xmm1
-%else    
+%else
 	unpcklps  xmm0,	xmm1
 %endif
 	lea       eax,	[eax+ebx*2]
 	movzx	  ecx,	byte [eax-1]
 	movd	  xmm2,	ecx
-    pmuludq   xmm2, xmm6  
+    pmuludq   xmm2, xmm6
 
 	movzx	  ecx,	byte [eax+ebx-1]
-	movd	  xmm3,	ecx	
-    pmuludq   xmm3, xmm6  
+	movd	  xmm3,	ecx
+    pmuludq   xmm3, xmm6
 %if 1
     punpckldq  xmm2, xmm3
     punpcklqdq xmm0, xmm2
@@ -1442,13 +1442,13 @@
 %else
 	unpcklps  xmm2,	xmm3
 	unpcklpd  xmm0,	xmm2
-%endif	
+%endif
 	movdqa	  [edx],xmm0
-	
+
 	mov       eax, edi
     mov       ebx, [esp+36]
 	mov       dword [ebx], 0x01
-    
+
     pop       edi
     pop       esi
     pop       ebx
@@ -1460,14 +1460,14 @@
 	movd	  xmm0,	[eax]
 	pshufd	  xmm0,	xmm0, 0
 	movdqa	  [edx],xmm0
-	
+
 	mov       eax, esi
     mov       ebx, [esp+36]
 	mov       dword [ebx], 0x00
-    
+
     pop       edi
     pop       esi
     pop       ebx
     ret
-    
+
 
--- a/codec/encoder/core/asm/intra_pred_util.asm
+++ b/codec/encoder/core/asm/intra_pred_util.asm
@@ -32,7 +32,7 @@
 ;*  intra_pred_util.asm
 ;*
 ;*  Abstract
-;*      mmxext/sse for WelsFillingPred8to16, WelsFillingPred8x2to16 and 
+;*      mmxext/sse for WelsFillingPred8to16, WelsFillingPred8x2to16 and
 ;*		WelsFillingPred1to16 etc.
 ;*
 ;*  History
@@ -84,7 +84,7 @@
 	movq mm0, [ecx]
 	movq [eax  ], mm0
 	movq [eax+8], mm0
-	
+
 	WELSEMMS
 	ret
 
@@ -100,16 +100,16 @@
 	movq mm1, [ecx+8]
 	movq [eax  ], mm0
 	movq [eax+8], mm1
-	
+
 	WELSEMMS
 
 	ret
 
 %macro butterfly_1to8_mmx	3	; mm? for dst, mm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
-	mov %3h, %3l	
-	movd %2, e%3x		; i.e, 1% = eax (=b0)	
-	pshufw %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0	
-%endmacro 
+	mov %3h, %3l
+	movd %2, e%3x		; i.e, 1% = eax (=b0)
+	pshufw %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0
+%endmacro
 
 ALIGN 16
 ;***********************************************************************----------------
@@ -120,10 +120,10 @@
 
 	mov cl, byte [esp+8]	; v
 	butterfly_1to8_mmx	mm0, mm1, c	; mm? for dst, mm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
-	
+
 	movq [eax  ], mm0
 	movq [eax+8], mm0
-	
+
 	WELSEMMS
 
 	ret
@@ -136,9 +136,9 @@
 	mov eax, [esp+4]	; pred
 	mov ecx, [esp+8]	; v
 
-	movdqa xmm0, [ecx]	
-	movdqa [eax], xmm0	
-	
+	movdqa xmm0, [ecx]
+	movdqa [eax], xmm0
+
 	ret
 
 ALIGN 16
@@ -150,7 +150,7 @@
 
 	mov cl, byte [esp+8]	; v
 	butterfly_1to16_sse	xmm0, xmm1, c		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	
+
 	movdqa [eax], xmm0
-	
+
 	ret
--- a/codec/encoder/core/asm/mb_copy.asm
+++ b/codec/encoder/core/asm/mb_copy.asm
@@ -32,7 +32,7 @@
 ;*  mb_copy.asm
 ;*
 ;*  Abstract
-;*      mb_copy 
+;*      mb_copy
 ;*
 ;*
 ;*********************************************************************************************/
@@ -52,9 +52,9 @@
 WELS_EXTERN WelsCopy16x16_sse2
 WELS_EXTERN WelsCopy16x16NotAligned_sse2
 WELS_EXTERN WelsCopy8x8_mmx
-WELS_EXTERN WelsCopy16x8NotAligned_sse2	; 
-WELS_EXTERN WelsCopy8x16_mmx		; 
-WELS_EXTERN UpdateMbMv_sse2		; 
+WELS_EXTERN WelsCopy16x8NotAligned_sse2	;
+WELS_EXTERN WelsCopy8x16_mmx		;
+WELS_EXTERN UpdateMbMv_sse2		;
 
 ;***********************************************************************
 ; void WelsCopy16x16_sse2(	uint8_t* Dst,
@@ -66,7 +66,7 @@
 WelsCopy16x16_sse2:
 	push esi
 	push edi
-	push ebx	
+	push ebx
 
 	mov edi, [esp+16]	; Dst
 	mov eax, [esp+20]	; iStrideD
@@ -107,7 +107,7 @@
 	movdqa xmm5, [esi+ecx]
 	movdqa xmm6, [esi+2*ecx]
 	movdqa xmm7, [esi+edx]
-	
+
 	movdqa [edi], xmm0
 	movdqa [edi+eax], xmm1
 	movdqa [edi+2*eax], xmm2
@@ -116,7 +116,7 @@
 	movdqa [edi], xmm4
 	movdqa [edi+eax], xmm5
 	movdqa [edi+2*eax], xmm6
-	movdqa [edi+ebx], xmm7	
+	movdqa [edi+ebx], xmm7
 
 	pop ebx
 	pop edi
@@ -134,7 +134,7 @@
 WelsCopy16x16NotAligned_sse2:
 	push esi
 	push edi
-	push ebx	
+	push ebx
 
 	mov edi, [esp+16]	; Dst
 	mov eax, [esp+20]	; iStrideD
@@ -175,7 +175,7 @@
 	movdqu xmm5, [esi+ecx]
 	movdqu xmm6, [esi+2*ecx]
 	movdqu xmm7, [esi+edx]
-	
+
 	movdqa [edi], xmm0
 	movdqa [edi+eax], xmm1
 	movdqa [edi+2*eax], xmm2
@@ -184,8 +184,8 @@
 	movdqa [edi], xmm4
 	movdqa [edi+eax], xmm5
 	movdqa [edi+2*eax], xmm6
-	movdqa [edi+ebx], xmm7	
-	
+	movdqa [edi+ebx], xmm7
+
 	pop ebx
 	pop edi
 	pop esi
@@ -202,7 +202,7 @@
 WelsCopy16x8NotAligned_sse2:
 	push esi
 	push edi
-	push ebx	
+	push ebx
 
 	mov edi, [esp+16]	; Dst
 	mov eax, [esp+20]	; iStrideD
@@ -220,7 +220,7 @@
 	movdqu xmm4, [esi]
 	movdqu xmm5, [esi+ecx]
 	movdqu xmm6, [esi+2*ecx]
-	movdqu xmm7, [esi+edx]	
+	movdqu xmm7, [esi+edx]
 
 	movdqa [edi], xmm0
 	movdqa [edi+eax], xmm1
@@ -231,7 +231,7 @@
 	movdqa [edi+eax], xmm5
 	movdqa [edi+2*eax], xmm6
 	movdqa [edi+ebx], xmm7
-	
+
 	pop ebx
 	pop edi
 	pop esi
@@ -245,7 +245,7 @@
 ;                       int32_t  iStrideS )
 ;***********************************************************************
 ALIGN 16
-WelsCopy8x16_mmx:	
+WelsCopy8x16_mmx:
 	push ebx
 
 	mov eax, [esp + 8 ]           ;Dst
@@ -253,60 +253,60 @@
 	mov ebx, [esp + 16]           ;Src
 	mov edx, [esp + 20]           ;iStrideS
 
-	movq mm0, [ebx]	
-	movq mm1, [ebx+edx]	
+	movq mm0, [ebx]
+	movq mm1, [ebx+edx]
 	lea ebx, [ebx+2*edx]
-	movq mm2, [ebx]	
-	movq mm3, [ebx+edx]	
+	movq mm2, [ebx]
+	movq mm3, [ebx+edx]
 	lea ebx, [ebx+2*edx]
-	movq mm4, [ebx]	
-	movq mm5, [ebx+edx]	
+	movq mm4, [ebx]
+	movq mm5, [ebx+edx]
 	lea ebx, [ebx+2*edx]
-	movq mm6, [ebx]	
-	movq mm7, [ebx+edx]	
+	movq mm6, [ebx]
+	movq mm7, [ebx+edx]
 	lea ebx, [ebx+2*edx]
-	
-	movq [eax], mm0	
-	movq [eax+ecx], mm1	
+
+	movq [eax], mm0
+	movq [eax+ecx], mm1
 	lea eax, [eax+2*ecx]
-	movq [eax], mm2	
+	movq [eax], mm2
 	movq [eax+ecx], mm3
 	lea eax, [eax+2*ecx]
-	movq [eax], mm4	
+	movq [eax], mm4
 	movq [eax+ecx], mm5
 	lea eax, [eax+2*ecx]
-	movq [eax], mm6	
+	movq [eax], mm6
 	movq [eax+ecx], mm7
 	lea eax, [eax+2*ecx]
 
-	movq mm0, [ebx]	
-	movq mm1, [ebx+edx]	
+	movq mm0, [ebx]
+	movq mm1, [ebx+edx]
 	lea ebx, [ebx+2*edx]
-	movq mm2, [ebx]	
-	movq mm3, [ebx+edx]	
+	movq mm2, [ebx]
+	movq mm3, [ebx+edx]
 	lea ebx, [ebx+2*edx]
-	movq mm4, [ebx]	
-	movq mm5, [ebx+edx]	
+	movq mm4, [ebx]
+	movq mm5, [ebx+edx]
 	lea ebx, [ebx+2*edx]
-	movq mm6, [ebx]	
-	movq mm7, [ebx+edx]		
-	
-	movq [eax], mm0	
-	movq [eax+ecx], mm1	
+	movq mm6, [ebx]
+	movq mm7, [ebx+edx]
+
+	movq [eax], mm0
+	movq [eax+ecx], mm1
 	lea eax, [eax+2*ecx]
-	movq [eax], mm2	
+	movq [eax], mm2
 	movq [eax+ecx], mm3
 	lea eax, [eax+2*ecx]
-	movq [eax], mm4	
+	movq [eax], mm4
 	movq [eax+ecx], mm5
 	lea eax, [eax+2*ecx]
-	movq [eax], mm6	
-	movq [eax+ecx], mm7	
+	movq [eax], mm6
+	movq [eax+ecx], mm7
 
 	WELSEMMS
-	pop ebx	
+	pop ebx
 	ret
-	
+
 ;***********************************************************************
 ; void WelsCopy8x8_mmx(  uint8_t* Dst,
 ;                        int32_t  iStrideD,
@@ -314,7 +314,7 @@
 ;                        int32_t  iStrideS )
 ;***********************************************************************
 ALIGN 16
-WelsCopy8x8_mmx:	
+WelsCopy8x8_mmx:
 	push ebx
 	push esi
 	mov eax, [esp + 12]           ;Dst
@@ -343,7 +343,7 @@
 	lea esi, [esi+2*ebx]
 	movq mm6, [esi]
 	movq mm7, [esi+ebx]
-	
+
 	movq [eax], mm0
 	movq [eax+ecx], mm1
 	lea eax, [eax+2*ecx]
@@ -355,12 +355,12 @@
 	lea eax, [eax+2*ecx]
 	movq [eax], mm6
 	movq [eax+ecx], mm7
-		
+
 	WELSEMMS
-	pop esi	
+	pop esi
 	pop ebx
 	ret
-	
+
 ; (dunhuang@cisco), 12/21/2011
 ;***********************************************************************
 ; void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY sMv )
@@ -417,8 +417,8 @@
 WELS_EXTERN McCopyWidthEq4_mmx
 WELS_EXTERN McCopyWidthEq8_mmx
 WELS_EXTERN McCopyWidthEq16_sse2
-                          
 
+
 ALIGN 16
 ;***********************************************************************
 ; void PixelAvgWidthEq8_mmx( uint8_t *dst,  int32_t iDstStride,
@@ -432,19 +432,19 @@
     push        esi
     push        edi
 
-    mov         edi, [esp+20]       
-    mov         esi, [esp+28]       
-    mov         edx, [esp+36]       
-    mov         ebp, [esp+24]       
-    mov         eax, [esp+32]       
-    mov         ebx, [esp+40]       
-    mov         ecx, [esp+44]       
+    mov         edi, [esp+20]
+    mov         esi, [esp+28]
+    mov         edx, [esp+36]
+    mov         ebp, [esp+24]
+    mov         eax, [esp+32]
+    mov         ebx, [esp+40]
+    mov         ecx, [esp+44]
 	sar			ecx, 2
 .height_loop:
-	movq        mm0, [esi]	
+	movq        mm0, [esi]
     pavgb       mm0, [edx]
     movq        [edi], mm0
-	movq		mm1, [esi+eax]		
+	movq		mm1, [esi+eax]
 	pavgb		mm1, [edx+ebx]
 	movq		[edi+ebp], mm1
 	lea         edi, [edi+2*ebp]
@@ -451,19 +451,19 @@
 	lea         esi, [esi+2*eax]
 	lea         edx, [edx+2*ebx]
 
-	movq        mm2, [esi]	
+	movq        mm2, [esi]
 	pavgb       mm2, [edx]
     movq        [edi], mm2
-	movq		mm3, [esi+eax]	
+	movq		mm3, [esi+eax]
 	pavgb		mm3, [edx+ebx]
 	movq		[edi+ebp], mm3
 	lea         edi, [edi+2*ebp]
 	lea         esi, [esi+2*eax]
 	lea         edx, [edx+2*ebx]
-	
+
 	dec         ecx
     jne         .height_loop
-	
+
 	WELSEMMS
     pop         edi
     pop         esi
@@ -485,19 +485,19 @@
     push        esi
     push        edi
 
-    mov         edi, [esp+20]       
-    mov         esi, [esp+28]       
-    mov         edx, [esp+36]       
-    mov         ebp, [esp+24]       
-    mov         eax, [esp+32]       
-    mov         ebx, [esp+40]       
-    mov         ecx, [esp+44]       
+    mov         edi, [esp+20]
+    mov         esi, [esp+28]
+    mov         edx, [esp+36]
+    mov         ebp, [esp+24]
+    mov         eax, [esp+32]
+    mov         ebx, [esp+40]
+    mov         ecx, [esp+44]
 	sar			ecx, 2
 .height_loop:
 	movdqu      xmm0, [esi]
 	movdqu      xmm1, [edx]
 	movdqu      xmm2, [esi+eax]
-	movdqu      xmm3, [edx+ebx]	
+	movdqu      xmm3, [edx+ebx]
 	pavgb       xmm0, xmm1
 	pavgb       xmm2, xmm3
 	movdqu      [edi], xmm0
@@ -504,12 +504,12 @@
 	movdqu      [edi+ebp], xmm2
 	lea			edi, [edi+2*ebp]
 	lea			esi, [esi+2*eax]
-	lea			edx, [edx+2*ebx]	
+	lea			edx, [edx+2*ebx]
 
 	movdqu      xmm4, [esi]
 	movdqu      xmm5, [edx]
 	movdqu      xmm6, [esi+eax]
-	movdqu      xmm7, [edx+ebx]	
+	movdqu      xmm7, [edx+ebx]
 	pavgb       xmm4, xmm5
 	pavgb       xmm6, xmm7
 	movdqu      [edi], xmm4
@@ -516,11 +516,11 @@
 	movdqu      [edi+ebp], xmm6
 	lea         edi, [edi+2*ebp]
 	lea         esi, [esi+2*eax]
-    lea         edx, [edx+2*ebx]	
-    
+    lea         edx, [edx+2*ebx]
+
 	dec         ecx
 	jne         .height_loop
-	
+
     pop         edi
     pop         esi
     pop         ebx
@@ -540,7 +540,7 @@
     dec    dword [esp+4]
     jg     avg_w16_align_0_ssse3
     ret
-    
+
     ALIGN 64
 avg_w16_align_1_ssse3:
     movdqa  xmm1, [ebx+16]
@@ -555,7 +555,7 @@
     jg     avg_w16_align_1_ssse3
     ret
 
-  
+
 ALIGN 16
 ;***********************************************************************
 ; void PixelAvgWidthEq16_ssse3(uint8_t *pDst,  int32_t iDstStride,
@@ -574,7 +574,7 @@
     mov         ebx, [esp+28]       ; src1
     mov         ecx, [esp+36]       ; src2
     mov         esi, [esp+24]       ; i_dst_stride
-    
+
      %define avg_w16_offset (avg_w16_align_1_ssse3-avg_w16_align_0_ssse3)
     mov edx, ebx
     and edx, 0x01
@@ -582,11 +582,11 @@
     lea ebp, [avg_w16_offset]
     imul ebp, edx
     lea edx, [ebp+eax]
-    
-    mov eax, [esp+32]  
-    mov ebp, [esp+44] 
+
+    mov eax, [esp+32]
+    mov ebp, [esp+44]
     push ebp
-    mov ebp, [esp+44]	
+    mov ebp, [esp+44]
     and ebx, 0xfffffff0
     call edx
 	pop		   ebp
@@ -607,7 +607,7 @@
     push    edi
     push    ebx
 
-    
+
     mov esi,  [esp+16]
     mov eax, [esp+20]
     mov edi,  [esp+24]
@@ -617,12 +617,12 @@
 .height_loop:
 	mov ebx, [esi]
 	mov [edi], ebx
-	
+
 	add esi, eax
 	add edi, ecx
 	dec edx
 	jnz .height_loop
-	WELSEMMS   
+	WELSEMMS
 	pop	   ebx
     pop     edi
     pop     esi
@@ -650,12 +650,12 @@
 	add edi, ecx
 	dec edx
 	jnz .height_loop
-	
-	WELSEMMS   
+
+	WELSEMMS
     pop     edi
     pop     esi
     ret
-	
+
 ALIGN 16
 ;***********************************************************************
 ;   void McCopyWidthEq16_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight )
@@ -664,11 +664,11 @@
     push    esi
     push    edi
 
-    mov     esi, [esp+12]       
-    mov     eax, [esp+16]       
-    mov     edi, [esp+20]       
-    mov     edx, [esp+24]       
-    mov     ecx, [esp+28]       
+    mov     esi, [esp+12]
+    mov     eax, [esp+16]
+    mov     edi, [esp+20]
+    mov     edx, [esp+24]
+    mov     ecx, [esp+28]
 
 ALIGN 4
 .height_loop:
@@ -681,7 +681,7 @@
     lea     esi, [esi+eax*2]
     lea     edi, [edi+edx*2]
     jnz     .height_loop
-  
+
     pop     edi
     pop     esi
     ret
--- a/codec/encoder/core/asm/mc_chroma.asm
+++ b/codec/encoder/core/asm/mc_chroma.asm
@@ -1,317 +1,317 @@
-;*!
-;* \copy
-;*     Copyright (c)  2004-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  mc_chroma.asm
-;*
-;*  Abstract
-;*      mmx motion compensation for chroma
-;*
-;*  History
-;*      10/13/2004 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-ALIGN 16
-h264_d0x20_sse2:
-	dw 32,32,32,32,32,32,32,32
-ALIGN 16
-h264_d0x20_mmx:
-	dw 32,32,32,32
-
-
-;=============================================================================
-; Code
-;=============================================================================
-
-SECTION .text
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq4_mmx( uint8_t *src, 
-;							int32_t iSrcStride, 
-;							uint8_t *pDst, 
-;							int32_t iDstStride, 
-;							uint8_t *pABCD, 
-;							int32_t iHeigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq4_mmx
-McChromaWidthEq4_mmx:
-	push esi
-	push edi
-	push ebx
-	
-	mov eax, [esp +12 + 20]
-	movd mm3, [eax]
-	WELS_Zero mm7
-	punpcklbw mm3, mm3
-	movq      mm4, mm3
-	punpcklwd mm3, mm3       
-	punpckhwd mm4, mm4		 
-	
-	movq	  mm5, mm3
-	punpcklbw mm3, mm7
-	punpckhbw mm5, mm7
-	
-	movq	  mm6, mm4
-	punpcklbw mm4, mm7
-	punpckhbw mm6, mm7
-	
-	mov esi, [esp +12+ 4]   
-	mov eax, [esp + 12 + 8]   
-	mov edi, [esp + 12 + 12]  
-	mov edx, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-		
-	lea ebx, [esi + eax]
-	movd mm0, [esi]
-	movd mm1, [esi+1]
-	punpcklbw mm0, mm7
-	punpcklbw mm1, mm7
-.xloop:
-	
-	pmullw mm0, mm3
-	pmullw mm1, mm5
-	paddw  mm0, mm1
-	
-	movd  mm1, [ebx]
-	punpcklbw mm1, mm7
-	movq mm2, mm1
-	pmullw mm1, mm4
-	paddw mm0, mm1
-	
-	movd mm1, [ebx+1]
-	punpcklbw mm1, mm7
-	movq mm7, mm1
-	pmullw mm1,mm6
-	paddw mm0, mm1
-	movq mm1,mm7
-
-	paddw mm0, [h264_d0x20_mmx]
-	psrlw mm0, 6
-	
-	WELS_Zero mm7
-	packuswb mm0, mm7
-	movd [edi], mm0	
-
-	movq mm0, mm2
-	
-	lea edi, [edi +edx  ]
-	lea ebx, [ebx + eax]
-
-	dec ecx
-	jnz near .xloop
-	WELSEMMS
-	pop ebx
-	pop edi
-	pop esi
-	ret
-
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq8_sse2( uint8_t *pSrc, 
-;						int32_t iSrcStride, 
-;						uint8_t *pDst, 
-;						int32_t iDstStride, 
-;						uint8_t *pABCD, 
-;						int32_t iheigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq8_sse2
-McChromaWidthEq8_sse2:
-	push esi
-	push edi
-	push ebx
-	
-	mov eax, [esp +12 + 20]
-	movd xmm3, [eax]
-	WELS_Zero xmm7
-	punpcklbw  xmm3, xmm3
-	punpcklwd  xmm3, xmm3
-	
-	movdqa	   xmm4, xmm3
-	punpckldq  xmm3, xmm3
-	punpckhdq  xmm4, xmm4
-	movdqa     xmm5, xmm3
-	movdqa	   xmm6, xmm4
-	
-	punpcklbw  xmm3, xmm7
-	punpckhbw  xmm5, xmm7
-	punpcklbw  xmm4, xmm7
-	punpckhbw  xmm6, xmm7
-	
-	mov esi, [esp +12+ 4]   
-	mov eax, [esp + 12 + 8]   
-	mov edi, [esp + 12 + 12]  
-	mov edx, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-		
-	lea ebx, [esi + eax]
-	movq xmm0, [esi]
-	movq xmm1, [esi+1]
-	punpcklbw xmm0, xmm7
-	punpcklbw xmm1, xmm7
-.xloop:
-	
-	pmullw xmm0, xmm3
-	pmullw xmm1, xmm5
-	paddw  xmm0, xmm1
-	
-	movq  xmm1, [ebx]
-	punpcklbw xmm1, xmm7
-	movdqa xmm2, xmm1
-	pmullw xmm1, xmm4
-	paddw xmm0, xmm1
-	
-	movq xmm1, [ebx+1]
-	punpcklbw xmm1, xmm7
-	movdqa xmm7, xmm1
-	pmullw xmm1, xmm6
-	paddw xmm0, xmm1
-	movdqa xmm1,xmm7
-
-	paddw xmm0, [h264_d0x20_sse2]
-	psrlw xmm0, 6
-	
-	WELS_Zero xmm7
-	packuswb xmm0, xmm7
-	movq [edi], xmm0	
-
-	movdqa xmm0, xmm2
-	
-	lea edi, [edi +edx  ]
-	lea ebx, [ebx + eax]
-
-	dec ecx
-	jnz near .xloop
-	
-	pop ebx
-	pop edi
-	pop esi
-	ret
-
-
-
-
-ALIGN 16
-;***********************************************************************
-; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
-;						 int32_t iSrcStride, 
-;                        uint8_t *pDst,  
-;                        int32_t iDstStride,
-;                        uint8_t *pABCD,
-;					     int32_t iHeigh);
-;***********************************************************************
-WELS_EXTERN McChromaWidthEq8_ssse3
-McChromaWidthEq8_ssse3:
-	push ebx
-	push esi
-	push edi
-		
-	mov eax, [esp + 12 + 20]
-
-    pxor      xmm7, xmm7
-    movd   xmm5, [eax]   
-    punpcklwd xmm5, xmm5  
-    punpckldq xmm5, xmm5 
-    movdqa    xmm6, xmm5
-    punpcklqdq xmm5, xmm5
-    punpckhqdq xmm6, xmm6    
-    
-	mov eax, [esp + 12 + 4]   
-	mov edx, [esp + 12 + 8]   
-	mov esi, [esp + 12 + 12]  
-	mov edi, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-    
-    sub esi, edi
-    sub esi, edi
-	movdqa xmm7, [h264_d0x20_sse2]
-
-	movdqu xmm0, [eax]
-	movdqa xmm1, xmm0
-	psrldq xmm1, 1
-	punpcklbw xmm0, xmm1
-	
-.hloop_chroma:	
-	lea	esi, [esi+2*edi]
-	
-	movdqu xmm2, [eax+edx]
-	movdqa xmm3, xmm2
-	psrldq xmm3, 1
-	punpcklbw xmm2, xmm3
-	movdqa      xmm4, xmm2
-	
-    pmaddubsw  xmm0, xmm5
-    pmaddubsw  xmm2, xmm6
-    paddw      xmm0, xmm2
-    paddw      xmm0, xmm7
-	psrlw      xmm0, 6
-    packuswb   xmm0, xmm0
-    movq       [esi],xmm0	
-    
-    lea eax, [eax+2*edx]
-    movdqu xmm2, [eax]
-    movdqa xmm3, xmm2
-    psrldq xmm3, 1
-    punpcklbw xmm2, xmm3
-    movdqa      xmm0, xmm2
-    
-    pmaddubsw  xmm4, xmm5
-    pmaddubsw  xmm2, xmm6
-    paddw      xmm4, xmm2
-    paddw      xmm4, xmm7
-	psrlw      xmm4, 6
-    packuswb   xmm4, xmm4
-    movq       [esi+edi],xmm4	
-	
-	sub ecx, 2
-	jnz .hloop_chroma
-	pop edi
-	pop esi
-	pop ebx
-
-	ret
-
-
+;*!
+;* \copy
+;*     Copyright (c)  2004-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  mc_chroma.asm
+;*
+;*  Abstract
+;*      MMX, SSE2 and SSSE3 motion compensation for chroma
+;*
+;*  History
+;*      10/13/2004 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+BITS 32
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Rounding constants (the value 32 that is added before the >> 6 normalisation)
+;***********************************************************************
+
+ALIGN 16
+h264_d0x20_sse2:
+	dw 32,32,32,32,32,32,32,32  ; eight words of 32 (0x20): rounding term added before "psrlw ..., 6" in the xmm routines
+ALIGN 16
+h264_d0x20_mmx:
+	dw 32,32,32,32  ; four words of 32 (0x20): same rounding term for the 64-bit mmx routine
+
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq4_mmx( uint8_t *src,         - source; rows 0..iHeight are read, 5 bytes each
+;                            int32_t iSrcStride,   - byte step between source rows
+;                            uint8_t *pDst,        - destination; 4 bytes are stored per row
+;                            int32_t iDstStride,   - byte step between destination rows
+;                            uint8_t *pABCD,       - four byte weights w0..w3
+;                            int32_t iHeight );    - number of output rows
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq4_mmx
+McChromaWidthEq4_mmx:  ; dst[x] = (w0*s[x] + w1*s[x+1] + w2*t[x] + w3*t[x+1] + 32) >> 6, x = 0..3, t = row below s
+	push esi
+	push edi
+	push ebx  ; three pushes, so stack argument k now lives at [esp + 12 + 4*k]
+
+	mov eax, [esp +12 + 20]  ; eax = pABCD
+	movd mm3, [eax]  ; mm3 low dword = weight bytes w0 w1 w2 w3
+	WELS_Zero mm7  ; macro from asm_inc.asm; presumably clears mm7 (it is used as the zero operand below) - confirm
+	punpcklbw mm3, mm3  ; bytes: w0 w0 w1 w1 w2 w2 w3 w3
+	movq      mm4, mm3
+	punpcklwd mm3, mm3  ; bytes: w0 x4, w1 x4
+	punpckhwd mm4, mm4  ; bytes: w2 x4, w3 x4
+
+	movq	  mm5, mm3
+	punpcklbw mm3, mm7  ; mm3 = w0 as four words (zero-extended, given mm7 == 0)
+	punpckhbw mm5, mm7  ; mm5 = w1 as four words
+
+	movq	  mm6, mm4
+	punpcklbw mm4, mm7  ; mm4 = w2 as four words
+	punpckhbw mm6, mm7  ; mm6 = w3 as four words
+
+	mov esi, [esp +12+ 4]  ; esi = src
+	mov eax, [esp + 12 + 8]  ; eax = iSrcStride
+	mov edi, [esp + 12 + 12]  ; edi = pDst
+	mov edx, [esp + 12 + 16]  ; edx = iDstStride
+    mov ecx, [esp + 12 + 24]  ; ecx = iHeight, used as the row counter
+
+	lea ebx, [esi + eax]  ; ebx = source row below the current one
+	movd mm0, [esi]  ; upper row, bytes 0..3
+	movd mm1, [esi+1]  ; upper row, bytes 1..4
+	punpcklbw mm0, mm7  ; widen both to words
+	punpcklbw mm1, mm7
+.xloop:  ; on entry: mm0 = upper[x], mm1 = upper[x+1] as words, ebx = lower row, edi = output row
+
+	pmullw mm0, mm3  ; w0 * upper[x]
+	pmullw mm1, mm5  ; w1 * upper[x+1]
+	paddw  mm0, mm1
+
+	movd  mm1, [ebx]  ; lower row, bytes 0..3
+	punpcklbw mm1, mm7
+	movq mm2, mm1  ; keep the unscaled lower[x]; it becomes upper[x] of the next row
+	pmullw mm1, mm4  ; w2 * lower[x]
+	paddw mm0, mm1
+
+	movd mm1, [ebx+1]  ; lower row, bytes 1..4
+	punpcklbw mm1, mm7
+	movq mm7, mm1  ; no spare register: mm7 temporarily holds unscaled lower[x+1] (re-zeroed below)
+	pmullw mm1,mm6  ; w3 * lower[x+1]
+	paddw mm0, mm1
+	movq mm1,mm7  ; mm1 = upper[x+1] for the next row
+
+	paddw mm0, [h264_d0x20_mmx]  ; + 32 (rounding)
+	psrlw mm0, 6  ; >> 6
+
+	WELS_Zero mm7  ; restore the zero operand after the temporary use above
+	packuswb mm0, mm7  ; words -> bytes with unsigned saturation
+	movd [edi], mm0  ; store 4 output pixels
+
+	movq mm0, mm2  ; mm0 = upper[x] for the next row
+
+	lea edi, [edi +edx  ]  ; next destination row
+	lea ebx, [ebx + eax]  ; next source row
+
+	dec ecx
+	jnz near .xloop  ; NOTE(review): bottom-tested loop; iHeight == 0 would wrap the counter
+	WELSEMMS  ; macro from asm_inc.asm; presumably emits emms to leave MMX state - confirm
+	pop ebx
+	pop edi
+	pop esi
+	ret
+
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq8_sse2( uint8_t *pSrc,        - source; rows 0..iHeight are read, 9 bytes each
+;                             int32_t iSrcStride,   - byte step between source rows
+;                             uint8_t *pDst,        - destination; 8 bytes are stored per row
+;                             int32_t iDstStride,   - byte step between destination rows
+;                             uint8_t *pABCD,       - four byte weights w0..w3
+;                             int32_t iHeight );    - number of output rows
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq8_sse2
+McChromaWidthEq8_sse2:  ; dst[x] = (w0*s[x] + w1*s[x+1] + w2*t[x] + w3*t[x+1] + 32) >> 6, x = 0..7, t = row below s
+	push esi
+	push edi
+	push ebx  ; three pushes, so stack argument k now lives at [esp + 12 + 4*k]
+
+	mov eax, [esp +12 + 20]  ; eax = pABCD
+	movd xmm3, [eax]  ; xmm3 low dword = weight bytes w0 w1 w2 w3
+	WELS_Zero xmm7  ; macro from asm_inc.asm; presumably clears xmm7 (it is used as the zero operand below) - confirm
+	punpcklbw  xmm3, xmm3  ; low 8 bytes: w0 w0 w1 w1 w2 w2 w3 w3
+	punpcklwd  xmm3, xmm3  ; 16 bytes: w0 x4, w1 x4, w2 x4, w3 x4
+
+	movdqa	   xmm4, xmm3
+	punpckldq  xmm3, xmm3  ; bytes: w0 x8, w1 x8
+	punpckhdq  xmm4, xmm4  ; bytes: w2 x8, w3 x8
+	movdqa     xmm5, xmm3
+	movdqa	   xmm6, xmm4
+
+	punpcklbw  xmm3, xmm7  ; xmm3 = w0 as eight words (zero-extended, given xmm7 == 0)
+	punpckhbw  xmm5, xmm7  ; xmm5 = w1 as eight words
+	punpcklbw  xmm4, xmm7  ; xmm4 = w2 as eight words
+	punpckhbw  xmm6, xmm7  ; xmm6 = w3 as eight words
+
+	mov esi, [esp +12+ 4]  ; esi = pSrc
+	mov eax, [esp + 12 + 8]  ; eax = iSrcStride
+	mov edi, [esp + 12 + 12]  ; edi = pDst
+	mov edx, [esp + 12 + 16]  ; edx = iDstStride
+    mov ecx, [esp + 12 + 24]  ; ecx = iHeight, used as the row counter
+
+	lea ebx, [esi + eax]  ; ebx = source row below the current one
+	movq xmm0, [esi]  ; upper row, bytes 0..7
+	movq xmm1, [esi+1]  ; upper row, bytes 1..8
+	punpcklbw xmm0, xmm7  ; widen both to words
+	punpcklbw xmm1, xmm7
+.xloop:  ; on entry: xmm0 = upper[x], xmm1 = upper[x+1] as words, ebx = lower row, edi = output row
+
+	pmullw xmm0, xmm3  ; w0 * upper[x]
+	pmullw xmm1, xmm5  ; w1 * upper[x+1]
+	paddw  xmm0, xmm1
+
+	movq  xmm1, [ebx]  ; lower row, bytes 0..7
+	punpcklbw xmm1, xmm7
+	movdqa xmm2, xmm1  ; keep the unscaled lower[x]; it becomes upper[x] of the next row
+	pmullw xmm1, xmm4  ; w2 * lower[x]
+	paddw xmm0, xmm1
+
+	movq xmm1, [ebx+1]  ; lower row, bytes 1..8
+	punpcklbw xmm1, xmm7
+	movdqa xmm7, xmm1  ; no spare register: xmm7 temporarily holds unscaled lower[x+1] (re-zeroed below)
+	pmullw xmm1, xmm6  ; w3 * lower[x+1]
+	paddw xmm0, xmm1
+	movdqa xmm1,xmm7  ; xmm1 = upper[x+1] for the next row
+
+	paddw xmm0, [h264_d0x20_sse2]  ; + 32 (rounding)
+	psrlw xmm0, 6  ; >> 6
+
+	WELS_Zero xmm7  ; restore the zero operand after the temporary use above
+	packuswb xmm0, xmm7  ; words -> bytes with unsigned saturation
+	movq [edi], xmm0  ; store 8 output pixels
+
+	movdqa xmm0, xmm2  ; xmm0 = upper[x] for the next row
+
+	lea edi, [edi +edx  ]  ; next destination row
+	lea ebx, [ebx + eax]  ; next source row
+
+	dec ecx
+	jnz near .xloop  ; NOTE(review): bottom-tested loop; iHeight == 0 would wrap the counter
+
+	pop ebx
+	pop edi
+	pop esi
+	ret
+
+
+
+
+ALIGN 16
+;***********************************************************************
+; void McChromaWidthEq8_ssse3( uint8_t *pSrc,        - source; 16 bytes are loaded per row (only bytes 0..8 are used)
+;                              int32_t iSrcStride,   - byte step between source rows
+;                              uint8_t *pDst,        - destination; 8 bytes are stored per row
+;                              int32_t iDstStride,   - byte step between destination rows
+;                              uint8_t *pABCD,       - four byte weights w0..w3
+;                              int32_t iHeight);     - output rows; two are produced per loop pass
+;***********************************************************************
+WELS_EXTERN McChromaWidthEq8_ssse3
+McChromaWidthEq8_ssse3:  ; dst[x] = (w0*s[x] + w1*s[x+1] + w2*t[x] + w3*t[x+1] + 32) >> 6, x = 0..7, t = row below s
+	push ebx  ; NOTE(review): ebx is saved and restored but never otherwise used in this routine
+	push esi
+	push edi  ; three pushes, so stack argument k now lives at [esp + 12 + 4*k]
+
+	mov eax, [esp + 12 + 20]  ; eax = pABCD
+
+    pxor      xmm7, xmm7  ; NOTE(review): dead - xmm7 is reloaded with the rounding constant before it is read
+    movd   xmm5, [eax]  ; low dword = weight bytes w0 w1 w2 w3
+    punpcklwd xmm5, xmm5  ; words: (w0,w1) (w0,w1) (w2,w3) (w2,w3)
+    punpckldq xmm5, xmm5  ; low qword = (w0,w1) x4, high qword = (w2,w3) x4
+    movdqa    xmm6, xmm5
+    punpcklqdq xmm5, xmm5  ; xmm5 = byte pair (w0,w1) repeated 8 times
+    punpckhqdq xmm6, xmm6  ; xmm6 = byte pair (w2,w3) repeated 8 times
+
+	mov eax, [esp + 12 + 4]  ; eax = pSrc
+	mov edx, [esp + 12 + 8]  ; edx = iSrcStride
+	mov esi, [esp + 12 + 12]  ; esi = pDst
+	mov edi, [esp + 12 + 16]  ; edi = iDstStride
+    mov ecx, [esp + 12 + 24]  ; ecx = iHeight, used as the row counter
+
+    sub esi, edi
+    sub esi, edi  ; esi = pDst - 2*iDstStride; the lea at the loop top adds it back
+	movdqa xmm7, [h264_d0x20_sse2]  ; xmm7 = eight words of 32 (rounding)
+
+	movdqu xmm0, [eax]  ; first source row, 16 bytes unaligned
+	movdqa xmm1, xmm0
+	psrldq xmm1, 1  ; same row shifted down by one byte
+	punpcklbw xmm0, xmm1  ; xmm0 bytes: s[0] s[1] s[1] s[2] ... s[7] s[8]
+
+.hloop_chroma:  ; on entry: xmm0 = interleaved (s[x], s[x+1]) pairs of the current upper row
+	lea	esi, [esi+2*edi]  ; advance to this pass's pair of output rows
+
+	movdqu xmm2, [eax+edx]  ; row below
+	movdqa xmm3, xmm2
+	psrldq xmm3, 1
+	punpcklbw xmm2, xmm3  ; interleave (t[x], t[x+1]) pairs
+	movdqa      xmm4, xmm2  ; copy kept: this row is the upper row of the second output row
+
+    pmaddubsw  xmm0, xmm5  ; per word: w0*s[x] + w1*s[x+1] (pixel bytes unsigned, weight bytes signed)
+    pmaddubsw  xmm2, xmm6  ; per word: w2*t[x] + w3*t[x+1]
+    paddw      xmm0, xmm2
+    paddw      xmm0, xmm7  ; + 32 (rounding)
+	psrlw      xmm0, 6  ; >> 6
+    packuswb   xmm0, xmm0  ; words -> bytes with unsigned saturation
+    movq       [esi],xmm0  ; store first output row (8 pixels)
+
+    lea eax, [eax+2*edx]  ; source pointer moves down two rows
+    movdqu xmm2, [eax]
+    movdqa xmm3, xmm2
+    psrldq xmm3, 1
+    punpcklbw xmm2, xmm3
+    movdqa      xmm0, xmm2  ; copy kept: upper row for the next loop pass
+
+    pmaddubsw  xmm4, xmm5  ; upper row of the second output row, weights (w0,w1)
+    pmaddubsw  xmm2, xmm6  ; lower row of the second output row, weights (w2,w3)
+    paddw      xmm4, xmm2
+    paddw      xmm4, xmm7  ; + 32 (rounding)
+	psrlw      xmm4, 6  ; >> 6
+    packuswb   xmm4, xmm4
+    movq       [esi+edi],xmm4  ; store second output row
+
+	sub ecx, 2
+	jnz .hloop_chroma  ; NOTE(review): exits only when ecx hits exactly 0, so iHeight must be even and non-zero
+	pop edi
+	pop esi
+	pop ebx
+
+	ret
+
+
--- a/codec/encoder/core/asm/mc_luma.asm
+++ b/codec/encoder/core/asm/mc_luma.asm
@@ -91,10 +91,10 @@
 
 ALIGN 16
 ;***********************************************************************
-; void McHorVer20WidthEq16_sse2(  uint8_t *pSrc, 
-;								int32_t iSrcStride, 
-;								uint8_t *pDst, 
-;								int32_t iDstStride, 
+; void McHorVer20WidthEq16_sse2(  uint8_t *pSrc,
+;								int32_t iSrcStride,
+;								uint8_t *pDst,
+;								int32_t iDstStride,
 ;								int32_t iHeight,
 ;                      );
 ;***********************************************************************
@@ -101,19 +101,19 @@
 McHorVer20WidthEq16_sse2:
 	push	esi
 	push	edi
-	
 
-	mov esi, [esp + 12]         
-	mov eax, [esp + 16]         
-	mov edi, [esp + 20]         
-	mov ecx, [esp + 28]         
-	mov edx, [esp + 24]			
-	sub esi, 2                  
-	
+
+	mov esi, [esp + 12]
+	mov eax, [esp + 16]
+	mov edi, [esp + 20]
+	mov ecx, [esp + 28]
+	mov edx, [esp + 24]
+	sub esi, 2
+
 	WELS_Zero  xmm7
 	movdqa xmm6, [h264_w0x10_1]
 .y_loop:
-	
+
 	movq xmm0, [esi]
 	punpcklbw xmm0, xmm7
 	movq xmm1, [esi+5]
@@ -126,7 +126,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@@ -152,7 +152,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3+8]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@@ -165,8 +165,8 @@
 	psraw xmm0, 5
 	packuswb xmm0, xmm7
 	movq [edi+8], xmm0
-	
-	
+
+
 	add esi, eax
 	add edi, edx
 	dec ecx
@@ -178,9 +178,9 @@
 
 ALIGN 16
 ;***********************************************************************
-; void McHorVer22Width8HorFirst_sse2( uint8_t*pSrc, 
-;									int32_t iSrcStride, 
-;									uint8_t* pTap,	
+; void McHorVer22Width8HorFirst_sse2( uint8_t*pSrc,
+;									int32_t iSrcStride,
+;									uint8_t* pTap,
 ;									int32_t iTapStride,
 ;									int32_t iHeight);
 ;***********************************************************************
@@ -193,11 +193,11 @@
 	mov edi, [esp+24]		;tap
 	mov edx, [esp+28]	;tap_stride
 	mov ebx, [esp+32]	;i_height
-	pxor xmm7, xmm7	
-	
+	pxor xmm7, xmm7
+
 	sub esi, eax				;;;;;;;;need more 5 lines.
 	sub esi, eax
-		
+
 .yloop_width_8:
 	movq xmm0, [esi]
 	punpcklbw xmm0, xmm7
@@ -211,7 +211,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@@ -221,7 +221,7 @@
 	psllw xmm4, 2
 	paddw xmm0, xmm4
 	movdqa [edi], xmm0
-		
+
 	add esi, eax
 	add edi, edx
 	dec ebx
@@ -230,12 +230,12 @@
 	pop edi
 	pop esi
 	ret
-	
+
 ;***********************************************************************
-; void McHorVer02WidthEq8_sse2( uint8_t *pSrc, 
-;                       int32_t iSrcStride, 
-;                       uint8_t *pDst, 
-;                       int32_t iDstStride, 
+; void McHorVer02WidthEq8_sse2( uint8_t *pSrc,
+;                       int32_t iSrcStride,
+;                       uint8_t *pDst,
+;                       int32_t iDstStride,
 ;                       int32_t iHeight )
 ;***********************************************************************
 ALIGN 16
@@ -242,18 +242,18 @@
 McHorVer02WidthEq8_sse2:
 	push esi
 	push edi
-	
-	mov esi, [esp + 12]           
-	mov edx, [esp + 16]	          
-	mov edi, [esp + 20]           
-	mov eax, [esp + 24]           
-	mov ecx, [esp + 28]           
 
+	mov esi, [esp + 12]
+	mov edx, [esp + 16]
+	mov edi, [esp + 20]
+	mov eax, [esp + 24]
+	mov ecx, [esp + 28]
+
 	sub esi, edx
 	sub esi, edx
 
 	WELS_Zero xmm7
-			
+
 	SSE_LOAD_8P xmm0, xmm7, [esi]
 	SSE_LOAD_8P xmm1, xmm7, [esi+edx]
 	lea esi, [esi+2*edx]
@@ -262,8 +262,8 @@
 	lea esi, [esi+2*edx]
 	SSE_LOAD_8P xmm4, xmm7, [esi]
 	SSE_LOAD_8P xmm5, xmm7, [esi+edx]
-	
-.start:	
+
+.start:
 	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	jz near .xx_exit
@@ -273,7 +273,7 @@
 	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [edi+eax]
 	dec ecx
 	jz near .xx_exit
-	
+
 	lea edi, [edi+2*eax]
 	SSE_LOAD_8P xmm7, xmm0, [esi+edx]
 	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
@@ -356,11 +356,11 @@
 
 
 ;***********************************************************************
-; void McHorVer02_sse2(	uint8_t *pSrc, 
-;                       int32_t iSrcStride, 
-;                       uint8_t *pDst, 
+; void McHorVer02_sse2(	uint8_t *pSrc,
+;                       int32_t iSrcStride,
+;                       uint8_t *pDst,
 ;                       int32_t iDstStride,
-;						int32_t iWidth, 
+;						int32_t iWidth,
 ;                       int32_t iHeight )
 ;***********************************************************************
 ALIGN 16
@@ -368,19 +368,19 @@
 	push esi
 	push edi
 	push ebx
-	
-	mov esi, [esp + 16]           
-	mov edx, [esp + 20]	          
-	mov edi, [esp + 24]           
-	mov eax, [esp + 28]           
-	mov ecx, [esp + 36]           
-	mov ebx, [esp + 32]			  
+
+	mov esi, [esp + 16]
+	mov edx, [esp + 20]
+	mov edi, [esp + 24]
+	mov eax, [esp + 28]
+	mov ecx, [esp + 36]
+	mov ebx, [esp + 32]
 	shr ebx, 3
 	sub esi, edx
 	sub esi, edx
-	
-.xloop:	
-	WELS_Zero xmm7			
+
+.xloop:
+	WELS_Zero xmm7
 	SSE_LOAD_8P xmm0, xmm7, [esi]
 	SSE_LOAD_8P xmm1, xmm7, [esi+edx]
 	lea esi, [esi+2*edx]
@@ -389,7 +389,7 @@
 	lea esi, [esi+2*edx]
 	SSE_LOAD_8P xmm4, xmm7, [esi]
 	SSE_LOAD_8P xmm5, xmm7, [esi+edx]
-	
+
 	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	lea esi, [esi+2*edx]
@@ -402,8 +402,8 @@
 	movdqa xmm5,xmm6
 	add edi, eax
 	sub esi, edx
-	
-.start:	
+
+.start:
 	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	jz near .x_loop_dec
@@ -413,7 +413,7 @@
 	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [edi+eax]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*eax]
 	SSE_LOAD_8P xmm7, xmm0, [esi+edx]
 	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
@@ -454,16 +454,16 @@
 	SSE_LOAD_8P xmm5, xmm6, [esi+edx]
 	jmp near .start
 
-.x_loop_dec:	
+.x_loop_dec:
 	dec ebx
 	jz  near .xx_exit
-	mov esi, [esp + 16]           
-	mov edi, [esp + 24]           
+	mov esi, [esp + 16]
+	mov edi, [esp + 24]
 	sub esi, edx
 	sub esi, edx
 	add esi, 8
 	add edi, 8
-	mov ecx, [esp + 36] 
+	mov ecx, [esp + 36]
 	jmp near .xloop
 
 .xx_exit:
@@ -473,12 +473,12 @@
 	ret
 
 
-ALIGN 16                  
+ALIGN 16
 ;***********************************************************************
-; void McHorVer20_sse2(		uint8_t *pSrc, 
-;                       int32_t iSrcStride, 
-;						uint8_t *pDst, 
-;						int32_t iDstStride, 
+; void McHorVer20_sse2(		uint8_t *pSrc,
+;                       int32_t iSrcStride,
+;						uint8_t *pDst,
+;						int32_t iDstStride,
 ;						int32_t iWidth,
 ;						int32_t iHeight
 ;                      );
@@ -487,19 +487,19 @@
 	push esi
 	push edi
 	push ebx
-	mov esi, [esp+16]     
-	mov eax, [esp+20]	
-	mov edi, [esp+24]	
-	mov edx, [esp+28]	
-	mov ecx, [esp+32]	
-	mov ebx, [esp+36]	
+	mov esi, [esp+16]
+	mov eax, [esp+20]
+	mov edi, [esp+24]
+	mov edx, [esp+28]
+	mov ecx, [esp+32]
+	mov ebx, [esp+36]
 	sub esi, 2
-	pxor xmm7, xmm7	
-	
+	pxor xmm7, xmm7
+
 	cmp ecx, 9
-	jne near .width_17	
-	
-.yloop_width_9:	
+	jne near .width_17
+
+.yloop_width_9:
 	movq xmm0, [esi]
 	punpcklbw xmm0, xmm7
 	movq xmm1, [esi+5]
@@ -512,7 +512,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	movdqa xmm7, xmm2
 	paddw   xmm7, xmm3
 	movdqa xmm6, xmm4
@@ -526,12 +526,12 @@
 	paddw xmm0, [h264_w0x10_1]
 	psraw  xmm0, 5
 	packuswb xmm0, xmm0
-	movd [edi], xmm0	
-	
+	movd [edi], xmm0
+
 	pxor  xmm7, xmm7
 	movq xmm0, [esi+6]
 	punpcklbw xmm0, xmm7
-	
+
 	paddw xmm4, xmm1
 	paddw xmm5, xmm3
 	psllw xmm5, 2
@@ -543,8 +543,8 @@
 	paddw xmm2, [h264_w0x10_1]
 	psraw  xmm2, 5
 	packuswb xmm2, xmm2
-	movq [edi+1], xmm2	
-		
+	movq [edi+1], xmm2
+
 	add esi, eax
 	add edi, edx
 	dec ebx
@@ -553,8 +553,8 @@
 	pop edi
 	pop esi
 	ret
-	
-	
+
+
 .width_17:
 .yloop_width_17:
 	movq xmm0, [esi]
@@ -569,7 +569,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@@ -582,7 +582,7 @@
 	psraw  xmm0, 5
 	packuswb xmm0, xmm0
 	movq [edi], xmm0
-		
+
 	movq xmm0, [esi+8]
 	punpcklbw xmm0, xmm7
 	movq xmm1, [esi+5+8]
@@ -595,7 +595,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3+8]
 	punpcklbw xmm5, xmm7
-	
+
 	movdqa xmm7, xmm2
 	paddw   xmm7, xmm3
 	movdqa xmm6, xmm4
@@ -610,12 +610,12 @@
 	psraw  xmm0, 5
 	packuswb xmm0, xmm0
 	movd [edi+8], xmm0
-	
-	
+
+
 	pxor  xmm7, xmm7
 	movq xmm0, [esi+6+8]
 	punpcklbw xmm0, xmm7
-	
+
 	paddw xmm4, xmm1
 	paddw xmm5, xmm3
 	psllw xmm5, 2
@@ -627,7 +627,7 @@
 	paddw xmm2, [h264_w0x10_1]
 	psraw  xmm2, 5
 	packuswb xmm2, xmm2
-	movq [edi+9], xmm2		
+	movq [edi+9], xmm2
 	add esi, eax
 	add edi, edx
 	dec ebx
@@ -636,14 +636,14 @@
 	pop edi
 	pop esi
 	ret
-	
-	
 
+
+
 ALIGN 16
 ;***********************************************************************
 ;void McHorVer22HorFirst_sse2
-;							(uint8_t *pSrc, 
-;							int32_t iSrcStride, 
+;							(uint8_t *pSrc,
+;							int32_t iSrcStride,
 ;							uint8_t * pTap,
 ;							int32_t iTapStride,
 ;							int32_t iWidth,int32_t iHeight);
@@ -652,21 +652,21 @@
 	push esi
 	push edi
 	push ebx
-	mov esi, [esp+16]     
-	mov eax, [esp+20]	
-	mov edi, [esp+24]	
-	mov edx, [esp+28]	
-	mov ecx, [esp+32]	
-	mov ebx, [esp+36]	
-	pxor xmm7, xmm7	
-	
+	mov esi, [esp+16]
+	mov eax, [esp+20]
+	mov edi, [esp+24]
+	mov edx, [esp+28]
+	mov ecx, [esp+32]
+	mov ebx, [esp+36]
+	pxor xmm7, xmm7
+
 	sub esi, eax				;;;;;;;;need more 5 lines.
 	sub esi, eax
-	
+
 	cmp ecx, 9
-	jne near .width_17	
-	
-.yloop_width_9:	
+	jne near .width_17
+
+.yloop_width_9:
 	movq xmm0, [esi]
 	punpcklbw xmm0, xmm7
 	movq xmm1, [esi+5]
@@ -679,7 +679,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	movdqa xmm7, xmm2
 	paddw   xmm7, xmm3
 	movdqa xmm6, xmm4
@@ -690,12 +690,12 @@
 	paddw xmm0, xmm6
 	psllw xmm6, 2
 	paddw xmm0, xmm6
-	movd [edi], xmm0	
-	
+	movd [edi], xmm0
+
 	pxor  xmm7, xmm7
 	movq xmm0, [esi+6]
 	punpcklbw xmm0, xmm7
-	
+
 	paddw xmm4, xmm1
 	paddw xmm5, xmm3
 	psllw xmm5, 2
@@ -704,9 +704,9 @@
 	paddw xmm2, xmm5
 	psllw xmm5, 2
 	paddw xmm2, xmm5
-	movq [edi+2], xmm2	
-	movhps [edi+2+8], xmm2	
-	
+	movq [edi+2], xmm2
+	movhps [edi+2+8], xmm2
+
 	add esi, eax
 	add edi, edx
 	dec ebx
@@ -715,8 +715,8 @@
 	pop edi
 	pop esi
 	ret
-	
-	
+
+
 .width_17:
 .yloop_width_17:
 	movq xmm0, [esi]
@@ -731,7 +731,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@@ -741,7 +741,7 @@
 	psllw xmm4, 2
 	paddw xmm0, xmm4
 	movdqa [edi], xmm0
-		
+
 	movq xmm0, [esi+8]
 	punpcklbw xmm0, xmm7
 	movq xmm1, [esi+5+8]
@@ -754,7 +754,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3+8]
 	punpcklbw xmm5, xmm7
-	
+
 	movdqa xmm7, xmm2
 	paddw   xmm7, xmm3
 	movdqa xmm6, xmm4
@@ -766,12 +766,12 @@
 	psllw xmm6, 2
 	paddw xmm0, xmm6
 	movd [edi+16], xmm0
-	
-	
+
+
 	pxor  xmm7, xmm7
 	movq xmm0, [esi+6+8]
 	punpcklbw xmm0, xmm7
-	
+
 	paddw xmm4, xmm1
 	paddw xmm5, xmm3
 	psllw xmm5, 2
@@ -780,9 +780,9 @@
 	paddw xmm2, xmm5
 	psllw xmm5, 2
 	paddw xmm2, xmm5
-	movq [edi+18], xmm2	
-	movhps [edi+18+8], xmm2	
-	
+	movq [edi+18], xmm2
+	movhps [edi+18+8], xmm2
+
 	add esi, eax
 	add edi, edx
 	dec ebx
@@ -791,23 +791,23 @@
 	pop edi
 	pop esi
 	ret
-	
-	
+
+
 %macro FILTER_VER 9
 	paddw  %1, %6
 	movdqa %7, %2
 	movdqa %8, %3
-	
-	
+
+
 	paddw %7, %5
 	paddw %8, %4
-	
-	psubw  %1, %7   
-	psraw   %1, 2	  
-	paddw  %1, %8   
-	psubw  %1, %7 
-	psraw   %1, 2	
-	paddw  %8, %1   
+
+	psubw  %1, %7
+	psraw   %1, 2
+	paddw  %1, %8
+	psubw  %1, %7
+	psraw   %1, 2
+	paddw  %8, %1
 	paddw  %8, [h264_mc_hc_32]
 	psraw   %8, 6
 	packuswb %8, %8
@@ -815,8 +815,8 @@
 %endmacro
 ;***********************************************************************
 ;void McHorVer22VerLastAlign_sse2(
-;											uint8_t *pTap, 
-;											int32_t iTapStride, 
+;											uint8_t *pTap,
+;											int32_t iTapStride,
 ;											uint8_t * pDst,
 ;											int32_t iDstStride,
 ;											int32_t iWidth,
@@ -828,15 +828,15 @@
 	push edi
 	push ebx
 	push ebp
-	
+
 	mov esi, [esp+20]
 	mov eax, [esp+24]
 	mov edi, [esp+28]
 	mov edx, [esp+32]
 	mov ebx, [esp+36]
-	mov ecx, [esp+40]	
-	shr ebx, 3	
-	
+	mov ecx, [esp+40]
+	shr ebx, 3
+
 .width_loop:
 	movdqa xmm0, [esi]
 	movdqa xmm1, [esi+eax]
@@ -846,12 +846,12 @@
 	lea esi, [esi+2*eax]
 	movdqa xmm4, [esi]
 	movdqa xmm5, [esi+eax]
-	
+
 	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	lea esi, [esi+2*eax]
 	movdqa xmm6, [esi]
-	
+
 	movdqa xmm0, xmm1
 	movdqa xmm1, xmm2
 	movdqa xmm2, xmm3
@@ -858,61 +858,61 @@
 	movdqa xmm3, xmm4
 	movdqa xmm4, xmm5
 	movdqa xmm5, xmm6
-	
+
 	add edi, edx
-	sub esi, eax		
-	
+	sub esi, eax
+
 .start:
 	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm6, [esi]
 	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm7, [esi+eax]
 	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm0, [esi]
 	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm1, [esi+eax]
 	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm2, [esi]
 	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm3, [esi+eax]
 	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm4, [esi]
 	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm5, [esi+eax]
 	jmp near .start
-	
+
 .x_loop_dec:
 	dec ebx
 	jz near .exit
@@ -922,9 +922,9 @@
 	add esi, 16
 	add edi, 8
 	jmp .width_loop
-	
-	
-	
+
+
+
 .exit:
 	pop ebp
 	pop ebx
@@ -934,8 +934,8 @@
 
 ;***********************************************************************
 ;void McHorVer22VerLastUnAlign_sse2(
-;											uint8_t *pTap, 
-;											int32_t iTapStride, 
+;											uint8_t *pTap,
+;											int32_t iTapStride,
 ;											uint8_t * pDst,
 ;											int32_t iDstStride,
 ;											int32_t iWidth,
@@ -947,15 +947,15 @@
 	push edi
 	push ebx
 	push ebp
-	
+
 	mov esi, [esp+20]
 	mov eax, [esp+24]
 	mov edi, [esp+28]
 	mov edx, [esp+32]
 	mov ebx, [esp+36]
-	mov ecx, [esp+40]	
-	shr ebx, 3	
-	
+	mov ecx, [esp+40]
+	shr ebx, 3
+
 .width_loop:
 	movdqu xmm0, [esi]
 	movdqu xmm1, [esi+eax]
@@ -965,12 +965,12 @@
 	lea esi, [esi+2*eax]
 	movdqu xmm4, [esi]
 	movdqu xmm5, [esi+eax]
-	
+
 	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	lea esi, [esi+2*eax]
 	movdqu xmm6, [esi]
-	
+
 	movdqa xmm0, xmm1
 	movdqa xmm1, xmm2
 	movdqa xmm2, xmm3
@@ -977,61 +977,61 @@
 	movdqa xmm3, xmm4
 	movdqa xmm4, xmm5
 	movdqa xmm5, xmm6
-	
+
 	add edi, edx
-	sub esi, eax		
-	
+	sub esi, eax
+
 .start:
 	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqu xmm6, [esi]
 	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqu xmm7, [esi+eax]
 	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqu xmm0, [esi]
 	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqu xmm1, [esi+eax]
 	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqu xmm2, [esi]
 	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqu xmm3, [esi+eax]
 	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqu xmm4, [esi]
 	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqu xmm5, [esi+eax]
 	jmp near .start
-	
+
 .x_loop_dec:
 	dec ebx
 	jz near .exit
@@ -1041,9 +1041,9 @@
 	add esi, 16
 	add edi, 8
 	jmp .width_loop
-	
-	
-	
+
+
+
 .exit:
 	pop ebp
 	pop ebx
--- a/codec/encoder/core/asm/memzero.asm
+++ b/codec/encoder/core/asm/memzero.asm
@@ -32,8 +32,8 @@
 ;*  memzero.asm
 ;*
 ;*  Abstract
-;*      
 ;*
+;*
 ;*  History
 ;*      9/16/2009 Created
 ;*
@@ -47,8 +47,8 @@
 ; Code
 ;***********************************************************************
 
-SECTION .text			
-		
+SECTION .text
+
 ALIGN 16
 ;***********************************************************************
 ;_inline void __cdecl WelsPrefetchZero_mmx(int8_t const*_A);
@@ -57,7 +57,7 @@
 WelsPrefetchZero_mmx:
 	mov  eax,[esp+4]
 	prefetchnta [eax]
-	ret 			
+	ret
 
 
 ALIGN 16
@@ -69,7 +69,7 @@
 		mov		eax,	[esp + 4]          ; dst
 		mov		ecx,	[esp + 8]
 		neg		ecx
-			
+
 		pxor	xmm0,		xmm0
 .memzeroa64_sse2_loops:
 		movdqa	[eax],		xmm0
@@ -77,12 +77,12 @@
 		movdqa	[eax+32],	xmm0
 		movdqa	[eax+48],	xmm0
 		add		eax, 0x40
-		
+
 		add ecx, 0x40
 		jnz near .memzeroa64_sse2_loops
-			
-		ret	
 
+		ret
+
 ALIGN 16
 ;***********************************************************************
 ;   void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
@@ -92,7 +92,7 @@
 		mov		eax,	[esp + 4]          ; dst
 		mov		ecx,	[esp + 8]
 		neg		ecx
-			
+
 		pxor	mm0,		mm0
 .memzero64_mmx_loops:
 		movq	[eax],		mm0
@@ -102,16 +102,16 @@
 		movq	[eax+32],	mm0
 		movq	[eax+40],	mm0
 		movq	[eax+48],	mm0
-		movq	[eax+56],	mm0		
+		movq	[eax+56],	mm0
 		add		eax,		0x40
-		
+
 		add ecx, 0x40
 		jnz near .memzero64_mmx_loops
-			
-		WELSEMMS	
-		ret	
-	
-ALIGN 16		
+
+		WELSEMMS
+		ret
+
+ALIGN 16
 ;***********************************************************************
 ;   void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
 ;***********************************************************************
@@ -119,17 +119,17 @@
 WelsSetMemZeroSize8_mmx:
 		mov		eax,	[esp + 4]		; dst
 		mov		ecx,	[esp + 8]		; size
-		neg		ecx			
+		neg		ecx
 		pxor	mm0,		mm0
-		
+
 .memzero8_mmx_loops:
 		movq	[eax],		mm0
 		add		eax,		0x08
-	
+
 		add		ecx,		0x08
 		jnz near .memzero8_mmx_loops
-		
-		WELSEMMS	
-		ret	
 
-							
+		WELSEMMS
+		ret
+
+
--- a/codec/encoder/core/asm/quant.asm
+++ b/codec/encoder/core/asm/quant.asm
@@ -44,17 +44,17 @@
 
 BITS 32
 
-SECTION .text	
+SECTION .text
 ;************************************************
-;NEW_QUANT 
+;NEW_QUANT
 ;************************************************
 
 %macro SSE2_Quant8  5
 		MOVDQ	%1, %5
-		pxor	%2, %2							
-		pcmpgtw	%2, %1							
-		pxor	%1, %2							
-		psubw	%1, %2							
+		pxor	%2, %2
+		pcmpgtw	%2, %1
+		pxor	%1, %2
+		psubw	%1, %2
 		paddusw	%1, %3
 		pmulhuw	%1, %4
 		pxor	%1, %2
@@ -64,10 +64,10 @@
 
 %macro SSE2_QuantMax8  6
 		MOVDQ	%1, %5
-		pxor	%2, %2							
-		pcmpgtw	%2, %1							
-		pxor	%1, %2							
-		psubw	%1, %2								
+		pxor	%2, %2
+		pcmpgtw	%2, %1
+		pxor	%1, %2
+		psubw	%1, %2
 		paddusw	%1, %3
 		pmulhuw	%1, %4
 		pmaxsw	%6, %1
@@ -86,17 +86,17 @@
 WELS_EXTERN WelsQuant4x4_sse2
 align 16
 WelsQuant4x4_sse2:
-		mov		eax,  [ff]		
-		mov		ecx,  [mf]			
+		mov		eax,  [ff]
+		mov		ecx,  [mf]
 		MOVDQ	xmm2, [eax]
 		MOVDQ	xmm3, [ecx]
-		
+
 		mov		edx,  [pDct]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10]	
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10]
 
 		ret
-	
+
 ;***********************************************************************
 ;void WelsQuant4x4Dc_sse2(int16_t *pDct, const int16_t ff, int16_t mf);
 ;***********************************************************************
@@ -104,17 +104,17 @@
 align 16
 WelsQuant4x4Dc_sse2:
 		mov		ax,		[mf]
-		SSE2_Copy8Times xmm3, eax						
-		
+		SSE2_Copy8Times xmm3, eax
+
 		mov		cx, [ff]
-		SSE2_Copy8Times xmm2, ecx						
+		SSE2_Copy8Times xmm2, ecx
 
 		mov		edx,  [pDct]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10]
-				
-		ret		
-		
+
+		ret
+
 ;***********************************************************************
 ;	void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff,  int16_t *mf);
 ;***********************************************************************
@@ -121,20 +121,20 @@
 WELS_EXTERN WelsQuantFour4x4_sse2
 align 16
 WelsQuantFour4x4_sse2:
-		mov		eax,  [ff]		
-		mov		ecx,  [mf]			
+		mov		eax,  [ff]
+		mov		ecx,  [mf]
 		MOVDQ	xmm2, [eax]
 		MOVDQ	xmm3, [ecx]
-		
-		mov		edx,  [pDct]	
+
+		mov		edx,  [pDct]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10]	
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x20]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x30]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x40]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x50]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x60]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x70]	
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x70]
 
 		ret
 
@@ -144,17 +144,17 @@
 WELS_EXTERN WelsQuantFour4x4Max_sse2
 align 16
 WelsQuantFour4x4Max_sse2:
-		mov		eax,  [ff]		
-		mov		ecx,  [mf]			
+		mov		eax,  [ff]
+		mov		ecx,  [mf]
 		MOVDQ	xmm2, [eax]
 		MOVDQ	xmm3, [ecx]
-		
-		mov		edx,  [pDct]		
+
+		mov		edx,  [pDct]
 		pxor	xmm4, xmm4
 		pxor	xmm5, xmm5
 		pxor	xmm6, xmm6
 		pxor	xmm7, xmm7
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx	   ], xmm4		
+		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx	   ], xmm4
 		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10], xmm4
 		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x20], xmm5
 		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x30], xmm5
@@ -162,20 +162,20 @@
 		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x50], xmm6
 		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x60], xmm7
 		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x70], xmm7
-		
+
 		SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0
-		pmaxsw  xmm0,  xmm4	
+		pmaxsw  xmm0,  xmm4
 		pmaxsw  xmm0,  xmm5
-		pmaxsw  xmm0,  xmm7			
+		pmaxsw  xmm0,  xmm7
 		movdqa	xmm1,  xmm0
 		punpckhqdq	xmm0, xmm1
 		pmaxsw	xmm0, xmm1
 
-		mov		edx,  [max]	
-		movq	[edx], xmm0	
-			
-		ret		
+		mov		edx,  [max]
+		movq	[edx], xmm0
 
+		ret
+
 %macro  MMX_Copy4Times 2
 		movd		%1, %2
 		punpcklwd	%1, %1
@@ -185,10 +185,10 @@
 SECTION .text
 
 %macro MMX_Quant4  4
-		pxor	%2, %2							
-		pcmpgtw	%2, %1							
-		pxor	%1, %2							
-		psubw	%1, %2							
+		pxor	%2, %2
+		pcmpgtw	%2, %1
+		pxor	%1, %2
+		psubw	%1, %2
 		paddusw	%1, %3
 		pmulhuw	%1, %4
 		pxor	%1, %2
@@ -211,13 +211,13 @@
 		movd		mm3,			[eax + 0x40]
 		movd		mm1,			[eax + 0x60]
 		punpcklwd	mm3,			mm1
-		
+
 		mov			cx,				0
 		mov			[eax],			cx
 		mov			[eax + 0x20],	cx
 		mov			[eax + 0x40],	cx
 		mov			[eax + 0x60],	cx
-		
+
 		;hdm_2x2,	mm0 = dct0 dct1, mm3 = dct2 dct3
 		movq		mm5,			mm3
 		paddw		mm3,			mm0
@@ -229,22 +229,22 @@
 		paddw		mm1,			mm3
 		psubw		mm3,			mm5
 		punpcklwd	mm1,			mm3
-		
+
 		;quant_2x2_dc
 		mov			ax,				[mf]
-		MMX_Copy4Times	mm3,		eax		
+		MMX_Copy4Times	mm3,		eax
 		mov			cx,				[ff]
 		MMX_Copy4Times	mm2,		ecx
 		MMX_Quant4		mm1,	mm0,	mm2,	mm3
-		
+
 		; store dct_2x2
-		mov			edx,			[dct2x2]	
+		mov			edx,			[dct2x2]
 		movq		[edx],			mm1
 		mov			ecx,			[iChromaDc]
 		movq		[ecx],			mm1
-		
+
 		; pNonZeroCount of dct_2x2
-		pcmpeqb		mm2,			mm2		; mm2 = FF 
+		pcmpeqb		mm2,			mm2		; mm2 = FF
 		pxor		mm3,			mm3
 		packsswb	mm1,			mm3
 		pcmpeqb		mm1,			mm3		; set FF if equal, 0 if not equal
@@ -251,10 +251,10 @@
 		psubsb		mm1,			mm2		; set 0 if equal, 1 if not equal
 		psadbw		mm1,			mm3		;
 		movd		eax,			mm1
-					
+
 		WELSEMMS
 		ret
-	
+
 ;***********************************************************************
 ;int32_t WelsHadamardQuant2x2Skip_mmx(int16_t *pDct, int16_t ff,  int16_t mf);
 ;***********************************************************************
@@ -269,7 +269,7 @@
 		movd		mm3,			[eax + 0x40]
 		movd		mm1,			[eax + 0x60]
 		punpcklwd	mm3,			mm1
-		
+
 		;hdm_2x2,	mm0 = dct0 dct1, mm3 = dct2 dct3
 		movq		mm5,			mm3
 		paddw		mm3,			mm0
@@ -281,16 +281,16 @@
 		paddw		mm1,			mm3
 		psubw		mm3,			mm5
 		punpcklwd	mm1,			mm3
-		
+
 		;quant_2x2_dc
 		mov			ax,				[mf]
-		MMX_Copy4Times	mm3,		eax		
+		MMX_Copy4Times	mm3,		eax
 		mov			cx,				[ff]
 		MMX_Copy4Times	mm2,		ecx
 		MMX_Quant4		mm1,	mm0,	mm2,	mm3
-		
+
 		; pNonZeroCount of dct_2x2
-		pcmpeqb		mm2,			mm2		; mm2 = FF 
+		pcmpeqb		mm2,			mm2		; mm2 = FF
 		pxor		mm3,			mm3
 		packsswb	mm1,			mm3
 		pcmpeqb		mm1,			mm3		; set FF if equal, 0 if not equal
@@ -297,16 +297,16 @@
 		psubsb		mm1,			mm2		; set 0 if equal, 1 if not equal
 		psadbw		mm1,			mm3		;
 		movd		eax,			mm1
-			
-		WELSEMMS		
-		ret	
-		
-		
-%macro SSE2_DeQuant8 3  
+
+		WELSEMMS
+		ret
+
+
+%macro SSE2_DeQuant8 3
     MOVDQ  %2, %1
     pmullw %2, %3
     MOVDQ  %1, %2
-%endmacro 
+%endmacro
 
 
 ALIGN  16
@@ -329,7 +329,7 @@
 ;***********************************************************************====
 ;void WelsDequantFour4x4_sse2(int16_t *pDct, const uint16_t* mf);
 ;***********************************************************************====
-    
+
 align 16
 
 WELS_EXTERN WelsDequantFour4x4_sse2
@@ -356,15 +356,15 @@
 WELS_EXTERN WelsDequantIHadamard4x4_sse2
 align 16
 WelsDequantIHadamard4x4_sse2:
-		mov			eax,			[esp + 4]				
+		mov			eax,			[esp + 4]
 		mov			cx,				[esp + 8]
-		
+
 		; WelsDequantLumaDc4x4
-		SSE2_Copy8Times	xmm1,		ecx		
+		SSE2_Copy8Times	xmm1,		ecx
 		;psrlw		xmm1,		2		; for the (>>2) in ihdm
 		MOVDQ		xmm0,		[eax]
 		MOVDQ		xmm2,		[eax+0x10]
-		pmullw		xmm0,		xmm1		
+		pmullw		xmm0,		xmm1
 		pmullw		xmm2,		xmm1
 
 		; ihdm_4x4
@@ -371,24 +371,23 @@
 		movdqa		xmm1,		xmm0
 		psrldq		xmm1,		8
 		movdqa		xmm3,		xmm2
-		psrldq		xmm3,		8		
-		
-		SSE2_SumSub		xmm0, xmm3,	xmm5					; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3	
-		SSE2_SumSub		xmm1, xmm2, xmm5					; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2														
+		psrldq		xmm3,		8
+
+		SSE2_SumSub		xmm0, xmm3,	xmm5					; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3
+		SSE2_SumSub		xmm1, xmm2, xmm5					; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2
 		SSE2_SumSub		xmm3, xmm2, xmm5					; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2
 		SSE2_SumSub		xmm0, xmm1, xmm5               		; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1
 
-		SSE2_TransTwo4x4W	xmm2, xmm1, xmm3, xmm0, xmm4		
-		SSE2_SumSub		xmm2, xmm4,	xmm5		
-		SSE2_SumSub		xmm1, xmm0, xmm5																		
-		SSE2_SumSub		xmm4, xmm0, xmm5							
-		SSE2_SumSub		xmm2, xmm1, xmm5 
+		SSE2_TransTwo4x4W	xmm2, xmm1, xmm3, xmm0, xmm4
+		SSE2_SumSub		xmm2, xmm4,	xmm5
+		SSE2_SumSub		xmm1, xmm0, xmm5
+		SSE2_SumSub		xmm4, xmm0, xmm5
+		SSE2_SumSub		xmm2, xmm1, xmm5
 		SSE2_TransTwo4x4W	xmm0, xmm1, xmm4, xmm2, xmm3
-		
+
 		punpcklqdq	xmm0,		xmm1
 		MOVDQ		[eax],		xmm0
-		
+
 		punpcklqdq	xmm2,		xmm3
-		MOVDQ		[eax+16],	xmm2			
+		MOVDQ		[eax+16],	xmm2
 		ret
-	
\ No newline at end of file
--- a/codec/encoder/core/asm/satd_sad.asm
+++ b/codec/encoder/core/asm/satd_sad.asm
@@ -37,7 +37,7 @@
 ;*      WelsSampleSatd16x8_sse2
 ;*      WelsSampleSatd8x16_sse2
 ;*      WelsSampleSatd16x16_sse2
-;*      
+;*
 ;*      WelsSampleSad16x8_sse2
 ;*      WelsSampleSad16x16_sse2
 ;*
@@ -99,12 +99,12 @@
 
 %macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4  pOut: xmm4,xmm2,xmm1,xmm3
    SSE2_SumSub %1, %2, %5
-   SSE2_SumSub %3, %4, %5 
-   SSE2_SumSub %2, %4, %5 
-   SSE2_SumSub %1, %3, %5 
-%endmacro 
+   SSE2_SumSub %3, %4, %5
+   SSE2_SumSub %2, %4, %5
+   SSE2_SumSub %1, %3, %5
+%endmacro
 
-%macro SSE2_SumAbs4 7  
+%macro SSE2_SumAbs4 7
 	WELS_AbsW %1, %3
 	WELS_AbsW %2, %3
 	WELS_AbsW %4, %6
@@ -113,13 +113,13 @@
 	paddusw       %4, %5
 	paddusw       %7, %1
 	paddusw       %7, %4
-%endmacro 
+%endmacro
 
 %macro  SSE2_SumWHorizon 3
 	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
 	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
-	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04 
-	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26 
+	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04
+	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
 	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
 	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
 	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
@@ -132,12 +132,12 @@
 	lea                 ecx, [ecx+2*edx]
 	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[eax],[ecx]
 	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[eax+ebx],[ecx+edx]
-	
+
 	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
 	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
-	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5 
+	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
 	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
-	
+
 	lea					eax,    [eax+2*ebx]
     lea					ecx,    [ecx+2*edx]
 	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[eax],[ecx]
@@ -146,11 +146,11 @@
 	lea                 ecx, [ecx+2*edx]
 	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[eax],[ecx]
 	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[eax+ebx],[ecx+edx]
-	
+
 	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
 	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
-	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5 
-	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6	
+	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
+	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
 %endmacro
 
 ;***********************************************************************
@@ -165,8 +165,8 @@
 	mov       eax,  [esp+8]
 	mov       ebx,  [esp+12]
 	mov       ecx,  [esp+16]
-	mov       edx,  [esp+20]    
-	
+	mov       edx,  [esp+20]
+
     movd      xmm0, [eax]
     movd      xmm1, [eax+ebx]
     lea       eax , [eax+2*ebx]
@@ -174,7 +174,7 @@
     movd      xmm3, [eax+ebx]
     punpckldq xmm0, xmm2
     punpckldq xmm1, xmm3
-   
+
     movd      xmm4, [ecx]
     movd      xmm5, [ecx+edx]
     lea       ecx , [ecx+2*edx]
@@ -188,7 +188,7 @@
     punpcklbw xmm1, xmm6
     punpcklbw xmm4, xmm6
     punpcklbw xmm5, xmm6
-    
+
     psubw     xmm0, xmm4
     psubw     xmm1, xmm5
 
@@ -196,7 +196,7 @@
     paddw     xmm0, xmm1
     psubw     xmm2, xmm1
     SSE2_XSawp qdq, xmm0, xmm2, xmm3
-    
+
     movdqa     xmm4, xmm0
     paddw      xmm0, xmm3
     psubw      xmm4, xmm3
@@ -204,7 +204,7 @@
     movdqa         xmm2, xmm0
     punpcklwd      xmm0, xmm4
     punpckhwd      xmm4, xmm2
-    
+
 	SSE2_XSawp     dq,  xmm0, xmm4, xmm3
 	SSE2_XSawp     qdq, xmm0, xmm3, xmm5
 
@@ -211,16 +211,16 @@
     movdqa         xmm7, xmm0
     paddw          xmm0, xmm5
     psubw          xmm7, xmm5
-    
+
 	SSE2_XSawp     qdq,  xmm0, xmm7, xmm1
 
     movdqa         xmm2, xmm0
     paddw          xmm0, xmm1
     psubw          xmm2, xmm1
-    
-    WELS_AbsW  xmm0, xmm3   
+
+    WELS_AbsW  xmm0, xmm3
     paddusw        xmm6, xmm0
-	WELS_AbsW  xmm2, xmm4   
+	WELS_AbsW  xmm2, xmm4
     paddusw        xmm6, xmm2
     SSE2_SumWHorizon1  xmm6, xmm4
 	movd           eax,  xmm6
@@ -228,7 +228,7 @@
     shr            eax,  1
 	pop            ebx
 	ret
- 
+
  ;***********************************************************************
  ;
  ;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
@@ -241,16 +241,16 @@
 	 mov    eax,    [esp+8]
 	 mov    ebx,    [esp+12]
 	 mov    ecx,    [esp+16]
-	 mov    edx,    [esp+20]    
+	 mov    edx,    [esp+20]
 	 pxor   xmm6,   xmm6
-     pxor   xmm7,   xmm7     
-     SSE2_GetSatd8x8	 
+     pxor   xmm7,   xmm7
+     SSE2_GetSatd8x8
      psrlw   xmm6,  1
 	 SSE2_SumWHorizon   xmm6,xmm4,xmm7
 	 movd    eax,   xmm6
 	 pop     ebx
 	 ret
- 
+
  ;***********************************************************************
  ;
  ;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
@@ -263,15 +263,15 @@
 	 mov    eax,    [esp+8]
 	 mov    ebx,    [esp+12]
 	 mov    ecx,    [esp+16]
-	 mov    edx,    [esp+20]    
+	 mov    edx,    [esp+20]
 	 pxor   xmm6,   xmm6
-     pxor   xmm7,   xmm7  
-        
-	 SSE2_GetSatd8x8	 
+     pxor   xmm7,   xmm7
+
+	 SSE2_GetSatd8x8
      lea    eax,    [eax+2*ebx]
-     lea    ecx,    [ecx+2*edx]     
-	 SSE2_GetSatd8x8	
-	  
+     lea    ecx,    [ecx+2*edx]
+	 SSE2_GetSatd8x8
+
 	 psrlw   xmm6,  1
 	 SSE2_SumWHorizon   xmm6,xmm4,xmm7
 	 movd    eax,   xmm6
@@ -290,15 +290,15 @@
 	mov    eax,    [esp+8]
 	mov    ebx,    [esp+12]
 	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]    
+	mov    edx,    [esp+20]
 	pxor   xmm6,   xmm6
     pxor   xmm7,   xmm7
-    
+
 	SSE2_GetSatd8x8
 	mov    eax,    [esp+8]
     mov    ecx,    [esp+16]
     add    eax,    8
-    add    ecx,    8    
+    add    ecx,    8
 	SSE2_GetSatd8x8
 
 	psrlw   xmm6,  1
@@ -319,25 +319,25 @@
 	mov    eax,    [esp+8]
 	mov    ebx,    [esp+12]
 	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]    
+	mov    edx,    [esp+20]
 	pxor   xmm6,   xmm6
     pxor   xmm7,   xmm7
-    
-	SSE2_GetSatd8x8		
+
+	SSE2_GetSatd8x8
 	lea    eax,    [eax+2*ebx]
-	lea    ecx,    [ecx+2*edx]	
+	lea    ecx,    [ecx+2*edx]
 	SSE2_GetSatd8x8
-	
+
 	mov    eax,    [esp+8]
 	mov    ecx,    [esp+16]
 	add    eax,    8
 	add    ecx,    8
-	
-	SSE2_GetSatd8x8	
+
+	SSE2_GetSatd8x8
 	lea    eax,    [eax+2*ebx]
-	lea    ecx,    [ecx+2*edx]	
+	lea    ecx,    [ecx+2*edx]
 	SSE2_GetSatd8x8
-	
+
  ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
     psrlw   xmm6,  1
 	SSE2_SumWHorizon   xmm6,xmm4,xmm7
@@ -353,18 +353,18 @@
 
 ;***********************************************************************
 ;
-;Pixel_satd_intra_sse2 BEGIN 
+;Pixel_satd_intra_sse2 BEGIN
 ;
 ;***********************************************************************
 
-%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 
+%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
 	pmaddubsw    %1, xmm5
 	movdqa       %2, %1
 	pmaddwd      %1, xmm7
 	pmaddwd      %2, xmm6
 	movdqa       %3, %1
-	punpckldq    %1, %2 
-	punpckhdq    %2, %3 
+	punpckldq    %1, %2
+	punpckhdq    %2, %3
 	movdqa       %3, %1
 	punpcklqdq   %1, %2
 	punpckhqdq   %3, %2
@@ -373,14 +373,14 @@
 	packssdw     %1, %3
 	psllw        %1, 2
 %endmacro
-%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2  
+%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
 	pmaddubsw    %1, xmm5
 	movdqa       %2, %1
 	pmaddwd      %1, xmm7
 	pmaddwd      %2, xmm6
 	movdqa       %3, %1
-	punpckldq    %1, %2 
-	punpckhdq    %2, %3 
+	punpckldq    %1, %2
+	punpckhdq    %2, %3
 	movdqa       %3, %1
 	punpcklqdq   %1, %2
 	punpckhqdq   %3, %2
@@ -387,7 +387,7 @@
 ;    paddd        xmm4, %1 ;for dc
 ;	 paddd        xmm4, %3 ;for dc
 	movdqa       %4, %1
-	punpcklqdq   %4, %3 
+	punpcklqdq   %4, %3
 	packssdw     %1, %3
 	psllw        %1, 2
 %endmacro
@@ -415,25 +415,25 @@
 	pinsrw      xmm0,   word[esi+%2+8], 4
 	psubsw      xmm0,   xmm7
 	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0 
+	paddw       xmm4,   xmm0
 	pxor        xmm0,   xmm0
 	pinsrw      xmm0,   word[esi+%2+2],  0
 	pinsrw      xmm0,   word[esi+%2+10], 4
 	psubsw      xmm0,   xmm1
 	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0 
+	paddw       xmm4,   xmm0
 	pxor        xmm0,   xmm0
 	pinsrw      xmm0,   word[esi+%2+4],  0
 	pinsrw      xmm0,   word[esi+%2+12], 4
 	psubsw      xmm0,   xmm3
 	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0 
+	paddw       xmm4,   xmm0
 	pxor        xmm0,   xmm0
 	pinsrw      xmm0,   word[esi+%2+6],  0
 	pinsrw      xmm0,   word[esi+%2+14], 4
 	psubsw      xmm0,   xmm2
 	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0 
+	paddw       xmm4,   xmm0
 %endmacro
 %macro SSE41_GetX38x4SatdH  3
 	movq        xmm0,   [esi+%3+8*%1]
@@ -455,7 +455,7 @@
 	psubsw      xmm0,   xmm7
 	pabsw       xmm0,   xmm0
 	paddw       xmm6,   xmm0
-	paddw       xmm6,   xmm2 
+	paddw       xmm6,   xmm2
 %endmacro
 %macro SSE41_ChromaGetX38x4SatdDC 1
 	shl         %1,     4
@@ -463,13 +463,13 @@
 	psubsw      xmm0,   xmm7
 	pabsw       xmm0,   xmm0
 	paddw       xmm6,   xmm0
-	paddw       xmm6,   xmm2 
+	paddw       xmm6,   xmm2
 %endmacro
 %macro SSE41_I16x16GetX38x4Satd 2
 	SSE41_GetX38x4SatdDec
 	SSE41_GetX38x4SatdV   %1, %2
 	SSE41_GetX38x4SatdH   %1, %2, 32
-	SSE41_I16X16GetX38x4SatdDC 
+	SSE41_I16X16GetX38x4SatdDC
 %endmacro
 %macro SSE41_ChromaGetX38x4Satd 2
 	SSE41_GetX38x4SatdDec
@@ -478,11 +478,11 @@
 	SSE41_ChromaGetX38x4SatdDC %1
 %endmacro
 %macro SSE41_HSum8W 3
-	pmaddwd     %1, %2 
-	movhlps     %3, %1 
-	paddd       %1, %3 
-	pshuflw     %3, %1,0Eh 
-	paddd       %1, %3 
+	pmaddwd     %1, %2
+	movhlps     %3, %1
+	paddd       %1, %3
+	pshuflw     %3, %1,0Eh
+	paddd       %1, %3
 %endmacro
 
 WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
@@ -493,7 +493,7 @@
 	mov    ecx,    [esp+16]
 	mov    edx,    [esp+20]
 	mov    eax,    [esp+24]
-	mov    ebx,    [esp+28]    
+	mov    ebx,    [esp+28]
 	mov    esi,    [esp+40] ;temp_satd
 	pxor        xmm4,   xmm4
 	movdqa      xmm5,   [HSumSubDB1]
@@ -507,29 +507,29 @@
 	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
 	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
 	movdqa      [esi],  xmm0 ;V
-	movdqa      [esi+16], xmm1 
+	movdqa      [esi+16], xmm1
 	add         ecx,    edx
 	pinsrb      xmm0,   byte[ecx-1], 0
 	pinsrb      xmm0,   byte[ecx+edx-1], 1
-	lea         ecx,    [ecx+2*edx]  
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     2
 	pinsrb      xmm0,   byte[ecx+edx-1], 3
-	lea         ecx,    [ecx+2*edx] 
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     4
 	pinsrb      xmm0,   byte[ecx+edx-1], 5
-	lea         ecx,    [ecx+2*edx] 
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     6
 	pinsrb      xmm0,   byte[ecx+edx-1], 7
-	lea         ecx,    [ecx+2*edx]  
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     8
 	pinsrb      xmm0,   byte[ecx+edx-1], 9
-	lea         ecx,    [ecx+2*edx] 
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     10
 	pinsrb      xmm0,   byte[ecx+edx-1], 11
-	lea         ecx,    [ecx+2*edx] 
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     12
 	pinsrb      xmm0,   byte[ecx+edx-1], 13
-	lea         ecx,    [ecx+2*edx] 
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     14
 	pinsrb      xmm0,   byte[ecx+edx-1], 15
 	movhlps		xmm1,   xmm0
@@ -549,7 +549,7 @@
 	pxor        xmm6,   xmm6 ;DC
 	mov         ecx,    0
 	mov         edi,    0
-.loop16x16_get_satd:    
+.loop16x16_get_satd:
 .loopStart1:
 	SSE41_I16x16GetX38x4Satd ecx, edi
 	inc          ecx
@@ -562,8 +562,8 @@
 	mov         ecx, 0
 	add         edi, 16
 	jmp         .loop16x16_get_satd
- .loop16x16_get_satd_end:   
-	MMX_DW_1_2REG    xmm0, xmm1 
+ .loop16x16_get_satd_end:
+	MMX_DW_1_2REG    xmm0, xmm1
 	psrlw       xmm4, 1 ;/2
 	psrlw       xmm5, 1 ;/2
 	psrlw       xmm6, 1 ;/2
@@ -570,7 +570,7 @@
 	SSE41_HSum8W     xmm4, xmm0, xmm1
 	SSE41_HSum8W     xmm5, xmm0, xmm1
 	SSE41_HSum8W     xmm6, xmm0, xmm1
-	
+
 	; comparing order: DC H V
 	movd      ebx, xmm6 ;DC
 	movd      edi, xmm5 ;H
@@ -577,33 +577,33 @@
 	movd      ecx, xmm4 ;V
 	mov      edx, [esp+36]
 	shl       edx, 1
-	add       edi, edx 
-	add       ebx, edx 
+	add       edi, edx
+	add       ebx, edx
 	mov       edx, [esp+32]
 	cmp       ebx, edi
 	jge near   not_dc_16x16
 	cmp        ebx, ecx
 	jge near   not_dc_h_16x16
-	
+
 	; for DC mode
 	mov       dword[edx], 2;I16_PRED_DC
-	mov       eax, ebx 
+	mov       eax, ebx
 	jmp near return_satd_intra_16x16_x3
 not_dc_16x16:
-	; for H mode 
+	; for H mode
 	cmp       edi, ecx
 	jge near   not_dc_h_16x16
 	mov       dword[edx], 1;I16_PRED_H
-	mov       eax, edi 
+	mov       eax, edi
 	jmp near return_satd_intra_16x16_x3
 not_dc_h_16x16:
 	; for V mode
 	mov       dword[edx], 0;I16_PRED_V
 	mov       eax, ecx
-return_satd_intra_16x16_x3: 
+return_satd_intra_16x16_x3:
 	WELSEMMS
-	pop         edi 
-	pop         esi 
+	pop         edi
+	pop         esi
 	pop         ebx
 ret
 
@@ -619,13 +619,13 @@
 	add         ecx,    edx
 	pinsrb      xmm0,   byte[ecx-1], 0
 	pinsrb      xmm0,   byte[ecx+edx-1], 1
-	lea         ecx,    [ecx+2*edx]  
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     2
 	pinsrb      xmm0,   byte[ecx+edx-1], 3
-	lea         ecx,    [ecx+2*edx] 
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     4
 	pinsrb      xmm0,   byte[ecx+edx-1], 5
-	lea         ecx,    [ecx+2*edx] 
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     6
 	pinsrb      xmm0,   byte[ecx+edx-1], 7
 	punpcklqdq  xmm0,   xmm0
@@ -634,10 +634,10 @@
 ;(sum+2)>>2
 	movdqa      xmm6,   [PDQ2]
 	movdqa      xmm5,   xmm4
-	punpckhqdq  xmm5,   xmm1    
+	punpckhqdq  xmm5,   xmm1
 	paddd       xmm5,   xmm6
 	psrld       xmm5,   2
-;(sum1+sum2+4)>>3   
+;(sum1+sum2+4)>>3
 	paddd       xmm6,   xmm6
 	paddd       xmm4,   xmm1
 	paddd       xmm4,   xmm6
@@ -644,8 +644,8 @@
 	psrld       xmm4,   3
 ;satd *16
 	pslld       xmm5,   4
-	pslld       xmm4,   4    
-;temp satd    
+	pslld       xmm4,   4
+;temp satd
 	movdqa      xmm6,   xmm4
 	punpcklqdq  xmm4,   xmm5
 	psllq       xmm4,   32
@@ -655,12 +655,12 @@
 	psllq       xmm5,   32
 	psrlq       xmm5,   32
 	movdqa      [esi+48], xmm5
-	
+
 	pxor        xmm4,   xmm4 ;V
 	pxor        xmm5,   xmm5 ;H
 	pxor        xmm6,   xmm6 ;DC
 	mov         ecx,    0
-loop_chroma_satdx3_cb_cr:    
+loop_chroma_satdx3_cb_cr:
 	SSE41_ChromaGetX38x4Satd ecx, 0
 	inc             ecx
 	cmp             ecx, 2
@@ -668,13 +668,13 @@
 %endmacro
 
 %macro SSEReg2MMX 3
-	movdq2q     %2, %1 
-	movhlps     %1, %1 
-	movdq2q     %3, %1 
+	movdq2q     %2, %1
+	movhlps     %1, %1
+	movdq2q     %3, %1
 %endmacro
 %macro MMXReg2SSE 4
-	movq2dq     %1, %3 
-	movq2dq     %2, %4 
+	movq2dq     %1, %3
+	movq2dq     %2, %4
 	punpcklqdq  %1, %2
 %endmacro
 ;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
@@ -687,10 +687,10 @@
 	mov    ecx,    [esp+16]
 	mov    edx,    [esp+20]
 	mov    eax,    [esp+24]
-	mov    ebx,    [esp+28]    
+	mov    ebx,    [esp+28]
 	mov    esi,    [esp+40] ;temp_satd
 	xor    edi,    edi
-loop_chroma_satdx3: 
+loop_chroma_satdx3:
 	SSE41_ChromaGetX38x8Satd
 	cmp             edi, 1
 	je              loop_chroma_satdx3end
@@ -701,16 +701,16 @@
 	mov         ecx,  [esp+44]
 	mov         eax,  [esp+48]
 	jmp         loop_chroma_satdx3
-loop_chroma_satdx3end:    
+loop_chroma_satdx3end:
 	MMXReg2SSE  xmm0, xmm3, mm0, mm1
 	MMXReg2SSE  xmm1, xmm3, mm2, mm3
 	MMXReg2SSE  xmm2, xmm3, mm5, mm6
-	
+
 	paddw       xmm4, xmm0
 	paddw       xmm5, xmm1
 	paddw       xmm6, xmm2
-	
-	MMX_DW_1_2REG    xmm0, xmm1 
+
+	MMX_DW_1_2REG    xmm0, xmm1
 	psrlw       xmm4, 1 ;/2
 	psrlw       xmm5, 1 ;/2
 	psrlw       xmm6, 1 ;/2
@@ -730,57 +730,57 @@
 	jge near   not_dc_8x8
 	cmp        ebx, ecx
 	jge near   not_dc_h_8x8
-	
+
 	; for DC mode
 	mov       dword[edx], 0;I8_PRED_DC
-	mov       eax, ebx 
+	mov       eax, ebx
 	jmp near return_satd_intra_8x8_x3
 not_dc_8x8:
-	; for H mode 
+	; for H mode
 	cmp       edi, ecx
 	jge near   not_dc_h_8x8
 	mov       dword[edx], 1;I8_PRED_H
-	mov       eax, edi 
+	mov       eax, edi
 	jmp near return_satd_intra_8x8_x3
 not_dc_h_8x8:
 	; for V mode
 	mov       dword[edx], 2;I8_PRED_V
 	mov       eax, ecx
-return_satd_intra_8x8_x3: 
+return_satd_intra_8x8_x3:
 	WELSEMMS
-	pop         edi 
-	pop         esi 
+	pop         edi
+	pop         esi
 	pop         ebx
 ret
 
-	
+
 ;***********************************************************************
 ;
-;Pixel_satd_intra_sse2 END 
+;Pixel_satd_intra_sse2 END
 ;
 ;***********************************************************************
 %macro SSSE3_Get16BSadHVDC 2
-  movd        xmm6,%1 
-  pshufb      xmm6,xmm1 
+  movd        xmm6,%1
+  pshufb      xmm6,xmm1
   movdqa      %1,  xmm6
-  movdqa      xmm0,%2 
-  psadbw      xmm0,xmm7 
-  paddw       xmm4,xmm0 
   movdqa      xmm0,%2
-  psadbw      xmm0,xmm5 
-  paddw       xmm2,xmm0 
+  psadbw      xmm0,xmm7
+  paddw       xmm4,xmm0
+  movdqa      xmm0,%2
+  psadbw      xmm0,xmm5
+  paddw       xmm2,xmm0
   psadbw      xmm6,%2
-  paddw       xmm3,xmm6 
+  paddw       xmm3,xmm6
 %endmacro
 %macro WelsAddDCValue 4
     movzx   %2, byte %1
-    mov    %3, %2 
+    mov    %3, %2
     add     %4, %2
-%endmacro   
+%endmacro
 
 ;***********************************************************************
 ;
-;Pixel_sad_intra_ssse3 BEGIN 
+;Pixel_sad_intra_ssse3 BEGIN
 ;
 ;***********************************************************************
 WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
@@ -792,14 +792,14 @@
 	mov    edx,    [esp+20]
 	mov    edi,    [esp+40] ;temp_sad
 	sub    ecx,    edx
-    movdqa      xmm5,[ecx] 
+    movdqa      xmm5,[ecx]
     pxor        xmm0,xmm0
-    psadbw      xmm0,xmm5 
-    movhlps     xmm1,xmm0 
-    paddw       xmm0,xmm1 
+    psadbw      xmm0,xmm5
+    movhlps     xmm1,xmm0
+    paddw       xmm0,xmm1
     movd        eax,xmm0
-     
-    add         ecx,edx 
+
+    add         ecx,edx
     lea         ebx, [edx+2*edx]
     WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
     WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
@@ -824,45 +824,45 @@
     WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
     WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
     sub        edi, 192
-    add         eax,10h 
-    shr         eax,5 
-    movd        xmm7,eax 
+    add         eax,10h
+    shr         eax,5
+    movd        xmm7,eax
     pxor        xmm1,xmm1
     pshufb      xmm7,xmm1
-    pxor        xmm4,xmm4 
-    pxor        xmm3,xmm3 
-    pxor        xmm2,xmm2 
-;sad begin  
+    pxor        xmm4,xmm4
+    pxor        xmm3,xmm3
+    pxor        xmm2,xmm2
+;sad begin
 	mov    eax,    [esp+24]
-	mov    ebx,    [esp+28]    
+	mov    ebx,    [esp+28]
     lea         esi, [ebx+2*ebx]
     SSSE3_Get16BSadHVDC [edi], [eax]
     SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
     SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
     SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-    add         edi, 64  
+    add         edi, 64
     lea         eax, [eax+4*ebx]
     SSSE3_Get16BSadHVDC [edi], [eax]
     SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
     SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
     SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-    add         edi, 64  
+    add         edi, 64
     lea         eax, [eax+4*ebx]
     SSSE3_Get16BSadHVDC [edi], [eax]
     SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
     SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
     SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-    add         edi, 64  
+    add         edi, 64
     lea         eax, [eax+4*ebx]
     SSSE3_Get16BSadHVDC [edi], [eax]
     SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
     SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
     SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-    
-    pslldq      xmm3,4 
-    por         xmm3,xmm2 
-    movhlps     xmm1,xmm3 
-    paddw       xmm3,xmm1 
+
+    pslldq      xmm3,4
+    por         xmm3,xmm2
+    movhlps     xmm1,xmm3
+    paddw       xmm3,xmm1
     movhlps     xmm0,xmm4
     paddw       xmm4,xmm0
 ; comparing order: DC H V
@@ -872,8 +872,8 @@
 	movd        esi, xmm3 ;H
 	mov         eax, [esp+36] ;lamda
 	shl         eax, 1
-	add         esi, eax 
-	add         ebx, eax 
+	add         esi, eax
+	add         ebx, eax
 	mov         edx, [esp+32]
 	cmp         ebx, esi
 	jge near   not_dc_16x16_sad
@@ -881,7 +881,7 @@
 	jge near   not_dc_h_16x16_sad
 	; for DC mode
 	mov       dword[edx], 2;I16_PRED_DC
-	mov       eax, ebx 
+	mov       eax, ebx
     sub        edi, 192
 %assign x 0
 %rep 16
@@ -890,11 +890,11 @@
 %endrep
 	jmp near return_sad_intra_16x16_x3
 not_dc_16x16_sad:
-	; for H mode 
+	; for H mode
 	cmp       esi, ecx
 	jge near   not_dc_h_16x16_sad
 	mov       dword[edx], 1;I16_PRED_H
-	mov       eax, esi 
+	mov       eax, esi
 	jmp near return_sad_intra_16x16_x3
 not_dc_h_16x16_sad:
 	; for V mode
@@ -914,12 +914,12 @@
 
 ;***********************************************************************
 ;
-;Pixel_sad_intra_ssse3 END 
+;Pixel_sad_intra_ssse3 END
 ;
 ;***********************************************************************
 ;***********************************************************************
 ;
-;Pixel_satd_wxh_sse41 BEGIN 
+;Pixel_satd_wxh_sse41 BEGIN
 ;
 ;***********************************************************************
 
@@ -934,9 +934,9 @@
 	movq             xmm2, [ecx]
 	punpcklqdq       xmm2, xmm2
 	pmaddubsw        xmm2, xmm7
-	movq             xmm3, [ecx+edx]	
-	punpcklqdq       xmm3, xmm3	
-	pmaddubsw        xmm3, xmm7	
+	movq             xmm3, [ecx+edx]
+	punpcklqdq       xmm3, xmm3
+	pmaddubsw        xmm3, xmm7
 	psubsw           xmm0, xmm2
 	psubsw           xmm1, xmm3
 	movq             xmm2, [eax+2*ebx]
@@ -948,12 +948,12 @@
 	movq             xmm4, [ecx+2*edx]
 	punpcklqdq       xmm4, xmm4
 	pmaddubsw        xmm4, xmm7
-	movq             xmm5, [ecx+edi]	
-	punpcklqdq       xmm5, xmm5	
+	movq             xmm5, [ecx+edi]
+	punpcklqdq       xmm5, xmm5
 	pmaddubsw        xmm5, xmm7
 	psubsw           xmm2, xmm4
 	psubsw           xmm3, xmm5
-	SSE2_HDMTwo4x4   xmm0, xmm1, xmm2, xmm3, xmm4	
+	SSE2_HDMTwo4x4   xmm0, xmm1, xmm2, xmm3, xmm4
 	pabsw            xmm0, xmm0
 	pabsw            xmm2, xmm2
 	pabsw            xmm1, xmm1
@@ -970,18 +970,18 @@
 	pslld            xmm2, 16
 	psrld            xmm4, 16
 	por              xmm2, xmm4
-	pmaxuw           xmm0, xmm2	
+	pmaxuw           xmm0, xmm2
 	paddw            xmm6, xmm0
 %endmacro
 
 %macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
-	MMX_DW_1_2REG    %3, %4 
-	pmaddwd     %2, %3 
-	movhlps     %4, %2 
-	paddd       %2, %4 
-	pshuflw     %4, %2,0Eh 
-	paddd       %2, %4 
-	movd		%1, %2 
+	MMX_DW_1_2REG    %3, %4
+	pmaddwd     %2, %3
+	movhlps     %4, %2
+	paddd       %2, %4
+	pshuflw     %4, %2,0Eh
+	paddd       %2, %4
+	movd		%1, %2
 %endmacro
 ;***********************************************************************
 ;
@@ -990,53 +990,53 @@
 ;***********************************************************************
 WELS_EXTERN WelsSampleSatd4x4_sse41
 WelsSampleSatd4x4_sse41:
-	push        ebx  
-	mov         eax,[esp+8] 
-	mov         ebx,[esp+12] 
-	mov         ecx,[esp+16] 
-	mov         edx,[esp+20] 
-	movdqa      xmm4,[HSwapSumSubDB1] 
-	movd        xmm2,[ecx] 
-	movd        xmm5,[ecx+edx] 
-	shufps      xmm2,xmm5,0 
-	movd        xmm3,[ecx+edx*2] 
+	push        ebx
+	mov         eax,[esp+8]
+	mov         ebx,[esp+12]
+	mov         ecx,[esp+16]
+	mov         edx,[esp+20]
+	movdqa      xmm4,[HSwapSumSubDB1]
+	movd        xmm2,[ecx]
+	movd        xmm5,[ecx+edx]
+	shufps      xmm2,xmm5,0
+	movd        xmm3,[ecx+edx*2]
 	lea         ecx, [edx*2+ecx]
-	movd        xmm5,[ecx+edx] 
-	shufps      xmm3,xmm5,0 
-	movd        xmm0,[eax] 
-	movd        xmm5,[eax+ebx] 
-	shufps      xmm0,xmm5,0 
-	movd        xmm1,[eax+ebx*2] 
+	movd        xmm5,[ecx+edx]
+	shufps      xmm3,xmm5,0
+	movd        xmm0,[eax]
+	movd        xmm5,[eax+ebx]
+	shufps      xmm0,xmm5,0
+	movd        xmm1,[eax+ebx*2]
 	lea         eax, [ebx*2+eax]
-	movd        xmm5,[eax+ebx] 
-	shufps      xmm1,xmm5,0 
-	pmaddubsw   xmm0,xmm4 
-	pmaddubsw   xmm1,xmm4 
-	pmaddubsw   xmm2,xmm4 
-	pmaddubsw   xmm3,xmm4 
-	psubw       xmm0,xmm2 
-	psubw       xmm1,xmm3 
-	movdqa      xmm2,xmm0 
-	paddw       xmm0,xmm1 
-	psubw       xmm1,xmm2 
-	movdqa      xmm2,xmm0 
-	punpcklqdq  xmm0,xmm1 
-	punpckhqdq  xmm2,xmm1 
-	movdqa      xmm1,xmm0 
-	paddw       xmm0,xmm2 
-	psubw       xmm2,xmm1 
-	movdqa      xmm1,xmm0 
-	pblendw     xmm0,xmm2,0AAh 
-	pslld       xmm2,16 
-	psrld       xmm1,16 
-	por         xmm2,xmm1 
-	pabsw       xmm0,xmm0 
-	pabsw       xmm2,xmm2 
-	pmaxsw      xmm0,xmm2 
+	movd        xmm5,[eax+ebx]
+	shufps      xmm1,xmm5,0
+	pmaddubsw   xmm0,xmm4
+	pmaddubsw   xmm1,xmm4
+	pmaddubsw   xmm2,xmm4
+	pmaddubsw   xmm3,xmm4
+	psubw       xmm0,xmm2
+	psubw       xmm1,xmm3
+	movdqa      xmm2,xmm0
+	paddw       xmm0,xmm1
+	psubw       xmm1,xmm2
+	movdqa      xmm2,xmm0
+	punpcklqdq  xmm0,xmm1
+	punpckhqdq  xmm2,xmm1
+	movdqa      xmm1,xmm0
+	paddw       xmm0,xmm2
+	psubw       xmm2,xmm1
+	movdqa      xmm1,xmm0
+	pblendw     xmm0,xmm2,0AAh
+	pslld       xmm2,16
+	psrld       xmm1,16
+	por         xmm2,xmm1
+	pabsw       xmm0,xmm0
+	pabsw       xmm2,xmm2
+	pmaxsw      xmm0,xmm2
 	SSSE3_SumWHorizon eax, xmm0, xmm5, xmm7
-	pop         ebx  
-	ret 
- 
+	pop         ebx
+	ret
+
 ;***********************************************************************
 ;
 ;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
@@ -1051,10 +1051,10 @@
 	mov    eax,    [esp+16]
 	mov    ebx,    [esp+20]
 	mov    ecx,    [esp+24]
-	mov    edx,    [esp+28]    
+	mov    edx,    [esp+28]
 	movdqa      xmm7, [HSumSubDB1]
-	lea         esi,  [ebx+ebx*2] 
-	lea         edi,  [edx+edx*2] 
+	lea         esi,  [ebx+ebx*2]
+	lea         edi,  [edx+edx*2]
 	pxor		xmm6, xmm6
 	SSE41_GetSatd8x4
 	lea			eax,	[eax+4*ebx]
@@ -1065,7 +1065,7 @@
 	pop 		esi
 	pop 		ebx
 	ret
- 
+
 ;***********************************************************************
 ;
 ;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
@@ -1078,17 +1078,17 @@
 	push   esi
 	push   edi
 	push   ebp
-%define pushsize   16	
+%define pushsize   16
 	mov    eax,    [esp+pushsize+4]
 	mov    ebx,    [esp+pushsize+8]
 	mov    ecx,    [esp+pushsize+12]
-	mov    edx,    [esp+pushsize+16]    
+	mov    edx,    [esp+pushsize+16]
 	movdqa      xmm7, [HSumSubDB1]
-	lea         esi,  [ebx+ebx*2] 
-	lea         edi,  [edx+edx*2] 
+	lea         esi,  [ebx+ebx*2]
+	lea         edi,  [edx+edx*2]
 	pxor        xmm6, xmm6
 	mov         ebp,    0
-loop_get_satd_8x16:	
+loop_get_satd_8x16:
 	SSE41_GetSatd8x4
 	lea			eax,  [eax+4*ebx]
 	lea			ecx,  [ecx+4*edx]
@@ -1116,10 +1116,10 @@
 	mov    eax,    [esp+16]
 	mov    ebx,    [esp+20]
 	mov    ecx,    [esp+24]
-	mov    edx,    [esp+28]    
+	mov    edx,    [esp+28]
 	movdqa      xmm7, [HSumSubDB1]
-	lea         esi,  [ebx+ebx*2] 
-	lea         edi,  [edx+edx*2] 
+	lea         esi,  [ebx+ebx*2]
+	lea         edi,  [edx+edx*2]
 	pxor		xmm6,   xmm6
 	SSE41_GetSatd8x4
 	lea			eax,  [eax+4*ebx]
@@ -1144,7 +1144,7 @@
 ;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
 ;
 ;***********************************************************************
-   
+
 WELS_EXTERN WelsSampleSatd16x16_sse41
 align 16
 WelsSampleSatd16x16_sse41:
@@ -1152,17 +1152,17 @@
 	push   esi
 	push   edi
 	push   ebp
-	%define pushsize   16	
+	%define pushsize   16
 	mov    eax,    [esp+pushsize+4]
 	mov    ebx,    [esp+pushsize+8]
 	mov    ecx,    [esp+pushsize+12]
-	mov    edx,    [esp+pushsize+16]    
+	mov    edx,    [esp+pushsize+16]
 	movdqa      xmm7, [HSumSubDB1]
-	lea         esi,  [ebx+ebx*2] 
-	lea         edi,  [edx+edx*2] 
+	lea         esi,  [ebx+ebx*2]
+	lea         edi,  [edx+edx*2]
 	pxor		xmm6,   xmm6
 	mov         ebp,    0
-loop_get_satd_16x16_left:	
+loop_get_satd_16x16_left:
 	SSE41_GetSatd8x4
 	lea			eax,  [eax+4*ebx]
 	lea			ecx,  [ecx+4*edx]
@@ -1206,8 +1206,8 @@
 	lea    ecx,    [ecx+2*edx]
 	movdqu xmm1,   [ecx]
 	MOVDQ  xmm2,   [eax];[eax] must aligned 16
-	psadbw xmm1,   xmm2 
-	paddw  xmm0,   xmm1	
+	psadbw xmm1,   xmm2
+	paddw  xmm0,   xmm1
 	movdqu xmm1,   [ecx+edx]
 	MOVDQ  xmm2,   [eax+ebx]
 	psadbw xmm1,   xmm2
@@ -1218,7 +1218,7 @@
 %macro SSE2_GetSad4x16 0
 	movdqu xmm0,   [ecx]
 	MOVDQ  xmm2,   [eax]
-	psadbw xmm0,   xmm2 
+	psadbw xmm0,   xmm2
 	paddw  xmm7,   xmm0
 	movdqu xmm1,   [ecx+edx]
 	MOVDQ  xmm2,   [eax+ebx]
@@ -1226,8 +1226,8 @@
 	paddw  xmm7,   xmm1
 	movdqu xmm1,   [ecx+2*edx]
 	MOVDQ  xmm2,   [eax+2*ebx];[eax] must aligned 16
-	psadbw xmm1,   xmm2 
-	paddw  xmm7,   xmm1	
+	psadbw xmm1,   xmm2
+	paddw  xmm7,   xmm1
 	movdqu xmm1,   [ecx+edi]
 	MOVDQ  xmm2,   [eax+esi]
 	psadbw xmm1,   xmm2
@@ -1265,17 +1265,17 @@
 WelsSampleSad16x16_sse2:
 	push ebx
 	push edi
-	push esi	
-		
+	push esi
+
 	%define _STACK_SIZE		12
-	
+
 	mov eax, [esp+_STACK_SIZE+4 ]
 	mov	ebx, [esp+_STACK_SIZE+8 ]
 	lea esi, [3*ebx]
 	mov ecx, [esp+_STACK_SIZE+12]
-	mov edx, [esp+_STACK_SIZE+16]	
-	lea edi, [3*edx]	
-	
+	mov edx, [esp+_STACK_SIZE+16]
+	lea edi, [3*edx]
+
 	pxor   xmm7,   xmm7
 	SSE2_GetSad4x16
 	lea   eax,    [eax+4*ebx]
@@ -1290,14 +1290,14 @@
 	movhlps xmm0, xmm7
 	paddw xmm0, xmm7
 	movd eax, xmm0
-	
-	%undef _STACK_SIZE	
-	
+
+	%undef _STACK_SIZE
+
 	pop esi
 	pop edi
 	pop ebx
 	ret
-   
+
 ;***********************************************************************
 ;
 ;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
@@ -1312,10 +1312,10 @@
 	mov    eax,    [esp+8]
 	mov    ebx,    [esp+12]
 	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]    
+	mov    edx,    [esp+20]
 	movdqu xmm0,   [ecx]
 	MOVDQ  xmm2,   [eax]
-	psadbw xmm0,   xmm2 
+	psadbw xmm0,   xmm2
 	movdqu xmm1,   [ecx+edx]
 	MOVDQ  xmm2,   [eax+ebx]
 	psadbw xmm1,   xmm2
@@ -1339,19 +1339,19 @@
 	mov    eax,    [esp+8]
 	mov    ebx,    [esp+12]
 	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]    
+	mov    edx,    [esp+20]
     pxor   xmm6,   xmm6
-	
+
 	SSE2_GetSad8x4
     lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
-    SSE2_GetSad8x4    
+    SSE2_GetSad8x4
     lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
 	SSE2_GetSad8x4
     lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
-    SSE2_GetSad8x4    
+    SSE2_GetSad8x4
 
     movhlps    xmm0, xmm6
 	paddw      xmm0, xmm6
@@ -1375,15 +1375,15 @@
 	push   edi
 	mov    eax,    [esp+12]
 	mov    ebx,    [esp+16]
-    
+
     pxor   xmm7,   xmm7
-    
+
     mov    edi,    ecx
     and    edi,    0x07
-    sub    ecx,    edi   
+    sub    ecx,    edi
     mov    edx,    8
     sub    edx,    edi
-    
+
     shl    edi,    3
     shl    edx,    3
     movd   xmm5,   edi
@@ -1391,10 +1391,10 @@
 	mov    edi,    8
 	add    edi,    ecx
     mov    edx,    [esp+24]
-    
+
     movq   xmm0,   [eax]
 	movhps xmm0,   [eax+ebx]
-		
+
 	movq   xmm1,   [ecx]
 	movq   xmm2,   [edi]
 	movhps xmm1,   [ecx+edx]
@@ -1402,17 +1402,17 @@
 	psrlq  xmm1,   xmm5
 	psllq  xmm2,   xmm6
 	por    xmm1,   xmm2
-	
+
 	psadbw xmm0,   xmm1
 	paddw  xmm7,   xmm0
-	
+
 	lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
 	lea    edi,    [edi+2*edx]
-	 
+
     movq   xmm0,   [eax]
 	movhps xmm0,   [eax+ebx]
-		
+
 	movq   xmm1,   [ecx]
 	movq   xmm2,   [edi]
 	movhps xmm1,   [ecx+edx]
@@ -1420,7 +1420,7 @@
 	psrlq  xmm1,   xmm5
 	psllq  xmm2,   xmm6
 	por    xmm1,   xmm2
-	
+
 	psadbw xmm0,   xmm1
 	paddw  xmm7,   xmm0
 
@@ -1427,10 +1427,10 @@
 	lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
 	lea    edi,    [edi+2*edx]
-	 
+
     movq   xmm0,   [eax]
 	movhps xmm0,   [eax+ebx]
-		
+
 	movq   xmm1,   [ecx]
 	movq   xmm2,   [edi]
 	movhps xmm1,   [ecx+edx]
@@ -1438,17 +1438,17 @@
 	psrlq  xmm1,   xmm5
 	psllq  xmm2,   xmm6
 	por    xmm1,   xmm2
-	
+
 	psadbw xmm0,   xmm1
 	paddw  xmm7,   xmm0
-	
+
 	lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
 	lea    edi,    [edi+2*edx]
-	 
+
     movq   xmm0,   [eax]
 	movhps xmm0,   [eax+ebx]
-		
+
 	movq   xmm1,   [ecx]
 	movq   xmm2,   [edi]
 	movhps xmm1,   [ecx+edx]
@@ -1456,10 +1456,10 @@
 	psrlq  xmm1,   xmm5
 	psllq  xmm2,   xmm6
 	por    xmm1,   xmm2
-	
+
 	psadbw xmm0,   xmm1
 	paddw  xmm7,   xmm0
-	
+
     movhlps    xmm0, xmm7
 	paddw      xmm0, xmm7
 	movd       eax,  xmm0
@@ -1469,12 +1469,12 @@
     push   ebx
     mov    eax,    [esp+8]
 	mov    ebx,    [esp+12]
-	mov    edx,    [esp+20]    
+	mov    edx,    [esp+20]
 	pxor   xmm6,   xmm6
 	SSE2_GetSad8x4
     lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
-    SSE2_GetSad8x4    
+    SSE2_GetSad8x4
     movhlps    xmm0, xmm6
 	paddw      xmm0, xmm6
 	movd       eax,  xmm0
@@ -1485,7 +1485,7 @@
 
 ;***********************************************************************
 ;
-;Pixel_sad_wxh_sse2 END 
+;Pixel_sad_wxh_sse2 END
 ;
 ;***********************************************************************
 
@@ -1492,7 +1492,7 @@
 
 ;***********************************************************************
 ;
-;Pixel_sad_4_wxh_sse2 BEGIN 
+;Pixel_sad_4_wxh_sse2 BEGIN
 ;
 ;***********************************************************************
 
@@ -1525,20 +1525,20 @@
 	movdqu xmm3,   [ecx]
 	psadbw xmm3,   xmm0
 	paddw  xmm4,   xmm3
-	
+
 	movdqa xmm1,   [eax+ebx]
 	movdqu xmm3,   [ecx+edx]
 	psadbw xmm3,   xmm1
 	paddw  xmm4,   xmm3
-	
+
 	movdqu xmm2,   [ecx+edx-1]
 	psadbw xmm2,   xmm0
 	paddw  xmm6,   xmm2
-	
+
 	movdqu xmm3,   [ecx+edx+1]
 	psadbw xmm3,   xmm0
 	paddw  xmm7,   xmm3
-	
+
 	lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
 	movdqa xmm2,   [eax]
@@ -1599,30 +1599,30 @@
 	movdqu xmm3,   [ecx]
 	psadbw xmm2,   xmm3
 	paddw xmm5,   xmm2
-	
+
 	movdqu xmm2,   [ecx-1]
 	psadbw xmm2,   xmm0
 	paddw xmm6,   xmm2
-	
+
 	movdqu xmm3,   [ecx+1]
 	psadbw xmm3,   xmm0
 	paddw xmm7,   xmm3
-	
+
 	movdqu xmm3,   [ecx+edx]
 	psadbw xmm0,   xmm3
 	paddw xmm5,   xmm0
-	
+
 	mov        ecx,  [esp+24]
 	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0 
+	paddw      xmm4, xmm0
 	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0 
+	paddw      xmm5, xmm0
 	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0 
+	paddw      xmm6, xmm0
 	movhlps    xmm0, xmm7
 	paddw      xmm7, xmm0
 	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7 
+	punpckldq  xmm6, xmm7
 	punpcklqdq xmm4, xmm6
 	movdqa     [ecx],xmm4
 	pop  ebx
@@ -1646,20 +1646,20 @@
 	movdqu xmm3,   [edi]
 	psadbw xmm3,   xmm0
 	paddw xmm4,   xmm3
-	
+
 	movdqa xmm1,   [eax+ebx]
 	movdqu xmm3,   [edi+edx]
 	psadbw xmm3,   xmm1
 	paddw xmm4,   xmm3
-	
+
 	movdqu xmm2,   [edi+edx-1]
 	psadbw xmm2,   xmm0
 	paddw xmm6,   xmm2
-	
+
 	movdqu xmm3,   [edi+edx+1]
 	psadbw xmm3,   xmm0
 	paddw xmm7,   xmm3
-	
+
 	lea    eax,    [eax+2*ebx]
 	lea    edi,    [edi+2*edx]
 	movdqa xmm2,   [eax]
@@ -1688,36 +1688,36 @@
 	movdqu xmm3,   [edi]
 	psadbw xmm0,   xmm3
 	paddw xmm5,   xmm0
-	
+
 	movdqu xmm0,   [edi-1]
 	psadbw xmm0,   xmm1
 	paddw xmm6,   xmm0
-	
+
 	movdqu xmm3,   [edi+1]
 	psadbw xmm3,   xmm1
 	paddw xmm7,   xmm3
-	
+
 	movdqu xmm3,   [edi+edx]
 	psadbw xmm1,   xmm3
 	paddw xmm5,   xmm1
-	
+
 	mov        edi,  [esp+28]
 	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0 
+	paddw      xmm4, xmm0
 	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0 
+	paddw      xmm5, xmm0
 	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0 
+	paddw      xmm6, xmm0
 	movhlps    xmm0, xmm7
 	paddw      xmm7, xmm0
 	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7 
+	punpckldq  xmm6, xmm7
 	punpcklqdq xmm4, xmm6
 	movdqa     [edi],xmm4
 	pop  edi
 	pop  ebx
 	ret
-	
+
 WELS_EXTERN WelsSampleSadFour8x16_sse2
 WelsSampleSadFour8x16_sse2:
 	push ebx
@@ -1737,10 +1737,10 @@
 	movhps xmm3,   [edi+edx]
 	psadbw xmm3,   xmm0
 	paddw  xmm4,   xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
@@ -1749,191 +1749,191 @@
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	mov        edi,  [esp+28]
 	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0 
+	paddw      xmm4, xmm0
 	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0 
+	paddw      xmm5, xmm0
 	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0 
+	paddw      xmm6, xmm0
 	movhlps    xmm0, xmm7
 	paddw      xmm7, xmm0
 	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7 
+	punpckldq  xmm6, xmm7
 	punpcklqdq xmm4, xmm6
 	movdqa     [edi],xmm4
 	pop  edi
 	pop  ebx
 	ret
-	
-	
+
+
 WELS_EXTERN WelsSampleSadFour8x8_sse2
 WelsSampleSadFour8x8_sse2:
 	push ebx
@@ -1953,10 +1953,10 @@
 	movhps xmm3,   [edi+edx]
 	psadbw xmm3,   xmm0
 	paddw  xmm4,   xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
@@ -1965,99 +1965,99 @@
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
-	
+
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	mov        edi,  [esp+28]
 	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0 
+	paddw      xmm4, xmm0
 	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0 
+	paddw      xmm5, xmm0
 	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0 
+	paddw      xmm6, xmm0
 	movhlps    xmm0, xmm7
 	paddw      xmm7, xmm0
 	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7 
+	punpckldq  xmm6, xmm7
 	punpcklqdq xmm4, xmm6
 	movdqa     [edi],xmm4
 	pop  edi
 	pop  ebx
 	ret
-	
+
 WELS_EXTERN WelsSampleSadFour4x4_sse2
 WelsSampleSadFour4x4_sse2:
 	push ebx
@@ -2080,23 +2080,23 @@
 	punpckldq  xmm1, xmm2
 	movd       xmm2, [edi+edx-1]
 	movd       xmm3, [edi+edx+1]
-	
+
 	lea        edi,  [edi+2*edx]
-	
+
 	movd       xmm4, [edi]
 	movd       xmm5, [edi-1]
 	punpckldq  xmm2, xmm5
 	movd       xmm5, [edi+1]
 	punpckldq  xmm3, xmm5
-	
+
 	movd       xmm5, [edi+edx]
 	punpckldq  xmm4, xmm5
-	
+
 	punpcklqdq xmm1, xmm4 ;-L
-	
+
 	movd       xmm5, [edi+edx-1]
 	movd       xmm6, [edi+edx+1]
-	
+
 	lea        edi,  [edi+2*edx]
 	movd       xmm7, [edi-1]
 	punpckldq  xmm5, xmm7
@@ -2107,12 +2107,12 @@
 	movd       xmm6, [edi]
 	movd       xmm7, [edi+edx]
 	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6 ;+L 
+	punpcklqdq xmm4, xmm6 ;+L
 	psadbw     xmm1, xmm0
 	psadbw     xmm2, xmm0
 	psadbw     xmm3, xmm0
 	psadbw     xmm4, xmm0
-	
+
 	movhlps    xmm0, xmm1
 	paddw      xmm1, xmm0
 	movhlps    xmm0, xmm2
@@ -2123,13 +2123,13 @@
 	paddw      xmm4, xmm0
 	mov        edi,  [esp+28]
 	punpckldq  xmm1, xmm4
-	punpckldq  xmm2, xmm3 
+	punpckldq  xmm2, xmm3
 	punpcklqdq xmm1, xmm2
 	movdqa     [edi],xmm1
 	pop  edi
 	pop  ebx
 	ret
-	
+
 ;***********************************************************************
 ;
 ;Pixel_sad_4_wxh_sse2 END
@@ -2150,40 +2150,40 @@
 %define pix2address  esp+pushsize+12
 %define pix2stride   esp+pushsize+16
 
-    mov		  eax, [pix1address]    
-    mov		  ebx, [pix1stride ]    
-    mov		  ecx, [pix2address]    
-    mov		  edx, [pix2stride ]    
+    mov		  eax, [pix1address]
+    mov		  ebx, [pix1stride ]
+    mov		  ecx, [pix2address]
+    mov		  edx, [pix2stride ]
 
 	movd	  mm0, [eax]
 	movd	  mm1, [eax+ebx]
 	punpckldq mm0, mm1
-	
+
 	movd      mm3, [ecx]
 	movd      mm4, [ecx+edx]
 	punpckldq mm3, mm4
 	psadbw    mm0, mm3
-	
+
 	lea       eax, [eax+2*ebx]
 	lea       ecx, [ecx+2*edx]
-	
+
 	movd      mm1, [eax]
 	movd      mm2, [eax+ebx]
 	punpckldq mm1, mm2
-	
+
 	movd      mm3, [ecx]
 	movd      mm4, [ecx+edx]
 	punpckldq mm3, mm4
 	psadbw    mm1, mm3
 	paddw     mm0, mm1
-	
+
     movd      eax, mm0
 
 	WELSEMMS
     pop ebx
-%undef pushsize     
-%undef pix1address	
-%undef pix1stride   
-%undef pix2address  
-%undef pix2stride   
+%undef pushsize
+%undef pix1address
+%undef pix1stride
+%undef pix2address
+%undef pix2stride
     ret
\ No newline at end of file
--- a/codec/encoder/core/asm/score.asm
+++ b/codec/encoder/core/asm/score.asm
@@ -45,7 +45,7 @@
 bits 32
 
 ;***********************************************************************
-; Macros 
+; Macros
 ;***********************************************************************
 
 ;***********************************************************************
@@ -59,7 +59,7 @@
 sse2_1: dw 1, 1, 1, 1, 1, 1, 1, 1
 align 16
 sse2_b1: db 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
-i_ds_table: db 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 
+i_ds_table: db 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 align 16
 sse2_plane_inc_minus: dw -7, -6, -5, -4, -3, -2, -1, 0
 align 16
@@ -139,7 +139,7 @@
     db  4, 8, 5, 8, 8,12, 1, 4, 4, 8
     db  4, 7, 7,11, 4, 8, 7,11, 8,11
     db 11,15, 1, 4, 3, 7, 4, 7, 7,11
-    db  3, 7, 6,10, 7,10,10,14, 4, 7 
+    db  3, 7, 6,10, 7,10,10,14, 4, 7
     db  7,11, 7,10,10,14, 7,11,10,14
     db 11,14,14,18, 0, 4, 3, 7, 3, 6
     db  6,10, 3, 7, 6,10, 7,10,10,14
@@ -191,7 +191,7 @@
 	movdqa     [eax],xmm0
 	movdqa     [eax+16], xmm1
 	ret
-	
+
 ;***********************************************************************
 ;void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct )
 ;***********************************************************************
@@ -206,7 +206,7 @@
 	pinsrw		xmm0, eax, 7			; xmm0[7]	=	[8]
 	pinsrw		xmm1, ecx, 0			; xmm1[0]	=	[7]
 	pshufb		xmm1, [pb_scanacdc_maskb]
-	pshufb		xmm0, [pb_scanacdc_maska]	
+	pshufb		xmm0, [pb_scanacdc_maska]
 
 	mov        eax,  [esp+4]
 	movdqa     [eax],xmm0
@@ -224,7 +224,7 @@
 	movdqa     xmm2, xmm0
 	punpcklqdq xmm0, xmm1
 	punpckhqdq xmm2, xmm1
-	
+
 	movdqa     xmm3, xmm0
 	punpckldq  xmm0, xmm2
 	punpckhdq  xmm3, xmm2
@@ -236,10 +236,10 @@
 	pextrw     edx,  xmm3, 0
 	pinsrw     xmm3, eax,  0
 	pinsrw     xmm0, edx,  3
-	
+
 	pshufhw    xmm1, xmm0, 0x93
 	pshuflw    xmm2, xmm3, 0x39
-    
+
     movdqa     xmm3, xmm2
     psrldq     xmm1, 2
     pslldq     xmm3, 14
@@ -255,13 +255,13 @@
 ;void int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
 ;***********************************************************************
 ALIGN 16
-WELS_EXTERN WelsCalculateSingleCtr4x4_sse2 
+WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
 WelsCalculateSingleCtr4x4_sse2:
 	push      ebx
 	mov       eax,  [esp+8]
 	movdqa    xmm0, [eax]
 	movdqa    xmm1, [eax+16]
-	
+
 	packsswb  xmm0, xmm1
 
     pxor      xmm3, xmm3
@@ -317,7 +317,7 @@
 	and       edx,  0xff
 	shr       ecx,  8
 ;	and       ecx,  0xff	; we do not need this due to high 16bits equal to 0 yet
-	xor       eax,  eax	
+	xor       eax,  eax
 	add       al,  [nozero_count_table+ecx]
 	add       al,  [nozero_count_table+edx]
 	ret
--- a/codec/encoder/core/asm/vaa.asm
+++ b/codec/encoder/core/asm/vaa.asm
@@ -38,7 +38,7 @@
 ;*      04/14/2010	Created
 ;*		06/07/2010	Added AnalysisVaaInfoIntra_sse2(ssse3)
 ;*		06/10/2010	Tune rc_sad_frame_sse2 and got about 40% improvement
-;*		08/11/2010	Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2 
+;*		08/11/2010	Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
 ;*
 ;*************************************************************************/
 %include "asm_inc.asm"
@@ -167,7 +167,7 @@
 	mov ebp, esp
 	and ebp, 0fh
 	sub esp, ebp
-	sub esp, 32	
+	sub esp, 32
 	%define PUSH_SIZE	52	; 20 + 32
 
 	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
@@ -179,31 +179,31 @@
 	add edx, ecx		; iLineSize x 3 [edx]
 	mov eax, ebx
 	sal eax, $1			; iLineSize x 4 [eax]
-	
+
 	pxor xmm7, xmm7
-	
+
 	; loops
 	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp], xmm0	
+	movq [esp], xmm0
 
 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+8], xmm0	
+	movq [esp+8], xmm0
 
 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+16], xmm0	
+	movq [esp+16], xmm0
 
 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
 	movq [esp+24], xmm0
-		
+
 	movdqa xmm0, [esp]		; block 0~7
 	movdqa xmm1, [esp+16]	; block 8~15
 	movdqa xmm2, xmm0
 	paddw xmm0, xmm1
 	SUM_WORD_8x2_SSE2 xmm0, xmm3
-	
+
 	pmullw xmm1, xmm1
 	pmullw xmm2, xmm2
 	movdqa xmm3, xmm1
@@ -219,7 +219,7 @@
 	paddd xmm1, xmm2
 	pshufd xmm2, xmm1, 0B1h
 	paddd xmm1, xmm2
-	
+
 	movd ebx, xmm0
 	and ebx, 0ffffh		; effective low word truncated
 	mov ecx, ebx
@@ -227,7 +227,7 @@
 	sar ebx, $4
 	movd eax, xmm1
 	sub eax, ebx
-	
+
 	%undef PUSH_SIZE
 	add esp, 32
 	add esp, ebp
@@ -253,7 +253,7 @@
 	mov ebp, esp
 	and ebp, 0fh
 	sub esp, ebp
-	sub esp, 32	
+	sub esp, 32
 	%define PUSH_SIZE	52	; 20 + 32
 
 	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
@@ -265,25 +265,25 @@
 	add edx, ecx		; iLineSize x 3 [edx]
 	mov eax, ebx
 	sal eax, $1			; iLineSize x 4 [eax]
-	
+
 	pxor xmm7, xmm7
-	
+
 	; loops
 	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp], xmm0	
+	movq [esp], xmm0
 
 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
-	movq [esp+8], xmm1	
+	movq [esp+8], xmm1
 
 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+16], xmm0	
+	movq [esp+16], xmm0
 
 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
 	movq [esp+24], xmm1
-		
+
 	movdqa xmm0, [esp]		; block 0~7
 	movdqa xmm1, [esp+16]	; block 8~15
 	movdqa xmm2, xmm0
@@ -305,7 +305,7 @@
 	paddd xmm1, xmm2
 	pshufd xmm2, xmm1, 0B1h
 	paddd xmm1, xmm2
-	
+
 	movd ebx, xmm0
 	and ebx, 0ffffh		; effective low work truncated
 	mov ecx, ebx
@@ -313,7 +313,7 @@
 	sar ebx, $4
 	movd eax, xmm1
 	sub eax, ebx
-	
+
 	%undef PUSH_SIZE
 	add esp, 32
 	add esp, ebp
@@ -323,7 +323,7 @@
 	pop edx
 	pop ebx
 	ret
-	
+
 WELS_EXTERN MdInterAnalysisVaaInfo_sse41
 ;***********************************************************************
 ;	uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
@@ -331,11 +331,11 @@
 ALIGN 16
 MdInterAnalysisVaaInfo_sse41:
 	mov eax, [esp+4]
-	movdqa xmm0, [eax]	; load 4 sad_8x8	
+	movdqa xmm0, [eax]	; load 4 sad_8x8
 	pshufd xmm1, xmm0, 01Bh
 	paddd xmm1, xmm0
 	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2	
+	paddd xmm1, xmm2
 	psrad xmm1, 02h		; iAverageSad
 	movdqa xmm2, xmm1
 	psrad xmm2, 06h
@@ -342,7 +342,7 @@
 	movdqa xmm3, xmm0	; iSadBlock
 	psrad xmm3, 06h
 	psubd xmm3, xmm2
-	pmulld xmm3, xmm3	; [comment]: pmulld from SSE4.1 instruction sets	
+	pmulld xmm3, xmm3	; [comment]: pmulld from SSE4.1 instruction sets
 	pshufd xmm4, xmm3, 01Bh
 	paddd xmm4, xmm3
 	pshufd xmm3, xmm4, 0B1h
@@ -354,7 +354,7 @@
 	pcmpgtd xmm0, xmm1	; iSadBlock > iAverageSad
 	movmskps eax, xmm0
 	ret
-.threshold_exit:	
+.threshold_exit:
 	mov eax, 15
 	ret
 
@@ -365,11 +365,11 @@
 ALIGN 16
 MdInterAnalysisVaaInfo_sse2:
 	mov eax, [esp+4]
-	movdqa xmm0, [eax]	; load 4 sad_8x8	
+	movdqa xmm0, [eax]	; load 4 sad_8x8
 	pshufd xmm1, xmm0, 01Bh
 	paddd xmm1, xmm0
 	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2	
+	paddd xmm1, xmm2
 	psrad xmm1, 02h		; iAverageSad
 	movdqa xmm2, xmm1
 	psrad xmm2, 06h
@@ -376,9 +376,9 @@
 	movdqa xmm3, xmm0	; iSadBlock
 	psrad xmm3, 06h
 	psubd xmm3, xmm2
-	
+
 	; to replace pmulld functionality as below
-	movdqa xmm2, xmm3	
+	movdqa xmm2, xmm3
 	pmuludq xmm2, xmm3
 	pshufd xmm4, xmm3, 0B1h
 	pmuludq xmm4, xmm4
@@ -385,8 +385,8 @@
 	movdqa xmm5, xmm2
 	punpckldq xmm5, xmm4
 	punpckhdq xmm2, xmm4
-	punpcklqdq xmm5, xmm2	
-	
+	punpcklqdq xmm5, xmm2
+
 	pshufd xmm4, xmm5, 01Bh
 	paddd xmm4, xmm5
 	pshufd xmm5, xmm4, 0B1h
@@ -398,6 +398,6 @@
 	pcmpgtd xmm0, xmm1	; iSadBlock > iAverageSad
 	movmskps eax, xmm0
 	ret
-.threshold_exit:	
+.threshold_exit:
 	mov eax, 15
 	ret
--- a/codec/encoder/core/inc/IWelsVP.h
+++ b/codec/encoder/core/inc/IWelsVP.h
@@ -1,306 +1,288 @@
-/*!
- * \copy
- *     Copyright (c)  2004-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	    :  IWelsVP.h
- *
- * \brief	    :  Interface of wels video processor class
- *
- * \date        :  2011/01/04
- *
- * \description :  1. should support both C/C++ style interface
- *                 2. should concern with the feature extension requirement 
- *                 3. should care the usage of "char"==>
- *                     1) value char  : signed char/unsigned char
- *                     2) string char : char
- *
- *************************************************************************************
- */
-
-#ifndef _IWELSVP_H_
-#define _IWELSVP_H_ 
-
-#ifdef _WIN32
-#define WELSAPI __stdcall
-#else
-#define WELSAPI 
-#endif
-
-#define WELSVP_MAJOR_VERSION   1
-#define WELSVP_MINOR_VERSION   1
-#define WELSVP_VERSION         ((WELSVP_MAJOR_VERSION << 8) + WELSVP_MINOR_VERSION)
-
-typedef enum 
-{
-	RET_SUCCESS          =  0,
-	RET_FAILED           = -1,
-	RET_INVALIDPARAM     = -2,
-	RET_OUTOFMEMORY      = -3,
-	RET_NOTSUPPORTED       = -4,
-	RET_UNEXPECTED       = -5,
-	RET_NEEDREINIT		  = -6
-} EResult;
-
-typedef enum 
-{ 
-	VIDEO_FORMAT_NULL       = 0,   /* invalid format   */
-	/*rgb color formats*/
-	VIDEO_FORMAT_RGB        = 1,   /* rgb 24bits       */
-	VIDEO_FORMAT_RGBA       = 2,   /* rgba             */
-	VIDEO_FORMAT_RGB555     = 3,   /* rgb555           */
-	VIDEO_FORMAT_RGB565     = 4,   /* rgb565           */
-	VIDEO_FORMAT_BGR        = 5,   /* bgr 24bits       */
-	VIDEO_FORMAT_BGRA       = 6,   /* bgr 32bits       */
-	VIDEO_FORMAT_ABGR       = 7,   /* abgr             */
-	VIDEO_FORMAT_ARGB       = 8,   /* argb             */
-
-	/*yuv color formats*/
-	VIDEO_FORMAT_YUY2       = 20,   /* yuy2             */
-	VIDEO_FORMAT_YVYU       = 21,   /* yvyu             */
-	VIDEO_FORMAT_UYVY       = 22,   /* uyvy             */
-	VIDEO_FORMAT_I420       = 23,   /* yuv 4:2:0 planar */              
-	VIDEO_FORMAT_YV12       = 24,   /* yuv 4:2:0 planar */
-	VIDEO_FORMAT_INTERNAL   = 25,   /* Only Used for SVC decoder testbed */ 
-	VIDEO_FORMAT_NV12		= 26,	/* y planar + uv packed */
-	VIDEO_FORMAT_I422       = 27,   /* yuv 4:2:2 planar */
-	VIDEO_FORMAT_I444       = 28,   /* yuv 4:4:4 planar */
-	VIDEO_FORMAT_YUYV       = 20,   /* yuv 4:2:2 packed */
-
-	
-	VIDEO_FORMAT_RGB24      = 1,
-	VIDEO_FORMAT_RGB32      = 2,
-	VIDEO_FORMAT_RGB24_INV  = 5,
-	VIDEO_FORMAT_RGB32_INV  = 6,
-	VIDEO_FORMAT_RGB555_INV = 7,
-	VIDEO_FORMAT_RGB565_INV = 8,
-	VIDEO_FORMAT_YUV2       = 21,
-	VIDEO_FORMAT_420        = 23,
-
-
-	VIDEO_FORMAT_VFlip      = 0x80000000 
-} EVideoFormat;
-
-typedef enum 
-{ 
-	BUFFER_HOSTMEM  = 0,
-	BUFFER_SURFACE
-} EPixMapBufferProperty;
-
-typedef struct
-{
-  int iRectTop;
-  int iRectLeft;
-  int iRectWidth;
-  int iRectHeight;
-} SRect;
-
-typedef struct
-{
-	void        *pPixel[3]; 
-	int          iSizeInBits;
-	int          iStride[3];
-	SRect        sRect;	
-	EVideoFormat eFormat;
-	EPixMapBufferProperty eProperty;//not use? to remove? but how about the size of SPixMap?
-} SPixMap;
-
-typedef enum
-{	
-	METHOD_NULL              = 0,
-	METHOD_COLORSPACE_CONVERT    ,//not support yet
-	METHOD_DENOISE              ,
-	METHOD_SCENE_CHANGE_DETECTION ,
-	METHOD_DOWNSAMPLE			  ,
-	METHOD_VAA_STATISTICS        ,
-    METHOD_BACKGROUND_DETECTION  ,
-	METHOD_ADAPTIVE_QUANT ,
-	METHOD_COMPLEXITY_ANALYSIS   ,
-	METHOD_IMAGE_ROTATE		  ,
-	METHOD_MASK                 
-} EMethods;
-
-//-----------------------------------------------------------------//
-//  Algorithm parameters define
-//-----------------------------------------------------------------//
-
-typedef struct
-{
-	int bSceneChangeFlag; // 0:false ; 1:true
-} SSceneChangeResult;
-
-typedef enum
-{
-	SIMILAR_SCENE,      //similar scene 
-	MEDIUM_CHANGED_SCENE,   //medium changed scene
-	LARGE_CHANGED_SCENE,   //large changed scene
-} ESceneChangeIdc;
-
-typedef struct
-{
-	unsigned char *pCurY;					// Y data of current frame
-	unsigned char *pRefY;					// Y data of pRef frame for diff calc
-	int (*pSad8x8)[4];				// sad of 8x8, every 4 in the same 16x16 get together
-	int *pSsd16x16;					// sum of square difference of 16x16
-	int *pSum16x16;					// sum of 16x16
-	int *pSumOfSquare16x16;					// sum of square of 16x16
-	int	(*pSumOfDiff8x8)[4];
-	unsigned char	(*pMad8x8)[4];
-	int iFrameSad;					// sad of frame
-} SVAACalcResult;
-
-typedef struct
-{
-	int iCalcVar;
-	int iCalcBgd;
-	int iCalcSsd;
-	int iReserved;
-	SVAACalcResult	*pCalcResult;
-} SVAACalcParam;
-
-typedef struct
-{
-	signed char		*pBackgroundMbFlag;
-	SVAACalcResult  *pCalcRes;
-} SBGDInterface;
-
-typedef enum
-{
-	AQ_QUALITY_MODE,   //Quality mode
-	AQ_BITRATE_MODE,   //Bitrate mode
-}EAQModes;
-
-typedef struct 
-{
-	unsigned short    uiMotionIndex;
-	unsigned short    uiTextureIndex;
-} SMotionTextureUnit;
-
-typedef struct
-{
-	int					iAdaptiveQuantMode; // 0:quality mode, 1:bitrates mode
-	SVAACalcResult		*pCalcResult;
-	SMotionTextureUnit  *pMotionTextureUnit;
-
-	signed char			*pMotionTextureIndexToDeltaQp;	
-	double				dAverMotionTextureIndexToDeltaQp;
-} SAdaptiveQuantizationParam;
-
-typedef enum 
-{
-	FRAME_SAD     =  0,
-	GOM_SAD       = -1,
-	GOM_VAR       = -2
-} EComplexityAnalysisMode;
-
-typedef struct
-{
-	int  iComplexityAnalysisMode;
-	int  iCalcBgd;
-	int  iMbNumInGom;		
-	int  iFrameComplexity;
-	int  *pGomComplexity;
-	int  *pGomForegroundBlockNum;
-	signed char  *pBackgroundMbFlag;
-	unsigned int *uiRefMbType;
-	SVAACalcResult  *pCalcResult;
-} SComplexityAnalysisParam;
-
-/////////////////////////////////////////////////////////////////////////////////////////////
-
-typedef struct 
-{
-	void    *pCtx;
-	EResult (*Init)    (void *pCtx, int iType, void *pCfg);
-	EResult (*Uninit)  (void *pCtx, int iType);
-	EResult (*Flush)   (void *pCtx, int iType);
-	EResult (*Process) (void *pCtx, int iType, SPixMap *pSrc, SPixMap *dst); 
-	EResult (*Get)     (void *pCtx, int iType, void *pParam); 
-	EResult (*Set)     (void *pCtx, int iType, void *pParam); 
-	EResult (*SpecialFeature) (void *pCtx, int iType, void *pIn, void *pOut);
-} IWelsVPc;
-
-#if defined(__cplusplus) && !defined(CINTERFACE)  /* C++ style interface */
-
-class IWelsVP
-{
-public:
-	virtual ~IWelsVP() {}
-
-public:		
-	virtual EResult Init    (int iType, void *pCfg) = 0; 
-	virtual EResult Uninit  (int iType) = 0;
-	virtual EResult Flush   (int iType) = 0;
-	virtual EResult Process (int iType, SPixMap *pSrc, SPixMap *dst) = 0; 
-	virtual EResult Get     (int iType, void *pParam) = 0; 
-	virtual EResult Set     (int iType, void *pParam) = 0; 
-	virtual EResult SpecialFeature (int iType, void *pIn, void *pOut) = 0;
-};
-
-/* Recommend to invoke the interface via the micro for convenient */
-#define IWelsVPFunc_Init(p, a, b)                  (p)->Init(a, b)              
-#define IWelsVPFunc_Uninit(p, a)                   (p)->Uninit(a)               
-#define IWelsVPFunc_Flush(p, a)                    (p)->Flush(a)                
-#define IWelsVPFunc_Process(p, a, b, c)            (p)->Process(a, b, c)        
-#define IWelsVPFunc_Get(p, a, b)                   (p)->Get(a, b)               
-#define IWelsVPFunc_Set(p, a, b)                   (p)->Set(a, b)               
-#define IWelsVPFunc_SpecialFeature(p, a, b, c)     (p)->SpecialFeature(a, b, c)
-
-/* C++ interface version */
-#define WELSVP_INTERFACE_VERION                    (0x8000 + (WELSVP_VERSION & 0x7fff)) 
-#define WELSVP_EXTERNC_BEGIN                       extern "C" {
-#define WELSVP_EXTERNC_END                         }
-
-#else    /* C style interface */
-
-/* Recommend to invoke the interface via the micro for convenient */
-#define IWelsVPFunc_Init(p, a, b)                  (p)->Init(p->h, a, b)              
-#define IWelsVPFunc_Uninit(p, a)                   (p)->Uninit(p->h, a)               
-#define IWelsVPFunc_Flush(p, a)                    (p)->Flush(p->h, a)                
-#define IWelsVPFunc_Process(p, a, b, c)            (p)->Process(p->h, a, b, c)        
-#define IWelsVPFunc_Get(p, a, b)                   (p)->Get(p->h, a, b)               
-#define IWelsVPFunc_Set(p, a, b)                   (p)->Set(p->h, a, b)               
-#define IWelsVPFunc_SpecialFeature(p, a, b, c)     (p)->SpecialFeature(p->h, a, b, c) 
-
-/* C interface version */
-#define WELSVP_INTERFACE_VERION                    (0x0001 + (WELSVP_VERSION & 0x7fff)) 
-#define WELSVP_EXTERNC_BEGIN                      
-#define WELSVP_EXTERNC_END                       
-
-#endif
-
-WELSVP_EXTERNC_BEGIN
-EResult WELSAPI CreateVpInterface   (void **ppCtx, int iVersion /*= WELSVP_INTERFACE_VERION*/);
-EResult WELSAPI DestroyVpInterface  (void *pCtx , int iVersion /*= WELSVP_INTERFACE_VERION*/);
-WELSVP_EXTERNC_END
-
-//////////////////////////////////////////////////////////////////////////////////////////////
-#endif // _IWELSVP_H_
-
-
+/*!
+ * \copy
+ *     Copyright (c)  2004-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	    :  IWelsVP.h
+ *
+ * \brief	    :  Interface of wels video processor class
+ *
+ * \date        :  2011/01/04
+ *
+ * \description :  1. should support both C and C++ style interfaces
+ *                 2. should allow for future feature extension requirements
+ *                 3. should take care with the usage of "char":
+ *                     1) char used as a value  : signed char/unsigned char
+ *                     2) char used as a string : char
+ *
+ *************************************************************************************
+ */
+
+#ifndef _IWELSVP_H_
+#define _IWELSVP_H_
+
+#ifdef _WIN32
+#define WELSAPI __stdcall
+#else
+#define WELSAPI
+#endif
+
+#define WELSVP_MAJOR_VERSION   1
+#define WELSVP_MINOR_VERSION   1
+#define WELSVP_VERSION         ((WELSVP_MAJOR_VERSION << 8) + WELSVP_MINOR_VERSION)
+
+typedef enum {           /* result type of every IWelsVP / IWelsVPc entry point: 0 on success, negative otherwise */
+  RET_SUCCESS      =  0,
+  RET_FAILED       = -1,
+  RET_INVALIDPARAM = -2,
+  RET_OUTOFMEMORY  = -3,
+  RET_NOTSUPPORTED = -4,
+  RET_UNEXPECTED   = -5,
+  RET_NEEDREINIT   = -6
+} EResult;
+
+typedef enum {
+  VIDEO_FORMAT_NULL       = 0,    /* invalid format   */
+  /* rgb color formats */
+  VIDEO_FORMAT_RGB        = 1,    /* rgb 24bits       */
+  VIDEO_FORMAT_RGBA       = 2,    /* rgba             */
+  VIDEO_FORMAT_RGB555     = 3,    /* rgb555           */
+  VIDEO_FORMAT_RGB565     = 4,    /* rgb565           */
+  VIDEO_FORMAT_BGR        = 5,    /* bgr 24bits       */
+  VIDEO_FORMAT_BGRA       = 6,    /* bgr 32bits       */
+  VIDEO_FORMAT_ABGR       = 7,    /* abgr             */
+  VIDEO_FORMAT_ARGB       = 8,    /* argb             */
+
+  /* yuv color formats */
+  VIDEO_FORMAT_YUY2       = 20,   /* yuy2             */
+  VIDEO_FORMAT_YVYU       = 21,   /* yvyu             */
+  VIDEO_FORMAT_UYVY       = 22,   /* uyvy             */
+  VIDEO_FORMAT_I420       = 23,   /* yuv 4:2:0 planar */
+  VIDEO_FORMAT_YV12       = 24,   /* yuv 4:2:0 planar */
+  VIDEO_FORMAT_INTERNAL   = 25,   /* Only Used for SVC decoder testbed */
+  VIDEO_FORMAT_NV12       = 26,   /* y planar + uv packed */
+  VIDEO_FORMAT_I422       = 27,   /* yuv 4:2:2 planar */
+  VIDEO_FORMAT_I444       = 28,   /* yuv 4:4:4 planar */
+  VIDEO_FORMAT_YUYV       = 20,   /* yuv 4:2:2 packed; same value as VIDEO_FORMAT_YUY2 */
+
+  /* additional names that reuse numeric values already assigned above */
+  VIDEO_FORMAT_RGB24      = 1,    /* == VIDEO_FORMAT_RGB  */
+  VIDEO_FORMAT_RGB32      = 2,    /* == VIDEO_FORMAT_RGBA */
+  VIDEO_FORMAT_RGB24_INV  = 5,    /* == VIDEO_FORMAT_BGR  */
+  VIDEO_FORMAT_RGB32_INV  = 6,    /* == VIDEO_FORMAT_BGRA */
+  VIDEO_FORMAT_RGB555_INV = 7,    /* == VIDEO_FORMAT_ABGR */
+  VIDEO_FORMAT_RGB565_INV = 8,    /* == VIDEO_FORMAT_ARGB */
+  VIDEO_FORMAT_YUV2       = 21,   /* == VIDEO_FORMAT_YVYU */
+  VIDEO_FORMAT_420        = 23,   /* == VIDEO_FORMAT_I420 */
+
+  /* bit 31 only, far above the format indices; presumably OR-ed onto a format as a flip flag - TODO confirm */
+  VIDEO_FORMAT_VFlip      = 0x80000000
+} EVideoFormat;
+
+typedef enum {
+  BUFFER_HOSTMEM = 0,
+  BUFFER_SURFACE = 1    /* value spelled out; it was previously implied by position */
+} EPixMapBufferProperty;
+
+typedef struct {    /* rectangle given as a top/left position plus a width and a height, all plain ints */
+  int iRectTop;
+  int iRectLeft;
+  int iRectWidth;
+  int iRectHeight;
+} SRect;
+
+typedef struct {    /* picture descriptor handed to IWelsVP::Process as source and destination */
+  void*        pPixel[3];     /* up to three data pointers - presumably one per plane, TODO confirm */
+  int          iSizeInBits;
+  int          iStride[3];    /* one stride per pPixel entry */
+  SRect        sRect;
+  EVideoFormat eFormat;
+  EPixMapBufferProperty eProperty; /* possibly unused; open question whether it can be removed, since that would change sizeof (SPixMap) */
+} SPixMap;
+
+typedef enum {                             /* method identifiers, numbered consecutively from 0 */
+  METHOD_NULL                   = 0,
+  METHOD_COLORSPACE_CONVERT     = 1,       /* not supported yet */
+  METHOD_DENOISE                = 2,
+  METHOD_SCENE_CHANGE_DETECTION = 3,
+  METHOD_DOWNSAMPLE             = 4,
+  METHOD_VAA_STATISTICS         = 5,
+  METHOD_BACKGROUND_DETECTION   = 6,
+  METHOD_ADAPTIVE_QUANT         = 7,
+  METHOD_COMPLEXITY_ANALYSIS    = 8,
+  METHOD_IMAGE_ROTATE           = 9,
+  METHOD_MASK                   = 10
+} EMethods;
+
+//-----------------------------------------------------------------//
+//  Algorithm parameters define
+//-----------------------------------------------------------------//
+
+typedef struct {
+  int bSceneChangeFlag; /* boolean carried in an int: 0 = false, 1 = true */
+} SSceneChangeResult;
+
+typedef enum {             /* values are implicit: 0, 1, 2 in declaration order */
+  SIMILAR_SCENE,           /* similar scene */
+  MEDIUM_CHANGED_SCENE,    /* medium changed scene */
+  LARGE_CHANGED_SCENE      /* large changed scene; no trailing comma, which C89 and C++03 reject */
+} ESceneChangeIdc;
+
+typedef struct {
+  unsigned char* pCurY;                /* Y data of current frame */
+  unsigned char* pRefY;                /* Y data of pRef frame for diff calc */
+  int (*pSad8x8)[4];                   /* sad of 8x8; the 4 values belonging to one 16x16 are stored together */
+  int* pSsd16x16;                      /* sum of square difference of 16x16 */
+  int* pSum16x16;                      /* sum of 16x16 */
+  int* pSumOfSquare16x16;              /* sum of square of 16x16 */
+  int (*pSumOfDiff8x8)[4];             /* rows of 4 ints, same shape as pSad8x8 */
+  unsigned char (*pMad8x8)[4];         /* rows of 4 bytes */
+  int iFrameSad;                       /* sad of frame */
+} SVAACalcResult;
+
+typedef struct {                   /* three int settings, one reserved int, and the SVAACalcResult to work with */
+  int             iCalcVar;
+  int             iCalcBgd;
+  int             iCalcSsd;
+  int             iReserved;
+  SVAACalcResult* pCalcResult;
+} SVAACalcParam;
+
+typedef struct {
+  signed char*    pBackgroundMbFlag;
+  SVAACalcResult* pCalcRes;          /* same result type that SVAACalcParam points to */
+} SBGDInterface;
+
+typedef enum {          /* values are implicit: 0 and 1 in declaration order */
+  AQ_QUALITY_MODE,      /* Quality mode */
+  AQ_BITRATE_MODE       /* Bitrate mode; no trailing comma, which C89 and C++03 reject */
+} EAQModes;
+
+typedef struct {                    /* pair of 16-bit-or-wider unsigned indices */
+  unsigned short uiMotionIndex;
+  unsigned short uiTextureIndex;
+} SMotionTextureUnit;
+
+typedef struct {
+  int                 iAdaptiveQuantMode;   /* 0: quality mode, 1: bitrate mode (same numbering as EAQModes) */
+  SVAACalcResult*     pCalcResult;
+  SMotionTextureUnit* pMotionTextureUnit;
+
+  signed char*        pMotionTextureIndexToDeltaQp;
+  double              dAverMotionTextureIndexToDeltaQp;
+} SAdaptiveQuantizationParam;
+
+typedef enum {       /* zero for the frame-level mode, negative values for the two GOM modes */
+  FRAME_SAD =  0,
+  GOM_SAD   = -1,
+  GOM_VAR   = -2
+} EComplexityAnalysisMode;
+
+typedef struct {
+  int             iComplexityAnalysisMode;   /* presumably an EComplexityAnalysisMode value - TODO confirm */
+  int             iCalcBgd;
+  int             iMbNumInGom;
+  int             iFrameComplexity;
+  int*            pGomComplexity;
+  int*            pGomForegroundBlockNum;
+  signed char*    pBackgroundMbFlag;
+  unsigned int*   uiRefMbType;               /* a pointer despite the "ui" prefix */
+  SVAACalcResult* pCalcResult;
+} SComplexityAnalysisParam;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+
+typedef struct {    /* C-style function-pointer table; every entry takes a context pointer as its first argument */
+  void*    pCtx;    /* context pointer stored with the table; presumably the value to pass as each call's first argument - TODO confirm */
+  EResult (*Init) (void* pCtx, int iType, void* pCfg);
+  EResult (*Uninit) (void* pCtx, int iType);
+  EResult (*Flush) (void* pCtx, int iType);
+  EResult (*Process) (void* pCtx, int iType, SPixMap* pSrc, SPixMap* dst);
+  EResult (*Get) (void* pCtx, int iType, void* pParam);
+  EResult (*Set) (void* pCtx, int iType, void* pParam);
+  EResult (*SpecialFeature) (void* pCtx, int iType, void* pIn, void* pOut);
+} IWelsVPc;
+
+#if defined(__cplusplus) && !defined(CINTERFACE)  /* C++ style interface */
+
+class IWelsVP {    /* abstract C++ interface: seven pure virtual operations, each returning EResult */
+ public:
+  virtual ~IWelsVP() {}    /* virtual destructor with an empty body */
+
+ public:
+  virtual EResult Init (int iType, void* pCfg) = 0;
+  virtual EResult Uninit (int iType) = 0;
+  virtual EResult Flush (int iType) = 0;
+  virtual EResult Process (int iType, SPixMap* pSrc, SPixMap* dst) = 0;    /* source and destination are both SPixMap descriptors */
+  virtual EResult Get (int iType, void* pParam) = 0;
+  virtual EResult Set (int iType, void* pParam) = 0;
+  virtual EResult SpecialFeature (int iType, void* pIn, void* pOut) = 0;
+};
+
+/* It is recommended to invoke the interface through these macros, for convenience */
+#define IWelsVPFunc_Init(p, a, b)               (p)->Init((a), (b))
+#define IWelsVPFunc_Uninit(p, a)                (p)->Uninit((a))
+#define IWelsVPFunc_Flush(p, a)                 (p)->Flush((a))
+#define IWelsVPFunc_Process(p, a, b, c)         (p)->Process((a), (b), (c))
+#define IWelsVPFunc_Get(p, a, b)                (p)->Get((a), (b))
+#define IWelsVPFunc_Set(p, a, b)                (p)->Set((a), (b))
+#define IWelsVPFunc_SpecialFeature(p, a, b, c)  (p)->SpecialFeature((a), (b), (c))
+
+/* C++ interface version: bit 15 set, low 15 bits taken from WELSVP_VERSION */
+#define WELSVP_INTERFACE_VERION                 (0x8000 + (WELSVP_VERSION & 0x7fff))
+#define WELSVP_EXTERNC_BEGIN                    extern "C" {
+#define WELSVP_EXTERNC_END                      }
+
+#else    /* C style interface */
+
+/* It is recommended to invoke the interface through these macros, for convenience; each one passes the table's own pCtx member as the context argument */
+#define IWelsVPFunc_Init(p, a, b)                  (p)->Init((p)->pCtx, a, b)
+#define IWelsVPFunc_Uninit(p, a)                   (p)->Uninit((p)->pCtx, a)
+#define IWelsVPFunc_Flush(p, a)                    (p)->Flush((p)->pCtx, a)
+#define IWelsVPFunc_Process(p, a, b, c)            (p)->Process((p)->pCtx, a, b, c)
+#define IWelsVPFunc_Get(p, a, b)                   (p)->Get((p)->pCtx, a, b)
+#define IWelsVPFunc_Set(p, a, b)                   (p)->Set((p)->pCtx, a, b)
+#define IWelsVPFunc_SpecialFeature(p, a, b, c)     (p)->SpecialFeature((p)->pCtx, a, b, c)
+
+/* C interface version: 1 plus the low 15 bits of WELSVP_VERSION */
+#define WELSVP_INTERFACE_VERION                    (0x0001 + (WELSVP_VERSION & 0x7fff))
+#define WELSVP_EXTERNC_BEGIN
+#define WELSVP_EXTERNC_END
+
+#endif
+
+WELSVP_EXTERNC_BEGIN
+EResult WELSAPI CreateVpInterface (void** ppCtx, int iVersion /* intended value: WELSVP_INTERFACE_VERION; there is no actual default argument */);
+EResult WELSAPI DestroyVpInterface (void* pCtx , int iVersion /* intended value: WELSVP_INTERFACE_VERION; there is no actual default argument */);
+WELSVP_EXTERNC_END
+
+//////////////////////////////////////////////////////////////////////////////////////////////
+#endif // _IWELSVP_H_
+
+
--- a/codec/encoder/core/inc/array_stack_align.h
+++ b/codec/encoder/core/inc/array_stack_align.h
@@ -1,121 +1,121 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file		array_stack_align.h
- *
- * \brief	promised alignment of array pData declaration on stack
- *			multidimensional array can be extended if applicable need
- *
- * \date		8/8/2011 Created 
- *			8/12/2011 functionality implementation for multidimensional array
- *			8/26/2011 better solution with reducing extra memory used, 
- *						stack size is adaptively reduced by _tp & _al
- *
- *************************************************************************************
- */
-#ifndef ARRAY_STACK_ALIGN_H__
-#define ARRAY_STACK_ALIGN_H__
-
-#include <assert.h>
-#include "typedefs.h"
-
-/*
- * ENFORCE_STACK_ALIGN_1D: force 1 dimension local pData aligned in stack
- * _tp: type
- * _nm: var name
- * _sz: size
- * _al: align bytes
- * auxiliary var: _nm ## _tEmP
- * NOTE: _al should be power-of-2 and >= sizeof(_tp), before considering to use such macro
- */
-
-//#define ENFORCE_STACK_ALIGN_1D(_tp, _nm, _sz, _al) \
-//_tp _nm ## _tEmP[(_sz)+(_al)-1]; \
-//_tp *_nm = _nm ## _tEmP + ((_al)-1); \
-//_nm -= (((int32_t)_nm & ((_al)-1))/sizeof(_tp));
-
-/* Another better solution with reducing extra memory used */
-#define ENFORCE_STACK_ALIGN_1D(_tp, _nm, _sz, _al) \
-assert( ((_al) && !((_al) & ((_al) - 1))) && ((_al) >= sizeof(_tp)) ); /*_al should be power-of-2 and >= sizeof(_tp)*/\
-_tp _nm ## _tEmP[(_sz)+(_al)/sizeof(_tp)-1]; \
-_tp *_nm = _nm ## _tEmP + ((_al)/sizeof(_tp)-1); \
-_nm -= (((int32_t)_nm & ((_al)-1))/sizeof(_tp));
-
-/*
- * ENFORCE_STACK_ALIGN_2D: force 2 dimension local pData aligned in stack
- * _tp: type
- * _nm: var name
- * _cx, _cy: size in x, y dimension
- * _al: align bytes
- * auxiliary var: _nm ## _tEmP, _nm ## _tEmP_al
- * NOTE: _al should be power-of-2 and >= sizeof(_tp), before considering to use such macro
- */
-
-//#define ENFORCE_STACK_ALIGN_2D(_tp, _nm, _cx, _cy, _al) \
-//_tp _nm ## _tEmP[(_cx)*(_cy)+(_al)-1]; \
-//_tp *_nm ## _tEmP_al = _nm ## _tEmP + ((_al)-1); \
-//_nm ## _tEmP_al -= (((int32_t)_nm ## _tEmP_al & ((_al)-1))/sizeof(_tp)); \
-//_tp (*_nm)[(_cy)] = (_tp (*)[(_cy)])_nm ## _tEmP_al;
-
-/* Another better solution with reducing extra memory used */
-#define ENFORCE_STACK_ALIGN_2D(_tp, _nm, _cx, _cy, _al) \
-assert( ((_al) && !((_al) & ((_al) - 1))) && ((_al) >= sizeof(_tp)) ); /*_al should be power-of-2 and >= sizeof(_tp)*/\
-_tp _nm ## _tEmP[(_cx)*(_cy)+(_al)/sizeof(_tp)-1]; \
-_tp *_nm ## _tEmP_al = _nm ## _tEmP + ((_al)/sizeof(_tp)-1); \
-_nm ## _tEmP_al -= (((int32_t)_nm ## _tEmP_al & ((_al)-1))/sizeof(_tp)); \
-_tp (*_nm)[(_cy)] = (_tp (*)[(_cy)])_nm ## _tEmP_al;
-
-/*
- * ENFORCE_STACK_ALIGN_3D: force 3 dimension local pData aligned in stack
- * _tp: type
- * _nm: var name
- * _cx, _cy, _cz: size in x, y, z dimension
- * _al: align bytes
- * auxiliary var: _nm ## _tEmP, _nm ## _tEmP_al
- * NOTE: _al should be power-of-2 and >= sizeof(_tp), before considering to use such macro
- */
-
-//#define ENFORCE_STACK_ALIGN_3D(_tp, _nm, _cx, _cy, _cz, _al) \
-//_tp _nm ## _tEmP[(_cx)*(_cy)*(_cz)+(_al)-1]; \
-//_tp *_nm ## _tEmP_al = _nm ## _tEmP + ((_al)-1); \
-//_nm ## _tEmP_al -= (((int32_t)_nm ## _tEmP_al & ((_al)-1))/sizeof(_tp)); \
-//_tp (*_nm)[(_cy)][(_cz)] = (_tp (*)[(_cy)][(_cz)])_nm ## _tEmP_al;
-
-/* Another better solution with reducing extra memory used */
-#define ENFORCE_STACK_ALIGN_3D(_tp, _nm, _cx, _cy, _cz, _al) \
-assert( ((_al) && !((_al) & ((_al) - 1))) && ((_al) >= sizeof(_tp)) ); /*_al should be power-of-2 and >= sizeof(_tp)*/\
-_tp _nm ## _tEmP[(_cx)*(_cy)*(_cz)+(_al)/sizeof(_tp)-1]; \
-_tp *_nm ## _tEmP_al = _nm ## _tEmP + ((_al)/sizeof(_tp)-1); \
-_nm ## _tEmP_al -= (((int32_t)_nm ## _tEmP_al & ((_al)-1))/sizeof(_tp)); \
-_tp (*_nm)[(_cy)][(_cz)] = (_tp (*)[(_cy)][(_cz)])_nm ## _tEmP_al;
-
-#endif//ARRAY_STACK_ALIGN_H__
-
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file		array_stack_align.h
+ *
+ * \brief	guarantees the alignment of array data (pData) declared on the stack;
+ *			support for further array dimensions can be added if needed
+ *
+ * \date		8/8/2011 Created
+ *			8/12/2011 functionality implementation for multidimensional array
+ *			8/26/2011 better solution with reducing extra memory used,
+ *						stack size is adaptively reduced by _tp & _al
+ *
+ *************************************************************************************
+ */
+#ifndef ARRAY_STACK_ALIGN_H__
+#define ARRAY_STACK_ALIGN_H__
+
+#include <assert.h>
+#include "typedefs.h"
+
+/*
+ * ENFORCE_STACK_ALIGN_1D: force 1 dimension local pData aligned in stack
+ * _tp: type
+ * _nm: var name
+ * _sz: size
+ * _al: align bytes
+ * auxiliary var: _nm ## _tEmP
+ * NOTE: _al should be power-of-2 and >= sizeof(_tp), before considering to use such macro
+ */
+
+//#define ENFORCE_STACK_ALIGN_1D(_tp, _nm, _sz, _al) \
+//_tp _nm ## _tEmP[(_sz)+(_al)-1]; \
+//_tp *_nm = _nm ## _tEmP + ((_al)-1); \
+//_nm -= (((int32_t)_nm & ((_al)-1))/sizeof(_tp));
+
+/* Over-allocates by (_al)/sizeof(_tp)-1 elements, then steps _nm back to an (_al)-byte boundary; the address is masked as uintptr_t (assumed visible via typedefs.h / <stdint.h> - TODO confirm) so the cast is valid on 64-bit targets too */
+#define ENFORCE_STACK_ALIGN_1D(_tp, _nm, _sz, _al) \
+assert( ((_al) && !((_al) & ((_al) - 1))) && ((_al) >= sizeof(_tp)) ); /*_al should be power-of-2 and >= sizeof(_tp)*/\
+_tp _nm ## _tEmP[(_sz)+(_al)/sizeof(_tp)-1]; \
+_tp *_nm = _nm ## _tEmP + ((_al)/sizeof(_tp)-1); \
+_nm -= (((uintptr_t)_nm & ((_al)-1))/sizeof(_tp));
+
+/*
+ * ENFORCE_STACK_ALIGN_2D: force 2 dimension local pData aligned in stack
+ * _tp: type
+ * _nm: var name
+ * _cx, _cy: size in x, y dimension
+ * _al: align bytes
+ * auxiliary var: _nm ## _tEmP, _nm ## _tEmP_al
+ * NOTE: _al should be power-of-2 and >= sizeof(_tp), before considering to use such macro
+ */
+
+//#define ENFORCE_STACK_ALIGN_2D(_tp, _nm, _cx, _cy, _al) \
+//_tp _nm ## _tEmP[(_cx)*(_cy)+(_al)-1]; \
+//_tp *_nm ## _tEmP_al = _nm ## _tEmP + ((_al)-1); \
+//_nm ## _tEmP_al -= (((int32_t)_nm ## _tEmP_al & ((_al)-1))/sizeof(_tp)); \
+//_tp (*_nm)[(_cy)] = (_tp (*)[(_cy)])_nm ## _tEmP_al;
+
+/* Same over-allocate-and-step-back scheme as the 1-D form, then views the aligned block as rows of (_cy) elements; the address is masked as uintptr_t (assumed visible via typedefs.h / <stdint.h> - TODO confirm) so the cast is valid on 64-bit targets too */
+#define ENFORCE_STACK_ALIGN_2D(_tp, _nm, _cx, _cy, _al) \
+assert( ((_al) && !((_al) & ((_al) - 1))) && ((_al) >= sizeof(_tp)) ); /*_al should be power-of-2 and >= sizeof(_tp)*/\
+_tp _nm ## _tEmP[(_cx)*(_cy)+(_al)/sizeof(_tp)-1]; \
+_tp *_nm ## _tEmP_al = _nm ## _tEmP + ((_al)/sizeof(_tp)-1); \
+_nm ## _tEmP_al -= (((uintptr_t)_nm ## _tEmP_al & ((_al)-1))/sizeof(_tp)); \
+_tp (*_nm)[(_cy)] = (_tp (*)[(_cy)])_nm ## _tEmP_al;
+
+/*
+ * ENFORCE_STACK_ALIGN_3D: force 3 dimension local pData aligned in stack
+ * _tp: type
+ * _nm: var name
+ * _cx, _cy, _cz: size in x, y, z dimension
+ * _al: align bytes
+ * auxiliary var: _nm ## _tEmP, _nm ## _tEmP_al
+ * NOTE: _al should be power-of-2 and >= sizeof(_tp), before considering to use such macro
+ */
+
+//#define ENFORCE_STACK_ALIGN_3D(_tp, _nm, _cx, _cy, _cz, _al) \
+//_tp _nm ## _tEmP[(_cx)*(_cy)*(_cz)+(_al)-1]; \
+//_tp *_nm ## _tEmP_al = _nm ## _tEmP + ((_al)-1); \
+//_nm ## _tEmP_al -= (((int32_t)_nm ## _tEmP_al & ((_al)-1))/sizeof(_tp)); \
+//_tp (*_nm)[(_cy)][(_cz)] = (_tp (*)[(_cy)][(_cz)])_nm ## _tEmP_al;
+
+/* Same over-allocate-and-step-back scheme as the 1-D form, then views the aligned block as (_cy) x (_cz) planes; the address is masked as uintptr_t (assumed visible via typedefs.h / <stdint.h> - TODO confirm) so the cast is valid on 64-bit targets too */
+#define ENFORCE_STACK_ALIGN_3D(_tp, _nm, _cx, _cy, _cz, _al) \
+assert( ((_al) && !((_al) & ((_al) - 1))) && ((_al) >= sizeof(_tp)) ); /*_al should be power-of-2 and >= sizeof(_tp)*/\
+_tp _nm ## _tEmP[(_cx)*(_cy)*(_cz)+(_al)/sizeof(_tp)-1]; \
+_tp *_nm ## _tEmP_al = _nm ## _tEmP + ((_al)/sizeof(_tp)-1); \
+_nm ## _tEmP_al -= (((uintptr_t)_nm ## _tEmP_al & ((_al)-1))/sizeof(_tp)); \
+_tp (*_nm)[(_cy)][(_cz)] = (_tp (*)[(_cy)][(_cz)])_nm ## _tEmP_al;
+
+#endif//ARRAY_STACK_ALIGN_H__
+
--- a/codec/encoder/core/inc/as264_common.h
+++ b/codec/encoder/core/inc/as264_common.h
@@ -48,14 +48,14 @@
 */
 
 /****************************************************************************
- * Options for algorithm, usually change bitrate 
+ * Options for algorithm, usually change bitrate
  ****************************************************************************/
 #define DISABLE_FMO_FEATURE	// 
 
 /****************************************************************************
- * Options for optimization, not change bitrate 
+ * Options for optimization, not change bitrate
  ****************************************************************************/
-//#undef	X86_ASM			// X86_ASM is included in project preprocessor definitions, undef it when need to disable asm code 
+//#undef	X86_ASM			// X86_ASM is included in project preprocessor definitions, undef it when need to disable asm code
 #define SINGLE_REF_FRAME		// need to disable it when use multi-reference
 
 
--- a/codec/encoder/core/inc/au_set.h
+++ b/codec/encoder/core/inc/au_set.h
@@ -47,12 +47,12 @@
 #include "param_svc.h"
 
 namespace WelsSVCEnc {
-/*! 
+/*!
  *************************************************************************************
  * \brief	to write Sequence Parameter Set (SPS)
  *
  * \param 	pSps     	SWelsSPS to be wrote
- * \param	bs_aux		bitstream writer auxiliary 
+ * \param	bs_aux		bitstream writer auxiliary
  *
  * \return	0 - successed
  *		    1 - failed
@@ -61,15 +61,15 @@
  *************************************************************************************
  */
 
-int32_t WelsWriteSpsNal( SWelsSPS *pSps, SBitStringAux *pBitStringAux, int32_t* pSpsIdDelta );
+int32_t WelsWriteSpsNal (SWelsSPS* pSps, SBitStringAux* pBitStringAux, int32_t* pSpsIdDelta);
 
 
-/*! 
+/*!
  *************************************************************************************
  * \brief	to write SubSet Sequence Parameter Set
  *
  * \param 	sub_sps		subset pSps parsed
- * \param	bs_aux		bitstream writer auxiliary 
+ * \param	bs_aux		bitstream writer auxiliary
  *
  * \return	0 - successed
  *		    1 - failed
@@ -77,15 +77,15 @@
  * \note	Call it in case EWelsNalUnitType is SubSet SPS.
  *************************************************************************************
  */
-int32_t WelsWriteSubsetSpsSyntax( SSubsetSps *pSubsetSps, SBitStringAux *pBitStringAux , int32_t* pSpsIdDelta );
+int32_t WelsWriteSubsetSpsSyntax (SSubsetSps* pSubsetSps, SBitStringAux* pBitStringAux , int32_t* pSpsIdDelta);
 
 
-/*! 
+/*!
  *************************************************************************************
  * \brief	to write Picture Parameter Set (PPS)
  *
  * \param 	pPps     	pPps
- * \param	bs_aux		bitstream writer auxiliary 
+ * \param	bs_aux		bitstream writer auxiliary
  *
  * \return	0 - successed
  *		    1 - failed
@@ -93,7 +93,7 @@
  * \note	Call it in case EWelsNalUnitType is PPS.
  *************************************************************************************
  */
-int32_t WelsWritePpsSyntax( SWelsPPS *pPps, SBitStringAux *pBitStringAux, SParaSetOffset* sPSOVector );
+int32_t WelsWritePpsSyntax (SWelsPPS* pPps, SBitStringAux* pBitStringAux, SParaSetOffset* sPSOVector);
 
 /*!
  * \brief	initialize pSps based on configurable parameters in svc
@@ -103,8 +103,9 @@
  * \return	0 - successful
  *			1 - failed
  */
-int32_t WelsInitSps( SWelsSPS *pSps, SDLayerParam *pLayerParam, const uint32_t kuiIntraPeriod, const int32_t kiNumRefFrame,
-					  const uint32_t kiSpsId, const bool_t kbEnableFrameCropping, bool_t bEnableRc );
+int32_t WelsInitSps (SWelsSPS* pSps, SDLayerParam* pLayerParam, const uint32_t kuiIntraPeriod,
+                     const int32_t kiNumRefFrame,
+                     const uint32_t kiSpsId, const bool_t kbEnableFrameCropping, bool_t bEnableRc);
 
 /*!
  * \brief	initialize subset pSps based on configurable parameters in svc
@@ -114,8 +115,9 @@
  * \return	0 - successful
  *			1 - failed
  */
-int32_t WelsInitSubsetSps( SSubsetSps *pSubsetSps, SDLayerParam *pLayerParam, const uint32_t kuiIntraPeriod, const int32_t kiNumRefFrame,
-							 const uint32_t kiSpsId, const bool_t kbEnableFrameCropping, bool_t bEnableRc );
+int32_t WelsInitSubsetSps (SSubsetSps* pSubsetSps, SDLayerParam* pLayerParam, const uint32_t kuiIntraPeriod,
+                           const int32_t kiNumRefFrame,
+                           const uint32_t kiSpsId, const bool_t kbEnableFrameCropping, bool_t bEnableRc);
 
 /*!
  * \brief	initialize pPps based on configurable parameters and pSps(subset pSps) in svc
@@ -128,12 +130,12 @@
  * \return	0 - successful
  *			1 - failed
  */
-int32_t WelsInitPps(	SWelsPPS *pPps,
-						SWelsSPS *pSps,
-						SSubsetSps *pSubsetSps,						
-						const uint32_t kuiPpsId,
-						const bool_t kbDeblockingFilterPresentFlag,
-						const bool_t kbUsingSubsetSps );
+int32_t WelsInitPps (SWelsPPS* pPps,
+                     SWelsSPS* pSps,
+                     SSubsetSps* pSubsetSps,
+                     const uint32_t kuiPpsId,
+                     const bool_t kbDeblockingFilterPresentFlag,
+                     const bool_t kbUsingSubsetSps);
 
 }
 #endif//WELS_ACCESS_UNIT_PARSER_H__
--- a/codec/encoder/core/inc/bit_stream.h
+++ b/codec/encoder/core/inc/bit_stream.h
@@ -41,13 +41,13 @@
  *	auxiliary struct for bit-stream reading / writing
  */
 typedef struct TagBitStringAux {
-	uint8_t		*pBuf;		// pBuffer to start position
-	uint8_t		*pBufEnd;	// pBuffer + length
-	uint8_t		*pBufPtr;	// current writing position	
-	uint32_t    uiCurBits;  
-	int32_t		iLeftBits;	// count number of available bits left ([1, 8]),
-							// need pointer to next byte start position in case 0 bit left then 8 instead
-}SBitStringAux;
+  uint8_t*		pBuf;		// pBuffer to start position
+  uint8_t*		pBufEnd;	// pBuffer + length
+  uint8_t*		pBufPtr;	// current writing position
+  uint32_t    uiCurBits;
+  int32_t		iLeftBits;	// count number of available bits left ([1, 8]),
+  // need pointer to next byte start position in case 0 bit left then 8 instead
+} SBitStringAux;
 
 /*!
  * \brief	input bits for decoder or initialize bitstream writing in encoder
@@ -58,17 +58,16 @@
  *
  * \return	iSize of pBuffer pData in byte; failed in -1 return
  */
-static inline int32_t InitBits( SBitStringAux *pBs, const uint8_t *kpBuf, const int32_t kiSize )
-{
-	uint8_t *ptr = (uint8_t *)kpBuf;
+static inline int32_t InitBits (SBitStringAux* pBs, const uint8_t* kpBuf, const int32_t kiSize) {
+  uint8_t* ptr = (uint8_t*)kpBuf;
 
-	pBs->pBuf			= ptr;
-	pBs->pBufPtr		= ptr;
-	pBs->pBufEnd		= ptr + kiSize;
-	pBs->iLeftBits	= 32;
-	pBs->uiCurBits = 0;
-	
-	return kiSize;
+  pBs->pBuf			= ptr;
+  pBs->pBufPtr		= ptr;
+  pBs->pBufEnd		= ptr + kiSize;
+  pBs->iLeftBits	= 32;
+  pBs->uiCurBits = 0;
+
+  return kiSize;
 }
 
 
--- a/codec/encoder/core/inc/bundleloader.h
+++ b/codec/encoder/core/inc/bundleloader.h
@@ -40,113 +40,97 @@
 #include <coreFoundation/CFBundle.h>
 #include <string>
 
-int GetCurrentModulePath(char* lpModulePath, const int iPathMax)
-{
-	if(lpModulePath == NULL || iPathMax <= 0)
-	{
-		return -1;
-	}
+int GetCurrentModulePath (char* lpModulePath, const int iPathMax) {
+if (lpModulePath == NULL || iPathMax <= 0) {
+  return -1;
+}
 
-	memset(lpModulePath, 0, iPathMax);
+memset (lpModulePath, 0, iPathMax);
 
-	char cCurrentPath[PATH_MAX];
-	memset(cCurrentPath, 0, PATH_MAX);
+char cCurrentPath[PATH_MAX];
+memset (cCurrentPath, 0, PATH_MAX);
 
-	Dl_info 	dlInfo;
-	static int  sDummy;
-	dladdr((void*)&sDummy, &dlInfo);
+Dl_info 	dlInfo;
+static int  sDummy;
+dladdr ((void*)&sDummy, &dlInfo);
 
-	strlcpy(cCurrentPath, dlInfo.dli_fname, PATH_MAX);
+strlcpy (cCurrentPath, dlInfo.dli_fname, PATH_MAX);
 
-	// whether is self a framework ? 
-	int locateNumber = 1;
-	struct FSRef currentPath;
-	OSStatus iStatus = FSPathMakeRef((unsigned char*)cCurrentPath, &currentPath, NULL);
-	if(noErr == iStatus)
-	{
-		LSItemInfoRecord  info;
-		iStatus = LSCopyItemInfoForRef(&currentPath, kLSRequestExtension, &info);
-		if(noErr == iStatus && NULL == info.extension)
-		{
-			locateNumber = 4;
-		}
-	}
-	std::string strPath(cCurrentPath);
-	int pos = std::string::npos;
-	for(int i = 0; i < locateNumber; i++)
-	{
-		pos = strPath.rfind('/');
-		if(std::string::npos == pos)
-		{
-			break;
-		}
-		strPath.erase(pos);
-	}
-	if(std::string::npos == pos)
-	{
-		return -2;
-	}
-	cCurrentPath[pos] = 0;
+// whether is self a framework ?
+int locateNumber = 1;
+struct FSRef currentPath;
+OSStatus iStatus = FSPathMakeRef ((unsigned char*)cCurrentPath, &currentPath, NULL);
+if (noErr == iStatus) {
+  LSItemInfoRecord  info;
+  iStatus = LSCopyItemInfoForRef (&currentPath, kLSRequestExtension, &info);
+  if (noErr == iStatus && NULL == info.extension) {
+    locateNumber = 4;
+  }
+}
+std::string strPath (cCurrentPath);
+int pos = std::string::npos;
+for (int i = 0; i < locateNumber; i++) {
+  pos = strPath.rfind ('/');
+  if (std::string::npos == pos) {
+    break;
+  }
+  strPath.erase (pos);
+}
+if (std::string::npos == pos) {
+  return -2;
+}
+cCurrentPath[pos] = 0;
 
-	strlcpy(lpModulePath, cCurrentPath, iPathMax);
-	strlcat(lpModulePath, "/", iPathMax);
+strlcpy (lpModulePath, cCurrentPath, iPathMax);
+strlcat (lpModulePath, "/", iPathMax);
 
-	return 0;
+return 0;
 }
 
-CFBundleRef LoadBundle(const char* lpBundlePath)
-{
-	if(lpBundlePath == NULL)
-	{
-		return NULL;
-	}
+CFBundleRef LoadBundle (const char* lpBundlePath) {
+  if (lpBundlePath == NULL) {
+    return NULL;
+  }
 
-	struct FSRef bundlePath;
-	OSStatus iStatus = FSPathMakeRef((unsigned char*)lpBundlePath, &bundlePath, NULL);
-	if(noErr != iStatus)
-	{
-		return NULL;
-	}
+  struct FSRef bundlePath;
+  OSStatus iStatus = FSPathMakeRef ((unsigned char*)lpBundlePath, &bundlePath, NULL);
+  if (noErr != iStatus) {
+    return NULL;
+  }
 
-	CFURLRef bundleURL = CFURLCreateFromFSRef(kCFAllocatorSystemDefault, &bundlePath);
-	if(NULL == bundleURL)
-	{
-		return NULL;
-	}
+  CFURLRef bundleURL = CFURLCreateFromFSRef (kCFAllocatorSystemDefault, &bundlePath);
+  if (NULL == bundleURL) {
+    return NULL;
+  }
 
-	// 2.get bundle ref
-	CFBundleRef bundleRef = CFBundleCreate(kCFAllocatorSystemDefault, bundleURL);
-	CFRelease(bundleURL);
+  // 2.get bundle ref
+  CFBundleRef bundleRef = CFBundleCreate (kCFAllocatorSystemDefault, bundleURL);
+  CFRelease (bundleURL);
 
-	//	Boolean bReturn = FALSE;
-	if(NULL != bundleRef)
-	{
-		//	bReturn = CFBundleLoadExecutable(bundleRef);
-	}
+  //	Boolean bReturn = FALSE;
+  if (NULL != bundleRef) {
+    //	bReturn = CFBundleLoadExecutable(bundleRef);
+  }
 
-	return bundleRef;
+  return bundleRef;
 }
 
-Boolean FreeBundle(CFBundleRef bundleRef)
-{
-	if(NULL != bundleRef)
-	{
-		//	CFBundleUnloadExecutable(bundleRef);
-		CFRelease(bundleRef);
-	}
-	return TRUE;
+Boolean FreeBundle (CFBundleRef bundleRef) {
+  if (NULL != bundleRef) {
+    //	CFBundleUnloadExecutable(bundleRef);
+    CFRelease (bundleRef);
+  }
+  return TRUE;
 }
 
-void* GetProcessAddress(CFBundleRef bundleRef, const char* lpProcName)
-{
-	void *processAddress = NULL;
-	if(NULL != bundleRef)
-	{
-		CFStringRef cfProcName = CFStringCreateWithCString(kCFAllocatorSystemDefault, lpProcName, CFStringGetSystemEncoding());
-		processAddress = CFBundleGetFunctionPointerForName(bundleRef, cfProcName);
-		CFRelease(cfProcName);
-	}
-	return processAddress;
+void* GetProcessAddress (CFBundleRef bundleRef, const char* lpProcName) {
+  void* processAddress = NULL;
+  if (NULL != bundleRef) {
+    CFStringRef cfProcName = CFStringCreateWithCString (kCFAllocatorSystemDefault, lpProcName, CFStringGetSystemEncoding());
+    processAddress = CFBundleGetFunctionPointerForName (bundleRef, cfProcName);
+    CFRelease (cfProcName);
+  }
+  return processAddress;
 }
 #endif
 
--- a/codec/encoder/core/inc/cpu.h
+++ b/codec/encoder/core/inc/cpu.h
@@ -54,19 +54,19 @@
  */
 int32_t  WelsCPUIdVerify();
 
-void WelsCPUId( uint32_t uiIndex, uint32_t *pFeatureA, uint32_t *pFeatureB, uint32_t *pFeatureC, uint32_t *pFeatureD );
+void WelsCPUId (uint32_t uiIndex, uint32_t* pFeatureA, uint32_t* pFeatureB, uint32_t* pFeatureC, uint32_t* pFeatureD);
 
-int32_t WelsCPUSupportAVX( uint32_t eax, uint32_t ecx );
-int32_t WelsCPUSupportFMA( uint32_t eax, uint32_t ecx );
+int32_t WelsCPUSupportAVX (uint32_t eax, uint32_t ecx);
+int32_t WelsCPUSupportFMA (uint32_t eax, uint32_t ecx);
 
 void WelsEmms();
 
-uint32_t WelsCPUFeatureDetect( int32_t *pNumberOfLogicProcessors );
+uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors);
 
 /*
  *	clear FPU registers states for potential float based calculation if support
  */
-void     WelsCPURestore( const uint32_t kuiCPU );
+void     WelsCPURestore (const uint32_t kuiCPU);
 
 #endif
 
--- a/codec/encoder/core/inc/cpu_core.h
+++ b/codec/encoder/core/inc/cpu_core.h
@@ -42,7 +42,7 @@
 
 /*
  *	WELS CPU feature flags
- */ 
+ */
 #define WELS_CPU_MMX        0x00000001    /* mmx */
 #define WELS_CPU_MMXEXT     0x00000002    /* mmx-ext*/
 #define WELS_CPU_SSE        0x00000004    /* sse */
--- a/codec/encoder/core/inc/crt_util_safe_x.h
+++ b/codec/encoder/core/inc/crt_util_safe_x.h
@@ -55,61 +55,60 @@
 #include "typedefs.h"
 #endif//WIN32
 
-/* 
+/*
  * Safe Lib specific errno codes.  These can be added to the errno.h file
- * if desired. 
+ * if desired.
  */
-#define ESNULLP         ( 400 )       /* null ptr                    */  
-#define ESZEROL         ( 401 )       /* length is zero              */  
-#define ESLEMIN         ( 402 )       /* length is below min         */  
-#define ESLEMAX         ( 403 )       /* length exceeds max          */  
-#define ESOVRLP         ( 404 )       /* overlap undefined           */ 
-#define ESEMPTY         ( 405 )       /* empty string                */ 
-#define ESNOSPC         ( 406 )       /* not enough space for s2     */  
-#define ESUNTERM        ( 407 )       /* unterminated string         */  
-#define ESNODIFF        ( 408 )       /* no difference               */ 
-#define ESNOTFND        ( 409 )       /* not found                   */ 
+#define ESNULLP         ( 400 )       /* null ptr                    */
+#define ESZEROL         ( 401 )       /* length is zero              */
+#define ESLEMIN         ( 402 )       /* length is below min         */
+#define ESLEMAX         ( 403 )       /* length exceeds max          */
+#define ESOVRLP         ( 404 )       /* overlap undefined           */
+#define ESEMPTY         ( 405 )       /* empty string                */
+#define ESNOSPC         ( 406 )       /* not enough space for s2     */
+#define ESUNTERM        ( 407 )       /* unterminated string         */
+#define ESNODIFF        ( 408 )       /* no difference               */
+#define ESNOTFND        ( 409 )       /* not found                   */
 
-/* EOK may or may not be defined in errno.h */ 
-#ifndef EOK 
+/* EOK may or may not be defined in errno.h */
+#ifndef EOK
 #define EOK   0
 #endif
 
 #if (defined(WIN32) && defined(_MSC_VER) && (_MSC_VER<1500)) || defined(__GNUC__)
 
-static __inline int wels_strncpy_s( char *dest, int dmax, const char *src, int slen )
-{
-	int orig_dmax;
-    char *orig_dest;
-    const char *overlap_bumper;
+static __inline int wels_strncpy_s (char* dest, int dmax, const char* src, int slen) {
+int orig_dmax;
+char* orig_dest;
+const char* overlap_bumper;
 
-    if (dest == NULL) {
-//        invoke_safe_lib_constraint_handler("strncpy_s: dest is null", 
+if (dest == NULL) {
+//        invoke_safe_lib_constraint_handler("strncpy_s: dest is null",
 //                   NULL, ESNULLP);
-        return (ESNULLP);
-    }
+  return (ESNULLP);
+}
 
-    if (dmax <= 0) {
-//        invoke_safe_lib_constraint_handler("strncpy_s: dmax is 0", 
+if (dmax <= 0) {
+//        invoke_safe_lib_constraint_handler("strncpy_s: dmax is 0",
 //                   NULL, ESZEROL);
-        return (ESZEROL);
-    }
+  return (ESZEROL);
+}
 
 //    if (dmax > RSIZE_MAX_STR) {
-//        invoke_safe_lib_constraint_handler("strncpy_s: dmax exceeds max", 
+//        invoke_safe_lib_constraint_handler("strncpy_s: dmax exceeds max",
 //                   NULL, ESLEMAX);
 //        return (ESLEMAX);
 //    }
 
-	if (src == NULL) {
+if (src == NULL) {
 //        handle_error(orig_dest, orig_dmax, "strncpy_s: src is null", ESNULLP);
-        return (ESNULLP);
-    }
+  return (ESNULLP);
+}
 
-    if (slen <= 0) {
+if (slen <= 0) {
 //        handle_error(orig_dest, orig_dmax, "strncpy_s: slen is zero", ESZEROL);
-        return (ESZEROL);
-    }
+  return (ESZEROL);
+}
 
 //    if (slen > RSIZE_MAX_STR) {
 //        handle_error(orig_dest, orig_dmax, "strncpy_s: slen exceeds max", ESLEMAX);
@@ -116,240 +115,262 @@
 //        return (ESLEMAX);
 //    }
 
-    /* hold base in case src was not copied */  
-    orig_dmax = dmax;
-    orig_dest = dest;
+/* hold base in case src was not copied */
+orig_dmax = dmax;
+orig_dest = dest;
 
-	if (dest < src) {
-       overlap_bumper = src;
+if (dest < src) {
+  overlap_bumper = src;
 
-        while (dmax > 0) {
-            if (dest == overlap_bumper) {
+  while (dmax > 0) {
+    if (dest == overlap_bumper) {
 //                handle_error(orig_dest, orig_dmax, "strncpy_s: overlapping objects", ESOVRLP);
-                return (ESOVRLP); 
-            }
+      return (ESOVRLP);
+    }
 
-			if (slen == 0) {
-                /*
-                 * Copying truncated to slen chars.  Note that the TR says to
-                 * copy slen chars plus the null char.  We null the slack.
-                 */
+    if (slen == 0) {
+      /*
+       * Copying truncated to slen chars.  Note that the TR says to
+       * copy slen chars plus the null char.  We null the slack.
+       */
 #ifdef SAFE_LIB_STR_NULL_SLACK
-                while (dmax) { *dest = '\0'; dmax--; dest++; }
+      while (dmax) {
+        *dest = '\0';
+        dmax--;
+        dest++;
+      }
 #else
-                *dest = '\0'; 
-#endif 
-                return (EOK);
-			}
+      *dest = '\0';
+#endif
+      return (EOK);
+    }
 
-            *dest = *src;
-            if (*dest == '\0') {
+    *dest = *src;
+    if (*dest == '\0') {
 #ifdef SAFE_LIB_STR_NULL_SLACK
-                /* null slack */
-                while (dmax) { *dest = '\0'; dmax--; dest++; }
-#endif 
-                return (EOK);
-            }
+      /* null slack */
+      while (dmax) {
+        *dest = '\0';
+        dmax--;
+        dest++;
+      }
+#endif
+      return (EOK);
+    }
 
-            dmax--;
-            slen--;
-            dest++;
-            src++;
-        }
+    dmax--;
+    slen--;
+    dest++;
+    src++;
+  }
 
-    } else { 
-        overlap_bumper = dest;
+} else {
+  overlap_bumper = dest;
 
-        while (dmax > 0) {
-            if (src == overlap_bumper) {
+  while (dmax > 0) {
+    if (src == overlap_bumper) {
 //                handle_error(orig_dest, orig_dmax, "strncpy_s: overlapping objects", ESOVRLP);
-                return (ESOVRLP); 
-            }
+      return (ESOVRLP);
+    }
 
-	    if (slen == 0) {
-                /*
-                 * Copying truncated to slen chars.  Note that the TR says to
-                 * copy slen chars plus the null char.  We null the slack.
-                 */
+    if (slen == 0) {
+      /*
+       * Copying truncated to slen chars.  Note that the TR says to
+       * copy slen chars plus the null char.  We null the slack.
+       */
 #ifdef SAFE_LIB_STR_NULL_SLACK
-                while (dmax) { *dest = '\0'; dmax--; dest++; }
+      while (dmax) {
+        *dest = '\0';
+        dmax--;
+        dest++;
+      }
 #else
-                *dest = '\0'; 
-#endif 
-                return (EOK);
-            }
+      *dest = '\0';
+#endif
+      return (EOK);
+    }
 
-            *dest = *src;
-            if (*dest == '\0') {
+    *dest = *src;
+    if (*dest == '\0') {
 #ifdef SAFE_LIB_STR_NULL_SLACK
-                /* null slack */
-                while (dmax) { *dest = '\0'; dmax--; dest++; }
-#endif 
-                return (EOK);
-            }
+      /* null slack */
+      while (dmax) {
+        *dest = '\0';
+        dmax--;
+        dest++;
+      }
+#endif
+      return (EOK);
+    }
 
-            dmax--;
-            slen--;
-            dest++;
-            src++;
-        }
-    } 
+    dmax--;
+    slen--;
+    dest++;
+    src++;
+  }
+}
 
-    /*
-     * the entire src was not copied, so zero the string
-     */
+/*
+ * the entire src was not copied, so zero the string
+ */
 //    handle_error(orig_dest, orig_dmax, "strncpy_s: not enough space for src", ESNOSPC);
-    return (ESNOSPC);
+return (ESNOSPC);
 }
 
-static __inline int wels_strcat_s(char *dest, int dmax, const char *src)
-{
-	int orig_dmax;
-    char *orig_dest;
-    const char *overlap_bumper;
+static __inline int wels_strcat_s (char* dest, int dmax, const char* src) {
+  int orig_dmax;
+  char* orig_dest;
+  const char* overlap_bumper;
 
-    if (dest == NULL) {
-//        invoke_safe_lib_constraint_handler("strcat_s: dest is null", 
+  if (dest == NULL) {
+//        invoke_safe_lib_constraint_handler("strcat_s: dest is null",
 //                   NULL, ESNULLP);
-        return (ESNULLP);
-    }
+    return (ESNULLP);
+  }
 
-    if (src == NULL) {
-//        invoke_safe_lib_constraint_handler("strcat_s: src is null", 
+  if (src == NULL) {
+//        invoke_safe_lib_constraint_handler("strcat_s: src is null",
 //                   NULL, ESNULLP);
-        return (ESNULLP);
-    }
+    return (ESNULLP);
+  }
 
-    if (dmax <= 0) {
-//        invoke_safe_lib_constraint_handler("strcat_s: dmax is 0", 
+  if (dmax <= 0) {
+//        invoke_safe_lib_constraint_handler("strcat_s: dmax is 0",
 //                   NULL, ESZEROL);
-        return (ESZEROL);
-    }
+    return (ESZEROL);
+  }
 
 //    if (dmax > RSIZE_MAX_STR) {
-//        invoke_safe_lib_constraint_handler("strcat_s: dmax exceeds max", 
+//        invoke_safe_lib_constraint_handler("strcat_s: dmax exceeds max",
 //                   NULL, ESLEMAX);
 //        return (ESLEMAX);
 //    }
 
-    /* hold base of dest in case src was not copied */
-    orig_dmax = dmax;
-    orig_dest = dest;
+  /* hold base of dest in case src was not copied */
+  orig_dmax = dmax;
+  orig_dest = dest;
 
-    if (dest < src) {
-        overlap_bumper = src;
+  if (dest < src) {
+    overlap_bumper = src;
 
-        /* Find the end of dest */
-        while (*dest != '\0') {
- 
-            if (dest == overlap_bumper) {
-//                handle_error(orig_dest, orig_dmax, "strcat_s: overlapping objects", ESOVRLP); 
-                return (ESOVRLP);
-            }
+    /* Find the end of dest */
+    while (*dest != '\0') {
 
-            dest++;
-            dmax--;
-            if (dmax == 0) {
-//                handle_error(orig_dest, orig_dmax, "strcat_s: dest unterminated", ESUNTERM); 
-                return (ESUNTERM);
-            }
-        }
+      if (dest == overlap_bumper) {
+//                handle_error(orig_dest, orig_dmax, "strcat_s: overlapping objects", ESOVRLP);
+        return (ESOVRLP);
+      }
 
-        while (dmax > 0) {
-            if (dest == overlap_bumper) {
-//                handle_error(orig_dest, orig_dmax, "strcat_s: overlapping objects", ESOVRLP); 
-                return (ESOVRLP);
-            }
+      dest++;
+      dmax--;
+      if (dmax == 0) {
+//                handle_error(orig_dest, orig_dmax, "strcat_s: dest unterminated", ESUNTERM);
+        return (ESUNTERM);
+      }
+    }
 
-            *dest = *src;
-            if (*dest == '\0') {
-#ifdef SAFE_LIB_STR_NULL_SLACK
-                /* null slack to clear any data */
-                while (dmax) { *dest = '\0'; dmax--; dest++; }
-#endif 
-                return (EOK);
-            }
+    while (dmax > 0) {
+      if (dest == overlap_bumper) {
+//                handle_error(orig_dest, orig_dmax, "strcat_s: overlapping objects", ESOVRLP);
+        return (ESOVRLP);
+      }
 
-            dmax--;
-            dest++;
-            src++;
+      *dest = *src;
+      if (*dest == '\0') {
+#ifdef SAFE_LIB_STR_NULL_SLACK
+        /* null slack to clear any data */
+        while (dmax) {
+          *dest = '\0';
+          dmax--;
+          dest++;
         }
+#endif
+        return (EOK);
+      }
 
-    } else {
-        overlap_bumper = dest;
+      dmax--;
+      dest++;
+      src++;
+    }
 
-        /* Find the end of dest */
-        while (*dest != '\0') {
+  } else {
+    overlap_bumper = dest;
 
-            /*
-             * NOTE: no need to check for overlap here since src comes first
-             * in memory and we're not incrementing src here.
-             */
-            dest++;
-            dmax--;
-            if (dmax == 0) {
-//                handle_error(orig_dest, orig_dmax, "strcat_s: dest unterminated", ESUNTERM); 
-                return (ESUNTERM);
-            }
-        }
+    /* Find the end of dest */
+    while (*dest != '\0') {
 
-        while (dmax > 0) {
-            if (src == overlap_bumper) {
-//                handle_error(orig_dest, orig_dmax, "strcat_s: overlapping objects", ESOVRLP); 
-                return (ESOVRLP);
-            }
+      /*
+       * NOTE: no need to check for overlap here since src comes first
+       * in memory and we're not incrementing src here.
+       */
+      dest++;
+      dmax--;
+      if (dmax == 0) {
+//                handle_error(orig_dest, orig_dmax, "strcat_s: dest unterminated", ESUNTERM);
+        return (ESUNTERM);
+      }
+    }
 
-            *dest = *src;
-            if (*dest == '\0') {
-#ifdef SAFE_LIB_STR_NULL_SLACK
-                /* null slack to clear any data */
-                while (dmax) { *dest = '\0'; dmax--; dest++; }
-#endif 
-                return (EOK);
-            }
+    while (dmax > 0) {
+      if (src == overlap_bumper) {
+//                handle_error(orig_dest, orig_dmax, "strcat_s: overlapping objects", ESOVRLP);
+        return (ESOVRLP);
+      }
 
-            dmax--;
-            dest++;
-            src++;
+      *dest = *src;
+      if (*dest == '\0') {
+#ifdef SAFE_LIB_STR_NULL_SLACK
+        /* null slack to clear any data */
+        while (dmax) {
+          *dest = '\0';
+          dmax--;
+          dest++;
         }
-    } 
+#endif
+        return (EOK);
+      }
 
-    /*
-     * the entire src was not copied, so null the string 
-     */
-//    handle_error(orig_dest, orig_dmax, "strcat_s: not enough space for src", ESNOSPC); 
+      dmax--;
+      dest++;
+      src++;
+    }
+  }
 
-    return (ESNOSPC);
+  /*
+   * the entire src was not copied, so null the string
+   */
+//    handle_error(orig_dest, orig_dmax, "strcat_s: not enough space for src", ESNOSPC);
+
+  return (ESNOSPC);
 }
 
-static __inline int wels_strnlen_s(const char *dest, int dmax)
-{
-    int count;
+static __inline int wels_strnlen_s (const char* dest, int dmax) {
+  int count;
 
-    if (dest == NULL) {
-        return (0);
-    }
+  if (dest == NULL) {
+    return (0);
+  }
 
-    if (dmax <= 0) { 
-//        invoke_safe_lib_constraint_handler("strnlen_s: dmax is 0", 
+  if (dmax <= 0) {
+//        invoke_safe_lib_constraint_handler("strnlen_s: dmax is 0",
 //                   NULL, ESZEROL);
-        return (0);
-    }
+    return (0);
+  }
 
 //    if (dmax > RSIZE_MAX_STR) {
-//        invoke_safe_lib_constraint_handler("strnlen_s: dmax exceeds max", 
+//        invoke_safe_lib_constraint_handler("strnlen_s: dmax exceeds max",
 //                   NULL, ESLEMAX);
 //        return (0);
 //    }
 
-    count = 0;
-    while (*dest && dmax) {
-        count++;
-        dmax--;
-        dest++;
-    }
+  count = 0;
+  while (*dest && dmax) {
+    count++;
+    dmax--;
+    dest++;
+  }
 
-    return (count);
+  return (count);
 }
 
 #endif//(WIN32 && _MSC_VER && _MSC_VER<1500) || __GNUC__
--- a/codec/encoder/core/inc/deblocking.h
+++ b/codec/encoder/core/inc/deblocking.h
@@ -50,34 +50,38 @@
 //struct tagDeblockingFunc;
 
 typedef struct TagDeblockingFilter {
-	uint8_t		*pCsData[3];	// pointer to reconstructed picture pData
-	int32_t		iCsStride[3];	// Cs iStride
-	int16_t     iMbStride;	
-	int8_t		iSliceAlphaC0Offset;
-	int8_t		iSliceBetaOffset;
-	uint8_t     uiLumaQP;
-	uint8_t     uiChromaQP;
-	uint8_t     uiFilterIdc;
-	uint8_t     uiReserved;
-}SDeblockingFilter;
+uint8_t*		pCsData[3];	// pointer to reconstructed picture pData
+int32_t		iCsStride[3];	// Cs iStride
+int16_t     iMbStride;
+int8_t		iSliceAlphaC0Offset;
+int8_t		iSliceBetaOffset;
+uint8_t     uiLumaQP;
+uint8_t     uiChromaQP;
+uint8_t     uiFilterIdc;
+uint8_t     uiReserved;
+} SDeblockingFilter;
 
-void DeblockLumaLt4_c( uint8_t *pPixY, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta, int8_t *pTc );
-void DeblockLumaEq4_c( uint8_t *pPixY, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta );
-void DeblockChromaLt4_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta, int8_t *pTc );
-void DeblockChromaEq4_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta );
+void DeblockLumaLt4_c (uint8_t* pPixY, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4_c (uint8_t* pPixY, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaLt4_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha,
+                         int32_t iBeta, int8_t* pTc);
+void DeblockChromaEq4_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha,
+                         int32_t iBeta);
 
 
-void DeblockLumaLt4V_c( uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTc );
-void DeblockLumaEq4V_c( uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta );
+void DeblockLumaLt4V_c (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4V_c (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
 
-void DeblockLumaLt4H_c( uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTc );
-void DeblockLumaEq4H_c( uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta );
+void DeblockLumaLt4H_c (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4H_c (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
 
-void DeblockChromaLt4V_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTc );
-void DeblockChromaEq4V_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta );
+void DeblockChromaLt4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+                          int8_t* pTc);
+void DeblockChromaEq4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
 
-void DeblockChromaLt4H_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTc );
-void DeblockChromaEq4H_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta );
+void DeblockChromaLt4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+                          int8_t* pTc);
+void DeblockChromaEq4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
 
 #if defined(__cplusplus)
 extern "C" {
@@ -84,16 +88,18 @@
 #endif//__cplusplus
 
 #ifdef  X86_ASM
-void DeblockLumaLt4V_sse2( uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTc );
-void DeblockLumaEq4V_sse2( uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta );
-void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
-void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
-void DeblockLumaLt4H_sse2(uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTc);
-void DeblockLumaEq4H_sse2(uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+void DeblockLumaLt4V_sse2 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4V_sse2 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockLumaTransposeH2V_sse2 (uint8_t* pPixY, int32_t iStride, uint8_t* pDst);
+void DeblockLumaTransposeV2H_sse2 (uint8_t* pPixY, int32_t iStride, uint8_t* pSrc);
+void DeblockLumaLt4H_sse2 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4H_sse2 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaEq4V_sse2 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaLt4V_sse2 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+                             int8_t* pTC);
+void DeblockChromaEq4H_sse2 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaLt4H_sse2 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+                             int8_t* pTC);
 #endif
 
 #if defined(__cplusplus)
@@ -100,16 +106,16 @@
 }
 #endif//__cplusplus
 
-void DeblockingInit( DeblockingFunc  * pFunc,  int32_t iCpu );
+void DeblockingInit (DeblockingFunc*   pFunc,  int32_t iCpu);
 
-void WelsNonZeroCount_c(int8_t * pNonZeroCount);
-void WelsBlockFuncInit(PSetNoneZeroCountZeroFunc *pfSetNZCZero,  int32_t iCpu);
+void WelsNonZeroCount_c (int8_t* pNonZeroCount);
+void WelsBlockFuncInit (PSetNoneZeroCountZeroFunc* pfSetNZCZero,  int32_t iCpu);
 
-void PerformDeblockingFilter( sWelsEncCtx *pEnc );
+void PerformDeblockingFilter (sWelsEncCtx* pEnc);
 
-void DeblockingFilterFrameAvcbase( SDqLayer *pCurDq, SWelsFuncPtrList *pFunc );
+void DeblockingFilterFrameAvcbase (SDqLayer* pCurDq, SWelsFuncPtrList* pFunc);
 
-void DeblockingFilterSliceAvcbase( SDqLayer *pCurDq, SWelsFuncPtrList *pFunc, const int32_t kiSliceIdx );
+void DeblockingFilterSliceAvcbase (SDqLayer* pCurDq, SWelsFuncPtrList* pFunc, const int32_t kiSliceIdx);
 }
 
 #endif
--- a/codec/encoder/core/inc/decode_mb_aux.h
+++ b/codec/encoder/core/inc/decode_mb_aux.h
@@ -38,21 +38,22 @@
 #include "wels_func_ptr_def.h"
 
 namespace WelsSVCEnc {
-void WelsDequantLumaDc4x4(int16_t *pRes, const int32_t kiQp);
-void WelsIHadamard4x4Dc(int16_t* pRes);
+void WelsDequantLumaDc4x4 (int16_t* pRes, const int32_t kiQp);
+void WelsIHadamard4x4Dc (int16_t* pRes);
 
-void WelsInitReconstructionFuncs( SWelsFuncPtrList *pList, uint32_t  iCpuFlags );
-void WelsGetEncBlockStrideOffset(int32_t *pBlock, const int32_t kiStrideY, const int32_t kiStrideUV);
+void WelsInitReconstructionFuncs (SWelsFuncPtrList* pList, uint32_t  iCpuFlags);
+void WelsGetEncBlockStrideOffset (int32_t* pBlock, const int32_t kiStrideY, const int32_t kiStrideUV);
 
-void WelsDequantFour4x4_c(int16_t *pRes, const uint16_t* kpQpTable);
-void WelsDequant4x4_c(int16_t *pRes, const uint16_t* kpQpTable);
-void WelsDequantIHadamard4x4_c(int16_t *pRes, const uint16_t kuiMF);
-void WelsDequantIHadamard2x2Dc( int16_t* pDct, const uint16_t kuiMF);
+void WelsDequantFour4x4_c (int16_t* pRes, const uint16_t* kpQpTable);
+void WelsDequant4x4_c (int16_t* pRes, const uint16_t* kpQpTable);
+void WelsDequantIHadamard4x4_c (int16_t* pRes, const uint16_t kuiMF);
+void WelsDequantIHadamard2x2Dc (int16_t* pDct, const uint16_t kuiMF);
 
-void WelsIDctT4RecOnMb(uint8_t* pDst, int32_t iDstStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct, PIDctFunc pfIDctFourT4);
-void WelsIDctT4Rec_c( uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct );
-void WelsIDctFourT4Rec_c( uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct );
-void WelsIDctRecI16x16Dc_c(uint8_t *pRec, int32_t iStride, uint8_t *pPred, int32_t iPredStride, int16_t *pDctDc);
+void WelsIDctT4RecOnMb (uint8_t* pDst, int32_t iDstStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct,
+                        PIDctFunc pfIDctFourT4);
+void WelsIDctT4Rec_c (uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct);
+void WelsIDctFourT4Rec_c (uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct);
+void WelsIDctRecI16x16Dc_c (uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDctDc);
 
 #if defined(__cplusplus)
 extern "C" {
@@ -59,13 +60,14 @@
 #endif//__cplusplus
 
 #if defined(X86_ASM)
-void WelsDequant4x4_sse2(int16_t *pDct, const uint16_t* kpMF);
-void WelsDequantFour4x4_sse2(int16_t *pDct, const uint16_t* kpMF);
-void WelsDequantIHadamard4x4_sse2(int16_t *pRes, const uint16_t kuiMF);
+void WelsDequant4x4_sse2 (int16_t* pDct, const uint16_t* kpMF);
+void WelsDequantFour4x4_sse2 (int16_t* pDct, const uint16_t* kpMF);
+void WelsDequantIHadamard4x4_sse2 (int16_t* pRes, const uint16_t kuiMF);
 
-void WelsIDctT4Rec_mmx( uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct );
-void WelsIDctFourT4Rec_sse2( uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct );
-void WelsIDctRecI16x16Dc_sse2(uint8_t *pRec, int32_t iStride, uint8_t *pPrediction, int32_t iPredStride, int16_t *pDctDc);
+void WelsIDctT4Rec_mmx (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
+void WelsIDctFourT4Rec_sse2 (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
+void WelsIDctRecI16x16Dc_sse2 (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride,
+                               int16_t* pDctDc);
 #endif//X86_ASM
 
 #if defined(__cplusplus)
--- a/codec/encoder/core/inc/dq_map.h
+++ b/codec/encoder/core/inc/dq_map.h
@@ -46,11 +46,10 @@
  *	Dependency Quality IDC
  */
 
-typedef struct TagDqIdc
-{
-	uint16_t	iPpsId;			// pPps id
-	uint8_t	iSpsId;			// pSps id
-	int8_t		uiSpatialId;	// spatial id
-}SDqIdc;
+typedef struct TagDqIdc {
+  uint16_t	iPpsId;			// pPps id
+  uint8_t	iSpsId;			// pSps id
+  int8_t		uiSpatialId;	// spatial id
+} SDqIdc;
 
 #endif//WELS_ENCODER_DEPENDENCY_QUAILITY_IDC_MAP_H__
--- a/codec/encoder/core/inc/encode_mb_aux.h
+++ b/codec/encoder/core/inc/encode_mb_aux.h
@@ -37,45 +37,45 @@
 #include "wels_func_ptr_def.h"
 
 namespace WelsSVCEnc {
-void WelsInitEncodingFuncs( SWelsFuncPtrList *pFuncList, uint32_t  uiCpuFlag );
-int32_t WelsGetNoneZeroCount_c(int16_t* pLevel);
+void WelsInitEncodingFuncs (SWelsFuncPtrList* pFuncList, uint32_t  uiCpuFlag);
+int32_t WelsGetNoneZeroCount_c (int16_t* pLevel);
 
 /****************************************************************************
  * Scan and Score functions
  ****************************************************************************/
-void	WelsScan4x4Ac_c( int16_t* pZigValue, int16_t* pDct );
-void	WelsScan4x4Dc( int16_t* pLevel, int16_t* pDct );
-void	WelsScan4x4DcAc_c( int16_t* pLevel, int16_t *pDct );
-int32_t		WelsCalculateSingleCtr4x4_c( int16_t *pDct);
+void	WelsScan4x4Ac_c (int16_t* pZigValue, int16_t* pDct);
+void	WelsScan4x4Dc (int16_t* pLevel, int16_t* pDct);
+void	WelsScan4x4DcAc_c (int16_t* pLevel, int16_t* pDct);
+int32_t		WelsCalculateSingleCtr4x4_c (int16_t* pDct);
 
 /****************************************************************************
- * HDM and Quant functions 
+ * HDM and Quant functions
  ****************************************************************************/
-void WelsHadamardT4Dc_c( int16_t *pLumaDc, int16_t *pDct);
-int32_t WelsHadamardQuant2x2_c(int16_t *pRes, const int16_t kiFF, int16_t iMF, int16_t * pDct, int16_t * pBlock);
-int32_t WelsHadamardQuant2x2Skip_c(int16_t *pRes, int16_t iFF,  int16_t iMF);
+void WelsHadamardT4Dc_c (int16_t* pLumaDc, int16_t* pDct);
+int32_t WelsHadamardQuant2x2_c (int16_t* pRes, const int16_t kiFF, int16_t iMF, int16_t* pDct, int16_t* pBlock);
+int32_t WelsHadamardQuant2x2Skip_c (int16_t* pRes, int16_t iFF,  int16_t iMF);
 
-void WelsQuant4x4_c(int16_t *pDct, int16_t* pFF,  int16_t *pMF);
-void WelsQuant4x4Dc_c(int16_t *pDct, int16_t iFF,  int16_t iMF);
-void WelsQuantFour4x4_c(int16_t *pDct, int16_t* pFF,  int16_t *pQpTable);
-void WelsQuantFour4x4Max_c(int16_t *pDct, int16_t* pF,  int16_t *pQpTable, int16_t *pMax);
+void WelsQuant4x4_c (int16_t* pDct, int16_t* pFF,  int16_t* pMF);
+void WelsQuant4x4Dc_c (int16_t* pDct, int16_t iFF,  int16_t iMF);
+void WelsQuantFour4x4_c (int16_t* pDct, int16_t* pFF,  int16_t* pQpTable);
+void WelsQuantFour4x4Max_c (int16_t* pDct, int16_t* pF,  int16_t* pQpTable, int16_t* pMax);
 
 
 /****************************************************************************
  * DCT functions
  ****************************************************************************/
-void WelsDctT4_c( int16_t *pDct, uint8_t *pPixel1, int32_t iStride1, uint8_t *pPixel2, int32_t iStride2 );
+void WelsDctT4_c (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
 // dct_data is no-use here, just for the same interface with dct_save functions
-void WelsDctFourT4_c(int16_t *pDct, uint8_t *pPixel1, int32_t iStride1, uint8_t *pPixel2, int32_t iStride2);
+void WelsDctFourT4_c (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
 
 /****************************************************************************
  * Copy functions
  ****************************************************************************/
-void WelsCopy4x4( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS );
-void WelsCopy8x8_c( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS );
-void WelsCopy8x16_c( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS );	// 
-void WelsCopy16x8_c( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS );	// 
-void WelsCopy16x16_c( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS );
+void WelsCopy4x4 (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
+void WelsCopy8x8_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
+void WelsCopy8x16_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);	//
+void WelsCopy16x8_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);	//
+void WelsCopy16x16_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
 
 #if defined(__cplusplus)
 extern "C" {
@@ -83,43 +83,43 @@
 
 #ifdef X86_ASM
 
-int32_t WelsGetNoneZeroCount_sse2(int16_t* pLevel);
+int32_t WelsGetNoneZeroCount_sse2 (int16_t* pLevel);
 
 /****************************************************************************
  * Scan and Score functions
  ****************************************************************************/
-void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct );
-void WelsScan4x4DcAc_ssse3( int16_t* pLevel, int16_t *pDct );
-void WelsScan4x4DcAc_sse2( int16_t* pLevel, int16_t *pDct );
-int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
+void WelsScan4x4Ac_sse2 (int16_t* zig_value, int16_t* pDct);
+void WelsScan4x4DcAc_ssse3 (int16_t* pLevel, int16_t* pDct);
+void WelsScan4x4DcAc_sse2 (int16_t* pLevel, int16_t* pDct);
+int32_t WelsCalculateSingleCtr4x4_sse2 (int16_t* pDct);
 
 /****************************************************************************
  * DCT functions
  ****************************************************************************/
-void WelsDctT4_mmx( int16_t *pDct,  uint8_t *pPixel1, int32_t iStride1, uint8_t *pPixel2, int32_t iStride2 );
-void WelsDctFourT4_sse2(int16_t *pDct, uint8_t *pPixel1, int32_t iStride1, uint8_t *pPixel2, int32_t iStride2);
+void WelsDctT4_mmx (int16_t* pDct,  uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
+void WelsDctFourT4_sse2 (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
 
 /****************************************************************************
- * HDM and Quant functions 
+ * HDM and Quant functions
  ****************************************************************************/
-int32_t WelsHadamardQuant2x2_mmx(int16_t *pRes, const int16_t kiFF, int16_t iMF, int16_t * pDct, int16_t * pBlock);
-void WelsHadamardT4Dc_sse2( int16_t *pLumaDc, int16_t *pDct);
-int32_t WelsHadamardQuant2x2Skip_mmx(int16_t *pRes, int16_t iFF,  int16_t iMF);
+int32_t WelsHadamardQuant2x2_mmx (int16_t* pRes, const int16_t kiFF, int16_t iMF, int16_t* pDct, int16_t* pBlock);
+void WelsHadamardT4Dc_sse2 (int16_t* pLumaDc, int16_t* pDct);
+int32_t WelsHadamardQuant2x2Skip_mmx (int16_t* pRes, int16_t iFF,  int16_t iMF);
 
-void WelsQuant4x4_sse2(int16_t *pDct, int16_t* pFF,  int16_t *pMF);
-void WelsQuant4x4Dc_sse2(int16_t *pDct,  int16_t iFF, int16_t iMF);
-void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* pFF,  int16_t *pMF);
-void WelsQuantFour4x4Max_sse2(int16_t *pDct, int16_t* pFF,  int16_t *pMF, int16_t *pMax);
+void WelsQuant4x4_sse2 (int16_t* pDct, int16_t* pFF,  int16_t* pMF);
+void WelsQuant4x4Dc_sse2 (int16_t* pDct,  int16_t iFF, int16_t iMF);
+void WelsQuantFour4x4_sse2 (int16_t* pDct, int16_t* pFF,  int16_t* pMF);
+void WelsQuantFour4x4Max_sse2 (int16_t* pDct, int16_t* pFF,  int16_t* pMF, int16_t* pMax);
 
 
 /****************************************************************************
  * Copy functions for rec
  ****************************************************************************/
-void WelsCopy8x8_mmx( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS );
-void WelsCopy8x16_mmx( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS );	
-void WelsCopy16x8NotAligned_sse2( uint8_t* Dst, int32_t  iStrideD, uint8_t* Src,int32_t  iStrideS );	
-void WelsCopy16x16_sse2( uint8_t* Dst, int32_t  iStrideD, uint8_t* Src,int32_t  iStrideS );
-void WelsCopy16x16NotAligned_sse2( uint8_t* Dst, int32_t  iStrideD, uint8_t* Src,int32_t  iStrideS );
+void WelsCopy8x8_mmx (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
+void WelsCopy8x16_mmx (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
+void WelsCopy16x8NotAligned_sse2 (uint8_t* Dst, int32_t  iStrideD, uint8_t* Src, int32_t  iStrideS);
+void WelsCopy16x16_sse2 (uint8_t* Dst, int32_t  iStrideD, uint8_t* Src, int32_t  iStrideS);
+void WelsCopy16x16NotAligned_sse2 (uint8_t* Dst, int32_t  iStrideD, uint8_t* Src, int32_t  iStrideS);
 #endif
 
 
@@ -127,8 +127,8 @@
 }
 #endif//__cplusplus
 
-__align16(extern int16_t, g_kiQuantInterFF[58][8] );
+__align16 (extern int16_t, g_kiQuantInterFF[58][8]);
 #define g_iQuantIntraFF (g_kiQuantInterFF +6 )
-__align16(extern int16_t, g_kiQuantMF[52][8]) ;
+__align16 (extern int16_t, g_kiQuantMF[52][8]) ;
 }
 #endif//ENCODE_MB_AUX_H
--- a/codec/encoder/core/inc/encoder.h
+++ b/codec/encoder/core/inc/encoder.h
@@ -48,7 +48,7 @@
  * \param	pEncCtx		sWelsEncCtx*
  * \return	successful - 0; otherwise none 0 for failed
  */
-int32_t RequestMemorySvc( sWelsEncCtx **ppCtx );
+int32_t RequestMemorySvc (sWelsEncCtx** ppCtx);
 
 /*!
  * \brief	free memory	in SVC core encoder
@@ -55,7 +55,7 @@
  * \param	pEncCtx		sWelsEncCtx**
  * \return	none
  */
-void FreeMemorySvc( sWelsEncCtx **ppCtx);
+void FreeMemorySvc (sWelsEncCtx** ppCtx);
 
 /*!
  * \brief	initialize function pointers that potentially used in Wels encoding
@@ -62,10 +62,10 @@
  * \param	pEncCtx		sWelsEncCtx*
  * \return	successful - 0; otherwise none 0 for failed
  */
-int32_t InitFunctionPointers( SWelsFuncPtrList *pFuncList, SWelsSvcCodingParam *_param, uint32_t  uiCpuFlag );
+int32_t InitFunctionPointers (SWelsFuncPtrList* pFuncList, SWelsSvcCodingParam* _param, uint32_t  uiCpuFlag);
 
 ///*!
-// * \brief	decide frame type (IDR/P frame)	
+// * \brief	decide frame type (IDR/P frame)
 // * \param	uiFrameType	frame type output
 // * \param	frame_idx	frame index elapsed currently
 // * \param	idr			IDR interval
@@ -72,21 +72,21 @@
 // * \return	successful - 0; otherwise none 0 for failed
 // */
 /*!
- * \brief	initialize frame coding	
+ * \brief	initialize frame coding
  */
-void InitFrameCoding( sWelsEncCtx *pEncCtx, const EFrameType keFrameType );
+void InitFrameCoding (sWelsEncCtx* pEncCtx, const EFrameType keFrameType);
 
-EFrameType DecideFrameType( sWelsEncCtx *pEncCtx, const int8_t kiSpatialNum );
+EFrameType DecideFrameType (sWelsEncCtx* pEncCtx, const int8_t kiSpatialNum);
 /*!
  * \brief	Dump reconstruction for dependency layer
  */
 
-extern "C" void DumpDependencyRec( SPicture *pSrcPic, const str_t *kpFileName, const int8_t kiDid );
+extern "C" void DumpDependencyRec (SPicture* pSrcPic, const str_t* kpFileName, const int8_t kiDid);
 
 /*!
  * \brief	Dump the reconstruction pictures
  */
-void DumpRecFrame( SPicture *pSrcPic, const str_t *kpFileName );
+void DumpRecFrame (SPicture* pSrcPic, const str_t* kpFileName);
 
 
 /*!
@@ -97,16 +97,16 @@
  * \param	nal_idc				EWelsNalRefIdc for a frame
  * \return	successful - 0; otherwise none 0 for failed
  */
-int32_t EncodeFrame(	sWelsEncCtx *pEncCtx,
-					const int32_t kiSliceNumCount,
-					const EWelsNalUnitType keNalType,
-					const EWelsNalRefIdc keNalIdc	);
+int32_t EncodeFrame (sWelsEncCtx* pEncCtx,
+                     const int32_t kiSliceNumCount,
+                     const EWelsNalUnitType keNalType,
+                     const EWelsNalRefIdc keNalIdc);
 
 
 /**********************************************************************************
- * memzero Function 
+ * memzero Function
 ***********************************************************************************/
-void WelsSetMemZero_c(void *pDst, int32_t iSize);	// confirmed_safe_unsafe_usage
+void WelsSetMemZero_c (void* pDst, int32_t iSize);	// confirmed_safe_unsafe_usage
 
 #if defined(__cplusplus)
 extern "C" {
@@ -113,10 +113,10 @@
 #endif//__cplusplus
 
 #ifdef X86_ASM
-void WelsSetMemZeroAligned64_sse2(void *pDst, int32_t iSize);
-void WelsSetMemZeroSize64_mmx(void *pDst, int32_t iSize);
-void WelsSetMemZeroSize8_mmx(void *pDst, int32_t iSize);
-void WelsPrefetchZero_mmx(int8_t const*kpDst);
+void WelsSetMemZeroAligned64_sse2 (void* pDst, int32_t iSize);
+void WelsSetMemZeroSize64_mmx (void* pDst, int32_t iSize);
+void WelsSetMemZeroSize8_mmx (void* pDst, int32_t iSize);
+void WelsPrefetchZero_mmx (int8_t const* kpDst);
 #endif
 
 #if defined(__cplusplus)
--- a/codec/encoder/core/inc/encoder_context.h
+++ b/codec/encoder/core/inc/encoder_context.h
@@ -64,159 +64,167 @@
  *	reference list for each quality layer in SVC
  */
 typedef struct TagRefList {
-	SPicture					*pShortRefList[1+MAX_SHORT_REF_COUNT];// reference list 0 - int16_t
-	SPicture					*pLongRefList[1+MAX_LONG_REF_COUNT];	// reference list 1 - int32_t
-	SPicture					*pNextBuffer;
-	SPicture					*pRef[1+MAX_REF_PIC_COUNT];	// plus 1 for swap intend
-	uint8_t						uiShortRefCount;
-	uint8_t						uiLongRefCount;	// dependend on pRef pic module
+  SPicture*					pShortRefList[1 + MAX_SHORT_REF_COUNT]; // reference list 0 - short term
+  SPicture*					pLongRefList[1 + MAX_LONG_REF_COUNT];	// reference list 1 - long term
+  SPicture*					pNextBuffer;
+  SPicture*					pRef[1 + MAX_REF_PIC_COUNT];	// plus 1 for swap intend
+  uint8_t						uiShortRefCount;
+  uint8_t						uiLongRefCount;	// dependent on pRef pic module
 } SRefList;
 
-typedef struct TagLTRState{	
-	// LTR mark feedback
-	uint32_t		    		uiLtrMarkState;	// LTR mark state, indicate whether there is a LTR mark feedback unsolved
-	int32_t						iLtrMarkFbFrameNum;// the unsolved LTR mark feedback, the marked iFrameNum feedback from decoder
+typedef struct TagLTRState {
+  // LTR mark feedback
+  uint32_t		    		uiLtrMarkState;	// LTR mark state, indicate whether there is a LTR mark feedback unsolved
+  int32_t						iLtrMarkFbFrameNum;// the unsolved LTR mark feedback, the marked iFrameNum feedback from decoder
 
-	// LTR used as recovery reference
-	int32_t						iLastRecoverFrameNum; // reserve the last LTR or IDR recover iFrameNum
-	int32_t						iLastCorFrameNumDec; // reserved the last correct position in decoder side, use to select valid LTR to recover or to decide the LTR mark validation
-	int32_t						iCurFrameNumInDec; // current iFrameNum in decoder side, use to select valid LTR to recover or to decide the LTR mark validation
+  // LTR used as recovery reference
+  int32_t						iLastRecoverFrameNum; // reserve the last LTR or IDR recover iFrameNum
+  int32_t
+  iLastCorFrameNumDec; // reserved the last correct position in decoder side, use to select valid LTR to recover or to decide the LTR mark validation
+  int32_t
+  iCurFrameNumInDec; // current iFrameNum in decoder side, use to select valid LTR to recover or to decide the LTR mark validation
 
-	// LTR mark
-	int32_t						iLTRMarkMode; // direct mark or delay mark
-	int32_t						iLTRMarkSuccessNum; //successful marked num, for mark mode switch
-	int32_t						iCurLtrIdx;// current int32_t term reference index to mark
-	int32_t						iLastLtrIdx;
-	uint32_t					uiLtrMarkInterval;// the interval from the last int32_t term pRef mark	
-		
-	bool_t						bLTRMarkingFlag;	//decide whether current frame marked as LTR
-	bool_t						bLTRMarkEnable; //when LTR is confirmed and the interval is no smaller than the marking period
-	bool_t						bReceivedT0LostFlag;	// indicate whether a t0 lost feedback is recieved, for LTR recovery
-}SLTRState;
+  // LTR mark
+  int32_t						iLTRMarkMode; // direct mark or delay mark
+  int32_t						iLTRMarkSuccessNum; //successful marked num, for mark mode switch
+  int32_t						iCurLtrIdx;// current long term reference index to mark
+  int32_t						iLastLtrIdx;
+  uint32_t					uiLtrMarkInterval;// the interval from the last long term pRef mark
 
-typedef struct TagSpatialPicIndex{
-	SPicture	*pSrc;	// I420 based and after color space converted
-	int32_t		iDid;	// dependency id
+  bool_t						bLTRMarkingFlag;	//decide whether current frame marked as LTR
+  bool_t						bLTRMarkEnable; //when LTR is confirmed and the interval is no smaller than the marking period
+  bool_t						bReceivedT0LostFlag;	// indicate whether a t0 lost feedback is received, for LTR recovery
+} SLTRState;
+
+typedef struct TagSpatialPicIndex {
+  SPicture*	pSrc;	// I420 based and after color space converted
+  int32_t		iDid;	// dependency id
 } SSpatialPicIndex;
 
 typedef struct TagStrideTables {
-	int32_t		*pStrideDecBlockOffset[MAX_DEPENDENCY_LAYER][2];	// [iDid][tid==0][24 x 4]: luma+chroma= 24 x 4
-	int32_t		*pStrideEncBlockOffset[MAX_DEPENDENCY_LAYER];		// [iDid][24 x 4]: luma+chroma= 24 x 4
-	int16_t		*pMbIndexX[MAX_DEPENDENCY_LAYER];					// [iDid][iMbX]: map for iMbX in each spatial layer coding
-	int16_t		*pMbIndexY[MAX_DEPENDENCY_LAYER];					// [iDid][iMbY]: map for iMbY in each spatial layer coding
+  int32_t*		pStrideDecBlockOffset[MAX_DEPENDENCY_LAYER][2];	// [iDid][tid==0][24 x 4]: luma+chroma= 24 x 4
+  int32_t*		pStrideEncBlockOffset[MAX_DEPENDENCY_LAYER];		// [iDid][24 x 4]: luma+chroma= 24 x 4
+  int16_t*		pMbIndexX[MAX_DEPENDENCY_LAYER];					// [iDid][iMbX]: map for iMbX in each spatial layer coding
+  int16_t*		pMbIndexY[MAX_DEPENDENCY_LAYER];					// [iDid][iMbY]: map for iMbY in each spatial layer coding
 } SStrideTables;
 
-typedef struct TagWelsEncCtx{
-	// Input	
-	SWelsSvcCodingParam		*pSvcParam;	// SVC parameter, WelsSVCParamConfig in svc_param_settings.h		
-	SWelsSliceBs			 *pSliceBs;		// bitstream buffering for various slices, [uiSliceIdx]	
+typedef struct TagWelsEncCtx {
+  // Input
+  SWelsSvcCodingParam*		pSvcParam;	// SVC parameter, WelsSVCParamConfig in svc_param_settings.h
+  SWelsSliceBs*		 	pSliceBs;		// bitstream buffering for various slices, [uiSliceIdx]
 
-	int32_t					*pSadCostMb;
-	/* MVD cost tables for Inter MB */
-	uint16_t					*pMvdCostTableInter; //[52];	// adaptive to spatial layers
-	SMVUnitXY					*pMvUnitBlock4x4;	// (*pMvUnitBlock4x4[2])[MB_BLOCK4x4_NUM];	    // for store each 4x4 blocks' mv unit, the two swap after different d layer
-	int8_t						*pRefIndexBlock4x4;	// (*pRefIndexBlock4x4[2])[MB_BLOCK8x8_NUM];	    // for store each 4x4 blocks' pRef index, the two swap after different d layer
-	int8_t                      *pNonZeroCountBlocks;	// (*pNonZeroCountBlocks)[MB_LUMA_CHROMA_BLOCK4x4_NUM];
-	int8_t                      *pIntra4x4PredModeBlocks;	// (*pIntra4x4PredModeBlocks)[INTRA_4x4_MODE_NUM];  //last byte is not used; the first 4 byte is for the bottom 12,13,14,15 4x4 block intra mode, and 3 byte for (3,7,11)
-	
-	SMB                          **ppMbListD;	// [MAX_DEPENDENCY_LAYER];
-	SStrideTables				*pStrideTab;	// stride tables for internal coding used
-	SWelsFuncPtrList			*pFuncList;
+  int32_t*					pSadCostMb;
+  /* MVD cost tables for Inter MB */
+  uint16_t*					pMvdCostTableInter; //[52];	// adaptive to spatial layers
+  SMVUnitXY*
+  pMvUnitBlock4x4;	// (*pMvUnitBlock4x4[2])[MB_BLOCK4x4_NUM];	    // for store each 4x4 blocks' mv unit, the two swap after different d layer
+  int8_t*
+  pRefIndexBlock4x4;	// (*pRefIndexBlock4x4[2])[MB_BLOCK8x8_NUM];	    // for store each 4x4 blocks' pRef index, the two swap after different d layer
+  int8_t*                      pNonZeroCountBlocks;	// (*pNonZeroCountBlocks)[MB_LUMA_CHROMA_BLOCK4x4_NUM];
+  int8_t*
+  pIntra4x4PredModeBlocks;	// (*pIntra4x4PredModeBlocks)[INTRA_4x4_MODE_NUM];  //last byte is not used; the first 4 byte is for the bottom 12,13,14,15 4x4 block intra mode, and 3 byte for (3,7,11)
 
+  SMB**                          ppMbListD;	// [MAX_DEPENDENCY_LAYER];
+  SStrideTables*				pStrideTab;	// stride tables for internal coding used
+  SWelsFuncPtrList*			pFuncList;
+
 #if defined(MT_ENABLED)
-	SSliceThreading				*pSliceThreading;
+  SSliceThreading*				pSliceThreading;
 #endif//MT_ENABLED
 
-	// SSlice context
-	SSliceCtx				*pSliceCtxList;// slice context table for each dependency quality layer
-	// pointers
-	SPicture					*pEncPic;			// pointer to current picture to be encoded
-	SPicture					*pDecPic;			// pointer to current picture being reconstructed
-	SPicture					*pRefPic;			// pointer to current reference picture	
+  // SSlice context
+  SSliceCtx*				pSliceCtxList;// slice context table for each dependency quality layer
+  // pointers
+  SPicture*					pEncPic;			// pointer to current picture to be encoded
+  SPicture*					pDecPic;			// pointer to current picture being reconstructed
+  SPicture*					pRefPic;			// pointer to current reference picture
+
+  SDqLayer*
+  pCurDqLayer;				// DQ layer context used to being encoded currently, for reference base layer to refer: pCurDqLayer->pRefLayer if applicable
+  SDqLayer**					ppDqLayerList;			// overall DQ layers encoded for storage
+
+  SRefList**					ppRefPicListExt;		// reference picture list for SVC
+  SPicture*					pRefList0[16];
+  SLTRState*					pLtr;//[MAX_DEPENDENCY_LAYER];
+
+  // Derived
+  int32_t						iCodingIndex;
+  int32_t						iFrameIndex;			// count how many frames elapsed during coding context currently
+  uint32_t					uiFrameIdxRc;           //only for RC
+  int32_t						iFrameNum;				// current frame number coding
+  int32_t						iPOC;					// frame iPOC
+  EWelsSliceType				eSliceType;			// currently coding slice type
+  EWelsNalUnitType			eNalType;			// NAL type
+  EWelsNalRefIdc				eNalPriority;		// NAL_Reference_Idc currently
+  EWelsNalRefIdc				eLastNalPriority;	// NAL_Reference_Idc in last frame
+  uint8_t						iNumRef0;
 
-	SDqLayer					*pCurDqLayer;				// DQ layer context used to being encoded currently, for reference base layer to refer: pCurDqLayer->pRefLayer if applicable	
-	SDqLayer					**ppDqLayerList;			// overall DQ layers encoded for storage	
+  uint8_t						uiDependencyId;	// Idc of dependency layer to be coded
+  uint8_t						uiTemporalId;	// Idc of temporal layer to be coded
+  bool_t						bNeedPrefixNalFlag;	// whether add prefix nal
+  bool_t                      bEncCurFrmAsIdrFlag;
 
-	SRefList					**ppRefPicListExt;		// reference picture list for SVC
-	SPicture					*pRefList0[16];	
-	SLTRState					*pLtr;//[MAX_DEPENDENCY_LAYER];	
-	
-	// Derived
-	int32_t						iCodingIndex;
-	int32_t						iFrameIndex;			// count how many frames elapsed during coding context currently
-	uint32_t					uiFrameIdxRc;           //only for RC
-	int32_t						iFrameNum;				// current frame number coding
-	int32_t						iPOC;					// frame iPOC
-	EWelsSliceType				eSliceType;			// currently coding slice type
-	EWelsNalUnitType			eNalType;			// NAL type
-	EWelsNalRefIdc				eNalPriority;		// NAL_Reference_Idc currently
-	EWelsNalRefIdc				eLastNalPriority;	// NAL_Reference_Idc in last frame		
-	uint8_t						iNumRef0;	
+  // Rate control routine
+  SWelsSvcRc*					pWelsSvcRc;
+  int32_t						iSkipFrameFlag; //_GOM_RC_
+  int32_t						iGlobalQp;		// global qp
 
-	uint8_t						uiDependencyId;	// Idc of dependecy layer to be coded
-	uint8_t						uiTemporalId;	// Idc of temporal layer to be coded
-	bool_t						bNeedPrefixNalFlag;	// whether add prefix nal	
-	bool_t                      bEncCurFrmAsIdrFlag;  
+  // VAA
+  SVAAFrameInfo*		    	pVaa;		    // VAA information of reference
+  CWelsPreProcess*				pVpp;
 
-	// Rate control routine	
-	SWelsSvcRc					*pWelsSvcRc;
-	int32_t						iSkipFrameFlag; //_GOM_RC_
-	int32_t						iGlobalQp;		// global qp
+  SWelsSPS*							pSpsArray;		// MAX_SPS_COUNT by standard compatible
+  SWelsSPS*							pSps;
+  SWelsPPS*							pPPSArray;		// MAX_PPS_COUNT by standard compatible
+  SWelsPPS*							pPps;
+  /* SVC only */
+  SSubsetSps*					pSubsetArray;	// MAX_SPS_COUNT by standard compatible
+  SSubsetSps*					pSubsetSps;
+  int32_t						iSpsNum;	// number of pSps used
+  int32_t						iPpsNum;	// number of pPps used
 
-	// VAA	
-	SVAAFrameInfo			    *pVaa;		    // VAA information of reference
-	CWelsPreProcess				*pVpp;	
+  // Output
+  SWelsEncoderOutput*			pOut;			// for NAL raw pData (need allocating memory for sNalList internal)
+  uint8_t*						pFrameBs;		// restoring bitstream pBuffer of all NALs in a frame
+  int32_t						iFrameBsSize;	// count size of frame bs in bytes allocated
+  int32_t						iPosBsBuffer;	// current writing position of frame bs pBuffer
 
-	SWelsSPS							*pSpsArray;		// MAX_SPS_COUNT by standard compatible
-	SWelsSPS							*pSps;
-	SWelsPPS							*pPPSArray;		// MAX_PPS_COUNT by standard compatible
-	SWelsPPS							*pPps;
-	/* SVC only */
-	SSubsetSps					*pSubsetArray;	// MAX_SPS_COUNT by standard compatible
-	SSubsetSps					*pSubsetSps;
-	int32_t						iSpsNum;	// number of pSps used
-	int32_t						iPpsNum;	// number of pPps used
+  /* For Downsampling & VAA I420 based source pictures */
+  SPicture*					pSpatialPic[MAX_DEPENDENCY_LAYER][MAX_TEMPORAL_LEVEL + 1 +
+      LONG_TERM_REF_NUM];	// need memory requirement with total number of (log2(uiGopSize)+1+1+long_term_ref_num)
 
-	// Output
-	SWelsEncoderOutput			*pOut;			// for NAL raw pData (need allocating memory for sNalList internal)
-	uint8_t						*pFrameBs;		// restoring bitstream pBuffer of all NALs in a frame
-	int32_t						iFrameBsSize;	// count size of frame bs in bytes allocated
-	int32_t						iPosBsBuffer;	// current writing position of frame bs pBuffer
-	
-	/* For Downsampling & VAA I420 based source pictures */	
-	SPicture					*pSpatialPic[MAX_DEPENDENCY_LAYER][MAX_TEMPORAL_LEVEL+1+LONG_TERM_REF_NUM];	// need memory requirement with total number of (log2(uiGopSize)+1+1+long_term_ref_num)
+  SSpatialPicIndex			sSpatialIndexMap[MAX_DEPENDENCY_LAYER];
+  uint8_t						uiSpatialLayersInTemporal[MAX_DEPENDENCY_LAYER];
 
-	SSpatialPicIndex			sSpatialIndexMap[MAX_DEPENDENCY_LAYER];
-	uint8_t						uiSpatialLayersInTemporal[MAX_DEPENDENCY_LAYER];
+  uint8_t                     uiSpatialPicNum[MAX_DEPENDENCY_LAYER];
+  bool_t						bLongTermRefFlag[MAX_DEPENDENCY_LAYER][MAX_TEMPORAL_LEVEL + 1/*+LONG_TERM_REF_NUM*/];
 
-	uint8_t                     uiSpatialPicNum[MAX_DEPENDENCY_LAYER];
-    bool_t						bLongTermRefFlag[MAX_DEPENDENCY_LAYER][MAX_TEMPORAL_LEVEL+1/*+LONG_TERM_REF_NUM*/];
+  int16_t						iMaxSliceCount;// maximal count number of slices for all layers observation
+  int16_t						iActiveThreadsNum;	// number of threads active so far
 
-	int16_t						iMaxSliceCount;// maximal count number of slices for all layers observation
-	int16_t						iActiveThreadsNum;	// number of threads active so far
-	
-	/*
-	 * DQ layer idc map for svc encoding, might be a better scheme than that of design before,
-	 * can aware idc of referencing layer and that idc of successive layer to be coded
-	 */
-	/* SVC only */
-	SDqIdc						*pDqIdcMap;	// overall DQ map of full scalability in specific frame (All full D/T/Q layers involved)												// pDqIdcMap[dq_index] for each SDqIdc pData	
+  /*
+   * DQ layer idc map for svc encoding, might be a better scheme than that of design before,
+   * can aware idc of referencing layer and that idc of successive layer to be coded
+   */
+  /* SVC only */
+  SDqIdc*
+  pDqIdcMap;	// overall DQ map of full scalability in specific frame (All full D/T/Q layers involved)												// pDqIdcMap[dq_index] for each SDqIdc pData
 
-	SParaSetOffset				sPSOVector;	
-	CMemoryAlign				*pMemAlign;
+  SParaSetOffset				sPSOVector;
+  CMemoryAlign*				pMemAlign;
 
 #ifdef ENABLE_TRACE_FILE
-	FILE						*pFileLog;		// log file for wels encoder
-	uint32_t					uiSizeLog;		// size of log have been written in file
+  FILE*						pFileLog;		// log file for wels encoder
+  uint32_t					uiSizeLog;		// size of log have been written in file
 
 #endif//ENABLE_TRACE_FILE
 
-#if defined(STAT_OUTPUT)	
-	// overall stat pData, refer to SStatData in stat.h, in case avc to use stat[0][0]
-	SStatData					sStatData [ MAX_DEPENDENCY_LAYER ] [ MAX_QUALITY_LEVEL ];
-	SStatSliceInfo				sPerInfo;
+#if defined(STAT_OUTPUT)
+  // overall stat pData, refer to SStatData in stat.h, in case avc to use stat[0][0]
+  SStatData					sStatData [ MAX_DEPENDENCY_LAYER ] [ MAX_QUALITY_LEVEL ];
+  SStatSliceInfo				sPerInfo;
 #endif//STAT_OUTPUT	
 
-}sWelsEncCtx/*, *PWelsEncCtx*/;
+} sWelsEncCtx/*, *PWelsEncCtx*/;
 }
 #endif//sWelsEncCtx_H__
--- a/codec/encoder/core/inc/expand_pic.h
+++ b/codec/encoder/core/inc/expand_pic.h
@@ -44,9 +44,9 @@
 #include "picture.h"
 
 namespace WelsSVCEnc {
-typedef void (*PExpandPictureFunc)( uint8_t *pDst, const int32_t kiStride, const int32_t kiPicW, const int32_t kiPicH );
+typedef void (*PExpandPictureFunc) (uint8_t* pDst, const int32_t kiStride, const int32_t kiPicW, const int32_t kiPicH);
 
-void ExpandReferencingPicture( SPicture *pPic, PExpandPictureFunc pExpLuma, PExpandPictureFunc pExpChrom[2] );
+void ExpandReferencingPicture (SPicture* pPic, PExpandPictureFunc pExpLuma, PExpandPictureFunc pExpChrom[2]);
 
 #if defined(__cplusplus)
 extern "C" {
@@ -53,24 +53,24 @@
 #endif//__cplusplus
 
 #if defined(X86_ASM)
-void ExpandPictureLuma_sse2(	uint8_t *pDst,
-								const int32_t kiStride,
-								const int32_t kiPicW,
-								const int32_t kiPicH	);
-void ExpandPictureChromaAlign_sse2(	uint8_t *pDst,
-									const int32_t kiStride,
-									const int32_t kiPicW,
-									const int32_t kiPicH	);
-void ExpandPictureChromaUnalign_sse2(	uint8_t *pDst,
-									const int32_t kiStride,
-									const int32_t kiPicW,
-									const int32_t kiPicH	);
+void ExpandPictureLuma_sse2 (uint8_t* pDst,
+                             const int32_t kiStride,
+                             const int32_t kiPicW,
+                             const int32_t kiPicH);
+void ExpandPictureChromaAlign_sse2 (uint8_t* pDst,
+                                    const int32_t kiStride,
+                                    const int32_t kiPicW,
+                                    const int32_t kiPicH);
+void ExpandPictureChromaUnalign_sse2 (uint8_t* pDst,
+                                      const int32_t kiStride,
+                                      const int32_t kiPicW,
+                                      const int32_t kiPicH);
 #endif//X86_ASM
-	
+
 #if defined(__cplusplus)
 }
 #endif//__cplusplus
 
-void InitExpandPictureFunc( void *pL, const uint32_t kuiCPUFlags );
+void InitExpandPictureFunc (void* pL, const uint32_t kuiCPUFlags);
 }
 #endif
--- a/codec/encoder/core/inc/extern.h
+++ b/codec/encoder/core/inc/extern.h
@@ -55,7 +55,7 @@
  * \param	kiHeight	height of picture in pixels
  * \return	successful - 0; otherwise none 0 for failed
  */
-int32_t InitPic( const void *kpSrc, const int32_t kiCsp, const int32_t kiWidth, const int32_t kiHeight );
+int32_t InitPic (const void* kpSrc, const int32_t kiCsp, const int32_t kiWidth, const int32_t kiHeight);
 
 /*
  *	SVC core encoder external interfaces
@@ -66,10 +66,10 @@
  * \pParam	pParam		SWelsSvcCodingParam*
  * \return	successful - 0; otherwise none 0 for failed
  */
-int32_t ParamValidationExt( void *pParam );
+int32_t ParamValidationExt (void* pParam);
 
 // GOM based RC related for uiSliceNum decision
-void GomValidCheck(const int32_t kiMbWidth, const int32_t kiMbHeight, int32_t *pSliceNum);
+void GomValidCheck (const int32_t kiMbWidth, const int32_t kiMbHeight, int32_t* pSliceNum);
 
 /*!
  * \brief	initialize Wels avc encoder core library
@@ -77,7 +77,7 @@
  * \param	para		SWelsSvcCodingParam*
  * \return	successful - 0; otherwise none 0 for failed
  */
-int32_t WelsInitEncoderExt( sWelsEncCtx **ppCtx, SWelsSvcCodingParam *pPara );
+int32_t WelsInitEncoderExt (sWelsEncCtx** ppCtx, SWelsSvcCodingParam* pPara);
 
 /*!
  * \brief	uninitialize Wels encoder core library
@@ -84,7 +84,7 @@
  * \param	pEncCtx		sWelsEncCtx*
  * \return	none
  */
-void WelsUninitEncoderExt( sWelsEncCtx **ppCtx );
+void WelsUninitEncoderExt (sWelsEncCtx** ppCtx);
 
 /*!
  * \brief	core svc encoding process
@@ -97,22 +97,23 @@
  *						[NO in picture list case, YES in console aplication based]
  * \return	EFrameType (WELS_FRAME_TYPE_IDR/WELS_FRAME_TYPE_I/WELS_FRAME_TYPE_P)
  */
-int32_t WelsEncoderEncodeExt( sWelsEncCtx *, void *pDst, const SSourcePicture **kppSrcList, const int32_t kiConfiguredLayerNum );
+int32_t WelsEncoderEncodeExt (sWelsEncCtx*, void* pDst, const SSourcePicture** kppSrcList,
+                              const int32_t kiConfiguredLayerNum);
 
 /*
  * Force coding IDR as follows
  */
-int32_t ForceCodingIDR( sWelsEncCtx *pCtx );
+int32_t ForceCodingIDR (sWelsEncCtx* pCtx);
 
 /*!
  * \brief	Wels SVC encoder parameters adjustment
  *			SVC adjustment results in new requirement in memory blocks adjustment
  */
-int32_t WelsEncoderParamAdjust( sWelsEncCtx **ppCtx, SWelsSvcCodingParam *pNew );
+int32_t WelsEncoderParamAdjust (sWelsEncCtx** ppCtx, SWelsSvcCodingParam* pNew);
 
-int32_t FilterLTRRecoveryRequest(sWelsEncCtx *pCtx,SLTRRecoverRequest* pLTRRecoverRequest);
+int32_t FilterLTRRecoveryRequest (sWelsEncCtx* pCtx, SLTRRecoverRequest* pLTRRecoverRequest);
 
-void FilterLTRMarkingFeedback(sWelsEncCtx *pCtx,SLTRMarkingFeedback* pLTRMarkingFeedback);
+void FilterLTRMarkingFeedback (sWelsEncCtx* pCtx, SLTRMarkingFeedback* pLTRMarkingFeedback);
 }
 
 #endif//WELS_ENCODER_CALLBACK_H__
--- a/codec/encoder/core/inc/get_intra_predictor.h
+++ b/codec/encoder/core/inc/get_intra_predictor.h
@@ -45,42 +45,42 @@
 #include "wels_func_ptr_def.h"
 
 namespace WelsSVCEnc {
-void WelsI4x4LumaPredV_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsI4x4LumaPredH_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsI4x4LumaPredDc_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsI4x4LumaPredDcLeft_c(uint8_t *pPred, uint8_t *pRef,  const int32_t kiStride);
-void WelsI4x4LumaPredDcTop_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsI4x4LumaPredDcNA_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI4x4LumaPredV_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI4x4LumaPredH_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI4x4LumaPredDc_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI4x4LumaPredDcLeft_c (uint8_t* pPred, uint8_t* pRef,  const int32_t kiStride);
+void WelsI4x4LumaPredDcTop_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI4x4LumaPredDcNA_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 
-void WelsI4x4LumaPredDDL_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsI4x4LumaPredDDLTop_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsI4x4LumaPredDDR_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI4x4LumaPredDDL_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI4x4LumaPredDDLTop_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI4x4LumaPredDDR_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 
-void WelsI4x4LumaPredVR_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsI4x4LumaPredHD_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsI4x4LumaPredVL_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsI4x4LumaPredVLTop_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsI4x4LumaPredHU_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI4x4LumaPredVR_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI4x4LumaPredHD_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI4x4LumaPredVL_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI4x4LumaPredVLTop_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI4x4LumaPredHU_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 
 
-void WelsIChormaPredV_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsIChormaPredH_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsIChormaPredPlane_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsIChormaPredDc_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsIChormaPredDcLeft_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsIChormaPredDcTop_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsIChormaPredDcNA_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsIChormaPredV_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsIChormaPredH_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsIChormaPredPlane_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsIChormaPredDc_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsIChormaPredDcLeft_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsIChormaPredDcTop_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsIChormaPredDcNA_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 
-void WelsI16x16ChormaPredVer(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsI16x16ChormaPredHor(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI16x16ChormaPredVer (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI16x16ChormaPredHor (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 
-void WelsI16x16LumaPredV_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsI16x16LumaPredH_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsI16x16LumaPredPlane_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsI16x16LumaPredDc_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsI16x16LumaPredDcLeft_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsI16x16LumaPredDcTop_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsI16x16LumaPredDcNA_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI16x16LumaPredV_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI16x16LumaPredH_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI16x16LumaPredPlane_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI16x16LumaPredDc_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI16x16LumaPredDcLeft_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI16x16LumaPredDcTop_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI16x16LumaPredDcNA_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 
 #if defined(__cplusplus)
 extern "C" {
@@ -87,32 +87,32 @@
 #endif//__cplusplus
 
 #if defined(X86_ASM)
-void WelsFillingPred8to16_mmx( uint8_t *pPred, uint8_t *pValue );
-void WelsFillingPred8x2to16_mmx( uint8_t *pPred, uint8_t *pValue );
-void WelsFillingPred1to16_mmx( uint8_t *pPred, const uint8_t kuiValue );
-void WelsFillingPred8x2to16_sse2( uint8_t *pPred, uint8_t *pValue );
-void WelsFillingPred1to16_sse2( uint8_t *pPred, const uint8_t kuiValue );
+void WelsFillingPred8to16_mmx (uint8_t* pPred, uint8_t* pValue);
+void WelsFillingPred8x2to16_mmx (uint8_t* pPred, uint8_t* pValue);
+void WelsFillingPred1to16_mmx (uint8_t* pPred, const uint8_t kuiValue);
+void WelsFillingPred8x2to16_sse2 (uint8_t* pPred, uint8_t* pValue);
+void WelsFillingPred1to16_sse2 (uint8_t* pPred, const uint8_t kuiValue);
 
 //for intra-prediction ASM functions
-void WelsI16x16LumaPredV_sse2(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsI16x16LumaPredH_sse2(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsI16x16LumaPredDc_sse2(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsI16x16LumaPredPlane_sse2(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI16x16LumaPredV_sse2 (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI16x16LumaPredH_sse2 (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI16x16LumaPredDc_sse2 (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI16x16LumaPredPlane_sse2 (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 
-void WelsIChromaPredH_mmx(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsIChromaPredV_sse2(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsIChromaPredDc_sse2(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsIChromaPredPlane_sse2(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsIChromaPredH_mmx (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsIChromaPredV_sse2 (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsIChromaPredDc_sse2 (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsIChromaPredPlane_sse2 (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 
-void WelsI4x4LumaPredV_sse2(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsI4x4LumaPredH_sse2(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsI4x4LumaPredDc_sse2(uint8_t *pPred,uint8_t *pRef,const int32_t kiStride);
-void WelsI4x4LumaPredDDL_mmx(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsI4x4LumaPredDDR_mmx(uint8_t *pPred,uint8_t *pRef,const int32_t kiStride);
-void WelsI4x4LumaPredVR_mmx(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsI4x4LumaPredHD_mmx(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsI4x4LumaPredVL_mmx(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
-void WelsI4x4LumaPredHU_mmx(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI4x4LumaPredV_sse2 (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI4x4LumaPredH_sse2 (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI4x4LumaPredDc_sse2 (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI4x4LumaPredDDL_mmx (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI4x4LumaPredDDR_mmx (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI4x4LumaPredVR_mmx (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI4x4LumaPredHD_mmx (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI4x4LumaPredVL_mmx (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI4x4LumaPredHU_mmx (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 #endif//X86_ASM
 
 #if defined(__cplusplus)
@@ -119,8 +119,8 @@
 }
 #endif//__cplusplus
 
-void WelsInitFillingPredFuncs( const uint32_t kuiCpuFlag );
-void WelsInitIntraPredFuncs( SWelsFuncPtrList *pFuncList, const uint32_t kuiCpuFlag );
+void WelsInitFillingPredFuncs (const uint32_t kuiCpuFlag);
+void WelsInitIntraPredFuncs (SWelsFuncPtrList* pFuncList, const uint32_t kuiCpuFlag);
 
 //#pragma pack()
 }
--- a/codec/encoder/core/inc/ls_defines.h
+++ b/codec/encoder/core/inc/ls_defines.h
@@ -38,36 +38,40 @@
 
 #ifdef __GNUC__
 
-	struct tagUnaligned_64 { uint64_t l; } __attribute__((packed));
-	struct tagUnaligned_32 { uint32_t l; } __attribute__((packed));
-	struct tagUnaligned_16 { uint16_t l; } __attribute__((packed));
-	
-	#define LD16(a) (((struct tagUnaligned_16 *) (a))->l)
-	#define LD32(a) (((struct tagUnaligned_32 *) (a))->l)
-	//#define LD64(a) (((struct tagUnaligned_64 *) (a))->l)
-        inline uint64_t LD64(const void * a)
-		{
-			uint64_t v;
-			memcpy(&v, a, sizeof(v));	// confirmed_safe_unsafe_usage
-			return v;
-		}
-	//#define _USE_STRUCT_INT_CVT
+struct tagUnaligned_64 {
+uint64_t l;
+} __attribute__ ((packed));
+struct tagUnaligned_32 {
+uint32_t l;
+} __attribute__ ((packed));
+struct tagUnaligned_16 {
+uint16_t l;
+} __attribute__ ((packed));
+
+#define LD16(a) (((struct tagUnaligned_16 *) (a))->l)
+#define LD32(a) (((struct tagUnaligned_32 *) (a))->l)
+//#define LD64(a) (((struct tagUnaligned_64 *) (a))->l)
+inline uint64_t LD64 (const void* a) {
+uint64_t v;
+memcpy (&v, a, sizeof (v));	// confirmed_safe_unsafe_usage
+return v;
+}
+//#define _USE_STRUCT_INT_CVT
 //	#ifdef _USE_STRUCT_INT_CVT
-		#define ST16(a, b) (((struct tagUnaligned_16 *) (a))->l) = (b)             
-		#define ST32(a, b) (((struct tagUnaligned_32 *) (a))->l) = (b)
-		//#define ST64(a, b) (((struct tagUnaligned_64 *) (a))->l) = (b)                               
-                inline void ST64(void * a, uint64_t b)
-				{
-					memcpy(a, &b, sizeof(b));	// confirmed_safe_unsafe_usage
-				}
+#define ST16(a, b) (((struct tagUnaligned_16 *) (a))->l) = (b)
+#define ST32(a, b) (((struct tagUnaligned_32 *) (a))->l) = (b)
+//#define ST64(a, b) (((struct tagUnaligned_64 *) (a))->l) = (b)
+inline void ST64 (void* a, uint64_t b) {
+  memcpy (a, &b, sizeof (b));	// confirmed_safe_unsafe_usage
+}
 //	#else
 //		inline void __ST16(void *dst, uint16_t v) { memcpy(dst, &v, 2); }
 //		inline void __ST32(void *dst, uint32_t v) { memcpy(dst, &v, 4); }
-		//inline void __ST64(void *dst, uint64_t v) { memcpy(dst, &v, 8); }
+//inline void __ST64(void *dst, uint64_t v) { memcpy(dst, &v, 8); }
 //	#endif
 
 #else
-	
+
 //#define INTD16(a) (*((int16_t*)(a)))
 //#define INTD32(a) (*((int32_t*)(a)))
 //#define INTD64(a) (*((int64_t*)(a)))
--- a/codec/encoder/core/inc/macros.h
+++ b/codec/encoder/core/inc/macros.h
@@ -45,56 +45,56 @@
 
 namespace WelsSVCEnc {
 #if defined(_MSC_VER)
-	#if _MSC_VER <= 1200
-		#define ALLOC_ALLIGN_MEM(name,size,type,alignment) \
+#if _MSC_VER <= 1200
+#define ALLOC_ALLIGN_MEM(name,size,type,alignment) \
 			type name##_storage[size+(alignment)-1]; \
 			type * name = (type *) (((int32_t) name##_storage+(alignment - 1)) & ~((int32_t)(alignment)-1))
 
-		#define ALLOC_ALLIGN_MEM_2(name,sizex,sizey,type,alignment) \
+#define ALLOC_ALLIGN_MEM_2(name,sizex,sizey,type,alignment) \
 			type name##_storage[(sizex)*(sizey)+(alignment)-1]; \
 			type * name = (type *) (((int32_t) name##_storage+(alignment - 1)) & ~((int32_t)(alignment)-1))
-	#else //_MSC_VER <= 1200
-		#define ALLOC_ALLIGN_MEM(name,size,type,alignment) \
+#else //_MSC_VER <= 1200
+#define ALLOC_ALLIGN_MEM(name,size,type,alignment) \
 			__declspec(align(alignment)) type name[size]
 
-		#define ALLOC_ALLIGN_MEM_2(name,sizex,sizey,type,alignment) \
+#define ALLOC_ALLIGN_MEM_2(name,sizex,sizey,type,alignment) \
 		__declspec(align(alignment)) type name[(sizex)*(sizey)]
-	#endif//_MSC_VER <= 1200
+#endif//_MSC_VER <= 1200
 
 #elif defined(__GNUC__)
 
-	#define ALLOC_ALLIGN_MEM(name,size,type,alignment) \
+#define ALLOC_ALLIGN_MEM(name,size,type,alignment) \
 		type name[size] __attribute__((aligned(alignment)))
-	#define ALLOC_ALLIGN_MEM_2(name,sizex,sizey,type,alignment) \
+#define ALLOC_ALLIGN_MEM_2(name,sizex,sizey,type,alignment) \
 		type name[(sizex)*(sizey)] __attribute__((aligned(alignment)))
-		
+
 #endif//_MSC_VER
 
 
 #if defined(_MSC_VER)
 
-	#if(_MSC_VER < 1700)
-	#define inline	__inline  
-	#endif
+#if(_MSC_VER < 1700)
+#define inline	__inline
+#endif
 
-    #define __FASTCALL   __fastcall
-	#define ALIGNED_DECLARE( type, var, n ) __declspec(align(n)) type var
-	#define __align8(t,v) __declspec(align(8)) t v
-	#define __align16(t,v) __declspec(align(16)) t v
+#define __FASTCALL   __fastcall
+#define ALIGNED_DECLARE( type, var, n ) __declspec(align(n)) type var
+#define __align8(t,v) __declspec(align(8)) t v
+#define __align16(t,v) __declspec(align(16)) t v
 #elif defined(__GNUC__)
 #if !defined(MAC_POWERPC)
-    #define __FASTCALL    __attribute__ ((fastcall))
+#define __FASTCALL    __attribute__ ((fastcall))
 #else
-	#define __FASTCALL	// mean NULL for mac ppc
+#define __FASTCALL	// expands to nothing on Mac PowerPC
 #endif//MAC_POWERPC    
-	#define ALIGNED_DECLARE( type, var, n ) type var __attribute__((aligned(n)))
-	#define __align8(t,v) t v __attribute__ ((aligned (8)))
-	#define __align16(t,v) t v __attribute__ ((aligned (16)))
+#define ALIGNED_DECLARE( type, var, n ) type var __attribute__((aligned(n)))
+#define __align8(t,v) t v __attribute__ ((aligned (8)))
+#define __align16(t,v) t v __attribute__ ((aligned (16)))
 #endif//_MSC_VER
 
 #if defined(_MACH_PLATFORM) || defined(__GNUC__)
 #define ALIGNED_DECLARE_MATRIX_2D(name,sizex,sizey,type,alignment) \
-	type name[(sizex)*(sizey)] __attribute__((aligned(alignment)))	
+	type name[(sizex)*(sizey)] __attribute__((aligned(alignment)))
 #else //_MSC_VER <= 1200
 #define ALIGNED_DECLARE_MATRIX_2D(name,sizex,sizey,type,alignment) \
 __declspec(align(alignment)) type name[(sizex)*(sizey)]
@@ -102,7 +102,7 @@
 
 #if defined(_MACH_PLATFORM) || defined(__GNUC__)
 #define ALIGNED_DECLARE_MATRIX_1D(name,size,type,alignment) \
-	type name[size] __attribute__((aligned(alignment)))	
+	type name[size] __attribute__((aligned(alignment)))
 #else //_MSC_VER <= 1200
 #define ALIGNED_DECLARE_MATRIX_1D(name,size,type,alignment) \
 	__declspec(align(alignment)) type name[(size)]
@@ -136,15 +136,13 @@
 #define WELS_ROUND(x)	((int32_t)((x)+0.5f+EPSN))
 #endif//WELS_ROUND
 
-static inline int32_t WELS_CEIL(float v)
-{
-	const int32_t n = (int32_t)v;	// floor value
-	return ((v>EPSN+n) ? (1+n) : n);	// (int32_t)ceil(v);
+static inline int32_t WELS_CEIL (float v) {
+const int32_t n = (int32_t)v;	// truncated toward zero (equals the floor value only for v >= 0)
+return ((v > EPSN + n) ? (1 + n) : n);	// (int32_t)ceil(v);
 }
 
-static inline int32_t WELS_FLOOR(float v)
-{
-	return (int32_t)v;		
+static inline int32_t WELS_FLOOR (float v) {
+return (int32_t)v;
 }
 
 
@@ -152,59 +150,51 @@
     iC = iA + iB + 1;                           \
 	iC >>= (int32_t)( iA != -1 && iB != -1);    \
 	iC += (iA == -1 && iB == -1);               \
-}    
+}
 
 /*
  * log base 2 of v and ceil/floor extension
  */
 
-static inline int32_t WELS_CEILLOG2( uint32_t v )
-{
-	int32_t r = 0;
-	--v;
-	while( v > 0 )
-	{
-		++r;
-		v >>= 1;
-	}
-	return r;
+static inline int32_t WELS_CEILLOG2 (uint32_t v) {
+int32_t r = 0;
+--v;
+while (v > 0) {
+  ++r;
+  v >>= 1;
 }
+return r;
+}
 
-static inline int32_t WELS_FLOORLOG2( uint32_t v )
-{	
-	int32_t r = 0;
-	while( v > 1 )
-	{
-		++r;
-		v >>= 1;
-	}
-	return r;
+static inline int32_t WELS_FLOORLOG2 (uint32_t v) {
+int32_t r = 0;
+while (v > 1) {
+  ++r;
+  v >>= 1;
 }
+return r;
+}
 
-static inline int32_t WELS_LOG2( uint32_t v )
-{	
-	int32_t r = 0;
-	while (v >>= 1)
-	{
-  		++r;
-	}
-	return r;
+static inline int32_t WELS_LOG2 (uint32_t v) {
+int32_t r = 0;
+while (v >>= 1) {
+  ++r;
+}
+return r;
 
 }
 
-static inline BOOL_T WELS_POWER2_IF( uint32_t v )
-{
-	return ( v && !(v & (v - 1)) );
+static inline BOOL_T WELS_POWER2_IF (uint32_t v) {
+return (v && ! (v & (v - 1)));
 }
 
-static inline int32_t WELS_MEDIAN(int32_t x,  int32_t y, int32_t z)
-{
-	int32_t t = (x-y)&((x-y)>>31);
-	x -= t;
-	y += t;
-	y -= (y-z)&((y-z)>>31);
-	y += (x-y)&((x-y)>>31);
-	return y;
+static inline int32_t WELS_MEDIAN (int32_t x,  int32_t y, int32_t z) {
+int32_t t = (x - y) & ((x - y) >> 31);
+x -= t;
+y += t;
+y -= (y - z) & ((y - z) >> 31);
+y += (x - y) & ((x - y) >> 31);
+return y;
 }
 
 #ifndef BUTTERFLY1x2
@@ -229,7 +219,7 @@
 //#endif// NEG_NUM
 
 #ifndef WELS_CLIP1
-#define WELS_CLIP1(x) (((x) & ~255) ? (-(x) >> 31) : (x)) 
+#define WELS_CLIP1(x) (((x) & ~255) ? (-(x) >> 31) : (x))
 #endif//WELS_CLIP1
 
 #ifndef WELS_SIGN
@@ -236,10 +226,9 @@
 #define WELS_SIGN(a) ((int32_t)(a) >> 31)	// General: (a)>>(sizeof(int)*CHAR_BIT-1), CHAR_BIT= the number of bits per byte (normally 8)
 #endif //WELS_SIGN
 
-static inline int32_t WELS_ABS(int32_t a)
-{
-	const int32_t sign = WELS_SIGN(a);
-	return ((a + sign) ^ sign);
+static inline int32_t WELS_ABS (int32_t a) {
+const int32_t sign = WELS_SIGN (a);
+return ((a + sign) ^ sign);
 }
 
 // wels_tostring
@@ -257,18 +246,17 @@
 // Bitwise routines
 // n: ulong
 // b: bit order
-static inline bool_t BITWISE_ENABLED(const uint32_t n, const uint8_t b)
-{
-	const uint8_t bit = (b&0x1f);	// maximal bit position 31 for uint32_t 4 bytes
+static inline bool_t BITWISE_ENABLED (const uint32_t n, const uint8_t b) {
+const uint8_t bit = (b & 0x1f);	// maximal bit position 31 for uint32_t 4 bytes
 #if defined(WORDS_BIGENDIAN)
-	/* 
-	 * 31 .. 24, 23 .. 16, 15 .. 8, 7 .. 0
-	 * 7 .. 0, 15 .. 8, 23 .. 16, 31 .. 24
-	 */	
-	const uint8_t map = 24+((bit&7)<<1)-bit;	// BIG_ENDIAN map
-	return (bool_t)((n & (1<<map)) >> map);	// BIG_ENDIAN
+/*
+ * 31 .. 24, 23 .. 16, 15 .. 8, 7 .. 0
+ * 7 .. 0, 15 .. 8, 23 .. 16, 31 .. 24
+ */
+const uint8_t map = 24 + ((bit & 7) << 1) - bit;	// BIG_ENDIAN map
+return (bool_t) ((n & (1 << map)) >> map);	// BIG_ENDIAN
 #else
-	return ((n & (1<<bit)) >> bit)?true:false;	// LITTLE_ENDIAN
+return ((n & (1 << bit)) >> bit) ? true : false;	// LITTLE_ENDIAN
 #endif//WORDS_BIGENDIAN
 }
 
@@ -278,35 +266,31 @@
 
 #ifdef    WORDS_BIGENDIAN
 
-static inline uint32_t ENDIAN_FIX(uint32_t x)
-{
-    return x;
+static inline uint32_t ENDIAN_FIX (uint32_t x) {
+return x;
 }
 
-#else 
+#else
 
 
 #ifdef    _MSC_VER
-static inline uint32_t ENDIAN_FIX(uint32_t x)
-{
-    __asm
-    {
-        mov   eax,  x
-		bswap   eax
-		mov   x,    eax
-    }
-    return x;
+static inline uint32_t ENDIAN_FIX (uint32_t x) {
+__asm {
+  mov   eax,  x
+  bswap   eax
+  mov   x,    eax
 }
+return x;
+}
 #else  // GCC
-static inline uint32_t ENDIAN_FIX(uint32_t x)
-{
+static inline uint32_t ENDIAN_FIX (uint32_t x) {
 #ifdef X86_ARCH
-	__asm__ __volatile__("bswap %0":"+r"(x));
+__asm__ __volatile__ ("bswap %0":"+r" (x));
 #else
-    x = ((x & 0xff000000)>> 24) | ((x & 0xff0000) >> 8) |
-        ((x & 0xff00) << 8) | ((x&0xff) << 24);
+x = ((x & 0xff000000) >> 24) | ((x & 0xff0000) >> 8) |
+    ((x & 0xff00) << 8) | ((x & 0xff) << 24);
 #endif
-	return x;
+return x;
 }
 
 
@@ -333,7 +317,7 @@
 #endif//#if WELS_VERIFY_RETURN_IF
 
 /*
- *	Description: to check variable validation and return the specified result 
+ *	Description: to check variable validation and return the specified result
  *		with correspoinding process advance.
  *	 result:	value to be return
  *	 case_if:	negative condition to be verified
@@ -392,7 +376,7 @@
  * Description: to safe free an array ptr with free function pointer
  *	arr:		pointer to an array, something like "**p";
  *	num:		number of elements in array
- *  free_fn:	free function pointer	
+ *  free_fn:	free function pointer
  */
 #ifndef WELS_SAFE_FREE_ARR
 #define WELS_SAFE_FREE_ARR(arr, num, free_fn) \
--- a/codec/encoder/core/inc/mb_cache.h
+++ b/codec/encoder/core/inc/mb_cache.h
@@ -47,7 +47,7 @@
  */
 /*
  * Cache for Luma				Cache for Chroma(Cb, Cr)
- *	
+ *
  *	TL T T T T					TL T T
  *	 L - - - -					 L - -
  *	 L - - - -					 L - - TR
@@ -64,84 +64,82 @@
 extern const uint8_t g_kuiCache12_8x8RefIdx[4];
 extern const uint8_t g_kuiCache48CountScan4Idx[24];
 
-typedef	struct TagDCTCoeff
-{
-	//ALIGNED_DECLARE( int16_t, residual_ac[16], 16 ); //I_16x16 
-	int16_t iLumaBlock[16][16]; //based on block4x4 luma DC/AC
-	//ALIGNED_DECLARE( int16_t, iLumaI16x16Dc[16], 16 ); //I_16x16 DC
-	int16_t iLumaI16x16Dc[16];
-	//ALIGNED_DECLARE( int16_t, iChromaDc[2][4], 16 ); //chroma DC
-	int16_t iChromaBlock[8][16]; //based on block4x4  chroma DC/AC
-	int16_t iChromaDc[2][4];
-}SDCTCoeff ;
+typedef	struct TagDCTCoeff {
+//ALIGNED_DECLARE( int16_t, residual_ac[16], 16 ); //I_16x16
+int16_t iLumaBlock[16][16]; //based on block4x4 luma DC/AC
+//ALIGNED_DECLARE( int16_t, iLumaI16x16Dc[16], 16 ); //I_16x16 DC
+int16_t iLumaI16x16Dc[16];
+//ALIGNED_DECLARE( int16_t, iChromaDc[2][4], 16 ); //chroma DC
+int16_t iChromaBlock[8][16]; //based on block4x4  chroma DC/AC
+int16_t iChromaDc[2][4];
+} SDCTCoeff ;
 
-typedef struct TagMbCache{
-	//the followed pData now is promised aligned to 16 bytes
-	ALIGNED_DECLARE(SMVComponentUnit, sMvComponents, 16);
-	
-	ALIGNED_DECLARE_MATRIX_1D(iNonZeroCoeffCount, 48, int8_t, 16);	// Cache line size
-	// 	int8_t		iNonZeroCoeffCount[6 * 8];	// Right luma, Chroma(Left Top Cb, Left btm Cr); must follow by iIntraPredMode!
-	ALIGNED_DECLARE_MATRIX_1D(iIntraPredMode, 48, int8_t, 16);	
-	//	must follow with iNonZeroCoeffCount! 
-	
-	int32_t     iSadCost[4];			//avail 1; unavail 0
-	SMVUnitXY  sMbMvp[MB_BLOCK8x8_NUM];// for write bs
+typedef struct TagMbCache {
+//the following data is declared with 16-byte alignment
+ALIGNED_DECLARE (SMVComponentUnit, sMvComponents, 16);
 
-	//for residual decoding (recovery) at the side of Encoder
-	int16_t *pCoeffLevel;		// tmep
-	//malloc memory for prediction
-	uint8_t* pSkipMb;	
+ALIGNED_DECLARE_MATRIX_1D (iNonZeroCoeffCount, 48, int8_t, 16);	// Cache line size
+// 	int8_t		iNonZeroCoeffCount[6 * 8];	// Right luma, Chroma(Left Top Cb, Left btm Cr); must follow by iIntraPredMode!
+ALIGNED_DECLARE_MATRIX_1D (iIntraPredMode, 48, int8_t, 16);
+//	iIntraPredMode must directly follow iNonZeroCoeffCount!
 
-	//ALIGNED_DECLARE(uint8_t, pMemPredMb[2][256],  16);//One: Best I_16x16 Luma and refine frac_pixel pBuffer; another: PingPong I_8x8&&Inter Cb + Cr
-	uint8_t *pMemPredMb;
-	uint8_t* pMemPredLuma;// inter && intra share same pointer; 
-	//ALIGNED_DECLARE(uint8_t, pMemPredChroma[2][64*2], 16); //another PingPong pBuffer: Best Cb + Cr; 
-	uint8_t *pMemPredChroma;// inter && intra share same pointer;
-	uint8_t* pBestPredIntraChroma; //Cb:0~63;   Cr:64~127
+int32_t     iSadCost[4];			//avail 1; unavail 0
+SMVUnitXY  sMbMvp[MB_BLOCK8x8_NUM];// for write bs
 
-	//ALIGNED_DECLARE(uint8_t, pMemPredBlk4[2][16], 16); //I_4x4
-	uint8_t *pMemPredBlk4;		
+//for residual decoding (recovery) at the side of Encoder
+int16_t* pCoeffLevel;		// temp
+//malloc memory for prediction
+uint8_t* pSkipMb;
 
-	uint8_t* pBestPredI4x4Blk4;//I_4x4
+//ALIGNED_DECLARE(uint8_t, pMemPredMb[2][256],  16);//One: Best I_16x16 Luma and refine frac_pixel pBuffer; another: PingPong I_8x8&&Inter Cb + Cr
+uint8_t* pMemPredMb;
+uint8_t* pMemPredLuma;// inter && intra share same pointer;
+//ALIGNED_DECLARE(uint8_t, pMemPredChroma[2][64*2], 16); //another PingPong pBuffer: Best Cb + Cr;
+uint8_t* pMemPredChroma;// inter && intra share same pointer;
+uint8_t* pBestPredIntraChroma; //Cb:0~63;   Cr:64~127
 
-	//ALIGNED_DECLARE(uint8_t, pBufferInterPredMe[4][400], 16);//inter type pBuffer for ME h & v & hv
-	uint8_t *pBufferInterPredMe;    // [4][400] is enough because only h&v or v&hv or h&hv. but if both h&v&hv is needed when 8 quart pixel, future we have to use [5][400].
+//ALIGNED_DECLARE(uint8_t, pMemPredBlk4[2][16], 16); //I_4x4
+uint8_t* pMemPredBlk4;
 
-	//no scan4[] order, just as memory order to store
-	//ALIGNED_DECLARE(bool_t, pPrevIntra4x4PredModeFlag[16], 16);//if 1, means no rem_intra4x4_pred_mode; if 0, means rem_intra4x4_pred_mode != 0
-	bool_t *pPrevIntra4x4PredModeFlag;
-	//ALIGNED_DECLARE(int8_t, pRemIntra4x4PredModeFlag[16], 16);//-1 as default; if pPrevIntra4x4PredModeFlag==0, 
-	//pRemIntra4x4PredModeFlag or added by 1 is the best pred_mode
-	int8_t *pRemIntra4x4PredModeFlag;
+uint8_t* pBestPredI4x4Blk4;//I_4x4
 
-	int32_t     iSadCostSkip[4];	     //avail 1; unavail 0
-	bool_t      bMbTypeSkip[4];         //1: skip; 0: non-skip  
-	int32_t     *pEncSad;
+//ALIGNED_DECLARE(uint8_t, pBufferInterPredMe[4][400], 16);//inter type pBuffer for ME h & v & hv
+uint8_t* pBufferInterPredMe;    // [4][400] is enough because only h&v or v&hv or h&hv. but if both h&v&hv is needed when 8 quart pixel, future we have to use [5][400].
 
-	//for residual encoding at the side of Encoder
-	SDCTCoeff *pDct;
+//no scan4[] order, just as memory order to store
+//ALIGNED_DECLARE(bool_t, pPrevIntra4x4PredModeFlag[16], 16);//if 1, means no rem_intra4x4_pred_mode; if 0, means rem_intra4x4_pred_mode != 0
+bool_t* pPrevIntra4x4PredModeFlag;
+//ALIGNED_DECLARE(int8_t, pRemIntra4x4PredModeFlag[16], 16);//-1 as default; if pPrevIntra4x4PredModeFlag==0,
+//pRemIntra4x4PredModeFlag or added by 1 is the best pred_mode
+int8_t* pRemIntra4x4PredModeFlag;
 
-	uint8_t      uiNeighborIntra; // LEFT_MB_POS:0x01, TOP_MB_POS:0x02, TOPLEFT_MB_POS = 0x04 ,TOPRIGHT_MB_POS = 0x08;
-	uint8_t uiLumaI16x16Mode;
-	uint8_t uiChmaI8x8Mode;
+int32_t     iSadCostSkip[4];	     //avail 1; unavail 0
+bool_t      bMbTypeSkip[4];         //1: skip; 0: non-skip
+int32_t*     pEncSad;
 
-	bool_t		bCollocatedPredFlag;//denote if current MB is collocated predicted (MV==0).
-	uint32_t	uiRefMbType;
+//for residual encoding at the side of Encoder
+SDCTCoeff* pDct;
 
-	struct
-	{
-		/* pointer of current mb location in original frame */
-		uint8_t *pEncMb[3];		
-		/* pointer of current mb location in recovery frame */
-		uint8_t *pDecMb[3];		
-		/* pointer of co-located mb location in reference frame */
-		uint8_t *pRefMb[3];	
-		//for SVC
-		uint8_t	*pCsMb[3];//locating current mb's CS in whole frame
+uint8_t      uiNeighborIntra; // LEFT_MB_POS:0x01, TOP_MB_POS:0x02, TOPLEFT_MB_POS = 0x04 ,TOPRIGHT_MB_POS = 0x08;
+uint8_t uiLumaI16x16Mode;
+uint8_t uiChmaI8x8Mode;
+
+bool_t		bCollocatedPredFlag;//denote if current MB is collocated predicted (MV==0).
+uint32_t	uiRefMbType;
+
+struct {
+  /* pointer of current mb location in original frame */
+  uint8_t* pEncMb[3];
+  /* pointer of current mb location in recovery frame */
+  uint8_t* pDecMb[3];
+  /* pointer of co-located mb location in reference frame */
+  uint8_t* pRefMb[3];
+  //for SVC
+  uint8_t*	pCsMb[3];//locating current mb's CS in whole frame
 //		int16_t *p_rs[3];//locating current mb's RS	in whole frame
 
-	} SPicData;
-}SMbCache;
+} SPicData;
+} SMbCache;
 
 }//end of namespace
 
--- a/codec/encoder/core/inc/mc.h
+++ b/codec/encoder/core/inc/mc.h
@@ -40,11 +40,11 @@
 #include "macros.h"
 #include "wels_func_ptr_def.h"
 
-/////////////////////luma MC////////////////////////// 
+/////////////////////luma MC//////////////////////////
 //x y means dx(mv[0] & 3) and dy(mv[1] & 3)
 
 namespace WelsSVCEnc {
-void WelsInitMcFuncs( SWelsFuncPtrList *pFuncList, uint32_t uiCpuFlag );
+void WelsInitMcFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag);
 
 
 #if defined(__cplusplus)
@@ -55,26 +55,35 @@
 //                       MMXEXT and SSE2 definition                          //
 //***************************************************************************//
 #if defined(X86_ASM)
-void McChromaWidthEq4_mmx( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, const uint8_t *kpABCD,int32_t iHeigh );
-void McCopyWidthEq4_mmx ( uint8_t *, int32_t, uint8_t *, int32_t, int32_t );
-void McCopyWidthEq8_mmx( uint8_t *, int32_t, uint8_t *, int32_t, int32_t );
-void PixelAvgWidthEq8_mmx( uint8_t *,  int32_t, uint8_t *, int32_t, uint8_t *, int32_t, int32_t  );
+void McChromaWidthEq4_mmx (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, const uint8_t* kpABCD,
+                           int32_t iHeigh);
+void McCopyWidthEq4_mmx (uint8_t*, int32_t, uint8_t*, int32_t, int32_t);
+void McCopyWidthEq8_mmx (uint8_t*, int32_t, uint8_t*, int32_t, int32_t);
+void PixelAvgWidthEq8_mmx (uint8_t*,  int32_t, uint8_t*, int32_t, uint8_t*, int32_t, int32_t);
 
-void McHorVer20_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride,int32_t iWidth, int32_t iHeight);
-void McHorVer02_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride,int32_t iWidth, int32_t iHeight);
-void McHorVer22HorFirst_sse2(uint8_t * pSrc,int32_t iSrcStride,uint8_t * pTap,int32_t iTapStride,int32_t iWidth,int32_t iHeight);	
-void McHorVer22VerLastAlign_sse2(uint8_t * pTap, int32_t iTapStride, uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight);
-void McHorVer22VerLastUnAlign_sse2(uint8_t * pTap, int32_t iTapStride, uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight);
-void McChromaWidthEq8_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, const uint8_t *kpABCD, int32_t iHeigh );
-void McCopyWidthEq16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, int32_t );
-void McHorVer20WidthEq16_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight );
-void McHorVer02WidthEq8_sse2(uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer22Width8HorFirst_sse2( uint8_t*pSrc, int32_t iSrcStride, uint8_t* pTap,	int32_t iTapStride,int32_t iHeight);
-void PixelAvgWidthEq16_sse2( uint8_t *,  int32_t, uint8_t *, int32_t, uint8_t *, int32_t, int32_t  );
+void McHorVer20_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                      int32_t iHeight);
+void McHorVer02_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                      int32_t iHeight);
+void McHorVer22HorFirst_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pTap, int32_t iTapStride, int32_t iWidth,
+                              int32_t iHeight);
+void McHorVer22VerLastAlign_sse2 (uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                  int32_t iHeight);
+void McHorVer22VerLastUnAlign_sse2 (uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t iWidth, int32_t iHeight);
+void McChromaWidthEq8_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, const uint8_t* kpABCD,
+                            int32_t iHeigh);
+void McCopyWidthEq16_sse2 (uint8_t*, int32_t, uint8_t*, int32_t, int32_t);
+void McHorVer20WidthEq16_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer02WidthEq8_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer22Width8HorFirst_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pTap,	int32_t iTapStride,
+                                    int32_t iHeight);
+void PixelAvgWidthEq16_sse2 (uint8_t*,  int32_t, uint8_t*, int32_t, uint8_t*, int32_t, int32_t);
 
 
-void PixelAvgWidthEq16_ssse3( uint8_t *,  int32_t, uint8_t *, int32_t, uint8_t *, int32_t, int32_t  );
-void McChromaWidthEq8_ssse3( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, const uint8_t *kpABCD, int32_t iHeigh );
+void PixelAvgWidthEq16_ssse3 (uint8_t*,  int32_t, uint8_t*, int32_t, uint8_t*, int32_t, int32_t);
+void McChromaWidthEq8_ssse3 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                             const uint8_t* kpABCD, int32_t iHeigh);
 
 
 #endif //X86_ASM
--- a/codec/encoder/core/inc/md.h
+++ b/codec/encoder/core/inc/md.h
@@ -31,7 +31,7 @@
  *
  * \file	md.h
  *
- * \brief	mode decision 
+ * \brief	mode decision
  *
  * \date	2009.5.14 Created
  *
@@ -51,13 +51,13 @@
 #define ME_REFINE_BUF_WIDTH_BLK8   16
 #define ME_REFINE_BUF_STRIDE_BLK4  160
 #define ME_REFINE_BUF_STRIDE_BLK8  320
-	
+
 #define REFINE_ME_NO_BEST_HALF_PIXEL 0 //( 0,  0)
 #define REFINE_ME_HALF_PIXEL_LEFT    3 //(-2,  0)
 #define REFINE_ME_HALF_PIXEL_RIGHT   4 //( 2,  0)
 #define REFINE_ME_HALF_PIXEL_TOP     1 //( 0, -2)
 #define REFINE_ME_HALF_PIXEL_BOTTOM  2 //( 0,  2)
-	
+
 #define ME_NO_BEST_QUAR_PIXEL 1 //( 0,  0) or best half pixel
 #define ME_QUAR_PIXEL_LEFT    2 //(-1,  0)
 #define ME_QUAR_PIXEL_RIGHT   3 //( 1,  0)
@@ -69,81 +69,80 @@
 extern const int32_t g_kiQpCostTable[52];
 extern const int8_t g_kiMapModeI16x16[7];
 //extern const int8_t g_kiMapModeI4x4[14];
-extern const int8_t g_kiMapModeIntraChroma[7];	
+extern const int8_t g_kiMapModeIntraChroma[7];
 
 /////////////////////////////
 
 // if we want keep total sizeof(SWelsMD) <= 256, we maybe need to seperate three member of SWelsME.
-typedef struct TagWelsMD
-{
-    int32_t			iLambda;
-	uint16_t		*pMvdCost;
+typedef struct TagWelsMD {
+int32_t			iLambda;
+uint16_t*		pMvdCost;
 
-	int32_t			iCostLuma;
-    int32_t			iCostChroma;//satd+lambda(best_pred_mode) //i_sad_chroma;
-	int32_t			iSadPredMb; 
+int32_t			iCostLuma;
+int32_t			iCostChroma;//satd+lambda(best_pred_mode) //i_sad_chroma;
+int32_t			iSadPredMb;
 
-    uint8_t			uiRef; //uiRefIndex appointed by Encoder, used for MC
-    bool_t			bMdUsingSad;
-    uint16_t		uiReserved;
+uint8_t			uiRef; //uiRefIndex appointed by Encoder, used for MC
+bool_t			bMdUsingSad;
+uint16_t		uiReserved;
 
-	int32_t			iCostSkipMb;
-    int32_t			iSadPredSkip;
-    
-	//NO B frame in our Wels, we can ignore list1
+int32_t			iCostSkipMb;
+int32_t			iSadPredSkip;
 
-	struct 
-	{		
-		SWelsME			sMe16x16;		//adjust each SWelsME for 8 D-word!
-		SWelsME			sMe8x8[4];
-		SWelsME			sMe16x8[2];
-		SWelsME			sMe8x16[2];				
+//NO B frame in our Wels, we can ignore list1
+
+struct {
+  SWelsME			sMe16x16;		//adjust each SWelsME for 8 D-word!
+  SWelsME			sMe8x8[4];
+  SWelsME			sMe16x8[2];
+  SWelsME			sMe8x16[2];
 //		SMVUnitXY		i_mvbs[MB_BLOCK8x8_NUM];	//scaled MVB
-	} sMe;    
+} sMe;
 
-}SWelsMD;
+} SWelsMD;
 
-typedef struct TagMeRefinePointer
-{
-	uint8_t* pHalfPixH;
-	uint8_t* pHalfPixV;
-	uint8_t* pHalfPixHV;
+typedef struct TagMeRefinePointer {
+uint8_t* pHalfPixH;
+uint8_t* pHalfPixV;
+uint8_t* pHalfPixHV;
 
-	uint8_t* pQuarPixBest;
-	uint8_t* pQuarPixTmp; 
+uint8_t* pQuarPixBest;
+uint8_t* pQuarPixTmp;
 
 } SMeRefinePointer;
 
-static void md_intra_init(sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb);
-static void md_inter_init(sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb);
+static void md_intra_init (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb);
+static void md_inter_init (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb);
 
-void FillNeighborCacheIntra(SMbCache* pMbCache, SMB* pCurMb, int32_t iMbWidth/*, bool_t constrained_intra_pred_flag*/);
-void FillNeighborCacheInterWithoutBGD(SMbCache* pMbCache, SMB* pCurMb, int32_t iMbWidth, int8_t *pVaaBgMbFlag); //BGD spatial func
-void FillNeighborCacheInterWithBGD(SMbCache* pMbCache, SMB* pCurMb, int32_t iMbWidth, int8_t *pVaaBgMbFlag);
-void InitFillNeighborCacheInterFunc( SWelsFuncPtrList *pFuncList, const int32_t kiFlag );
+void FillNeighborCacheIntra (SMbCache* pMbCache, SMB* pCurMb, int32_t iMbWidth/*, bool_t constrained_intra_pred_flag*/);
+void FillNeighborCacheInterWithoutBGD (SMbCache* pMbCache, SMB* pCurMb, int32_t iMbWidth,
+                                       int8_t* pVaaBgMbFlag); //BGD spatial func
+void FillNeighborCacheInterWithBGD (SMbCache* pMbCache, SMB* pCurMb, int32_t iMbWidth, int8_t* pVaaBgMbFlag);
+void InitFillNeighborCacheInterFunc (SWelsFuncPtrList* pFuncList, const int32_t kiFlag);
 
-void MvdCostInit( uint16_t* pMvdCostInter, const int32_t kiMvdSz );
+void MvdCostInit (uint16_t* pMvdCostInter, const int32_t kiMvdSz);
 
-void PredictSad( int8_t* pRefIndexCache, int32_t* pSadCostCache, int32_t uiRef, int32_t * pSadPred );
+void PredictSad (int8_t* pRefIndexCache, int32_t* pSadCostCache, int32_t uiRef, int32_t* pSadPred);
 
 
-void PredictSadSkip( int8_t* pRefIndexCache, bool_t* pMbSkipCache, int32_t* pSadCostCache, int32_t uiRef, int32_t * iSadPredSkip );
+void PredictSadSkip (int8_t* pRefIndexCache, bool_t* pMbSkipCache, int32_t* pSadCostCache, int32_t uiRef,
+                     int32_t* iSadPredSkip);
 
 //  for pfGetVarianceFromIntraVaa function ptr adaptive by CPU features, 6/7/2010
-void InitIntraAnalysisVaaInfo( SWelsFuncPtrList *pFuncList, const uint32_t kuiCpuFlag );
-BOOL_T MdIntraAnalysisVaaInfo( sWelsEncCtx* pEncCtx, uint8_t* pEncMb );
+void InitIntraAnalysisVaaInfo (SWelsFuncPtrList* pFuncList, const uint32_t kuiCpuFlag);
+BOOL_T MdIntraAnalysisVaaInfo (sWelsEncCtx* pEncCtx, uint8_t* pEncMb);
 
-uint8_t MdInterAnalysisVaaInfo_c( int32_t *pSad8x8 );
+uint8_t MdInterAnalysisVaaInfo_c (int32_t* pSad8x8);
 
 
-void InitMeRefinePointer(SMeRefinePointer* pMeRefine, SMbCache* pMbCache, int32_t iStride);
-void MeRefineFracPixel(sWelsEncCtx* pEncCtx, uint8_t* pMemPredInterMb, SWelsME* pMe,
-						  SMeRefinePointer* pMeRefine, int32_t iWidth, int32_t iHeight);
-								 
-void InitBlkStrideWithRef(int32_t* pBlkStride, const int32_t kiStrideRef);
+void InitMeRefinePointer (SMeRefinePointer* pMeRefine, SMbCache* pMbCache, int32_t iStride);
+void MeRefineFracPixel (sWelsEncCtx* pEncCtx, uint8_t* pMemPredInterMb, SWelsME* pMe,
+                        SMeRefinePointer* pMeRefine, int32_t iWidth, int32_t iHeight);
 
-void UpdateMbMv_c( SMVUnitXY *pMvBuffer, const SMVUnitXY ksMv );
+void InitBlkStrideWithRef (int32_t* pBlkStride, const int32_t kiStrideRef);
 
+void UpdateMbMv_c (SMVUnitXY* pMvBuffer, const SMVUnitXY ksMv);
+
 #if defined(__cplusplus)
 extern "C" {
 #endif//__cplusplus
@@ -151,11 +150,11 @@
 #if defined(X86_ASM)
 
 //  for pfGetVarianceFromIntraVaa SIMD optimization, 6/7/2010
-int32_t AnalysisVaaInfoIntra_sse2 (	uint8_t *pDataY, const int32_t kiLineSize );
-int32_t AnalysisVaaInfoIntra_ssse3(	uint8_t *pDataY, const int32_t kiLineSize );
-uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 );
-uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 );
-void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY ksMv );
+int32_t AnalysisVaaInfoIntra_sse2 (uint8_t* pDataY, const int32_t kiLineSize);
+int32_t AnalysisVaaInfoIntra_ssse3 (uint8_t* pDataY, const int32_t kiLineSize);
+uint8_t MdInterAnalysisVaaInfo_sse2 (int32_t* pSad8x8);
+uint8_t MdInterAnalysisVaaInfo_sse41 (int32_t* pSad8x8);
+void UpdateMbMv_sse2 (SMVUnitXY* pMvBuffer, const SMVUnitXY ksMv);
 
 #endif//X86_ASM
 
--- a/codec/encoder/core/inc/measure_time.h
+++ b/codec/encoder/core/inc/measure_time.h
@@ -61,45 +61,44 @@
  * \return	time elapsed since run (unit: microsecond)
  */
 
-static inline int64_t WelsTime()
-{
+static inline int64_t WelsTime() {
 #if !(defined(_MSC_VER) || defined(__MINGW32__))
-	struct timeval tv_date;
-	
-	gettimeofday( &tv_date, NULL );
-	return( (int64_t) tv_date.tv_sec * 1000000 + (int64_t) tv_date.tv_usec );
+struct timeval tv_date;
+
+gettimeofday (&tv_date, NULL);
+return ((int64_t) tv_date.tv_sec * 1000000 + (int64_t) tv_date.tv_usec);
 #else
-#if defined (WIN32)	
-	static int64_t iMeasureTimeFreq = 0;
+#if defined (WIN32)
+static int64_t iMeasureTimeFreq = 0;
 //	static BOOL_T support_high_resolution_perf_flag = TRUE;
-	int64_t iMeasureTimeCur = 0;
-	int64_t iResult = 0;	
-	if ( 0 == iMeasureTimeFreq ){
-		// Per MSDN minimum supported OS is Windows 2000 Professional/Server above for high-resolution performance counter
-		/*BOOL_T ret = */QueryPerformanceFrequency((LARGE_INTEGER *)&iMeasureTimeFreq);
+int64_t iMeasureTimeCur = 0;
+int64_t iResult = 0;
+if (0 == iMeasureTimeFreq) {
+  // Per MSDN minimum supported OS is Windows 2000 Professional/Server above for high-resolution performance counter
+  /*BOOL_T ret = */QueryPerformanceFrequency ((LARGE_INTEGER*)&iMeasureTimeFreq);
 //		if ( !ret )	// the installed hardware can not support a high-resolution performance counter, we have to use others instead for well feature
 //		{
-//			support_high_resolution_perf_flag	= FALSE;			
+//			support_high_resolution_perf_flag	= FALSE;
 //		}
-		if ( !iMeasureTimeFreq )
-			iMeasureTimeFreq = 1;
-	}
+  if (!iMeasureTimeFreq)
+    iMeasureTimeFreq = 1;
+}
 //	if ( support_high_resolution_perf_flag )
 //	{
-		QueryPerformanceCounter((LARGE_INTEGER *)&iMeasureTimeCur);
-		iResult = (int64_t)((double)iMeasureTimeCur * 1e6 / (double)iMeasureTimeFreq + 0.5);
+QueryPerformanceCounter ((LARGE_INTEGER*)&iMeasureTimeCur);
+iResult = (int64_t) ((double)iMeasureTimeCur * 1e6 / (double)iMeasureTimeFreq + 0.5);
 //	}
 //	else
 //	{
-//		iResult = timeGetTime() * 1000;	// 10 ms precision		
-//	}	
-	return iResult;
-	
+//		iResult = timeGetTime() * 1000;	// 10 ms precision
+//	}
+return iResult;
+
 #else
-	struct _timeb tb;
-	
-	_ftime(&tb);
-	return ((int64_t)tb.time * (1000) + (int64_t)tb.millitm) * (1000);
+struct _timeb tb;
+
+_ftime (&tb);
+return ((int64_t)tb.time * (1000) + (int64_t)tb.millitm) * (1000);
 #endif//#if WIN32
 #endif//!(defined(_MSC_VER) || defined(__MINGW32__))
 }
--- a/codec/encoder/core/inc/memory_align.h
+++ b/codec/encoder/core/inc/memory_align.h
@@ -1,80 +1,79 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#if !defined(WELS_ENCODER_MEMORY_ALIGN_H__)
-#define WELS_ENCODER_MEMORY_ALIGN_H__
-
-#include "typedefs.h"
-#include "as264_common.h"
-#ifdef MEMORY_CHECK
-#include <stdio.h>
-#endif//MEMORY_CHECK
-
-namespace WelsSVCEnc {
-
-#define MEMORY_REQUEST_ALIGN_BYTES	0 // or (1^n), i.e, 0x04
-
-class CMemoryAlign
-{
-public:
-	CMemoryAlign( const uint32_t kuiCacheLineSize );
-	virtual ~CMemoryAlign();
-
-	void* WelsMallocz( const uint32_t kuiSize, const str_t *kpTag );
-	void* WelsMalloc( const uint32_t kuiSize, const str_t *kpTag );
-	void WelsFree( void* pPointer, const str_t *kpTag );
-	const uint32_t WelsGetCacheLineSize() const;
-#if defined(MEMORY_MONITOR)
-	const uint32_t WelsGetMemoryUsage() const;
-#endif//MEMORY_MONITOR
-
-private:
-	// private copy & assign constructors adding to fix klocwork scan issues
-	CMemoryAlign( const CMemoryAlign& kcMa );           
-	CMemoryAlign& operator=( const CMemoryAlign& kcMa );
-
-protected:
-	uint32_t	m_nCacheLineSize;
-
-#ifdef MEMORY_MONITOR
-	uint32_t	m_nMemoryUsageInBytes;
-#endif//MEMORY_MONITOR
-
-#ifdef MEMORY_CHECK
-	FILE*		m_fpMemChkPoint;
-	uint32_t	m_nCountRequestNum;
-#endif//MEMORY_CHECK
-};
-
-}
-
-#endif//WELS_ENCODER_MEMORY_ALIGN_H__
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#if !defined(WELS_ENCODER_MEMORY_ALIGN_H__)
+#define WELS_ENCODER_MEMORY_ALIGN_H__
+
+#include "typedefs.h"
+#include "as264_common.h"
+#ifdef MEMORY_CHECK
+#include <stdio.h>
+#endif//MEMORY_CHECK
+
+namespace WelsSVCEnc {
+
+#define MEMORY_REQUEST_ALIGN_BYTES	0 // 0, or presumably a power of two (1 << n), e.g. 0x04 -- TODO confirm against the allocator
+
+class CMemoryAlign { // allocation helper constructed with a cache line size; member definitions live outside this header
+ public:
+CMemoryAlign (const uint32_t kuiCacheLineSize); // presumably the alignment used for returned blocks -- confirm in the implementation
+virtual ~CMemoryAlign();
+
+void* WelsMallocz (const uint32_t kuiSize, const str_t* kpTag); // kpTag labels the request; presumably zero-fills the block ("z") -- confirm
+void* WelsMalloc (const uint32_t kuiSize, const str_t* kpTag); // kuiSize is the requested size; kpTag labels the request
+void WelsFree (void* pPointer, const str_t* kpTag); // presumably pPointer must come from WelsMalloc/WelsMallocz -- confirm
+const uint32_t WelsGetCacheLineSize() const; // const accessor; presumably reports m_nCacheLineSize
+#if defined(MEMORY_MONITOR)
+const uint32_t WelsGetMemoryUsage() const; // declared only when MEMORY_MONITOR is defined
+#endif//MEMORY_MONITOR
+
+ private:
+// copy constructor and copy assignment are declared private so outside code cannot copy the object (added to satisfy Klocwork scans)
+CMemoryAlign (const CMemoryAlign& kcMa);
+CMemoryAlign& operator= (const CMemoryAlign& kcMa);
+
+ protected:
+uint32_t	m_nCacheLineSize; // presumably set from the constructor argument -- confirm
+
+#ifdef MEMORY_MONITOR
+uint32_t	m_nMemoryUsageInBytes; // present only when MEMORY_MONITOR is defined
+#endif//MEMORY_MONITOR
+
+#ifdef MEMORY_CHECK
+FILE*		m_fpMemChkPoint; // present only when MEMORY_CHECK is defined (the FILE type comes from the guarded <stdio.h> include)
+uint32_t	m_nCountRequestNum; // present only when MEMORY_CHECK is defined
+#endif//MEMORY_CHECK
+};
+
+}
+
+#endif//WELS_ENCODER_MEMORY_ALIGN_H__
--- a/codec/encoder/core/inc/mt_defs.h
+++ b/codec/encoder/core/inc/mt_defs.h
@@ -70,7 +70,7 @@
 
 /*
  *	Parallel slice bs output without memcpy used
- *  NOTE: might be not applicable for SVC 2.0/2.1 client application layer implementation 
+ *  NOTE: might not be applicable for SVC 2.0/2.1 client application layer implementation
  *	due bs of various slices need be continuous within a layer packing
  */
 //#define PACKING_ONE_SLICE_PER_LAYER	// MEAN packing only slice for a pLayerBs, disabled at SVC 2.0/2.1 in case Multi-Threading (MT) & Multi-SSlice (MS)
@@ -169,58 +169,57 @@
 #endif//NOT_ABSOLUTE_BALANCING
 
 typedef struct TagSliceThreadPrivateData {
-	void		*pWelsPEncCtx;
-	SLayerBSInfo	*pLayerBs;
-	int32_t		iSliceIndex;	// slice index, zero based								
-	int32_t		iThreadIndex;	// thread index, zero based
+void*		pWelsPEncCtx;
+SLayerBSInfo*	pLayerBs;
+int32_t		iSliceIndex;	// slice index, zero based
+int32_t		iThreadIndex;	// thread index, zero based
 
-	// for dynamic slicing mode
-	int32_t		iStartMbIndex;	// inclusive
-	int32_t		iEndMbIndex;	// exclusive
+// for dynamic slicing mode
+int32_t		iStartMbIndex;	// inclusive
+int32_t		iEndMbIndex;	// exclusive
 } SSliceThreadPrivateData;
 
-typedef struct TagSliceThreading 
-{
-	SSliceThreadPrivateData	*pThreadPEncCtx;// thread context, [iThreadIdx]
-	WELS_THREAD_HANDLE			*pThreadHandles;// thread handles, [iThreadIdx]
+typedef struct TagSliceThreading {
+SSliceThreadPrivateData*	pThreadPEncCtx;// thread context, [iThreadIdx]
+WELS_THREAD_HANDLE*			pThreadHandles;// thread handles, [iThreadIdx]
 #ifdef WIN32
-	WELS_EVENT					*pSliceCodedEvent;// events for slice coded state, [iThreadIdx]
-	WELS_EVENT					*pReadySliceCodingEvent;	// events for slice coding ready, [iThreadIdx]
-	WELS_EVENT					*pFinSliceCodingEvent;	// notify slice coding thread is done
-	WELS_EVENT					*pExitEncodeEvent;			// event for exit encoding event
+WELS_EVENT*					pSliceCodedEvent;// events for slice coded state, [iThreadIdx]
+WELS_EVENT*					pReadySliceCodingEvent;	// events for slice coding ready, [iThreadIdx]
+WELS_EVENT*					pFinSliceCodingEvent;	// notify slice coding thread is done
+WELS_EVENT*					pExitEncodeEvent;			// event for exit encoding event
 #else
-	WELS_EVENT*					pSliceCodedEvent[MAX_THREADS_NUM];// events for slice coded state, [iThreadIdx]
-	WELS_EVENT*					pReadySliceCodingEvent[MAX_THREADS_NUM];	// events for slice coding ready, [iThreadIdx]
+WELS_EVENT*					pSliceCodedEvent[MAX_THREADS_NUM];// events for slice coded state, [iThreadIdx]
+WELS_EVENT*					pReadySliceCodingEvent[MAX_THREADS_NUM];	// events for slice coding ready, [iThreadIdx]
 #endif//WIN32
 
 #if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
 #if defined(__GNUC__)
-	WELS_THREAD_HANDLE			*pUpdateMbListThrdHandles;	// thread handles for update mb list thread, [iThreadIdx]
+WELS_THREAD_HANDLE*			pUpdateMbListThrdHandles;	// thread handles for update mb list thread, [iThreadIdx]
 #endif//__GNUC__
 #ifdef WIN32
-	WELS_EVENT					*pUpdateMbListEvent;		// signal to update mb list neighbor for various slices
-	WELS_EVENT					*pFinUpdateMbListEvent;	// signal to indicate finish updating mb list
+WELS_EVENT*					pUpdateMbListEvent;		// signal to update mb list neighbor for various slices
+WELS_EVENT*					pFinUpdateMbListEvent;	// signal to indicate finish updating mb list
 #else
-	WELS_EVENT*					pUpdateMbListEvent[MAX_THREADS_NUM];		// signal to update mb list neighbor for various slices
-	WELS_EVENT*					pFinUpdateMbListEvent[MAX_THREADS_NUM];	// signal to indicate finish updating mb list	
+WELS_EVENT*					pUpdateMbListEvent[MAX_THREADS_NUM];		// signal to update mb list neighbor for various slices
+WELS_EVENT*					pFinUpdateMbListEvent[MAX_THREADS_NUM];	// signal to indicate finish updating mb list
 #endif//WIN32
 #endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
 
-	WELS_MUTEX					mutexSliceNumUpdate;	// for dynamic slicing mode MT
+WELS_MUTEX					mutexSliceNumUpdate;	// for dynamic slicing mode MT
 
 #if defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG)
-	uint32_t					*pSliceConsumeTime[MAX_DEPENDENCY_LAYER];	// consuming time for each slice, [iSpatialIdx][uiSliceIdx]
+uint32_t*					pSliceConsumeTime[MAX_DEPENDENCY_LAYER];	// consuming time for each slice, [iSpatialIdx][uiSliceIdx]
 #endif//DYNAMIC_SLICE_ASSIGN || MT_DEBUG
 #if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
-	float						*pSliceComplexRatio[MAX_DEPENDENCY_LAYER];
+float*						pSliceComplexRatio[MAX_DEPENDENCY_LAYER];
 #endif//DYNAMIC_SLICE_ASSIGN && TRY_SLICING_BALANCE
 
 #ifdef MT_DEBUG
-	FILE						*pFSliceDiff;	// file handle for debug
+FILE*						pFSliceDiff;	// file handle for debug
 #endif//MT_DEBUG
 
 #ifdef PACKING_ONE_SLICE_PER_LAYER
-	uint32_t					*pCountBsSizeInPartition;
+uint32_t*					pCountBsSizeInPartition;
 #endif//PACKING_ONE_SLICE_PER_LAYER
 } SSliceThreading;
 
--- a/codec/encoder/core/inc/mv_pred.h
+++ b/codec/encoder/core/inc/mv_pred.h
@@ -48,95 +48,98 @@
 namespace WelsSVCEnc {
 /*!
  * \brief   update pMv and uiRefIndex cache for current MB, only for P_16x16 (SKIP inclusive)
- * \param 	
- * \param 	
+ * \param
+ * \param
  */
 
 /*!
  * \brief   update pMv and uiRefIndex cache for current MB and pMbCache, only for P_16x16 (SKIP inclusive)
- * \param 	
- * \param 	
+ * \param
+ * \param
  */
-void UpdateP16x16MotionInfo(SMbCache* pMbCache, SMB* pCurMb, const int8_t kiRef, SMVUnitXY* pMv);//for encoder
+void UpdateP16x16MotionInfo (SMbCache* pMbCache, SMB* pCurMb, const int8_t kiRef, SMVUnitXY* pMv); //for encoder
 
 /*!
  * \brief   update pMv and uiRefIndex cache for current MB and pMbCache, only for P_16x8
- * \param 	
- * \param 	
+ * \param
+ * \param
  */
-void UpdateP16x8MotionInfo(SMbCache* pMbCache, SMB* pCurMb, const int32_t kiPartIdx, const int8_t kiRef, SMVUnitXY* pMv);
+void UpdateP16x8MotionInfo (SMbCache* pMbCache, SMB* pCurMb, const int32_t kiPartIdx, const int8_t kiRef,
+                            SMVUnitXY* pMv);
 
 /*!
  * \brief   update pMv and uiRefIndex cache for current MB and pMbCache, only for P_8x16
- * \param 	
- * \param 	
+ * \param
+ * \param
  */
-void update_P8x16_motion_info(SMbCache* pMbCache, SMB* pCurMb, const int32_t kiPartIdx, const int8_t kiRef, SMVUnitXY* pMv);
+void update_P8x16_motion_info (SMbCache* pMbCache, SMB* pCurMb, const int32_t kiPartIdx, const int8_t kiRef,
+                               SMVUnitXY* pMv);
 
 /*!
  * \brief   update pMv and uiRefIndex cache for current MB and pMbCache, only for P_8x8
- * \param 	
- * \param 	
+ * \param
+ * \param
  */
-void UpdateP8x8MotionInfo(SMbCache* pMbCache, SMB* pCurMb, const int32_t kiPartIdx, const int8_t kiRef, SMVUnitXY* pMv);
+void UpdateP8x8MotionInfo (SMbCache* pMbCache, SMB* pCurMb, const int32_t kiPartIdx, const int8_t kiRef,
+                           SMVUnitXY* pMv);
 
 /*!
  * \brief   get the motion predictor for 4*4 or 8*8 or 16*16 block
- * \param 	
+ * \param
  * \param 	output mvp_x and mvp_y
  */
-void PredMv(const SMVComponentUnit* kpMvComp, int8_t iPartIdx, int8_t iPartW, int32_t iRef, SMVUnitXY* sMvp);
+void PredMv (const SMVComponentUnit* kpMvComp, int8_t iPartIdx, int8_t iPartW, int32_t iRef, SMVUnitXY* sMvp);
 
 
 /*!
  * \brief   get the motion predictor for SKIP MB
- * \param 	
+ * \param
  * \param 	output mvp_x and mvp_y
  */
-void PredSkipMv(SMbCache* pMbCache, SMVUnitXY* sMvp);
+void PredSkipMv (SMbCache* pMbCache, SMVUnitXY* sMvp);
 
 
 /*!
  * \brief   get the motion predictor for inter16x8 MB
- * \param 	
+ * \param
  * \param 	output mvp_x and mvp_y
  */
-void PredInter16x8Mv(SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* sMvp);
+void PredInter16x8Mv (SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* sMvp);
 
 
 /*!
  * \brief   get the motion predictor for inter8x16 MB
- * \param 	
+ * \param
  * \param 	output mvp_x and mvp_y
  */
-void PredInter8x16Mv(SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* sMvp);
+void PredInter8x16Mv (SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* sMvp);
 
 //=========================update motion info(MV and ref_idx) into Mb_cache==========================
 /*!
  * \brief   only update pMv cache for current MB, only for P_16x16
- * \param 	
- * \param 	
+ * \param
+ * \param
  */
 //void update_p16x16_motion2cache(SMbCache* pMbCache, int8_t pRef, SMVUnitXY* pMv);
 
 /*!
  * \brief   only update pMv cache for current MB, only for P_16x8
- * \param 	
- * \param 	
+ * \param
+ * \param
  */
-void UpdateP16x8Motion2Cache(SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* pMv);
+void UpdateP16x8Motion2Cache (SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* pMv);
 
 /*!
  * \brief   only update pMv cache for current MB, only for P_8x16
- * \param 	
- * \param 	
+ * \param
+ * \param
  */
-void UpdateP8x16Motion2Cache(SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* pMv);
+void UpdateP8x16Motion2Cache (SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* pMv);
 /*!
  * \brief   only update pMv cache for current MB, only for P_8x8
- * \param 	
- * \param 	
+ * \param
+ * \param
  */
-void UpdateP8x8Motion2Cache(SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* pMv);
+void UpdateP8x8Motion2Cache (SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* pMv);
 }
 #endif//WELS_MV_PRED_H__
--- a/codec/encoder/core/inc/nal_encap.h
+++ b/codec/encoder/core/inc/nal_encap.h
@@ -51,71 +51,73 @@
  *	Raw payload pData for NAL unit, AVC/SVC compatible
  */
 typedef struct TagWelsNalRaw {
-	uint8_t				*pRawData;		// pRawNal payload for slice pData
-	int32_t				iPayloadSize;		// size of pRawNal pData
-	
-	SNalUnitHeaderExt		sNalExt;		// NAL header information
+uint8_t*				pRawData;		// pRawNal payload for slice pData
+int32_t				iPayloadSize;		// size of pRawNal pData
 
-}SWelsNalRaw;
+SNalUnitHeaderExt		sNalExt;		// NAL header information
 
+} SWelsNalRaw;
+
 /*
  *	Encoder majoy output pData
  */
-typedef struct TagWelsEncoderOutput {	
-	uint8_t				*pBsBuffer;			// overall bitstream pBuffer allocation for a coded picture, recycling use intend. 
-	uint32_t			uiSize;				// size of allocation pBuffer above
+typedef struct TagWelsEncoderOutput {
+uint8_t*				pBsBuffer;			// overall bitstream pBuffer allocation for a coded picture, recycling use intend.
+uint32_t			uiSize;				// size of allocation pBuffer above
 
-	SBitStringAux		sBsWrite;
-	
+SBitStringAux		sBsWrite;
+
 //	SWelsNalRaw		raw_nals[MAX_DEPENDENCY_LAYER*2+MAX_DEPENDENCY_LAYER*MAX_QUALITY_LEVEL]; // AVC: max up to SPS+PPS+max_slice_idc (2 + 8) for FMO;
-	SWelsNalRaw		*sNalList;			// nal list, adaptive for AVC/SVC in case single slice, multiple slices or fmo
-	int32_t				iCountNals;			// count number of NAL in list
-																								 // SVC: num_sps (MAX_D) + num_pps (MAX_D) + num_vcl (MAX_D * MAX_Q)	
-	int32_t				iNalIndex;			// coding NAL currently, 0 based
-	
+SWelsNalRaw*		sNalList;			// nal list, adaptive for AVC/SVC in case single slice, multiple slices or fmo
+int32_t				iCountNals;			// count number of NAL in list
+// SVC: num_sps (MAX_D) + num_pps (MAX_D) + num_vcl (MAX_D * MAX_Q)
+int32_t				iNalIndex;			// coding NAL currently, 0 based
+
 //	BOOL_T				bAnnexBFlag;		// annexeb flag, to figure it pOut the packetization mode whether need 4 bytes (0 0 0 1) of start code prefix
-}SWelsEncoderOutput;
+} SWelsEncoderOutput;
 
 //#define MT_DEBUG_BS_WR	0	// for MT debugging if needed
 
 typedef struct TagWelsSliceBs {
-	uint8_t				*pBs;				// output bitstream, pBitStringAux not needed for slice 0 due to no dependency of pFrameBs available
-	uint32_t			uiBsPos;				// position of output bitstream
-	uint8_t				*pBsBuffer;			// overall bitstream pBuffer allocation for a coded slice, recycling use intend. 
-	uint32_t			uiSize;				// size of allocation pBuffer above
-	
-	SBitStringAux		sBsWrite;
-		
-	SWelsNalRaw		sNalList[2];		// nal list, PREFIX NAL(if applicable) + SLICE NAL
+uint8_t*				pBs;				// output bitstream, pBitStringAux not needed for slice 0 due to no dependency of pFrameBs available
+uint32_t			uiBsPos;				// position of output bitstream
+uint8_t*				pBsBuffer;			// overall bitstream pBuffer allocation for a coded slice, recycling use intend.
+uint32_t			uiSize;				// size of allocation pBuffer above
+
+SBitStringAux		sBsWrite;
+
+SWelsNalRaw		sNalList[2];		// nal list, PREFIX NAL(if applicable) + SLICE NAL
 //	int32_t				iCountNals;			// count number of NAL in list
-	int32_t				iNalLen[2];
-	int32_t				iNalIndex;			// coding NAL currently, 0 based	
-	
+int32_t				iNalLen[2];
+int32_t				iNalIndex;			// coding NAL currently, 0 based
+
 //	BOOL_T				bAnnexBFlag;		// annexeb flag, to figure it pOut the packetization mode whether need 4 bytes (0 0 0 1) of start code prefix
 #if MT_DEBUG_BS_WR
-	BOOL_T				bSliceCodedFlag;
+BOOL_T				bSliceCodedFlag;
 #endif//MT_DEBUG_BS_WR
-}SWelsSliceBs;
+} SWelsSliceBs;
 
 /*!
- * \brief	load an initialize NAL pRawNal pData	
+ * \brief	load and initialize NAL pRawNal pData
  */
-void WelsLoadNal( SWelsEncoderOutput *pEncoderOuput, const int32_t/*EWelsNalUnitType*/ kiType, const int32_t/*EWelsNalRefIdc*/ kiNalRefIdc );
+void WelsLoadNal (SWelsEncoderOutput* pEncoderOuput, const int32_t/*EWelsNalUnitType*/ kiType,
+                  const int32_t/*EWelsNalRefIdc*/ kiNalRefIdc);
 
 /*!
  * \brief	unload pRawNal NAL
  */
-void WelsUnloadNal( SWelsEncoderOutput *pEncoderOuput );
+void WelsUnloadNal (SWelsEncoderOutput* pEncoderOuput);
 
 /*!
- * \brief	load an initialize NAL pRawNal pData	
+ * \brief	load and initialize NAL pRawNal pData
  */
-void WelsLoadNalForSlice( SWelsSliceBs *pSliceBs, const int32_t/*EWelsNalUnitType*/ kiType, const int32_t/*EWelsNalRefIdc*/ kiNalRefIdc );
+void WelsLoadNalForSlice (SWelsSliceBs* pSliceBs, const int32_t/*EWelsNalUnitType*/ kiType,
+                          const int32_t/*EWelsNalRefIdc*/ kiNalRefIdc);
 
 /*!
  * \brief	unload pRawNal NAL
  */
-void WelsUnloadNalForSlice( SWelsSliceBs *pSliceBs );
+void WelsUnloadNalForSlice (SWelsSliceBs* pSliceBs);
 
 /*!
  * \brief	encode NAL with emulation forbidden three bytes checking
@@ -125,7 +127,7 @@
  * \param	pRawNal			pRawNal NAL pData
  * \return	length of pDst NAL
  */
-int32_t WelsEncodeNal( SWelsNalRaw *pRawNal, void *pDst, int32_t *pDstLen );
+int32_t WelsEncodeNal (SWelsNalRaw* pRawNal, void* pDst, int32_t* pDstLen);
 
 /*!
  * \brief	encode a nal into a pBuffer for any type of NAL, involved WelsEncodeNal introduced in AVC
@@ -138,11 +140,11 @@
  *
  * \return	length of pDst NAL
  */
-int32_t WelsEncodeNalExt( SWelsNalRaw *pRawNal, void *pNalHeaderExt, void *pDst, int32_t *pDstLen );
+int32_t WelsEncodeNalExt (SWelsNalRaw* pRawNal, void* pNalHeaderExt, void* pDst, int32_t* pDstLen);
 
 /*!
  * \brief	write prefix nal
  */
-int32_t WelsWriteSVCPrefixNal( SBitStringAux *pBitStringAux, const int32_t keNalRefIdc,const bool_t kbIdrFlag );
+int32_t WelsWriteSVCPrefixNal (SBitStringAux* pBitStringAux, const int32_t keNalRefIdc, const bool_t kbIdrFlag);
 }
 #endif//WELS_NAL_UNIT_ENCAPSULATION_H__
--- a/codec/encoder/core/inc/nal_prefix.h
+++ b/codec/encoder/core/inc/nal_prefix.h
@@ -42,23 +42,23 @@
 ///////////////////////////////////NAL Unit prefix/headers///////////////////////////////////
 
 /* NAL Unix Header in AVC, refer to Page 56 in JVT X201wcm */
-typedef struct TagNalUnitHeader{
-	uint8_t		uiForbiddenZeroBit;
-	uint8_t		uiNalRefIdc;
-	EWelsNalUnitType	eNalUnitType;
-	uint8_t		uiReservedOneByte;		
-}SNalUnitHeader, *PNalUnitHeader;
+typedef struct TagNalUnitHeader {
+  uint8_t		uiForbiddenZeroBit;
+  uint8_t		uiNalRefIdc;
+  EWelsNalUnitType	eNalUnitType;
+  uint8_t		uiReservedOneByte;
+} SNalUnitHeader, *PNalUnitHeader;
 
 /* NAL Unit Header in scalable extension syntax, refer to Page 390 in JVT X201wcm */
-typedef struct TagNalUnitHeaderExt{
-	SNalUnitHeader	sNalHeader;
-	
-	bool_t		bIdrFlag;
-	uint8_t		uiDependencyId;
-	uint8_t		uiTemporalId;
-	bool_t		bDiscardableFlag;
-	
+typedef struct TagNalUnitHeaderExt {
+  SNalUnitHeader	sNalHeader;
 
-}SNalUnitHeaderExt, *PNalUnitHeaderExt;
+  bool_t		bIdrFlag;
+  uint8_t		uiDependencyId;
+  uint8_t		uiTemporalId;
+  bool_t		bDiscardableFlag;
+
+
+} SNalUnitHeaderExt, *PNalUnitHeaderExt;
 }
 #endif//WELS_NAL_UNIT_PREFIX_H__
--- a/codec/encoder/core/inc/param_svc.h
+++ b/codec/encoder/core/inc/param_svc.h
@@ -1,483 +1,483 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	param_svc.h
- *
- * \brief	Configurable parameters in H.264/SVC Encoder
- *
- * \date	4/20/2009 Created
- *
- *************************************************************************************
- */
-#if !defined(WELS_ENCODER_PARAMETER_SVC_H__)
-#define WELS_ENCODER_PARAMETER_SVC_H__
-
-#include <string.h>
-#include <math.h>
-#include "typedefs.h"
-#include "codec_def.h"
-#include "macros.h"
-#include "wels_const.h"
-#include "wels_common_basis.h"
-#include "rc.h"
-#include "svc_enc_slice_segment.h"
-#include "as264_common.h"
-
-namespace WelsSVCEnc {
-
-#define   INVALID_TEMPORAL_ID   ((uint8_t)0xff)
-
-extern const uint8_t   g_kuiTemporalIdListTable[MAX_TEMPORAL_LEVEL][MAX_GOP_SIZE + 1];
-
-/*!
-* \brief	get Logarithms base 2 of (upper/base)
-* \param	base	based scaler
-* \param	upper	input upper value
-* \return	2 based scaling factor
-*/
-static __inline uint32_t GetLogFactor( real32_t base, real32_t upper )
-{
-	const double dLog2factor	= log10(1.0 * upper / base) / log10(2.0);
-	const double dEpsilon		= 0.0001;
-	const double dRound		= floor( dLog2factor + 0.5 );	
-
-	if( dLog2factor < dRound+dEpsilon && dRound < dLog2factor+dEpsilon )
-	{
-		return (uint32_t)(dRound);
-	}
-	return UINT_MAX;
-}
-
-/*
- *	Dependency Layer Parameter
- */
-typedef struct TagDLayerParam {
-	int32_t		iActualWidth;			// input source picture actual width
-	int32_t		iActualHeight;			// input source picture actual height
-	int32_t		iFrameWidth;			// frame width
-	int32_t		iFrameHeight;			// frame height
-
-	int32_t		iSpatialBitrate;
-
-	/* temporal settings related */
-	int32_t		iTemporalResolution;
-	int32_t		iDecompositionStages;	
-	uint8_t     uiCodingIdx2TemporalId[(1<<MAX_TEMPORAL_LEVEL)+1];
-
-	uint8_t		uiProfileIdc;			// value of profile IDC (0 for auto-detection)	
-
-	int8_t		iHighestTemporalId;
-	//	uint8_t		uiDependencyId;
-	int8_t      iDLayerQp;
-
-	SMulSliceOption sMso;	// multiple slice options
-
-	float		fInputFrameRate;		// input frame rate
-	float		fOutputFrameRate;		// output frame rate
-
-#ifdef ENABLE_FRAME_DUMP
-	str_t		sRecFileName[MAX_FNAME_LEN];	// file to be constructed
-#endif//ENABLE_FRAME_DUMP	
-} SDLayerParam;
-
-/*
- *	Cisco OpenH264 Encoder Parameter Configuration
- */
-typedef struct TagWelsSvcCodingParam {	
-	SDLayerParam	sDependencyLayers[MAX_DEPENDENCY_LAYER];
-
-	/* General */
-#ifdef ENABLE_TRACE_FILE
-    str_t			sTracePath[MAX_FNAME_LEN];		// log file for wels encoder
-#endif
-
-	uint32_t	uiGopSize;			// GOP size (at maximal frame rate: 16)
-	uint32_t	uiIntraPeriod;		// intra period (multiple of GOP size as desired)
-	int32_t		iNumRefFrame;		// number of reference frame used
-
-	int32_t     iActualPicWidth;    //   actual input picture width
-	int32_t     iActualPicHeight;   //   actual input picture height
-
-	struct {
-		int32_t iLeft;
-		int32_t iTop;
-		int32_t iWidth;
-		int32_t iHeight;
-	}SUsedPicRect;	// the rect in input picture that encoder actually used
-
-	str_t       *pCurPath; // record current lib path such as:/pData/pData/com.wels.enc/lib/ 
-
-	float		fMaxFrameRate;		// maximal frame rate [Hz / fps]
-	int32_t		iInputCsp;			// color space of input sequence	
-	uint32_t	uiFrameToBeCoded;	// frame to be encoded (at input frame rate)	
-
-	int32_t		iTargetBitrate;			// overall target bitrate introduced in RC module	
-	int16_t		iMultipleThreadIdc;		// 1	# 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
-	int16_t		iCountThreadsNum;			//		# derived from disable_multiple_slice_idc (=0 or >1) means;
-
-	int32_t		iLTRRefNum;
-	uint32_t    uiLtrMarkPeriod;	//the min distance of two int32_t references
-
-	bool_t		bDeblockingParallelFlag;	// deblocking filter parallelization control flag
-	bool_t		bMgsT0OnlyStrategy; //MGS_T0_only_strategy
-    bool_t		bEnableSSEI;		
-	bool_t		bEnableFrameCroppingFlag;	// enable frame cropping flag: TRUE alwayse in application
-	
-	bool_t		bEnableCropPic;			// enable cropping source picture. , 8/25/2010
-											// FALSE: Streaming Video Sharing; TRUE: Video Conferencing Meeting;
-	int8_t		iDecompStages;		// GOP size dependency		
-
-	/* Deblocking loop filter */
-	int8_t		iLoopFilterDisableIdc;	// 0: on, 1: off, 2: on except for slice boundaries
-	int8_t		iLoopFilterAlphaC0Offset;// AlphaOffset: valid range [-6, 6], default 0
-
-	int8_t		iLoopFilterBetaOffset;	// BetaOffset:	valid range [-6, 6], default 0	
-	int8_t		iInterLayerLoopFilterDisableIdc; // Employed based upon inter-layer, same comment as above
-	int8_t		iInterLayerLoopFilterAlphaC0Offset;	// InterLayerLoopFilterAlphaC0Offset
-	int8_t		iInterLayerLoopFilterBetaOffset;	// InterLayerLoopFilterBetaOffset
-
-	/* Rate Control */
-	bool_t		bEnableRc;
-	int8_t		iRCMode;	
-	int8_t		iPaddingFlag;
-	/* denoise control */
-	bool_t      bEnableDenoise;				
-
-	/* scene change detection control */
-	bool_t      bEnableSceneChangeDetect;	 
-	// background detection control
-	bool_t		bEnableBackgroundDetection; 
-	/* adaptive quantization control */
-	bool_t		bEnableAdaptiveQuant;	         
-	/* long term reference control */
-	bool_t      bEnableLongTermReference;
-
-	/* pSps pPps id addition control */
-	bool_t      bEnableSpsPpsIdAddition;
-	/* Layer definition */
-	bool_t		bPrefixNalAddingCtrl;
-	int8_t		iNumDependencyLayer;	// number of dependency(Spatial/CGS) layers used to be encoded
-	int8_t		iNumTemporalLayer;		// number of temporal layer specified
-    
-
-    
-public:
-	TagWelsSvcCodingParam(const bool_t kbEnableRc = true)
-	{
-		FillDefault( kbEnableRc );
-	}
-	~TagWelsSvcCodingParam()	{}
-
-	void FillDefault( const bool_t kbEnableRc )
-	{
-		uiGopSize			= 1;			// GOP size (at maximal frame rate: 16)
-		uiIntraPeriod		= 0;			// intra period (multiple of GOP size as desired)
-		iNumRefFrame		= MIN_REF_PIC_COUNT;	// number of reference frame used
-
-		iActualPicWidth	= 0;    //   actual input picture width
-		iActualPicHeight	= 0;	//   actual input picture height
-		SUsedPicRect.iLeft	=
-		SUsedPicRect.iTop	=
-		SUsedPicRect.iWidth	=
-		SUsedPicRect.iHeight= 0;	// the rect in input picture that encoder actually used
-
-		pCurPath			= NULL; // record current lib path such as:/pData/pData/com.wels.enc/lib/ 
-
-		fMaxFrameRate		= MAX_FRAME_RATE;	// maximal frame rate [Hz / fps]		
-		iInputCsp			= videoFormatI420;	// input sequence color space in default
-		uiFrameToBeCoded	= (uint32_t)-1;		// frame to be encoded (at input frame rate)
-
-		iTargetBitrate			= 0;	// overall target bitrate introduced in RC module
-		bDeblockingParallelFlag= false;	// deblocking filter parallelization control flag
-#ifdef MT_ENABLED
-		iMultipleThreadIdc		= 0;	// auto to detect cpu cores inside
-#else
-		iMultipleThreadIdc		= 1;	// 1 # 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
-#endif//MT_ENABLED
-		iCountThreadsNum		= 1;	//		# derived from disable_multiple_slice_idc (=0 or >1) means;
-
-		iLTRRefNum				= 0;
-		uiLtrMarkPeriod			= 30;	//the min distance of two int32_t references		
-
-		bMgsT0OnlyStrategy			= true;	// Strategy of have MGS only at T0 frames (0: do not use this strategy; 1: use this strategy) 
-		bEnableSSEI					= true;
-		bEnableFrameCroppingFlag	= true;	// enable frame cropping flag: TRUE alwayse in application
-		bEnableCropPic				= true;	// enable cropping source picture. , 8/25/2010
-		// FALSE: Streaming Video Sharing; TRUE: Video Conferencing Meeting;
-		iDecompStages				= 0;	// GOP size dependency, unknown here and be revised later
-
-		/* Deblocking loop filter */
-		iLoopFilterDisableIdc		= 1;	// 0: on, 1: off, 2: on except for slice boundaries
-		iLoopFilterAlphaC0Offset	= 0;	// AlphaOffset: valid range [-6, 6], default 0
-		iLoopFilterBetaOffset		= 0;	// BetaOffset:	valid range [-6, 6], default 0
-		iInterLayerLoopFilterDisableIdc		= 1;	// Employed based upon inter-layer, same comment as above
-		iInterLayerLoopFilterAlphaC0Offset	= 0;	// InterLayerLoopFilterAlphaC0Offset
-		iInterLayerLoopFilterBetaOffset		= 0;	// InterLayerLoopFilterBetaOffset
-
-		/* Rate Control */
-		bEnableRc		= kbEnableRc;	
-		iRCMode			= 0;	
-		iPaddingFlag	= 0;
-		
-		bEnableDenoise				= false;	// denoise control		
-		bEnableSceneChangeDetect	= true;		// scene change detection control		
-		bEnableBackgroundDetection	= true;		// background detection control		
-		bEnableAdaptiveQuant		= true;		// adaptive quantization control		
-		bEnableLongTermReference	= false;	// long term reference control		
-		bEnableSpsPpsIdAddition	= true;		// pSps pPps id addition control		
-		bPrefixNalAddingCtrl		= true;		// prefix NAL adding control
-		iNumDependencyLayer		= 0;		// number of dependency(Spatial/CGS) layers used to be encoded
-		iNumTemporalLayer			= 0;		// number of temporal layer specified		
-	}
-
-	int32_t ParamTranscode( SVCEncodingParam& pCodingParam, const bool_t kbEnableRc = true )
-	{		
-		pCodingParam.fFrameRate		= WELS_CLIP3(pCodingParam.fFrameRate, MIN_FRAME_RATE, MAX_FRAME_RATE);
-		iInputCsp		= pCodingParam.iInputCsp;		// color space of input sequence	
-		uiFrameToBeCoded	= (uint32_t)-1;		// frame to be encoded (at input frame rate), -1 dependents on length of input sequence
-
-		iActualPicWidth   = pCodingParam.iPicWidth;
-		iActualPicHeight  = pCodingParam.iPicHeight; 
-
-		SUsedPicRect.iLeft = 0;
-		SUsedPicRect.iTop  = 0;
-		SUsedPicRect.iWidth = ((iActualPicWidth >> 1) << 1);
-		SUsedPicRect.iHeight = ((iActualPicHeight >> 1) << 1);
-
-		/* Deblocking loop filter */
-#ifdef MT_ENABLED
-		iLoopFilterDisableIdc	= 2;//pCodingParam.iLoopFilterDisableIdc;	// 0: on, 1: off, 2: on except for slice boundaries, 
-#else
-		iLoopFilterDisableIdc	= 0;	// 0: on, 1: off, 2: on except for slice boundaries
-#endif
-		iLoopFilterAlphaC0Offset= 0;	// AlphaOffset: valid range [-6, 6], default 0
-		iLoopFilterBetaOffset	= 0;	// BetaOffset:	valid range [-6, 6], default 0
-		iInterLayerLoopFilterDisableIdc	= iLoopFilterDisableIdc;	// Employed based upon inter-layer, same comment as above
-		iInterLayerLoopFilterAlphaC0Offset= 0;
-		iInterLayerLoopFilterBetaOffset	= 0;
-
-		bEnableFrameCroppingFlag	= true;
-
-		/* Rate Control */
-		bEnableRc			= kbEnableRc;
-		if (pCodingParam.iRCMode != RC_MODE0 && pCodingParam.iRCMode != RC_MODE1)
-			iRCMode = RC_MODE1;
-		else
-			iRCMode = pCodingParam.iRCMode;    // rc mode
-		iPaddingFlag= pCodingParam.iPaddingFlag;
-
-		iTargetBitrate		= pCodingParam.iTargetBitrate;	// target bitrate
-
-		/* Denoise Control */
-		bEnableDenoise = pCodingParam.bEnableDenoise ? true : false;    // Denoise Control  // only support 0 or 1 now  	
-
-		/* Scene change detection control */
-		bEnableSceneChangeDetect	= true;	   
-
-		/* Background detection Control */
-		bEnableBackgroundDetection = pCodingParam.bEnableBackgroundDetection ? true : false; 
-
-		/* Adaptive quantization control */
-		bEnableAdaptiveQuant	= pCodingParam.bEnableAdaptiveQuant ? true : false;	   
-
-		/* Enable cropping source picture */
-		bEnableCropPic	= pCodingParam.bEnableCropPic ? true : false;
-
-		/* Enable int32_t term reference */
-		bEnableLongTermReference	= pCodingParam.bEnableLongTermReference ? true : false;
-		uiLtrMarkPeriod = pCodingParam.iLtrMarkPeriod;
-
-		/* For ssei information */
-		bEnableSSEI		= true;
-
-		/* Layer definition */
-		iNumDependencyLayer	= (int8_t)WELS_CLIP3(pCodingParam.iSpatialLayerNum, 1, MAX_DEPENDENCY_LAYER); // number of dependency(Spatial/CGS) layers used to be encoded
-		pCodingParam.iTemporalLayerNum = (int8_t)WELS_CLIP3(pCodingParam.iTemporalLayerNum, 1, MAX_TEMPORAL_LEVEL);	// safe valid iTemporalLayerNum		
-		iNumTemporalLayer		= (int8_t)pCodingParam.iTemporalLayerNum;//(int8_t)WELS_CLIP3(pCodingParam.iTemporalLayerNum, 1, MAX_TEMPORAL_LEVEL);// number of temporal layer specified		
-
-		uiGopSize			= 1 << (iNumTemporalLayer-1);	// Override GOP size based temporal layer
-		iDecompStages		= iNumTemporalLayer-1;	// WELS_LOG2( uiGopSize );// GOP size dependency
-		uiIntraPeriod		= pCodingParam.iIntraPeriod;// intra period (multiple of GOP size as desired)
-		if ( uiIntraPeriod == (uint32_t)(-1) )
-			uiIntraPeriod = 0;
-		else if ( uiIntraPeriod & uiGopSize )	// none multiple of GOP size
-			uiIntraPeriod = ((uiIntraPeriod+uiGopSize-1) / uiGopSize) * uiGopSize;
-
-		iLTRRefNum = bEnableLongTermReference ? LONG_TERM_REF_NUM : 0;
-		iNumRefFrame		= ((uiGopSize>>1)>1)?((uiGopSize>>1)+iLTRRefNum):(MIN_REF_PIC_COUNT+iLTRRefNum);
-		iNumRefFrame		= WELS_CLIP3( iNumRefFrame, MIN_REF_PIC_COUNT, MAX_REFERENCE_PICTURE_COUNT_NUM );	
-
-		uiLtrMarkPeriod  = pCodingParam.iLtrMarkPeriod;
-
-		bPrefixNalAddingCtrl	= pCodingParam.bPrefixNalAddingCtrl;	
-		
-		bEnableSpsPpsIdAddition = pCodingParam.bEnableSpsPpsIdAddition;//For SVC meeting application, to avoid mosaic issue caused by cross-IDR reference. 
-		                                                               //SHOULD enable this feature.  
-
-		SDLayerParam *pDlp		= &sDependencyLayers[0];
-		float fMaxFr			= .0f;
-		uint8_t uiProfileIdc		= PRO_BASELINE;
-		int8_t iIdxSpatial	= 0;
-		while(iIdxSpatial < iNumDependencyLayer)
-		{
-			pDlp->uiProfileIdc		= uiProfileIdc;	
-
-			pCodingParam.sSpatialLayers[iIdxSpatial].fFrameRate	= WELS_CLIP3(pCodingParam.sSpatialLayers[iIdxSpatial].fFrameRate, MIN_FRAME_RATE, pCodingParam.fFrameRate);
-			pDlp->fInputFrameRate	= 
-			pDlp->fOutputFrameRate	= WELS_CLIP3(pCodingParam.sSpatialLayers[iIdxSpatial].fFrameRate, MIN_FRAME_RATE, MAX_FRAME_RATE);
-			if (pDlp->fInputFrameRate > fMaxFr+EPSN)
-				fMaxFr = pDlp->fInputFrameRate;
-
-#ifdef ENABLE_FRAME_DUMP
-			pDlp->sRecFileName[0]	= '\0';	// file to be constructed
-#endif//ENABLE_FRAME_DUMP
-			pDlp->iFrameWidth		= pCodingParam.sSpatialLayers[iIdxSpatial].iVideoWidth;	// frame width
-			pDlp->iFrameHeight		= pCodingParam.sSpatialLayers[iIdxSpatial].iVideoHeight;// frame height
-			pDlp->iSpatialBitrate	= pCodingParam.sSpatialLayers[iIdxSpatial].iSpatialBitrate;	// target bitrate for current spatial layer
-
-
-			//multi slice
-			pDlp->sMso.uiSliceMode = (SliceMode)pCodingParam.sSpatialLayers[iIdxSpatial].sSliceCfg.uiSliceMode;
-			pDlp->sMso.sSliceArgument.uiSliceSizeConstraint 
-				= (uint32_t)(pCodingParam.sSpatialLayers[iIdxSpatial].sSliceCfg.sSliceArgument.uiSliceSizeConstraint);
-			pDlp->sMso.sSliceArgument.iSliceNum 
-				= pCodingParam.sSpatialLayers[iIdxSpatial].sSliceCfg.sSliceArgument.uiSliceNum;
-			const int32_t kiLesserSliceNum = ((MAX_SLICES_NUM < MAX_SLICES_NUM_TMP) ? MAX_SLICES_NUM : MAX_SLICES_NUM_TMP);  
-			memcpy(pDlp->sMso.sSliceArgument.uiSliceMbNum, pCodingParam.sSpatialLayers[iIdxSpatial].sSliceCfg.sSliceArgument.uiSliceMbNum,	// confirmed_safe_unsafe_usage
-				kiLesserSliceNum * sizeof(uint32_t) ) ;
-
-			pDlp->iDLayerQp = SVC_QUALITY_BASE_QP;
-
-			uiProfileIdc	= PRO_SCALABLE_BASELINE;
-			++ pDlp;
-			++ iIdxSpatial;
-		}
-
-		fMaxFrameRate	= fMaxFr;
-
-		SetActualPicResolution();
-
-		return 0;
-	}
-
-	// assuming that the width/height ratio of all spatial layers are the same
-	
-	void SetActualPicResolution()
-	{
-		int32_t iSpatialIdx			= iNumDependencyLayer-1;
-		SDLayerParam *pDlayer		= &sDependencyLayers[iSpatialIdx];
-
-		for (; iSpatialIdx >= 0; iSpatialIdx -- )
-		{
-			pDlayer	= &sDependencyLayers[iSpatialIdx];
-
-			pDlayer->iActualWidth = pDlayer->iFrameWidth;
-			pDlayer->iActualHeight = pDlayer->iFrameHeight;
-			pDlayer->iFrameWidth = WELS_ALIGN(pDlayer->iActualWidth, MB_WIDTH_LUMA);
-			pDlayer->iFrameHeight = WELS_ALIGN(pDlayer->iActualHeight, MB_HEIGHT_LUMA);
-		}
-	}
-
-	/*!
-	* \brief	determined key coding tables for temporal scalability, uiProfileIdc etc for each spatial layer settings
-	* \param	SWelsSvcCodingParam, and carried with known GOP size, max, input and output frame rate of each spatial
-	* \return	NONE (should ensure valid parameter before this procedure)
-	*/
-	void DetermineTemporalSettings()
-	{		
-		const int32_t iDecStages		= WELS_LOG2( uiGopSize );	// (int8_t)GetLogFactor(1.0f, 1.0f * pcfg->uiGopSize);	//log2(uiGopSize)
-		const uint8_t *pTemporalIdList	= &g_kuiTemporalIdListTable[iDecStages][0];
-		SDLayerParam *pDlp				= &sDependencyLayers[0];
-		uint8_t uiProfileIdc				= PRO_BASELINE;
-		int8_t i						= 0;
-
-		while (i < iNumDependencyLayer )
-		{
-			const uint32_t kuiLogFactorInOutRate	= GetLogFactor(pDlp->fOutputFrameRate, pDlp->fInputFrameRate);
-			const uint32_t kuiLogFactorMaxInRate	= GetLogFactor(pDlp->fInputFrameRate, fMaxFrameRate);
-			int32_t iNotCodedMask= 0;
-			int8_t iMaxTemporalId = 0;
-
-			memset(pDlp->uiCodingIdx2TemporalId, INVALID_TEMPORAL_ID, sizeof(pDlp->uiCodingIdx2TemporalId));
-			pDlp->uiProfileIdc = uiProfileIdc;	// PRO_BASELINE, PRO_SCALABLE_BASELINE;			
-
-			iNotCodedMask	= (1 << (kuiLogFactorInOutRate + kuiLogFactorMaxInRate)) - 1;
-			for (uint32_t uiFrameIdx = 0; uiFrameIdx <= uiGopSize; ++ uiFrameIdx){						
-				if( 0 == (uiFrameIdx & iNotCodedMask) ) {				
-					const int8_t kiTemporalId = pTemporalIdList[uiFrameIdx];						
-					pDlp->uiCodingIdx2TemporalId[uiFrameIdx] = kiTemporalId;
-					if ( kiTemporalId > iMaxTemporalId )
-					{
-						iMaxTemporalId = kiTemporalId;
-					}
-				}
-			}
-
-			pDlp->iHighestTemporalId	= iMaxTemporalId;
-			pDlp->iTemporalResolution	= kuiLogFactorMaxInRate + kuiLogFactorInOutRate;
-			pDlp->iDecompositionStages	= iDecStages - kuiLogFactorMaxInRate - kuiLogFactorInOutRate;
-
-			uiProfileIdc	= PRO_SCALABLE_BASELINE;		
-			++ pDlp;
-			++ i;
-		}
-		iDecompStages = (int8_t)iDecStages;
-	}
-
-} SWelsSvcCodingParam;
-
-static inline int32_t FreeCodingParam( SWelsSvcCodingParam **pParam, CMemoryAlign *pMa )
-{
-	if (pParam == NULL || *pParam == NULL || pMa == NULL)
-		return 1;	
-	pMa->WelsFree(*pParam, "SWelsSvcCodingParam");
-	*pParam = NULL;
-	return 0;
-}
-
-static inline int32_t AllocCodingParam( SWelsSvcCodingParam **pParam, CMemoryAlign *pMa, const int32_t kiRequestNumSpatial )
-{
-	if ( pParam == NULL || pMa == NULL || kiRequestNumSpatial < 1 || kiRequestNumSpatial > MAX_SPATIAL_LAYER_NUM )
-		return 1;	
-	if (*pParam != NULL)
-	{
-		FreeCodingParam( pParam, pMa );
-	}
-	SWelsSvcCodingParam *pCodingParam = (SWelsSvcCodingParam *)pMa->WelsMalloc(sizeof(SWelsSvcCodingParam), "SWelsSvcCodingParam");
-	if ( NULL == pCodingParam )
-		return 1;
-	*pParam = pCodingParam;
-	return 0;
-}
-
-}//end of namespace WelsSVCEnc
-
-#endif//WELS_ENCODER_PARAMETER_SVC_H__
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	param_svc.h
+ *
+ * \brief	Configurable parameters in H.264/SVC Encoder
+ *
+ * \date	4/20/2009 Created
+ *
+ *************************************************************************************
+ */
+#if !defined(WELS_ENCODER_PARAMETER_SVC_H__)
+#define WELS_ENCODER_PARAMETER_SVC_H__
+
+#include <string.h>
+#include <math.h>
+#include "typedefs.h"
+#include "codec_def.h"
+#include "macros.h"
+#include "wels_const.h"
+#include "wels_common_basis.h"
+#include "rc.h"
+#include "svc_enc_slice_segment.h"
+#include "as264_common.h"
+
+namespace WelsSVCEnc {
+
+#define   INVALID_TEMPORAL_ID   ((uint8_t)0xff)
+
+extern const uint8_t   g_kuiTemporalIdListTable[MAX_TEMPORAL_LEVEL][MAX_GOP_SIZE + 1];
+
+/*!
+ * \brief  Express upper / base as a power of two.
+ * \param  base   denominator of the ratio
+ * \param  upper  numerator of the ratio
+ * \return the exponent when log2 (upper / base) lies within 0.0001 of an integer, otherwise UINT_MAX
+ * \note   review: for upper < base the matched exponent is negative before the uint32_t cast -- confirm callers avoid it
+ */
+static __inline uint32_t GetLogFactor (real32_t base, real32_t upper) {
+  const double kdTolerance = 0.0001;
+  const double kdExponent = log10 (1.0 * upper / base) / log10 (2.0);
+  const double kdNearest = floor (kdExponent + 0.5);
+  const bool kbBelow = kdExponent < kdNearest + kdTolerance;
+  const bool kbAbove = kdNearest < kdExponent + kdTolerance;
+
+  return (kbBelow && kbAbove) ? (uint32_t) (kdNearest) : UINT_MAX;
+}
+
+/*
+ * Dependency layer parameters: one SDLayerParam per entry of sDependencyLayers[MAX_DEPENDENCY_LAYER]
+ */
+typedef struct TagDLayerParam {
+int32_t		iActualWidth;			// input source picture actual width
+int32_t		iActualHeight;			// input source picture actual height
+int32_t		iFrameWidth;			// frame width
+int32_t		iFrameHeight;			// frame height
+
+int32_t		iSpatialBitrate; // target bitrate for this spatial layer
+
+/* temporal settings related */
+int32_t		iTemporalResolution; // NOTE(review): looks like a log2 frame-rate factor set by DetermineTemporalSettings -- confirm
+int32_t		iDecompositionStages; // NOTE(review): appears to be the GOP decomposition stages left for this layer -- confirm
+uint8_t     uiCodingIdx2TemporalId[ (1 << MAX_TEMPORAL_LEVEL) + 1]; // temporal id per coding index; (1 << MAX_TEMPORAL_LEVEL) + 1 entries
+
+uint8_t		uiProfileIdc;			// value of profile IDC (0 for auto-detection)
+
+int8_t		iHighestTemporalId; // presumably the largest temporal id used by this layer -- confirm
+//	uint8_t		uiDependencyId;
+int8_t      iDLayerQp; // NOTE(review): QP of this layer; ParamTranscode seems to store SVC_QUALITY_BASE_QP -- confirm
+
+SMulSliceOption sMso;	// multiple slice options
+
+float		fInputFrameRate;		// input frame rate
+float		fOutputFrameRate;		// output frame rate
+
+#ifdef ENABLE_FRAME_DUMP
+str_t		sRecFileName[MAX_FNAME_LEN];	// file to be constructed
+#endif//ENABLE_FRAME_DUMP
+} SDLayerParam;
+
+/*
+ *	Cisco OpenH264 Encoder Parameter Configuration
+ */
+typedef struct TagWelsSvcCodingParam {
+SDLayerParam	sDependencyLayers[MAX_DEPENDENCY_LAYER];
+
+/* General */
+#ifdef ENABLE_TRACE_FILE
+str_t			sTracePath[MAX_FNAME_LEN];		// log file for wels encoder
+#endif
+
+uint32_t	uiGopSize;			// GOP size (at maximal frame rate: 16)
+uint32_t	uiIntraPeriod;		// intra period (multiple of GOP size as desired)
+int32_t		iNumRefFrame;		// number of reference frame used
+
+int32_t     iActualPicWidth;    //   actual input picture width
+int32_t     iActualPicHeight;   //   actual input picture height
+
+struct {
+  int32_t iLeft;
+  int32_t iTop;
+  int32_t iWidth;
+  int32_t iHeight;
+} SUsedPicRect;	// the rect in input picture that encoder actually used
+
+str_t*       pCurPath; // record current lib path such as:/pData/pData/com.wels.enc/lib/
+
+float		fMaxFrameRate;		// maximal frame rate [Hz / fps]
+int32_t		iInputCsp;			// color space of input sequence
+uint32_t	uiFrameToBeCoded;	// frame to be encoded (at input frame rate)
+
+int32_t		iTargetBitrate;			// overall target bitrate introduced in RC module
+int16_t		iMultipleThreadIdc;		// 1	# 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
+int16_t		iCountThreadsNum;			//		# derived from disable_multiple_slice_idc (=0 or >1) means;
+
+int32_t		iLTRRefNum;
+uint32_t    uiLtrMarkPeriod;	//the min distance of two int32_t references
+
+bool_t		bDeblockingParallelFlag;	// deblocking filter parallelization control flag
+bool_t		bMgsT0OnlyStrategy; //MGS_T0_only_strategy
+bool_t		bEnableSSEI;
+bool_t		bEnableFrameCroppingFlag;	// enable frame cropping flag: TRUE alwayse in application
+
+bool_t		bEnableCropPic;			// enable cropping source picture. , 8/25/2010
+// FALSE: Streaming Video Sharing; TRUE: Video Conferencing Meeting;
+int8_t		iDecompStages;		// GOP size dependency
+
+/* Deblocking loop filter */
+int8_t		iLoopFilterDisableIdc;	// 0: on, 1: off, 2: on except for slice boundaries
+int8_t		iLoopFilterAlphaC0Offset;// AlphaOffset: valid range [-6, 6], default 0
+
+int8_t		iLoopFilterBetaOffset;	// BetaOffset:	valid range [-6, 6], default 0
+int8_t		iInterLayerLoopFilterDisableIdc; // Employed based upon inter-layer, same comment as above
+int8_t		iInterLayerLoopFilterAlphaC0Offset;	// InterLayerLoopFilterAlphaC0Offset
+int8_t		iInterLayerLoopFilterBetaOffset;	// InterLayerLoopFilterBetaOffset
+
+/* Rate Control */
+bool_t		bEnableRc;
+int8_t		iRCMode;
+int8_t		iPaddingFlag;
+/* denoise control */
+bool_t      bEnableDenoise;
+
+/* scene change detection control */
+bool_t      bEnableSceneChangeDetect;
+// background detection control
+bool_t		bEnableBackgroundDetection;
+/* adaptive quantization control */
+bool_t		bEnableAdaptiveQuant;
+/* long term reference control */
+bool_t      bEnableLongTermReference;
+
+/* pSps pPps id addition control */
+bool_t      bEnableSpsPpsIdAddition;
+/* Layer definition */
+bool_t		bPrefixNalAddingCtrl;
+int8_t		iNumDependencyLayer;	// number of dependency(Spatial/CGS) layers used to be encoded
+int8_t		iNumTemporalLayer;		// number of temporal layer specified
+
+
+
+ public:
+TagWelsSvcCodingParam (const bool_t kbEnableRc = true) { // constructor: kbEnableRc defaults to true
+  FillDefault (kbEnableRc); // all initial values come from FillDefault
+}
+~TagWelsSvcCodingParam()	{} // empty body: no member (pCurPath included) is freed here
+
+/*! \brief  Load the built-in default into every scalar option; sDependencyLayers is not written.
+ *  \param  kbEnableRc  value copied into bEnableRc
+ */
+void FillDefault (const bool_t kbEnableRc) {
+  /* Picture geometry: nothing known yet, so the used rectangle is empty */
+  iActualPicWidth = 0;
+  iActualPicHeight = 0;
+  SUsedPicRect.iLeft = SUsedPicRect.iTop = 0;
+  SUsedPicRect.iWidth = SUsedPicRect.iHeight = 0;
+
+  /* Input sequence */
+  pCurPath = NULL; // no library path recorded
+  iInputCsp = videoFormatI420;
+  fMaxFrameRate = MAX_FRAME_RATE; // [Hz / fps]
+  uiFrameToBeCoded = (uint32_t) (-1); // all bits set
+
+  /* GOP layout and reference frames */
+  uiGopSize = 1;
+  uiIntraPeriod = 0;
+  iDecompStages = 0; // depends on the GOP size; revised later
+  iNumRefFrame = MIN_REF_PIC_COUNT;
+  iLTRRefNum = 0;
+  uiLtrMarkPeriod = 30; // minimum distance between two long term references
+  bEnableLongTermReference = false;
+
+  /* Threading; iMultipleThreadIdc: 0 auto, 1 multiple threads disabled, > 1 number of threads */
+#ifdef MT_ENABLED
+  iMultipleThreadIdc = 0;
+#else
+  iMultipleThreadIdc = 1;
+#endif//MT_ENABLED
+  iCountThreadsNum = 1;
+  bDeblockingParallelFlag = false;
+
+  /* Deblocking loop filter; DisableIdc: 0 on, 1 off, 2 on except for slice boundaries */
+  iLoopFilterDisableIdc = 1;
+  iLoopFilterAlphaC0Offset = 0; // valid range [-6, 6]
+  iLoopFilterBetaOffset = 0; // valid range [-6, 6]
+  iInterLayerLoopFilterDisableIdc = 1;
+  iInterLayerLoopFilterAlphaC0Offset = 0;
+  iInterLayerLoopFilterBetaOffset = 0;
+
+  /* Rate control */
+  bEnableRc = kbEnableRc;
+  iTargetBitrate = 0;
+  iRCMode = 0;
+  iPaddingFlag = 0;
+
+  /* Feature switches and layer counts (no layer configured yet) */
+  bMgsT0OnlyStrategy = true; // MGS only at T0 frames
+  bEnableSSEI = true;
+  bEnableFrameCroppingFlag = true;
+  bEnableCropPic = true;
+  bEnableDenoise = false;
+  bEnableSceneChangeDetect = true;
+  bEnableBackgroundDetection = true;
+  bEnableAdaptiveQuant = true;
+  bEnableSpsPpsIdAddition = true;
+  bPrefixNalAddingCtrl = true;
+  iNumDependencyLayer = 0;
+  iNumTemporalLayer = 0;
+}
+
+/*!
+* \brief   translate the public encoder settings into this internal coding parameter set
+* \param   pCodingParam  public settings; its frame rates and iTemporalLayerNum are clipped in place
+* \param   kbEnableRc    value stored into bEnableRc (defaults to true)
+* \return  always 0
+*/
+int32_t ParamTranscode (SVCEncodingParam& pCodingParam, const bool_t kbEnableRc = true) {
+  pCodingParam.fFrameRate = WELS_CLIP3 (pCodingParam.fFrameRate, MIN_FRAME_RATE, MAX_FRAME_RATE);
+  iInputCsp = pCodingParam.iInputCsp; // color space of input sequence
+  uiFrameToBeCoded = (uint32_t) (-1); // frames to be encoded (at input frame rate); -1 depends on input length
+
+  iActualPicWidth  = pCodingParam.iPicWidth;
+  iActualPicHeight = pCodingParam.iPicHeight;
+
+  SUsedPicRect.iLeft = 0;
+  SUsedPicRect.iTop  = 0;
+  SUsedPicRect.iWidth  = ((iActualPicWidth >> 1) << 1);  // lowest bit cleared, i.e. rounded down to even
+  SUsedPicRect.iHeight = ((iActualPicHeight >> 1) << 1);
+
+  /* Deblocking loop filter */
+#ifdef MT_ENABLED
+  iLoopFilterDisableIdc = 2; // pCodingParam.iLoopFilterDisableIdc is not used; 0: on, 1: off, 2: on except for slice boundaries
+#else
+  iLoopFilterDisableIdc = 0; // 0: on, 1: off, 2: on except for slice boundaries
+#endif
+  iLoopFilterAlphaC0Offset = 0; // AlphaOffset: valid range [-6, 6], default 0
+  iLoopFilterBetaOffset = 0; // BetaOffset: valid range [-6, 6], default 0
+  iInterLayerLoopFilterDisableIdc = iLoopFilterDisableIdc; // Employed based upon inter-layer, same values as above
+  iInterLayerLoopFilterAlphaC0Offset = 0;
+  iInterLayerLoopFilterBetaOffset = 0;
+
+  bEnableFrameCroppingFlag = true;
+
+  /* Rate Control */
+  bEnableRc = kbEnableRc;
+  if (pCodingParam.iRCMode != RC_MODE0 && pCodingParam.iRCMode != RC_MODE1)
+    iRCMode = RC_MODE1; // any unknown mode falls back to RC_MODE1
+  else
+    iRCMode = pCodingParam.iRCMode; // rc mode
+  iPaddingFlag = pCodingParam.iPaddingFlag;
+
+  iTargetBitrate = pCodingParam.iTargetBitrate; // target bitrate
+
+  /* Denoise Control */
+  bEnableDenoise = pCodingParam.bEnableDenoise ? true : false; // only support 0 or 1 now
+
+  /* Scene change detection control */
+  bEnableSceneChangeDetect = true;
+
+  /* Background detection Control */
+  bEnableBackgroundDetection = pCodingParam.bEnableBackgroundDetection ? true : false;
+
+  /* Adaptive quantization control */
+  bEnableAdaptiveQuant = pCodingParam.bEnableAdaptiveQuant ? true : false;
+
+  /* Enable cropping source picture */
+  bEnableCropPic = pCodingParam.bEnableCropPic ? true : false;
+
+  /* Enable long term reference */
+  bEnableLongTermReference = pCodingParam.bEnableLongTermReference ? true : false;
+  uiLtrMarkPeriod = pCodingParam.iLtrMarkPeriod;
+
+  /* For ssei information */
+  bEnableSSEI = true;
+
+  /* Layer definition */
+  // number of dependency (Spatial/CGS) layers to be encoded
+  iNumDependencyLayer = (int8_t)WELS_CLIP3 (pCodingParam.iSpatialLayerNum, 1, MAX_DEPENDENCY_LAYER);
+  // clipped in place, so the caller's structure also ends up with a valid iTemporalLayerNum
+  pCodingParam.iTemporalLayerNum = (int8_t)WELS_CLIP3 (pCodingParam.iTemporalLayerNum, 1, MAX_TEMPORAL_LEVEL);
+  iNumTemporalLayer = (int8_t)pCodingParam.iTemporalLayerNum; // number of temporal layers specified
+
+  uiGopSize = 1 << (iNumTemporalLayer - 1); // Override GOP size based on temporal layers (always a power of two)
+  iDecompStages = iNumTemporalLayer - 1; // GOP size dependency
+  uiIntraPeriod = pCodingParam.iIntraPeriod; // intra period (multiple of GOP size as desired)
+  if (uiIntraPeriod == (uint32_t) (-1))
+    uiIntraPeriod = 0;
+  else if (uiIntraPeriod & (uiGopSize - 1)) // low bits set: not a multiple of the power-of-two GOP size
+    uiIntraPeriod = ((uiIntraPeriod + uiGopSize - 1) / uiGopSize) * uiGopSize; // round up to the next multiple
+
+  iLTRRefNum = bEnableLongTermReference ? LONG_TERM_REF_NUM : 0;
+  iNumRefFrame = ((uiGopSize >> 1) > 1) ? ((uiGopSize >> 1) + iLTRRefNum) : (MIN_REF_PIC_COUNT + iLTRRefNum);
+  iNumRefFrame = WELS_CLIP3 (iNumRefFrame, MIN_REF_PIC_COUNT, MAX_REFERENCE_PICTURE_COUNT_NUM);
+
+  bPrefixNalAddingCtrl = pCodingParam.bPrefixNalAddingCtrl;
+
+  // For SVC meeting application, to avoid mosaic issue caused by cross-IDR reference.
+  // SHOULD enable this feature.
+  bEnableSpsPpsIdAddition = pCodingParam.bEnableSpsPpsIdAddition;
+
+  SDLayerParam* pDlp = &sDependencyLayers[0];
+  float fMaxFr = .0f;
+  uint8_t uiProfileIdc = PRO_BASELINE; // layer 0 only; switched to PRO_SCALABLE_BASELINE after the first pass
+  int8_t iIdxSpatial = 0;
+  while (iIdxSpatial < iNumDependencyLayer) {
+    pDlp->uiProfileIdc = uiProfileIdc;
+
+    pCodingParam.sSpatialLayers[iIdxSpatial].fFrameRate = WELS_CLIP3 (pCodingParam.sSpatialLayers[iIdxSpatial].fFrameRate,
+        MIN_FRAME_RATE, pCodingParam.fFrameRate);
+    pDlp->fInputFrameRate =
+      pDlp->fOutputFrameRate = WELS_CLIP3 (pCodingParam.sSpatialLayers[iIdxSpatial].fFrameRate, MIN_FRAME_RATE,
+                                           MAX_FRAME_RATE);
+    if (pDlp->fInputFrameRate > fMaxFr + EPSN)
+      fMaxFr = pDlp->fInputFrameRate; // track the highest layer frame rate seen so far
+
+#ifdef ENABLE_FRAME_DUMP
+    pDlp->sRecFileName[0] = '\0'; // file to be constructed
+#endif//ENABLE_FRAME_DUMP
+    pDlp->iFrameWidth = pCodingParam.sSpatialLayers[iIdxSpatial].iVideoWidth; // frame width
+    pDlp->iFrameHeight = pCodingParam.sSpatialLayers[iIdxSpatial].iVideoHeight; // frame height
+    pDlp->iSpatialBitrate = pCodingParam.sSpatialLayers[iIdxSpatial].iSpatialBitrate; // target bitrate of this layer
+
+    //multi slice
+    pDlp->sMso.uiSliceMode = (SliceMode)pCodingParam.sSpatialLayers[iIdxSpatial].sSliceCfg.uiSliceMode;
+    pDlp->sMso.sSliceArgument.uiSliceSizeConstraint
+      = (uint32_t) (pCodingParam.sSpatialLayers[iIdxSpatial].sSliceCfg.sSliceArgument.uiSliceSizeConstraint);
+    pDlp->sMso.sSliceArgument.iSliceNum
+      = pCodingParam.sSpatialLayers[iIdxSpatial].sSliceCfg.sSliceArgument.uiSliceNum;
+    const int32_t kiLesserSliceNum = ((MAX_SLICES_NUM < MAX_SLICES_NUM_TMP) ? MAX_SLICES_NUM : MAX_SLICES_NUM_TMP);
+    memcpy (pDlp->sMso.sSliceArgument.uiSliceMbNum,
+            pCodingParam.sSpatialLayers[iIdxSpatial].sSliceCfg.sSliceArgument.uiSliceMbNum, // confirmed_safe_unsafe_usage
+            kiLesserSliceNum * sizeof (uint32_t)); // copy no more entries than the smaller of the two limits
+
+    pDlp->iDLayerQp = SVC_QUALITY_BASE_QP;
+
+    uiProfileIdc = PRO_SCALABLE_BASELINE;
+    ++ pDlp;
+    ++ iIdxSpatial;
+  }
+
+  fMaxFrameRate = fMaxFr;
+
+  SetActualPicResolution();
+
+  return 0;
+}
+
+// assuming that the width/height ratio is the same for all spatial layers
+
+void SetActualPicResolution() {
+  // For every dependency layer, highest index first: remember the configured frame size as the
+  // actual size, then replace the frame size with WELS_ALIGN of it (MB_WIDTH_LUMA / MB_HEIGHT_LUMA).
+  for (int32_t iLayer = iNumDependencyLayer - 1; iLayer >= 0; -- iLayer) {
+    SDLayerParam* pLayer = &sDependencyLayers[iLayer];
+
+    pLayer->iActualWidth = pLayer->iFrameWidth;
+    pLayer->iFrameWidth  = WELS_ALIGN (pLayer->iActualWidth, MB_WIDTH_LUMA);
+
+    pLayer->iActualHeight = pLayer->iFrameHeight;
+    pLayer->iFrameHeight  = WELS_ALIGN (pLayer->iActualHeight, MB_HEIGHT_LUMA);
+  }
+}
+
+/*!
+* \brief   fill the per-layer temporal tables (uiCodingIdx2TemporalId, iHighestTemporalId,
+*          iTemporalResolution, iDecompositionStages) and uiProfileIdc of every dependency layer
+* \param   none (reads uiGopSize, fMaxFrameRate and the frame rates already stored in each layer)
+* \return  NONE (should ensure valid parameter before this procedure)
+*/
+void DetermineTemporalSettings() {
+  // kiStages selects the row of g_kuiTemporalIdListTable that is read below
+  const int32_t kiStages = WELS_LOG2 (uiGopSize);
+  const uint8_t* pIdList = &g_kuiTemporalIdListTable[kiStages][0];
+  uint8_t uiLayerProfile = PRO_BASELINE; // layer 0 only; every later layer gets PRO_SCALABLE_BASELINE
+  SDLayerParam* pLayer = &sDependencyLayers[0];
+
+  for (int8_t iLayer = 0; iLayer < iNumDependencyLayer; ++ iLayer, ++ pLayer) {
+    const uint32_t kuiInOut = GetLogFactor (pLayer->fOutputFrameRate, pLayer->fInputFrameRate);
+    const uint32_t kuiMaxIn = GetLogFactor (pLayer->fInputFrameRate, fMaxFrameRate);
+    const uint32_t kuiShift = kuiInOut + kuiMaxIn;
+    // coding indices with any of the low kuiShift bits set are skipped and keep the memset fill
+    const int32_t kiSkipMask = (1 << kuiShift) - 1;
+    int8_t iTopId = 0;
+
+    memset (pLayer->uiCodingIdx2TemporalId, INVALID_TEMPORAL_ID, sizeof (pLayer->uiCodingIdx2TemporalId));
+    pLayer->uiProfileIdc = uiLayerProfile; // PRO_BASELINE, PRO_SCALABLE_BASELINE
+    uiLayerProfile = PRO_SCALABLE_BASELINE;
+
+    // the bound is inclusive, so uiGopSize + 1 entries of the table row are examined
+    for (uint32_t uiIdx = 0; uiIdx <= uiGopSize; ++ uiIdx) {
+      if ((uiIdx & kiSkipMask) != 0)
+        continue;
+
+      const int8_t kiId = pIdList[uiIdx];
+      pLayer->uiCodingIdx2TemporalId[uiIdx] = kiId;
+      if (iTopId < kiId)
+        iTopId = kiId;
+    }
+
+    pLayer->iHighestTemporalId = iTopId;
+    pLayer->iTemporalResolution = kuiShift;
+    pLayer->iDecompositionStages = kiStages - kuiShift;
+  }
+
+  iDecompStages = (int8_t)kiStages;
+}
+
+} SWelsSvcCodingParam;
+
+static inline int32_t FreeCodingParam (SWelsSvcCodingParam** pParam, CMemoryAlign* pMa) {
+  // Nothing to release, or no allocator to release it with: report failure (1).
+  if (! (pParam != NULL && *pParam != NULL && pMa != NULL)) return 1;
+  pMa->WelsFree (*pParam, "SWelsSvcCodingParam");
+  *pParam = NULL; // leave the caller's slot empty so a repeated call returns 1 instead of freeing again
+  return 0;
+}
+
+// Replace *pParam with a freshly allocated SWelsSvcCodingParam obtained from pMa.
+// Returns 0 on success; 1 on a NULL argument, an out-of-range layer count or a failed allocation.
+static inline int32_t AllocCodingParam (SWelsSvcCodingParam** pParam, CMemoryAlign* pMa,
+                                        const int32_t kiRequestNumSpatial) {
+  // kiRequestNumSpatial is only range-checked; the allocation size does not depend on it
+  const bool kbLayerNumValid = (kiRequestNumSpatial >= 1) && (kiRequestNumSpatial <= MAX_SPATIAL_LAYER_NUM);
+  if (! (pParam != NULL && pMa != NULL && kbLayerNumValid))
+    return 1;
+  if (*pParam != NULL)
+    FreeCodingParam (pParam, pMa); // release the previous instance; this also resets *pParam to NULL
+
+  *pParam = (SWelsSvcCodingParam*)pMa->WelsMalloc (sizeof (SWelsSvcCodingParam), "SWelsSvcCodingParam");
+  return (NULL == *pParam) ? 1 : 0;
+}
+
+}//end of namespace WelsSVCEnc
+
+#endif//WELS_ENCODER_PARAMETER_SVC_H__
--- a/codec/encoder/core/inc/parameter_sets.h
+++ b/codec/encoder/core/inc/parameter_sets.h
@@ -41,122 +41,122 @@
 //#pragma pack(1)
 
 /* Sequence Parameter Set, refer to Page 57 in JVT X201wcm */
-typedef struct TagWelsSPS{
-	uint32_t	uiSpsId;
-	int16_t		iMbWidth;
-	int16_t		iMbHeight;	
-	uint32_t	uiLog2MaxFrameNum;
+typedef struct TagWelsSPS {
+uint32_t	uiSpsId;
+int16_t		iMbWidth;
+int16_t		iMbHeight;
+uint32_t	uiLog2MaxFrameNum;
 //	uint32_t	uiPocType;
-	/* POC type 0 */
-	int32_t		iLog2MaxPocLsb;
-	/* POC type 1 */
+/* POC type 0 */
+int32_t		iLog2MaxPocLsb;
+/* POC type 1 */
 //	int32_t		iOffsetForNonRefPic;
 
 //	int32_t		iOffsetForTopToBottomField;
 //	int32_t		iNumRefFramesInPocCycle;
-//	int8_t		iOffsetForRefFrame[256];	
-	SCropOffset	sFrameCrop;
-	int16_t		iNumRefFrames;	
+//	int8_t		iOffsetForRefFrame[256];
+SCropOffset	sFrameCrop;
+int16_t		iNumRefFrames;
 //	uint32_t	uiNumUnitsInTick;
 //	uint32_t	uiTimeScale;
-	
-	uint8_t		uiProfileIdc;
-	uint8_t		iLevelIdc;
+
+uint8_t		uiProfileIdc;
+uint8_t		iLevelIdc;
 //	uint8_t		uiChromaFormatIdc;
 //	uint8_t		uiChromaArrayType;		//support =1
-	
+
 //	uint8_t		uiBitDepthLuma;         //=8, only used in decoder, encoder in general_***; it can be removed when removed general up_sample
 //	uint8_t		uiBitDepthChroma;		//=8
-	/* TO BE CONTINUE: POC type 1 */
-//	bool_t		bDeltaPicOrderAlwaysZeroFlag;	
+/* TO BE CONTINUED: POC type 1 */
+//	bool_t		bDeltaPicOrderAlwaysZeroFlag;
 //	bool_t		bGapsInFrameNumValueAllowedFlag;	//=true
 
 //	bool_t		bFrameMbsOnlyFlag;
 //	bool_t		bMbaffFlag;	// MB Adapative Frame Field
 //	bool_t		bDirect8x8InferenceFlag;
-	bool_t		bFrameCroppingFlag;
+bool_t		bFrameCroppingFlag;
 
 //	bool_t		bVuiParamPresentFlag;
 //	bool_t		bTimingInfoPresentFlag;
 //	bool_t		bFixedFrameRateFlag;
 
-	bool_t		bConstraintSet0Flag;
-	bool_t		bConstraintSet1Flag;
-	bool_t		bConstraintSet2Flag;
+bool_t		bConstraintSet0Flag;
+bool_t		bConstraintSet1Flag;
+bool_t		bConstraintSet2Flag;
 
 //	bool_t		bConstraintSet3Flag;		// reintroduce constrain_set3_flag instead of reserved filling bytes here
 //	bool_t		bSeparateColorPlaneFlag;  // =false,: only used in decoder, encoder in general_***; it can be removed when removed general up_sample
-	
-}SWelsSPS, *PWelsSPS;
 
+} SWelsSPS, *PWelsSPS;
 
+
 /* Sequence Parameter Set SVC extension syntax, refer to Page 391 in JVT X201wcm */
-typedef struct TagSpsSvcExt{
+typedef struct TagSpsSvcExt {
 //	SCropOffset	sSeqScaledRefLayer;
-	
-	uint8_t		iExtendedSpatialScalability;	// ESS
+
+uint8_t		iExtendedSpatialScalability;	// ESS
 //	uint8_t		uiChromaPhaseXPlus1Flag;
 //	uint8_t		uiChromaPhaseYPlus1;
 //	uint8_t		uiSeqRefLayerChromaPhaseXPlus1Flag;
 //	uint8_t		uiSeqRefLayerChromaPhaseYPlus1;
 //	bool_t		bInterLayerDeblockingFilterCtrlPresentFlag;
-	bool_t		bSeqTcoeffLevelPredFlag;
-	bool_t		bAdaptiveTcoeffLevelPredFlag;
-	bool_t		bSliceHeaderRestrictionFlag;	
-}SSpsSvcExt, *PSpsSvcExt;
+bool_t		bSeqTcoeffLevelPredFlag;
+bool_t		bAdaptiveTcoeffLevelPredFlag;
+bool_t		bSliceHeaderRestrictionFlag;
+} SSpsSvcExt, *PSpsSvcExt;
 
 /* Subset sequence parameter set syntax, refer to Page 391 in JVT X201wcm */
-typedef struct TagSubsetSps{	
-	SWelsSPS		pSps;
-	SSpsSvcExt	sSpsSvcExt;
+typedef struct TagSubsetSps {
+SWelsSPS		pSps;
+SSpsSvcExt	sSpsSvcExt;
 
-//	bool_t		bSvcVuiParamPresentFlag;	
+//	bool_t		bSvcVuiParamPresentFlag;
 //	bool_t		bAdditionalExtension2Flag;
 //	bool_t		bAdditionalExtension2DataFlag;
-}SSubsetSps, *PSubsetSps;
+} SSubsetSps, *PSubsetSps;
 
 /* Picture parameter set syntax, refer to Page 59 in JVT X201wcm */
-typedef struct TagWelsPPS{
-	uint32_t	iSpsId;
-	uint32_t	iPpsId;
-		
+typedef struct TagWelsPPS {
+uint32_t	iSpsId;
+uint32_t	iPpsId;
+
 #if !defined(DISABLE_FMO_FEATURE)
-	uint32_t	uiNumSliceGroups;
-	uint32_t	uiSliceGroupMapType;
-	/* uiSliceGroupMapType = 0 */
-	uint32_t	uiRunLength[MAX_SLICEGROUP_IDS];
-	/* uiSliceGroupMapType = 2 */
-	uint32_t	uiTopLeft[MAX_SLICEGROUP_IDS];
-	uint32_t	uiBottomRight[MAX_SLICEGROUP_IDS];
-	/* uiSliceGroupMapType = 3, 4 or 5 */
-	/* uiSliceGroupMapType = 3, 4 or 5 */
-	bool_t		bSliceGroupChangeDirectionFlag;
-	uint32_t	uiSliceGroupChangeRate;
-	/* uiSliceGroupMapType = 6 */
-	uint32_t	uiPicSizeInMapUnits;
-	uint32_t	uiSliceGroupId[MAX_SLICEGROUP_IDS];
+uint32_t	uiNumSliceGroups;
+uint32_t	uiSliceGroupMapType;
+/* uiSliceGroupMapType = 0 */
+uint32_t	uiRunLength[MAX_SLICEGROUP_IDS];
+/* uiSliceGroupMapType = 2 */
+uint32_t	uiTopLeft[MAX_SLICEGROUP_IDS];
+uint32_t	uiBottomRight[MAX_SLICEGROUP_IDS];
+/* uiSliceGroupMapType = 3, 4 or 5 */
+/* uiSliceGroupMapType = 3, 4 or 5 */
+bool_t		bSliceGroupChangeDirectionFlag;
+uint32_t	uiSliceGroupChangeRate;
+/* uiSliceGroupMapType = 6 */
+uint32_t	uiPicSizeInMapUnits;
+uint32_t	uiSliceGroupId[MAX_SLICEGROUP_IDS];
 #endif//!DISABLE_FMO_FEATURE
-	
+
 //	uint32_t	uiNumRefIdxL0Active;
 //	uint32_t	uiNumRefIdxL1Active;
-	
-	int8_t		iPicInitQp;
-	int8_t		iPicInitQs;
-	uint8_t		uiChromaQpIndexOffset;	
-	
-	/* potential application for High profile */
+
+int8_t		iPicInitQp;
+int8_t		iPicInitQs;
+uint8_t		uiChromaQpIndexOffset;
+
+/* potential application for High profile */
 //	int32_t		iSecondChromaQpIndexOffset;
 //	/* potential application for High profile */
 
 //	bool_t		bPicOrderPresentFlag;
-	
-	bool_t		bDeblockingFilterControlPresentFlag;
-	
+
+bool_t		bDeblockingFilterControlPresentFlag;
+
 //	bool_t		bConstainedIntraPredFlag;
 //	bool_t		bRedundantPicCntPresentFlag;
 //	bool_t		bWeightedPredFlag;
 //	uint8_t		uiWeightedBiPredIdc;
-	
+
 } SWelsPPS, *PWelsPPPS;
 
 //#pragma pack()
--- a/codec/encoder/core/inc/picture.h
+++ b/codec/encoder/core/inc/picture.h
@@ -45,40 +45,40 @@
  *	Reconstructed Picture definition
  *	It is used to express reference picture, also consequent reconstruction picture for output
  */
-typedef struct TagPicture{
-	/************************************payload pData*********************************/
-	uint8_t		*pBuffer;		// pointer to the first allocated byte, basical offset of pBuffer, dimension:
-	uint8_t		*pData[3];		// pointer to picture planes respectively
-	int32_t		iLineSize[3];	// iLineSize of picture planes respectively
+typedef struct TagPicture {
+  /************************************payload pData*********************************/
+  uint8_t*		pBuffer;		// pointer to the first allocated byte, basical offset of pBuffer, dimension:
+  uint8_t*		pData[3];		// pointer to picture planes respectively
+  int32_t		iLineSize[3];	// iLineSize of picture planes respectively
 
-	// picture information
-	/*******************************from other standard syntax****************************/
-	/*from pSps*/
-	int32_t		iWidthInPixel;	// picture width in pixel
-	int32_t		iHeightInPixel;// picture height in pixel
-	int32_t		iPictureType;	// got from sSliceHeader(): eSliceType
-	int32_t		iFramePoc;		// frame POC
+  // picture information
+  /*******************************from other standard syntax****************************/
+  /*from pSps*/
+  int32_t		iWidthInPixel;	// picture width in pixel
+  int32_t		iHeightInPixel;// picture height in pixel
+  int32_t		iPictureType;	// got from sSliceHeader(): eSliceType
+  int32_t		iFramePoc;		// frame POC
 
-	real32_t	fFrameRate;   // MOVE
-	int32_t		iFrameNum;		// frame number			//for pRef pic management
+  real32_t	fFrameRate;   // MOVE
+  int32_t		iFrameNum;		// frame number			//for pRef pic management
 
-	uint32_t	*uiRefMbType;	// for iMbWidth*iMbHeight	
-	uint8_t		*pRefMbQp;		// for iMbWidth*iMbHeight
+  uint32_t*	uiRefMbType;	// for iMbWidth*iMbHeight
+  uint8_t*		pRefMbQp;		// for iMbWidth*iMbHeight
 
-	int32_t     *pMbSkipSad;   //for iMbWidth*iMbHeight
+  int32_t*     pMbSkipSad;   //for iMbWidth*iMbHeight
 
-	SMVUnitXY	*sMvList;
+  SMVUnitXY*	sMvList;
 
-	/*******************************sef_definition for misc use****************************/
-	int32_t		iMarkFrameNum;
-	int32_t		iLongTermPicNum;
+  /*******************************self_definition for misc use****************************/
+  int32_t		iMarkFrameNum;
+  int32_t		iLongTermPicNum;
 
-	bool_t		bUsedAsRef;						//for pRef pic management
-	bool_t		bIsLongRef;	// long term reference frame flag	//for pRef pic management
-	uint8_t		uiRecieveConfirmed;
-	uint8_t		uiTemporalId;
-	uint8_t		uiSpatialId;	
-}SPicture;	
+  bool_t		bUsedAsRef;						//for pRef pic management
+  bool_t		bIsLongRef;	// long term reference frame flag	//for pRef pic management
+  uint8_t		uiRecieveConfirmed;
+  uint8_t		uiTemporalId;
+  uint8_t		uiSpatialId;
+} SPicture;
 
 /*
  *	Residual Picture
--- a/codec/encoder/core/inc/picture_handle.h
+++ b/codec/encoder/core/inc/picture_handle.h
@@ -52,7 +52,7 @@
  * \pram	need_expand		need borders expanding
  * \return	successful if effective picture pointer returned, otherwise failed with NULL
  */
-SPicture *AllocPicture( CMemoryAlign *pMa, const int32_t kiWidth, const int32_t kiHeight, bool_t bNeedMbInfo );
+SPicture* AllocPicture (CMemoryAlign* pMa, const int32_t kiWidth, const int32_t kiHeight, bool_t bNeedMbInfo);
 
 /*!
  * \brief	free picture pData planes
@@ -59,7 +59,7 @@
  * \param	pic		picture pointer to be destoryed
  * \return	none
  */
-void FreePicture( CMemoryAlign *pMa, SPicture **ppPic );
+void FreePicture (CMemoryAlign* pMa, SPicture** ppPic);
 
 /*!
 * \brief	exchange two picture pData planes
@@ -67,6 +67,6 @@
 * \param	ppPic2		picture pointer to picture 2
 * \return	none
 */
-void WelsExchangeSpatialPictures( SPicture **ppPic1, SPicture **ppPic2 );
+void WelsExchangeSpatialPictures (SPicture** ppPic1, SPicture** ppPic2);
 }
 #endif//WELS_ENCODER_PICTURE_HANDLE_H__
--- a/codec/encoder/core/inc/property.h
+++ b/codec/encoder/core/inc/property.h
@@ -51,7 +51,7 @@
  * \param	iSize	size of pBuffer overall
  * \return	actual size of pBuffer used; 0 returned in failure
  */
-int32_t GetCodeName(str_t *pBuf, int32_t iSize);
+int32_t GetCodeName (str_t* pBuf, int32_t iSize);
 
 /*!
  * \brief	get library/module name
@@ -59,7 +59,7 @@
  * \param	iSize	size of pBuffer overall
  * \return	actual size of pBuffer used; 0 returned in failure
  */
-int32_t GetLibName(str_t *pBuf, int32_t iSize);
+int32_t GetLibName (str_t* pBuf, int32_t iSize);
 
 /*!
  * \brief	get version number
@@ -67,7 +67,7 @@
  * \param	iSize	size of pBuffer overall
  * \return	actual size of pBuffer used; 0 returned in failure
  */
-int32_t GetVerNum(str_t *pBuf, int32_t iSize);
+int32_t GetVerNum (str_t* pBuf, int32_t iSize);
 
 /*!
  * \brief	get identify information
@@ -75,6 +75,6 @@
  * \param	iSize	size of pBuffer overall
  * \return	actual size of pBuffer used; 0 returned in failure
  */
-int32_t GetIdentInfo(str_t *pBuf, int32_t iSize);
+int32_t GetIdentInfo (str_t* pBuf, int32_t iSize);
 }
 #endif//WELS_DECODER_PROPERTY_H__
--- a/codec/encoder/core/inc/rc.h
+++ b/codec/encoder/core/inc/rc.h
@@ -57,53 +57,52 @@
 #define    WELS_RC_DISABLE        0
 #define    WELS_RC_GOM            1
 
-typedef enum
-{
-	RC_MODE0,	//Quality mode
-	RC_MODE1,   //Bitrate mode
-}RC_MODES;
+typedef enum {
+  RC_MODE0,	//Quality mode
+  RC_MODE1,   //Bitrate mode
+} RC_MODES;
 
 enum {
-	//virtual gop size
-	VGOP_SIZE             = 8,
+  //virtual gop size
+  VGOP_SIZE             = 8,
 
-	//qp information
-	GOM_MIN_QP_MODE       = 12,
-	GOM_MAX_QP_MODE       = 36,
-    MIN_IDR_QP            = 26,
-    MAX_IDR_QP            = 32,
-    DELTA_QP              = 2,
-    DELTA_QP_BGD_THD      = 3,
+  //qp information
+  GOM_MIN_QP_MODE       = 12,
+  GOM_MAX_QP_MODE       = 36,
+  MIN_IDR_QP            = 26,
+  MAX_IDR_QP            = 32,
+  DELTA_QP              = 2,
+  DELTA_QP_BGD_THD      = 3,
 
-	//frame skip constants
-    SKIP_QP_90P           = 24,
-    SKIP_QP_180P          = 24,
-    SKIP_QP_360P          = 31,
-    SKIP_QP_720P          = 31,
-    LAST_FRAME_QP_RANGE_UPPER_MODE0  = 3,
-	LAST_FRAME_QP_RANGE_LOWER_MODE0  = 2,
-    LAST_FRAME_QP_RANGE_UPPER_MODE1  = 5,
-	LAST_FRAME_QP_RANGE_LOWER_MODE1  = 3,
+  //frame skip constants
+  SKIP_QP_90P           = 24,
+  SKIP_QP_180P          = 24,
+  SKIP_QP_360P          = 31,
+  SKIP_QP_720P          = 31,
+  LAST_FRAME_QP_RANGE_UPPER_MODE0  = 3,
+  LAST_FRAME_QP_RANGE_LOWER_MODE0  = 2,
+  LAST_FRAME_QP_RANGE_UPPER_MODE1  = 5,
+  LAST_FRAME_QP_RANGE_LOWER_MODE1  = 3,
 
-	MB_WIDTH_THRESHOLD_90P   = 15,
-	MB_WIDTH_THRESHOLD_180P  = 30,
-	MB_WIDTH_THRESHOLD_360P  = 60,
+  MB_WIDTH_THRESHOLD_90P   = 15,
+  MB_WIDTH_THRESHOLD_180P  = 30,
+  MB_WIDTH_THRESHOLD_360P  = 60,
 
-	//Mode 0 parameter
-	GOM_ROW_MODE0_90P     = 2,
-	GOM_ROW_MODE0_180P    = 2,
-	GOM_ROW_MODE0_360P    = 4,
-	GOM_ROW_MODE0_720P    = 4,
-    QP_RANGE_MODE0        = 3,
+  //Mode 0 parameter
+  GOM_ROW_MODE0_90P     = 2,
+  GOM_ROW_MODE0_180P    = 2,
+  GOM_ROW_MODE0_360P    = 4,
+  GOM_ROW_MODE0_720P    = 4,
+  QP_RANGE_MODE0        = 3,
 
-	//Mode 1 parameter
-	GOM_ROW_MODE1_90P     = 1,
-	GOM_ROW_MODE1_180P    = 1,
-	GOM_ROW_MODE1_360P    = 2,
-	GOM_ROW_MODE1_720P    = 2,
-    QP_RANGE_UPPER_MODE1  = 9,
-	QP_RANGE_LOWER_MODE1  = 4,
-    QP_RANGE_INTRA_MODE1  = 3,
+  //Mode 1 parameter
+  GOM_ROW_MODE1_90P     = 1,
+  GOM_ROW_MODE1_180P    = 1,
+  GOM_ROW_MODE1_360P    = 2,
+  GOM_ROW_MODE1_720P    = 2,
+  QP_RANGE_UPPER_MODE1  = 9,
+  QP_RANGE_LOWER_MODE1  = 4,
+  QP_RANGE_INTRA_MODE1  = 3,
 };
 
 //bits allocation
@@ -121,116 +120,113 @@
 #define PADDING_BUFFER_RATIO 0.5
 #define PADDING_THRESHOLD    0.05
 
-typedef struct TagRCSlicing
-{
-	int32_t   iComplexityIndexSlice;
-	int32_t   iCalculatedQpSlice;
-	int32_t   iStartMbSlice;
-	int32_t   iEndMbSlice;
-	int32_t   iTotalQpSlice;
-	int32_t   iTotalMbSlice;
-	int32_t   iTargetBitsSlice;
-	int32_t   iBsPosSlice;
-	int32_t   iFrameBitsSlice;
-	int32_t   iGomBitsSlice;
-	int32_t   iGomTargetBits;
-	//int32_t   gom_coded_mb;
+typedef struct TagRCSlicing {
+  int32_t   iComplexityIndexSlice;
+  int32_t   iCalculatedQpSlice;
+  int32_t   iStartMbSlice;
+  int32_t   iEndMbSlice;
+  int32_t   iTotalQpSlice;
+  int32_t   iTotalMbSlice;
+  int32_t   iTargetBitsSlice;
+  int32_t   iBsPosSlice;
+  int32_t   iFrameBitsSlice;
+  int32_t   iGomBitsSlice;
+  int32_t   iGomTargetBits;
+  //int32_t   gom_coded_mb;
 } SRCSlicing;
 
-typedef struct TagRCTemporal
-{
-	int32_t   iMinBitsTl;
-	int32_t   iMaxBitsTl;
-	double    dTlayerWeight;
-	int32_t   iGopBitsDq;
-	//P frame level R-Q Model 
-	double    dLinearCmplx;
-	int32_t   iPFrameNum;
-	int32_t   iFrameCmplxMean;
+typedef struct TagRCTemporal {
+  int32_t   iMinBitsTl;
+  int32_t   iMaxBitsTl;
+  double    dTlayerWeight;
+  int32_t   iGopBitsDq;
+  //P frame level R-Q Model
+  double    dLinearCmplx;
+  int32_t   iPFrameNum;
+  int32_t   iFrameCmplxMean;
 
 } SRCTemporal;
 
-typedef struct TagWelsRc{
-	int32_t   iRcVaryPercentage;
-	double    dRcVaryRatio;
+typedef struct TagWelsRc {
+  int32_t   iRcVaryPercentage;
+  double    dRcVaryRatio;
 
-	int32_t   iInitialQp; //initial qp
-	int32_t   iBitRate;
-	int32_t   iPreviousBitrate;
-	int32_t   iPreviousGopSize;
-	double    fFrameRate;
-	double    dBitsPerFrame;
-	double    dPreviousFps;
+  int32_t   iInitialQp; //initial qp
+  int32_t   iBitRate;
+  int32_t   iPreviousBitrate;
+  int32_t   iPreviousGopSize;
+  double    fFrameRate;
+  double    dBitsPerFrame;
+  double    dPreviousFps;
 
-	// bits allocation and status
-	int32_t   iRemainingBits;
-	int32_t   iTargetBits;
+  // bits allocation and status
+  int32_t   iRemainingBits;
+  int32_t   iTargetBits;
 
-	int32_t   iIdrNum;
-	int32_t   iIntraComplexity;
-	int32_t   iIntraMbCount;
+  int32_t   iIdrNum;
+  int32_t   iIntraComplexity;
+  int32_t   iIntraMbCount;
 
-	int8_t    iTlOfFrames[VGOP_SIZE];
-	double    dRemainingWeights;
-	int32_t   iFrameDqBits;
+  int8_t    iTlOfFrames[VGOP_SIZE];
+  double    dRemainingWeights;
+  int32_t   iFrameDqBits;
 
-	double    *pGomComplexity;
-	int32_t	  *pGomForegroundBlockNum;
-	int32_t   *pCurrentFrameGomSad;
-	int32_t   *pGomCost;
+  double*    pGomComplexity;
+  int32_t*  	pGomForegroundBlockNum;
+  int32_t*   pCurrentFrameGomSad;
+  int32_t*   pGomCost;
 
-	int32_t   iAverageFrameQp;
-	int32_t   iNumberMbFrame;
-	int32_t   iNumberMbGom;
-	int32_t	  iSliceNum;
-	int32_t   iGomSize;
+  int32_t   iAverageFrameQp;
+  int32_t   iNumberMbFrame;
+  int32_t   iNumberMbGom;
+  int32_t	  iSliceNum;
+  int32_t   iGomSize;
 
-	int32_t   iSkipFrameNum;
-	int32_t   iFrameCodedInVGop;
-	int32_t   iSkipFrameInVGop;
-	int32_t   iGopNumberInVGop;
-	int32_t   iGopIndexInVGop;
+  int32_t   iSkipFrameNum;
+  int32_t   iFrameCodedInVGop;
+  int32_t   iSkipFrameInVGop;
+  int32_t   iGopNumberInVGop;
+  int32_t   iGopIndexInVGop;
 
-	int32_t   iSkipQpValue;
-	int32_t   iQpRangeUpperInFrame;
-	int32_t   iQpRangeLowerInFrame;
-	int32_t   iMinQp;
-	int32_t   iMaxQp;
-	//int32_t   delta_adaptive_qp;
-	double    dSkipBufferRatio;
+  int32_t   iSkipQpValue;
+  int32_t   iQpRangeUpperInFrame;
+  int32_t   iQpRangeLowerInFrame;
+  int32_t   iMinQp;
+  int32_t   iMaxQp;
+  //int32_t   delta_adaptive_qp;
+  double    dSkipBufferRatio;
 
-	double    dQStep;
-	int32_t   iFrameDeltaQpUpper;
-	int32_t   iFrameDeltaQpLower;
-	int32_t   iLastCalculatedQScale;
+  double    dQStep;
+  int32_t   iFrameDeltaQpUpper;
+  int32_t   iFrameDeltaQpLower;
+  int32_t   iLastCalculatedQScale;
 
-	//for skip frame and padding
-	int32_t   iBufferSizeSkip;
-	int32_t   iBufferFullnessSkip;
-	int32_t   iBufferSizePadding;
-	int32_t   iBufferFullnessPadding;
-	int32_t   iPaddingSize;
-	int32_t   iPaddingBitrateStat;
+  //for skip frame and padding
+  int32_t   iBufferSizeSkip;
+  int32_t   iBufferFullnessSkip;
+  int32_t   iBufferSizePadding;
+  int32_t   iBufferFullnessPadding;
+  int32_t   iPaddingSize;
+  int32_t   iPaddingBitrateStat;
 
-	SRCSlicing	*pSlicingOverRc;
-	SRCTemporal *pTemporalOverRc;
-}SWelsSvcRc; 
+  SRCSlicing*	pSlicingOverRc;
+  SRCTemporal* pTemporalOverRc;
+} SWelsSvcRc;
 
-typedef  void (*PWelsRCPictureInitFunc) (void *pCtx);
-typedef  void (*PWelsRCPictureInfoUpdateFunc) (void *pCtx, int32_t iLayerSize);
-typedef  void (*PWelsRCMBInfoUpdateFunc)(void *pCtx, SMB * pCurMb, int32_t iCostLuma, SSlice *pSlice);
-typedef  void (*PWelsRCMBInitFunc)(void *pCtx, SMB * pCurMb, SSlice *pSlice);
+typedef  void (*PWelsRCPictureInitFunc) (void* pCtx);
+typedef  void (*PWelsRCPictureInfoUpdateFunc) (void* pCtx, int32_t iLayerSize);
+typedef  void (*PWelsRCMBInfoUpdateFunc) (void* pCtx, SMB* pCurMb, int32_t iCostLuma, SSlice* pSlice);
+typedef  void (*PWelsRCMBInitFunc) (void* pCtx, SMB* pCurMb, SSlice* pSlice);
 
-typedef  struct  WelsRcFunc_s
-{
-    PWelsRCPictureInitFunc			pfWelsRcPictureInit;
-	PWelsRCPictureInfoUpdateFunc	pfWelsRcPictureInfoUpdate;
-	PWelsRCMBInitFunc				pfWelsRcMbInit;
-	PWelsRCMBInfoUpdateFunc			pfWelsRcMbInfoUpdate;
+typedef  struct  WelsRcFunc_s {
+  PWelsRCPictureInitFunc			pfWelsRcPictureInit;
+  PWelsRCPictureInfoUpdateFunc	pfWelsRcPictureInfoUpdate;
+  PWelsRCMBInitFunc				pfWelsRcMbInit;
+  PWelsRCMBInfoUpdateFunc			pfWelsRcMbInfoUpdate;
 } SWelsRcFunc;
 
-void WelsRcInitModule(void *pCtx,  int32_t iModule);
-void WelsRcFreeMemory(void *pCtx);
+void WelsRcInitModule (void* pCtx,  int32_t iModule);
+void WelsRcFreeMemory (void* pCtx);
 
 }
 #endif //_RC_H
--- a/codec/encoder/core/inc/ref_list_mgr_svc.h
+++ b/codec/encoder/core/inc/ref_list_mgr_svc.h
@@ -47,62 +47,59 @@
 #include "codec_app_def.h"
 
 namespace WelsSVCEnc {
-typedef enum
-{
-	RECIEVE_UNKOWN = 0,
-	RECIEVE_SUCCESS = 1,
-	RECIEVE_FAILED = 2,
-}LTR_MARKING_RECEIVE_STATE;
+typedef enum {
+RECIEVE_UNKOWN = 0,
+RECIEVE_SUCCESS = 1,
+RECIEVE_FAILED = 2,
+} LTR_MARKING_RECEIVE_STATE;
 
-typedef enum
-{
-	LTR_DIRECT_MARK = 0,
-	LTR_DELAY_MARK = 1,
-}LTR_MARKING_PROCESS_MODE;
+typedef enum {
+LTR_DIRECT_MARK = 0,
+LTR_DELAY_MARK = 1,
+} LTR_MARKING_PROCESS_MODE;
 
-typedef enum
-{
-	FRAME_NUM_EQUAL    = 0x01,
-	FRAME_NUM_BIGGER   = 0x02,
-	FRAME_NUM_SMALLER  = 0x04,
-	FRAME_NUM_OVER_MAX = 0x08,
-}COMPARE_FRAME_NUM;
+typedef enum {
+FRAME_NUM_EQUAL    = 0x01,
+FRAME_NUM_BIGGER   = 0x02,
+FRAME_NUM_SMALLER  = 0x04,
+FRAME_NUM_OVER_MAX = 0x08,
+} COMPARE_FRAME_NUM;
 
 /*
 *	reset LTR marking , recovery ,feedback state to default
 */
-void ResetLtrState(SLTRState* pLtr );
+void ResetLtrState (SLTRState* pLtr);
 /*
  *	reset reference picture list
  */
-void WelsResetRefList( sWelsEncCtx *pCtx );
+void WelsResetRefList (sWelsEncCtx* pCtx);
 
 /*
  *	update reference picture list
  */
-BOOL_T WelsUpdateRefList( sWelsEncCtx *pCtx );	
+BOOL_T WelsUpdateRefList (sWelsEncCtx* pCtx);
 /*
  *	build reference picture list
  */
-BOOL_T WelsBuildRefList( sWelsEncCtx *pCtx, const int32_t kiPOC );
+BOOL_T WelsBuildRefList (sWelsEncCtx* pCtx, const int32_t kiPOC);
 
 /*
  *	update syntax for reference base related
  */
-void WelsUpdateRefSyntax( sWelsEncCtx *pCtx, const int32_t kiPOC, const int32_t kiFrameType );
+void WelsUpdateRefSyntax (sWelsEncCtx* pCtx, const int32_t kiPOC, const int32_t kiFrameType);
 
 
 /*
 * check current mark iFrameNum used in LTR list or not
 */
-bool_t CheckCurMarkFrameNumUsed(sWelsEncCtx *pCtx);
+bool_t CheckCurMarkFrameNumUsed (sWelsEncCtx* pCtx);
 /*
 *	decide whether current frame include long term reference mark and update long term reference mark syntax
 */
-void WelsMarkPic( sWelsEncCtx *pCtx);
+void WelsMarkPic (sWelsEncCtx* pCtx);
 
 #ifdef LONG_TERM_REF_DUMP
-void dump_ref(sWelsEncCtx* ctx);
+void dump_ref (sWelsEncCtx* ctx);
 #endif
 }
 #endif//REFERENCE_PICTURE_LIST_MANAGEMENT_SVC_H__
--- a/codec/encoder/core/inc/sample.h
+++ b/codec/encoder/core/inc/sample.h
@@ -1,123 +1,125 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifndef _SAMPLE_H_
-#define _SAMPLE_H_
-
-#include "typedefs.h"
-#include "wels_func_ptr_def.h"
-
-namespace WelsSVCEnc {
-enum
-{
-    BLOCK_16x16 = 0,
-    BLOCK_16x8  = 1,
-    BLOCK_8x16  = 2,
-    BLOCK_8x8   = 3,
-    BLOCK_4x4   = 4,
-//    BLOCK_8x4   = 5,
-//    BLOCK_4x8   = 6,
-};
-
-//===================SAD=====================//
-int32_t WelsSampleSad16x16_c( uint8_t *, int32_t, uint8_t *, int32_t );
-int32_t WelsSampleSad16x8_c( uint8_t *, int32_t, uint8_t *, int32_t );
-int32_t WelsSampleSad8x16_c( uint8_t *, int32_t, uint8_t *, int32_t );
-int32_t WelsSampleSad8x8_c( uint8_t *, int32_t, uint8_t *, int32_t );
-//int32_t WelsSampleSad8x4( uint8_t *, int32_t, uint8_t *, int32_t );
-//int32_t WelsSampleSad4x8( uint8_t *, int32_t, uint8_t *, int32_t );
-int32_t WelsSampleSad4x4_c( uint8_t *, int32_t, uint8_t *, int32_t );
-
-//======================SATD======================//
-int32_t WelsSampleSatd16x16_c( uint8_t *, int32_t, uint8_t *, int32_t );
-int32_t WelsSampleSatd16x8_c( uint8_t *, int32_t, uint8_t *, int32_t );
-int32_t WelsSampleSatd8x16_c( uint8_t *, int32_t, uint8_t *, int32_t );
-int32_t WelsSampleSatd8x8_c( uint8_t *, int32_t, uint8_t *, int32_t );
-//int32_t WelsSampleSatd8x4( uint8_t *, int32_t, uint8_t *, int32_t );
-//int32_t WelsSampleSatd4x8( uint8_t *, int32_t, uint8_t *, int32_t );
-int32_t WelsSampleSatd4x4_c( uint8_t *, int32_t, uint8_t *, int32_t );
-
-void WelsSampleSadFour16x16_c( uint8_t *iSample1, int32_t iStride1, uint8_t *iSample2, int32_t iStride2, int32_t* pSad); 
-void WelsSampleSadFour16x8_c( uint8_t *iSample1, int32_t iStride1, uint8_t *iSample2, int32_t iStride2, int32_t* pSad);
-void WelsSampleSadFour8x16_c( uint8_t *iSample1, int32_t iStride1, uint8_t *iSample2, int32_t iStride2, int32_t* pSad);
-void WelsSampleSadFour8x8_c( uint8_t *iSample1, int32_t iStride1, uint8_t *iSample2, int32_t iStride2, int32_t* pSad);
-void WelsSampleSadFour4x4_c( uint8_t *iSample1, int32_t iStride1, uint8_t *iSample2, int32_t iStride2, int32_t* pSad);
-
-#if defined(__cplusplus)
-extern "C" {
-#endif//__cplusplus
-
-#if defined (X86_ASM)
-
-int32_t WelsSampleSad4x4_mmx( uint8_t *, int32_t, uint8_t *, int32_t );
-int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
-int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
-int32_t WelsSampleSad8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t);
-int32_t WelsSampleSad8x8_sse21( uint8_t *, int32_t, uint8_t * , int32_t);
-
-void WelsSampleSadFour16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, int32_t* );
-void WelsSampleSadFour16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, int32_t* );
-void WelsSampleSadFour8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, int32_t* );
-void WelsSampleSadFour8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, int32_t* );
-void WelsSampleSadFour4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t, int32_t* );
-
-int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
-int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
-int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
-int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
-int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
-int32_t WelsSmpleSatdThree4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t, uint8_t*, int32_t*, int32_t, int32_t, int32_t );
-
-int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t);
-int32_t WelsSampleSatd8x16_sse41( uint8_t * , int32_t, uint8_t *, int32_t);
-int32_t WelsSampleSatd16x8_sse41( uint8_t * , int32_t, uint8_t *, int32_t);
-int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t);
-int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
-
-int32_t WelsIntra16x16Combined3Satd_sse41(uint8_t *, int32_t, uint8_t *, int32_t, int32_t*, int32_t, uint8_t*);
-int32_t WelsIntra16x16Combined3Sad_ssse3(uint8_t *, int32_t, uint8_t *, int32_t, int32_t*, int32_t, uint8_t*);
-int32_t WelsIntraChroma8x8Combined3Satd_sse41( uint8_t *, int32_t, uint8_t *, int32_t, int32_t*, int32_t, uint8_t*,uint8_t*,uint8_t*);
-int32_t WelsIntraChroma8x8Combined3Sad_ssse3( uint8_t *, int32_t, uint8_t *, int32_t, int32_t*, int32_t, uint8_t*,uint8_t*,uint8_t*);
-
-
-#endif//X86_ASM
-
-
-#if defined(__cplusplus)
-}
-#endif//__cplusplus
-
-void WelsInitSampleSadFunc( SWelsFuncPtrList *pFuncList, uint32_t uiCpuFlag );
-
-}
-
-#endif //_SAMPLE_H_
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef _SAMPLE_H_
+#define _SAMPLE_H_
+
+#include "typedefs.h"
+#include "wels_func_ptr_def.h"
+
+namespace WelsSVCEnc {
+enum {
+  BLOCK_16x16 = 0,
+  BLOCK_16x8  = 1,
+  BLOCK_8x16  = 2,
+  BLOCK_8x8   = 3,
+  BLOCK_4x4   = 4,
+//    BLOCK_8x4   = 5,
+//    BLOCK_4x8   = 6,
+};
+
+//===================SAD=====================//
+int32_t WelsSampleSad16x16_c (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSad16x8_c (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSad8x16_c (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSad8x8_c (uint8_t*, int32_t, uint8_t*, int32_t);
+//int32_t WelsSampleSad8x4( uint8_t *, int32_t, uint8_t *, int32_t );
+//int32_t WelsSampleSad4x8( uint8_t *, int32_t, uint8_t *, int32_t );
+int32_t WelsSampleSad4x4_c (uint8_t*, int32_t, uint8_t*, int32_t);
+
+//======================SATD======================//
+int32_t WelsSampleSatd16x16_c (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd16x8_c (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd8x16_c (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd8x8_c (uint8_t*, int32_t, uint8_t*, int32_t);
+//int32_t WelsSampleSatd8x4( uint8_t *, int32_t, uint8_t *, int32_t );
+//int32_t WelsSampleSatd4x8( uint8_t *, int32_t, uint8_t *, int32_t );
+int32_t WelsSampleSatd4x4_c (uint8_t*, int32_t, uint8_t*, int32_t);
+
+void WelsSampleSadFour16x16_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad);
+void WelsSampleSadFour16x8_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad);
+void WelsSampleSadFour8x16_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad);
+void WelsSampleSadFour8x8_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad);
+void WelsSampleSadFour4x4_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad);
+
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+
+#if defined (X86_ASM)
+
+int32_t WelsSampleSad4x4_mmx (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSad16x16_sse2 (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSad16x8_sse2 (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSad8x16_sse2 (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSad8x8_sse21 (uint8_t*, int32_t, uint8_t*, int32_t);
+
+void WelsSampleSadFour16x16_sse2 (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
+void WelsSampleSadFour16x8_sse2 (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
+void WelsSampleSadFour8x16_sse2 (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
+void WelsSampleSadFour8x8_sse2 (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
+void WelsSampleSadFour4x4_sse2 (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
+
+int32_t WelsSampleSatd8x8_sse2 (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd16x8_sse2 (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd8x16_sse2 (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd16x16_sse2 (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd4x4_sse2 (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSmpleSatdThree4x4_sse2 (uint8_t*, int32_t, uint8_t*, int32_t, uint8_t*, int32_t*, int32_t, int32_t,
+                                    int32_t);
+
+int32_t WelsSampleSatd8x8_sse41 (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd8x16_sse41 (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd16x8_sse41 (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd16x16_sse41 (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd4x4_sse41 (uint8_t*, int32_t, uint8_t*, int32_t);
+
+int32_t WelsIntra16x16Combined3Satd_sse41 (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*);
+int32_t WelsIntra16x16Combined3Sad_ssse3 (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*);
+int32_t WelsIntraChroma8x8Combined3Satd_sse41 (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*,
+    uint8_t*, uint8_t*);
+int32_t WelsIntraChroma8x8Combined3Sad_ssse3 (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*,
+    uint8_t*, uint8_t*);
+
+
+#endif//X86_ASM
+
+
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+
+void WelsInitSampleSadFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag);
+
+}
+
+#endif //_SAMPLE_H_
--- a/codec/encoder/core/inc/set_mb_syn_cavlc.h
+++ b/codec/encoder/core/inc/set_mb_syn_cavlc.h
@@ -37,7 +37,7 @@
  *
  *************************************************************************************
  */
- 
+
 #ifndef SET_MB_SYN_CAVLC_H_
 #define SET_MB_SYN_CAVLC_H_
 
@@ -49,39 +49,39 @@
 
 
 
-enum EResidualProperty{
-    LUMA_DC     = 0,
-	LUMA_AC     = 1,
-	LUMA_4x4    = 2,
-	CHROMA_DC   = 3, 
-	CHROMA_AC   = 4    
+enum EResidualProperty {
+LUMA_DC     = 0,
+LUMA_AC     = 1,
+LUMA_4x4    = 2,
+CHROMA_DC   = 3,
+CHROMA_AC   = 4
 };
 
 
 #define LUMA_DC_AC    0x04
 
-typedef  int32_t  (*PCavlcParamCalFunc) ( int16_t * pCoff, uint8_t * pRun, int16_t * pLevel, int32_t * pTotalCoeffs, int32_t iEndIdx);
+typedef  int32_t (*PCavlcParamCalFunc) (int16_t* pCoff, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs,
+                                        int32_t iEndIdx);
 
-typedef  struct TagCoeffFunc 
-{
-	PCavlcParamCalFunc    pfCavlcParamCal;
+typedef  struct TagCoeffFunc {
+PCavlcParamCalFunc    pfCavlcParamCal;
 } SCoeffFunc;
 
 /*  For CAVLC   */
 extern SCoeffFunc    sCoeffFunc;
 
-typedef struct TagCavlcTableItem
-{
-	uint16_t uiBits;
-	uint8_t  uiLen;
-	uint8_t  uiSuffixLength;
+typedef struct TagCavlcTableItem {
+uint16_t uiBits;
+uint8_t  uiLen;
+uint8_t  uiSuffixLength;
 } SCavlcTableItem;
 
-void  InitCoeffFunc( const uint32_t uiCpuFlag );
+void  InitCoeffFunc (const uint32_t uiCpuFlag);
 
 void  InitCavlcTable();
 
-void  WriteBlockResidualCavlc( int16_t *pCoffLevel, int32_t iEndIdx, int32_t iCalRunLevelFlag, int32_t iResidualProperty, int8_t iNC, SBitStringAux *pBs );
+void  WriteBlockResidualCavlc (int16_t* pCoffLevel, int32_t iEndIdx, int32_t iCalRunLevelFlag,
+                               int32_t iResidualProperty, int8_t iNC, SBitStringAux* pBs);
 
 #if defined(__cplusplus)
 extern "C" {
@@ -88,7 +88,8 @@
 #endif//__cplusplus
 
 #ifdef  X86_ASM
-int32_t CavlcParamCal_sse2(int16_t*pCoffLevel, uint8_t* pRun, int16_t *pLevel, int32_t * pTotalCoeffs , int32_t iEndIdx); 
+int32_t CavlcParamCal_sse2 (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
+                            int32_t iEndIdx);
 #endif
 
 #if defined(__cplusplus)
--- a/codec/encoder/core/inc/slice.h
+++ b/codec/encoder/core/inc/slice.h
@@ -53,132 +53,130 @@
  *	Reference picture list reordering syntax, refer to page 64 in JVT X201wcm
  */
 typedef struct TagRefPicListReorderSyntax {
-	struct
-	{
-		uint32_t	uiAbsDiffPicNumMinus1; //uiAbsDiffPicNumMinus1 SHOULD be in the range of [4, (1<<pSps->uiLog2MaxFrameNum)-1], {p104, JVT-X201wcm1}
-		                                     //but int8_t can't cover the range, SHOULD modify it.
-		uint16_t	iLongTermPicNum;
-		uint16_t	uiReorderingOfPicNumsIdc; //in order to pack 2-uint16_t into 1-(u)int32_t, so modify the type into uint16_t.
-	} SReorderingSyntax[MAX_REFERENCE_REORDER_COUNT_NUM];	// MAX_REF_PIC_COUNT
-}SRefPicListReorderSyntax;
+  struct {
+    uint32_t	uiAbsDiffPicNumMinus1; //uiAbsDiffPicNumMinus1 SHOULD be in the range of [4, (1<<pSps->uiLog2MaxFrameNum)-1], {p104, JVT-X201wcm1}
+    // but int8_t can't cover that range, so the type SHOULD be changed.
+    uint16_t	iLongTermPicNum;
+    uint16_t	uiReorderingOfPicNumsIdc; //in order to pack 2-uint16_t into 1-(u)int32_t, so modify the type into uint16_t.
+  } SReorderingSyntax[MAX_REFERENCE_REORDER_COUNT_NUM];	// MAX_REF_PIC_COUNT
+} SRefPicListReorderSyntax;
 
-		
+
 /* Decoded reference picture marking syntax, refer to Page 66 in JVT X201wcm */
 typedef struct TagRefPicMarking {
-	struct
-	{
-		int32_t	iMmcoType;
-		int32_t iShortFrameNum;
-		int32_t	iDiffOfPicNum;
-		int32_t	iLongTermPicNum;
-		int32_t	iLongTermFrameIdx;
-		int32_t	iMaxLongTermFrameIdx;
-	} SMmcoRef[MAX_REFERENCE_MMCO_COUNT_NUM];	// MAX_MMCO_COUNT
-	
-	//	int32_t		mmco_index;
-	uint8_t		uiMmcoCount;
-	bool_t		bNoOutputOfPriorPicsFlag;
-	bool_t		bLongTermRefFlag;
-	bool_t		bAdaptiveRefPicMarkingModeFlag;	
+  struct {
+    int32_t	iMmcoType;
+    int32_t iShortFrameNum;
+    int32_t	iDiffOfPicNum;
+    int32_t	iLongTermPicNum;
+    int32_t	iLongTermFrameIdx;
+    int32_t	iMaxLongTermFrameIdx;
+  } SMmcoRef[MAX_REFERENCE_MMCO_COUNT_NUM];	// MAX_MMCO_COUNT
+
+  //	int32_t		mmco_index;
+  uint8_t		uiMmcoCount;
+  bool_t		bNoOutputOfPriorPicsFlag;
+  bool_t		bLongTermRefFlag;
+  bool_t		bAdaptiveRefPicMarkingModeFlag;
 } SRefPicMarking;
 
 
 /* Header of slice syntax elements, refer to Page 63 in JVT X201wcm */
-typedef struct TagSliceHeader{	
-	/*****************************slice header syntax and generated****************************/
-	int32_t		iFirstMbInSlice;		
+typedef struct TagSliceHeader {
+  /*****************************slice header syntax and generated****************************/
+  int32_t		iFirstMbInSlice;
 //	uint32_t	pic_parameter_set_id;
-	int32_t		iFrameNum;	
-	int32_t		iPicOrderCntLsb;
-    
+  int32_t		iFrameNum;
+  int32_t		iPicOrderCntLsb;
+
 //	int32_t		delta_pic_order_cnt_bottom;
 //	int32_t		delta_pic_order_cnt[2];
 //	int32_t		redundant_pic_cnt;
-		
-	EWelsSliceType	eSliceType;
-	uint8_t		uiNumRefIdxL0Active;			//
-	//int32_t		num_ref_idx_l1_active_minus1	//B frame is not supported
-	uint8_t		uiRefCount;
-	//Ref_Pic				*ref_pic;
-	uint8_t		uiRefIndex;	// exact reference picture index for slice	
-	
-	int8_t		iSliceQpDelta;
-//	int32_t		slice_qp;	
+
+  EWelsSliceType	eSliceType;
+  uint8_t		uiNumRefIdxL0Active;			//
+  //int32_t		num_ref_idx_l1_active_minus1	//B frame is not supported
+  uint8_t		uiRefCount;
+  //Ref_Pic				*ref_pic;
+  uint8_t		uiRefIndex;	// exact reference picture index for slice
+
+  int8_t		iSliceQpDelta;
+//	int32_t		slice_qp;
 //	int32_t		slice_qs_delta;		// For SP/SI slices
-	uint8_t		uiDisableDeblockingFilterIdc;
-	int8_t		iSliceAlphaC0Offset;
-	int8_t		iSliceBetaOffset;
+  uint8_t		uiDisableDeblockingFilterIdc;
+  int8_t		iSliceAlphaC0Offset;
+  int8_t		iSliceBetaOffset;
 #if !defined(DISABLE_FMO_FEATURE)
-	int32_t		iSliceGroupChangeCycle;
+  int32_t		iSliceGroupChangeCycle;
 #endif//!DISABLE_FMO_FEATURE
 
-	SWelsSPS			*pSps;
-	SWelsPPS			*pPps;
-	int32_t		iSpsId;
-	int32_t		iPpsId;
+  SWelsSPS*			pSps;
+  SWelsPPS*			pPps;
+  int32_t		iSpsId;
+  int32_t		iPpsId;
 
-	uint16_t    uiIdrPicId;	
+  uint16_t    uiIdrPicId;
 //	uint8_t		color_plane_id;//from?
 
-	bool_t		bNumRefIdxActiveOverrideFlag;
+  bool_t		bNumRefIdxActiveOverrideFlag;
 //	bool_t		field_pic_flag;		//not supported in base profile
 //	bool_t		bottom_field_flag;		//not supported in base profile
-	uint8_t		uiPadding1Bytes;
+  uint8_t		uiPadding1Bytes;
 
-	SRefPicMarking		sRefMarking;	// Decoded reference picture marking syntaxs
+  SRefPicMarking		sRefMarking;	// Decoded reference picture marking syntaxs
 
-	SRefPicListReorderSyntax	sRefReordering;	// Reference picture list reordering syntaxs
-}SSliceHeader, *PSliceHeader;
+  SRefPicListReorderSyntax	sRefReordering;	// Reference picture list reordering syntaxs
+} SSliceHeader, *PSliceHeader;
 
 
 /* SSlice header in scalable extension syntax, refer to Page 394 in JVT X201wcm */
-typedef struct TagSliceHeaderExt{	
-	SSliceHeader	sSliceHeader;
+typedef struct TagSliceHeaderExt {
+  SSliceHeader	sSliceHeader;
 
-	SSubsetSps	*pSubsetSps;
-	
-	uint32_t	uiNumMbsInSlice;	
-	
-	bool_t		bStoreRefBasePicFlag;	
-	bool_t		bConstrainedIntraResamplingFlag;	
-	bool_t		bSliceSkipFlag;
-	
-	bool_t		bAdaptiveBaseModeFlag;
-	bool_t		bDefaultBaseModeFlag;
-	bool_t		bAdaptiveMotionPredFlag;
-	bool_t		bDefaultMotionPredFlag;
+  SSubsetSps*	pSubsetSps;
 
-	bool_t		bAdaptiveResidualPredFlag;
-	bool_t		bDefaultResidualPredFlag;
-	bool_t		bTcoeffLevelPredFlag;		
-	uint8_t		uiDisableInterLayerDeblockingFilterIdc;
-	
-}SSliceHeaderExt, *PSliceHeaderExt;
+  uint32_t	uiNumMbsInSlice;
 
+  bool_t		bStoreRefBasePicFlag;
+  bool_t		bConstrainedIntraResamplingFlag;
+  bool_t		bSliceSkipFlag;
 
-typedef struct TagSlice{	
-	// mainly for multiple threads imp.
-	SMbCache	sMbCacheInfo;	// MBCache is introduced within slice dependency
-	SBitStringAux *pSliceBsa;
+  bool_t		bAdaptiveBaseModeFlag;
+  bool_t		bDefaultBaseModeFlag;
+  bool_t		bAdaptiveMotionPredFlag;
+  bool_t		bDefaultMotionPredFlag;
 
-	/*******************************sSliceHeader****************************/
-	SSliceHeaderExt	sSliceHeaderExt;	
+  bool_t		bAdaptiveResidualPredFlag;
+  bool_t		bDefaultResidualPredFlag;
+  bool_t		bTcoeffLevelPredFlag;
+  uint8_t		uiDisableInterLayerDeblockingFilterIdc;
 
+} SSliceHeaderExt, *PSliceHeaderExt;
 
-	SMVUnitXY	sMvMin;
-	SMVUnitXY	sMvMax;	
-	SMVUnitXY	sMvc[5];
-	uint8_t		uiMvcNum;
-	uint8_t		sScaleShift;
 
-	uint8_t		uiSliceIdx;
-	bool_t		bSliceHeaderExtFlag; // Indicate which slice header is used, avc or ext?	
-	uint8_t		uiLastMbQp;		// stored qp for last mb coded, maybe more efficient for mb skip detection etc.
+typedef struct TagSlice {
+  // mainly for the multi-threaded implementation
+  SMbCache	sMbCacheInfo;	// MBCache is introduced within slice dependency
+  SBitStringAux* pSliceBsa;
 
-	bool_t		bDynamicSlicingSliceSizeCtrlFlag;
-	uint8_t		uiAssumeLog2BytePerMb;
-	uint8_t		uiReservedFillByte;	// reserved to meet 4 bytes alignment
-}SSlice, *PSlice;
+  /*******************************sSliceHeader****************************/
+  SSliceHeaderExt	sSliceHeaderExt;
+
+
+  SMVUnitXY	sMvMin;
+  SMVUnitXY	sMvMax;
+  SMVUnitXY	sMvc[5];
+  uint8_t		uiMvcNum;
+  uint8_t		sScaleShift;
+
+  uint8_t		uiSliceIdx;
+  bool_t		bSliceHeaderExtFlag; // Indicate which slice header is used, avc or ext?
+  uint8_t		uiLastMbQp;		// stored qp for last mb coded, maybe more efficient for mb skip detection etc.
+
+  bool_t		bDynamicSlicingSliceSizeCtrlFlag;
+  uint8_t		uiAssumeLog2BytePerMb;
+  uint8_t		uiReservedFillByte;	// reserved to meet 4 bytes alignment
+} SSlice, *PSlice;
 
 }
 //#pragma pack()
--- a/codec/encoder/core/inc/slice_multi_threading.h
+++ b/codec/encoder/core/inc/slice_multi_threading.h
@@ -52,53 +52,56 @@
 #include "WelsThreadLib.h"
 
 namespace WelsSVCEnc {
-void UpdateMbListNeighborParallel(	SSliceCtx *pSliceCtx,
-										SMB *pMbList,
-										const int32_t kiSliceIdc	);
+void UpdateMbListNeighborParallel (SSliceCtx* pSliceCtx,
+                                   SMB* pMbList,
+                                   const int32_t kiSliceIdc);
 
-void CalcSliceComplexRatio( void *pRatio, SSliceCtx *pSliceCtx, uint32_t *pSliceConsume );
+void CalcSliceComplexRatio (void* pRatio, SSliceCtx* pSliceCtx, uint32_t* pSliceConsume);
 
 #if defined(MT_ENABLED) && defined(DYNAMIC_SLICE_ASSIGN) && defined(NOT_ABSOLUTE_BALANCING)
-int32_t NeedDynamicAdjust( void *pConsumeTime, const int32_t kiSliceNum );
+int32_t NeedDynamicAdjust (void* pConsumeTime, const int32_t kiSliceNum);
 #endif//..
 
 #if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
-void DynamicAdjustSlicing(	sWelsEncCtx *pCtx,
-								SDqLayer *pCurDqLayer,
-								void *pComplexRatio,
-								int32_t iCurDid );
+void DynamicAdjustSlicing (sWelsEncCtx* pCtx,
+                           SDqLayer* pCurDqLayer,
+                           void* pComplexRatio,
+                           int32_t iCurDid);
 #endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
 
 #ifdef PACKING_ONE_SLICE_PER_LAYER
-void reset_env_mt( sWelsEncCtx *pCtx );
+void reset_env_mt (sWelsEncCtx* pCtx);
 #endif//PACKING_ONE_SLICE_PER_LAYER
 
 
-int32_t RequestMtResource( sWelsEncCtx **ppCtx, SWelsSvcCodingParam *pParam, const int32_t kiCountBsLen, const int32_t kiTargetSpatialBsSize );
+int32_t RequestMtResource (sWelsEncCtx** ppCtx, SWelsSvcCodingParam* pParam, const int32_t kiCountBsLen,
+                           const int32_t kiTargetSpatialBsSize);
 
-void ReleaseMtResource( sWelsEncCtx **ppCtx );
+void ReleaseMtResource (sWelsEncCtx** ppCtx);
 
-int32_t AppendSliceToFrameBs( sWelsEncCtx *pCtx, SLayerBSInfo *pLbi, const int32_t kiSliceCount );
-int32_t WriteSliceToFrameBs( sWelsEncCtx *pCtx, SLayerBSInfo *pLbi, uint8_t *pFrameBsBuffer, const int32_t kiSliceIdx );
+int32_t AppendSliceToFrameBs (sWelsEncCtx* pCtx, SLayerBSInfo* pLbi, const int32_t kiSliceCount);
+int32_t WriteSliceToFrameBs (sWelsEncCtx* pCtx, SLayerBSInfo* pLbi, uint8_t* pFrameBsBuffer, const int32_t kiSliceIdx);
 
 #if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
 #if defined(__GNUC__)
-WELS_THREAD_ROUTINE_TYPE UpdateMbListThreadProc( void *arg );
+WELS_THREAD_ROUTINE_TYPE UpdateMbListThreadProc (void* arg);
 #endif//__GNUC__
 #endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
 
-WELS_THREAD_ROUTINE_TYPE CodingSliceThreadProc( void *arg );
+WELS_THREAD_ROUTINE_TYPE CodingSliceThreadProc (void* arg);
 
-int32_t CreateSliceThreads( sWelsEncCtx *pCtx );
+int32_t CreateSliceThreads (sWelsEncCtx* pCtx);
 
 #ifdef PACKING_ONE_SLICE_PER_LAYER
-void ResetCountBsSizeInPartitions( uint32_t *pCountBsSizeList, const int32_t kiPartitionCnt );
+void ResetCountBsSizeInPartitions (uint32_t* pCountBsSizeList, const int32_t kiPartitionCnt);
 #endif//PACKING_ONE_SLICE_PER_LAYER
 
 #ifdef WIN32
-int32_t FiredSliceThreads( SSliceThreadPrivateData *pPriData, WELS_EVENT *pEventsList, SLayerBSInfo *pLayerBsInfo, const uint32_t kuiNumThreads/*, int32_t *iLayerNum*/, SSliceCtx *pSliceCtx, const BOOL_T kbIsDynamicSlicingMode );
+int32_t FiredSliceThreads (SSliceThreadPrivateData* pPriData, WELS_EVENT* pEventsList, SLayerBSInfo* pLayerBsInfo,
+                           const uint32_t kuiNumThreads/*, int32_t *iLayerNum*/, SSliceCtx* pSliceCtx, const BOOL_T kbIsDynamicSlicingMode);
 #else
-int32_t FiredSliceThreads( SSliceThreadPrivateData *pPriData, WELS_EVENT **ppEventsList, SLayerBSInfo *pLayerBsInfo, const uint32_t kuiNumThreads/*, int32_t *iLayerNum*/, SSliceCtx *pSliceCtx, const BOOL_T kbIsDynamicSlicingMode );
+int32_t FiredSliceThreads (SSliceThreadPrivateData* pPriData, WELS_EVENT** ppEventsList, SLayerBSInfo* pLayerBsInfo,
+                           const uint32_t kuiNumThreads/*, int32_t *iLayerNum*/, SSliceCtx* pSliceCtx, const BOOL_T kbIsDynamicSlicingMode);
 #endif//WIN32
 
 int32_t DynamicDetectCpuCores();
@@ -105,8 +108,8 @@
 
 #if defined(MT_ENABLED) && defined(DYNAMIC_SLICE_ASSIGN)
 
-int32_t AdjustBaseLayer( sWelsEncCtx *pCtx );
-int32_t AdjustEnhanceLayer( sWelsEncCtx *pCtx, int32_t iCurDid );
+int32_t AdjustBaseLayer (sWelsEncCtx* pCtx);
+int32_t AdjustEnhanceLayer (sWelsEncCtx* pCtx, int32_t iCurDid);
 
 #endif//MT_ENABLED && DYNAMIC_SLICE_ASSIGN
 
@@ -113,10 +116,10 @@
 #if defined(MT_ENABLED)
 
 #if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE) && defined(MT_DEBUG)
-void TrackSliceComplexities( sWelsEncCtx *pCtx, const int32_t kiCurDid );
+void TrackSliceComplexities (sWelsEncCtx* pCtx, const int32_t kiCurDid);
 #endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
 #if defined(DYNAMIC_SLICE_ASSIGN) && defined(MT_DEBUG)
-void TrackSliceConsumeTime( sWelsEncCtx *pCtx, int32_t *pDidList, const int32_t kiSpatialNum );
+void TrackSliceConsumeTime (sWelsEncCtx* pCtx, int32_t* pDidList, const int32_t kiSpatialNum);
 #endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(MT_DEBUG)
 
 #endif//MT_ENABLED
--- a/codec/encoder/core/inc/stat.h
+++ b/codec/encoder/core/inc/stat.h
@@ -41,14 +41,14 @@
 #define WELS_ENCODER_STATISTICAL_DATA_H__
 
 /*
- *	Stat quality 
+ *	Stat quality
  */
 typedef struct TagStatQuality {
-	
-	real32_t	rYPsnr[5];
-	real32_t	rUPsnr[5];
-	real32_t	rVPsnr[5];
 
+  real32_t	rYPsnr[5];
+  real32_t	rUPsnr[5];
+  real32_t	rVPsnr[5];
+
 } SStatQuality;
 
 /*
@@ -57,13 +57,13 @@
 typedef struct TagComplexityStat {
 
 #ifdef FME_TEST
-	int32_t		cost_time;
-	int32_t		me_time;
-	int32_t		mvp_time;
-	int32_t		mvb_time;
+  int32_t		cost_time;
+  int32_t		me_time;
+  int32_t		mvp_time;
+  int32_t		mvb_time;
 #endif
 
-	// any else?
+  // anything else?
 
 } SComplexityStat;
 
@@ -71,12 +71,12 @@
  *	Stat slice details information
  */
 typedef struct TagStatSliceInfo {
-	
-	/* per slice info */
-	int32_t		iSliceCount[5];
-	int32_t		iSliceSize [5];
-	int32_t		iMbCount   [5][18];
 
+  /* per slice info */
+  int32_t		iSliceCount[5];
+  int32_t		iSliceSize [5];
+  int32_t		iMbCount   [5][18];
+
 } SStatSliceInfo;
 
 /*
@@ -84,14 +84,14 @@
  */
 typedef struct TagStatData {
 
-	// Quality
-	SStatQuality		sQualityStat;
-	
-	// Complexity
-	SComplexityStat		sComplexityStat;
+  // Quality
+  SStatQuality		sQualityStat;
 
-	// SSlice information output
-	SStatSliceInfo		sSliceData;	
+  // Complexity
+  SComplexityStat		sComplexityStat;
+
+  // SSlice information output
+  SStatSliceInfo		sSliceData;
 
 } SStatData;
 
--- a/codec/encoder/core/inc/svc_base_layer_md.h
+++ b/codec/encoder/core/inc/svc_base_layer_md.h
@@ -31,7 +31,7 @@
  *
  * \file	svc_base_layer_md.h
  *
- * \brief	mode decision 
+ * \brief	mode decision
  *
  * \date	2009.08.10 Created
  *
@@ -44,53 +44,60 @@
 #include "mb_cache.h"
 
 namespace WelsSVCEnc {
-void WelsMdIntraInit(sWelsEncCtx* pEncCtx, SMB* pCurMb, SMbCache *pMbCache, const int32_t kiSliceFirstMbXY );
-int32_t WelsMdI16x16(SWelsFuncPtrList *pFunc, SDqLayer *pCurDqLayer, SMbCache *pMbCache, int32_t iLambda);
-int32_t WelsMdIntraChroma(SWelsFuncPtrList *pFunc, SDqLayer *pCurDqLayer, SMbCache *pMbCache, int32_t iLambda);
+void WelsMdIntraInit (sWelsEncCtx* pEncCtx, SMB* pCurMb, SMbCache* pMbCache, const int32_t kiSliceFirstMbXY);
+int32_t WelsMdI16x16 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SMbCache* pMbCache, int32_t iLambda);
+int32_t WelsMdIntraChroma (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SMbCache* pMbCache, int32_t iLambda);
 
-int32_t WelsMdI4x4(void* pEnc,void* pMd, SMB* pCurMb, SMbCache *pMbCache);
-int32_t WelsMdI4x4Fast(void* pEnc, void* pMd, SMB* pCurMb, SMbCache *pMbCache);
+int32_t WelsMdI4x4 (void* pEnc, void* pMd, SMB* pCurMb, SMbCache* pMbCache);
+int32_t WelsMdI4x4Fast (void* pEnc, void* pMd, SMB* pCurMb, SMbCache* pMbCache);
 
-int32_t WelsMdIntraFinePartition(void* pEncCtx, void* pWelsMd, SMB* pCurMb, SMbCache *pMbCache);
-int32_t WelsMdIntraFinePartitionVaa(void* pEncCtx, void* pWelsMd, SMB* pCurMb, SMbCache *pMbCache);
+int32_t WelsMdIntraFinePartition (void* pEncCtx, void* pWelsMd, SMB* pCurMb, SMbCache* pMbCache);
+int32_t WelsMdIntraFinePartitionVaa (void* pEncCtx, void* pWelsMd, SMB* pCurMb, SMbCache* pMbCache);
 
-void WelsMdIntraMb(sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache *pMbCache);
+void WelsMdIntraMb (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache* pMbCache);
 
-void WelsMdBackgroundMbEnc(void* pEnc, void* pMd, SMB* pCurMb, SMbCache *pMbCache, SSlice *pSlice, bool_t bSkipMbFlag);
-BOOL_T WelsMdPSkipEnc(void* pEnc, void* pMd, SMB* pCurMb, SMbCache *pMbCache);
-int32_t WelsMdP16x16(SWelsFuncPtrList *pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice *pSlice, SMB* pCurMb);
+void WelsMdBackgroundMbEnc (void* pEnc, void* pMd, SMB* pCurMb, SMbCache* pMbCache, SSlice* pSlice, bool_t bSkipMbFlag);
+BOOL_T WelsMdPSkipEnc (void* pEnc, void* pMd, SMB* pCurMb, SMbCache* pMbCache);
+int32_t WelsMdP16x16 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice* pSlice, SMB* pCurMb);
 
-int32_t WelsMdP16x8(SWelsFuncPtrList *pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice *pSlice);
-int32_t WelsMdP8x16(SWelsFuncPtrList *pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice *pSlice);
-int32_t WelsMdP8x8(SWelsFuncPtrList *pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice *pSlice);
-/*static*/  void WelsMdInterInit( sWelsEncCtx* pEncCtx, SSlice *pSlice, SMB* pCurMb, const int32_t kiSliceFirstMbXY );
-/*static*/ void WelsMdInterFinePartition(void* pEnc, void* pMd, SSlice *pSlice, SMB* pCurMb, int32_t bestCost);
-/*static*/ void WelsMdInterFinePartitionVaa( void* pEnc, void* pMd, SSlice *pSlice, SMB* pCurMb, int32_t bestCost );
-void WelsMdInterMbRefinement(sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache *pMbCache);
-BOOL_T WelsMdFirstIntraMode(void* pEnc, void* pMd, SMB* pCurMb, SMbCache *pMbCache);
+int32_t WelsMdP16x8 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice* pSlice);
+int32_t WelsMdP8x16 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice* pSlice);
+int32_t WelsMdP8x8 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice* pSlice);
+/*static*/  void WelsMdInterInit (sWelsEncCtx* pEncCtx, SSlice* pSlice, SMB* pCurMb, const int32_t kiSliceFirstMbXY);
+/*static*/ void WelsMdInterFinePartition (void* pEnc, void* pMd, SSlice* pSlice, SMB* pCurMb, int32_t bestCost);
+/*static*/ void WelsMdInterFinePartitionVaa (void* pEnc, void* pMd, SSlice* pSlice, SMB* pCurMb, int32_t bestCost);
+void WelsMdInterMbRefinement (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache* pMbCache);
+BOOL_T WelsMdFirstIntraMode (void* pEnc, void* pMd, SMB* pCurMb, SMbCache* pMbCache);
 //BOOL_T svc_md_first_intra_mode_constrained(void* pEnc, void* pMd, SMB* pCurMb, SMbCache *pMbCache);
-void WelsMdInterMb(void* pEncCtx, void* pWelsMd, SSlice *pSlice, SMB* pCurMb );
+void WelsMdInterMb (void* pEncCtx, void* pWelsMd, SSlice* pSlice, SMB* pCurMb);
 
 //both used in BL and EL
 //void wels_md_inter_init ( SWelsMD* pMd, const uint8_t ref_idx, const bool_t is_highest_dlayer_flag );
 
-bool_t WelsMdInterJudgeBGDPskip         ( void* pEnc, void* pMd, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache, BOOL_T* bKeepSkip );
-bool_t WelsMdInterJudgeBGDPskipFalse( void* pEnc, void* pMd, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache, BOOL_T* bKeepSkip );
+bool_t WelsMdInterJudgeBGDPskip (void* pEnc, void* pMd, SSlice* pSlice, SMB* pCurMb, SMbCache* pMbCache,
+                                 BOOL_T* bKeepSkip);
+bool_t WelsMdInterJudgeBGDPskipFalse (void* pEnc, void* pMd, SSlice* pSlice, SMB* pCurMb, SMbCache* pMbCache,
+                                      BOOL_T* bKeepSkip);
 
-void WelsMdInterUpdateBGDInfo          ( SDqLayer* pCurLayer,  SMB* pCurMb, const bool_t kbCollocatedPredFlag, const int32_t kiRefPictureType );
-void WelsMdInterUpdateBGDInfoNULL ( SDqLayer* pCurLayer,  SMB* pCurMb, const bool_t kbCollocatedPredFlag, const int32_t kiRefPictureType );
+void WelsMdInterUpdateBGDInfo (SDqLayer* pCurLayer,  SMB* pCurMb, const bool_t kbCollocatedPredFlag,
+                               const int32_t kiRefPictureType);
+void WelsMdInterUpdateBGDInfoNULL (SDqLayer* pCurLayer,  SMB* pCurMb, const bool_t kbCollocatedPredFlag,
+                                   const int32_t kiRefPictureType);
 
-bool_t WelsMdInterJudgePskip( sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache, BOOL_T bTrySkip );
-void WelsMdInterUpdatePskip( SDqLayer* pCurDqLayer, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache );
-void WelsMdInterDecidedPskip( sWelsEncCtx* pEncCtx, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache );
+bool_t WelsMdInterJudgePskip (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice* pSlice, SMB* pCurMb, SMbCache* pMbCache,
+                              BOOL_T bTrySkip);
+void WelsMdInterUpdatePskip (SDqLayer* pCurDqLayer, SSlice* pSlice, SMB* pCurMb, SMbCache* pMbCache);
+void WelsMdInterDecidedPskip (sWelsEncCtx* pEncCtx, SSlice* pSlice, SMB* pCurMb, SMbCache* pMbCache);
 
-void WelsMdInterDoubleCheckPskip( SMB* pCurMb, SMbCache *pMbCache );
-void WelsMdInterEncode( sWelsEncCtx* pEncCtx, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache );
+void WelsMdInterDoubleCheckPskip (SMB* pCurMb, SMbCache* pMbCache);
+void WelsMdInterEncode (sWelsEncCtx* pEncCtx, SSlice* pSlice, SMB* pCurMb, SMbCache* pMbCache);
 
-void WelsMdInterSaveSadAndRefMbType( Mb_Type* pRefMbTypeList, SMbCache * pMbCache, const SMB*  kpCurMb, const SWelsMD* kpMd );
+void WelsMdInterSaveSadAndRefMbType (Mb_Type* pRefMbTypeList, SMbCache* pMbCache, const SMB*  kpCurMb,
+                                     const SWelsMD* kpMd);
 
-void WelsMdInterSecondaryModesEnc( sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache, const BOOL_T kbSkip );
-void WelsMdIntraSecondaryModesEnc( sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache *pMbCache );
+void WelsMdInterSecondaryModesEnc (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice* pSlice, SMB* pCurMb,
+                                   SMbCache* pMbCache, const BOOL_T kbSkip);
+void WelsMdIntraSecondaryModesEnc (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache* pMbCache);
 //end of: both used in BL and EL
 
 //typedef void (*MD_INTRA_MB_BASE) (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb);
--- a/codec/encoder/core/inc/svc_enc_frame.h
+++ b/codec/encoder/core/inc/svc_enc_frame.h
@@ -60,48 +60,49 @@
 ///////////////////////////////////DQ Layer level///////////////////////////////////
 
 typedef struct TagDqLayer	SDqLayer;
-typedef SDqLayer *			pDqLayer;
+typedef SDqLayer* 			pDqLayer;
 
-typedef struct TagLayerInfo{
-	SNalUnitHeaderExt		sNalHeaderExt;
-	SSlice					*pSliceInLayer;// Here SSlice identify to Frame on concept, [iSliceIndex], need memory block external side	for MT
-	SSubsetSps				*pSubsetSpsP;	// current pSubsetSps used, memory alloc in external
-	SWelsSPS						*pSpsP;		// current pSps based avc used, memory alloc in external
-	SWelsPPS						*pPpsP;		// current pPps used
+typedef struct TagLayerInfo {
+  SNalUnitHeaderExt		sNalHeaderExt;
+  SSlice*
+  pSliceInLayer;// Here SSlice identify to Frame on concept, [iSliceIndex], need memory block external side	for MT
+  SSubsetSps*				pSubsetSpsP;	// current pSubsetSps used, memory alloc in external
+  SWelsSPS*						pSpsP;		// current pSps based avc used, memory alloc in external
+  SWelsPPS*						pPpsP;		// current pPps used
 } SLayerInfo;
 /* Layer Representation */
-struct TagDqLayer{
-	SLayerInfo				sLayerInfo;
-	
-	uint8_t					*pCsData[3];	// pointer to reconstructed picture pData
-	int32_t					iCsStride[3];	// Cs stride
+struct TagDqLayer {
+  SLayerInfo				sLayerInfo;
 
-	uint8_t					*pEncData[3];	// pData picture to be encoded in current layer
-	int32_t					iEncStride[3];	// pData picture stride
+  uint8_t*					pCsData[3];	// pointer to reconstructed picture pData
+  int32_t					iCsStride[3];	// Cs stride
 
-	SMB*					sMbDataP;		// pointer to mb of mbAddr equal to 0 in slice, mb_data_ptr = mb_base_ptr + (1+iMbStride).	
-	int16_t					iMbWidth;		// MB width of this picture, equal to pSps.iMbWidth
-	int16_t					iMbHeight;		// MB height of this picture, equal to pSps.iMbHeight;
+  uint8_t*					pEncData[3];	// pData picture to be encoded in current layer
+  int32_t					iEncStride[3];	// pData picture stride
 
-	bool_t					bBaseLayerAvailableFlag;	// whether base layer is available for prediction?
-	uint8_t					iLoopFilterDisableIdc;	// 0: on, 1: off, 2: on except for slice boundaries
-	int8_t					iLoopFilterAlphaC0Offset;// AlphaOffset: valid range [-6, 6], default 0
-	int8_t					iLoopFilterBetaOffset;	// BetaOffset:	valid range [-6, 6], default 0
-	uint8_t				    uiDisableInterLayerDeblockingFilterIdc;
-	int8_t					iInterLayerSliceAlphaC0Offset;
-	int8_t					iInterLayerSliceBetaOffset;	
-	bool_t					bDeblockingParallelFlag; //parallel_deblocking_flag
+  SMB*					sMbDataP;		// pointer to mb of mbAddr equal to 0 in slice, mb_data_ptr = mb_base_ptr + (1+iMbStride).
+  int16_t					iMbWidth;		// MB width of this picture, equal to pSps.iMbWidth
+  int16_t					iMbHeight;		// MB height of this picture, equal to pSps.iMbHeight;
 
-	SPicture				*pRefPic;			// reference picture pointer
-	SPicture				*pDecPic;			// reconstruction picture pointer for layer
+  bool_t					bBaseLayerAvailableFlag;	// whether base layer is available for prediction?
+  uint8_t					iLoopFilterDisableIdc;	// 0: on, 1: off, 2: on except for slice boundaries
+  int8_t					iLoopFilterAlphaC0Offset;// AlphaOffset: valid range [-6, 6], default 0
+  int8_t					iLoopFilterBetaOffset;	// BetaOffset:	valid range [-6, 6], default 0
+  uint8_t				    uiDisableInterLayerDeblockingFilterIdc;
+  int8_t					iInterLayerSliceAlphaC0Offset;
+  int8_t					iInterLayerSliceBetaOffset;
+  bool_t					bDeblockingParallelFlag; //parallel_deblocking_flag
 
-	SSliceCtx			*pSliceEncCtx;	// current slice context
-	
-	int32_t					*pNumSliceCodedOfPartition;		// for dynamic slicing mode
-	int32_t					*pLastCodedMbIdxOfPartition;	// for dynamic slicing mode
-	int32_t					*pLastMbIdxOfPartition;			// for dynamic slicing mode
+  SPicture*				pRefPic;			// reference picture pointer
+  SPicture*				pDecPic;			// reconstruction picture pointer for layer
 
-	SDqLayer				*pRefLayer;		// pointer to referencing dq_layer of current layer to be decoded	
+  SSliceCtx*			pSliceEncCtx;	// current slice context
+
+  int32_t*					pNumSliceCodedOfPartition;		// for dynamic slicing mode
+  int32_t*					pLastCodedMbIdxOfPartition;	// for dynamic slicing mode
+  int32_t*					pLastMbIdxOfPartition;			// for dynamic slicing mode
+
+  SDqLayer*				pRefLayer;		// pointer to referencing dq_layer of current layer to be decoded
 
 };
 
--- a/codec/encoder/core/inc/svc_enc_golomb.h
+++ b/codec/encoder/core/inc/svc_enc_golomb.h
@@ -57,13 +57,13 @@
 #define    CAVLC_BS_INIT( pBs )  \
 	uint8_t  * pBufPtr = pBs->pBufPtr; \
 	uint32_t   uiCurBits = pBs->uiCurBits; \
-	int32_t    iLeftBits = pBs->iLeftBits; 
+	int32_t    iLeftBits = pBs->iLeftBits;
 
 #define    CAVLC_BS_UNINIT( pBs ) \
 	pBs->pBufPtr = pBufPtr;  \
 	pBs->uiCurBits = uiCurBits;  \
 	pBs->iLeftBits = iLeftBits;
-   
+
 #define    CAVLC_BS_WRITE( n,  v ) \
 	{  \
 	if ( (n) < iLeftBits ) {\
@@ -78,7 +78,7 @@
 		uiCurBits = (v) & ((1<<(n))-1);\
 		iLeftBits = 32 - (n);\
 	}\
-	} ;  
+	} ;
 
 extern const uint32_t g_uiGolombUELength[256];
 
@@ -86,192 +86,158 @@
 /*
  *	Get size of unsigned exp golomb codes
  */
-static inline uint32_t BsSizeUE( const uint32_t kiValue )
-{
-	if ( 256 > kiValue )
-	{
-		return g_uiGolombUELength[kiValue];	
-	}
-	else
-	{
-		uint32_t n = 0;	
-		uint32_t iTmpValue = kiValue+1;
-		
-		if (iTmpValue & 0xffff0000) 
-		{
-			iTmpValue >>= 16;
-			n += 16;
-		}
-		if (iTmpValue & 0xff00) 
-		{
-			iTmpValue >>= 8;
-			n += 8;
-		}
-		
-		//n += (g_uiGolombUELength[iTmpValue] >> 1);
-		n += (g_uiGolombUELength[iTmpValue-1] >> 1);
-		return ((n<<1) + 1);
-		
-	}
+static inline uint32_t BsSizeUE (const uint32_t kiValue) {
+if (256 > kiValue) {
+  return g_uiGolombUELength[kiValue];
+} else {
+  uint32_t n = 0;
+  uint32_t iTmpValue = kiValue + 1;
+
+  if (iTmpValue & 0xffff0000) {
+    iTmpValue >>= 16;
+    n += 16;
+  }
+  if (iTmpValue & 0xff00) {
+    iTmpValue >>= 8;
+    n += 8;
+  }
+
+  //n += (g_uiGolombUELength[iTmpValue] >> 1);
+  n += (g_uiGolombUELength[iTmpValue - 1] >> 1);
+  return ((n << 1) + 1);
+
 }
+}
 
 /*
  *	Get size of signed exp golomb codes
  */
-static inline uint32_t BsSizeSE( const int32_t kiValue )
-{
-	uint32_t iTmpValue;
-	if ( 0 == kiValue )
-	{
-		return 1;
-	}
-	else if ( 0 < kiValue )
-	{
-		iTmpValue = (kiValue<<1) - 1;
-		return BsSizeUE( iTmpValue );
-	}
-	else
-	{
-		iTmpValue = ((-kiValue)<<1);
-		return BsSizeUE( iTmpValue );
-	}
+static inline uint32_t BsSizeSE (const int32_t kiValue) {
+uint32_t iTmpValue;
+if (0 == kiValue) {
+  return 1;
+} else if (0 < kiValue) {
+  iTmpValue = (kiValue << 1) - 1;
+  return BsSizeUE (iTmpValue);
+} else {
+  iTmpValue = ((-kiValue) << 1);
+  return BsSizeUE (iTmpValue);
 }
+}
 
 /*
  *	Get size of truncated exp golomb codes
  */
-static inline int32_t BsSizeTE( const int32_t kiX, const int32_t kiValue )
-{
-	return 0;
+static inline int32_t BsSizeTE (const int32_t kiX, const int32_t kiValue) {
+return 0;
 }
 
 
 
-static inline int32_t BsWriteBits( SBitStringAux *pBs, int32_t n, const uint32_t kuiValue )
-{  
-	if( n < pBs->iLeftBits ){
-		pBs->uiCurBits = (pBs->uiCurBits<<n) | kuiValue;
-		pBs->iLeftBits -= n;	
-	} else {
-	    n -= pBs->iLeftBits;
-		pBs->uiCurBits = (pBs->uiCurBits<<pBs->iLeftBits) | (kuiValue>>n);
-		*((uint32_t*)pBs->pBufPtr) = ENDIAN_FIX(pBs->uiCurBits);		
-		pBs->pBufPtr += 4;
-		pBs->uiCurBits = kuiValue & ((1<<n)-1);
-		pBs->iLeftBits = 32 - n;
-	}
-	return 0;
+static inline int32_t BsWriteBits (SBitStringAux* pBs, int32_t n, const uint32_t kuiValue) {
+if (n < pBs->iLeftBits) {
+  pBs->uiCurBits = (pBs->uiCurBits << n) | kuiValue;
+  pBs->iLeftBits -= n;
+} else {
+  n -= pBs->iLeftBits;
+  pBs->uiCurBits = (pBs->uiCurBits << pBs->iLeftBits) | (kuiValue >> n);
+  * ((uint32_t*)pBs->pBufPtr) = ENDIAN_FIX (pBs->uiCurBits);
+  pBs->pBufPtr += 4;
+  pBs->uiCurBits = kuiValue & ((1 << n) - 1);
+  pBs->iLeftBits = 32 - n;
 }
+return 0;
+}
 
 /*
  *	Write 1 bit
  */
-static inline int32_t BsWriteOneBit( SBitStringAux *pBs, const uint32_t kuiValue )
-{
-	BsWriteBits(pBs, 1, kuiValue);
-	
-	return 0;
+static inline int32_t BsWriteOneBit (SBitStringAux* pBs, const uint32_t kuiValue) {
+BsWriteBits (pBs, 1, kuiValue);
+
+return 0;
 }
 
 
-static inline void BsFlush(SBitStringAux * pBs)
-{
-    *(uint32_t*)pBs->pBufPtr = ENDIAN_FIX(pBs->uiCurBits << pBs->iLeftBits);
-	pBs->pBufPtr += 4 - pBs->iLeftBits/8;
-	pBs->iLeftBits = 32;
-	pBs->uiCurBits = 0;	//  for future writing safe, 5/19/2010
+static inline void BsFlush (SBitStringAux* pBs) {
+* (uint32_t*)pBs->pBufPtr = ENDIAN_FIX (pBs->uiCurBits << pBs->iLeftBits);
+pBs->pBufPtr += 4 - pBs->iLeftBits / 8;
+pBs->iLeftBits = 32;
+pBs->uiCurBits = 0;	//  for future writing safe, 5/19/2010
 }
 
 /*
  *	Write unsigned exp golomb codes
  */
-static inline void BsWriteUE( SBitStringAux *pBs, const uint32_t kuiValue )
-{
-	if ( 256 > kuiValue )	{
-		BsWriteBits( pBs, g_uiGolombUELength[kuiValue], kuiValue+1 );
-	}
-	else
-	{
-		uint32_t n = 0;	
-		uint32_t iTmpValue = kuiValue + 1;
-		
-		if (iTmpValue & 0xffff0000) 
-		{
-			iTmpValue >>= 16;
-			n += 16;
-		}
-		if (iTmpValue & 0xff00) 
-		{
-			iTmpValue >>= 8;
-			n += 8;
-		}
+static inline void BsWriteUE (SBitStringAux* pBs, const uint32_t kuiValue) {
+if (256 > kuiValue)	{
+  BsWriteBits (pBs, g_uiGolombUELength[kuiValue], kuiValue + 1);
+} else {
+  uint32_t n = 0;
+  uint32_t iTmpValue = kuiValue + 1;
 
-		//n += (g_uiGolombUELength[iTmpValue] >> 1);
+  if (iTmpValue & 0xffff0000) {
+    iTmpValue >>= 16;
+    n += 16;
+  }
+  if (iTmpValue & 0xff00) {
+    iTmpValue >>= 8;
+    n += 8;
+  }
 
-		n += (g_uiGolombUELength[iTmpValue-1] >> 1);
-		BsWriteBits( pBs, (n<<1) + 1, kuiValue+1 );
-	}
-	return;
+  //n += (g_uiGolombUELength[iTmpValue] >> 1);
+
+  n += (g_uiGolombUELength[iTmpValue - 1] >> 1);
+  BsWriteBits (pBs, (n << 1) + 1, kuiValue + 1);
 }
+return;
+}
 
 /*
  *	Write signed exp golomb codes
  */
-static inline void BsWriteSE( SBitStringAux *pBs, int32_t iValue )
-{	
-	uint32_t iTmpValue;
-	if ( 0 == iValue )
-	{
-		BsWriteOneBit( pBs, 1 );
-	}
-	else if ( 0 < iValue )
-	{
-		iTmpValue = (iValue<<1) - 1;
-		BsWriteUE( pBs, iTmpValue );
-	}
-	else
-	{
-		iTmpValue = ((-iValue)<<1);
-		BsWriteUE( pBs, iTmpValue );
-	}
-	return;
+static inline void BsWriteSE (SBitStringAux* pBs, int32_t iValue) {
+uint32_t iTmpValue;
+if (0 == iValue) {
+  BsWriteOneBit (pBs, 1);
+} else if (0 < iValue) {
+  iTmpValue = (iValue << 1) - 1;
+  BsWriteUE (pBs, iTmpValue);
+} else {
+  iTmpValue = ((-iValue) << 1);
+  BsWriteUE (pBs, iTmpValue);
 }
+return;
+}
 
 /*
  *	Write truncated exp golomb codes
  */
-static inline void BsWriteTE( SBitStringAux *pBs, const int32_t kiX, const uint32_t kuiValue )
-{
-	if ( 1 == kiX )
-	{
-		BsWriteOneBit( pBs, !kuiValue );
-	}
-	else
-	{
-		BsWriteUE( pBs, kuiValue );
-	}
+static inline void BsWriteTE (SBitStringAux* pBs, const int32_t kiX, const uint32_t kuiValue) {
+if (1 == kiX) {
+  BsWriteOneBit (pBs, !kuiValue);
+} else {
+  BsWriteUE (pBs, kuiValue);
 }
+}
 
 
 /*
  *	Write RBSP trailing bits
  */
-static inline void BsRbspTrailingBits( SBitStringAux *pBs )
-{
-	BsWriteOneBit(pBs, 1);	
-	BsFlush(pBs);	
+static inline void BsRbspTrailingBits (SBitStringAux* pBs) {
+BsWriteOneBit (pBs, 1);
+BsFlush (pBs);
 }
 
 
-static inline BOOL_T   BsCheckByteAlign( SBitStringAux * pBs)
-{
-    return !(pBs->iLeftBits & 0x7);
+static inline BOOL_T   BsCheckByteAlign (SBitStringAux* pBs) {
+return ! (pBs->iLeftBits & 0x7);
 }
 
 
-static inline int32_t BsGetBitsPos( SBitStringAux *pBs )
-{
-	return ( ((pBs->pBufPtr - pBs->pBuf) << 3) + 32 - pBs->iLeftBits );
+static inline int32_t BsGetBitsPos (SBitStringAux* pBs) {
+return (((pBs->pBufPtr - pBs->pBuf) << 3) + 32 - pBs->iLeftBits);
 }
 
 }
--- a/codec/encoder/core/inc/svc_enc_macroblock.h
+++ b/codec/encoder/core/inc/svc_enc_macroblock.h
@@ -44,33 +44,33 @@
 //struct Mb_s;
 
 /* MB syntax and context, refer to Page 399 in JVT X201wcm */
-// keep the most essential level pData structure be 64 Bytes, which matches cache line size; if so, the order with structure maybe negligible. 
+// keep the most essential level pData structure be 64 Bytes, which matches cache line size; if so, the order with structure maybe negligible.
 // pls take care when modify MB structure size
-typedef struct TagMB{	
-	/*************************mb_layer() syntax and generated********************************/
-	/*mb_layer():*/
-	Mb_Type		uiMbType;	// including MB detailed partition type, number and type of reference list
-	int16_t		iMbXY;		// offset position of MB top left point based	
-	int16_t		iMbX;		// position of MB in horizontal axis
-	int16_t		iMbY;		// position of MB in vertical axis
+typedef struct TagMB {
+/*************************mb_layer() syntax and generated********************************/
+/*mb_layer():*/
+Mb_Type		uiMbType;	// including MB detailed partition type, number and type of reference list
+int16_t		iMbXY;		// offset position of MB top left point based
+int16_t		iMbX;		// position of MB in horizontal axis
+int16_t		iMbY;		// position of MB in vertical axis
 
-	uint8_t		uiNeighborAvail;	// avail && same_slice: LEFT_MB_POS:0x01, TOP_MB_POS:0x02, TOPRIGHT_MB_POS = 0x04 ,TOPLEFT_MB_POS = 0x08;
-	uint8_t		uiCbp;	
+uint8_t		uiNeighborAvail;	// avail && same_slice: LEFT_MB_POS:0x01, TOP_MB_POS:0x02, TOPRIGHT_MB_POS = 0x04 ,TOPLEFT_MB_POS = 0x08;
+uint8_t		uiCbp;
 
-	SMVUnitXY	*sMv;
-	int8_t		*pRefIndex;
+SMVUnitXY*	sMv;
+int8_t*		pRefIndex;
 
-	int32_t     *pSadCost;				// mb sad. set to 0 for intra mb
-	int8_t      *pIntra4x4PredMode;	// [MB_BLOCK4x4_NUM]
-	int8_t      *pNonZeroCount;		// [MB_LUMA_CHROMA_BLOCK4x4_NUM]
+int32_t*     pSadCost;				// mb sad. set to 0 for intra mb
+int8_t*      pIntra4x4PredMode;	// [MB_BLOCK4x4_NUM]
+int8_t*      pNonZeroCount;		// [MB_LUMA_CHROMA_BLOCK4x4_NUM]
 
-	SMVUnitXY	sP16x16Mv;
+SMVUnitXY	sP16x16Mv;
 
-	uint8_t		uiLumaQp;		// uiLumaQp: pPps->iInitialQp + sSliceHeader->delta_qp + mb->dquant.
-	uint8_t		uiChromaQp;	
-	uint8_t		uiSliceIdc;	// AVC: pFirstMbInSlice?; SVC: (pFirstMbInSlice << 7) | ((uiDependencyId << 4) | uiQualityId);
-	uint8_t		reserved_filling_bytes[1];	// filling bytes reserved to make structure aligned with 4 bytes, higher cache hit on less structure size by 2 cache lines( 2 * 64 bytes) once hit
-}SMB, *PMb;
+uint8_t		uiLumaQp;		// uiLumaQp: pPps->iInitialQp + sSliceHeader->delta_qp + mb->dquant.
+uint8_t		uiChromaQp;
+uint8_t		uiSliceIdc;	// AVC: pFirstMbInSlice?; SVC: (pFirstMbInSlice << 7) | ((uiDependencyId << 4) | uiQualityId);
+uint8_t		reserved_filling_bytes[1];	// filling bytes reserved to make structure aligned with 4 bytes, higher cache hit on less structure size by 2 cache lines( 2 * 64 bytes) once hit
+} SMB, *PMb;
 
 }
 
--- a/codec/encoder/core/inc/svc_enc_slice_segment.h
+++ b/codec/encoder/core/inc/svc_enc_slice_segment.h
@@ -47,18 +47,18 @@
 
 #include "codec_app_def.h"
 namespace WelsSVCEnc {
-/*! 
+/*!
  * \brief	SSlice mode
  */
 typedef uint16_t SliceMode;
-typedef enum{
-	SM_SINGLE_SLICE         = 0,
-	SM_FIXEDSLCNUM_SLICE	= 1,
-	SM_RASTER_SLICE			= 2,
-	SM_ROWMB_SLICE			= 3,
-	SM_DYN_SLICE			= 4,
-	SM_RESERVED				= 5
-}SliceModeEnum;
+typedef enum {
+SM_SINGLE_SLICE         = 0,
+SM_FIXEDSLCNUM_SLICE	= 1,
+SM_RASTER_SLICE			= 2,
+SM_ROWMB_SLICE			= 3,
+SM_DYN_SLICE			= 4,
+SM_RESERVED				= 5
+} SliceModeEnum;
 
 
 // NOTE:
@@ -80,44 +80,45 @@
 #define JUMPPACKETSIZE_JUDGE(len,mb_idx,max_byte)	 ( (len) > JUMPPACKETSIZE_CONSTRAINT(max_byte) ) //( (mb_idx+1)%40/*16slice for compare*/ == 0 )	//
 //cur_mb_idx is for early tests, can be omit in optimization
 
-typedef struct TagSliceArgument{
-	uint32_t			uiSliceMbNum[MAX_SLICES_NUM];   //will perform check on this array to decide specific slicing, see note	
-	uint32_t			uiSliceSizeConstraint;
-	int16_t				iSliceNum;
+typedef struct TagSliceArgument {
+uint32_t			uiSliceMbNum[MAX_SLICES_NUM];   //will perform check on this array to decide specific slicing, see note
+uint32_t			uiSliceSizeConstraint;
+int16_t				iSliceNum;
 } SSliceArgument;
 
-typedef struct TagMulSliceOption{ //interfaces about slicing from application layer	
-	SSliceArgument		sSliceArgument; //according to uiSliceMode, decide which elements of this structure will actually takes effect
-	SliceMode			uiSliceMode;
+typedef struct TagMulSliceOption { //interfaces about slicing from application layer
+SSliceArgument
+sSliceArgument; //according to uiSliceMode, decide which elements of this structure will actually takes effect
+SliceMode			uiSliceMode;
 } SMulSliceOption;
 
-/*! 
+/*!
  * \brief	SSlice context
  */
-/* Single/multiple slices */	
-typedef struct SlicepEncCtx_s{
-	SliceMode		uiSliceMode;			/* 0: single slice in frame; 1: multiple slices in frame; */
-	int16_t			iMbWidth;			/* width of picture size in mb */
-	int16_t			iMbHeight;			/* height of picture size in mb */
-	int16_t			iSliceNumInFrame;	/* count number of slices in frame; */
-	int32_t			iMbNumInFrame;	/* count number of MBs in frame */
-	uint8_t			*pOverallMbMap;	/* overall MB map in frame, store virtual slice idc; */	
-	int16_t			*pFirstMbInSlice;	/* first MB address top-left based in every slice respectively; */
-	int32_t			*pCountMbNumInSlice;	/* count number of MBs in every slice respectively; */
-	uint32_t		uiSliceSizeConstraint;/*in byte*/
-	int32_t			iMaxSliceNumConstraint;/*maximal number of slices constraint*/
+/* Single/multiple slices */
+typedef struct SlicepEncCtx_s {
+SliceMode		uiSliceMode;			/* 0: single slice in frame; 1: multiple slices in frame; */
+int16_t			iMbWidth;			/* width of picture size in mb */
+int16_t			iMbHeight;			/* height of picture size in mb */
+int16_t			iSliceNumInFrame;	/* count number of slices in frame; */
+int32_t			iMbNumInFrame;	/* count number of MBs in frame */
+uint8_t*			pOverallMbMap;	/* overall MB map in frame, store virtual slice idc; */
+int16_t*			pFirstMbInSlice;	/* first MB address top-left based in every slice respectively; */
+int32_t*			pCountMbNumInSlice;	/* count number of MBs in every slice respectively; */
+uint32_t		uiSliceSizeConstraint;/*in byte*/
+int32_t			iMaxSliceNumConstraint;/*maximal number of slices constraint*/
 } SSliceCtx;
 
 
-typedef struct TagDynamicSlicingStack{
-	int32_t		iStartPos;	
-	int32_t		iCurrentPos;	
+typedef struct TagDynamicSlicingStack {
+int32_t		iStartPos;
+int32_t		iCurrentPos;
 
-	uint8_t		*pBsStackBufPtr;	// current writing position	
-	uint32_t    uiBsStackCurBits;  
-	int32_t		iBsStackLeftBits;
+uint8_t*		pBsStackBufPtr;	// current writing position
+uint32_t    uiBsStackCurBits;
+int32_t		iBsStackLeftBits;
 
-	int32_t		iMbSkipRunStack;
+int32_t		iMbSkipRunStack;
 } SDynamicSlicingStack;
 
 /*!
@@ -125,7 +126,7 @@
  *
  * \param	pSliceCtx		SSlice context to be initialized
  * \param	bFmoUseFlag	flag of using fmo
- * \param	iMbWidth		MB width 
+ * \param	iMbWidth		MB width
  * \param	iMbHeight		MB height
  * \param	uiSliceMode		slice mode
  * \param	mul_slice_arg	argument for multiple slice if it is applicable
@@ -133,23 +134,23 @@
  *
  * \return	0 - successful; none 0 - failed;
  */
-int32_t InitSlicePEncCtx( SSliceCtx *pSliceCtx,
-						    CMemoryAlign *pMa,
-						    bool_t bFmoUseFlag,
-							int32_t iMbWidth,
-							int32_t iMbHeight,
-							SMulSliceOption *pMulSliceOption,
-							void *pPpsArg );
+int32_t InitSlicePEncCtx (SSliceCtx* pSliceCtx,
+                          CMemoryAlign* pMa,
+                          bool_t bFmoUseFlag,
+                          int32_t iMbWidth,
+                          int32_t iMbHeight,
+                          SMulSliceOption* pMulSliceOption,
+                          void* pPpsArg);
 
 
 /*!
  * \brief	Uninitialize Wels SSlice context (Single/multiple slices and FMO)
  *
- * \param	pSliceCtx		SSlice context to be initialized 
+ * \param	pSliceCtx		SSlice context to be initialized
  *
  * \return	NONE;
  */
-void UninitSlicePEncCtx( SSliceCtx *pSliceCtx, CMemoryAlign *pMa );
+void UninitSlicePEncCtx (SSliceCtx* pSliceCtx, CMemoryAlign* pMa);
 
 /*!
  * \brief	Get slice idc for given iMbXY (apply in Single/multiple slices and FMO)
@@ -159,7 +160,7 @@
  *
  * \return	uiSliceIdc - successful; (uint8_t)(-1) - failed;
  */
-uint8_t WelsMbToSliceIdc( SSliceCtx *pSliceCtx, const int16_t kiMbXY );
+uint8_t WelsMbToSliceIdc (SSliceCtx* pSliceCtx, const int16_t kiMbXY);
 
 /*!
  * \brief	Get first mb in slice/slice_group: uiSliceIdc (apply in Single/multiple slices and FMO)
@@ -169,7 +170,7 @@
  *
  * \return	first_mb - successful; -1 - failed;
  */
-int32_t WelsGetFirstMbOfSlice( SSliceCtx *pSliceCtx, const int32_t kiSliceIdc );
+int32_t WelsGetFirstMbOfSlice (SSliceCtx* pSliceCtx, const int32_t kiSliceIdc);
 
 /*!
  * \brief	Get successive mb to be processed in slice/slice_group: uiSliceIdc (apply in Single/multiple slices and FMO)
@@ -179,7 +180,7 @@
  *
  * \return	next_mb - successful; -1 - failed;
  */
-int32_t WelsGetNextMbOfSlice( SSliceCtx *pSliceCtx, const int16_t kiMbXY );
+int32_t WelsGetNextMbOfSlice (SSliceCtx* pSliceCtx, const int16_t kiMbXY);
 
 /*!
  * \brief	Get previous mb to be processed in slice/slice_group: uiSliceIdc (apply in Single/multiple slices and FMO)
@@ -189,7 +190,7 @@
  *
  * \return	prev_mb - successful; -1 - failed;
  */
-int32_t WelsGetPrevMbOfSlice( SSliceCtx *pSliceCtx, const int16_t kiMbXY );
+int32_t WelsGetPrevMbOfSlice (SSliceCtx* pSliceCtx, const int16_t kiMbXY);
 
 /*!
  * \brief	Get number of mb in slice/slice_group: uiSliceIdc (apply in Single/multiple slices and FMO)
@@ -199,27 +200,27 @@
  *
  * \return	count_num_of_mb - successful; -1 - failed;
  */
-int32_t WelsGetNumMbInSlice( SSliceCtx *pSliceCtx, const int32_t kiSliceIdc );
+int32_t WelsGetNumMbInSlice (SSliceCtx* pSliceCtx, const int32_t kiSliceIdc);
 
 /*!
  *	Get slice count for multiple slice segment
  *
  */
-int32_t GetInitialSliceNum( const int32_t kiMbWidth, const int32_t kiMbHeight, SMulSliceOption* pMso );
-int32_t GetCurrentSliceNum( const SSliceCtx *kpSliceCtx );
+int32_t GetInitialSliceNum (const int32_t kiMbWidth, const int32_t kiMbHeight, SMulSliceOption* pMso);
+int32_t GetCurrentSliceNum (const SSliceCtx* kpSliceCtx);
 
 //checking valid para
-int32_t DynamicMaxSliceNumConstraint( uint32_t uiMaximumNum, int32_t uiConsumedNum, uint32_t uiDulplicateTimes  );
+int32_t DynamicMaxSliceNumConstraint (uint32_t uiMaximumNum, int32_t uiConsumedNum, uint32_t uiDulplicateTimes);
 
-bool_t CheckFixedSliceNumMultiSliceSetting( const int32_t kiMbNumInFrame,  SSliceArgument * pSliceArg );
-bool_t CheckRasterMultiSliceSetting( const int32_t kiMbNumInFrame, SSliceArgument * pSliceArg );
-bool_t CheckRowMbMultiSliceSetting( const int32_t kiMbWidth,  SSliceArgument * pSliceArg );
+bool_t CheckFixedSliceNumMultiSliceSetting (const int32_t kiMbNumInFrame,  SSliceArgument* pSliceArg);
+bool_t CheckRasterMultiSliceSetting (const int32_t kiMbNumInFrame, SSliceArgument* pSliceArg);
+bool_t CheckRowMbMultiSliceSetting (const int32_t kiMbWidth,  SSliceArgument* pSliceArg);
 
-void GomValidCheckSliceNum( const int32_t kiMbWidth, const int32_t kiMbHeight, int32_t *pSliceNum );
-void GomValidCheckSliceMbNum( const int32_t kiMbWidth, const int32_t kiMbHeight,  SSliceArgument * pSliceArg );
+void GomValidCheckSliceNum (const int32_t kiMbWidth, const int32_t kiMbHeight, int32_t* pSliceNum);
+void GomValidCheckSliceMbNum (const int32_t kiMbWidth, const int32_t kiMbHeight,  SSliceArgument* pSliceArg);
 //end of checking valid para
 
-int32_t DynamicAdjustSlicePEncCtxAll(	SSliceCtx *pSliceCtx,
-											int32_t *pRunLength	);
+int32_t DynamicAdjustSlicePEncCtxAll (SSliceCtx* pSliceCtx,
+                                      int32_t* pRunLength);
 }
 #endif//WELS_SLICE_SEGMENT_H__
--- a/codec/encoder/core/inc/svc_encode_mb.h
+++ b/codec/encoder/core/inc/svc_encode_mb.h
@@ -49,16 +49,16 @@
 #include "wels_func_ptr_def.h"
 
 namespace WelsSVCEnc {
-void	WelsDctMb(int16_t* pRs, uint8_t* pEncMb, int32_t iEncStride, uint8_t* pBestPred, PDctFunc pfDctFourT4);
+void	WelsDctMb (int16_t* pRs, uint8_t* pEncMb, int32_t iEncStride, uint8_t* pBestPred, PDctFunc pfDctFourT4);
 
-void	WelsEncRecI16x16Y(sWelsEncCtx *pEncCtx, SMB *pCurMb, SMbCache *pMbCache);
-void	WelsEncRecI4x4Y( sWelsEncCtx *pEncCtx, SMB *pCurMb, SMbCache *pMbCache, uint8_t uiI4x4Idx);
-void	WelsEncInterY(SWelsFuncPtrList *func, SMB * pCurMb, SMbCache *pMbCache);
-void    WelsEncRecUV(SWelsFuncPtrList *func, SMB * pCurMb, SMbCache *pMbCache, int16_t * pRs, int32_t iUV);
-void    WelsRecPskip(SDqLayer *pCurDq, SWelsFuncPtrList *pFunc, SMB * pCurMb, SMbCache *pMbCache);
+void	WelsEncRecI16x16Y (sWelsEncCtx* pEncCtx, SMB* pCurMb, SMbCache* pMbCache);
+void	WelsEncRecI4x4Y (sWelsEncCtx* pEncCtx, SMB* pCurMb, SMbCache* pMbCache, uint8_t uiI4x4Idx);
+void	WelsEncInterY (SWelsFuncPtrList* func, SMB* pCurMb, SMbCache* pMbCache);
+void    WelsEncRecUV (SWelsFuncPtrList* func, SMB* pCurMb, SMbCache* pMbCache, int16_t* pRs, int32_t iUV);
+void    WelsRecPskip (SDqLayer* pCurDq, SWelsFuncPtrList* pFunc, SMB* pCurMb, SMbCache* pMbCache);
 
-BOOL_T	WelsTryPYskip(sWelsEncCtx * pEncCtx, SMB * pCurMb, SMbCache *pMbCache);
-BOOL_T    WelsTryPUVskip(sWelsEncCtx * pEncCtx, SMB * pCurMb, SMbCache *pMbCache, int32_t iUV);
+BOOL_T	WelsTryPYskip (sWelsEncCtx* pEncCtx, SMB* pCurMb, SMbCache* pMbCache);
+BOOL_T    WelsTryPUVskip (sWelsEncCtx* pEncCtx, SMB* pCurMb, SMbCache* pMbCache, int32_t iUV);
 }
 #endif
 
--- a/codec/encoder/core/inc/svc_encode_slice.h
+++ b/codec/encoder/core/inc/svc_encode_slice.h
@@ -31,7 +31,7 @@
  *
  * \file	svc_encode_slice.h
  *
- * \brief	svc encoding slice 
+ * \brief	svc encoding slice
  *
  * \date	2009.07.27 Created
  *
@@ -47,54 +47,59 @@
 
 namespace WelsSVCEnc {
 #if defined(MB_TYPES_CHECK)
-void WelsCountMbType(int32_t (*iMbCount)[18], const EWelsSliceType eSt, const SMB* pMb);
+void WelsCountMbType (int32_t (*iMbCount)[18], const EWelsSliceType eSt, const SMB* pMb);
 #endif
 
 
-void UpdateNonZeroCountCache(SMB *pMb, SMbCache *pMbCache);
+void UpdateNonZeroCountCache (SMB* pMb, SMbCache* pMbCache);
 
 //for P SSlice (intra part + inter part, MB level)
-void OutputPMbWithoutConstructCsRsNoCopy( sWelsEncCtx *pEncCtx, SDqLayer* pDq, SSlice *pSlice, SMB* pMb );
+void OutputPMbWithoutConstructCsRsNoCopy (sWelsEncCtx* pEncCtx, SDqLayer* pDq, SSlice* pSlice, SMB* pMb);
 
-void WelsSliceHeaderScalExtInit( SDqLayer* pCurLayer, SSlice *pSlice );
-void WelsSliceHeaderExtInit( sWelsEncCtx* pEncCtx, SDqLayer* pCurLayer, SSlice *pSlice );
+void WelsSliceHeaderScalExtInit (SDqLayer* pCurLayer, SSlice* pSlice);
+void WelsSliceHeaderExtInit (sWelsEncCtx* pEncCtx, SDqLayer* pCurLayer, SSlice* pSlice);
 
-void WelsSliceHeaderWrite( SBitStringAux* pBs, SDqLayer* pCurLayer, SSlice *pSlice, uint32_t uiPpsIdBasis );
-void WelsSliceHeaderExtWrite( SBitStringAux* pBs, SDqLayer* pCurLayer, SSlice *pSlice, uint32_t uiPpsIdBasis );
+void WelsSliceHeaderWrite (SBitStringAux* pBs, SDqLayer* pCurLayer, SSlice* pSlice, uint32_t uiPpsIdBasis);
+void WelsSliceHeaderExtWrite (SBitStringAux* pBs, SDqLayer* pCurLayer, SSlice* pSlice, uint32_t uiPpsIdBasis);
 
 //===================MB-leve encode====================//
-void WelsInterMbEncode( sWelsEncCtx* pEncCtx, SSlice *pSlice, SMB* pCurMb );//only for inter part
+void WelsInterMbEncode (sWelsEncCtx* pEncCtx, SSlice* pSlice, SMB* pCurMb); //only for inter part
 //for I SSlice (only intra part, MB level)
-void WelsIMbChromaEncode( sWelsEncCtx* pEncCtx, SMB* pCurMb, SMbCache *pMbCache );
+void WelsIMbChromaEncode (sWelsEncCtx* pEncCtx, SMB* pCurMb, SMbCache* pMbCache);
 //for P SSlice (intra part + inter part, MB level)
-void WelsPMbChromaEncode( sWelsEncCtx* pEncCtx, SSlice *pSlice, SMB* pCurMb );
+void WelsPMbChromaEncode (sWelsEncCtx* pEncCtx, SSlice* pSlice, SMB* pCurMb);
 
 
 //===================MB-level encode====================//
 //encapsulation func: store base rec, highest Dependency Layer(only one quality) rec, single layer rec
-void WelsPSliceMdEnc( sWelsEncCtx* pEncCtx, SSlice *pSlice,  const bool_t kbIsHighestDlayerFlag );
-void WelsPSliceMdEncDynamic( sWelsEncCtx* pEncCtx, SSlice *pSlice,  const bool_t kbIsHighestDlayerFlag );
+void WelsPSliceMdEnc (sWelsEncCtx* pEncCtx, SSlice* pSlice,  const bool_t kbIsHighestDlayerFlag);
+void WelsPSliceMdEncDynamic (sWelsEncCtx* pEncCtx, SSlice* pSlice,  const bool_t kbIsHighestDlayerFlag);
 
 //encapsulation func: store base rec, highest Dependency Layer(only one quality) rec, single layer rec
-void WelsISliceMdEnc( sWelsEncCtx* pEncCtx, SSlice *pSlice );	// for intra non-dynamic slice
-void WelsISliceMdEncDynamic( sWelsEncCtx* pEncCtx, SSlice *pSlice );	// for intra dynamic slice
+void WelsISliceMdEnc (sWelsEncCtx* pEncCtx, SSlice* pSlice);	// for intra non-dynamic slice
+void WelsISliceMdEncDynamic (sWelsEncCtx* pEncCtx, SSlice* pSlice);	// for intra dynamic slice
 
-void WelsCodePSlice( sWelsEncCtx* pEncCtx, SSlice *pSlice );
-void WelsCodePOverDynamicSlice( sWelsEncCtx* pEncCtx, SSlice *pSlice );
+void WelsCodePSlice (sWelsEncCtx* pEncCtx, SSlice* pSlice);
+void WelsCodePOverDynamicSlice (sWelsEncCtx* pEncCtx, SSlice* pSlice);
 
-void WelsCodeOneSlice( sWelsEncCtx* pEncCtx, const int32_t kiSliceIdx, const int32_t/*EWelsNalUnitType*/ keNalType/*, bool_t bNewLayer*/ );
+void WelsCodeOneSlice (sWelsEncCtx* pEncCtx, const int32_t kiSliceIdx,
+                       const int32_t/*EWelsNalUnitType*/ keNalType/*, bool_t bNewLayer*/);
 
-void WelsInitSliceEncodingFuncs( uint32_t uiCpuFlag );
+void WelsInitSliceEncodingFuncs (uint32_t uiCpuFlag);
 
-void UpdateMbNeighbourInfoForNextSlice(	SSliceCtx *pSliceCtx,
-											 SMB *pMbList,
-											 const int32_t kiNextSliceFirstMbIdx,
-											 const int32_t kiLastMbIdxInPartition );
-void AddSliceBoundary(sWelsEncCtx* pEncCtx, SSlice * pCurSlice, SSliceCtx *pSliceCtx, SMB* pCurMb, int32_t iNextSliceFirstMbIdx, const int32_t kiLastMbIdxInPartition );
-void WelsMdInterMbLoop( sWelsEncCtx* pEncCtx, SSlice *pSlice, void* pMd, const int32_t kiSliceFirstMbXY );	// for inter non-dynamic slice
-void WelsMdInterMbLoopOverDynamicSlice( sWelsEncCtx* pEncCtx, SSlice *pSlice, void* pMd, const int32_t kiSliceFirstMbXY );	// for inter dynamic slice
+void UpdateMbNeighbourInfoForNextSlice (SSliceCtx* pSliceCtx,
+                                        SMB* pMbList,
+                                        const int32_t kiNextSliceFirstMbIdx,
+                                        const int32_t kiLastMbIdxInPartition);
+void AddSliceBoundary (sWelsEncCtx* pEncCtx, SSlice* pCurSlice, SSliceCtx* pSliceCtx, SMB* pCurMb,
+                       int32_t iNextSliceFirstMbIdx, const int32_t kiLastMbIdxInPartition);
+void WelsMdInterMbLoop (sWelsEncCtx* pEncCtx, SSlice* pSlice, void* pMd,
+                        const int32_t kiSliceFirstMbXY);	// for inter non-dynamic slice
+void WelsMdInterMbLoopOverDynamicSlice (sWelsEncCtx* pEncCtx, SSlice* pSlice, void* pMd,
+                                        const int32_t kiSliceFirstMbXY);	// for inter dynamic slice
 
 
-BOOL_T DynSlcJudgeSliceBoundaryStepBack(void *pEncCtx, void *pSlice, SSliceCtx *pSliceCtx, SMB* pCurMb, SDynamicSlicingStack* pDss );
+BOOL_T DynSlcJudgeSliceBoundaryStepBack (void* pEncCtx, void* pSlice, SSliceCtx* pSliceCtx, SMB* pCurMb,
+    SDynamicSlicingStack* pDss);
 }
 #endif //SVC_ENCODE_SLICE_H__
--- a/codec/encoder/core/inc/svc_mode_decision.h
+++ b/codec/encoder/core/inc/svc_mode_decision.h
@@ -51,11 +51,12 @@
 ///////////////////////
 
 // NOILP ILFMD ENTRANCE
-void WelsMdSpatialelInterMbIlfmdNoilp( sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice *pSlice, SMB* pCurMb, const Mb_Type kuiRefMbType);
-void WelsMdInterMbEnhancelayer( void* pEnc, void* pMd, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache );
+void WelsMdSpatialelInterMbIlfmdNoilp (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice* pSlice, SMB* pCurMb,
+                                       const Mb_Type kuiRefMbType);
+void WelsMdInterMbEnhancelayer (void* pEnc, void* pMd, SSlice* pSlice, SMB* pCurMb, SMbCache* pMbCache);
 
-SMB* GetRefMb( SDqLayer *pCurLayer, SMB *pCurMb );
-void SetMvBaseEnhancelayer( SWelsMD* pMd, SMB *pCurMb, const SMB *kpRefMb );
+SMB* GetRefMb (SDqLayer* pCurLayer, SMB* pCurMb);
+void SetMvBaseEnhancelayer (SWelsMD* pMd, SMB* pCurMb, const SMB* kpRefMb);
 }
 #endif //SVC_MODE_DECISION_H
 
--- a/codec/encoder/core/inc/svc_motion_estimate.h
+++ b/codec/encoder/core/inc/svc_motion_estimate.h
@@ -49,28 +49,28 @@
 #define	ITERATIVE_TIMES	(16)
 #define	BASE_MV_MB_NMB	((2*(MV_RANGE+ITERATIVE_TIMES)/MB_WIDTH_LUMA)-1)
 
-union SadPredISatdUnit{
-	uint16_t	uiSadPred;
-	uint16_t	uiSatd;    //reuse the sad_pred as a temp satd pData 
+union SadPredISatdUnit {
+uint16_t	uiSadPred;
+uint16_t	uiSatd;    //reuse the sad_pred as a temp satd pData
 };
 typedef struct TagWelsME {
-    /* input */
-	uint16_t					*pMvdCost;
-    union SadPredISatdUnit	uSadPredISatd; //reuse the sad_pred as a temp pData
-	uint16_t					uiSadCost;  //used by ME and RC 
-    uint16_t					uiSatdCost; /* satd + lm * nbits */
-    uint8_t						uiPixel;   /* PIXEL_WxH */
-    uint8_t						uiReserved;
-	
-    uint8_t						*pEncMb;
-    uint8_t						*pRefMb;
+/* input */
+uint16_t*					pMvdCost;
+union SadPredISatdUnit	uSadPredISatd; //reuse the sad_pred as a temp pData
+uint16_t					uiSadCost;  //used by ME and RC
+uint16_t					uiSatdCost; /* satd + lm * nbits */
+uint8_t						uiPixel;   /* PIXEL_WxH */
+uint8_t						uiReserved;
 
-	SMVUnitXY					sMvp;
-	SMVUnitXY					sMvBase;
-	/* output */
-    SMVUnitXY					sMv;
-}SWelsME;
+uint8_t*						pEncMb;
+uint8_t*						pRefMb;
 
+SMVUnitXY					sMvp;
+SMVUnitXY					sMvBase;
+/* output */
+SMVUnitXY					sMv;
+} SWelsME;
+
 #define  COST_MVD(table, mx, my)  (table[mx] + table[my])
 
 
@@ -83,9 +83,9 @@
  *
  * \return	NONE
  */
-void WelsMotionEstimateSearchSatd(SWelsFuncPtrList *pFuncList, void* pLplayer, void* pLpme, void* pLpslice );
+void WelsMotionEstimateSearchSatd (SWelsFuncPtrList* pFuncList, void* pLplayer, void* pLpme, void* pLpslice);
 
-void WelsMotionEstimateSearchSad(SWelsFuncPtrList *pFuncList, void* pLplayer, void* pLpme, void* pLpslice );
+void WelsMotionEstimateSearchSad (SWelsFuncPtrList* pFuncList, void* pLplayer, void* pLpme, void* pLpslice);
 
 
 
@@ -112,7 +112,8 @@
  * \return	NONE
  */
 
-void WelsMotionEstimateInitialPoint(SWelsFuncPtrList *pFuncList, SWelsME *pMe, SSlice *pSlice, const int32_t kiStrideEnc, const int32_t kiStrideRef );
+void WelsMotionEstimateInitialPoint (SWelsFuncPtrList* pFuncList, SWelsME* pMe, SSlice* pSlice,
+                                     const int32_t kiStrideEnc, const int32_t kiStrideRef);
 
 /*!
  * \brief	mb iterative motion estimate search
@@ -123,9 +124,11 @@
  *
  * \return	NONE
  */
-void WelsMotionEstimateIterativeSearch( SWelsFuncPtrList *pFuncList, SWelsME *pMe, const int32_t kiStrideEnc, const int32_t kiStrideRef, uint8_t *pRef );
+void WelsMotionEstimateIterativeSearch (SWelsFuncPtrList* pFuncList, SWelsME* pMe, const int32_t kiStrideEnc,
+                                        const int32_t kiStrideRef, uint8_t* pRef);
 
-bool_t WelsMeSadCostSelect( int32_t *pSadCost, const uint16_t *kpMvdCost, int32_t *pBestCost, const int32_t kiDx, const int32_t kiDy, int32_t *pIx, int32_t *pIy);
+bool_t WelsMeSadCostSelect (int32_t* pSadCost, const uint16_t* kpMvdCost, int32_t* pBestCost, const int32_t kiDx,
+                            const int32_t kiDy, int32_t* pIx, int32_t* pIy);
 
 }
 #endif
--- a/codec/encoder/core/inc/svc_set_mb_syn_cavlc.h
+++ b/codec/encoder/core/inc/svc_set_mb_syn_cavlc.h
@@ -33,11 +33,11 @@
  *
  * \brief	Seting all syntax elements of mb and decoding residual with cavlc
  *
- * \date	2009.8.12 Created 
+ * \date	2009.8.12 Created
  *
  *************************************************************************************
  */
- 
+
 #ifndef SVC_SET_MB_SYN_CAVLC_H_
 #define SVC_SET_MB_SYN_CAVLC_H_
 
@@ -51,15 +51,15 @@
 namespace WelsSVCEnc {
 //#pragma pack(1)
 
-void WelsWriteMbResidual( SMbCache* sMbCacheInfo, SMB *pCurMb, SBitStringAux *pBs );
+void WelsWriteMbResidual (SMbCache* sMbCacheInfo, SMB* pCurMb, SBitStringAux* pBs);
 
 //for Enhance Layer CAVLC writing
-void WelsSpatialWriteSubMbPred( sWelsEncCtx *pEncCtx, SSlice *pSlice, SMB *pCurMb );
+void WelsSpatialWriteSubMbPred (sWelsEncCtx* pEncCtx, SSlice* pSlice, SMB* pCurMb);
 
-void WelsSpatialWriteMbPred( sWelsEncCtx *pEncCtx, SSlice *pSlice, SMB *pCurMb );
+void WelsSpatialWriteMbPred (sWelsEncCtx* pEncCtx, SSlice* pSlice, SMB* pCurMb);
 
 //for Base Layer CAVLC writing
-void WelsSpatialWriteMbSyn( sWelsEncCtx *pEncCtx, SSlice *pSlice, SMB *pCurMb );
+void WelsSpatialWriteMbSyn (sWelsEncCtx* pEncCtx, SSlice* pSlice, SMB* pCurMb);
 
 //#pragma pack()
 }
--- a/codec/encoder/core/inc/trace.h
+++ b/codec/encoder/core/inc/trace.h
@@ -60,9 +60,9 @@
 			fflush(fp);\
 			}
 
-void TraceName(FILE *pFp, int8_t *pName, SBitStringAux *pBs);
+void TraceName (FILE* pFp, int8_t* pName, SBitStringAux* pBs);
 
-void TraceBits(FILE *pFp, uint32_t uiStart, uint32_t uiEnd, SBitStringAux *pBs);
+void TraceBits (FILE* pFp, uint32_t uiStart, uint32_t uiEnd, SBitStringAux* pBs);
 
 
 #endif
--- a/codec/encoder/core/inc/typedefs.h
+++ b/codec/encoder/core/inc/typedefs.h
@@ -47,7 +47,7 @@
 
 #else
 
-// FIXME:     all singed type should be declared explicit,  for example,  int8_t should be declared as signed char.  
+// FIXME:     all singed type should be declared explicit,  for example,  int8_t should be declared as signed char.
 typedef signed char      int8_t  ;
 typedef unsigned char    uint8_t ;
 typedef short            int16_t ;
@@ -59,7 +59,7 @@
 
 #endif // _MSC_VER defined
 
-// FIXME:     all string type should be declared explicit as char. 
+// FIXME:     all string type should be declared explicit as char.
 typedef char      str_t;
 typedef float     real32_t;
 
--- a/codec/encoder/core/inc/utils.h
+++ b/codec/encoder/core/inc/utils.h
@@ -50,15 +50,15 @@
  */
 
 typedef int32_t	iWelsLogLevel;
-enum{
-	WELS_LOG_QUIET		= 0x00,		// Quiet mode
-	WELS_LOG_ERROR		= 1 << 0,	// Error log iLevel
-	WELS_LOG_WARNING	= 1 << 1,	// Warning log iLevel
-	WELS_LOG_INFO		= 1 << 2,	// Information log iLevel
-	WELS_LOG_DEBUG		= 1 << 3,	// Debug log iLevel
-	WELS_LOG_RESV		= 1 << 4,	// Resversed log iLevel
-	WELS_LOG_LEVEL_COUNT= 5,
-	WELS_LOG_DEFAULT	= WELS_LOG_ERROR | WELS_LOG_WARNING | WELS_LOG_INFO | WELS_LOG_DEBUG	// Default log iLevel in Wels codec
+enum {
+WELS_LOG_QUIET		= 0x00,		// Quiet mode
+WELS_LOG_ERROR		= 1 << 0,	// Error log iLevel
+WELS_LOG_WARNING	= 1 << 1,	// Warning log iLevel
+WELS_LOG_INFO		= 1 << 2,	// Information log iLevel
+WELS_LOG_DEBUG		= 1 << 3,	// Debug log iLevel
+WELS_LOG_RESV		= 1 << 4,	// Resversed log iLevel
+WELS_LOG_LEVEL_COUNT = 5,
+WELS_LOG_DEFAULT	= WELS_LOG_ERROR | WELS_LOG_WARNING | WELS_LOG_INFO | WELS_LOG_DEBUG	// Default log iLevel in Wels codec
 };
 
 /*
@@ -65,27 +65,28 @@
  *	Function pointer declaration for various tool sets
  */
 // wels log output
-typedef void (*PWelsLogCallbackFunc)(void *pCtx, const int32_t iLevel, const str_t *kpFmt, va_list argv);
+typedef void (*PWelsLogCallbackFunc) (void* pCtx, const int32_t iLevel, const str_t* kpFmt, va_list argv);
 
 // wels psnr calc
-typedef real32_t (*PWelsPsnrFunc)(	const void *kpTarPic,
-										const int32_t kiTarStride,
-										const void *kpRefPic,
-										const int32_t kiRefStride,
-										const int32_t kiWidth,
-										const int32_t kiHeight	);
+typedef real32_t (*PWelsPsnrFunc) (const void* kpTarPic,
+                                   const int32_t kiTarStride,
+                                   const void* kpRefPic,
+                                   const int32_t kiRefStride,
+                                   const int32_t kiWidth,
+                                   const int32_t kiHeight);
 
 extern PWelsLogCallbackFunc	wlog;
 
 #ifdef __GNUC__
-extern void WelsLog(void *pCtx, int32_t iLevel, const str_t *kpFmt, ...) __attribute__ ((__format__ (__printf__, 3, 4)));
+extern void WelsLog (void* pCtx, int32_t iLevel, const str_t* kpFmt, ...) __attribute__ ((__format__ (__printf__, 3,
+    4)));
 #else
-extern void WelsLog(void *pCtx, int32_t iLevel, const str_t *kpFmt, ...);
+extern void WelsLog (void* pCtx, int32_t iLevel, const str_t* kpFmt, ...);
 #endif
 
-extern const str_t *g_sWelsLogTags[];
+extern const str_t* g_sWelsLogTags[];
 
-/*! 
+/*!
  *************************************************************************************
  * \brief	System trace log output in Wels
  *
@@ -99,15 +100,15 @@
  * \note	N/A
  *************************************************************************************
  */
-void WelsLogDefault( void *pCtx, const int32_t kiLevel, const str_t *kpFmtStr, va_list argv );
-void WelsLogNil( void *pCtx, const int32_t kiLevel, const str_t *kpFmtStr, va_list argv );
+void WelsLogDefault (void* pCtx, const int32_t kiLevel, const str_t* kpFmtStr, va_list argv);
+void WelsLogNil (void* pCtx, const int32_t kiLevel, const str_t* kpFmtStr, va_list argv);
 
 
-/*! 
+/*!
  *************************************************************************************
  * \brief	set log iLevel from external call
  *
- * \param	iLevel	iLevel of log 
+ * \param	iLevel	iLevel of log
  *
  * \return	NONE
  *
@@ -114,9 +115,9 @@
  * \note	can be able to control log iLevel dynamically
  *************************************************************************************
  */
-void WelsSetLogLevel( const int32_t kiLevel );
+void WelsSetLogLevel (const int32_t kiLevel);
 
-/*! 
+/*!
  *************************************************************************************
  * \brief	get log iLevel from external call
  *
@@ -127,9 +128,9 @@
  * \note	can be able to get log iLevel of internal codec applicable
  *************************************************************************************
  */
-int32_t WelsGetLogLevel( void );
+int32_t WelsGetLogLevel (void);
 
-/*! 
+/*!
  *************************************************************************************
  * \brief	set log callback from external call
  *
@@ -140,9 +141,9 @@
  * \note	N/A
  *************************************************************************************
  */
-void WelsSetLogCallback( PWelsLogCallbackFunc _log );
+void WelsSetLogCallback (PWelsLogCallbackFunc _log);
 
-/*! 
+/*!
 *************************************************************************************
 * \brief	reopen log file when finish setting current path
 *
@@ -154,12 +155,12 @@
 * \note	N/A
 *************************************************************************************
 */
-void WelsReopenTraceFile( void *pCtx, str_t *pCurPath );
+void WelsReopenTraceFile (void* pCtx, str_t* pCurPath);
 
 /*
  *	PSNR calculation routines
  */
-/*! 
+/*!
  *************************************************************************************
  * \brief	PSNR calculation utilization in Wels
  *
@@ -175,12 +176,12 @@
  * \note	N/A
  *************************************************************************************
  */
-real32_t WelsCalcPsnr(	const void *kpTarPic,
-							const int32_t kiTarStride,
-							const void *kpRefPic,
-							const int32_t kiRefStride,
-							const int32_t kiWidth,
-							const int32_t kiHeight );
+real32_t WelsCalcPsnr (const void* kpTarPic,
+                       const int32_t kiTarStride,
+                       const void* kpRefPic,
+                       const int32_t kiRefStride,
+                       const int32_t kiWidth,
+                       const int32_t kiHeight);
 
 }
 #endif//WELS_UTILS_H__
--- a/codec/encoder/core/inc/vlc_encoder.h
+++ b/codec/encoder/core/inc/vlc_encoder.h
@@ -52,46 +52,42 @@
 extern const uint8_t g_kuiVlcTotalZerosChromaDc422[8][8][2];
 //g_kuiVlcRunBefore[zeros-left][run-before][0--value, 1--bit count]
 extern const uint8_t g_kuiVlcRunBefore[8][15][2];
-extern const ALIGNED_DECLARE(uint8_t, g_kuiEncNcMapTable[18], 16);
+extern const ALIGNED_DECLARE (uint8_t, g_kuiEncNcMapTable[18], 16);
 
 #define    CHROMA_DC_NC_OFFSET       17
 
-static inline int32_t WriteTotalCoeffTrailingones( SBitStringAux *pBs, uint8_t uiNc, uint8_t uiTotalCoeff, uint8_t uiTrailingOnes )
-{
-	const uint8_t kuiNcIdx		= g_kuiEncNcMapTable[uiNc];
-	const uint8_t *kpCoeffToken	= &g_kuiVlcCoeffToken[kuiNcIdx][uiTotalCoeff][uiTrailingOnes][0];	
-	return BsWriteBits( pBs,  kpCoeffToken[1], kpCoeffToken[0] );	
+static inline int32_t WriteTotalCoeffTrailingones (SBitStringAux* pBs, uint8_t uiNc, uint8_t uiTotalCoeff,
+    uint8_t uiTrailingOnes) {
+const uint8_t kuiNcIdx		= g_kuiEncNcMapTable[uiNc];
+const uint8_t* kpCoeffToken	= &g_kuiVlcCoeffToken[kuiNcIdx][uiTotalCoeff][uiTrailingOnes][0];
+return BsWriteBits (pBs,  kpCoeffToken[1], kpCoeffToken[0]);
 }
 
-static inline int32_t WriteTotalcoeffTrailingonesChroma( SBitStringAux *pBs, uint8_t uiTotalCoeff, uint8_t uiTrailingOnes )
-{
-	const uint8_t *kpCoeffToken	= &g_kuiVlcCoeffToken[4][uiTotalCoeff][uiTrailingOnes][0];
-	return BsWriteBits( pBs, kpCoeffToken[1], kpCoeffToken[0] );	
+static inline int32_t WriteTotalcoeffTrailingonesChroma (SBitStringAux* pBs, uint8_t uiTotalCoeff,
+    uint8_t uiTrailingOnes) {
+const uint8_t* kpCoeffToken	= &g_kuiVlcCoeffToken[4][uiTotalCoeff][uiTrailingOnes][0];
+return BsWriteBits (pBs, kpCoeffToken[1], kpCoeffToken[0]);
 }
 
 //kuiZeroCount = level_prefix;
-static inline int32_t WriteLevelPrefix( SBitStringAux *pBs, const uint32_t kuiZeroCount )
-{	
-	BsWriteBits(pBs, kuiZeroCount+1, 1);
-	return 0;
+static inline int32_t WriteLevelPrefix (SBitStringAux* pBs, const uint32_t kuiZeroCount) {
+BsWriteBits (pBs, kuiZeroCount + 1, 1);
+return 0;
 }
 
-static inline int32_t WriteTotalZeros( SBitStringAux *pBs, uint32_t uiTotalCoeff, uint32_t uiTotalZeros )
-{
-	const uint8_t *kpTotalZeros	= &g_kuiVlcTotalZeros[uiTotalCoeff][uiTotalZeros][0];
-	return BsWriteBits( pBs, kpTotalZeros[1], kpTotalZeros[0] );	
+static inline int32_t WriteTotalZeros (SBitStringAux* pBs, uint32_t uiTotalCoeff, uint32_t uiTotalZeros) {
+const uint8_t* kpTotalZeros	= &g_kuiVlcTotalZeros[uiTotalCoeff][uiTotalZeros][0];
+return BsWriteBits (pBs, kpTotalZeros[1], kpTotalZeros[0]);
 }
 
-static inline int32_t WriteTotalZerosChromaDc( SBitStringAux *pBs, uint32_t uiTotalCoeff, uint32_t uiTotalZeros )
-{
-	const uint8_t *kpTotalZerosChromaDc = &g_kuiVlcTotalZerosChromaDc[uiTotalCoeff][uiTotalZeros][0];
-	return BsWriteBits( pBs, kpTotalZerosChromaDc[1], kpTotalZerosChromaDc[0] );	
+static inline int32_t WriteTotalZerosChromaDc (SBitStringAux* pBs, uint32_t uiTotalCoeff, uint32_t uiTotalZeros) {
+const uint8_t* kpTotalZerosChromaDc = &g_kuiVlcTotalZerosChromaDc[uiTotalCoeff][uiTotalZeros][0];
+return BsWriteBits (pBs, kpTotalZerosChromaDc[1], kpTotalZerosChromaDc[0]);
 }
 
-static inline int32_t WriteRunBefore( SBitStringAux *pBs, uint8_t uiZeroLeft, uint8_t uiRunBefore )
-{
-	const uint8_t *kpRunBefore = &g_kuiVlcRunBefore[uiZeroLeft][uiRunBefore][0];
-	return BsWriteBits( pBs, kpRunBefore[1], kpRunBefore[0] );
+static inline int32_t WriteRunBefore (SBitStringAux* pBs, uint8_t uiZeroLeft, uint8_t uiRunBefore) {
+const uint8_t* kpRunBefore = &g_kuiVlcRunBefore[uiZeroLeft][uiRunBefore][0];
+return BsWriteBits (pBs, kpRunBefore[1], kpRunBefore[0]);
 }
 }
 #endif
--- a/codec/encoder/core/inc/wels_common_basis.h
+++ b/codec/encoder/core/inc/wels_common_basis.h
@@ -37,68 +37,67 @@
 #include "typedefs.h"
 #include "macros.h"
 
-#include "wels_const.h" 
+#include "wels_const.h"
 
 
 namespace WelsSVCEnc {
 /*common use table*/
 
-extern const  ALIGNED_DECLARE(uint16_t, g_kuiDequantCoeff[52][8], 16);
+extern const  ALIGNED_DECLARE (uint16_t, g_kuiDequantCoeff[52][8], 16);
 extern const uint8_t g_kuiChromaQpTable[52];
 
 /* Profile IDC */
 
-enum EProfileIdc{
-	PRO_BASELINE	= 66,
-	PRO_MAIN		= 77,
-	PRO_EXTENDED	= 88,
-	PRO_HIGH		= 100,
-	PRO_HIGH10		= 110,
-	PRO_HIGH422		= 122,
-	PRO_HIGH444		= 144,
-	PRO_CAVLC444	= 244,
-	
-	PRO_SCALABLE_BASELINE	= 83,
-	PRO_SCALABLE_HIGH		= 86,
+enum EProfileIdc {
+PRO_BASELINE	= 66,
+PRO_MAIN		= 77,
+PRO_EXTENDED	= 88,
+PRO_HIGH		= 100,
+PRO_HIGH10		= 110,
+PRO_HIGH422		= 122,
+PRO_HIGH444		= 144,
+PRO_CAVLC444	= 244,
+
+PRO_SCALABLE_BASELINE	= 83,
+PRO_SCALABLE_HIGH		= 86,
 };
 
 /*
  *	NAL Unit Type (5 Bits)
  */
-enum EWelsNalUnitType
-{
-	NAL_UNIT_UNSPEC_0			= 0,
-	NAL_UNIT_CODED_SLICE		= 1,
-	NAL_UNIT_CODED_SLICE_DPA	= 2,
-	NAL_UNIT_CODED_SLICE_DPB	= 3,
-	NAL_UNIT_CODED_SLICE_DPC	= 4,
-	NAL_UNIT_CODED_SLICE_IDR	= 5,
-	NAL_UNIT_SEI				= 6,
-	NAL_UNIT_SPS				= 7,
-	NAL_UNIT_PPS				= 8,
-	NAL_UNIT_AU_DELIMITER		= 9,
-	NAL_UNIT_END_OF_SEQ			= 10,
-	NAL_UNIT_END_OF_STR			= 11,
-	NAL_UNIT_FILLER_DATA		= 12,
-	NAL_UNIT_SPS_EXT			= 13,
-	NAL_UNIT_PREFIX				= 14,
-	NAL_UNIT_SUBSET_SPS			= 15,
-	NAL_UNIT_RESV_16			= 16,
-	NAL_UNIT_RESV_17			= 17,
-	NAL_UNIT_RESV_18			= 18,
-	NAL_UNIT_AUX_CODED_SLICE	= 19,
-	NAL_UNIT_CODED_SLICE_EXT	= 20,
-	NAL_UNIT_RESV_21			= 21,
-	NAL_UNIT_RESV_22			= 22,
-	NAL_UNIT_RESV_23			= 23,
-	NAL_UNIT_UNSPEC_24			= 24,
-	NAL_UNIT_UNSPEC_25			= 25,
-	NAL_UNIT_UNSPEC_26			= 26,
-	NAL_UNIT_UNSPEC_27			= 27,
-	NAL_UNIT_UNSPEC_28			= 28,
-	NAL_UNIT_UNSPEC_29			= 29,
-	NAL_UNIT_UNSPEC_30			= 30,
-	NAL_UNIT_UNSPEC_31			= 31
+enum EWelsNalUnitType {
+NAL_UNIT_UNSPEC_0			= 0,
+NAL_UNIT_CODED_SLICE		= 1,
+NAL_UNIT_CODED_SLICE_DPA	= 2,
+NAL_UNIT_CODED_SLICE_DPB	= 3,
+NAL_UNIT_CODED_SLICE_DPC	= 4,
+NAL_UNIT_CODED_SLICE_IDR	= 5,
+NAL_UNIT_SEI				= 6,
+NAL_UNIT_SPS				= 7,
+NAL_UNIT_PPS				= 8,
+NAL_UNIT_AU_DELIMITER		= 9,
+NAL_UNIT_END_OF_SEQ			= 10,
+NAL_UNIT_END_OF_STR			= 11,
+NAL_UNIT_FILLER_DATA		= 12,
+NAL_UNIT_SPS_EXT			= 13,
+NAL_UNIT_PREFIX				= 14,
+NAL_UNIT_SUBSET_SPS			= 15,
+NAL_UNIT_RESV_16			= 16,
+NAL_UNIT_RESV_17			= 17,
+NAL_UNIT_RESV_18			= 18,
+NAL_UNIT_AUX_CODED_SLICE	= 19,
+NAL_UNIT_CODED_SLICE_EXT	= 20,
+NAL_UNIT_RESV_21			= 21,
+NAL_UNIT_RESV_22			= 22,
+NAL_UNIT_RESV_23			= 23,
+NAL_UNIT_UNSPEC_24			= 24,
+NAL_UNIT_UNSPEC_25			= 25,
+NAL_UNIT_UNSPEC_26			= 26,
+NAL_UNIT_UNSPEC_27			= 27,
+NAL_UNIT_UNSPEC_28			= 28,
+NAL_UNIT_UNSPEC_29			= 29,
+NAL_UNIT_UNSPEC_30			= 30,
+NAL_UNIT_UNSPEC_31			= 31
 };
 
 /*
@@ -105,22 +104,21 @@
  *	NAL Reference IDC (2 Bits)
  */
 
-enum EWelsNalRefIdc
-{
-	NRI_PRI_LOWEST	= 0,
-	NRI_PRI_LOW		= 1,
-	NRI_PRI_HIGH	= 2,
-	NRI_PRI_HIGHEST	= 3
+enum EWelsNalRefIdc {
+NRI_PRI_LOWEST	= 0,
+NRI_PRI_LOW		= 1,
+NRI_PRI_HIGH	= 2,
+NRI_PRI_HIGHEST	= 3
 };
 
 /*
- * VCL TYPE	
+ * VCL TYPE
  */
 
-enum EVclType{
-	NON_VCL			= 0,
-	VCL				= 1,
-	NOT_APP			= 2
+enum EVclType {
+NON_VCL			= 0,
+VCL				= 1,
+NOT_APP			= 2
 };
 
 /*
@@ -142,13 +140,13 @@
 /*
  *	Frame types used in internal encoder (logic level based)
  */
-enum EFrameType{
-	WELS_FRAME_TYPE_AUTO	= 0x0000,	/* Let encoder engine choose the proper type, RDO or scene change based */
-	WELS_FRAME_TYPE_IDR		= 0x0001,	/* IDR, I frame with parameter sets */
-	WELS_FRAME_TYPE_I		= 0x0002,	/* I Frame */
-	WELS_FRAME_TYPE_P		= 0x0003,	/* P Frame */
-	WELS_FRAME_TYPE_B		= 0x0004,	/* B Frame */
-	WELS_FRAME_TYPE_SKIP	= 0x0008
+enum EFrameType {
+WELS_FRAME_TYPE_AUTO	= 0x0000,	/* Let encoder engine choose the proper type, RDO or scene change based */
+WELS_FRAME_TYPE_IDR		= 0x0001,	/* IDR, I frame with parameter sets */
+WELS_FRAME_TYPE_I		= 0x0002,	/* I Frame */
+WELS_FRAME_TYPE_P		= 0x0003,	/* P Frame */
+WELS_FRAME_TYPE_B		= 0x0004,	/* B Frame */
+WELS_FRAME_TYPE_SKIP	= 0x0008
 };
 
 /* Base SSlice Types
@@ -157,113 +155,113 @@
  * meaning mapped version after eSliceType minus 4.
  */
 
-enum EWelsSliceType
-{
-	P_SLICE	= 0,
-	B_SLICE	= 1,
-	I_SLICE	= 2,
-	SP_SLICE= 3,
-	SI_SLICE= 4,
-	UNKNOWN_SLICE= 5
+enum EWelsSliceType {
+P_SLICE	= 0,
+B_SLICE	= 1,
+I_SLICE	= 2,
+SP_SLICE = 3,
+SI_SLICE = 4,
+UNKNOWN_SLICE = 5
 };
 
 /* SSlice Types in scalable extension */		;
-enum ESliceTypeExt{
-	EP_SLICE = 0,	// EP_SLICE: 0, 5
-	EB_SLICE = 1,	// EB_SLICE: 1, 6
-	EI_SLICE = 2	// EI_SLICE: 2, 7
+enum ESliceTypeExt {
+EP_SLICE = 0,	// EP_SLICE: 0, 5
+EB_SLICE = 1,	// EB_SLICE: 1, 6
+EI_SLICE = 2	// EI_SLICE: 2, 7
 };
 
 /* List Index */
-enum EListIndex{
-	LIST_0	= 0,
-	LIST_1	= 1,
-	LIST_A	= 2
+enum EListIndex {
+LIST_0	= 0,
+LIST_1	= 1,
+LIST_A	= 2
 };
 
 
-struct SMVUnitXY{			// each 4 Bytes
-    int16_t		iMvX;
-    int16_t		iMvY;
-public:	
-	SMVUnitXY& sDeltaMv ( const SMVUnitXY& _v0, const SMVUnitXY& _v1 )
-	{
-		iMvX = _v0.iMvX - _v1.iMvX;
-		iMvY = _v0.iMvY - _v1.iMvY;
-		return (*this);
-	}
+struct SMVUnitXY {			// each 4 Bytes
+int16_t		iMvX;
+int16_t		iMvY;
+ public:
+SMVUnitXY& sDeltaMv (const SMVUnitXY& _v0, const SMVUnitXY& _v1) {
+  iMvX = _v0.iMvX - _v1.iMvX;
+  iMvY = _v0.iMvY - _v1.iMvY;
+  return (*this);
+}
 };
 
-typedef struct TagMVComponentUnit{		// each 	LIST_0/LIST_1
-	SMVUnitXY	sMotionVectorCache[5*6-1];			// Luma only: 5 x 6 - 1 = 29 D-Words
-	int8_t		iRefIndexCache[5 * 6];			// Luma only: 5 x 6 = 30 bytes
-}SMVComponentUnit, *PMVComponentUnit;
+typedef struct TagMVComponentUnit {		// each 	LIST_0/LIST_1
+SMVUnitXY	sMotionVectorCache[5 * 6 - 1];			// Luma only: 5 x 6 - 1 = 29 D-Words
+int8_t		iRefIndexCache[5 * 6];			// Luma only: 5 x 6 = 30 bytes
+} SMVComponentUnit, *PMVComponentUnit;
 
 
-typedef struct TagParaSetOffsetVariable{	
-	int32_t 	iParaSetIdDelta[MAX_DQ_LAYER_NUM/*+1*/];	//mark delta between SPS_ID_in_bs and sps_id_in_encoder, can be minus, for each dq-layer
-															//need not extra +1 due no MGS and FMO case so far
-	bool_t		bUsedParaSetIdInBs[MAX_PPS_COUNT];	//mark the used SPS_ID with 1
-	uint32_t	uiNextParaSetIdToUseInBs;					//mark the next SPS_ID_in_bs, for all layers
-}SParaSetOffsetVariable;
+typedef struct TagParaSetOffsetVariable {
+int32_t 	iParaSetIdDelta[MAX_DQ_LAYER_NUM/*+1*/];	//mark delta between SPS_ID_in_bs and sps_id_in_encoder, can be minus, for each dq-layer
+//need not extra +1 due no MGS and FMO case so far
+bool_t		bUsedParaSetIdInBs[MAX_PPS_COUNT];	//mark the used SPS_ID with 1
+uint32_t	uiNextParaSetIdToUseInBs;					//mark the next SPS_ID_in_bs, for all layers
+} SParaSetOffsetVariable;
 
-typedef struct TagParaSetOffset{
-	//in PS0 design, "sParaSetOffsetVariable" record the previous paras before current IDR, AND NEED to be stacked and recover across IDR
-	SParaSetOffsetVariable   sParaSetOffsetVariable[PARA_SET_TYPE]; //PARA_SET_TYPE=3; paraset_type = 0: AVC_SPS; =1: Subset_SPS; =2: PPS	
-	//in PSO design, "bPpsIdMappingIntoSubsetsps" uses the current para of current IDR period
-	bool_t                  bPpsIdMappingIntoSubsetsps[MAX_DQ_LAYER_NUM/*+1*/];	// need not extra +1 due no MGS and FMO case so far
-	uint16_t	            uiIdrPicId;		// IDR picture id: [0, 65535], this one is used for LTR!! Can we just NOT put this into the SParaSetOffset structure?!!
-#if _DEBUG 
-	bool_t                  bEnableSpsPpsIdAddition;
+typedef struct TagParaSetOffset {
+//in PS0 design, "sParaSetOffsetVariable" record the previous paras before current IDR, AND NEED to be stacked and recover across IDR
+SParaSetOffsetVariable
+sParaSetOffsetVariable[PARA_SET_TYPE]; //PARA_SET_TYPE=3; paraset_type = 0: AVC_SPS; =1: Subset_SPS; =2: PPS
+//in PSO design, "bPpsIdMappingIntoSubsetsps" uses the current para of current IDR period
+bool_t
+bPpsIdMappingIntoSubsetsps[MAX_DQ_LAYER_NUM/*+1*/];	// need not extra +1 due no MGS and FMO case so far
+uint16_t
+uiIdrPicId;		// IDR picture id: [0, 65535], this one is used for LTR!! Can we just NOT put this into the SParaSetOffset structure?!!
+#if _DEBUG
+bool_t                  bEnableSpsPpsIdAddition;
 #endif
-}SParaSetOffset;
+} SParaSetOffset;
 
 
 
 /* Motion Vector components */
-enum EMvComp{
-	MV_X	= 0,
-	MV_Y	= 1,
-	MV_A	= 2
+enum EMvComp {
+MV_X	= 0,
+MV_Y	= 1,
+MV_A	= 2
 };
 
 /* Chroma Components */
 
-enum EChromaComp{
-	CHROMA_CB	= 0,
-	CHROMA_CR	= 1,
-	CHROMA_A	= 2
+enum EChromaComp {
+CHROMA_CB	= 0,
+CHROMA_CR	= 1,
+CHROMA_A	= 2
 };
 
 /* Position Offset structure */
-typedef struct TagCropOffset{
-	int16_t	iCropLeft;
-    int16_t	iCropRight;
-	int16_t	iCropTop;
-	int16_t	iCropBottom;
-}SCropOffset;
+typedef struct TagCropOffset {
+int16_t	iCropLeft;
+int16_t	iCropRight;
+int16_t	iCropTop;
+int16_t	iCropBottom;
+} SCropOffset;
 
 
 /* Transform Type */
 
-enum ETransType{
-	T_4x4	= 0,
-	T_8x8	= 1,
-	T_16x16	= 2,
-	T_PCM	= 3
+enum ETransType {
+T_4x4	= 0,
+T_8x8	= 1,
+T_16x16	= 2,
+T_PCM	= 3
 };
 
-enum EMbPosition 
-{
-    LEFT_MB_POS     = 0x01,	// A
-    TOP_MB_POS      = 0x02,	// B
-    TOPRIGHT_MB_POS = 0x04,	// C
-	TOPLEFT_MB_POS	= 0x08,	// D,
-	RIGHT_MB_POS	= 0x10,	//  add followed four case to reuse when intra up-sample
-	BOTTOM_MB_POS	= 0x20,	// 
-	BOTTOMRIGHT_MB_POS = 0x40,	// 
-	BOTTOMLEFT_MB_POS	= 0x80,	//
-	MB_POS_A  = 0x100
+enum EMbPosition {
+LEFT_MB_POS     = 0x01,	// A
+TOP_MB_POS      = 0x02,	// B
+TOPRIGHT_MB_POS = 0x04,	// C
+TOPLEFT_MB_POS	= 0x08,	// D,
+RIGHT_MB_POS	= 0x10,	//  add followed four case to reuse when intra up-sample
+BOTTOM_MB_POS	= 0x20,	//
+BOTTOMRIGHT_MB_POS = 0x40,	//
+BOTTOMLEFT_MB_POS	= 0x80,	//
+MB_POS_A  = 0x100
 };
 #define MB_ON_PIC_BOUNDRY			(RIGHT_MB_POS|BOTTOM_MB_POS|LEFT_MB_POS|TOP_MB_POS)
 
@@ -311,7 +309,7 @@
 #define SUB_TYPE_8x8			(MB_TYPE_8x8 | MB_TYPE_8x8_REF0)
 
 #define MB_TYPE_UNAVAILABLE		0xFF000000
-#define REF_NOT_AVAIL    -2   
+#define REF_NOT_AVAIL    -2
 #define REF_NOT_IN_LIST -1    //intra
 #define	REF_PIC_REORDER_DEFAULT	TRUE
 
@@ -329,14 +327,14 @@
 
 
 
-enum{
-	Intra4x4			= 0,
-	Intra16x16			= 1,
-	Inter16x16			= 2,
-	Inter16x8			= 3,
-	Inter8x16			= 4,
-	Inter8x8			= 5,
-	PSkip				= 6
+enum {
+Intra4x4			= 0,
+Intra16x16			= 1,
+Inter16x16			= 2,
+Inter16x8			= 3,
+Inter8x16			= 4,
+Inter8x8			= 5,
+PSkip				= 6
 };
 
 
@@ -343,14 +341,14 @@
 /*
  *	Memory Management Control Operation (MMCO) code
  */
-enum EMmcoCode{
-	MMCO_END			=0,
-	MMCO_SHORT2UNUSED	=1,
-	MMCO_LONG2UNUSED	=2,
-	MMCO_SHORT2LONG		=3,
-	MMCO_SET_MAX_LONG	=4,
-	MMCO_RESET			=5,
-	MMCO_LONG			=6
+enum EMmcoCode {
+MMCO_END			= 0,
+MMCO_SHORT2UNUSED	= 1,
+MMCO_LONG2UNUSED	= 2,
+MMCO_SHORT2LONG		= 3,
+MMCO_SET_MAX_LONG	= 4,
+MMCO_RESET			= 5,
+MMCO_LONG			= 6
 };
 
 /////////intra16x16  Luma
@@ -393,7 +391,7 @@
 
 #define C_PRED_DC_L      4
 #define C_PRED_DC_T      5
-#define C_PRED_DC_128    6 
-#define C_PRED_A    7 
+#define C_PRED_DC_128    6
+#define C_PRED_A    7
 }
 #endif//WELS_COMMON_BASIS_H__
--- a/codec/encoder/core/inc/wels_const.h
+++ b/codec/encoder/core/inc/wels_const.h
@@ -72,7 +72,7 @@
 #define MB_PARTITION_SIZE		4	// Macroblock partition size in 8x8 sub-blocks
 #define MB_SUB_PARTITION_SIZE	4	// Sub partition size in a 8x8 sub-block
 #define MB_BLOCK4x4_NUM				16
-#define INTRA_4x4_MODE_NUM		8	
+#define INTRA_4x4_MODE_NUM		8
 #define MB_BLOCK8x8_NUM				4
 #define MB_LUMA_CHROMA_BLOCK4x4_NUM  24
 
@@ -83,8 +83,8 @@
 #define MAX_PPS_COUNT 			(MAX_PPS_COUNT_LIMITED)//in Standard is 256	// Count number of PPS
 
 #define PARA_SET_TYPE			3 // SPS+PPS
-#define PARA_SET_TYPE_AVCSPS	0 
-#define PARA_SET_TYPE_SUBSETSPS	1 
+#define PARA_SET_TYPE_AVCSPS	0
+#define PARA_SET_TYPE_SUBSETSPS	1
 #define PARA_SET_TYPE_PPS		2
 
 #define MAX_FRAME_RATE			30	// maximal frame rate to support
@@ -174,15 +174,15 @@
 #define MAX_NAL_UNIT_NUM_IN_AU	256	// predefined maximal number of NAL Units in an access unit
 #define MAX_ACCESS_UINT_CAPACITY	(1<<20)	// Maximal AU capacity in bytes: 1024 KB predefined
 #define MAX_ACCESS_UNIT_CACHE_NUM	2	// Maximal Access Unit(AU) cache number to be processed, denote current AU and the next coming AU.
-enum{
-	CUR_AU_IDX	= 0,			// index symbol for current access unit
-	SUC_AU_IDX	= 1				// index symbol for successive access unit
+enum {
+  CUR_AU_IDX	= 0,			// index symbol for current access unit
+  SUC_AU_IDX	= 1				// index symbol for successive access unit
 };
 
 enum {
-	BASE_MB = 0,
-		AVC_REWRITE_ENHANCE_MB = 1,
-		NON_AVC_REWRITE_ENHANCE_MB =2
+  BASE_MB = 0,
+  AVC_REWRITE_ENHANCE_MB = 1,
+  NON_AVC_REWRITE_ENHANCE_MB = 2
 };
 
 #endif//WELS_CONSTANCE_H__
--- a/codec/encoder/core/inc/wels_func_ptr_def.h
+++ b/codec/encoder/core/inc/wels_func_ptr_def.h
@@ -48,177 +48,190 @@
 
 typedef struct TagWelsFuncPointerList SWelsFuncPtrList;
 
-typedef void (*PSetMemoryZero)(void *pDst, int32_t iSize);
-typedef void (*PDctFunc)( int16_t *pDct, uint8_t *pSample1, int32_t iStride1, uint8_t *pSample2, int32_t iStride2 );
+typedef void (*PSetMemoryZero) (void* pDst, int32_t iSize);
+typedef void (*PDctFunc) (int16_t* pDct, uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2);
 
-typedef void (*PCopyFunc)( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS );
-typedef void (*PIDctFunc)(uint8_t *pRec, int32_t iStride, uint8_t *pPred, int32_t iPredStride, int16_t *pRes);
-typedef void (*PDeQuantizationFunc)(int16_t *pRes, const uint16_t* kpQpTable);
-typedef void (*PDeQuantizationHadamardFunc)(int16_t *pRes, const uint16_t kuiMF);
-typedef int32_t (*PGetNoneZeroCountFunc)(int16_t *pLevel);
+typedef void (*PCopyFunc) (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
+typedef void (*PIDctFunc) (uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pRes);
+typedef void (*PDeQuantizationFunc) (int16_t* pRes, const uint16_t* kpQpTable);
+typedef void (*PDeQuantizationHadamardFunc) (int16_t* pRes, const uint16_t kuiMF);
+typedef int32_t (*PGetNoneZeroCountFunc) (int16_t* pLevel);
 
-typedef void (*PScanFunc)(int16_t* pLevel, int16_t *pDct);
-typedef int32_t (*PCalculateSingleCtrFunc)(int16_t *pDct);
+typedef void (*PScanFunc) (int16_t* pLevel, int16_t* pDct);
+typedef int32_t (*PCalculateSingleCtrFunc) (int16_t* pDct);
 
-typedef void (*PTransformHadamard4x4Func)( int16_t *pLumaDc, int16_t *pDct);
-typedef void (*PQuantizationFunc)(int16_t *pDct, int16_t* pFF,  int16_t *pMF);
-typedef void (*PQuantizationMaxFunc)(int16_t *pDct, int16_t* pFF,  int16_t *pMF, int16_t *pMax);
-typedef void (*PQuantizationDcFunc)(int16_t *pDct, int16_t iFF,  int16_t iMF);
-typedef BOOL_T (*PQuantizationSkipFunc)(int16_t *pDct, int16_t iFF,  int16_t iMF);
-typedef int32_t (*PQuantizationHadamardFunc)(int16_t *pRes, const int16_t kiFF, int16_t iMF, int16_t * pDct, int16_t * pBlock);
+typedef void (*PTransformHadamard4x4Func) (int16_t* pLumaDc, int16_t* pDct);
+typedef void (*PQuantizationFunc) (int16_t* pDct, int16_t* pFF,  int16_t* pMF);
+typedef void (*PQuantizationMaxFunc) (int16_t* pDct, int16_t* pFF,  int16_t* pMF, int16_t* pMax);
+typedef void (*PQuantizationDcFunc) (int16_t* pDct, int16_t iFF,  int16_t iMF);
+typedef BOOL_T (*PQuantizationSkipFunc) (int16_t* pDct, int16_t iFF,  int16_t iMF);
+typedef int32_t (*PQuantizationHadamardFunc) (int16_t* pRes, const int16_t kiFF, int16_t iMF, int16_t* pDct,
+    int16_t* pBlock);
 
 typedef void (*PWelsMcFunc) (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-							  SMVUnitXY mv, int32_t iWidth, int32_t iHeight);
+                             SMVUnitXY mv, int32_t iWidth, int32_t iHeight);
 
-typedef void (*PWelsLumaHalfpelMcFunc) (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, 
-                                   int32_t iWidth, int32_t iHeight);
-typedef void (*PWelsLumaQuarpelMcFunc) (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-typedef void (*PWelsSampleAveragingFunc) ( uint8_t *, int32_t, uint8_t *, int32_t, uint8_t *, int32_t, int32_t );
+typedef void (*PWelsLumaHalfpelMcFunc) (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iWidth, int32_t iHeight);
+typedef void (*PWelsLumaQuarpelMcFunc) (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iHeight);
+typedef void (*PWelsSampleAveragingFunc) (uint8_t*, int32_t, uint8_t*, int32_t, uint8_t*, int32_t, int32_t);
 
-typedef struct TagMcFunc{
-	PWelsLumaHalfpelMcFunc      pfLumaHalfpelHor;
-	PWelsLumaHalfpelMcFunc      pfLumaHalfpelVer;
-	PWelsLumaHalfpelMcFunc      pfLumaHalfpelCen;
-	PWelsMcFunc                         pfChromaMc;
+typedef struct TagMcFunc {
+  PWelsLumaHalfpelMcFunc      pfLumaHalfpelHor;
+  PWelsLumaHalfpelMcFunc      pfLumaHalfpelVer;
+  PWelsLumaHalfpelMcFunc      pfLumaHalfpelCen;
+  PWelsMcFunc                         pfChromaMc;
 
-	PWelsLumaQuarpelMcFunc     *pfLumaQuarpelMc;
-	PWelsSampleAveragingFunc   *pfSampleAveraging;
-}SMcFunc;
+  PWelsLumaQuarpelMcFunc*     pfLumaQuarpelMc;
+  PWelsSampleAveragingFunc*   pfSampleAveraging;
+} SMcFunc;
 
-typedef void (*PLumaDeblockingLT4Func)( uint8_t *iSampleY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *iTc );
-typedef void (*PLumaDeblockingEQ4Func)(  uint8_t *iSampleY, int32_t iStride, int32_t iAlpha, int32_t iBeta );
-typedef void (*PChromaDeblockingLT4Func)( uint8_t *iSampleCb, uint8_t *iSampleCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *iTc );
-typedef void (*PChromaDeblockingEQ4Func)(  uint8_t *iSampleCb, uint8_t *iSampleCr, int32_t iStride, int32_t iAlpha, int32_t iBeta  );
+typedef void (*PLumaDeblockingLT4Func) (uint8_t* iSampleY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* iTc);
+typedef void (*PLumaDeblockingEQ4Func) (uint8_t* iSampleY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+typedef void (*PChromaDeblockingLT4Func) (uint8_t* iSampleCb, uint8_t* iSampleCr, int32_t iStride, int32_t iAlpha,
+    int32_t iBeta, int8_t* iTc);
+typedef void (*PChromaDeblockingEQ4Func) (uint8_t* iSampleCb, uint8_t* iSampleCr, int32_t iStride, int32_t iAlpha,
+    int32_t iBeta);
 
 typedef struct tagDeblockingFunc {
-	PLumaDeblockingLT4Func    pfLumaDeblockingLT4Ver;
-	PLumaDeblockingEQ4Func    pfLumaDeblockingEQ4Ver;
-	PLumaDeblockingLT4Func    pfLumaDeblockingLT4Hor;
-	PLumaDeblockingEQ4Func    pfLumaDeblockingEQ4Hor;
+  PLumaDeblockingLT4Func    pfLumaDeblockingLT4Ver;
+  PLumaDeblockingEQ4Func    pfLumaDeblockingEQ4Ver;
+  PLumaDeblockingLT4Func    pfLumaDeblockingLT4Hor;
+  PLumaDeblockingEQ4Func    pfLumaDeblockingEQ4Hor;
 
-	PChromaDeblockingLT4Func  pfChromaDeblockingLT4Ver;
-	PChromaDeblockingEQ4Func  pfChromaDeblockingEQ4Ver;
-	PChromaDeblockingLT4Func  pfChromaDeblockingLT4Hor;
-	PChromaDeblockingEQ4Func  pfChromaDeblockinEQ4Hor;
+  PChromaDeblockingLT4Func  pfChromaDeblockingLT4Ver;
+  PChromaDeblockingEQ4Func  pfChromaDeblockingEQ4Ver;
+  PChromaDeblockingLT4Func  pfChromaDeblockingLT4Hor;
+  PChromaDeblockingEQ4Func  pfChromaDeblockinEQ4Hor; // NOTE(review): name lacks the 'g' of "Deblocking" (cf. pfChromaDeblockingEQ4Ver); kept as-is because callers use this spelling
 } DeblockingFunc;
 
-typedef  void (*PSetNoneZeroCountZeroFunc) (int8_t * pNonZeroCount );
+typedef  void (*PSetNoneZeroCountZeroFunc) (int8_t* pNonZeroCount);
 
-typedef int32_t (*PIntraFineMdFunc)(void* pEncCtx, void * pWelsMd, SMB* pCurMb, SMbCache *pMbCache); 
-typedef void (*PInterFineMdFunc)(void* pEncCtx, void* pWelsMd, SSlice *slice, SMB* pCurMb, int32_t bestCost );
-typedef BOOL_T (*PInterMdFirstIntraModeFunc)(void* pEncCtx, void* pWelsMd, SMB* pCurMb, SMbCache *pMbCache);
+typedef int32_t (*PIntraFineMdFunc) (void* pEncCtx, void* pWelsMd, SMB* pCurMb, SMbCache* pMbCache);
+typedef void (*PInterFineMdFunc) (void* pEncCtx, void* pWelsMd, SSlice* slice, SMB* pCurMb, int32_t bestCost);
+typedef BOOL_T (*PInterMdFirstIntraModeFunc) (void* pEncCtx, void* pWelsMd, SMB* pCurMb, SMbCache* pMbCache);
 
-typedef void (*PMotionSearchFunc) ( SWelsFuncPtrList *pFuncList, void* pCurDqLayer, void* pMe, void* pSlice );// here after reset all function pointers, will set as right parameter type
-typedef void (*PFillInterNeighborCacheFunc) (SMbCache* pMbCache, SMB* pCurMb, int32_t iMbWidth, int8_t *pVaaBgMbFlag);
-typedef void (*PAccumulateSadFunc) (uint32_t *pSumDiff, int32_t *pGomForegroundBlockNum, int32_t *iSad8x8, int8_t *pVaaBgMbFlag);//for RC
-typedef BOOL_T (*PDynamicSlicingStepBackFunc)	( void* pEncCtx, void* pSlice, SSliceCtx *pSliceCtx, SMB* pCurMb, SDynamicSlicingStack *pDynamicSlicingStack );// 2010.8.17
+typedef void (*PMotionSearchFunc) (SWelsFuncPtrList* pFuncList, void* pCurDqLayer, void* pMe,
+                                   void* pSlice);  // TODO (original note): once all function pointers have been re-set, give these void* parameters their proper types
+typedef void (*PFillInterNeighborCacheFunc) (SMbCache* pMbCache, SMB* pCurMb, int32_t iMbWidth, int8_t* pVaaBgMbFlag);
+typedef void (*PAccumulateSadFunc) (uint32_t* pSumDiff, int32_t* pGomForegroundBlockNum, int32_t* iSad8x8,
+                                    int8_t* pVaaBgMbFlag);//for RC
+typedef BOOL_T (*PDynamicSlicingStepBackFunc) (void* pEncCtx, void* pSlice, SSliceCtx* pSliceCtx, SMB* pCurMb,
+    SDynamicSlicingStack* pDynamicSlicingStack); // 2010.8.17
 
-typedef bool_t (*PInterMdBackgroundDecisionFunc) ( void* pEncCtx, void* pWelsMd, SSlice *slice, SMB* pCurMb, SMbCache *pMbCache, BOOL_T* pKeepPskip );
-typedef void (*PInterMdBackgroundInfoUpdateFunc) ( SDqLayer* pCurLayer,  SMB* pCurMb, const bool_t bFlag, const int32_t kiRefPictureType );
+typedef bool_t (*PInterMdBackgroundDecisionFunc) (void* pEncCtx, void* pWelsMd, SSlice* slice, SMB* pCurMb,
+    SMbCache* pMbCache, BOOL_T* pKeepPskip);
+typedef void (*PInterMdBackgroundInfoUpdateFunc) (SDqLayer* pCurLayer,  SMB* pCurMb, const bool_t bFlag,
+    const int32_t kiRefPictureType);
 
-typedef void (*PInterMdFunc) ( void* pEncCtx, void* pWelsMd, SSlice *slice, SMB* pCurMb, SMbCache *pMbCache );
+typedef void (*PInterMdFunc) (void* pEncCtx, void* pWelsMd, SSlice* slice, SMB* pCurMb, SMbCache* pMbCache);
 
-typedef int32_t  (*PSampleSadSatdCostFunc) ( uint8_t *, int32_t, uint8_t *, int32_t );
-typedef void (*PSample4SadCostFunc) ( uint8_t *, int32_t, uint8_t*, int32_t, int32_t* );
-typedef int32_t (*PIntraPred4x4Combined3Func)(uint8_t *, int32_t, uint8_t *, int32_t, uint8_t *, int32_t *, int32_t, int32_t, int32_t);
-typedef int32_t (*PIntraPred16x16Combined3Func)(uint8_t *, int32_t, uint8_t *, int32_t, int32_t*, int32_t, uint8_t*);
-typedef int32_t (*PIntraPred8x8Combined3Func)(uint8_t *, int32_t, uint8_t *, int32_t, int32_t*, int32_t, uint8_t*,uint8_t*,uint8_t*);
+typedef int32_t (*PSampleSadSatdCostFunc) (uint8_t*, int32_t, uint8_t*, int32_t);
+typedef void (*PSample4SadCostFunc) (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
+typedef int32_t (*PIntraPred4x4Combined3Func) (uint8_t*, int32_t, uint8_t*, int32_t, uint8_t*, int32_t*, int32_t,
+    int32_t, int32_t);
+typedef int32_t (*PIntraPred16x16Combined3Func) (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*);
+typedef int32_t (*PIntraPred8x8Combined3Func) (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*,
+    uint8_t*, uint8_t*);
 #define     MAX_BLOCK_TYPE 5 // prev 7
 typedef struct TagSampleDealingFunc {
-	PSampleSadSatdCostFunc            pfSampleSad[MAX_BLOCK_TYPE];
-	PSampleSadSatdCostFunc            pfSampleSatd[MAX_BLOCK_TYPE];
-	PSample4SadCostFunc                 pfSample4Sad[MAX_BLOCK_TYPE];
-	PIntraPred4x4Combined3Func      pfIntra4x4Combined3Satd;
-	PIntraPred16x16Combined3Func  pfIntra16x16Combined3Satd;
-	PIntraPred16x16Combined3Func  pfIntra16x16Combined3Sad;
-	PIntraPred8x8Combined3Func      pfIntra8x8Combined3Satd;
-	PIntraPred8x8Combined3Func      pfIntra8x8Combined3Sad;
+  PSampleSadSatdCostFunc            pfSampleSad[MAX_BLOCK_TYPE];
+  PSampleSadSatdCostFunc            pfSampleSatd[MAX_BLOCK_TYPE];
+  PSample4SadCostFunc                 pfSample4Sad[MAX_BLOCK_TYPE];
+  PIntraPred4x4Combined3Func      pfIntra4x4Combined3Satd;
+  PIntraPred16x16Combined3Func  pfIntra16x16Combined3Satd;
+  PIntraPred16x16Combined3Func  pfIntra16x16Combined3Sad;
+  PIntraPred8x8Combined3Func      pfIntra8x8Combined3Satd;
+  PIntraPred8x8Combined3Func      pfIntra8x8Combined3Sad;
 
-	PSampleSadSatdCostFunc            *pfMdCost;
-	PSampleSadSatdCostFunc            *pfMeCost;
-	PIntraPred16x16Combined3Func   pfIntra16x16Combined3;
-	PIntraPred8x8Combined3Func       pfIntra8x8Combined3;
-	PIntraPred4x4Combined3Func       pfIntra4x4Combined3;
+  PSampleSadSatdCostFunc*            pfMdCost;
+  PSampleSadSatdCostFunc*            pfMeCost;
+  PIntraPred16x16Combined3Func   pfIntra16x16Combined3;
+  PIntraPred8x8Combined3Func       pfIntra8x8Combined3;
+  PIntraPred4x4Combined3Func       pfIntra4x4Combined3;
 } SSampleDealingFunc;
-typedef void (*PGetIntraPredFunc )(uint8_t *pPrediction, uint8_t *pRef, const int32_t kiStride);
+typedef void (*PGetIntraPredFunc) (uint8_t* pPrediction, uint8_t* pRef, const int32_t kiStride);
 
-typedef int32_t (*PGetVarianceFromIntraVaaFunc)( uint8_t *pSampelY, const int32_t kiStride );
-typedef uint8_t (*PGetMbSignFromInterVaaFunc)( int32_t *pSad8x8 );
-typedef void (*PUpdateMbMvFunc)( SMVUnitXY *pMvUnit, const SMVUnitXY ksMv );
+typedef int32_t (*PGetVarianceFromIntraVaaFunc) (uint8_t* pSampelY, const int32_t kiStride);
+typedef uint8_t (*PGetMbSignFromInterVaaFunc) (int32_t* pSad8x8);
+typedef void (*PUpdateMbMvFunc) (SMVUnitXY* pMvUnit, const SMVUnitXY ksMv);
 
-struct TagWelsFuncPointerList
-{
-	PExpandPictureFunc			pfExpandLumaPicture;
-	PExpandPictureFunc			pfExpandChromaPicture[2];// 0: for chroma unalignment && width_uv >= 16; 1: for chroma alignment && width_uv >= 16;
-    	
-    PFillInterNeighborCacheFunc       pfFillInterNeighborCache;
+struct TagWelsFuncPointerList {
+  PExpandPictureFunc			pfExpandLumaPicture;
+  PExpandPictureFunc
+  pfExpandChromaPicture[2];// 0: for chroma unalignment && width_uv >= 16; 1: for chroma alignment && width_uv >= 16;
 
-	PGetVarianceFromIntraVaaFunc	pfGetVarianceFromIntraVaa;
-	PGetMbSignFromInterVaaFunc	pfGetMbSignFromInterVaa;
-	PUpdateMbMvFunc					    pfUpdateMbMv;
-	PInterMdFirstIntraModeFunc      pfFirstIntraMode; //svc_encode_slice.c svc_mode_decision.c svc_base_layer_md.c
-	PIntraFineMdFunc                     pfIntraFineMd;          //svc_encode_slice.c svc_mode_decision.c svc_base_layer_md.c
-	PInterFineMdFunc                     pfInterFineMd;          //svc_encode_slice.c svc_base_layer_md.c
-	PInterMdFunc                           pfInterMd;
+  PFillInterNeighborCacheFunc       pfFillInterNeighborCache;
 
-	PInterMdBackgroundDecisionFunc          pfInterMdBackgroundDecision;
-	PInterMdBackgroundInfoUpdateFunc      pfInterMdBackgroundInfoUpdate;
+  PGetVarianceFromIntraVaaFunc	pfGetVarianceFromIntraVaa;
+  PGetMbSignFromInterVaaFunc	pfGetMbSignFromInterVaa;
+  PUpdateMbMvFunc					    pfUpdateMbMv;
+  PInterMdFirstIntraModeFunc      pfFirstIntraMode; //svc_encode_slice.c svc_mode_decision.c svc_base_layer_md.c
+  PIntraFineMdFunc
+  pfIntraFineMd;          //svc_encode_slice.c svc_mode_decision.c svc_base_layer_md.c
+  PInterFineMdFunc                     pfInterFineMd;          //svc_encode_slice.c svc_base_layer_md.c
+  PInterMdFunc                           pfInterMd;
 
-	SMcFunc				        sMcFuncs;
-	SSampleDealingFunc     sSampleDealingFuncs;
-	PGetIntraPredFunc 		pfGetLumaI16x16Pred[I16_PRED_DC_A];
-	PGetIntraPredFunc 		pfGetLumaI4x4Pred[I4_PRED_A];		
-	PGetIntraPredFunc 		pfGetChromaPred[C_PRED_A];		
-	PMotionSearchFunc	    pfMotionSearch; //svc_encode_slice.c svc_mode_decision.c svc_enhance_layer_md.c svc_base_layer_md.c
+  PInterMdBackgroundDecisionFunc          pfInterMdBackgroundDecision;
+  PInterMdBackgroundInfoUpdateFunc      pfInterMdBackgroundInfoUpdate;
 
-	PCopyFunc      pfCopy16x16Aligned;		//svc_encode_slice.c svc_mode_decision.c svc_base_layer_md.c
-	PCopyFunc      pfCopy16x16NotAligned;	//md.c
-	PCopyFunc      pfCopy8x8Aligned;		//svc_encode_slice.c svc_mode_decision.c svc_base_layer_md.c md.c
-	PCopyFunc	  pfCopy16x8NotAligned;	//for MeRefineFracPixel 16x8 based
-	PCopyFunc	  pfCopy8x16Aligned;		//for MeRefineFracPixel 8x16 based
+  SMcFunc				        sMcFuncs;
+  SSampleDealingFunc     sSampleDealingFuncs;
+  PGetIntraPredFunc 		pfGetLumaI16x16Pred[I16_PRED_DC_A];
+  PGetIntraPredFunc 		pfGetLumaI4x4Pred[I4_PRED_A];
+  PGetIntraPredFunc 		pfGetChromaPred[C_PRED_A];
+  PMotionSearchFunc
+  pfMotionSearch; //svc_encode_slice.c svc_mode_decision.c svc_enhance_layer_md.c svc_base_layer_md.c
 
-	//svc_encode_mb.c encode_mb_aux.c
-	PDctFunc					pfDctT4;
-	PDctFunc    		        pfDctFourT4;
+  PCopyFunc      pfCopy16x16Aligned;		//svc_encode_slice.c svc_mode_decision.c svc_base_layer_md.c
+  PCopyFunc      pfCopy16x16NotAligned;	//md.c
+  PCopyFunc      pfCopy8x8Aligned;		//svc_encode_slice.c svc_mode_decision.c svc_base_layer_md.c md.c
+  PCopyFunc	  pfCopy16x8NotAligned;	//for MeRefineFracPixel 16x8 based
+  PCopyFunc	  pfCopy8x16Aligned;		//for MeRefineFracPixel 8x16 based
 
-	PCalculateSingleCtrFunc				pfCalculateSingleCtr4x4;     
-	PScanFunc				pfScan4x4;		//DC/AC
-    PScanFunc				pfScan4x4Ac;
+  //svc_encode_mb.c encode_mb_aux.c
+  PDctFunc					pfDctT4;
+  PDctFunc    		        pfDctFourT4;
 
-	PQuantizationFunc				        pfQuantization4x4;       
-	PQuantizationFunc				        pfQuantizationFour4x4;  
-    PQuantizationDcFunc			        pfQuantizationDc4x4; 
-	PQuantizationMaxFunc		        pfQuantizationFour4x4Max; 
-	PQuantizationHadamardFunc		pfQuantizationHadamard2x2;
-	PQuantizationSkipFunc		        pfQuantizationHadamard2x2Skip;
+  PCalculateSingleCtrFunc				pfCalculateSingleCtr4x4;
+  PScanFunc				pfScan4x4;		//DC/AC
+  PScanFunc				pfScan4x4Ac;
 
-	PTransformHadamard4x4Func	 pfTransformHadamard4x4Dc;
+  PQuantizationFunc				        pfQuantization4x4;
+  PQuantizationFunc				        pfQuantizationFour4x4;
+  PQuantizationDcFunc			        pfQuantizationDc4x4;
+  PQuantizationMaxFunc		        pfQuantizationFour4x4Max;
+  PQuantizationHadamardFunc		pfQuantizationHadamard2x2;
+  PQuantizationSkipFunc		        pfQuantizationHadamard2x2Skip;
 
-	PGetNoneZeroCountFunc		      pfGetNoneZeroCount;
+  PTransformHadamard4x4Func	 pfTransformHadamard4x4Dc;
 
-	PDeQuantizationFunc				      pfDequantization4x4;  
-	PDeQuantizationFunc			          pfDequantizationFour4x4; 
-	PDeQuantizationHadamardFunc	  pfDequantizationIHadamard4x4;
-	PIDctFunc				                      pfIDctFourT4;
-	PIDctFunc				                      pfIDctT4;
-	PIDctFunc				                      pfIDctI16x16Dc;
+  PGetNoneZeroCountFunc		      pfGetNoneZeroCount;
 
-	
+  PDeQuantizationFunc				      pfDequantization4x4;
+  PDeQuantizationFunc			          pfDequantizationFour4x4;
+  PDeQuantizationHadamardFunc	  pfDequantizationIHadamard4x4;
+  PIDctFunc				                      pfIDctFourT4;
+  PIDctFunc				                      pfIDctT4;
+  PIDctFunc				                      pfIDctI16x16Dc;
 
-	// OPTI: if MT under diff uiSliceMode, need change here
-	//PDynamicSlicingStepBackFunc	dynslc_funcpointer_stepback;//svc_encode_slice.c 
-	//DYNSLC_LNGTH_CRTL		dynslc_funcpointer_slcsize_ctrl;
-    
-    /* For Deblocking */
-	DeblockingFunc                         pfDeblocking;
-	PSetNoneZeroCountZeroFunc     pfSetNZCZero;
 
-	SWelsRcFunc					    pfRc;
-	PAccumulateSadFunc         pfAccumulateSadForRc;
 
-    PSetMemoryZero				pfSetMemZeroSize8;			// for size is times to 8
-	PSetMemoryZero				pfSetMemZeroSize64Aligned16;			// for size is times of 64, and address is align to 16
-	PSetMemoryZero				pfSetMemZeroSize64;			// for size is times of 64, and don't know address is align to 16 or not
+  // OPTI: if MT under diff uiSliceMode, need change here
+  //PDynamicSlicingStepBackFunc	dynslc_funcpointer_stepback;//svc_encode_slice.c
+  //DYNSLC_LNGTH_CRTL		dynslc_funcpointer_slcsize_ctrl;
+
+  /* For Deblocking */
+  DeblockingFunc                         pfDeblocking;
+  PSetNoneZeroCountZeroFunc     pfSetNZCZero;
+
+  SWelsRcFunc					    pfRc;
+  PAccumulateSadFunc         pfAccumulateSadForRc;
+
+  PSetMemoryZero				pfSetMemZeroSize8;			// for sizes that are a multiple of 8
+  PSetMemoryZero				pfSetMemZeroSize64Aligned16;			// for sizes that are a multiple of 64, with the address aligned to 16
+  PSetMemoryZero				pfSetMemZeroSize64;			// for sizes that are a multiple of 64, when it is unknown whether the address is aligned to 16
 };
 
 }	//end of namespace WelsSVCEnc {
--- a/codec/encoder/core/inc/wels_preprocess.h
+++ b/codec/encoder/core/inc/wels_preprocess.h
@@ -1,153 +1,153 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	wels_preprocess.h
- *
- * \brief	interface of video pre-process plugins
- *
- * \date	03/15/2011
- *
- * \description : this class is designed as an interface to unify video pre-processing 
- *                class implement sets such as denoise,colorspace conversion etc...
- *
- *************************************************************************************
- */
-
-#ifndef WELS_PREPROCESS_H
-#define WELS_PREPROCESS_H
-
-#include "typedefs.h"
-#include "picture.h"
-#include "wels_const.h"
-#include "IWelsVP.h"
-#include "param_svc.h"
-
-namespace WelsSVCEnc {
-
-typedef  struct
-{
-	SPicture	*pScaledInputPicture;
-	int32_t		iScaledWidth[MAX_DEPENDENCY_LAYER];
-	int32_t     iScaledHeight[MAX_DEPENDENCY_LAYER];
-} Scaled_Picture;
-
-typedef struct 
-{
-	SVAACalcResult		sVaaCalcInfo;
-	SAdaptiveQuantizationParam sAdaptiveQuantParam;
-	SComplexityAnalysisParam sComplexityAnalysisParam;
-
-	int32_t			iPicWidth;			// maximal iWidth of picture in samples for svc coding
-	int32_t			iPicHeight;			// maximal iHeight of picture in samples for svc coding
-	int32_t         iPicStride;         //luma
-	int32_t			iPicStrideUV;
-
-	uint8_t         *pRefY; //pRef	
-	uint8_t         *pCurY; //cur
-	uint8_t         *pRefU; //pRef	
-	uint8_t         *pCurU; //cur
-	uint8_t         *pRefV; //pRef	
-	uint8_t         *pCurV; //cur
-
-	int8_t			*pVaaBackgroundMbFlag;
-	uint8_t         uiValidLongTermPicIdx;
-	uint8_t         uiMarkLongTermPicIdx;
-
-	bool_t          bSceneChangeFlag;
-	bool_t          bIdrPeriodFlag;
-} SVAAFrameInfo;
-
-class CWelsLib
-{
-public:
-	CWelsLib(void *pEncCtx);
-	virtual  ~CWelsLib();	
-
-	int32_t CreateIface(void **pEncCtx);
-	int32_t DestroyIface(void *pEncCtx);
-
-protected:
-	void *QueryFunction(const str_t *pName);
-
-private:
-	void *m_pVpLib;
-	void *m_pInterface[2];
-};
-
-class CWelsPreProcess
-{
-public:
-	CWelsPreProcess(void *pEncCtx);
-	virtual  ~CWelsPreProcess();
-
-public:
-	int32_t WelsPreprocessReset ( void *pEncCtx );
-	int32_t WelsPreprocessStep1( void *pEncCtx, const SSourcePicture **kppSrcPicList, const int32_t kiConfiguredLayerNum );
-	int32_t WelsPreprocessStep3( void *pEncCtx, const int32_t kiDIdx );
-
-private:
-	int32_t WelsPreprocessCreate();
-	int32_t WelsPreprocessDestroy();
-	int32_t InitLastSpatialPictures( void *pEncCtx );
-
-private:
-	int32_t SingleLayerPreprocess( void *pEncCtx, const SSourcePicture *kpSrc, Scaled_Picture * m_sScaledPicture );
-	int32_t MultiLayerPreprocess( void *pEncCtx, const SSourcePicture **kppSrcPicList, const int32_t kiSpatialNum );
-
-	void	BilateralDenoising ( SPicture *pSrc, const int32_t iWidth, const int32_t iHeight );
-	bool_t  DetectSceneChange( SPicture *pCurPicture, SPicture *pRefPicture );
-	int32_t DownsamplePadding( SPicture *pSrc, SPicture *pDstPic,  int32_t iSrcWidth, int32_t iSrcHeight,
-		                        int32_t iShrinkWidth, int32_t iShrinkHeight, int32_t iTargetWidth, int32_t iTargetHeight );
-
-	void    VaaCalculation( SVAAFrameInfo *pVaaInfo, SPicture *pCurPicture, SPicture *pRefPicture, bool_t bCalculateSQDiff, bool_t bCalculateVar, bool_t bCalculateBGD );
-	void    BackgroundDetection( SVAAFrameInfo *pVaaInfo, SPicture *pCurPicture, SPicture *pRefPicture, bool_t bDetectFlag );
-	void    AdaptiveQuantCalculation( SVAAFrameInfo *pVaaInfo, SPicture *pCurPicture, SPicture *pRefPicture );
-	void    AnalyzePictureComplexity( void *pCtx, SPicture *pCurPicture, SPicture *pRefPicture, const int32_t kiDependencyId, const bool_t kbCalculateBGD );
-	void    Padding(uint8_t *pSrcY, uint8_t *pSrcU, uint8_t *pSrcV, int32_t iStrideY, int32_t iStrideUV,
-		            int32_t iActualWidth, int32_t iPaddingWidth, int32_t iActualHeight, int32_t iPaddingHeight);
-    void    SetRefMbType(void *pCtx, uint32_t **pRefMbTypeArray, int32_t iRefPicType);
-
- 	int32_t ColorspaceConvert( SWelsSvcCodingParam *pSvcParam, SPicture *pDstPic, const SSourcePicture *kpSrc, const int32_t kiWidth, const int32_t kiHeight );
-	void WelsMoveMemoryWrapper(SWelsSvcCodingParam * pSvcParam, SPicture *pDstPic, const SSourcePicture *kpSrc, const int32_t kiWidth, const int32_t kiHeight );
-
-private:
-	Scaled_Picture  m_sScaledPicture;
-	SPicture		*m_pLastSpatialPicture[MAX_DEPENDENCY_LAYER][2];	
-	IWelsVP         *m_pInterfaceVp;	
-	CWelsLib        *m_pEncLib;
-	void            *m_pEncCtx;
-	bool_t          m_bInitDone;
-	bool_t          m_bOfficialBranch;
-};
-
-}
-
-#endif
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	wels_preprocess.h
+ *
+ * \brief	interface of video pre-process plugins
+ *
+ * \date	03/15/2011
+ *
+ * \description : this class is designed as a unified interface to the video pre-processing
+ *                implementations, such as denoising, colorspace conversion, etc.
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELS_PREPROCESS_H
+#define WELS_PREPROCESS_H
+
+#include "typedefs.h"
+#include "picture.h"
+#include "wels_const.h"
+#include "IWelsVP.h"
+#include "param_svc.h"
+
+namespace WelsSVCEnc {
+
+typedef  struct {
+  SPicture*	pScaledInputPicture;
+  int32_t		iScaledWidth[MAX_DEPENDENCY_LAYER];
+  int32_t     iScaledHeight[MAX_DEPENDENCY_LAYER];
+} Scaled_Picture;
+
+typedef struct { // per-frame state shared by the pre-processing analysis steps (see the SVAAFrameInfo* parameters of CWelsPreProcess)
+  SVAACalcResult		sVaaCalcInfo;
+  SAdaptiveQuantizationParam sAdaptiveQuantParam;
+  SComplexityAnalysisParam sComplexityAnalysisParam;
+
+  int32_t			iPicWidth;			// maximal picture width in samples for SVC coding
+  int32_t			iPicHeight;			// maximal picture height in samples for SVC coding
+  int32_t         iPicStride;         // luma stride
+  int32_t			iPicStrideUV;
+
+  uint8_t*         pRefY; // reference picture, Y plane
+  uint8_t*         pCurY; // current picture, Y plane
+  uint8_t*         pRefU; // reference picture, U plane
+  uint8_t*         pCurU; // current picture, U plane
+  uint8_t*         pRefV; // reference picture, V plane
+  uint8_t*         pCurV; // current picture, V plane
+
+  int8_t*			pVaaBackgroundMbFlag;
+  uint8_t         uiValidLongTermPicIdx;
+  uint8_t         uiMarkLongTermPicIdx;
+
+  bool_t          bSceneChangeFlag;
+  bool_t          bIdrPeriodFlag;
+} SVAAFrameInfo;
+
+class CWelsLib { // NOTE(review): presumably owns the handle to the video-processing (VP) library and the interfaces created from it - confirm in the .cpp
+ public:
+  CWelsLib (void* pEncCtx);
+  virtual  ~CWelsLib();
+
+  int32_t CreateIface (void** pEncCtx);
+  int32_t DestroyIface (void* pEncCtx);
+
+ protected:
+  void* QueryFunction (const str_t* pName); // looks up an entry by name; NOTE(review): presumably resolved from m_pVpLib - confirm
+
+ private:
+  void* m_pVpLib;
+  void* m_pInterface[2];
+};
+
+class CWelsPreProcess { // encoder-side pre-processing; judging by the declarations below: denoising, scene-change detection, down-sampling/padding, VAA analysis and colorspace conversion
+ public:
+  CWelsPreProcess (void* pEncCtx);
+  virtual  ~CWelsPreProcess();
+
+ public:
+  int32_t WelsPreprocessReset (void* pEncCtx);
+  int32_t WelsPreprocessStep1 (void* pEncCtx, const SSourcePicture** kppSrcPicList, const int32_t kiConfiguredLayerNum);
+  int32_t WelsPreprocessStep3 (void* pEncCtx, const int32_t kiDIdx);
+
+ private:
+  int32_t WelsPreprocessCreate();
+  int32_t WelsPreprocessDestroy();
+  int32_t InitLastSpatialPictures (void* pEncCtx);
+
+ private:
+  int32_t SingleLayerPreprocess (void* pEncCtx, const SSourcePicture* kpSrc, Scaled_Picture* m_sScaledPicture); // NOTE(review): parameter name shadows the member m_sScaledPicture
+  int32_t MultiLayerPreprocess (void* pEncCtx, const SSourcePicture** kppSrcPicList, const int32_t kiSpatialNum);
+
+  void	BilateralDenoising (SPicture* pSrc, const int32_t iWidth, const int32_t iHeight);
+  bool_t  DetectSceneChange (SPicture* pCurPicture, SPicture* pRefPicture);
+  int32_t DownsamplePadding (SPicture* pSrc, SPicture* pDstPic,  int32_t iSrcWidth, int32_t iSrcHeight,
+                             int32_t iShrinkWidth, int32_t iShrinkHeight, int32_t iTargetWidth, int32_t iTargetHeight);
+
+  void    VaaCalculation (SVAAFrameInfo* pVaaInfo, SPicture* pCurPicture, SPicture* pRefPicture, bool_t bCalculateSQDiff,
+                          bool_t bCalculateVar, bool_t bCalculateBGD);
+  void    BackgroundDetection (SVAAFrameInfo* pVaaInfo, SPicture* pCurPicture, SPicture* pRefPicture, bool_t bDetectFlag);
+  void    AdaptiveQuantCalculation (SVAAFrameInfo* pVaaInfo, SPicture* pCurPicture, SPicture* pRefPicture);
+  void    AnalyzePictureComplexity (void* pCtx, SPicture* pCurPicture, SPicture* pRefPicture,
+                                    const int32_t kiDependencyId, const bool_t kbCalculateBGD);
+  void    Padding (uint8_t* pSrcY, uint8_t* pSrcU, uint8_t* pSrcV, int32_t iStrideY, int32_t iStrideUV,
+                   int32_t iActualWidth, int32_t iPaddingWidth, int32_t iActualHeight, int32_t iPaddingHeight);
+  void    SetRefMbType (void* pCtx, uint32_t** pRefMbTypeArray, int32_t iRefPicType);
+
+  int32_t ColorspaceConvert (SWelsSvcCodingParam* pSvcParam, SPicture* pDstPic, const SSourcePicture* kpSrc,
+                             const int32_t kiWidth, const int32_t kiHeight);
+  void WelsMoveMemoryWrapper (SWelsSvcCodingParam* pSvcParam, SPicture* pDstPic, const SSourcePicture* kpSrc,
+                              const int32_t kiWidth, const int32_t kiHeight);
+
+ private:
+  Scaled_Picture  m_sScaledPicture;
+  SPicture*		m_pLastSpatialPicture[MAX_DEPENDENCY_LAYER][2];
+  IWelsVP*         m_pInterfaceVp;
+  CWelsLib*        m_pEncLib;
+  void*            m_pEncCtx;
+  bool_t          m_bInitDone;
+  bool_t          m_bOfficialBranch;
+};
+
+}
+
+#endif
--- a/codec/encoder/core/src/au_set.cpp
+++ b/codec/encoder/core/src/au_set.cpp
@@ -1,514 +1,488 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	au_set.c
- *
- * \brief	Interfaces introduced in Access Unit level based writer
- *
- * \date	05/18/2009 Created
- *
- *************************************************************************************
- */
-
-#include <string.h>
-#include <assert.h>
-#include "au_set.h"
-#include "svc_enc_golomb.h"
-namespace WelsSVCEnc {
-static const uint32_t g_kuiMaxDPBx2AtLevel[52] = // *2 on the basic of Annex A, Table A-1, for int32_t type
-{
-	0,		0,		0,		0,		0,	0,	0,	0,	0,	0, //0~9
-	297,	675,	1782,	1782,	0,	0,	0,	0,	0,	0, //10, 11, 12, 13
-	1782,	3564,	6075,	0,		0,	0,	0,	0,	0,	0, //20, 21, 22
-	6075,	13500,	15360,	0,		0,	0,	0,	0,	0,	0, //30, 31, 32
-	24576,	24576,	26112,	0,		0,	0,	0,	0,	0,	0, //40, 41, 42
-	82800,	138240											//50, 51
-};
-
-
-#define LEVEL_NUMBER 16
-
-typedef struct TagLevelLimit
-{
-	uint8_t iLevelIdc;
-	uint32_t uiMaxMbPS; // Max MBs processing speed
-	uint32_t uiMaxFS; // Max Frame size
-	uint32_t uiMaxDPBMB; //Max DPB MB Size
-	uint32_t uiMaxBR; //Max Bitrate
-} SLevelLimit;
-
-const SLevelLimit g_ksLevelLimit[LEVEL_NUMBER] =
-{
-  { 10,   1485,    99,	  396,     64 },                 //10
-  { 9,    1485,    99,	  396,    128 },                 //9 (1b)
-  { 11,   3000,   396,	  900,    192 },                 //11
-  { 12,   6000,   396,	 2376,    384 },                 //12
-  { 13,  11880,   396,	 2376,    768 },                 //13
-
-  { 20,  11880,   396,    2376,   2000 },                 //20
-  { 21,  19800,   792,    4752,   4000 },                 //21
-  { 22,  20250,  1620,    8100,   4000 },                 //22
-
-  { 30,  40500,  1620,    8100,  10000 },                 //30
-  { 31, 108000,  3600,   18000,  14000 },                 //31
-  { 32, 216000,  5120,   20480,  20000 },                 //32
-
-  { 40, 245760,  8192,   32768,  20000 },                 //40
-  { 41, 245760,  8192,   32768,  50000 },                 //41
-  { 42, 491520,  8192,   34816,  50000 },                 //42
-
-  { 50, 589824, 22080,  110400, 135000 },                 //50
-  { 51, 983040, 36864,  184320, 240000 }                  //51
-};
-
-static inline int32_t WelsCheckLevelLimitation( const SWelsSPS* kpSps, const SLevelLimit *kpLevelLimit, float fFrameRate, int32_t iTargetBitRate )
-{
-	uint32_t uiPicWidthInMBs = kpSps->iMbWidth;
-	uint32_t uiPicHeightInMBs = kpSps->iMbHeight;
-	uint32_t uiPicInMBs = uiPicWidthInMBs * uiPicHeightInMBs;
-	uint32_t uiNumRefFrames = kpSps->iNumRefFrames;
-
-	if( kpLevelLimit->uiMaxMbPS < ( uint32_t ) ( uiPicInMBs * fFrameRate ) )
-		return 0;
-	if( kpLevelLimit->uiMaxFS < uiPicInMBs )
-		return 0;
-	if( ( kpLevelLimit->uiMaxFS << 3 ) < ( uiPicWidthInMBs * uiPicWidthInMBs ) )
-		return 0;
-	if( ( kpLevelLimit->uiMaxFS << 3 ) < ( uiPicHeightInMBs * uiPicHeightInMBs ) )
-		return 0;
-	if( kpLevelLimit->uiMaxDPBMB < uiNumRefFrames * uiPicInMBs )
-		return 0;
-	if( iTargetBitRate && ( (int32_t) kpLevelLimit->uiMaxBR  * 1200 ) < iTargetBitRate ) //RC enabled, considering bitrate constraint
-		return 0;
-	//add more checks here if needed in future
-
-	return 1;
-
-}
-
-static inline int32_t WelsGetLevelIdc( const SWelsSPS* kpSps, float fFrameRate, int32_t iTargetBitRate )
-{	
-	int32_t iOrder;
-	for( iOrder = 0; iOrder < LEVEL_NUMBER; iOrder++ )
-	{
-		if( WelsCheckLevelLimitation(kpSps, &(g_ksLevelLimit[iOrder]), fFrameRate, iTargetBitRate) )
-		{
-			return (int32_t) ( g_ksLevelLimit[iOrder].iLevelIdc );
-		}
-	}
-	return 51; //final decision: select the biggest level
-}
-
-
-/*! 
- *************************************************************************************
- * \brief	to set Sequence Parameter Set (SPS)
- *
- * \param 	pSps 	SWelsSPS to be wrote, update iSpsId dependency
- * \param	pBitStringAux		bitstream writer auxiliary 
- *
- * \return	0 - successed
- *	    	1 - failed
- *
- * \note	Call it in case EWelsNalUnitType is SPS.
- *************************************************************************************
- */
-int32_t WelsWriteSpsSyntax( SWelsSPS *pSps, SBitStringAux *pBitStringAux, int32_t* pSpsIdDelta )
-{
-	SBitStringAux *pLocalBitStringAux = pBitStringAux;
-
-	assert( pSps != NULL && pBitStringAux != NULL );			
-
-	BsWriteBits( pLocalBitStringAux, 8, pSps->uiProfileIdc );
-
-	BsWriteOneBit( pLocalBitStringAux, pSps->bConstraintSet0Flag );	// bConstraintSet0Flag
-	BsWriteOneBit( pLocalBitStringAux, pSps->bConstraintSet1Flag );	// bConstraintSet1Flag
-	BsWriteOneBit( pLocalBitStringAux, pSps->bConstraintSet2Flag );	// bConstraintSet2Flag
-	BsWriteOneBit( pLocalBitStringAux, 0/*pSps->bConstraintSet3Flag*/ );	// bConstraintSet3Flag
-	BsWriteBits( pLocalBitStringAux, 4, 0 );							// reserved_zero_4bits, equal to 0
-	BsWriteBits( pLocalBitStringAux, 8, pSps->iLevelIdc );				// iLevelIdc
-	BsWriteUE( pLocalBitStringAux, pSps->uiSpsId + pSpsIdDelta[pSps->uiSpsId] );					    // seq_parameter_set_id
-
-	if ( PRO_SCALABLE_BASELINE == pSps->uiProfileIdc || PRO_SCALABLE_HIGH == pSps->uiProfileIdc ||
-		PRO_HIGH == pSps->uiProfileIdc || PRO_HIGH10 == pSps->uiProfileIdc ||
-		PRO_HIGH422 == pSps->uiProfileIdc || PRO_HIGH444 == pSps->uiProfileIdc ||
-		PRO_CAVLC444 == pSps->uiProfileIdc || 44 == pSps->uiProfileIdc )
-	{
-		BsWriteUE( pLocalBitStringAux, 1 ); //uiChromaFormatIdc, now should be 1
-		BsWriteUE( pLocalBitStringAux, 0); //uiBitDepthLuma
-		BsWriteUE( pLocalBitStringAux, 0); //uiBitDepthChroma
-		BsWriteOneBit( pLocalBitStringAux, 0); //qpprime_y_zero_transform_bypass_flag
-		BsWriteOneBit( pLocalBitStringAux, 0); //seq_scaling_matrix_present_flag
-	}
-
-	BsWriteUE( pLocalBitStringAux, pSps->uiLog2MaxFrameNum - 4 );	// log2_max_frame_num_minus4
-	BsWriteUE( pLocalBitStringAux, 0/*pSps->uiPocType*/ );		    // pic_order_cnt_type
-	BsWriteUE( pLocalBitStringAux, pSps->iLog2MaxPocLsb - 4 );	// log2_max_pic_order_cnt_lsb_minus4
-
-	BsWriteUE( pLocalBitStringAux, pSps->iNumRefFrames );		// max_num_ref_frames
-	BsWriteOneBit( pLocalBitStringAux, true/*pSps->bGapsInFrameNumValueAllowedFlag*/ );	// bGapsInFrameNumValueAllowedFlag
-	BsWriteUE( pLocalBitStringAux, pSps->iMbWidth - 1 );		// pic_width_in_mbs_minus1
-	BsWriteUE( pLocalBitStringAux, pSps->iMbHeight - 1 );		// pic_height_in_map_units_minus1
-	BsWriteOneBit( pLocalBitStringAux, true/*pSps->bFrameMbsOnlyFlag*/ );	// bFrameMbsOnlyFlag
-
-	BsWriteOneBit( pLocalBitStringAux, 0/*pSps->bDirect8x8InferenceFlag*/ );	// direct_8x8_inference_flag
-	BsWriteOneBit( pLocalBitStringAux, pSps->bFrameCroppingFlag );	// bFrameCroppingFlag
-	if ( pSps->bFrameCroppingFlag )
-	{
-		BsWriteUE( pLocalBitStringAux, pSps->sFrameCrop.iCropLeft );	// frame_crop_left_offset
-		BsWriteUE( pLocalBitStringAux, pSps->sFrameCrop.iCropRight );	// frame_crop_right_offset
-		BsWriteUE( pLocalBitStringAux, pSps->sFrameCrop.iCropTop );	// frame_crop_top_offset
-		BsWriteUE( pLocalBitStringAux, pSps->sFrameCrop.iCropBottom );	// frame_crop_bottom_offset
-	}
-
-	BsWriteOneBit( pLocalBitStringAux, 0/*pSps->bVuiParamPresentFlag*/ );	// vui_parameters_present_flag
-	
-	return 0;
-}
-
-
-int32_t WelsWriteSpsNal( SWelsSPS *pSps, SBitStringAux *pBitStringAux, int32_t* pSpsIdDelta)
-{
-	WelsWriteSpsSyntax( pSps, pBitStringAux, pSpsIdDelta );
-
-	BsRbspTrailingBits( pBitStringAux );
-
-	BsFlush( pBitStringAux );
-
-	return 0;
-}
-
-/*! 
- *************************************************************************************
- * \brief	to write SubSet Sequence Parameter Set
- *
- * \param 	sub_sps		subset pSps parsed
- * \param	pBitStringAux		bitstream writer auxiliary 
- *
- * \return	0 - successed
- *		    1 - failed
- *
- * \note	Call it in case EWelsNalUnitType is SubSet SPS.
- *************************************************************************************
- */
-
-int32_t WelsWriteSubsetSpsSyntax( SSubsetSps *pSubsetSps, SBitStringAux *pBitStringAux , int32_t* pSpsIdDelta )
-{
-	SWelsSPS *pSps = &pSubsetSps->pSps;
-
-	WelsWriteSpsSyntax( pSps, pBitStringAux, pSpsIdDelta );
-
-	if ( pSps->uiProfileIdc == PRO_SCALABLE_BASELINE || pSps->uiProfileIdc == PRO_SCALABLE_HIGH ){
-		SSpsSvcExt *pSubsetSpsExt = &pSubsetSps->sSpsSvcExt;
-		
-		BsWriteOneBit( pBitStringAux, true/*pSubsetSpsExt->bInterLayerDeblockingFilterCtrlPresentFlag*/ );
-		BsWriteBits( pBitStringAux, 2, pSubsetSpsExt->iExtendedSpatialScalability );
-			BsWriteOneBit( pBitStringAux, 0/*pSubsetSpsExt->uiChromaPhaseXPlus1Flag*/ );
-			BsWriteBits( pBitStringAux, 2, 1/*pSubsetSpsExt->uiChromaPhaseYPlus1*/ );
-		if ( pSubsetSpsExt->iExtendedSpatialScalability == 1 ){
-				BsWriteOneBit( pBitStringAux, 0/*pSubsetSpsExt->uiSeqRefLayerChromaPhaseXPlus1Flag*/ );
-				BsWriteBits( pBitStringAux, 2, 1/*pSubsetSpsExt->uiSeqRefLayerChromaPhaseYPlus1*/ );
-			BsWriteSE( pBitStringAux, 0/*pSubsetSpsExt->sSeqScaledRefLayer.left_offset*/ ); 
-			BsWriteSE( pBitStringAux, 0/*pSubsetSpsExt->sSeqScaledRefLayer.top_offset*/ ); 
-			BsWriteSE( pBitStringAux, 0/*pSubsetSpsExt->sSeqScaledRefLayer.right_offset*/ ); 
-			BsWriteSE( pBitStringAux, 0/*pSubsetSpsExt->sSeqScaledRefLayer.bottom_offset*/ );
-		}
-		BsWriteOneBit( pBitStringAux, pSubsetSpsExt->bSeqTcoeffLevelPredFlag );
-		if ( pSubsetSpsExt->bSeqTcoeffLevelPredFlag ){
-			BsWriteOneBit( pBitStringAux, pSubsetSpsExt->bAdaptiveTcoeffLevelPredFlag );
-		}
-		BsWriteOneBit( pBitStringAux, pSubsetSpsExt->bSliceHeaderRestrictionFlag );
-		
-		BsWriteOneBit( pBitStringAux, false/*pSubsetSps->bSvcVuiParamPresentFlag*/ );
-	}		
-	BsWriteOneBit( pBitStringAux, false/*pSubsetSps->bAdditionalExtension2Flag*/ );
-
-	BsRbspTrailingBits( pBitStringAux );
-
-	BsFlush( pBitStringAux );
-
-	return 0;
-}
-
-/*! 
- *************************************************************************************
- * \brief	to write Picture Parameter Set (PPS)
- *
- * \param 	pPps     	pPps
- * \param	pBitStringAux		bitstream writer auxiliary 
- *
- * \return	0 - successed
- *	    	1 - failed
- *
- * \note	Call it in case EWelsNalUnitType is PPS.
- *************************************************************************************
- */
-int32_t WelsWritePpsSyntax( SWelsPPS *pPps, SBitStringAux *pBitStringAux, SParaSetOffset* sPSOVector )
-{
-	SBitStringAux * pLocalBitStringAux = pBitStringAux;
-
-	bool_t bUsedSubset    =  sPSOVector->bPpsIdMappingIntoSubsetsps[pPps->iPpsId];
-	int32_t iParameterSetType = ( bUsedSubset ? PARA_SET_TYPE_SUBSETSPS : PARA_SET_TYPE_AVCSPS );
-
-	BsWriteUE( pLocalBitStringAux, pPps->iPpsId + sPSOVector->sParaSetOffsetVariable[PARA_SET_TYPE_PPS].iParaSetIdDelta[pPps->iPpsId] );	
-	BsWriteUE( pLocalBitStringAux, pPps->iSpsId + sPSOVector->sParaSetOffsetVariable[iParameterSetType].iParaSetIdDelta[pPps->iSpsId] );
-	
-#if _DEBUG 
-	//SParaSetOffset use, 110421
-	if ( sPSOVector->bEnableSpsPpsIdAddition )
-	{
-		const int32_t kiTmpSpsIdInBs = pPps->iSpsId + sPSOVector->sParaSetOffsetVariable[iParameterSetType].iParaSetIdDelta[pPps->iSpsId];
-		const int32_t tmp_pps_id_in_bs = pPps->iPpsId + sPSOVector->sParaSetOffsetVariable[PARA_SET_TYPE_PPS].iParaSetIdDelta[pPps->iPpsId];
-		assert ( MAX_SPS_COUNT > kiTmpSpsIdInBs );
-		assert ( MAX_PPS_COUNT > tmp_pps_id_in_bs );
-		assert( sPSOVector->sParaSetOffsetVariable[iParameterSetType].bUsedParaSetIdInBs[kiTmpSpsIdInBs] );
-	}
-#endif
-
-	BsWriteOneBit( pLocalBitStringAux, false/*pPps->entropy_coding_mode_flag*/ );
-	BsWriteOneBit( pLocalBitStringAux, false/*pPps->bPicOrderPresentFlag*/ );
-	
-#ifdef DISABLE_FMO_FEATURE
-	BsWriteUE( pLocalBitStringAux, 0/*pPps->uiNumSliceGroups - 1*/ );	
-#else
-	BsWriteUE( pLocalBitStringAux, pPps->uiNumSliceGroups - 1 );	
-	if ( pPps->uiNumSliceGroups > 1 )
-	{
-		uint32_t i, uiNumBits;
-
-		BsWriteUE( pLocalBitStringAux, pPps->uiSliceGroupMapType );
-		
-		switch ( pPps->uiSliceGroupMapType )
-		{
-		case 0:
-			for ( i = 0; i < pPps->uiNumSliceGroups; i ++ )
-			{
-				 BsWriteUE( pLocalBitStringAux, pPps->uiRunLength[i] - 1 );
-			}
-			break;
-		case 2:
-			for ( i = 0; i < pPps->uiNumSliceGroups; i ++ )
-			{
-				BsWriteUE( pLocalBitStringAux, pPps->uiTopLeft[i] );
-				BsWriteUE( pLocalBitStringAux, pPps->uiBottomRight[i] );
-			}
-			break;
-		case 3:
-		case 4:
-		case 5:
-			BsWriteOneBit( pLocalBitStringAux, pPps->bSliceGroupChangeDirectionFlag );
-			BsWriteUE( pLocalBitStringAux, pPps->uiSliceGroupChangeRate - 1 );
-			break;
-		case 6:
-			BsWriteUE( pLocalBitStringAux, pPps->uiPicSizeInMapUnits - 1 );
-			uiNumBits = 0;///////////////////WELS_CEILLOG2(pPps->uiPicSizeInMapUnits);
-			for ( i = 0; i < pPps->uiPicSizeInMapUnits; i ++ ) 
-			{
-				BsWriteBits( pLocalBitStringAux, uiNumBits, pPps->uiSliceGroupId[i] );
-			}
-			break;
-		default:
-			break;
-		}
-	}
-#endif//!DISABLE_FMO_FEATURE
-	
-	BsWriteUE( pLocalBitStringAux, 0/*pPps->uiNumRefIdxL0Active - 1*/ );
-	BsWriteUE( pLocalBitStringAux, 0/*pPps->uiNumRefIdxL1Active - 1*/ );
-	
-	
-	BsWriteOneBit( pLocalBitStringAux, false/*pPps->bWeightedPredFlag*/ );
-	BsWriteBits (pLocalBitStringAux, 2, 0/*pPps->uiWeightedBiPredIdc*/ );
-	
-	BsWriteSE( pLocalBitStringAux, pPps->iPicInitQp - 26 );
-	BsWriteSE( pLocalBitStringAux, pPps->iPicInitQs - 26 );
-	
-	BsWriteSE( pLocalBitStringAux, pPps->uiChromaQpIndexOffset );
-	BsWriteOneBit( pLocalBitStringAux, pPps->bDeblockingFilterControlPresentFlag );
-	BsWriteOneBit( pLocalBitStringAux, false/*pPps->bConstainedIntraPredFlag*/ );
-	BsWriteOneBit( pLocalBitStringAux, false/*pPps->bRedundantPicCntPresentFlag*/ );
-	
-    BsRbspTrailingBits( pLocalBitStringAux );
-
-	BsFlush( pLocalBitStringAux );
-
-	return 0;
-}
-
-static inline bool_t WelsGetPaddingOffset(int32_t iActualWidth, int32_t iActualHeight,  int32_t iWidth, int32_t iHeight, SCropOffset &pOffset)
-{
-	if( (iWidth < iActualWidth) || (iHeight < iActualHeight) )
-		return false;
-
-	// make actual size even
-	iActualWidth -= (iActualWidth & 1);
-	iActualHeight -= (iActualHeight & 1);
-
-	pOffset.iCropLeft = 0;
-	pOffset.iCropRight = (iWidth - iActualWidth)/2;
-	pOffset.iCropTop = 0;
-	pOffset.iCropBottom = (iHeight - iActualHeight)/2;
-
-	return (iWidth>iActualWidth) || (iHeight>iActualHeight);
-}
-
-int32_t WelsInitSps( SWelsSPS *pSps, SDLayerParam *pLayerParam, const uint32_t kuiIntraPeriod, const int32_t kiNumRefFrame,
-					  const uint32_t kuiSpsId, const bool_t kbEnableFrameCropping, bool_t bEnableRc )
-{
-	memset(pSps, 0, sizeof(SWelsSPS));
-
-	pSps->uiSpsId		= kuiSpsId;
-	pSps->iMbWidth	= (pLayerParam->iFrameWidth+15) >> 4;
-	pSps->iMbHeight	= (pLayerParam->iFrameHeight+15) >> 4;
-
-	if ( 0 == kuiIntraPeriod )
-	{
-		//max value of both iFrameNum and POC are 2^16-1, in our encoder, iPOC=2*iFrameNum, so max of iFrameNum should be 2^15-1.--
-		pSps->uiLog2MaxFrameNum = 15;//16; 
-	}
-	else
-	{
-		pSps->uiLog2MaxFrameNum	= 4;
-		while ( (uint32_t)(1 << pSps->uiLog2MaxFrameNum) <= kuiIntraPeriod ) {
-			++ pSps->uiLog2MaxFrameNum;
-		}
-	}
-	pSps->iLog2MaxPocLsb	= 1 + pSps->uiLog2MaxFrameNum;
-
-	pSps->iNumRefFrames	= kiNumRefFrame;	/* min pRef size when fifo pRef operation*/
-
-	if ( kbEnableFrameCropping )
-	{
-		// TODO: get frame_crop_left_offset, frame_crop_right_offset, frame_crop_top_offset, frame_crop_bottom_offset
-		pSps->bFrameCroppingFlag = WelsGetPaddingOffset( pLayerParam->iActualWidth, pLayerParam->iActualHeight, pLayerParam->iFrameWidth, pLayerParam->iFrameHeight, pSps->sFrameCrop );
-	}
-	else
-	{
-		pSps->bFrameCroppingFlag	= false;
-	}
-	
-	pSps->uiProfileIdc	= pLayerParam->uiProfileIdc ? pLayerParam->uiProfileIdc : PRO_BASELINE;
-
-	if( bEnableRc ) //fixed QP condition
-		pSps->iLevelIdc	= WelsGetLevelIdc(pSps, pLayerParam->fOutputFrameRate, pLayerParam->iSpatialBitrate);
-	else
-		pSps->iLevelIdc  = WelsGetLevelIdc(pSps, pLayerParam->fOutputFrameRate, 0); // Set tar_br = 0 to remove the bitrate constraint; a better way is to set actual tar_br as 0
-
-	return 0;
-}
-
-
-int32_t WelsInitSubsetSps( SSubsetSps *pSubsetSps, SDLayerParam *pLayerParam, const uint32_t kuiIntraPeriod, const int32_t kiNumRefFrame, 
-							 const uint32_t kuiSpsId, const bool_t kbEnableFrameCropping, bool_t bEnableRc )
-{
-	SWelsSPS *pSps = &pSubsetSps->pSps;
-
-	memset(pSubsetSps, 0, sizeof(SSubsetSps));
-
-	WelsInitSps( pSps, pLayerParam, kuiIntraPeriod, kiNumRefFrame, kuiSpsId, kbEnableFrameCropping, bEnableRc );
-
-	pSps->uiProfileIdc	= (pLayerParam->uiProfileIdc >= PRO_SCALABLE_BASELINE) ? pLayerParam->uiProfileIdc : PRO_SCALABLE_BASELINE;
-	
-	pSubsetSps->sSpsSvcExt.iExtendedSpatialScalability	= 0;	/* ESS is 0 in default */
-	pSubsetSps->sSpsSvcExt.bAdaptiveTcoeffLevelPredFlag	= false;
-	pSubsetSps->sSpsSvcExt.bSeqTcoeffLevelPredFlag	= false;
-	pSubsetSps->sSpsSvcExt.bSliceHeaderRestrictionFlag = true;
-
-	return 0;
-}
-
-int32_t WelsInitPps(	SWelsPPS *pPps,
-						SWelsSPS *pSps,
-						SSubsetSps *pSubsetSps,
-						const uint32_t kuiPpsId,
-						const bool_t kbDeblockingFilterPresentFlag,
-						const bool_t kbUsingSubsetSps )
-{
-	SWelsSPS *pUsedSps = NULL;
-	if ( pPps == NULL || (pSps == NULL && pSubsetSps == NULL) )
-		return 1;
-	if ( !kbUsingSubsetSps ){
-		assert( pSps != NULL );
-		if ( NULL == pSps )
-			return 1;
-		pUsedSps	= pSps;		
-	}
-	else{
-		assert(pSubsetSps != NULL);
-		if ( NULL == pSubsetSps )
-			return 1;
-		pUsedSps	= &pSubsetSps->pSps;		
-	}
-	
-	/* fill picture parameter set syntax */
-	pPps->iPpsId		= kuiPpsId;
-	pPps->iSpsId		= pUsedSps->uiSpsId;
-#if !defined(DISABLE_FMO_FEATURE)
-	pPps->uiNumSliceGroups =  1;	//param->qos_param.sliceGroupCount;
-    if( pPps->uiNumSliceGroups > 1 )
-    {        
-        pPps->uiSliceGroupMapType = 0;	//param->qos_param.sliceGroupType;
-        if( pPps->uiSliceGroupMapType == 0 )
-        {   
-			uint32_t uiGroup = 0;
-			while (uiGroup < pPps->uiNumSliceGroups) {
-				pPps->uiRunLength[uiGroup]	= 25;
-				++ uiGroup;
-			}
-        }
-        else if( pPps->uiSliceGroupMapType == 2 )
-        {
-			memset(&pPps->uiTopLeft[0], 0, MAX_SLICEGROUP_IDS*sizeof(pPps->uiTopLeft[0]));
-			memset(&pPps->uiBottomRight[0], 0, MAX_SLICEGROUP_IDS*sizeof(pPps->uiBottomRight[0]));
-        }
-        else if( pPps->uiSliceGroupMapType >= 3 &&
-			pPps->uiSliceGroupMapType <= 5 )
-        {
-            pPps->bSliceGroupChangeDirectionFlag = false;
-            pPps->uiSliceGroupChangeRate = 0;
-        }
-        else if( pPps->uiSliceGroupMapType == 6 )
-        {
-            pPps->uiPicSizeInMapUnits = 1;
-			memset(&pPps->uiSliceGroupId[0], 0, MAX_SLICEGROUP_IDS*sizeof(pPps->uiSliceGroupId[0]));
-        }
-    }
-#endif//!DISABLE_FMO_FEATURE
-	
-    pPps->iPicInitQp							= 26;
-    pPps->iPicInitQs							= 26;
-	
-    pPps->uiChromaQpIndexOffset					= 0;		
-	pPps->bDeblockingFilterControlPresentFlag	= kbDeblockingFilterPresentFlag;
-	
-	return 0;
-}
-} // namespace WelsSVCEnc
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	au_set.cpp
+ *
+ * \brief	Interfaces introduced in Access Unit level based writer
+ *
+ * \date	05/18/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#include <string.h>
+#include <assert.h>
+#include "au_set.h"
+#include "svc_enc_golomb.h"
+namespace WelsSVCEnc {
+static const uint32_t g_kuiMaxDPBx2AtLevel[52] = { // MaxDPB of Annex A, Table A-1, multiplied by 2 so that it can be held in an integer type; one slot per level_idc 0..51
+  0,		0,		0,		0,		0,	0,	0,	0,	0,	0, // slots 0~9: all zero
+  297,	675,	1782,	1782,	0,	0,	0,	0,	0,	0, // levels 10, 11, 12, 13
+  1782,	3564,	6075,	0,		0,	0,	0,	0,	0,	0, // levels 20, 21, 22
+  6075,	13500,	15360,	0,		0,	0,	0,	0,	0,	0, // levels 30, 31, 32
+  24576,	24576,	26112,	0,		0,	0,	0,	0,	0,	0, // levels 40, 41, 42
+  82800,	138240											// levels 50, 51
+};
+
+
+#define LEVEL_NUMBER 16 // number of rows in g_ksLevelLimit
+
+typedef struct TagLevelLimit { // limits attached to one level; one row of g_ksLevelLimit
+  uint8_t iLevelIdc; // level identifier handed back by WelsGetLevelIdc
+  uint32_t uiMaxMbPS; // Max MBs processing speed, compared against (MBs per picture * frame rate)
+  uint32_t uiMaxFS; // Max Frame size, in macroblocks
+  uint32_t uiMaxDPBMB; // Max DPB size in macroblocks, compared against (reference frames * MBs per picture)
+  uint32_t uiMaxBR; // Max Bitrate; multiplied by 1200 before it is compared against the target bitrate
+} SLevelLimit;
+
+const SLevelLimit g_ksLevelLimit[LEVEL_NUMBER] = { // columns: iLevelIdc, uiMaxMbPS, uiMaxFS, uiMaxDPBMB, uiMaxBR; no limit decreases from one row to the next, and WelsGetLevelIdc scans the rows top to bottom
+  { 10,   1485,    99,	  396,     64 },                 //10
+  { 9,    1485,    99,	  396,    128 },                 //9 (level 1b): placed after 10 because only its bitrate limit is larger
+  { 11,   3000,   396,	  900,    192 },                 //11
+  { 12,   6000,   396,	 2376,    384 },                 //12
+  { 13,  11880,   396,	 2376,    768 },                 //13
+
+  { 20,  11880,   396,    2376,   2000 },                 //20
+  { 21,  19800,   792,    4752,   4000 },                 //21
+  { 22,  20250,  1620,    8100,   4000 },                 //22
+
+  { 30,  40500,  1620,    8100,  10000 },                 //30
+  { 31, 108000,  3600,   18000,  14000 },                 //31
+  { 32, 216000,  5120,   20480,  20000 },                 //32
+
+  { 40, 245760,  8192,   32768,  20000 },                 //40
+  { 41, 245760,  8192,   32768,  50000 },                 //41
+  { 42, 491520,  8192,   34816,  50000 },                 //42
+
+  { 50, 589824, 22080,  110400, 135000 },                 //50
+  { 51, 983040, 36864,  184320, 240000 }                  //51
+};
+
+static inline int32_t WelsCheckLevelLimitation (const SWelsSPS* kpSps, const SLevelLimit* kpLevelLimit,
+    float fFrameRate, int32_t iTargetBitRate) { // returns 1 when the stream described by kpSps fits every limit in *kpLevelLimit, otherwise 0
+  uint32_t uiPicWidthInMBs = kpSps->iMbWidth;
+  uint32_t uiPicHeightInMBs = kpSps->iMbHeight;
+  uint32_t uiPicInMBs = uiPicWidthInMBs * uiPicHeightInMBs; // macroblocks per picture
+  uint32_t uiNumRefFrames = kpSps->iNumRefFrames;
+
+  if (kpLevelLimit->uiMaxMbPS < (uint32_t) (uiPicInMBs * fFrameRate)) // macroblock rate too high (the float product is truncated by the cast)
+    return 0;
+  if (kpLevelLimit->uiMaxFS < uiPicInMBs) // picture has too many macroblocks
+    return 0;
+  if ((kpLevelLimit->uiMaxFS << 3) < (uiPicWidthInMBs * uiPicWidthInMBs)) // width in MBs must satisfy width^2 <= 8 * uiMaxFS
+    return 0;
+  if ((kpLevelLimit->uiMaxFS << 3) < (uiPicHeightInMBs * uiPicHeightInMBs)) // height in MBs must satisfy height^2 <= 8 * uiMaxFS
+    return 0;
+  if (kpLevelLimit->uiMaxDPBMB < uiNumRefFrames * uiPicInMBs) // reference frames do not fit in the DPB limit
+    return 0;
+  if (iTargetBitRate
+      && ((int32_t) kpLevelLimit->uiMaxBR  * 1200) < iTargetBitRate)    // RC enabled, considering bitrate constraint; iTargetBitRate == 0 skips this check
+    return 0;
+  // add more checks here if needed in the future
+
+  return 1; // every limit is satisfied
+
+}
+
+static inline int32_t WelsGetLevelIdc (const SWelsSPS* kpSps, float fFrameRate, int32_t iTargetBitRate) { // picks the level identifier for the stream described by kpSps
+  int32_t iOrder; // row index into g_ksLevelLimit
+  for (iOrder = 0; iOrder < LEVEL_NUMBER; iOrder++) { // rows are tried in table order; the first one that passes wins
+    if (WelsCheckLevelLimitation (kpSps, & (g_ksLevelLimit[iOrder]), fFrameRate, iTargetBitRate)) {
+      return (int32_t) (g_ksLevelLimit[iOrder].iLevelIdc);
+    }
+  }
+  return 51; // final decision: no row passed, so select the biggest level
+}
+
+
+/*!
+ *************************************************************************************
+ * \brief	Write the Sequence Parameter Set (SPS) syntax elements into the bitstream
+ *
+ * \param	pSps			SPS to be written
+ * \param	pBitStringAux	bitstream writer auxiliary
+ * \param	pSpsIdDelta		offsets indexed by SPS id; pSpsIdDelta[pSps->uiSpsId] is added to the id that is written
+ *
+ * \return	always 0 (this implementation has no failure path)
+ *
+ * \note	Call it in case EWelsNalUnitType is SPS. No RBSP trailing bits are written and no flush is done here.
+ *************************************************************************************
+ */
+int32_t WelsWriteSpsSyntax (SWelsSPS* pSps, SBitStringAux* pBitStringAux, int32_t* pSpsIdDelta) {
+  SBitStringAux* pLocalBitStringAux = pBitStringAux;
+
+  assert (pSps != NULL && pBitStringAux != NULL);
+
+  BsWriteBits (pLocalBitStringAux, 8, pSps->uiProfileIdc);	// profile_idc
+
+  BsWriteOneBit (pLocalBitStringAux, pSps->bConstraintSet0Flag);	// constraint_set0_flag
+  BsWriteOneBit (pLocalBitStringAux, pSps->bConstraintSet1Flag);	// constraint_set1_flag
+  BsWriteOneBit (pLocalBitStringAux, pSps->bConstraintSet2Flag);	// constraint_set2_flag
+  BsWriteOneBit (pLocalBitStringAux, 0/*pSps->bConstraintSet3Flag*/);	// constraint_set3_flag, hard-coded to 0
+  BsWriteBits (pLocalBitStringAux, 4, 0);							// reserved_zero_4bits, equal to 0
+  BsWriteBits (pLocalBitStringAux, 8, pSps->iLevelIdc);				// level_idc
+  BsWriteUE (pLocalBitStringAux, pSps->uiSpsId + pSpsIdDelta[pSps->uiSpsId]);					     // seq_parameter_set_id, shifted by its per-id delta
+
+  if (PRO_SCALABLE_BASELINE == pSps->uiProfileIdc || PRO_SCALABLE_HIGH == pSps->uiProfileIdc ||
+      PRO_HIGH == pSps->uiProfileIdc || PRO_HIGH10 == pSps->uiProfileIdc ||
+      PRO_HIGH422 == pSps->uiProfileIdc || PRO_HIGH444 == pSps->uiProfileIdc ||
+      PRO_CAVLC444 == pSps->uiProfileIdc || 44 == pSps->uiProfileIdc) { // only these profiles carry the chroma format / bit depth fields
+    BsWriteUE (pLocalBitStringAux, 1);  // chroma_format_idc, now should be 1
+    BsWriteUE (pLocalBitStringAux, 0); // bit_depth_luma_minus8, hard-coded to 0
+    BsWriteUE (pLocalBitStringAux, 0); // bit_depth_chroma_minus8, hard-coded to 0
+    BsWriteOneBit (pLocalBitStringAux, 0); // qpprime_y_zero_transform_bypass_flag
+    BsWriteOneBit (pLocalBitStringAux, 0); // seq_scaling_matrix_present_flag
+  }
+
+  BsWriteUE (pLocalBitStringAux, pSps->uiLog2MaxFrameNum - 4);	// log2_max_frame_num_minus4
+  BsWriteUE (pLocalBitStringAux, 0/*pSps->uiPocType*/);		     // pic_order_cnt_type, hard-coded to 0
+  BsWriteUE (pLocalBitStringAux, pSps->iLog2MaxPocLsb - 4);	// log2_max_pic_order_cnt_lsb_minus4
+
+  BsWriteUE (pLocalBitStringAux, pSps->iNumRefFrames);		// max_num_ref_frames
+  BsWriteOneBit (pLocalBitStringAux, true/*pSps->bGapsInFrameNumValueAllowedFlag*/);	// gaps_in_frame_num_value_allowed_flag, hard-coded to 1
+  BsWriteUE (pLocalBitStringAux, pSps->iMbWidth - 1);		// pic_width_in_mbs_minus1
+  BsWriteUE (pLocalBitStringAux, pSps->iMbHeight - 1);		// pic_height_in_map_units_minus1
+  BsWriteOneBit (pLocalBitStringAux, true/*pSps->bFrameMbsOnlyFlag*/);	// frame_mbs_only_flag, hard-coded to 1
+
+  BsWriteOneBit (pLocalBitStringAux, 0/*pSps->bDirect8x8InferenceFlag*/);	// direct_8x8_inference_flag, hard-coded to 0
+  BsWriteOneBit (pLocalBitStringAux, pSps->bFrameCroppingFlag);	// frame_cropping_flag
+  if (pSps->bFrameCroppingFlag) { // the four crop offsets are present only when the flag is set
+    BsWriteUE (pLocalBitStringAux, pSps->sFrameCrop.iCropLeft);	// frame_crop_left_offset
+    BsWriteUE (pLocalBitStringAux, pSps->sFrameCrop.iCropRight);	// frame_crop_right_offset
+    BsWriteUE (pLocalBitStringAux, pSps->sFrameCrop.iCropTop);	// frame_crop_top_offset
+    BsWriteUE (pLocalBitStringAux, pSps->sFrameCrop.iCropBottom);	// frame_crop_bottom_offset
+  }
+
+  BsWriteOneBit (pLocalBitStringAux, 0/*pSps->bVuiParamPresentFlag*/);	// vui_parameters_present_flag, hard-coded to 0
+
+  return 0;
+}
+
+
+int32_t WelsWriteSpsNal (SWelsSPS* pSps, SBitStringAux* pBitStringAux, int32_t* pSpsIdDelta) { // writes a finished SPS payload; always returns 0
+  WelsWriteSpsSyntax (pSps, pBitStringAux, pSpsIdDelta); // SPS syntax elements; its return value is ignored
+
+  BsRbspTrailingBits (pBitStringAux); // RBSP trailing bits after the syntax elements
+
+  BsFlush (pBitStringAux); // flush the bitstream writer
+
+  return 0;
+}
+
+/*!
+ *************************************************************************************
+ * \brief	Write a Subset Sequence Parameter Set: SPS syntax, SVC extension, trailing bits
+ *
+ * \param	pSubsetSps		subset SPS to be written
+ * \param	pBitStringAux	bitstream writer auxiliary
+ * \param	pSpsIdDelta		offsets indexed by SPS id, passed through to WelsWriteSpsSyntax
+ *
+ * \return	always 0 (this implementation has no failure path)
+ *
+ * \note	Call it in case EWelsNalUnitType is SubSet SPS. The bitstream writer is flushed before returning.
+ *************************************************************************************
+ */
+
+int32_t WelsWriteSubsetSpsSyntax (SSubsetSps* pSubsetSps, SBitStringAux* pBitStringAux , int32_t* pSpsIdDelta) {
+  SWelsSPS* pSps = &pSubsetSps->pSps; // the SPS embedded in the subset SPS
+
+  WelsWriteSpsSyntax (pSps, pBitStringAux, pSpsIdDelta); // common SPS part; its return value is ignored
+
+  if (pSps->uiProfileIdc == PRO_SCALABLE_BASELINE || pSps->uiProfileIdc == PRO_SCALABLE_HIGH) { // the SVC extension is written for the scalable profiles only
+    SSpsSvcExt* pSubsetSpsExt = &pSubsetSps->sSpsSvcExt;
+
+    BsWriteOneBit (pBitStringAux, true/*pSubsetSpsExt->bInterLayerDeblockingFilterCtrlPresentFlag*/); // hard-coded to 1
+    BsWriteBits (pBitStringAux, 2, pSubsetSpsExt->iExtendedSpatialScalability); // 2-bit field
+    BsWriteOneBit (pBitStringAux, 0/*pSubsetSpsExt->uiChromaPhaseXPlus1Flag*/); // hard-coded to 0
+    BsWriteBits (pBitStringAux, 2, 1/*pSubsetSpsExt->uiChromaPhaseYPlus1*/); // hard-coded to 1
+    if (pSubsetSpsExt->iExtendedSpatialScalability == 1) { // reference-layer chroma phase and scaled offsets, all written as constants
+      BsWriteOneBit (pBitStringAux, 0/*pSubsetSpsExt->uiSeqRefLayerChromaPhaseXPlus1Flag*/);
+      BsWriteBits (pBitStringAux, 2, 1/*pSubsetSpsExt->uiSeqRefLayerChromaPhaseYPlus1*/);
+      BsWriteSE (pBitStringAux, 0/*pSubsetSpsExt->sSeqScaledRefLayer.left_offset*/);
+      BsWriteSE (pBitStringAux, 0/*pSubsetSpsExt->sSeqScaledRefLayer.top_offset*/);
+      BsWriteSE (pBitStringAux, 0/*pSubsetSpsExt->sSeqScaledRefLayer.right_offset*/);
+      BsWriteSE (pBitStringAux, 0/*pSubsetSpsExt->sSeqScaledRefLayer.bottom_offset*/);
+    }
+    BsWriteOneBit (pBitStringAux, pSubsetSpsExt->bSeqTcoeffLevelPredFlag);
+    if (pSubsetSpsExt->bSeqTcoeffLevelPredFlag) { // the adaptive flag is present only when the sequence-level flag is set
+      BsWriteOneBit (pBitStringAux, pSubsetSpsExt->bAdaptiveTcoeffLevelPredFlag);
+    }
+    BsWriteOneBit (pBitStringAux, pSubsetSpsExt->bSliceHeaderRestrictionFlag);
+
+    BsWriteOneBit (pBitStringAux, false/*pSubsetSps->bSvcVuiParamPresentFlag*/); // hard-coded to 0
+  }
+  BsWriteOneBit (pBitStringAux, false/*pSubsetSps->bAdditionalExtension2Flag*/); // hard-coded to 0; written for every profile
+
+  BsRbspTrailingBits (pBitStringAux); // RBSP trailing bits after the syntax elements
+
+  BsFlush (pBitStringAux); // flush the bitstream writer
+
+  return 0;
+}
+
+/*!
+ *************************************************************************************
+ * \brief	to write Picture Parameter Set (PPS)
+ *
+ * \param	pPps		PPS whose fields are serialized
+ * \param	pBitStringAux	bitstream writer auxiliary
+ * \param	sPSOVector	parameter set id offsets; its deltas are added to the PPS/SPS ids that get written
+ *
+ * \return	0 - succeeded (this function has no failure path)
+ *
+ * \note	Call it in case EWelsNalUnitType is PPS. Elements written as constants name their PPS field inline.
+ *************************************************************************************
+ */
+int32_t WelsWritePpsSyntax (SWelsPPS* pPps, SBitStringAux* pBitStringAux, SParaSetOffset* sPSOVector) {
+  SBitStringAux* pLocalBitStringAux = pBitStringAux;
+
+  bool_t bUsedSubset    =  sPSOVector->bPpsIdMappingIntoSubsetsps[pPps->iPpsId];
+  int32_t iParameterSetType = (bUsedSubset ? PARA_SET_TYPE_SUBSETSPS : PARA_SET_TYPE_AVCSPS);
+
+  BsWriteUE (pLocalBitStringAux, pPps->iPpsId +
+             sPSOVector->sParaSetOffsetVariable[PARA_SET_TYPE_PPS].iParaSetIdDelta[pPps->iPpsId]);
+  BsWriteUE (pLocalBitStringAux, pPps->iSpsId +
+             sPSOVector->sParaSetOffsetVariable[iParameterSetType].iParaSetIdDelta[pPps->iSpsId]);
+
+#if _DEBUG
+  // debug-only sanity checks on the remapped ids written above (SParaSetOffset use, 110421)
+  if (sPSOVector->bEnableSpsPpsIdAddition) {
+    const int32_t kiTmpSpsIdInBs = pPps->iSpsId +
+                                   sPSOVector->sParaSetOffsetVariable[iParameterSetType].iParaSetIdDelta[pPps->iSpsId];
+    const int32_t tmp_pps_id_in_bs = pPps->iPpsId +
+                                     sPSOVector->sParaSetOffsetVariable[PARA_SET_TYPE_PPS].iParaSetIdDelta[pPps->iPpsId];
+    assert (MAX_SPS_COUNT > kiTmpSpsIdInBs);
+    assert (MAX_PPS_COUNT > tmp_pps_id_in_bs);
+    assert (sPSOVector->sParaSetOffsetVariable[iParameterSetType].bUsedParaSetIdInBs[kiTmpSpsIdInBs]);
+  }
+#endif
+
+  BsWriteOneBit (pLocalBitStringAux, false/*pPps->entropy_coding_mode_flag*/);
+  BsWriteOneBit (pLocalBitStringAux, false/*pPps->bPicOrderPresentFlag*/);
+
+#ifdef DISABLE_FMO_FEATURE
+  BsWriteUE (pLocalBitStringAux, 0/*pPps->uiNumSliceGroups - 1*/);
+#else
+  BsWriteUE (pLocalBitStringAux, pPps->uiNumSliceGroups - 1);
+  if (pPps->uiNumSliceGroups > 1) {
+    uint32_t i, uiNumBits;
+
+    BsWriteUE (pLocalBitStringAux, pPps->uiSliceGroupMapType);
+
+    switch (pPps->uiSliceGroupMapType) {
+    case 0:
+      for (i = 0; i < pPps->uiNumSliceGroups; i ++) {
+        BsWriteUE (pLocalBitStringAux, pPps->uiRunLength[i] - 1);
+      }
+      break;
+    case 2:  // top_left/bottom_right are coded for every slice group except the last one (H.264 7.3.2.2)
+      for (i = 0; i + 1 < pPps->uiNumSliceGroups; i ++) {
+        BsWriteUE (pLocalBitStringAux, pPps->uiTopLeft[i]);
+        BsWriteUE (pLocalBitStringAux, pPps->uiBottomRight[i]);
+      }
+      break;
+    case 3:
+    case 4:
+    case 5:
+      BsWriteOneBit (pLocalBitStringAux, pPps->bSliceGroupChangeDirectionFlag);
+      BsWriteUE (pLocalBitStringAux, pPps->uiSliceGroupChangeRate - 1);
+      break;
+    case 6:
+      BsWriteUE (pLocalBitStringAux, pPps->uiPicSizeInMapUnits - 1);
+      uiNumBits = 0;  // slice_group_id is u(v) with v = Ceil(Log2(number of slice groups)) bits
+      while ((1u << uiNumBits) < pPps->uiNumSliceGroups) ++ uiNumBits;
+      for (i = 0; i < pPps->uiPicSizeInMapUnits; i ++) {
+        BsWriteBits (pLocalBitStringAux, uiNumBits, pPps->uiSliceGroupId[i]);
+      }
+      break;
+    default:
+      break;
+    }
+  }
+#endif//!DISABLE_FMO_FEATURE
+
+  BsWriteUE (pLocalBitStringAux, 0/*pPps->uiNumRefIdxL0Active - 1*/);
+  BsWriteUE (pLocalBitStringAux, 0/*pPps->uiNumRefIdxL1Active - 1*/);
+
+  BsWriteOneBit (pLocalBitStringAux, false/*pPps->bWeightedPredFlag*/);
+  BsWriteBits (pLocalBitStringAux, 2, 0/*pPps->uiWeightedBiPredIdc*/);
+
+  BsWriteSE (pLocalBitStringAux, pPps->iPicInitQp - 26);
+  BsWriteSE (pLocalBitStringAux, pPps->iPicInitQs - 26);
+
+  BsWriteSE (pLocalBitStringAux, pPps->uiChromaQpIndexOffset);
+  BsWriteOneBit (pLocalBitStringAux, pPps->bDeblockingFilterControlPresentFlag);
+  BsWriteOneBit (pLocalBitStringAux, false/*pPps->bConstainedIntraPredFlag*/);
+  BsWriteOneBit (pLocalBitStringAux, false/*pPps->bRedundantPicCntPresentFlag*/);
+
+  BsRbspTrailingBits (pLocalBitStringAux);
+
+  BsFlush (pLocalBitStringAux);
+
+  return 0;
+}
+
+static inline bool_t WelsGetPaddingOffset (int32_t iActualWidth, int32_t iActualHeight,  int32_t iWidth,
+    int32_t iHeight, SCropOffset& pOffset) {
+  // The coded frame has to cover the actual picture; if it does not, pOffset is left untouched.
+  if (! ((iWidth >= iActualWidth) && (iHeight >= iActualHeight)))
+    return false;
+
+  // Excess of the coded size over the actual size once the latter is rounded down to even.
+  const int32_t kiExcessWidth  = iWidth - (iActualWidth - (iActualWidth & 1));
+  const int32_t kiExcessHeight = iHeight - (iActualHeight - (iActualHeight & 1));
+
+  // Only the right and bottom edges are cropped, each by half of the excess; true means cropping is needed.
+  pOffset.iCropLeft = pOffset.iCropTop = 0;
+  pOffset.iCropRight  = kiExcessWidth / 2;
+  pOffset.iCropBottom = kiExcessHeight / 2;
+  return (kiExcessWidth > 0) || (kiExcessHeight > 0);
+}
+
+int32_t WelsInitSps (SWelsSPS* pSps, SDLayerParam* pLayerParam, const uint32_t kuiIntraPeriod,
+                     const int32_t kiNumRefFrame,
+                     const uint32_t kuiSpsId, const bool_t kbEnableFrameCropping, bool_t bEnableRc) {
+  // Resets *pSps and fills it from the layer parameters; always returns 0.
+  // max value of both iFrameNum and POC are 2^16-1 and iPOC=2*iFrameNum, so log2 (MaxFrameNum) is at most 15.
+  const uint32_t kuiMaxLog2FrameNum = 15;
+
+  memset (pSps, 0, sizeof (SWelsSPS));
+  pSps->uiSpsId   = kuiSpsId;
+  pSps->iMbWidth  = (pLayerParam->iFrameWidth + 15) >> 4;  // frame size in 16x16 macroblocks, rounded up
+  pSps->iMbHeight = (pLayerParam->iFrameHeight + 15) >> 4;
+
+  if (0 == kuiIntraPeriod) {
+    pSps->uiLog2MaxFrameNum = kuiMaxLog2FrameNum;
+  } else {
+    // smallest exponent in [4, 15] with 2^exponent > intra period; the upper bound keeps the shift well defined
+    pSps->uiLog2MaxFrameNum = 4;
+    while ((pSps->uiLog2MaxFrameNum < kuiMaxLog2FrameNum)
+           && ((uint32_t) (1 << pSps->uiLog2MaxFrameNum) <= kuiIntraPeriod)) {
+      ++ pSps->uiLog2MaxFrameNum;
+    }
+  }
+  pSps->iLog2MaxPocLsb = 1 + pSps->uiLog2MaxFrameNum;
+  pSps->iNumRefFrames  = kiNumRefFrame;  /* min pRef size when fifo pRef operation*/
+
+  if (kbEnableFrameCropping) {
+    // TODO: get frame_crop_left_offset, frame_crop_right_offset, frame_crop_top_offset, frame_crop_bottom_offset
+    pSps->bFrameCroppingFlag = WelsGetPaddingOffset (pLayerParam->iActualWidth, pLayerParam->iActualHeight,
+                               pLayerParam->iFrameWidth, pLayerParam->iFrameHeight, pSps->sFrameCrop);
+  } else {
+    pSps->bFrameCroppingFlag = false;
+  }
+  pSps->uiProfileIdc = pLayerParam->uiProfileIdc ? pLayerParam->uiProfileIdc : PRO_BASELINE;
+
+  if (bEnableRc)  //fixed QP condition
+    pSps->iLevelIdc = WelsGetLevelIdc (pSps, pLayerParam->fOutputFrameRate, pLayerParam->iSpatialBitrate);
+  else  // Set tar_br = 0 to remove the bitrate constraint; a better way is to set actual tar_br as 0
+    pSps->iLevelIdc = WelsGetLevelIdc (pSps, pLayerParam->fOutputFrameRate, 0);
+  return 0;
+}
+
+
+int32_t WelsInitSubsetSps (SSubsetSps* pSubsetSps, SDLayerParam* pLayerParam, const uint32_t kuiIntraPeriod,
+                           const int32_t kiNumRefFrame,
+                           const uint32_t kuiSpsId, const bool_t kbEnableFrameCropping, bool_t bEnableRc) {
+  // Zero the whole subset SPS, then let WelsInitSps() fill the embedded SPS; always returns 0.
+  memset (pSubsetSps, 0, sizeof (SSubsetSps));
+  WelsInitSps (&pSubsetSps->pSps, pLayerParam, kuiIntraPeriod, kiNumRefFrame, kuiSpsId, kbEnableFrameCropping,
+               bEnableRc);
+  // A configured profile below PRO_SCALABLE_BASELINE is replaced by PRO_SCALABLE_BASELINE.
+  if (pLayerParam->uiProfileIdc >= PRO_SCALABLE_BASELINE)
+    pSubsetSps->pSps.uiProfileIdc = pLayerParam->uiProfileIdc;
+  else
+    pSubsetSps->pSps.uiProfileIdc = PRO_SCALABLE_BASELINE;
+  // SVC extension defaults
+  pSubsetSps->sSpsSvcExt.bSliceHeaderRestrictionFlag  = true;
+  pSubsetSps->sSpsSvcExt.bSeqTcoeffLevelPredFlag      = false;
+  pSubsetSps->sSpsSvcExt.bAdaptiveTcoeffLevelPredFlag = false;
+  pSubsetSps->sSpsSvcExt.iExtendedSpatialScalability  = 0;  /* ESS is 0 in default */
+  return 0;
+}
+
+int32_t WelsInitPps (SWelsPPS* pPps,
+                     SWelsSPS* pSps,
+                     SSubsetSps* pSubsetSps,
+                     const uint32_t kuiPpsId,
+                     const bool_t kbDeblockingFilterPresentFlag,
+                     const bool_t kbUsingSubsetSps) {
+  // Returns 1 if pPps or the SPS selected by kbUsingSubsetSps is NULL; otherwise fills *pPps and returns 0.
+  if (NULL == pPps || (NULL == pSps && NULL == pSubsetSps))
+    return 1;
+  SWelsSPS* pUsedSps = NULL;
+  if (kbUsingSubsetSps) {
+    assert (pSubsetSps != NULL);
+    if (NULL != pSubsetSps)
+      pUsedSps = &pSubsetSps->pSps;
+  } else {
+    assert (pSps != NULL);
+    pUsedSps = pSps;
+  }
+  if (NULL == pUsedSps)
+    return 1;
+
+  /* fill picture parameter set syntax */
+  pPps->iSpsId = pUsedSps->uiSpsId;
+  pPps->iPpsId = kuiPpsId;
+#if !defined(DISABLE_FMO_FEATURE)
+  pPps->uiNumSliceGroups = 1;  //param->qos_param.sliceGroupCount;
+  if (pPps->uiNumSliceGroups > 1) {
+    pPps->uiSliceGroupMapType = 0;  //param->qos_param.sliceGroupType;
+    switch (pPps->uiSliceGroupMapType) {
+    case 0:
+      for (uint32_t uiGroup = 0; uiGroup < pPps->uiNumSliceGroups; ++ uiGroup)
+        pPps->uiRunLength[uiGroup] = 25;
+      break;
+    case 2:
+      memset (&pPps->uiTopLeft[0], 0, MAX_SLICEGROUP_IDS * sizeof (pPps->uiTopLeft[0]));
+      memset (&pPps->uiBottomRight[0], 0, MAX_SLICEGROUP_IDS * sizeof (pPps->uiBottomRight[0]));
+      break;
+    case 3:
+    case 4:
+    case 5:
+      pPps->bSliceGroupChangeDirectionFlag = false;
+      pPps->uiSliceGroupChangeRate = 0;
+      break;
+    case 6:
+      pPps->uiPicSizeInMapUnits = 1;
+      memset (&pPps->uiSliceGroupId[0], 0, MAX_SLICEGROUP_IDS * sizeof (pPps->uiSliceGroupId[0]));
+      break;
+    }
+  }
+#endif//!DISABLE_FMO_FEATURE
+  pPps->iPicInitQp = 26;
+  pPps->iPicInitQs = 26;
+  pPps->uiChromaQpIndexOffset = 0;
+  pPps->bDeblockingFilterControlPresentFlag = kbDeblockingFilterPresentFlag;
+  return 0;
+}
+} // namespace WelsSVCEnc
--- a/codec/encoder/core/src/cpu.cpp
+++ b/codec/encoder/core/src/cpu.cpp
@@ -1,213 +1,196 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	cpu.c
- *
- * \brief	CPU compatibility detection
- *
- * \date	04/29/2009 Created
- *
- *************************************************************************************
- */
-
-#include <string.h>
-
-#include "cpu.h"
-#include "cpu_core.h"
-
-
-namespace WelsSVCEnc {
-#define    CPU_Vender_AMD    "AuthenticAMD"
-#define    CPU_Vender_INTEL  "GenuineIntel"
-#define    CPU_Vender_CYRIX  "CyrixInstead"
-
-
-#if defined(X86_ASM)
-
-uint32_t WelsCPUFeatureDetect( int32_t *pNumberOfLogicProcessors )
-{
-    uint32_t uiCPU = 0;	
-    uint32_t uiFeatureA = 0, uiFeatureB = 0, uiFeatureC = 0, uiFeatureD = 0;
-	int32_t  CacheLineSize = 0;
-	int8_t   chVenderName[16] = { 0 };	
-	
-    if( !WelsCPUIdVerify() )
-    {
-        /* cpuid is not supported in cpu */
-        return 0;
-    }
-	
-	WelsCPUId( 0, &uiFeatureA, (uint32_t*)&chVenderName[0],(uint32_t*)&chVenderName[8],(uint32_t*)&chVenderName[4] );
-    if( uiFeatureA == 0 )
-    {
-		/* maximum input value for basic cpuid information */
-        return 0;
-    }
-	
-	WelsCPUId( 1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD );
-    if( (uiFeatureD & 0x00800000) == 0 )
-    {
-        /* Basic MMX technology is not support in cpu, mean nothing for us so return here */
-        return 0;
-    }
-	
-    uiCPU = WELS_CPU_MMX;
-    if( uiFeatureD & 0x02000000 )
-    {
-        /* SSE technology is identical to AMD MMX extensions */
-        uiCPU |= WELS_CPU_MMXEXT|WELS_CPU_SSE;
-    }
-    if( uiFeatureD & 0x04000000 )
-    {
-        /* SSE2 support here */
-        uiCPU |= WELS_CPU_SSE2;
-    }
-	if ( uiFeatureD & 0x00000001 )
-	{
-		/* x87 FPU on-chip checking */
-		uiCPU |= WELS_CPU_FPU;
-	}
-	if ( uiFeatureD & 0x00008000 )
-	{
-		/* CMOV instruction checking */
-		uiCPU |= WELS_CPU_CMOV;
-	}
-	if ( !strcmp((const str_t*)chVenderName,CPU_Vender_INTEL) )	// confirmed_safe_unsafe_usage
-	{
-		if ( uiFeatureD & 0x10000000 )
-		{
-			/* Multi-Threading checking: contains of multiple logic processors */
-			uiCPU |= WELS_CPU_HTT;
-		}
-	}	
-
-	if( uiFeatureC & 0x00000001 ){
-		/* SSE3 support here */
-		uiCPU |= WELS_CPU_SSE3;
-	}
-	if( uiFeatureC & 0x00000200 ){
-		/* SSSE3 support here */
-		uiCPU |= WELS_CPU_SSSE3;
-	}
-	if( uiFeatureC & 0x00080000 ){
-		/* SSE4.1 support here, 45nm Penryn processor */
-		uiCPU |= WELS_CPU_SSE41; 
-	}
-	if( uiFeatureC & 0x00100000 ){
-		/* SSE4.2 support here, next generation Nehalem processor */
-		uiCPU |= WELS_CPU_SSE42;
-	}
-	if ( WelsCPUSupportAVX( uiFeatureA, uiFeatureC ) )	// 
-	{
-		/* AVX supported */
-		uiCPU |= WELS_CPU_AVX;
-	}
-	if ( WelsCPUSupportFMA( uiFeatureA, uiFeatureC ) )	// 
-	{
-		/* AVX FMA supported */
-		uiCPU |= WELS_CPU_FMA;
-	}
-	if ( uiFeatureC & 0x02000000 )
-	{
-		/* AES checking */
-		uiCPU |= WELS_CPU_AES;
-	}
-	if ( uiFeatureC & 0x00400000 )
-	{
-		/* MOVBE checking */
-		uiCPU |= WELS_CPU_MOVBE;
-	}
-
-	if ( pNumberOfLogicProcessors != NULL )
-	{
-		// HTT enabled on chip
-		*pNumberOfLogicProcessors = (uiFeatureB & 0x00ff0000) >> 16; // feature bits: 23-16 on returned EBX		
-	}	
-	
-    WelsCPUId( 0x80000000, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD );
-
-	if( (!strcmp((const str_t*)chVenderName,CPU_Vender_AMD)) && (uiFeatureA>=0x80000001) ){	// confirmed_safe_unsafe_usage
-		WelsCPUId(0x80000001, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD );
-		if( uiFeatureD&0x00400000 ){
-			uiCPU |= WELS_CPU_MMXEXT;
-		}
-		if( uiFeatureD&0x80000000 ){
-			uiCPU |= WELS_CPU_3DNOW;
-		}
-	}
-
-	if( !strcmp((const str_t*)chVenderName,CPU_Vender_INTEL) ){	// confirmed_safe_unsafe_usage
-		int32_t  family, model;
-
-		WelsCPUId(1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
-		family = ((uiFeatureA>>8)&0xf) + ((uiFeatureA>>20)&0xff);
-        model  = ((uiFeatureA>>4)&0xf) + ((uiFeatureA>>12)&0xf0);
-
-		if( (family==6) && (model==9 || model==13 || model==14) ){
-			uiCPU &= ~(WELS_CPU_SSE2|WELS_CPU_SSE3);
-		}
-	}
-
-	// get cache line size
-	if( (!strcmp((const str_t*)chVenderName,CPU_Vender_INTEL)) || !(strcmp((const str_t*)chVenderName,CPU_Vender_CYRIX)) ){	// confirmed_safe_unsafe_usage
-		WelsCPUId(1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
-
-		CacheLineSize = (uiFeatureB&0xff00)>>5;	// ((clflush_line_size >> 8) << 3), CLFLUSH_line_size * 8 = CacheLineSize_in_byte
-
-		if( CacheLineSize == 128 ){
-			uiCPU |= WELS_CPU_CACHELINE_128;
-		}
-		else if( CacheLineSize == 64 ){
-			uiCPU |= WELS_CPU_CACHELINE_64;
-		}
-		else if( CacheLineSize == 32 ){
-			uiCPU |= WELS_CPU_CACHELINE_32;
-		}
-		else if( CacheLineSize == 16 ){
-			uiCPU |= WELS_CPU_CACHELINE_16;
-		}
-	}
-	
-    return uiCPU;
-}
-
-
-void WelsCPURestore( const uint32_t kuiCPU )
-{
-    if( kuiCPU & (WELS_CPU_MMX|WELS_CPU_MMXEXT|WELS_CPU_3DNOW|WELS_CPU_3DNOWEXT) )
-    {
-        WelsEmms();
-    }
-}
-
-#endif
-	
-}
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	cpu.c
+ *
+ * \brief	CPU compatibility detection
+ *
+ * \date	04/29/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#include <string.h>
+
+#include "cpu.h"
+#include "cpu_core.h"
+
+
+namespace WelsSVCEnc {
+#define    CPU_Vender_AMD    "AuthenticAMD"
+#define    CPU_Vender_INTEL  "GenuineIntel"
+#define    CPU_Vender_CYRIX  "CyrixInstead"
+
+
+#if defined(X86_ASM)
+// Probes the CPU via WelsCPUIdVerify()/WelsCPUId() and returns the detected features as a mask of WELS_CPU_* bits.
+uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) {
+  uint32_t uiCPU = 0;
+  uint32_t uiFeatureA = 0, uiFeatureB = 0, uiFeatureC = 0, uiFeatureD = 0;
+  int32_t  CacheLineSize = 0;
+  int8_t   chVenderName[16] = { 0 };
+
+  if (!WelsCPUIdVerify()) {
+    /* cpuid is not supported in cpu */
+    return 0;
+  }
+  // leaf 0: the 2nd/4th/3rd outputs land at offsets 0/4/8 of chVenderName, which is later compared to CPU_Vender_*
+  WelsCPUId (0, &uiFeatureA, (uint32_t*)&chVenderName[0], (uint32_t*)&chVenderName[8], (uint32_t*)&chVenderName[4]);
+  if (uiFeatureA == 0) {
+    /* maximum input value for basic cpuid information is 0, so there is nothing further to query */
+    return 0;
+  }
+  // leaf 1: feature flags; bits of uiFeatureD and uiFeatureC are tested one by one below
+  WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+  if ((uiFeatureD & 0x00800000) == 0) {
+    /* Basic MMX technology is not support in cpu, mean nothing for us so return here */
+    return 0;
+  }
+
+  uiCPU = WELS_CPU_MMX;
+  if (uiFeatureD & 0x02000000) {
+    /* SSE technology is identical to AMD MMX extensions */
+    uiCPU |= WELS_CPU_MMXEXT | WELS_CPU_SSE;
+  }
+  if (uiFeatureD & 0x04000000) {
+    /* SSE2 support here */
+    uiCPU |= WELS_CPU_SSE2;
+  }
+  if (uiFeatureD & 0x00000001) {
+    /* x87 FPU on-chip checking */
+    uiCPU |= WELS_CPU_FPU;
+  }
+  if (uiFeatureD & 0x00008000) {
+    /* CMOV instruction checking */
+    uiCPU |= WELS_CPU_CMOV;
+  }
+  if (!strcmp ((const str_t*)chVenderName, CPU_Vender_INTEL)) {	// confirmed_safe_unsafe_usage
+    if (uiFeatureD & 0x10000000) {
+      /* Multi-Threading checking: contains of multiple logic processors */
+      uiCPU |= WELS_CPU_HTT;
+    }
+  }
+
+  if (uiFeatureC & 0x00000001) {
+    /* SSE3 support here */
+    uiCPU |= WELS_CPU_SSE3;
+  }
+  if (uiFeatureC & 0x00000200) {
+    /* SSSE3 support here */
+    uiCPU |= WELS_CPU_SSSE3;
+  }
+  if (uiFeatureC & 0x00080000) {
+    /* SSE4.1 support here, 45nm Penryn processor */
+    uiCPU |= WELS_CPU_SSE41;
+  }
+  if (uiFeatureC & 0x00100000) {
+    /* SSE4.2 support here, next generation Nehalem processor */
+    uiCPU |= WELS_CPU_SSE42;
+  }
+  if (WelsCPUSupportAVX (uiFeatureA, uiFeatureC)) {	// helper defined elsewhere; receives the leaf-1 A and C outputs
+    /* AVX supported */
+    uiCPU |= WELS_CPU_AVX;
+  }
+  if (WelsCPUSupportFMA (uiFeatureA, uiFeatureC)) {	// helper defined elsewhere; receives the leaf-1 A and C outputs
+    /* AVX FMA supported */
+    uiCPU |= WELS_CPU_FMA;
+  }
+  if (uiFeatureC & 0x02000000) {
+    /* AES checking */
+    uiCPU |= WELS_CPU_AES;
+  }
+  if (uiFeatureC & 0x00400000) {
+    /* MOVBE checking */
+    uiCPU |= WELS_CPU_MOVBE;
+  }
+  // the optional output below is only written here, i.e. not on any of the early "return 0" paths above
+  if (pNumberOfLogicProcessors != NULL) {
+    // HTT enabled on chip
+    *pNumberOfLogicProcessors = (uiFeatureB & 0x00ff0000) >> 16; // feature bits: 23-16 on returned EBX
+  }
+  // leaf 0x80000000 overwrites all four outputs; uiFeatureA is then checked before leaf 0x80000001 is queried
+  WelsCPUId (0x80000000, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+
+  if ((!strcmp ((const str_t*)chVenderName, CPU_Vender_AMD))
+      && (uiFeatureA >= 0x80000001)) {	// confirmed_safe_unsafe_usage
+    WelsCPUId (0x80000001, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+    if (uiFeatureD & 0x00400000) {
+      uiCPU |= WELS_CPU_MMXEXT;
+    }
+    if (uiFeatureD & 0x80000000) {
+      uiCPU |= WELS_CPU_3DNOW;
+    }
+  }
+  // Intel family 6, models 9/13/14: SSE2 and SSE3 are masked out again. NOTE(review): reason not stated here - confirm
+  if (!strcmp ((const str_t*)chVenderName, CPU_Vender_INTEL)) {	// confirmed_safe_unsafe_usage
+    int32_t  family, model;
+
+    WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+    family = ((uiFeatureA >> 8) & 0xf) + ((uiFeatureA >> 20) & 0xff);
+    model  = ((uiFeatureA >> 4) & 0xf) + ((uiFeatureA >> 12) & 0xf0);
+
+    if ((family == 6) && (model == 9 || model == 13 || model == 14)) {
+      uiCPU &= ~ (WELS_CPU_SSE2 | WELS_CPU_SSE3);
+    }
+  }
+
+  // get cache line size (done for the Intel and Cyrix vendor strings only; leaf 1 is queried once more)
+  if ((!strcmp ((const str_t*)chVenderName, CPU_Vender_INTEL))
+      || ! (strcmp ((const str_t*)chVenderName, CPU_Vender_CYRIX))) {	// confirmed_safe_unsafe_usage
+    WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+
+    CacheLineSize = (uiFeatureB & 0xff00) >>
+                    5;	// ((clflush_line_size >> 8) << 3), CLFLUSH_line_size * 8 = CacheLineSize_in_byte
+    // only the sizes 128/64/32/16 map to a WELS_CPU_CACHELINE_* bit; any other value sets none
+    if (CacheLineSize == 128) {
+      uiCPU |= WELS_CPU_CACHELINE_128;
+    } else if (CacheLineSize == 64) {
+      uiCPU |= WELS_CPU_CACHELINE_64;
+    } else if (CacheLineSize == 32) {
+      uiCPU |= WELS_CPU_CACHELINE_32;
+    } else if (CacheLineSize == 16) {
+      uiCPU |= WELS_CPU_CACHELINE_16;
+    }
+  }
+
+  return uiCPU;
+}
+
+
+void WelsCPURestore (const uint32_t kuiCPU) {  // calls WelsEmms() iff an MMX-family bit is set in kuiCPU
+  const uint32_t kuiMmxFamily = WELS_CPU_MMX | WELS_CPU_MMXEXT | WELS_CPU_3DNOW | WELS_CPU_3DNOWEXT;
+  if (0 != (kuiCPU & kuiMmxFamily))
+    WelsEmms();
+}
+
+#endif
+
+}
--- a/codec/encoder/core/src/deblocking.cpp
+++ b/codec/encoder/core/src/deblocking.cpp
@@ -1,1091 +1,1012 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	deblocking.c
- *
- * \brief	Interfaces introduced in frame deblocking filtering
- *
- * \date	08/03/2009 Created
- *
- *************************************************************************************
- */
-
-#include "as264_common.h"
-#include "deblocking.h"
-#include "cpu_core.h"
-#include "array_stack_align.h"
-
-namespace WelsSVCEnc {
-
-#define g_kuiAlphaTable(x) g_kuiAlphaTable[(x)]
-#define g_kiBetaTable(x)  g_kiBetaTable[(x)]
-#define g_kiTc0Table(x)   g_kiTc0Table[(x)]
-
-#define MB_BS_MV(sCurMv, sNeighMv, uiBIdx, uiBnIdx) \
-	(\
-	( WELS_ABS( sCurMv[uiBIdx].iMvX - sNeighMv[uiBnIdx].iMvX ) >= 4 ) ||\
-	( WELS_ABS( sCurMv[uiBIdx].iMvY - sNeighMv[uiBnIdx].iMvY ) >= 4 )\
-	)
-
-#define SMB_EDGE_MV(uiRefIndex, sMotionVector, uiBIdx, uiBnIdx) \
-	(\
-	!!((WELS_ABS(sMotionVector[uiBIdx].iMvX - sMotionVector[uiBnIdx].iMvX) &(~3)) | (WELS_ABS(sMotionVector[uiBIdx].iMvY - sMotionVector[uiBnIdx].iMvY) &(~3)))\
-	)
-
-#define BS_EDGE(bsx1, uiRefIndex, sMotionVector, uiBIdx, uiBnIdx) \
-	( (bsx1|SMB_EDGE_MV(uiRefIndex, sMotionVector, uiBIdx, uiBnIdx))<<(bsx1?1:0))
-
-#define GET_ALPHA_BETA_FROM_QP(QP, iAlphaOffset, iBetaOffset, iIdexA, iAlpha, iBeta) \
-{\
-	iIdexA = (QP + iAlphaOffset);\
-	iIdexA = CLIP3_QP_0_51(iIdexA);\
-	iAlpha = g_kuiAlphaTable(iIdexA);\
-	iBeta  = g_kiBetaTable((CLIP3_QP_0_51(QP + iBetaOffset)));\
-}
-
-static const uint8_t g_kuiAlphaTable[52+12] = { //this table refers to Table 8-16 in H.264/AVC standard
-	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-	0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
-	7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
-	25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
-	80, 90,101,113,127,144,162,182,203,226,
-	255, 255
-	,255, 255,255, 255,255, 255,255, 255,255, 255,255, 255
-};
-
-static const int8_t g_kiBetaTable[52+12] = { //this table refers to Table 8-16 in H.264/AVC standard
-	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-	0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
-	3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
-	8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
-	13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
-	18, 18
-	,18, 18,18, 18,18, 18,18, 18,18, 18,18, 18
-};
-
-static const int8_t g_kiTc0Table[52+12][4] = { //this table refers Table 8-17 in H.264/AVC standard
-	{ -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 },
-	{ -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 },
-	{ -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 1 },
-	{ -1, 0, 0, 1 }, { -1, 0, 0, 1 }, { -1, 0, 0, 1 }, { -1, 0, 1, 1 }, { -1, 0, 1, 1 }, { -1, 1, 1, 1 },
-	{ -1, 1, 1, 1 }, { -1, 1, 1, 1 }, { -1, 1, 1, 1 }, { -1, 1, 1, 2 }, { -1, 1, 1, 2 }, { -1, 1, 1, 2 },
-	{ -1, 1, 1, 2 }, { -1, 1, 2, 3 }, { -1, 1, 2, 3 }, { -1, 2, 2, 3 }, { -1, 2, 2, 4 }, { -1, 2, 3, 4 },
-	{ -1, 2, 3, 4 }, { -1, 3, 3, 5 }, { -1, 3, 4, 6 }, { -1, 3, 4, 6 }, { -1, 4, 5, 7 }, { -1, 4, 5, 8 },
-	{ -1, 4, 6, 9 }, { -1, 5, 7,10 }, { -1, 6, 8,11 }, { -1, 6, 8,13 }, { -1, 7,10,14 }, { -1, 8,11,16 },
-	{ -1, 9,12,18 }, { -1, 10,13,20 }, {-1,11,15,23 }, { -1,13,17,25 }
-	,{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 }
-	,{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 }
-};
-
-static const uint8_t g_kuiTableBIdx[2][8] =   
-{     
-	{0,  4,  8,  12, // g_kuiTableBIdx
-	3,  7,  11, 15}, // table_bn_idx
-
-	{0,  1,  2,  3 , // g_kuiTableBIdx
-	12, 13, 14, 15}, // table_bn_idx
-};
-
-static const ALIGNED_DECLARE(int32_t,g_kiTableBlock8x8Idx[2][4][4],16) =   
-{ 
-	{0, 0, 2, 2,
-	 0, 0, 2, 2,
-	 1, 1, 3, 3,
-	 1, 1, 3, 3},
-	
-	{0, 0, 1, 1,
-	 0, 0, 1, 1,
-	 2, 2, 3, 3,
-	 2, 2, 3, 3}
-};
-static const ALIGNED_DECLARE(int32_t,g_kiTableBlock8x8NIdx[2][4][4],16) = 	
-{  
-	{1, 1, 3, 3,
-	 0, 0, 2, 2,
-	 0, 0, 2, 2,
-	 1, 1, 3, 3},
-
-	 {2, 2, 3, 3,
-	  0, 0, 1, 1,
-	  0, 0, 1, 1,
-	  2, 2, 3, 3}
-};
-
-#define TC0_TBL_LOOKUP(iTc, iIdexA, pBS, bchroma) \
-{\
-	iTc[0] = g_kiTc0Table(iIdexA)[pBS[0]] + bchroma;\
-	iTc[1] = g_kiTc0Table(iIdexA)[pBS[1]] + bchroma;\
-	iTc[2] = g_kiTc0Table(iIdexA)[pBS[2]] + bchroma;\
-	iTc[3] = g_kiTc0Table(iIdexA)[pBS[3]] + bchroma;\
-}
-
-void inline DeblockingBSInsideMBAvsbase( int8_t* pNnzTab, uint8_t uiBS[2][4][4], int32_t iLShiftFactor )
-{
-	uint32_t uiNnz32b0, uiNnz32b1, uiNnz32b2, uiNnz32b3;
-	ENFORCE_STACK_ALIGN_1D( uint8_t, uiBsx3, 4, 4 );
-
-	uiNnz32b0 = *(uint32_t *)(pNnzTab+0);
-	uiNnz32b1 = *(uint32_t *)(pNnzTab+4);
-	uiNnz32b2 = *(uint32_t *)(pNnzTab+8);
-	uiNnz32b3 = *(uint32_t *)(pNnzTab+12);
-
-	*(uint32_t *)uiBsx3 = (uiNnz32b0|(uiNnz32b0>>8))<<iLShiftFactor;
-	uiBS[0][1][0] = uiBsx3[0];
-	uiBS[0][2][0] = uiBsx3[1];
-	uiBS[0][3][0] = uiBsx3[2];
-
-	*(uint32_t *)uiBsx3 = (uiNnz32b1|(uiNnz32b1>>8))<<iLShiftFactor;
-	uiBS[0][1][1] = uiBsx3[0];
-	uiBS[0][2][1] = uiBsx3[1];
-	uiBS[0][3][1] = uiBsx3[2];
-	*(uint32_t *)uiBS[1][1] = (uiNnz32b0|uiNnz32b1)<<iLShiftFactor;
-
-	*(uint32_t *)uiBsx3 = (uiNnz32b2|(uiNnz32b2>>8))<<iLShiftFactor;
-	uiBS[0][1][2] = uiBsx3[0];
-	uiBS[0][2][2] = uiBsx3[1];
-	uiBS[0][3][2] = uiBsx3[2];
-	*(uint32_t *)uiBS[1][2] = (uiNnz32b1|uiNnz32b2)<<iLShiftFactor;
-
-	*(uint32_t *)uiBsx3 = (uiNnz32b3|(uiNnz32b3>>8))<<iLShiftFactor;
-	uiBS[0][1][3] = uiBsx3[0];
-	uiBS[0][2][3] = uiBsx3[1];
-	uiBS[0][3][3] = uiBsx3[2];	
-	*(uint32_t *)uiBS[1][3] = (uiNnz32b2|uiNnz32b3)<<iLShiftFactor;
-
-}
-
-void inline DeblockingBSInsideMBNormal( SMB* pCurMb, uint8_t uiBS[2][4][4], int8_t* pNnzTab )
-{
-	uint32_t uiNnz32b0, uiNnz32b1, uiNnz32b2, uiNnz32b3;
-	ENFORCE_STACK_ALIGN_1D( uint8_t, uiBsx4, 4, 4 );
-
-	uiNnz32b0 = *(uint32_t *)(pNnzTab+0);
-	uiNnz32b1 = *(uint32_t *)(pNnzTab+4);
-	uiNnz32b2 = *(uint32_t *)(pNnzTab+8);
-	uiNnz32b3 = *(uint32_t *)(pNnzTab+12);
-
-	*(uint32_t *)uiBsx4 = (uiNnz32b0|(uiNnz32b0>>8));
-	uiBS[0][1][0] = BS_EDGE(uiBsx4[0], iRefIdx, pCurMb->sMv, 1, 0);
-	uiBS[0][2][0] = BS_EDGE(uiBsx4[1], iRefIdx, pCurMb->sMv, 2, 1);
-	uiBS[0][3][0] = BS_EDGE(uiBsx4[2], iRefIdx, pCurMb->sMv, 3, 2); 
-
-	*(uint32_t *)uiBsx4 = (uiNnz32b1|(uiNnz32b1>>8));
-	uiBS[0][1][1] = BS_EDGE(uiBsx4[0], iRefIdx, pCurMb->sMv, 5, 4);
-	uiBS[0][2][1] = BS_EDGE(uiBsx4[1], iRefIdx, pCurMb->sMv, 6, 5);
-	uiBS[0][3][1] = BS_EDGE(uiBsx4[2], iRefIdx, pCurMb->sMv, 7, 6);
-
-	*(uint32_t *)uiBsx4 = (uiNnz32b2|(uiNnz32b2>>8));
-	uiBS[0][1][2] = BS_EDGE(uiBsx4[0], iRefIdx, pCurMb->sMv, 9, 8);
-	uiBS[0][2][2] = BS_EDGE(uiBsx4[1], iRefIdx, pCurMb->sMv, 10,9);
-	uiBS[0][3][2] = BS_EDGE(uiBsx4[2], iRefIdx, pCurMb->sMv, 11,10);
-
-	*(uint32_t *)uiBsx4 = (uiNnz32b3|(uiNnz32b3>>8));
-	uiBS[0][1][3] = BS_EDGE(uiBsx4[0], iRefIdx, pCurMb->sMv, 13,12);
-	uiBS[0][2][3] = BS_EDGE(uiBsx4[1], iRefIdx, pCurMb->sMv, 14,13);
-	uiBS[0][3][3] = BS_EDGE(uiBsx4[2], iRefIdx, pCurMb->sMv, 15,14);	
-
-	//horizontal
-	*(uint32_t *)uiBsx4 = (uiNnz32b0|uiNnz32b1);
-	uiBS[1][1][0] = BS_EDGE(uiBsx4[0], iRefIdx, pCurMb->sMv, 4, 0);
-	uiBS[1][1][1] = BS_EDGE(uiBsx4[1], iRefIdx, pCurMb->sMv, 5, 1);
-	uiBS[1][1][2] = BS_EDGE(uiBsx4[2], iRefIdx, pCurMb->sMv, 6, 2);
-	uiBS[1][1][3] = BS_EDGE(uiBsx4[3], iRefIdx, pCurMb->sMv, 7, 3); 
-
-	*(uint32_t *)uiBsx4 = (uiNnz32b1|uiNnz32b2);
-	uiBS[1][2][0] = BS_EDGE(uiBsx4[0], iRefIdx, pCurMb->sMv, 8, 4);
-	uiBS[1][2][1] = BS_EDGE(uiBsx4[1], iRefIdx, pCurMb->sMv, 9, 5);
-	uiBS[1][2][2] = BS_EDGE(uiBsx4[2], iRefIdx, pCurMb->sMv, 10, 6);
-	uiBS[1][2][3] = BS_EDGE(uiBsx4[3], iRefIdx, pCurMb->sMv, 11, 7);
-
-	*(uint32_t *)uiBsx4 = (uiNnz32b2|uiNnz32b3);
-	uiBS[1][3][0] = BS_EDGE(uiBsx4[0], iRefIdx, pCurMb->sMv, 12, 8);
-	uiBS[1][3][1] = BS_EDGE(uiBsx4[1], iRefIdx, pCurMb->sMv, 13, 9);
-	uiBS[1][3][2] = BS_EDGE(uiBsx4[2], iRefIdx, pCurMb->sMv, 14, 10);
-	uiBS[1][3][3] = BS_EDGE(uiBsx4[3], iRefIdx, pCurMb->sMv, 15, 11);
-}
-
-uint32_t DeblockingBSMarginalMBAvcbase( SMB* pCurMb, SMB* pNeighMb, int32_t iEdge)
-{
-	int32_t i;
-	uint32_t uiBSx4;    
-	uint8_t* pBS = (uint8_t*)(&uiBSx4);
-	uint32_t uiBIdx  = *(uint32_t *)(&g_kuiTableBIdx[iEdge][0]); 
-	uint32_t uiBnIdx = *(uint32_t *)(&g_kuiTableBIdx[iEdge][4]);
-
-	for( i = 0; i < 4; i++ )
-	{
-		if (pCurMb->pNonZeroCount[uiBIdx&0xff] | pNeighMb->pNonZeroCount[uiBnIdx&0xff])		
-		{
-			pBS[i] = 2;
-		} 
-		else 
-		{
-			pBS[i] = 
-#ifndef SINGLE_REF_FRAME
-			(pCurMb->uiRefIndex[g_kiTableBlock8x8Idx[1][iEdge][i]] - pNeighMb->uiRefIndex[g_kiTableBlock8x8NIdx[1][iEdge][i]]) ||
-#endif
-			MB_BS_MV(pCurMb->sMv, pNeighMb->sMv, (uiBIdx&0xff), (uiBnIdx&0xff));				
-		}
-		uiBIdx  = uiBIdx  >> 8;
-		uiBnIdx = uiBnIdx >> 8;
-	}
-	return uiBSx4;
-}
-
-void FilteringEdgeLumaH( DeblockingFunc* pfDeblocking, SDeblockingFilter* pFilter, uint8_t* pPix, int32_t iStride, uint8_t* pBS )
-{
-	int32_t iIdexA; 
-	int32_t iAlpha; 
-	int32_t iBeta;  
-	ENFORCE_STACK_ALIGN_1D( int8_t, iTc, 4, 16 );
-
-	GET_ALPHA_BETA_FROM_QP(pFilter->uiLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIdexA, iAlpha, iBeta);
-
-	if( iAlpha | iBeta )
-	{
-		TC0_TBL_LOOKUP(iTc, iIdexA, pBS, 0);
-		pfDeblocking->pfLumaDeblockingLT4Ver(pPix, iStride, iAlpha, iBeta, iTc);
-	}
-	return;
-}
-void FilteringEdgeLumaV( DeblockingFunc* pfDeblocking, SDeblockingFilter* pFilter, uint8_t* pPix, int32_t iStride, uint8_t* pBS )
-{
-	int32_t  iIdexA;
-	int32_t  iAlpha;
-	int32_t  iBeta; 
-	ENFORCE_STACK_ALIGN_1D( int8_t, iTc, 4, 16 );
-
-	GET_ALPHA_BETA_FROM_QP(pFilter->uiLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIdexA, iAlpha, iBeta);
-
-	if( iAlpha | iBeta )
-	{
-		TC0_TBL_LOOKUP(iTc, iIdexA, pBS, 0);
-		pfDeblocking->pfLumaDeblockingLT4Hor(pPix, iStride, iAlpha, iBeta, iTc);
-	}
-	return;
-}
-
-void FilteringEdgeLumaIntraH( DeblockingFunc* pfDeblocking, SDeblockingFilter* pFilter, uint8_t* pPix, int32_t iStride,uint8_t* pBS )
-{
-	int32_t iIdexA; 
-	int32_t iAlpha; 
-	int32_t iBeta;  	
-
-	GET_ALPHA_BETA_FROM_QP(pFilter->uiLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIdexA, iAlpha, iBeta);
-
-	if( iAlpha | iBeta )
-	{
-		pfDeblocking->pfLumaDeblockingEQ4Ver(pPix, iStride, iAlpha, iBeta);
-	}
-	return;
-}
-
-void FilteringEdgeLumaIntraV( DeblockingFunc* pfDeblocking, SDeblockingFilter* pFilter, uint8_t* pPix, int32_t iStride,uint8_t* pBS)
-{
-	int32_t iIdexA; 
-	int32_t iAlpha; 
-	int32_t iBeta;  
-
-	GET_ALPHA_BETA_FROM_QP(pFilter->uiLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIdexA, iAlpha, iBeta);
-
-	if( iAlpha | iBeta )
-	{	
-		pfDeblocking->pfLumaDeblockingEQ4Hor(pPix, iStride, iAlpha, iBeta);
-	}
-	return;
-}
-void FilteringEdgeChromaH( DeblockingFunc* pfDeblocking, SDeblockingFilter* pFilter, uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, uint8_t* pBS )
-{	
-	int32_t iIdexA; 
-	int32_t iAlpha; 
-	int32_t iBeta;  
-	ENFORCE_STACK_ALIGN_1D( int8_t, iTc, 4, 16 );
-
-	GET_ALPHA_BETA_FROM_QP(pFilter->uiChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIdexA, iAlpha, iBeta);
-
-	if( iAlpha | iBeta )
-	{
-		TC0_TBL_LOOKUP(iTc, iIdexA, pBS, 1);
-		pfDeblocking->pfChromaDeblockingLT4Ver(pPixCb, pPixCr, iStride,iAlpha, iBeta, iTc);
-	}
-	return;
-} 
-void FilteringEdgeChromaV( DeblockingFunc* pfDeblocking, SDeblockingFilter* pFilter, uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, uint8_t* pBS )
-{	  
-	int32_t iIdexA; 
-	int32_t iAlpha; 
-	int32_t iBeta;  
-	ENFORCE_STACK_ALIGN_1D( int8_t, iTc, 4, 16 );
-
-	GET_ALPHA_BETA_FROM_QP(pFilter->uiChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIdexA, iAlpha, iBeta);
-
-	if( iAlpha | iBeta )
-	{
-		TC0_TBL_LOOKUP(iTc, iIdexA, pBS, 1);
-		pfDeblocking->pfChromaDeblockingLT4Hor(pPixCb, pPixCr, iStride, iAlpha, iBeta, iTc);
-	}
-	return;
-}
-
-void FilteringEdgeChromaIntraH( DeblockingFunc* pfDeblocking, SDeblockingFilter* pFilter, uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, uint8_t* pBS )
-{
-	int32_t iIdexA; 
-	int32_t iAlpha; 
-	int32_t iBeta;  
-
-	GET_ALPHA_BETA_FROM_QP(pFilter->uiChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIdexA, iAlpha, iBeta);
-
-	if( iAlpha | iBeta )
-	{
-		pfDeblocking->pfChromaDeblockingEQ4Ver(pPixCb, pPixCr, iStride, iAlpha, iBeta);
-	}
-	return;
-}
-
-void FilteringEdgeChromaIntraV( DeblockingFunc* pfDeblocking, SDeblockingFilter* pFilter, uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, uint8_t* pBS )
-{
-	int32_t iIdexA; 
-	int32_t iAlpha; 
-	int32_t iBeta;  
-
-	GET_ALPHA_BETA_FROM_QP(pFilter->uiChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIdexA, iAlpha, iBeta);
-
-	if( iAlpha | iBeta )
-	{
-		pfDeblocking->pfChromaDeblockinEQ4Hor(pPixCb, pPixCr, iStride, iAlpha, iBeta);
-	}
-	return;
-}
-
-void DeblockingInterMb( DeblockingFunc* pfDeblocking, SMB* pCurMb, SDeblockingFilter* pFilter, uint8_t uiBS[2][4][4] )
-{
-	int8_t iCurLumaQp   = pCurMb->uiLumaQp;
-	int8_t iCurChromaQp = pCurMb->uiChromaQp;
-	int32_t iLineSize     = pFilter->iCsStride[0];
-	int32_t iLineSizeUV   = pFilter->iCsStride[1];
-	int32_t iMbStride    = pFilter->iMbStride;
-
-	int32_t iMbX = pCurMb->iMbX;
-	int32_t iMbY = pCurMb->iMbY;
-
-	BOOL_T bLeftBsValid[2] = {(iMbX > 0), ((iMbX > 0) && (pCurMb->uiSliceIdc == (pCurMb - 1)->uiSliceIdc))};
-	BOOL_T bTopBsValid[2]  = {(iMbY > 0), ((iMbY > 0) && (pCurMb->uiSliceIdc == (pCurMb - iMbStride)->uiSliceIdc))};
-
-	int32_t iLeftFlag = bLeftBsValid[pFilter->uiFilterIdc]; 
-	int32_t iTopFlag  = bTopBsValid[pFilter->uiFilterIdc];
-
-	uint8_t *pDestY, *pDestCb, *pDestCr;
-	pDestY  = pFilter->pCsData[0];			
-	pDestCb = pFilter->pCsData[1];				
-	pDestCr = pFilter->pCsData[2]; 
-
-	if (iLeftFlag)	
-	{
-		pFilter->uiLumaQP   = (iCurLumaQp + (pCurMb-1)->uiLumaQp + 1) >> 1;
-		pFilter->uiChromaQP = (iCurChromaQp + (pCurMb-1)->uiChromaQp+ 1) >> 1;
-
-		if( uiBS[0][0][0] == 0x04 )
-		{
-			FilteringEdgeLumaIntraV( pfDeblocking, pFilter, pDestY, iLineSize ,NULL);
-			FilteringEdgeChromaIntraV( pfDeblocking, pFilter, pDestCb, pDestCr, iLineSizeUV, NULL );
-		} 
-		else
-		{
-			if(*(uint32_t *)uiBS[0][0] != 0)
-			{
-				FilteringEdgeLumaV( pfDeblocking, pFilter, pDestY, iLineSize, uiBS[0][0] );
-				FilteringEdgeChromaV( pfDeblocking, pFilter, pDestCb, pDestCr, iLineSizeUV, uiBS[0][0] );
-			}
-		}
-	}
-
-	pFilter->uiLumaQP = iCurLumaQp;
-	pFilter->uiChromaQP = iCurChromaQp;
-
-	if(*(uint32_t *)uiBS[0][1] != 0)
-	{
-		FilteringEdgeLumaV( pfDeblocking, pFilter, &pDestY[1<<2], iLineSize, uiBS[0][1]);
-	}
-
-	if(*(uint32_t *)uiBS[0][2] != 0)
-	{
-		FilteringEdgeLumaV( pfDeblocking, pFilter, &pDestY[2<<2], iLineSize, uiBS[0][2]);
-		FilteringEdgeChromaV( pfDeblocking, pFilter, &pDestCb[2<<1], &pDestCr[2<<1], iLineSizeUV, uiBS[0][2] );
-	}
-
-	if(*(uint32_t *)uiBS[0][3] != 0)
-	{
-		FilteringEdgeLumaV( pfDeblocking, pFilter, &pDestY[3<<2], iLineSize, uiBS[0][3] );
-	}
-
-	if (iTopFlag)	
-	{	
-		pFilter->uiLumaQP = (iCurLumaQp + (pCurMb-iMbStride)->uiLumaQp + 1) >> 1;
-		pFilter->uiChromaQP = (iCurChromaQp + (pCurMb-iMbStride)->uiChromaQp + 1) >> 1;
-
-		if(uiBS[1][0][0] == 0x04)
-		{
-			FilteringEdgeLumaIntraH( pfDeblocking, pFilter, pDestY, iLineSize ,NULL);
-			FilteringEdgeChromaIntraH( pfDeblocking, pFilter, pDestCb, pDestCr, iLineSizeUV, NULL );
-		} 
-		else 
-		{
-			if(*(uint32_t *)uiBS[1][0] != 0)
-			{
-				FilteringEdgeLumaH( pfDeblocking, pFilter, pDestY, iLineSize, uiBS[1][0] );
-				FilteringEdgeChromaH( pfDeblocking, pFilter, pDestCb, pDestCr, iLineSizeUV, uiBS[1][0] );
-			}
-		}  
-	}
-
-	pFilter->uiLumaQP = iCurLumaQp;
-	pFilter->uiChromaQP = iCurChromaQp;
-
-	if(*(uint32_t *)uiBS[1][1] != 0)
-	{
-		FilteringEdgeLumaH( pfDeblocking, pFilter, &pDestY[(1<<2)*iLineSize], iLineSize, uiBS[1][1] );
-	}
-
-	if(*(uint32_t *)uiBS[1][2] != 0)
-	{
-		FilteringEdgeLumaH( pfDeblocking, pFilter, &pDestY[(2<<2)*iLineSize], iLineSize, uiBS[1][2] );
-		FilteringEdgeChromaH( pfDeblocking, pFilter, &pDestCb[(2<<1)*iLineSizeUV], &pDestCr[(2<<1)*iLineSizeUV], iLineSizeUV, uiBS[1][2] );
-	}
-
-	if(*(uint32_t *)uiBS[1][3] != 0)
-	{
-		FilteringEdgeLumaH( pfDeblocking, pFilter, &pDestY[(3<<2)*iLineSize], iLineSize, uiBS[1][3] );
-	}
-}
-
-void FilteringEdgeLumaHV( DeblockingFunc* pfDeblocking, SMB* pCurMb, SDeblockingFilter* pFilter )
-{
-	int32_t iLineSize  = pFilter->iCsStride[0];
-	int32_t iMbStride = pFilter->iMbStride;
-
-	uint8_t  *pDestY;	
-	int8_t   iCurQp;
-	int32_t  iIdexA, iAlpha, iBeta;
-
-	int32_t iMbX = pCurMb->iMbX;
-	int32_t iMbY = pCurMb->iMbY;
-
-	BOOL_T bLeftBsValid[2] = {(iMbX > 0), ((iMbX > 0) && (pCurMb->uiSliceIdc == (pCurMb - 1)->uiSliceIdc))};
-	BOOL_T bTopBsValid[2]  = {(iMbY > 0), ((iMbY > 0) && (pCurMb->uiSliceIdc == (pCurMb - iMbStride)->uiSliceIdc))};
-
-	int32_t iLeftFlag = bLeftBsValid[pFilter->uiFilterIdc]; 
-	int32_t iTopFlag  = bTopBsValid[pFilter->uiFilterIdc];
-
-	ENFORCE_STACK_ALIGN_1D(int8_t,  iTc,   4, 16 );
-	ENFORCE_STACK_ALIGN_1D(uint8_t, uiBSx4, 4, 4  );
-
-	pDestY  = pFilter->pCsData[0];
-	iCurQp  = pCurMb->uiLumaQp;
-
-	*(uint32_t*)uiBSx4 = 0x03030303;
-
-	// luma v
-	if (iLeftFlag)	
-	{
-		pFilter->uiLumaQP = ( iCurQp + (pCurMb-1)->uiLumaQp + 1 ) >> 1;		
-		FilteringEdgeLumaIntraV( pfDeblocking, pFilter, pDestY, iLineSize,NULL );
-	}
-
-	pFilter->uiLumaQP   = iCurQp;	
-	GET_ALPHA_BETA_FROM_QP(pFilter->uiLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIdexA, iAlpha, iBeta);
-	if( iAlpha | iBeta )
-	{
-		TC0_TBL_LOOKUP(iTc, iIdexA, uiBSx4, 0);
-		pfDeblocking->pfLumaDeblockingLT4Hor( &pDestY[1 << 2], iLineSize, iAlpha, iBeta, iTc );
-		pfDeblocking->pfLumaDeblockingLT4Hor( &pDestY[2 << 2], iLineSize, iAlpha, iBeta, iTc );
-		pfDeblocking->pfLumaDeblockingLT4Hor( &pDestY[3 << 2], iLineSize, iAlpha, iBeta, iTc );
-
-	}
-
-	// luma h
-	if (iTopFlag)	
-	{
-		pFilter->uiLumaQP   = ( iCurQp   + (pCurMb-iMbStride)->uiLumaQp + 1 ) >> 1;	
-		FilteringEdgeLumaIntraH( pfDeblocking, pFilter, pDestY, iLineSize,NULL );
-	}   
-
-	pFilter->uiLumaQP   = iCurQp;	
-	if( iAlpha | iBeta )
-	{
-		pfDeblocking->pfLumaDeblockingLT4Ver( &pDestY[(1<<2)*iLineSize],iLineSize,iAlpha, iBeta,iTc );
-		pfDeblocking->pfLumaDeblockingLT4Ver( &pDestY[(2<<2)*iLineSize],iLineSize,iAlpha, iBeta,iTc );
-		pfDeblocking->pfLumaDeblockingLT4Ver( &pDestY[(3<<2)*iLineSize],iLineSize,iAlpha, iBeta,iTc );
-	}
-}
-void FilteringEdgeChromaHV( DeblockingFunc* pfDeblocking, SMB* pCurMb, SDeblockingFilter* pFilter )
-{
-	int32_t iLineSize  = pFilter->iCsStride[1];
-	int32_t iMbStride = pFilter->iMbStride;
-
-	uint8_t  *pDestCb, *pDestCr;	
-	int8_t   iCurQp;
-	int32_t  iIdexA, iAlpha, iBeta;
-
-	int32_t iMbX = pCurMb->iMbX;
-	int32_t iMbY = pCurMb->iMbY;
-
-	BOOL_T bLeftBsValid[2] = {(iMbX > 0), ((iMbX > 0) && (pCurMb->uiSliceIdc == (pCurMb - 1)->uiSliceIdc))};
-	BOOL_T bTopBsValid[2]  = {(iMbY > 0), ((iMbY > 0) && (pCurMb->uiSliceIdc == (pCurMb - iMbStride)->uiSliceIdc))};
-
-	int32_t iLeftFlag = bLeftBsValid[pFilter->uiFilterIdc]; 
-	int32_t iTopFlag  = bTopBsValid[pFilter->uiFilterIdc];
-
-	ENFORCE_STACK_ALIGN_1D( int8_t,  iTc,   4, 16 );
-	ENFORCE_STACK_ALIGN_1D( uint8_t, uiBSx4, 4, 4  );
-
-	pDestCb = pFilter->pCsData[1];				
-	pDestCr = pFilter->pCsData[2];	
-	iCurQp  = pCurMb->uiChromaQp;
-	*(uint32_t*)uiBSx4 = 0x03030303;
-
-	// chroma v
-	if (iLeftFlag)		
-	{
-		pFilter->uiChromaQP = ( iCurQp + (pCurMb-1)->uiChromaQp + 1 ) >> 1;	
-		FilteringEdgeChromaIntraV( pfDeblocking, pFilter, pDestCb, pDestCr, iLineSize, NULL);
-	}
-
-	pFilter->uiChromaQP   = iCurQp;	
-	GET_ALPHA_BETA_FROM_QP(pFilter->uiChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIdexA, iAlpha, iBeta);
-	if( iAlpha | iBeta )
-	{
-		TC0_TBL_LOOKUP(iTc, iIdexA, uiBSx4, 1);
-		pfDeblocking->pfChromaDeblockingLT4Hor( &pDestCb[2<<1], &pDestCr[2<<1], iLineSize, iAlpha, iBeta, iTc );
-	}
-
-	// chroma h
-	if (iTopFlag)	
-	{
-		pFilter->uiChromaQP = ( iCurQp + (pCurMb-iMbStride)->uiChromaQp + 1 ) >> 1;		
-		FilteringEdgeChromaIntraH( pfDeblocking, pFilter, pDestCb, pDestCr, iLineSize, NULL);
-	}   
-
-	pFilter->uiChromaQP   = iCurQp;	
-	if( iAlpha | iBeta )
-	{
-		pfDeblocking->pfChromaDeblockingLT4Ver( &pDestCb[(2<<1)*iLineSize], &pDestCr[(2<<1)*iLineSize], iLineSize, iAlpha, iBeta, iTc );
-	}
-}
-
-// merge h&v lookup table operation to save performance
-void DeblockingIntraMb( DeblockingFunc* pfDeblocking, SMB* pCurMb, SDeblockingFilter* pFilter )
-{
-	FilteringEdgeLumaHV(  pfDeblocking, pCurMb, pFilter);
-	FilteringEdgeChromaHV(pfDeblocking, pCurMb, pFilter);
-}
-
-void DeblockingMbAvcbase( SWelsFuncPtrList* pFunc, SMB* pCurMb, SDeblockingFilter * pFilter )
-{
-	uint8_t uiBS[2][4][4] = { 0 };
-
-	Mb_Type uiCurMbType = pCurMb->uiMbType;
-	int32_t iMbStride  = pFilter->iMbStride;
-
-	int32_t iMbX = pCurMb->iMbX;
-	int32_t iMbY = pCurMb->iMbY;
-
-	BOOL_T bLeftBsValid[2] = {(iMbX > 0), ((iMbX > 0) && (pCurMb->uiSliceIdc == (pCurMb - 1)->uiSliceIdc))};
-	BOOL_T bTopBsValid[2]  = {(iMbY > 0), ((iMbY > 0) && (pCurMb->uiSliceIdc == (pCurMb - iMbStride)->uiSliceIdc))};
-
-	int32_t iLeftFlag = bLeftBsValid[pFilter->uiFilterIdc]; 
-	int32_t iTopFlag  = bTopBsValid[pFilter->uiFilterIdc];
-
-	switch( uiCurMbType )
-	{
-	case MB_TYPE_INTRA4x4:
-	case MB_TYPE_INTRA16x16:
-	case MB_TYPE_INTRA_PCM:
-		DeblockingIntraMb( &pFunc->pfDeblocking, pCurMb, pFilter );
-		break;
-	default:
-		if (iLeftFlag)		
-		{
-			*(uint32_t*)uiBS[0][0] = IS_INTRA((pCurMb-1)->uiMbType)?0x04040404:DeblockingBSMarginalMBAvcbase( pCurMb, pCurMb-1, 0 );
-		}
-		else
-		{
-			*(uint32_t*)uiBS[0][0] = 0;
-		}
-		if (iTopFlag)		
-		{
-			*(uint32_t*)uiBS[1][0] = IS_INTRA((pCurMb-iMbStride)->uiMbType)?0x04040404:DeblockingBSMarginalMBAvcbase( pCurMb, (pCurMb-iMbStride), 1 );
-		}
-		else
-		{
-			*(uint32_t*)uiBS[1][0] = 0;
-		}
-		//SKIP MB_16x16 or others
-		if( uiCurMbType != MB_TYPE_SKIP )
-		{
-			pFunc->pfSetNZCZero(pCurMb->pNonZeroCount);// set all none-zero nzc to 1; dbk can be opti!
-
-			if( uiCurMbType == MB_TYPE_16x16 )
-			{
-				DeblockingBSInsideMBAvsbase( pCurMb->pNonZeroCount, uiBS, 1 );
-			} 
-			else 
-			{
-				DeblockingBSInsideMBNormal(pCurMb, uiBS, pCurMb->pNonZeroCount);
-			}
-		}
-		else
-		{
-			*(uint32_t*)uiBS[0][1] = *(uint32_t*)uiBS[0][2] = *(uint32_t*)uiBS[0][3] = 
-				*(uint32_t*)uiBS[1][1] = *(uint32_t*)uiBS[1][2] = *(uint32_t*)uiBS[1][3] = 0;
-		}
-
-		DeblockingInterMb( &pFunc->pfDeblocking, pCurMb, pFilter, uiBS );
-		break;
-	}
-}
-
-//  C code only
-
-void DeblockLumaLt4_c( uint8_t *pPix, int32_t iStrideX,int32_t iStrideY, int32_t iAlpha, int32_t iBeta, int8_t *pTc )
-{
-	for( int32_t i = 0;i<16;i++)
-	{
-		int32_t iTc0 = pTc[i>>2];
-		if(iTc0>=0)
-		{
-				int32_t p0 = pPix[-iStrideX];	
-				int32_t p1 = pPix[-2*iStrideX];	
-				int32_t p2 = pPix[-3*iStrideX];	
-				int32_t q0 = pPix[0];	
-				int32_t q1 = pPix[iStrideX];	
-				int32_t q2 = pPix[2*iStrideX];	
-				bool_t bDetaP0Q0= WELS_ABS( p0 - q0 )<iAlpha;
-				bool_t bDetaP1P0 = WELS_ABS( p1 - p0 ) < iBeta;
-				bool_t bDetaQ1Q0 = WELS_ABS( q1 - q0 ) < iBeta;
-				int32_t iTc = iTc0;
-				if ( bDetaP0Q0&& bDetaP1P0 && bDetaQ1Q0 )
-				{	
-					bool_t bDetaP2P0 =  WELS_ABS( p2 - p0 ) < iBeta;
-					bool_t bDetaQ2Q0 =  WELS_ABS( q2 - q0 ) < iBeta;
-					if ( bDetaP2P0) 
-					{
-						pPix[-2*iStrideX] = p1 + WELS_CLIP3( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -iTc0, iTc0 );
-						iTc++;
-					}
-					if (bDetaQ2Q0)
-					{
-						pPix[iStrideX] = q1 + WELS_CLIP3( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -iTc0, iTc0 );
-						iTc++;
-					}
-					int32_t iDeta = WELS_CLIP3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -iTc, iTc );
-					pPix[-iStrideX] = WELS_CLIP1( p0 + iDeta );    /* p0' */
-					pPix[0]  = WELS_CLIP1( q0 - iDeta );    /* q0' */
-			}
-		}
-		pPix +=iStrideY;
-	}
-}
-
-
-void DeblockLumaEq4_c( uint8_t *pPix, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta )
-{
-	int32_t p0,p1,p2,q0,q1,q2;
-	int32_t iDetaP0Q0;
-	bool_t bDetaP1P0,bDetaQ1Q0;
-	for (int32_t i = 0;i<16;i++)
-	{
-		p0 = pPix[-iStrideX];
-		p1 = pPix[-2*iStrideX];
-		p2 = pPix[-3*iStrideX];							
-		q0 = pPix[0];
-		q1 = pPix[iStrideX];
-		q2 = pPix[2*iStrideX];
-		iDetaP0Q0 = WELS_ABS( p0 - q0 );
-		bDetaP1P0 = WELS_ABS( p1 - p0 ) < iBeta;
-		bDetaQ1Q0 = WELS_ABS( q1 - q0 ) < iBeta;
-		if ((iDetaP0Q0<iAlpha) && bDetaP1P0 &&bDetaQ1Q0)
-		{
-			if (iDetaP0Q0< (( iAlpha >> 2 ) + 2 ) )
-			{
-				bool_t bDetaP2P0 = WELS_ABS( p2 - p0 ) < iBeta;
-				bool_t bDetaQ2Q0 =  WELS_ABS( q2 - q0 ) < iBeta;
-				if(bDetaP2P0)
-				{	
-					const int32_t p3 = pPix[-4*iStrideX];	
-					pPix[-iStrideX] = ( p2 + (p1 << 1) + (p0 << 1) + (q0 << 1) + q1 + 4 ) >> 3;	 //p0
-					pPix[-2*iStrideX] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;	 //p1
-					pPix[-3*iStrideX] = ( (p3 << 1) + p2 + (p2 << 1) + p1 + p0 + q0 + 4 ) >> 3;//p2
-				 } 
-				 else 
-				 {
-					pPix[-1*iStrideX] = ( (p1 << 1) + p0 + q1 + 2 ) >> 2;	//p0
-			     }	
-				 if (bDetaQ2Q0)	
-				 {	
-					const int32_t q3 = pPix[3*iStrideX];		
-					pPix[0] = ( p1 + (p0 << 1) + (q0 << 1) + (q1 << 1) + q2 + 4 ) >> 3; //q0
-					pPix[iStrideX] = ( p0 + q0 + q1 + q2 + 2 ) >> 2; //q1
-					pPix[2*iStrideX] = ( (q3 << 1) + q2 + (q2 << 1) + q1 + q0 + p0 + 4 ) >> 3;//q2
-				  } 
-				  else 
-				  {	
-					pPix[0] = ( (q1 << 1) + q0 + p1 + 2 ) >> 2; //q0
-				  }
-			}
-			else
-			{
-			 	pPix[-iStrideX] = ( (p1 << 1) + p0 + q1 + 2 ) >> 2; //p0
-				pPix[ 0] = ( (q1 << 1) + q0 + p1 + 2 ) >> 2; //q0
-			}
-		}
-	 pPix += iStrideY;
-	} 
-}
-void DeblockLumaLt4V_c( uint8_t *pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *iTc )
-{
-	DeblockLumaLt4_c( pPix, iStride, 1, iAlpha, iBeta, iTc );
-}
-void DeblockLumaLt4H_c( uint8_t *pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *iTc )
-{
-	DeblockLumaLt4_c( pPix, 1, iStride, iAlpha, iBeta, iTc );
-}
-void DeblockLumaEq4V_c( uint8_t *pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta )
-{
-	DeblockLumaEq4_c( pPix, iStride, 1, iAlpha, iBeta);
-}
-void DeblockLumaEq4H_c( uint8_t *pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta )
-{
-	DeblockLumaEq4_c( pPix, 1, iStride, iAlpha, iBeta );
-}
-void DeblockChromaLt4_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta, int8_t *pTc )
-{
-	int32_t p0, p1, q0, q1,iDeta;
-	bool_t bDetaP0Q0,bDetaP1P0,bDetaQ1Q0;
-
-	for(int32_t i = 0;i<8;i++)
-	{
-		int32_t iTc0 = pTc[i>>1];
-		if(iTc0 >0)
-		{
-		p0 = pPixCb[-iStrideX];
-		p1 = pPixCb[-2*iStrideX];
-		q0 = pPixCb[0];
-		q1 = pPixCb[iStrideX];		
-
-		bDetaP0Q0 =  WELS_ABS( p0 - q0 ) < iAlpha;
-		bDetaP1P0 =  WELS_ABS( p1 - p0 ) < iBeta;
-		bDetaQ1Q0 = WELS_ABS( q1 - q0 ) < iBeta;
-		if( bDetaP0Q0&&bDetaP1P0 &&	bDetaQ1Q0) 
-		{
-			iDeta = WELS_CLIP3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -iTc0, iTc0 );
-			pPixCb[-iStrideX] = WELS_CLIP1( p0 + iDeta );    /* p0' */
-			pPixCb[0]  = WELS_CLIP1( q0 - iDeta );    /* q0' */
-		}
-	
-
-		p0 = pPixCr[-iStrideX];
-		p1 = pPixCr[-2*iStrideX];
-		q0 = pPixCr[0];
-		q1 = pPixCr[iStrideX];	
-
-		bDetaP0Q0 =  WELS_ABS( p0 - q0 ) < iAlpha;
-		bDetaP1P0 =  WELS_ABS( p1 - p0 ) < iBeta;
-		bDetaQ1Q0 = WELS_ABS( q1 - q0 ) < iBeta;
-
-		if( bDetaP0Q0&&bDetaP1P0 &&	bDetaQ1Q0) 
-		{
-			iDeta = WELS_CLIP3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -iTc0, iTc0 );
-			pPixCr[-iStrideX] = WELS_CLIP1( p0 + iDeta );    /* p0' */
-			pPixCr[0]  = WELS_CLIP1( q0 - iDeta );    /* q0' */
-		}
-		}
-		pPixCb += iStrideY;
-		pPixCr += iStrideY;
-	}
-}
-void DeblockChromaEq4_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta )
-{
-	int32_t i = 0, d = 0;
-	int32_t p0, p1, q0, q1;
-	bool_t bDetaP0Q0,bDetaP1P0,bDetaQ1Q0;
-	for(int32_t i =0;i<8;i++)
-	{
-		    //cb
-			p0 = pPixCb[-iStrideX];
-			p1 = pPixCb[-2*iStrideX];
-			q0 = pPixCb[0];
-			q1 = pPixCb[iStrideX];
-			bDetaP0Q0 = WELS_ABS( p0 - q0 ) < iAlpha;
-			bDetaP1P0 = WELS_ABS( p1 - p0 ) < iBeta;
-			bDetaQ1Q0 = WELS_ABS( q1 - q0 ) < iBeta;
-			if(bDetaP0Q0&&bDetaP1P0&&bDetaQ1Q0)
-			{
-				pPixCb[-iStrideX] = ( (p1 << 1) + p0 + q1 + 2 ) >> 2;   /* p0' */
-				pPixCb[0]  = ( (q1 << 1) + q0 + p1 + 2 ) >> 2;   /* q0' */
-			}
-			
-			//cr
-			p0 = pPixCr[-iStrideX];
-			p1 = pPixCr[-2*iStrideX];
-			q0 = pPixCr[0];
-			q1 = pPixCr[iStrideX];
-			bDetaP0Q0 = WELS_ABS( p0 - q0 ) < iAlpha;
-			bDetaP1P0 = WELS_ABS( p1 - p0 ) < iBeta;
-			bDetaQ1Q0 = WELS_ABS( q1 - q0 ) < iBeta;
-			if(bDetaP0Q0&&bDetaP1P0&&bDetaQ1Q0)
-			{
-				pPixCr[-iStrideX] = ( (p1 << 1) + p0 + q1 + 2 ) >> 2;   /* p0' */
-				pPixCr[0]  = ( (q1 << 1) + q0 + p1 + 2 ) >> 2;   /* q0' */
-			}
-			pPixCr += iStrideY;	
-			pPixCb += iStrideY;	
-	}
-}
-void DeblockChromaLt4V_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *iTc )
-{
-	DeblockChromaLt4_c( pPixCb, pPixCr, iStride, 1, iAlpha, iBeta, iTc );
-}
-void DeblockChromaLt4H_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *iTc )
-{
-	DeblockChromaLt4_c( pPixCb, pPixCr, 1, iStride, iAlpha, iBeta, iTc );
-}
-void DeblockChromaEq4V_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta )
-{
-	DeblockChromaEq4_c( pPixCb, pPixCr, iStride, 1, iAlpha, iBeta );
-}
-void DeblockChromaEq4H_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta )
-{
-	DeblockChromaEq4_c( pPixCb, pPixCr, 1, iStride, iAlpha, iBeta );
-}
-
-
-void  DeblockingFilterFrameAvcbase( SDqLayer *pCurDq, SWelsFuncPtrList *pFunc )
-{
-	int32_t i,j;
-	const int32_t kiMbWidth	= pCurDq->iMbWidth;
-	const int32_t kiMbHeight	= pCurDq->iMbHeight;
-	SMB * pCurrentMbBlock	= pCurDq->sMbDataP;	
-	SSliceHeaderExt *sSliceHeaderExt = &pCurDq->sLayerInfo.pSliceInLayer[0].sSliceHeaderExt;
-	SDeblockingFilter pFilter;
-
-	/* Step1: parameters set */	
-	if ( sSliceHeaderExt->sSliceHeader.uiDisableDeblockingFilterIdc == 1 )
-		return;
-
-	pFilter.uiFilterIdc = (sSliceHeaderExt->sSliceHeader.uiDisableDeblockingFilterIdc != 0);
-
-	pFilter.iCsStride[0] = pCurDq->pDecPic->iLineSize[0];
-	pFilter.iCsStride[1] = pCurDq->pDecPic->iLineSize[1];
-	pFilter.iCsStride[2] = pCurDq->pDecPic->iLineSize[2];		
-
-	pFilter.iSliceAlphaC0Offset = sSliceHeaderExt->sSliceHeader.iSliceAlphaC0Offset;
-	pFilter.iSliceBetaOffset     = sSliceHeaderExt->sSliceHeader.iSliceBetaOffset;
-
-	pFilter.iMbStride = kiMbWidth;	
-
-	for( j = 0; j < kiMbHeight; ++j ){
-		pFilter.pCsData[0] = pCurDq->pDecPic->pData[0] + ((j*pFilter.iCsStride[0])<<4);
-		pFilter.pCsData[1] = pCurDq->pDecPic->pData[1] + ((j*pFilter.iCsStride[1])<<3);
-		pFilter.pCsData[2] = pCurDq->pDecPic->pData[2] + ((j*pFilter.iCsStride[2])<<3);
-		for( i=0;i<kiMbWidth; i++ ){
-			DeblockingMbAvcbase( pFunc, pCurrentMbBlock, &pFilter );
-			++pCurrentMbBlock;
-			pFilter.pCsData[0] += MB_WIDTH_LUMA;
-			pFilter.pCsData[1] += MB_WIDTH_CHROMA;
-			pFilter.pCsData[2] += MB_WIDTH_CHROMA;
-		}			
-	}
-}
-
-void DeblockingFilterSliceAvcbase( SDqLayer *pCurDq, SWelsFuncPtrList *pFunc, const int32_t kiSliceIdx )
-{	
-	SSliceCtx * pSliceCtx			= pCurDq->pSliceEncCtx;
-	SMB *pMbList							= pCurDq->sMbDataP;
-	SSliceHeaderExt *sSliceHeaderExt	= &pCurDq->sLayerInfo.pSliceInLayer[kiSliceIdx].sSliceHeaderExt;	
-	SMB * pCurrentMbBlock;
-
-	const int32_t kiMbWidth				= pCurDq->iMbWidth;
-	const int32_t kiMbHeight				= pCurDq->iMbHeight;
-	const int32_t kiTotalNumMb			= kiMbWidth * kiMbHeight;
-	int32_t iCurMbIdx = 0, iNextMbIdx = 0, iNumMbFiltered = 0;	
-
-	/* Step1: parameters set */	
-	if ( sSliceHeaderExt->sSliceHeader.uiDisableDeblockingFilterIdc == 1 )
-		return;
-
-	SDeblockingFilter pFilter;
-
-	pFilter.uiFilterIdc = (sSliceHeaderExt->sSliceHeader.uiDisableDeblockingFilterIdc != 0);
-	pFilter.iCsStride[0] = pCurDq->pDecPic->iLineSize[0];
-	pFilter.iCsStride[1] = pCurDq->pDecPic->iLineSize[1];
-	pFilter.iCsStride[2] = pCurDq->pDecPic->iLineSize[2];
-	pFilter.iSliceAlphaC0Offset = sSliceHeaderExt->sSliceHeader.iSliceAlphaC0Offset;
-	pFilter.iSliceBetaOffset     = sSliceHeaderExt->sSliceHeader.iSliceBetaOffset;
-	pFilter.iMbStride             = kiMbWidth;
-	
-	iNextMbIdx  = sSliceHeaderExt->sSliceHeader.iFirstMbInSlice;
-
-	for ( ; ; )
-	{
-		iCurMbIdx	= iNextMbIdx;
-		pCurrentMbBlock = &pMbList[ iCurMbIdx ];	
-
-		pFilter.pCsData[0] = pCurDq->pDecPic->pData[0] + ((pCurrentMbBlock->iMbX + pCurrentMbBlock->iMbY * pFilter.iCsStride[0]) << 4);
-		pFilter.pCsData[1] = pCurDq->pDecPic->pData[1] + ((pCurrentMbBlock->iMbX + pCurrentMbBlock->iMbY * pFilter.iCsStride[1]) << 3);
-		pFilter.pCsData[2] = pCurDq->pDecPic->pData[2] + ((pCurrentMbBlock->iMbX + pCurrentMbBlock->iMbY * pFilter.iCsStride[2]) << 3);
-
-		DeblockingMbAvcbase( pFunc, pCurrentMbBlock, &pFilter);
-
-		++iNumMbFiltered;
-		iNextMbIdx = WelsGetNextMbOfSlice( pSliceCtx, iCurMbIdx );
-		//whether all of MB in current slice filtered or not
-		if ( iNextMbIdx == -1 || iNextMbIdx >= kiTotalNumMb || iNumMbFiltered >= kiTotalNumMb )
-		{
-			break;
-		}				
-	}
-}
-
-void PerformDeblockingFilter( sWelsEncCtx *pEnc )
-{	
-	const int32_t kiCurDid				= pEnc->uiDependencyId;
-	SWelsSvcCodingParam *pSvcParam	= pEnc->pSvcParam;
-	SDLayerParam *pDlp					= &pSvcParam->sDependencyLayers[kiCurDid];
-	SDqLayer *pCurLayer					= pEnc->pCurDqLayer;
-
-	if ( pCurLayer->iLoopFilterDisableIdc == 0 )
-	{
-		DeblockingFilterFrameAvcbase( pCurLayer, pEnc->pFuncList );
-	}
-	else if ( pCurLayer->iLoopFilterDisableIdc == 2 )
-	{		
-		int32_t iSliceCount			= 0;
-		int32_t iSliceIdx			= 0;
-
-		if ( SM_DYN_SLICE != pDlp->sMso.uiSliceMode )
-		{
-			iSliceCount	= GetCurrentSliceNum( pCurLayer->pSliceEncCtx );
-			do {
-				DeblockingFilterSliceAvcbase( pCurLayer, pEnc->pFuncList, iSliceIdx );
-				++ iSliceIdx;
-			} while(iSliceIdx < iSliceCount);
-		}
-		else	// for dynamic slicing mode
-		{
-			const int32_t kiNumPicPartition	= pEnc->iActiveThreadsNum; //pSvcParam->iCountThreadsNum;
-			int32_t iPartitionIdx			= 0;
-
-			while ( iPartitionIdx < kiNumPicPartition )
-			{
-				iSliceCount	= pCurLayer->pNumSliceCodedOfPartition[iPartitionIdx];
-				iSliceIdx	= iPartitionIdx;
-				do {
-					DeblockingFilterSliceAvcbase( pCurLayer, pEnc->pFuncList, iSliceIdx );
-					iSliceIdx += kiNumPicPartition;
-				} while(iSliceIdx < iSliceCount);
-				++ iPartitionIdx;
-			}
-		}
-	}
-}
-
-void WelsNonZeroCount_c(int8_t* pNonZeroCount)
-{
-	int32_t i;
-	int32_t iIndex;
-
-	for( i=0;i<24;i++ ){
-		iIndex = g_kuiMbCountScan4Idx[i];
-		pNonZeroCount[iIndex] = !!pNonZeroCount[iIndex];
-	}
-}
-void WelsBlockFuncInit( PSetNoneZeroCountZeroFunc *pfSetNZCZero,  int32_t iCpu )
-{
-	*pfSetNZCZero = WelsNonZeroCount_c;
-}
-
-
-#ifdef X86_ASM
-
-extern "C" {
-void DeblockLumaLt4H_sse2(uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTc)
-{
-    ENFORCE_STACK_ALIGN_1D(uint8_t,  uiBuf,   16*8, 16);
-    
-    DeblockLumaTransposeH2V_sse2(pPixY - 4, iStride, &uiBuf[0]);
-	DeblockLumaLt4V_sse2(&uiBuf[4*16], 16, iAlpha, iBeta, pTc);
-	DeblockLumaTransposeV2H_sse2(pPixY - 4, iStride, &uiBuf[0]);
-}
-
-void DeblockLumaEq4H_sse2(uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta)
-{
-	ENFORCE_STACK_ALIGN_1D(uint8_t,  uiBuf,   16*8, 16);
-    
-    DeblockLumaTransposeH2V_sse2(pPixY - 4, iStride, &uiBuf[0]);
-	DeblockLumaEq4V_sse2(&uiBuf[4*16], 16, iAlpha, iBeta);
-	DeblockLumaTransposeV2H_sse2(pPixY - 4, iStride, &uiBuf[0]);
-}
-
-}
-
-#endif
-
-
-void  DeblockingInit( DeblockingFunc  * pFunc,  int32_t iCpu )
-{
-	pFunc->pfLumaDeblockingLT4Ver		= DeblockLumaLt4V_c;
-	pFunc->pfLumaDeblockingEQ4Ver		= DeblockLumaEq4V_c;
-	pFunc->pfLumaDeblockingLT4Hor		= DeblockLumaLt4H_c;
-	pFunc->pfLumaDeblockingEQ4Hor		= DeblockLumaEq4H_c;
-
-	pFunc->pfChromaDeblockingLT4Ver	= DeblockChromaLt4V_c;
-	pFunc->pfChromaDeblockingEQ4Ver	= DeblockChromaEq4V_c;
-	pFunc->pfChromaDeblockingLT4Hor	= DeblockChromaLt4H_c;
-	pFunc->pfChromaDeblockinEQ4Hor	= DeblockChromaEq4H_c;
-
-
-#ifdef X86_ASM
-	if( iCpu & WELS_CPU_SSE2 ){
-	    pFunc->pfLumaDeblockingLT4Ver	= DeblockLumaLt4V_sse2;
-	    pFunc->pfLumaDeblockingEQ4Ver	= DeblockLumaEq4V_sse2;
-		pFunc->pfLumaDeblockingLT4Hor   = DeblockLumaLt4H_sse2;
-		pFunc->pfLumaDeblockingEQ4Hor   = DeblockLumaEq4H_sse2;
-	    pFunc->pfChromaDeblockingLT4Ver	= DeblockChromaLt4V_sse2;
-	    pFunc->pfChromaDeblockingEQ4Ver	= DeblockChromaEq4V_sse2;
-	    pFunc->pfChromaDeblockingLT4Hor	= DeblockChromaLt4H_sse2;
-	    pFunc->pfChromaDeblockinEQ4Hor	= DeblockChromaEq4H_sse2;
-	}
-#endif		
-}
-
-
-} // namespace WelsSVCEnc
-
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	deblocking.c
+ *
+ * \brief	Interfaces introduced in frame deblocking filtering
+ *
+ * \date	08/03/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#include "as264_common.h"
+#include "deblocking.h"
+#include "cpu_core.h"
+#include "array_stack_align.h"
+
+namespace WelsSVCEnc {
+
+#define g_kuiAlphaTable(x) g_kuiAlphaTable[(x)]
+#define g_kiBetaTable(x)  g_kiBetaTable[(x)]
+#define g_kiTc0Table(x)   g_kiTc0Table[(x)]
+
+#define MB_BS_MV(sCurMv, sNeighMv, uiBIdx, uiBnIdx) \
+	(\
+	( WELS_ABS( sCurMv[uiBIdx].iMvX - sNeighMv[uiBnIdx].iMvX ) >= 4 ) ||\
+	( WELS_ABS( sCurMv[uiBIdx].iMvY - sNeighMv[uiBnIdx].iMvY ) >= 4 )\
+	)
+
+#define SMB_EDGE_MV(uiRefIndex, sMotionVector, uiBIdx, uiBnIdx) \
+	(\
+	!!((WELS_ABS(sMotionVector[uiBIdx].iMvX - sMotionVector[uiBnIdx].iMvX) &(~3)) | (WELS_ABS(sMotionVector[uiBIdx].iMvY - sMotionVector[uiBnIdx].iMvY) &(~3)))\
+	)
+
+#define BS_EDGE(bsx1, uiRefIndex, sMotionVector, uiBIdx, uiBnIdx) \
+	( (bsx1|SMB_EDGE_MV(uiRefIndex, sMotionVector, uiBIdx, uiBnIdx))<<(bsx1?1:0))
+
+#define GET_ALPHA_BETA_FROM_QP(QP, iAlphaOffset, iBetaOffset, iIdexA, iAlpha, iBeta) \
+{\
+	iIdexA = (QP + iAlphaOffset);\
+	iIdexA = CLIP3_QP_0_51(iIdexA);\
+	iAlpha = g_kuiAlphaTable(iIdexA);\
+	iBeta  = g_kiBetaTable((CLIP3_QP_0_51(QP + iBetaOffset)));\
+}
+
+static const uint8_t g_kuiAlphaTable[52 + 12] = { //alpha thresholds read by GET_ALPHA_BETA_FROM_QP; this table refers to Table 8-16 in H.264/AVC standard
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
+  7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
+  25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
+  80, 90, 101, 113, 127, 144, 162, 182, 203, 226,
+  255, 255
+  , 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 //12 extra entries past index 51, all repeating the index-51 value
+};
+
+static const int8_t g_kiBetaTable[52 + 12] = { //beta thresholds read by GET_ALPHA_BETA_FROM_QP; this table refers to Table 8-16 in H.264/AVC standard
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
+  3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
+  8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
+  13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
+  18, 18
+  , 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18 //12 extra entries past index 51, all repeating the index-51 value
+};
+
+static const int8_t g_kiTc0Table[52 + 12][4] = { //rows selected by iIdexA, columns by the bS byte (see TC0_TBL_LOOKUP); this table refers to Table 8-17 in H.264/AVC standard
+  { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 },
+  { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 },
+  { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 1 },
+  { -1, 0, 0, 1 }, { -1, 0, 0, 1 }, { -1, 0, 0, 1 }, { -1, 0, 1, 1 }, { -1, 0, 1, 1 }, { -1, 1, 1, 1 },
+  { -1, 1, 1, 1 }, { -1, 1, 1, 1 }, { -1, 1, 1, 1 }, { -1, 1, 1, 2 }, { -1, 1, 1, 2 }, { -1, 1, 1, 2 },
+  { -1, 1, 1, 2 }, { -1, 1, 2, 3 }, { -1, 1, 2, 3 }, { -1, 2, 2, 3 }, { -1, 2, 2, 4 }, { -1, 2, 3, 4 },
+  { -1, 2, 3, 4 }, { -1, 3, 3, 5 }, { -1, 3, 4, 6 }, { -1, 3, 4, 6 }, { -1, 4, 5, 7 }, { -1, 4, 5, 8 },
+  { -1, 4, 6, 9 }, { -1, 5, 7, 10 }, { -1, 6, 8, 11 }, { -1, 6, 8, 13 }, { -1, 7, 10, 14 }, { -1, 8, 11, 16 },
+  { -1, 9, 12, 18 }, { -1, 10, 13, 20 }, { -1, 11, 15, 23 }, { -1, 13, 17, 25 }
+  , { -1, 13, 17, 25 }, { -1, 13, 17, 25 }, { -1, 13, 17, 25 }, { -1, 13, 17, 25 }, { -1, 13, 17, 25 }, { -1, 13, 17, 25 } //this line and the next: 12 extra rows past index 51
+  , { -1, 13, 17, 25 }, { -1, 13, 17, 25 }, { -1, 13, 17, 25 }, { -1, 13, 17, 25 }, { -1, 13, 17, 25 }, { -1, 13, 17, 25 } //all repeating the index-51 row
+};
+
+// 4x4 block indices on either side of a macroblock boundary, as read by
+// DeblockingBSMarginalMBAvcbase: row [iEdge], entries 0..3 index the current macroblock and
+// entries 4..7 index the neighbouring macroblock. Callers in this file use row 0 for the left
+// neighbour and row 1 for the top neighbour.
+static const uint8_t g_kuiTableBIdx[2][8] = {
+  {
+    0,  4,  8,  12, // blocks of the current macroblock
+    3,  7,  11, 15  // blocks of the neighbouring macroblock
+  }, // row 0
+
+  {
+    0,  1,  2,  3 , // blocks of the current macroblock
+    12, 13, 14, 15  // blocks of the neighbouring macroblock
+  }, // row 1
+};
+
+// Lookup of values 0..3 used by DeblockingBSMarginalMBAvcbase to index pCurMb->uiRefIndex (only
+// when SINGLE_REF_FRAME is not defined). Each outer group lists its 4x4 entries flattened, four
+// per line.
+// NOTE(review): presumably maps [direction][edge][position] to the 8x8 block on the current side
+// of the edge, and the last ALIGNED_DECLARE argument is presumably the byte alignment - confirm.
+static const ALIGNED_DECLARE (int32_t, g_kiTableBlock8x8Idx[2][4][4], 16) = {
+  {
+    0, 0, 2, 2,
+    0, 0, 2, 2,
+    1, 1, 3, 3,
+    1, 1, 3, 3
+  },
+
+  {
+    0, 0, 1, 1,
+    0, 0, 1, 1,
+    2, 2, 3, 3,
+    2, 2, 3, 3
+  }
+};
+// Lookup of values 0..3 used by DeblockingBSMarginalMBAvcbase to index pNeighMb->uiRefIndex (only
+// when SINGLE_REF_FRAME is not defined). Same layout as g_kiTableBlock8x8Idx.
+// NOTE(review): presumably the 8x8 block on the neighbouring side of each edge - confirm.
+static const ALIGNED_DECLARE (int32_t, g_kiTableBlock8x8NIdx[2][4][4], 16) = {
+  {
+    1, 1, 3, 3,
+    0, 0, 2, 2,
+    0, 0, 2, 2,
+    1, 1, 3, 3
+  },
+
+  {
+    2, 2, 3, 3,
+    0, 0, 1, 1,
+    0, 0, 1, 1,
+    2, 2, 3, 3
+  }
+};
+
+// Fills iTc[0..3]: for k = 0..3, iTc[k] = tc0 table row iIdexA, column pBS[k], plus bchroma.
+// Callers in this file pass bchroma = 0 for luma and 1 for chroma. The macro expands to a brace
+// block and reads pBS[0..3], so pBS must point to at least four bytes.
+// NOTE(review): g_kiTc0Table(...) is written in call form; presumably a function-like macro
+// defined outside this excerpt that indexes g_kiTc0Table - confirm.
+#define TC0_TBL_LOOKUP(iTc, iIdexA, pBS, bchroma) \
+{\
+	iTc[0] = g_kiTc0Table(iIdexA)[pBS[0]] + bchroma;\
+	iTc[1] = g_kiTc0Table(iIdexA)[pBS[1]] + bchroma;\
+	iTc[2] = g_kiTc0Table(iIdexA)[pBS[2]] + bchroma;\
+	iTc[3] = g_kiTc0Table(iIdexA)[pBS[3]] + bchroma;\
+}
+
+// Derives the inner-edge boundary strengths of one macroblock from its non-zero-count bytes only.
+//   pNnzTab        16 bytes, read as four 32-bit words (word k covers bytes 4k..4k+3)
+//   uiBS           output; uiBS[0][1..3][*] and uiBS[1][1..3][*] are written, edge 0 of either
+//                  direction is left untouched
+//   iLShiftFactor  left shift applied to every packed word (the caller in this file passes 1)
+// NOTE(review): picking bytes out of uiBsx3[] and storing whole words into uiBS[1][*] depends on
+// the target byte order, and the word-wide shift only stays inside each byte while every input
+// byte is small enough (e.g. 0 or 1) - confirm both assumptions for the supported targets.
+void inline DeblockingBSInsideMBAvsbase (int8_t* pNnzTab, uint8_t uiBS[2][4][4], int32_t iLShiftFactor) {
+  uint32_t uiNnz32b0, uiNnz32b1, uiNnz32b2, uiNnz32b3;
+  ENFORCE_STACK_ALIGN_1D (uint8_t, uiBsx3, 4, 4);
+
+  uiNnz32b0 = * (uint32_t*) (pNnzTab + 0);
+  uiNnz32b1 = * (uint32_t*) (pNnzTab + 4);
+  uiNnz32b2 = * (uint32_t*) (pNnzTab + 8);
+  uiNnz32b3 = * (uint32_t*) (pNnzTab + 12);
+
+  // word | (word >> 8) ORs each byte with the following byte of the same word; the first three
+  // result bytes are spread over uiBS[0][1..3][k]
+  * (uint32_t*)uiBsx3 = (uiNnz32b0 | (uiNnz32b0 >> 8)) << iLShiftFactor;
+  uiBS[0][1][0] = uiBsx3[0];
+  uiBS[0][2][0] = uiBsx3[1];
+  uiBS[0][3][0] = uiBsx3[2];
+
+  * (uint32_t*)uiBsx3 = (uiNnz32b1 | (uiNnz32b1 >> 8)) << iLShiftFactor;
+  uiBS[0][1][1] = uiBsx3[0];
+  uiBS[0][2][1] = uiBsx3[1];
+  uiBS[0][3][1] = uiBsx3[2];
+  // ORing two consecutive words gives all four values of uiBS[1][k] at once
+  * (uint32_t*)uiBS[1][1] = (uiNnz32b0 | uiNnz32b1) << iLShiftFactor;
+
+  * (uint32_t*)uiBsx3 = (uiNnz32b2 | (uiNnz32b2 >> 8)) << iLShiftFactor;
+  uiBS[0][1][2] = uiBsx3[0];
+  uiBS[0][2][2] = uiBsx3[1];
+  uiBS[0][3][2] = uiBsx3[2];
+  * (uint32_t*)uiBS[1][2] = (uiNnz32b1 | uiNnz32b2) << iLShiftFactor;
+
+  * (uint32_t*)uiBsx3 = (uiNnz32b3 | (uiNnz32b3 >> 8)) << iLShiftFactor;
+  uiBS[0][1][3] = uiBsx3[0];
+  uiBS[0][2][3] = uiBsx3[1];
+  uiBS[0][3][3] = uiBsx3[2];
+  * (uint32_t*)uiBS[1][3] = (uiNnz32b2 | uiNnz32b3) << iLShiftFactor;
+
+}
+
+// Derives the inner-edge boundary strengths of one macroblock from both its non-zero-count bytes
+// and its motion vectors, via BS_EDGE.
+//   pCurMb   macroblock whose sMv array is handed to BS_EDGE
+//   uiBS     output; uiBS[0][1..3][*] and uiBS[1][1..3][*] are written, edge 0 is left untouched
+//   pNnzTab  16 bytes, read as four 32-bit words (word k covers bytes 4k..4k+3)
+// The last two BS_EDGE arguments are the 4x4 block indices on the two sides of the edge.
+// NOTE(review): iRefIdx is not declared in this function; presumably SMB_EDGE_MV discards its
+// first argument in this configuration - confirm. The byte picking via uiBsx4[] also depends on
+// the target byte order.
+void inline DeblockingBSInsideMBNormal (SMB* pCurMb, uint8_t uiBS[2][4][4], int8_t* pNnzTab) {
+  uint32_t uiNnz32b0, uiNnz32b1, uiNnz32b2, uiNnz32b3;
+  ENFORCE_STACK_ALIGN_1D (uint8_t, uiBsx4, 4, 4);
+
+  uiNnz32b0 = * (uint32_t*) (pNnzTab + 0);
+  uiNnz32b1 = * (uint32_t*) (pNnzTab + 4);
+  uiNnz32b2 = * (uint32_t*) (pNnzTab + 8);
+  uiNnz32b3 = * (uint32_t*) (pNnzTab + 12);
+
+  // uiBS[0][*]: each byte ORed with the following byte of the same word
+  * (uint32_t*)uiBsx4 = (uiNnz32b0 | (uiNnz32b0 >> 8));
+  uiBS[0][1][0] = BS_EDGE (uiBsx4[0], iRefIdx, pCurMb->sMv, 1, 0);
+  uiBS[0][2][0] = BS_EDGE (uiBsx4[1], iRefIdx, pCurMb->sMv, 2, 1);
+  uiBS[0][3][0] = BS_EDGE (uiBsx4[2], iRefIdx, pCurMb->sMv, 3, 2);
+
+  * (uint32_t*)uiBsx4 = (uiNnz32b1 | (uiNnz32b1 >> 8));
+  uiBS[0][1][1] = BS_EDGE (uiBsx4[0], iRefIdx, pCurMb->sMv, 5, 4);
+  uiBS[0][2][1] = BS_EDGE (uiBsx4[1], iRefIdx, pCurMb->sMv, 6, 5);
+  uiBS[0][3][1] = BS_EDGE (uiBsx4[2], iRefIdx, pCurMb->sMv, 7, 6);
+
+  * (uint32_t*)uiBsx4 = (uiNnz32b2 | (uiNnz32b2 >> 8));
+  uiBS[0][1][2] = BS_EDGE (uiBsx4[0], iRefIdx, pCurMb->sMv, 9, 8);
+  uiBS[0][2][2] = BS_EDGE (uiBsx4[1], iRefIdx, pCurMb->sMv, 10, 9);
+  uiBS[0][3][2] = BS_EDGE (uiBsx4[2], iRefIdx, pCurMb->sMv, 11, 10);
+
+  * (uint32_t*)uiBsx4 = (uiNnz32b3 | (uiNnz32b3 >> 8));
+  uiBS[0][1][3] = BS_EDGE (uiBsx4[0], iRefIdx, pCurMb->sMv, 13, 12);
+  uiBS[0][2][3] = BS_EDGE (uiBsx4[1], iRefIdx, pCurMb->sMv, 14, 13);
+  uiBS[0][3][3] = BS_EDGE (uiBsx4[2], iRefIdx, pCurMb->sMv, 15, 14);
+
+  //horizontal
+  // uiBS[1][*]: two consecutive words ORed together, block indices differ by 4
+  * (uint32_t*)uiBsx4 = (uiNnz32b0 | uiNnz32b1);
+  uiBS[1][1][0] = BS_EDGE (uiBsx4[0], iRefIdx, pCurMb->sMv, 4, 0);
+  uiBS[1][1][1] = BS_EDGE (uiBsx4[1], iRefIdx, pCurMb->sMv, 5, 1);
+  uiBS[1][1][2] = BS_EDGE (uiBsx4[2], iRefIdx, pCurMb->sMv, 6, 2);
+  uiBS[1][1][3] = BS_EDGE (uiBsx4[3], iRefIdx, pCurMb->sMv, 7, 3);
+
+  * (uint32_t*)uiBsx4 = (uiNnz32b1 | uiNnz32b2);
+  uiBS[1][2][0] = BS_EDGE (uiBsx4[0], iRefIdx, pCurMb->sMv, 8, 4);
+  uiBS[1][2][1] = BS_EDGE (uiBsx4[1], iRefIdx, pCurMb->sMv, 9, 5);
+  uiBS[1][2][2] = BS_EDGE (uiBsx4[2], iRefIdx, pCurMb->sMv, 10, 6);
+  uiBS[1][2][3] = BS_EDGE (uiBsx4[3], iRefIdx, pCurMb->sMv, 11, 7);
+
+  * (uint32_t*)uiBsx4 = (uiNnz32b2 | uiNnz32b3);
+  uiBS[1][3][0] = BS_EDGE (uiBsx4[0], iRefIdx, pCurMb->sMv, 12, 8);
+  uiBS[1][3][1] = BS_EDGE (uiBsx4[1], iRefIdx, pCurMb->sMv, 13, 9);
+  uiBS[1][3][2] = BS_EDGE (uiBsx4[2], iRefIdx, pCurMb->sMv, 14, 10);
+  uiBS[1][3][3] = BS_EDGE (uiBsx4[3], iRefIdx, pCurMb->sMv, 15, 11);
+}
+
+/*!
+ * \brief   Compute the four boundary strengths of the edge shared by pCurMb and pNeighMb.
+ *
+ * \param   pCurMb    current macroblock
+ * \param   pNeighMb  macroblock on the other side of the edge
+ * \param   iEdge     row of g_kuiTableBIdx to use (callers in this file pass 0 for the left
+ *                    neighbour and 1 for the top neighbour)
+ * \return  the four strength bytes packed in memory order into one uint32_t, so the caller can
+ *          store the word directly into a uint8_t[4] row of its strength array. A byte is 2 when
+ *          either side has a non-zero count for the block pair, otherwise the 0/1 result of the
+ *          reference/motion-vector comparison.
+ */
+uint32_t DeblockingBSMarginalMBAvcbase (SMB* pCurMb, SMB* pNeighMb, int32_t iEdge) {
+  int32_t i;
+  uint32_t uiBSx4;
+  uint8_t* pBS = (uint8_t*) (&uiBSx4);
+  // Read the index table byte by byte. Loading four uint8_t entries through a uint32_t pointer
+  // and peeling them off with ">> 8" only yields them in table order on little-endian targets,
+  // and accesses the byte array through an incompatible, possibly misaligned pointer type.
+  const uint8_t* pBIdx  = &g_kuiTableBIdx[iEdge][0]; // 4x4 block indices inside pCurMb
+  const uint8_t* pBnIdx = &g_kuiTableBIdx[iEdge][4]; // matching block indices inside pNeighMb
+
+  for (i = 0; i < 4; i++) {
+    if (pCurMb->pNonZeroCount[pBIdx[i]] | pNeighMb->pNonZeroCount[pBnIdx[i]]) {
+      pBS[i] = 2;
+    } else {
+      pBS[i] =
+#ifndef SINGLE_REF_FRAME
+        // NOTE(review): the first table index is the constant 1 while iEdge selects the second
+        // one; confirm this matches the intended layout of the 8x8 index tables.
+        (pCurMb->uiRefIndex[g_kiTableBlock8x8Idx[1][iEdge][i]] - pNeighMb->uiRefIndex[g_kiTableBlock8x8NIdx[1][iEdge][i]]) ||
+#endif
+        MB_BS_MV (pCurMb->sMv, pNeighMb->sMv, pBIdx[i], pBnIdx[i]);
+    }
+  }
+  return uiBSx4;
+}
+
+// Filters one luma edge through pfLumaDeblockingLT4Ver. Alpha/beta come from pFilter->uiLumaQP
+// and the slice offsets; the four tc values are looked up from the four strength bytes at pBS.
+// Nothing is filtered when alpha and beta are both zero.
+void FilteringEdgeLumaH (DeblockingFunc* pfDeblocking, SDeblockingFilter* pFilter, uint8_t* pPix, int32_t iStride,
+                         uint8_t* pBS) {
+  int32_t iTableIdx, iAlphaThr, iBetaThr;
+  ENFORCE_STACK_ALIGN_1D (int8_t, iTcPerBs, 4, 16);
+
+  GET_ALPHA_BETA_FROM_QP (pFilter->uiLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset,
+                          iTableIdx, iAlphaThr, iBetaThr);
+
+  if (0 == (iAlphaThr | iBetaThr))
+    return;
+
+  TC0_TBL_LOOKUP (iTcPerBs, iTableIdx, pBS, 0);
+  pfDeblocking->pfLumaDeblockingLT4Ver (pPix, iStride, iAlphaThr, iBetaThr, iTcPerBs);
+}
+// Filters one luma edge through pfLumaDeblockingLT4Hor. Alpha/beta come from pFilter->uiLumaQP
+// and the slice offsets; the four tc values are looked up from the four strength bytes at pBS.
+// Nothing is filtered when alpha and beta are both zero.
+void FilteringEdgeLumaV (DeblockingFunc* pfDeblocking, SDeblockingFilter* pFilter, uint8_t* pPix, int32_t iStride,
+                         uint8_t* pBS) {
+  int32_t iTableIdx, iAlphaThr, iBetaThr;
+  ENFORCE_STACK_ALIGN_1D (int8_t, iTcPerBs, 4, 16);
+
+  GET_ALPHA_BETA_FROM_QP (pFilter->uiLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset,
+                          iTableIdx, iAlphaThr, iBetaThr);
+
+  if (0 == (iAlphaThr | iBetaThr))
+    return;
+
+  TC0_TBL_LOOKUP (iTcPerBs, iTableIdx, pBS, 0);
+  pfDeblocking->pfLumaDeblockingLT4Hor (pPix, iStride, iAlphaThr, iBetaThr, iTcPerBs);
+}
+
+// Filters one luma edge through pfLumaDeblockingEQ4Ver (no tc clipping values involved).
+// Alpha/beta come from pFilter->uiLumaQP and the slice offsets. pBS is not read; callers in this
+// file pass NULL. Nothing is filtered when alpha and beta are both zero.
+void FilteringEdgeLumaIntraH (DeblockingFunc* pfDeblocking, SDeblockingFilter* pFilter, uint8_t* pPix, int32_t iStride,
+                              uint8_t* pBS) {
+  int32_t iTableIdx, iAlphaThr, iBetaThr;
+
+  GET_ALPHA_BETA_FROM_QP (pFilter->uiLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset,
+                          iTableIdx, iAlphaThr, iBetaThr);
+
+  if (0 == (iAlphaThr | iBetaThr))
+    return;
+
+  pfDeblocking->pfLumaDeblockingEQ4Ver (pPix, iStride, iAlphaThr, iBetaThr);
+}
+
+// Filters one luma edge through pfLumaDeblockingEQ4Hor (no tc clipping values involved).
+// Alpha/beta come from pFilter->uiLumaQP and the slice offsets. pBS is not read; callers in this
+// file pass NULL. Nothing is filtered when alpha and beta are both zero.
+void FilteringEdgeLumaIntraV (DeblockingFunc* pfDeblocking, SDeblockingFilter* pFilter, uint8_t* pPix, int32_t iStride,
+                              uint8_t* pBS) {
+  int32_t iTableIdx, iAlphaThr, iBetaThr;
+
+  GET_ALPHA_BETA_FROM_QP (pFilter->uiLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset,
+                          iTableIdx, iAlphaThr, iBetaThr);
+
+  if (0 == (iAlphaThr | iBetaThr))
+    return;
+
+  pfDeblocking->pfLumaDeblockingEQ4Hor (pPix, iStride, iAlphaThr, iBetaThr);
+}
+// Filters one edge of both chroma planes through pfChromaDeblockingLT4Ver. Alpha/beta come from
+// pFilter->uiChromaQP and the slice offsets; each tc value is the table entry for the matching
+// strength byte at pBS plus 1. Nothing is filtered when alpha and beta are both zero.
+void FilteringEdgeChromaH (DeblockingFunc* pfDeblocking, SDeblockingFilter* pFilter, uint8_t* pPixCb, uint8_t* pPixCr,
+                           int32_t iStride, uint8_t* pBS) {
+  int32_t iTableIdx, iAlphaThr, iBetaThr;
+  ENFORCE_STACK_ALIGN_1D (int8_t, iTcPerBs, 4, 16);
+
+  GET_ALPHA_BETA_FROM_QP (pFilter->uiChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset,
+                          iTableIdx, iAlphaThr, iBetaThr);
+
+  if (0 == (iAlphaThr | iBetaThr))
+    return;
+
+  TC0_TBL_LOOKUP (iTcPerBs, iTableIdx, pBS, 1);
+  pfDeblocking->pfChromaDeblockingLT4Ver (pPixCb, pPixCr, iStride, iAlphaThr, iBetaThr, iTcPerBs);
+}
+// Filters one edge of both chroma planes through pfChromaDeblockingLT4Hor. Alpha/beta come from
+// pFilter->uiChromaQP and the slice offsets; each tc value is the table entry for the matching
+// strength byte at pBS plus 1. Nothing is filtered when alpha and beta are both zero.
+void FilteringEdgeChromaV (DeblockingFunc* pfDeblocking, SDeblockingFilter* pFilter, uint8_t* pPixCb, uint8_t* pPixCr,
+                           int32_t iStride, uint8_t* pBS) {
+  int32_t iTableIdx, iAlphaThr, iBetaThr;
+  ENFORCE_STACK_ALIGN_1D (int8_t, iTcPerBs, 4, 16);
+
+  GET_ALPHA_BETA_FROM_QP (pFilter->uiChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset,
+                          iTableIdx, iAlphaThr, iBetaThr);
+
+  if (0 == (iAlphaThr | iBetaThr))
+    return;
+
+  TC0_TBL_LOOKUP (iTcPerBs, iTableIdx, pBS, 1);
+  pfDeblocking->pfChromaDeblockingLT4Hor (pPixCb, pPixCr, iStride, iAlphaThr, iBetaThr, iTcPerBs);
+}
+
+// Filters one edge of both chroma planes through pfChromaDeblockingEQ4Ver (no tc values).
+// Alpha/beta come from pFilter->uiChromaQP and the slice offsets. pBS is not read; callers in
+// this file pass NULL. Nothing is filtered when alpha and beta are both zero.
+void FilteringEdgeChromaIntraH (DeblockingFunc* pfDeblocking, SDeblockingFilter* pFilter, uint8_t* pPixCb,
+                                uint8_t* pPixCr, int32_t iStride, uint8_t* pBS) {
+  int32_t iTableIdx, iAlphaThr, iBetaThr;
+
+  GET_ALPHA_BETA_FROM_QP (pFilter->uiChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset,
+                          iTableIdx, iAlphaThr, iBetaThr);
+
+  if (0 == (iAlphaThr | iBetaThr))
+    return;
+
+  pfDeblocking->pfChromaDeblockingEQ4Ver (pPixCb, pPixCr, iStride, iAlphaThr, iBetaThr);
+}
+
+// Filters one edge of both chroma planes through pfChromaDeblockinEQ4Hor (no tc values).
+// Alpha/beta come from pFilter->uiChromaQP and the slice offsets. pBS is not read; callers in
+// this file pass NULL. Nothing is filtered when alpha and beta are both zero.
+void FilteringEdgeChromaIntraV (DeblockingFunc* pfDeblocking, SDeblockingFilter* pFilter, uint8_t* pPixCb,
+                                uint8_t* pPixCr, int32_t iStride, uint8_t* pBS) {
+  int32_t iTableIdx, iAlphaThr, iBetaThr;
+
+  GET_ALPHA_BETA_FROM_QP (pFilter->uiChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset,
+                          iTableIdx, iAlphaThr, iBetaThr);
+
+  if (0 == (iAlphaThr | iBetaThr))
+    return;
+
+  pfDeblocking->pfChromaDeblockinEQ4Hor (pPixCb, pPixCr, iStride, iAlphaThr, iBetaThr);
+}
+
+// Deblocks one macroblock using the per-edge boundary strengths in uiBS.
+//   uiBS[0][e]  drives the FilteringEdge*V calls at luma column offset e * 4
+//   uiBS[1][e]  drives the FilteringEdge*H calls at luma row offset e * 4
+// Edge 0 of each direction (shared with the left / top macroblock) is filtered only when the
+// matching neighbour flag is set, and uses the rounded average of both macroblocks' QPs; edges
+// 1..3 use the current macroblock's QP. Chroma is filtered on edges 0 and 2 only.
+// pFilter->uiLumaQP and pFilter->uiChromaQP are overwritten as working values.
+void DeblockingInterMb (DeblockingFunc* pfDeblocking, SMB* pCurMb, SDeblockingFilter* pFilter, uint8_t uiBS[2][4][4]) {
+  int8_t iCurLumaQp   = pCurMb->uiLumaQp;
+  int8_t iCurChromaQp = pCurMb->uiChromaQp;
+  int32_t iLineSize     = pFilter->iCsStride[0];
+  int32_t iLineSizeUV   = pFilter->iCsStride[1];
+  int32_t iMbStride    = pFilter->iMbStride;
+
+  int32_t iMbX = pCurMb->iMbX;
+  int32_t iMbY = pCurMb->iMbY;
+
+  // [0]: a neighbour exists; [1]: a neighbour exists and carries the same uiSliceIdc.
+  // pFilter->uiFilterIdc picks which of the two rules applies.
+  BOOL_T bLeftBsValid[2] = { (iMbX > 0), ((iMbX > 0) && (pCurMb->uiSliceIdc == (pCurMb - 1)->uiSliceIdc))};
+  BOOL_T bTopBsValid[2]  = { (iMbY > 0), ((iMbY > 0) && (pCurMb->uiSliceIdc == (pCurMb - iMbStride)->uiSliceIdc))};
+
+  int32_t iLeftFlag = bLeftBsValid[pFilter->uiFilterIdc];
+  int32_t iTopFlag  = bTopBsValid[pFilter->uiFilterIdc];
+
+  uint8_t* pDestY, *pDestCb, *pDestCr;
+  pDestY  = pFilter->pCsData[0];
+  pDestCb = pFilter->pCsData[1];
+  pDestCr = pFilter->pCsData[2];
+
+  if (iLeftFlag) {
+    pFilter->uiLumaQP   = (iCurLumaQp + (pCurMb - 1)->uiLumaQp + 1) >> 1;
+    pFilter->uiChromaQP = (iCurChromaQp + (pCurMb - 1)->uiChromaQp + 1) >> 1;
+
+    // only the first strength byte is tested to choose the EQ4 path for the whole edge
+    if (uiBS[0][0][0] == 0x04) {
+      FilteringEdgeLumaIntraV (pfDeblocking, pFilter, pDestY, iLineSize , NULL);
+      FilteringEdgeChromaIntraV (pfDeblocking, pFilter, pDestCb, pDestCr, iLineSizeUV, NULL);
+    } else {
+      if (* (uint32_t*)uiBS[0][0] != 0) {
+        FilteringEdgeLumaV (pfDeblocking, pFilter, pDestY, iLineSize, uiBS[0][0]);
+        FilteringEdgeChromaV (pfDeblocking, pFilter, pDestCb, pDestCr, iLineSizeUV, uiBS[0][0]);
+      }
+    }
+  }
+
+  // inner edges use the current macroblock's own QPs
+  pFilter->uiLumaQP = iCurLumaQp;
+  pFilter->uiChromaQP = iCurChromaQp;
+
+  if (* (uint32_t*)uiBS[0][1] != 0) {
+    FilteringEdgeLumaV (pfDeblocking, pFilter, &pDestY[1 << 2], iLineSize, uiBS[0][1]);
+  }
+
+  if (* (uint32_t*)uiBS[0][2] != 0) {
+    FilteringEdgeLumaV (pfDeblocking, pFilter, &pDestY[2 << 2], iLineSize, uiBS[0][2]);
+    FilteringEdgeChromaV (pfDeblocking, pFilter, &pDestCb[2 << 1], &pDestCr[2 << 1], iLineSizeUV, uiBS[0][2]);
+  }
+
+  if (* (uint32_t*)uiBS[0][3] != 0) {
+    FilteringEdgeLumaV (pfDeblocking, pFilter, &pDestY[3 << 2], iLineSize, uiBS[0][3]);
+  }
+
+  if (iTopFlag) {
+    pFilter->uiLumaQP = (iCurLumaQp + (pCurMb - iMbStride)->uiLumaQp + 1) >> 1;
+    pFilter->uiChromaQP = (iCurChromaQp + (pCurMb - iMbStride)->uiChromaQp + 1) >> 1;
+
+    if (uiBS[1][0][0] == 0x04) {
+      FilteringEdgeLumaIntraH (pfDeblocking, pFilter, pDestY, iLineSize , NULL);
+      FilteringEdgeChromaIntraH (pfDeblocking, pFilter, pDestCb, pDestCr, iLineSizeUV, NULL);
+    } else {
+      if (* (uint32_t*)uiBS[1][0] != 0) {
+        FilteringEdgeLumaH (pfDeblocking, pFilter, pDestY, iLineSize, uiBS[1][0]);
+        FilteringEdgeChromaH (pfDeblocking, pFilter, pDestCb, pDestCr, iLineSizeUV, uiBS[1][0]);
+      }
+    }
+  }
+
+  pFilter->uiLumaQP = iCurLumaQp;
+  pFilter->uiChromaQP = iCurChromaQp;
+
+  if (* (uint32_t*)uiBS[1][1] != 0) {
+    FilteringEdgeLumaH (pfDeblocking, pFilter, &pDestY[ (1 << 2)*iLineSize], iLineSize, uiBS[1][1]);
+  }
+
+  if (* (uint32_t*)uiBS[1][2] != 0) {
+    FilteringEdgeLumaH (pfDeblocking, pFilter, &pDestY[ (2 << 2)*iLineSize], iLineSize, uiBS[1][2]);
+    FilteringEdgeChromaH (pfDeblocking, pFilter, &pDestCb[ (2 << 1)*iLineSizeUV], &pDestCr[ (2 << 1)*iLineSizeUV],
+                          iLineSizeUV, uiBS[1][2]);
+  }
+
+  if (* (uint32_t*)uiBS[1][3] != 0) {
+    FilteringEdgeLumaH (pfDeblocking, pFilter, &pDestY[ (3 << 2)*iLineSize], iLineSize, uiBS[1][3]);
+  }
+}
+
+// Luma deblocking of one macroblock on the intra path (called from DeblockingIntraMb).
+// The left and top macroblock edges go through the FilteringEdgeLumaIntra* (EQ4) helpers with the
+// rounded average of both macroblocks' luma QPs, when the matching neighbour flag is set. The
+// three inner edges of each direction use the LT4 filters with a fixed strength of 3 per segment;
+// alpha, beta and tc are derived once from the current QP and shared by both directions.
+// pFilter->uiLumaQP is overwritten as a working value.
+void FilteringEdgeLumaHV (DeblockingFunc* pfDeblocking, SMB* pCurMb, SDeblockingFilter* pFilter) {
+  int32_t iLineSize  = pFilter->iCsStride[0];
+  int32_t iMbStride = pFilter->iMbStride;
+
+  uint8_t*  pDestY;
+  int8_t   iCurQp;
+  int32_t  iIdexA, iAlpha, iBeta;
+
+  int32_t iMbX = pCurMb->iMbX;
+  int32_t iMbY = pCurMb->iMbY;
+
+  // [0]: a neighbour exists; [1]: a neighbour exists and carries the same uiSliceIdc
+  BOOL_T bLeftBsValid[2] = { (iMbX > 0), ((iMbX > 0) && (pCurMb->uiSliceIdc == (pCurMb - 1)->uiSliceIdc))};
+  BOOL_T bTopBsValid[2]  = { (iMbY > 0), ((iMbY > 0) && (pCurMb->uiSliceIdc == (pCurMb - iMbStride)->uiSliceIdc))};
+
+  int32_t iLeftFlag = bLeftBsValid[pFilter->uiFilterIdc];
+  int32_t iTopFlag  = bTopBsValid[pFilter->uiFilterIdc];
+
+  ENFORCE_STACK_ALIGN_1D (int8_t,  iTc,   4, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, uiBSx4, 4, 4);
+
+  pDestY  = pFilter->pCsData[0];
+  iCurQp  = pCurMb->uiLumaQp;
+
+  // four strength bytes, all equal to 3, for the inner edges
+  * (uint32_t*)uiBSx4 = 0x03030303;
+
+  // luma v
+  if (iLeftFlag) {
+    pFilter->uiLumaQP = (iCurQp + (pCurMb - 1)->uiLumaQp + 1) >> 1;
+    FilteringEdgeLumaIntraV (pfDeblocking, pFilter, pDestY, iLineSize, NULL);
+  }
+
+  pFilter->uiLumaQP   = iCurQp;
+  GET_ALPHA_BETA_FROM_QP (pFilter->uiLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIdexA, iAlpha,
+                          iBeta);
+  if (iAlpha | iBeta) {
+    TC0_TBL_LOOKUP (iTc, iIdexA, uiBSx4, 0);
+    pfDeblocking->pfLumaDeblockingLT4Hor (&pDestY[1 << 2], iLineSize, iAlpha, iBeta, iTc);
+    pfDeblocking->pfLumaDeblockingLT4Hor (&pDestY[2 << 2], iLineSize, iAlpha, iBeta, iTc);
+    pfDeblocking->pfLumaDeblockingLT4Hor (&pDestY[3 << 2], iLineSize, iAlpha, iBeta, iTc);
+
+  }
+
+  // luma h
+  if (iTopFlag) {
+    pFilter->uiLumaQP   = (iCurQp   + (pCurMb - iMbStride)->uiLumaQp + 1) >> 1;
+    FilteringEdgeLumaIntraH (pfDeblocking, pFilter, pDestY, iLineSize, NULL);
+  }
+
+  pFilter->uiLumaQP   = iCurQp;
+  // iAlpha, iBeta and iTc still hold the values computed above for the current QP
+  if (iAlpha | iBeta) {
+    pfDeblocking->pfLumaDeblockingLT4Ver (&pDestY[ (1 << 2)*iLineSize], iLineSize, iAlpha, iBeta, iTc);
+    pfDeblocking->pfLumaDeblockingLT4Ver (&pDestY[ (2 << 2)*iLineSize], iLineSize, iAlpha, iBeta, iTc);
+    pfDeblocking->pfLumaDeblockingLT4Ver (&pDestY[ (3 << 2)*iLineSize], iLineSize, iAlpha, iBeta, iTc);
+  }
+}
+// Chroma deblocking of one macroblock on the intra path (called from DeblockingIntraMb).
+// The left and top macroblock edges go through the FilteringEdgeChromaIntra* (EQ4) helpers with
+// the rounded average of both macroblocks' chroma QPs, when the matching neighbour flag is set.
+// The single inner edge of each direction (chroma offset 4) uses the LT4 filters with a fixed
+// strength of 3; alpha, beta and tc are derived once from the current QP and shared by both
+// directions. pFilter->uiChromaQP is overwritten as a working value.
+void FilteringEdgeChromaHV (DeblockingFunc* pfDeblocking, SMB* pCurMb, SDeblockingFilter* pFilter) {
+  int32_t iLineSize  = pFilter->iCsStride[1];
+  int32_t iMbStride = pFilter->iMbStride;
+
+  uint8_t*  pDestCb, *pDestCr;
+  int8_t   iCurQp;
+  int32_t  iIdexA, iAlpha, iBeta;
+
+  int32_t iMbX = pCurMb->iMbX;
+  int32_t iMbY = pCurMb->iMbY;
+
+  // [0]: a neighbour exists; [1]: a neighbour exists and carries the same uiSliceIdc
+  BOOL_T bLeftBsValid[2] = { (iMbX > 0), ((iMbX > 0) && (pCurMb->uiSliceIdc == (pCurMb - 1)->uiSliceIdc))};
+  BOOL_T bTopBsValid[2]  = { (iMbY > 0), ((iMbY > 0) && (pCurMb->uiSliceIdc == (pCurMb - iMbStride)->uiSliceIdc))};
+
+  int32_t iLeftFlag = bLeftBsValid[pFilter->uiFilterIdc];
+  int32_t iTopFlag  = bTopBsValid[pFilter->uiFilterIdc];
+
+  ENFORCE_STACK_ALIGN_1D (int8_t,  iTc,   4, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, uiBSx4, 4, 4);
+
+  pDestCb = pFilter->pCsData[1];
+  pDestCr = pFilter->pCsData[2];
+  iCurQp  = pCurMb->uiChromaQp;
+  // four strength bytes, all equal to 3, for the inner edge
+  * (uint32_t*)uiBSx4 = 0x03030303;
+
+  // chroma v
+  if (iLeftFlag) {
+    pFilter->uiChromaQP = (iCurQp + (pCurMb - 1)->uiChromaQp + 1) >> 1;
+    FilteringEdgeChromaIntraV (pfDeblocking, pFilter, pDestCb, pDestCr, iLineSize, NULL);
+  }
+
+  pFilter->uiChromaQP   = iCurQp;
+  GET_ALPHA_BETA_FROM_QP (pFilter->uiChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIdexA, iAlpha,
+                          iBeta);
+  if (iAlpha | iBeta) {
+    TC0_TBL_LOOKUP (iTc, iIdexA, uiBSx4, 1);
+    pfDeblocking->pfChromaDeblockingLT4Hor (&pDestCb[2 << 1], &pDestCr[2 << 1], iLineSize, iAlpha, iBeta, iTc);
+  }
+
+  // chroma h
+  if (iTopFlag) {
+    pFilter->uiChromaQP = (iCurQp + (pCurMb - iMbStride)->uiChromaQp + 1) >> 1;
+    FilteringEdgeChromaIntraH (pfDeblocking, pFilter, pDestCb, pDestCr, iLineSize, NULL);
+  }
+
+  pFilter->uiChromaQP   = iCurQp;
+  // iAlpha, iBeta and iTc still hold the values computed above for the current QP
+  if (iAlpha | iBeta) {
+    pfDeblocking->pfChromaDeblockingLT4Ver (&pDestCb[ (2 << 1)*iLineSize], &pDestCr[ (2 << 1)*iLineSize], iLineSize, iAlpha,
+                                            iBeta, iTc);
+  }
+}
+
+// Deblocks one intra macroblock: luma first, then chroma. Each helper handles both edge
+// directions in one call so that the alpha/beta/tc table lookups are done once and shared.
+void DeblockingIntraMb (DeblockingFunc* pfDeblocking, SMB* pCurMb, SDeblockingFilter* pFilter) {
+  FilteringEdgeLumaHV (pfDeblocking, pCurMb, pFilter);
+  FilteringEdgeChromaHV (pfDeblocking, pCurMb, pFilter);
+}
+
+// Deblocks one macroblock. Intra macroblock types are handed to DeblockingIntraMb. For every
+// other type the boundary strengths are built first and DeblockingInterMb is called:
+//   - left / top macroblock edge: 4 for all segments when that neighbour is intra, otherwise the
+//     result of DeblockingBSMarginalMBAvcbase; 0 when the neighbour flag is not set
+//   - inner edges: all 0 for MB_TYPE_SKIP; otherwise derived from the non-zero counts (and, except
+//     for MB_TYPE_16x16, the motion vectors) after pfSetNZCZero has been applied to
+//     pCurMb->pNonZeroCount (the array is modified in place)
+void DeblockingMbAvcbase (SWelsFuncPtrList* pFunc, SMB* pCurMb, SDeblockingFilter* pFilter) {
+  uint8_t uiBS[2][4][4] = { 0 };
+
+  Mb_Type uiCurMbType = pCurMb->uiMbType;
+  int32_t iMbStride  = pFilter->iMbStride;
+
+  int32_t iMbX = pCurMb->iMbX;
+  int32_t iMbY = pCurMb->iMbY;
+
+  // [0]: a neighbour exists; [1]: a neighbour exists and carries the same uiSliceIdc
+  BOOL_T bLeftBsValid[2] = { (iMbX > 0), ((iMbX > 0) && (pCurMb->uiSliceIdc == (pCurMb - 1)->uiSliceIdc))};
+  BOOL_T bTopBsValid[2]  = { (iMbY > 0), ((iMbY > 0) && (pCurMb->uiSliceIdc == (pCurMb - iMbStride)->uiSliceIdc))};
+
+  int32_t iLeftFlag = bLeftBsValid[pFilter->uiFilterIdc];
+  int32_t iTopFlag  = bTopBsValid[pFilter->uiFilterIdc];
+
+  switch (uiCurMbType) {
+  case MB_TYPE_INTRA4x4:
+  case MB_TYPE_INTRA16x16:
+  case MB_TYPE_INTRA_PCM:
+    DeblockingIntraMb (&pFunc->pfDeblocking, pCurMb, pFilter);
+    break;
+  default:
+    if (iLeftFlag) {
+      * (uint32_t*)uiBS[0][0] = IS_INTRA ((pCurMb - 1)->uiMbType) ? 0x04040404 : DeblockingBSMarginalMBAvcbase (pCurMb,
+                                pCurMb - 1, 0);
+    } else {
+      * (uint32_t*)uiBS[0][0] = 0;
+    }
+    if (iTopFlag) {
+      * (uint32_t*)uiBS[1][0] = IS_INTRA ((pCurMb - iMbStride)->uiMbType) ? 0x04040404 : DeblockingBSMarginalMBAvcbase (
+                                  pCurMb, (pCurMb - iMbStride), 1);
+    } else {
+      * (uint32_t*)uiBS[1][0] = 0;
+    }
+    //SKIP MB_16x16 or others
+    if (uiCurMbType != MB_TYPE_SKIP) {
+      pFunc->pfSetNZCZero (pCurMb->pNonZeroCount); // set all none-zero nzc to 1; dbk can be opti!
+
+      if (uiCurMbType == MB_TYPE_16x16) {
+        // shift factor 1 turns each combined flag into a strength of 0 or 2
+        DeblockingBSInsideMBAvsbase (pCurMb->pNonZeroCount, uiBS, 1);
+      } else {
+        DeblockingBSInsideMBNormal (pCurMb, uiBS, pCurMb->pNonZeroCount);
+      }
+    } else {
+      * (uint32_t*)uiBS[0][1] = * (uint32_t*)uiBS[0][2] = * (uint32_t*)uiBS[0][3] =
+                                  * (uint32_t*)uiBS[1][1] = * (uint32_t*)uiBS[1][2] = * (uint32_t*)uiBS[1][3] = 0;
+    }
+
+    DeblockingInterMb (&pFunc->pfDeblocking, pCurMb, pFilter, uiBS);
+    break;
+  }
+}
+
+//  C code only
+
+// Plain C luma edge filter with tc clipping, applied to 16 lines that cross one edge.
+//   pPix      first sample on the q side of the edge (q0 of line 0)
+//   iStrideX  step between samples across the edge (p2 p1 p0 | q0 q1 q2)
+//   iStrideY  step from one line to the next along the edge
+//   iAlpha, iBeta  thresholds; a line is filtered only if |p0-q0| < iAlpha, |p1-p0| < iBeta and
+//                  |q1-q0| < iBeta
+//   pTc       four clipping values, one per group of four lines; a negative value skips the line
+// p1 / q1 are adjusted only when |p2-p0| / |q2-q0| is below iBeta, and each such adjustment widens
+// the clipping range used for p0 / q0 by one.
+void DeblockLumaLt4_c (uint8_t* pPix, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta, int8_t* pTc) {
+  for (int32_t i = 0; i < 16; i++) {
+    int32_t iTc0 = pTc[i >> 2];
+    if (iTc0 >= 0) {
+      int32_t p0 = pPix[-iStrideX];
+      int32_t p1 = pPix[-2 * iStrideX];
+      int32_t p2 = pPix[-3 * iStrideX];
+      int32_t q0 = pPix[0];
+      int32_t q1 = pPix[iStrideX];
+      int32_t q2 = pPix[2 * iStrideX];
+      bool_t bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
+      bool_t bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
+      bool_t bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
+      int32_t iTc = iTc0;
+      if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
+        bool_t bDetaP2P0 =  WELS_ABS (p2 - p0) < iBeta;
+        bool_t bDetaQ2Q0 =  WELS_ABS (q2 - q0) < iBeta;
+        if (bDetaP2P0) {
+          pPix[-2 * iStrideX] = p1 + WELS_CLIP3 ((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1, -iTc0, iTc0);
+          iTc++;
+        }
+        if (bDetaQ2Q0) {
+          pPix[iStrideX] = q1 + WELS_CLIP3 ((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1, -iTc0, iTc0);
+          iTc++;
+        }
+        int32_t iDeta = WELS_CLIP3 ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -iTc, iTc);
+        pPix[-iStrideX] = WELS_CLIP1 (p0 + iDeta);     /* p0' */
+        pPix[0]  = WELS_CLIP1 (q0 - iDeta);     /* q0' */
+      }
+    }
+    pPix += iStrideY;
+  }
+}
+
+
+// Plain C luma edge filter without tc clipping (the EQ4 entry points), applied to 16 lines.
+//   pPix      first sample on the q side of the edge (q0 of line 0)
+//   iStrideX  step between samples across the edge
+//   iStrideY  step from one line to the next along the edge
+//   iAlpha, iBeta  thresholds; a line is filtered only if |p0-q0| < iAlpha, |p1-p0| < iBeta and
+//                  |q1-q0| < iBeta
+// When |p0-q0| is also below (iAlpha >> 2) + 2, a side whose |p2-p0| (or |q2-q0|) is below iBeta
+// gets its three nearest samples rewritten; in every other case only p0 / q0 change.
+void DeblockLumaEq4_c (uint8_t* pPix, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta) {
+  int32_t p0, p1, p2, q0, q1, q2;
+  int32_t iDetaP0Q0;
+  bool_t bDetaP1P0, bDetaQ1Q0;
+  for (int32_t i = 0; i < 16; i++) {
+    p0 = pPix[-iStrideX];
+    p1 = pPix[-2 * iStrideX];
+    p2 = pPix[-3 * iStrideX];
+    q0 = pPix[0];
+    q1 = pPix[iStrideX];
+    q2 = pPix[2 * iStrideX];
+    iDetaP0Q0 = WELS_ABS (p0 - q0);
+    bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
+    bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
+    if ((iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0) {
+      if (iDetaP0Q0 < ((iAlpha >> 2) + 2)) {
+        bool_t bDetaP2P0 = WELS_ABS (p2 - p0) < iBeta;
+        bool_t bDetaQ2Q0 =  WELS_ABS (q2 - q0) < iBeta;
+        if (bDetaP2P0) {
+          const int32_t p3 = pPix[-4 * iStrideX];
+          pPix[-iStrideX] = (p2 + (p1 << 1) + (p0 << 1) + (q0 << 1) + q1 + 4) >> 3;	   //p0
+          pPix[-2 * iStrideX] = (p2 + p1 + p0 + q0 + 2) >> 2;	 //p1
+          pPix[-3 * iStrideX] = ((p3 << 1) + p2 + (p2 << 1) + p1 + p0 + q0 + 4) >> 3;//p2
+        } else {
+          pPix[-1 * iStrideX] = ((p1 << 1) + p0 + q1 + 2) >> 2;	//p0
+        }
+        if (bDetaQ2Q0) {
+          const int32_t q3 = pPix[3 * iStrideX];
+          pPix[0] = (p1 + (p0 << 1) + (q0 << 1) + (q1 << 1) + q2 + 4) >> 3;   //q0
+          pPix[iStrideX] = (p0 + q0 + q1 + q2 + 2) >> 2;   //q1
+          pPix[2 * iStrideX] = ((q3 << 1) + q2 + (q2 << 1) + q1 + q0 + p0 + 4) >> 3;//q2
+        } else {
+          pPix[0] = ((q1 << 1) + q0 + p1 + 2) >> 2;   //q0
+        }
+      } else {
+        pPix[-iStrideX] = ((p1 << 1) + p0 + q1 + 2) >> 2;   //p0
+        pPix[ 0] = ((q1 << 1) + q0 + p1 + 2) >> 2;   //q0
+      }
+    }
+    pPix += iStrideY;
+  }
+}
+// DeblockLumaLt4_c with iStride as the step across the edge and 1 as the step along it.
+void DeblockLumaLt4V_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* iTc) {
+  DeblockLumaLt4_c (pPix, iStride, 1, iAlpha, iBeta, iTc);
+}
+// DeblockLumaLt4_c with 1 as the step across the edge and iStride as the step along it.
+void DeblockLumaLt4H_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* iTc) {
+  DeblockLumaLt4_c (pPix, 1, iStride, iAlpha, iBeta, iTc);
+}
+// DeblockLumaEq4_c with iStride as the step across the edge and 1 as the step along it.
+void DeblockLumaEq4V_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
+  DeblockLumaEq4_c (pPix, iStride, 1, iAlpha, iBeta);
+}
+// DeblockLumaEq4_c with 1 as the step across the edge and iStride as the step along it.
+void DeblockLumaEq4H_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
+  DeblockLumaEq4_c (pPix, 1, iStride, iAlpha, iBeta);
+}
+// Plain C chroma edge filter with tc clipping, applied to 8 lines of both chroma planes.
+//   pPixCb, pPixCr  first sample on the q side of the edge in each plane
+//   iStrideX        step between samples across the edge
+//   iStrideY        step from one line to the next along the edge
+//   iAlpha, iBeta   thresholds; a line is filtered only if |p0-q0| < iAlpha, |p1-p0| < iBeta and
+//                   |q1-q0| < iBeta
+//   pTc             four clipping values, one per pair of lines; values <= 0 skip the line
+// Only p0 and q0 are modified.
+void DeblockChromaLt4_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha,
+                         int32_t iBeta, int8_t* pTc) {
+  int32_t p0, p1, q0, q1, iDeta;
+  bool_t bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
+
+  for (int32_t i = 0; i < 8; i++) {
+    int32_t iTc0 = pTc[i >> 1];
+    if (iTc0 > 0) {
+      p0 = pPixCb[-iStrideX];
+      p1 = pPixCb[-2 * iStrideX];
+      q0 = pPixCb[0];
+      q1 = pPixCb[iStrideX];
+
+      bDetaP0Q0 =  WELS_ABS (p0 - q0) < iAlpha;
+      bDetaP1P0 =  WELS_ABS (p1 - p0) < iBeta;
+      bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
+      if (bDetaP0Q0 && bDetaP1P0 &&	bDetaQ1Q0) {
+        iDeta = WELS_CLIP3 ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -iTc0, iTc0);
+        pPixCb[-iStrideX] = WELS_CLIP1 (p0 + iDeta);     /* p0' */
+        pPixCb[0]  = WELS_CLIP1 (q0 - iDeta);     /* q0' */
+      }
+
+
+      p0 = pPixCr[-iStrideX];
+      p1 = pPixCr[-2 * iStrideX];
+      q0 = pPixCr[0];
+      q1 = pPixCr[iStrideX];
+
+      bDetaP0Q0 =  WELS_ABS (p0 - q0) < iAlpha;
+      bDetaP1P0 =  WELS_ABS (p1 - p0) < iBeta;
+      bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
+
+      if (bDetaP0Q0 && bDetaP1P0 &&	bDetaQ1Q0) {
+        iDeta = WELS_CLIP3 ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -iTc0, iTc0);
+        pPixCr[-iStrideX] = WELS_CLIP1 (p0 + iDeta);     /* p0' */
+        pPixCr[0]  = WELS_CLIP1 (q0 - iDeta);     /* q0' */
+      }
+    }
+    pPixCb += iStrideY;
+    pPixCr += iStrideY;
+  }
+}
+// Plain C chroma edge filter without tc clipping (the EQ4 entry points), applied to 8 lines of
+// both chroma planes.
+//   pPixCb, pPixCr  first sample on the q side of the edge in each plane
+//   iStrideX        step between samples across the edge
+//   iStrideY        step from one line to the next along the edge
+//   iAlpha, iBeta   thresholds; a line is filtered only if |p0-q0| < iAlpha, |p1-p0| < iBeta and
+//                   |q1-q0| < iBeta
+// Only p0 and q0 are modified.
+void DeblockChromaEq4_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha,
+                         int32_t iBeta) {
+  // The former function-scope "i" (shadowed by the loop counter) and "d" were never used.
+  int32_t p0, p1, q0, q1;
+  bool_t bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
+  for (int32_t i = 0; i < 8; i++) {
+    //cb
+    p0 = pPixCb[-iStrideX];
+    p1 = pPixCb[-2 * iStrideX];
+    q0 = pPixCb[0];
+    q1 = pPixCb[iStrideX];
+    bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
+    bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
+    bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
+    if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
+      pPixCb[-iStrideX] = ((p1 << 1) + p0 + q1 + 2) >> 2;     /* p0' */
+      pPixCb[0]  = ((q1 << 1) + q0 + p1 + 2) >> 2;     /* q0' */
+    }
+
+    //cr
+    p0 = pPixCr[-iStrideX];
+    p1 = pPixCr[-2 * iStrideX];
+    q0 = pPixCr[0];
+    q1 = pPixCr[iStrideX];
+    bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
+    bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
+    bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
+    if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
+      pPixCr[-iStrideX] = ((p1 << 1) + p0 + q1 + 2) >> 2;     /* p0' */
+      pPixCr[0]  = ((q1 << 1) + q0 + p1 + 2) >> 2;     /* q0' */
+    }
+    pPixCr += iStrideY;
+    pPixCb += iStrideY;
+  }
+}
+// DeblockChromaLt4_c with iStride as the step across the edge and 1 as the step along it.
+void DeblockChromaLt4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+                          int8_t* iTc) {
+  DeblockChromaLt4_c (pPixCb, pPixCr, iStride, 1, iAlpha, iBeta, iTc);
+}
+// DeblockChromaLt4_c with 1 as the step across the edge and iStride as the step along it.
+void DeblockChromaLt4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+                          int8_t* iTc) {
+  DeblockChromaLt4_c (pPixCb, pPixCr, 1, iStride, iAlpha, iBeta, iTc);
+}
+// DeblockChromaEq4_c with iStride as the step across the edge and 1 as the step along it.
+void DeblockChromaEq4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
+  DeblockChromaEq4_c (pPixCb, pPixCr, iStride, 1, iAlpha, iBeta);
+}
+// DeblockChromaEq4_c with 1 as the step across the edge and iStride as the step along it.
+void DeblockChromaEq4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
+  DeblockChromaEq4_c (pPixCb, pPixCr, 1, iStride, iAlpha, iBeta);
+}
+
+
+// Deblocks every macroblock of the layer in raster order, using the deblocking parameters of the
+// layer's first slice header. Returns without filtering when that header's
+// uiDisableDeblockingFilterIdc is 1.
+void  DeblockingFilterFrameAvcbase (SDqLayer* pCurDq, SWelsFuncPtrList* pFunc) {
+  SSliceHeaderExt* pFirstSliceHdrExt = &pCurDq->sLayerInfo.pSliceInLayer[0].sSliceHeaderExt;
+
+  /* Step1: parameters set */
+  if (1 == pFirstSliceHdrExt->sSliceHeader.uiDisableDeblockingFilterIdc)
+    return;
+
+  const int32_t kiMbCols = pCurDq->iMbWidth;
+  const int32_t kiMbRows = pCurDq->iMbHeight;
+
+  SDeblockingFilter sFilter;
+  sFilter.uiFilterIdc = (0 != pFirstSliceHdrExt->sSliceHeader.uiDisableDeblockingFilterIdc);
+  for (int32_t iPlane = 0; iPlane < 3; ++iPlane) {
+    sFilter.iCsStride[iPlane] = pCurDq->pDecPic->iLineSize[iPlane];
+  }
+  sFilter.iSliceAlphaC0Offset = pFirstSliceHdrExt->sSliceHeader.iSliceAlphaC0Offset;
+  sFilter.iSliceBetaOffset    = pFirstSliceHdrExt->sSliceHeader.iSliceBetaOffset;
+  sFilter.iMbStride           = kiMbCols;
+
+  SMB* pMb = pCurDq->sMbDataP;
+  for (int32_t iRow = 0; iRow < kiMbRows; ++iRow) {
+    // start of this macroblock row: 16 luma lines / 8 chroma lines per row
+    sFilter.pCsData[0] = pCurDq->pDecPic->pData[0] + ((iRow * sFilter.iCsStride[0]) << 4);
+    sFilter.pCsData[1] = pCurDq->pDecPic->pData[1] + ((iRow * sFilter.iCsStride[1]) << 3);
+    sFilter.pCsData[2] = pCurDq->pDecPic->pData[2] + ((iRow * sFilter.iCsStride[2]) << 3);
+
+    for (int32_t iCol = 0; iCol < kiMbCols; ++iCol, ++pMb) {
+      DeblockingMbAvcbase (pFunc, pMb, &sFilter);
+      sFilter.pCsData[0] += MB_WIDTH_LUMA;
+      sFilter.pCsData[1] += MB_WIDTH_CHROMA;
+      sFilter.pCsData[2] += MB_WIDTH_CHROMA;
+    }
+  }
+}
+
+// Deblocks the macroblocks of one slice, using that slice's own deblocking parameters.
+//   pCurDq      layer holding the macroblock list, the slice context and the picture planes
+//   pFunc       function table passed on to DeblockingMbAvcbase
+//   kiSliceIdx  index into pCurDq->sLayerInfo.pSliceInLayer
+// Starts at the slice header's iFirstMbInSlice and follows WelsGetNextMbOfSlice until it returns
+// -1 or an index outside the picture, or until as many macroblocks as the picture contains have
+// been filtered. Returns without filtering when uiDisableDeblockingFilterIdc is 1.
+void DeblockingFilterSliceAvcbase (SDqLayer* pCurDq, SWelsFuncPtrList* pFunc, const int32_t kiSliceIdx) {
+  SSliceCtx* pSliceCtx			= pCurDq->pSliceEncCtx;
+  SMB* pMbList							= pCurDq->sMbDataP;
+  SSliceHeaderExt* sSliceHeaderExt	= &pCurDq->sLayerInfo.pSliceInLayer[kiSliceIdx].sSliceHeaderExt;
+  SMB* pCurrentMbBlock;
+
+  const int32_t kiMbWidth				= pCurDq->iMbWidth;
+  const int32_t kiMbHeight				= pCurDq->iMbHeight;
+  const int32_t kiTotalNumMb			= kiMbWidth * kiMbHeight;
+  int32_t iCurMbIdx = 0, iNextMbIdx = 0, iNumMbFiltered = 0;
+
+  /* Step1: parameters set */
+  if (sSliceHeaderExt->sSliceHeader.uiDisableDeblockingFilterIdc == 1)
+    return;
+
+  SDeblockingFilter pFilter;
+
+  pFilter.uiFilterIdc = (sSliceHeaderExt->sSliceHeader.uiDisableDeblockingFilterIdc != 0);
+  pFilter.iCsStride[0] = pCurDq->pDecPic->iLineSize[0];
+  pFilter.iCsStride[1] = pCurDq->pDecPic->iLineSize[1];
+  pFilter.iCsStride[2] = pCurDq->pDecPic->iLineSize[2];
+  pFilter.iSliceAlphaC0Offset = sSliceHeaderExt->sSliceHeader.iSliceAlphaC0Offset;
+  pFilter.iSliceBetaOffset     = sSliceHeaderExt->sSliceHeader.iSliceBetaOffset;
+  pFilter.iMbStride             = kiMbWidth;
+
+  iNextMbIdx  = sSliceHeaderExt->sSliceHeader.iFirstMbInSlice;
+
+  for (; ;) {
+    iCurMbIdx	= iNextMbIdx;
+    pCurrentMbBlock = &pMbList[ iCurMbIdx ];
+
+    // sample address of the macroblock: (iMbX + iMbY * stride) * 16 for luma, * 8 for chroma
+    pFilter.pCsData[0] = pCurDq->pDecPic->pData[0] + ((pCurrentMbBlock->iMbX + pCurrentMbBlock->iMbY * pFilter.iCsStride[0])
+                         << 4);
+    pFilter.pCsData[1] = pCurDq->pDecPic->pData[1] + ((pCurrentMbBlock->iMbX + pCurrentMbBlock->iMbY * pFilter.iCsStride[1])
+                         << 3);
+    pFilter.pCsData[2] = pCurDq->pDecPic->pData[2] + ((pCurrentMbBlock->iMbX + pCurrentMbBlock->iMbY * pFilter.iCsStride[2])
+                         << 3);
+
+    DeblockingMbAvcbase (pFunc, pCurrentMbBlock, &pFilter);
+
+    ++iNumMbFiltered;
+    iNextMbIdx = WelsGetNextMbOfSlice (pSliceCtx, iCurMbIdx);
+    //whether all of MB in current slice filtered or not
+    if (iNextMbIdx == -1 || iNextMbIdx >= kiTotalNumMb || iNumMbFiltered >= kiTotalNumMb) {
+      break;
+    }
+  }
+}
+
+// Runs the deblocking filter for the encoder's current layer, depending on the layer's
+// iLoopFilterDisableIdc:
+//   0      whole-frame filtering through DeblockingFilterFrameAvcbase
+//   2      slice-by-slice filtering through DeblockingFilterSliceAvcbase
+//   other  nothing is done
+// In dynamic slicing mode (SM_DYN_SLICE) the slice indices of partition p are visited as
+// p, p + N, p + 2N, ... with N = pEnc->iActiveThreadsNum.
+// NOTE(review): in that branch iSliceIdx advances by N but is compared against the partition's
+// coded-slice count; confirm this bound is intended when N > 1.
+void PerformDeblockingFilter (sWelsEncCtx* pEnc) {
+  const int32_t kiCurDid				= pEnc->uiDependencyId;
+  SWelsSvcCodingParam* pSvcParam	= pEnc->pSvcParam;
+  SDLayerParam* pDlp					= &pSvcParam->sDependencyLayers[kiCurDid];
+  SDqLayer* pCurLayer					= pEnc->pCurDqLayer;
+
+  if (pCurLayer->iLoopFilterDisableIdc == 0) {
+    DeblockingFilterFrameAvcbase (pCurLayer, pEnc->pFuncList);
+  } else if (pCurLayer->iLoopFilterDisableIdc == 2) {
+    int32_t iSliceCount			= 0;
+    int32_t iSliceIdx			= 0;
+
+    if (SM_DYN_SLICE != pDlp->sMso.uiSliceMode) {
+      iSliceCount	= GetCurrentSliceNum (pCurLayer->pSliceEncCtx);
+      // do/while: slice 0 is always filtered, even if the reported count is 0
+      do {
+        DeblockingFilterSliceAvcbase (pCurLayer, pEnc->pFuncList, iSliceIdx);
+        ++ iSliceIdx;
+      } while (iSliceIdx < iSliceCount);
+    } else {	// for dynamic slicing mode
+      const int32_t kiNumPicPartition	= pEnc->iActiveThreadsNum; //pSvcParam->iCountThreadsNum;
+      int32_t iPartitionIdx			= 0;
+
+      while (iPartitionIdx < kiNumPicPartition) {
+        iSliceCount	= pCurLayer->pNumSliceCodedOfPartition[iPartitionIdx];
+        iSliceIdx	= iPartitionIdx;
+        do {
+          DeblockingFilterSliceAvcbase (pCurLayer, pEnc->pFuncList, iSliceIdx);
+          iSliceIdx += kiNumPicPartition;
+        } while (iSliceIdx < iSliceCount);
+        ++ iPartitionIdx;
+      }
+    }
+  }
+}
+
+// Normalises 24 entries of pNonZeroCount in place: every entry addressed through
+// g_kuiMbCountScan4Idx becomes 1 when it was non-zero and stays 0 otherwise.
+void WelsNonZeroCount_c (int8_t* pNonZeroCount) {
+  for (int32_t iScan = 0; iScan < 24; ++iScan) {
+    int8_t* pCount = &pNonZeroCount[g_kuiMbCountScan4Idx[iScan]];
+    *pCount = (*pCount != 0) ? 1 : 0;
+  }
+}
+// Installs the non-zero-count normaliser. Only the C implementation exists here, so iCpu is not
+// consulted.
+void WelsBlockFuncInit (PSetNoneZeroCountZeroFunc* pfSetNZCZero,  int32_t iCpu) {
+  *pfSetNZCZero = WelsNonZeroCount_c;
+}
+
+
+#ifdef X86_ASM
+
+// SSE2 wrappers for the "H" luma entry points. Both copy the samples around the edge (starting
+// 4 samples before pPixY) into a 16-byte aligned 16 * 8 byte scratch buffer through
+// DeblockLumaTransposeH2V_sse2, run the matching "V" SSE2 filter on the buffer at offset 4 * 16
+// with a stride of 16, and write the result back through DeblockLumaTransposeV2H_sse2.
+// NOTE(review): the transpose routines are defined outside this excerpt; the exact block
+// geometry they move is assumed from the buffer size - confirm.
+extern "C" {
+  void DeblockLumaLt4H_sse2 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc) {
+    ENFORCE_STACK_ALIGN_1D (uint8_t,  uiBuf,   16 * 8, 16);
+
+    DeblockLumaTransposeH2V_sse2 (pPixY - 4, iStride, &uiBuf[0]);
+    DeblockLumaLt4V_sse2 (&uiBuf[4 * 16], 16, iAlpha, iBeta, pTc);
+    DeblockLumaTransposeV2H_sse2 (pPixY - 4, iStride, &uiBuf[0]);
+  }
+
+  void DeblockLumaEq4H_sse2 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
+    ENFORCE_STACK_ALIGN_1D (uint8_t,  uiBuf,   16 * 8, 16);
+
+    DeblockLumaTransposeH2V_sse2 (pPixY - 4, iStride, &uiBuf[0]);
+    DeblockLumaEq4V_sse2 (&uiBuf[4 * 16], 16, iAlpha, iBeta);
+    DeblockLumaTransposeV2H_sse2 (pPixY - 4, iStride, &uiBuf[0]);
+  }
+
+}
+
+#endif
+
+
+// Fills the deblocking function table. The plain C implementations are installed first; when the
+// build defines X86_ASM and iCpu has the WELS_CPU_SSE2 bit set, all eight entries are replaced by
+// their SSE2 counterparts.
+void  DeblockingInit (DeblockingFunc*   pFunc,  int32_t iCpu) {
+  pFunc->pfLumaDeblockingLT4Ver		= DeblockLumaLt4V_c;
+  pFunc->pfLumaDeblockingEQ4Ver		= DeblockLumaEq4V_c;
+  pFunc->pfLumaDeblockingLT4Hor		= DeblockLumaLt4H_c;
+  pFunc->pfLumaDeblockingEQ4Hor		= DeblockLumaEq4H_c;
+
+  pFunc->pfChromaDeblockingLT4Ver	= DeblockChromaLt4V_c;
+  pFunc->pfChromaDeblockingEQ4Ver	= DeblockChromaEq4V_c;
+  pFunc->pfChromaDeblockingLT4Hor	= DeblockChromaLt4H_c;
+  pFunc->pfChromaDeblockinEQ4Hor	= DeblockChromaEq4H_c;
+
+
+#ifdef X86_ASM
+  if (iCpu & WELS_CPU_SSE2) {
+    pFunc->pfLumaDeblockingLT4Ver	= DeblockLumaLt4V_sse2;
+    pFunc->pfLumaDeblockingEQ4Ver	= DeblockLumaEq4V_sse2;
+    pFunc->pfLumaDeblockingLT4Hor   = DeblockLumaLt4H_sse2;
+    pFunc->pfLumaDeblockingEQ4Hor   = DeblockLumaEq4H_sse2;
+    pFunc->pfChromaDeblockingLT4Ver	= DeblockChromaLt4V_sse2;
+    pFunc->pfChromaDeblockingEQ4Ver	= DeblockChromaEq4V_sse2;
+    pFunc->pfChromaDeblockingLT4Hor	= DeblockChromaLt4H_sse2;
+    pFunc->pfChromaDeblockinEQ4Hor	= DeblockChromaEq4H_sse2;
+  }
+#endif
+}
+
+
+} // namespace WelsSVCEnc
+
--- a/codec/encoder/core/src/decode_mb_aux.cpp
+++ b/codec/encoder/core/src/decode_mb_aux.cpp
@@ -1,301 +1,276 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <string.h>
-#include "decode_mb_aux.h"
-#include "wels_common_basis.h"
-#include "cpu_core.h"
-
-namespace WelsSVCEnc {
-/****************************************************************************
- * Dequant and Ihdm functions
- ****************************************************************************/
-void WelsIHadamard4x4Dc(int16_t *pRes) //pBuffer size : 4x4
-{
-	int16_t iTemp[4];
-	int32_t i	= 4;
-
-	while( --i >= 0 )
-	{
-		const int32_t kiIdx	= i<<2;
-		const int32_t kiIdx1	= 1 + kiIdx;
-		const int32_t kiIdx2	= 1 + kiIdx1;
-		const int32_t kiIdx3	= 1 + kiIdx2;
-
-		iTemp[0] = pRes[kiIdx ] + pRes[kiIdx2];
-		iTemp[1] = pRes[kiIdx ] - pRes[kiIdx2];
-		iTemp[2] = pRes[kiIdx1] - pRes[kiIdx3];
-		iTemp[3] = pRes[kiIdx1] + pRes[kiIdx3];
-
-		pRes[kiIdx ] = iTemp[0] + iTemp[3];
-		pRes[kiIdx1] = iTemp[1] + iTemp[2];
-		pRes[kiIdx2] = iTemp[1] - iTemp[2];
-		pRes[kiIdx3] = iTemp[0] - iTemp[3];		
-	}
-
-	i = 4;
-	while( --i >= 0 )
-	{
-		const int32_t kiI4	= 4 + i;
-		const int32_t kiI8	= 4 + kiI4;
-		const int32_t kiI12	= 4 + kiI8;
-
-		iTemp[0] = pRes[i  ] + pRes[kiI8 ];
-		iTemp[1] = pRes[i  ] - pRes[kiI8 ];
-		iTemp[2] = pRes[kiI4 ] - pRes[kiI12];
-		iTemp[3] = pRes[kiI4 ] + pRes[kiI12];
-
-		pRes[i  ] = iTemp[0] + iTemp[3];
-		pRes[kiI4 ] = iTemp[1] + iTemp[2];
-		pRes[kiI8 ] = iTemp[1] - iTemp[2];
-		pRes[kiI12] = iTemp[0] - iTemp[3];
-	}
-}
-
-/* for qp < 12 */
-void WelsDequantLumaDc4x4(int16_t *pRes, const int32_t kiQp)
-{
-	int32_t i	= 15;
-	const uint16_t kuiDequantValue	= g_kuiDequantCoeff[kiQp%6][0];
-	const int16_t kiQF0		= kiQp / 6; 
-	const int16_t kiQF1		= 2 - kiQF0;
-	const int16_t kiQF0S	= 1 << (1 - kiQF0);
-	
-	while ( i >= 0 )
-	{
-		pRes[i  ] = ( pRes[i  ] * kuiDequantValue + kiQF0S ) >> kiQF1; 
-		pRes[i-1] = ( pRes[i-1] * kuiDequantValue + kiQF0S ) >> kiQF1; 
-		pRes[i-2] = ( pRes[i-2] * kuiDequantValue + kiQF0S ) >> kiQF1; 
-		pRes[i-3] = ( pRes[i-3] * kuiDequantValue + kiQF0S ) >> kiQF1; 
-
-		i -= 4;
-	}
-}
-
-/* for qp >= 12 */
-void WelsDequantIHadamard4x4_c(int16_t *pRes, const uint16_t kuiMF)
-{
-	int16_t iTemp[4];
-	int32_t i;
-
-	for(i = 0; i < 16; i += 4)
-	{
-		iTemp[0] = pRes[i  ] + pRes[i+2];
-		iTemp[1] = pRes[i  ] - pRes[i+2];
-		iTemp[2] = pRes[i+1] - pRes[i+3];
-		iTemp[3] = pRes[i+1] + pRes[i+3];
-
-		pRes[i  ] = iTemp[0] + iTemp[3];
-		pRes[i+1] = iTemp[1] + iTemp[2];
-		pRes[i+2] = iTemp[1] - iTemp[2];
-		pRes[i+3] = iTemp[0] - iTemp[3];		
-	}
-
-	for(i = 0; i < 4; i++)
-	{
-		iTemp[0] = pRes[i   ] + pRes[i+8 ];
-		iTemp[1] = pRes[i   ] - pRes[i+8 ];
-		iTemp[2] = pRes[i+4 ] - pRes[i+12];
-		iTemp[3] = pRes[i+4 ] + pRes[i+12];
-
-		pRes[i  ]  = (iTemp[0] + iTemp[3]) * kuiMF;
-		pRes[i+4 ] = (iTemp[1] + iTemp[2]) * kuiMF;
-		pRes[i+8 ] = (iTemp[1] - iTemp[2]) * kuiMF;
-		pRes[i+12] = (iTemp[0] - iTemp[3]) * kuiMF;
-	}	
-}
-
-void WelsDequantIHadamard2x2Dc( int16_t* pDct, const uint16_t kuiMF)
-{
-	const int16_t kiSumU = pDct[0] + pDct[2];
-	const int16_t kiDelU =   pDct[0] -  pDct[2];
-	const int16_t kiSumD = pDct[1] + pDct[3];
-	const int16_t kiDelD =   pDct[1] -  pDct[3];
-	
-	pDct[0] = (kiSumU + kiSumD) * kuiMF;
-    pDct[1] = (kiSumU  -  kiSumD) * kuiMF;
-    pDct[2] = (kiDelU   + kiDelD)   * kuiMF;
-    pDct[3] = (kiDelU   - kiDelD)   * kuiMF;
-}
-
-void WelsDequant4x4_c(int16_t *pRes, const uint16_t* kpMF)
-{
-	int32_t i;
-	for(i = 0; i < 8; i++)
-	{
-		pRes[i]	*=	kpMF[i];
-		pRes[i+8]	*=kpMF[i];
-	}
-}
-
-void WelsDequantFour4x4_c(int16_t *pRes, const uint16_t* kpMF)
-{
-	int32_t i;
-	for(i = 0; i < 8; i++)
-	{
-		pRes[i]	*=	kpMF[i];
-		pRes[i+8]	*=	kpMF[i];
-		pRes[i+16]*=	kpMF[i];
-		pRes[i+24]*=	kpMF[i];
-		pRes[i+32]*=	kpMF[i];
-		pRes[i+40]*=	kpMF[i];
-		pRes[i+48]*=	kpMF[i];
-		pRes[i+56]*=	kpMF[i];
-	}	
-}
-
-/****************************************************************************
- * IDCT functions, final output = prediction(CS) + IDCT(scaled_coeff)
- ****************************************************************************/
-void WelsIDctT4Rec_c( uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct )
-{
-	int32_t i;
-	int16_t iTemp[16];	
-
-	int32_t iDstStridex2 = iStride << 1;
-	int32_t iDstStridex3 = iStride + iDstStridex2;
-	int32_t iPredStridex2 = iPredStride << 1;
-	int32_t iPredStridex3 = iPredStride + iPredStridex2;
-
-	for (i = 0; i < 4; i ++) //horizon
-	{		
-		int32_t iIdx = i << 2;
-		const int32_t kiHorSumU = pDct[iIdx] + pDct[iIdx+2];	// add 0-2
-		const int32_t kiHorDelU = pDct[iIdx] - pDct[iIdx+2];	// sub 0-2
-		const int32_t kiHorSumD = pDct[iIdx+1] + (pDct[iIdx+3] >> 1);
-		const int32_t kiHorDelD = (pDct[iIdx+1] >> 1) - pDct[iIdx+3];
-
-		iTemp[iIdx  ]   = kiHorSumU  + kiHorSumD;		
-		iTemp[iIdx+1] = kiHorDelU   + kiHorDelD;
-		iTemp[iIdx+2] = kiHorDelU   -  kiHorDelD;
-		iTemp[iIdx+3] = kiHorSumU  -  kiHorSumD;
-	}
-
-	for (i = 0; i < 4; i ++) //vertical
-	{
-		const int32_t kiVerSumL = iTemp[i]                 + iTemp[8+i];
-		const int32_t kiVerDelL   = iTemp[i]                 - iTemp[8+i];
-		const int32_t kiVerDelR   = (iTemp[4+i] >> 1) - iTemp[12+i];
-		const int32_t kiVerSumR = iTemp[4+i]             + (iTemp[12+i] >> 1);
-
-		pRec[i				]         = WELS_CLIP1( pPred[i              ]         + ((kiVerSumL + kiVerSumR + 32) >> 6) );
-		pRec[iStride+i		]     = WELS_CLIP1( pPred[iPredStride+i  ]  + ((kiVerDelL + kiVerDelR + 32) >> 6) );
-		pRec[iDstStridex2 + i] = WELS_CLIP1( pPred[iPredStridex2+i] + ((kiVerDelL - kiVerDelR + 32) >> 6) );
-		pRec[iDstStridex3 + i] = WELS_CLIP1( pPred[iPredStridex3+i] + ((kiVerSumL - kiVerSumR + 32) >> 6) );
-	}	
-}
-
-void WelsIDctFourT4Rec_c( uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct )
-{
-	int32_t iDstStridex4  = iStride << 2;
-	int32_t iPredStridex4 = iPredStride << 2;
-	WelsIDctT4Rec_c( pRec,                  iStride, pPred,						iPredStride, pDct	);
-	WelsIDctT4Rec_c( &pRec[4],              iStride, &pPred[4],					iPredStride, pDct+16 );
-	WelsIDctT4Rec_c( &pRec[iDstStridex4  ], iStride, &pPred[iPredStridex4  ],	iPredStride, pDct+32 );	
-	WelsIDctT4Rec_c( &pRec[iDstStridex4+4], iStride, &pPred[iPredStridex4+4],	iPredStride, pDct+48 );
-
-}
-
-void WelsIDctT4RecOnMb(uint8_t* pDst, int32_t iDstStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct, PIDctFunc pfIDctFourT4)
-{
-	int32_t iDstStridex8  = iDstStride << 3;
-	int32_t iPredStridex8 = iPredStride << 3;
-	
-	pfIDctFourT4(&pDst[0], iDstStride, &pPred[0], iPredStride, pDct);
-	pfIDctFourT4(&pDst[8], iDstStride, &pPred[8], iPredStride, pDct+64);
-	pfIDctFourT4(&pDst[iDstStridex8], iDstStride, &pPred[iPredStridex8], iPredStride, pDct+128);
-	pfIDctFourT4(&pDst[iDstStridex8+8], iDstStride, &pPred[iPredStridex8+8], iPredStride, pDct+192);
-}
-
-/* 
- * pfIDctI16x16Dc: do luma idct of an MB for I16x16 mode, when only dc value are non-zero
- */
-void WelsIDctRecI16x16Dc_c(uint8_t *pRec, int32_t iStride, uint8_t *pPred, int32_t iPredStride, int16_t *pDctDc)
-{
-	int32_t i, j;
-
-	for (i = 0; i < 16; i ++) 
-	{
-		for(j = 0; j < 16; j++)
-		{
-			pRec[j] = WELS_CLIP1( pPred[j] + ((pDctDc[(i&0x0C) + (j>>2)] + 32) >> 6) );
-		}
-		pRec += iStride;
-		pPred += iPredStride;
-	}
-}
-
-void WelsGetEncBlockStrideOffset(int32_t *pBlock, const int32_t kiStrideY, const int32_t kiStrideUV)
-{
-	int32_t i, j, k, r;	
-	for(j = 0; j < 4; j++)
-	{
-		i = j << 2;
-		k = (j&0x01) << 1;
-		r = j&0x02;
-		pBlock[i]		= (0 + k + (0 + r) * kiStrideY) << 2;
-		pBlock[i+1]	= (1 + k + (0 + r) * kiStrideY) << 2;
-		pBlock[i+2]	= (0 + k + (1 + r) * kiStrideY) << 2;
-		pBlock[i+3]	= (1 + k + (1 + r) * kiStrideY) << 2;
-
-		pBlock[16+j]	=
-		pBlock[20+j]	= ((j&0x01) + r * kiStrideUV) << 2;
-	}
-}
-
-void WelsInitReconstructionFuncs( SWelsFuncPtrList *pFuncList, uint32_t  uiCpuFlag )
-{
-	pFuncList->pfDequantization4x4			= WelsDequant4x4_c;
-	pFuncList->pfDequantizationFour4x4		= WelsDequantFour4x4_c;
-	pFuncList->pfDequantizationIHadamard4x4	= WelsDequantIHadamard4x4_c;
-
-	pFuncList->pfIDctT4		= WelsIDctT4Rec_c;
-	pFuncList->pfIDctFourT4		= WelsIDctFourT4Rec_c;
-	pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_c;
-
-#if defined(X86_ASM)
-	if ( uiCpuFlag & WELS_CPU_MMXEXT )
-	{
-    	pFuncList->pfIDctT4		= WelsIDctT4Rec_mmx;
-	}
-	if ( uiCpuFlag & WELS_CPU_SSE2 )
-	{
-		pFuncList->pfDequantization4x4			= WelsDequant4x4_sse2;
-		pFuncList->pfDequantizationFour4x4		= WelsDequantFour4x4_sse2;
-		pFuncList->pfDequantizationIHadamard4x4	= WelsDequantIHadamard4x4_sse2;
-
-		pFuncList->pfIDctFourT4		= WelsIDctFourT4Rec_sse2;
-		pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_sse2;
-	}
-#endif//X86_ASM
-}
-}
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <string.h>
+#include "decode_mb_aux.h"
+#include "wels_common_basis.h"
+#include "cpu_core.h"
+
+namespace WelsSVCEnc {
+/****************************************************************************
+ * Dequant and Ihdm functions
+ ****************************************************************************/
+void WelsIHadamard4x4Dc (int16_t* pRes) { // In-place 4x4 Hadamard butterflies on the 16 values of pRes (pBuffer size : 4x4), no scaling
+  int16_t iTemp[4]; // butterfly intermediates; sums are narrowed back to int16_t
+  int32_t i	= 4;
+  // Pass 1: 4-point butterfly over each contiguous group of four (indices 4*i .. 4*i+3), i running from 3 down to 0.
+  while (--i >= 0) {
+    const int32_t kiIdx	= i << 2;
+    const int32_t kiIdx1	= 1 + kiIdx;
+    const int32_t kiIdx2	= 1 + kiIdx1;
+    const int32_t kiIdx3	= 1 + kiIdx2;
+
+    iTemp[0] = pRes[kiIdx ] + pRes[kiIdx2];
+    iTemp[1] = pRes[kiIdx ] - pRes[kiIdx2];
+    iTemp[2] = pRes[kiIdx1] - pRes[kiIdx3];
+    iTemp[3] = pRes[kiIdx1] + pRes[kiIdx3];
+
+    pRes[kiIdx ] = iTemp[0] + iTemp[3];
+    pRes[kiIdx1] = iTemp[1] + iTemp[2];
+    pRes[kiIdx2] = iTemp[1] - iTemp[2];
+    pRes[kiIdx3] = iTemp[0] - iTemp[3];
+  }
+  // Pass 2: the same butterfly over elements four apart (i, i+4, i+8, i+12), using the results of pass 1.
+  i = 4;
+  while (--i >= 0) {
+    const int32_t kiI4	= 4 + i;
+    const int32_t kiI8	= 4 + kiI4;
+    const int32_t kiI12	= 4 + kiI8;
+
+    iTemp[0] = pRes[i  ] + pRes[kiI8 ];
+    iTemp[1] = pRes[i  ] - pRes[kiI8 ];
+    iTemp[2] = pRes[kiI4 ] - pRes[kiI12];
+    iTemp[3] = pRes[kiI4 ] + pRes[kiI12];
+
+    pRes[i  ] = iTemp[0] + iTemp[3];
+    pRes[kiI4 ] = iTemp[1] + iTemp[2];
+    pRes[kiI8 ] = iTemp[1] - iTemp[2];
+    pRes[kiI12] = iTemp[0] - iTemp[3];
+  }
+}
+
+/* Dequantizes the 16 values of pRes in place; for qp < 12 (kiQp / 6 is then 0 or 1, so both shift counts below stay non-negative) */
+void WelsDequantLumaDc4x4 (int16_t* pRes, const int32_t kiQp) {
+  int32_t i	= 15;
+  const uint16_t kuiDequantValue	= g_kuiDequantCoeff[kiQp % 6][0]; // scale read from entry 0 of row kiQp % 6
+  const int16_t kiQF0		= kiQp / 6;
+  const int16_t kiQF1		= 2 - kiQF0; // right-shift amount applied after scaling
+  const int16_t kiQF0S	= 1 << (1 - kiQF0); // equals 1 << (kiQF1 - 1): rounding offset for that shift
+  // Each value becomes (value * kuiDequantValue + kiQF0S) >> kiQF1; four values per iteration, walking from index 15 down to 0.
+  while (i >= 0) {
+    pRes[i  ] = (pRes[i  ] * kuiDequantValue + kiQF0S) >> kiQF1;
+    pRes[i - 1] = (pRes[i - 1] * kuiDequantValue + kiQF0S) >> kiQF1;
+    pRes[i - 2] = (pRes[i - 2] * kuiDequantValue + kiQF0S) >> kiQF1;
+    pRes[i - 3] = (pRes[i - 3] * kuiDequantValue + kiQF0S) >> kiQF1;
+
+    i -= 4;
+  }
+}
+
+/* In-place 4x4 Hadamard butterflies on pRes with every final value multiplied by kuiMF; for qp >= 12 */
+void WelsDequantIHadamard4x4_c (int16_t* pRes, const uint16_t kuiMF) {
+  int16_t iTemp[4]; // butterfly intermediates; sums are narrowed back to int16_t
+  int32_t i;
+  // Pass 1: 4-point butterfly over each contiguous group of four values (i = 0, 4, 8, 12), no scaling yet.
+  for (i = 0; i < 16; i += 4) {
+    iTemp[0] = pRes[i  ] + pRes[i + 2];
+    iTemp[1] = pRes[i  ] - pRes[i + 2];
+    iTemp[2] = pRes[i + 1] - pRes[i + 3];
+    iTemp[3] = pRes[i + 1] + pRes[i + 3];
+
+    pRes[i  ] = iTemp[0] + iTemp[3];
+    pRes[i + 1] = iTemp[1] + iTemp[2];
+    pRes[i + 2] = iTemp[1] - iTemp[2];
+    pRes[i + 3] = iTemp[0] - iTemp[3];
+  }
+  // Pass 2: the same butterfly over values four apart; outputs are multiplied by kuiMF as they are written back.
+  for (i = 0; i < 4; i++) {
+    iTemp[0] = pRes[i   ] + pRes[i + 8 ];
+    iTemp[1] = pRes[i   ] - pRes[i + 8 ];
+    iTemp[2] = pRes[i + 4 ] - pRes[i + 12];
+    iTemp[3] = pRes[i + 4 ] + pRes[i + 12];
+
+    pRes[i  ]  = (iTemp[0] + iTemp[3]) * kuiMF;
+    pRes[i + 4 ] = (iTemp[1] + iTemp[2]) * kuiMF;
+    pRes[i + 8 ] = (iTemp[1] - iTemp[2]) * kuiMF;
+    pRes[i + 12] = (iTemp[0] - iTemp[3]) * kuiMF;
+  }
+}
+
+void WelsDequantIHadamard2x2Dc (int16_t* pDct, const uint16_t kuiMF) { // 2x2 butterfly on pDct[0..3] in place, each output times kuiMF
+  const int16_t kiSumU = pDct[0] + pDct[2];
+  const int16_t kiDelU =   pDct[0] -  pDct[2];
+  const int16_t kiSumD = pDct[1] + pDct[3];
+  const int16_t kiDelD =   pDct[1] -  pDct[3];
+  // With p = original pDct: [0] = p0+p1+p2+p3, [1] = (p0+p2)-(p1+p3), [2] = (p0-p2)+(p1-p3), [3] = (p0-p2)-(p1-p3), all times kuiMF.
+  pDct[0] = (kiSumU + kiSumD) * kuiMF;
+  pDct[1] = (kiSumU  -  kiSumD) * kuiMF;
+  pDct[2] = (kiDelU   + kiDelD)   * kuiMF;
+  pDct[3] = (kiDelU   - kiDelD)   * kuiMF;
+}
+
+void WelsDequant4x4_c (int16_t* pRes, const uint16_t* kpMF) { // Scales pRes[0..15] in place by the 8 factors kpMF[0..7]
+  int32_t i;
+  for (i = 0; i < 8; i++) {
+    pRes[i]	*=	kpMF[i];
+    pRes[i + 8]	*= kpMF[i]; // second half of the block reuses the same eight factors
+  }
+}
+
+void WelsDequantFour4x4_c (int16_t* pRes, const uint16_t* kpMF) { // Scales pRes[0..63] in place by the 8 factors kpMF[0..7]
+  int32_t i;
+  for (i = 0; i < 8; i++) { // factor i is applied to every eighth coefficient: i, i + 8, ..., i + 56
+    pRes[i]	*=	kpMF[i];
+    pRes[i + 8]	*=	kpMF[i];
+    pRes[i + 16] *=	kpMF[i];
+    pRes[i + 24] *=	kpMF[i];
+    pRes[i + 32] *=	kpMF[i];
+    pRes[i + 40] *=	kpMF[i];
+    pRes[i + 48] *=	kpMF[i];
+    pRes[i + 56] *=	kpMF[i];
+  }
+}
+
+/****************************************************************************
+ * IDCT functions, final output = prediction(CS) + IDCT(scaled_coeff)
+ ****************************************************************************/
+void WelsIDctT4Rec_c (uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct) {
+  int32_t i;
+  int16_t iTemp[16]; // output of the first (horizontal) pass, narrowed to int16_t
+  // 2x and 3x strides, used below to address the third and fourth rows of pRec and pPred.
+  int32_t iDstStridex2 = iStride << 1;
+  int32_t iDstStridex3 = iStride + iDstStridex2;
+  int32_t iPredStridex2 = iPredStride << 1;
+  int32_t iPredStridex3 = iPredStride + iPredStridex2;
+  // Pass 1: 1-D inverse transform of each contiguous group of four coefficients of pDct, written to iTemp.
+  for (i = 0; i < 4; i ++) { //horizon
+    int32_t iIdx = i << 2;
+    const int32_t kiHorSumU = pDct[iIdx] + pDct[iIdx + 2];	// add 0-2
+    const int32_t kiHorDelU = pDct[iIdx] - pDct[iIdx + 2];	// sub 0-2
+    const int32_t kiHorSumD = pDct[iIdx + 1] + (pDct[iIdx + 3] >> 1);
+    const int32_t kiHorDelD = (pDct[iIdx + 1] >> 1) - pDct[iIdx + 3];
+
+    iTemp[iIdx  ]   = kiHorSumU  + kiHorSumD;
+    iTemp[iIdx + 1] = kiHorDelU   + kiHorDelD;
+    iTemp[iIdx + 2] = kiHorDelU   -  kiHorDelD;
+    iTemp[iIdx + 3] = kiHorSumU  -  kiHorSumD;
+  }
+  // Pass 2: same transform down each column of iTemp; results get +32, >> 6, are added to pPred and go through WELS_CLIP1 into pRec.
+  for (i = 0; i < 4; i ++) { //vertical
+    const int32_t kiVerSumL = iTemp[i]                 + iTemp[8 + i];
+    const int32_t kiVerDelL   = iTemp[i]                 - iTemp[8 + i];
+    const int32_t kiVerDelR   = (iTemp[4 + i] >> 1) - iTemp[12 + i];
+    const int32_t kiVerSumR = iTemp[4 + i]             + (iTemp[12 + i] >> 1);
+
+    pRec[i				]         = WELS_CLIP1 (pPred[i              ]         + ((kiVerSumL + kiVerSumR + 32) >> 6));
+    pRec[iStride + i		]     = WELS_CLIP1 (pPred[iPredStride + i  ]  + ((kiVerDelL + kiVerDelR + 32) >> 6));
+    pRec[iDstStridex2 + i] = WELS_CLIP1 (pPred[iPredStridex2 + i] + ((kiVerDelL - kiVerDelR + 32) >> 6));
+    pRec[iDstStridex3 + i] = WELS_CLIP1 (pPred[iPredStridex3 + i] + ((kiVerSumL - kiVerSumR + 32) >> 6));
+  }
+}
+
+void WelsIDctFourT4Rec_c (uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct) {
+  int32_t iDstStridex4  = iStride << 2; // offset of the row four lines down in pRec
+  int32_t iPredStridex4 = iPredStride << 2; // offset of the row four lines down in pPred
+  WelsIDctT4Rec_c (pRec,                  iStride, pPred,						iPredStride, pDct);
+  WelsIDctT4Rec_c (&pRec[4],              iStride, &pPred[4],					iPredStride, pDct + 16);
+  WelsIDctT4Rec_c (&pRec[iDstStridex4  ], iStride, &pPred[iPredStridex4  ],	iPredStride, pDct + 32);
+  WelsIDctT4Rec_c (&pRec[iDstStridex4 + 4], iStride, &pPred[iPredStridex4 + 4],	iPredStride, pDct + 48);
+  // The four calls above cover an 8x8 area as a 2x2 arrangement of 4x4 blocks, consuming 16 coefficients of pDct each.
+}
+
+void WelsIDctT4RecOnMb (uint8_t* pDst, int32_t iDstStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct,
+                        PIDctFunc pfIDctFourT4) {
+  int32_t iDstStridex8  = iDstStride << 3; // offset of the row eight lines down in pDst
+  int32_t iPredStridex8 = iPredStride << 3; // offset of the row eight lines down in pPred
+  // Calls pfIDctFourT4 at offsets 0, +8 columns, +8 rows and +8 rows +8 columns, advancing pDct by 64 coefficients each time.
+  pfIDctFourT4 (&pDst[0], iDstStride, &pPred[0], iPredStride, pDct);
+  pfIDctFourT4 (&pDst[8], iDstStride, &pPred[8], iPredStride, pDct + 64);
+  pfIDctFourT4 (&pDst[iDstStridex8], iDstStride, &pPred[iPredStridex8], iPredStride, pDct + 128);
+  pfIDctFourT4 (&pDst[iDstStridex8 + 8], iDstStride, &pPred[iPredStridex8 + 8], iPredStride, pDct + 192);
+}
+
+/*
+ * pfIDctI16x16Dc: luma IDCT and reconstruction of a 16x16 MB in I16x16 mode, for the case where only the DC values are non-zero
+ */
+void WelsIDctRecI16x16Dc_c (uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDctDc) {
+  int32_t i, j;
+  // (i & 0x0C) + (j >> 2) equals 4 * (i / 4) + (j / 4): one of 16 DC values is shared by all pixels of each 4x4 block.
+  for (i = 0; i < 16; i ++) {
+    for (j = 0; j < 16; j++) {
+      pRec[j] = WELS_CLIP1 (pPred[j] + ((pDctDc[ (i & 0x0C) + (j >> 2)] + 32) >> 6));
+    }
+    pRec += iStride;
+    pPred += iPredStride;
+  }
+}
+
+void WelsGetEncBlockStrideOffset (int32_t* pBlock, const int32_t kiStrideY, const int32_t kiStrideUV) { // Fills pBlock[0..23] with offsets
+  int32_t i, j, k, r;
+  for (j = 0; j < 4; j++) {
+    i = j << 2; // first of the four pBlock entries written for this j
+    k = (j & 0x01) << 1; // 0 or 2: horizontal term of the group
+    r = j & 0x02; // 0 or 2: vertical term of the group (multiplied by the stride)
+    pBlock[i]		= (0 + k + (0 + r) * kiStrideY) << 2;
+    pBlock[i + 1]	= (1 + k + (0 + r) * kiStrideY) << 2;
+    pBlock[i + 2]	= (0 + k + (1 + r) * kiStrideY) << 2;
+    pBlock[i + 3]	= (1 + k + (1 + r) * kiStrideY) << 2;
+    // Entries 16..19 and 20..23 get identical offsets computed with kiStrideUV (presumably one set per chroma plane - confirm).
+    pBlock[16 + j]	=
+      pBlock[20 + j]	= ((j & 0x01) + r * kiStrideUV) << 2;
+  }
+}
+
+void WelsInitReconstructionFuncs (SWelsFuncPtrList* pFuncList, uint32_t  uiCpuFlag) { // Installs dequant / IDCT function pointers
+  pFuncList->pfDequantization4x4			= WelsDequant4x4_c;
+  pFuncList->pfDequantizationFour4x4		= WelsDequantFour4x4_c;
+  pFuncList->pfDequantizationIHadamard4x4	= WelsDequantIHadamard4x4_c;
+
+  pFuncList->pfIDctT4		= WelsIDctT4Rec_c;
+  pFuncList->pfIDctFourT4		= WelsIDctFourT4Rec_c;
+  pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_c;
+  // The C defaults above are overridden only under X86_ASM: MMXEXT replaces pfIDctT4, SSE2 replaces the other five entries.
+#if defined(X86_ASM)
+  if (uiCpuFlag & WELS_CPU_MMXEXT) {
+    pFuncList->pfIDctT4		= WelsIDctT4Rec_mmx;
+  }
+  if (uiCpuFlag & WELS_CPU_SSE2) {
+    pFuncList->pfDequantization4x4			= WelsDequant4x4_sse2;
+    pFuncList->pfDequantizationFour4x4		= WelsDequantFour4x4_sse2;
+    pFuncList->pfDequantizationIHadamard4x4	= WelsDequantIHadamard4x4_sse2;
+
+    pFuncList->pfIDctFourT4		= WelsIDctFourT4Rec_sse2;
+    pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_sse2;
+  }
+#endif//X86_ASM
+}
+}
--- a/codec/encoder/core/src/encode_mb_aux.cpp
+++ b/codec/encoder/core/src/encode_mb_aux.cpp
@@ -1,608 +1,579 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <string.h>
-
-#include "macros.h"
-#include "ls_defines.h"
-#include "encode_mb_aux.h"
-#include "cpu_core.h"
-#include "as264_common.h"
-#include "svc_encode_mb.h"
-namespace WelsSVCEnc {
-
-__align16( int16_t, g_kiQuantInterFF[58][8] )=
-{
-/* 0*/ {   0,   1,   0,   1,   1,   1,   1,   1 }, 
-/* 1*/ {   0,   1,   0,   1,   1,   1,   1,   1 }, 
-/* 2*/ {   1,   1,   1,   1,   1,   1,   1,   1 }, 
-/* 3*/ {   1,   1,   1,   1,   1,   1,   1,   1 }, 
-/* 4*/ {   1,   1,   1,   1,   1,   2,   1,   2 }, 
-/* 5*/ {   1,   1,   1,   1,   1,   2,   1,   2 }, 
-/* 6*/ {   1,   1,   1,   1,   1,   2,   1,   2 }, 
-/* 7*/ {   1,   1,   1,   1,   1,   2,   1,   2 }, 
-/* 8*/ {   1,   2,   1,   2,   2,   3,   2,   3 }, 
-/* 9*/ {   1,   2,   1,   2,   2,   3,   2,   3 }, 
-/*10*/ {   1,   2,   1,   2,   2,   3,   2,   3 }, 
-/*11*/ {   1,   2,   1,   2,   2,   4,   2,   4 }, 
-/*12*/ {   2,   3,   2,   3,   3,   4,   3,   4 }, 
-/*13*/ {   2,   3,   2,   3,   3,   5,   3,   5 }, 
-/*14*/ {   2,   3,   2,   3,   3,   5,   3,   5 }, 
-/*15*/ {   2,   4,   2,   4,   4,   6,   4,   6 }, 
-/*16*/ {   3,   4,   3,   4,   4,   7,   4,   7 }, 
-/*17*/ {   3,   5,   3,   5,   5,   8,   5,   8 }, 
-/*18*/ {   3,   5,   3,   5,   5,   8,   5,   8 }, 
-/*19*/ {   4,   6,   4,   6,   6,   9,   6,   9 }, 
-/*20*/ {   4,   7,   4,   7,   7,  10,   7,  10 }, 
-/*21*/ {   5,   8,   5,   8,   8,  12,   8,  12 }, 
-/*22*/ {   5,   8,   5,   8,   8,  13,   8,  13 }, 
-/*23*/ {   6,  10,   6,  10,  10,  15,  10,  15 }, 
-/*24*/ {   7,  11,   7,  11,  11,  17,  11,  17 }, 
-/*25*/ {   7,  12,   7,  12,  12,  19,  12,  19 }, 
-/*26*/ {   9,  13,   9,  13,  13,  21,  13,  21 }, 
-/*27*/ {   9,  15,   9,  15,  15,  24,  15,  24 }, 
-/*28*/ {  11,  17,  11,  17,  17,  26,  17,  26 }, 
-/*29*/ {  12,  19,  12,  19,  19,  30,  19,  30 }, 
-/*30*/ {  13,  22,  13,  22,  22,  33,  22,  33 }, 
-/*31*/ {  15,  23,  15,  23,  23,  38,  23,  38 }, 
-/*32*/ {  17,  27,  17,  27,  27,  42,  27,  42 }, 
-/*33*/ {  19,  30,  19,  30,  30,  48,  30,  48 }, 
-/*34*/ {  21,  33,  21,  33,  33,  52,  33,  52 }, 
-/*35*/ {  24,  38,  24,  38,  38,  60,  38,  60 }, 
-/*36*/ {  27,  43,  27,  43,  43,  67,  43,  67 }, 
-/*37*/ {  29,  47,  29,  47,  47,  75,  47,  75 }, 
-/*38*/ {  35,  53,  35,  53,  53,  83,  53,  83 }, 
-/*39*/ {  37,  60,  37,  60,  60,  96,  60,  96 }, 
-/*40*/ {  43,  67,  43,  67,  67, 104,  67, 104 },
-/*41*/ {  48,  77,  48,  77,  77, 121,  77, 121 },
-/*42*/ {  53,  87,  53,  87,  87, 133,  87, 133 },
-/*43*/ {  59,  93,  59,  93,  93, 150,  93, 150 },
-/*44*/ {  69, 107,  69, 107, 107, 167, 107, 167 },
-/*45*/ {  75, 120,  75, 120, 120, 192, 120, 192 },
-/*46*/ {  85, 133,  85, 133, 133, 208, 133, 208 },
-/*47*/ {  96, 153,  96, 153, 153, 242, 153, 242 },
-/*48*/ { 107, 173, 107, 173, 173, 267, 173, 267 },
-/*49*/ { 117, 187, 117, 187, 187, 300, 187, 300 },
-/*50*/ { 139, 213, 139, 213, 213, 333, 213, 333 },
-/*51*/ { 149, 240, 149, 240, 240, 383, 240, 383 },
-/* from here below is only for intra */  
-/*46*/ { 171, 267, 171, 267, 267, 417, 267, 417 },
-/*47*/ { 192, 307, 192, 307, 307, 483, 307, 483 },
-/*48*/ { 213, 347, 213, 347, 347, 533, 347, 533 },
-/*49*/ { 235, 373, 235, 373, 373, 600, 373, 600 },
-/*50*/ { 277, 427, 277, 427, 427, 667, 427, 667 },
-/*51*/ { 299, 480, 299, 480, 480, 767, 480, 767 },  
-};
-
-
-
-__align16( int16_t, g_kiQuantMF[52][8]) = {
-/* 0*/	{26214, 16132, 26214, 16132, 16132, 10486, 16132, 10486 }, 
-/* 1*/	{23832, 14980, 23832, 14980, 14980,  9320, 14980,  9320 }, 
-/* 2*/	{20164, 13108, 20164, 13108, 13108,  8388, 13108,  8388 }, 
-/* 3*/	{18724, 11650, 18724, 11650, 11650,  7294, 11650,  7294 }, 
-/* 4*/	{16384, 10486, 16384, 10486, 10486,  6710, 10486,  6710 }, 
-/* 5*/	{14564,  9118, 14564,  9118,  9118,  5786,  9118,  5786 }, 
-/* 6*/	{13107,  8066, 13107,  8066,  8066,  5243,  8066,  5243 }, 
-/* 7*/	{11916,  7490, 11916,  7490,  7490,  4660,  7490,  4660 }, 
-/* 8*/	{10082,  6554, 10082,  6554,  6554,  4194,  6554,  4194 }, 
-/* 9*/	{ 9362,  5825,  9362,  5825,  5825,  3647,  5825,  3647 }, 
-/*10*/	{ 8192,  5243,  8192,  5243,  5243,  3355,  5243,  3355 }, 
-/*11*/	{ 7282,  4559,  7282,  4559,  4559,  2893,  4559,  2893 }, 
-/*12*/	{ 6554,  4033,  6554,  4033,  4033,  2622,  4033,  2622 }, 
-/*13*/	{ 5958,  3745,  5958,  3745,  3745,  2330,  3745,  2330 }, 
-/*14*/	{ 5041,  3277,  5041,  3277,  3277,  2097,  3277,  2097 }, 
-/*15*/	{ 4681,  2913,  4681,  2913,  2913,  1824,  2913,  1824 }, 
-/*16*/	{ 4096,  2622,  4096,  2622,  2622,  1678,  2622,  1678 }, 
-/*17*/	{ 3641,  2280,  3641,  2280,  2280,  1447,  2280,  1447 }, 
-/*18*/	{ 3277,  2017,  3277,  2017,  2017,  1311,  2017,  1311 }, 
-/*19*/	{ 2979,  1873,  2979,  1873,  1873,  1165,  1873,  1165 }, 
-/*20*/	{ 2521,  1639,  2521,  1639,  1639,  1049,  1639,  1049 }, 
-/*21*/	{ 2341,  1456,  2341,  1456,  1456,   912,  1456,   912 }, 
-/*22*/	{ 2048,  1311,  2048,  1311,  1311,   839,  1311,   839 }, 
-/*23*/	{ 1821,  1140,  1821,  1140,  1140,   723,  1140,   723 }, 
-/*24*/	{ 1638,  1008,  1638,  1008,  1008,   655,  1008,   655 }, 
-/*25*/	{ 1490,   936,  1490,   936,   936,   583,   936,   583 }, 
-/*26*/	{ 1260,   819,  1260,   819,   819,   524,   819,   524 }, 
-/*27*/	{ 1170,   728,  1170,   728,   728,   456,   728,   456 }, 
-/*28*/	{ 1024,   655,  1024,   655,   655,   419,   655,   419 }, 
-/*29*/	{  910,   570,   910,   570,   570,   362,   570,   362 }, 
-/*30*/	{  819,   504,   819,   504,   504,   328,   504,   328 }, 
-/*31*/	{  745,   468,   745,   468,   468,   291,   468,   291 }, 
-/*32*/	{  630,   410,   630,   410,   410,   262,   410,   262 }, 
-/*33*/	{  585,   364,   585,   364,   364,   228,   364,   228 }, 
-/*34*/	{  512,   328,   512,   328,   328,   210,   328,   210 }, 
-/*35*/	{  455,   285,   455,   285,   285,   181,   285,   181 }, 
-/*36*/	{  410,   252,   410,   252,   252,   164,   252,   164 }, 
-/*37*/	{  372,   234,   372,   234,   234,   146,   234,   146 }, 
-/*38*/	{  315,   205,   315,   205,   205,   131,   205,   131 }, 
-/*39*/	{  293,   182,   293,   182,   182,   114,   182,   114 }, 
-/*40*/	{  256,   164,   256,   164,   164,   105,   164,   105 }, 
-/*41*/	{  228,   142,   228,   142,   142,    90,   142,    90 }, 
-/*42*/	{  205,   126,   205,   126,   126,    82,   126,    82 }, 
-/*43*/	{  186,   117,   186,   117,   117,    73,   117,    73 }, 
-/*44*/	{  158,   102,   158,   102,   102,    66,   102,    66 }, 
-/*45*/	{  146,    91,   146,    91,    91,    57,    91,    57 }, 
-/*46*/	{  128,    82,   128,    82,    82,    52,    82,    52 }, 
-/*47*/	{  114,    71,   114,    71,    71,    45,    71,    45 }, 
-/*48*/	{  102,    63,   102,    63,    63,    41,    63,    41 }, 
-/*49*/	{   93,    59,    93,    59,    59,    36,    59,    36 }, 
-/*50*/	{   79,    51,    79,    51,    51,    33,    51,    33 }, 
-/*51*/	{   73,    46,    73,    46,    46,    28,    46,    28 }  
-};
-
-/****************************************************************************
- * HDM and Quant functions 
- ****************************************************************************/
-#define WELS_ABS_LC(a) ((iSign ^ (int32_t)(a)) - iSign)
-#define NEW_QUANT(pDct, iFF, iMF) (((iFF)+ WELS_ABS_LC(pDct))*(iMF)) >>16
-#define WELS_NEW_QUANT(pDct,iFF,iMF)	WELS_ABS_LC(NEW_QUANT(pDct, iFF, iMF))
-void WelsQuant4x4_c(int16_t *pDct, int16_t* pFF,  int16_t *pMF)
-{
-	int32_t i, j, iSign;
-    for( i = 0; i < 16; i+=4 )
-    {
-		j = i & 0x07;
-        iSign = WELS_SIGN(pDct[i]);
-		pDct[i] = WELS_NEW_QUANT(pDct[i], pFF[j], pMF[j]);
-		iSign = WELS_SIGN(pDct[i+1]);
-		pDct[i+1] = WELS_NEW_QUANT(pDct[i+1], pFF[j+1], pMF[j+1]);
-		iSign = WELS_SIGN(pDct[i+2]);
-		pDct[i+2] = WELS_NEW_QUANT(pDct[i+2], pFF[j+2], pMF[j+2]);
-		iSign = WELS_SIGN(pDct[i+3]);
-		pDct[i+3] = WELS_NEW_QUANT(pDct[i+3], pFF[j+3], pMF[j+3]);
-    }
-}
-
-void WelsQuant4x4Dc_c(int16_t *pDct, int16_t iFF,  int16_t iMF)
-{
-	int32_t i, iSign;
-	for(i = 0; i < 16; i+=4)
-	{
-		iSign = WELS_SIGN(pDct[i]);
-		pDct[i] = WELS_NEW_QUANT(pDct[i], iFF, iMF);
-		iSign = WELS_SIGN(pDct[i+1]);
-		pDct[i+1] = WELS_NEW_QUANT(pDct[i+1], iFF, iMF);
-		iSign = WELS_SIGN(pDct[i+2]);
-		pDct[i+2] = WELS_NEW_QUANT(pDct[i+2], iFF, iMF);
-		iSign = WELS_SIGN(pDct[i+3]);
-		pDct[i+3] = WELS_NEW_QUANT(pDct[i+3], iFF, iMF);
-	}
-}
-
-void WelsQuantFour4x4_c(int16_t *pDct, int16_t* pFF,  int16_t *pMF)
-{
-	int32_t i, j, iSign;
-
-    for( i = 0; i < 64; i+=4 )
-    {
-		j = i & 0x07;
-        iSign = WELS_SIGN(pDct[i]);
-		pDct[i] = WELS_NEW_QUANT(pDct[i], pFF[j], pMF[j]);
-		iSign = WELS_SIGN(pDct[i+1]);
-		pDct[i+1] = WELS_NEW_QUANT(pDct[i+1], pFF[j+1], pMF[j+1]);
-		iSign = WELS_SIGN(pDct[i+2]);
-		pDct[i+2] = WELS_NEW_QUANT(pDct[i+2], pFF[j+2], pMF[j+2]);
-		iSign = WELS_SIGN(pDct[i+3]);
-		pDct[i+3] = WELS_NEW_QUANT(pDct[i+3], pFF[j+3], pMF[j+3]);
-    }
-}
-
-void WelsQuantFour4x4Max_c(int16_t *pDct, int16_t* pFF,  int16_t *pMF, int16_t *pMax)
-{
-	int32_t i, j, k, iSign;
-	int16_t iMaxAbs;
-	for( k = 0; k < 4; k++)
-	{
-		iMaxAbs = 0;
-		for( i = 0; i < 16; i++ )
-		{
-			j = i & 0x07;
-			iSign = WELS_SIGN(pDct[i]);
-			pDct[i] = NEW_QUANT(pDct[i], pFF[j], pMF[j]);
-			if( iMaxAbs < pDct[i]) iMaxAbs = pDct[i];
-			pDct[i] = WELS_ABS_LC(pDct[i]);
-		}
-		pDct += 16;
-		pMax[k] = iMaxAbs;
-	}
-}
-
-int32_t WelsHadamardQuant2x2Skip_c(int16_t *pRs, int16_t iFF,  int16_t iMF)
-{
-	int16_t pDct[4], s[4];
-	int16_t iThreshold = ((1<<16)-1)/iMF - iFF;	
-
-	s[0] = pRs[0]  + pRs[32];
-    s[1] = pRs[0]  - pRs[32];
-    s[2] = pRs[16] + pRs[48];
-    s[3] = pRs[16] - pRs[48];
-
-    pDct[0] = s[0] + s[2];
-    pDct[1] = s[0] - s[2];
-    pDct[2] = s[1] + s[3];
-    pDct[3] = s[1] - s[3];
-
-	return ((WELS_ABS(pDct[0]) > iThreshold) || (WELS_ABS(pDct[1]) > iThreshold) || (WELS_ABS(pDct[2]) > iThreshold) || (WELS_ABS(pDct[3]) > iThreshold));
-}
-
-int32_t WelsHadamardQuant2x2_c(int16_t *pRs, const int16_t iFF, int16_t iMF, int16_t * pDct, int16_t * pBlock)
-{
-	int16_t s[4];
-	int32_t iSign, i, iDcNzc = 0;
-
-	s[0] = pRs[0]  + pRs[32];
-    s[1] = pRs[0]  - pRs[32];
-    s[2] = pRs[16] + pRs[48];
-    s[3] = pRs[16] - pRs[48];
-
-	pRs[0] = 0;
-	pRs[16] = 0;
-	pRs[32] = 0;
-	pRs[48] = 0;
-
-    pDct[0] = s[0] + s[2];
-    pDct[1] = s[0] - s[2];
-    pDct[2] = s[1] + s[3];
-    pDct[3] = s[1] - s[3];
-
-	iSign = WELS_SIGN(pDct[0]);
-	pDct[0] = WELS_NEW_QUANT(pDct[0], iFF, iMF);
-	iSign = WELS_SIGN(pDct[1]);
-	pDct[1] = WELS_NEW_QUANT(pDct[1], iFF, iMF);
-	iSign = WELS_SIGN(pDct[2]);
-	pDct[2] = WELS_NEW_QUANT(pDct[2], iFF, iMF);
-	iSign = WELS_SIGN(pDct[3]);
-	pDct[3] = WELS_NEW_QUANT(pDct[3], iFF, iMF);
-
-	ST64( pBlock, LD64(pDct) );
-
-	for(i=0; i<4; i++)	
-		iDcNzc += (pBlock[i] != 0);
-	return iDcNzc;
-}
-
-/* dc value pick up and hdm_4x4 */
-void WelsHadamardT4Dc_c( int16_t *pLumaDc, int16_t *pDct)
-{
-	int32_t p[16], s[4];
-	int32_t i, iIdx;
-
-    for(i = 0 ; i < 16 ; i +=4)
-    {
-    	iIdx = ((i&0x08) << 4) +((i&0x04) << 3);		
-		s[0] = pDct[iIdx ]	+ pDct[iIdx+80];
-        s[3] = pDct[iIdx ]	- pDct[iIdx+80];
-        s[1] = pDct[iIdx+16]	+ pDct[iIdx+64];
-        s[2] = pDct[iIdx+16]	- pDct[iIdx+64];
-
-        p[i  ] = s[0] + s[1];
-        p[i+2] = s[0] - s[1];
-        p[i+1] = s[3] + s[2];
-        p[i+3] = s[3] - s[2];
-    }
-
-    for(i = 0 ; i < 4 ; i ++)
-    {
-        s[0] = p[i ]	+ p[i+12];
-        s[3] = p[i ]	- p[i+12];
-        s[1] = p[i+4]	+ p[i+ 8];
-        s[2] = p[i+4]	- p[i+ 8];
-
-		pLumaDc[i  ] = WELS_CLIP3((s[0] + s[1] + 1) >> 1, -32768, 32767);
-		pLumaDc[i+8 ] = WELS_CLIP3((s[0] - s[1] + 1) >> 1, -32768, 32767);
-		pLumaDc[i+4 ] = WELS_CLIP3((s[3] + s[2] + 1) >> 1, -32768, 32767);
-		pLumaDc[i+12] = WELS_CLIP3((s[3] - s[2] + 1) >> 1, -32768, 32767);
-    }
-}
-
-/****************************************************************************
- * DCT functions
- ****************************************************************************/
-void WelsDctT4_c( int16_t *pDct, uint8_t *pPixel1, int32_t iStride1, uint8_t *pPixel2, int32_t iStride2 )
-{
-	int16_t i, pData[16], s[4];
-    for(i = 0 ; i < 16 ; i +=4)
-    {    	
-		const int32_t kiI1= 1 + i;
-		const int32_t kiI2= 2 + i;
-		const int32_t kiI3= 3 + i;
-		
-		pData[i ] = pPixel1[0] - pPixel2[0];
-		pData[kiI1] = pPixel1[1] - pPixel2[1];
-		pData[kiI2] = pPixel1[2] - pPixel2[2];
-		pData[kiI3] = pPixel1[3] - pPixel2[3];
-
-        pPixel1 += iStride1;
-        pPixel2 += iStride2;
-
-		/*horizontal transform */
-        s[0] = pData[i] + pData[kiI3];
-        s[3] = pData[i] - pData[kiI3];
-        s[1] = pData[kiI1] + pData[kiI2];
-        s[2] = pData[kiI1] - pData[kiI2];
-
-        pDct[i ] = s[0] + s[1];
-        pDct[kiI2] = s[0] - s[1];
-        pDct[kiI1] = (s[3] << 1) + s[2];
-        pDct[kiI3] = s[3] - (s[2] << 1);
-    }
-
-    /* vertical transform */
-    for(i = 0 ; i < 4 ; i ++)
-    {
-		const int32_t kiI4	= 4 + i;
-		const int32_t kiI8	= 8 + i;
-		const int32_t kiI12	= 12 + i;
-
-        s[0] = pDct[i ] + pDct[kiI12];
-        s[3] = pDct[i ] - pDct[kiI12];
-        s[1] = pDct[kiI4] + pDct[kiI8 ];
-        s[2] = pDct[kiI4] - pDct[kiI8 ];
-
-        pDct[i  ] = s[0] + s[1];
-        pDct[kiI8 ] = s[0] - s[1];
-        pDct[kiI4 ] = (s[3] << 1) + s[2];
-        pDct[kiI12] = s[3] - (s[2] << 1);
-    }
-}
-
-void WelsDctFourT4_c(int16_t *pDct, uint8_t *pPixel1, int32_t iStride1, uint8_t *pPixel2, int32_t iStride2 )
-{
-	int32_t stride_1 = iStride1 << 2;
-	int32_t stride_2 = iStride2 << 2;
-
-    WelsDctT4_c( pDct,      &pPixel1[0],          iStride1, &pPixel2[0],          iStride2	);
-	WelsDctT4_c( pDct + 16, &pPixel1[4],          iStride1, &pPixel2[4],          iStride2	);
-	WelsDctT4_c( pDct + 32, &pPixel1[stride_1  ], iStride1, &pPixel2[stride_2  ], iStride2	);
-	WelsDctT4_c( pDct + 48, &pPixel1[stride_1+4], iStride1, &pPixel2[stride_2+4], iStride2	);
-}
-
-/****************************************************************************
- * Scan and Score functions
- ****************************************************************************/
-void WelsScan4x4DcAc_c( int16_t* pLevel, int16_t *pDct )
-{
-	ST32( pLevel, LD32(pDct) );	
-	pLevel[2] = pDct[4];
-	pLevel[3] = pDct[8];
-	pLevel[4] = pDct[5];
-	ST32( pLevel+5, LD32(pDct+2) );	
-	pLevel[7] = pDct[6];
-	pLevel[8] = pDct[9];
-	ST32( pLevel+9, LD32(pDct+12) );	
-	pLevel[11] = pDct[10];
-	pLevel[12] = pDct[7];
-	pLevel[13] = pDct[11];
-	ST32( pLevel+14, LD32(pDct+14) );
-}
-
-void WelsScan4x4Ac_c( int16_t* pLevel, int16_t* pDct )
-{
-	pLevel[0]  = pDct[1];
-	pLevel[1]  = pDct[4];
-	pLevel[2]  = pDct[8];
-	pLevel[3]  = pDct[5];
-	ST32( &pLevel[4], LD32(&pDct[2]) );	
-	pLevel[6]  = pDct[6];	
-	pLevel[7]  = pDct[9];
-	ST32( &pLevel[8], LD32(&pDct[12]) );	
-	pLevel[10] = pDct[10];
-	pLevel[11] = pDct[7];
-	pLevel[12] = pDct[11];
-	ST32( &pLevel[13], LD32(&pDct[14]) );
-	pLevel[15] = 0;
-}
-
-void WelsScan4x4Dc( int16_t* pLevel, int16_t* pDct )
-{
-	ST32( pLevel, LD32(pDct) );	
-	pLevel[2] = pDct[4];	
-	pLevel[3] = pDct[8];	
-	pLevel[4] = pDct[5];	
-	ST32( pLevel+5, LD32(pDct+2) );	
-	pLevel[7] = pDct[6];	
-	pLevel[8] = pDct[9];	
-	ST32( pLevel+9, LD32(pDct+12) );	
-	pLevel[11] = pDct[10];
-	pLevel[12] = pDct[7];	
-	pLevel[13] = pDct[11];
-	ST32( pLevel+14, LD32(pDct+14) );	
-}
-
-//refer to JVT-O079
-int32_t WelsCalculateSingleCtr4x4_c( int16_t *pDct)
-{
-    static const int32_t kiTRunTable[16] = { 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
-
-    int32_t iSingleCtr = 0;
-    int32_t iIdx = 15;
-    int32_t iRun;
-
-    while( iIdx >= 0 && pDct[iIdx] == 0 )      --iIdx;
-
-    while( iIdx >= 0 )
-    {
-		-- iIdx;        
-		iRun = iIdx;
-        while( iIdx >= 0 && pDct[iIdx] == 0 )  --iIdx;            
-		iRun -= iIdx;
-        iSingleCtr += kiTRunTable[iRun];
-    }
-    return iSingleCtr;
-}
-
-/****************************************************************************
- * Copy functions 
- ****************************************************************************/
-void WelsCopy4x4( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS )
-{
-	const int32_t kiSrcStride2 = iStrideS << 1;
-	const int32_t kiSrcStride3 = iStrideS + kiSrcStride2;
-	const int32_t kiDstStride2 = iStrideD << 1;
-	const int32_t kiDstStride3 = iStrideD + kiDstStride2;
-
-	ST32( pDst,				LD32(pSrc) );
-	ST32( pDst+iStrideD,	LD32(pSrc+iStrideS) );
-	ST32( pDst+kiDstStride2, LD32(pSrc+kiSrcStride2) );
-	ST32( pDst+kiDstStride3, LD32(pSrc+kiSrcStride3) );
-}
-void WelsCopy8x8_c( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS )
-{
-	int32_t i;
-	for( i = 0; i < 4; i++)
-	{
-		ST32( pDst,				LD32(pSrc			));	ST32( pDst + 4 ,			LD32(pSrc + 4			));
-		ST32( pDst + iStrideD,	LD32(pSrc + iStrideS));	ST32( pDst + iStrideD + 4 ,	LD32(pSrc + iStrideS + 4));
-		pDst += iStrideD << 1;
-		pSrc += iStrideS << 1;
-	}
-}
-void WelsCopy8x16_c( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS )
-{
-	int32_t i;
-	for( i = 0; i < 8; ++i )
-	{
-		ST32( pDst,				LD32(pSrc			));	ST32( pDst + 4 ,			LD32(pSrc + 4			));
-		ST32( pDst + iStrideD,	LD32(pSrc + iStrideS));	ST32( pDst + iStrideD + 4 ,	LD32(pSrc + iStrideS + 4));
-		pDst += iStrideD << 1;
-		pSrc += iStrideS << 1;
-	}
-}
-void WelsCopy16x8_c( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS )
-{
-	int32_t i;
-	for( i = 0; i < 8; i++)
-	{
-		ST32( pDst,		LD32(pSrc		));	ST32( pDst + 4 ,	LD32(pSrc + 4 ));
-		ST32( pDst + 8 ,LD32(pSrc + 8	));	ST32( pDst + 12 ,	LD32(pSrc + 12));
-		pDst += iStrideD ;
-		pSrc += iStrideS;
-	}
-}
-void WelsCopy16x16_c( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS )
-{
-	int32_t i;
-	for( i = 0; i < 16; i++)
-	{
-		ST32( pDst,		LD32(pSrc		));	ST32( pDst + 4 ,	LD32(pSrc + 4 ));
-		ST32( pDst + 8 ,LD32(pSrc + 8	));	ST32( pDst + 12 ,	LD32(pSrc + 12));
-		pDst += iStrideD ;
-		pSrc += iStrideS;
-	}
-}
-
-int32_t WelsGetNoneZeroCount_c(int16_t * pLevel)
-{
-	int32_t iCnt = 0; 
-	int32_t iIdx = 0;
-
-	while (iIdx < 16) {
-		iCnt += (pLevel[  iIdx] == 0);
-		iCnt += (pLevel[1+iIdx] == 0);
-		iCnt += (pLevel[2+iIdx] == 0);
-		iCnt += (pLevel[3+iIdx] == 0);
-
-		iIdx += 4;
-	}
-	return (16 - iCnt);
-}
-
-void WelsInitEncodingFuncs( SWelsFuncPtrList *pFuncList, uint32_t  uiCpuFlag )
-{
-	pFuncList->pfCopy8x8Aligned			= WelsCopy8x8_c;
-	pFuncList->pfCopy16x16Aligned		=
-	pFuncList->pfCopy16x16NotAligned	= WelsCopy16x16_c;
-	pFuncList->pfCopy16x8NotAligned		= WelsCopy16x8_c;
-	pFuncList->pfCopy8x16Aligned		= WelsCopy8x16_c;
-
-	pFuncList->pfQuantizationHadamard2x2		= WelsHadamardQuant2x2_c;
-	pFuncList->pfQuantizationHadamard2x2Skip	= WelsHadamardQuant2x2Skip_c;	
-	pFuncList->pfTransformHadamard4x4Dc			= WelsHadamardT4Dc_c;	
-
-	pFuncList->pfDctT4					= WelsDctT4_c;
-	pFuncList->pfDctFourT4   			= WelsDctFourT4_c;
-	
-	pFuncList->pfScan4x4				= WelsScan4x4DcAc_c;
-	pFuncList->pfScan4x4Ac				= WelsScan4x4Ac_c;
-	pFuncList->pfCalculateSingleCtr4x4	= WelsCalculateSingleCtr4x4_c;
-
-	pFuncList->pfGetNoneZeroCount		= WelsGetNoneZeroCount_c;
-
-	pFuncList->pfQuantization4x4		= WelsQuant4x4_c;
-	pFuncList->pfQuantizationDc4x4		= WelsQuant4x4Dc_c;
-	pFuncList->pfQuantizationFour4x4	= WelsQuantFour4x4_c;
-	pFuncList->pfQuantizationFour4x4Max	= WelsQuantFour4x4Max_c;
-
-#if defined(X86_ASM)
-	if ( uiCpuFlag & WELS_CPU_MMXEXT )
-	{
-		
-		pFuncList->pfQuantizationHadamard2x2		= WelsHadamardQuant2x2_mmx;
-		pFuncList->pfQuantizationHadamard2x2Skip	= WelsHadamardQuant2x2Skip_mmx;	
-
-		pFuncList->pfDctT4					= WelsDctT4_mmx;
-
-		pFuncList->pfCopy8x8Aligned			= WelsCopy8x8_mmx;
-		pFuncList->pfCopy8x16Aligned		= WelsCopy8x16_mmx;
-	}
-	if ( uiCpuFlag & WELS_CPU_SSE2 )
-	{
-		pFuncList->pfGetNoneZeroCount		= WelsGetNoneZeroCount_sse2;	
-		pFuncList->pfTransformHadamard4x4Dc	= WelsHadamardT4Dc_sse2;
-
-		pFuncList->pfQuantization4x4		= WelsQuant4x4_sse2;
-		pFuncList->pfQuantizationDc4x4		= WelsQuant4x4Dc_sse2;
-		pFuncList->pfQuantizationFour4x4	= WelsQuantFour4x4_sse2;
-		pFuncList->pfQuantizationFour4x4Max	= WelsQuantFour4x4Max_sse2;
-
-		pFuncList->pfCopy16x16Aligned		= WelsCopy16x16_sse2;
-		pFuncList->pfCopy16x16NotAligned	= WelsCopy16x16NotAligned_sse2;
-		pFuncList->pfCopy16x8NotAligned		= WelsCopy16x8NotAligned_sse2;
-
-		pFuncList->pfScan4x4				= WelsScan4x4DcAc_sse2;
-		pFuncList->pfScan4x4Ac				= WelsScan4x4Ac_sse2;
-		pFuncList->pfCalculateSingleCtr4x4	= WelsCalculateSingleCtr4x4_sse2;
-
-		pFuncList->pfDctFourT4				= WelsDctFourT4_sse2;		
-	}
-//#ifndef MACOS
-	if ( uiCpuFlag & WELS_CPU_SSSE3 )
-    {
-    	pFuncList->pfScan4x4				= WelsScan4x4DcAc_ssse3;
-	}
-
-//#endif//MACOS
-
-#endif//X86_ASM
-}
-}
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <string.h>
+
+#include "macros.h"
+#include "ls_defines.h"
+#include "encode_mb_aux.h"
+#include "cpu_core.h"
+#include "as264_common.h"
+#include "svc_encode_mb.h"
+namespace WelsSVCEnc {
+
+/*
+ * Rounding-offset table: 58 rows of 8 int16_t values, declared through the
+ * project's __align16 macro.  Every row has the layout { a, b, a, b, b, c, b, c }.
+ * Rows 0..51 are labelled with their own index.  The last six rows are marked
+ * "only for intra" and re-use the labels 46..51 although their array indices
+ * are 52..57.
+ * NOTE(review): presumably a row is selected from the QP and handed to the
+ * WelsQuant*_c routines as pFF (they index it with `k & 7`) -- confirm against
+ * the callers, and confirm that the intra rows are meant to be reached as QP + 6.
+ */
+__align16 (int16_t, g_kiQuantInterFF[58][8]) = {
+  /* 0*/ {   0,   1,   0,   1,   1,   1,   1,   1 },
+  /* 1*/ {   0,   1,   0,   1,   1,   1,   1,   1 },
+  /* 2*/ {   1,   1,   1,   1,   1,   1,   1,   1 },
+  /* 3*/ {   1,   1,   1,   1,   1,   1,   1,   1 },
+  /* 4*/ {   1,   1,   1,   1,   1,   2,   1,   2 },
+  /* 5*/ {   1,   1,   1,   1,   1,   2,   1,   2 },
+  /* 6*/ {   1,   1,   1,   1,   1,   2,   1,   2 },
+  /* 7*/ {   1,   1,   1,   1,   1,   2,   1,   2 },
+  /* 8*/ {   1,   2,   1,   2,   2,   3,   2,   3 },
+  /* 9*/ {   1,   2,   1,   2,   2,   3,   2,   3 },
+  /*10*/ {   1,   2,   1,   2,   2,   3,   2,   3 },
+  /*11*/ {   1,   2,   1,   2,   2,   4,   2,   4 },
+  /*12*/ {   2,   3,   2,   3,   3,   4,   3,   4 },
+  /*13*/ {   2,   3,   2,   3,   3,   5,   3,   5 },
+  /*14*/ {   2,   3,   2,   3,   3,   5,   3,   5 },
+  /*15*/ {   2,   4,   2,   4,   4,   6,   4,   6 },
+  /*16*/ {   3,   4,   3,   4,   4,   7,   4,   7 },
+  /*17*/ {   3,   5,   3,   5,   5,   8,   5,   8 },
+  /*18*/ {   3,   5,   3,   5,   5,   8,   5,   8 },
+  /*19*/ {   4,   6,   4,   6,   6,   9,   6,   9 },
+  /*20*/ {   4,   7,   4,   7,   7,  10,   7,  10 },
+  /*21*/ {   5,   8,   5,   8,   8,  12,   8,  12 },
+  /*22*/ {   5,   8,   5,   8,   8,  13,   8,  13 },
+  /*23*/ {   6,  10,   6,  10,  10,  15,  10,  15 },
+  /*24*/ {   7,  11,   7,  11,  11,  17,  11,  17 },
+  /*25*/ {   7,  12,   7,  12,  12,  19,  12,  19 },
+  /*26*/ {   9,  13,   9,  13,  13,  21,  13,  21 },
+  /*27*/ {   9,  15,   9,  15,  15,  24,  15,  24 },
+  /*28*/ {  11,  17,  11,  17,  17,  26,  17,  26 },
+  /*29*/ {  12,  19,  12,  19,  19,  30,  19,  30 },
+  /*30*/ {  13,  22,  13,  22,  22,  33,  22,  33 },
+  /*31*/ {  15,  23,  15,  23,  23,  38,  23,  38 },
+  /*32*/ {  17,  27,  17,  27,  27,  42,  27,  42 },
+  /*33*/ {  19,  30,  19,  30,  30,  48,  30,  48 },
+  /*34*/ {  21,  33,  21,  33,  33,  52,  33,  52 },
+  /*35*/ {  24,  38,  24,  38,  38,  60,  38,  60 },
+  /*36*/ {  27,  43,  27,  43,  43,  67,  43,  67 },
+  /*37*/ {  29,  47,  29,  47,  47,  75,  47,  75 },
+  /*38*/ {  35,  53,  35,  53,  53,  83,  53,  83 },
+  /*39*/ {  37,  60,  37,  60,  60,  96,  60,  96 },
+  /*40*/ {  43,  67,  43,  67,  67, 104,  67, 104 },
+  /*41*/ {  48,  77,  48,  77,  77, 121,  77, 121 },
+  /*42*/ {  53,  87,  53,  87,  87, 133,  87, 133 },
+  /*43*/ {  59,  93,  59,  93,  93, 150,  93, 150 },
+  /*44*/ {  69, 107,  69, 107, 107, 167, 107, 167 },
+  /*45*/ {  75, 120,  75, 120, 120, 192, 120, 192 },
+  /*46*/ {  85, 133,  85, 133, 133, 208, 133, 208 },
+  /*47*/ {  96, 153,  96, 153, 153, 242, 153, 242 },
+  /*48*/ { 107, 173, 107, 173, 173, 267, 173, 267 },
+  /*49*/ { 117, 187, 117, 187, 187, 300, 187, 300 },
+  /*50*/ { 139, 213, 139, 213, 213, 333, 213, 333 },
+  /*51*/ { 149, 240, 149, 240, 240, 383, 240, 383 },
+  /* from here below is only for intra */
+  /*46*/ { 171, 267, 171, 267, 267, 417, 267, 417 },
+  /*47*/ { 192, 307, 192, 307, 307, 483, 307, 483 },
+  /*48*/ { 213, 347, 213, 347, 347, 533, 347, 533 },
+  /*49*/ { 235, 373, 235, 373, 373, 600, 373, 600 },
+  /*50*/ { 277, 427, 277, 427, 427, 667, 427, 667 },
+  /*51*/ { 299, 480, 299, 480, 480, 767, 480, 767 },
+};
+
+
+
+/*
+ * Quantization multiplier table: 52 rows (labelled 0..51) of 8 int16_t values,
+ * declared through the project's __align16 macro.  Every row has the layout
+ * { a, b, a, b, b, c, b, c }, and the values shrink as the row index grows.
+ * NOTE(review): presumably a row is selected from the QP and handed to the
+ * WelsQuant*_c routines as pMF (they index it with `k & 7` and shift the
+ * product right by 16) -- confirm against the callers.
+ */
+__align16 (int16_t, g_kiQuantMF[52][8]) = {
+  /* 0*/	{26214, 16132, 26214, 16132, 16132, 10486, 16132, 10486 },
+  /* 1*/	{23832, 14980, 23832, 14980, 14980,  9320, 14980,  9320 },
+  /* 2*/	{20164, 13108, 20164, 13108, 13108,  8388, 13108,  8388 },
+  /* 3*/	{18724, 11650, 18724, 11650, 11650,  7294, 11650,  7294 },
+  /* 4*/	{16384, 10486, 16384, 10486, 10486,  6710, 10486,  6710 },
+  /* 5*/	{14564,  9118, 14564,  9118,  9118,  5786,  9118,  5786 },
+  /* 6*/	{13107,  8066, 13107,  8066,  8066,  5243,  8066,  5243 },
+  /* 7*/	{11916,  7490, 11916,  7490,  7490,  4660,  7490,  4660 },
+  /* 8*/	{10082,  6554, 10082,  6554,  6554,  4194,  6554,  4194 },
+  /* 9*/	{ 9362,  5825,  9362,  5825,  5825,  3647,  5825,  3647 },
+  /*10*/	{ 8192,  5243,  8192,  5243,  5243,  3355,  5243,  3355 },
+  /*11*/	{ 7282,  4559,  7282,  4559,  4559,  2893,  4559,  2893 },
+  /*12*/	{ 6554,  4033,  6554,  4033,  4033,  2622,  4033,  2622 },
+  /*13*/	{ 5958,  3745,  5958,  3745,  3745,  2330,  3745,  2330 },
+  /*14*/	{ 5041,  3277,  5041,  3277,  3277,  2097,  3277,  2097 },
+  /*15*/	{ 4681,  2913,  4681,  2913,  2913,  1824,  2913,  1824 },
+  /*16*/	{ 4096,  2622,  4096,  2622,  2622,  1678,  2622,  1678 },
+  /*17*/	{ 3641,  2280,  3641,  2280,  2280,  1447,  2280,  1447 },
+  /*18*/	{ 3277,  2017,  3277,  2017,  2017,  1311,  2017,  1311 },
+  /*19*/	{ 2979,  1873,  2979,  1873,  1873,  1165,  1873,  1165 },
+  /*20*/	{ 2521,  1639,  2521,  1639,  1639,  1049,  1639,  1049 },
+  /*21*/	{ 2341,  1456,  2341,  1456,  1456,   912,  1456,   912 },
+  /*22*/	{ 2048,  1311,  2048,  1311,  1311,   839,  1311,   839 },
+  /*23*/	{ 1821,  1140,  1821,  1140,  1140,   723,  1140,   723 },
+  /*24*/	{ 1638,  1008,  1638,  1008,  1008,   655,  1008,   655 },
+  /*25*/	{ 1490,   936,  1490,   936,   936,   583,   936,   583 },
+  /*26*/	{ 1260,   819,  1260,   819,   819,   524,   819,   524 },
+  /*27*/	{ 1170,   728,  1170,   728,   728,   456,   728,   456 },
+  /*28*/	{ 1024,   655,  1024,   655,   655,   419,   655,   419 },
+  /*29*/	{  910,   570,   910,   570,   570,   362,   570,   362 },
+  /*30*/	{  819,   504,   819,   504,   504,   328,   504,   328 },
+  /*31*/	{  745,   468,   745,   468,   468,   291,   468,   291 },
+  /*32*/	{  630,   410,   630,   410,   410,   262,   410,   262 },
+  /*33*/	{  585,   364,   585,   364,   364,   228,   364,   228 },
+  /*34*/	{  512,   328,   512,   328,   328,   210,   328,   210 },
+  /*35*/	{  455,   285,   455,   285,   285,   181,   285,   181 },
+  /*36*/	{  410,   252,   410,   252,   252,   164,   252,   164 },
+  /*37*/	{  372,   234,   372,   234,   234,   146,   234,   146 },
+  /*38*/	{  315,   205,   315,   205,   205,   131,   205,   131 },
+  /*39*/	{  293,   182,   293,   182,   182,   114,   182,   114 },
+  /*40*/	{  256,   164,   256,   164,   164,   105,   164,   105 },
+  /*41*/	{  228,   142,   228,   142,   142,    90,   142,    90 },
+  /*42*/	{  205,   126,   205,   126,   126,    82,   126,    82 },
+  /*43*/	{  186,   117,   186,   117,   117,    73,   117,    73 },
+  /*44*/	{  158,   102,   158,   102,   102,    66,   102,    66 },
+  /*45*/	{  146,    91,   146,    91,    91,    57,    91,    57 },
+  /*46*/	{  128,    82,   128,    82,    82,    52,    82,    52 },
+  /*47*/	{  114,    71,   114,    71,    71,    45,    71,    45 },
+  /*48*/	{  102,    63,   102,    63,    63,    41,    63,    41 },
+  /*49*/	{   93,    59,    93,    59,    59,    36,    59,    36 },
+  /*50*/	{   79,    51,    79,    51,    51,    33,    51,    33 },
+  /*51*/	{   73,    46,    73,    46,    46,    28,    46,    28 }
+};
+
+/****************************************************************************
+ * HDM and Quant functions
+ ****************************************************************************/
+/*
+ * Quantization helper macros.  Each of them expands to an expression that
+ * reads a local variable named `iSign`; the caller must assign it (the
+ * routines below use WELS_SIGN on the coefficient) immediately before use.
+ *
+ *   WELS_ABS_LC(a)      (iSign ^ a) - iSign.  With iSign == 0 this yields a,
+ *                       with iSign == -1 it yields -a.
+ *                       NOTE(review): assumes WELS_SIGN returns 0 / -1 --
+ *                       confirm in macros.h.
+ *   NEW_QUANT(c,ff,mf)  ((ff + WELS_ABS_LC(c)) * mf) >> 16.
+ *   WELS_NEW_QUANT      NEW_QUANT with WELS_ABS_LC applied to the result.
+ *
+ * NEW_QUANT is wrapped in an outer pair of parentheses so that the trailing
+ * shift cannot bind to whatever follows the macro at the point of use.
+ */
+#define WELS_ABS_LC(a) ((iSign ^ (int32_t)(a)) - iSign)
+#define NEW_QUANT(pDct, iFF, iMF) ((((iFF) + WELS_ABS_LC(pDct)) * (iMF)) >> 16)
+#define WELS_NEW_QUANT(pDct, iFF, iMF) WELS_ABS_LC(NEW_QUANT(pDct, iFF, iMF))
+/*
+ * Quantizes the 16 coefficients of one 4x4 block in place.
+ *   pDct  16 coefficients; each is replaced by WELS_NEW_QUANT of itself.
+ *   pFF   eight rounding offsets; coefficient k uses pFF[k & 7].
+ *   pMF   eight multipliers;      coefficient k uses pMF[k & 7].
+ */
+void WelsQuant4x4_c (int16_t* pDct, int16_t* pFF,  int16_t* pMF) {
+  int32_t iSign;  /* read implicitly by WELS_NEW_QUANT */
+
+  for (int32_t iCoef = 0; iCoef < 16; ++iCoef) {
+    const int32_t kiTab = iCoef & 0x07;
+
+    iSign = WELS_SIGN (pDct[iCoef]);
+    pDct[iCoef] = WELS_NEW_QUANT (pDct[iCoef], pFF[kiTab], pMF[kiTab]);
+  }
+}
+
+/*
+ * Quantizes 16 coefficients in place using one rounding offset and one
+ * multiplier for all of them.
+ *   pDct  16 coefficients; each is replaced by WELS_NEW_QUANT of itself.
+ *   iFF   rounding offset shared by all coefficients.
+ *   iMF   multiplier shared by all coefficients.
+ */
+void WelsQuant4x4Dc_c (int16_t* pDct, int16_t iFF,  int16_t iMF) {
+  int32_t iSign;  /* read implicitly by WELS_NEW_QUANT */
+
+  for (int32_t iCoef = 0; iCoef < 16; ++iCoef) {
+    iSign = WELS_SIGN (pDct[iCoef]);
+    pDct[iCoef] = WELS_NEW_QUANT (pDct[iCoef], iFF, iMF);
+  }
+}
+
+/*
+ * Quantizes 64 consecutive coefficients (four 4x4 blocks) in place.
+ *   pDct  64 coefficients; each is replaced by WELS_NEW_QUANT of itself.
+ *   pFF   eight rounding offsets; coefficient k uses pFF[k & 7].
+ *   pMF   eight multipliers;      coefficient k uses pMF[k & 7].
+ */
+void WelsQuantFour4x4_c (int16_t* pDct, int16_t* pFF,  int16_t* pMF) {
+  int32_t iSign;  /* read implicitly by WELS_NEW_QUANT */
+
+  for (int32_t iCoef = 0; iCoef < 64; ++iCoef) {
+    const int32_t kiTab = iCoef & 0x07;
+
+    iSign = WELS_SIGN (pDct[iCoef]);
+    pDct[iCoef] = WELS_NEW_QUANT (pDct[iCoef], pFF[kiTab], pMF[kiTab]);
+  }
+}
+
+/*
+ * Quantizes four consecutive 4x4 blocks in place and reports, per block, the
+ * largest quantized value seen before the sign is put back.
+ *   pDct  64 coefficients, overwritten with the quantized levels.
+ *   pFF   eight rounding offsets; coefficient k of a block uses pFF[k & 7].
+ *   pMF   eight multipliers;      coefficient k of a block uses pMF[k & 7].
+ *   pMax  receives four values, one per block.
+ */
+void WelsQuantFour4x4Max_c (int16_t* pDct, int16_t* pFF,  int16_t* pMF, int16_t* pMax) {
+  int32_t iSign;  /* read implicitly by NEW_QUANT / WELS_ABS_LC */
+
+  for (int32_t iBlk = 0; iBlk < 4; ++iBlk) {
+    int16_t* pCoef = pDct + (iBlk << 4);
+    int16_t iPeak = 0;
+
+    for (int32_t iPos = 0; iPos < 16; ++iPos) {
+      const int32_t kiTab = iPos & 0x07;
+
+      iSign = WELS_SIGN (pCoef[iPos]);
+      /* store the unsigned level first so the peak is tracked on it */
+      pCoef[iPos] = NEW_QUANT (pCoef[iPos], pFF[kiTab], pMF[kiTab]);
+      if (pCoef[iPos] > iPeak)
+        iPeak = pCoef[iPos];
+      /* then re-apply the sign captured above */
+      pCoef[iPos] = WELS_ABS_LC (pCoef[iPos]);
+    }
+    pMax[iBlk] = iPeak;
+  }
+}
+
+/*
+ * Applies a 2x2 Hadamard transform to pRs[0], pRs[16], pRs[32] and pRs[48]
+ * and tests the four results against a threshold derived from iFF and iMF.
+ * pRs is only read.
+ * Returns 1 as soon as one transformed value exceeds
+ * ((1 << 16) - 1) / iMF - iFF in magnitude, otherwise 0.
+ */
+int32_t WelsHadamardQuant2x2Skip_c (int16_t* pRs, int16_t iFF,  int16_t iMF) {
+  const int16_t kiLimit = ((1 << 16) - 1) / iMF - iFF;
+
+  const int16_t kiSumA  = pRs[0]  + pRs[32];
+  const int16_t kiDiffA = pRs[0]  - pRs[32];
+  const int16_t kiSumB  = pRs[16] + pRs[48];
+  const int16_t kiDiffB = pRs[16] - pRs[48];
+
+  int16_t iDc[4];
+  iDc[0] = kiSumA  + kiSumB;
+  iDc[1] = kiSumA  - kiSumB;
+  iDc[2] = kiDiffA + kiDiffB;
+  iDc[3] = kiDiffA - kiDiffB;
+
+  for (int32_t k = 0; k < 4; ++k) {
+    if (WELS_ABS (iDc[k]) > kiLimit)
+      return 1;
+  }
+  return 0;
+}
+
+/*
+ * 2x2 Hadamard transform plus quantization of the values found at pRs[0],
+ * pRs[16], pRs[32] and pRs[48].
+ *   pRs     source buffer; the four source entries are set to 0.
+ *   iFF     rounding offset used for all four values.
+ *   iMF     multiplier used for all four values.
+ *   pDct    receives the four quantized values.
+ *   pBlock  receives a copy of pDct via ST64/LD64
+ *           (NOTE(review): presumably one 8-byte move -- confirm in ls_defines.h).
+ * Returns how many of pBlock[0..3] are non-zero.
+ */
+int32_t WelsHadamardQuant2x2_c (int16_t* pRs, const int16_t iFF, int16_t iMF, int16_t* pDct, int16_t* pBlock) {
+  int32_t iSign;  /* read implicitly by WELS_NEW_QUANT */
+  int32_t iNonZero = 0;
+
+  const int16_t kiSumA  = pRs[0]  + pRs[32];
+  const int16_t kiDiffA = pRs[0]  - pRs[32];
+  const int16_t kiSumB  = pRs[16] + pRs[48];
+  const int16_t kiDiffB = pRs[16] - pRs[48];
+
+  /* clear the consumed source entries before anything is written out */
+  for (int32_t k = 0; k < 64; k += 16)
+    pRs[k] = 0;
+
+  pDct[0] = kiSumA  + kiSumB;
+  pDct[1] = kiSumA  - kiSumB;
+  pDct[2] = kiDiffA + kiDiffB;
+  pDct[3] = kiDiffA - kiDiffB;
+
+  for (int32_t k = 0; k < 4; ++k) {
+    iSign = WELS_SIGN (pDct[k]);
+    pDct[k] = WELS_NEW_QUANT (pDct[k], iFF, iMF);
+  }
+
+  ST64 (pBlock, LD64 (pDct));
+
+  for (int32_t k = 0; k < 4; ++k) {
+    if (pBlock[k] != 0)
+      ++iNonZero;
+  }
+  return iNonZero;
+}
+
+/*
+ * DC pick-up followed by a 4x4 Hadamard transform.
+ * Sixteen values are gathered from pDct at offsets 0, 16, 64, 80 (plus a base
+ * of 0, 32, 128 or 160), transformed along both directions, rounded with
+ * (x + 1) >> 1 and clipped to [-32768, 32767].
+ *   pLumaDc  receives the 16 results.
+ *   pDct     source coefficients; only read.
+ */
+void WelsHadamardT4Dc_c (int16_t* pLumaDc, int16_t* pDct) {
+  int32_t iTmp[16];
+
+  /* first pass: gather four values per group and butterfly them */
+  for (int32_t iGrp = 0; iGrp < 4; ++iGrp) {
+    const int16_t* pIn = pDct + ((iGrp & 2) << 6) + ((iGrp & 1) << 5);
+    int32_t* pOut = iTmp + (iGrp << 2);
+
+    const int32_t kiOuterSum  = pIn[0]  + pIn[80];
+    const int32_t kiOuterDiff = pIn[0]  - pIn[80];
+    const int32_t kiInnerSum  = pIn[16] + pIn[64];
+    const int32_t kiInnerDiff = pIn[16] - pIn[64];
+
+    pOut[0] = kiOuterSum  + kiInnerSum;
+    pOut[1] = kiOuterDiff + kiInnerDiff;
+    pOut[2] = kiOuterSum  - kiInnerSum;
+    pOut[3] = kiOuterDiff - kiInnerDiff;
+  }
+
+  /* second pass: butterfly across the groups, then round and clip */
+  for (int32_t iCol = 0; iCol < 4; ++iCol) {
+    const int32_t kiOuterSum  = iTmp[iCol]     + iTmp[iCol + 12];
+    const int32_t kiOuterDiff = iTmp[iCol]     - iTmp[iCol + 12];
+    const int32_t kiInnerSum  = iTmp[iCol + 4] + iTmp[iCol + 8];
+    const int32_t kiInnerDiff = iTmp[iCol + 4] - iTmp[iCol + 8];
+
+    pLumaDc[iCol]      = WELS_CLIP3 ((kiOuterSum  + kiInnerSum  + 1) >> 1, -32768, 32767);
+    pLumaDc[iCol + 4]  = WELS_CLIP3 ((kiOuterDiff + kiInnerDiff + 1) >> 1, -32768, 32767);
+    pLumaDc[iCol + 8]  = WELS_CLIP3 ((kiOuterSum  - kiInnerSum  + 1) >> 1, -32768, 32767);
+    pLumaDc[iCol + 12] = WELS_CLIP3 ((kiOuterDiff - kiInnerDiff + 1) >> 1, -32768, 32767);
+  }
+}
+
+/****************************************************************************
+ * DCT functions
+ ****************************************************************************/
+/*
+ * Forward 4x4 integer transform of the difference between two 4x4 pixel blocks.
+ *   pDct      receives the 16 transformed values, row by row.
+ *   pPixel1   first block;  rows are iStride1 bytes apart.
+ *   pPixel2   second block; rows are iStride2 bytes apart.
+ * The residual is pPixel1 - pPixel2.  Rows are transformed first while the
+ * residual is formed, then the columns are transformed in place in pDct.
+ */
+void WelsDctT4_c (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2) {
+  /* residual + horizontal transform, one row per iteration */
+  for (int32_t iRow = 0; iRow < 4; ++iRow) {
+    int16_t* pOut = pDct + (iRow << 2);
+
+    const int16_t kiR0 = pPixel1[0] - pPixel2[0];
+    const int16_t kiR1 = pPixel1[1] - pPixel2[1];
+    const int16_t kiR2 = pPixel1[2] - pPixel2[2];
+    const int16_t kiR3 = pPixel1[3] - pPixel2[3];
+
+    const int16_t kiOuterSum  = kiR0 + kiR3;
+    const int16_t kiOuterDiff = kiR0 - kiR3;
+    const int16_t kiInnerSum  = kiR1 + kiR2;
+    const int16_t kiInnerDiff = kiR1 - kiR2;
+
+    pOut[0] = kiOuterSum + kiInnerSum;
+    pOut[1] = (kiOuterDiff << 1) + kiInnerDiff;
+    pOut[2] = kiOuterSum - kiInnerSum;
+    pOut[3] = kiOuterDiff - (kiInnerDiff << 1);
+
+    pPixel1 += iStride1;
+    pPixel2 += iStride2;
+  }
+
+  /* vertical transform, one column per iteration */
+  for (int32_t iCol = 0; iCol < 4; ++iCol) {
+    int16_t* pCol = pDct + iCol;
+
+    const int16_t kiOuterSum  = pCol[0] + pCol[12];
+    const int16_t kiOuterDiff = pCol[0] - pCol[12];
+    const int16_t kiInnerSum  = pCol[4] + pCol[8];
+    const int16_t kiInnerDiff = pCol[4] - pCol[8];
+
+    pCol[0]  = kiOuterSum + kiInnerSum;
+    pCol[4]  = (kiOuterDiff << 1) + kiInnerDiff;
+    pCol[8]  = kiOuterSum - kiInnerSum;
+    pCol[12] = kiOuterDiff - (kiInnerDiff << 1);
+  }
+}
+
+/*
+ * Runs WelsDctT4_c on the four 4x4 quadrants of an 8x8 area.
+ * Quadrant order is top-left, top-right, bottom-left, bottom-right; quadrant
+ * q writes its 16 results to pDct + 16 * q.
+ *   pPixel1 / iStride1   first pixel block and its row stride.
+ *   pPixel2 / iStride2   second pixel block and its row stride.
+ */
+void WelsDctFourT4_c (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2) {
+  const int32_t kiDown1 = iStride1 << 2;  /* four rows down in block 1 */
+  const int32_t kiDown2 = iStride2 << 2;  /* four rows down in block 2 */
+
+  for (int32_t iQuad = 0; iQuad < 4; ++iQuad) {
+    const int32_t kiRight = (iQuad & 1) << 2;
+    const int32_t kiOff1  = ((iQuad & 2) ? kiDown1 : 0) + kiRight;
+    const int32_t kiOff2  = ((iQuad & 2) ? kiDown2 : 0) + kiRight;
+
+    WelsDctT4_c (pDct + (iQuad << 4), &pPixel1[kiOff1], iStride1, &pPixel2[kiOff2], iStride2);
+  }
+}
+
+/****************************************************************************
+ * Scan and Score functions
+ ****************************************************************************/
+/*
+ * Reorders all 16 coefficients of a 4x4 block.
+ * Source index for each output position:
+ *   0 1 4 8 5 2 3 6 9 12 13 10 7 11 14 15
+ * Neighbouring source entries that land on neighbouring outputs are moved as
+ * a pair with ST32/LD32 (NOTE(review): presumably a 32-bit load/store --
+ * confirm in ls_defines.h).
+ *   pLevel  receives the 16 reordered values.
+ *   pDct    source coefficients; only read.
+ */
+void WelsScan4x4DcAc_c (int16_t* pLevel, int16_t* pDct) {
+  ST32 (&pLevel[0], LD32 (&pDct[0]));     /* out 0,1   <- in 0,1   */
+  pLevel[2]  = pDct[4];
+  pLevel[3]  = pDct[8];
+  pLevel[4]  = pDct[5];
+  ST32 (&pLevel[5], LD32 (&pDct[2]));     /* out 5,6   <- in 2,3   */
+  pLevel[7]  = pDct[6];
+  pLevel[8]  = pDct[9];
+  ST32 (&pLevel[9], LD32 (&pDct[12]));    /* out 9,10  <- in 12,13 */
+  pLevel[11] = pDct[10];
+  pLevel[12] = pDct[7];
+  pLevel[13] = pDct[11];
+  ST32 (&pLevel[14], LD32 (&pDct[14]));   /* out 14,15 <- in 14,15 */
+}
+
+/*
+ * Reorders the 15 coefficients pDct[1..15] of a 4x4 block; pDct[0] is skipped.
+ * Source index for output positions 0..14:
+ *   1 4 8 5 2 3 6 9 12 13 10 7 11 14 15
+ * Output position 15 is set to 0.
+ * Pairs are moved with ST32/LD32 (NOTE(review): presumably a 32-bit
+ * load/store -- confirm in ls_defines.h).
+ *   pLevel  receives 16 values.
+ *   pDct    source coefficients; only read.
+ */
+void WelsScan4x4Ac_c (int16_t* pLevel, int16_t* pDct) {
+  pLevel[0]  = pDct[1];
+  pLevel[1]  = pDct[4];
+  pLevel[2]  = pDct[8];
+  pLevel[3]  = pDct[5];
+  ST32 (pLevel + 4, LD32 (pDct + 2));     /* out 4,5   <- in 2,3   */
+  pLevel[6]  = pDct[6];
+  pLevel[7]  = pDct[9];
+  ST32 (pLevel + 8, LD32 (pDct + 12));    /* out 8,9   <- in 12,13 */
+  pLevel[10] = pDct[10];
+  pLevel[11] = pDct[7];
+  pLevel[12] = pDct[11];
+  ST32 (pLevel + 13, LD32 (pDct + 14));   /* out 13,14 <- in 14,15 */
+  pLevel[15] = 0;                         /* no 16th value without pDct[0] */
+}
+
+/*
+ * Reorders all 16 coefficients of a 4x4 block.
+ * Source index for each output position:
+ *   0 1 4 8 5 2 3 6 9 12 13 10 7 11 14 15
+ * The body used to be a statement-for-statement copy of WelsScan4x4DcAc_c;
+ * it now forwards to that routine so the scan order lives in one place.
+ *   pLevel  receives the 16 reordered values.
+ *   pDct    source coefficients; only read.
+ */
+void WelsScan4x4Dc (int16_t* pLevel, int16_t* pDct) {
+  WelsScan4x4DcAc_c (pLevel, pDct);
+}
+
+/*
+ * Scores the 16 coefficients of a 4x4 block (refer to JVT-O079).
+ * Walking from index 15 down to 0, every non-zero coefficient adds
+ * kiTRunTable[r], where r is the number of zero coefficients directly below
+ * it.  Zeros above the highest non-zero coefficient are ignored.
+ *   pDct  16 coefficients; only read.
+ * Returns the accumulated score (0 when all coefficients are zero).
+ */
+int32_t WelsCalculateSingleCtr4x4_c (int16_t* pDct) {
+  static const int32_t kiTRunTable[16] = { 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+  int32_t iScore = 0;
+  int32_t iPos   = 15;
+
+  /* locate the highest-index non-zero coefficient */
+  for (; iPos >= 0; --iPos) {
+    if (pDct[iPos] != 0)
+      break;
+  }
+
+  /* iPos sits on a non-zero coefficient whenever the loop body is entered */
+  while (iPos >= 0) {
+    const int32_t kiRunStart = --iPos;
+
+    while (iPos >= 0 && pDct[iPos] == 0)
+      --iPos;
+
+    iScore += kiTRunTable[kiRunStart - iPos];
+  }
+  return iScore;
+}
+
+/****************************************************************************
+ * Copy functions
+ ****************************************************************************/
+/*
+ * Copies four rows from pSrc to pDst, one ST32/LD32 transfer per row
+ * (NOTE(review): presumably 4 bytes each -- confirm in ls_defines.h).
+ *   pDst / iStrideD   destination and its row stride.
+ *   pSrc / iStrideS   source and its row stride.
+ */
+void WelsCopy4x4 (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS) {
+  for (int32_t iRow = 0; iRow < 4; ++iRow) {
+    ST32 (pDst + iRow * iStrideD, LD32 (pSrc + iRow * iStrideS));
+  }
+}
+/*
+ * Copies eight rows from pSrc to pDst, two ST32/LD32 transfers per row
+ * (offsets 0 and 4).
+ *   pDst / iStrideD   destination and its row stride.
+ *   pSrc / iStrideS   source and its row stride.
+ */
+void WelsCopy8x8_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS) {
+  for (int32_t iRow = 0; iRow < 8; ++iRow) {
+    ST32 (pDst, LD32 (pSrc));
+    ST32 (pDst + 4, LD32 (pSrc + 4));
+    pDst += iStrideD;
+    pSrc += iStrideS;
+  }
+}
+/*
+ * Copies sixteen rows from pSrc to pDst, two ST32/LD32 transfers per row
+ * (offsets 0 and 4).
+ *   pDst / iStrideD   destination and its row stride.
+ *   pSrc / iStrideS   source and its row stride.
+ */
+void WelsCopy8x16_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS) {
+  for (int32_t iRow = 0; iRow < 16; ++iRow) {
+    ST32 (pDst, LD32 (pSrc));
+    ST32 (pDst + 4, LD32 (pSrc + 4));
+    pDst += iStrideD;
+    pSrc += iStrideS;
+  }
+}
+/*
+ * Copies eight rows from pSrc to pDst, four ST32/LD32 transfers per row
+ * (offsets 0, 4, 8 and 12).
+ *   pDst / iStrideD   destination and its row stride.
+ *   pSrc / iStrideS   source and its row stride.
+ */
+void WelsCopy16x8_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS) {
+  for (int32_t iRow = 0; iRow < 8; ++iRow) {
+    for (int32_t iX = 0; iX < 16; iX += 4) {
+      ST32 (pDst + iX, LD32 (pSrc + iX));
+    }
+    pDst += iStrideD;
+    pSrc += iStrideS;
+  }
+}
+/*
+ * Copies sixteen rows from pSrc to pDst, four ST32/LD32 transfers per row
+ * (offsets 0, 4, 8 and 12).
+ *   pDst / iStrideD   destination and its row stride.
+ *   pSrc / iStrideS   source and its row stride.
+ */
+void WelsCopy16x16_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS) {
+  for (int32_t iRow = 0; iRow < 16; ++iRow) {
+    for (int32_t iX = 0; iX < 16; iX += 4) {
+      ST32 (pDst + iX, LD32 (pSrc + iX));
+    }
+    pDst += iStrideD;
+    pSrc += iStrideS;
+  }
+}
+
+/*
+ * Counts the non-zero entries among pLevel[0..15].
+ *   pLevel  16 values; only read.
+ * Returns a value between 0 and 16.
+ */
+int32_t WelsGetNoneZeroCount_c (int16_t* pLevel) {
+  int32_t iNonZero = 0;
+
+  for (int32_t k = 0; k < 16; ++k) {
+    if (pLevel[k] != 0)
+      ++iNonZero;
+  }
+  return iNonZero;
+}
+
+/*
+ * Fills the transform / quantization / scan / copy entries of pFuncList.
+ * Every entry is first pointed at its plain C routine.  When X86_ASM is
+ * defined, entries are then overridden according to the bits set in
+ * uiCpuFlag, in the order MMXEXT, SSE2, SSSE3, so a later group wins where
+ * two groups set the same entry (pfScan4x4).
+ *   pFuncList  table to fill.
+ *   uiCpuFlag  bit mask tested against the WELS_CPU_* constants.
+ */
+void WelsInitEncodingFuncs (SWelsFuncPtrList* pFuncList, uint32_t  uiCpuFlag) {
+  /* ---- plain C defaults ---- */
+
+  /* block copy */
+  pFuncList->pfCopy8x8Aligned      = WelsCopy8x8_c;
+  pFuncList->pfCopy16x16NotAligned = WelsCopy16x16_c;
+  pFuncList->pfCopy16x16Aligned    = pFuncList->pfCopy16x16NotAligned;
+  pFuncList->pfCopy16x8NotAligned  = WelsCopy16x8_c;
+  pFuncList->pfCopy8x16Aligned     = WelsCopy8x16_c;
+
+  /* Hadamard */
+  pFuncList->pfQuantizationHadamard2x2     = WelsHadamardQuant2x2_c;
+  pFuncList->pfQuantizationHadamard2x2Skip = WelsHadamardQuant2x2Skip_c;
+  pFuncList->pfTransformHadamard4x4Dc      = WelsHadamardT4Dc_c;
+
+  /* DCT */
+  pFuncList->pfDctT4     = WelsDctT4_c;
+  pFuncList->pfDctFourT4 = WelsDctFourT4_c;
+
+  /* scan and score */
+  pFuncList->pfScan4x4               = WelsScan4x4DcAc_c;
+  pFuncList->pfScan4x4Ac             = WelsScan4x4Ac_c;
+  pFuncList->pfCalculateSingleCtr4x4 = WelsCalculateSingleCtr4x4_c;
+
+  pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_c;
+
+  /* quantization */
+  pFuncList->pfQuantization4x4        = WelsQuant4x4_c;
+  pFuncList->pfQuantizationDc4x4      = WelsQuant4x4Dc_c;
+  pFuncList->pfQuantizationFour4x4    = WelsQuantFour4x4_c;
+  pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_c;
+
+#if defined(X86_ASM)
+  /* ---- MMXEXT overrides ---- */
+  if ((uiCpuFlag & WELS_CPU_MMXEXT) != 0) {
+    pFuncList->pfQuantizationHadamard2x2     = WelsHadamardQuant2x2_mmx;
+    pFuncList->pfQuantizationHadamard2x2Skip = WelsHadamardQuant2x2Skip_mmx;
+
+    pFuncList->pfDctT4 = WelsDctT4_mmx;
+
+    pFuncList->pfCopy8x8Aligned  = WelsCopy8x8_mmx;
+    pFuncList->pfCopy8x16Aligned = WelsCopy8x16_mmx;
+  }
+
+  /* ---- SSE2 overrides ---- */
+  if ((uiCpuFlag & WELS_CPU_SSE2) != 0) {
+    pFuncList->pfGetNoneZeroCount       = WelsGetNoneZeroCount_sse2;
+    pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_sse2;
+
+    pFuncList->pfQuantization4x4        = WelsQuant4x4_sse2;
+    pFuncList->pfQuantizationDc4x4      = WelsQuant4x4Dc_sse2;
+    pFuncList->pfQuantizationFour4x4    = WelsQuantFour4x4_sse2;
+    pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_sse2;
+
+    pFuncList->pfCopy16x16Aligned    = WelsCopy16x16_sse2;
+    pFuncList->pfCopy16x16NotAligned = WelsCopy16x16NotAligned_sse2;
+    pFuncList->pfCopy16x8NotAligned  = WelsCopy16x8NotAligned_sse2;
+
+    pFuncList->pfScan4x4               = WelsScan4x4DcAc_sse2;
+    pFuncList->pfScan4x4Ac             = WelsScan4x4Ac_sse2;
+    pFuncList->pfCalculateSingleCtr4x4 = WelsCalculateSingleCtr4x4_sse2;
+
+    pFuncList->pfDctFourT4 = WelsDctFourT4_sse2;
+  }
+
+  /* ---- SSSE3 override (a MACOS guard around it is disabled in the source) ---- */
+  if ((uiCpuFlag & WELS_CPU_SSSE3) != 0) {
+    pFuncList->pfScan4x4 = WelsScan4x4DcAc_ssse3;
+  }
+#endif//X86_ASM
+}
+}
--- a/codec/encoder/core/src/encoder.cpp
+++ b/codec/encoder/core/src/encoder.cpp
@@ -1,554 +1,498 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	encoder.c
- *
- * \brief	core encoder
- *
- * \date	5/14/2009 Created
- *
- *************************************************************************************
- */
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <assert.h>
-#include "encoder.h"
-#include "extern.h"
-#include "cpu.h"
-#include "cpu_core.h"
-#include "utils.h"
-
-#include "encode_mb_aux.h"
-#include "decode_mb_aux.h"
-#include "get_intra_predictor.h"
-#include "svc_encode_mb.h"
-
-#include "deblocking.h"
-#include "expand_pic.h"
-
-#include "mc.h"
-#include "sample.h"
-
-#include "svc_encode_slice.h"
-#include "svc_base_layer_md.h"
-#include "svc_mode_decision.h"
-#include "set_mb_syn_cavlc.h"
-#include "crt_util_safe_x.h"	// Safe CRT routines like utils for cross_platforms
-#include "codec_def.h"
-#ifdef MT_ENABLED
-#include "slice_multi_threading.h"
-#endif//MT_ENABLED
-
-//  global   function  pointers  definition
-namespace WelsSVCEnc {
-/* Motion compensation */
-
-
-/*!
- * \brief	initialize source picture body
- * \param	pSrc		SSourcePicture*
- * \param	csp		internal csp format
- * \param	iWidth	widht of picture in pixels
- * \param	iHeight	iHeight of picture in pixels
- * \return	successful - 0; otherwise none 0 for failed
- */
-int32_t InitPic( const void *kpSrc, const int32_t kiColorspace, const int32_t kiWidth, const int32_t kiHeight )
-{
-	SSourcePicture *pSrcPic = (SSourcePicture *)kpSrc;
-
-	if ( NULL == pSrcPic || kiWidth == 0 || kiHeight == 0 )
-		return 1;
-
-	pSrcPic->iColorFormat	= kiColorspace;
-	pSrcPic->iPicWidth		= kiWidth;
-	pSrcPic->iPicHeight		= kiHeight;
-	
-	switch( kiColorspace & (~videoFormatVFlip) ) {
-	case videoFormatI420:
-	case videoFormatYV12:
-		pSrcPic->pData[0]	= NULL;
-		pSrcPic->pData[1]	= NULL;
-		pSrcPic->pData[2]	= NULL;
-		pSrcPic->pData[3]	= NULL;
-		pSrcPic->iStride[0]	= kiWidth;
-		pSrcPic->iStride[2]	= pSrcPic->iStride[1] = kiWidth >> 1;
-		pSrcPic->iStride[3]	= 0;
-		break;	
-	case videoFormatYUY2:
-	case videoFormatYVYU:
-	case videoFormatUYVY:
-		pSrcPic->pData[0]	= NULL;
-		pSrcPic->pData[1]	= NULL;
-		pSrcPic->pData[2]	= NULL;
-		pSrcPic->pData[3]	= NULL;		
-		pSrcPic->iStride[0]	= CALC_BI_STRIDE(kiWidth,  16);
-		pSrcPic->iStride[3]	= pSrcPic->iStride[2] = pSrcPic->iStride[1] = 0;		
-		break;
-	case videoFormatRGB:
-	case videoFormatBGR:
-		pSrcPic->pData[0]	= NULL;
-		pSrcPic->pData[1]	= NULL;
-		pSrcPic->pData[2]	= NULL;
-		pSrcPic->pData[3]	= NULL;		
-		pSrcPic->iStride[0]	= CALC_BI_STRIDE(kiWidth, 24);
-		pSrcPic->iStride[3]	= pSrcPic->iStride[2] = pSrcPic->iStride[1] = 0;
-		if( kiColorspace & videoFormatVFlip )
-			pSrcPic->iColorFormat = kiColorspace & (~videoFormatVFlip);
-		else 
-			pSrcPic->iColorFormat = kiColorspace | videoFormatVFlip;
-		break;
-	case videoFormatBGRA:
-	case videoFormatRGBA:
-	case videoFormatARGB:
-	case videoFormatABGR:
-		pSrcPic->pData[0]	= NULL;
-		pSrcPic->pData[1]	= NULL;
-		pSrcPic->pData[2]	= NULL;
-		pSrcPic->pData[3]	= NULL;		
-		pSrcPic->iStride[0]	= kiWidth << 2;
-		pSrcPic->iStride[3]	= pSrcPic->iStride[2] = pSrcPic->iStride[1] = 0;	
-		if( kiColorspace & videoFormatVFlip )
-			pSrcPic->iColorFormat = kiColorspace & (~videoFormatVFlip);
-		else 
-			pSrcPic->iColorFormat = kiColorspace | videoFormatVFlip;
-		break;
-	default:
-		return 2;	// any else?
-	}
-
-	return 0;
-}
-
-
-void WelsInitBGDFunc( SWelsFuncPtrList *pFuncList, const bool_t kbEnableBackgroundDetection )
-{
-	if ( kbEnableBackgroundDetection )
-	{
-		 pFuncList->pfInterMdBackgroundDecision = (PInterMdBackgroundDecisionFunc)WelsMdInterJudgeBGDPskip;
-		 pFuncList->pfInterMdBackgroundInfoUpdate = (PInterMdBackgroundInfoUpdateFunc)WelsMdInterUpdateBGDInfo;
-	}
-	else
-	{
-		 pFuncList->pfInterMdBackgroundDecision = (PInterMdBackgroundDecisionFunc)WelsMdInterJudgeBGDPskipFalse;
-		 pFuncList->pfInterMdBackgroundInfoUpdate = (PInterMdBackgroundInfoUpdateFunc)WelsMdInterUpdateBGDInfoNULL;
-	}
-}
-
-/*!
- * \brief	initialize function pointers that potentially used in Wels encoding
- * \param	pEncCtx		sWelsEncCtx*
- * \return	successful - 0; otherwise none 0 for failed
- */
-int32_t InitFunctionPointers( SWelsFuncPtrList *pFuncList, SWelsSvcCodingParam *pParam, uint32_t uiCpuFlag )
-{	
-	int32_t iReturn = 0;
-
-	/* Functionality utilization of CPU instructions dependency */
-	pFuncList->pfSetMemZeroSize8	= WelsSetMemZero_c;		// confirmed_safe_unsafe_usage
-	pFuncList->pfSetMemZeroSize64Aligned16	= WelsSetMemZero_c;	// confirmed_safe_unsafe_usage
-	pFuncList->pfSetMemZeroSize64	= WelsSetMemZero_c;	// confirmed_safe_unsafe_usage
-#if defined(X86_ASM)
-	if ( uiCpuFlag & WELS_CPU_MMXEXT )
-	{		
-		pFuncList->pfSetMemZeroSize8	= WelsSetMemZeroSize8_mmx;		// confirmed_safe_unsafe_usage
-		pFuncList->pfSetMemZeroSize64Aligned16	= WelsSetMemZeroSize64_mmx;	// confirmed_safe_unsafe_usage
-		pFuncList->pfSetMemZeroSize64	= WelsSetMemZeroSize64_mmx;	// confirmed_safe_unsafe_usage
-	}
-	if ( uiCpuFlag & WELS_CPU_SSE2 )
-	{
-		pFuncList->pfSetMemZeroSize64Aligned16	= WelsSetMemZeroAligned64_sse2;	// confirmed_safe_unsafe_usage
-	}
-#endif//X86_ASM
-
-	InitExpandPictureFunc( pFuncList, uiCpuFlag );
-
-	/* Intra_Prediction_fn*/	
-	WelsInitFillingPredFuncs( uiCpuFlag );
-	WelsInitIntraPredFuncs( pFuncList, uiCpuFlag );
-
-	/* sad, satd, average */
-	WelsInitSampleSadFunc(pFuncList, uiCpuFlag);
-
-	//
-	WelsInitBGDFunc(pFuncList, pParam->bEnableBackgroundDetection );
-	// for pfGetVarianceFromIntraVaa function ptr adaptive by CPU features, 6/7/2010
-	InitIntraAnalysisVaaInfo( pFuncList, uiCpuFlag );
-	
-	/* Motion compensation */
-	/*init pixel average function*/
-	/*get one column or row pixel when refinement*/
-	WelsInitMcFuncs(pFuncList, uiCpuFlag);
-	InitCoeffFunc( uiCpuFlag );
-
-	WelsInitEncodingFuncs( pFuncList, uiCpuFlag );
-	WelsInitReconstructionFuncs( pFuncList, uiCpuFlag );
-
-	DeblockingInit( &pFuncList->pfDeblocking, uiCpuFlag );
-	WelsBlockFuncInit( &pFuncList->pfSetNZCZero, uiCpuFlag );
-
-	InitFillNeighborCacheInterFunc ( pFuncList, pParam->bEnableBackgroundDetection );
-
-	return iReturn;
-}
-
-/*!
- * \brief	initialize frame coding	
- */
-void InitFrameCoding( sWelsEncCtx *pEncCtx, const EFrameType keFrameType )
-{
-	// for bitstream writing
-	pEncCtx->iPosBsBuffer		= 0;	// reset bs pBuffer position
-	pEncCtx->pOut->iNalIndex		= 0;	// reset NAL index
-	
-	InitBits( &pEncCtx->pOut->sBsWrite, pEncCtx->pOut->pBsBuffer, pEncCtx->pOut->uiSize );
-
-	if ( keFrameType == WELS_FRAME_TYPE_P )
-	{
-		if ( pEncCtx->pSvcParam->uiIntraPeriod )
-		{
-			++pEncCtx->iFrameIndex;
-		}
-		
-		++pEncCtx->uiFrameIdxRc;
-
-		if ( pEncCtx->iPOC < ( 1 << pEncCtx->pSps->iLog2MaxPocLsb ) - 2 ) // if iPOC type is no 0, this need be modification
-			pEncCtx->iPOC			+= 2;	// for POC type 0
-		else
-			pEncCtx->iPOC = 0;
-		
-		if ( pEncCtx->eLastNalPriority != 0 )
-		{
-			if ( pEncCtx->iFrameNum < (1 << pEncCtx->pSps->uiLog2MaxFrameNum) - 1  )
-				++ pEncCtx->iFrameNum;
-			else
-				pEncCtx->iFrameNum	= 0;	// if iFrameNum overflow
-		}
-		pEncCtx->eNalType		= NAL_UNIT_CODED_SLICE;
-		pEncCtx->eSliceType	= P_SLICE;
-		pEncCtx->eNalPriority	= NRI_PRI_HIGH;
-	}
-	else if ( keFrameType == WELS_FRAME_TYPE_IDR )
-	{
-		pEncCtx->iFrameNum		= 0;
-		pEncCtx->iPOC			= 0;
-		pEncCtx->bEncCurFrmAsIdrFlag = false;
-		if ( pEncCtx->pSvcParam->uiIntraPeriod )
-		{
-			pEncCtx->iFrameIndex = 0;
-		}		
-		pEncCtx->uiFrameIdxRc = 0;
-
-		pEncCtx->eNalType		= NAL_UNIT_CODED_SLICE_IDR;
-		pEncCtx->eSliceType	= I_SLICE;
-		pEncCtx->eNalPriority	= NRI_PRI_HIGHEST;
-
-		pEncCtx->iCodingIndex	= 0;
-
-		// reset_ref_list
-
-		// rc_init_gop		
-	}
-	else if ( keFrameType == WELS_FRAME_TYPE_I )
-	{
-		if ( pEncCtx->iPOC < ( 1 << pEncCtx->pSps->iLog2MaxPocLsb ) - 2 ) // if iPOC type is no 0, this need be modification
-			pEncCtx->iPOC			+= 2;	// for POC type 0
-		else
-			pEncCtx->iPOC = 0;
-		
-		if ( pEncCtx->eLastNalPriority != 0 )
-		{
-			if ( pEncCtx->iFrameNum < (1 << pEncCtx->pSps->uiLog2MaxFrameNum) - 1  )
-				++ pEncCtx->iFrameNum;
-			else
-				pEncCtx->iFrameNum	= 0;	// if iFrameNum overflow
-		}
-
-		pEncCtx->eNalType		= NAL_UNIT_CODED_SLICE;
-		pEncCtx->eSliceType	= I_SLICE;
-		pEncCtx->eNalPriority	= NRI_PRI_HIGHEST;
-
-		// rc_init_gop
-	}
-	else	// B pictures are not supported now, any else?
-	{
-		assert( 0 );
-	}
-
-#if defined(STAT_OUTPUT)
-	memset( &pEncCtx->sPerInfo, 0, sizeof(SStatSliceInfo) );
-#endif//FRAME_INFO_OUTPUT
-
-#if defined(MT_ENABLED) && defined(PACKING_ONE_SLICE_PER_LAYER)
-	if ( pEncCtx->pSvcParam->iMultipleThreadIdc > 1 )
-		reset_env_mt( pEncCtx );
-#endif
-}
-
-EFrameType DecideFrameType( sWelsEncCtx *pEncCtx, const int8_t kiSpatialNum )
-{	
-	SWelsSvcCodingParam *pSvcParam	= pEncCtx->pSvcParam;
-	EFrameType iFrameType = WELS_FRAME_TYPE_AUTO;
-	bool_t bSceneChangeFlag = false;
-	
-	// perform scene change detection	
-	if ( (!pSvcParam->bEnableSceneChangeDetect) || pEncCtx->pVaa->bIdrPeriodFlag || 
-		(kiSpatialNum < pSvcParam->iNumDependencyLayer) || (pEncCtx->uiFrameIdxRc < (VGOP_SIZE << 1)) ) // avoid too frequent I frame coding, rc control 
-	{
-		bSceneChangeFlag = false;
-	}
-	else
-	{
-		bSceneChangeFlag = pEncCtx->pVaa->bSceneChangeFlag;
-	}
-
-	//scene_changed_flag: RC enable && iSpatialNum == pSvcParam->iNumDependencyLayer 
-	//bIdrPeriodFlag: RC disable || iSpatialNum != pSvcParam->iNumDependencyLayer
-	//pEncCtx->bEncCurFrmAsIdrFlag: 1. first frame should be IDR; 2. idr pause; 3. idr request
-	iFrameType = ( pEncCtx->pVaa->bIdrPeriodFlag || bSceneChangeFlag || pEncCtx->bEncCurFrmAsIdrFlag ) ? WELS_FRAME_TYPE_IDR : WELS_FRAME_TYPE_P;
-
-	if (  WELS_FRAME_TYPE_P == iFrameType && pEncCtx->iSkipFrameFlag > 0 ) // for frame skip, 1/5/2010
-	{
-		-- pEncCtx->iSkipFrameFlag;
-		iFrameType = WELS_FRAME_TYPE_SKIP;
-	}
-	else if ( WELS_FRAME_TYPE_IDR == iFrameType )
-	{
-		pEncCtx->iCodingIndex = 0;
-	}
-
-	return iFrameType;
-}
-
-/*!
- * \brief	Dump reconstruction for dependency layer
- */
-
-extern "C" void DumpDependencyRec( SPicture *pCurPicture, const str_t *kpFileName, const int8_t kiDid )
-{
-	FILE *pDumpRecFile											= NULL;	
-	static bool_t bDependencyRecFlag[MAX_DEPENDENCY_LAYER]	= {0};
-	int32_t iWrittenSize											= 0;
-
-	if ( NULL == pCurPicture || NULL == kpFileName || kiDid >= MAX_DEPENDENCY_LAYER )
-		return;
-	
-	if ( bDependencyRecFlag[kiDid] )
-	{
-		if ( STRNLEN(kpFileName, MAX_FNAME_LEN) > 0 )	// confirmed_safe_unsafe_usage
-#if defined(__GNUC__) || (defined(WIN32) && defined(_MSC_VER) && (_MSC_VER<1500))
-			pDumpRecFile	= FOPEN( kpFileName, "ab" );
-#elif defined(WIN32) && defined(_MSC_VER) && (_MSC_VER>=1500)	// vs2008
-			FOPEN(&pDumpRecFile, kpFileName, "ab");
-#endif//__GNUC__..
-		else
-		{
-			str_t sDependencyRecFileName[16] = {0};			
-#if defined(WIN32) && defined(_MSC_VER) && (_MSC_VER>=1500)	// vs2008
-			SNPRINTF( sDependencyRecFileName, 16, 16, "rec%d.yuv", kiDid );	// confirmed_safe_unsafe_usage
-			FOPEN( &pDumpRecFile, sDependencyRecFileName, "ab" );
-#else
-			SNPRINTF( sDependencyRecFileName, 16, "rec%d.yuv", kiDid );	// confirmed_safe_unsafe_usage
-			pDumpRecFile	= FOPEN( sDependencyRecFileName, "ab" );
-#endif//WIN32..
-		}
-		if ( NULL != pDumpRecFile)
-			fseek( pDumpRecFile, 0, SEEK_END );
-	}
-	else
-	{
-		if ( STRNLEN(kpFileName, MAX_FNAME_LEN) > 0 )	// confirmed_safe_unsafe_usage
-		{
-#if defined(WIN32) && defined(_MSC_VER) && (_MSC_VER>=1500)	// vs2008
-			FOPEN(&pDumpRecFile, kpFileName, "wb");
-#else
-			pDumpRecFile	= FOPEN( kpFileName, "wb" );
-#endif//WIN32..
-		}
-		else
-		{
-			str_t sDependencyRecFileName[16] = {0};
-#if defined(WIN32) && defined(_MSC_VER) && (_MSC_VER>=1500)	// vs2008
-			SNPRINTF( sDependencyRecFileName, 16, 16, "rec%d.yuv", kiDid );	// confirmed_safe_unsafe_usage
-			FOPEN(&pDumpRecFile, sDependencyRecFileName, "wb");
-#else
-			SNPRINTF( sDependencyRecFileName, 16, "rec%d.yuv", kiDid );	// confirmed_safe_unsafe_usage
-			pDumpRecFile	= FOPEN( sDependencyRecFileName, "wb");
-#endif//WIN32..
-		}
-		bDependencyRecFlag[kiDid]	= true;
-	}
-
-	if ( NULL != pDumpRecFile )
-	{
-		int32_t i = 0;
-		int32_t j = 0;
-		const int32_t kiStrideY	= pCurPicture->iLineSize[0];		
-		const int32_t kiLumaWidth	= pCurPicture->iWidthInPixel;
-		const int32_t kiLumaHeight	= pCurPicture->iHeightInPixel;
-		const int32_t kiChromaWidth	= kiLumaWidth >> 1;
-		const int32_t kiChromaHeight	= kiLumaHeight >> 1;		
-		
-		for( j = 0; j < kiLumaHeight; ++ j)
-		{
-			iWrittenSize = fwrite( &pCurPicture->pData[0][j*kiStrideY], 1, kiLumaWidth, pDumpRecFile );
-			assert( iWrittenSize == kiLumaWidth );
-			if ( iWrittenSize < kiLumaWidth )
-			{
-				assert( 0 );	// make no sense for us if writing failed
-				fclose(pDumpRecFile);
-				return;
-			}
-		}
-		for( i = 1; i < I420_PLANES; ++ i)
-		{
-			const int32_t kiStrideUV = pCurPicture->iLineSize[i];			
-			for ( j = 0; j < kiChromaHeight; ++ j)
-			{
-				iWrittenSize = fwrite( &pCurPicture->pData[i][j*kiStrideUV], 1, kiChromaWidth, pDumpRecFile );
-				assert(iWrittenSize == kiChromaWidth );
-				if ( iWrittenSize < kiChromaWidth )
-				{
-					assert( 0 );	// make no sense for us if writing failed
-					fclose(pDumpRecFile);
-					return;
-				}
-			}
-		}
-		fclose(pDumpRecFile);
-		pDumpRecFile = NULL;
-	}
-}
-
-/*!
- * \brief	Dump the reconstruction pictures
- */
-
-void DumpRecFrame( SPicture *pCurPicture, const str_t *kpFileName )
-{
-	FILE *pDumpRecFile				= NULL;	
-	static bool_t bRecFlag	= false;
-	int32_t iWrittenSize			= 0;
-
-	if ( NULL == pCurPicture || NULL == kpFileName )
-		return;
-	
-	if ( bRecFlag )
-	{
-		if ( STRNLEN(kpFileName, MAX_FNAME_LEN) > 0 )	// confirmed_safe_unsafe_usage
-		{
-#if defined(WIN32) && defined(_MSC_VER) && (_MSC_VER>=1500)	// vs2008
-			FOPEN(&pDumpRecFile, kpFileName, "ab");
-#else
-			pDumpRecFile	= FOPEN( kpFileName, "ab" );
-#endif//WIN32
-		}
-		else
-		{
-#if defined(WIN32) && defined(_MSC_VER) && (_MSC_VER>=1500)	// vs2008
-			FOPEN(&pDumpRecFile, "rec.yuv", "ab");
-#else
-			pDumpRecFile	= FOPEN( "rec.yuv", "ab" );
-#endif//WIN32
-		}
-		if ( NULL != pDumpRecFile)
-			fseek( pDumpRecFile, 0, SEEK_END );
-	}
-	else
-	{
-		if ( STRNLEN(kpFileName, MAX_FNAME_LEN) > 0 )	// confirmed_safe_unsafe_usage
-		{
-#if defined(WIN32) && defined(_MSC_VER) && (_MSC_VER>=1500)	// vs2008
-			FOPEN(&pDumpRecFile, kpFileName, "wb");
-#else
-			pDumpRecFile	= FOPEN( kpFileName, "wb" );
-#endif//WIN32
-		}
-		else
-		{
-#if defined(WIN32) && defined(_MSC_VER) && (_MSC_VER>=1500)	// vs2008
-			FOPEN(&pDumpRecFile, "rec.yuv", "wb");
-#else
-			pDumpRecFile	= FOPEN( "rec.yuv", "wb");
-#endif//WIN32..
-		}
-		bRecFlag	= true;
-	}
-
-	if ( NULL != pDumpRecFile )
-	{
-		int32_t i = 0;
-		int32_t j = 0;
-		const int32_t kiStrideY	= pCurPicture->iLineSize[0];		
-		const int32_t kiLumaWidth	= pCurPicture->iWidthInPixel;
-		const int32_t kiLumaHeight	= pCurPicture->iHeightInPixel;
-		const int32_t kiChromaWidth	= kiLumaWidth >> 1;
-		const int32_t kiChromaHeight	= kiLumaHeight >> 1;		
-		
-		for( j = 0; j < kiLumaHeight; ++ j)
-		{
-			iWrittenSize = fwrite( &pCurPicture->pData[0][j*kiStrideY], 1, kiLumaWidth, pDumpRecFile );
-			assert( iWrittenSize == kiLumaWidth );
-			if ( iWrittenSize < kiLumaWidth )
-			{
-				assert( 0 );	// make no sense for us if writing failed
-				fclose(pDumpRecFile);
-				return;
-			}
-		}
-		for( i = 1; i < I420_PLANES; ++ i)
-		{
-			const int32_t kiStrideUV = pCurPicture->iLineSize[i];			
-			for ( j = 0; j < kiChromaHeight; ++ j)
-			{
-				iWrittenSize = fwrite( &pCurPicture->pData[i][j*kiStrideUV], 1, kiChromaWidth, pDumpRecFile );
-				assert(iWrittenSize == kiChromaWidth );
-				if ( iWrittenSize < kiChromaWidth )
-				{
-					assert( 0 );	// make no sense for us if writing failed
-					fclose(pDumpRecFile);
-					return;
-				}
-			}
-		}
-		fclose(pDumpRecFile);
-		pDumpRecFile = NULL;
-	}
-}
-
-
-
-/***********************************************************************************/
-void WelsSetMemZero_c(void *pDst, int32_t iSize)	// confirmed_safe_unsafe_usage
-{
-	memset(pDst, 0, iSize);
-}
-}
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	encoder.cpp
+ *
+ * \brief	core encoder
+ *
+ * \date	5/14/2009 Created
+ *
+ *************************************************************************************
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include "encoder.h"
+#include "extern.h"
+#include "cpu.h"
+#include "cpu_core.h"
+#include "utils.h"
+
+#include "encode_mb_aux.h"
+#include "decode_mb_aux.h"
+#include "get_intra_predictor.h"
+#include "svc_encode_mb.h"
+
+#include "deblocking.h"
+#include "expand_pic.h"
+
+#include "mc.h"
+#include "sample.h"
+
+#include "svc_encode_slice.h"
+#include "svc_base_layer_md.h"
+#include "svc_mode_decision.h"
+#include "set_mb_syn_cavlc.h"
+#include "crt_util_safe_x.h"	// Safe CRT routines like utils for cross_platforms
+#include "codec_def.h"
+#ifdef MT_ENABLED
+#include "slice_multi_threading.h"
+#endif//MT_ENABLED
+
+//  global   function  pointers  definition
+namespace WelsSVCEnc {
+/* Motion compensation */
+
+
+/*!
+ * \brief	initialize a source picture descriptor: format, size and strides are set, plane pointers are NULL
+ * \param	kpSrc		SSourcePicture* to fill in (passed as const void*)
+ * \param	kiColorspace	colour format, optionally OR-ed with videoFormatVFlip
+ * \param	kiWidth	width of picture in pixels, must be > 0
+ * \param	kiHeight	height of picture in pixels, must be > 0
+ * \return	0 - successful; 1 - NULL picture or non-positive size; 2 - unsupported colour format
+ */
+int32_t InitPic (const void* kpSrc, const int32_t kiColorspace, const int32_t kiWidth, const int32_t kiHeight) {
+  SSourcePicture* pSrcPic = (SSourcePicture*)kpSrc;  // const is cast away: the picture is written below
+
+  if (NULL == pSrcPic || kiWidth <= 0 || kiHeight <= 0)  // negative sizes would yield negative strides
+    return 1;
+
+  pSrcPic->iColorFormat	= kiColorspace;
+  pSrcPic->iPicWidth		= kiWidth;
+  pSrcPic->iPicHeight		= kiHeight;
+  // note: the three fields above stay set even when the format is rejected with 2 below
+  switch (kiColorspace & (~videoFormatVFlip)) {  // dispatch on the format with the flip bit masked out
+  case videoFormatI420:
+  case videoFormatYV12:
+    pSrcPic->pData[0]	= NULL;
+    pSrcPic->pData[1]	= NULL;
+    pSrcPic->pData[2]	= NULL;
+    pSrcPic->pData[3]	= NULL;
+    pSrcPic->iStride[0]	= kiWidth;
+    pSrcPic->iStride[2]	= pSrcPic->iStride[1] = kiWidth >> 1;  // both chroma strides are half the luma width
+    pSrcPic->iStride[3]	= 0;
+    break;
+  case videoFormatYUY2:
+  case videoFormatYVYU:
+  case videoFormatUYVY:
+    pSrcPic->pData[0]	= NULL;
+    pSrcPic->pData[1]	= NULL;
+    pSrcPic->pData[2]	= NULL;
+    pSrcPic->pData[3]	= NULL;
+    pSrcPic->iStride[0]	= CALC_BI_STRIDE (kiWidth,  16);  // single packed plane; 16 is presumably bits per pixel - confirm against the macro
+    pSrcPic->iStride[3]	= pSrcPic->iStride[2] = pSrcPic->iStride[1] = 0;
+    break;
+  case videoFormatRGB:
+  case videoFormatBGR:
+    pSrcPic->pData[0]	= NULL;
+    pSrcPic->pData[1]	= NULL;
+    pSrcPic->pData[2]	= NULL;
+    pSrcPic->pData[3]	= NULL;
+    pSrcPic->iStride[0]	= CALC_BI_STRIDE (kiWidth, 24);  // single packed plane; 24 is presumably bits per pixel - confirm against the macro
+    pSrcPic->iStride[3]	= pSrcPic->iStride[2] = pSrcPic->iStride[1] = 0;
+    if (kiColorspace & videoFormatVFlip)  // the stored format gets the vertical-flip bit toggled for RGB/BGR input
+      pSrcPic->iColorFormat = kiColorspace & (~videoFormatVFlip);
+    else
+      pSrcPic->iColorFormat = kiColorspace | videoFormatVFlip;
+    break;
+  case videoFormatBGRA:
+  case videoFormatRGBA:
+  case videoFormatARGB:
+  case videoFormatABGR:
+    pSrcPic->pData[0]	= NULL;
+    pSrcPic->pData[1]	= NULL;
+    pSrcPic->pData[2]	= NULL;
+    pSrcPic->pData[3]	= NULL;
+    pSrcPic->iStride[0]	= kiWidth << 2;  // four bytes per pixel
+    pSrcPic->iStride[3]	= pSrcPic->iStride[2] = pSrcPic->iStride[1] = 0;
+    if (kiColorspace & videoFormatVFlip)  // same vertical-flip toggle as for RGB/BGR
+      pSrcPic->iColorFormat = kiColorspace & (~videoFormatVFlip);
+    else
+      pSrcPic->iColorFormat = kiColorspace | videoFormatVFlip;
+    break;
+  default:
+    return 2;	// unsupported colour format
+  }
+
+  return 0;
+}
+
+
+void WelsInitBGDFunc (SWelsFuncPtrList* pFuncList, const bool_t kbEnableBackgroundDetection) {
+  // Install the real background-detection handlers when enabled, otherwise the ...False / ...NULL variants.
+  pFuncList->pfInterMdBackgroundDecision = kbEnableBackgroundDetection
+      ? (PInterMdBackgroundDecisionFunc)WelsMdInterJudgeBGDPskip
+      : (PInterMdBackgroundDecisionFunc)WelsMdInterJudgeBGDPskipFalse;
+  pFuncList->pfInterMdBackgroundInfoUpdate = kbEnableBackgroundDetection
+      ? (PInterMdBackgroundInfoUpdateFunc)WelsMdInterUpdateBGDInfo
+      : (PInterMdBackgroundInfoUpdateFunc)WelsMdInterUpdateBGDInfoNULL;
+}
+
+/*! \brief	fill the encoder function-pointer table, choosing C or x86 SIMD routines from the CPU flags
+ * \param	pFuncList	function pointer table to initialize
+ * \param	pParam		coding parameters; only bEnableBackgroundDetection is read here
+ * \param	uiCpuFlag	CPU feature bits (tested here against WELS_CPU_MMXEXT and WELS_CPU_SSE2)
+ * \return	always 0 in the current implementation (iReturn is never modified) */
+int32_t InitFunctionPointers (SWelsFuncPtrList* pFuncList, SWelsSvcCodingParam* pParam, uint32_t uiCpuFlag) {
+  int32_t iReturn = 0;
+
+  /* Memory-zeroing helpers: start from the C fallback, then override with SIMD versions when the CPU flag is set */
+  pFuncList->pfSetMemZeroSize8	= WelsSetMemZero_c;		// confirmed_safe_unsafe_usage
+  pFuncList->pfSetMemZeroSize64Aligned16	= WelsSetMemZero_c;	// confirmed_safe_unsafe_usage
+  pFuncList->pfSetMemZeroSize64	= WelsSetMemZero_c;	// confirmed_safe_unsafe_usage
+#if defined(X86_ASM)
+  if (uiCpuFlag & WELS_CPU_MMXEXT) {
+    pFuncList->pfSetMemZeroSize8	= WelsSetMemZeroSize8_mmx;		// confirmed_safe_unsafe_usage
+    pFuncList->pfSetMemZeroSize64Aligned16	= WelsSetMemZeroSize64_mmx;	// confirmed_safe_unsafe_usage
+    pFuncList->pfSetMemZeroSize64	= WelsSetMemZeroSize64_mmx;	// confirmed_safe_unsafe_usage
+  }
+  if (uiCpuFlag & WELS_CPU_SSE2) {
+    pFuncList->pfSetMemZeroSize64Aligned16	= WelsSetMemZeroAligned64_sse2;	// confirmed_safe_unsafe_usage; replaces the MMX choice made above
+  }
+#endif//X86_ASM
+
+  InitExpandPictureFunc (pFuncList, uiCpuFlag);
+
+  /* Intra prediction functions */
+  WelsInitFillingPredFuncs (uiCpuFlag);
+  WelsInitIntraPredFuncs (pFuncList, uiCpuFlag);
+
+  /* sad, satd, average */
+  WelsInitSampleSadFunc (pFuncList, uiCpuFlag);
+
+  // background-detection hooks, selected by the coding parameter rather than by CPU flags
+  WelsInitBGDFunc (pFuncList, pParam->bEnableBackgroundDetection);
+  // pfGetVarianceFromIntraVaa function pointer is chosen according to CPU features, 6/7/2010
+  InitIntraAnalysisVaaInfo (pFuncList, uiCpuFlag);
+
+  /* Motion compensation */
+  /* init pixel average function */
+  /* get one column or row of pixels during refinement */
+  WelsInitMcFuncs (pFuncList, uiCpuFlag);
+  InitCoeffFunc (uiCpuFlag);
+
+  WelsInitEncodingFuncs (pFuncList, uiCpuFlag);
+  WelsInitReconstructionFuncs (pFuncList, uiCpuFlag);
+
+  DeblockingInit (&pFuncList->pfDeblocking, uiCpuFlag);
+  WelsBlockFuncInit (&pFuncList->pfSetNZCZero, uiCpuFlag);
+
+  InitFillNeighborCacheInterFunc (pFuncList, pParam->bEnableBackgroundDetection);
+
+  return iReturn;
+}
+
+/*!
+ * \brief	reset bitstream output state and update POC, frame_num, NAL type/priority and slice type for a P, IDR or I frame
+ */
+void InitFrameCoding (sWelsEncCtx* pEncCtx, const EFrameType keFrameType) {
+  // for bitstream writing
+  pEncCtx->iPosBsBuffer		= 0;	// reset bs pBuffer position
+  pEncCtx->pOut->iNalIndex		= 0;	// reset NAL index
+
+  InitBits (&pEncCtx->pOut->sBsWrite, pEncCtx->pOut->pBsBuffer, pEncCtx->pOut->uiSize);
+
+  if (keFrameType == WELS_FRAME_TYPE_P) {
+    if (pEncCtx->pSvcParam->uiIntraPeriod) {	// iFrameIndex is only advanced when an intra period is configured
+      ++pEncCtx->iFrameIndex;
+    }
+
+    ++pEncCtx->uiFrameIdxRc;
+
+    if (pEncCtx->iPOC < (1 << pEncCtx->pSps->iLog2MaxPocLsb) - 2)     // valid for POC type 0 only; other POC types need different handling
+      pEncCtx->iPOC			+= 2;	// for POC type 0
+    else
+      pEncCtx->iPOC = 0;	// wrap around instead of exceeding the POC LSB range
+
+    if (pEncCtx->eLastNalPriority != 0) {	// frame_num is left unchanged when the last NAL priority was 0
+      if (pEncCtx->iFrameNum < (1 << pEncCtx->pSps->uiLog2MaxFrameNum) - 1)
+        ++ pEncCtx->iFrameNum;
+      else
+        pEncCtx->iFrameNum	= 0;	// wrap iFrameNum on overflow
+    }
+    pEncCtx->eNalType		= NAL_UNIT_CODED_SLICE;
+    pEncCtx->eSliceType	= P_SLICE;
+    pEncCtx->eNalPriority	= NRI_PRI_HIGH;
+  } else if (keFrameType == WELS_FRAME_TYPE_IDR) {
+    pEncCtx->iFrameNum		= 0;
+    pEncCtx->iPOC			= 0;
+    pEncCtx->bEncCurFrmAsIdrFlag = false;	// the pending IDR request is consumed here
+    if (pEncCtx->pSvcParam->uiIntraPeriod) {
+      pEncCtx->iFrameIndex = 0;
+    }
+    pEncCtx->uiFrameIdxRc = 0;
+
+    pEncCtx->eNalType		= NAL_UNIT_CODED_SLICE_IDR;
+    pEncCtx->eSliceType	= I_SLICE;
+    pEncCtx->eNalPriority	= NRI_PRI_HIGHEST;
+
+    pEncCtx->iCodingIndex	= 0;
+
+    // reset_ref_list
+
+    // rc_init_gop
+  } else if (keFrameType == WELS_FRAME_TYPE_I) {	// non-IDR intra frame: counters advance as for P, iFrameIndex/uiFrameIdxRc untouched
+    if (pEncCtx->iPOC < (1 << pEncCtx->pSps->iLog2MaxPocLsb) - 2)     // valid for POC type 0 only; other POC types need different handling
+      pEncCtx->iPOC			+= 2;	// for POC type 0
+    else
+      pEncCtx->iPOC = 0;
+
+    if (pEncCtx->eLastNalPriority != 0) {
+      if (pEncCtx->iFrameNum < (1 << pEncCtx->pSps->uiLog2MaxFrameNum) - 1)
+        ++ pEncCtx->iFrameNum;
+      else
+        pEncCtx->iFrameNum	= 0;	// wrap iFrameNum on overflow
+    }
+
+    pEncCtx->eNalType		= NAL_UNIT_CODED_SLICE;
+    pEncCtx->eSliceType	= I_SLICE;
+    pEncCtx->eNalPriority	= NRI_PRI_HIGHEST;
+
+    // rc_init_gop
+  } else {	// B pictures are not supported; any other type only trips the assert (no effect when NDEBUG is defined)
+    assert (0);
+  }
+
+#if defined(STAT_OUTPUT)
+  memset (&pEncCtx->sPerInfo, 0, sizeof (SStatSliceInfo));
+#endif//STAT_OUTPUT
+
+#if defined(MT_ENABLED) && defined(PACKING_ONE_SLICE_PER_LAYER)
+  if (pEncCtx->pSvcParam->iMultipleThreadIdc > 1)
+    reset_env_mt (pEncCtx);
+#endif
+}
+
+EFrameType DecideFrameType (sWelsEncCtx* pEncCtx, const int8_t kiSpatialNum) {
+  // Decide how the next frame is coded: IDR, P, or SKIP (a would-be P frame dropped via iSkipFrameFlag).
+  SWelsSvcCodingParam* pCfg = pEncCtx->pSvcParam;
+  bool_t bUseSceneChange = false;
+
+  // The analysed scene-change flag is only honoured when detection is enabled, no periodic IDR is
+  // pending, kiSpatialNum is not below iNumDependencyLayer, and uiFrameIdxRc has reached
+  // 2 * VGOP_SIZE (avoid too frequent I frame coding, rc control).
+  if (pCfg->bEnableSceneChangeDetect && !pEncCtx->pVaa->bIdrPeriodFlag
+      && ! (kiSpatialNum < pCfg->iNumDependencyLayer)
+      && ! (pEncCtx->uiFrameIdxRc < (VGOP_SIZE << 1))) {
+    bUseSceneChange = pEncCtx->pVaa->bSceneChangeFlag;
+  }
+
+  // scene change: RC enable && iSpatialNum == pSvcParam->iNumDependencyLayer
+  // bIdrPeriodFlag: RC disable || iSpatialNum != pSvcParam->iNumDependencyLayer
+  // bEncCurFrmAsIdrFlag: 1. first frame should be IDR; 2. idr pause; 3. idr request
+  if (pEncCtx->pVaa->bIdrPeriodFlag || bUseSceneChange || pEncCtx->bEncCurFrmAsIdrFlag) {
+    pEncCtx->iCodingIndex = 0;
+    return WELS_FRAME_TYPE_IDR;
+  }
+
+  if (pEncCtx->iSkipFrameFlag > 0) {  // for frame skip, 1/5/2010
+    -- pEncCtx->iSkipFrameFlag;
+    return WELS_FRAME_TYPE_SKIP;
+  }
+
+  return WELS_FRAME_TYPE_P;
+}
+
+/*!
+ * \brief	Dump the reconstruction of dependency layer kiDid (plane 0, then half-size planes 1..I420_PLANES-1) to kpFileName, or to "rec<kiDid>.yuv" when kpFileName is empty
+ */
+
+extern "C" void DumpDependencyRec (SPicture* pCurPicture, const str_t* kpFileName, const int8_t kiDid) {
+  FILE* pDumpRecFile											= NULL;
+  static bool_t bDependencyRecFlag[MAX_DEPENDENCY_LAYER]	= {0};	// per layer: set once the first dump has been attempted
+  int32_t iWrittenSize											= 0;
+  // kiDid is signed: negative ids are rejected too, they would index before bDependencyRecFlag[]
+  if (NULL == pCurPicture || NULL == kpFileName || kiDid < 0 || kiDid >= MAX_DEPENDENCY_LAYER)
+    return;
+
+  if (bDependencyRecFlag[kiDid]) {	// later calls for this layer: append ("ab")
+    if (STRNLEN (kpFileName, MAX_FNAME_LEN) > 0)	// confirmed_safe_unsafe_usage
+#if defined(__GNUC__) || (defined(WIN32) && defined(_MSC_VER) && (_MSC_VER<1500))
+      pDumpRecFile	= FOPEN (kpFileName, "ab");
+#elif defined(WIN32) && defined(_MSC_VER) && (_MSC_VER>=1500)	// vs2008
+      FOPEN (&pDumpRecFile, kpFileName, "ab");
+#endif//__GNUC__..
+    else {
+      str_t sDependencyRecFileName[16] = {0};
+#if defined(WIN32) && defined(_MSC_VER) && (_MSC_VER>=1500)	// vs2008
+      SNPRINTF (sDependencyRecFileName, 16, 16, "rec%d.yuv", kiDid);	// confirmed_safe_unsafe_usage
+      FOPEN (&pDumpRecFile, sDependencyRecFileName, "ab");
+#else
+      SNPRINTF (sDependencyRecFileName, 16, "rec%d.yuv", kiDid);	// confirmed_safe_unsafe_usage
+      pDumpRecFile	= FOPEN (sDependencyRecFileName, "ab");
+#endif//WIN32..
+    }
+    if (NULL != pDumpRecFile)
+      fseek (pDumpRecFile, 0, SEEK_END);
+  } else {	// first call for this layer: create/truncate ("wb")
+    if (STRNLEN (kpFileName, MAX_FNAME_LEN) > 0) {	// confirmed_safe_unsafe_usage
+#if defined(WIN32) && defined(_MSC_VER) && (_MSC_VER>=1500)	// vs2008
+      FOPEN (&pDumpRecFile, kpFileName, "wb");
+#else
+      pDumpRecFile	= FOPEN (kpFileName, "wb");
+#endif//WIN32..
+    } else {
+      str_t sDependencyRecFileName[16] = {0};
+#if defined(WIN32) && defined(_MSC_VER) && (_MSC_VER>=1500)	// vs2008
+      SNPRINTF (sDependencyRecFileName, 16, 16, "rec%d.yuv", kiDid);	// confirmed_safe_unsafe_usage
+      FOPEN (&pDumpRecFile, sDependencyRecFileName, "wb");
+#else
+      SNPRINTF (sDependencyRecFileName, 16, "rec%d.yuv", kiDid);	// confirmed_safe_unsafe_usage
+      pDumpRecFile	= FOPEN (sDependencyRecFileName, "wb");
+#endif//WIN32..
+    }
+    bDependencyRecFlag[kiDid]	= true;	// set even if the open failed, so the next call appends
+  }
+
+  if (NULL != pDumpRecFile) {
+    int32_t i = 0;
+    int32_t j = 0;
+    const int32_t kiStrideY	= pCurPicture->iLineSize[0];
+    const int32_t kiLumaWidth	= pCurPicture->iWidthInPixel;
+    const int32_t kiLumaHeight	= pCurPicture->iHeightInPixel;
+    const int32_t kiChromaWidth	= kiLumaWidth >> 1;
+    const int32_t kiChromaHeight	= kiLumaHeight >> 1;
+    // rows are written one at a time so that only the visible width of each stride is dumped
+    for (j = 0; j < kiLumaHeight; ++ j) {
+      iWrittenSize = fwrite (&pCurPicture->pData[0][j * kiStrideY], 1, kiLumaWidth, pDumpRecFile);
+      assert (iWrittenSize == kiLumaWidth);
+      if (iWrittenSize < kiLumaWidth) {	// short write: give up on this dump (reached only when asserts are compiled out)
+        assert (0);	// make no sense for us if writing failed
+        fclose (pDumpRecFile);
+        return;
+      }
+    }
+    for (i = 1; i < I420_PLANES; ++ i) {	// remaining planes, each at half width and half height
+      const int32_t kiStrideUV = pCurPicture->iLineSize[i];
+      for (j = 0; j < kiChromaHeight; ++ j) {
+        iWrittenSize = fwrite (&pCurPicture->pData[i][j * kiStrideUV], 1, kiChromaWidth, pDumpRecFile);
+        assert (iWrittenSize == kiChromaWidth);
+        if (iWrittenSize < kiChromaWidth) {
+          assert (0);	// make no sense for us if writing failed
+          fclose (pDumpRecFile);
+          return;
+        }
+      }
+    }
+    fclose (pDumpRecFile);
+    pDumpRecFile = NULL;
+  }
+}
+
+/*!
+ * \brief	Dump the reconstruction picture (plane 0, then half-size planes 1..I420_PLANES-1) to kpFileName, or to "rec.yuv" when kpFileName is empty
+ */
+
+void DumpRecFrame (SPicture* pCurPicture, const str_t* kpFileName) {
+  FILE* pDumpRecFile				= NULL;
+  static bool_t bRecFlag	= false;	// one flag for all calls: set once the first dump has been attempted, whatever the file name
+  int32_t iWrittenSize			= 0;
+
+  if (NULL == pCurPicture || NULL == kpFileName)
+    return;
+
+  if (bRecFlag) {	// later calls: append ("ab")
+    if (STRNLEN (kpFileName, MAX_FNAME_LEN) > 0) {	// confirmed_safe_unsafe_usage
+#if defined(WIN32) && defined(_MSC_VER) && (_MSC_VER>=1500)	// vs2008
+      FOPEN (&pDumpRecFile, kpFileName, "ab");
+#else
+      pDumpRecFile	= FOPEN (kpFileName, "ab");
+#endif//WIN32
+    } else {
+#if defined(WIN32) && defined(_MSC_VER) && (_MSC_VER>=1500)	// vs2008
+      FOPEN (&pDumpRecFile, "rec.yuv", "ab");
+#else
+      pDumpRecFile	= FOPEN ("rec.yuv", "ab");
+#endif//WIN32
+    }
+    if (NULL != pDumpRecFile)
+      fseek (pDumpRecFile, 0, SEEK_END);
+  } else {	// first call: create/truncate ("wb")
+    if (STRNLEN (kpFileName, MAX_FNAME_LEN) > 0) {	// confirmed_safe_unsafe_usage
+#if defined(WIN32) && defined(_MSC_VER) && (_MSC_VER>=1500)	// vs2008
+      FOPEN (&pDumpRecFile, kpFileName, "wb");
+#else
+      pDumpRecFile	= FOPEN (kpFileName, "wb");
+#endif//WIN32
+    } else {
+#if defined(WIN32) && defined(_MSC_VER) && (_MSC_VER>=1500)	// vs2008
+      FOPEN (&pDumpRecFile, "rec.yuv", "wb");
+#else
+      pDumpRecFile	= FOPEN ("rec.yuv", "wb");
+#endif//WIN32..
+    }
+    bRecFlag	= true;	// set even if the open failed, so the next call appends
+  }
+
+  if (NULL != pDumpRecFile) {
+    int32_t i = 0;
+    int32_t j = 0;
+    const int32_t kiStrideY	= pCurPicture->iLineSize[0];
+    const int32_t kiLumaWidth	= pCurPicture->iWidthInPixel;
+    const int32_t kiLumaHeight	= pCurPicture->iHeightInPixel;
+    const int32_t kiChromaWidth	= kiLumaWidth >> 1;
+    const int32_t kiChromaHeight	= kiLumaHeight >> 1;
+
+    for (j = 0; j < kiLumaHeight; ++ j) {	// row by row: only the visible width of each stride is written
+      iWrittenSize = fwrite (&pCurPicture->pData[0][j * kiStrideY], 1, kiLumaWidth, pDumpRecFile);
+      assert (iWrittenSize == kiLumaWidth);
+      if (iWrittenSize < kiLumaWidth) {	// short write: give up on this dump (reached only when asserts are compiled out)
+        assert (0);	// make no sense for us if writing failed
+        fclose (pDumpRecFile);
+        return;
+      }
+    }
+    for (i = 1; i < I420_PLANES; ++ i) {	// remaining planes, each at half width and half height
+      const int32_t kiStrideUV = pCurPicture->iLineSize[i];
+      for (j = 0; j < kiChromaHeight; ++ j) {
+        iWrittenSize = fwrite (&pCurPicture->pData[i][j * kiStrideUV], 1, kiChromaWidth, pDumpRecFile);
+        assert (iWrittenSize == kiChromaWidth);
+        if (iWrittenSize < kiChromaWidth) {
+          assert (0);	// make no sense for us if writing failed
+          fclose (pDumpRecFile);
+          return;
+        }
+      }
+    }
+    fclose (pDumpRecFile);
+    pDumpRecFile = NULL;
+  }
+}
+
+
+
+/***********************************************************************************/
+void WelsSetMemZero_c (void* pDst, int32_t iSize) {	// confirmed_safe_unsafe_usage; C fallback that zeroes iSize bytes at pDst
+  (void) memset (pDst, 0, static_cast<size_t> (iSize));
+}
+}
--- a/codec/encoder/core/src/encoder_data_tables.cpp
+++ b/codec/encoder/core/src/encoder_data_tables.cpp
@@ -1,475 +1,500 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-// exp_data.c
-// export date cross various modules (.c)
-#include "typedefs.h"
-#include "wels_common_basis.h"
-#include "mb_cache.h"
-#include "utils.h"
-#include "md.h"
-#include "sample.h"
-#include "svc_enc_golomb.h"
-#include "vlc_encoder.h"
-namespace WelsSVCEnc {
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// extern at mb_cache.h
-const uint8_t g_kuiSmb4AddrIn256[16] = 
-{
-	0,		4,		16*4,		16*4+4,
-	8,		12,		16*4+8,		16*4+12,
-	16*8,	16*8+4,	16*12,		16*12+4,
-	16*8+8,  16*8+12,  16*12+8, 16*12+12
-};                       
-
-//////pNonZeroCount[16+8] mapping scan index
-const uint8_t g_kuiMbCountScan4Idx[24] =
-{                     //  0   1 | 4  5      luma 8*8 block           pNonZeroCount[16+8] 
-	0,  1,  4,  5,   //  2   3 | 6  7        0  |  1                  0   1   2   3 
-	2,  3,  6,  7,   //---------------      ---------                 4   5   6   7 
-	8,  9, 12, 13,   //  8   9 | 12 13       2  |  3                  8   9  10  11 
-	10, 11, 14, 15,   // 10  11 | 14 15-----------------------------> 12  13  14  15 
-	16, 17, 20, 21,   //----------------    chroma 8*8 block          16  17  18  19  
-	18, 19, 22, 23   // 16  17 | 20 21        0    1                 20  21  22  23 
-};
-
-const uint8_t g_kuiCache48CountScan4Idx[24] =
-{	// [16 + 2*4]
-	9, 10, 17, 18,	
-	11, 12, 19, 20,	
-	25, 26, 33, 34,	
-	27, 28, 35, 36,	
-	14, 15,			
-	22, 23,			
-	38, 39,			
-	46, 47			
-};	
-
-
-//cache element equal to 30
-const uint8_t g_kuiCache30ScanIdx[16] = //mv or uiRefIndex cache scan index, 4*4 block as basic unit
-{
-	7,  8, 13, 14,
-	9, 10, 15, 16,
-	19, 20, 25, 26,
-	21, 22, 27, 28
-};
-
-const uint8_t g_kuiCache12_8x8RefIdx[4] = //mv or uiRefIndex cache scan index, 4*4 block as basic unit
-{
-	5,6,
-	9, 10
-};
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// extern at mb_cache.h
-
-const str_t *g_sWelsLogTags[] = {
-	"ERR",
-	"WARN",
-	"INFO",
-	"DBUG",
-	"RESV"
-};
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// extern at wels_common_basis.h
-const uint8_t g_kuiChromaQpTable[52]={
-	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,
-	12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,
-	28,29,29,30,31,32,32,33,34,34,35,35,36,36,37,37,
-	37,38,38,38,39,39,39,39
-};
-
-/*
- *	vcl type map for given NAL unit type and corresponding H264 type (0: AVC; 1: SVC).
- */
-const EVclType g_keTypeMap[32][2] =
-{
-	{ NON_VCL,	NON_VCL },	// 0: NAL_UNIT_UNSPEC_0
-	{ VCL,		VCL,	},	// 1: NAL_UNIT_CODED_SLICE
-	{ VCL,		NOT_APP },	// 2: NAL_UNIT_CODED_SLICE_DPA
-	{ VCL,		NOT_APP },	// 3: NAL_UNIT_CODED_SLICE_DPB
-	{ VCL,		NOT_APP },	// 4: NAL_UNIT_CODED_SLICE_DPC
-	{ VCL,		VCL		},	// 5: NAL_UNIT_CODED_SLICE_IDR
-	{ NON_VCL,	NON_VCL },	// 6: NAL_UNIT_SEI
-	{ NON_VCL,	NON_VCL },	// 7: NAL_UNIT_SPS
-	{ NON_VCL,	NON_VCL },	// 8: NAL_UNIT_PPS
-	{ NON_VCL,	NON_VCL },	// 9: NAL_UNIT_AU_DELIMITER
-	{ NON_VCL,	NON_VCL },	// 10: NAL_UNIT_END_OF_SEQ
-	{ NON_VCL,	NON_VCL },	// 11: NAL_UNIT_END_OF_STR
-	{ NON_VCL,	NON_VCL	},	// 12: NAL_UNIT_FILLER_DATA
-	{ NON_VCL,	NON_VCL },	// 13: NAL_UNIT_SPS_EXT
-	{ NON_VCL,	NON_VCL },	// 14: NAL_UNIT_PREFIX, NEED associate succeeded NAL to make a VCL
-	{ NON_VCL,	NON_VCL },	// 15: NAL_UNIT_SUBSET_SPS
-	{ NON_VCL,	NON_VCL },	// 16: NAL_UNIT_RESV_16
-	{ NON_VCL,	NON_VCL },	// 17: NAL_UNIT_RESV_17
-	{ NON_VCL,	NON_VCL },	// 18: NAL_UNIT_RESV_18
-	{ NON_VCL,	NON_VCL },	// 19: NAL_UNIT_AUX_CODED_SLICE
-	{ NON_VCL,	VCL		},	// 20: NAL_UNIT_CODED_SLICE_EXT
-	{ NON_VCL,	NON_VCL },	// 21: NAL_UNIT_RESV_21
-	{ NON_VCL,	NON_VCL },	// 22: NAL_UNIT_RESV_22
-	{ NON_VCL,	NON_VCL },	// 23: NAL_UNIT_RESV_23
-	{ NON_VCL,	NON_VCL },	// 24: NAL_UNIT_UNSPEC_24
-	{ NON_VCL,	NON_VCL },	// 25: NAL_UNIT_UNSPEC_25
-	{ NON_VCL,	NON_VCL },	// 26: NAL_UNIT_UNSPEC_26
-	{ NON_VCL,	NON_VCL	},	// 27: NAL_UNIT_UNSPEC_27
-	{ NON_VCL,	NON_VCL },	// 28: NAL_UNIT_UNSPEC_28
-	{ NON_VCL,	NON_VCL },	// 29: NAL_UNIT_UNSPEC_29
-	{ NON_VCL,	NON_VCL },	// 30: NAL_UNIT_UNSPEC_30
-	{ NON_VCL,	NON_VCL }	// 31: NAL_UNIT_UNSPEC_31
-};
-
-__align16( const uint16_t, g_kuiDequantCoeff[52][8]) = {
-/* 0*/{   10,   13,   10,   13,   13,   16,   13,   16 },	/* 1*/{   11,   14,   11,   14,   14,   18,   14,   18 },
-/* 2*/{   13,   16,   13,   16,   16,   20,   16,   20 },	/* 3*/{   14,   18,   14,   18,   18,   23,   18,   23 },
-/* 4*/{   16,   20,   16,   20,   20,   25,   20,   25 },	/* 5*/{   18,   23,   18,   23,   23,   29,   23,   29 },
-/* 6*/{   20,   26,   20,   26,   26,   32,   26,   32 },	/* 7*/{   22,   28,   22,   28,   28,   36,   28,   36 },
-/* 8*/{   26,   32,   26,   32,   32,   40,   32,   40 },	/* 9*/{   28,   36,   28,   36,   36,   46,   36,   46 },
-/*10*/{   32,   40,   32,   40,   40,   50,   40,   50 },	/*11*/{   36,   46,   36,   46,   46,   58,   46,   58 },
-/*12*/{   40,   52,   40,   52,   52,   64,   52,   64 },	/*13*/{   44,   56,   44,   56,   56,   72,   56,   72 },
-/*14*/{   52,   64,   52,   64,   64,   80,   64,   80 },	/*15*/{   56,   72,   56,   72,   72,   92,   72,   92 },
-/*16*/{   64,   80,   64,   80,   80,  100,   80,  100 },	/*17*/{   72,   92,   72,   92,   92,  116,   92,  116 },
-/*18*/{   80,  104,   80,  104,  104,  128,  104,  128 },	/*19*/{   88,  112,   88,  112,  112,  144,  112,  144 },
-/*20*/{  104,  128,  104,  128,  128,  160,  128,  160 },	/*21*/{  112,  144,  112,  144,  144,  184,  144,  184 },
-/*22*/{  128,  160,  128,  160,  160,  200,  160,  200 },	/*23*/{  144,  184,  144,  184,  184,  232,  184,  232 },
-/*24*/{  160,  208,  160,  208,  208,  256,  208,  256 },	/*25*/{  176,  224,  176,  224,  224,  288,  224,  288 },
-/*26*/{  208,  256,  208,  256,  256,  320,  256,  320 },	/*27*/{  224,  288,  224,  288,  288,  368,  288,  368 },
-/*28*/{  256,  320,  256,  320,  320,  400,  320,  400 },	/*29*/{  288,  368,  288,  368,  368,  464,  368,  464 },
-/*30*/{  320,  416,  320,  416,  416,  512,  416,  512 },	/*31*/{  352,  448,  352,  448,  448,  576,  448,  576 },
-/*32*/{  416,  512,  416,  512,  512,  640,  512,  640 },	/*33*/{  448,  576,  448,  576,  576,  736,  576,  736 },
-/*34*/{  512,  640,  512,  640,  640,  800,  640,  800 },	/*35*/{  576,  736,  576,  736,  736,  928,  736,  928 },
-/*36*/{  640,  832,  640,  832,  832, 1024,  832, 1024 },	/*37*/{  704,  896,  704,  896,  896, 1152,  896, 1152 },
-/*38*/{  832, 1024,  832, 1024, 1024, 1280, 1024, 1280 },	/*39*/{  896, 1152,  896, 1152, 1152, 1472, 1152, 1472 },
-/*40*/{ 1024, 1280, 1024, 1280, 1280, 1600, 1280, 1600 },	/*41*/{ 1152, 1472, 1152, 1472, 1472, 1856, 1472, 1856 },
-/*42*/{ 1280, 1664, 1280, 1664, 1664, 2048, 1664, 2048 },	/*43*/{ 1408, 1792, 1408, 1792, 1792, 2304, 1792, 2304 },
-/*44*/{ 1664, 2048, 1664, 2048, 2048, 2560, 2048, 2560 },	/*45*/{ 1792, 2304, 1792, 2304, 2304, 2944, 2304, 2944 },
-/*46*/{ 2048, 2560, 2048, 2560, 2560, 3200, 2560, 3200 },	/*47*/{ 2304, 2944, 2304, 2944, 2944, 3712, 2944, 3712 },
-/*48*/{ 2560, 3328, 2560, 3328, 3328, 4096, 3328, 4096 },	/*49*/{ 2816, 3584, 2816, 3584, 3584, 4608, 3584, 4608 },
-/*50*/{ 3328, 4096, 3328, 4096, 4096, 5120, 4096, 5120 },	/*51*/{ 3584, 4608, 3584, 4608, 4608, 5888, 4608, 5888 },
-};
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// extern at md.h
-const int32_t g_kiQpCostTable[52] = 
-{
-	1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
-	1, 1, 1, 1,              /*  8-11 */
-	1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
-	3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
-	6, 7, 8, 9,10,11,13,14,  /* 28-35 */
-	16,18,20,23,25,29,32,36,  /* 36-43 */
-	40,45,51,57,64,72,81,91   /* 44-51 */
-};
-const int8_t g_kiMapModeI16x16[7] = 
-{
-	0, 1, 2, 3, 2, 2, 2
-};//{I16_PRED_V, I16_PRED_H, I16_PRED_DC, I16_PRED_P, I16_PRED_DC, I16_PRED_DC, I16_PRED_DC};
-
-const int8_t g_kiMapModeIntraChroma[7] = 
-{
-	0, 1, 2, 3, 0, 0, 0
-};//{C_PRED_DC, C_PRED_H, C_PRED_V, C_PRED_P, C_PRED_DC_L, C_PRED_DC_T, C_PRED_DC_128};
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// extern at svc_enc_golomb.h
-
-const uint32_t g_uiGolombUELength[256] =
-{
-	1,  3,  3,  5,  5,  5,  5,  7,  7,  7,  7,  7,  7,  7,  7,    //14
-	9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, //30
-	11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, //46
-	11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, //62
-	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, //
-	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
-	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
-	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
-	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
-	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
-	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
-	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
-	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
-	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
-	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
-	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
-	17
-};
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// extern at vlc_encoder.h
-
-//g_kuiVlcCoeffToken[nc][total-coeff][trailing-ones][0--value, 1--bit count]
-const uint8_t g_kuiVlcCoeffToken[5][17][4][2] = 
-{
-	{//0<=nc<2
-		{	{ 1,  1}, { 0,  0}, { 0,  0}, { 0,  0} }, //0
-		{	{ 5,  6}, { 1,  2}, { 0,  0}, { 0,  0} },//1
-		{	{ 7,  8}, { 4,  6}, { 1,  3}, { 0,  0} },//2
-		{	{ 7,  9}, { 6,  8}, { 5,  7}, { 3,  5} },//3
-		{	{ 7, 10}, { 6,  9}, { 5,  8}, { 3,  6} },//4
-		{	{ 7, 11}, { 6, 10}, { 5,  9}, { 4,  7} },//5
-		{	{15, 13}, { 6, 11}, { 5, 10}, { 4,  8} },//6
-		{	{11, 13}, {14, 13}, { 5, 11}, { 4,  9} },//7
-		{	{ 8, 13}, {10, 13}, {13, 13}, { 4, 10} },//8
-		{	{15, 14}, {14, 14}, { 9, 13}, { 4, 11} },//9
-		{	{11, 14}, {10, 14}, {13, 14}, {12, 13} },//10
-		{	{15, 15}, {14, 15}, { 9, 14}, {12, 14} },//11
-		{	{11, 15}, {10, 15}, {13, 15}, { 8, 14} },//12
-		{	{15, 16}, { 1, 15}, { 9, 15}, {12, 15} },//13
-		{	{11, 16}, {14, 16}, {13, 16}, { 8, 15} },//14
-		{	{ 7, 16}, {10, 16}, { 9, 16}, {12, 16} },//15
-		{	{ 4, 16}, { 6, 16}, { 5, 16}, { 8, 16} }//16
-	},
-
-	{//2<=nc<4
-		{	{ 3,  2}, { 0,  0}, { 0,  0}, { 0,  0} },//0
-		{	{11,  6}, { 2,  2}, { 0,  0}, { 0,  0} },//1
-		{	{ 7,  6}, { 7,  5}, { 3,  3}, { 0,  0} },//2
-		{	{ 7,  7}, {10,  6}, { 9,  6}, { 5,  4} },//3
-		{	{ 7,  8}, { 6,  6}, { 5,  6}, { 4,  4} },//4
-		{	{ 4,  8}, { 6,  7}, { 5,  7}, { 6,  5} },//5
-		{	{ 7,  9}, { 6,  8}, { 5,  8}, { 8,  6} },//6
-		{	{15, 11}, { 6,  9}, { 5,  9}, { 4,  6} },//7
-		{	{11, 11}, {14, 11}, {13, 11}, { 4,  7} },//8
-		{	{15, 12}, {10, 11}, { 9, 11}, { 4,  9} },//9
-		{	{11, 12}, {14, 12}, {13, 12}, {12, 11} },//10
-		{	{ 8, 12}, {10, 12}, { 9, 12}, { 8, 11} },//11
-		{	{15, 13}, {14, 13}, {13, 13}, {12, 12} },//12
-		{	{11, 13}, {10, 13}, { 9, 13}, {12, 13} },//13
-		{	{ 7, 13}, {11, 14}, { 6, 13}, { 8, 13} },//14
-		{	{ 9, 14}, { 8, 14}, {10, 14}, { 1, 13} },//15
-		{	{ 7, 14}, { 6, 14}, { 5, 14}, { 4, 14} }//16
-	},
-
-	{//4<=nc<8
-		{	{15,  4}, { 0,  0}, { 0,  0}, { 0,  0} },//0
-		{	{15,  6}, {14,  4}, { 0,  0}, { 0,  0} },//1
-		{	{11,  6}, {15,  5}, {13,  4}, { 0,  0} },//2
-		{	{ 8,  6}, {12,  5}, {14,  5}, {12,  4} },//3
-		{	{15,  7}, {10,  5}, {11,  5}, {11,  4} },//4
-		{	{11,  7}, { 8,  5}, { 9,  5}, {10,  4} },//5
-		{	{ 9,  7}, {14,  6}, {13,  6}, { 9,  4} },//6
-		{	{ 8,  7}, {10,  6}, { 9,  6}, { 8,  4} },//7 
-		{	{15,  8}, {14,  7}, {13,  7}, {13,  5} },//8
-		{	{11,  8}, {14,  8}, {10,  7}, {12,  6} },//9
-		{	{15,  9}, {10,  8}, {13,  8}, {12,  7} },//10
-		{	{11,  9}, {14,  9}, { 9,  8}, {12,  8} },//11
-		{	{ 8,  9}, {10,  9}, {13,  9}, { 8,  8} },//12
-		{	{13, 10}, { 7,  9}, { 9,  9}, {12,  9} },//13
-		{	{ 9, 10}, {12, 10}, {11, 10}, {10, 10} },//14
-		{	{ 5, 10}, { 8, 10}, { 7, 10}, { 6, 10} },//15
-		{	{ 1, 10}, { 4, 10}, { 3, 10}, { 2, 10} }//16
-	},
-
-	{//8<=nc
-		{	{ 3,  6}, { 0,  0}, { 0,  0}, { 0,  0} },//0
-		{	{ 0,  6}, { 1,  6}, { 0,  0}, { 0,  0} },//1
-		{	{ 4,  6}, { 5,  6}, { 6,  6}, { 0,  0} },//2
-		{	{ 8,  6}, { 9,  6}, {10,  6}, {11,  6} },//3
-		{	{12,  6}, {13,  6}, {14,  6}, {15,  6} },//4
-		{	{16,  6}, {17,  6}, {18,  6}, {19,  6} },//5
-		{	{20,  6}, {21,  6}, {22,  6}, {23,  6} },//6
-		{	{24,  6}, {25,  6}, {26,  6}, {27,  6} },//7
-		{	{28,  6}, {29,  6}, {30,  6}, {31,  6} },//8
-		{	{32,  6}, {33,  6}, {34,  6}, {35,  6} },//9
-		{	{36,  6}, {37,  6}, {38,  6}, {39,  6} },//10
-		{	{40,  6}, {41,  6}, {42,  6}, {43,  6} },//11
-		{	{44,  6}, {45,  6}, {46,  6}, {47,  6} },//12
-		{	{48,  6}, {49,  6}, {50,  6}, {51,  6} },//13
-		{	{52,  6}, {53,  6}, {54,  6}, {55,  6} },//14
-		{	{56,  6}, {57,  6}, {58,  6}, {59,  6} },//15
-		{	{60,  6}, {61,  6}, {62,  6}, {63,  6} }//16
-	},
-
-	{//nc == -1
-		{	{ 1,  2}, { 0,  0}, { 0,  0}, { 0,  0} },//0
-		{	{ 7,  6}, { 1,  1}, { 0,  0}, { 0,  0} },//1
-		{	{ 4,  6}, { 6,  6}, { 1,  3}, { 0,  0} },//2
-		{	{ 3,  6}, { 3,  7}, { 2,  7}, { 5,  6} },//3
-		{	{ 2,  6}, { 3,  8}, { 2,  8}, { 0,  7} },//4
-		{	{ 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//5
-		{	{ 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//6
-		{	{ 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//7
-		{	{ 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//8
-		{	{ 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//9
-		{	{ 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//10
-		{	{ 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//11
-		{	{ 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//12
-		{	{ 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//13
-		{	{ 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//14
-		{	{ 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//15
-		{	{ 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} }//16
-	}
-};
-
-//const uint8_t g_kuiVlcLevelPrefix[15][2] =
-//{
-//	{1, 1}, {1, 2}
-//}; 
-
-//g_kuiVlcTotalZeros[tzVlcIndex][total_zeros][0--value, 1--bit count]
-const uint8_t g_kuiVlcTotalZeros[16][16][2] = 
-{
-	{//0 not available
-		{0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0} 	
-	},
-	{//1
-		{1, 1}, {3, 3}, {2, 3}, {3, 4}, {2, 4}, {3, 5}, {2, 5}, {3, 6}, {2, 6}, {3, 7}, {2, 7}, {3, 8}, {2, 8}, {3, 9}, {2, 9}, {1, 9}
-	},
-	{//2
-		{7, 3}, {6, 3}, {5, 3}, {4, 3}, {3, 3}, {5, 4}, {4, 4}, {3, 4}, {2, 4}, {3, 5}, {2, 5}, {3, 6}, {2, 6}, {1, 6}, {0, 6}, {0, 0}
-	},
-	{//3
-		{5, 4}, {7, 3}, {6, 3}, {5, 3}, {4, 4}, {3, 4}, {4, 3}, {3, 3}, {2, 4}, {3, 5}, {2, 5}, {1, 6}, {1, 5}, {0, 6}, {0, 0}, {0, 0}
-	},
-	{//4
-		{3, 5}, {7, 3}, {5, 4}, {4, 4}, {6, 3}, {5, 3}, {4, 3}, {3, 4}, {3, 3}, {2, 4}, {2, 5}, {1, 5}, {0, 5}, {0, 0}, {0, 0}, {0, 0}
-	},
-	{//5
-		{5, 4}, {4, 4}, {3, 4}, {7, 3}, {6, 3}, {5, 3}, {4, 3}, {3, 3}, {2, 4}, {1, 5}, {1, 4}, {0, 5}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
-	},
-	{//6
-		{1, 6}, {1, 5}, {7, 3}, {6, 3}, {5, 3}, {4, 3}, {3, 3}, {2, 3}, {1, 4}, {1, 3}, {0, 6}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
-	},
-	{//7
-		{1, 6}, {1, 5}, {5, 3}, {4, 3}, {3, 3}, {3, 2}, {2, 3}, {1, 4}, {1, 3}, {0, 6}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
-	},
-	{//8
-		{1, 6}, {1, 4}, {1, 5}, {3, 3}, {3, 2}, {2, 2}, {2, 3}, {1, 3}, {0, 6}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
-	},
-	{//9
-		{1, 6}, {0, 6}, {1, 4}, {3, 2}, {2, 2}, {1, 3}, {1, 2}, {1, 5}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
-	},
-	{//10
-		{1, 5}, {0, 5}, {1, 3}, {3, 2}, {2, 2}, {1, 2}, {1, 4}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
-	},
-	{//11
-		{0, 4}, {1, 4}, {1, 3}, {2, 3}, {1, 1}, {3, 3}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
-	},
-	{//12
-		{0, 4}, {1, 4}, {1, 2}, {1, 1}, {1, 3}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
-	},
-	{//13
-		{0, 3}, {1, 3}, {1, 1}, {1, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
-	},
-	{//14
-		{0, 2}, {1, 2}, {1, 1}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
-	},
-	{//15
-		{0, 1}, {1, 1}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
-	}
-};
-
-const uint8_t g_kuiVlcTotalZerosChromaDc[4][4][2] =
-{
-	{
-		{0, 0}, {0, 0}, {0, 0}, {0, 0}
-	},
-	{
-		{1, 1}, {1, 2}, {1, 3}, {0, 3}
-	},
-	{
-		{1, 1}, {1, 2}, {0, 2}, {0, 0} 
-	},
-	{
-		{1, 1}, {0, 1}, {0, 0}, {0, 0}
-	}
-};
-//
-
-//g_kuiVlcRunBefore[zeros-left][run-before][0--value, 1--bit count]
-const uint8_t g_kuiVlcRunBefore[8][15][2] = 
-{
-	{//0 not available
-		{0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0} 	
-	},
-	{//1
-		{1, 1}, {0, 1}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
-	},
-	{//2
-		{1, 1}, {1, 2}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
-	},
-	{//3
-		{3, 2}, {2, 2}, {1, 2}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
-	},
-	{//4
-		{3, 2}, {2, 2}, {1, 2}, {1, 3}, {0, 3}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
-	},
-	{//5
-		{3, 2}, {2, 2}, {3, 3}, {2, 3}, {1, 3}, {0, 3}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
-	},
-	{//6
-		{3, 2}, {0, 3}, {1, 3}, {3, 3}, {2, 3}, {5, 3}, {4, 3}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
-	},
-	{//>6
-		{7, 3}, {6, 3}, {5, 3}, {4, 3}, {3, 3}, {2, 3}, {1, 3}, {1, 4}, {1, 5}, {1, 6}, {1, 7}, {1, 8}, {1, 9}, {1, 10}, {1, 11}
-	}
-};
-
-const ALIGNED_DECLARE(uint8_t, g_kuiEncNcMapTable[18], 16) =
-{
-	0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4
-};
-
-
-
-const uint8_t   g_kuiTemporalIdListTable[MAX_TEMPORAL_LEVEL][MAX_GOP_SIZE + 1] = 
-{
-	{  0, 0, 0, 0, 0, 0, 0, 0,
-	   0, 0, 0, 0, 0, 0, 0, 0,
-	   0  },  // gop size = 1
-	{  0, 1, 0, 0, 0, 0, 0, 0,
-       0, 0, 0, 0, 0, 0, 0, 0,
-       0  },  // uiGopSize = 2
-	{  0, 2, 1, 2, 0, 0, 0, 0,
-       0, 0, 0, 0, 0, 0, 0, 0,
-       0  },  // uiGopSize = 4
-	{  0, 3, 2, 3, 1, 3, 2, 3,
-       0, 0, 0, 0, 0, 0, 0, 0,
-       0  },  // uiGopSize = 8
-	{  0, 4, 3, 4, 2, 4, 3, 4,
-       1, 4, 3, 4, 2, 4, 3, 4,
-       0  }  //  uiGopSize = 16
-};
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// extern at svc_encode_slice.h
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-}
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+// exp_data.c
+// exported data shared across various modules (.c)
+#include "typedefs.h"
+#include "wels_common_basis.h"
+#include "mb_cache.h"
+#include "utils.h"
+#include "md.h"
+#include "sample.h"
+#include "svc_enc_golomb.h"
+#include "vlc_encoder.h"
+namespace WelsSVCEnc {
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// extern at mb_cache.h
+const uint8_t g_kuiSmb4AddrIn256[16] = {	// every entry is 16 * y + x with x, y in {0, 4, 8, 12}; presumably the top-left offset of each 4x4 block inside a 16x16 (256-sample) block - TODO confirm
+  0,		4,		16 * 4,		16 * 4 + 4,	// x in {0, 4},  y in {0, 4}
+  8,		12,		16 * 4 + 8,		16 * 4 + 12,	// x in {8, 12}, y in {0, 4}
+  16 * 8,	16 * 8 + 4,	16 * 12,		16 * 12 + 4,	// x in {0, 4},  y in {8, 12}
+  16 * 8 + 8,  16 * 8 + 12,  16 * 12 + 8, 16 * 12 + 12	// x in {8, 12}, y in {8, 12}; largest value is 204, so uint8_t is wide enough
+};
+
+//////pNonZeroCount[16+8] mapping scan index
+const uint8_t g_kuiMbCountScan4Idx[24] = {	// a permutation of 0..23 (16 + 8 entries); the comments to the right of the data are an ASCII diagram that starts on the next line
+  //  0   1 | 4  5      luma 8*8 block           pNonZeroCount[16+8]
+  0,  1,  4,  5,   //  2   3 | 6  7        0  |  1                  0   1   2   3
+  2,  3,  6,  7,   //---------------      ---------                 4   5   6   7
+  8,  9, 12, 13,   //  8   9 | 12 13       2  |  3                  8   9  10  11
+  10, 11, 14, 15,   // 10  11 | 14 15-----------------------------> 12  13  14  15
+  16, 17, 20, 21,   //----------------    chroma 8*8 block          16  17  18  19
+  18, 19, 22, 23   // 16  17 | 20 21        0    1                 20  21  22  23
+};
+
+const uint8_t g_kuiCache48CountScan4Idx[24] = {
+  // [16 + 2*4]: 16 entries in four rows of four, then 2 x 4 entries in rows of two; all values are consistent with index = 8 * row + col in a 48-entry cache - TODO confirm layout
+  9, 10, 17, 18,
+  11, 12, 19, 20,
+  25, 26, 33, 34,
+  27, 28, 35, 36,
+  14, 15,
+  22, 23,
+  38, 39,
+  46, 47
+};
+
+
+//cache element equal to 30
+const uint8_t g_kuiCache30ScanIdx[16] = { //mv or uiRefIndex cache scan index, 4*4 block as basic unit; every value equals 6 * row + col with row, col in 1..4
+  7,  8, 13, 14,
+  9, 10, 15, 16,
+  19, 20, 25, 26,
+  21, 22, 27, 28
+};
+
+const uint8_t g_kuiCache12_8x8RefIdx[4] = { //mv or uiRefIndex cache scan index; NOTE(review): the original "4*4 block as basic unit" wording looks copied from g_kuiCache30ScanIdx, the name suggests 8x8 units - confirm
+  5, 6,	// values equal 4 * row + col with row, col in 1..2
+  9, 10
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// extern at mb_cache.h
+
+const str_t* g_sWelsLogTags[] = {	// five short tag strings; the array size is deduced from the initializer; presumably indexed by log level - TODO confirm against the logging code
+  "ERR",
+  "WARN",
+  "INFO",
+  "DBUG",
+  "RESV"
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// extern at wels_common_basis.h
+const uint8_t g_kuiChromaQpTable[52] = {	// identity for indices 0..29, then grows more slowly and stays at 39 for 48..51; presumably a luma-QP -> chroma-QP lookup - TODO confirm
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,	// indices  0-11
+  12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,	// indices 12-27
+  28, 29, 29, 30, 31, 32, 32, 33, 34, 34, 35, 35, 36, 36, 37, 37,	// indices 28-43
+  37, 38, 38, 38, 39, 39, 39, 39	// indices 44-51
+};
+
+/*
+ *	vcl type map for given NAL unit type and corresponding H264 type (0: AVC; 1: SVC).
+ */
+const EVclType g_keTypeMap[32][2] = {	// indexed [NAL unit type 0..31][0: AVC, 1: SVC]
+  { NON_VCL,	NON_VCL },	// 0: NAL_UNIT_UNSPEC_0
+  { VCL,		VCL,	},	// 1: NAL_UNIT_CODED_SLICE
+  { VCL,		NOT_APP },	// 2: NAL_UNIT_CODED_SLICE_DPA
+  { VCL,		NOT_APP },	// 3: NAL_UNIT_CODED_SLICE_DPB
+  { VCL,		NOT_APP },	// 4: NAL_UNIT_CODED_SLICE_DPC
+  { VCL,		VCL		},	// 5: NAL_UNIT_CODED_SLICE_IDR
+  { NON_VCL,	NON_VCL },	// 6: NAL_UNIT_SEI
+  { NON_VCL,	NON_VCL },	// 7: NAL_UNIT_SPS
+  { NON_VCL,	NON_VCL },	// 8: NAL_UNIT_PPS
+  { NON_VCL,	NON_VCL },	// 9: NAL_UNIT_AU_DELIMITER
+  { NON_VCL,	NON_VCL },	// 10: NAL_UNIT_END_OF_SEQ
+  { NON_VCL,	NON_VCL },	// 11: NAL_UNIT_END_OF_STR
+  { NON_VCL,	NON_VCL	},	// 12: NAL_UNIT_FILLER_DATA
+  { NON_VCL,	NON_VCL },	// 13: NAL_UNIT_SPS_EXT
+  { NON_VCL,	NON_VCL },	// 14: NAL_UNIT_PREFIX, needs to be associated with the succeeding NAL to make a VCL
+  { NON_VCL,	NON_VCL },	// 15: NAL_UNIT_SUBSET_SPS
+  { NON_VCL,	NON_VCL },	// 16: NAL_UNIT_RESV_16
+  { NON_VCL,	NON_VCL },	// 17: NAL_UNIT_RESV_17
+  { NON_VCL,	NON_VCL },	// 18: NAL_UNIT_RESV_18
+  { NON_VCL,	NON_VCL },	// 19: NAL_UNIT_AUX_CODED_SLICE
+  { NON_VCL,	VCL		},	// 20: NAL_UNIT_CODED_SLICE_EXT
+  { NON_VCL,	NON_VCL },	// 21: NAL_UNIT_RESV_21
+  { NON_VCL,	NON_VCL },	// 22: NAL_UNIT_RESV_22
+  { NON_VCL,	NON_VCL },	// 23: NAL_UNIT_RESV_23
+  { NON_VCL,	NON_VCL },	// 24: NAL_UNIT_UNSPEC_24
+  { NON_VCL,	NON_VCL },	// 25: NAL_UNIT_UNSPEC_25
+  { NON_VCL,	NON_VCL },	// 26: NAL_UNIT_UNSPEC_26
+  { NON_VCL,	NON_VCL	},	// 27: NAL_UNIT_UNSPEC_27
+  { NON_VCL,	NON_VCL },	// 28: NAL_UNIT_UNSPEC_28
+  { NON_VCL,	NON_VCL },	// 29: NAL_UNIT_UNSPEC_29
+  { NON_VCL,	NON_VCL },	// 30: NAL_UNIT_UNSPEC_30
+  { NON_VCL,	NON_VCL }	// 31: NAL_UNIT_UNSPEC_31
+};
+
+__align16 (const uint16_t, g_kuiDequantCoeff[52][8]) = {	// 52 rows of 8 (two rows per source line, /*n*/ is the row index); each row has the form {a, b, a, b, b, c, b, c} and row q + 6 is exactly twice row q; presumably indexed by QP - TODO confirm
+  /* 0*/{   10,   13,   10,   13,   13,   16,   13,   16 },	/* 1*/{   11,   14,   11,   14,   14,   18,   14,   18 },
+  /* 2*/{   13,   16,   13,   16,   16,   20,   16,   20 },	/* 3*/{   14,   18,   14,   18,   18,   23,   18,   23 },
+  /* 4*/{   16,   20,   16,   20,   20,   25,   20,   25 },	/* 5*/{   18,   23,   18,   23,   23,   29,   23,   29 },
+  /* 6*/{   20,   26,   20,   26,   26,   32,   26,   32 },	/* 7*/{   22,   28,   22,   28,   28,   36,   28,   36 },
+  /* 8*/{   26,   32,   26,   32,   32,   40,   32,   40 },	/* 9*/{   28,   36,   28,   36,   36,   46,   36,   46 },
+  /*10*/{   32,   40,   32,   40,   40,   50,   40,   50 },	/*11*/{   36,   46,   36,   46,   46,   58,   46,   58 },
+  /*12*/{   40,   52,   40,   52,   52,   64,   52,   64 },	/*13*/{   44,   56,   44,   56,   56,   72,   56,   72 },
+  /*14*/{   52,   64,   52,   64,   64,   80,   64,   80 },	/*15*/{   56,   72,   56,   72,   72,   92,   72,   92 },
+  /*16*/{   64,   80,   64,   80,   80,  100,   80,  100 },	/*17*/{   72,   92,   72,   92,   92,  116,   92,  116 },
+  /*18*/{   80,  104,   80,  104,  104,  128,  104,  128 },	/*19*/{   88,  112,   88,  112,  112,  144,  112,  144 },
+  /*20*/{  104,  128,  104,  128,  128,  160,  128,  160 },	/*21*/{  112,  144,  112,  144,  144,  184,  144,  184 },
+  /*22*/{  128,  160,  128,  160,  160,  200,  160,  200 },	/*23*/{  144,  184,  144,  184,  184,  232,  184,  232 },
+  /*24*/{  160,  208,  160,  208,  208,  256,  208,  256 },	/*25*/{  176,  224,  176,  224,  224,  288,  224,  288 },
+  /*26*/{  208,  256,  208,  256,  256,  320,  256,  320 },	/*27*/{  224,  288,  224,  288,  288,  368,  288,  368 },
+  /*28*/{  256,  320,  256,  320,  320,  400,  320,  400 },	/*29*/{  288,  368,  288,  368,  368,  464,  368,  464 },
+  /*30*/{  320,  416,  320,  416,  416,  512,  416,  512 },	/*31*/{  352,  448,  352,  448,  448,  576,  448,  576 },
+  /*32*/{  416,  512,  416,  512,  512,  640,  512,  640 },	/*33*/{  448,  576,  448,  576,  576,  736,  576,  736 },
+  /*34*/{  512,  640,  512,  640,  640,  800,  640,  800 },	/*35*/{  576,  736,  576,  736,  736,  928,  736,  928 },
+  /*36*/{  640,  832,  640,  832,  832, 1024,  832, 1024 },	/*37*/{  704,  896,  704,  896,  896, 1152,  896, 1152 },
+  /*38*/{  832, 1024,  832, 1024, 1024, 1280, 1024, 1280 },	/*39*/{  896, 1152,  896, 1152, 1152, 1472, 1152, 1472 },
+  /*40*/{ 1024, 1280, 1024, 1280, 1280, 1600, 1280, 1600 },	/*41*/{ 1152, 1472, 1152, 1472, 1472, 1856, 1472, 1856 },
+  /*42*/{ 1280, 1664, 1280, 1664, 1664, 2048, 1664, 2048 },	/*43*/{ 1408, 1792, 1408, 1792, 1792, 2304, 1792, 2304 },
+  /*44*/{ 1664, 2048, 1664, 2048, 2048, 2560, 2048, 2560 },	/*45*/{ 1792, 2304, 1792, 2304, 2304, 2944, 2304, 2944 },
+  /*46*/{ 2048, 2560, 2048, 2560, 2560, 3200, 2560, 3200 },	/*47*/{ 2304, 2944, 2304, 2944, 2944, 3712, 2944, 3712 },
+  /*48*/{ 2560, 3328, 2560, 3328, 3328, 4096, 3328, 4096 },	/*49*/{ 2816, 3584, 2816, 3584, 3584, 4608, 3584, 4608 },
+  /*50*/{ 3328, 4096, 3328, 4096, 4096, 5120, 4096, 5120 },	/*51*/{ 3584, 4608, 3584, 4608, 4608, 5888, 4608, 5888 },
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// extern at md.h
+const int32_t g_kiQpCostTable[52] = {	// 52 non-decreasing weights; the trailing comments give the index range of each row; presumably indexed by QP - TODO confirm against md.h users
+  1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
+  1, 1, 1, 1,              /*  8-11 */
+  1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
+  3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
+  6, 7, 8, 9, 10, 11, 13, 14, /* 28-35 */
+  16, 18, 20, 23, 25, 29, 32, 36, /* 36-43 */
+  40, 45, 51, 57, 64, 72, 81, 91 /* 44-51 */
+};
+const int8_t g_kiMapModeI16x16[7] = {	// entries 0..3 map to themselves, entries 4..6 all map to 2; the list after the closing brace names the mode each entry stands for
+  0, 1, 2, 3, 2, 2, 2
+};//{I16_PRED_V, I16_PRED_H, I16_PRED_DC, I16_PRED_P, I16_PRED_DC, I16_PRED_DC, I16_PRED_DC};
+
+const int8_t g_kiMapModeIntraChroma[7] = {	// entries 0..3 map to themselves, entries 4..6 all map to 0; the list after the closing brace names the input mode at each index
+  0, 1, 2, 3, 0, 0, 0
+};//{C_PRED_DC, C_PRED_H, C_PRED_V, C_PRED_P, C_PRED_DC_L, C_PRED_DC_T, C_PRED_DC_128};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// extern at svc_enc_golomb.h
+
+const uint32_t g_uiGolombUELength[256] = {	// entry i equals 2 * floor(log2(i + 1)) + 1 for i = 0..255; a trailing //N comment gives the index of the last entry on that row
+  1,  3,  3,  5,  5,  5,  5,  7,  7,  7,  7,  7,  7,  7,  7,    //14
+  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, //30
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, //46
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, //62
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, //78
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  17
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// extern at vlc_encoder.h
+
+// g_kuiVlcCoeffToken[nc range][total-coeff][trailing-ones][0--value, 1--bit count]; a {0, 0} pair has bit count 0, i.e. no code is stored there
+const uint8_t g_kuiVlcCoeffToken[5][17][4][2] = {
+  {
+    //0<=nc<2
+    { { 1,  1}, { 0,  0}, { 0,  0}, { 0,  0} }, //0
+    { { 5,  6}, { 1,  2}, { 0,  0}, { 0,  0} },//1
+    { { 7,  8}, { 4,  6}, { 1,  3}, { 0,  0} },//2
+    { { 7,  9}, { 6,  8}, { 5,  7}, { 3,  5} },//3
+    { { 7, 10}, { 6,  9}, { 5,  8}, { 3,  6} },//4
+    { { 7, 11}, { 6, 10}, { 5,  9}, { 4,  7} },//5
+    { {15, 13}, { 6, 11}, { 5, 10}, { 4,  8} },//6
+    { {11, 13}, {14, 13}, { 5, 11}, { 4,  9} },//7
+    { { 8, 13}, {10, 13}, {13, 13}, { 4, 10} },//8
+    { {15, 14}, {14, 14}, { 9, 13}, { 4, 11} },//9
+    { {11, 14}, {10, 14}, {13, 14}, {12, 13} },//10
+    { {15, 15}, {14, 15}, { 9, 14}, {12, 14} },//11
+    { {11, 15}, {10, 15}, {13, 15}, { 8, 14} },//12
+    { {15, 16}, { 1, 15}, { 9, 15}, {12, 15} },//13
+    { {11, 16}, {14, 16}, {13, 16}, { 8, 15} },//14
+    { { 7, 16}, {10, 16}, { 9, 16}, {12, 16} },//15
+    { { 4, 16}, { 6, 16}, { 5, 16}, { 8, 16} }//16
+  },
+
+  {
+    //2<=nc<4
+    { { 3,  2}, { 0,  0}, { 0,  0}, { 0,  0} },//0
+    { {11,  6}, { 2,  2}, { 0,  0}, { 0,  0} },//1
+    { { 7,  6}, { 7,  5}, { 3,  3}, { 0,  0} },//2
+    { { 7,  7}, {10,  6}, { 9,  6}, { 5,  4} },//3
+    { { 7,  8}, { 6,  6}, { 5,  6}, { 4,  4} },//4
+    { { 4,  8}, { 6,  7}, { 5,  7}, { 6,  5} },//5
+    { { 7,  9}, { 6,  8}, { 5,  8}, { 8,  6} },//6
+    { {15, 11}, { 6,  9}, { 5,  9}, { 4,  6} },//7
+    { {11, 11}, {14, 11}, {13, 11}, { 4,  7} },//8
+    { {15, 12}, {10, 11}, { 9, 11}, { 4,  9} },//9
+    { {11, 12}, {14, 12}, {13, 12}, {12, 11} },//10
+    { { 8, 12}, {10, 12}, { 9, 12}, { 8, 11} },//11
+    { {15, 13}, {14, 13}, {13, 13}, {12, 12} },//12
+    { {11, 13}, {10, 13}, { 9, 13}, {12, 13} },//13
+    { { 7, 13}, {11, 14}, { 6, 13}, { 8, 13} },//14
+    { { 9, 14}, { 8, 14}, {10, 14}, { 1, 13} },//15
+    { { 7, 14}, { 6, 14}, { 5, 14}, { 4, 14} }//16
+  },
+
+  {
+    //4<=nc<8
+    { {15,  4}, { 0,  0}, { 0,  0}, { 0,  0} },//0
+    { {15,  6}, {14,  4}, { 0,  0}, { 0,  0} },//1
+    { {11,  6}, {15,  5}, {13,  4}, { 0,  0} },//2
+    { { 8,  6}, {12,  5}, {14,  5}, {12,  4} },//3
+    { {15,  7}, {10,  5}, {11,  5}, {11,  4} },//4
+    { {11,  7}, { 8,  5}, { 9,  5}, {10,  4} },//5
+    { { 9,  7}, {14,  6}, {13,  6}, { 9,  4} },//6
+    { { 8,  7}, {10,  6}, { 9,  6}, { 8,  4} },//7
+    { {15,  8}, {14,  7}, {13,  7}, {13,  5} },//8
+    { {11,  8}, {14,  8}, {10,  7}, {12,  6} },//9
+    { {15,  9}, {10,  8}, {13,  8}, {12,  7} },//10
+    { {11,  9}, {14,  9}, { 9,  8}, {12,  8} },//11
+    { { 8,  9}, {10,  9}, {13,  9}, { 8,  8} },//12
+    { {13, 10}, { 7,  9}, { 9,  9}, {12,  9} },//13
+    { { 9, 10}, {12, 10}, {11, 10}, {10, 10} },//14
+    { { 5, 10}, { 8, 10}, { 7, 10}, { 6, 10} },//15
+    { { 1, 10}, { 4, 10}, { 3, 10}, { 2, 10} }//16
+  },
+
+  {
+    //8<=nc
+    { { 3,  6}, { 0,  0}, { 0,  0}, { 0,  0} },//0
+    { { 0,  6}, { 1,  6}, { 0,  0}, { 0,  0} },//1
+    { { 4,  6}, { 5,  6}, { 6,  6}, { 0,  0} },//2
+    { { 8,  6}, { 9,  6}, {10,  6}, {11,  6} },//3
+    { {12,  6}, {13,  6}, {14,  6}, {15,  6} },//4
+    { {16,  6}, {17,  6}, {18,  6}, {19,  6} },//5
+    { {20,  6}, {21,  6}, {22,  6}, {23,  6} },//6
+    { {24,  6}, {25,  6}, {26,  6}, {27,  6} },//7
+    { {28,  6}, {29,  6}, {30,  6}, {31,  6} },//8
+    { {32,  6}, {33,  6}, {34,  6}, {35,  6} },//9
+    { {36,  6}, {37,  6}, {38,  6}, {39,  6} },//10
+    { {40,  6}, {41,  6}, {42,  6}, {43,  6} },//11
+    { {44,  6}, {45,  6}, {46,  6}, {47,  6} },//12
+    { {48,  6}, {49,  6}, {50,  6}, {51,  6} },//13
+    { {52,  6}, {53,  6}, {54,  6}, {55,  6} },//14
+    { {56,  6}, {57,  6}, {58,  6}, {59,  6} },//15
+    { {60,  6}, {61,  6}, {62,  6}, {63,  6} }//16
+  },
+
+  {
+    //nc == -1
+    { { 1,  2}, { 0,  0}, { 0,  0}, { 0,  0} },//0
+    { { 7,  6}, { 1,  1}, { 0,  0}, { 0,  0} },//1
+    { { 4,  6}, { 6,  6}, { 1,  3}, { 0,  0} },//2
+    { { 3,  6}, { 3,  7}, { 2,  7}, { 5,  6} },//3
+    { { 2,  6}, { 3,  8}, { 2,  8}, { 0,  7} },//4
+    { { 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//5
+    { { 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//6
+    { { 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//7
+    { { 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//8
+    { { 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//9
+    { { 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//10
+    { { 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//11
+    { { 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//12
+    { { 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//13
+    { { 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//14
+    { { 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//15
+    { { 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} }//16
+  }
+};
+
+//const uint8_t g_kuiVlcLevelPrefix[15][2] =
+//{
+//	{1, 1}, {1, 2}
+//};
+
+// g_kuiVlcTotalZeros[tzVlcIndex][total_zeros][0--value, 1--bit count]; row k stores 17-k codes, the remaining {0, 0} pairs have bit count 0
+const uint8_t g_kuiVlcTotalZeros[16][16][2] = {
+  {
+    // tzVlcIndex 0: not available (every pair is {0, 0})
+    {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+  },
+  {
+    // tzVlcIndex 1: total_zeros 0..15
+    {1, 1}, {3, 3}, {2, 3}, {3, 4}, {2, 4}, {3, 5}, {2, 5}, {3, 6}, {2, 6}, {3, 7}, {2, 7}, {3, 8}, {2, 8}, {3, 9}, {2, 9}, {1, 9}
+  },
+  {
+    // tzVlcIndex 2: total_zeros 0..14
+    {7, 3}, {6, 3}, {5, 3}, {4, 3}, {3, 3}, {5, 4}, {4, 4}, {3, 4}, {2, 4}, {3, 5}, {2, 5}, {3, 6}, {2, 6}, {1, 6}, {0, 6}, {0, 0}
+  },
+  {
+    // tzVlcIndex 3: total_zeros 0..13
+    {5, 4}, {7, 3}, {6, 3}, {5, 3}, {4, 4}, {3, 4}, {4, 3}, {3, 3}, {2, 4}, {3, 5}, {2, 5}, {1, 6}, {1, 5}, {0, 6}, {0, 0}, {0, 0}
+  },
+  {
+    // tzVlcIndex 4: total_zeros 0..12
+    {3, 5}, {7, 3}, {5, 4}, {4, 4}, {6, 3}, {5, 3}, {4, 3}, {3, 4}, {3, 3}, {2, 4}, {2, 5}, {1, 5}, {0, 5}, {0, 0}, {0, 0}, {0, 0}
+  },
+  {
+    // tzVlcIndex 5: total_zeros 0..11
+    {5, 4}, {4, 4}, {3, 4}, {7, 3}, {6, 3}, {5, 3}, {4, 3}, {3, 3}, {2, 4}, {1, 5}, {1, 4}, {0, 5}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+  },
+  {
+    // tzVlcIndex 6: total_zeros 0..10
+    {1, 6}, {1, 5}, {7, 3}, {6, 3}, {5, 3}, {4, 3}, {3, 3}, {2, 3}, {1, 4}, {1, 3}, {0, 6}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+  },
+  {
+    // tzVlcIndex 7: total_zeros 0..9
+    {1, 6}, {1, 5}, {5, 3}, {4, 3}, {3, 3}, {3, 2}, {2, 3}, {1, 4}, {1, 3}, {0, 6}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+  },
+  {
+    // tzVlcIndex 8: total_zeros 0..8
+    {1, 6}, {1, 4}, {1, 5}, {3, 3}, {3, 2}, {2, 2}, {2, 3}, {1, 3}, {0, 6}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+  },
+  {
+    // tzVlcIndex 9: total_zeros 0..7
+    {1, 6}, {0, 6}, {1, 4}, {3, 2}, {2, 2}, {1, 3}, {1, 2}, {1, 5}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+  },
+  {
+    // tzVlcIndex 10: total_zeros 0..6
+    {1, 5}, {0, 5}, {1, 3}, {3, 2}, {2, 2}, {1, 2}, {1, 4}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+  },
+  {
+    // tzVlcIndex 11: total_zeros 0..5
+    {0, 4}, {1, 4}, {1, 3}, {2, 3}, {1, 1}, {3, 3}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+  },
+  {
+    // tzVlcIndex 12: total_zeros 0..4
+    {0, 4}, {1, 4}, {1, 2}, {1, 1}, {1, 3}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+  },
+  {
+    // tzVlcIndex 13: total_zeros 0..3
+    {0, 3}, {1, 3}, {1, 1}, {1, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+  },
+  {
+    // tzVlcIndex 14: total_zeros 0..2
+    {0, 2}, {1, 2}, {1, 1}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+  },
+  {
+    // tzVlcIndex 15: total_zeros 0..1
+    {0, 1}, {1, 1}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+  }
+};
+
+const uint8_t g_kuiVlcTotalZerosChromaDc[4][4][2] = { // NOTE(review): presumably [tzVlcIndex][total_zeros][0--value, 1--bit count], like g_kuiVlcTotalZeros -- confirm at the use site
+  { // row 0: every pair is {0, 0}
+    {0, 0}, {0, 0}, {0, 0}, {0, 0}
+  },
+  { // row 1: four populated pairs
+    {1, 1}, {1, 2}, {1, 3}, {0, 3}
+  },
+  { // row 2: three populated pairs, last pair is {0, 0}
+    {1, 1}, {1, 2}, {0, 2}, {0, 0}
+  },
+  { // row 3: two populated pairs, last two pairs are {0, 0}
+    {1, 1}, {0, 1}, {0, 0}, {0, 0}
+  }
+};
+//
+
+// g_kuiVlcRunBefore[zeros-left][run-before][0--value, 1--bit count]; rows 1..6 store zeros-left+1 codes, the remaining {0, 0} pairs have bit count 0
+const uint8_t g_kuiVlcRunBefore[8][15][2] = {
+  {
+    // zeros-left 0: not available (every pair is {0, 0})
+    {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+  },
+  {
+    // zeros-left 1: run-before 0..1
+    {1, 1}, {0, 1}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+  },
+  {
+    // zeros-left 2: run-before 0..2
+    {1, 1}, {1, 2}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+  },
+  {
+    // zeros-left 3: run-before 0..3
+    {3, 2}, {2, 2}, {1, 2}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+  },
+  {
+    // zeros-left 4: run-before 0..4
+    {3, 2}, {2, 2}, {1, 2}, {1, 3}, {0, 3}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+  },
+  {
+    // zeros-left 5: run-before 0..5
+    {3, 2}, {2, 2}, {3, 3}, {2, 3}, {1, 3}, {0, 3}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+  },
+  {
+    // zeros-left 6: run-before 0..6
+    {3, 2}, {0, 3}, {1, 3}, {3, 3}, {2, 3}, {5, 3}, {4, 3}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+  },
+  {
+    // zeros-left > 6: run-before 0..14, all 15 pairs populated
+    {7, 3}, {6, 3}, {5, 3}, {4, 3}, {3, 3}, {2, 3}, {1, 3}, {1, 4}, {1, 5}, {1, 6}, {1, 7}, {1, 8}, {1, 9}, {1, 10}, {1, 11}
+  }
+};
+
+const ALIGNED_DECLARE (uint8_t, g_kuiEncNcMapTable[18], 16) = { // index -> value: 0..1 -> 0, 2..3 -> 1, 4..7 -> 2, 8..16 -> 3, 17 -> 4
+  0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4
+}; // NOTE(review): values 0..4 line up with the five sub-tables of g_kuiVlcCoeffToken (nc<2, nc<4, nc<8, 8<=nc, nc==-1) -- confirm at the use site
+
+
+
+const uint8_t   g_kuiTemporalIdListTable[MAX_TEMPORAL_LEVEL][MAX_GOP_SIZE + 1] = { // one row per GOP size (1, 2, 4, 8, 16); each row lists 17 values
+  {
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0
+  },  // uiGopSize = 1: all zero
+  {
+    0, 1, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0
+  },  // uiGopSize = 2: non-zero only at position 1
+  {
+    0, 2, 1, 2, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0
+  },  // uiGopSize = 4: non-zero at positions 1..3
+  {
+    0, 3, 2, 3, 1, 3, 2, 3,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0
+  },  // uiGopSize = 8: non-zero at positions 1..7
+  {
+    0, 4, 3, 4, 2, 4, 3, 4,
+    1, 4, 3, 4, 2, 4, 3, 4,
+    0
+  }  //  uiGopSize = 16: non-zero at positions 1..15; position 16 is 0 again
+};
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// extern at svc_encode_slice.h
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+}
--- a/codec/encoder/core/src/encoder_ext.cpp
+++ b/codec/encoder/core/src/encoder_ext.cpp
@@ -1,4250 +1,4032 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	encoder_ext.c
- *
- * \brief	core encoder for SVC
- *
- * \date	7/24/2009 Created
- *
- *************************************************************************************
- */
-#include <string.h>
-#include <stdlib.h>
-#include <assert.h>
-
-#include "encoder.h"
-#include "extern.h"
-#include "encoder_context.h"
-#include "typedefs.h"
-#include "wels_const.h"
-#include "wels_common_basis.h"
-#include "codec_def.h"
-#include "param_svc.h"
-#include "cpu_core.h"
-#include "cpu.h"
-#include "utils.h"
-#include "svc_enc_frame.h"
-#include "svc_enc_golomb.h"
-#include "svc_enc_slice_segment.h"
-#include "au_set.h"
-#include "picture_handle.h"
-#include "codec_app_def.h"
-#include "svc_base_layer_md.h"
-#include "svc_encode_slice.h"
-#include "decode_mb_aux.h"
-#include "deblocking.h"
-#include "rc.h"
-#include "ref_list_mgr_svc.h"
-#include "md.h"
-#include "ls_defines.h"
-#include "set_mb_syn_cavlc.h"
-#include "crt_util_safe_x.h"	// Safe CRT routines like utils for cross platforms
-#include "array_stack_align.h"
-// for MT, 4/22/2010
-#include "slice_multi_threading.h"
-#if defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG)
-#include "measure_time.h"
-#endif//DYNAMIC_SLICE_ASSIGN
-
-namespace WelsSVCEnc {
-
-
-int32_t WelsCodeOnePicPartition(	sWelsEncCtx *pCtx,
-									SLayerBSInfo *pLbi,
-									int32_t *pNalIdxInLayer,									
-									int32_t* pLayerSize,
-									int32_t iFirstMbInPartition,	// first mb inclusive in partition
-									int32_t iEndMbInPartition,	// end mb exclusive in partition
-									int32_t iStartSliceIdx
-								  );
-
-
-/*!
- * \brief	validate checking in parameter configuration
- * \pParam	pParam		SWelsSvcCodingParam*
- * \return	successful - 0; otherwise none 0 for failed
- */
-int32_t ParamValidation( SWelsSvcCodingParam *pCfg )
-{
-	float fMaxFrameRate = 0.0f;
-	const float fEpsn = 0.000001f;
-	int32_t i = 0;
-	int32_t iLastSpatialWidth	= 0;
-	int32_t	iLastSpatialHeight	= 0;
-	float fLastFrameRateIn	= 0.0f;
-	float fLastFrameRateOut	= 0.0f;
-	SDLayerParam *pLastSpatialParam = NULL;
-
-	assert( pCfg != NULL );
-
-	for (i = 0; i < pCfg->iNumDependencyLayer; ++ i)
-	{
-		SDLayerParam *fDlp = &pCfg->sDependencyLayers[i];
-		if ( fDlp->fOutputFrameRate > fDlp->fInputFrameRate || (fDlp->fInputFrameRate >= -fEpsn && fDlp->fInputFrameRate <= fEpsn)
-			|| (fDlp->fOutputFrameRate >= -fEpsn && fDlp->fOutputFrameRate <= fEpsn) )
-		{
-#if defined (_DEBUG)
-			fprintf(stderr, "Invalid settings in input frame rate(%.6f) or output frame rate(%.6f) of layer #%d config file..\n",
-				fDlp->fInputFrameRate, fDlp->fOutputFrameRate, i);
-#endif
-			return 1;
-		}
-		if ( UINT_MAX == GetLogFactor(fDlp->fOutputFrameRate, fDlp->fInputFrameRate) )
-		{
-#if defined (_DEBUG)
-			fprintf(stderr, "Invalid settings in input frame rate(%.6f) and output frame rate(%.6f) of layer #%d config file: iResult of output frame rate divided by input frame rate should be power of 2(i.e,in/pOut=2^n)..\n",
-				fDlp->fInputFrameRate, fDlp->fOutputFrameRate, i);
-#endif
-			return 1;
-		}
-	}
-
-	for (i = 0; i < pCfg->iNumDependencyLayer; ++ i)
-	{
-		SDLayerParam *fDlp = &pCfg->sDependencyLayers[i];
-		if ( fDlp->fInputFrameRate > fMaxFrameRate )
-			fMaxFrameRate	= fDlp->fInputFrameRate;
-	}
-
-	if ( fMaxFrameRate > fEpsn && (fMaxFrameRate - pCfg->fMaxFrameRate > fEpsn || fMaxFrameRate - pCfg->fMaxFrameRate < -fEpsn) )
-	{
-		pCfg->fMaxFrameRate	= fMaxFrameRate;		
-	}
-
-	for (i = 0; i < pCfg->iNumDependencyLayer; ++ i)
-	{
-		SDLayerParam *fDlp = &pCfg->sDependencyLayers[i];
-
-		pLastSpatialParam	= fDlp;
-		iLastSpatialWidth	= fDlp->iFrameWidth;
-		iLastSpatialHeight	= fDlp->iFrameHeight;
-		fLastFrameRateIn	= fDlp->fInputFrameRate;
-		fLastFrameRateOut	= fDlp->fOutputFrameRate;
-	}
-
-	return 0;
-}
-
-int32_t ParamValidationExt( void *pParam )
-{
-	SWelsSvcCodingParam *pCodingParam = (SWelsSvcCodingParam *)pParam;
-	int8_t i = 0;
-	int32_t iIdx = 0;
-
-	assert ( pCodingParam != NULL );
-	if ( NULL == pCodingParam )
-		return 1;
-
-	if ( pCodingParam->iNumDependencyLayer < 1 || pCodingParam->iNumDependencyLayer > MAX_DEPENDENCY_LAYER ){
-#if defined (_DEBUG)
-		fprintf(stderr, "ParamValidationExt(), monitor invalid pCodingParam->iNumDependencyLayer: %d!\n", pCodingParam->iNumDependencyLayer);
-#endif//#if _DEBUG
-
-		return 1;
-	}
-	
-	if ( pCodingParam->iNumTemporalLayer < 1 || pCodingParam->iNumTemporalLayer > MAX_TEMPORAL_LEVEL ){
-#if defined (_DEBUG)
-		fprintf(stderr, "ParamValidationExt(), monitor invalid pCodingParam->iNumTemporalLayer: %d!\n", pCodingParam->iNumTemporalLayer);
-#endif//#if _DEBUG
-		return 1;
-	}
-	
-	if ( pCodingParam->uiGopSize < 1 || pCodingParam->uiGopSize > MAX_GOP_SIZE ){
-#if defined (_DEBUG)
-		fprintf(stderr, "ParamValidationExt(), monitor invalid pCodingParam->uiGopSize: %d!\n", pCodingParam->uiGopSize);
-#endif//#if _DEBUG
-		return 1;
-	}
-	
-
-	if ( pCodingParam->uiIntraPeriod && pCodingParam->uiIntraPeriod < pCodingParam->uiGopSize )
-	{
-#if defined (_DEBUG)
-		fprintf(stderr, "ParamValidationExt(), uiIntraPeriod(%d) should be not less than that of uiGopSize(%d) or -1 specified!\n",
-			pCodingParam->uiIntraPeriod, pCodingParam->uiGopSize);
-#endif//#if _DEBUG
-		return 1;
-	}
-	
-	if ( pCodingParam->uiIntraPeriod && (pCodingParam->uiIntraPeriod & (pCodingParam->uiGopSize-1)) != 0 )
-	{
-#if defined (_DEBUG)
-		fprintf(stderr, "ParamValidationExt(), uiIntraPeriod(%d) should be multiple of uiGopSize(%d) or -1 specified!\n",
-			pCodingParam->uiIntraPeriod, pCodingParam->uiGopSize);
-#endif//#if _DEBUG
-		return 1;
-	}
-	
-
-#ifdef MT_ENABLED
-	//about iMultipleThreadIdc, bDeblockingParallelFlag, iLoopFilterDisableIdc, & uiSliceMode
-	// (1) Single Thread
-	//	if (THREAD==1)//single thread
-	//		no parallel_deblocking: bDeblockingParallelFlag = 0;
-	// (2) Multi Thread: see uiSliceMode decision
-	if ( pCodingParam->iMultipleThreadIdc == 1 )
-	{
-		//now is single thread. no parallel deblocking, set flag=0
-		pCodingParam->bDeblockingParallelFlag = false;
-	}
-	else
-	{
-		pCodingParam->bDeblockingParallelFlag = true;
-	}
-#else
-	pCodingParam->bDeblockingParallelFlag	= false;
-#endif//MT_ENABLED
-	
-	for ( i = 0; i < pCodingParam->iNumDependencyLayer; ++ i ){
-		SDLayerParam *fDlp = &pCodingParam->sDependencyLayers[i];
-		const int32_t kiPicWidth = fDlp->iFrameWidth;
-		const int32_t kiPicHeight= fDlp->iFrameHeight;
-		int32_t iMbWidth		= 0;
-		int32_t iMbHeight		= 0;
-		int32_t iMbNumInFrame		= 0;
-		int32_t iMaxSliceNum		= MAX_SLICES_NUM;
-		if ( kiPicWidth <= 0 || kiPicHeight <= 0 ){
-#if defined (_DEBUG)
-			fprintf(stderr, "ParamValidationExt(), invalid %d x %d in dependency layer settings!\n", kiPicWidth, kiPicHeight);
-#endif//#if _DEBUG
-			return 1;
-		}
-		if ( (kiPicWidth & 0x0F) != 0 || (kiPicHeight & 0x0F) != 0 ){
-#if defined (_DEBUG)
-			fprintf(stderr, "ParamValidationExt(), in layer #%d iWidth x iHeight(%d x %d) both should be multiple of 16, can not support with arbitrary size currently!\n", i, kiPicWidth, kiPicHeight);
-#endif//#if _DEBUG
-			return 1;
-		}	
-
-		if ( fDlp->sMso.uiSliceMode >= SM_RESERVED ){
-#if defined (_DEBUG)
-			fprintf(stderr, "ParamValidationExt(), invalid uiSliceMode (%d) settings!\n", fDlp->sMso.uiSliceMode );
-#endif//#if _DEBUG
-			return 1;
-		}
-
-		//check pSlice settings under multi-pSlice
-		if ( kiPicWidth<=16 && kiPicHeight<=16 ){
-			//only have one MB, set to single_slice
-			fDlp->sMso.uiSliceMode = SM_SINGLE_SLICE;
-		}
-		switch ( fDlp->sMso.uiSliceMode )
-		{
-			case SM_SINGLE_SLICE:
-				fDlp->sMso.sSliceArgument.iSliceNum = 1;
-				fDlp->sMso.sSliceArgument.uiSliceSizeConstraint = 0;
-				fDlp->sMso.sSliceArgument.iSliceNum = 0;
-				for (iIdx=0; iIdx<MAX_SLICES_NUM;iIdx++)
-				{
-					fDlp->sMso.sSliceArgument.uiSliceMbNum[iIdx] = 0;
-				}
-				break;
-			case SM_FIXEDSLCNUM_SLICE:
-				{
-					fDlp->sMso.sSliceArgument.uiSliceSizeConstraint = 0;
-
-					iMbWidth	= (kiPicWidth+15)>>4;
-					iMbHeight	= (kiPicHeight+15)>>4;
-					iMbNumInFrame = iMbWidth * iMbHeight;
-					iMaxSliceNum = MAX_SLICES_NUM;
-					if ( fDlp->sMso.sSliceArgument.iSliceNum <= 0 
-						|| fDlp->sMso.sSliceArgument.iSliceNum > iMaxSliceNum )
-					{
-#if defined (_DEBUG)
-						fprintf(stderr, "ParamValidationExt(), invalid uiSliceNum (%d) settings!\n", fDlp->sMso.sSliceArgument.iSliceNum );
-#endif//#if _DEBUG
-						return 1;
-					}
-					if (fDlp->sMso.sSliceArgument.iSliceNum == 1)
-					{
-#if defined (_DEBUG)
-						fprintf(stderr, "ParamValidationExt(), uiSliceNum(%d) you set for SM_FIXEDSLCNUM_SLICE, now turn to SM_SINGLE_SLICE type!\n", fDlp->sMso.sSliceArgument.iSliceNum );
-#endif//#if _DEBUG
-						fDlp->sMso.uiSliceMode	= SM_SINGLE_SLICE;
-						break;
-					}
-					if (pCodingParam->bEnableRc)	// multiple slices verify with gom
-					{		
-						//check uiSliceNum
-						GomValidCheckSliceNum( iMbWidth, iMbHeight, (int32_t*)&fDlp->sMso.sSliceArgument.iSliceNum );
-						assert(fDlp->sMso.sSliceArgument.iSliceNum > 1);
-						//set uiSliceMbNum with current uiSliceNum
-						GomValidCheckSliceMbNum( iMbWidth, iMbHeight, &fDlp->sMso.sSliceArgument );
-					}
-					else if ( !CheckFixedSliceNumMultiSliceSetting( iMbNumInFrame, &fDlp->sMso.sSliceArgument ) )	// verify interleave mode settings
-					{//check uiSliceMbNum with current uiSliceNum
-#if defined (_DEBUG)
-						fprintf(stderr, "ParamValidationExt(), invalid uiSliceMbNum (%d) settings!\n",  fDlp->sMso.sSliceArgument.uiSliceMbNum[0] );
-#endif//#if _DEBUG
-						return 1;
-					}
-					// considering the coding efficient and performance, iCountMbNum constraint by MIN_NUM_MB_PER_SLICE condition of multi-pSlice mode settting
-					if ( iMbNumInFrame <= MIN_NUM_MB_PER_SLICE )
-					{
-						fDlp->sMso.uiSliceMode	= SM_SINGLE_SLICE;
-						fDlp->sMso.sSliceArgument.iSliceNum	= 1;
-						break;
-					}
-				}
-				break;
-			case SM_RASTER_SLICE:
-				{
-					fDlp->sMso.sSliceArgument.uiSliceSizeConstraint = 0;
-
-					iMbWidth	= (kiPicWidth+15)>>4;
-					iMbHeight	= (kiPicHeight+15)>>4;
-					iMbNumInFrame = iMbWidth * iMbHeight;
-					iMaxSliceNum = MAX_SLICES_NUM;
-					if ( fDlp->sMso.sSliceArgument.uiSliceMbNum[0] <= 0 )
-					{
-#if defined (_DEBUG)
-						fprintf(stderr, "ParamValidationExt(), invalid uiSliceMbNum (%d) settings!\n",  fDlp->sMso.sSliceArgument.uiSliceMbNum[0] );
-#endif//#if _DEBUG
-						return 1;
-					}
-
-					if ( !CheckRasterMultiSliceSetting( iMbNumInFrame, &fDlp->sMso.sSliceArgument ) )	// verify interleave mode settings
-					{
-#if defined (_DEBUG)
-						fprintf(stderr, "ParamValidationExt(), invalid uiSliceMbNum (%d) settings!\n",  fDlp->sMso.sSliceArgument.uiSliceMbNum[0] );
-#endif//#if _DEBUG
-						return 1;
-					}
-					if ( fDlp->sMso.sSliceArgument.iSliceNum <= 0 || fDlp->sMso.sSliceArgument.iSliceNum > iMaxSliceNum )	// verify interleave mode settings
-					{
-#if defined (_DEBUG)
-						fprintf(stderr, "ParamValidationExt(), invalid uiSliceNum (%d) in SM_RASTER_SLICE settings!\n",  fDlp->sMso.sSliceArgument.iSliceNum );
-#endif//#if _DEBUG
-						return 1;
-					}
-					if (fDlp->sMso.sSliceArgument.iSliceNum == 1)
-					{
-#if defined (_DEBUG)
-						fprintf(stderr, "ParamValidationExt(), pSlice setting for SM_RASTER_SLICE now turn to SM_SINGLE_SLICE!\n" );
-#endif//#if _DEBUG
-						fDlp->sMso.uiSliceMode	= SM_SINGLE_SLICE;
-						break;
-					}
-#ifdef MT_ENABLED
-					if (pCodingParam->bEnableRc && fDlp->sMso.sSliceArgument.iSliceNum > 1)
-					{
-#if defined (_DEBUG)
-						fprintf(stderr, "ParamValidationExt(), WARNING: GOM based RC do not support SM_RASTER_SLICE!\n" );
-#endif//#if _DEBUG
-					}
-#endif
-					// considering the coding efficient and performance, iCountMbNum constraint by MIN_NUM_MB_PER_SLICE condition of multi-pSlice mode settting
-					if ( iMbNumInFrame <= MIN_NUM_MB_PER_SLICE )
-					{
-						fDlp->sMso.uiSliceMode	= SM_SINGLE_SLICE;
-						fDlp->sMso.sSliceArgument.iSliceNum	= 1;
-						break;
-					}
-				}
-				break;		
-			case SM_ROWMB_SLICE:
-				{
-					fDlp->sMso.sSliceArgument.uiSliceSizeConstraint = 0;
-
-					iMbWidth	= (kiPicWidth+15)>>4;
-					iMbHeight	= (kiPicHeight+15)>>4;
-					iMaxSliceNum = MAX_SLICES_NUM;
-					if ( iMbHeight > iMaxSliceNum )
-					{
-#if defined (_DEBUG)
-						fprintf(stderr, "ParamValidationExt(), invalid uiSliceNum (%d) settings more than MAX!\n", iMbHeight );
-#endif//#if _DEBUG
-						return 1;
-					}
-					fDlp->sMso.sSliceArgument.iSliceNum	= iMbHeight;
-
-					if ( fDlp->sMso.sSliceArgument.iSliceNum <= 0 )
-					{
-#if defined (_DEBUG)
-						fprintf(stderr, "ParamValidationExt(), invalid uiSliceNum (%d) settings!\n", fDlp->sMso.sSliceArgument.iSliceNum );
-#endif//#if _DEBUG
-						return 1;
-					}	
-					if ( !CheckRowMbMultiSliceSetting( iMbWidth, &fDlp->sMso.sSliceArgument ) )	// verify interleave mode settings
-					{
-#if defined (_DEBUG)
-						fprintf(stderr, "ParamValidationExt(), invalid uiSliceMbNum (%d) settings!\n",  fDlp->sMso.sSliceArgument.uiSliceMbNum[0] );
-#endif//#if _DEBUG
-						return 1;
-					}
-				}
-				break;
-			case SM_DYN_SLICE:
-				{
-					iMbWidth	= (kiPicWidth+15)>>4;
-					iMbHeight	= (kiPicHeight+15)>>4;
-					if ( fDlp->sMso.sSliceArgument.uiSliceSizeConstraint <= 0 )
-					{
-#if defined (_DEBUG)
-						fprintf(stderr, "ParamValidationExt(), invalid iSliceSize (%d) settings!\n",   fDlp->sMso.sSliceArgument.uiSliceSizeConstraint );
-#endif//#if _DEBUG
-						return 1;
-					}
-					// considering the coding efficient and performance, iCountMbNum constraint by MIN_NUM_MB_PER_SLICE condition of multi-pSlice mode settting
-					if ( iMbWidth * iMbHeight <= MIN_NUM_MB_PER_SLICE )
-					{
-						fDlp->sMso.uiSliceMode	= SM_SINGLE_SLICE;
-						fDlp->sMso.sSliceArgument.iSliceNum	= 1;
-						break;
-					}
-				}
-				break;
-			default:
-				{
-
-#if defined (_DEBUG)
-					fprintf(stderr, "ParamValidationExt(), invalid uiSliceMode (%d) settings!\n", pCodingParam->sDependencyLayers[0].sMso.uiSliceMode );
-#endif//#if _DEBUG
-					return 1;
-
-				}
-				break;
-		}
-	}
-	
-	return ParamValidation(pCodingParam);
-}
-
-/*!
- * \brief	acquire count number of layers and NALs based on configurable paramters dependency
- * \pParam	pCtx				sWelsEncCtx*
- * \pParam	pParam			SWelsSvcCodingParam*
- * \pParam	pCountLayers	pointer of count number of layers indeed
- * \pParam	iCountNals		pointer of count number of nals indeed
- * \return	0 - successful; otherwise failed
- */
-static inline int32_t AcquireLayersNals( sWelsEncCtx **ppCtx, SWelsSvcCodingParam *pParam, int32_t *pCountLayers, int32_t *pCountNals )
-{	
-	int32_t iCountNumLayers		= 0;
-	int32_t iCountNumNals			= 0;
-	int32_t iNumDependencyLayers	= 0;
-	int32_t iDIndex 				= 0;
-#if defined(MT_ENABLED) && defined(PACKING_ONE_SLICE_PER_LAYER)
-	int32_t iNumLayersPack = 0;
-#endif//MT_ENABLED && PACKING_ONE_SLICE_PER_LAYER
-
-	if ( NULL == pParam || NULL == ppCtx || NULL == *ppCtx )
-		return 1;
-	
-	iNumDependencyLayers	= pParam->iNumDependencyLayer;
-
-	do {
-		SDLayerParam *pDLayer = &pParam->sDependencyLayers[iDIndex];		
-//		pDLayer->ptr_cfg = pParam;
-		int32_t iOrgNumNals = iCountNumNals;
-
-		//Note: Sep. 2010
-		//Review this part and suggest no change, since the memory over-use 
-		//(1) counts little to the overall performance
-		//(2) should not be critial even under mobile case
-		if ( SM_DYN_SLICE == pDLayer->sMso.uiSliceMode )
-		{
-			iCountNumNals += MAX_SLICES_NUM;
-			// plus prefix NALs
-			if ( iDIndex == 0 )
-				iCountNumNals += MAX_SLICES_NUM;
-			// MAX_SLICES_NUM < MAX_LAYER_NUM_OF_FRAME ensured at svc_enc_slice_segment.h
-#if defined(MT_ENABLED) && defined(PACKING_ONE_SLICE_PER_LAYER)
-			assert(MAX_SLICES_NUM < MAX_LAYER_NUM_OF_FRAME);
-			// iNumLayersPack += MAX_SLICES_NUM; // do not count it for dynamic slicing mode
-#else//!MT_ENABLED || !PACKING_ONE_SLICE_PER_LAYER
-			assert(iCountNumNals - iOrgNumNals <= MAX_NAL_UNITS_IN_LAYER );
-#endif//MT_ENABLED && PACKING_ONE_SLICE_PER_LAYER
-		}
-		else /*if ( SM_SINGLE_SLICE != pDLayer->sMso.uiSliceMode )*/
-		{
-			const int32_t kiNumOfSlice = GetInitialSliceNum(	(pDLayer->iFrameWidth+0x0f)>>4,
-												(pDLayer->iFrameHeight+0x0f)>>4,
-												&pDLayer->sMso );
-
-			// NEED check iCountNals value in case multiple slices is used
-			iCountNumNals += kiNumOfSlice; // for pSlice VCL NALs
-			// plus prefix NALs
-			if ( iDIndex == 0 )
-				iCountNumNals += kiNumOfSlice;
-#if defined(MT_ENABLED) && defined(PACKING_ONE_SLICE_PER_LAYER)
-			assert(num_of_slice <= MAX_SLICES_NUM && MAX_SLICES_NUM < MAX_LAYER_NUM_OF_FRAME);
-			iNumLayersPack += num_of_slice;
-#else//!MT_ENABLED || !PACKING_ONE_SLICE_PER_LAYER
-			assert(iCountNumNals - iOrgNumNals <= MAX_NAL_UNITS_IN_LAYER );				
-#endif//MT_ENALBED && PACKING_ONE_SLICE_PER_LAYER
-			if ( kiNumOfSlice > MAX_SLICES_NUM )
-			{
-				WelsLog( *ppCtx, WELS_LOG_ERROR, "AcquireLayersNals(), num_of_slice(%d) > MAX_SLICES_NUM(%d) per (iDid= %d, qid= %d) settings!\n",
-					kiNumOfSlice, MAX_SLICES_NUM, iDIndex, 0 );
-				return 1;
-			}
-		}
-#if !defined(MT_ENABLED) || !defined(PACKING_ONE_SLICE_PER_LAYER)
-		if ( iCountNumNals - iOrgNumNals > MAX_NAL_UNITS_IN_LAYER )
-		{
-			WelsLog( *ppCtx, WELS_LOG_ERROR, "AcquireLayersNals(), num_of_nals(%d) > MAX_NAL_UNITS_IN_LAYER(%d) per (iDid= %d, qid= %d) settings!\n",
-				(iCountNumNals - iOrgNumNals), MAX_NAL_UNITS_IN_LAYER, iDIndex, 0 );
-			return 1;
-		}
-#endif//!MT_ENABLED) || !PACKING_ONE_SLICE_PER_LAYER
-
-		iCountNumLayers ++;
-		
-		++ iDIndex;
-	} while(iDIndex < iNumDependencyLayers);
-
-	iCountNumNals += 1 + iNumDependencyLayers + (iCountNumLayers<<1) + iCountNumLayers;	// plus iCountNumLayers for reserved application
-#if defined(MT_ENABLED) && defined(PACKING_ONE_SLICE_PER_LAYER)
-	iNumLayersPack += 1 + iNumDependencyLayers + (iCountNumLayers<<1);
-#endif//MT_ENABLED && PACKING_ONE_SLICE_PER_LAYER
-
-	// to check number of layers / nals / slices dependencies, 12/8/2010
-#if !defined(MT_ENABLED)
-	if ( iCountNumLayers > MAX_LAYER_NUM_OF_FRAME )
-	{
-		WelsLog( *ppCtx, WELS_LOG_ERROR, "AcquireLayersNals(), iCountNumLayers(%d) > MAX_LAYER_NUM_OF_FRAME(%d)!", iCountNumLayers, MAX_LAYER_NUM_OF_FRAME );
-		return 1;
-	}
-#else//MT_ENABLED
-#if defined(PACKING_ONE_SLICE_PER_LAYER)
-	if ( iNumLayersPack > MAX_LAYER_NUM_OF_FRAME )
-	{
-		WelsLog( *ppCtx, WELS_LOG_ERROR, "AcquireLayersNals(), num_layers_pack_overall(%d) > MAX_LAYER_NUM_OF_FRAME(%d)!", iNumLayersPack, MAX_LAYER_NUM_OF_FRAME );
-		return 1;
-	}
-#else//!PACKING_ONE_SLICE_PER_LAYER
-	if ( iCountNumLayers > MAX_LAYER_NUM_OF_FRAME )
-	{
-		WelsLog( *ppCtx, WELS_LOG_ERROR, "AcquireLayersNals(), iCountNumLayers(%d) > MAX_LAYER_NUM_OF_FRAME(%d)!", iCountNumLayers, MAX_LAYER_NUM_OF_FRAME );
-		return 1;
-	}
-#endif//PACKING_ONE_SLICE_PER_LAYER
-#endif//!MT_ENABLED
-
-	if ( NULL != pCountLayers )
-		*pCountLayers	= iCountNumLayers;
-	if ( NULL != pCountNals )
-		*pCountNals 	= iCountNumNals;
-	return 0;
-}
-
-/*!
- * \brief	alloc spatial layers pictures (I420 based source pictures)	
- */
-int32_t AllocSpatialPictures( sWelsEncCtx **ppCtx, SWelsSvcCodingParam *pParam )
-{
-	CMemoryAlign *pMa						= (*ppCtx)->pMemAlign;	
-	const int32_t kiDlayerCount					= pParam->iNumDependencyLayer;
-	int32_t iDlayerIndex							= 0;
-
-	// spatial pictures
-	iDlayerIndex = 0;
-	do {
-		const int32_t kiPicWidth = pParam->sDependencyLayers[iDlayerIndex].iFrameWidth;
-		const int32_t kiPicHeight   = pParam->sDependencyLayers[iDlayerIndex].iFrameHeight;
-		const uint8_t kuiLayerInTemporal = 2 + WELS_MAX(pParam->sDependencyLayers[iDlayerIndex].iHighestTemporalId, 1);
-		const uint8_t kuiRefNumInTemporal = kuiLayerInTemporal + pParam->iLTRRefNum;
-		uint8_t i = 0;
-
-		do {
-			SPicture *pPic = AllocPicture( pMa, kiPicWidth, kiPicHeight, false );
-			WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pPic), FreeMemorySvc(ppCtx); *ppCtx = NULL )
-			(*ppCtx)->pSpatialPic[iDlayerIndex][i] = pPic;
-			++ i;
-		} while( i < kuiRefNumInTemporal);
-
-		(*ppCtx)->uiSpatialLayersInTemporal[iDlayerIndex] = kuiLayerInTemporal;
-		(*ppCtx)->uiSpatialPicNum[iDlayerIndex] = kuiRefNumInTemporal;
-		++ iDlayerIndex;
-	} while( iDlayerIndex < kiDlayerCount );  
-
-	return 0;
-}
-
-/*!
- * \brief	release the spatial layer pictures held in pCtx->pSpatialPic
- *
- *			For each dependency layer the first uiSpatialPicNum[layer] slots are visited,
- *			non-NULL pictures are handed to FreePicture(), and
- *			uiSpatialLayersInTemporal[layer] is reset to 0.
- * \param	pCtx	encoder context
- */
-void FreeSpatialPictures( sWelsEncCtx *pCtx )
-{
-	CMemoryAlign *pMemAlign = pCtx->pMemAlign;
-
-	for ( int32_t iLayer = 0; iLayer < pCtx->pSvcParam->iNumDependencyLayer; ++ iLayer )
-	{
-		const uint8_t kuiPicCount = pCtx->uiSpatialPicNum[iLayer];
-
-		for ( uint8_t uiPic = 0; uiPic < kuiPicCount; ++ uiPic )
-		{
-			if ( pCtx->pSpatialPic[iLayer][uiPic] != NULL )
-				FreePicture( pMemAlign, &pCtx->pSpatialPic[iLayer][uiPic] );
-		}
-		pCtx->uiSpatialLayersInTemporal[iLayer] = 0;
-	}
-}
-
-/*!
- * \brief	fill the per-MB bookkeeping of one layer: position, slice id, neighbour
- *			availability and the pointers into the shared per-MB buffers
- * \param	pEnc		encoder context providing the MB index tables and the shared buffers
- * \param	pList		MB array of this layer; iMbWidth * iMbHeight entries are written
- * \param	pLayer		layer supplying the MB dimensions and the slice context
- * \param	kiDlayerId	dependency layer index
- * \param	kiMaxMbNum	offset (in MBs) applied to the MV / reference index buffers when
- *						kiDlayerId is odd
- */
-static  void  InitMbInfo(sWelsEncCtx * pEnc, SMB  * pList, SDqLayer * pLayer, const int32_t kiDlayerId, const int32_t kiMaxMbNum )
-{
-	const int32_t kiMbWidth	= pLayer->iMbWidth;
-	const int32_t kiMbNum	= kiMbWidth * pLayer->iMbHeight;
-	SSliceCtx *pSliceCtx	= pLayer->pSliceEncCtx;
-	// odd layer ids start kiMaxMbNum MBs into the MV / reference index buffers
-	const int32_t kiOffset	= (kiDlayerId & 0x01) * kiMaxMbNum;
-	SMVUnitXY (*pLayerMvUnitBlock4x4)[MB_BLOCK4x4_NUM]	= (SMVUnitXY(*)[MB_BLOCK4x4_NUM])(&pEnc->pMvUnitBlock4x4[MB_BLOCK4x4_NUM*kiOffset]);
-	int8_t (*pLayerRefIndexBlock8x8)[MB_BLOCK8x8_NUM]	= (int8_t(*)[MB_BLOCK8x8_NUM])(&pEnc->pRefIndexBlock4x4[MB_BLOCK8x8_NUM*kiOffset]);
-	int32_t iIdx;
-
-	for( iIdx = 0; iIdx < kiMbNum; iIdx++ ){
-		SMB *pMb					= &pList[iIdx];
-		uint32_t uiNeighborAvail	= 0;
-		uint8_t  uiSliceIdc;
-		int32_t  iLeftXY, iTopXY, iLeftTopXY, iRightTopXY;
-
-		pMb->iMbX	= pEnc->pStrideTab->pMbIndexX[kiDlayerId][iIdx];
-		pMb->iMbY	= pEnc->pStrideTab->pMbIndexY[kiDlayerId][iIdx];
-		pMb->iMbXY	= iIdx;
-
-		uiSliceIdc	= WelsMbToSliceIdc(pSliceCtx, iIdx);
-		iLeftXY		= iIdx - 1;
-		iTopXY		= iIdx - kiMbWidth;
-		iLeftTopXY	= iTopXY - 1;
-		iRightTopXY	= iTopXY + 1;
-
-		// a neighbour is available only if it lies inside the picture AND belongs to the
-		// same slice; the position test comes first so that WelsMbToSliceIdc() is never
-		// queried with an index outside the picture
-		if( (pMb->iMbX > 0) && (uiSliceIdc == WelsMbToSliceIdc(pSliceCtx, iLeftXY)) ){
-			uiNeighborAvail |= LEFT_MB_POS;
-		}
-		if( (pMb->iMbY > 0) && (uiSliceIdc == WelsMbToSliceIdc(pSliceCtx, iTopXY)) ){
-			uiNeighborAvail |= TOP_MB_POS;
-		}
-		if( (pMb->iMbX > 0) && (pMb->iMbY > 0) && (uiSliceIdc == WelsMbToSliceIdc(pSliceCtx, iLeftTopXY)) ){
-			uiNeighborAvail |= TOPLEFT_MB_POS;
-		}
-		if( (pMb->iMbX < (kiMbWidth-1)) && (pMb->iMbY > 0) && (uiSliceIdc == WelsMbToSliceIdc(pSliceCtx, iRightTopXY)) ){
-			uiNeighborAvail |= TOPRIGHT_MB_POS;
-		}
-		pMb->uiSliceIdc			= uiSliceIdc;	// merge from svc_hd_opt_b for multiple slices coding
-		pMb->uiNeighborAvail	= uiNeighborAvail;
-		// a second, BASE_MV_MB_NMB based availability mask used to be computed here but
-		// its result was never stored or read, so that dead computation was dropped
-
-		pMb->sMv				= pLayerMvUnitBlock4x4[iIdx];
-		pMb->pRefIndex			= pLayerRefIndexBlock8x8[iIdx];
-		pMb->pSadCost			= &pEnc->pSadCostMb[iIdx];
-		pMb->pIntra4x4PredMode	= &pEnc->pIntra4x4PredModeBlocks[iIdx*INTRA_4x4_MODE_NUM];
-		pMb->pNonZeroCount		= &pEnc->pNonZeroCountBlocks[iIdx*MB_LUMA_CHROMA_BLOCK4x4_NUM];
-	}
-}
-
-
-/*!
- * \brief	allocate the MB lists of all dependency layers and initialise every MB entry
- *
- *			One contiguous SMB array covering all layers is allocated; ppMbListD[i] points
- *			at the first MB of layer i inside that array and is also stored in
- *			ppDqLayerList[i]->sMbDataP before InitMbInfo() is run for the layer.
- * \param	ppCtx	address of the encoder context pointer
- * \return	0 - successful; 1 - invalid layer count or allocation failure (on allocation
- *			failure FreeMemorySvc(ppCtx) is passed to WELS_VERIFY_RETURN_PROC_IF)
- */
-int32_t   InitMbListD( sWelsEncCtx ** ppCtx)
-{
-	int32_t		iNumDlayer = (*ppCtx)->pSvcParam->iNumDependencyLayer;
-	int32_t		iMbSize[MAX_DEPENDENCY_LAYER] = { 0 };
-	int32_t		iOverallMbNum = 0;
-	int32_t		iMbWidth = 0;
-	int32_t		iMbHeight= 0;
-	int32_t		i;
-
-	// a count below 1 would index iMbSize[-1] further down
-	if ( iNumDlayer < 1 || iNumDlayer > MAX_DEPENDENCY_LAYER )
-		return 1;
-
-	for( i=0;i<iNumDlayer;i++ ){
-		iMbWidth = ((*ppCtx)->pSvcParam->sDependencyLayers[i].iFrameWidth + 15)>>4;
-		iMbHeight = ((*ppCtx)->pSvcParam->sDependencyLayers[i].iFrameHeight + 15)>>4;
-		iMbSize[i] = iMbWidth  * iMbHeight;
-		iOverallMbNum += iMbSize[i];
-	}
-
-	(*ppCtx)->ppMbListD = static_cast<SMB **>((*ppCtx)->pMemAlign->WelsMalloc(iNumDlayer * sizeof(SMB *), "ppMbListD"));
-	// the allocation must be verified BEFORE the first slot is written
-	WELS_VERIFY_RETURN_PROC_IF(1, (*ppCtx)->ppMbListD==NULL, FreeMemorySvc(ppCtx));
-	(*ppCtx)->ppMbListD[0] = NULL;
-	(*ppCtx)->ppMbListD[0] = static_cast<SMB*>((*ppCtx)->pMemAlign->WelsMallocz(iOverallMbNum * sizeof(SMB), "ppMbListD[0]"));
-	WELS_VERIFY_RETURN_PROC_IF(1, (*ppCtx)->ppMbListD[0]==NULL, FreeMemorySvc(ppCtx));
-	(*ppCtx)->ppDqLayerList[0]->sMbDataP = (*ppCtx)->ppMbListD[0];
-	InitMbInfo(*ppCtx, (*ppCtx)->ppMbListD[0], (*ppCtx)->ppDqLayerList[0], 0, iMbSize[iNumDlayer-1]);
-	for( i=1;i<iNumDlayer;i++ ){
-		// layer i starts right after the MBs of layer i-1 in the shared array
-		(*ppCtx)->ppMbListD[i] = (*ppCtx)->ppMbListD[i-1] + iMbSize[i-1];
-		(*ppCtx)->ppDqLayerList[i]->sMbDataP = (*ppCtx)->ppMbListD[i];
-		InitMbInfo(*ppCtx, (*ppCtx)->ppMbListD[i], (*ppCtx)->ppDqLayerList[i], i, iMbSize[iNumDlayer-1]);
-	}
-
-	return 0;
-}
-
-/*!
- * \brief	allocate all working buffers of one MB cache
- * \param	pMbCache	cache whose buffer pointers are assigned
- * \param	pMa			allocator serving every request
- * \return	0 - successful; 1 - a request failed (nothing is released here, buffers
- *			obtained before the failing request stay assigned in pMbCache)
- */
-int32_t AllocMbCacheAligned( SMbCache *pMbCache, CMemoryAlign *pMa )
-{
-	pMbCache->pCoeffLevel
-		= static_cast<int16_t *>( pMa->WelsMalloc(MB_COEFF_LIST_SIZE*sizeof(int16_t), "pMbCache->pCoeffLevel") );
-	WELS_VERIFY_RETURN_IF(1, (pMbCache->pCoeffLevel == NULL));
-
-	pMbCache->pMemPredMb
-		= static_cast<uint8_t *>( pMa->WelsMalloc(2*256*sizeof(uint8_t), "pMbCache->pMemPredMb") );
-	WELS_VERIFY_RETURN_IF(1, (pMbCache->pMemPredMb == NULL));
-
-	pMbCache->pSkipMb
-		= static_cast<uint8_t *>( pMa->WelsMalloc(384*sizeof(uint8_t), "pMbCache->pSkipMb") );
-	WELS_VERIFY_RETURN_IF(1, (pMbCache->pSkipMb == NULL));
-
-	pMbCache->pMemPredBlk4
-		= static_cast<uint8_t *>( pMa->WelsMalloc(2*16*sizeof(uint8_t), "pMbCache->pMemPredBlk4") );
-	WELS_VERIFY_RETURN_IF(1, (pMbCache->pMemPredBlk4 == NULL));
-
-	pMbCache->pBufferInterPredMe
-		= static_cast<uint8_t *>( pMa->WelsMalloc(4*640*sizeof(uint8_t), "pMbCache->pBufferInterPredMe") );
-	WELS_VERIFY_RETURN_IF(1, (pMbCache->pBufferInterPredMe == NULL));
-
-	pMbCache->pPrevIntra4x4PredModeFlag
-		= static_cast<bool_t *>( pMa->WelsMalloc(16*sizeof(bool_t), "pMbCache->pPrevIntra4x4PredModeFlag") );
-	WELS_VERIFY_RETURN_IF(1, (pMbCache->pPrevIntra4x4PredModeFlag == NULL));
-
-	pMbCache->pRemIntra4x4PredModeFlag
-		= static_cast<int8_t *>( pMa->WelsMalloc(16*sizeof(int8_t), "pMbCache->pRemIntra4x4PredModeFlag") );
-	WELS_VERIFY_RETURN_IF(1, (pMbCache->pRemIntra4x4PredModeFlag == NULL));
-
-	pMbCache->pDct
-		= static_cast<SDCTCoeff *>( pMa->WelsMalloc(sizeof(SDCTCoeff), "pMbCache->pDct") );
-	WELS_VERIFY_RETURN_IF(1, (pMbCache->pDct == NULL));
-
-	return 0;
-}
-
-/*!
- * \brief	release every buffer of one MB cache
- *
- *			Each non-NULL buffer pointer is handed to pMa->WelsFree() and then reset to
- *			NULL, so slots that are already NULL are skipped.
- * \param	pMbCache	cache whose buffers are released
- * \param	pMa			allocator the buffers are returned to
- */
-void FreeMbCache( SMbCache *pMbCache, CMemoryAlign *pMa )
-{
-	if ( pMbCache->pCoeffLevel != NULL ) {
-		pMa->WelsFree( pMbCache->pCoeffLevel, "pMbCache->pCoeffLevel" );
-		pMbCache->pCoeffLevel = NULL;
-	}
-
-	if ( pMbCache->pMemPredMb != NULL ) {
-		pMa->WelsFree( pMbCache->pMemPredMb, "pMbCache->pMemPredMb" );
-		pMbCache->pMemPredMb = NULL;
-	}
-
-	if ( pMbCache->pSkipMb != NULL ) {
-		pMa->WelsFree( pMbCache->pSkipMb, "pMbCache->pSkipMb" );
-		pMbCache->pSkipMb = NULL;
-	}
-
-	if ( pMbCache->pMemPredBlk4 != NULL ) {
-		pMa->WelsFree( pMbCache->pMemPredBlk4, "pMbCache->pMemPredBlk4" );
-		pMbCache->pMemPredBlk4 = NULL;
-	}
-
-	if ( pMbCache->pBufferInterPredMe != NULL ) {
-		pMa->WelsFree( pMbCache->pBufferInterPredMe, "pMbCache->pBufferInterPredMe" );
-		pMbCache->pBufferInterPredMe = NULL;
-	}
-
-	if ( pMbCache->pPrevIntra4x4PredModeFlag != NULL ) {
-		pMa->WelsFree( pMbCache->pPrevIntra4x4PredModeFlag, "pMbCache->pPrevIntra4x4PredModeFlag" );
-		pMbCache->pPrevIntra4x4PredModeFlag = NULL;
-	}
-
-	if ( pMbCache->pRemIntra4x4PredModeFlag != NULL ) {
-		pMa->WelsFree( pMbCache->pRemIntra4x4PredModeFlag, "pMbCache->pRemIntra4x4PredModeFlag" );
-		pMbCache->pRemIntra4x4PredModeFlag = NULL;
-	}
-
-	if ( pMbCache->pDct != NULL ) {
-		pMa->WelsFree( pMbCache->pDct, "pMbCache->pDct" );
-		pMbCache->pDct = NULL;
-	}
-}
-
-
-/*!
- * \brief	initialize ppDqLayerList and pSliceCtxList according to the number of dependency layers
- *
- *			Steps, in order:
- *			 1. per layer: compute the encoder block stride offsets and allocate the
- *			    reference list with 1 + iNumRefFrame pictures;
- *			 2. allocate the source spatial pictures (AllocSpatialPictures);
- *			 3. per layer: allocate the SDqLayer, its slices and their MB caches, and copy
- *			    the deblocking settings;
- *			 4. allocate the PPS / SPS / subset SPS arrays and the DQ id map;
- *			 5. per layer: initialise SPS (layer 0) or subset SPS (other layers), PPS and
- *			    the slice context.
- * \param	ppCtx	address of the encoder context pointer
- * \return	0 - successful; otherwise failed. Every failure path after the argument check
- *			runs FreeMemorySvc(ppCtx), either directly or as the procedure argument of
- *			WELS_VERIFY_RETURN_PROC_IF
- */
-static inline int32_t InitDqLayers( sWelsEncCtx **ppCtx )
-{
-	SWelsSvcCodingParam *pParam	= NULL;
-	SWelsSPS *pSps						= NULL;
-	SSubsetSps *pSubsetSps			= NULL;
-	SWelsPPS *pPps						= NULL;
-	CMemoryAlign *pMa				= NULL;
-	SStrideTables *pStrideTab		= NULL;
-	int32_t iDlayerCount					= 0;
-	int32_t iDlayerIndex					= 0;
-	uint32_t iSpsId					= 0;
-	uint32_t iPpsId					= 0;
-	uint32_t iNumRef				= 0;
-	int32_t iResult					= 0;
-
-	if ( NULL == ppCtx || NULL == *ppCtx )
-		return 1;
-
-	pMa		= (*ppCtx)->pMemAlign;
-	pParam	= (*ppCtx)->pSvcParam;
-	iDlayerCount	= pParam->iNumDependencyLayer;
-	iNumRef	= pParam->iNumRefFrame;
-//	highest_layers_in_temporal = 1 + WELS_MAX(pParam->iDecompStages, 1);
-	pStrideTab	= (*ppCtx)->pStrideTab;
-
-	// step 1: stride offsets and reference picture lists
-	iDlayerIndex			= 0;
-	while (iDlayerIndex < iDlayerCount)
-	{
-		SRefList *pRefList			= NULL;
-		uint32_t i					= 0;
-		const int32_t kiWidth			= pParam->sDependencyLayers[iDlayerIndex].iFrameWidth;
-		const int32_t kiHeight		= pParam->sDependencyLayers[iDlayerIndex].iFrameHeight;
-		int32_t iPicWidth			= WELS_ALIGN(kiWidth, MB_WIDTH_LUMA) + (PADDING_LENGTH<<1);	// with iWidth of horizon
-		int32_t iPicChromaWidth	= iPicWidth >> 1;
-
-		iPicWidth	= WELS_ALIGN( iPicWidth, 32 );	// 32(or 16 for chroma below) to match original imp. here instead of iCacheLineSize
-		iPicChromaWidth	= WELS_ALIGN( iPicChromaWidth, 16 );
-
-		WelsGetEncBlockStrideOffset( (*ppCtx)->pStrideTab->pStrideEncBlockOffset[iDlayerIndex], iPicWidth, iPicChromaWidth);
-
-		// pRef list
-		pRefList		= (SRefList *)pMa->WelsMallocz( sizeof(SRefList), "pRefList" );
-		WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pRefList), FreeMemorySvc(ppCtx) )
-
-		// do/while: at least one picture is allocated, 1 + iNumRef in total
-		do {
-			pRefList->pRef[i]	= AllocPicture( pMa, kiWidth, kiHeight, true );	// to use actual size of current layer
-			WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pRefList->pRef[i]), FreeMemorySvc(ppCtx) )
-			++ i;
-		} while(i < 1 + iNumRef);
-
-		pRefList->pNextBuffer = pRefList->pRef[0];
-		(*ppCtx)->ppRefPicListExt[iDlayerIndex]	= pRefList;
-		++ iDlayerIndex;
-	}
-
-	// step 2: for I420 based source spatial pictures
-	if ( AllocSpatialPictures( ppCtx, pParam ) )
-	{
-		FreeMemorySvc( ppCtx );
-		return 1;
-	}
-
-	// step 3: DQ layers, slices and MB caches
-	iDlayerIndex	= 0;
-	while (iDlayerIndex < iDlayerCount) {
-		SDqLayer *pDqLayer		= NULL;
-		SDLayerParam *pDlayer	= &pParam->sDependencyLayers[iDlayerIndex];
-		const int32_t kiMbW		= (pDlayer->iFrameWidth + 0x0f) >> 4;
-		const int32_t kiMbH		= (pDlayer->iFrameHeight + 0x0f) >> 4;
-		int32_t iMaxSliceNum	= 1;
-		const int32_t kiSliceNum = GetInitialSliceNum( kiMbW, kiMbH, &pDlayer->sMso );
-		if ( iMaxSliceNum < kiSliceNum )
-			iMaxSliceNum = kiSliceNum;
-
-		// pDq layers list
-		pDqLayer = (SDqLayer *)pMa->WelsMallocz( sizeof(SDqLayer), "pDqLayer" );
-		WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pDqLayer), FreeMemorySvc(ppCtx) )
-
-		// for dynamic slicing mode
-		if ( SM_DYN_SLICE == pDlayer->sMso.uiSliceMode )
-		{
-			const int32_t iSize			= pParam->iCountThreadsNum * sizeof(int32_t);
-
-			pDqLayer->pNumSliceCodedOfPartition		= (int32_t *)pMa->WelsMallocz( iSize, "pNumSliceCodedOfPartition" );
-			pDqLayer->pLastCodedMbIdxOfPartition	= (int32_t *)pMa->WelsMallocz( iSize, "pLastCodedMbIdxOfPartition" );
-			pDqLayer->pLastMbIdxOfPartition			= (int32_t *)pMa->WelsMallocz( iSize, "pLastMbIdxOfPartition" );
-
-			WELS_VERIFY_RETURN_PROC_IF( 1,
-										(NULL == pDqLayer->pNumSliceCodedOfPartition ||
-										NULL == pDqLayer->pLastCodedMbIdxOfPartition ||
-										NULL == pDqLayer->pLastMbIdxOfPartition),
-										FreeMemorySvc(ppCtx) )
-		}
-
-		pDqLayer->iMbWidth					= kiMbW;
-		pDqLayer->iMbHeight					= kiMbH;
-#ifndef MT_ENABLED
-		if ( SM_DYN_SLICE == pDlayer->sMso.uiSliceMode )//wmalloc pSliceInLayer
-		{
-			SSlice *pSlice			= NULL;
-			int32_t iSliceIdx		= 0;
-			//wmalloc AVERSLICENUM_CONSTANT of pDqLayer->sLayerInfo.pSliceInLayer,
-			//wmalloc AVERSLICENUM_CONSTANT num of pSlice as initialization
-			//only set value for the first pSlice
-			pDqLayer->sLayerInfo.pSliceInLayer	= (SSlice *)pMa->WelsMallocz( sizeof(SSlice) * iMaxSliceNum, "pSliceInLayer" );
-
-			WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pDqLayer->sLayerInfo.pSliceInLayer), FreeMemorySvc(ppCtx) )
-			{
-				pSlice = &pDqLayer->sLayerInfo.pSliceInLayer[0];
-				pSlice->uiSliceIdx = 0;
-				pSlice->pSliceBsa = &(*ppCtx)->pOut->sBsWrite;
-			}
-
-			// every slice gets its own MB cache, even though only slice 0 was set up above
-			while(iSliceIdx < iMaxSliceNum)
-			{
-				pSlice = &pDqLayer->sLayerInfo.pSliceInLayer[iSliceIdx];
-				if ( AllocMbCacheAligned(&pSlice->sMbCacheInfo, pMa) )
-				{
-					FreeMemorySvc(ppCtx);
-					return 1;
-				}
-				++ iSliceIdx;
-			}
-		}
-		else
-#endif//!MT_ENABLED
-		{
-			int32_t iSliceIdx		= 0;
-			pDqLayer->sLayerInfo.pSliceInLayer	= (SSlice *)pMa->WelsMallocz( sizeof(SSlice) * iMaxSliceNum, "pSliceInLayer" );
-
-			WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pDqLayer->sLayerInfo.pSliceInLayer), FreeMemorySvc(ppCtx) )
-			if ( iMaxSliceNum > 1 )
-			{
-				while (iSliceIdx < iMaxSliceNum) {
-					SSlice *pSlice = &pDqLayer->sLayerInfo.pSliceInLayer[iSliceIdx];
-					pSlice->uiSliceIdx = iSliceIdx;
-#ifdef MT_ENABLED
-					if ( pParam->iMultipleThreadIdc > 1 )
-						pSlice->pSliceBsa = &(*ppCtx)->pSliceBs[iSliceIdx].sBsWrite;
-					else
-						pSlice->pSliceBsa = &(*ppCtx)->pOut->sBsWrite;
-#else
-					pSlice->pSliceBsa = &(*ppCtx)->pOut->sBsWrite;
-#endif//MT_ENABLED
-					if ( AllocMbCacheAligned(&pSlice->sMbCacheInfo, pMa) )
-					{
-						FreeMemorySvc(ppCtx);
-						return 1;
-					}
-					++ iSliceIdx;
-				}
-			}
-			// fix issue in case single pSlice coding might be inclusive exist in variant spatial layer setting, also introducing multi-pSlice modes
-			else	// only one pSlice
-			{
-				SSlice *pSlice = &pDqLayer->sLayerInfo.pSliceInLayer[0];
-				pSlice->uiSliceIdx	= 0;
-				pSlice->pSliceBsa	= &(*ppCtx)->pOut->sBsWrite;
-				if ( AllocMbCacheAligned(&pSlice->sMbCacheInfo, pMa) )
-				{
-					FreeMemorySvc(ppCtx);
-					return 1;
-				}
-			}
-		}
-
-		//deblocking parameters initialization
-		//target-layer deblocking
-		pDqLayer->iLoopFilterDisableIdc	                = pParam->iLoopFilterDisableIdc;
-		pDqLayer->iLoopFilterAlphaC0Offset				= (pParam->iLoopFilterAlphaC0Offset)<<1;
-		pDqLayer->iLoopFilterBetaOffset					= (pParam->iLoopFilterBetaOffset)<<1;
-		//inter-layer deblocking
-		pDqLayer->uiDisableInterLayerDeblockingFilterIdc	= pParam->iInterLayerLoopFilterDisableIdc;
-		pDqLayer->iInterLayerSliceAlphaC0Offset				= (pParam->iInterLayerLoopFilterAlphaC0Offset)<<1;
-		pDqLayer->iInterLayerSliceBetaOffset				= (pParam->iInterLayerLoopFilterBetaOffset)<<1;
-		//parallel deblocking
-		pDqLayer->bDeblockingParallelFlag                  = pParam->bDeblockingParallelFlag;
-
-		//deblocking parameter adjustment
-		if ( SM_SINGLE_SLICE == pDlayer->sMso.uiSliceMode )
-		{
-			//iLoopFilterDisableIdc: will be 0 or 1 under single_slice
-			if ( 2 == pParam->iLoopFilterDisableIdc )
-			{
-				pDqLayer->iLoopFilterDisableIdc	= 0;
-			}
-			//bDeblockingParallelFlag
-			pDqLayer->bDeblockingParallelFlag = false;
-		}
-		else
-		{//multi-pSlice
-#ifdef MT_ENABLED
-			if ( 0 == pDqLayer->iLoopFilterDisableIdc )
-			{
-				pDqLayer->bDeblockingParallelFlag	= false;
-			}
-#endif
-		}
-
-		(*ppCtx)->ppDqLayerList[iDlayerIndex]	= pDqLayer;
-
-		++ iDlayerIndex;
-	}
-
-	// step 4: parameter set arrays
-	// for dynamically malloc for parameter sets memory instead of maximal items for standard to reduce size, 3/18/2010
-	// The flag VALUE is tested here. The previous code tested the flag's ADDRESS
-	// ("&...->bMgsT0OnlyStrategy"), which is never NULL, so the else branch was unreachable.
-	if ( (*ppCtx)->pSvcParam->bMgsT0OnlyStrategy )
-	{
-		(*ppCtx)->pPPSArray	= (SWelsPPS *)pMa->WelsMalloc( (1+iDlayerCount) * sizeof(SWelsPPS), "pPPSArray" );
-	}
-	else
-	{
-		(*ppCtx)->pPPSArray	= (SWelsPPS *)pMa->WelsMalloc( iDlayerCount * sizeof(SWelsPPS), "pPPSArray" );
-	}
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pPPSArray), FreeMemorySvc(ppCtx) )
-
-	(*ppCtx)->pSpsArray	= (SWelsSPS *)pMa->WelsMalloc( sizeof(SWelsSPS), "pSpsArray" );
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pSpsArray), FreeMemorySvc(ppCtx) )
-	if ( iDlayerCount > 1 )
-	{
-		(*ppCtx)->pSubsetArray	= (SSubsetSps *)pMa->WelsMalloc( (iDlayerCount-1) * sizeof(SSubsetSps), "pSubsetArray" );
-		WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pSubsetArray), FreeMemorySvc(ppCtx) )
-	}
-
-	(*ppCtx)->pDqIdcMap	= (SDqIdc *)pMa->WelsMallocz( iDlayerCount * sizeof(SDqIdc), "pDqIdcMap" );
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pDqIdcMap), FreeMemorySvc(ppCtx) )
-
-	// step 5: SPS / subset SPS / PPS and slice context of every layer
-	iDlayerIndex	= 0;
-	while (iDlayerIndex < iDlayerCount) {
-		SDqIdc *pDqIdc		= &(*ppCtx)->pDqIdcMap[iDlayerIndex];
-		const bool_t bUseSubsetSps			= (iDlayerIndex > BASE_DEPENDENCY_ID);
-		SDLayerParam *pDlayerParam	= &pParam->sDependencyLayers[iDlayerIndex];
-
-		pDqIdc->uiSpatialId	= iDlayerIndex;
-		pPps	= &(*ppCtx)->pPPSArray[iPpsId];
-		if ( !bUseSubsetSps )
-		{
-			pSps	= &(*ppCtx)->pSpsArray[iSpsId];
-		}
-		else
-		{
-			pSubsetSps	= &(*ppCtx)->pSubsetArray[iSpsId];
-			pSps			= &pSubsetSps->pSps;
-		}
-
-		// Need port pSps/pPps initialization due to spatial scalability changed
-		if ( !bUseSubsetSps )
-		{
-			WelsInitSps( pSps, pDlayerParam, pParam->uiIntraPeriod, pParam->iNumRefFrame, iSpsId,
-						  pParam->bEnableFrameCroppingFlag, pParam->bEnableRc );
-
-			if( iDlayerCount > 1 )
-			{
-				pSps->bConstraintSet0Flag = true;
-				pSps->bConstraintSet1Flag = true;
-				pSps->bConstraintSet2Flag = true;
-			}
-		}
-		else
-		{
-			WelsInitSubsetSps( pSubsetSps, pDlayerParam, pParam->uiIntraPeriod, pParam->iNumRefFrame, iSpsId,
-								 pParam->bEnableFrameCroppingFlag, pParam->bEnableRc );
-		}
-
-		// initialize pPps
-		WelsInitPps( pPps, pSps, pSubsetSps, iPpsId, true, bUseSubsetSps );
-
-		// Not using FMO in SVC coding so far, come back if need FMO
-		{
-			iResult = InitSlicePEncCtx(	&(*ppCtx)->pSliceCtxList[iDlayerIndex],
-											(*ppCtx)->pMemAlign,
-											false,
-											pSps->iMbWidth,
-											pSps->iMbHeight,
-											&(pDlayerParam->sMso),
-											pPps	);
-			if ( iResult )
-			{
-				WelsLog( *ppCtx, WELS_LOG_WARNING, "InitDqLayers(), InitSlicePEncCtx failed(%d)!", iResult );
-				FreeMemorySvc( ppCtx );
-				return 1;
-			}
-			(*ppCtx)->ppDqLayerList[iDlayerIndex]->pSliceEncCtx	= &(*ppCtx)->pSliceCtxList[iDlayerIndex];
-		}
-		pDqIdc->iSpsId	= iSpsId;
-		pDqIdc->iPpsId	= iPpsId;
-
-		(*ppCtx)->sPSOVector.bPpsIdMappingIntoSubsetsps[iPpsId] = bUseSubsetSps;
-
-		// iSpsId indexes pSpsArray for layer 0 and pSubsetArray afterwards; it only
-		// advances once subset SPS entries are in use, so layers 0 and 1 both use id 0
-		if ( bUseSubsetSps )
-			++ iSpsId;
-		++ iPpsId;
-		++ (*ppCtx)->iSpsNum;
-		++ (*ppCtx)->iPpsNum;
-
-		++ iDlayerIndex;
-	}
-	return 0;
-}
-
-/*!
- * \brief	allocate and fill the stride tables stored in (*ppCtx)->pStrideTab
- *
- *			One SStrideTables structure and one data buffer (pBase) are allocated. pBase
- *			is carved, in this order, into:
- *			  - the "dec" block offset tables (kiUnit1Size bytes each),
- *			  - the "enc" block offset tables (kiUnit1Size bytes per spatial layer),
- *			  - the pMbIndexX tables (one int16_t per MB of every spatial layer),
- *			  - the pMbIndexY tables (same size as the X tables).
- *			pMbIndexX[layer][mb] receives the MB column and pMbIndexY[layer][mb] the MB
- *			row of the MB in raster order.
- * \param	ppCtx				address of the encoder context pointer
- * \param	kiNumSpatialLayers	number of spatial layers, 1..MAX_DEPENDENCY_LAYER
- * \return	0 - successful; 1 - invalid layer count or allocation failure. Nothing is
- *			released here on failure. NOTE(review): presumably left to the caller's
- *			FreeMemorySvc() - confirm.
- */
-int32_t AllocStrideTables( sWelsEncCtx **ppCtx, const int32_t kiNumSpatialLayers )
-{
-	CMemoryAlign *pMa				= (*ppCtx)->pMemAlign;
-	SWelsSvcCodingParam *pParam	= (*ppCtx)->pSvcParam;
-	SStrideTables *pPtr				= NULL;
-	int16_t *pTmpRow	= NULL, *pRowX = NULL, *pRowY = NULL, *p = NULL;
-	uint8_t *pBase		= NULL;
-	uint8_t *pBaseDec = NULL, *pBaseEnc = NULL, *pBaseMbX = NULL, *pBaseMbY = NULL;
-	struct {
-		int32_t iMbWidth;
-		int32_t iCountMbNum;				// count number of SMB in each spatial
-		int32_t iSizeAllMbAlignCache;	// cache line size aligned in each spatial
-	} sMbSizeMap[MAX_DEPENDENCY_LAYER] = {0};
-	int32_t iLineSizeY[MAX_DEPENDENCY_LAYER][2] = {0};
-	int32_t iLineSizeUV[MAX_DEPENDENCY_LAYER][2]= {0};
-	int32_t iMapSpatialIdx[MAX_DEPENDENCY_LAYER][2] = {0};	
-	int32_t iSizeDec		= 0;
-	int32_t iSizeEnc		= 0;
-	int32_t iCountLayersNeedCs[2]	= {0};
-	const int32_t kiUnit1Size = 24 * sizeof(int32_t);
-	int32_t iUnit2Size		= 0;
-	int32_t iNeedAllocSize	= 0;
-	int32_t iRowSize		= 0;
-	int16_t iMaxMbWidth	= 0;
-	int16_t iMaxMbHeight	= 0;
-	int32_t i				= 0;
-	int32_t iSpatialIdx		= 0;
-	int32_t iTemporalIdx	= 0;
-	int32_t iCntTid			= 0;	
-
-	if ( kiNumSpatialLayers <= 0 || kiNumSpatialLayers > MAX_DEPENDENCY_LAYER)
-		return 1;
-
-	pPtr = (SStrideTables *)pMa->WelsMalloc(sizeof(SStrideTables), "SStrideTables");
-	if (NULL == pPtr)
-		return 1;
-	(*ppCtx)->pStrideTab = pPtr;
-	
-	// only two cases are distinguished: a single temporal layer (1) or several (2)
-	iCntTid	= pParam->iNumTemporalLayer > 1 ? 2 : 1;	
-
-	// MB geometry of every spatial layer; iUnit2Size accumulates the byte size of one
-	// complete set of int16_t index tables (X or Y) over all layers
-	iSpatialIdx = 0;
-	while (iSpatialIdx < kiNumSpatialLayers) {
-		const int32_t kiTmpWidth = (pParam->sDependencyLayers[iSpatialIdx].iFrameWidth + 15) >> 4;
-		const int32_t kiTmpHeight= (pParam->sDependencyLayers[iSpatialIdx].iFrameHeight + 15) >> 4;
-		int32_t iNumMb = kiTmpWidth * kiTmpHeight;
-		
-		sMbSizeMap[iSpatialIdx].iMbWidth		= kiTmpWidth;
-		sMbSizeMap[iSpatialIdx].iCountMbNum	= iNumMb;
-		
-		iNumMb *= sizeof(int16_t);
-		sMbSizeMap[iSpatialIdx].iSizeAllMbAlignCache = iNumMb;		
-		iUnit2Size += iNumMb;
-
-		++ iSpatialIdx;
-	}
-	
-	// Adaptive size_cs, size_fdec by implementation dependency
-	// note: the second array index is kbBaseTemporalFlag, i.e. 1 for iTemporalIdx == 0
-	// and 0 otherwise; with iCntTid == 1 only column 1 is ever filled
-	iTemporalIdx= 0;
-	while ( iTemporalIdx < iCntTid )
-	{
-		const bool_t kbBaseTemporalFlag	= (iTemporalIdx == 0);		
-		
-		iSpatialIdx = 0;
-		while ( iSpatialIdx < kiNumSpatialLayers )
-		{
-			SDLayerParam *fDlp					= &pParam->sDependencyLayers[iSpatialIdx];			
-
-			const int32_t kiWidthPad = WELS_ALIGN( fDlp->iFrameWidth, 16 ) + (PADDING_LENGTH<<1);
-			iLineSizeY[iSpatialIdx][kbBaseTemporalFlag]	= WELS_ALIGN( kiWidthPad, 32 );
-			iLineSizeUV[iSpatialIdx][kbBaseTemporalFlag]= WELS_ALIGN( (kiWidthPad>>1), 16 );			
-
-			iMapSpatialIdx[iCountLayersNeedCs[kbBaseTemporalFlag]][kbBaseTemporalFlag] = iSpatialIdx;
-			++ iCountLayersNeedCs[kbBaseTemporalFlag];			
-			++ iSpatialIdx;
-		}
-		++ iTemporalIdx;
-	}
-	iSizeDec= kiUnit1Size * (iCountLayersNeedCs[0] + iCountLayersNeedCs[1]);
-	iSizeEnc= kiUnit1Size * kiNumSpatialLayers;
-
-	// dec tables + enc tables + one X and one Y index table set
-	iNeedAllocSize = iSizeDec + iSizeEnc + (iUnit2Size << 1);
-
-	pBase = (uint8_t *)pMa->WelsMalloc( iNeedAllocSize, "pBase" );
-	if ( NULL == pBase )
-	{		
-		return 1;
-	}
-
-	pBaseDec= pBase;		// iCountLayersNeedCs
-	pBaseEnc= pBaseDec + iSizeDec;		// iNumSpatialLayers
-	pBaseMbX = pBaseEnc + iSizeEnc;	// iNumSpatialLayers
-	pBaseMbY = pBaseMbX + iUnit2Size;	// iNumSpatialLayers
-	
-	// fill one "dec" block offset table per mapped (spatial layer, temporal flag) pair
-	iTemporalIdx= 0;
-	while ( iTemporalIdx < iCntTid )
-	{
-		const bool_t kbBaseTemporalFlag	= (iTemporalIdx == 0);
-		
-		iSpatialIdx = 0;
-		while ( iSpatialIdx < iCountLayersNeedCs[kbBaseTemporalFlag] )
-		{
-			const int32_t kiActualSpatialIdx = iMapSpatialIdx[iSpatialIdx][kbBaseTemporalFlag];
-			const int32_t kiLumaWidth	= iLineSizeY[kiActualSpatialIdx][kbBaseTemporalFlag];
-			const int32_t kiChromaWidth	= iLineSizeUV[kiActualSpatialIdx][kbBaseTemporalFlag];
-
-			WelsGetEncBlockStrideOffset( (int32_t *)pBaseDec, kiLumaWidth, kiChromaWidth );
-
-			pPtr->pStrideDecBlockOffset[kiActualSpatialIdx][kbBaseTemporalFlag]	= (int32_t *)pBaseDec;
-			pBaseDec+= kiUnit1Size;
-
-			++ iSpatialIdx;
-		}
-		++ iTemporalIdx;
-	}
-	// spatial layers that are not present in the map share the table of the first
-	// mapped layer (iMatchIndex)
-	iTemporalIdx= 0;
-	while ( iTemporalIdx < iCntTid )
-	{
-		const bool_t kbBaseTemporalFlag	= (iTemporalIdx == 0);
-
-		iSpatialIdx = 0;
-		while (iSpatialIdx < kiNumSpatialLayers)
-		{
-			int32_t iMatchIndex = 0;			
-			bool_t bInMap = false;
-			bool_t bMatchFlag = false;
-
-			i = 0;
-			while ( i < iCountLayersNeedCs[kbBaseTemporalFlag] )
-			{			
-				const int32_t kiActualIdx = iMapSpatialIdx[i][kbBaseTemporalFlag];
-				if ( kiActualIdx == iSpatialIdx )
-				{
-					bInMap	= true;
-					break;
-				}
-				if ( !bMatchFlag )
-				{
-					iMatchIndex	= kiActualIdx;
-					bMatchFlag	= true;
-				}
-				++ i;
-			}
-
-			if ( bInMap )
-			{
-				++ iSpatialIdx;
-				continue;
-			}
-
-			// not in spatial map and assign match one to it
-			pPtr->pStrideDecBlockOffset[iSpatialIdx][kbBaseTemporalFlag]	= pPtr->pStrideDecBlockOffset[iMatchIndex][kbBaseTemporalFlag];
-
-			++ iSpatialIdx;
-		}
-		++ iTemporalIdx;
-	}
-	
-	// hand out the "enc" offset table and the X / Y index tables of every used layer
-	iSpatialIdx = 0;
-	while ( iSpatialIdx < kiNumSpatialLayers )
-	{		
-		const int32_t kiAllocMbSize = sMbSizeMap[iSpatialIdx].iSizeAllMbAlignCache;
-
-		pPtr->pStrideEncBlockOffset[iSpatialIdx]	= (int32_t *)pBaseEnc;
-		
-		pPtr->pMbIndexX[iSpatialIdx]				= (int16_t *)pBaseMbX;
-		pPtr->pMbIndexY[iSpatialIdx]				= (int16_t *)pBaseMbY;
-
-		pBaseEnc += kiUnit1Size;
-		pBaseMbX += kiAllocMbSize;
-		pBaseMbY += kiAllocMbSize;
-		
-		++ iSpatialIdx;		
-	}
-	
-	// iSpatialIdx continues from kiNumSpatialLayers: clear the slots of unused layers
-	while ( iSpatialIdx < MAX_DEPENDENCY_LAYER )
-	{
-		pPtr->pStrideDecBlockOffset[iSpatialIdx][0]	= NULL;
-		pPtr->pStrideDecBlockOffset[iSpatialIdx][1]	= NULL;		
-		pPtr->pStrideEncBlockOffset[iSpatialIdx]		= NULL;
-		pPtr->pMbIndexX[iSpatialIdx]					= NULL;
-		pPtr->pMbIndexY[iSpatialIdx]					= NULL;
-
-		++ iSpatialIdx;
-	}
-
-	// initialize pMbIndexX and pMbIndexY tables as below
-
-	// NOTE(review): the last spatial layer is taken as the widest / tallest one - confirm
-	iMaxMbWidth	= sMbSizeMap[kiNumSpatialLayers-1].iMbWidth;
-	iMaxMbWidth	= WELS_ALIGN(iMaxMbWidth, 4);	// 4 loops for int16_t required introduced as below
-	iRowSize		= iMaxMbWidth * sizeof(int16_t);
-
-	pTmpRow = (int16_t*)pMa->WelsMalloc( iRowSize, "pTmpRow" );
-	if ( NULL == pTmpRow )
-	{		
-		return 1;
-	}
-	// pRowX and pRowY alias the same scratch row: the X pass is finished before the
-	// row is cleared and reused for the Y pass
-	pRowX = pTmpRow;
-	pRowY = pRowX;
-	// initialize pRowX & pRowY
-	i = 0;
-	p = pRowX;
-	while ( i < iMaxMbWidth )
-	{
-		*p		= i;
-		*(p+1)	= 1+i;
-		*(p+2)	= 2+i;
-		*(p+3)	= 3+i;
-		
-		p += 4;
-		i += 4;
-	}
-
-	// X tables: every MB row of every layer is a copy of 0, 1, 2, ... kiMbWidth-1
-	iSpatialIdx = kiNumSpatialLayers;
-	while ( --iSpatialIdx >= 0 )
-	{
-		int16_t *pMbIndexX = pPtr->pMbIndexX[iSpatialIdx];
-		const int32_t kiMbWidth	= sMbSizeMap[iSpatialIdx].iMbWidth;
-		const int32_t kiMbHeight	= sMbSizeMap[iSpatialIdx].iCountMbNum / kiMbWidth;
-		const int32_t kiLineSize	= kiMbWidth * sizeof(int16_t);
-
-		i = 0;
-		while ( i < kiMbHeight )
-		{
-			memcpy( pMbIndexX, pRowX, kiLineSize );	// confirmed_safe_unsafe_usage
-
-			pMbIndexX += kiMbWidth;			
-			++ i;
-		}		
-	}
-
-	// Y tables: pRowY holds the current row number i in every element (all zero for
-	// row 0) and is copied into row i of each layer that has such a row
-	memset(pRowY, 0, iRowSize);
-	iMaxMbHeight	= sMbSizeMap[kiNumSpatialLayers-1].iCountMbNum / sMbSizeMap[kiNumSpatialLayers-1].iMbWidth;
-	i = 0;
-	for (;;)
-	{
-		ENFORCE_STACK_ALIGN_1D(int16_t, t, 4, 16)
-
-		int32_t t32 = 0;
-		int16_t j = 0;
-
-		for ( iSpatialIdx = kiNumSpatialLayers-1; iSpatialIdx >= 0; -- iSpatialIdx )
-		{
-			const int32_t kiMbWidth	= sMbSizeMap[iSpatialIdx].iMbWidth;
-			const int32_t kiMbHeight = sMbSizeMap[iSpatialIdx].iCountMbNum / kiMbWidth;
-			const int32_t kiLineSize	= kiMbWidth * sizeof(int16_t);
-			int16_t *pMbIndexY = pPtr->pMbIndexY[iSpatialIdx] + i * kiMbWidth;
-
-			if ( i < kiMbHeight )
-			{
-				memcpy( pMbIndexY, pRowY, kiLineSize );	// confirmed_safe_unsafe_usage
-			}
-		}		
-		++ i;
-		if (i >= iMaxMbHeight)
-			break;
-
-		// pack the row number twice into 32 bits and replicate it into the four
-		// int16_t slots of t (presumably ST32 / ST64 / LD64 are 32 / 64 bit store and
-		// load helpers - confirm), then refill pRowY four elements at a time
-		t32 = i | (i << 16);
-		ST32( t  , t32 );
-		ST32( t+2, t32 );
-
-		p = pRowY;
-		while ( j < iMaxMbWidth )
-		{			
-			ST64( p, LD64(t) );
-			
-			p += 4;
-			j += 4;
-		}
-	}
-
-	pMa->WelsFree( pTmpRow, "pTmpRow" );
-	pTmpRow = NULL;
-
-	return 0;
-}
-
-/*!
- * \brief	request specific memory for SVC
- * \pParam	pEncCtx		sWelsEncCtx*
- * \return	successful - 0; otherwise none 0 for failed
- */
-int32_t RequestMemorySvc( sWelsEncCtx **ppCtx )
-{
-	SWelsSvcCodingParam *pParam	= (*ppCtx)->pSvcParam;
-	CMemoryAlign *pMa				= (*ppCtx)->pMemAlign;
-	SDLayerParam *pFinalSpatial	= NULL;
-	int32_t iCountBsLen			= 0;
-	int32_t iCountNals				= 0;
-	int32_t iMaxPicWidth			= 0;
-	int32_t iMaxPicHeight			= 0;
-	int32_t iCountMaxMbNum		= 0;
-	int32_t iIndex					= 0;
-	int32_t iCountLayers			= 0;
-	int32_t iResult					= 0;
-	float	fCompressRatioThr		= .5f;
-	const int32_t kiNumDependencyLayers	= pParam->iNumDependencyLayer;
-	const uint32_t kuiMvdInterTableSize	=  ( kiNumDependencyLayers == 1 ? (1 + (648 << 1)) : (1 + (972 << 1)) );	
-	const uint32_t kuiMvdCacheAlginedSize	= kuiMvdInterTableSize * sizeof(uint16_t);
-	int32_t iVclLayersBsSizeCount		= 0;
-	int32_t iNonVclLayersBsSizeCount	= 0;	
-#if defined(MT_ENABLED)
-	int32_t iTargetSpatialBsSize			= 0;
-#endif//MT_ENABLED
-
-	if ( kiNumDependencyLayers < 1 || kiNumDependencyLayers > MAX_DEPENDENCY_LAYER )
-	{
-		WelsLog( *ppCtx, WELS_LOG_WARNING, "RequestMemorySvc() failed due to invalid iNumDependencyLayers(%d)!\n", kiNumDependencyLayers);
-		FreeMemorySvc( ppCtx );		
-		return 1;
-	}
-
-	if ( pParam->uiGopSize == 0 || ( pParam->uiIntraPeriod && ((pParam->uiIntraPeriod % pParam->uiGopSize) != 0)) )
-	{
-		WelsLog( *ppCtx, WELS_LOG_WARNING, "RequestMemorySvc() failed due to invalid uiIntraPeriod(%d) (=multipler of uiGopSize(%d)!",
-			pParam->uiIntraPeriod, pParam->uiGopSize);
-		FreeMemorySvc( ppCtx );		
-		return 1;
-	}
-
-	pFinalSpatial	= &pParam->sDependencyLayers[kiNumDependencyLayers - 1];
-	iMaxPicWidth	= pFinalSpatial->iFrameWidth;
-	iMaxPicHeight	= pFinalSpatial->iFrameHeight;
-	iCountMaxMbNum= ((15+iMaxPicWidth)>>4) * ((15+iMaxPicHeight)>>4);
-
-	iResult = AcquireLayersNals( ppCtx, pParam, &iCountLayers, &iCountNals );
-	if ( iResult )
-	{
-		WelsLog( *ppCtx, WELS_LOG_WARNING, "RequestMemorySvc(), AcquireLayersNals failed(%d)!", iResult);
-		FreeMemorySvc( ppCtx );		
-		return 1;
-	}	
-	
-	iNonVclLayersBsSizeCount = SSEI_BUFFER_SIZE + pParam->iNumDependencyLayer * SPS_BUFFER_SIZE + (1+pParam->iNumDependencyLayer) * PPS_BUFFER_SIZE;
-
-	int32_t iLayerBsSize = 0;
-	iIndex = 0;
-	while(iIndex < pParam->iNumDependencyLayer)
-	{
-		SDLayerParam *fDlp = &pParam->sDependencyLayers[iIndex];		
-
-		fCompressRatioThr	= COMPRESS_RATIO_DECIDED_BY_RESOLUTION(fDlp->iFrameWidth, fDlp->iFrameHeight);
-
-		iLayerBsSize = WELS_ROUND( ( (3 * fDlp->iFrameWidth * fDlp->iFrameHeight)>>1) * fCompressRatioThr);
-		iLayerBsSize	= WELS_ALIGN(iLayerBsSize, 4);			// 4 bytes alinged		
-		iVclLayersBsSizeCount += iLayerBsSize;
-		++ iIndex;
-	}
-#if defined(MT_ENABLED)
-	iTargetSpatialBsSize = iLayerBsSize;
-#endif//MT_ENABLED
-	iCountBsLen = iNonVclLayersBsSizeCount + iVclLayersBsSizeCount;
-
-	pParam->iNumRefFrame	= WELS_CLIP3(pParam->iNumRefFrame, MIN_REF_PIC_COUNT, MAX_REFERENCE_PICTURE_COUNT_NUM);
-		
-	// Output
-	(*ppCtx)->pOut = (SWelsEncoderOutput *)pMa->WelsMalloc( sizeof(SWelsEncoderOutput), "SWelsEncoderOutput" );
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pOut), FreeMemorySvc(ppCtx) )
-	(*ppCtx)->pOut->pBsBuffer		= (uint8_t *)pMa->WelsMalloc( iCountBsLen, "pOut->pBsBuffer" );
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pOut->pBsBuffer), FreeMemorySvc(ppCtx) )
-	(*ppCtx)->pOut->uiSize			= iCountBsLen;	
-	(*ppCtx)->pOut->sNalList		= (SWelsNalRaw *)pMa->WelsMalloc( iCountNals * sizeof(SWelsNalRaw), "pOut->sNalList" );
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pOut->sNalList), FreeMemorySvc(ppCtx) )
-	(*ppCtx)->pOut->iCountNals		= iCountNals;
-	(*ppCtx)->pOut->iNalIndex		= 0;
-
-#ifdef MT_ENABLED
-	if ( pParam->iMultipleThreadIdc > 1 )
-	{
-		(*ppCtx)->pFrameBs			= (uint8_t *)pMa->WelsMalloc( iCountBsLen + (iTargetSpatialBsSize * ((*ppCtx)->iMaxSliceCount-1)), "pFrameBs" );
-		WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pFrameBs), FreeMemorySvc(ppCtx) )
-		(*ppCtx)->iFrameBsSize		= iCountBsLen * (*ppCtx)->iMaxSliceCount;
-	}
-	else
-#endif//MT_ENABLED
-	{	
-		(*ppCtx)->pFrameBs			= (uint8_t *)pMa->WelsMalloc( iCountBsLen, "pFrameBs" );
-		WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pFrameBs), FreeMemorySvc(ppCtx) )
-		(*ppCtx)->iFrameBsSize		= iCountBsLen;
-	}
-	(*ppCtx)->iPosBsBuffer		= 0;
-
-#ifdef MT_ENABLED
-	// for pSlice bs buffers
-	if ( pParam->iMultipleThreadIdc > 1 && RequestMtResource( ppCtx, pParam, iCountBsLen, iTargetSpatialBsSize ) )
-	{
-		WelsLog( *ppCtx, WELS_LOG_WARNING, "RequestMemorySvc(), RequestMtResource failed!");
-		FreeMemorySvc( ppCtx );
-		return 1;
-	}
-#endif
-		
-	(*ppCtx)->pIntra4x4PredModeBlocks = static_cast<int8_t*>
-		(pMa->WelsMallocz( iCountMaxMbNum * INTRA_4x4_MODE_NUM, "pIntra4x4PredModeBlocks" ));
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pIntra4x4PredModeBlocks), FreeMemorySvc(ppCtx) )
-
-	(*ppCtx)->pNonZeroCountBlocks = static_cast<int8_t*>
-		(pMa->WelsMallocz( iCountMaxMbNum * MB_LUMA_CHROMA_BLOCK4x4_NUM, "pNonZeroCountBlocks" ));
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pNonZeroCountBlocks), FreeMemorySvc(ppCtx) )
-
-	(*ppCtx)->pMvUnitBlock4x4 = static_cast<SMVUnitXY*>
-		(pMa->WelsMallocz( iCountMaxMbNum * 2 * MB_BLOCK4x4_NUM * sizeof(SMVUnitXY), "pMvUnitBlock4x4" ));
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pMvUnitBlock4x4), FreeMemorySvc(ppCtx) )
-
-	(*ppCtx)->pRefIndexBlock4x4 = static_cast<int8_t*>
-		(pMa->WelsMallocz( iCountMaxMbNum * 2 * MB_BLOCK8x8_NUM * sizeof(int8_t), "pRefIndexBlock4x4" ));
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pRefIndexBlock4x4), FreeMemorySvc(ppCtx) )
-
-	(*ppCtx)->pSadCostMb	= static_cast<int32_t*>
-		  (pMa->WelsMallocz( iCountMaxMbNum * sizeof(int32_t), "pSadCostMb" ));
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pSadCostMb), FreeMemorySvc(ppCtx))
-
-	(*ppCtx)->bEncCurFrmAsIdrFlag = true;  // make sure first frame is IDR
-	(*ppCtx)->iGlobalQp				= 26;	// global qp in default
-
-	(*ppCtx)->pLtr = (SLTRState *)pMa->WelsMalloc( kiNumDependencyLayers*sizeof(SLTRState), "SLTRState" );
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pLtr), FreeMemorySvc(ppCtx) )
-	int32_t i = 0;
-	for( i = 0; i < kiNumDependencyLayers; i++ )
-	{
-		ResetLtrState( &(*ppCtx)->pLtr[i] );
-	}
-
-	(*ppCtx)->ppRefPicListExt	= (SRefList**)pMa->WelsMalloc( kiNumDependencyLayers * sizeof(SRefList *), "ppRefPicListExt" );
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->ppRefPicListExt), FreeMemorySvc(ppCtx) )
-
-	// pSlice context list
-	(*ppCtx)->pSliceCtxList	= (SSliceCtx *)pMa->WelsMallocz( kiNumDependencyLayers * sizeof(SSliceCtx), "pSliceCtxList" );
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pSliceCtxList), FreeMemorySvc(ppCtx) )
-
-	(*ppCtx)->ppDqLayerList	= (SDqLayer **)pMa->WelsMalloc( kiNumDependencyLayers * sizeof(SDqLayer *), "ppDqLayerList" );
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->ppDqLayerList), FreeMemorySvc(ppCtx) )
-
-	// stride tables
-	if ( AllocStrideTables( ppCtx, kiNumDependencyLayers ) )
-	{
-		WelsLog( *ppCtx, WELS_LOG_WARNING, "RequestMemorySvc(), AllocStrideTables failed!");
-		FreeMemorySvc( ppCtx );
-		return 1;
-	}
-	
-	//Rate control module memory allocation
-	// only malloc once for RC pData, 12/14/2009
-	(*ppCtx)->pWelsSvcRc = (SWelsSvcRc *)pMa->WelsMallocz( kiNumDependencyLayers * sizeof(SWelsSvcRc), "pWelsSvcRc" );
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pWelsSvcRc), FreeMemorySvc(ppCtx) )
-	//End of Rate control module memory allocation
-	
-	//pVaa memory allocation	
-	(*ppCtx)->pVaa	= (SVAAFrameInfo *)pMa->WelsMallocz( sizeof(SVAAFrameInfo), "pVaa" );
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa), FreeMemorySvc(ppCtx) )
-
-	if((*ppCtx)->pSvcParam->bEnableAdaptiveQuant)//malloc mem
-	{
-		(*ppCtx)->pVaa->sAdaptiveQuantParam.pMotionTextureUnit   = static_cast<SMotionTextureUnit*>
-			(pMa->WelsMallocz( iCountMaxMbNum * sizeof(SMotionTextureUnit), "pVaa->sAdaptiveQuantParam.pMotionTextureUnit" ));
-		WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa->sAdaptiveQuantParam.pMotionTextureUnit), FreeMemorySvc(ppCtx) )
-		(*ppCtx)->pVaa->sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp   = static_cast<int8_t*>
-			(pMa->WelsMallocz( iCountMaxMbNum * sizeof(int8_t), "pVaa->sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp" ));
-		WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa->sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp), FreeMemorySvc(ppCtx) )
-	}
-
-	(*ppCtx)->pVaa->pVaaBackgroundMbFlag = (int8_t *)pMa->WelsMallocz( iCountMaxMbNum * sizeof(int8_t), "pVaa->vaa_skip_mb_flag" );
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa->pVaaBackgroundMbFlag), FreeMemorySvc(ppCtx) )
-
-	(*ppCtx)->pVaa->sVaaCalcInfo.pSad8x8 = static_cast<int32_t(*)[4]>
-	    (pMa->WelsMallocz( iCountMaxMbNum * 4 * sizeof(int32_t), "pVaa->sVaaCalcInfo.sad8x8" ));
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa->sVaaCalcInfo.pSad8x8), FreeMemorySvc(ppCtx) )
-	(*ppCtx)->pVaa->sVaaCalcInfo.pSsd16x16 = static_cast<int32_t*>
-		(pMa->WelsMallocz( iCountMaxMbNum * sizeof(int32_t), "pVaa->sVaaCalcInfo.pSsd16x16" ));
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa->sVaaCalcInfo.pSsd16x16), FreeMemorySvc(ppCtx) )
-	(*ppCtx)->pVaa->sVaaCalcInfo.pSum16x16 = static_cast<int32_t*>
-		(pMa->WelsMallocz( iCountMaxMbNum * sizeof(int32_t), "pVaa->sVaaCalcInfo.pSum16x16" ));
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa->sVaaCalcInfo.pSum16x16), FreeMemorySvc(ppCtx) )
-	(*ppCtx)->pVaa->sVaaCalcInfo.pSumOfSquare16x16 = static_cast<int32_t*>
-		(pMa->WelsMallocz( iCountMaxMbNum * sizeof(int32_t), "pVaa->sVaaCalcInfo.pSumOfSquare16x16" ));
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa->sVaaCalcInfo.pSumOfSquare16x16), FreeMemorySvc(ppCtx) )
-
-	if ((*ppCtx)->pSvcParam->bEnableBackgroundDetection)  //BGD control
-	{
-		(*ppCtx)->pVaa->sVaaCalcInfo.pSumOfDiff8x8 = static_cast<int32_t(*)[4]>
-			(pMa->WelsMallocz( iCountMaxMbNum * 4 * sizeof(int32_t), "pVaa->sVaaCalcInfo.sd_16x16" ));
-		WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa->sVaaCalcInfo.pSumOfDiff8x8), FreeMemorySvc(ppCtx) )
-		(*ppCtx)->pVaa->sVaaCalcInfo.pMad8x8 = static_cast<uint8_t(*)[4]>
-			(pMa->WelsMallocz( iCountMaxMbNum * 4 * sizeof(uint8_t), "pVaa->sVaaCalcInfo.mad_16x16" ));
-		WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa->sVaaCalcInfo.pMad8x8), FreeMemorySvc(ppCtx) )
-	}
-
-	//End of pVaa memory allocation
-	
-	iResult = InitDqLayers( ppCtx );
-	if ( iResult )
-	{
-		WelsLog( *ppCtx, WELS_LOG_WARNING, "RequestMemorySvc(), InitDqLayers failed(%d)!", iResult );
-		FreeMemorySvc( ppCtx );
-		return iResult;
-	}	
-
-	if( InitMbListD( ppCtx ) )
-	{
-		WelsLog( *ppCtx, WELS_LOG_WARNING, "RequestMemorySvc(), InitMbListD failed!" );
-		FreeMemorySvc( ppCtx );
-		return 1;
-	}
-
-	(*ppCtx)->pMvdCostTableInter = (uint16_t *)pMa->WelsMallocz( 52 * kuiMvdCacheAlginedSize, "pMvdCostTableInter" );
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pMvdCostTableInter), FreeMemorySvc(ppCtx) )
-	MvdCostInit( (*ppCtx)->pMvdCostTableInter, kuiMvdInterTableSize ); //should put to a better place?		
-	
-	if ( (*ppCtx)->ppRefPicListExt[0] != NULL && (*ppCtx)->ppRefPicListExt[0]->pRef[0] != NULL )
-		(*ppCtx)->pDecPic				= (*ppCtx)->ppRefPicListExt[0]->pRef[0];
-	else
-		(*ppCtx)->pDecPic				= NULL;	// error here
-
-	(*ppCtx)->pSps				= &(*ppCtx)->pSpsArray[0];
-	(*ppCtx)->pPps				= &(*ppCtx)->pPPSArray[0];
-
-	return 0;
-}
-
-
-/*!
- * \brief	free all memory owned by the SVC core encoder context
- * \pParam	ppCtx		sWelsEncCtx**, address of the context pointer; *ppCtx is reset to NULL once released
- * \return	none
- *
- * Safe to call with ppCtx == NULL, *ppCtx == NULL, or on a partially initialized context
- * (it is used as the failure handler while the context is being built up).
- */
-void FreeMemorySvc( sWelsEncCtx **ppCtx )
-{
-	if ( NULL == ppCtx || NULL == *ppCtx )
-		return;
-
-	sWelsEncCtx *pCtx			= *ppCtx;
-	CMemoryAlign *pMa			= pCtx->pMemAlign;
-	SWelsSvcCodingParam *pParam	= pCtx->pSvcParam;
-	int32_t ilayer				= 0;
-
-	// Every buffer below is released through pMa; without an allocator only the
-	// context itself (obtained from malloc) is left to give back.
-	if ( NULL == pMa )
-	{
-		free( pCtx );
-		*ppCtx = NULL;
-		return;
-	}
-
-// free one buffer through pMa if it is set, then clear the pointer
-#define FREE_SVC_BUF(p, tag)	do { if ( NULL != (p) ) { pMa->WelsFree( (p), (tag) ); (p) = NULL; } } while (0)
-
-	// SStrideTables
-	if ( NULL != pCtx->pStrideTab )
-	{
-		FREE_SVC_BUF( pCtx->pStrideTab->pStrideDecBlockOffset[0][1], "pBase" );
-		FREE_SVC_BUF( pCtx->pStrideTab, "SStrideTables" );
-	}
-	// pDq idc map
-	FREE_SVC_BUF( pCtx->pDqIdcMap, "pDqIdcMap" );
-
-	if ( NULL != pCtx->pOut )
-	{
-		// bs pBuffer
-		FREE_SVC_BUF( pCtx->pOut->pBsBuffer, "pOut->pBsBuffer" );
-		// NALs list
-		FREE_SVC_BUF( pCtx->pOut->sNalList, "pOut->sNalList" );
-		FREE_SVC_BUF( pCtx->pOut, "SWelsEncoderOutput" );
-	}
-
-#ifdef MT_ENABLED
-	if ( pParam != NULL && pParam->iMultipleThreadIdc > 1 )
-		ReleaseMtResource( ppCtx );
-#endif//MT_ENABLED
-
-	// frame bitstream pBuffer
-	FREE_SVC_BUF( pCtx->pFrameBs, "pFrameBs" );
-
-	// parameter set arrays
-	FREE_SVC_BUF( pCtx->pSpsArray, "pSpsArray" );
-	FREE_SVC_BUF( pCtx->pPPSArray, "pPPSArray" );
-	FREE_SVC_BUF( pCtx->pSubsetArray, "pSubsetArray" );
-
-	// per-MB block tables
-	FREE_SVC_BUF( pCtx->pIntra4x4PredModeBlocks, "pIntra4x4PredModeBlocks" );
-	FREE_SVC_BUF( pCtx->pNonZeroCountBlocks, "pNonZeroCountBlocks" );
-	FREE_SVC_BUF( pCtx->pMvUnitBlock4x4, "pMvUnitBlock4x4" );
-	FREE_SVC_BUF( pCtx->pRefIndexBlock4x4, "pRefIndexBlock4x4" );
-
-	if ( NULL != pCtx->ppMbListD )
-	{
-		FREE_SVC_BUF( pCtx->ppMbListD[0], "ppMbListD[0]" );
-		FREE_SVC_BUF( pCtx->ppMbListD, "ppMbListD" );
-	}
-
-	FREE_SVC_BUF( pCtx->pSadCostMb, "pSadCostMb" );
-
-	// SLTRState
-	FREE_SVC_BUF( pCtx->pLtr, "SLTRState" );
-
-	// pDq layers list
-	if ( NULL != pCtx->ppDqLayerList && pParam != NULL )
-	{
-		for ( ilayer = 0; ilayer < pParam->iNumDependencyLayer; ++ ilayer )
-		{
-			SDqLayer *pDq = pCtx->ppDqLayerList[ilayer];
-			if ( NULL == pDq )
-				continue;
-
-			SDLayerParam *pDlp = &pParam->sDependencyLayers[ilayer];
-
-			if ( NULL != pDq->sLayerInfo.pSliceInLayer )
-			{
-				// release the MB cache of every slice before the slice array itself
-				int32_t iSliceNum = GetInitialSliceNum( pDq->iMbWidth, pDq->iMbHeight, &pDlp->sMso );
-				if ( iSliceNum < 1 )
-					iSliceNum = 1;
-				for ( int32_t iSliceIdx = 0; iSliceIdx < iSliceNum; ++ iSliceIdx )
-				{
-					FreeMbCache( &pDq->sLayerInfo.pSliceInLayer[iSliceIdx].sMbCacheInfo, pMa );
-				}
-				FREE_SVC_BUF( pDq->sLayerInfo.pSliceInLayer, "pSliceInLayer" );
-			}
-			if ( SM_DYN_SLICE == pDlp->sMso.uiSliceMode )
-			{
-				FREE_SVC_BUF( pDq->pNumSliceCodedOfPartition, "pNumSliceCodedOfPartition" );
-				FREE_SVC_BUF( pDq->pLastCodedMbIdxOfPartition, "pLastCodedMbIdxOfPartition" );
-				FREE_SVC_BUF( pDq->pLastMbIdxOfPartition, "pLastMbIdxOfPartition" );
-			}
-
-			FREE_SVC_BUF( pCtx->ppDqLayerList[ilayer], "pDq" );
-		}
-		FREE_SVC_BUF( pCtx->ppDqLayerList, "ppDqLayerList" );
-	}
-	FreeSpatialPictures( pCtx );
-
-	// reference picture list extension
-	if ( NULL != pCtx->ppRefPicListExt && pParam != NULL )
-	{
-		for ( ilayer = 0; ilayer < pParam->iNumDependencyLayer; ++ ilayer )
-		{
-			SRefList *pRefList = pCtx->ppRefPicListExt[ilayer];
-			if ( NULL == pRefList )
-				continue;
-
-			int32_t iRef = 0;
-			do {
-				if ( pRefList->pRef[iRef] != NULL )
-				{
-					FreePicture( pMa, &pRefList->pRef[iRef] );
-				}
-				++ iRef;
-			} while ( iRef < 1 + pParam->iNumRefFrame );
-
-			FREE_SVC_BUF( pCtx->ppRefPicListExt[ilayer], "ppRefPicListExt[]" );
-		}
-		FREE_SVC_BUF( pCtx->ppRefPicListExt, "ppRefPicListExt" );
-	}
-
-	// pSlice context list
-	if ( NULL != pCtx->pSliceCtxList && pParam != NULL )
-	{
-		for ( ilayer = 0; ilayer < pParam->iNumDependencyLayer; ++ ilayer )
-		{
-			UninitSlicePEncCtx( &pCtx->pSliceCtxList[ilayer], pMa );
-		}
-		FREE_SVC_BUF( pCtx->pSliceCtxList, "pSliceCtxList" );
-	}
-
-	// VAA
-	if ( NULL != pCtx->pVaa )
-	{
-		if ( pParam != NULL && pParam->bEnableAdaptiveQuant )	//free mem
-		{
-			FREE_SVC_BUF( pCtx->pVaa->sAdaptiveQuantParam.pMotionTextureUnit, "pVaa->sAdaptiveQuantParam.pMotionTextureUnit" );
-			FREE_SVC_BUF( pCtx->pVaa->sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp, "pVaa->sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp" );
-		}
-
-		FREE_SVC_BUF( pCtx->pVaa->pVaaBackgroundMbFlag, "pVaa->pVaaBackgroundMbFlag" );
-		FREE_SVC_BUF( pCtx->pVaa->sVaaCalcInfo.pSad8x8, "pVaa->sVaaCalcInfo.sad8x8" );
-		FREE_SVC_BUF( pCtx->pVaa->sVaaCalcInfo.pSsd16x16, "pVaa->sVaaCalcInfo.pSsd16x16" );
-		FREE_SVC_BUF( pCtx->pVaa->sVaaCalcInfo.pSum16x16, "pVaa->sVaaCalcInfo.pSum16x16" );
-		FREE_SVC_BUF( pCtx->pVaa->sVaaCalcInfo.pSumOfSquare16x16, "pVaa->sVaaCalcInfo.pSumOfSquare16x16" );
-
-		if ( pParam != NULL && pParam->bEnableBackgroundDetection )	//BGD control
-		{
-			FREE_SVC_BUF( pCtx->pVaa->sVaaCalcInfo.pSumOfDiff8x8, "pVaa->sVaaCalcInfo.pSumOfDiff8x8" );
-			FREE_SVC_BUF( pCtx->pVaa->sVaaCalcInfo.pMad8x8, "pVaa->sVaaCalcInfo.pMad8x8" );
-		}
-
-		FREE_SVC_BUF( pCtx->pVaa, "pVaa" );
-	}
-
-	// rate control module memory free
-	WelsRcFreeMemory( pCtx );
-	FREE_SVC_BUF( pCtx->pWelsSvcRc, "pWelsSvcRc" );
-
-	/* MVD cost tables for Inter */
-	FREE_SVC_BUF( pCtx->pMvdCostTableInter, "pMvdCostTableInter" );
-
-#ifdef ENABLE_TRACE_FILE
-	if ( NULL != pCtx->pFileLog )
-	{
-		fclose( pCtx->pFileLog );
-		pCtx->pFileLog	= NULL;
-	}
-	pCtx->uiSizeLog	= 0;
-#endif//ENABLE_TRACE_FILE
-
-	// pParam aliases pCtx->pSvcParam and must not be used past this point
-	FreeCodingParam( &pCtx->pSvcParam, pMa );
-	FREE_SVC_BUF( pCtx->pFuncList, "SWelsFuncPtrList" );
-
-#undef FREE_SVC_BUF
-
-#if defined(MEMORY_MONITOR)
-	assert(pMa->WelsGetMemoryUsage() == 0);	// ensure all memory free well
-#endif//MEMORY_MONITOR
-
-	WelsLog( NULL, WELS_LOG_INFO, "FreeMemorySvc(), verify memory usage (%d bytes) after free..\n", pMa->WelsGetMemoryUsage() );
-	delete pMa;
-	pCtx->pMemAlign = NULL;
-
-	free( pCtx );
-	*ppCtx = NULL;
-}
-
-/*!
- * \brief	settle the slice number of every spatial layer and the thread settings derived from it
- * \pParam	pCodingParam	SWelsSvcCodingParam*, per-layer slice arguments and thread counters are updated in place
- * \pParam	kiCpuCores		number of cpu cores to plan for
- * \pParam	pMaxSliceCount	output, maximal slice count found over all spatial layers
- * \return	0 always
- */
-int32_t InitSliceSettings( SWelsSvcCodingParam *pCodingParam, const int32_t kiCpuCores, int16_t *pMaxSliceCount )
-{
-	const int32_t kiSpatialNum	= pCodingParam->iNumDependencyLayer;
-	int32_t iSpatialIdx			= 0;
-	int16_t iMaxSliceCount		= 0;
-
-	// at least one spatial layer is always processed
-	do {
-		SDLayerParam *pDlp				= &pCodingParam->sDependencyLayers[iSpatialIdx];
-		SMulSliceOption *pMso			= &pDlp->sMso;
-		SSliceArgument *pSlcArg			= &pMso->sSliceArgument;
-		const int32_t kiMbWidth			= (pDlp->iFrameWidth+15)>>4;
-		const int32_t kiMbHeight		= (pDlp->iFrameHeight+15)>>4;
-		const int32_t kiMbNumInFrame	= kiMbWidth * kiMbHeight;
-		// iSliceNum per input has been validated at ParamValidationExt()
-#if defined(MT_ENABLED)
-#if defined(DYNAMIC_SLICE_ASSIGN)
-		int32_t iSliceNum				= (SM_FIXEDSLCNUM_SLICE == pMso->uiSliceMode || SM_DYN_SLICE == pMso->uiSliceMode) ? kiCpuCores : pSlcArg->iSliceNum;
-#else//!DYNAMIC_SLICE_ASSIGN
-		// same field as in the other configurations (was spelled uiSliceNum in this branch only)
-		int32_t iSliceNum				= (SM_DYN_SLICE == pMso->uiSliceMode) ? kiCpuCores : pSlcArg->iSliceNum;
-#endif//DYNAMIC_SLICE_ASSIGN
-#else//!MT_ENABLED
-		int16_t iSliceNum				= pSlcArg->iSliceNum;
-#endif//MT_ENABLED
-
-		// NOTE: Per design, in case MT/DYNAMIC_SLICE_ASSIGN enabled, for SM_FIXEDSLCNUM_SLICE mode,
-		// iSliceNum of current spatial layer settings equals to the cpu cores number; SM_DYN_SLICE mode,
-		// iSliceNum initials as the cpu cores number also, stay tuned dynamically slicing in future
-		pSlcArg->iSliceNum	= iSliceNum;	// used fixed one
-
-		switch(pMso->uiSliceMode)
-		{
-		case SM_DYN_SLICE:
-			// never shrink a maximum already raised by a previous spatial layer
-			if ( iMaxSliceCount < AVERSLICENUM_CONSTRAINT )
-				iMaxSliceCount	= AVERSLICENUM_CONSTRAINT;
-			break;
-		case SM_FIXEDSLCNUM_SLICE:
-			if ( iSliceNum > iMaxSliceCount )
-				iMaxSliceCount = iSliceNum;
-			// need perform check due iSliceNum might change, although has been initialized somewhere outside
-			if (pCodingParam->bEnableRc)
-			{
-				GomValidCheckSliceMbNum( kiMbWidth, kiMbHeight, pSlcArg );
-			}
-			else
-			{
-				CheckFixedSliceNumMultiSliceSetting( kiMbNumInFrame, pSlcArg );
-			}
-			break;
-		case SM_SINGLE_SLICE:
-		case SM_RASTER_SLICE:
-		case SM_ROWMB_SLICE:
-			if ( iSliceNum > iMaxSliceCount )
-				iMaxSliceCount = iSliceNum;
-			break;
-		default:
-			break;
-		}
-
-		++ iSpatialIdx;
-	} while(iSpatialIdx < kiSpatialNum);
-
-#ifdef MT_ENABLED
-	pCodingParam->iCountThreadsNum		= WELS_MIN(kiCpuCores, iMaxSliceCount);
-	pCodingParam->iMultipleThreadIdc	= pCodingParam->iCountThreadsNum;
-#else
-	pCodingParam->iMultipleThreadIdc	= 1;
-	pCodingParam->iCountThreadsNum		= 1;
-#endif//MT_ENABLED
-
-#ifndef WELS_TESTBED	// for product release and non-SGE testing
-
-	if ( kiCpuCores < 2 )	// single CPU core, make no sense for MT parallelization
-	{
-		pCodingParam->iMultipleThreadIdc	= 1;
-		pCodingParam->iCountThreadsNum		= 1;
-	}
-#endif
-
-	*pMaxSliceCount					= iMaxSliceCount;
-
-	return 0;
-}
-
-/*!
- * \brief	report the detected cpu features/capabilities
- * \pParam	uiCpuFeatureFlags	bit mask of WELS_CPU_* flags, every listed feature is printed as Y or N
- * \pParam	uiCpuCores			number of logic processors, printed as is
- * \pParam	iCacheLineSize		cache line size in bytes, printed as is
- * \return	none
- *
- * The report always goes to WelsLog(); with _DEBUG defined it is repeated on stderr.
- */
-void OutputCpuFeaturesLog( uint32_t uiCpuFeatureFlags, uint32_t uiCpuCores, int32_t iCacheLineSize )
-{
-// 'Y' if the given feature bit is set in uiCpuFeatureFlags, 'N' otherwise
-#define CPU_FEATURE_YN(kFlag)	( (uiCpuFeatureFlags & (kFlag)) ? 'Y' : 'N' )
-
-// part of the format string shared by both outputs (everything after the headline)
-#define CPU_FEATURE_FMT_TAIL	\
-		"HTT:      %c, "	\
-		"MMX:      %c, "	\
-		"MMXEX:    %c, "	\
-		"SSE:      %c, "	\
-		"SSE2:     %c, "	\
-		"SSE3:     %c, "	\
-		"SSSE3:    %c, "	\
-		"SSE4.1:   %c, "	\
-		"SSE4.2:   %c, "	\
-		"AVX:      %c, "	\
-		"FMA:      %c, "	\
-		"X87-FPU:  %c, "	\
-		"3DNOW:    %c, "	\
-		"3DNOWEX:  %c, "	\
-		"ALTIVEC:  %c, "	\
-		"CMOV:     %c, "	\
-		"MOVBE:    %c, "	\
-		"AES:      %c, "	\
-		"NUMBER OF LOGIC PROCESSORS ON CHIP: %d, "	\
-		"CPU CACHE LINE SIZE (BYTES):        %d\n"
-
-// arguments matching the headline's 0x%x followed by CPU_FEATURE_FMT_TAIL, in order
-#define CPU_FEATURE_ARGS	\
-		uiCpuFeatureFlags,	\
-		CPU_FEATURE_YN(WELS_CPU_HTT),	\
-		CPU_FEATURE_YN(WELS_CPU_MMX),	\
-		CPU_FEATURE_YN(WELS_CPU_MMXEXT),	\
-		CPU_FEATURE_YN(WELS_CPU_SSE),	\
-		CPU_FEATURE_YN(WELS_CPU_SSE2),	\
-		CPU_FEATURE_YN(WELS_CPU_SSE3),	\
-		CPU_FEATURE_YN(WELS_CPU_SSSE3),	\
-		CPU_FEATURE_YN(WELS_CPU_SSE41),	\
-		CPU_FEATURE_YN(WELS_CPU_SSE42),	\
-		CPU_FEATURE_YN(WELS_CPU_AVX),	\
-		CPU_FEATURE_YN(WELS_CPU_FMA),	\
-		CPU_FEATURE_YN(WELS_CPU_FPU),	\
-		CPU_FEATURE_YN(WELS_CPU_3DNOW),	\
-		CPU_FEATURE_YN(WELS_CPU_3DNOWEXT),	\
-		CPU_FEATURE_YN(WELS_CPU_ALTIVEC),	\
-		CPU_FEATURE_YN(WELS_CPU_CMOV),	\
-		CPU_FEATURE_YN(WELS_CPU_MOVBE),	\
-		CPU_FEATURE_YN(WELS_CPU_AES),	\
-		uiCpuCores,	\
-		iCacheLineSize
-
-	// welstracer output (headline ends with a tab)
-	WelsLog( NULL, WELS_LOG_INFO,
-		"WELS CPU features/capacities (0x%x) detected: \t" CPU_FEATURE_FMT_TAIL,
-		CPU_FEATURE_ARGS );
-
-#ifdef _DEBUG	// output at console & _debug (headline ends with a line break)
-	fprintf( stderr,
-		"WELS CPU features/capacities (0x%x) detected: \n" CPU_FEATURE_FMT_TAIL,
-		CPU_FEATURE_ARGS );
-#endif//_DEBUG
-
-#undef CPU_FEATURE_ARGS
-#undef CPU_FEATURE_FMT_TAIL
-#undef CPU_FEATURE_YN
-}
-
-/*!
- * \brief	initialize Wels avc encoder core library
- * \pParam	ppCtx			sWelsEncCtx**, receives the new encoder context; set to NULL when initialization fails after parameter validation
- * \pParam	pCodingParam	SWelsSvcCodingParam*, validated and adjusted in place (slice/thread/temporal settings), then copied into the context
- * \return	successful - 0; otherwise none 0 for failed
- */
-int32_t WelsInitEncoderExt( sWelsEncCtx **ppCtx, SWelsSvcCodingParam *pCodingParam )
-{
-	sWelsEncCtx *pCtx		= NULL;
-	int32_t	iRet					= 0;
-	uint32_t uiCpuFeatureFlags		= 0;	// CPU features
-	int32_t uiCpuCores				= 1;	// number of logic processors on physical processor package, one logic processor means HTT not supported
-	int32_t iCacheLineSize			= 16;	// on chip cache line size in byte
-	int16_t iSliceNum				= 1;	// number of slices used
-
-	if ( NULL == ppCtx || NULL == pCodingParam )
-	{
-		WelsLog(NULL, WELS_LOG_ERROR, "WelsInitEncoderExt(), NULL == ppCtx(0x%p) or NULL == pCodingParam(0x%p).\n", (void *)ppCtx, (void *)pCodingParam);
-		return 1;
-	}
-
-	iRet	=	ParamValidationExt( pCodingParam );
-	if ( iRet != 0 )
-	{
-		WelsLog(NULL, WELS_LOG_ERROR, "WelsInitEncoderExt(), ParamValidationExt failed return %d.\n", iRet);
-		return iRet;
-	}
-
-	// for cpu features detection, Only detect once??
-#ifdef X86_ASM
-	uiCpuFeatureFlags	= WelsCPUFeatureDetect( &uiCpuCores );	// detect cpu capacity features
-	if ( uiCpuFeatureFlags & WELS_CPU_CACHELINE_128 )
-		iCacheLineSize = 128;
-	else if ( uiCpuFeatureFlags & WELS_CPU_CACHELINE_64 )
-		iCacheLineSize = 64;
-	else if ( uiCpuFeatureFlags & WELS_CPU_CACHELINE_32 )
-		iCacheLineSize	= 32;
-	else if ( uiCpuFeatureFlags & WELS_CPU_CACHELINE_16 )
-		iCacheLineSize	= 16;
-	OutputCpuFeaturesLog( uiCpuFeatureFlags, uiCpuCores, iCacheLineSize );
-#else
-	iCacheLineSize	= 16;	// 16 bytes aligned in default
-#endif//X86_ASM
-
-#ifndef WELS_TESTBED
-
-#if defined(MT_ENABLED) && defined(DYNAMIC_DETECT_CPU_CORES)
-	if ( pCodingParam->iMultipleThreadIdc > 0 )
-		uiCpuCores = pCodingParam->iMultipleThreadIdc;
-	else
-	{
-		if ( uiCpuFeatureFlags == 0 )	// cpuid not supported, use high level system API as followed to detect number of physical/logic processor
-			uiCpuCores = DynamicDetectCpuCores();
-		// So far so many cpu cores up to MAX_THREADS_NUM mean for server platforms,
-		// for client application here it is constrained by maximal to MAX_THREADS_NUM
-		if ( uiCpuCores > MAX_THREADS_NUM )	// MAX_THREADS_NUM
-			uiCpuCores	= MAX_THREADS_NUM;	// MAX_THREADS_NUM
-		else if ( uiCpuCores < 1 )	// just for safe
-			uiCpuCores	= 1;
-	}
-#endif//MT_ENABLED && DYNAMIC_DETECT_CPU_CORES
-
-#else//WELS_TESTBED
-
-	uiCpuCores	= pCodingParam->iMultipleThreadIdc;	// assigned uiCpuCores from iMultipleThreadIdc from SGE testing
-
-#endif//WELS_TESTBED
-
-	uiCpuCores	= WELS_CLIP3(uiCpuCores, 1, MAX_THREADS_NUM);
-
-	if ( InitSliceSettings(pCodingParam, uiCpuCores, &iSliceNum ) )
-	{
-		WelsLog(NULL, WELS_LOG_ERROR, "WelsInitEncoderExt(), InitSliceSettings failed.\n");
-		return 1;
-	}
-
-	*ppCtx	= NULL;
-
-	pCtx	= static_cast<sWelsEncCtx*>(malloc( sizeof(sWelsEncCtx) ));
-
-	WELS_VERIFY_RETURN_IF(1, (NULL == pCtx))
-	memset( pCtx, 0, sizeof(sWelsEncCtx) );
-
-	pCtx->pMemAlign = new CMemoryAlign( iCacheLineSize );
-	if ( NULL == pCtx->pMemAlign )
-	{
-		// nothing but the bare context exists yet; release it directly instead of
-		// handing a context without allocator to FreeMemorySvc()
-		free( pCtx );
-		return 1;
-	}
-
-	// for logs
-#ifdef ENABLE_TRACE_FILE
-	if (wlog == WelsLogDefault)
-	{
-		str_t fname[MAX_FNAME_LEN] = {0};
-
-#if defined (_MSC_VER)
-#if _MSC_VER>=1500
-		SNPRINTF(fname, MAX_FNAME_LEN, MAX_FNAME_LEN, "%swels_svc_encoder_trace.txt",  pCodingParam->sTracePath );		// confirmed_safe_unsafe_usage
-#else
-		SNPRINTF(fname, MAX_FNAME_LEN, "%swels_svc_encoder_trace.txt",  pCodingParam->sTracePath );		// confirmed_safe_unsafe_usage
-#endif//_MSC_VER>=1500
-#else
-		//GNUC/
-		SNPRINTF(fname,      MAX_FNAME_LEN,       "%swels_svc_encoder_trace.txt",  pCodingParam->sTracePath );		// confirmed_safe_unsafe_usage
-#endif//_MSC_VER
-
-
-#if defined(__GNUC__)
-		pCtx->pFileLog	= FOPEN(fname, "wt+");
-#else//WIN32
-#if defined(WIN32) && defined(_MSC_VER)
-#if _MSC_VER >= 1500
-		FOPEN(&pCtx->pFileLog,fname, "wt+");
-#else
-		pCtx->pFileLog	= FOPEN(fname, "wt+");
-#endif//_MSC_VER>=1500
-#endif//WIN32 && _MSC_VER
-#endif//__GNUC__
-		pCtx->uiSizeLog	= 0;
-	}
-#endif//ENABLE_TRACE_FILE
-
-	pCodingParam->DetermineTemporalSettings();
-	iRet = AllocCodingParam( &pCtx->pSvcParam, pCtx->pMemAlign, pCodingParam->iNumDependencyLayer );
-	if ( iRet != 0 )
-	{
-		FreeMemorySvc( &pCtx );
-		return iRet;
-	}
-	memcpy( pCtx->pSvcParam, pCodingParam, sizeof(SWelsSvcCodingParam) );	// confirmed_safe_unsafe_usage
-
-	pCtx->pFuncList = (SWelsFuncPtrList *)pCtx->pMemAlign->WelsMalloc(sizeof(SWelsFuncPtrList), "SWelsFuncPtrList");
-	if ( NULL == pCtx->pFuncList )
-	{
-		FreeMemorySvc( &pCtx );
-		return 1;
-	}
-	InitFunctionPointers( pCtx->pFuncList, pCtx->pSvcParam, uiCpuFeatureFlags );
-
-	pCtx->iActiveThreadsNum	= pCodingParam->iCountThreadsNum;
-	pCtx->iMaxSliceCount	= iSliceNum;
-	iRet = RequestMemorySvc( &pCtx );
-	if ( iRet != 0 )
-	{
-		WelsLog(pCtx, WELS_LOG_ERROR, "WelsInitEncoderExt(), RequestMemorySvc failed return %d.\n", iRet);
-		FreeMemorySvc( &pCtx );
-		return iRet;
-	}
-
-#ifdef MT_ENABLED
-	if ( pCodingParam->iMultipleThreadIdc > 1 )
-	{
-		iRet = CreateSliceThreads( pCtx);
-		// NOTE(review): initialization has always carried on after a failure here;
-		// at least leave a trace of it -- confirm whether it should abort instead
-		if ( iRet != 0 )
-			WelsLog(pCtx, WELS_LOG_ERROR, "WelsInitEncoderExt(), CreateSliceThreads failed return %d.\n", iRet);
-	}
-#endif
-
-	WelsRcInitModule( pCtx,  pCtx->pSvcParam->bEnableRc ? WELS_RC_GOM : WELS_RC_DISABLE);
-
-	pCtx->pVpp = new CWelsPreProcess((void *)pCtx);
-	if ( pCtx->pVpp == NULL )
-	{
-		WelsLog(pCtx, WELS_LOG_ERROR, "WelsInitEncoderExt(), pOut of memory in case new CWelsPreProcess().\n");
-		FreeMemorySvc( &pCtx );
-		return 1;	// iRet may well be 0 at this point, which would report success without a context
-	}
-
-#if defined(MEMORY_MONITOR)
-	WelsLog(pCtx, WELS_LOG_INFO, "WelsInitEncoderExt() exit, overall memory usage: %lu bytes\n", sizeof(sWelsEncCtx) /* requested size from malloc() or new operator */
-                                                                                                 + pCtx->pMemAlign->WelsGetMemoryUsage()	/* requested size from CMemoryAlign::WelsMalloc() */
-             );
-#endif//MEMORY_MONITOR
-
-	*ppCtx	= pCtx;
-
-	WelsLog(pCtx, WELS_LOG_DEBUG, "WelsInitEncoderExt(), pCtx= 0x%p.\n", (void *)pCtx);
-
-	return 0;
-}
-/*
- *
- * status information output
- */
-#if defined(STAT_OUTPUT)
-/*!
- * \brief	print the overall encoding statistics of every dependency layer (quality layer 0 only)
- * \pParam	pCtx		sWelsEncCtx*
- * \return	none
- *
- * With MB_TYPES_CHECK defined the macroblock type distribution goes to stderr; the
- * PSNR / bitrate / fps summary goes to stdout. Layers without any coded slice are
- * listed by their headline only.
- */
-void StatOverallEncodingExt(sWelsEncCtx *pCtx)
-{
-// shorthands for the counters of the layer currently visited
-#define STAT_MB_CNT(kSlice, kMbType)	( pCtx->sStatData[i][j].sSliceData.iMbCount[kSlice][kMbType] )
-#define STAT_PSNR_SUM(rPsnr)			( pCtx->sStatData[i][j].sQualityStat.rPsnr[I_SLICE] + pCtx->sStatData[i][j].sQualityStat.rPsnr[P_SLICE] + pCtx->sStatData[i][j].sQualityStat.rPsnr[B_SLICE] )
-// share in percent, 0 instead of inf/nan when the reference count is empty
-#define STAT_PERCENT(iNum, iDen)		( ((iDen) > 0) ? (100.0f * (iNum) / (iDen)) : 0.0f )
-
-	int8_t i = 0;
-	const int8_t j = 0;	// only quality layer 0 is reported
-	for (i = 0;i<pCtx->pSvcParam->iNumDependencyLayer;i++)
-	{
-		fprintf( stdout,"\nDependency layer : %d\n",i);
-		fprintf( stdout,"Quality layer : %d\n",j);
-
-		const int32_t iCount = pCtx->sStatData[i][j].sSliceData.iSliceCount[I_SLICE] +
-								pCtx->sStatData[i][j].sSliceData.iSliceCount[P_SLICE] +
-								pCtx->sStatData[i][j].sSliceData.iSliceCount[B_SLICE];
-		if ( iCount <= 0 )
-			continue;
-
-#if defined(MB_TYPES_CHECK)
-		{
-			// inter coded MBs of P slices, reference count of the "(PL0)" shares
-			const int32_t count_p_mbL0	= STAT_MB_CNT(P_SLICE, Inter16x16) +
-											STAT_MB_CNT(P_SLICE, Inter16x8) +
-											STAT_MB_CNT(P_SLICE, Inter8x16) +
-											STAT_MB_CNT(P_SLICE, Inter8x8) +
-											STAT_MB_CNT(P_SLICE, 10);
-			const int32_t iCountNumIMb	= STAT_MB_CNT(I_SLICE, Intra4x4) +
-											STAT_MB_CNT(I_SLICE, Intra16x16) +
-											STAT_MB_CNT(I_SLICE, 7);
-			const int32_t iCountNumPMb	= STAT_MB_CNT(P_SLICE, Intra4x4) +
-											STAT_MB_CNT(P_SLICE, Intra16x16) +
-											STAT_MB_CNT(P_SLICE, 7) +
-											count_p_mbL0 +
-											STAT_MB_CNT(P_SLICE, PSkip);
-			const int32_t iMbCount		= iCountNumIMb + iCountNumPMb;
-
-			if ( iMbCount > 0 ){
-				fprintf(	stderr,
-					"SVC: overall Slices	MBs: %d Avg\nI4x4: %.3f%% I16x16: %.3f%% IBL: %.3f%%\nP16x16: %.3f%% P16x8: %.3f%% P8x16: %.3f%% P8x8: %.3f%% SUBP8x8: %.3f%% PSKIP: %.3f%%\nILP(All): %.3f%% ILP(PL0): %.3f%% BLSKIP(PL0): %.3f%% RP(PL0): %.3f%%\n",
-					iMbCount,
-					STAT_PERCENT( STAT_MB_CNT(I_SLICE, Intra4x4) + STAT_MB_CNT(P_SLICE, Intra4x4), iMbCount ),
-					STAT_PERCENT( STAT_MB_CNT(I_SLICE, Intra16x16) + STAT_MB_CNT(P_SLICE, Intra16x16), iMbCount ),
-					STAT_PERCENT( STAT_MB_CNT(I_SLICE, 7) + STAT_MB_CNT(P_SLICE, 7), iMbCount ),
-					STAT_PERCENT( STAT_MB_CNT(P_SLICE, Inter16x16), iMbCount ),
-					STAT_PERCENT( STAT_MB_CNT(P_SLICE, Inter16x8), iMbCount ),
-					STAT_PERCENT( STAT_MB_CNT(P_SLICE, Inter8x16), iMbCount ),
-					STAT_PERCENT( STAT_MB_CNT(P_SLICE, Inter8x8), iMbCount ),
-					STAT_PERCENT( STAT_MB_CNT(P_SLICE, 10), iMbCount ),
-					STAT_PERCENT( STAT_MB_CNT(P_SLICE, PSkip), iMbCount ),
-					STAT_PERCENT( STAT_MB_CNT(P_SLICE, 11), iMbCount ),
-					// count_p_mbL0 is 0 as long as no P MB was inter coded
-					STAT_PERCENT( STAT_MB_CNT(P_SLICE, 11), count_p_mbL0 ),
-					STAT_PERCENT( STAT_MB_CNT(P_SLICE, 8), count_p_mbL0 ),
-					STAT_PERCENT( STAT_MB_CNT(P_SLICE, 9), count_p_mbL0 )
-					);
-			}
-		}
-#endif //#if defined(MB_TYPES_CHECK)
-
-		fprintf( stdout, "SVC: overall PSNR Y: %2.3f U: %2.3f V: %2.3f kb/s: %.1f fps: %.3f\n\n",
-			STAT_PSNR_SUM(rYPsnr) / (float)(iCount),
-			STAT_PSNR_SUM(rUPsnr) / (float)(iCount),
-			STAT_PSNR_SUM(rVPsnr) / (float)(iCount),
-			1.0f * pCtx->pSvcParam->sDependencyLayers[i].fOutputFrameRate *(pCtx->sStatData[i][j].sSliceData.iSliceSize[I_SLICE] +pCtx->sStatData[i][j].sSliceData.iSliceSize[P_SLICE] +pCtx->sStatData[i][j].sSliceData.iSliceSize[B_SLICE] ) / (float)(iCount+pCtx->pWelsSvcRc[i].iSkipFrameNum)/1000,
-			1.0f * pCtx->pSvcParam->sDependencyLayers[i].fOutputFrameRate );
-	}
-
-#undef STAT_PERCENT
-#undef STAT_PSNR_SUM
-#undef STAT_MB_CNT
-}
-#endif
-/*!
- * \brief	uninitialize Wels encoder core library
- *
- *			Logs the teardown, optionally prints overall statistics (STAT_OUTPUT),
- *			stops the slice-coding threads (MT_ENABLED), deletes the pre-processing
- *			object held in pVpp and finally releases the context through FreeMemorySvc().
- *
- * \pParam	ppCtx		sWelsEncCtx**, address of the encoder context pointer;
- *						nothing is done when ppCtx or *ppCtx is NULL
- * \return	none (*ppCtx is set to NULL before returning)
- */
-void WelsUninitEncoderExt( sWelsEncCtx **ppCtx )
-{
-	if ( NULL == ppCtx || NULL == *ppCtx )
-		return;
-
-	WelsLog( *ppCtx, WELS_LOG_INFO, "WelsUninitEncoderExt(), pCtx= %p, iThreadCount= %d, iMultipleThreadIdc= %d.\n", (void *)(*ppCtx), (*ppCtx)->pSvcParam->iCountThreadsNum, (*ppCtx)->pSvcParam->iMultipleThreadIdc );
-
-#if defined(STAT_OUTPUT)
-	StatOverallEncodingExt( *ppCtx );
-#endif	
-
-#if defined(MT_ENABLED)	
-	if ( (*ppCtx)->pSvcParam->iMultipleThreadIdc > 1 && (*ppCtx)->pSliceThreading != NULL )	// only when slice threading was actually set up
-	{		
-		const int32_t iThreadCount = (*ppCtx)->pSvcParam->iCountThreadsNum;
-		int32_t iThreadIdx = 0;
-		
-#if defined(WIN32)	// Windows: signal each thread's exit event, then wait for the finish events
-		if ( (*ppCtx)->pSliceThreading->pExitEncodeEvent != NULL )
-		{
-			do {
-				if ( (*ppCtx)->pSliceThreading->pThreadHandles[iThreadIdx] != NULL )	// iThreadIdx is already created successfully
-					WelsEventSignal( &(*ppCtx)->pSliceThreading->pExitEncodeEvent[iThreadIdx] );
-				++ iThreadIdx;
-			} while(iThreadIdx < iThreadCount);
-
-			WelsMultipleEventsWaitAllBlocking( iThreadCount, &(*ppCtx)->pSliceThreading->pFinSliceCodingEvent[0] );	// NOTE(review): waits on iThreadCount events although only threads with a non-NULL handle were signalled above -- confirm this cannot block forever
-
-		}		
-#elif defined(__GNUC__)	// GCC builds: cancel and join every thread that was created
-		while ( iThreadIdx < iThreadCount )
-		{
-			int res = 0;
-			if ( (*ppCtx)->pSliceThreading->pThreadHandles[iThreadIdx] )
-			{
-				res = WelsThreadCancel( (*ppCtx)->pSliceThreading->pThreadHandles[iThreadIdx] );
-				WelsLog( *ppCtx, WELS_LOG_INFO, "WelsUninitEncoderExt(), WelsThreadCancel(pThreadHandles%d) return %d..\n", iThreadIdx, res);
-				res = WelsThreadJoin( (*ppCtx)->pSliceThreading->pThreadHandles[iThreadIdx] );	// waiting thread exit
-				WelsLog( *ppCtx, WELS_LOG_INFO, "WelsUninitEncoderExt(), pthread_join(pThreadHandles%d) return %d..\n", iThreadIdx, res);
-				(*ppCtx)->pSliceThreading->pThreadHandles[iThreadIdx] = 0;	// mark the handle as released
-			}
-#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
-			if ( (*ppCtx)->pSliceThreading->pUpdateMbListThrdHandles[iThreadIdx] )	// same cancel/join sequence for the MB-list update threads
-			{
-				res = WelsThreadCancel( (*ppCtx)->pSliceThreading->pUpdateMbListThrdHandles[iThreadIdx] );
-				WelsLog( *ppCtx, WELS_LOG_INFO, "WelsUninitEncoderExt(), WelsThreadCancel(pUpdateMbListThrdHandles%d) return %d..\n", iThreadIdx, res);				
-				res = WelsThreadJoin( (*ppCtx)->pSliceThreading->pUpdateMbListThrdHandles[iThreadIdx] );	// waiting thread exit
-				WelsLog( *ppCtx, WELS_LOG_INFO, "WelsUninitEncoderExt(), pthread_join(pUpdateMbListThrdHandles%d) return %d..\n", iThreadIdx, res);
-				(*ppCtx)->pSliceThreading->pUpdateMbListThrdHandles[iThreadIdx] = 0;
-			}
-#endif//DYNAMIC_SLICE_ASSIGN && TRY_SLICING_BALANCE
-			++ iThreadIdx;
-		}
-#endif//WIN32
-	}
-#endif//MT_ENABLED
-
-	if ((*ppCtx)->pVpp)
-	{
-		delete (*ppCtx)->pVpp;	// pVpp is released with delete here, separately from FreeMemorySvc() below
-		(*ppCtx)->pVpp = NULL;
-	}
-	FreeMemorySvc( ppCtx );
-	*ppCtx = NULL;	// the caller's pointer is cleared whatever FreeMemorySvc() left in it
-}
-
-/*!
- * \brief	look up the temporal id configured for a frame
- *
- * \pParam	fDlp		dependency layer parameters holding the uiCodingIdx2TemporalId table
- * \pParam	kiFrameNum	frame number to map
- * \pParam	kiGopSize	GOP size; the (kiGopSize-1) mask equals "modulo GOP size" only for powers of two
- * \return	table entry selected by kiFrameNum & (kiGopSize-1)
- */
-static inline int32_t GetTemporalLevel( SDLayerParam *fDlp, const int32_t kiFrameNum, const int32_t kiGopSize )
-{
-	return fDlp->uiCodingIdx2TemporalId[ kiFrameNum & (kiGopSize-1) ];
-}
-
-/*!
- * \brief	refresh slice membership and neighbour availability of every macroblock
- *
- *			For each entry of pMbList the owning slice is looked up with WelsMbToSliceIdc()
- *			and stored in uiSliceIdc; a neighbour (left, top, top-left, top-right) is flagged
- *			available in uiNeighborAvail only if it lies inside the picture and maps to the
- *			same slice as the current macroblock.
- *
- * \pParam	pSliceCtx	slice context providing iMbWidth, iMbNumInFrame and the MB-to-slice mapping
- * \pParam	pMbList		macroblock list, indexed 0 .. iMbNumInFrame-1
- */
-void DynslcUpdateMbNeighbourInfoListForAllSlices( SSliceCtx *pSliceCtx, SMB *pMbList )
-{
-	const int32_t kiMbWidth		= pSliceCtx->iMbWidth;
-	const int32_t kiLastMbIdx	= pSliceCtx->iMbNumInFrame - 1;
-	int32_t iMbIdx				= 0;
-
-	// do/while: entry 0 is processed unconditionally
-	do {
-		SMB *pCurMb				= pMbList + iMbIdx;
-		const int32_t kiMbXY	= pCurMb->iMbXY;
-		const int32_t kiMbX		= pCurMb->iMbX;
-		const int32_t kiMbY		= pCurMb->iMbY;
-		const int32_t kiTopXY	= kiMbXY - kiMbWidth;
-		const int32_t kiSliceIdc = WelsMbToSliceIdc( pSliceCtx, kiMbXY );
-		uint32_t uiAvail		= 0;
-
-		pCurMb->uiSliceIdc = kiSliceIdc;
-
-		// the position test comes first so the mapping is never queried outside the picture
-		if ( (kiMbX > 0) && (kiSliceIdc == WelsMbToSliceIdc( pSliceCtx, kiMbXY - 1 )) )
-			uiAvail |= LEFT_MB_POS;
-
-		if ( kiMbY > 0 )
-		{
-			if ( kiSliceIdc == WelsMbToSliceIdc( pSliceCtx, kiTopXY ) )
-				uiAvail |= TOP_MB_POS;
-			if ( (kiMbX > 0) && (kiSliceIdc == WelsMbToSliceIdc( pSliceCtx, kiTopXY - 1 )) )
-				uiAvail |= TOPLEFT_MB_POS;
-			if ( (kiMbX < (kiMbWidth-1)) && (kiSliceIdc == WelsMbToSliceIdc( pSliceCtx, kiTopXY + 1 )) )
-				uiAvail |= TOPRIGHT_MB_POS;
-		}
-
-		pCurMb->uiNeighborAvail = (uint8_t)uiAvail;
-		++ iMbIdx;
-	} while( iMbIdx <= kiLastMbIdx );
-}
-
-/*
- * Decide into how many partitions the current picture is split.
- * Returns 1 unless MT_ENABLED is defined and iMultipleThreadIdc > 1; in that case the
- * configured thread count is returned, except that P slices still get a single
- * partition when FIXED_PARTITION_ASSIGN is not defined.
- */
-int32_t PicPartitionNumDecision( sWelsEncCtx *pCtx )
-{
-#ifdef MT_ENABLED
-	if ( pCtx->pSvcParam->iMultipleThreadIdc > 1 )
-	{
-#if !defined(FIXED_PARTITION_ASSIGN)
-		if ( P_SLICE == pCtx->eSliceType )
-			return 1;
-#endif//!FIXED_PARTITION_ASSIGN
-		return pCtx->pSvcParam->iCountThreadsNum;
-	}
-#endif//MT_ENABLED
-	return 1;
-}
-
-#if defined(MT_ENABLED)
-/*
- * MT build: refresh the slice id and neighbour availability of every macroblock
- * of the current layer from its slice context.
- */
-void WelsInitCurrentQBLayerMltslc( sWelsEncCtx *pCtx )
-{
-	SDqLayer *pLayer = pCtx->pCurDqLayer;
-
-	DynslcUpdateMbNeighbourInfoListForAllSlices( pLayer->pSliceEncCtx, pLayer->sMbDataP );
-}
-
-/*
- * Split the frame's macroblocks into iPartitionNum consecutive partitions.
- * The partition count is clamped to [1, AVERSLICENUM_CONSTRAINT]; every partition gets
- * iMbNumInFrame / count macroblocks and the last one takes whatever is left. For each
- * partition the MB count, the first MB index and the matching span of pOverallMbMap
- * (filled with the partition index) are written into pSliceCtx.
- */
-void UpdateSlicepEncCtxWithPartition( SSliceCtx *pSliceCtx, int32_t iPartitionNum )
-{
-	const int32_t kiMbNumInFrame	= pSliceCtx->iMbNumInFrame;
-	int32_t iMbPerPartition			= 0;
-	int32_t iMbRemaining			= kiMbNumInFrame;
-	int32_t iStartMb				= 0;
-	int32_t iPart					= 0;
-
-	if ( iPartitionNum <= 0 )
-		iPartitionNum	= 1;
-	else if ( iPartitionNum > AVERSLICENUM_CONSTRAINT )
-		iPartitionNum	= AVERSLICENUM_CONSTRAINT;	// the limit is a macro here, although it need not be a fixed value
-
-	iMbPerPartition					= kiMbNumInFrame / iPartitionNum;
-	pSliceCtx->iSliceNumInFrame		= iPartitionNum;
-
-	for ( iPart = 0; iPart < iPartitionNum; ++ iPart )
-	{
-		const bool_t kbLastPartition = ( iPart + 1 == iPartitionNum );
-
-		pSliceCtx->pCountMbNumInSlice[iPart]	= kbLastPartition ? iMbRemaining : iMbPerPartition;
-		pSliceCtx->pFirstMbInSlice[iPart]		= iStartMb;
-
-		// tag every MB of this partition with the partition index
-		memset( pSliceCtx->pOverallMbMap+iStartMb, (uint8_t)iPart, pSliceCtx->pCountMbNumInSlice[iPart]*sizeof(uint8_t) );
-
-		// advance using the value as stored in the table
-		iStartMb		+= pSliceCtx->pCountMbNumInSlice[iPart];
-		iMbRemaining	-= pSliceCtx->pCountMbNumInSlice[iPart];
-	}
-}
-
-/*
- * MT build: partition the current layer into iPartitionNum slices, warn on I slices when
- * the configured slice size constraint looks too small for the expected frame size, then
- * refresh the per-MB neighbour information.
- *
- * The expected I-frame size is only an estimate used for the warning:
- *  - rate control on : bitrate / frame rate, converted from bits to bytes;
- *  - fixed QP        : byte_complexIMBat26 bytes per MB, scaled by the QP distance to 26.
- */
-void WelsInitCurrentDlayerMltslc( sWelsEncCtx *pCtx, int32_t iPartitionNum )
-{
-	SDqLayer* pCurDq		= pCtx->pCurDqLayer;
-	SSliceCtx* pSliceCtx	= pCurDq->pSliceEncCtx;
-
-	UpdateSlicepEncCtxWithPartition( pSliceCtx, iPartitionNum );
-
-	if ( I_SLICE == pCtx->eSliceType )//check if uiSliceSizeConstraint too small
-	{
-#define byte_complexIMBat26 (60)
-		uint8_t		iCurDid = pCtx->uiDependencyId;
-		uint32_t	uiFrmByte = 0;
-
-		if ( pCtx->pSvcParam->bEnableRc )
-		{//RC case
-			// the frame rate is truncated to an integer; a rate below 1 fps would truncate
-			// to 0 and divide by zero, so it is treated as 1 fps for this estimate
-			uint32_t uiFrameRate = (uint32_t)(pCtx->pSvcParam->sDependencyLayers[iCurDid].fInputFrameRate);
-			if ( 0 == uiFrameRate )
-				uiFrameRate = 1;
-
-			uiFrmByte = ( ( (uint32_t)(pCtx->pSvcParam->sDependencyLayers[iCurDid].iSpatialBitrate)
-				/ uiFrameRate ) >> 3 );
-		}
-		else
-		{//fixed QP case
-			const int32_t iTtlMbNumInFrame = pSliceCtx->iMbNumInFrame;
-			int32_t iQDeltaTo26 = ( 26 - pCtx->pSvcParam->sDependencyLayers[iCurDid].iDLayerQp );
-
-			uiFrmByte = (iTtlMbNumInFrame * byte_complexIMBat26);
-			if ( iQDeltaTo26 > 0 )
-			{
-				//smaller QP than 26
-				// NOTE(review): for a delta of 1..3 this factor is below 1 and shrinks the estimate -- confirm intended
-				uiFrmByte = (uint32_t)( uiFrmByte * ( (float)iQDeltaTo26 / 4 ) );
-			}
-			else if ( iQDeltaTo26 < 0 )
-			{
-				//larger QP than 26
-				iQDeltaTo26 = ( (-iQDeltaTo26) >> 2 ); //delta / 4
-				uiFrmByte = ( uiFrmByte >> (iQDeltaTo26) ); //if delta 4, byte /2
-			}
-		}
-
-		//MINPACKETSIZE_CONSTRAINT
-		// skip the check when no positive slice-number limit is set (avoids dividing by zero)
-		if ( pSliceCtx->iMaxSliceNumConstraint > 0
-			&& pSliceCtx->uiSliceSizeConstraint
-			<
-			 (uint32_t)( uiFrmByte
-			 / ( pSliceCtx->iMaxSliceNumConstraint ) )
-			)
-		{
-			WelsLog( pCtx,
-				WELS_LOG_WARNING,
-				"Set-SliceConstraint(%d) too small for current resolution (MB# %d) under QP/BR!\n",
-				pSliceCtx->uiSliceSizeConstraint,
-				pSliceCtx->iMbNumInFrame
-				);
-		}
-	}
-
-	WelsInitCurrentQBLayerMltslc( pCtx );
-}
-#else
-void WelsInitCurrentQBLayerMltslc( sWelsEncCtx *pCtx )	// non-MT build: set the current layer up as one slice covering the whole frame
-{	
-	//pData init
-	SDqLayer*		pCurDq				= pCtx->pCurDqLayer;
-	SSliceCtx*	pSliceCtx			= (pCurDq->pSliceEncCtx);	
-	SSlice *			pSlice				= &pCurDq->sLayerInfo.pSliceInLayer[0];	// slice 0 is the only slice initialised here
-	int32_t			iTtlMbNumInFrame = pSliceCtx->iMbNumInFrame;
-
-	//pSliceCtx
-	memset( pSliceCtx->pOverallMbMap,		0, iTtlMbNumInFrame * sizeof(uint8_t) );	// every MB maps to slice index 0
-	memset( pSliceCtx->pCountMbNumInSlice,	0, pSliceCtx->iSliceNumInFrame * sizeof(int32_t) );	// sized with the slice count still stored from before the reset two lines below
-	memset( pSliceCtx->pFirstMbInSlice,		0, pSliceCtx->iSliceNumInFrame * sizeof(int16_t) );	// assumes int16_t elements -- TODO confirm against SSliceCtx
-	pSliceCtx->iSliceNumInFrame				= 1;//
-	pSliceCtx->pCountMbNumInSlice[0]			= iTtlMbNumInFrame;	// the single slice owns all macroblocks
-		
-	//mb_neighbor
-	DynslcUpdateMbNeighbourInfoListForAllSlices( pSliceCtx, pCurDq->sMbDataP );	
-
-	//pSlice init
-	pSlice->uiSliceIdx				= 0;
-	pSlice->pSliceBsa				= &pCtx->pOut->sBsWrite;	// the slice writes through the output's bit-string writer
-	pSlice->bDynamicSlicingSliceSizeCtrlFlag			= false;
-	pSlice->uiAssumeLog2BytePerMb	= ( pCtx->eSliceType == P_SLICE ) ? 0 : 1;	// 0 for P slices, 1 for every other slice type
-}
-
-/*
- * Non-MT build: record the frame's macroblock count for the single slice, warn on I
- * slices when the configured slice size constraint looks too small for the expected
- * frame size, then initialise the layer's single slice. iPartitionNum is not used in
- * this build.
- *
- * The expected I-frame size is only an estimate used for the warning:
- *  - rate control on : bitrate / frame rate, converted from bits to bytes;
- *  - fixed QP        : byte_complexIMBat26 bytes per MB, scaled by the QP distance to 26.
- */
-void WelsInitCurrentDlayerMltslc( sWelsEncCtx *pCtx, int32_t iPartitionNum )
-{
-	SDqLayer* pCurDq = pCtx->pCurDqLayer;
-	SSliceCtx* pSliceCtx = ( pCurDq->pSliceEncCtx );
-	int32_t iTtlMbNumInFrame = pCurDq->iMbHeight*pCurDq->iMbWidth;
-
-	pSliceCtx->iMbNumInFrame
-		= pSliceCtx->pCountMbNumInSlice[0] = iTtlMbNumInFrame;
-
-	if ( I_SLICE == pCtx->eSliceType )//check if uiSliceSizeConstraint too small
-	{
-#define byte_complexIMBat26 (60)
-		uint8_t		iCurDid = pCtx->uiDependencyId;
-		uint32_t	uiFrmByte = 0;
-
-		if ( pCtx->pSvcParam->bEnableRc )
-		{//RC case
-			// the frame rate is truncated to an integer; a rate below 1 fps would truncate
-			// to 0 and divide by zero, so it is treated as 1 fps for this estimate
-			uint32_t uiFrameRate = (uint32_t)(pCtx->pSvcParam->sDependencyLayers[iCurDid].fInputFrameRate);
-			if ( 0 == uiFrameRate )
-				uiFrameRate = 1;
-
-			uiFrmByte = ( ( (uint32_t)(pCtx->pSvcParam->sDependencyLayers[iCurDid].iSpatialBitrate)
-				/ uiFrameRate ) >> 3 );
-		}
-		else
-		{//fixed QP case
-			int32_t iQDeltaTo26 = ( 26 - pCtx->pSvcParam->sDependencyLayers[iCurDid].iDLayerQp );
-
-			uiFrmByte = (iTtlMbNumInFrame * byte_complexIMBat26);
-			if ( iQDeltaTo26 > 0 )
-			{
-				//smaller QP than 26
-				// NOTE(review): for a delta of 1..3 this factor is below 1 and shrinks the estimate -- confirm intended
-				uiFrmByte = (uint32_t)( uiFrmByte * ( (float)iQDeltaTo26 / 4 ) );
-			}
-			else if ( iQDeltaTo26 < 0 )
-			{
-				//larger QP than 26
-				iQDeltaTo26 = ( (-iQDeltaTo26) >> 2 ); //delta / 4
-				uiFrmByte = ( uiFrmByte >> (iQDeltaTo26) ); //if delta 4, byte /2
-			}
-		}
-
-		//MINPACKETSIZE_CONSTRAINT
-		// skip the check when no positive slice-number limit is set (avoids dividing by zero)
-		if ( pSliceCtx->iMaxSliceNumConstraint > 0
-			&& pSliceCtx->uiSliceSizeConstraint
-			<
-			 (uint32_t)( uiFrmByte
-			 / ( pSliceCtx->iMaxSliceNumConstraint ) )
-			)
-		{
-			WelsLog( pCtx,
-				WELS_LOG_WARNING,
-				"Set-SliceConstraint(%d) too small for current resolution (MB# %d) under QP/BR!\n",
-				pSliceCtx->uiSliceSizeConstraint,
-				pSliceCtx->iMbNumInFrame
-				);
-		}
-	}
-
-	WelsInitCurrentQBLayerMltslc( pCtx );
-}
-#endif
-
-/*!
- * \brief	initialize current layer
- *
- *			Binds the current layer (pCtx->pCurDqLayer) to this frame's pictures and
- *			parameter sets: stores the reconstruction picture, copies PPS/SPS ids and
- *			pointers into every slice header, rebuilds the NAL unit header extension,
- *			copies the plane pointers and strides of the source and reconstruction
- *			pictures, and records whether a reference layer is attached.
- *
- * \pParam	pCtx		encoder context; nothing is done when pCtx->pCurDqLayer is NULL
- * \pParam	kiWidth		not used by this function
- * \pParam	kiHeight	not used by this function
- */
-void WelsInitCurrentLayer(	sWelsEncCtx *pCtx,
-								const int32_t kiWidth,
-								const int32_t kiHeight )
-{
-	SDqLayer *pCurDq				= pCtx->pCurDqLayer;
-
-	// must be tested before any member of the layer is touched
-	if ( NULL == pCurDq )
-		return;
-
-	SWelsSvcCodingParam *pParam	= pCtx->pSvcParam;
-	SPicture *pEncPic				= pCtx->pEncPic;
-	SPicture *pDecPic				= pCtx->pDecPic;
-	SSlice *pBaseSlice				= &pCurDq->sLayerInfo.pSliceInLayer[0];
-	SSlice *pSlice					= NULL;
-	const uint8_t kiCurDid			= pCtx->uiDependencyId;
-	const bool_t kbUseSubsetSpsFlag= (kiCurDid > BASE_DEPENDENCY_ID);
-	SDLayerParam *fDlp				= &pParam->sDependencyLayers[kiCurDid];
-	SNalUnitHeaderExt *pNalHdExt	= &pCurDq->sLayerInfo.sNalHeaderExt;
-	SNalUnitHeader *pNalHd			= &pNalHdExt->sNalHeader;
-	SDqIdc *pDqIdc					= &pCtx->pDqIdcMap[kiCurDid];
-	int32_t iIdx					= 0;
-	int32_t iSliceCount				= 0;
-
-	pCurDq->pDecPic	= pDecPic;
-
-	if ( fDlp->sMso.uiSliceMode == SM_DYN_SLICE )	// need get extra slices for update
-		iSliceCount = GetInitialSliceNum( pCurDq->iMbWidth, pCurDq->iMbHeight, &fDlp->sMso );
-	else
-		iSliceCount = GetCurrentSliceNum( pCurDq->pSliceEncCtx );
-	assert( iSliceCount > 0 );
-
-	// parameter sets of the first slice; the layer info shares the same pointers
-	pBaseSlice->sSliceHeaderExt.sSliceHeader.iPpsId	= pDqIdc->iPpsId;
-	pCurDq->sLayerInfo.pPpsP							=
-	pBaseSlice->sSliceHeaderExt.sSliceHeader.pPps		= &pCtx->pPPSArray[pBaseSlice->sSliceHeaderExt.sSliceHeader.iPpsId];
-	pBaseSlice->sSliceHeaderExt.sSliceHeader.iSpsId	= pDqIdc->iSpsId;
-	if ( kbUseSubsetSpsFlag )
-	{
-		// layers above the base dependency id take their SPS from the subset SPS array
-		pCurDq->sLayerInfo.pSubsetSpsP					= &pCtx->pSubsetArray[pDqIdc->iSpsId];
-		pCurDq->sLayerInfo.pSpsP						=
-		pBaseSlice->sSliceHeaderExt.sSliceHeader.pSps	= &pCurDq->sLayerInfo.pSubsetSpsP->pSps;
-	}
-	else
-	{
-		pCurDq->sLayerInfo.pSubsetSpsP					= NULL;
-		pCurDq->sLayerInfo.pSpsP						=
-		pBaseSlice->sSliceHeaderExt.sSliceHeader.pSps	= &pCtx->pSpsArray[pBaseSlice->sSliceHeaderExt.sSliceHeader.iSpsId];
-	}
-
-	// remaining slices inherit the parameter sets of the first one
-	pSlice = pBaseSlice;
-	for ( iIdx = 1; iIdx < iSliceCount; ++ iIdx )
-	{
-		++ pSlice;
-		pSlice->sSliceHeaderExt.sSliceHeader.iPpsId	= pBaseSlice->sSliceHeaderExt.sSliceHeader.iPpsId;
-		pSlice->sSliceHeaderExt.sSliceHeader.pPps	= pBaseSlice->sSliceHeaderExt.sSliceHeader.pPps;
-		pSlice->sSliceHeaderExt.sSliceHeader.iSpsId	= pBaseSlice->sSliceHeaderExt.sSliceHeader.iSpsId;
-		pSlice->sSliceHeaderExt.sSliceHeader.pSps	= pBaseSlice->sSliceHeaderExt.sSliceHeader.pSps;
-	}
-
-	// NAL unit header (extension) for this layer
-	memset( pNalHdExt, 0, sizeof(SNalUnitHeaderExt) );
-	pNalHd->uiNalRefIdc					= pCtx->eNalPriority;
-	pNalHd->eNalUnitType				= pCtx->eNalType;
-
-	pNalHdExt->uiDependencyId			= kiCurDid;
-	pNalHdExt->bDiscardableFlag		= (pCtx->bNeedPrefixNalFlag) ? (pNalHd->uiNalRefIdc == NRI_PRI_LOWEST) : false;
-	pNalHdExt->bIdrFlag				= (pCtx->iFrameNum == 0) && ((pCtx->eNalType == NAL_UNIT_CODED_SLICE_IDR) || (pCtx->eSliceType == I_SLICE));
-	pNalHdExt->uiTemporalId				= pCtx->uiTemporalId;
-
-	pBaseSlice->bSliceHeaderExtFlag	= (NAL_UNIT_CODED_SLICE_EXT == pNalHd->eNalUnitType);
-
-	pSlice = pBaseSlice;
-	for ( iIdx = 1; iIdx < iSliceCount; ++ iIdx )
-	{
-		++ pSlice;
-		pSlice->bSliceHeaderExtFlag			= pBaseSlice->bSliceHeaderExtFlag;
-	}
-
-	// pEncPic pData
-	pCurDq->pEncData[0]		= pEncPic->pData[0];
-	pCurDq->pEncData[1]		= pEncPic->pData[1];
-	pCurDq->pEncData[2]		= pEncPic->pData[2];
-	pCurDq->iEncStride[0]	= pEncPic->iLineSize[0];
-	pCurDq->iEncStride[1]	= pEncPic->iLineSize[1];
-	pCurDq->iEncStride[2]	= pEncPic->iLineSize[2];
-	// cs pData
-	pCurDq->pCsData[0]		= pDecPic->pData[0];
-	pCurDq->pCsData[1]		= pDecPic->pData[1];
-	pCurDq->pCsData[2]		= pDecPic->pData[2];
-	pCurDq->iCsStride[0]	= pDecPic->iLineSize[0];
-	pCurDq->iCsStride[1]	= pDecPic->iLineSize[1];
-	pCurDq->iCsStride[2]	= pDecPic->iLineSize[2];
-
-	// a base layer is available exactly when a reference layer is attached
-	pCurDq->bBaseLayerAvailableFlag	= ( pCurDq->pRefLayer != NULL );
-}
-
-/*
- * Select the mode-decision / motion-estimation function pointers for the slice about
- * to be coded. Only P and I slices are handled; other slice types leave the function
- * list untouched.
- *
- * The choice depends on whether the current layer is the last configured dependency
- * layer (uiDependencyId + 1 == iNumDependencyLayer):
- *  - last layer  : SAD based costs and the "Vaa" fine-partition routines;
- *  - other layers: SATD based costs (including the 4x4 combined intra function) and
- *                  the plain fine-partition routines.
- * P slices additionally get the motion search, first-intra-mode and inter
- * fine-partition routines, and always use SATD as motion-estimation cost.
- * Whether a base layer is available does not influence the selection.
- */
-void PreprocessSliceCoding( sWelsEncCtx *pCtx )
-{
-	SDqLayer *pCurLayer		= pCtx->pCurDqLayer;
-	const bool_t kbPSlice	= ( P_SLICE == pCtx->eSliceType );
-
-	if ( !kbPSlice && I_SLICE != pCtx->eSliceType )
-		return;
-
-	const bool_t kbLastDependencyLayer =
-		( pCurLayer->sLayerInfo.sNalHeaderExt.uiDependencyId+1 == pCtx->pSvcParam->iNumDependencyLayer );
-
-	/* intra related pointers, common to I and P slices */
-	if ( kbLastDependencyLayer )
-	{
-		pCtx->pFuncList->sSampleDealingFuncs.pfMdCost = pCtx->pFuncList->sSampleDealingFuncs.pfSampleSad;
-		pCtx->pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3 = pCtx->pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad;
-		pCtx->pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3 = pCtx->pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Sad;
-		pCtx->pFuncList->pfIntraFineMd = WelsMdIntraFinePartitionVaa;
-	}
-	else
-	{
-		pCtx->pFuncList->sSampleDealingFuncs.pfMdCost = pCtx->pFuncList->sSampleDealingFuncs.pfSampleSatd;
-		pCtx->pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3 = pCtx->pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd;
-		pCtx->pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3 = pCtx->pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd;
-		pCtx->pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3 = pCtx->pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd;
-		pCtx->pFuncList->pfIntraFineMd = WelsMdIntraFinePartition;
-	}
-
-	/* inter related pointers, P slices only */
-	if ( kbPSlice )
-	{
-		if ( kbLastDependencyLayer )
-		{
-			pCtx->pFuncList->pfMotionSearch = WelsMotionEstimateSearchSad;
-			pCtx->pFuncList->pfInterFineMd = WelsMdInterFinePartitionVaa;
-		}
-		else
-		{
-			pCtx->pFuncList->pfMotionSearch = WelsMotionEstimateSearchSatd;
-			pCtx->pFuncList->pfInterFineMd = WelsMdInterFinePartition;
-		}
-		pCtx->pFuncList->pfFirstIntraMode = WelsMdFirstIntraMode;
-		pCtx->pFuncList->sSampleDealingFuncs.pfMeCost = pCtx->pFuncList->sSampleDealingFuncs.pfSampleSatd;
-	}
-}
-
-/*!
- * \brief	advance to the next dependency layer
- *
- *			The layer stored at index uiDependencyId + 1 of ppDqLayerList becomes the
- *			current layer and the previously current layer becomes its reference layer.
- */
-static inline void WelsSwapDqLayers( sWelsEncCtx *pCtx )
-{
-	SDqLayer *pPrevLayer			= pCtx->pCurDqLayer;
-
-	pCtx->pCurDqLayer				= pCtx->ppDqLayerList[pCtx->uiDependencyId + 1];
-	pCtx->pCurDqLayer->pRefLayer	= pPrevLayer;
-}
-
-/*!
- * \brief	select the reference picture after WelsBuildRefList
- *
- *			For every frame type except IDR, entry 0 of pRefList0 becomes the reference
- *			picture of both the context and the current layer, and all slice headers get
- *			reference index 0. For IDR frames both reference pointers are cleared and the
- *			slice headers receive (uint8_t)-1 as reference index.
- */
-static inline void PrefetchReferencePicture( sWelsEncCtx *pCtx, const EFrameType keFrameType )
-{
-	SSlice *pSlices				= &pCtx->pCurDqLayer->sLayerInfo.pSliceInLayer[0];
-	const int32_t kiSliceCount	= GetCurrentSliceNum( pCtx->pCurDqLayer->pSliceEncCtx );
-	uint8_t uiRefIdx			= (uint8_t)(-1);
-	int32_t iSlice				= 0;
-
-	assert( kiSliceCount > 0 );
-
-	if ( WELS_FRAME_TYPE_IDR == keFrameType )
-	{
-		// safe for IDR coding
-		pCtx->pRefPic				= NULL;
-		pCtx->pCurDqLayer->pRefPic	= NULL;
-	}
-	else
-	{
-		assert( pCtx->iNumRef0 > 0 );
-		pCtx->pRefPic				= pCtx->pRefList0[0];	// always get item 0 due to reordering done
-		pCtx->pCurDqLayer->pRefPic	= pCtx->pRefPic;
-		uiRefIdx					= 0;	// reordered reference index
-	}
-
-	for ( iSlice = 0; iSlice < kiSliceCount; ++ iSlice )
-		pSlices[iSlice].sSliceHeaderExt.sSliceHeader.uiRefIndex	= uiRefIdx;
-}
-
-
-/*!
- * \brief	move one parameter set to the next free id in the bitstream
- *
- *			The id written to the bitstream is "encoder id + iParaSetIdDelta[encoder id]".
- *			This function points the delta of kiCurEncoderParaSetId at
- *			uiNextParaSetIdToUseInBs, releases the id the set used before, marks the new
- *			one as used and then searches (with wrap-around at kuiMaxIdInBs) for the next
- *			unused id.
- *
- *			Example with 32 ids:
- *			1st enter:	next_spsid_in_bs == 0; spsid == 0; delta==0;			//actual spsid_in_bs == 0
- *			1st finish:	next_spsid_in_bs == 1;
- *			2nd enter:	next_spsid_in_bs == 1; spsid == 0; delta==1;			//actual spsid_in_bs == 1
- *			2nd finish:	next_spsid_in_bs == 2;
- *			32nd enter:	next_spsid_in_bs == 31; spsid == 0~2; delta==31~29;	//actual spsid_in_bs == 31
- *			32nd finish:	next_spsid_in_bs == 0;
- *			33rd enter:	next_spsid_in_bs == 0; spsid == 0~2; delta==-2~0;		//actual spsid_in_bs == 0
- *			33rd finish:	next_spsid_in_bs == 1;
- *
- * \pParam	sParaSetOffsetVariable	id bookkeeping of one parameter set type
- * \pParam	kiCurEncoderParaSetId	encoder side id of the parameter set being written
- * \pParam	kuiMaxIdInBs			number of ids usable in the bitstream (ids wrap to 0 on reaching it)
- */
-void ParasetIdAdditionIdAdjust( SParaSetOffsetVariable *sParaSetOffsetVariable, const int32_t kiCurEncoderParaSetId, const uint32_t kuiMaxIdInBs )
-{
-	//SPS_ID in avc_sps and pSubsetSps will be different using this
-	const int32_t kiEncId			= kiCurEncoderParaSetId;
-	const uint32_t kuiPrevIdInBs	= sParaSetOffsetVariable->iParaSetIdDelta[kiEncId] + kiEncId;//mark current_id
-	const bool_t *kpUsedIdPointer   = &sParaSetOffsetVariable->bUsedParaSetIdInBs[0];
-	uint32_t uiNextIdInBs			= sParaSetOffsetVariable->uiNextParaSetIdToUseInBs;
-	uint32_t uiCheckedIds			= 0;
-
-#if _DEBUG
-	if ( 0 != sParaSetOffsetVariable->iParaSetIdDelta[kiEncId] )
-		assert ( sParaSetOffsetVariable->bUsedParaSetIdInBs[kuiPrevIdInBs] ); //sure the prev-used one was marked activated correctly
-#endif
-	//update current layer's pCodingParam
-	sParaSetOffsetVariable->iParaSetIdDelta[kiEncId]	= uiNextIdInBs - kiEncId;  //for current parameter set, change its id_delta
-	//write pso pData for next update:
-	//   release the previous id first, so that prev == next still ends up marked as used
-	sParaSetOffsetVariable->bUsedParaSetIdInBs[kuiPrevIdInBs] = false;
-	sParaSetOffsetVariable->bUsedParaSetIdInBs[uiNextIdInBs] = true;		//   update current used_id
-
-	//prepare for next update:
-	//   find the next available id; at most kuiMaxIdInBs candidates are inspected so the
-	//   search terminates even when every id is marked as used
-	do
-	{
-		++uiNextIdInBs;
-		if (uiNextIdInBs >= kuiMaxIdInBs )
-		{
-			uiNextIdInBs = 0;//ensure the id would not exceed the maximum count
-		}
-		++uiCheckedIds;
-	}while ( kpUsedIdPointer[uiNextIdInBs] && uiCheckedIds < kuiMaxIdInBs );
-
-	//   update next_id
-	sParaSetOffsetVariable->uiNextParaSetIdToUseInBs = uiNextIdInBs;
-
-#if _DEBUG
-	assert ( !sParaSetOffsetVariable->bUsedParaSetIdInBs[uiNextIdInBs] ); //sure the next-to-use one is not marked as used
-#endif
-
-}
-
-/*!
- * \brief	write all parameter sets introduced in SVC extension
- *
- *			Every SPS / subset SPS is written first, followed by every PPS. Each parameter
- *			set is emitted as one NAL unit appended to pCtx->pFrameBs at pCtx->iPosBsBuffer.
- *
- * \pParam	pCtx		encoder context
- * \pParam	pNalLen		output, one length in bytes per NAL written; the caller must provide
- *						room for iSpsNum + iPpsNum entries (no bound is checked here)
- * \pParam	pNumNal		output, number of NAL units written
- * \return	size in bytes of bitstream written; 0 when any argument is NULL
- */
-int32_t WelsWriteParameterSets( sWelsEncCtx *pCtx, int32_t *pNalLen, int32_t *pNumNal )
-{
-	int32_t iSize	= 0;
-	int32_t iNal	= 0;
-	int32_t	iIdx	= 0;
-	int32_t iId	= 0;
-	int32_t iCountNal	= 0;
-
-	if ( NULL == pCtx || NULL == pNalLen || NULL == pNumNal )
-		return 0;	
-		
-	/* write all SPS */
-	iIdx = 0;
-	while (iIdx < pCtx->iSpsNum) {
-		SDqIdc *pDqIdc		= &pCtx->pDqIdcMap[iIdx];
-		const int32_t kiDid	= pDqIdc->uiSpatialId;
-		const bool_t kbUsingSubsetSps = (kiDid > BASE_DEPENDENCY_ID);	// layers above the base dependency id are written as subset SPS
-
-		iNal	= pCtx->pOut->iNalIndex;	// index of the NAL that is loaded below
-
-		if ( pCtx->pSvcParam->bEnableSpsPpsIdAddition )
-		{
-#if _DEBUG
-			pCtx->sPSOVector.bEnableSpsPpsIdAddition = 1;	// NOTE(review): only assigned in _DEBUG builds here -- confirm it is also set elsewhere for release builds
-			assert(kiDid < MAX_DEPENDENCY_LAYER);
-			assert(iIdx < MAX_DQ_LAYER_NUM);
-#endif
-
-			ParasetIdAdditionIdAdjust( &(pCtx->sPSOVector.sParaSetOffsetVariable[kbUsingSubsetSps ? PARA_SET_TYPE_SUBSETSPS : PARA_SET_TYPE_AVCSPS]), 
-				(kbUsingSubsetSps)?(pCtx->pSubsetArray[iIdx - 1].pSps.uiSpsId):(pCtx->pSpsArray[0].uiSpsId ), 
-				MAX_SPS_COUNT );
-		}
-		else
-		{
-			memset(&(pCtx->sPSOVector), 0, sizeof(pCtx->sPSOVector)  );	// id addition disabled: the whole offset bookkeeping is zeroed
-		}
-
-		if ( kbUsingSubsetSps ){
-			iId	= iIdx - 1;	// subset SPS array index is the SPS loop index minus one
-			
-			/* generate Subset SPS */
-			WelsLoadNal( pCtx->pOut, NAL_UNIT_SUBSET_SPS, NRI_PRI_HIGHEST );
-
-			WelsWriteSubsetSpsSyntax( &pCtx->pSubsetArray[iId], &pCtx->pOut->sBsWrite, &(pCtx->sPSOVector.sParaSetOffsetVariable[PARA_SET_TYPE_SUBSETSPS].iParaSetIdDelta[0]) );
-			WelsUnloadNal( pCtx->pOut );
-		}
-		else{
-			iId	= 0;
-			
-			/* generate sequence parameters set */
-			WelsLoadNal( pCtx->pOut, NAL_UNIT_SPS, NRI_PRI_HIGHEST );
-			WelsWriteSpsNal( &pCtx->pSpsArray[0], &pCtx->pOut->sBsWrite,  &(pCtx->sPSOVector.sParaSetOffsetVariable[PARA_SET_TYPE_AVCSPS].iParaSetIdDelta[0]) );
-			WelsUnloadNal( pCtx->pOut );
-		}
-		
-		pNalLen[iCountNal] = WelsEncodeNal( &pCtx->pOut->sNalList[iNal], pCtx->pFrameBs + pCtx->iPosBsBuffer, &pNalLen[iCountNal] );	// the returned size overwrites whatever the callee stored through the pointer
-
-		pCtx->iPosBsBuffer	+= pNalLen[iCountNal];
-		iSize				+= pNalLen[iCountNal];
-		
-		++ iIdx;
-		++ iCountNal;
-	}	
-	
-	/* write all PPS */
-	iIdx = 0;
-	while (iIdx < pCtx->iPpsNum) {
-		if ( pCtx->pSvcParam->bEnableSpsPpsIdAddition )
-		{
-			//para_set_type = 2: PPS, use MAX_PPS_COUNT
-			ParasetIdAdditionIdAdjust( &pCtx->sPSOVector.sParaSetOffsetVariable[PARA_SET_TYPE_PPS], pCtx->pPPSArray[iIdx].iPpsId, MAX_PPS_COUNT );
-		}
-
-		iNal	= pCtx->pOut->iNalIndex;
-		/* generate picture parameter set */
-		WelsLoadNal( pCtx->pOut, NAL_UNIT_PPS, NRI_PRI_HIGHEST );
-		WelsWritePpsSyntax( &pCtx->pPPSArray[iIdx], &pCtx->pOut->sBsWrite, &(pCtx->sPSOVector) );
-		WelsUnloadNal( pCtx->pOut );
-		
-		pNalLen[iCountNal] = WelsEncodeNal( &pCtx->pOut->sNalList[iNal], pCtx->pFrameBs + pCtx->iPosBsBuffer, &pNalLen[iCountNal] );
-		
-		pCtx->iPosBsBuffer	+= pNalLen[iCountNal];
-		iSize				+= pNalLen[iCountNal];
-		
-		++ iIdx;
-		++ iCountNal;
-	}
-	
-	*pNumNal = iCountNal;	// SPS and PPS NAL units together
-	
-	return iSize;
-}
-
-/*
- * Emit one prefix NAL unit for the current layer into the frame bitstream buffer.
- *
- * The prefix NAL RBSP syntax is written only when keNalRefIdc differs from
- * NRI_PRI_LOWEST; otherwise the NAL carries no RBSP syntax and consists of the NAL unit
- * header extension alone. In both cases the encoded size is added to
- * pCtx->iPosBsBuffer, stored in pLayerBsInfo->iNalLengthInByte at *pNalIdxInLayer, and
- * *pNalIdxInLayer is advanced by one.
- *
- * Returns the encoded size in bytes.
- */
-static inline int32_t AddPrefixNal(	sWelsEncCtx *pCtx,
-									 SLayerBSInfo *pLayerBsInfo,
-									 int32_t *pNalLen,
-									 int32_t *pNalIdxInLayer,
-									 const EWelsNalUnitType keNalType,
-									 const EWelsNalRefIdc keNalRefIdc	)
-{
-	int32_t iEncodedSize = 0;
-
-	WelsLoadNal( pCtx->pOut, NAL_UNIT_PREFIX, keNalRefIdc );
-	if ( NRI_PRI_LOWEST != keNalRefIdc )
-	{
-		WelsWriteSVCPrefixNal( &pCtx->pOut->sBsWrite, keNalRefIdc, (NAL_UNIT_CODED_SLICE_IDR == keNalType) );
-	}
-	// else: no prefix NAL unit RBSP syntax, only the NAL unit header extension is needed
-	WelsUnloadNal( pCtx->pOut );
-
-	// the NAL just unloaded is the one right before the current NAL index
-	iEncodedSize = WelsEncodeNalExt(	&pCtx->pOut->sNalList[pCtx->pOut->iNalIndex-1],
-		&pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt,
-		pCtx->pFrameBs + pCtx->iPosBsBuffer,
-		&pNalLen[*pNalIdxInLayer]	);
-
-	pCtx->iPosBsBuffer								+= iEncodedSize;
-	pLayerBsInfo->iNalLengthInByte[*pNalIdxInLayer]	= iEncodedSize;
-	++ (*pNalIdxInLayer);
-
-	return iEncodedSize;
-}
-
-/*
- * Append one filler-data NAL unit carrying iLen bytes of 0xff to the frame bitstream.
- *
- * Nothing is written and 0 is returned when the bit-string writer has fewer than iLen
- * bytes left or when no NAL slot is free (iNalIndex >= iCountNals). Otherwise the
- * encoded NAL is appended at pCtx->iPosBsBuffer, the position is advanced, and the
- * encoded size in bytes is returned.
- */
-int32_t WritePadding(sWelsEncCtx *pCtx, int32_t iLen)
-{
-	const int32_t kiNal	= pCtx->pOut->iNalIndex;
-	SBitStringAux *pBs	= &pCtx->pOut->sBsWrite;	// bit-string writer shared by the non VCL NAL units
-	int32_t iNalLen		= 0;
-	int32_t i			= 0;
-
-	if((pBs->pBufEnd - pBs->pBufPtr) < iLen || kiNal >= pCtx->pOut->iCountNals)
-	{
-#if GOM_TRACE_FLAG
-		// the pointer difference is cast because "%d" expects an int-sized argument
-		WelsLog( pCtx, WELS_LOG_ERROR,"[RC] paddingcal pBuffer overflow, bufferlen=%d, paddinglen=%d, iNalIdx= %d, iCountNals= %d\n",
-			(int32_t)(pBs->pBufEnd-pBs->pBufPtr), iLen, kiNal, pCtx->pOut->iCountNals);
-#endif
-		return 0;
-	}
-
-	WelsLoadNal( pCtx->pOut, NAL_UNIT_FILLER_DATA, NRI_PRI_LOWEST );
-
-	for(i=0;i<iLen;i++)
-	{
-		BsWriteBits( pBs, 8, 0xff);
-	}
-
-	BsRbspTrailingBits( pBs );
-
-	BsFlush( pBs );
-
-	WelsUnloadNal( pCtx->pOut );
-	iNalLen = WelsEncodeNal( &pCtx->pOut->sNalList[kiNal], pCtx->pFrameBs + pCtx->iPosBsBuffer, &iNalLen );
-
-	pCtx->iPosBsBuffer	+= iNalLen;
-
-	return iNalLen;
-}
-
-/*
- * post process of dynamic slicing bs writing in case PACKING_ONE_SLICE_PER_LAYER
- * include: count bs size of over all the slices in layer, 
- * return: count number of slices in layer
- */
-#if defined(MT_ENABLED) && defined(PACKING_ONE_SLICE_PER_LAYER)
-/*
- * pCtx:			encoder context; the per-partition slice counts come from the current layer
- * pLayerBsInfo:	array of layer bitstream infos (one per slice slot), compacted in place
- * pLayerSize:		output, sum of pCountBsSizeInPartition over all partitions
- * kiPartitionCnt:	number of partitions
- *
- * Slot iCheckingIdx belongs to partition (iCheckingIdx % kiPartitionCnt) and is in use
- * when (iCheckingIdx / kiPartitionCnt) is below the number of slices coded in that
- * partition. Used slots are moved down over unused ones so that the first iSliceCount
- * entries of pLayerBsInfo are all valid.
- */
-int32_t PostProcDynamicSlicingBsWriting( sWelsEncCtx *pCtx, SLayerBSInfo *pLayerBsInfo, int32_t *pLayerSize, const int32_t kiPartitionCnt )
-{
-	SDqLayer *pCurDq		= pCtx->pCurDqLayer;
-	int32_t iPartitionIdx	= 0;
-	int32_t iCheckingIdx	= 0;
-	int32_t iSwappingIdx	= -1;	// first free destination slot, -1 until a gap has been found
-	int32_t iSliceCount		= 0;
-	int32_t iLayerSize		= 0;
-
-	// count number of slices in layer and layer size
-	while(iPartitionIdx < kiPartitionCnt)
-	{
-		const int32_t coded_slice_cnt = pCurDq->pNumSliceCodedOfPartition[iPartitionIdx];
-		iLayerSize += pCtx->pSliceThreading->pCountBsSizeInPartition[iPartitionIdx];
-		iSliceCount += coded_slice_cnt;
-		++ iPartitionIdx;
-	}
-	*pLayerSize	= iLayerSize;
-
-	// reordering pLayerBs pointers, but do not ensure raster scan order of picture
-	// just maintain discontinuous items,i.e,
-	// input:
-	// partition 1: uiSliceIdx: 0 2 4 6
-	// partition 2: uiSliceIdx: 1 3 5 7 9 11 13
-	// output:
-	// uiSliceIdx: 0 1 2 3 4 5 6 7 8 9 10
-	iCheckingIdx = 0;
-	while(true)
-	{
-		bool_t bMatchFlag = false;
-		iPartitionIdx = 0;
-		while(iPartitionIdx < kiPartitionCnt)
-		{
-			const int32_t coded_slice_cnt = pCurDq->pNumSliceCodedOfPartition[iPartitionIdx];
-			// iCheckingIdx need convert to iIndex of iPartitionIdx based to avoid linear searching
-			// belong this partition and not exceed the number of slices coded in partition
-			if ( iPartitionIdx == (iCheckingIdx % kiPartitionCnt)
-				&& iCheckingIdx / kiPartitionCnt < coded_slice_cnt )
-			{
-				if ( iSwappingIdx >= 0 )
-				{
-					// move the used slot down into the first free one
-					memmove(pLayerBsInfo+iSwappingIdx, pLayerBsInfo+iCheckingIdx, sizeof(SLayerBSInfo));	// confirmed_safe_unsafe_usage
-					++ iSwappingIdx;	// record iSwappingIdx
-				}
-				++ iCheckingIdx;
-				bMatchFlag = true;
-				break;
-			}
-			++ iPartitionIdx;
-		}
-		if ( !bMatchFlag )
-		{
-			// unused slot: remember the first gap as destination for later moves
-			if ( iSwappingIdx < 0 )
-				iSwappingIdx = iCheckingIdx;
-			++ iCheckingIdx;
-		}
-		// done once the destination index has passed every coded slice
-		if ( iSwappingIdx >= iSliceCount )
-			break;
-	}
-
-	return iSliceCount;
-}
-#endif//MT_ENABLED && PACKING_ONE_SLICE_PER_LAYER
-
-/*
- * Force IDR coding: raises bEncCurFrmAsIdrFlag and resets iCodingIndex to 0.
- * Returns 0 on success, 1 when pCtx is NULL.
- */
-int32_t ForceCodingIDR( sWelsEncCtx *pCtx )
-{
-	if ( pCtx != NULL )
-	{
-		pCtx->bEncCurFrmAsIdrFlag	= true;
-		pCtx->iCodingIndex			= 0;
-		return 0;
-	}
-
-	return 1;
-}
-
-/*!
- * \brief	core svc encoding process
- *
- * \pParam	pCtx			sWelsEncCtx*, encoder context
- * \pParam	pDst			FrameBSInfo*
- * \pParam	pSrc			SSourcePicture* for need_ds = true or SSourcePicture** for need_ds = false
- * \pParam	iConfiguredLayerNum	=1 in case need_ds = true or >1 in case need_ds = false
- * \pParam	need_ds		Indicate whether need down sampling desired
- *						[NO in picture list case, YES in console aplication based]
- * \return	EFrameType (WELS_FRAME_TYPE_IDR/WELS_FRAME_TYPE_I/WELS_FRAME_TYPE_P)
- */
-int32_t WelsEncoderEncodeExt( sWelsEncCtx *pCtx, void *pDst, const SSourcePicture **ppSrcList, const int32_t iConfiguredLayerNum )
-{
-	SFrameBSInfo *pFbi					= (SFrameBSInfo *)pDst;
-	SLayerBSInfo *pLayerBsInfo					= &pFbi->sLayerInfo[0];
-	SWelsSvcCodingParam *pSvcParam	= pCtx->pSvcParam;
-	SSpatialPicIndex *pSpatialIndexMap= &pCtx->sSpatialIndexMap[0];
-#if defined(ENABLE_FRAME_DUMP) || defined(ENABLE_PSNR_CALC)
-	SPicture *fsnr						= NULL;
-#endif//ENABLE_FRAME_DUMP || ENABLE_PSNR_CALC
-	SPicture *pEncPic						= NULL;	// to be decided later
-#if defined(MT_ENABLED) && (defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG))
-	int32_t did_list[MAX_DEPENDENCY_LAYER]	= {0};	
-#endif//MT_ENABLED && DYNAMIC_SLICE_ASSIGN
-	int32_t iLayerNum					= 0;
-	int32_t iLayerSize					= 0;
-	int32_t iSpatialNum					= 0; // available count number of spatial layers due to frame size changed in this given frame
-	int32_t iSpatialIdx					= 0; // iIndex of spatial layers due to frame size changed in this given frame
-	int32_t iFrameSize					= 0;
-	int32_t iNalLen[128]				= {0};
-	int32_t iNalIdxInLayer			= 0;
-	int32_t iCountNal					= 0;
-	EFrameType eFrameType				= WELS_FRAME_TYPE_AUTO;	
-	int32_t iCurWidth					= 0;
-	int32_t iCurHeight					= 0;
-	EWelsNalUnitType eNalType			= NAL_UNIT_UNSPEC_0;
-	EWelsNalRefIdc eNalRefIdc			= NRI_PRI_LOWEST;
-	int8_t iCurDid						= 0;
-	int8_t iCurTid						= 0;
-	bool_t bAvcBased					= false;
-#if defined(ENABLE_PSNR_CALC)
-	real32_t snr_y = .0f, snr_u = .0f, snr_v = .0f;
-#endif//ENABLE_PSNR_CALC
-
-#if defined(_DEBUG)
-	int32_t i = 0, j = 0, k = 0;
-#endif//_DEBUG
-
-	pFbi->iLayerNum	= 0;	// for initialization
-
-	// perform csc/denoise/downsample/padding, generate spatial layers
-	iSpatialNum = pCtx->pVpp->WelsPreprocessStep1(pCtx, ppSrcList, iConfiguredLayerNum);	
-	if ( iSpatialNum < 1 )	// skip due to temporal layer settings (different frame rate)
-	{
-		++ pCtx->iCodingIndex;
-		return WELS_FRAME_TYPE_SKIP;
-	}
-
-	eFrameType = DecideFrameType( pCtx, iSpatialNum );
-	if (eFrameType == WELS_FRAME_TYPE_SKIP)
-		return eFrameType;
-
-	InitFrameCoding( pCtx, eFrameType );
-
-	iCurTid	= GetTemporalLevel( &pSvcParam->sDependencyLayers[pSpatialIndexMap->iDid], pCtx->iCodingIndex, pSvcParam->uiGopSize );
-	pCtx->uiTemporalId	= iCurTid;
-	
-	pLayerBsInfo->pBsBuf	= pCtx->pFrameBs ;
-
-	if ( eFrameType == WELS_FRAME_TYPE_IDR  )
-	{
-		++ pCtx->sPSOVector.uiIdrPicId;
-		//if ( pSvcParam->bEnableSSEI )
-		
-		// write parameter sets bitstream here
-		WelsWriteParameterSets( pCtx, &iNalLen[0], &iCountNal );
-
-		pLayerBsInfo->uiPriorityId	= 0;
-		pLayerBsInfo->uiSpatialId		= 0;
-		pLayerBsInfo->uiTemporalId	= 0;
-		pLayerBsInfo->uiQualityId		= 0;
-		pLayerBsInfo->uiLayerType		= NON_VIDEO_CODING_LAYER;
-		pLayerBsInfo->iNalCount		= iCountNal;
-		for (int32_t iNalIndex	= 0; iNalIndex < iCountNal; ++ iNalIndex)
-		{
-			pLayerBsInfo->iNalLengthInByte[iNalIndex]	= iNalLen[iNalIndex];
-		}
-
-		++ pLayerBsInfo;
-		pLayerBsInfo->pBsBuf			= pCtx->pFrameBs + pCtx->iPosBsBuffer;
-		++ iLayerNum;
-	}
-
-	pCtx->pCurDqLayer				= pCtx->ppDqLayerList[pSpatialIndexMap->iDid];
-	pCtx->pCurDqLayer->pRefLayer	= NULL;
-
-	while ( iSpatialIdx < iSpatialNum )
-	{		
-		const int32_t d_idx			= (pSpatialIndexMap+iSpatialIdx)->iDid;	// get iDid
-		SDLayerParam *param_d		= &pSvcParam->sDependencyLayers[d_idx];			
-
-		pCtx->uiDependencyId	= iCurDid = (int8_t)d_idx;
-		pCtx->pVpp->WelsPreprocessStep3(pCtx, d_idx);
-
-		pCtx->pEncPic	 = pEncPic = (pSpatialIndexMap+iSpatialIdx)->pSrc;
-		pCtx->pEncPic->iPictureType	= pCtx->eSliceType;
-		pCtx->pEncPic->iFramePoc		= pCtx->iPOC;
-
-		iCurWidth	= param_d->iFrameWidth;
-		iCurHeight	= param_d->iFrameHeight;
-
-#if defined(MT_ENABLED) && (defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG))
-		did_list[iSpatialIdx]	= iCurDid;
-#endif//MT_ENABLED && DYNAMIC_SLICE_ASSIGN
-		
-		// Encoding this picture might mulitiple sQualityStat layers potentially be encoded as followed
-
-		switch ( param_d->sMso.uiSliceMode )
-		{
-		case SM_FIXEDSLCNUM_SLICE:
-			{
-#if defined(MT_ENABLED) && defined(DYNAMIC_SLICE_ASSIGN)	
-				if ( (iCurDid > 0) && (pSvcParam->iMultipleThreadIdc > 1) &&
-					(pSvcParam->sDependencyLayers[iCurDid].sMso.uiSliceMode == SM_FIXEDSLCNUM_SLICE && pSvcParam->iMultipleThreadIdc >= pSvcParam->sDependencyLayers[iCurDid].sMso.sSliceArgument.iSliceNum )
-					) 
-					AdjustEnhanceLayer( pCtx, iCurDid );
-#endif//MT_ENABLED && DYNAMIC_SLICE_ASSIGN
-				break;
-			}
-		case SM_DYN_SLICE:
-			{
-				int32_t iPicIPartitionNum = PicPartitionNumDecision( pCtx );
-				// MT compatibility
-				pCtx->iActiveThreadsNum	= iPicIPartitionNum;	// we try to active number of threads, equal to number of picture partitions
-				WelsInitCurrentDlayerMltslc( pCtx, iPicIPartitionNum );
-				break;
-			}
-		default:
-			{
-				break;
-			}
-		}
-
-		/* coding each spatial layer, only one sQualityStat layer within spatial support */
-		int32_t iSliceCount	= 1;			
-		if ( iLayerNum >= MAX_LAYER_NUM_OF_FRAME )	// check available layer_bs_info writing as follows
-		{
-			WelsLog( pCtx, WELS_LOG_ERROR, "WelsEncoderEncodeExt(), iLayerNum(%d) overflow(max:%d)!", iLayerNum, MAX_LAYER_NUM_OF_FRAME);
-			return -1;
-		}
-
-		iNalIdxInLayer	= 0;
-		bAvcBased	= (iCurDid == BASE_DEPENDENCY_ID);
-		pCtx->bNeedPrefixNalFlag	= (bAvcBased && 
-			(pSvcParam->bPrefixNalAddingCtrl || 
-			(pSvcParam->iNumDependencyLayer > 1) ));
-
-		if ( eFrameType == WELS_FRAME_TYPE_P )
-		{
-			eNalType	= bAvcBased ? NAL_UNIT_CODED_SLICE : NAL_UNIT_CODED_SLICE_EXT;					
-		}
-		else if ( eFrameType == WELS_FRAME_TYPE_IDR )
-		{
-			eNalType	= bAvcBased ? NAL_UNIT_CODED_SLICE_IDR : NAL_UNIT_CODED_SLICE_EXT;
-		}
-		if ( iCurTid == 0 || pCtx->eSliceType == I_SLICE )
-			eNalRefIdc	= NRI_PRI_HIGHEST;
-		else if ( iCurTid == pSvcParam->iDecompStages )
-			eNalRefIdc	= NRI_PRI_LOWEST;
-		else if ( 1 + iCurTid == pSvcParam->iDecompStages )
-			eNalRefIdc	= NRI_PRI_LOW;
-		else	// more details for other temporal layers?
-			eNalRefIdc	= NRI_PRI_HIGHEST;
-		pCtx->eNalType		= eNalType;
-		pCtx->eNalPriority	= eNalRefIdc;				
-
-		pCtx->pDecPic					= pCtx->ppRefPicListExt[iCurDid]->pNextBuffer;
-#if defined(ENABLE_FRAME_DUMP) || defined(ENABLE_PSNR_CALC)
-		fsnr					= pCtx->pDecPic;
-#endif//#if defined(ENABLE_FRAME_DUMP) || defined(ENABLE_PSNR_CALC)
-		pCtx->pDecPic->iPictureType	= pCtx->eSliceType;				
-		pCtx->pDecPic->iFramePoc		= pCtx->iPOC;				
-
-		WelsInitCurrentLayer( pCtx, iCurWidth, iCurHeight );
-
-		WelsMarkPic(pCtx);
-		if ( !WelsBuildRefList( pCtx, pCtx->iPOC ) )
-		{
-			// Force coding IDR as followed
-			ForceCodingIDR( pCtx );
-			WelsLog(pCtx, WELS_LOG_WARNING, "WelsEncoderEncodeExt(), WelsBuildRefList failed for P frames, pCtx->iNumRef0= %d.\n", pCtx->iNumRef0);
-			return -1;
-		}
-#ifdef LONG_TERM_REF_DUMP
-		dump_ref(pCtx);
-#endif
-		WelsUpdateRefSyntax(pCtx,  pCtx->iPOC, eFrameType);	//get reordering syntax used for writing slice header and transmit to encoder.
-		PrefetchReferencePicture( pCtx, eFrameType );	// update reference picture for current pDq layer
-
-		pCtx->pFuncList->pfRc.pfWelsRcPictureInit(pCtx);
-		PreprocessSliceCoding( pCtx );	// MUST be called after pfWelsRcPictureInit() and WelsInitCurrentLayer()
-
-		iLayerSize	= 0;
-		if ( SM_SINGLE_SLICE == param_d->sMso.uiSliceMode )	// only one slice within a sQualityStat layer
-		{
-			int32_t iSliceSize = 0;					
-			
-			if ( pCtx->bNeedPrefixNalFlag )
-			{
-				iLayerSize += AddPrefixNal( pCtx, pLayerBsInfo, &iNalLen[0], &iNalIdxInLayer, eNalType, eNalRefIdc );
-			}
-			
-			WelsLoadNal( pCtx->pOut, eNalType, eNalRefIdc );
-			
-			WelsCodeOneSlice( pCtx, 0, eNalType );
-			
-			WelsUnloadNal( pCtx->pOut );
-			
-			iSliceSize = WelsEncodeNalExt(	&pCtx->pOut->sNalList[pCtx->pOut->iNalIndex-1],
-											&pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt,
-											pCtx->pFrameBs + pCtx->iPosBsBuffer,
-											&iNalLen[iNalIdxInLayer] );
-			iLayerSize += iSliceSize;
-			pCtx->iPosBsBuffer	+= iSliceSize;
-			pLayerBsInfo->uiLayerType		= VIDEO_CODING_LAYER;
-			pLayerBsInfo->uiSpatialId		= iCurDid;
-			pLayerBsInfo->uiTemporalId	= iCurTid;
-			pLayerBsInfo->uiQualityId		= 0;
-			pLayerBsInfo->uiPriorityId	= 0;
-			pLayerBsInfo->iNalLengthInByte[iNalIdxInLayer]	= iSliceSize;
-			pLayerBsInfo->iNalCount		= ++ iNalIdxInLayer;					
-		}
-		// for dynamic slicing single threading..
-#ifndef MT_ENABLED
-		else if ( SM_DYN_SLICE == param_d->sMso.uiSliceMode )
-#else	// MT_ENABLED
-		else if ( (SM_DYN_SLICE == param_d->sMso.uiSliceMode) && (pSvcParam->iMultipleThreadIdc <= 1) )
-#endif//MT_ENABLED
-		{
-			const int32_t kiLastMbInFrame = pCtx->pCurDqLayer->pSliceEncCtx->iMbNumInFrame;
-			WelsCodeOnePicPartition( pCtx, pLayerBsInfo, &iNalIdxInLayer, &iLayerSize, 0, kiLastMbInFrame, 0 );					
-		}
-		else
-		{//other multi-slice uiSliceMode			
-#if defined(MT_ENABLED)
-            int err = 0;
-			// THREAD_FULLY_FIRE_MODE/THREAD_PICK_UP_MODE for any mode of non-SM_DYN_SLICE
-			if ( (SM_DYN_SLICE != param_d->sMso.uiSliceMode) && (pSvcParam->iMultipleThreadIdc > 1) )
-			{
-				iSliceCount	= GetCurrentSliceNum( pCtx->pCurDqLayer->pSliceEncCtx );
-				if ( iLayerNum +
-#if defined(PACKING_ONE_SLICE_PER_LAYER)
-					iSliceCount
-#else
-					1
-#endif//PACKING_ONE_SLICE_PER_LAYER
-					>= MAX_LAYER_NUM_OF_FRAME )	// check available layer_bs_info for further writing as followed
-				{
-					WelsLog( pCtx, WELS_LOG_ERROR, "WelsEncoderEncodeExt(), iLayerNum(%d) overflow(max:%d) at iDid= %d uiSliceMode= %d, iSliceCount= %d!",
-						iLayerNum, MAX_LAYER_NUM_OF_FRAME, iCurDid, param_d->sMso.uiSliceMode, iSliceCount );
-					return -1;
-				}
-				if ( iSliceCount <= 1 )
-				{
-					WelsLog( pCtx, WELS_LOG_ERROR, "WelsEncoderEncodeExt(), iSliceCount(%d) from GetCurrentSliceNum() is untrusted due stack/heap crupted!\n", iSliceCount );
-					return -1;
-				}
-
-				if ( pSvcParam->iCountThreadsNum >= iSliceCount )	//THREAD_FULLY_FIRE_MODE
-				{
-#if defined(PACKING_ONE_SLICE_PER_LAYER)
-					int32_t iSliceIdx = 1;							
-					int32_t iOrgSlicePos[MAX_SLICES_NUM] = {0};
-					iOrgSlicePos[0] = pCtx->iPosBsBuffer;
-					while (uiSliceIdx < iSliceCount)
-					{
-						iOrgSlicePos[uiSliceIdx] = pCtx->pSliceBs[uiSliceIdx].uiBsPos;
-						++ uiSliceIdx;
-					}
-#elif defined(MT_DEBUG)
-					int64_t t_bs_append = 0;
-#endif//PACKING_ONE_SLICE_PER_LAYER
-					
-					pCtx->iActiveThreadsNum	= iSliceCount;
-					// to fire slice coding threads
-					err = FiredSliceThreads( &pCtx->pSliceThreading->pThreadPEncCtx[0], &pCtx->pSliceThreading->pReadySliceCodingEvent[0], pLayerBsInfo, iSliceCount, pCtx->pCurDqLayer->pSliceEncCtx, FALSE );
-					if ( err )
-					{
-						WelsLog( pCtx, WELS_LOG_ERROR, "[MT] WelsEncoderEncodeExt(), FiredSliceThreads return(%d) failed and exit encoding frame, iCountThreadsNum= %d, iSliceCount= %d, uiSliceMode= %d, iMultipleThreadIdc= %d!!\n",
-							err, pSvcParam->iCountThreadsNum, iSliceCount, param_d->sMso.uiSliceMode, pSvcParam->iMultipleThreadIdc );
-						return -1;
-					}
-				
-					WelsMultipleEventsWaitAllBlocking( iSliceCount, &pCtx->pSliceThreading->pSliceCodedEvent[0] );
-				
-
-					// all slices are finished coding here
-					// append exclusive slice 0 bs to pFrameBs
-#if defined(PACKING_ONE_SLICE_PER_LAYER)
-					iLayerSize = pCtx->iPosBsBuffer - iOrgSlicePos[0];
-					uiSliceIdx = 1;
-					while (uiSliceIdx < iSliceCount)
-					{
-						iLayerSize += pCtx->pSliceBs[uiSliceIdx].uiBsPos - iOrgSlicePos[uiSliceIdx];
-						++ uiSliceIdx;
-					}
-					iLayerNum += iSliceCount;	// each slice stickly output as layer info for performance improvement directly
-					pLayerBsInfo += iSliceCount;
-#else
-#if defined(MT_DEBUG)
-					t_bs_append = WelsTime();
-#endif//MT_DEBUG
-					iLayerSize = AppendSliceToFrameBs( pCtx, pLayerBsInfo, iSliceCount );
-#if defined(MT_DEBUG)
-					t_bs_append = WelsTime() - t_bs_append;
-					if ( pCtx->pSliceThreading->pFSliceDiff )
-					{
-						fprintf(pCtx->pSliceThreading->pFSliceDiff, 
-#if defined(WIN32)
-							"%6I64d us consumed at AppendSliceToFrameBs() for coding_idx: %d iDid: %d qid: %d\n",
-#else
-							"%6lld us consumed at AppendSliceToFrameBs() for coding_idx: %d iDid: %d qid: %d\n",
-#endif//WIN32
-							t_bs_append, pCtx->iCodingIndex, iCurDid, 0 );
-					}
-#endif//MT_DEBUG
-#endif//PACKING_ONE_SLICE_PER_LAYER
-				}
-				else	//THREAD_PICK_UP_MODE
-				{
-					int32_t iNumThreadsRunning = 0;
-					int32_t iNumThreadsScheduled = 0;
-					int32_t iIndexOfSliceToBeCoded = 0;
-#if defined(PACKING_ONE_SLICE_PER_LAYER)
-					int32_t iSliceIdx = 1;							
-					int32_t iOrgSlicePos[MAX_SLICES_NUM] = {0};
-					iOrgSlicePos[0] = pCtx->iPosBsBuffer;
-					while (uiSliceIdx < iSliceCount)
-					{
-						iOrgSlicePos[uiSliceIdx] = pCtx->pSliceBs[uiSliceIdx].uiBsPos;
-						++ uiSliceIdx;
-					}
-#endif//PACKING_ONE_SLICE_PER_LAYER
-
-					pCtx->iActiveThreadsNum	= pSvcParam->iCountThreadsNum;
-					iNumThreadsScheduled	= pCtx->iActiveThreadsNum;
-					iNumThreadsRunning		= iNumThreadsScheduled;
-					// to fire slice coding threads
-					err = FiredSliceThreads( &pCtx->pSliceThreading->pThreadPEncCtx[0], &pCtx->pSliceThreading->pReadySliceCodingEvent[0], pLayerBsInfo, iNumThreadsRunning, pCtx->pCurDqLayer->pSliceEncCtx, FALSE );
-					if ( err )
-					{
-						WelsLog( pCtx, WELS_LOG_ERROR, "[MT] WelsEncoderEncodeExt(), FiredSliceThreads return(%d) failed and exit encoding frame, iCountThreadsNum= %d, iSliceCount= %d, uiSliceMode= %d, iMultipleThreadIdc= %d!!\n",
-							err, pSvcParam->iCountThreadsNum, iSliceCount, param_d->sMso.uiSliceMode, pSvcParam->iMultipleThreadIdc );
-						return -1;
-					}
-
-					iIndexOfSliceToBeCoded = iNumThreadsRunning;
-					while (1)
-					{
-						if ( iIndexOfSliceToBeCoded >= iSliceCount && iNumThreadsRunning <= 0 )
-							break;								
-#ifdef WIN32
-						WELS_THREAD_ERROR_CODE lwait	= 0;
-						int32_t iEventId				= -1;
-						
-						lwait = WelsMultipleEventsWaitSingleBlocking(	iNumThreadsScheduled,
-																		&pCtx->pSliceThreading->pSliceCodedEvent[0],
-																		2 );	// 2 ms for one tick
-						iEventId = (int32_t)(lwait - WELS_THREAD_ERROR_WAIT_OBJECT_0);
-						if ( iEventId >= 0 && iEventId < iNumThreadsScheduled )
-						{									
-							if ( iIndexOfSliceToBeCoded < iSliceCount )
-							{		
-								// pick up succeeding slice for threading
-								// thread_id equal to iEventId per implementation here
-								pCtx->pSliceThreading->pThreadPEncCtx[iEventId].iSliceIndex	= iIndexOfSliceToBeCoded;
-#ifdef PACKING_ONE_SLICE_PER_LAYER
-								pCtx->pSliceThreading->pThreadPEncCtx[iEventId].pLayerBs	= pLayerBsInfo+iIndexOfSliceToBeCoded;
-#endif//PACKING_ONE_SLICE_PER_LAYER
-								WelsEventSignal( &pCtx->pSliceThreading->pReadySliceCodingEvent[iEventId] );
-
-								++ iIndexOfSliceToBeCoded;
-							}
-							else	// no other slices left for coding
-							{										
-								-- iNumThreadsRunning;
-							}
-						}
-						else
-						{
-							WelsSleep(1);
-						}								
-#else//__GNUC__
-						// TODO for pthread platforms
-						// alternate implementation using blocking due non-blocking with timeout mode not support at wels thread lib, tune back if available
-						WelsMultipleEventsWaitAllBlocking( iNumThreadsRunning, &pCtx->pSliceThreading->pSliceCodedEvent[0] );
-						if ( iIndexOfSliceToBeCoded < iSliceCount )
-						{
-							int32_t iThreadIdx = 0;
-							// pick up succeeding slices for threading if left
-							while ( iThreadIdx < iNumThreadsScheduled )
-							{
-								if ( iIndexOfSliceToBeCoded >= iSliceCount )
-									break;
-								pCtx->pSliceThreading->pThreadPEncCtx[iThreadIdx].iSliceIndex = iIndexOfSliceToBeCoded;
-#ifdef PACKING_ONE_SLICE_PER_LAYER
-								pCtx->pSliceThreading->pThreadPEncCtx[iThreadIdx].pLayerBs = pLayerBsInfo+iIndexOfSliceToBeCoded;
-#endif//PACKING_ONE_SLICE_PER_LAYER
-								WelsEventSignal( pCtx->pSliceThreading->pReadySliceCodingEvent[iThreadIdx] );
-
-								++ iIndexOfSliceToBeCoded;
-								++ iThreadIdx;
-							}
-							// update iNumThreadsRunning
-							iNumThreadsRunning		= iThreadIdx;									
-						}
-						else
-						{
-							iNumThreadsRunning = 0;
-						}
-#endif//WIN32
-					}//while(1)
-
-// all slices are finished coding here
-					// append exclusive slice 0 bs to pFrameBs
-#if defined(PACKING_ONE_SLICE_PER_LAYER)
-					iLayerSize = pCtx->iPosBsBuffer - iOrgSlicePos[0];
-					uiSliceIdx = 1;
-					while (uiSliceIdx < iSliceCount)
-					{
-						iLayerSize += pCtx->pSliceBs[uiSliceIdx].uiBsPos - iOrgSlicePos[uiSliceIdx];
-						++ uiSliceIdx;
-					}
-					iLayerNum += iSliceCount;	// each slice stickly output as layer info for performance improvement directly
-					pLayerBsInfo += iSliceCount;
-#else
-					iLayerSize = AppendSliceToFrameBs( pCtx, pLayerBsInfo, iSliceCount );
-#endif//PACKING_ONE_SLICE_PER_LAYER
-				}
-			}					
-			// THREAD_FULLY_FIRE_MODE && SM_DYN_SLICE
-			else if ( (SM_DYN_SLICE == param_d->sMso.uiSliceMode) && (pSvcParam->iMultipleThreadIdc > 1) )
-			{
-				const int32_t kiPartitionCnt	= pCtx->iActiveThreadsNum; //pSvcParam->iCountThreadsNum;
-#if defined(PACKING_ONE_SLICE_PER_LAYER)
-				ResetCountBsSizeInPartitions( pCtx->pSliceThreading->pCountBsSizeInPartition, kiPartitionCnt );
-				pCtx->pCurDqLayer->pSliceEncCtx->iMaxSliceNumConstraint = WELS_MIN ( MAX_SLICES_NUM, DynamicMaxSliceNumConstraint( MAX_LAYER_NUM_OF_FRAME, iLayerNum, 1 + /*( num_qlayer - 1) +*/ ( ( (iCurDid==0) && ( pSvcParam->uiGopSize>1 ) ) ? 1: 0 ) ) );  				
-#endif//PACKING_ONE_SLICE_PER_LAYER
-
-				// to fire slice coding threads
-				err = FiredSliceThreads( &pCtx->pSliceThreading->pThreadPEncCtx[0], &pCtx->pSliceThreading->pReadySliceCodingEvent[0], pLayerBsInfo, kiPartitionCnt, pCtx->pCurDqLayer->pSliceEncCtx, TRUE );
-				if ( err )
-				{
-					WelsLog( pCtx, WELS_LOG_ERROR, "[MT] WelsEncoderEncodeExt(), FiredSliceThreads return(%d) failed and exit encoding frame, iCountThreadsNum= %d, iSliceCount= %d, uiSliceMode= %d, iMultipleThreadIdc= %d!!\n",
-						err, pSvcParam->iCountThreadsNum, iSliceCount, param_d->sMso.uiSliceMode, pSvcParam->iMultipleThreadIdc );
-					return -1;
-				}
-
-				WelsMultipleEventsWaitAllBlocking( kiPartitionCnt, &pCtx->pSliceThreading->pSliceCodedEvent[0] );
-
-#if defined(PACKING_ONE_SLICE_PER_LAYER)						
-				iSliceCount = PostProcDynamicSlicingBsWriting( pCtx, pLayerBsInfo, &iLayerSize, kiPartitionCnt );
-				assert(iLayerNum + iSliceCount < MAX_LAYER_NUM_OF_FRAME);
-				pLayerBsInfo += iSliceCount;
-				iLayerNum += iSliceCount;
-#else
-				iLayerSize = AppendSliceToFrameBs( pCtx, pLayerBsInfo, kiPartitionCnt );
-#endif//PACKING_ONE_SLICE_PER_LAYER
-			}
-			else	// for non-dynamic-slicing mode single threading branch..
-#endif//MT_ENABLED
-			{
-				const bool_t bNeedPrefix	= pCtx->bNeedPrefixNalFlag;
-				int32_t iSliceIdx			= 0;
-
-				iSliceCount	= GetCurrentSliceNum( pCtx->pCurDqLayer->pSliceEncCtx );						
-				while (iSliceIdx < iSliceCount)
-				{
-					int32_t iSliceSize	= 0;
-
-					if ( bNeedPrefix )
-					{
-						iLayerSize += AddPrefixNal( pCtx, pLayerBsInfo, &iNalLen[0], &iNalIdxInLayer, eNalType, eNalRefIdc );
-					}
-					
-					WelsLoadNal( pCtx->pOut, eNalType, eNalRefIdc );
-					WelsCodeOneSlice( pCtx, iSliceIdx, eNalType );
-					WelsUnloadNal( pCtx->pOut );
-					
-					iSliceSize = WelsEncodeNalExt(	&pCtx->pOut->sNalList[pCtx->pOut->iNalIndex-1],
-													&pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt,
-													pCtx->pFrameBs + pCtx->iPosBsBuffer,
-													&iNalLen[iNalIdxInLayer] );
-					pCtx->iPosBsBuffer	+= iSliceSize;
-					iLayerSize	+= iSliceSize;
-					pLayerBsInfo->iNalLengthInByte[iNalIdxInLayer]	= iSliceSize;
-
-#if defined(SLICE_INFO_OUTPUT)
-					fprintf(	stderr,
-								"@slice=%-6d sliceType:%c idc:%d size:%-6d\n",
-								iSliceIdx,
-								(pCtx->eSliceType == P_SLICE ? 'P' : 'I'),
-								eNalRefIdc,
-								iSliceSize	);
-#endif//SLICE_INFO_OUTPUT						
-					++ iNalIdxInLayer;
-					++ iSliceIdx;						
-				}						
-
-				pLayerBsInfo->uiLayerType		= VIDEO_CODING_LAYER;
-				pLayerBsInfo->uiSpatialId		= iCurDid;
-				pLayerBsInfo->uiTemporalId	= iCurTid;
-				pLayerBsInfo->uiQualityId		= 0;
-				pLayerBsInfo->uiPriorityId	= 0;
-				pLayerBsInfo->iNalCount		= iNalIdxInLayer;
-			}
-		}			
-
-		// deblocking filter
-		if (
-#if defined(MT_ENABLED)
-			(!pCtx->pCurDqLayer->bDeblockingParallelFlag) &&
-#endif//MT_ENABLED
-#if !defined(ENABLE_FRAME_DUMP)
-			( (eNalRefIdc != NRI_PRI_LOWEST) && (param_d->iHighestTemporalId == 0 || iCurTid < param_d->iHighestTemporalId) ) &&
-#endif//!ENABLE_FRAME_DUMP
-			true
-		)
-		{
-			PerformDeblockingFilter( pCtx );
-		}
-
-		// reference picture list update				
-		if ( eNalRefIdc != NRI_PRI_LOWEST )
-		{
-			if ( !WelsUpdateRefList( pCtx ) )
-			{
-				// Force coding IDR as followed
-				ForceCodingIDR( pCtx );
-				WelsLog(pCtx, WELS_LOG_WARNING, "WelsEncoderEncodeExt(), WelsUpdateRefList failed.\n");
-				return -1;
-			}
-		}
-
-		iFrameSize += iLayerSize;				
-
-		pCtx->pFuncList->pfRc.pfWelsRcPictureInfoUpdate(pCtx, iLayerSize);
-
-#ifdef ENABLE_FRAME_DUMP
-		// Dump reconstruction picture for each sQualityStat layer
-		if ( iCurDid+1 < pSvcParam->iNumDependencyLayer )
-			DumpDependencyRec( fsnr, &param_d->sRecFileName[0], iCurDid );
-#endif//ENABLE_FRAME_DUMP
-
-#if defined(ENABLE_PSNR_CALC)
-		snr_y	= WelsCalcPsnr(	fsnr->pData[0],
-							fsnr->iLineSize[0],
-							pEncPic->pData[0],
-							pEncPic->iLineSize[0],
-							iCurWidth,
-							iCurHeight	);
-		snr_u	= WelsCalcPsnr(	fsnr->pData[1],
-							fsnr->iLineSize[1],
-							pEncPic->pData[1],
-							pEncPic->iLineSize[1],
-							(iCurWidth>>1),
-							(iCurHeight>>1)	);
-		snr_v	= WelsCalcPsnr(	fsnr->pData[2],
-							fsnr->iLineSize[2],
-							pEncPic->pData[2],
-							pEncPic->iLineSize[2],
-							(iCurWidth>>1),
-							(iCurHeight>>1)	);
-#endif//ENABLE_PSNR_CALC
-
-#if defined(LAYER_INFO_OUTPUT)
-		fprintf( stderr, "%2s %5d: %-5d %2s   T%1d D%1d Q%-2d  QP%3d   Y%2.2f  U%2.2f  V%2.2f  %8d bits\n",
-				 (iSpatialIdx == 0) ? "#AU" : "   ",
-				 pCtx->iPOC,
-				 pCtx->iFrameNum,
-				 (uiFrameType == WELS_FRAME_TYPE_I || uiFrameType == WELS_FRAME_TYPE_IDR) ? "I": "P",
-				 iCurTid,
-				 iCurDid,
-				 0,
-				 pCtx->pWelsSvcRc[pCtx->uiDependencyId].iAverageFrameQp,
-				 snr_y,
-				 snr_u,
-				 snr_v,
-				 (iLayerSize<<3)	);
-#endif//LAYER_INFO_OUTPUT
-
-#if defined(STAT_OUTPUT)
-
-#if defined(ENABLE_PSNR_CALC)
-		{
-			pCtx->sStatData[iCurDid][0].sQualityStat.rYPsnr[pCtx->eSliceType]	+= snr_y;
-			pCtx->sStatData[iCurDid][0].sQualityStat.rUPsnr[pCtx->eSliceType]	+= snr_u;
-			pCtx->sStatData[iCurDid][0].sQualityStat.rVPsnr[pCtx->eSliceType]	+= snr_v;
-		}
-#endif//ENABLE_PSNR_CALC
-		
-#if defined(MB_TYPES_CHECK) //091025, frame output
-		if (pCtx->eSliceType == P_SLICE)
-		{
-			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][Intra4x4] += pCtx->sPerInfo.iMbCount[P_SLICE][Intra4x4];
-			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][Intra16x16] += pCtx->sPerInfo.iMbCount[P_SLICE][Intra16x16];
-			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][Inter16x16] += pCtx->sPerInfo.iMbCount[P_SLICE][Inter16x16];
-			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][Inter16x8] += pCtx->sPerInfo.iMbCount[P_SLICE][Inter16x8];
-			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][Inter8x16] += pCtx->sPerInfo.iMbCount[P_SLICE][Inter8x16];
-			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][Inter8x8] += pCtx->sPerInfo.iMbCount[P_SLICE][Inter8x8];
-			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][PSkip] += pCtx->sPerInfo.iMbCount[P_SLICE][PSkip];
-			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][8] += pCtx->sPerInfo.iMbCount[P_SLICE][8];
-			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][9] += pCtx->sPerInfo.iMbCount[P_SLICE][9];
-			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][10] += pCtx->sPerInfo.iMbCount[P_SLICE][10];
-			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][11] += pCtx->sPerInfo.iMbCount[P_SLICE][11];
-		}
-		else if (pCtx->eSliceType == I_SLICE)
-		{
-			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[I_SLICE][Intra4x4] += pCtx->sPerInfo.iMbCount[I_SLICE][Intra4x4];
-			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[I_SLICE][Intra16x16] += pCtx->sPerInfo.iMbCount[I_SLICE][Intra16x16];
-			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[I_SLICE][7] += pCtx->sPerInfo.iMbCount[I_SLICE][7];
-		}
-		
-		memset(pCtx->sPerInfo.iMbCount[P_SLICE], 0, 18*sizeof( int32_t ));
-		memset(pCtx->sPerInfo.iMbCount[I_SLICE], 0, 18*sizeof( int32_t ));
-
-#endif//MB_TYPES_CHECK
-		{ 
-    		//no pCtx->pSvcParam->bMgsT0OnlyStrategy
-			++ pCtx->sStatData[iCurDid][0].sSliceData.iSliceCount[pCtx->eSliceType];	// for multiple slices coding
-			pCtx->sStatData[iCurDid][0].sSliceData.iSliceSize[pCtx->eSliceType]	+= (iLayerSize<<3);	// bits
-		}
-#endif//STAT_OUTPUT
-
-#if defined(MT_ENABLED) && defined(PACKING_ONE_SLICE_PER_LAYER)
-		if ( pSvcParam->iMultipleThreadIdc <= 1 || SM_SINGLE_SLICE == param_d->sMso.uiSliceMode )	// sigle thread actually used
-#else
-		if ( 1 )
-#endif//MT_ENABLED && PACKING_ONE_SLICE_PER_LAYER
-		{
-			++ iLayerNum;
-			++ pLayerBsInfo;
-		}
-
-						
-		pLayerBsInfo->pBsBuf	= pCtx->pFrameBs + pCtx->iPosBsBuffer;
-
-		if( pSvcParam->iPaddingFlag && pCtx->pWelsSvcRc[pCtx->uiDependencyId].iPaddingSize > 0 )
-		{
-			const int32_t kiPaddingNalSize = WritePadding(pCtx, pCtx->pWelsSvcRc[pCtx->uiDependencyId].iPaddingSize);
-			
-#if GOM_TRACE_FLAG
-			WelsLog( pCtx, WELS_LOG_INFO,"[RC] encoding_qp%d Padding: %d\n",pCtx->uiDependencyId, pCtx->pWelsSvcRc[pCtx->uiDependencyId].iPaddingSize);
-#endif
-			if ( kiPaddingNalSize <= 0 )
-				return -1;
-
-			pCtx->pWelsSvcRc[pCtx->uiDependencyId].iPaddingBitrateStat += pCtx->pWelsSvcRc[pCtx->uiDependencyId].iPaddingSize;
-			
-			pCtx->pWelsSvcRc[pCtx->uiDependencyId].iPaddingSize=0;
-
-			pLayerBsInfo->uiPriorityId	= 0;
-			pLayerBsInfo->uiSpatialId		= 0;
-			pLayerBsInfo->uiTemporalId	= 0;
-			pLayerBsInfo->uiQualityId		= 0;
-			pLayerBsInfo->uiLayerType		= NON_VIDEO_CODING_LAYER;
-			pLayerBsInfo->iNalCount		= 1;
-			pLayerBsInfo->iNalLengthInByte[0] = kiPaddingNalSize;
-			++ pLayerBsInfo;
-			pLayerBsInfo->pBsBuf	= pCtx->pFrameBs + pCtx->iPosBsBuffer;
-			++ iLayerNum;
-		}
-
-#if defined(MT_ENABLED) && defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
-		if ( param_d->sMso.uiSliceMode == SM_FIXEDSLCNUM_SLICE && pSvcParam->iMultipleThreadIdc > 1 &&
-			 pSvcParam->iMultipleThreadIdc >= param_d->sMso.sSliceArgument.iSliceNum )
-		{
-			CalcSliceComplexRatio( pCtx->pSliceThreading->pSliceComplexRatio[iCurDid], pCtx->pCurDqLayer->pSliceEncCtx, pCtx->pSliceThreading->pSliceConsumeTime[iCurDid] );
-#if defined(MT_DEBUG)
-			TrackSliceComplexities( pCtx, iCurDid );
-#endif//#if defined(MT_DEBUG)
-		}
-#endif//MT_ENABLED && DYNAMIC_SLICE_ASSIGN && TRY_SLICING_BALANCE
-
-		++ iSpatialIdx;		
-
-		if ( iCurDid+1 < pSvcParam->iNumDependencyLayer )
-		{
-			WelsSwapDqLayers( pCtx );
-		}
-
-		if ( pSvcParam->bEnableLongTermReference && (pCtx->pLtr[pCtx->uiDependencyId].bLTRMarkingFlag && (pCtx->pLtr[pCtx->uiDependencyId].iLTRMarkMode == LTR_DELAY_MARK))) 
-		{
-			pCtx->bLongTermRefFlag[d_idx][0] = true;
-		}
-
-		if ( iCurTid < pCtx->uiSpatialLayersInTemporal[d_idx] - 1 || pSvcParam->iDecompStages == 0 )
-		{
-			if ( (iCurTid >= MAX_TEMPORAL_LEVEL)||(pCtx->uiSpatialLayersInTemporal[d_idx]-1>= MAX_TEMPORAL_LEVEL))
-			{
-				ForceCodingIDR( pCtx );	// some logic error
-				return -1;
-			}
-
-			if ( pSvcParam->bEnableLongTermReference && pCtx->bLongTermRefFlag[d_idx][iCurTid] )
-			{	
-				SPicture *tmp	= pCtx->pSpatialPic[d_idx][pCtx->uiSpatialLayersInTemporal[d_idx]+pCtx->pVaa->uiMarkLongTermPicIdx];
-				pCtx->pSpatialPic[d_idx][pCtx->uiSpatialLayersInTemporal[d_idx]+pCtx->pVaa->uiMarkLongTermPicIdx] = pCtx->pSpatialPic[d_idx][iCurTid];
-				pCtx->pSpatialPic[d_idx][iCurTid] = pCtx->pSpatialPic[d_idx][pCtx->uiSpatialLayersInTemporal[d_idx]-1];
-				pCtx->pSpatialPic[d_idx][pCtx->uiSpatialLayersInTemporal[d_idx]-1] = tmp;
-				pCtx->bLongTermRefFlag[d_idx][iCurTid] = false;
-			}
-			else
-			{
-				WelsExchangeSpatialPictures( &pCtx->pSpatialPic[d_idx][pCtx->uiSpatialLayersInTemporal[d_idx]-1], &pCtx->pSpatialPic[d_idx][iCurTid] );
-			}
-		}
-
-		if ( pSvcParam->bEnableLongTermReference && ((pCtx->pLtr[pCtx->uiDependencyId].bLTRMarkingFlag && (pCtx->pLtr[pCtx->uiDependencyId].iLTRMarkMode == LTR_DIRECT_MARK)) || eFrameType == WELS_FRAME_TYPE_IDR)) 
-		{
-			pCtx->bLongTermRefFlag[d_idx][iCurTid] = true;
-		}
-	}
-
-#if defined(MT_ENABLED) && defined(MT_DEBUG)
-	TrackSliceConsumeTime( pCtx, did_list, iSpatialNum );
-#endif//MT_ENABLED && MT_DEBUG
-	
-#if defined(MT_ENABLED) && defined(DYNAMIC_SLICE_ASSIGN)
-	if ( pSvcParam->iMultipleThreadIdc > 1 && did_list[0] == BASE_DEPENDENCY_ID 
-		&& pSvcParam->sDependencyLayers[0].sMso.uiSliceMode == SM_FIXEDSLCNUM_SLICE && pSvcParam->iMultipleThreadIdc >= pSvcParam->sDependencyLayers[0].sMso.sSliceArgument.iSliceNum
-		&& pSvcParam->sDependencyLayers[did_list[iSpatialNum-1]].sMso.uiSliceMode == SM_FIXEDSLCNUM_SLICE && pSvcParam->iMultipleThreadIdc >= pSvcParam->sDependencyLayers[did_list[iSpatialNum-1]].sMso.sSliceArgument.iSliceNum )
-	{	
-		AdjustBaseLayer( pCtx );
-	}
-#endif//DYNAMIC_SLICE_ASSIGN
-
-#ifdef ENABLE_FRAME_DUMP
-	DumpRecFrame( fsnr, &pSvcParam->sDependencyLayers[pSvcParam->iNumDependencyLayer-1].sRecFileName[0] );	// pDecPic: final reconstruction output
-#endif//ENABLE_FRAME_DUMP
-	
-	++ pCtx->iCodingIndex;
-	pCtx->eLastNalPriority	= eNalRefIdc;
-	pFbi->iLayerNum			= iLayerNum;
-
-#if defined(X86_ASM)
-	WelsEmms();
-#endif //X86_ASM	
- 
-	return eFrameType;
-}
-
-/*!
- * \brief	Wels SVC encoder parameters adjustment
- *			SVC adjustment results in new requirement in memory blocks adjustment
- */
-int32_t WelsEncoderParamAdjust( sWelsEncCtx **ppCtx, SWelsSvcCodingParam *pNewParam )
-{
-	SWelsSvcCodingParam *pOldParam		= NULL;
-	int32_t iReturn = 0;
-	int8_t iIndexD= 0;
-	bool_t bNeedReset = false;
-
-	if ( NULL == ppCtx || NULL == *ppCtx || NULL == pNewParam )	return 1;
-	
-	/* Check validation in new parameters */
-	iReturn	= ParamValidationExt( pNewParam );
-	if ( iReturn != 0 )	return iReturn;
-
-	pOldParam	= (*ppCtx)->pSvcParam;
-
-	/* Decide whether need reset for IDR frame based on adjusting prarameters changed */
-	/* Temporal levels, spatial settings and/ or quality settings changed need update parameter sets related. */
-	bNeedReset	=	(pOldParam == NULL ) ||
-					(pOldParam->iNumTemporalLayer != pNewParam->iNumTemporalLayer) ||
-					(pOldParam->uiGopSize != pNewParam->uiGopSize) ||
-					(pOldParam->iNumDependencyLayer != pNewParam->iNumDependencyLayer) ||
-					(pOldParam->iDecompStages != pNewParam->iDecompStages) ||
-					(pOldParam->iActualPicWidth != pNewParam->iActualPicWidth || pOldParam->iActualPicHeight != pNewParam->iActualPicHeight) ||
-					(pOldParam->SUsedPicRect.iWidth != pNewParam->SUsedPicRect.iWidth || pOldParam->SUsedPicRect.iHeight != pNewParam->SUsedPicRect.iHeight) ||
-					(pOldParam->bEnableLongTermReference != pNewParam->bEnableLongTermReference);
-	if ( !bNeedReset ){	// Check its picture resolutions/quality settings respectively in each dependency layer
-		iIndexD = 0;
-		assert( pOldParam->iNumDependencyLayer == pNewParam->iNumDependencyLayer );
-		do 
-		{
-			const SDLayerParam *kpOldDlp	= &pOldParam->sDependencyLayers[iIndexD];
-			const SDLayerParam *kpNewDlp	= &pNewParam->sDependencyLayers[iIndexD];
-			float fT1 = .0f;
-			float fT2 = .0f;
-
-			// check frame size settings
-			if ( kpOldDlp->iFrameWidth != kpNewDlp->iFrameWidth ||
-				 kpOldDlp->iFrameHeight != kpNewDlp->iFrameHeight ||
-				 kpOldDlp->iActualWidth != kpNewDlp->iActualWidth ||
-				 kpOldDlp->iActualHeight != kpNewDlp->iActualHeight ){
-				bNeedReset	= true;
-				break;
-			}
-
-			if ( kpOldDlp->sMso.uiSliceMode != kpNewDlp->sMso.uiSliceMode ||				 
-				 kpOldDlp->sMso.sSliceArgument.iSliceNum != kpNewDlp->sMso.sSliceArgument.iSliceNum )
-			{
-				bNeedReset	= true;
-				break;
-			}
-
-			// check frame rate
-			// we can not check whether corresponding fFrameRate is equal or not, 
-			// only need to check d_max/d_min and max_fr/d_max whether it is equal or not
-			if ( kpNewDlp->fInputFrameRate > EPSN && kpOldDlp->fInputFrameRate > EPSN )
-				fT1 = kpNewDlp->fOutputFrameRate/kpNewDlp->fInputFrameRate - kpOldDlp->fOutputFrameRate/kpOldDlp->fInputFrameRate;
-			if ( kpNewDlp->fOutputFrameRate > EPSN && kpOldDlp->fOutputFrameRate > EPSN )
-				fT2 = pNewParam->fMaxFrameRate/kpNewDlp->fOutputFrameRate - pOldParam->fMaxFrameRate/kpOldDlp->fOutputFrameRate;
-			if ( fT1 > EPSN || fT1 < -EPSN || fT2 > EPSN || fT2 < -EPSN )
-			{
-				bNeedReset = true;
-				break;
-			}
-
-			if ( kpOldDlp->iHighestTemporalId != kpNewDlp->iHighestTemporalId )
-			{
-				bNeedReset = true;
-				break;
-			}			
-
-			++ iIndexD;
-		} while (iIndexD < pOldParam->iNumDependencyLayer);		
-	}
-
-	if ( bNeedReset ){
-		SParaSetOffsetVariable sTmpPsoVariable[PARA_SET_TYPE];
-		uint16_t	          uiTmpIdrPicId;//this is for LTR!
-		memcpy( sTmpPsoVariable, (*ppCtx)->sPSOVector.sParaSetOffsetVariable, (PARA_SET_TYPE)*sizeof(SParaSetOffsetVariable)  );// confirmed_safe_unsafe_usage
-		uiTmpIdrPicId = (*ppCtx)->sPSOVector.uiIdrPicId;
-
-		WelsUninitEncoderExt( ppCtx );
-
-		/* Update new parameters */
-		if ( WelsInitEncoderExt( ppCtx, pNewParam ) )
-			return 1;		
-
-		// reset the scaled spatial picture size 
-		(*ppCtx)->pVpp->WelsPreprocessReset(*ppCtx);
-		//if WelsInitEncoderExt succeed
-
-		//for FLEXIBLE_PARASET_ID
-		memcpy( (*ppCtx)->sPSOVector.sParaSetOffsetVariable, sTmpPsoVariable, (PARA_SET_TYPE)*sizeof(SParaSetOffsetVariable)  );// confirmed_safe_unsafe_usage
-		(*ppCtx)->sPSOVector.uiIdrPicId = uiTmpIdrPicId;
-	}
-	else{
-		/* maybe adjustment introduced in bitrate or little settings adjustment and so on.. */		
-		pNewParam->iNumRefFrame								= WELS_CLIP3(pNewParam->iNumRefFrame, MIN_REF_PIC_COUNT, MAX_REFERENCE_PICTURE_COUNT_NUM);
-		pNewParam->iLoopFilterDisableIdc					= WELS_CLIP3(pNewParam->iLoopFilterDisableIdc, 0, 6);
-		pNewParam->iLoopFilterAlphaC0Offset				= WELS_CLIP3(pNewParam->iLoopFilterAlphaC0Offset, -6, 6);
-		pNewParam->iLoopFilterBetaOffset					= WELS_CLIP3(pNewParam->iLoopFilterBetaOffset, -6, 6);
-		pNewParam->iInterLayerLoopFilterDisableIdc		= WELS_CLIP3(pNewParam->iInterLayerLoopFilterDisableIdc, 0, 6);
-		pNewParam->iInterLayerLoopFilterAlphaC0Offset	= WELS_CLIP3(pNewParam->iInterLayerLoopFilterAlphaC0Offset, -6, 6);
-		pNewParam->iInterLayerLoopFilterBetaOffset		= WELS_CLIP3(pNewParam->iInterLayerLoopFilterBetaOffset, -6, 6);
-		pNewParam->fMaxFrameRate							= WELS_CLIP3(pNewParam->fMaxFrameRate, MIN_FRAME_RATE, MAX_FRAME_RATE);
-
-		// we can not use direct struct based memcpy due some fields need keep unchanged as before
-		pOldParam->fMaxFrameRate	= pNewParam->fMaxFrameRate;		// maximal frame rate [Hz / fps]
-		pOldParam->iInputCsp			= pNewParam->iInputCsp;			// color space of input sequence	
-		pOldParam->uiIntraPeriod		= pNewParam->uiIntraPeriod;		// intra period (multiple of GOP size as desired)
-		pOldParam->bEnableSpsPpsIdAddition = pNewParam->bEnableSpsPpsIdAddition;
-		pOldParam->bPrefixNalAddingCtrl = pNewParam->bPrefixNalAddingCtrl;
-		pOldParam->iNumRefFrame		= pNewParam->iNumRefFrame;		// number of reference frame used
-
-		/* denoise control */
-		pOldParam->bEnableDenoise	= pNewParam->bEnableDenoise;
-
-		/* background detection control */
-		pOldParam->bEnableBackgroundDetection		= pNewParam->bEnableBackgroundDetection;
-
-		/* adaptive quantization control */
-		pOldParam->bEnableAdaptiveQuant	= pNewParam->bEnableAdaptiveQuant;
-
-		/* int32_t term reference control */
-		pOldParam->bEnableLongTermReference	= pNewParam->bEnableLongTermReference;	   
-		pOldParam->uiLtrMarkPeriod	= pNewParam->uiLtrMarkPeriod;	
-
-		// keep below values unchanged as before
-		pOldParam->bEnableSSEI		= pNewParam->bEnableSSEI;
-		pOldParam->bEnableFrameCroppingFlag	= pNewParam->bEnableFrameCroppingFlag;	// enable frame cropping flag
-
-		/* Motion search */
-		
-		/* Deblocking loop filter */
-		pOldParam->iLoopFilterDisableIdc	= pNewParam->iLoopFilterDisableIdc;	// 0: on, 1: off, 2: on except for slice boundaries
-		pOldParam->iLoopFilterAlphaC0Offset	= pNewParam->iLoopFilterAlphaC0Offset;// AlphaOffset: valid range [-6, 6], default 0
-		pOldParam->iLoopFilterBetaOffset		= pNewParam->iLoopFilterBetaOffset;	// BetaOffset:	valid range [-6, 6], default 0
-		pOldParam->iInterLayerLoopFilterDisableIdc	= pNewParam->iInterLayerLoopFilterDisableIdc; // Employed based upon inter-layer, same comment as above
-		pOldParam->iInterLayerLoopFilterAlphaC0Offset	= pNewParam->iInterLayerLoopFilterAlphaC0Offset;	// InterLayerLoopFilterAlphaC0Offset
-		pOldParam->iInterLayerLoopFilterBetaOffset		= pNewParam->iInterLayerLoopFilterBetaOffset;	// InterLayerLoopFilterBetaOffset
-		
-		/* Rate Control */
-		pOldParam->bEnableRc			= pNewParam->bEnableRc;	
-		pOldParam->iRCMode	    	= pNewParam->iRCMode;	
-		pOldParam->iTargetBitrate	= pNewParam->iTargetBitrate;			// overall target bitrate introduced in RC module
-		pOldParam->iPaddingFlag	    = pNewParam->iPaddingFlag;
-		
-		/* Layer definition */
-		pOldParam->bPrefixNalAddingCtrl	= pNewParam->bPrefixNalAddingCtrl;
-
-		// d
-		iIndexD = 0;
-		do 
-		{
-			SDLayerParam *pOldDlp	= &pOldParam->sDependencyLayers[iIndexD];
-			SDLayerParam *pNewDlp	= &pNewParam->sDependencyLayers[iIndexD];
-
-			pOldDlp->fInputFrameRate	= pNewDlp->fInputFrameRate;	// input frame rate
-			pOldDlp->fOutputFrameRate	= pNewDlp->fOutputFrameRate;	// output frame rate
-			pOldDlp->iSpatialBitrate	= pNewDlp->iSpatialBitrate;
-			
-			pOldDlp->uiProfileIdc		= pNewDlp->uiProfileIdc;			// value of profile IDC (0 for auto-detection)
-
-			/* Derived variants below */
-			pOldDlp->iTemporalResolution	= pNewDlp->iTemporalResolution;
-			pOldDlp->iDecompositionStages	= pNewDlp->iDecompositionStages;			
-			
-			memcpy(pOldDlp->uiCodingIdx2TemporalId, pNewDlp->uiCodingIdx2TemporalId, sizeof(pOldDlp->uiCodingIdx2TemporalId));	// confirmed_safe_unsafe_usage
-
-			++ iIndexD;
-		} while (iIndexD < pOldParam->iNumDependencyLayer);		
-	}
-
-	/* Any else initialization/reset for rate control here? */
-	
-	return 0;
-}
-
-
-int32_t WelsCodeOnePicPartition(	sWelsEncCtx *pCtx,
-									SLayerBSInfo *pLayerBsInfo,
-									int32_t *pNalIdxInLayer,									
-									int32_t* pLayerSize,
-									int32_t iFirstMbInPartition,	// first mb inclusive in partition
-									int32_t iEndMbInPartition,	// end mb exclusive in partition
-									int32_t iStartSliceIdx
-								  )
-{
-
-	SDqLayer * pCurLayer			= pCtx->pCurDqLayer;
-	SSliceCtx * pSliceCtx		= pCurLayer->pSliceEncCtx;	
-	int32_t iNalLen[MAX_NAL_UNITS_IN_LAYER]			= {0};
-	int32_t iNalIdxInLayer		= *pNalIdxInLayer;
-	int32_t iSliceIdx				= iStartSliceIdx;
-	const int32_t kiSliceStep		= pCtx->iActiveThreadsNum;
-	const int32_t kiPartitionId		= iStartSliceIdx % kiSliceStep;
-	int32_t iPartitionBsSize		= 0;
-	int32_t iAnyMbLeftInPartition= iEndMbInPartition - iFirstMbInPartition;
-	const EWelsNalUnitType keNalType	= pCtx->eNalType;
-	const EWelsNalRefIdc keNalRefIdc	= pCtx->eNalPriority;
-	const bool_t kbNeedPrefix		= pCtx->bNeedPrefixNalFlag;
-
-	//init
-	{
-		pSliceCtx->pFirstMbInSlice[iSliceIdx]		= iFirstMbInPartition;	
-		pCurLayer->pNumSliceCodedOfPartition[kiPartitionId]	= 1;	// one slice per partition intialized, dynamic slicing inside
-		pCurLayer->pLastMbIdxOfPartition[kiPartitionId]		= iEndMbInPartition-1;
-	}
-	pCurLayer->pLastCodedMbIdxOfPartition[kiPartitionId] = 0;
-
-	while ( iAnyMbLeftInPartition > 0 )
-	{
-		int32_t iSliceSize	= 0;
-
-		if ( iSliceIdx >= pSliceCtx->iMaxSliceNumConstraint )	// insufficient memory in pSliceInLayer[]
-		{
-			// TODO: need exception handler for not large enough of MAX_SLICES_NUM related memory usage
-			// No idea about its solution due MAX_SLICES_NUM is fixed lenght in relevent pData structure
-			return 1;
-		}
-		
-		if ( kbNeedPrefix )
-		{
-			iPartitionBsSize += AddPrefixNal( pCtx, pLayerBsInfo, &iNalLen[0], &iNalIdxInLayer, keNalType, keNalRefIdc );
-		}
-
-		WelsLoadNal( pCtx->pOut, keNalType, keNalRefIdc );
-		WelsCodeOneSlice( pCtx, iSliceIdx, keNalType );
-		WelsUnloadNal( pCtx->pOut );
-
-		iSliceSize = WelsEncodeNalExt(	&pCtx->pOut->sNalList[pCtx->pOut->iNalIndex-1],
-			&pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt,
-			pCtx->pFrameBs + pCtx->iPosBsBuffer,
-			&iNalLen[iNalIdxInLayer]	);
-		pCtx->iPosBsBuffer	+= iSliceSize;
-		iPartitionBsSize	+= iSliceSize;
-		pLayerBsInfo->iNalLengthInByte[iNalIdxInLayer]	= iSliceSize;		
-
-#if defined(SLICE_INFO_OUTPUT)
-		fprintf(	stderr,
-			"@slice=%-6d sliceType:%c idc:%d size:%-6d\n",
-			iSliceIdx,
-			(pCtx->eSliceType == P_SLICE ? 'P' : 'I'),
-			eNalRefIdc,
-			iSliceSize	);
-#endif//SLICE_INFO_OUTPUT
-
-		++ iNalIdxInLayer;
-		iSliceIdx += kiSliceStep;	//if uiSliceIdx is not continuous
-		iAnyMbLeftInPartition = iEndMbInPartition - (1 + pCurLayer->pLastCodedMbIdxOfPartition[kiPartitionId]);		
-	}
-
-	*pLayerSize			= iPartitionBsSize;
-	*pNalIdxInLayer	= iNalIdxInLayer;
-
-	// slice based packing???
-	pLayerBsInfo->uiLayerType		= VIDEO_CODING_LAYER;
-	pLayerBsInfo->uiSpatialId		= pCtx->uiDependencyId;
-	pLayerBsInfo->uiTemporalId	= pCtx->uiTemporalId;
-	pLayerBsInfo->uiQualityId		= 0;
-	pLayerBsInfo->uiPriorityId	= 0;
-	pLayerBsInfo->iNalCount		= iNalIdxInLayer;
-
-	return 0;
-}
-} // namespace WelsSVCEnc
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	encoder_ext.c
+ *
+ * \brief	core encoder for SVC
+ *
+ * \date	7/24/2009 Created
+ *
+ *************************************************************************************
+ */
+#include <string.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include "encoder.h"
+#include "extern.h"
+#include "encoder_context.h"
+#include "typedefs.h"
+#include "wels_const.h"
+#include "wels_common_basis.h"
+#include "codec_def.h"
+#include "param_svc.h"
+#include "cpu_core.h"
+#include "cpu.h"
+#include "utils.h"
+#include "svc_enc_frame.h"
+#include "svc_enc_golomb.h"
+#include "svc_enc_slice_segment.h"
+#include "au_set.h"
+#include "picture_handle.h"
+#include "codec_app_def.h"
+#include "svc_base_layer_md.h"
+#include "svc_encode_slice.h"
+#include "decode_mb_aux.h"
+#include "deblocking.h"
+#include "rc.h"
+#include "ref_list_mgr_svc.h"
+#include "md.h"
+#include "ls_defines.h"
+#include "set_mb_syn_cavlc.h"
+#include "crt_util_safe_x.h"	// Safe CRT routines like utils for cross platforms
+#include "array_stack_align.h"
+// for MT, 4/22/2010
+#include "slice_multi_threading.h"
+#if defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG)
+#include "measure_time.h"
+#endif//DYNAMIC_SLICE_ASSIGN
+
+namespace WelsSVCEnc {
+
+
+int32_t WelsCodeOnePicPartition (sWelsEncCtx* pCtx,
+                                 SLayerBSInfo* pLbi,
+                                 int32_t* pNalIdxInLayer,
+                                 int32_t* pLayerSize,
+                                 int32_t iFirstMbInPartition,	// index of the first MB of the partition (inclusive)
+                                 int32_t iEndMbInPartition,	// index one past the last MB of the partition (exclusive)
+                                 int32_t iStartSliceIdx
+                                );
+
+
+/*!
+ * \brief	validate the frame rate settings of every dependency layer
+ *
+ * Per layer the following is required:
+ *   - input and output frame rate are both outside [-0.000001, 0.000001]
+ *   - the output frame rate does not exceed the input frame rate
+ *   - GetLogFactor (output, input) does not yield UINT_MAX
+ * Afterwards pCfg->fMaxFrameRate is replaced by the largest layer input frame
+ * rate if the two differ by more than 0.000001.
+ *
+ * \param	pCfg	coding parameters to check; must not be NULL (assert only)
+ * \return	0 - successful; 1 - one of the per-layer checks failed
+ */
+int32_t ParamValidation (SWelsSvcCodingParam* pCfg) {
+  float fMaxFrameRate = 0.0f;
+  const float fEpsn = 0.000001f;
+  int32_t i = 0;
+
+  assert (pCfg != NULL);
+
+  // pass 1: every layer needs usable, mutually consistent frame rates
+  for (i = 0; i < pCfg->iNumDependencyLayer; ++ i) {
+    SDLayerParam* fDlp = &pCfg->sDependencyLayers[i];
+    // output must not exceed input, and neither rate may lie within +/-fEpsn of zero
+    if (fDlp->fOutputFrameRate > fDlp->fInputFrameRate || (fDlp->fInputFrameRate >= -fEpsn
+        && fDlp->fInputFrameRate <= fEpsn)
+        || (fDlp->fOutputFrameRate >= -fEpsn && fDlp->fOutputFrameRate <= fEpsn)) {
+#if defined (_DEBUG)
+      fprintf (stderr, "Invalid settings in input frame rate(%.6f) or output frame rate(%.6f) of layer #%d config file..\n",
+               fDlp->fInputFrameRate, fDlp->fOutputFrameRate, i);
+#endif
+      return 1;
+    }
+    // UINT_MAX from GetLogFactor() is treated as an unsupported output/input ratio
+    if (UINT_MAX == GetLogFactor (fDlp->fOutputFrameRate, fDlp->fInputFrameRate)) {
+#if defined (_DEBUG)
+      fprintf (stderr,
+               "Invalid settings in input frame rate(%.6f) and output frame rate(%.6f) of layer #%d config file: iResult of output frame rate divided by input frame rate should be power of 2(i.e,in/pOut=2^n)..\n",
+               fDlp->fInputFrameRate, fDlp->fOutputFrameRate, i);
+#endif
+      return 1;
+    }
+  }
+
+  // pass 2: largest input frame rate over all layers
+  for (i = 0; i < pCfg->iNumDependencyLayer; ++ i) {
+    SDLayerParam* fDlp = &pCfg->sDependencyLayers[i];
+    if (fDlp->fInputFrameRate > fMaxFrameRate) {
+      fMaxFrameRate	= fDlp->fInputFrameRate;
+    }
+  }
+
+  // keep the configured maximum in line with the layers; differences within
+  // +/-fEpsn are regarded as equal and leave pCfg->fMaxFrameRate untouched
+  if (fMaxFrameRate > fEpsn && (fMaxFrameRate - pCfg->fMaxFrameRate > fEpsn
+                                || fMaxFrameRate - pCfg->fMaxFrameRate < -fEpsn)) {
+    pCfg->fMaxFrameRate	= fMaxFrameRate;
+  }
+
+  return 0;
+}
+
+/*!
+ * \brief	validate layer count, temporal layer count, GOP size, intra period and per-layer picture size / slice settings
+ * \param	pParam	SWelsSvcCodingParam* passed as void*; bDeblockingParallelFlag and each layer's slice mode/arguments may be rewritten
+ * \return	0 - successful; 1 - invalid settings; otherwise the result of ParamValidation()
+ */
+int32_t ParamValidationExt (void* pParam) {
+  SWelsSvcCodingParam* pCodingParam = (SWelsSvcCodingParam*)pParam;
+  int8_t i = 0;
+  int32_t iIdx = 0;
+
+  assert (pCodingParam != NULL);
+  if (NULL == pCodingParam)
+    return 1;
+
+  if (pCodingParam->iNumDependencyLayer < 1 || pCodingParam->iNumDependencyLayer > MAX_DEPENDENCY_LAYER) {
+#if defined (_DEBUG)
+    fprintf (stderr, "ParamValidationExt(), monitor invalid pCodingParam->iNumDependencyLayer: %d!\n",
+             pCodingParam->iNumDependencyLayer);
+#endif//#if _DEBUG
+    return 1;
+  }
+
+  if (pCodingParam->iNumTemporalLayer < 1 || pCodingParam->iNumTemporalLayer > MAX_TEMPORAL_LEVEL) {
+#if defined (_DEBUG)
+    fprintf (stderr, "ParamValidationExt(), monitor invalid pCodingParam->iNumTemporalLayer: %d!\n",
+             pCodingParam->iNumTemporalLayer);
+#endif//#if _DEBUG
+    return 1;
+  }
+
+  if (pCodingParam->uiGopSize < 1 || pCodingParam->uiGopSize > MAX_GOP_SIZE) {
+#if defined (_DEBUG)
+    fprintf (stderr, "ParamValidationExt(), monitor invalid pCodingParam->uiGopSize: %d!\n", pCodingParam->uiGopSize);
+#endif//#if _DEBUG
+    return 1;
+  }
+
+  if (pCodingParam->uiIntraPeriod && pCodingParam->uiIntraPeriod < pCodingParam->uiGopSize) {
+#if defined (_DEBUG)
+    fprintf (stderr,
+             "ParamValidationExt(), uiIntraPeriod(%d) should be not less than that of uiGopSize(%d) or -1 specified!\n",
+             pCodingParam->uiIntraPeriod, pCodingParam->uiGopSize);
+#endif//#if _DEBUG
+    return 1;
+  }
+
+  if (pCodingParam->uiIntraPeriod && (pCodingParam->uiIntraPeriod & (pCodingParam->uiGopSize - 1)) != 0) {
+#if defined (_DEBUG)
+    fprintf (stderr, "ParamValidationExt(), uiIntraPeriod(%d) should be multiple of uiGopSize(%d) or -1 specified!\n",
+             pCodingParam->uiIntraPeriod, pCodingParam->uiGopSize);
+#endif//#if _DEBUG
+    return 1;
+  }
+
+#ifdef MT_ENABLED
+  //about iMultipleThreadIdc, bDeblockingParallelFlag, iLoopFilterDisableIdc, & uiSliceMode
+  // (1) Single Thread
+  //	if (THREAD==1)//single thread
+  //		no parallel_deblocking: bDeblockingParallelFlag = 0;
+  // (2) Multi Thread: see uiSliceMode decision
+  if (pCodingParam->iMultipleThreadIdc == 1) {
+    //now is single thread. no parallel deblocking, set flag=0
+    pCodingParam->bDeblockingParallelFlag = false;
+  } else {
+    pCodingParam->bDeblockingParallelFlag = true;
+  }
+#else
+  pCodingParam->bDeblockingParallelFlag	= false;
+#endif//MT_ENABLED
+
+  for (i = 0; i < pCodingParam->iNumDependencyLayer; ++ i) {
+    SDLayerParam* fDlp = &pCodingParam->sDependencyLayers[i];
+    const int32_t kiPicWidth = fDlp->iFrameWidth;
+    const int32_t kiPicHeight = fDlp->iFrameHeight;
+    int32_t iMbWidth		= 0;
+    int32_t iMbHeight		= 0;
+    int32_t iMbNumInFrame		= 0;
+    int32_t iMaxSliceNum		= MAX_SLICES_NUM;
+    if (kiPicWidth <= 0 || kiPicHeight <= 0) {
+#if defined (_DEBUG)
+      fprintf (stderr, "ParamValidationExt(), invalid %d x %d in dependency layer settings!\n", kiPicWidth, kiPicHeight);
+#endif//#if _DEBUG
+      return 1;
+    }
+    if ((kiPicWidth & 0x0F) != 0 || (kiPicHeight & 0x0F) != 0) {
+#if defined (_DEBUG)
+      fprintf (stderr,
+               "ParamValidationExt(), in layer #%d iWidth x iHeight(%d x %d) both should be multiple of 16, can not support with arbitrary size currently!\n",
+               i, kiPicWidth, kiPicHeight);
+#endif//#if _DEBUG
+      return 1;
+    }
+
+    if (fDlp->sMso.uiSliceMode >= SM_RESERVED) {
+#if defined (_DEBUG)
+      fprintf (stderr, "ParamValidationExt(), invalid uiSliceMode (%d) settings!\n", fDlp->sMso.uiSliceMode);
+#endif//#if _DEBUG
+      return 1;
+    }
+
+    //check slice settings under multi-slice
+    if (kiPicWidth <= 16 && kiPicHeight <= 16) {
+      //only have one MB, set to single_slice
+      fDlp->sMso.uiSliceMode = SM_SINGLE_SLICE;
+    }
+    switch (fDlp->sMso.uiSliceMode) {
+    case SM_SINGLE_SLICE:
+      // exactly one slice, as in every other fallback to SM_SINGLE_SLICE below (was overwritten with 0)
+      fDlp->sMso.sSliceArgument.iSliceNum = 1;
+      fDlp->sMso.sSliceArgument.uiSliceSizeConstraint = 0;
+      for (iIdx = 0; iIdx < MAX_SLICES_NUM; iIdx++) {
+        fDlp->sMso.sSliceArgument.uiSliceMbNum[iIdx] = 0;
+      }
+      break;
+    case SM_FIXEDSLCNUM_SLICE: {
+      fDlp->sMso.sSliceArgument.uiSliceSizeConstraint = 0;
+
+      iMbWidth	= (kiPicWidth + 15) >> 4;
+      iMbHeight	= (kiPicHeight + 15) >> 4;
+      iMbNumInFrame = iMbWidth * iMbHeight;
+      iMaxSliceNum = MAX_SLICES_NUM;
+      if (fDlp->sMso.sSliceArgument.iSliceNum <= 0
+          || fDlp->sMso.sSliceArgument.iSliceNum > iMaxSliceNum) {
+#if defined (_DEBUG)
+        fprintf (stderr, "ParamValidationExt(), invalid uiSliceNum (%d) settings!\n", fDlp->sMso.sSliceArgument.iSliceNum);
+#endif//#if _DEBUG
+        return 1;
+      }
+      if (fDlp->sMso.sSliceArgument.iSliceNum == 1) {
+#if defined (_DEBUG)
+        fprintf (stderr,
+                 "ParamValidationExt(), uiSliceNum(%d) you set for SM_FIXEDSLCNUM_SLICE, now turn to SM_SINGLE_SLICE type!\n",
+                 fDlp->sMso.sSliceArgument.iSliceNum);
+#endif//#if _DEBUG
+        fDlp->sMso.uiSliceMode	= SM_SINGLE_SLICE;
+        break;
+      }
+      if (pCodingParam->bEnableRc) {	// multiple slices verify with gom
+        //check uiSliceNum
+        GomValidCheckSliceNum (iMbWidth, iMbHeight, (int32_t*)&fDlp->sMso.sSliceArgument.iSliceNum);
+        assert (fDlp->sMso.sSliceArgument.iSliceNum > 1);
+        //set uiSliceMbNum with current uiSliceNum
+        GomValidCheckSliceMbNum (iMbWidth, iMbHeight, &fDlp->sMso.sSliceArgument);
+      } else if (!CheckFixedSliceNumMultiSliceSetting (iMbNumInFrame,
+                 &fDlp->sMso.sSliceArgument)) {	// verify interleave mode settings
+        //check uiSliceMbNum with current uiSliceNum
+#if defined (_DEBUG)
+        fprintf (stderr, "ParamValidationExt(), invalid uiSliceMbNum (%d) settings!\n",
+                 fDlp->sMso.sSliceArgument.uiSliceMbNum[0]);
+#endif//#if _DEBUG
+        return 1;
+      }
+      // considering coding efficiency and performance, frames with at most MIN_NUM_MB_PER_SLICE MBs fall back to a single slice
+      if (iMbNumInFrame <= MIN_NUM_MB_PER_SLICE) {
+        fDlp->sMso.uiSliceMode	= SM_SINGLE_SLICE;
+        fDlp->sMso.sSliceArgument.iSliceNum	= 1;
+        break;
+      }
+    }
+    break;
+    case SM_RASTER_SLICE: {
+      fDlp->sMso.sSliceArgument.uiSliceSizeConstraint = 0;
+
+      iMbWidth	= (kiPicWidth + 15) >> 4;
+      iMbHeight	= (kiPicHeight + 15) >> 4;
+      iMbNumInFrame = iMbWidth * iMbHeight;
+      iMaxSliceNum = MAX_SLICES_NUM;
+      if (fDlp->sMso.sSliceArgument.uiSliceMbNum[0] <= 0) {
+#if defined (_DEBUG)
+        fprintf (stderr, "ParamValidationExt(), invalid uiSliceMbNum (%d) settings!\n",
+                 fDlp->sMso.sSliceArgument.uiSliceMbNum[0]);
+#endif//#if _DEBUG
+        return 1;
+      }
+
+      if (!CheckRasterMultiSliceSetting (iMbNumInFrame, &fDlp->sMso.sSliceArgument)) {	// verify interleave mode settings
+#if defined (_DEBUG)
+        fprintf (stderr, "ParamValidationExt(), invalid uiSliceMbNum (%d) settings!\n",
+                 fDlp->sMso.sSliceArgument.uiSliceMbNum[0]);
+#endif//#if _DEBUG
+        return 1;
+      }
+      if (fDlp->sMso.sSliceArgument.iSliceNum <= 0
+          || fDlp->sMso.sSliceArgument.iSliceNum > iMaxSliceNum) {	// verify interleave mode settings
+#if defined (_DEBUG)
+        fprintf (stderr, "ParamValidationExt(), invalid uiSliceNum (%d) in SM_RASTER_SLICE settings!\n",
+                 fDlp->sMso.sSliceArgument.iSliceNum);
+#endif//#if _DEBUG
+        return 1;
+      }
+      if (fDlp->sMso.sSliceArgument.iSliceNum == 1) {
+#if defined (_DEBUG)
+        fprintf (stderr, "ParamValidationExt(), pSlice setting for SM_RASTER_SLICE now turn to SM_SINGLE_SLICE!\n");
+#endif//#if _DEBUG
+        fDlp->sMso.uiSliceMode	= SM_SINGLE_SLICE;
+        break;
+      }
+#ifdef MT_ENABLED
+      if (pCodingParam->bEnableRc && fDlp->sMso.sSliceArgument.iSliceNum > 1) {
+#if defined (_DEBUG)
+        fprintf (stderr, "ParamValidationExt(), WARNING: GOM based RC do not support SM_RASTER_SLICE!\n");
+#endif//#if _DEBUG
+      }
+#endif
+      // considering coding efficiency and performance, frames with at most MIN_NUM_MB_PER_SLICE MBs fall back to a single slice
+      if (iMbNumInFrame <= MIN_NUM_MB_PER_SLICE) {
+        fDlp->sMso.uiSliceMode	= SM_SINGLE_SLICE;
+        fDlp->sMso.sSliceArgument.iSliceNum	= 1;
+        break;
+      }
+    }
+    break;
+    case SM_ROWMB_SLICE: {
+      fDlp->sMso.sSliceArgument.uiSliceSizeConstraint = 0;
+
+      iMbWidth	= (kiPicWidth + 15) >> 4;
+      iMbHeight	= (kiPicHeight + 15) >> 4;
+      iMaxSliceNum = MAX_SLICES_NUM;
+      if (iMbHeight > iMaxSliceNum) {
+#if defined (_DEBUG)
+        fprintf (stderr, "ParamValidationExt(), invalid uiSliceNum (%d) settings more than MAX!\n", iMbHeight);
+#endif//#if _DEBUG
+        return 1;
+      }
+      fDlp->sMso.sSliceArgument.iSliceNum	= iMbHeight;
+
+      if (fDlp->sMso.sSliceArgument.iSliceNum <= 0) {
+#if defined (_DEBUG)
+        fprintf (stderr, "ParamValidationExt(), invalid uiSliceNum (%d) settings!\n", fDlp->sMso.sSliceArgument.iSliceNum);
+#endif//#if _DEBUG
+        return 1;
+      }
+      if (!CheckRowMbMultiSliceSetting (iMbWidth, &fDlp->sMso.sSliceArgument)) {	// verify interleave mode settings
+#if defined (_DEBUG)
+        fprintf (stderr, "ParamValidationExt(), invalid uiSliceMbNum (%d) settings!\n",
+                 fDlp->sMso.sSliceArgument.uiSliceMbNum[0]);
+#endif//#if _DEBUG
+        return 1;
+      }
+    }
+    break;
+    case SM_DYN_SLICE: {
+      iMbWidth	= (kiPicWidth + 15) >> 4;
+      iMbHeight	= (kiPicHeight + 15) >> 4;
+      if (fDlp->sMso.sSliceArgument.uiSliceSizeConstraint <= 0) {
+#if defined (_DEBUG)
+        fprintf (stderr, "ParamValidationExt(), invalid iSliceSize (%d) settings!\n",
+                 fDlp->sMso.sSliceArgument.uiSliceSizeConstraint);
+#endif//#if _DEBUG
+        return 1;
+      }
+      // considering coding efficiency and performance, frames with at most MIN_NUM_MB_PER_SLICE MBs fall back to a single slice
+      if (iMbWidth * iMbHeight <= MIN_NUM_MB_PER_SLICE) {
+        fDlp->sMso.uiSliceMode	= SM_SINGLE_SLICE;
+        fDlp->sMso.sSliceArgument.iSliceNum	= 1;
+        break;
+      }
+    }
+    break;
+    default: {
+#if defined (_DEBUG)
+      fprintf (stderr, "ParamValidationExt(), invalid uiSliceMode (%d) settings!\n",
+               fDlp->sMso.uiSliceMode);	// report the layer being checked, not layer 0
+#endif//#if _DEBUG
+      return 1;
+    }
+    break;
+    }
+  }
+
+  return ParamValidation (pCodingParam);
+}
+
+/*!
+ * \brief	count the layers and NAL units implied by the configured dependency layers
+ * \param	ppCtx			sWelsEncCtx** (used for logging); neither it nor *ppCtx may be NULL
+ * \param	pParam			SWelsSvcCodingParam* the layer settings are read from
+ * \param	pCountLayers	receives the number of layers, may be NULL
+ * \param	pCountNals		receives the number of NALs, may be NULL
+ * \return	0 - successful; 1 - NULL input or a count exceeds its compile-time limit
+ */
+static inline int32_t AcquireLayersNals (sWelsEncCtx** ppCtx, SWelsSvcCodingParam* pParam, int32_t* pCountLayers,
+    int32_t* pCountNals) {
+  int32_t iCountNumLayers		= 0;
+  int32_t iCountNumNals			= 0;
+  int32_t iNumDependencyLayers	= 0;
+  int32_t iDIndex 				= 0;
+#if defined(MT_ENABLED) && defined(PACKING_ONE_SLICE_PER_LAYER)
+  int32_t iNumLayersPack = 0;
+#endif//MT_ENABLED && PACKING_ONE_SLICE_PER_LAYER
+
+  if (NULL == pParam || NULL == ppCtx || NULL == *ppCtx)
+    return 1;
+
+  iNumDependencyLayers	= pParam->iNumDependencyLayer;
+
+  do {
+    SDLayerParam* pDLayer = &pParam->sDependencyLayers[iDIndex];
+//		pDLayer->ptr_cfg = pParam;
+    int32_t iOrgNumNals = iCountNumNals;
+
+    //Note: Sep. 2010
+    //Review this part and suggest no change, since the memory over-use
+    //(1) counts little to the overall performance
+    //(2) should not be critical even under mobile case
+    if (SM_DYN_SLICE == pDLayer->sMso.uiSliceMode) {
+      iCountNumNals += MAX_SLICES_NUM;
+      // plus prefix NALs
+      if (iDIndex == 0)
+        iCountNumNals += MAX_SLICES_NUM;
+      // MAX_SLICES_NUM < MAX_LAYER_NUM_OF_FRAME ensured at svc_enc_slice_segment.h
+#if defined(MT_ENABLED) && defined(PACKING_ONE_SLICE_PER_LAYER)
+      assert (MAX_SLICES_NUM < MAX_LAYER_NUM_OF_FRAME);
+      // iNumLayersPack += MAX_SLICES_NUM; // do not count it for dynamic slicing mode
+#else//!MT_ENABLED || !PACKING_ONE_SLICE_PER_LAYER
+      assert (iCountNumNals - iOrgNumNals <= MAX_NAL_UNITS_IN_LAYER);
+#endif//MT_ENABLED && PACKING_ONE_SLICE_PER_LAYER
+    } else { /*if ( SM_SINGLE_SLICE != pDLayer->sMso.uiSliceMode )*/
+      const int32_t kiNumOfSlice = GetInitialSliceNum ((pDLayer->iFrameWidth + 0x0f) >> 4,
+                                   (pDLayer->iFrameHeight + 0x0f) >> 4,
+                                   &pDLayer->sMso);
+
+      // NEED check iCountNals value in case multiple slices is used
+      iCountNumNals += kiNumOfSlice; // for slice VCL NALs
+      // plus prefix NALs
+      if (iDIndex == 0)
+        iCountNumNals += kiNumOfSlice;
+#if defined(MT_ENABLED) && defined(PACKING_ONE_SLICE_PER_LAYER)
+      assert (kiNumOfSlice <= MAX_SLICES_NUM && MAX_SLICES_NUM < MAX_LAYER_NUM_OF_FRAME);
+      iNumLayersPack += kiNumOfSlice;	// one packed layer per slice; uses the local declared above
+#else//!MT_ENABLED || !PACKING_ONE_SLICE_PER_LAYER
+      assert (iCountNumNals - iOrgNumNals <= MAX_NAL_UNITS_IN_LAYER);
+#endif//MT_ENABLED && PACKING_ONE_SLICE_PER_LAYER
+      if (kiNumOfSlice > MAX_SLICES_NUM) {
+        WelsLog (*ppCtx, WELS_LOG_ERROR,
+                 "AcquireLayersNals(), num_of_slice(%d) > MAX_SLICES_NUM(%d) per (iDid= %d, qid= %d) settings!\n",
+                 kiNumOfSlice, MAX_SLICES_NUM, iDIndex, 0);
+        return 1;
+      }
+    }
+#if !defined(MT_ENABLED) || !defined(PACKING_ONE_SLICE_PER_LAYER)
+    if (iCountNumNals - iOrgNumNals > MAX_NAL_UNITS_IN_LAYER) {
+      WelsLog (*ppCtx, WELS_LOG_ERROR,
+               "AcquireLayersNals(), num_of_nals(%d) > MAX_NAL_UNITS_IN_LAYER(%d) per (iDid= %d, qid= %d) settings!\n",
+               (iCountNumNals - iOrgNumNals), MAX_NAL_UNITS_IN_LAYER, iDIndex, 0);
+      return 1;
+    }
+#endif//!MT_ENABLED) || !PACKING_ONE_SLICE_PER_LAYER
+
+    iCountNumLayers ++;
+
+    ++ iDIndex;
+  } while (iDIndex < iNumDependencyLayers);
+
+  iCountNumNals += 1 + iNumDependencyLayers + (iCountNumLayers << 1) +
+                   iCountNumLayers;	// plus iCountNumLayers for reserved application
+#if defined(MT_ENABLED) && defined(PACKING_ONE_SLICE_PER_LAYER)
+  iNumLayersPack += 1 + iNumDependencyLayers + (iCountNumLayers << 1);
+#endif//MT_ENABLED && PACKING_ONE_SLICE_PER_LAYER
+
+  // to check number of layers / nals / slices dependencies, 12/8/2010
+#if !defined(MT_ENABLED)
+  if (iCountNumLayers > MAX_LAYER_NUM_OF_FRAME) {
+    WelsLog (*ppCtx, WELS_LOG_ERROR, "AcquireLayersNals(), iCountNumLayers(%d) > MAX_LAYER_NUM_OF_FRAME(%d)!",
+             iCountNumLayers, MAX_LAYER_NUM_OF_FRAME);
+    return 1;
+  }
+#else//MT_ENABLED
+#if defined(PACKING_ONE_SLICE_PER_LAYER)
+  if (iNumLayersPack > MAX_LAYER_NUM_OF_FRAME) {
+    WelsLog (*ppCtx, WELS_LOG_ERROR, "AcquireLayersNals(), num_layers_pack_overall(%d) > MAX_LAYER_NUM_OF_FRAME(%d)!",
+             iNumLayersPack, MAX_LAYER_NUM_OF_FRAME);
+    return 1;
+  }
+#else//!PACKING_ONE_SLICE_PER_LAYER
+  if (iCountNumLayers > MAX_LAYER_NUM_OF_FRAME) {
+    WelsLog (*ppCtx, WELS_LOG_ERROR, "AcquireLayersNals(), iCountNumLayers(%d) > MAX_LAYER_NUM_OF_FRAME(%d)!",
+             iCountNumLayers, MAX_LAYER_NUM_OF_FRAME);
+    return 1;
+  }
+#endif//PACKING_ONE_SLICE_PER_LAYER
+#endif//!MT_ENABLED
+
+  if (NULL != pCountLayers)
+    *pCountLayers	= iCountNumLayers;
+  if (NULL != pCountNals)
+    *pCountNals 	= iCountNumNals;
+  return 0;
+}
+
+/*!
+ * \brief	allocate the spatial layer pictures (I420 based source pictures) of every dependency layer
+ */
+int32_t AllocSpatialPictures (sWelsEncCtx** ppCtx, SWelsSvcCodingParam* pParam) {
+  CMemoryAlign* pMemAlign = (*ppCtx)->pMemAlign;
+  const int32_t kiLayerNum = pParam->iNumDependencyLayer;
+  int32_t iDid = 0;
+
+  // one group of pictures per dependency layer; layer 0 is processed unconditionally
+  do {
+    SDLayerParam* pDlp = &pParam->sDependencyLayers[iDid];
+    const int32_t kiWidth = pDlp->iFrameWidth;
+    const int32_t kiHeight = pDlp->iFrameHeight;
+    const uint8_t kuiTemporalPicNum = 2 + WELS_MAX (pDlp->iHighestTemporalId, 1);
+    const uint8_t kuiTotalPicNum = kuiTemporalPicNum + pParam->iLTRRefNum;
+    uint8_t uiPicIdx = 0;
+
+    do {
+      SPicture* pNewPic = AllocPicture (pMemAlign, kiWidth, kiHeight, false);
+      // NOTE(review): on a NULL picture the macro presumably runs the cleanup statements and returns 1 - confirm
+      WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pNewPic), FreeMemorySvc (ppCtx); *ppCtx = NULL)
+      (*ppCtx)->pSpatialPic[iDid][uiPicIdx ++] = pNewPic;
+    } while (uiPicIdx < kuiTotalPicNum);
+
+    (*ppCtx)->uiSpatialLayersInTemporal[iDid] = kuiTemporalPicNum;
+    (*ppCtx)->uiSpatialPicNum[iDid] = kuiTotalPicNum;
+    ++ iDid;
+  } while (iDid < kiLayerNum);
+
+  return 0;
+}
+
+/*!
+ * \brief	release the spatial pictures of every dependency layer
+ *
+ * Walks pSpatialPic[layer][0 .. uiSpatialPicNum[layer] - 1], hands every non-NULL
+ * entry to FreePicture() and finally clears uiSpatialLayersInTemporal[layer].
+ * uiSpatialPicNum[] itself is left untouched.
+ *
+ * \param	pCtx	encoder context owning the pictures
+ */
+void FreeSpatialPictures (sWelsEncCtx* pCtx) {
+  CMemoryAlign* pMemAlign = pCtx->pMemAlign;
+
+  for (int32_t iLayer = 0; iLayer < pCtx->pSvcParam->iNumDependencyLayer; ++ iLayer) {
+    const uint8_t kuiPicNum = pCtx->uiSpatialPicNum[iLayer];
+
+    for (uint8_t uiPic = 0; uiPic < kuiPicNum; ++ uiPic) {
+      if (pCtx->pSpatialPic[iLayer][uiPic] != NULL) {
+        FreePicture (pMemAlign, &pCtx->pSpatialPic[iLayer][uiPic]);
+      }
+    }
+
+    pCtx->uiSpatialLayersInTemporal[iLayer] = 0;
+  }
+}
+
+/*!
+ * \brief	initialize the SMB entries of one dependency layer
+ *
+ * For every macroblock of pLayer this records its position (iMbX / iMbY / iMbXY),
+ * its slice idc, the neighbor availability mask and the pointers into the
+ * encoder wide per-MB buffers.
+ *
+ * \param	pEnc		encoder context owning pStrideTab and the per-MB buffers
+ * \param	pList		first SMB of this layer, iMbWidth * iMbHeight entries are written
+ * \param	pLayer		layer providing iMbWidth, iMbHeight and pSliceEncCtx
+ * \param	kiDlayerId	dependency layer index; its parity selects which half of the
+ *						MV / reference index buffers is used by this layer
+ * \param	kiMaxMbNum	number of MBs covered by each half of those buffers
+ */
+static  void  InitMbInfo (sWelsEncCtx* pEnc, SMB*   pList, SDqLayer* pLayer, const int32_t kiDlayerId,
+                          const int32_t kiMaxMbNum) {
+  int32_t  iMbWidth = pLayer->iMbWidth;
+  int32_t  iMbHeight = pLayer->iMbHeight;
+  int32_t  iIdx;
+  int32_t  iMbNum = iMbWidth * iMbHeight;
+  SSliceCtx* pSliceCtx = pLayer->pSliceEncCtx;
+  uint32_t uiNeighborAvail;
+  // even and odd layers alternate between the two halves of the MV / ref index buffers
+  const int32_t kiOffset = (kiDlayerId & 0x01) * kiMaxMbNum;
+  SMVUnitXY (*pLayerMvUnitBlock4x4)[MB_BLOCK4x4_NUM] = (SMVUnitXY (*)[MB_BLOCK4x4_NUM]) (
+        &pEnc->pMvUnitBlock4x4[MB_BLOCK4x4_NUM * kiOffset]);
+  int8_t (*pLayerRefIndexBlock8x8)[MB_BLOCK8x8_NUM] = (int8_t (*)[MB_BLOCK8x8_NUM]) (
+        &pEnc->pRefIndexBlock4x4[MB_BLOCK8x8_NUM * kiOffset]);
+
+  for (iIdx = 0; iIdx < iMbNum; iIdx++) {
+    BOOL_T     bLeft;
+    BOOL_T     bTop;
+    BOOL_T     bLeftTop;
+    BOOL_T     bRightTop;
+    int32_t  iLeftXY, iTopXY, iLeftTopXY, iRightTopXY;
+    uint8_t  uiSliceIdc;
+
+    pList[iIdx].iMbX = pEnc->pStrideTab->pMbIndexX[kiDlayerId][iIdx];
+    pList[iIdx].iMbY = pEnc->pStrideTab->pMbIndexY[kiDlayerId][iIdx];
+    pList[iIdx].iMbXY = iIdx;
+
+    uiSliceIdc = WelsMbToSliceIdc (pSliceCtx, iIdx);
+    iLeftXY = iIdx - 1;
+    iTopXY = iIdx - iMbWidth;
+    iLeftTopXY = iTopXY - 1;
+    iRightTopXY = iTopXY + 1;
+
+    // a neighbor is available only if it lies inside the picture and in the same slice;
+    // the position test comes first so that WelsMbToSliceIdc never sees an index outside the picture
+    bLeft = (pList[iIdx].iMbX > 0) && (uiSliceIdc == WelsMbToSliceIdc (pSliceCtx, iLeftXY));
+    bTop = (pList[iIdx].iMbY > 0) && (uiSliceIdc == WelsMbToSliceIdc (pSliceCtx, iTopXY));
+    bLeftTop = (pList[iIdx].iMbX > 0) && (pList[iIdx].iMbY > 0) && (uiSliceIdc ==
+               WelsMbToSliceIdc (pSliceCtx, iLeftTopXY));
+    bRightTop = (pList[iIdx].iMbX < (iMbWidth - 1)) && (pList[iIdx].iMbY > 0) && (uiSliceIdc ==
+                WelsMbToSliceIdc (pSliceCtx, iRightTopXY));
+
+    uiNeighborAvail = 0;
+    if (bLeft) {
+      uiNeighborAvail |= LEFT_MB_POS;
+    }
+    if (bTop) {
+      uiNeighborAvail |= TOP_MB_POS;
+    }
+    if (bLeftTop) {
+      uiNeighborAvail |= TOPLEFT_MB_POS;
+    }
+    if (bRightTop) {
+      uiNeighborAvail |= TOPRIGHT_MB_POS;
+    }
+    pList[iIdx].uiSliceIdc = uiSliceIdc;	// merge from svc_hd_opt_b for multiple slices coding
+    pList[iIdx].uiNeighborAvail = uiNeighborAvail;
+    // a second, BASE_MV_MB_NMB based mask used to be computed here, but its result was never
+    // stored or read, so that dead computation has been dropped
+
+    // MV and reference index storage come from this layer's half of the buffers;
+    // SAD cost, intra 4x4 pred mode and non-zero count buffers are indexed by MB only
+    pList[iIdx].sMv = pLayerMvUnitBlock4x4[iIdx];
+    pList[iIdx].pRefIndex = pLayerRefIndexBlock8x8[iIdx];
+    pList[iIdx].pSadCost = &pEnc->pSadCostMb[iIdx];
+    pList[iIdx].pIntra4x4PredMode = &pEnc->pIntra4x4PredModeBlocks[iIdx * INTRA_4x4_MODE_NUM];
+    pList[iIdx].pNonZeroCount = &pEnc->pNonZeroCountBlocks[iIdx * MB_LUMA_CHROMA_BLOCK4x4_NUM];
+  }
+}
+
+
+/*!
+ * \brief	allocate and initialize the macroblock lists of all dependency layers
+ *
+ * One SMB array covering the MBs of every layer is allocated; ppMbListD[i] points
+ * at the first SMB of layer i inside that array. Each layer's list is attached to
+ * ppDqLayerList[i]->sMbDataP and filled by InitMbInfo().
+ * ppDqLayerList[] must already be populated when this function is called.
+ *
+ * \param	ppCtx	address of the encoder context pointer
+ * \return	0 - successful; 1 - invalid layer count or allocation failure
+ *			(FreeMemorySvc() is called for allocation failures)
+ */
+int32_t   InitMbListD (sWelsEncCtx** ppCtx) {
+  int32_t iNumDlayer = (*ppCtx)->pSvcParam->iNumDependencyLayer;
+  int32_t iMbSize[MAX_DEPENDENCY_LAYER] = { 0 };
+  int32_t iOverallMbNum = 0;
+  int32_t iMbWidth = 0;
+  int32_t iMbHeight = 0;
+  int32_t i;
+
+  // iMbSize[iNumDlayer - 1] is read below, so a layer count below 1 must be rejected as well
+  if (iNumDlayer < 1 || iNumDlayer > MAX_DEPENDENCY_LAYER)
+    return 1;
+
+  for (i = 0; i < iNumDlayer; i++) {
+    iMbWidth = ((*ppCtx)->pSvcParam->sDependencyLayers[i].iFrameWidth + 15) >> 4;
+    iMbHeight = ((*ppCtx)->pSvcParam->sDependencyLayers[i].iFrameHeight + 15) >> 4;
+    iMbSize[i] = iMbWidth  * iMbHeight;
+    iOverallMbNum += iMbSize[i];
+  }
+
+  (*ppCtx)->ppMbListD = static_cast<SMB**> ((*ppCtx)->pMemAlign->WelsMalloc (iNumDlayer * sizeof (SMB*), "ppMbListD"));
+  // verify the allocation before the first element is touched
+  WELS_VERIFY_RETURN_PROC_IF (1, (*ppCtx)->ppMbListD == NULL, FreeMemorySvc (ppCtx));
+  (*ppCtx)->ppMbListD[0] = NULL;
+  (*ppCtx)->ppMbListD[0] = static_cast<SMB*> ((*ppCtx)->pMemAlign->WelsMallocz (iOverallMbNum * sizeof (SMB),
+                           "ppMbListD[0]"));
+  WELS_VERIFY_RETURN_PROC_IF (1, (*ppCtx)->ppMbListD[0] == NULL, FreeMemorySvc (ppCtx));
+  (*ppCtx)->ppDqLayerList[0]->sMbDataP = (*ppCtx)->ppMbListD[0];
+  // the MB count of the last layer is passed as kiMaxMbNum for every layer
+  InitMbInfo (*ppCtx, (*ppCtx)->ppMbListD[0], (*ppCtx)->ppDqLayerList[0], 0, iMbSize[iNumDlayer - 1]);
+  for (i = 1; i < iNumDlayer; i++) {
+    // layer i starts right behind the MBs of layer i - 1 inside the single SMB array
+    (*ppCtx)->ppMbListD[i] = (*ppCtx)->ppMbListD[i - 1] + iMbSize[i - 1];
+    (*ppCtx)->ppDqLayerList[i]->sMbDataP = (*ppCtx)->ppMbListD[i];
+    InitMbInfo (*ppCtx, (*ppCtx)->ppMbListD[i], (*ppCtx)->ppDqLayerList[i], i, iMbSize[iNumDlayer - 1]);
+  }
+
+  return 0;
+}
+
+/*!
+ * \brief	allocate the work buffers referenced by an SMbCache
+ *
+ * The buffers are requested one after another from pMa. The function returns at the
+ * first failed request without releasing the buffers obtained before it.
+ *
+ * \param	pMbCache	cache whose buffer pointers are assigned
+ * \param	pMa			memory allocator used for every buffer
+ * \return	0 - successful; 1 - an allocation failed
+ */
+int32_t AllocMbCacheAligned (SMbCache* pMbCache, CMemoryAlign* pMa) {
+  pMbCache->pCoeffLevel = static_cast<int16_t*> (pMa->WelsMalloc (MB_COEFF_LIST_SIZE * sizeof (int16_t),
+                          "pMbCache->pCoeffLevel"));
+  WELS_VERIFY_RETURN_IF (1, (pMbCache->pCoeffLevel == NULL));
+
+  pMbCache->pMemPredMb = static_cast<uint8_t*> (pMa->WelsMalloc (2 * 256 * sizeof (uint8_t),
+                         "pMbCache->pMemPredMb"));
+  WELS_VERIFY_RETURN_IF (1, (pMbCache->pMemPredMb == NULL));
+
+  pMbCache->pSkipMb = static_cast<uint8_t*> (pMa->WelsMalloc (384 * sizeof (uint8_t),
+                      "pMbCache->pSkipMb"));
+  WELS_VERIFY_RETURN_IF (1, (pMbCache->pSkipMb == NULL));
+
+  pMbCache->pMemPredBlk4 = static_cast<uint8_t*> (pMa->WelsMalloc (2 * 16 * sizeof (uint8_t),
+                           "pMbCache->pMemPredBlk4"));
+  WELS_VERIFY_RETURN_IF (1, (pMbCache->pMemPredBlk4 == NULL));
+
+  pMbCache->pBufferInterPredMe = static_cast<uint8_t*> (pMa->WelsMalloc (4 * 640 * sizeof (uint8_t),
+                                 "pMbCache->pBufferInterPredMe"));
+  WELS_VERIFY_RETURN_IF (1, (pMbCache->pBufferInterPredMe == NULL));
+
+  pMbCache->pPrevIntra4x4PredModeFlag = static_cast<bool_t*> (pMa->WelsMalloc (16 * sizeof (bool_t),
+                                        "pMbCache->pPrevIntra4x4PredModeFlag"));
+  WELS_VERIFY_RETURN_IF (1, (pMbCache->pPrevIntra4x4PredModeFlag == NULL));
+
+  pMbCache->pRemIntra4x4PredModeFlag = static_cast<int8_t*> (pMa->WelsMalloc (16 * sizeof (int8_t),
+                                       "pMbCache->pRemIntra4x4PredModeFlag"));
+  WELS_VERIFY_RETURN_IF (1, (pMbCache->pRemIntra4x4PredModeFlag == NULL));
+
+  pMbCache->pDct = static_cast<SDCTCoeff*> (pMa->WelsMalloc (sizeof (SDCTCoeff), "pMbCache->pDct"));
+  WELS_VERIFY_RETURN_IF (1, (pMbCache->pDct == NULL));
+
+  return 0;
+}
+
+void FreeMbCache (SMbCache* pMbCache, CMemoryAlign* pMa) {
+  if (NULL != pMbCache->pCoeffLevel) {
+    pMa->WelsFree (pMbCache->pCoeffLevel, "pMbCache->pCoeffLevel");
+    pMbCache->pCoeffLevel = NULL;
+  }
+  if (NULL != pMbCache->pMemPredMb) {
+    pMa->WelsFree (pMbCache->pMemPredMb, "pMbCache->pMemPredMb");
+    pMbCache->pMemPredMb = NULL;
+  }
+  if (NULL != pMbCache->pSkipMb) {
+    pMa->WelsFree (pMbCache->pSkipMb, "pMbCache->pSkipMb");
+    pMbCache->pSkipMb = NULL;
+  }
+  if (NULL != pMbCache->pMemPredBlk4) {
+    pMa->WelsFree (pMbCache->pMemPredBlk4, "pMbCache->pMemPredBlk4");
+    pMbCache->pMemPredBlk4 = NULL;
+  }
+  if (NULL != pMbCache->pBufferInterPredMe) {
+    pMa->WelsFree (pMbCache->pBufferInterPredMe, "pMbCache->pBufferInterPredMe");
+    pMbCache->pBufferInterPredMe = NULL;
+  }
+  if (NULL != pMbCache->pPrevIntra4x4PredModeFlag) {
+    pMa->WelsFree (pMbCache->pPrevIntra4x4PredModeFlag, "pMbCache->pPrevIntra4x4PredModeFlag");
+    pMbCache->pPrevIntra4x4PredModeFlag = NULL;
+  }
+  if (NULL != pMbCache->pRemIntra4x4PredModeFlag) {
+    pMa->WelsFree (pMbCache->pRemIntra4x4PredModeFlag, "pMbCache->pRemIntra4x4PredModeFlag");
+    pMbCache->pRemIntra4x4PredModeFlag = NULL;
+  }
+  if (NULL != pMbCache->pDct) {
+    pMa->WelsFree (pMbCache->pDct, "pMbCache->pDct");
+    pMbCache->pDct = NULL;
+  }
+}
+
+
+/*!
+ * \brief	initialize ppDqLayerList and pSliceCtxList for every dependency layer
+ *
+ * In order, for all layers: allocates the reference picture lists, the spatial
+ * source pictures, the SDqLayer objects with their slices and MB caches, then the
+ * PPS / SPS / subset SPS arrays and pDqIdcMap, and finally initializes the
+ * parameter sets and the slice encoding context of each layer.
+ *
+ * \param	ppCtx	address of the encoder context pointer; FreeMemorySvc() is called on it
+ *					when an allocation or InitSlicePEncCtx() fails
+ * \return	0 - successful; otherwise failed
+ */
+static inline int32_t InitDqLayers (sWelsEncCtx** ppCtx) {
+  SWelsSvcCodingParam* pParam = NULL;
+  SWelsSPS* pSps = NULL;
+  SSubsetSps* pSubsetSps = NULL;
+  SWelsPPS* pPps = NULL;
+  CMemoryAlign* pMa = NULL;
+  SStrideTables* pStrideTab = NULL;
+  int32_t iDlayerCount = 0;
+  int32_t iDlayerIndex = 0;
+  uint32_t iSpsId = 0;
+  uint32_t iPpsId = 0;
+  uint32_t iNumRef = 0;
+  int32_t iResult = 0;
+
+  if (NULL == ppCtx || NULL == *ppCtx)
+    return 1;
+
+  pMa = (*ppCtx)->pMemAlign;
+  pParam = (*ppCtx)->pSvcParam;
+  iDlayerCount = pParam->iNumDependencyLayer;
+  iNumRef = pParam->iNumRefFrame;
+//	highest_layers_in_temporal = 1 + WELS_MAX(pParam->iDecompStages, 1);
+  pStrideTab = (*ppCtx)->pStrideTab;
+
+  // step 1: encoder block stride offsets and reference picture list of every layer
+  iDlayerIndex = 0;
+  while (iDlayerIndex < iDlayerCount) {
+    SRefList* pRefList = NULL;
+    uint32_t i = 0;
+    const int32_t kiWidth = pParam->sDependencyLayers[iDlayerIndex].iFrameWidth;
+    const int32_t kiHeight = pParam->sDependencyLayers[iDlayerIndex].iFrameHeight;
+    int32_t iPicWidth = WELS_ALIGN (kiWidth, MB_WIDTH_LUMA) + (PADDING_LENGTH << 1);	// with iWidth of horizon
+    int32_t iPicChromaWidth = iPicWidth >> 1;
+
+    iPicWidth = WELS_ALIGN (iPicWidth,
+                            32);	// 32(or 16 for chroma below) to match original imp. here instead of iCacheLineSize
+    iPicChromaWidth = WELS_ALIGN (iPicChromaWidth, 16);
+
+    WelsGetEncBlockStrideOffset ((*ppCtx)->pStrideTab->pStrideEncBlockOffset[iDlayerIndex], iPicWidth, iPicChromaWidth);
+
+    // pRef list
+    pRefList = (SRefList*)pMa->WelsMallocz (sizeof (SRefList), "pRefList");
+    WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pRefList), FreeMemorySvc (ppCtx))
+
+    // NOTE(review): pRefList is attached to the context only after all of its pictures are
+    // allocated; if FreeMemorySvc() only walks ppRefPicListExt, a failure below leaks
+    // pRefList and the pictures already obtained - confirm against FreeMemorySvc()
+    do {
+      pRefList->pRef[i] = AllocPicture (pMa, kiWidth, kiHeight, true);	// to use actual size of current layer
+      WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pRefList->pRef[i]), FreeMemorySvc (ppCtx))
+      ++ i;
+    } while (i < 1 + iNumRef);
+
+    pRefList->pNextBuffer = pRefList->pRef[0];
+    (*ppCtx)->ppRefPicListExt[iDlayerIndex] = pRefList;
+    ++ iDlayerIndex;
+  }
+
+  // for I420 based source spatial pictures
+  if (AllocSpatialPictures (ppCtx, pParam)) {
+    FreeMemorySvc (ppCtx);
+    return 1;
+  }
+
+  // step 2: SDqLayer object, slices and MB caches of every layer
+  iDlayerIndex = 0;
+  while (iDlayerIndex < iDlayerCount) {
+    SDqLayer* pDqLayer = NULL;
+    SDLayerParam* pDlayer = &pParam->sDependencyLayers[iDlayerIndex];
+    const int32_t kiMbW = (pDlayer->iFrameWidth + 0x0f) >> 4;
+    const int32_t kiMbH = (pDlayer->iFrameHeight + 0x0f) >> 4;
+    int32_t iMaxSliceNum = 1;
+    const int32_t kiSliceNum = GetInitialSliceNum (kiMbW, kiMbH, &pDlayer->sMso);
+    if (iMaxSliceNum < kiSliceNum)
+      iMaxSliceNum = kiSliceNum;
+
+    // pDq layers list
+    // NOTE(review): pDqLayer is stored into ppDqLayerList only at the end of this iteration,
+    // so the failure paths below may leak it - confirm against FreeMemorySvc()
+    pDqLayer = (SDqLayer*)pMa->WelsMallocz (sizeof (SDqLayer), "pDqLayer");
+    WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pDqLayer), FreeMemorySvc (ppCtx))
+
+    // for dynamic slicing mode
+    if (SM_DYN_SLICE == pDlayer->sMso.uiSliceMode) {
+      const int32_t iSize = pParam->iCountThreadsNum * sizeof (int32_t);
+
+      pDqLayer->pNumSliceCodedOfPartition = (int32_t*)pMa->WelsMallocz (iSize, "pNumSliceCodedOfPartition");
+      pDqLayer->pLastCodedMbIdxOfPartition = (int32_t*)pMa->WelsMallocz (iSize, "pLastCodedMbIdxOfPartition");
+      pDqLayer->pLastMbIdxOfPartition = (int32_t*)pMa->WelsMallocz (iSize, "pLastMbIdxOfPartition");
+
+      WELS_VERIFY_RETURN_PROC_IF (1,
+                                  (NULL == pDqLayer->pNumSliceCodedOfPartition ||
+                                   NULL == pDqLayer->pLastCodedMbIdxOfPartition ||
+                                   NULL == pDqLayer->pLastMbIdxOfPartition),
+                                  FreeMemorySvc (ppCtx))
+    }
+
+    pDqLayer->iMbWidth = kiMbW;
+    pDqLayer->iMbHeight = kiMbH;
+#ifndef MT_ENABLED
+    if (SM_DYN_SLICE == pDlayer->sMso.uiSliceMode) { //wmalloc pSliceInLayer
+      SSlice* pSlice = NULL;
+      int32_t iSliceIdx = 0;
+      //wmalloc AVERSLICENUM_CONSTANT of pDqLayer->sLayerInfo.pSliceInLayer,
+      //wmalloc AVERSLICENUM_CONSTANT num of pSlice as initialization
+      //only set value for the first pSlice
+      pDqLayer->sLayerInfo.pSliceInLayer = (SSlice*)pMa->WelsMallocz (sizeof (SSlice) * iMaxSliceNum, "pSliceInLayer");
+
+      WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pDqLayer->sLayerInfo.pSliceInLayer), FreeMemorySvc (ppCtx)) {
+        pSlice = &pDqLayer->sLayerInfo.pSliceInLayer[0];
+        pSlice->uiSliceIdx = 0;
+        pSlice->pSliceBsa = & (*ppCtx)->pOut->sBsWrite;
+      }
+
+      // every slice gets its own MB cache, including those whose index / bs writer are not set yet
+      while (iSliceIdx < iMaxSliceNum) {
+        pSlice = &pDqLayer->sLayerInfo.pSliceInLayer[iSliceIdx];
+        if (AllocMbCacheAligned (&pSlice->sMbCacheInfo, pMa)) {
+          FreeMemorySvc (ppCtx);
+          return 1;
+        }
+        ++ iSliceIdx;
+      }
+    } else
+#endif//!MT_ENABLED
+    {
+      int32_t iSliceIdx = 0;
+      pDqLayer->sLayerInfo.pSliceInLayer = (SSlice*)pMa->WelsMallocz (sizeof (SSlice) * iMaxSliceNum, "pSliceInLayer");
+
+      WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pDqLayer->sLayerInfo.pSliceInLayer), FreeMemorySvc (ppCtx))
+      if (iMaxSliceNum > 1) {
+        while (iSliceIdx < iMaxSliceNum) {
+          SSlice* pSlice = &pDqLayer->sLayerInfo.pSliceInLayer[iSliceIdx];
+          pSlice->uiSliceIdx = iSliceIdx;
+#ifdef MT_ENABLED
+          // with more than one thread each slice writes into its own slice bs buffer
+          if (pParam->iMultipleThreadIdc > 1)
+            pSlice->pSliceBsa = & (*ppCtx)->pSliceBs[iSliceIdx].sBsWrite;
+          else
+            pSlice->pSliceBsa = & (*ppCtx)->pOut->sBsWrite;
+#else
+          pSlice->pSliceBsa = & (*ppCtx)->pOut->sBsWrite;
+#endif//MT_ENABLED
+          if (AllocMbCacheAligned (&pSlice->sMbCacheInfo, pMa)) {
+            FreeMemorySvc (ppCtx);
+            return 1;
+          }
+          ++ iSliceIdx;
+        }
+      }
+      // fix issue in case single pSlice coding might be inclusive exist in variant spatial layer setting, also introducing multi-pSlice modes
+      else {	// only one pSlice
+        SSlice* pSlice = &pDqLayer->sLayerInfo.pSliceInLayer[0];
+        pSlice->uiSliceIdx = 0;
+        pSlice->pSliceBsa = & (*ppCtx)->pOut->sBsWrite;
+        if (AllocMbCacheAligned (&pSlice->sMbCacheInfo, pMa)) {
+          FreeMemorySvc (ppCtx);
+          return 1;
+        }
+      }
+    }
+
+    //deblocking parameters initialization
+    //target-layer deblocking
+    pDqLayer->iLoopFilterDisableIdc = pParam->iLoopFilterDisableIdc;
+    pDqLayer->iLoopFilterAlphaC0Offset = (pParam->iLoopFilterAlphaC0Offset) << 1;
+    pDqLayer->iLoopFilterBetaOffset = (pParam->iLoopFilterBetaOffset) << 1;
+    //inter-layer deblocking
+    pDqLayer->uiDisableInterLayerDeblockingFilterIdc = pParam->iInterLayerLoopFilterDisableIdc;
+    pDqLayer->iInterLayerSliceAlphaC0Offset = (pParam->iInterLayerLoopFilterAlphaC0Offset) << 1;
+    pDqLayer->iInterLayerSliceBetaOffset = (pParam->iInterLayerLoopFilterBetaOffset) << 1;
+    //parallel deblocking
+    pDqLayer->bDeblockingParallelFlag = pParam->bDeblockingParallelFlag;
+
+    //deblocking parameter adjustment
+    if (SM_SINGLE_SLICE == pDlayer->sMso.uiSliceMode) {
+      //iLoopFilterDisableIdc: will be 0 or 1 under single_slice
+      if (2 == pParam->iLoopFilterDisableIdc) {
+        pDqLayer->iLoopFilterDisableIdc = 0;
+      }
+      //bDeblockingParallelFlag
+      pDqLayer->bDeblockingParallelFlag = false;
+    } else {
+      //multi-pSlice
+#ifdef MT_ENABLED
+      if (0 == pDqLayer->iLoopFilterDisableIdc) {
+        pDqLayer->bDeblockingParallelFlag = false;
+      }
+#endif
+    }
+
+    (*ppCtx)->ppDqLayerList[iDlayerIndex] = pDqLayer;
+
+    ++ iDlayerIndex;
+  }
+
+  // for dynamically malloc for parameter sets memory instead of maximal items for standard to reduce size, 3/18/2010
+  // test the flag itself: the former condition took the address of the flag, which is never NULL,
+  // so the smaller allocation could not be reached
+  if ((*ppCtx)->pSvcParam->bMgsT0OnlyStrategy) {
+    (*ppCtx)->pPPSArray = (SWelsPPS*)pMa->WelsMalloc ((1 + iDlayerCount) * sizeof (SWelsPPS), "pPPSArray");
+  } else {
+    (*ppCtx)->pPPSArray = (SWelsPPS*)pMa->WelsMalloc (iDlayerCount * sizeof (SWelsPPS), "pPPSArray");
+  }
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pPPSArray), FreeMemorySvc (ppCtx))
+
+  (*ppCtx)->pSpsArray = (SWelsSPS*)pMa->WelsMalloc (sizeof (SWelsSPS), "pSpsArray");
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pSpsArray), FreeMemorySvc (ppCtx))
+  // one subset SPS for every layer above the base layer
+  if (iDlayerCount > 1) {
+    (*ppCtx)->pSubsetArray = (SSubsetSps*)pMa->WelsMalloc ((iDlayerCount - 1) * sizeof (SSubsetSps), "pSubsetArray");
+    WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pSubsetArray), FreeMemorySvc (ppCtx))
+  }
+
+  (*ppCtx)->pDqIdcMap = (SDqIdc*)pMa->WelsMallocz (iDlayerCount * sizeof (SDqIdc), "pDqIdcMap");
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pDqIdcMap), FreeMemorySvc (ppCtx))
+
+  // step 3: parameter sets and slice encoding context of every layer
+  iDlayerIndex = 0;
+  while (iDlayerIndex < iDlayerCount) {
+    SDqIdc* pDqIdc = & (*ppCtx)->pDqIdcMap[iDlayerIndex];
+    const bool_t bUseSubsetSps = (iDlayerIndex > BASE_DEPENDENCY_ID);
+    SDLayerParam* pDlayerParam = &pParam->sDependencyLayers[iDlayerIndex];
+
+    pDqIdc->uiSpatialId = iDlayerIndex;
+    pPps = & (*ppCtx)->pPPSArray[iPpsId];
+    if (!bUseSubsetSps) {
+      pSps = & (*ppCtx)->pSpsArray[iSpsId];
+    } else {
+      pSubsetSps = & (*ppCtx)->pSubsetArray[iSpsId];
+      pSps = &pSubsetSps->pSps;
+    }
+
+    // Need port pSps/pPps initialization due to spatial scalability changed
+    if (!bUseSubsetSps) {
+      WelsInitSps (pSps, pDlayerParam, pParam->uiIntraPeriod, pParam->iNumRefFrame, iSpsId,
+                   pParam->bEnableFrameCroppingFlag, pParam->bEnableRc);
+
+      if (iDlayerCount > 1) {
+        pSps->bConstraintSet0Flag = true;
+        pSps->bConstraintSet1Flag = true;
+        pSps->bConstraintSet2Flag = true;
+      }
+    } else {
+      WelsInitSubsetSps (pSubsetSps, pDlayerParam, pParam->uiIntraPeriod, pParam->iNumRefFrame, iSpsId,
+                         pParam->bEnableFrameCroppingFlag, pParam->bEnableRc);
+    }
+
+    // initialize pPps
+    WelsInitPps (pPps, pSps, pSubsetSps, iPpsId, true, bUseSubsetSps);
+
+    // Not using FMO in SVC coding so far, come back if need FMO
+    {
+      iResult = InitSlicePEncCtx (& (*ppCtx)->pSliceCtxList[iDlayerIndex],
+                                  (*ppCtx)->pMemAlign,
+                                  false,
+                                  pSps->iMbWidth,
+                                  pSps->iMbHeight,
+                                  & (pDlayerParam->sMso),
+                                  pPps);
+      if (iResult) {
+        WelsLog (*ppCtx, WELS_LOG_WARNING, "InitDqLayers(), InitSlicePEncCtx failed(%d)!", iResult);
+        FreeMemorySvc (ppCtx);
+        return 1;
+      }
+      (*ppCtx)->ppDqLayerList[iDlayerIndex]->pSliceEncCtx = & (*ppCtx)->pSliceCtxList[iDlayerIndex];
+    }
+    pDqIdc->iSpsId = iSpsId;
+    pDqIdc->iPpsId = iPpsId;
+
+    (*ppCtx)->sPSOVector.bPpsIdMappingIntoSubsetsps[iPpsId] = bUseSubsetSps;
+
+    // iSpsId only advances after a subset SPS: the base layer uses pSpsArray[0] and the
+    // first enhancement layer uses pSubsetArray[0]
+    if (bUseSubsetSps)
+      ++ iSpsId;
+    ++ iPpsId;
+    ++ (*ppCtx)->iSpsNum;
+    ++ (*ppCtx)->iPpsNum;
+
+    ++ iDlayerIndex;
+  }
+  return 0;
+}
+
+/*!
+ * \brief	allocate and fill the stride / MB index tables (SStrideTables) of the encoder
+ *
+ * A single buffer ("pBase") is allocated and divided into, in this order:
+ *  - the decoder block stride offset tables, 24 int32_t per spatial layer and temporal flag,
+ *    filled by WelsGetEncBlockStrideOffset();
+ *  - the encoder block stride offset tables, 24 int32_t per spatial layer (only pointed at here);
+ *  - pMbIndexX, one int16_t per MB and layer holding the MB's column;
+ *  - pMbIndexY, one int16_t per MB and layer holding the MB's row.
+ * The row templates are sized from the last spatial layer.
+ *
+ * \param	ppCtx				address of the encoder context pointer; pStrideTab is attached to it
+ * \param	kiNumSpatialLayers	number of spatial layers, 1 .. MAX_DEPENDENCY_LAYER
+ * \return	0 - successful; 1 - invalid layer count or allocation failure (memory already
+ *			attached to the context is not released here)
+ */
+int32_t AllocStrideTables (sWelsEncCtx** ppCtx, const int32_t kiNumSpatialLayers) {
+  CMemoryAlign* pMa = (*ppCtx)->pMemAlign;
+  SWelsSvcCodingParam* pParam = (*ppCtx)->pSvcParam;
+  SStrideTables* pPtr = NULL;
+  int16_t* pTmpRow = NULL, *pRowX = NULL, *pRowY = NULL, *p = NULL;
+  uint8_t* pBase = NULL;
+  uint8_t* pBaseDec = NULL, *pBaseEnc = NULL, *pBaseMbX = NULL, *pBaseMbY = NULL;
+  struct {
+    int32_t iMbWidth;
+    int32_t iCountMbNum;				// count number of SMB in each spatial
+    int32_t iSizeAllMbAlignCache;	// cache line size aligned in each spatial
+  } sMbSizeMap[MAX_DEPENDENCY_LAYER] = {0};
+  int32_t iLineSizeY[MAX_DEPENDENCY_LAYER][2] = {0};
+  int32_t iLineSizeUV[MAX_DEPENDENCY_LAYER][2] = {0};
+  int32_t iMapSpatialIdx[MAX_DEPENDENCY_LAYER][2] = {0};
+  int32_t iSizeDec = 0;
+  int32_t iSizeEnc = 0;
+  int32_t iCountLayersNeedCs[2] = {0};
+  const int32_t kiUnit1Size = 24 * sizeof (int32_t);
+  int32_t iUnit2Size = 0;
+  int32_t iNeedAllocSize = 0;
+  int32_t iRowSize = 0;
+  int16_t iMaxMbWidth = 0;
+  int16_t iMaxMbHeight = 0;
+  int32_t i = 0;
+  int32_t iSpatialIdx = 0;
+  int32_t iTemporalIdx = 0;
+  int32_t iCntTid = 0;
+
+  if (kiNumSpatialLayers <= 0 || kiNumSpatialLayers > MAX_DEPENDENCY_LAYER)
+    return 1;
+
+  // zero-initialized on purpose: pStrideDecBlockOffset[layer][0] is not assigned below when
+  // there is a single temporal layer, and none of the table pointers is assigned when the
+  // "pBase" allocation fails after the tables are already attached to the context
+  pPtr = (SStrideTables*)pMa->WelsMallocz (sizeof (SStrideTables), "SStrideTables");
+  if (NULL == pPtr)
+    return 1;
+  (*ppCtx)->pStrideTab = pPtr;
+
+  iCntTid = pParam->iNumTemporalLayer > 1 ? 2 : 1;
+
+  // MB geometry of every spatial layer and the overall size of one MB index table
+  iSpatialIdx = 0;
+  while (iSpatialIdx < kiNumSpatialLayers) {
+    const int32_t kiTmpWidth = (pParam->sDependencyLayers[iSpatialIdx].iFrameWidth + 15) >> 4;
+    const int32_t kiTmpHeight = (pParam->sDependencyLayers[iSpatialIdx].iFrameHeight + 15) >> 4;
+    int32_t iNumMb = kiTmpWidth * kiTmpHeight;
+
+    sMbSizeMap[iSpatialIdx].iMbWidth = kiTmpWidth;
+    sMbSizeMap[iSpatialIdx].iCountMbNum = iNumMb;
+
+    iNumMb *= sizeof (int16_t);
+    sMbSizeMap[iSpatialIdx].iSizeAllMbAlignCache = iNumMb;
+    iUnit2Size += iNumMb;
+
+    ++ iSpatialIdx;
+  }
+
+  // Adaptive size_cs, size_fdec by implementation dependency
+  // kbBaseTemporalFlag is 1 on the first pass and 0 on the second one (if any)
+  iTemporalIdx = 0;
+  while (iTemporalIdx < iCntTid) {
+    const bool_t kbBaseTemporalFlag = (iTemporalIdx == 0);
+
+    iSpatialIdx = 0;
+    while (iSpatialIdx < kiNumSpatialLayers) {
+      SDLayerParam* fDlp = &pParam->sDependencyLayers[iSpatialIdx];
+
+      const int32_t kiWidthPad = WELS_ALIGN (fDlp->iFrameWidth, 16) + (PADDING_LENGTH << 1);
+      iLineSizeY[iSpatialIdx][kbBaseTemporalFlag] = WELS_ALIGN (kiWidthPad, 32);
+      iLineSizeUV[iSpatialIdx][kbBaseTemporalFlag] = WELS_ALIGN ((kiWidthPad >> 1), 16);
+
+      iMapSpatialIdx[iCountLayersNeedCs[kbBaseTemporalFlag]][kbBaseTemporalFlag] = iSpatialIdx;
+      ++ iCountLayersNeedCs[kbBaseTemporalFlag];
+      ++ iSpatialIdx;
+    }
+    ++ iTemporalIdx;
+  }
+  iSizeDec = kiUnit1Size * (iCountLayersNeedCs[0] + iCountLayersNeedCs[1]);
+  iSizeEnc = kiUnit1Size * kiNumSpatialLayers;
+
+  // two MB index tables (X and Y) follow the block offset tables
+  iNeedAllocSize = iSizeDec + iSizeEnc + (iUnit2Size << 1);
+
+  pBase = (uint8_t*)pMa->WelsMalloc (iNeedAllocSize, "pBase");
+  if (NULL == pBase) {
+    return 1;
+  }
+
+  pBaseDec = pBase;		// iCountLayersNeedCs
+  pBaseEnc = pBaseDec + iSizeDec;		// iNumSpatialLayers
+  pBaseMbX = pBaseEnc + iSizeEnc;	// iNumSpatialLayers
+  pBaseMbY = pBaseMbX + iUnit2Size;	// iNumSpatialLayers
+
+  // fill the decoder block stride offset tables; the very first table handed out,
+  // pStrideDecBlockOffset[0][1], coincides with pBase
+  iTemporalIdx = 0;
+  while (iTemporalIdx < iCntTid) {
+    const bool_t kbBaseTemporalFlag = (iTemporalIdx == 0);
+
+    iSpatialIdx = 0;
+    while (iSpatialIdx < iCountLayersNeedCs[kbBaseTemporalFlag]) {
+      const int32_t kiActualSpatialIdx = iMapSpatialIdx[iSpatialIdx][kbBaseTemporalFlag];
+      const int32_t kiLumaWidth = iLineSizeY[kiActualSpatialIdx][kbBaseTemporalFlag];
+      const int32_t kiChromaWidth = iLineSizeUV[kiActualSpatialIdx][kbBaseTemporalFlag];
+
+      WelsGetEncBlockStrideOffset ((int32_t*)pBaseDec, kiLumaWidth, kiChromaWidth);
+
+      pPtr->pStrideDecBlockOffset[kiActualSpatialIdx][kbBaseTemporalFlag] = (int32_t*)pBaseDec;
+      pBaseDec += kiUnit1Size;
+
+      ++ iSpatialIdx;
+    }
+    ++ iTemporalIdx;
+  }
+  // layers which are not in the spatial map share the table of the first mapped layer
+  iTemporalIdx = 0;
+  while (iTemporalIdx < iCntTid) {
+    const bool_t kbBaseTemporalFlag = (iTemporalIdx == 0);
+
+    iSpatialIdx = 0;
+    while (iSpatialIdx < kiNumSpatialLayers) {
+      int32_t iMatchIndex = 0;
+      bool_t bInMap = false;
+      bool_t bMatchFlag = false;
+
+      i = 0;
+      while (i < iCountLayersNeedCs[kbBaseTemporalFlag]) {
+        const int32_t kiActualIdx = iMapSpatialIdx[i][kbBaseTemporalFlag];
+        if (kiActualIdx == iSpatialIdx) {
+          bInMap = true;
+          break;
+        }
+        if (!bMatchFlag) {
+          iMatchIndex = kiActualIdx;
+          bMatchFlag = true;
+        }
+        ++ i;
+      }
+
+      if (bInMap) {
+        ++ iSpatialIdx;
+        continue;
+      }
+
+      // not in spatial map and assign match one to it
+      pPtr->pStrideDecBlockOffset[iSpatialIdx][kbBaseTemporalFlag] =
+        pPtr->pStrideDecBlockOffset[iMatchIndex][kbBaseTemporalFlag];
+
+      ++ iSpatialIdx;
+    }
+    ++ iTemporalIdx;
+  }
+
+  // hand out the encoder block offset table and the MB index tables of every layer
+  iSpatialIdx = 0;
+  while (iSpatialIdx < kiNumSpatialLayers) {
+    const int32_t kiAllocMbSize = sMbSizeMap[iSpatialIdx].iSizeAllMbAlignCache;
+
+    pPtr->pStrideEncBlockOffset[iSpatialIdx] = (int32_t*)pBaseEnc;
+
+    pPtr->pMbIndexX[iSpatialIdx] = (int16_t*)pBaseMbX;
+    pPtr->pMbIndexY[iSpatialIdx] = (int16_t*)pBaseMbY;
+
+    pBaseEnc += kiUnit1Size;
+    pBaseMbX += kiAllocMbSize;
+    pBaseMbY += kiAllocMbSize;
+
+    ++ iSpatialIdx;
+  }
+
+  // unused layers get NULL tables
+  while (iSpatialIdx < MAX_DEPENDENCY_LAYER) {
+    pPtr->pStrideDecBlockOffset[iSpatialIdx][0] = NULL;
+    pPtr->pStrideDecBlockOffset[iSpatialIdx][1] = NULL;
+    pPtr->pStrideEncBlockOffset[iSpatialIdx] = NULL;
+    pPtr->pMbIndexX[iSpatialIdx] = NULL;
+    pPtr->pMbIndexY[iSpatialIdx] = NULL;
+
+    ++ iSpatialIdx;
+  }
+
+  // initialize pMbIndexX and pMbIndexY tables as below
+
+  // the row templates are sized from the last spatial layer; the copies below rely on no
+  // other layer being wider than that one
+  iMaxMbWidth = sMbSizeMap[kiNumSpatialLayers - 1].iMbWidth;
+  iMaxMbWidth = WELS_ALIGN (iMaxMbWidth, 4);	// 4 loops for int16_t required introduced as below
+  iRowSize = iMaxMbWidth * sizeof (int16_t);
+
+  pTmpRow = (int16_t*)pMa->WelsMalloc (iRowSize, "pTmpRow");
+  if (NULL == pTmpRow) {
+    return 1;
+  }
+  // pRowX and pRowY share the temporary row: it is used for X first and reused for Y afterwards
+  pRowX = pTmpRow;
+  pRowY = pRowX;
+  // initialize pRowX & pRowY
+  i = 0;
+  p = pRowX;
+  while (i < iMaxMbWidth) {
+    *p = i;
+    * (p + 1) = 1 + i;
+    * (p + 2) = 2 + i;
+    * (p + 3) = 3 + i;
+
+    p += 4;
+    i += 4;
+  }
+
+  // pMbIndexX: every MB row of every layer is a copy of 0, 1, 2, ... (kiMbWidth - 1)
+  iSpatialIdx = kiNumSpatialLayers;
+  while (--iSpatialIdx >= 0) {
+    int16_t* pMbIndexX = pPtr->pMbIndexX[iSpatialIdx];
+    const int32_t kiMbWidth = sMbSizeMap[iSpatialIdx].iMbWidth;
+    const int32_t kiMbHeight = sMbSizeMap[iSpatialIdx].iCountMbNum / kiMbWidth;
+    const int32_t kiLineSize = kiMbWidth * sizeof (int16_t);
+
+    i = 0;
+    while (i < kiMbHeight) {
+      memcpy (pMbIndexX, pRowX, kiLineSize);	// confirmed_safe_unsafe_usage
+
+      pMbIndexX += kiMbWidth;
+      ++ i;
+    }
+  }
+
+  // pMbIndexY: MB row i of every layer is filled with the value i
+  memset (pRowY, 0, iRowSize);
+  iMaxMbHeight = sMbSizeMap[kiNumSpatialLayers - 1].iCountMbNum / sMbSizeMap[kiNumSpatialLayers - 1].iMbWidth;
+  i = 0;
+  for (;;) {
+    ENFORCE_STACK_ALIGN_1D (int16_t, t, 4, 16)
+
+    int32_t t32 = 0;
+    int16_t j = 0;
+
+    // pRowY currently holds the value i in every entry; copy it into row i of the layers
+    // which have such a row
+    for (iSpatialIdx = kiNumSpatialLayers - 1; iSpatialIdx >= 0; -- iSpatialIdx) {
+      const int32_t kiMbWidth = sMbSizeMap[iSpatialIdx].iMbWidth;
+      const int32_t kiMbHeight = sMbSizeMap[iSpatialIdx].iCountMbNum / kiMbWidth;
+      const int32_t kiLineSize = kiMbWidth * sizeof (int16_t);
+      int16_t* pMbIndexY = pPtr->pMbIndexY[iSpatialIdx] + i * kiMbWidth;
+
+      if (i < kiMbHeight) {
+        memcpy (pMbIndexY, pRowY, kiLineSize);	// confirmed_safe_unsafe_usage
+      }
+    }
+    ++ i;
+    if (i >= iMaxMbHeight)
+      break;
+
+    // refill pRowY with the next row number: t holds four copies of i and is stored
+    // four int16_t at a time
+    t32 = i | (i << 16);
+    ST32 (t  , t32);
+    ST32 (t + 2, t32);
+
+    p = pRowY;
+    while (j < iMaxMbWidth) {
+      ST64 (p, LD64 (t));
+
+      p += 4;
+      j += 4;
+    }
+  }
+
+  pMa->WelsFree (pTmpRow, "pTmpRow");
+  pTmpRow = NULL;
+
+  return 0;
+}
+
+/*!
+ * \brief	request specific memory for SVC
+ * \pParam	pEncCtx		sWelsEncCtx*
+ * \return	successful - 0; otherwise none 0 for failed
+ */
+int32_t RequestMemorySvc (sWelsEncCtx** ppCtx) {
+  SWelsSvcCodingParam* pParam	= (*ppCtx)->pSvcParam;
+  CMemoryAlign* pMa				= (*ppCtx)->pMemAlign;
+  SDLayerParam* pFinalSpatial	= NULL;
+  int32_t iCountBsLen			= 0;
+  int32_t iCountNals				= 0;
+  int32_t iMaxPicWidth			= 0;
+  int32_t iMaxPicHeight			= 0;
+  int32_t iCountMaxMbNum		= 0;
+  int32_t iIndex					= 0;
+  int32_t iCountLayers			= 0;
+  int32_t iResult					= 0;
+  float	fCompressRatioThr		= .5f;
+  const int32_t kiNumDependencyLayers	= pParam->iNumDependencyLayer;
+  const uint32_t kuiMvdInterTableSize	= (kiNumDependencyLayers == 1 ? (1 + (648 << 1)) : (1 + (972 << 1)));
+  const uint32_t kuiMvdCacheAlginedSize	= kuiMvdInterTableSize * sizeof (uint16_t);
+  int32_t iVclLayersBsSizeCount		= 0;
+  int32_t iNonVclLayersBsSizeCount	= 0;
+#if defined(MT_ENABLED)
+  int32_t iTargetSpatialBsSize			= 0;
+#endif//MT_ENABLED
+
+  if (kiNumDependencyLayers < 1 || kiNumDependencyLayers > MAX_DEPENDENCY_LAYER) {
+    WelsLog (*ppCtx, WELS_LOG_WARNING, "RequestMemorySvc() failed due to invalid iNumDependencyLayers(%d)!\n",
+             kiNumDependencyLayers);
+    FreeMemorySvc (ppCtx);
+    return 1;
+  }
+
+  if (pParam->uiGopSize == 0 || (pParam->uiIntraPeriod && ((pParam->uiIntraPeriod % pParam->uiGopSize) != 0))) {
+    WelsLog (*ppCtx, WELS_LOG_WARNING,
+             "RequestMemorySvc() failed due to invalid uiIntraPeriod(%d) (=multipler of uiGopSize(%d)!",
+             pParam->uiIntraPeriod, pParam->uiGopSize);
+    FreeMemorySvc (ppCtx);
+    return 1;
+  }
+
+  pFinalSpatial	= &pParam->sDependencyLayers[kiNumDependencyLayers - 1];
+  iMaxPicWidth	= pFinalSpatial->iFrameWidth;
+  iMaxPicHeight	= pFinalSpatial->iFrameHeight;
+  iCountMaxMbNum = ((15 + iMaxPicWidth) >> 4) * ((15 + iMaxPicHeight) >> 4);
+
+  iResult = AcquireLayersNals (ppCtx, pParam, &iCountLayers, &iCountNals);
+  if (iResult) {
+    WelsLog (*ppCtx, WELS_LOG_WARNING, "RequestMemorySvc(), AcquireLayersNals failed(%d)!", iResult);
+    FreeMemorySvc (ppCtx);
+    return 1;
+  }
+
+  iNonVclLayersBsSizeCount = SSEI_BUFFER_SIZE + pParam->iNumDependencyLayer * SPS_BUFFER_SIZE +
+                             (1 + pParam->iNumDependencyLayer) * PPS_BUFFER_SIZE;
+
+  int32_t iLayerBsSize = 0;
+  iIndex = 0;
+  while (iIndex < pParam->iNumDependencyLayer) {
+    SDLayerParam* fDlp = &pParam->sDependencyLayers[iIndex];
+
+    fCompressRatioThr	= COMPRESS_RATIO_DECIDED_BY_RESOLUTION (fDlp->iFrameWidth, fDlp->iFrameHeight);
+
+    iLayerBsSize = WELS_ROUND (((3 * fDlp->iFrameWidth * fDlp->iFrameHeight) >> 1) * fCompressRatioThr);
+    iLayerBsSize	= WELS_ALIGN (iLayerBsSize, 4);			// 4 bytes alinged
+    iVclLayersBsSizeCount += iLayerBsSize;
+    ++ iIndex;
+  }
+#if defined(MT_ENABLED)
+  iTargetSpatialBsSize = iLayerBsSize;
+#endif//MT_ENABLED
+  iCountBsLen = iNonVclLayersBsSizeCount + iVclLayersBsSizeCount;
+
+  pParam->iNumRefFrame	= WELS_CLIP3 (pParam->iNumRefFrame, MIN_REF_PIC_COUNT, MAX_REFERENCE_PICTURE_COUNT_NUM);
+
+  // Output
+  (*ppCtx)->pOut = (SWelsEncoderOutput*)pMa->WelsMalloc (sizeof (SWelsEncoderOutput), "SWelsEncoderOutput");
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pOut), FreeMemorySvc (ppCtx))
+  (*ppCtx)->pOut->pBsBuffer		= (uint8_t*)pMa->WelsMalloc (iCountBsLen, "pOut->pBsBuffer");
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pOut->pBsBuffer), FreeMemorySvc (ppCtx))
+  (*ppCtx)->pOut->uiSize			= iCountBsLen;
+  (*ppCtx)->pOut->sNalList		= (SWelsNalRaw*)pMa->WelsMalloc (iCountNals * sizeof (SWelsNalRaw), "pOut->sNalList");
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pOut->sNalList), FreeMemorySvc (ppCtx))
+  (*ppCtx)->pOut->iCountNals		= iCountNals;
+  (*ppCtx)->pOut->iNalIndex		= 0;
+
+#ifdef MT_ENABLED
+  if (pParam->iMultipleThreadIdc > 1) {
+    (*ppCtx)->pFrameBs			= (uint8_t*)pMa->WelsMalloc (iCountBsLen + (iTargetSpatialBsSize * ((*ppCtx)->iMaxSliceCount - 1)),
+                              "pFrameBs");
+    WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pFrameBs), FreeMemorySvc (ppCtx))
+    (*ppCtx)->iFrameBsSize		= iCountBsLen * (*ppCtx)->iMaxSliceCount;
+  } else
+#endif//MT_ENABLED
+  {
+    (*ppCtx)->pFrameBs			= (uint8_t*)pMa->WelsMalloc (iCountBsLen, "pFrameBs");
+    WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pFrameBs), FreeMemorySvc (ppCtx))
+    (*ppCtx)->iFrameBsSize		= iCountBsLen;
+  }
+  (*ppCtx)->iPosBsBuffer		= 0;
+
+#ifdef MT_ENABLED
+  // for pSlice bs buffers
+  if (pParam->iMultipleThreadIdc > 1 && RequestMtResource (ppCtx, pParam, iCountBsLen, iTargetSpatialBsSize)) {
+    WelsLog (*ppCtx, WELS_LOG_WARNING, "RequestMemorySvc(), RequestMtResource failed!");
+    FreeMemorySvc (ppCtx);
+    return 1;
+  }
+#endif
+
+  (*ppCtx)->pIntra4x4PredModeBlocks = static_cast<int8_t*>
+                                      (pMa->WelsMallocz (iCountMaxMbNum * INTRA_4x4_MODE_NUM, "pIntra4x4PredModeBlocks"));
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pIntra4x4PredModeBlocks), FreeMemorySvc (ppCtx))
+
+  (*ppCtx)->pNonZeroCountBlocks = static_cast<int8_t*>
+                                  (pMa->WelsMallocz (iCountMaxMbNum * MB_LUMA_CHROMA_BLOCK4x4_NUM, "pNonZeroCountBlocks"));
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pNonZeroCountBlocks), FreeMemorySvc (ppCtx))
+
+  (*ppCtx)->pMvUnitBlock4x4 = static_cast<SMVUnitXY*>
+                              (pMa->WelsMallocz (iCountMaxMbNum * 2 * MB_BLOCK4x4_NUM * sizeof (SMVUnitXY), "pMvUnitBlock4x4"));
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pMvUnitBlock4x4), FreeMemorySvc (ppCtx))
+
+  (*ppCtx)->pRefIndexBlock4x4 = static_cast<int8_t*>
+                                (pMa->WelsMallocz (iCountMaxMbNum * 2 * MB_BLOCK8x8_NUM * sizeof (int8_t), "pRefIndexBlock4x4"));
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pRefIndexBlock4x4), FreeMemorySvc (ppCtx))
+
+  (*ppCtx)->pSadCostMb	= static_cast<int32_t*>
+                          (pMa->WelsMallocz (iCountMaxMbNum * sizeof (int32_t), "pSadCostMb"));
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pSadCostMb), FreeMemorySvc (ppCtx))
+
+  (*ppCtx)->bEncCurFrmAsIdrFlag = true;  // make sure first frame is IDR
+  (*ppCtx)->iGlobalQp				= 26;	// global qp in default
+
+  (*ppCtx)->pLtr = (SLTRState*)pMa->WelsMalloc (kiNumDependencyLayers * sizeof (SLTRState), "SLTRState");
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pLtr), FreeMemorySvc (ppCtx))
+  int32_t i = 0;
+  for (i = 0; i < kiNumDependencyLayers; i++) {
+    ResetLtrState (& (*ppCtx)->pLtr[i]);
+  }
+
+  (*ppCtx)->ppRefPicListExt	= (SRefList**)pMa->WelsMalloc (kiNumDependencyLayers * sizeof (SRefList*), "ppRefPicListExt");
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->ppRefPicListExt), FreeMemorySvc (ppCtx))
+
+  // pSlice context list
+  (*ppCtx)->pSliceCtxList	= (SSliceCtx*)pMa->WelsMallocz (kiNumDependencyLayers * sizeof (SSliceCtx), "pSliceCtxList");
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pSliceCtxList), FreeMemorySvc (ppCtx))
+
+  (*ppCtx)->ppDqLayerList	= (SDqLayer**)pMa->WelsMalloc (kiNumDependencyLayers * sizeof (SDqLayer*), "ppDqLayerList");
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->ppDqLayerList), FreeMemorySvc (ppCtx))
+
+  // stride tables
+  if (AllocStrideTables (ppCtx, kiNumDependencyLayers)) {
+    WelsLog (*ppCtx, WELS_LOG_WARNING, "RequestMemorySvc(), AllocStrideTables failed!");
+    FreeMemorySvc (ppCtx);
+    return 1;
+  }
+
+  //Rate control module memory allocation
+  // only malloc once for RC pData, 12/14/2009
+  (*ppCtx)->pWelsSvcRc = (SWelsSvcRc*)pMa->WelsMallocz (kiNumDependencyLayers * sizeof (SWelsSvcRc), "pWelsSvcRc");
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pWelsSvcRc), FreeMemorySvc (ppCtx))
+  //End of Rate control module memory allocation
+
+  //pVaa memory allocation
+  (*ppCtx)->pVaa	= (SVAAFrameInfo*)pMa->WelsMallocz (sizeof (SVAAFrameInfo), "pVaa");
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pVaa), FreeMemorySvc (ppCtx))
+
+  if ((*ppCtx)->pSvcParam->bEnableAdaptiveQuant) { //malloc mem
+    (*ppCtx)->pVaa->sAdaptiveQuantParam.pMotionTextureUnit   = static_cast<SMotionTextureUnit*>
+        (pMa->WelsMallocz (iCountMaxMbNum * sizeof (SMotionTextureUnit), "pVaa->sAdaptiveQuantParam.pMotionTextureUnit"));
+    WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pVaa->sAdaptiveQuantParam.pMotionTextureUnit), FreeMemorySvc (ppCtx))
+    (*ppCtx)->pVaa->sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp   = static_cast<int8_t*>
+        (pMa->WelsMallocz (iCountMaxMbNum * sizeof (int8_t), "pVaa->sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp"));
+    WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pVaa->sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp),
+                                FreeMemorySvc (ppCtx))
+  }
+
+  (*ppCtx)->pVaa->pVaaBackgroundMbFlag = (int8_t*)pMa->WelsMallocz (iCountMaxMbNum * sizeof (int8_t),
+                                         "pVaa->vaa_skip_mb_flag");
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pVaa->pVaaBackgroundMbFlag), FreeMemorySvc (ppCtx))
+
+  (*ppCtx)->pVaa->sVaaCalcInfo.pSad8x8 = static_cast<int32_t (*)[4]>
+                                         (pMa->WelsMallocz (iCountMaxMbNum * 4 * sizeof (int32_t), "pVaa->sVaaCalcInfo.sad8x8"));
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pVaa->sVaaCalcInfo.pSad8x8), FreeMemorySvc (ppCtx))
+  (*ppCtx)->pVaa->sVaaCalcInfo.pSsd16x16 = static_cast<int32_t*>
+      (pMa->WelsMallocz (iCountMaxMbNum * sizeof (int32_t), "pVaa->sVaaCalcInfo.pSsd16x16"));
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pVaa->sVaaCalcInfo.pSsd16x16), FreeMemorySvc (ppCtx))
+  (*ppCtx)->pVaa->sVaaCalcInfo.pSum16x16 = static_cast<int32_t*>
+      (pMa->WelsMallocz (iCountMaxMbNum * sizeof (int32_t), "pVaa->sVaaCalcInfo.pSum16x16"));
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pVaa->sVaaCalcInfo.pSum16x16), FreeMemorySvc (ppCtx))
+  (*ppCtx)->pVaa->sVaaCalcInfo.pSumOfSquare16x16 = static_cast<int32_t*>
+      (pMa->WelsMallocz (iCountMaxMbNum * sizeof (int32_t), "pVaa->sVaaCalcInfo.pSumOfSquare16x16"));
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pVaa->sVaaCalcInfo.pSumOfSquare16x16), FreeMemorySvc (ppCtx))
+
+  if ((*ppCtx)->pSvcParam->bEnableBackgroundDetection) { //BGD control
+    (*ppCtx)->pVaa->sVaaCalcInfo.pSumOfDiff8x8 = static_cast<int32_t (*)[4]>
+        (pMa->WelsMallocz (iCountMaxMbNum * 4 * sizeof (int32_t), "pVaa->sVaaCalcInfo.sd_16x16"));
+    WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pVaa->sVaaCalcInfo.pSumOfDiff8x8), FreeMemorySvc (ppCtx))
+    (*ppCtx)->pVaa->sVaaCalcInfo.pMad8x8 = static_cast<uint8_t (*)[4]>
+                                           (pMa->WelsMallocz (iCountMaxMbNum * 4 * sizeof (uint8_t), "pVaa->sVaaCalcInfo.mad_16x16"));
+    WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pVaa->sVaaCalcInfo.pMad8x8), FreeMemorySvc (ppCtx))
+  }
+
+  //End of pVaa memory allocation
+
+  iResult = InitDqLayers (ppCtx);
+  if (iResult) {
+    WelsLog (*ppCtx, WELS_LOG_WARNING, "RequestMemorySvc(), InitDqLayers failed(%d)!", iResult);
+    FreeMemorySvc (ppCtx);
+    return iResult;
+  }
+
+  if (InitMbListD (ppCtx)) {
+    WelsLog (*ppCtx, WELS_LOG_WARNING, "RequestMemorySvc(), InitMbListD failed!");
+    FreeMemorySvc (ppCtx);
+    return 1;
+  }
+
+  (*ppCtx)->pMvdCostTableInter = (uint16_t*)pMa->WelsMallocz (52 * kuiMvdCacheAlginedSize, "pMvdCostTableInter");
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pMvdCostTableInter), FreeMemorySvc (ppCtx))
+  MvdCostInit ((*ppCtx)->pMvdCostTableInter, kuiMvdInterTableSize);  //should put to a better place?
+
+  if ((*ppCtx)->ppRefPicListExt[0] != NULL && (*ppCtx)->ppRefPicListExt[0]->pRef[0] != NULL)
+    (*ppCtx)->pDecPic				= (*ppCtx)->ppRefPicListExt[0]->pRef[0];
+  else
+    (*ppCtx)->pDecPic				= NULL;	// error here
+
+  (*ppCtx)->pSps				= & (*ppCtx)->pSpsArray[0];
+  (*ppCtx)->pPps				= & (*ppCtx)->pPPSArray[0];
+
+  return 0;
+}
+
+
+/*!
+ * \brief	release all memory owned by the SVC core encoder context, then the context itself
+ * \pParam	ppCtx		sWelsEncCtx**; nothing is done when *ppCtx is NULL, otherwise *ppCtx is NULL on return
+ * \return	none
+ */
+void FreeMemorySvc (sWelsEncCtx** ppCtx) {
+  if (NULL != *ppCtx) {	// NOTE: ppCtx itself is dereferenced without a NULL check
+    sWelsEncCtx* pCtx	= *ppCtx;
+    CMemoryAlign* pMa			= pCtx->pMemAlign;	// allocator used by every WelsFree() call below
+    SWelsSvcCodingParam* pParam = pCtx->pSvcParam;
+    int32_t ilayer				= 0;
+
+    // SStrideTables: entry [0][1] is freed first, then the table structure itself
+    if (NULL != pCtx->pStrideTab) {
+      if (NULL != pCtx->pStrideTab->pStrideDecBlockOffset[0][1]) {
+        pMa->WelsFree (pCtx->pStrideTab->pStrideDecBlockOffset[0][1], "pBase");
+        pCtx->pStrideTab->pStrideDecBlockOffset[0][1] = NULL;
+      }
+      pMa->WelsFree (pCtx->pStrideTab, "SStrideTables");
+      pCtx->pStrideTab = NULL;
+    }
+    // pDq idc map
+    if (NULL != pCtx->pDqIdcMap) {
+      pMa->WelsFree (pCtx->pDqIdcMap, "pDqIdcMap");
+      pCtx->pDqIdcMap = NULL;
+    }
+
+    if (NULL != pCtx->pOut) {
+      // output bitstream buffer
+      if (NULL != pCtx->pOut->pBsBuffer) {
+        pMa->WelsFree (pCtx->pOut->pBsBuffer, "pOut->pBsBuffer");
+        pCtx->pOut->pBsBuffer = NULL;
+      }
+      // NALs list
+      if (NULL != pCtx->pOut->sNalList) {
+        pMa->WelsFree (pCtx->pOut->sNalList, "pOut->sNalList");
+        pCtx->pOut->sNalList = NULL;
+      }
+      pMa->WelsFree (pCtx->pOut, "SWelsEncoderOutput");
+      pCtx->pOut = NULL;
+    }
+
+#ifdef MT_ENABLED
+    if (pParam != NULL && pParam->iMultipleThreadIdc > 1)	// MT resources are released only for multi-thread settings
+      ReleaseMtResource (ppCtx);
+#endif//MT_ENABLED
+
+    // frame bitstream buffer
+    if (NULL != pCtx->pFrameBs) {
+      pMa->WelsFree (pCtx->pFrameBs, "pFrameBs");
+      pCtx->pFrameBs = NULL;
+    }
+
+    // pSpsArray
+    if (NULL != pCtx->pSpsArray) {
+      pMa->WelsFree (pCtx->pSpsArray, "pSpsArray");
+      pCtx->pSpsArray = NULL;
+    }
+    // pPPSArray
+    if (NULL != pCtx->pPPSArray) {
+      pMa->WelsFree (pCtx->pPPSArray, "pPPSArray");
+      pCtx->pPPSArray = NULL;
+    }
+    // pSubsetArray
+    if (NULL != pCtx->pSubsetArray) {
+      pMa->WelsFree (pCtx->pSubsetArray, "pSubsetArray");
+      pCtx->pSubsetArray = NULL;
+    }
+
+    if (NULL != pCtx->pIntra4x4PredModeBlocks) {
+      pMa->WelsFree (pCtx->pIntra4x4PredModeBlocks, "pIntra4x4PredModeBlocks");
+      pCtx->pIntra4x4PredModeBlocks = NULL;
+    }
+
+    if (NULL != pCtx->pNonZeroCountBlocks) {
+      pMa->WelsFree (pCtx->pNonZeroCountBlocks, "pNonZeroCountBlocks");
+      pCtx->pNonZeroCountBlocks = NULL;
+    }
+
+    if (NULL != pCtx->pMvUnitBlock4x4) {
+      pMa->WelsFree (pCtx->pMvUnitBlock4x4, "pMvUnitBlock4x4");
+      pCtx->pMvUnitBlock4x4	= NULL;
+    }
+
+    if (NULL != pCtx->pRefIndexBlock4x4) {
+      pMa->WelsFree (pCtx->pRefIndexBlock4x4, "pRefIndexBlock4x4");
+      pCtx->pRefIndexBlock4x4	= NULL;
+    }
+
+    if (NULL != pCtx->ppMbListD) {	// only entry [0] is freed before the pointer array itself
+      if (NULL != pCtx->ppMbListD[0]) {
+        pMa->WelsFree (pCtx->ppMbListD[0], "ppMbListD[0]");
+        (*ppCtx)->ppMbListD[0] = NULL;
+      }
+      pMa->WelsFree (pCtx->ppMbListD, "ppMbListD");
+      pCtx->ppMbListD = NULL;
+    }
+
+    if (NULL != pCtx->pSadCostMb) {
+      pMa->WelsFree (pCtx->pSadCostMb, "pSadCostMb");
+      pCtx->pSadCostMb = NULL;
+    }
+
+    // SLTRState
+    if (NULL != pCtx->pLtr) {
+      pMa->WelsFree (pCtx->pLtr, "SLTRState");
+      pCtx->pLtr = NULL;
+    }
+
+    // pDq layers list: one SDqLayer per dependency layer, each owning its slices
+    ilayer = 0;
+    if (NULL != pCtx->ppDqLayerList && pParam != NULL) {
+      while (ilayer < pParam->iNumDependencyLayer) {
+        SDqLayer* pDq	= pCtx->ppDqLayerList[ilayer];
+        SDLayerParam* pDlp = &pCtx->pSvcParam->sDependencyLayers[ilayer];
+        const BOOL_T kbIsDynamicSlicing = (SM_DYN_SLICE == pDlp->sMso.uiSliceMode);
+
+        // pDq layers
+        if (NULL != pDq) {
+          if (NULL != pDq->sLayerInfo.pSliceInLayer) {
+            int32_t iSliceIdx = 0;
+            int32_t iSliceNum = GetInitialSliceNum (pDq->iMbWidth, pDq->iMbHeight, &pDlp->sMso);	// slice count is recomputed, not stored
+            if (iSliceNum < 1)	// at least one slice's MB cache is always released
+              iSliceNum = 1;
+            while (iSliceIdx < iSliceNum) {
+              SSlice* pSlice = &pDq->sLayerInfo.pSliceInLayer[iSliceIdx];
+              FreeMbCache (&pSlice->sMbCacheInfo, pMa);
+              ++ iSliceIdx;
+            }
+            pMa->WelsFree (pDq->sLayerInfo.pSliceInLayer, "pSliceInLayer");
+            pDq->sLayerInfo.pSliceInLayer = NULL;
+          }
+          if (kbIsDynamicSlicing) {	// partition bookkeeping is freed only in SM_DYN_SLICE mode; pointers are not NULL-checked here
+            pMa->WelsFree (pDq->pNumSliceCodedOfPartition, "pNumSliceCodedOfPartition");
+            pDq->pNumSliceCodedOfPartition	= NULL;
+            pMa->WelsFree (pDq->pLastCodedMbIdxOfPartition, "pLastCodedMbIdxOfPartition");
+            pDq->pLastCodedMbIdxOfPartition	= NULL;
+            pMa->WelsFree (pDq->pLastMbIdxOfPartition, "pLastMbIdxOfPartition");
+            pDq->pLastMbIdxOfPartition = NULL;
+          }
+
+          pMa->WelsFree (pDq, "pDq");
+          pDq = NULL;	// local copy only; the list entry is cleared on the next line
+          pCtx->ppDqLayerList[ilayer] = NULL;
+        }
+        ++ ilayer;
+      }
+      pMa->WelsFree (pCtx->ppDqLayerList, "ppDqLayerList");
+      pCtx->ppDqLayerList = NULL;
+    }
+    FreeSpatialPictures (pCtx);
+
+    // reference picture list extension
+    if (NULL != pCtx->ppRefPicListExt && pParam != NULL) {
+      ilayer = 0;
+      while (ilayer < pParam->iNumDependencyLayer) {
+        SRefList* pRefList		= pCtx->ppRefPicListExt[ilayer];
+        if (NULL != pRefList) {
+          int32_t iRef = 0;
+          do {
+            if (pRefList->pRef[iRef] != NULL) {
+              FreePicture (pMa, &pRefList->pRef[iRef]);
+            }
+            ++ iRef;
+          } while (iRef < 1 + pParam->iNumRefFrame);	// visits pRef[0 .. iNumRefFrame]
+
+          pMa->WelsFree (pCtx->ppRefPicListExt[ilayer], "ppRefPicListExt[]");
+          pCtx->ppRefPicListExt[ilayer] = NULL;
+        }
+        ++ ilayer;
+      }
+
+      pMa->WelsFree (pCtx->ppRefPicListExt, "ppRefPicListExt");
+      pCtx->ppRefPicListExt = NULL;
+    }
+
+    // pSlice context list
+    if (NULL != pCtx->pSliceCtxList && pParam != NULL) {
+      ilayer = 0;
+      while (ilayer < pParam->iNumDependencyLayer) {
+        SSliceCtx* pSliceCtx	= &pCtx->pSliceCtxList[ilayer];
+        if (NULL != pSliceCtx)	// address of an array element
+          UninitSlicePEncCtx (pSliceCtx, pMa);
+        ++ ilayer;
+      }
+      pMa->WelsFree (pCtx->pSliceCtxList, "pSliceCtxList");
+      pCtx->pSliceCtxList = NULL;
+    }
+
+    // VAA
+    if (NULL != pCtx->pVaa) {
+      if (pCtx->pSvcParam->bEnableAdaptiveQuant) { // AQ buffers; NOTE(review): pSvcParam is not NULL-checked here
+        pMa->WelsFree (pCtx->pVaa->sAdaptiveQuantParam.pMotionTextureUnit, "pVaa->sAdaptiveQuantParam.pMotionTextureUnit");
+        pCtx->pVaa->sAdaptiveQuantParam.pMotionTextureUnit = NULL;
+        pMa->WelsFree (pCtx->pVaa->sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp,
+                       "pVaa->sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp");
+        pCtx->pVaa->sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp = NULL;
+      }
+
+      pMa->WelsFree (pCtx->pVaa->pVaaBackgroundMbFlag, "pVaa->pVaaBackgroundMbFlag");
+      pCtx->pVaa->pVaaBackgroundMbFlag	= NULL;
+      pMa->WelsFree (pCtx->pVaa->sVaaCalcInfo.pSad8x8, "pVaa->sVaaCalcInfo.sad8x8");
+      pCtx->pVaa->sVaaCalcInfo.pSad8x8		= NULL;
+      pMa->WelsFree (pCtx->pVaa->sVaaCalcInfo.pSsd16x16, "pVaa->sVaaCalcInfo.pSsd16x16");
+      pCtx->pVaa->sVaaCalcInfo.pSsd16x16	= NULL;
+      pMa->WelsFree (pCtx->pVaa->sVaaCalcInfo.pSum16x16, "pVaa->sVaaCalcInfo.pSum16x16");
+      pCtx->pVaa->sVaaCalcInfo.pSum16x16	= NULL;
+      pMa->WelsFree (pCtx->pVaa->sVaaCalcInfo.pSumOfSquare16x16, "pVaa->sVaaCalcInfo.pSumOfSquare16x16");
+      pCtx->pVaa->sVaaCalcInfo.pSumOfSquare16x16		= NULL;
+
+      if (pCtx->pSvcParam->bEnableBackgroundDetection) { // buffers that exist only with background detection (BGD) enabled
+        pMa->WelsFree (pCtx->pVaa->sVaaCalcInfo.pSumOfDiff8x8, "pVaa->sVaaCalcInfo.pSumOfDiff8x8");
+        pCtx->pVaa->sVaaCalcInfo.pSumOfDiff8x8	= NULL;
+        pMa->WelsFree (pCtx->pVaa->sVaaCalcInfo.pMad8x8, "pVaa->sVaaCalcInfo.pMad8x8");
+        pCtx->pVaa->sVaaCalcInfo.pMad8x8	= NULL;
+      }
+
+      pMa->WelsFree (pCtx->pVaa, "pVaa");
+      pCtx->pVaa = NULL;
+    }
+
+    WelsRcFreeMemory (pCtx);
+    // rate control module memory free
+    if (NULL != pCtx->pWelsSvcRc) {
+      pMa->WelsFree (pCtx->pWelsSvcRc, "pWelsSvcRc");
+      pCtx->pWelsSvcRc = NULL;
+    }
+
+    /* MVD cost tables for Inter */
+    if (NULL != pCtx->pMvdCostTableInter) {
+      pMa->WelsFree (pCtx->pMvdCostTableInter, "pMvdCostTableInter");
+      pCtx->pMvdCostTableInter = NULL;
+    }
+
+#ifdef ENABLE_TRACE_FILE
+    if (NULL != pCtx->pFileLog) {
+      fclose (pCtx->pFileLog);
+      pCtx->pFileLog	= NULL;
+    }
+    pCtx->uiSizeLog	= 0;
+#endif//ENABLE_TRACE_FILE
+
+    FreeCodingParam (&pCtx->pSvcParam, pMa);	// pParam (copy of pSvcParam) is not used after this call
+    if (NULL != pCtx->pFuncList) {
+      pMa->WelsFree (pCtx->pFuncList, "SWelsFuncPtrList");
+      pCtx->pFuncList = NULL;
+    }
+
+#if defined(MEMORY_MONITOR)
+    assert (pMa->WelsGetMemoryUsage() == 0);	// ensure all memory free well; NOTE(review): pMa is not NULL-checked here
+#endif//MEMORY_MONITOR
+
+    if ((*ppCtx)->pMemAlign != NULL) {	// the allocator is deleted last, after every WelsFree() above
+      WelsLog (NULL, WELS_LOG_INFO, "FreeMemorySvc(), verify memory usage (%d bytes) after free..\n",
+               (*ppCtx)->pMemAlign->WelsGetMemoryUsage());
+      delete (*ppCtx)->pMemAlign;
+      (*ppCtx)->pMemAlign = NULL;
+    }
+
+    free (*ppCtx);	// the context structure itself is released with plain free()
+    *ppCtx = NULL;
+  }
+}
+
+/*!
+ * \brief	derive per-layer slice numbers and the thread settings of pCodingParam (both updated in place)
+ * \pParam	kiCpuCores		cpu core count; used as slice number for some slice modes in MT_ENABLED builds
+ * \pParam	pMaxSliceCount	output: running maximum of iSliceNum; a SM_DYN_SLICE layer resets it to AVERSLICENUM_CONSTRAINT
+ * \return	always 0
+ */
+int32_t InitSliceSettings (SWelsSvcCodingParam* pCodingParam, const int32_t kiCpuCores, int16_t* pMaxSliceCount) {
+  int32_t iSpatialIdx = 0, iSpatialNum = pCodingParam->iNumDependencyLayer;
+  int16_t iMaxSliceCount = 0;
+
+  do {	// do/while: the first dependency layer is always processed
+    SDLayerParam* pDlp				= &pCodingParam->sDependencyLayers[iSpatialIdx];
+    SMulSliceOption* pMso			= &pDlp->sMso;
+    SSliceArgument* pSlcArg			= &pMso->sSliceArgument;
+    const int32_t kiMbWidth			= (pDlp->iFrameWidth + 15) >> 4;	// frame size in 16x16 macroblocks, rounded up
+    const int32_t kiMbHeight			= (pDlp->iFrameHeight + 15) >> 4;
+    const int32_t kiMbNumInFrame	= kiMbWidth * kiMbHeight;
+#if defined(MT_ENABLED)
+#if defined(DYNAMIC_SLICE_ASSIGN)
+    int32_t iSliceNum				= (SM_FIXEDSLCNUM_SLICE == pMso->uiSliceMode
+                               || SM_DYN_SLICE == pMso->uiSliceMode) ? kiCpuCores :
+                              pSlcArg->iSliceNum; // iSliceNum per input has been validated at ParamValidationExt()
+#else//!DYNAMIC_SLICE_ASSIGN
+    int32_t iSliceNum				= (SM_DYN_SLICE == pMso->uiSliceMode) ? kiCpuCores :
+                              pSlcArg->iSliceNum; // same member as the other branches; validated at ParamValidationExt()
+#endif//DYNAMIC_SLICE_ASSIGN
+#else//!MT_ENABLED
+    int16_t iSliceNum				= pSlcArg->iSliceNum; // iSliceNum per input has been validated at ParamValidationExt()
+#endif//MT_ENABLED
+
+    // NOTE: Per design, in case MT/DYNAMIC_SLICE_ASSIGN enabled, for SM_FIXEDSLCNUM_SLICE mode,
+    // iSliceNum of current spatial layer settings equals to kiCpuCores number; SM_DYN_SLICE mode,
+    // iSliceNum initials as kiCpuCores also, stay tuned dynamically slicing in future
+    pSlcArg->iSliceNum	= iSliceNum;	// used fixed one
+
+    switch (pMso->uiSliceMode) {
+    case SM_DYN_SLICE:
+      iMaxSliceCount	= AVERSLICENUM_CONSTRAINT;
+      // the "#ifndef MT_ENABLED" guard once placed around this break is disabled, so the break is
+      // unconditional: SM_DYN_SLICE never falls through into the SM_FIXEDSLCNUM_SLICE handling
+      break;
+    case SM_FIXEDSLCNUM_SLICE:
+      if (iSliceNum > iMaxSliceCount)
+        iMaxSliceCount = iSliceNum;
+      // need perform check due iSliceNum might change, although has been initialized somewhere outside
+      if (pCodingParam->bEnableRc) {
+        GomValidCheckSliceMbNum (kiMbWidth, kiMbHeight, pSlcArg);
+      } else {
+        CheckFixedSliceNumMultiSliceSetting (kiMbNumInFrame, pSlcArg);
+      }
+      break;
+    case SM_SINGLE_SLICE:	// these three modes only contribute to the running maximum
+    case SM_RASTER_SLICE:
+    case SM_ROWMB_SLICE:
+      if (iSliceNum > iMaxSliceCount)
+        iMaxSliceCount = iSliceNum;
+      break;
+    default:
+      break;
+    }
+
+    ++ iSpatialIdx;
+  } while (iSpatialIdx < iSpatialNum);
+
+#ifdef MT_ENABLED
+  pCodingParam->iCountThreadsNum				= WELS_MIN (kiCpuCores, iMaxSliceCount);	// never more threads than slices
+  pCodingParam->iMultipleThreadIdc	= pCodingParam->iCountThreadsNum;
+#else
+  pCodingParam->iMultipleThreadIdc	= 1;
+  pCodingParam->iCountThreadsNum				= 1;
+#endif//MT_ENABLED
+
+#ifndef WELS_TESTBED	// for product release and non-SGE testing
+
+  if (kiCpuCores < 2) {	// single CPU core, make no sense for MT parallelization
+    pCodingParam->iMultipleThreadIdc	= 1;
+    pCodingParam->iCountThreadsNum				= 1;
+  }
+#endif
+
+  *pMaxSliceCount					= iMaxSliceCount;
+
+  return 0;
+}
+
+/*!
+ * \brief	report detected cpu feature flags, logic processor count and cache line size (WelsLog; also stderr if _DEBUG)
+ */
+void OutputCpuFeaturesLog (uint32_t uiCpuFeatureFlags, uint32_t uiCpuCores, int32_t iCacheLineSize) {
+#define WELS_CPU_YN(kFlag)	((uiCpuFeatureFlags & (kFlag)) ? 'Y' : 'N')	// 'Y' when the feature bit is set
+  // trace log output (adjacent string literals are concatenated by the compiler)
+  WelsLog (NULL, WELS_LOG_INFO, "WELS CPU features/capacities (0x%x) detected: \t"
+           "HTT:      %c, "
+           "MMX:      %c, "
+           "MMXEX:    %c, "
+           "SSE:      %c, "
+           "SSE2:     %c, "
+           "SSE3:     %c, "
+           "SSSE3:    %c, "
+           "SSE4.1:   %c, "
+           "SSE4.2:   %c, "
+           "AVX:      %c, "
+           "FMA:      %c, "
+           "X87-FPU:  %c, "
+           "3DNOW:    %c, "
+           "3DNOWEX:  %c, "
+           "ALTIVEC:  %c, "
+           "CMOV:     %c, "
+           "MOVBE:    %c, "
+           "AES:      %c, "
+           "NUMBER OF LOGIC PROCESSORS ON CHIP: %d, "
+           "CPU CACHE LINE SIZE (BYTES):        %d\n",
+           uiCpuFeatureFlags,
+           WELS_CPU_YN (WELS_CPU_HTT),
+           WELS_CPU_YN (WELS_CPU_MMX),
+           WELS_CPU_YN (WELS_CPU_MMXEXT),
+           WELS_CPU_YN (WELS_CPU_SSE),
+           WELS_CPU_YN (WELS_CPU_SSE2),
+           WELS_CPU_YN (WELS_CPU_SSE3),
+           WELS_CPU_YN (WELS_CPU_SSSE3),
+           WELS_CPU_YN (WELS_CPU_SSE41),
+           WELS_CPU_YN (WELS_CPU_SSE42),
+           WELS_CPU_YN (WELS_CPU_AVX),
+           WELS_CPU_YN (WELS_CPU_FMA),
+           WELS_CPU_YN (WELS_CPU_FPU),
+           WELS_CPU_YN (WELS_CPU_3DNOW),
+           WELS_CPU_YN (WELS_CPU_3DNOWEXT),
+           WELS_CPU_YN (WELS_CPU_ALTIVEC),
+           WELS_CPU_YN (WELS_CPU_CMOV),
+           WELS_CPU_YN (WELS_CPU_MOVBE),
+           WELS_CPU_YN (WELS_CPU_AES),
+           uiCpuCores, iCacheLineSize);
+
+#ifdef _DEBUG	// same report on the console for debug builds
+  fprintf (stderr, "WELS CPU features/capacities (0x%x) detected: \n"
+           "HTT:      %c, "
+           "MMX:      %c, "
+           "MMXEX:    %c, "
+           "SSE:      %c, "
+           "SSE2:     %c, "
+           "SSE3:     %c, "
+           "SSSE3:    %c, "
+           "SSE4.1:   %c, "
+           "SSE4.2:   %c, "
+           "AVX:      %c, "
+           "FMA:      %c, "
+           "X87-FPU:  %c, "
+           "3DNOW:    %c, "
+           "3DNOWEX:  %c, "
+           "ALTIVEC:  %c, "
+           "CMOV:     %c, "
+           "MOVBE:    %c, "
+           "AES:      %c, "
+           "NUMBER OF LOGIC PROCESSORS ON CHIP: %d, "
+           "CPU CACHE LINE SIZE (BYTES):        %d\n",
+           uiCpuFeatureFlags,
+           WELS_CPU_YN (WELS_CPU_HTT),
+           WELS_CPU_YN (WELS_CPU_MMX),
+           WELS_CPU_YN (WELS_CPU_MMXEXT),
+           WELS_CPU_YN (WELS_CPU_SSE),
+           WELS_CPU_YN (WELS_CPU_SSE2),
+           WELS_CPU_YN (WELS_CPU_SSE3),
+           WELS_CPU_YN (WELS_CPU_SSSE3),
+           WELS_CPU_YN (WELS_CPU_SSE41),
+           WELS_CPU_YN (WELS_CPU_SSE42),
+           WELS_CPU_YN (WELS_CPU_AVX),
+           WELS_CPU_YN (WELS_CPU_FMA),
+           WELS_CPU_YN (WELS_CPU_FPU),
+           WELS_CPU_YN (WELS_CPU_3DNOW),
+           WELS_CPU_YN (WELS_CPU_3DNOWEXT),
+           WELS_CPU_YN (WELS_CPU_ALTIVEC),
+           WELS_CPU_YN (WELS_CPU_CMOV),
+           WELS_CPU_YN (WELS_CPU_MOVBE),
+           WELS_CPU_YN (WELS_CPU_AES),
+           uiCpuCores, iCacheLineSize);
+#endif//_DEBUG
+#undef WELS_CPU_YN
+}
+
+/*!
+ * \brief	initialize Wels avc encoder core library
+ * \pParam	ppCtx			sWelsEncCtx**; receives the new context on success, left NULL when a failure occurs after validation
+ * \pParam	pCodingParam	SWelsSvcCodingParam*; validated, adjusted (slice/thread/temporal settings) and copied into the context
+ * \return	successful - 0; otherwise none 0 for failed
+ */
+int32_t WelsInitEncoderExt (sWelsEncCtx** ppCtx, SWelsSvcCodingParam* pCodingParam) {
+  sWelsEncCtx* pCtx		= NULL;
+  int32_t	iRet					= 0;
+  uint32_t uiCpuFeatureFlags		= 0;	// CPU features
+  int32_t uiCpuCores				= 1;	// number of logic processors on physical processor package, one means HTT not supported
+  int32_t iCacheLineSize			= 16;	// on chip cache line size in byte
+  int16_t iSliceNum				= 1;	// number of slices used
+
+  if (NULL == ppCtx || NULL == pCodingParam) {
+    WelsLog (NULL, WELS_LOG_ERROR, "WelsInitEncoderExt(), NULL == ppCtx(0x%p) or NULL == pCodingParam(0x%p).\n",
+             (void*)ppCtx, (void*)pCodingParam);
+    return 1;
+  }
+
+  iRet	=	ParamValidationExt (pCodingParam);
+  if (iRet != 0) {
+    WelsLog (NULL, WELS_LOG_ERROR, "WelsInitEncoderExt(), ParamValidationExt failed return %d.\n", iRet);
+    return iRet;
+  }
+
+  // for cpu features detection, Only detect once??
+#ifdef X86_ASM
+  uiCpuFeatureFlags	= WelsCPUFeatureDetect (&uiCpuCores);	// detect cpu capacity features
+  if (uiCpuFeatureFlags & WELS_CPU_CACHELINE_128)
+    iCacheLineSize = 128;
+  else if (uiCpuFeatureFlags & WELS_CPU_CACHELINE_64)
+    iCacheLineSize = 64;
+  else if (uiCpuFeatureFlags & WELS_CPU_CACHELINE_32)
+    iCacheLineSize	= 32;
+  else if (uiCpuFeatureFlags & WELS_CPU_CACHELINE_16)
+    iCacheLineSize	= 16;
+  OutputCpuFeaturesLog (uiCpuFeatureFlags, uiCpuCores, iCacheLineSize);
+#else
+  iCacheLineSize	= 16;	// 16 bytes aligned in default
+#endif//X86_ASM
+
+#ifndef WELS_TESTBED
+
+#if defined(MT_ENABLED) && defined(DYNAMIC_DETECT_CPU_CORES)
+  if (pCodingParam->iMultipleThreadIdc > 0)
+    uiCpuCores = pCodingParam->iMultipleThreadIdc;
+  else {
+    // cpuid not supported, use high level system API as followed to detect number of physical/logic processor
+    if (uiCpuFeatureFlags == 0)
+      uiCpuCores = DynamicDetectCpuCores();
+    // So far so many cpu cores up to MAX_THREADS_NUM mean for server platforms,
+    // for client application here it is constrained by maximal to MAX_THREADS_NUM
+    if (uiCpuCores > MAX_THREADS_NUM)	// MAX_THREADS_NUM
+      uiCpuCores	= MAX_THREADS_NUM;	// MAX_THREADS_NUM
+    else if (uiCpuCores < 1)	// just for safe
+      uiCpuCores	= 1;
+  }
+#endif//MT_ENABLED && DYNAMIC_DETECT_CPU_CORES
+
+#else//WELS_TESTBED
+
+  uiCpuCores	= pCodingParam->iMultipleThreadIdc;	// assigned uiCpuCores from iMultipleThreadIdc from SGE testing
+
+#endif//WELS_TESTBED
+
+  uiCpuCores	= WELS_CLIP3 (uiCpuCores, 1, MAX_THREADS_NUM);
+
+  if (InitSliceSettings (pCodingParam, uiCpuCores, &iSliceNum)) {
+    WelsLog (NULL, WELS_LOG_ERROR, "WelsInitEncoderExt(), InitSliceSettings failed.\n");
+    return 1;
+  }
+
+  *ppCtx	= NULL;	// the caller only sees a context once initialization has fully succeeded
+
+  pCtx	= static_cast<sWelsEncCtx*> (malloc (sizeof (sWelsEncCtx)));
+
+  WELS_VERIFY_RETURN_IF (1, (NULL == pCtx))
+  memset (pCtx, 0, sizeof (sWelsEncCtx));	// zeroed so that FreeMemorySvc() can test every member against NULL
+
+  pCtx->pMemAlign = new CMemoryAlign (iCacheLineSize);
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pCtx->pMemAlign), FreeMemorySvc (&pCtx))
+
+  // for logs
+#ifdef ENABLE_TRACE_FILE
+  if (wlog == WelsLogDefault) {
+    str_t fname[MAX_FNAME_LEN] = {0};
+
+#if defined (_MSC_VER)
+#if _MSC_VER>=1500
+    SNPRINTF (fname, MAX_FNAME_LEN, MAX_FNAME_LEN, "%swels_svc_encoder_trace.txt",
+              pCodingParam->sTracePath);		// confirmed_safe_unsafe_usage
+#else
+    SNPRINTF (fname, MAX_FNAME_LEN, "%swels_svc_encoder_trace.txt",
+              pCodingParam->sTracePath);		// confirmed_safe_unsafe_usage
+#endif//_MSC_VER>=1500
+#else
+    //GNUC/
+    SNPRINTF (fname,      MAX_FNAME_LEN,       "%swels_svc_encoder_trace.txt",
+              pCodingParam->sTracePath);		// confirmed_safe_unsafe_usage
+#endif//_MSC_VER
+
+
+#if defined(__GNUC__)
+    pCtx->pFileLog	= FOPEN (fname, "wt+");
+#else//WIN32
+#if defined(WIN32) && defined(_MSC_VER)
+#if _MSC_VER >= 1500
+    FOPEN (&pCtx->pFileLog, fname, "wt+");
+#else
+    pCtx->pFileLog	= FOPEN (fname, "wt+");
+#endif//_MSC_VER>=1500
+#endif//WIN32 && _MSC_VER
+#endif//__GNUC__
+    pCtx->uiSizeLog	= 0;
+  }
+#endif//ENABLE_TRACE_FILE
+
+  pCodingParam->DetermineTemporalSettings();
+  iRet = AllocCodingParam (&pCtx->pSvcParam, pCtx->pMemAlign, pCodingParam->iNumDependencyLayer);
+  if (iRet != 0) {
+    FreeMemorySvc (&pCtx);
+    return iRet;
+  }
+  memcpy (pCtx->pSvcParam, pCodingParam, sizeof (SWelsSvcCodingParam));	// confirmed_safe_unsafe_usage
+
+  pCtx->pFuncList = (SWelsFuncPtrList*)pCtx->pMemAlign->WelsMalloc (sizeof (SWelsFuncPtrList), "SWelsFuncPtrList");
+  if (NULL == pCtx->pFuncList) {
+    FreeMemorySvc (&pCtx);
+    return 1;
+  }
+  InitFunctionPointers (pCtx->pFuncList, pCtx->pSvcParam, uiCpuFeatureFlags);
+
+  pCtx->iActiveThreadsNum	= pCodingParam->iCountThreadsNum;
+  pCtx->iMaxSliceCount	= iSliceNum;
+  iRet = RequestMemorySvc (&pCtx);
+  if (iRet != 0) {
+    WelsLog (pCtx, WELS_LOG_ERROR, "WelsInitEncoderExt(), RequestMemorySvc failed return %d.\n", iRet);
+    FreeMemorySvc (&pCtx);
+    return iRet;
+  }
+
+#ifdef MT_ENABLED
+  // NOTE(review): the result of CreateSliceThreads() is stored in iRet but never checked before continuing
+  if (pCodingParam->iMultipleThreadIdc > 1)
+    iRet = CreateSliceThreads (pCtx);
+#endif
+
+  WelsRcInitModule (pCtx,  pCtx->pSvcParam->bEnableRc ? WELS_RC_GOM : WELS_RC_DISABLE);
+
+  pCtx->pVpp = new CWelsPreProcess ((void*)pCtx);
+  if (pCtx->pVpp == NULL) {
+    WelsLog (pCtx, WELS_LOG_ERROR, "WelsInitEncoderExt(), pOut of memory in case new CWelsPreProcess().\n");
+    FreeMemorySvc (&pCtx);
+    return 1;	// iRet can still be 0 here, which would report success while *ppCtx stays NULL
+  }
+
+#if defined(MEMORY_MONITOR)
+  WelsLog (pCtx, WELS_LOG_INFO, "WelsInitEncoderExt() exit, overall memory usage: %lu bytes\n",
+           sizeof (sWelsEncCtx) /* requested size from malloc() or new operator */
+           + pCtx->pMemAlign->WelsGetMemoryUsage()	/* requested size from CMemoryAlign::WelsMalloc() */
+          );
+#endif//MEMORY_MONITOR
+
+  *ppCtx	= pCtx;
+
+  WelsLog (pCtx, WELS_LOG_DEBUG, "WelsInitEncoderExt(), pCtx= 0x%p.\n", (void*)pCtx);
+
+  return 0;
+}
+/*
+ *
+ * status information output
+ */
+#if defined(STAT_OUTPUT)
+/*
+ * Print the overall encoding statistics stored in pCtx->sStatData.
+ *
+ * One report is produced per dependency layer (0 .. iNumDependencyLayer - 1). The quality
+ * layer index j is never incremented here, so only sStatData[i][0] is reported.
+ * The PSNR / bitrate / frame-rate line goes to stdout; the macroblock type distribution
+ * (only compiled when MB_TYPES_CHECK is defined) goes to stderr.
+ */
+void StatOverallEncodingExt (sWelsEncCtx* pCtx) {
+  int8_t i = 0;
+  int8_t j = 0;
+  for (i = 0; i < pCtx->pSvcParam->iNumDependencyLayer; i++) {
+    fprintf (stdout, "\nDependency layer : %d\n", i);
+    fprintf (stdout, "Quality layer : %d\n", j);
+    {
+      // total number of coded slices of all types (I + P + B) for this layer
+      const int32_t iCount = pCtx->sStatData[i][j].sSliceData.iSliceCount[I_SLICE] +
+                             pCtx->sStatData[i][j].sSliceData.iSliceCount[P_SLICE] +
+                             pCtx->sStatData[i][j].sSliceData.iSliceCount[B_SLICE];
+#if defined(MB_TYPES_CHECK)
+      if (iCount > 0) {
+        // MBs counted in I slices; judging by the format string below, index 7 is the "IBL" type
+        int32_t iCountNumIMb = pCtx->sStatData[i][j].sSliceData.iMbCount[I_SLICE][Intra4x4] +
+                               pCtx->sStatData[i][j].sSliceData.iMbCount[I_SLICE][Intra16x16] + pCtx->sStatData[i][j].sSliceData.iMbCount[I_SLICE][7];
+        // MBs counted in P slices; judging by the format string below, index 10 is "SUBP8x8"
+        int32_t iCountNumPMb	=	pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Intra4x4] +
+                                pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Intra16x16] +
+                                pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][7] +
+                                pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter16x16] +
+                                pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter16x8] +
+                                pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter8x16] +
+                                pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter8x8] +
+                                pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][10] +
+                                pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][PSkip];
+        // inter-coded (non-skip) MBs in P slices; denominator of the three "(PL0)" percentages
+        int32_t count_p_mbL0 = 	pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter16x16] +
+                                pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter16x8] +
+                                pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter8x16] +
+                                pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter8x8] +
+                                pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][10];
+
+        int32_t iMbCount = iCountNumIMb + iCountNumPMb;
+        if (iMbCount > 0) {
+          // Arguments are positional: keep them in the same order as the labels in the format string.
+          // NOTE(review): count_p_mbL0 is not checked against zero; the last three values are
+          // float divisions, so a zero count prints inf/nan instead of a percentage.
+          fprintf (stderr,
+                   "SVC: overall Slices	MBs: %d Avg\nI4x4: %.3f%% I16x16: %.3f%% IBL: %.3f%%\nP16x16: %.3f%% P16x8: %.3f%% P8x16: %.3f%% P8x8: %.3f%% SUBP8x8: %.3f%% PSKIP: %.3f%%\nILP(All): %.3f%% ILP(PL0): %.3f%% BLSKIP(PL0): %.3f%% RP(PL0): %.3f%%\n",
+                   iMbCount,
+                   (100.0f * (pCtx->sStatData[i][j].sSliceData.iMbCount[I_SLICE][Intra4x4] +
+                              pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Intra4x4]) / iMbCount),
+                   (100.0f * (pCtx->sStatData[i][j].sSliceData.iMbCount[I_SLICE][Intra16x16] +
+                              pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Intra16x16]) / iMbCount),
+                   (100.0f * (pCtx->sStatData[i][j].sSliceData.iMbCount[I_SLICE][7] +
+                              pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][7]) / iMbCount),
+                   (100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter16x16] / iMbCount),
+                   (100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter16x8] / iMbCount),
+                   (100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter8x16] / iMbCount),
+                   (100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter8x8] / iMbCount),
+                   (100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][10] / iMbCount),
+                   (100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][PSkip] / iMbCount),
+                   (100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][11] / iMbCount),
+                   (100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][11] / count_p_mbL0),
+                   (100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][8] / count_p_mbL0),
+                   (100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][9] / count_p_mbL0)
+                  );
+        }
+      }
+#endif //#if defined(MB_TYPES_CHECK)
+
+      if (iCount > 0) {
+        // PSNR values are the accumulated per-slice-type sums divided by the slice count.
+        // The kb/s value is frame rate * accumulated slice size / (iCount + skipped frames) / 1000.
+        fprintf (stdout, "SVC: overall PSNR Y: %2.3f U: %2.3f V: %2.3f kb/s: %.1f fps: %.3f\n\n",
+                 (pCtx->sStatData[i][j].sQualityStat.rYPsnr[I_SLICE] + pCtx->sStatData[i][j].sQualityStat.rYPsnr[P_SLICE] +
+                  pCtx->sStatData[i][j].sQualityStat.rYPsnr[B_SLICE]) / (float) (iCount),
+                 (pCtx->sStatData[i][j].sQualityStat.rUPsnr[I_SLICE] + pCtx->sStatData[i][j].sQualityStat.rUPsnr[P_SLICE] +
+                  pCtx->sStatData[i][j].sQualityStat.rUPsnr[B_SLICE]) / (float) (iCount),
+                 (pCtx->sStatData[i][j].sQualityStat.rVPsnr[I_SLICE] + pCtx->sStatData[i][j].sQualityStat.rVPsnr[P_SLICE] +
+                  pCtx->sStatData[i][j].sQualityStat.rVPsnr[B_SLICE]) / (float) (iCount),
+                 1.0f * pCtx->pSvcParam->sDependencyLayers[i].fOutputFrameRate * (pCtx->sStatData[i][j].sSliceData.iSliceSize[I_SLICE] +
+                     pCtx->sStatData[i][j].sSliceData.iSliceSize[P_SLICE] + pCtx->sStatData[i][j].sSliceData.iSliceSize[B_SLICE]) / (float) (
+                   iCount + pCtx->pWelsSvcRc[i].iSkipFrameNum) / 1000,
+                 1.0f * pCtx->pSvcParam->sDependencyLayers[i].fOutputFrameRate);
+
+      }
+
+    }
+
+  }
+}
+#endif
+/*!
+ * \brief	uninitialize Wels encoder core library
+ * \param	ppCtx		sWelsEncCtx**, address of the encoder context pointer; nothing is done if it
+ *					or the context it points to is NULL, otherwise *ppCtx is NULL on return
+ * \return	none
+ */
+void WelsUninitEncoderExt (sWelsEncCtx** ppCtx) {
+  if (NULL == ppCtx || NULL == *ppCtx)
+    return;
+
+  WelsLog (*ppCtx, WELS_LOG_INFO, "WelsUninitEncoderExt(), pCtx= %p, iThreadCount= %d, iMultipleThreadIdc= %d.\n",
+           (void*) (*ppCtx), (*ppCtx)->pSvcParam->iCountThreadsNum, (*ppCtx)->pSvcParam->iMultipleThreadIdc);
+
+#if defined(STAT_OUTPUT)
+  // dump the overall statistics before any memory is released
+  StatOverallEncodingExt (*ppCtx);
+#endif
+
+#if defined(MT_ENABLED)
+  // stop the slice coding worker threads before the context they use is freed
+  if ((*ppCtx)->pSvcParam->iMultipleThreadIdc > 1 && (*ppCtx)->pSliceThreading != NULL) {
+    const int32_t iThreadCount = (*ppCtx)->pSvcParam->iCountThreadsNum;
+    int32_t iThreadIdx = 0;
+
+#if defined(WIN32)
+    // WIN32: signal the exit event of every created thread, then block until all
+    // pFinSliceCodingEvent events are signalled
+    if ((*ppCtx)->pSliceThreading->pExitEncodeEvent != NULL) {
+      do {
+        if ((*ppCtx)->pSliceThreading->pThreadHandles[iThreadIdx] != NULL)	// iThreadIdx is already created successfully
+          WelsEventSignal (& (*ppCtx)->pSliceThreading->pExitEncodeEvent[iThreadIdx]);
+        ++ iThreadIdx;
+      } while (iThreadIdx < iThreadCount);
+
+      WelsMultipleEventsWaitAllBlocking (iThreadCount, & (*ppCtx)->pSliceThreading->pFinSliceCodingEvent[0]);
+
+    }
+#elif defined(__GNUC__)
+    // GCC builds: cancel and join each created thread, logging both return codes,
+    // then clear the handle
+    while (iThreadIdx < iThreadCount) {
+      int res = 0;
+      if ((*ppCtx)->pSliceThreading->pThreadHandles[iThreadIdx]) {
+        res = WelsThreadCancel ((*ppCtx)->pSliceThreading->pThreadHandles[iThreadIdx]);
+        WelsLog (*ppCtx, WELS_LOG_INFO, "WelsUninitEncoderExt(), WelsThreadCancel(pThreadHandles%d) return %d..\n", iThreadIdx,
+                 res);
+        res = WelsThreadJoin ((*ppCtx)->pSliceThreading->pThreadHandles[iThreadIdx]);	// waiting thread exit
+        WelsLog (*ppCtx, WELS_LOG_INFO, "WelsUninitEncoderExt(), pthread_join(pThreadHandles%d) return %d..\n", iThreadIdx,
+                 res);
+        (*ppCtx)->pSliceThreading->pThreadHandles[iThreadIdx] = 0;
+      }
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+      // same cancel/join sequence for the MB-list update thread of this index
+      if ((*ppCtx)->pSliceThreading->pUpdateMbListThrdHandles[iThreadIdx]) {
+        res = WelsThreadCancel ((*ppCtx)->pSliceThreading->pUpdateMbListThrdHandles[iThreadIdx]);
+        WelsLog (*ppCtx, WELS_LOG_INFO, "WelsUninitEncoderExt(), WelsThreadCancel(pUpdateMbListThrdHandles%d) return %d..\n",
+                 iThreadIdx, res);
+        res = WelsThreadJoin ((*ppCtx)->pSliceThreading->pUpdateMbListThrdHandles[iThreadIdx]);	// waiting thread exit
+        WelsLog (*ppCtx, WELS_LOG_INFO, "WelsUninitEncoderExt(), pthread_join(pUpdateMbListThrdHandles%d) return %d..\n",
+                 iThreadIdx, res);
+        (*ppCtx)->pSliceThreading->pUpdateMbListThrdHandles[iThreadIdx] = 0;
+      }
+#endif//DYNAMIC_SLICE_ASSIGN && TRY_SLICING_BALANCE
+      ++ iThreadIdx;
+    }
+#endif//WIN32
+  }
+#endif//MT_ENABLED
+
+  // the pre-processing object was created with new, so it is released with delete
+  if ((*ppCtx)->pVpp) {
+    delete (*ppCtx)->pVpp;
+    (*ppCtx)->pVpp = NULL;
+  }
+  FreeMemorySvc (ppCtx);
+  // make sure the caller is not left with a dangling pointer
+  *ppCtx = NULL;
+}
+
+/*!
+ * \brief	look up the temporal id of a frame from its position inside the GOP
+ * \param	fDlp		dependency layer parameters holding the uiCodingIdx2TemporalId table
+ * \param	kiFrameNum	frame number
+ * \param	kiGopSize	GOP size; the position is taken with a bit mask (kiGopSize - 1)
+ * \return	table entry for the masked frame number
+ */
+static inline int32_t GetTemporalLevel (SDLayerParam* fDlp, const int32_t kiFrameNum, const int32_t kiGopSize) {
+  // the mask equals "kiFrameNum modulo kiGopSize" only when kiGopSize is a power of two
+  return fDlp->uiCodingIdx2TemporalId[kiFrameNum & (kiGopSize - 1)];
+}
+
+/*!
+ * \brief	refresh the slice id and the neighbour availability flags of every MB in the frame
+ * \param	pSliceCtx	slice context providing the frame MB width, MB count and MB-to-slice mapping
+ * \param	pMbList		MB array indexed 0 .. iMbNumInFrame - 1
+ *
+ * A neighbour (left, top, top-left, top-right) counts as available only when it lies inside
+ * the picture and maps to the same slice as the current MB.
+ */
+void DynslcUpdateMbNeighbourInfoListForAllSlices (SSliceCtx* pSliceCtx, SMB* pMbList) {
+  const int32_t kiFrameMbWidth = pSliceCtx->iMbWidth;
+  const int32_t kiLastMbIdx    = pSliceCtx->iMbNumInFrame - 1;
+  int32_t iMbIdx = 0;
+
+  do {
+    SMB* pCurMb             = pMbList + iMbIdx;
+    const int32_t kiXY      = pCurMb->iMbXY;
+    const int32_t kiX       = pCurMb->iMbX;
+    const int32_t kiY       = pCurMb->iMbY;
+    const int32_t kiAboveXY = kiXY - kiFrameMbWidth;
+    const int32_t kiCurIdc  = WelsMbToSliceIdc (pSliceCtx, kiXY);
+    uint32_t uiAvail        = 0;
+
+    pCurMb->uiSliceIdc = kiCurIdc;
+
+    // the position tests come first so that the slice map is never queried outside the picture
+    if ((kiX > 0) && (kiCurIdc == WelsMbToSliceIdc (pSliceCtx, kiXY - 1)))
+      uiAvail |= LEFT_MB_POS;
+    if ((kiY > 0) && (kiCurIdc == WelsMbToSliceIdc (pSliceCtx, kiAboveXY)))
+      uiAvail |= TOP_MB_POS;
+    if ((kiX > 0) && (kiY > 0) && (kiCurIdc == WelsMbToSliceIdc (pSliceCtx, kiAboveXY - 1)))
+      uiAvail |= TOPLEFT_MB_POS;
+    if ((kiX < (kiFrameMbWidth - 1)) && (kiY > 0) && (kiCurIdc == WelsMbToSliceIdc (pSliceCtx, kiAboveXY + 1)))
+      uiAvail |= TOPRIGHT_MB_POS;
+
+    pCurMb->uiNeighborAvail = (uint8_t)uiAvail;
+
+    ++ iMbIdx;
+  } while (iMbIdx <= kiLastMbIdx);
+}
+
+/*
+ * Decide into how many partitions the current picture is split.
+ * Without MT_ENABLED, or when iMultipleThreadIdc <= 1, the answer is always 1.
+ * Otherwise it is the configured thread count, except that P slices get a single partition
+ * unless FIXED_PARTITION_ASSIGN is defined.
+ * TUNE: the decision could be based on past coding results if they are available.
+ */
+int32_t PicPartitionNumDecision (sWelsEncCtx* pCtx) {
+#ifdef MT_ENABLED
+  if (pCtx->pSvcParam->iMultipleThreadIdc > 1) {
+#if !defined(FIXED_PARTITION_ASSIGN)
+    if (P_SLICE == pCtx->eSliceType)
+      return 1;
+#endif//!FIXED_PARTITION_ASSIGN
+    return pCtx->pSvcParam->iCountThreadsNum;
+  }
+#endif//MT_ENABLED
+  return 1;
+}
+
+#if defined(MT_ENABLED)
+/*
+ * Multi-thread build: refresh the per-MB slice id / neighbour availability of the current layer.
+ */
+void WelsInitCurrentQBLayerMltslc (sWelsEncCtx* pCtx) {
+  SDqLayer* pLayer = pCtx->pCurDqLayer;
+
+  // mb_neighbor
+  DynslcUpdateMbNeighbourInfoListForAllSlices (pLayer->pSliceEncCtx, pLayer->sMbDataP);
+}
+
+/*
+ * Split the frame into iPartitionNum consecutive runs of MBs, one slice per run.
+ * iPartitionNum is clamped to [1, AVERSLICENUM_CONSTRAINT]. Every slice gets
+ * iMbNumInFrame / iPartitionNum MBs; the last one takes whatever is left.
+ * Updates iSliceNumInFrame, pCountMbNumInSlice, pFirstMbInSlice and pOverallMbMap.
+ */
+void UpdateSlicepEncCtxWithPartition (SSliceCtx* pSliceCtx, int32_t iPartitionNum) {
+  const int32_t kiTotalMb = pSliceCtx->iMbNumInFrame;
+  int32_t iMbRemaining    = kiTotalMb;
+  int32_t iStartMb        = 0;
+
+  // AVERSLICENUM_CONSTRAINT might be variable, however not fixed by MACRO
+  if (iPartitionNum <= 0)
+    iPartitionNum = 1;
+  else if (iPartitionNum > AVERSLICENUM_CONSTRAINT)
+    iPartitionNum = AVERSLICENUM_CONSTRAINT;
+
+  const int32_t kiMbPerPartition = kiTotalMb / iPartitionNum;
+  pSliceCtx->iSliceNumInFrame = iPartitionNum;
+
+  for (int32_t iPart = 0; iPart < iPartitionNum; ++ iPart) {
+    const bool kbLastPartition = (iPart + 1 == iPartitionNum);
+
+    pSliceCtx->pCountMbNumInSlice[iPart] = kbLastPartition ? iMbRemaining : kiMbPerPartition;
+    pSliceCtx->pFirstMbInSlice[iPart]    = iStartMb;
+
+    // tag every MB of this run with its slice index
+    memset (pSliceCtx->pOverallMbMap + iStartMb, (uint8_t)iPart,
+            pSliceCtx->pCountMbNumInSlice[iPart] * sizeof (uint8_t));
+
+    // advance to the next partition (or slice)
+    iStartMb     += pSliceCtx->pCountMbNumInSlice[iPart];
+    iMbRemaining -= pSliceCtx->pCountMbNumInSlice[iPart];
+  }
+}
+
+/*
+ * Multi-thread build: partition the current layer into iPartitionNum slices and refresh the
+ * per-MB neighbour information.
+ * For I slices a rough frame size estimate is additionally compared against the configured
+ * slice size constraint and a warning is logged when the constraint looks too small. This
+ * check only produces the log message.
+ */
+void WelsInitCurrentDlayerMltslc (sWelsEncCtx* pCtx, int32_t iPartitionNum) {
+  SDqLayer* pCurDq     = pCtx->pCurDqLayer;
+  SSliceCtx* pSliceCtx = pCurDq->pSliceEncCtx;
+
+  UpdateSlicepEncCtxWithPartition (pSliceCtx, iPartitionNum);
+
+  if (I_SLICE == pCtx->eSliceType) { //check if uiSliceSizeConstraint too small
+#define byte_complexIMBat26 (60)
+    uint8_t  iCurDid   = pCtx->uiDependencyId;
+    uint32_t uiFrmByte = 0;
+
+    if (pCtx->pSvcParam->bEnableRc) {
+      //RC case: bytes per frame = bitrate / frame rate / 8
+      const uint32_t kuiFrameRate = (uint32_t) (pCtx->pSvcParam->sDependencyLayers[iCurDid].fInputFrameRate);
+
+      // a frame rate below 1 fps truncates to 0; leave the estimate at 0 (no warning)
+      // instead of performing an integer division by zero
+      if (kuiFrameRate > 0) {
+        uiFrmByte = (((uint32_t) (pCtx->pSvcParam->sDependencyLayers[iCurDid].iSpatialBitrate) / kuiFrameRate) >> 3);
+      }
+    } else {
+      //fixed QP case: start from byte_complexIMBat26 bytes per MB and scale by the QP distance to 26
+      const int32_t iTtlMbNumInFrame = pSliceCtx->iMbNumInFrame;
+      int32_t iQDeltaTo26 = (26 - pCtx->pSvcParam->sDependencyLayers[iCurDid].iDLayerQp);
+
+      uiFrmByte = (iTtlMbNumInFrame * byte_complexIMBat26);
+      if (iQDeltaTo26 > 0) {
+        //smaller QP than 26
+        uiFrmByte = (uint32_t) (uiFrmByte * ((float)iQDeltaTo26 / 4));
+      } else if (iQDeltaTo26 < 0) {
+        //larger QP than 26: halve the estimate for every 4 QP steps
+        iQDeltaTo26 = ((-iQDeltaTo26) >> 2);
+        uiFrmByte = (uiFrmByte >> (iQDeltaTo26));
+      }
+    }
+
+    //MINPACKETSIZE_CONSTRAINT
+    // the slice number constraint is a divisor: skip the check when it is not positive
+    if ((pSliceCtx->iMaxSliceNumConstraint > 0)
+        && (pSliceCtx->uiSliceSizeConstraint
+            < (uint32_t) (uiFrmByte / (pSliceCtx->iMaxSliceNumConstraint)))) {
+
+      WelsLog (pCtx,
+               WELS_LOG_WARNING,
+               "Set-SliceConstraint(%d) too small for current resolution (MB# %d) under QP/BR!\n",
+               pSliceCtx->uiSliceSizeConstraint,
+               pSliceCtx->iMbNumInFrame
+              );
+    }
+  }
+
+  WelsInitCurrentQBLayerMltslc (pCtx);
+}
+#else
+/*
+ * Single-thread build: reset the slice context of the current layer to one slice covering
+ * the whole frame, refresh the per-MB neighbour information and initialise slice 0.
+ */
+void WelsInitCurrentQBLayerMltslc (sWelsEncCtx* pCtx) {
+  SDqLayer* pLayer        = pCtx->pCurDqLayer;
+  SSliceCtx* pSlcCtx      = pLayer->pSliceEncCtx;
+  SSlice* pFirstSlice     = &pLayer->sLayerInfo.pSliceInLayer[0];
+  const int32_t kiFrameMb = pSlcCtx->iMbNumInFrame;
+
+  // clear the slice tables; their extent is taken from the slice count still stored in the context
+  memset (pSlcCtx->pOverallMbMap, 0, kiFrameMb * sizeof (uint8_t));
+  memset (pSlcCtx->pCountMbNumInSlice, 0, pSlcCtx->iSliceNumInFrame * sizeof (int32_t));
+  memset (pSlcCtx->pFirstMbInSlice, 0, pSlcCtx->iSliceNumInFrame * sizeof (int16_t));
+
+  // one slice owning every MB
+  pSlcCtx->iSliceNumInFrame      = 1;
+  pSlcCtx->pCountMbNumInSlice[0] = kiFrameMb;
+
+  // mb_neighbor
+  DynslcUpdateMbNeighbourInfoListForAllSlices (pSlcCtx, pLayer->sMbDataP);
+
+  // slice 0 init
+  pFirstSlice->uiSliceIdx                       = 0;
+  pFirstSlice->pSliceBsa                        = &pCtx->pOut->sBsWrite;
+  pFirstSlice->bDynamicSlicingSliceSizeCtrlFlag = false;
+  if (pCtx->eSliceType == P_SLICE)
+    pFirstSlice->uiAssumeLog2BytePerMb = 0;
+  else
+    pFirstSlice->uiAssumeLog2BytePerMb = 1;
+}
+
+/*
+ * Single-thread build: the whole frame is one slice, so iPartitionNum is not used.
+ * Stores the frame MB count in the slice context and refreshes the per-MB neighbour
+ * information.
+ * For I slices a rough frame size estimate is additionally compared against the configured
+ * slice size constraint and a warning is logged when the constraint looks too small. This
+ * check only produces the log message.
+ */
+void WelsInitCurrentDlayerMltslc (sWelsEncCtx* pCtx, int32_t iPartitionNum) {
+  SDqLayer* pCurDq = pCtx->pCurDqLayer;
+  SSliceCtx* pSliceCtx = (pCurDq->pSliceEncCtx);
+  int32_t iTtlMbNumInFrame = pCurDq->iMbHeight * pCurDq->iMbWidth;
+
+  pSliceCtx->iMbNumInFrame
+    = pSliceCtx->pCountMbNumInSlice[0] = iTtlMbNumInFrame;
+
+  if (I_SLICE == pCtx->eSliceType) { //check if uiSliceSizeConstraint too small
+#define byte_complexIMBat26 (60)
+    uint8_t  iCurDid   = pCtx->uiDependencyId;
+    uint32_t uiFrmByte = 0;
+
+    if (pCtx->pSvcParam->bEnableRc) {
+      //RC case: bytes per frame = bitrate / frame rate / 8
+      const uint32_t kuiFrameRate = (uint32_t) (pCtx->pSvcParam->sDependencyLayers[iCurDid].fInputFrameRate);
+
+      // a frame rate below 1 fps truncates to 0; leave the estimate at 0 (no warning)
+      // instead of performing an integer division by zero
+      if (kuiFrameRate > 0) {
+        uiFrmByte = (((uint32_t) (pCtx->pSvcParam->sDependencyLayers[iCurDid].iSpatialBitrate) / kuiFrameRate) >> 3);
+      }
+    } else {
+      //fixed QP case: start from byte_complexIMBat26 bytes per MB and scale by the QP distance to 26
+      int32_t iQDeltaTo26 = (26 - pCtx->pSvcParam->sDependencyLayers[iCurDid].iDLayerQp);
+
+      uiFrmByte = (iTtlMbNumInFrame * byte_complexIMBat26);
+      if (iQDeltaTo26 > 0) {
+        //smaller QP than 26
+        uiFrmByte = (uint32_t) (uiFrmByte * ((float)iQDeltaTo26 / 4));
+      } else if (iQDeltaTo26 < 0) {
+        //larger QP than 26: halve the estimate for every 4 QP steps
+        iQDeltaTo26 = ((-iQDeltaTo26) >> 2);
+        uiFrmByte = (uiFrmByte >> (iQDeltaTo26));
+      }
+    }
+
+    //MINPACKETSIZE_CONSTRAINT
+    // the slice number constraint is a divisor: skip the check when it is not positive
+    if ((pSliceCtx->iMaxSliceNumConstraint > 0)
+        && (pSliceCtx->uiSliceSizeConstraint
+            < (uint32_t) (uiFrmByte / (pSliceCtx->iMaxSliceNumConstraint)))) {
+
+      WelsLog (pCtx,
+               WELS_LOG_WARNING,
+               "Set-SliceConstraint(%d) too small for current resolution (MB# %d) under QP/BR!\n",
+               pSliceCtx->uiSliceSizeConstraint,
+               pSliceCtx->iMbNumInFrame
+              );
+    }
+  }
+
+  WelsInitCurrentQBLayerMltslc (pCtx);
+}
+#endif
+
+/*!
+ * \brief	initialize current layer
+ * \param	pCtx		encoder context; pCtx->pCurDqLayer is the layer being set up
+ * \param	kiWidth		not referenced by this function
+ * \param	kiHeight	not referenced by this function
+ *
+ * Binds the PPS/SPS (or subset SPS for dependency ids above BASE_DEPENDENCY_ID) to every slice
+ * of the layer, fills the NAL unit header extension, and attaches the source and reconstruction
+ * picture planes. Returns without doing anything when there is no current layer.
+ */
+void WelsInitCurrentLayer (sWelsEncCtx* pCtx,
+                           const int32_t kiWidth,
+                           const int32_t kiHeight) {
+  SDqLayer* pCurDq = pCtx->pCurDqLayer;
+
+  // must be tested before any pointer below is derived from pCurDq
+  if (NULL == pCurDq)
+    return;
+
+  SWelsSvcCodingParam* pParam     = pCtx->pSvcParam;
+  SPicture* pEncPic               = pCtx->pEncPic;
+  SPicture* pDecPic               = pCtx->pDecPic;
+  SSlice* pBaseSlice              = &pCurDq->sLayerInfo.pSliceInLayer[0];
+  SSlice* pSlice                  = NULL;
+  const uint8_t kiCurDid          = pCtx->uiDependencyId;
+  const bool_t kbUseSubsetSpsFlag = (kiCurDid > BASE_DEPENDENCY_ID);
+  SDLayerParam* fDlp              = &pParam->sDependencyLayers[kiCurDid];
+  SNalUnitHeaderExt* pNalHdExt    = &pCurDq->sLayerInfo.sNalHeaderExt;
+  SNalUnitHeader* pNalHd          = &pNalHdExt->sNalHeader;
+  SDqIdc* pDqIdc                  = &pCtx->pDqIdcMap[kiCurDid];
+  int32_t iIdx                    = 0;
+  int32_t iSliceCount             = 0;
+
+  pCurDq->pDecPic = pDecPic;
+
+  if (fDlp->sMso.uiSliceMode == SM_DYN_SLICE) // need get extra slices for update
+    iSliceCount = GetInitialSliceNum (pCurDq->iMbWidth, pCurDq->iMbHeight, &fDlp->sMso);
+  else
+    iSliceCount = GetCurrentSliceNum (pCurDq->pSliceEncCtx);
+  assert (iSliceCount > 0);
+
+  // parameter set binding of the first slice
+  pBaseSlice->sSliceHeaderExt.sSliceHeader.iPpsId = pDqIdc->iPpsId;
+  pCurDq->sLayerInfo.pPpsP =
+    pBaseSlice->sSliceHeaderExt.sSliceHeader.pPps = &pCtx->pPPSArray[pBaseSlice->sSliceHeaderExt.sSliceHeader.iPpsId];
+  pBaseSlice->sSliceHeaderExt.sSliceHeader.iSpsId = pDqIdc->iSpsId;
+  if (kbUseSubsetSpsFlag) {
+    pCurDq->sLayerInfo.pSubsetSpsP = &pCtx->pSubsetArray[pDqIdc->iSpsId];
+    pCurDq->sLayerInfo.pSpsP =
+      pBaseSlice->sSliceHeaderExt.sSliceHeader.pSps = &pCurDq->sLayerInfo.pSubsetSpsP->pSps;
+  } else {
+    pCurDq->sLayerInfo.pSubsetSpsP = NULL;
+    pCurDq->sLayerInfo.pSpsP =
+      pBaseSlice->sSliceHeaderExt.sSliceHeader.pSps = &pCtx->pSpsArray[pBaseSlice->sSliceHeaderExt.sSliceHeader.iSpsId];
+  }
+
+  // the remaining slices share the parameter sets of the first one
+  pSlice = pBaseSlice;
+  iIdx = 1;
+  while (iIdx < iSliceCount) {
+    ++ pSlice;
+    pSlice->sSliceHeaderExt.sSliceHeader.iPpsId = pBaseSlice->sSliceHeaderExt.sSliceHeader.iPpsId;
+    pSlice->sSliceHeaderExt.sSliceHeader.pPps   = pBaseSlice->sSliceHeaderExt.sSliceHeader.pPps;
+    pSlice->sSliceHeaderExt.sSliceHeader.iSpsId = pBaseSlice->sSliceHeaderExt.sSliceHeader.iSpsId;
+    pSlice->sSliceHeaderExt.sSliceHeader.pSps   = pBaseSlice->sSliceHeaderExt.sSliceHeader.pSps;
+    ++ iIdx;
+  }
+
+  // NAL unit header and its extension
+  memset (pNalHdExt, 0, sizeof (SNalUnitHeaderExt));
+  pNalHd->uiNalRefIdc  = pCtx->eNalPriority;
+  pNalHd->eNalUnitType = pCtx->eNalType;
+
+  pNalHdExt->uiDependencyId   = kiCurDid;
+  pNalHdExt->bDiscardableFlag = (pCtx->bNeedPrefixNalFlag) ? (pNalHd->uiNalRefIdc == NRI_PRI_LOWEST) : false;
+  pNalHdExt->bIdrFlag         = (pCtx->iFrameNum == 0) && ((pCtx->eNalType == NAL_UNIT_CODED_SLICE_IDR)
+                                || (pCtx->eSliceType == I_SLICE));
+  pNalHdExt->uiTemporalId     = pCtx->uiTemporalId;
+
+  pBaseSlice->bSliceHeaderExtFlag = (NAL_UNIT_CODED_SLICE_EXT == pNalHd->eNalUnitType);
+
+  pSlice = pBaseSlice;
+  iIdx = 1;
+  while (iIdx < iSliceCount) {
+    ++ pSlice;
+    pSlice->bSliceHeaderExtFlag = pBaseSlice->bSliceHeaderExtFlag;
+    ++ iIdx;
+  }
+
+  // pEncPic pData
+  pCurDq->pEncData[0]   = pEncPic->pData[0];
+  pCurDq->pEncData[1]   = pEncPic->pData[1];
+  pCurDq->pEncData[2]   = pEncPic->pData[2];
+  pCurDq->iEncStride[0] = pEncPic->iLineSize[0];
+  pCurDq->iEncStride[1] = pEncPic->iLineSize[1];
+  pCurDq->iEncStride[2] = pEncPic->iLineSize[2];
+  // cs pData
+  pCurDq->pCsData[0]   = pDecPic->pData[0];
+  pCurDq->pCsData[1]   = pDecPic->pData[1];
+  pCurDq->pCsData[2]   = pDecPic->pData[2];
+  pCurDq->iCsStride[0] = pDecPic->iLineSize[0];
+  pCurDq->iCsStride[1] = pDecPic->iLineSize[1];
+  pCurDq->iCsStride[2] = pDecPic->iLineSize[2];
+
+  // a base layer is available exactly when a reference layer is attached
+  if (pCurDq->pRefLayer != NULL) {
+    pCurDq->bBaseLayerAvailableFlag = true;
+  } else {
+    pCurDq->bBaseLayerAvailableFlag = false;
+  }
+}
+
+/*
+ * Bind the mode decision / motion search function pointers in pCtx->pFuncList for the slice
+ * about to be coded.
+ * The highest dependency layer gets the SAD cost functions and the "Vaa" fine partition
+ * routines; every other layer gets the SATD cost functions and the plain routines.
+ * Intra-related pointers are set for I and P slices, inter-related ones for P slices only.
+ * Other slice types leave the function list untouched.
+ */
+void PreprocessSliceCoding (sWelsEncCtx* pCtx) {
+  const bool kbPSlice = (P_SLICE == pCtx->eSliceType);
+
+  if (!kbPSlice && (I_SLICE != pCtx->eSliceType))
+    return;
+
+  const bool kbTopLayer = (pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt.uiDependencyId + 1 ==
+                           pCtx->pSvcParam->iNumDependencyLayer);
+
+  /* intra mode decision, common to I and P slices */
+  if (kbTopLayer) {
+    pCtx->pFuncList->sSampleDealingFuncs.pfMdCost = pCtx->pFuncList->sSampleDealingFuncs.pfSampleSad;
+    pCtx->pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3 =
+      pCtx->pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad;
+    pCtx->pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3 =
+      pCtx->pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Sad;
+    pCtx->pFuncList->pfIntraFineMd = WelsMdIntraFinePartitionVaa;
+  } else {
+    pCtx->pFuncList->sSampleDealingFuncs.pfMdCost = pCtx->pFuncList->sSampleDealingFuncs.pfSampleSatd;
+    pCtx->pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3 =
+      pCtx->pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd;
+    pCtx->pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3 =
+      pCtx->pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd;
+    pCtx->pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3 =
+      pCtx->pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd;
+    pCtx->pFuncList->pfIntraFineMd = WelsMdIntraFinePartition;
+  }
+
+  /* inter mode decision and motion estimation, P slices only */
+  if (kbPSlice) {
+    pCtx->pFuncList->pfFirstIntraMode = WelsMdFirstIntraMode;
+    if (kbTopLayer) {
+      pCtx->pFuncList->pfMotionSearch = WelsMotionEstimateSearchSad;
+      pCtx->pFuncList->pfInterFineMd  = WelsMdInterFinePartitionVaa;
+    } else {
+      pCtx->pFuncList->pfMotionSearch = WelsMotionEstimateSearchSatd;
+      pCtx->pFuncList->pfInterFineMd  = WelsMdInterFinePartition;
+    }
+    // the motion estimation cost is SATD for every layer
+    pCtx->pFuncList->sSampleDealingFuncs.pfMeCost = pCtx->pFuncList->sSampleDealingFuncs.pfSampleSatd;
+  }
+}
+
+/*!
+ * \brief	advance to the next dependency layer, keeping the current one as its reference layer
+ */
+
+static inline void WelsSwapDqLayers (sWelsEncCtx* pCtx) {
+  const int32_t kiNextIdx = pCtx->uiDependencyId + 1;
+  SDqLayer* pPrevLayer    = pCtx->pCurDqLayer;
+  SDqLayer* pNextLayer    = pCtx->ppDqLayerList[kiNextIdx];
+
+  // the layer coded so far becomes the reference of the one coded next
+  pNextLayer->pRefLayer = pPrevLayer;
+  pCtx->pCurDqLayer     = pNextLayer;
+}
+
+/*!
+ * \brief	select the reference picture after WelsBuildRefList and store the reference index
+ *			in every slice header of the current layer
+ * \param	pCtx		encoder context
+ * \param	keFrameType	frame type; IDR frames get no reference picture
+ */
+static inline void PrefetchReferencePicture (sWelsEncCtx* pCtx, const EFrameType keFrameType) {
+  const int32_t kiNumSlices = GetCurrentSliceNum (pCtx->pCurDqLayer->pSliceEncCtx);
+  SSlice* pSlices           = &pCtx->pCurDqLayer->sLayerInfo.pSliceInLayer[0];
+  uint8_t uiRefIdx          = (uint8_t) (-1); // value written for IDR frames
+
+  assert (kiNumSlices > 0);
+
+  if (WELS_FRAME_TYPE_IDR == keFrameType) {
+    // safe for IDR coding
+    pCtx->pRefPic              = NULL;
+    pCtx->pCurDqLayer->pRefPic = NULL;
+  } else {
+    assert (pCtx->iNumRef0 > 0);
+    // always take item 0, the list has already been reordered
+    pCtx->pRefPic              = pCtx->pRefList0[0];
+    pCtx->pCurDqLayer->pRefPic = pCtx->pRefPic;
+    uiRefIdx                   = 0;
+  }
+
+  for (int32_t iSlice = 0; iSlice < kiNumSlices; ++ iSlice) {
+    pSlices[iSlice].sSliceHeaderExt.sSliceHeader.uiRefIndex = uiRefIdx;
+  }
+}
+
+
+/*
+ * Give the parameter set with encoder-side id kiCurEncoderParaSetId a new id in the bitstream.
+ *
+ * The id used in the bitstream is "encoder id + iParaSetIdDelta[encoder id]". This function
+ *   - stores a new delta so that the parameter set maps to uiNextParaSetIdToUseInBs,
+ *   - clears the "used" mark of the id it occupied before and sets it on the new id,
+ *   - advances uiNextParaSetIdToUseInBs to the next unused id, wrapping to 0 at kuiMaxIdInBs.
+ *
+ * NOTE(review): the search for the next unused id has no iteration bound; it does not
+ * terminate if every id below kuiMaxIdInBs is marked as used.
+ */
+void ParasetIdAdditionIdAdjust (SParaSetOffsetVariable* sParaSetOffsetVariable, const int32_t kiCurEncoderParaSetId,
+                                const uint32_t kuiMaxIdInBs) { // kuiMaxIdInBs: ids in the bitstream wrap to 0 when they reach this value
+  //SPS_ID in avc_sps and pSubsetSps will be different using this
+  //SPS_ID case example:
+  //1st enter:		next_spsid_in_bs == 0; spsid == 0; delta==0;				//actual spsid_in_bs == 0
+  //1st finish:		next_spsid_in_bs == 1;
+  //2nd enter:	next_spsid_in_bs == 1; spsid == 0; delta==1;				//actual spsid_in_bs == 1
+  //2nd finish:		next_spsid_in_bs == 2;
+  //31st enter:	next_spsid_in_bs == 31; spsid == 0~2; delta==31~29;	//actual spsid_in_bs == 31
+  //31st finish:	next_spsid_in_bs == 0;
+  //31st enter:	next_spsid_in_bs == 0; spsid == 0~2; delta==-2~0;		//actual spsid_in_bs == 0
+  //31st finish:	next_spsid_in_bs == 1;
+
+  const int32_t kiEncId			= kiCurEncoderParaSetId;
+  const uint32_t kuiPrevIdInBs	= sParaSetOffsetVariable->iParaSetIdDelta[kiEncId] + kiEncId;//mark current_id
+  const bool_t* kpUsedIdPointer   = &sParaSetOffsetVariable->bUsedParaSetIdInBs[0];
+  uint32_t uiNextIdInBs			= sParaSetOffsetVariable->uiNextParaSetIdToUseInBs;
+
+#if _DEBUG
+  if (0 != sParaSetOffsetVariable->iParaSetIdDelta[kiEncId])
+    assert (sParaSetOffsetVariable->bUsedParaSetIdInBs[kuiPrevIdInBs]);   //sure the prev-used one was marked activated correctly
+#endif
+  //update current layer's pCodingParam
+  sParaSetOffsetVariable->iParaSetIdDelta[kiEncId]	= uiNextIdInBs -
+      kiEncId;  //for current parameter set, change its id_delta
+  //write pso pData for next update:
+  //   the previous id is released first, so the new mark wins when both ids are equal
+  sParaSetOffsetVariable->bUsedParaSetIdInBs[kuiPrevIdInBs] = false;	//
+  sParaSetOffsetVariable->bUsedParaSetIdInBs[uiNextIdInBs] = true;		//   update current used_id
+
+  //prepare for next update:
+  //   find the next available id
+  do {
+    ++uiNextIdInBs;
+    if (uiNextIdInBs >= kuiMaxIdInBs) {
+      uiNextIdInBs = 0;//ensure the id does not reach kuiMaxIdInBs
+    }
+  } while (kpUsedIdPointer[uiNextIdInBs]);
+
+  //   update next_id
+  sParaSetOffsetVariable->uiNextParaSetIdToUseInBs = uiNextIdInBs;
+
+#if _DEBUG
+  assert (!sParaSetOffsetVariable->bUsedParaSetIdInBs[uiNextIdInBs]);   //sure the next-to-use one is not marked as used
+#endif
+
+}
+
+/*!
+ * \brief	write all parameter sets introduced in SVC extension
+ * \param	pCtx	encoder context; NALs are appended to pCtx->pFrameBs at pCtx->iPosBsBuffer,
+ *					which is advanced by the size of each NAL written
+ * \param	pNalLen	output array, one length entry per NAL written (all SPS first, then all PPS)
+ * \param	pNumNal	output, number of NALs written
+ * \return	size in bytes of bitstream written; 0 if any argument is NULL
+ */
+int32_t WelsWriteParameterSets (sWelsEncCtx* pCtx, int32_t* pNalLen, int32_t* pNumNal) {
+  int32_t iSize	= 0;
+  int32_t iNal	= 0;
+  int32_t	iIdx	= 0;
+  int32_t iId	= 0;
+  int32_t iCountNal	= 0;
+
+  if (NULL == pCtx || NULL == pNalLen || NULL == pNumNal)
+    return 0;
+
+  /* write all SPS */
+  iIdx = 0;
+  while (iIdx < pCtx->iSpsNum) {
+    SDqIdc* pDqIdc		= &pCtx->pDqIdcMap[iIdx];
+    const int32_t kiDid	= pDqIdc->uiSpatialId;
+    // layers above the base dependency layer are described by a subset SPS
+    const bool_t kbUsingSubsetSps = (kiDid > BASE_DEPENDENCY_ID);
+
+    // remember which NAL slot the next WelsLoadNal will fill
+    iNal	= pCtx->pOut->iNalIndex;
+
+    if (pCtx->pSvcParam->bEnableSpsPpsIdAddition) {
+#if _DEBUG
+      pCtx->sPSOVector.bEnableSpsPpsIdAddition = 1;
+      assert (kiDid < MAX_DEPENDENCY_LAYER);
+      assert (iIdx < MAX_DQ_LAYER_NUM);
+#endif
+
+      // assign a fresh id in the bitstream to this (subset) SPS; subset SPS entries are
+      // indexed with iIdx - 1, the plain SPS is always entry 0
+      ParasetIdAdditionIdAdjust (& (pCtx->sPSOVector.sParaSetOffsetVariable[kbUsingSubsetSps ? PARA_SET_TYPE_SUBSETSPS :
+                                    PARA_SET_TYPE_AVCSPS]),
+                                 (kbUsingSubsetSps) ? (pCtx->pSubsetArray[iIdx - 1].pSps.uiSpsId) : (pCtx->pSpsArray[0].uiSpsId),
+                                 MAX_SPS_COUNT);
+    } else {
+      // id addition disabled: all offsets are reset to zero
+      memset (& (pCtx->sPSOVector), 0, sizeof (pCtx->sPSOVector));
+    }
+
+    if (kbUsingSubsetSps) {
+      iId	= iIdx - 1;
+
+      /* generate Subset SPS */
+      WelsLoadNal (pCtx->pOut, NAL_UNIT_SUBSET_SPS, NRI_PRI_HIGHEST);
+
+      WelsWriteSubsetSpsSyntax (&pCtx->pSubsetArray[iId], &pCtx->pOut->sBsWrite,
+                                & (pCtx->sPSOVector.sParaSetOffsetVariable[PARA_SET_TYPE_SUBSETSPS].iParaSetIdDelta[0]));
+      WelsUnloadNal (pCtx->pOut);
+    } else {
+      iId	= 0;
+
+      /* generate sequence parameters set */
+      WelsLoadNal (pCtx->pOut, NAL_UNIT_SPS, NRI_PRI_HIGHEST);
+      WelsWriteSpsNal (&pCtx->pSpsArray[0], &pCtx->pOut->sBsWrite,
+                       & (pCtx->sPSOVector.sParaSetOffsetVariable[PARA_SET_TYPE_AVCSPS].iParaSetIdDelta[0]));
+      WelsUnloadNal (pCtx->pOut);
+    }
+
+    // encode the NAL into the frame buffer; the entry is overwritten with the returned length
+    pNalLen[iCountNal] = WelsEncodeNal (&pCtx->pOut->sNalList[iNal], pCtx->pFrameBs + pCtx->iPosBsBuffer,
+                                        &pNalLen[iCountNal]);
+
+    pCtx->iPosBsBuffer	+= pNalLen[iCountNal];
+    iSize				+= pNalLen[iCountNal];
+
+    ++ iIdx;
+    ++ iCountNal;
+  }
+
+  /* write all PPS */
+  iIdx = 0;
+  while (iIdx < pCtx->iPpsNum) {
+    if (pCtx->pSvcParam->bEnableSpsPpsIdAddition) {
+      //para_set_type = 2: PPS, use MAX_PPS_COUNT
+      ParasetIdAdditionIdAdjust (&pCtx->sPSOVector.sParaSetOffsetVariable[PARA_SET_TYPE_PPS], pCtx->pPPSArray[iIdx].iPpsId,
+                                 MAX_PPS_COUNT);
+    }
+
+    iNal	= pCtx->pOut->iNalIndex;
+    /* generate picture parameter set */
+    WelsLoadNal (pCtx->pOut, NAL_UNIT_PPS, NRI_PRI_HIGHEST);
+    WelsWritePpsSyntax (&pCtx->pPPSArray[iIdx], &pCtx->pOut->sBsWrite, & (pCtx->sPSOVector));
+    WelsUnloadNal (pCtx->pOut);
+
+    pNalLen[iCountNal] = WelsEncodeNal (&pCtx->pOut->sNalList[iNal], pCtx->pFrameBs + pCtx->iPosBsBuffer,
+                                        &pNalLen[iCountNal]);
+
+    pCtx->iPosBsBuffer	+= pNalLen[iCountNal];
+    iSize				+= pNalLen[iCountNal];
+
+    ++ iIdx;
+    ++ iCountNal;
+  }
+
+  *pNumNal = iCountNal;
+
+  return iSize;
+}
+
+/*
+ * Emit one prefix NAL unit (NAL_UNIT_PREFIX) through WelsEncodeNalExt().
+ *
+ * WelsWriteSVCPrefixNal() is only called when keNalRefIdc != NRI_PRI_LOWEST; for
+ * NRI_PRI_LOWEST no prefix NAL unit RBSP syntax is written, but the unit is still
+ * loaded, unloaded and encoded since the NAL unit header extension is needed.
+ *
+ * pCtx:            encoder context, its iPosBsBuffer is advanced by the returned size
+ * pLayerBsInfo:    iNalLengthInByte[*pNalIdxInLayer] receives the returned size
+ * pNalLen:         the address of slot *pNalIdxInLayer is handed to WelsEncodeNalExt()
+ * pNalIdxInLayer:  in/out index of NAL within layer, incremented by one
+ * keNalType:       only compared against NAL_UNIT_CODED_SLICE_IDR for the prefix syntax
+ * keNalRefIdc:     reference idc the prefix NAL is loaded with
+ *
+ * return: size returned by WelsEncodeNalExt()
+ */
+static inline int32_t AddPrefixNal (sWelsEncCtx* pCtx,
+                                    SLayerBSInfo* pLayerBsInfo,
+                                    int32_t* pNalLen,
+                                    int32_t* pNalIdxInLayer,
+                                    const EWelsNalUnitType keNalType,
+                                    const EWelsNalRefIdc keNalRefIdc) {
+  WelsLoadNal (pCtx->pOut, NAL_UNIT_PREFIX, keNalRefIdc);
+  if (NRI_PRI_LOWEST != keNalRefIdc) {
+    const bool kbIdrSlice = (NAL_UNIT_CODED_SLICE_IDR == keNalType);
+    WelsWriteSVCPrefixNal (&pCtx->pOut->sBsWrite, keNalRefIdc, kbIdrSlice);
+  }
+  WelsUnloadNal (pCtx->pOut);
+
+  // same encoding step whether or not the prefix syntax was written
+  const int32_t kiSlot = *pNalIdxInLayer;
+  const int32_t kiPayloadSize = WelsEncodeNalExt (&pCtx->pOut->sNalList[pCtx->pOut->iNalIndex - 1],
+                                &pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt,
+                                pCtx->pFrameBs + pCtx->iPosBsBuffer,
+                                &pNalLen[kiSlot]);
+
+  pCtx->iPosBsBuffer += kiPayloadSize;
+  pLayerBsInfo->iNalLengthInByte[kiSlot] = kiPayloadSize;
+  *pNalIdxInLayer = kiSlot + 1;
+
+  return kiPayloadSize;
+}
+
+/*
+ * Write one NAL_UNIT_FILLER_DATA NAL carrying iLen bytes of 0xff into pCtx->pFrameBs.
+ * return: length given by WelsEncodeNal() (pCtx->iPosBsBuffer is advanced by it),
+ *         or 0 when the bit writer has less than iLen bytes left or no NAL index is free
+ */
+int32_t WritePadding (sWelsEncCtx* pCtx, int32_t iLen) {
+  SBitStringAux* pBs = &pCtx->pOut->sBsWrite; // bit writer the filler payload goes through
+  const int32_t kiNal = pCtx->pOut->iNalIndex; // index of the NAL encoded below
+  int32_t iNalLen = 0;
+
+  if ((pBs->pBufEnd - pBs->pBufPtr) < iLen || kiNal >= pCtx->pOut->iCountNals) {
+#if GOM_TRACE_FLAG
+    // the buffer distance is cast since it is presumably a ptrdiff_t, which "%d" cannot take
+    WelsLog (pCtx, WELS_LOG_ERROR,
+             "[RC] paddingcal pBuffer overflow, bufferlen=%d, paddinglen=%d, iNalIdx= %d, iCountNals= %d\n",
+             (int32_t) (pBs->pBufEnd - pBs->pBufPtr), iLen, kiNal, pCtx->pOut->iCountNals);
+#endif
+    return 0;
+  }
+
+  WelsLoadNal (pCtx->pOut, NAL_UNIT_FILLER_DATA, NRI_PRI_LOWEST);
+
+  for (int32_t i = 0; i < iLen; i++) {
+    BsWriteBits (pBs, 8, 0xff);
+  }
+
+  BsRbspTrailingBits (pBs);
+
+  BsFlush (pBs);
+
+  WelsUnloadNal (pCtx->pOut);
+  iNalLen = WelsEncodeNal (&pCtx->pOut->sNalList[kiNal], pCtx->pFrameBs + pCtx->iPosBsBuffer, &iNalLen);
+
+  pCtx->iPosBsBuffer += iNalLen;
+
+  return iNalLen;
+}
+
+/*
+ * post process of dynamic slicing bs writing in case PACKING_ONE_SLICE_PER_LAYER
+ * include: count bs size of over all the partitions in layer, and pack the coded
+ * SLayerBSInfo entries of pLayerBsInfo into one contiguous run starting at entry 0.
+ *
+ * Entry i counts as coded when partition (i % kiPartitionCnt) has coded more than
+ * (i / kiPartitionCnt) slices. Raster scan order of picture is not ensured, only
+ * the discontinuous entries are moved, i.e. with 2 partitions coding 4 and 7 slices:
+ *   coded entries: 0 1 2 3 4 5 6 7 9 11 13
+ *   after packing: 0 1 2 3 4 5 6 7 8 9 10
+ *
+ * pCtx:            encoder context (reads pCurDqLayer and pSliceThreading)
+ * pLayerBsInfo:    first per-slice layer info entry, reordered in place
+ * pLayerSize:      receives the bs size summed over all partitions
+ * kiPartitionCnt:  number of partitions to go through
+ *                  (nothing is counted or moved when it is <= 0)
+ * return: count number of slices in layer
+ */
+#if defined(MT_ENABLED) && defined(PACKING_ONE_SLICE_PER_LAYER)
+int32_t PostProcDynamicSlicingBsWriting (sWelsEncCtx* pCtx, SLayerBSInfo* pLayerBsInfo, int32_t* pLayerSize,
+    const int32_t kiPartitionCnt) {
+  SDqLayer* pCurDq = pCtx->pCurDqLayer;
+  int32_t iCheckingIdx = 0;   // entry currently examined
+  int32_t iSwappingIdx = -1;  // next destination entry, negative until the first gap is met
+  int32_t iSliceCount = 0;
+  int32_t iLayerSize = 0;
+
+  // count number of slices in layer and layer size
+  for (int32_t iPartitionIdx = 0; iPartitionIdx < kiPartitionCnt; ++ iPartitionIdx) {
+    iLayerSize += pCtx->pSliceThreading->pCountBsSizeInPartition[iPartitionIdx];
+    iSliceCount += pCurDq->pNumSliceCodedOfPartition[iPartitionIdx];
+  }
+  *pLayerSize = iLayerSize;
+
+  // walk the entries upwards; once a gap was met every later coded entry is moved down
+  // to iSwappingIdx. The walk ends as soon as iSliceCount entries are known to be packed.
+  while (true) {
+    // the owning partition is derived directly from iCheckingIdx instead of being searched;
+    // the guard keeps the division away from a partition count of 0 (nothing coded then)
+    const bool_t kbCoded = (kiPartitionCnt > 0)
+                           && (iCheckingIdx / kiPartitionCnt
+                               < pCurDq->pNumSliceCodedOfPartition[iCheckingIdx % kiPartitionCnt]);
+
+    if (kbCoded) {
+      if (iSwappingIdx >= 0) {
+        // memory swapping; source and destination are both taken from pLayerBsInfo
+        memmove (pLayerBsInfo + iSwappingIdx, pLayerBsInfo + iCheckingIdx,
+                 sizeof (SLayerBSInfo)); // confirmed_safe_unsafe_usage
+        ++ iSwappingIdx; // record iSwappingIdx
+      }
+    } else if (iSwappingIdx < 0) {
+      // first gap met: entries below iCheckingIdx are already in place
+      iSwappingIdx = iCheckingIdx;
+    }
+    ++ iCheckingIdx;
+
+    if (iSwappingIdx >= iSliceCount) {
+      break;
+    }
+  }
+
+  return iSliceCount;
+}
+#endif//MT_ENABLED && PACKING_ONE_SLICE_PER_LAYER
+
+/*
+ * Force coding IDR: sets pCtx->bEncCurFrmAsIdrFlag and resets pCtx->iCodingIndex to 0
+ * return: 0 when the context was updated, 1 when pCtx is NULL
+ */
+int32_t ForceCodingIDR (sWelsEncCtx* pCtx) {
+  if (pCtx != NULL) {
+    pCtx->iCodingIndex = 0;
+    pCtx->bEncCurFrmAsIdrFlag = true;
+    return 0;
+  }
+  return 1;
+}
+
+/*!
+ * \brief	core svc encoding process
+ *
+ * \pParam	pCtx			sWelsEncCtx*, encoder context
+ * \pParam	pDst			FrameBSInfo*
+ * \pParam	pSrc			SSourcePicture* for need_ds = true or SSourcePicture** for need_ds = false
+ * \pParam	iConfiguredLayerNum	=1 in case need_ds = true or >1 in case need_ds = false
+ * \pParam	need_ds		Indicate whether need down sampling desired
+ *						[NO in picture list case, YES in console application based]
+ * \return	EFrameType (WELS_FRAME_TYPE_IDR/WELS_FRAME_TYPE_I/WELS_FRAME_TYPE_P/WELS_FRAME_TYPE_SKIP), or -1 on failure
+ */
+int32_t WelsEncoderEncodeExt (sWelsEncCtx* pCtx, void* pDst, const SSourcePicture** ppSrcList,
+                              const int32_t iConfiguredLayerNum) {
+  SFrameBSInfo* pFbi					= (SFrameBSInfo*)pDst;
+  SLayerBSInfo* pLayerBsInfo					= &pFbi->sLayerInfo[0];
+  SWelsSvcCodingParam* pSvcParam	= pCtx->pSvcParam;
+  SSpatialPicIndex* pSpatialIndexMap = &pCtx->sSpatialIndexMap[0];
+#if defined(ENABLE_FRAME_DUMP) || defined(ENABLE_PSNR_CALC)
+  SPicture* fsnr						= NULL;
+#endif//ENABLE_FRAME_DUMP || ENABLE_PSNR_CALC
+  SPicture* pEncPic						= NULL;	// to be decided later
+#if defined(MT_ENABLED) && (defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG))
+  int32_t did_list[MAX_DEPENDENCY_LAYER]	= {0};
+#endif//MT_ENABLED && DYNAMIC_SLICE_ASSIGN
+  int32_t iLayerNum					= 0;
+  int32_t iLayerSize					= 0;
+  int32_t iSpatialNum					= 0; // available count number of spatial layers due to frame size changed in this given frame
+  int32_t iSpatialIdx					= 0; // iIndex of spatial layers due to frame size changed in this given frame
+  int32_t iFrameSize					= 0;
+  int32_t iNalLen[128]				= {0};
+  int32_t iNalIdxInLayer			= 0;
+  int32_t iCountNal					= 0;
+  EFrameType eFrameType				= WELS_FRAME_TYPE_AUTO;
+  int32_t iCurWidth					= 0;
+  int32_t iCurHeight					= 0;
+  EWelsNalUnitType eNalType			= NAL_UNIT_UNSPEC_0;
+  EWelsNalRefIdc eNalRefIdc			= NRI_PRI_LOWEST;
+  int8_t iCurDid						= 0;
+  int8_t iCurTid						= 0;
+  bool_t bAvcBased					= false;
+#if defined(ENABLE_PSNR_CALC)
+  real32_t snr_y = .0f, snr_u = .0f, snr_v = .0f;
+#endif//ENABLE_PSNR_CALC
+
+#if defined(_DEBUG)
+  int32_t i = 0, j = 0, k = 0;
+#endif//_DEBUG
+
+  pFbi->iLayerNum	= 0;	// for initialization
+
+  // perform csc/denoise/downsample/padding, generate spatial layers
+  iSpatialNum = pCtx->pVpp->WelsPreprocessStep1 (pCtx, ppSrcList, iConfiguredLayerNum);
+  if (iSpatialNum < 1) {	// skip due to temporal layer settings (different frame rate)
+    ++ pCtx->iCodingIndex;
+    return WELS_FRAME_TYPE_SKIP;
+  }
+
+  eFrameType = DecideFrameType (pCtx, iSpatialNum);
+  if (eFrameType == WELS_FRAME_TYPE_SKIP)
+    return eFrameType;
+
+  InitFrameCoding (pCtx, eFrameType);
+
+  iCurTid	= GetTemporalLevel (&pSvcParam->sDependencyLayers[pSpatialIndexMap->iDid], pCtx->iCodingIndex,
+                              pSvcParam->uiGopSize);
+  pCtx->uiTemporalId	= iCurTid;
+
+  pLayerBsInfo->pBsBuf	= pCtx->pFrameBs ;
+
+  if (eFrameType == WELS_FRAME_TYPE_IDR) {
+    ++ pCtx->sPSOVector.uiIdrPicId;
+    //if ( pSvcParam->bEnableSSEI )
+
+    // write parameter sets bitstream here
+    WelsWriteParameterSets (pCtx, &iNalLen[0], &iCountNal);
+
+    pLayerBsInfo->uiPriorityId	= 0;
+    pLayerBsInfo->uiSpatialId		= 0;
+    pLayerBsInfo->uiTemporalId	= 0;
+    pLayerBsInfo->uiQualityId		= 0;
+    pLayerBsInfo->uiLayerType		= NON_VIDEO_CODING_LAYER;
+    pLayerBsInfo->iNalCount		= iCountNal;
+    for (int32_t iNalIndex	= 0; iNalIndex < iCountNal; ++ iNalIndex) {
+      pLayerBsInfo->iNalLengthInByte[iNalIndex]	= iNalLen[iNalIndex];
+    }
+
+    ++ pLayerBsInfo;
+    pLayerBsInfo->pBsBuf			= pCtx->pFrameBs + pCtx->iPosBsBuffer;
+    ++ iLayerNum;
+  }
+
+  pCtx->pCurDqLayer				= pCtx->ppDqLayerList[pSpatialIndexMap->iDid];
+  pCtx->pCurDqLayer->pRefLayer	= NULL;
+
+  while (iSpatialIdx < iSpatialNum) {
+    const int32_t d_idx			= (pSpatialIndexMap + iSpatialIdx)->iDid;	// get iDid
+    SDLayerParam* param_d		= &pSvcParam->sDependencyLayers[d_idx];
+
+    pCtx->uiDependencyId	= iCurDid = (int8_t)d_idx;
+    pCtx->pVpp->WelsPreprocessStep3 (pCtx, d_idx);
+
+    pCtx->pEncPic	 = pEncPic = (pSpatialIndexMap + iSpatialIdx)->pSrc;
+    pCtx->pEncPic->iPictureType	= pCtx->eSliceType;
+    pCtx->pEncPic->iFramePoc		= pCtx->iPOC;
+
+    iCurWidth	= param_d->iFrameWidth;
+    iCurHeight	= param_d->iFrameHeight;
+
+#if defined(MT_ENABLED) && (defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG))
+    did_list[iSpatialIdx]	= iCurDid;
+#endif//MT_ENABLED && DYNAMIC_SLICE_ASSIGN
+
+    // Encoding this picture might mulitiple sQualityStat layers potentially be encoded as followed
+
+    switch (param_d->sMso.uiSliceMode) {
+    case SM_FIXEDSLCNUM_SLICE: {
+#if defined(MT_ENABLED) && defined(DYNAMIC_SLICE_ASSIGN)
+      if ((iCurDid > 0) && (pSvcParam->iMultipleThreadIdc > 1) &&
+          (pSvcParam->sDependencyLayers[iCurDid].sMso.uiSliceMode == SM_FIXEDSLCNUM_SLICE
+           && pSvcParam->iMultipleThreadIdc >= pSvcParam->sDependencyLayers[iCurDid].sMso.sSliceArgument.iSliceNum)
+         )
+        AdjustEnhanceLayer (pCtx, iCurDid);
+#endif//MT_ENABLED && DYNAMIC_SLICE_ASSIGN
+      break;
+    }
+    case SM_DYN_SLICE: {
+      int32_t iPicIPartitionNum = PicPartitionNumDecision (pCtx);
+      // MT compatibility
+      pCtx->iActiveThreadsNum	=
+        iPicIPartitionNum;	// we try to active number of threads, equal to number of picture partitions
+      WelsInitCurrentDlayerMltslc (pCtx, iPicIPartitionNum);
+      break;
+    }
+    default: {
+      break;
+    }
+    }
+
+    /* coding each spatial layer, only one sQualityStat layer within spatial support */
+    int32_t iSliceCount	= 1;
+    if (iLayerNum >= MAX_LAYER_NUM_OF_FRAME) {	// check available layer_bs_info writing as follows
+      WelsLog (pCtx, WELS_LOG_ERROR, "WelsEncoderEncodeExt(), iLayerNum(%d) overflow(max:%d)!", iLayerNum,
+               MAX_LAYER_NUM_OF_FRAME);
+      return -1;
+    }
+
+    iNalIdxInLayer	= 0;
+    bAvcBased	= (iCurDid == BASE_DEPENDENCY_ID);
+    pCtx->bNeedPrefixNalFlag	= (bAvcBased &&
+                                 (pSvcParam->bPrefixNalAddingCtrl ||
+                                  (pSvcParam->iNumDependencyLayer > 1)));
+
+    if (eFrameType == WELS_FRAME_TYPE_P) {
+      eNalType	= bAvcBased ? NAL_UNIT_CODED_SLICE : NAL_UNIT_CODED_SLICE_EXT;
+    } else if (eFrameType == WELS_FRAME_TYPE_IDR) {
+      eNalType	= bAvcBased ? NAL_UNIT_CODED_SLICE_IDR : NAL_UNIT_CODED_SLICE_EXT;
+    }
+    if (iCurTid == 0 || pCtx->eSliceType == I_SLICE)
+      eNalRefIdc	= NRI_PRI_HIGHEST;
+    else if (iCurTid == pSvcParam->iDecompStages)
+      eNalRefIdc	= NRI_PRI_LOWEST;
+    else if (1 + iCurTid == pSvcParam->iDecompStages)
+      eNalRefIdc	= NRI_PRI_LOW;
+    else	// more details for other temporal layers?
+      eNalRefIdc	= NRI_PRI_HIGHEST;
+    pCtx->eNalType		= eNalType;
+    pCtx->eNalPriority	= eNalRefIdc;
+
+    pCtx->pDecPic					= pCtx->ppRefPicListExt[iCurDid]->pNextBuffer;
+#if defined(ENABLE_FRAME_DUMP) || defined(ENABLE_PSNR_CALC)
+    fsnr					= pCtx->pDecPic;
+#endif//#if defined(ENABLE_FRAME_DUMP) || defined(ENABLE_PSNR_CALC)
+    pCtx->pDecPic->iPictureType	= pCtx->eSliceType;
+    pCtx->pDecPic->iFramePoc		= pCtx->iPOC;
+
+    WelsInitCurrentLayer (pCtx, iCurWidth, iCurHeight);
+
+    WelsMarkPic (pCtx);
+    if (!WelsBuildRefList (pCtx, pCtx->iPOC)) {
+      // Force coding IDR as followed
+      ForceCodingIDR (pCtx);
+      WelsLog (pCtx, WELS_LOG_WARNING, "WelsEncoderEncodeExt(), WelsBuildRefList failed for P frames, pCtx->iNumRef0= %d.\n",
+               pCtx->iNumRef0);
+      return -1;
+    }
+#ifdef LONG_TERM_REF_DUMP
+    dump_ref (pCtx);
+#endif
+    WelsUpdateRefSyntax (pCtx,  pCtx->iPOC,
+                         eFrameType);	//get reordering syntax used for writing slice header and transmit to encoder.
+    PrefetchReferencePicture (pCtx, eFrameType);	// update reference picture for current pDq layer
+
+    pCtx->pFuncList->pfRc.pfWelsRcPictureInit (pCtx);
+    PreprocessSliceCoding (pCtx);	// MUST be called after pfWelsRcPictureInit() and WelsInitCurrentLayer()
+
+    iLayerSize	= 0;
+    if (SM_SINGLE_SLICE == param_d->sMso.uiSliceMode) {	// only one slice within a sQualityStat layer
+      int32_t iSliceSize = 0;
+
+      if (pCtx->bNeedPrefixNalFlag) {
+        iLayerSize += AddPrefixNal (pCtx, pLayerBsInfo, &iNalLen[0], &iNalIdxInLayer, eNalType, eNalRefIdc);
+      }
+
+      WelsLoadNal (pCtx->pOut, eNalType, eNalRefIdc);
+
+      WelsCodeOneSlice (pCtx, 0, eNalType);
+
+      WelsUnloadNal (pCtx->pOut);
+
+      iSliceSize = WelsEncodeNalExt (&pCtx->pOut->sNalList[pCtx->pOut->iNalIndex - 1],
+                                     &pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt,
+                                     pCtx->pFrameBs + pCtx->iPosBsBuffer,
+                                     &iNalLen[iNalIdxInLayer]);
+      iLayerSize += iSliceSize;
+      pCtx->iPosBsBuffer	+= iSliceSize;
+      pLayerBsInfo->uiLayerType		= VIDEO_CODING_LAYER;
+      pLayerBsInfo->uiSpatialId		= iCurDid;
+      pLayerBsInfo->uiTemporalId	= iCurTid;
+      pLayerBsInfo->uiQualityId		= 0;
+      pLayerBsInfo->uiPriorityId	= 0;
+      pLayerBsInfo->iNalLengthInByte[iNalIdxInLayer]	= iSliceSize;
+      pLayerBsInfo->iNalCount		= ++ iNalIdxInLayer;
+    }
+    // for dynamic slicing single threading..
+#ifndef MT_ENABLED
+    else if (SM_DYN_SLICE == param_d->sMso.uiSliceMode)
+#else	// MT_ENABLED
+    else if ((SM_DYN_SLICE == param_d->sMso.uiSliceMode) && (pSvcParam->iMultipleThreadIdc <= 1))
+#endif//MT_ENABLED
+    {
+      const int32_t kiLastMbInFrame = pCtx->pCurDqLayer->pSliceEncCtx->iMbNumInFrame;
+      WelsCodeOnePicPartition (pCtx, pLayerBsInfo, &iNalIdxInLayer, &iLayerSize, 0, kiLastMbInFrame, 0);
+    } else {
+      //other multi-slice uiSliceMode
+#if defined(MT_ENABLED)
+      int err = 0;
+      // THREAD_FULLY_FIRE_MODE/THREAD_PICK_UP_MODE for any mode of non-SM_DYN_SLICE
+      if ((SM_DYN_SLICE != param_d->sMso.uiSliceMode) && (pSvcParam->iMultipleThreadIdc > 1)) {
+        iSliceCount	= GetCurrentSliceNum (pCtx->pCurDqLayer->pSliceEncCtx);
+        if (iLayerNum +
+#if defined(PACKING_ONE_SLICE_PER_LAYER)
+            iSliceCount
+#else
+            1
+#endif//PACKING_ONE_SLICE_PER_LAYER
+            >= MAX_LAYER_NUM_OF_FRAME) {	// check available layer_bs_info for further writing as followed
+          WelsLog (pCtx, WELS_LOG_ERROR,
+                   "WelsEncoderEncodeExt(), iLayerNum(%d) overflow(max:%d) at iDid= %d uiSliceMode= %d, iSliceCount= %d!",
+                   iLayerNum, MAX_LAYER_NUM_OF_FRAME, iCurDid, param_d->sMso.uiSliceMode, iSliceCount);
+          return -1;
+        }
+        if (iSliceCount <= 1) {
+          WelsLog (pCtx, WELS_LOG_ERROR,
+                   "WelsEncoderEncodeExt(), iSliceCount(%d) from GetCurrentSliceNum() is untrusted due stack/heap crupted!\n",
+                   iSliceCount);
+          return -1;
+        }
+
+        if (pSvcParam->iCountThreadsNum >= iSliceCount) {	//THREAD_FULLY_FIRE_MODE
+#if defined(PACKING_ONE_SLICE_PER_LAYER)
+          int32_t iSliceIdx = 1;
+          int32_t iOrgSlicePos[MAX_SLICES_NUM] = {0};
+          iOrgSlicePos[0] = pCtx->iPosBsBuffer;
+          while (uiSliceIdx < iSliceCount) {
+            iOrgSlicePos[uiSliceIdx] = pCtx->pSliceBs[uiSliceIdx].uiBsPos;
+            ++ uiSliceIdx;
+          }
+#elif defined(MT_DEBUG)
+          int64_t t_bs_append = 0;
+#endif//PACKING_ONE_SLICE_PER_LAYER
+
+          pCtx->iActiveThreadsNum	= iSliceCount;
+          // to fire slice coding threads
+          err = FiredSliceThreads (&pCtx->pSliceThreading->pThreadPEncCtx[0], &pCtx->pSliceThreading->pReadySliceCodingEvent[0],
+                                   pLayerBsInfo, iSliceCount, pCtx->pCurDqLayer->pSliceEncCtx, FALSE);
+          if (err) {
+            WelsLog (pCtx, WELS_LOG_ERROR,
+                     "[MT] WelsEncoderEncodeExt(), FiredSliceThreads return(%d) failed and exit encoding frame, iCountThreadsNum= %d, iSliceCount= %d, uiSliceMode= %d, iMultipleThreadIdc= %d!!\n",
+                     err, pSvcParam->iCountThreadsNum, iSliceCount, param_d->sMso.uiSliceMode, pSvcParam->iMultipleThreadIdc);
+            return -1;
+          }
+
+          WelsMultipleEventsWaitAllBlocking (iSliceCount, &pCtx->pSliceThreading->pSliceCodedEvent[0]);
+
+
+          // all slices are finished coding here
+          // append exclusive slice 0 bs to pFrameBs
+#if defined(PACKING_ONE_SLICE_PER_LAYER)
+          iLayerSize = pCtx->iPosBsBuffer - iOrgSlicePos[0];
+          uiSliceIdx = 1;
+          while (uiSliceIdx < iSliceCount) {
+            iLayerSize += pCtx->pSliceBs[uiSliceIdx].uiBsPos - iOrgSlicePos[uiSliceIdx];
+            ++ uiSliceIdx;
+          }
+          iLayerNum += iSliceCount;	// each slice stickly output as layer info for performance improvement directly
+          pLayerBsInfo += iSliceCount;
+#else
+#if defined(MT_DEBUG)
+          t_bs_append = WelsTime();
+#endif//MT_DEBUG
+          iLayerSize = AppendSliceToFrameBs (pCtx, pLayerBsInfo, iSliceCount);
+#if defined(MT_DEBUG)
+          t_bs_append = WelsTime() - t_bs_append;
+          if (pCtx->pSliceThreading->pFSliceDiff) {
+            fprintf (pCtx->pSliceThreading->pFSliceDiff,
+#if defined(WIN32)
+                     "%6I64d us consumed at AppendSliceToFrameBs() for coding_idx: %d iDid: %d qid: %d\n",
+#else
+                     "%6lld us consumed at AppendSliceToFrameBs() for coding_idx: %d iDid: %d qid: %d\n",
+#endif//WIN32
+                     t_bs_append, pCtx->iCodingIndex, iCurDid, 0);
+          }
+#endif//MT_DEBUG
+#endif//PACKING_ONE_SLICE_PER_LAYER
+        } else {	//THREAD_PICK_UP_MODE
+          int32_t iNumThreadsRunning = 0;
+          int32_t iNumThreadsScheduled = 0;
+          int32_t iIndexOfSliceToBeCoded = 0;
+#if defined(PACKING_ONE_SLICE_PER_LAYER)
+          int32_t iSliceIdx = 1;
+          int32_t iOrgSlicePos[MAX_SLICES_NUM] = {0};
+          iOrgSlicePos[0] = pCtx->iPosBsBuffer;
+          while (uiSliceIdx < iSliceCount) {
+            iOrgSlicePos[uiSliceIdx] = pCtx->pSliceBs[uiSliceIdx].uiBsPos;
+            ++ uiSliceIdx;
+          }
+#endif//PACKING_ONE_SLICE_PER_LAYER
+
+          pCtx->iActiveThreadsNum	= pSvcParam->iCountThreadsNum;
+          iNumThreadsScheduled	= pCtx->iActiveThreadsNum;
+          iNumThreadsRunning		= iNumThreadsScheduled;
+          // to fire slice coding threads
+          err = FiredSliceThreads (&pCtx->pSliceThreading->pThreadPEncCtx[0], &pCtx->pSliceThreading->pReadySliceCodingEvent[0],
+                                   pLayerBsInfo, iNumThreadsRunning, pCtx->pCurDqLayer->pSliceEncCtx, FALSE);
+          if (err) {
+            WelsLog (pCtx, WELS_LOG_ERROR,
+                     "[MT] WelsEncoderEncodeExt(), FiredSliceThreads return(%d) failed and exit encoding frame, iCountThreadsNum= %d, iSliceCount= %d, uiSliceMode= %d, iMultipleThreadIdc= %d!!\n",
+                     err, pSvcParam->iCountThreadsNum, iSliceCount, param_d->sMso.uiSliceMode, pSvcParam->iMultipleThreadIdc);
+            return -1;
+          }
+
+          iIndexOfSliceToBeCoded = iNumThreadsRunning;
+          while (1) {
+            if (iIndexOfSliceToBeCoded >= iSliceCount && iNumThreadsRunning <= 0)
+              break;
+#ifdef WIN32
+            WELS_THREAD_ERROR_CODE lwait	= 0;
+            int32_t iEventId				= -1;
+
+            lwait = WelsMultipleEventsWaitSingleBlocking (iNumThreadsScheduled,
+                    &pCtx->pSliceThreading->pSliceCodedEvent[0],
+                    2);	// 2 ms for one tick
+            iEventId = (int32_t) (lwait - WELS_THREAD_ERROR_WAIT_OBJECT_0);
+            if (iEventId >= 0 && iEventId < iNumThreadsScheduled) {
+              if (iIndexOfSliceToBeCoded < iSliceCount) {
+                // pick up succeeding slice for threading
+                // thread_id equal to iEventId per implementation here
+                pCtx->pSliceThreading->pThreadPEncCtx[iEventId].iSliceIndex	= iIndexOfSliceToBeCoded;
+#ifdef PACKING_ONE_SLICE_PER_LAYER
+                pCtx->pSliceThreading->pThreadPEncCtx[iEventId].pLayerBs	= pLayerBsInfo + iIndexOfSliceToBeCoded;
+#endif//PACKING_ONE_SLICE_PER_LAYER
+                WelsEventSignal (&pCtx->pSliceThreading->pReadySliceCodingEvent[iEventId]);
+
+                ++ iIndexOfSliceToBeCoded;
+              } else {	// no other slices left for coding
+                -- iNumThreadsRunning;
+              }
+            } else {
+              WelsSleep (1);
+            }
+#else//__GNUC__
+            // TODO for pthread platforms
+            // alternate implementation using blocking due non-blocking with timeout mode not support at wels thread lib, tune back if available
+            WelsMultipleEventsWaitAllBlocking (iNumThreadsRunning, &pCtx->pSliceThreading->pSliceCodedEvent[0]);
+            if (iIndexOfSliceToBeCoded < iSliceCount) {
+              int32_t iThreadIdx = 0;
+              // pick up succeeding slices for threading if left
+              while (iThreadIdx < iNumThreadsScheduled) {
+                if (iIndexOfSliceToBeCoded >= iSliceCount)
+                  break;
+                pCtx->pSliceThreading->pThreadPEncCtx[iThreadIdx].iSliceIndex = iIndexOfSliceToBeCoded;
+#ifdef PACKING_ONE_SLICE_PER_LAYER
+                pCtx->pSliceThreading->pThreadPEncCtx[iThreadIdx].pLayerBs = pLayerBsInfo + iIndexOfSliceToBeCoded;
+#endif//PACKING_ONE_SLICE_PER_LAYER
+                WelsEventSignal (pCtx->pSliceThreading->pReadySliceCodingEvent[iThreadIdx]);
+
+                ++ iIndexOfSliceToBeCoded;
+                ++ iThreadIdx;
+              }
+              // update iNumThreadsRunning
+              iNumThreadsRunning		= iThreadIdx;
+            } else {
+              iNumThreadsRunning = 0;
+            }
+#endif//WIN32
+          }//while(1)
+
+// all slices are finished coding here
+          // append exclusive slice 0 bs to pFrameBs
+#if defined(PACKING_ONE_SLICE_PER_LAYER)
+          iLayerSize = pCtx->iPosBsBuffer - iOrgSlicePos[0];
+          uiSliceIdx = 1;
+          while (uiSliceIdx < iSliceCount) {
+            iLayerSize += pCtx->pSliceBs[uiSliceIdx].uiBsPos - iOrgSlicePos[uiSliceIdx];
+            ++ uiSliceIdx;
+          }
+          iLayerNum += iSliceCount;	// each slice stickly output as layer info for performance improvement directly
+          pLayerBsInfo += iSliceCount;
+#else
+          iLayerSize = AppendSliceToFrameBs (pCtx, pLayerBsInfo, iSliceCount);
+#endif//PACKING_ONE_SLICE_PER_LAYER
+        }
+      }
+      // THREAD_FULLY_FIRE_MODE && SM_DYN_SLICE
+      else if ((SM_DYN_SLICE == param_d->sMso.uiSliceMode) && (pSvcParam->iMultipleThreadIdc > 1)) {
+        const int32_t kiPartitionCnt	= pCtx->iActiveThreadsNum; //pSvcParam->iCountThreadsNum;
+#if defined(PACKING_ONE_SLICE_PER_LAYER)
+        ResetCountBsSizeInPartitions (pCtx->pSliceThreading->pCountBsSizeInPartition, kiPartitionCnt);
+        pCtx->pCurDqLayer->pSliceEncCtx->iMaxSliceNumConstraint = WELS_MIN (MAX_SLICES_NUM,
+            DynamicMaxSliceNumConstraint (MAX_LAYER_NUM_OF_FRAME, iLayerNum, 1 + /*( num_qlayer - 1) +*/ (((iCurDid == 0)
+                                          && (pSvcParam->uiGopSize > 1)) ? 1 : 0)));
+#endif//PACKING_ONE_SLICE_PER_LAYER
+
+        // to fire slice coding threads
+        err = FiredSliceThreads (&pCtx->pSliceThreading->pThreadPEncCtx[0], &pCtx->pSliceThreading->pReadySliceCodingEvent[0],
+                                 pLayerBsInfo, kiPartitionCnt, pCtx->pCurDqLayer->pSliceEncCtx, TRUE);
+        if (err) {
+          WelsLog (pCtx, WELS_LOG_ERROR,
+                   "[MT] WelsEncoderEncodeExt(), FiredSliceThreads return(%d) failed and exit encoding frame, iCountThreadsNum= %d, iSliceCount= %d, uiSliceMode= %d, iMultipleThreadIdc= %d!!\n",
+                   err, pSvcParam->iCountThreadsNum, iSliceCount, param_d->sMso.uiSliceMode, pSvcParam->iMultipleThreadIdc);
+          return -1;
+        }
+
+        WelsMultipleEventsWaitAllBlocking (kiPartitionCnt, &pCtx->pSliceThreading->pSliceCodedEvent[0]);
+
+#if defined(PACKING_ONE_SLICE_PER_LAYER)
+        iSliceCount = PostProcDynamicSlicingBsWriting (pCtx, pLayerBsInfo, &iLayerSize, kiPartitionCnt);
+        assert (iLayerNum + iSliceCount < MAX_LAYER_NUM_OF_FRAME);
+        pLayerBsInfo += iSliceCount;
+        iLayerNum += iSliceCount;
+#else
+        iLayerSize = AppendSliceToFrameBs (pCtx, pLayerBsInfo, kiPartitionCnt);
+#endif//PACKING_ONE_SLICE_PER_LAYER
+      } else	// for non-dynamic-slicing mode single threading branch..
+#endif//MT_ENABLED
+      {
+        const bool_t bNeedPrefix	= pCtx->bNeedPrefixNalFlag;
+        int32_t iSliceIdx			= 0;
+
+        iSliceCount	= GetCurrentSliceNum (pCtx->pCurDqLayer->pSliceEncCtx);
+        while (iSliceIdx < iSliceCount) {
+          int32_t iSliceSize	= 0;
+
+          if (bNeedPrefix) {
+            iLayerSize += AddPrefixNal (pCtx, pLayerBsInfo, &iNalLen[0], &iNalIdxInLayer, eNalType, eNalRefIdc);
+          }
+
+          WelsLoadNal (pCtx->pOut, eNalType, eNalRefIdc);
+          WelsCodeOneSlice (pCtx, iSliceIdx, eNalType);
+          WelsUnloadNal (pCtx->pOut);
+
+          iSliceSize = WelsEncodeNalExt (&pCtx->pOut->sNalList[pCtx->pOut->iNalIndex - 1],
+                                         &pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt,
+                                         pCtx->pFrameBs + pCtx->iPosBsBuffer,
+                                         &iNalLen[iNalIdxInLayer]);
+          pCtx->iPosBsBuffer	+= iSliceSize;
+          iLayerSize	+= iSliceSize;
+          pLayerBsInfo->iNalLengthInByte[iNalIdxInLayer]	= iSliceSize;
+
+#if defined(SLICE_INFO_OUTPUT)
+          fprintf (stderr,
+                   "@slice=%-6d sliceType:%c idc:%d size:%-6d\n",
+                   iSliceIdx,
+                   (pCtx->eSliceType == P_SLICE ? 'P' : 'I'),
+                   eNalRefIdc,
+                   iSliceSize);
+#endif//SLICE_INFO_OUTPUT						
+          ++ iNalIdxInLayer;
+          ++ iSliceIdx;
+        }
+
+        pLayerBsInfo->uiLayerType		= VIDEO_CODING_LAYER;
+        pLayerBsInfo->uiSpatialId		= iCurDid;
+        pLayerBsInfo->uiTemporalId	= iCurTid;
+        pLayerBsInfo->uiQualityId		= 0;
+        pLayerBsInfo->uiPriorityId	= 0;
+        pLayerBsInfo->iNalCount		= iNalIdxInLayer;
+      }
+    }
+
+    // deblocking filter
+    if (
+#if defined(MT_ENABLED)
+      (!pCtx->pCurDqLayer->bDeblockingParallelFlag) &&
+#endif//MT_ENABLED
+#if !defined(ENABLE_FRAME_DUMP)
+      ((eNalRefIdc != NRI_PRI_LOWEST) && (param_d->iHighestTemporalId == 0 || iCurTid < param_d->iHighestTemporalId)) &&
+#endif//!ENABLE_FRAME_DUMP
+      true
+    ) {
+      PerformDeblockingFilter (pCtx);
+    }
+
+    // reference picture list update
+    if (eNalRefIdc != NRI_PRI_LOWEST) {
+      if (!WelsUpdateRefList (pCtx)) {
+        // Force coding IDR as followed
+        ForceCodingIDR (pCtx);
+        WelsLog (pCtx, WELS_LOG_WARNING, "WelsEncoderEncodeExt(), WelsUpdateRefList failed.\n");
+        return -1;
+      }
+    }
+
+    iFrameSize += iLayerSize;
+
+    pCtx->pFuncList->pfRc.pfWelsRcPictureInfoUpdate (pCtx, iLayerSize);
+
+#ifdef ENABLE_FRAME_DUMP
+    // Dump reconstruction picture for each sQualityStat layer
+    if (iCurDid + 1 < pSvcParam->iNumDependencyLayer)
+      DumpDependencyRec (fsnr, &param_d->sRecFileName[0], iCurDid);
+#endif//ENABLE_FRAME_DUMP
+
+#if defined(ENABLE_PSNR_CALC)
+    snr_y	= WelsCalcPsnr (fsnr->pData[0],
+                          fsnr->iLineSize[0],
+                          pEncPic->pData[0],
+                          pEncPic->iLineSize[0],
+                          iCurWidth,
+                          iCurHeight);
+    snr_u	= WelsCalcPsnr (fsnr->pData[1],
+                          fsnr->iLineSize[1],
+                          pEncPic->pData[1],
+                          pEncPic->iLineSize[1],
+                          (iCurWidth >> 1),
+                          (iCurHeight >> 1));
+    snr_v	= WelsCalcPsnr (fsnr->pData[2],
+                          fsnr->iLineSize[2],
+                          pEncPic->pData[2],
+                          pEncPic->iLineSize[2],
+                          (iCurWidth >> 1),
+                          (iCurHeight >> 1));
+#endif//ENABLE_PSNR_CALC
+
+#if defined(LAYER_INFO_OUTPUT)
+    fprintf (stderr, "%2s %5d: %-5d %2s   T%1d D%1d Q%-2d  QP%3d   Y%2.2f  U%2.2f  V%2.2f  %8d bits\n",
+             (iSpatialIdx == 0) ? "#AU" : "   ",
+             pCtx->iPOC,
+             pCtx->iFrameNum,
+             (uiFrameType == WELS_FRAME_TYPE_I || uiFrameType == WELS_FRAME_TYPE_IDR) ? "I" : "P",
+             iCurTid,
+             iCurDid,
+             0,
+             pCtx->pWelsSvcRc[pCtx->uiDependencyId].iAverageFrameQp,
+             snr_y,
+             snr_u,
+             snr_v,
+             (iLayerSize << 3));
+#endif//LAYER_INFO_OUTPUT
+
+#if defined(STAT_OUTPUT)
+
+#if defined(ENABLE_PSNR_CALC)
+    {
+      pCtx->sStatData[iCurDid][0].sQualityStat.rYPsnr[pCtx->eSliceType]	+= snr_y;
+      pCtx->sStatData[iCurDid][0].sQualityStat.rUPsnr[pCtx->eSliceType]	+= snr_u;
+      pCtx->sStatData[iCurDid][0].sQualityStat.rVPsnr[pCtx->eSliceType]	+= snr_v;
+    }
+#endif//ENABLE_PSNR_CALC
+
+#if defined(MB_TYPES_CHECK) //091025, frame output
+    if (pCtx->eSliceType == P_SLICE) {
+      pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][Intra4x4] += pCtx->sPerInfo.iMbCount[P_SLICE][Intra4x4];
+      pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][Intra16x16] += pCtx->sPerInfo.iMbCount[P_SLICE][Intra16x16];
+      pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][Inter16x16] += pCtx->sPerInfo.iMbCount[P_SLICE][Inter16x16];
+      pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][Inter16x8] += pCtx->sPerInfo.iMbCount[P_SLICE][Inter16x8];
+      pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][Inter8x16] += pCtx->sPerInfo.iMbCount[P_SLICE][Inter8x16];
+      pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][Inter8x8] += pCtx->sPerInfo.iMbCount[P_SLICE][Inter8x8];
+      pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][PSkip] += pCtx->sPerInfo.iMbCount[P_SLICE][PSkip];
+      pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][8] += pCtx->sPerInfo.iMbCount[P_SLICE][8];
+      pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][9] += pCtx->sPerInfo.iMbCount[P_SLICE][9];
+      pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][10] += pCtx->sPerInfo.iMbCount[P_SLICE][10];
+      pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][11] += pCtx->sPerInfo.iMbCount[P_SLICE][11];
+    } else if (pCtx->eSliceType == I_SLICE) {
+      pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[I_SLICE][Intra4x4] += pCtx->sPerInfo.iMbCount[I_SLICE][Intra4x4];
+      pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[I_SLICE][Intra16x16] += pCtx->sPerInfo.iMbCount[I_SLICE][Intra16x16];
+      pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[I_SLICE][7] += pCtx->sPerInfo.iMbCount[I_SLICE][7];
+    }
+
+    memset (pCtx->sPerInfo.iMbCount[P_SLICE], 0, 18 * sizeof (int32_t));
+    memset (pCtx->sPerInfo.iMbCount[I_SLICE], 0, 18 * sizeof (int32_t));
+
+#endif//MB_TYPES_CHECK
+    {
+      //no pCtx->pSvcParam->bMgsT0OnlyStrategy
+      ++ pCtx->sStatData[iCurDid][0].sSliceData.iSliceCount[pCtx->eSliceType];	// for multiple slices coding
+      pCtx->sStatData[iCurDid][0].sSliceData.iSliceSize[pCtx->eSliceType]	+= (iLayerSize << 3);	// bits
+    }
+#endif//STAT_OUTPUT
+
+#if defined(MT_ENABLED) && defined(PACKING_ONE_SLICE_PER_LAYER)
+    if (pSvcParam->iMultipleThreadIdc <= 1 || SM_SINGLE_SLICE == param_d->sMso.uiSliceMode)	// sigle thread actually used
+#else
+    if (1)
+#endif//MT_ENABLED && PACKING_ONE_SLICE_PER_LAYER
+    {
+      ++ iLayerNum;
+      ++ pLayerBsInfo;
+    }
+
+
+    pLayerBsInfo->pBsBuf	= pCtx->pFrameBs + pCtx->iPosBsBuffer;
+
+    if (pSvcParam->iPaddingFlag && pCtx->pWelsSvcRc[pCtx->uiDependencyId].iPaddingSize > 0) {
+      const int32_t kiPaddingNalSize = WritePadding (pCtx, pCtx->pWelsSvcRc[pCtx->uiDependencyId].iPaddingSize);
+
+#if GOM_TRACE_FLAG
+      WelsLog (pCtx, WELS_LOG_INFO, "[RC] encoding_qp%d Padding: %d\n", pCtx->uiDependencyId,
+               pCtx->pWelsSvcRc[pCtx->uiDependencyId].iPaddingSize);
+#endif
+      if (kiPaddingNalSize <= 0)
+        return -1;
+
+      pCtx->pWelsSvcRc[pCtx->uiDependencyId].iPaddingBitrateStat += pCtx->pWelsSvcRc[pCtx->uiDependencyId].iPaddingSize;
+
+      pCtx->pWelsSvcRc[pCtx->uiDependencyId].iPaddingSize = 0;
+
+      pLayerBsInfo->uiPriorityId	= 0;
+      pLayerBsInfo->uiSpatialId		= 0;
+      pLayerBsInfo->uiTemporalId	= 0;
+      pLayerBsInfo->uiQualityId		= 0;
+      pLayerBsInfo->uiLayerType		= NON_VIDEO_CODING_LAYER;
+      pLayerBsInfo->iNalCount		= 1;
+      pLayerBsInfo->iNalLengthInByte[0] = kiPaddingNalSize;
+      ++ pLayerBsInfo;
+      pLayerBsInfo->pBsBuf	= pCtx->pFrameBs + pCtx->iPosBsBuffer;
+      ++ iLayerNum;
+    }
+
+#if defined(MT_ENABLED) && defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+    if (param_d->sMso.uiSliceMode == SM_FIXEDSLCNUM_SLICE && pSvcParam->iMultipleThreadIdc > 1 &&
+        pSvcParam->iMultipleThreadIdc >= param_d->sMso.sSliceArgument.iSliceNum) {
+      CalcSliceComplexRatio (pCtx->pSliceThreading->pSliceComplexRatio[iCurDid], pCtx->pCurDqLayer->pSliceEncCtx,
+                             pCtx->pSliceThreading->pSliceConsumeTime[iCurDid]);
+#if defined(MT_DEBUG)
+      TrackSliceComplexities (pCtx, iCurDid);
+#endif//#if defined(MT_DEBUG)
+    }
+#endif//MT_ENABLED && DYNAMIC_SLICE_ASSIGN && TRY_SLICING_BALANCE
+
+    ++ iSpatialIdx;
+
+    if (iCurDid + 1 < pSvcParam->iNumDependencyLayer) {
+      WelsSwapDqLayers (pCtx);
+    }
+
+    if (pSvcParam->bEnableLongTermReference && (pCtx->pLtr[pCtx->uiDependencyId].bLTRMarkingFlag
+        && (pCtx->pLtr[pCtx->uiDependencyId].iLTRMarkMode == LTR_DELAY_MARK))) {
+      pCtx->bLongTermRefFlag[d_idx][0] = true;
+    }
+
+    if (iCurTid < pCtx->uiSpatialLayersInTemporal[d_idx] - 1 || pSvcParam->iDecompStages == 0) {
+      if ((iCurTid >= MAX_TEMPORAL_LEVEL) || (pCtx->uiSpatialLayersInTemporal[d_idx] - 1 >= MAX_TEMPORAL_LEVEL)) {
+        ForceCodingIDR (pCtx);	// some logic error
+        return -1;
+      }
+
+      if (pSvcParam->bEnableLongTermReference && pCtx->bLongTermRefFlag[d_idx][iCurTid]) {
+        SPicture* tmp	= pCtx->pSpatialPic[d_idx][pCtx->uiSpatialLayersInTemporal[d_idx] + pCtx->pVaa->uiMarkLongTermPicIdx];
+        pCtx->pSpatialPic[d_idx][pCtx->uiSpatialLayersInTemporal[d_idx] + pCtx->pVaa->uiMarkLongTermPicIdx] =
+          pCtx->pSpatialPic[d_idx][iCurTid];
+        pCtx->pSpatialPic[d_idx][iCurTid] = pCtx->pSpatialPic[d_idx][pCtx->uiSpatialLayersInTemporal[d_idx] - 1];
+        pCtx->pSpatialPic[d_idx][pCtx->uiSpatialLayersInTemporal[d_idx] - 1] = tmp;
+        pCtx->bLongTermRefFlag[d_idx][iCurTid] = false;
+      } else {
+        WelsExchangeSpatialPictures (&pCtx->pSpatialPic[d_idx][pCtx->uiSpatialLayersInTemporal[d_idx] - 1],
+                                     &pCtx->pSpatialPic[d_idx][iCurTid]);
+      }
+    }
+
+    if (pSvcParam->bEnableLongTermReference && ((pCtx->pLtr[pCtx->uiDependencyId].bLTRMarkingFlag
+        && (pCtx->pLtr[pCtx->uiDependencyId].iLTRMarkMode == LTR_DIRECT_MARK)) || eFrameType == WELS_FRAME_TYPE_IDR)) {
+      pCtx->bLongTermRefFlag[d_idx][iCurTid] = true;
+    }
+  }
+
+#if defined(MT_ENABLED) && defined(MT_DEBUG)
+  TrackSliceConsumeTime (pCtx, did_list, iSpatialNum);
+#endif//MT_ENABLED && MT_DEBUG
+
+#if defined(MT_ENABLED) && defined(DYNAMIC_SLICE_ASSIGN)
+  if (pSvcParam->iMultipleThreadIdc > 1 && did_list[0] == BASE_DEPENDENCY_ID
+      && pSvcParam->sDependencyLayers[0].sMso.uiSliceMode == SM_FIXEDSLCNUM_SLICE
+      && pSvcParam->iMultipleThreadIdc >= pSvcParam->sDependencyLayers[0].sMso.sSliceArgument.iSliceNum
+      && pSvcParam->sDependencyLayers[did_list[iSpatialNum - 1]].sMso.uiSliceMode == SM_FIXEDSLCNUM_SLICE
+      && pSvcParam->iMultipleThreadIdc >= pSvcParam->sDependencyLayers[did_list[iSpatialNum -
+          1]].sMso.sSliceArgument.iSliceNum) {
+    AdjustBaseLayer (pCtx);
+  }
+#endif//DYNAMIC_SLICE_ASSIGN
+
+#ifdef ENABLE_FRAME_DUMP
+  DumpRecFrame (fsnr, &pSvcParam->sDependencyLayers[pSvcParam->iNumDependencyLayer -
+                1].sRecFileName[0]);	// pDecPic: final reconstruction output
+#endif//ENABLE_FRAME_DUMP
+
+  ++ pCtx->iCodingIndex;
+  pCtx->eLastNalPriority	= eNalRefIdc;
+  pFbi->iLayerNum			= iLayerNum;
+
+#if defined(X86_ASM)
+  WelsEmms();
+#endif //X86_ASM	
+
+  return eFrameType;
+}
+
+/*!
+ * \brief	Wels SVC encoder parameters adjustment: re-initializes the encoder from pNewParam when a structural setting changed, otherwise copies the adjustable fields into the live parameter set (original note: SVC adjustment results in new memory block requirements).
+ *			\param ppCtx encoder context handle, passed on to WelsUninitEncoderExt/WelsInitEncoderExt when a reset is needed; \param pNewParam requested settings (several fields are clipped in place on the no-reset path); \return 0 on success, 1 on NULL input or failed re-initialization, otherwise the non-zero result of ParamValidationExt.
+ */
+int32_t WelsEncoderParamAdjust (sWelsEncCtx** ppCtx, SWelsSvcCodingParam* pNewParam) {
+  SWelsSvcCodingParam* pOldParam		= NULL;
+  int32_t iReturn = 0;
+  int8_t iIndexD = 0;
+  bool_t bNeedReset = false;
+
+  if (NULL == ppCtx || NULL == *ppCtx || NULL == pNewParam)	return 1;
+
+  /* Check validation in new parameters */
+  iReturn	= ParamValidationExt (pNewParam);
+  if (iReturn != 0)	return iReturn;
+
+  pOldParam	= (*ppCtx)->pSvcParam;
+
+  /* Decide whether a full encoder reset is needed, based on which parameters changed */
+  /* Temporal levels, spatial settings and/ or quality settings changed need update parameter sets related. */
+  bNeedReset	=	(pOldParam == NULL) ||
+                (pOldParam->iNumTemporalLayer != pNewParam->iNumTemporalLayer) ||
+                (pOldParam->uiGopSize != pNewParam->uiGopSize) ||
+                (pOldParam->iNumDependencyLayer != pNewParam->iNumDependencyLayer) ||
+                (pOldParam->iDecompStages != pNewParam->iDecompStages) ||
+                (pOldParam->iActualPicWidth != pNewParam->iActualPicWidth
+                 || pOldParam->iActualPicHeight != pNewParam->iActualPicHeight) ||
+                (pOldParam->SUsedPicRect.iWidth != pNewParam->SUsedPicRect.iWidth
+                 || pOldParam->SUsedPicRect.iHeight != pNewParam->SUsedPicRect.iHeight) ||
+                (pOldParam->bEnableLongTermReference != pNewParam->bEnableLongTermReference);
+  if (!bNeedReset) {	// Check its picture resolutions/quality settings respectively in each dependency layer
+    iIndexD = 0;
+    assert (pOldParam->iNumDependencyLayer == pNewParam->iNumDependencyLayer);	// holds here: a differing layer count already set bNeedReset above
+    do {
+      const SDLayerParam* kpOldDlp	= &pOldParam->sDependencyLayers[iIndexD];
+      const SDLayerParam* kpNewDlp	= &pNewParam->sDependencyLayers[iIndexD];
+      float fT1 = .0f;	// difference of output/input frame rate ratios (new - old)
+      float fT2 = .0f;	// difference of max/output frame rate ratios (new - old)
+
+      // check frame size settings
+      if (kpOldDlp->iFrameWidth != kpNewDlp->iFrameWidth ||
+          kpOldDlp->iFrameHeight != kpNewDlp->iFrameHeight ||
+          kpOldDlp->iActualWidth != kpNewDlp->iActualWidth ||
+          kpOldDlp->iActualHeight != kpNewDlp->iActualHeight) {
+        bNeedReset	= true;
+        break;
+      }
+
+      if (kpOldDlp->sMso.uiSliceMode != kpNewDlp->sMso.uiSliceMode ||
+          kpOldDlp->sMso.sSliceArgument.iSliceNum != kpNewDlp->sMso.sSliceArgument.iSliceNum) {
+        bNeedReset	= true;
+        break;
+      }
+
+      // check frame rate
+      // the frame rates themselves are not compared for equality;
+      // only the ratios output/input (per layer) and max/output are compared, each left at 0 when a divisor is not above EPSN
+      if (kpNewDlp->fInputFrameRate > EPSN && kpOldDlp->fInputFrameRate > EPSN)
+        fT1 = kpNewDlp->fOutputFrameRate / kpNewDlp->fInputFrameRate - kpOldDlp->fOutputFrameRate / kpOldDlp->fInputFrameRate;
+      if (kpNewDlp->fOutputFrameRate > EPSN && kpOldDlp->fOutputFrameRate > EPSN)
+        fT2 = pNewParam->fMaxFrameRate / kpNewDlp->fOutputFrameRate - pOldParam->fMaxFrameRate / kpOldDlp->fOutputFrameRate;
+      if (fT1 > EPSN || fT1 < -EPSN || fT2 > EPSN || fT2 < -EPSN) {
+        bNeedReset = true;
+        break;
+      }
+
+      if (kpOldDlp->iHighestTemporalId != kpNewDlp->iHighestTemporalId) {
+        bNeedReset = true;
+        break;
+      }
+
+      ++ iIndexD;
+    } while (iIndexD < pOldParam->iNumDependencyLayer);
+  }
+
+  if (bNeedReset) {
+    SParaSetOffsetVariable sTmpPsoVariable[PARA_SET_TYPE];	// copy of the parameter-set offset state, restored after re-initialization
+    uint16_t	          uiTmpIdrPicId;//this is for LTR!
+    memcpy (sTmpPsoVariable, (*ppCtx)->sPSOVector.sParaSetOffsetVariable,
+            (PARA_SET_TYPE)*sizeof (SParaSetOffsetVariable)); // confirmed_safe_unsafe_usage
+    uiTmpIdrPicId = (*ppCtx)->sPSOVector.uiIdrPicId;
+
+    WelsUninitEncoderExt (ppCtx);	// pOldParam is not referenced again on this path
+
+    /* Update new parameters */
+    if (WelsInitEncoderExt (ppCtx, pNewParam))
+      return 1;
+
+    // reset the scaled spatial picture size
+    (*ppCtx)->pVpp->WelsPreprocessReset (*ppCtx);
+    // reached only if WelsInitEncoderExt succeeded
+
+    //for FLEXIBLE_PARASET_ID
+    memcpy ((*ppCtx)->sPSOVector.sParaSetOffsetVariable, sTmpPsoVariable,
+            (PARA_SET_TYPE)*sizeof (SParaSetOffsetVariable)); // confirmed_safe_unsafe_usage
+    (*ppCtx)->sPSOVector.uiIdrPicId = uiTmpIdrPicId;
+  } else {
+    /* maybe adjustment introduced in bitrate or little settings adjustment and so on.. */
+    pNewParam->iNumRefFrame								= WELS_CLIP3 (pNewParam->iNumRefFrame, MIN_REF_PIC_COUNT,
+                                            MAX_REFERENCE_PICTURE_COUNT_NUM);
+    pNewParam->iLoopFilterDisableIdc					= WELS_CLIP3 (pNewParam->iLoopFilterDisableIdc, 0, 6);
+    pNewParam->iLoopFilterAlphaC0Offset				= WELS_CLIP3 (pNewParam->iLoopFilterAlphaC0Offset, -6, 6);
+    pNewParam->iLoopFilterBetaOffset					= WELS_CLIP3 (pNewParam->iLoopFilterBetaOffset, -6, 6);
+    pNewParam->iInterLayerLoopFilterDisableIdc		= WELS_CLIP3 (pNewParam->iInterLayerLoopFilterDisableIdc, 0, 6);
+    pNewParam->iInterLayerLoopFilterAlphaC0Offset	= WELS_CLIP3 (pNewParam->iInterLayerLoopFilterAlphaC0Offset, -6, 6);
+    pNewParam->iInterLayerLoopFilterBetaOffset		= WELS_CLIP3 (pNewParam->iInterLayerLoopFilterBetaOffset, -6, 6);
+    pNewParam->fMaxFrameRate							= WELS_CLIP3 (pNewParam->fMaxFrameRate, MIN_FRAME_RATE, MAX_FRAME_RATE);
+
+    // a whole-struct memcpy cannot be used here because some fields of the old parameter set must keep their current values
+    pOldParam->fMaxFrameRate	= pNewParam->fMaxFrameRate;		// maximal frame rate [Hz / fps]
+    pOldParam->iInputCsp			= pNewParam->iInputCsp;			// color space of input sequence
+    pOldParam->uiIntraPeriod		= pNewParam->uiIntraPeriod;		// intra period (multiple of GOP size as desired)
+    pOldParam->bEnableSpsPpsIdAddition = pNewParam->bEnableSpsPpsIdAddition;
+    pOldParam->bPrefixNalAddingCtrl = pNewParam->bPrefixNalAddingCtrl;
+    pOldParam->iNumRefFrame		= pNewParam->iNumRefFrame;		// number of reference frame used
+
+    /* denoise control */
+    pOldParam->bEnableDenoise	= pNewParam->bEnableDenoise;
+
+    /* background detection control */
+    pOldParam->bEnableBackgroundDetection		= pNewParam->bEnableBackgroundDetection;
+
+    /* adaptive quantization control */
+    pOldParam->bEnableAdaptiveQuant	= pNewParam->bEnableAdaptiveQuant;
+
+    /* long term reference control */
+    pOldParam->bEnableLongTermReference	= pNewParam->bEnableLongTermReference;	// already equal here: a difference forces the reset path
+    pOldParam->uiLtrMarkPeriod	= pNewParam->uiLtrMarkPeriod;
+
+    // NOTE(review): the original comment said the values below are kept unchanged, but the code copies them from pNewParam
+    pOldParam->bEnableSSEI		= pNewParam->bEnableSSEI;
+    pOldParam->bEnableFrameCroppingFlag	= pNewParam->bEnableFrameCroppingFlag;	// enable frame cropping flag
+
+    /* Motion search */
+
+    /* Deblocking loop filter */
+    pOldParam->iLoopFilterDisableIdc	= pNewParam->iLoopFilterDisableIdc;	// 0: on, 1: off, 2: on except for slice boundaries
+    pOldParam->iLoopFilterAlphaC0Offset	= pNewParam->iLoopFilterAlphaC0Offset;// AlphaOffset: valid range [-6, 6], default 0
+    pOldParam->iLoopFilterBetaOffset		= pNewParam->iLoopFilterBetaOffset;	// BetaOffset:	valid range [-6, 6], default 0
+    pOldParam->iInterLayerLoopFilterDisableIdc	=
+      pNewParam->iInterLayerLoopFilterDisableIdc; // Employed based upon inter-layer, same comment as above
+    pOldParam->iInterLayerLoopFilterAlphaC0Offset	=
+      pNewParam->iInterLayerLoopFilterAlphaC0Offset;	// InterLayerLoopFilterAlphaC0Offset
+    pOldParam->iInterLayerLoopFilterBetaOffset		=
+      pNewParam->iInterLayerLoopFilterBetaOffset;	// InterLayerLoopFilterBetaOffset
+
+    /* Rate Control */
+    pOldParam->bEnableRc			= pNewParam->bEnableRc;
+    pOldParam->iRCMode	    	= pNewParam->iRCMode;
+    pOldParam->iTargetBitrate	= pNewParam->iTargetBitrate;			// overall target bitrate introduced in RC module
+    pOldParam->iPaddingFlag	    = pNewParam->iPaddingFlag;
+
+    /* Layer definition */
+    pOldParam->bPrefixNalAddingCtrl	= pNewParam->bPrefixNalAddingCtrl;	// NOTE(review): same assignment as earlier in this branch; redundant
+
+    // per dependency layer settings
+    iIndexD = 0;
+    do {	// do/while: the first layer is always copied, whatever iNumDependencyLayer holds
+      SDLayerParam* pOldDlp	= &pOldParam->sDependencyLayers[iIndexD];
+      SDLayerParam* pNewDlp	= &pNewParam->sDependencyLayers[iIndexD];
+
+      pOldDlp->fInputFrameRate	= pNewDlp->fInputFrameRate;	// input frame rate
+      pOldDlp->fOutputFrameRate	= pNewDlp->fOutputFrameRate;	// output frame rate
+      pOldDlp->iSpatialBitrate	= pNewDlp->iSpatialBitrate;
+
+      pOldDlp->uiProfileIdc		= pNewDlp->uiProfileIdc;			// value of profile IDC (0 for auto-detection)
+
+      /* Derived variants below */
+      pOldDlp->iTemporalResolution	= pNewDlp->iTemporalResolution;
+      pOldDlp->iDecompositionStages	= pNewDlp->iDecompositionStages;
+
+      memcpy (pOldDlp->uiCodingIdx2TemporalId, pNewDlp->uiCodingIdx2TemporalId,
+              sizeof (pOldDlp->uiCodingIdx2TemporalId));	// confirmed_safe_unsafe_usage
+
+      ++ iIndexD;
+    } while (iIndexD < pOldParam->iNumDependencyLayer);
+  }
+
+  /* Any else initialization/reset for rate control here? */
+
+  return 0;
+}
+
+
+int32_t WelsCodeOnePicPartition (sWelsEncCtx* pCtx,	// encoder context: supplies the current layer, NAL type/priority and output buffers
+                                 SLayerBSInfo* pLayerBsInfo,	// out: per-NAL byte lengths, layer type/ids and NAL count
+                                 int32_t* pNalIdxInLayer,	// in/out: index of the next NAL in the layer; updated only on success
+                                 int32_t* pLayerSize,	// out: bytes produced for this partition; written only on success
+                                 int32_t iFirstMbInPartition,	// first mb inclusive in partition
+                                 int32_t iEndMbInPartition,	// end mb exclusive in partition
+                                 int32_t iStartSliceIdx	// index of the first slice coded for this partition
+                                ) {	// returns 0 on success, 1 when the slice index reaches iMaxSliceNumConstraint
+  // Codes the macroblocks [iFirstMbInPartition, iEndMbInPartition) as one or more slices, one NAL per slice plus a prefix NAL when enabled.
+  SDqLayer* pCurLayer			= pCtx->pCurDqLayer;
+  SSliceCtx* pSliceCtx		= pCurLayer->pSliceEncCtx;
+  int32_t iNalLen[MAX_NAL_UNITS_IN_LAYER]			= {0};
+  int32_t iNalIdxInLayer		= *pNalIdxInLayer;
+  int32_t iSliceIdx				= iStartSliceIdx;
+  const int32_t kiSliceStep		= pCtx->iActiveThreadsNum;	// distance between consecutive slice indices of one partition
+  const int32_t kiPartitionId		= iStartSliceIdx % kiSliceStep;
+  int32_t iPartitionBsSize		= 0;
+  int32_t iAnyMbLeftInPartition = iEndMbInPartition - iFirstMbInPartition;
+  const EWelsNalUnitType keNalType	= pCtx->eNalType;
+  const EWelsNalRefIdc keNalRefIdc	= pCtx->eNalPriority;
+  const bool_t kbNeedPrefix		= pCtx->bNeedPrefixNalFlag;
+
+  //init
+  {
+    pSliceCtx->pFirstMbInSlice[iSliceIdx]		= iFirstMbInPartition;
+    pCurLayer->pNumSliceCodedOfPartition[kiPartitionId]	= 1;	// one slice per partition initialized, dynamic slicing inside
+    pCurLayer->pLastMbIdxOfPartition[kiPartitionId]		= iEndMbInPartition - 1;
+  }
+  pCurLayer->pLastCodedMbIdxOfPartition[kiPartitionId] = 0;
+
+  while (iAnyMbLeftInPartition > 0) {
+    int32_t iSliceSize	= 0;
+
+    if (iSliceIdx >= pSliceCtx->iMaxSliceNumConstraint) {	// insufficient memory in pSliceInLayer[]
+      // TODO: need exception handler for not large enough of MAX_SLICES_NUM related memory usage
+      // No idea about its solution because MAX_SLICES_NUM is a fixed length in the relevant data structure
+      return 1;
+    }
+
+    if (kbNeedPrefix) {
+      iPartitionBsSize += AddPrefixNal (pCtx, pLayerBsInfo, &iNalLen[0], &iNalIdxInLayer, keNalType, keNalRefIdc);
+    }
+
+    WelsLoadNal (pCtx->pOut, keNalType, keNalRefIdc);
+    WelsCodeOneSlice (pCtx, iSliceIdx, keNalType);
+    WelsUnloadNal (pCtx->pOut);
+
+    iSliceSize = WelsEncodeNalExt (&pCtx->pOut->sNalList[pCtx->pOut->iNalIndex - 1],
+                                   &pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt,
+                                   pCtx->pFrameBs + pCtx->iPosBsBuffer,
+                                   &iNalLen[iNalIdxInLayer]);
+    pCtx->iPosBsBuffer	+= iSliceSize;
+    iPartitionBsSize	+= iSliceSize;
+    pLayerBsInfo->iNalLengthInByte[iNalIdxInLayer]	= iSliceSize;
+
+#if defined(SLICE_INFO_OUTPUT)
+    fprintf (stderr,
+             "@slice=%-6d sliceType:%c idc:%d size:%-6d\n",
+             iSliceIdx,
+             (pCtx->eSliceType == P_SLICE ? 'P' : 'I'),
+             keNalRefIdc,	// was eNalRefIdc, which is not declared in this function and broke SLICE_INFO_OUTPUT builds
+             iSliceSize);
+#endif//SLICE_INFO_OUTPUT
+
+    ++ iNalIdxInLayer;
+    iSliceIdx += kiSliceStep;	// slice indices of one partition are not contiguous; they advance by kiSliceStep
+    iAnyMbLeftInPartition = iEndMbInPartition - (1 + pCurLayer->pLastCodedMbIdxOfPartition[kiPartitionId]);	// presumably pLastCodedMbIdxOfPartition is advanced inside WelsCodeOneSlice -- TODO confirm
+  }
+
+  *pLayerSize			= iPartitionBsSize;
+  *pNalIdxInLayer	= iNalIdxInLayer;
+
+  // slice based packing???
+  pLayerBsInfo->uiLayerType		= VIDEO_CODING_LAYER;
+  pLayerBsInfo->uiSpatialId		= pCtx->uiDependencyId;
+  pLayerBsInfo->uiTemporalId	= pCtx->uiTemporalId;
+  pLayerBsInfo->uiQualityId		= 0;
+  pLayerBsInfo->uiPriorityId	= 0;
+  pLayerBsInfo->iNalCount		= iNalIdxInLayer;
+
+  return 0;
+}
+} // namespace WelsSVCEnc
--- a/codec/encoder/core/src/expand_pic.cpp
+++ b/codec/encoder/core/src/expand_pic.cpp
@@ -1,166 +1,160 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <string.h>
-#include "expand_pic.h"
-#include "cpu_core.h"
-#include "wels_func_ptr_def.h"
-
-namespace WelsSVCEnc {
-
-// rewrite it (split into luma & chroma) that is helpful for mmx/sse2 optimization perform, 9/27/2009
-static inline void ExpandPictureLuma_c( uint8_t *pDst, const int32_t kiStride, const int32_t kiPicW, const int32_t kiPicH )
-{
-	uint8_t *pTmp				= pDst;
-	uint8_t *pDstLastLine		= pTmp + (kiPicH-1) * kiStride;	
-	const int32_t kiPaddingLen	= PADDING_LENGTH;	
-	const uint8_t kuiTL			= pTmp[0];
-	const uint8_t kuiTR			= pTmp[kiPicW-1];
-	const uint8_t kuiBL			= pDstLastLine[0];
-	const uint8_t kuiBR			= pDstLastLine[kiPicW-1];
-	int32_t i					= 0;
-
-	do {
-		const int32_t kiStrides	= (1+i) * kiStride;
-		uint8_t* pTop			= pTmp - kiStrides;
-		uint8_t* pBottom			= pDstLastLine + kiStrides;
-		
-		// pad pTop and pBottom
-		memcpy(pTop, pTmp, kiPicW);				// confirmed_safe_unsafe_usage
-		memcpy(pBottom, pDstLastLine, kiPicW);	// confirmed_safe_unsafe_usage
-		
-		// pad corners
-		memset(pTop-kiPaddingLen, kuiTL, kiPaddingLen); //pTop left
-		memset(pTop+kiPicW, kuiTR, kiPaddingLen); //pTop right
-		memset(pBottom-kiPaddingLen, kuiBL, kiPaddingLen); //pBottom left
-		memset(pBottom+kiPicW, kuiBR, kiPaddingLen); //pBottom right
-		
-		++ i;
-	} while( i < kiPaddingLen );
-
-	// pad left and right
-	i = 0;
-	do {
-		memset(pTmp-kiPaddingLen, pTmp[0], kiPaddingLen);
-		memset(pTmp+kiPicW, pTmp[kiPicW-1], kiPaddingLen);
-
-		pTmp += kiStride;
-		++ i;
-	} while( i < kiPicH );
-}
-
-static inline void ExpandPictureChroma_c( uint8_t *pDst, const int32_t kiStride, const int32_t kiPicW, const int32_t kiPicH )
-{
-	uint8_t *pTmp				= pDst;
-	uint8_t *pDstLastLine		= pTmp + (kiPicH-1) * kiStride;	
-	const int32_t kiPaddingLen	= (PADDING_LENGTH>>1);	
-	const uint8_t kuiTL			= pTmp[0];
-	const uint8_t kuiTR			= pTmp[kiPicW-1];
-	const uint8_t kuiBL			= pDstLastLine[0];
-	const uint8_t kuiBR			= pDstLastLine[kiPicW-1];
-	int32_t i					= 0;
-	
-	do {
-		const int32_t kiStrides	= (1+i) * kiStride;
-		uint8_t* pTop			= pTmp - kiStrides;
-		uint8_t* pBottom			= pDstLastLine + kiStrides;
-		
-		// pad pTop and pBottom
-		memcpy(pTop, pTmp, kiPicW);				// confirmed_safe_unsafe_usage
-		memcpy(pBottom, pDstLastLine, kiPicW);	// confirmed_safe_unsafe_usage
-		
-		// pad corners
-		memset(pTop-kiPaddingLen, kuiTL, kiPaddingLen); //pTop left
-		memset(pTop+kiPicW, kuiTR, kiPaddingLen); //pTop right
-		memset(pBottom-kiPaddingLen, kuiBL, kiPaddingLen); //pBottom left
-		memset(pBottom+kiPicW, kuiBR, kiPaddingLen); //pBottom right
-		
-		++ i;
-	} while( i < kiPaddingLen );
-	
-	// pad left and right
-	i = 0;
-	do {
-		memset(pTmp-kiPaddingLen, pTmp[0], kiPaddingLen);
-		memset(pTmp+kiPicW, pTmp[kiPicW-1], kiPaddingLen);
-		
-		pTmp += kiStride;
-		++ i;
-	} while( i < kiPicH );
-}
-
-void InitExpandPictureFunc( void *pL, const uint32_t kuiCPUFlag )
-{
-	SWelsFuncPtrList *pFuncList = (SWelsFuncPtrList *)pL;
-	pFuncList->pfExpandLumaPicture		= ExpandPictureLuma_c;
-	pFuncList->pfExpandChromaPicture[0]	= ExpandPictureChroma_c;
-	pFuncList->pfExpandChromaPicture[1]	= ExpandPictureChroma_c;	
-
-#if defined(X86_ASM)
-	if ( (kuiCPUFlag & WELS_CPU_SSE2) == WELS_CPU_SSE2 )
-	{
-		pFuncList->pfExpandLumaPicture	= ExpandPictureLuma_sse2;
-		pFuncList->pfExpandChromaPicture[0]= ExpandPictureChromaUnalign_sse2;
-		pFuncList->pfExpandChromaPicture[1]= ExpandPictureChromaAlign_sse2;
-	}
-#endif//X86_ASM
-}
-
-
-void ExpandReferencingPicture( SPicture *pPic, PExpandPictureFunc pExpLuma, PExpandPictureFunc pExpChrom[2] )
-{		
-	/*local variable*/
-	uint8_t *pPicY	= pPic->pData[0];
-	uint8_t *pPicCb = pPic->pData[1];
-	uint8_t *pPicCr = pPic->pData[2];	
-	const int32_t kiWidthY	= pPic->iWidthInPixel;
-	const int32_t kiHeightY	= pPic->iHeightInPixel;
-	const int32_t kiWidthUV	= kiWidthY >> 1;
-	const int32_t kiHeightUV	= kiHeightY >> 1;	
-
-	pExpLuma(pPicY, pPic->iLineSize[0], kiWidthY, kiHeightY);
-	if ( kiWidthUV >= 16 )
-	{
-		// fix coding picture size as 16x16
-		const bool_t kbChrAligned= /*(iWidthUV >= 16) && */((kiWidthUV & 0x0F) == 0);	// chroma planes: (16+iWidthUV) & 15
-		pExpChrom[kbChrAligned](pPicCb, pPic->iLineSize[1], kiWidthUV, kiHeightUV);
-		pExpChrom[kbChrAligned](pPicCr, pPic->iLineSize[2], kiWidthUV, kiHeightUV);
-	}
-	else
-	{
-		// fix coding picture size as 16x16
-		ExpandPictureChroma_c(pPicCb, pPic->iLineSize[1], kiWidthUV, kiHeightUV);
-		ExpandPictureChroma_c(pPicCr, pPic->iLineSize[2], kiWidthUV, kiHeightUV);
-	}
-}
-
-}
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <string.h>
+#include "expand_pic.h"
+#include "cpu_core.h"
+#include "wels_func_ptr_def.h"
+
+namespace WelsSVCEnc {
+
+// C version of luma border expansion: replicates the edge pixels of the kiPicW x kiPicH picture at pDst into a PADDING_LENGTH-wide margin on all four sides (rewritten 9/27/2009, split into luma & chroma to help mmx/sse2 optimization)
+static inline void ExpandPictureLuma_c (uint8_t* pDst, const int32_t kiStride, const int32_t kiPicW,
+                                        const int32_t kiPicH) {	// writes outside [pDst, last row]; the margin must exist in the buffer (not checked here)
+  uint8_t* pTmp				= pDst;	// first pixel of the first row
+  uint8_t* pDstLastLine		= pTmp + (kiPicH - 1) * kiStride;	// first pixel of the last row
+  const int32_t kiPaddingLen	= PADDING_LENGTH;
+  const uint8_t kuiTL			= pTmp[0];	// the four corner pixels, used to fill the corner areas of the margin
+  const uint8_t kuiTR			= pTmp[kiPicW - 1];
+  const uint8_t kuiBL			= pDstLastLine[0];
+  const uint8_t kuiBR			= pDstLastLine[kiPicW - 1];
+  int32_t i					= 0;
+
+  do {	// one iteration per margin row above and below the picture
+    const int32_t kiStrides	= (1 + i) * kiStride;
+    uint8_t* pTop			= pTmp - kiStrides;	// row (1 + i) above the first row
+    uint8_t* pBottom			= pDstLastLine + kiStrides;	// row (1 + i) below the last row
+
+    // pad pTop and pBottom
+    memcpy (pTop, pTmp, kiPicW);				// confirmed_safe_unsafe_usage
+    memcpy (pBottom, pDstLastLine, kiPicW);	// confirmed_safe_unsafe_usage
+
+    // pad corners
+    memset (pTop - kiPaddingLen, kuiTL, kiPaddingLen); //pTop left
+    memset (pTop + kiPicW, kuiTR, kiPaddingLen); //pTop right
+    memset (pBottom - kiPaddingLen, kuiBL, kiPaddingLen); //pBottom left
+    memset (pBottom + kiPicW, kuiBR, kiPaddingLen); //pBottom right
+
+    ++ i;
+  } while (i < kiPaddingLen);
+
+  // pad left and right
+  i = 0;
+  do {	// one iteration per picture row: repeat its first/last pixel into the left/right margin
+    memset (pTmp - kiPaddingLen, pTmp[0], kiPaddingLen);
+    memset (pTmp + kiPicW, pTmp[kiPicW - 1], kiPaddingLen);
+
+    pTmp += kiStride;
+    ++ i;
+  } while (i < kiPicH);
+}
+
+static inline void ExpandPictureChroma_c (uint8_t* pDst, const int32_t kiStride, const int32_t kiPicW,	// C version of chroma border expansion: replicates the edge pixels of the plane at pDst into the margin around it
+    const int32_t kiPicH) {	// writes outside [pDst, last row]; the margin must exist in the buffer (not checked here)
+  uint8_t* pTmp				= pDst;	// first pixel of the first row
+  uint8_t* pDstLastLine		= pTmp + (kiPicH - 1) * kiStride;	// first pixel of the last row
+  const int32_t kiPaddingLen	= (PADDING_LENGTH >> 1);	// margin width is half of PADDING_LENGTH
+  const uint8_t kuiTL			= pTmp[0];	// the four corner pixels, used to fill the corner areas of the margin
+  const uint8_t kuiTR			= pTmp[kiPicW - 1];
+  const uint8_t kuiBL			= pDstLastLine[0];
+  const uint8_t kuiBR			= pDstLastLine[kiPicW - 1];
+  int32_t i					= 0;
+
+  do {	// one iteration per margin row above and below the plane
+    const int32_t kiStrides	= (1 + i) * kiStride;
+    uint8_t* pTop			= pTmp - kiStrides;	// row (1 + i) above the first row
+    uint8_t* pBottom			= pDstLastLine + kiStrides;	// row (1 + i) below the last row
+
+    // pad pTop and pBottom
+    memcpy (pTop, pTmp, kiPicW);				// confirmed_safe_unsafe_usage
+    memcpy (pBottom, pDstLastLine, kiPicW);	// confirmed_safe_unsafe_usage
+
+    // pad corners
+    memset (pTop - kiPaddingLen, kuiTL, kiPaddingLen); //pTop left
+    memset (pTop + kiPicW, kuiTR, kiPaddingLen); //pTop right
+    memset (pBottom - kiPaddingLen, kuiBL, kiPaddingLen); //pBottom left
+    memset (pBottom + kiPicW, kuiBR, kiPaddingLen); //pBottom right
+
+    ++ i;
+  } while (i < kiPaddingLen);
+
+  // pad left and right
+  i = 0;
+  do {	// one iteration per plane row: repeat its first/last pixel into the left/right margin
+    memset (pTmp - kiPaddingLen, pTmp[0], kiPaddingLen);
+    memset (pTmp + kiPicW, pTmp[kiPicW - 1], kiPaddingLen);
+
+    pTmp += kiStride;
+    ++ i;
+  } while (i < kiPicH);
+}
+
+void InitExpandPictureFunc (void* pL, const uint32_t kuiCPUFlag) {	// fills the picture-expansion entries of the function table behind pL, chosen by kuiCPUFlag
+  SWelsFuncPtrList* pFuncList = (SWelsFuncPtrList*)pL;	// pL is treated as a SWelsFuncPtrList; no NULL check is made
+  pFuncList->pfExpandLumaPicture		= ExpandPictureLuma_c;	// the C versions are the default for all three entries
+  pFuncList->pfExpandChromaPicture[0]	= ExpandPictureChroma_c;
+  pFuncList->pfExpandChromaPicture[1]	= ExpandPictureChroma_c;
+
+#if defined(X86_ASM)
+  if ((kuiCPUFlag & WELS_CPU_SSE2) == WELS_CPU_SSE2) {	// SSE2 reported: override all three entries
+    pFuncList->pfExpandLumaPicture	= ExpandPictureLuma_sse2;
+    pFuncList->pfExpandChromaPicture[0] = ExpandPictureChromaUnalign_sse2;	// index 0 gets the "Unalign" variant
+    pFuncList->pfExpandChromaPicture[1] = ExpandPictureChromaAlign_sse2;	// index 1 gets the "Align" variant
+  }
+#endif//X86_ASM
+}
+
+
+void ExpandReferencingPicture (SPicture* pPic, PExpandPictureFunc pExpLuma, PExpandPictureFunc pExpChrom[2]) {
+  /* Expands the three planes of pPic: plane 0 through pExpLuma, planes 1 and 2 through pExpChrom[] or, for narrow pictures, the C chroma routine. */
+  uint8_t* pPicY	= pPic->pData[0];
+  uint8_t* pPicCb = pPic->pData[1];
+  uint8_t* pPicCr = pPic->pData[2];
+  const int32_t kiWidthY	= pPic->iWidthInPixel;
+  const int32_t kiHeightY	= pPic->iHeightInPixel;
+  const int32_t kiWidthUV	= kiWidthY >> 1;	// chroma planes are handled at half the luma width and height
+  const int32_t kiHeightUV	= kiHeightY >> 1;
+
+  pExpLuma (pPicY, pPic->iLineSize[0], kiWidthY, kiHeightY);
+  if (kiWidthUV >= 16) {
+    // select the chroma routine by width alignment: index 1 when the chroma width is a multiple of 16, index 0 otherwise
+    const bool_t kbChrAligned = /*(iWidthUV >= 16) && */ ((kiWidthUV & 0x0F) == 0);	// chroma planes: (16+iWidthUV) & 15
+    pExpChrom[kbChrAligned] (pPicCb, pPic->iLineSize[1], kiWidthUV, kiHeightUV);
+    pExpChrom[kbChrAligned] (pPicCr, pPic->iLineSize[2], kiWidthUV, kiHeightUV);
+  } else {
+    // chroma width below 16: always use the C routine and ignore pExpChrom
+    ExpandPictureChroma_c (pPicCb, pPic->iLineSize[1], kiWidthUV, kiHeightUV);
+    ExpandPictureChroma_c (pPicCr, pPic->iLineSize[2], kiWidthUV, kiHeightUV);
+  }
+}
+
+}
--- a/codec/encoder/core/src/get_intra_predictor.cpp
+++ b/codec/encoder/core/src/get_intra_predictor.cpp
@@ -1,789 +1,742 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	get_intra_predictor.c
- *
- * \brief	implementation for get intra predictor about 16x16, 4x4, chroma.
- *
- * \date	4/2/2009 Created
- *			9/14/2009 C level based optimization with high performance gained.
- *				[const, using ST32/ST64 to replace memset, memcpy and memmove etc.]
- *
- *************************************************************************************
- */
-#include <string.h>
-#include "macros.h"
-#include "ls_defines.h"
-#include "cpu_core.h"
-#include "get_intra_predictor.h"
-#include "wels_common_basis.h"
-#include "array_stack_align.h"
-
-namespace WelsSVCEnc {
-#define I4x4_COUNT 4
-#define I8x8_COUNT 8
-#define I16x16_COUNT 16
-
-typedef void (*PFillingPred)( uint8_t *pPred, uint8_t *pSrc );
-typedef void (*PFillingPred1to16) ( uint8_t *pPred, const uint8_t kuiSrc );
-
-static inline void WelsFillingPred8to16_c( uint8_t *pPred, uint8_t *pSrc )
-{
-	ST64( pPred  , LD64(pSrc) );
-	ST64( pPred+8, LD64(pSrc) );
-}
-static inline void WelsFillingPred8x2to16_c( uint8_t *pPred, uint8_t *pSrc )
-{
-	ST64( pPred  , LD64(pSrc  ) );
-	ST64( pPred+8, LD64(pSrc+8) );
-}
-static inline void WelsFillingPred1to16_c( uint8_t *pPred, const uint8_t kuiSrc )
-{
-	const uint8_t kuiSrc8[8] = { kuiSrc, kuiSrc, kuiSrc, kuiSrc, kuiSrc, kuiSrc, kuiSrc, kuiSrc };
-	ST64( pPred  , LD64(kuiSrc8));
-	ST64( pPred+8, LD64(kuiSrc8));
-}
-
-PFillingPred					WelsFillingPred8to16;
-PFillingPred					WelsFillingPred8x2to16;
-PFillingPred1to16 WelsFillingPred1to16;
-
-void WelsInitFillingPredFuncs( const uint32_t kuiCpuFlag )
-{
-	WelsFillingPred8to16	= WelsFillingPred8to16_c;
-	WelsFillingPred8x2to16	= WelsFillingPred8x2to16_c;
-	WelsFillingPred1to16	= WelsFillingPred1to16_c;
-
-#if defined(X86_ASM)
-	if ( kuiCpuFlag & WELS_CPU_MMXEXT )
-	{
-		WelsFillingPred8to16		= WelsFillingPred8to16_mmx;
-		WelsFillingPred8x2to16	    = WelsFillingPred8x2to16_mmx;
-		WelsFillingPred1to16		= WelsFillingPred1to16_mmx;
-	}
-	if ( kuiCpuFlag & WELS_CPU_SSE2 )
-	{		
-		WelsFillingPred8x2to16	    = WelsFillingPred8x2to16_sse2;
-		WelsFillingPred1to16		= WelsFillingPred1to16_sse2;
-	}
-#endif//X86_ASM
-}
-
-
-
-#define I4x4_PRED_STRIDE 4
-#define I4x4_PRED_STRIDE2 8
-#define I4x4_PRED_STRIDE3 12
-
-void WelsI4x4LumaPredV_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
-{
-	const uint32_t kuiSrc = LD32(&pRef[-kiStride]);
-	ENFORCE_STACK_ALIGN_1D(uint32_t, uiSrcx2, 2, 16)
-	uiSrcx2[0] = uiSrcx2[1] = kuiSrc;
-	
-	WelsFillingPred8to16( pPred, (uint8_t*)&uiSrcx2[0] );
-}
-
-void WelsI4x4LumaPredH_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
-{
-	const uint32_t kiStridex2Left = (kiStride<<1)-1;
-	const uint32_t kiStridex3Left = kiStride + kiStridex2Left;		
-	const uint8_t kuiHor1 = pRef[-1];
-	const uint8_t kuiHor2 = pRef[kiStride-1];	
-	const uint8_t kuiHor3 = pRef[kiStridex2Left];
-	const uint8_t kuiHor4 = pRef[kiStridex3Left];
-	const uint8_t kuiVec1[4] = {kuiHor1, kuiHor1, kuiHor1, kuiHor1};
-	const uint8_t kuiVec2[4] = {kuiHor2, kuiHor2, kuiHor2, kuiHor2};
-	const uint8_t kuiVec3[4] = {kuiHor3, kuiHor3, kuiHor3, kuiHor3};
-	const uint8_t kuiVec4[4] = {kuiHor4, kuiHor4, kuiHor4, kuiHor4};
-	ENFORCE_STACK_ALIGN_1D(uint8_t, uiSrc, 16, 16)	// TobeCont'd about assign opt as follows
-	ST32(&uiSrc[0], LD32(kuiVec1));
-	ST32(&uiSrc[4], LD32(kuiVec2));
-	ST32(&uiSrc[8], LD32(kuiVec3));
-	ST32(&uiSrc[12], LD32(kuiVec4));
-	
-	WelsFillingPred8x2to16( pPred, uiSrc );
-}
-void WelsI4x4LumaPredDc_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
-{
-	const uint8_t kuiDcValue	= ( pRef[-1] + pRef[kiStride-1] + pRef[(kiStride<<1)-1] + pRef[(kiStride<<1)+kiStride-1] +
-								pRef[-kiStride] + pRef[1-kiStride] + pRef[2-kiStride] + pRef[3-kiStride] + 4 ) >> 3;
-
-	WelsFillingPred1to16( pPred, kuiDcValue );
-}
-
-void WelsI4x4LumaPredDcLeft_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
-{
-	const uint8_t kuiDcValue	= (pRef[-1] + pRef[kiStride-1] + pRef[(kiStride<<1)-1] + pRef[(kiStride<<1)+kiStride-1] + 2)>>2;
-
-	WelsFillingPred1to16( pPred, kuiDcValue );	
-}
-
-void WelsI4x4LumaPredDcTop_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
-{
-	const uint8_t kuiDcValue	= (pRef[-kiStride] + pRef[1-kiStride] + pRef[2-kiStride] + pRef[3-kiStride] + 2) >> 2;
-
-	WelsFillingPred1to16( pPred, kuiDcValue );	
-}
-
-void WelsI4x4LumaPredDcNA_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
-{
-	const uint8_t kuiDcValue = 0x80;
-
-	WelsFillingPred1to16( pPred, kuiDcValue );
-}
-
-/*down pLeft*/
-void WelsI4x4LumaPredDDL_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
-{
-	/*get pTop*/
-	const uint8_t kuiT0		= pRef[-kiStride];
-	const uint8_t kuiT1		= pRef[1-kiStride];
-	const uint8_t kuiT2		= pRef[2-kiStride];
-	const uint8_t kuiT3		= pRef[3-kiStride];
-	const uint8_t kuiT4		= pRef[4-kiStride];
-	const uint8_t kuiT5		= pRef[5-kiStride];
-	const uint8_t kuiT6		= pRef[6-kiStride];
-	const uint8_t kuiT7		= pRef[7-kiStride];
-	const uint8_t kuiDDL0	= (2 + kuiT0 + kuiT2 + (kuiT1<<1))>>2;	// uiDDL0
-	const uint8_t kuiDDL1	= (2 + kuiT1 + kuiT3 + (kuiT2<<1))>>2;	// uiDDL1
-	const uint8_t kuiDDL2	= (2 + kuiT2 + kuiT4 + (kuiT3<<1))>>2;	// uiDDL2
-	const uint8_t kuiDDL3	= (2 + kuiT3 + kuiT5 + (kuiT4<<1))>>2;	// uiDDL3
-	const uint8_t kuiDDL4	= (2 + kuiT4 + kuiT6 + (kuiT5<<1))>>2;	// uiDDL4
-	const uint8_t kuiDDL5	= (2 + kuiT5 + kuiT7 + (kuiT6<<1))>>2;	// uiDDL5
-	const uint8_t kuiDDL6	= (2 + kuiT6 + kuiT7 + (kuiT7<<1))>>2;	// uiDDL6
-	ENFORCE_STACK_ALIGN_1D(uint8_t, uiSrc, 16, 16)	// TobeCont'd about assign opt as follows
-	uiSrc[0] = kuiDDL0;
-	uiSrc[1] = uiSrc[4] = kuiDDL1;
-	uiSrc[2] = uiSrc[5] = uiSrc[8] = kuiDDL2;
-	uiSrc[3] = uiSrc[6] = uiSrc[9] = uiSrc[12] = kuiDDL3;
-	uiSrc[7] = uiSrc[10]= uiSrc[13]= kuiDDL4;
-	uiSrc[11]= uiSrc[14]= kuiDDL5;
-	uiSrc[15] = kuiDDL6;
-
-	WelsFillingPred8x2to16( pPred, uiSrc );
-}
-
-/*down pLeft*/
-void WelsI4x4LumaPredDDLTop_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
-{
-	/*get pTop*/
-	const uint8_t kuiT0	= pRef[-kiStride];
-	const uint8_t kuiT1	= pRef[1-kiStride];
-	const uint8_t kuiT2	= pRef[2-kiStride];
-	const uint8_t kuiT3	= pRef[3-kiStride];
-	const uint8_t kuiDLT0	= (2 + kuiT0 + kuiT2 + (kuiT1<<1))>>2;	// uiDLT0
-	const uint8_t kuiDLT1	= (2 + kuiT1 + kuiT3 + (kuiT2<<1))>>2;	// uiDLT1
-	const uint8_t kuiDLT2	= (2 + kuiT2 + kuiT3 + (kuiT3<<1))>>2;	// uiDLT2
-	const uint8_t kuiDLT3	= (2 + (kuiT3<<2))>>2;				// uiDLT3
-	ENFORCE_STACK_ALIGN_1D(uint8_t, uiSrc, 16, 16)	// TobeCont'd about assign opt as follows
-	memset(&uiSrc[6], kuiDLT3, 10*sizeof(uint8_t));
-	uiSrc[0] = kuiDLT0;
-	uiSrc[1] = uiSrc[4] = kuiDLT1;
-	uiSrc[2] = uiSrc[5] = uiSrc[8] = kuiDLT2;
-	uiSrc[3] = kuiDLT3;
-
-	WelsFillingPred8x2to16( pPred, uiSrc );
-}
-
-
-/*down right*/
-void WelsI4x4LumaPredDDR_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
-{
-	const int32_t kiStridex2	= kiStride<<1;
-	const int32_t kiStridex3	= kiStride + kiStridex2;
-	const uint8_t kuiLT			= pRef[-kiStride-1];	// pTop-pLeft
-	/*get pLeft and pTop*/
-	const uint8_t kuiL0			= pRef[-1];
-	const uint8_t kuiL1			= pRef[kiStride-1];
-	const uint8_t kuiL2			= pRef[kiStridex2-1];
-	const uint8_t kuiL3			= pRef[kiStridex3-1];
-	const uint8_t kuiT0			= pRef[-kiStride];
-	const uint8_t kuiT1			= pRef[1-kiStride];
-	const uint8_t kuiT2			= pRef[2-kiStride];
-	const uint8_t kuiT3			= pRef[3-kiStride];
-	const uint16_t kuiTL0		= 1 + kuiLT + kuiL0;
-	const uint16_t kuiLT0		= 1 + kuiLT + kuiT0;
-	const uint16_t kuiT01		= 1 + kuiT0 + kuiT1;
-	const uint16_t kuiT12		= 1 + kuiT1 + kuiT2;
-	const uint16_t kuiT23		= 1 + kuiT2 + kuiT3;
-	const uint16_t kuiL01		= 1 + kuiL0 + kuiL1;
-	const uint16_t kuiL12		= 1 + kuiL1 + kuiL2;
-	const uint16_t kuiL23		= 1 + kuiL2 + kuiL3;
-	const uint8_t kuiDDR0		= (kuiTL0 + kuiLT0) >> 2;
-	const uint8_t kuiDDR1		= (kuiLT0 + kuiT01) >> 2;
-	const uint8_t kuiDDR2		= (kuiT01 + kuiT12) >> 2;
-	const uint8_t kuiDDR3		= (kuiT12 + kuiT23) >> 2;
-	const uint8_t kuiDDR4		= (kuiTL0 + kuiL01) >> 2;
-	const uint8_t kuiDDR5		= (kuiL01 + kuiL12) >> 2;
-	const uint8_t kuiDDR6		= (kuiL12 + kuiL23) >> 2;
-	ENFORCE_STACK_ALIGN_1D(uint8_t, uiSrc, 16, 16)	// TobeCont'd about assign opt as follows
-	uiSrc[0] = uiSrc[5] = uiSrc[10] = uiSrc[15] = kuiDDR0;
-	uiSrc[1] = uiSrc[6] = uiSrc[11] = kuiDDR1;
-	uiSrc[2] = uiSrc[7] = kuiDDR2;
-	uiSrc[3] = kuiDDR3;
-	uiSrc[4] = uiSrc[9] = uiSrc[14] = kuiDDR4;
-	uiSrc[8] = uiSrc[13] = kuiDDR5;
-	uiSrc[12]= kuiDDR6;
-
-	WelsFillingPred8x2to16( pPred, uiSrc );
-}
-
-
-/*vertical pLeft*/
-void WelsI4x4LumaPredVL_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
-{
-	/*get pTop*/
-	const uint8_t kuiT0		= pRef[-kiStride];
-	const uint8_t kuiT1		= pRef[1-kiStride];
-	const uint8_t kuiT2		= pRef[2-kiStride];
-	const uint8_t kuiT3		= pRef[3-kiStride];
-	const uint8_t kuiT4		= pRef[4-kiStride];
-	const uint8_t kuiT5		= pRef[5-kiStride];
-	const uint8_t kuiT6		= pRef[6-kiStride];
-	const uint8_t kuiVL0	= (1 + kuiT0 + kuiT1)>>1;				// uiVL0
-	const uint8_t kuiVL1	= (1 + kuiT1 + kuiT2)>>1;				// uiVL1
-	const uint8_t kuiVL2	= (1 + kuiT2 + kuiT3)>>1;				// uiVL2
-	const uint8_t kuiVL3	= (1 + kuiT3 + kuiT4)>>1;				// uiVL3
-	const uint8_t kuiVL4	= (1 + kuiT4 + kuiT5)>>1;				// uiVL4
-	const uint8_t kuiVL5	= (2 + kuiT0 + (kuiT1<<1) + kuiT2)>>2;	// uiVL5
-	const uint8_t kuiVL6	= (2 + kuiT1 + (kuiT2<<1) + kuiT3)>>2;	// uiVL6
-	const uint8_t kuiVL7	= (2 + kuiT2 + (kuiT3<<1) + kuiT4)>>2;	// uiVL7
-	const uint8_t kuiVL8	= (2 + kuiT3 + (kuiT4<<1) + kuiT5)>>2;	// uiVL8
-	const uint8_t kuiVL9	= (2 + kuiT4 + (kuiT5<<1) + kuiT6)>>2;	// uiVL9
-	ENFORCE_STACK_ALIGN_1D(uint8_t, uiSrc, 16, 16)	// TobeCont'd about assign opt as follows
-	uiSrc[0] = kuiVL0;
-	uiSrc[1] = uiSrc[8] = kuiVL1;
-	uiSrc[2] = uiSrc[9] = kuiVL2;
-	uiSrc[3] = uiSrc[10]= kuiVL3;
-	uiSrc[4] = kuiVL5;
-	uiSrc[5] = uiSrc[12] = kuiVL6;
-	uiSrc[6] = uiSrc[13] = kuiVL7;
-	uiSrc[7] = uiSrc[14] = kuiVL8;
-	uiSrc[11]= kuiVL4;
-	uiSrc[15]= kuiVL9;
-
-	WelsFillingPred8x2to16( pPred, uiSrc );
-}
-
-
-
-/*vertical pLeft*/
-void WelsI4x4LumaPredVLTop_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
-{
-	uint8_t *pTopLeft		= &pRef[-kiStride-1];	// pTop-pLeft
-	/*get pTop*/
-	const uint8_t kuiT0		= *(pTopLeft+1);
-	const uint8_t kuiT1		= *(pTopLeft+2);
-	const uint8_t kuiT2		= *(pTopLeft+3);
-	const uint8_t kuiT3		= *(pTopLeft+4);
-	const uint8_t kuiVLT0	= (1 + kuiT0 + kuiT1)>>1;				// uiVLT0
-	const uint8_t kuiVLT1	= (1 + kuiT1 + kuiT2)>>1;				// uiVLT1
-	const uint8_t kuiVLT2	= (1 + kuiT2 + kuiT3)>>1;				// uiVLT2
-	const uint8_t kuiVLT3	= (1 + (kuiT3<<1))>>1;				// uiVLT3
-	const uint8_t kuiVLT4	= (2 + kuiT0 + (kuiT1<<1) + kuiT2)>>2;	// uiVLT4
-	const uint8_t kuiVLT5	= (2 + kuiT1 + (kuiT2<<1) + kuiT3)>>2;	// uiVLT5
-	const uint8_t kuiVLT6	= (2 + kuiT2 + (kuiT3<<1) + kuiT3)>>2;	// uiVLT6
-	const uint8_t kuiVLT7	= (2 + (kuiT3<<2))>>2;				// uiVLT7
-	ENFORCE_STACK_ALIGN_1D(uint8_t, uiSrc, 16, 16)	// TobeCont'd about assign opt as follows
-	uiSrc[0] = kuiVLT0;
-	uiSrc[1] = uiSrc[8] = kuiVLT1;
-	uiSrc[2] = uiSrc[9] = kuiVLT2;
-	uiSrc[3] = uiSrc[10]= uiSrc[11] = kuiVLT3;
-	uiSrc[4] = kuiVLT4;
-	uiSrc[5] = uiSrc[12] = kuiVLT5;
-	uiSrc[6] = uiSrc[13] = kuiVLT6;
-	uiSrc[7] = uiSrc[14] = uiSrc[15] = kuiVLT7;
-
-	WelsFillingPred8x2to16( pPred, uiSrc );
-}
-
-/*vertical right*/
-void WelsI4x4LumaPredVR_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
-{
-	const int32_t kiStridex2	= kiStride<<1;
-	const uint8_t kuiLT			= pRef[-kiStride-1];	// pTop-pLeft
-	/*get pLeft and pTop*/
-	const uint8_t kuiL0			= pRef[-1];
-	const uint8_t kuiL1			= pRef[kiStride-1];
-	const uint8_t kuiL2			= pRef[kiStridex2-1];
-	const uint8_t kuiT0			= pRef[-kiStride];
-	const uint8_t kuiT1			= pRef[1-kiStride];
-	const uint8_t kuiT2			= pRef[2-kiStride];
-	const uint8_t kuiT3			= pRef[3-kiStride];
-	const uint8_t kuiVR0		= (1 + kuiLT + kuiT0) >> 1;
-	const uint8_t kuiVR1		= (1 + kuiT0 + kuiT1) >> 1;
-	const uint8_t kuiVR2		= (1 + kuiT1 + kuiT2) >> 1;
-	const uint8_t kuiVR3		= (1 + kuiT2 + kuiT3) >> 1;
-	const uint8_t kuiVR4		= (2 + kuiL0 + (kuiLT << 1) + kuiT0) >> 2;
-	const uint8_t kuiVR5		= (2 + kuiLT + (kuiT0 << 1) + kuiT1) >> 2;
-	const uint8_t kuiVR6		= (2 + kuiT0 + (kuiT1 << 1) + kuiT2) >> 2;
-	const uint8_t kuiVR7		= (2 + kuiT1 + (kuiT2 << 1) + kuiT3) >> 2; 
-	const uint8_t kuiVR8		= (2 + kuiLT + (kuiL0 << 1) + kuiL1) >> 2;
-	const uint8_t kuiVR9		= (2 + kuiL0 + (kuiL1 << 1) + kuiL2) >> 2;
-	ENFORCE_STACK_ALIGN_1D(uint8_t, uiSrc, 16, 16)	// TobeCont'd about assign opt as follows
-	uiSrc[0] = uiSrc[9] = kuiVR0;
-	uiSrc[1] = uiSrc[10] = kuiVR1;
-	uiSrc[2] = uiSrc[11] = kuiVR2;
-	uiSrc[3] = kuiVR3;
-	uiSrc[4] = uiSrc[13] = kuiVR4;
-	uiSrc[5] = uiSrc[14] = kuiVR5;
-	uiSrc[6] = uiSrc[15] = kuiVR6;
-	uiSrc[7] = kuiVR7;
-	uiSrc[8] = kuiVR8;
-	uiSrc[12]= kuiVR9;
-
-	WelsFillingPred8x2to16( pPred, uiSrc );
-}
-
-
-/*horizontal up*/
-void WelsI4x4LumaPredHU_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
-{
-	const int32_t kiStridex2	= kiStride<<1;
-	const int32_t kiStridex3	= kiStride + kiStridex2;
-	/*get pLeft*/
-	const uint8_t kuiL0			= pRef[-1];
-	const uint8_t kuiL1			= pRef[kiStride-1];
-	const uint8_t kuiL2			= pRef[kiStridex2-1];
-	const uint8_t kuiL3			= pRef[kiStridex3-1];
-	const uint16_t kuiL01		= (1 + kuiL0 + kuiL1);
-	const uint16_t kuiL12		= (1 + kuiL1 + kuiL2);
-	const uint16_t kuiL23		= (1 + kuiL2 + kuiL3);
-	const uint8_t kuiHU0		= kuiL01 >> 1;
-	const uint8_t kuiHU1		= (kuiL01 + kuiL12) >> 2;
-	const uint8_t kuiHU2		= kuiL12 >> 1;
-	const uint8_t kuiHU3		= (kuiL12 + kuiL23) >> 2;
-	const uint8_t kuiHU4		= kuiL23 >> 1;
-	const uint8_t kuiHU5		= (1 + kuiL23 + (kuiL3 << 1)) >> 2;
-	ENFORCE_STACK_ALIGN_1D(uint8_t, uiSrc, 16, 16)	// TobeCont'd about assign opt as follows
-	uiSrc[0] = kuiHU0;
-	uiSrc[1] = kuiHU1;
-	uiSrc[2] = uiSrc[4] = kuiHU2;
-	uiSrc[3] = uiSrc[5] = kuiHU3;
-	uiSrc[6] = uiSrc[8] = kuiHU4;
-	uiSrc[7] = uiSrc[9] = kuiHU5;
-	memset(&uiSrc[10], kuiL3, 6*sizeof(uint8_t));
-
-	WelsFillingPred8x2to16( pPred, uiSrc );
-}
-
-
-/*horizontal down*/
-void WelsI4x4LumaPredHD_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
-{
-	const int32_t kiStridex2	= kiStride<<1;
-	const int32_t kiStridex3	= kiStride + kiStridex2;
-	const uint8_t kuiLT		= pRef[-kiStride-1];	// pTop-pLeft
-	/*get pLeft and pTop*/
-	const uint8_t kuiL0		= pRef[-1];
-	const uint8_t kuiL1		= pRef[kiStride-1];
-	const uint8_t kuiL2		= pRef[kiStridex2-1];
-	const uint8_t kuiL3		= pRef[kiStridex3-1];
-	const uint8_t kuiT0		= pRef[-kiStride];
-	const uint8_t kuiT1		= pRef[1-kiStride];
-	const uint8_t kuiT2		= pRef[2-kiStride];
-	const uint8_t kuiHD0		= (1 + kuiLT + kuiL0)>>1;				// uiHD0 
-	const uint8_t kuiHD1		= (2 + kuiL0 + (kuiLT<<1) + kuiT0)>>2;	// uiHD1
-	const uint8_t kuiHD2		= (2 + kuiLT + (kuiT0<<1) + kuiT1)>>2;	// uiHD2
-	const uint8_t kuiHD3		= (2 + kuiT0 + (kuiT1<<1) + kuiT2)>>2;	// uiHD3
-	const uint8_t kuiHD4		= (1 + kuiL0 + kuiL1)>>1;				// uiHD4
-	const uint8_t kuiHD5		= (2 + kuiLT + (kuiL0<<1) + kuiL1)>>2;	// uiHD5
-	const uint8_t kuiHD6		= (1 + kuiL1 + kuiL2)>>1;				// uiHD6
-	const uint8_t kuiHD7		= (2 + kuiL0 + (kuiL1<<1) + kuiL2)>>2;	// uiHD7
-	const uint8_t kuiHD8		= (1 + kuiL2 + kuiL3)>>1;				// uiHD8
-	const uint8_t kuiHD9		= (2 + kuiL1 + (kuiL2<<1) + kuiL3)>>2;	// uiHD9
-	ENFORCE_STACK_ALIGN_1D(uint8_t, uiSrc, 16, 16)	// TobeCont'd about assign opt as follows
-	uiSrc[0] = uiSrc[6] = kuiHD0;
-	uiSrc[1] = uiSrc[7] = kuiHD1;
-	uiSrc[2] = kuiHD2;
-	uiSrc[3] = kuiHD3;
-	uiSrc[4] = uiSrc[10] = kuiHD4;
-	uiSrc[5] = uiSrc[11] = kuiHD5;
-	uiSrc[8] = uiSrc[14] = kuiHD6;
-	uiSrc[9] = uiSrc[15] = kuiHD7;
-	uiSrc[12] = kuiHD8;
-	uiSrc[13] = kuiHD9;
-
-	WelsFillingPred8x2to16( pPred, uiSrc );
-}
-
-
-
-#define I8x8_PRED_STRIDE 8
-
-void WelsIChormaPredV_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
-{
-	const uint64_t kuiSrc64 = LD64(&pRef[-kiStride]);	
-
-	ST64( pPred   , kuiSrc64 );
-	ST64( pPred+8 , kuiSrc64 );
-	ST64( pPred+16, kuiSrc64 );
-	ST64( pPred+24, kuiSrc64 );
-	ST64( pPred+32, kuiSrc64 );
-	ST64( pPred+40, kuiSrc64 );
-	ST64( pPred+48, kuiSrc64 );
-	ST64( pPred+56, kuiSrc64 );	
-}
-
-void WelsIChormaPredH_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
-{
-	int32_t iStridex7 = (kiStride<<3)-kiStride;
-	int32_t iI8x8Stridex7 = (I8x8_PRED_STRIDE<<3)-I8x8_PRED_STRIDE;
-	uint8_t i = 7;
-	
-	do
-	{
-		const uint8_t kuiLeft = pRef[iStridex7-1];	// pLeft value
-#ifdef _MSC_VER
-		uint64_t kuiSrc64 = (uint64_t)(0x0101010101010101U * kuiLeft);
-#else
-		uint64_t kuiSrc64 = (uint64_t)(0x0101010101010101LL * kuiLeft);
-#endif
-		ST64( pPred+iI8x8Stridex7, kuiSrc64 );
-
-		iStridex7 -= kiStride;
-		iI8x8Stridex7 -= I8x8_PRED_STRIDE;
-	}while(i-->0);
-}
-
-
-void WelsIChormaPredPlane_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
-{
-	int32_t iLTshift=0, iTopshift=0, iLeftshift=0, iTopSum=0, iLeftSum=0;
-	int32_t i, j;
-	uint8_t *pTop = &pRef[-kiStride];
-	uint8_t *pLeft = &pRef[-1];
-
-	for(i = 0 ; i < 4 ; i ++)
-	{
-		iTopSum += (i + 1) * (pTop[4 + i] - pTop[2 - i]);
-		iLeftSum += (i + 1) * (pLeft[(4 + i)*kiStride] - pLeft[(2 - i)*kiStride]);
-	}
-
-	iLTshift = (pLeft[7*kiStride] + pTop[7]) << 4;
-	iTopshift = (17 * iTopSum + 16) >> 5;
-	iLeftshift = (17 * iLeftSum + 16) >> 5;
-
-	for(i = 0 ; i < 8 ; i ++)
-	{
-		for(j = 0 ; j < 8 ; j ++)
-		{			
-			pPred[j] = (uint8_t)WELS_CLIP1((iLTshift + iTopshift * (j - 3) + iLeftshift * (i - 3) + 16) >> 5);
-		}
-		pPred += I8x8_PRED_STRIDE;
-	}
-}
-
-
-void WelsIChormaPredDc_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
-{
-	const int32_t kuiL1 = kiStride-1;
-	const int32_t kuiL2 = kuiL1 + kiStride;
-	const int32_t kuiL3 = kuiL2 + kiStride;
-	const int32_t kuiL4 = kuiL3 + kiStride;
-	const int32_t kuiL5 = kuiL4 + kiStride;
-	const int32_t kuiL6 = kuiL5 + kiStride;
-	const int32_t kuiL7 = kuiL6 + kiStride;
-	/*caculate the iMean value*/
-	const uint8_t kuiMean1	= (	pRef[-kiStride] + pRef[1-kiStride] + pRef[2-kiStride] + pRef[3-kiStride] +
-							pRef[-1] + pRef[kuiL1] + pRef[kuiL2] + pRef[kuiL3] + 4 ) >> 3;
-	const uint32_t kuiSum2 = pRef[4-kiStride] + pRef[5-kiStride] + pRef[6-kiStride] + pRef[7-kiStride];
-	const uint32_t kuiSum3 = pRef[kuiL4] + pRef[kuiL5] + pRef[kuiL6] + pRef[kuiL7];
-	const uint8_t kuiMean2 = (kuiSum2 + 2) >> 2;
-	const uint8_t kuiMean3 = (kuiSum3 + 2) >> 2;
-	const uint8_t kuiMean4 = (kuiSum2 + kuiSum3 + 4) >> 3;
-
-	const uint8_t kuiTopMean[8] = {kuiMean1, kuiMean1, kuiMean1, kuiMean1, kuiMean2, kuiMean2, kuiMean2, kuiMean2};
-	const uint8_t kuiBottomMean[8] = {kuiMean3, kuiMean3, kuiMean3, kuiMean3, kuiMean4, kuiMean4, kuiMean4, kuiMean4};
-	const uint64_t kuiTopMean64 = LD64(kuiTopMean);
-	const uint64_t kuiBottomMean64 = LD64(kuiBottomMean);
-
-	ST64( pPred   , kuiTopMean64 );
-	ST64( pPred+8 , kuiTopMean64 );
-	ST64( pPred+16, kuiTopMean64 );
-	ST64( pPred+24, kuiTopMean64 );
-	ST64( pPred+32, kuiBottomMean64 );
-	ST64( pPred+40, kuiBottomMean64 );
-	ST64( pPred+48, kuiBottomMean64 );
-	ST64( pPred+56, kuiBottomMean64 );
-}
-
-void WelsIChormaPredDcLeft_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
-{
-	const int32_t kuiL1	= kiStride-1;
-	const int32_t kuiL2	= kuiL1 + kiStride;
-	const int32_t kuiL3	= kuiL2 + kiStride;
-	const int32_t kuiL4	= kuiL3 + kiStride;
-	const int32_t kuiL5	= kuiL4 + kiStride;
-	const int32_t kuiL6	= kuiL5 + kiStride;
-	const int32_t kuiL7	= kuiL6 + kiStride;
-	/*caculate the iMean value*/
-	const uint8_t kuiTopMean	= (pRef[-1] + pRef[kuiL1] + pRef[kuiL2] + pRef[kuiL3] + 2)>>2 ;
-	const uint8_t kuiBottomMean	= (pRef[kuiL4] + pRef[kuiL5] + pRef[kuiL6] + pRef[kuiL7] + 2)>>2;
-#ifdef _MSC_VER
-	const uint64_t kuiTopMean64	= (uint64_t)(0x0101010101010101U * kuiTopMean);
-	const uint64_t kuiBottomMean64	= (uint64_t)(0x0101010101010101U * kuiBottomMean);
-#else
-	const uint64_t kuiTopMean64	= (uint64_t)(0x0101010101010101LL * kuiTopMean);
-	const uint64_t kuiBottomMean64	= (uint64_t)(0x0101010101010101LL * kuiBottomMean);
-#endif
-	ST64( pPred   , kuiTopMean64 );
-	ST64( pPred+8 , kuiTopMean64 );
-	ST64( pPred+16, kuiTopMean64 );
-	ST64( pPred+24, kuiTopMean64 );
-	ST64( pPred+32, kuiBottomMean64 );
-	ST64( pPred+40, kuiBottomMean64 );
-	ST64( pPred+48, kuiBottomMean64 );
-	ST64( pPred+56, kuiBottomMean64 );	
-}
-
-void WelsIChormaPredDcTop_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
-{
-	/*caculate the iMean value*/
-	const uint8_t kuiMean1 = (pRef[-kiStride] + pRef[1-kiStride] + pRef[2-kiStride] + pRef[3-kiStride]+2)>>2;
-	const uint8_t kuiMean2 = (pRef[4-kiStride] + pRef[5-kiStride] + pRef[6-kiStride] + pRef[7-kiStride] + 2)>>2;
-	const uint8_t kuiMean[8] = {kuiMean1, kuiMean1, kuiMean1, kuiMean1, kuiMean2, kuiMean2, kuiMean2, kuiMean2};
-	const uint64_t kuiMean64 = LD64(kuiMean);
-
-	ST64( pPred   , kuiMean64 );
-	ST64( pPred+8 , kuiMean64 );
-	ST64( pPred+16, kuiMean64 );
-	ST64( pPred+24, kuiMean64 );
-	ST64( pPred+32, kuiMean64 );
-	ST64( pPred+40, kuiMean64 );
-	ST64( pPred+48, kuiMean64 );
-	ST64( pPred+56, kuiMean64 );	
-}
-
-void WelsIChormaPredDcNA_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
-{
-#ifdef _MSC_VER
-	const uint64_t kuiDcValue64 = (uint64_t)0x8080808080808080U;
-#else
-	const uint64_t kuiDcValue64 = (uint64_t)0x8080808080808080LL;
-#endif
-	ST64( pPred   , kuiDcValue64 );
-	ST64( pPred+8 , kuiDcValue64 );
-	ST64( pPred+16, kuiDcValue64 );
-	ST64( pPred+24, kuiDcValue64 );
-	ST64( pPred+32, kuiDcValue64 );
-	ST64( pPred+40, kuiDcValue64 );
-	ST64( pPred+48, kuiDcValue64 );
-	ST64( pPred+56, kuiDcValue64 );
-}
-
-
-void WelsI16x16LumaPredV_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
-{
-	uint8_t i = 15;
-	const int8_t *kpSrc = (int8_t*)&pRef[-kiStride];
-	const uint64_t kuiT1 = LD64(kpSrc  );
-	const uint64_t kuiT2 = LD64(kpSrc+8);
-	uint8_t *pDst = pPred;
-
-	do
-	{
-		ST64(pDst  , kuiT1);
-		ST64(pDst+8, kuiT2);
-		pDst += 16;
-	}while(i-->0);
-}
-
-void WelsI16x16LumaPredH_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
-{
-	int32_t iStridex15 = (kiStride<<4)-kiStride;
-	int32_t iPredStride = 16;
-	int32_t iPredStridex15 = 240;	//(iPredStride<<4)-iPredStride;
-	uint8_t i = 15;
-	
-	do
-	{
-		const uint8_t kuiSrc8	= pRef[iStridex15-1];
-#ifdef _MSC_VER
-		const uint64_t kuiV64	= (uint64_t)(0x0101010101010101U * kuiSrc8);
-#else
-		const uint64_t kuiV64	= (uint64_t)(0x0101010101010101LL * kuiSrc8);
-#endif			
-		ST64( &pPred[iPredStridex15], kuiV64 );
-		ST64( &pPred[iPredStridex15+8], kuiV64 );		
-
-		iStridex15 -= kiStride;
-		iPredStridex15 -= iPredStride;
-	}while(i-->0);
-}
-
-void WelsI16x16LumaPredPlane_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
-{
-	int32_t iLTshift=0, iTopshift=0, iLeftshift=0, iTopSum=0, iLeftSum=0;
-	int32_t i, j;
-	uint8_t *pTop = &pRef[-kiStride];
-	uint8_t *pLeft = &pRef[-1];
-	int32_t iPredStride = 16;
-
-	for(i = 0 ; i < 8 ; i ++)
-	{
-		iTopSum += (i + 1) * (pTop[8 + i] - pTop[6 - i]);
-		iLeftSum += (i + 1) * (pLeft[(8 + i)*kiStride] - pLeft[(6 - i)*kiStride]);
-	}
-
-	iLTshift = (pLeft[15*kiStride] + pTop[15]) << 4;
-	iTopshift = (5 * iTopSum + 32) >> 6;
-	iLeftshift = (5 * iLeftSum + 32) >> 6;
-
-	for(i = 0 ; i < 16 ; i ++)
-	{
-		for(j = 0 ; j < 16 ; j ++)
-		{			
-			pPred[j] = (uint8_t)WELS_CLIP1((iLTshift + iTopshift * (j - 7) + iLeftshift * (i - 7) + 16) >> 5);
-		}
-		pPred += iPredStride;
-	}
-}
-
-void WelsI16x16LumaPredDc_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
-{
-	int32_t iStridex15 = (kiStride<<4)-kiStride;
-	int32_t iSum = 0;
-	uint8_t i = 15;
-	uint8_t iMean = 0;
-
-	/*caculate the iMean value*/
-	do
-	{
-		iSum += pRef[-1+iStridex15] + pRef[-kiStride+i];
-		iStridex15 -= kiStride;
-	}while(i-->0);
-	iMean = ( 16 + iSum ) >> 5;
-	memset(pPred, iMean, 256);
-}
-
-
-void WelsI16x16LumaPredDcTop_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
-{
-	int32_t iSum = 0;
-	uint8_t i = 15;
-	uint8_t iMean = 0;
-	
-	/*caculate the iMean value*/
-	do
-	{
-		iSum += pRef[-kiStride+i];
-	}while(i-->0);
-	iMean = ( 8 + iSum ) >> 4;
-	memset(pPred, iMean, 256);
-}
-
-void WelsI16x16LumaPredDcLeft_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
-{
-	int32_t iStridex15 = (kiStride<<4)-kiStride;
-	int32_t iSum = 0;
-	uint8_t i = 15;
-	uint8_t iMean = 0;
-
-	/*caculate the iMean value*/
-	do
-	{
-		iSum += pRef[-1+iStridex15];
-		iStridex15 -= kiStride;
-	}while(i-->0);
-	iMean = ( 8 + iSum ) >> 4;
-	memset(pPred, iMean, 256);
-}
-
-void WelsI16x16LumaPredDcNA_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
-{
-	memset(pPred, 0x80, 256);
-}
-
-void WelsInitIntraPredFuncs(SWelsFuncPtrList *pFuncList, const uint32_t kuiCpuFlag )
-{
-	pFuncList->pfGetLumaI16x16Pred[I16_PRED_V] =      WelsI16x16LumaPredV_c;
-	pFuncList->pfGetLumaI16x16Pred[I16_PRED_H] =      WelsI16x16LumaPredH_c;
-	pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC] =     WelsI16x16LumaPredDc_c;
-	pFuncList->pfGetLumaI16x16Pred[I16_PRED_P] =      WelsI16x16LumaPredPlane_c;
-	pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC_L] =   WelsI16x16LumaPredDcLeft_c;
-	pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC_T] =   WelsI16x16LumaPredDcTop_c;
-	pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC_128] = WelsI16x16LumaPredDcNA_c;
-
-	pFuncList->pfGetLumaI4x4Pred[I4_PRED_V] = WelsI4x4LumaPredV_c;
-	pFuncList->pfGetLumaI4x4Pred[I4_PRED_H] = WelsI4x4LumaPredH_c;
-	pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC] = WelsI4x4LumaPredDc_c;
-	pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC_L] = WelsI4x4LumaPredDcLeft_c;
-	pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC_T] = WelsI4x4LumaPredDcTop_c;
-	pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC_128] = WelsI4x4LumaPredDcNA_c;
-
-	pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDL] = WelsI4x4LumaPredDDL_c;
-	pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDL_TOP] = WelsI4x4LumaPredDDLTop_c;
-	pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDR] = WelsI4x4LumaPredDDR_c;
-
-	pFuncList->pfGetLumaI4x4Pred[I4_PRED_VL] = WelsI4x4LumaPredVL_c;
-	pFuncList->pfGetLumaI4x4Pred[I4_PRED_VL_TOP] = WelsI4x4LumaPredVLTop_c;
-	pFuncList->pfGetLumaI4x4Pred[I4_PRED_VR] = WelsI4x4LumaPredVR_c;
-	pFuncList->pfGetLumaI4x4Pred[I4_PRED_HU] = WelsI4x4LumaPredHU_c;
-	pFuncList->pfGetLumaI4x4Pred[I4_PRED_HD] = WelsI4x4LumaPredHD_c;
-
-	pFuncList->pfGetChromaPred[C_PRED_DC] = WelsIChormaPredDc_c;
-	pFuncList->pfGetChromaPred[C_PRED_H] = WelsIChormaPredH_c;
-	pFuncList->pfGetChromaPred[C_PRED_V] = WelsIChormaPredV_c;
-	pFuncList->pfGetChromaPred[C_PRED_P] = WelsIChormaPredPlane_c;
-	pFuncList->pfGetChromaPred[C_PRED_DC_L] = WelsIChormaPredDcLeft_c;
-	pFuncList->pfGetChromaPred[C_PRED_DC_T] = WelsIChormaPredDcTop_c;
-	pFuncList->pfGetChromaPred[C_PRED_DC_128] = WelsIChormaPredDcNA_c;
-#ifdef X86_ASM
-	if( kuiCpuFlag & WELS_CPU_MMXEXT )
-	{
-		pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDR] = WelsI4x4LumaPredDDR_mmx;
-		pFuncList->pfGetLumaI4x4Pred[I4_PRED_HD]  = WelsI4x4LumaPredHD_mmx;
-		pFuncList->pfGetLumaI4x4Pred[I4_PRED_HU]  = WelsI4x4LumaPredHU_mmx;
-		pFuncList->pfGetLumaI4x4Pred[I4_PRED_VR]  = WelsI4x4LumaPredVR_mmx;
-		pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDL] = WelsI4x4LumaPredDDL_mmx;
-		pFuncList->pfGetLumaI4x4Pred[I4_PRED_VL]  = WelsI4x4LumaPredVL_mmx;
- 		pFuncList->pfGetChromaPred[C_PRED_H] = WelsIChromaPredH_mmx;
-	}
-	if ( kuiCpuFlag & WELS_CPU_SSE2 )
-	{
-		pFuncList->pfGetLumaI4x4Pred[I4_PRED_H] = WelsI4x4LumaPredH_sse2;
-		pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC] = WelsI4x4LumaPredDc_sse2;
-		pFuncList->pfGetLumaI4x4Pred[I4_PRED_V] = WelsI4x4LumaPredV_sse2;
-
-		pFuncList->pfGetLumaI16x16Pred[I16_PRED_V] = WelsI16x16LumaPredV_sse2;
-		pFuncList->pfGetLumaI16x16Pred[I16_PRED_H] = WelsI16x16LumaPredH_sse2;
-		pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC] = WelsI16x16LumaPredDc_sse2;
-		pFuncList->pfGetLumaI16x16Pred[I16_PRED_P] = WelsI16x16LumaPredPlane_sse2;
-
-		pFuncList->pfGetChromaPred[C_PRED_DC]	= WelsIChromaPredDc_sse2;
-		pFuncList->pfGetChromaPred[C_PRED_V]	= WelsIChromaPredV_sse2;
-		pFuncList->pfGetChromaPred[C_PRED_P]	= WelsIChromaPredPlane_sse2;
-	}
-#endif
-}
-}
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	get_intra_predictor.cpp
+ *
+ * \brief	implementation of the intra predictors for 16x16 luma, 4x4 luma and chroma blocks.
+ *
+ * \date	4/2/2009 Created
+ *			9/14/2009 C level based optimization with high performance gained.
+ *				[const, using ST32/ST64 to replace memset, memcpy and memmove etc.]
+ *
+ *************************************************************************************
+ */
+#include <string.h>
+#include "macros.h"
+#include "ls_defines.h"
+#include "cpu_core.h"
+#include "get_intra_predictor.h"
+#include "wels_common_basis.h"
+#include "array_stack_align.h"
+
+namespace WelsSVCEnc {
+#define I4x4_COUNT 4   // NOTE(review): none of the three *_COUNT macros is used in the visible code — confirm they are needed
+#define I8x8_COUNT 8
+#define I16x16_COUNT 16
+// Function-pointer types for the fill helpers that WelsInitFillingPredFuncs selects at run time.
+typedef void (*PFillingPred) (uint8_t* pPred, uint8_t* pSrc);   // fills pPred from a source buffer
+typedef void (*PFillingPred1to16) (uint8_t* pPred, const uint8_t kuiSrc);   // fills pPred from a single byte value
+
+static inline void WelsFillingPred8to16_c (uint8_t* pPred, uint8_t* pSrc) {   // C fallback: write one 8-byte source twice
+  ST64 (pPred  , LD64 (pSrc));     // pPred[0..7]  <- pSrc[0..7] (assuming ST64/LD64 are 64-bit store/load macros — confirm)
+  ST64 (pPred + 8, LD64 (pSrc));   // pPred[8..15] <- the same pSrc[0..7] again
+}
+static inline void WelsFillingPred8x2to16_c (uint8_t* pPred, uint8_t* pSrc) {   // C fallback: copy two consecutive 8-byte groups
+  ST64 (pPred  , LD64 (pSrc));         // pPred[0..7]  <- pSrc[0..7] (assuming ST64/LD64 are 64-bit store/load macros — confirm)
+  ST64 (pPred + 8, LD64 (pSrc + 8));   // pPred[8..15] <- pSrc[8..15]
+}
+static inline void WelsFillingPred1to16_c (uint8_t* pPred, const uint8_t kuiSrc) {  // C fallback: set all 16 bytes of pPred to kuiSrc
+  const uint8_t kuiSrc8[8] = { kuiSrc, kuiSrc, kuiSrc, kuiSrc, kuiSrc, kuiSrc, kuiSrc, kuiSrc };  // 8 copies of the byte, read back below as one 64-bit value
+  ST64 (pPred  , LD64 (kuiSrc8));  // first 8 bytes
+  ST64 (pPred + 8, LD64 (kuiSrc8));  // last 8 bytes
+}
+
+PFillingPred					WelsFillingPred8to16;  // dispatch pointer: 8 source bytes duplicated into 16; assigned in WelsInitFillingPredFuncs
+PFillingPred					WelsFillingPred8x2to16;  // dispatch pointer: 16 source bytes copied to 16; assigned in WelsInitFillingPredFuncs
+PFillingPred1to16 WelsFillingPred1to16;  // dispatch pointer: one byte broadcast to 16; assigned in WelsInitFillingPredFuncs
+
+void WelsInitFillingPredFuncs (const uint32_t kuiCpuFlag) {  // Bind the three fill helpers; the 4x4 predictors in this file call through these pointers, so this must run first
+  WelsFillingPred8to16	= WelsFillingPred8to16_c;  // portable C defaults
+  WelsFillingPred8x2to16	= WelsFillingPred8x2to16_c;
+  WelsFillingPred1to16	= WelsFillingPred1to16_c;
+
+#if defined(X86_ASM)
+  if (kuiCpuFlag & WELS_CPU_MMXEXT) {  // MMXEXT bit set: replace all three with the _mmx versions
+    WelsFillingPred8to16		= WelsFillingPred8to16_mmx;
+    WelsFillingPred8x2to16	    = WelsFillingPred8x2to16_mmx;
+    WelsFillingPred1to16		= WelsFillingPred1to16_mmx;
+  }
+  if (kuiCpuFlag & WELS_CPU_SSE2) {  // SSE2 bit set: checked last, so these two override the MMX choice; 8to16 keeps its earlier binding
+    WelsFillingPred8x2to16	    = WelsFillingPred8x2to16_sse2;
+    WelsFillingPred1to16		= WelsFillingPred1to16_sse2;
+  }
+#endif//X86_ASM
+}
+
+
+
+#define I4x4_PRED_STRIDE 4
+#define I4x4_PRED_STRIDE2 8
+#define I4x4_PRED_STRIDE3 12
+
+void WelsI4x4LumaPredV_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {  // Vertical 4x4: the 4 bytes directly above pRef are repeated into the 16-byte pPred
+  const uint32_t kuiSrc = LD32 (&pRef[-kiStride]);  // 4 bytes starting one stride before pRef (assumes LD32 is a 32-bit load macro -- confirm)
+  ENFORCE_STACK_ALIGN_1D (uint32_t, uiSrcx2, 2, 16)  // presumably declares a 16-byte-aligned uint32_t uiSrcx2[2] -- confirm in array_stack_align.h
+  uiSrcx2[0] = uiSrcx2[1] = kuiSrc;  // two copies = 8 bytes
+
+  WelsFillingPred8to16 (pPred, (uint8_t*)&uiSrcx2[0]);  // helper duplicates those 8 bytes to fill 16
+}
+
+void WelsI4x4LumaPredH_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {  // Horizontal 4x4: each 4-byte row of pPred is filled with the byte left of that row in pRef
+  const int32_t kiStridex2Left = (kiStride << 1) - 1;  // offset of the left neighbour of row 2; kept signed like kiStride (was uint32_t, which would wrap for a negative stride)
+  const int32_t kiStridex3Left = kiStride + kiStridex2Left;  // offset of the left neighbour of row 3
+  const uint8_t kuiHor1 = pRef[-1];  // left neighbour of row 0
+  const uint8_t kuiHor2 = pRef[kiStride - 1];  // left neighbour of row 1
+  const uint8_t kuiHor3 = pRef[kiStridex2Left];
+  const uint8_t kuiHor4 = pRef[kiStridex3Left];
+  const uint8_t kuiVec1[4] = {kuiHor1, kuiHor1, kuiHor1, kuiHor1};  // each neighbour replicated 4 times = one output row
+  const uint8_t kuiVec2[4] = {kuiHor2, kuiHor2, kuiHor2, kuiHor2};
+  const uint8_t kuiVec3[4] = {kuiHor3, kuiHor3, kuiHor3, kuiHor3};
+  const uint8_t kuiVec4[4] = {kuiHor4, kuiHor4, kuiHor4, kuiHor4};
+  ENFORCE_STACK_ALIGN_1D (uint8_t, uiSrc, 16, 16)	// presumably a 16-byte-aligned uint8_t uiSrc[16] -- confirm; original TODO: the assignments below could be optimized
+  ST32 (&uiSrc[0], LD32 (kuiVec1));  // rows are packed back to back, 4 bytes per row
+  ST32 (&uiSrc[4], LD32 (kuiVec2));
+  ST32 (&uiSrc[8], LD32 (kuiVec3));
+  ST32 (&uiSrc[12], LD32 (kuiVec4));
+
+  WelsFillingPred8x2to16 (pPred, uiSrc);  // copy the assembled 16 bytes to pPred
+}
+void WelsI4x4LumaPredDc_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {  // DC 4x4: fill pPred with the rounded mean of the 4 left and 4 top neighbours
+  const uint8_t kuiDcValue	= (pRef[-1] + pRef[kiStride - 1] + pRef[ (kiStride << 1) - 1] + pRef[ (kiStride << 1) +  // left neighbours of rows 0..3 ...
+                               kiStride - 1] +
+                               pRef[-kiStride] + pRef[1 - kiStride] + pRef[2 - kiStride] + pRef[3 - kiStride] + 4) >> 3;  // ... plus the 4 bytes above; +4 then >>3 = rounded divide by 8
+
+  WelsFillingPred1to16 (pPred, kuiDcValue);  // broadcast the mean to all 16 bytes
+}
+
+void WelsI4x4LumaPredDcLeft_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {  // DC 4x4 from the left column only
+  const uint8_t kuiDcValue	= (pRef[-1] + pRef[kiStride - 1] + pRef[ (kiStride << 1) - 1] + pRef[ (kiStride << 1) +  // left neighbours of rows 0..3
+                               kiStride - 1] + 2) >> 2;  // +2 then >>2 = rounded divide by 4
+
+  WelsFillingPred1to16 (pPred, kuiDcValue);  // broadcast the mean to all 16 bytes
+}
+
+void WelsI4x4LumaPredDcTop_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {  // DC 4x4 from the row above only
+  const uint8_t kuiDcValue	= (pRef[-kiStride] + pRef[1 - kiStride] + pRef[2 - kiStride] + pRef[3 - kiStride] + 2) >> 2;  // rounded mean of the 4 bytes above
+
+  WelsFillingPred1to16 (pPred, kuiDcValue);  // broadcast the mean to all 16 bytes
+}
+
+void WelsI4x4LumaPredDcNA_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {  // DC 4x4 without neighbours: pRef and kiStride are not read
+  const uint8_t kuiDcValue = 0x80;  // fixed mid-level value 128
+
+  WelsFillingPred1to16 (pPred, kuiDcValue);  // every byte of the 16-byte pPred becomes 0x80
+}
+
+/* diagonal down-left 4x4 prediction; reads 8 bytes of the row above pRef */
+void WelsI4x4LumaPredDDL_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
+  /* top neighbours T0..T7 */
+  const uint8_t kuiT0		= pRef[-kiStride];
+  const uint8_t kuiT1		= pRef[1 - kiStride];
+  const uint8_t kuiT2		= pRef[2 - kiStride];
+  const uint8_t kuiT3		= pRef[3 - kiStride];
+  const uint8_t kuiT4		= pRef[4 - kiStride];
+  const uint8_t kuiT5		= pRef[5 - kiStride];
+  const uint8_t kuiT6		= pRef[6 - kiStride];
+  const uint8_t kuiT7		= pRef[7 - kiStride];
+  const uint8_t kuiDDL0	= (2 + kuiT0 + kuiT2 + (kuiT1 << 1)) >> 2;	// uiDDL0: rounded (1,2,1)/4 filter centred on T1
+  const uint8_t kuiDDL1	= (2 + kuiT1 + kuiT3 + (kuiT2 << 1)) >> 2;	// uiDDL1
+  const uint8_t kuiDDL2	= (2 + kuiT2 + kuiT4 + (kuiT3 << 1)) >> 2;	// uiDDL2
+  const uint8_t kuiDDL3	= (2 + kuiT3 + kuiT5 + (kuiT4 << 1)) >> 2;	// uiDDL3
+  const uint8_t kuiDDL4	= (2 + kuiT4 + kuiT6 + (kuiT5 << 1)) >> 2;	// uiDDL4
+  const uint8_t kuiDDL5	= (2 + kuiT5 + kuiT7 + (kuiT6 << 1)) >> 2;	// uiDDL5
+  const uint8_t kuiDDL6	= (2 + kuiT6 + kuiT7 + (kuiT7 << 1)) >> 2;	// uiDDL6: T7 stands in for the sample past the end, i.e. (T6 + 3*T7 + 2) >> 2
+  ENFORCE_STACK_ALIGN_1D (uint8_t, uiSrc, 16, 16)	// presumably a 16-byte-aligned uint8_t uiSrc[16] -- confirm; original TODO: the assignments below could be optimized
+  uiSrc[0] = kuiDDL0;  // uiSrc is 4 rows of 4 bytes; index 4*row + col receives kuiDDL(row + col)
+  uiSrc[1] = uiSrc[4] = kuiDDL1;
+  uiSrc[2] = uiSrc[5] = uiSrc[8] = kuiDDL2;
+  uiSrc[3] = uiSrc[6] = uiSrc[9] = uiSrc[12] = kuiDDL3;
+  uiSrc[7] = uiSrc[10] = uiSrc[13] = kuiDDL4;
+  uiSrc[11] = uiSrc[14] = kuiDDL5;
+  uiSrc[15] = kuiDDL6;
+
+  WelsFillingPred8x2to16 (pPred, uiSrc);  // copy the assembled 16 bytes to pPred
+}
+
+/* diagonal down-left 4x4 prediction reading only the 4 bytes directly above; same formulas as WelsI4x4LumaPredDDL_c with T3 in place of T4..T7 */
+void WelsI4x4LumaPredDDLTop_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
+  /* top neighbours T0..T3 */
+  const uint8_t kuiT0	= pRef[-kiStride];
+  const uint8_t kuiT1	= pRef[1 - kiStride];
+  const uint8_t kuiT2	= pRef[2 - kiStride];
+  const uint8_t kuiT3	= pRef[3 - kiStride];
+  const uint8_t kuiDLT0	= (2 + kuiT0 + kuiT2 + (kuiT1 << 1)) >> 2;	// uiDLT0
+  const uint8_t kuiDLT1	= (2 + kuiT1 + kuiT3 + (kuiT2 << 1)) >> 2;	// uiDLT1
+  const uint8_t kuiDLT2	= (2 + kuiT2 + kuiT3 + (kuiT3 << 1)) >> 2;	// uiDLT2
+  const uint8_t kuiDLT3	= (2 + (kuiT3 << 2)) >> 2;				// uiDLT3: evaluates to T3 itself
+  ENFORCE_STACK_ALIGN_1D (uint8_t, uiSrc, 16, 16)	// presumably a 16-byte-aligned uint8_t uiSrc[16] -- confirm; original TODO: the assignments below could be optimized
+  memset (&uiSrc[6], kuiDLT3, 10 * sizeof (uint8_t));  // pre-fill uiSrc[6..15] with T3; uiSrc[8] is overwritten two lines below
+  uiSrc[0] = kuiDLT0;
+  uiSrc[1] = uiSrc[4] = kuiDLT1;
+  uiSrc[2] = uiSrc[5] = uiSrc[8] = kuiDLT2;
+  uiSrc[3] = kuiDLT3;
+
+  WelsFillingPred8x2to16 (pPred, uiSrc);  // copy the assembled 16 bytes to pPred
+}
+
+
+/* diagonal down-right 4x4 prediction; reads the top-left corner, 4 left and 4 top neighbours */
+void WelsI4x4LumaPredDDR_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
+  const int32_t kiStridex2	= kiStride << 1;
+  const int32_t kiStridex3	= kiStride + kiStridex2;
+  const uint8_t kuiLT			= pRef[-kiStride - 1];	// top-left corner neighbour
+  /* left neighbours L0..L3 and top neighbours T0..T3 */
+  const uint8_t kuiL0			= pRef[-1];
+  const uint8_t kuiL1			= pRef[kiStride - 1];
+  const uint8_t kuiL2			= pRef[kiStridex2 - 1];
+  const uint8_t kuiL3			= pRef[kiStridex3 - 1];
+  const uint8_t kuiT0			= pRef[-kiStride];
+  const uint8_t kuiT1			= pRef[1 - kiStride];
+  const uint8_t kuiT2			= pRef[2 - kiStride];
+  const uint8_t kuiT3			= pRef[3 - kiStride];
+  const uint16_t kuiTL0		= 1 + kuiLT + kuiL0;  // pairwise sums, each carrying +1; adding two of them yields the "+2" rounding term of a (1,2,1)/4 filter
+  const uint16_t kuiLT0		= 1 + kuiLT + kuiT0;
+  const uint16_t kuiT01		= 1 + kuiT0 + kuiT1;
+  const uint16_t kuiT12		= 1 + kuiT1 + kuiT2;
+  const uint16_t kuiT23		= 1 + kuiT2 + kuiT3;
+  const uint16_t kuiL01		= 1 + kuiL0 + kuiL1;
+  const uint16_t kuiL12		= 1 + kuiL1 + kuiL2;
+  const uint16_t kuiL23		= 1 + kuiL2 + kuiL3;
+  const uint8_t kuiDDR0		= (kuiTL0 + kuiLT0) >> 2;  // (L0 + 2*LT + T0 + 2) >> 2
+  const uint8_t kuiDDR1		= (kuiLT0 + kuiT01) >> 2;
+  const uint8_t kuiDDR2		= (kuiT01 + kuiT12) >> 2;
+  const uint8_t kuiDDR3		= (kuiT12 + kuiT23) >> 2;
+  const uint8_t kuiDDR4		= (kuiTL0 + kuiL01) >> 2;
+  const uint8_t kuiDDR5		= (kuiL01 + kuiL12) >> 2;
+  const uint8_t kuiDDR6		= (kuiL12 + kuiL23) >> 2;
+  ENFORCE_STACK_ALIGN_1D (uint8_t, uiSrc, 16, 16)	// presumably a 16-byte-aligned uint8_t uiSrc[16] -- confirm; original TODO: the assignments below could be optimized
+  uiSrc[0] = uiSrc[5] = uiSrc[10] = uiSrc[15] = kuiDDR0;  // main diagonal of the 4x4 layout (index = 4*row + col)
+  uiSrc[1] = uiSrc[6] = uiSrc[11] = kuiDDR1;  // diagonals above the main one use the top neighbours
+  uiSrc[2] = uiSrc[7] = kuiDDR2;
+  uiSrc[3] = kuiDDR3;
+  uiSrc[4] = uiSrc[9] = uiSrc[14] = kuiDDR4;  // diagonals below the main one use the left neighbours
+  uiSrc[8] = uiSrc[13] = kuiDDR5;
+  uiSrc[12] = kuiDDR6;
+
+  WelsFillingPred8x2to16 (pPred, uiSrc);  // copy the assembled 16 bytes to pPred
+}
+
+
+/* vertical-left 4x4 prediction; reads 7 bytes of the row above pRef */
+void WelsI4x4LumaPredVL_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
+  /* top neighbours T0..T6 */
+  const uint8_t kuiT0		= pRef[-kiStride];
+  const uint8_t kuiT1		= pRef[1 - kiStride];
+  const uint8_t kuiT2		= pRef[2 - kiStride];
+  const uint8_t kuiT3		= pRef[3 - kiStride];
+  const uint8_t kuiT4		= pRef[4 - kiStride];
+  const uint8_t kuiT5		= pRef[5 - kiStride];
+  const uint8_t kuiT6		= pRef[6 - kiStride];
+  const uint8_t kuiVL0	= (1 + kuiT0 + kuiT1) >> 1;				// uiVL0: rounded 2-tap average
+  const uint8_t kuiVL1	= (1 + kuiT1 + kuiT2) >> 1;				// uiVL1
+  const uint8_t kuiVL2	= (1 + kuiT2 + kuiT3) >> 1;				// uiVL2
+  const uint8_t kuiVL3	= (1 + kuiT3 + kuiT4) >> 1;				// uiVL3
+  const uint8_t kuiVL4	= (1 + kuiT4 + kuiT5) >> 1;				// uiVL4
+  const uint8_t kuiVL5	= (2 + kuiT0 + (kuiT1 << 1) + kuiT2) >> 2;	// uiVL5: rounded (1,2,1)/4 filter
+  const uint8_t kuiVL6	= (2 + kuiT1 + (kuiT2 << 1) + kuiT3) >> 2;	// uiVL6
+  const uint8_t kuiVL7	= (2 + kuiT2 + (kuiT3 << 1) + kuiT4) >> 2;	// uiVL7
+  const uint8_t kuiVL8	= (2 + kuiT3 + (kuiT4 << 1) + kuiT5) >> 2;	// uiVL8
+  const uint8_t kuiVL9	= (2 + kuiT4 + (kuiT5 << 1) + kuiT6) >> 2;	// uiVL9
+  ENFORCE_STACK_ALIGN_1D (uint8_t, uiSrc, 16, 16)	// presumably a 16-byte-aligned uint8_t uiSrc[16] -- confirm; original TODO: the assignments below could be optimized
+  uiSrc[0] = kuiVL0;  // rows 0 and 2 (indices 0..3, 8..11) take the 2-tap values
+  uiSrc[1] = uiSrc[8] = kuiVL1;
+  uiSrc[2] = uiSrc[9] = kuiVL2;
+  uiSrc[3] = uiSrc[10] = kuiVL3;
+  uiSrc[4] = kuiVL5;  // rows 1 and 3 (indices 4..7, 12..15) take the 3-tap values
+  uiSrc[5] = uiSrc[12] = kuiVL6;
+  uiSrc[6] = uiSrc[13] = kuiVL7;
+  uiSrc[7] = uiSrc[14] = kuiVL8;
+  uiSrc[11] = kuiVL4;
+  uiSrc[15] = kuiVL9;
+
+  WelsFillingPred8x2to16 (pPred, uiSrc);  // copy the assembled 16 bytes to pPred
+}
+
+
+
+/* vertical-left 4x4 prediction reading only the 4 bytes directly above; same formulas as WelsI4x4LumaPredVL_c with T3 in place of T4..T6 */
+void WelsI4x4LumaPredVLTop_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
+  uint8_t* pTopLeft		= &pRef[-kiStride - 1];	// address of the top-left corner neighbour; only the bytes after it are read
+  /* top neighbours T0..T3 */
+  const uint8_t kuiT0		= * (pTopLeft + 1);
+  const uint8_t kuiT1		= * (pTopLeft + 2);
+  const uint8_t kuiT2		= * (pTopLeft + 3);
+  const uint8_t kuiT3		= * (pTopLeft + 4);
+  const uint8_t kuiVLT0	= (1 + kuiT0 + kuiT1) >> 1;				// uiVLT0
+  const uint8_t kuiVLT1	= (1 + kuiT1 + kuiT2) >> 1;				// uiVLT1
+  const uint8_t kuiVLT2	= (1 + kuiT2 + kuiT3) >> 1;				// uiVLT2
+  const uint8_t kuiVLT3	= (1 + (kuiT3 << 1)) >> 1;				// uiVLT3: evaluates to T3 itself
+  const uint8_t kuiVLT4	= (2 + kuiT0 + (kuiT1 << 1) + kuiT2) >> 2;	// uiVLT4
+  const uint8_t kuiVLT5	= (2 + kuiT1 + (kuiT2 << 1) + kuiT3) >> 2;	// uiVLT5
+  const uint8_t kuiVLT6	= (2 + kuiT2 + (kuiT3 << 1) + kuiT3) >> 2;	// uiVLT6
+  const uint8_t kuiVLT7	= (2 + (kuiT3 << 2)) >> 2;				// uiVLT7: evaluates to T3 itself
+  ENFORCE_STACK_ALIGN_1D (uint8_t, uiSrc, 16, 16)	// presumably a 16-byte-aligned uint8_t uiSrc[16] -- confirm; original TODO: the assignments below could be optimized
+  uiSrc[0] = kuiVLT0;  // rows 0 and 2 take the 2-tap values
+  uiSrc[1] = uiSrc[8] = kuiVLT1;
+  uiSrc[2] = uiSrc[9] = kuiVLT2;
+  uiSrc[3] = uiSrc[10] = uiSrc[11] = kuiVLT3;
+  uiSrc[4] = kuiVLT4;  // rows 1 and 3 take the 3-tap values
+  uiSrc[5] = uiSrc[12] = kuiVLT5;
+  uiSrc[6] = uiSrc[13] = kuiVLT6;
+  uiSrc[7] = uiSrc[14] = uiSrc[15] = kuiVLT7;
+
+  WelsFillingPred8x2to16 (pPred, uiSrc);  // copy the assembled 16 bytes to pPred
+}
+
+/* vertical-right 4x4 prediction; reads the top-left corner, 3 left and 4 top neighbours */
+void WelsI4x4LumaPredVR_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
+  const int32_t kiStridex2	= kiStride << 1;
+  const uint8_t kuiLT			= pRef[-kiStride - 1];	// top-left corner neighbour
+  /* left neighbours L0..L2 and top neighbours T0..T3 */
+  const uint8_t kuiL0			= pRef[-1];
+  const uint8_t kuiL1			= pRef[kiStride - 1];
+  const uint8_t kuiL2			= pRef[kiStridex2 - 1];
+  const uint8_t kuiT0			= pRef[-kiStride];
+  const uint8_t kuiT1			= pRef[1 - kiStride];
+  const uint8_t kuiT2			= pRef[2 - kiStride];
+  const uint8_t kuiT3			= pRef[3 - kiStride];
+  const uint8_t kuiVR0		= (1 + kuiLT + kuiT0) >> 1;  // VR0..VR3: rounded 2-tap averages along the top edge
+  const uint8_t kuiVR1		= (1 + kuiT0 + kuiT1) >> 1;
+  const uint8_t kuiVR2		= (1 + kuiT1 + kuiT2) >> 1;
+  const uint8_t kuiVR3		= (1 + kuiT2 + kuiT3) >> 1;
+  const uint8_t kuiVR4		= (2 + kuiL0 + (kuiLT << 1) + kuiT0) >> 2;  // VR4..VR9: rounded (1,2,1)/4 filters
+  const uint8_t kuiVR5		= (2 + kuiLT + (kuiT0 << 1) + kuiT1) >> 2;
+  const uint8_t kuiVR6		= (2 + kuiT0 + (kuiT1 << 1) + kuiT2) >> 2;
+  const uint8_t kuiVR7		= (2 + kuiT1 + (kuiT2 << 1) + kuiT3) >> 2;
+  const uint8_t kuiVR8		= (2 + kuiLT + (kuiL0 << 1) + kuiL1) >> 2;
+  const uint8_t kuiVR9		= (2 + kuiL0 + (kuiL1 << 1) + kuiL2) >> 2;
+  ENFORCE_STACK_ALIGN_1D (uint8_t, uiSrc, 16, 16)	// presumably a 16-byte-aligned uint8_t uiSrc[16] -- confirm; original TODO: the assignments below could be optimized
+  uiSrc[0] = uiSrc[9] = kuiVR0;  // 4x4 layout, index = 4*row + col; row 2 repeats row 0 shifted right by one
+  uiSrc[1] = uiSrc[10] = kuiVR1;
+  uiSrc[2] = uiSrc[11] = kuiVR2;
+  uiSrc[3] = kuiVR3;
+  uiSrc[4] = uiSrc[13] = kuiVR4;  // row 3 repeats row 1 shifted right by one
+  uiSrc[5] = uiSrc[14] = kuiVR5;
+  uiSrc[6] = uiSrc[15] = kuiVR6;
+  uiSrc[7] = kuiVR7;
+  uiSrc[8] = kuiVR8;
+  uiSrc[12] = kuiVR9;
+
+  WelsFillingPred8x2to16 (pPred, uiSrc);  // copy the assembled 16 bytes to pPred
+}
+
+
+/* horizontal-up 4x4 prediction; reads only the 4 left neighbours */
+void WelsI4x4LumaPredHU_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
+  const int32_t kiStridex2	= kiStride << 1;
+  const int32_t kiStridex3	= kiStride + kiStridex2;
+  /* left neighbours L0..L3 */
+  const uint8_t kuiL0			= pRef[-1];
+  const uint8_t kuiL1			= pRef[kiStride - 1];
+  const uint8_t kuiL2			= pRef[kiStridex2 - 1];
+  const uint8_t kuiL3			= pRef[kiStridex3 - 1];
+  const uint16_t kuiL01		= (1 + kuiL0 + kuiL1);  // pairwise sums, each already carrying +1 for rounding
+  const uint16_t kuiL12		= (1 + kuiL1 + kuiL2);
+  const uint16_t kuiL23		= (1 + kuiL2 + kuiL3);
+  const uint8_t kuiHU0		= kuiL01 >> 1;  // rounded average of L0, L1
+  const uint8_t kuiHU1		= (kuiL01 + kuiL12) >> 2;  // (L0 + 2*L1 + L2 + 2) >> 2
+  const uint8_t kuiHU2		= kuiL12 >> 1;
+  const uint8_t kuiHU3		= (kuiL12 + kuiL23) >> 2;
+  const uint8_t kuiHU4		= kuiL23 >> 1;
+  const uint8_t kuiHU5		= (1 + kuiL23 + (kuiL3 << 1)) >> 2;  // (L2 + 3*L3 + 2) >> 2, since kuiL23 already includes +1
+  ENFORCE_STACK_ALIGN_1D (uint8_t, uiSrc, 16, 16)	// presumably a 16-byte-aligned uint8_t uiSrc[16] -- confirm; original TODO: the assignments below could be optimized
+  uiSrc[0] = kuiHU0;  // 4x4 layout, index = 4*row + col; each row repeats the previous one shifted left by two
+  uiSrc[1] = kuiHU1;
+  uiSrc[2] = uiSrc[4] = kuiHU2;
+  uiSrc[3] = uiSrc[5] = kuiHU3;
+  uiSrc[6] = uiSrc[8] = kuiHU4;
+  uiSrc[7] = uiSrc[9] = kuiHU5;
+  memset (&uiSrc[10], kuiL3, 6 * sizeof (uint8_t));  // the remaining entries uiSrc[10..15] are plain L3
+
+  WelsFillingPred8x2to16 (pPred, uiSrc);  // copy the assembled 16 bytes to pPred
+}
+
+
+/* horizontal-down 4x4 prediction; reads the top-left corner, 4 left and 3 top neighbours */
+void WelsI4x4LumaPredHD_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
+  const int32_t kiStridex2	= kiStride << 1;
+  const int32_t kiStridex3	= kiStride + kiStridex2;
+  const uint8_t kuiLT		= pRef[-kiStride - 1];	// top-left corner neighbour
+  /* left neighbours L0..L3 and top neighbours T0..T2 */
+  const uint8_t kuiL0		= pRef[-1];
+  const uint8_t kuiL1		= pRef[kiStride - 1];
+  const uint8_t kuiL2		= pRef[kiStridex2 - 1];
+  const uint8_t kuiL3		= pRef[kiStridex3 - 1];
+  const uint8_t kuiT0		= pRef[-kiStride];
+  const uint8_t kuiT1		= pRef[1 - kiStride];
+  const uint8_t kuiT2		= pRef[2 - kiStride];
+  const uint8_t kuiHD0		= (1 + kuiLT + kuiL0) >> 1;				// uiHD0: rounded 2-tap average
+  const uint8_t kuiHD1		= (2 + kuiL0 + (kuiLT << 1) + kuiT0) >> 2;	// uiHD1: rounded (1,2,1)/4 filter
+  const uint8_t kuiHD2		= (2 + kuiLT + (kuiT0 << 1) + kuiT1) >> 2;	// uiHD2
+  const uint8_t kuiHD3		= (2 + kuiT0 + (kuiT1 << 1) + kuiT2) >> 2;	// uiHD3
+  const uint8_t kuiHD4		= (1 + kuiL0 + kuiL1) >> 1;				// uiHD4
+  const uint8_t kuiHD5		= (2 + kuiLT + (kuiL0 << 1) + kuiL1) >> 2;	// uiHD5
+  const uint8_t kuiHD6		= (1 + kuiL1 + kuiL2) >> 1;				// uiHD6
+  const uint8_t kuiHD7		= (2 + kuiL0 + (kuiL1 << 1) + kuiL2) >> 2;	// uiHD7
+  const uint8_t kuiHD8		= (1 + kuiL2 + kuiL3) >> 1;				// uiHD8
+  const uint8_t kuiHD9		= (2 + kuiL1 + (kuiL2 << 1) + kuiL3) >> 2;	// uiHD9
+  ENFORCE_STACK_ALIGN_1D (uint8_t, uiSrc, 16, 16)	// presumably a 16-byte-aligned uint8_t uiSrc[16] -- confirm; original TODO: the assignments below could be optimized
+  uiSrc[0] = uiSrc[6] = kuiHD0;  // 4x4 layout, index = 4*row + col; each row repeats the previous one shifted right by two
+  uiSrc[1] = uiSrc[7] = kuiHD1;
+  uiSrc[2] = kuiHD2;
+  uiSrc[3] = kuiHD3;
+  uiSrc[4] = uiSrc[10] = kuiHD4;
+  uiSrc[5] = uiSrc[11] = kuiHD5;
+  uiSrc[8] = uiSrc[14] = kuiHD6;
+  uiSrc[9] = uiSrc[15] = kuiHD7;
+  uiSrc[12] = kuiHD8;
+  uiSrc[13] = kuiHD9;
+
+  WelsFillingPred8x2to16 (pPred, uiSrc);  // copy the assembled 16 bytes to pPred
+}
+
+
+
+#define I8x8_PRED_STRIDE 8
+
+void WelsIChormaPredV_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {  // Vertical 8x8 chroma: the 8 bytes above pRef are written to each of 8 rows ("Chorma" spelling is the name WelsInitIntraPredFuncs references)
+  const uint64_t kuiSrc64 = LD64 (&pRef[-kiStride]);  // 8 bytes starting one stride before pRef
+
+  ST64 (pPred   , kuiSrc64);  // eight stores at an 8-byte pitch = 64 output bytes
+  ST64 (pPred + 8 , kuiSrc64);
+  ST64 (pPred + 16, kuiSrc64);
+  ST64 (pPred + 24, kuiSrc64);
+  ST64 (pPred + 32, kuiSrc64);
+  ST64 (pPred + 40, kuiSrc64);
+  ST64 (pPred + 48, kuiSrc64);
+  ST64 (pPred + 56, kuiSrc64);
+}
+
+void WelsIChormaPredH_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {  // Horizontal 8x8 chroma: each 8-byte row of pPred is filled with the byte left of that row in pRef
+  int32_t iStridex7 = (kiStride << 3) - kiStride;  // source offset of row 7; rows are processed bottom-up
+  int32_t iI8x8Stridex7 = (I8x8_PRED_STRIDE << 3) - I8x8_PRED_STRIDE;  // destination offset of row 7 (7 * 8 = 56)
+  uint8_t i = 7;
+
+  do {
+    const uint8_t kuiLeft = pRef[iStridex7 - 1];	// left neighbour of the current row
+#ifdef _MSC_VER
+    uint64_t kuiSrc64 = (uint64_t) (0x0101010101010101U * kuiLeft);  // multiply replicates the byte into all 8 byte lanes
+#else
+    uint64_t kuiSrc64 = (uint64_t) (0x0101010101010101ULL * kuiLeft);  // unsigned literal: with LL the product overflowed signed 64-bit for kuiLeft >= 128
+#endif
+    ST64 (pPred + iI8x8Stridex7, kuiSrc64);  // write one 8-byte row
+
+    iStridex7 -= kiStride;  // step both offsets up one row
+    iI8x8Stridex7 -= I8x8_PRED_STRIDE;
+  } while (i-- > 0);  // post-decrement test: the body runs 8 times (i = 7..0)
+}
+
+
+void WelsIChormaPredPlane_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {  // Plane 8x8 chroma: fits a linear gradient to the top and left neighbours and writes 64 clipped bytes
+  int32_t iLTshift = 0, iTopshift = 0, iLeftshift = 0, iTopSum = 0, iLeftSum = 0;
+  int32_t i, j;
+  uint8_t* pTop = &pRef[-kiStride];  // row above the block
+  uint8_t* pLeft = &pRef[-1];  // column left of the block; rows are kiStride apart
+
+  for (i = 0 ; i < 4 ; i ++) {  // weighted differences of neighbour pairs placed symmetrically about the centre
+    iTopSum += (i + 1) * (pTop[4 + i] - pTop[2 - i]);  // at i == 3 the index 2 - i is -1, i.e. the top-left corner byte
+    iLeftSum += (i + 1) * (pLeft[ (4 + i) * kiStride] - pLeft[ (2 - i) * kiStride]);  // at i == 3 this also reaches the top-left corner
+  }
+
+  iLTshift = (pLeft[7 * kiStride] + pTop[7]) << 4;  // 16 * (last left neighbour + last top neighbour)
+  iTopshift = (17 * iTopSum + 16) >> 5;  // horizontal slope
+  iLeftshift = (17 * iLeftSum + 16) >> 5;  // vertical slope
+
+  for (i = 0 ; i < 8 ; i ++) {  // i = row, j = column
+    for (j = 0 ; j < 8 ; j ++) {
+      pPred[j] = (uint8_t)WELS_CLIP1 ((iLTshift + iTopshift * (j - 3) + iLeftshift * (i - 3) + 16) >> 5);  // WELS_CLIP1 presumably clamps to 0..255 -- confirm in macros.h
+    }
+    pPred += I8x8_PRED_STRIDE;  // advance to the next 8-byte output row
+  }
+}
+
+
+void WelsIChormaPredDc_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {  // DC 8x8 chroma: a separate mean for each 4x4 quadrant, from top and left neighbours
+  const int32_t kuiL1 = kiStride - 1;  // kuiL1..kuiL7: offsets of the left neighbours of rows 1..7
+  const int32_t kuiL2 = kuiL1 + kiStride;
+  const int32_t kuiL3 = kuiL2 + kiStride;
+  const int32_t kuiL4 = kuiL3 + kiStride;
+  const int32_t kuiL5 = kuiL4 + kiStride;
+  const int32_t kuiL6 = kuiL5 + kiStride;
+  const int32_t kuiL7 = kuiL6 + kiStride;
+  /* calculate the mean values */
+  const uint8_t kuiMean1	= (pRef[-kiStride] + pRef[1 - kiStride] + pRef[2 - kiStride] + pRef[3 - kiStride] +  // top-left quadrant: top[0..3] ...
+                             pRef[-1] + pRef[kuiL1] + pRef[kuiL2] + pRef[kuiL3] + 4) >> 3;  // ... plus left[0..3], rounded divide by 8
+  const uint32_t kuiSum2 = pRef[4 - kiStride] + pRef[5 - kiStride] + pRef[6 - kiStride] + pRef[7 - kiStride];  // top[4..7]
+  const uint32_t kuiSum3 = pRef[kuiL4] + pRef[kuiL5] + pRef[kuiL6] + pRef[kuiL7];  // left[4..7]
+  const uint8_t kuiMean2 = (kuiSum2 + 2) >> 2;  // top-right quadrant: top[4..7] only
+  const uint8_t kuiMean3 = (kuiSum3 + 2) >> 2;  // bottom-left quadrant: left[4..7] only
+  const uint8_t kuiMean4 = (kuiSum2 + kuiSum3 + 4) >> 3;  // bottom-right quadrant: top[4..7] and left[4..7]
+
+  const uint8_t kuiTopMean[8] = {kuiMean1, kuiMean1, kuiMean1, kuiMean1, kuiMean2, kuiMean2, kuiMean2, kuiMean2};  // one output row for rows 0..3
+  const uint8_t kuiBottomMean[8] = {kuiMean3, kuiMean3, kuiMean3, kuiMean3, kuiMean4, kuiMean4, kuiMean4, kuiMean4};  // one output row for rows 4..7
+  const uint64_t kuiTopMean64 = LD64 (kuiTopMean);
+  const uint64_t kuiBottomMean64 = LD64 (kuiBottomMean);
+
+  ST64 (pPred   , kuiTopMean64);  // rows 0..3
+  ST64 (pPred + 8 , kuiTopMean64);
+  ST64 (pPred + 16, kuiTopMean64);
+  ST64 (pPred + 24, kuiTopMean64);
+  ST64 (pPred + 32, kuiBottomMean64);  // rows 4..7
+  ST64 (pPred + 40, kuiBottomMean64);
+  ST64 (pPred + 48, kuiBottomMean64);
+  ST64 (pPred + 56, kuiBottomMean64);
+}
+
+void WelsIChormaPredDcLeft_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {  // DC 8x8 chroma from the left column only: one mean for rows 0..3, another for rows 4..7
+  const int32_t kuiL1	= kiStride - 1;  // kuiL1..kuiL7: offsets of the left neighbours of rows 1..7
+  const int32_t kuiL2	= kuiL1 + kiStride;
+  const int32_t kuiL3	= kuiL2 + kiStride;
+  const int32_t kuiL4	= kuiL3 + kiStride;
+  const int32_t kuiL5	= kuiL4 + kiStride;
+  const int32_t kuiL6	= kuiL5 + kiStride;
+  const int32_t kuiL7	= kuiL6 + kiStride;
+  /* calculate the mean values */
+  const uint8_t kuiTopMean	= (pRef[-1] + pRef[kuiL1] + pRef[kuiL2] + pRef[kuiL3] + 2) >> 2 ;  // rounded mean of left[0..3]
+  const uint8_t kuiBottomMean	= (pRef[kuiL4] + pRef[kuiL5] + pRef[kuiL6] + pRef[kuiL7] + 2) >> 2;  // rounded mean of left[4..7]
+#ifdef _MSC_VER
+  const uint64_t kuiTopMean64	= (uint64_t) (0x0101010101010101U * kuiTopMean);  // multiply replicates the byte into all 8 byte lanes
+  const uint64_t kuiBottomMean64	= (uint64_t) (0x0101010101010101U * kuiBottomMean);
+#else
+  const uint64_t kuiTopMean64	= (uint64_t) (0x0101010101010101ULL * kuiTopMean);  // unsigned literal: with LL the product overflowed signed 64-bit for means >= 128
+  const uint64_t kuiBottomMean64	= (uint64_t) (0x0101010101010101ULL * kuiBottomMean);
+#endif
+  ST64 (pPred   , kuiTopMean64);  // rows 0..3
+  ST64 (pPred + 8 , kuiTopMean64);
+  ST64 (pPred + 16, kuiTopMean64);
+  ST64 (pPred + 24, kuiTopMean64);
+  ST64 (pPred + 32, kuiBottomMean64);  // rows 4..7
+  ST64 (pPred + 40, kuiBottomMean64);
+  ST64 (pPred + 48, kuiBottomMean64);
+  ST64 (pPred + 56, kuiBottomMean64);
+}
+
+void WelsIChormaPredDcTop_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {  // DC 8x8 chroma from the row above only: one mean for columns 0..3, another for columns 4..7
+  /* calculate the mean values */
+  const uint8_t kuiMean1 = (pRef[-kiStride] + pRef[1 - kiStride] + pRef[2 - kiStride] + pRef[3 - kiStride] + 2) >> 2;  // rounded mean of top[0..3]
+  const uint8_t kuiMean2 = (pRef[4 - kiStride] + pRef[5 - kiStride] + pRef[6 - kiStride] + pRef[7 - kiStride] + 2) >> 2;  // rounded mean of top[4..7]
+  const uint8_t kuiMean[8] = {kuiMean1, kuiMean1, kuiMean1, kuiMean1, kuiMean2, kuiMean2, kuiMean2, kuiMean2};  // one output row
+  const uint64_t kuiMean64 = LD64 (kuiMean);
+
+  ST64 (pPred   , kuiMean64);  // the same row is written 8 times
+  ST64 (pPred + 8 , kuiMean64);
+  ST64 (pPred + 16, kuiMean64);
+  ST64 (pPred + 24, kuiMean64);
+  ST64 (pPred + 32, kuiMean64);
+  ST64 (pPred + 40, kuiMean64);
+  ST64 (pPred + 48, kuiMean64);
+  ST64 (pPred + 56, kuiMean64);
+}
+
+void WelsIChormaPredDcNA_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {  // DC 8x8 chroma without neighbours: pRef and kiStride are not read; all 64 bytes become 0x80
+#ifdef _MSC_VER
+  const uint64_t kuiDcValue64 = (uint64_t)0x8080808080808080U;  // eight 0x80 bytes
+#else
+  const uint64_t kuiDcValue64 = (uint64_t)0x8080808080808080LL;  // same constant with the suffix used for non-MSVC compilers
+#endif
+  ST64 (pPred   , kuiDcValue64);  // eight stores at an 8-byte pitch = 64 output bytes
+  ST64 (pPred + 8 , kuiDcValue64);
+  ST64 (pPred + 16, kuiDcValue64);
+  ST64 (pPred + 24, kuiDcValue64);
+  ST64 (pPred + 32, kuiDcValue64);
+  ST64 (pPred + 40, kuiDcValue64);
+  ST64 (pPred + 48, kuiDcValue64);
+  ST64 (pPred + 56, kuiDcValue64);
+}
+
+
+void WelsI16x16LumaPredV_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {  // Vertical 16x16: the 16 bytes above pRef are written to each of 16 rows of pPred
+  uint8_t i = 15;
+  const int8_t* kpSrc = (int8_t*)&pRef[-kiStride];  // row above the block; the cast only changes the pointer type handed to LD64
+  const uint64_t kuiT1 = LD64 (kpSrc);  // top[0..7]
+  const uint64_t kuiT2 = LD64 (kpSrc + 8);  // top[8..15]
+  uint8_t* pDst = pPred;
+
+  do {
+    ST64 (pDst  , kuiT1);
+    ST64 (pDst + 8, kuiT2);
+    pDst += 16;  // output rows are 16 bytes apart
+  } while (i-- > 0);  // post-decrement test: the body runs 16 times (i = 15..0)
+}
+
+void WelsI16x16LumaPredH_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {  // Horizontal 16x16: each 16-byte row of pPred is filled with the byte left of that row in pRef
+  int32_t iStridex15 = (kiStride << 4) - kiStride;  // source offset of row 15; rows are processed bottom-up
+  int32_t iPredStride = 16;
+  int32_t iPredStridex15 = 240;	// destination offset of row 15: (iPredStride<<4)-iPredStride
+  uint8_t i = 15;
+
+  do {
+    const uint8_t kuiSrc8	= pRef[iStridex15 - 1];  // left neighbour of the current row
+#ifdef _MSC_VER
+    const uint64_t kuiV64	= (uint64_t) (0x0101010101010101U * kuiSrc8);  // multiply replicates the byte into all 8 byte lanes
+#else
+    const uint64_t kuiV64	= (uint64_t) (0x0101010101010101ULL * kuiSrc8);  // unsigned literal: with LL the product overflowed signed 64-bit for kuiSrc8 >= 128
+#endif
+    ST64 (&pPred[iPredStridex15], kuiV64);  // left half of the row
+    ST64 (&pPred[iPredStridex15 + 8], kuiV64);  // right half of the row
+
+    iStridex15 -= kiStride;  // step both offsets up one row
+    iPredStridex15 -= iPredStride;
+  } while (i-- > 0);  // post-decrement test: the body runs 16 times (i = 15..0)
+}
+
+void WelsI16x16LumaPredPlane_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {  // Plane 16x16: fits a linear gradient to the top and left neighbours and writes 256 clipped bytes
+  int32_t iLTshift = 0, iTopshift = 0, iLeftshift = 0, iTopSum = 0, iLeftSum = 0;
+  int32_t i, j;
+  uint8_t* pTop = &pRef[-kiStride];  // row above the block
+  uint8_t* pLeft = &pRef[-1];  // column left of the block; rows are kiStride apart
+  int32_t iPredStride = 16;  // output rows are 16 bytes apart
+
+  for (i = 0 ; i < 8 ; i ++) {  // weighted differences of neighbour pairs placed symmetrically about the centre
+    iTopSum += (i + 1) * (pTop[8 + i] - pTop[6 - i]);  // at i == 7 the index 6 - i is -1, i.e. the top-left corner byte
+    iLeftSum += (i + 1) * (pLeft[ (8 + i) * kiStride] - pLeft[ (6 - i) * kiStride]);  // at i == 7 this also reaches the top-left corner
+  }
+
+  iLTshift = (pLeft[15 * kiStride] + pTop[15]) << 4;  // 16 * (last left neighbour + last top neighbour)
+  iTopshift = (5 * iTopSum + 32) >> 6;  // horizontal slope
+  iLeftshift = (5 * iLeftSum + 32) >> 6;  // vertical slope
+
+  for (i = 0 ; i < 16 ; i ++) {  // i = row, j = column
+    for (j = 0 ; j < 16 ; j ++) {
+      pPred[j] = (uint8_t)WELS_CLIP1 ((iLTshift + iTopshift * (j - 7) + iLeftshift * (i - 7) + 16) >> 5);  // WELS_CLIP1 presumably clamps to 0..255 -- confirm in macros.h
+    }
+    pPred += iPredStride;
+  }
+}
+
+void WelsI16x16LumaPredDc_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {  // DC 16x16: fill 256 bytes with the rounded mean of the 16 left and 16 top neighbours
+  int32_t iStridex15 = (kiStride << 4) - kiStride;  // offset of row 15; the loop walks upward from there
+  int32_t iSum = 0;
+  uint8_t i = 15;
+  uint8_t iMean = 0;
+
+  /* calculate the mean value */
+  do {
+    iSum += pRef[-1 + iStridex15] + pRef[-kiStride + i];  // left neighbour of row i plus top neighbour of column i
+    iStridex15 -= kiStride;
+  } while (i-- > 0);  // post-decrement test: the body runs 16 times (i = 15..0)
+  iMean = (16 + iSum) >> 5;  // rounded divide by 32
+  memset (pPred, iMean, 256);  // 16 rows of 16 bytes, contiguous
+}
+
+
+void WelsI16x16LumaPredDcTop_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {  // DC 16x16 from the row above only
+  int32_t iSum = 0;
+  uint8_t i = 15;
+  uint8_t iMean = 0;
+
+  /* calculate the mean value */
+  do {
+    iSum += pRef[-kiStride + i];  // top neighbour of column i
+  } while (i-- > 0);  // post-decrement test: the body runs 16 times (i = 15..0)
+  iMean = (8 + iSum) >> 4;  // rounded divide by 16
+  memset (pPred, iMean, 256);  // fill all 256 output bytes
+}
+
+void WelsI16x16LumaPredDcLeft_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {  // DC 16x16 from the left column only
+  int32_t iStridex15 = (kiStride << 4) - kiStride;  // offset of row 15; the loop walks upward from there
+  int32_t iSum = 0;
+  uint8_t i = 15;
+  uint8_t iMean = 0;
+
+  /* calculate the mean value */
+  do {
+    iSum += pRef[-1 + iStridex15];  // left neighbour of the current row
+    iStridex15 -= kiStride;
+  } while (i-- > 0);  // post-decrement test: the body runs 16 times (i = 15..0)
+  iMean = (8 + iSum) >> 4;  // rounded divide by 16
+  memset (pPred, iMean, 256);  // fill all 256 output bytes
+}
+
+void WelsI16x16LumaPredDcNA_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {  // DC 16x16 without neighbours: pRef and kiStride are not read
+  memset (pPred, 0x80, 256);  // all 256 output bytes become 0x80
+}
+
+void WelsInitIntraPredFuncs (SWelsFuncPtrList* pFuncList, const uint32_t kuiCpuFlag) {  // Fill the intra-prediction tables of pFuncList: C versions first, then x86 overrides selected by kuiCpuFlag
+  pFuncList->pfGetLumaI16x16Pred[I16_PRED_V] =      WelsI16x16LumaPredV_c;  // 16x16 luma modes
+  pFuncList->pfGetLumaI16x16Pred[I16_PRED_H] =      WelsI16x16LumaPredH_c;
+  pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC] =     WelsI16x16LumaPredDc_c;
+  pFuncList->pfGetLumaI16x16Pred[I16_PRED_P] =      WelsI16x16LumaPredPlane_c;
+  pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC_L] =   WelsI16x16LumaPredDcLeft_c;
+  pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC_T] =   WelsI16x16LumaPredDcTop_c;
+  pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC_128] = WelsI16x16LumaPredDcNA_c;
+
+  pFuncList->pfGetLumaI4x4Pred[I4_PRED_V] = WelsI4x4LumaPredV_c;  // 4x4 luma modes; these C versions call the WelsFillingPred* pointers set by WelsInitFillingPredFuncs
+  pFuncList->pfGetLumaI4x4Pred[I4_PRED_H] = WelsI4x4LumaPredH_c;
+  pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC] = WelsI4x4LumaPredDc_c;
+  pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC_L] = WelsI4x4LumaPredDcLeft_c;
+  pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC_T] = WelsI4x4LumaPredDcTop_c;
+  pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC_128] = WelsI4x4LumaPredDcNA_c;
+
+  pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDL] = WelsI4x4LumaPredDDL_c;
+  pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDL_TOP] = WelsI4x4LumaPredDDLTop_c;  // variant that reads only the 4 bytes directly above
+  pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDR] = WelsI4x4LumaPredDDR_c;
+
+  pFuncList->pfGetLumaI4x4Pred[I4_PRED_VL] = WelsI4x4LumaPredVL_c;
+  pFuncList->pfGetLumaI4x4Pred[I4_PRED_VL_TOP] = WelsI4x4LumaPredVLTop_c;  // variant that reads only the 4 bytes directly above
+  pFuncList->pfGetLumaI4x4Pred[I4_PRED_VR] = WelsI4x4LumaPredVR_c;
+  pFuncList->pfGetLumaI4x4Pred[I4_PRED_HU] = WelsI4x4LumaPredHU_c;
+  pFuncList->pfGetLumaI4x4Pred[I4_PRED_HD] = WelsI4x4LumaPredHD_c;
+
+  pFuncList->pfGetChromaPred[C_PRED_DC] = WelsIChormaPredDc_c;  // 8x8 chroma modes (the C functions are spelled "Chorma"; the asm ones below are spelled "Chroma")
+  pFuncList->pfGetChromaPred[C_PRED_H] = WelsIChormaPredH_c;
+  pFuncList->pfGetChromaPred[C_PRED_V] = WelsIChormaPredV_c;
+  pFuncList->pfGetChromaPred[C_PRED_P] = WelsIChormaPredPlane_c;
+  pFuncList->pfGetChromaPred[C_PRED_DC_L] = WelsIChormaPredDcLeft_c;
+  pFuncList->pfGetChromaPred[C_PRED_DC_T] = WelsIChormaPredDcTop_c;
+  pFuncList->pfGetChromaPred[C_PRED_DC_128] = WelsIChormaPredDcNA_c;
+#ifdef X86_ASM
+  if (kuiCpuFlag & WELS_CPU_MMXEXT) {  // MMXEXT bit set: override six 4x4 luma modes and chroma H
+    pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDR] = WelsI4x4LumaPredDDR_mmx;
+    pFuncList->pfGetLumaI4x4Pred[I4_PRED_HD]  = WelsI4x4LumaPredHD_mmx;
+    pFuncList->pfGetLumaI4x4Pred[I4_PRED_HU]  = WelsI4x4LumaPredHU_mmx;
+    pFuncList->pfGetLumaI4x4Pred[I4_PRED_VR]  = WelsI4x4LumaPredVR_mmx;
+    pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDL] = WelsI4x4LumaPredDDL_mmx;
+    pFuncList->pfGetLumaI4x4Pred[I4_PRED_VL]  = WelsI4x4LumaPredVL_mmx;
+    pFuncList->pfGetChromaPred[C_PRED_H] = WelsIChromaPredH_mmx;
+  }
+  if (kuiCpuFlag & WELS_CPU_SSE2) {  // SSE2 bit set: override a different set of entries (none overlap the MMXEXT ones above)
+    pFuncList->pfGetLumaI4x4Pred[I4_PRED_H] = WelsI4x4LumaPredH_sse2;
+    pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC] = WelsI4x4LumaPredDc_sse2;
+    pFuncList->pfGetLumaI4x4Pred[I4_PRED_V] = WelsI4x4LumaPredV_sse2;
+
+    pFuncList->pfGetLumaI16x16Pred[I16_PRED_V] = WelsI16x16LumaPredV_sse2;
+    pFuncList->pfGetLumaI16x16Pred[I16_PRED_H] = WelsI16x16LumaPredH_sse2;
+    pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC] = WelsI16x16LumaPredDc_sse2;
+    pFuncList->pfGetLumaI16x16Pred[I16_PRED_P] = WelsI16x16LumaPredPlane_sse2;
+
+    pFuncList->pfGetChromaPred[C_PRED_DC]	= WelsIChromaPredDc_sse2;
+    pFuncList->pfGetChromaPred[C_PRED_V]	= WelsIChromaPredV_sse2;
+    pFuncList->pfGetChromaPred[C_PRED_P]	= WelsIChromaPredPlane_sse2;
+  }
+#endif
+}
+}
--- a/codec/encoder/core/src/mc.cpp
+++ b/codec/encoder/core/src/mc.cpp
@@ -1,595 +1,549 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	mc.c
- *
- * \brief	Interfaces implementation for motion compensation
- *
- * \date	03/17/2009 Created
- *
- *************************************************************************************
- */
-
-#include <string.h>
-#include "as264_common.h"
-#include "typedefs.h"
-#include "wels_const.h"
-#include "macros.h"
-#include "mc.h"
-#include "sample.h"
-#include "cpu_core.h"
-#include "array_stack_align.h"
-
-namespace WelsSVCEnc {
-/*------------------weight for chroma fraction pixel interpolation------------------*/
-//kuiA = (8 - dx) * (8 - dy);   
-//kuiB = dx * (8 - dy);   
-//kuiC = (8 - dx) * dy;
-//kuiD = dx * dy
-static const uint8_t g_kuiABCD[8][8][4] = ////g_kuiA[dy][dx], g_kuiB[dy][dx], g_kuiC[dy][dx], g_kuiD[dy][dx]
-{
-	{	
-		{64, 0, 0, 0},{56, 8, 0, 0},{48, 16, 0, 0},{40, 24, 0, 0},
-		{32, 32, 0, 0},{24, 40, 0, 0},{16, 48, 0, 0},{8, 56, 0, 0}
-	},
-	{	
-		{56, 0, 8, 0},{49, 7, 7, 1},{42, 14, 6, 2},{35, 21, 5, 3},
-		{28, 28, 4, 4},{21, 35, 3, 5},{14, 42, 2, 6},{7, 49, 1, 7}
-	},
-	{	
-		{48, 0, 16, 0},{42, 6, 14, 2},{36, 12, 12, 4},{30, 18, 10, 6},
-		{24, 24, 8, 8},{18, 30, 6, 10},{12, 36, 4, 12},{6, 42, 2, 14}
-	},
-	{	
-		{40, 0, 24, 0},{35, 5, 21, 3},{30, 10, 18, 6},{25, 15, 15, 9},
-		{20, 20, 12, 12},{15, 25, 9, 15},{10, 30, 6, 18},{5, 35, 3, 21}
-	},
-	{	
-		{32, 0, 32, 0},{28, 4, 28, 4},{24, 8, 24, 8},{20, 12, 20, 12},
-		{16, 16, 16, 16},{12, 20, 12, 20},{8, 24, 8, 24},{4, 28, 4, 28}
-	},
-	{	
-		{24, 0, 40, 0},{21, 3, 35, 5},{18, 6, 30, 10},{15, 9, 25, 15},
-		{12, 12, 20, 20},{9, 15, 15, 25},{6, 18, 10, 30},{3, 21, 5, 35}
-	},
-	{	
-		{16, 0, 48, 0},{14, 2, 42, 6},{12, 4, 36, 12},{10, 6, 30, 18},
-		{8, 8, 24, 24},{6, 10, 18, 30},{4, 12, 12, 36},{2, 14, 6, 42}
-	},
-	{	
-		{8, 0, 56, 0},{7, 1, 49, 7},{6, 2, 42, 14},{5, 3, 35, 21},
-		{4, 4, 28, 28},{3, 5, 21, 35},{2, 6, 14, 42},{1, 7, 7, 49}
-	}
-};
-typedef int32_t (*VerFilterFunc)(uint8_t* pSrc, const int32_t kiSrcStride);
-typedef int32_t (*HorFilterFunc)(uint8_t* pSrc);
-typedef int32_t (*HorFilterFuncInput16Bits)(int16_t* pSrc);
-
-VerFilterFunc fpVerFilter			= NULL;
-HorFilterFunc fpHorFilter			= NULL;
-HorFilterFuncInput16Bits fpHorFilterInput16Bits = NULL;
-
-typedef void (*WelsMcFunc0) (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,int32_t iHeight);
-typedef void (*WelsMcFunc1) (uint8_t* pDst, int32_t iDstStride, uint8_t* psrcA, int32_t iSrcAStride,  uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
-WelsMcFunc0 McCopyWidthEq16 = NULL;
-WelsMcFunc0 McCopyWidthEq8 = NULL;
-WelsMcFunc0 McCopyWidthEq4 = NULL;
-WelsMcFunc0 pfMcHorVer02WidthEq16 = NULL;
-WelsMcFunc1 pfPixelAvgWidthEq16  = NULL;
-WelsMcFunc0 pfMcHorVer20WidthEq16 = NULL;
-WelsMcFunc0 pfMcHorVer22WidthEq16 = NULL;
-
-//***************************************************************************//
-//                          C code implementation                            //
-//***************************************************************************//
-static inline void McCopyWidthEq4_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
-{
-	int32_t i;
-	for (i = 0; i < iHeight; i++)
-	{
-		memcpy(pDst, pSrc, 4);	// confirmed_safe_unsafe_usage
-		pDst += iDstStride;
-		pSrc += iSrcStride;
-	}
-}
-
-static inline void McCopyWidthEq8_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
-
-{
-	int32_t i;
-	for (i = 0; i < iHeight; i++)
-	{
-		memcpy(pDst, pSrc, 8);	// confirmed_safe_unsafe_usage
-		pDst += iDstStride;
-		pSrc += iSrcStride;
-	}
-}
-static inline void McCopyWidthEq16_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
-{
-	int32_t i;
-	for (i = 0; i < iHeight; i++)
-	{
-		memcpy(pDst, pSrc, 16);	// confirmed_safe_unsafe_usage
-		pDst += iDstStride;
-		pSrc += iSrcStride;
-	}
-}
-
-//--------------------Luma sample MC------------------//
-static inline int32_t HorFilter_c(uint8_t* pSrc)
-{
-	int32_t iPix05 = pSrc[-2] + pSrc[3];
-	int32_t iPix14 = pSrc[-1] + pSrc[2];
-	int32_t iPix23 = pSrc[ 0] + pSrc[1];
-
-	return (iPix05 - ((iPix14<<2)+iPix14) + (iPix23<<4) + (iPix23<<2));
-}
-
-static inline int32_t HorFilterInput16bit1_c(int16_t* pSrc)
-{
-	int32_t iPix05 = pSrc[-2] + pSrc[3];
-	int32_t iPix14 = pSrc[-1] + pSrc[2];
-	int32_t iPix23 = pSrc[ 0] + pSrc[1];
-	
-	return (iPix05 - ((iPix14<<2)+iPix14) + (iPix23<<4) + (iPix23<<2));
-}	
-static inline int32_t VerFilter_c(uint8_t* pSrc, const int32_t kiSrcStride)
-{
-	const int32_t kiLine1	= kiSrcStride;
-	const int32_t kiLine2	= (kiSrcStride<<1);
-	const int32_t kiLine3 = kiLine1 + kiLine2;
-	const uint32_t kuiPix05= *(pSrc - kiLine2) + *(pSrc + kiLine3);
-	const uint32_t kuiPix14= *(pSrc - kiLine1) + *(pSrc + kiLine2);
-	const uint32_t kuiPix23= *(pSrc        ) + *(pSrc + kiLine1);
-
-	return (kuiPix05 - ((kuiPix14<<2)+kuiPix14) + (kuiPix23<<4) + (kuiPix23<<2));
-}
-
-static inline void PixelAvgWidthEq8_c(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, int32_t iSrcAStride,
-								uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight)
-{
-	int32_t i, j;
-	for (i = 0; i < iHeight; i++)
-	{
-		for (j = 0; j < 8; j++) 
-		{
-			pDst[j] = (pSrcA[j] + pSrcB[j] + 1) >> 1;
-		}
-		pDst  += iDstStride;
-		pSrcA += iSrcAStride;
-		pSrcB += iSrcBStride;
-	}
-}
-static inline void PixelAvgWidthEq16_c(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, int32_t iSrcAStride,
-								 uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight)
-{
-	int32_t i, j;
-	for (i = 0; i < iHeight; i++)
-	{
-		for (j = 0; j < 16; j++) 
-		{
-			pDst[j] = (pSrcA[j] + pSrcB[j] + 1) >> 1;
-		}
-		pDst  += iDstStride;
-		pSrcA += iSrcAStride;
-		pSrcB += iSrcBStride;
-	}
-}
-
-//horizontal filter to gain half sample, that is (2, 0) location in quarter sample
-static inline void McHorVer20WidthEq16_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
-{
-	int32_t i, j;
-	for (i = 0; i < iHeight; i++) 
-	{
-		for (j = 0; j < 16; j++)
-		{
-			pDst[j] = WELS_CLIP1((fpHorFilter(pSrc+j)+16)>>5);
-		}
-		pDst += iDstStride;
-		pSrc += iSrcStride;
-	}
-}
-//vertical filter to gain half sample, that is (0, 2) location in quarter sample
-static inline void McHorVer02WidthEq16_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
-{
-	int32_t i, j;
-	for (i = 0; i < iHeight; i++)
-	{
-		for (j = 0; j < 16; j++) 
-		{
-			pDst[j] = WELS_CLIP1((fpVerFilter(pSrc+j, iSrcStride)+16)>>5);
-		}
-		pDst += iDstStride;
-		pSrc += iSrcStride;
-	}
-}
-//horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
-static inline void McHorVer22WidthEq16_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
-{
-	int16_t pTmp[16+5] = {0}; //16
-	int32_t i, j, k;
-
-	for (i = 0; i < iHeight; i++)
-	{
-		for (j = 0; j < 16 + 5; j++)
-		{
-			pTmp[j] = fpVerFilter(pSrc-2+j, iSrcStride);
-		}
-		for (k = 0; k < 16; k++)
-		{
-			pDst[k] = WELS_CLIP1((fpHorFilterInput16Bits(&pTmp[2+k])+512)>>10);
-		}		
-		pSrc += iSrcStride;
-		pDst += iDstStride;
-	}
-}
-
-/////////////////////luma MC////////////////////////// 
-
-static inline void McHorVer01WidthEq16(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
-{
-	ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 256, 16)
-
-	pfMcHorVer02WidthEq16(pSrc, iSrcStride, pTmp, 16, iHeight);	
-	pfPixelAvgWidthEq16(pDst, iDstStride, pSrc, iSrcStride, pTmp, 16,iHeight);
-}
-static inline void McHorVer03WidthEq16(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
-{
-	ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 256, 16)
-	
-	pfMcHorVer02WidthEq16(pSrc, iSrcStride, pTmp, 16, iHeight);
-	pfPixelAvgWidthEq16(pDst, iDstStride, pSrc+iSrcStride, iSrcStride, pTmp, 16,iHeight);
-}
-static inline void McHorVer10WidthEq16(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
-{
-	ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 256, 16)
-
-	pfMcHorVer20WidthEq16(pSrc, iSrcStride, pTmp, 16, iHeight);
-	pfPixelAvgWidthEq16(pDst, iDstStride, pSrc, iSrcStride, pTmp, 16,iHeight);
-}
-static inline void McHorVer11WidthEq16(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
-{
-	ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)	
-	
-	pfMcHorVer20WidthEq16(pSrc, iSrcStride, pTmp, 16, iHeight);
-	pfMcHorVer02WidthEq16(pSrc, iSrcStride, &pTmp[256], 16,iHeight);
-	pfPixelAvgWidthEq16(pDst, iDstStride, pTmp, 16, &pTmp[256], 16,iHeight);
-}
-static inline void McHorVer12WidthEq16(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
-{
-	ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)	
-
-	pfMcHorVer02WidthEq16(pSrc, iSrcStride, pTmp, 16,iHeight);
-	pfMcHorVer22WidthEq16(pSrc, iSrcStride, &pTmp[256], 16,iHeight);
-	pfPixelAvgWidthEq16(pDst, iDstStride, pTmp, 16, &pTmp[256], 16,iHeight);
-}
-static inline void McHorVer13WidthEq16(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
-{
-	ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)	
-
-	pfMcHorVer20WidthEq16(pSrc+iSrcStride, iSrcStride, pTmp, 16,iHeight);
-	pfMcHorVer02WidthEq16(pSrc, iSrcStride, &pTmp[256], 16,iHeight);
-	pfPixelAvgWidthEq16(pDst, iDstStride, pTmp, 16, &pTmp[256], 16,iHeight);
-}
-static inline void McHorVer21WidthEq16(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
-{
-	ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)	
-
-	pfMcHorVer20WidthEq16(pSrc, iSrcStride, pTmp, 16,iHeight);
-	pfMcHorVer22WidthEq16(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-	pfPixelAvgWidthEq16(pDst, iDstStride,pTmp, 16, &pTmp[256], 16,iHeight);
-}
-static inline void McHorVer23WidthEq16(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
-{
-	ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
-
-	pfMcHorVer20WidthEq16(pSrc+iSrcStride, iSrcStride, pTmp, 16,iHeight);
-	pfMcHorVer22WidthEq16(pSrc, iSrcStride, &pTmp[256], 16,iHeight);
-	pfPixelAvgWidthEq16(pDst, iDstStride, pTmp, 16, &pTmp[256], 16,iHeight);
-}
-static inline void McHorVer30WidthEq16(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
-{
-	ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 256, 16)
-
-	pfMcHorVer20WidthEq16(pSrc, iSrcStride, pTmp, 16,iHeight);
-	pfPixelAvgWidthEq16(pDst, iDstStride, pSrc+1, iSrcStride, pTmp, 16,iHeight);
-}
-static inline void McHorVer31WidthEq16(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
-{
-	ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
-
-	pfMcHorVer20WidthEq16(pSrc, iSrcStride, pTmp, 16,iHeight);
-	pfMcHorVer02WidthEq16(pSrc+1, iSrcStride, &pTmp[256], 16,iHeight);
-	pfPixelAvgWidthEq16(pDst, iDstStride, pTmp, 16, &pTmp[256], 16,iHeight);
-}
-static inline void McHorVer32WidthEq16(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
-{
-	ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
-
-	pfMcHorVer02WidthEq16(pSrc+1, iSrcStride, pTmp, 16,iHeight);
-	pfMcHorVer22WidthEq16(pSrc, iSrcStride, &pTmp[256], 16,iHeight);
-	pfPixelAvgWidthEq16(pDst, iDstStride, pTmp, 16, &pTmp[256], 16,iHeight);
-}
-static inline void McHorVer33WidthEq16(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
-{
-	ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
-
-	pfMcHorVer20WidthEq16(pSrc+iSrcStride, iSrcStride, pTmp, 16,iHeight);
-	pfMcHorVer02WidthEq16(pSrc+1, iSrcStride, &pTmp[256], 16,iHeight);
-	pfPixelAvgWidthEq16(pDst, iDstStride, pTmp, 16, &pTmp[256], 16,iHeight);
-}
-
-static inline void McHorVer20_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	int32_t i, j;
-	for (i = 0; i < iHeight; i++) 
-	{
-		for (j = 0; j < iWidth; j++)
-		{
-			pDst[j] = WELS_CLIP1((fpHorFilter(pSrc+j)+16)>>5);
-		}
-		pDst += iDstStride;
-		pSrc += iSrcStride;
-	}
-}
-//vertical filter to gain half sample, that is (0, 2) location in quarter sample
-static inline void McHorVer02_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	int32_t i, j;
-	for (i = 0; i < iHeight; i++)
-	{
-		for (j = 0; j < iWidth; j++) 
-		{
-			pDst[j] = WELS_CLIP1((fpVerFilter(pSrc+j, iSrcStride)+16)>>5);
-		}
-		pDst += iDstStride;
-		pSrc += iSrcStride;
-	}
-}
-//horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
-static inline void McHorVer22_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	int16_t pTmp[17+5] = {0}; //w+1
-	int32_t i, j, k;
-
-	for (i = 0; i < iHeight; i++)
-	{
-		for (j = 0; j < iWidth + 5; j++)
-		{
-			pTmp[j] = fpVerFilter(pSrc-2+j, iSrcStride);
-		}
-		for (k = 0; k < iWidth; k++)
-		{
-			pDst[k] = WELS_CLIP1((fpHorFilterInput16Bits(&pTmp[2+k])+512)>>10);
-		}		
-		pSrc += iSrcStride;
-		pDst += iDstStride;
-	}
-}
-static inline void McCopy(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
-{
-	int32_t i;
-	if (iWidth == 16 && McCopyWidthEq16!= NULL)
-		McCopyWidthEq16(pSrc,iSrcStride,pDst,iDstStride,iHeight);
-	else if(iWidth == 8 &&McCopyWidthEq8!= NULL)
-		McCopyWidthEq8(pSrc,iSrcStride,pDst,iDstStride,iHeight);
-	else if(iWidth == 4 &&McCopyWidthEq4!= NULL)
-		McCopyWidthEq4(pSrc,iSrcStride,pDst,iDstStride,iHeight);	
-	else
-	{
-		for (i = 0; i < iHeight; i++)
-		{
-			memcpy(pDst, pSrc, iWidth);	// confirmed_safe_unsafe_usage
-			pDst += iDstStride;
-			pSrc += iSrcStride;
-		}				
-	}
-}
-
-void McChroma_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-			        SMVUnitXY mv, int32_t iWidth, int32_t iHeight)
-					//pSrc has been added the offset of mv
-{
-	const int32_t kiDx = mv.iMvX & 0x07;
-	const int32_t kiDy = mv.iMvY & 0x07;
-
-	if ( 0 == kiDx && 0 == kiDy )
-	{
-		McCopy(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
-	}
-	else
-	{
-		const int32_t kiDA = g_kuiABCD[kiDy][kiDx][0];
-		const int32_t kiDB = g_kuiABCD[kiDy][kiDx][1];
-		const int32_t kiDC = g_kuiABCD[kiDy][kiDx][2];
-		const int32_t kiDD = g_kuiABCD[kiDy][kiDx][3];
-
-		int32_t i, j;
-
-		uint8_t* pSrcNext = pSrc + iSrcStride;
-
-		for (i = 0; i < iHeight; i++)
-		{
-			for (j = 0; j < iWidth; j++)
-			{
-				pDst[j] = (kiDA * pSrc[j] + kiDB * pSrc[j+1] + kiDC * pSrcNext[j] + kiDD * pSrcNext[j+1] + 32) >> 6;
-			}
-			pDst += iDstStride;
-			pSrc = pSrcNext;
-			pSrcNext += iSrcStride;
-		}
-	}	
-}
-//***************************************************************************//
-//                       MMXEXT and SSE2 implementation                      //
-//***************************************************************************//
-#if defined(X86_ASM)
-
-static inline void McHorVer22WidthEq8_sse2 ( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight )
-{	
-	ENFORCE_STACK_ALIGN_2D(int16_t, pTap, 21, 8, 16)
-	McHorVer22Width8HorFirst_sse2(pSrc-2, iSrcStride, (uint8_t *)pTap,16,iHeight+5);
-	McHorVer22VerLastAlign_sse2((uint8_t *)pTap,16, pDst, iDstStride, 8, iHeight);
-}
-
-//2010.2.5
-
-static inline void McHorVer02WidthEq16_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *PDst, int32_t iDstStride, int32_t iHeight )
-{
-	McHorVer02WidthEq8_sse2( pSrc,     iSrcStride, PDst,     iDstStride, iHeight );
-    McHorVer02WidthEq8_sse2( &pSrc[8], iSrcStride, &PDst[8], iDstStride, iHeight );
-}
-static inline void McHorVer22WidthEq16_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight )
-{
-    McHorVer22WidthEq8_sse2( pSrc,     iSrcStride, pDst,     iDstStride, iHeight );
-    McHorVer22WidthEq8_sse2( &pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight );
-}
-void McHorVer22_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iWidth,int32_t iHeight )
-{
-	ENFORCE_STACK_ALIGN_2D(int16_t, pTap, 22, 24, 16)
-	int32_t tmp1 = 2*(iWidth-8);
-	McHorVer22HorFirst_sse2(pSrc-2, iSrcStride, (uint8_t *)pTap,48,iWidth,iHeight+5);
-	McHorVer22VerLastAlign_sse2((uint8_t *)pTap,  48, pDst, iDstStride, iWidth-1, iHeight);
-	McHorVer22VerLastUnAlign_sse2((uint8_t *)pTap+tmp1,  48, pDst+iWidth-8, iDstStride, 8, iHeight);
-}
-
-typedef void (*McChromaWidthEqx)(uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, const uint8_t *pABCD,int32_t iHeigh);
-void McChroma_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride,
-					SMVUnitXY sMv, int32_t iWidth, int32_t iHeight )
-{
-	const int32_t kiD8x = sMv.iMvX&0x07;
-	const int32_t kiD8y = sMv.iMvY&0x07;
-	static const McChromaWidthEqx kpfFuncs[2] =
-	{
-		McChromaWidthEq4_mmx,
-		McChromaWidthEq8_sse2
-	};
-
-	if (0 == kiD8x && 0 == kiD8y)
-	{
-		McCopy(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
-	}
-	else
-	{
-		kpfFuncs[(iWidth>>3)](pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiD8y][kiD8x], iHeight);
-	}
-}
-
-void McChroma_ssse3( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride,
-					 SMVUnitXY sMv, int32_t iWidth, int32_t iHeight )
-{
-	const int32_t kiD8x = sMv.iMvX&0x07;
-	const int32_t kiD8y = sMv.iMvY&0x07;
-
-	static const McChromaWidthEqx kpfFuncs[2] = 
-	{
-		McChromaWidthEq4_mmx,
-		McChromaWidthEq8_ssse3
-	};
-	if (0 == kiD8x && 0 == kiD8y)
-	{
-		McCopy(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
-	}
-	else
-	{
-		kpfFuncs[(iWidth>>3)](pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiD8y][kiD8x], iHeight);
-	}
-	
-}
-
-#endif //X86_ASM
-typedef void (*PixelAvgFunc) ( uint8_t *, int32_t, uint8_t *, int32_t, uint8_t *, int32_t, int32_t );
-void WelsInitMcFuncs( SWelsFuncPtrList *pFuncList, uint32_t uiCpuFlag )
-{
-	static PixelAvgFunc pfPixAvgFunc[2] ={PixelAvgWidthEq8_c,PixelAvgWidthEq16_c};
-
-	static PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16[16] =  //[y*4+x]   
-	{
-		McCopyWidthEq16_c,  McHorVer10WidthEq16, McHorVer20WidthEq16_c,     McHorVer30WidthEq16,   
-		McHorVer01WidthEq16, McHorVer11WidthEq16, McHorVer21WidthEq16, McHorVer31WidthEq16, 
-		McHorVer02WidthEq16_c,     McHorVer12WidthEq16, McHorVer22WidthEq16_c,    McHorVer32WidthEq16,     
-		McHorVer03WidthEq16, McHorVer13WidthEq16, McHorVer23WidthEq16, McHorVer33WidthEq16
-	};
-#if defined (X86_ASM)
-	static PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16_sse2[16] = 
-	{
-		McCopyWidthEq16_sse2,  McHorVer10WidthEq16, McHorVer20WidthEq16_sse2,     McHorVer30WidthEq16,   
-		McHorVer01WidthEq16, McHorVer11WidthEq16, McHorVer21WidthEq16, McHorVer31WidthEq16, 
-		McHorVer02WidthEq16_sse2,     McHorVer12WidthEq16, McHorVer22WidthEq16_sse2,    McHorVer32WidthEq16,     
-		McHorVer03WidthEq16, McHorVer13WidthEq16, McHorVer23WidthEq16, McHorVer33WidthEq16
-	};
-#endif
-
-	pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20_c;
-	pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02_c;
-	pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22_c;
-	pFuncList->sMcFuncs.pfSampleAveraging = pfPixAvgFunc;
-	pFuncList->sMcFuncs.pfChromaMc	= McChroma_c;
-	fpVerFilter				= VerFilter_c;	
-	fpHorFilter				= HorFilter_c;
-	fpHorFilterInput16Bits			= HorFilterInput16bit1_c;
-	McCopyWidthEq4 = McCopyWidthEq4_c;
-	McCopyWidthEq8 = McCopyWidthEq8_c;
-	McCopyWidthEq16 = McCopyWidthEq16_c;
-	pfPixelAvgWidthEq16 = PixelAvgWidthEq16_c;
-	pfMcHorVer02WidthEq16 = McHorVer02WidthEq16_c;
-	pfMcHorVer20WidthEq16 = McHorVer20WidthEq16_c;
-	pfMcHorVer22WidthEq16 = McHorVer22WidthEq16_c;
-	pFuncList->sMcFuncs.pfLumaQuarpelMc = pWelsMcFuncWidthEq16;
-#if defined (X86_ASM)
-	if ( uiCpuFlag & WELS_CPU_SSE2 )
-	{
-		pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20_sse2;
-		pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02_sse2;
-		pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22_sse2;
-		pFuncList->sMcFuncs.pfSampleAveraging[0] = PixelAvgWidthEq8_mmx;
-		pFuncList->sMcFuncs.pfSampleAveraging[1] = PixelAvgWidthEq16_sse2;
-		pFuncList->sMcFuncs.pfChromaMc = McChroma_sse2;
-		McCopyWidthEq4 = McCopyWidthEq4_mmx;
-		McCopyWidthEq8 = McCopyWidthEq8_mmx;
-		McCopyWidthEq16 = McCopyWidthEq16_sse2;
-		pfPixelAvgWidthEq16 = PixelAvgWidthEq16_sse2;
-		pfMcHorVer02WidthEq16 = McHorVer02WidthEq16_sse2;
-		pfMcHorVer20WidthEq16 = McHorVer20WidthEq16_sse2;
-		pfMcHorVer22WidthEq16 = McHorVer22WidthEq16_sse2;		
-		pFuncList->sMcFuncs.pfLumaQuarpelMc = pWelsMcFuncWidthEq16_sse2;
-	}
-
-	if ( uiCpuFlag & WELS_CPU_SSSE3 )
-	{
-		pFuncList->sMcFuncs.pfChromaMc = McChroma_ssse3;
-		pFuncList->sMcFuncs.pfSampleAveraging[1] = PixelAvgWidthEq16_ssse3;
-	}
-
-#endif //(X86_ASM)
-}
-}
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	mc.cpp
+ *
+ * \brief	Interfaces implementation for motion compensation
+ *
+ * \date	03/17/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#include <string.h>
+#include "as264_common.h"
+#include "typedefs.h"
+#include "wels_const.h"
+#include "macros.h"
+#include "mc.h"
+#include "sample.h"
+#include "cpu_core.h"
+#include "array_stack_align.h"
+
+namespace WelsSVCEnc {
+/*------- weights for chroma fractional-sample (bilinear) interpolation, indexed [dy][dx] -------*/
+//kuiA = (8 - dx) * (8 - dy);   dx and dy each run over 0..7 (the table is 8 x 8)
+//kuiB = dx * (8 - dy);
+//kuiC = (8 - dx) * dy;
+//kuiD = dx * dy                the four weights of every entry sum to 64
+static const uint8_t g_kuiABCD[8][8][4] = { // entry [dy][dx] holds {kuiA, kuiB, kuiC, kuiD}
+  { // dy = 0
+    {64, 0, 0, 0}, {56, 8, 0, 0}, {48, 16, 0, 0}, {40, 24, 0, 0},
+    {32, 32, 0, 0}, {24, 40, 0, 0}, {16, 48, 0, 0}, {8, 56, 0, 0}
+  },
+  { // dy = 1
+    {56, 0, 8, 0}, {49, 7, 7, 1}, {42, 14, 6, 2}, {35, 21, 5, 3},
+    {28, 28, 4, 4}, {21, 35, 3, 5}, {14, 42, 2, 6}, {7, 49, 1, 7}
+  },
+  { // dy = 2
+    {48, 0, 16, 0}, {42, 6, 14, 2}, {36, 12, 12, 4}, {30, 18, 10, 6},
+    {24, 24, 8, 8}, {18, 30, 6, 10}, {12, 36, 4, 12}, {6, 42, 2, 14}
+  },
+  { // dy = 3
+    {40, 0, 24, 0}, {35, 5, 21, 3}, {30, 10, 18, 6}, {25, 15, 15, 9},
+    {20, 20, 12, 12}, {15, 25, 9, 15}, {10, 30, 6, 18}, {5, 35, 3, 21}
+  },
+  { // dy = 4
+    {32, 0, 32, 0}, {28, 4, 28, 4}, {24, 8, 24, 8}, {20, 12, 20, 12},
+    {16, 16, 16, 16}, {12, 20, 12, 20}, {8, 24, 8, 24}, {4, 28, 4, 28}
+  },
+  { // dy = 5
+    {24, 0, 40, 0}, {21, 3, 35, 5}, {18, 6, 30, 10}, {15, 9, 25, 15},
+    {12, 12, 20, 20}, {9, 15, 15, 25}, {6, 18, 10, 30}, {3, 21, 5, 35}
+  },
+  { // dy = 6
+    {16, 0, 48, 0}, {14, 2, 42, 6}, {12, 4, 36, 12}, {10, 6, 30, 18},
+    {8, 8, 24, 24}, {6, 10, 18, 30}, {4, 12, 12, 36}, {2, 14, 6, 42}
+  },
+  { // dy = 7
+    {8, 0, 56, 0}, {7, 1, 49, 7}, {6, 2, 42, 14}, {5, 3, 35, 21},
+    {4, 4, 28, 28}, {3, 5, 21, 35}, {2, 6, 14, 42}, {1, 7, 7, 49}
+  }
+};
+typedef int32_t (*VerFilterFunc) (uint8_t* pSrc, const int32_t kiSrcStride); // column filter over 8-bit samples, e.g. VerFilter_c
+typedef int32_t (*HorFilterFunc) (uint8_t* pSrc); // row filter over 8-bit samples, e.g. HorFilter_c
+typedef int32_t (*HorFilterFuncInput16Bits) (int16_t* pSrc); // row filter over 16-bit intermediates, e.g. HorFilterInput16bit1_c
+
+VerFilterFunc fpVerFilter			= NULL; // called by McHorVer02WidthEq16_c and McHorVer22WidthEq16_c; NULL until assigned
+HorFilterFunc fpHorFilter			= NULL; // called by McHorVer20WidthEq16_c; NULL until assigned
+HorFilterFuncInput16Bits fpHorFilterInput16Bits = NULL; // called by McHorVer22WidthEq16_c; NULL until assigned
+
+typedef void (*WelsMcFunc0) (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); // one source -> destination
+typedef void (*WelsMcFunc1) (uint8_t* pDst, int32_t iDstStride, uint8_t* psrcA, int32_t iSrcAStride,  uint8_t* pSrcB,
+                             int32_t iSrcBStride, int32_t iHeight); // two sources -> destination (note: destination comes first)
+WelsMcFunc0 McCopyWidthEq16 = NULL; // NOTE(review): these pointers are presumably installed by the MC init routine - confirm
+WelsMcFunc0 McCopyWidthEq8 = NULL;
+WelsMcFunc0 McCopyWidthEq4 = NULL;
+WelsMcFunc0 pfMcHorVer02WidthEq16 = NULL; // used by the McHorVerXYWidthEq16 quarter-sample helpers below
+WelsMcFunc1 pfPixelAvgWidthEq16  = NULL; // used by the McHorVerXYWidthEq16 quarter-sample helpers below
+WelsMcFunc0 pfMcHorVer20WidthEq16 = NULL; // used by the McHorVerXYWidthEq16 quarter-sample helpers below
+WelsMcFunc0 pfMcHorVer22WidthEq16 = NULL; // used by the McHorVerXYWidthEq16 quarter-sample helpers below
+
+//***************************************************************************//
+//                          C code implementation                            //
+//***************************************************************************//
+static inline void McCopyWidthEq4_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                     int32_t iHeight) { // copies iHeight rows of 4 bytes each from pSrc to pDst
+  int32_t i;
+  for (i = 0; i < iHeight; i++) {
+    memcpy (pDst, pSrc, 4);	// confirmed_safe_unsafe_usage
+    pDst += iDstStride; // each pointer advances by its own stride, so the two buffers may have different pitches
+    pSrc += iSrcStride;
+  }
+}
+
+static inline void McCopyWidthEq8_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                     int32_t iHeight) // copies iHeight rows of 8 bytes each from pSrc to pDst
+
+{
+  int32_t i;
+  for (i = 0; i < iHeight; i++) {
+    memcpy (pDst, pSrc, 8);	// confirmed_safe_unsafe_usage
+    pDst += iDstStride; // each pointer advances by its own stride, so the two buffers may have different pitches
+    pSrc += iSrcStride;
+  }
+}
+static inline void McCopyWidthEq16_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight) { // copies iHeight rows of 16 bytes each from pSrc to pDst
+  int32_t i;
+  for (i = 0; i < iHeight; i++) {
+    memcpy (pDst, pSrc, 16);	// confirmed_safe_unsafe_usage
+    pDst += iDstStride; // each pointer advances by its own stride, so the two buffers may have different pitches
+    pSrc += iSrcStride;
+  }
+}
+
+//--------------------Luma sample MC------------------//
+static inline int32_t HorFilter_c (uint8_t* pSrc) { // 6-tap row filter (1, -5, 20, 20, -5, 1) over pSrc[-2 .. 3]
+  int32_t iPix05 = pSrc[-2] + pSrc[3]; // outer pair, weight 1
+  int32_t iPix14 = pSrc[-1] + pSrc[2]; // middle pair, weight -5
+  int32_t iPix23 = pSrc[ 0] + pSrc[1]; // inner pair, weight 20
+
+  return (iPix05 - ((iPix14 << 2) + iPix14) + (iPix23 << 4) + (iPix23 << 2)); // unnormalised sum (taps add up to 32)
+}
+
+static inline int32_t HorFilterInput16bit1_c (int16_t* pSrc) { // same 6-tap row filter as HorFilter_c, on 16-bit input
+  int32_t iPix05 = pSrc[-2] + pSrc[3]; // outer pair, weight 1
+  int32_t iPix14 = pSrc[-1] + pSrc[2]; // middle pair, weight -5
+  int32_t iPix23 = pSrc[ 0] + pSrc[1]; // inner pair, weight 20
+
+  return (iPix05 - ((iPix14 << 2) + iPix14) + (iPix23 << 4) + (iPix23 << 2)); // unnormalised sum (taps add up to 32)
+}
+static inline int32_t VerFilter_c (uint8_t* pSrc, const int32_t kiSrcStride) { // 6-tap column filter over rows -2 .. +3
+  const int32_t kiLine1	= kiSrcStride; // byte offset of the row one below pSrc
+  const int32_t kiLine2	= (kiSrcStride << 1); // two rows
+  const int32_t kiLine3 = kiLine1 + kiLine2; // three rows
+  const uint32_t kuiPix05 = * (pSrc - kiLine2) + * (pSrc + kiLine3); // outer pair, weight 1
+  const uint32_t kuiPix14 = * (pSrc - kiLine1) + * (pSrc + kiLine2); // middle pair, weight -5
+  const uint32_t kuiPix23 = * (pSrc) + * (pSrc + kiLine1); // inner pair, weight 20
+
+  return (kuiPix05 - ((kuiPix14 << 2) + kuiPix14) + (kuiPix23 << 4) + (kuiPix23 << 2)); // computed in uint32_t, then converted to the int32_t return type; unnormalised (taps add up to 32)
+}
+
+static inline void PixelAvgWidthEq8_c (uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, int32_t iSrcAStride,
+                                       uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight) { // pDst = rounded mean of pSrcA and pSrcB, 8 columns x iHeight rows
+  int32_t i, j;
+  for (i = 0; i < iHeight; i++) {
+    for (j = 0; j < 8; j++) {
+      pDst[j] = (pSrcA[j] + pSrcB[j] + 1) >> 1; // +1 makes ties round up
+    }
+    pDst  += iDstStride; // all three buffers carry independent strides
+    pSrcA += iSrcAStride;
+    pSrcB += iSrcBStride;
+  }
+}
+static inline void PixelAvgWidthEq16_c (uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, int32_t iSrcAStride,
+                                        uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight) { // pDst = rounded mean of pSrcA and pSrcB, 16 columns x iHeight rows
+  int32_t i, j;
+  for (i = 0; i < iHeight; i++) {
+    for (j = 0; j < 16; j++) {
+      pDst[j] = (pSrcA[j] + pSrcB[j] + 1) >> 1; // +1 makes ties round up
+    }
+    pDst  += iDstStride; // all three buffers carry independent strides
+    pSrcA += iSrcAStride;
+    pSrcB += iSrcBStride;
+  }
+}
+
+//horizontal filter producing the half sample at location (2, 0) in quarter-sample units; 16 columns x iHeight rows
+static inline void McHorVer20WidthEq16_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int32_t iHeight) { // fpHorFilter must be non-NULL; with HorFilter_c each output reads pSrc[j - 2 .. j + 3]
+  int32_t i, j;
+  for (i = 0; i < iHeight; i++) {
+    for (j = 0; j < 16; j++) {
+      pDst[j] = WELS_CLIP1 ((fpHorFilter (pSrc + j) + 16) >> 5); // +16, >> 5: rounded divide by 32; WELS_CLIP1 presumably clamps to 8 bits - confirm
+    }
+    pDst += iDstStride;
+    pSrc += iSrcStride;
+  }
+}
+//vertical filter producing the half sample at location (0, 2) in quarter-sample units; 16 columns x iHeight rows
+static inline void McHorVer02WidthEq16_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int32_t iHeight) { // fpVerFilter must be non-NULL; with VerFilter_c each output reads rows -2 .. +3 around pSrc
+  int32_t i, j;
+  for (i = 0; i < iHeight; i++) {
+    for (j = 0; j < 16; j++) {
+      pDst[j] = WELS_CLIP1 ((fpVerFilter (pSrc + j, iSrcStride) + 16) >> 5); // +16, >> 5: rounded divide by 32; WELS_CLIP1 presumably clamps to 8 bits - confirm
+    }
+    pDst += iDstStride;
+    pSrc += iSrcStride;
+  }
+}
+//vertical then horizontal filter producing the centre half sample at location (2, 2) in quarter-sample units; 16 columns
+static inline void McHorVer22WidthEq16_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int32_t iHeight) { // fpVerFilter and fpHorFilterInput16Bits must both be non-NULL
+  int16_t pTmp[16 + 5] = {0}; // 16 outputs + 5 extra columns consumed by the horizontal pass (which reads pTmp[k .. k + 5])
+  int32_t i, j, k;
+
+  for (i = 0; i < iHeight; i++) {
+    for (j = 0; j < 16 + 5; j++) {
+      pTmp[j] = fpVerFilter (pSrc - 2 + j, iSrcStride); // unnormalised column result for source columns -2 .. 18, stored as int16_t
+    }
+    for (k = 0; k < 16; k++) {
+      pDst[k] = WELS_CLIP1 ((fpHorFilterInput16Bits (&pTmp[2 + k]) + 512) >> 10); // +512, >> 10: rounded divide by 32 * 32
+    }
+    pSrc += iSrcStride;
+    pDst += iDstStride;
+  }
+}
+
+/////////////////////luma MC//////////////////////////
+
+static inline void McHorVer01WidthEq16 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iHeight) { // location (0, 1): mean of the integer sample and the vertical half sample
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16) // presumably declares a 16-byte-aligned 256-byte buffer pTmp - confirm; with stride 16 that fits iHeight <= 16
+
+  pfMcHorVer02WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight); // vertical half sample into pTmp, stride 16
+  pfPixelAvgWidthEq16 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+}
+static inline void McHorVer03WidthEq16 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iHeight) { // location (0, 3): mean of the next row's integer sample and the vertical half sample
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16) // presumably declares a 16-byte-aligned 256-byte buffer pTmp - confirm; with stride 16 that fits iHeight <= 16
+
+  pfMcHorVer02WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight); // vertical half sample into pTmp, stride 16
+  pfPixelAvgWidthEq16 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); // pSrc + iSrcStride: one row down
+}
+static inline void McHorVer10WidthEq16 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iHeight) { // quarter-sample location (1, 0) per this file's HorVer<x><y> naming, 16 pixels wide
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16) // scratch buffer pTmp, used below with a row stride of 16 (presumably 256 bytes, 16-byte aligned)
+
+  pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight); // horizontal half-sample (2, 0) plane into pTmp
+  pfPixelAvgWidthEq16 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight); // combine the source samples at pSrc with that plane into pDst (presumably a rounded average)
+}
+static inline void McHorVer11WidthEq16 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iHeight) { // quarter-sample location (1, 1) per this file's HorVer<x><y> naming, 16 pixels wide
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) // scratch buffer pTmp holding two planes: [0..255] and [256..511], each with row stride 16
+
+  pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight); // horizontal half-sample (2, 0) plane into the first half
+  pfMcHorVer02WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight); // vertical half-sample (0, 2) plane into the second half
+  pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); // combine the two planes into pDst (presumably a rounded average)
+}
+static inline void McHorVer12WidthEq16 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iHeight) { // quarter-sample location (1, 2) per this file's HorVer<x><y> naming, 16 pixels wide
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) // scratch buffer pTmp holding two planes: [0..255] and [256..511], each with row stride 16
+
+  pfMcHorVer02WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight); // vertical half-sample (0, 2) plane into the first half
+  pfMcHorVer22WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight); // centre half-sample (2, 2) plane into the second half
+  pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); // combine the two planes into pDst (presumably a rounded average)
+}
+static inline void McHorVer13WidthEq16 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iHeight) { // quarter-sample location (1, 3) per this file's HorVer<x><y> naming, 16 pixels wide
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) // scratch buffer pTmp holding two planes: [0..255] and [256..511], each with row stride 16
+
+  pfMcHorVer20WidthEq16 (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); // horizontal half-sample (2, 0) plane taken one row below pSrc
+  pfMcHorVer02WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight); // vertical half-sample (0, 2) plane into the second half
+  pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); // combine the two planes into pDst (presumably a rounded average)
+}
+static inline void McHorVer21WidthEq16 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iHeight) { // quarter-sample location (2, 1) per this file's HorVer<x><y> naming, 16 pixels wide
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) // scratch buffer pTmp holding two planes: [0..255] and [256..511], each with row stride 16
+
+  pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight); // horizontal half-sample (2, 0) plane into the first half
+  pfMcHorVer22WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight); // centre half-sample (2, 2) plane into the second half
+  pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); // combine the two planes into pDst (presumably a rounded average)
+}
+static inline void McHorVer23WidthEq16 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iHeight) { // quarter-sample location (2, 3) per this file's HorVer<x><y> naming, 16 pixels wide
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) // scratch buffer pTmp holding two planes: [0..255] and [256..511], each with row stride 16
+
+  pfMcHorVer20WidthEq16 (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); // horizontal half-sample (2, 0) plane taken one row below pSrc
+  pfMcHorVer22WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight); // centre half-sample (2, 2) plane into the second half
+  pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); // combine the two planes into pDst (presumably a rounded average)
+}
+static inline void McHorVer30WidthEq16 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iHeight) { // quarter-sample location (3, 0) per this file's HorVer<x><y> naming, 16 pixels wide
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16) // scratch buffer pTmp, used below with a row stride of 16 (presumably 256 bytes, 16-byte aligned)
+
+  pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight); // horizontal half-sample (2, 0) plane into pTmp
+  pfPixelAvgWidthEq16 (pDst, iDstStride, pSrc + 1, iSrcStride, pTmp, 16, iHeight); // combine the source samples one column right of pSrc with that plane (presumably a rounded average)
+}
+static inline void McHorVer31WidthEq16 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iHeight) { // quarter-sample location (3, 1) per this file's HorVer<x><y> naming, 16 pixels wide
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) // scratch buffer pTmp holding two planes: [0..255] and [256..511], each with row stride 16
+
+  pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight); // horizontal half-sample (2, 0) plane into the first half
+  pfMcHorVer02WidthEq16 (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight); // vertical half-sample (0, 2) plane taken one column right of pSrc
+  pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); // combine the two planes into pDst (presumably a rounded average)
+}
+static inline void McHorVer32WidthEq16 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iHeight) { // quarter-sample location (3, 2) per this file's HorVer<x><y> naming, 16 pixels wide
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) // scratch buffer pTmp holding two planes: [0..255] and [256..511], each with row stride 16
+
+  pfMcHorVer02WidthEq16 (pSrc + 1, iSrcStride, pTmp, 16, iHeight); // vertical half-sample (0, 2) plane taken one column right of pSrc
+  pfMcHorVer22WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight); // centre half-sample (2, 2) plane into the second half
+  pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); // combine the two planes into pDst (presumably a rounded average)
+}
+static inline void McHorVer33WidthEq16 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iHeight) { // quarter-sample location (3, 3) per this file's HorVer<x><y> naming, 16 pixels wide
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) // scratch buffer pTmp holding two planes: [0..255] and [256..511], each with row stride 16
+
+  pfMcHorVer20WidthEq16 (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); // horizontal half-sample (2, 0) plane taken one row below pSrc
+  pfMcHorVer02WidthEq16 (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight); // vertical half-sample (0, 2) plane taken one column right of pSrc
+  pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); // combine the two planes into pDst (presumably a rounded average)
+}
+
+static inline void McHorVer20_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                 int32_t iHeight) { // horizontal half-sample filter over an iWidth x iHeight block (variable-width form of the WidthEq16 variant)
+  int32_t i, j; // i: row index, j: column index
+  for (i = 0; i < iHeight; i++) {
+    for (j = 0; j < iWidth; j++) {
+      pDst[j] = WELS_CLIP1 ((fpHorFilter (pSrc + j) + 16) >> 5); // +16 then >>5 rounds the filter sum; WELS_CLIP1 presumably clamps to the 8-bit sample range
+    }
+    pDst += iDstStride; // step both pointers to the next row
+    pSrc += iSrcStride;
+  }
+}
+// Vertical filter producing the half sample at quarter-sample location (0, 2) for an iWidth x iHeight block.
+static inline void McHorVer02_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                 int32_t iHeight) {
+  int32_t i, j; // i: row index, j: column index
+  for (i = 0; i < iHeight; i++) {
+    for (j = 0; j < iWidth; j++) {
+      pDst[j] = WELS_CLIP1 ((fpVerFilter (pSrc + j, iSrcStride) + 16) >> 5); // +16 then >>5 rounds the filter sum; WELS_CLIP1 presumably clamps to the 8-bit sample range
+    }
+    pDst += iDstStride; // step both pointers to the next row
+    pSrc += iSrcStride;
+  }
+}
+// Vertical-then-horizontal filter producing the half sample at quarter-sample location (2, 2) for an iWidth x iHeight block.
+static inline void McHorVer22_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                 int32_t iHeight) {
+  int16_t pTmp[17 + 5] = {0}; // one row of 16-bit vertical-filter results; fixed 22 entries, so iWidth must not exceed 17
+  int32_t i, j, k; // i: row, j: scratch column, k: output column
+
+  for (i = 0; i < iHeight; i++) {
+    for (j = 0; j < iWidth + 5; j++) {
+      pTmp[j] = fpVerFilter (pSrc - 2 + j, iSrcStride); // unrounded vertical pass, starting 2 columns left of pSrc
+    }
+    for (k = 0; k < iWidth; k++) {
+      pDst[k] = WELS_CLIP1 ((fpHorFilterInput16Bits (&pTmp[2 + k]) + 512) >> 10); // single rounding (+512, >>10) applied after both passes
+    }
+    pSrc += iSrcStride; // step both pointers to the next row
+    pDst += iDstStride;
+  }
+}
+static inline void McCopy (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                           int32_t iHeight) { // copies an iWidth x iHeight block from pSrc to pDst, preferring a width-specific routine
+  int32_t i; // row index for the generic fallback
+  if (iWidth == 16 && McCopyWidthEq16 != NULL) // width-specific function pointers are used only when they have been set
+    McCopyWidthEq16 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 8 && McCopyWidthEq8 != NULL)
+    McCopyWidthEq8 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 4 && McCopyWidthEq4 != NULL)
+    McCopyWidthEq4 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else { // any other width, or pointer not set: copy row by row
+    for (i = 0; i < iHeight; i++) {
+      memcpy (pDst, pSrc, iWidth);	// confirmed_safe_unsafe_usage
+      pDst += iDstStride;
+      pSrc += iSrcStride;
+    }
+  }
+}
+
+void McChroma_c (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                 SMVUnitXY mv, int32_t iWidth, int32_t iHeight)
+// Chroma motion compensation (plain C): pSrc has already been offset by mv; only the low 3 bits of mv are used here.
+{
+  const int32_t kiDx = mv.iMvX & 0x07; // fractional part (0..7) of the horizontal component
+  const int32_t kiDy = mv.iMvY & 0x07; // fractional part (0..7) of the vertical component
+
+  if (0 == kiDx && 0 == kiDy) { // no fractional offset: plain block copy
+    McCopy (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+  } else {
+    const int32_t kiDA = g_kuiABCD[kiDy][kiDx][0]; // weight of the sample at [j] in the current row
+    const int32_t kiDB = g_kuiABCD[kiDy][kiDx][1]; // weight of the sample at [j + 1] in the current row
+    const int32_t kiDC = g_kuiABCD[kiDy][kiDx][2]; // weight of the sample at [j] in the next row
+    const int32_t kiDD = g_kuiABCD[kiDy][kiDx][3]; // weight of the sample at [j + 1] in the next row
+
+    int32_t i, j; // i: row index, j: column index
+
+    uint8_t* pSrcNext = pSrc + iSrcStride; // row below the current source row
+
+    for (i = 0; i < iHeight; i++) {
+      for (j = 0; j < iWidth; j++) { // reads one column past iWidth and one row past iHeight from the source
+        pDst[j] = (kiDA * pSrc[j] + kiDB * pSrc[j + 1] + kiDC * pSrcNext[j] + kiDD * pSrcNext[j + 1] + 32) >> 6; // weighted 2x2 sum, rounded (+32) and divided by 64
+      }
+      pDst += iDstStride;
+      pSrc = pSrcNext; // the next row becomes the current row
+      pSrcNext += iSrcStride;
+    }
+  }
+}
+//***************************************************************************//
+//                       MMXEXT and SSE2 implementation                      //
+//***************************************************************************//
+#if defined(X86_ASM)
+
+static inline void McHorVer22WidthEq8_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int32_t iHeight) { // (2, 2) half-sample for an 8-pixel-wide block via two SSE2 passes: horizontal first, vertical last
+  ENFORCE_STACK_ALIGN_2D (int16_t, pTap, 21, 8, 16) // intermediate int16_t buffer pTap (presumably 21 rows x 8 columns, 16-byte aligned)
+  McHorVer22Width8HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)pTap, 16, iHeight + 5); // first pass over iHeight + 5 rows into pTap (16-byte row stride); NOTE(review): the pSrc - 2 origin convention is defined by the asm routine, confirm there
+  McHorVer22VerLastAlign_sse2 ((uint8_t*)pTap, 16, pDst, iDstStride, 8, iHeight); // second pass from pTap into pDst, 8 columns x iHeight rows
+}
+
+//2010.2.5
+
+static inline void McHorVer02WidthEq16_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* PDst, int32_t iDstStride,
+    int32_t iHeight) { // 16-wide (0, 2) filter built from two calls to the 8-wide SSE2 routine
+  McHorVer02WidthEq8_sse2 (pSrc,     iSrcStride, PDst,     iDstStride, iHeight); // left 8 columns
+  McHorVer02WidthEq8_sse2 (&pSrc[8], iSrcStride, &PDst[8], iDstStride, iHeight); // right 8 columns
+}
+static inline void McHorVer22WidthEq16_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int32_t iHeight) { // 16-wide (2, 2) filter built from two calls to the 8-wide SSE2 routine
+  McHorVer22WidthEq8_sse2 (pSrc,     iSrcStride, pDst,     iDstStride, iHeight); // left 8 columns
+  McHorVer22WidthEq8_sse2 (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight); // right 8 columns
+}
+void McHorVer22_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                      int32_t iHeight) { // (2, 2) half-sample for an iWidth x iHeight block via SSE2: horizontal pass, then two vertical passes
+  ENFORCE_STACK_ALIGN_2D (int16_t, pTap, 22, 24, 16) // intermediate int16_t buffer pTap (presumably 22 rows x 24 columns, i.e. 48 bytes per row, 16-byte aligned)
+  int32_t tmp1 = 2 * (iWidth - 8); // byte offset of column (iWidth - 8) within a pTap row (2 bytes per int16_t entry)
+  McHorVer22HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)pTap, 48, iWidth, iHeight + 5); // first pass over iHeight + 5 rows into pTap (48-byte row stride)
+  McHorVer22VerLastAlign_sse2 ((uint8_t*)pTap,  48, pDst, iDstStride, iWidth - 1, iHeight); // vertical pass for the leading iWidth - 1 columns
+  McHorVer22VerLastUnAlign_sse2 ((uint8_t*)pTap + tmp1,  48, pDst + iWidth - 8, iDstStride, 8, iHeight); // vertical pass for the last 8 columns, ending exactly at column iWidth
+}
+
+typedef void (*McChromaWidthEqx) (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                  const uint8_t* pABCD, int32_t iHeigh); // signature of the fixed-width chroma MC kernels; pABCD receives one g_kuiABCD entry
+void McChroma_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                    SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) { // chroma motion compensation dispatching to MMX/SSE2 kernels
+  const int32_t kiD8x = sMv.iMvX & 0x07; // fractional part (0..7) of the horizontal component
+  const int32_t kiD8y = sMv.iMvY & 0x07; // fractional part (0..7) of the vertical component
+  static const McChromaWidthEqx kpfFuncs[2] = { // indexed by iWidth >> 3
+    McChromaWidthEq4_mmx, // index 0
+    McChromaWidthEq8_sse2 // index 1
+  };
+
+  if (0 == kiD8x && 0 == kiD8y) { // no fractional offset: plain block copy
+    McCopy (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+  } else {
+    kpfFuncs[ (iWidth >> 3)] (pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiD8y][kiD8x], iHeight); // NOTE(review): no bounds check; iWidth >= 16 would index past the 2-entry table, so callers presumably pass only 4 or 8
+  }
+}
+
+void McChroma_ssse3 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                     SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) { // chroma motion compensation dispatching to MMX/SSSE3 kernels
+  const int32_t kiD8x = sMv.iMvX & 0x07; // fractional part (0..7) of the horizontal component
+  const int32_t kiD8y = sMv.iMvY & 0x07; // fractional part (0..7) of the vertical component
+
+  static const McChromaWidthEqx kpfFuncs[2] = { // indexed by iWidth >> 3
+    McChromaWidthEq4_mmx, // index 0
+    McChromaWidthEq8_ssse3 // index 1
+  };
+  if (0 == kiD8x && 0 == kiD8y) { // no fractional offset: plain block copy
+    McCopy (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+  } else {
+    kpfFuncs[ (iWidth >> 3)] (pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiD8y][kiD8x], iHeight); // NOTE(review): no bounds check; iWidth >= 16 would index past the 2-entry table, so callers presumably pass only 4 or 8
+  }
+
+}
+
+#endif //X86_ASM
+typedef void (*PixelAvgFunc) (uint8_t*, int32_t, uint8_t*, int32_t, uint8_t*, int32_t, int32_t); // signature shared by the PixelAvgWidthEq* routines
+void WelsInitMcFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) { // installs the C motion-compensation routines, then overrides them according to the CPU feature bits in uiCpuFlag
+  static PixelAvgFunc pfPixAvgFunc[2] = {PixelAvgWidthEq8_c, PixelAvgWidthEq16_c}; // [0]: width 8, [1]: width 16
+
+  static PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16[16] = { //[y*4+x]
+    McCopyWidthEq16_c,  McHorVer10WidthEq16, McHorVer20WidthEq16_c,     McHorVer30WidthEq16, // y = 0
+    McHorVer01WidthEq16, McHorVer11WidthEq16, McHorVer21WidthEq16, McHorVer31WidthEq16, // y = 1
+    McHorVer02WidthEq16_c,     McHorVer12WidthEq16, McHorVer22WidthEq16_c,    McHorVer32WidthEq16, // y = 2
+    McHorVer03WidthEq16, McHorVer13WidthEq16, McHorVer23WidthEq16, McHorVer33WidthEq16 // y = 3
+  };
+#if defined (X86_ASM)
+  static PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16_sse2[16] = { // same [y*4+x] layout with SSE2 copy and half-sample entries
+    McCopyWidthEq16_sse2,  McHorVer10WidthEq16, McHorVer20WidthEq16_sse2,     McHorVer30WidthEq16, // y = 0
+    McHorVer01WidthEq16, McHorVer11WidthEq16, McHorVer21WidthEq16, McHorVer31WidthEq16, // y = 1
+    McHorVer02WidthEq16_sse2,     McHorVer12WidthEq16, McHorVer22WidthEq16_sse2,    McHorVer32WidthEq16, // y = 2
+    McHorVer03WidthEq16, McHorVer13WidthEq16, McHorVer23WidthEq16, McHorVer33WidthEq16 // y = 3
+  };
+#endif
+
+  pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20_c; // defaults: plain C implementations
+  pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02_c;
+  pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22_c;
+  pFuncList->sMcFuncs.pfSampleAveraging = pfPixAvgFunc; // points at the function-static table declared above
+  pFuncList->sMcFuncs.pfChromaMc	= McChroma_c;
+  fpVerFilter				= VerFilter_c; // the remaining assignments set function pointers declared outside this function
+  fpHorFilter				= HorFilter_c;
+  fpHorFilterInput16Bits			= HorFilterInput16bit1_c;
+  McCopyWidthEq4 = McCopyWidthEq4_c;
+  McCopyWidthEq8 = McCopyWidthEq8_c;
+  McCopyWidthEq16 = McCopyWidthEq16_c;
+  pfPixelAvgWidthEq16 = PixelAvgWidthEq16_c;
+  pfMcHorVer02WidthEq16 = McHorVer02WidthEq16_c; // half-sample helpers called by the McHorVerXYWidthEq16 quarter-sample wrappers
+  pfMcHorVer20WidthEq16 = McHorVer20WidthEq16_c;
+  pfMcHorVer22WidthEq16 = McHorVer22WidthEq16_c;
+  pFuncList->sMcFuncs.pfLumaQuarpelMc = pWelsMcFuncWidthEq16;
+#if defined (X86_ASM)
+  if (uiCpuFlag & WELS_CPU_SSE2) { // SSE2 available: replace the defaults with MMX/SSE2 routines
+    pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20_sse2;
+    pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02_sse2;
+    pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22_sse2;
+    pFuncList->sMcFuncs.pfSampleAveraging[0] = PixelAvgWidthEq8_mmx; // NOTE(review): writes into the function-static pfPixAvgFunc table, so the override persists across later calls even without SSE2 in uiCpuFlag; confirm this is intended
+    pFuncList->sMcFuncs.pfSampleAveraging[1] = PixelAvgWidthEq16_sse2;
+    pFuncList->sMcFuncs.pfChromaMc = McChroma_sse2;
+    McCopyWidthEq4 = McCopyWidthEq4_mmx;
+    McCopyWidthEq8 = McCopyWidthEq8_mmx;
+    McCopyWidthEq16 = McCopyWidthEq16_sse2;
+    pfPixelAvgWidthEq16 = PixelAvgWidthEq16_sse2;
+    pfMcHorVer02WidthEq16 = McHorVer02WidthEq16_sse2;
+    pfMcHorVer20WidthEq16 = McHorVer20WidthEq16_sse2;
+    pfMcHorVer22WidthEq16 = McHorVer22WidthEq16_sse2;
+    pFuncList->sMcFuncs.pfLumaQuarpelMc = pWelsMcFuncWidthEq16_sse2;
+  }
+
+  if (uiCpuFlag & WELS_CPU_SSSE3) { // SSSE3 available: further override chroma MC and 16-wide averaging
+    pFuncList->sMcFuncs.pfChromaMc = McChroma_ssse3;
+    pFuncList->sMcFuncs.pfSampleAveraging[1] = PixelAvgWidthEq16_ssse3; // NOTE(review): also writes into the shared static table (see the SSE2 branch)
+  }
+
+#endif //(X86_ASM)
+}
+}
--- a/codec/encoder/core/src/md.cpp
+++ b/codec/encoder/core/src/md.cpp
@@ -1,1034 +1,928 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	md.c
- *
- * \brief	mode decision
- *
- * \date	2009.05.14 Created
- *
- *************************************************************************************
- */
-
-#include <string.h>
-#include "ls_defines.h"
-#include "encoder_context.h"
-#include "svc_enc_slice_segment.h"
-#include "md.h"
-#include "mc.h"
-#include "mv_pred.h"
-#include "cpu_core.h"
-#include "svc_enc_golomb.h"
-#include "sample.h"
-#include "array_stack_align.h"
-
-namespace WelsSVCEnc {
-#define INTRA_VARIANCE_SAD_THRESHOLD 150
-#define INTER_VARIANCE_SAD_THRESHOLD 20
-
-//fill cache of neighbor MB, containing pNonZeroCount, sample_avail, pIntra4x4PredMode
-void FillNeighborCacheIntra(SMbCache* pMbCache, SMB* pCurMb, int32_t iMbWidth)
-{
-	uint32_t uiNeighborAvail = pCurMb->uiNeighborAvail;
-	uint32_t uiNeighborIntra = 0;
-
-	if (uiNeighborAvail & LEFT_MB_POS) //LEFT MB
-	{
-		int8_t* pLeftMbNonZeroCount = pCurMb->pNonZeroCount - MB_LUMA_CHROMA_BLOCK4x4_NUM;
-		pMbCache->iNonZeroCoeffCount[8] = pLeftMbNonZeroCount[ 3];
-		pMbCache->iNonZeroCoeffCount[16] = pLeftMbNonZeroCount[ 7];
-		pMbCache->iNonZeroCoeffCount[24] = pLeftMbNonZeroCount[11];
-		pMbCache->iNonZeroCoeffCount[32] = pLeftMbNonZeroCount[15];
-
-		pMbCache->iNonZeroCoeffCount[ 13] = pLeftMbNonZeroCount[17]; 
-		pMbCache->iNonZeroCoeffCount[21] = pLeftMbNonZeroCount[21];
-		pMbCache->iNonZeroCoeffCount[37] = pLeftMbNonZeroCount[19]; 
-		pMbCache->iNonZeroCoeffCount[45] = pLeftMbNonZeroCount[23];
-
-        uiNeighborIntra |= LEFT_MB_POS;
-
-		if ( IS_INTRA4x4((pCurMb-1)->uiMbType ) ) 
-		{
-			int8_t* pLeftMbIntra4x4PredMode = pCurMb->pIntra4x4PredMode - INTRA_4x4_MODE_NUM;
-			pMbCache->iIntraPredMode[8] = pLeftMbIntra4x4PredMode[4];
-			pMbCache->iIntraPredMode[16] = pLeftMbIntra4x4PredMode[5];
-			pMbCache->iIntraPredMode[24] = pLeftMbIntra4x4PredMode[6];
-			pMbCache->iIntraPredMode[32] = pLeftMbIntra4x4PredMode[3];
-		}
-		else// if ( 0 == constrained_intra_pred_flag || IS_INTRA16x16((pCurMb-1)->uiMbType )) 
-		{
-			pMbCache->iIntraPredMode[8] = 
-			pMbCache->iIntraPredMode[16] = 
-			pMbCache->iIntraPredMode[24] = 
-			pMbCache->iIntraPredMode[32] = 2; //DC		
-		}
-	}
-	else
-	{
-		pMbCache->iNonZeroCoeffCount[ 8] = 
-		pMbCache->iNonZeroCoeffCount[16] = 
-		pMbCache->iNonZeroCoeffCount[24] =
-		pMbCache->iNonZeroCoeffCount[32] = -1;//unavailable
-		pMbCache->iNonZeroCoeffCount[13] = 
-		pMbCache->iNonZeroCoeffCount[21] =
-		pMbCache->iNonZeroCoeffCount[37] =
-		pMbCache->iNonZeroCoeffCount[45] = -1;//unavailable
-
-		pMbCache->iIntraPredMode[8] = 
-		pMbCache->iIntraPredMode[16] = 
-		pMbCache->iIntraPredMode[24] = 
-		pMbCache->iIntraPredMode[32] = -1;//unavailable
-	}
-
-	if (uiNeighborAvail & TOP_MB_POS)//TOP MB
-	{
-		SMB* pTopMb = pCurMb - iMbWidth;
-		ST32(&pMbCache->iNonZeroCoeffCount[1], LD32(&pTopMb->pNonZeroCount[12]));
-
-		ST16(&pMbCache->iNonZeroCoeffCount[6], LD16(&pTopMb->pNonZeroCount[20]));
-		ST16(&pMbCache->iNonZeroCoeffCount[30], LD16(&pTopMb->pNonZeroCount[22]));
-		
-        uiNeighborIntra |= TOP_MB_POS;
-
-		if ( IS_INTRA4x4( pTopMb->uiMbType ) ) 
-		{
-			ST32(pMbCache->iIntraPredMode+1, LD32(&pTopMb->pIntra4x4PredMode[0]));
-		}
-		else// if ( 0 == constrained_intra_pred_flag || IS_INTRA16x16( pTopMb->uiMbType )) 
-		{
-			const uint32_t kuiDc32 = 0x02020202;
-			ST32( pMbCache->iIntraPredMode+1 , kuiDc32 );
-		}
-	}
-	else
-	{
-		const uint32_t kuiUnavail32 = 0xffffffff;
-		ST32( pMbCache->iIntraPredMode+1 , kuiUnavail32 );
-		ST32( &pMbCache->iNonZeroCoeffCount[1], kuiUnavail32 );
-
-		ST16( &pMbCache->iNonZeroCoeffCount[6], 0xffff );
-		ST16( &pMbCache->iNonZeroCoeffCount[30], 0xffff );
-	}
-
-	if (uiNeighborAvail & TOPLEFT_MB_POS)
-	{
-        uiNeighborIntra |= 0x04;
-	}
-
-	
-	if (uiNeighborAvail & TOPRIGHT_MB_POS)
-    {
-        uiNeighborIntra |= 0x08;
-	}
-	pMbCache->uiNeighborIntra = uiNeighborIntra;
-}
-//fill cache of neighbor MB, containing motion_vector and uiRefIndex
-void FillNeighborCacheInterWithoutBGD(SMbCache* pMbCache, SMB* pCurMb, int32_t iMbWidth, int8_t *pVaaBgMbFlag)
-{	
-	uint32_t uiNeighborAvail = pCurMb->uiNeighborAvail;
-	SMB* pLeftMb = pCurMb -1 ;
-	SMB* pTopMb = pCurMb -iMbWidth;
-	SMB* pLeftTopMb = pCurMb - iMbWidth - 1 ;
-	SMB* iRightTopMb = pCurMb -iMbWidth + 1 ;
-	SMVComponentUnit *pMvComp = &pMbCache->sMvComponents;
-	if( (uiNeighborAvail & LEFT_MB_POS) && IS_SVC_INTER(pLeftMb->uiMbType) )	
-	{
-		pMvComp->sMotionVectorCache[ 6] = pLeftMb->sMv[ 3];
-		pMvComp->sMotionVectorCache[12] = pLeftMb->sMv[ 7];
-		pMvComp->sMotionVectorCache[18] = pLeftMb->sMv[11];
-		pMvComp->sMotionVectorCache[24] = pLeftMb->sMv[15];
-		pMvComp->iRefIndexCache[ 6] = pLeftMb->pRefIndex[1];
-		pMvComp->iRefIndexCache[12] = pLeftMb->pRefIndex[1];
-		pMvComp->iRefIndexCache[18] = pLeftMb->pRefIndex[3];			
-		pMvComp->iRefIndexCache[24] = pLeftMb->pRefIndex[3];			
-		pMbCache->iSadCost[3] = pLeftMb->pSadCost[0];
-
-		if ( pLeftMb->uiMbType == MB_TYPE_SKIP )
-		{
-			pMbCache->bMbTypeSkip[3] = 1;
-			pMbCache->iSadCostSkip[3] = pMbCache->pEncSad[-1];
-		}
-		else
-		{
-			pMbCache->bMbTypeSkip[3] = 0;
-			pMbCache->iSadCostSkip[3] = 0;
-		}
-	}
-	else //avail or non-inter
-	{
-		ST32(&pMvComp->sMotionVectorCache[ 6], 0);
-		ST32(&pMvComp->sMotionVectorCache[12], 0);
-		ST32(&pMvComp->sMotionVectorCache[18], 0);
-		ST32(&pMvComp->sMotionVectorCache[24], 0);
-		pMvComp->iRefIndexCache[ 6] =
-			pMvComp->iRefIndexCache[12] =
-			pMvComp->iRefIndexCache[18] =		
-			pMvComp->iRefIndexCache[24] = (uiNeighborAvail & LEFT_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;			
-		pMbCache->iSadCost[3] = 0;
-		pMbCache->bMbTypeSkip[3] = 0;
-		pMbCache->iSadCostSkip[3] = 0;
-	}
-
-	if ( (uiNeighborAvail & TOP_MB_POS) && IS_SVC_INTER(pTopMb->uiMbType) ) //TOP MB	
-	{
-		ST64(&pMvComp->sMotionVectorCache[1], LD64(&pTopMb->sMv[12]));
-		ST64(&pMvComp->sMotionVectorCache[3], LD64(&pTopMb->sMv[14]));
-		pMvComp->iRefIndexCache[1] = pTopMb->pRefIndex[2];
-		pMvComp->iRefIndexCache[2] = pTopMb->pRefIndex[2];
-		pMvComp->iRefIndexCache[3] = pTopMb->pRefIndex[3];
-		pMvComp->iRefIndexCache[4] = pTopMb->pRefIndex[3];
-		pMbCache->iSadCost[1] = pTopMb->pSadCost[0];	
-
-		if ( pTopMb->uiMbType == MB_TYPE_SKIP )
-		{
-			pMbCache->bMbTypeSkip[1] = 1;
-			pMbCache->iSadCostSkip[1] = pMbCache->pEncSad[-iMbWidth];
-		}
-		else
-		{
-			pMbCache->bMbTypeSkip[1] = 0;
-			pMbCache->iSadCostSkip[1] = 0;
-		}			
-	}
-	else //unavail
-	{
-		ST64(&pMvComp->sMotionVectorCache[1], 0);
-		ST64(&pMvComp->sMotionVectorCache[3], 0);
-		pMvComp->iRefIndexCache[1] = 
-			pMvComp->iRefIndexCache[2] = 
-			pMvComp->iRefIndexCache[3] = 
-			pMvComp->iRefIndexCache[4] = (uiNeighborAvail & TOP_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;
-		pMbCache->iSadCost[1] = 0; 
-
-		pMbCache->bMbTypeSkip[1] = 0;
-		pMbCache->iSadCostSkip[1] = 0;	
-	}
-
-	if ( (uiNeighborAvail & TOPLEFT_MB_POS) && IS_SVC_INTER(pLeftTopMb->uiMbType) ) //LEFT_TOP MB	
-	{
-		pMvComp->sMotionVectorCache[0] = pLeftTopMb->sMv[15];
-		pMvComp->iRefIndexCache[0] = pLeftTopMb->pRefIndex[3];		
-		pMbCache->iSadCost[0] = pLeftTopMb->pSadCost[0];
-
-		if ( pLeftTopMb->uiMbType == MB_TYPE_SKIP )
-		{
-			pMbCache->bMbTypeSkip[0] = 1;
-			pMbCache->iSadCostSkip[0] = pMbCache->pEncSad[-iMbWidth-1];
-		}
-		else
-		{
-			pMbCache->bMbTypeSkip[0] = 0;
-			pMbCache->iSadCostSkip[0] = 0;
-		}
-	}
-	else //unavail
-	{
-		ST32(&pMvComp->sMotionVectorCache[0], 0);
-		pMvComp->iRefIndexCache[0] = (uiNeighborAvail & TOPLEFT_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;
-		pMbCache->iSadCost[0] = 0;
-		pMbCache->bMbTypeSkip[0] = 0;
-		pMbCache->iSadCostSkip[0] = 0;
-	}
-
-	if ((uiNeighborAvail & TOPRIGHT_MB_POS) && IS_SVC_INTER(iRightTopMb->uiMbType) ) //RIGHT_TOP MB	
-	{
-		pMvComp->sMotionVectorCache[5] = iRightTopMb->sMv[12];
-		pMvComp->iRefIndexCache[5] = iRightTopMb->pRefIndex[2];
-		pMbCache->iSadCost[2] = iRightTopMb->pSadCost[0];	
-
-		if ( iRightTopMb->uiMbType == MB_TYPE_SKIP )
-		{
-			pMbCache->bMbTypeSkip[2] = 1;
-			pMbCache->iSadCostSkip[2] = pMbCache->pEncSad[-iMbWidth+1];
-		}
-		else
-		{
-			pMbCache->bMbTypeSkip[2] = 0;
-			pMbCache->iSadCostSkip[2] = 0;
-		}		
-	}
-	else //unavail
-	{
-		ST32(&pMvComp->sMotionVectorCache[5], 0);
-		pMvComp->iRefIndexCache[5] = (uiNeighborAvail & TOPRIGHT_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;
-		pMbCache->iSadCost[2] = 0;
-		pMbCache->bMbTypeSkip[2] = 0;
-		pMbCache->iSadCostSkip[2] = 0;
-	}
-
-	//right-top 4*4 pBlock unavailable
-	ST32(&pMvComp->sMotionVectorCache[ 9], 0);
-	ST32(&pMvComp->sMotionVectorCache[21], 0);
-	ST32(&pMvComp->sMotionVectorCache[11], 0);
-	ST32(&pMvComp->sMotionVectorCache[17], 0);
-	ST32(&pMvComp->sMotionVectorCache[23], 0);
-	pMvComp->iRefIndexCache[ 9] = 
-	pMvComp->iRefIndexCache[11] =
-	pMvComp->iRefIndexCache[17] =
-	pMvComp->iRefIndexCache[21] = 
-	pMvComp->iRefIndexCache[23] = REF_NOT_AVAIL;
-}
-
-void FillNeighborCacheInterWithBGD(SMbCache* pMbCache, SMB* pCurMb, int32_t iMbWidth, int8_t *pVaaBgMbFlag)
-{	
-	uint32_t uiNeighborAvail = pCurMb->uiNeighborAvail;
-	SMB* pLeftMb = pCurMb -1 ;
-	SMB* pTopMb = pCurMb -iMbWidth;
-	SMB* pLeftTopMb = pCurMb - iMbWidth - 1 ;
-	SMB* iRightTopMb = pCurMb -iMbWidth + 1 ;
-	SMVComponentUnit *pMvComp = &pMbCache->sMvComponents;
-
-	if( (uiNeighborAvail & LEFT_MB_POS) && IS_SVC_INTER(pLeftMb->uiMbType) )	
-	{
-		pMvComp->sMotionVectorCache[ 6] = pLeftMb->sMv[ 3];
-		pMvComp->sMotionVectorCache[12] = pLeftMb->sMv[ 7];
-		pMvComp->sMotionVectorCache[18] = pLeftMb->sMv[11];
-		pMvComp->sMotionVectorCache[24] = pLeftMb->sMv[15];
-		pMvComp->iRefIndexCache[ 6] = pLeftMb->pRefIndex[1];
-		pMvComp->iRefIndexCache[12] = pLeftMb->pRefIndex[1];
-		pMvComp->iRefIndexCache[18] = pLeftMb->pRefIndex[3];			
-		pMvComp->iRefIndexCache[24] = pLeftMb->pRefIndex[3];			
-		pMbCache->iSadCost[3] = pLeftMb->pSadCost[0];
-
-		if ( pLeftMb->uiMbType == MB_TYPE_SKIP && pVaaBgMbFlag[-1] == 0)
-		{
-			pMbCache->bMbTypeSkip[3] = 1;
-			pMbCache->iSadCostSkip[3] = pMbCache->pEncSad[-1];
-		}
-		else
-		{
-			pMbCache->bMbTypeSkip[3] = 0;
-			pMbCache->iSadCostSkip[3] = 0;
-		}
-	}
-	else //avail or non-inter
-	{
-		ST32(&pMvComp->sMotionVectorCache[ 6], 0);
-		ST32(&pMvComp->sMotionVectorCache[12], 0);
-		ST32(&pMvComp->sMotionVectorCache[18], 0);
-		ST32(&pMvComp->sMotionVectorCache[24], 0);
-		pMvComp->iRefIndexCache[ 6] =
-		pMvComp->iRefIndexCache[12] =
-		pMvComp->iRefIndexCache[18] =		
-		pMvComp->iRefIndexCache[24] = (uiNeighborAvail & LEFT_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;
-		pMbCache->iSadCost[3] = 0;
-		pMbCache->bMbTypeSkip[3] = 0;
-		pMbCache->iSadCostSkip[3] = 0;
-	}
-
-	if ( (uiNeighborAvail & TOP_MB_POS) && IS_SVC_INTER(pTopMb->uiMbType) ) //TOP MB	
-	{
-		ST64(&pMvComp->sMotionVectorCache[1], LD64(&pTopMb->sMv[12]));
-		ST64(&pMvComp->sMotionVectorCache[3], LD64(&pTopMb->sMv[14]));
-		pMvComp->iRefIndexCache[1] = pTopMb->pRefIndex[2];
-		pMvComp->iRefIndexCache[2] = pTopMb->pRefIndex[2];
-		pMvComp->iRefIndexCache[3] = pTopMb->pRefIndex[3];
-		pMvComp->iRefIndexCache[4] = pTopMb->pRefIndex[3];
-		pMbCache->iSadCost[1] = pTopMb->pSadCost[0];	
-		if ( pTopMb->uiMbType == MB_TYPE_SKIP  && pVaaBgMbFlag[-iMbWidth] == 0 )
-		{
-			pMbCache->bMbTypeSkip[1] = 1;
-			pMbCache->iSadCostSkip[1] = pMbCache->pEncSad[-iMbWidth];
-		}
-		else
-		{
-			pMbCache->bMbTypeSkip[1] = 0;
-			pMbCache->iSadCostSkip[1] = 0;
-		}				
-	}
-	else //unavail
-	{
-		ST64(&pMvComp->sMotionVectorCache[1], 0);
-		ST64(&pMvComp->sMotionVectorCache[3], 0);
-		pMvComp->iRefIndexCache[1] = 
-			pMvComp->iRefIndexCache[2] = 
-			pMvComp->iRefIndexCache[3] = 
-			pMvComp->iRefIndexCache[4] = (uiNeighborAvail & TOP_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;
-		pMbCache->iSadCost[1] = 0; 
-		pMbCache->bMbTypeSkip[1] = 0;
-		pMbCache->iSadCostSkip[1] = 0;	
-	}
-
-
-	if ( (uiNeighborAvail & TOPLEFT_MB_POS) && IS_SVC_INTER(pLeftTopMb->uiMbType) ) //LEFT_TOP MB	
-	{
-		pMvComp->sMotionVectorCache[0] = pLeftTopMb->sMv[15];
-		pMvComp->iRefIndexCache[0] = pLeftTopMb->pRefIndex[3];		
-		pMbCache->iSadCost[0] = pLeftTopMb->pSadCost[0];
-
-		if ( pLeftTopMb->uiMbType == MB_TYPE_SKIP  && pVaaBgMbFlag[-iMbWidth-1] == 0 )
-		{
-			pMbCache->bMbTypeSkip[0] = 1;
-			pMbCache->iSadCostSkip[0] = pMbCache->pEncSad[-iMbWidth-1];
-		}
-		else
-		{
-			pMbCache->bMbTypeSkip[0] = 0;
-			pMbCache->iSadCostSkip[0] = 0;
-		}
-	}
-	else //unavail
-	{
-		ST32(&pMvComp->sMotionVectorCache[0], 0);
-		pMvComp->iRefIndexCache[0] = (uiNeighborAvail & TOPLEFT_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;
-		pMbCache->iSadCost[0] = 0;
-		pMbCache->bMbTypeSkip[0] = 0;
-		pMbCache->iSadCostSkip[0] = 0;
-	}
-
-	if ((uiNeighborAvail & TOPRIGHT_MB_POS) && IS_SVC_INTER(iRightTopMb->uiMbType) ) //RIGHT_TOP MB	
-	{
-		pMvComp->sMotionVectorCache[5] = iRightTopMb->sMv[12];
-		pMvComp->iRefIndexCache[5] = iRightTopMb->pRefIndex[2];
-		pMbCache->iSadCost[2] = iRightTopMb->pSadCost[0];	
-
-		if ( iRightTopMb->uiMbType == MB_TYPE_SKIP  && pVaaBgMbFlag[-iMbWidth+1] == 0 )
-		{
-			pMbCache->bMbTypeSkip[2] = 1;
-			pMbCache->iSadCostSkip[2] = pMbCache->pEncSad[-iMbWidth+1];
-		}
-		else
-		{
-			pMbCache->bMbTypeSkip[2] = 0;
-			pMbCache->iSadCostSkip[2] = 0;
-		}		
-	}
-	else //unavail
-	{
-		ST32(&pMvComp->sMotionVectorCache[5], 0);
-		pMvComp->iRefIndexCache[5] = (uiNeighborAvail & TOPRIGHT_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;
-		pMbCache->iSadCost[2] = 0;
-		pMbCache->bMbTypeSkip[2] = 0;
-		pMbCache->iSadCostSkip[2] = 0;	
-	}
-
-	//right-top 4*4 pBlock unavailable
-	ST32(&pMvComp->sMotionVectorCache[ 9], 0);
-	ST32(&pMvComp->sMotionVectorCache[21], 0);
-	ST32(&pMvComp->sMotionVectorCache[11], 0);
-	ST32(&pMvComp->sMotionVectorCache[17], 0);
-	ST32(&pMvComp->sMotionVectorCache[23], 0);
-	pMvComp->iRefIndexCache[ 9] = 
-		pMvComp->iRefIndexCache[11] =
-		pMvComp->iRefIndexCache[17] =
-		pMvComp->iRefIndexCache[21] = 
-		pMvComp->iRefIndexCache[23] = REF_NOT_AVAIL;
-}
-
-void InitFillNeighborCacheInterFunc( SWelsFuncPtrList *pFuncList, const int32_t kiFlag )
-{
-	pFuncList->pfFillInterNeighborCache = kiFlag ? FillNeighborCacheInterWithBGD : FillNeighborCacheInterWithoutBGD;
-}
-
-void UpdateMbMv_c( SMVUnitXY *pMvBuffer, const SMVUnitXY ksMv )
-{
-	int32_t k = 0;
-	for (; k < MB_BLOCK4x4_NUM; k += 4)
-	{
-		pMvBuffer[k  ] = 
-		pMvBuffer[k+1] =
-		pMvBuffer[k+2] = 
-		pMvBuffer[k+3] = ksMv;
-	}
-}
-
-
-uint8_t MdInterAnalysisVaaInfo_c( int32_t *pSad8x8 )
-{	
-	int32_t iSadBlock[4], iAverageSadBlock[4];
-	int32_t iAverageSad, iVarianceSad;
-	
-	iSadBlock[0] = pSad8x8[0];
-	iAverageSad = iSadBlock[0];
-
-	iSadBlock[1] = pSad8x8[1];
-	iAverageSad += iSadBlock[1];
-
-	iSadBlock[2] = pSad8x8[2];
-	iAverageSad += iSadBlock[2];
-
-	iSadBlock[3] = pSad8x8[3];
-	iAverageSad += iSadBlock[3];
-
-	iAverageSad = iAverageSad >> 2;
-
-	iAverageSadBlock[0] = (iSadBlock[0] >> 6) - (iAverageSad >> 6);
-	iVarianceSad = iAverageSadBlock[0] * iAverageSadBlock[0];
-	
-	iAverageSadBlock[1] = (iSadBlock[1] >> 6) - (iAverageSad >> 6);
-	iVarianceSad += iAverageSadBlock[1] * iAverageSadBlock[1];
-
-	iAverageSadBlock[2] = (iSadBlock[2] >> 6) - (iAverageSad >> 6);
-	iVarianceSad += iAverageSadBlock[2] * iAverageSadBlock[2];
-
-	iAverageSadBlock[3] = (iSadBlock[3] >> 6) - (iAverageSad >> 6);
-	iVarianceSad += iAverageSadBlock[3] * iAverageSadBlock[3];
-
-	if ( iVarianceSad < INTER_VARIANCE_SAD_THRESHOLD )
-	{		
-		return 15;
-	}
-
-	uint8_t uiMbSign = 0;
-	if (iSadBlock[0] > iAverageSad) 
-		uiMbSign |= 0x08;
-	if (iSadBlock[1] > iAverageSad) 
-		uiMbSign |= 0x04;
-	if (iSadBlock[2] > iAverageSad) 
-		uiMbSign |= 0x02;
-	if (iSadBlock[3] > iAverageSad) 
-		uiMbSign |= 0x01;
-	return ( uiMbSign );
-}
-
-static inline int32_t AnalysisVaaInfoIntra_c( uint8_t *pDataY, const int32_t kiLineSize )
-{
-	ENFORCE_STACK_ALIGN_1D(uint16_t, uiAvgBlock, 16, 16)
-	uint16_t *pBlock = &uiAvgBlock[0];
-	uint8_t *pEncData	= pDataY;
-	const int32_t kiLineSize2	= kiLineSize << 1;
-	const int32_t kiLineSize3	= kiLineSize + kiLineSize2;
-	const int32_t kiLineSize4	= kiLineSize << 2;
-	int32_t i = 0, j = 0, num = 0;	
-	int32_t iSumAvg = 0, iSumSqr = 0;
-	
-//	analysis_vaa_info_intra_core_c( pDataY, iLineSize, pBlock );
-	for ( ; j < 16; j += 4 )
-	{
-		num = 0;
-		for ( i = 0; i < 16; i += 4, num ++ )
-		{
-			pBlock[num]	=  pEncData[i          ] + pEncData[i+1          ] + pEncData[i+2          ] + pEncData[i+3          ];
-			pBlock[num]	+= pEncData[i+kiLineSize ] + pEncData[i+kiLineSize+1 ] + pEncData[i+kiLineSize+2 ] + pEncData[i+kiLineSize+3 ];
-			pBlock[num]	+= pEncData[i+kiLineSize2] + pEncData[i+kiLineSize2+1] + pEncData[i+kiLineSize2+2] + pEncData[i+kiLineSize2+3];
-			pBlock[num]	+= pEncData[i+kiLineSize3] + pEncData[i+kiLineSize3+1] + pEncData[i+kiLineSize3+2] + pEncData[i+kiLineSize3+3];
-			pBlock[num]	>>=  4;			
-		}
-		pBlock += 4;
-		pEncData += kiLineSize4; 
-	}
-
-	pBlock = &uiAvgBlock[0];
-	i = 4;
-	for ( ; i > 0; --i )
-	{
-		iSumAvg += pBlock[0] + pBlock[1] + pBlock[2] + pBlock[3];
-		iSumSqr += pBlock[0] * pBlock[0] + pBlock[1] * pBlock[1] + pBlock[2] * pBlock[2] + pBlock[3] * pBlock[3];
-
-		pBlock += 4;
-	}
-
-
-	return /*variance =*/ (iSumSqr - ((iSumAvg * iSumAvg) >> 4));
-}
-
-// for pfGetVarianceFromIntraVaa function ptr adaptive by CPU features, 6/7/2010
-void InitIntraAnalysisVaaInfo( SWelsFuncPtrList *pFuncList, const uint32_t kuiCpuFlag )
-{
-	pFuncList->pfGetVarianceFromIntraVaa		= AnalysisVaaInfoIntra_c;
-	pFuncList->pfGetMbSignFromInterVaa	= MdInterAnalysisVaaInfo_c;
-	pFuncList->pfUpdateMbMv					= UpdateMbMv_c;
-	
-#if defined(X86_ASM)
-	if ( (kuiCpuFlag & WELS_CPU_SSE2) == WELS_CPU_SSE2 )
-	{
-		pFuncList->pfGetVarianceFromIntraVaa		= AnalysisVaaInfoIntra_sse2;	
-		pFuncList->pfGetMbSignFromInterVaa	= MdInterAnalysisVaaInfo_sse2;
-		pFuncList->pfUpdateMbMv					= UpdateMbMv_sse2;
-	}
-	if ( (kuiCpuFlag & WELS_CPU_SSSE3) == WELS_CPU_SSSE3 )
-	{
-		pFuncList->pfGetVarianceFromIntraVaa	= AnalysisVaaInfoIntra_ssse3;
-	}
-	if ( (kuiCpuFlag & WELS_CPU_SSE41) == WELS_CPU_SSE41 )
-	{
-		pFuncList->pfGetMbSignFromInterVaa	= MdInterAnalysisVaaInfo_sse41;
-	}
-#endif//X86_ASM
-}
-
-BOOL_T MdIntraAnalysisVaaInfo( sWelsEncCtx* pEncCtx, uint8_t* pEncMb )
-{	
-
-	SDqLayer* pCurDqLayer	= pEncCtx->pCurDqLayer;	
-	const int32_t kiLineSize  = pCurDqLayer->iEncStride[0];
-	const int32_t kiVariance	= pEncCtx->pFuncList->pfGetVarianceFromIntraVaa( pEncMb, kiLineSize );
-	return (kiVariance >= INTRA_VARIANCE_SAD_THRESHOLD);
-}
-
-void InitMeRefinePointer(SMeRefinePointer* pMeRefine, SMbCache* pMbCache, int32_t iStride)
-{
-	pMeRefine->pHalfPixH    = &pMbCache->pBufferInterPredMe[0] + iStride;
-	pMeRefine->pHalfPixV    = &pMbCache->pBufferInterPredMe[640] + iStride;
-
-	pMeRefine->pQuarPixBest= &pMbCache->pBufferInterPredMe[1280] + iStride;
-	pMeRefine->pQuarPixTmp  = &pMbCache->pBufferInterPredMe[1920] + iStride;
-}
-typedef struct TagQuarParams
-{	
-	int32_t iBestCost;
-	int32_t iBestHalfPix;
-	int32_t iStrideA;
-	int32_t iStrideB;
-	uint8_t * pRef;
-	uint8_t * pSrcB[4];
-	uint8_t * pSrcA[4];
-	int32_t iLms[4];
-	int32_t iBestQuarPix;
-}SQuarRefineParams;
-
-#define SWITCH_BEST_TMP_BUF(prev_best, curr_best){\
-	pParams->iBestCost = iCurCost;\
-	pTmp = prev_best;\
-	prev_best = curr_best;\
-	curr_best = pTmp;\
-}
-#define CALC_COST(me_buf, lm) ( pFunc->sSampleDealingFuncs.pfMeCost[kuiPixel](pEncMb, iStrideEnc, me_buf, ME_REFINE_BUF_STRIDE) + lm )
-
-inline void MeRefineQuarPixel( SWelsFuncPtrList *pFunc, SWelsME* pMe, SMeRefinePointer* pMeRefine, const int32_t kiWidth, const int32_t kiHeight,SQuarRefineParams *pParams, int32_t iStrideEnc )
-{
-	PWelsSampleAveragingFunc *pSampleAvg	= pFunc->sMcFuncs.pfSampleAveraging;
-	const int32_t kiAvgIndex		= kiWidth >> 4;
-	int32_t iCurCost;
-	uint8_t *pEncMb				= pMe->pEncMb;
-	uint8_t *pTmp				= NULL;
-	const uint8_t kuiPixel		= pMe->uiPixel;
-	
-	pSampleAvg[kiAvgIndex](pMeRefine->pQuarPixTmp, ME_REFINE_BUF_STRIDE, pParams->pSrcA[0], ME_REFINE_BUF_STRIDE,pParams->pSrcB[0], pParams->iStrideA, kiHeight);	
-
-	iCurCost = CALC_COST(pMeRefine->pQuarPixTmp,pParams->iLms[0]);
-	if (iCurCost < pParams->iBestCost)
-	{
-		pParams->iBestQuarPix =	ME_QUAR_PIXEL_TOP;
-		SWITCH_BEST_TMP_BUF(pMeRefine->pQuarPixBest,pMeRefine->pQuarPixTmp);
-	}
-	//=========================(0, 1)=======================//
-	pSampleAvg[kiAvgIndex](pMeRefine->pQuarPixTmp, ME_REFINE_BUF_STRIDE, pParams->pSrcA[1], 
-		ME_REFINE_BUF_STRIDE,pParams->pSrcB[1], pParams->iStrideA, kiHeight);
-	iCurCost = CALC_COST(pMeRefine->pQuarPixTmp,pParams->iLms[1]);
-	if (iCurCost < pParams->iBestCost)
-	{
-		pParams->iBestQuarPix = ME_QUAR_PIXEL_BOTTOM;
-		SWITCH_BEST_TMP_BUF(pMeRefine->pQuarPixBest,pMeRefine->pQuarPixTmp);
-	}
-	//==========================(-1, 0)=========================//
-	pSampleAvg[kiAvgIndex](pMeRefine->pQuarPixTmp, ME_REFINE_BUF_STRIDE,pParams->pSrcA[2], 
-		ME_REFINE_BUF_STRIDE,pParams->pSrcB[2], pParams->iStrideB, kiHeight);	
-	iCurCost = CALC_COST(pMeRefine->pQuarPixTmp,pParams->iLms[2]);
-	if (iCurCost < pParams->iBestCost)
-	{
-		pParams->iBestQuarPix = ME_QUAR_PIXEL_LEFT;
-		SWITCH_BEST_TMP_BUF(pMeRefine->pQuarPixBest,pMeRefine->pQuarPixTmp);
-	}
-	//==========================(1, 0)=========================//
-	pSampleAvg[kiAvgIndex](pMeRefine->pQuarPixTmp, ME_REFINE_BUF_STRIDE,pParams->pSrcA[3], 
-		ME_REFINE_BUF_STRIDE,	pParams->pSrcB[3], pParams->iStrideB,  kiHeight);
-
-	iCurCost = CALC_COST(pMeRefine->pQuarPixTmp,pParams->iLms[3]);
-	if (iCurCost < pParams->iBestCost)
-	{
-		pParams->iBestQuarPix = ME_QUAR_PIXEL_RIGHT;
-		SWITCH_BEST_TMP_BUF(pMeRefine->pQuarPixBest,pMeRefine->pQuarPixTmp);
-	}
-}
-
-void MeRefineFracPixel(sWelsEncCtx* pEncCtx, uint8_t* pMemPredInterMb, SWelsME* pMe, 
-						  SMeRefinePointer* pMeRefine, int32_t iWidth, int32_t iHeight)
-{
-	SWelsFuncPtrList *pFunc= pEncCtx->pFuncList;
-	int16_t iMvx = pMe->sMv.iMvX;
-	int16_t iMvy = pMe->sMv.iMvY;
-
-	int16_t iHalfMvx = iMvx;
-	int16_t iHalfMvy = iMvy;
-	const int32_t kiStrideEnc = pEncCtx->pCurDqLayer->iEncStride[0];
-	const int32_t kiStrideRef = pEncCtx->pCurDqLayer->pRefPic->iLineSize[0];
-    
-	uint8_t* pEncData = pMe->pEncMb;
-	uint8_t* pRef = pMe->pRefMb;//091010
-
-	int32_t iBestQuarPix = ME_NO_BEST_QUAR_PIXEL;
-
-	SQuarRefineParams sParams;
-	static int32_t iMvQuarAddX[10] = {0,0,-1,1,0,0,0,-1,1,0};
-	int32_t *pMvQuarAddY = iMvQuarAddX + 3;
-	uint8_t* pBestPredInter = pRef;
-	int32_t iInterBlk4Stride = ME_REFINE_BUF_STRIDE;
-
-	int32_t iBestCost;
-	int32_t iCurCost;
-	int32_t iBestHalfPix;
-
-	if ((pFunc->sSampleDealingFuncs.pfMeCost == pFunc->sSampleDealingFuncs.pfSampleSatd) && (pFunc->sSampleDealingFuncs.pfMdCost == pFunc->sSampleDealingFuncs.pfSampleSatd))
-	{
-		iBestCost = pMe->uSadPredISatd.uiSatd + COST_MVD(pMe->pMvdCost, iMvx - pMe->sMvp.iMvX, iMvy - pMe->sMvp.iMvY);
-	}
-	else
-	{
-		iBestCost = pFunc->sSampleDealingFuncs.pfMeCost[pMe->uiPixel]( pEncData, kiStrideEnc, pRef, kiStrideRef ) +
-			COST_MVD(pMe->pMvdCost, iMvx - pMe->sMvp.iMvX, iMvy - pMe->sMvp.iMvY);
-	}
-
-	iBestHalfPix = REFINE_ME_NO_BEST_HALF_PIXEL;
-
-	pFunc->sMcFuncs.pfLumaHalfpelVer( pRef-kiStrideRef, kiStrideRef, pMeRefine->pHalfPixV, ME_REFINE_BUF_STRIDE, iWidth, iHeight+1 );
-
-	//step 1: get [iWidth][iHeight+1] half pixel from vertical filter
-	//===========================(0, -2)==============================//
-	iCurCost = pFunc->sSampleDealingFuncs.pfMeCost[pMe->uiPixel](pEncData, kiStrideEnc, pMeRefine->pHalfPixV, ME_REFINE_BUF_STRIDE) +
-		COST_MVD( pMe->pMvdCost, iMvx - pMe->sMvp.iMvX, iMvy - 2 - pMe->sMvp.iMvY );
-	if(iCurCost < iBestCost)
-	{
-		iBestCost = iCurCost;
-		iBestHalfPix = REFINE_ME_HALF_PIXEL_TOP;
-       	pBestPredInter = pMeRefine->pHalfPixV;
-	}
-	//===========================(0, 2)==============================//
-	iCurCost = pFunc->sSampleDealingFuncs.pfMeCost[pMe->uiPixel](pEncData, kiStrideEnc, pMeRefine->pHalfPixV+ME_REFINE_BUF_STRIDE, ME_REFINE_BUF_STRIDE) +
-		COST_MVD( pMe->pMvdCost, iMvx - pMe->sMvp.iMvX, iMvy + 2 - pMe->sMvp.iMvY );
-	if(iCurCost < iBestCost)
-	{
-		iBestCost = iCurCost;
-		iBestHalfPix = REFINE_ME_HALF_PIXEL_BOTTOM;
-       	pBestPredInter = pMeRefine->pHalfPixV+ME_REFINE_BUF_STRIDE;
-	}
-	pFunc->sMcFuncs.pfLumaHalfpelHor( pRef-1, kiStrideRef, pMeRefine->pHalfPixH, ME_REFINE_BUF_STRIDE, iWidth+1, iHeight );
-	//step 2: get [iWidth][iHeight+1] half pixel from horizon filter
-	
-	//===========================(-2, 0)==============================//
-	iCurCost = pFunc->sSampleDealingFuncs.pfMeCost[pMe->uiPixel](pEncData, kiStrideEnc, pMeRefine->pHalfPixH, ME_REFINE_BUF_STRIDE) +
-		COST_MVD( pMe->pMvdCost, iMvx - 2 - pMe->sMvp.iMvX, iMvy - pMe->sMvp.iMvY );
-	if(iCurCost < iBestCost)
-	{
-		iBestCost = iCurCost;
-		iBestHalfPix = REFINE_ME_HALF_PIXEL_LEFT;
-       	pBestPredInter = pMeRefine->pHalfPixH;
-	}
-	//===========================(2, 0)===============================//
-	iCurCost = pFunc->sSampleDealingFuncs.pfMeCost[pMe->uiPixel](pEncData, kiStrideEnc, pMeRefine->pHalfPixH+1, ME_REFINE_BUF_STRIDE) +
-		COST_MVD( pMe->pMvdCost, iMvx + 2 - pMe->sMvp.iMvX, iMvy - pMe->sMvp.iMvY );
-	if(iCurCost < iBestCost)
-	{
-		iBestCost = iCurCost;
-		iBestHalfPix = REFINE_ME_HALF_PIXEL_RIGHT;
-       	pBestPredInter = pMeRefine->pHalfPixH+1;
-	}
-
-	sParams.iBestCost = iBestCost;
-	sParams.iBestHalfPix = iBestHalfPix;
-	sParams.pRef = pRef;
-	sParams.iBestQuarPix = ME_NO_BEST_QUAR_PIXEL;
-
-	//step 5: if no best half-pixel prediction, try quarter pixel prediction
-	//        if yes, must get [X+1][X+1] half-pixel from (2, 2) horizontal and vertical filter
-	if (REFINE_ME_NO_BEST_HALF_PIXEL == iBestHalfPix)
-	{
-		sParams.iStrideA = kiStrideRef;
-		sParams.iStrideB = kiStrideRef;
-		sParams.pSrcA[0] = pMeRefine->pHalfPixV;
-		sParams.pSrcA[1] = pMeRefine->pHalfPixV+ME_REFINE_BUF_STRIDE;
-		sParams.pSrcA[2] = pMeRefine->pHalfPixH;
-		sParams.pSrcA[3] = pMeRefine->pHalfPixH+1;
-
-		sParams.pSrcB[0] = sParams.pSrcB[1] = sParams.pSrcB[2] = sParams.pSrcB[3] = pRef;
-
-		sParams.iLms[0] = COST_MVD( pMe->pMvdCost, iHalfMvx - pMe->sMvp.iMvX, iHalfMvy - 1 - pMe->sMvp.iMvY ); 
-		sParams.iLms[1] = COST_MVD( pMe->pMvdCost, iHalfMvx - pMe->sMvp.iMvX, iHalfMvy + 1 - pMe->sMvp.iMvY );
-		sParams.iLms[2] = COST_MVD( pMe->pMvdCost, iHalfMvx - 1 - pMe->sMvp.iMvX, iHalfMvy - pMe->sMvp.iMvY );
-		sParams.iLms[3] = COST_MVD( pMe->pMvdCost, iHalfMvx + 1 - pMe->sMvp.iMvX, iHalfMvy - pMe->sMvp.iMvY );
-	}	
-	else //must get [X+1][X+1] half-pixel from (2, 2) horizontal and vertical filter
-	{
-		switch(iBestHalfPix)
-		{
-		case REFINE_ME_HALF_PIXEL_LEFT:
-			{
-                pMeRefine->pHalfPixHV = pMeRefine->pHalfPixV;//reuse pBuffer, here only h&hv
-				pFunc->sMcFuncs.pfLumaHalfpelCen( pRef-1-kiStrideRef, kiStrideRef, pMeRefine->pHalfPixHV,ME_REFINE_BUF_STRIDE,iWidth+1, iHeight+1 );
-				
-				iHalfMvx -= 2;
-				sParams.iStrideA = ME_REFINE_BUF_STRIDE;
-				sParams.iStrideB = kiStrideRef;
-				sParams.pSrcA[0] = pMeRefine->pHalfPixH;
-				sParams.pSrcA[3] = sParams.pSrcA[2] = sParams.pSrcA[1] = sParams.pSrcA[0];
-				sParams.pSrcB[0] = pMeRefine->pHalfPixHV;
-				sParams.pSrcB[1] = pMeRefine->pHalfPixHV+ME_REFINE_BUF_STRIDE;
-				sParams.pSrcB[2] = pRef - 1;
-				sParams.pSrcB[3] = pRef;
-
-			}break;
-		case REFINE_ME_HALF_PIXEL_RIGHT:
-			{
-                pMeRefine->pHalfPixHV = pMeRefine->pHalfPixV;//reuse pBuffer, here only h&hv
-				pFunc->sMcFuncs.pfLumaHalfpelCen( pRef-1-kiStrideRef, kiStrideRef, pMeRefine->pHalfPixHV,ME_REFINE_BUF_STRIDE,iWidth+1, iHeight+1 );
- 				iHalfMvx += 2;
-				sParams.iStrideA = ME_REFINE_BUF_STRIDE;
-				sParams.iStrideB = kiStrideRef;
-				sParams.pSrcA[0] = pMeRefine->pHalfPixH+1;
-				sParams.pSrcA[3] = sParams.pSrcA[2] = sParams.pSrcA[1] = sParams.pSrcA[0];
-				sParams.pSrcB[0] = pMeRefine->pHalfPixHV+1;
-				sParams.pSrcB[1] = pMeRefine->pHalfPixHV+1+ ME_REFINE_BUF_STRIDE;
-				sParams.pSrcB[2] = pRef;
-				sParams.pSrcB[3] = pRef + 1;
-			}break;
-		case REFINE_ME_HALF_PIXEL_TOP:
-			{
-                pMeRefine->pHalfPixHV = pMeRefine->pHalfPixH;//reuse pBuffer, here only v&hv
-				pFunc->sMcFuncs.pfLumaHalfpelCen( pRef-1-kiStrideRef, kiStrideRef, pMeRefine->pHalfPixHV,ME_REFINE_BUF_STRIDE,iWidth+1, iHeight+1 );
-		
-               	iHalfMvy -= 2;
-				sParams.iStrideA = kiStrideRef;
-				sParams.iStrideB = ME_REFINE_BUF_STRIDE;
-				sParams.pSrcA[0] = pMeRefine->pHalfPixV;				
-				sParams.pSrcA[3] = sParams.pSrcA[2] = sParams.pSrcA[1] = sParams.pSrcA[0];
-				sParams.pSrcB[0] = pRef - kiStrideRef;
-				sParams.pSrcB[1] = pRef;
-				sParams.pSrcB[2] = pMeRefine->pHalfPixHV;
-				sParams.pSrcB[3] = pMeRefine->pHalfPixHV+1;		
-			}break;
-		case REFINE_ME_HALF_PIXEL_BOTTOM:
-			{
-                pMeRefine->pHalfPixHV = pMeRefine->pHalfPixH;//reuse pBuffer, here only v&hv
-				pFunc->sMcFuncs.pfLumaHalfpelCen( pRef-1-kiStrideRef, kiStrideRef, pMeRefine->pHalfPixHV,ME_REFINE_BUF_STRIDE,iWidth+1, iHeight+1 );
-			    iHalfMvy += 2;
-				sParams.iStrideA = kiStrideRef;
-				sParams.iStrideB = ME_REFINE_BUF_STRIDE;
-				sParams.pSrcA[0] = pMeRefine->pHalfPixV + ME_REFINE_BUF_STRIDE;
-				sParams.pSrcA[3] = sParams.pSrcA[2] = sParams.pSrcA[1] = sParams.pSrcA[0];
-				sParams.pSrcB[0] = pRef;
-				sParams.pSrcB[1] = pRef + kiStrideRef;
-				sParams.pSrcB[2] = pMeRefine->pHalfPixHV + ME_REFINE_BUF_STRIDE;
-				sParams.pSrcB[3] = pMeRefine->pHalfPixHV + ME_REFINE_BUF_STRIDE + 1;	
-			}break;
-		default:
-			break;
-		}
-		sParams.iLms[0] = COST_MVD( pMe->pMvdCost, iHalfMvx - pMe->sMvp.iMvX, iHalfMvy - 1 - pMe->sMvp.iMvY );
-		sParams.iLms[1] = COST_MVD( pMe->pMvdCost, iHalfMvx - pMe->sMvp.iMvX, iHalfMvy + 1 - pMe->sMvp.iMvY );
-		sParams.iLms[2] = COST_MVD( pMe->pMvdCost, iHalfMvx - 1 - pMe->sMvp.iMvX, iHalfMvy - pMe->sMvp.iMvY );
-		sParams.iLms[3] = COST_MVD( pMe->pMvdCost, iHalfMvx + 1 - pMe->sMvp.iMvX, iHalfMvy - pMe->sMvp.iMvY );
-	}
-	MeRefineQuarPixel(pFunc, pMe, pMeRefine, iWidth, iHeight, &sParams, kiStrideEnc);
-	
-	if(iBestCost > sParams.iBestCost)
-	{
-		pBestPredInter = pMeRefine->pQuarPixBest;
-		iBestCost = sParams.iBestCost;
-	}
-	iBestQuarPix = sParams.iBestQuarPix;
-
-	//update final best MV
-	pMe->sMv.iMvX = iHalfMvx + iMvQuarAddX[iBestQuarPix];
-	pMe->sMv.iMvY = iHalfMvy + pMvQuarAddY[iBestQuarPix];
-	pMe->uiSatdCost = iBestCost;
-
-	//No half or quarter pixel best, so do MC with integer pixel MV
-	if ( iBestHalfPix+iBestQuarPix == NO_BEST_FRAC_PIX )
-	{
-		pBestPredInter = pRef;
-		iInterBlk4Stride = kiStrideRef;
-	}	
-	if ( MB_WIDTH_LUMA == iWidth && MB_HEIGHT_LUMA == iHeight ) //P16x16
-	{
-		pFunc->pfCopy16x16NotAligned( pMemPredInterMb, MB_WIDTH_LUMA, pBestPredInter, iInterBlk4Stride );	// dst can be align with 16 bytes, but not sure at pSrc, 12/29/2011
-	}
-	else if ( MB_WIDTH_LUMA == iWidth && MB_HEIGHT_CHROMA == iHeight ) //P16x8
-	{
-		pFunc->pfCopy16x8NotAligned( pMemPredInterMb, MB_WIDTH_LUMA, pBestPredInter, iInterBlk4Stride );	// dst can be align with 16 bytes, but not sure at pSrc, 12/29/2011
-	}
-	else if ( MB_WIDTH_CHROMA == iWidth && MB_HEIGHT_LUMA == iHeight ) //P8x16
-	{
-		pFunc->pfCopy8x16Aligned( pMemPredInterMb, MB_WIDTH_LUMA, pBestPredInter, iInterBlk4Stride );		
-	}
-	else //P8x8
-	{
-		pFunc->pfCopy8x8Aligned( pMemPredInterMb, MB_WIDTH_LUMA, pBestPredInter, iInterBlk4Stride );
-	}	
-}
-
-void InitBlkStrideWithRef(int32_t* pBlkStride, const int32_t kiStrideRef)
-{
-	static const uint8_t kuiStrideX[16] =
-	{
-		0, 4 , 0, 4 ,
-		8, 12, 8, 12,
-		0, 4 , 0, 4 ,
-		8, 12, 8, 12
-	};
-	static const uint8_t kuiStrideY[16] =
-	{
-		0, 0, 4 , 4 ,
-		0, 0, 4 , 4 ,
-		8, 8, 12, 12,
-		8, 8, 12, 12
-	};
-	int32_t i;
-
-	for (i = 0; i < 16; i+=4)
-	{
-		pBlkStride[i  ] = kuiStrideX[i  ] + kuiStrideY[i  ] * kiStrideRef; 
-		pBlkStride[i+1] = kuiStrideX[i+1] + kuiStrideY[i+1] * kiStrideRef; 
-		pBlkStride[i+2] = kuiStrideX[i+2] + kuiStrideY[i+2] * kiStrideRef; 
-		pBlkStride[i+3] = kuiStrideX[i+3] + kuiStrideY[i+3] * kiStrideRef; 
-	}
-}
-
-/*
- * iMvdSz = (648*2+1) or (972*2+1);
- */
-void MvdCostInit( uint16_t* pMvdCostInter, const int32_t kiMvdSz )
-{	
-	const int32_t kiSz		= kiMvdSz >> 1;
-	uint16_t *pNegMvd		= pMvdCostInter;
-	uint16_t *pPosMvd		= pMvdCostInter+kiSz+1;
-	const int32_t *kpQpLambda= &g_kiQpCostTable[0];
-	int32_t i,j;
-	
-	for( i = 0; i < 52; ++ i )
-	{
-		const uint16_t kiLambda = kpQpLambda[i];		
-		int32_t iNegSe = -kiSz;
-		int32_t iPosSe = 1;
-
-		for (j = 0; j < kiSz; j += 4)
-		{
-			*pNegMvd++	= kiLambda * BsSizeSE(iNegSe++);
-			*pNegMvd++	= kiLambda * BsSizeSE(iNegSe++);
-			*pNegMvd++	= kiLambda * BsSizeSE(iNegSe++);
-			*pNegMvd++	= kiLambda * BsSizeSE(iNegSe++);
-			
-			*pPosMvd++	= kiLambda * BsSizeSE(iPosSe++);
-			*pPosMvd++	= kiLambda * BsSizeSE(iPosSe++);
-			*pPosMvd++	= kiLambda * BsSizeSE(iPosSe++);
-			*pPosMvd++	= kiLambda * BsSizeSE(iPosSe++);			
-		}
-		*pNegMvd = kiLambda;
-		pNegMvd += kiSz+1;
-		pPosMvd += kiSz+1;
-	}
-}
-
-void PredictSad( int8_t* pRefIndexCache, int32_t* pSadCostCache, int32_t uiRef, int32_t * pSadPred )
-{    
-    const int32_t kiRefB	= pRefIndexCache[1];//top g_uiCache12_8x8RefIdx[0] - 4
-    int32_t iRefC			= pRefIndexCache[5];//top-right g_uiCache12_8x8RefIdx[0] - 2    
-	const int32_t kiRefA	= pRefIndexCache[6];//left g_uiCache12_8x8RefIdx[0] - 1
-    const int32_t kiSadB		= pSadCostCache[1];
-    int32_t iSadC			= pSadCostCache[2];
-	const int32_t kiSadA		= pSadCostCache[3];
-
-    int32_t iCount;
-
-    if( iRefC == REF_NOT_AVAIL )
-    {
-		iRefC = pRefIndexCache[0];//top-left g_uiCache12_8x8RefIdx[0] - 4 - 1
-        iSadC  = pSadCostCache[0];
-    }
-
-    if( kiRefB == REF_NOT_AVAIL && iRefC == REF_NOT_AVAIL && kiRefA != REF_NOT_AVAIL )
-    {
-        * pSadPred = kiSadA;
-    }
-	else
-	{
-		iCount  = (uiRef == kiRefA)<<MB_LEFT_BIT;
-		iCount |= (uiRef == kiRefB)<<MB_TOP_BIT;
-		iCount |= (uiRef == iRefC)<<MB_TOPRIGHT_BIT;
-		switch(iCount) 
-		{
-			case LEFT_MB_POS:// A
-				*pSadPred = kiSadA;
-				break;
-			case TOP_MB_POS:// B
-				*pSadPred = kiSadB;
-				break;
-			case TOPRIGHT_MB_POS:// C or D
-				*pSadPred = iSadC;
-				break;
-			default:
-				*pSadPred = WELS_MEDIAN( kiSadA, kiSadB, iSadC );
-				break;
-		}
-	}
-
-#define REPLACE_SAD_MULTIPLY(x)   ((x) - (x>>3) + (x >>5))    // it's 0.90625, very close with 0.9
-	iCount = (*pSadPred)<<6;    // here *64 will not overflow. SAD range 0~ 255*256(max 2^16), int32_t is enough
-	*pSadPred = (REPLACE_SAD_MULTIPLY(iCount) + 32)>>6;
-#undef REPLACE_SAD_MULTIPLY
-}
-
-
-void PredictSadSkip( int8_t* pRefIndexCache, bool_t* pMbSkipCache, int32_t* pSadCostCache, int32_t uiRef, int32_t * iSadPredSkip )
-{    
-    const int32_t kiRefB	= pRefIndexCache[1];//top g_uiCache12_8x8RefIdx[0] - 4
-    int32_t iRefC			= pRefIndexCache[5];//top-right g_uiCache12_8x8RefIdx[0] - 2
-	const int32_t kiRefA	= pRefIndexCache[6];//left g_uiCache12_8x8RefIdx[0] - 1    
-    const int32_t kiSadB		= (pMbSkipCache[1]==1 ? pSadCostCache[1] : 0);
-    int32_t iSadC			= (pMbSkipCache[2]==1 ? pSadCostCache[2] : 0);
-	const int32_t kiSadA		= (pMbSkipCache[3]==1 ? pSadCostCache[3] : 0);
-	int32_t iRefSkip		= pMbSkipCache[2];
-
-    int32_t iCount = 0;
-
-    if( iRefC == REF_NOT_AVAIL )
-    {
-		iRefC = pRefIndexCache[0];//top-left g_uiCache12_8x8RefIdx[0] - 4 - 1
-        iSadC  = (pMbSkipCache[0]==1 ? pSadCostCache[0] : 0);
-		iRefSkip = pMbSkipCache[0];
-    }
-
-    if( kiRefB == REF_NOT_AVAIL && iRefC == REF_NOT_AVAIL && kiRefA != REF_NOT_AVAIL )
-    {
-        * iSadPredSkip = kiSadA;
-    }
-	else
-	{
-		iCount  = ((uiRef == kiRefA) && (pMbSkipCache[3]==1))<<MB_LEFT_BIT;
-		iCount |= ((uiRef == kiRefB) && (pMbSkipCache[1]==1))<<MB_TOP_BIT;
-		iCount |= ((uiRef == iRefC) && (iRefSkip==1))<<MB_TOPRIGHT_BIT;
-		switch(iCount) 
-		{
-			case LEFT_MB_POS:// A
-				*iSadPredSkip = kiSadA;
-				break;
-			case TOP_MB_POS:// B
-				*iSadPredSkip = kiSadB;
-				break;
-			case TOPRIGHT_MB_POS:// C or D
-				*iSadPredSkip = iSadC;
-				break;
-			default:
-				*iSadPredSkip = WELS_MEDIAN( kiSadA, kiSadB, iSadC );
-				break;
-		}
-	}
-}
-}
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	md.c
+ *
+ * \brief	mode decision
+ *
+ * \date	2009.05.14 Created
+ *
+ *************************************************************************************
+ */
+
+#include <string.h>
+#include "ls_defines.h"
+#include "encoder_context.h"
+#include "svc_enc_slice_segment.h"
+#include "md.h"
+#include "mc.h"
+#include "mv_pred.h"
+#include "cpu_core.h"
+#include "svc_enc_golomb.h"
+#include "sample.h"
+#include "array_stack_align.h"
+
+namespace WelsSVCEnc {
+#define INTRA_VARIANCE_SAD_THRESHOLD 150
+#define INTER_VARIANCE_SAD_THRESHOLD 20
+
+// Fill the intra neighbor cache in pMbCache for pCurMb: non-zero coefficient counts and intra 4x4 prediction modes of the left/top neighbors, plus the uiNeighborIntra availability mask; iMbWidth is the SMB-pointer distance from an MB to the one above it.
+void FillNeighborCacheIntra (SMbCache* pMbCache, SMB* pCurMb, int32_t iMbWidth) {
+  uint32_t uiNeighborAvail = pCurMb->uiNeighborAvail;
+  uint32_t uiNeighborIntra = 0; // built below from availability bits only; neighbor MB types are not tested for this mask
+
+  if (uiNeighborAvail & LEFT_MB_POS) { // left MB is available
+    int8_t* pLeftMbNonZeroCount = pCurMb->pNonZeroCount - MB_LUMA_CHROMA_BLOCK4x4_NUM; // presumably per-MB count arrays are stored back to back, so this is the left MB's array -- TODO confirm
+    pMbCache->iNonZeroCoeffCount[8] = pLeftMbNonZeroCount[ 3]; // entries 3/7/11/15: presumably the left MB's right-most luma 4x4 column -- confirm block ordering
+    pMbCache->iNonZeroCoeffCount[16] = pLeftMbNonZeroCount[ 7];
+    pMbCache->iNonZeroCoeffCount[24] = pLeftMbNonZeroCount[11];
+    pMbCache->iNonZeroCoeffCount[32] = pLeftMbNonZeroCount[15];
+
+    pMbCache->iNonZeroCoeffCount[ 13] = pLeftMbNonZeroCount[17]; // entries 17/21/19/23: presumably the chroma blocks bordering the current MB -- confirm
+    pMbCache->iNonZeroCoeffCount[21] = pLeftMbNonZeroCount[21];
+    pMbCache->iNonZeroCoeffCount[37] = pLeftMbNonZeroCount[19];
+    pMbCache->iNonZeroCoeffCount[45] = pLeftMbNonZeroCount[23];
+
+    uiNeighborIntra |= LEFT_MB_POS;
+
+    if (IS_INTRA4x4 ((pCurMb - 1)->uiMbType)) { // left MB carries real 4x4 modes: copy them
+      int8_t* pLeftMbIntra4x4PredMode = pCurMb->pIntra4x4PredMode - INTRA_4x4_MODE_NUM; // same "previous MB's array" addressing as for the counts above -- TODO confirm layout
+      pMbCache->iIntraPredMode[8] = pLeftMbIntra4x4PredMode[4];
+      pMbCache->iIntraPredMode[16] = pLeftMbIntra4x4PredMode[5];
+      pMbCache->iIntraPredMode[24] = pLeftMbIntra4x4PredMode[6];
+      pMbCache->iIntraPredMode[32] = pLeftMbIntra4x4PredMode[3];
+    } else { // left MB is not intra 4x4; original note: if ( 0 == constrained_intra_pred_flag || IS_INTRA16x16((pCurMb-1)->uiMbType ))
+      pMbCache->iIntraPredMode[8] =
+        pMbCache->iIntraPredMode[16] =
+          pMbCache->iIntraPredMode[24] =
+            pMbCache->iIntraPredMode[32] = 2; // mode value 2 is DC
+    }
+  } else {
+    pMbCache->iNonZeroCoeffCount[ 8] =
+      pMbCache->iNonZeroCoeffCount[16] =
+        pMbCache->iNonZeroCoeffCount[24] =
+          pMbCache->iNonZeroCoeffCount[32] = -1; // -1 marks the left luma counts as unavailable
+    pMbCache->iNonZeroCoeffCount[13] =
+      pMbCache->iNonZeroCoeffCount[21] =
+        pMbCache->iNonZeroCoeffCount[37] =
+          pMbCache->iNonZeroCoeffCount[45] = -1; // same marker for the second group of left counts
+
+    pMbCache->iIntraPredMode[8] =
+      pMbCache->iIntraPredMode[16] =
+        pMbCache->iIntraPredMode[24] =
+          pMbCache->iIntraPredMode[32] = -1; // left prediction modes unavailable
+  }
+
+  if (uiNeighborAvail & TOP_MB_POS) { // top MB is available
+    SMB* pTopMb = pCurMb - iMbWidth;
+    ST32 (&pMbCache->iNonZeroCoeffCount[1], LD32 (&pTopMb->pNonZeroCount[12])); // ST32/LD32 presumably move 4 bytes at once: top counts 12..15 -> cache 1..4
+
+    ST16 (&pMbCache->iNonZeroCoeffCount[6], LD16 (&pTopMb->pNonZeroCount[20])); // presumably 2 bytes: top counts 20..21 -> cache 6..7
+    ST16 (&pMbCache->iNonZeroCoeffCount[30], LD16 (&pTopMb->pNonZeroCount[22])); // presumably 2 bytes: top counts 22..23 -> cache 30..31
+
+    uiNeighborIntra |= TOP_MB_POS;
+
+    if (IS_INTRA4x4 (pTopMb->uiMbType)) {
+      ST32 (pMbCache->iIntraPredMode + 1, LD32 (&pTopMb->pIntra4x4PredMode[0])); // copy the top MB's first four stored modes into cache 1..4
+    } else { // top MB is not intra 4x4; original note: if ( 0 == constrained_intra_pred_flag || IS_INTRA16x16( pTopMb->uiMbType ))
+      const uint32_t kuiDc32 = 0x02020202; // the DC mode value (2) replicated into four bytes
+      ST32 (pMbCache->iIntraPredMode + 1 , kuiDc32);
+    }
+  } else {
+    const uint32_t kuiUnavail32 = 0xffffffff; // every byte 0xff; presumably read back as -1 per int8_t entry, matching the left-side marker
+    ST32 (pMbCache->iIntraPredMode + 1 , kuiUnavail32);
+    ST32 (&pMbCache->iNonZeroCoeffCount[1], kuiUnavail32);
+
+    ST16 (&pMbCache->iNonZeroCoeffCount[6], 0xffff); // same unavailable pattern for the two 2-byte groups
+    ST16 (&pMbCache->iNonZeroCoeffCount[30], 0xffff);
+  }
+
+  if (uiNeighborAvail & TOPLEFT_MB_POS) {
+    uiNeighborIntra |= 0x04; // top-left is recorded with the literal bit 0x04; no cache entries are copied for it
+  }
+
+
+  if (uiNeighborAvail & TOPRIGHT_MB_POS) {
+    uiNeighborIntra |= 0x08; // top-right is recorded with the literal bit 0x08; no cache entries are copied for it
+  }
+  pMbCache->uiNeighborIntra = uiNeighborIntra; // publish the collected mask
+}
+// Fill the inter neighbor cache in pMbCache for pCurMb: motion vectors, reference indices, SAD cost and skip info of the left, top, top-left and top-right MBs; pVaaBgMbFlag is not referenced in this variant.
+void FillNeighborCacheInterWithoutBGD (SMbCache* pMbCache, SMB* pCurMb, int32_t iMbWidth, int8_t* pVaaBgMbFlag) {
+  uint32_t uiNeighborAvail = pCurMb->uiNeighborAvail;
+  SMB* pLeftMb = pCurMb - 1 ;
+  SMB* pTopMb = pCurMb - iMbWidth; // iMbWidth is the SMB-pointer distance to the MB above
+  SMB* pLeftTopMb = pCurMb - iMbWidth - 1 ;
+  SMB* iRightTopMb = pCurMb - iMbWidth + 1 ;
+  SMVComponentUnit* pMvComp = &pMbCache->sMvComponents;
+  if ((uiNeighborAvail & LEFT_MB_POS) && IS_SVC_INTER (pLeftMb->uiMbType)) { // left MB available and inter: copy its data
+    pMvComp->sMotionVectorCache[ 6] = pLeftMb->sMv[ 3]; // left MVs 3/7/11/15 -> cache 6/12/18/24 (the cache appears to use 6 entries per row -- confirm)
+    pMvComp->sMotionVectorCache[12] = pLeftMb->sMv[ 7];
+    pMvComp->sMotionVectorCache[18] = pLeftMb->sMv[11];
+    pMvComp->sMotionVectorCache[24] = pLeftMb->sMv[15];
+    pMvComp->iRefIndexCache[ 6] = pLeftMb->pRefIndex[1]; // each of pRefIndex[1] and pRefIndex[3] is written to two cache rows
+    pMvComp->iRefIndexCache[12] = pLeftMb->pRefIndex[1];
+    pMvComp->iRefIndexCache[18] = pLeftMb->pRefIndex[3];
+    pMvComp->iRefIndexCache[24] = pLeftMb->pRefIndex[3];
+    pMbCache->iSadCost[3] = pLeftMb->pSadCost[0]; // slot 3 of the SAD/skip arrays holds the left neighbor
+
+    if (pLeftMb->uiMbType == MB_TYPE_SKIP) {
+      pMbCache->bMbTypeSkip[3] = 1;
+      pMbCache->iSadCostSkip[3] = pMbCache->pEncSad[-1]; // presumably pEncSad points at the current MB's entry, so -1 is the left MB -- TODO confirm
+    } else {
+      pMbCache->bMbTypeSkip[3] = 0;
+      pMbCache->iSadCostSkip[3] = 0;
+    }
+  } else { // left MB unavailable, or available but not inter
+    ST32 (&pMvComp->sMotionVectorCache[ 6], 0); // ST32 presumably writes 32 bits, zeroing one cached MV per call
+    ST32 (&pMvComp->sMotionVectorCache[12], 0);
+    ST32 (&pMvComp->sMotionVectorCache[18], 0);
+    ST32 (&pMvComp->sMotionVectorCache[24], 0);
+    pMvComp->iRefIndexCache[ 6] =
+      pMvComp->iRefIndexCache[12] =
+        pMvComp->iRefIndexCache[18] =
+          pMvComp->iRefIndexCache[24] = (uiNeighborAvail & LEFT_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL; // available but non-inter vs. not available at all
+    pMbCache->iSadCost[3] = 0;
+    pMbCache->bMbTypeSkip[3] = 0;
+    pMbCache->iSadCostSkip[3] = 0;
+  }
+
+  if ((uiNeighborAvail & TOP_MB_POS) && IS_SVC_INTER (pTopMb->uiMbType)) { // top MB available and inter
+    ST64 (&pMvComp->sMotionVectorCache[1], LD64 (&pTopMb->sMv[12])); // presumably 64 bits = two MVs per call: top MVs 12..13 -> cache 1..2
+    ST64 (&pMvComp->sMotionVectorCache[3], LD64 (&pTopMb->sMv[14])); // top MVs 14..15 -> cache 3..4 (same assumption)
+    pMvComp->iRefIndexCache[1] = pTopMb->pRefIndex[2]; // each of pRefIndex[2] and pRefIndex[3] is written to two cache columns
+    pMvComp->iRefIndexCache[2] = pTopMb->pRefIndex[2];
+    pMvComp->iRefIndexCache[3] = pTopMb->pRefIndex[3];
+    pMvComp->iRefIndexCache[4] = pTopMb->pRefIndex[3];
+    pMbCache->iSadCost[1] = pTopMb->pSadCost[0]; // slot 1 holds the top neighbor
+
+    if (pTopMb->uiMbType == MB_TYPE_SKIP) {
+      pMbCache->bMbTypeSkip[1] = 1;
+      pMbCache->iSadCostSkip[1] = pMbCache->pEncSad[-iMbWidth];
+    } else {
+      pMbCache->bMbTypeSkip[1] = 0;
+      pMbCache->iSadCostSkip[1] = 0;
+    }
+  } else { // top MB unavailable, or available but not inter
+    ST64 (&pMvComp->sMotionVectorCache[1], 0);
+    ST64 (&pMvComp->sMotionVectorCache[3], 0);
+    pMvComp->iRefIndexCache[1] =
+      pMvComp->iRefIndexCache[2] =
+        pMvComp->iRefIndexCache[3] =
+          pMvComp->iRefIndexCache[4] = (uiNeighborAvail & TOP_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;
+    pMbCache->iSadCost[1] = 0;
+
+    pMbCache->bMbTypeSkip[1] = 0;
+    pMbCache->iSadCostSkip[1] = 0;
+  }
+
+  if ((uiNeighborAvail & TOPLEFT_MB_POS) && IS_SVC_INTER (pLeftTopMb->uiMbType)) { // top-left MB available and inter
+    pMvComp->sMotionVectorCache[0] = pLeftTopMb->sMv[15]; // only MV 15 of the top-left MB is cached
+    pMvComp->iRefIndexCache[0] = pLeftTopMb->pRefIndex[3];
+    pMbCache->iSadCost[0] = pLeftTopMb->pSadCost[0]; // slot 0 holds the top-left neighbor
+
+    if (pLeftTopMb->uiMbType == MB_TYPE_SKIP) {
+      pMbCache->bMbTypeSkip[0] = 1;
+      pMbCache->iSadCostSkip[0] = pMbCache->pEncSad[-iMbWidth - 1];
+    } else {
+      pMbCache->bMbTypeSkip[0] = 0;
+      pMbCache->iSadCostSkip[0] = 0;
+    }
+  } else { // top-left MB unavailable, or available but not inter
+    ST32 (&pMvComp->sMotionVectorCache[0], 0);
+    pMvComp->iRefIndexCache[0] = (uiNeighborAvail & TOPLEFT_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;
+    pMbCache->iSadCost[0] = 0;
+    pMbCache->bMbTypeSkip[0] = 0;
+    pMbCache->iSadCostSkip[0] = 0;
+  }
+
+  if ((uiNeighborAvail & TOPRIGHT_MB_POS) && IS_SVC_INTER (iRightTopMb->uiMbType)) { // top-right MB available and inter
+    pMvComp->sMotionVectorCache[5] = iRightTopMb->sMv[12]; // only MV 12 of the top-right MB is cached
+    pMvComp->iRefIndexCache[5] = iRightTopMb->pRefIndex[2];
+    pMbCache->iSadCost[2] = iRightTopMb->pSadCost[0]; // slot 2 holds the top-right neighbor
+
+    if (iRightTopMb->uiMbType == MB_TYPE_SKIP) {
+      pMbCache->bMbTypeSkip[2] = 1;
+      pMbCache->iSadCostSkip[2] = pMbCache->pEncSad[-iMbWidth + 1];
+    } else {
+      pMbCache->bMbTypeSkip[2] = 0;
+      pMbCache->iSadCostSkip[2] = 0;
+    }
+  } else { // top-right MB unavailable, or available but not inter
+    ST32 (&pMvComp->sMotionVectorCache[5], 0);
+    pMvComp->iRefIndexCache[5] = (uiNeighborAvail & TOPRIGHT_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;
+    pMbCache->iSadCost[2] = 0;
+    pMbCache->bMbTypeSkip[2] = 0;
+    pMbCache->iSadCostSkip[2] = 0;
+  }
+
+  // cache entries 9, 11, 17, 21 and 23 are always cleared and marked REF_NOT_AVAIL (original note: right-top 4*4 pBlock unavailable)
+  ST32 (&pMvComp->sMotionVectorCache[ 9], 0);
+  ST32 (&pMvComp->sMotionVectorCache[21], 0);
+  ST32 (&pMvComp->sMotionVectorCache[11], 0);
+  ST32 (&pMvComp->sMotionVectorCache[17], 0);
+  ST32 (&pMvComp->sMotionVectorCache[23], 0);
+  pMvComp->iRefIndexCache[ 9] =
+    pMvComp->iRefIndexCache[11] =
+      pMvComp->iRefIndexCache[17] =
+        pMvComp->iRefIndexCache[21] =
+          pMvComp->iRefIndexCache[23] = REF_NOT_AVAIL;
+}
+
+void FillNeighborCacheInterWithBGD (SMbCache* pMbCache, SMB* pCurMb, int32_t iMbWidth, int8_t* pVaaBgMbFlag) {
+  uint32_t uiNeighborAvail = pCurMb->uiNeighborAvail;
+  SMB* pLeftMb = pCurMb - 1 ;
+  SMB* pTopMb = pCurMb - iMbWidth;
+  SMB* pLeftTopMb = pCurMb - iMbWidth - 1 ;
+  SMB* iRightTopMb = pCurMb - iMbWidth + 1 ;
+  SMVComponentUnit* pMvComp = &pMbCache->sMvComponents;
+
+  if ((uiNeighborAvail & LEFT_MB_POS) && IS_SVC_INTER (pLeftMb->uiMbType)) {
+    pMvComp->sMotionVectorCache[ 6] = pLeftMb->sMv[ 3];
+    pMvComp->sMotionVectorCache[12] = pLeftMb->sMv[ 7];
+    pMvComp->sMotionVectorCache[18] = pLeftMb->sMv[11];
+    pMvComp->sMotionVectorCache[24] = pLeftMb->sMv[15];
+    pMvComp->iRefIndexCache[ 6] = pLeftMb->pRefIndex[1];
+    pMvComp->iRefIndexCache[12] = pLeftMb->pRefIndex[1];
+    pMvComp->iRefIndexCache[18] = pLeftMb->pRefIndex[3];
+    pMvComp->iRefIndexCache[24] = pLeftMb->pRefIndex[3];
+    pMbCache->iSadCost[3] = pLeftMb->pSadCost[0];
+
+    if (pLeftMb->uiMbType == MB_TYPE_SKIP && pVaaBgMbFlag[-1] == 0) {
+      pMbCache->bMbTypeSkip[3] = 1;
+      pMbCache->iSadCostSkip[3] = pMbCache->pEncSad[-1];
+    } else {
+      pMbCache->bMbTypeSkip[3] = 0;
+      pMbCache->iSadCostSkip[3] = 0;
+    }
+  } else { //avail or non-inter
+    ST32 (&pMvComp->sMotionVectorCache[ 6], 0);
+    ST32 (&pMvComp->sMotionVectorCache[12], 0);
+    ST32 (&pMvComp->sMotionVectorCache[18], 0);
+    ST32 (&pMvComp->sMotionVectorCache[24], 0);
+    pMvComp->iRefIndexCache[ 6] =
+      pMvComp->iRefIndexCache[12] =
+        pMvComp->iRefIndexCache[18] =
+          pMvComp->iRefIndexCache[24] = (uiNeighborAvail & LEFT_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;
+    pMbCache->iSadCost[3] = 0;
+    pMbCache->bMbTypeSkip[3] = 0;
+    pMbCache->iSadCostSkip[3] = 0;
+  }
+
+  if ((uiNeighborAvail & TOP_MB_POS) && IS_SVC_INTER (pTopMb->uiMbType)) { //TOP MB
+    ST64 (&pMvComp->sMotionVectorCache[1], LD64 (&pTopMb->sMv[12]));
+    ST64 (&pMvComp->sMotionVectorCache[3], LD64 (&pTopMb->sMv[14]));
+    pMvComp->iRefIndexCache[1] = pTopMb->pRefIndex[2];
+    pMvComp->iRefIndexCache[2] = pTopMb->pRefIndex[2];
+    pMvComp->iRefIndexCache[3] = pTopMb->pRefIndex[3];
+    pMvComp->iRefIndexCache[4] = pTopMb->pRefIndex[3];
+    pMbCache->iSadCost[1] = pTopMb->pSadCost[0];
+    if (pTopMb->uiMbType == MB_TYPE_SKIP  && pVaaBgMbFlag[-iMbWidth] == 0) {
+      pMbCache->bMbTypeSkip[1] = 1;
+      pMbCache->iSadCostSkip[1] = pMbCache->pEncSad[-iMbWidth];
+    } else {
+      pMbCache->bMbTypeSkip[1] = 0;
+      pMbCache->iSadCostSkip[1] = 0;
+    }
+  } else { //unavail
+    ST64 (&pMvComp->sMotionVectorCache[1], 0);
+    ST64 (&pMvComp->sMotionVectorCache[3], 0);
+    pMvComp->iRefIndexCache[1] =
+      pMvComp->iRefIndexCache[2] =
+        pMvComp->iRefIndexCache[3] =
+          pMvComp->iRefIndexCache[4] = (uiNeighborAvail & TOP_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;
+    pMbCache->iSadCost[1] = 0;
+    pMbCache->bMbTypeSkip[1] = 0;
+    pMbCache->iSadCostSkip[1] = 0;
+  }
+
+
+  if ((uiNeighborAvail & TOPLEFT_MB_POS) && IS_SVC_INTER (pLeftTopMb->uiMbType)) { //LEFT_TOP MB
+    pMvComp->sMotionVectorCache[0] = pLeftTopMb->sMv[15];
+    pMvComp->iRefIndexCache[0] = pLeftTopMb->pRefIndex[3];
+    pMbCache->iSadCost[0] = pLeftTopMb->pSadCost[0];
+
+    if (pLeftTopMb->uiMbType == MB_TYPE_SKIP  && pVaaBgMbFlag[-iMbWidth - 1] == 0) {
+      pMbCache->bMbTypeSkip[0] = 1;
+      pMbCache->iSadCostSkip[0] = pMbCache->pEncSad[-iMbWidth - 1];
+    } else {
+      pMbCache->bMbTypeSkip[0] = 0;
+      pMbCache->iSadCostSkip[0] = 0;
+    }
+  } else { //unavail
+    ST32 (&pMvComp->sMotionVectorCache[0], 0);
+    pMvComp->iRefIndexCache[0] = (uiNeighborAvail & TOPLEFT_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;
+    pMbCache->iSadCost[0] = 0;
+    pMbCache->bMbTypeSkip[0] = 0;
+    pMbCache->iSadCostSkip[0] = 0;
+  }
+
+  if ((uiNeighborAvail & TOPRIGHT_MB_POS) && IS_SVC_INTER (iRightTopMb->uiMbType)) { //RIGHT_TOP MB
+    pMvComp->sMotionVectorCache[5] = iRightTopMb->sMv[12];
+    pMvComp->iRefIndexCache[5] = iRightTopMb->pRefIndex[2];
+    pMbCache->iSadCost[2] = iRightTopMb->pSadCost[0];
+
+    if (iRightTopMb->uiMbType == MB_TYPE_SKIP  && pVaaBgMbFlag[-iMbWidth + 1] == 0) {
+      pMbCache->bMbTypeSkip[2] = 1;
+      pMbCache->iSadCostSkip[2] = pMbCache->pEncSad[-iMbWidth + 1];
+    } else {
+      pMbCache->bMbTypeSkip[2] = 0;
+      pMbCache->iSadCostSkip[2] = 0;
+    }
+  } else { //unavail
+    ST32 (&pMvComp->sMotionVectorCache[5], 0);
+    pMvComp->iRefIndexCache[5] = (uiNeighborAvail & TOPRIGHT_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;
+    pMbCache->iSadCost[2] = 0;
+    pMbCache->bMbTypeSkip[2] = 0;
+    pMbCache->iSadCostSkip[2] = 0;
+  }
+
+  //right-top 4*4 pBlock unavailable
+  ST32 (&pMvComp->sMotionVectorCache[ 9], 0);
+  ST32 (&pMvComp->sMotionVectorCache[21], 0);
+  ST32 (&pMvComp->sMotionVectorCache[11], 0);
+  ST32 (&pMvComp->sMotionVectorCache[17], 0);
+  ST32 (&pMvComp->sMotionVectorCache[23], 0);
+  pMvComp->iRefIndexCache[ 9] =
+    pMvComp->iRefIndexCache[11] =
+      pMvComp->iRefIndexCache[17] =
+        pMvComp->iRefIndexCache[21] =
+          pMvComp->iRefIndexCache[23] = REF_NOT_AVAIL;
+}
+
+/*
+ * Install the inter neighbor-cache fill routine into pFuncList:
+ * FillNeighborCacheInterWithBGD when kiFlag is non-zero,
+ * FillNeighborCacheInterWithoutBGD otherwise.
+ */
+void InitFillNeighborCacheInterFunc (SWelsFuncPtrList* pFuncList, const int32_t kiFlag) {
+  if (kiFlag) {
+    pFuncList->pfFillInterNeighborCache = FillNeighborCacheInterWithBGD;
+  } else {
+    pFuncList->pfFillInterNeighborCache = FillNeighborCacheInterWithoutBGD;
+  }
+}
+
+/*
+ * Store ksMv into pMvBuffer, four consecutive entries at a time, for every
+ * group start index below MB_BLOCK4x4_NUM.
+ */
+void UpdateMbMv_c (SMVUnitXY* pMvBuffer, const SMVUnitXY ksMv) {
+  for (int32_t iGroup = 0; iGroup < MB_BLOCK4x4_NUM; iGroup += 4) {
+    SMVUnitXY* pDst = pMvBuffer + iGroup;
+    for (int32_t iIdx = 0; iIdx < 4; ++iIdx) {
+      pDst[iIdx] = ksMv;
+    }
+  }
+}
+
+
+/*
+ * Classify the four SAD values found at pSad8x8[0..3].
+ *
+ * The mean of the four values is formed (sum >> 2); each value and the mean
+ * are scaled down by >> 6 and the squared differences are summed.
+ *
+ * Returns 15 when that sum is below INTER_VARIANCE_SAD_THRESHOLD. Otherwise
+ * returns a 4-bit mask in which bit (3 - i) is set when value i is strictly
+ * greater than the mean.
+ */
+uint8_t MdInterAnalysisVaaInfo_c (int32_t* pSad8x8) {
+  int32_t iSad[4];
+  int32_t iSum = 0;
+  int32_t i;
+
+  for (i = 0; i < 4; ++i) {
+    iSad[i] = pSad8x8[i];
+    iSum += iSad[i];
+  }
+
+  const int32_t kiMean       = iSum >> 2;
+  const int32_t kiMeanScaled = kiMean >> 6;
+
+  int32_t iVariance = 0;
+  for (i = 0; i < 4; ++i) {
+    const int32_t kiDiff = (iSad[i] >> 6) - kiMeanScaled;
+    iVariance += kiDiff * kiDiff;
+  }
+
+  if (iVariance < INTER_VARIANCE_SAD_THRESHOLD) {
+    return 15;
+  }
+
+  uint8_t uiMbSign = 0;
+  for (i = 0; i < 4; ++i) {
+    if (iSad[i] > kiMean) {
+      uiMbSign |= (0x08 >> i);
+    }
+  }
+  return uiMbSign;
+}
+
+/*
+ * Variance-like measure over a 16x16 pixel area.
+ *
+ * The area starting at pDataY (row pitch kiLineSize) is split into sixteen
+ * 4x4 blocks; each block is reduced to (sum of its 16 pixels) >> 4. With S the
+ * sum of those sixteen values and Q the sum of their squares, the function
+ * returns Q - ((S * S) >> 4).
+ */
+static inline int32_t AnalysisVaaInfoIntra_c (uint8_t* pDataY, const int32_t kiLineSize) {
+  ENFORCE_STACK_ALIGN_1D (uint16_t, uiAvgBlock, 16, 16)
+  const int32_t kiLineSize4 = kiLineSize << 2;
+  int32_t iBlkRow, iBlkCol, iRow, iCol, iBlk;
+  int32_t iSumAvg = 0, iSumSqr = 0;
+
+  // pass 1: mean of every 4x4 block, stored in raster order
+  for (iBlkRow = 0; iBlkRow < 4; ++iBlkRow) {
+    const uint8_t* pBlkRowBase = pDataY + iBlkRow * kiLineSize4;
+    for (iBlkCol = 0; iBlkCol < 4; ++iBlkCol) {
+      const uint8_t* pPix = pBlkRowBase + (iBlkCol << 2);
+      int32_t iSum = 0;
+      for (iRow = 0; iRow < 4; ++iRow) {
+        for (iCol = 0; iCol < 4; ++iCol) {
+          iSum += pPix[iRow * kiLineSize + iCol];
+        }
+      }
+      // 16 * 255 fits in 16 bits, so the narrowing store loses nothing
+      uiAvgBlock[ (iBlkRow << 2) + iBlkCol] = (uint16_t) (iSum >> 4);
+    }
+  }
+
+  // pass 2: accumulate sum and sum of squares of the block means
+  for (iBlk = 0; iBlk < 16; ++iBlk) {
+    const int32_t kiAvg = uiAvgBlock[iBlk];
+    iSumAvg += kiAvg;
+    iSumSqr += kiAvg * kiAvg;
+  }
+
+  return /*variance =*/ (iSumSqr - ((iSumAvg * iSumAvg) >> 4));
+}
+
+// Install the VAA analysis and MV-update function pointers in pFuncList.
+// The C implementations are always set first; when X86_ASM is defined they are
+// then overridden according to the CPU feature bits in kuiCpuFlag.
+void InitIntraAnalysisVaaInfo (SWelsFuncPtrList* pFuncList, const uint32_t kuiCpuFlag) {
+  // portable defaults
+  pFuncList->pfGetVarianceFromIntraVaa = AnalysisVaaInfoIntra_c;
+  pFuncList->pfGetMbSignFromInterVaa   = MdInterAnalysisVaaInfo_c;
+  pFuncList->pfUpdateMbMv              = UpdateMbMv_c;
+
+#if defined(X86_ASM)
+  const bool kbHasSse2  = ((kuiCpuFlag & WELS_CPU_SSE2) == WELS_CPU_SSE2);
+  const bool kbHasSsse3 = ((kuiCpuFlag & WELS_CPU_SSSE3) == WELS_CPU_SSSE3);
+  const bool kbHasSse41 = ((kuiCpuFlag & WELS_CPU_SSE41) == WELS_CPU_SSE41);
+
+  // later checks deliberately override earlier ones
+  if (kbHasSse2) {
+    pFuncList->pfGetVarianceFromIntraVaa = AnalysisVaaInfoIntra_sse2;
+    pFuncList->pfGetMbSignFromInterVaa   = MdInterAnalysisVaaInfo_sse2;
+    pFuncList->pfUpdateMbMv              = UpdateMbMv_sse2;
+  }
+  if (kbHasSsse3) {
+    pFuncList->pfGetVarianceFromIntraVaa = AnalysisVaaInfoIntra_ssse3;
+  }
+  if (kbHasSse41) {
+    pFuncList->pfGetMbSignFromInterVaa   = MdInterAnalysisVaaInfo_sse41;
+  }
+#endif//X86_ASM
+}
+
+/*
+ * Run the installed pfGetVarianceFromIntraVaa routine on pEncMb, using the
+ * current layer's iEncStride[0] as line size, and report whether the result
+ * reaches INTRA_VARIANCE_SAD_THRESHOLD.
+ */
+BOOL_T MdIntraAnalysisVaaInfo (sWelsEncCtx* pEncCtx, uint8_t* pEncMb) {
+  const int32_t kiStrideY = pEncCtx->pCurDqLayer->iEncStride[0];
+  const int32_t kiVar     = pEncCtx->pFuncList->pfGetVarianceFromIntraVaa (pEncMb, kiStrideY);
+
+  return (kiVar >= INTRA_VARIANCE_SAD_THRESHOLD);
+}
+
+/*
+ * Point the four refinement work buffers of pMeRefine into
+ * pMbCache->pBufferInterPredMe. The buffers start 640 elements apart and each
+ * pointer is additionally advanced by iStride.
+ */
+void InitMeRefinePointer (SMeRefinePointer* pMeRefine, SMbCache* pMbCache, int32_t iStride) {
+  const int32_t kiBufSpacing = 640; // distance between consecutive work buffers
+
+  pMeRefine->pHalfPixH    = &pMbCache->pBufferInterPredMe[0 * kiBufSpacing] + iStride;
+  pMeRefine->pHalfPixV    = &pMbCache->pBufferInterPredMe[1 * kiBufSpacing] + iStride;
+  pMeRefine->pQuarPixBest = &pMbCache->pBufferInterPredMe[2 * kiBufSpacing] + iStride;
+  pMeRefine->pQuarPixTmp  = &pMbCache->pBufferInterPredMe[3 * kiBufSpacing] + iStride;
+}
+// Parameter bundle handed from MeRefineFracPixel to MeRefineQuarPixel.
+// Candidate index 0..3 is evaluated in the order top, bottom, left, right.
+typedef struct TagQuarParams {
+  int32_t iBestCost;    // in: best cost so far; out: lowered whenever a candidate beats it
+  int32_t iBestHalfPix; // best half-pel position chosen by the caller
+  int32_t iStrideA;     // stride used when reading pSrcB[0] and pSrcB[1]
+  int32_t iStrideB;     // stride used when reading pSrcB[2] and pSrcB[3]
+  uint8_t* pRef;        // reference block pointer set by the caller
+  uint8_t* pSrcB[4];    // second averaging source for each candidate
+  uint8_t* pSrcA[4];    // first averaging source for each candidate, read with ME_REFINE_BUF_STRIDE
+  int32_t iLms[4];      // cost term added to each candidate's distortion (filled from COST_MVD)
+  int32_t iBestQuarPix; // out: id of the winning candidate; caller presets ME_NO_BEST_QUAR_PIXEL
+} SQuarRefineParams;
+
+#define SWITCH_BEST_TMP_BUF(prev_best, curr_best){\
+	pParams->iBestCost = iCurCost;\
+	pTmp = prev_best;\
+	prev_best = curr_best;\
+	curr_best = pTmp;\
+}
+#define CALC_COST(me_buf, lm) ( pFunc->sSampleDealingFuncs.pfMeCost[kuiPixel](pEncMb, iStrideEnc, me_buf, ME_REFINE_BUF_STRIDE) + lm )
+
+/*
+ * Evaluate four quarter-pel candidates, in the order top, bottom, left, right.
+ *
+ * For candidate i the averaging routine selected by (kiWidth >> 4) blends
+ * pParams->pSrcA[i] (stride ME_REFINE_BUF_STRIDE) with pParams->pSrcB[i]
+ * (stride iStrideA for i = 0,1 and iStrideB for i = 2,3) into
+ * pMeRefine->pQuarPixTmp. Its cost is the pfMeCost distortion against
+ * pMe->pEncMb plus pParams->iLms[i].
+ *
+ * Whenever a candidate is strictly cheaper than pParams->iBestCost, the best
+ * cost and pParams->iBestQuarPix are updated and pQuarPixBest / pQuarPixTmp
+ * are swapped, so pQuarPixBest always holds the best prediction found here.
+ */
+inline void MeRefineQuarPixel (SWelsFuncPtrList* pFunc, SWelsME* pMe, SMeRefinePointer* pMeRefine,
+                               const int32_t kiWidth, const int32_t kiHeight, SQuarRefineParams* pParams, int32_t iStrideEnc) {
+  PWelsSampleAveragingFunc* pSampleAvg = pFunc->sMcFuncs.pfSampleAveraging;
+  const int32_t kiAvgIndex = kiWidth >> 4;
+  uint8_t* pEncMb          = pMe->pEncMb;  // used by CALC_COST
+  const uint8_t kuiPixel   = pMe->uiPixel; // used by CALC_COST
+  uint8_t* pTmp            = NULL;         // scratch for SWITCH_BEST_TMP_BUF
+  int32_t iCurCost;
+
+  // per-candidate result id and stride of the pSrcB source
+  const int32_t kiQuarPixId[4]  = { ME_QUAR_PIXEL_TOP, ME_QUAR_PIXEL_BOTTOM, ME_QUAR_PIXEL_LEFT, ME_QUAR_PIXEL_RIGHT };
+  const int32_t kiSrcBStride[4] = { pParams->iStrideA, pParams->iStrideA, pParams->iStrideB, pParams->iStrideB };
+
+  for (int32_t iCand = 0; iCand < 4; ++iCand) {
+    pSampleAvg[kiAvgIndex] (pMeRefine->pQuarPixTmp, ME_REFINE_BUF_STRIDE, pParams->pSrcA[iCand],
+                            ME_REFINE_BUF_STRIDE, pParams->pSrcB[iCand], kiSrcBStride[iCand], kiHeight);
+
+    iCurCost = CALC_COST (pMeRefine->pQuarPixTmp, pParams->iLms[iCand]);
+    if (iCurCost < pParams->iBestCost) {
+      pParams->iBestQuarPix = kiQuarPixId[iCand];
+      SWITCH_BEST_TMP_BUF (pMeRefine->pQuarPixBest, pMeRefine->pQuarPixTmp);
+    }
+  }
+}
+
+/*
+ * Fractional-pixel refinement of the motion vector in pMe->sMv.
+ *
+ * Starting from the cost at the current MV, four half-pel candidates (MV
+ * offset +/-2 vertically, then horizontally) are evaluated; the best position
+ * so far is then refined by MeRefineQuarPixel with four candidates at MV
+ * offset +/-1.
+ *
+ * On return pMe->sMv holds the refined MV, pMe->uiSatdCost the best cost, and
+ * the winning iWidth x iHeight prediction has been copied into
+ * pMemPredInterMb with destination stride MB_WIDTH_LUMA.
+ *
+ * pMeRefine supplies the work buffers (pHalfPixH, pHalfPixV, pQuarPixBest,
+ * pQuarPixTmp); pHalfPixHV is set here to alias whichever half-pel buffer is
+ * no longer needed.
+ */
+void MeRefineFracPixel (sWelsEncCtx* pEncCtx, uint8_t* pMemPredInterMb, SWelsME* pMe,
+                        SMeRefinePointer* pMeRefine, int32_t iWidth, int32_t iHeight) {
+  SWelsFuncPtrList* pFunc = pEncCtx->pFuncList;
+  int16_t iMvx = pMe->sMv.iMvX;
+  int16_t iMvy = pMe->sMv.iMvY;
+
+  // MV of the best half-pel position; stays at the input MV when no half-pel candidate wins
+  int16_t iHalfMvx = iMvx;
+  int16_t iHalfMvy = iMvy;
+  const int32_t kiStrideEnc = pEncCtx->pCurDqLayer->iEncStride[0];
+  const int32_t kiStrideRef = pEncCtx->pCurDqLayer->pRefPic->iLineSize[0];
+
+  uint8_t* pEncData = pMe->pEncMb;
+  uint8_t* pRef = pMe->pRefMb;//091010
+
+  int32_t iBestQuarPix = ME_NO_BEST_QUAR_PIXEL;
+
+  SQuarRefineParams sParams;
+  // Quarter-pel MV offsets indexed by iBestQuarPix: X offsets are read from the
+  // start of the table, Y offsets from the same table three entries further on.
+  static int32_t iMvQuarAddX[10] = {0, 0, -1, 1, 0, 0, 0, -1, 1, 0};
+  int32_t* pMvQuarAddY = iMvQuarAddX + 3;
+  // best prediction so far and the stride it must be read with
+  uint8_t* pBestPredInter = pRef;
+  int32_t iInterBlk4Stride = ME_REFINE_BUF_STRIDE;
+
+  int32_t iBestCost;
+  int32_t iCurCost;
+  int32_t iBestHalfPix;
+
+  // Cost at the incoming MV: when both pfMeCost and pfMdCost are the SATD
+  // functions, reuse the SATD already stored in pMe instead of recomputing it.
+  if ((pFunc->sSampleDealingFuncs.pfMeCost == pFunc->sSampleDealingFuncs.pfSampleSatd)
+      && (pFunc->sSampleDealingFuncs.pfMdCost == pFunc->sSampleDealingFuncs.pfSampleSatd)) {
+    iBestCost = pMe->uSadPredISatd.uiSatd + COST_MVD (pMe->pMvdCost, iMvx - pMe->sMvp.iMvX, iMvy - pMe->sMvp.iMvY);
+  } else {
+    iBestCost = pFunc->sSampleDealingFuncs.pfMeCost[pMe->uiPixel] (pEncData, kiStrideEnc, pRef, kiStrideRef) +
+                COST_MVD (pMe->pMvdCost, iMvx - pMe->sMvp.iMvX, iMvy - pMe->sMvp.iMvY);
+  }
+
+  iBestHalfPix = REFINE_ME_NO_BEST_HALF_PIXEL;
+
+  //step 1: get [iWidth][iHeight+1] half pixel from vertical filter (the call below starts one row above pRef)
+  pFunc->sMcFuncs.pfLumaHalfpelVer (pRef - kiStrideRef, kiStrideRef, pMeRefine->pHalfPixV, ME_REFINE_BUF_STRIDE, iWidth,
+                                    iHeight + 1);
+
+  //===========================(0, -2)==============================//
+  iCurCost = pFunc->sSampleDealingFuncs.pfMeCost[pMe->uiPixel] (pEncData, kiStrideEnc, pMeRefine->pHalfPixV,
+             ME_REFINE_BUF_STRIDE) +
+             COST_MVD (pMe->pMvdCost, iMvx - pMe->sMvp.iMvX, iMvy - 2 - pMe->sMvp.iMvY);
+  if (iCurCost < iBestCost) {
+    iBestCost = iCurCost;
+    iBestHalfPix = REFINE_ME_HALF_PIXEL_TOP;
+    pBestPredInter = pMeRefine->pHalfPixV;
+  }
+  //===========================(0, 2)==============================//
+  iCurCost = pFunc->sSampleDealingFuncs.pfMeCost[pMe->uiPixel] (pEncData, kiStrideEnc,
+             pMeRefine->pHalfPixV + ME_REFINE_BUF_STRIDE, ME_REFINE_BUF_STRIDE) +
+             COST_MVD (pMe->pMvdCost, iMvx - pMe->sMvp.iMvX, iMvy + 2 - pMe->sMvp.iMvY);
+  if (iCurCost < iBestCost) {
+    iBestCost = iCurCost;
+    iBestHalfPix = REFINE_ME_HALF_PIXEL_BOTTOM;
+    pBestPredInter = pMeRefine->pHalfPixV + ME_REFINE_BUF_STRIDE;
+  }
+  //step 2: get [iWidth+1][iHeight] half pixel from horizontal filter (the call below starts one column left of pRef)
+  pFunc->sMcFuncs.pfLumaHalfpelHor (pRef - 1, kiStrideRef, pMeRefine->pHalfPixH, ME_REFINE_BUF_STRIDE, iWidth + 1,
+                                    iHeight);
+
+  //===========================(-2, 0)==============================//
+  iCurCost = pFunc->sSampleDealingFuncs.pfMeCost[pMe->uiPixel] (pEncData, kiStrideEnc, pMeRefine->pHalfPixH,
+             ME_REFINE_BUF_STRIDE) +
+             COST_MVD (pMe->pMvdCost, iMvx - 2 - pMe->sMvp.iMvX, iMvy - pMe->sMvp.iMvY);
+  if (iCurCost < iBestCost) {
+    iBestCost = iCurCost;
+    iBestHalfPix = REFINE_ME_HALF_PIXEL_LEFT;
+    pBestPredInter = pMeRefine->pHalfPixH;
+  }
+  //===========================(2, 0)===============================//
+  iCurCost = pFunc->sSampleDealingFuncs.pfMeCost[pMe->uiPixel] (pEncData, kiStrideEnc, pMeRefine->pHalfPixH + 1,
+             ME_REFINE_BUF_STRIDE) +
+             COST_MVD (pMe->pMvdCost, iMvx + 2 - pMe->sMvp.iMvX, iMvy - pMe->sMvp.iMvY);
+  if (iCurCost < iBestCost) {
+    iBestCost = iCurCost;
+    iBestHalfPix = REFINE_ME_HALF_PIXEL_RIGHT;
+    pBestPredInter = pMeRefine->pHalfPixH + 1;
+  }
+
+  // hand the half-pel result over to the quarter-pel stage
+  sParams.iBestCost = iBestCost;
+  sParams.iBestHalfPix = iBestHalfPix;
+  sParams.pRef = pRef;
+  sParams.iBestQuarPix = ME_NO_BEST_QUAR_PIXEL;
+
+  //step 5: if no best half-pixel prediction, try quarter pixel prediction
+  //        if yes, must get [X+1][X+1] half-pixel from (2, 2) horizontal and vertical filter
+  if (REFINE_ME_NO_BEST_HALF_PIXEL == iBestHalfPix) {
+    // quarter-pel candidates around the integer position: average each
+    // half-pel plane (pSrcA) with the integer reference block (pSrcB)
+    sParams.iStrideA = kiStrideRef;
+    sParams.iStrideB = kiStrideRef;
+    sParams.pSrcA[0] = pMeRefine->pHalfPixV;
+    sParams.pSrcA[1] = pMeRefine->pHalfPixV + ME_REFINE_BUF_STRIDE;
+    sParams.pSrcA[2] = pMeRefine->pHalfPixH;
+    sParams.pSrcA[3] = pMeRefine->pHalfPixH + 1;
+
+    sParams.pSrcB[0] = sParams.pSrcB[1] = sParams.pSrcB[2] = sParams.pSrcB[3] = pRef;
+
+    sParams.iLms[0] = COST_MVD (pMe->pMvdCost, iHalfMvx - pMe->sMvp.iMvX, iHalfMvy - 1 - pMe->sMvp.iMvY);
+    sParams.iLms[1] = COST_MVD (pMe->pMvdCost, iHalfMvx - pMe->sMvp.iMvX, iHalfMvy + 1 - pMe->sMvp.iMvY);
+    sParams.iLms[2] = COST_MVD (pMe->pMvdCost, iHalfMvx - 1 - pMe->sMvp.iMvX, iHalfMvy - pMe->sMvp.iMvY);
+    sParams.iLms[3] = COST_MVD (pMe->pMvdCost, iHalfMvx + 1 - pMe->sMvp.iMvX, iHalfMvy - pMe->sMvp.iMvY);
+  } else { //must get [X+1][X+1] half-pixel from (2, 2) horizontal and vertical filter
+    // In every case below pSrcA is the winning half-pel block (same pointer for
+    // all four candidates) and pSrcB its four neighbours: two taken from the
+    // freshly computed centre (HV) plane, two from the integer reference.
+    switch (iBestHalfPix) {
+    case REFINE_ME_HALF_PIXEL_LEFT: {
+      pMeRefine->pHalfPixHV = pMeRefine->pHalfPixV;//reuse pBuffer, here only h&hv
+      pFunc->sMcFuncs.pfLumaHalfpelCen (pRef - 1 - kiStrideRef, kiStrideRef, pMeRefine->pHalfPixHV, ME_REFINE_BUF_STRIDE,
+                                        iWidth + 1, iHeight + 1);
+
+      iHalfMvx -= 2;
+      sParams.iStrideA = ME_REFINE_BUF_STRIDE;
+      sParams.iStrideB = kiStrideRef;
+      sParams.pSrcA[0] = pMeRefine->pHalfPixH;
+      sParams.pSrcA[3] = sParams.pSrcA[2] = sParams.pSrcA[1] = sParams.pSrcA[0];
+      sParams.pSrcB[0] = pMeRefine->pHalfPixHV;
+      sParams.pSrcB[1] = pMeRefine->pHalfPixHV + ME_REFINE_BUF_STRIDE;
+      sParams.pSrcB[2] = pRef - 1;
+      sParams.pSrcB[3] = pRef;
+
+    }
+    break;
+    case REFINE_ME_HALF_PIXEL_RIGHT: {
+      pMeRefine->pHalfPixHV = pMeRefine->pHalfPixV;//reuse pBuffer, here only h&hv
+      pFunc->sMcFuncs.pfLumaHalfpelCen (pRef - 1 - kiStrideRef, kiStrideRef, pMeRefine->pHalfPixHV, ME_REFINE_BUF_STRIDE,
+                                        iWidth + 1, iHeight + 1);
+      iHalfMvx += 2;
+      sParams.iStrideA = ME_REFINE_BUF_STRIDE;
+      sParams.iStrideB = kiStrideRef;
+      sParams.pSrcA[0] = pMeRefine->pHalfPixH + 1;
+      sParams.pSrcA[3] = sParams.pSrcA[2] = sParams.pSrcA[1] = sParams.pSrcA[0];
+      sParams.pSrcB[0] = pMeRefine->pHalfPixHV + 1;
+      sParams.pSrcB[1] = pMeRefine->pHalfPixHV + 1 + ME_REFINE_BUF_STRIDE;
+      sParams.pSrcB[2] = pRef;
+      sParams.pSrcB[3] = pRef + 1;
+    }
+    break;
+    case REFINE_ME_HALF_PIXEL_TOP: {
+      pMeRefine->pHalfPixHV = pMeRefine->pHalfPixH;//reuse pBuffer, here only v&hv
+      pFunc->sMcFuncs.pfLumaHalfpelCen (pRef - 1 - kiStrideRef, kiStrideRef, pMeRefine->pHalfPixHV, ME_REFINE_BUF_STRIDE,
+                                        iWidth + 1, iHeight + 1);
+
+      iHalfMvy -= 2;
+      sParams.iStrideA = kiStrideRef;
+      sParams.iStrideB = ME_REFINE_BUF_STRIDE;
+      sParams.pSrcA[0] = pMeRefine->pHalfPixV;
+      sParams.pSrcA[3] = sParams.pSrcA[2] = sParams.pSrcA[1] = sParams.pSrcA[0];
+      sParams.pSrcB[0] = pRef - kiStrideRef;
+      sParams.pSrcB[1] = pRef;
+      sParams.pSrcB[2] = pMeRefine->pHalfPixHV;
+      sParams.pSrcB[3] = pMeRefine->pHalfPixHV + 1;
+    }
+    break;
+    case REFINE_ME_HALF_PIXEL_BOTTOM: {
+      pMeRefine->pHalfPixHV = pMeRefine->pHalfPixH;//reuse pBuffer, here only v&hv
+      pFunc->sMcFuncs.pfLumaHalfpelCen (pRef - 1 - kiStrideRef, kiStrideRef, pMeRefine->pHalfPixHV, ME_REFINE_BUF_STRIDE,
+                                        iWidth + 1, iHeight + 1);
+      iHalfMvy += 2;
+      sParams.iStrideA = kiStrideRef;
+      sParams.iStrideB = ME_REFINE_BUF_STRIDE;
+      sParams.pSrcA[0] = pMeRefine->pHalfPixV + ME_REFINE_BUF_STRIDE;
+      sParams.pSrcA[3] = sParams.pSrcA[2] = sParams.pSrcA[1] = sParams.pSrcA[0];
+      sParams.pSrcB[0] = pRef;
+      sParams.pSrcB[1] = pRef + kiStrideRef;
+      sParams.pSrcB[2] = pMeRefine->pHalfPixHV + ME_REFINE_BUF_STRIDE;
+      sParams.pSrcB[3] = pMeRefine->pHalfPixHV + ME_REFINE_BUF_STRIDE + 1;
+    }
+    break;
+    default:
+      break;
+    }
+    // MVD cost of the four quarter-pel neighbours of the chosen half-pel MV
+    sParams.iLms[0] = COST_MVD (pMe->pMvdCost, iHalfMvx - pMe->sMvp.iMvX, iHalfMvy - 1 - pMe->sMvp.iMvY);
+    sParams.iLms[1] = COST_MVD (pMe->pMvdCost, iHalfMvx - pMe->sMvp.iMvX, iHalfMvy + 1 - pMe->sMvp.iMvY);
+    sParams.iLms[2] = COST_MVD (pMe->pMvdCost, iHalfMvx - 1 - pMe->sMvp.iMvX, iHalfMvy - pMe->sMvp.iMvY);
+    sParams.iLms[3] = COST_MVD (pMe->pMvdCost, iHalfMvx + 1 - pMe->sMvp.iMvX, iHalfMvy - pMe->sMvp.iMvY);
+  }
+  MeRefineQuarPixel (pFunc, pMe, pMeRefine, iWidth, iHeight, &sParams, kiStrideEnc);
+
+  // a quarter-pel candidate won: its prediction lives in pQuarPixBest
+  if (iBestCost > sParams.iBestCost) {
+    pBestPredInter = pMeRefine->pQuarPixBest;
+    iBestCost = sParams.iBestCost;
+  }
+  iBestQuarPix = sParams.iBestQuarPix;
+
+  //update final best MV
+  pMe->sMv.iMvX = iHalfMvx + iMvQuarAddX[iBestQuarPix];
+  pMe->sMv.iMvY = iHalfMvy + pMvQuarAddY[iBestQuarPix];
+  pMe->uiSatdCost = iBestCost;
+
+  //No half or quarter pixel best, so do MC with integer pixel MV
+  if (iBestHalfPix + iBestQuarPix == NO_BEST_FRAC_PIX) {
+    pBestPredInter = pRef;
+    iInterBlk4Stride = kiStrideRef;
+  }
+  // copy the winning prediction into the macroblock prediction buffer; the copy routine depends on the partition size
+  if (MB_WIDTH_LUMA == iWidth && MB_HEIGHT_LUMA == iHeight) { //P16x16
+    pFunc->pfCopy16x16NotAligned (pMemPredInterMb, MB_WIDTH_LUMA, pBestPredInter,
+                                  iInterBlk4Stride);	// dst can be align with 16 bytes, but not sure at pSrc, 12/29/2011
+  } else if (MB_WIDTH_LUMA == iWidth && MB_HEIGHT_CHROMA == iHeight) { //P16x8
+    pFunc->pfCopy16x8NotAligned (pMemPredInterMb, MB_WIDTH_LUMA, pBestPredInter,
+                                 iInterBlk4Stride);	// dst can be align with 16 bytes, but not sure at pSrc, 12/29/2011
+  } else if (MB_WIDTH_CHROMA == iWidth && MB_HEIGHT_LUMA == iHeight) { //P8x16
+    pFunc->pfCopy8x16Aligned (pMemPredInterMb, MB_WIDTH_LUMA, pBestPredInter, iInterBlk4Stride);
+  } else { //P8x8
+    pFunc->pfCopy8x8Aligned (pMemPredInterMb, MB_WIDTH_LUMA, pBestPredInter, iInterBlk4Stride);
+  }
+}
+
+/*
+ * Fill pBlkStride[0..15] with x + y * kiStrideRef, where (x, y) is taken from
+ * the two fixed tables below for each of the sixteen block indices.
+ */
+void InitBlkStrideWithRef (int32_t* pBlkStride, const int32_t kiStrideRef) {
+  // horizontal offset of each block index
+  static const uint8_t kuiBlkX[16] = {
+    0, 4 , 0, 4 ,
+    8, 12, 8, 12,
+    0, 4 , 0, 4 ,
+    8, 12, 8, 12
+  };
+  // vertical offset (in rows) of each block index
+  static const uint8_t kuiBlkY[16] = {
+    0, 0, 4 , 4 ,
+    0, 0, 4 , 4 ,
+    8, 8, 12, 12,
+    8, 8, 12, 12
+  };
+
+  for (int32_t iBlk = 0; iBlk < 16; ++iBlk) {
+    const int32_t kiRowOffset = kuiBlkY[iBlk] * kiStrideRef;
+    pBlkStride[iBlk] = kuiBlkX[iBlk] + kiRowOffset;
+  }
+}
+
+/*
+ * Build the MVD cost table, one row per entry of g_kiQpCostTable (52 rows).
+ *
+ * With kiSz = kiMvdSz >> 1, every row holds 2 * kiSz + 1 uint16_t entries:
+ *   row[kiSz]     = lambda                    (mvd == 0)
+ *   row[kiSz + d] = lambda * BsSizeSE (d)     for d in [-kiSz, kiSz], d != 0
+ * where lambda is the g_kiQpCostTable entry of that row.
+ *
+ * pMvdCostInter: destination, must hold 52 * (2 * kiSz + 1) entries.
+ * kiMvdSz:       documented as (648*2+1) or (972*2+1).
+ *
+ * Rows are addressed by index, so any non-negative kiSz is handled; a
+ * four-way unrolled fill would run past the row whenever kiSz is not a
+ * multiple of 4.
+ */
+void MvdCostInit (uint16_t* pMvdCostInter, const int32_t kiMvdSz) {
+  const int32_t kiSz        = kiMvdSz >> 1;
+  const int32_t kiRowSz     = (kiSz << 1) + 1;
+  const int32_t* kpQpLambda = &g_kiQpCostTable[0];
+  int32_t i, j;
+
+  for (i = 0; i < 52; ++ i) {
+    const uint16_t kiLambda = kpQpLambda[i];
+    uint16_t* pZeroMvd      = pMvdCostInter + i * kiRowSz + kiSz; // mvd == 0 entry of row i
+
+    for (j = 1; j <= kiSz; ++ j) {
+      pZeroMvd[-j] = kiLambda * BsSizeSE (-j);
+      pZeroMvd[ j] = kiLambda * BsSizeSE (j);
+    }
+    pZeroMvd[0] = kiLambda;
+  }
+}
+
+/*
+ * Predict a SAD value from the neighbour caches and store it in *pSadPred.
+ *
+ * Neighbours are read from fixed cache slots: left (ref index 6 / SAD 3),
+ * top (1 / 1) and top-right (5 / 2); when the top-right reference is
+ * REF_NOT_AVAIL the top-left neighbour (0 / 0) is used in its place.
+ *
+ * If only the left neighbour is available its SAD is taken. Otherwise, if
+ * exactly one neighbour uses reference uiRef its SAD is taken, else the median
+ * of the three. The result is finally scaled by 0.90625 with rounding.
+ */
+void PredictSad (int8_t* pRefIndexCache, int32_t* pSadCostCache, int32_t uiRef, int32_t* pSadPred) {
+  const int32_t kiRefLeft = pRefIndexCache[6];
+  const int32_t kiRefTop  = pRefIndexCache[1];
+  int32_t iRefDiag        = pRefIndexCache[5];
+  const int32_t kiSadLeft = pSadCostCache[3];
+  const int32_t kiSadTop  = pSadCostCache[1];
+  int32_t iSadDiag        = pSadCostCache[2];
+  int32_t iPred;
+
+  if (REF_NOT_AVAIL == iRefDiag) { // top-right missing: fall back to top-left
+    iRefDiag = pRefIndexCache[0];
+    iSadDiag = pSadCostCache[0];
+  }
+
+  if (kiRefTop == REF_NOT_AVAIL && iRefDiag == REF_NOT_AVAIL && kiRefLeft != REF_NOT_AVAIL) {
+    iPred = kiSadLeft;
+  } else {
+    // bit mask of the neighbours that share the requested reference
+    const int32_t kiMatch = ((uiRef == kiRefLeft) << MB_LEFT_BIT)
+                            | ((uiRef == kiRefTop) << MB_TOP_BIT)
+                            | ((uiRef == iRefDiag) << MB_TOPRIGHT_BIT);
+
+    if (kiMatch == LEFT_MB_POS) {            // A only
+      iPred = kiSadLeft;
+    } else if (kiMatch == TOP_MB_POS) {      // B only
+      iPred = kiSadTop;
+    } else if (kiMatch == TOPRIGHT_MB_POS) { // C or D only
+      iPred = iSadDiag;
+    } else {
+      iPred = WELS_MEDIAN (kiSadLeft, kiSadTop, iSadDiag);
+    }
+  }
+
+  // x - x/8 + x/32 = 0.90625 * x, evaluated on a value pre-scaled by 64;
+  // SAD is at most 255*256, so the *64 cannot overflow int32_t
+  const int32_t kiScaled = iPred << 6;
+  *pSadPred = (kiScaled - (kiScaled >> 3) + (kiScaled >> 5) + 32) >> 6;
+}
+
+
+/*
+ * Predict the SAD of a skipped block from the neighbour caches and store it in
+ * *iSadPredSkip.
+ *
+ * Uses the same neighbour slots as PredictSad (left 6/3, top 1/1, top-right
+ * 5/2, with top-left 0/0 replacing an unavailable top-right). A neighbour
+ * contributes its cached SAD only when its pMbSkipCache flag equals 1,
+ * otherwise 0, and it only counts as a reference match when that flag is 1.
+ */
+void PredictSadSkip (int8_t* pRefIndexCache, bool_t* pMbSkipCache, int32_t* pSadCostCache, int32_t uiRef,
+                     int32_t* iSadPredSkip) {
+  const int32_t kiRefLeft = pRefIndexCache[6];
+  const int32_t kiRefTop  = pRefIndexCache[1];
+  int32_t iRefDiag        = pRefIndexCache[5];
+
+  const bool kbSkipLeft = (pMbSkipCache[3] == 1);
+  const bool kbSkipTop  = (pMbSkipCache[1] == 1);
+  bool bSkipDiag        = (pMbSkipCache[2] == 1);
+
+  const int32_t kiSadLeft = kbSkipLeft ? pSadCostCache[3] : 0;
+  const int32_t kiSadTop  = kbSkipTop ? pSadCostCache[1] : 0;
+  int32_t iSadDiag        = bSkipDiag ? pSadCostCache[2] : 0;
+
+  if (REF_NOT_AVAIL == iRefDiag) { // top-right missing: fall back to top-left
+    iRefDiag  = pRefIndexCache[0];
+    bSkipDiag = (pMbSkipCache[0] == 1);
+    iSadDiag  = bSkipDiag ? pSadCostCache[0] : 0;
+  }
+
+  if (kiRefTop == REF_NOT_AVAIL && iRefDiag == REF_NOT_AVAIL && kiRefLeft != REF_NOT_AVAIL) {
+    *iSadPredSkip = kiSadLeft;
+    return;
+  }
+
+  // bit mask of the skipped neighbours that share the requested reference
+  const int32_t kiMatch = (((uiRef == kiRefLeft) && kbSkipLeft) << MB_LEFT_BIT)
+                          | (((uiRef == kiRefTop) && kbSkipTop) << MB_TOP_BIT)
+                          | (((uiRef == iRefDiag) && bSkipDiag) << MB_TOPRIGHT_BIT);
+
+  if (kiMatch == LEFT_MB_POS) {            // A only
+    *iSadPredSkip = kiSadLeft;
+  } else if (kiMatch == TOP_MB_POS) {      // B only
+    *iSadPredSkip = kiSadTop;
+  } else if (kiMatch == TOPRIGHT_MB_POS) { // C or D only
+    *iSadPredSkip = iSadDiag;
+  } else {
+    *iSadPredSkip = WELS_MEDIAN (kiSadLeft, kiSadTop, iSadDiag);
+  }
+}
+}
--- a/codec/encoder/core/src/memory_align.cpp
+++ b/codec/encoder/core/src/memory_align.cpp
@@ -1,161 +1,156 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <stdlib.h>
-#include <string.h>
-#include <assert.h>
-#include "memory_align.h"
-#include "macros.h"
-
-namespace WelsSVCEnc {
-
-CMemoryAlign::CMemoryAlign( const uint32_t kuiCacheLineSize )
-#ifdef MEMORY_MONITOR
-:	m_nMemoryUsageInBytes( 0 )
-#endif//MEMORY_MONITOR
-{
-	if ( (kuiCacheLineSize == 0) || (kuiCacheLineSize & 0x0f) )	
-		m_nCacheLineSize	= 0x10;
-	else
-		m_nCacheLineSize	= kuiCacheLineSize;
-	
-#ifdef MEMORY_CHECK
-	m_fpMemChkPoint		= fopen("./enc_mem_check_point.txt",  "wt+");
-	m_nCountRequestNum	= 0;
-#endif//MEMORY_CHECK
-}
-
-CMemoryAlign::~CMemoryAlign()
-{
-#ifdef MEMORY_MONITOR
-	assert( m_nMemoryUsageInBytes == 0 );
-#endif//MEMORY_MONITOR
-
-#ifdef MEMORY_CHECK	
-	fclose(m_fpMemChkPoint);
-	m_fpMemChkPoint = NULL;
-
-	m_nCountRequestNum	= 0;
-#endif//MEMORY_CHECK
-}
-
-void* CMemoryAlign::WelsMallocz( const uint32_t kuiSize, const str_t *kpTag )
-{
-	void *pPointer = WelsMalloc( kuiSize, kpTag );	
-	if ( NULL == pPointer )
-	{
-		return NULL;
-	}	
-	// zero memory
-	memset( pPointer, 0, kuiSize );
-	
-	return pPointer;
-}
-
-void* CMemoryAlign::WelsMalloc( const uint32_t kuiSize, const str_t *kpTag )
-{
-	const int32_t kiSizeOfVoidPointer	= sizeof( void ** );
-	const int32_t kiSizeOfInt				= sizeof( int32_t );
-	const int32_t kiAlignedBytes		= m_nCacheLineSize - 1;
-	const int32_t kiTrialRequestedSize	= kuiSize + kiAlignedBytes + kiSizeOfVoidPointer + kiSizeOfInt;
-#if MEMORY_REQUEST_ALIGN_BYTES
-	// ensure 4 bytes boundary aligned memory request, unused extra bytes padding in pData payload
-	const int32_t kiActualRequestedSize	= WELS_ALIGN(kiTrialRequestedSize, MEMORY_REQUEST_ALIGN_BYTES);
-	const uint32_t kiPayloadSize			= kuiSize + MEMORY_REQUEST_ALIGN_BYTES - (kiTrialRequestedSize & (MEMORY_REQUEST_ALIGN_BYTES-1));
-#else
-	const int32_t kiActualRequestedSize	= kiTrialRequestedSize;
-	const uint32_t kiPayloadSize			= kuiSize;
-#endif//MEMORY_REQUEST_ALIGN_BYTES
-
-    uint8_t* pBuf		= (uint8_t *) malloc( kiActualRequestedSize );
-#ifdef MEMORY_CHECK	
-	if (m_fpMemChkPoint != NULL)
-	{
-		if ( kpTag != NULL )
-            fprintf( m_fpMemChkPoint, "WelsMalloc(), 0x%x : actual uiSize:\t%d\tbytes, input uiSize: %d bytes, %d - %s\n", (void *)pBuf, kiActualRequestedSize, kuiSize, m_nCountRequestNum++, kpTag );
-		else
-			fprintf( m_fpMemChkPoint, "WelsMalloc(), 0x%x : actual uiSize:\t%d\tbytes, input uiSize: %d bytes, %d \n", (void *)pBuf, kiActualRequestedSize, kuiSize, m_nCountRequestNum++ );
-		fflush( m_fpMemChkPoint);
-	}
-#endif
-	uint8_t* pAlignedBuffer;
-	
-	if ( NULL == pBuf )
-		return NULL;
-
-    pAlignedBuffer = pBuf + kiAlignedBytes + kiSizeOfVoidPointer + kiSizeOfInt;
-    pAlignedBuffer -= ((int32_t) pAlignedBuffer & kiAlignedBytes);
-    *( (void **) ( pAlignedBuffer - kiSizeOfVoidPointer ) ) = pBuf;
-    *( (int32_t *) ( pAlignedBuffer - (kiSizeOfVoidPointer + kiSizeOfInt) ) ) = kiPayloadSize;
-
-#ifdef MEMORY_MONITOR
-	m_nMemoryUsageInBytes += kiActualRequestedSize;
-#endif//MEMORY_MONITOR
-
-    return pAlignedBuffer;
-}
-
-void CMemoryAlign::WelsFree( void* pPointer, const str_t *kpTag )
-{
-	if( pPointer )
-    {
-#ifdef MEMORY_MONITOR
-		const int32_t kiMemoryLength = *((int32_t *)((uint8_t *)pPointer- sizeof(void **) - sizeof(int32_t))) + m_nCacheLineSize - 1 + sizeof(void **) + sizeof(int32_t);
-		m_nMemoryUsageInBytes -= kiMemoryLength;
-#endif//MEMORY_MONITOR
-#ifdef MEMORY_CHECK		
-		if (m_fpMemChkPoint != NULL)
-		{
-			if ( kpTag != NULL )
-				fprintf( m_fpMemChkPoint, "WelsFree(), 0x%x - %s: \t%d\t bytes \n", (void *)(*( ( ( void **) pPointer ) - 1 )), kpTag, kiMemoryLength );
-			else
-				fprintf( m_fpMemChkPoint, "WelsFree(), 0x%x \n", (void *)(*( ( ( void **) pPointer ) - 1 )) );
-			fflush( m_fpMemChkPoint);
-		}
-#endif
-        free( *( ( ( void **) pPointer ) - 1 ) );
-    }
-}
-
-const uint32_t CMemoryAlign::WelsGetCacheLineSize() const
-{
-	return m_nCacheLineSize;
-}
-
-#if defined(MEMORY_MONITOR)
-const uint32_t CMemoryAlign::WelsGetMemoryUsage() const
-{
-	return m_nMemoryUsageInBytes;
-}
-#endif//MEMORY_MONITOR
-
-} // end of namespace WelsSVCEnc
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include "memory_align.h"
+#include "macros.h"
+
+namespace WelsSVCEnc {
+
+CMemoryAlign::CMemoryAlign (const uint32_t kuiCacheLineSize) // records the alignment later used by WelsMalloc()
+#ifdef MEMORY_MONITOR
+  :	m_nMemoryUsageInBytes (0)
+#endif//MEMORY_MONITOR
+{
+  if ((kuiCacheLineSize == 0) || (kuiCacheLineSize & 0x0f)) // zero or not a multiple of 16: fall back to 16 bytes
+    m_nCacheLineSize	= 0x10;
+  else
+    m_nCacheLineSize	= kuiCacheLineSize;
+  // NOTE(review): a multiple of 16 that is not a power of two (e.g. 48) is accepted here, yet WelsMalloc() uses m_nCacheLineSize - 1 as a bit mask -- confirm callers only pass powers of two
+#ifdef MEMORY_CHECK
+  m_fpMemChkPoint		= fopen ("./enc_mem_check_point.txt",  "wt+"); // NULL when the log file cannot be opened
+  m_nCountRequestNum	= 0;
+#endif//MEMORY_CHECK
+}
+
+CMemoryAlign::~CMemoryAlign() { // leak check (MEMORY_MONITOR) and log shutdown (MEMORY_CHECK)
+#ifdef MEMORY_MONITOR
+  assert (m_nMemoryUsageInBytes == 0); // the usage counter must be back at zero when the allocator goes away
+#endif//MEMORY_MONITOR
+
+#ifdef MEMORY_CHECK
+  if (m_fpMemChkPoint != NULL) // fopen() in the constructor may have failed; fclose (NULL) is undefined behaviour
+    fclose (m_fpMemChkPoint);
+  m_fpMemChkPoint = NULL;
+  m_nCountRequestNum	= 0;
+#endif//MEMORY_CHECK
+}
+
+void* CMemoryAlign::WelsMallocz (const uint32_t kuiSize, const str_t* kpTag) { // WelsMalloc() followed by a zero fill
+  void* pPointer = WelsMalloc (kuiSize, kpTag);
+  if (NULL == pPointer) {
+    return NULL; // allocation failed, nothing to clear
+  }
+  // clear exactly the kuiSize bytes the caller asked for
+  memset (pPointer, 0, kuiSize);
+  // the result is an ordinary WelsMalloc() buffer and is released with WelsFree()
+  return pPointer;
+}
+
+void* CMemoryAlign::WelsMalloc (const uint32_t kuiSize, const str_t* kpTag) { // aligned buffer, or NULL on failure
+  const int32_t kiSizeOfVoidPointer	= sizeof (void**);
+  const int32_t kiSizeOfInt				= sizeof (int32_t);
+  const int32_t kiAlignedBytes		= m_nCacheLineSize - 1; // worst-case padding, also used as the alignment mask
+  const int32_t kiTrialRequestedSize	= kuiSize + kiAlignedBytes + kiSizeOfVoidPointer + kiSizeOfInt;
+#if MEMORY_REQUEST_ALIGN_BYTES
+  // keep the request size aligned to MEMORY_REQUEST_ALIGN_BYTES; the unused extra bytes are counted as payload padding
+  const int32_t kiActualRequestedSize	= WELS_ALIGN (kiTrialRequestedSize, MEMORY_REQUEST_ALIGN_BYTES);
+  const uint32_t kiPayloadSize			= kuiSize + MEMORY_REQUEST_ALIGN_BYTES - (kiTrialRequestedSize &
+                                      (MEMORY_REQUEST_ALIGN_BYTES - 1));
+#else
+  const int32_t kiActualRequestedSize	= kiTrialRequestedSize;
+  const uint32_t kiPayloadSize			= kuiSize;
+#endif//MEMORY_REQUEST_ALIGN_BYTES
+  // kpTag only labels the MEMORY_CHECK log entry below; it has no effect on the allocation itself
+  uint8_t* pBuf		= (uint8_t*) malloc (kiActualRequestedSize);
+#ifdef MEMORY_CHECK
+  if (m_fpMemChkPoint != NULL) {
+    if (kpTag != NULL)
+      fprintf (m_fpMemChkPoint, "WelsMalloc(), 0x%x : actual uiSize:\t%d\tbytes, input uiSize: %d bytes, %d - %s\n",
+               (void*)pBuf, kiActualRequestedSize, kuiSize, m_nCountRequestNum++, kpTag);
+    else
+      fprintf (m_fpMemChkPoint, "WelsMalloc(), 0x%x : actual uiSize:\t%d\tbytes, input uiSize: %d bytes, %d \n", (void*)pBuf,
+               kiActualRequestedSize, kuiSize, m_nCountRequestNum++);
+    fflush (m_fpMemChkPoint);
+  }
+#endif
+  uint8_t* pAlignedBuffer;
+
+  if (NULL == pBuf)
+    return NULL;
+  // layout: [padding][int32_t payload size][void* raw malloc() pointer][aligned payload handed to the caller]
+  pAlignedBuffer = pBuf + kiAlignedBytes + kiSizeOfVoidPointer + kiSizeOfInt;
+  pAlignedBuffer -= ((size_t) pAlignedBuffer & kiAlignedBytes); // size_t keeps every address bit; the former (int32_t) cast truncated 64-bit pointers
+  * ((void**) (pAlignedBuffer - kiSizeOfVoidPointer)) = pBuf; // read back by WelsFree() to find the raw block
+  * ((int32_t*) (pAlignedBuffer - (kiSizeOfVoidPointer + kiSizeOfInt))) = kiPayloadSize;
+
+#ifdef MEMORY_MONITOR
+  m_nMemoryUsageInBytes += kiActualRequestedSize;
+#endif//MEMORY_MONITOR
+
+  return pAlignedBuffer;
+}
+
+void CMemoryAlign::WelsFree (void* pPointer, const str_t* kpTag) { // releases a WelsMalloc() buffer; NULL is ignored
+  if (pPointer) {
+#ifdef MEMORY_MONITOR
+    const int32_t kiMemoryLength = * ((int32_t*) ((uint8_t*)pPointer - sizeof (void**) - sizeof (
+                                        int32_t))) + m_nCacheLineSize - 1 + sizeof (void**) + sizeof (int32_t); // stored payload size + alignment slack + header
+    m_nMemoryUsageInBytes -= kiMemoryLength; // NOTE(review): WelsMalloc() adds kiActualRequestedSize -- confirm both formulas always agree
+#endif//MEMORY_MONITOR
+#ifdef MEMORY_CHECK
+    if (m_fpMemChkPoint != NULL) {
+      if (kpTag != NULL)
+        fprintf (m_fpMemChkPoint, "WelsFree(), 0x%x - %s: \t%d\t bytes \n", (void*) (* (((void**) pPointer) - 1)), kpTag,
+                 kiMemoryLength); // NOTE(review): kiMemoryLength is only declared under MEMORY_MONITOR above -- confirm MEMORY_CHECK implies it
+      else
+        fprintf (m_fpMemChkPoint, "WelsFree(), 0x%x \n", (void*) (* (((void**) pPointer) - 1))); // NOTE(review): 0x%x is given a void* here -- %p looks intended
+      fflush (m_fpMemChkPoint);
+    }
+#endif
+    free (* (((void**) pPointer) - 1)); // the raw malloc() pointer is stored just below the aligned address
+  }
+}
+
+const uint32_t CMemoryAlign::WelsGetCacheLineSize() const { // alignment in bytes as chosen by the constructor
+  return m_nCacheLineSize;
+}
+
+#if defined(MEMORY_MONITOR)
+const uint32_t CMemoryAlign::WelsGetMemoryUsage() const { // running byte total maintained by WelsMalloc()/WelsFree()
+  return m_nMemoryUsageInBytes;
+}
+#endif//MEMORY_MONITOR
+
+} // end of namespace WelsSVCEnc
--- a/codec/encoder/core/src/mv_pred.cpp
+++ b/codec/encoder/core/src/mv_pred.cpp
@@ -1,389 +1,362 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	mv_pred.c
- *
- * \brief	Get MV predictor and update motion vector of mb cache
- *
- * \date	05/22/2009 Created
- *
- *************************************************************************************
- */
-
-#include "mv_pred.h"
-#include "ls_defines.h"
-namespace WelsSVCEnc {
-//basic pMv prediction unit for pMv width (4, 2, 1)
-void PredMv(const SMVComponentUnit* kpMvComp, int8_t iPartIdx, int8_t iPartW, int32_t iRef, SMVUnitXY* sMvp)
-{	
-	const uint8_t kuiLeftIdx		= g_kuiCache30ScanIdx[iPartIdx] - 1;
-	const uint8_t kuiTopIdx		= g_kuiCache30ScanIdx[iPartIdx] - 6;
-
-	int32_t iMatchRef;
-	int32_t iLeftRef = kpMvComp->iRefIndexCache[kuiLeftIdx];
-	int32_t iTopRef  = kpMvComp->iRefIndexCache[ kuiTopIdx];
-	int32_t iRightTopRef = kpMvComp->iRefIndexCache[kuiTopIdx + iPartW];
-	int32_t iDiagonalRef;
-	SMVUnitXY sMvA(kpMvComp->sMotionVectorCache[kuiLeftIdx]);
-	SMVUnitXY sMvB(kpMvComp->sMotionVectorCache[kuiTopIdx]);
-	SMVUnitXY sMvC;
-
-	if (REF_NOT_AVAIL == iRightTopRef) 
-	{
-		iDiagonalRef = kpMvComp->iRefIndexCache[ kuiTopIdx - 1];// left_top;
-		sMvC = kpMvComp->sMotionVectorCache[kuiTopIdx - 1];
-	}
-	else
-	{
-		iDiagonalRef = iRightTopRef;// right_top;
-		sMvC = kpMvComp->sMotionVectorCache[kuiTopIdx + iPartW];
-	}	
-
-	if ((REF_NOT_AVAIL == iTopRef) && (REF_NOT_AVAIL == iDiagonalRef) && iLeftRef != REF_NOT_AVAIL) 
-	{
-		*sMvp = sMvA;
-		return;
-	}
-
-	// b2[diag] b1[top] b0[left] is available!
-	iMatchRef  = (iRef == iLeftRef)	<<MB_LEFT_BIT;
-	iMatchRef |= (iRef == iTopRef)		<<MB_TOP_BIT;
-	iMatchRef |= (iRef == iDiagonalRef)<<MB_TOPRIGHT_BIT;
-	switch(iMatchRef) 
-	{
-		case LEFT_MB_POS:// A
-			*sMvp = sMvA;
-			break;
-		case TOP_MB_POS:// B
-			*sMvp = sMvB;
-			break;
-		case TOPRIGHT_MB_POS:// C or D
-			*sMvp = sMvC;
-			break;
-		default:
-			sMvp->iMvX = WELS_MEDIAN(sMvA.iMvX, sMvB.iMvX, sMvC.iMvX);
-			sMvp->iMvY = WELS_MEDIAN(sMvA.iMvY, sMvB.iMvY, sMvC.iMvY);
-			break;
-	}
-}
-void PredInter8x16Mv(SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* sMvp)
-{
-	const SMVComponentUnit *kpMvComp = &pMbCache->sMvComponents;
-	if (0 == iPartIdx) 
-	{
-		const int8_t kiLeftRef = kpMvComp->iRefIndexCache[6];
-		if (iRef == kiLeftRef)
-		{
-			*sMvp = kpMvComp->sMotionVectorCache[6];
-			return;
-		}		
-	}
-	else // 1 == iPartIdx
-	{
-		int8_t iDiagonalRef = kpMvComp->iRefIndexCache[5]; //top-right
-		int8_t iIndex = 5;
-		if (REF_NOT_AVAIL == iDiagonalRef)
-		{
-			iDiagonalRef = kpMvComp->iRefIndexCache[2]; //top-left for 8*8 block(iIndex 1)
-			iIndex = 2;
-		}
-		if (iRef == iDiagonalRef) 
-		{
-			*sMvp = kpMvComp->sMotionVectorCache[iIndex];
-			return;
-		}	
-	}
-
-	PredMv(kpMvComp, iPartIdx, 2, iRef, sMvp);
-}
-void PredInter16x8Mv(SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* sMvp)
-{
-	const SMVComponentUnit *kpMvComp = &pMbCache->sMvComponents;
-	if (0 == iPartIdx) 
-	{
-		const int8_t kiTopRef = kpMvComp->iRefIndexCache[1];
-		if (iRef == kiTopRef)
-		{
-			*sMvp = kpMvComp->sMotionVectorCache[1];
-			return;
-		}
-	}
-	else // 8 == iPartIdx
-	{
-		const int8_t kiLeftRef = kpMvComp->iRefIndexCache[18];
-		if (iRef == kiLeftRef) 
-		{
-			*sMvp = kpMvComp->sMotionVectorCache[18];
-			return;
-		}
-	}
-
-	PredMv(kpMvComp, iPartIdx, 4, iRef, sMvp);
-}
-void PredSkipMv(SMbCache* pMbCache, SMVUnitXY* sMvp)
-{	
-	const SMVComponentUnit *kpMvComp = &pMbCache->sMvComponents;
-	const int8_t kiLeftRef = kpMvComp->iRefIndexCache[6]; //A
-	const int8_t kiTopRef  = kpMvComp->iRefIndexCache[1]; //B
-
-	if (REF_NOT_AVAIL == kiLeftRef  || REF_NOT_AVAIL == kiTopRef ||
-		(0 == kiLeftRef && 0 == *(int32_t*)(&kpMvComp->sMotionVectorCache[6])) || 
-		(0 == kiTopRef  && 0 == *(int32_t*)(&kpMvComp->sMotionVectorCache[1])) )
-	{
-		ST32( sMvp, 0 );
-		return;
-	}
-
-	PredMv(kpMvComp, 0, 4, 0, sMvp);	
-}
-
-//update pMv and uiRefIndex cache for current MB, only for P_16*16 (SKIP inclusive)
-void UpdateP16x16MotionInfo(SMbCache* pMbCache, SMB* pCurMb, const int8_t kiRef, SMVUnitXY* pMv)
-{
-	// optimized 11/25/2011
-	SMVComponentUnit *pMvComp	= &pMbCache->sMvComponents;
-	const uint32_t kuiMv32			= LD32(pMv);
-	const uint64_t kuiMv64			= BUTTERFLY4x8(kuiMv32);
-	uint64_t uiMvBuf[8]			= { kuiMv64, kuiMv64, kuiMv64, kuiMv64, kuiMv64, kuiMv64, kuiMv64, kuiMv64 };	
-	const uint16_t kuiRef16		= BUTTERFLY1x2(kiRef);
-	const uint32_t kuiRef32		= BUTTERFLY2x4(kuiRef16);
-
-	ST32( pCurMb->pRefIndex, kuiRef32 );
-	// update pMv range from 0~15
-	memcpy( pCurMb->sMv, uiMvBuf, sizeof(uiMvBuf) );	// confirmed_safe_unsafe_usage
-	
-	/*
-	 * blocks 0: 7~10, 1: 13~16, 2: 19~22, 3: 25~28
-	 */
-	pMvComp->iRefIndexCache[7]	= kiRef;
-	ST16(&pMvComp->iRefIndexCache[8], kuiRef16);
-	pMvComp->iRefIndexCache[10]	= kiRef;
-	pMvComp->iRefIndexCache[13]	= kiRef;
-	ST16(&pMvComp->iRefIndexCache[14], kuiRef16);
-	pMvComp->iRefIndexCache[16]	= kiRef;
-	pMvComp->iRefIndexCache[19]	= kiRef;
-	ST16(&pMvComp->iRefIndexCache[20], kuiRef16);
-	pMvComp->iRefIndexCache[22]	= kiRef;
-	pMvComp->iRefIndexCache[25]	= kiRef;
-	ST16(&pMvComp->iRefIndexCache[26], kuiRef16);
-	pMvComp->iRefIndexCache[28]	= kiRef;
-
-	/*
-	* blocks 0: 7~10, 1: 13~16, 2: 19~22, 3: 25~28
-	*/
-	pMvComp->sMotionVectorCache[7]	= *pMv;
-	ST64( &pMvComp->sMotionVectorCache[8], kuiMv64 );
-	pMvComp->sMotionVectorCache[10] = *pMv;	
-	pMvComp->sMotionVectorCache[13] = *pMv;
-	ST64( &pMvComp->sMotionVectorCache[14], kuiMv64 );
-	pMvComp->sMotionVectorCache[16] = *pMv;
-	pMvComp->sMotionVectorCache[19] = *pMv;
-	ST64( &pMvComp->sMotionVectorCache[20], kuiMv64 );
-	pMvComp->sMotionVectorCache[22] = *pMv;
-	pMvComp->sMotionVectorCache[25] = *pMv;
-	ST64( &pMvComp->sMotionVectorCache[26], kuiMv64 );
-	pMvComp->sMotionVectorCache[28] = *pMv;
-}
-
-//update uiRefIndex and pMv of both SMB and Mb_cache, only for P16x8 
-void UpdateP16x8MotionInfo(SMbCache* pMbCache, SMB* pCurMb, const int32_t kiPartIdx, const int8_t kiRef, SMVUnitXY* pMv)
-{
-	// optimized 11/25/2011
-	SMVComponentUnit *pMvComp	= &pMbCache->sMvComponents;
-	const uint32_t kuiMv32			= LD32(pMv);
-	const uint64_t kuiMv64			= BUTTERFLY4x8(kuiMv32);
-	uint64_t uiMvBuf[4]			= { kuiMv64, kuiMv64, kuiMv64, kuiMv64 };
-	const int16_t kiScan4Idx		= g_kuiMbCountScan4Idx[kiPartIdx];
-	const int16_t kiCacheIdx		= g_kuiCache30ScanIdx[kiPartIdx];
-	const int16_t kiCacheIdx1	= 1+kiCacheIdx;
-	const int16_t kiCacheIdx3	= 3+kiCacheIdx;
-	const int16_t kiCacheIdx6	= 6+kiCacheIdx;
-	const int16_t kiCacheIdx7	= 7+kiCacheIdx;
-	const int16_t kiCacheIdx9	= 9+kiCacheIdx;
-	const uint16_t kuiRef16		= BUTTERFLY1x2(kiRef);
-
-	ST16( &pCurMb->pRefIndex[(kiPartIdx>>2)], kuiRef16 );
-	memcpy( &pCurMb->sMv[kiScan4Idx], uiMvBuf, sizeof(uiMvBuf) );	// confirmed_safe_unsafe_usage
-
-	/*
-	* blocks 0: g_kuiCache30ScanIdx[iPartIdx]~g_kuiCache30ScanIdx[iPartIdx]+3, 1: g_kuiCache30ScanIdx[iPartIdx]+6~g_kuiCache30ScanIdx[iPartIdx]+9
-	*/
-	pMvComp->iRefIndexCache[kiCacheIdx]		= kiRef;
-	ST16(&pMvComp->iRefIndexCache[kiCacheIdx1], kuiRef16);
-	pMvComp->iRefIndexCache[kiCacheIdx3]	= kiRef;
-	pMvComp->iRefIndexCache[kiCacheIdx6]	= kiRef;
-	ST16(&pMvComp->iRefIndexCache[kiCacheIdx7], kuiRef16);
-	pMvComp->iRefIndexCache[kiCacheIdx9]	= kiRef;
-
-	/*
-	* blocks 0: g_kuiCache30ScanIdx[iPartIdx]~g_kuiCache30ScanIdx[iPartIdx]+3, 1: g_kuiCache30ScanIdx[iPartIdx]+6~g_kuiCache30ScanIdx[iPartIdx]+9
-	*/
-	pMvComp->sMotionVectorCache[kiCacheIdx]	= *pMv;
-	ST64( &pMvComp->sMotionVectorCache[kiCacheIdx1], kuiMv64 );
-	pMvComp->sMotionVectorCache[kiCacheIdx3]= *pMv;	
-	pMvComp->sMotionVectorCache[kiCacheIdx6]= *pMv;
-	ST64( &pMvComp->sMotionVectorCache[kiCacheIdx7], kuiMv64 );
-	pMvComp->sMotionVectorCache[kiCacheIdx9]= *pMv;
-}
-//update uiRefIndex and pMv of both SMB and Mb_cache, only for P8x16
-void update_P8x16_motion_info(SMbCache* pMbCache, SMB* pCurMb, const int32_t kiPartIdx, const int8_t kiRef, SMVUnitXY* pMv)
-{
-	// optimized 11/25/2011
-	SMVComponentUnit *pMvComp	= &pMbCache->sMvComponents;
-	const uint32_t kuiMv32			= LD32(pMv);
-	const uint64_t kuiMv64			= BUTTERFLY4x8(kuiMv32);
-	const int16_t kiScan4Idx		= g_kuiMbCountScan4Idx[kiPartIdx];
-	const int16_t kiCacheIdx		= g_kuiCache30ScanIdx[kiPartIdx];
-	const int16_t kiCacheIdx1	= 1+kiCacheIdx;
-	const int16_t kiCacheIdx3	= 3+kiCacheIdx;
-	const int16_t kiCacheIdx12	= 12+kiCacheIdx;
-	const int16_t kiCacheIdx13	= 13+kiCacheIdx;
-	const int16_t kiCacheIdx15	= 15+kiCacheIdx;
-	const int16_t kiBlkIdx		= kiPartIdx>>2;
-	const uint16_t kuiRef16		= BUTTERFLY1x2(kiRef);
-		
-	pCurMb->pRefIndex[kiBlkIdx]	= kiRef;
-	pCurMb->pRefIndex[2+kiBlkIdx]= kiRef;
-	ST64( &pCurMb->sMv[kiScan4Idx], kuiMv64 );
-	ST64( &pCurMb->sMv[4+kiScan4Idx], kuiMv64 );
-	ST64( &pCurMb->sMv[8+kiScan4Idx], kuiMv64 );
-	ST64( &pCurMb->sMv[12+kiScan4Idx], kuiMv64 );
-
-	/*
-	* blocks 0: g_kuiCache30ScanIdx[iPartIdx]~g_kuiCache30ScanIdx[iPartIdx]+3, 1: g_kuiCache30ScanIdx[iPartIdx]+6~g_kuiCache30ScanIdx[iPartIdx]+9
-	*/
-	pMvComp->iRefIndexCache[kiCacheIdx]	= kiRef;
-	ST16(&pMvComp->iRefIndexCache[kiCacheIdx1], kuiRef16);
-	pMvComp->iRefIndexCache[kiCacheIdx3]	= kiRef;
-	pMvComp->iRefIndexCache[kiCacheIdx12]	= kiRef;
-	ST16(&pMvComp->iRefIndexCache[kiCacheIdx13], kuiRef16);
-	pMvComp->iRefIndexCache[kiCacheIdx15]	= kiRef;
-
-	/*
-	* blocks 0: g_kuiCache30ScanIdx[iPartIdx]~g_kuiCache30ScanIdx[iPartIdx]+3, 1: g_kuiCache30ScanIdx[iPartIdx]+6~g_kuiCache30ScanIdx[iPartIdx]+9
-	*/
-	pMvComp->sMotionVectorCache[kiCacheIdx]	= *pMv;
-	ST64( &pMvComp->sMotionVectorCache[kiCacheIdx1], kuiMv64 );
-	pMvComp->sMotionVectorCache[kiCacheIdx3] = *pMv;	
-	pMvComp->sMotionVectorCache[kiCacheIdx12] = *pMv;
-	ST64( &pMvComp->sMotionVectorCache[kiCacheIdx13], kuiMv64 );
-	pMvComp->sMotionVectorCache[kiCacheIdx15] = *pMv;
-}
-//update uiRefIndex and pMv of both SMB and Mb_cache, only for P8x8
-void UpdateP8x8MotionInfo(SMbCache* pMbCache, SMB* pCurMb, const int32_t kiPartIdx, const int8_t kiRef, SMVUnitXY* pMv)
-{
-	SMVComponentUnit *pMvComp = &pMbCache->sMvComponents;
-	const uint32_t kuiMv32			= LD32(pMv);
-	const uint64_t kuiMv64			= BUTTERFLY4x8(kuiMv32);
-	const int16_t kiScan4Idx		= g_kuiMbCountScan4Idx[kiPartIdx];
-	const int16_t kiCacheIdx		= g_kuiCache30ScanIdx[kiPartIdx];
-	const int16_t kiCacheIdx1	= 1+kiCacheIdx;
-	const int16_t kiCacheIdx6	= 6+kiCacheIdx;
-	const int16_t kiCacheIdx7	= 7+kiCacheIdx;
-	
-	//mb
-	ST64( &pCurMb->sMv[  kiScan4Idx], kuiMv64 );
-	ST64( &pCurMb->sMv[4+kiScan4Idx], kuiMv64 );
-	
-	//cache
-   	pMvComp->iRefIndexCache[kiCacheIdx ] =
-   	pMvComp->iRefIndexCache[kiCacheIdx1] = 
-   	pMvComp->iRefIndexCache[kiCacheIdx6] =
-   	pMvComp->iRefIndexCache[kiCacheIdx7] = kiRef;
-	pMvComp->sMotionVectorCache[kiCacheIdx ] =
-	pMvComp->sMotionVectorCache[kiCacheIdx1] =
-	pMvComp->sMotionVectorCache[kiCacheIdx6] =
-	pMvComp->sMotionVectorCache[kiCacheIdx7] = *pMv;
-}
-
-//=========================update motion info(MV and ref_idx) into Mb_cache==========================
-//update pMv and uiRefIndex cache only for Mb_cache, only for P_16*16 (SKIP inclusive)
-
-//update uiRefIndex and pMv of only Mb_cache, only for P16x8 
-void UpdateP16x8Motion2Cache(SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* pMv)
-{
-	SMVComponentUnit *pMvComp = &pMbCache->sMvComponents;
-	int32_t i;	
-
-	for (i = 0; i < 2; i++, iPartIdx+=4) 
-	{
-		//cache
-		const uint8_t kuiCacheIdx = g_kuiCache30ScanIdx[iPartIdx];
-
-    	pMvComp->iRefIndexCache[  kuiCacheIdx] =
-    	pMvComp->iRefIndexCache[1+kuiCacheIdx] =
-    	pMvComp->iRefIndexCache[6+kuiCacheIdx] =
-    	pMvComp->iRefIndexCache[7+kuiCacheIdx] = iRef;
-		pMvComp->sMotionVectorCache[  kuiCacheIdx] =
-		pMvComp->sMotionVectorCache[1+kuiCacheIdx] =
-		pMvComp->sMotionVectorCache[6+kuiCacheIdx] =
-		pMvComp->sMotionVectorCache[7+kuiCacheIdx] = *pMv;
-	}	
-}
-//update uiRefIndex and pMv of only Mb_cache, only for P8x16
-void UpdateP8x16Motion2Cache(SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* pMv)
-{
-	SMVComponentUnit *pMvComp = &pMbCache->sMvComponents;
-	int32_t i;
-
-	for (i = 0; i < 2; i++, iPartIdx+=8) 
-	{
-		//cache
-		const uint8_t kuiCacheIdx = g_kuiCache30ScanIdx[iPartIdx];
-
-    	pMvComp->iRefIndexCache[  kuiCacheIdx] =
-    	pMvComp->iRefIndexCache[1+kuiCacheIdx] =
-    	pMvComp->iRefIndexCache[6+kuiCacheIdx] =
-    	pMvComp->iRefIndexCache[7+kuiCacheIdx] = iRef;
-		pMvComp->sMotionVectorCache[  kuiCacheIdx] =
-		pMvComp->sMotionVectorCache[1+kuiCacheIdx] =
-		pMvComp->sMotionVectorCache[6+kuiCacheIdx] =
-		pMvComp->sMotionVectorCache[7+kuiCacheIdx] = *pMv;
-	}	
-}
-
-//update uiRefIndex and pMv of only Mb_cache, only for P8x8
-void UpdateP8x8Motion2Cache(SMbCache* pMbCache, int32_t iPartIdx, int8_t pRef, SMVUnitXY* pMv)
-{
-	SMVComponentUnit *pMvComp = &pMbCache->sMvComponents;
-	const uint8_t kuiCacheIdx = g_kuiCache30ScanIdx[iPartIdx];
-	
-    pMvComp->iRefIndexCache[  kuiCacheIdx] =
-    pMvComp->iRefIndexCache[1+kuiCacheIdx] =
-    pMvComp->iRefIndexCache[6+kuiCacheIdx] =
-    pMvComp->iRefIndexCache[7+kuiCacheIdx] = pRef;
-	pMvComp->sMotionVectorCache[  kuiCacheIdx] =
-	pMvComp->sMotionVectorCache[1+kuiCacheIdx] =
-	pMvComp->sMotionVectorCache[6+kuiCacheIdx] =
-	pMvComp->sMotionVectorCache[7+kuiCacheIdx] = *pMv;
-}
-
-} // namespace WelsSVCEnc 
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	mv_pred.cpp
+ *
+ * \brief	Get MV predictor and update motion vector of mb cache
+ *
+ * \date	05/22/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#include "mv_pred.h"
+#include "ls_defines.h"
+namespace WelsSVCEnc {
+// Basic MV predictor for one partition (width 4, 2 or 1): a single matching neighbour's MV, else the median of A/B/C.
+void PredMv (const SMVComponentUnit* kpMvComp, int8_t iPartIdx, int8_t iPartW, int32_t iRef, SMVUnitXY* sMvp) {
+  const uint8_t kuiLeftIdx		= g_kuiCache30ScanIdx[iPartIdx] - 1;
+  const uint8_t kuiTopIdx		= g_kuiCache30ScanIdx[iPartIdx] - 6;
+  // left = one entry before the partition's cache slot, top = six entries before it (presumably one cache row up)
+  int32_t iMatchRef;
+  int32_t iLeftRef = kpMvComp->iRefIndexCache[kuiLeftIdx];
+  int32_t iTopRef  = kpMvComp->iRefIndexCache[ kuiTopIdx];
+  int32_t iRightTopRef = kpMvComp->iRefIndexCache[kuiTopIdx + iPartW];
+  int32_t iDiagonalRef;
+  SMVUnitXY sMvA (kpMvComp->sMotionVectorCache[kuiLeftIdx]);
+  SMVUnitXY sMvB (kpMvComp->sMotionVectorCache[kuiTopIdx]);
+  SMVUnitXY sMvC;
+  // C is the right-top neighbour when it is available, otherwise the left-top one
+  if (REF_NOT_AVAIL == iRightTopRef) {
+    iDiagonalRef = kpMvComp->iRefIndexCache[ kuiTopIdx - 1];// left_top;
+    sMvC = kpMvComp->sMotionVectorCache[kuiTopIdx - 1];
+  } else {
+    iDiagonalRef = iRightTopRef;// right_top;
+    sMvC = kpMvComp->sMotionVectorCache[kuiTopIdx + iPartW];
+  }
+  // top and diagonal both unavailable but left available: the left MV is the predictor
+  if ((REF_NOT_AVAIL == iTopRef) && (REF_NOT_AVAIL == iDiagonalRef) && iLeftRef != REF_NOT_AVAIL) {
+    *sMvp = sMvA;
+    return;
+  }
+
+  // one bit per neighbour whose reference index equals iRef: b2[diag] b1[top] b0[left] (positions from the MB_*_BIT macros)
+  iMatchRef  = (iRef == iLeftRef)	<< MB_LEFT_BIT;
+  iMatchRef |= (iRef == iTopRef)		<< MB_TOP_BIT;
+  iMatchRef |= (iRef == iDiagonalRef) << MB_TOPRIGHT_BIT;
+  switch (iMatchRef) {
+  case LEFT_MB_POS:// A
+    *sMvp = sMvA;
+    break;
+  case TOP_MB_POS:// B
+    *sMvp = sMvB;
+    break;
+  case TOPRIGHT_MB_POS:// C or D
+    *sMvp = sMvC;
+    break;
+  default: // any other combination: component-wise median of A, B and C
+    sMvp->iMvX = WELS_MEDIAN (sMvA.iMvX, sMvB.iMvX, sMvC.iMvX);
+    sMvp->iMvY = WELS_MEDIAN (sMvA.iMvY, sMvB.iMvY, sMvC.iMvY);
+    break;
+  }
+}
+void PredInter8x16Mv (SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* sMvp) { // MV predictor, 8x16 case
+  const SMVComponentUnit* kpMvComp = &pMbCache->sMvComponents;
+  if (0 == iPartIdx) {
+    const int8_t kiLeftRef = kpMvComp->iRefIndexCache[6]; // cache entry 6 is read as the left neighbour
+    if (iRef == kiLeftRef) { // same reference as the left neighbour: reuse its MV
+      *sMvp = kpMvComp->sMotionVectorCache[6];
+      return;
+    }
+  } else { // 1 == iPartIdx
+    int8_t iDiagonalRef = kpMvComp->iRefIndexCache[5]; //top-right
+    int8_t iIndex = 5;
+    if (REF_NOT_AVAIL == iDiagonalRef) {
+      iDiagonalRef = kpMvComp->iRefIndexCache[2]; //top-left for 8*8 block(iIndex 1)
+      iIndex = 2;
+    }
+    if (iRef == iDiagonalRef) { // same reference as the chosen diagonal neighbour: reuse its MV
+      *sMvp = kpMvComp->sMotionVectorCache[iIndex];
+      return;
+    }
+  }
+  // no directional shortcut applied: fall back to the generic predictor with a partition width of 2
+  PredMv (kpMvComp, iPartIdx, 2, iRef, sMvp);
+}
+void PredInter16x8Mv (SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* sMvp) { // MV predictor, 16x8 case
+  const SMVComponentUnit* kpMvComp = &pMbCache->sMvComponents;
+  if (0 == iPartIdx) {
+    const int8_t kiTopRef = kpMvComp->iRefIndexCache[1]; // cache entry 1 is read as the top neighbour
+    if (iRef == kiTopRef) { // same reference as the top neighbour: reuse its MV
+      *sMvp = kpMvComp->sMotionVectorCache[1];
+      return;
+    }
+  } else { // 8 == iPartIdx
+    const int8_t kiLeftRef = kpMvComp->iRefIndexCache[18]; // cache entry 18 is read as the left neighbour of the lower half
+    if (iRef == kiLeftRef) { // same reference as that left neighbour: reuse its MV
+      *sMvp = kpMvComp->sMotionVectorCache[18];
+      return;
+    }
+  }
+  // no directional shortcut applied: fall back to the generic predictor with a partition width of 4
+  PredMv (kpMvComp, iPartIdx, 4, iRef, sMvp);
+}
+void PredSkipMv (SMbCache* pMbCache, SMVUnitXY* sMvp) { // MV predictor used for skip mode
+  const SMVComponentUnit* kpMvComp = &pMbCache->sMvComponents;
+  const int8_t kiLeftRef = kpMvComp->iRefIndexCache[6]; //A
+  const int8_t kiTopRef  = kpMvComp->iRefIndexCache[1]; //B
+  // zero MV when A or B is unavailable, or when either has reference 0 and an all-zero MV (the MV is compared as one 32-bit word)
+  if (REF_NOT_AVAIL == kiLeftRef  || REF_NOT_AVAIL == kiTopRef ||
+      (0 == kiLeftRef && 0 == * (int32_t*) (&kpMvComp->sMotionVectorCache[6])) ||
+      (0 == kiTopRef  && 0 == * (int32_t*) (&kpMvComp->sMotionVectorCache[1]))) {
+    ST32 (sMvp, 0);
+    return;
+  }
+  // otherwise use the generic predictor for partition index 0, width 4 and reference index 0
+  PredMv (kpMvComp, 0, 4, 0, sMvp);
+}
+
+// Store one reference index and one MV for the whole MB (P_16*16, SKIP inclusive) in both pCurMb and the MB cache.
+void UpdateP16x16MotionInfo (SMbCache* pMbCache, SMB* pCurMb, const int8_t kiRef, SMVUnitXY* pMv) {
+  // optimized 11/25/2011
+  SMVComponentUnit* pMvComp	= &pMbCache->sMvComponents;
+  const uint32_t kuiMv32			= LD32 (pMv);
+  const uint64_t kuiMv64			= BUTTERFLY4x8 (kuiMv32); // MV widened to 64 bits (macro defined elsewhere; presumably two copies)
+  uint64_t uiMvBuf[8]			= { kuiMv64, kuiMv64, kuiMv64, kuiMv64, kuiMv64, kuiMv64, kuiMv64, kuiMv64 };
+  const uint16_t kuiRef16		= BUTTERFLY1x2 (kiRef);
+  const uint32_t kuiRef32		= BUTTERFLY2x4 (kuiRef16);
+
+  ST32 (pCurMb->pRefIndex, kuiRef32);
+  // copy all 64 bytes of uiMvBuf into pCurMb->sMv (entries 0~15 per the original note)
+  memcpy (pCurMb->sMv, uiMvBuf, sizeof (uiMvBuf));	// confirmed_safe_unsafe_usage
+
+  /*
+   * reference index cache, one group per row: 0: 7~10, 1: 13~16, 2: 19~22, 3: 25~28
+   */
+  pMvComp->iRefIndexCache[7]	= kiRef;
+  ST16 (&pMvComp->iRefIndexCache[8], kuiRef16); // 16-bit store starting at entry 8 (presumably entries 8 and 9)
+  pMvComp->iRefIndexCache[10]	= kiRef;
+  pMvComp->iRefIndexCache[13]	= kiRef;
+  ST16 (&pMvComp->iRefIndexCache[14], kuiRef16);
+  pMvComp->iRefIndexCache[16]	= kiRef;
+  pMvComp->iRefIndexCache[19]	= kiRef;
+  ST16 (&pMvComp->iRefIndexCache[20], kuiRef16);
+  pMvComp->iRefIndexCache[22]	= kiRef;
+  pMvComp->iRefIndexCache[25]	= kiRef;
+  ST16 (&pMvComp->iRefIndexCache[26], kuiRef16);
+  pMvComp->iRefIndexCache[28]	= kiRef;
+
+  /*
+  * motion vector cache, same entries: 0: 7~10, 1: 13~16, 2: 19~22, 3: 25~28
+  */
+  pMvComp->sMotionVectorCache[7]	= *pMv;
+  ST64 (&pMvComp->sMotionVectorCache[8], kuiMv64); // 64-bit store starting at entry 8 (presumably entries 8 and 9)
+  pMvComp->sMotionVectorCache[10] = *pMv;
+  pMvComp->sMotionVectorCache[13] = *pMv;
+  ST64 (&pMvComp->sMotionVectorCache[14], kuiMv64);
+  pMvComp->sMotionVectorCache[16] = *pMv;
+  pMvComp->sMotionVectorCache[19] = *pMv;
+  ST64 (&pMvComp->sMotionVectorCache[20], kuiMv64);
+  pMvComp->sMotionVectorCache[22] = *pMv;
+  pMvComp->sMotionVectorCache[25] = *pMv;
+  ST64 (&pMvComp->sMotionVectorCache[26], kuiMv64);
+  pMvComp->sMotionVectorCache[28] = *pMv;
+}
+
+// Store the reference index and MV of one P16x8 partition in both pCurMb (SMB) and the MB cache.
+void UpdateP16x8MotionInfo (SMbCache* pMbCache, SMB* pCurMb, const int32_t kiPartIdx, const int8_t kiRef,
+                            SMVUnitXY* pMv) {
+  // optimized 11/25/2011
+  SMVComponentUnit* pMvComp	= &pMbCache->sMvComponents;
+  const uint32_t kuiMv32			= LD32 (pMv);
+  const uint64_t kuiMv64			= BUTTERFLY4x8 (kuiMv32); // MV widened to 64 bits (macro defined elsewhere; presumably two copies)
+  uint64_t uiMvBuf[4]			= { kuiMv64, kuiMv64, kuiMv64, kuiMv64 };
+  const int16_t kiScan4Idx		= g_kuiMbCountScan4Idx[kiPartIdx]; // first pCurMb->sMv entry of this partition
+  const int16_t kiCacheIdx		= g_kuiCache30ScanIdx[kiPartIdx]; // first cache entry of this partition
+  const int16_t kiCacheIdx1	= 1 + kiCacheIdx;
+  const int16_t kiCacheIdx3	= 3 + kiCacheIdx;
+  const int16_t kiCacheIdx6	= 6 + kiCacheIdx;
+  const int16_t kiCacheIdx7	= 7 + kiCacheIdx;
+  const int16_t kiCacheIdx9	= 9 + kiCacheIdx;
+  const uint16_t kuiRef16		= BUTTERFLY1x2 (kiRef);
+
+  ST16 (&pCurMb->pRefIndex[ (kiPartIdx >> 2)], kuiRef16); // 16-bit store starting at pRefIndex[kiPartIdx >> 2]
+  memcpy (&pCurMb->sMv[kiScan4Idx], uiMvBuf, sizeof (uiMvBuf));	// confirmed_safe_unsafe_usage
+
+  /*
+  * reference index cache rows: 0: kiCacheIdx~kiCacheIdx+3, 1: kiCacheIdx+6~kiCacheIdx+9
+  */
+  pMvComp->iRefIndexCache[kiCacheIdx]		= kiRef;
+  ST16 (&pMvComp->iRefIndexCache[kiCacheIdx1], kuiRef16);
+  pMvComp->iRefIndexCache[kiCacheIdx3]	= kiRef;
+  pMvComp->iRefIndexCache[kiCacheIdx6]	= kiRef;
+  ST16 (&pMvComp->iRefIndexCache[kiCacheIdx7], kuiRef16);
+  pMvComp->iRefIndexCache[kiCacheIdx9]	= kiRef;
+
+  /*
+  * motion vector cache rows: 0: kiCacheIdx~kiCacheIdx+3, 1: kiCacheIdx+6~kiCacheIdx+9
+  */
+  pMvComp->sMotionVectorCache[kiCacheIdx]	= *pMv;
+  ST64 (&pMvComp->sMotionVectorCache[kiCacheIdx1], kuiMv64);
+  pMvComp->sMotionVectorCache[kiCacheIdx3] = *pMv;
+  pMvComp->sMotionVectorCache[kiCacheIdx6] = *pMv;
+  ST64 (&pMvComp->sMotionVectorCache[kiCacheIdx7], kuiMv64);
+  pMvComp->sMotionVectorCache[kiCacheIdx9] = *pMv;
+}
+// Store the reference index and MV of one P8x16 partition in both pCurMb (SMB) and the MB cache.
+void update_P8x16_motion_info (SMbCache* pMbCache, SMB* pCurMb, const int32_t kiPartIdx, const int8_t kiRef,
+                               SMVUnitXY* pMv) {
+  // optimized 11/25/2011
+  SMVComponentUnit* pMvComp	= &pMbCache->sMvComponents;
+  const uint32_t kuiMv32			= LD32 (pMv);
+  const uint64_t kuiMv64			= BUTTERFLY4x8 (kuiMv32); // MV widened to 64 bits (macro defined elsewhere; presumably two copies)
+  const int16_t kiScan4Idx		= g_kuiMbCountScan4Idx[kiPartIdx];
+  const int16_t kiCacheIdx		= g_kuiCache30ScanIdx[kiPartIdx];
+  const int16_t kiCacheIdx1	= 1 + kiCacheIdx;
+  const int16_t kiCacheIdx3	= 3 + kiCacheIdx;
+  const int16_t kiCacheIdx12	= 12 + kiCacheIdx;
+  const int16_t kiCacheIdx13	= 13 + kiCacheIdx;
+  const int16_t kiCacheIdx15	= 15 + kiCacheIdx;
+  const int16_t kiBlkIdx		= kiPartIdx >> 2;
+  const uint16_t kuiRef16		= BUTTERFLY1x2 (kiRef);
+
+  pCurMb->pRefIndex[kiBlkIdx]	= kiRef;
+  pCurMb->pRefIndex[2 + kiBlkIdx] = kiRef;
+  ST64 (&pCurMb->sMv[kiScan4Idx], kuiMv64); // four 64-bit stores, each four sMv entries apart
+  ST64 (&pCurMb->sMv[4 + kiScan4Idx], kuiMv64);
+  ST64 (&pCurMb->sMv[8 + kiScan4Idx], kuiMv64);
+  ST64 (&pCurMb->sMv[12 + kiScan4Idx], kuiMv64);
+
+  /*
+  * reference index cache entries written: kiCacheIdx~kiCacheIdx+3 and kiCacheIdx+12~kiCacheIdx+15
+  */
+  pMvComp->iRefIndexCache[kiCacheIdx]	= kiRef;
+  ST16 (&pMvComp->iRefIndexCache[kiCacheIdx1], kuiRef16);
+  pMvComp->iRefIndexCache[kiCacheIdx3]	= kiRef;
+  pMvComp->iRefIndexCache[kiCacheIdx12]	= kiRef;
+  ST16 (&pMvComp->iRefIndexCache[kiCacheIdx13], kuiRef16);
+  pMvComp->iRefIndexCache[kiCacheIdx15]	= kiRef;
+
+  /*
+  * motion vector cache, same entries. NOTE(review): four-wide writes on two rows here vs. two-wide stores on four rows into sMv above -- confirm this is intended
+  */
+  pMvComp->sMotionVectorCache[kiCacheIdx]	= *pMv;
+  ST64 (&pMvComp->sMotionVectorCache[kiCacheIdx1], kuiMv64);
+  pMvComp->sMotionVectorCache[kiCacheIdx3] = *pMv;
+  pMvComp->sMotionVectorCache[kiCacheIdx12] = *pMv;
+  ST64 (&pMvComp->sMotionVectorCache[kiCacheIdx13], kuiMv64);
+  pMvComp->sMotionVectorCache[kiCacheIdx15] = *pMv;
+}
+// Store the MV of one P8x8 partition in pCurMb (SMB), and its reference index and MV in the MB cache.
+void UpdateP8x8MotionInfo (SMbCache* pMbCache, SMB* pCurMb, const int32_t kiPartIdx, const int8_t kiRef,
+                           SMVUnitXY* pMv) {
+  SMVComponentUnit* pMvComp = &pMbCache->sMvComponents;
+  const uint32_t kuiMv32			= LD32 (pMv);
+  const uint64_t kuiMv64			= BUTTERFLY4x8 (kuiMv32); // MV widened to 64 bits (macro defined elsewhere; presumably two copies)
+  const int16_t kiScan4Idx		= g_kuiMbCountScan4Idx[kiPartIdx];
+  const int16_t kiCacheIdx		= g_kuiCache30ScanIdx[kiPartIdx];
+  const int16_t kiCacheIdx1	= 1 + kiCacheIdx;
+  const int16_t kiCacheIdx6	= 6 + kiCacheIdx;
+  const int16_t kiCacheIdx7	= 7 + kiCacheIdx;
+
+  // per-MB storage: MVs only. NOTE(review): pCurMb->pRefIndex is not written here -- confirm the caller sets it
+  ST64 (&pCurMb->sMv[  kiScan4Idx], kuiMv64);
+  ST64 (&pCurMb->sMv[4 + kiScan4Idx], kuiMv64);
+
+  // cache: the four entries kiCacheIdx, +1, +6 and +7 get the same reference index and MV
+  pMvComp->iRefIndexCache[kiCacheIdx ] =
+    pMvComp->iRefIndexCache[kiCacheIdx1] =
+      pMvComp->iRefIndexCache[kiCacheIdx6] =
+        pMvComp->iRefIndexCache[kiCacheIdx7] = kiRef;
+  pMvComp->sMotionVectorCache[kiCacheIdx ] =
+    pMvComp->sMotionVectorCache[kiCacheIdx1] =
+      pMvComp->sMotionVectorCache[kiCacheIdx6] =
+        pMvComp->sMotionVectorCache[kiCacheIdx7] = *pMv;
+}
+
+//=========================update motion info(MV and ref_idx) into Mb_cache==========================
+//update pMv and uiRefIndex cache only for Mb_cache, only for P_16*16 (SKIP inclusive)
+
+// Write the reference index and MV of one P16x8 partition into the MB cache only (pCurMb is not touched).
+void UpdateP16x8Motion2Cache (SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* pMv) {
+  SMVComponentUnit* pMvComp = &pMbCache->sMvComponents;
+  int32_t i;
+  // two passes: the scan position iPartIdx, then iPartIdx + 4
+  for (i = 0; i < 2; i++, iPartIdx += 4) {
+    // cache entries kuiCacheIdx, +1, +6 and +7 are filled on each pass
+    const uint8_t kuiCacheIdx = g_kuiCache30ScanIdx[iPartIdx];
+
+    pMvComp->iRefIndexCache[  kuiCacheIdx] =
+      pMvComp->iRefIndexCache[1 + kuiCacheIdx] =
+        pMvComp->iRefIndexCache[6 + kuiCacheIdx] =
+          pMvComp->iRefIndexCache[7 + kuiCacheIdx] = iRef;
+    pMvComp->sMotionVectorCache[  kuiCacheIdx] =
+      pMvComp->sMotionVectorCache[1 + kuiCacheIdx] =
+        pMvComp->sMotionVectorCache[6 + kuiCacheIdx] =
+          pMvComp->sMotionVectorCache[7 + kuiCacheIdx] = *pMv;
+  }
+}
+// Write the reference index and MV of one P8x16 partition into the MB cache only (pCurMb is not touched).
+void UpdateP8x16Motion2Cache (SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* pMv) {
+  SMVComponentUnit* pMvComp = &pMbCache->sMvComponents;
+  int32_t i;
+  // two passes: the scan position iPartIdx, then iPartIdx + 8
+  for (i = 0; i < 2; i++, iPartIdx += 8) {
+    // cache entries kuiCacheIdx, +1, +6 and +7 are filled on each pass
+    const uint8_t kuiCacheIdx = g_kuiCache30ScanIdx[iPartIdx];
+
+    pMvComp->iRefIndexCache[  kuiCacheIdx] =
+      pMvComp->iRefIndexCache[1 + kuiCacheIdx] =
+        pMvComp->iRefIndexCache[6 + kuiCacheIdx] =
+          pMvComp->iRefIndexCache[7 + kuiCacheIdx] = iRef;
+    pMvComp->sMotionVectorCache[  kuiCacheIdx] =
+      pMvComp->sMotionVectorCache[1 + kuiCacheIdx] =
+        pMvComp->sMotionVectorCache[6 + kuiCacheIdx] =
+          pMvComp->sMotionVectorCache[7 + kuiCacheIdx] = *pMv;
+  }
+}
+
+// Write the reference index and MV of one P8x8 partition into the MB cache only; pRef is the index value, not a pointer.
+void UpdateP8x8Motion2Cache (SMbCache* pMbCache, int32_t iPartIdx, int8_t pRef, SMVUnitXY* pMv) {
+  SMVComponentUnit* pMvComp = &pMbCache->sMvComponents;
+  const uint8_t kuiCacheIdx = g_kuiCache30ScanIdx[iPartIdx];
+  // cache entries kuiCacheIdx, +1, +6 and +7 all receive the same reference index and MV
+  pMvComp->iRefIndexCache[  kuiCacheIdx] =
+    pMvComp->iRefIndexCache[1 + kuiCacheIdx] =
+      pMvComp->iRefIndexCache[6 + kuiCacheIdx] =
+        pMvComp->iRefIndexCache[7 + kuiCacheIdx] = pRef;
+  pMvComp->sMotionVectorCache[  kuiCacheIdx] =
+    pMvComp->sMotionVectorCache[1 + kuiCacheIdx] =
+      pMvComp->sMotionVectorCache[6 + kuiCacheIdx] =
+        pMvComp->sMotionVectorCache[7 + kuiCacheIdx] = *pMv;
+}
+
+} // namespace WelsSVCEnc
--- a/codec/encoder/core/src/nal_encap.cpp
+++ b/codec/encoder/core/src/nal_encap.cpp
@@ -1,248 +1,235 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	nal_encap.c
- *
- * \brief	NAL pRawNal pData encapsulation
- *
- * \date	5/25/2009	Created
- *
- *************************************************************************************/
-#include "nal_encap.h"
-#include "svc_enc_golomb.h"
-#include "ls_defines.h"
-namespace WelsSVCEnc {
-/*!
- * \brief	load an initialize NAL pRawNal pData	
- */
-void WelsLoadNal( SWelsEncoderOutput *pEncoderOuput, const int32_t/*EWelsNalUnitType*/ kiType, const int32_t/*EWelsNalRefIdc*/ kiNalRefIdc )
-{
-	SWelsEncoderOutput *pWelsEncoderOuput	= pEncoderOuput;
-	SWelsNalRaw *pRawNal			= &pWelsEncoderOuput->sNalList[ pWelsEncoderOuput->iNalIndex ];
-	SNalUnitHeader *sNalHeader	= &pRawNal->sNalExt.sNalHeader;
-	const int32_t kiStartPos		= (BsGetBitsPos(&pWelsEncoderOuput->sBsWrite) >> 3);
-
-	sNalHeader->eNalUnitType	= (EWelsNalUnitType)kiType;
-	sNalHeader->uiNalRefIdc		= (EWelsNalRefIdc)kiNalRefIdc;
-	sNalHeader->uiForbiddenZeroBit	= 0;	
-	
-	pRawNal->pRawData		= &pWelsEncoderOuput->pBsBuffer[kiStartPos];
-	pRawNal->iPayloadSize	= 0;
-}
-
-/*!
- * \brief	unload pRawNal NAL
- */
-void WelsUnloadNal( SWelsEncoderOutput *pEncoderOuput )
-{
-	SWelsEncoderOutput	*pWelsEncoderOuput= pEncoderOuput;
-	int32_t	*pIdx			= &pWelsEncoderOuput->iNalIndex;
-	SWelsNalRaw *pRawNal		= &pWelsEncoderOuput->sNalList[ *pIdx ];
-	const int32_t kiEndPos		= (BsGetBitsPos(&pWelsEncoderOuput->sBsWrite) >> 3);	
-
-	/* count payload size of pRawNal NAL */
-	pRawNal->iPayloadSize	= &pWelsEncoderOuput->pBsBuffer[kiEndPos] - pRawNal->pRawData;
-	
-	++ (*pIdx);
-}
-
-/*!
- * \brief	load an initialize NAL pRawNal pData	
- */
-void WelsLoadNalForSlice( SWelsSliceBs *pSliceBsIn, const int32_t/*EWelsNalUnitType*/ kiType, const int32_t/*EWelsNalRefIdc*/ kiNalRefIdc )
-{
-	SWelsSliceBs *pSliceBs		    = pSliceBsIn;
-	SWelsNalRaw *pRawNal		= &pSliceBs->sNalList[ pSliceBs->iNalIndex ];
-	SNalUnitHeader *sNalHeader	= &pRawNal->sNalExt.sNalHeader;
-	SBitStringAux *pBitStringAux	= &pSliceBs->sBsWrite;
-	const int32_t kiStartPos		    = (BsGetBitsPos(pBitStringAux) >> 3);
-	
-	sNalHeader->eNalUnitType	= (EWelsNalUnitType)kiType;
-	sNalHeader->uiNalRefIdc		= (EWelsNalRefIdc)kiNalRefIdc;
-	sNalHeader->uiForbiddenZeroBit	= 0;
-	
-	pRawNal->pRawData		= &pSliceBs->pBsBuffer[kiStartPos];
-	pRawNal->iPayloadSize	= 0;
-}
-
-/*!
- * \brief	unload pRawNal NAL
- */
-void WelsUnloadNalForSlice( SWelsSliceBs *pSliceBsIn )
-{
-	SWelsSliceBs *pSliceBs	        = pSliceBsIn;
-	int32_t	*pIdx			            = &pSliceBs->iNalIndex;
-	SWelsNalRaw *pRawNal		= &pSliceBs->sNalList[ *pIdx ];
-	SBitStringAux *pBitStringAux	= &pSliceBs->sBsWrite;
-	const int32_t kiEndPos		        = (BsGetBitsPos(pBitStringAux) >> 3);
-	
-	/* count payload size of pRawNal NAL */
-	pRawNal->iPayloadSize	= &pSliceBs->pBsBuffer[kiEndPos] - pRawNal->pRawData;
-	
-	++ (*pIdx);
-}
-
-/*!
- * \brief	encode NAL with emulation forbidden three bytes checking
- * \param	pDst			pDst NAL pData
- * \param	pDstLen		length of pDst NAL output
- * \param	annexeb		annexeb flag
- * \param	pRawNal			pRawNal NAL pData
- * \return	length of pDst NAL
- */
-int32_t WelsEncodeNal( SWelsNalRaw *pRawNal, void *pDst, int32_t *pDstLen )
-{
-	uint8_t *pDstStart	    = (uint8_t *)pDst;
-	uint8_t *pDstPointer	= pDstStart;
-	uint8_t *pSrcPointer	= pRawNal->pRawData;
-	uint8_t *pSrcEnd		= pRawNal->pRawData + pRawNal->iPayloadSize;	
-	int32_t iZeroCount		= 0;
-	int32_t iNalLength		= 0;
-
-    static const uint8_t kuiStartCodePrefix[4] = { 0, 0, 0, 1 };
-    ST32( pDstPointer, LD32(&kuiStartCodePrefix[0]) );
-    pDstPointer += 4;
-
-	/* NAL Unit Header */
-	*pDstPointer++	= ( pRawNal->sNalExt.sNalHeader.uiNalRefIdc << 5 ) | (pRawNal->sNalExt.sNalHeader.eNalUnitType & 0x1f);
-
-	while ( pSrcPointer < pSrcEnd ) {
-		if ( iZeroCount == 2 && *pSrcPointer <= 3 )
-		{
-			*pDstPointer++	= 3;
-			iZeroCount		= 0;
-		}
-		if ( *pSrcPointer == 0 )
-		{
-			++ iZeroCount;
-		}
-		else
-		{
-			iZeroCount		= 0;
-		}
-		*pDstPointer++ = *pSrcPointer++;
-	}
-
-	/* count length of NAL Unit */
-	iNalLength	= pDstPointer - pDstStart;
-	if ( NULL != pDstLen )
-		*pDstLen	= iNalLength;
-	
-	return iNalLength;
-}
-
-/*!
- * \brief	encode a nal into a pBuffer for any type of NAL, involved WelsEncodeNal introduced in AVC
- *
- * \param	pDst			pDst NAL pData
- * \param	pDstLen		length of pDst NAL output
- * \param	annexeb		annexeb flag
- * \param	pRawNal			pRawNal NAL pData
- * \param	pNalHeaderExt	pointer of SNalUnitHeaderExt
- *
- * \return	length of pDst NAL
- */
-int32_t WelsEncodeNalExt( SWelsNalRaw *pRawNal, void *pNalHeaderExt, void *pDst, int32_t *pDstLen )
-{	
-	SNalUnitHeaderExt *sNalExt	= (SNalUnitHeaderExt *)pNalHeaderExt;
-	uint8_t *pDstStart				    = (uint8_t *)pDst;
-	uint8_t *pDstPointer				= pDstStart;
-	uint8_t *pSrcPointer				= pRawNal->pRawData;
-	uint8_t *pSrcEnd					= pRawNal->pRawData + pRawNal->iPayloadSize;	
-	int32_t iZeroCount					= 0;
-	int32_t iNalLength					= 0;
-	
-	if ( pRawNal->sNalExt.sNalHeader.eNalUnitType != NAL_UNIT_PREFIX && pRawNal->sNalExt.sNalHeader.eNalUnitType != NAL_UNIT_CODED_SLICE_EXT )
-	{
-		return WelsEncodeNal( pRawNal, pDst, pDstLen );
-	}
-	
-	/* FIXME this code doesn't check overflow */
-	
-    static const uint8_t kuiStartCodePrefixExt[4]= { 0, 0, 0, 1 };
-    ST32( pDstPointer, LD32(&kuiStartCodePrefixExt[0]) );
-    pDstPointer += 4;
-
-	/* NAL Unit Header */
-	*pDstPointer++	= ( pRawNal->sNalExt.sNalHeader.uiNalRefIdc << 5 ) | (pRawNal->sNalExt.sNalHeader.eNalUnitType & 0x1f);
-
-	/* NAL UNIT Extension Header */
-	*pDstPointer++ =	(0x80) |
-					(sNalExt->bIdrFlag << 6);
-
-	*pDstPointer++ =	(0x80) |
-					(sNalExt->uiDependencyId << 4);
-
-	*pDstPointer++ =	(sNalExt->uiTemporalId << 5) |
-					(sNalExt->bDiscardableFlag << 3) |
-					(0x07);
-	
-	while ( pSrcPointer < pSrcEnd ) {
-		if ( iZeroCount == 2 && *pSrcPointer <= 3 )
-		{
-			*pDstPointer++	= 3;
-			iZeroCount		= 0;
-		}
-		if ( *pSrcPointer == 0 )
-		{
-			++ iZeroCount;
-		}
-		else
-		{
-			iZeroCount		= 0;
-		}
-		*pDstPointer++ = *pSrcPointer++;
-	}
-	
-	/* count length of NAL Unit */
-	iNalLength	= pDstPointer - pDstStart;
-	if ( NULL != pDstLen )
-		*pDstLen	= iNalLength;
-
-	return iNalLength;	
-}
-
-/*!
- * \brief	write prefix nal
- */
-int32_t WelsWriteSVCPrefixNal( SBitStringAux *pBitStringAux, const int32_t kiNalRefIdc,
-						  const bool_t kbIdrFlag )
-{
-	if ( 0 < kiNalRefIdc ){
-		BsWriteOneBit( pBitStringAux, false/*bStoreRefBasePicFlag*/ );
-		BsWriteOneBit( pBitStringAux, false );
-		BsRbspTrailingBits( pBitStringAux );
-		BsFlush( pBitStringAux );
-	}
-	return 0;
-}
-
-} // namespace WelsSVCEnc
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	nal_encap.cpp
+ *
+ * \brief	NAL pRawNal pData encapsulation
+ *
+ * \date	5/25/2009	Created
+ *
+ *************************************************************************************/
+#include "nal_encap.h"
+#include "svc_enc_golomb.h"
+#include "ls_defines.h"
+namespace WelsSVCEnc {
+/*!
+ * \brief	load and initialize the current raw NAL entry (header fields, payload start, payload size 0)
+ */
+void WelsLoadNal (SWelsEncoderOutput* pEncoderOuput, const int32_t/*EWelsNalUnitType*/ kiType,
+                  const int32_t/*EWelsNalRefIdc*/ kiNalRefIdc) {
+  SWelsEncoderOutput* pWelsEncoderOuput	= pEncoderOuput;
+  SWelsNalRaw* pRawNal			= &pWelsEncoderOuput->sNalList[ pWelsEncoderOuput->iNalIndex ]; // entry at the current NAL index
+  SNalUnitHeader* sNalHeader	= &pRawNal->sNalExt.sNalHeader;
+  const int32_t kiStartPos		= (BsGetBitsPos (&pWelsEncoderOuput->sBsWrite) >> 3); // used below as a byte index into pBsBuffer
+
+  sNalHeader->eNalUnitType	= (EWelsNalUnitType)kiType;
+  sNalHeader->uiNalRefIdc		= (EWelsNalRefIdc)kiNalRefIdc;
+  sNalHeader->uiForbiddenZeroBit	= 0;
+
+  pRawNal->pRawData		= &pWelsEncoderOuput->pBsBuffer[kiStartPos]; // payload starts at the current write position
+  pRawNal->iPayloadSize	= 0; // filled in later by WelsUnloadNal
+}
+
+/*!
+ * \brief	unload pRawNal NAL: record its payload size, then advance iNalIndex to the next entry
+ */
+void WelsUnloadNal (SWelsEncoderOutput* pEncoderOuput) {
+  SWelsEncoderOutput*	pWelsEncoderOuput = pEncoderOuput;
+  int32_t*	pIdx			= &pWelsEncoderOuput->iNalIndex;
+  SWelsNalRaw* pRawNal		= &pWelsEncoderOuput->sNalList[ *pIdx ];
+  const int32_t kiEndPos		= (BsGetBitsPos (&pWelsEncoderOuput->sBsWrite) >> 3); // used below as a byte index into pBsBuffer
+
+  /* payload size = distance from pRawData to the current write position in pBsBuffer */
+  pRawNal->iPayloadSize	= &pWelsEncoderOuput->pBsBuffer[kiEndPos] - pRawNal->pRawData;
+
+  ++ (*pIdx); // the next load targets the following sNalList entry
+}
+
+/*!
+ * \brief	load and initialize the current raw NAL entry of a slice bitstream (same steps as WelsLoadNal)
+ */
+void WelsLoadNalForSlice (SWelsSliceBs* pSliceBsIn, const int32_t/*EWelsNalUnitType*/ kiType,
+                          const int32_t/*EWelsNalRefIdc*/ kiNalRefIdc) {
+  SWelsSliceBs* pSliceBs		    = pSliceBsIn;
+  SWelsNalRaw* pRawNal		= &pSliceBs->sNalList[ pSliceBs->iNalIndex ]; // entry at the slice's current NAL index
+  SNalUnitHeader* sNalHeader	= &pRawNal->sNalExt.sNalHeader;
+  SBitStringAux* pBitStringAux	= &pSliceBs->sBsWrite;
+  const int32_t kiStartPos		    = (BsGetBitsPos (pBitStringAux) >> 3); // used below as a byte index into pBsBuffer
+
+  sNalHeader->eNalUnitType	= (EWelsNalUnitType)kiType;
+  sNalHeader->uiNalRefIdc		= (EWelsNalRefIdc)kiNalRefIdc;
+  sNalHeader->uiForbiddenZeroBit	= 0;
+
+  pRawNal->pRawData		= &pSliceBs->pBsBuffer[kiStartPos]; // payload starts at the current write position
+  pRawNal->iPayloadSize	= 0; // filled in later by WelsUnloadNalForSlice
+}
+
+/*!
+ * \brief	unload pRawNal NAL of a slice bitstream: record its payload size, then advance iNalIndex
+ */
+void WelsUnloadNalForSlice (SWelsSliceBs* pSliceBsIn) {
+  SWelsSliceBs* pSliceBs	        = pSliceBsIn;
+  int32_t*	pIdx			            = &pSliceBs->iNalIndex;
+  SWelsNalRaw* pRawNal		= &pSliceBs->sNalList[ *pIdx ];
+  SBitStringAux* pBitStringAux	= &pSliceBs->sBsWrite;
+  const int32_t kiEndPos		        = (BsGetBitsPos (pBitStringAux) >> 3); // used below as a byte index into pBsBuffer
+
+  /* payload size = distance from pRawData to the current write position in pBsBuffer */
+  pRawNal->iPayloadSize	= &pSliceBs->pBsBuffer[kiEndPos] - pRawNal->pRawData;
+
+  ++ (*pIdx); // the next load targets the following sNalList entry
+}
+
+/*!
+ * \brief	encode NAL with emulation forbidden three bytes checking
+ * \param	pRawNal			raw NAL (header fields and payload) to encode
+ * \param	pDst			destination buffer; this function does not check its capacity
+ * \param	pDstLen		optional output, receives the encoded length when not NULL
+ * \note	output layout: 4-byte start code 00 00 00 01, one NAL header byte, then the escaped payload
+ * \return	length of pDst NAL
+ */
+int32_t WelsEncodeNal (SWelsNalRaw* pRawNal, void* pDst, int32_t* pDstLen) {
+  uint8_t* pDstStart	    = (uint8_t*)pDst;
+  uint8_t* pDstPointer	= pDstStart;
+  uint8_t* pSrcPointer	= pRawNal->pRawData;
+  uint8_t* pSrcEnd		= pRawNal->pRawData + pRawNal->iPayloadSize;
+  int32_t iZeroCount		= 0; // number of consecutive zero payload bytes copied so far
+  int32_t iNalLength		= 0;
+
+  static const uint8_t kuiStartCodePrefix[4] = { 0, 0, 0, 1 };
+  ST32 (pDstPointer, LD32 (&kuiStartCodePrefix[0])); // emit the 4-byte start code in one ST32/LD32 pair
+  pDstPointer += 4;
+
+  /* NAL Unit Header: uiNalRefIdc shifted left by 5, low 5 bits of eNalUnitType */
+  *pDstPointer++	= (pRawNal->sNalExt.sNalHeader.uiNalRefIdc << 5) | (pRawNal->sNalExt.sNalHeader.eNalUnitType & 0x1f);
+
+  while (pSrcPointer < pSrcEnd) {
+    if (iZeroCount == 2 && *pSrcPointer <= 3) { // two zeros followed by a byte in 0..3: escape it
+      *pDstPointer++	= 3; // emulation prevention byte goes in before the source byte
+      iZeroCount		= 0;
+    }
+    if (*pSrcPointer == 0) {
+      ++ iZeroCount;
+    } else {
+      iZeroCount		= 0;
+    }
+    *pDstPointer++ = *pSrcPointer++;
+  }
+
+  /* count length of NAL Unit (start code and header byte included) */
+  iNalLength	= pDstPointer - pDstStart;
+  if (NULL != pDstLen)
+    *pDstLen	= iNalLength;
+
+  return iNalLength;
+}
+
+/*!
+ * \brief	encode a nal into a pBuffer for any type of NAL, involved WelsEncodeNal introduced in AVC
+ *
+ * \param	pRawNal			raw NAL (header fields and payload) to encode
+ * \param	pNalHeaderExt	pointer of SNalUnitHeaderExt
+ * \param	pDst			destination buffer; this function does not check its capacity
+ * \param	pDstLen		optional output, receives the encoded length when not NULL
+ * \note	only NAL_UNIT_PREFIX and NAL_UNIT_CODED_SLICE_EXT get the 3 extension bytes; others go to WelsEncodeNal
+ *
+ * \return	length of pDst NAL
+ */
+int32_t WelsEncodeNalExt (SWelsNalRaw* pRawNal, void* pNalHeaderExt, void* pDst, int32_t* pDstLen) {
+  SNalUnitHeaderExt* sNalExt	= (SNalUnitHeaderExt*)pNalHeaderExt;
+  uint8_t* pDstStart				    = (uint8_t*)pDst;
+  uint8_t* pDstPointer				= pDstStart;
+  uint8_t* pSrcPointer				= pRawNal->pRawData;
+  uint8_t* pSrcEnd					= pRawNal->pRawData + pRawNal->iPayloadSize;
+  int32_t iZeroCount					= 0; // number of consecutive zero payload bytes copied so far
+  int32_t iNalLength					= 0;
+
+  if (pRawNal->sNalExt.sNalHeader.eNalUnitType != NAL_UNIT_PREFIX
+      && pRawNal->sNalExt.sNalHeader.eNalUnitType != NAL_UNIT_CODED_SLICE_EXT) {
+    return WelsEncodeNal (pRawNal, pDst, pDstLen); // no extension header for this NAL type
+  }
+
+  /* FIXME this code doesn't check overflow (writes to pDst are not bounded) */
+
+  static const uint8_t kuiStartCodePrefixExt[4] = { 0, 0, 0, 1 };
+  ST32 (pDstPointer, LD32 (&kuiStartCodePrefixExt[0])); // emit the 4-byte start code in one ST32/LD32 pair
+  pDstPointer += 4;
+
+  /* NAL Unit Header: uiNalRefIdc shifted left by 5, low 5 bits of eNalUnitType */
+  *pDstPointer++	= (pRawNal->sNalExt.sNalHeader.uiNalRefIdc << 5) | (pRawNal->sNalExt.sNalHeader.eNalUnitType & 0x1f);
+
+  /* NAL UNIT Extension Header: three bytes, written without emulation checking */
+  *pDstPointer++ =	(0x80) |
+                    (sNalExt->bIdrFlag << 6); // byte 1: top bit set, bIdrFlag at bit 6, low 6 bits zero
+
+  *pDstPointer++ =	(0x80) |
+                    (sNalExt->uiDependencyId << 4); // byte 2: top bit set, uiDependencyId from bit 4, low 4 bits zero
+
+  *pDstPointer++ =	(sNalExt->uiTemporalId << 5) |
+                    (sNalExt->bDiscardableFlag << 3) |
+                    (0x07); // byte 3: uiTemporalId from bit 5, bDiscardableFlag at bit 3, low 3 bits set
+
+  while (pSrcPointer < pSrcEnd) {
+    if (iZeroCount == 2 && *pSrcPointer <= 3) { // two zeros followed by a byte in 0..3: escape it
+      *pDstPointer++	= 3; // emulation prevention byte goes in before the source byte
+      iZeroCount		= 0;
+    }
+    if (*pSrcPointer == 0) {
+      ++ iZeroCount;
+    } else {
+      iZeroCount		= 0;
+    }
+    *pDstPointer++ = *pSrcPointer++;
+  }
+
+  /* count length of NAL Unit (start code and all header bytes included) */
+  iNalLength	= pDstPointer - pDstStart;
+  if (NULL != pDstLen)
+    *pDstLen	= iNalLength;
+
+  return iNalLength;
+}
+
+/*!
+ * \brief	write prefix nal: payload is written only when kiNalRefIdc > 0; kbIdrFlag is unused; returns 0
+ */
+int32_t WelsWriteSVCPrefixNal (SBitStringAux* pBitStringAux, const int32_t kiNalRefIdc,
+                               const bool_t kbIdrFlag) {
+  if (0 < kiNalRefIdc) {
+    BsWriteOneBit (pBitStringAux, false/*bStoreRefBasePicFlag*/);
+    BsWriteOneBit (pBitStringAux, false); // second flag bit, also written as false
+    BsRbspTrailingBits (pBitStringAux);
+    BsFlush (pBitStringAux);
+  }
+  return 0; // constant result on both paths
+}
+
+} // namespace WelsSVCEnc
--- a/codec/encoder/core/src/picture_handle.cpp
+++ b/codec/encoder/core/src/picture_handle.cpp
@@ -1,193 +1,185 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	picture_handle.c
- *
- * \brief	picture pData handling
- *
- * \date	5/20/2009 Created
- *
- *************************************************************************************/
-#include <string.h>
-#include <assert.h>
-#include "picture_handle.h"
-#include "wels_const.h"
-#include "utils.h"
-#include "macros.h"
-
-namespace WelsSVCEnc {
-/*!
- * \brief	alloc picture pData with borders for each plane based width and height of picture
- * \param	cx				width of picture in pixels
- * \param	cy				height of picture in pixels
- * \param	need_data		need pData allocation
- * \pram	need_expand		need borders expanding
- * \return	successful if effective picture pointer returned, otherwise failed with NULL
- */
-SPicture *AllocPicture( CMemoryAlign *pMa, const int32_t kiWidth , const int32_t kiHeight, bool_t bNeedMbInfo )
-{
-	SPicture *pPic = NULL;
-	int32_t iPicWidth = 0;
-	int32_t iPicHeight= 0;
-
-	int32_t iPicChromaWidth	= 0;
-	int32_t iPicChromaHeight	= 0;
-	int32_t iLumaSize			= 0;
-	int32_t iChromaSize			= 0;
-
-	pPic	= static_cast<SPicture*>(pMa->WelsMallocz( sizeof(SPicture), "pPic" ));
-
-	WELS_VERIFY_RETURN_IF( NULL, NULL == pPic );	
-	
-	iPicWidth	= WELS_ALIGN(kiWidth, MB_WIDTH_LUMA) + (PADDING_LENGTH<<1);	// with width of horizon
-	iPicHeight	= WELS_ALIGN(kiHeight, MB_HEIGHT_LUMA) + (PADDING_LENGTH<<1);	// with height of vertical
-	iPicChromaWidth	= iPicWidth >> 1;
-	iPicChromaHeight	= iPicHeight >> 1;
-	iPicWidth	= WELS_ALIGN( iPicWidth, 32 );	// 32(or 16 for chroma below) to match original imp. here instead of cache_line_size
-	iPicChromaWidth	= WELS_ALIGN( iPicChromaWidth, 16 );
-	iLumaSize	= iPicWidth * iPicHeight;
-	iChromaSize	= iPicChromaWidth * iPicChromaHeight;
-
-	pPic->pBuffer	= (uint8_t*)pMa->WelsMalloc(	iLumaSize /* luma */
-								  + (iChromaSize << 1) /* Cb,Cr */
-								  , "pPic->pBuffer"	);
-	WELS_VERIFY_RETURN_PROC_IF( NULL, NULL == pPic->pBuffer, FreePicture(pMa, &pPic) );
-	pPic->iLineSize[0]	= iPicWidth;
-	pPic->iLineSize[1]	= pPic->iLineSize[2]	= iPicChromaWidth;
-	pPic->pData[0]	= pPic->pBuffer + (1+pPic->iLineSize[0]) * PADDING_LENGTH;
-	pPic->pData[1]	= pPic->pBuffer + iLumaSize + ( ((1+pPic->iLineSize[1]) * PADDING_LENGTH) >> 1 );
-	pPic->pData[2]	= pPic->pBuffer + iLumaSize + iChromaSize + ( ((1+pPic->iLineSize[2]) * PADDING_LENGTH) >> 1 );
-
-	pPic->iWidthInPixel	= kiWidth;
-	pPic->iHeightInPixel	= kiHeight;
-	pPic->iFrameNum			= -1;
-
-	pPic->bIsLongRef		= false;
-	pPic->iLongTermPicNum = -1;
-	pPic->uiRecieveConfirmed = 0;
-	pPic->iMarkFrameNum	= -1;
-
-	if ( bNeedMbInfo )
-	{	
-		const uint32_t kuiCountMbNum = ((15+kiWidth) >> 4) * ((15+kiHeight) >> 4);
-
-		pPic->uiRefMbType	= (uint32_t *)pMa->WelsMallocz( kuiCountMbNum * sizeof(uint32_t), "pPic->uiRefMbType" );
-		WELS_VERIFY_RETURN_PROC_IF( NULL, NULL == pPic->uiRefMbType, FreePicture(pMa, &pPic) );	
-
-		pPic->pRefMbQp	= (uint8_t *)pMa->WelsMallocz( kuiCountMbNum * sizeof(uint8_t), "pPic->bgd_mb_qp" );
-		WELS_VERIFY_RETURN_PROC_IF( NULL, NULL == pPic->pRefMbQp, FreePicture(pMa, &pPic) );
-
-		pPic->sMvList           = static_cast<SMVUnitXY *>(pMa->WelsMallocz( kuiCountMbNum*sizeof(SMVUnitXY), "pPic->sMvList" ));
-		WELS_VERIFY_RETURN_PROC_IF( NULL, NULL == pPic->sMvList, FreePicture(pMa, &pPic) );
-
-		pPic->pMbSkipSad       = (int32_t *)pMa->WelsMallocz( kuiCountMbNum*sizeof(int32_t), "pPic->pMbSkipSad" );
-		WELS_VERIFY_RETURN_PROC_IF( NULL, NULL == pPic->pMbSkipSad, FreePicture(pMa, &pPic) );
-	}	
-	
-	return pPic;
-}
-
-/*!
- * \brief	free picture pData planes
- * \param	pPic		picture pointer to be destoryed
- * \return	none
- */
-void FreePicture( CMemoryAlign *pMa, SPicture **ppPic )
-{	
-	if ( NULL != ppPic && NULL != *ppPic )
-	{
-		SPicture *pPic = *ppPic;
-
-		if ( NULL != pPic->pBuffer )
-		{
-			pMa->WelsFree( pPic->pBuffer, "pPic->pBuffer" );
-			pPic->pBuffer = NULL;
-		}
-		pPic->pBuffer		= NULL;
-		pPic->pData[0]	=
-		pPic->pData[1]	=
-		pPic->pData[2]	= NULL;
-		pPic->iLineSize[0] =
-		pPic->iLineSize[1] =
-		pPic->iLineSize[2] = 0;
-
-		pPic->iWidthInPixel		= 0;
-		pPic->iHeightInPixel	= 0;
-		pPic->iFrameNum			= -1;
-
-		pPic->bIsLongRef		= false;
-		pPic->uiRecieveConfirmed  = 0;
-		pPic->iLongTermPicNum  = -1;
-		pPic->iMarkFrameNum		= -1;
-
-		if ( pPic->uiRefMbType)
-		{
-			pMa->WelsFree( pPic->uiRefMbType, "pPic->bgd_mb_type" );
-			pPic->uiRefMbType = NULL;
-		}
-		if ( pPic->pRefMbQp)
-		{
-			pMa->WelsFree( pPic->pRefMbQp, "pPic->bgd_mb_qp" );
-			pPic->pRefMbQp = NULL;
-		}
-
-		if ( pPic->sMvList )
-		{
-			pMa->WelsFree( pPic->sMvList, "pPic->sMvList" );
-			pPic->sMvList = NULL;
-		}
-		if ( pPic->pMbSkipSad )
-		{
-			pMa->WelsFree( pPic->pMbSkipSad, "pPic->pMbSkipSad" );
-			pPic->pMbSkipSad = NULL;
-		}		
-		pMa->WelsFree( *ppPic, "pPic" );
-		*ppPic = NULL;
-	}
-}
-/*!
-* \brief	exchange two picture pData planes
-* \param	ppPic1		picture pointer to picture 1
-* \param	ppPic2		picture pointer to picture 2
-* \return	none
-*/
-void WelsExchangeSpatialPictures( SPicture **ppPic1, SPicture **ppPic2 )
-{
-	SPicture *tmp	= *ppPic1;
-
-	assert( *ppPic1 != *ppPic2 );
-
-	*ppPic1 = *ppPic2;
-	*ppPic2 = tmp;	
-}
-
-} // namespace WelsSVCEnc
-
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	picture_handle.cpp
+ *
+ * \brief	picture pData handling
+ *
+ * \date	5/20/2009 Created
+ *
+ *************************************************************************************/
+#include <string.h>
+#include <assert.h>
+#include "picture_handle.h"
+#include "wels_const.h"
+#include "utils.h"
+#include "macros.h"
+
+namespace WelsSVCEnc {
+/*!
+ * \brief	alloc picture pData with borders for each plane based width and height of picture
+ * \param	pMa				allocator used for the picture struct and all of its buffers
+ * \param	kiWidth			width of picture in pixels
+ * \param	kiHeight		height of picture in pixels
+ * \param	bNeedMbInfo		also allocate the per-MB arrays (uiRefMbType, pRefMbQp, sMvList, pMbSkipSad)
+ * \return	successful if effective picture pointer returned, otherwise failed with NULL
+ */
+SPicture* AllocPicture (CMemoryAlign* pMa, const int32_t kiWidth , const int32_t kiHeight, bool_t bNeedMbInfo) {
+  SPicture* pPic = NULL;
+  int32_t iPicWidth = 0;
+  int32_t iPicHeight = 0;
+
+  int32_t iPicChromaWidth	= 0;
+  int32_t iPicChromaHeight	= 0;
+  int32_t iLumaSize			= 0;
+  int32_t iChromaSize			= 0;
+
+  pPic	= static_cast<SPicture*> (pMa->WelsMallocz (sizeof (SPicture), "pPic"));
+
+  WELS_VERIFY_RETURN_IF (NULL, NULL == pPic);
+
+  iPicWidth	= WELS_ALIGN (kiWidth, MB_WIDTH_LUMA) + (PADDING_LENGTH << 1);	// aligned width plus 2 * PADDING_LENGTH of border
+  iPicHeight	= WELS_ALIGN (kiHeight, MB_HEIGHT_LUMA) + (PADDING_LENGTH << 1);	// aligned height plus 2 * PADDING_LENGTH of border
+  iPicChromaWidth	= iPicWidth >> 1;
+  iPicChromaHeight	= iPicHeight >> 1;
+  iPicWidth	= WELS_ALIGN (iPicWidth,
+                          32);	// 32(or 16 for chroma below) to match original imp. here instead of cache_line_size
+  iPicChromaWidth	= WELS_ALIGN (iPicChromaWidth, 16);
+  iLumaSize	= iPicWidth * iPicHeight;
+  iChromaSize	= iPicChromaWidth * iPicChromaHeight;
+
+  pPic->pBuffer	= (uint8_t*)pMa->WelsMalloc (iLumaSize /* luma */
+                  + (iChromaSize << 1) /* Cb,Cr */
+                  , "pPic->pBuffer");
+  WELS_VERIFY_RETURN_PROC_IF (NULL, NULL == pPic->pBuffer, FreePicture (pMa, &pPic));
+  pPic->iLineSize[0]	= iPicWidth;
+  pPic->iLineSize[1]	= pPic->iLineSize[2]	= iPicChromaWidth;
+  pPic->pData[0]	= pPic->pBuffer + (1 + pPic->iLineSize[0]) * PADDING_LENGTH; // skip PADDING_LENGTH rows and PADDING_LENGTH columns
+  pPic->pData[1]	= pPic->pBuffer + iLumaSize + (((1 + pPic->iLineSize[1]) * PADDING_LENGTH) >> 1); // chroma uses half that offset
+  pPic->pData[2]	= pPic->pBuffer + iLumaSize + iChromaSize + (((1 + pPic->iLineSize[2]) * PADDING_LENGTH) >> 1);
+
+  pPic->iWidthInPixel	= kiWidth;
+  pPic->iHeightInPixel	= kiHeight;
+  pPic->iFrameNum			= -1;
+
+  pPic->bIsLongRef		= false;
+  pPic->iLongTermPicNum = -1;
+  pPic->uiRecieveConfirmed = 0;
+  pPic->iMarkFrameNum	= -1;
+
+  if (bNeedMbInfo) {
+    const uint32_t kuiCountMbNum = ((15 + kiWidth) >> 4) * ((15 + kiHeight) >> 4); // count of 16x16 units, rounded up per axis
+
+    pPic->uiRefMbType	= (uint32_t*)pMa->WelsMallocz (kuiCountMbNum * sizeof (uint32_t), "pPic->uiRefMbType");
+    WELS_VERIFY_RETURN_PROC_IF (NULL, NULL == pPic->uiRefMbType, FreePicture (pMa, &pPic));
+
+    pPic->pRefMbQp	= (uint8_t*)pMa->WelsMallocz (kuiCountMbNum * sizeof (uint8_t), "pPic->bgd_mb_qp");
+    WELS_VERIFY_RETURN_PROC_IF (NULL, NULL == pPic->pRefMbQp, FreePicture (pMa, &pPic));
+
+    pPic->sMvList           = static_cast<SMVUnitXY*> (pMa->WelsMallocz (kuiCountMbNum * sizeof (SMVUnitXY),
+                              "pPic->sMvList"));
+    WELS_VERIFY_RETURN_PROC_IF (NULL, NULL == pPic->sMvList, FreePicture (pMa, &pPic));
+
+    pPic->pMbSkipSad       = (int32_t*)pMa->WelsMallocz (kuiCountMbNum * sizeof (int32_t), "pPic->pMbSkipSad");
+    WELS_VERIFY_RETURN_PROC_IF (NULL, NULL == pPic->pMbSkipSad, FreePicture (pMa, &pPic));
+  }
+
+  return pPic;
+}
+
+/*!
+ * \brief	free picture pData planes, the per-MB arrays and the SPicture itself, all through pMa
+ * \param	ppPic		address of the picture pointer to be destroyed; *ppPic is set to NULL, NULL input is a no-op
+ * \return	none
+ */
+void FreePicture (CMemoryAlign* pMa, SPicture** ppPic) {
+  if (NULL != ppPic && NULL != *ppPic) {
+    SPicture* pPic = *ppPic;
+
+    if (NULL != pPic->pBuffer) {
+      pMa->WelsFree (pPic->pBuffer, "pPic->pBuffer");
+      pPic->pBuffer = NULL;
+    }
+    pPic->pBuffer		= NULL; // repeats the reset above; harmless
+    pPic->pData[0]	=
+      pPic->pData[1]	=
+        pPic->pData[2]	= NULL; // plane pointers pointed into pBuffer
+    pPic->iLineSize[0] =
+      pPic->iLineSize[1] =
+        pPic->iLineSize[2] = 0;
+
+    pPic->iWidthInPixel		= 0;
+    pPic->iHeightInPixel	= 0;
+    pPic->iFrameNum			= -1;
+
+    pPic->bIsLongRef		= false;
+    pPic->uiRecieveConfirmed  = 0;
+    pPic->iLongTermPicNum  = -1;
+    pPic->iMarkFrameNum		= -1;
+
+    if (pPic->uiRefMbType) {
+      pMa->WelsFree (pPic->uiRefMbType, "pPic->bgd_mb_type");
+      pPic->uiRefMbType = NULL;
+    }
+    if (pPic->pRefMbQp) {
+      pMa->WelsFree (pPic->pRefMbQp, "pPic->bgd_mb_qp");
+      pPic->pRefMbQp = NULL;
+    }
+
+    if (pPic->sMvList) {
+      pMa->WelsFree (pPic->sMvList, "pPic->sMvList");
+      pPic->sMvList = NULL;
+    }
+    if (pPic->pMbSkipSad) {
+      pMa->WelsFree (pPic->pMbSkipSad, "pPic->pMbSkipSad");
+      pPic->pMbSkipSad = NULL;
+    }
+    pMa->WelsFree (*ppPic, "pPic"); // release the SPicture struct last, after its members
+    *ppPic = NULL;
+  }
+}
+/*!
+* \brief	exchange two picture pointers; only the pointers are swapped, no picture data is copied
+* \param	ppPic1		picture pointer to picture 1
+* \param	ppPic2		picture pointer to picture 2
+* \return	none
+*/
+void WelsExchangeSpatialPictures (SPicture** ppPic1, SPicture** ppPic2) {
+  SPicture* tmp	= *ppPic1;
+
+  assert (*ppPic1 != *ppPic2); // the two slots must hold different pictures (checked only where assert is active)
+
+  *ppPic1 = *ppPic2;
+  *ppPic2 = tmp;
+}
+
+} // namespace WelsSVCEnc
+
--- a/codec/encoder/core/src/property.cpp
+++ b/codec/encoder/core/src/property.cpp
@@ -1,149 +1,145 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	property.c
- *
- * \brief	CODE name, library module and corresponding version are included
- *
- * \date	03/10/2009 Created
- *
- *************************************************************************************
- */
-#include <string.h>
-#include <stdlib.h>
-#include "property.h"
-#include "crt_util_safe_x.h"	// Safe CRT routines like utils for cross_platforms
-namespace WelsSVCEnc {
-#define WELS_CODE_NAME	"Wels"
-#define WELS_LIB_NAME	"Encoder"
-
-#define WELS_VERSION_INT	0x000001	// v 0.0.1
-#define WELS_VERSION_STR	"0.0.1"
-
-#define WELS_BUILD_NUM		"090420"	// yymmdd
-
-//////////////summary information//////////////
-
-#define WELS_IDENT		WELS_CODE_NAME WELS_LIB_NAME "v" WELS_VERSION_STR "b" WELS_BUILD_NUM
-
-/*!
- * \brief	get code name
- * \param	pBuf	pBuffer to restore code name
- * \param	iSize	size of pBuffer overall
- * \return	actual size of pBuffer used; 0 returned in failure
- */
-int32_t GetCodeName(str_t *pBuf, int32_t iSize)
-{
-	int32_t iLen = 0;
-	
-	if ( NULL == pBuf )
-		return 0;
-	
-	iLen = STRNLEN( WELS_CODE_NAME, 4 );	// confirmed_safe_unsafe_usage
-	if ( iSize <= iLen )
-		return 0;
-
-	pBuf[iLen]	= '\0';
-	STRNCPY( pBuf, iSize, WELS_CODE_NAME, iLen);	// confirmed_safe_unsafe_usage
-
-	return iLen;
-}
-
-/*!
- * \brief	get library/module name
- * \param	pBuf	pBuffer to restore module name
- * \param	iSize	size of pBuffer overall
- * \return	actual size of pBuffer used; 0 returned in failure
- */
-int32_t GetLibName(str_t *pBuf, int32_t iSize)
-{
-	int32_t iLen = 0;
-
-	if ( NULL == pBuf )
-		return 0;
-
-	iLen	= STRNLEN( WELS_LIB_NAME, 7 );	// confirmed_safe_unsafe_usage
-	if ( iSize <= iLen )
-		return 0;
-
-	pBuf[iLen]	= '\0';
-	STRNCPY( pBuf, iSize, WELS_LIB_NAME, iLen );	// confirmed_safe_unsafe_usage
-
-	return iLen;
-}
-
-/*!
- * \brief	get version number
- * \param	pBuf	pBuffer to restore version number
- * \param	iSize	size of pBuffer overall
- * \return	actual size of pBuffer used; 0 returned in failure
- */
-int32_t GetVerNum(str_t *pBuf, int32_t iSize)
-{
-	int32_t iLen = 0;
-	
-	if ( NULL == pBuf )
-		return 0;
-	
-	iLen	= STRNLEN( WELS_VERSION_STR, 5 );	// confirmed_safe_unsafe_usage
-	if ( iSize <= iLen )
-		return 0;
-	
-	pBuf[iLen]	= '\0';
-	STRNCPY( pBuf, iSize, WELS_VERSION_STR, iLen );	// confirmed_safe_unsafe_usage
-	
-	return iLen;
-}
-
-/*!
- * \brief	get identify information
- * \param	pBuf	pBuffer to restore indentify information
- * \param	iSize	size of pBuffer overall
- * \return	actual size of pBuffer used; 0 returned in failure
- */
-int32_t GetIdentInfo(str_t *pBuf, int32_t iSize)
-{
-	int32_t iLen = 0;
-	
-	if ( NULL == pBuf )
-		return 0;
-	
-	iLen	= STRNLEN( WELS_IDENT, 30 );	// confirmed_safe_unsafe_usage
-	if ( iSize <= iLen )
-		return 0;
-	
-	pBuf[iLen]	= '\0';
-	STRNCPY( pBuf, iSize, WELS_IDENT, iLen );	// confirmed_safe_unsafe_usage
-	
-	return iLen;
-}
-
-} // namespace WelsSVCEnc
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	property.cpp
+ *
+ * \brief	CODE name, library module and corresponding version are included
+ *
+ * \date	03/10/2009 Created
+ *
+ *************************************************************************************
+ */
+#include <string.h>
+#include <stdlib.h>
+#include "property.h"
+#include "crt_util_safe_x.h"	// Safe CRT routines like utils for cross_platforms
+namespace WelsSVCEnc {
+#define WELS_CODE_NAME	"Wels"	// 4 characters; copied out by GetCodeName
+#define WELS_LIB_NAME	"Encoder"	// 7 characters; copied out by GetLibName
+
+#define WELS_VERSION_INT	0x000001	// v 0.0.1
+#define WELS_VERSION_STR	"0.0.1"	// 5 characters; copied out by GetVerNum
+
+#define WELS_BUILD_NUM		"090420"	// yymmdd
+
+//////////////summary information//////////////
+
+#define WELS_IDENT		WELS_CODE_NAME WELS_LIB_NAME "v" WELS_VERSION_STR "b" WELS_BUILD_NUM	// "WelsEncoderv0.0.1b090420" (24 characters)
+
+/*!
+ * \brief	copy the code name (WELS_CODE_NAME) into a caller-supplied buffer
+ * \param	pBuf	destination buffer that receives the '\0'-terminated code name
+ * \param	iSize	overall size of pBuf; must be greater than the name length so the terminator fits
+ * \return	length stored in pBuf, terminator not counted; 0 on failure (NULL pBuf or pBuf too small)
+ */
+int32_t GetCodeName (str_t* pBuf, int32_t iSize) {
+  int32_t iLen = 0;
+
+  if (NULL == pBuf)
+    return 0;
+
+  iLen = STRNLEN (WELS_CODE_NAME, 4);	// confirmed_safe_unsafe_usage; bound 4 equals the length of "Wels"
+  if (iSize <= iLen)	// need iLen characters plus one slot for the terminator
+    return 0;
+
+  pBuf[iLen]	= '\0';	// terminator is written explicitly; STRNCPY below is given only iLen as its count
+  STRNCPY (pBuf, iSize, WELS_CODE_NAME, iLen);	// confirmed_safe_unsafe_usage
+
+  return iLen;
+}
+
+/*!
+ * \brief	copy the library/module name (WELS_LIB_NAME) into a caller-supplied buffer
+ * \param	pBuf	destination buffer that receives the '\0'-terminated module name
+ * \param	iSize	overall size of pBuf; must be greater than the name length so the terminator fits
+ * \return	length stored in pBuf, terminator not counted; 0 on failure (NULL pBuf or pBuf too small)
+ */
+int32_t GetLibName (str_t* pBuf, int32_t iSize) {
+  int32_t iLen = 0;
+
+  if (NULL == pBuf)
+    return 0;
+
+  iLen	= STRNLEN (WELS_LIB_NAME, 7);	// confirmed_safe_unsafe_usage; bound 7 equals the length of "Encoder"
+  if (iSize <= iLen)	// need iLen characters plus one slot for the terminator
+    return 0;
+
+  pBuf[iLen]	= '\0';	// terminator is written explicitly; STRNCPY below is given only iLen as its count
+  STRNCPY (pBuf, iSize, WELS_LIB_NAME, iLen);	// confirmed_safe_unsafe_usage
+
+  return iLen;
+}
+
+/*!
+ * \brief	copy the version string (WELS_VERSION_STR) into a caller-supplied buffer
+ * \param	pBuf	destination buffer that receives the '\0'-terminated version string
+ * \param	iSize	overall size of pBuf; must be greater than the string length so the terminator fits
+ * \return	length stored in pBuf, terminator not counted; 0 on failure (NULL pBuf or pBuf too small)
+ */
+int32_t GetVerNum (str_t* pBuf, int32_t iSize) {
+  int32_t iLen = 0;
+
+  if (NULL == pBuf)
+    return 0;
+
+  iLen	= STRNLEN (WELS_VERSION_STR, 5);	// confirmed_safe_unsafe_usage; bound 5 equals the length of "0.0.1"
+  if (iSize <= iLen)	// need iLen characters plus one slot for the terminator
+    return 0;
+
+  pBuf[iLen]	= '\0';	// terminator is written explicitly; STRNCPY below is given only iLen as its count
+  STRNCPY (pBuf, iSize, WELS_VERSION_STR, iLen);	// confirmed_safe_unsafe_usage
+
+  return iLen;
+}
+
+/*!
+ * \brief	copy the identification string (WELS_IDENT: code name, library name, version and build number) into a caller-supplied buffer
+ * \param	pBuf	destination buffer that receives the '\0'-terminated identification string
+ * \param	iSize	overall size of pBuf; must be greater than the string length so the terminator fits
+ * \return	length stored in pBuf, terminator not counted; 0 on failure (NULL pBuf or pBuf too small)
+ */
+int32_t GetIdentInfo (str_t* pBuf, int32_t iSize) {
+  int32_t iLen = 0;
+
+  if (NULL == pBuf)
+    return 0;
+
+  iLen	= STRNLEN (WELS_IDENT, 30);	// confirmed_safe_unsafe_usage; bound 30 is above the 24 characters WELS_IDENT concatenates to
+  if (iSize <= iLen)	// need iLen characters plus one slot for the terminator
+    return 0;
+
+  pBuf[iLen]	= '\0';	// terminator is written explicitly; STRNCPY below is given only iLen as its count
+  STRNCPY (pBuf, iSize, WELS_IDENT, iLen);	// confirmed_safe_unsafe_usage
+
+  return iLen;
+}
+
+} // namespace WelsSVCEnc
--- a/codec/encoder/core/src/ratectl.cpp
+++ b/codec/encoder/core/src/ratectl.cpp
@@ -1,1049 +1,952 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- *  ratectl.c
- *
- *  Abstract
- *      Rate Control
- *
- *  History
- *      9/8/2009 Created
- *    12/26/2011 Modified
- *  
- *
- *
- *************************************************************************/
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-#include "rc.h"
-#include "encoder_context.h"
-#include "utils.h"
-#include "svc_enc_golomb.h"
-
-
-namespace WelsSVCEnc {
-
-//#define _TEST_TEMP_RC_
-#ifdef _TEST_TEMP_RC_
-//#define _NOT_USE_AQ_FOR_TEST_
-FILE *fp_test_rc = NULL;
-FILE *fp_vgop = NULL;
-#endif
-#define _BITS_RANGE 0
-
-void RcInitLayerMemory(SWelsSvcRc *pWelsSvcRc, CMemoryAlign *pMA, const int32_t kiMaxTl)
-{
-	const int32_t kiSliceNum			= pWelsSvcRc->iSliceNum;
-	const int32_t kiGomSize				= pWelsSvcRc->iGomSize;
-	const int32_t kiGomSizeD			= kiGomSize * sizeof(double);
-	const int32_t kiGomSizeI			= kiGomSize * sizeof(int32_t);
-	const int32_t kiLayerRcSize			= kiGomSizeD + (kiGomSizeI*3) + sizeof(SRCSlicing)*kiSliceNum + sizeof(SRCTemporal)*kiMaxTl;	
-	uint8_t *pBaseMem					= (uint8_t *)pMA->WelsMalloc(kiLayerRcSize, "rc_layer_memory");
-
-	if (NULL == pBaseMem)
-		return;	
-
-	pWelsSvcRc->pGomComplexity				= (double *)pBaseMem;
-	pBaseMem += kiGomSizeD;
-	pWelsSvcRc->pGomForegroundBlockNum	= (int32_t *)pBaseMem;
-	pBaseMem += kiGomSizeI;
-	pWelsSvcRc->pCurrentFrameGomSad		= (int32_t *)pBaseMem;
-	pBaseMem += kiGomSizeI;
-	pWelsSvcRc->pGomCost					= (int32_t *)pBaseMem;
-	pBaseMem += kiGomSizeI;
-	pWelsSvcRc->pSlicingOverRc			= (SRCSlicing *)pBaseMem;
-	pBaseMem += sizeof(SRCSlicing)*kiSliceNum;
-	pWelsSvcRc->pTemporalOverRc			= (SRCTemporal *)pBaseMem;
-}
-
-void RcFreeLayerMemory(SWelsSvcRc *pWelsSvcRc, CMemoryAlign *pMA)
-{
-	if (pWelsSvcRc != NULL && pWelsSvcRc->pGomComplexity != NULL)
-	{
-		pMA->WelsFree(pWelsSvcRc->pGomComplexity, "rc_layer_memory");
-		pWelsSvcRc->pGomComplexity			= NULL;
-		pWelsSvcRc->pGomForegroundBlockNum	= NULL;
-		pWelsSvcRc->pCurrentFrameGomSad	= NULL;
-		pWelsSvcRc->pGomCost				= NULL;
-		pWelsSvcRc->pSlicingOverRc			= NULL;
-		pWelsSvcRc->pTemporalOverRc		= NULL;
-	}
-}
-
-static inline double RcConvertQp2QStep(double dQP)
-{	
-	return pow( 2.0, (dQP-4.0)/6.0 );
-}
-static inline double RcConvertQStep2Qp(double dQpStep)
-{
-	return (6 * log(dQpStep) / log(2.0) + 4.0);
-}
-
-void RcInitSequenceParameter(sWelsEncCtx *pEncCtx)
-{
-	SWelsSvcRc *pWelsSvcRc = NULL;
-	SDLayerParam *pDLayerParam = NULL;
-
-	int32_t j = 0;
-	int32_t iMbWidth = 0;
-
-	BOOL_T bMultiSliceMode = FALSE;
-	int32_t iGomRowMode0 = 1, iGomRowMode1 = 1;
-#ifdef _TEST_TEMP_RC_
-	fp_test_rc = fopen("testRC.dat","w");
-	fp_vgop = fopen("vgop.dat","w");
-#endif
-	for( j=0; j<pEncCtx->pSvcParam->iNumDependencyLayer; j++ )
-	{
-		SSliceCtx *pSliceCtx = &pEncCtx->pSliceCtxList[j];
-		pWelsSvcRc  = &pEncCtx->pWelsSvcRc[j];
-		pDLayerParam = &pEncCtx->pSvcParam->sDependencyLayers[j];
-		iMbWidth     = (pDLayerParam->iFrameWidth>>4);
-		pWelsSvcRc->iNumberMbFrame = iMbWidth*(pDLayerParam->iFrameHeight>>4);
-		pWelsSvcRc->iSliceNum= pSliceCtx->iSliceNumInFrame;
-
-		pWelsSvcRc->iRcVaryPercentage = _BITS_RANGE;	// % -- for temp
-		pWelsSvcRc->dRcVaryRatio = (double)pWelsSvcRc->iRcVaryPercentage/MAX_BITS_VARY_PERCENTAGE;
-
-		pWelsSvcRc->dSkipBufferRatio  = SKIP_RATIO;
-
-		pWelsSvcRc->iQpRangeUpperInFrame = QP_RANGE_UPPER_MODE1 - (int32_t)((QP_RANGE_UPPER_MODE1 - QP_RANGE_MODE0)*pWelsSvcRc->dRcVaryRatio + 0.5);
-		pWelsSvcRc->iQpRangeLowerInFrame = QP_RANGE_LOWER_MODE1 - (int32_t)((QP_RANGE_LOWER_MODE1 - QP_RANGE_MODE0)*pWelsSvcRc->dRcVaryRatio + 0.5);
-
-		if( iMbWidth<=MB_WIDTH_THRESHOLD_90P )
-		{
-			pWelsSvcRc->iSkipQpValue = SKIP_QP_90P;
-			iGomRowMode0 = GOM_ROW_MODE0_90P;
-			iGomRowMode1 = GOM_ROW_MODE1_90P;
-		}
-		else if( iMbWidth<=MB_WIDTH_THRESHOLD_180P )
-		{
-			pWelsSvcRc->iSkipQpValue = SKIP_QP_180P;
-			iGomRowMode0 = GOM_ROW_MODE0_180P;
-			iGomRowMode1 = GOM_ROW_MODE1_180P;
-		}
-		else if( iMbWidth<=MB_WIDTH_THRESHOLD_360P )
-		{
-			pWelsSvcRc->iSkipQpValue = SKIP_QP_360P;
-			iGomRowMode0 = GOM_ROW_MODE0_360P;
-			iGomRowMode1 = GOM_ROW_MODE1_360P;
-		}
-		else
-		{
-			pWelsSvcRc->iSkipQpValue = SKIP_QP_720P;
-			iGomRowMode0 = GOM_ROW_MODE0_720P;
-			iGomRowMode1 = GOM_ROW_MODE1_720P;				
-		}
-		iGomRowMode0 = iGomRowMode1 + (int32_t)((iGomRowMode0 - iGomRowMode1)*pWelsSvcRc->dRcVaryRatio + 0.5);
-
-		pWelsSvcRc->iNumberMbGom   = iMbWidth*iGomRowMode0;
-
-		pWelsSvcRc->iMinQp = GOM_MIN_QP_MODE;
-		pWelsSvcRc->iMaxQp = GOM_MAX_QP_MODE;
-		
-		pWelsSvcRc->iFrameDeltaQpUpper = LAST_FRAME_QP_RANGE_UPPER_MODE1 - (int32_t)((LAST_FRAME_QP_RANGE_UPPER_MODE1 - LAST_FRAME_QP_RANGE_UPPER_MODE0)*pWelsSvcRc->dRcVaryRatio + 0.5);
-		pWelsSvcRc->iFrameDeltaQpLower = LAST_FRAME_QP_RANGE_LOWER_MODE1 - (int32_t)((LAST_FRAME_QP_RANGE_LOWER_MODE1 - LAST_FRAME_QP_RANGE_LOWER_MODE0)*pWelsSvcRc->dRcVaryRatio + 0.5);
-
-		pWelsSvcRc->iSkipFrameNum = 0;
-		pWelsSvcRc->iGomSize = (pWelsSvcRc->iNumberMbFrame+pWelsSvcRc->iNumberMbGom-1)/pWelsSvcRc->iNumberMbGom;
-	
-
-		RcInitLayerMemory( pWelsSvcRc, pEncCtx->pMemAlign, 1+pDLayerParam->iHighestTemporalId );
-
-		bMultiSliceMode	= ( (SM_RASTER_SLICE == pDLayerParam->sMso.uiSliceMode) || 
-			(SM_ROWMB_SLICE	 == pDLayerParam->sMso.uiSliceMode) || 
-			(SM_DYN_SLICE	 == pDLayerParam->sMso.uiSliceMode)	);
-		if( bMultiSliceMode )
-			pWelsSvcRc->iNumberMbGom = pWelsSvcRc->iNumberMbFrame;
-	}
-}
-
-
-void RcInitTlWeight(sWelsEncCtx *pEncCtx)
-{
-	SWelsSvcRc *pWelsSvcRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
-	SRCTemporal *pTOverRc	= pWelsSvcRc->pTemporalOverRc;
-	SDLayerParam *pDLayerParam =  &pEncCtx->pSvcParam->sDependencyLayers[pEncCtx->uiDependencyId];
-	const int32_t kiDecompositionStages = pDLayerParam->iDecompositionStages;
-	const int32_t kiHighestTid = pDLayerParam->iHighestTemporalId;
-
-	//Index 0:Virtual GOP size, Index 1:Frame rate
-	double WeightArray[4][4] = { {1.0, 0, 0, 0}, {0.6, 0.4, 0, 0}, {0.4, 0.3, 0.15, 0}, {0.25, 0.15, 0.125, 0.0875}};
-	const int32_t kiGopSize = (1<<kiDecompositionStages);
-	int32_t i, k, n;
-
-	n = 0;
-	while (n <= kiHighestTid)
-	{
-		pTOverRc[n].dTlayerWeight	= WeightArray[kiDecompositionStages][n];
-		++ n;
-	}
-	//Calculate the frame index for the current frame and its reference frame
-	for( n=0; n<VGOP_SIZE; n+=kiGopSize )
-	{
-		pWelsSvcRc->iTlOfFrames[n] = 0;
-		for( i=1; i<=kiDecompositionStages; i++ )
-		{
-			for( k=1<<(kiDecompositionStages-i); k<kiGopSize; k+=(kiGopSize>>(i-1)) )
-			{
-				pWelsSvcRc->iTlOfFrames[k+n]=i;
-			}
-		}
-	}
-	pWelsSvcRc->iPreviousGopSize = kiGopSize;
-	pWelsSvcRc->iGopNumberInVGop = VGOP_SIZE/kiGopSize;
-}
-
-void RcUpdateBitrateFps(sWelsEncCtx *pEncCtx)
-{
-	SWelsSvcRc *pWelsSvcRc	= &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
-	SRCTemporal *pTOverRc		= pWelsSvcRc->pTemporalOverRc;
-	SDLayerParam *pDLayerParam     = &pEncCtx->pSvcParam->sDependencyLayers[pEncCtx->uiDependencyId];
-	const int32_t kiGopSize	= (1<<pDLayerParam->iDecompositionStages);	
-	const int32_t kiHighestTid = pDLayerParam->iHighestTemporalId;	
-	double input_dBitsPerFrame = pDLayerParam->iSpatialBitrate / pDLayerParam->fInputFrameRate;
-	const int32_t kiGopBits	= (int32_t)(input_dBitsPerFrame*kiGopSize);
-	int32_t i;
-
-	pWelsSvcRc->iBitRate   = pDLayerParam->iSpatialBitrate; 
-	pWelsSvcRc->fFrameRate = pDLayerParam->fInputFrameRate;	
-	
-	double dTargetVaryRange = FRAME_iTargetBits_VARY_RANGE*(1.0 - pWelsSvcRc->dRcVaryRatio);
-	double dMinBitsRatio = 1.0 - dTargetVaryRange;
-	double dMaxBitsRatio = 1.0 + FRAME_iTargetBits_VARY_RANGE;//dTargetVaryRange;
-
-	for( i=0; i<=kiHighestTid; i++)
-	{
-		const double kdConstraitBits = kiGopBits*pTOverRc[i].dTlayerWeight;	
-		pTOverRc[i].iMinBitsTl = (int32_t)(kdConstraitBits*dMinBitsRatio);
-		pTOverRc[i].iMaxBitsTl = (int32_t)(kdConstraitBits*dMaxBitsRatio);
-	}
-	//When bitrate is changed, pBuffer size should be updated
-	pWelsSvcRc->iBufferSizeSkip = (int32_t)(pWelsSvcRc->iBitRate * pWelsSvcRc->dSkipBufferRatio);
-	pWelsSvcRc->iBufferSizePadding = (int32_t)(pWelsSvcRc->iBitRate * PADDING_BUFFER_RATIO);
-
-	//change remaining bits
-	if(pWelsSvcRc->dBitsPerFrame > 0.1)
-		pWelsSvcRc->iRemainingBits = (int32_t)(pWelsSvcRc->iRemainingBits*input_dBitsPerFrame/pWelsSvcRc->dBitsPerFrame);
-	pWelsSvcRc->dBitsPerFrame = input_dBitsPerFrame;
-}
-
-
-void RcInitVGop(sWelsEncCtx *pEncCtx)
-{
-	const int32_t kiDid		= pEncCtx->uiDependencyId;
-	SWelsSvcRc *pWelsSvcRc = &pEncCtx->pWelsSvcRc[kiDid];
-	SRCTemporal *pTOverRc		= pWelsSvcRc->pTemporalOverRc;
-	const int32_t kiHighestTid = pEncCtx->pSvcParam->sDependencyLayers[kiDid].iHighestTemporalId;
-
-	pWelsSvcRc->iRemainingBits = (int32_t)(VGOP_SIZE*pWelsSvcRc->dBitsPerFrame);
-	pWelsSvcRc->dRemainingWeights = pWelsSvcRc->iGopNumberInVGop;
-
-	pWelsSvcRc->iFrameCodedInVGop = 0;
-	pWelsSvcRc->iGopIndexInVGop = 0;
-
-	for (int32_t i = 0; i <= kiHighestTid; ++ i)
-		pTOverRc[i].iGopBitsDq = 0;
-	pWelsSvcRc->iSkipFrameInVGop=0;
-}
-
-void RcInitRefreshParameter(sWelsEncCtx *pEncCtx)
-{
-	const int32_t kiDid		  = pEncCtx->uiDependencyId;
-	SWelsSvcRc *pWelsSvcRc   = &pEncCtx->pWelsSvcRc[kiDid];
-	SRCTemporal *pTOverRc		  = pWelsSvcRc->pTemporalOverRc;
-	SDLayerParam *pDLayerParam       = &pEncCtx->pSvcParam->sDependencyLayers[kiDid];
-	const int32_t kiHighestTid = pDLayerParam->iHighestTemporalId;
-	int32_t i;
-
-	//I frame R-Q Model
-	pWelsSvcRc->iIntraComplexity = 0;
-	pWelsSvcRc->iIntraMbCount = 0;
-
-	//P frame R-Q Model
-	for(i=0; i<=kiHighestTid; i++)
-	{
-		pTOverRc[i].iPFrameNum = 0;
-		pTOverRc[i].dLinearCmplx = 0.0;
-		pTOverRc[i].iFrameCmplxMean = 0;
-	}
-
-	pWelsSvcRc->iBufferFullnessSkip = 0;
-	pWelsSvcRc->iBufferFullnessPadding = 0;
-
-	pWelsSvcRc->iGopIndexInVGop = 0;
-	pWelsSvcRc->iRemainingBits = 0;
-	pWelsSvcRc->dBitsPerFrame	= 0.0;
-
-	//Backup the initial bitrate and fps
-	pWelsSvcRc->iPreviousBitrate  = pDLayerParam->iSpatialBitrate;
-	pWelsSvcRc->dPreviousFps      = pDLayerParam->fInputFrameRate;	
-
-	memset( pWelsSvcRc->pCurrentFrameGomSad, 0, pWelsSvcRc->iGomSize*sizeof(int32_t) );
-
-	RcInitTlWeight(pEncCtx);
-	RcUpdateBitrateFps(pEncCtx);
-	RcInitVGop(pEncCtx);
-}
-
-bool_t RcJudgeBitrateFpsUpdate(sWelsEncCtx *pEncCtx)
-{
-	int32_t iCurDid = pEncCtx->uiDependencyId;
-	SWelsSvcRc *pWelsSvcRc       = &pEncCtx->pWelsSvcRc[iCurDid];
-	SDLayerParam *pDLayerParam    = &pEncCtx->pSvcParam->sDependencyLayers[iCurDid];
-
-	if((pWelsSvcRc->iPreviousBitrate != pDLayerParam->iSpatialBitrate) ||
-		(pWelsSvcRc->dPreviousFps-pDLayerParam->fInputFrameRate)>EPSN ||
-		(pWelsSvcRc->dPreviousFps-pDLayerParam->fInputFrameRate)<-EPSN)
-	{
-		pWelsSvcRc->iPreviousBitrate = pDLayerParam->iSpatialBitrate;
-		pWelsSvcRc->dPreviousFps = pDLayerParam->fInputFrameRate;
-		return true;
-	}
-	else
-		return false;
-}
-
-#if GOM_TRACE_FLAG
-void RcTraceVGopBitrate(sWelsEncCtx *pEncCtx)
-{
-	const int32_t kiDid				= pEncCtx->uiDependencyId;
-	SWelsSvcRc *pWelsSvcRc			= &pEncCtx->pWelsSvcRc[kiDid];
-
-	if( pWelsSvcRc->iFrameCodedInVGop )
-	{
-		const int32_t kiHighestTid	= pEncCtx->pSvcParam->sDependencyLayers[kiDid].iHighestTemporalId;
-		SRCTemporal *pTOverRc			= pWelsSvcRc->pTemporalOverRc;
-		int32_t iVGopBitrate;
-		int32_t	iTotalBits = pWelsSvcRc->iPaddingBitrateStat;
-		int32_t iTid = 0;
-		while (iTid <= kiHighestTid)
-		{
-			iTotalBits += pTOverRc[iTid].iGopBitsDq;
-			++ iTid;
-		}
-		int32_t iFrameInVGop = pWelsSvcRc->iFrameCodedInVGop+pWelsSvcRc->iSkipFrameInVGop;
-		if(0 != iFrameInVGop)			
-			iVGopBitrate = (int32_t)( iTotalBits/iFrameInVGop *pWelsSvcRc->fFrameRate );
-#ifdef _TEST_TEMP_Rc_
-		fprintf(fp_vgop,"%d\n",(int32_t)((double)iTotalBits/iFrameInVGop));
-#endif
-		WelsLog( pEncCtx, WELS_LOG_INFO,"[Rc] VGOPbitrate%d: %d \n", kiDid, iVGopBitrate);
-		if ( iTotalBits > 0 )
-		{
-			iTid = 0;
-			while (iTid <= kiHighestTid)
-			{
-				WelsLog( pEncCtx, WELS_LOG_INFO,"T%d=%8.3f \n", iTid, (double)(pTOverRc[iTid].iGopBitsDq/iTotalBits) );
-				++ iTid;
-			}			
-		}		
-	}
-}
-#endif
-
-void RcUpdateTemporalZero(sWelsEncCtx *pEncCtx)
-{
-	const int32_t kiDid		= pEncCtx->uiDependencyId;
-	SWelsSvcRc *pWelsSvcRc	= &pEncCtx->pWelsSvcRc[kiDid];
-	SDLayerParam *pDLayerParam		= &pEncCtx->pSvcParam->sDependencyLayers[kiDid];
-	const int32_t kiGopSize	= (1<<pDLayerParam->iDecompositionStages);
-
-	if( pWelsSvcRc->iPreviousGopSize  != kiGopSize )
-	{
-#if GOM_TRACE_FLAG
-		RcTraceVGopBitrate(pEncCtx);
-#endif
-		RcInitTlWeight(pEncCtx);
-		RcInitVGop(pEncCtx);		
-	}
-	else if( pWelsSvcRc->iGopIndexInVGop == pWelsSvcRc->iGopNumberInVGop || pEncCtx->eSliceType == I_SLICE)
-	{
-#if GOM_TRACE_FLAG
-		RcTraceVGopBitrate(pEncCtx);
-#endif
-		RcInitVGop(pEncCtx);
-	}
-	pWelsSvcRc->iGopIndexInVGop++;
-}
-
-
-void RcInitIdrQp(sWelsEncCtx *pEncCtx)
-{
-	double dBpp = 0;
-	int32_t i;
-
-	//64k@6fps for 90p:     bpp 0.74    QP:24
-	//192k@12fps for 180p:  bpp 0.28    QP:26
-	//512k@24fps for 360p:  bpp 0.09    QP:30
-	//1500k@30fps for 720p: bpp 0.05    QP:32
-	double dBppArray[4][3] = {{0.5, 0.75, 1.0}, {0.2, 0.3, 0.4}, {0.05, 0.09, 0.13}, {0.03, 0.06, 0.1}};
-	int32_t dInitialQPArray[4][4] = {{28, 26, 24, 22}, {30, 28, 26, 24}, {32, 30, 28, 26}, {34, 32, 30, 28}};
-	int32_t iBppIndex = 0;
-
-	SWelsSvcRc *pWelsSvcRc		= &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
-	SDLayerParam *pDLayerParam			= &pEncCtx->pSvcParam->sDependencyLayers[pEncCtx->uiDependencyId];
-
-	if (pDLayerParam->fOutputFrameRate > EPSN && pDLayerParam->iFrameWidth && pDLayerParam->iFrameHeight)
-		dBpp=(double)(pDLayerParam->iSpatialBitrate) / (double)(pDLayerParam->fOutputFrameRate * pDLayerParam->iFrameWidth * pDLayerParam->iFrameHeight);
-	else
-		dBpp = 0.1;
-
-	//Area*2
-	if ( pDLayerParam->iFrameWidth*pDLayerParam->iFrameHeight <= 28800 ) // 90p video:160*90
-		iBppIndex = 0;
-	else if ( pDLayerParam->iFrameWidth*pDLayerParam->iFrameHeight <= 115200 ) // 180p video:320*180
-		iBppIndex = 1;
-	else if ( pDLayerParam->iFrameWidth*pDLayerParam->iFrameHeight <= 460800 ) // 360p video:640*360
-		iBppIndex = 2;
-	else
-		iBppIndex = 3;
-
-	//Search
-	for( i=0; i<3; i++ )
-	{
-		if ( dBpp<=dBppArray[iBppIndex][i] )
-			break;
-	}
-	pWelsSvcRc->iInitialQp = dInitialQPArray[iBppIndex][i];
-	pWelsSvcRc->iInitialQp = (int32_t)WELS_CLIP3( pWelsSvcRc->iInitialQp, MIN_IDR_QP, MAX_IDR_QP );
-	pEncCtx->iGlobalQp = pWelsSvcRc->iInitialQp;
-	pWelsSvcRc->dQStep = RcConvertQp2QStep(pEncCtx->iGlobalQp);
-	pWelsSvcRc->iLastCalculatedQScale = pEncCtx->iGlobalQp;
-}
-
-void RcCalculateIdrQp(sWelsEncCtx *pEncCtx)
-{
-	SWelsSvcRc *pWelsSvcRc			= &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
-	//obtain the idr qp using previous idr complexity
-	if(pWelsSvcRc->iNumberMbFrame != pWelsSvcRc->iIntraMbCount){
-		pWelsSvcRc->iIntraComplexity = (int32_t)((double)pWelsSvcRc->iIntraComplexity*pWelsSvcRc->iNumberMbFrame/pWelsSvcRc->iIntraMbCount + 0.5);		
-	}
-	pWelsSvcRc->iInitialQp = (int32_t)RcConvertQStep2Qp( (double)pWelsSvcRc->iIntraComplexity/pWelsSvcRc->iTargetBits);
-	pWelsSvcRc->iInitialQp = (int32_t)WELS_CLIP3( pWelsSvcRc->iInitialQp, MIN_IDR_QP, MAX_IDR_QP );
-	pEncCtx->iGlobalQp = pWelsSvcRc->iInitialQp;
-	pWelsSvcRc->dQStep = RcConvertQp2QStep(pEncCtx->iGlobalQp);
-	pWelsSvcRc->iLastCalculatedQScale = pEncCtx->iGlobalQp;
-}
-
-
-void RcCalculatePictureQp(sWelsEncCtx *pEncCtx)
-{
-	SWelsSvcRc *pWelsSvcRc		= &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
-	int32_t iTl					= pEncCtx->uiTemporalId;
-	SRCTemporal *pTOverRc			= &pWelsSvcRc->pTemporalOverRc[iTl];
-	int32_t iLumaQp = 0;
-
-	if(0 == pTOverRc->iPFrameNum)
-	{
-		iLumaQp = pWelsSvcRc->iInitialQp;	
-	}
-	else{
-		double dCmplxRatio = (double)pEncCtx->pVaa->sComplexityAnalysisParam.iFrameComplexity/pTOverRc->iFrameCmplxMean;
-		dCmplxRatio = WELS_CLIP3(dCmplxRatio, 1.0-FRAME_CMPLX_RATIO_RANGE, 1.0+FRAME_CMPLX_RATIO_RANGE);
-		
-		pWelsSvcRc->dQStep = pTOverRc->dLinearCmplx*dCmplxRatio / pWelsSvcRc->iTargetBits;
-		iLumaQp = (int32_t)( RcConvertQStep2Qp( pWelsSvcRc->dQStep )+0.5 );
-
-		//limit QP
-		int32_t iLastIdxCodecInVGop = pWelsSvcRc->iFrameCodedInVGop - 1;
-		if(iLastIdxCodecInVGop < 0)
-			iLastIdxCodecInVGop += VGOP_SIZE;
-		int32_t iTlLast = pWelsSvcRc->iTlOfFrames[iLastIdxCodecInVGop];
-		int32_t iDeltaQpTemporal = iTl - iTlLast;
-		if(0 == iTlLast && iTl > 0)
-			iDeltaQpTemporal += 3;
-		else if(0 == iTl && iTlLast > 0)
-			iDeltaQpTemporal -= 3;		
-
-		iLumaQp = WELS_CLIP3(iLumaQp,  
-			pWelsSvcRc->iLastCalculatedQScale - pWelsSvcRc->iFrameDeltaQpLower +iDeltaQpTemporal, pWelsSvcRc->iLastCalculatedQScale + pWelsSvcRc->iFrameDeltaQpUpper + iDeltaQpTemporal);
-	}
-
-	iLumaQp = WELS_CLIP3(iLumaQp,  GOM_MIN_QP_MODE, GOM_MAX_QP_MODE);
-
-	pWelsSvcRc->dQStep = RcConvertQp2QStep(iLumaQp);
-	pWelsSvcRc->iLastCalculatedQScale = iLumaQp;
-#ifndef _NOT_USE_AQ_FOR_TEST_
-	if(pEncCtx->pSvcParam->bEnableAdaptiveQuant)
-	{
-
-		iLumaQp = (int32_t)WELS_CLIP3(iLumaQp - pEncCtx->pVaa->sAdaptiveQuantParam.dAverMotionTextureIndexToDeltaQp, pWelsSvcRc->iMinQp, pWelsSvcRc->iMaxQp);
-	}
-#endif
-	pEncCtx->iGlobalQp = iLumaQp;
-}
-
-void RcInitSliceInformation(sWelsEncCtx *pEncCtx)
-{
-	SSliceCtx *pCurSliceCtx	= pEncCtx->pCurDqLayer->pSliceEncCtx;
-	SWelsSvcRc *pWelsSvcRc			= &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
-	SRCSlicing *pSOverRc				= &pWelsSvcRc->pSlicingOverRc[0];
-	const int32_t kiSliceNum			= pCurSliceCtx->iSliceNumInFrame;
-	const double kdBitsPerMb		= (double)pWelsSvcRc->iTargetBits / pWelsSvcRc->iNumberMbFrame;
-
-	for(int32_t i=0; i<kiSliceNum; i++ )
-	{
-		pSOverRc->iStartMbSlice	=
-		pSOverRc->iEndMbSlice		= pCurSliceCtx->pFirstMbInSlice[i];
-		pSOverRc->iEndMbSlice		+= (pCurSliceCtx->pCountMbNumInSlice[i]-1);
-		pSOverRc->iTotalQpSlice	= 0;
-		pSOverRc->iTotalMbSlice	= 0;
-		pSOverRc->iTargetBitsSlice = (int32_t)(kdBitsPerMb * pCurSliceCtx->pCountMbNumInSlice[i]);
-		pSOverRc->iFrameBitsSlice	= 0;
-		pSOverRc->iGomBitsSlice	= 0;
-		++ pSOverRc;
-	}
-}
-
-void RcDecideTargetBits(sWelsEncCtx *pEncCtx)
-{
-	SWelsSvcRc *pWelsSvcRc	= &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
-	SRCTemporal *pTOverRc		= &pWelsSvcRc->pTemporalOverRc[pEncCtx->uiTemporalId];		
-	//allocate bits
-	if(pEncCtx->eSliceType == I_SLICE)
-	{
-		pWelsSvcRc->iTargetBits = (int32_t)( pWelsSvcRc->dBitsPerFrame * IDR_BITRATE_RATIO );
-	}
-	else
-	{
-		pWelsSvcRc->iTargetBits = (int32_t)( pWelsSvcRc->iRemainingBits*pTOverRc->dTlayerWeight/pWelsSvcRc->dRemainingWeights );
-		pWelsSvcRc->iTargetBits = WELS_CLIP3( pWelsSvcRc->iTargetBits, pTOverRc->iMinBitsTl,	pTOverRc->iMaxBitsTl);	
-	}
-	pWelsSvcRc->dRemainingWeights -= pTOverRc->dTlayerWeight;
-}
-
-
-void RcInitGoomParameters(sWelsEncCtx *pEncCtx)
-{
-	SWelsSvcRc *pWelsSvcRc			= &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
-	SRCSlicing *pSOverRc				= &pWelsSvcRc->pSlicingOverRc[0];
-	const int32_t kiSliceNum			= pWelsSvcRc->iSliceNum;
-	const int32_t kiGlobalQp			= pEncCtx->iGlobalQp;
-
-	pWelsSvcRc->iAverageFrameQp = 0;
-	for(int32_t i=0; i<kiSliceNum; ++i )
-	{
-		pSOverRc->iComplexityIndexSlice	= 0;
-		pSOverRc->iCalculatedQpSlice		= kiGlobalQp;		
-		++ pSOverRc;
-	}
-	memset( pWelsSvcRc->pGomComplexity, 0, pWelsSvcRc->iGomSize*sizeof(double) );
-	memset( pWelsSvcRc->pGomCost, 0, pWelsSvcRc->iGomSize*sizeof(int32_t) );
-}
-
-void RcCalculateMbQp(sWelsEncCtx *pEncCtx,SMB* pCurMb, const int32_t kiSliceId)
-{
-	SWelsSvcRc *pWelsSvcRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
-	SRCSlicing *pSOverRc		= &pWelsSvcRc->pSlicingOverRc[kiSliceId];	
-	int32_t iLumaQp			= pSOverRc->iCalculatedQpSlice;
-
-#ifndef _NOT_USE_AQ_FOR_TEST_
-	if ( pEncCtx->pSvcParam->bEnableAdaptiveQuant )
-	{
-		iLumaQp   = (int8_t)WELS_CLIP3(iLumaQp + 
-		pEncCtx->pVaa->sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp[pCurMb->iMbXY], pWelsSvcRc->iMinQp, 51);
-	}
-#endif
-	pCurMb->uiChromaQp	= g_kuiChromaQpTable[iLumaQp];
-	pCurMb->uiLumaQp		= iLumaQp;
-}
-
-SWelsSvcRc* RcJudgeBaseUsability(sWelsEncCtx *pEncCtx)
-{
-	SWelsSvcRc *pWelsSvcRc  = NULL, *pWelsSvcRc_Base = NULL;
-	SDLayerParam *pDlpBase = NULL, *pDLayerParam = NULL;
-
-	if( pEncCtx->uiDependencyId<=0 )
-		return NULL;
-
-	pDlpBase = &pEncCtx->pSvcParam->sDependencyLayers[pEncCtx->uiDependencyId-1];
-	pWelsSvcRc_Base = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId-1];
-	if( pEncCtx->uiTemporalId<=pDlpBase->iDecompositionStages )
-	{
-		pWelsSvcRc      = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
-		pWelsSvcRc_Base = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId-1];
-		pDLayerParam             = &pEncCtx->pSvcParam->sDependencyLayers[pEncCtx->uiDependencyId];
-		pDlpBase        = &pEncCtx->pSvcParam->sDependencyLayers[pEncCtx->uiDependencyId-1];
-		if( (pDLayerParam->iFrameWidth*pDLayerParam->iFrameHeight/pWelsSvcRc->iNumberMbGom) == 
-			(pDlpBase->iFrameWidth*pDlpBase->iFrameHeight/pWelsSvcRc_Base->iNumberMbGom) )
-			return pWelsSvcRc_Base;
-		else
-			return NULL;
-	}
-	else
-		return NULL;
-}
-
-void RcGomTargetBits(sWelsEncCtx *pEncCtx, const int32_t kiSliceId)
-{
-	SWelsSvcRc *pWelsSvcRc			= &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
-	SWelsSvcRc *pWelsSvcRc_Base	= NULL;
-	SRCSlicing *pSOverRc				= &pWelsSvcRc->pSlicingOverRc[kiSliceId];
-
-	double dAllocateBits = 0;
-	int32_t iSumSad = 0;
-	int32_t iLastGomIndex = 0;
-	int32_t iLeftBits = 0;
-	const int32_t kiComplexityIndex	= pSOverRc->iComplexityIndexSlice;
-	int32_t i;
-
-	iLastGomIndex  = pSOverRc->iEndMbSlice/pWelsSvcRc->iNumberMbGom;
-	iLeftBits = pSOverRc->iTargetBitsSlice-pSOverRc->iFrameBitsSlice;
-	
-	if(iLeftBits <= 0)
-	{
-		pSOverRc->iGomTargetBits = 0;
-		return;
-	}
-	else if( kiComplexityIndex >= iLastGomIndex)
-	{
-		dAllocateBits = iLeftBits;
-	}
-	else
-	{
-		pWelsSvcRc_Base = RcJudgeBaseUsability(pEncCtx);
-		pWelsSvcRc_Base = (pWelsSvcRc_Base) ? pWelsSvcRc_Base : pWelsSvcRc;		
-		for( i=kiComplexityIndex; i<=iLastGomIndex; i++ )
-		{
-			iSumSad += pWelsSvcRc_Base->pCurrentFrameGomSad[i];
-		}
-		if(0 == iSumSad)
-			dAllocateBits = (double)iLeftBits/(iLastGomIndex-kiComplexityIndex);
-		else
-			dAllocateBits = (double)iLeftBits*pWelsSvcRc_Base->pCurrentFrameGomSad[kiComplexityIndex+1]/iSumSad;
-		
-	}
-	pSOverRc->iGomTargetBits = int32_t(dAllocateBits + 0.5);
-}
-
-
-
-void RcCalculateGomQp(sWelsEncCtx *pEncCtx, SMB* pCurMb, int32_t iSliceId)
-{
-	SWelsSvcRc *pWelsSvcRc			= &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
-	SRCSlicing *pSOverRc				= &pWelsSvcRc->pSlicingOverRc[iSliceId];
-	double dBitsRatio = 1.0;
-
-	int32_t iLeftBits = pSOverRc->iTargetBitsSlice - pSOverRc->iFrameBitsSlice;
-	int32_t iTargetLeftBits = iLeftBits + pSOverRc->iGomBitsSlice - pSOverRc->iGomTargetBits;
-	
-	if(iLeftBits <= 0)
-	{
-		pSOverRc->iCalculatedQpSlice += 2;
-	}
-	else
-	{
-		//globe decision
-		dBitsRatio = iLeftBits / (iTargetLeftBits + 0.1);		
-		if(dBitsRatio < 0.8409)		//2^(-1.5/6)
-			pSOverRc->iCalculatedQpSlice += 2;
-		else if(dBitsRatio < 0.9439)	//2^(-0.5/6)
-			pSOverRc->iCalculatedQpSlice += 1;
-		else if(dBitsRatio > 1.06)		//2^(0.5/6)
-			pSOverRc->iCalculatedQpSlice -= 1;
-		else if(dBitsRatio > 1.19)		//2^(1.5/6)
-			pSOverRc->iCalculatedQpSlice -= 2;
-	}
-
-	pSOverRc->iCalculatedQpSlice = WELS_CLIP3( pSOverRc->iCalculatedQpSlice, 
-		pEncCtx->iGlobalQp-pWelsSvcRc->iQpRangeLowerInFrame, pEncCtx->iGlobalQp+pWelsSvcRc->iQpRangeUpperInFrame );
-	pSOverRc->iCalculatedQpSlice = WELS_CLIP3(pSOverRc->iCalculatedQpSlice, pWelsSvcRc->iMinQp, pWelsSvcRc->iMaxQp);
-
-	pSOverRc->iGomBitsSlice = 0;
-
-}
-
-void   RcVBufferCalculationSkip(sWelsEncCtx *pEncCtx)
-{
-	SWelsSvcRc *pWelsSvcRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
-	SRCTemporal *pTOverRc		= pWelsSvcRc->pTemporalOverRc;
-	const int32_t kiOutputBits = (int32_t)(pWelsSvcRc->dBitsPerFrame + 0.5);
-	//condition 1: whole pBuffer fullness
-	pWelsSvcRc->iBufferFullnessSkip += (pWelsSvcRc->iFrameDqBits - kiOutputBits);
-	//condition 2: VGOP bits constraint
-	const int32_t kiVGopBits = (int32_t)(pWelsSvcRc->dBitsPerFrame * VGOP_SIZE);
-	int32_t iVGopBitsPred = 0;
-	for(int32_t i = pWelsSvcRc->iFrameCodedInVGop+1; i<VGOP_SIZE; i++ )
-		iVGopBitsPred += pTOverRc[pWelsSvcRc->iTlOfFrames[i]].iMinBitsTl;
-	iVGopBitsPred -= pWelsSvcRc->iRemainingBits;
-	double dIncPercent = iVGopBitsPred*100.0/kiVGopBits - (double)VGOP_BITS_PERCENTAGE_DIFF;
-	
-	if( (pWelsSvcRc->iBufferFullnessSkip > pWelsSvcRc->iBufferSizeSkip &&	pWelsSvcRc->iAverageFrameQp > pWelsSvcRc->iSkipQpValue)
-		|| (dIncPercent > pWelsSvcRc->iRcVaryPercentage))
-	{
-		pEncCtx->iSkipFrameFlag=1;
-		pWelsSvcRc->iBufferFullnessSkip = pWelsSvcRc->iBufferFullnessSkip-kiOutputBits;
-#ifdef FRAME_INFO_OUTPUT
-		fprintf(stderr, "skip one frame\n");
-#endif
-	}
-
-	if( pWelsSvcRc->iBufferFullnessSkip<0 )
-		pWelsSvcRc->iBufferFullnessSkip = 0;
-
-	if( pEncCtx->iSkipFrameFlag==1 )
-	{
-		pWelsSvcRc->iRemainingBits += (int32_t)(pWelsSvcRc->dBitsPerFrame + 0.5);
-		pWelsSvcRc->iSkipFrameNum++;
-		pWelsSvcRc->iSkipFrameInVGop++;
-	}
-}
-
-void RcVBufferCalculationPadding(sWelsEncCtx *pEncCtx)
-{
-	SWelsSvcRc *pWelsSvcRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
-	const int32_t kiOutputBits = (int32_t)(pWelsSvcRc->dBitsPerFrame + 0.5);
-	const int32_t kiBufferThreshold = (int32_t)(PADDING_THRESHOLD*(-pWelsSvcRc->iBufferSizePadding));
-
-	pWelsSvcRc->iBufferFullnessPadding += (pWelsSvcRc->iFrameDqBits - kiOutputBits);
-
-	if( pWelsSvcRc->iBufferFullnessPadding < kiBufferThreshold )
-	{
-		pWelsSvcRc->iPaddingSize = -pWelsSvcRc->iBufferFullnessPadding;
-		pWelsSvcRc->iPaddingSize >>= 3;	// /8
-		pWelsSvcRc->iBufferFullnessPadding = 0;
-	}
-	else
-		pWelsSvcRc->iPaddingSize=0;
-}
-
-
-void RcTraceFrameBits(sWelsEncCtx *pEncCtx)
-{
-	SWelsSvcRc *pWelsSvcRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
-
-	WelsLog( pEncCtx, WELS_LOG_INFO,"[Rc] encoding_qp%d, qp = %3d, index = %8d, iTid = %1d, used = %8d, target = %8d, remaingbits = %8d\n",
-		pEncCtx->uiDependencyId, pWelsSvcRc->iAverageFrameQp, pEncCtx->uiFrameIdxRc, pEncCtx->uiTemporalId, pWelsSvcRc->iFrameDqBits,
-		pWelsSvcRc->iTargetBits,pWelsSvcRc->iRemainingBits);
-}
-
-void RcUpdatePictureQpBits(sWelsEncCtx *pEncCtx, int32_t iCodedBits)
-{
-	SWelsSvcRc *pWelsSvcRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
-	SRCSlicing *pSOverRc		= &pWelsSvcRc->pSlicingOverRc[0];
-	SSliceCtx *pCurSliceCtx = pEncCtx->pCurDqLayer->pSliceEncCtx;
-	int32_t iTotalQp = 0, iTotalMb = 0;
-	int32_t i;
-
-	if(pEncCtx->eSliceType == P_SLICE)
-	{
-		for( i=0; i<pCurSliceCtx->iSliceNumInFrame; i++ )
-		{			
-			iTotalQp += pSOverRc->iTotalQpSlice;
-			iTotalMb += pSOverRc->iTotalMbSlice;
-			++ pSOverRc;
-		}
-		if(iTotalMb > 0)
-			pWelsSvcRc->iAverageFrameQp = (int32_t)(1.0*iTotalQp/iTotalMb+0.5);
-		else
-			pWelsSvcRc->iAverageFrameQp = pEncCtx->iGlobalQp;
-	}
-	else
-	{
-		pWelsSvcRc->iAverageFrameQp = pEncCtx->iGlobalQp;
-	}	
-	pWelsSvcRc->iFrameDqBits = iCodedBits;
-	pWelsSvcRc->pTemporalOverRc[pEncCtx->uiTemporalId].iGopBitsDq += pWelsSvcRc->iFrameDqBits;
-}
-
-void RcUpdateIntraComplexity(sWelsEncCtx *pEncCtx)
-{
-	SWelsSvcRc *pWelsSvcRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
-	double iAlpha = 1.0/(1+pWelsSvcRc->iIdrNum);
-	if(iAlpha < 0.25) iAlpha = 0.25;
-	
-	double dIntraCmplx = pWelsSvcRc->dQStep*pWelsSvcRc->iFrameDqBits;
-	dIntraCmplx = (1.0-iAlpha)*pWelsSvcRc->iIntraComplexity + iAlpha*dIntraCmplx;
-	pWelsSvcRc->iIntraComplexity = (int32_t)(dIntraCmplx + 0.5);	
-	pWelsSvcRc->iIntraMbCount = pWelsSvcRc->iNumberMbFrame;
-
-	pWelsSvcRc->iIdrNum++;
-	if(pWelsSvcRc->iIdrNum > 255)
-		pWelsSvcRc->iIdrNum = 255;
-}
-
-void RcUpdateFrameComplexity(sWelsEncCtx *pEncCtx)
-{
-	SWelsSvcRc *pWelsSvcRc		= &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
-	const int32_t kiTl			= pEncCtx->uiTemporalId;
-	SRCTemporal *pTOverRc			= &pWelsSvcRc->pTemporalOverRc[kiTl];
-
-	if(0 == pTOverRc->iPFrameNum){
-		pTOverRc->dLinearCmplx = pWelsSvcRc->iFrameDqBits * pWelsSvcRc->dQStep;
-	}
-	else{
-		pTOverRc->dLinearCmplx = LINEAR_MODEL_DECAY_FACTOR*pTOverRc->dLinearCmplx 
-			+ (1.0-LINEAR_MODEL_DECAY_FACTOR)*(pWelsSvcRc->iFrameDqBits * pWelsSvcRc->dQStep);
-	}
-	double iAlpha = 1.0/(1+pTOverRc->iPFrameNum);
-	if(iAlpha < SMOOTH_FACTOR_MIN_VALUE)
-		iAlpha = SMOOTH_FACTOR_MIN_VALUE;
-	pTOverRc->iFrameCmplxMean = (int32_t)((1.0-iAlpha)*pTOverRc->iFrameCmplxMean + iAlpha*pEncCtx->pVaa->sComplexityAnalysisParam.iFrameComplexity + 0.5);
-
-	pTOverRc->iPFrameNum++;
-	if(pTOverRc->iPFrameNum > 255)
-		pTOverRc->iPFrameNum = 255;
-}
-
-int32_t RcCalculateCascadingQp(struct TagWelsEncCtx *pEncCtx, int32_t iQp)
-{
-	int32_t iTemporalQp = 0;
-	if( pEncCtx->pSvcParam->iDecompStages )
-	{
-		if( pEncCtx->uiTemporalId==0 )
-			iTemporalQp = iQp - 3 - (pEncCtx->pSvcParam->iDecompStages-1);
-		else
-			iTemporalQp = iQp - (pEncCtx->pSvcParam->iDecompStages - pEncCtx->uiTemporalId);
-		iTemporalQp = WELS_CLIP3( iTemporalQp, 1, 51 );
-	}
-	else
-		iTemporalQp = iQp;
-	return iTemporalQp;
-}
-
-void  WelsRcPictureInitGom(void *pCtx)
-{
-	sWelsEncCtx *pEncCtx = (sWelsEncCtx*)pCtx;
-	SWelsSvcRc *pWelsSvcRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
-
-	if ( pEncCtx->eSliceType == I_SLICE )
-	{
-		if(0 == pWelsSvcRc->iIdrNum)	//iIdrNum == 0 means encoder has been initialed
-		{
-			RcInitRefreshParameter(pEncCtx);
-		}
-	}
-	if( RcJudgeBitrateFpsUpdate(pEncCtx))
-	{
-		RcUpdateBitrateFps(pEncCtx);
-	}
-	if( pEncCtx->uiTemporalId == 0 )
-	{
-		RcUpdateTemporalZero(pEncCtx);
-	}
-	RcDecideTargetBits(pEncCtx);
-	//decide globe_qp
-	if(pEncCtx->eSliceType == I_SLICE)
-	{
-		if(0 == pWelsSvcRc->iIdrNum)
-			RcInitIdrQp(pEncCtx);
-		else
-		{
-			RcCalculateIdrQp(pEncCtx);	
-		}
-	}
-	else
-	{
-		RcCalculatePictureQp(pEncCtx);
-	}
-	RcInitSliceInformation(pEncCtx);
-	RcInitGoomParameters(pEncCtx);
-
-}
-
-
-
-void  WelsRcPictureInfoUpdateGom(void *pCtx, int32_t layer_size)
-{
-	sWelsEncCtx *pEncCtx = (sWelsEncCtx*)pCtx;
-	SWelsSvcRc *pWelsSvcRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
-	int32_t iCodedBits = (layer_size<<3);
-
-	RcUpdatePictureQpBits(pEncCtx, iCodedBits);
-
-	if ( pEncCtx->eSliceType == P_SLICE )
-	{		
-		RcUpdateFrameComplexity(pEncCtx);
-	}
-	else
-	{
-		RcUpdateIntraComplexity(pEncCtx);
-	}
-	pWelsSvcRc->iRemainingBits -= pWelsSvcRc->iFrameDqBits;	
-
-#if GOM_TRACE_FLAG
-	RcTraceFrameBits(pEncCtx);
-#endif
-
-	
-#if SKIP_FRAME_FLAG
-	if ( pEncCtx->uiDependencyId == pEncCtx->pSvcParam->iNumDependencyLayer - 1 )
-	{
-		RcVBufferCalculationSkip(pEncCtx);
-	}
-#endif
-
-	if ( pEncCtx->pSvcParam->iPaddingFlag )
-		RcVBufferCalculationPadding(pEncCtx);
-	pWelsSvcRc->iFrameCodedInVGop++;
-#ifdef _TEST_TEMP_Rc_	
-	fprintf(fp_test_rc, "%d\n", pWelsSvcRc->iFrameDqBits);
-	if(pEncCtx->iSkipFrameFlag)
-		fprintf(fp_test_rc, "0\n");	
-	fflush(fp_test_rc);
-#endif
-}
-
-void WelsRcMbInitGom(void *pCtx, SMB* pCurMb, SSlice *pSlice)
-{	
-	sWelsEncCtx *pEncCtx = (sWelsEncCtx*)pCtx;	
-	SWelsSvcRc *pWelsSvcRc			= &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
-	const int32_t kiSliceId			= pSlice->uiSliceIdx;
-	SRCSlicing *pSOverRc				= &pWelsSvcRc->pSlicingOverRc[kiSliceId];
-	SBitStringAux * bs				= pSlice->pSliceBsa;
-
-
-	pSOverRc->iBsPosSlice = BsGetBitsPos(bs);
-
-	if(pEncCtx->eSliceType==I_SLICE)
-		return;
-	//calculate gom qp and target bits at the beginning of gom
-	if(0 == (pCurMb->iMbXY%pWelsSvcRc->iNumberMbGom)){
-		if(pCurMb->iMbXY != pSOverRc->iStartMbSlice){
-			pSOverRc->iComplexityIndexSlice++;
-			RcCalculateGomQp(pEncCtx, pCurMb, kiSliceId);			
-		}
-		RcGomTargetBits(pEncCtx, kiSliceId);
-	}
-
-	RcCalculateMbQp(pEncCtx,pCurMb,kiSliceId);
-}
-
-void WelsRcMbInfoUpdateGom(void *pCtx, SMB* pCurMb, int32_t iCostLuma, SSlice *pSlice)
-{	
-	sWelsEncCtx *pEncCtx = (sWelsEncCtx*)pCtx;
-	SWelsSvcRc *pWelsSvcRc			= &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];	
-	SBitStringAux * bs				= pSlice->pSliceBsa;
-	int32_t iSliceId				= pSlice->uiSliceIdx;
-	SRCSlicing *pSOverRc				= &pWelsSvcRc->pSlicingOverRc[iSliceId];	
-	const int32_t kiComplexityIndex	= pSOverRc->iComplexityIndexSlice;
-	
-	int32_t cur_mb_bits = BsGetBitsPos(bs) - pSOverRc->iBsPosSlice;
-	pSOverRc->iFrameBitsSlice += cur_mb_bits;
-	pSOverRc->iGomBitsSlice += cur_mb_bits;
-
-	pWelsSvcRc->pGomCost[kiComplexityIndex] += iCostLuma;
-
-	if(cur_mb_bits > 0){
-		pSOverRc->iTotalQpSlice += pCurMb->uiLumaQp;
-		pSOverRc->iTotalMbSlice++;
-	}
-}
-
-void  WelsRcPictureInitDisable(void *pCtx)
-{
-	sWelsEncCtx *pEncCtx = (sWelsEncCtx*)pCtx;
-	SWelsSvcRc *pWelsSvcRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
-	SDLayerParam *pDLayerParam		= &pEncCtx->pSvcParam->sDependencyLayers[pEncCtx->uiDependencyId];
-
-	const int32_t kiQp = pDLayerParam->iDLayerQp;
-
-	pEncCtx->iGlobalQp	= RcCalculateCascadingQp( pEncCtx, kiQp );
-
-	if ( pEncCtx->pSvcParam->bEnableAdaptiveQuant && (pEncCtx->eSliceType == P_SLICE) )
-	{
-		pEncCtx->iGlobalQp = (int32_t)WELS_CLIP3(pEncCtx->iGlobalQp - 
-			pEncCtx->pVaa->sAdaptiveQuantParam.dAverMotionTextureIndexToDeltaQp, GOM_MIN_QP_MODE, GOM_MAX_QP_MODE);
-	}
-	pWelsSvcRc->iAverageFrameQp = pEncCtx->iGlobalQp;
-}
-
-void  WelsRcPictureInfoUpdateDisable(void *pCtx, int32_t layer_size)
-{
-}
-
-void  WelsRcMbInitDisable(void *pCtx, SMB* pCurMb, SSlice *pSlice)
-{
-	sWelsEncCtx *pEncCtx = (sWelsEncCtx*)pCtx;
-	int32_t iLumaQp					= pEncCtx->iGlobalQp;
-
-	if ( pEncCtx->pSvcParam->bEnableAdaptiveQuant && (pEncCtx->eSliceType == P_SLICE) )
-	{
-		iLumaQp   = (int8_t)WELS_CLIP3(iLumaQp + 
-			pEncCtx->pVaa->sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp[pCurMb->iMbXY], GOM_MIN_QP_MODE, 51);
-	}
-	pCurMb->uiChromaQp = g_kuiChromaQpTable[iLumaQp];
-	pCurMb->uiLumaQp = iLumaQp;
-}
-
-void  WelsRcMbInfoUpdateDisable(void *pCtx, SMB* pCurMb, int32_t iCostLuma, SSlice *pSlice)
-{
-}
-
-
-void  WelsRcInitModule(void *pCtx,  int32_t iModule)
-{
-	sWelsEncCtx *pEncCtx = (sWelsEncCtx*)pCtx;
-	SWelsRcFunc  * pRcf = &pEncCtx->pFuncList->pfRc;
-
-	switch(iModule)
-	{
-	case WELS_RC_DISABLE:
-		pRcf->pfWelsRcPictureInit = WelsRcPictureInitDisable;
-		pRcf->pfWelsRcPictureInfoUpdate = WelsRcPictureInfoUpdateDisable;
-		pRcf->pfWelsRcMbInit = WelsRcMbInitDisable;
-		pRcf->pfWelsRcMbInfoUpdate = WelsRcMbInfoUpdateDisable;
-		break;
-	case WELS_RC_GOM:
-	default:
-		pRcf->pfWelsRcPictureInit = WelsRcPictureInitGom;
-		pRcf->pfWelsRcPictureInfoUpdate = WelsRcPictureInfoUpdateGom;
-		pRcf->pfWelsRcMbInit = WelsRcMbInitGom;
-		pRcf->pfWelsRcMbInfoUpdate = WelsRcMbInfoUpdateGom;			
-		break;
-	}
-
-	RcInitSequenceParameter(pEncCtx);
-}
-
-void  WelsRcFreeMemory(void *pCtx)
-{
-	sWelsEncCtx *pEncCtx = (sWelsEncCtx*)pCtx;
-	SWelsSvcRc *pWelsSvcRc = NULL;
-	int32_t i = 0;
-#ifdef _TEST_TEMP_Rc_
-	if(fp_test_rc)
-		fclose(fp_test_rc);
-	fp_test_rc = NULL;
-	if(fp_vgop)
-		fclose(fp_vgop);
-	fp_vgop = NULL;
-#endif
-	for( i=0; i<pEncCtx->pSvcParam->iNumDependencyLayer; i++ )
-	{
-		pWelsSvcRc  = &pEncCtx->pWelsSvcRc[i];
-		RcFreeLayerMemory(pWelsSvcRc, pEncCtx->pMemAlign);
-	}
-}
-
-}//end of namespace
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ *  ratectl.c
+ *
+ *  Abstract
+ *      Rate Control
+ *
+ *  History
+ *      9/8/2009 Created
+ *    12/26/2011 Modified
+ *
+ *
+ *
+ *************************************************************************/
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "rc.h"
+#include "encoder_context.h"
+#include "utils.h"
+#include "svc_enc_golomb.h"
+
+
+namespace WelsSVCEnc {
+
+//#define _TEST_TEMP_RC_
+#ifdef _TEST_TEMP_RC_
+//#define _NOT_USE_AQ_FOR_TEST_
+FILE* fp_test_rc = NULL;
+FILE* fp_vgop = NULL;
+#endif
+#define _BITS_RANGE 0
+
+/*!
+ * \brief  Allocate one contiguous buffer for a layer's rate-control tables and carve it
+ *         into the per-GOM, per-slice and per-temporal-layer arrays.
+ * \param  pWelsSvcRc  layer rate-control state; iGomSize and iSliceNum are read from it and
+ *                     its six table pointers are written on success
+ * \param  pMA         allocator providing WelsMalloc
+ * \param  kiMaxTl     number of SRCTemporal records to reserve
+ * \note   If the allocation fails the function returns without writing any pointer.
+ */
+void RcInitLayerMemory (SWelsSvcRc* pWelsSvcRc, CMemoryAlign* pMA, const int32_t kiMaxTl) {
+  const int32_t kiGomCount       = pWelsSvcRc->iGomSize;
+  const int32_t kiSliceCount     = pWelsSvcRc->iSliceNum;
+  const int32_t kiDoubleTblBytes = kiGomCount * sizeof (double);
+  const int32_t kiIntTblBytes    = kiGomCount * sizeof (int32_t);
+  // one double table, three int32_t tables, then the slice and temporal-layer records
+  const int32_t kiTotalBytes     = kiDoubleTblBytes + (kiIntTblBytes * 3) + sizeof (SRCSlicing) * kiSliceCount + sizeof (
+                                     SRCTemporal) * kiMaxTl;
+  uint8_t* pCursor = (uint8_t*)pMA->WelsMalloc (kiTotalBytes, "rc_layer_memory");
+
+  if (NULL != pCursor) {
+    // the first table doubles as the handle later passed to WelsFree
+    pWelsSvcRc->pGomComplexity = (double*)pCursor;
+    pCursor += kiDoubleTblBytes;
+
+    pWelsSvcRc->pGomForegroundBlockNum = (int32_t*)pCursor;
+    pCursor += kiIntTblBytes;
+
+    pWelsSvcRc->pCurrentFrameGomSad = (int32_t*)pCursor;
+    pCursor += kiIntTblBytes;
+
+    pWelsSvcRc->pGomCost = (int32_t*)pCursor;
+    pCursor += kiIntTblBytes;
+
+    pWelsSvcRc->pSlicingOverRc = (SRCSlicing*)pCursor;
+    pCursor += sizeof (SRCSlicing) * kiSliceCount;
+
+    pWelsSvcRc->pTemporalOverRc = (SRCTemporal*)pCursor;
+  }
+}
+
+/*!
+ * \brief  Release the buffer obtained by RcInitLayerMemory and clear every table pointer
+ *         that pointed into it.
+ * \param  pWelsSvcRc  layer rate-control state; may be NULL, in which case nothing happens
+ * \param  pMA         allocator providing WelsFree
+ */
+void RcFreeLayerMemory (SWelsSvcRc* pWelsSvcRc, CMemoryAlign* pMA) {
+  // nothing to release when there is no state or no buffer
+  if (NULL == pWelsSvcRc || NULL == pWelsSvcRc->pGomComplexity)
+    return;
+
+  // pGomComplexity is the address handed to WelsFree; the other tables are cleared with it
+  pMA->WelsFree (pWelsSvcRc->pGomComplexity, "rc_layer_memory");
+
+  pWelsSvcRc->pTemporalOverRc        = NULL;
+  pWelsSvcRc->pSlicingOverRc         = NULL;
+  pWelsSvcRc->pGomCost               = NULL;
+  pWelsSvcRc->pCurrentFrameGomSad    = NULL;
+  pWelsSvcRc->pGomForegroundBlockNum = NULL;
+  pWelsSvcRc->pGomComplexity         = NULL;
+}
+
+/*!
+ * \brief  Convert a QP value to a quantizer step: 2 ^ ((QP - 4) / 6).
+ * \param  dQP  quantization parameter
+ * \return the corresponding step size (QP 4 maps to 1.0; every +6 doubles the step)
+ */
+static inline double RcConvertQp2QStep (double dQP) {
+  const double kdExponent = (dQP - 4.0) / 6.0;
+  return pow (2.0, kdExponent);
+}
+/*!
+ * \brief  Convert a quantizer step back to a (fractional) QP: 6 * log2(step) + 4.
+ * \param  dQpStep  quantizer step size
+ * \return the QP value; callers round or truncate it themselves
+ */
+static inline double RcConvertQStep2Qp (double dQpStep) {
+  // log2 is expressed through natural logarithms
+  return 4.0 + 6 * log (dQpStep) / log (2.0);
+}
+
+/*!
+ * \brief  Set up the sequence-level rate-control parameters of every dependency layer and
+ *         allocate each layer's rate-control tables.
+ * \param  pEncCtx  encoder context; pSvcParam supplies the layer configuration and
+ *                  pWelsSvcRc[j] receives the derived values for layer j
+ */
+void RcInitSequenceParameter (sWelsEncCtx* pEncCtx) {
+  SWelsSvcRc* pWelsSvcRc = NULL;
+  SDLayerParam* pDLayerParam = NULL;
+
+  int32_t j = 0;
+  int32_t iMbWidth = 0;
+
+  BOOL_T bMultiSliceMode = FALSE;
+  int32_t iGomRowMode0 = 1, iGomRowMode1 = 1;
+#ifdef _TEST_TEMP_RC_
+  // debug dumps; the fopen results are not checked here
+  fp_test_rc = fopen ("testRC.dat", "w");
+  fp_vgop = fopen ("vgop.dat", "w");
+#endif
+  for (j = 0; j < pEncCtx->pSvcParam->iNumDependencyLayer; j++) {
+    SSliceCtx* pSliceCtx = &pEncCtx->pSliceCtxList[j];
+    pWelsSvcRc  = &pEncCtx->pWelsSvcRc[j];
+    pDLayerParam = &pEncCtx->pSvcParam->sDependencyLayers[j];
+    // frame size in units of 16 pixels (>> 4)
+    iMbWidth     = (pDLayerParam->iFrameWidth >> 4);
+    pWelsSvcRc->iNumberMbFrame = iMbWidth * (pDLayerParam->iFrameHeight >> 4);
+    pWelsSvcRc->iSliceNum = pSliceCtx->iSliceNumInFrame;
+
+    pWelsSvcRc->iRcVaryPercentage = _BITS_RANGE;	// % -- for temp
+    // dRcVaryRatio blends each "MODE1" setting towards its "MODE0" counterpart below
+    pWelsSvcRc->dRcVaryRatio = (double)pWelsSvcRc->iRcVaryPercentage / MAX_BITS_VARY_PERCENTAGE;
+
+    pWelsSvcRc->dSkipBufferRatio  = SKIP_RATIO;
+
+    pWelsSvcRc->iQpRangeUpperInFrame = QP_RANGE_UPPER_MODE1 - (int32_t) ((QP_RANGE_UPPER_MODE1 - QP_RANGE_MODE0) *
+                                       pWelsSvcRc->dRcVaryRatio + 0.5);
+    pWelsSvcRc->iQpRangeLowerInFrame = QP_RANGE_LOWER_MODE1 - (int32_t) ((QP_RANGE_LOWER_MODE1 - QP_RANGE_MODE0) *
+                                       pWelsSvcRc->dRcVaryRatio + 0.5);
+
+    // pick the skip QP and the GOM row counts from the resolution class, keyed on MB width
+    if (iMbWidth <= MB_WIDTH_THRESHOLD_90P) {
+      pWelsSvcRc->iSkipQpValue = SKIP_QP_90P;
+      iGomRowMode0 = GOM_ROW_MODE0_90P;
+      iGomRowMode1 = GOM_ROW_MODE1_90P;
+    } else if (iMbWidth <= MB_WIDTH_THRESHOLD_180P) {
+      pWelsSvcRc->iSkipQpValue = SKIP_QP_180P;
+      iGomRowMode0 = GOM_ROW_MODE0_180P;
+      iGomRowMode1 = GOM_ROW_MODE1_180P;
+    } else if (iMbWidth <= MB_WIDTH_THRESHOLD_360P) {
+      pWelsSvcRc->iSkipQpValue = SKIP_QP_360P;
+      iGomRowMode0 = GOM_ROW_MODE0_360P;
+      iGomRowMode1 = GOM_ROW_MODE1_360P;
+    } else {
+      pWelsSvcRc->iSkipQpValue = SKIP_QP_720P;
+      iGomRowMode0 = GOM_ROW_MODE0_720P;
+      iGomRowMode1 = GOM_ROW_MODE1_720P;
+    }
+    // rows per GOM: the MODE1 value moved towards the MODE0 value by dRcVaryRatio (rounded)
+    iGomRowMode0 = iGomRowMode1 + (int32_t) ((iGomRowMode0 - iGomRowMode1) * pWelsSvcRc->dRcVaryRatio + 0.5);
+
+    // a GOM spans iGomRowMode0 full rows of macroblocks
+    pWelsSvcRc->iNumberMbGom   = iMbWidth * iGomRowMode0;
+
+    pWelsSvcRc->iMinQp = GOM_MIN_QP_MODE;
+    pWelsSvcRc->iMaxQp = GOM_MAX_QP_MODE;
+
+    pWelsSvcRc->iFrameDeltaQpUpper = LAST_FRAME_QP_RANGE_UPPER_MODE1 - (int32_t) ((LAST_FRAME_QP_RANGE_UPPER_MODE1 -
+                                     LAST_FRAME_QP_RANGE_UPPER_MODE0) * pWelsSvcRc->dRcVaryRatio + 0.5);
+    pWelsSvcRc->iFrameDeltaQpLower = LAST_FRAME_QP_RANGE_LOWER_MODE1 - (int32_t) ((LAST_FRAME_QP_RANGE_LOWER_MODE1 -
+                                     LAST_FRAME_QP_RANGE_LOWER_MODE0) * pWelsSvcRc->dRcVaryRatio + 0.5);
+
+    pWelsSvcRc->iSkipFrameNum = 0;
+    // number of GOMs per frame, rounded up
+    // NOTE(review): divides by iNumberMbGom, which is 0 if iMbWidth is 0 -- confirm callers reject such sizes
+    pWelsSvcRc->iGomSize = (pWelsSvcRc->iNumberMbFrame + pWelsSvcRc->iNumberMbGom - 1) / pWelsSvcRc->iNumberMbGom;
+
+
+    // tables are sized from iGomSize/iSliceNum as computed above; one temporal record per id 0..iHighestTemporalId
+    RcInitLayerMemory (pWelsSvcRc, pEncCtx->pMemAlign, 1 + pDLayerParam->iHighestTemporalId);
+
+    bMultiSliceMode	= ((SM_RASTER_SLICE == pDLayerParam->sMso.uiSliceMode) ||
+                       (SM_ROWMB_SLICE	 == pDLayerParam->sMso.uiSliceMode) ||
+                       (SM_DYN_SLICE	 == pDLayerParam->sMso.uiSliceMode));
+    // in these slice modes one GOM covers the whole frame; iGomSize (and the allocation
+    // above) keep the value derived from the row-based GOM size
+    if (bMultiSliceMode)
+      pWelsSvcRc->iNumberMbGom = pWelsSvcRc->iNumberMbFrame;
+  }
+}
+
+
+/*!
+ * \brief  Load the per-temporal-layer bit weights for the current dependency layer and
+ *         record which temporal layer each frame position of the virtual GOP belongs to.
+ * \param  pEncCtx  encoder context; uiDependencyId selects the layer
+ */
+void RcInitTlWeight (sWelsEncCtx* pEncCtx) {
+  SWelsSvcRc* pWelsSvcRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+  SRCTemporal* pTOverRc	= pWelsSvcRc->pTemporalOverRc;
+  SDLayerParam* pDLayerParam =  &pEncCtx->pSvcParam->sDependencyLayers[pEncCtx->uiDependencyId];
+  const int32_t kiDecompositionStages = pDLayerParam->iDecompositionStages;
+  const int32_t kiHighestTid = pDLayerParam->iHighestTemporalId;
+
+  //Row is selected by the number of decomposition stages, column by temporal id (see the lookup below)
+  // NOTE(review): the table is 4x4, so stages and temporal ids above 3 would index out of range -- confirm they are bounded upstream
+  double WeightArray[4][4] = { {1.0, 0, 0, 0}, {0.6, 0.4, 0, 0}, {0.4, 0.3, 0.15, 0}, {0.25, 0.15, 0.125, 0.0875}};
+  const int32_t kiGopSize = (1 << kiDecompositionStages);
+  int32_t i, k, n;
+
+  n = 0;
+  while (n <= kiHighestTid) {
+    pTOverRc[n].dTlayerWeight	= WeightArray[kiDecompositionStages][n];
+    ++ n;
+  }
+  //Assign a temporal layer to every frame position in the virtual GOP, one GOP at a time
+  for (n = 0; n < VGOP_SIZE; n += kiGopSize) {
+    // first frame of each GOP is temporal layer 0
+    pWelsSvcRc->iTlOfFrames[n] = 0;
+    // layer i starts at offset gop/2^i and repeats every gop/2^(i-1) frames,
+    // e.g. 2 stages (GOP of 4) yields layers 0,2,1,2
+    for (i = 1; i <= kiDecompositionStages; i++) {
+      for (k = 1 << (kiDecompositionStages - i); k < kiGopSize; k += (kiGopSize >> (i - 1))) {
+        pWelsSvcRc->iTlOfFrames[k + n] = i;
+      }
+    }
+  }
+  // remembered so that a later GOP-size change can be detected
+  pWelsSvcRc->iPreviousGopSize = kiGopSize;
+  pWelsSvcRc->iGopNumberInVGop = VGOP_SIZE / kiGopSize;
+}
+
+/*!
+ * \brief  Re-derive the bitrate/frame-rate dependent rate-control values of the current
+ *         dependency layer: per-temporal-layer bit bounds, buffer sizes, remaining bits
+ *         and bits per frame.
+ * \param  pEncCtx  encoder context; uiDependencyId selects the layer
+ */
+void RcUpdateBitrateFps (sWelsEncCtx* pEncCtx) {
+  SWelsSvcRc* pWelsSvcRc	= &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+  SRCTemporal* pTOverRc		= pWelsSvcRc->pTemporalOverRc;
+  SDLayerParam* pDLayerParam     = &pEncCtx->pSvcParam->sDependencyLayers[pEncCtx->uiDependencyId];
+  const int32_t kiGopSize	= (1 << pDLayerParam->iDecompositionStages);
+  const int32_t kiHighestTid = pDLayerParam->iHighestTemporalId;
+  // average bit budget of one frame at the configured bitrate and input frame rate
+  // NOTE(review): no guard against fInputFrameRate being 0 -- confirm it is validated upstream
+  double input_dBitsPerFrame = pDLayerParam->iSpatialBitrate / pDLayerParam->fInputFrameRate;
+  const int32_t kiGopBits	= (int32_t) (input_dBitsPerFrame * kiGopSize);
+  int32_t i;
+
+  pWelsSvcRc->iBitRate   = pDLayerParam->iSpatialBitrate;
+  pWelsSvcRc->fFrameRate = pDLayerParam->fInputFrameRate;
+
+  // the lower bound narrows with dRcVaryRatio; the upper bound always uses the full range
+  double dTargetVaryRange = FRAME_iTargetBits_VARY_RANGE * (1.0 - pWelsSvcRc->dRcVaryRatio);
+  double dMinBitsRatio = 1.0 - dTargetVaryRange;
+  double dMaxBitsRatio = 1.0 + FRAME_iTargetBits_VARY_RANGE;//dTargetVaryRange;
+
+  // per temporal layer: GOP bits scaled by the layer weight, then by the min/max ratio
+  for (i = 0; i <= kiHighestTid; i++) {
+    const double kdConstraitBits = kiGopBits * pTOverRc[i].dTlayerWeight;
+    pTOverRc[i].iMinBitsTl = (int32_t) (kdConstraitBits * dMinBitsRatio);
+    pTOverRc[i].iMaxBitsTl = (int32_t) (kdConstraitBits * dMaxBitsRatio);
+  }
+  //When bitrate is changed, pBuffer size should be updated
+  pWelsSvcRc->iBufferSizeSkip = (int32_t) (pWelsSvcRc->iBitRate * pWelsSvcRc->dSkipBufferRatio);
+  pWelsSvcRc->iBufferSizePadding = (int32_t) (pWelsSvcRc->iBitRate * PADDING_BUFFER_RATIO);
+
+  //rescale the remaining bits by new/old bits-per-frame; skipped while the old value is still (near) zero
+  if (pWelsSvcRc->dBitsPerFrame > 0.1)
+    pWelsSvcRc->iRemainingBits = (int32_t) (pWelsSvcRc->iRemainingBits * input_dBitsPerFrame / pWelsSvcRc->dBitsPerFrame);
+  pWelsSvcRc->dBitsPerFrame = input_dBitsPerFrame;
+}
+
+
+/*!
+ * \brief  Restart the virtual-GOP bookkeeping of the current dependency layer: counters,
+ *         bit budget, remaining weights and the bits accumulated per temporal layer.
+ * \param  pEncCtx  encoder context; uiDependencyId selects the layer
+ */
+void RcInitVGop (sWelsEncCtx* pEncCtx) {
+  const int32_t kiLayerIdx = pEncCtx->uiDependencyId;
+  SWelsSvcRc* pRc = &pEncCtx->pWelsSvcRc[kiLayerIdx];
+
+  // counters restart at the head of the virtual GOP
+  pRc->iSkipFrameInVGop  = 0;
+  pRc->iGopIndexInVGop   = 0;
+  pRc->iFrameCodedInVGop = 0;
+
+  // budget for VGOP_SIZE frames at the current bits-per-frame
+  pRc->iRemainingBits    = (int32_t) (VGOP_SIZE * pRc->dBitsPerFrame);
+  pRc->dRemainingWeights = pRc->iGopNumberInVGop;
+
+  // clear the bits accumulated for every temporal layer 0..iHighestTemporalId
+  SRCTemporal* pTemporal = pRc->pTemporalOverRc;
+  for (int32_t iTid = pEncCtx->pSvcParam->sDependencyLayers[kiLayerIdx].iHighestTemporalId; iTid >= 0; -- iTid)
+    pTemporal[iTid].iGopBitsDq = 0;
+}
+
+/*!
+ * \brief  Reset the rate-control models and buffers of the current dependency layer, back
+ *         up the configured bitrate/fps, then rebuild the temporal weights, the
+ *         bitrate-derived values and the virtual-GOP state.
+ * \param  pEncCtx  encoder context; uiDependencyId selects the layer
+ */
+void RcInitRefreshParameter (sWelsEncCtx* pEncCtx) {
+  const int32_t kiLayerIdx = pEncCtx->uiDependencyId;
+  SWelsSvcRc* pRc          = &pEncCtx->pWelsSvcRc[kiLayerIdx];
+  SDLayerParam* pLayerCfg  = &pEncCtx->pSvcParam->sDependencyLayers[kiLayerIdx];
+  const int32_t kiTopTid   = pLayerCfg->iHighestTemporalId;
+
+  // intra (I frame) R-Q model
+  pRc->iIntraMbCount    = 0;
+  pRc->iIntraComplexity = 0;
+
+  // inter (P frame) R-Q model, one record per temporal layer
+  SRCTemporal* pTemporal = pRc->pTemporalOverRc;
+  int32_t iTid = 0;
+  while (iTid <= kiTopTid) {
+    pTemporal[iTid].iFrameCmplxMean = 0;
+    pTemporal[iTid].dLinearCmplx    = 0.0;
+    pTemporal[iTid].iPFrameNum      = 0;
+    ++ iTid;
+  }
+
+  // virtual buffers start empty
+  pRc->iBufferFullnessPadding = 0;
+  pRc->iBufferFullnessSkip    = 0;
+
+  pRc->dBitsPerFrame   = 0.0;
+  pRc->iRemainingBits  = 0;
+  pRc->iGopIndexInVGop = 0;
+
+  // keep the configured bitrate and fps so a later change can be detected
+  pRc->dPreviousFps     = pLayerCfg->fInputFrameRate;
+  pRc->iPreviousBitrate = pLayerCfg->iSpatialBitrate;
+
+  memset (pRc->pCurrentFrameGomSad, 0, pRc->iGomSize * sizeof (int32_t));
+
+  // order matters: weights first, then the bit bounds that use them, then the VGOP budget
+  RcInitTlWeight (pEncCtx);
+  RcUpdateBitrateFps (pEncCtx);
+  RcInitVGop (pEncCtx);
+}
+
+/*!
+ * \brief  Tell whether the configured bitrate or input frame rate of the current
+ *         dependency layer differs from the values last recorded, and record the new
+ *         values when it does.
+ * \param  pEncCtx  encoder context; uiDependencyId selects the layer
+ * \return true when a change was detected (and recorded), false otherwise
+ */
+bool_t RcJudgeBitrateFpsUpdate (sWelsEncCtx* pEncCtx) {
+  const int32_t kiLayerIdx = pEncCtx->uiDependencyId;
+  SWelsSvcRc* pRc          = &pEncCtx->pWelsSvcRc[kiLayerIdx];
+  SDLayerParam* pLayerCfg  = &pEncCtx->pSvcParam->sDependencyLayers[kiLayerIdx];
+
+  const bool kbBitrateSame = (pRc->iPreviousBitrate == pLayerCfg->iSpatialBitrate);
+  // frame rates count as equal when they differ by no more than EPSN in either direction
+  const bool kbFpsSame = ! ((pRc->dPreviousFps - pLayerCfg->fInputFrameRate) > EPSN) &&
+                         ! ((pRc->dPreviousFps - pLayerCfg->fInputFrameRate) < -EPSN);
+
+  if (kbBitrateSame && kbFpsSame)
+    return false;
+
+  pRc->iPreviousBitrate = pLayerCfg->iSpatialBitrate;
+  pRc->dPreviousFps     = pLayerCfg->fInputFrameRate;
+  return true;
+}
+
+#if GOM_TRACE_FLAG
+/*!
+ * \brief  Log the bitrate reached over the virtual GOP accumulated so far, followed by the
+ *         fraction of the total bits spent on each temporal layer, for the current
+ *         dependency layer.
+ * \param  pEncCtx  encoder context; uiDependencyId selects the layer being reported
+ * \note   Nothing is logged while no frame has been coded in the virtual GOP.
+ */
+void RcTraceVGopBitrate (sWelsEncCtx* pEncCtx) {
+  const int32_t kiDid        = pEncCtx->uiDependencyId;
+  SWelsSvcRc* pWelsSvcRc     = &pEncCtx->pWelsSvcRc[kiDid];
+
+  if (pWelsSvcRc->iFrameCodedInVGop) {
+    const int32_t kiHighestTid = pEncCtx->pSvcParam->sDependencyLayers[kiDid].iHighestTemporalId;
+    SRCTemporal* pTOverRc      = pWelsSvcRc->pTemporalOverRc;
+    // initialised so the log below never prints an indeterminate value
+    int32_t iVGopBitrate = 0;
+    // the running total starts from iPaddingBitrateStat and adds each temporal layer's bits
+    int32_t iTotalBits = pWelsSvcRc->iPaddingBitrateStat;
+    int32_t iTid = 0;
+    while (iTid <= kiHighestTid) {
+      iTotalBits += pTOverRc[iTid].iGopBitsDq;
+      ++ iTid;
+    }
+    // frames coded plus frames skipped in this virtual GOP
+    const int32_t kiFrameInVGop = pWelsSvcRc->iFrameCodedInVGop + pWelsSvcRc->iSkipFrameInVGop;
+    if (0 != kiFrameInVGop) {
+      iVGopBitrate = (int32_t) (iTotalBits / kiFrameInVGop * pWelsSvcRc->fFrameRate);
+#ifdef _TEST_TEMP_RC_
+      // same macro spelling as the one guarding the declaration and fopen of fp_vgop;
+      // kept inside the zero check because it divides by the frame count as well
+      if (NULL != fp_vgop)
+        fprintf (fp_vgop, "%d\n", (int32_t) ((double)iTotalBits / kiFrameInVGop));
+#endif
+    }
+    WelsLog (pEncCtx, WELS_LOG_INFO, "[Rc] VGOPbitrate%d: %d \n", kiDid, iVGopBitrate);
+    if (iTotalBits > 0) {
+      iTid = 0;
+      while (iTid <= kiHighestTid) {
+        // convert before dividing: the integer quotient would only ever be 0 or 1
+        WelsLog (pEncCtx, WELS_LOG_INFO, "T%d=%8.3f \n", iTid, (double)pTOverRc[iTid].iGopBitsDq / iTotalBits);
+        ++ iTid;
+      }
+    }
+  }
+}
+#endif
+
+/*!
+ * \brief  Virtual-GOP housekeeping for the current dependency layer: restart the virtual
+ *         GOP when the GOP size changed, when all GOPs of the virtual GOP were consumed,
+ *         or on an I slice; then advance the GOP index.
+ * \param  pEncCtx  encoder context; uiDependencyId selects the layer
+ */
+void RcUpdateTemporalZero (sWelsEncCtx* pEncCtx) {
+  const int32_t kiLayerIdx   = pEncCtx->uiDependencyId;
+  SWelsSvcRc* pRc            = &pEncCtx->pWelsSvcRc[kiLayerIdx];
+  const int32_t kiCurGopSize = (1 << pEncCtx->pSvcParam->sDependencyLayers[kiLayerIdx].iDecompositionStages);
+
+  const bool kbGopSizeChanged = (pRc->iPreviousGopSize != kiCurGopSize);
+  const bool kbVGopFinished   = (pRc->iGopIndexInVGop == pRc->iGopNumberInVGop) || (pEncCtx->eSliceType == I_SLICE);
+
+  if (kbGopSizeChanged || kbVGopFinished) {
+#if GOM_TRACE_FLAG
+    RcTraceVGopBitrate (pEncCtx);
+#endif
+    // the temporal weights only need rebuilding when the GOP structure itself changed
+    if (kbGopSizeChanged)
+      RcInitTlWeight (pEncCtx);
+    RcInitVGop (pEncCtx);
+  }
+  ++ pRc->iGopIndexInVGop;
+}
+
+
+/*!
+ * \brief  Choose the QP of the first IDR frame of the current dependency layer from a
+ *         bits-per-pixel lookup keyed on the frame area, and seed the QP-tracking state.
+ * \param  pEncCtx  encoder context; uiDependencyId selects the layer, iGlobalQp is written
+ */
+void RcInitIdrQp (sWelsEncCtx* pEncCtx) {
+  double dBpp = 0;
+  int32_t i;
+
+  //64k@6fps for 90p:     bpp 0.74    QP:24
+  //192k@12fps for 180p:  bpp 0.28    QP:26
+  //512k@24fps for 360p:  bpp 0.09    QP:30
+  //1500k@30fps for 720p: bpp 0.05    QP:32
+  // one row per resolution class; dBppArray holds three ascending bpp thresholds and
+  // dInitialQPArray the four QPs they separate (higher bpp selects a lower QP)
+  double dBppArray[4][3] = {{0.5, 0.75, 1.0}, {0.2, 0.3, 0.4}, {0.05, 0.09, 0.13}, {0.03, 0.06, 0.1}};
+  int32_t dInitialQPArray[4][4] = {{28, 26, 24, 22}, {30, 28, 26, 24}, {32, 30, 28, 26}, {34, 32, 30, 28}};
+  int32_t iBppIndex = 0;
+
+  SWelsSvcRc* pWelsSvcRc		= &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+  SDLayerParam* pDLayerParam			= &pEncCtx->pSvcParam->sDependencyLayers[pEncCtx->uiDependencyId];
+
+  // bits per pixel = bitrate / (output fps * width * height); falls back to 0.1 when any factor is unusable
+  if (pDLayerParam->fOutputFrameRate > EPSN && pDLayerParam->iFrameWidth && pDLayerParam->iFrameHeight)
+    dBpp = (double) (pDLayerParam->iSpatialBitrate) / (double) (pDLayerParam->fOutputFrameRate * pDLayerParam->iFrameWidth *
+           pDLayerParam->iFrameHeight);
+  else
+    dBpp = 0.1;
+
+  //Each threshold is twice the pixel count of the nominal resolution named beside it
+  if (pDLayerParam->iFrameWidth * pDLayerParam->iFrameHeight <= 28800) // 90p video:160*90
+    iBppIndex = 0;
+  else if (pDLayerParam->iFrameWidth * pDLayerParam->iFrameHeight <= 115200) // 180p video:320*180
+    iBppIndex = 1;
+  else if (pDLayerParam->iFrameWidth * pDLayerParam->iFrameHeight <= 460800) // 360p video:640*360
+    iBppIndex = 2;
+  else
+    iBppIndex = 3;
+
+  //Find the first threshold not below dBpp; i ends at 3 (last QP column) when dBpp exceeds all of them
+  for (i = 0; i < 3; i++) {
+    if (dBpp <= dBppArray[iBppIndex][i])
+      break;
+  }
+  pWelsSvcRc->iInitialQp = dInitialQPArray[iBppIndex][i];
+  pWelsSvcRc->iInitialQp = (int32_t)WELS_CLIP3 (pWelsSvcRc->iInitialQp, MIN_IDR_QP, MAX_IDR_QP);
+  // the chosen QP becomes the frame QP and seeds the step size and last-QP tracking
+  pEncCtx->iGlobalQp = pWelsSvcRc->iInitialQp;
+  pWelsSvcRc->dQStep = RcConvertQp2QStep (pEncCtx->iGlobalQp);
+  pWelsSvcRc->iLastCalculatedQScale = pEncCtx->iGlobalQp;
+}
+
+/*!
+ * \brief  Derive the QP of a non-first IDR frame from the stored intra complexity and the
+ *         frame's target bits, then seed the QP-tracking state with it.
+ * \param  pEncCtx  encoder context; uiDependencyId selects the layer, iGlobalQp is written
+ */
+void RcCalculateIdrQp (sWelsEncCtx* pEncCtx) {
+  SWelsSvcRc* pRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+
+  // rescale the stored complexity when it was measured over a different number of macroblocks
+  const bool kbMbCountDiffers = (pRc->iNumberMbFrame != pRc->iIntraMbCount);
+  if (kbMbCountDiffers) {
+    pRc->iIntraComplexity = (int32_t) ((double)pRc->iIntraComplexity * pRc->iNumberMbFrame /
+                                       pRc->iIntraMbCount + 0.5);
+  }
+
+  // step size predicted for the bit budget, converted to a QP and clipped to the IDR range
+  const double kdPredictedQStep = (double)pRc->iIntraComplexity / pRc->iTargetBits;
+  pRc->iInitialQp = (int32_t)RcConvertQStep2Qp (kdPredictedQStep);
+  pRc->iInitialQp = (int32_t)WELS_CLIP3 (pRc->iInitialQp, MIN_IDR_QP, MAX_IDR_QP);
+
+  pEncCtx->iGlobalQp = pRc->iInitialQp;
+  pRc->iLastCalculatedQScale = pEncCtx->iGlobalQp;
+  pRc->dQStep = RcConvertQp2QStep (pEncCtx->iGlobalQp);
+}
+
+
+/*!
+ * \brief  Decide the frame-level QP of a non-IDR picture for the current dependency layer
+ *         and temporal layer, and store it in pEncCtx->iGlobalQp.
+ * \param  pEncCtx  encoder context; uiDependencyId and uiTemporalId select the models used
+ */
+void RcCalculatePictureQp (sWelsEncCtx* pEncCtx) {
+  SWelsSvcRc* pWelsSvcRc		= &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+  int32_t iTl					= pEncCtx->uiTemporalId;
+  SRCTemporal* pTOverRc			= &pWelsSvcRc->pTemporalOverRc[iTl];
+  int32_t iLumaQp = 0;
+
+  // no P frame recorded yet for this temporal layer: start from the initial QP
+  if (0 == pTOverRc->iPFrameNum) {
+    iLumaQp = pWelsSvcRc->iInitialQp;
+  } else {
+    // current frame complexity relative to the running mean, limited to 1 +/- FRAME_CMPLX_RATIO_RANGE
+    // NOTE(review): iFrameCmplxMean and iTargetBits are used as divisors without a zero check -- confirm they cannot be 0 here
+    double dCmplxRatio = (double)pEncCtx->pVaa->sComplexityAnalysisParam.iFrameComplexity / pTOverRc->iFrameCmplxMean;
+    dCmplxRatio = WELS_CLIP3 (dCmplxRatio, 1.0 - FRAME_CMPLX_RATIO_RANGE, 1.0 + FRAME_CMPLX_RATIO_RANGE);
+
+    // linear model: step = (scaled complexity) / target bits, rounded to the nearest QP
+    pWelsSvcRc->dQStep = pTOverRc->dLinearCmplx * dCmplxRatio / pWelsSvcRc->iTargetBits;
+    iLumaQp = (int32_t) (RcConvertQStep2Qp (pWelsSvcRc->dQStep) + 0.5);
+
+    //limit QP
+    // temporal layer of the previously coded frame position, wrapping to the end of the virtual GOP
+    int32_t iLastIdxCodecInVGop = pWelsSvcRc->iFrameCodedInVGop - 1;
+    if (iLastIdxCodecInVGop < 0)
+      iLastIdxCodecInVGop += VGOP_SIZE;
+    int32_t iTlLast = pWelsSvcRc->iTlOfFrames[iLastIdxCodecInVGop];
+    // shift the allowed window by the temporal-layer difference, with an extra 3 when
+    // moving away from or back to temporal layer 0
+    int32_t iDeltaQpTemporal = iTl - iTlLast;
+    if (0 == iTlLast && iTl > 0)
+      iDeltaQpTemporal += 3;
+    else if (0 == iTl && iTlLast > 0)
+      iDeltaQpTemporal -= 3;
+
+    // keep the QP within [last - lower, last + upper] around the last calculated QP, after the shift
+    iLumaQp = WELS_CLIP3 (iLumaQp,
+                          pWelsSvcRc->iLastCalculatedQScale - pWelsSvcRc->iFrameDeltaQpLower + iDeltaQpTemporal,
+                          pWelsSvcRc->iLastCalculatedQScale + pWelsSvcRc->iFrameDeltaQpUpper + iDeltaQpTemporal);
+  }
+
+  iLumaQp = WELS_CLIP3 (iLumaQp,  GOM_MIN_QP_MODE, GOM_MAX_QP_MODE);
+
+  // the step size and last-QP tracking are stored before the adaptive-quant offset is applied
+  pWelsSvcRc->dQStep = RcConvertQp2QStep (iLumaQp);
+  pWelsSvcRc->iLastCalculatedQScale = iLumaQp;
+#ifndef _NOT_USE_AQ_FOR_TEST_
+  if (pEncCtx->pSvcParam->bEnableAdaptiveQuant) {
+
+    // subtract the frame's average adaptive-quant delta, then clip to the layer's QP range
+    iLumaQp = (int32_t)WELS_CLIP3 (iLumaQp - pEncCtx->pVaa->sAdaptiveQuantParam.dAverMotionTextureIndexToDeltaQp,
+                                   pWelsSvcRc->iMinQp, pWelsSvcRc->iMaxQp);
+  }
+#endif
+  pEncCtx->iGlobalQp = iLumaQp;
+}
+
+/*
+ * Reset the per-slice RC records of the current layer for a new picture:
+ * MB range of each slice, zeroed QP/MB/bit counters, and a target bit count
+ * equal to (frame target bits / MBs in frame) * MBs in the slice.
+ */
+void RcInitSliceInformation (sWelsEncCtx* pEncCtx) {
+  SSliceCtx* pSliceCtx = pEncCtx->pCurDqLayer->pSliceEncCtx;
+  SWelsSvcRc* pRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+  const int32_t kiSliceCount = pSliceCtx->iSliceNumInFrame;
+  const double kdBitsPerMb = (double)pRc->iTargetBits / pRc->iNumberMbFrame;
+
+  for (int32_t iSliceIdx = 0; iSliceIdx < kiSliceCount; ++iSliceIdx) {
+    SRCSlicing* pSliceRc = &pRc->pSlicingOverRc[iSliceIdx];
+
+    // first MB of the slice, then last MB = first + count - 1
+    pSliceRc->iEndMbSlice = pSliceCtx->pFirstMbInSlice[iSliceIdx];
+    pSliceRc->iStartMbSlice = pSliceRc->iEndMbSlice;
+    pSliceRc->iEndMbSlice += (pSliceCtx->pCountMbNumInSlice[iSliceIdx] - 1);
+
+    pSliceRc->iTargetBitsSlice = (int32_t) (kdBitsPerMb * pSliceCtx->pCountMbNumInSlice[iSliceIdx]);
+
+    pSliceRc->iTotalQpSlice = 0;
+    pSliceRc->iTotalMbSlice = 0;
+    pSliceRc->iFrameBitsSlice = 0;
+    pSliceRc->iGomBitsSlice = 0;
+  }
+}
+
+/*
+ * Set pWelsSvcRc->iTargetBits for the picture about to be coded.
+ *   I slice : dBitsPerFrame * IDR_BITRATE_RATIO
+ *   other   : iRemainingBits * (layer weight / remaining weights), clipped to
+ *             [iMinBitsTl, iMaxBitsTl] of the current temporal layer
+ * In both cases the temporal layer's weight is then removed from dRemainingWeights.
+ */
+void RcDecideTargetBits (sWelsEncCtx* pEncCtx) {
+  SWelsSvcRc* pRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+  SRCTemporal* pTidRc = &pRc->pTemporalOverRc[pEncCtx->uiTemporalId];
+
+  if (pEncCtx->eSliceType != I_SLICE) {
+    pRc->iTargetBits = (int32_t) (pRc->iRemainingBits * pTidRc->dTlayerWeight /
+                                  pRc->dRemainingWeights);
+    pRc->iTargetBits = WELS_CLIP3 (pRc->iTargetBits, pTidRc->iMinBitsTl, pTidRc->iMaxBitsTl);
+  } else {
+    pRc->iTargetBits = (int32_t) (pRc->dBitsPerFrame * IDR_BITRATE_RATIO);
+  }
+
+  pRc->dRemainingWeights -= pTidRc->dTlayerWeight;
+}
+
+
+/*
+ * Per-picture reset of the GOM-level RC state: the average frame QP is zeroed,
+ * every slice restarts at GOM index 0 with the picture's global QP, and the
+ * pGomComplexity / pGomCost arrays (iGomSize entries each) are cleared.
+ */
+void RcInitGoomParameters (sWelsEncCtx* pEncCtx) {
+  SWelsSvcRc* pRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+  const int32_t kiSliceCount = pRc->iSliceNum;
+  const int32_t kiPicQp = pEncCtx->iGlobalQp;
+
+  pRc->iAverageFrameQp = 0;
+
+  for (int32_t iSliceIdx = 0; iSliceIdx < kiSliceCount; ++iSliceIdx) {
+    SRCSlicing* pSliceRc = &pRc->pSlicingOverRc[iSliceIdx];
+    pSliceRc->iComplexityIndexSlice = 0;
+    pSliceRc->iCalculatedQpSlice = kiPicQp;
+  }
+
+  memset (pRc->pGomComplexity, 0, pRc->iGomSize * sizeof (double));
+  memset (pRc->pGomCost, 0, pRc->iGomSize * sizeof (int32_t));
+}
+
+/*
+ * Write the luma and chroma QP of macroblock pCurMb. The luma QP is the
+ * running QP of slice kiSliceId; when adaptive quant is enabled the MB's entry
+ * of pMotionTextureIndexToDeltaQp is added and the sum is clipped to
+ * [iMinQp, 51]. The chroma QP is looked up in g_kuiChromaQpTable.
+ */
+void RcCalculateMbQp (sWelsEncCtx* pEncCtx, SMB* pCurMb, const int32_t kiSliceId) {
+  SWelsSvcRc* pRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+  int32_t iMbQp = pRc->pSlicingOverRc[kiSliceId].iCalculatedQpSlice;
+
+#ifndef _NOT_USE_AQ_FOR_TEST_
+  if (pEncCtx->pSvcParam->bEnableAdaptiveQuant) {
+    iMbQp = (int8_t)WELS_CLIP3 (iMbQp +
+                                pEncCtx->pVaa->sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp[pCurMb->iMbXY],
+                                pRc->iMinQp, 51);
+  }
+#endif
+  pCurMb->uiLumaQp = iMbQp;
+  pCurMb->uiChromaQp = g_kuiChromaQpTable[iMbQp];
+}
+
+/*
+ * Return the RC state of the dependency layer directly below the current one
+ * when it can serve as a reference for GOM statistics, NULL otherwise.
+ * The lower layer qualifies only if
+ *   - the current layer is not layer 0,
+ *   - the current temporal id does not exceed the lower layer's iDecompositionStages,
+ *   - (frame width * frame height / iNumberMbGom) is the same for both layers.
+ */
+SWelsSvcRc* RcJudgeBaseUsability (sWelsEncCtx* pEncCtx) {
+  if (pEncCtx->uiDependencyId <= 0)
+    return NULL;
+
+  SDLayerParam* pLowerParam = &pEncCtx->pSvcParam->sDependencyLayers[pEncCtx->uiDependencyId - 1];
+  if (pEncCtx->uiTemporalId > pLowerParam->iDecompositionStages)
+    return NULL;
+
+  SDLayerParam* pCurParam = &pEncCtx->pSvcParam->sDependencyLayers[pEncCtx->uiDependencyId];
+  SWelsSvcRc* pCurRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+  SWelsSvcRc* pLowerRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId - 1];
+
+  if ((pCurParam->iFrameWidth * pCurParam->iFrameHeight / pCurRc->iNumberMbGom) !=
+      (pLowerParam->iFrameWidth * pLowerParam->iFrameHeight / pLowerRc->iNumberMbGom))
+    return NULL;
+
+  return pLowerRc;
+}
+
+/*
+ * Set pSOverRc->iGomTargetBits, the bit budget of the next GOM of slice kiSliceId.
+ *
+ *   - slice budget already spent (iLeftBits <= 0): the target is 0;
+ *   - the slice's GOM index has reached iLastGomIndex: the target is everything left;
+ *   - otherwise the left bits are shared over the GOM indices
+ *     kiComplexityIndex + 1 .. iLastGomIndex, weighted by pCurrentFrameGomSad[]
+ *     (read from the layer returned by RcJudgeBaseUsability(), or from the current
+ *     layer when that returns NULL), or split evenly when those SAD entries sum to 0.
+ *
+ * The result is rounded to the nearest integer.
+ */
+void RcGomTargetBits (sWelsEncCtx* pEncCtx, const int32_t kiSliceId) {
+  SWelsSvcRc* pWelsSvcRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+  SWelsSvcRc* pWelsSvcRc_Base = NULL;
+  SRCSlicing* pSOverRc = &pWelsSvcRc->pSlicingOverRc[kiSliceId];
+
+  double dAllocateBits = 0;
+  int32_t iSumSad = 0;
+  int32_t iLastGomIndex = 0;
+  int32_t iLeftBits = 0;
+  const int32_t kiComplexityIndex = pSOverRc->iComplexityIndexSlice;
+  int32_t i;
+
+  iLastGomIndex = pSOverRc->iEndMbSlice / pWelsSvcRc->iNumberMbGom;
+  iLeftBits = pSOverRc->iTargetBitsSlice - pSOverRc->iFrameBitsSlice;
+
+  if (iLeftBits <= 0) {
+    pSOverRc->iGomTargetBits = 0;
+    return;
+  } else if (kiComplexityIndex >= iLastGomIndex) {
+    dAllocateBits = iLeftBits;
+  } else {
+    pWelsSvcRc_Base = RcJudgeBaseUsability (pEncCtx);
+    pWelsSvcRc_Base = (pWelsSvcRc_Base) ? pWelsSvcRc_Base : pWelsSvcRc;
+    // The SAD total must cover exactly the indices the share is taken from: the
+    // weighted entry below is [kiComplexityIndex + 1] and the even split divides by
+    // (iLastGomIndex - kiComplexityIndex), so the sum starts at kiComplexityIndex + 1.
+    // Starting one entry earlier inflated the denominator and under-allocated bits.
+    for (i = kiComplexityIndex + 1; i <= iLastGomIndex; i++) {
+      iSumSad += pWelsSvcRc_Base->pCurrentFrameGomSad[i];
+    }
+    if (0 == iSumSad)
+      dAllocateBits = (double)iLeftBits / (iLastGomIndex - kiComplexityIndex);
+    else
+      dAllocateBits = (double)iLeftBits * pWelsSvcRc_Base->pCurrentFrameGomSad[kiComplexityIndex + 1] / iSumSad;
+
+  }
+  pSOverRc->iGomTargetBits = int32_t (dAllocateBits + 0.5);
+}
+
+
+
+/*
+ * Adjust the running QP of slice iSliceId (pSOverRc->iCalculatedQpSlice) at a GOM
+ * boundary, then reset the slice's per-GOM bit counter.
+ *
+ * iTargetLeftBits is iLeftBits plus the overshoot of the GOM just coded
+ * (iGomBitsSlice - iGomTargetBits). The ratio iLeftBits / iTargetLeftBits drives
+ * the step:
+ *   ratio < 0.8409 -> +2      ratio > 1.19 -> -2
+ *   ratio < 0.9439 -> +1      ratio > 1.06 -> -1
+ * With no bits left in the slice budget the QP is raised by 2 directly.
+ * The result is clipped to the window around pEncCtx->iGlobalQp
+ * (iQpRangeLowerInFrame / iQpRangeUpperInFrame) and then to [iMinQp, iMaxQp].
+ *
+ * pCurMb is not used by this function.
+ */
+void RcCalculateGomQp (sWelsEncCtx* pEncCtx, SMB* pCurMb, int32_t iSliceId) {
+  SWelsSvcRc* pWelsSvcRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+  SRCSlicing* pSOverRc = &pWelsSvcRc->pSlicingOverRc[iSliceId];
+  double dBitsRatio = 1.0;
+
+  int32_t iLeftBits = pSOverRc->iTargetBitsSlice - pSOverRc->iFrameBitsSlice;
+  int32_t iTargetLeftBits = iLeftBits + pSOverRc->iGomBitsSlice - pSOverRc->iGomTargetBits;
+
+  if (iLeftBits <= 0) {
+    pSOverRc->iCalculatedQpSlice += 2;
+  } else {
+    //globe decision
+    dBitsRatio = iLeftBits / (iTargetLeftBits + 0.1);
+    if (dBitsRatio < 0.8409)        //2^(-1.5/6)
+      pSOverRc->iCalculatedQpSlice += 2;
+    else if (dBitsRatio < 0.9439)   //2^(-0.5/6)
+      pSOverRc->iCalculatedQpSlice += 1;
+    // the larger threshold has to be tested first: with "> 1.06" in front of it the
+    // "> 1.19" branch could never be reached and the QP was never lowered by 2
+    else if (dBitsRatio > 1.19)     //2^(1.5/6)
+      pSOverRc->iCalculatedQpSlice -= 2;
+    else if (dBitsRatio > 1.06)     //2^(0.5/6)
+      pSOverRc->iCalculatedQpSlice -= 1;
+  }
+
+  pSOverRc->iCalculatedQpSlice = WELS_CLIP3 (pSOverRc->iCalculatedQpSlice,
+                                 pEncCtx->iGlobalQp - pWelsSvcRc->iQpRangeLowerInFrame, pEncCtx->iGlobalQp + pWelsSvcRc->iQpRangeUpperInFrame);
+  pSOverRc->iCalculatedQpSlice = WELS_CLIP3 (pSOverRc->iCalculatedQpSlice, pWelsSvcRc->iMinQp, pWelsSvcRc->iMaxQp);
+
+  // bits of the next GOM are counted from zero
+  pSOverRc->iGomBitsSlice = 0;
+
+}
+
+/*
+ * Update the skip buffer with the bits of the frame just coded and decide
+ * whether a frame has to be skipped (pEncCtx->iSkipFrameFlag = 1). A skip is requested when
+ *   - the buffer fullness exceeds iBufferSizeSkip and the average frame QP is
+ *     above iSkipQpValue, or
+ *   - the minimum bits of the frames still to code in the virtual GOP, minus the
+ *     remaining bits, exceed iRcVaryPercentage (after VGOP_BITS_PERCENTAGE_DIFF is
+ *     subtracted) as a percentage of the VGOP bit budget.
+ * Whenever the flag is 1 one frame worth of bits is handed back to iRemainingBits
+ * and the skip counters are incremented.
+ */
+void   RcVBufferCalculationSkip (sWelsEncCtx* pEncCtx) {
+  SWelsSvcRc* pRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+  SRCTemporal* pTidRc = pRc->pTemporalOverRc;
+  const int32_t kiBitsPerFrame = (int32_t) (pRc->dBitsPerFrame + 0.5);
+
+  //condition 1: whole buffer fullness
+  pRc->iBufferFullnessSkip += (pRc->iFrameDqBits - kiBitsPerFrame);
+
+  //condition 2: VGOP bits constraint
+  const int32_t kiVGopBudget = (int32_t) (pRc->dBitsPerFrame * VGOP_SIZE);
+  int32_t iPredictedExcess = 0;
+  int32_t iFrameIdx = pRc->iFrameCodedInVGop + 1;
+  while (iFrameIdx < VGOP_SIZE) {
+    iPredictedExcess += pTidRc[pRc->iTlOfFrames[iFrameIdx]].iMinBitsTl;
+    ++iFrameIdx;
+  }
+  iPredictedExcess -= pRc->iRemainingBits;
+  const double kdExcessPercent = iPredictedExcess * 100.0 / kiVGopBudget - (double)VGOP_BITS_PERCENTAGE_DIFF;
+
+  if ((pRc->iBufferFullnessSkip > pRc->iBufferSizeSkip
+       && pRc->iAverageFrameQp > pRc->iSkipQpValue)
+      || (kdExcessPercent > pRc->iRcVaryPercentage)) {
+    pEncCtx->iSkipFrameFlag = 1;
+    pRc->iBufferFullnessSkip -= kiBitsPerFrame;
+#ifdef FRAME_INFO_OUTPUT
+    fprintf (stderr, "skip one frame\n");
+#endif
+  }
+
+  // the fullness never goes negative
+  if (pRc->iBufferFullnessSkip < 0)
+    pRc->iBufferFullnessSkip = 0;
+
+  if (1 == pEncCtx->iSkipFrameFlag) {
+    pRc->iRemainingBits += kiBitsPerFrame;
+    pRc->iSkipFrameNum++;
+    pRc->iSkipFrameInVGop++;
+  }
+}
+
+/*
+ * Update the padding buffer with the bits of the frame just coded and derive
+ * iPaddingSize. When the fullness drops below
+ * PADDING_THRESHOLD * (-iBufferSizePadding), the deficit is turned into a padding
+ * size (deficit >> 3, i.e. divided by 8) and the fullness restarts at 0;
+ * otherwise no padding is requested.
+ */
+void RcVBufferCalculationPadding (sWelsEncCtx* pEncCtx) {
+  SWelsSvcRc* pRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+  const int32_t kiBitsPerFrame = (int32_t) (pRc->dBitsPerFrame + 0.5);
+  const int32_t kiUnderflowLimit = (int32_t) (PADDING_THRESHOLD * (-pRc->iBufferSizePadding));
+
+  pRc->iBufferFullnessPadding += (pRc->iFrameDqBits - kiBitsPerFrame);
+
+  if (pRc->iBufferFullnessPadding >= kiUnderflowLimit) {
+    pRc->iPaddingSize = 0;
+  } else {
+    pRc->iPaddingSize = -pRc->iBufferFullnessPadding;
+    pRc->iPaddingSize >>= 3; // /8
+    pRc->iBufferFullnessPadding = 0;
+  }
+}
+
+
+/*
+ * Log one WELS_LOG_INFO line with the RC figures of the frame just coded:
+ * dependency id, average QP, RC frame index, temporal id, used bits,
+ * target bits and remaining bits.
+ */
+void RcTraceFrameBits (sWelsEncCtx* pEncCtx) {
+  SWelsSvcRc* pRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+
+  WelsLog (pEncCtx, WELS_LOG_INFO,
+           "[Rc] encoding_qp%d, qp = %3d, index = %8d, iTid = %1d, used = %8d, target = %8d, remaingbits = %8d\n",
+           pEncCtx->uiDependencyId,
+           pRc->iAverageFrameQp,
+           pEncCtx->uiFrameIdxRc,
+           pEncCtx->uiTemporalId,
+           pRc->iFrameDqBits,
+           pRc->iTargetBits,
+           pRc->iRemainingBits);
+}
+
+/*
+ * Record the outcome of the picture just coded: iAverageFrameQp, iFrameDqBits
+ * (= iCodedBits) and the bit total of the current temporal layer (iGopBitsDq).
+ * For a P slice the average QP is the rounded mean of the per-slice QP sums over
+ * the per-slice MB counts; when no MB was counted, or for any other slice type,
+ * the picture's global QP is used.
+ */
+void RcUpdatePictureQpBits (sWelsEncCtx* pEncCtx, int32_t iCodedBits) {
+  SWelsSvcRc* pRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+  SSliceCtx* pSliceCtx = pEncCtx->pCurDqLayer->pSliceEncCtx;
+  int32_t iQpSum = 0;
+  int32_t iMbSum = 0;
+
+  if (P_SLICE == pEncCtx->eSliceType) {
+    for (int32_t iSliceIdx = 0; iSliceIdx < pSliceCtx->iSliceNumInFrame; ++iSliceIdx) {
+      const SRCSlicing* pSliceRc = &pRc->pSlicingOverRc[iSliceIdx];
+      iQpSum += pSliceRc->iTotalQpSlice;
+      iMbSum += pSliceRc->iTotalMbSlice;
+    }
+  }
+
+  // iMbSum stays 0 for non-P slices, so they fall back to the global QP as well
+  if (iMbSum > 0)
+    pRc->iAverageFrameQp = (int32_t) (1.0 * iQpSum / iMbSum + 0.5);
+  else
+    pRc->iAverageFrameQp = pEncCtx->iGlobalQp;
+
+  pRc->iFrameDqBits = iCodedBits;
+  pRc->pTemporalOverRc[pEncCtx->uiTemporalId].iGopBitsDq += pRc->iFrameDqBits;
+}
+
+/*
+ * Fold the intra picture just coded into iIntraComplexity. The new sample is
+ * dQStep * iFrameDqBits; it is blended with weight 1 / (1 + iIdrNum), but never
+ * less than 0.25. iIntraMbCount is set to the MB count of the frame and iIdrNum
+ * is incremented, saturating at 255.
+ */
+void RcUpdateIntraComplexity (sWelsEncCtx* pEncCtx) {
+  SWelsSvcRc* pRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+
+  double dWeight = 1.0 / (1 + pRc->iIdrNum);
+  if (dWeight < 0.25)
+    dWeight = 0.25;
+
+  const double kdSample = pRc->dQStep * pRc->iFrameDqBits;
+  const double kdBlended = (1.0 - dWeight) * pRc->iIntraComplexity + dWeight * kdSample;
+  pRc->iIntraComplexity = (int32_t) (kdBlended + 0.5);
+  pRc->iIntraMbCount = pRc->iNumberMbFrame;
+
+  pRc->iIdrNum++;
+  if (pRc->iIdrNum > 255)
+    pRc->iIdrNum = 255;
+}
+
+/*
+ * Update the model of the current temporal layer after a P picture:
+ *   - dLinearCmplx: set to iFrameDqBits * dQStep for the first P frame, afterwards
+ *     blended with the previous value using LINEAR_MODEL_DECAY_FACTOR;
+ *   - iFrameCmplxMean: running mean of the frame complexity, new sample weighted by
+ *     1 / (1 + iPFrameNum) but not less than SMOOTH_FACTOR_MIN_VALUE;
+ *   - iPFrameNum: incremented, saturating at 255.
+ */
+void RcUpdateFrameComplexity (sWelsEncCtx* pEncCtx) {
+  SWelsSvcRc* pRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+  const int32_t kiCurTid = pEncCtx->uiTemporalId;
+  SRCTemporal* pTidRc = &pRc->pTemporalOverRc[kiCurTid];
+
+  if (0 != pTidRc->iPFrameNum) {
+    pTidRc->dLinearCmplx = LINEAR_MODEL_DECAY_FACTOR * pTidRc->dLinearCmplx
+                           + (1.0 - LINEAR_MODEL_DECAY_FACTOR) * (pRc->iFrameDqBits * pRc->dQStep);
+  } else {
+    pTidRc->dLinearCmplx = pRc->iFrameDqBits * pRc->dQStep;
+  }
+
+  double dWeight = 1.0 / (1 + pTidRc->iPFrameNum);
+  if (dWeight < SMOOTH_FACTOR_MIN_VALUE)
+    dWeight = SMOOTH_FACTOR_MIN_VALUE;
+  pTidRc->iFrameCmplxMean = (int32_t) ((1.0 - dWeight) * pTidRc->iFrameCmplxMean + dWeight *
+                                       pEncCtx->pVaa->sComplexityAnalysisParam.iFrameComplexity + 0.5);
+
+  pTidRc->iPFrameNum++;
+  if (pTidRc->iPFrameNum > 255)
+    pTidRc->iPFrameNum = 255;
+}
+
+/*
+ * Derive the QP of the current temporal layer from the configured layer QP iQp.
+ * Without temporal decomposition (iDecompStages == 0) iQp is returned unchanged.
+ * Otherwise temporal layer 0 gets iQp - 3 - (iDecompStages - 1), any other layer
+ * gets iQp - (iDecompStages - uiTemporalId), and the result is clipped to [1, 51].
+ */
+int32_t RcCalculateCascadingQp (struct TagWelsEncCtx* pEncCtx, int32_t iQp) {
+  if (!pEncCtx->pSvcParam->iDecompStages)
+    return iQp;
+
+  int32_t iCascadedQp = 0;
+  if (pEncCtx->uiTemporalId != 0)
+    iCascadedQp = iQp - (pEncCtx->pSvcParam->iDecompStages - pEncCtx->uiTemporalId);
+  else
+    iCascadedQp = iQp - 3 - (pEncCtx->pSvcParam->iDecompStages - 1);
+
+  iCascadedQp = WELS_CLIP3 (iCascadedQp, 1, 51);
+  return iCascadedQp;
+}
+
+/*
+ * Picture-level RC set-up for the GOM mode. In order: refresh the RC parameters
+ * on an I slice while iIdrNum is still 0, apply a pending bitrate/fps update,
+ * run the temporal-layer-0 update, decide the target bits, decide the global QP
+ * (initial IDR QP, IDR QP, or picture QP), and reset slice and GOM state.
+ */
+void  WelsRcPictureInitGom (void* pCtx) {
+  sWelsEncCtx* pEncCtx = (sWelsEncCtx*)pCtx;
+  SWelsSvcRc* pRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+
+  // iIdrNum == 0 on an I slice: no intra picture has been recorded since initialization
+  if (pEncCtx->eSliceType == I_SLICE && 0 == pRc->iIdrNum)
+    RcInitRefreshParameter (pEncCtx);
+
+  if (RcJudgeBitrateFpsUpdate (pEncCtx))
+    RcUpdateBitrateFps (pEncCtx);
+
+  if (0 == pEncCtx->uiTemporalId)
+    RcUpdateTemporalZero (pEncCtx);
+
+  RcDecideTargetBits (pEncCtx);
+
+  // decide the global QP of the picture
+  if (pEncCtx->eSliceType != I_SLICE)
+    RcCalculatePictureQp (pEncCtx);
+  else if (0 == pRc->iIdrNum)
+    RcInitIdrQp (pEncCtx);
+  else
+    RcCalculateIdrQp (pEncCtx);
+
+  RcInitSliceInformation (pEncCtx);
+  RcInitGoomParameters (pEncCtx);
+}
+
+
+
+/*
+ * Picture-level RC update for the GOM mode, called with the coded layer size.
+ * layer_size is multiplied by 8 (<< 3) to obtain the coded bits, which feed the
+ * QP/bits record, the complexity model (P slice: frame complexity, otherwise
+ * intra complexity) and the remaining-bits counter. Depending on build flags and
+ * parameters it then traces the frame, runs the skip decision (highest dependency
+ * layer only) and the padding calculation, and finally counts the frame in the
+ * virtual GOP.
+ */
+void  WelsRcPictureInfoUpdateGom (void* pCtx, int32_t layer_size) {
+  sWelsEncCtx* pEncCtx = (sWelsEncCtx*)pCtx;
+  SWelsSvcRc* pRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+  const int32_t kiCodedBits = (layer_size << 3);
+
+  RcUpdatePictureQpBits (pEncCtx, kiCodedBits);
+
+  if (pEncCtx->eSliceType != P_SLICE)
+    RcUpdateIntraComplexity (pEncCtx);
+  else
+    RcUpdateFrameComplexity (pEncCtx);
+
+  pRc->iRemainingBits -= pRc->iFrameDqBits;
+
+#if GOM_TRACE_FLAG
+  RcTraceFrameBits (pEncCtx);
+#endif
+
+#if SKIP_FRAME_FLAG
+  // the skip decision is taken once per access unit, on the highest dependency layer
+  if (pEncCtx->uiDependencyId == pEncCtx->pSvcParam->iNumDependencyLayer - 1)
+    RcVBufferCalculationSkip (pEncCtx);
+#endif
+
+  if (pEncCtx->pSvcParam->iPaddingFlag)
+    RcVBufferCalculationPadding (pEncCtx);
+
+  pRc->iFrameCodedInVGop++;
+#ifdef _TEST_TEMP_Rc_
+  fprintf (fp_test_rc, "%d\n", pRc->iFrameDqBits);
+  if (pEncCtx->iSkipFrameFlag)
+    fprintf (fp_test_rc, "0\n");
+  fflush (fp_test_rc);
+#endif
+}
+
+/*
+ * MB-level RC set-up for the GOM mode. The current bitstream position of the
+ * slice is stored so the bits of this MB can be measured afterwards. For I slices
+ * nothing else is done. Otherwise, on the first MB of each GOM
+ * (iMbXY % iNumberMbGom == 0) the GOM target bits are recomputed; if that MB is
+ * not the first MB of the slice, the slice's GOM index is advanced and the slice
+ * QP is adjusted first. Finally the MB's luma/chroma QP are set.
+ */
+void WelsRcMbInitGom (void* pCtx, SMB* pCurMb, SSlice* pSlice) {
+  sWelsEncCtx* pEncCtx = (sWelsEncCtx*)pCtx;
+  SWelsSvcRc* pRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+  const int32_t kiSliceIdx = pSlice->uiSliceIdx;
+  SRCSlicing* pSliceRc = &pRc->pSlicingOverRc[kiSliceIdx];
+
+  pSliceRc->iBsPosSlice = BsGetBitsPos (pSlice->pSliceBsa);
+
+  if (pEncCtx->eSliceType == I_SLICE)
+    return;
+
+  //calculate gom qp and target bits at the beginning of gom
+  const bool kbGomStart = (0 == (pCurMb->iMbXY % pRc->iNumberMbGom));
+  if (kbGomStart) {
+    const bool kbSliceStart = (pCurMb->iMbXY == pSliceRc->iStartMbSlice);
+    if (!kbSliceStart) {
+      pSliceRc->iComplexityIndexSlice++;
+      RcCalculateGomQp (pEncCtx, pCurMb, kiSliceIdx);
+    }
+    RcGomTargetBits (pEncCtx, kiSliceIdx);
+  }
+
+  RcCalculateMbQp (pEncCtx, pCurMb, kiSliceIdx);
+}
+
+/*
+ * MB-level RC update for the GOM mode. The bits of the MB (current bitstream
+ * position minus the position saved in iBsPosSlice) are added to the slice's
+ * frame and GOM bit counters, iCostLuma is added to the cost of the slice's
+ * current GOM index, and, if the MB produced any bits, its luma QP and one MB
+ * are added to the slice totals.
+ */
+void WelsRcMbInfoUpdateGom (void* pCtx, SMB* pCurMb, int32_t iCostLuma, SSlice* pSlice) {
+  sWelsEncCtx* pEncCtx = (sWelsEncCtx*)pCtx;
+  SWelsSvcRc* pRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+  SBitStringAux* pBs = pSlice->pSliceBsa;
+  int32_t iSliceIdx = pSlice->uiSliceIdx;
+  SRCSlicing* pSliceRc = &pRc->pSlicingOverRc[iSliceIdx];
+  const int32_t kiGomIdx = pSliceRc->iComplexityIndexSlice;
+
+  const int32_t kiMbBits = BsGetBitsPos (pBs) - pSliceRc->iBsPosSlice;
+  pSliceRc->iFrameBitsSlice += kiMbBits;
+  pSliceRc->iGomBitsSlice += kiMbBits;
+
+  pRc->pGomCost[kiGomIdx] += iCostLuma;
+
+  // MBs that produced no bits are left out of the average-QP statistics
+  if (kiMbBits > 0) {
+    pSliceRc->iTotalMbSlice++;
+    pSliceRc->iTotalQpSlice += pCurMb->uiLumaQp;
+  }
+}
+
+/*
+ * Picture-level set-up used when rate control is disabled. The global QP is the
+ * layer's configured iDLayerQp passed through RcCalculateCascadingQp(); for P
+ * slices with adaptive quant enabled the average delta QP is subtracted and the
+ * result clipped to [GOM_MIN_QP_MODE, GOM_MAX_QP_MODE]. The value is also stored
+ * as the layer's average frame QP.
+ */
+void  WelsRcPictureInitDisable (void* pCtx) {
+  sWelsEncCtx* pEncCtx = (sWelsEncCtx*)pCtx;
+  SWelsSvcRc* pRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+  SDLayerParam* pLayerParam = &pEncCtx->pSvcParam->sDependencyLayers[pEncCtx->uiDependencyId];
+  const int32_t kiLayerQp = pLayerParam->iDLayerQp;
+
+  pEncCtx->iGlobalQp = RcCalculateCascadingQp (pEncCtx, kiLayerQp);
+
+  if ((pEncCtx->eSliceType == P_SLICE) && pEncCtx->pSvcParam->bEnableAdaptiveQuant) {
+    pEncCtx->iGlobalQp = (int32_t)WELS_CLIP3 (pEncCtx->iGlobalQp -
+                         pEncCtx->pVaa->sAdaptiveQuantParam.dAverMotionTextureIndexToDeltaQp,
+                         GOM_MIN_QP_MODE, GOM_MAX_QP_MODE);
+  }
+
+  pRc->iAverageFrameQp = pEncCtx->iGlobalQp;
+}
+
+/*
+ * Picture-level update hook installed by WelsRcInitModule() for WELS_RC_DISABLE.
+ * Intentionally empty: neither pCtx nor layer_size is used.
+ */
+void  WelsRcPictureInfoUpdateDisable (void* pCtx, int32_t layer_size) {
+}
+
+/*
+ * MB-level set-up used when rate control is disabled: the MB gets the picture's
+ * global QP. For P slices with adaptive quant enabled the MB's entry of
+ * pMotionTextureIndexToDeltaQp is added and the sum clipped to
+ * [GOM_MIN_QP_MODE, 51]. The chroma QP is looked up in g_kuiChromaQpTable.
+ * pSlice is not used.
+ */
+void  WelsRcMbInitDisable (void* pCtx, SMB* pCurMb, SSlice* pSlice) {
+  sWelsEncCtx* pEncCtx = (sWelsEncCtx*)pCtx;
+  int32_t iMbQp = pEncCtx->iGlobalQp;
+
+  if ((pEncCtx->eSliceType == P_SLICE) && pEncCtx->pSvcParam->bEnableAdaptiveQuant) {
+    iMbQp = (int8_t)WELS_CLIP3 (iMbQp +
+                                pEncCtx->pVaa->sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp[pCurMb->iMbXY],
+                                GOM_MIN_QP_MODE, 51);
+  }
+
+  pCurMb->uiLumaQp = iMbQp;
+  pCurMb->uiChromaQp = g_kuiChromaQpTable[iMbQp];
+}
+
+/*
+ * MB-level update hook installed by WelsRcInitModule() for WELS_RC_DISABLE.
+ * Intentionally empty: none of the arguments is used.
+ */
+void  WelsRcMbInfoUpdateDisable (void* pCtx, SMB* pCurMb, int32_t iCostLuma, SSlice* pSlice) {
+}
+
+
+/*
+ * Install the four RC callbacks in pEncCtx->pFuncList->pfRc according to
+ * iModule, then initialize the sequence-level RC parameters.
+ * WELS_RC_DISABLE selects the *Disable set; WELS_RC_GOM and every other value
+ * select the *Gom set.
+ */
+void  WelsRcInitModule (void* pCtx,  int32_t iModule) {
+  sWelsEncCtx* pEncCtx = (sWelsEncCtx*)pCtx;
+  SWelsRcFunc* pRcFuncs = &pEncCtx->pFuncList->pfRc;
+
+  if (WELS_RC_DISABLE == iModule) {
+    pRcFuncs->pfWelsRcPictureInit = WelsRcPictureInitDisable;
+    pRcFuncs->pfWelsRcPictureInfoUpdate = WelsRcPictureInfoUpdateDisable;
+    pRcFuncs->pfWelsRcMbInit = WelsRcMbInitDisable;
+    pRcFuncs->pfWelsRcMbInfoUpdate = WelsRcMbInfoUpdateDisable;
+  } else {
+    // WELS_RC_GOM and any unrecognized module id
+    pRcFuncs->pfWelsRcPictureInit = WelsRcPictureInitGom;
+    pRcFuncs->pfWelsRcPictureInfoUpdate = WelsRcPictureInfoUpdateGom;
+    pRcFuncs->pfWelsRcMbInit = WelsRcMbInitGom;
+    pRcFuncs->pfWelsRcMbInfoUpdate = WelsRcMbInfoUpdateGom;
+  }
+
+  RcInitSequenceParameter (pEncCtx);
+}
+
+/*
+ * Release the RC memory of every dependency layer through RcFreeLayerMemory().
+ * In _TEST_TEMP_Rc_ builds the two debug files are closed first and their
+ * handles reset to NULL.
+ */
+void  WelsRcFreeMemory (void* pCtx) {
+  sWelsEncCtx* pEncCtx = (sWelsEncCtx*)pCtx;
+#ifdef _TEST_TEMP_Rc_
+  if (fp_test_rc)
+    fclose (fp_test_rc);
+  fp_test_rc = NULL;
+  if (fp_vgop)
+    fclose (fp_vgop);
+  fp_vgop = NULL;
+#endif
+  int32_t iLayerIdx = 0;
+  while (iLayerIdx < pEncCtx->pSvcParam->iNumDependencyLayer) {
+    RcFreeLayerMemory (&pEncCtx->pWelsSvcRc[iLayerIdx], pEncCtx->pMemAlign);
+    ++iLayerIdx;
+  }
+}
+
+}//end of namespace
--- a/codec/encoder/core/src/ref_list_mgr_svc.cpp
+++ b/codec/encoder/core/src/ref_list_mgr_svc.cpp
@@ -1,631 +1,640 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-// ref_list_mgr_svc.c
-#include "ref_list_mgr_svc.h"
-#include "encoder_context.h"
-#include "svc_enc_frame.h"
-#include "picture.h"
-#include "expand_pic.h"
-#include <assert.h>
-#include "utils.h"
-#include "extern.h"
-namespace WelsSVCEnc {
-/*
- *	set picture as unreferenced
- */
-void SetUnref( SPicture *pRef )
-{
-	if ( NULL != pRef )	{
-		pRef->iFramePoc		= -1;
-		pRef->iFrameNum		= -1;	
-		pRef->uiTemporalId	=
-		pRef->uiSpatialId		=
-		pRef->iLongTermPicNum = -1;
-		pRef->bIsLongRef	= false;
-		pRef->uiRecieveConfirmed = RECIEVE_FAILED;
-		pRef->iMarkFrameNum = -1;
-		pRef->bUsedAsRef	= false;
-	}
-}
-
-/*
-*	reset LTR marking , recovery ,feedback state to default
-*/
-void ResetLtrState(SLTRState* pLtr )
-{	
-	pLtr->bReceivedT0LostFlag	= FALSE;
-	pLtr->iLastRecoverFrameNum = 0;
-	pLtr->iLastCorFrameNumDec = -1;
-	pLtr->iCurFrameNumInDec = -1;
-
-	// LTR mark
-	pLtr->iLTRMarkMode = LTR_DIRECT_MARK;
-	pLtr->iLTRMarkSuccessNum = 0; //successful marked num
-	pLtr->bLTRMarkingFlag = FALSE;	//decide whether current frame marked as LTR
-	pLtr->bLTRMarkEnable = FALSE; //when LTR is confirmed and the interval is no smaller than the marking period
-	pLtr->iCurLtrIdx = 0;
-	pLtr->iLastLtrIdx = 0;
-	pLtr->uiLtrMarkInterval = 0;	
-
-	// LTR mark feedback
-	pLtr->uiLtrMarkState = NO_LTR_MARKING_FEEDBACK ;
-	pLtr->iLtrMarkFbFrameNum = -1;
-}
-
-/*
- *	reset reference picture list
- */
-void WelsResetRefList( sWelsEncCtx *pCtx )
-{
-	SRefList *pRefList = pCtx->ppRefPicListExt[pCtx->uiDependencyId];
-	int32_t i;
-	
-	for ( i = 0; i<MAX_SHORT_REF_COUNT+1;i++)
-		pRefList->pShortRefList[i] = NULL;
-	for ( i = 0; i<MAX_LONG_REF_COUNT+1;i++)
-		pRefList->pLongRefList[i] = NULL;
-	for ( i = 0; i<pCtx->pSvcParam->iNumRefFrame+1;i++)
-		SetUnref( pRefList->pRef[i] );
-
-	pRefList->uiLongRefCount = 0;
-	pRefList->uiShortRefCount =0;
-	pRefList->pNextBuffer = pRefList->pRef[0];
-}
-
-static inline void DeleteLTRFromLongList(sWelsEncCtx*pCtx, int32_t iIdx)
-{
-	SRefList* pRefList = pCtx->ppRefPicListExt[pCtx->uiDependencyId];
-	int32_t k ;
-				
-	for (k= iIdx; k<pRefList->uiLongRefCount-1;k++)	{
-		pRefList->pLongRefList[k]= pRefList->pLongRefList[k+1];
-	}
-	pRefList->pLongRefList[k]= NULL;
-	pRefList->uiLongRefCount--;		
-
-}
-static inline void DeleteSTRFromShortList(sWelsEncCtx*pCtx, int32_t iIdx)
-{
-	SRefList* pRefList = pCtx->ppRefPicListExt[pCtx->uiDependencyId];
-	int32_t k ;
-
-	for (k= iIdx; k<pRefList->uiShortRefCount-1;k++)	{
-			pRefList->pShortRefList[k]= pRefList->pShortRefList[k+1];
-	}
-	pRefList->pShortRefList[k]= NULL;
-	pRefList->uiShortRefCount--;
-
-}
-static inline int32_t CompareFrameNum(int32_t iFrameNumA,int32_t iFrameNumB,int32_t iMaxFrameNumPlus1)
-{
-	int64_t iNumA,iNumB,iDiffAB,iDiffMin;
-	if ( iFrameNumA>iMaxFrameNumPlus1 || iFrameNumB>iMaxFrameNumPlus1 ){	return -2;	}
-#define  WelsAbsDiffInt64(a,b) ( (a) > (b) )?( a - b ):( b - a )
-
-	iDiffAB = WelsAbsDiffInt64( (int64_t)(iFrameNumA),(int64_t)(iFrameNumB));
-
-	iDiffMin = iDiffAB;
-	if (iDiffMin == 0){	return FRAME_NUM_EQUAL;	}
-
-	iNumA = WelsAbsDiffInt64( (int64_t)(iFrameNumA+iMaxFrameNumPlus1), (int64_t)(iFrameNumB) );
-	if (iNumA == 0){ return FRAME_NUM_EQUAL; }
-	else if (iDiffMin > iNumA)	{	return FRAME_NUM_BIGGER;	}
-
-	iNumB = WelsAbsDiffInt64( (int64_t)(iFrameNumB+iMaxFrameNumPlus1), (int64_t)(iFrameNumA) );
-	if (iNumB == 0){ return FRAME_NUM_EQUAL; }
-	else if (iDiffMin > iNumB)	{	return FRAME_NUM_SMALLER;	}
-
-	return (iFrameNumA > iFrameNumB)?(FRAME_NUM_BIGGER):(FRAME_NUM_SMALLER);
-	
-}
-/*
-*	delete failed mark according LTR recovery pRequest
-*/
-static inline void DeleteInvalidLTR(sWelsEncCtx *pCtx)
-{
-	SRefList * pRefList		= pCtx->ppRefPicListExt[pCtx->uiDependencyId];
-	SPicture** pLongRefList = pRefList->pLongRefList;
-	SLTRState* pLtr = &pCtx->pLtr[pCtx->uiDependencyId];
-	int32_t iMaxFrameNumPlus1 = (1<<pCtx->pSps->uiLog2MaxFrameNum);
-	int32_t i;
-
-	for( i = 0;i<LONG_TERM_REF_NUM;i++){
-		if ( pLongRefList[i]!=NULL  )	{
-			if ( CompareFrameNum( pLongRefList[i]->iFrameNum , pLtr->iLastCorFrameNumDec,iMaxFrameNumPlus1 ) == FRAME_NUM_BIGGER
-				&&( CompareFrameNum( pLongRefList[i]->iFrameNum , pLtr->iCurFrameNumInDec,iMaxFrameNumPlus1)& (FRAME_NUM_EQUAL|FRAME_NUM_SMALLER) )){			
-				WelsLog(pCtx,WELS_LOG_WARNING,"LTR ,invalid LTR delete ,long_term_idx = %d , iFrameNum =%d \n",pLongRefList[i]->iLongTermPicNum,pLongRefList[i]->iFrameNum);
-				SetUnref(pLongRefList[i]);
-				DeleteLTRFromLongList(pCtx,i);
-				pLtr->bLTRMarkEnable = TRUE;
-				if (pRefList->uiLongRefCount == 0) 	{	pCtx->bEncCurFrmAsIdrFlag = true; }
-			}else if ( CompareFrameNum(pLongRefList[i]->iMarkFrameNum , pLtr->iLastCorFrameNumDec ,iMaxFrameNumPlus1) == FRAME_NUM_BIGGER
-				&& (CompareFrameNum(pLongRefList[i]->iMarkFrameNum, pLtr->iCurFrameNumInDec ,iMaxFrameNumPlus1)&(FRAME_NUM_EQUAL|FRAME_NUM_SMALLER))
-				&& pLtr->iLTRMarkMode == LTR_DELAY_MARK )	{	
-				WelsLog(pCtx,WELS_LOG_WARNING,"LTR ,iMarkFrameNum invalid LTR delete ,long_term_idx = %d , iFrameNum =%d \n",pLongRefList[i]->iLongTermPicNum,pLongRefList[i]->iFrameNum);
-				SetUnref(pLongRefList[i]);
-				DeleteLTRFromLongList(pCtx,i);
-				pLtr->bLTRMarkEnable = TRUE;
-				if (pRefList->uiLongRefCount == 0) 	{	pCtx->bEncCurFrmAsIdrFlag = true; }
-			}
-		}
-	}
-
-}
-/*
-*	handle LTR Mark feedback message
-*/
-static inline void HandleLTRMarkFeedback(sWelsEncCtx *pCtx)
-{	
-	SRefList * pRefList		= pCtx->ppRefPicListExt[pCtx->uiDependencyId];
-	SPicture** pLongRefList		= pRefList->pLongRefList;
-	SLTRState* pLtr = &pCtx->pLtr[pCtx->uiDependencyId];
-	int32_t i,j;
-
-	if (pLtr->uiLtrMarkState == LTR_MARKING_SUCCESS){	
-		WelsLog(pCtx,WELS_LOG_WARNING,"pLtr->uiLtrMarkState = %d, pLtr.iCurLtrIdx = %d , pLtr->iLtrMarkFbFrameNum = %d ,pCtx->iFrameNum = %d ",pLtr->uiLtrMarkState,pLtr->iCurLtrIdx, pLtr->iLtrMarkFbFrameNum,pCtx->iFrameNum);
-		for ( i = 0; i<pRefList->uiLongRefCount; i++)	{
-			if (pLongRefList[i]->iFrameNum == pLtr->iLtrMarkFbFrameNum && pLongRefList[i]->uiRecieveConfirmed != RECIEVE_SUCCESS){	
-		
-				pLongRefList[i]->uiRecieveConfirmed = RECIEVE_SUCCESS;
-				pCtx->pVaa->uiValidLongTermPicIdx = pLongRefList[i]->iLongTermPicNum;
-
-				pLtr->iCurFrameNumInDec  =
-				pLtr->iLastRecoverFrameNum = 
-				pLtr->iLastCorFrameNumDec = pLtr->iLtrMarkFbFrameNum;
-		
-				for ( j = 0;j<pRefList->uiLongRefCount;j++)	{
-					if(pLongRefList[j]->iLongTermPicNum != pLtr->iCurLtrIdx)	{
-						SetUnref(pLongRefList[j]);
-						DeleteLTRFromLongList(pCtx,j);
-					}
-				}	
-		
-				pLtr->iLTRMarkSuccessNum++;
-				pLtr->iCurLtrIdx = (++pLtr->iCurLtrIdx%LONG_TERM_REF_NUM);
-				pLtr->iLTRMarkMode = ( pLtr->iLTRMarkSuccessNum >= (LONG_TERM_REF_NUM) )?( LTR_DELAY_MARK):(LTR_DIRECT_MARK);
-				WelsLog(pCtx,WELS_LOG_WARNING,"LTR mark mode =%d",pLtr->iLTRMarkMode);
-				pLtr->bLTRMarkEnable = TRUE;
-				break;		
-			}
-		}
-		pLtr->uiLtrMarkState = NO_LTR_MARKING_FEEDBACK;
-	}else if (pLtr->uiLtrMarkState == LTR_MARKING_FAILED){
-		for ( i =0; i < pRefList->uiLongRefCount; i++)	{
-			if (pLongRefList[i]->iFrameNum == pLtr->iLtrMarkFbFrameNum)	{
-				SetUnref(pLongRefList[i]);
-				DeleteLTRFromLongList(pCtx,i);
-				break;
-			}
-		}
-		pLtr->uiLtrMarkState = NO_LTR_MARKING_FEEDBACK;
-		pLtr->bLTRMarkEnable = TRUE;
-
-		if (pLtr->iLTRMarkSuccessNum == 0){pCtx->bEncCurFrmAsIdrFlag = true;} // no LTR , means IDR recieve failed, force next frame IDR		
-	}
-}
-/*
- *	LTR mark process
- */
-static inline void LTRMarkProcess(sWelsEncCtx *pCtx)
-{
-	SRefList * pRefList		= pCtx->ppRefPicListExt[pCtx->uiDependencyId];
-	SPicture** pLongRefList = pRefList->pLongRefList;
-	SPicture** pShortRefList = pRefList->pShortRefList;
-	SLTRState* pLtr = &pCtx->pLtr[pCtx->uiDependencyId];
-	int32_t iGoPFrameNumInterval = ( (pCtx->pSvcParam->uiGopSize>>1)>1 )?( pCtx->pSvcParam->uiGopSize>>1 ):( 1 );
-	int32_t iMaxFrameNumPlus1 = (1<<pCtx->pSps->uiLog2MaxFrameNum);
-	int32_t i = 0;
-	int32_t j = 0;
-	bool_t bMoveLtrFromShortToLong = false;
-
-	if (pCtx->eSliceType == I_SLICE )	{
-		i=0;
-		pShortRefList[i]->uiRecieveConfirmed = RECIEVE_SUCCESS;
-	}else if ( pLtr->bLTRMarkingFlag){
-		pCtx->pVaa->uiMarkLongTermPicIdx = pLtr->iCurLtrIdx;
-
-		if (pLtr->iLTRMarkMode == LTR_DELAY_MARK)	{
-			for (i = 0; i<pRefList->uiShortRefCount; i++)	{			
-				if( CompareFrameNum(pCtx->iFrameNum,pShortRefList[i]->iFrameNum+iGoPFrameNumInterval,iMaxFrameNumPlus1)==FRAME_NUM_EQUAL)
-				{	break;	}	
-			}	
-		}
-	}
-
-	if (pCtx->eSliceType == I_SLICE || pLtr->bLTRMarkingFlag){	
-		pShortRefList[i]->bIsLongRef = true;
-		pShortRefList[i]->iLongTermPicNum = pLtr->iCurLtrIdx;	
-		pShortRefList[i]->iMarkFrameNum = pCtx->iFrameNum;
-	}
-	
-	// delay one gop to move LTR from int16_t list to int32_t list
-	if (pLtr->iLTRMarkMode == LTR_DIRECT_MARK && pCtx->eSliceType != I_SLICE && !pLtr->bLTRMarkingFlag ){
-		for (j = 0; j<pRefList->uiShortRefCount;j++){
-			if ( pRefList->pShortRefList[j]->bIsLongRef)	{
-				i = j;
-				bMoveLtrFromShortToLong = true;
-				break;
-			}
-		}
-	}
-
-	if ( (pLtr->iLTRMarkMode == LTR_DELAY_MARK && pLtr->bLTRMarkingFlag) || ( (pLtr->iLTRMarkMode == LTR_DIRECT_MARK) && (bMoveLtrFromShortToLong) ) )
-	{
-		if (pRefList->uiLongRefCount>0)
-		{
-			memmove(&pRefList->pLongRefList[1],&pRefList->pLongRefList[0],pRefList->uiLongRefCount*sizeof(SPicture*));	// confirmed_safe_unsafe_usage
-		}
-		pLongRefList[0]	 = pShortRefList[i];
-		pRefList->uiLongRefCount++;
-		DeleteSTRFromShortList(pCtx,i);
-	}
-}
-static inline void PrefetchNextBuffer(sWelsEncCtx *pCtx)
-{	
-	SRefList * pRefList		= pCtx->ppRefPicListExt[pCtx->uiDependencyId];
-	const int32_t kiNumRef	= pCtx->pSvcParam->iNumRefFrame;
-	int32_t i;
-	
-	pRefList->pNextBuffer = NULL;
-	for (i = 0; i<kiNumRef+1;i++){
-		if (!pRefList->pRef[i]->bUsedAsRef){
-			pRefList->pNextBuffer = pRefList->pRef[i];
-			break;
-		}
-	}
-
-	if (pRefList->pNextBuffer == NULL && pRefList->uiShortRefCount>0){
-		pRefList->pNextBuffer = pRefList->pShortRefList[pRefList->uiShortRefCount-1];
-		SetUnref(pRefList->pNextBuffer);
-	}
-
-	pCtx->pDecPic = pRefList->pNextBuffer;
-}
-
-/*
- *	update reference picture list
- */
-BOOL_T WelsUpdateRefList( sWelsEncCtx *pCtx )
-{
-	SRefList * pRefList		= pCtx->ppRefPicListExt[pCtx->uiDependencyId];
-	SLTRState* pLtr			= &pCtx->pLtr[pCtx->uiDependencyId];
-	SDLayerParam *pParamD	= &pCtx->pSvcParam->sDependencyLayers[pCtx->uiDependencyId];
-	const int32_t kiNumRef	= pCtx->pSvcParam->iNumRefFrame;
-
-	int32_t iRefIdx			= 0;
-	const uint8_t kuiTid		= pCtx->uiTemporalId;
-	const uint8_t kuiDid		= pCtx->uiDependencyId;
-	const EWelsSliceType keSliceType		= pCtx->eSliceType;		
-	const int32_t kiSwapIdx = (pCtx->eSliceType == P_SLICE )?( kiNumRef-LONG_TERM_REF_NUM ):( (pCtx->pSvcParam->bEnableLongTermReference)?(kiNumRef - pLtr->iCurLtrIdx):(1) );
-	uint32_t i = 0;
-	// Need update pRef list in case store base layer or target dependency layer construction
-	if ( NULL == pCtx->pCurDqLayer )
-		return FALSE;
-
-	if ( NULL == pRefList || NULL == pRefList->pRef[0] || NULL == pRefList->pRef[kiSwapIdx] )
-		return FALSE;
-
-	if ( (NULL != pCtx->pDecPic)
-#if !defined(ENABLE_FRAME_DUMP)	// to save complexity, 1/6/2009
-		 && (pParamD->iHighestTemporalId == 0 || kuiTid < pParamD->iHighestTemporalId)
-#endif// !ENABLE_FRAME_DUMP
-	)
-		// Expanding picture for future reference
-		ExpandReferencingPicture( pCtx->pDecPic, pCtx->pFuncList->pfExpandLumaPicture, pCtx->pFuncList->pfExpandChromaPicture );
-
-	// move picture in list
-	pCtx->pDecPic->uiTemporalId = kuiTid;
-	pCtx->pDecPic->uiSpatialId	= kuiDid;
-	pCtx->pDecPic->iFrameNum		= pCtx->iFrameNum;
-	pCtx->pDecPic->iFramePoc		= pCtx->iPOC;
-	pCtx->pDecPic->uiRecieveConfirmed = RECIEVE_UNKOWN;
-	pCtx->pDecPic->bUsedAsRef	= true;
-
-	for (iRefIdx = pRefList->uiShortRefCount-1;iRefIdx>=0;--iRefIdx)	{
-		pRefList->pShortRefList[iRefIdx+1] = pRefList->pShortRefList[iRefIdx];
-	}
-	pRefList->pShortRefList[0] = pCtx->pDecPic;
-	pRefList->uiShortRefCount++;
-
-	if ( keSliceType == P_SLICE ){
-		if (pCtx->uiTemporalId == 0)
-		{
-			if (pCtx->pSvcParam->bEnableLongTermReference)	{
-				LTRMarkProcess(pCtx);
-				DeleteInvalidLTR(pCtx);	
-				HandleLTRMarkFeedback(pCtx);
-
-				pLtr->bReceivedT0LostFlag = FALSE; // reset to false due to the recovery is finished
-				pLtr->bLTRMarkingFlag = FALSE;	
-				++pLtr->uiLtrMarkInterval;		
-			}
-
-			for (i = pRefList->uiShortRefCount-1;i>0;i--){		
-				SetUnref(pRefList->pShortRefList[i]);
-				DeleteSTRFromShortList(pCtx,i);
-			}
-			if (pRefList->uiShortRefCount>0 && (pRefList->pShortRefList[0]->uiTemporalId>0 || pRefList->pShortRefList[0]->iFrameNum != pCtx->iFrameNum))
-			{
-				SetUnref(pRefList->pShortRefList[0]);
-				DeleteSTRFromShortList(pCtx,0);
-			}
-		}
-	}else{	// in case IDR currently coding	
-		if (pCtx->pSvcParam->bEnableLongTermReference)	{
-			LTRMarkProcess(pCtx);
-
-			pLtr->iCurLtrIdx = (++pLtr->iCurLtrIdx%LONG_TERM_REF_NUM);
-			pLtr->iLTRMarkSuccessNum = 1; //IDR default suceess
-			pLtr->bLTRMarkEnable =  TRUE;
-			pLtr->uiLtrMarkInterval = 0;
-
-			pCtx->pVaa->uiValidLongTermPicIdx = 0;
-			pCtx->pVaa->uiMarkLongTermPicIdx = 0;
-		}
-	}
-	PrefetchNextBuffer(pCtx);
-	return TRUE;
-}
-
-bool_t CheckCurMarkFrameNumUsed(sWelsEncCtx *pCtx)
-{
-	SLTRState* pLtr = &pCtx->pLtr[pCtx->uiDependencyId];
-	SRefList *pRefList	= pCtx->ppRefPicListExt[pCtx->uiDependencyId];
-	SPicture** pLongRefList = pRefList->pLongRefList;
-	int32_t iGoPFrameNumInterval = ( (pCtx->pSvcParam->uiGopSize>>1)>1 )?( pCtx->pSvcParam->uiGopSize>>1 ):( 1 );
-	int32_t iMaxFrameNumPlus1 = (1<<pCtx->pSps->uiLog2MaxFrameNum);
-	int32_t i;
-
-	for (i = 0;i<pRefList->uiLongRefCount;i++){
-		if( ( pCtx->iFrameNum == pLongRefList[i]->iFrameNum &&pLtr->iLTRMarkMode == LTR_DIRECT_MARK ) ||
-		    ( CompareFrameNum(pCtx->iFrameNum + iGoPFrameNumInterval,pLongRefList[i]->iFrameNum,iMaxFrameNumPlus1)== FRAME_NUM_EQUAL  && pLtr->iLTRMarkMode == LTR_DELAY_MARK))
-		{
-			return FALSE;
-		}
-	}
-	
-	return TRUE;
-}
-void WelsMarkPic( sWelsEncCtx *pCtx)
-{
-	SLTRState* pLtr = &pCtx->pLtr[pCtx->uiDependencyId];
-	const int32_t kiCountSliceNum			= GetCurrentSliceNum( pCtx->pCurDqLayer->pSliceEncCtx );
-	int32_t iGoPFrameNumInterval = ((pCtx->pSvcParam->uiGopSize>>1)>1)?(pCtx->pSvcParam->uiGopSize>>1):(1);
-	int32_t iSliceIdx = 0;
-
-	if (pCtx->pSvcParam->bEnableLongTermReference && pLtr->bLTRMarkEnable && pCtx->uiTemporalId == 0){ 
-		if (  !pLtr->bReceivedT0LostFlag && pLtr->uiLtrMarkInterval > pCtx->pSvcParam->uiLtrMarkPeriod 
-			&& CheckCurMarkFrameNumUsed(pCtx)){
-				pLtr->bLTRMarkingFlag = TRUE;
-				pLtr->bLTRMarkEnable = FALSE;
-				pLtr->uiLtrMarkInterval = 0;
-				pLtr->iLastLtrIdx = pLtr->iCurLtrIdx;
-		}else{
-			pLtr->bLTRMarkingFlag = FALSE;
-		}
-	}
-
-	for (iSliceIdx = 0; iSliceIdx<kiCountSliceNum;iSliceIdx++)	{	
-		SSliceHeaderExt	*pSliceHdrExt		= &pCtx->pCurDqLayer->sLayerInfo.pSliceInLayer[iSliceIdx].sSliceHeaderExt;
-		SSliceHeader		*pSliceHdr			= &pSliceHdrExt->sSliceHeader;
-		SRefPicMarking		*pRefPicMark		= &pSliceHdr->sRefMarking;	
-
-		memset( pRefPicMark, 0, sizeof(SRefPicMarking) );
-
-		if (iSliceIdx != kiCountSliceNum-1)	{ //marking syntax only exist in last slice head
-			continue; 
-		}
-		if (pCtx->pSvcParam->bEnableLongTermReference && pLtr->bLTRMarkingFlag){	
-			if (pLtr->iLTRMarkMode == LTR_DIRECT_MARK)	{
-				pRefPicMark->SMmcoRef[pRefPicMark->uiMmcoCount].iMaxLongTermFrameIdx = LONG_TERM_REF_NUM-1;
-				pRefPicMark->SMmcoRef[pRefPicMark->uiMmcoCount++].iMmcoType = MMCO_SET_MAX_LONG;	
-
-				pRefPicMark->SMmcoRef[pRefPicMark->uiMmcoCount].iDiffOfPicNum = iGoPFrameNumInterval;
-				pRefPicMark->SMmcoRef[pRefPicMark->uiMmcoCount++].iMmcoType = MMCO_SHORT2UNUSED;
-			
-				pRefPicMark->SMmcoRef[pRefPicMark->uiMmcoCount].iLongTermFrameIdx = pLtr->iCurLtrIdx;
-				pRefPicMark->SMmcoRef[pRefPicMark->uiMmcoCount++].iMmcoType = MMCO_LONG;
-			}else if (pLtr->iLTRMarkMode == LTR_DELAY_MARK )	{
-				pRefPicMark->SMmcoRef[pRefPicMark->uiMmcoCount].iDiffOfPicNum = iGoPFrameNumInterval;
-				pRefPicMark->SMmcoRef[pRefPicMark->uiMmcoCount].iLongTermFrameIdx = pLtr->iCurLtrIdx;
-				pRefPicMark->SMmcoRef[pRefPicMark->uiMmcoCount++].iMmcoType = MMCO_SHORT2LONG;
-			}
-		}
-	}
-}
-
-int32_t FilterLTRRecoveryRequest(sWelsEncCtx *pCtx,SLTRRecoverRequest* pLTRRecoverRequest)
-{
-	SLTRRecoverRequest* pRequest = pLTRRecoverRequest;
-	SLTRState* pLtr = &pCtx->pLtr[pCtx->uiDependencyId];
-	int32_t iMaxFrameNumPlus1 = (1<<pCtx->pSps->uiLog2MaxFrameNum);
-	if ( pCtx->pSvcParam->bEnableLongTermReference )
-	{
-		 if( pRequest->uiFeedbackType == LTR_RECOVERY_REQUEST &&  pRequest->uiIDRPicId == pCtx->sPSOVector.uiIdrPicId)
-		 {
-			if(pRequest->iLastCorrectFrameNum == -1){
-				pCtx->bEncCurFrmAsIdrFlag = true;
-				return TRUE;
-			}else if (pRequest->iCurrentFrameNum == -1){
-				pLtr->bReceivedT0LostFlag = true;
-				return TRUE;
-			}else if( ( CompareFrameNum( pLtr->iLastRecoverFrameNum , pRequest->iLastCorrectFrameNum,iMaxFrameNumPlus1) & (FRAME_NUM_EQUAL|FRAME_NUM_SMALLER) )// t0 lost
-				||( ( CompareFrameNum(pLtr->iLastRecoverFrameNum , pRequest->iCurrentFrameNum,iMaxFrameNumPlus1) & ( FRAME_NUM_EQUAL|FRAME_NUM_SMALLER ) )&&
-				CompareFrameNum(pLtr->iLastRecoverFrameNum , pRequest->iLastCorrectFrameNum,iMaxFrameNumPlus1) == FRAME_NUM_BIGGER ) ){// recovery failed
-					
-				pLtr->bReceivedT0LostFlag = true;
-				pLtr->iLastCorFrameNumDec = pRequest->iLastCorrectFrameNum;
-				pLtr->iCurFrameNumInDec = pRequest->iCurrentFrameNum;
-				WelsLog(pCtx,WELS_LOG_INFO,"Receive valid LTR recovery pRequest,feedback_type = %d ,uiIdrPicId = %d , current_frame_num = %d , last correct frame num = %d"
-					,pRequest->uiFeedbackType,pRequest->uiIDRPicId,pRequest->iCurrentFrameNum,pRequest->iLastCorrectFrameNum);
-			}
-
-			WelsLog(pCtx,WELS_LOG_INFO,"Receive LTR recovery pRequest,feedback_type = %d ,uiIdrPicId = %d , current_frame_num = %d , last correct frame num = %d"
-					,pRequest->uiFeedbackType,pRequest->uiIDRPicId,pRequest->iCurrentFrameNum,pRequest->iLastCorrectFrameNum);	
-		 }
-	}else if (!pCtx->pSvcParam->bEnableLongTermReference){
-		pCtx->bEncCurFrmAsIdrFlag = TRUE;
-	}
-	return TRUE;
-}
-void FilterLTRMarkingFeedback(sWelsEncCtx *pCtx,SLTRMarkingFeedback* pLTRMarkingFeedback)
-{
-	SLTRState* pLtr = &pCtx->pLtr[pCtx->uiDependencyId];
-	assert(pLTRMarkingFeedback);
-	if ( pCtx->pSvcParam->bEnableLongTermReference )	{
-		if ( pLTRMarkingFeedback->uiIDRPicId == pCtx->sPSOVector.uiIdrPicId 
-			&&( pLTRMarkingFeedback->uiFeedbackType == LTR_MARKING_SUCCESS || pLTRMarkingFeedback->uiFeedbackType == LTR_MARKING_FAILED))// avoid error pData
-		{	
-			pLtr->uiLtrMarkState = pLTRMarkingFeedback->uiFeedbackType;
-			pLtr->iLtrMarkFbFrameNum =  pLTRMarkingFeedback->iLTRFrameNum ;
-			WelsLog(pCtx,WELS_LOG_INFO,"Receive valid LTR marking feedback, feedback_type = %d , uiIdrPicId = %d , LTR_frame_num = %d , cur_idr_pic_id = %d",pLTRMarkingFeedback->uiFeedbackType,pLTRMarkingFeedback->uiIDRPicId,pLTRMarkingFeedback->iLTRFrameNum , pCtx->sPSOVector.uiIdrPicId);
-
-		}else{
-			WelsLog(pCtx,WELS_LOG_INFO,"Receive LTR marking feedback, feedback_type = %d , uiIdrPicId = %d , LTR_frame_num = %d , cur_idr_pic_id = %d",pLTRMarkingFeedback->uiFeedbackType,pLTRMarkingFeedback->uiIDRPicId,pLTRMarkingFeedback->iLTRFrameNum , pCtx->sPSOVector.uiIdrPicId);
-		}
-	}
-}
-
-/*
- *	build reference picture list
- */
-BOOL_T WelsBuildRefList( sWelsEncCtx *pCtx, const int32_t iPOC )
-{	
-	SRefList *pRefList		=  pCtx->ppRefPicListExt[pCtx->uiDependencyId];	
-	SLTRState* pLtr			= &pCtx->pLtr[pCtx->uiDependencyId];
-	const int32_t kiNumRef	= pCtx->pSvcParam->iNumRefFrame;	
-	const uint8_t kuiTid		= pCtx->uiTemporalId;	
-	uint32_t i				= 0;
-
-	// to support any type of cur_dq->mgs_control
-	//	[ 0:	using current layer to do ME/MC;
-	//	  -1:	using store base layer to do ME/MC;
-	//	  2:	using highest layer to do ME/MC; ]
-
-	// build reference list 0/1 if applicable
-
-	pCtx->iNumRef0	= 0;
-	
-	if ( pCtx->eSliceType != I_SLICE )
-	{
-		if (pCtx->pSvcParam->bEnableLongTermReference && pLtr->bReceivedT0LostFlag && pCtx->uiTemporalId == 0){
-			for ( i = 0;i <pRefList->uiLongRefCount;i++)	{
-				if (pRefList->pLongRefList[i]->uiRecieveConfirmed == RECIEVE_SUCCESS)	{
-					pCtx->pRefList0[pCtx->iNumRef0++] = pRefList->pLongRefList[i];
-					pLtr->iLastRecoverFrameNum = pCtx->iFrameNum;
-					WelsLog(pCtx,WELS_LOG_INFO,"pRef is int32_t !iLastRecoverFrameNum = %d, pRef iFrameNum = %d,LTR number = %d,",pLtr->iLastRecoverFrameNum,pCtx->pRefList0[0]->iFrameNum,pRefList->uiLongRefCount);
-					break;
-				}
-			}
-		}else{
-			for ( i = 0; i < pRefList->uiShortRefCount; ++ i )
-			{
-				SPicture *pRef = pRefList->pShortRefList[i];
-				if ( pRef != NULL && pRef->bUsedAsRef && pRef->iFramePoc >= 0 && pRef->uiTemporalId <= kuiTid)
-				{		
-					pCtx->pRefList0[pCtx->iNumRef0++]	= pRef;
-					break;	
-				}
-			}
-		}	
-	}
-	else	// safe for IDR
-	{
-		WelsResetRefList( pCtx ); //for IDR, SHOULD reset pRef list. 
-		ResetLtrState(&pCtx->pLtr[pCtx->uiDependencyId]); //SHOULD update it when IDR.
-		pCtx->pRefList0[0]	= NULL;
-	}
-
-	if ( pCtx->iNumRef0 > kiNumRef )
-		pCtx->iNumRef0 = kiNumRef;
-	return ( pCtx->iNumRef0>0 || pCtx->eSliceType == I_SLICE) ? ( TRUE ): ( FALSE );
-}
-
-/*
- *	update syntax for reference base related
- */
-void WelsUpdateRefSyntax( sWelsEncCtx *pCtx, const int32_t iPOC, const int32_t uiFrameType )
-{
-	SLTRState* pLtr = &pCtx->pLtr[pCtx->uiDependencyId];
-	int32_t iIdx								= 0;
-	const int32_t kiCountSliceNum			= GetCurrentSliceNum( pCtx->pCurDqLayer->pSliceEncCtx );
-	int32_t	iAbsDiffPicNumMinus1			= -1;
-
-	assert( kiCountSliceNum > 0 );
-	
-	/*syntax for ref_pic_list_reordering()*/
-	if( pCtx->iNumRef0 > 0 )
-		iAbsDiffPicNumMinus1 = pCtx->iFrameNum - (pCtx->pRefList0[0]->iFrameNum) -1;
-	
-	for (iIdx = 0;iIdx < kiCountSliceNum;iIdx++) {
-		SSliceHeaderExt	*pSliceHdrExt		= &pCtx->pCurDqLayer->sLayerInfo.pSliceInLayer[iIdx].sSliceHeaderExt;	
-		SSliceHeader		*pSliceHdr			= &pSliceHdrExt->sSliceHeader;
-		SRefPicListReorderSyntax *pRefReorder	= &pSliceHdr->sRefReordering;
-		SRefPicMarking *pRefPicMark			= &pSliceHdr->sRefMarking;	
-	
-		/*syntax for num_ref_idx_l0_active_minus1*/
-		pSliceHdr->uiRefCount = pCtx->iNumRef0;
-		if( pCtx->iNumRef0 > 0 )
-		{	
-			if( !pCtx->pRefList0[0]->bIsLongRef )	
-			{
- 				if ( iAbsDiffPicNumMinus1 < 0 )
-				{
- 					WelsLog( pCtx, WELS_LOG_INFO, "WelsUpdateRefSyntax():::uiAbsDiffPicNumMinus1:%d\n", iAbsDiffPicNumMinus1 );
- 					iAbsDiffPicNumMinus1 += (1 << (pCtx->pSps->uiLog2MaxFrameNum));
- 					WelsLog( pCtx, WELS_LOG_INFO, "WelsUpdateRefSyntax():::uiAbsDiffPicNumMinus1< 0, update as:%d\n", iAbsDiffPicNumMinus1 );
- 				}
-			
- 				pRefReorder->SReorderingSyntax[0].uiReorderingOfPicNumsIdc = 0;
- 				pRefReorder->SReorderingSyntax[0].uiAbsDiffPicNumMinus1    = iAbsDiffPicNumMinus1;
- 				pRefReorder->SReorderingSyntax[1].uiReorderingOfPicNumsIdc = 3;	
-			}
-			else
-			{
-				pRefReorder->SReorderingSyntax[0].uiReorderingOfPicNumsIdc = 2;
-				pRefReorder->SReorderingSyntax[0].iLongTermPicNum = pCtx->pRefList0[0]->iLongTermPicNum;
-				pRefReorder->SReorderingSyntax[1].uiReorderingOfPicNumsIdc = 3;
-			}
-		}
-		
-		/*syntax for dec_ref_pic_marking()*/
-		if( WELS_FRAME_TYPE_IDR == uiFrameType )		{
-			pRefPicMark->bNoOutputOfPriorPicsFlag = false;
-			pRefPicMark->bLongTermRefFlag = pCtx->pSvcParam->bEnableLongTermReference;
-		}else{
- 			pRefPicMark->bAdaptiveRefPicMarkingModeFlag = (pCtx->pSvcParam->bEnableLongTermReference && pLtr->bLTRMarkingFlag)?(true):(false);
-		}		
-	}
-}
-
-} // namespace WelsSVCEnc
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+// ref_list_mgr_svc.c
+#include "ref_list_mgr_svc.h"
+#include "encoder_context.h"
+#include "svc_enc_frame.h"
+#include "picture.h"
+#include "expand_pic.h"
+#include <assert.h>
+#include "utils.h"
+#include "extern.h"
+namespace WelsSVCEnc {
+/*
+ * Put a picture back into the "unreferenced" state: clear its reference flags, set the
+ * receive confirmation to RECIEVE_FAILED and invalidate its POC / frame numbers. NULL is ignored.
+ */
+void SetUnref (SPicture* pRef) {
+  if (NULL == pRef)
+    return;
+  pRef->bUsedAsRef = false;
+  pRef->bIsLongRef = false;
+  pRef->uiRecieveConfirmed = RECIEVE_FAILED;
+  pRef->iMarkFrameNum = -1;
+  pRef->iFrameNum = -1;
+  pRef->iFramePoc = -1;
+  // chained right-to-left: each field takes the value stored in the field to its right
+  pRef->uiTemporalId = pRef->uiSpatialId = pRef->iLongTermPicNum = -1;
+}
+
+// Restore an LTR state record to its defaults: recovery tracking, marking control
+// and marking feedback are all reset.
+void ResetLtrState (SLTRState* pLtr) {
+  // recovery tracking
+  pLtr->iCurFrameNumInDec = -1;
+  pLtr->iLastCorFrameNumDec = -1;
+  pLtr->iLastRecoverFrameNum = 0;
+  pLtr->bReceivedT0LostFlag = FALSE;
+
+  // marking control
+  pLtr->uiLtrMarkInterval = 0;
+  pLtr->iLastLtrIdx = 0;
+  pLtr->iCurLtrIdx = 0;
+  pLtr->bLTRMarkEnable = FALSE;   // set when LTR is confirmed and the interval is no smaller than the marking period
+  pLtr->bLTRMarkingFlag = FALSE;  // decides whether the current frame is marked as LTR
+  pLtr->iLTRMarkSuccessNum = 0;   // number of successful marks
+  pLtr->iLTRMarkMode = LTR_DIRECT_MARK;
+
+  // marking feedback
+  pLtr->iLtrMarkFbFrameNum = -1;
+  pLtr->uiLtrMarkState = NO_LTR_MARKING_FEEDBACK;
+}
+
+/*
+ * Empty the reference lists of the current dependency layer: clear every short/long-term
+ * slot, mark all iNumRefFrame + 1 frame buffers unreferenced and point pNextBuffer at pRef[0].
+ */
+void WelsResetRefList (sWelsEncCtx* pCtx) {
+  SRefList* pList = pCtx->ppRefPicListExt[pCtx->uiDependencyId];
+  const int32_t kiLastBufIdx = pCtx->pSvcParam->iNumRefFrame;
+  for (int32_t iSlot = 0; iSlot <= MAX_SHORT_REF_COUNT; ++iSlot)
+    pList->pShortRefList[iSlot] = NULL;
+  for (int32_t iSlot = 0; iSlot <= MAX_LONG_REF_COUNT; ++iSlot)
+    pList->pLongRefList[iSlot] = NULL;
+  for (int32_t iSlot = 0; iSlot <= kiLastBufIdx; ++iSlot)
+    SetUnref (pList->pRef[iSlot]);
+
+  pList->pNextBuffer = pList->pRef[0];
+  pList->uiShortRefCount = 0;
+  pList->uiLongRefCount = 0;
+}
+
+// Remove entry iIdx from the current layer's long-term list: later entries move down one
+// slot, the slot where the scan stops is set to NULL and uiLongRefCount is decremented.
+static inline void DeleteLTRFromLongList (sWelsEncCtx* pCtx, int32_t iIdx) {
+  SRefList* pList = pCtx->ppRefPicListExt[pCtx->uiDependencyId];
+  SPicture** ppLong = pList->pLongRefList;
+  int32_t iPos = iIdx;
+  for (; iPos < pList->uiLongRefCount - 1; ++iPos)
+    ppLong[iPos] = ppLong[iPos + 1];
+  ppLong[iPos] = NULL;
+  --pList->uiLongRefCount;
+}
+// Remove entry iIdx from the current layer's short-term list: the entries behind it close
+// the gap, the slot where the scan stops becomes NULL and uiShortRefCount drops by one.
+static inline void DeleteSTRFromShortList (sWelsEncCtx* pCtx, int32_t iIdx) {
+  SRefList* pList = pCtx->ppRefPicListExt[pCtx->uiDependencyId];
+  SPicture** ppShort = pList->pShortRefList;
+  int32_t iPos = iIdx;
+  for (; iPos < pList->uiShortRefCount - 1; ++iPos)
+    ppShort[iPos] = ppShort[iPos + 1];
+  ppShort[iPos] = NULL;
+  --pList->uiShortRefCount;
+}
+// Compare frame numbers A and B on a counter that wraps at iMaxFrameNumPlus1: returns FRAME_NUM_EQUAL,
+// FRAME_NUM_BIGGER or FRAME_NUM_SMALLER for A relative to B, or -2 when either input exceeds the modulus.
+static inline int32_t CompareFrameNum (int32_t iFrameNumA, int32_t iFrameNumB, int32_t iMaxFrameNumPlus1) {
+  int64_t iNumA, iNumB, iDiffAB, iDiffMin;
+  if (iFrameNumA > iMaxFrameNumPlus1 || iFrameNumB > iMaxFrameNumPlus1) {
+    return -2; // NOTE(review): not a FRAME_NUM_* value; callers that bit-test the result should confirm this case is handled
+  }
+#define  WelsAbsDiffInt64(a,b) ( ( (a) > (b) ) ? ( (a) - (b) ) : ( (b) - (a) ) )
+  // distance between A and B taken at face value
+  iDiffAB = WelsAbsDiffInt64 ((int64_t) (iFrameNumA), (int64_t) (iFrameNumB));
+  iDiffMin = iDiffAB;
+  if (iDiffMin == 0) {
+    return FRAME_NUM_EQUAL;
+  }
+  // distance when A is taken one wrap ahead; if that is shorter, A is reported as the bigger number
+  iNumA = WelsAbsDiffInt64 ((int64_t) (iFrameNumA + iMaxFrameNumPlus1), (int64_t) (iFrameNumB));
+  if (iNumA == 0) {
+    return FRAME_NUM_EQUAL;
+  } else if (iDiffMin > iNumA) {
+    return FRAME_NUM_BIGGER;
+  }
+  // distance when B is taken one wrap ahead; if that is shorter, A is reported as the smaller number
+  iNumB = WelsAbsDiffInt64 ((int64_t) (iFrameNumB + iMaxFrameNumPlus1), (int64_t) (iFrameNumA));
+  if (iNumB == 0) {
+    return FRAME_NUM_EQUAL;
+  } else if (iDiffMin > iNumB) {
+    return FRAME_NUM_SMALLER;
+  }
+  // neither wrapped reading is closer, so order the raw values
+  return (iFrameNumA > iFrameNumB) ? (FRAME_NUM_BIGGER) : (FRAME_NUM_SMALLER);
+}
+/*
+ * Delete long-term references invalidated by an LTR recovery request: entries whose iFrameNum (or, in
+ * LTR_DELAY_MARK mode, iMarkFrameNum) is after iLastCorFrameNumDec but not after iCurFrameNumInDec.
+ */
+static inline void DeleteInvalidLTR (sWelsEncCtx* pCtx) {
+  SRefList* pRefList = pCtx->ppRefPicListExt[pCtx->uiDependencyId];
+  SPicture** pLongRefList = pRefList->pLongRefList;
+  SLTRState* pLtr = &pCtx->pLtr[pCtx->uiDependencyId];
+  int32_t iMaxFrameNumPlus1 = (1 << pCtx->pSps->uiLog2MaxFrameNum);
+  int32_t i;
+
+  for (i = 0; i < LONG_TERM_REF_NUM; i++) {
+    if (pLongRefList[i] != NULL) {
+      if (CompareFrameNum (pLongRefList[i]->iFrameNum , pLtr->iLastCorFrameNumDec, iMaxFrameNumPlus1) == FRAME_NUM_BIGGER
+          && (CompareFrameNum (pLongRefList[i]->iFrameNum , pLtr->iCurFrameNumInDec,
+                               iMaxFrameNumPlus1) & (FRAME_NUM_EQUAL | FRAME_NUM_SMALLER))) {
+        WelsLog (pCtx, WELS_LOG_WARNING, "LTR ,invalid LTR delete ,long_term_idx = %d , iFrameNum =%d \n",
+                 pLongRefList[i]->iLongTermPicNum, pLongRefList[i]->iFrameNum);
+        SetUnref (pLongRefList[i]);
+        DeleteLTRFromLongList (pCtx, i);
+        pLtr->bLTRMarkEnable = TRUE;
+        if (pRefList->uiLongRefCount == 0)
+          pCtx->bEncCurFrmAsIdrFlag = true;
+        --i; // the deletion shifted the next entry into slot i: examine that slot again
+      } else if (CompareFrameNum (pLongRefList[i]->iMarkFrameNum , pLtr->iLastCorFrameNumDec ,
+                                  iMaxFrameNumPlus1) == FRAME_NUM_BIGGER
+                 && (CompareFrameNum (pLongRefList[i]->iMarkFrameNum, pLtr->iCurFrameNumInDec ,
+                                      iMaxFrameNumPlus1) & (FRAME_NUM_EQUAL | FRAME_NUM_SMALLER))
+                 && pLtr->iLTRMarkMode == LTR_DELAY_MARK) {
+        WelsLog (pCtx, WELS_LOG_WARNING, "LTR ,iMarkFrameNum invalid LTR delete ,long_term_idx = %d , iFrameNum =%d \n",
+                 pLongRefList[i]->iLongTermPicNum, pLongRefList[i]->iFrameNum);
+        SetUnref (pLongRefList[i]);
+        DeleteLTRFromLongList (pCtx, i);
+        pLtr->bLTRMarkEnable = TRUE;
+        if (pRefList->uiLongRefCount == 0)
+          pCtx->bEncCurFrmAsIdrFlag = true;
+        --i; // the deletion shifted the next entry into slot i: examine that slot again
+      }
+    }
+  }
+}
+/*
+ * Apply pending LTR marking feedback. On success: confirm the acknowledged long-term picture, delete the
+ * entries whose iLongTermPicNum differs from iCurLtrIdx and advance the LTR index. On failure: delete the
+ * rejected picture and request an IDR if no mark has ever succeeded. The feedback state is then cleared.
+ */
+static inline void HandleLTRMarkFeedback (sWelsEncCtx* pCtx) {
+  SRefList* pRefList = pCtx->ppRefPicListExt[pCtx->uiDependencyId];
+  SPicture** pLongRefList = pRefList->pLongRefList;
+  SLTRState* pLtr = &pCtx->pLtr[pCtx->uiDependencyId];
+  int32_t i, j;
+
+  if (pLtr->uiLtrMarkState == LTR_MARKING_SUCCESS) {
+    WelsLog (pCtx, WELS_LOG_WARNING,
+             "pLtr->uiLtrMarkState = %d, pLtr.iCurLtrIdx = %d , pLtr->iLtrMarkFbFrameNum = %d ,pCtx->iFrameNum = %d ",
+             pLtr->uiLtrMarkState, pLtr->iCurLtrIdx, pLtr->iLtrMarkFbFrameNum, pCtx->iFrameNum);
+    for (i = 0; i < pRefList->uiLongRefCount; i++) {
+      if (pLongRefList[i]->iFrameNum == pLtr->iLtrMarkFbFrameNum && pLongRefList[i]->uiRecieveConfirmed != RECIEVE_SUCCESS) {
+        pLongRefList[i]->uiRecieveConfirmed = RECIEVE_SUCCESS;
+        pCtx->pVaa->uiValidLongTermPicIdx = pLongRefList[i]->iLongTermPicNum;
+        // record the acknowledged frame number in all three tracking fields
+        pLtr->iCurFrameNumInDec  =
+          pLtr->iLastRecoverFrameNum =
+            pLtr->iLastCorFrameNumDec = pLtr->iLtrMarkFbFrameNum;
+        // delete every long-term entry whose index differs from the current LTR index
+        for (j = 0; j < pRefList->uiLongRefCount; j++) {
+          if (pLongRefList[j]->iLongTermPicNum != pLtr->iCurLtrIdx) {
+            SetUnref (pLongRefList[j]);
+            DeleteLTRFromLongList (pCtx, j);
+            --j; // the deletion shifted the next entry into slot j: visit that slot again
+          }
+        }
+        pLtr->iLTRMarkSuccessNum++;
+        pLtr->iCurLtrIdx = (pLtr->iCurLtrIdx + 1) % LONG_TERM_REF_NUM; // single write; avoids modifying iCurLtrIdx twice in one expression
+        pLtr->iLTRMarkMode = (pLtr->iLTRMarkSuccessNum >= (LONG_TERM_REF_NUM)) ? (LTR_DELAY_MARK) : (LTR_DIRECT_MARK);
+        WelsLog (pCtx, WELS_LOG_WARNING, "LTR mark mode =%d", pLtr->iLTRMarkMode);
+        pLtr->bLTRMarkEnable = TRUE;
+        break;
+      }
+    }
+    pLtr->uiLtrMarkState = NO_LTR_MARKING_FEEDBACK;
+  } else if (pLtr->uiLtrMarkState == LTR_MARKING_FAILED) {
+    for (i = 0; i < pRefList->uiLongRefCount; i++) {
+      if (pLongRefList[i]->iFrameNum == pLtr->iLtrMarkFbFrameNum) {
+        SetUnref (pLongRefList[i]);
+        DeleteLTRFromLongList (pCtx, i);
+        break;
+      }
+    }
+    pLtr->uiLtrMarkState = NO_LTR_MARKING_FEEDBACK;
+    pLtr->bLTRMarkEnable = TRUE;
+    if (pLtr->iLTRMarkSuccessNum == 0) {
+      pCtx->bEncCurFrmAsIdrFlag = true; // no LTR, means IDR receive failed, force next frame IDR
+    }
+  }
+}
+/*
+ *	LTR mark process: tag the short-term picture selected for long-term use (bIsLongRef, iLongTermPicNum,
+ *	iMarkFrameNum) and, depending on the marking mode, move a tagged picture to the head of the long-term list
+ */
+static inline void LTRMarkProcess (sWelsEncCtx* pCtx) {
+  SRefList* pRefList		= pCtx->ppRefPicListExt[pCtx->uiDependencyId];
+  SPicture** pLongRefList = pRefList->pLongRefList;
+  SPicture** pShortRefList = pRefList->pShortRefList;
+  SLTRState* pLtr = &pCtx->pLtr[pCtx->uiDependencyId];
+  int32_t iGoPFrameNumInterval = ((pCtx->pSvcParam->uiGopSize >> 1) > 1) ? (pCtx->pSvcParam->uiGopSize >> 1) : (1);
+  int32_t iMaxFrameNumPlus1 = (1 << pCtx->pSps->uiLog2MaxFrameNum);
+  int32_t i = 0;
+  int32_t j = 0;
+  bool_t bMoveLtrFromShortToLong = false;
+  if (pCtx->eSliceType == I_SLICE)	{
+    i = 0;
+    pShortRefList[i]->uiRecieveConfirmed = RECIEVE_SUCCESS;
+  } else if (pLtr->bLTRMarkingFlag) {
+    pCtx->pVaa->uiMarkLongTermPicIdx = pLtr->iCurLtrIdx;
+    // delayed marking picks the entry iGoPFrameNumInterval frame numbers back; NOTE(review): if none matches, i == uiShortRefCount is used below - confirm
+    if (pLtr->iLTRMarkMode == LTR_DELAY_MARK)	{
+      for (i = 0; i < pRefList->uiShortRefCount; i++)	{
+        if (CompareFrameNum (pCtx->iFrameNum, pShortRefList[i]->iFrameNum + iGoPFrameNumInterval,
+                             iMaxFrameNumPlus1) == FRAME_NUM_EQUAL) {
+          break;
+        }
+      }
+    }
+  }
+  // tag the selected short-term picture (index i) as long-term, using the current LTR index
+  if (pCtx->eSliceType == I_SLICE || pLtr->bLTRMarkingFlag) {
+    pShortRefList[i]->bIsLongRef = true;
+    pShortRefList[i]->iLongTermPicNum = pLtr->iCurLtrIdx;
+    pShortRefList[i]->iMarkFrameNum = pCtx->iFrameNum;
+  }
+
+  // delay one gop to move LTR from the short-term list to the long-term list
+  if (pLtr->iLTRMarkMode == LTR_DIRECT_MARK && pCtx->eSliceType != I_SLICE && !pLtr->bLTRMarkingFlag) {
+    for (j = 0; j < pRefList->uiShortRefCount; j++) {
+      if (pRefList->pShortRefList[j]->bIsLongRef)	{
+        i = j;
+        bMoveLtrFromShortToLong = true;
+        break;
+      }
+    }
+  }
+  // insert the selected picture at the head of the long-term list and remove it from the short-term list
+  if ((pLtr->iLTRMarkMode == LTR_DELAY_MARK && pLtr->bLTRMarkingFlag) || ((pLtr->iLTRMarkMode == LTR_DIRECT_MARK)
+      && (bMoveLtrFromShortToLong))) {
+    if (pRefList->uiLongRefCount > 0) {
+      memmove (&pRefList->pLongRefList[1], &pRefList->pLongRefList[0],
+               pRefList->uiLongRefCount * sizeof (SPicture*));	// confirmed_safe_unsafe_usage
+    }
+    pLongRefList[0]	 = pShortRefList[i];
+    pRefList->uiLongRefCount++;
+    DeleteSTRFromShortList (pCtx, i);
+  }
+}
+// Choose the buffer that will hold the next reconstructed picture and publish it as both
+// pNextBuffer and pCtx->pDecPic: the first pRef[] entry not used as reference, otherwise the
+// last short-term reference (which is then marked unreferenced), otherwise NULL.
+static inline void PrefetchNextBuffer (sWelsEncCtx* pCtx) {
+  SRefList* pList = pCtx->ppRefPicListExt[pCtx->uiDependencyId];
+  const int32_t kiLastBufIdx = pCtx->pSvcParam->iNumRefFrame;
+  SPicture* pChosen = NULL;
+
+  for (int32_t iBuf = 0; iBuf <= kiLastBufIdx && NULL == pChosen; ++iBuf) {
+    if (!pList->pRef[iBuf]->bUsedAsRef)
+      pChosen = pList->pRef[iBuf];
+  }
+  if (NULL == pChosen && pList->uiShortRefCount > 0) {
+    pChosen = pList->pShortRefList[pList->uiShortRefCount - 1];
+    SetUnref (pChosen);
+  }
+
+  pList->pNextBuffer = pChosen;
+  pCtx->pDecPic = pChosen;
+}
+
+/*
+ * Insert the just-coded picture (pCtx->pDecPic) at the head of the short-term list of the current
+ * dependency layer, run LTR marking / clean-up for the slice type, then select the next decode buffer.
+ * Returns FALSE when the layer, the reference list or pDecPic is missing, TRUE otherwise.
+ */
+BOOL_T WelsUpdateRefList (sWelsEncCtx* pCtx) {
+  SRefList* pRefList = pCtx->ppRefPicListExt[pCtx->uiDependencyId];
+  SLTRState* pLtr = &pCtx->pLtr[pCtx->uiDependencyId];
+  SDLayerParam* pParamD = &pCtx->pSvcParam->sDependencyLayers[pCtx->uiDependencyId];
+  const int32_t kiNumRef = pCtx->pSvcParam->iNumRefFrame;
+  int32_t iRefIdx = 0;
+  const uint8_t kuiTid = pCtx->uiTemporalId;
+  const uint8_t kuiDid = pCtx->uiDependencyId;
+  const EWelsSliceType keSliceType = pCtx->eSliceType;
+  const int32_t kiSwapIdx = (pCtx->eSliceType == P_SLICE) ? (kiNumRef - LONG_TERM_REF_NUM) :
+                            ((pCtx->pSvcParam->bEnableLongTermReference) ? (kiNumRef - pLtr->iCurLtrIdx) : (1));
+  uint32_t i = 0;
+  // Need update pRef list in case store base layer or target dependency layer construction
+  if (NULL == pCtx->pCurDqLayer)
+    return FALSE;
+  // NOTE(review): kiSwapIdx indexes pRef[] without a range check - confirm iNumRefFrame >= LONG_TERM_REF_NUM
+  if (NULL == pRefList || NULL == pRefList->pRef[0] || NULL == pRefList->pRef[kiSwapIdx]
+      || NULL == pCtx->pDecPic) // pDecPic is dereferenced unconditionally below
+    return FALSE;
+
+#if !defined(ENABLE_FRAME_DUMP)	// to save complexity, 1/6/2009
+  if (pParamD->iHighestTemporalId == 0 || kuiTid < pParamD->iHighestTemporalId)
+#endif// !ENABLE_FRAME_DUMP
+    // Expanding picture for future reference
+    ExpandReferencingPicture (pCtx->pDecPic, pCtx->pFuncList->pfExpandLumaPicture, pCtx->pFuncList->pfExpandChromaPicture);
+
+  // move picture in list
+  pCtx->pDecPic->uiTemporalId = kuiTid;
+  pCtx->pDecPic->uiSpatialId = kuiDid;
+  pCtx->pDecPic->iFrameNum = pCtx->iFrameNum;
+  pCtx->pDecPic->iFramePoc = pCtx->iPOC;
+  pCtx->pDecPic->uiRecieveConfirmed = RECIEVE_UNKOWN;
+  pCtx->pDecPic->bUsedAsRef = true;
+  // shift the short-term list up by one slot and put the new picture in front
+  for (iRefIdx = pRefList->uiShortRefCount - 1; iRefIdx >= 0; --iRefIdx) {
+    pRefList->pShortRefList[iRefIdx + 1] = pRefList->pShortRefList[iRefIdx];
+  }
+  pRefList->pShortRefList[0] = pCtx->pDecPic;
+  pRefList->uiShortRefCount++;
+
+  if (keSliceType == P_SLICE) {
+    if (pCtx->uiTemporalId == 0) {
+      if (pCtx->pSvcParam->bEnableLongTermReference) {
+        LTRMarkProcess (pCtx);
+        DeleteInvalidLTR (pCtx);
+        HandleLTRMarkFeedback (pCtx);
+
+        pLtr->bReceivedT0LostFlag = FALSE; // reset to false due to the recovery is finished
+        pLtr->bLTRMarkingFlag = FALSE;
+        ++pLtr->uiLtrMarkInterval;
+      }
+      // release every short-term entry behind the head, then the head itself unless it is the current T0 frame
+      for (i = pRefList->uiShortRefCount - 1; i > 0; i--) {
+        SetUnref (pRefList->pShortRefList[i]);
+        DeleteSTRFromShortList (pCtx, i);
+      }
+      if (pRefList->uiShortRefCount > 0 && (pRefList->pShortRefList[0]->uiTemporalId > 0
+                                            || pRefList->pShortRefList[0]->iFrameNum != pCtx->iFrameNum)) {
+        SetUnref (pRefList->pShortRefList[0]);
+        DeleteSTRFromShortList (pCtx, 0);
+      }
+    }
+  } else {	// in case IDR currently coding
+    if (pCtx->pSvcParam->bEnableLongTermReference) {
+      LTRMarkProcess (pCtx);
+
+      pLtr->iCurLtrIdx = (pLtr->iCurLtrIdx + 1) % LONG_TERM_REF_NUM; // single write; avoids modifying iCurLtrIdx twice in one expression
+      pLtr->iLTRMarkSuccessNum = 1; //IDR default success
+      pLtr->bLTRMarkEnable =  TRUE;
+      pLtr->uiLtrMarkInterval = 0;
+
+      pCtx->pVaa->uiValidLongTermPicIdx = 0;
+      pCtx->pVaa->uiMarkLongTermPicIdx = 0;
+    }
+  }
+  PrefetchNextBuffer (pCtx);
+  return TRUE;
+}
+
+// Returns FALSE when a picture in the long-term list already occupies the frame number the next LTR
+// mark would refer to (the current one for direct marking, one interval ahead for delayed marking).
+bool_t CheckCurMarkFrameNumUsed (sWelsEncCtx* pCtx) {
+  SRefList* pList = pCtx->ppRefPicListExt[pCtx->uiDependencyId];
+  SLTRState* pState = &pCtx->pLtr[pCtx->uiDependencyId];
+  SPicture** ppLong = pList->pLongRefList;
+  int32_t iInterval = ((pCtx->pSvcParam->uiGopSize >> 1) > 1) ? (pCtx->pSvcParam->uiGopSize >> 1) : (1);
+  int32_t iModulus = (1 << pCtx->pSps->uiLog2MaxFrameNum);
+
+  for (int32_t iLtr = 0; iLtr < pList->uiLongRefCount; ++iLtr) {
+    if (pCtx->iFrameNum == ppLong[iLtr]->iFrameNum && pState->iLTRMarkMode == LTR_DIRECT_MARK)
+      return FALSE;
+    if (CompareFrameNum (pCtx->iFrameNum + iInterval, ppLong[iLtr]->iFrameNum, iModulus) == FRAME_NUM_EQUAL
+        && pState->iLTRMarkMode == LTR_DELAY_MARK)
+      return FALSE;
+  }
+  return TRUE;
+}
+// Decide whether the current frame is to be marked as a long-term reference (bLTRMarkingFlag) and write
+// the matching memory-management (MMCO) commands into the reference-marking syntax of the current layer.
+void WelsMarkPic (sWelsEncCtx* pCtx) {
+  SLTRState* pLtr = &pCtx->pLtr[pCtx->uiDependencyId];
+  const int32_t kiCountSliceNum			= GetCurrentSliceNum (pCtx->pCurDqLayer->pSliceEncCtx);
+  int32_t iGoPFrameNumInterval = ((pCtx->pSvcParam->uiGopSize >> 1) > 1) ? (pCtx->pSvcParam->uiGopSize >> 1) : (1);
+  int32_t iSliceIdx = 0;
+  if (pCtx->pSvcParam->bEnableLongTermReference && pLtr->bLTRMarkEnable && pCtx->uiTemporalId == 0) {
+    if (!pLtr->bReceivedT0LostFlag && pLtr->uiLtrMarkInterval > pCtx->pSvcParam->uiLtrMarkPeriod
+        && CheckCurMarkFrameNumUsed (pCtx)) {
+      pLtr->bLTRMarkingFlag = TRUE;
+      pLtr->bLTRMarkEnable = FALSE;
+      pLtr->uiLtrMarkInterval = 0;
+      pLtr->iLastLtrIdx = pLtr->iCurLtrIdx;
+    } else {
+      pLtr->bLTRMarkingFlag = FALSE;
+    }
+  }
+  // the marking syntax of every slice is zeroed; commands are written into the last slice header only
+  for (iSliceIdx = 0; iSliceIdx < kiCountSliceNum; iSliceIdx++)	{
+    SSliceHeaderExt*	pSliceHdrExt		= &pCtx->pCurDqLayer->sLayerInfo.pSliceInLayer[iSliceIdx].sSliceHeaderExt;
+    SSliceHeader*		pSliceHdr			= &pSliceHdrExt->sSliceHeader;
+    SRefPicMarking*		pRefPicMark		= &pSliceHdr->sRefMarking;
+    memset (pRefPicMark, 0, sizeof (SRefPicMarking));
+
+    if (iSliceIdx != kiCountSliceNum - 1)	{ //marking syntax only exist in last slice head
+      continue;
+    }
+    if (pCtx->pSvcParam->bEnableLongTermReference && pLtr->bLTRMarkingFlag) {
+      if (pLtr->iLTRMarkMode == LTR_DIRECT_MARK)	{
+        pRefPicMark->SMmcoRef[pRefPicMark->uiMmcoCount].iMaxLongTermFrameIdx = LONG_TERM_REF_NUM - 1;
+        pRefPicMark->SMmcoRef[pRefPicMark->uiMmcoCount++].iMmcoType = MMCO_SET_MAX_LONG;
+
+        pRefPicMark->SMmcoRef[pRefPicMark->uiMmcoCount].iDiffOfPicNum = iGoPFrameNumInterval;
+        pRefPicMark->SMmcoRef[pRefPicMark->uiMmcoCount++].iMmcoType = MMCO_SHORT2UNUSED;
+
+        pRefPicMark->SMmcoRef[pRefPicMark->uiMmcoCount].iLongTermFrameIdx = pLtr->iCurLtrIdx;
+        pRefPicMark->SMmcoRef[pRefPicMark->uiMmcoCount++].iMmcoType = MMCO_LONG;
+      } else if (pLtr->iLTRMarkMode == LTR_DELAY_MARK)	{
+        pRefPicMark->SMmcoRef[pRefPicMark->uiMmcoCount].iDiffOfPicNum = iGoPFrameNumInterval;
+        pRefPicMark->SMmcoRef[pRefPicMark->uiMmcoCount].iLongTermFrameIdx = pLtr->iCurLtrIdx;
+        pRefPicMark->SMmcoRef[pRefPicMark->uiMmcoCount++].iMmcoType = MMCO_SHORT2LONG;
+      }
+    }
+  }
+}
+
+// Process an LTR recovery request; always returns TRUE. With LTR disabled the current frame is flagged for IDR.
+// Otherwise a request matching the current IDR id flags an IDR, flags a lost T0, or records the decoder's frame numbers.
+int32_t FilterLTRRecoveryRequest (sWelsEncCtx* pCtx, SLTRRecoverRequest* pLTRRecoverRequest) {
+  SLTRRecoverRequest* pReq = pLTRRecoverRequest;
+  SLTRState* pState = &pCtx->pLtr[pCtx->uiDependencyId];
+  const int32_t kiModulus = (1 << pCtx->pSps->uiLog2MaxFrameNum);
+  if (!pCtx->pSvcParam->bEnableLongTermReference) {
+    pCtx->bEncCurFrmAsIdrFlag = TRUE;
+    return TRUE;
+  }
+  if (pReq->uiFeedbackType != LTR_RECOVERY_REQUEST || pReq->uiIDRPicId != pCtx->sPSOVector.uiIdrPicId)
+    return TRUE;
+  if (pReq->iLastCorrectFrameNum == -1) {
+    pCtx->bEncCurFrmAsIdrFlag = true;
+    return TRUE;
+  }
+  if (pReq->iCurrentFrameNum == -1) {
+    pState->bReceivedT0LostFlag = true;
+    return TRUE;
+  }
+  const int32_t kiVsCorrect = CompareFrameNum (pState->iLastRecoverFrameNum, pReq->iLastCorrectFrameNum, kiModulus);
+  const int32_t kiVsCurrent = CompareFrameNum (pState->iLastRecoverFrameNum, pReq->iCurrentFrameNum, kiModulus);
+  if ((kiVsCorrect & (FRAME_NUM_EQUAL | FRAME_NUM_SMALLER)) // t0 lost
+      || ((kiVsCurrent & (FRAME_NUM_EQUAL | FRAME_NUM_SMALLER)) && kiVsCorrect == FRAME_NUM_BIGGER)) { // recovery failed
+    pState->bReceivedT0LostFlag = true;
+    pState->iLastCorFrameNumDec = pReq->iLastCorrectFrameNum;
+    pState->iCurFrameNumInDec = pReq->iCurrentFrameNum;
+    WelsLog (pCtx, WELS_LOG_INFO,
+             "Receive valid LTR recovery pRequest,feedback_type = %d ,uiIdrPicId = %d , current_frame_num = %d , last correct frame num = %d"
+             , pReq->uiFeedbackType, pReq->uiIDRPicId, pReq->iCurrentFrameNum, pReq->iLastCorrectFrameNum);
+  }
+  WelsLog (pCtx, WELS_LOG_INFO,
+           "Receive LTR recovery pRequest,feedback_type = %d ,uiIdrPicId = %d , current_frame_num = %d , last correct frame num = %d"
+           , pReq->uiFeedbackType, pReq->uiIDRPicId, pReq->iCurrentFrameNum, pReq->iLastCorrectFrameNum);
+  return TRUE;
+}
+// Stores LTR marking feedback that names the current IDR id and is a success/failed report.
+void FilterLTRMarkingFeedback (sWelsEncCtx* pCtx, SLTRMarkingFeedback* pLTRMarkingFeedback) {
+  SLTRState* pLtrState = &pCtx->pLtr[pCtx->uiDependencyId];
+  assert (pLTRMarkingFeedback);
+  if (!pCtx->pSvcParam->bEnableLongTermReference)
+    return; // nothing is stored or logged while LTR is disabled
+  if (pLTRMarkingFeedback->uiIDRPicId != pCtx->sPSOVector.uiIdrPicId
+      || (pLTRMarkingFeedback->uiFeedbackType != LTR_MARKING_SUCCESS
+          && pLTRMarkingFeedback->uiFeedbackType != LTR_MARKING_FAILED)) { // IDR id mismatch or other type: log only
+    WelsLog (pCtx, WELS_LOG_INFO,
+             "Receive LTR marking feedback, feedback_type = %d , uiIdrPicId = %d , LTR_frame_num = %d , cur_idr_pic_id = %d",
+             pLTRMarkingFeedback->uiFeedbackType, pLTRMarkingFeedback->uiIDRPicId, pLTRMarkingFeedback->iLTRFrameNum ,
+             pCtx->sPSOVector.uiIdrPicId);
+  } else { // avoid error pData
+    pLtrState->uiLtrMarkState = pLTRMarkingFeedback->uiFeedbackType;
+    pLtrState->iLtrMarkFbFrameNum = pLTRMarkingFeedback->iLTRFrameNum;
+    WelsLog (pCtx, WELS_LOG_INFO,
+             "Receive valid LTR marking feedback, feedback_type = %d , uiIdrPicId = %d , LTR_frame_num = %d , cur_idr_pic_id = %d",
+             pLTRMarkingFeedback->uiFeedbackType, pLTRMarkingFeedback->uiIDRPicId, pLTRMarkingFeedback->iLTRFrameNum ,
+             pCtx->sPSOVector.uiIdrPicId);
+  }
+}
+
+/*
+ * build reference picture list 0 for the current dependency layer (iPOC is not used here).
+ * returns TRUE for an I slice, or when list 0 is non-empty after clamping to iNumRefFrame.
+ */
+BOOL_T WelsBuildRefList (sWelsEncCtx* pCtx, const int32_t iPOC) {
+  SRefList* pLayerRefs = pCtx->ppRefPicListExt[pCtx->uiDependencyId];
+  SLTRState* pLtrState = &pCtx->pLtr[pCtx->uiDependencyId];
+  const int32_t kiMaxRefNum = pCtx->pSvcParam->iNumRefFrame;
+  const uint8_t kuiCurTid = pCtx->uiTemporalId;
+  uint32_t uiIdx = 0;
+
+  // to support any type of cur_dq->mgs_control
+  //   [ 0:  using current layer to do ME/MC;
+  //     -1: using store base layer to do ME/MC;
+  //     2:  using highest layer to do ME/MC; ]
+
+  pCtx->iNumRef0 = 0;
+
+  if (pCtx->eSliceType == I_SLICE) { // safe for IDR
+    WelsResetRefList (pCtx);  //for IDR, SHOULD reset pRef list.
+    ResetLtrState (&pCtx->pLtr[pCtx->uiDependencyId]); //SHOULD update it when IDR.
+    pCtx->pRefList0[0] = NULL;
+  } else if (pCtx->pSvcParam->bEnableLongTermReference && pLtrState->bReceivedT0LostFlag && pCtx->uiTemporalId == 0) {
+    // T0-lost flag set on temporal layer 0: use the first long-term picture marked RECIEVE_SUCCESS
+    for (uiIdx = 0; uiIdx < pLayerRefs->uiLongRefCount; uiIdx++) {
+      if (pLayerRefs->pLongRefList[uiIdx]->uiRecieveConfirmed != RECIEVE_SUCCESS)
+        continue;
+      pCtx->pRefList0[pCtx->iNumRef0++] = pLayerRefs->pLongRefList[uiIdx];
+      pLtrState->iLastRecoverFrameNum = pCtx->iFrameNum;
+      // NOTE(review): "int32_t" in the text below looks like a rename artifact (long-term?) - confirm before editing
+      WelsLog (pCtx, WELS_LOG_INFO, "pRef is int32_t !iLastRecoverFrameNum = %d, pRef iFrameNum = %d,LTR number = %d,",
+               pLtrState->iLastRecoverFrameNum, pCtx->pRefList0[0]->iFrameNum, pLayerRefs->uiLongRefCount);
+      break;
+    }
+  } else {
+    // otherwise: take the first short-term picture that is in use, has POC >= 0 and temporal id <= current
+    for (uiIdx = 0; uiIdx < pLayerRefs->uiShortRefCount; ++ uiIdx) {
+      SPicture* pCand = pLayerRefs->pShortRefList[uiIdx];
+      if (pCand == NULL || !pCand->bUsedAsRef || pCand->iFramePoc < 0 || pCand->uiTemporalId > kuiCurTid)
+        continue;
+      pCtx->pRefList0[pCtx->iNumRef0++] = pCand;
+      break;
+    }
+  }
+
+  if (pCtx->iNumRef0 > kiMaxRefNum)
+    pCtx->iNumRef0 = kiMaxRefNum;
+  return (pCtx->iNumRef0 > 0 || pCtx->eSliceType == I_SLICE) ? (TRUE) : (FALSE);
+}
+
+/*
+ * update the per-slice header syntax derived from reference list 0:
+ * num_ref_idx_l0_active_minus1, ref_pic_list_reordering() and dec_ref_pic_marking()
+ */
+void WelsUpdateRefSyntax (sWelsEncCtx* pCtx, const int32_t iPOC, const int32_t uiFrameType) {
+  SLTRState* pLtrState = &pCtx->pLtr[pCtx->uiDependencyId];
+  const int32_t kiSliceNum = GetCurrentSliceNum (pCtx->pCurDqLayer->pSliceEncCtx);
+  int32_t iPicNumDiffMinus1 = -1; // only meaningful when list 0 has an entry
+
+  assert (kiSliceNum > 0);
+
+  /*syntax for ref_pic_list_reordering()*/
+  if (pCtx->iNumRef0 > 0)
+    iPicNumDiffMinus1 = pCtx->iFrameNum - (pCtx->pRefList0[0]->iFrameNum) - 1;
+
+  // every slice header of the layer is filled from the same context values
+  for (int32_t iSlice = 0; iSlice < kiSliceNum; iSlice++) {
+    SSliceHeader* pHdr = &pCtx->pCurDqLayer->sLayerInfo.pSliceInLayer[iSlice].sSliceHeaderExt.sSliceHeader;
+    SRefPicListReorderSyntax* pReorder = &pHdr->sRefReordering;
+    SRefPicMarking* pMarking = &pHdr->sRefMarking;
+
+    /*syntax for num_ref_idx_l0_active_minus1*/
+    pHdr->uiRefCount = pCtx->iNumRef0;
+
+    if (pCtx->iNumRef0 > 0) {
+      if (pCtx->pRefList0[0]->bIsLongRef) {
+        pReorder->SReorderingSyntax[0].uiReorderingOfPicNumsIdc = 2;
+        pReorder->SReorderingSyntax[0].iLongTermPicNum = pCtx->pRefList0[0]->iLongTermPicNum;
+      } else {
+        if (iPicNumDiffMinus1 < 0) { // negative distance: wrap by adding 1 << uiLog2MaxFrameNum
+          WelsLog (pCtx, WELS_LOG_INFO, "WelsUpdateRefSyntax():::uiAbsDiffPicNumMinus1:%d\n", iPicNumDiffMinus1);
+          iPicNumDiffMinus1 += (1 << (pCtx->pSps->uiLog2MaxFrameNum));
+          WelsLog (pCtx, WELS_LOG_INFO, "WelsUpdateRefSyntax():::uiAbsDiffPicNumMinus1< 0, update as:%d\n", iPicNumDiffMinus1);
+        }
+        pReorder->SReorderingSyntax[0].uiReorderingOfPicNumsIdc = 0;
+        pReorder->SReorderingSyntax[0].uiAbsDiffPicNumMinus1 = iPicNumDiffMinus1;
+      }
+      // the second entry (idc 3) is written for both the short-term and the long-term case
+      pReorder->SReorderingSyntax[1].uiReorderingOfPicNumsIdc = 3;
+    }
+
+    /*syntax for dec_ref_pic_marking()*/
+    if (WELS_FRAME_TYPE_IDR != uiFrameType) {
+      pMarking->bAdaptiveRefPicMarkingModeFlag = (pCtx->pSvcParam->bEnableLongTermReference
+          && pLtrState->bLTRMarkingFlag) ? (true) : (false);
+    } else {
+      pMarking->bNoOutputOfPriorPicsFlag = false;
+      pMarking->bLongTermRefFlag = pCtx->pSvcParam->bEnableLongTermReference;
+    }
+  }
+}
+
+} // namespace WelsSVCEnc
--- a/codec/encoder/core/src/sample.cpp
+++ b/codec/encoder/core/src/sample.cpp
@@ -1,531 +1,489 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	sample.c
- *
- * \brief	compute SAD and SATD
- *
- * \date	2009.06.02 Created
- *
- *************************************************************************************
- */
-
-#include "sample.h"
-#include "macros.h"
-
-#include "mc.h"
-#include "cpu_core.h"
-#include "array_stack_align.h"
-
-namespace WelsSVCEnc {
-int32_t WelsSampleSad4x4_c( uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2 )
-{
-	int32_t iSadSum = 0;
-	int32_t i = 0;
-	uint8_t* pSrc1 = pSample1;
-	uint8_t* pSrc2 = pSample2;
-	for ( i = 0; i < 4; i++ )
-	{
-		iSadSum += WELS_ABS( ( pSrc1[0] - pSrc2[0] ) );
-		iSadSum += WELS_ABS( ( pSrc1[1] - pSrc2[1] ) );
-		iSadSum += WELS_ABS( ( pSrc1[2] - pSrc2[2] ) );
-		iSadSum += WELS_ABS( ( pSrc1[3] - pSrc2[3] ) );
-
-		pSrc1 += iStride1;
-		pSrc2 += iStride2;
-	}
-
-	return iSadSum;
-} 
-
-int32_t WelsSampleSad8x8_c( uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2 )
-{
-	int32_t iSadSum = 0;
-	int32_t i = 0;
-	uint8_t* pSrc1 = pSample1;
-	uint8_t* pSrc2 = pSample2;
-	for ( i = 0; i < 8; i++ )
-	{
-		iSadSum += WELS_ABS( ( pSrc1[0] - pSrc2[0] ) );
-		iSadSum += WELS_ABS( ( pSrc1[1] - pSrc2[1] ) );
-		iSadSum += WELS_ABS( ( pSrc1[2] - pSrc2[2] ) );
-		iSadSum += WELS_ABS( ( pSrc1[3] - pSrc2[3] ) );
-		iSadSum += WELS_ABS( ( pSrc1[4] - pSrc2[4] ) );
-		iSadSum += WELS_ABS( ( pSrc1[5] - pSrc2[5] ) );
-		iSadSum += WELS_ABS( ( pSrc1[6] - pSrc2[6] ) );
-		iSadSum += WELS_ABS( ( pSrc1[7] - pSrc2[7] ) );
-
-		pSrc1 += iStride1;
-		pSrc2 += iStride2;
-	}
-
-	return iSadSum;
-} 
-int32_t WelsSampleSad16x8_c( uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2 )
-{
-	int32_t iSadSum = 0;
-
-	iSadSum += WelsSampleSad8x8_c( pSample1,     iStride1, pSample2,     iStride2 );
-	iSadSum += WelsSampleSad8x8_c( pSample1 + 8, iStride1, pSample2 + 8, iStride2 );
-
-	return iSadSum;
-} 
-int32_t WelsSampleSad8x16_c( uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2 )
-{
-	int32_t iSadSum = 0;
-	iSadSum += WelsSampleSad8x8_c( pSample1,                   iStride1, pSample2,                   iStride2 );
-	iSadSum += WelsSampleSad8x8_c( pSample1+(iStride1<<3), iStride1, pSample2+(iStride2<<3), iStride2 );
-
-	return iSadSum;
-} 
-int32_t WelsSampleSad16x16_c( uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2 )
-{
-	int32_t iSadSum = 0;
-	iSadSum += WelsSampleSad8x8_c( pSample1,                     iStride1, pSample2,                     iStride2 );
-	iSadSum += WelsSampleSad8x8_c( pSample1+8,                   iStride1, pSample2+8,                   iStride2 );
-	iSadSum += WelsSampleSad8x8_c( pSample1+(iStride1<<3),   iStride1, pSample2+(iStride2<<3),   iStride2 );
-	iSadSum += WelsSampleSad8x8_c( pSample1+(iStride1<<3)+8, iStride1, pSample2+(iStride2<<3)+8, iStride2 );
-
-	return iSadSum;
-} 
-
-int32_t WelsSampleSatd4x4_c( uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2 )
-{
-	int32_t iSatdSum = 0;
-	int32_t pSampleMix[4][4] = { 0 };
-	int32_t iSample0, iSample1, iSample2, iSample3;
-	int32_t i = 0;
-	uint8_t* pSrc1 = pSample1;
-	uint8_t* pSrc2 = pSample2;
-
-	//step 1: get the difference
-	for( i = 0; i < 4; i++ )
-	{
-		pSampleMix[i][0] = pSrc1[0] - pSrc2[0];
-		pSampleMix[i][1] = pSrc1[1] - pSrc2[1];
-		pSampleMix[i][2] = pSrc1[2] - pSrc2[2];
-		pSampleMix[i][3] = pSrc1[3] - pSrc2[3];
-
-		pSrc1 += iStride1;
-		pSrc2 += iStride2;
-	}
-
-	//step 2: horizontal transform
-	for ( i = 0; i < 4; i++ )
-	{
-		iSample0 = pSampleMix[i][0] + pSampleMix[i][2];
-		iSample1 = pSampleMix[i][1] + pSampleMix[i][3];
-		iSample2 = pSampleMix[i][0] - pSampleMix[i][2];
-		iSample3 = pSampleMix[i][1] - pSampleMix[i][3];
-
-		pSampleMix[i][0] = iSample0 + iSample1;		
-		pSampleMix[i][1] = iSample2 + iSample3;
-		pSampleMix[i][2] = iSample2 - iSample3;
-		pSampleMix[i][3] = iSample0 - iSample1;
-	}
-
-	//step 3: vertical transform and get the sum of SATD
-	for ( i = 0; i < 4; i++ )
-	{
-		iSample0 = pSampleMix[0][i] + pSampleMix[2][i];
-		iSample1 = pSampleMix[1][i] + pSampleMix[3][i];
-		iSample2 = pSampleMix[0][i] - pSampleMix[2][i];
-		iSample3 = pSampleMix[1][i] - pSampleMix[3][i];
-
-		pSampleMix[0][i] = iSample0 + iSample1;		
-		pSampleMix[1][i] = iSample2 + iSample3;
-		pSampleMix[2][i] = iSample2 - iSample3;
-		pSampleMix[3][i] = iSample0 - iSample1;
-
-		iSatdSum += ( WELS_ABS( pSampleMix[0][i] ) + WELS_ABS( pSampleMix[1][i] ) + WELS_ABS( pSampleMix[2][i] ) + WELS_ABS( pSampleMix[3][i] ) );
-	}
-
-	return ( (iSatdSum+1)>>1 );
-}
-int32_t WelsSampleSatd8x8_c( uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2 )
-{
-	int32_t iSatdSum = 0;
-
-	iSatdSum += WelsSampleSatd4x4_c( pSample1,                     iStride1, pSample2,                     iStride2 );
-	iSatdSum += WelsSampleSatd4x4_c( pSample1+4,                   iStride1, pSample2+4,                   iStride2 );
-	iSatdSum += WelsSampleSatd4x4_c( pSample1+(iStride1<<2),   iStride1, pSample2+(iStride2<<2),   iStride2 );
-	iSatdSum += WelsSampleSatd4x4_c( pSample1+(iStride1<<2)+4, iStride1, pSample2+(iStride2<<2)+4, iStride2 );
-
-	return iSatdSum;
-}
-int32_t WelsSampleSatd16x8_c( uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2 )
-{
-	int32_t iSatdSum = 0;
-
-	iSatdSum += WelsSampleSatd8x8_c( pSample1,   iStride1, pSample2,   iStride2 );
-	iSatdSum += WelsSampleSatd8x8_c( pSample1+8, iStride1, pSample2+8, iStride2 );
-
-	return iSatdSum;
-}
-int32_t WelsSampleSatd8x16_c( uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2 )
-{
-	int32_t iSatdSum = 0;
-
-	iSatdSum += WelsSampleSatd8x8_c( pSample1,                   iStride1, pSample2,                   iStride2 );
-	iSatdSum += WelsSampleSatd8x8_c( pSample1+(iStride1<<3), iStride1, pSample2+(iStride2<<3), iStride2 );
-
-	return iSatdSum;
-}
-int32_t WelsSampleSatd16x16_c( uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2 )
-{
-	int32_t iSatdSum = 0;
-
-	iSatdSum += WelsSampleSatd8x8_c( pSample1,                     iStride1, pSample2,                     iStride2 );
-	iSatdSum += WelsSampleSatd8x8_c( pSample1+8,                   iStride1, pSample2+8,                   iStride2 );
-	iSatdSum += WelsSampleSatd8x8_c( pSample1+(iStride1<<3),   iStride1, pSample2+(iStride2<<3),   iStride2 );
-	iSatdSum += WelsSampleSatd8x8_c( pSample1+(iStride1<<3)+8, iStride1, pSample2+(iStride2<<3)+8, iStride2 );
-
-	return iSatdSum;
-}
-
-
-void WelsSampleSadFour16x16_c( uint8_t *iSample1, int32_t iStride1, uint8_t *iSample2, int32_t iStride2, int32_t* pSad)
-{
-	*(pSad)     = WelsSampleSad16x16_c(iSample1, iStride1, (iSample2-iStride2), iStride2);
-	*(pSad + 1) = WelsSampleSad16x16_c(iSample1, iStride1, (iSample2+iStride2), iStride2);
-	*(pSad + 2) = WelsSampleSad16x16_c(iSample1, iStride1, (iSample2-1), iStride2);
-	*(pSad + 3) = WelsSampleSad16x16_c(iSample1, iStride1, (iSample2+1), iStride2);
-}
-void WelsSampleSadFour16x8_c(uint8_t *iSample1, int32_t iStride1, uint8_t *iSample2, int32_t iStride2, int32_t* pSad)
-{
-	*(pSad)     = WelsSampleSad16x8_c(iSample1, iStride1, (iSample2-iStride2), iStride2);
-	*(pSad + 1) = WelsSampleSad16x8_c(iSample1, iStride1, (iSample2+iStride2), iStride2);
-	*(pSad + 2) = WelsSampleSad16x8_c(iSample1, iStride1, (iSample2-1), iStride2);
-	*(pSad + 3) = WelsSampleSad16x8_c(iSample1, iStride1, (iSample2+1), iStride2);
-}
-void WelsSampleSadFour8x16_c( uint8_t *iSample1, int32_t iStride1, uint8_t *iSample2, int32_t iStride2, int32_t* pSad)
-{
-	*(pSad)     = WelsSampleSad8x16_c(iSample1, iStride1, (iSample2-iStride2), iStride2);
-	*(pSad + 1) = WelsSampleSad8x16_c(iSample1, iStride1, (iSample2+iStride2), iStride2);
-	*(pSad + 2) = WelsSampleSad8x16_c(iSample1, iStride1, (iSample2-1), iStride2);
-	*(pSad + 3) = WelsSampleSad8x16_c(iSample1, iStride1, (iSample2+1), iStride2);
-
-}
-void WelsSampleSadFour8x8_c( uint8_t *iSample1, int32_t iStride1, uint8_t *iSample2, int32_t iStride2, int32_t* pSad)
-{
-	*(pSad)     = WelsSampleSad8x8_c(iSample1, iStride1, (iSample2-iStride2), iStride2);
-	*(pSad + 1) = WelsSampleSad8x8_c(iSample1, iStride1, (iSample2+iStride2), iStride2);
-	*(pSad + 2) = WelsSampleSad8x8_c(iSample1, iStride1, (iSample2-1), iStride2);
-	*(pSad + 3) = WelsSampleSad8x8_c(iSample1, iStride1, (iSample2+1), iStride2);
-}
-void WelsSampleSadFour4x4_c( uint8_t *iSample1, int32_t iStride1, uint8_t *iSample2, int32_t iStride2, int32_t* pSad)
-{
-	*(pSad)     = WelsSampleSad4x4_c(iSample1, iStride1, (iSample2-iStride2), iStride2);
-	*(pSad + 1) = WelsSampleSad4x4_c(iSample1, iStride1, (iSample2+iStride2), iStride2);
-	*(pSad + 2) = WelsSampleSad4x4_c(iSample1, iStride1, (iSample2-1), iStride2);
-	*(pSad + 3) = WelsSampleSad4x4_c(iSample1, iStride1, (iSample2+1), iStride2);
-}
-
-extern void WelsI4x4LumaPredDc_c(uint8_t *pPred, uint8_t *pRef, const int32_t iStride);
-extern void WelsI4x4LumaPredH_c(uint8_t *pPred, uint8_t *pRef, const int32_t iStride);
-extern void WelsI4x4LumaPredV_c(uint8_t *pPred, uint8_t *pRef, const int32_t iStride);
-
-int32_t WelsSampleSatdIntra4x4Combined3_c(uint8_t *pDec, int32_t iDecStride, uint8_t *pEnc, int32_t iEncStride, uint8_t *pDst, 
-						  int32_t *pBestMode, int32_t iLambda2, int32_t iLambda1, int32_t iLambda0)
-{
-	int32_t iBestMode = -1;
-	int32_t iCurCost, iBestCost = INT_MAX;
-	ENFORCE_STACK_ALIGN_2D(uint8_t, uiLocalBuffer, 3, 16, 16)
-	
-	WelsI4x4LumaPredDc_c(uiLocalBuffer[2], pDec, iDecStride);
-	iCurCost = WelsSampleSatd4x4_c(uiLocalBuffer[2], 4, pEnc, iEncStride) + iLambda2;
-	if (iCurCost < iBestCost)
-	{			
-		iBestMode = 2;
-		iBestCost = iCurCost;
-	}
-
-	WelsI4x4LumaPredH_c(uiLocalBuffer[1], pDec, iDecStride);
-	iCurCost = WelsSampleSatd4x4_c(uiLocalBuffer[1], 4, pEnc, iEncStride) + iLambda1;
-	if (iCurCost < iBestCost)
-	{			
-		iBestMode = 1;
-		iBestCost = iCurCost;
-	}
-	WelsI4x4LumaPredV_c(uiLocalBuffer[0], pDec, iDecStride);
-	iCurCost = WelsSampleSatd4x4_c(uiLocalBuffer[0], 4, pEnc, iEncStride) + iLambda0;
-	if (iCurCost < iBestCost)
-	{			
-		iBestMode = 0;
-		iBestCost = iCurCost;
-	}
-
-	memcpy(pDst, uiLocalBuffer[iBestMode], 16*sizeof(uint8_t));	// confirmed_safe_unsafe_usage
-	*pBestMode = iBestMode;
-
-	return iBestCost;
-}
-extern void WelsIChormaPredDc_c(uint8_t *pPred, uint8_t *pRef, const int32_t iStride);
-extern void WelsIChormaPredH_c(uint8_t *pPred, uint8_t *pRef, const int32_t iStride);
-extern void WelsIChormaPredV_c(uint8_t *pPred, uint8_t *pRef, const int32_t iStride);
-
-int32_t WelsSampleSatdIntra8x8Combined3_c(uint8_t *pDecCb, int32_t iDecStride, uint8_t *pEncCb, int32_t iEncStride, 
-							int32_t *pBestMode, int32_t iLambda, uint8_t *pDstChroma,uint8_t *pDecCr,uint8_t *pEncCr)
-{
-	int32_t iBestMode = -1;
-	int32_t iCurCost, iBestCost = INT_MAX;
-
-	WelsIChormaPredV_c(pDstChroma, pDecCb, iDecStride);
-	WelsIChormaPredV_c(pDstChroma+64, pDecCr, iDecStride);
-	iCurCost = WelsSampleSatd8x8_c(pDstChroma, 8, pEncCb, iEncStride);
-	iCurCost += WelsSampleSatd8x8_c(pDstChroma+64, 8, pEncCr, iEncStride) + iLambda * 2;
-	
-	if (iCurCost < iBestCost)
-	{			
-		iBestMode = 2;
-		iBestCost = iCurCost;
-	}
-	
-	WelsIChormaPredH_c(pDstChroma, pDecCb, iDecStride);
-	WelsIChormaPredH_c(pDstChroma+64, pDecCr, iDecStride);
-	iCurCost = WelsSampleSatd8x8_c(pDstChroma, 8, pEncCb, iEncStride);
-	iCurCost += WelsSampleSatd8x8_c(pDstChroma+64, 8, pEncCr, iEncStride) + iLambda * 2;
-	if (iCurCost < iBestCost)
-	{			
-		iBestMode = 1;
-		iBestCost = iCurCost;
-	}
-	WelsIChormaPredDc_c(pDstChroma, pDecCb, iDecStride);
-	WelsIChormaPredDc_c(pDstChroma+64, pDecCr, iDecStride);
-	iCurCost = WelsSampleSatd8x8_c(pDstChroma, 8, pEncCb, iEncStride);
-	iCurCost += WelsSampleSatd8x8_c(pDstChroma+64, 8, pEncCr, iEncStride);
-	if (iCurCost < iBestCost)
-	{			
-		iBestMode = 0;
-		iBestCost = iCurCost;
-	}
-	
-	*pBestMode	= iBestMode;
-
-    return iBestCost;
-	
-
-}
-int32_t WelsSampleSadIntra8x8Combined3_c(uint8_t *pDecCb, int32_t iDecStride, uint8_t *pEncCb, int32_t iEncStride, 
-							int32_t *pBestMode, int32_t iLambda, uint8_t *pDstChroma,uint8_t *pDecCr,uint8_t *pEncCr)
-{
-	int32_t iBestMode = -1;
-	int32_t iCurCost, iBestCost = INT_MAX;
-	
-	WelsIChormaPredV_c(pDstChroma, pDecCb, iDecStride);
-	WelsIChormaPredV_c(pDstChroma+64, pDecCr, iDecStride);
-	iCurCost = WelsSampleSad8x8_c(pDstChroma, 8, pEncCb, iEncStride);
-	iCurCost += WelsSampleSad8x8_c(pDstChroma+64, 8, pEncCr, iEncStride) + iLambda * 2;
-	
-	if (iCurCost < iBestCost)
-	{			
-		iBestMode = 2;
-		iBestCost = iCurCost;
-	}
-	
-	WelsIChormaPredH_c(pDstChroma, pDecCb, iDecStride);
-	WelsIChormaPredH_c(pDstChroma+64, pDecCr, iDecStride);
-	iCurCost = WelsSampleSad8x8_c(pDstChroma, 8, pEncCb, iEncStride);
-	iCurCost += WelsSampleSad8x8_c(pDstChroma+64, 8, pEncCr, iEncStride) + iLambda * 2;
-	if (iCurCost < iBestCost)
-	{			
-		iBestMode = 1;
-		iBestCost = iCurCost;
-	}
-	WelsIChormaPredDc_c(pDstChroma, pDecCb, iDecStride);
-	WelsIChormaPredDc_c(pDstChroma+64, pDecCr, iDecStride);
-	iCurCost = WelsSampleSad8x8_c(pDstChroma, 8, pEncCb, iEncStride);
-	iCurCost += WelsSampleSad8x8_c(pDstChroma+64, 8, pEncCr, iEncStride);
-	if (iCurCost < iBestCost)
-	{			
-		iBestMode = 0;
-		iBestCost = iCurCost;
-	}
-
-	*pBestMode = iBestMode;
-
-    return iBestCost;
-
-}
-
-extern void WelsI16x16LumaPredDc_c(uint8_t *pPred, uint8_t *pRef, const int32_t iStride);
-extern void WelsI16x16LumaPredH_c(uint8_t *pPred, uint8_t *pRef, const int32_t iStride);
-extern void WelsI16x16LumaPredV_c(uint8_t *pPred, uint8_t *pRef, const int32_t iStride);
-
-int32_t WelsSampleSatdIntra16x16Combined3_c(uint8_t *pDec, int32_t iDecStride, uint8_t *pEnc, int32_t iEncStride, 
-							  int32_t *pBestMode, int32_t iLambda, uint8_t *pDst)
-{
-	int32_t iBestMode = -1;
-	int32_t iCurCost, iBestCost = INT_MAX;
-	
-	WelsI16x16LumaPredV_c(pDst, pDec, iDecStride);
-	iCurCost = WelsSampleSatd16x16_c(pDst, 16, pEnc, iEncStride);
-	
-	if (iCurCost < iBestCost)
-	{			
-		iBestMode = 0;
-		iBestCost = iCurCost;
-	}
-	
-	WelsI16x16LumaPredH_c(pDst, pDec, iDecStride);
-	iCurCost = WelsSampleSatd16x16_c(pDst, 16, pEnc, iEncStride) + iLambda * 2;
-	if (iCurCost < iBestCost)
-	{			
-		iBestMode = 1;
-		iBestCost = iCurCost;
-	}
-	WelsI16x16LumaPredDc_c(pDst, pDec, iDecStride);
-	iCurCost = WelsSampleSatd16x16_c(pDst, 16, pEnc, iEncStride) + iLambda * 2;
-	if (iCurCost < iBestCost)
-	{			
-		iBestMode = 2;
-		iBestCost = iCurCost;
-	}
-	
-	*pBestMode = iBestMode;
-
-    return iBestCost;
-	
-	
-}
-int32_t WelsSampleSadIntra16x16Combined3_c(uint8_t *pDec, int32_t iDecStride, uint8_t *pEnc, int32_t iEncStride, 
-							  int32_t *pBestMode, int32_t iLambda, uint8_t *pDst)
-{
-	int32_t iBestMode = -1;
-	int32_t iCurCost, iBestCost = INT_MAX;
-	
-	WelsI16x16LumaPredV_c(pDst, pDec, iDecStride);
-	iCurCost = WelsSampleSad16x16_c(pDst, 16, pEnc, iEncStride);
-	
-	if (iCurCost < iBestCost)
-	{			
-		iBestMode = 0;
-		iBestCost = iCurCost;
-	}
-	
-	WelsI16x16LumaPredH_c(pDst, pDec, iDecStride);
-	iCurCost = WelsSampleSad16x16_c(pDst, 16, pEnc, iEncStride) + iLambda * 2;
-	if (iCurCost < iBestCost)
-	{			
-		iBestMode = 1;
-		iBestCost = iCurCost;
-	}
-	WelsI16x16LumaPredDc_c(pDst, pDec, iDecStride);
-	iCurCost = WelsSampleSad16x16_c(pDst, 16, pEnc, iEncStride) + iLambda * 2;
-	if (iCurCost < iBestCost)
-	{			
-		iBestMode = 2;
-		iBestCost = iCurCost;
-	}
-	
-	*pBestMode = iBestMode;
-
-    return iBestCost;
-	
-	
-}
-
-void WelsInitSampleSadFunc( SWelsFuncPtrList *pFuncList, uint32_t uiCpuFlag)
-{
-	//pfSampleSad init
-	pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] = WelsSampleSad16x16_c;
-	pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x8 ] = WelsSampleSad16x8_c;
-	pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x16 ] = WelsSampleSad8x16_c;
-	pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8  ] = WelsSampleSad8x8_c;
-	pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x4  ] = WelsSampleSad4x4_c;
-
-	//pfSampleSatd init
-	pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_c;
-	pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8 ] = WelsSampleSatd16x8_c;
-	pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_c;
-	pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8  ] = WelsSampleSatd8x8_c;
-	pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4  ] = WelsSampleSatd4x4_c;
-
-	pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x16] = WelsSampleSadFour16x16_c;
-	pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x8] = WelsSampleSadFour16x8_c;
-	pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x16] = WelsSampleSadFour8x16_c;
-	pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x8] = WelsSampleSadFour8x8_c;
-	pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_4x4] = WelsSampleSadFour4x4_c;
-
-	pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd   = NULL;
-	pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd   = NULL;
-	pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Sad    = NULL;
-	pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = NULL;
-	pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad  = NULL;
-
-#if defined (X86_ASM)
-	if ( uiCpuFlag & WELS_CPU_MMXEXT )
-	{
-		pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x4  ] = WelsSampleSad4x4_mmx;
-	}	
-	
-	if ( uiCpuFlag & WELS_CPU_SSE2 )
-	{
-		pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] = WelsSampleSad16x16_sse2;
-		pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x8 ] = WelsSampleSad16x8_sse2;
-		pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x16] = WelsSampleSad8x16_sse2;
-		pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] = WelsSampleSad8x8_sse21;
-
-		pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x16] = WelsSampleSadFour16x16_sse2;
-	    pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x8] = WelsSampleSadFour16x8_sse2;
-	    pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x16] = WelsSampleSadFour8x16_sse2;
-	    pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x8] = WelsSampleSadFour8x8_sse2;
-	    pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_4x4] = WelsSampleSadFour4x4_sse2;
-		
-		pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4  ] = WelsSampleSatd4x4_sse2;
-		pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8  ] = WelsSampleSatd8x8_sse2;
-		pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_sse2;
-		pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8 ] = WelsSampleSatd16x8_sse2;
-		pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_sse2;		
-       	pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd =  WelsSmpleSatdThree4x4_sse2;
-	}	
-
-	if (uiCpuFlag & WELS_CPU_SSSE3)
-	{
-		pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad = WelsIntra16x16Combined3Sad_ssse3;
-	}
-
-	if( uiCpuFlag & WELS_CPU_SSE41 )
-	{	   
-	    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_sse41;
-		pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8] = WelsSampleSatd16x8_sse41;
-		pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16] = WelsSampleSatd8x16_sse41;
-		pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8] = WelsSampleSatd8x8_sse41;
-		pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4] = WelsSampleSatd4x4_sse41;
-		pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = WelsIntra16x16Combined3Satd_sse41;
-		pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd = WelsIntraChroma8x8Combined3Satd_sse41;
-	}
-	
-#endif //(X86_ASM)
-
-}
-
-} // namespace WelsSVCEnc
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	sample.cpp
+ *
+ * \brief	compute SAD and SATD
+ *
+ * \date	2009.06.02 Created
+ *
+ *************************************************************************************
+ */
+
+#include "sample.h"
+#include "macros.h"
+
+#include "mc.h"
+#include "cpu_core.h"
+#include "array_stack_align.h"
+
+namespace WelsSVCEnc {
+// Sum of absolute differences between two 4x4 blocks of 8-bit samples.
+// iStride1/iStride2 are added to the respective row pointer after each of the four rows.
+int32_t WelsSampleSad4x4_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) {
+  int32_t iSadSum = 0;
+  uint8_t* pRow1 = pSample1;
+  uint8_t* pRow2 = pSample2;
+
+  for (int32_t iRow = 0; iRow < 4; iRow++) {
+    for (int32_t iCol = 0; iCol < 4; iCol++) {
+      iSadSum += WELS_ABS ((pRow1[iCol] - pRow2[iCol]));
+    }
+    // step both cursors down one row
+    pRow1 += iStride1;
+    pRow2 += iStride2;
+  }
+  return iSadSum;
+}
+
+// Sum of absolute differences between two 8x8 blocks of 8-bit samples.
+// iStride1/iStride2 are added to the respective row pointer after each of the eight rows.
+int32_t WelsSampleSad8x8_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) {
+  int32_t iSadSum = 0;
+  uint8_t* pRow1 = pSample1;
+  uint8_t* pRow2 = pSample2;
+  int32_t iRowsLeft = 8;
+
+  while (iRowsLeft-- > 0) {
+    // accumulate |a - b| over the eight samples of the current row
+    for (int32_t iCol = 0; iCol < 8; iCol++) {
+      iSadSum += WELS_ABS ((pRow1[iCol] - pRow2[iCol]));
+    }
+
+    // move on to the next row of each block
+    pRow1 += iStride1;
+    pRow2 += iStride2;
+  }
+
+  return iSadSum;
+}
+// SAD of a 16-wide, 8-high block: the sum of its left and right 8x8 halves.
+int32_t WelsSampleSad16x8_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) {
+  // left half starts at column 0, right half at column 8 of the same rows
+  const int32_t kiLeftSad  = WelsSampleSad8x8_c (pSample1,     iStride1, pSample2,     iStride2);
+  const int32_t kiRightSad = WelsSampleSad8x8_c (pSample1 + 8, iStride1, pSample2 + 8, iStride2);
+
+  return kiLeftSad + kiRightSad;
+}
+// SAD of an 8-wide, 16-high block: the top 8x8 half plus the 8x8 half eight rows below it.
+int32_t WelsSampleSad8x16_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) {
+  uint8_t* pLower1 = pSample1 + (iStride1 << 3); // eight rows down: stride << 3
+  uint8_t* pLower2 = pSample2 + (iStride2 << 3);
+  return WelsSampleSad8x8_c (pSample1, iStride1, pSample2, iStride2)
+         + WelsSampleSad8x8_c (pLower1, iStride1, pLower2, iStride2);
+}
+int32_t WelsSampleSad16x16_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) {
+  // SAD of a 16x16 block as the sum of its four 8x8 quadrants (offsets 0, +8, +8 strides, +8 strides +8).
+  uint8_t* pLower1 = pSample1 + (iStride1 << 3);
+  uint8_t* pLower2 = pSample2 + (iStride2 << 3);
+  int32_t iTotal = WelsSampleSad8x8_c (pSample1, iStride1, pSample2, iStride2);
+  iTotal += WelsSampleSad8x8_c (pSample1 + 8, iStride1, pSample2 + 8, iStride2);
+  iTotal += WelsSampleSad8x8_c (pLower1, iStride1, pLower2, iStride2);
+  return iTotal + WelsSampleSad8x8_c (pLower1 + 8, iStride1, pLower2 + 8, iStride2);
+}
+
+int32_t WelsSampleSatd4x4_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) {
+  // SATD of a 4x4 block: the sample difference is run through add/subtract butterflies along
+  // rows and then columns, the absolute results are summed and the sum is halved with rounding.
+  // pSample1/pSample2 address the first sample of each block; iStride1/iStride2 are the
+  // element offsets between consecutive rows of the respective buffers.
+  int32_t iDiff[4][4] = { 0 };
+  int32_t iSum02, iSum13, iDif02, iDif13;
+  int32_t iTotal = 0;
+  const uint8_t* pRowA = pSample1;
+  const uint8_t* pRowB = pSample2;
+
+  // residual: sample1 - sample2, row by row
+  for (int32_t iRow = 0; iRow < 4; ++iRow) {
+    for (int32_t iCol = 0; iCol < 4; ++iCol) {
+      iDiff[iRow][iCol] = pRowA[iCol] - pRowB[iCol];
+    }
+    pRowA += iStride1;
+    pRowB += iStride2;
+  }
+
+  // butterfly along each row, results stored back in place
+  for (int32_t iRow = 0; iRow < 4; ++iRow) {
+    int32_t* pLine = iDiff[iRow];
+
+    iSum02 = pLine[0] + pLine[2];
+    iSum13 = pLine[1] + pLine[3];
+    iDif02 = pLine[0] - pLine[2];
+    iDif13 = pLine[1] - pLine[3];
+
+    pLine[0] = iSum02 + iSum13;
+    pLine[1] = iDif02 + iDif13;
+    pLine[2] = iDif02 - iDif13;
+    pLine[3] = iSum02 - iSum13;
+  }
+
+  // butterfly along each column; the absolute outputs are accumulated as each column completes
+  for (int32_t iCol = 0; iCol < 4; ++iCol) {
+    iSum02 = iDiff[0][iCol] + iDiff[2][iCol];
+    iSum13 = iDiff[1][iCol] + iDiff[3][iCol];
+    iDif02 = iDiff[0][iCol] - iDiff[2][iCol];
+    iDif13 = iDiff[1][iCol] - iDiff[3][iCol];
+
+    iTotal += WELS_ABS ((iSum02 + iSum13));
+    iTotal += WELS_ABS ((iDif02 + iDif13));
+    iTotal += WELS_ABS ((iDif02 - iDif13));
+    iTotal += WELS_ABS ((iSum02 - iSum13));
+  }
+
+  return ((iTotal + 1) >> 1);
+}
+int32_t WelsSampleSatd8x8_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) {
+  // SATD of an 8x8 block: sum of the (individually rounded) 4x4 SATDs of its four quadrants.
+  uint8_t* pLower1 = pSample1 + (iStride1 << 2);
+  uint8_t* pLower2 = pSample2 + (iStride2 << 2);
+
+  int32_t iTotal = WelsSampleSatd4x4_c (pSample1, iStride1, pSample2, iStride2);
+  iTotal += WelsSampleSatd4x4_c (pSample1 + 4, iStride1, pSample2 + 4, iStride2);
+  iTotal += WelsSampleSatd4x4_c (pLower1, iStride1, pLower2, iStride2);
+  return iTotal + WelsSampleSatd4x4_c (pLower1 + 4, iStride1, pLower2 + 4, iStride2);
+}
+int32_t WelsSampleSatd16x8_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) {
+  // SATD of a 16-wide, 8-high block: the 8x8 SATD at offset 0 plus the 8x8 SATD at offset +8 samples.
+  const int32_t kiHalfWidth = 8;
+  const int32_t iLeft  = WelsSampleSatd8x8_c (pSample1, iStride1, pSample2, iStride2);
+  const int32_t iRight = WelsSampleSatd8x8_c (pSample1 + kiHalfWidth, iStride1, pSample2 + kiHalfWidth, iStride2);
+
+  return iLeft + iRight;
+}
+int32_t WelsSampleSatd8x16_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) {
+  // SATD of an 8-wide, 16-high block: the 8x8 SATD at the start plus the 8x8 SATD eight strides further on.
+  uint8_t* pLower1 = pSample1 + (iStride1 << 3);
+  uint8_t* pLower2 = pSample2 + (iStride2 << 3);
+  const int32_t iTop    = WelsSampleSatd8x8_c (pSample1, iStride1, pSample2, iStride2);
+  const int32_t iBottom = WelsSampleSatd8x8_c (pLower1, iStride1, pLower2, iStride2);
+  return iTop + iBottom;
+}
+int32_t WelsSampleSatd16x16_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) {
+  // SATD of a 16x16 block as the sum of its four 8x8 quadrants (offsets 0, +8, +8 strides, +8 strides +8).
+  uint8_t* pLower1 = pSample1 + (iStride1 << 3);
+  uint8_t* pLower2 = pSample2 + (iStride2 << 3);
+
+  int32_t iTotal = WelsSampleSatd8x8_c (pSample1, iStride1, pSample2, iStride2);
+  iTotal += WelsSampleSatd8x8_c (pSample1 + 8, iStride1, pSample2 + 8, iStride2);
+  iTotal += WelsSampleSatd8x8_c (pLower1, iStride1, pLower2, iStride2);
+  return iTotal + WelsSampleSatd8x8_c (pLower1 + 8, iStride1, pLower2 + 8, iStride2);
+}
+
+
+void WelsSampleSadFour16x16_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad) {
+  // pSad[0..3] receive the 16x16 SAD of iSample1 against iSample2 displaced by -iStride2, +iStride2, -1 and +1.
+  pSad[0] = WelsSampleSad16x16_c (iSample1, iStride1, iSample2 - iStride2, iStride2);
+  pSad[1] = WelsSampleSad16x16_c (iSample1, iStride1, iSample2 + iStride2, iStride2);
+  pSad[2] = WelsSampleSad16x16_c (iSample1, iStride1, iSample2 - 1, iStride2);
+  pSad[3] = WelsSampleSad16x16_c (iSample1, iStride1, iSample2 + 1, iStride2);
+}
+void WelsSampleSadFour16x8_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad) {
+  pSad[0] = WelsSampleSad16x8_c (iSample1, iStride1, iSample2 - iStride2, iStride2);  // reference one stride back
+  pSad[1] = WelsSampleSad16x8_c (iSample1, iStride1, iSample2 + iStride2, iStride2);  // reference one stride on
+  pSad[2] = WelsSampleSad16x8_c (iSample1, iStride1, iSample2 - 1, iStride2);         // reference one sample back
+  pSad[3] = WelsSampleSad16x8_c (iSample1, iStride1, iSample2 + 1, iStride2);         // reference one sample on
+}
+void WelsSampleSadFour8x16_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad) {
+  // pSad[0..3] receive the 8x16 SAD of iSample1 against iSample2 displaced by -iStride2, +iStride2, -1 and +1.
+  pSad[0] = WelsSampleSad8x16_c (iSample1, iStride1, iSample2 - iStride2, iStride2);
+  pSad[1] = WelsSampleSad8x16_c (iSample1, iStride1, iSample2 + iStride2, iStride2);
+  pSad[2] = WelsSampleSad8x16_c (iSample1, iStride1, iSample2 - 1, iStride2);
+  pSad[3] = WelsSampleSad8x16_c (iSample1, iStride1, iSample2 + 1, iStride2);
+}
+void WelsSampleSadFour8x8_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad) {
+  pSad[0] = WelsSampleSad8x8_c (iSample1, iStride1, iSample2 - iStride2, iStride2);  // reference one stride back
+  pSad[1] = WelsSampleSad8x8_c (iSample1, iStride1, iSample2 + iStride2, iStride2);  // reference one stride on
+  pSad[2] = WelsSampleSad8x8_c (iSample1, iStride1, iSample2 - 1, iStride2);         // reference one sample back
+  pSad[3] = WelsSampleSad8x8_c (iSample1, iStride1, iSample2 + 1, iStride2);         // reference one sample on
+}
+void WelsSampleSadFour4x4_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad) {
+  pSad[0] = WelsSampleSad4x4_c (iSample1, iStride1, iSample2 - iStride2, iStride2);  // reference one stride back
+  pSad[1] = WelsSampleSad4x4_c (iSample1, iStride1, iSample2 + iStride2, iStride2);  // reference one stride on
+  pSad[2] = WelsSampleSad4x4_c (iSample1, iStride1, iSample2 - 1, iStride2);         // reference one sample back
+  pSad[3] = WelsSampleSad4x4_c (iSample1, iStride1, iSample2 + 1, iStride2);         // reference one sample on
+}
+
+extern void WelsI4x4LumaPredDc_c (uint8_t* pPred, uint8_t* pRef, const int32_t iStride);
+extern void WelsI4x4LumaPredH_c (uint8_t* pPred, uint8_t* pRef, const int32_t iStride);
+extern void WelsI4x4LumaPredV_c (uint8_t* pPred, uint8_t* pRef, const int32_t iStride);
+
+int32_t WelsSampleSatdIntra4x4Combined3_c (uint8_t* pDec, int32_t iDecStride, uint8_t* pEnc, int32_t iEncStride,
+    uint8_t* pDst,
+    int32_t* pBestMode, int32_t iLambda2, int32_t iLambda1, int32_t iLambda0) {
+  // Evaluates the DC (mode 2), H (mode 1) and V (mode 0) 4x4 luma predictions in that order and keeps the
+  // candidate whose SATD against pEnc plus its own lambda is strictly lower than everything seen before.
+  int32_t iCost, iLowest = INT_MAX;
+  int32_t iWinner = -1;
+  ENFORCE_STACK_ALIGN_2D (uint8_t, uiLocalBuffer, 3, 16, 16)
+  WelsI4x4LumaPredDc_c (uiLocalBuffer[2], pDec, iDecStride);
+  iCost = iLambda2 + WelsSampleSatd4x4_c (uiLocalBuffer[2], 4, pEnc, iEncStride);
+  if (iLowest > iCost) {
+    iLowest = iCost;
+    iWinner = 2;
+  }
+  WelsI4x4LumaPredH_c (uiLocalBuffer[1], pDec, iDecStride);
+  iCost = iLambda1 + WelsSampleSatd4x4_c (uiLocalBuffer[1], 4, pEnc, iEncStride);
+  if (iLowest > iCost) {
+    iLowest = iCost;
+    iWinner = 1;
+  }
+  WelsI4x4LumaPredV_c (uiLocalBuffer[0], pDec, iDecStride);
+  iCost = iLambda0 + WelsSampleSatd4x4_c (uiLocalBuffer[0], 4, pEnc, iEncStride);
+  if (iLowest > iCost) {
+    iLowest = iCost;
+    iWinner = 0;
+  }
+
+  memcpy (pDst, uiLocalBuffer[iWinner], 16 * sizeof (uint8_t));	// confirmed_safe_unsafe_usage
+  *pBestMode = iWinner;
+
+  return iLowest;
+}
+extern void WelsIChormaPredDc_c (uint8_t* pPred, uint8_t* pRef, const int32_t iStride);
+extern void WelsIChormaPredH_c (uint8_t* pPred, uint8_t* pRef, const int32_t iStride);
+extern void WelsIChormaPredV_c (uint8_t* pPred, uint8_t* pRef, const int32_t iStride);
+
+int32_t WelsSampleSatdIntra8x8Combined3_c (uint8_t* pDecCb, int32_t iDecStride, uint8_t* pEncCb, int32_t iEncStride,
+    int32_t* pBestMode, int32_t iLambda, uint8_t* pDstChroma, uint8_t* pDecCr, uint8_t* pEncCr) {
+  // Picks the cheapest of the V (mode 2), H (mode 1) and DC (mode 0) chroma predictions by summed Cb + Cr
+  // SATD; V and H carry an extra 2 * iLambda. Cb is predicted into pDstChroma, Cr into pDstChroma + 64.
+  // Candidates are generated in place, so on return pDstChroma holds the DC prediction (generated last).
+  uint8_t* pDstCr = pDstChroma + 64;
+  const int32_t kiPenalty = iLambda * 2;
+  int32_t iCost, iLowest = INT_MAX;
+  int32_t iWinner = -1;
+
+  WelsIChormaPredV_c (pDstChroma, pDecCb, iDecStride);   // candidate: mode 2
+  WelsIChormaPredV_c (pDstCr, pDecCr, iDecStride);
+  iCost = WelsSampleSatd8x8_c (pDstChroma, 8, pEncCb, iEncStride);
+  iCost += WelsSampleSatd8x8_c (pDstCr, 8, pEncCr, iEncStride) + kiPenalty;
+  if (iLowest > iCost) {
+    iLowest = iCost;
+    iWinner = 2;
+  }
+  WelsIChormaPredH_c (pDstChroma, pDecCb, iDecStride);   // candidate: mode 1
+  WelsIChormaPredH_c (pDstCr, pDecCr, iDecStride);
+  iCost = WelsSampleSatd8x8_c (pDstChroma, 8, pEncCb, iEncStride);
+  iCost += WelsSampleSatd8x8_c (pDstCr, 8, pEncCr, iEncStride) + kiPenalty;
+  if (iLowest > iCost) {
+    iLowest = iCost;
+    iWinner = 1;
+  }
+  WelsIChormaPredDc_c (pDstChroma, pDecCb, iDecStride);  // candidate: mode 0, no lambda term
+  WelsIChormaPredDc_c (pDstCr, pDecCr, iDecStride);
+  iCost = WelsSampleSatd8x8_c (pDstChroma, 8, pEncCb, iEncStride);
+  iCost += WelsSampleSatd8x8_c (pDstCr, 8, pEncCr, iEncStride);
+  if (iLowest > iCost) {
+    iLowest = iCost;
+    iWinner = 0;
+  }
+
+  *pBestMode = iWinner;
+  return iLowest;
+}
+int32_t WelsSampleSadIntra8x8Combined3_c (uint8_t* pDecCb, int32_t iDecStride, uint8_t* pEncCb, int32_t iEncStride,
+    int32_t* pBestMode, int32_t iLambda, uint8_t* pDstChroma, uint8_t* pDecCr, uint8_t* pEncCr) {
+  // SAD-based twin of the chroma mode choice: V (mode 2) and H (mode 1) cost Cb + Cr SAD + 2 * iLambda,
+  // DC (mode 0) costs Cb + Cr SAD only. On return pDstChroma (Cb) / pDstChroma + 64 (Cr) hold the DC prediction.
+  uint8_t* pDstCr = pDstChroma + 64;
+  const int32_t kiPenalty = iLambda * 2;
+  int32_t iCost, iLowest = INT_MAX;
+  int32_t iWinner = -1;
+
+  WelsIChormaPredV_c (pDstChroma, pDecCb, iDecStride);   // candidate: mode 2
+  WelsIChormaPredV_c (pDstCr, pDecCr, iDecStride);
+  iCost = WelsSampleSad8x8_c (pDstChroma, 8, pEncCb, iEncStride);
+  iCost += WelsSampleSad8x8_c (pDstCr, 8, pEncCr, iEncStride) + kiPenalty;
+  if (iLowest > iCost) {
+    iLowest = iCost;
+    iWinner = 2;
+  }
+  WelsIChormaPredH_c (pDstChroma, pDecCb, iDecStride);   // candidate: mode 1
+  WelsIChormaPredH_c (pDstCr, pDecCr, iDecStride);
+  iCost = WelsSampleSad8x8_c (pDstChroma, 8, pEncCb, iEncStride);
+  iCost += WelsSampleSad8x8_c (pDstCr, 8, pEncCr, iEncStride) + kiPenalty;
+  if (iLowest > iCost) {
+    iLowest = iCost;
+    iWinner = 1;
+  }
+  WelsIChormaPredDc_c (pDstChroma, pDecCb, iDecStride);  // candidate: mode 0, no lambda term
+  WelsIChormaPredDc_c (pDstCr, pDecCr, iDecStride);
+  iCost = WelsSampleSad8x8_c (pDstChroma, 8, pEncCb, iEncStride);
+  iCost += WelsSampleSad8x8_c (pDstCr, 8, pEncCr, iEncStride);
+  if (iLowest > iCost) {
+    iLowest = iCost;
+    iWinner = 0;
+  }
+
+  *pBestMode = iWinner;
+  return iLowest;
+}
+
+extern void WelsI16x16LumaPredDc_c (uint8_t* pPred, uint8_t* pRef, const int32_t iStride);
+extern void WelsI16x16LumaPredH_c (uint8_t* pPred, uint8_t* pRef, const int32_t iStride);
+extern void WelsI16x16LumaPredV_c (uint8_t* pPred, uint8_t* pRef, const int32_t iStride);
+
+int32_t WelsSampleSatdIntra16x16Combined3_c (uint8_t* pDec, int32_t iDecStride, uint8_t* pEnc, int32_t iEncStride,
+    int32_t* pBestMode, int32_t iLambda, uint8_t* pDst) {
+  // Picks the cheapest of the V (mode 0), H (mode 1) and DC (mode 2) 16x16 luma predictions by SATD
+  // against pEnc; H and DC carry an extra 2 * iLambda. Each candidate is generated into pDst (row
+  // length 16), so on return pDst holds the DC prediction, which is generated last.
+  const int32_t kiPenalty = iLambda * 2;
+  int32_t iCost, iLowest = INT_MAX;
+  int32_t iWinner = -1;
+
+  WelsI16x16LumaPredV_c (pDst, pDec, iDecStride);
+  iCost = WelsSampleSatd16x16_c (pDst, 16, pEnc, iEncStride);
+  if (iLowest > iCost) {
+    iLowest = iCost;
+    iWinner = 0;
+  }
+  WelsI16x16LumaPredH_c (pDst, pDec, iDecStride);
+  iCost = WelsSampleSatd16x16_c (pDst, 16, pEnc, iEncStride) + kiPenalty;
+  if (iLowest > iCost) {
+    iLowest = iCost;
+    iWinner = 1;
+  }
+  WelsI16x16LumaPredDc_c (pDst, pDec, iDecStride);
+  iCost = WelsSampleSatd16x16_c (pDst, 16, pEnc, iEncStride) + kiPenalty;
+  if (iLowest > iCost) {
+    iLowest = iCost;
+    iWinner = 2;
+  }
+
+  *pBestMode = iWinner;
+
+  return iLowest;
+}
+int32_t WelsSampleSadIntra16x16Combined3_c (uint8_t* pDec, int32_t iDecStride, uint8_t* pEnc, int32_t iEncStride,
+    int32_t* pBestMode, int32_t iLambda, uint8_t* pDst) {
+  // SAD-based twin of the 16x16 luma mode choice: V (mode 0) costs its SAD only, H (mode 1) and
+  // DC (mode 2) cost SAD + 2 * iLambda. Each candidate is generated into pDst (row length 16),
+  // so on return pDst holds the DC prediction, which is generated last.
+  const int32_t kiPenalty = iLambda * 2;
+  int32_t iCost, iLowest = INT_MAX;
+  int32_t iWinner = -1;
+
+  WelsI16x16LumaPredV_c (pDst, pDec, iDecStride);
+  iCost = WelsSampleSad16x16_c (pDst, 16, pEnc, iEncStride);
+  if (iLowest > iCost) {
+    iLowest = iCost;
+    iWinner = 0;
+  }
+  WelsI16x16LumaPredH_c (pDst, pDec, iDecStride);
+  iCost = WelsSampleSad16x16_c (pDst, 16, pEnc, iEncStride) + kiPenalty;
+  if (iLowest > iCost) {
+    iLowest = iCost;
+    iWinner = 1;
+  }
+  WelsI16x16LumaPredDc_c (pDst, pDec, iDecStride);
+  iCost = WelsSampleSad16x16_c (pDst, 16, pEnc, iEncStride) + kiPenalty;
+  if (iLowest > iCost) {
+    iLowest = iCost;
+    iWinner = 2;
+  }
+
+  *pBestMode = iWinner;
+
+  return iLowest;
+}
+
+void WelsInitSampleSadFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
+  // Installs the C SAD/SATD routines into pFuncList->sSampleDealingFuncs; X86_ASM builds then override by CPU flag.
+  pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] = WelsSampleSad16x16_c;  // pfSampleSad init
+  pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x8 ] = WelsSampleSad16x8_c;
+  pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x16 ] = WelsSampleSad8x16_c;
+  pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8  ] = WelsSampleSad8x8_c;
+  pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x4  ] = WelsSampleSad4x4_c;
+
+  //pfSampleSatd init
+  pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_c;
+  pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8 ] = WelsSampleSatd16x8_c;
+  pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_c;
+  pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8  ] = WelsSampleSatd8x8_c;
+  pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4  ] = WelsSampleSatd4x4_c;
+  // pfSample4Sad init: SAD against the reference displaced by -stride, +stride, -1 and +1
+  pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x16] = WelsSampleSadFour16x16_c;
+  pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x8] = WelsSampleSadFour16x8_c;
+  pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x16] = WelsSampleSadFour8x16_c;
+  pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x8] = WelsSampleSadFour8x8_c;
+  pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_4x4] = WelsSampleSadFour4x4_c;
+  // no C routine is installed for the combined three-mode intra costs: the pointers start out NULL
+  pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd   = NULL;
+  pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd   = NULL;
+  pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Sad    = NULL;
+  pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = NULL;
+  pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad  = NULL;
+
+#if defined (X86_ASM)
+  if (uiCpuFlag & WELS_CPU_MMXEXT) {
+    pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x4  ] = WelsSampleSad4x4_mmx;
+  }
+  // SSE2 replaces every SAD routine except 4x4, all four-offset SADs, all SATDs and the combined 4x4 SATD
+  if (uiCpuFlag & WELS_CPU_SSE2) {
+    pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] = WelsSampleSad16x16_sse2;
+    pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x8 ] = WelsSampleSad16x8_sse2;
+    pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x16] = WelsSampleSad8x16_sse2;
+    pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] = WelsSampleSad8x8_sse21;
+
+    pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x16] = WelsSampleSadFour16x16_sse2;
+    pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x8] = WelsSampleSadFour16x8_sse2;
+    pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x16] = WelsSampleSadFour8x16_sse2;
+    pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x8] = WelsSampleSadFour8x8_sse2;
+    pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_4x4] = WelsSampleSadFour4x4_sse2;
+
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4  ] = WelsSampleSatd4x4_sse2;
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8  ] = WelsSampleSatd8x8_sse2;
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_sse2;
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8 ] = WelsSampleSatd16x8_sse2;
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_sse2;
+    pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd =  WelsSmpleSatdThree4x4_sse2;
+  }
+  // SSSE3 adds the combined 16x16 intra SAD
+  if (uiCpuFlag & WELS_CPU_SSSE3) {
+    pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad = WelsIntra16x16Combined3Sad_ssse3;
+  }
+  // SSE4.1 is tested last, so its SATD routines overwrite the SSE2 ones assigned above when both flags are set
+  if (uiCpuFlag & WELS_CPU_SSE41) {
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_sse41;
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8] = WelsSampleSatd16x8_sse41;
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16] = WelsSampleSatd8x16_sse41;
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8] = WelsSampleSatd8x8_sse41;
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4] = WelsSampleSatd4x4_sse41;
+    pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = WelsIntra16x16Combined3Satd_sse41;
+    pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd = WelsIntraChroma8x8Combined3Satd_sse41;
+  }
+
+#endif //(X86_ASM)
+
+}
+
+} // namespace WelsSVCEnc
--- a/codec/encoder/core/src/set_mb_syn_cavlc.cpp
+++ b/codec/encoder/core/src/set_mb_syn_cavlc.cpp
@@ -1,229 +1,215 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	set_mb_syn_cavlc.h
- *
- * \brief	Seting all syntax elements of mb and decoding residual with cavlc
- *
- * \date	05/19/2009 Created
- *
- *************************************************************************************
- */
-
-#include "set_mb_syn_cavlc.h"
-#include "svc_enc_golomb.h"
-#include "vlc_encoder.h"
-#include "cpu_core.h"
-#include "array_stack_align.h"
-
-namespace WelsSVCEnc {
-SCoeffFunc    sCoeffFunc;
-
-const  ALIGNED_DECLARE(uint8_t, g_kuiZeroLeftMap[16], 16) = 
-{
-	0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
-};
-
-const ALIGNED_DECLARE(uint8_t, g_kuiTrailingOneIndex[8], 16) = 
-{
-	3, 0, 1, 0, 2, 0, 1, 0
-};
-
-int32_t CavlcParamCal_c(int16_t *pCoffLevel, uint8_t *pRun, int16_t *pLevel, int32_t *pTotalCoeff , int32_t iLastIndex)
-{
-	int32_t iTotalZeros = 0;
-	int32_t iTotalCoeffs = 0;
-
-	while (iLastIndex >= 0 && pCoffLevel[iLastIndex] == 0) {
-		-- iLastIndex;
-	}
-	 
-	while (iLastIndex >= 0) {
-		int32_t iCountZero = 0;
-		pLevel[iTotalCoeffs] = pCoffLevel[iLastIndex--];   
-
-		while (iLastIndex >= 0 && pCoffLevel[iLastIndex] == 0) {
-			++ iCountZero;
-			-- iLastIndex;
-		}
-		iTotalZeros += iCountZero;
-		pRun[iTotalCoeffs++] = iCountZero;
-	}
-	*pTotalCoeff = iTotalCoeffs;
-	return iTotalZeros;
-}
-
-void  WriteBlockResidualCavlc( int16_t *pCoffLevel, int32_t iEndIdx, int32_t iCalRunLevelFlag, int32_t iResidualProperty, int8_t iNC, SBitStringAux *pBs )
-{		
-	ENFORCE_STACK_ALIGN_1D(int16_t, iLevel, 16, 16)
-	ENFORCE_STACK_ALIGN_1D(uint8_t, uiRun, 16, 16)
-
-	int32_t iTotalCoeffs = 0;
-	int32_t iTrailingOnes = 0;
-	int32_t iTotalZeros = 0, iZerosLeft = 0;
-	uint32_t uiSign = 0;
-	int32_t iLevelCode = 0, iLevelPrefix = 0, iLevelSuffix = 0, uiSuffixLength = 0, iLevelSuffixSize = 0;
-	int32_t iValue = 0, iThreshold, iZeroLeft;
-	int32_t n = 0;	
-	int32_t i = 0;
-
-
-	CAVLC_BS_INIT(pBs);
-
-    /*Step 1: calculate iLevel and iRun and total */ 
-
-	if( iCalRunLevelFlag ){
-		int32_t iCount = 0;
-		iTotalZeros = sCoeffFunc.pfCavlcParamCal(pCoffLevel, uiRun, iLevel, &iTotalCoeffs, iEndIdx);        
-		iCount = (iTotalCoeffs>3)?3:iTotalCoeffs;
-		for(i = 0;i <iCount ;i++)
-		{
-			if(WELS_ABS(iLevel[i]) == 1)
-			{
-				iTrailingOnes ++;
-				uiSign <<=1;
-				if(iLevel[i]<0)
-					uiSign|=1;
-			}
-			else
-			{
-				break;
-
-			}
-		}
-	}
-	/*Step 3: coeff token */
-	const uint8_t *upCoeffToken = &g_kuiVlcCoeffToken[g_kuiEncNcMapTable[iNC]][iTotalCoeffs][iTrailingOnes][0];
-	iValue = upCoeffToken[0];
-	n = upCoeffToken[1];	
-
-    if( iTotalCoeffs == 0 )
-    {
-		CAVLC_BS_WRITE(n, iValue);
-
-		CAVLC_BS_UNINIT(pBs);
-        return;
-    }	
-
-    /* Step 4: */
-   /*  trailing */
-	n += iTrailingOnes;
-	iValue = (iValue << iTrailingOnes) + uiSign;
-	CAVLC_BS_WRITE(n, iValue);
-
-    /*  levels */
-	uiSuffixLength = ( iTotalCoeffs > 10 && iTrailingOnes < 3 ) ? 1 : 0;	
-
-	for( i=iTrailingOnes; i<iTotalCoeffs; i++ ){
-		int32_t iVal = iLevel[i];
-
-			iLevelCode = (iVal-1)<<1;
-			uiSign = (iLevelCode>>31);
-			iLevelCode = (iLevelCode ^ uiSign) + (uiSign<<1);
-			iLevelCode -= ((i == iTrailingOnes) && (iTrailingOnes < 3)) << 1;
-
-			iLevelPrefix = iLevelCode >> uiSuffixLength; 
-			iLevelSuffixSize = uiSuffixLength;
-			iLevelSuffix = iLevelCode - (iLevelPrefix<<uiSuffixLength);
-
-			if (iLevelPrefix >= 14 && iLevelPrefix < 30 && uiSuffixLength == 0) {
-				iLevelPrefix = 14; 
-				iLevelSuffix = iLevelCode - iLevelPrefix;
-				iLevelSuffixSize = 4;
-			}
-			else if (iLevelPrefix >= 15) {
-				iLevelPrefix = 15; 
-				iLevelSuffix = iLevelCode - (iLevelPrefix << uiSuffixLength);
-
-				if (uiSuffixLength == 0) {
-					iLevelSuffix -= 15;
-				}
-				iLevelSuffixSize = 12;
-			}		
-
-			n = iLevelPrefix + 1 + iLevelSuffixSize;
-			iValue = ((1<< iLevelSuffixSize) | iLevelSuffix);
-			CAVLC_BS_WRITE(n, iValue);
-
-			uiSuffixLength += !uiSuffixLength;
-			iThreshold = 3 << ( uiSuffixLength - 1 );
-			uiSuffixLength += ((iVal > iThreshold) || (iVal < -iThreshold)) && (uiSuffixLength < 6);
-
-	}
-
-    /* Step 5: total zeros */
-
-    if( iTotalCoeffs < iEndIdx + 1 )
-    {
-		if ( CHROMA_DC != iResidualProperty )
-		{	
-			const uint8_t *upTotalZeros = &g_kuiVlcTotalZeros[iTotalCoeffs][iTotalZeros][0];
-			n = upTotalZeros[1];
-			iValue = upTotalZeros[0];
-			CAVLC_BS_WRITE( n, iValue );
-		}
-		else
-		{	
-			const uint8_t *upTotalZeros = &g_kuiVlcTotalZerosChromaDc[iTotalCoeffs][iTotalZeros][0];
-			n = upTotalZeros[1];
-			iValue = upTotalZeros[0];
-			CAVLC_BS_WRITE( n, iValue );	
-		}
-    }
-
-    /* Step 6: pRun before */	
-	iZerosLeft = iTotalZeros;
-    for( i = 0; i+1 < iTotalCoeffs && iZerosLeft > 0; ++ i )
-    {	
-		const uint8_t uirun = uiRun[i];
-		iZeroLeft = g_kuiZeroLeftMap[iZerosLeft];
-		n = g_kuiVlcRunBefore[iZeroLeft][uirun][1];
-		iValue = g_kuiVlcRunBefore[iZeroLeft][uirun][0];
-		CAVLC_BS_WRITE(n, iValue);		
-        iZerosLeft -= uirun;
-    }
-
-	CAVLC_BS_UNINIT(pBs);
-}
-
-
-void InitCoeffFunc( const uint32_t uiCpuFlag)
-{
-	sCoeffFunc.pfCavlcParamCal = CavlcParamCal_c;
-
-#if defined(X86_ASM)
-	if( uiCpuFlag & WELS_CPU_SSE2 ){
-		sCoeffFunc.pfCavlcParamCal = CavlcParamCal_sse2;
-	}
-#endif
-}
-
-} // namespace WelsSVCEnc
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	set_mb_syn_cavlc.cpp
+ *
+ * \brief	Setting all syntax elements of mb and encoding residual with cavlc
+ *
+ * \date	05/19/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#include "set_mb_syn_cavlc.h"
+#include "svc_enc_golomb.h"
+#include "vlc_encoder.h"
+#include "cpu_core.h"
+#include "array_stack_align.h"
+
+namespace WelsSVCEnc {
+SCoeffFunc    sCoeffFunc;  // function table; pfCavlcParamCal is assigned by InitCoeffFunc
+// zeros-left value (0..15) -> first index into g_kuiVlcRunBefore; every value above 7 maps to 7
+const  ALIGNED_DECLARE (uint8_t, g_kuiZeroLeftMap[16], 16) = {
+  0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
+};
+// entry i equals the number of trailing zero bits of the 3-bit index i (3 for i == 0)
+const ALIGNED_DECLARE (uint8_t, g_kuiTrailingOneIndex[8], 16) = {
+  3, 0, 1, 0, 2, 0, 1, 0
+};
+
+int32_t CavlcParamCal_c (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeff ,
+                         int32_t iLastIndex) {
+  // Scans pCoffLevel from iLastIndex down to 0. Non-zero values are stored to pLevel in scan order
+  // (highest index first); pRun[k] is the count of zeros directly below the k-th stored value.
+  // Stores the number of non-zero values in *pTotalCoeff and returns the sum of all runs.
+  int32_t iPos = iLastIndex;
+  int32_t iZeroSum = 0;
+  int32_t iCount = 0;
+
+  for (; iPos >= 0 && 0 == pCoffLevel[iPos]; --iPos) {
+  }  // zeros above the highest non-zero value are skipped, not counted
+  while (iPos >= 0) {
+    int32_t iRun = 0;
+    pLevel[iCount] = pCoffLevel[iPos];
+    for (--iPos; iPos >= 0 && 0 == pCoffLevel[iPos]; --iPos) {
+      ++iRun;
+    }
+    pRun[iCount++] = iRun;
+    iZeroSum += iRun;
+  }
+  *pTotalCoeff = iCount;
+  return iZeroSum;
+}
+
+void  WriteBlockResidualCavlc (int16_t* pCoffLevel, int32_t iEndIdx, int32_t iCalRunLevelFlag,
+                               int32_t iResidualProperty, int8_t iNC, SBitStringAux* pBs) {
+  ENFORCE_STACK_ALIGN_1D (int16_t, iLevel, 16, 16)
+  ENFORCE_STACK_ALIGN_1D (uint8_t, uiRun, 16, 16)
+  // Emits the CAVLC residual syntax for coefficients pCoffLevel[0..iEndIdx] through the CAVLC_BS_* macros (set up with pBs).
+  int32_t iTotalCoeffs = 0;
+  int32_t iTrailingOnes = 0;
+  int32_t iTotalZeros = 0, iZerosLeft = 0;
+  uint32_t uiSign = 0;
+  int32_t iLevelCode = 0, iLevelPrefix = 0, iLevelSuffix = 0, uiSuffixLength = 0, iLevelSuffixSize = 0;
+  int32_t iValue = 0, iThreshold, iZeroLeft;
+  int32_t n = 0;
+  int32_t i = 0;
+  // When iCalRunLevelFlag is zero no run/level data is gathered: iTotalCoeffs stays 0 and only the
+  // coeff token selected for zero coefficients is written before the function returns.
+  CAVLC_BS_INIT (pBs);
+
+  /* Step 1: gather levels, runs, coefficient count and total zeros; count up to 3 leading +/-1 levels as trailing ones */
+
+  if (iCalRunLevelFlag) {
+    int32_t iCount = 0;
+    iTotalZeros = sCoeffFunc.pfCavlcParamCal (pCoffLevel, uiRun, iLevel, &iTotalCoeffs, iEndIdx);
+    iCount = (iTotalCoeffs > 3) ? 3 : iTotalCoeffs;
+    for (i = 0; i < iCount ; i++) {
+      if (WELS_ABS (iLevel[i]) == 1) {
+        iTrailingOnes ++;
+        uiSign <<= 1;
+        if (iLevel[i] < 0)
+          uiSign |= 1;
+      } else {
+        break;
+
+      }
+    }
+  }
+  /* Step 3: coeff token, looked up by iNC class, coefficient count and trailing ones; [0] is the code, [1] is used as n */
+  const uint8_t* upCoeffToken = &g_kuiVlcCoeffToken[g_kuiEncNcMapTable[iNC]][iTotalCoeffs][iTrailingOnes][0];
+  iValue = upCoeffToken[0];
+  n = upCoeffToken[1];
+
+  if (iTotalCoeffs == 0) {
+    CAVLC_BS_WRITE (n, iValue);
+
+    CAVLC_BS_UNINIT (pBs);
+    return;
+  }
+
+  /* Step 4: */
+  /*  trailing ones: one sign bit each (1 = negative) is appended to the coeff token code */
+  n += iTrailingOnes;
+  iValue = (iValue << iTrailingOnes) + uiSign;
+  CAVLC_BS_WRITE (n, iValue);
+
+  /*  levels */
+  uiSuffixLength = (iTotalCoeffs > 10 && iTrailingOnes < 3) ? 1 : 0;
+  // the levels after the trailing ones are written as prefix + suffix with an adaptive suffix length
+  for (i = iTrailingOnes; i < iTotalCoeffs; i++) {
+    int32_t iVal = iLevel[i];
+    // signed level -> non-negative code: 2*v-2 for v > 0, -2*v-1 for v < 0 (uses the sign mask from >> 31)
+    iLevelCode = (iVal - 1) << 1;
+    uiSign = (iLevelCode >> 31);
+    iLevelCode = (iLevelCode ^ uiSign) + (uiSign << 1);
+    iLevelCode -= ((i == iTrailingOnes) && (iTrailingOnes < 3)) << 1;  // first level after < 3 trailing ones: code - 2
+
+    iLevelPrefix = iLevelCode >> uiSuffixLength;
+    iLevelSuffixSize = uiSuffixLength;
+    iLevelSuffix = iLevelCode - (iLevelPrefix << uiSuffixLength);
+    // escape forms: prefix 14 with a 4-bit suffix (suffix length 0 only), or prefix 15 with a 12-bit suffix
+    if (iLevelPrefix >= 14 && iLevelPrefix < 30 && uiSuffixLength == 0) {
+      iLevelPrefix = 14;
+      iLevelSuffix = iLevelCode - iLevelPrefix;
+      iLevelSuffixSize = 4;
+    } else if (iLevelPrefix >= 15) {
+      iLevelPrefix = 15;
+      iLevelSuffix = iLevelCode - (iLevelPrefix << uiSuffixLength);
+
+      if (uiSuffixLength == 0) {
+        iLevelSuffix -= 15;
+      }
+      iLevelSuffixSize = 12;
+    }
+
+    n = iLevelPrefix + 1 + iLevelSuffixSize;
+    iValue = ((1 << iLevelSuffixSize) | iLevelSuffix);
+    CAVLC_BS_WRITE (n, iValue);
+    // next suffix length: at least 1, plus 1 when |level| exceeds 3 << (length - 1) and length is below 6
+    uiSuffixLength += !uiSuffixLength;
+    iThreshold = 3 << (uiSuffixLength - 1);
+    uiSuffixLength += ((iVal > iThreshold) || (iVal < -iThreshold)) && (uiSuffixLength < 6);
+
+  }
+
+  /* Step 5: total zeros */
+  // written only when fewer than iEndIdx + 1 coefficients are non-zero; CHROMA_DC blocks use their own table
+  if (iTotalCoeffs < iEndIdx + 1) {
+    if (CHROMA_DC != iResidualProperty) {
+      const uint8_t* upTotalZeros = &g_kuiVlcTotalZeros[iTotalCoeffs][iTotalZeros][0];
+      n = upTotalZeros[1];
+      iValue = upTotalZeros[0];
+      CAVLC_BS_WRITE (n, iValue);
+    } else {
+      const uint8_t* upTotalZeros = &g_kuiVlcTotalZerosChromaDc[iTotalCoeffs][iTotalZeros][0];
+      n = upTotalZeros[1];
+      iValue = upTotalZeros[0];
+      CAVLC_BS_WRITE (n, iValue);
+    }
+  }
+
+  /* Step 6: run before each coefficient except the last one, stopping once no zeros are left */
+  iZerosLeft = iTotalZeros;
+  for (i = 0; i + 1 < iTotalCoeffs && iZerosLeft > 0; ++ i) {
+    const uint8_t uirun = uiRun[i];
+    iZeroLeft = g_kuiZeroLeftMap[iZerosLeft];
+    n = g_kuiVlcRunBefore[iZeroLeft][uirun][1];
+    iValue = g_kuiVlcRunBefore[iZeroLeft][uirun][0];
+    CAVLC_BS_WRITE (n, iValue);
+    iZerosLeft -= uirun;
+  }
+
+  CAVLC_BS_UNINIT (pBs);
+}
+
+
+void InitCoeffFunc (const uint32_t uiCpuFlag) {
+  // Selects the run/level scanner: the C routine, or on X86_ASM builds the SSE2 one when uiCpuFlag has WELS_CPU_SSE2.
+#ifdef X86_ASM
+  const bool kbUseSse2 = (uiCpuFlag & WELS_CPU_SSE2) != 0;
+  sCoeffFunc.pfCavlcParamCal = kbUseSse2 ? CavlcParamCal_sse2 : CavlcParamCal_c;
+#else
+  sCoeffFunc.pfCavlcParamCal = CavlcParamCal_c;
+#endif
+}
+
+} // namespace WelsSVCEnc
--- a/codec/encoder/core/src/slice_multi_threading.cpp
+++ b/codec/encoder/core/src/slice_multi_threading.cpp
@@ -1,1592 +1,1506 @@
-/*!
- * \copy
- *     Copyright (c)  2010-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	slice_multi_threading.h
- *
- * \brief	pSlice based multiple threading
- *
- * \date	04/16/2010 Created
- *
- *************************************************************************************
- */
-
-#if defined(MT_ENABLED)
-
-#include <assert.h>
-#ifdef __GNUC__
-#include <semaphore.h>
-#ifndef SEM_NAME_MAX
-// length of semaphore name should be system constrained at least on mac 10.7
-#define  SEM_NAME_MAX 32
-#endif//SEM_NAME_MAX
-#endif//__GNUC__
-#include "slice_multi_threading.h"
-#include "mt_defs.h"
-#include "nal_encap.h"
-#include "utils.h"
-#include "encoder.h"
-#include "svc_encode_slice.h"
-#include "deblocking.h"
-#include "svc_enc_golomb.h"
-#include "crt_util_safe_x.h"	// for safe crt like calls
-#include "rc.h"
-
-#if defined(X86_ASM)
-#include "cpu.h"
-#endif//X86_ASM
-
-#if defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG)
-#include "measure_time.h"
-#endif//DYNAMIC_SLICE_ASSIGN
-namespace WelsSVCEnc {
-void UpdateMbListNeighborParallel(	SSliceCtx *pSliceCtx,
-									  SMB *pMbList,
-									  const int32_t uiSliceIdc	)
-{
-	const uint8_t *kpMbMap			= pSliceCtx->pOverallMbMap;
-	const int32_t kiMbWidth			= pSliceCtx->iMbWidth;
-	int32_t iIdx						= pSliceCtx->pFirstMbInSlice[uiSliceIdc];
-	const int32_t kiEndMbInSlice	= iIdx + pSliceCtx->pCountMbNumInSlice[uiSliceIdc] - 1;
-	
-	do {
-		SMB *pMb							= &pMbList[iIdx];
-		uint32_t uiNeighborAvailFlag	= 0;
-		const int32_t kiMbXY				= pMb->iMbXY;
-		const int32_t kiMbX				= pMb->iMbX;
-		const int32_t kiMbY				= pMb->iMbY;
-		BOOL_T     bLeft;
-		BOOL_T     bTop;
-		BOOL_T     bLeftTop;
-		BOOL_T     bRightTop;		
-		int32_t   iLeftXY, iTopXY, iLeftTopXY, iRightTopXY;		
-
-		iLeftXY = kiMbXY - 1;
-		iTopXY = kiMbXY - kiMbWidth;
-		iLeftTopXY = iTopXY - 1;
-		iRightTopXY = iTopXY + 1;
-		
-		bLeft = (kiMbX > 0) && (uiSliceIdc == kpMbMap[iLeftXY]);
-		bTop = (kiMbY > 0) && (uiSliceIdc == kpMbMap[iTopXY]);
-		bLeftTop = (kiMbX > 0) && (kiMbY > 0) && (uiSliceIdc == kpMbMap[iLeftTopXY]);
-		bRightTop = (kiMbX < (kiMbWidth-1)) && (kiMbY > 0) && (uiSliceIdc == kpMbMap[iRightTopXY]);		
-		
-		if( bLeft ){
-			uiNeighborAvailFlag |= LEFT_MB_POS;
-		}		
-		if( bTop ){
-			uiNeighborAvailFlag |= TOP_MB_POS;
-		}
-		if( bLeftTop ){
-			uiNeighborAvailFlag |= TOPLEFT_MB_POS;
-		}		
-		if( bRightTop ){
-			uiNeighborAvailFlag |= TOPRIGHT_MB_POS;
-		}		
-		pMb->uiNeighborAvail	= (uint8_t)uiNeighborAvailFlag;
-		pMb->uiSliceIdc		= uiSliceIdc;
-
-		++ iIdx;
-	} while(iIdx <= kiEndMbInSlice);
-}
-
-void CalcSliceComplexRatio( void *pRatio, SSliceCtx *pSliceCtx, uint32_t *pSliceConsume )
-{
-	float *pRatioList			= (float *)pRatio;
-	float fAvI[MAX_SLICES_NUM];
-	float fSumAv				= .0f;
-	uint32_t *pSliceTime		= (uint32_t *)pSliceConsume;
-	int32_t *pCountMbInSlice	= (int32_t *)pSliceCtx->pCountMbNumInSlice;	
-	const int32_t kiSliceCount	= pSliceCtx->iSliceNumInFrame;
-	int32_t iSliceIdx			= 0;
-
-#if defined(X86_ASM)
-	WelsEmms();
-#endif //X86_ASM
-	
-	while ( iSliceIdx < kiSliceCount )
-	{
-		fAvI[iSliceIdx]	= 1.0f * pCountMbInSlice[iSliceIdx] / pSliceTime[iSliceIdx];
-#if defined(ENABLE_TRACE_MT)
-		WelsLog(NULL, WELS_LOG_DEBUG, "[MT] CalcSliceComplexRatio(), pSliceConsumeTime[%d]= %d us, slice_run= %d\n", iSliceIdx, pSliceTime[iSliceIdx], pCountMbInSlice[iSliceIdx]);
-#endif//ENABLE_TRACE_MT
-		fSumAv += fAvI[iSliceIdx];
-		
-		++ iSliceIdx;
-	}
-	while ( -- iSliceIdx >= 0 )
-	{
-		pRatioList[iSliceIdx] = fAvI[iSliceIdx] / fSumAv;
-	}
-}
-
-#if defined(MT_ENABLED) && defined(DYNAMIC_SLICE_ASSIGN) && defined(NOT_ABSOLUTE_BALANCING)
-int32_t NeedDynamicAdjust( void *pConsumeTime, const int32_t iSliceNum )
-{	
-#if !defined(USE_RMSE_SLICE_COMPLEXITY_RATIO_FOR_BALANCING)
-	const float fRatioLower	= TOLERANT_BALANCING_RATIO_LOWER( uiSliceNum );
-	const float fRatioUpper	= TOLERANT_BALANCING_RATIO_UPPER( uiSliceNum );
-#endif//USE_RMSE_SLICE_COMPLEXITY_RATIO_FOR_BALANCING
-	uint32_t *pSliceConsume	= (uint32_t *)pConsumeTime;
-	uint32_t uiTotalConsume	= 0;
-	int32_t iSliceIdx		= 0;
-	int32_t iNeedAdj		= false;
-
-#if defined(X86_ASM)
-	WelsEmms();
-#endif //X86_ASM
-	
-	while( iSliceIdx < iSliceNum )
-	{
-		uiTotalConsume += pSliceConsume[iSliceIdx] + pSliceConsume[1+iSliceIdx];
-		iSliceIdx += 2;
-	}
-	if (uiTotalConsume == 0)
-	{
-#if defined(ENABLE_TRACE_MT)
-		WelsLog( NULL, WELS_LOG_DEBUG, "[MT] NeedDynamicAdjust(), herein do no adjust due first picture, iCountSliceNum= %d\n", iSliceNum );
-#endif//ENABLE_TRACE_MT
-		return false;
-	}
-
-	iSliceIdx = 0;
-#if defined(USE_RMSE_SLICE_COMPLEXITY_RATIO_FOR_BALANCING)
-	float fThr				= EPSN;	// threshold for various cores cases
-	float fRmse				= .0f;	// root mean square error of pSlice consume ratios
-	const float kfMeanRatio	= 1.0f / iSliceNum;
-	do{
-		const float fRatio = 1.0f * pSliceConsume[iSliceIdx] / uiTotalConsume;
-		const float fDiffRatio = fRatio - kfMeanRatio;
-		fRmse += (fDiffRatio * fDiffRatio);		
-		++ iSliceIdx;
-	} while ( iSliceIdx+1 < iSliceNum );	
-	fRmse = sqrtf(fRmse/iSliceNum);
-	if ( iSliceNum >= 8 )
-	{
-		fThr += THRESHOLD_RMSE_CORE8;		
-	}
-	else if ( iSliceNum >= 4 )
-	{
-		fThr += THRESHOLD_RMSE_CORE4;
-	}
-	else if ( iSliceNum >= 2 )
-	{
-		fThr += THRESHOLD_RMSE_CORE2;
-	}
-	else
-		fThr = 1.0f;
-	if ( fRmse > fThr )
-		iNeedAdj	= true;
-#if defined(ENABLE_TRACE_MT)
-	WelsLog(NULL, WELS_LOG_DEBUG, "[MT] NeedDynamicAdjust(), herein adjustment decision is made (iNeedAdj= %d) by: fRmse of pSlice complexity ratios %.6f, the corresponding threshold %.6f, iCountSliceNum %d\n",
-		iNeedAdj, fRmse, fThr, iSliceNum);
-#endif//ENABLE_TRACE_MT
-#else
-	do{
-		const float kfRatio = 1.0f * pSliceConsume[uiSliceIdx] / uiTotalConsume;
-		if ( kfRatio+EPSN < fRatioLower || kfRatio > ratio_upper+EPSN )
-		{
-#if defined(ENABLE_TRACE_MT)
-			WelsLog(NULL, WELS_LOG_DEBUG, "[MT] NeedDynamicAdjust(), herein adjustment decision is made by pSlice consume time not balanced at all, uiSliceIdx= %d, comp_ratio= %.6f, pSliceConsumeTime= %d, total_consume_time= %d, iCountSliceNum= %d\n",
-				uiSliceIdx, kfRatio, pSliceConsume[uiSliceIdx], uiTotalConsume, uiSliceNum);
-#endif//ENABLE_TRACE_MT
-			iNeedAdj = true;
-			break;
-		}
-		++ uiSliceIdx;
-	} while ( uiSliceIdx+1 < uiSliceNum );
-#endif//USE_RMSE_SLICE_COMPLEXITY_RATIO_FOR_BALANCING
-
-	return iNeedAdj;
-}
-#endif//..
-
-#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
-void DynamicAdjustSlicing(	sWelsEncCtx *pCtx,
-								SDqLayer *pCurDqLayer,
-								void *pComplexRatio,
-								int32_t iCurDid )
-{	
-	SSliceCtx *pSliceCtx	= pCurDqLayer->pSliceEncCtx;
-	const int32_t kiCountSliceNum	= pSliceCtx->iSliceNumInFrame;	
-	const int32_t kiCountNumMb		= pSliceCtx->iMbNumInFrame;
-	int32_t iMinimalMbNum			= pSliceCtx->iMbWidth;	// in theory we need only 1 SMB, here let it as one SMB row required
-	int32_t iMaximalMbNum			= 0;	// dynamically assign later
-	float *pSliceComplexRatio	= (float *)pComplexRatio;	
-	int32_t iMbNumLeft					= kiCountNumMb;
-	int32_t iRunLen[MAX_THREADS_NUM]	= {0};
-	int32_t iSliceIdx					= 0;
-
-	int32_t iNumMbInEachGom;
-	SWelsSvcRc *pWelsSvcRc = &pCtx->pWelsSvcRc[iCurDid];
-	if(pCtx->pSvcParam->bEnableRc)
-	{
-		iNumMbInEachGom = pWelsSvcRc->iNumberMbGom;
-
-		if ( iNumMbInEachGom <= 0 )
-		{
-			WelsLog(pCtx, WELS_LOG_ERROR, "[MT] DynamicAdjustSlicing(), invalid iNumMbInEachGom= %d from RC, iDid= %d, iCountNumMb= %d\n", iNumMbInEachGom, iCurDid, kiCountNumMb);
-			return;
-		}
-
-		// do not adjust in case no extra iNumMbInEachGom based left for slicing adjustment,
-		// extra MB of non integrated GOM assigned at the last pSlice in default, keep up on early initial result.
-		if ( iNumMbInEachGom * kiCountSliceNum >= kiCountNumMb )
-		{
-			return;
-		}		
-		iMinimalMbNum	= iNumMbInEachGom;		
-	}
-	
-	if ( kiCountSliceNum < 2 || (kiCountSliceNum & 0x01) )	// we need suppose uiSliceNum is even for multiple threading
-		return;
-
-	iMaximalMbNum	= kiCountNumMb - (kiCountSliceNum - 1) * iMinimalMbNum;
-
-#if defined(X86_ASM)
-	WelsEmms();
-#endif //X86_ASM
-	
-#if defined(ENABLE_TRACE_MT)
-	WelsLog(pCtx, WELS_LOG_DEBUG, "[MT] DynamicAdjustSlicing(), iDid= %d, iCountNumMb= %d\n", iCurDid, kiCountNumMb);
-#endif//ENABLE_TRACE_MT
-
-	iSliceIdx	= 0;
-	while (iSliceIdx+1 < kiCountSliceNum) {
-		int32_t iNumMbAssigning = (int32_t)(kiCountNumMb * pSliceComplexRatio[iSliceIdx] + EPSN);			
-
-		// GOM boundary aligned
-		if(pCtx->pSvcParam->bEnableRc)
-		{
-			iNumMbAssigning=(int32_t)(1.0f * iNumMbAssigning / iNumMbInEachGom + 0.5f + EPSN) * iNumMbInEachGom;			
-		}
-
-		// make sure one GOM at least in each pSlice for safe
-		if ( iNumMbAssigning < iMinimalMbNum )
-			iNumMbAssigning	= iMinimalMbNum;
-		else if ( iNumMbAssigning > iMaximalMbNum )
-			iNumMbAssigning	= iMaximalMbNum;
-
-		assert( iNumMbAssigning > 0 );
-
-		iMbNumLeft -= iNumMbAssigning;
-		if ( iMbNumLeft <= 0 )	// error due to we can not support slice_skip now yet, do not adjust this time
-		{
-			assert( 0 );
-			return;
-		}
-		iRunLen[iSliceIdx]	= iNumMbAssigning;
-#if defined(ENABLE_TRACE_MT)
-		WelsLog(pCtx, WELS_LOG_DEBUG, "[MT] DynamicAdjustSlicing(), uiSliceIdx= %d, pSliceComplexRatio= %.2f, slice_run_org= %d, slice_run_adj= %d\n", 
-			iSliceIdx, pSliceComplexRatio[iSliceIdx], pSliceCtx->pCountMbNumInSlice[iSliceIdx], iNumMbAssigning);
-#endif//ENABLE_TRACE_MT
-		++ iSliceIdx;
-		iMaximalMbNum	= iMbNumLeft - (kiCountSliceNum - iSliceIdx - 1) * iMinimalMbNum;	// get maximal num_mb in left parts
-	}
-	iRunLen[iSliceIdx] = iMbNumLeft;
-#if defined(ENABLE_TRACE_MT)
-	WelsLog(pCtx, WELS_LOG_DEBUG, "[MT] DynamicAdjustSlicing(), iSliceIdx= %d, pSliceComplexRatio= %.2f, slice_run_org= %d, slice_run_adj= %d\n", 
-		iSliceIdx, pSliceComplexRatio[iSliceIdx], pSliceCtx->pCountMbNumInSlice[iSliceIdx], iMbNumLeft);
-#endif//ENABLE_TRACE_MT
-
-
-	if ( DynamicAdjustSlicePEncCtxAll( pSliceCtx, iRunLen ) == 0 )
-	{
-		const int32_t kiThreadNum	= pCtx->pSvcParam->iCountThreadsNum;
-		int32_t iThreadIdx			= 0;
-		do {
-#ifdef WIN32
-			WelsEventSignal( &pCtx->pSliceThreading->pUpdateMbListEvent[iThreadIdx] );
-#else
-			WelsEventSignal( pCtx->pSliceThreading->pUpdateMbListEvent[iThreadIdx] );
-#endif//WIN32
-			++ iThreadIdx;
-		} while(iThreadIdx < kiThreadNum);
-
-		WelsMultipleEventsWaitAllBlocking( kiThreadNum, &pCtx->pSliceThreading->pFinUpdateMbListEvent[0] );
-	}
-}
-#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
-
-#ifdef PACKING_ONE_SLICE_PER_LAYER
-void ResetEnvMt( sWelsEncCtx *pCtx)
-{
-	const int16_t kiSliceCount = pCtx->iMaxSliceCount;
-	int32_t iIdx = 0;
-
-	while ( iIdx < kiSliceCount )
-	{
-		SWelsSliceBs *pSliceBs	= &pCtx->pSliceBs[iIdx];
-		pSliceBs->uiBsPos		= 0;	
-		++ iIdx;
-	}
-}
-#endif//PACKING_ONE_SLICE_PER_LAYER
-
-int32_t RequestMtResource( sWelsEncCtx **ppCtx, SWelsSvcCodingParam *pCodingParam, const int32_t iCountBsLen, const int32_t iTargetSpatialBsSize )
-{	
-	CMemoryAlign *pMa			= NULL;
-	SWelsSvcCodingParam *pPara= NULL;
-	SSliceThreading *pSmt		= NULL;
-	SWelsSliceBs *pSliceB		= NULL;
-	uint8_t *pBsBase			= NULL;
-	int32_t iNumSpatialLayers	= 0;
-	int32_t iThreadNum			= 0;	
-	int32_t iIdx					= 0;
-	int32_t iSliceBsBufferSize= 0;
-	int16_t iMaxSliceNum		= 1;
-	
-	if ( NULL == ppCtx || NULL == pCodingParam || NULL == *ppCtx || iCountBsLen <= 0 )
-		return 1;
-	
-	pMa	= (*ppCtx)->pMemAlign;
-	pPara= pCodingParam;
-	iNumSpatialLayers	= pPara->iNumDependencyLayer;
-	iThreadNum	= pPara->iCountThreadsNum;
-	iMaxSliceNum = (*ppCtx)->iMaxSliceCount;
-
-	pSmt	= (SSliceThreading *)pMa->WelsMalloc(sizeof(SSliceThreading), "SSliceThreading");
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pSmt), FreeMemorySvc(ppCtx) )
-	(*ppCtx)->pSliceThreading	= pSmt;
-	pSmt->pThreadPEncCtx	= (SSliceThreadPrivateData *)pMa->WelsMalloc( sizeof(SSliceThreadPrivateData) * iThreadNum, "pThreadPEncCtx" );
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pSmt->pThreadPEncCtx), FreeMemorySvc(ppCtx) )
-	pSmt->pThreadHandles	= (WELS_THREAD_HANDLE *)pMa->WelsMalloc( sizeof(WELS_THREAD_HANDLE) * iThreadNum, "pThreadHandles" );
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pSmt->pThreadHandles), FreeMemorySvc(ppCtx) )
-
-#ifdef WIN32
-	pSmt->pSliceCodedEvent	= (WELS_EVENT *)pMa->WelsMalloc( sizeof(WELS_EVENT) * iThreadNum, "pSliceCodedEvent" );
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pSmt->pSliceCodedEvent), FreeMemorySvc(ppCtx) )
-	pSmt->pReadySliceCodingEvent	= (WELS_EVENT *)pMa->WelsMalloc( sizeof(WELS_EVENT) * iThreadNum, "pReadySliceCodingEvent" );
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pSmt->pReadySliceCodingEvent), FreeMemorySvc(ppCtx) )
-	pSmt->pFinSliceCodingEvent	= (WELS_EVENT *)pMa->WelsMalloc( sizeof(WELS_EVENT) * iThreadNum, "pFinSliceCodingEvent" );
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pSmt->pFinSliceCodingEvent), FreeMemorySvc(ppCtx) )
-#endif//WIN32
-
-#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
-#if defined(__GNUC__)
-	pSmt->pUpdateMbListThrdHandles	= (WELS_THREAD_HANDLE *)pMa->WelsMalloc( sizeof(WELS_THREAD_HANDLE) * iThreadNum, "pUpdateMbListThrdHandles" );
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pSmt->pUpdateMbListThrdHandles), FreeMemorySvc(ppCtx) )	
-#endif//__GNUC__
-#ifdef WIN32
-	pSmt->pUpdateMbListEvent	= (WELS_EVENT *)pMa->WelsMalloc( sizeof(WELS_EVENT) * iThreadNum, "pUpdateMbListEvent" );
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pSmt->pUpdateMbListEvent), FreeMemorySvc(ppCtx) )
-	pSmt->pFinUpdateMbListEvent	= (WELS_EVENT *)pMa->WelsMalloc( sizeof(WELS_EVENT) * iThreadNum, "pFinUpdateMbListEvent" );
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pSmt->pFinUpdateMbListEvent), FreeMemorySvc(ppCtx) )
-#endif//WIN32
-#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
-
-#ifdef WIN32
-	pSmt->pExitEncodeEvent	= (WELS_EVENT *)pMa->WelsMalloc( sizeof(WELS_EVENT) * iThreadNum, "pExitEncodeEvent" );
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pSmt->pExitEncodeEvent), FreeMemorySvc(ppCtx) )
-#endif//WIN32
-
-#if defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG)
-	iIdx = 0;
-	while ( iIdx < iNumSpatialLayers )
-	{		
-		SMulSliceOption *pMso	= &pPara->sDependencyLayers[iIdx].sMso;
-		const int32_t kiSliceNum= pMso->sSliceArgument.iSliceNum;
-		if (pMso->uiSliceMode == SM_FIXEDSLCNUM_SLICE && pPara->iMultipleThreadIdc > 1 && pPara->iMultipleThreadIdc >= kiSliceNum )
-		{
-			pSmt->pSliceConsumeTime[iIdx]	= (uint32_t *)pMa->WelsMallocz( kiSliceNum * sizeof(uint32_t), "pSliceConsumeTime[]" );
-			WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pSmt->pSliceConsumeTime[iIdx]), FreeMemorySvc(ppCtx) )
-#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
-			pSmt->pSliceComplexRatio[iIdx]	= (float *)pMa->WelsMalloc( kiSliceNum * sizeof(float), "pSliceComplexRatio[]" );
-			WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pSmt->pSliceComplexRatio[iIdx]), FreeMemorySvc(ppCtx) )
-#endif//TRY_SLICING_BALANCE
-		}
-		else
-		{
-			pSmt->pSliceConsumeTime[iIdx]	= NULL;
-#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
-			pSmt->pSliceComplexRatio[iIdx]	= NULL;
-#endif//TRY_SLICING_BALANCE
-		}		
-		++ iIdx;
-	}
-	// NULL for pSliceConsumeTime[iIdx]: iIdx from iNumSpatialLayers to MAX_DEPENDENCY_LAYERS	
-#endif//#if defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG)
-	
-#ifdef MT_DEBUG
-	// file handle for MT debug
-	pSmt->pFSliceDiff = NULL;
-
-	if ( pSmt->pFSliceDiff )
-	{
-		fclose( pSmt->pFSliceDiff );
-		pSmt->pFSliceDiff = NULL;
-	}
-#ifdef WIN32
-	pSmt->pFSliceDiff	= fopen(".\\slice_time.txt", "wt+" );
-#else
-	pSmt->pFSliceDiff	= fopen("/tmp/slice_time.txt", "wt+" );
-#endif//WIN32
-#endif//MT_DEBUG
-	
-#if defined(ENABLE_TRACE_MT)
-	WelsLog((*ppCtx), WELS_LOG_INFO, "encpEncCtx= 0x%p\n", (void *)(*ppCtx));
-#endif//ENABLE_TRACE_MT
-
-	iIdx = 0;
-	while ( iIdx < iThreadNum )
-	{
-#ifdef __GNUC__	// for posix threading
-		str_t name[SEM_NAME_MAX] = {0};
-		int32_t used_len = 0;
-		WELS_THREAD_ERROR_CODE err = 0;
-#endif//__GNUC__
-		pSmt->pThreadPEncCtx[iIdx].pWelsPEncCtx	= (void *)(*ppCtx);
-		pSmt->pThreadPEncCtx[iIdx].iSliceIndex	= iIdx;
-		pSmt->pThreadPEncCtx[iIdx].iThreadIndex	= iIdx;
-		pSmt->pThreadHandles[iIdx]				= 0;
-
-#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
-#ifdef WIN32
-		WelsEventInit( &pSmt->pUpdateMbListEvent[iIdx] );
-		WelsEventInit( &pSmt->pFinUpdateMbListEvent[iIdx] );		
-#else
-		// length of semaphore name should be system constrained at least on mac 10.7
-		SNPRINTF( name, SEM_NAME_MAX, "ud%d%p", iIdx, (void *)(*ppCtx) );
-		err = WelsEventOpen( &pSmt->pUpdateMbListEvent[iIdx], name );
-#if defined(ENABLE_TRACE_MT)
-		WelsLog((*ppCtx), WELS_LOG_INFO, "[MT] Open pUpdateMbListEvent%d named(%s) ret%d err%d\n", iIdx, name, err, errno);
-#endif
-		used_len = SNPRINTF( name, SEM_NAME_MAX, "fu%d%p", iIdx, (void *)(*ppCtx) );
-		name[used_len] = '\0';
-		err = WelsEventOpen( &pSmt->pFinUpdateMbListEvent[iIdx], name );
-#if defined(ENABLE_TRACE_MT)
-		WelsLog((*ppCtx), WELS_LOG_INFO, "[MT] Open pFinUpdateMbListEvent%d named(%s) ret%d err%d\n", iIdx, name, err, errno);
-#endif
-#endif//WIN32
-#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
-		
-#ifdef WIN32
-		WelsEventInit( &pSmt->pSliceCodedEvent[iIdx] );
-		WelsEventInit( &pSmt->pReadySliceCodingEvent[iIdx] );
-		WelsEventInit( &pSmt->pFinSliceCodingEvent[iIdx] );
-		WelsEventInit( &pSmt->pExitEncodeEvent[iIdx] );
-#else
-		used_len = SNPRINTF( name, SEM_NAME_MAX, "sc%d%p", iIdx, (void *)(*ppCtx) );
-		name[used_len] = '\0';
-		err = WelsEventOpen( &pSmt->pSliceCodedEvent[iIdx], name );
-#if defined(ENABLE_TRACE_MT)
-		WelsLog((*ppCtx), WELS_LOG_INFO, "[MT] Open pSliceCodedEvent%d named(%s) ret%d err%d\n", iIdx, name, err, errno);
-#endif
-		used_len = SNPRINTF( name, SEM_NAME_MAX, "rc%d%p", iIdx, (void *)(*ppCtx) );
-		name[used_len] = '\0';
-		err = WelsEventOpen( &pSmt->pReadySliceCodingEvent[iIdx], name );		
-#if defined(ENABLE_TRACE_MT)
-		WelsLog((*ppCtx), WELS_LOG_INFO, "[MT] Open pReadySliceCodingEvent%d = 0x%p named(%s) ret%d err%d\n", iIdx, (void *)pSmt->pReadySliceCodingEvent[iIdx]), (void *)(*ppCtx), err, errno);
-#endif
-#endif//WIN32
-
-		++ iIdx;
-	}
-
-#ifdef PACKING_ONE_SLICE_PER_LAYER
-	pSmt->pCountBsSizeInPartition	= (uint32_t *)pMa->WelsMalloc( sizeof(uint32_t) * iThreadNum, "pCountBsSizeInPartition" );
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pSmt->pCountBsSizeInPartition), FreeMemorySvc(ppCtx) )
-#endif//PACKING_ONE_SLICE_PER_LAYER	
-
-	WelsMutexInit( &pSmt->mutexSliceNumUpdate );
-	
-	(*ppCtx)->pSliceBs	= (SWelsSliceBs *)pMa->WelsMalloc( sizeof(SWelsSliceBs) * iMaxSliceNum, "pSliceBs" );
-	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pSliceBs), FreeMemorySvc(ppCtx) )
-	
-	pBsBase		= (*ppCtx)->pFrameBs + iCountBsLen;
-	pSliceB	= (*ppCtx)->pSliceBs;
-	iSliceBsBufferSize	= iTargetSpatialBsSize;
-	iIdx = 0;
-	while ( iIdx < iMaxSliceNum )
-	{
-		pSliceB->pBsBuffer	= (uint8_t *)pMa->WelsMalloc( iSliceBsBufferSize, "pSliceB->pBsBuffer" );
-
-		WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pSliceB->pBsBuffer), FreeMemorySvc(ppCtx) )
-		pSliceB->uiSize	= iSliceBsBufferSize;
-		
-		if ( iIdx > 0 )
-		{
-			pSliceB->pBs		= pBsBase;
-			pSliceB->uiBsPos	= 0;
-			pBsBase				+= iSliceBsBufferSize;
-		}
-		else
-		{
-			pSliceB->pBs		= NULL;
-			pSliceB->uiBsPos	= 0;
-		}
-		++ pSliceB;
-		++ iIdx;
-	}
-
-#if defined(ENABLE_TRACE_MT)
-	WelsLog((*ppCtx), WELS_LOG_INFO, "RequestMtResource(), iThreadNum=%d, iCountSliceNum= %d\n", pPara->iCountThreadsNum, iMaxSliceNum);
-#endif
-	
-	return 0;
-}
-
-void ReleaseMtResource( sWelsEncCtx **ppCtx )
-{
-	SWelsSliceBs *pSliceB			= NULL;
-	SWelsSvcCodingParam *pCodingParam	= NULL;
-	SSliceThreading *pSmt			= NULL;
-	CMemoryAlign *pMa				= NULL;	
-	int32_t iIdx						= 0;
-	int32_t iThreadNum				= 0;
-	int16_t uiSliceNum				= 0;
-
-	if ( NULL == ppCtx || NULL == *ppCtx )
-		return;
-
-	pMa			= (*ppCtx)->pMemAlign;
-	pCodingParam		= (*ppCtx)->pSvcParam;
-	uiSliceNum	= (*ppCtx)->iMaxSliceCount;
-	iThreadNum	= (*ppCtx)->pSvcParam->iCountThreadsNum;
-	pSmt		= (*ppCtx)->pSliceThreading;
-
-	if ( NULL == pSmt )
-		return;
-
-	while ( iIdx < iThreadNum) {
-#ifdef WIN32
-		if ( pSmt->pThreadHandles != NULL && pSmt->pThreadHandles[iIdx] != NULL )
-			WelsThreadDestroy( &pSmt->pThreadHandles[iIdx] );
-
-		if ( pSmt->pSliceCodedEvent != NULL )
-			WelsEventDestroy( &pSmt->pSliceCodedEvent[iIdx] );
-		if ( pSmt->pReadySliceCodingEvent != NULL )
-			WelsEventDestroy( &pSmt->pReadySliceCodingEvent[iIdx] );
-		if ( pSmt->pFinSliceCodingEvent != NULL )
-			WelsEventDestroy( &pSmt->pFinSliceCodingEvent[iIdx] );
-		if ( pSmt->pExitEncodeEvent != NULL )
-			WelsEventDestroy( &pSmt->pExitEncodeEvent[iIdx] );
-#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
-		if ( pSmt->pUpdateMbListEvent != NULL )
-			WelsEventDestroy( &pSmt->pUpdateMbListEvent[iIdx] );
-		if ( pSmt->pFinUpdateMbListEvent != NULL )
-			WelsEventDestroy( &pSmt->pFinUpdateMbListEvent[iIdx] );
-#endif//DYNAMIC_SLICE_ASSIGN && TRY_SLICING_BALANCE
-#else	
-		str_t ename[SEM_NAME_MAX] = {0};
-		int32_t used_len = 0;
-		// length of semaphore name should be system constrained at least on mac 10.7
-		SNPRINTF( ename, SEM_NAME_MAX, "sc%d%p", iIdx, (void *)(*ppCtx) );
-		WelsEventClose( pSmt->pSliceCodedEvent[iIdx], ename );
-		used_len = SNPRINTF( ename, SEM_NAME_MAX, "rc%d%p", iIdx, (void *)(*ppCtx) );
-		ename[used_len] = '\0';
-		WelsEventClose( pSmt->pReadySliceCodingEvent[iIdx], ename );
-#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
-		used_len = SNPRINTF( ename, SEM_NAME_MAX, "ud%d%p", iIdx, (void *)(*ppCtx) );
-		ename[used_len] = '\0';
-		WelsEventClose( pSmt->pUpdateMbListEvent[iIdx], ename );
-		used_len = SNPRINTF( ename, SEM_NAME_MAX, "fu%d%p", iIdx, (void *)(*ppCtx) );
-		ename[used_len] = '\0';
-		WelsEventClose( pSmt->pFinUpdateMbListEvent[iIdx], ename );
-#endif//DYNAMIC_SLICE_ASSIGN && TRY_SLICING_BALANCE
-#endif//WIN32		
-
-		++ iIdx;
-	}
-
-#ifdef WIN32
-	if ( pSmt->pExitEncodeEvent != NULL )
-	{
-		pMa->WelsFree( pSmt->pExitEncodeEvent, "pExitEncodeEvent" );
-		pSmt->pExitEncodeEvent = NULL;
-	}
-	if ( pSmt->pSliceCodedEvent != NULL )
-	{
-		pMa->WelsFree( pSmt->pSliceCodedEvent, "pSliceCodedEvent" );
-		pSmt->pSliceCodedEvent = NULL;
-	}
-	if ( pSmt->pReadySliceCodingEvent != NULL )
-	{
-		pMa->WelsFree( pSmt->pReadySliceCodingEvent, "pReadySliceCodingEvent" );
-		pSmt->pReadySliceCodingEvent = NULL;
-	}
-	if ( pSmt->pFinSliceCodingEvent != NULL )
-	{
-		pMa->WelsFree( pSmt->pFinSliceCodingEvent, "pFinSliceCodingEvent" );
-		pSmt->pFinSliceCodingEvent = NULL;
-	}
-#endif//WIN32
-
-#ifdef PACKING_ONE_SLICE_PER_LAYER
-	if ( NULL != pSmt->pCountBsSizeInPartition )
-	{
-		pMa->WelsFree( pSmt->pCountBsSizeInPartition, "pCountBsSizeInPartition" );
-		pSmt->pCountBsSizeInPartition = NULL;
-	}
-#endif//PACKING_ONE_SLICE_PER_LAYER
-	WelsMutexDestroy( &pSmt->mutexSliceNumUpdate );
-
-	if ( pSmt->pThreadPEncCtx != NULL )
-	{
-		pMa->WelsFree( pSmt->pThreadPEncCtx, "pThreadPEncCtx" );
-		pSmt->pThreadPEncCtx = NULL;
-	}
-	if ( pSmt->pThreadHandles != NULL )
-	{
-		pMa->WelsFree( pSmt->pThreadHandles, "pThreadHandles" );
-		pSmt->pThreadHandles = NULL;
-	}
-	
-	pSliceB = (*ppCtx)->pSliceBs;
-	iIdx = 0;
-	while ( pSliceB != NULL && iIdx < uiSliceNum )
-	{		
-		if ( pSliceB->pBsBuffer )
-		{
-			pMa->WelsFree( pSliceB->pBsBuffer, "pSliceB->pBsBuffer" );
-			pSliceB->pBsBuffer = NULL;
-			pSliceB->uiSize = 0;
-		}		
-		++ iIdx;
-		++ pSliceB;
-	}
-	if ( (*ppCtx)->pSliceBs != NULL )
-	{
-		pMa->WelsFree( (*ppCtx)->pSliceBs, "pSliceBs" );
-		(*ppCtx)->pSliceBs = NULL;
-	}
-#if defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG)
-	if ( pSmt->pSliceConsumeTime != NULL )
-	{
-		iIdx = 0;
-		while (iIdx < pCodingParam->iNumDependencyLayer)
-		{
-			if ( pSmt->pSliceConsumeTime[iIdx] )
-			{
-				pMa->WelsFree( pSmt->pSliceConsumeTime[iIdx], "pSliceConsumeTime[]" );
-				pSmt->pSliceConsumeTime[iIdx] = NULL;
-			}
-#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
-			if ( pSmt->pSliceComplexRatio[iIdx] != NULL )
-			{
-				pMa->WelsFree( pSmt->pSliceComplexRatio[iIdx], "pSliceComplexRatio[]" );
-				pSmt->pSliceComplexRatio[iIdx] = NULL;
-			}
-#endif//TRY_SLICING_BALANCE
-			++ iIdx;
-		}		
-	}
-#endif//#if defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG)
-
-#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)	
-
-#ifdef WIN32
-	if ( pSmt->pUpdateMbListEvent != NULL )
-	{
-		pMa->WelsFree( pSmt->pUpdateMbListEvent, "pUpdateMbListEvent" );
-		pSmt->pUpdateMbListEvent = NULL;
-	}
-	if ( pSmt->pFinUpdateMbListEvent != NULL )
-	{
-		pMa->WelsFree( pSmt->pFinUpdateMbListEvent, "pFinUpdateMbListEvent" );
-		pSmt->pFinUpdateMbListEvent = NULL;
-	}
-#else
-	if ( pSmt->pUpdateMbListThrdHandles )
-	{
-		pMa->WelsFree( pSmt->pUpdateMbListThrdHandles, "pUpdateMbListThrdHandles" );
-		pSmt->pUpdateMbListThrdHandles = NULL;
-	}
-#endif//WIN32
-
-#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
-
-#ifdef MT_DEBUG
-	// file handle for debug
-	if ( pSmt->pFSliceDiff )
-	{
-		fclose( pSmt->pFSliceDiff );
-		pSmt->pFSliceDiff = NULL;
-	}
-#endif//MT_DEBUG
-	pMa->WelsFree((*ppCtx)->pSliceThreading, "SSliceThreading");
-	(*ppCtx)->pSliceThreading = NULL;
-}
-
-int32_t AppendSliceToFrameBs( sWelsEncCtx *pCtx, SLayerBSInfo *pLbi, const int32_t iSliceCount )
-{	
-	SWelsSvcCodingParam *pCodingParam	= pCtx->pSvcParam;
-	SDLayerParam *pDlp				= &pCodingParam->sDependencyLayers[pCtx->uiDependencyId];
-	SWelsSliceBs *pSliceBs			= NULL;
-	const BOOL_T kbIsDynamicSlicingMode	= (pDlp->sMso.uiSliceMode == SM_DYN_SLICE);
-	int32_t iLayerSize					= 0;
-	int32_t iNalIdxBase				= pLbi->iNalCount;
-	int32_t iSliceIdx					= 0;	
-
-	if ( !kbIsDynamicSlicingMode )
-	{
-		pSliceBs	= &pCtx->pSliceBs[0];
-		iLayerSize	= pSliceBs->uiBsPos;	// assign with base pSlice first			
-		iSliceIdx	= 1;				// pSlice 0 bs has been written to pFrameBs yet by now, so uiSliceIdx base should be 1
-		while (iSliceIdx < iSliceCount)
-		{
-			++ pSliceBs;
-			if ( pSliceBs != NULL && pSliceBs->uiBsPos > 0 )
-			{
-				int32_t iNalIdx = 0;
-				const int32_t iCountNal	= pSliceBs->iNalIndex;
-
-#if MT_DEBUG_BS_WR
-				assert(pSliceBs->bSliceCodedFlag);
-#endif//MT_DEBUG_BS_WR
-
-				memmove(pCtx->pFrameBs + pCtx->iPosBsBuffer, pSliceBs->pBs, pSliceBs->uiBsPos);	// confirmed_safe_unsafe_usage
-				pCtx->iPosBsBuffer += pSliceBs->uiBsPos;
-
-				iLayerSize += pSliceBs->uiBsPos;
-
-				while (iNalIdx < iCountNal)
-				{
-					pLbi->iNalLengthInByte[iNalIdxBase+iNalIdx]	= pSliceBs->iNalLen[iNalIdx];
-					++ iNalIdx;
-				}
-				pLbi->iNalCount	+= iCountNal;
-				iNalIdxBase	+= iCountNal;
-			}		
-			++ iSliceIdx;		
-		}
-	}
-	else	// for SM_DYN_SLICE
-	{
-		const int32_t kiPartitionCnt	= iSliceCount;
-		int32_t iPartitionIdx		= 0;
-		
-		// due partition_0 has been written to pFrameBsBuffer
-		// so iLayerSize need add it
-		while ( iPartitionIdx < kiPartitionCnt )
-		{
-			const int32_t kiCountSlicesCoded = pCtx->pCurDqLayer->pNumSliceCodedOfPartition[iPartitionIdx];
-			int32_t iIdx = 0;
-
-			iSliceIdx	= iPartitionIdx;
-			while(iIdx < kiCountSlicesCoded)
-			{
-				pSliceBs	= &pCtx->pSliceBs[iSliceIdx];
-				if ( pSliceBs != NULL && pSliceBs->uiBsPos > 0 )
-				{
-					if ( iPartitionIdx > 0 )
-					{
-						int32_t iNalIdx = 0;
-						const int32_t iCountNal	= pSliceBs->iNalIndex;
-
-						memmove(pCtx->pFrameBs + pCtx->iPosBsBuffer, pSliceBs->pBs, pSliceBs->uiBsPos);	// confirmed_safe_unsafe_usage
-						pCtx->iPosBsBuffer += pSliceBs->uiBsPos;
-
-						iLayerSize += pSliceBs->uiBsPos;
-
-						while (iNalIdx < iCountNal)
-						{
-							pLbi->iNalLengthInByte[iNalIdxBase+iNalIdx]	= pSliceBs->iNalLen[iNalIdx];
-							++ iNalIdx;
-						}
-						pLbi->iNalCount	+= iCountNal;
-						iNalIdxBase	+= iCountNal;
-					}
-					else
-					{
-						iLayerSize	+= pSliceBs->uiBsPos;
-					}
-				}
-
-				iSliceIdx += kiPartitionCnt;
-				++ iIdx;
-			}			
-			++ iPartitionIdx;
-		}
-	}
-
-	return iLayerSize;
-}
-
-int32_t WriteSliceToFrameBs( sWelsEncCtx *pCtx, SLayerBSInfo *pLbi, uint8_t *pFrameBsBuffer, const int32_t iSliceIdx )
-{
-	SWelsSliceBs *pSliceBs			= &pCtx->pSliceBs[iSliceIdx];
-	SNalUnitHeaderExt *pNalHdrExt= &pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt;	
-	uint8_t *pDst					= pFrameBsBuffer;
-	int32_t pNalLen[2];
-	int32_t iSliceSize				= 0;	
-	const int32_t kiNalCnt			= pSliceBs->iNalIndex;	
-	int32_t iNalIdx					= 0;
-#if !defined(PACKING_ONE_SLICE_PER_LAYER)
-	const int32_t iFirstSlice		= (iSliceIdx == 0);
-	int32_t iNalBase				= iFirstSlice ? 0 : pLbi->iNalCount;
-#else
-	int32_t iNalBase				= 0;
-#endif//!PACKING_ONE_SLICE_PER_LAYER
-	
-	while ( iNalIdx < kiNalCnt ) {
-		iSliceSize += WelsEncodeNalExt( &pSliceBs->sNalList[iNalIdx], pNalHdrExt, pDst, &pNalLen[iNalIdx] );
-		pDst += pNalLen[iNalIdx];
-		pLbi->iNalLengthInByte[iNalBase+iNalIdx]	= pNalLen[iNalIdx];
-		
-		++ iNalIdx;
-	}	
-	
-#if !defined(PACKING_ONE_SLICE_PER_LAYER)
-	pSliceBs->uiBsPos	= iSliceSize;
-	if ( iFirstSlice )
-	{
-		// pBsBuffer has been updated at coding_slice_0_in_encoder_mother_thread()
-		pLbi->uiLayerType		= VIDEO_CODING_LAYER;
-		pLbi->uiSpatialId		= pNalHdrExt->uiDependencyId;
-		pLbi->uiTemporalId	= pNalHdrExt->uiTemporalId;
-		pLbi->uiQualityId		= 0;
-		pLbi->uiPriorityId	= 0;	
-		pLbi->iNalCount		= kiNalCnt;
-	}
-	else
-	{
-		pLbi->iNalCount		+= kiNalCnt;
-	}
-#else
-	pLbi->uiLayerType		= VIDEO_CODING_LAYER;
-	pLbi->uiSpatialId		= pNalHdrExt->uiDependencyId;
-	pLbi->uiTemporalId	= pNalHdrExt->uiTemporalId;
-	pLbi->uiQualityId		= 0;
-	pLbi->uiPriorityId	= 0;	
-	pLbi->iNalCount		= kiNalCnt;
-#endif//PACKING_ONE_SLICE_PER_LAYER
-	
-	return iSliceSize;
-}
-
-int32_t WriteSliceBs( sWelsEncCtx *pCtx, uint8_t *pSliceBsBuf, const int32_t iSliceIdx )
-{
-	SWelsSliceBs *pSliceBs			= &pCtx->pSliceBs[iSliceIdx];
-	SNalUnitHeaderExt *pNalHdrExt= &pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt;	
-	uint8_t *pDst					= pSliceBsBuf;
-	int32_t *pNalLen				= &pSliceBs->iNalLen[0];
-	int32_t iSliceSize				= 0;
-	const int32_t kiNalCnt			= pSliceBs->iNalIndex;	
-	int32_t iNalIdx					= 0;	
-
-	assert( kiNalCnt <= 2 );
-	if ( kiNalCnt > 2 )
-		return 0;
-	
-	while ( iNalIdx < kiNalCnt ) {
-		iSliceSize += WelsEncodeNalExt( &pSliceBs->sNalList[iNalIdx], pNalHdrExt, pDst, &pNalLen[iNalIdx] );
-		pDst += pNalLen[iNalIdx];		
-		
-		++ iNalIdx;
-	}
-	pSliceBs->uiBsPos	= iSliceSize;
-	
-	return iSliceSize;
-}
-
-#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
-#if defined(__GNUC__)
-WELS_THREAD_ROUTINE_TYPE UpdateMbListThreadProc( void *arg )
-{
-	SSliceThreadPrivateData *pPrivateData	= (SSliceThreadPrivateData *)arg;
-	sWelsEncCtx *pEncPEncCtx			= NULL;
-	SDqLayer *pCurDq							= NULL;
-	int32_t iSliceIdx							= -1;
-	int32_t iEventIdx							= -1;
-	WELS_THREAD_ERROR_CODE iWaitRet				= WELS_THREAD_ERROR_GENERIAL;
-	uint32_t uiThrdRet							= 0;
-	
-	if ( NULL == pPrivateData )
-		WELS_THREAD_ROUTINE_RETURN(1);
-
-	pEncPEncCtx	= (sWelsEncCtx *)pPrivateData->pWelsPEncCtx;	
-	iSliceIdx		= pPrivateData->iSliceIndex;
-	iEventIdx		= pPrivateData->iThreadIndex;
-
-	do {
-#if defined(ENABLE_TRACE_MT)
-		WelsLog(pEncPEncCtx, WELS_LOG_INFO, "[MT] UpdateMbListThreadProc(), try to wait (pUpdateMbListEvent[%d])!\n", iEventIdx);
-#endif
-		iWaitRet = WelsEventWait( pEncPEncCtx->pSliceThreading->pUpdateMbListEvent[iEventIdx] );
-		if ( WELS_THREAD_ERROR_WAIT_OBJECT_0 == iWaitRet )
-		{
-			pCurDq			= pEncPEncCtx->pCurDqLayer;
-			UpdateMbListNeighborParallel( pCurDq->pSliceEncCtx, pCurDq->sMbDataP, iSliceIdx );
-			WelsEventSignal( pEncPEncCtx->pSliceThreading->pFinUpdateMbListEvent[iEventIdx] );	// mean finished update pMb list for this pSlice
-		}
-		else
-		{
-			WelsLog(pEncPEncCtx, WELS_LOG_WARNING, "[MT] UpdateMbListThreadProc(), waiting pUpdateMbListEvent[%d] failed(%d) and thread%d terminated!\n", iEventIdx, iWaitRet, iEventIdx);
-			uiThrdRet = 1;
-			break;
-		}
-	} while(1);
-
-	WELS_THREAD_ROUTINE_RETURN(uiThrdRet);
-}
-#endif//__GNUC__
-#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
-
-// thread process for coding one pSlice
-WELS_THREAD_ROUTINE_TYPE CodingSliceThreadProc( void *arg )
-{
-	SSliceThreadPrivateData *pPrivateData	= (SSliceThreadPrivateData *)arg;
-	sWelsEncCtx *pEncPEncCtx			= NULL;
-	SDqLayer *pCurDq							= NULL;
-	SSlice *pSlice								= NULL;
-	SWelsSliceBs *pSliceBs						= NULL;
-#ifdef WIN32
-	WELS_EVENT pEventsList[3];
-	int32_t iEventCount						= 0;
-#endif
-	WELS_THREAD_ERROR_CODE iWaitRet				= WELS_THREAD_ERROR_GENERIAL;
-	uint32_t uiThrdRet							= 0;
-	int32_t iSliceSize							= 0;
-	int32_t iSliceIdx							= -1;
-	int32_t iThreadIdx							= -1;
-	int32_t iEventIdx							= -1;
-	bool_t bNeedPrefix							= false;
-	EWelsNalUnitType eNalType						= NAL_UNIT_UNSPEC_0;
-	EWelsNalRefIdc eNalRefIdc						= NRI_PRI_LOWEST;	
-
-	if ( NULL == pPrivateData )
-		WELS_THREAD_ROUTINE_RETURN(1);
-
-	WelsSetThreadCancelable();
-	
-	pEncPEncCtx	= (sWelsEncCtx *)pPrivateData->pWelsPEncCtx;
-	
-	iThreadIdx		= pPrivateData->iThreadIndex;	
-	iEventIdx		= iThreadIdx;
-	
-#ifdef WIN32
-	pEventsList[iEventCount++]	= pEncPEncCtx->pSliceThreading->pReadySliceCodingEvent[iEventIdx];
-	pEventsList[iEventCount++]	= pEncPEncCtx->pSliceThreading->pExitEncodeEvent[iEventIdx];
-#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
-	pEventsList[iEventCount++] = pEncPEncCtx->pSliceThreading->pUpdateMbListEvent[iEventIdx];
-#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)	
-#endif//WIN32
-
-	do {
-#ifdef WIN32
-		iWaitRet = WelsMultipleEventsWaitSingleBlocking(	iEventCount,
-															&pEventsList[0],
-															(uint32_t)-1	);	// blocking until at least one event is 
-#else
-#if defined(ENABLE_TRACE_MT)
-		WelsLog(pEncPEncCtx, WELS_LOG_INFO, "[MT] CodingSliceThreadProc(), try to call WelsEventWait(pReadySliceCodingEvent[%d]= 0x%p), pEncPEncCtx= 0x%p!\n", iEventIdx, (void *)(pEncPEncCtx->pReadySliceCodingEvent[iEventIdx]), (void *)pEncPEncCtx );
-#endif
-		iWaitRet = WelsEventWait( pEncPEncCtx->pSliceThreading->pReadySliceCodingEvent[iEventIdx] );
-#endif//WIN32		
-		if ( WELS_THREAD_ERROR_WAIT_OBJECT_0 == iWaitRet )	// start pSlice coding signal waited
-		{
-			SLayerBSInfo *pLbi = pPrivateData->pLayerBs;
-			const int32_t kiCurDid			= pEncPEncCtx->uiDependencyId;
-			const int32_t kiCurTid			= pEncPEncCtx->uiTemporalId;
-			SWelsSvcCodingParam *pCodingParam	= pEncPEncCtx->pSvcParam;
-			SDLayerParam *pParamD			= &pCodingParam->sDependencyLayers[kiCurDid];
-
-			pCurDq			= pEncPEncCtx->pCurDqLayer;			
-			eNalType		= pEncPEncCtx->eNalType;
-			eNalRefIdc		= pEncPEncCtx->eNalPriority;
-			bNeedPrefix		= pEncPEncCtx->bNeedPrefixNalFlag;			
-			
-			if ( pParamD->sMso.uiSliceMode != SM_DYN_SLICE )
-			{
-				int64_t iSliceStart	= 0;
-				bool_t bDsaFlag = false;
-				iSliceIdx		= pPrivateData->iSliceIndex;
-				pSlice			= &pCurDq->sLayerInfo.pSliceInLayer[iSliceIdx];
-				pSliceBs		= &pEncPEncCtx->pSliceBs[iSliceIdx];
-
-#if defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG)
-				bDsaFlag	= (pParamD->sMso.uiSliceMode == SM_FIXEDSLCNUM_SLICE && 
-							   pCodingParam->iMultipleThreadIdc > 1 &&
-							   pCodingParam->iMultipleThreadIdc >= pParamD->sMso.sSliceArgument.iSliceNum);
-				if ( bDsaFlag )
-					iSliceStart = WelsTime();
-#endif//DYNAMIC_SLICE_ASSIGN || MT_DEBUG
-
-#if !defined(PACKING_ONE_SLICE_PER_LAYER)
-				pSliceBs->uiBsPos	= 0;
-#endif//!PACKING_ONE_SLICE_PER_LAYER
-				pSliceBs->iNalIndex	= 0;
-				assert( (void*)(&pSliceBs->sBsWrite) == (void*)pSlice->pSliceBsa );
-				InitBits( &pSliceBs->sBsWrite, pSliceBs->pBsBuffer, pSliceBs->uiSize );
-
-#if MT_DEBUG_BS_WR
-				pSliceBs->bSliceCodedFlag	= FALSE;
-#endif//MT_DEBUG_BS_WR
-			
-				if ( bNeedPrefix )
-				{
-					if ( eNalRefIdc != NRI_PRI_LOWEST )
-					{
-						WelsLoadNalForSlice( pSliceBs, NAL_UNIT_PREFIX, eNalRefIdc );
-						WelsWriteSVCPrefixNal( &pSliceBs->sBsWrite, eNalRefIdc, (NAL_UNIT_CODED_SLICE_IDR == eNalType) );
-						WelsUnloadNalForSlice( pSliceBs );			
-					}
-					else // No Prefix NAL Unit RBSP syntax here, but need add NAL Unit Header extension
-					{
-						WelsLoadNalForSlice( pSliceBs, NAL_UNIT_PREFIX, eNalRefIdc );
-						// No need write any syntax of prefix NAL Unit RBSP here
-						WelsUnloadNalForSlice( pSliceBs );			
-					}
-				}
-				
-				WelsLoadNalForSlice( pSliceBs, eNalType, eNalRefIdc );
-
-				WelsCodeOneSlice( pEncPEncCtx, iSliceIdx, eNalType );			
-
-				WelsUnloadNalForSlice( pSliceBs );
-
-#if !defined(PACKING_ONE_SLICE_PER_LAYER)
-				if ( 0 == iSliceIdx )
-				{			
-					pLbi->pBsBuf	= pEncPEncCtx->pFrameBs + pEncPEncCtx->iPosBsBuffer;
-					iSliceSize = WriteSliceToFrameBs( pEncPEncCtx, pLbi, pLbi->pBsBuf, iSliceIdx );
-					pEncPEncCtx->iPosBsBuffer += iSliceSize;
-				}
-				else
-					iSliceSize = WriteSliceBs( pEncPEncCtx, pSliceBs->pBs, iSliceIdx );
-#else// PACKING_ONE_SLICE_PER_LAYER
-				if ( 0 == iSliceIdx )
-				{
-					pLbi->pBsBuf	= pEncPEncCtx->pFrameBs + pEncPEncCtx->iPosBsBuffer;
-					iSliceSize = WriteSliceToFrameBs( pEncPEncCtx, pLbi, pLbi->pBsBuf, iSliceIdx );
-					pEncPEncCtx->iPosBsBuffer += iSliceSize;
-				}
-				else
-				{
-					pLbi->pBsBuf	= pSliceBs->bs + pSliceBs->uiBsPos;
-					iSliceSize = WriteSliceToFrameBs( pEncPEncCtx, pLbi, pLbi->pBsBuf, iSliceIdx );
-					pSliceBs->uiBsPos += iSliceSize;
-				}			
-#endif//!PACKING_ONE_SLICE_PER_LAYER
-			
-				if ( pCurDq->bDeblockingParallelFlag && pSlice->sSliceHeaderExt.sSliceHeader.uiDisableDeblockingFilterIdc != 1
-#if !defined(ENABLE_FRAME_DUMP)
-					&& ( eNalRefIdc != NRI_PRI_LOWEST ) && 
-					( pParamD->iHighestTemporalId == 0 || kiCurTid < pParamD->iHighestTemporalId )
-#endif// !ENABLE_FRAME_DUMP
-					)
-				{
-					DeblockingFilterSliceAvcbase( pCurDq, pEncPEncCtx->pFuncList, iSliceIdx );
-				}			
-
-#if defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG)
-				if ( bDsaFlag )
-				{
-					pEncPEncCtx->pSliceThreading->pSliceConsumeTime[pEncPEncCtx->uiDependencyId][iSliceIdx] = (uint32_t)(WelsTime() - iSliceStart);
-#if defined(ENABLE_TRACE_MT)
-					WelsLog(pEncPEncCtx, WELS_LOG_INFO, "[MT] CodingSliceThreadProc(), coding_idx %d, uiSliceIdx %d, pSliceConsumeTime %d, iSliceSize %d, pFirstMbInSlice %d, count_num_mb_in_slice %d\n",
-						pEncPEncCtx->iCodingIndex, iSliceIdx, pEncPEncCtx->pSliceThreading->pSliceConsumeTime[pEncPEncCtx->uiDependencyId][iSliceIdx], iSliceSize, pCurDq->pSliceEncCtx->pFirstMbInSlice[iSliceIdx], pCurDq->pSliceEncCtx->pCountMbNumInSlice[iSliceIdx]);
-#endif//ENABLE_TRACE_MT
-				}
-#endif//DYNAMIC_SLICE_ASSIGN || MT_DEBUG
-
-#if defined(SLICE_INFO_OUTPUT)
-				fprintf(	stderr,
-							"@pSlice=%-6d sliceType:%c idc:%d size:%-6d\n",
-							iSliceIdx,
-							(pEncPEncCtx->eSliceType == P_SLICE ? 'P' : 'I'),
-							eNalRefIdc,
-							iSliceSize
-						);
-#endif//SLICE_INFO_OUTPUT				
-
-#if MT_DEBUG_BS_WR
-				pSliceBs->bSliceCodedFlag	= TRUE;
-#endif//MT_DEBUG_BS_WR
-
-#ifdef WIN32
-				WelsEventSignal( &pEncPEncCtx->pSliceThreading->pSliceCodedEvent[iEventIdx] );	// mean finished coding current pSlice						
-#else
-				WelsEventSignal( pEncPEncCtx->pSliceThreading->pSliceCodedEvent[iEventIdx] );	// mean finished coding current pSlice				
-#endif//WIN32				
-			}
-			else	// for SM_DYN_SLICE parallelization
-			{
-#ifdef PACKING_ONE_SLICE_PER_LAYER
-				SLayerBSInfo *pLbiPacking			= NULL;
-#endif//PACKING_ONE_SLICE_PER_LAYER
-				SSliceCtx *pSliceCtx			= pCurDq->pSliceEncCtx;
-				const int32_t kiPartitionId			= iThreadIdx;
-				const int32_t kiSliceIdxStep		= pEncPEncCtx->iActiveThreadsNum;
-				const int32_t kiFirstMbInPartition	= pPrivateData->iStartMbIndex;	// inclusive
-				const int32_t kiEndMbInPartition	= pPrivateData->iEndMbIndex;		// exclusive
-				int32_t iAnyMbLeftInPartition	= kiEndMbInPartition - kiFirstMbInPartition;				
-				
-				iSliceIdx		= pPrivateData->iSliceIndex;
-
-				pSliceCtx->pFirstMbInSlice[iSliceIdx]				= kiFirstMbInPartition;					
-				pCurDq->pNumSliceCodedOfPartition[kiPartitionId]		= 1;	// one pSlice per partition intialized, dynamic slicing inside
-				pCurDq->pLastMbIdxOfPartition[kiPartitionId]			= kiEndMbInPartition-1;
-
-				pCurDq->pLastCodedMbIdxOfPartition[kiPartitionId]		= 0;
-
-				while( iAnyMbLeftInPartition > 0 )
-				{
-					if ( iSliceIdx >= pSliceCtx->iMaxSliceNumConstraint )
-					{
-						// TODO: need exception handler for not large enough of MAX_SLICES_NUM related memory usage
-						// No idea about its solution due MAX_SLICES_NUM is fixed lenght in relevent pData structure
-						uiThrdRet	= 1;
-						break;
-					}
-
-					pSlice			= &pCurDq->sLayerInfo.pSliceInLayer[iSliceIdx];
-					pSliceBs		= &pEncPEncCtx->pSliceBs[iSliceIdx];
-			
-#if !defined(PACKING_ONE_SLICE_PER_LAYER)
-					pSliceBs->uiBsPos	= 0;
-#endif//!PACKING_ONE_SLICE_PER_LAYER
-					pSliceBs->iNalIndex	= 0;
-					InitBits( &pSliceBs->sBsWrite, pSliceBs->pBsBuffer, pSliceBs->uiSize );
-			
-					if ( bNeedPrefix )
-					{
-						if ( eNalRefIdc != NRI_PRI_LOWEST )
-						{
-							WelsLoadNalForSlice( pSliceBs, NAL_UNIT_PREFIX, eNalRefIdc );
-							WelsWriteSVCPrefixNal( &pSliceBs->sBsWrite, eNalRefIdc, (NAL_UNIT_CODED_SLICE_IDR == eNalType) );
-							WelsUnloadNalForSlice( pSliceBs );			
-						}
-						else // No Prefix NAL Unit RBSP syntax here, but need add NAL Unit Header extension
-						{
-							WelsLoadNalForSlice( pSliceBs, NAL_UNIT_PREFIX, eNalRefIdc );
-							// No need write any syntax of prefix NAL Unit RBSP here
-							WelsUnloadNalForSlice( pSliceBs );			
-						}
-					}
-				
-					WelsLoadNalForSlice( pSliceBs, eNalType, eNalRefIdc );
-
-					WelsCodeOneSlice( pEncPEncCtx, iSliceIdx, eNalType );			
-
-					WelsUnloadNalForSlice( pSliceBs );
-
-#if !defined(PACKING_ONE_SLICE_PER_LAYER)
-					if ( 0 == kiPartitionId )
-					{	
-						if ( 0 == iSliceIdx )
-							pLbi->pBsBuf	= pEncPEncCtx->pFrameBs + pEncPEncCtx->iPosBsBuffer;
-						iSliceSize = WriteSliceToFrameBs( pEncPEncCtx, pLbi, pEncPEncCtx->pFrameBs + pEncPEncCtx->iPosBsBuffer, iSliceIdx );
-						pEncPEncCtx->iPosBsBuffer += iSliceSize;
-					}
-					else
-						iSliceSize = WriteSliceBs( pEncPEncCtx, pSliceBs->pBs, iSliceIdx );
-#else// PACKING_ONE_SLICE_PER_LAYER
-					pLbiPacking	= pLbi + (iSliceIdx - kiPartitionId);
-
-					if ( 0 == kiPartitionId )
-					{
-						pLbiPacking->pBsBuf	= pEncPEncCtx->pFrameBs + pEncPEncCtx->iPosBsBuffer;
-						iSliceSize = WriteSliceToFrameBs( pEncPEncCtx, pLbiPacking, pLbiPacking->pBsBuf, iSliceIdx );
-						pEncPEncCtx->iPosBsBuffer += iSliceSize;
-					}
-					else
-					{
-						pLbiPacking->pBsBuf	= pSliceBs->bs + pSliceBs->uiBsPos;
-						iSliceSize = WriteSliceToFrameBs( pEncPEncCtx, pLbiPacking, pLbiPacking->pBsBuf, iSliceIdx );
-						pSliceBs->uiBsPos += iSliceSize;
-					}
-					pEncPEncCtx->pSliceThreading->pCountBsSizeInPartition[kiPartitionId] += iSliceSize;
-#endif//!PACKING_ONE_SLICE_PER_LAYER
-			
-					if ( pCurDq->bDeblockingParallelFlag && pSlice->sSliceHeaderExt.sSliceHeader.uiDisableDeblockingFilterIdc != 1
-#if !defined(ENABLE_FRAME_DUMP)
-						&& ( eNalRefIdc != NRI_PRI_LOWEST ) && 
-						( pParamD->iHighestTemporalId == 0 || kiCurTid < pParamD->iHighestTemporalId )
-#endif// !ENABLE_FRAME_DUMP
-						)
-					{
-						DeblockingFilterSliceAvcbase( pCurDq, pEncPEncCtx->pFuncList, iSliceIdx );
-					}
-					
-#if defined(SLICE_INFO_OUTPUT)
-					fprintf(	stderr,
-								"@pSlice=%-6d sliceType:%c idc:%d size:%-6d\n",
-								iSliceIdx,
-								(pEncPEncCtx->eSliceType == P_SLICE ? 'P' : 'I'),
-								eNalRefIdc,
-								iSliceSize
-							);
-#endif//SLICE_INFO_OUTPUT					
-
-#if defined(ENABLE_TRACE_MT)
-					WelsLog(pEncPEncCtx, WELS_LOG_INFO, "[MT] CodingSliceThreadProc(), coding_idx %d, iPartitionId %d, uiSliceIdx %d, iSliceSize %d, count_mb_slice %d, iEndMbInPartition %d, pCurDq->pLastCodedMbIdxOfPartition[%d] %d\n",
-						pEncPEncCtx->iCodingIndex, kiPartitionId, iSliceIdx, iSliceSize, pCurDq->pSliceEncCtx->pCountMbNumInSlice[iSliceIdx], kiEndMbInPartition, kiPartitionId, pCurDq->pLastCodedMbIdxOfPartition[kiPartitionId]);
-#endif//ENABLE_TRACE_MT
-					
-					iAnyMbLeftInPartition = kiEndMbInPartition - (1 + pCurDq->pLastCodedMbIdxOfPartition[kiPartitionId]);
-					iSliceIdx += kiSliceIdxStep;
-				}
-
-				if ( uiThrdRet )	// any exception??
-					break;
-
-#ifdef WIN32
-				WelsEventSignal( &pEncPEncCtx->pSliceThreading->pSliceCodedEvent[iEventIdx] );	// mean finished coding current pSlice		
-#else
-				WelsEventSignal( pEncPEncCtx->pSliceThreading->pSliceCodedEvent[iEventIdx] );	// mean finished coding current pSlice		
-#endif//WIN32
-			}
-		}
-#ifdef WIN32
-		else if ( WELS_THREAD_ERROR_WAIT_OBJECT_0+1 == iWaitRet )	// exit thread signal
-		{
-			uiThrdRet	= 0;
-			break;
-		}
-#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
-		else if ( WELS_THREAD_ERROR_WAIT_OBJECT_0+2 == iWaitRet )	// update pMb list singal
-		{
-			iSliceIdx		= iEventIdx;	// pPrivateData->iSliceIndex; old threads can not be terminated, pPrivateData is not correct for applicable
-			pCurDq			= pEncPEncCtx->pCurDqLayer;
-			UpdateMbListNeighborParallel( pCurDq->pSliceEncCtx, pCurDq->sMbDataP, iSliceIdx );
-			WelsEventSignal( &pEncPEncCtx->pSliceThreading->pFinUpdateMbListEvent[iEventIdx] );	// mean finished update pMb list for this pSlice			
-		}
-#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
-#endif//WIN32		
-		else // WELS_THREAD_ERROR_WAIT_TIMEOUT, or WELS_THREAD_ERROR_WAIT_FAILED
-		{
-			WelsLog(pEncPEncCtx, WELS_LOG_WARNING, "[MT] CodingSliceThreadProc(), waiting pReadySliceCodingEvent[%d] failed(%d) and thread%d terminated!\n", iEventIdx, iWaitRet, iThreadIdx);
-			uiThrdRet	= 1;
-			break;
-		}		
-	} while( 1 );
-
-#ifdef WIN32
-	WelsEventSignal( &pEncPEncCtx->pSliceThreading->pFinSliceCodingEvent[iEventIdx] );	// notify to mother encoding threading
-#endif//WIN32
-
-	WELS_THREAD_ROUTINE_RETURN(uiThrdRet);
-}
-
-int32_t CreateSliceThreads( sWelsEncCtx *pCtx )
-{
-	const int32_t kiThreadCount = pCtx->pSvcParam->iCountThreadsNum;
-	int32_t iIdx = 0;
-#if defined(WIN32) && defined(BIND_CPU_CORES_TO_THREADS)
-	DWORD  dwProcessAffinity;
-	DWORD  dwSystemAffinity;
-	GetProcessAffinityMask(GetCurrentProcess(), &dwProcessAffinity, &dwSystemAffinity);
-#endif//WIN32 && BIND_CPU_CORES_TO_THREADS
-	
-	while ( iIdx < kiThreadCount ) {
-		WelsThreadCreate( &pCtx->pSliceThreading->pThreadHandles[iIdx], CodingSliceThreadProc, &pCtx->pSliceThreading->pThreadPEncCtx[iIdx], 0);
-#if defined(WIN32) && defined(BIND_CPU_CORES_TO_THREADS)
-		if ( dwProcessAffinity > 1 && pCtx->pSliceThreading->pThreadHandles[iIdx] != NULL )	// multiple cores and thread created successfully
-		{	
-			DWORD  dw = 0;
-			DWORD  dwAffinityMask = 1 << iIdx;
-			if (dwAffinityMask & dwProcessAffinity) // check if cpu is available
-			{
-				dw = SetThreadAffinityMask( pCtx->pSliceThreading->pThreadHandles[iIdx], dwAffinityMask ); //1 << iIdx
-				if ( dw == 0)
-				{
-					str_t str[64] = {0};
-					SNPRINTF(str, 64, "SetThreadAffinityMask iIdx:%d", iIdx);
-				}
-			}			
-		}
-#endif//WIN32 && BIND_CPU_CORES_TO_THREADS
-		// We need extra threads for update_mb_list_proc on __GNUC__ like OS (mac/linux) 
-		// due to WelsMultipleEventsWaitSingleBlocking implememtation can not work well 
-		// in case waiting pUpdateMbListEvent and pReadySliceCodingEvent events at the same time
-#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
-#if defined(__GNUC__)
-		WelsThreadCreate( &pCtx->pSliceThreading->pUpdateMbListThrdHandles[iIdx], UpdateMbListThreadProc, &pCtx->pSliceThreading->pThreadPEncCtx[iIdx], 0);
-#endif//__GNUC__
-#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
-		++ iIdx;
-	}
-#if defined(ENABLE_TRACE_MT)
-	WelsLog(pCtx, WELS_LOG_INFO, "CreateSliceThreads() exit..\n");
-#endif
-	return 0;
-}
-
-#ifdef PACKING_ONE_SLICE_PER_LAYER
-void ResetCountBsSizeInPartitions( uint32_t *pCountBsSizeList, const int32_t iPartitionCnt )
-{
-	if ( pCountBsSizeList != NULL && iPartitionCnt > 0 )
-	{
-		memset(pCountBsSizeList, 0, sizeof(pCountBsSizeList[0]) * iPartitionCnt );
-	}
-}
-#endif//PACKING_ONE_SLICE_PER_LAYER
-
-#ifdef WIN32
-int32_t FiredSliceThreads( SSliceThreadPrivateData *pPriData, WELS_EVENT *pEventsList, SLayerBSInfo *pLbi, const uint32_t uiNumThreads, SSliceCtx *pSliceCtx, const BOOL_T bIsDynamicSlicingMode )
-#else
-int32_t FiredSliceThreads( SSliceThreadPrivateData *pPriData, WELS_EVENT **pEventsList, SLayerBSInfo *pLbi, const uint32_t uiNumThreads, SSliceCtx *pSliceCtx, const BOOL_T bIsDynamicSlicingMode )
-#endif//WIN32
-{
-	int32_t iEndMbIdx	= 0;
-	int32_t iIdx		= 0;
-	const int32_t kiEventCnt = uiNumThreads;
-	
-	if ( pPriData == NULL || pLbi == NULL || kiEventCnt <= 0 || pEventsList == NULL )
-	{
-		WelsLog( NULL, WELS_LOG_ERROR, "FiredSliceThreads(), fail due pPriData == %p || pLbi == %p || iEventCnt(%d) <= 0 || pEventsList == %p!!\n", (void *)pPriData, (void *)pLbi, uiNumThreads,  (void *)pEventsList);	
-		return 1;
-	}
-
-#if defined(PACKING_ONE_SLICE_PER_LAYER)
-	////////////////////////////////////////
-	if ( bIsDynamicSlicingMode )
-	{
-		iEndMbIdx	= pSliceCtx->iMbNumInFrame;
-		for (iIdx = kiEventCnt-1; iIdx >= 0; --iIdx)
-		{
-			const int32_t kiFirstMbIdx		= pSliceCtx->pFirstMbInSlice[iIdx];
-			pPriData[iIdx].iStartMbIndex	= kiFirstMbIdx;			
-			pPriData[iIdx].iEndMbIndex		= iEndMbIdx;
-			iEndMbIdx						= kiFirstMbIdx;
-		}
-	}
-
-	iIdx = 0;
-	while (iIdx < kiEventCnt) {
-		pPriData[iIdx].pLayerBs = pLbi;
-		pPriData[iIdx].iSliceIndex	= iIdx;
-#ifdef WIN32
-		if ( pEventsList[iIdx] )
-			WelsEventSignal( &pEventsList[iIdx] );
-#else
-		WelsEventSignal( pEventsList[iIdx] );
-#endif//WIN32
-		++ pLbi;
-		++ iIdx;
-	}	
-	////////////////////////////////////////
-#else
-	////////////////////////////////////////
-	if ( bIsDynamicSlicingMode )
-	{
-		iEndMbIdx	= pSliceCtx->iMbNumInFrame;
-		for (iIdx = kiEventCnt-1; iIdx >= 0; --iIdx)
-		{
-			const int32_t iFirstMbIdx		= pSliceCtx->pFirstMbInSlice[iIdx];
-			pPriData[iIdx].iStartMbIndex	= iFirstMbIdx;			
-			pPriData[iIdx].iEndMbIndex		= iEndMbIdx;
-			iEndMbIdx						= iFirstMbIdx;
-		}
-	}
-
-	iIdx = 0;
-	while (iIdx < kiEventCnt) {
-		pPriData[iIdx].pLayerBs = pLbi;
-		pPriData[iIdx].iSliceIndex	= iIdx;
-#ifdef WIN32
-		if ( pEventsList[iIdx] )
-			WelsEventSignal( &pEventsList[iIdx] );
-#else
-		WelsEventSignal( pEventsList[iIdx] );
-#endif//WIN32
-		++ iIdx;
-	}		
-	////////////////////////////////////////
-#endif//PACKING_ONE_SLICE_PER_LAYER
-
-	return 0;
-}
-
-int32_t DynamicDetectCpuCores()
-{
-	WelsLogicalProcessInfo  info;
-	WelsQueryLogicalProcessInfo(&info);
-	return info.ProcessorCount;
-}
-
-#if defined(MT_ENABLED) && defined(DYNAMIC_SLICE_ASSIGN)
-
-int32_t AdjustBaseLayer( sWelsEncCtx *pCtx )
-{
-	SDqLayer *pCurDq	= pCtx->ppDqLayerList[0];		
-	int32_t iNeedAdj	= 1;
-#ifdef MT_DEBUG
-	int64_t iT0 = WelsTime();
-#endif//MT_DEBUG
-#ifdef TRY_SLICING_BALANCE
-	
-	pCtx->pCurDqLayer	= pCurDq;
-	
-#ifdef NOT_ABSOLUTE_BALANCING
-	// do not need adjust due to not different at both slices of consumed time
-	iNeedAdj	= NeedDynamicAdjust( pCtx->pSliceThreading->pSliceConsumeTime[0], pCurDq->pSliceEncCtx->iSliceNumInFrame );
-	if ( iNeedAdj )
-#endif//NOT_ABSOLUTE_BALANCING
-	DynamicAdjustSlicing(	pCtx,
-							pCurDq,
-							pCtx->pSliceThreading->pSliceComplexRatio[0],
-							0 );
-#endif//TRY_SLICING_BALANCE
-#ifdef MT_DEBUG
-	iT0 = WelsTime() - iT0;
-	if ( pCtx->pSliceThreading->pFSliceDiff )
-	{
-		fprintf( pCtx->pSliceThreading->pFSliceDiff, 
-#ifdef WIN32
-				"%6I64d us adjust time at base spatial layer, iNeedAdj %d, DynamicAdjustSlicing()\n",
-#else
-				"%6lld us adjust time at base spatial layer, iNeedAdj %d, DynamicAdjustSlicing()\n",
-#endif//WIN32
-				iT0, iNeedAdj );
-	}
-#endif//MT_DEBUG
-
-	return iNeedAdj;
-}
-
-int32_t AdjustEnhanceLayer( sWelsEncCtx *pCtx, int32_t iCurDid )
-{
-#ifdef MT_DEBUG
-	int64_t iT1 = WelsTime();
-#endif//MT_DEBUG
-	int32_t iNeedAdj = 1;	
-	// uiSliceMode of referencing spatial should be SM_FIXEDSLCNUM_SLICE
-	// if using spatial base layer for complexity estimation
-	const BOOL_T kbModelingFromSpatial =	(pCtx->pCurDqLayer->pRefLayer != NULL && iCurDid > 0) 
-										&& (pCtx->pSvcParam->sDependencyLayers[iCurDid-1].sMso.uiSliceMode == SM_FIXEDSLCNUM_SLICE && pCtx->pSvcParam->iMultipleThreadIdc >= pCtx->pSvcParam->sDependencyLayers[iCurDid-1].sMso.sSliceArgument.iSliceNum);
-
-	if ( kbModelingFromSpatial )	// using spatial base layer for complexity estimation
-	{	
-#ifdef TRY_SLICING_BALANCE
-#ifdef NOT_ABSOLUTE_BALANCING
-		// do not need adjust due to not different at both slices of consumed time
-		iNeedAdj = NeedDynamicAdjust( pCtx->pSliceThreading->pSliceConsumeTime[iCurDid-1], pCtx->pCurDqLayer->pSliceEncCtx->iSliceNumInFrame );
-		if ( iNeedAdj )
-#endif//NOT_ABSOLUTE_BALANCING
-		DynamicAdjustSlicing(	pCtx,
-								pCtx->pCurDqLayer,
-								pCtx->pSliceThreading->pSliceComplexRatio[iCurDid-1],
-								iCurDid
-							  );
-#endif//TRY_SLICING_BALANCE
-	}
-	else	// use temporal layer for complexity estimation
-	{	
-#ifdef TRY_SLICING_BALANCE
-#ifdef NOT_ABSOLUTE_BALANCING
-		// do not need adjust due to not different at both slices of consumed time
-		iNeedAdj = NeedDynamicAdjust( pCtx->pSliceThreading->pSliceConsumeTime[iCurDid], pCtx->pCurDqLayer->pSliceEncCtx->iSliceNumInFrame );
-		if ( iNeedAdj )
-#endif//NOT_ABSOLUTE_BALANCING
-		DynamicAdjustSlicing(	pCtx,
-								pCtx->pCurDqLayer,
-								pCtx->pSliceThreading->pSliceComplexRatio[iCurDid],
-								iCurDid
-							  );
-#endif//TRY_SLICING_BALANCE
-	}
-
-#ifdef MT_DEBUG
-	iT1 = WelsTime() - iT1;
-	if ( pCtx->pSliceThreading->pFSliceDiff )
-	{
-		fprintf( pCtx->pSliceThreading->pFSliceDiff, 
-#ifdef WIN32
-				"%6I64d us adjust time at spatial layer %d, iNeedAdj %d, DynamicAdjustSlicing()\n",
-#else
-				"%6lld us adjust time at spatial layer %d, iNeedAdj %d, DynamicAdjustSlicing()\n",
-#endif//WIN32
-				iT1, iCurDid, iNeedAdj );
-	}
-#endif//MT_DEBUG
-
-	return iNeedAdj;
-}
-
-#endif//#if defined(MT_ENABLED) && defined(DYNAMIC_SLICE_ASSIGN)
-
-#if defined(MT_ENABLED)
-
-#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE) && defined(MT_DEBUG)
-void TrackSliceComplexities( sWelsEncCtx *pCtx, const int32_t iCurDid )
-{
-	const int32_t kiCountSliceNum = pCtx->pCurDqLayer->pSliceEncCtx->iSliceNumInFrame;
-	if ( kiCountSliceNum > 0 )
-	{
-		int32_t iSliceIdx = 0;
-		do {
-			fprintf( pCtx->pSliceThreading->pFSliceDiff, "%6.3f complexity pRatio at iDid %d pSlice %d\n", pCtx->pSliceThreading->pSliceComplexRatio[iCurDid][iSliceIdx], iCurDid, iSliceIdx );
-			++ iSliceIdx;
-		} while(iSliceIdx < kiCountSliceNum);
-	}
-}
-#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
-
-#if defined(DYNAMIC_SLICE_ASSIGN) && defined(MT_DEBUG)
-void TrackSliceConsumeTime( sWelsEncCtx *pCtx, int32_t *pDidList, const int32_t iSpatialNum )
-{
-	SWelsSvcCodingParam *pPara = NULL;
-	int32_t iSpatialIdx = 0;
-
-	if ( iSpatialNum > MAX_DEPENDENCY_LAYER )
-		return;
-
-	pPara	= pCtx->pSvcParam;
-	while ( iSpatialIdx < iSpatialNum )
-	{
-		const int32_t kiDid		= pDidList[iSpatialIdx];
-		SDLayerParam *pDlp		= &pPara->sDependencyLayers[kiDid];
-		SMulSliceOption *pMso	= &pDlp->sMso;
-		SDqLayer *pCurDq		= pCtx->ppDqLayerList[kiDid];
-		SSliceCtx *pSliceCtx= pCurDq->pSliceEncCtx;
-		const uint32_t kuiCountSliceNum = pSliceCtx->iSliceNumInFrame;
-		if(pCtx->pSliceThreading)
-		{
-			if ( pCtx->pSliceThreading->pFSliceDiff && pMso->uiSliceMode == SM_FIXEDSLCNUM_SLICE && pPara->iMultipleThreadIdc > 1 && pPara->iMultipleThreadIdc >= kuiCountSliceNum  )
-			{
-				uint32_t i = 0;
-				uint32_t uiMaxT = 0;
-				int32_t iMaxI = 0;
-				while (i < kuiCountSliceNum) {
-					if ( pCtx->pSliceThreading->pSliceConsumeTime[kiDid] != NULL )
-						fprintf( pCtx->pSliceThreading->pFSliceDiff, "%6d us consume_time coding_idx %d iDid %d pSlice %d\n",
-						pCtx->pSliceThreading->pSliceConsumeTime[kiDid][i], pCtx->iCodingIndex, kiDid, i /*/ 1000*/);
-					if (pCtx->pSliceThreading->pSliceConsumeTime[kiDid][i] > uiMaxT)
-					{
-						uiMaxT = pCtx->pSliceThreading->pSliceConsumeTime[kiDid][i];
-						iMaxI = i;
-					}
-					++ i;
-				}			 
-			fprintf( pCtx->pSliceThreading->pFSliceDiff, "%6d us consume_time_max coding_idx %d iDid %d pSlice %d\n", uiMaxT, pCtx->iCodingIndex, kiDid, iMaxI /*/ 1000*/);
-			}
-		}
-		++ iSpatialIdx;
-	}
-}
-#endif//#if defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG)
-
-#endif//MT_ENABLED
-}
-#endif//MT_ENABLED
-
+/*!
+ * \copy
+ *     Copyright (c)  2010-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	slice_multi_threading.cpp
+ *
+ * \brief	pSlice based multiple threading
+ *
+ * \date	04/16/2010 Created
+ *
+ *************************************************************************************
+ */
+
+#if defined(MT_ENABLED)
+
+#include <assert.h>
+#ifdef __GNUC__
+#include <semaphore.h>
+#ifndef SEM_NAME_MAX
+// length of semaphore name should be system constrained at least on mac 10.7
+#define  SEM_NAME_MAX 32
+#endif//SEM_NAME_MAX
+#endif//__GNUC__
+#include "slice_multi_threading.h"
+#include "mt_defs.h"
+#include "nal_encap.h"
+#include "utils.h"
+#include "encoder.h"
+#include "svc_encode_slice.h"
+#include "deblocking.h"
+#include "svc_enc_golomb.h"
+#include "crt_util_safe_x.h"	// for safe crt like calls
+#include "rc.h"
+
+#if defined(X86_ASM)
+#include "cpu.h"
+#endif//X86_ASM
+
+#if defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG)
+#include "measure_time.h"
+#endif//DYNAMIC_SLICE_ASSIGN
+namespace WelsSVCEnc {
+/*!
+ * \brief   Refresh the neighbour availability of every MB in one slice.
+ *          A left / top / top-left / top-right neighbour is flagged available when
+ *          it lies inside the picture and pOverallMbMap assigns it to uiSliceIdc.
+ * \param   pSliceCtx   slice layout: MB-to-slice map, width in MBs, first MB and MB count per slice
+ * \param   pMbList     MB array indexed by MB number; uiNeighborAvail and uiSliceIdc are overwritten
+ * \param   uiSliceIdc  index of the slice to process
+ */
+void UpdateMbListNeighborParallel (SSliceCtx* pSliceCtx,
+                                   SMB* pMbList,
+                                   const int32_t uiSliceIdc) {
+  const uint8_t* kpSliceOfMb   = pSliceCtx->pOverallMbMap;
+  const int32_t kiRowStride    = pSliceCtx->iMbWidth;
+  const int32_t kiFirstMb      = pSliceCtx->pFirstMbInSlice[uiSliceIdc];
+  const int32_t kiLastMb       = kiFirstMb + pSliceCtx->pCountMbNumInSlice[uiSliceIdc] - 1;  // inclusive
+  int32_t iMbIdx               = kiFirstMb;
+
+  // The range test sits at the bottom, so the first MB is handled unconditionally.
+  do {
+    SMB* const pCur            = &pMbList[iMbIdx];
+    const int32_t kiCol        = pCur->iMbX;
+    const int32_t kiRow        = pCur->iMbY;
+    const int32_t kiCurXY      = pCur->iMbXY;
+    const int32_t kiAboveXY    = kiCurXY - kiRowStride;
+    const BOOL_T kbHasRowAbove = (kiRow > 0);
+    uint32_t uiAvail           = 0;
+
+    // Position tests come first in each && chain, so the map is never indexed for a
+    // neighbour that the tests already rule out.
+    if ((kiCol > 0) && (uiSliceIdc == kpSliceOfMb[kiCurXY - 1])) {
+      uiAvail |= LEFT_MB_POS;
+    }
+    if (kbHasRowAbove && (uiSliceIdc == kpSliceOfMb[kiAboveXY])) {
+      uiAvail |= TOP_MB_POS;
+    }
+    if ((kiCol > 0) && kbHasRowAbove && (uiSliceIdc == kpSliceOfMb[kiAboveXY - 1])) {
+      uiAvail |= TOPLEFT_MB_POS;
+    }
+    if ((kiCol < (kiRowStride - 1)) && kbHasRowAbove && (uiSliceIdc == kpSliceOfMb[kiAboveXY + 1])) {
+      uiAvail |= TOPRIGHT_MB_POS;
+    }
+
+    pCur->uiNeighborAvail = (uint8_t)uiAvail;
+    pCur->uiSliceIdc      = uiSliceIdc;
+
+    ++ iMbIdx;
+  } while (iMbIdx <= kiLastMb);
+}
+
+/*! \brief  Turn per-slice coding times into normalised weights: slice i receives
+ *          (MB count of slice i / consumed time of slice i) divided by the sum of that
+ *          quotient over all iSliceNumInFrame slices.  pRatio is written as a float array. */
+void CalcSliceComplexRatio (void* pRatio, SSliceCtx* pSliceCtx, uint32_t* pSliceConsume) {
+  float* const pWeightOut      = (float*)pRatio;
+  const int32_t* kpMbCount     = (int32_t*)pSliceCtx->pCountMbNumInSlice;
+  const int32_t kiNumSlices    = pSliceCtx->iSliceNumInFrame;
+  float fThroughput[MAX_SLICES_NUM];  // MBs per unit of consumed time, one entry per slice
+  float fThroughputSum         = .0f;
+  int32_t i;
+
+#if defined(X86_ASM)
+  WelsEmms();  // NOTE(review): presumably clears MMX state ahead of the float math - confirm
+#endif //X86_ASM
+
+  for (i = 0; i < kiNumSlices; ++ i) {  // NOTE(review): a zero consume time divides by zero here
+    fThroughput[i] = 1.0f * kpMbCount[i] / pSliceConsume[i];
+#if defined(ENABLE_TRACE_MT)
+    WelsLog (NULL, WELS_LOG_DEBUG, "[MT] CalcSliceComplexRatio(), pSliceConsumeTime[%d]= %d us, slice_run= %d\n", i,
+             pSliceConsume[i], kpMbCount[i]);
+#endif//ENABLE_TRACE_MT
+    fThroughputSum += fThroughput[i];
+  }
+  for (i = kiNumSlices - 1; i >= 0; -- i) {
+    pWeightOut[i] = fThroughput[i] / fThroughputSum;
+  }
+}
+
+#if defined(MT_ENABLED) && defined(DYNAMIC_SLICE_ASSIGN) && defined(NOT_ABSOLUTE_BALANCING)
+/*! \brief   Decide whether the slice partition needs re-balancing.
+ *  \param   pConsumeTime  read as uint32_t[iSliceNum]: time consumed by each slice
+ *  \param   iSliceNum     number of slices
+ *  \return  false when the summed time is 0, else true iff the per-slice time shares are
+ *           judged unbalanced (RMSE or tolerance-band test, chosen at compile time) */
+int32_t NeedDynamicAdjust (void* pConsumeTime, const int32_t iSliceNum) {
+#if !defined(USE_RMSE_SLICE_COMPLEXITY_RATIO_FOR_BALANCING)
+  const float fRatioLower   = TOLERANT_BALANCING_RATIO_LOWER (iSliceNum);
+  const float fRatioUpper   = TOLERANT_BALANCING_RATIO_UPPER (iSliceNum);
+#endif//USE_RMSE_SLICE_COMPLEXITY_RATIO_FOR_BALANCING
+  uint32_t* pSliceConsume   = (uint32_t*)pConsumeTime;
+  uint32_t uiTotalConsume   = 0;
+  int32_t iSliceIdx         = 0;
+  int32_t iNeedAdj          = false;
+#if defined(X86_ASM)
+  WelsEmms();
+#endif //X86_ASM
+  // Add one entry per pass so that an odd iSliceNum never reads pSliceConsume[iSliceNum].
+  for (iSliceIdx = 0; iSliceIdx < iSliceNum; ++ iSliceIdx) {
+    uiTotalConsume += pSliceConsume[iSliceIdx];
+  }
+  if (uiTotalConsume == 0) {
+#if defined(ENABLE_TRACE_MT)
+    WelsLog (NULL, WELS_LOG_DEBUG, "[MT] NeedDynamicAdjust(), herein do no adjust due first picture, iCountSliceNum= %d\n",
+             iSliceNum);
+#endif//ENABLE_TRACE_MT
+    return false;
+  }
+  iSliceIdx = 0;
+#if defined(USE_RMSE_SLICE_COMPLEXITY_RATIO_FOR_BALANCING)
+  float fThr                = EPSN;  // threshold for various cores cases
+  float fRmse               = .0f;   // root mean square error of pSlice consume ratios
+  const float kfMeanRatio   = 1.0f / iSliceNum;
+  do {
+    const float fRatio = 1.0f * pSliceConsume[iSliceIdx] / uiTotalConsume;
+    const float fDiffRatio = fRatio - kfMeanRatio;
+    fRmse += (fDiffRatio * fDiffRatio);
+    ++ iSliceIdx;
+  } while (iSliceIdx + 1 < iSliceNum);  // NOTE(review): last slice is skipped when iSliceNum > 1 - confirm intended
+  fRmse = sqrtf (fRmse / iSliceNum);
+  if (iSliceNum >= 8) {
+    fThr += THRESHOLD_RMSE_CORE8;
+  } else if (iSliceNum >= 4) {
+    fThr += THRESHOLD_RMSE_CORE4;
+  } else if (iSliceNum >= 2) {
+    fThr += THRESHOLD_RMSE_CORE2;
+  } else
+    fThr = 1.0f;
+  iNeedAdj = (fRmse > fThr);  // iNeedAdj is still false here, so this equals "set on exceed"
+#if defined(ENABLE_TRACE_MT)
+  WelsLog (NULL, WELS_LOG_DEBUG,
+           "[MT] NeedDynamicAdjust(), herein adjustment decision is made (iNeedAdj= %d) by: fRmse of pSlice complexity ratios %.6f, the corresponding threshold %.6f, iCountSliceNum %d\n",
+           iNeedAdj, fRmse, fThr, iSliceNum);
+#endif//ENABLE_TRACE_MT
+#else
+  do {
+    const float kfRatio = 1.0f * pSliceConsume[iSliceIdx] / uiTotalConsume;
+    if (kfRatio + EPSN < fRatioLower || kfRatio > fRatioUpper + EPSN) {
+#if defined(ENABLE_TRACE_MT)
+      WelsLog (NULL, WELS_LOG_DEBUG,
+               "[MT] NeedDynamicAdjust(), herein adjustment decision is made by pSlice consume time not balanced at all, uiSliceIdx= %d, comp_ratio= %.6f, pSliceConsumeTime= %d, total_consume_time= %d, iCountSliceNum= %d\n",
+               iSliceIdx, kfRatio, pSliceConsume[iSliceIdx], uiTotalConsume, iSliceNum);
+#endif//ENABLE_TRACE_MT
+      iNeedAdj = true;
+      break;
+    }
+    ++ iSliceIdx;
+  } while (iSliceIdx + 1 < iSliceNum);  // NOTE(review): last slice is skipped when iSliceNum > 1 - confirm intended
+#endif//USE_RMSE_SLICE_COMPLEXITY_RATIO_FOR_BALANCING
+  return iNeedAdj;
+}
+#endif//..
+
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+/*!
+ * \brief   Re-partition the macroblocks of a frame over its slices according to per-slice ratios,
+ *          then signal the worker threads to refresh their MB lists and wait until all are done.
+ *
+ * \param   pCtx           encoder context (coding parameters, rate-control state, threading events)
+ * \param   pCurDqLayer    layer whose slice context (pSliceEncCtx) is re-partitioned
+ * \param   pComplexRatio  read as float[], one ratio per slice; the last slice is not sized from
+ *                         its ratio, it simply receives whatever macroblocks remain
+ * \param   iCurDid        dependency id selecting the rate-control state; also used in log output
+ *
+ * The function returns without changing anything when rate control reports a non-positive GOM
+ * size, when GOM size times slice count already covers the frame, when the slice count is below 2
+ * or odd, or when the running remainder of macroblocks drops to zero or below.
+ */
+void DynamicAdjustSlicing (sWelsEncCtx* pCtx,
+                           SDqLayer* pCurDqLayer,
+                           void* pComplexRatio,
+                           int32_t iCurDid) {
+  SSliceCtx* pSliceEncCtx           = pCurDqLayer->pSliceEncCtx;
+  const int32_t kiSliceNum          = pSliceEncCtx->iSliceNumInFrame;
+  const int32_t kiFrameMbNum        = pSliceEncCtx->iMbNumInFrame;
+  float* pRatio                     = (float*)pComplexRatio;
+  int32_t iLowerMbNum               = pSliceEncCtx->iMbWidth; // one MB row per slice unless RC provides a GOM size
+  int32_t iUpperMbNum               = 0;                      // recomputed after every assignment
+  int32_t iRemainingMb              = kiFrameMbNum;
+  int32_t iSliceRun[MAX_THREADS_NUM] = {0};
+  int32_t iSlice                    = 0;
+  int32_t iGomMbNum;                                          // only meaningful when RC is enabled
+
+  if (pCtx->pSvcParam->bEnableRc) {
+    iGomMbNum = pCtx->pWelsSvcRc[iCurDid].iNumberMbGom;
+
+    if (iGomMbNum <= 0) {
+      WelsLog (pCtx, WELS_LOG_ERROR,
+               "[MT] DynamicAdjustSlicing(), invalid iNumMbInEachGom= %d from RC, iDid= %d, iCountNumMb= %d\n", iGomMbNum,
+               iCurDid, kiFrameMbNum);
+      return;
+    }
+
+    // one GOM per slice already uses up the frame: no room left to shift MBs between slices
+    if (iGomMbNum * kiSliceNum >= kiFrameMbNum) {
+      return;
+    }
+    iLowerMbNum = iGomMbNum;
+  }
+
+  // an even slice count of at least 2 is required
+  if (kiSliceNum < 2 || (kiSliceNum & 0x01))
+    return;
+
+  // largest run the first slice may take while every other slice still gets the minimum
+  iUpperMbNum = kiFrameMbNum - (kiSliceNum - 1) * iLowerMbNum;
+
+#if defined(X86_ASM)
+  WelsEmms();
+#endif //X86_ASM
+
+#if defined(ENABLE_TRACE_MT)
+  WelsLog (pCtx, WELS_LOG_DEBUG, "[MT] DynamicAdjustSlicing(), iDid= %d, iCountNumMb= %d\n", iCurDid, kiFrameMbNum);
+#endif//ENABLE_TRACE_MT
+
+  for (iSlice = 0; iSlice + 1 < kiSliceNum; ++ iSlice) {
+    int32_t iAssign = (int32_t) (kiFrameMbNum * pRatio[iSlice] + EPSN);
+
+    // round to the nearest whole number of GOMs when rate control is active
+    if (pCtx->pSvcParam->bEnableRc) {
+      iAssign = (int32_t) (1.0f * iAssign / iGomMbNum + 0.5f + EPSN) * iGomMbNum;
+    }
+
+    // keep the run inside [iLowerMbNum, iUpperMbNum]; the lower bound takes precedence
+    if (iAssign < iLowerMbNum) {
+      iAssign = iLowerMbNum;
+    } else if (iAssign > iUpperMbNum) {
+      iAssign = iUpperMbNum;
+    }
+
+    assert (iAssign > 0);
+
+    iRemainingMb -= iAssign;
+    if (iRemainingMb <= 0) {  // nothing would be left for the following slices: give up this time
+      assert (0);
+      return;
+    }
+    iSliceRun[iSlice] = iAssign;
+#if defined(ENABLE_TRACE_MT)
+    WelsLog (pCtx, WELS_LOG_DEBUG,
+             "[MT] DynamicAdjustSlicing(), uiSliceIdx= %d, pSliceComplexRatio= %.2f, slice_run_org= %d, slice_run_adj= %d\n",
+             iSlice, pRatio[iSlice], pSliceEncCtx->pCountMbNumInSlice[iSlice], iAssign);
+#endif//ENABLE_TRACE_MT
+    // upper bound for the next slice: leave the minimum for every slice after it
+    iUpperMbNum = iRemainingMb - (kiSliceNum - iSlice - 2) * iLowerMbNum;
+  }
+  // the last slice takes all macroblocks that are left
+  iSliceRun[iSlice] = iRemainingMb;
+#if defined(ENABLE_TRACE_MT)
+  WelsLog (pCtx, WELS_LOG_DEBUG,
+           "[MT] DynamicAdjustSlicing(), iSliceIdx= %d, pSliceComplexRatio= %.2f, slice_run_org= %d, slice_run_adj= %d\n",
+           iSlice, pRatio[iSlice], pSliceEncCtx->pCountMbNumInSlice[iSlice], iRemainingMb);
+#endif//ENABLE_TRACE_MT
+
+
+  if (DynamicAdjustSlicePEncCtxAll (pSliceEncCtx, iSliceRun) == 0) {
+    const int32_t kiWorkerNum = pCtx->pSvcParam->iCountThreadsNum;
+    int32_t iWorker           = 0;
+
+    // wake every worker so it rebuilds its MB list (at least one signal is always sent)
+    do {
+#ifdef WIN32
+      WelsEventSignal (&pCtx->pSliceThreading->pUpdateMbListEvent[iWorker]);
+#else
+      WelsEventSignal (pCtx->pSliceThreading->pUpdateMbListEvent[iWorker]);
+#endif//WIN32
+      ++ iWorker;
+    } while (iWorker < kiWorkerNum);
+
+    // block until all workers reported completion
+    WelsMultipleEventsWaitAllBlocking (kiWorkerNum, &pCtx->pSliceThreading->pFinUpdateMbListEvent[0]);
+  }
+}
+#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+
+#ifdef PACKING_ONE_SLICE_PER_LAYER
+/*!
+ * \brief   Rewind the bitstream position (uiBsPos) of every per-slice buffer to 0.
+ * \param   pCtx  encoder context; entries [0, iMaxSliceCount) of pSliceBs are reset
+ */
+void ResetEnvMt (sWelsEncCtx* pCtx) {
+  const int16_t kiMaxSliceNum = pCtx->iMaxSliceCount;
+
+  for (int32_t iSlice = 0; iSlice < kiMaxSliceNum; ++ iSlice) {
+    pCtx->pSliceBs[iSlice].uiBsPos = 0;
+  }
+}
+#endif//PACKING_ONE_SLICE_PER_LAYER
+
+/*!
+ * \brief   Allocate and initialize the resources used by the slice multi-threading path.
+ *
+ * Allocates the SSliceThreading block, the per-thread private data and thread handles, the
+ * synchronization events (allocated and initialized on WIN32, opened by name elsewhere), the
+ * optional per-layer slice timing / complexity-ratio tables, the slice-number mutex, and one
+ * bitstream buffer per slice.
+ *
+ * \param   ppCtx                 address of the encoder context pointer; must not be NULL or point to NULL
+ * \param   pCodingParam          coding parameters (layer count, thread count, slice options)
+ * \param   iCountBsLen           offset into (*ppCtx)->pFrameBs where the pBs region of slice 1
+ *                                starts; must be > 0
+ * \param   iTargetSpatialBsSize  size in bytes of each per-slice bitstream buffer
+ *
+ * \return  0 on success; 1 when an argument is invalid. Allocation failures go through
+ *          WELS_VERIFY_RETURN_PROC_IF (1, ..., FreeMemorySvc (ppCtx)), which presumably frees
+ *          the context and returns 1 -- macro is defined elsewhere, confirm.
+ */
+int32_t RequestMtResource (sWelsEncCtx** ppCtx, SWelsSvcCodingParam* pCodingParam, const int32_t iCountBsLen,
+                           const int32_t iTargetSpatialBsSize) {
+  CMemoryAlign* pMa             = NULL;
+  SWelsSvcCodingParam* pPara    = NULL;
+  SSliceThreading* pSmt         = NULL;
+  SWelsSliceBs* pSliceB         = NULL;
+  uint8_t* pBsBase              = NULL;
+  int32_t iNumSpatialLayers     = 0;
+  int32_t iThreadNum            = 0;
+  int32_t iIdx                  = 0;
+  int32_t iSliceBsBufferSize    = 0;
+  int16_t iMaxSliceNum          = 1;
+
+  if (NULL == ppCtx || NULL == pCodingParam || NULL == *ppCtx || iCountBsLen <= 0)
+    return 1;
+
+  pMa   = (*ppCtx)->pMemAlign;
+  pPara = pCodingParam;
+  iNumSpatialLayers = pPara->iNumDependencyLayer;
+  iThreadNum        = pPara->iCountThreadsNum;
+  iMaxSliceNum      = (*ppCtx)->iMaxSliceCount;
+
+  // top-level threading block and the per-thread tables
+  pSmt  = (SSliceThreading*)pMa->WelsMalloc (sizeof (SSliceThreading), "SSliceThreading");
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pSmt), FreeMemorySvc (ppCtx))
+  (*ppCtx)->pSliceThreading = pSmt;
+  pSmt->pThreadPEncCtx  = (SSliceThreadPrivateData*)pMa->WelsMalloc (sizeof (SSliceThreadPrivateData) * iThreadNum,
+                          "pThreadPEncCtx");
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pSmt->pThreadPEncCtx), FreeMemorySvc (ppCtx))
+  pSmt->pThreadHandles  = (WELS_THREAD_HANDLE*)pMa->WelsMalloc (sizeof (WELS_THREAD_HANDLE) * iThreadNum,
+                          "pThreadHandles");
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pSmt->pThreadHandles), FreeMemorySvc (ppCtx))
+
+#ifdef WIN32
+  pSmt->pSliceCodedEvent  = (WELS_EVENT*)pMa->WelsMalloc (sizeof (WELS_EVENT) * iThreadNum, "pSliceCodedEvent");
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pSmt->pSliceCodedEvent), FreeMemorySvc (ppCtx))
+  pSmt->pReadySliceCodingEvent  = (WELS_EVENT*)pMa->WelsMalloc (sizeof (WELS_EVENT) * iThreadNum,
+                                  "pReadySliceCodingEvent");
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pSmt->pReadySliceCodingEvent), FreeMemorySvc (ppCtx))
+  pSmt->pFinSliceCodingEvent  = (WELS_EVENT*)pMa->WelsMalloc (sizeof (WELS_EVENT) * iThreadNum, "pFinSliceCodingEvent");
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pSmt->pFinSliceCodingEvent), FreeMemorySvc (ppCtx))
+#endif//WIN32
+
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+#if defined(__GNUC__)
+  pSmt->pUpdateMbListThrdHandles  = (WELS_THREAD_HANDLE*)pMa->WelsMalloc (sizeof (WELS_THREAD_HANDLE) * iThreadNum,
+                                    "pUpdateMbListThrdHandles");
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pSmt->pUpdateMbListThrdHandles), FreeMemorySvc (ppCtx))
+#endif//__GNUC__
+#ifdef WIN32
+  pSmt->pUpdateMbListEvent  = (WELS_EVENT*)pMa->WelsMalloc (sizeof (WELS_EVENT) * iThreadNum, "pUpdateMbListEvent");
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pSmt->pUpdateMbListEvent), FreeMemorySvc (ppCtx))
+  pSmt->pFinUpdateMbListEvent = (WELS_EVENT*)pMa->WelsMalloc (sizeof (WELS_EVENT) * iThreadNum, "pFinUpdateMbListEvent");
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pSmt->pFinUpdateMbListEvent), FreeMemorySvc (ppCtx))
+#endif//WIN32
+#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+
+#ifdef WIN32
+  pSmt->pExitEncodeEvent  = (WELS_EVENT*)pMa->WelsMalloc (sizeof (WELS_EVENT) * iThreadNum, "pExitEncodeEvent");
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pSmt->pExitEncodeEvent), FreeMemorySvc (ppCtx))
+#endif//WIN32
+
+#if defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG)
+  // per-layer timing (and ratio) tables exist only for layers coded with a fixed slice number
+  // that does not exceed iMultipleThreadIdc; every other layer gets NULL entries
+  iIdx = 0;
+  while (iIdx < iNumSpatialLayers) {
+    SMulSliceOption* pMso     = &pPara->sDependencyLayers[iIdx].sMso;
+    const int32_t kiSliceNum  = pMso->sSliceArgument.iSliceNum;
+    if (pMso->uiSliceMode == SM_FIXEDSLCNUM_SLICE && pPara->iMultipleThreadIdc > 1
+        && pPara->iMultipleThreadIdc >= kiSliceNum) {
+      pSmt->pSliceConsumeTime[iIdx] = (uint32_t*)pMa->WelsMallocz (kiSliceNum * sizeof (uint32_t), "pSliceConsumeTime[]");
+      WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pSmt->pSliceConsumeTime[iIdx]), FreeMemorySvc (ppCtx))
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+      pSmt->pSliceComplexRatio[iIdx]  = (float*)pMa->WelsMalloc (kiSliceNum * sizeof (float), "pSliceComplexRatio[]");
+      WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pSmt->pSliceComplexRatio[iIdx]), FreeMemorySvc (ppCtx))
+#endif//TRY_SLICING_BALANCE
+    } else {
+      pSmt->pSliceConsumeTime[iIdx] = NULL;
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+      pSmt->pSliceComplexRatio[iIdx]  = NULL;
+#endif//TRY_SLICING_BALANCE
+    }
+    ++ iIdx;
+  }
+  // NULL for pSliceConsumeTime[iIdx]: iIdx from iNumSpatialLayers to MAX_DEPENDENCY_LAYERS
+#endif//#if defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG)
+
+#ifdef MT_DEBUG
+  // file handle for MT debug
+  pSmt->pFSliceDiff = NULL;
+
+  // never taken: the handle was set to NULL just above (kept from the original code)
+  if (pSmt->pFSliceDiff) {
+    fclose (pSmt->pFSliceDiff);
+    pSmt->pFSliceDiff = NULL;
+  }
+#ifdef WIN32
+  pSmt->pFSliceDiff = fopen (".\\slice_time.txt", "wt+");
+#else
+  pSmt->pFSliceDiff = fopen ("/tmp/slice_time.txt", "wt+");
+#endif//WIN32
+#endif//MT_DEBUG
+
+#if defined(ENABLE_TRACE_MT)
+  WelsLog ((*ppCtx), WELS_LOG_INFO, "encpEncCtx= 0x%p\n", (void*) (*ppCtx));
+#endif//ENABLE_TRACE_MT
+
+  // per-thread setup: private data, thread handle, and the synchronization events
+  iIdx = 0;
+  while (iIdx < iThreadNum) {
+#ifdef __GNUC__ // for posix threading
+    str_t name[SEM_NAME_MAX] = {0};
+    int32_t used_len = 0;
+    WELS_THREAD_ERROR_CODE err = 0;
+#endif//__GNUC__
+    pSmt->pThreadPEncCtx[iIdx].pWelsPEncCtx = (void*) (*ppCtx);
+    pSmt->pThreadPEncCtx[iIdx].iSliceIndex  = iIdx;
+    pSmt->pThreadPEncCtx[iIdx].iThreadIndex = iIdx;
+    pSmt->pThreadHandles[iIdx]              = 0;
+
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+#ifdef WIN32
+    WelsEventInit (&pSmt->pUpdateMbListEvent[iIdx]);
+    WelsEventInit (&pSmt->pFinUpdateMbListEvent[iIdx]);
+#else
+    // length of semaphore name should be system constrained at least on mac 10.7
+    // event names are "<2-letter tag><thread index><context address>"
+    SNPRINTF (name, SEM_NAME_MAX, "ud%d%p", iIdx, (void*) (*ppCtx));
+    err = WelsEventOpen (&pSmt->pUpdateMbListEvent[iIdx], name);
+#if defined(ENABLE_TRACE_MT)
+    WelsLog ((*ppCtx), WELS_LOG_INFO, "[MT] Open pUpdateMbListEvent%d named(%s) ret%d err%d\n", iIdx, name, err, errno);
+#endif
+    used_len = SNPRINTF (name, SEM_NAME_MAX, "fu%d%p", iIdx, (void*) (*ppCtx));
+    name[used_len] = '\0';
+    err = WelsEventOpen (&pSmt->pFinUpdateMbListEvent[iIdx], name);
+#if defined(ENABLE_TRACE_MT)
+    WelsLog ((*ppCtx), WELS_LOG_INFO, "[MT] Open pFinUpdateMbListEvent%d named(%s) ret%d err%d\n", iIdx, name, err, errno);
+#endif
+#endif//WIN32
+#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+
+#ifdef WIN32
+    WelsEventInit (&pSmt->pSliceCodedEvent[iIdx]);
+    WelsEventInit (&pSmt->pReadySliceCodingEvent[iIdx]);
+    WelsEventInit (&pSmt->pFinSliceCodingEvent[iIdx]);
+    WelsEventInit (&pSmt->pExitEncodeEvent[iIdx]);
+#else
+    used_len = SNPRINTF (name, SEM_NAME_MAX, "sc%d%p", iIdx, (void*) (*ppCtx));
+    name[used_len] = '\0';
+    err = WelsEventOpen (&pSmt->pSliceCodedEvent[iIdx], name);
+#if defined(ENABLE_TRACE_MT)
+    WelsLog ((*ppCtx), WELS_LOG_INFO, "[MT] Open pSliceCodedEvent%d named(%s) ret%d err%d\n", iIdx, name, err, errno);
+#endif
+    used_len = SNPRINTF (name, SEM_NAME_MAX, "rc%d%p", iIdx, (void*) (*ppCtx));
+    name[used_len] = '\0';
+    err = WelsEventOpen (&pSmt->pReadySliceCodingEvent[iIdx], name);
+#if defined(ENABLE_TRACE_MT)
+    // arguments follow the format: index, event pointer, event name, return code, errno
+    WelsLog ((*ppCtx), WELS_LOG_INFO, "[MT] Open pReadySliceCodingEvent%d = 0x%p named(%s) ret%d err%d\n", iIdx,
+             (void*)pSmt->pReadySliceCodingEvent[iIdx], name, err, errno);
+#endif
+#endif//WIN32
+
+    ++ iIdx;
+  }
+
+#ifdef PACKING_ONE_SLICE_PER_LAYER
+  pSmt->pCountBsSizeInPartition = (uint32_t*)pMa->WelsMalloc (sizeof (uint32_t) * iThreadNum, "pCountBsSizeInPartition");
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pSmt->pCountBsSizeInPartition), FreeMemorySvc (ppCtx))
+#endif//PACKING_ONE_SLICE_PER_LAYER
+
+  WelsMutexInit (&pSmt->mutexSliceNumUpdate);
+
+  (*ppCtx)->pSliceBs  = (SWelsSliceBs*)pMa->WelsMalloc (sizeof (SWelsSliceBs) * iMaxSliceNum, "pSliceBs");
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pSliceBs), FreeMemorySvc (ppCtx))
+
+  // slices 1..N-1 get consecutive pBs regions of iSliceBsBufferSize bytes inside pFrameBs,
+  // starting at offset iCountBsLen; slice 0 has no pBs region of its own (NULL)
+  pBsBase             = (*ppCtx)->pFrameBs + iCountBsLen;
+  pSliceB             = (*ppCtx)->pSliceBs;
+  iSliceBsBufferSize  = iTargetSpatialBsSize;
+  iIdx = 0;
+  while (iIdx < iMaxSliceNum) {
+    pSliceB->pBsBuffer  = (uint8_t*)pMa->WelsMalloc (iSliceBsBufferSize, "pSliceB->pBsBuffer");
+
+    WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pSliceB->pBsBuffer), FreeMemorySvc (ppCtx))
+    pSliceB->uiSize = iSliceBsBufferSize;
+
+    if (iIdx > 0) {
+      pSliceB->pBs      = pBsBase;
+      pSliceB->uiBsPos  = 0;
+      pBsBase           += iSliceBsBufferSize;
+    } else {
+      pSliceB->pBs      = NULL;
+      pSliceB->uiBsPos  = 0;
+    }
+    ++ pSliceB;
+    ++ iIdx;
+  }
+
+#if defined(ENABLE_TRACE_MT)
+  WelsLog ((*ppCtx), WELS_LOG_INFO, "RequestMtResource(), iThreadNum=%d, iCountSliceNum= %d\n", pPara->iCountThreadsNum,
+           iMaxSliceNum);
+#endif
+
+  return 0;
+}
+
+/*!
+ * \brief   Release the slice multi-threading resources attached to the encoder context.
+ *
+ * Destroys/closes the per-thread events (and, on WIN32, the thread handles), frees the event and
+ * per-thread tables, the per-slice bitstream buffers, the per-layer timing/ratio tables, and
+ * finally the SSliceThreading block itself, setting the freed pointers to NULL.
+ *
+ * \param   ppCtx  address of the encoder context pointer; the call is a no-op when ppCtx or
+ *                 *ppCtx is NULL, or when no SSliceThreading block is attached
+ */
+void ReleaseMtResource (sWelsEncCtx** ppCtx) {
+  SWelsSliceBs* pSliceB			= NULL;
+  SWelsSvcCodingParam* pCodingParam	= NULL;
+  SSliceThreading* pSmt			= NULL;
+  CMemoryAlign* pMa				= NULL;
+  int32_t iIdx						= 0;
+  int32_t iThreadNum				= 0;
+  int16_t uiSliceNum				= 0;
+
+  if (NULL == ppCtx || NULL == *ppCtx)
+    return;
+
+  pMa			= (*ppCtx)->pMemAlign;
+  pCodingParam		= (*ppCtx)->pSvcParam;
+  uiSliceNum	= (*ppCtx)->iMaxSliceCount;
+  iThreadNum	= (*ppCtx)->pSvcParam->iCountThreadsNum;
+  pSmt		= (*ppCtx)->pSliceThreading;
+
+  if (NULL == pSmt)
+    return;
+
+  // per-thread teardown of thread handles (WIN32 only) and synchronization events
+  while (iIdx < iThreadNum) {
+#ifdef WIN32
+    if (pSmt->pThreadHandles != NULL && pSmt->pThreadHandles[iIdx] != NULL)
+      WelsThreadDestroy (&pSmt->pThreadHandles[iIdx]);
+
+    if (pSmt->pSliceCodedEvent != NULL)
+      WelsEventDestroy (&pSmt->pSliceCodedEvent[iIdx]);
+    if (pSmt->pReadySliceCodingEvent != NULL)
+      WelsEventDestroy (&pSmt->pReadySliceCodingEvent[iIdx]);
+    if (pSmt->pFinSliceCodingEvent != NULL)
+      WelsEventDestroy (&pSmt->pFinSliceCodingEvent[iIdx]);
+    if (pSmt->pExitEncodeEvent != NULL)
+      WelsEventDestroy (&pSmt->pExitEncodeEvent[iIdx]);
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+    if (pSmt->pUpdateMbListEvent != NULL)
+      WelsEventDestroy (&pSmt->pUpdateMbListEvent[iIdx]);
+    if (pSmt->pFinUpdateMbListEvent != NULL)
+      WelsEventDestroy (&pSmt->pFinUpdateMbListEvent[iIdx]);
+#endif//DYNAMIC_SLICE_ASSIGN && TRY_SLICING_BALANCE
+#else
+    // rebuild each event name as "<2-letter tag><thread index><context address>" and close it
+    str_t ename[SEM_NAME_MAX] = {0};
+    int32_t used_len = 0;
+    // length of semaphore name should be system constrained at least on mac 10.7
+    SNPRINTF (ename, SEM_NAME_MAX, "sc%d%p", iIdx, (void*) (*ppCtx));
+    WelsEventClose (pSmt->pSliceCodedEvent[iIdx], ename);
+    used_len = SNPRINTF (ename, SEM_NAME_MAX, "rc%d%p", iIdx, (void*) (*ppCtx));
+    ename[used_len] = '\0';
+    WelsEventClose (pSmt->pReadySliceCodingEvent[iIdx], ename);
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+    used_len = SNPRINTF (ename, SEM_NAME_MAX, "ud%d%p", iIdx, (void*) (*ppCtx));
+    ename[used_len] = '\0';
+    WelsEventClose (pSmt->pUpdateMbListEvent[iIdx], ename);
+    used_len = SNPRINTF (ename, SEM_NAME_MAX, "fu%d%p", iIdx, (void*) (*ppCtx));
+    ename[used_len] = '\0';
+    WelsEventClose (pSmt->pFinUpdateMbListEvent[iIdx], ename);
+#endif//DYNAMIC_SLICE_ASSIGN && TRY_SLICING_BALANCE
+#endif//WIN32		
+
+    ++ iIdx;
+  }
+
+#ifdef WIN32
+  // the event tables themselves are heap blocks only in the WIN32 build
+  if (pSmt->pExitEncodeEvent != NULL) {
+    pMa->WelsFree (pSmt->pExitEncodeEvent, "pExitEncodeEvent");
+    pSmt->pExitEncodeEvent = NULL;
+  }
+  if (pSmt->pSliceCodedEvent != NULL) {
+    pMa->WelsFree (pSmt->pSliceCodedEvent, "pSliceCodedEvent");
+    pSmt->pSliceCodedEvent = NULL;
+  }
+  if (pSmt->pReadySliceCodingEvent != NULL) {
+    pMa->WelsFree (pSmt->pReadySliceCodingEvent, "pReadySliceCodingEvent");
+    pSmt->pReadySliceCodingEvent = NULL;
+  }
+  if (pSmt->pFinSliceCodingEvent != NULL) {
+    pMa->WelsFree (pSmt->pFinSliceCodingEvent, "pFinSliceCodingEvent");
+    pSmt->pFinSliceCodingEvent = NULL;
+  }
+#endif//WIN32
+
+#ifdef PACKING_ONE_SLICE_PER_LAYER
+  if (NULL != pSmt->pCountBsSizeInPartition) {
+    pMa->WelsFree (pSmt->pCountBsSizeInPartition, "pCountBsSizeInPartition");
+    pSmt->pCountBsSizeInPartition = NULL;
+  }
+#endif//PACKING_ONE_SLICE_PER_LAYER
+  WelsMutexDestroy (&pSmt->mutexSliceNumUpdate);
+
+  if (pSmt->pThreadPEncCtx != NULL) {
+    pMa->WelsFree (pSmt->pThreadPEncCtx, "pThreadPEncCtx");
+    pSmt->pThreadPEncCtx = NULL;
+  }
+  if (pSmt->pThreadHandles != NULL) {
+    pMa->WelsFree (pSmt->pThreadHandles, "pThreadHandles");
+    pSmt->pThreadHandles = NULL;
+  }
+
+  // free every per-slice bitstream buffer first, then the slice table that holds them
+  pSliceB = (*ppCtx)->pSliceBs;
+  iIdx = 0;
+  while (pSliceB != NULL && iIdx < uiSliceNum) {
+    if (pSliceB->pBsBuffer) {
+      pMa->WelsFree (pSliceB->pBsBuffer, "pSliceB->pBsBuffer");
+      pSliceB->pBsBuffer = NULL;
+      pSliceB->uiSize = 0;
+    }
+    ++ iIdx;
+    ++ pSliceB;
+  }
+  if ((*ppCtx)->pSliceBs != NULL) {
+    pMa->WelsFree ((*ppCtx)->pSliceBs, "pSliceBs");
+    (*ppCtx)->pSliceBs = NULL;
+  }
+#if defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG)
+  // per-layer slice timing tables (and complexity ratios when slicing balance is compiled in)
+  if (pSmt->pSliceConsumeTime != NULL) {
+    iIdx = 0;
+    while (iIdx < pCodingParam->iNumDependencyLayer) {
+      if (pSmt->pSliceConsumeTime[iIdx]) {
+        pMa->WelsFree (pSmt->pSliceConsumeTime[iIdx], "pSliceConsumeTime[]");
+        pSmt->pSliceConsumeTime[iIdx] = NULL;
+      }
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+      if (pSmt->pSliceComplexRatio[iIdx] != NULL) {
+        pMa->WelsFree (pSmt->pSliceComplexRatio[iIdx], "pSliceComplexRatio[]");
+        pSmt->pSliceComplexRatio[iIdx] = NULL;
+      }
+#endif//TRY_SLICING_BALANCE
+      ++ iIdx;
+    }
+  }
+#endif//#if defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG)
+
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+
+#ifdef WIN32
+  if (pSmt->pUpdateMbListEvent != NULL) {
+    pMa->WelsFree (pSmt->pUpdateMbListEvent, "pUpdateMbListEvent");
+    pSmt->pUpdateMbListEvent = NULL;
+  }
+  if (pSmt->pFinUpdateMbListEvent != NULL) {
+    pMa->WelsFree (pSmt->pFinUpdateMbListEvent, "pFinUpdateMbListEvent");
+    pSmt->pFinUpdateMbListEvent = NULL;
+  }
+#else
+  if (pSmt->pUpdateMbListThrdHandles) {
+    pMa->WelsFree (pSmt->pUpdateMbListThrdHandles, "pUpdateMbListThrdHandles");
+    pSmt->pUpdateMbListThrdHandles = NULL;
+  }
+#endif//WIN32
+
+#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+
+#ifdef MT_DEBUG
+  // file handle for debug
+  if (pSmt->pFSliceDiff) {
+    fclose (pSmt->pFSliceDiff);
+    pSmt->pFSliceDiff = NULL;
+  }
+#endif//MT_DEBUG
+  // last: the threading block itself
+  pMa->WelsFree ((*ppCtx)->pSliceThreading, "SSliceThreading");
+  (*ppCtx)->pSliceThreading = NULL;
+}
+
+/*!
+ * \brief   Append the coded slice bitstreams to the frame bitstream buffer and record their
+ *          NAL lengths in the layer info.
+ *
+ * The bitstream of slice 0 (or, in SM_DYN_SLICE mode, of every slice belonging to partition 0)
+ * is only counted in the returned size and not copied; all other non-empty slices are copied to
+ * pCtx->pFrameBs at pCtx->iPosBsBuffer, which is advanced accordingly.
+ *
+ * \param   pCtx         encoder context (slice bitstreams, frame buffer, current layer)
+ * \param   pLbi         layer bitstream info; iNalLengthInByte[] and iNalCount are extended
+ * \param   iSliceCount  number of slices, or number of partitions in SM_DYN_SLICE mode
+ * \return  accumulated size in bytes of all slice bitstreams visited
+ */
+int32_t AppendSliceToFrameBs (sWelsEncCtx* pCtx, SLayerBSInfo* pLbi, const int32_t iSliceCount) {
+  SDLayerParam* pLayerParam = &pCtx->pSvcParam->sDependencyLayers[pCtx->uiDependencyId];
+  SWelsSliceBs* pCurBs      = NULL;
+  int32_t iTotalSize        = 0;
+  int32_t iNalWritePos      = pLbi->iNalCount;  // next free entry of pLbi->iNalLengthInByte[]
+
+  if (pLayerParam->sMso.uiSliceMode != SM_DYN_SLICE) {
+    pCurBs      = &pCtx->pSliceBs[0];
+    iTotalSize  = pCurBs->uiBsPos;  // slice 0 is already in pFrameBs, only its size is counted
+
+    for (int32_t iSlice = 1; iSlice < iSliceCount; ++ iSlice) {
+      ++ pCurBs;
+      if (pCurBs == NULL || ! (pCurBs->uiBsPos > 0))
+        continue;
+
+      const int32_t kiNalNum = pCurBs->iNalIndex;
+
+#if MT_DEBUG_BS_WR
+      assert (pCurBs->bSliceCodedFlag);
+#endif//MT_DEBUG_BS_WR
+
+      memmove (pCtx->pFrameBs + pCtx->iPosBsBuffer, pCurBs->pBs, pCurBs->uiBsPos);	// confirmed_safe_unsafe_usage
+      pCtx->iPosBsBuffer += pCurBs->uiBsPos;
+
+      iTotalSize += pCurBs->uiBsPos;
+
+      for (int32_t iNal = 0; iNal < kiNalNum; ++ iNal) {
+        pLbi->iNalLengthInByte[iNalWritePos + iNal] = pCurBs->iNalLen[iNal];
+      }
+      pLbi->iNalCount += kiNalNum;
+      iNalWritePos    += kiNalNum;
+    }
+  } else {  // SM_DYN_SLICE: slices of one partition are iSliceCount entries apart in pSliceBs[]
+    for (int32_t iPartition = 0; iPartition < iSliceCount; ++ iPartition) {
+      const int32_t kiCodedNum = pCtx->pCurDqLayer->pNumSliceCodedOfPartition[iPartition];
+      int32_t iSlice           = iPartition;
+
+      for (int32_t iCoded = 0; iCoded < kiCodedNum; ++ iCoded, iSlice += iSliceCount) {
+        pCurBs = &pCtx->pSliceBs[iSlice];
+        if (pCurBs == NULL || ! (pCurBs->uiBsPos > 0))
+          continue;
+
+        if (iPartition == 0) {
+          // partition 0 has been written to the frame buffer already: count its size only
+          iTotalSize += pCurBs->uiBsPos;
+          continue;
+        }
+
+        const int32_t kiNalNum = pCurBs->iNalIndex;
+
+        memmove (pCtx->pFrameBs + pCtx->iPosBsBuffer, pCurBs->pBs, pCurBs->uiBsPos);	// confirmed_safe_unsafe_usage
+        pCtx->iPosBsBuffer += pCurBs->uiBsPos;
+
+        iTotalSize += pCurBs->uiBsPos;
+
+        for (int32_t iNal = 0; iNal < kiNalNum; ++ iNal) {
+          pLbi->iNalLengthInByte[iNalWritePos + iNal] = pCurBs->iNalLen[iNal];
+        }
+        pLbi->iNalCount += kiNalNum;
+        iNalWritePos    += kiNalNum;
+      }
+    }
+  }
+
+  return iTotalSize;
+}
+
+/*!
+ * \brief   Encode the NAL units of one slice into a destination buffer and record their lengths
+ *          and the layer attributes in the layer bitstream info.
+ *
+ * \param   pCtx            encoder context (slice bitstreams, current layer NAL header extension)
+ * \param   pLbi            layer bitstream info to fill (NAL lengths, NAL count, layer ids)
+ * \param   pFrameBsBuffer  destination the encoded NAL units are written to, back to back
+ * \param   iSliceIdx       index of the slice in pCtx->pSliceBs[]
+ * \return  sum of the values returned by WelsEncodeNalExt() for the slice's NAL units
+ *
+ * Without PACKING_ONE_SLICE_PER_LAYER, slice 0 (re)initializes the layer info and starts the NAL
+ * list at entry 0, while any other slice appends after the NALs already counted in pLbi.
+ *
+ * \note    The per-NAL length is kept in a scalar. The former fixed int32_t[2] stack array was
+ *          indexed by the NAL index without a bound check and overflowed when a slice carried
+ *          more than two NAL units.
+ */
+int32_t WriteSliceToFrameBs (sWelsEncCtx* pCtx, SLayerBSInfo* pLbi, uint8_t* pFrameBsBuffer, const int32_t iSliceIdx) {
+  SWelsSliceBs* pSliceBs        = &pCtx->pSliceBs[iSliceIdx];
+  SNalUnitHeaderExt* pNalHdrExt = &pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt;
+  uint8_t* pDst                 = pFrameBsBuffer;
+  int32_t iNalLen               = 0;  // length of the NAL unit encoded in the current iteration
+  int32_t iSliceSize            = 0;
+  const int32_t kiNalCnt        = pSliceBs->iNalIndex;
+  int32_t iNalIdx               = 0;
+#if !defined(PACKING_ONE_SLICE_PER_LAYER)
+  const int32_t iFirstSlice     = (iSliceIdx == 0);
+  int32_t iNalBase              = iFirstSlice ? 0 : pLbi->iNalCount;
+#else
+  int32_t iNalBase              = 0;
+#endif//!PACKING_ONE_SLICE_PER_LAYER
+
+  while (iNalIdx < kiNalCnt) {
+    iNalLen = 0;
+    iSliceSize += WelsEncodeNalExt (&pSliceBs->sNalList[iNalIdx], pNalHdrExt, pDst, &iNalLen);
+    pDst += iNalLen;
+    pLbi->iNalLengthInByte[iNalBase + iNalIdx] = iNalLen;
+
+    ++ iNalIdx;
+  }
+
+#if !defined(PACKING_ONE_SLICE_PER_LAYER)
+  pSliceBs->uiBsPos = iSliceSize;
+  if (iFirstSlice) {
+    // pBsBuffer has been updated at coding_slice_0_in_encoder_mother_thread()
+    pLbi->uiLayerType   = VIDEO_CODING_LAYER;
+    pLbi->uiSpatialId   = pNalHdrExt->uiDependencyId;
+    pLbi->uiTemporalId  = pNalHdrExt->uiTemporalId;
+    pLbi->uiQualityId   = 0;
+    pLbi->uiPriorityId  = 0;
+    pLbi->iNalCount     = kiNalCnt;
+  } else {
+    pLbi->iNalCount     += kiNalCnt;
+  }
+#else
+  pLbi->uiLayerType   = VIDEO_CODING_LAYER;
+  pLbi->uiSpatialId   = pNalHdrExt->uiDependencyId;
+  pLbi->uiTemporalId  = pNalHdrExt->uiTemporalId;
+  pLbi->uiQualityId   = 0;
+  pLbi->uiPriorityId  = 0;
+  pLbi->iNalCount     = kiNalCnt;
+#endif//PACKING_ONE_SLICE_PER_LAYER
+
+  return iSliceSize;
+}
+
+/*!
+ * \brief   Encode the NAL units of one slice into its own bitstream area.
+ *
+ * The length of every NAL unit is stored in the slice's iNalLen[] and the total size in uiBsPos.
+ *
+ * \param   pCtx         encoder context (slice bitstreams, current layer NAL header extension)
+ * \param   pSliceBsBuf  destination the encoded NAL units are written to, back to back
+ * \param   iSliceIdx    index of the slice in pCtx->pSliceBs[]
+ * \return  sum of the values returned by WelsEncodeNalExt(); 0 when the slice holds more than
+ *          two NAL units (nothing is written in that case)
+ */
+int32_t WriteSliceBs (sWelsEncCtx* pCtx, uint8_t* pSliceBsBuf, const int32_t iSliceIdx) {
+  SWelsSliceBs* pCurSliceBs   = &pCtx->pSliceBs[iSliceIdx];
+  SNalUnitHeaderExt* pHdrExt  = &pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt;
+  const int32_t kiNalNum      = pCurSliceBs->iNalIndex;
+  uint8_t* pWritePos          = pSliceBsBuf;
+  int32_t iTotalSize          = 0;
+
+  // at most two NAL units per slice are supported here
+  assert (kiNalNum <= 2);
+  if (kiNalNum > 2)
+    return 0;
+
+  for (int32_t iNal = 0; iNal < kiNalNum; ++ iNal) {
+    int32_t* pLen = &pCurSliceBs->iNalLen[iNal];
+
+    iTotalSize += WelsEncodeNalExt (&pCurSliceBs->sNalList[iNal], pHdrExt, pWritePos, pLen);
+    pWritePos  += *pLen;
+  }
+  pCurSliceBs->uiBsPos = iTotalSize;
+
+  return iTotalSize;
+}
+
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+#if defined(__GNUC__)
+/*!
+ * \brief   Thread routine that refreshes the MB neighbor list of one slice on demand.
+ *
+ * The routine waits on pUpdateMbListEvent[thread index]; on every successful wait it calls
+ * UpdateMbListNeighborParallel() for its slice of the current layer and then signals
+ * pFinUpdateMbListEvent[thread index]. It leaves the loop only when the wait fails.
+ *
+ * \param   arg  SSliceThreadPrivateData* of this thread
+ * \return  via WELS_THREAD_ROUTINE_RETURN: 1 when arg is NULL or when waiting failed
+ */
+WELS_THREAD_ROUTINE_TYPE UpdateMbListThreadProc (void* arg) {
+  SSliceThreadPrivateData* pThreadData  = (SSliceThreadPrivateData*)arg;
+  sWelsEncCtx* pEncCtx                  = NULL;
+  SDqLayer* pLayer                      = NULL;
+  int32_t iSlice                        = -1;
+  int32_t iEvent                        = -1;
+  WELS_THREAD_ERROR_CODE iWaitResult    = WELS_THREAD_ERROR_GENERIAL;
+  uint32_t uiExitCode                   = 0;
+
+  if (NULL == pThreadData)
+    WELS_THREAD_ROUTINE_RETURN (1);
+
+  pEncCtx = (sWelsEncCtx*)pThreadData->pWelsPEncCtx;
+  iSlice  = pThreadData->iSliceIndex;
+  iEvent  = pThreadData->iThreadIndex;
+
+  for (;;) {
+#if defined(ENABLE_TRACE_MT)
+    WelsLog (pEncCtx, WELS_LOG_INFO, "[MT] UpdateMbListThreadProc(), try to wait (pUpdateMbListEvent[%d])!\n",
+             iEvent);
+#endif
+    iWaitResult = WelsEventWait (pEncCtx->pSliceThreading->pUpdateMbListEvent[iEvent]);
+    if (WELS_THREAD_ERROR_WAIT_OBJECT_0 != iWaitResult) {
+      // waiting failed: report it and terminate this thread
+      WelsLog (pEncCtx, WELS_LOG_WARNING,
+               "[MT] UpdateMbListThreadProc(), waiting pUpdateMbListEvent[%d] failed(%d) and thread%d terminated!\n", iEvent,
+               iWaitResult, iEvent);
+      uiExitCode = 1;
+      break;
+    }
+
+    pLayer = pEncCtx->pCurDqLayer;
+    UpdateMbListNeighborParallel (pLayer->pSliceEncCtx, pLayer->sMbDataP, iSlice);
+    // tell the requester that the MB list of this slice is up to date
+    WelsEventSignal (pEncCtx->pSliceThreading->pFinUpdateMbListEvent[iEvent]);
+  }
+
+  WELS_THREAD_ROUTINE_RETURN (uiExitCode);
+}
+#endif//__GNUC__
+#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+
+// thread process for coding one pSlice
+WELS_THREAD_ROUTINE_TYPE CodingSliceThreadProc (void* arg) {
+  SSliceThreadPrivateData* pPrivateData	= (SSliceThreadPrivateData*)arg;
+  sWelsEncCtx* pEncPEncCtx			= NULL;
+  SDqLayer* pCurDq							= NULL;
+  SSlice* pSlice								= NULL;
+  SWelsSliceBs* pSliceBs						= NULL;
+#ifdef WIN32
+  WELS_EVENT pEventsList[3];
+  int32_t iEventCount						= 0;
+#endif
+  WELS_THREAD_ERROR_CODE iWaitRet				= WELS_THREAD_ERROR_GENERIAL;
+  uint32_t uiThrdRet							= 0;
+  int32_t iSliceSize							= 0;
+  int32_t iSliceIdx							= -1;
+  int32_t iThreadIdx							= -1;
+  int32_t iEventIdx							= -1;
+  bool_t bNeedPrefix							= false;
+  EWelsNalUnitType eNalType						= NAL_UNIT_UNSPEC_0;
+  EWelsNalRefIdc eNalRefIdc						= NRI_PRI_LOWEST;
+
+  if (NULL == pPrivateData)
+    WELS_THREAD_ROUTINE_RETURN (1);
+
+  WelsSetThreadCancelable();
+
+  pEncPEncCtx	= (sWelsEncCtx*)pPrivateData->pWelsPEncCtx;
+
+  iThreadIdx		= pPrivateData->iThreadIndex;
+  iEventIdx		= iThreadIdx;
+
+#ifdef WIN32
+  pEventsList[iEventCount++]	= pEncPEncCtx->pSliceThreading->pReadySliceCodingEvent[iEventIdx];
+  pEventsList[iEventCount++]	= pEncPEncCtx->pSliceThreading->pExitEncodeEvent[iEventIdx];
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+  pEventsList[iEventCount++] = pEncPEncCtx->pSliceThreading->pUpdateMbListEvent[iEventIdx];
+#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)	
+#endif//WIN32
+
+  do {
+#ifdef WIN32
+    iWaitRet = WelsMultipleEventsWaitSingleBlocking (iEventCount,
+               &pEventsList[0],
+               (uint32_t) - 1);	// blocking until at least one event is
+#else
+#if defined(ENABLE_TRACE_MT)
+    WelsLog (pEncPEncCtx, WELS_LOG_INFO,
+             "[MT] CodingSliceThreadProc(), try to call WelsEventWait(pReadySliceCodingEvent[%d]= 0x%p), pEncPEncCtx= 0x%p!\n",
+             iEventIdx, (void*) (pEncPEncCtx->pReadySliceCodingEvent[iEventIdx]), (void*)pEncPEncCtx);
+#endif
+    iWaitRet = WelsEventWait (pEncPEncCtx->pSliceThreading->pReadySliceCodingEvent[iEventIdx]);
+#endif//WIN32		
+    if (WELS_THREAD_ERROR_WAIT_OBJECT_0 == iWaitRet) {	// start pSlice coding signal waited
+      SLayerBSInfo* pLbi = pPrivateData->pLayerBs;
+      const int32_t kiCurDid			= pEncPEncCtx->uiDependencyId;
+      const int32_t kiCurTid			= pEncPEncCtx->uiTemporalId;
+      SWelsSvcCodingParam* pCodingParam	= pEncPEncCtx->pSvcParam;
+      SDLayerParam* pParamD			= &pCodingParam->sDependencyLayers[kiCurDid];
+
+      pCurDq			= pEncPEncCtx->pCurDqLayer;
+      eNalType		= pEncPEncCtx->eNalType;
+      eNalRefIdc		= pEncPEncCtx->eNalPriority;
+      bNeedPrefix		= pEncPEncCtx->bNeedPrefixNalFlag;
+
+      if (pParamD->sMso.uiSliceMode != SM_DYN_SLICE) {
+        int64_t iSliceStart	= 0;
+        bool_t bDsaFlag = false;
+        iSliceIdx		= pPrivateData->iSliceIndex;
+        pSlice			= &pCurDq->sLayerInfo.pSliceInLayer[iSliceIdx];
+        pSliceBs		= &pEncPEncCtx->pSliceBs[iSliceIdx];
+
+#if defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG)
+        bDsaFlag	= (pParamD->sMso.uiSliceMode == SM_FIXEDSLCNUM_SLICE &&
+                     pCodingParam->iMultipleThreadIdc > 1 &&
+                     pCodingParam->iMultipleThreadIdc >= pParamD->sMso.sSliceArgument.iSliceNum);
+        if (bDsaFlag)
+          iSliceStart = WelsTime();
+#endif//DYNAMIC_SLICE_ASSIGN || MT_DEBUG
+
+#if !defined(PACKING_ONE_SLICE_PER_LAYER)
+        pSliceBs->uiBsPos	= 0;
+#endif//!PACKING_ONE_SLICE_PER_LAYER
+        pSliceBs->iNalIndex	= 0;
+        assert ((void*) (&pSliceBs->sBsWrite) == (void*)pSlice->pSliceBsa);
+        InitBits (&pSliceBs->sBsWrite, pSliceBs->pBsBuffer, pSliceBs->uiSize);
+
+#if MT_DEBUG_BS_WR
+        pSliceBs->bSliceCodedFlag	= FALSE;
+#endif//MT_DEBUG_BS_WR
+
+        if (bNeedPrefix) {
+          if (eNalRefIdc != NRI_PRI_LOWEST) {
+            WelsLoadNalForSlice (pSliceBs, NAL_UNIT_PREFIX, eNalRefIdc);
+            WelsWriteSVCPrefixNal (&pSliceBs->sBsWrite, eNalRefIdc, (NAL_UNIT_CODED_SLICE_IDR == eNalType));
+            WelsUnloadNalForSlice (pSliceBs);
+          } else { // No Prefix NAL Unit RBSP syntax here, but need add NAL Unit Header extension
+            WelsLoadNalForSlice (pSliceBs, NAL_UNIT_PREFIX, eNalRefIdc);
+            // No need write any syntax of prefix NAL Unit RBSP here
+            WelsUnloadNalForSlice (pSliceBs);
+          }
+        }
+
+        WelsLoadNalForSlice (pSliceBs, eNalType, eNalRefIdc);
+
+        WelsCodeOneSlice (pEncPEncCtx, iSliceIdx, eNalType);
+
+        WelsUnloadNalForSlice (pSliceBs);
+
+#if !defined(PACKING_ONE_SLICE_PER_LAYER)
+        if (0 == iSliceIdx) {
+          pLbi->pBsBuf	= pEncPEncCtx->pFrameBs + pEncPEncCtx->iPosBsBuffer;
+          iSliceSize = WriteSliceToFrameBs (pEncPEncCtx, pLbi, pLbi->pBsBuf, iSliceIdx);
+          pEncPEncCtx->iPosBsBuffer += iSliceSize;
+        } else
+          iSliceSize = WriteSliceBs (pEncPEncCtx, pSliceBs->pBs, iSliceIdx);
+#else// PACKING_ONE_SLICE_PER_LAYER
+        if (0 == iSliceIdx) {
+          pLbi->pBsBuf	= pEncPEncCtx->pFrameBs + pEncPEncCtx->iPosBsBuffer;
+          iSliceSize = WriteSliceToFrameBs (pEncPEncCtx, pLbi, pLbi->pBsBuf, iSliceIdx);
+          pEncPEncCtx->iPosBsBuffer += iSliceSize;
+        } else {
+          pLbi->pBsBuf	= pSliceBs->bs + pSliceBs->uiBsPos;
+          iSliceSize = WriteSliceToFrameBs (pEncPEncCtx, pLbi, pLbi->pBsBuf, iSliceIdx);
+          pSliceBs->uiBsPos += iSliceSize;
+        }
+#endif//!PACKING_ONE_SLICE_PER_LAYER
+
+        if (pCurDq->bDeblockingParallelFlag && pSlice->sSliceHeaderExt.sSliceHeader.uiDisableDeblockingFilterIdc != 1
+#if !defined(ENABLE_FRAME_DUMP)
+            && (eNalRefIdc != NRI_PRI_LOWEST) &&
+            (pParamD->iHighestTemporalId == 0 || kiCurTid < pParamD->iHighestTemporalId)
+#endif// !ENABLE_FRAME_DUMP
+           ) {
+          DeblockingFilterSliceAvcbase (pCurDq, pEncPEncCtx->pFuncList, iSliceIdx);
+        }
+
+#if defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG)
+        if (bDsaFlag) {
+          pEncPEncCtx->pSliceThreading->pSliceConsumeTime[pEncPEncCtx->uiDependencyId][iSliceIdx] = (uint32_t) (
+                WelsTime() - iSliceStart);
+#if defined(ENABLE_TRACE_MT)
+          WelsLog (pEncPEncCtx, WELS_LOG_INFO,
+                   "[MT] CodingSliceThreadProc(), coding_idx %d, uiSliceIdx %d, pSliceConsumeTime %d, iSliceSize %d, pFirstMbInSlice %d, count_num_mb_in_slice %d\n",
+                   pEncPEncCtx->iCodingIndex, iSliceIdx,
+                   pEncPEncCtx->pSliceThreading->pSliceConsumeTime[pEncPEncCtx->uiDependencyId][iSliceIdx], iSliceSize,
+                   pCurDq->pSliceEncCtx->pFirstMbInSlice[iSliceIdx], pCurDq->pSliceEncCtx->pCountMbNumInSlice[iSliceIdx]);
+#endif//ENABLE_TRACE_MT
+        }
+#endif//DYNAMIC_SLICE_ASSIGN || MT_DEBUG
+
+#if defined(SLICE_INFO_OUTPUT)
+        fprintf (stderr,
+                 "@pSlice=%-6d sliceType:%c idc:%d size:%-6d\n",
+                 iSliceIdx,
+                 (pEncPEncCtx->eSliceType == P_SLICE ? 'P' : 'I'),
+                 eNalRefIdc,
+                 iSliceSize
+                );
+#endif//SLICE_INFO_OUTPUT				
+
+#if MT_DEBUG_BS_WR
+        pSliceBs->bSliceCodedFlag	= TRUE;
+#endif//MT_DEBUG_BS_WR
+
+#ifdef WIN32
+        WelsEventSignal (
+          &pEncPEncCtx->pSliceThreading->pSliceCodedEvent[iEventIdx]);	// mean finished coding current pSlice
+#else
+        WelsEventSignal (pEncPEncCtx->pSliceThreading->pSliceCodedEvent[iEventIdx]);	// mean finished coding current pSlice
+#endif//WIN32				
+      } else {	// for SM_DYN_SLICE parallelization
+#ifdef PACKING_ONE_SLICE_PER_LAYER
+        SLayerBSInfo* pLbiPacking			= NULL;
+#endif//PACKING_ONE_SLICE_PER_LAYER
+        SSliceCtx* pSliceCtx			= pCurDq->pSliceEncCtx;
+        const int32_t kiPartitionId			= iThreadIdx;
+        const int32_t kiSliceIdxStep		= pEncPEncCtx->iActiveThreadsNum;
+        const int32_t kiFirstMbInPartition	= pPrivateData->iStartMbIndex;	// inclusive
+        const int32_t kiEndMbInPartition	= pPrivateData->iEndMbIndex;		// exclusive
+        int32_t iAnyMbLeftInPartition	= kiEndMbInPartition - kiFirstMbInPartition;
+
+        iSliceIdx		= pPrivateData->iSliceIndex;
+
+        pSliceCtx->pFirstMbInSlice[iSliceIdx]				= kiFirstMbInPartition;
+        pCurDq->pNumSliceCodedOfPartition[kiPartitionId]		= 1;	// one pSlice per partition intialized, dynamic slicing inside
+        pCurDq->pLastMbIdxOfPartition[kiPartitionId]			= kiEndMbInPartition - 1;
+
+        pCurDq->pLastCodedMbIdxOfPartition[kiPartitionId]		= 0;
+
+        while (iAnyMbLeftInPartition > 0) {
+          if (iSliceIdx >= pSliceCtx->iMaxSliceNumConstraint) {
+            // TODO: need exception handler for not large enough of MAX_SLICES_NUM related memory usage
+            // No idea about its solution due MAX_SLICES_NUM is fixed lenght in relevent pData structure
+            uiThrdRet	= 1;
+            break;
+          }
+
+          pSlice			= &pCurDq->sLayerInfo.pSliceInLayer[iSliceIdx];
+          pSliceBs		= &pEncPEncCtx->pSliceBs[iSliceIdx];
+
+#if !defined(PACKING_ONE_SLICE_PER_LAYER)
+          pSliceBs->uiBsPos	= 0;
+#endif//!PACKING_ONE_SLICE_PER_LAYER
+          pSliceBs->iNalIndex	= 0;
+          InitBits (&pSliceBs->sBsWrite, pSliceBs->pBsBuffer, pSliceBs->uiSize);
+
+          if (bNeedPrefix) {
+            if (eNalRefIdc != NRI_PRI_LOWEST) {
+              WelsLoadNalForSlice (pSliceBs, NAL_UNIT_PREFIX, eNalRefIdc);
+              WelsWriteSVCPrefixNal (&pSliceBs->sBsWrite, eNalRefIdc, (NAL_UNIT_CODED_SLICE_IDR == eNalType));
+              WelsUnloadNalForSlice (pSliceBs);
+            } else { // No Prefix NAL Unit RBSP syntax here, but need add NAL Unit Header extension
+              WelsLoadNalForSlice (pSliceBs, NAL_UNIT_PREFIX, eNalRefIdc);
+              // No need write any syntax of prefix NAL Unit RBSP here
+              WelsUnloadNalForSlice (pSliceBs);
+            }
+          }
+
+          WelsLoadNalForSlice (pSliceBs, eNalType, eNalRefIdc);
+
+          WelsCodeOneSlice (pEncPEncCtx, iSliceIdx, eNalType);
+
+          WelsUnloadNalForSlice (pSliceBs);
+
+#if !defined(PACKING_ONE_SLICE_PER_LAYER)
+          if (0 == kiPartitionId) {
+            if (0 == iSliceIdx)
+              pLbi->pBsBuf	= pEncPEncCtx->pFrameBs + pEncPEncCtx->iPosBsBuffer;
+            iSliceSize = WriteSliceToFrameBs (pEncPEncCtx, pLbi, pEncPEncCtx->pFrameBs + pEncPEncCtx->iPosBsBuffer, iSliceIdx);
+            pEncPEncCtx->iPosBsBuffer += iSliceSize;
+          } else
+            iSliceSize = WriteSliceBs (pEncPEncCtx, pSliceBs->pBs, iSliceIdx);
+#else// PACKING_ONE_SLICE_PER_LAYER
+          pLbiPacking	= pLbi + (iSliceIdx - kiPartitionId);
+
+          if (0 == kiPartitionId) {
+            pLbiPacking->pBsBuf	= pEncPEncCtx->pFrameBs + pEncPEncCtx->iPosBsBuffer;
+            iSliceSize = WriteSliceToFrameBs (pEncPEncCtx, pLbiPacking, pLbiPacking->pBsBuf, iSliceIdx);
+            pEncPEncCtx->iPosBsBuffer += iSliceSize;
+          } else {
+            pLbiPacking->pBsBuf	= pSliceBs->bs + pSliceBs->uiBsPos;
+            iSliceSize = WriteSliceToFrameBs (pEncPEncCtx, pLbiPacking, pLbiPacking->pBsBuf, iSliceIdx);
+            pSliceBs->uiBsPos += iSliceSize;
+          }
+          pEncPEncCtx->pSliceThreading->pCountBsSizeInPartition[kiPartitionId] += iSliceSize;
+#endif//!PACKING_ONE_SLICE_PER_LAYER
+
+          if (pCurDq->bDeblockingParallelFlag && pSlice->sSliceHeaderExt.sSliceHeader.uiDisableDeblockingFilterIdc != 1
+#if !defined(ENABLE_FRAME_DUMP)
+              && (eNalRefIdc != NRI_PRI_LOWEST) &&
+              (pParamD->iHighestTemporalId == 0 || kiCurTid < pParamD->iHighestTemporalId)
+#endif// !ENABLE_FRAME_DUMP
+             ) {
+            DeblockingFilterSliceAvcbase (pCurDq, pEncPEncCtx->pFuncList, iSliceIdx);
+          }
+
+#if defined(SLICE_INFO_OUTPUT)
+          fprintf (stderr,
+                   "@pSlice=%-6d sliceType:%c idc:%d size:%-6d\n",
+                   iSliceIdx,
+                   (pEncPEncCtx->eSliceType == P_SLICE ? 'P' : 'I'),
+                   eNalRefIdc,
+                   iSliceSize
+                  );
+#endif//SLICE_INFO_OUTPUT					
+
+#if defined(ENABLE_TRACE_MT)
+          WelsLog (pEncPEncCtx, WELS_LOG_INFO,
+                   "[MT] CodingSliceThreadProc(), coding_idx %d, iPartitionId %d, uiSliceIdx %d, iSliceSize %d, count_mb_slice %d, iEndMbInPartition %d, pCurDq->pLastCodedMbIdxOfPartition[%d] %d\n",
+                   pEncPEncCtx->iCodingIndex, kiPartitionId, iSliceIdx, iSliceSize, pCurDq->pSliceEncCtx->pCountMbNumInSlice[iSliceIdx],
+                   kiEndMbInPartition, kiPartitionId, pCurDq->pLastCodedMbIdxOfPartition[kiPartitionId]);
+#endif//ENABLE_TRACE_MT
+
+          iAnyMbLeftInPartition = kiEndMbInPartition - (1 + pCurDq->pLastCodedMbIdxOfPartition[kiPartitionId]);
+          iSliceIdx += kiSliceIdxStep;
+        }
+
+        if (uiThrdRet)	// any exception??
+          break;
+
+#ifdef WIN32
+        WelsEventSignal (&pEncPEncCtx->pSliceThreading->pSliceCodedEvent[iEventIdx]);	// mean finished coding current pSlice
+#else
+        WelsEventSignal (pEncPEncCtx->pSliceThreading->pSliceCodedEvent[iEventIdx]);	// mean finished coding current pSlice
+#endif//WIN32
+      }
+    }
+#ifdef WIN32
+    else if (WELS_THREAD_ERROR_WAIT_OBJECT_0 + 1 == iWaitRet) {	// exit thread signal
+      uiThrdRet	= 0;
+      break;
+    }
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+    else if (WELS_THREAD_ERROR_WAIT_OBJECT_0 + 2 == iWaitRet) {	// update pMb list singal
+      iSliceIdx		=
+        iEventIdx;	// pPrivateData->iSliceIndex; old threads can not be terminated, pPrivateData is not correct for applicable
+      pCurDq			= pEncPEncCtx->pCurDqLayer;
+      UpdateMbListNeighborParallel (pCurDq->pSliceEncCtx, pCurDq->sMbDataP, iSliceIdx);
+      WelsEventSignal (
+        &pEncPEncCtx->pSliceThreading->pFinUpdateMbListEvent[iEventIdx]);	// mean finished update pMb list for this pSlice
+    }
+#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+#endif//WIN32		
+    else { // WELS_THREAD_ERROR_WAIT_TIMEOUT, or WELS_THREAD_ERROR_WAIT_FAILED
+      WelsLog (pEncPEncCtx, WELS_LOG_WARNING,
+               "[MT] CodingSliceThreadProc(), waiting pReadySliceCodingEvent[%d] failed(%d) and thread%d terminated!\n", iEventIdx,
+               iWaitRet, iThreadIdx);
+      uiThrdRet	= 1;
+      break;
+    }
+  } while (1);
+
+#ifdef WIN32
+  WelsEventSignal (&pEncPEncCtx->pSliceThreading->pFinSliceCodingEvent[iEventIdx]);	// notify to mother encoding threading
+#endif//WIN32
+
+  WELS_THREAD_ROUTINE_RETURN (uiThrdRet);
+}
+
+// Spawn one CodingSliceThreadProc worker per configured thread (iCountThreadsNum); always returns 0.
+// NOTE(review): WelsThreadCreate() results are ignored, so a failed spawn is not reported to the caller.
+int32_t CreateSliceThreads (sWelsEncCtx* pCtx) {
+  const int32_t kiThreadCount = pCtx->pSvcParam->iCountThreadsNum;
+  int32_t iIdx = 0;
+#if defined(WIN32) && defined(BIND_CPU_CORES_TO_THREADS)
+  // Fix: the Win32 affinity APIs work on DWORD_PTR masks (64 bits wide on Win64); DWORD storage was too small.
+  DWORD_PTR dwProcessAffinity = 0;
+  DWORD_PTR dwSystemAffinity = 0;
+  if (!GetProcessAffinityMask (GetCurrentProcess(), &dwProcessAffinity, &dwSystemAffinity))
+    dwProcessAffinity = 0; // query failed: leave every worker unbound instead of using garbage masks
+#endif//WIN32 && BIND_CPU_CORES_TO_THREADS
+
+  while (iIdx < kiThreadCount) {
+    WelsThreadCreate (&pCtx->pSliceThreading->pThreadHandles[iIdx], CodingSliceThreadProc,
+                      &pCtx->pSliceThreading->pThreadPEncCtx[iIdx], 0);
+#if defined(WIN32) && defined(BIND_CPU_CORES_TO_THREADS)
+    // bind worker iIdx to core iIdx when several cores are usable, the thread exists and the bit fits the mask
+    if (dwProcessAffinity > 1 && pCtx->pSliceThreading->pThreadHandles[iIdx] != NULL
+        && iIdx < (int32_t) (8 * sizeof (DWORD_PTR))) {
+      const DWORD_PTR dwAffinityMask = ((DWORD_PTR) 1) << iIdx;
+      if ((dwAffinityMask & dwProcessAffinity) // check if cpu is available
+          && 0 == SetThreadAffinityMask (pCtx->pSliceThreading->pThreadHandles[iIdx], dwAffinityMask)) {
+        // this message used to be formatted into a local buffer that was never emitted
+        WelsLog (pCtx, WELS_LOG_WARNING, "[MT] CreateSliceThreads(), SetThreadAffinityMask failed, iIdx:%d\n", iIdx);
+      }
+    }
+#endif//WIN32 && BIND_CPU_CORES_TO_THREADS
+    // We need extra threads for update_mb_list_proc on __GNUC__ like OS (mac/linux)
+    // due to WelsMultipleEventsWaitSingleBlocking implementation can not work well
+    // in case waiting pUpdateMbListEvent and pReadySliceCodingEvent events at the same time
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE) && defined(__GNUC__)
+    WelsThreadCreate (&pCtx->pSliceThreading->pUpdateMbListThrdHandles[iIdx], UpdateMbListThreadProc,
+                      &pCtx->pSliceThreading->pThreadPEncCtx[iIdx], 0);
+#endif//DYNAMIC_SLICE_ASSIGN && TRY_SLICING_BALANCE && __GNUC__
+    ++ iIdx;
+  }
+#if defined(ENABLE_TRACE_MT)
+  WelsLog (pCtx, WELS_LOG_INFO, "CreateSliceThreads() exit..\n");
+#endif
+  return 0;
+}
+
+#ifdef PACKING_ONE_SLICE_PER_LAYER
+// Zero the first iPartitionCnt entries of pCountBsSizeList; a NULL list or a non-positive count is ignored.
+void ResetCountBsSizeInPartitions (uint32_t* pCountBsSizeList, const int32_t iPartitionCnt) {
+  if (NULL == pCountBsSizeList || iPartitionCnt <= 0) return;
+  memset (pCountBsSizeList, 0, iPartitionCnt * sizeof (*pCountBsSizeList));
+}
+#endif//PACKING_ONE_SLICE_PER_LAYER
+
+/*!
+ * \brief   Hand every slice coding thread its work item and wake it up.
+ *
+ * For each thread index below uiNumThreads the private data entry receives its layer bitstream
+ * info (pLayerBs) and its first slice index (iSliceIndex == thread index); then the matching entry
+ * of pEventsList is signalled (on WIN32 a NULL entry is skipped). In dynamic slicing mode the
+ * macroblock range of every partition is set up first, walking backwards: partition idx starts at
+ * pSliceCtx->pFirstMbInSlice[idx] and ends (exclusive) where the next partition starts, the last
+ * one ending at pSliceCtx->iMbNumInFrame.
+ *
+ * \param   pPriData               per-thread private data array (must not be NULL)
+ * \param   pEventsList            per-thread events to signal (must not be NULL)
+ * \param   pLbi                   layer bitstream info (must not be NULL); with
+ *                                 PACKING_ONE_SLICE_PER_LAYER thread i gets pLbi + i, otherwise pLbi
+ * \param   uiNumThreads           number of threads/events to fire (must be positive as int32_t)
+ * \param   pSliceCtx              slice context, read only in dynamic slicing mode
+ * \param   bIsDynamicSlicingMode  non-zero to derive the partition ranges before firing
+ *
+ * \return  0 on success, 1 when an argument is rejected
+ */
+#ifdef WIN32
+int32_t FiredSliceThreads (SSliceThreadPrivateData* pPriData, WELS_EVENT* pEventsList, SLayerBSInfo* pLbi,
+                           const uint32_t uiNumThreads, SSliceCtx* pSliceCtx, const BOOL_T bIsDynamicSlicingMode)
+#else
+int32_t FiredSliceThreads (SSliceThreadPrivateData* pPriData, WELS_EVENT** pEventsList, SLayerBSInfo* pLbi,
+                           const uint32_t uiNumThreads, SSliceCtx* pSliceCtx, const BOOL_T bIsDynamicSlicingMode)
+#endif//WIN32
+{
+  const int32_t kiEventCnt = uiNumThreads;
+  int32_t iIdx = 0;
+
+  if (pPriData == NULL || pLbi == NULL || kiEventCnt <= 0 || pEventsList == NULL) {
+    // kiEventCnt is logged: it is the int32_t value actually tested and it matches the %d conversion
+    WelsLog (NULL, WELS_LOG_ERROR,
+             "FiredSliceThreads(), fail due pPriData == %p || pLbi == %p || iEventCnt(%d) <= 0 || pEventsList == %p!!\n",
+             (void*)pPriData, (void*)pLbi, kiEventCnt, (void*)pEventsList);
+    return 1;
+  }
+
+  if (bIsDynamicSlicingMode) {
+    int32_t iEndMbIdx = 0;
+
+    // robustness fix: pSliceCtx is dereferenced below, so reject NULL instead of crashing
+    if (pSliceCtx == NULL) {
+      WelsLog (NULL, WELS_LOG_ERROR, "FiredSliceThreads(), fail due pSliceCtx == NULL in dynamic slicing mode!!\n");
+      return 1;
+    }
+    iEndMbIdx = pSliceCtx->iMbNumInFrame; // exclusive end of the last partition
+    for (iIdx = kiEventCnt - 1; iIdx >= 0; --iIdx) {
+      const int32_t kiFirstMbIdx = pSliceCtx->pFirstMbInSlice[iIdx];
+      pPriData[iIdx].iStartMbIndex = kiFirstMbIdx;
+      pPriData[iIdx].iEndMbIndex = iEndMbIdx;
+      iEndMbIdx = kiFirstMbIdx; // the previous partition ends where this one starts
+    }
+  }
+
+  // The PACKING_ONE_SLICE_PER_LAYER and default code paths are identical except for the pLbi
+  // advance, so one loop serves both and only that step is conditional.
+  for (iIdx = 0; iIdx < kiEventCnt; ++iIdx) {
+    pPriData[iIdx].pLayerBs = pLbi;
+    pPriData[iIdx].iSliceIndex = iIdx;
+#ifdef WIN32
+    if (pEventsList[iIdx])
+      WelsEventSignal (&pEventsList[iIdx]);
+#else
+    WelsEventSignal (pEventsList[iIdx]);
+#endif//WIN32
+#if defined(PACKING_ONE_SLICE_PER_LAYER)
+    ++ pLbi; // one layer info entry per thread
+#endif//PACKING_ONE_SLICE_PER_LAYER
+  }
+
+  return 0;
+}
+
+int32_t DynamicDetectCpuCores() { // reports the ProcessorCount filled in by WelsQueryLogicalProcessInfo()
+  WelsLogicalProcessInfo sProcessInfo;
+  WelsQueryLogicalProcessInfo (&sProcessInfo);
+  return sProcessInfo.ProcessorCount;
+}
+
+#if defined(MT_ENABLED) && defined(DYNAMIC_SLICE_ASSIGN)
+
+// Re-balance the slices of the base spatial layer (ppDqLayerList[0]); work is done only with TRY_SLICING_BALANCE.
+// Returns 1, or NeedDynamicAdjust()'s verdict when TRY_SLICING_BALANCE and NOT_ABSOLUTE_BALANCING are both on.
+// With MT_DEBUG the elapsed WelsTime() difference is appended to the pFSliceDiff trace file, if it is set.
+int32_t AdjustBaseLayer (sWelsEncCtx* pCtx) {
+  int32_t iNeedAdj = 1;
+#ifdef MT_DEBUG
+  int64_t iT0 = WelsTime();
+#endif//MT_DEBUG
+#ifdef TRY_SLICING_BALANCE
+  SDqLayer* pCurDq = pCtx->ppDqLayerList[0]; // only needed (and only declared) when balancing is compiled in
+
+  pCtx->pCurDqLayer = pCurDq;
+#ifdef NOT_ABSOLUTE_BALANCING
+  // do not need adjust due to not different at both slices of consumed time
+  iNeedAdj = NeedDynamicAdjust (pCtx->pSliceThreading->pSliceConsumeTime[0], pCurDq->pSliceEncCtx->iSliceNumInFrame);
+  if (iNeedAdj)
+#endif//NOT_ABSOLUTE_BALANCING
+    DynamicAdjustSlicing (pCtx, pCurDq, pCtx->pSliceThreading->pSliceComplexRatio[0], 0);
+#endif//TRY_SLICING_BALANCE
+#ifdef MT_DEBUG
+  iT0 = WelsTime() - iT0; // elapsed adjust time
+  if (pCtx->pSliceThreading->pFSliceDiff) {
+#ifdef WIN32
+    fprintf (pCtx->pSliceThreading->pFSliceDiff,
+             "%6I64d us adjust time at base spatial layer, iNeedAdj %d, DynamicAdjustSlicing()\n", iT0, iNeedAdj);
+#else
+    // fix: %lld expects long long while int64_t may be plain long (LP64), so convert explicitly
+    fprintf (pCtx->pSliceThreading->pFSliceDiff,
+             "%6lld us adjust time at base spatial layer, iNeedAdj %d, DynamicAdjustSlicing()\n", (long long) iT0, iNeedAdj);
+#endif//WIN32
+  }
+#endif//MT_DEBUG
+
+  return iNeedAdj;
+}
+
+/*!
+ * \brief   Re-balance the slices of the current enhancement layer (pCtx->pCurDqLayer).
+ *
+ * With TRY_SLICING_BALANCE the complexity statistics driving DynamicAdjustSlicing() come from
+ * layer iCurDid - 1 when the current layer has a reference layer, iCurDid > 0 and that lower layer
+ * uses SM_FIXEDSLCNUM_SLICE with no more slices than iMultipleThreadIdc; otherwise the statistics
+ * of layer iCurDid itself are used. With NOT_ABSOLUTE_BALANCING the adjustment is skipped when
+ * NeedDynamicAdjust() returns 0.
+ *
+ * \param   pCtx     encoder context
+ * \param   iCurDid  dependency id of the current layer
+ *
+ * \return  1, or the NeedDynamicAdjust() result when both balancing macros are defined
+ */
+int32_t AdjustEnhanceLayer (sWelsEncCtx* pCtx, int32_t iCurDid) {
+#ifdef MT_DEBUG
+  int64_t iT1 = WelsTime();
+#endif//MT_DEBUG
+  int32_t iNeedAdj = 1;
+#ifdef TRY_SLICING_BALANCE
+  // uiSliceMode of referencing spatial should be SM_FIXEDSLCNUM_SLICE
+  // if using spatial base layer for complexity estimation
+  const BOOL_T kbModelingFromSpatial = (pCtx->pCurDqLayer->pRefLayer != NULL && iCurDid > 0)
+                                       && (pCtx->pSvcParam->sDependencyLayers[iCurDid - 1].sMso.uiSliceMode == SM_FIXEDSLCNUM_SLICE
+                                           && pCtx->pSvcParam->iMultipleThreadIdc >= pCtx->pSvcParam->sDependencyLayers[iCurDid -
+                                               1].sMso.sSliceArgument.iSliceNum);
+  // The spatial and temporal estimation paths run the same calls and differ only in the layer whose
+  // statistics are read: the spatial base layer (iCurDid - 1) or the current layer itself.
+  const int32_t kiStatDid = kbModelingFromSpatial ? (iCurDid - 1) : iCurDid;
+
+#ifdef NOT_ABSOLUTE_BALANCING
+  // do not need adjust due to not different at both slices of consumed time
+  iNeedAdj = NeedDynamicAdjust (pCtx->pSliceThreading->pSliceConsumeTime[kiStatDid],
+                                pCtx->pCurDqLayer->pSliceEncCtx->iSliceNumInFrame);
+  if (iNeedAdj)
+#endif//NOT_ABSOLUTE_BALANCING
+    DynamicAdjustSlicing (pCtx, pCtx->pCurDqLayer, pCtx->pSliceThreading->pSliceComplexRatio[kiStatDid], iCurDid);
+#endif//TRY_SLICING_BALANCE
+
+#ifdef MT_DEBUG
+  iT1 = WelsTime() - iT1;
+  if (pCtx->pSliceThreading->pFSliceDiff) {
+#ifdef WIN32
+    fprintf (pCtx->pSliceThreading->pFSliceDiff,
+             "%6I64d us adjust time at spatial layer %d, iNeedAdj %d, DynamicAdjustSlicing()\n",
+             iT1, iCurDid, iNeedAdj);
+#else
+    // fix: %lld expects long long while int64_t may be plain long (LP64), so convert explicitly
+    fprintf (pCtx->pSliceThreading->pFSliceDiff,
+             "%6lld us adjust time at spatial layer %d, iNeedAdj %d, DynamicAdjustSlicing()\n",
+             (long long) iT1, iCurDid, iNeedAdj);
+#endif//WIN32
+  }
+#endif//MT_DEBUG
+
+  return iNeedAdj;
+}
+
+#endif//#if defined(MT_ENABLED) && defined(DYNAMIC_SLICE_ASSIGN)
+
+#if defined(MT_ENABLED)
+
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE) && defined(MT_DEBUG)
+// MT_DEBUG trace: write every slice's complexity ratio of layer iCurDid to pSliceThreading->pFSliceDiff.
+void TrackSliceComplexities (sWelsEncCtx* pCtx, const int32_t iCurDid) {
+  const int32_t kiCountSliceNum = pCtx->pCurDqLayer->pSliceEncCtx->iSliceNumInFrame;
+  // fix: the other trace writers test pFSliceDiff before fprintf; writing to a NULL stream is undefined
+  if (pCtx->pSliceThreading == NULL || pCtx->pSliceThreading->pFSliceDiff == NULL)
+    return;
+  for (int32_t iSliceIdx = 0; iSliceIdx < kiCountSliceNum; ++ iSliceIdx) {
+    fprintf (pCtx->pSliceThreading->pFSliceDiff, "%6.3f complexity pRatio at iDid %d pSlice %d\n",
+             pCtx->pSliceThreading->pSliceComplexRatio[iCurDid][iSliceIdx], iCurDid, iSliceIdx);
+  }
+}
+#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE) && defined(MT_DEBUG)
+
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(MT_DEBUG)
+// MT_DEBUG trace: for every listed spatial layer coded with SM_FIXEDSLCNUM_SLICE on more than one thread
+// (and with no more slices than threads) write each slice's consume time and the per-layer maximum to
+// pSliceThreading->pFSliceDiff. Nothing is written when iSpatialNum exceeds MAX_DEPENDENCY_LAYER.
+void TrackSliceConsumeTime (sWelsEncCtx* pCtx, int32_t* pDidList, const int32_t iSpatialNum) {
+  SWelsSvcCodingParam* pPara = NULL;
+  int32_t iSpatialIdx = 0;
+
+  if (pDidList == NULL || iSpatialNum > MAX_DEPENDENCY_LAYER) // NULL list added: it is indexed below
+    return;
+  if (pCtx->pSliceThreading == NULL || pCtx->pSliceThreading->pFSliceDiff == NULL)
+    return; // no trace file: nothing can be reported for any layer
+  pPara = pCtx->pSvcParam;
+  for (; iSpatialIdx < iSpatialNum; ++ iSpatialIdx) {
+    const int32_t kiDid = pDidList[iSpatialIdx];
+    SMulSliceOption* pMso = &pPara->sDependencyLayers[kiDid].sMso;
+    const uint32_t kuiCountSliceNum = pCtx->ppDqLayerList[kiDid]->pSliceEncCtx->iSliceNumInFrame;
+    uint32_t i = 0;
+    uint32_t uiMaxT = 0;
+    int32_t iMaxI = 0;
+    if (pMso->uiSliceMode != SM_FIXEDSLCNUM_SLICE || pPara->iMultipleThreadIdc <= 1
+        || pPara->iMultipleThreadIdc < kuiCountSliceNum)
+      continue; // this layer is not coded with one fixed slice per thread
+    // fix: the NULL test used to guard only the per-slice fprintf while the maximum search
+    // still indexed the list; skip the whole layer when no timing list exists
+    if (pCtx->pSliceThreading->pSliceConsumeTime[kiDid] == NULL)
+      continue;
+    for (i = 0; i < kuiCountSliceNum; ++ i) {
+      fprintf (pCtx->pSliceThreading->pFSliceDiff, "%6d us consume_time coding_idx %d iDid %d pSlice %d\n",
+               pCtx->pSliceThreading->pSliceConsumeTime[kiDid][i], pCtx->iCodingIndex, kiDid, i /*/ 1000*/);
+      if (pCtx->pSliceThreading->pSliceConsumeTime[kiDid][i] > uiMaxT) {
+        uiMaxT = pCtx->pSliceThreading->pSliceConsumeTime[kiDid][i];
+        iMaxI = i; // remember the slowest slice
+      }
+    }
+    fprintf (pCtx->pSliceThreading->pFSliceDiff, "%6d us consume_time_max coding_idx %d iDid %d pSlice %d\n", uiMaxT,
+             pCtx->iCodingIndex, kiDid, iMaxI /*/ 1000*/);
+  }
+}
+#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(MT_DEBUG)
+
+#endif//MT_ENABLED
+}
+#endif//MT_ENABLED
+
--- a/codec/encoder/core/src/svc_base_layer_md.cpp
+++ b/codec/encoder/core/src/svc_base_layer_md.cpp
@@ -1,1985 +1,1875 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	svc_base_layer_md.c
- *
- * \brief	mode decision
- *
- * \date	2009.08.10 Created
- *
- *************************************************************************************
- */
-#include <string.h>
-#include <assert.h>
-#include "ls_defines.h"
-#include "encoder_context.h"
-#include "svc_enc_slice_segment.h"
-#include "md.h"
-#include "mc.h"
-#include "mv_pred.h"
-#include "cpu_core.h"
-#include "svc_enc_golomb.h"
-#include "svc_base_layer_md.h"
-#include "sample.h"
-#include "encoder.h"
-#include "svc_encode_mb.h"
-#include "svc_encode_slice.h"
-#include "svc_motion_estimate.h"
-#include "as264_common.h"
-#include "encode_mb_aux.h"
-#include "utils.h"
-namespace WelsSVCEnc {
-static const ALIGNED_DECLARE(int8_t, g_kiIntra16AvaliMode[8][5], 16) = {
-    	{ I16_PRED_DC_128, I16_PRED_INVALID, I16_PRED_INVALID, I16_PRED_INVALID, 1 },
-		{ I16_PRED_DC_L,   I16_PRED_H,       I16_PRED_INVALID, I16_PRED_INVALID, 2 },
-		{ I16_PRED_DC_T,   I16_PRED_V,		 I16_PRED_INVALID, I16_PRED_INVALID, 2 },
-   		{ I16_PRED_V,      I16_PRED_H,		 I16_PRED_DC,	   I16_PRED_INVALID, 3 },
-   		{ I16_PRED_DC_128, I16_PRED_INVALID, I16_PRED_INVALID, I16_PRED_INVALID, 1 },
-   		{ I16_PRED_DC_L,   I16_PRED_H,		 I16_PRED_INVALID, I16_PRED_INVALID, 2 },
-   		{ I16_PRED_DC_T,   I16_PRED_V,		 I16_PRED_INVALID, I16_PRED_INVALID, 2 },
-   		{ I16_PRED_V,      I16_PRED_H,		 I16_PRED_DC,	   I16_PRED_P,       4 }
-};
-
-static const ALIGNED_DECLARE(uint8_t, g_kiIntra4AvailCount[16], 16) = {
-#ifndef  I4_PRED_MODE_EXTEND
-	1,3,2,4,1,3,2,7,1,3,4,6,1,3,4,9
-#else
-	1,3,4,4,1,3,4,7,1,3,4,6,1,3,4,9
-#endif  //I4_PRED_MODE_EXTEND
-};
-
-//left_avail | (top_avail<<1) | (left_top_avail<<2) | (right_top_avail<<3);
-static const ALIGNED_DECLARE(uint8_t, g_kiIntra4AvailMode[16][16], 16) = {
-	{
-	I4_PRED_DC_128,  I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID},  //  0000
-
-	{ 
-	I4_PRED_DC_L,    I4_PRED_H,       I4_PRED_HU,      I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID},  //  0001
-
-#ifndef  I4_PRED_MODE_EXTEND
-	{ 
-	I4_PRED_DC_T,    I4_PRED_V,       I4_PRED_INVALID, I4_PRED_INVALID,
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID}, //  0010
-#else  
-	{ 
-	I4_PRED_DC_T,    I4_PRED_V,       I4_PRED_DDL_TOP, I4_PRED_VL_TOP, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID}, //  0010
-#endif //I4_PRED_MODE_EXTEND
-
-	{ 
-	I4_PRED_DC,      I4_PRED_H,       I4_PRED_V,       I4_PRED_HU, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID}, //  0011
-
-	{ 
-	I4_PRED_DC_128,  I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,	
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID},  //  0100
-
-	{ 
-	I4_PRED_DC_L,    I4_PRED_H,       I4_PRED_HU,      I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID},    //  0101
-
-#ifndef  I4_PRED_MODE_EXTEND
-	{ 
-	I4_PRED_DC_T,    I4_PRED_V,       I4_PRED_INVALID, I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID},     //  0110
-#else 
-	{ I4_PRED_DC_T,  I4_PRED_V,       I4_PRED_DDL_TOP, I4_PRED_VL_TOP, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID},     //  0110
-#endif //I4_PRED_MODE_EXTEND
-
-	{ 
-	I4_PRED_DC,      I4_PRED_H,       I4_PRED_V,       I4_PRED_HU,
-	I4_PRED_DDR,     I4_PRED_VR,      I4_PRED_HD,      I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID},               //  0111
-
-	{ 
-	I4_PRED_DC_128,   I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
-	I4_PRED_INVALID,  I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
-	I4_PRED_INVALID,  I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
-	I4_PRED_INVALID,  I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID},  //  1000
-
-	{ 
-	I4_PRED_DC_L,    I4_PRED_H,       I4_PRED_HU,      I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID},    //  1001
-
-	{ 
-	I4_PRED_DC_T,    I4_PRED_V,       I4_PRED_DDL,     I4_PRED_VL, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID},     //  1010
-
-	{ 
-	I4_PRED_DC,      I4_PRED_H,       I4_PRED_V,       I4_PRED_HU, 
-	I4_PRED_DDL,     I4_PRED_VL,      I4_PRED_INVALID, I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID},          //  1011 
-
-	{ 
-	I4_PRED_DC_128,  I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID},  //  1100
-
-	{ 
-	I4_PRED_DC_L,    I4_PRED_H,       I4_PRED_HU,      I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID},    //  1101
-
-	{ 
-	I4_PRED_DC_T,    I4_PRED_V,       I4_PRED_DDL,     I4_PRED_VL, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID},     //  1110
-
-	{ 
-	I4_PRED_DC,      I4_PRED_H,       I4_PRED_V,       I4_PRED_HU, 
-	I4_PRED_DDL,     I4_PRED_VL,      I4_PRED_DDR,     I4_PRED_VR, 
-	I4_PRED_HD,      I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
-	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID}                          //  1111 
-
-};
-// Chroma intra-prediction candidates, one row per neighbour-availability pattern.
-//   row     : pMbCache->uiNeighborIntra & 0x07 (this is how WelsMdIntraChroma indexes it)
-//   col 0-3 : candidate modes in evaluation order, padded with C_PRED_INVALID
-//   col 4   : number of valid candidates in the row
-static const ALIGNED_DECLARE (int8_t, g_kiIntraChromaAvailMode[8][5], 16) = {
-  { C_PRED_DC_128, C_PRED_INVALID, C_PRED_INVALID, C_PRED_INVALID, 1 },
-  { C_PRED_DC_L,   C_PRED_H,       C_PRED_INVALID, C_PRED_INVALID, 2 },
-  { C_PRED_DC_T,   C_PRED_V,       C_PRED_INVALID, C_PRED_INVALID, 2 },
-  { C_PRED_V,      C_PRED_H,       C_PRED_DC,      C_PRED_INVALID, 3 },
-  { C_PRED_DC_128, C_PRED_INVALID, C_PRED_INVALID, C_PRED_INVALID, 1 },
-  { C_PRED_DC_L,   C_PRED_H,       C_PRED_INVALID, C_PRED_INVALID, 2 },
-  { C_PRED_DC_T,   C_PRED_V,       C_PRED_INVALID, C_PRED_INVALID, 2 },
-  { C_PRED_V,      C_PRED_H,       C_PRED_DC,      C_PRED_P,       4 }
-};
-
-// Pixel offsets of each 4x4 luma block inside its macroblock, indexed by the
-// block number used in the I4x4 mode-decision loops (X = column offset,
-// Y = row offset; the loops compute Y * stride + X).
-// The two tables are declared back to back; the original note says this is
-// meant to keep them cache friendly.
-const int8_t g_kiCoordinateIdx4x4X[16] = {
-  0,  4,  0,  4,
-  8, 12,  8, 12,
-  0,  4,  0,  4,
-  8, 12,  8, 12
-};
-
-const int8_t g_kiCoordinateIdx4x4Y[16] = {
-  0,  0,  4,  4,
-  0,  0,  4,  4,
-  8,  8, 12, 12,
-  8,  8, 12, 12
-};
-// Availability-pattern lookup for Intra4x4 mode decision.
-//   row    : pMbCache->uiNeighborIntra of the current macroblock
-//   column : 4x4 block number inside the macroblock (0..15)
-//   value  : index used to address g_kiIntra4AvailCount / g_kiIntra4AvailMode
-static const ALIGNED_DECLARE (int8_t, g_kiNeighborIntraToI4x4[16][16], 16) = {
-  {  0,  1, 10,  7,  1,  1, 15,  7, 10, 15, 10,  7, 15,  7, 15,  7 },
-  {  1,  1, 15,  7,  1,  1, 15,  7, 15, 15, 15,  7, 15,  7, 15,  7 },
-  { 10, 15, 10,  7, 15,  7, 15,  7, 10, 15, 10,  7, 15,  7, 15,  7 },
-  { 11, 15, 15,  7, 15,  7, 15,  7, 15, 15, 15,  7, 15,  7, 15,  7 },
-  {  4,  1, 10,  7,  1,  1, 15,  7, 10, 15, 10,  7, 15,  7, 15,  7 },
-  {  5,  1, 15,  7,  1,  1, 15,  7, 15, 15, 15,  7, 15,  7, 15,  7 },
-  { 14, 15, 10,  7, 15,  7, 15,  7, 10, 15, 10,  7, 15,  7, 15,  7 },
-  { 15, 15, 15,  7, 15,  7, 15,  7, 15, 15, 15,  7, 15,  7, 15,  7 },
-  {  0,  1, 10,  7,  1,  9, 15,  7, 10, 15, 10,  7, 15,  7, 15,  7 },
-  {  1,  1, 15,  7,  1,  9, 15,  7, 15, 15, 15,  7, 15,  7, 15,  7 },
-  { 10, 15, 10,  7, 15, 15, 15,  7, 10, 15, 10,  7, 15,  7, 15,  7 },
-  { 11, 15, 15,  7, 15, 15, 15,  7, 15, 15, 15,  7, 15,  7, 15,  7 },
-  {  4,  1, 10,  7,  1,  9, 15,  7, 10, 15, 10,  7, 15,  7, 15,  7 },
-  {  5,  1, 15,  7,  1,  9, 15,  7, 15, 15, 15,  7, 15,  7, 15,  7 },
-  { 14, 15, 10,  7, 15, 15, 15,  7, 10, 15, 10,  7, 15,  7, 15,  7 },
-  { 15, 15, 15,  7, 15, 15, 15,  7, 15, 15, 15,  7, 15,  7, 15,  7 },
-};
-	
-// Maps the 14 internal Intra4x4 prediction-function ids onto the mode number
-// that is stored in the mode cache and compared with the predicted mode.
-// Ids 0..8 map to themselves; ids 9..13 fold back onto 2, 2, 2, 3 and 7.
-__align16 (const int8_t, g_kiMapModeI4x4[14]) = {
-  0, 1, 2, 3, 4, 5, 6, 7, 8,
-  2, 2, 2, 3, 7
-};
-	
-// Derives the predicted Intra4x4 mode of one 4x4 block from its neighbours.
-//   pIntraPredMode : mode cache; the entry at iIdx4 - 8 is read as the top
-//                    neighbour and the entry at iIdx4 - 1 as the left one
-//   iIdx4          : cache position of the current block
-// Returns 2 (the DC mode number) when either neighbour entry is -1, otherwise
-// the smaller of the two neighbour modes.
-int32_t PredIntra4x4Mode (int8_t* pIntraPredMode, int32_t iIdx4) {
-  const int8_t kiAboveMode = pIntraPredMode[iIdx4 - 8];
-  const int8_t kiLeftMode  = pIntraPredMode[iIdx4 - 1];
-
-  if (-1 == kiLeftMode || -1 == kiAboveMode)
-    return 2;
-
-  return WELS_MIN (kiLeftMode, kiAboveMode);
-}
-
-// Prepares the macroblock cache for intra mode decision of pCurMb.
-//   pEncCtx         : encoder context; supplies the current dependency layer
-//   pCurMb          : macroblock about to be analysed (its uiCbp is reset to 0)
-//   pMbCache        : cache whose source / colour-space / reconstruction
-//                     pointers and prediction buffers are (re)positioned
-//   iSliceFirstMbXY : index of the first macroblock of the current slice
-void WelsMdIntraInit (sWelsEncCtx* pEncCtx, SMB* pCurMb, SMbCache* pMbCache, const int32_t iSliceFirstMbXY) {
-  SDqLayer* pLayer    = pEncCtx->pCurDqLayer;
-  const int32_t kiCol = pCurMb->iMbX;
-  const int32_t kiRow = pCurMb->iMbY;
-
-  if (0 != kiCol && iSliceFirstMbXY != pCurMb->iMbXY) {
-    // Same row, same slice as the previous macroblock: step one macroblock to the right.
-    pMbCache->SPicData.pEncMb[0] += MB_WIDTH_LUMA;
-    pMbCache->SPicData.pEncMb[1] += MB_WIDTH_CHROMA;
-    pMbCache->SPicData.pEncMb[2] += MB_WIDTH_CHROMA;
-
-    pMbCache->SPicData.pDecMb[0] += MB_WIDTH_LUMA;
-    pMbCache->SPicData.pDecMb[1] += MB_WIDTH_CHROMA;
-    pMbCache->SPicData.pDecMb[2] += MB_WIDTH_CHROMA;
-
-    pMbCache->SPicData.pCsMb[0]  += MB_WIDTH_LUMA;
-    pMbCache->SPicData.pCsMb[1]  += MB_WIDTH_CHROMA;
-    pMbCache->SPicData.pCsMb[2]  += MB_WIDTH_CHROMA;
-  } else {
-    // First macroblock of a row or of the slice: recompute every pointer from the plane base.
-    const int32_t kiEncStrideY  = pLayer->iEncStride[0];
-    const int32_t kiEncStrideUV = pLayer->iEncStride[1];
-    const int32_t kiEncOffsetY  = (kiCol + kiRow * kiEncStrideY) << 4;
-    const int32_t kiEncOffsetUV = (kiCol + kiRow * kiEncStrideUV) << 3;
-
-    const int32_t kiCsStrideY   = pLayer->iCsStride[0];
-    const int32_t kiCsStrideUV  = pLayer->iCsStride[1];
-    const int32_t kiCsOffsetY   = (kiCol + kiRow * kiCsStrideY) << 4;
-    const int32_t kiCsOffsetUV  = (kiCol + kiRow * kiCsStrideUV) << 3;
-
-    const int32_t kiDecStrideY  = pLayer->pDecPic->iLineSize[0];
-    const int32_t kiDecStrideUV = pLayer->pDecPic->iLineSize[1];
-    const int32_t kiDecOffsetY  = (kiCol + kiRow * kiDecStrideY) << 4;
-    const int32_t kiDecOffsetUV = (kiCol + kiRow * kiDecStrideUV) << 3;
-
-    pMbCache->SPicData.pEncMb[0] = pLayer->pEncData[0] + kiEncOffsetY;
-    pMbCache->SPicData.pEncMb[1] = pLayer->pEncData[1] + kiEncOffsetUV;
-    pMbCache->SPicData.pEncMb[2] = pLayer->pEncData[2] + kiEncOffsetUV;
-
-    pMbCache->SPicData.pCsMb[0]  = pLayer->pCsData[0] + kiCsOffsetY;
-    pMbCache->SPicData.pCsMb[1]  = pLayer->pCsData[1] + kiCsOffsetUV;
-    pMbCache->SPicData.pCsMb[2]  = pLayer->pCsData[2] + kiCsOffsetUV;
-
-    pMbCache->SPicData.pDecMb[0] = pLayer->pDecPic->pData[0] + kiDecOffsetY;
-    pMbCache->SPicData.pDecMb[1] = pLayer->pDecPic->pData[1] + kiDecOffsetUV;
-    pMbCache->SPicData.pDecMb[2] = pLayer->pDecPic->pData[2] + kiDecOffsetUV;
-  }
-
-  // Reset the coded block pattern before any mode is tried.
-  pCurMb->uiCbp = 0;
-
-  // Load the intra neighbour information for this macroblock.
-  FillNeighborCacheIntra (pMbCache, pCurMb, pLayer->iMbWidth);
-
-  // WelsMdI16x16() re-points both prediction buffers, so restore the defaults here.
-  pMbCache->pMemPredLuma   = pMbCache->pMemPredMb;
-  pMbCache->pMemPredChroma = pMbCache->pMemPredMb + 256;
-}
-
-// Prepares the slice's macroblock cache for inter mode decision of pCurMb.
-//   pEncCtx         : encoder context (current layer, function table, VAA flags)
-//   pSlice          : slice owning the macroblock cache and the MV search window
-//   pCurMb          : macroblock about to be analysed
-//   iSliceFirstMbXY : index of the first macroblock of the current slice
-void WelsMdInterInit (sWelsEncCtx* pEncCtx, SSlice* pSlice, SMB* pCurMb, const int32_t iSliceFirstMbXY) {
-  SDqLayer* pLayer       = pEncCtx->pCurDqLayer;
-  SMbCache* pCache       = &pSlice->sMbCacheInfo;
-  const int32_t kiCol    = pCurMb->iMbX;
-  const int32_t kiRow    = pCurMb->iMbY;
-  const int32_t kiMbIdx  = pCurMb->iMbXY;
-  const int32_t kiMbCols = pLayer->iMbWidth;
-  const int32_t kiMbRows = pLayer->iMbHeight;
-
-  pCache->pEncSad = &pLayer->pDecPic->pMbSkipSad[kiMbIdx];
-
-  // Load the inter neighbour cache; the last argument points at this MB's background flag.
-  pEncCtx->pFuncList->pfFillInterNeighborCache (pCache, pCurMb, kiMbCols,
-      pEncCtx->pVaa->pVaaBackgroundMbFlag + kiMbIdx);
-
-  // Position the reference-picture pointers on the co-located macroblock.
-  if (0 != kiCol && iSliceFirstMbXY != kiMbIdx) {
-    pCache->SPicData.pRefMb[0] += MB_WIDTH_LUMA;
-    pCache->SPicData.pRefMb[1] += MB_WIDTH_CHROMA;
-    pCache->SPicData.pRefMb[2] += MB_WIDTH_CHROMA;
-  } else {
-    const int32_t kiStrideY  = pLayer->pRefPic->iLineSize[0];
-    const int32_t kiStrideUV = pLayer->pRefPic->iLineSize[1];
-    const int32_t kiOffsetY  = (kiCol + kiRow * kiStrideY) << 4;
-    const int32_t kiOffsetUV = (kiCol + kiRow * kiStrideUV) << 3;
-
-    pCache->SPicData.pRefMb[0] = pLayer->pRefPic->pData[0] + kiOffsetY;
-    pCache->SPicData.pRefMb[1] = pLayer->pRefPic->pData[1] + kiOffsetUV;
-    pCache->SPicData.pRefMb[2] = pLayer->pRefPic->pData[2] + kiOffsetUV;
-  }
-
-  pCache->uiRefMbType         = pLayer->pRefPic->uiRefMbType[kiMbIdx];
-  pCache->bCollocatedPredFlag = false;
-
-  // Mode decision may skip the P16x16 / P-skip analysis, so start from zeroed motion vectors.
-  ST32 (&pCurMb->sP16x16Mv, 0);
-  ST32 (&pLayer->pDecPic->sMvList[kiMbIdx], 0);
-
-  // Motion-vector window: derived from the macroblock position, then limited to +/- MV_RANGE.
-  pSlice->sMvMin.iMvX = -16 * (kiCol + 1) + INTPEL_NEEDED_MARGIN;
-  if (pSlice->sMvMin.iMvX < -MV_RANGE) {
-    pSlice->sMvMin.iMvX = -MV_RANGE;
-  }
-
-  pSlice->sMvMin.iMvY = -16 * (kiRow + 1) + INTPEL_NEEDED_MARGIN;
-  if (pSlice->sMvMin.iMvY < -MV_RANGE) {
-    pSlice->sMvMin.iMvY = -MV_RANGE;
-  }
-
-  pSlice->sMvMax.iMvX = 16 * (kiMbCols - kiCol) - INTPEL_NEEDED_MARGIN;
-  if (pSlice->sMvMax.iMvX > MV_RANGE) {
-    pSlice->sMvMax.iMvX = MV_RANGE;
-  }
-
-  pSlice->sMvMax.iMvY = 16 * (kiMbRows - kiRow) - INTPEL_NEEDED_MARGIN;
-  if (pSlice->sMvMax.iMvY > MV_RANGE) {
-    pSlice->sMvMax.iMvY = MV_RANGE;
-  }
-}
-
-// Chooses the Intra16x16 luma prediction mode with the lowest cost.
-//   pFunc       : function table (predictors, cost functions, optional combined search)
-//   pCurDqLayer : current layer; supplies the source and colour-space strides
-//   pMbCache    : macroblock cache; on return pMemPredLuma points at the best
-//                 prediction, pMemPredChroma at the other 256-byte half of
-//                 pMemPredMb, and uiLumaI16x16Mode holds the chosen mode
-//   iLambda     : weight applied to the mode signalling cost
-// Returns the cost of the chosen mode.
-int32_t WelsMdI16x16 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SMbCache* pMbCache, int32_t iLambda) {
-  uint8_t* pPredBuf[2]      = { pMbCache->pMemPredMb, pMbCache->pMemPredMb + 256 };
-  uint8_t* pPred            = pPredBuf[0];
-  uint8_t* pRec             = pMbCache->SPicData.pCsMb[0];
-  uint8_t* pSrc             = pMbCache->SPicData.pEncMb[0];
-  int32_t iRecStride        = pCurDqLayer->iCsStride[0];
-  int32_t iSrcStride        = pCurDqLayer->iEncStride[0];
-  const int32_t kiTableRow  = pMbCache->uiNeighborIntra & 0x07;
-  const int8_t* kpCandidate = g_kiIntra16AvaliMode[kiTableRow];
-  const int32_t kiCandNum   = g_kiIntra16AvaliMode[kiTableRow][4];
-  int32_t iSlot             = 0;  // index of the buffer that does NOT hold the best prediction
-  int32_t iCand, iMode, iCost, iBestMode, iBestCost = INT_MAX;
-
-  if (kiCandNum > 3 && pFunc->sSampleDealingFuncs.pfIntra16x16Combined3) {
-    // The combined routine covers the first three candidates in one call.
-    iBestCost = pFunc->sSampleDealingFuncs.pfIntra16x16Combined3 (pRec, iRecStride, pSrc, iSrcStride, &iBestMode,
-                iLambda, pPred/*temp*/);
-
-    // The fourth candidate is evaluated separately.
-    iMode = kpCandidate[3];
-    pFunc->pfGetLumaI16x16Pred[iMode] (pPred, pRec, iRecStride);
-    iCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_16x16] (pPred, 16, pSrc, iSrcStride) + iLambda * 4;
-
-    if (iCost < iBestCost) {
-      iBestMode = iMode;
-      iBestCost = iCost;
-    } else {
-      // pPred was overwritten by the fourth candidate: rebuild the winning prediction.
-      pFunc->pfGetLumaI16x16Pred[iBestMode] (pPred, pRec, iRecStride);
-    }
-    iSlot      = 1;
-    iBestCost += iLambda;
-  } else {
-    iBestMode = kpCandidate[0];
-    for (iCand = 0; iCand < kiCandNum; ++iCand) {
-      iMode = kpCandidate[iCand];
-
-      assert (iMode >= 0 && iMode < 7);
-
-      pFunc->pfGetLumaI16x16Pred[iMode] (pPred, pRec, iRecStride);
-      iCost  = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_16x16] (pPred, 16, pSrc, iSrcStride);
-      iCost += iLambda * (BsSizeUE (g_kiMapModeI16x16[iMode]));
-
-      if (iCost < iBestCost) {
-        // Keep this prediction and continue writing into the other buffer.
-        iBestMode = iMode;
-        iBestCost = iCost;
-        iSlot    ^= 0x01;
-        pPred     = pPredBuf[iSlot];
-      }
-    }
-  }
-
-  pMbCache->pMemPredChroma   = pPredBuf[iSlot];
-  pMbCache->pMemPredLuma     = pPredBuf[iSlot ^ 0x01];
-  pMbCache->uiLumaI16x16Mode = iBestMode;
-  return iBestCost;
-}
-// Exhaustive Intra4x4 mode decision for one macroblock.
-//   pEnc     : encoder context (sWelsEncCtx*)
-//   pMd      : mode-decision state (SWelsMD*); iLambda and iCostLuma are read
-//   pCurMb   : macroblock being analysed; pIntra4x4PredMode[0..6] is written
-//   pMbCache : macroblock cache (mode cache, prediction buffers, flag arrays)
-// For each of the 16 blocks every available mode is costed, the best one is
-// recorded and the block is encoded/reconstructed before the next block is
-// handled. The loop stops early once the accumulated cost reaches
-// pMd->iCostLuma. Returns the accumulated cost plus 24 * iLambda.
-int32_t WelsMdI4x4 (void* pEnc, void* pMd, SMB* pCurMb, SMbCache* pMbCache) {
-  sWelsEncCtx* pEncCtx                 = (sWelsEncCtx*)pEnc;
-  SWelsMD* pWelsMd                     = (SWelsMD*)pMd;
-  SWelsFuncPtrList* pFunc              = pEncCtx->pFuncList;
-  SDqLayer* pCurDqLayer                = pEncCtx->pCurDqLayer;
-  int32_t iLambda                      = pWelsMd->iLambda;
-  int32_t iCostLimit                   = pWelsMd->iCostLuma;
-  uint8_t* pEncMb                      = pMbCache->SPicData.pEncMb[0];
-  uint8_t* pDecMb                      = pMbCache->SPicData.pCsMb[0];
-  const int32_t kiLineSizeEnc          = pCurDqLayer->iEncStride[0];
-  const int32_t kiLineSizeDec          = pCurDqLayer->iCsStride[0];
-  bool_t* pPrevModeFlag                = pMbCache->pPrevIntra4x4PredModeFlag;
-  int8_t* pRemMode                     = pMbCache->pRemIntra4x4PredModeFlag;
-  const uint8_t* kpAvailCountTab       = &g_kiIntra4AvailCount[0];
-  const uint8_t* kpCacheScan           = &g_kuiCache48CountScan4Idx[0];
-  const int8_t* kpNeighborToAvail      = g_kiNeighborIntraToI4x4[pMbCache->uiNeighborIntra];
-  const int8_t* kpBlkX                 = &g_kiCoordinateIdx4x4X[0];
-  const int8_t* kpBlkY                 = &g_kiCoordinateIdx4x4Y[0];
-  // [0]: candidate differs from the predicted mode, [1]: candidate equals it
-  int32_t iModeCost[2]                 = { iLambda << 2, iLambda };
-  int32_t iBestPredBufferNum           = 0;
-  int32_t iCosti4x4                    = 0;
-
-  uint8_t* pCurEnc, *pCurDec, *pDst;
-  const uint8_t* kpAvailMode;
-  int32_t iPredMode, iCurMode, iBestMode, iFinalMode;
-  int32_t iCurCost, iBestCost, iAvailCount;
-  int32_t iBlk, iCand, iBlkX, iBlkY;
-
-#if defined(X86_ASM)
-  WelsPrefetchZero_mmx (g_kiMapModeI4x4);
-  WelsPrefetchZero_mmx ((int8_t*)&pFunc->pfGetLumaI4x4Pred);
-#endif//X86_ASM
-
-  for (iBlk = 0; iBlk < 16; iBlk++) {
-    const int32_t kiAvailIdx = kpNeighborToAvail[iBlk];
-
-    // step 1: locate the 4x4 block in the source and in the reconstruction
-    iBlkX   = kpBlkX[iBlk];
-    iBlkY   = kpBlkY[iBlk];
-    pCurEnc = pEncMb + ((iBlkY * kiLineSizeEnc) + iBlkX);
-    pCurDec = pDecMb + ((iBlkY * kiLineSizeDec) + iBlkX);
-
-    // step 2: predicted mode from the neighbours
-    iPredMode = PredIntra4x4Mode (pMbCache->iIntraPredMode, kpCacheScan[iBlk]);
-
-    // step 3: candidate list for this availability pattern
-    iAvailCount = kpAvailCountTab[kiAvailIdx];
-    kpAvailMode = g_kiIntra4AvailMode[kiAvailIdx];
-
-    // step 4: search for the cheapest candidate
-    iBestCost = INT_MAX;
-    iBestMode = kpAvailMode[0];
-    iCand     = 0;
-
-    // NOTE(review): the guard tests pfIntra4x4Combined3Satd but the call goes
-    // through pfIntra4x4Combined3 - confirm both pointers are always set together.
-    if (pFunc->sSampleDealingFuncs.pfIntra4x4Combined3Satd && (iAvailCount >= 6)) {
-      // The combined routine handles the first three candidates in one call,
-      // so the per-candidate loop below resumes at the fourth.
-      pDst = &pMbCache->pMemPredBlk4[iBestPredBufferNum << 4];
-      iBestCost = pFunc->sSampleDealingFuncs.pfIntra4x4Combined3 (pCurDec, kiLineSizeDec, pCurEnc, kiLineSizeEnc, pDst,
-                  &iBestMode,
-                  iModeCost[iPredMode == 2], iModeCost[iPredMode == 1], iModeCost[iPredMode == 0]);
-      iCand = 3;
-    }
-
-    for (; iCand < iAvailCount; ++iCand) {
-      iCurMode = kpAvailMode[iCand];
-
-      assert (iCurMode >= 0 && iCurMode < 14);
-
-      // Always predict into the buffer that does not hold the current best.
-      pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
-
-      pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
-      iCurCost = pFunc->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
-                 iModeCost[iPredMode == g_kiMapModeI4x4[iCurMode]];
-
-      if (iCurCost < iBestCost) {
-        iBestMode          = iCurMode;
-        iBestCost          = iCurCost;
-        iBestPredBufferNum = 1 - iBestPredBufferNum;
-      }
-    }
-
-    pMbCache->pBestPredI4x4Blk4 = &pMbCache->pMemPredBlk4[iBestPredBufferNum << 4];
-    iCosti4x4 += iBestCost;
-    if (iCosti4x4 >= iCostLimit) {
-      break;  // already no better than the cost to beat
-    }
-
-    // step 5: record the decision (flag plus remaining-mode value) and update the mode cache
-    iFinalMode = g_kiMapModeI4x4[iBestMode];
-    if (iPredMode != iFinalMode) {
-      *pPrevModeFlag++ = false;
-      *pRemMode        = (iFinalMode < iPredMode ? iFinalMode : (iFinalMode - 1));
-    } else {
-      *pPrevModeFlag++ = true;
-    }
-    pRemMode++;
-    pMbCache->iIntraPredMode[kpCacheScan[iBlk]] = iFinalMode;
-
-    // step 6: encode and reconstruct this I_4x4 block
-    WelsEncRecI4x4Y (pEncCtx, pCurMb, pMbCache, iBlk);
-  }
-
-  // Copy selected mode-cache entries into the macroblock
-  // (presumably the bottom row and right column for later neighbours - TODO confirm).
-  ST32 (pCurMb->pIntra4x4PredMode, LD32 (&pMbCache->iIntraPredMode[33]));
-  pCurMb->pIntra4x4PredMode[4] = pMbCache->iIntraPredMode[12];
-  pCurMb->pIntra4x4PredMode[5] = pMbCache->iIntraPredMode[20];
-  pCurMb->pIntra4x4PredMode[6] = pMbCache->iIntraPredMode[28];
-
-  iCosti4x4 += (iLambda << 4) + (iLambda << 3); //4*6*lambda from JVT SATD0
-  return iCosti4x4;
-}
-
-int32_t WelsMdI4x4Fast(void* pEnc, void* pMd, SMB* pCurMb, SMbCache *pMbCache)
-{
-	sWelsEncCtx* pEncCtx	= (sWelsEncCtx*)pEnc;	
-	SWelsFuncPtrList *pFunc		= pEncCtx->pFuncList;
-	SWelsMD *pWelsMd					= (SWelsMD*)pMd;
-	SDqLayer *pCurDqLayer			= pEncCtx->pCurDqLayer;	
-	int32_t iLambda				= pWelsMd->iLambda;
-	int32_t iBestCostLuma				= pWelsMd->iCostLuma;
-	uint8_t *pEncMb					= pMbCache->SPicData.pEncMb[0];
-	uint8_t *pDecMb					= pMbCache->SPicData.pCsMb[0];
-	const int32_t kiLineSizeEnc		= pCurDqLayer->iEncStride[0];
-	const int32_t kiLineSizeDec		= pCurDqLayer->iCsStride[0];
-
-	uint8_t* pCurEnc, *pCurDec, *pDst;
-	int8_t iPredMode, iCurMode, iBestMode, iFinalMode;
-	int32_t iCurCost, iBestCost;
-	int32_t iAvailCount;
-	const uint8_t * kpAvailMode;
-	int32_t i, j, iCoordinateX, iCoordinateY, iIdxStrideEnc, iIdxStrideDec;
-	int32_t iCostH, iCostV, iCostVR, iCostHD, iCostVL, iCostHU, iBestModeFake;
-	int32_t lambda[2]						= {iLambda<<2, iLambda};
-	bool_t* pPrevIntra4x4PredModeFlag	= pMbCache->pPrevIntra4x4PredModeFlag;
-	int8_t* pRemIntra4x4PredModeFlag		= pMbCache->pRemIntra4x4PredModeFlag;	
-	const uint8_t* kpIntra4x4AvailCount		= &g_kiIntra4AvailCount[0];
-	const uint8_t* kpCache48CountScan4		= &g_kuiCache48CountScan4Idx[0];
-	const int8_t* kpNeighborIntraToI4x4	= g_kiNeighborIntraToI4x4[pMbCache->uiNeighborIntra];	
-	const int8_t* kpCoordinateIdxX			= &g_kiCoordinateIdx4x4X[0];
-	const int8_t* kpCoordinateIdxY			= &g_kiCoordinateIdx4x4Y[0];
-	int32_t iBestPredBufferNum			= 0;
-	int32_t iCosti4x4						= 0;
-#if defined(X86_ASM)
-	WelsPrefetchZero_mmx(g_kiMapModeI4x4);
-	WelsPrefetchZero_mmx((int8_t *)&pFunc->pfGetLumaI4x4Pred);
-#endif//X86_ASM
-
-	for (i = 0; i < 16; i++) 
-	{
-		const int32_t kiOffset	= kpNeighborIntraToI4x4[i];
-//		const int32_t i_next	= (1+i) & 15;												// next loop
-//		const uint8_t dummy_byte= pIntra4x4AvailCount[pNeighborIntraToI4x4[i_next]];	// prefetch pIntra4x4AvailCount of next loop to avoid cache missed
-		
-		//step 1: locating current 4x4 block position in pEnc and pDecMb
-		iCoordinateX = kpCoordinateIdxX[i];
-		iCoordinateY = kpCoordinateIdxY[i];		
-
-		iIdxStrideEnc = (iCoordinateY * kiLineSizeEnc) + iCoordinateX;
-		pCurEnc = pEncMb + iIdxStrideEnc;
-		iIdxStrideDec = (iCoordinateY * kiLineSizeDec) + iCoordinateX;
-		pCurDec = pDecMb + iIdxStrideDec;
-
-		//step 2: get predicted mode from neighbor
-		iPredMode = PredIntra4x4Mode(pMbCache->iIntraPredMode, kpCache48CountScan4[i]);
-		//step 3: collect candidates of iPredMode		
-		iAvailCount = kpIntra4x4AvailCount[kiOffset];
-		kpAvailMode = g_kiIntra4AvailMode[kiOffset];
-
-		if (iAvailCount == 9 || iAvailCount == 7)
-		{
-			//I4_PRED_DC(2)
-
-			iBestMode = I4_PRED_DC;
-
-			pDst = &pMbCache->pMemPredBlk4[iBestPredBufferNum << 4];
-
-			pFunc->pfGetLumaI4x4Pred[I4_PRED_DC](pDst, pCurDec, kiLineSizeDec);
-			iBestCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4](pDst, 4, pCurEnc, kiLineSizeEnc) +
-				lambda[iPredMode == g_kiMapModeI4x4[iBestMode]];	
-
-			//I4_PRED_H(1)			
-			iCurMode = I4_PRED_H; 
-
-			pDst = &pMbCache->pMemPredBlk4[(1 - iBestPredBufferNum) << 4];
-
-			pFunc->pfGetLumaI4x4Pred[iCurMode](pDst, pCurDec, kiLineSizeDec); 
-			iCostH = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4](pDst, 4, pCurEnc, kiLineSizeEnc) +
-				lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
-
-			if (iCurCost < iBestCost) 
-			{ 
-				iBestMode = iCurMode; 
-				iBestCost = iCurCost; 
-				iBestPredBufferNum = 1 - iBestPredBufferNum;
-			}
-
-			//I4_PRED_V(0)
-			iCurMode = I4_PRED_V; 
-
-			pDst = &pMbCache->pMemPredBlk4[(1 - iBestPredBufferNum) << 4];
-
-			pFunc->pfGetLumaI4x4Pred[iCurMode](pDst, pCurDec, kiLineSizeDec); 
-			iCostV = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4](pDst, 4, pCurEnc, kiLineSizeEnc) +
-				lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
-
-			if (iCurCost < iBestCost) 
-			{ 
-				iBestMode = iCurMode; 
-				iBestCost = iCurCost; 
-				iBestPredBufferNum = 1 - iBestPredBufferNum;
-			}
-			if ( iCostV < iCostH )
-			{
-				if (iAvailCount == 9)
-				{
-					iBestModeFake = true; //indicating whether V is the best fake mode
-
-					//I4_PRED_VR(5) and I4_PRED_VL(7)
-					iCurMode = I4_PRED_VR; 
-
-					pDst = &pMbCache->pMemPredBlk4[(1 - iBestPredBufferNum) << 4];
-
-					pFunc->pfGetLumaI4x4Pred[iCurMode](pDst, pCurDec, kiLineSizeDec); 
-					iCostVR = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4](pDst, 4, pCurEnc, kiLineSizeEnc) +
-						lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
-
-					if (iCurCost < iBestCost) 
-					{ 
-						iBestMode = iCurMode; 
-						iBestCost = iCurCost; 
-						iBestPredBufferNum = 1 - iBestPredBufferNum;
-					}
-
-					if (iCurCost < iCostV) 
-						iBestModeFake = false;
-
-					iCurMode = I4_PRED_VL; 
-
-					pDst = &pMbCache->pMemPredBlk4[(1 - iBestPredBufferNum) << 4];
-
-					pFunc->pfGetLumaI4x4Pred[iCurMode](pDst, pCurDec, kiLineSizeDec); 
-					iCostVL = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4](pDst, 4, pCurEnc, kiLineSizeEnc) +
-						lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
-
-					if (iCurCost < iBestCost) 
-					{ 
-						iBestMode = iCurMode; 
-						iBestCost = iCurCost; 
-						iBestPredBufferNum = 1 - iBestPredBufferNum;
-					}
-
-					if (iCurCost < iCostV) 
-						iBestModeFake = false;	
-
-					//Vertical Early Determination
-					if ( !iBestModeFake ) //Vertical is not the best, go on checking...
-					{
-						//select the best one from VL and VR
-						if (iCostVR < iCostVL)
-						{
-							//I4_PRED_DDR(4)
-							iCurMode = I4_PRED_DDR; 
-
-							pDst = &pMbCache->pMemPredBlk4[(1 - iBestPredBufferNum) << 4];
-
-							pFunc->pfGetLumaI4x4Pred[iCurMode](pDst, pCurDec, kiLineSizeDec); 
-
-							iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4](pDst, 4, pCurEnc, kiLineSizeEnc) +
-								lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
-
-							if (iCurCost < iBestCost) 
-							{ 
-								iBestMode = iCurMode; 
-								iBestCost = iCurCost; 
-								iBestPredBufferNum = 1 - iBestPredBufferNum;
-							}
-						}
-						else
-						{
-							//I4_PRED_DDL(3)
-							iCurMode = I4_PRED_DDL;
-
-							pDst = &pMbCache->pMemPredBlk4[(1 - iBestPredBufferNum) << 4];
-
-							pFunc->pfGetLumaI4x4Pred[iCurMode](pDst, pCurDec, kiLineSizeDec); 
-
-							iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4](pDst, 4, pCurEnc, kiLineSizeEnc) +
-								lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
-
-							if (iCurCost < iBestCost) 
-							{ 
-								iBestMode = iCurMode; 
-								iBestCost = iCurCost; 
-								iBestPredBufferNum = 1 - iBestPredBufferNum;
-							}
-						}
-					}
-				}
-				else if(iAvailCount == 7)
-				{
-					iCurMode = I4_PRED_DDR; 
-
-					pDst = &pMbCache->pMemPredBlk4[(1 - iBestPredBufferNum) << 4];
-
-					pFunc->pfGetLumaI4x4Pred[iCurMode](pDst, pCurDec, kiLineSizeDec); 				
-					iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4](pDst, 4, pCurEnc, kiLineSizeEnc) +
-						lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
-
-					if (iCurCost < iBestCost) 
-					{ 
-						iBestMode = iCurMode; 
-						iBestCost = iCurCost; 
-						iBestPredBufferNum = 1 - iBestPredBufferNum;
-					}			
-
-					iCurMode = I4_PRED_VR; 
-
-					pDst = &pMbCache->pMemPredBlk4[(1 - iBestPredBufferNum) << 4];
-
-					pFunc->pfGetLumaI4x4Pred[iCurMode](pDst, pCurDec, kiLineSizeDec); 
-
-					iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4](pDst, 4, pCurEnc, kiLineSizeEnc) +
-						lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
-
-					if (iCurCost < iBestCost) 
-					{ 
-						iBestMode = iCurMode; 
-						iBestCost = iCurCost; 
-						iBestPredBufferNum = 1 - iBestPredBufferNum;
-					}			
-				}
-			}
-			else
-			{
-				iBestModeFake = true; //indicating whether H is the best fake mode
-				//I4_PRED_HD(6) and I4_PRED_HU(8)
-				iCurMode = I4_PRED_HD; 
-
-				pDst = &pMbCache->pMemPredBlk4[(1 - iBestPredBufferNum) << 4];
-
-				pFunc->pfGetLumaI4x4Pred[iCurMode](pDst, pCurDec, kiLineSizeDec); 
-				iCostHD = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4](pDst, 4, pCurEnc, kiLineSizeEnc) +
-					lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
-
-				if (iCurCost < iBestCost) 
-				{ 
-					iBestMode = iCurMode; 
-					iBestCost = iCurCost; 
-					iBestPredBufferNum = 1 - iBestPredBufferNum;
-				}			
-
-				if (iCurCost < iCostH) 
-					iBestModeFake = false;
-
-				iCurMode = I4_PRED_HU; 
-
-				pDst = &pMbCache->pMemPredBlk4[(1 - iBestPredBufferNum) << 4];
-
-				pFunc->pfGetLumaI4x4Pred[iCurMode](pDst, pCurDec, kiLineSizeDec); 
-				iCostHU = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4](pDst, 4, pCurEnc, kiLineSizeEnc) +
-					lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
-
-				if (iCurCost < iBestCost) 
-				{ 
-					iBestMode = iCurMode; 
-					iBestCost = iCurCost; 
-					iBestPredBufferNum = 1 - iBestPredBufferNum;
-				}			
-
-				if (iCurCost < iCostH) 
-					iBestModeFake = false;	
-
-				if ( !iBestModeFake ) //Horizontal is not the best, go on checking...
-				{
-					//select the best one from VL and VR
-					if (iCostHD < iCostHU)
-					{
-						//I4_PRED_DDR(4)
-						iCurMode = I4_PRED_DDR; 
-
-						pDst = &pMbCache->pMemPredBlk4[(1 - iBestPredBufferNum) << 4];
-
-						pFunc->pfGetLumaI4x4Pred[iCurMode](pDst, pCurDec, kiLineSizeDec); 
-						iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4](pDst, 4, pCurEnc, kiLineSizeEnc) +
-							lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
-
-						if (iCurCost < iBestCost) 
-						{ 
-							iBestMode = iCurMode; 
-							iBestCost = iCurCost; 
-							iBestPredBufferNum = 1 - iBestPredBufferNum;
-						}			
-					}
-					else if(iAvailCount == 9)
-					{
-						//I4_PRED_DDL(3)
-						iCurMode = I4_PRED_DDL; 
-
-						pDst = &pMbCache->pMemPredBlk4[(1 - iBestPredBufferNum) << 4];
-						pFunc->pfGetLumaI4x4Pred[iCurMode](pDst, pCurDec, kiLineSizeDec); 
-
-						iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4](pDst, 4, pCurEnc, kiLineSizeEnc) +
-							lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
-
-						if (iCurCost < iBestCost) 
-						{ 
-							iBestMode = iCurMode; 
-							iBestCost = iCurCost; 
-							iBestPredBufferNum = 1 - iBestPredBufferNum;
-						}
-
-					}
-				}
-			}
-		}
-		else
-		{
-			iBestCost = INT_MAX;
-       		iBestMode = I4_PRED_INVALID;
-			for (j = 0; j < iAvailCount; j++)
-			{
-				// I4x4_MODE_CHECK(pAvailMode[j], iCurCost);
-				iCurMode = kpAvailMode[j]; 
-
-				pDst = &pMbCache->pMemPredBlk4[(1 - iBestPredBufferNum) << 4];
-
-				pFunc->pfGetLumaI4x4Pred[iCurMode](pDst, pCurDec, kiLineSizeDec); 
-				iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4](pDst, 4, pCurEnc, kiLineSizeEnc) +
-					lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
-
-				if (iCurCost < iBestCost) 
-				{ 
-					iBestMode = iCurMode; 
-					iBestCost = iCurCost; 
-					iBestPredBufferNum = 1 - iBestPredBufferNum;
-				}			
-			}
-		}
-		pMbCache->pBestPredI4x4Blk4 = &pMbCache->pMemPredBlk4[iBestPredBufferNum << 4];
-		iCosti4x4 += iBestCost;
-		if (iCosti4x4 >= iBestCostLuma)
-		{
-			break;
-		}
-
-		//step 5: update pred mode and sample avail cache
-		iFinalMode = g_kiMapModeI4x4[iBestMode];
-		if (iPredMode == iFinalMode)
-		{
-			*pPrevIntra4x4PredModeFlag++ = true;
-		}
-		else
-		{
-			*pPrevIntra4x4PredModeFlag++ = false;
-			*pRemIntra4x4PredModeFlag  = (iFinalMode < iPredMode ? iFinalMode: (iFinalMode-1));						
-		}
-		pRemIntra4x4PredModeFlag++;
-		//	pCurMb->pIntra4x4PredMode[scan4[i]] = iFinalMode;
-	    pMbCache->iIntraPredMode[kpCache48CountScan4[i]] = iFinalMode;
-		//step 6: encoding I_4x4 
-		WelsEncRecI4x4Y(pEncCtx, pCurMb, pMbCache, i);
-	}
-	ST32(pCurMb->pIntra4x4PredMode, LD32(&pMbCache->iIntraPredMode[33]));
-	pCurMb->pIntra4x4PredMode[4] = pMbCache->iIntraPredMode[12];
-	pCurMb->pIntra4x4PredMode[5] =	pMbCache->iIntraPredMode[20];
-	pCurMb->pIntra4x4PredMode[6] = pMbCache->iIntraPredMode[28];
-	iCosti4x4 += (iLambda << 4) + (iLambda << 3); //4*6*lambda from JVT SATD0	
-	return iCosti4x4;
-}
-
-// Chooses the intra chroma prediction mode with the lowest combined Cb + Cr cost.
-//   pFunc       : function table (predictors, cost functions, optional combined search)
-//   pCurDqLayer : current layer; supplies the chroma source and colour-space strides
-//   pMbCache    : macroblock cache; pMemPredChroma provides two 128-byte
-//                 prediction buffers (Cb in the first 64 bytes, Cr in the next 64).
-//                 On return pBestPredIntraChroma points at the winning buffer
-//                 and uiChmaI8x8Mode holds the chosen mode
-//   iLambda     : weight applied to the mode signalling cost
-// Returns the cost of the chosen mode.
-int32_t WelsMdIntraChroma (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SMbCache* pMbCache, int32_t iLambda) {
-  uint8_t* pPredBuf[2]        = { pMbCache->pMemPredChroma, pMbCache->pMemPredChroma + 128 };
-  uint8_t* pPred              = pPredBuf[0];
-  uint8_t* pSrcCb             = pMbCache->SPicData.pEncMb[1];
-  uint8_t* pSrcCr             = pMbCache->SPicData.pEncMb[2];
-  uint8_t* pRecCb             = pMbCache->SPicData.pCsMb[1];
-  uint8_t* pRecCr             = pMbCache->SPicData.pCsMb[2];
-  const int32_t kiSrcStride   = pCurDqLayer->iEncStride[1];
-  const int32_t kiRecStride   = pCurDqLayer->iCsStride[1];
-  const int32_t kiTableRow    = pMbCache->uiNeighborIntra & 0x07;
-  const int8_t* kpCandidate   = g_kiIntraChromaAvailMode[kiTableRow];
-  const int32_t kiCandNum     = g_kiIntraChromaAvailMode[kiTableRow][4];
-  int32_t iSlot               = 0;  // index of the buffer that does NOT hold the best prediction
-  int32_t iCand, iMode, iCost, iBestMode, iBestCost = INT_MAX;
-
-  if (kiCandNum > 3 && pFunc->sSampleDealingFuncs.pfIntra8x8Combined3) {
-    // The combined routine covers the first three candidates for both planes.
-    iBestCost = pFunc->sSampleDealingFuncs.pfIntra8x8Combined3 (pRecCb, kiRecStride, pSrcCb, kiSrcStride, &iBestMode,
-                iLambda, pPred, pRecCr, pSrcCr);
-
-    // The fourth candidate is evaluated separately.
-    iMode = kpCandidate[3];
-    pFunc->pfGetChromaPred[iMode] (pPred, pRecCb, kiRecStride);       //Cb
-    pFunc->pfGetChromaPred[iMode] (pPred + 64, pRecCr, kiRecStride);  //Cr
-
-    iCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_8x8] (pPred, 8, pSrcCb, kiSrcStride) +
-            pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_8x8] (pPred + 64, 8, pSrcCr, kiSrcStride) +
-            iLambda * 4;
-
-    if (iCost < iBestCost) {
-      iBestMode = iMode;
-      iBestCost = iCost;
-    } else {
-      // pPred was overwritten by the fourth candidate: rebuild the winning prediction.
-      pFunc->pfGetChromaPred[iBestMode] (pPred, pRecCb, kiRecStride);       //Cb
-      pFunc->pfGetChromaPred[iBestMode] (pPred + 64, pRecCr, kiRecStride);  //Cr
-    }
-    iBestCost += iLambda;
-    iSlot      = 1;
-  } else {
-    iBestMode = kpCandidate[0];
-    for (iCand = 0; iCand < kiCandNum; ++iCand) {
-      iMode = kpCandidate[iCand];
-
-      assert (iMode >= 0 && iMode < 7);
-
-      pFunc->pfGetChromaPred[iMode] (pPred, pRecCb, kiRecStride);  //Cb
-      iCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_8x8] (pPred, 8, pSrcCb, kiSrcStride);
-
-      pFunc->pfGetChromaPred[iMode] (pPred + 64, pRecCr, kiRecStride);  //Cr
-      iCost += pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_8x8] (pPred + 64, 8, pSrcCr, kiSrcStride) +
-               iLambda * BsSizeUE (g_kiMapModeIntraChroma[iMode]);
-
-      if (iCost < iBestCost) {
-        // Keep this prediction and continue writing into the other buffer.
-        iBestMode = iMode;
-        iBestCost = iCost;
-        iSlot    ^= 0x01;
-        pPred     = pPredBuf[iSlot];
-      }
-    }
-  }
-
-  pMbCache->pBestPredIntraChroma = pPredBuf[iSlot ^ 0x01];
-  pMbCache->uiChmaI8x8Mode       = iBestMode;
-  return iBestCost;
-}
-// Runs the exhaustive Intra4x4 search and adopts it when it is cheaper than
-// the luma cost already stored in the mode-decision state.
-//   pEnc / pMd : encoder context (sWelsEncCtx*) and mode-decision state (SWelsMD*)
-//   pCurMb     : macroblock; uiMbType becomes MB_TYPE_INTRA4x4 when I4x4 wins
-//   pMbCache   : macroblock cache forwarded to WelsMdI4x4
-// Returns the (possibly updated) pMd->iCostLuma.
-int32_t WelsMdIntraFinePartition (void* pEnc, void* pMd, SMB* pCurMb, SMbCache* pMbCache) {
-  sWelsEncCtx* pCtx = (sWelsEncCtx*)pEnc;
-  SWelsMD* pState   = (SWelsMD*)pMd;
-
-  const int32_t kiI4x4Cost = WelsMdI4x4 (pCtx, pState, pCurMb, pMbCache);
-
-  if (kiI4x4Cost < pState->iCostLuma) {
-    pCurMb->uiMbType  = MB_TYPE_INTRA4x4;
-    pState->iCostLuma = kiI4x4Cost;
-  }
-  return pState->iCostLuma;
-}
-
-// Variant of WelsMdIntraFinePartition that first consults
-// MdIntraAnalysisVaaInfo() on the source luma block: only when that returns
-// non-zero is the fast Intra4x4 search run and, if cheaper, adopted.
-//   pEnc / pMd : encoder context (sWelsEncCtx*) and mode-decision state (SWelsMD*)
-//   pCurMb     : macroblock; uiMbType becomes MB_TYPE_INTRA4x4 when I4x4 wins
-//   pMbCache   : macroblock cache
-// Returns the (possibly updated) pMd->iCostLuma.
-int32_t WelsMdIntraFinePartitionVaa (void* pEnc, void* pMd, SMB* pCurMb, SMbCache* pMbCache) {
-  sWelsEncCtx* pCtx = (sWelsEncCtx*)pEnc;
-  SWelsMD* pState   = (SWelsMD*)pMd;
-
-  if (!MdIntraAnalysisVaaInfo (pCtx, pMbCache->SPicData.pEncMb[0])) {
-    return pState->iCostLuma;
-  }
-
-  const int32_t kiI4x4Cost = WelsMdI4x4Fast (pCtx, pState, pCurMb, pMbCache);
-
-  if (kiI4x4Cost < pState->iCostLuma) {
-    pCurMb->uiMbType  = MB_TYPE_INTRA4x4;
-    pState->iCostLuma = kiI4x4Cost;
-  }
-  return pState->iCostLuma;
-}
-
-// Intra mode decision for one macroblock: Intra16x16 is evaluated first and
-// taken as the provisional choice, then WelsMdIntraSecondaryModesEnc() is
-// called with the same context, state, macroblock and cache.
-void WelsMdIntraMb (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache* pMbCache) {
-  SWelsFuncPtrList* pFuncs = pEncCtx->pFuncList;
-  SDqLayer* pLayer         = pEncCtx->pCurDqLayer;
-
-  // I_16x16 sets the luma cost to beat and (re)assigns the prediction buffers.
-  pWelsMd->iCostLuma = WelsMdI16x16 (pFuncs, pLayer, pMbCache, pWelsMd->iLambda);
-  pCurMb->uiMbType   = MB_TYPE_INTRA16x16;
-
-  WelsMdIntraSecondaryModesEnc (pEncCtx, pWelsMd, pCurMb, pMbCache);
-}
-
-// Motion search for the whole macroblock as a single 16x16 partition.
-//   pFunc     : function table; pfMotionSearch performs the search
-//   pCurLayer : current layer (dimensions, reference and reconstructed pictures)
-//   pWelsMd   : mode-decision state; sMe.sMe16x16 is filled in
-//   pSlice    : slice; its MV candidate list (sMvc / uiMvcNum) is rebuilt here
-//   pCurMb    : macroblock; sP16x16Mv receives the resulting motion vector
-// Returns the uiSatdCost of the search result.
-int32_t WelsMdP16x16 (SWelsFuncPtrList* pFunc, SDqLayer* pCurLayer, SWelsMD* pWelsMd, SSlice* pSlice, SMB* pCurMb) {
-  SMbCache* pCache        = &pSlice->sMbCacheInfo;
-  SWelsME* pMe            = &pWelsMd->sMe.sMe16x16;
-  const uint32_t kuiAvail = pCurMb->uiNeighborAvail;
-  const int32_t kiMbCols  = pCurLayer->iMbWidth;
-  const int32_t kiMbRows  = pCurLayer->iMbHeight;
-
-  pMe->uiPixel  = BLOCK_16x16;
-  pMe->pMvdCost = pWelsMd->pMvdCost;
-  pMe->pEncMb   = pCache->SPicData.pEncMb[0];
-  pMe->pRefMb   = pCache->SPicData.pRefMb[0];
-  pMe->uSadPredISatd.uiSadPred = pWelsMd->iSadPredMb;
-
-  // Candidate 0 is always the base motion vector.
-  pSlice->sMvc[0]  = pMe->sMvBase;
-  pSlice->uiMvcNum = 1;
-
-  // Spatial candidates: the 16x16 vectors of the left and top macroblocks.
-  if (kuiAvail & LEFT_MB_POS) {
-    pSlice->sMvc[pSlice->uiMvcNum++] = (pCurMb - 1)->sP16x16Mv;
-  }
-  if (kuiAvail & TOP_MB_POS) {
-    pSlice->sMvc[pSlice->uiMvcNum++] = (pCurMb - kiMbCols)->sP16x16Mv;
-  }
-
-  // Temporal candidates: vectors stored in the reference picture for the
-  // macroblocks to the right of and below this position, scaled by sScaleShift.
-  if (pCurLayer->pRefPic->iPictureType == P_SLICE) {
-    if (pCurMb->iMbX < kiMbCols - 1) {
-      SMVUnitXY sRightMv = pCurLayer->pRefPic->sMvList[pCurMb->iMbXY + 1];
-      pSlice->sMvc[pSlice->uiMvcNum].iMvX = sRightMv.iMvX >> pSlice->sScaleShift;
-      pSlice->sMvc[pSlice->uiMvcNum].iMvY = sRightMv.iMvY >> pSlice->sScaleShift;
-      ++pSlice->uiMvcNum;
-    }
-    if (pCurMb->iMbY < kiMbRows - 1) {
-      SMVUnitXY sBelowMv = pCurLayer->pRefPic->sMvList[pCurMb->iMbXY + kiMbCols];
-      pSlice->sMvc[pSlice->uiMvcNum].iMvX = sBelowMv.iMvX >> pSlice->sScaleShift;
-      pSlice->sMvc[pSlice->uiMvcNum].iMvY = sBelowMv.iMvY >> pSlice->sScaleShift;
-      ++pSlice->uiMvcNum;
-    }
-  }
-
-  PredMv (&pCache->sMvComponents, 0, 4, 0, & (pMe->sMvp));
-  pFunc->pfMotionSearch (pFunc, pCurLayer, pMe, pSlice);
-
-  // Publish the result to the macroblock and to the reconstructed picture's MV list.
-  pCurMb->sP16x16Mv = pMe->sMv;
-  pCurLayer->pDecPic->sMvList[pCurMb->iMbXY] = pMe->sMv;
-
-  return pMe->uiSatdCost;
-}
-int32_t WelsMdP16x8(SWelsFuncPtrList *pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice *pSlice)
-{
-	SMbCache *pMbCache = &pSlice->sMbCacheInfo;
-	int32_t iStrideEnc = pCurDqLayer->iEncStride[0];
-	int32_t iStrideRef = pCurDqLayer->pRefPic->iLineSize[0];
-	SWelsME* sMe16x8; 
-	int32_t i = 0;
-	int32_t iCostP16x8 = 0;
-	do 
-	{
-		sMe16x8 = &pWelsMd->sMe.sMe16x8[i];
-
-		sMe16x8->uiPixel = BLOCK_16x8;
-		sMe16x8->pMvdCost	 = pWelsMd->pMvdCost;
-
-		sMe16x8->pEncMb       = pMbCache->SPicData.pEncMb[0] + ((i << 3) * iStrideEnc);
-		sMe16x8->pRefMb       = pMbCache->SPicData.pRefMb[0] + ((i << 3) * iStrideRef);		
-		sMe16x8->uSadPredISatd.uiSadPred = pWelsMd->iSadPredMb >> 1;
-
-		pSlice->sMvc[0]	= sMe16x8->sMvBase;
-		pSlice->uiMvcNum = 1;
-
-		PredInter16x8Mv(pMbCache, i<<3, 0, &(sMe16x8->sMvp));
-		pFunc->pfMotionSearch(pFunc, pCurDqLayer, sMe16x8, pSlice);		
-		UpdateP16x8Motion2Cache(pMbCache, i<<3, pWelsMd->uiRef, &(sMe16x8->sMv));
-        iCostP16x8 += sMe16x8->uiSatdCost;
-		++i;
-	} while(i < 2);
-	return iCostP16x8;
-}
-int32_t WelsMdP8x16(SWelsFuncPtrList *pFunc, SDqLayer* pCurLayer, SWelsMD* pWelsMd, SSlice *pSlice)
-{
-	SMbCache *pMbCache = &pSlice->sMbCacheInfo;
-	SWelsME* sMe8x16;
-	int32_t i = 0;
-	int32_t iCostP8x16 = 0;
-	do 
-	{
-		sMe8x16 = &pWelsMd->sMe.sMe8x16[i];
-
-		sMe8x16->uiPixel = BLOCK_8x16;
-		sMe8x16->pMvdCost     = pWelsMd->pMvdCost;
-
-		sMe8x16->pEncMb       = pMbCache->SPicData.pEncMb[0] + (i << 3);
-		sMe8x16->pRefMb       = pMbCache->SPicData.pRefMb[0] + (i << 3);			
-		sMe8x16->uSadPredISatd.uiSadPred = pWelsMd->iSadPredMb >> 1;
-		
-		pSlice->sMvc[0] = sMe8x16->sMvBase;
-		pSlice->uiMvcNum = 1;		
-
-		PredInter8x16Mv(pMbCache, i<<2, 0, &(sMe8x16->sMvp));
-		pFunc->pfMotionSearch(pFunc, pCurLayer, sMe8x16, pSlice);		
-		UpdateP8x16Motion2Cache(pMbCache, i<<2, pWelsMd->uiRef, &(sMe8x16->sMv));
-		iCostP8x16 += sMe8x16->uiSatdCost;
-//		sMe8x16++;
-		++i;
-	} while(i < 2);
-	return iCostP8x16;
-}
-int32_t WelsMdP8x8(SWelsFuncPtrList *pFunc,SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice *pSlice)
-{
-	SMbCache *pMbCache = &pSlice->sMbCacheInfo;
-	int32_t iLineSizeEnc = pCurDqLayer->iEncStride[0];
-	int32_t iLineSizeRef = pCurDqLayer->pRefPic->iLineSize[0];
-	SWelsME* sMe8x8;
-	int32_t i, iIdxX, iIdxY, iStrideEnc, iStrideRef;
-	int32_t iCostP8x8 = 0;
-	for (i = 0; i < 4; i++)
-	{
-		iIdxX = i & 1;
-		iIdxY = i >> 1;
-		iStrideEnc = (iIdxX << 3) + ((iIdxY << 3) * iLineSizeEnc);
-		iStrideRef = (iIdxX << 3) + ((iIdxY << 3) * iLineSizeRef);
-
-		sMe8x8 = &pWelsMd->sMe.sMe8x8[i];
-
-		sMe8x8->uiPixel = BLOCK_8x8;
-		sMe8x8->pMvdCost     = pWelsMd->pMvdCost;		
-
-		sMe8x8->pEncMb       = pMbCache->SPicData.pEncMb[0] + iStrideEnc;
-		sMe8x8->pRefMb       = pMbCache->SPicData.pRefMb[0] + iStrideRef;			
-		sMe8x8->uSadPredISatd.uiSadPred = pWelsMd->iSadPredMb >> 2;
-
-		pSlice->sMvc[0] = sMe8x8->sMvBase;
-		pSlice->uiMvcNum = 1;		
-
-		PredMv(&pMbCache->sMvComponents, i<<2, 2, pWelsMd->uiRef, &(sMe8x8->sMvp));
-		pFunc->pfMotionSearch(pFunc, pCurDqLayer, sMe8x8, pSlice);		
-		UpdateP8x8Motion2Cache(pMbCache, i<<2, pWelsMd->uiRef, &(sMe8x8->sMv));
-        iCostP8x8 += sMe8x8->uiSatdCost;
-//		sMe8x8++;
-	}
-    return iCostP8x8;
-}
-
-void WelsMdInterFinePartition(void* pEnc, void* pMd, SSlice *pSlice, SMB* pCurMb, int32_t iBestCost)
-{	
-	sWelsEncCtx* pEncCtx = (sWelsEncCtx*)pEnc;	
-	SWelsMD* pWelsMd = (SWelsMD*)pMd;
-    
-	SDqLayer* pCurDqLayer = pEncCtx->pCurDqLayer;
-//	SMbCache *pMbCache = &pSlice->sMbCacheInfo;
-	int32_t iCost = 0;
-
-//	WelsLog( pEncCtx, WELS_LOG_INFO, "WelsMdP8x8, p_ref[0]= 0x%p\n", pMbCache->SPicData.pRefMb[0]);
-	
-	iCost = WelsMdP8x8(pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice);
-
-	if (iCost < iBestCost)
-	{
-		int32_t iCostPart;
-		pCurMb->uiMbType = MB_TYPE_8x8;
-			
-//		WelsLog( pEncCtx, WELS_LOG_INFO, "WelsMdP16x8, p_ref[0]= 0x%p\n", pMbCache->SPicData.pRefMb[0]);
-		iCostPart = WelsMdP16x8(pEncCtx->pFuncList,pCurDqLayer, pWelsMd, pSlice);
-		if (iCostPart <= iCost)
-		{
-			iCost = iCostPart;
-			pCurMb->uiMbType = MB_TYPE_16x8;
-			//pCurMb->mb_partition = 2;
-		}
-		
-//		WelsLog( pEncCtx, WELS_LOG_INFO, "WelsMdP8x16, p_ref[0]= 0x%p\n", pMbCache->SPicData.pRefMb[0]);
-		iCostPart = WelsMdP8x16(pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice);
-		if (iCostPart <= iCost)
-		{
-			iCost = iCostPart;
-			pCurMb->uiMbType = MB_TYPE_8x16;
-			//pCurMb->mb_partition = 2;
-		}
-	}
-}
-
-void WelsMdInterFinePartitionVaa( void* pEnc, void* pMd, SSlice *pSlice, SMB* pCurMb, int32_t iBestCost )
-{	
-	sWelsEncCtx* pEncCtx = (sWelsEncCtx*)pEnc;	
-	SWelsMD* pWelsMd = (SWelsMD*)pMd;
-    
-	SDqLayer* pCurDqLayer = pEncCtx->pCurDqLayer;
-//	SMbCache *pMbCache = &pSlice->sMbCacheInfo;
-	int32_t iCostP8x16, iCostP16x8, iCostP8x8;
-	uint8_t uiMbSign = pEncCtx->pFuncList->pfGetMbSignFromInterVaa( &pEncCtx->pVaa->sVaaCalcInfo.pSad8x8[pCurMb->iMbXY][0] );			
-	
- 	if ( uiMbSign == 15 )
-	{
-		return;
-	}
-
-//	iCost = pWelsMd->sMe16x16.uiSatdCost;
-	
-	switch ( uiMbSign )
-	{
-	case 3:
-	case 12:
-//		WelsLog( pEncCtx, WELS_LOG_INFO, "WelsMdP16x8, p_ref[0]= 0x%p\n", pMbCache->SPicData.pRefMb[0]);
-		iCostP16x8 = WelsMdP16x8(pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice );
- 		if ( iCostP16x8 < iBestCost )
-		{
-			iBestCost = iCostP16x8;
-			pCurMb->uiMbType = MB_TYPE_16x8;
-			//pCurMb->mb_partition = 2;
-		}
- 		break;
-
-	case 5:
-	case 10:
-//		WelsLog( pEncCtx, WELS_LOG_INFO, "WelsMdP8x16, p_ref[0]= 0x%p\n", pMbCache->SPicData.pRefMb[0]);
-		iCostP8x16 = WelsMdP8x16(pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice );
- 		if ( iCostP8x16 < iBestCost )
-		{
-			iBestCost = iCostP8x16;
-			pCurMb->uiMbType = MB_TYPE_8x16;
-			//pCurMb->mb_partition = 2;
-		}
-		break;
-		
-	case 6:
-	case 9:
-		iCostP8x8 = WelsMdP8x8(pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice );	
-		if ( iCostP8x8 < iBestCost )
-		{
-			iBestCost = iCostP8x8;
-			pCurMb->uiMbType = MB_TYPE_8x8;
-		}
-		break;
-
-	default:
-		iCostP8x8 = WelsMdP8x8(pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice );
-		if (iCostP8x8 < iBestCost )
-		{
-			iBestCost = iCostP8x8;
-			pCurMb->uiMbType = MB_TYPE_8x8;
-
-			iCostP16x8 = WelsMdP16x8(pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice);
-			if (iCostP16x8 <= iBestCost)
-			{
-				iBestCost = iCostP16x8;
-				pCurMb->uiMbType = MB_TYPE_16x8;
-			}
-			
-			iCostP8x16 = WelsMdP8x16(pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice);
-			if (iCostP8x16 <= iBestCost)
-			{
-				iBestCost = iCostP8x16;
-				pCurMb->uiMbType = MB_TYPE_8x16;
-			}						
-		}
-		break;
-	}
-	pWelsMd->iCostLuma = iBestCost;
-}
-
-
-inline void VaaBackgroundMbDataUpdate(  SWelsFuncPtrList *pFunc, SVAAFrameInfo *pVaaInfo, SMB* pCurMb )
-{
-	const int32_t kiPicStride		= pVaaInfo->iPicStride;
-	const int32_t kiPicStrideUV	= pVaaInfo->iPicStrideUV;
-	const int32_t kiOffsetY		= (pCurMb->iMbY*kiPicStride+pCurMb->iMbX)<<4;
-	const int32_t kiOffsetUV		= (pCurMb->iMbY*kiPicStrideUV+pCurMb->iMbX)<<3;
-
-	pFunc->pfCopy16x16Aligned(pVaaInfo->pCurY+kiOffsetY, kiPicStride, pVaaInfo->pRefY+kiOffsetY, kiPicStride);
-	pFunc->pfCopy8x8Aligned(pVaaInfo->pCurU+kiOffsetUV, kiPicStrideUV, pVaaInfo->pRefU+kiOffsetUV, kiPicStrideUV);
-	pFunc->pfCopy8x8Aligned(pVaaInfo->pCurV+kiOffsetUV, kiPicStrideUV, pVaaInfo->pRefV+kiOffsetUV, kiPicStrideUV);
-}
-
-void WelsMdBackgroundMbEnc(void* pEnc, void* pMd, SMB* pCurMb, SMbCache *pMbCache, SSlice *pSlice, bool_t bSkipMbFlag) 
-{
-	sWelsEncCtx * pEncCtx	= (sWelsEncCtx *)pEnc;
-	SDqLayer* pCurDqLayer	= pEncCtx->pCurDqLayer;
-	SWelsMD * pWelsMd		= (SWelsMD *)pMd;
-	SWelsFuncPtrList *pFunc	= pEncCtx->pFuncList;
-	SMVUnitXY sMvp				= { 0 };
-	uint8_t* pRefLuma			= pMbCache->SPicData.pRefMb[0];
-	uint8_t* pRefCb				= pMbCache->SPicData.pRefMb[1];
-	uint8_t* pRefCr				= pMbCache->SPicData.pRefMb[2];
-	int32_t iLineSizeY			= pCurDqLayer->pRefPic->iLineSize[0];
-	int32_t iLineSizeUV			= pCurDqLayer->pRefPic->iLineSize[1];
-	uint8_t* pDstLuma			= pMbCache->pSkipMb;
-	uint8_t* pDstCb				= pMbCache->pSkipMb+256;
-	uint8_t* pDstCr				= pMbCache->pSkipMb+256+64;
-
-	if (!bSkipMbFlag)
-	{
-		pDstLuma	= pMbCache->pMemPredLuma;
-		pDstCb	= pMbCache->pMemPredChroma;
-		pDstCr	= pMbCache->pMemPredChroma+64;
-	}
-	//MC
-	pFunc->sMcFuncs.pfLumaQuarpelMc[0](pRefLuma, iLineSizeY, pDstLuma, 16,16);
-	pFunc->sMcFuncs.pfChromaMc(pRefCb, iLineSizeUV, pDstCb, 8, sMvp, 8, 8);//Cb
-	pFunc->sMcFuncs.pfChromaMc(pRefCr, iLineSizeUV, pDstCr, 8, sMvp, 8, 8);//Cr
-
-	pCurMb->uiCbp = 0;
-	pMbCache->bCollocatedPredFlag = true;
-	pWelsMd->iCostLuma = 0;//BGD&RC integration
-	pCurMb->pSadCost[0] = pFunc->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16](pMbCache->SPicData.pEncMb[0], pCurDqLayer->iEncStride[0], pRefLuma, iLineSizeY );	
-	ST32(&pCurMb->sP16x16Mv, 0);
-	ST32(&pCurDqLayer->pDecPic->sMvList[pCurMb->iMbXY], 0);
-
-	if (bSkipMbFlag)
-	{
-		pCurMb->uiMbType = MB_TYPE_BACKGROUND;
-
-		//update motion info to current MB
-		ST32(pCurMb->pRefIndex, 0);
-		pFunc->pfUpdateMbMv(pCurMb->sMv, sMvp);		
-
-		pCurMb->uiLumaQp   = pSlice->uiLastMbQp;
-		pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51(pCurMb->uiLumaQp + pCurDqLayer->sLayerInfo.pPpsP->uiChromaQpIndexOffset)];
-
-		WelsRecPskip(pCurDqLayer, pEncCtx->pFuncList, pCurMb, pMbCache);
-		VaaBackgroundMbDataUpdate(pEncCtx->pFuncList, pEncCtx->pVaa, pCurMb);
-		return;
-	}
-
-	pCurMb->uiMbType = MB_TYPE_16x16;
-
-	pWelsMd->sMe.sMe16x16.sMv.iMvX = 0;
-	pWelsMd->sMe.sMe16x16.sMv.iMvY = 0;
-	PredMv( &pMbCache->sMvComponents, 0, 4, pWelsMd->uiRef, &pWelsMd->sMe.sMe16x16.sMvp );
-	pMbCache->sMbMvp[0] = pWelsMd->sMe.sMe16x16.sMvp;
-
-	UpdateP16x16MotionInfo(pMbCache, pCurMb, pWelsMd->uiRef, &pWelsMd->sMe.sMe16x16.sMv);
-
-	if(pWelsMd->bMdUsingSad)
-		pWelsMd->iCostLuma = pCurMb->pSadCost[0];
-	else
-		pWelsMd->iCostLuma = pFunc->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16](pMbCache->SPicData.pEncMb[0], pCurDqLayer->iEncStride[0], pRefLuma, iLineSizeY );	
-
-	WelsInterMbEncode( pEncCtx, pSlice, pCurMb );
-	WelsPMbChromaEncode( pEncCtx, pSlice, pCurMb );
-
-	pFunc->pfCopy16x16Aligned( pMbCache->SPicData.pCsMb[0], pCurDqLayer->iCsStride[0], pMbCache->pMemPredLuma,     16 );
-	pFunc->pfCopy8x8Aligned(   pMbCache->SPicData.pCsMb[1], pCurDqLayer->iCsStride[1], pMbCache->pMemPredChroma,    8 );
-	pFunc->pfCopy8x8Aligned(   pMbCache->SPicData.pCsMb[2], pCurDqLayer->iCsStride[1], pMbCache->pMemPredChroma+64, 8 );
-}
-
-BOOL_T WelsMdPSkipEnc(void* pEnc, void* pMd, SMB* pCurMb, SMbCache *pMbCache) 
-{
-	sWelsEncCtx *pEncCtx	= (sWelsEncCtx *)pEnc;
-	SDqLayer *pCurLayer				= pEncCtx->pCurDqLayer;
-	SWelsMD *pWelsMd					= (SWelsMD *)pMd;
-	SWelsFuncPtrList *pFunc		= pEncCtx->pFuncList;
-
-	uint8_t* pRefLuma = pMbCache->SPicData.pRefMb[0];
-	uint8_t* pRefCb   = pMbCache->SPicData.pRefMb[1];
-	uint8_t* pRefCr   = pMbCache->SPicData.pRefMb[2];
-	int32_t iLineSizeY  = pCurLayer->pRefPic->iLineSize[0];
-	int32_t iLineSizeUV = pCurLayer->pRefPic->iLineSize[1];
-
-	uint8_t* pDstLuma = pMbCache->pSkipMb;
-	uint8_t* pDstCb   = pMbCache->pSkipMb+256;
-	uint8_t* pDstCr   = pMbCache->pSkipMb+256+64;
-
-	SMVUnitXY sMvp = { 0 };
-    uint8_t uiMvpIdx;
-	int32_t n;
-
-	int32_t iEncStride		= pCurLayer->iEncStride[0];
-	uint8_t* pEncMb			= pMbCache->SPicData.pEncMb[0];
-	int32_t *pStrideEncBlockOffset= pEncCtx->pStrideTab->pStrideEncBlockOffset[pEncCtx->uiDependencyId]; 
-	int32_t *pEncBlockOffset;
-
-	int32_t iSadCostLuma = 0;
-	int32_t iSadCostChroma = 0;
-	int32_t iSadCostMb = 0;
-
-	PredSkipMv(pMbCache, &sMvp);
-	
-	// Special case, need to clip the vector //
-	SMVUnitXY sQpelMvp = { sMvp.iMvX>>2, sMvp.iMvY>>2 };
-    n = (pCurMb->iMbX<<4) + sQpelMvp.iMvX;
-    if( n < -29 )
-        return FALSE;
-    else if( n > (int32_t)((pCurLayer->iMbWidth<<4) + 12) )
-        return FALSE;
-
-    n = (pCurMb->iMbY<<4) + sQpelMvp.iMvY;
-    if( n < -29 )
-        return FALSE;
-    else if( n > (int32_t)((pCurLayer->iMbHeight<<4) + 12) )
-        return FALSE;
-
-	//luma	
-	pRefLuma += sQpelMvp.iMvY * iLineSizeY + sQpelMvp.iMvX;
-    uiMvpIdx = ((sMvp.iMvY&0x03)<<2)+(sMvp.iMvX&0x03);
-	pFunc->sMcFuncs.pfLumaQuarpelMc[uiMvpIdx](pRefLuma, iLineSizeY, pDstLuma, 16,16);
-	iSadCostLuma    = pFunc->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16](pMbCache->SPicData.pEncMb[0], pCurLayer->iEncStride[0], pDstLuma, 16 );	
-
-	const int32_t iStrideUV = (sQpelMvp.iMvY>>1) * iLineSizeUV + (sQpelMvp.iMvX>>1);
-	pRefCb += iStrideUV;
-	pFunc->sMcFuncs.pfChromaMc(pRefCb, iLineSizeUV, pDstCb, 8, sMvp, 8, 8);//Cb	
-	iSadCostChroma  = pFunc->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8](pMbCache->SPicData.pEncMb[1], pCurLayer->iEncStride[1], pDstCb, 8 );	
-	
-	pRefCr += iStrideUV;
-	pFunc->sMcFuncs.pfChromaMc(pRefCr, iLineSizeUV, pDstCr, 8, sMvp, 8, 8);//Cr
-	iSadCostChroma += pFunc->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8](pMbCache->SPicData.pEncMb[2], pCurLayer->iEncStride[2], pDstCr, 8 );	
-
-	iSadCostMb = iSadCostLuma + iSadCostChroma;
-
-	if ( iSadCostMb == 0                             ||
-		 iSadCostMb < pWelsMd->iSadPredSkip   || 
-	   ( pCurLayer->pRefPic->iPictureType == P_SLICE     && 
-	     pMbCache->uiRefMbType == MB_TYPE_SKIP    &&
-		 iSadCostMb < pCurLayer->pRefPic->pMbSkipSad[pCurMb->iMbXY]) )
-	{
-		//update motion info to current MB
-		ST32(pCurMb->pRefIndex, 0);
-		pFunc->pfUpdateMbMv(pCurMb->sMv, sMvp);		
-
-		pCurMb->pSadCost[0] = pFunc->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16](pMbCache->SPicData.pEncMb[0], pCurLayer->iEncStride[0], pRefLuma, iLineSizeY );
-		
-		if (pWelsMd->bMdUsingSad)
-			pWelsMd->iCostLuma = pCurMb->pSadCost[0];
-		else
-			pWelsMd->iCostLuma = pFunc->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16](pMbCache->SPicData.pEncMb[0], pCurLayer->iEncStride[0], pRefLuma, iLineSizeY );	
-	
-		pWelsMd->iCostSkipMb = iSadCostMb;	
-
-		pCurMb->sP16x16Mv = sMvp;
-		pCurLayer->pDecPic->sMvList[pCurMb->iMbXY] = sMvp;
-		
-		return TRUE;
-	}	
-
-	WelsDctMb(pMbCache->pCoeffLevel,  pEncMb, iEncStride, pDstLuma, pEncCtx->pFuncList->pfDctFourT4 );
-
-	if ( WelsTryPYskip( pEncCtx, pCurMb, pMbCache ) )
-	{
-		iEncStride = pEncCtx->pCurDqLayer->iEncStride[1];
-		pEncMb = pMbCache->SPicData.pEncMb[1];
-		pEncBlockOffset = pStrideEncBlockOffset + 16;
-		pFunc->pfDctFourT4( pMbCache->pCoeffLevel + 256, &(pEncMb[*pEncBlockOffset]), iEncStride,	pMbCache->pSkipMb + 256, 8);
-		if ( WelsTryPUVskip( pEncCtx, pCurMb, pMbCache, 1 ) )
-		{
-			pEncMb = pMbCache->SPicData.pEncMb[2];
-			pEncBlockOffset = pStrideEncBlockOffset + 20;
-			pFunc->pfDctFourT4( pMbCache->pCoeffLevel + 320, &(pEncMb[*pEncBlockOffset]), iEncStride,	pMbCache->pSkipMb + 320, 8);
-			if ( WelsTryPUVskip( pEncCtx, pCurMb, pMbCache, 2 ) )
-			{
-				//update motion info to current MB
-				ST32(pCurMb->pRefIndex, 0);
-				pFunc->pfUpdateMbMv(pCurMb->sMv, sMvp);				
-
-				pCurMb->pSadCost[0] = pFunc->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16](pMbCache->SPicData.pEncMb[0], pCurLayer->iEncStride[0], pRefLuma, iLineSizeY );
-
-                if (pWelsMd->bMdUsingSad)
-					pWelsMd->iCostLuma = pCurMb->pSadCost[0];
-				else
-					pWelsMd->iCostLuma = pFunc->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16](pMbCache->SPicData.pEncMb[0], pCurLayer->iEncStride[0], pRefLuma, iLineSizeY );	
-				
-				pWelsMd->iCostSkipMb = iSadCostMb;
-
-				pCurMb->sP16x16Mv = sMvp;
-				pCurLayer->pDecPic->sMvList[pCurMb->iMbXY] = sMvp;
-
-				return TRUE;
-			}
-		}
-	}
-	return FALSE;
-}
-
-const int32_t g_kiPixStrideIdx8x8[4] = {  0,                                             ME_REFINE_BUF_WIDTH_BLK8,
-									ME_REFINE_BUF_STRIDE_BLK8, ME_REFINE_BUF_STRIDE_BLK8 + ME_REFINE_BUF_WIDTH_BLK8};
-
-void WelsMdInterMbRefinement(sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache *pMbCache)
-{
-	SDqLayer* pCurDqLayer = pEncCtx->pCurDqLayer;
-	uint8_t *pTmpRefCb, *pTmpRefCr, *pTmpDstCb, *pTmpDstCr;
-	int32_t iMvStride, iRefBlk4Stride, iDstBlk4Stride; 	
-	SMVUnitXY* pMv;
-	int32_t iBestSadCost = 0, iBestSatdCost = 0;	
-	SMeRefinePointer sMeRefine;
-
-	int32_t i, iIdx, iPixStride;
-
-	uint8_t* pRefCb = pMbCache->SPicData.pRefMb[1];
-	uint8_t* pRefCr = pMbCache->SPicData.pRefMb[2]; 
-	uint8_t* pDstCb = pMbCache->pMemPredChroma;
-	uint8_t* pDstCr = pMbCache->pMemPredChroma+64;
-	uint8_t* pDstLuma = pMbCache->pMemPredLuma;
-
-	int32_t iLineSizeRefUV = pCurDqLayer->pRefPic->iLineSize[1];
-	
-	switch(pCurMb->uiMbType)
-	{
-	case MB_TYPE_16x16:			
-		//luma
-		InitMeRefinePointer(&sMeRefine, pMbCache, 0);
-		MeRefineFracPixel(pEncCtx, pDstLuma, &pWelsMd->sMe.sMe16x16, &sMeRefine, 16, 16);			
-		UpdateP16x16MotionInfo(pMbCache, pCurMb, pWelsMd->uiRef, &pWelsMd->sMe.sMe16x16.sMv);
-		
-		pMbCache->sMbMvp[0] = pWelsMd->sMe.sMe16x16.sMvp;
-		//save the best cost of final mode
-		iBestSadCost  = pWelsMd->sMe.sMe16x16.uiSadCost;
-		iBestSatdCost = pWelsMd->sMe.sMe16x16.uiSatdCost;
-		
-		//chroma
-		pMv = &pWelsMd->sMe.sMe16x16.sMv;
-		iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
-		pTmpRefCb = pRefCb + iMvStride;
-		pTmpRefCr = pRefCr + iMvStride;
-		pEncCtx->pFuncList->sMcFuncs.pfChromaMc(pTmpRefCb, iLineSizeRefUV, pDstCb, 8, *pMv, 8, 8);//Cb
-		pEncCtx->pFuncList->sMcFuncs.pfChromaMc(pTmpRefCr, iLineSizeRefUV, pDstCr, 8, *pMv, 8, 8);//Cr
-
-		pWelsMd->iCostSkipMb = pEncCtx->pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16](pMbCache->SPicData.pEncMb[0], pCurDqLayer->iEncStride[0], pDstLuma, 16 );
-		pWelsMd->iCostSkipMb += pEncCtx->pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8](pMbCache->SPicData.pEncMb[1], pCurDqLayer->iEncStride[1], pDstCb, 8 );	
-		pWelsMd->iCostSkipMb += pEncCtx->pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8](pMbCache->SPicData.pEncMb[2], pCurDqLayer->iEncStride[2], pDstCr, 8 );	
-		break;
-		
-	case MB_TYPE_16x8:
-		iPixStride = 0;
-		for (i = 0; i < 2; i++)
-		{
-			//luma
-			iIdx = i<<3;
-			InitMeRefinePointer(&sMeRefine, pMbCache, iPixStride);
-			iPixStride += ME_REFINE_BUF_STRIDE_BLK8;
-			PredInter16x8Mv( pMbCache, iIdx, pWelsMd->uiRef, &pWelsMd->sMe.sMe16x8[i].sMvp );
-			MeRefineFracPixel(pEncCtx, pDstLuma + g_kuiSmb4AddrIn256[iIdx], &pWelsMd->sMe.sMe16x8[i], &sMeRefine, 16, 8);				
-			UpdateP16x8MotionInfo(pMbCache, pCurMb, iIdx, pWelsMd->uiRef, &pWelsMd->sMe.sMe16x8[i].sMv);
-			pMbCache->sMbMvp[i] = pWelsMd->sMe.sMe16x8[i].sMvp;
-			//save the best cost of final mode
-			iBestSadCost += pWelsMd->sMe.sMe16x8[i].uiSadCost;
-			iBestSatdCost += pWelsMd->sMe.sMe16x8[i].uiSatdCost;		
-			
-			//chroma
-			iRefBlk4Stride = (i << 2) * iLineSizeRefUV;
-			iDstBlk4Stride = i << 5; // 4*8
-			pMv = &pWelsMd->sMe.sMe16x8[i].sMv;
-			iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
-			pTmpRefCb = pRefCb + iRefBlk4Stride + iMvStride;
-			pTmpRefCr = pRefCr + iRefBlk4Stride + iMvStride;
-			pTmpDstCb = pDstCb + iDstBlk4Stride;
-			pTmpDstCr = pDstCr + iDstBlk4Stride;
-			pEncCtx->pFuncList->sMcFuncs.pfChromaMc(pTmpRefCb, iLineSizeRefUV, pTmpDstCb, 8, *pMv, 8, 4);//Cb
-			pEncCtx->pFuncList->sMcFuncs.pfChromaMc(pTmpRefCr, iLineSizeRefUV, pTmpDstCr, 8, *pMv, 8, 4);//Cr	
-		}	
-		break;
-		
-	case MB_TYPE_8x16:		
-		iPixStride = 0;
-		for (i = 0; i < 2; i++)
-		{
-			//luma
-			iIdx = i<<2;
-			InitMeRefinePointer(&sMeRefine, pMbCache, iPixStride);
-			iPixStride += ME_REFINE_BUF_WIDTH_BLK8;
-			PredInter8x16Mv( pMbCache, iIdx, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x16[i].sMvp );
-			MeRefineFracPixel(pEncCtx, pDstLuma + g_kuiSmb4AddrIn256[iIdx], &pWelsMd->sMe.sMe8x16[i], &sMeRefine, 8, 16);				
-			update_P8x16_motion_info(pMbCache, pCurMb, iIdx, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x16[i].sMv);
-			pMbCache->sMbMvp[i] = pWelsMd->sMe.sMe8x16[i].sMvp;
-			//save the best cost of final mode
-			iBestSadCost += pWelsMd->sMe.sMe8x16[i].uiSadCost;
-			iBestSatdCost += pWelsMd->sMe.sMe8x16[i].uiSatdCost;
-			
-			//chroma
-			iRefBlk4Stride = iIdx; //4
-			pMv = &pWelsMd->sMe.sMe8x16[i].sMv;
-			iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
-			pTmpRefCb = pRefCb + iRefBlk4Stride + iMvStride;
-			pTmpRefCr = pRefCr + iRefBlk4Stride + iMvStride;
-			pTmpDstCb = pDstCb + iRefBlk4Stride;
-			pTmpDstCr = pDstCr + iRefBlk4Stride;
-			pEncCtx->pFuncList->sMcFuncs.pfChromaMc(pTmpRefCb, iLineSizeRefUV, pTmpDstCb, 8, *pMv, 4, 8);//Cb
-			pEncCtx->pFuncList->sMcFuncs.pfChromaMc(pTmpRefCr, iLineSizeRefUV, pTmpDstCr, 8, *pMv, 4, 8);//Cr
-		}
-		break;
-		
-	case MB_TYPE_8x8:
-		for (i = 0; i < 4; i++)
-		{
-			int32_t iBlk8Idx = i<<2; //0, 4, 8, 12
-			int32_t	iBlk4X, iBlk4Y;
-			
-			pCurMb->pRefIndex[i] = pWelsMd->uiRef;
-			
-			//luma
-			InitMeRefinePointer(&sMeRefine, pMbCache, g_kiPixStrideIdx8x8[i]);
-			PredMv( &pMbCache->sMvComponents, iBlk8Idx, 2, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x8[i].sMvp );
-			MeRefineFracPixel(pEncCtx, pDstLuma + g_kuiSmb4AddrIn256[iBlk8Idx], &pWelsMd->sMe.sMe8x8[i], &sMeRefine, 8, 8);					
-			UpdateP8x8MotionInfo(pMbCache, pCurMb, iBlk8Idx, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x8[i].sMv);
-			pMbCache->sMbMvp[i] = pWelsMd->sMe.sMe8x8[i].sMvp;
-			iBestSadCost += pWelsMd->sMe.sMe8x8[i].uiSadCost;
-			iBestSatdCost += pWelsMd->sMe.sMe8x8[i].uiSatdCost; 
-			
-			//chroma
-			pMv = &pWelsMd->sMe.sMe8x8[i].sMv;
-			iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
-			
-			iBlk4X = (i & 1) << 2;
-			iBlk4Y = (i >> 1) << 2;
-			iRefBlk4Stride = iBlk4Y * iLineSizeRefUV + iBlk4X;
-			iDstBlk4Stride = (iBlk4Y << 3) + iBlk4X;
-			
-			pTmpRefCb = pRefCb + iRefBlk4Stride;
-			pTmpDstCb = pDstCb + iDstBlk4Stride;
-			pTmpRefCr = pRefCr + iRefBlk4Stride;
-			pTmpDstCr = pDstCr + iDstBlk4Stride;
-			pEncCtx->pFuncList->sMcFuncs.pfChromaMc(pTmpRefCb+iMvStride, iLineSizeRefUV, pTmpDstCb, 8, *pMv, 4, 4);//Cb
-			pEncCtx->pFuncList->sMcFuncs.pfChromaMc(pTmpRefCr+iMvStride, iLineSizeRefUV, pTmpDstCr, 8, *pMv, 4, 4);//Cr		
-		
-		}		
-		break;
-	default:
-		break;
-	}
-	pCurMb->pSadCost[0] = iBestSadCost;
-    if(pWelsMd->bMdUsingSad)
-		pWelsMd->iCostLuma = iBestSadCost;
-    else
-		pWelsMd->iCostLuma = iBestSatdCost;
-
-}
-BOOL_T WelsMdFirstIntraMode(void* pEnc, void* pMd, SMB* pCurMb, SMbCache *pMbCache)
-{
-	sWelsEncCtx *pEncCtx	= (sWelsEncCtx*)pEnc;
-	SWelsFuncPtrList *pFunc	= pEncCtx->pFuncList;
-	SWelsMD *pWelsMd		= (SWelsMD*)pMd;
-    
-	int32_t iCostI16x16 = WelsMdI16x16(pFunc, pEncCtx->pCurDqLayer, pMbCache, pWelsMd->iLambda);
-	
-	//compare cost_p16x16 with cost_i16x16
-	if ( iCostI16x16 < pWelsMd->iCostLuma ) 
-	{
-		pCurMb->uiMbType = MB_TYPE_INTRA16x16;
-		pWelsMd->iCostLuma = iCostI16x16;
-		
-		pFunc->pfIntraFineMd( pEncCtx, pWelsMd, pCurMb, pMbCache );
-		
-		//add pEnc&rec to MD--2010.3.15
-		if ( IS_INTRA16x16(pCurMb->uiMbType) )
-		{
-			pCurMb->uiCbp = 0;
-			WelsEncRecI16x16Y( pEncCtx, pCurMb, pMbCache );
-		}
-		
-		//chroma				
-		pWelsMd->iCostChroma = WelsMdIntraChroma(pFunc, pEncCtx->pCurDqLayer, pMbCache, pWelsMd->iLambda);
-		WelsIMbChromaEncode( pEncCtx, pCurMb, pMbCache ); //add pEnc&rec to MD--2010.3.15
-		
-		pCurMb->pSadCost[0] = 0;
-		return TRUE; //intra_mb_type is best
-	}
-
-	return FALSE;
-}
-
-void WelsMdInterMb(void* pEnc, void* pMd, SSlice *pSlice, SMB* pCurMb )
-{
-	sWelsEncCtx *pEncCtx	= (sWelsEncCtx*)pEnc;
-	SWelsMD* pWelsMd				= (SWelsMD*)pMd;
-	SDqLayer* pCurDqLayer			= pEncCtx->pCurDqLayer;
-	SMbCache *pMbCache			= &pSlice->sMbCacheInfo;
-	const uint32_t kuiNeighborAvail	= pCurMb->uiNeighborAvail;
-	const int32_t kiMbWidth			= pCurDqLayer->iMbWidth;
-	const  SMB* top_mb				= pCurMb-kiMbWidth;
-	const bool_t bMbLeftAvailPskip	= ((kuiNeighborAvail&LEFT_MB_POS) ? IS_SKIP((pCurMb-1)->uiMbType) : false );
-	const bool_t bMbTopAvailPskip		= ((kuiNeighborAvail&TOP_MB_POS) ? IS_SKIP(top_mb->uiMbType) : false );
-	const bool_t bMbTopLeftAvailPskip	= ((kuiNeighborAvail&TOPLEFT_MB_POS) ? IS_SKIP((top_mb -1)->uiMbType) : false );
-	const bool_t bMbTopRightAvailPskip= ((kuiNeighborAvail&TOPRIGHT_MB_POS) ? IS_SKIP((top_mb +1)->uiMbType) : false );
-	BOOL_T bTrySkip = bMbLeftAvailPskip || bMbTopAvailPskip || bMbTopLeftAvailPskip || bMbTopRightAvailPskip;
-	BOOL_T bKeepSkip = bMbLeftAvailPskip && bMbTopAvailPskip && bMbTopRightAvailPskip;
-	BOOL_T bSkip = FALSE;
-
-	if ( pEncCtx->pFuncList->pfInterMdBackgroundDecision( pEncCtx, pWelsMd, pSlice, pCurMb, pMbCache, &bKeepSkip ) )
-	{
-		return;
-	}
-
-	//step 1: try SKIP
-	bSkip = WelsMdInterJudgePskip( pEncCtx, pWelsMd, pSlice, pCurMb, pMbCache, bTrySkip ); 
-
-	if ( bSkip )
-	{
-		if(bKeepSkip)
-		{
-			WelsMdInterDecidedPskip(pEncCtx,  pSlice,  pCurMb, pMbCache);
-			return;
-		}	
-	}
-	else
-	{		
-		PredictSad( pMbCache->sMvComponents.iRefIndexCache, pMbCache->iSadCost, 0, &pWelsMd->iSadPredMb );
-
-		//step 2: P_16x16
-		pWelsMd->iCostLuma = WelsMdP16x16(pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice, pCurMb);
-		pCurMb->uiMbType = MB_TYPE_16x16;
-	}
-		
-	WelsMdInterSecondaryModesEnc( pEncCtx, pWelsMd, pSlice, pCurMb, pMbCache, bSkip );
-}
-
-
-
-//////
-//  try the ordinary Pskip
-//////
-bool_t WelsMdInterJudgePskip( sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache, BOOL_T bTrySkip )
-{
-	bool_t bRet = true;
-	if ( (( pEncCtx->pRefPic->iPictureType == P_SLICE ) && (pMbCache->uiRefMbType == MB_TYPE_SKIP || pMbCache->uiRefMbType == MB_TYPE_BACKGROUND) ) ||
-		bTrySkip )
-	{
-		PredictSadSkip( pMbCache->sMvComponents.iRefIndexCache, pMbCache->bMbTypeSkip, pMbCache->iSadCostSkip, 0, &(pWelsMd->iSadPredSkip) );	
-		bRet = WelsMdPSkipEnc(pEncCtx, pWelsMd, pCurMb, pMbCache)? true:false;
-		return bRet; 
-	}
-
-	return false;
-}
-
-//////
-//  try the ordinary Pskip
-//////
-void WelsMdInterUpdatePskip( SDqLayer* pCurDqLayer, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache )
-{
-	//add pEnc&rec to MD--2010.3.15
-	pCurMb->uiCbp = 0;
-	pCurMb->uiLumaQp   = pSlice->uiLastMbQp;
-	pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51(pCurMb->uiLumaQp + pCurDqLayer->sLayerInfo.pPpsP->uiChromaQpIndexOffset)];
-	pMbCache->bCollocatedPredFlag = (LD32(&pCurMb->sMv[0]) == 0);
-}
-
-
-//////
-//  doublecheck if current MBTYPE is Pskip
-//////
-void WelsMdInterDoubleCheckPskip( SMB* pCurMb, SMbCache *pMbCache )
-{
-	if ( MB_TYPE_16x16 == pCurMb->uiMbType && 0 == pCurMb->uiCbp )
-	{
-		if ( 0 == pCurMb->pRefIndex[0] )
-		{
-			SMVUnitXY sMvp = { 0 };
-
-			PredSkipMv( pMbCache, &sMvp );
-			if ( LD32(&sMvp) == LD32(&pCurMb->sMv[0]) )
-			{
-				pCurMb->uiMbType = MB_TYPE_SKIP;
-			}			
-		}
-		pMbCache->bCollocatedPredFlag = (LD32(&pCurMb->sMv[0]) == 0);
-	}
-}
-
-//////
-//  Pskip mb encode
-//////
-void WelsMdInterDecidedPskip( sWelsEncCtx* pEncCtx, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache )
-{
-	SDqLayer* pCurDqLayer = pEncCtx->pCurDqLayer;
-	pCurMb->uiMbType = MB_TYPE_SKIP;
-	WelsRecPskip( pCurDqLayer, pEncCtx->pFuncList, pCurMb, pMbCache );
-	WelsMdInterUpdatePskip( pCurDqLayer, pSlice, pCurMb, pMbCache );
-}
-
-//////
-//  inter mb encode
-//////
-void WelsMdInterEncode( sWelsEncCtx* pEncCtx, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache )
-{
-	SWelsFuncPtrList *pFunc= pEncCtx->pFuncList;
-	SDqLayer* pCurDqLayer	= pEncCtx->pCurDqLayer;
-
-	//add pEnc&rec to MD--2010.3.15
-	const int32_t kiCsStrideY = pCurDqLayer->iCsStride[0];
-	const int32_t kiCsStrideUV = pCurDqLayer->iCsStride[1];
-
-	//add pEnc&rec to MD--2010.3.15
-	pCurMb->uiCbp = 0;
-	WelsInterMbEncode( pEncCtx, pSlice, pCurMb );
-	WelsPMbChromaEncode( pEncCtx, pSlice, pCurMb );
-
-	pFunc->pfCopy16x16Aligned( pMbCache->SPicData.pCsMb[0], kiCsStrideY, pMbCache->pMemPredLuma,      16 );
-	pFunc->pfCopy8x8Aligned(   pMbCache->SPicData.pCsMb[1], kiCsStrideUV, pMbCache->pMemPredChroma,    8 );
-	pFunc->pfCopy8x8Aligned(   pMbCache->SPicData.pCsMb[2], kiCsStrideUV, pMbCache->pMemPredChroma+64, 8 );
-}
-
-
-
-
-//////
-//  try the BGD Pskip
-//////
-bool_t WelsMdInterJudgeBGDPskip( void* pCtx, void* pMd, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache, BOOL_T* bKeepSkip )
-{
-	sWelsEncCtx *pEncCtx = (sWelsEncCtx*)pCtx;
-	SWelsMD* pWelsMd = (SWelsMD*)pMd;
-
-	SDqLayer* pCurDqLayer = pEncCtx->pCurDqLayer;
-
-	const int32_t kiRefMbQp = pCurDqLayer->pRefPic->pRefMbQp[pCurMb->iMbXY];
-	const int32_t kiCurMbQp = pCurMb->uiLumaQp;// unsigned -> signed
-	int8_t	*pVaaBgMbFlag = pEncCtx->pVaa->pVaaBackgroundMbFlag+pCurMb->iMbXY;
-
-	const int32_t kiMbWidth = pCurDqLayer->iMbWidth;
-
-		*bKeepSkip = (*bKeepSkip) && 
-			((!pVaaBgMbFlag[-1])&&
-			(!pVaaBgMbFlag[-kiMbWidth])&&
-			(!pVaaBgMbFlag[-kiMbWidth+1]));
-		
-		if (
-			*pVaaBgMbFlag
-			&& !IS_INTRA(pMbCache->uiRefMbType)
-			&& (kiRefMbQp - kiCurMbQp <= DELTA_QP_BGD_THD || kiRefMbQp<=26)
-			)
-		{
-			SMVUnitXY	sVaaPredSkipMv = { 0 };
-			PredSkipMv( pMbCache, &sVaaPredSkipMv );
-			WelsMdBackgroundMbEnc( pEncCtx, pWelsMd, pCurMb, pMbCache, pSlice, (LD32(&sVaaPredSkipMv) == 0) );
-			return true;
-		}
-
-	return false;
-}
-
-bool_t WelsMdInterJudgeBGDPskipFalse( void* pCtx, void* pMd, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache, BOOL_T* bKeepSkip )
-{
-	return false;
-}
-
-
-
-//////
-//  update BGD related info
-//////
-void WelsMdInterUpdateBGDInfo( SDqLayer* pCurLayer,  SMB* pCurMb, const bool_t bCollocatedPredFlag, const int32_t iRefPictureType )
-{ 
-	uint8_t* pTargetRefMbQpList = (pCurLayer->pDecPic->pRefMbQp);
-	const int32_t kiMbXY = pCurMb->iMbXY;
-
-	if (pCurMb->uiCbp || I_SLICE == iRefPictureType || 0 == bCollocatedPredFlag )
-	{
-		pTargetRefMbQpList[kiMbXY] = pCurMb->uiLumaQp;
-	}
-	else //unchange, do not need to evaluation?
-	{
-		uint8_t* pRefPicRefMbQpList = (pCurLayer->pRefPic->pRefMbQp);
-		pTargetRefMbQpList[kiMbXY] = pRefPicRefMbQpList[kiMbXY];
-	}
-
-	if (pCurMb->uiMbType==MB_TYPE_BACKGROUND)
-	{
-		pCurMb->uiMbType = MB_TYPE_SKIP;
-	}
-}
-
-void WelsMdInterUpdateBGDInfoNULL( SDqLayer* pCurLayer, SMB* pCurMb, const bool_t bCollocatedPredFlag, const int32_t iRefPictureType )
-{
-}
-
-//
-//
-//
-void WelsMdInterSaveSadAndRefMbType(Mb_Type* pRefMbtypeList, SMbCache * pMbCache, const SMB*  pCurMb, const SWelsMD* pMd)
-{
-	const Mb_Type kmtCurMbtype = pCurMb->uiMbType;
-
-	//sad
-	pMbCache->pEncSad[0] =  (kmtCurMbtype == MB_TYPE_SKIP) ? pMd->iCostSkipMb:0;
-	//uiMbType
-	pRefMbtypeList[pCurMb->iMbXY] = kmtCurMbtype;
-}
-
-void WelsMdInterSecondaryModesEnc(sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache, const BOOL_T bSkip )
-{
-		//step 2: Intra	
-		const BOOL_T kbTrySkip = pEncCtx->pFuncList->pfFirstIntraMode(pEncCtx, pWelsMd, pCurMb, pMbCache);
-        if(kbTrySkip)
-            return;
-
-		if (bSkip)
-		{
-			WelsMdInterDecidedPskip(pEncCtx,  pSlice,  pCurMb, pMbCache);
-		}
-		else
-		{
-			//Step 2: ILFMD in P
-			pEncCtx->pFuncList->pfInterFineMd(pEncCtx, pWelsMd, pSlice, pCurMb, pWelsMd->iCostLuma);
-
-			//refinement for inter type
-			WelsMdInterMbRefinement( pEncCtx, pWelsMd, pCurMb, pMbCache );
-
-			//step 7: invoke encoding
-			WelsMdInterEncode( pEncCtx, pSlice, pCurMb, pMbCache );
-
-			//step 8: double check Pskip
-			WelsMdInterDoubleCheckPskip(pCurMb, pMbCache);
-		}	
-}
-
-
-void WelsMdIntraSecondaryModesEnc( sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache *pMbCache )
-{
-	SWelsFuncPtrList *pFunc = pEncCtx->pFuncList;
-	//initial prediction memory for I_4x4
-	pFunc->pfIntraFineMd( pEncCtx, pWelsMd, pCurMb, pMbCache );			//WelsMdIntraFinePartitionVaa
-
-	//add pEnc&rec to MD--2010.3.15
-	if ( IS_INTRA16x16(pCurMb->uiMbType) )
-	{
-		pCurMb->uiCbp = 0;
-		WelsEncRecI16x16Y( pEncCtx, pCurMb, pMbCache );
-	}
-
-	//chroma
-	pWelsMd->iCostChroma = WelsMdIntraChroma(pFunc, pEncCtx->pCurDqLayer, pMbCache, pWelsMd->iLambda);	
-	WelsIMbChromaEncode( pEncCtx, pCurMb, pMbCache ); //add pEnc&rec to MD--2010.3.15
-	pCurMb->pSadCost[0] = 0;	
-}
-
-} // namespace WelsSVCEnc
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	svc_base_layer_md.c
+ *
+ * \brief	mode decision
+ *
+ * \date	2009.08.10 Created
+ *
+ *************************************************************************************
+ */
+#include <string.h>
+#include <assert.h>
+#include "ls_defines.h"
+#include "encoder_context.h"
+#include "svc_enc_slice_segment.h"
+#include "md.h"
+#include "mc.h"
+#include "mv_pred.h"
+#include "cpu_core.h"
+#include "svc_enc_golomb.h"
+#include "svc_base_layer_md.h"
+#include "sample.h"
+#include "encoder.h"
+#include "svc_encode_mb.h"
+#include "svc_encode_slice.h"
+#include "svc_motion_estimate.h"
+#include "as264_common.h"
+#include "encode_mb_aux.h"
+#include "utils.h"
+namespace WelsSVCEnc {
+static const ALIGNED_DECLARE (int8_t, g_kiIntra16AvaliMode[8][5], 16) = { // Intra16x16 candidates; WelsMdI16x16 picks the row with (uiNeighborIntra & 0x07)
+  { I16_PRED_DC_128, I16_PRED_INVALID, I16_PRED_INVALID, I16_PRED_INVALID, 1 }, // columns 0..3: modes to try (padded with I16_PRED_INVALID); column 4: number of valid modes
+  { I16_PRED_DC_L,   I16_PRED_H,       I16_PRED_INVALID, I16_PRED_INVALID, 2 }, // NOTE(review): row bits presumably left | (top<<1) | (left_top<<2), as for g_kiIntra4AvailMode -- confirm
+  { I16_PRED_DC_T,   I16_PRED_V,		 I16_PRED_INVALID, I16_PRED_INVALID, 2 },
+  { I16_PRED_V,      I16_PRED_H,		 I16_PRED_DC,	   I16_PRED_INVALID, 3 },
+  { I16_PRED_DC_128, I16_PRED_INVALID, I16_PRED_INVALID, I16_PRED_INVALID, 1 },
+  { I16_PRED_DC_L,   I16_PRED_H,		 I16_PRED_INVALID, I16_PRED_INVALID, 2 },
+  { I16_PRED_DC_T,   I16_PRED_V,		 I16_PRED_INVALID, I16_PRED_INVALID, 2 },
+  { I16_PRED_V,      I16_PRED_H,		 I16_PRED_DC,	   I16_PRED_P,       4 }  // only row with four candidates; WelsMdI16x16 evaluates entry [3] separately on its fast path
+};
+
+static const ALIGNED_DECLARE (uint8_t, g_kiIntra4AvailCount[16], 16) = { // number of valid leading entries in the matching row of g_kiIntra4AvailMode
+#ifndef  I4_PRED_MODE_EXTEND
+  1, 3, 2, 4, 1, 3, 2, 7, 1, 3, 4, 6, 1, 3, 4, 9
+#else
+  1, 3, 4, 4, 1, 3, 4, 7, 1, 3, 4, 6, 1, 3, 4, 9 // rows 0010 and 0110 carry two extra *_TOP modes in this configuration
+#endif  //I4_PRED_MODE_EXTEND
+};
+
+// Row index = left_avail | (top_avail<<1) | (left_top_avail<<2) | (right_top_avail<<3); the tag after each row is that index in binary.
+static const ALIGNED_DECLARE (uint8_t, g_kiIntra4AvailMode[16][16], 16) = { // valid candidate modes first, rest of each row padded with I4_PRED_INVALID
+  {
+    I4_PRED_DC_128,  I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
+  },  //  0000: no neighbor available
+
+  {
+    I4_PRED_DC_L,    I4_PRED_H,       I4_PRED_HU,      I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
+  },  //  0001: left
+
+#ifndef  I4_PRED_MODE_EXTEND
+  {
+    I4_PRED_DC_T,    I4_PRED_V,       I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
+  }, //  0010: top
+#else
+  {
+    I4_PRED_DC_T,    I4_PRED_V,       I4_PRED_DDL_TOP, I4_PRED_VL_TOP,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
+  }, //  0010: top (extended mode set)
+#endif //I4_PRED_MODE_EXTEND
+
+  {
+    I4_PRED_DC,      I4_PRED_H,       I4_PRED_V,       I4_PRED_HU,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
+  }, //  0011: left + top
+
+  {
+    I4_PRED_DC_128,  I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
+  },  //  0100: left_top only
+
+  {
+    I4_PRED_DC_L,    I4_PRED_H,       I4_PRED_HU,      I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
+  },    //  0101: left + left_top
+
+#ifndef  I4_PRED_MODE_EXTEND
+  {
+    I4_PRED_DC_T,    I4_PRED_V,       I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
+  },     //  0110: top + left_top
+#else
+  {
+    I4_PRED_DC_T,  I4_PRED_V,       I4_PRED_DDL_TOP, I4_PRED_VL_TOP,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
+  },     //  0110: top + left_top (extended mode set)
+#endif //I4_PRED_MODE_EXTEND
+
+  {
+    I4_PRED_DC,      I4_PRED_H,       I4_PRED_V,       I4_PRED_HU,
+    I4_PRED_DDR,     I4_PRED_VR,      I4_PRED_HD,      I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
+  },               //  0111: left + top + left_top
+
+  {
+    I4_PRED_DC_128,   I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID,  I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID,  I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID,  I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
+  },  //  1000: right_top only
+
+  {
+    I4_PRED_DC_L,    I4_PRED_H,       I4_PRED_HU,      I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
+  },    //  1001: left + right_top
+
+  {
+    I4_PRED_DC_T,    I4_PRED_V,       I4_PRED_DDL,     I4_PRED_VL,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
+  },     //  1010: top + right_top
+
+  {
+    I4_PRED_DC,      I4_PRED_H,       I4_PRED_V,       I4_PRED_HU,
+    I4_PRED_DDL,     I4_PRED_VL,      I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
+  },          //  1011: left + top + right_top
+
+  {
+    I4_PRED_DC_128,  I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
+  },  //  1100: left_top + right_top
+
+  {
+    I4_PRED_DC_L,    I4_PRED_H,       I4_PRED_HU,      I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
+  },    //  1101: left + left_top + right_top
+
+  {
+    I4_PRED_DC_T,    I4_PRED_V,       I4_PRED_DDL,     I4_PRED_VL,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
+  },     //  1110: top + left_top + right_top
+
+  {
+    I4_PRED_DC,      I4_PRED_H,       I4_PRED_V,       I4_PRED_HU,
+    I4_PRED_DDL,     I4_PRED_VL,      I4_PRED_DDR,     I4_PRED_VR,
+    I4_PRED_HD,      I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
+  }                          //  1111: all four neighbors available
+
+};
+static const ALIGNED_DECLARE (int8_t, g_kiIntraChromaAvailMode[8][5], 16) = { // chroma candidates: columns 0..3 modes (padded with C_PRED_INVALID), column 4 = number of valid modes
+  { C_PRED_DC_128, C_PRED_INVALID, C_PRED_INVALID, C_PRED_INVALID, 1 }, // NOTE(review): rows presumably selected like g_kiIntra16AvaliMode (uiNeighborIntra & 0x07) -- confirm in WelsMdIntraChroma
+  { C_PRED_DC_L,   C_PRED_H,       C_PRED_INVALID, C_PRED_INVALID, 2 },
+  { C_PRED_DC_T,   C_PRED_V,       C_PRED_INVALID, C_PRED_INVALID, 2 },
+  { C_PRED_V,      C_PRED_H,       C_PRED_DC,      C_PRED_INVALID, 3 },
+  { C_PRED_DC_128, C_PRED_INVALID, C_PRED_INVALID, C_PRED_INVALID, 1 },
+  { C_PRED_DC_L,   C_PRED_H,       C_PRED_INVALID, C_PRED_INVALID, 2 },
+  { C_PRED_DC_T,   C_PRED_V,       C_PRED_INVALID, C_PRED_INVALID, 2 },
+  { C_PRED_V,      C_PRED_H,       C_PRED_DC,      C_PRED_P,       4 }
+};
+
+// Sample offsets of the 16 4x4 luma blocks inside a macroblock, indexed by block number; two int8_t[16] tables (32 bytes in total) kept small for cache friendliness.
+const int8_t g_kiCoordinateIdx4x4X[16] = { 0, 4, 0, 4, // horizontal offset, added directly to the block address
+                                           8, 12, 8, 12,
+                                           0, 4, 0, 4,
+                                           8, 12, 8, 12
+                                         };
+
+const int8_t g_kiCoordinateIdx4x4Y[16] = { 0, 0, 4, 4, // vertical offset, multiplied by the line size by the callers
+                                           0, 0, 4, 4,
+                                           8, 8, 12, 12,
+                                           8, 8, 12, 12
+                                         };
+static const ALIGNED_DECLARE (int8_t, g_kiNeighborIntraToI4x4[16][16], 16) = { // [pMbCache->uiNeighborIntra][4x4 block index] -> row of g_kiIntra4AvailMode / entry of g_kiIntra4AvailCount
+  {	0,	1,	10,	7,	1,	1,	15,	7,	10,	15,	10,	7,	15,	7,	15,	7},
+  {	1,	1,	15,	7,	1,	1,	15,	7,	15,	15,	15,	7,	15,	7,	15,	7},
+  {	10,	15,	10,	7,	15,	7,	15,	7,	10,	15,	10,	7,	15,	7,	15,	7},
+  {	11,	15,	15,	7,	15,	7,	15,	7,	15,	15,	15,	7,	15,	7,	15,	7},
+  {	4,	1,	10,	7,	1,	1,	15,	7,	10,	15,	10,	7,	15,	7,	15,	7},
+  {	5,	1,	15,	7,	1,	1,	15,	7,	15,	15,	15,	7,	15,	7,	15,	7},
+  {	14,	15,	10,	7,	15,	7,	15,	7,	10,	15,	10,	7,	15,	7,	15,	7},
+  {	15,	15,	15,	7,	15,	7,	15,	7,	15,	15,	15,	7,	15,	7,	15,	7},
+  {	0,	1,	10,	7,	1,	9,	15,	7,	10,	15,	10,	7,	15,	7,	15,	7},
+  {	1,	1,	15,	7,	1,	9,	15,	7,	15,	15,	15,	7,	15,	7,	15,	7},
+  {	10,	15,	10,	7,	15,	15,	15,	7,	10,	15,	10,	7,	15,	7,	15,	7},
+  {	11,	15,	15,	7,	15,	15,	15,	7,	15,	15,	15,	7,	15,	7,	15,	7},
+  {	4,	1,	10,	7,	1,	9,	15,	7,	10,	15,	10,	7,	15,	7,	15,	7},
+  {	5,	1,	15,	7,	1,	9,	15,	7,	15,	15,	15,	7,	15,	7,	15,	7},
+  {	14,	15,	10,	7,	15,	15,	15,	7,	10,	15,	10,	7,	15,	7,	15,	7},
+  {	15,	15,	15,	7,	15,	15,	15,	7,	15,	15,	15,	7,	15,	7,	15,	7},
+};
+
+__align16 (const int8_t, g_kiMapModeI4x4[14]) = { // maps a candidate mode index (0..13, cf. the asserts in WelsMdI4x4) to the mode value stored in iIntraPredMode
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 2, 2, 2, 3, 7 // indices 0..8 map to themselves; 9..13 fold onto 2, 2, 2, 3 and 7
+};
+
+// Predicts the Intra4x4 mode for cache slot iIdx4 from the entry above it
+// (iIdx4 - 8) and the entry to its left (iIdx4 - 1): 2 when either is -1,
+int32_t PredIntra4x4Mode (int8_t* pIntraPredMode, int32_t iIdx4) {
+  const int8_t kiLeft  = pIntraPredMode[iIdx4 - 1];
+  const int8_t kiAbove = pIntraPredMode[iIdx4 - 8];
+
+  if (kiLeft == -1 || kiAbove == -1) {
+    return 2;
+  }
+  const int8_t kiSmaller = WELS_MIN (kiLeft, kiAbove); // otherwise the smaller of the two
+  return kiSmaller;
+}
+
+// Prepares pMbCache for intra mode decision of pCurMb:
+//   - points the pEncMb, pCsMb and pDecMb plane pointers at the macroblock,
+//   - clears pCurMb->uiCbp,
+//   - reloads the intra neighbor cache,
+//   - restores the default luma / chroma prediction buffer pointers.
+void WelsMdIntraInit (sWelsEncCtx* pEncCtx, SMB* pCurMb, SMbCache* pMbCache, const int32_t iSliceFirstMbXY) {
+  SDqLayer* pLayer    = pEncCtx->pCurDqLayer;
+  const int32_t kiCol = pCurMb->iMbX;
+  const int32_t kiRow = pCurMb->iMbY;
+
+  if (0 != kiCol && iSliceFirstMbXY != pCurMb->iMbXY) {
+    // neither row start nor slice start: advance every plane pointer by one MB width
+    pMbCache->SPicData.pEncMb[0] += MB_WIDTH_LUMA;
+    pMbCache->SPicData.pEncMb[1] += MB_WIDTH_CHROMA;
+    pMbCache->SPicData.pEncMb[2] += MB_WIDTH_CHROMA;
+
+    pMbCache->SPicData.pCsMb[0]  += MB_WIDTH_LUMA;
+    pMbCache->SPicData.pCsMb[1]  += MB_WIDTH_CHROMA;
+    pMbCache->SPicData.pCsMb[2]  += MB_WIDTH_CHROMA;
+
+    pMbCache->SPicData.pDecMb[0] += MB_WIDTH_LUMA;
+    pMbCache->SPicData.pDecMb[1] += MB_WIDTH_CHROMA;
+    pMbCache->SPicData.pDecMb[2] += MB_WIDTH_CHROMA;
+  } else {
+    // recompute the absolute positions: (column + row * stride), << 4 for luma, << 3 for chroma
+    const int32_t kiEncStrideY  = pLayer->iEncStride[0];
+    const int32_t kiEncStrideUV = pLayer->iEncStride[1];
+    const int32_t kiCsStrideY   = pLayer->iCsStride[0];
+    const int32_t kiCsStrideUV  = pLayer->iCsStride[1];
+    const int32_t kiDecStrideY  = pLayer->pDecPic->iLineSize[0];
+    const int32_t kiDecStrideUV = pLayer->pDecPic->iLineSize[1];
+
+    const int32_t kiEncOffsetY  = (kiCol + kiRow * kiEncStrideY) << 4;
+    const int32_t kiEncOffsetUV = (kiCol + kiRow * kiEncStrideUV) << 3;
+    const int32_t kiCsOffsetY   = (kiCol + kiRow * kiCsStrideY) << 4;
+    const int32_t kiCsOffsetUV  = (kiCol + kiRow * kiCsStrideUV) << 3;
+    const int32_t kiDecOffsetY  = (kiCol + kiRow * kiDecStrideY) << 4;
+    const int32_t kiDecOffsetUV = (kiCol + kiRow * kiDecStrideUV) << 3;
+
+    pMbCache->SPicData.pEncMb[0] = pLayer->pEncData[0] + kiEncOffsetY;
+    pMbCache->SPicData.pEncMb[1] = pLayer->pEncData[1] + kiEncOffsetUV;
+    pMbCache->SPicData.pEncMb[2] = pLayer->pEncData[2] + kiEncOffsetUV;
+
+    pMbCache->SPicData.pCsMb[0]  = pLayer->pCsData[0] + kiCsOffsetY;
+    pMbCache->SPicData.pCsMb[1]  = pLayer->pCsData[1] + kiCsOffsetUV;
+    pMbCache->SPicData.pCsMb[2]  = pLayer->pCsData[2] + kiCsOffsetUV;
+
+    pMbCache->SPicData.pDecMb[0] = pLayer->pDecPic->pData[0] + kiDecOffsetY;
+    pMbCache->SPicData.pDecMb[1] = pLayer->pDecPic->pData[1] + kiDecOffsetUV;
+    pMbCache->SPicData.pDecMb[2] = pLayer->pDecPic->pData[2] + kiDecOffsetUV;
+  }
+
+  pCurMb->uiCbp = 0;
+
+  FillNeighborCacheIntra (pMbCache, pCurMb, pLayer->iMbWidth);
+
+  // WelsMdI16x16() re-points both prediction buffers, so restore the defaults for
+  // every macroblock: luma at the start of pMemPredMb, chroma 256 bytes further on.
+  pMbCache->pMemPredLuma   = pMbCache->pMemPredMb;
+  pMbCache->pMemPredChroma = pMbCache->pMemPredMb + 256;
+}
+
+// Prepares pSlice's MB cache for inter mode decision of pCurMb: binds pEncSad to the
+// MB's pMbSkipSad slot, fills the inter neighbor cache, positions the reference plane
+// pointers, stores 0 into the MB's motion vector slots and sets the slice's MV limits.
+void WelsMdInterInit (sWelsEncCtx* pEncCtx, SSlice* pSlice, SMB* pCurMb, const int32_t iSliceFirstMbXY) {
+  SDqLayer* pLayer         = pEncCtx->pCurDqLayer;
+  SMbCache* pCache         = &pSlice->sMbCacheInfo;
+  const int32_t kiCol      = pCurMb->iMbX;
+  const int32_t kiRow      = pCurMb->iMbY;
+  const int32_t kiMbIdx    = pCurMb->iMbXY;
+  const int32_t kiColsInMb = pLayer->iMbWidth;
+  const int32_t kiRowsInMb = pLayer->iMbHeight;
+
+  pCache->pEncSad = &pLayer->pDecPic->pMbSkipSad[kiMbIdx];
+
+  // neighbor cache; the filler also receives this MB's background flag (BGD spatial pFunc)
+  pEncCtx->pFuncList->pfFillInterNeighborCache (pCache, pCurMb, kiColsInMb,
+      pEncCtx->pVaa->pVaaBackgroundMbFlag + kiMbIdx);
+
+  if (0 != kiCol && iSliceFirstMbXY != kiMbIdx) {
+    // neither row start nor slice start: advance the reference pointers by one MB width
+    pCache->SPicData.pRefMb[0] += MB_WIDTH_LUMA;
+    pCache->SPicData.pRefMb[1] += MB_WIDTH_CHROMA;
+    pCache->SPicData.pRefMb[2] += MB_WIDTH_CHROMA;
+  } else {
+    // recompute the absolute position inside the reference picture
+    const int32_t kiStrideY  = pLayer->pRefPic->iLineSize[0];
+    const int32_t kiStrideUV = pLayer->pRefPic->iLineSize[1];
+    const int32_t kiOffsetY  = (kiCol + kiRow * kiStrideY) << 4;
+    const int32_t kiOffsetUV = (kiCol + kiRow * kiStrideUV) << 3;
+    pCache->SPicData.pRefMb[0] = pLayer->pRefPic->pData[0] + kiOffsetY;
+    pCache->SPicData.pRefMb[1] = pLayer->pRefPic->pData[1] + kiOffsetUV;
+    pCache->SPicData.pRefMb[2] = pLayer->pRefPic->pData[2] + kiOffsetUV;
+  }
+
+  pCache->uiRefMbType         = pLayer->pRefPic->uiRefMbType[kiMbIdx];
+  pCache->bCollocatedPredFlag = false;
+
+  // mode decision may bypass the P16x16 / P-skip search, so give the MV slots a defined value here
+  ST32 (&pCurMb->sP16x16Mv, 0);
+  ST32 (&pLayer->pDecPic->sMvList[kiMbIdx], 0);
+
+  // MV limits: MB position / layer size scaled by 16, adjusted by
+  // INTPEL_NEEDED_MARGIN, and never beyond +/- MV_RANGE
+  pSlice->sMvMin.iMvX = -16 * (kiCol + 1) + INTPEL_NEEDED_MARGIN;
+  pSlice->sMvMin.iMvY = -16 * (kiRow + 1) + INTPEL_NEEDED_MARGIN;
+  pSlice->sMvMax.iMvX = 16 * (kiColsInMb - kiCol) - INTPEL_NEEDED_MARGIN;
+  pSlice->sMvMax.iMvY = 16 * (kiRowsInMb - kiRow) - INTPEL_NEEDED_MARGIN;
+
+  if (pSlice->sMvMin.iMvX < -MV_RANGE) pSlice->sMvMin.iMvX = -MV_RANGE;
+  if (pSlice->sMvMin.iMvY < -MV_RANGE) pSlice->sMvMin.iMvY = -MV_RANGE;
+  if (pSlice->sMvMax.iMvX > MV_RANGE)  pSlice->sMvMax.iMvX = MV_RANGE;
+  if (pSlice->sMvMax.iMvY > MV_RANGE)  pSlice->sMvMax.iMvY = MV_RANGE;
+}
+
+int32_t WelsMdI16x16 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SMbCache* pMbCache, int32_t iLambda) { // Intra16x16 luma mode decision; returns the cost of the chosen mode
+  const int8_t*  kpAvailMode;
+  int32_t iAvailCount;
+  int32_t iIdx = 0; // index of the scratch buffer; the best prediction ends up in pPredI16x16[iIdx ^ 0x01]
+  uint8_t* pPredI16x16[2] = {pMbCache->pMemPredMb, pMbCache->pMemPredMb + 256}; // two prediction buffers, 256 bytes apart
+  uint8_t* pDst		= pPredI16x16[0];
+  uint8_t* pDec       = pMbCache->SPicData.pCsMb[0];
+  uint8_t* pEnc       = pMbCache->SPicData.pEncMb[0];
+  int32_t iLineSizeDec = pCurDqLayer->iCsStride[0];
+  int32_t iLineSizeEnc = pCurDqLayer->iEncStride[0];
+  int32_t i, iCurCost, iCurMode, iBestMode, iBestCost = INT_MAX;
+
+  int32_t iOffset = pMbCache->uiNeighborIntra & 0x07; // low three bits select the candidate row
+  iAvailCount = g_kiIntra16AvaliMode[iOffset][4]; // entry [4] of a row is its candidate count
+  kpAvailMode = g_kiIntra16AvaliMode[iOffset];
+  if (iAvailCount > 3 && pFunc->sSampleDealingFuncs.pfIntra16x16Combined3) { // fast path: four candidates and a combined routine is installed
+    iBestCost = pFunc->sSampleDealingFuncs.pfIntra16x16Combined3 (pDec, iLineSizeDec, pEnc, iLineSizeEnc, &iBestMode,
+                iLambda, pDst/*temp*/); // NOTE(review): presumably costs the first three candidates in one call -- confirm
+    iCurMode = kpAvailMode[3]; // the fourth candidate is costed separately
+    pFunc->pfGetLumaI16x16Pred[iCurMode] (pDst, pDec, iLineSizeDec);
+    iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_16x16] (pDst, 16, pEnc, iLineSizeEnc) + iLambda * 4 ;
+    if (iCurCost < iBestCost) {
+      iBestMode = iCurMode;
+      iBestCost = iCurCost;
+    } else {
+      pFunc->pfGetLumaI16x16Pred[iBestMode] (pDst, pDec, iLineSizeDec); // pDst was overwritten above: regenerate the winning prediction
+    }
+    iIdx = 1; // best prediction sits in pPredI16x16[0] (pDst), so buffer 1 is the free one
+    iBestCost += iLambda;
+  } else {
+    iBestMode = kpAvailMode[0];
+    for (i = 0; i < iAvailCount; ++ i) { // exhaustive path: cost every available candidate
+      iCurMode = kpAvailMode[i];
+
+      assert (iCurMode >= 0 && iCurMode < 7);
+
+      pFunc->pfGetLumaI16x16Pred[iCurMode] (pDst, pDec, iLineSizeDec);
+      iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_16x16] (pDst, 16, pEnc, iLineSizeEnc);
+      iCurCost += iLambda * (BsSizeUE (g_kiMapModeI16x16[iCurMode]));
+      if (iCurCost < iBestCost) {
+        iBestMode = iCurMode;
+        iBestCost = iCurCost;
+        iIdx = iIdx ^ 0x01; // keep the new best where it is ...
+        pDst = pPredI16x16[iIdx]; // ... and write the next candidate into the other buffer
+      }
+    }
+  }
+  pMbCache->pMemPredChroma = pPredI16x16[iIdx]; // the buffer not holding the best luma prediction
+
+  pMbCache->pMemPredLuma = pPredI16x16[iIdx ^ 0x01]; // the buffer holding the best luma prediction
+  pMbCache->uiLumaI16x16Mode  = iBestMode;
+  return iBestCost;
+}
+// Intra4x4 mode decision for one macroblock; pEnc is the sWelsEncCtx and pMd the SWelsMD.
+// For each of the 16 4x4 luma blocks, located through g_kiCoordinateIdx4x4X/Y:
+//   - the mode is predicted from the neighbor modes cached in pMbCache->iIntraPredMode;
+//   - every candidate allowed by the neighbor availability is costed as SATD plus lambda[]
+//     (iLambda when the candidate maps to the predicted mode, 4 * iLambda otherwise);
+//   - the winner updates the prev/rem mode flags and the mode cache, and the block is
+//     then handed to WelsEncRecI4x4Y().
+// The scan is abandoned as soon as the accumulated cost reaches pWelsMd->iCostLuma.
+// Returns the accumulated cost plus 24 * iLambda; side effects are the flag arrays,
+// pMbCache->iIntraPredMode, pMbCache->pBestPredI4x4Blk4 and pCurMb->pIntra4x4PredMode.
+int32_t WelsMdI4x4 (void* pEnc, void* pMd, SMB* pCurMb, SMbCache* pMbCache) {
+  sWelsEncCtx* pEncCtx                = (sWelsEncCtx*)pEnc;
+  SWelsFuncPtrList* pFunc             = pEncCtx->pFuncList;
+  SWelsMD* pWelsMd                    = (SWelsMD*)pMd;
+  SDqLayer* pCurDqLayer               = pEncCtx->pCurDqLayer;
+  int32_t iLambda                     = pWelsMd->iLambda;
+  int32_t iBestCostLuma               = pWelsMd->iCostLuma;
+  uint8_t* pEncMb                     = pMbCache->SPicData.pEncMb[0];
+  uint8_t* pDecMb                     = pMbCache->SPicData.pCsMb[0];
+  const int32_t kiLineSizeEnc         = pCurDqLayer->iEncStride[0];
+  const int32_t kiLineSizeDec         = pCurDqLayer->iCsStride[0];
+
+  uint8_t* pCurEnc, *pCurDec, *pDst;
+
+  int32_t iPredMode, iCurMode, iBestMode, iFinalMode;
+  int32_t iCurCost, iBestCost;
+  int32_t iAvailCount;
+  const uint8_t* kpAvailMode;
+  int32_t i, j, iCoordinateX, iCoordinateY, iIdxStrideEnc, iIdxStrideDec;
+  int32_t lambda[2]                   = {iLambda << 2, iLambda}; // [0]: mode differs from the prediction, [1]: equals it
+  bool_t* pPrevIntra4x4PredModeFlag   = pMbCache->pPrevIntra4x4PredModeFlag;
+  int8_t* pRemIntra4x4PredModeFlag    = pMbCache->pRemIntra4x4PredModeFlag;
+  const uint8_t* kpIntra4x4AvailCount = &g_kiIntra4AvailCount[0];
+  const uint8_t* kpCache48CountScan4  = &g_kuiCache48CountScan4Idx[0];
+  const int8_t* kpNeighborIntraToI4x4 = g_kiNeighborIntraToI4x4[pMbCache->uiNeighborIntra];
+  const int8_t* kpCoordinateIdxX      = &g_kiCoordinateIdx4x4X[0];
+  const int8_t* kpCoordinateIdxY      = &g_kiCoordinateIdx4x4Y[0];
+  int32_t iBestPredBufferNum          = 0; // which 16-byte half of pMemPredBlk4 holds the best prediction
+  int32_t iCosti4x4                   = 0;
+
+#if defined(X86_ASM)
+  WelsPrefetchZero_mmx (g_kiMapModeI4x4);
+  WelsPrefetchZero_mmx ((int8_t*)&pFunc->pfGetLumaI4x4Pred);
+#endif//X86_ASM
+
+  for (i = 0; i < 16; i++) {
+    const int32_t kiOffset = kpNeighborIntraToI4x4[i];
+
+    //step 1: locating current 4x4 block position in pEnc and pDecMb
+    iCoordinateX = kpCoordinateIdxX[i];
+    iCoordinateY = kpCoordinateIdxY[i];
+
+    iIdxStrideEnc = (iCoordinateY * kiLineSizeEnc) + iCoordinateX;
+    pCurEnc = pEncMb + iIdxStrideEnc;
+    iIdxStrideDec = (iCoordinateY * kiLineSizeDec) + iCoordinateX;
+    pCurDec = pDecMb + iIdxStrideDec;
+
+    //step 2: get predicted mode from neighbor
+    iPredMode = PredIntra4x4Mode (pMbCache->iIntraPredMode, kpCache48CountScan4[i]);
+
+    //step 3: collect candidates of iPredMode
+    iAvailCount = kpIntra4x4AvailCount[kiOffset];
+    kpAvailMode = g_kiIntra4AvailMode[kiOffset];
+
+    //step 4: gain the best pred mode
+    iBestCost = INT_MAX;
+    iBestMode = kpAvailMode[0];
+    j = 0; // first candidate still to be costed one by one
+
+    if (pFunc->sSampleDealingFuncs.pfIntra4x4Combined3Satd && (iAvailCount >= 6)) {
+      // Fast path: one call costs the first three candidates (mode values 2, 1, 0, matching
+      // the lambdas passed). Call the very pointer that was null-checked above.
+      pDst = &pMbCache->pMemPredBlk4[iBestPredBufferNum << 4];
+      iBestCost = pFunc->sSampleDealingFuncs.pfIntra4x4Combined3Satd (pCurDec, kiLineSizeDec, pCurEnc, kiLineSizeEnc, pDst,
+                  &iBestMode,
+                  lambda[iPredMode == 2], lambda[iPredMode == 1], lambda[iPredMode == 0]);
+      j = 3; // candidates 0..2 are covered by the combined call
+    }
+
+    // Cost the remaining candidates one by one; each prediction is written to the half
+    // of pMemPredBlk4 that does not hold the current best, and the halves swap on a win.
+    for (; j < iAvailCount; ++ j) {
+      iCurMode = kpAvailMode[j];
+
+      assert (iCurMode >= 0 && iCurMode < 14);
+
+      pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
+
+      pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
+      iCurCost = pFunc->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
+                 lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
+
+      if (iCurCost < iBestCost) {
+        iBestMode = iCurMode;
+        iBestCost = iCurCost;
+        iBestPredBufferNum = 1 - iBestPredBufferNum;
+      }
+    }
+
+    // publish the half that holds the winning prediction
+    pMbCache->pBestPredI4x4Blk4 = &pMbCache->pMemPredBlk4[iBestPredBufferNum << 4];
+    iCosti4x4 += iBestCost;
+    if (iCosti4x4 >= iBestCostLuma) {
+      break; // the accumulated I4x4 cost already reached pWelsMd->iCostLuma
+    }
+
+    //step 5: update pred mode and sample avail cache
+    // the rem value skips over the predicted mode, hence the -1 for modes above it
+    iFinalMode = g_kiMapModeI4x4[iBestMode];
+    if (iPredMode == iFinalMode) {
+      *pPrevIntra4x4PredModeFlag++ = true;
+    } else {
+      *pPrevIntra4x4PredModeFlag++ = false;
+      *pRemIntra4x4PredModeFlag  = (iFinalMode < iPredMode ? iFinalMode : (iFinalMode - 1));
+    }
+    pRemIntra4x4PredModeFlag++;
+    pMbCache->iIntraPredMode[kpCache48CountScan4[i]] = iFinalMode;
+
+    //step 6: encoding I_4x4
+    WelsEncRecI4x4Y (pEncCtx, pCurMb, pMbCache, i);
+  }
+
+  // Export cache entries 33..36 and 12, 20, 28 to pCurMb->pIntra4x4PredMode.
+  // NOTE(review): presumably the bottom row / right column kept for later MBs -- confirm.
+  ST32 (pCurMb->pIntra4x4PredMode, LD32 (&pMbCache->iIntraPredMode[33]));
+  pCurMb->pIntra4x4PredMode[4] = pMbCache->iIntraPredMode[12];
+  pCurMb->pIntra4x4PredMode[5] = pMbCache->iIntraPredMode[20];
+  pCurMb->pIntra4x4PredMode[6] = pMbCache->iIntraPredMode[28];
+  iCosti4x4 += (iLambda << 4) + (iLambda << 3); //4*6*lambda from JVT SATD0
+  return iCosti4x4;
+}
+
+int32_t WelsMdI4x4Fast (void* pEnc, void* pMd, SMB* pCurMb, SMbCache* pMbCache) {
+  sWelsEncCtx* pEncCtx	= (sWelsEncCtx*)pEnc;
+  SWelsFuncPtrList* pFunc		= pEncCtx->pFuncList;
+  SWelsMD* pWelsMd					= (SWelsMD*)pMd;
+  SDqLayer* pCurDqLayer			= pEncCtx->pCurDqLayer;
+  int32_t iLambda				= pWelsMd->iLambda;
+  int32_t iBestCostLuma				= pWelsMd->iCostLuma;
+  uint8_t* pEncMb					= pMbCache->SPicData.pEncMb[0];
+  uint8_t* pDecMb					= pMbCache->SPicData.pCsMb[0];
+  const int32_t kiLineSizeEnc		= pCurDqLayer->iEncStride[0];
+  const int32_t kiLineSizeDec		= pCurDqLayer->iCsStride[0];
+
+  uint8_t* pCurEnc, *pCurDec, *pDst;
+  int8_t iPredMode, iCurMode, iBestMode, iFinalMode;
+  int32_t iCurCost, iBestCost;
+  int32_t iAvailCount;
+  const uint8_t* kpAvailMode;
+  int32_t i, j, iCoordinateX, iCoordinateY, iIdxStrideEnc, iIdxStrideDec;
+  int32_t iCostH, iCostV, iCostVR, iCostHD, iCostVL, iCostHU, iBestModeFake;
+  int32_t lambda[2]						= {iLambda << 2, iLambda};
+  bool_t* pPrevIntra4x4PredModeFlag	= pMbCache->pPrevIntra4x4PredModeFlag;
+  int8_t* pRemIntra4x4PredModeFlag		= pMbCache->pRemIntra4x4PredModeFlag;
+  const uint8_t* kpIntra4x4AvailCount		= &g_kiIntra4AvailCount[0];
+  const uint8_t* kpCache48CountScan4		= &g_kuiCache48CountScan4Idx[0];
+  const int8_t* kpNeighborIntraToI4x4	= g_kiNeighborIntraToI4x4[pMbCache->uiNeighborIntra];
+  const int8_t* kpCoordinateIdxX			= &g_kiCoordinateIdx4x4X[0];
+  const int8_t* kpCoordinateIdxY			= &g_kiCoordinateIdx4x4Y[0];
+  int32_t iBestPredBufferNum			= 0;
+  int32_t iCosti4x4						= 0;
+#if defined(X86_ASM)
+  WelsPrefetchZero_mmx (g_kiMapModeI4x4);
+  WelsPrefetchZero_mmx ((int8_t*)&pFunc->pfGetLumaI4x4Pred);
+#endif//X86_ASM
+
+  for (i = 0; i < 16; i++) {
+    const int32_t kiOffset	= kpNeighborIntraToI4x4[i];
+//		const int32_t i_next	= (1+i) & 15;												// next loop
+//		const uint8_t dummy_byte= pIntra4x4AvailCount[pNeighborIntraToI4x4[i_next]];	// prefetch pIntra4x4AvailCount of next loop to avoid cache missed
+
+    //step 1: locating current 4x4 block position in pEnc and pDecMb
+    iCoordinateX = kpCoordinateIdxX[i];
+    iCoordinateY = kpCoordinateIdxY[i];
+
+    iIdxStrideEnc = (iCoordinateY * kiLineSizeEnc) + iCoordinateX;
+    pCurEnc = pEncMb + iIdxStrideEnc;
+    iIdxStrideDec = (iCoordinateY * kiLineSizeDec) + iCoordinateX;
+    pCurDec = pDecMb + iIdxStrideDec;
+
+    //step 2: get predicted mode from neighbor
+    iPredMode = PredIntra4x4Mode (pMbCache->iIntraPredMode, kpCache48CountScan4[i]);
+    //step 3: collect candidates of iPredMode
+    iAvailCount = kpIntra4x4AvailCount[kiOffset];
+    kpAvailMode = g_kiIntra4AvailMode[kiOffset];
+
+    if (iAvailCount == 9 || iAvailCount == 7) {
+      //I4_PRED_DC(2)
+
+      iBestMode = I4_PRED_DC;
+
+      pDst = &pMbCache->pMemPredBlk4[iBestPredBufferNum << 4];
+
+      pFunc->pfGetLumaI4x4Pred[I4_PRED_DC] (pDst, pCurDec, kiLineSizeDec);
+      iBestCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
+                  lambda[iPredMode == g_kiMapModeI4x4[iBestMode]];
+
+      //I4_PRED_H(1)
+      iCurMode = I4_PRED_H;
+
+      pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
+
+      pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
+      iCostH = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
+                          lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
+
+      if (iCurCost < iBestCost) {
+        iBestMode = iCurMode;
+        iBestCost = iCurCost;
+        iBestPredBufferNum = 1 - iBestPredBufferNum;
+      }
+
+      //I4_PRED_V(0)
+      iCurMode = I4_PRED_V;
+
+      pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
+
+      pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
+      iCostV = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
+                          lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
+
+      if (iCurCost < iBestCost) {
+        iBestMode = iCurMode;
+        iBestCost = iCurCost;
+        iBestPredBufferNum = 1 - iBestPredBufferNum;
+      }
+      if (iCostV < iCostH) {
+        if (iAvailCount == 9) {
+          iBestModeFake = true; //indicating whether V is the best fake mode
+
+          //I4_PRED_VR(5) and I4_PRED_VL(7)
+          iCurMode = I4_PRED_VR;
+
+          pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
+
+          pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
+          iCostVR = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
+                               lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
+
+          if (iCurCost < iBestCost) {
+            iBestMode = iCurMode;
+            iBestCost = iCurCost;
+            iBestPredBufferNum = 1 - iBestPredBufferNum;
+          }
+
+          if (iCurCost < iCostV)
+            iBestModeFake = false;
+
+          iCurMode = I4_PRED_VL;
+
+          pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
+
+          pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
+          iCostVL = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
+                               lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
+
+          if (iCurCost < iBestCost) {
+            iBestMode = iCurMode;
+            iBestCost = iCurCost;
+            iBestPredBufferNum = 1 - iBestPredBufferNum;
+          }
+
+          if (iCurCost < iCostV)
+            iBestModeFake = false;
+
+          //Vertical Early Determination
+          if (!iBestModeFake) { //Vertical is not the best, go on checking...
+            //select the best one from VL and VR
+            if (iCostVR < iCostVL) {
+              //I4_PRED_DDR(4)
+              iCurMode = I4_PRED_DDR;
+
+              pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
+
+              pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
+
+              iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
+                         lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
+
+              if (iCurCost < iBestCost) {
+                iBestMode = iCurMode;
+                iBestCost = iCurCost;
+                iBestPredBufferNum = 1 - iBestPredBufferNum;
+              }
+            } else {
+              //I4_PRED_DDL(3)
+              iCurMode = I4_PRED_DDL;
+
+              pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
+
+              pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
+
+              iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
+                         lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
+
+              if (iCurCost < iBestCost) {
+                iBestMode = iCurMode;
+                iBestCost = iCurCost;
+                iBestPredBufferNum = 1 - iBestPredBufferNum;
+              }
+            }
+          }
+        } else if (iAvailCount == 7) {
+          iCurMode = I4_PRED_DDR;
+
+          pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
+
+          pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
+          iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
+                     lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
+
+          if (iCurCost < iBestCost) {
+            iBestMode = iCurMode;
+            iBestCost = iCurCost;
+            iBestPredBufferNum = 1 - iBestPredBufferNum;
+          }
+
+          iCurMode = I4_PRED_VR;
+
+          pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
+
+          pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
+
+          iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
+                     lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
+
+          if (iCurCost < iBestCost) {
+            iBestMode = iCurMode;
+            iBestCost = iCurCost;
+            iBestPredBufferNum = 1 - iBestPredBufferNum;
+          }
+        }
+      } else {
+        iBestModeFake = true; //indicating whether H is the best fake mode
+        //I4_PRED_HD(6) and I4_PRED_HU(8)
+        iCurMode = I4_PRED_HD;
+
+        pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
+
+        pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
+        iCostHD = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
+                             lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
+
+        if (iCurCost < iBestCost) {
+          iBestMode = iCurMode;
+          iBestCost = iCurCost;
+          iBestPredBufferNum = 1 - iBestPredBufferNum;
+        }
+
+        if (iCurCost < iCostH)
+          iBestModeFake = false;
+
+        iCurMode = I4_PRED_HU;
+
+        pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
+
+        pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
+        iCostHU = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
+                             lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
+
+        if (iCurCost < iBestCost) {
+          iBestMode = iCurMode;
+          iBestCost = iCurCost;
+          iBestPredBufferNum = 1 - iBestPredBufferNum;
+        }
+
+        if (iCurCost < iCostH)
+          iBestModeFake = false;
+
+        if (!iBestModeFake) { //Horizontal is not the best, go on checking...
+          //select the best one from HD and HU
+          if (iCostHD < iCostHU) {
+            //I4_PRED_DDR(4)
+            iCurMode = I4_PRED_DDR;
+
+            pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
+
+            pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
+            iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
+                       lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
+
+            if (iCurCost < iBestCost) {
+              iBestMode = iCurMode;
+              iBestCost = iCurCost;
+              iBestPredBufferNum = 1 - iBestPredBufferNum;
+            }
+          } else if (iAvailCount == 9) {
+            //I4_PRED_DDL(3)
+            iCurMode = I4_PRED_DDL;
+
+            pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
+            pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
+
+            iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
+                       lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
+
+            if (iCurCost < iBestCost) {
+              iBestMode = iCurMode;
+              iBestCost = iCurCost;
+              iBestPredBufferNum = 1 - iBestPredBufferNum;
+            }
+
+          }
+        }
+      }
+    } else {
+      iBestCost = INT_MAX;
+      iBestMode = I4_PRED_INVALID;
+      for (j = 0; j < iAvailCount; j++) {
+        // I4x4_MODE_CHECK(pAvailMode[j], iCurCost);
+        iCurMode = kpAvailMode[j];
+
+        pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
+
+        pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
+        iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
+                   lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
+
+        if (iCurCost < iBestCost) {
+          iBestMode = iCurMode;
+          iBestCost = iCurCost;
+          iBestPredBufferNum = 1 - iBestPredBufferNum;
+        }
+      }
+    }
+    pMbCache->pBestPredI4x4Blk4 = &pMbCache->pMemPredBlk4[iBestPredBufferNum << 4];
+    iCosti4x4 += iBestCost;
+    if (iCosti4x4 >= iBestCostLuma) {
+      break;
+    }
+
+    //step 5: update pred mode and sample avail cache
+    iFinalMode = g_kiMapModeI4x4[iBestMode];
+    if (iPredMode == iFinalMode) {
+      *pPrevIntra4x4PredModeFlag++ = true;
+    } else {
+      *pPrevIntra4x4PredModeFlag++ = false;
+      *pRemIntra4x4PredModeFlag  = (iFinalMode < iPredMode ? iFinalMode : (iFinalMode - 1));
+    }
+    pRemIntra4x4PredModeFlag++;
+    //	pCurMb->pIntra4x4PredMode[scan4[i]] = iFinalMode;
+    pMbCache->iIntraPredMode[kpCache48CountScan4[i]] = iFinalMode;
+    //step 6: encoding I_4x4
+    WelsEncRecI4x4Y (pEncCtx, pCurMb, pMbCache, i);
+  }
+  ST32 (pCurMb->pIntra4x4PredMode, LD32 (&pMbCache->iIntraPredMode[33]));
+  pCurMb->pIntra4x4PredMode[4] = pMbCache->iIntraPredMode[12];
+  pCurMb->pIntra4x4PredMode[5] =	pMbCache->iIntraPredMode[20];
+  pCurMb->pIntra4x4PredMode[6] = pMbCache->iIntraPredMode[28];
+  iCosti4x4 += (iLambda << 4) + (iLambda << 3); //4*6*lambda from JVT SATD0
+  return iCosti4x4;
+}
+
+// Chooses the intra chroma prediction mode with the lowest cost (Cb + Cr plus a lambda-weighted mode
+// term) among the modes listed for the current neighbor pattern. Stores the mode in uiChmaI8x8Mode,
+// points pBestPredIntraChroma at the winning prediction and returns the winning cost.
+int32_t WelsMdIntraChroma (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SMbCache* pMbCache, int32_t iLambda) {
+  const int8_t* kpAvailMode;
+  int32_t iAvailCount				= 0;
+  int32_t iChmaIdx = 0;
+  uint8_t* pPredIntraChma[2]	= {pMbCache->pMemPredChroma, pMbCache->pMemPredChroma + 128};
+  uint8_t* pDstChma				= pPredIntraChma[0];
+  uint8_t* pEncCb				= pMbCache->SPicData.pEncMb[1];
+  uint8_t* pEncCr				= pMbCache->SPicData.pEncMb[2];
+  uint8_t* pDecCb				= pMbCache->SPicData.pCsMb[1];//pMbCache->SPicData.pDecMb[1];
+  uint8_t* pDecCr				= pMbCache->SPicData.pCsMb[2];//pMbCache->SPicData.pDecMb[2];
+  const int32_t kiLineSizeEnc		= pCurDqLayer->iEncStride[1];
+  const int32_t kiLineSizeDec		= pCurDqLayer->iCsStride[1];//pMbCache->SPicData.i_stride_dec[1];
+
+  int32_t i, iCurMode, iCurCost, iBestMode, iBestCost = INT_MAX;
+  // the low 3 bits of uiNeighborIntra pick a table row; entry [4] of that row is its mode count
+  int32_t iOffset = pMbCache->uiNeighborIntra & 0x07;
+  iAvailCount = g_kiIntraChromaAvailMode[iOffset][4];
+  kpAvailMode = g_kiIntraChromaAvailMode[iOffset];
+  if (iAvailCount > 3 && pFunc->sSampleDealingFuncs.pfIntra8x8Combined3) {
+    iBestCost = pFunc->sSampleDealingFuncs.pfIntra8x8Combined3 (pDecCb, kiLineSizeDec, pEncCb, kiLineSizeEnc, &iBestMode,
+                iLambda, pDstChma, pDecCr, pEncCr);
+    // the combined call reports its own best mode/cost (presumably over the first three modes - confirm); the 4th listed mode is checked here
+    iCurMode = kpAvailMode[3];
+    pFunc->pfGetChromaPred[iCurMode] (pDstChma, pDecCb, kiLineSizeDec); //Cb
+    pFunc->pfGetChromaPred[iCurMode] (pDstChma + 64, pDecCr, kiLineSizeDec); //Cr
+    iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_8x8] (pDstChma, 8, pEncCb, kiLineSizeEnc) +
+               pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_8x8] (pDstChma + 64, 8, pEncCr, kiLineSizeEnc) +
+               iLambda * 4;
+    if (iCurCost < iBestCost) {
+      iBestMode = iCurMode;
+      iBestCost = iCurCost;
+    } else {
+      // the combined winner stays: pDstChma was just overwritten with iCurMode's prediction, so rebuild it
+      pFunc->pfGetChromaPred[iBestMode] (pDstChma, pDecCb, kiLineSizeDec); //Cb
+      pFunc->pfGetChromaPred[iBestMode] (pDstChma + 64, pDecCr, kiLineSizeDec); //Cr
+    }
+    iBestCost += iLambda;
+    iChmaIdx = 1;
+  } else {
+    iBestMode = kpAvailMode[0];
+    for (i = 0; i < iAvailCount; ++ i) {
+      iCurMode = kpAvailMode[i];
+      assert (iCurMode >= 0 && iCurMode < 7);
+      pFunc->pfGetChromaPred[iCurMode] (pDstChma, pDecCb, kiLineSizeDec); //Cb
+      iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_8x8] (pDstChma, 8, pEncCb, kiLineSizeEnc);
+      pFunc->pfGetChromaPred[iCurMode] (pDstChma + 64, pDecCr, kiLineSizeDec); //Cr
+      iCurCost += pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_8x8] (pDstChma + 64, 8, pEncCr, kiLineSizeEnc) +
+                  iLambda * BsSizeUE (g_kiMapModeIntraChroma[iCurMode]);
+      // on a new best, continue in the other buffer so the winning prediction is not overwritten
+      if (iCurCost < iBestCost) {
+        iBestMode = iCurMode;
+        iBestCost = iCurCost;
+        iChmaIdx = iChmaIdx ^ 0x01;
+        pDstChma	= pPredIntraChma[iChmaIdx];
+      }
+    }
+  }
+  // iChmaIdx names the scratch buffer at this point; the kept prediction is in the other one
+  pMbCache->pBestPredIntraChroma	= pPredIntraChma[iChmaIdx ^ 0x01];
+  pMbCache->uiChmaI8x8Mode = iBestMode;
+  return iBestCost;
+}
+// Runs the I4x4 decision for the MB and adopts it when it is cheaper than the luma cost held so far.
+int32_t WelsMdIntraFinePartition (void* pEnc, void* pMd, SMB* pCurMb, SMbCache* pMbCache) {
+  SWelsMD* const pModeDecision = static_cast<SWelsMD*> (pMd);
+  const int32_t kiI4x4Cost = WelsMdI4x4 (static_cast<sWelsEncCtx*> (pEnc), pModeDecision, pCurMb, pMbCache);
+
+  if (kiI4x4Cost >= pModeDecision->iCostLuma) {
+    return pModeDecision->iCostLuma; // previous decision stays
+  }
+  pCurMb->uiMbType = MB_TYPE_INTRA4x4;
+  pModeDecision->iCostLuma = kiI4x4Cost;
+  return pModeDecision->iCostLuma;
+}
+
+// Runs the fast I4x4 decision only when MdIntraAnalysisVaaInfo() returns true; adopts I4x4 when it is cheaper.
+int32_t WelsMdIntraFinePartitionVaa (void* pEnc, void* pMd, SMB* pCurMb, SMbCache* pMbCache) {
+  sWelsEncCtx* const pCtx = static_cast<sWelsEncCtx*> (pEnc);
+  SWelsMD* const pModeDecision = static_cast<SWelsMD*> (pMd);
+
+  if (!MdIntraAnalysisVaaInfo (pCtx, pMbCache->SPicData.pEncMb[0])) {
+    return pModeDecision->iCostLuma; // analysis returned false: I4x4 is not evaluated
+  }
+  const int32_t kiI4x4Cost = WelsMdI4x4Fast (pCtx, pModeDecision, pCurMb, pMbCache);
+  if (kiI4x4Cost < pModeDecision->iCostLuma) {
+    pCurMb->uiMbType = MB_TYPE_INTRA4x4;
+    pModeDecision->iCostLuma = kiI4x4Cost;
+  }
+  return pModeDecision->iCostLuma;
+}
+
+// Intra mode decision for one MB: I16x16 sets the initial type and cost, then the secondary intra modes run.
+void WelsMdIntraMb (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache* pMbCache) {
+  const int32_t kiI16x16Cost = WelsMdI16x16 (pEncCtx->pFuncList, pEncCtx->pCurDqLayer, pMbCache, pWelsMd->iLambda);
+  pWelsMd->iCostLuma = kiI16x16Cost; // starting point for the modes tried next
+  pCurMb->uiMbType   = MB_TYPE_INTRA16x16;
+  WelsMdIntraSecondaryModesEnc (pEncCtx, pWelsMd, pCurMb, pMbCache);
+}
+
+// 16x16 inter motion estimation for one MB: fills pSlice->sMvc with candidate vectors, runs
+// pfMotionSearch and returns the resulting uiSatdCost; the found vector is saved for later use.
+int32_t WelsMdP16x16 (SWelsFuncPtrList* pFunc, SDqLayer* pCurLayer, SWelsMD* pWelsMd, SSlice* pSlice, SMB* pCurMb) {
+  SMbCache* pMbCache = &pSlice->sMbCacheInfo;
+  SWelsME* sMe16x16 = &pWelsMd->sMe.sMe16x16;
+  uint32_t uiNeighborAvail = pCurMb->uiNeighborAvail;
+  const int32_t kiMbWidth	= pCurLayer->iMbWidth;	// for assign once
+  const int32_t kiMbHeight	= pCurLayer->iMbHeight;
+
+  sMe16x16->uiPixel = BLOCK_16x16;
+  sMe16x16->pMvdCost = pWelsMd->pMvdCost;
+  sMe16x16->pEncMb  = pMbCache->SPicData.pEncMb[0];
+  sMe16x16->pRefMb  = pMbCache->SPicData.pRefMb[0];
+  sMe16x16->uSadPredISatd.uiSadPred = pWelsMd->iSadPredMb;
+  // the candidate list starts with the ME base vector
+  pSlice->uiMvcNum = 0;
+  pSlice->sMvc[pSlice->uiMvcNum++] = sMe16x16->sMvBase;
+  //spatial motion vector predictors: 16x16 vectors of the left and top MBs, when available
+  if (uiNeighborAvail & LEFT_MB_POS) { //left available
+    pSlice->sMvc[pSlice->uiMvcNum++] = (pCurMb - 1)->sP16x16Mv;
+  }
+  if (uiNeighborAvail & TOP_MB_POS) { //top available
+    pSlice->sMvc[pSlice->uiMvcNum++] = (pCurMb - kiMbWidth)->sP16x16Mv;
+  }
+  //temporal motion vector predictors: vectors the reference picture holds for the MB to the right and the MB below, shifted by sScaleShift
+  if (pCurLayer->pRefPic->iPictureType == P_SLICE) {
+    if (pCurMb->iMbX < kiMbWidth - 1) {
+      SMVUnitXY sTempMv = pCurLayer->pRefPic->sMvList[pCurMb->iMbXY + 1];
+      pSlice->sMvc[pSlice->uiMvcNum].iMvX = sTempMv.iMvX >> pSlice->sScaleShift;
+      pSlice->sMvc[pSlice->uiMvcNum].iMvY = sTempMv.iMvY >> pSlice->sScaleShift;
+      ++ pSlice->uiMvcNum;
+    }
+    if (pCurMb->iMbY < kiMbHeight - 1) {
+      SMVUnitXY sTempMv = pCurLayer->pRefPic->sMvList[pCurMb->iMbXY + kiMbWidth];
+      pSlice->sMvc[pSlice->uiMvcNum].iMvX = sTempMv.iMvX >> pSlice->sScaleShift;
+      pSlice->sMvc[pSlice->uiMvcNum].iMvY = sTempMv.iMvY >> pSlice->sScaleShift;
+      ++ pSlice->uiMvcNum;
+    }
+  }
+
+  PredMv (&pMbCache->sMvComponents, 0, 4, 0, & (sMe16x16->sMvp));
+  pFunc->pfMotionSearch (pFunc, pCurLayer, sMe16x16, pSlice);
+  // keep the result in the MB (read above as a spatial candidate) and in pDecPic->sMvList
+  pCurMb->sP16x16Mv = sMe16x16->sMv;
+  pCurLayer->pDecPic->sMvList[pCurMb->iMbXY] = sMe16x16->sMv;
+
+  return sMe16x16->uiSatdCost;
+}
+// Motion search for the two 16x8 halves of the MB (upper, then lower); returns the sum of their uiSatdCost.
+int32_t WelsMdP16x8 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice* pSlice) {
+  SMbCache* pMbCache = &pSlice->sMbCacheInfo;
+  const int32_t kiEncStride = pCurDqLayer->iEncStride[0];
+  const int32_t kiRefStride = pCurDqLayer->pRefPic->iLineSize[0];
+  int32_t iTotalCost = 0;
+
+  for (int32_t iPart = 0; iPart < 2; ++iPart) {
+    SWelsME* pMe = &pWelsMd->sMe.sMe16x8[iPart];
+    const int32_t kiOffset8 = iPart << 3; // 0 or 8: pixel-row offset, also the index given to the MV helpers
+
+    pMe->uiPixel  = BLOCK_16x8;
+    pMe->pMvdCost = pWelsMd->pMvdCost;
+    pMe->pEncMb   = pMbCache->SPicData.pEncMb[0] + (kiOffset8 * kiEncStride);
+    pMe->pRefMb   = pMbCache->SPicData.pRefMb[0] + (kiOffset8 * kiRefStride);
+    pMe->uSadPredISatd.uiSadPred = pWelsMd->iSadPredMb >> 1;
+
+    // the only search candidate is this partition's base vector
+    pSlice->sMvc[0]  = pMe->sMvBase;
+    pSlice->uiMvcNum = 1;
+
+    PredInter16x8Mv (pMbCache, kiOffset8, 0, & (pMe->sMvp));
+    pFunc->pfMotionSearch (pFunc, pCurDqLayer, pMe, pSlice);
+    UpdateP16x8Motion2Cache (pMbCache, kiOffset8, pWelsMd->uiRef, & (pMe->sMv));
+    iTotalCost += pMe->uiSatdCost;
+  }
+  return iTotalCost;
+}
+// Motion search for the two 8x16 halves of the MB (left, then right); returns the sum of their uiSatdCost.
+int32_t WelsMdP8x16 (SWelsFuncPtrList* pFunc, SDqLayer* pCurLayer, SWelsMD* pWelsMd, SSlice* pSlice) {
+  SMbCache* pMbCache = &pSlice->sMbCacheInfo;
+  int32_t iTotalCost = 0;
+
+  for (int32_t iPart = 0; iPart < 2; ++iPart) {
+    SWelsME* pMe = &pWelsMd->sMe.sMe8x16[iPart];
+    const int32_t kiColOffset = iPart << 3; // pixel column of this partition (0 or 8)
+    const int32_t kiBlkIdx    = iPart << 2; // index given to the MV prediction/cache helpers (0 or 4)
+
+    pMe->uiPixel  = BLOCK_8x16;
+    pMe->pMvdCost = pWelsMd->pMvdCost;
+    pMe->pEncMb   = pMbCache->SPicData.pEncMb[0] + kiColOffset;
+    pMe->pRefMb   = pMbCache->SPicData.pRefMb[0] + kiColOffset;
+    pMe->uSadPredISatd.uiSadPred = pWelsMd->iSadPredMb >> 1;
+
+    // the only search candidate is this partition's base vector
+    pSlice->sMvc[0]  = pMe->sMvBase;
+    pSlice->uiMvcNum = 1;
+
+    PredInter8x16Mv (pMbCache, kiBlkIdx, 0, & (pMe->sMvp));
+    pFunc->pfMotionSearch (pFunc, pCurLayer, pMe, pSlice);
+    UpdateP8x16Motion2Cache (pMbCache, kiBlkIdx, pWelsMd->uiRef, & (pMe->sMv));
+    iTotalCost += pMe->uiSatdCost;
+  }
+  return iTotalCost;
+}
+// Motion search for the four 8x8 sub-blocks of the MB in raster order; returns the sum of their uiSatdCost.
+int32_t WelsMdP8x8 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice* pSlice) {
+  SMbCache* pMbCache = &pSlice->sMbCacheInfo;
+  const int32_t kiEncLineSize = pCurDqLayer->iEncStride[0];
+  const int32_t kiRefLineSize = pCurDqLayer->pRefPic->iLineSize[0];
+  int32_t iTotalCost = 0;
+
+  for (int32_t iBlk = 0; iBlk < 4; ++iBlk) {
+    // pixel position of the sub-block inside the MB: bit 0 selects the column, bit 1 the row
+    const int32_t kiPixX = (iBlk & 1) << 3;
+    const int32_t kiPixY = (iBlk >> 1) << 3;
+    const int32_t kiEncOffset = kiPixX + (kiPixY * kiEncLineSize);
+    const int32_t kiRefOffset = kiPixX + (kiPixY * kiRefLineSize);
+    const int32_t kiBlkIdx    = iBlk << 2; // index given to the MV prediction/cache helpers
+    SWelsME* pMe = &pWelsMd->sMe.sMe8x8[iBlk];
+
+    pMe->uiPixel  = BLOCK_8x8;
+    pMe->pMvdCost = pWelsMd->pMvdCost;
+    pMe->pEncMb   = pMbCache->SPicData.pEncMb[0] + kiEncOffset;
+    pMe->pRefMb   = pMbCache->SPicData.pRefMb[0] + kiRefOffset;
+    pMe->uSadPredISatd.uiSadPred = pWelsMd->iSadPredMb >> 2;
+
+    // the only search candidate is this sub-block's base vector
+    pSlice->sMvc[0]  = pMe->sMvBase;
+    pSlice->uiMvcNum = 1;
+
+    PredMv (&pMbCache->sMvComponents, kiBlkIdx, 2, pWelsMd->uiRef, & (pMe->sMvp));
+    pFunc->pfMotionSearch (pFunc, pCurDqLayer, pMe, pSlice);
+    UpdateP8x8Motion2Cache (pMbCache, kiBlkIdx, pWelsMd->uiRef, & (pMe->sMv));
+    iTotalCost += pMe->uiSatdCost;
+  }
+  return iTotalCost;
+}
+
+// Tries the sub-16x16 inter partitions for one MB.
+//  - 8x8 is always searched; if its cost is not below iBestCost nothing else happens.
+//  - otherwise the MB becomes 8x8, then 16x8 and 8x16 are searched in that order, each
+//    one taking over when its cost is lower than or equal to the best found so far.
+// This function itself only writes pCurMb->uiMbType; it does not store the winning cost
+// anywhere (the individual searches keep their own results in pMd's sMe members).
+void WelsMdInterFinePartition (void* pEnc, void* pMd, SSlice* pSlice, SMB* pCurMb, int32_t iBestCost) {
+  sWelsEncCtx* const pCtx = static_cast<sWelsEncCtx*> (pEnc);
+  SWelsMD* const pModeDecision = static_cast<SWelsMD*> (pMd);
+  SDqLayer* const pLayer = pCtx->pCurDqLayer;
+
+  // step 1: 8x8 is always searched; bail out unless it improves on the
+  // cost the caller already has
+  int32_t iLowestCost = WelsMdP8x8 (pCtx->pFuncList, pLayer, pModeDecision, pSlice);
+  if (iLowestCost >= iBestCost) {
+    return;
+  }
+  pCurMb->uiMbType = MB_TYPE_8x8;
+
+  // step 2: 16x8 wins ties against 8x8
+  const int32_t kiCost16x8 = WelsMdP16x8 (pCtx->pFuncList, pLayer, pModeDecision, pSlice);
+  if (kiCost16x8 <= iLowestCost) {
+    iLowestCost = kiCost16x8;
+    pCurMb->uiMbType = MB_TYPE_16x8;
+  }
+
+  // step 3: 8x16 wins ties against the better of the two above
+  const int32_t kiCost8x16 = WelsMdP8x16 (pCtx->pFuncList, pLayer, pModeDecision, pSlice);
+  if (kiCost8x16 <= iLowestCost) {
+    iLowestCost = kiCost8x16;
+    pCurMb->uiMbType = MB_TYPE_8x16;
+  }
+}
+
+// VAA-guided variant of the fine inter partition decision. pfGetMbSignFromInterVaa() derives uiMbSign
+// from the MB's pSad8x8 entry, and the sign selects which partition shapes are searched:
+//   15          -> nothing is searched and iCostLuma is left untouched
+//   3, 12       -> 16x8 only
+//   5, 10       -> 8x16 only
+//   6, 9        -> 8x8 only
+//   otherwise   -> 8x8, and if that wins, 16x8 and 8x16 as well (these two also win ties)
+// Except for sign 15, iCostLuma of pMd receives the lowest cost seen, starting from iBestCost.
+void WelsMdInterFinePartitionVaa (void* pEnc, void* pMd, SSlice* pSlice, SMB* pCurMb, int32_t iBestCost) {
+  sWelsEncCtx* const pCtx = static_cast<sWelsEncCtx*> (pEnc);
+  SWelsMD* const pModeDecision = static_cast<SWelsMD*> (pMd);
+  SDqLayer* const pLayer = pCtx->pCurDqLayer;
+  const uint8_t kuiMbSign = pCtx->pFuncList->pfGetMbSignFromInterVaa (&pCtx->pVaa->sVaaCalcInfo.pSad8x8[pCurMb->iMbXY][0]);
+  if (kuiMbSign == 15) {
+    return;
+  }
+
+  int32_t iCost; // cost of the partition shape searched most recently
+  switch (kuiMbSign) {
+  case 3:
+  case 12:
+    iCost = WelsMdP16x8 (pCtx->pFuncList, pLayer, pModeDecision, pSlice);
+    if (iCost < iBestCost) {
+      iBestCost = iCost;
+      pCurMb->uiMbType = MB_TYPE_16x8;
+    }
+    break;
+
+  case 5:
+  case 10:
+    iCost = WelsMdP8x16 (pCtx->pFuncList, pLayer, pModeDecision, pSlice);
+    if (iCost < iBestCost) {
+      iBestCost = iCost;
+      pCurMb->uiMbType = MB_TYPE_8x16;
+    }
+    break;
+
+  case 6:
+  case 9:
+    iCost = WelsMdP8x8 (pCtx->pFuncList, pLayer, pModeDecision, pSlice);
+    if (iCost < iBestCost) {
+      iBestCost = iCost;
+      pCurMb->uiMbType = MB_TYPE_8x8;
+    }
+    break;
+
+  default:
+    iCost = WelsMdP8x8 (pCtx->pFuncList, pLayer, pModeDecision, pSlice);
+    if (iCost >= iBestCost) {
+      break; // 8x8 did not win: the other shapes are not tried
+    }
+    iBestCost = iCost;
+    pCurMb->uiMbType = MB_TYPE_8x8;
+
+    iCost = WelsMdP16x8 (pCtx->pFuncList, pLayer, pModeDecision, pSlice);
+    if (iCost <= iBestCost) {
+      iBestCost = iCost;
+      pCurMb->uiMbType = MB_TYPE_16x8;
+    }
+
+    iCost = WelsMdP8x16 (pCtx->pFuncList, pLayer, pModeDecision, pSlice);
+    if (iCost <= iBestCost) {
+      iBestCost = iCost;
+      pCurMb->uiMbType = MB_TYPE_8x16;
+    }
+    break;
+  }
+  pModeDecision->iCostLuma = iBestCost;
+}
+
+
+// Block-copies this MB's luma and chroma between pVaaInfo's Ref* and Cur* planes (Cur* is passed first - presumably the destination, confirm).
+inline void VaaBackgroundMbDataUpdate (SWelsFuncPtrList* pFunc, SVAAFrameInfo* pVaaInfo, SMB* pCurMb) {
+  const int32_t kiStrideY   = pVaaInfo->iPicStride;
+  const int32_t kiStrideUV  = pVaaInfo->iPicStrideUV;
+  const int32_t kiLumaPos   = (pCurMb->iMbY * kiStrideY + pCurMb->iMbX) << 4;
+  const int32_t kiChromaPos = (pCurMb->iMbY * kiStrideUV + pCurMb->iMbX) << 3;
+  pFunc->pfCopy16x16Aligned (pVaaInfo->pCurY + kiLumaPos, kiStrideY, pVaaInfo->pRefY + kiLumaPos, kiStrideY);
+  pFunc->pfCopy8x8Aligned (pVaaInfo->pCurU + kiChromaPos, kiStrideUV, pVaaInfo->pRefU + kiChromaPos, kiStrideUV);
+  pFunc->pfCopy8x8Aligned (pVaaInfo->pCurV + kiChromaPos, kiStrideUV, pVaaInfo->pRefV + kiChromaPos, kiStrideUV);
+}
+
+// Encodes an MB that is handled as background: the prediction is taken from pRefMb with a zero
+// motion vector. With bSkipMbFlag set the MB becomes MB_TYPE_BACKGROUND, is reconstructed through
+// WelsRecPskip() and the VAA planes are updated; otherwise it is coded as MB_TYPE_16x16 with a
+// zero vector through WelsInterMbEncode()/WelsPMbChromaEncode().
+void WelsMdBackgroundMbEnc (void* pEnc, void* pMd, SMB* pCurMb, SMbCache* pMbCache, SSlice* pSlice,
+                            bool_t bSkipMbFlag) {
+  sWelsEncCtx* pEncCtx	= (sWelsEncCtx*)pEnc;
+  SDqLayer* pCurDqLayer	= pEncCtx->pCurDqLayer;
+  SWelsMD* pWelsMd		= (SWelsMD*)pMd;
+  SWelsFuncPtrList* pFunc	= pEncCtx->pFuncList;
+  SMVUnitXY sMvp				= { 0 };
+  uint8_t* pRefLuma			= pMbCache->SPicData.pRefMb[0];
+  uint8_t* pRefCb				= pMbCache->SPicData.pRefMb[1];
+  uint8_t* pRefCr				= pMbCache->SPicData.pRefMb[2];
+  int32_t iLineSizeY			= pCurDqLayer->pRefPic->iLineSize[0];
+  int32_t iLineSizeUV			= pCurDqLayer->pRefPic->iLineSize[1];
+  uint8_t* pDstLuma			= pMbCache->pSkipMb;
+  uint8_t* pDstCb				= pMbCache->pSkipMb + 256;
+  uint8_t* pDstCr				= pMbCache->pSkipMb + 256 + 64;
+  // a non-skipped MB is predicted into the regular prediction buffers instead of pSkipMb
+  if (!bSkipMbFlag) {
+    pDstLuma	= pMbCache->pMemPredLuma;
+    pDstCb	= pMbCache->pMemPredChroma;
+    pDstCr	= pMbCache->pMemPredChroma + 64;
+  }
+  //MC with a zero vector: sMvp is zero-initialized and luma uses entry [0] of pfLumaQuarpelMc
+  pFunc->sMcFuncs.pfLumaQuarpelMc[0] (pRefLuma, iLineSizeY, pDstLuma, 16, 16);
+  pFunc->sMcFuncs.pfChromaMc (pRefCb, iLineSizeUV, pDstCb, 8, sMvp, 8, 8); //Cb
+  pFunc->sMcFuncs.pfChromaMc (pRefCr, iLineSizeUV, pDstCr, 8, sMvp, 8, 8); //Cr
+
+  pCurMb->uiCbp = 0;
+  pMbCache->bCollocatedPredFlag = true;
+  pWelsMd->iCostLuma = 0;//BGD&RC integration
+  pCurMb->pSadCost[0] = pFunc->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] (pMbCache->SPicData.pEncMb[0],
+                        pCurDqLayer->iEncStride[0], pRefLuma, iLineSizeY);
+  ST32 (&pCurMb->sP16x16Mv, 0);
+  ST32 (&pCurDqLayer->pDecPic->sMvList[pCurMb->iMbXY], 0);
+
+  if (bSkipMbFlag) {
+    pCurMb->uiMbType = MB_TYPE_BACKGROUND;
+    //update motion info to current MB
+    ST32 (pCurMb->pRefIndex, 0);
+    pFunc->pfUpdateMbMv (pCurMb->sMv, sMvp);
+    pCurMb->uiLumaQp   = pSlice->uiLastMbQp;
+    pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51 (pCurMb->uiLumaQp +
+                                            pCurDqLayer->sLayerInfo.pPpsP->uiChromaQpIndexOffset)];
+    WelsRecPskip (pCurDqLayer, pEncCtx->pFuncList, pCurMb, pMbCache);
+    VaaBackgroundMbDataUpdate (pEncCtx->pFuncList, pEncCtx->pVaa, pCurMb);
+    return;
+  }
+
+  pCurMb->uiMbType = MB_TYPE_16x16;
+  // the coded (non-skip) path records a zero 16x16 vector together with its predictor
+  pWelsMd->sMe.sMe16x16.sMv.iMvX = 0;
+  pWelsMd->sMe.sMe16x16.sMv.iMvY = 0;
+  PredMv (&pMbCache->sMvComponents, 0, 4, pWelsMd->uiRef, &pWelsMd->sMe.sMe16x16.sMvp);
+  pMbCache->sMbMvp[0] = pWelsMd->sMe.sMe16x16.sMvp;
+  UpdateP16x16MotionInfo (pMbCache, pCurMb, pWelsMd->uiRef, &pWelsMd->sMe.sMe16x16.sMv);
+
+  if (pWelsMd->bMdUsingSad)
+    pWelsMd->iCostLuma = pCurMb->pSadCost[0];
+  else
+    pWelsMd->iCostLuma = pFunc->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] (pMbCache->SPicData.pEncMb[0],
+                         pCurDqLayer->iEncStride[0], pRefLuma, iLineSizeY);
+
+  WelsInterMbEncode (pEncCtx, pSlice, pCurMb);
+  WelsPMbChromaEncode (pEncCtx, pSlice, pCurMb);
+  // finally copy the prediction buffers into the pCsMb planes
+  pFunc->pfCopy16x16Aligned (pMbCache->SPicData.pCsMb[0], pCurDqLayer->iCsStride[0], pMbCache->pMemPredLuma,     16);
+  pFunc->pfCopy8x8Aligned (pMbCache->SPicData.pCsMb[1], pCurDqLayer->iCsStride[1], pMbCache->pMemPredChroma,    8);
+  pFunc->pfCopy8x8Aligned (pMbCache->SPicData.pCsMb[2], pCurDqLayer->iCsStride[1], pMbCache->pMemPredChroma + 64, 8);
+}
+
+// Tests whether the MB can be coded as P-skip with the vector from PredSkipMv(). Returns FALSE when
+// that vector points too far outside the picture or when the skip checks fail. Returns TRUE, after
+// storing the vector, pSadCost[0], iCostLuma and iCostSkipMb, when either the SAD test succeeds or
+// WelsTryPYskip() and both WelsTryPUVskip() calls succeed after the DCT of each plane. The
+// prediction is built in pMbCache->pSkipMb (luma at +0, Cb at +256, Cr at +320).
+BOOL_T WelsMdPSkipEnc (void* pEnc, void* pMd, SMB* pCurMb, SMbCache* pMbCache) {
+  sWelsEncCtx* pEncCtx	= (sWelsEncCtx*)pEnc;
+  SDqLayer* pCurLayer				= pEncCtx->pCurDqLayer;
+  SWelsMD* pWelsMd					= (SWelsMD*)pMd;
+  SWelsFuncPtrList* pFunc		= pEncCtx->pFuncList;
+
+  uint8_t* pRefLuma = pMbCache->SPicData.pRefMb[0];
+  uint8_t* pRefCb   = pMbCache->SPicData.pRefMb[1];
+  uint8_t* pRefCr   = pMbCache->SPicData.pRefMb[2];
+  int32_t iLineSizeY  = pCurLayer->pRefPic->iLineSize[0];
+  int32_t iLineSizeUV = pCurLayer->pRefPic->iLineSize[1];
+
+  uint8_t* pDstLuma = pMbCache->pSkipMb;
+  uint8_t* pDstCb   = pMbCache->pSkipMb + 256;
+  uint8_t* pDstCr   = pMbCache->pSkipMb + 256 + 64;
+
+  SMVUnitXY sMvp = { 0 };
+  uint8_t uiMvpIdx;
+  int32_t n;
+
+  int32_t iEncStride		= pCurLayer->iEncStride[0];
+  uint8_t* pEncMb			= pMbCache->SPicData.pEncMb[0];
+  int32_t* pStrideEncBlockOffset = pEncCtx->pStrideTab->pStrideEncBlockOffset[pEncCtx->uiDependencyId];
+  int32_t* pEncBlockOffset;
+
+  int32_t iSadCostLuma = 0;
+  int32_t iSadCostChroma = 0;
+  int32_t iSadCostMb = 0;
+
+  PredSkipMv (pMbCache, &sMvp);
+
+  // Reject P-skip when the integer-pel part of the predicted vector points too far outside the picture
+  SMVUnitXY sQpelMvp = { sMvp.iMvX >> 2, sMvp.iMvY >> 2 };
+  n = (pCurMb->iMbX << 4) + sQpelMvp.iMvX;
+  if (n < -29)
+    return FALSE;
+  else if (n > (int32_t) ((pCurLayer->iMbWidth << 4) + 12))
+    return FALSE;
+
+  n = (pCurMb->iMbY << 4) + sQpelMvp.iMvY;
+  if (n < -29)
+    return FALSE;
+  else if (n > (int32_t) ((pCurLayer->iMbHeight << 4) + 12))
+    return FALSE;
+
+  //luma: interpolate with the function selected by the two fractional bits of each component, then SAD against the source
+  pRefLuma += sQpelMvp.iMvY * iLineSizeY + sQpelMvp.iMvX;
+  uiMvpIdx = ((sMvp.iMvY & 0x03) << 2) + (sMvp.iMvX & 0x03);
+  pFunc->sMcFuncs.pfLumaQuarpelMc[uiMvpIdx] (pRefLuma, iLineSizeY, pDstLuma, 16, 16);
+  iSadCostLuma    = pFunc->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] (pMbCache->SPicData.pEncMb[0],
+                    pCurLayer->iEncStride[0], pDstLuma, 16);
+  //chroma (Cb, then Cr): reference pointers move by half the integer luma displacement
+  const int32_t iStrideUV = (sQpelMvp.iMvY >> 1) * iLineSizeUV + (sQpelMvp.iMvX >> 1);
+  pRefCb += iStrideUV;
+  pFunc->sMcFuncs.pfChromaMc (pRefCb, iLineSizeUV, pDstCb, 8, sMvp, 8, 8); //Cb
+  iSadCostChroma  = pFunc->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] (pMbCache->SPicData.pEncMb[1],
+                    pCurLayer->iEncStride[1], pDstCb, 8);
+
+  pRefCr += iStrideUV;
+  pFunc->sMcFuncs.pfChromaMc (pRefCr, iLineSizeUV, pDstCr, 8, sMvp, 8, 8); //Cr
+  iSadCostChroma += pFunc->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] (pMbCache->SPicData.pEncMb[2],
+                    pCurLayer->iEncStride[2], pDstCr, 8);
+
+  iSadCostMb = iSadCostLuma + iSadCostChroma;
+  // SAD-based skip: zero SAD, SAD below iSadPredSkip, or (P reference, uiRefMbType is SKIP) SAD below the stored pMbSkipSad
+  if (iSadCostMb == 0                             ||
+      iSadCostMb < pWelsMd->iSadPredSkip   ||
+      (pCurLayer->pRefPic->iPictureType == P_SLICE     &&
+       pMbCache->uiRefMbType == MB_TYPE_SKIP    &&
+       iSadCostMb < pCurLayer->pRefPic->pMbSkipSad[pCurMb->iMbXY])) {
+    //update motion info to current MB
+    ST32 (pCurMb->pRefIndex, 0);
+    pFunc->pfUpdateMbMv (pCurMb->sMv, sMvp);
+    // NOTE(review): the SAD/SATD stored below use pRefLuma (integer-pel position), not the interpolated pDstLuma - confirm intended
+    pCurMb->pSadCost[0] = pFunc->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] (pMbCache->SPicData.pEncMb[0],
+                          pCurLayer->iEncStride[0], pRefLuma, iLineSizeY);
+    if (pWelsMd->bMdUsingSad)
+      pWelsMd->iCostLuma = pCurMb->pSadCost[0];
+    else
+      pWelsMd->iCostLuma = pFunc->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] (pMbCache->SPicData.pEncMb[0],
+                           pCurLayer->iEncStride[0], pRefLuma, iLineSizeY);
+
+    pWelsMd->iCostSkipMb = iSadCostMb;
+    pCurMb->sP16x16Mv = sMvp;
+    pCurLayer->pDecPic->sMvList[pCurMb->iMbXY] = sMvp;
+
+    return TRUE;
+  }
+  // otherwise run the DCT plane by plane and let WelsTryPYskip()/WelsTryPUVskip() decide; all three must succeed
+  WelsDctMb (pMbCache->pCoeffLevel,  pEncMb, iEncStride, pDstLuma, pEncCtx->pFuncList->pfDctFourT4);
+  if (WelsTryPYskip (pEncCtx, pCurMb, pMbCache)) {
+    iEncStride = pEncCtx->pCurDqLayer->iEncStride[1];
+    pEncMb = pMbCache->SPicData.pEncMb[1];
+    pEncBlockOffset = pStrideEncBlockOffset + 16;
+    pFunc->pfDctFourT4 (pMbCache->pCoeffLevel + 256, & (pEncMb[*pEncBlockOffset]), iEncStride,	pMbCache->pSkipMb + 256, 8);
+    if (WelsTryPUVskip (pEncCtx, pCurMb, pMbCache, 1)) {
+      pEncMb = pMbCache->SPicData.pEncMb[2];
+      pEncBlockOffset = pStrideEncBlockOffset + 20;
+      pFunc->pfDctFourT4 (pMbCache->pCoeffLevel + 320, & (pEncMb[*pEncBlockOffset]), iEncStride,	pMbCache->pSkipMb + 320, 8);
+      if (WelsTryPUVskip (pEncCtx, pCurMb, pMbCache, 2)) {
+        //update motion info to current MB
+        ST32 (pCurMb->pRefIndex, 0);
+        pFunc->pfUpdateMbMv (pCurMb->sMv, sMvp);
+        pCurMb->pSadCost[0] = pFunc->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] (pMbCache->SPicData.pEncMb[0],
+                              pCurLayer->iEncStride[0], pRefLuma, iLineSizeY);
+
+        if (pWelsMd->bMdUsingSad)
+          pWelsMd->iCostLuma = pCurMb->pSadCost[0];
+        else
+          pWelsMd->iCostLuma = pFunc->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] (pMbCache->SPicData.pEncMb[0],
+                               pCurLayer->iEncStride[0], pRefLuma, iLineSizeY);
+
+        pWelsMd->iCostSkipMb = iSadCostMb;
+        pCurMb->sP16x16Mv = sMvp;
+        pCurLayer->pDecPic->sMvList[pCurMb->iMbXY] = sMvp;
+
+        return TRUE;
+      }
+    }
+  }
+  return FALSE;
+}
+
+// Offsets, one per 8x8 block (index 0..3), passed to InitMeRefinePointer() by the MB_TYPE_8x8 refinement path.
+const int32_t g_kiPixStrideIdx8x8[4] = {  0, ME_REFINE_BUF_WIDTH_BLK8,
+                                          ME_REFINE_BUF_STRIDE_BLK8, ME_REFINE_BUF_STRIDE_BLK8 + ME_REFINE_BUF_WIDTH_BLK8 };
+
+void WelsMdInterMbRefinement (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache* pMbCache) {
+  SDqLayer* pCurDqLayer = pEncCtx->pCurDqLayer;
+  uint8_t* pTmpRefCb, *pTmpRefCr, *pTmpDstCb, *pTmpDstCr;
+  int32_t iMvStride, iRefBlk4Stride, iDstBlk4Stride;
+  SMVUnitXY* pMv;
+  int32_t iBestSadCost = 0, iBestSatdCost = 0;
+  SMeRefinePointer sMeRefine;
+
+  int32_t i, iIdx, iPixStride;
+
+  uint8_t* pRefCb = pMbCache->SPicData.pRefMb[1];
+  uint8_t* pRefCr = pMbCache->SPicData.pRefMb[2];
+  uint8_t* pDstCb = pMbCache->pMemPredChroma;
+  uint8_t* pDstCr = pMbCache->pMemPredChroma + 64;
+  uint8_t* pDstLuma = pMbCache->pMemPredLuma;
+
+  int32_t iLineSizeRefUV = pCurDqLayer->pRefPic->iLineSize[1];
+
+  switch (pCurMb->uiMbType) {
+  case MB_TYPE_16x16:
+    //luma
+    InitMeRefinePointer (&sMeRefine, pMbCache, 0);
+    MeRefineFracPixel (pEncCtx, pDstLuma, &pWelsMd->sMe.sMe16x16, &sMeRefine, 16, 16);
+    UpdateP16x16MotionInfo (pMbCache, pCurMb, pWelsMd->uiRef, &pWelsMd->sMe.sMe16x16.sMv);
+
+    pMbCache->sMbMvp[0] = pWelsMd->sMe.sMe16x16.sMvp;
+    //save the best cost of final mode
+    iBestSadCost  = pWelsMd->sMe.sMe16x16.uiSadCost;
+    iBestSatdCost = pWelsMd->sMe.sMe16x16.uiSatdCost;
+
+    //chroma
+    pMv = &pWelsMd->sMe.sMe16x16.sMv;
+    iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
+    pTmpRefCb = pRefCb + iMvStride;
+    pTmpRefCr = pRefCr + iMvStride;
+    pEncCtx->pFuncList->sMcFuncs.pfChromaMc (pTmpRefCb, iLineSizeRefUV, pDstCb, 8, *pMv, 8, 8); //Cb
+    pEncCtx->pFuncList->sMcFuncs.pfChromaMc (pTmpRefCr, iLineSizeRefUV, pDstCr, 8, *pMv, 8, 8); //Cr
+
+    pWelsMd->iCostSkipMb = pEncCtx->pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] (pMbCache->SPicData.pEncMb[0],
+                           pCurDqLayer->iEncStride[0], pDstLuma, 16);
+    pWelsMd->iCostSkipMb += pEncCtx->pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] (pMbCache->SPicData.pEncMb[1],
+                            pCurDqLayer->iEncStride[1], pDstCb, 8);
+    pWelsMd->iCostSkipMb += pEncCtx->pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] (pMbCache->SPicData.pEncMb[2],
+                            pCurDqLayer->iEncStride[2], pDstCr, 8);
+    break;
+
+  case MB_TYPE_16x8:
+    iPixStride = 0;
+    for (i = 0; i < 2; i++) {
+      //luma
+      iIdx = i << 3;
+      InitMeRefinePointer (&sMeRefine, pMbCache, iPixStride);
+      iPixStride += ME_REFINE_BUF_STRIDE_BLK8;
+      PredInter16x8Mv (pMbCache, iIdx, pWelsMd->uiRef, &pWelsMd->sMe.sMe16x8[i].sMvp);
+      MeRefineFracPixel (pEncCtx, pDstLuma + g_kuiSmb4AddrIn256[iIdx], &pWelsMd->sMe.sMe16x8[i], &sMeRefine, 16, 8);
+      UpdateP16x8MotionInfo (pMbCache, pCurMb, iIdx, pWelsMd->uiRef, &pWelsMd->sMe.sMe16x8[i].sMv);
+      pMbCache->sMbMvp[i] = pWelsMd->sMe.sMe16x8[i].sMvp;
+      //save the best cost of final mode
+      iBestSadCost += pWelsMd->sMe.sMe16x8[i].uiSadCost;
+      iBestSatdCost += pWelsMd->sMe.sMe16x8[i].uiSatdCost;
+
+      //chroma
+      iRefBlk4Stride = (i << 2) * iLineSizeRefUV;
+      iDstBlk4Stride = i << 5; // 4*8
+      pMv = &pWelsMd->sMe.sMe16x8[i].sMv;
+      iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
+      pTmpRefCb = pRefCb + iRefBlk4Stride + iMvStride;
+      pTmpRefCr = pRefCr + iRefBlk4Stride + iMvStride;
+      pTmpDstCb = pDstCb + iDstBlk4Stride;
+      pTmpDstCr = pDstCr + iDstBlk4Stride;
+      pEncCtx->pFuncList->sMcFuncs.pfChromaMc (pTmpRefCb, iLineSizeRefUV, pTmpDstCb, 8, *pMv, 8, 4); //Cb
+      pEncCtx->pFuncList->sMcFuncs.pfChromaMc (pTmpRefCr, iLineSizeRefUV, pTmpDstCr, 8, *pMv, 8, 4); //Cr
+    }
+    break;
+
+  case MB_TYPE_8x16:
+    iPixStride = 0;
+    for (i = 0; i < 2; i++) {
+      //luma
+      iIdx = i << 2;
+      InitMeRefinePointer (&sMeRefine, pMbCache, iPixStride);
+      iPixStride += ME_REFINE_BUF_WIDTH_BLK8;
+      PredInter8x16Mv (pMbCache, iIdx, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x16[i].sMvp);
+      MeRefineFracPixel (pEncCtx, pDstLuma + g_kuiSmb4AddrIn256[iIdx], &pWelsMd->sMe.sMe8x16[i], &sMeRefine, 8, 16);
+      update_P8x16_motion_info (pMbCache, pCurMb, iIdx, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x16[i].sMv);
+      pMbCache->sMbMvp[i] = pWelsMd->sMe.sMe8x16[i].sMvp;
+      //save the best cost of final mode
+      iBestSadCost += pWelsMd->sMe.sMe8x16[i].uiSadCost;
+      iBestSatdCost += pWelsMd->sMe.sMe8x16[i].uiSatdCost;
+
+      //chroma
+      iRefBlk4Stride = iIdx; //4
+      pMv = &pWelsMd->sMe.sMe8x16[i].sMv;
+      iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
+      pTmpRefCb = pRefCb + iRefBlk4Stride + iMvStride;
+      pTmpRefCr = pRefCr + iRefBlk4Stride + iMvStride;
+      pTmpDstCb = pDstCb + iRefBlk4Stride;
+      pTmpDstCr = pDstCr + iRefBlk4Stride;
+      pEncCtx->pFuncList->sMcFuncs.pfChromaMc (pTmpRefCb, iLineSizeRefUV, pTmpDstCb, 8, *pMv, 4, 8); //Cb
+      pEncCtx->pFuncList->sMcFuncs.pfChromaMc (pTmpRefCr, iLineSizeRefUV, pTmpDstCr, 8, *pMv, 4, 8); //Cr
+    }
+    break;
+
+  case MB_TYPE_8x8:
+    for (i = 0; i < 4; i++) {
+      int32_t iBlk8Idx = i << 2; //0, 4, 8, 12
+      int32_t	iBlk4X, iBlk4Y;
+
+      pCurMb->pRefIndex[i] = pWelsMd->uiRef;
+
+      //luma
+      InitMeRefinePointer (&sMeRefine, pMbCache, g_kiPixStrideIdx8x8[i]);
+      PredMv (&pMbCache->sMvComponents, iBlk8Idx, 2, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x8[i].sMvp);
+      MeRefineFracPixel (pEncCtx, pDstLuma + g_kuiSmb4AddrIn256[iBlk8Idx], &pWelsMd->sMe.sMe8x8[i], &sMeRefine, 8, 8);
+      UpdateP8x8MotionInfo (pMbCache, pCurMb, iBlk8Idx, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x8[i].sMv);
+      pMbCache->sMbMvp[i] = pWelsMd->sMe.sMe8x8[i].sMvp;
+      iBestSadCost += pWelsMd->sMe.sMe8x8[i].uiSadCost;
+      iBestSatdCost += pWelsMd->sMe.sMe8x8[i].uiSatdCost;
+
+      //chroma
+      pMv = &pWelsMd->sMe.sMe8x8[i].sMv;
+      iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
+
+      iBlk4X = (i & 1) << 2;
+      iBlk4Y = (i >> 1) << 2;
+      iRefBlk4Stride = iBlk4Y * iLineSizeRefUV + iBlk4X;
+      iDstBlk4Stride = (iBlk4Y << 3) + iBlk4X;
+
+      pTmpRefCb = pRefCb + iRefBlk4Stride;
+      pTmpDstCb = pDstCb + iDstBlk4Stride;
+      pTmpRefCr = pRefCr + iRefBlk4Stride;
+      pTmpDstCr = pDstCr + iDstBlk4Stride;
+      pEncCtx->pFuncList->sMcFuncs.pfChromaMc (pTmpRefCb + iMvStride, iLineSizeRefUV, pTmpDstCb, 8, *pMv, 4, 4); //Cb
+      pEncCtx->pFuncList->sMcFuncs.pfChromaMc (pTmpRefCr + iMvStride, iLineSizeRefUV, pTmpDstCr, 8, *pMv, 4, 4); //Cr
+
+    }
+    break;
+  default:
+    break;
+  }
+  pCurMb->pSadCost[0] = iBestSadCost;
+  if (pWelsMd->bMdUsingSad)
+    pWelsMd->iCostLuma = iBestSadCost;
+  else
+    pWelsMd->iCostLuma = iBestSatdCost;
+
+}
+BOOL_T WelsMdFirstIntraMode (void* pEnc, void* pMd, SMB* pCurMb, SMbCache* pMbCache) {
+  sWelsEncCtx* pEncCtx	= (sWelsEncCtx*)pEnc;
+  SWelsFuncPtrList* pFunc	= pEncCtx->pFuncList;
+  SWelsMD* pWelsMd		= (SWelsMD*)pMd;
+  // Try I16x16 against the luma cost already stored in pWelsMd->iCostLuma; returns TRUE only when intra wins.
+  int32_t iCostI16x16 = WelsMdI16x16 (pFunc, pEncCtx->pCurDqLayer, pMbCache, pWelsMd->iLambda);
+
+  //compare the cost the caller left in pWelsMd->iCostLuma (cost_p16x16) with cost_i16x16
+  if (iCostI16x16 < pWelsMd->iCostLuma) {
+    pCurMb->uiMbType = MB_TYPE_INTRA16x16;
+    pWelsMd->iCostLuma = iCostI16x16;
+
+    pFunc->pfIntraFineMd (pEncCtx, pWelsMd, pCurMb, pMbCache); // intra fine MD hook; uiMbType is re-checked below
+
+    //add pEnc&rec to MD--2010.3.15
+    if (IS_INTRA16x16 (pCurMb->uiMbType)) {
+      pCurMb->uiCbp = 0;
+      WelsEncRecI16x16Y (pEncCtx, pCurMb, pMbCache);
+    }
+
+    //chroma
+    pWelsMd->iCostChroma = WelsMdIntraChroma (pFunc, pEncCtx->pCurDqLayer, pMbCache, pWelsMd->iLambda);
+    WelsIMbChromaEncode (pEncCtx, pCurMb, pMbCache);  //add pEnc&rec to MD--2010.3.15
+
+    pCurMb->pSadCost[0] = 0; // the stored SAD cost is reset to 0 when intra is chosen
+    return TRUE; //intra_mb_type is best
+  }
+
+  return FALSE;
+}
+
+void WelsMdInterMb (void* pEnc, void* pMd, SSlice* pSlice, SMB* pCurMb) {
+  sWelsEncCtx* pEncCtx	= (sWelsEncCtx*)pEnc;
+  SWelsMD* pWelsMd				= (SWelsMD*)pMd;
+  SDqLayer* pCurDqLayer			= pEncCtx->pCurDqLayer;
+  SMbCache* pMbCache			= &pSlice->sMbCacheInfo;
+  const uint32_t kuiNeighborAvail	= pCurMb->uiNeighborAvail;
+  const int32_t kiMbWidth			= pCurDqLayer->iMbWidth;
+  const  SMB* top_mb				= pCurMb - kiMbWidth;
+  const bool_t bMbLeftAvailPskip	= ((kuiNeighborAvail & LEFT_MB_POS) ? IS_SKIP ((pCurMb - 1)->uiMbType) : false);
+  const bool_t bMbTopAvailPskip		= ((kuiNeighborAvail & TOP_MB_POS) ? IS_SKIP (top_mb->uiMbType) : false);
+  const bool_t bMbTopLeftAvailPskip	= ((kuiNeighborAvail & TOPLEFT_MB_POS) ? IS_SKIP ((top_mb - 1)->uiMbType) : false);
+  const bool_t bMbTopRightAvailPskip = ((kuiNeighborAvail & TOPRIGHT_MB_POS) ? IS_SKIP ((top_mb + 1)->uiMbType) : false);
+  BOOL_T bTrySkip = bMbLeftAvailPskip || bMbTopAvailPskip || bMbTopLeftAvailPskip || bMbTopRightAvailPskip;
+  BOOL_T bKeepSkip = bMbLeftAvailPskip && bMbTopAvailPskip && bMbTopRightAvailPskip;
+  BOOL_T bSkip = FALSE;
+  // Inter MB mode decision: background hook, then Pskip test, then P16x16, then WelsMdInterSecondaryModesEnc.
+  if (pEncCtx->pFuncList->pfInterMdBackgroundDecision (pEncCtx, pWelsMd, pSlice, pCurMb, pMbCache, &bKeepSkip)) {
+    return; // hook returned true: nothing more is done for this MB here (the hook may also have changed bKeepSkip)
+  }
+
+  //step 1: try SKIP
+  bSkip = WelsMdInterJudgePskip (pEncCtx, pWelsMd, pSlice, pCurMb, pMbCache, bTrySkip);
+
+  if (bSkip) {
+    if (bKeepSkip) { // left, top and top-right neighbours are all Pskip (and the hook kept the flag): commit Pskip now
+      WelsMdInterDecidedPskip (pEncCtx,  pSlice,  pCurMb, pMbCache);
+      return;
+    }
+  } else {
+    PredictSad (pMbCache->sMvComponents.iRefIndexCache, pMbCache->iSadCost, 0, &pWelsMd->iSadPredMb);
+
+    //step 2: P_16x16
+    pWelsMd->iCostLuma = WelsMdP16x16 (pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice, pCurMb);
+    pCurMb->uiMbType = MB_TYPE_16x16;
+  }
+  // with bSkip set but bKeepSkip clear, Pskip is still weighed against intra inside the secondary stage
+  WelsMdInterSecondaryModesEnc (pEncCtx, pWelsMd, pSlice, pCurMb, pMbCache, bSkip);
+}
+
+
+
+// Try the ordinary Pskip. The test (PredictSadSkip + WelsMdPSkipEnc) only runs when bTrySkip is set,
+// or when the reference picture is a P slice and pMbCache->uiRefMbType is SKIP or BACKGROUND.
+// Returns the WelsMdPSkipEnc verdict as bool_t, or false when the test is not attempted.
+bool_t WelsMdInterJudgePskip (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice* pSlice, SMB* pCurMb, SMbCache* pMbCache,
+                              BOOL_T bTrySkip) {
+  bool_t bRet = true; // initial value is never observed: it is overwritten below or bypassed by "return false"
+  if (((pEncCtx->pRefPic->iPictureType == P_SLICE) && (pMbCache->uiRefMbType == MB_TYPE_SKIP
+       || pMbCache->uiRefMbType == MB_TYPE_BACKGROUND)) ||
+      bTrySkip) {
+    PredictSadSkip (pMbCache->sMvComponents.iRefIndexCache, pMbCache->bMbTypeSkip, pMbCache->iSadCostSkip, 0,
+                    & (pWelsMd->iSadPredSkip));
+    bRet = WelsMdPSkipEnc (pEncCtx, pWelsMd, pCurMb, pMbCache) ? true : false;
+    return bRet;
+  }
+
+  return false;
+}
+
+// Update the per-MB state once Pskip has been chosen: clears uiCbp, takes the luma QP from
+// pSlice->uiLastMbQp, derives the chroma QP from g_kuiChromaQpTable using the PPS chroma QP offset,
+// and sets bCollocatedPredFlag when LD32 of the first MV is zero.
+void WelsMdInterUpdatePskip (SDqLayer* pCurDqLayer, SSlice* pSlice, SMB* pCurMb, SMbCache* pMbCache) {
+  //add pEnc&rec to MD--2010.3.15
+  pCurMb->uiCbp = 0;
+  pCurMb->uiLumaQp   = pSlice->uiLastMbQp;
+  pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51 (pCurMb->uiLumaQp +
+                                          pCurDqLayer->sLayerInfo.pPpsP->uiChromaQpIndexOffset)];
+  pMbCache->bCollocatedPredFlag = (LD32 (&pCurMb->sMv[0]) == 0);
+}
+
+
+// Double-check whether the current MB is really a Pskip: a 16x16 MB with uiCbp == 0 and
+// pRefIndex[0] == 0 whose MV equals the PredSkipMv prediction is relabelled MB_TYPE_SKIP.
+// bCollocatedPredFlag is refreshed for every MB that enters as 16x16 with zero CBP.
+void WelsMdInterDoubleCheckPskip (SMB* pCurMb, SMbCache* pMbCache) {
+  if (MB_TYPE_16x16 == pCurMb->uiMbType && 0 == pCurMb->uiCbp) {
+    if (0 == pCurMb->pRefIndex[0]) {
+      SMVUnitXY sMvp = { 0 };
+
+      PredSkipMv (pMbCache, &sMvp);
+      if (LD32 (&sMvp) == LD32 (&pCurMb->sMv[0])) { // presumably compares both MV components in one 32-bit load
+        pCurMb->uiMbType = MB_TYPE_SKIP;
+      }
+    }
+    pMbCache->bCollocatedPredFlag = (LD32 (&pCurMb->sMv[0]) == 0);
+  }
+}
+
+// Pskip mb encode.
+// Marks the MB as MB_TYPE_SKIP, calls WelsRecPskip (presumably the reconstruction; defined elsewhere),
+// then WelsMdInterUpdatePskip to refresh CBP, QP and bCollocatedPredFlag.
+void WelsMdInterDecidedPskip (sWelsEncCtx* pEncCtx, SSlice* pSlice, SMB* pCurMb, SMbCache* pMbCache) {
+  SDqLayer* pCurDqLayer = pEncCtx->pCurDqLayer;
+  pCurMb->uiMbType = MB_TYPE_SKIP;
+  WelsRecPskip (pCurDqLayer, pEncCtx->pFuncList, pCurMb, pMbCache);
+  WelsMdInterUpdatePskip (pCurDqLayer, pSlice, pCurMb, pMbCache);
+}
+
+// Inter mb encode.
+// Clears uiCbp, runs WelsInterMbEncode and WelsPMbChromaEncode, then issues one 16x16 and two 8x8 copies
+// between the pCsMb planes and pMemPredLuma / pMemPredChroma (chroma halves at offsets 0 and 64).
+void WelsMdInterEncode (sWelsEncCtx* pEncCtx, SSlice* pSlice, SMB* pCurMb, SMbCache* pMbCache) {
+  SWelsFuncPtrList* pFunc = pEncCtx->pFuncList;
+  SDqLayer* pCurDqLayer	= pEncCtx->pCurDqLayer;
+
+  //add pEnc&rec to MD--2010.3.15
+  const int32_t kiCsStrideY = pCurDqLayer->iCsStride[0];
+  const int32_t kiCsStrideUV = pCurDqLayer->iCsStride[1];
+
+  //add pEnc&rec to MD--2010.3.15
+  pCurMb->uiCbp = 0;
+  WelsInterMbEncode (pEncCtx, pSlice, pCurMb);
+  WelsPMbChromaEncode (pEncCtx, pSlice, pCurMb);
+  // NOTE(review): the first argument of each copy is presumably the destination - confirm against pfCopy16x16Aligned
+  pFunc->pfCopy16x16Aligned (pMbCache->SPicData.pCsMb[0], kiCsStrideY, pMbCache->pMemPredLuma,      16);
+  pFunc->pfCopy8x8Aligned (pMbCache->SPicData.pCsMb[1], kiCsStrideUV, pMbCache->pMemPredChroma,    8);
+  pFunc->pfCopy8x8Aligned (pMbCache->SPicData.pCsMb[2], kiCsStrideUV, pMbCache->pMemPredChroma + 64, 8);
+}
+
+
+
+
+// Try the BGD (background) Pskip. First narrows *bKeepSkip: it stays set only if the VAA background
+// flags of the left, top and top-right MBs are all clear. If the current MB is flagged background, the
+// reference MB type is not intra and the QP test passes, it calls WelsMdBackgroundMbEnc and returns true.
+bool_t WelsMdInterJudgeBGDPskip (void* pCtx, void* pMd, SSlice* pSlice, SMB* pCurMb, SMbCache* pMbCache,
+                                 BOOL_T* bKeepSkip) {
+  sWelsEncCtx* pEncCtx = (sWelsEncCtx*)pCtx;
+  SWelsMD* pWelsMd = (SWelsMD*)pMd;
+
+  SDqLayer* pCurDqLayer = pEncCtx->pCurDqLayer;
+
+  const int32_t kiRefMbQp = pCurDqLayer->pRefPic->pRefMbQp[pCurMb->iMbXY];
+  const int32_t kiCurMbQp = pCurMb->uiLumaQp;// unsigned -> signed
+  int8_t*	pVaaBgMbFlag = pEncCtx->pVaa->pVaaBackgroundMbFlag + pCurMb->iMbXY;
+
+  const int32_t kiMbWidth = pCurDqLayer->iMbWidth;
+  // neighbour flags (negative offsets) are only read while *bKeepSkip is still set: && short-circuits otherwise
+  *bKeepSkip = (*bKeepSkip) &&
+               ((!pVaaBgMbFlag[-1]) &&
+                (!pVaaBgMbFlag[-kiMbWidth]) &&
+                (!pVaaBgMbFlag[-kiMbWidth + 1]));
+
+  if (
+    *pVaaBgMbFlag
+    && !IS_INTRA (pMbCache->uiRefMbType)
+    && (kiRefMbQp - kiCurMbQp <= DELTA_QP_BGD_THD || kiRefMbQp <= 26)
+  ) {
+    SMVUnitXY	sVaaPredSkipMv = { 0 }; // filled by PredSkipMv; the last argument below tells whether it is zero
+    PredSkipMv (pMbCache, &sVaaPredSkipMv);
+    WelsMdBackgroundMbEnc (pEncCtx, pWelsMd, pCurMb, pMbCache, pSlice, (LD32 (&sVaaPredSkipMv) == 0));
+    return true;
+  }
+
+  return false;
+}
+
+bool_t WelsMdInterJudgeBGDPskipFalse (void* pCtx, void* pMd, SSlice* pSlice, SMB* pCurMb, SMbCache* pMbCache,
+                                      BOOL_T* bKeepSkip) {
+  return false; // stub with the WelsMdInterJudgeBGDPskip signature: never handles the MB, leaves *bKeepSkip untouched
+}
+
+
+
+// Update BGD related info: stores a QP for this MB in pDecPic->pRefMbQp (the MB's own luma QP when
+// uiCbp is non-zero, the reference is an I slice or bCollocatedPredFlag is 0; otherwise the value kept
+// in pRefPic->pRefMbQp), then converts MB_TYPE_BACKGROUND into MB_TYPE_SKIP.
+void WelsMdInterUpdateBGDInfo (SDqLayer* pCurLayer,  SMB* pCurMb, const bool_t bCollocatedPredFlag,
+                               const int32_t iRefPictureType) {
+  uint8_t* pTargetRefMbQpList = (pCurLayer->pDecPic->pRefMbQp);
+  const int32_t kiMbXY = pCurMb->iMbXY;
+
+  if (pCurMb->uiCbp || I_SLICE == iRefPictureType || 0 == bCollocatedPredFlag) {
+    pTargetRefMbQpList[kiMbXY] = pCurMb->uiLumaQp;
+  } else { //unchanged MB: carry over the QP recorded for it in the reference picture
+    uint8_t* pRefPicRefMbQpList = (pCurLayer->pRefPic->pRefMbQp);
+    pTargetRefMbQpList[kiMbXY] = pRefPicRefMbQpList[kiMbXY];
+  }
+
+  if (pCurMb->uiMbType == MB_TYPE_BACKGROUND) {
+    pCurMb->uiMbType = MB_TYPE_SKIP;
+  }
+}
+
+void WelsMdInterUpdateBGDInfoNULL (SDqLayer* pCurLayer, SMB* pCurMb, const bool_t bCollocatedPredFlag,
+                                   const int32_t iRefPictureType) {
+} // intentionally empty: no-op with the same signature as WelsMdInterUpdateBGDInfo
+
+// Save the SAD and the MB type of the current MB: pMbCache->pEncSad[0] receives pMd->iCostSkipMb for
+// a SKIP MB and 0 for any other type, and the MB type is written to pRefMbtypeList[pCurMb->iMbXY].
+//
+void WelsMdInterSaveSadAndRefMbType (Mb_Type* pRefMbtypeList, SMbCache* pMbCache, const SMB*  pCurMb,
+                                     const SWelsMD* pMd) {
+  const Mb_Type kmtCurMbtype = pCurMb->uiMbType;
+
+  //sad
+  pMbCache->pEncSad[0] = (kmtCurMbtype == MB_TYPE_SKIP) ? pMd->iCostSkipMb : 0;
+  //uiMbType
+  pRefMbtypeList[pCurMb->iMbXY] = kmtCurMbtype;
+}
+
+void WelsMdInterSecondaryModesEnc (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice* pSlice, SMB* pCurMb,
+                                   SMbCache* pMbCache, const BOOL_T bSkip) {
+  //Intra check comes first; if it does not win, bSkip selects Pskip, otherwise the full inter path runs
+  const BOOL_T kbTrySkip = pEncCtx->pFuncList->pfFirstIntraMode (pEncCtx, pWelsMd, pCurMb, pMbCache);
+  if (kbTrySkip) //despite its name this holds the pfFirstIntraMode result; non-zero ends the decision here
+    return;
+
+  if (bSkip) {
+    WelsMdInterDecidedPskip (pEncCtx,  pSlice,  pCurMb, pMbCache);
+  } else {
+    //Step 2: ILFMD in P
+    pEncCtx->pFuncList->pfInterFineMd (pEncCtx, pWelsMd, pSlice, pCurMb, pWelsMd->iCostLuma);
+
+    //refinement for inter type
+    WelsMdInterMbRefinement (pEncCtx, pWelsMd, pCurMb, pMbCache);
+
+    //step 7: invoke encoding
+    WelsMdInterEncode (pEncCtx, pSlice, pCurMb, pMbCache);
+
+    //step 8: double check Pskip
+    WelsMdInterDoubleCheckPskip (pCurMb, pMbCache);
+  }
+}
+
+
+void WelsMdIntraSecondaryModesEnc (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache* pMbCache) {
+  SWelsFuncPtrList* pFunc = pEncCtx->pFuncList;
+  //intra MB path: fine intra MD (initial prediction memory for I_4x4), I16x16 luma enc/rec, chroma MD and encode
+  pFunc->pfIntraFineMd (pEncCtx, pWelsMd, pCurMb, pMbCache);			//WelsMdIntraFinePartitionVaa
+
+  //add pEnc&rec to MD--2010.3.15
+  if (IS_INTRA16x16 (pCurMb->uiMbType)) { //only an MB still typed I16x16 after the hook is encoded/reconstructed here
+    pCurMb->uiCbp = 0;
+    WelsEncRecI16x16Y (pEncCtx, pCurMb, pMbCache);
+  }
+
+  //chroma
+  pWelsMd->iCostChroma = WelsMdIntraChroma (pFunc, pEncCtx->pCurDqLayer, pMbCache, pWelsMd->iLambda);
+  WelsIMbChromaEncode (pEncCtx, pCurMb, pMbCache);  //add pEnc&rec to MD--2010.3.15
+  pCurMb->pSadCost[0] = 0; //the stored SAD cost is reset to 0 on the intra path
+}
+
+} // namespace WelsSVCEnc
--- a/codec/encoder/core/src/svc_enc_slice_segment.cpp
+++ b/codec/encoder/core/src/svc_enc_slice_segment.cpp
@@ -1,768 +1,692 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	slice_segment.c
- *
- * \brief	SSlice segment routine (Single slice/multiple slice/fmo arrangement exclusive)
- *
- * \date	2/4/2009 Created
- *
- *************************************************************************************
- */
-#include <string.h>
-#include <assert.h>
-#include "svc_enc_slice_segment.h"
-#include "wels_const.h"
-#include "macros.h"
-#include "utils.h"
-#include "macros.h"
-#include "rc.h"
-
-namespace WelsSVCEnc {
-/*!
- * \brief	Assign MB map for single slice segment
- *
- * \param	pMbMap			overall MB map
- * \param	iCountMbNum	count number of MB
- *
- * \return	0 - successful; none 0 - failed
- */
-int32_t AssignMbMapSingleSlice( void *pMbMap, const int32_t kiCountMbNum, const int32_t kiMapUnitSize )
-{
-	if ( NULL == pMbMap || kiCountMbNum <= 0 )
-		return 1;
-	
-	memset( pMbMap, 0, kiCountMbNum * kiMapUnitSize );
-
-	return 0;
-}
-
-/*!
- * \brief	Assign MB map for multiple slice(s) segment
- *
- * \param	pMbMap			overall MB map
- * \param	iCountMbNum	count number of MB
- *
- * \return	0 - successful; none 0 - failed
- */
-int32_t AssignMbMapMultipleSlices( SSliceCtx *pSliceSeg, const SMulSliceOption *kpMso )
-{
-	if ( NULL == pSliceSeg || SM_SINGLE_SLICE == pSliceSeg->uiSliceMode )
-		return 1;
-	
-	if ( SM_ROWMB_SLICE == pSliceSeg->uiSliceMode )
-	{
-		const int32_t kiMbWidth	= pSliceSeg->iMbWidth;
-		int32_t iSliceNum = pSliceSeg->iSliceNumInFrame, uiSliceIdx = 0;
-		
-		while ( uiSliceIdx < iSliceNum )
-		{
-			const int16_t kiFirstMb = uiSliceIdx * kiMbWidth;
-			pSliceSeg->pCountMbNumInSlice[uiSliceIdx]	= kiMbWidth;
-			pSliceSeg->pFirstMbInSlice[uiSliceIdx]		= kiFirstMb;
-			memset(pSliceSeg->pOverallMbMap+kiFirstMb, (uint8_t)uiSliceIdx, kiMbWidth*sizeof(uint8_t));
-			++ uiSliceIdx;
-		}
-
-		return 0;
-	}
-	else if (	SM_RASTER_SLICE  == pSliceSeg->uiSliceMode ||
-				SM_FIXEDSLCNUM_SLICE == pSliceSeg->uiSliceMode )
-	{
-		const int32_t *kpSlicesAssignList				= (int32_t *)&(kpMso->sSliceArgument.uiSliceMbNum[0]);
-		const int32_t kiCountNumMbInFrame		= pSliceSeg->iMbNumInFrame;
-		const int32_t kiCountSliceNumInFrame	= pSliceSeg->iSliceNumInFrame;
-		int32_t iSliceIdx						= 0;
-		int16_t iMbIdx							= 0;
-		
-		do {
-			const int32_t kiCurRunLength	= kpSlicesAssignList[iSliceIdx];
-			int32_t iRunIdx					= 0;
-
-			pSliceSeg->pFirstMbInSlice[iSliceIdx]			= iMbIdx;
-			pSliceSeg->pCountMbNumInSlice[iSliceIdx]		= kiCurRunLength;
-
-			// due here need check validate mb_assign_map for input pData, can not use memset			
-			do {
-				pSliceSeg->pOverallMbMap[iMbIdx+iRunIdx]	= iSliceIdx;
-				++ iRunIdx;
-			} while(iRunIdx < kiCurRunLength && iMbIdx + iRunIdx < kiCountNumMbInFrame);			
-
-			iMbIdx += kiCurRunLength;
-			++ iSliceIdx;
-		} while(iSliceIdx < kiCountSliceNumInFrame && iMbIdx < kiCountNumMbInFrame);		
-	}
-	else if ( SM_DYN_SLICE == pSliceSeg->uiSliceMode )
-	{
-	}
-	else	// any else uiSliceMode?
-	{
-		assert( 0 );
-	}
-
-	// extention for other multiple slice type in the future
-	return 1;
-}
-
-/*!
- *  Check slices assignment setttings on MST_INTERLEAVE type	
- */
-
-//slice parameter check for SM_FIXEDSLCNUM_SLICE 
-bool_t CheckFixedSliceNumMultiSliceSetting( const int32_t kiMbNumInFrame, SSliceArgument * pSliceArg )
-{
-	int32_t *pSlicesAssignList		= (int32_t *)&(pSliceArg->uiSliceMbNum[0]);
-	const uint32_t kuiSliceNum			= pSliceArg->iSliceNum;
-	uint32_t uiSliceIdx				= 0;
-	const int32_t kiMbNumPerSlice	= kiMbNumInFrame / kuiSliceNum;	
-	int32_t iNumMbLeft				= kiMbNumInFrame;		
-
-	if ( NULL == pSlicesAssignList )
-		return false;
-
-	for ( ; uiSliceIdx+1 < kuiSliceNum; ++ uiSliceIdx )
-	{
-		pSlicesAssignList[uiSliceIdx] = kiMbNumPerSlice;
-		iNumMbLeft	-= kiMbNumPerSlice;
-	}
-	pSlicesAssignList[uiSliceIdx] = iNumMbLeft;
-
-	return true;
-}
-
-//slice parameter check for SM_ROWMB_SLICE 
-bool_t CheckRowMbMultiSliceSetting( const int32_t kiMbWidth, SSliceArgument * pSliceArg )
-{
-	int32_t *pSlicesAssignList = (int32_t *)&(pSliceArg->uiSliceMbNum[0]);
-	const uint32_t kuiSliceNum		= pSliceArg->iSliceNum;
-	uint32_t uiSliceIdx			= 0;
-
-	if ( NULL == pSlicesAssignList )
-		return false;
-
-	while ( uiSliceIdx < kuiSliceNum )
-	{
-		pSlicesAssignList[uiSliceIdx]	= kiMbWidth;
-		++ uiSliceIdx;
-	}
-	return true;
-}
-
-//slice parameter check for SM_RASTER_SLICE 
-bool_t CheckRasterMultiSliceSetting( const int32_t kiMbNumInFrame, SSliceArgument * pSliceArg )
-{
-	int32_t			*pSlicesAssignList = (int32_t *)&(pSliceArg->uiSliceMbNum[0]);
-	int32_t			iActualSliceCount	= 0;
-
-	//check mb_num setting
-	uint32_t uiSliceIdx			= 0;
-	int32_t iCountMb			= 0;
-
-	if ( NULL == pSlicesAssignList )
-		return false;
-
-	while ( ( uiSliceIdx < MAX_SLICES_NUM ) && ( 0 < pSlicesAssignList[uiSliceIdx] ) ) 
-	{
-		iCountMb			+= pSlicesAssignList[uiSliceIdx];
-		iActualSliceCount	=  uiSliceIdx + 1;
-
-		if ( iCountMb >= kiMbNumInFrame )
-		{
-			break;
-		}
-
-		++ uiSliceIdx;
-	}
-	//break condition above makes, after the while
-	// here must have (iActualSliceCount <= MAX_SLICES_NUM)
-
-	//correction if needed
-	if ( iCountMb == kiMbNumInFrame )
-	{
-		;
-	}
-	else if ( iCountMb > kiMbNumInFrame )
-	{
-		//need correction: 
-		//setting is more than iMbNumInFrame, 
-		//cut the last uiSliceMbNum; adjust iCountMb
-		pSlicesAssignList[iActualSliceCount-1]	-=	( iCountMb - kiMbNumInFrame );
-		iCountMb								=	kiMbNumInFrame;
-	}
-	else if ( iActualSliceCount < MAX_SLICES_NUM )
-	{
-		//where ( iCountMb < iMbNumInFrame )
-		//can do correction: 
-		//	make the last uiSliceMbNum the left num
-		pSlicesAssignList[iActualSliceCount] = kiMbNumInFrame - iCountMb;	
-		iActualSliceCount += 1;
-	}
-	else
-	{
-		//here ( iCountMb < iMbNumInFrame ) && ( iActualSliceCount == MAX_SLICES_NUM )
-		//no more slice can be added
-		return false;
-	}
-
-	pSliceArg->iSliceNum = iActualSliceCount;
-	return true;
-
-}
-
-
-// GOM based RC related for uiSliceNum decision, only used at SM_FIXEDSLCNUM_SLICE
-void GomValidCheckSliceNum( const int32_t kiMbWidth, const int32_t kiMbHeight, int32_t *pSliceNum )
-{
-	const int32_t kiCountNumMb	= kiMbWidth * kiMbHeight;	
-	int32_t iSliceNum			= *pSliceNum;
-	int32_t iGomSize;
-	
-	//The default RC is Bit-rate mode[Yi], but need consider as below:
-	// Tuned to use max of mode0 and mode1 due can not refresh on this from rc mode changed outside, 8/16/2011
-	// NOTE: GOM_ROW_MODE0_?P is integer multipler of GOM_ROW_MODE1_?P, which predefined at rc.h there, so GOM_ROM take MODE0 as the initial	
-	if( kiMbWidth<=MB_WIDTH_THRESHOLD_90P )
-		iGomSize = kiMbWidth * GOM_ROW_MODE0_90P;
-	else if( kiMbWidth<=MB_WIDTH_THRESHOLD_180P )
-		iGomSize = kiMbWidth *  GOM_ROW_MODE0_180P;
-	else if( kiMbWidth<=MB_WIDTH_THRESHOLD_360P )
-		iGomSize = kiMbWidth * GOM_ROW_MODE0_360P;
-	else
-		iGomSize = kiMbWidth * GOM_ROW_MODE0_720P;
-
-	while(true)
-	{
-		if ( kiCountNumMb < iGomSize * iSliceNum )
-		{
-			-- iSliceNum;
-			iSliceNum = iSliceNum - (iSliceNum & 0x01);	// verfiy even num for multiple slices case			
-			if ( iSliceNum < 2 )	// for safe
-				break;
-			continue;
-		}		
-		break;		
-	}
-	
-	if ( 0 == iSliceNum )
-		iSliceNum = 1;
-	
-	*pSliceNum	= iSliceNum;
-}
-
-
-// GOM based RC related for uiSliceMbNum decision, only used at SM_FIXEDSLCNUM_SLICE
-void GomValidCheckSliceMbNum( const int32_t kiMbWidth, const int32_t kiMbHeight, SSliceArgument * pSliceArg )
-{
-	uint32_t *pSlicesAssignList		= &(pSliceArg->uiSliceMbNum[0]);
-	const uint32_t kuiSliceNum			= pSliceArg->iSliceNum;
-	const int32_t kiMbNumInFrame	= kiMbWidth * kiMbHeight;			
-	const int32_t kiMbNumPerSlice	= kiMbNumInFrame / kuiSliceNum;	
-	int32_t iNumMbLeft				= kiMbNumInFrame;			
-
-	int32_t iMinimalMbNum			= kiMbWidth;	// in theory we need only 1 SMB, here let it as one SMB row required
-	int32_t iMaximalMbNum			= 0;	// dynamically assign later
-	int32_t iGomSize;
-
-	uint32_t uiSliceIdx	= 0;	// for test
-
-	// The default RC is Bit-rate mode [Yi], but need consider as below:
-	// Tuned to use max of mode0 and mode1 due can not refresh on this from rc mode changed outside, 8/16/2011
-	// NOTE: GOM_ROW_MODE0_?P is integer multipler of GOM_ROW_MODE1_?P, which predefined at rc.h there, so GOM_ROM take MODE0 as the initial	
-	if( kiMbWidth<=MB_WIDTH_THRESHOLD_90P )
-		iGomSize = kiMbWidth * GOM_ROW_MODE0_90P;
-	else if( kiMbWidth<=MB_WIDTH_THRESHOLD_180P )
-		iGomSize = kiMbWidth * GOM_ROW_MODE0_180P;
-	else if( kiMbWidth<=MB_WIDTH_THRESHOLD_360P )
-		iGomSize = kiMbWidth * GOM_ROW_MODE0_360P;
-	else
-		iGomSize = kiMbWidth * GOM_ROW_MODE0_720P;
-
-	iMinimalMbNum	= iGomSize;
-	iMaximalMbNum	= kiMbNumInFrame - (kuiSliceNum - 1) * iMinimalMbNum;
-
-	while ( uiSliceIdx+1 < kuiSliceNum )
-	{
-		// GOM boundary aligned
-		int32_t iNumMbAssigning = (int32_t)(1.0f * kiMbNumPerSlice / iGomSize + 0.5f + EPSN) * iGomSize;
-
-		// make sure one GOM at least in each slice for safe
-		if ( iNumMbAssigning < iMinimalMbNum )
-			iNumMbAssigning	= iMinimalMbNum;
-		else if ( iNumMbAssigning > iMaximalMbNum )
-			iNumMbAssigning	= iMaximalMbNum;
-
-		assert( iNumMbAssigning > 0 );
-
-		iNumMbLeft -= iNumMbAssigning;
-		assert( iNumMbLeft > 0 );
-		pSlicesAssignList[uiSliceIdx]	= iNumMbAssigning;
-
-		++ uiSliceIdx;
-		iMaximalMbNum	= iNumMbLeft - (kuiSliceNum - uiSliceIdx - 1) * iMinimalMbNum;	// get maximal num_mb in left parts
-	}
-	pSlicesAssignList[uiSliceIdx] = iNumMbLeft;		
-}
-
-
-/*!
- *	Get slice count for multiple slice segment
- *
- */
-int32_t GetInitialSliceNum( const int32_t kiMbWidth, const int32_t kiMbHeight, SMulSliceOption* pMso )
-{
-	if ( NULL == pMso )
-		return -1;
-
-	switch( pMso->uiSliceMode )
-	{
-	case SM_SINGLE_SLICE:
-	case SM_FIXEDSLCNUM_SLICE:
-	case SM_RASTER_SLICE:
-	case SM_ROWMB_SLICE:
-		{
-			return pMso->sSliceArgument.iSliceNum;
-		}
-	case SM_DYN_SLICE:
-		{
-			return AVERSLICENUM_CONSTRAINT;//at the beginning of dynamic slicing, set the uiSliceNum to be 1
-		}
-	case SM_RESERVED:
-	default:
-		{
-			return -1;
-		}
-	}
-
-	return -1;
-}
-
-/*!
- * \brief	Initialize slice segment (Single/multiple slices)
- *
- * \param	pSliceSeg			SSlice segment to be initialized
- * \param	uiSliceMode			SSlice mode
- * \param	multi_slice_argv	Multiple slices argument
- * \param	iMbWidth			MB width 
- * \param	iMbHeight			MB height
- *
- * \return	0 - successful; none 0 - failed;
- */
-int32_t InitSliceSegment(	SSliceCtx *pSliceSeg,
-						    CMemoryAlign *pMa,
-							SMulSliceOption *pMso,
-							const int32_t kiMbWidth,
-							const int32_t kiMbHeight )
-{
-	const int32_t kiCountMbNum = kiMbWidth * kiMbHeight;
-	 SliceMode uiSliceMode = SM_SINGLE_SLICE;
-
-	if ( NULL == pSliceSeg || NULL == pMso || kiMbWidth == 0 || kiMbHeight == 0 )
-		return 1;
-
-	uiSliceMode = pMso->uiSliceMode;
-	if ( pSliceSeg->iMbNumInFrame == kiCountMbNum && pSliceSeg->iMbWidth == kiMbWidth
-			&& pSliceSeg->iMbHeight == kiMbHeight && pSliceSeg->uiSliceMode == uiSliceMode && pSliceSeg->pOverallMbMap != NULL )
-			return 0;
-	else if ( pSliceSeg->iMbNumInFrame != kiCountMbNum )
-	{
-		if ( NULL != pSliceSeg->pOverallMbMap )
-		{
-			pMa->WelsFree( pSliceSeg->pOverallMbMap, "pSliceSeg->pOverallMbMap" );
-
-			pSliceSeg->pOverallMbMap = NULL;
-		}
-		if ( NULL != pSliceSeg->pFirstMbInSlice )
-		{
-			pMa->WelsFree( pSliceSeg->pFirstMbInSlice, "pSliceSeg->pFirstMbInSlice" );
-
-			pSliceSeg->pFirstMbInSlice = NULL;
-		}
-		if ( NULL != pSliceSeg->pCountMbNumInSlice )
-		{
-			pMa->WelsFree( pSliceSeg->pCountMbNumInSlice, "pSliceSeg->pCountMbNumInSlice" );
-
-			pSliceSeg->pCountMbNumInSlice	= NULL;
-		}
-		// just for safe
-		pSliceSeg->iSliceNumInFrame	= 0;
-		pSliceSeg->iMbNumInFrame		= 0;
-		pSliceSeg->iMbWidth				= 0;
-		pSliceSeg->iMbHeight			= 0;
-		pSliceSeg->uiSliceMode			= SM_SINGLE_SLICE;	// sigle in default
-	}
-
-	if ( SM_SINGLE_SLICE == uiSliceMode )
-	{
-		pSliceSeg->pOverallMbMap	= (uint8_t *)pMa->WelsMalloc(kiCountMbNum * sizeof(uint8_t), "pSliceSeg->pOverallMbMap" );
-
-		WELS_VERIFY_RETURN_IF( 1, NULL == pSliceSeg->pOverallMbMap )		
-		pSliceSeg->iSliceNumInFrame	= 1;
-
-		pSliceSeg->pFirstMbInSlice	= (int16_t *)pMa->WelsMalloc( pSliceSeg->iSliceNumInFrame * sizeof(int16_t), "pSliceSeg->pFirstMbInSlice" );
-
-		WELS_VERIFY_RETURN_IF( 1, NULL == pSliceSeg->pFirstMbInSlice )
-
-		pSliceSeg->pCountMbNumInSlice= (int32_t *)pMa->WelsMalloc( pSliceSeg->iSliceNumInFrame * sizeof(int32_t), "pSliceSeg->pCountMbNumInSlice" );
-
-		WELS_VERIFY_RETURN_IF( 1, NULL == pSliceSeg->pCountMbNumInSlice )
-		pSliceSeg->uiSliceMode			= uiSliceMode;
-		pSliceSeg->iMbWidth				= kiMbWidth;
-		pSliceSeg->iMbHeight			= kiMbHeight;
-		pSliceSeg->iMbNumInFrame		= kiCountMbNum;
-		pSliceSeg->pCountMbNumInSlice[0]	= kiCountMbNum;
-		pSliceSeg->pFirstMbInSlice[0]		= 0;
-
-		return AssignMbMapSingleSlice( pSliceSeg->pOverallMbMap, kiCountMbNum, sizeof(pSliceSeg->pOverallMbMap[0]) );
-	}
-	else //if ( SM_MULTIPLE_SLICE == uiSliceMode )
-	{
-		if ( uiSliceMode != SM_FIXEDSLCNUM_SLICE && uiSliceMode != SM_ROWMB_SLICE && uiSliceMode != SM_RASTER_SLICE && uiSliceMode != SM_DYN_SLICE )
-			return 1;
-
-		pSliceSeg->pOverallMbMap	= (uint8_t *)pMa->WelsMalloc( kiCountMbNum * sizeof(uint8_t), "pSliceSeg->pOverallMbMap" );
-
-		WELS_VERIFY_RETURN_IF( 1, NULL == pSliceSeg->pOverallMbMap )
-
-		//SM_DYN_SLICE: init, set pSliceSeg->iSliceNumInFrame	= 1;		
-		pSliceSeg->iSliceNumInFrame = GetInitialSliceNum( kiMbWidth, kiMbHeight, pMso );
-
-		if ( -1 == pSliceSeg->iSliceNumInFrame )
-			return 1;
-
-		pSliceSeg->pCountMbNumInSlice	= (int32_t *)pMa->WelsMalloc( pSliceSeg->iSliceNumInFrame * sizeof(int32_t), "pSliceSeg->pCountMbNumInSlice" );
-
-		WELS_VERIFY_RETURN_IF( 1, NULL == pSliceSeg->pCountMbNumInSlice )
-
-		pSliceSeg->pFirstMbInSlice		= (int16_t *)pMa->WelsMalloc( pSliceSeg->iSliceNumInFrame * sizeof(int16_t), "pSliceSeg->pFirstMbInSlice" );
-
-		WELS_VERIFY_RETURN_IF( 1, NULL == pSliceSeg->pFirstMbInSlice )
-		pSliceSeg->uiSliceMode			= pMso->uiSliceMode;
-		pSliceSeg->iMbWidth				= kiMbWidth;
-		pSliceSeg->iMbHeight			= kiMbHeight;
-		pSliceSeg->iMbNumInFrame		= kiCountMbNum;
-		if ( SM_DYN_SLICE == pMso->uiSliceMode )
-		{
-			if ( 0 < pMso->sSliceArgument.uiSliceSizeConstraint )
-			{
-				pSliceSeg->uiSliceSizeConstraint= pMso->sSliceArgument.uiSliceSizeConstraint;
-			}
-			else
-			{
-				return 1;
-			}
-		}
-		else
-		{
-			pSliceSeg->uiSliceSizeConstraint = DEFAULT_MAXPACKETSIZE_CONSTRAINT;
-		}
-		// about "iMaxSliceNumConstraint"
-		//only used in SM_DYN_SLICE mode so far,
-		//now follows NAL_UNIT_CONSTRAINT, (see definition)
-		//will be adjusted under MT if there is limitation on iLayerNum 
-		pSliceSeg->iMaxSliceNumConstraint = MAX_SLICES_NUM;
-		
-
-		return AssignMbMapMultipleSlices( pSliceSeg, pMso );
-	}
-	return 0;
-}
-
-/*!
- * \brief	Uninitialize slice segment (Single/multiple slices)
- *
- * \param	pSliceSeg			SSlice segment to be uninitialized
- *
- * \return	none;
- */
-void UninitSliceSegment( SSliceCtx *pSliceSeg, CMemoryAlign *pMa )
-{
-	if ( NULL != pSliceSeg )
-	{
-		if ( NULL != pSliceSeg->pOverallMbMap )
-		{
-			pMa->WelsFree( pSliceSeg->pOverallMbMap, "pSliceSeg->pOverallMbMap" );
-
-			pSliceSeg->pOverallMbMap = NULL;
-		}
-		if ( NULL != pSliceSeg->pFirstMbInSlice )
-		{
-			pMa->WelsFree( pSliceSeg->pFirstMbInSlice, "pSliceSeg->pFirstMbInSlice" );
-
-			pSliceSeg->pFirstMbInSlice = NULL;
-		}
-		if ( NULL != pSliceSeg->pCountMbNumInSlice )
-		{
-			pMa->WelsFree( pSliceSeg->pCountMbNumInSlice, "pSliceSeg->pCountMbNumInSlice" );
-
-			pSliceSeg->pCountMbNumInSlice = NULL;
-		}		
-
-		pSliceSeg->iMbNumInFrame		= 0;
-		pSliceSeg->iMbWidth				= 0;
-		pSliceSeg->iMbHeight			= 0;
-		pSliceSeg->uiSliceMode			= SM_SINGLE_SLICE;	// single in default
-		pSliceSeg->iSliceNumInFrame	= 0;
-	}
-}
-
-
-/*!
- * \brief	Initialize Wels SSlice context (Single/multiple slices and FMO)
- *
- * \param	pSliceCtx		SSlice context to be initialized
- * \param	bFmoUseFlag	flag of using fmo
- * \param	iMbWidth		MB width 
- * \param	iMbHeight		MB height
- * \param	uiSliceMode		slice mode
- * \param	mul_slice_arg	argument for multiple slice if it is applicable
- * \param	pPpsArg			argument for pPps parameter
- *
- * \return	0 - successful; none 0 - failed;
- */
-int32_t InitSlicePEncCtx( SSliceCtx *pSliceCtx,
-						    CMemoryAlign *pMa,
-						    bool_t bFmoUseFlag,
-							int32_t iMbWidth,
-							int32_t iMbHeight,
-							SMulSliceOption *pMso,
-							void *pPpsArg )
-{
-	if ( NULL == pSliceCtx)
-		return 1;
-	
-	InitSliceSegment(	pSliceCtx,
-						pMa,
-						pMso,
-						iMbWidth,
-						iMbHeight	);
-	return 0;
-}
-
-/*!
- * \brief	Uninitialize Wels SSlice context (Single/multiple slices and FMO)
- *
- * \param	pSliceCtx		SSlice context to be initialized 
- *
- * \return	NONE;
- */
-void UninitSlicePEncCtx( SSliceCtx *pSliceCtx, CMemoryAlign *pMa )
-{
-	if ( NULL != pSliceCtx )
-	{
-		UninitSliceSegment( pSliceCtx, pMa );
-	}
-}
-
-/*!
- * \brief	Get slice idc for given iMbXY (apply in Single/multiple slices and FMO)
- *
- * \param	pSliceCtx		SSlice context
- * \param	kiMbXY			MB xy index
- *
- * \return	uiSliceIdc - successful; -1 - failed;
- */
-uint8_t WelsMbToSliceIdc( SSliceCtx *pSliceCtx, const int16_t kiMbXY )
-{
-	if ( NULL != pSliceCtx && kiMbXY < pSliceCtx->iMbNumInFrame && kiMbXY >= 0 )
-		return pSliceCtx->pOverallMbMap[ kiMbXY ];
-	return (uint8_t)(-1);
-}
-
-/*!
- * \brief	Get first mb in slice/slice_group: uiSliceIdc (apply in Single/multiple slices and FMO)
- *
- * \param	pSliceCtx		SSlice context
- * \param	kuiSliceIdc		slice idc
- *
- * \return	iFirstMb - successful; -1 - failed;
- */
-int32_t WelsGetFirstMbOfSlice( SSliceCtx *pSliceCtx, const int32_t kuiSliceIdc )
-{
-	return pSliceCtx->pFirstMbInSlice[ kuiSliceIdc ];
-}
-
-/*!
- * \brief	Get successive mb to be processed in slice/slice_group: uiSliceIdc (apply in Single/multiple slices and FMO)
- *
- * \param	pSliceCtx		SSlice context
- * \param	kiMbXY			MB xy index
- *
- * \return	next_mb - successful; -1 - failed;
- */
-int32_t WelsGetNextMbOfSlice( SSliceCtx *pSliceCtx, const int16_t kiMbXY )
-{
-	if ( NULL != pSliceCtx )
-	{
-		SSliceCtx *pSliceSeg = pSliceCtx;
-		if ( NULL == pSliceSeg || kiMbXY < 0 || kiMbXY >= pSliceSeg->iMbNumInFrame )
-			return -1;
-		if ( SM_SINGLE_SLICE == pSliceSeg->uiSliceMode )
-		{
-			int32_t iNextMbIdx = kiMbXY;
-			++ iNextMbIdx;
-			if ( iNextMbIdx >= pSliceSeg->iMbNumInFrame )
-				iNextMbIdx	= -1;
-			return iNextMbIdx;
-		}
-		else /*if ( SM_MULTIPLE_SLICE == pSliceSeg->uiSliceMode )*/
-		{
-			if ( SM_RESERVED != pSliceSeg->uiSliceMode )
-			{
-				int32_t iNextMbIdx = kiMbXY;
-				++ iNextMbIdx;
-				if ( iNextMbIdx < pSliceSeg->iMbNumInFrame && pSliceSeg->pOverallMbMap != NULL && pSliceSeg->pOverallMbMap[iNextMbIdx] == pSliceSeg->pOverallMbMap[ kiMbXY ] )
-					return iNextMbIdx;
-				return -1;
-			}
-			else
-				return -1;	// reserved here for other multiple slice type
-		}
-	}
-	else
-		return -1;
-}
-
-/*!
- * \brief	Get previous mb to be processed in slice/slice_group: uiSliceIdc (apply in Single/multiple slices and FMO)
- *
- * \param	pSliceCtx		SSlice context
- * \param	kiMbXY			MB xy index
- *
- * \return	prev_mb - successful; -1 - failed;
- */
-int32_t WelsGetPrevMbOfSlice( SSliceCtx *pSliceCtx, const int16_t kiMbXY )
-{
-	if ( NULL != pSliceCtx )
-	{
-		SSliceCtx *pSliceSeg = pSliceCtx;
-		if ( NULL == pSliceSeg || kiMbXY < 0 || kiMbXY >= pSliceSeg->iMbNumInFrame )
-			return -1;
-		if ( pSliceSeg->uiSliceMode == SM_SINGLE_SLICE )
-			return (-1+kiMbXY);
-		else/* if ( pSliceSeg->uiSliceMode == SM_MULTIPLE_SLICE )*/
-		{
-			if ( SM_RESERVED == pSliceSeg->uiSliceMode )
-			{
-				int32_t iPrevMbIdx = kiMbXY;
-				-- iPrevMbIdx;
-				if ( iPrevMbIdx >= 0 && iPrevMbIdx < pSliceSeg->iMbNumInFrame && NULL != pSliceSeg->pOverallMbMap
-					&& pSliceSeg->pOverallMbMap[ kiMbXY ] == pSliceSeg->pOverallMbMap[ iPrevMbIdx ] )
-					return iPrevMbIdx;
-				return -1;
-			}
-			else
-				return -1;
-		}
-	}
-	else
-		return -1;
-}
-
-/*!
- * \brief	Get number of mb in slice/slice_group: uiSliceIdc (apply in Single/multiple slices and FMO)
- *
- * \param	pSliceCtx		SSlice context
- * \param	kuiSliceIdc		slice/slice_group idc
- *
- * \return	count_num_of_mb - successful; -1 - failed;
- */
-int32_t WelsGetNumMbInSlice( SSliceCtx *pSliceCtx, const int32_t kuiSliceIdc )
-{
-	if ( NULL == pSliceCtx || kuiSliceIdc < 0 )
-		return -1;
-	{
-		SSliceCtx *pSliceSeg = pSliceCtx;
-		if ( SM_SINGLE_SLICE != pSliceSeg->uiSliceMode )
-		{
-			if ( NULL == pSliceSeg->pCountMbNumInSlice || kuiSliceIdc >= pSliceSeg->iSliceNumInFrame )
-				return -1;
-			return pSliceSeg->pCountMbNumInSlice[ kuiSliceIdc ];
-		}
-		else /*if ( pSliceSeg->uiSliceMode == SM_SINGLE_SLICE )*/
-		{
-			if ( kuiSliceIdc > 0 || NULL == pSliceSeg->pCountMbNumInSlice )
-				return -1;
-			return pSliceSeg->pCountMbNumInSlice[ kuiSliceIdc ];
-		}
-	}
-}
-
-int32_t GetCurrentSliceNum( const SSliceCtx *kpSliceCtx )
-{
-	return (kpSliceCtx != NULL) ? (kpSliceCtx->iSliceNumInFrame) : (-1);
-}
-int32_t DynamicAdjustSlicePEncCtxAll(	SSliceCtx *pSliceCtx,
-											int32_t *pRunLength	)
-{
-	const int32_t iCountNumMbInFrame		= pSliceCtx->iMbNumInFrame;
-	const int32_t iCountSliceNumInFrame	= pSliceCtx->iSliceNumInFrame;
-	int32_t iSameRunLenFlag				= 1;
-	int32_t iFirstMbIdx					= 0;
-	int32_t iSliceIdx						= 0;
-
-	assert( iCountSliceNumInFrame <= MAX_THREADS_NUM );
-	
-	while( iSliceIdx < iCountSliceNumInFrame )
-	{
-		if (pRunLength[iSliceIdx] != pSliceCtx->pCountMbNumInSlice[iSliceIdx])
-		{
-			iSameRunLenFlag = 0;
-			break;
-		}
-		++ iSliceIdx;
-	}
-	if ( iSameRunLenFlag )
-	{
-		return 1;	// do not need adjust it due to same running length as before to save complexity
-	}
-
-	iSliceIdx = 0;
-	do {
-		const int32_t kiSliceRun	= pRunLength[iSliceIdx];
-
-		pSliceCtx->pFirstMbInSlice[iSliceIdx]			= iFirstMbIdx;
-		pSliceCtx->pCountMbNumInSlice[iSliceIdx]		= kiSliceRun;
-		
-		memset(pSliceCtx->pOverallMbMap+iFirstMbIdx, (uint8_t)iSliceIdx, kiSliceRun*sizeof(uint8_t));
-		
-		iFirstMbIdx += kiSliceRun;
-
-		++ iSliceIdx;
-	} while(iSliceIdx < iCountSliceNumInFrame && iFirstMbIdx < iCountNumMbInFrame);
-	
-	return 0;	
-}
-
-int32_t DynamicMaxSliceNumConstraint( uint32_t uiMaximumNum, int32_t iConsumedNum, uint32_t iDulplicateTimes  )
-{
-	return ( (uiMaximumNum-iConsumedNum-1)/iDulplicateTimes );
-}
-
-} // namespace WelsSVCEnc
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	slice_segment.c
+ *
+ * \brief	SSlice segment routine (Single slice/multiple slice/fmo arrangement exclusive)
+ *
+ * \date	2/4/2009 Created
+ *
+ *************************************************************************************
+ */
+#include <string.h>
+#include <assert.h>
+#include "svc_enc_slice_segment.h"
+#include "wels_const.h"
+#include "macros.h"
+#include "utils.h"
+#include "macros.h"
+#include "rc.h"
+
+namespace WelsSVCEnc {
+/*!
+ * \brief	Assign MB map for single slice segment (every map entry is set to slice 0)
+ *
+ * \param	pMbMap			overall MB map
+ * \param	kiCountMbNum	count number of MB
+ * \param	kiMapUnitSize	size in bytes of one map entry
+ * \return	0 - successful; none 0 - failed (NULL map, non-positive MB count or unit size)
+ */
+int32_t AssignMbMapSingleSlice (void* pMbMap, const int32_t kiCountMbNum, const int32_t kiMapUnitSize) {
+  if (NULL == pMbMap || kiCountMbNum <= 0 || kiMapUnitSize <= 0)	// a negative size would wrap in memset
+    return 1;
+
+  memset (pMbMap, 0, (size_t)kiCountMbNum * (size_t)kiMapUnitSize);
+
+  return 0;
+}
+
+/*!
+ * \brief	Assign MB map for multiple slice(s) segment
+ *
+ * \param	pSliceSeg		slice context whose pOverallMbMap / pFirstMbInSlice / pCountMbNumInSlice are filled
+ * \param	kpMso			slice option; its sSliceArgument.uiSliceMbNum run lengths are read in raster/fixed-number mode
+ *
+ * \return	0 - SM_ROWMB_SLICE map assigned; 1 - every other path (NULL/single slice input, raster, fixed-number, dynamic)
+ */
+int32_t AssignMbMapMultipleSlices (SSliceCtx* pSliceSeg, const SMulSliceOption* kpMso) {
+  if (NULL == pSliceSeg || SM_SINGLE_SLICE == pSliceSeg->uiSliceMode)
+    return 1;
+
+  if (SM_ROWMB_SLICE == pSliceSeg->uiSliceMode) {
+    const int32_t kiMbWidth	= pSliceSeg->iMbWidth;
+    int32_t iSliceNum = pSliceSeg->iSliceNumInFrame, uiSliceIdx = 0;
+    // slice i is MB row i: it starts at i * kiMbWidth and holds kiMbWidth MBs
+    while (uiSliceIdx < iSliceNum) {
+      const int16_t kiFirstMb = uiSliceIdx * kiMbWidth;	// NOTE(review): narrowed to int16_t — confirm frame sizes keep this in range
+      pSliceSeg->pCountMbNumInSlice[uiSliceIdx]	= kiMbWidth;
+      pSliceSeg->pFirstMbInSlice[uiSliceIdx]		= kiFirstMb;
+      memset (pSliceSeg->pOverallMbMap + kiFirstMb, (uint8_t)uiSliceIdx, kiMbWidth * sizeof (uint8_t));
+      ++ uiSliceIdx;
+    }
+
+    return 0;
+  } else if (SM_RASTER_SLICE  == pSliceSeg->uiSliceMode ||
+             SM_FIXEDSLCNUM_SLICE == pSliceSeg->uiSliceMode) {
+    const int32_t* kpSlicesAssignList				= (int32_t*) & (kpMso->sSliceArgument.uiSliceMbNum[0]);	// kpMso is dereferenced unchecked
+    const int32_t kiCountNumMbInFrame		= pSliceSeg->iMbNumInFrame;
+    const int32_t kiCountSliceNumInFrame	= pSliceSeg->iSliceNumInFrame;
+    int32_t iSliceIdx						= 0;
+    int16_t iMbIdx							= 0;
+    // consecutive runs of MBs in raster order, one run length per slice
+    do {
+      const int32_t kiCurRunLength	= kpSlicesAssignList[iSliceIdx];
+      int32_t iRunIdx					= 0;
+
+      pSliceSeg->pFirstMbInSlice[iSliceIdx]			= iMbIdx;
+      pSliceSeg->pCountMbNumInSlice[iSliceIdx]		= kiCurRunLength;
+
+      // memset can not be used here: the run length comes from input data, so every write is bounded by the frame size
+      do {
+        pSliceSeg->pOverallMbMap[iMbIdx + iRunIdx]	= iSliceIdx;
+        ++ iRunIdx;
+      } while (iRunIdx < kiCurRunLength && iMbIdx + iRunIdx < kiCountNumMbInFrame);
+
+      iMbIdx += kiCurRunLength;
+      ++ iSliceIdx;
+    } while (iSliceIdx < kiCountSliceNumInFrame && iMbIdx < kiCountNumMbInFrame);
+  } else if (SM_DYN_SLICE == pSliceSeg->uiSliceMode) {	// nothing is assigned here for dynamic slicing
+  } else {	// any else uiSliceMode?
+    assert (0);
+  }
+
+  // extension point for other multiple slice types; NOTE(review): the raster/fixed-number branch also ends here and reports 1 — confirm callers ignore it
+  return 1;
+}
+
+/*!
+ *  Check slices assignment settings on MST_INTERLEAVE type
+ */
+
+//slice parameter setup for SM_FIXEDSLCNUM_SLICE: MBs are split evenly, the last slice takes the remainder
+bool_t CheckFixedSliceNumMultiSliceSetting (const int32_t kiMbNumInFrame, SSliceArgument* pSliceArg) {
+  if (NULL == pSliceArg || 0 == pSliceArg->iSliceNum)	// zero slices would divide by zero below
+    return false;
+
+  int32_t* pSlicesAssignList		= (int32_t*) & (pSliceArg->uiSliceMbNum[0]);
+  const uint32_t kuiSliceNum			= pSliceArg->iSliceNum;
+  const int32_t kiMbNumPerSlice	= kiMbNumInFrame / kuiSliceNum;
+  int32_t iNumMbLeft				= kiMbNumInFrame;
+  uint32_t uiSliceIdx				= 0;
+
+  for (; uiSliceIdx + 1 < kuiSliceNum; ++ uiSliceIdx) {
+    pSlicesAssignList[uiSliceIdx] = kiMbNumPerSlice;
+    iNumMbLeft	-= kiMbNumPerSlice;
+  }
+  pSlicesAssignList[uiSliceIdx] = iNumMbLeft;	// whatever is left goes to the final slice
+
+  return true;
+}
+
+//slice parameter setup for SM_ROWMB_SLICE: every slice holds exactly one MB row (kiMbWidth MBs)
+bool_t CheckRowMbMultiSliceSetting (const int32_t kiMbWidth, SSliceArgument* pSliceArg) {
+  // the assignment list is embedded in *pSliceArg, so that is the pointer that needs guarding
+  if (NULL == pSliceArg)
+    return false;
+
+  int32_t* pSlicesAssignList = (int32_t*) & (pSliceArg->uiSliceMbNum[0]);
+  const uint32_t kuiSliceNum = pSliceArg->iSliceNum;
+
+  for (uint32_t uiSliceIdx = 0; uiSliceIdx < kuiSliceNum; ++ uiSliceIdx) {
+    pSlicesAssignList[uiSliceIdx] = kiMbWidth;
+  }
+
+  return true;
+}
+
+//slice parameter check for SM_RASTER_SLICE: validates/corrects the run lengths and stores the resulting slice count
+bool_t CheckRasterMultiSliceSetting (const int32_t kiMbNumInFrame, SSliceArgument* pSliceArg) {
+  // guard the argument itself (the embedded list address is never NULL) and reject a non-positive
+  // MB count, which could index the list at -1 below or report zero slices as success
+  if (NULL == pSliceArg || kiMbNumInFrame <= 0)
+    return false;
+
+  int32_t*			pSlicesAssignList = (int32_t*) & (pSliceArg->uiSliceMbNum[0]);
+  int32_t			iActualSliceCount	= 0;
+  uint32_t uiSliceIdx			= 0;
+  int32_t iCountMb			= 0;
+
+  while ((uiSliceIdx < MAX_SLICES_NUM) && (0 < pSlicesAssignList[uiSliceIdx])) {
+    iCountMb			+= pSlicesAssignList[uiSliceIdx];
+    iActualSliceCount	=  uiSliceIdx + 1;
+
+    if (iCountMb >= kiMbNumInFrame) {
+      break;
+    }
+
+    ++ uiSliceIdx;
+  }
+  //break condition above makes, after the while
+  // here must have (iActualSliceCount <= MAX_SLICES_NUM)
+
+  //correction if needed
+  if (iCountMb == kiMbNumInFrame) {
+    ;
+  } else if (iCountMb > kiMbNumInFrame) {
+    //need correction:
+    //setting is more than iMbNumInFrame,
+    //cut the last uiSliceMbNum; adjust iCountMb
+    pSlicesAssignList[iActualSliceCount - 1]	-=	(iCountMb - kiMbNumInFrame);
+    iCountMb								=	kiMbNumInFrame;
+  } else if (iActualSliceCount < MAX_SLICES_NUM) {
+    //where ( iCountMb < iMbNumInFrame )
+    //can do correction:
+    //	make the last uiSliceMbNum the left num
+    pSlicesAssignList[iActualSliceCount] = kiMbNumInFrame - iCountMb;
+    iActualSliceCount += 1;
+  } else {
+    //here ( iCountMb < iMbNumInFrame ) && ( iActualSliceCount == MAX_SLICES_NUM )
+    //no more slice can be added
+    return false;
+  }
+
+  pSliceArg->iSliceNum = iActualSliceCount;
+  return true;
+
+}
+
+
+// GOM based RC related for uiSliceNum decision, only used at SM_FIXEDSLCNUM_SLICE
+//   kiMbWidth, kiMbHeight	frame size in MBs
+//   pSliceNum				in: requested slice count; out: the count after the reduction below
+//   						(the value is written back in place)
+void GomValidCheckSliceNum (const int32_t kiMbWidth, const int32_t kiMbHeight, int32_t* pSliceNum) {
+  const int32_t kiFrameMbCount = kiMbWidth * kiMbHeight;
+
+  // The default RC is bit-rate mode [Yi], but the GOM size always follows the MODE0 table:
+  // it can not be refreshed here when the rc mode is changed outside (8/16/2011).
+  // NOTE: GOM_ROW_MODE0_?P is an integer multiple of GOM_ROW_MODE1_?P (both predefined in rc.h).
+  // The table entry is selected by the frame width in MBs;
+  // widths above MB_WIDTH_THRESHOLD_360P use the 720P entry.
+  const int32_t kiGomUnit =
+    (kiMbWidth <= MB_WIDTH_THRESHOLD_90P)  ? kiMbWidth * GOM_ROW_MODE0_90P :
+    (kiMbWidth <= MB_WIDTH_THRESHOLD_180P) ? kiMbWidth * GOM_ROW_MODE0_180P :
+    (kiMbWidth <= MB_WIDTH_THRESHOLD_360P) ? kiMbWidth * GOM_ROW_MODE0_360P :
+    kiMbWidth * GOM_ROW_MODE0_720P;
+
+  // Walk the slice count down while that many GOM-sized slices would not fit into the frame.
+  int32_t iCandidate = *pSliceNum;
+
+  while (kiFrameMbCount < kiGomUnit * iCandidate) {
+    -- iCandidate;
+    iCandidate -= (iCandidate & 0x01);	// keep an even count for the multiple slices case
+    if (iCandidate < 2)	// for safety: stop once fewer than two slices are left
+      break;
+  }
+
+  // the even rounding can land on zero: report one slice in that case
+  if (0 == iCandidate)
+    iCandidate = 1;
+
+  *pSliceNum = iCandidate;
+}
+
+
+// GOM based RC related for uiSliceMbNum decision, only used at SM_FIXEDSLCNUM_SLICE
+void GomValidCheckSliceMbNum (const int32_t kiMbWidth, const int32_t kiMbHeight, SSliceArgument* pSliceArg) {
+  uint32_t* pSlicesAssignList		= & (pSliceArg->uiSliceMbNum[0]);
+  const uint32_t kuiSliceNum			= pSliceArg->iSliceNum;
+  const int32_t kiMbNumInFrame	= kiMbWidth * kiMbHeight;
+  const int32_t kiMbNumPerSlice	= kiMbNumInFrame / kuiSliceNum;	// NOTE(review): divides by zero if iSliceNum is 0 — confirm callers validate it
+  int32_t iNumMbLeft				= kiMbNumInFrame;
+
+  int32_t iMinimalMbNum			= kiMbWidth;	// one MB row as a start value; overwritten with iGomSize below
+  int32_t iMaximalMbNum			= 0;	// dynamically assign later
+  int32_t iGomSize;
+
+  uint32_t uiSliceIdx	= 0;	// index of the slice whose MB count is being assigned
+
+  // The default RC is Bit-rate mode [Yi], but need consider as below:
+  // Tuned to use max of mode0 and mode1 because it can not be refreshed here when the rc mode is changed outside, 8/16/2011
+  // NOTE: GOM_ROW_MODE0_?P is an integer multiple of GOM_ROW_MODE1_?P, both predefined in rc.h, so the GOM row count takes MODE0 as the initial
+  if (kiMbWidth <= MB_WIDTH_THRESHOLD_90P)
+    iGomSize = kiMbWidth * GOM_ROW_MODE0_90P;
+  else if (kiMbWidth <= MB_WIDTH_THRESHOLD_180P)
+    iGomSize = kiMbWidth * GOM_ROW_MODE0_180P;
+  else if (kiMbWidth <= MB_WIDTH_THRESHOLD_360P)
+    iGomSize = kiMbWidth * GOM_ROW_MODE0_360P;
+  else
+    iGomSize = kiMbWidth * GOM_ROW_MODE0_720P;
+
+  iMinimalMbNum	= iGomSize;
+  iMaximalMbNum	= kiMbNumInFrame - (kuiSliceNum - 1) * iMinimalMbNum;	// leaves one GOM for each of the other slices
+  // every slice but the last gets a GOM aligned share; the last one takes what is left
+  while (uiSliceIdx + 1 < kuiSliceNum) {
+    // GOM boundary aligned: the per-slice average rounded to the nearest multiple of iGomSize
+    int32_t iNumMbAssigning = (int32_t) (1.0f * kiMbNumPerSlice / iGomSize + 0.5f + EPSN) * iGomSize;
+
+    // make sure each slice holds at least one GOM, for safety, and no more than the current maximum
+    if (iNumMbAssigning < iMinimalMbNum)
+      iNumMbAssigning	= iMinimalMbNum;
+    else if (iNumMbAssigning > iMaximalMbNum)
+      iNumMbAssigning	= iMaximalMbNum;
+
+    assert (iNumMbAssigning > 0);
+
+    iNumMbLeft -= iNumMbAssigning;
+    assert (iNumMbLeft > 0);
+    pSlicesAssignList[uiSliceIdx]	= iNumMbAssigning;
+
+    ++ uiSliceIdx;
+    iMaximalMbNum	= iNumMbLeft - (kuiSliceNum - uiSliceIdx - 1) * iMinimalMbNum;	// get maximal num_mb in left parts
+  }
+  pSlicesAssignList[uiSliceIdx] = iNumMbLeft;
+}
+
+
+/*!
+ *	Get slice count for multiple slice segment
+ *
+ *	kiMbWidth, kiMbHeight	frame size in MBs; not referenced by the body
+ *	pMso					slice option providing the slice mode and the configured slice count
+ *	returns the slice count to start with, or -1 for a NULL option / a mode not handled here
+ */
+int32_t GetInitialSliceNum (const int32_t kiMbWidth, const int32_t kiMbHeight, SMulSliceOption* pMso) {
+  if (NULL == pMso)
+    return -1;
+
+  // dynamic slicing starts from the AVERSLICENUM_CONSTRAINT count
+  if (SM_DYN_SLICE == pMso->uiSliceMode)
+    return AVERSLICENUM_CONSTRAINT;
+
+  // every other supported mode uses the count configured in the slice argument
+  if (SM_SINGLE_SLICE == pMso->uiSliceMode
+      || SM_FIXEDSLCNUM_SLICE == pMso->uiSliceMode
+      || SM_RASTER_SLICE == pMso->uiSliceMode
+      || SM_ROWMB_SLICE == pMso->uiSliceMode) {
+    return pMso->sSliceArgument.iSliceNum;
+  }
+
+  // SM_RESERVED and any unknown mode
+  return -1;
+}
+
+/*!
+ * \brief	Initialize slice segment (Single/multiple slices)
+ *
+ * \param	pSliceSeg			SSlice segment to be initialized
+ * \param	pMa					memory manager used to allocate/free the segment buffers
+ * \param	pMso				Multiple slices option (slice mode and slice argument)
+ * \param	kiMbWidth			MB width
+ * \param	kiMbHeight			MB height
+ *
+ * \return	0 - successful; none 0 - failed;
+ */
+int32_t InitSliceSegment (SSliceCtx* pSliceSeg,
+                          CMemoryAlign* pMa,
+                          SMulSliceOption* pMso,
+                          const int32_t kiMbWidth,
+                          const int32_t kiMbHeight) {
+  const int32_t kiCountMbNum = kiMbWidth * kiMbHeight;
+  SliceMode uiSliceMode = SM_SINGLE_SLICE;
+
+  if (NULL == pSliceSeg || NULL == pMa || NULL == pMso || kiMbWidth <= 0 || kiMbHeight <= 0)
+    return 1;
+
+  uiSliceMode = pMso->uiSliceMode;
+  if (pSliceSeg->iMbNumInFrame == kiCountMbNum && pSliceSeg->iMbWidth == kiMbWidth
+      && pSliceSeg->iMbHeight == kiMbHeight && pSliceSeg->uiSliceMode == uiSliceMode && pSliceSeg->pOverallMbMap != NULL)
+    return 0;	// already set up for exactly this configuration
+  else {	// any re-initialization releases the old buffers first; they used to leak when only the mode/dimensions changed
+    if (NULL != pSliceSeg->pOverallMbMap) {
+      pMa->WelsFree (pSliceSeg->pOverallMbMap, "pSliceSeg->pOverallMbMap");
+
+      pSliceSeg->pOverallMbMap = NULL;
+    }
+    if (NULL != pSliceSeg->pFirstMbInSlice) {
+      pMa->WelsFree (pSliceSeg->pFirstMbInSlice, "pSliceSeg->pFirstMbInSlice");
+
+      pSliceSeg->pFirstMbInSlice = NULL;
+    }
+    if (NULL != pSliceSeg->pCountMbNumInSlice) {
+      pMa->WelsFree (pSliceSeg->pCountMbNumInSlice, "pSliceSeg->pCountMbNumInSlice");
+
+      pSliceSeg->pCountMbNumInSlice	= NULL;
+    }
+    // for safety: the context stays empty if a later step fails
+    pSliceSeg->iSliceNumInFrame	= 0;
+    pSliceSeg->iMbNumInFrame		= 0;
+    pSliceSeg->iMbWidth				= 0;
+    pSliceSeg->iMbHeight			= 0;
+    pSliceSeg->uiSliceMode			= SM_SINGLE_SLICE;	// single in default
+  }
+
+  if (SM_SINGLE_SLICE == uiSliceMode) {
+    pSliceSeg->pOverallMbMap	= (uint8_t*)pMa->WelsMalloc (kiCountMbNum * sizeof (uint8_t), "pSliceSeg->pOverallMbMap");
+
+    WELS_VERIFY_RETURN_IF (1, NULL == pSliceSeg->pOverallMbMap)
+    pSliceSeg->iSliceNumInFrame	= 1;
+
+    pSliceSeg->pFirstMbInSlice	= (int16_t*)pMa->WelsMalloc (pSliceSeg->iSliceNumInFrame * sizeof (int16_t),
+                                  "pSliceSeg->pFirstMbInSlice");
+
+    WELS_VERIFY_RETURN_IF (1, NULL == pSliceSeg->pFirstMbInSlice)
+
+    pSliceSeg->pCountMbNumInSlice = (int32_t*)pMa->WelsMalloc (pSliceSeg->iSliceNumInFrame * sizeof (int32_t),
+                                    "pSliceSeg->pCountMbNumInSlice");
+
+    WELS_VERIFY_RETURN_IF (1, NULL == pSliceSeg->pCountMbNumInSlice)
+    pSliceSeg->uiSliceMode			= uiSliceMode;
+    pSliceSeg->iMbWidth				= kiMbWidth;
+    pSliceSeg->iMbHeight			= kiMbHeight;
+    pSliceSeg->iMbNumInFrame		= kiCountMbNum;
+    pSliceSeg->pCountMbNumInSlice[0]	= kiCountMbNum;
+    pSliceSeg->pFirstMbInSlice[0]		= 0;
+
+    return AssignMbMapSingleSlice (pSliceSeg->pOverallMbMap, kiCountMbNum, sizeof (pSliceSeg->pOverallMbMap[0]));
+  } else { //if ( SM_MULTIPLE_SLICE == uiSliceMode )
+    if (uiSliceMode != SM_FIXEDSLCNUM_SLICE && uiSliceMode != SM_ROWMB_SLICE && uiSliceMode != SM_RASTER_SLICE
+        && uiSliceMode != SM_DYN_SLICE)
+      return 1;
+
+    pSliceSeg->pOverallMbMap	= (uint8_t*)pMa->WelsMalloc (kiCountMbNum * sizeof (uint8_t), "pSliceSeg->pOverallMbMap");
+
+    WELS_VERIFY_RETURN_IF (1, NULL == pSliceSeg->pOverallMbMap)
+
+    //SM_DYN_SLICE: the initial count is AVERSLICENUM_CONSTRAINT (see GetInitialSliceNum)
+    pSliceSeg->iSliceNumInFrame = GetInitialSliceNum (kiMbWidth, kiMbHeight, pMso);
+
+    if (pSliceSeg->iSliceNumInFrame <= 0)	// -1 is the failure value; 0 would give empty per-slice tables that get written below
+      return 1;
+
+    pSliceSeg->pCountMbNumInSlice	= (int32_t*)pMa->WelsMalloc (pSliceSeg->iSliceNumInFrame * sizeof (int32_t),
+                                    "pSliceSeg->pCountMbNumInSlice");
+
+    WELS_VERIFY_RETURN_IF (1, NULL == pSliceSeg->pCountMbNumInSlice)
+
+    pSliceSeg->pFirstMbInSlice		= (int16_t*)pMa->WelsMalloc (pSliceSeg->iSliceNumInFrame * sizeof (int16_t),
+                                    "pSliceSeg->pFirstMbInSlice");
+
+    WELS_VERIFY_RETURN_IF (1, NULL == pSliceSeg->pFirstMbInSlice)
+    pSliceSeg->uiSliceMode			= pMso->uiSliceMode;
+    pSliceSeg->iMbWidth				= kiMbWidth;
+    pSliceSeg->iMbHeight			= kiMbHeight;
+    pSliceSeg->iMbNumInFrame		= kiCountMbNum;
+    if (SM_DYN_SLICE == pMso->uiSliceMode) {
+      if (0 < pMso->sSliceArgument.uiSliceSizeConstraint) {
+        pSliceSeg->uiSliceSizeConstraint = pMso->sSliceArgument.uiSliceSizeConstraint;
+      } else {
+        return 1;
+      }
+    } else {
+      pSliceSeg->uiSliceSizeConstraint = DEFAULT_MAXPACKETSIZE_CONSTRAINT;
+    }
+    // about "iMaxSliceNumConstraint"
+    //only used in SM_DYN_SLICE mode so far,
+    //now follows NAL_UNIT_CONSTRAINT, (see definition)
+    //will be adjusted under MT if there is limitation on iLayerNum
+    pSliceSeg->iMaxSliceNumConstraint = MAX_SLICES_NUM;
+
+
+    return AssignMbMapMultipleSlices (pSliceSeg, pMso);
+  }
+  return 0;
+}
+
+/*!
+ * \brief	Release the buffers of a slice segment and reset its fields (Single/multiple slices)
+ * \param	pSliceSeg			SSlice segment to be uninitialized; NULL is ignored
+ * \param	pMa					memory manager whose WelsFree releases the buffers
+ * \return	none;
+ */
+void UninitSliceSegment (SSliceCtx* pSliceSeg, CMemoryAlign* pMa) {
+  if (NULL == pSliceSeg)
+    return;
+
+  // MB to slice index map
+  if (pSliceSeg->pOverallMbMap != NULL) {
+    pMa->WelsFree (pSliceSeg->pOverallMbMap, "pSliceSeg->pOverallMbMap");
+    pSliceSeg->pOverallMbMap = NULL;
+  }
+  // first MB of every slice
+  if (pSliceSeg->pFirstMbInSlice != NULL) {
+    pMa->WelsFree (pSliceSeg->pFirstMbInSlice, "pSliceSeg->pFirstMbInSlice");
+    pSliceSeg->pFirstMbInSlice = NULL;
+  }
+  // MB count of every slice
+  if (pSliceSeg->pCountMbNumInSlice != NULL) {
+    pMa->WelsFree (pSliceSeg->pCountMbNumInSlice, "pSliceSeg->pCountMbNumInSlice");
+    pSliceSeg->pCountMbNumInSlice = NULL;
+  }
+  // leave the context in its empty state, single slice in default
+  pSliceSeg->uiSliceMode = SM_SINGLE_SLICE;
+  pSliceSeg->iSliceNumInFrame = 0;
+  pSliceSeg->iMbNumInFrame = 0;
+  pSliceSeg->iMbWidth = 0;
+  pSliceSeg->iMbHeight = 0;
+}
+
+
+/*!
+ * \brief	Initialize Wels SSlice context (Single/multiple slices and FMO)
+ *
+ * \param	pSliceCtx		SSlice context to be initialized
+ * \param	pMa				memory manager, forwarded to InitSliceSegment
+ * \param	bFmoUseFlag	flag of using fmo (not referenced by the body)
+ * \param	iMbWidth		MB width
+ * \param	iMbHeight		MB height
+ * \param	pMso			multiple slice option, forwarded to InitSliceSegment
+ * \param	pPpsArg			argument for pPps parameter (not referenced by the body)
+ *
+ * \return	0 - pSliceCtx was not NULL (the InitSliceSegment status is not reported); 1 - pSliceCtx is NULL;
+ */
+int32_t InitSlicePEncCtx (SSliceCtx* pSliceCtx,
+                          CMemoryAlign* pMa,
+                          bool_t bFmoUseFlag,
+                          int32_t iMbWidth,
+                          int32_t iMbHeight,
+                          SMulSliceOption* pMso,
+                          void* pPpsArg) {
+  if (NULL == pSliceCtx)
+    return 1;
+  // NOTE(review): the status returned by InitSliceSegment is discarded here — confirm this is intended
+  InitSliceSegment (pSliceCtx,
+                    pMa,
+                    pMso,
+                    iMbWidth,
+                    iMbHeight);
+  return 0;
+}
+
+/*!
+ * \brief	Uninitialize Wels SSlice context (Single/multiple slices and FMO)
+ *
+ * \param	pSliceCtx		SSlice context to be uninitialized; NULL is ignored
+ *
+ * \return	NONE;
+ */
+void UninitSlicePEncCtx (SSliceCtx* pSliceCtx, CMemoryAlign* pMa) {
+  if (NULL == pSliceCtx)
+    return;
+  UninitSliceSegment (pSliceCtx, pMa);
+}
+
+/*!
+ * \brief	Get slice idc for given iMbXY (apply in Single/multiple slices and FMO)
+ *
+ * \param	pSliceCtx		SSlice context
+ * \param	kiMbXY			MB xy index
+ *
+ * \return	uiSliceIdc - successful; (uint8_t)(-1) - failed (NULL context, missing MB map or index out of the frame);
+ */
+uint8_t WelsMbToSliceIdc (SSliceCtx* pSliceCtx, const int16_t kiMbXY) {
+  if (NULL != pSliceCtx && NULL != pSliceCtx->pOverallMbMap && kiMbXY >= 0 && kiMbXY < pSliceCtx->iMbNumInFrame)
+    return pSliceCtx->pOverallMbMap[ kiMbXY ];
+  return (uint8_t) (-1);
+}
+
+/*!
+ * \brief	Get first mb in slice/slice_group: uiSliceIdc (apply in Single/multiple slices and FMO)
+ * \param	pSliceCtx		SSlice context
+ * \param	kuiSliceIdc		slice idc
+ * \return	iFirstMb - successful; -1 - failed (NULL context, missing table or negative slice idc);
+ */
+int32_t WelsGetFirstMbOfSlice (SSliceCtx* pSliceCtx, const int32_t kuiSliceIdc) {
+  if (NULL == pSliceCtx || NULL == pSliceCtx->pFirstMbInSlice || kuiSliceIdc < 0)
+    return -1;	// honour the documented failure value instead of dereferencing blindly
+  return pSliceCtx->pFirstMbInSlice[ kuiSliceIdc ];
+}
+
+/*!
+ * \brief	Get the MB that follows kiMbXY inside the same slice/slice_group (apply in Single/multiple slices and FMO)
+ *
+ * \param	pSliceCtx		SSlice context
+ * \param	kiMbXY			MB xy index
+ *
+ * \return	next_mb - successful; -1 - failed or no further MB in this slice;
+ */
+int32_t WelsGetNextMbOfSlice (SSliceCtx* pSliceCtx, const int16_t kiMbXY) {
+  if (NULL == pSliceCtx)
+    return -1;
+  if (kiMbXY < 0 || kiMbXY >= pSliceCtx->iMbNumInFrame)
+    return -1;
+
+  const int32_t kiCandidate = kiMbXY + 1;
+
+  if (SM_SINGLE_SLICE == pSliceCtx->uiSliceMode) {
+    // the only slice spans the frame, so just the frame end stops the walk
+    return (kiCandidate < pSliceCtx->iMbNumInFrame) ? kiCandidate : -1;
+  }
+
+  if (SM_RESERVED == pSliceCtx->uiSliceMode)
+    return -1;	// reserved here for other multiple slice type
+
+  // multiple slices: the successor has to exist and to share the slice index of kiMbXY
+  if (kiCandidate >= pSliceCtx->iMbNumInFrame)
+    return -1;
+  if (NULL == pSliceCtx->pOverallMbMap)
+    return -1;
+  if (pSliceCtx->pOverallMbMap[kiCandidate] != pSliceCtx->pOverallMbMap[kiMbXY])
+    return -1;
+  return kiCandidate;
+}
+
+/*!
+ * \brief	Get previous mb to be processed in slice/slice_group: uiSliceIdc (apply in Single/multiple slices and FMO)
+ *
+ * \param	pSliceCtx		SSlice context
+ * \param	kiMbXY			MB xy index
+ * NOTE(review): the map lookup below runs only for SM_RESERVED, while WelsGetNextMbOfSlice runs it for every mode except SM_RESERVED — confirm
+ * \return	prev_mb - successful; -1 - failed (as written, always -1 for the multiple slice modes other than SM_RESERVED);
+ */
+int32_t WelsGetPrevMbOfSlice (SSliceCtx* pSliceCtx, const int16_t kiMbXY) {
+  if (NULL != pSliceCtx) {
+    SSliceCtx* pSliceSeg = pSliceCtx;
+    if (NULL == pSliceSeg || kiMbXY < 0 || kiMbXY >= pSliceSeg->iMbNumInFrame)
+      return -1;
+    if (pSliceSeg->uiSliceMode == SM_SINGLE_SLICE)
+      return (-1 + kiMbXY);	// yields -1 for the first MB of the frame
+    else { /* if ( pSliceSeg->uiSliceMode == SM_MULTIPLE_SLICE )*/
+      if (SM_RESERVED == pSliceSeg->uiSliceMode) {
+        int32_t iPrevMbIdx = kiMbXY;
+        -- iPrevMbIdx;
+        if (iPrevMbIdx >= 0 && iPrevMbIdx < pSliceSeg->iMbNumInFrame && NULL != pSliceSeg->pOverallMbMap
+            && pSliceSeg->pOverallMbMap[ kiMbXY ] == pSliceSeg->pOverallMbMap[ iPrevMbIdx ])
+          return iPrevMbIdx;	// the predecessor carries the same slice index in the map
+        return -1;
+      } else
+        return -1;
+    }
+  } else
+    return -1;
+}
+
+/*!
+ * \brief	Get the MB count stored for slice/slice_group uiSliceIdc (apply in Single/multiple slices and FMO)
+ *
+ * \param	pSliceCtx		SSlice context
+ * \param	kuiSliceIdc		slice/slice_group idc
+ *
+ * \return	count_num_of_mb - successful; -1 - failed;
+ */
+int32_t WelsGetNumMbInSlice (SSliceCtx* pSliceCtx, const int32_t kuiSliceIdc) {
+  if (NULL == pSliceCtx || kuiSliceIdc < 0)
+    return -1;
+  if (NULL == pSliceCtx->pCountMbNumInSlice)
+    return -1;
+
+  // a single slice frame only knows index 0; otherwise the index is
+  // limited by the number of slices in the frame
+  if (SM_SINGLE_SLICE == pSliceCtx->uiSliceMode) {
+    if (kuiSliceIdc > 0)
+      return -1;
+  } else if (kuiSliceIdc >= pSliceCtx->iSliceNumInFrame) {
+    return -1;
+  }
+  return pSliceCtx->pCountMbNumInSlice[ kuiSliceIdc ];
+}
+
+int32_t GetCurrentSliceNum (const SSliceCtx* kpSliceCtx) {
+  return (NULL == kpSliceCtx) ? -1 : kpSliceCtx->iSliceNumInFrame;	// -1 signals a missing context
+}
+int32_t DynamicAdjustSlicePEncCtxAll (SSliceCtx* pSliceCtx,
+                                      int32_t* pRunLength) {
+  const int32_t iCountNumMbInFrame		= pSliceCtx->iMbNumInFrame;
+  const int32_t iCountSliceNumInFrame	= pSliceCtx->iSliceNumInFrame;
+  int32_t iSameRunLenFlag				= 1;
+  int32_t iFirstMbIdx					= 0;
+  int32_t iSliceIdx						= 0;
+  // Rebuilds the slice tables and the MB map from pRunLength (one MB count per slice); returns 1 if nothing changed, else 0
+  assert (iCountSliceNumInFrame <= MAX_THREADS_NUM);
+  // first pass: look for any slice whose requested run length differs from the stored one
+  while (iSliceIdx < iCountSliceNumInFrame) {
+    if (pRunLength[iSliceIdx] != pSliceCtx->pCountMbNumInSlice[iSliceIdx]) {
+      iSameRunLenFlag = 0;
+      break;
+    }
+    ++ iSliceIdx;
+  }
+  if (iSameRunLenFlag) {
+    return 1;	// no adjustment needed: the run lengths are the same as before, which saves complexity
+  }
+  // second pass: lay the slices out back to back, starting at MB 0
+  iSliceIdx = 0;
+  do {
+    const int32_t kiSliceRun	= pRunLength[iSliceIdx];
+
+    pSliceCtx->pFirstMbInSlice[iSliceIdx]			= iFirstMbIdx;
+    pSliceCtx->pCountMbNumInSlice[iSliceIdx]		= kiSliceRun;
+    // NOTE(review): the run is not clamped to the MBs left in the frame — assumes the run lengths sum to at most iMbNumInFrame; confirm
+    memset (pSliceCtx->pOverallMbMap + iFirstMbIdx, (uint8_t)iSliceIdx, kiSliceRun * sizeof (uint8_t));
+
+    iFirstMbIdx += kiSliceRun;
+
+    ++ iSliceIdx;
+  } while (iSliceIdx < iCountSliceNumInFrame && iFirstMbIdx < iCountNumMbInFrame);
+
+  return 0;
+}
+
+int32_t DynamicMaxSliceNumConstraint (uint32_t uiMaximumNum, int32_t iConsumedNum, uint32_t iDulplicateTimes) {
+  return ((int32_t)uiMaximumNum - iConsumedNum - 1) / (int32_t)iDulplicateTimes;	// signed math: the unsigned form wrapped to a huge value once iConsumedNum + 1 exceeded uiMaximumNum
+}
+
+} // namespace WelsSVCEnc
--- a/codec/encoder/core/src/svc_encode_mb.cpp
+++ b/codec/encoder/core/src/svc_encode_mb.cpp
@@ -1,413 +1,385 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file		encode_mb.c
- *
- * \brief		Implementaion for pCurMb encoding
- *
- * \date		05/19/2009 Created
- *************************************************************************************
- */
-
-#include <stdio.h>	//test use for file operation
-#include <string.h>
-
-#include "svc_encode_mb.h"
-#include "encode_mb_aux.h"
-#include "decode_mb_aux.h"
-#include "ls_defines.h"
-#include "cpu_core.h"
-#include "as264_common.h"
-#include "mb_cache.h"
-#include "array_stack_align.h"
-
-namespace WelsSVCEnc {
-void WelsDctMb(int16_t* pRes, uint8_t* pEncMb, int32_t iEncStride, uint8_t* pBestPred, PDctFunc pfDctFourT4)
-{
-    pfDctFourT4(pRes,			    pEncMb,							    iEncStride, pBestPred,			16);
-	pfDctFourT4(pRes + 64,		pEncMb + 8,						    iEncStride, pBestPred + 8,		16);
-	pfDctFourT4(pRes + 128,	pEncMb + 8 * iEncStride,		iEncStride, pBestPred + 128,	16);
-	pfDctFourT4(pRes + 192,	pEncMb + 8 * iEncStride + 8,	iEncStride, pBestPred + 136,	16);
-}
-
-void WelsEncRecI16x16Y(sWelsEncCtx *pEncCtx, SMB *pCurMb, SMbCache *pMbCache)
-{
-	ENFORCE_STACK_ALIGN_1D(int16_t, aDctT4Dc, 16, 16)
-	SWelsFuncPtrList *pFuncList	= pEncCtx->pFuncList;
-	SDqLayer* pCurDqLayer	    = pEncCtx->pCurDqLayer;
-	const int32_t kiEncStride	        = pCurDqLayer->iEncStride[0];	
-	int16_t *pRes				     	= pMbCache->pCoeffLevel;
-	uint8_t *pPred				        = pMbCache->SPicData.pCsMb[0];
-	const int32_t kiRecStride     	= pCurDqLayer->iCsStride[0];
-	int16_t *pBlock				        = pMbCache->pDct->iLumaBlock[0]; 
-	uint8_t *pBestPred		    	= pMbCache->pMemPredLuma;
-	const uint8_t* kpNoneZeroCountIdx	= &g_kuiMbCountScan4Idx[0];	
-	uint8_t i, uiQp						    = pCurMb->uiLumaQp;
-	uint32_t uiNoneZeroCount, uiNoneZeroCountMbAc				= 0, uiCountI16x16Dc;
-
-	int16_t* pMF = g_kiQuantMF[uiQp], *pFF	= g_iQuantIntraFF[uiQp];
-
-	WelsDctMb(pRes,  pMbCache->SPicData.pEncMb[0], kiEncStride, pBestPred, pEncCtx->pFuncList->pfDctFourT4);
-
-	pFuncList->pfTransformHadamard4x4Dc(aDctT4Dc, pRes);
-	pFuncList->pfQuantizationDc4x4( aDctT4Dc, pFF[0]<<1, pMF[0]>>1);
-	pFuncList->pfScan4x4( pMbCache->pDct->iLumaI16x16Dc, aDctT4Dc);
-	uiCountI16x16Dc = pFuncList->pfGetNoneZeroCount(pMbCache->pDct->iLumaI16x16Dc);
-
-	for(i = 0; i < 4; i++)
-	{	
-		pFuncList->pfQuantizationFour4x4(pRes, pFF,  pMF);
-		pFuncList->pfScan4x4Ac(pBlock,		pRes		);
-		pFuncList->pfScan4x4Ac(pBlock + 16, pRes + 16	);
-		pFuncList->pfScan4x4Ac(pBlock + 32, pRes + 32	);
-		pFuncList->pfScan4x4Ac(pBlock + 48, pRes + 48	);
-		pRes += 64; 
-		pBlock += 64;	
-	}
-	pRes -= 256;
-	pBlock -= 256;
-
-	for(i=0; i<16; i++)	{
-		uiNoneZeroCount = pFuncList->pfGetNoneZeroCount(pBlock);
-		pCurMb->pNonZeroCount[*kpNoneZeroCountIdx++] = uiNoneZeroCount;
-		uiNoneZeroCountMbAc += uiNoneZeroCount;
-		pBlock += 16;
-	}	
-
-	if( uiCountI16x16Dc > 0 ){
-		if(uiQp < 12) 		
-		{
-			WelsIHadamard4x4Dc(aDctT4Dc);
-			WelsDequantLumaDc4x4(aDctT4Dc, uiQp);
-		}
-		else
-			pFuncList->pfDequantizationIHadamard4x4(aDctT4Dc, g_kuiDequantCoeff[uiQp][0]>>2);
-	}
-
-	if( uiNoneZeroCountMbAc > 0 )
-	{		
-		pCurMb->uiCbp = 15;	
-        pFuncList->pfDequantizationFour4x4(pRes, g_kuiDequantCoeff[uiQp]);
-		pFuncList->pfDequantizationFour4x4(pRes+64, g_kuiDequantCoeff[uiQp]);
-		pFuncList->pfDequantizationFour4x4(pRes+128, g_kuiDequantCoeff[uiQp]);
-		pFuncList->pfDequantizationFour4x4(pRes+192, g_kuiDequantCoeff[uiQp]);
-
-		pRes[0]  = aDctT4Dc[0];		pRes[16] = aDctT4Dc[1];  
-		pRes[32] = aDctT4Dc[4];		pRes[48] = aDctT4Dc[5];  
-		pRes[64] = aDctT4Dc[2];		pRes[80] = aDctT4Dc[3];  
-		pRes[96] = aDctT4Dc[6];		pRes[112]= aDctT4Dc[7];  
-		pRes[128]= aDctT4Dc[8];		pRes[144]= aDctT4Dc[9];  
-		pRes[160]= aDctT4Dc[12];		pRes[176]= aDctT4Dc[13]; 
-		pRes[192]= aDctT4Dc[10];		pRes[208]= aDctT4Dc[11]; 
-		pRes[224]= aDctT4Dc[14];		pRes[240]= aDctT4Dc[15]; 
-
-		pFuncList->pfIDctFourT4(pPred,					              kiRecStride, pBestPred,		       16, pRes		);
-		pFuncList->pfIDctFourT4(pPred + 8,				          kiRecStride, pBestPred + 8,	   16, pRes + 64 );
-		pFuncList->pfIDctFourT4(pPred + kiRecStride*8,	      kiRecStride, pBestPred + 128,  16, pRes + 128);
-		pFuncList->pfIDctFourT4(pPred + kiRecStride*8 + 8, kiRecStride, pBestPred + 136,  16, pRes + 192);
-	} 
-	else if( uiCountI16x16Dc > 0 ){
-		pFuncList->pfIDctI16x16Dc(pPred,	kiRecStride, pBestPred,	16, aDctT4Dc);
-	} 
-	else{	
-		pFuncList->pfCopy16x16Aligned(pPred, kiRecStride, pBestPred, 16);
-	}
-}
-void WelsEncRecI4x4Y( sWelsEncCtx *pEncCtx, SMB *pCurMb, SMbCache *pMbCache, uint8_t uiI4x4Idx)
-{
-	SWelsFuncPtrList *pFuncList	= pEncCtx->pFuncList;
-	SDqLayer* pCurDqLayer		= pEncCtx->pCurDqLayer;
-	int32_t iEncStride			= pCurDqLayer->iEncStride[0];
-	uint8_t uiQp					= pCurMb->uiLumaQp;
-
-	int16_t *pResI4x4 = pMbCache->pCoeffLevel;
-	uint8_t *pPredI4x4;
-
-	uint8_t *pPred     = pMbCache->SPicData.pCsMb[0];
-	int32_t iRecStride = pCurDqLayer->iCsStride[0];
-
-	uint32_t uiOffset = g_kuiMbCountScan4Idx[uiI4x4Idx];
-	uint8_t* pEncMb = pMbCache->SPicData.pEncMb[0];
-	uint8_t *pBestPred = pMbCache->pBestPredI4x4Blk4;
-	int16_t* pBlock = pMbCache->pDct->iLumaBlock[uiI4x4Idx];
-
-	int16_t *pMF = g_kiQuantMF[uiQp], *pFF = g_iQuantIntraFF[uiQp];
-
-	int32_t *pStrideEncBlockOffset = pEncCtx->pStrideTab->pStrideEncBlockOffset[pEncCtx->uiDependencyId];
-	int32_t *pStrideDecBlockOffset = pEncCtx->pStrideTab->pStrideDecBlockOffset[pEncCtx->uiDependencyId][0==pEncCtx->uiTemporalId];
-	int32_t iNoneZeroCount = 0;
-
-	pFuncList->pfDctT4( pResI4x4, &(pEncMb[pStrideEncBlockOffset[uiI4x4Idx]]), iEncStride, pBestPred, 4 );
-	pFuncList->pfQuantization4x4(pResI4x4, pFF, pMF);
-	pFuncList->pfScan4x4(pBlock, pResI4x4);
-	
-	iNoneZeroCount = pFuncList->pfGetNoneZeroCount(pBlock);
-	pCurMb->pNonZeroCount[uiOffset] = iNoneZeroCount;
-
-	pPredI4x4 = pPred + pStrideDecBlockOffset[uiI4x4Idx]; 
-	if ( iNoneZeroCount > 0 )
-	{
-		pCurMb->uiCbp |= 1 << (uiI4x4Idx>>2);
-		pFuncList->pfDequantization4x4( pResI4x4, g_kuiDequantCoeff[uiQp]);
-		pFuncList->pfIDctT4(pPredI4x4, iRecStride, pBestPred, 4, pResI4x4);
-	}
-	else
-		WelsCopy4x4(pPredI4x4, iRecStride, pBestPred, 4);
-}
-
-void WelsEncInterY(SWelsFuncPtrList *pFuncList, SMB * pCurMb, SMbCache *pMbCache)
-{    
-	PQuantizationMaxFunc pfQuantizationFour4x4Max	= pFuncList->pfQuantizationFour4x4Max;
-	PSetMemoryZero pfSetMemZeroSize8				        = pFuncList->pfSetMemZeroSize8;
-	PSetMemoryZero pfSetMemZeroSize64			        = pFuncList->pfSetMemZeroSize64;
-	PScanFunc pfScan4x4			                                    = pFuncList->pfScan4x4;
-	PCalculateSingleCtrFunc pfCalculateSingleCtr4x4		= pFuncList->pfCalculateSingleCtr4x4;
-	PGetNoneZeroCountFunc pfGetNoneZeroCount	    = pFuncList->pfGetNoneZeroCount;
-	PDeQuantizationFunc pfDequantizationFour4x4		= pFuncList->pfDequantizationFour4x4;
-	int16_t *pRes					                                    = pMbCache->pCoeffLevel;
-	int32_t iSingleCtrMb		= 0, iSingleCtr8x8[4];
-	int16_t* pBlock				= pMbCache->pDct->iLumaBlock[0]; 
-	uint8_t uiQp					= pCurMb->uiLumaQp;
-	int16_t *pMF					= g_kiQuantMF[uiQp], *pFF = g_kiQuantInterFF[uiQp], aMax[16];
-	int32_t i, j, iNoneZeroCountMbDcAc	= 0, iNoneZeroCount=0;	
-
-	for(i = 0; i < 4; i++)
-	{	
-		pfQuantizationFour4x4Max(pRes, pFF,  pMF, aMax+(i<<2));
-		iSingleCtr8x8[i] = 0;
-		for(j = 0; j < 4; j++)
-		{
-			if(aMax[(i<<2)+j] == 0)
-				pfSetMemZeroSize8(pBlock, 32);
-			else	
-			{
-				pfScan4x4(pBlock, pRes);		
-				if(aMax[(i<<2)+j] > 1)
-					iSingleCtr8x8[i] += 9;	
-				else if(iSingleCtr8x8[i] < 6)
-					iSingleCtr8x8[i] += pfCalculateSingleCtr4x4(pBlock);
-			}
-			pRes += 16; 
-			pBlock += 16;	
-		}
-		iSingleCtrMb += iSingleCtr8x8[i];
-	}
-	pBlock -= 256;
-	pRes -= 256;
-
-	memset(pCurMb->pNonZeroCount, 0, 16);  
-    
-   
-	if( iSingleCtrMb < 6 )	 //from JVT-O079
-    {		
-		iNoneZeroCountMbDcAc = 0;
-		pfSetMemZeroSize64( pRes,  768 );	// confirmed_safe_unsafe_usage
-	}
-	else
-	{
-		const uint8_t* kpNoneZeroCountIdx = g_kuiMbCountScan4Idx;
-		for(i = 0; i < 4; i++)
-		{
-			if( iSingleCtr8x8[i] >= 4 ){				
-				for( j = 0; j < 4; j++ ){
-					iNoneZeroCount = pfGetNoneZeroCount(pBlock);
-					pCurMb->pNonZeroCount[*kpNoneZeroCountIdx++] = iNoneZeroCount;
-					iNoneZeroCountMbDcAc += iNoneZeroCount;
-       				pBlock += 16; 
-				}
-				pfDequantizationFour4x4(pRes, g_kuiDequantCoeff[uiQp]);
-				pCurMb->uiCbp |= 1 << i;
-			}
-			else {	// set zero for an 8x8 pBlock
-				pfSetMemZeroSize64(pRes, 128);	// confirmed_safe_unsafe_usage
-				kpNoneZeroCountIdx += 4;
-				pBlock += 64; 
-			}	
-			pRes += 64;
-		}
-	}
-}
-
-void    WelsEncRecUV(SWelsFuncPtrList *pFuncList, SMB * pCurMb, SMbCache *pMbCache, int16_t * pRes, int32_t iUV)
-{
-	PQuantizationHadamardFunc pfQuantizationHadamard2x2		= pFuncList->pfQuantizationHadamard2x2;
-	PQuantizationMaxFunc pfQuantizationFour4x4Max	= pFuncList->pfQuantizationFour4x4Max;	
-	PSetMemoryZero pfSetMemZeroSize8				        = pFuncList->pfSetMemZeroSize8;
-	PSetMemoryZero pfSetMemZeroSize64				    = pFuncList->pfSetMemZeroSize64;
-	PScanFunc pfScan4x4Ac		                                	= pFuncList->pfScan4x4Ac;
-	PCalculateSingleCtrFunc pfCalculateSingleCtr4x4		= pFuncList->pfCalculateSingleCtr4x4;
-	PGetNoneZeroCountFunc pfGetNoneZeroCount	    = pFuncList->pfGetNoneZeroCount;
-	PDeQuantizationFunc pfDequantizationFour4x4		= pFuncList->pfDequantizationFour4x4;
-	const int32_t kiInterFlag				                            = !IS_INTRA( pCurMb->uiMbType);
-	const uint8_t	kiQp                                                   = pCurMb->uiChromaQp;
-	uint8_t i, uiNoneZeroCount, uiNoneZeroCountMbAc	= 0, uiNoneZeroCountMbDc = 0;
-	uint8_t uiNoneZeroCountOffset	                            = (iUV - 1)<<1;	//UV==1 or 2 
-	uint8_t uiSubMbIdx				                                = 16 + ((iUV - 1)<<2);			//uiSubMbIdx == 16 or 20
-	int16_t* iChromaDc			= pMbCache->pDct->iChromaDc[iUV-1], *pBlock = pMbCache->pDct->iChromaBlock[(iUV - 1)<<2];		
-	int16_t aDct2x2[4], j, aMax[4];
-	int32_t iSingleCtr8x8		= 0;
-	int16_t* pMF = g_kiQuantMF[kiQp], *pFF = g_kiQuantInterFF[(!kiInterFlag)*6+kiQp];
-
-	uiNoneZeroCountMbDc = pfQuantizationHadamard2x2(pRes, pFF[0]<<1, pMF[0]>>1, aDct2x2, iChromaDc);
-
-	pfQuantizationFour4x4Max(pRes, pFF,  pMF, aMax);
-
-	for(j = 0; j < 4; j++)
-	{	
-		if(aMax[j] == 0)
-			pfSetMemZeroSize8(pBlock, 32);
-		else	
-		{
-			pfScan4x4Ac(pBlock, pRes);	
-			if(kiInterFlag)				
-			{
-				if(aMax[j] > 1)
-					iSingleCtr8x8 += 9;	
-				else if(iSingleCtr8x8 < 7)
-					iSingleCtr8x8 += pfCalculateSingleCtr4x4(pBlock);
-			}
-			else
-				iSingleCtr8x8 = INT_MAX;
-		}
-		pRes += 16; 
-		pBlock += 16;	
-	}	
-	pRes -= 64;
-
-	if(  iSingleCtr8x8 < 7 )	//from JVT-O079
-	{		
-		pfSetMemZeroSize64(pRes, 128);	// confirmed_safe_unsafe_usage
-		ST16( &pCurMb->pNonZeroCount[16+uiNoneZeroCountOffset], 0 );
-		ST16( &pCurMb->pNonZeroCount[20+uiNoneZeroCountOffset], 0 );
-	}
-	else
-	{
-		const uint8_t* kpNoneZeroCountIdx = &g_kuiMbCountScan4Idx[uiSubMbIdx];
-		pBlock -= 64;
-		for(i=0; i<4; i++){
-			uiNoneZeroCount = pfGetNoneZeroCount(pBlock);
-			pCurMb->pNonZeroCount[*kpNoneZeroCountIdx++] = uiNoneZeroCount;
-			uiNoneZeroCountMbAc += uiNoneZeroCount;
-			pBlock += 16;
-		}
-		pfDequantizationFour4x4(pRes, g_kuiDequantCoeff[pCurMb->uiChromaQp]);
-		pCurMb->uiCbp &= 0x0F;
-		pCurMb->uiCbp |= 0x20;	
-	}
-
-	if (uiNoneZeroCountMbDc > 0)
-	{	
-		WelsDequantIHadamard2x2Dc(aDct2x2, g_kuiDequantCoeff[kiQp][0] >> 1);
-		if ( 2 != (pCurMb->uiCbp >> 4) )
-			pCurMb->uiCbp |= (0x01 << 4) ;
- 		pRes[0]	= aDct2x2[0];
- 		pRes[16]	= aDct2x2[1];
- 		pRes[32]	= aDct2x2[2];
- 		pRes[48]	= aDct2x2[3];                   
-	}
-}
-
-
-void    WelsRecPskip(SDqLayer *pCurLayer, SWelsFuncPtrList *pFuncList, SMB * pCurMb, SMbCache *pMbCache)
-{	
-	int32_t* iRecStride	= pCurLayer->iCsStride;
-	uint8_t** pCsMb		= &pMbCache->SPicData.pCsMb[0];
-
-	pFuncList->pfCopy16x16Aligned(pCsMb[0],	*iRecStride++,	pMbCache->pSkipMb,		16);
-	pFuncList->pfCopy8x8Aligned(	pCsMb[1],	*iRecStride++,	pMbCache->pSkipMb + 256,	8);
-	pFuncList->pfCopy8x8Aligned(	pCsMb[2],	*iRecStride,	pMbCache->pSkipMb + 320,	8);
-	pFuncList->pfSetMemZeroSize8(		pCurMb->pNonZeroCount,	24 );
-}
-
-BOOL_T WelsTryPYskip(sWelsEncCtx * pEncCtx, SMB * pCurMb, SMbCache *pMbCache)
-{
-	int32_t iSingleCtrMb	= 0;
-	int16_t *pRes = pMbCache->pCoeffLevel;
-	const uint8_t kuiQp = pCurMb->uiLumaQp;	
-
-	int16_t* pBlock = pMbCache->pDct->iLumaBlock[0];		
-	uint16_t aMax[4], i, j;
-	int16_t* pMF = g_kiQuantMF[kuiQp], *pFF = g_kiQuantInterFF[kuiQp];
-
-    for(i = 0; i < 4; i++)
-    {		
-		pEncCtx->pFuncList->pfQuantizationFour4x4Max(pRes, pFF,  pMF, (int16_t*)aMax);	
-
-		for(j = 0; j < 4; j++)
-		{	
-			if(aMax[j] > 1) return FALSE;	// iSingleCtrMb += 9, can't be P_SKIP				
-			else if( aMax[j] == 1) 
-			{	
-				pEncCtx->pFuncList->pfScan4x4(pBlock, pRes); //
-				iSingleCtrMb += pEncCtx->pFuncList->pfCalculateSingleCtr4x4(pBlock);	
-			}		
-			if(iSingleCtrMb >= 6) 	return FALSE; //from JVT-O079
-			pRes += 16; 
-			pBlock += 16;
-		}
-	}	
-    return TRUE;
-}
-
-BOOL_T    WelsTryPUVskip(sWelsEncCtx * pEncCtx, SMB * pCurMb, SMbCache *pMbCache, int32_t iUV)
-{
-	int16_t* pRes = ((iUV == 1) ? &(pMbCache->pCoeffLevel[256]):&(pMbCache->pCoeffLevel[256+64]));	
-
-	const uint8_t kuiQp = g_kuiChromaQpTable[CLIP3_QP_0_51(pCurMb->uiLumaQp + pEncCtx->pCurDqLayer->sLayerInfo.pPpsP->uiChromaQpIndexOffset)];
-
-	int16_t* pMF = g_kiQuantMF[kuiQp], *pFF = g_kiQuantInterFF[kuiQp];
-
-	if(pEncCtx->pFuncList->pfQuantizationHadamard2x2Skip(pRes, pFF[0]<<1, pMF[0]>>1))
-		return FALSE;
-	else
-	{
-		uint16_t aMax[4], j;
-		int32_t iSingleCtrMb = 0;
-		int16_t* pBlock = pMbCache->pDct->iChromaBlock[(iUV-1)<<2];
-		pEncCtx->pFuncList->pfQuantizationFour4x4Max(pRes, pFF,  pMF, (int16_t*)aMax);
-
-		for(j = 0; j < 4; j++)
-		{
-			if( aMax[j] > 1)		return FALSE;	// iSingleCtrMb += 9, can't be P_SKIP			
-			else if( aMax[j] == 1)
-			{	
-				pEncCtx->pFuncList->pfScan4x4Ac(pBlock, pRes);				
-				iSingleCtrMb += pEncCtx->pFuncList->pfCalculateSingleCtr4x4(pBlock);	
-			}		
-			if(iSingleCtrMb >= 7) return FALSE; //from JVT-O079
-			pRes += 16; 
-			pBlock += 16;	
-		}
-		return TRUE;
-	}
-}
-
-} // namespace WelsSVCEnc
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file		svc_encode_mb.cpp
+ *
+ * \brief		Implementation for pCurMb encoding
+ *
+ * \date		05/19/2009 Created
+ *************************************************************************************
+ */
+
+#include <stdio.h>	//test use for file operation
+#include <string.h>
+
+#include "svc_encode_mb.h"
+#include "encode_mb_aux.h"
+#include "decode_mb_aux.h"
+#include "ls_defines.h"
+#include "cpu_core.h"
+#include "as264_common.h"
+#include "mb_cache.h"
+#include "array_stack_align.h"
+
+namespace WelsSVCEnc {
+void WelsDctMb (int16_t* pRes, uint8_t* pEncMb, int32_t iEncStride, uint8_t* pBestPred, PDctFunc pfDctFourT4) {
+  // Run pfDctFourT4 once per 8x8 quadrant (top-left, top-right, bottom-left, bottom-right); 64 coefficients and a 16-wide prediction row per quadrant.
+  for (int32_t iQuad = 0; iQuad < 4; ++iQuad)
+    pfDctFourT4 (pRes + (iQuad << 6), pEncMb + (iQuad >> 1) * 8 * iEncStride + (iQuad & 1) * 8, iEncStride,
+                 pBestPred + (iQuad >> 1) * 128 + (iQuad & 1) * 8, 16);
+}
+
+void WelsEncRecI16x16Y (sWelsEncCtx* pEncCtx, SMB* pCurMb, SMbCache* pMbCache) {  // I16x16 luma: transform, quantize, scan, then reconstruct via pCsMb[0]
+  ENFORCE_STACK_ALIGN_1D (int16_t, aDctT4Dc, 16, 16)  // presumably declares the aligned 16-entry DC buffer aDctT4Dc - confirm against the macro
+  SWelsFuncPtrList* pFuncList	= pEncCtx->pFuncList;
+  SDqLayer* pCurDqLayer	    = pEncCtx->pCurDqLayer;
+  const int32_t kiEncStride	        = pCurDqLayer->iEncStride[0];
+  int16_t* pRes				     	= pMbCache->pCoeffLevel;
+  uint8_t* pPred				        = pMbCache->SPicData.pCsMb[0];
+  const int32_t kiRecStride     	= pCurDqLayer->iCsStride[0];
+  int16_t* pBlock				        = pMbCache->pDct->iLumaBlock[0];
+  uint8_t* pBestPred		    	= pMbCache->pMemPredLuma;
+  const uint8_t* kpNoneZeroCountIdx	= &g_kuiMbCountScan4Idx[0];  // maps the i-th processed 4x4 block to its pNonZeroCount slot
+  uint8_t i, uiQp						    = pCurMb->uiLumaQp;
+  uint32_t uiNoneZeroCount, uiNoneZeroCountMbAc				= 0, uiCountI16x16Dc;
+
+  int16_t* pMF = g_kiQuantMF[uiQp], *pFF	= g_iQuantIntraFF[uiQp];  // per-QP quantization tables (intra FF variant)
+
+  WelsDctMb (pRes,  pMbCache->SPicData.pEncMb[0], kiEncStride, pBestPred, pEncCtx->pFuncList->pfDctFourT4);
+
+  pFuncList->pfTransformHadamard4x4Dc (aDctT4Dc, pRes);  // DC path: Hadamard -> DC quantization -> scan -> non-zero count
+  pFuncList->pfQuantizationDc4x4 (aDctT4Dc, pFF[0] << 1, pMF[0]>>1);
+  pFuncList->pfScan4x4 (pMbCache->pDct->iLumaI16x16Dc, aDctT4Dc);
+  uiCountI16x16Dc = pFuncList->pfGetNoneZeroCount (pMbCache->pDct->iLumaI16x16Dc);
+
+  for (i = 0; i < 4; i++) {  // AC path: quantize 64 coefficients per pass, then scan each 16-coefficient block
+    pFuncList->pfQuantizationFour4x4 (pRes, pFF,  pMF);
+    pFuncList->pfScan4x4Ac (pBlock,		pRes);
+    pFuncList->pfScan4x4Ac (pBlock + 16, pRes + 16);
+    pFuncList->pfScan4x4Ac (pBlock + 32, pRes + 32);
+    pFuncList->pfScan4x4Ac (pBlock + 48, pRes + 48);
+    pRes += 64;
+    pBlock += 64;
+  }
+  pRes -= 256;  // rewind both cursors to the start of the macroblock
+  pBlock -= 256;
+
+  for (i = 0; i < 16; i++)	{  // record the per-block non-zero counts and their total
+    uiNoneZeroCount = pFuncList->pfGetNoneZeroCount (pBlock);
+    pCurMb->pNonZeroCount[*kpNoneZeroCountIdx++] = uiNoneZeroCount;
+    uiNoneZeroCountMbAc += uiNoneZeroCount;
+    pBlock += 16;
+  }
+
+  if (uiCountI16x16Dc > 0) {  // reconstruct DC values only when some DC level survived quantization
+    if (uiQp < 12) {  // QP below 12 uses the separate inverse-Hadamard + dequant helpers
+      WelsIHadamard4x4Dc (aDctT4Dc);
+      WelsDequantLumaDc4x4 (aDctT4Dc, uiQp);
+    } else
+      pFuncList->pfDequantizationIHadamard4x4 (aDctT4Dc, g_kuiDequantCoeff[uiQp][0] >> 2);
+  }
+
+  if (uiNoneZeroCountMbAc > 0) {  // some AC level is non-zero: full dequantization and inverse transform
+    pCurMb->uiCbp = 15;
+    pFuncList->pfDequantizationFour4x4 (pRes, g_kuiDequantCoeff[uiQp]);
+    pFuncList->pfDequantizationFour4x4 (pRes + 64, g_kuiDequantCoeff[uiQp]);
+    pFuncList->pfDequantizationFour4x4 (pRes + 128, g_kuiDequantCoeff[uiQp]);
+    pFuncList->pfDequantizationFour4x4 (pRes + 192, g_kuiDequantCoeff[uiQp]);
+
+    pRes[0]  = aDctT4Dc[0];  // write each DC value into coefficient 0 of its 4x4 block (block order differs from aDctT4Dc order)
+    pRes[16] = aDctT4Dc[1];
+    pRes[32] = aDctT4Dc[4];
+    pRes[48] = aDctT4Dc[5];
+    pRes[64] = aDctT4Dc[2];
+    pRes[80] = aDctT4Dc[3];
+    pRes[96] = aDctT4Dc[6];
+    pRes[112] = aDctT4Dc[7];
+    pRes[128] = aDctT4Dc[8];
+    pRes[144] = aDctT4Dc[9];
+    pRes[160] = aDctT4Dc[12];
+    pRes[176] = aDctT4Dc[13];
+    pRes[192] = aDctT4Dc[10];
+    pRes[208] = aDctT4Dc[11];
+    pRes[224] = aDctT4Dc[14];
+    pRes[240] = aDctT4Dc[15];
+
+    pFuncList->pfIDctFourT4 (pPred,					              kiRecStride, pBestPred,		       16, pRes);
+    pFuncList->pfIDctFourT4 (pPred + 8,				          kiRecStride, pBestPred + 8,	   16, pRes + 64);
+    pFuncList->pfIDctFourT4 (pPred + kiRecStride * 8,	      kiRecStride, pBestPred + 128,  16, pRes + 128);
+    pFuncList->pfIDctFourT4 (pPred + kiRecStride * 8 + 8, kiRecStride, pBestPred + 136,  16, pRes + 192);
+  } else if (uiCountI16x16Dc > 0) {  // DC-only macroblock
+    pFuncList->pfIDctI16x16Dc (pPred,	kiRecStride, pBestPred,	16, aDctT4Dc);
+  } else {  // nothing survived quantization: presumably copies the prediction into pPred - confirm argument order
+    pFuncList->pfCopy16x16Aligned (pPred, kiRecStride, pBestPred, 16);
+  }
+}
+void WelsEncRecI4x4Y (sWelsEncCtx* pEncCtx, SMB* pCurMb, SMbCache* pMbCache, uint8_t uiI4x4Idx) {  // one I4x4 luma block: transform, quantize, scan, reconstruct
+  SWelsFuncPtrList* pFuncList	= pEncCtx->pFuncList;
+  SDqLayer* pCurDqLayer		= pEncCtx->pCurDqLayer;
+  int32_t iEncStride			= pCurDqLayer->iEncStride[0];
+  uint8_t uiQp					= pCurMb->uiLumaQp;
+
+  int16_t* pResI4x4 = pMbCache->pCoeffLevel;  // always the start of pCoeffLevel, regardless of uiI4x4Idx
+  uint8_t* pPredI4x4;
+
+  uint8_t* pPred     = pMbCache->SPicData.pCsMb[0];
+  int32_t iRecStride = pCurDqLayer->iCsStride[0];
+
+  uint32_t uiOffset = g_kuiMbCountScan4Idx[uiI4x4Idx];  // pNonZeroCount slot for this block
+  uint8_t* pEncMb = pMbCache->SPicData.pEncMb[0];
+  uint8_t* pBestPred = pMbCache->pBestPredI4x4Blk4;
+  int16_t* pBlock = pMbCache->pDct->iLumaBlock[uiI4x4Idx];
+
+  int16_t* pMF = g_kiQuantMF[uiQp], *pFF = g_iQuantIntraFF[uiQp];
+
+  int32_t* pStrideEncBlockOffset = pEncCtx->pStrideTab->pStrideEncBlockOffset[pEncCtx->uiDependencyId];
+  int32_t* pStrideDecBlockOffset = pEncCtx->pStrideTab->pStrideDecBlockOffset[pEncCtx->uiDependencyId][0 ==
+                                   pEncCtx->uiTemporalId];  // second index is 1 when uiTemporalId == 0, else 0
+  int32_t iNoneZeroCount = 0;
+
+  pFuncList->pfDctT4 (pResI4x4, & (pEncMb[pStrideEncBlockOffset[uiI4x4Idx]]), iEncStride, pBestPred, 4);
+  pFuncList->pfQuantization4x4 (pResI4x4, pFF, pMF);
+  pFuncList->pfScan4x4 (pBlock, pResI4x4);
+
+  iNoneZeroCount = pFuncList->pfGetNoneZeroCount (pBlock);
+  pCurMb->pNonZeroCount[uiOffset] = iNoneZeroCount;
+
+  pPredI4x4 = pPred + pStrideDecBlockOffset[uiI4x4Idx];
+  if (iNoneZeroCount > 0) {  // coded block: flag its 8x8 group in uiCbp, then dequantize and inverse-transform
+    pCurMb->uiCbp |= 1 << (uiI4x4Idx >> 2);
+    pFuncList->pfDequantization4x4 (pResI4x4, g_kuiDequantCoeff[uiQp]);
+    pFuncList->pfIDctT4 (pPredI4x4, iRecStride, pBestPred, 4, pResI4x4);
+  } else  // all-zero block: presumably copies the 4x4 prediction into pPredI4x4 - confirm argument order
+    WelsCopy4x4 (pPredI4x4, iRecStride, pBestPred, 4);
+}
+
+void WelsEncInterY (SWelsFuncPtrList* pFuncList, SMB* pCurMb, SMbCache* pMbCache) {  // inter luma: quantize, scan, drop low-cost coefficients, dequantize
+  PQuantizationMaxFunc pfQuantizationFour4x4Max	= pFuncList->pfQuantizationFour4x4Max;
+  PSetMemoryZero pfSetMemZeroSize8				        = pFuncList->pfSetMemZeroSize8;
+  PSetMemoryZero pfSetMemZeroSize64			        = pFuncList->pfSetMemZeroSize64;
+  PScanFunc pfScan4x4			                                    = pFuncList->pfScan4x4;
+  PCalculateSingleCtrFunc pfCalculateSingleCtr4x4		= pFuncList->pfCalculateSingleCtr4x4;
+  PGetNoneZeroCountFunc pfGetNoneZeroCount	    = pFuncList->pfGetNoneZeroCount;
+  PDeQuantizationFunc pfDequantizationFour4x4		= pFuncList->pfDequantizationFour4x4;
+  int16_t* pRes					                                    = pMbCache->pCoeffLevel;
+  int32_t iSingleCtrMb		= 0, iSingleCtr8x8[4];  // accumulated cost for the whole MB and for each 8x8 group
+  int16_t* pBlock				= pMbCache->pDct->iLumaBlock[0];
+  uint8_t uiQp					= pCurMb->uiLumaQp;
+  int16_t* pMF					= g_kiQuantMF[uiQp], *pFF = g_kiQuantInterFF[uiQp], aMax[16];  // aMax: one value per 4x4 block, written by pfQuantizationFour4x4Max
+  int32_t i, j, iNoneZeroCountMbDcAc	= 0, iNoneZeroCount = 0;  // NOTE(review): iNoneZeroCountMbDcAc is accumulated but never read in this function
+
+  for (i = 0; i < 4; i++) {  // one pass per 8x8 group (four 4x4 blocks)
+    pfQuantizationFour4x4Max (pRes, pFF,  pMF, aMax + (i << 2));
+    iSingleCtr8x8[i] = 0;
+    for (j = 0; j < 4; j++) {
+      if (aMax[ (i << 2) + j] == 0)  // block reported as 0: clear its scanned block instead of scanning
+        pfSetMemZeroSize8 (pBlock, 32);
+      else {
+        pfScan4x4 (pBlock, pRes);
+        if (aMax[ (i << 2) + j] > 1)  // a value above 1 adds a fixed cost of 9
+          iSingleCtr8x8[i] += 9;
+        else if (iSingleCtr8x8[i] < 6)  // otherwise add the computed cost while the group total is still under 6
+          iSingleCtr8x8[i] += pfCalculateSingleCtr4x4 (pBlock);
+      }
+      pRes += 16;
+      pBlock += 16;
+    }
+    iSingleCtrMb += iSingleCtr8x8[i];
+  }
+  pBlock -= 256;  // rewind both cursors to the start of the macroblock
+  pRes -= 256;
+
+  memset (pCurMb->pNonZeroCount, 0, 16);
+
+
+  if (iSingleCtrMb < 6) {  //from JVT-O079
+    iNoneZeroCountMbDcAc = 0;
+    pfSetMemZeroSize64 (pRes,  768);	// confirmed_safe_unsafe_usage
+  } else {
+    const uint8_t* kpNoneZeroCountIdx = g_kuiMbCountScan4Idx;
+    for (i = 0; i < 4; i++) {
+      if (iSingleCtr8x8[i] >= 4) {  // keep this 8x8 group: record counts, dequantize, set its uiCbp bit
+        for (j = 0; j < 4; j++) {
+          iNoneZeroCount = pfGetNoneZeroCount (pBlock);
+          pCurMb->pNonZeroCount[*kpNoneZeroCountIdx++] = iNoneZeroCount;
+          iNoneZeroCountMbDcAc += iNoneZeroCount;
+          pBlock += 16;
+        }
+        pfDequantizationFour4x4 (pRes, g_kuiDequantCoeff[uiQp]);
+        pCurMb->uiCbp |= 1 << i;
+      } else {	// set zero for an 8x8 pBlock
+        pfSetMemZeroSize64 (pRes, 128);	// confirmed_safe_unsafe_usage
+        kpNoneZeroCountIdx += 4;  // skip the four count slots; they stay 0 from the memset above
+        pBlock += 64;
+      }
+      pRes += 64;
+    }
+  }
+}
+
+void    WelsEncRecUV (SWelsFuncPtrList* pFuncList, SMB* pCurMb, SMbCache* pMbCache, int16_t* pRes, int32_t iUV) {  // one chroma plane (iUV 1 or 2): quantize, scan, dequantize
+  PQuantizationHadamardFunc pfQuantizationHadamard2x2		= pFuncList->pfQuantizationHadamard2x2;
+  PQuantizationMaxFunc pfQuantizationFour4x4Max	= pFuncList->pfQuantizationFour4x4Max;
+  PSetMemoryZero pfSetMemZeroSize8				        = pFuncList->pfSetMemZeroSize8;
+  PSetMemoryZero pfSetMemZeroSize64				    = pFuncList->pfSetMemZeroSize64;
+  PScanFunc pfScan4x4Ac		                                	= pFuncList->pfScan4x4Ac;
+  PCalculateSingleCtrFunc pfCalculateSingleCtr4x4		= pFuncList->pfCalculateSingleCtr4x4;
+  PGetNoneZeroCountFunc pfGetNoneZeroCount	    = pFuncList->pfGetNoneZeroCount;
+  PDeQuantizationFunc pfDequantizationFour4x4		= pFuncList->pfDequantizationFour4x4;
+  const int32_t kiInterFlag				                            = !IS_INTRA (pCurMb->uiMbType);
+  const uint8_t	kiQp                                                   = pCurMb->uiChromaQp;
+  uint8_t i, uiNoneZeroCount, uiNoneZeroCountMbAc	= 0, uiNoneZeroCountMbDc = 0;  // NOTE(review): uiNoneZeroCountMbAc is accumulated but never read in this function
+  uint8_t uiNoneZeroCountOffset	                            = (iUV - 1) << 1;	//UV==1 or 2
+  uint8_t uiSubMbIdx				                                = 16 + ((iUV - 1) << 2);			//uiSubMbIdx == 16 or 20
+  int16_t* iChromaDc			= pMbCache->pDct->iChromaDc[iUV - 1], *pBlock = pMbCache->pDct->iChromaBlock[ (iUV - 1) << 2];
+  int16_t aDct2x2[4], j, aMax[4];
+  int32_t iSingleCtr8x8		= 0;
+  int16_t* pMF = g_kiQuantMF[kiQp], *pFF = g_kiQuantInterFF[ (!kiInterFlag) * 6 + kiQp];  // intra MBs index the FF table 6 rows further - presumably the intra offsets, confirm table layout
+
+  uiNoneZeroCountMbDc = pfQuantizationHadamard2x2 (pRes, pFF[0] << 1, pMF[0]>>1, aDct2x2, iChromaDc);  // DC path first; result drives the DC branch at the end
+
+  pfQuantizationFour4x4Max (pRes, pFF,  pMF, aMax);
+
+  for (j = 0; j < 4; j++) {  // per 4x4 block: clear or scan, and accumulate the cost (inter MBs only)
+    if (aMax[j] == 0)
+      pfSetMemZeroSize8 (pBlock, 32);
+    else {
+      pfScan4x4Ac (pBlock, pRes);
+      if (kiInterFlag) {
+        if (aMax[j] > 1)
+          iSingleCtr8x8 += 9;
+        else if (iSingleCtr8x8 < 7)
+          iSingleCtr8x8 += pfCalculateSingleCtr4x4 (pBlock);
+      } else  // intra MB with a non-zero block: force the cost above the threshold below
+        iSingleCtr8x8 = INT_MAX;
+    }
+    pRes += 16;
+    pBlock += 16;
+  }
+  pRes -= 64;  // rewind to the start of this plane's coefficients
+
+  if (iSingleCtr8x8 < 7) {	//from JVT-O079
+    pfSetMemZeroSize64 (pRes, 128);	// confirmed_safe_unsafe_usage
+    ST16 (&pCurMb->pNonZeroCount[16 + uiNoneZeroCountOffset], 0);
+    ST16 (&pCurMb->pNonZeroCount[20 + uiNoneZeroCountOffset], 0);
+  } else {
+    const uint8_t* kpNoneZeroCountIdx = &g_kuiMbCountScan4Idx[uiSubMbIdx];
+    pBlock -= 64;  // rewind the scanned-block cursor; only this branch reads pBlock again
+    for (i = 0; i < 4; i++) {
+      uiNoneZeroCount = pfGetNoneZeroCount (pBlock);
+      pCurMb->pNonZeroCount[*kpNoneZeroCountIdx++] = uiNoneZeroCount;
+      uiNoneZeroCountMbAc += uiNoneZeroCount;
+      pBlock += 16;
+    }
+    pfDequantizationFour4x4 (pRes, g_kuiDequantCoeff[pCurMb->uiChromaQp]);
+    pCurMb->uiCbp &= 0x0F;  // replace the upper bits of uiCbp with the value 2
+    pCurMb->uiCbp |= 0x20;
+  }
+
+  if (uiNoneZeroCountMbDc > 0) {  // non-zero DC: dequantize and write it back into coefficient 0 of each 4x4 block
+    WelsDequantIHadamard2x2Dc (aDct2x2, g_kuiDequantCoeff[kiQp][0] >> 1);
+    if (2 != (pCurMb->uiCbp >> 4))  // leave uiCbp alone when the upper bits already hold 2
+      pCurMb->uiCbp |= (0x01 << 4) ;
+    pRes[0]	= aDct2x2[0];
+    pRes[16]	= aDct2x2[1];
+    pRes[32]	= aDct2x2[2];
+    pRes[48]	= aDct2x2[3];
+  }
+}
+
+
+void    WelsRecPskip (SDqLayer* pCurLayer, SWelsFuncPtrList* pFuncList, SMB* pCurMb, SMbCache* pMbCache) {
+  // Copy the three pSkipMb planes (16x16 at +0, 8x8 at +256 and +320) to pCsMb[0..2], then clear 24 non-zero counts.
+  int32_t* pCsStride = pCurLayer->iCsStride;
+  uint8_t** ppCsPlane = &pMbCache->SPicData.pCsMb[0];
+  pFuncList->pfCopy16x16Aligned (ppCsPlane[0], pCsStride[0], pMbCache->pSkipMb, 16);
+  pFuncList->pfCopy8x8Aligned (ppCsPlane[1], pCsStride[1], pMbCache->pSkipMb + 256, 8);
+  pFuncList->pfCopy8x8Aligned (ppCsPlane[2], pCsStride[2], pMbCache->pSkipMb + 320, 8);
+  pFuncList->pfSetMemZeroSize8 (pCurMb->pNonZeroCount, 24);
+}
+
+BOOL_T WelsTryPYskip (sWelsEncCtx* pEncCtx, SMB* pCurMb, SMbCache* pMbCache) {  // TRUE when the quantized luma stays under the skip thresholds below
+  int32_t iSingleCtrMb	= 0;  // accumulated cost over the blocks visited so far
+  int16_t* pRes = pMbCache->pCoeffLevel;
+  const uint8_t kuiQp = pCurMb->uiLumaQp;
+
+  int16_t* pBlock = pMbCache->pDct->iLumaBlock[0];
+  uint16_t aMax[4], i, j;  // aMax is refilled on every outer pass (one value per 4x4 block)
+  int16_t* pMF = g_kiQuantMF[kuiQp], *pFF = g_kiQuantInterFF[kuiQp];
+
+  for (i = 0; i < 4; i++) {  // one pass per group of four 4x4 blocks
+    pEncCtx->pFuncList->pfQuantizationFour4x4Max (pRes, pFF,  pMF, (int16_t*)aMax);
+
+    for (j = 0; j < 4; j++) {
+      if (aMax[j] > 1) return FALSE;	// iSingleCtrMb += 9, can't be P_SKIP
+      else if (aMax[j] == 1) {  // only blocks reported as exactly 1 are scanned and costed
+        pEncCtx->pFuncList->pfScan4x4 (pBlock, pRes); //
+        iSingleCtrMb += pEncCtx->pFuncList->pfCalculateSingleCtr4x4 (pBlock);
+      }
+      if (iSingleCtrMb >= 6) 	return FALSE; //from JVT-O079
+      pRes += 16;
+      pBlock += 16;
+    }
+  }
+  return TRUE;
+}
+
+BOOL_T    WelsTryPUVskip (sWelsEncCtx* pEncCtx, SMB* pCurMb, SMbCache* pMbCache, int32_t iUV) {  // TRUE when one chroma plane stays under the skip thresholds below
+  int16_t* pRes = ((iUV == 1) ? & (pMbCache->pCoeffLevel[256]) : & (pMbCache->pCoeffLevel[256 + 64]));  // offset 256 for iUV == 1, offset 320 otherwise
+
+  const uint8_t kuiQp = g_kuiChromaQpTable[CLIP3_QP_0_51 (pCurMb->uiLumaQp +
+                        pEncCtx->pCurDqLayer->sLayerInfo.pPpsP->uiChromaQpIndexOffset)];  // chroma QP looked up from luma QP plus the PPS offset
+
+  int16_t* pMF = g_kiQuantMF[kuiQp], *pFF = g_kiQuantInterFF[kuiQp];
+
+  if (pEncCtx->pFuncList->pfQuantizationHadamard2x2Skip (pRes, pFF[0] << 1, pMF[0]>>1))  // a non-zero result from the DC check rules out skip
+    return FALSE;
+  else {
+    uint16_t aMax[4], j;
+    int32_t iSingleCtrMb = 0;  // accumulated cost over the blocks visited so far
+    int16_t* pBlock = pMbCache->pDct->iChromaBlock[ (iUV - 1) << 2];
+    pEncCtx->pFuncList->pfQuantizationFour4x4Max (pRes, pFF,  pMF, (int16_t*)aMax);
+
+    for (j = 0; j < 4; j++) {
+      if (aMax[j] > 1)		return FALSE;	// iSingleCtrMb += 9, can't be P_SKIP
+      else if (aMax[j] == 1) {  // only blocks reported as exactly 1 are scanned and costed
+        pEncCtx->pFuncList->pfScan4x4Ac (pBlock, pRes);
+        iSingleCtrMb += pEncCtx->pFuncList->pfCalculateSingleCtr4x4 (pBlock);
+      }
+      if (iSingleCtrMb >= 7) return FALSE; //from JVT-O079
+      pRes += 16;
+      pBlock += 16;
+    }
+    return TRUE;
+  }
+}
+
+} // namespace WelsSVCEnc
--- a/codec/encoder/core/src/svc_encode_slice.cpp
+++ b/codec/encoder/core/src/svc_encode_slice.cpp
@@ -1,1194 +1,1120 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	svc_encode_slice.c
- *
- * \brief	svc encoding slice 
- *
- * \date	2009.07.27 Created
- *
- *************************************************************************************
- */
-
-#include <string.h>
-#include <assert.h>
-#include "ls_defines.h"
-#include "svc_encode_slice.h"
-#include "svc_enc_golomb.h"
-#include "svc_base_layer_md.h"
-#include "svc_encode_mb.h"
-#include "mv_pred.h"
-#include "svc_set_mb_syn_cavlc.h"
-#include "encode_mb_aux.h"
-#include "decode_mb_aux.h"
-#include "svc_mode_decision.h"
-#include "cpu_core.h"
-#include "svc_motion_estimate.h"
-#include "sample.h"
-#include "wels_func_ptr_def.h"
-#include "utils.h"
-
-namespace WelsSVCEnc {
-//#define ENC_TRACE
- 
-typedef void (*PWelsCodingSliceFunc)( sWelsEncCtx *pCtx, SSlice *pSlice );
-typedef void (*PWelsSliceHeaderWriteFunc)( SBitStringAux* pBs, SDqLayer* pCurLayer, SSlice *pSlice, int32_t* pPpsIdDelta );
-
-void UpdateNonZeroCountCache(SMB *pMb, SMbCache *pMbCache)
-{
-	ST32(&pMbCache->iNonZeroCoeffCount[9], LD32(&pMb->pNonZeroCount[ 0]));
-	ST32(&pMbCache->iNonZeroCoeffCount[17], LD32(&pMb->pNonZeroCount[ 4]));
-	ST32(&pMbCache->iNonZeroCoeffCount[25], LD32(&pMb->pNonZeroCount[ 8]));
-	ST32(&pMbCache->iNonZeroCoeffCount[33], LD32(&pMb->pNonZeroCount[12]));	
-	
-	ST16(&pMbCache->iNonZeroCoeffCount[14], LD16(&pMb->pNonZeroCount[16]));
-	ST16(&pMbCache->iNonZeroCoeffCount[38], LD16(&pMb->pNonZeroCount[18]));
-	ST16(&pMbCache->iNonZeroCoeffCount[22], LD16(&pMb->pNonZeroCount[20]));
-	ST16(&pMbCache->iNonZeroCoeffCount[46], LD16(&pMb->pNonZeroCount[22]));
-}
-
-void WelsSliceHeaderScalExtInit( SDqLayer* pCurLayer, SSlice *pSlice )
-{
-	SSliceHeaderExt* pSliceHeadExt	= &pSlice->sSliceHeaderExt;
-	SNalUnitHeaderExt* pNalHeadExt= &pCurLayer->sLayerInfo.sNalHeaderExt;
-	
-	uint8_t uiDependencyId	= pNalHeadExt->uiDependencyId;
-
-	pSliceHeadExt->bSliceSkipFlag = false;	
-
-	if ( uiDependencyId > 0 ) //spatial EL
-	{
-		//bothe adaptive and default flags should equal to 0.
-		pSliceHeadExt->bAdaptiveBaseModeFlag     = 
-			pSliceHeadExt->bAdaptiveMotionPredFlag   = 
-			pSliceHeadExt->bAdaptiveResidualPredFlag = false;
-
-		pSliceHeadExt->bDefaultBaseModeFlag     = 
-			pSliceHeadExt->bDefaultMotionPredFlag   =
-			pSliceHeadExt->bDefaultResidualPredFlag = false;
-	}
-}
-
-void WelsSliceHeaderExtInit( sWelsEncCtx* pEncCtx, SDqLayer* pCurLayer, SSlice *pSlice )
-{
-	SSliceHeaderExt* pCurSliceExt = &pSlice->sSliceHeaderExt;
-	SSliceHeader* pCurSliceHeader  = &pCurSliceExt->sSliceHeader;	
-	
-	pCurSliceHeader->eSliceType	= pEncCtx->eSliceType;
-
-	pCurSliceExt->bStoreRefBasePicFlag = false;	
-
-	pCurSliceHeader->iFirstMbInSlice = WelsGetFirstMbOfSlice( pCurLayer->pSliceEncCtx, pSlice->uiSliceIdx );
-
-	pCurSliceHeader->iFrameNum      = pEncCtx->iFrameNum;	
-	pCurSliceHeader->uiIdrPicId     = pEncCtx->sPSOVector.uiIdrPicId; //??
-
-	pCurSliceHeader->iPicOrderCntLsb          = pEncCtx->pEncPic->iFramePoc;	// 0
-
-	if ( P_SLICE == pEncCtx->eSliceType  )
-	{
-		pCurSliceHeader->uiNumRefIdxL0Active	= 1;
-		if ( pCurSliceHeader->uiRefCount > 0 && 
-			pCurSliceHeader->uiRefCount < pCurLayer->sLayerInfo.pSpsP->iNumRefFrames )
-		{
-			pCurSliceHeader->bNumRefIdxActiveOverrideFlag = true;
-			pCurSliceHeader->uiNumRefIdxL0Active	= pCurSliceHeader->uiRefCount;
-		}
-		//to solve mismatch between debug&release
-		else
-		{
-			pCurSliceHeader->bNumRefIdxActiveOverrideFlag = false;
-		}
-	}
-
-	pCurSliceHeader->iSliceQpDelta = pEncCtx->iGlobalQp - pCurLayer->sLayerInfo.pPpsP->iPicInitQp;
-
-	//for deblocking initial
-	pCurSliceHeader->uiDisableDeblockingFilterIdc			= pCurLayer->iLoopFilterDisableIdc;
-	pCurSliceHeader->iSliceAlphaC0Offset					= pCurLayer->iLoopFilterAlphaC0Offset;	//	need update iSliceAlphaC0Offset & iSliceBetaOffset for pSlice-header if loop_filter_idc != 1
-	pCurSliceHeader->iSliceBetaOffset						= pCurLayer->iLoopFilterBetaOffset;
-	pCurSliceExt->uiDisableInterLayerDeblockingFilterIdc = pCurLayer->uiDisableInterLayerDeblockingFilterIdc;
-
-	if ( pSlice->bSliceHeaderExtFlag )
-	{
-		WelsSliceHeaderScalExtInit( pCurLayer, pSlice );
-	}
-	else
-	{
-		//both adaptive and default flags should equal to 0.
-		pCurSliceExt->bAdaptiveBaseModeFlag		= 
-		pCurSliceExt->bAdaptiveMotionPredFlag		= 
-		pCurSliceExt->bAdaptiveResidualPredFlag	= false;
-		
-		pCurSliceExt->bDefaultBaseModeFlag		= 
-		pCurSliceExt->bDefaultMotionPredFlag		=
-		pCurSliceExt->bDefaultResidualPredFlag	= false;
-	}
-}
-
-/* count MB types if enabled FRAME_INFO_OUTPUT*/
-#if defined(MB_TYPES_CHECK)
-void WelsCountMbType(int32_t (*iMbCount)[18], const EWelsSliceType keSt, const SMB* kpMb)
-{	
-	if (NULL == iMbCount)
-		return;
-	
-	switch( kpMb->uiMbType ) {
-	case MB_TYPE_INTRA4x4:
-		++ iMbCount[keSt][Intra4x4];
-		break;
-	case MB_TYPE_INTRA16x16:
-		++ iMbCount[keSt][Intra16x16];
-		break;
-	case MB_TYPE_SKIP:
-		++ iMbCount[keSt][PSkip];
-		break;
-	case MB_TYPE_16x16:
-		++ iMbCount[keSt][Inter16x16];
-		break;
-	case MB_TYPE_16x8:
-		++ iMbCount[keSt][Inter16x8];
-		break;
-	case MB_TYPE_8x16:
-		++ iMbCount[eSt][Inter8x16];
-		break;
-	case MB_TYPE_8x8:
-		++ iMbCount[keSt][Inter8x8];
-		break;
-	case MB_TYPE_INTRA_BL:
-		++ iMbCount[keSt][7];
-		break;
-	default:
-		break;
-	}
-}
-#endif//MB_TYPES_CHECK
-
-/*!
-* \brief	write reference picture list on reordering syntax in Slice header	
-*/
-void WriteReferenceReorder( SBitStringAux *pBs, SSliceHeader *sSliceHeader )
-{
-	SRefPicListReorderSyntax *pRefOrdering	= &sSliceHeader->sRefReordering;
-	uint8_t eSliceType						= sSliceHeader->eSliceType % 5;
-	int16_t n = 0;
-
-	if (  I_SLICE != eSliceType && SI_SLICE != eSliceType )	// !I && !SI
-	{
-		BsWriteOneBit( pBs, true );
-//		{
-			uint16_t uiReorderingOfPicNumsIdc;
-			do 
-			{
-				uiReorderingOfPicNumsIdc = pRefOrdering->SReorderingSyntax[n].uiReorderingOfPicNumsIdc; 
-				BsWriteUE( pBs, uiReorderingOfPicNumsIdc );
-				if ( 0 == uiReorderingOfPicNumsIdc || 1 == uiReorderingOfPicNumsIdc )
-					BsWriteUE( pBs, pRefOrdering->SReorderingSyntax[n].uiAbsDiffPicNumMinus1 );
-				else if ( 2 == uiReorderingOfPicNumsIdc )
-					BsWriteUE( pBs, pRefOrdering->SReorderingSyntax[n].iLongTermPicNum );
-
-				n ++;
-			} while ( 3 != uiReorderingOfPicNumsIdc );
-//		}
-	}
-}
-
-/*!
-* \brief	write reference picture marking syntax in pSlice header	
-*/
-void WriteRefPicMarking( SBitStringAux *pBs, SSliceHeader *pSliceHeader, SNalUnitHeaderExt *pNalHdrExt )
-{
-	SRefPicMarking *sRefMarking	= &pSliceHeader->sRefMarking;
-	int16_t n = 0;	
-
-	if ( pNalHdrExt->bIdrFlag )
-	{
-		BsWriteOneBit( pBs, sRefMarking->bNoOutputOfPriorPicsFlag );
-		BsWriteOneBit( pBs, sRefMarking->bLongTermRefFlag );
-	}
-	else 
-	{
-		BsWriteOneBit( pBs, sRefMarking->bAdaptiveRefPicMarkingModeFlag );
-
-		if ( sRefMarking->bAdaptiveRefPicMarkingModeFlag )
-		{
-			int32_t iMmcoType;
-			do 
-			{
-				iMmcoType = sRefMarking->SMmcoRef[n].iMmcoType;
-				BsWriteUE( pBs, iMmcoType );
-				if ( 1 == iMmcoType || 3 == iMmcoType )
-					BsWriteUE( pBs, sRefMarking->SMmcoRef[n].iDiffOfPicNum - 1 );
-
-				if ( 2 == iMmcoType )
-					BsWriteUE( pBs, sRefMarking->SMmcoRef[n].iLongTermPicNum );
-
-				if ( 3 == iMmcoType || 6 == iMmcoType )
-					BsWriteUE( pBs, sRefMarking->SMmcoRef[n].iLongTermFrameIdx );
-
-				if ( 4 == iMmcoType )
-					BsWriteUE( pBs, sRefMarking->SMmcoRef[n].iMaxLongTermFrameIdx + 1 );
-
-				n ++;
-			} while ( 0 != iMmcoType );
-		}
-
-	}
-}
-
-void WelsSliceHeaderWrite( SBitStringAux* pBs, SDqLayer* pCurLayer, SSlice *pSlice, int32_t* pPpsIdDelta )
-{
-	SWelsSPS* pSps = pCurLayer->sLayerInfo.pSpsP;
-	SWelsPPS* pPps = pCurLayer->sLayerInfo.pPpsP;
-	SSliceHeader* pSliceHeader      = &pSlice->sSliceHeaderExt.sSliceHeader;	
-	SNalUnitHeaderExt* pNalHead   = &pCurLayer->sLayerInfo.sNalHeaderExt;	
-
-	BsWriteUE( pBs, pSliceHeader->iFirstMbInSlice );
-	BsWriteUE( pBs, pSliceHeader->eSliceType );   /* same type things */
-
-	BsWriteUE( pBs, pSliceHeader->pPps->iPpsId + pPpsIdDelta[pSliceHeader->pPps->iPpsId] );
-
-	BsWriteBits( pBs, pSps->uiLog2MaxFrameNum, pSliceHeader->iFrameNum );
-
-	if( pNalHead->bIdrFlag ) /* NAL IDR */
-	{
-		BsWriteUE( pBs, pSliceHeader->uiIdrPicId );
-	}
-
-	BsWriteBits( pBs, pSps->iLog2MaxPocLsb, pSliceHeader->iPicOrderCntLsb );
-
-	if ( P_SLICE == pSliceHeader->eSliceType )
-	{
-		BsWriteOneBit( pBs, pSliceHeader->bNumRefIdxActiveOverrideFlag );
-		if ( pSliceHeader->bNumRefIdxActiveOverrideFlag )
-		{
-			BsWriteUE( pBs, pSliceHeader->uiNumRefIdxL0Active - 1 );
-		}
-	}
-
-	if ( !pNalHead->bIdrFlag )
-		WriteReferenceReorder( pBs, pSliceHeader );
-
-	if ( pNalHead->sNalHeader.uiNalRefIdc )
-	{
-		WriteRefPicMarking( pBs, pSliceHeader, pNalHead );
-	}	
-
-	BsWriteSE( pBs, pSliceHeader->iSliceQpDelta );      /* pSlice qp delta */
-
-	if( pPps->bDeblockingFilterControlPresentFlag )
-	{
-		switch( pSliceHeader->uiDisableDeblockingFilterIdc )
-		{
-		case 0:
-		case 3:
-		case 4:
-		case 6:
-			BsWriteUE( pBs, 0 );
-			break;
-		case 1:
-			BsWriteUE( pBs, 1 );
-			break;
-		case 2:
-		case 5:
-			BsWriteUE( pBs, 2 );
-			break;
-		default :
-			fprintf( stderr, "pData error for deblocking" );
-			break;
-		}
-		if ( 1 != pSliceHeader->uiDisableDeblockingFilterIdc )
-		{
-			BsWriteSE( pBs, pSliceHeader->iSliceAlphaC0Offset >> 1 );
-			BsWriteSE( pBs, pSliceHeader->iSliceBetaOffset >> 1 );
-		}
-	}	
-}
-
-void WelsSliceHeaderExtWrite( SBitStringAux* pBs, SDqLayer* pCurLayer, SSlice *pSlice, int32_t *pPpsIdDelta )
-{
-	SWelsSPS* pSps           = pCurLayer->sLayerInfo.pSpsP;	
-	SWelsPPS* pPps           = pCurLayer->sLayerInfo.pPpsP;
-	SSubsetSps* pSubSps = pCurLayer->sLayerInfo.pSubsetSpsP;
-	SSliceHeaderExt* pSliceHeadExt = &pSlice->sSliceHeaderExt;
-	SSliceHeader* pSliceHeader      = &pSliceHeadExt->sSliceHeader;
-	SNalUnitHeaderExt* pNalHead   = &pCurLayer->sLayerInfo.sNalHeaderExt;
-
-	BsWriteUE( pBs, pSliceHeader->iFirstMbInSlice );
-	BsWriteUE( pBs, pSliceHeader->eSliceType );   /* same type things */
-
-	BsWriteUE( pBs, pSliceHeader->pPps->iPpsId + pPpsIdDelta[pSliceHeader->pPps->iPpsId] );
-
-	BsWriteBits( pBs, pSps->uiLog2MaxFrameNum, pSliceHeader->iFrameNum );
-
-	if( pNalHead->bIdrFlag ) /* NAL IDR */
-	{
-		BsWriteUE( pBs, pSliceHeader->uiIdrPicId );
-	}
-
-	BsWriteBits( pBs, pSps->iLog2MaxPocLsb, pSliceHeader->iPicOrderCntLsb );
-//	{
-		if ( P_SLICE == pSliceHeader->eSliceType )
-		{
-			BsWriteOneBit( pBs, pSliceHeader->bNumRefIdxActiveOverrideFlag );
-			if ( pSliceHeader->bNumRefIdxActiveOverrideFlag )
-			{
-				BsWriteUE( pBs, pSliceHeader->uiNumRefIdxL0Active - 1 );
-			}
-		}
-
-		if ( !pNalHead->bIdrFlag )
-			WriteReferenceReorder( pBs, pSliceHeader );
-
-		if ( pNalHead->sNalHeader.uiNalRefIdc )
-		{
-			WriteRefPicMarking( pBs, pSliceHeader, pNalHead );
-
-			if ( !pSubSps->sSpsSvcExt.bSliceHeaderRestrictionFlag )
-			{
-				BsWriteOneBit( pBs, pSliceHeadExt->bStoreRefBasePicFlag );
-			}
-		}
-//	}
-
-	BsWriteSE( pBs, pSliceHeader->iSliceQpDelta );      /* pSlice qp delta */
-
-	if( pPps->bDeblockingFilterControlPresentFlag )
-	{
-		BsWriteUE( pBs, pSliceHeader->uiDisableDeblockingFilterIdc );
-		if ( 1 != pSliceHeader->uiDisableDeblockingFilterIdc )
-		{
-			BsWriteSE( pBs, pSliceHeader->iSliceAlphaC0Offset >> 1 );
-			BsWriteSE( pBs, pSliceHeader->iSliceBetaOffset >> 1 );
-		}
-	}	
-
-#if !defined(DISABLE_FMO_FEATURE)
-	if ( pPps->uiNumSliceGroups > 1  &&
-		pPps->uiSliceGroupMapType >= 3 && 
-		pPps->uiSliceGroupMapType <= 5 )
-	{
-		int32_t iNumBits;
-		if ( pPps->uiSliceGroupChangeRate )
-		{
-			iNumBits = WELS_CEILLOG2(1 + pPps->uiPicSizeInMapUnits / pPps->uiSliceGroupChangeRate);
-			BsWriteBits( pBs, iNumBits, pSliceHeader->iSliceGroupChangeCycle );	
-		}
-	}
-#endif//!DISABLE_FMO_FEATURE
-
-	if ( false )
-	{
-		BsWriteOneBit( pBs, pSliceHeadExt->bSliceSkipFlag );
-		if ( pSliceHeadExt->bSliceSkipFlag )
-		{
-			BsWriteUE( pBs, pSliceHeadExt->uiNumMbsInSlice - 1 );
-		}
-		else
-		{
-			BsWriteOneBit( pBs, pSliceHeadExt->bAdaptiveBaseModeFlag );
-			if ( !pSliceHeadExt->bAdaptiveBaseModeFlag )  
-			{
-				BsWriteOneBit( pBs, pSliceHeadExt->bDefaultBaseModeFlag );
-			}
-
-			if ( !pSliceHeadExt->bDefaultBaseModeFlag )
-			{
-				BsWriteOneBit( pBs, 0 );
-				BsWriteOneBit( pBs, 0 );
-			}
-
-			BsWriteOneBit( pBs, pSliceHeadExt->bAdaptiveResidualPredFlag );
-			if ( !pSliceHeadExt->bAdaptiveResidualPredFlag )
-			{
-				BsWriteOneBit( pBs, 0);
-			}
-		}
-		if ( 1 == pSubSps->sSpsSvcExt.bAdaptiveTcoeffLevelPredFlag )
-		{
-			BsWriteOneBit( pBs, pSliceHeadExt->bTcoeffLevelPredFlag );
-		}
-
-	}
-
-	if ( !pSubSps->sSpsSvcExt.bSliceHeaderRestrictionFlag )
-	{
-		BsWriteBits( pBs, 4, 0 );
-		BsWriteBits( pBs, 4, 15 );
-	}
-}
-
-//only BaseLayer inter MB and SpatialLayer (uiQualityId = 0) inter MB calling this pFunc.
-//only for inter part
-void WelsInterMbEncode( sWelsEncCtx* pEncCtx, SSlice *pSlice, SMB* pCurMb )
-{
-	SMbCache* pMbCache = &pSlice->sMbCacheInfo;
-
-	WelsDctMb(pMbCache->pCoeffLevel,  pMbCache->SPicData.pEncMb[0], pEncCtx->pCurDqLayer->iEncStride[0], pMbCache->pMemPredLuma, pEncCtx->pFuncList->pfDctFourT4 );
-	WelsEncInterY( pEncCtx->pFuncList, pCurMb, pMbCache );
-}
-
-
-//only BaseLayer inter MB and SpatialLayer (uiQualityId = 0) inter MB calling this pFunc.
-//only for I SSlice
-void WelsIMbChromaEncode( sWelsEncCtx* pEncCtx, SMB* pCurMb, SMbCache *pMbCache )
-{
-	SWelsFuncPtrList *pFunc	= pEncCtx->pFuncList;
-	SDqLayer* pCurLayer			= pEncCtx->pCurDqLayer;	
-	const int32_t kiEncStride	= pCurLayer->iEncStride[1];
-	const int32_t kiCsStride		= pCurLayer->iCsStride[1];
-	int16_t *pCurRS				= pMbCache->pCoeffLevel;
-	uint8_t* pBestPred			= pMbCache->pBestPredIntraChroma;
-	uint8_t* pCsCb				= pMbCache->SPicData.pCsMb[1];
-	uint8_t* pCsCr				= pMbCache->SPicData.pCsMb[2];
-
-	//cb
-	pFunc->pfDctFourT4( pCurRS,    pMbCache->SPicData.pEncMb[1], kiEncStride, pBestPred,    8);
-	WelsEncRecUV( pFunc, pCurMb, pMbCache, pCurRS,    1 );
-	pFunc->pfIDctFourT4( pCsCb, kiCsStride, pBestPred,    8, pCurRS    );
-	
-	//cr
-	pFunc->pfDctFourT4( pCurRS+64, pMbCache->SPicData.pEncMb[2], kiEncStride, pBestPred+64, 8);
-	WelsEncRecUV( pFunc, pCurMb, pMbCache, pCurRS+64, 2 );
-	pFunc->pfIDctFourT4( pCsCr, kiCsStride, pBestPred+64, 8, pCurRS+64 );
-}
-
-
-//only BaseLayer inter MB and SpatialLayer (uiQualityId = 0) inter MB calling this pFunc.
-//for P SSlice (intra part + inter part)
-void WelsPMbChromaEncode( sWelsEncCtx* pEncCtx, SSlice *pSlice, SMB* pCurMb )
-{
-	SWelsFuncPtrList *pFunc	= pEncCtx->pFuncList;	
-	SDqLayer* pCurLayer			= pEncCtx->pCurDqLayer;	
-	const int32_t kiEncStride	= pCurLayer->iEncStride[1];
-	SMbCache* pMbCache			= &pSlice->sMbCacheInfo;
-	int16_t *pCurRS				= pMbCache->pCoeffLevel+256;
-	uint8_t* pBestPred			= pMbCache->pMemPredChroma;		
-
-	pFunc->pfDctFourT4(pCurRS,		pMbCache->SPicData.pEncMb[1],	kiEncStride,		pBestPred,		8);	
-	pFunc->pfDctFourT4(pCurRS+64,	pMbCache->SPicData.pEncMb[2],	kiEncStride,		pBestPred+64,	8);	
-	
-	WelsEncRecUV(pFunc, pCurMb, pMbCache, pCurRS, 1);
-	WelsEncRecUV(pFunc, pCurMb, pMbCache, pCurRS+64, 2);
-}
-
-void OutputPMbWithoutConstructCsRsNoCopy( sWelsEncCtx *pCtx, SDqLayer* pDq, SSlice *pSlice, SMB* pMb )
-{	
-	if ( IS_INTER( pMb->uiMbType ) || IS_I_BL(pMb->uiMbType) )		//intra have been reconstructed, NO COPY from CS to pDecPic--
-	{
-		SMbCache* pMbCache			= &pSlice->sMbCacheInfo;
-		uint8_t* pDecY				= pMbCache->SPicData.pDecMb[0];
-		uint8_t* pDecU				= pMbCache->SPicData.pDecMb[1];
-		uint8_t* pDecV				= pMbCache->SPicData.pDecMb[2];
-		int16_t *pScaledTcoeff		= pMbCache->pCoeffLevel;
-		const int32_t kiDecStrideLuma	= pDq->pDecPic->iLineSize[0];
-		const int32_t kiDecStrideChroma	= pDq->pDecPic->iLineSize[1];
-		PIDctFunc pfIdctFour4x4				= pCtx->pFuncList->pfIDctFourT4;
-
-		WelsIDctT4RecOnMb( pDecY, kiDecStrideLuma, pDecY, kiDecStrideLuma, pScaledTcoeff,  pfIdctFour4x4 );
-		pfIdctFour4x4( pDecU, kiDecStrideChroma, pDecU, kiDecStrideChroma, pScaledTcoeff + 256 );
-		pfIdctFour4x4( pDecV, kiDecStrideChroma, pDecV, kiDecStrideChroma, pScaledTcoeff + 320 );
-	}
-}
-
-// for intra non-dynamic pSlice
-//encapsulate two kinds of reconstruction:
-//first. store base or highest Dependency Layer with only one quality (without CS RS reconstruction)
-//second. lower than highest Dependency Layer, and for every Dependency Layer with one quality layer(single layer) 
-void WelsISliceMdEnc( sWelsEncCtx* pEncCtx, SSlice *pSlice ) //pMd + encoding
-{
-	SDqLayer* pCurLayer				= pEncCtx->pCurDqLayer;
-	SSliceCtx* pSliceCtx		= pCurLayer->pSliceEncCtx;
-	SMbCache *pMbCache				= &pSlice->sMbCacheInfo;
-	SSliceHeaderExt *pSliceHdExt	= &pSlice->sSliceHeaderExt;
-	SMB* pMbList						= pCurLayer->sMbDataP;
-	SMB* pCurMb						= NULL;	
-	const int32_t kiSliceFirstMbXY	= pSliceHdExt->sSliceHeader.iFirstMbInSlice;
-	int32_t iNextMbIdx				= kiSliceFirstMbXY;	
-	const int32_t kiTotalNumMb		= pCurLayer->iMbWidth * pCurLayer->iMbHeight;
-	int32_t iCurMbIdx				= 0, iNumMbCoded = 0;	
-	const int32_t kiSliceIdx			= pSlice->uiSliceIdx;
-	const uint8_t kuiChromaQpIndexOffset= pCurLayer->sLayerInfo.pPpsP->uiChromaQpIndexOffset;
-	SWelsMD sMd;	
-	
-	for ( ; ; )
-	{
-		iCurMbIdx	= iNextMbIdx;
-		pCurMb = &pMbList[ iCurMbIdx ];	
-		pCurMb->uiLumaQp   = pEncCtx->iGlobalQp;
-		pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51(pCurMb->uiLumaQp + kuiChromaQpIndexOffset)];
-
-		pEncCtx->pFuncList->pfRc.pfWelsRcMbInit(pEncCtx, pCurMb, pSlice);		
-
-		sMd.iLambda = g_kiQpCostTable[pCurMb->uiLumaQp];
-
-		WelsMdIntraInit( pEncCtx, pCurMb, pMbCache, kiSliceFirstMbXY );
-		WelsMdIntraMb( pEncCtx, &sMd, pCurMb, pMbCache );
-		UpdateNonZeroCountCache( pCurMb, pMbCache );
-		
-		WelsSpatialWriteMbSyn( pEncCtx, pSlice, pCurMb );
-
-		pCurMb->uiSliceIdc = kiSliceIdx;
-		
-        #if defined(MB_TYPES_CHECK) 
-		WelsCountMbType( pEncCtx->sPerInfo.iMbCount, I_SLICE, pCurMb );		
-        #endif//MB_TYPES_CHECK
-	
-		pEncCtx->pFuncList->pfRc.pfWelsRcMbInfoUpdate(pEncCtx,pCurMb,sMd.iCostLuma,pSlice);
-
-		++iNumMbCoded;		
-
-		iNextMbIdx = WelsGetNextMbOfSlice( pSliceCtx, iCurMbIdx );
-		if ( iNextMbIdx == -1 || iNextMbIdx >= kiTotalNumMb || iNumMbCoded >= kiTotalNumMb )
-		{
-			break;
-		}
-	}
-}
-
-// Only for intra dynamic slicing
-void WelsISliceMdEncDynamic( sWelsEncCtx* pEncCtx, SSlice *pSlice ) //pMd + encoding
-{
-	SBitStringAux* pBs				= pSlice->pSliceBsa;
-	SDqLayer* pCurLayer				= pEncCtx->pCurDqLayer;
-	SSliceCtx* pSliceCtx		= pCurLayer->pSliceEncCtx;
-	SMbCache *pMbCache				= &pSlice->sMbCacheInfo;
-	SSliceHeaderExt *pSliceHdExt	= &pSlice->sSliceHeaderExt;
-	SMB* pMbList						= pCurLayer->sMbDataP;
-	SMB* pCurMb						= NULL;	
-	const int32_t kiSliceFirstMbXY	= pSliceHdExt->sSliceHeader.iFirstMbInSlice;
-	int32_t iNextMbIdx				= kiSliceFirstMbXY;	
-	const int32_t kiTotalNumMb		= pCurLayer->iMbWidth * pCurLayer->iMbHeight;
-	int32_t iCurMbIdx				= 0, iNumMbCoded = 0;	
-	const int32_t kiSliceIdx				= pSlice->uiSliceIdx;
-	const int32_t kiPartitionId			= (kiSliceIdx % pEncCtx->iActiveThreadsNum);
-	const uint8_t kuiChromaQpIndexOffset= pCurLayer->sLayerInfo.pPpsP->uiChromaQpIndexOffset;
-
-	SWelsMD sMd;	
-	SDynamicSlicingStack sDss;
-	sDss.iStartPos = BsGetBitsPos(pBs);
-
-	for ( ; ; )
-	{
-		iCurMbIdx	= iNextMbIdx;
-		pCurMb = &pMbList[ iCurMbIdx ];	
-		pCurMb->uiLumaQp   = pEncCtx->iGlobalQp;
-		pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51(pCurMb->uiLumaQp + kuiChromaQpIndexOffset)];
-
-		pEncCtx->pFuncList->pfRc.pfWelsRcMbInit(pEncCtx, pCurMb, pSlice);
-		// if already reaches the largest number of slices, set QPs to the upper bound
-		if (pSlice->bDynamicSlicingSliceSizeCtrlFlag)
-		{			
-			pCurMb->uiLumaQp = pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId].iMaxQp;
-			pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51(pCurMb->uiLumaQp + kuiChromaQpIndexOffset)];
-		}
-
-		sMd.iLambda = g_kiQpCostTable[pCurMb->uiLumaQp];
-
-		WelsMdIntraInit( pEncCtx, pCurMb, pMbCache, kiSliceFirstMbXY );
-		WelsMdIntraMb( pEncCtx, &sMd, pCurMb, pMbCache );
-		UpdateNonZeroCountCache( pCurMb, pMbCache );
-		//stack pBs pointer
-		sDss.pBsStackBufPtr	= pBs->pBufPtr;
-		sDss.uiBsStackCurBits	= pBs->uiCurBits;
-		sDss.iBsStackLeftBits	= pBs->iLeftBits;
-
-		WelsSpatialWriteMbSyn( pEncCtx, pSlice, pCurMb );
-
-		sDss.iCurrentPos = BsGetBitsPos(pBs);
-
-		if ( DynSlcJudgeSliceBoundaryStepBack( pEncCtx, pSlice, pSliceCtx, pCurMb, &sDss ) )//islice
-		{
-			//stack pBs pointer
-			pBs->pBufPtr		= sDss.pBsStackBufPtr;
-			pBs->uiCurBits	= sDss.uiBsStackCurBits;
-			pBs->iLeftBits	= sDss.iBsStackLeftBits;
-
-			pCurLayer->pLastCodedMbIdxOfPartition[kiPartitionId] = iCurMbIdx-1;	// update pLastCodedMbIdxOfPartition, need to -1 due to stepping back
-			++ pCurLayer->pNumSliceCodedOfPartition[kiPartitionId];
-
-			break;
-		}
-
-		pCurMb->uiSliceIdc = kiSliceIdx;
-
-#if defined(MB_TYPES_CHECK) 
-		WelsCountMbType( pEncCtx->sPerInfo.iMbCount, I_SLICE, pCurMb );		
-#endif//MB_TYPES_CHECK
-
-		pEncCtx->pFuncList->pfRc.pfWelsRcMbInfoUpdate(pEncCtx,pCurMb,sMd.iCostLuma,pSlice);
-
-		++iNumMbCoded;		
-
-		iNextMbIdx = WelsGetNextMbOfSlice( pSliceCtx, iCurMbIdx );
-		//whether all of MB in current pSlice encoded or not
-		if ( iNextMbIdx == -1 || iNextMbIdx >= kiTotalNumMb || iNumMbCoded >= kiTotalNumMb )
-		{
-			pSliceCtx->pCountMbNumInSlice[kiSliceIdx]	= iCurMbIdx - pCurLayer->pLastCodedMbIdxOfPartition[kiPartitionId];
-			pCurLayer->pLastCodedMbIdxOfPartition[kiPartitionId] = iCurMbIdx;	// update pLastCodedMbIdxOfPartition, finish coding, use iCurMbIdx directly
-			break;
-		}
-	}
-}
-
-//encapsulate two kinds of reconstruction:
-// first. store base or highest Dependency Layer with only one quality (without CS RS reconstruction)
-// second. lower than highest Dependency Layer, and for every Dependency Layer with one quality layer(single layer) 
-void WelsPSliceMdEnc( sWelsEncCtx* pEncCtx, SSlice *pSlice,  const bool_t kbIsHighestDlayerFlag ) //pMd + encoding
-{
-	const SSliceHeaderExt	*kpShExt				= &pSlice->sSliceHeaderExt;
-	const SSliceHeader		*kpSh					= &kpShExt->sSliceHeader;
-	const int32_t			kiSliceFirstMbXY	= kpSh->iFirstMbInSlice;
-	SWelsMD sMd;
-
-	sMd.uiRef			= kpSh->uiRefIndex;
-	sMd.bMdUsingSad		= kbIsHighestDlayerFlag;
-	if (!pEncCtx->pCurDqLayer->bBaseLayerAvailableFlag || !kbIsHighestDlayerFlag)
-		memset( &sMd.sMe, 0, sizeof(sMd.sMe) );
-
-	//pMb loop
-	WelsMdInterMbLoop( pEncCtx, pSlice, &sMd, kiSliceFirstMbXY );
-}
-
-void WelsPSliceMdEncDynamic( sWelsEncCtx* pEncCtx, SSlice *pSlice, const bool_t kbIsHighestDlayerFlag )
-{
-	const SSliceHeaderExt	*kpShExt				= &pSlice->sSliceHeaderExt;
-	const SSliceHeader		*kpSh					= &kpShExt->sSliceHeader;
-	const int32_t			kiSliceFirstMbXY	= kpSh->iFirstMbInSlice;
-	SWelsMD sMd;
-
-	sMd.uiRef			= kpSh->uiRefIndex;
-	sMd.bMdUsingSad		= kbIsHighestDlayerFlag;
-	if (!pEncCtx->pCurDqLayer->bBaseLayerAvailableFlag || !kbIsHighestDlayerFlag)
-		memset( &sMd.sMe, 0, sizeof(sMd.sMe) );
-
-	//mb loop
-	WelsMdInterMbLoopOverDynamicSlice( pEncCtx, pSlice, &sMd, kiSliceFirstMbXY );
-}
-
-void WelsCodePSlice( sWelsEncCtx* pEncCtx, SSlice *pSlice )
-{
-	//pSlice-level init should be outside and before this function
-	SDqLayer* pCurLayer			= pEncCtx->pCurDqLayer;
-	const bool_t kbBaseAvail		= pCurLayer->bBaseLayerAvailableFlag;
-	const bool_t kbHighestSpatial= pEncCtx->pSvcParam->iNumDependencyLayer == (pCurLayer->sLayerInfo.sNalHeaderExt.uiDependencyId + 1);
-
-	//MD switch	
-	if ( kbBaseAvail && kbHighestSpatial ) 
-	{
-		//initial pMd pointer
-		pEncCtx->pFuncList->pfInterMd			=  (PInterMdFunc)WelsMdInterMbEnhancelayer;
-	}
-	else
-	{
-		//initial pMd pointer
-		pEncCtx->pFuncList->pfInterMd            =  (PInterMdFunc)WelsMdInterMb;
-	}
-	WelsPSliceMdEnc( pEncCtx, pSlice, kbHighestSpatial );
-}
-
-void WelsCodePOverDynamicSlice( sWelsEncCtx* pEncCtx, SSlice *pSlice )
-{
-	//pSlice-level init should be outside and before this function
-	SDqLayer* pCurLayer			= pEncCtx->pCurDqLayer;
-	const bool_t kbBaseAvail		= pCurLayer->bBaseLayerAvailableFlag;
-	const bool_t kbHighestSpatial= pEncCtx->pSvcParam->iNumDependencyLayer == (pCurLayer->sLayerInfo.sNalHeaderExt.uiDependencyId + 1);
-
-	//MD switch	
-	if ( kbBaseAvail && kbHighestSpatial ) 
-	{       	
-		//initial pMd pointer
-		pEncCtx->pFuncList->pfInterMd			=  (PInterMdFunc)WelsMdInterMbEnhancelayer;
-	}
-	else
-	{
-		//initial pMd pointer
-		pEncCtx->pFuncList->pfInterMd            =  (PInterMdFunc)WelsMdInterMb;		
-	}
-	WelsPSliceMdEncDynamic( pEncCtx, pSlice, kbHighestSpatial );
-}
-
-// 1st index: 0: for P pSlice; 1: for I pSlice;
-// 2nd index: 0: for non-dynamic pSlice; 1: for dynamic I pSlice;
-PWelsCodingSliceFunc	g_pWelsSliceCoding[2][2] =
-{
-	{ WelsCodePSlice, WelsCodePOverDynamicSlice },	// P SSlice
-	{ WelsISliceMdEnc, WelsISliceMdEncDynamic }	// I SSlice
-};
-PWelsSliceHeaderWriteFunc		g_pWelsWriteSliceHeader[2] =	// 0: for base; 1: for ext;
-{
-	WelsSliceHeaderWrite,
-	WelsSliceHeaderExtWrite
-};
-
-
-void WelsCodeOneSlice( sWelsEncCtx* pEncCtx, const int32_t kiSliceIdx, const int32_t kiNalType )
-{	
-	SDqLayer* pCurLayer					= pEncCtx->pCurDqLayer;
-	SNalUnitHeaderExt* pNalHeadExt	= &pCurLayer->sLayerInfo.sNalHeaderExt;
-	SSlice *pCurSlice					= &pCurLayer->sLayerInfo.pSliceInLayer[kiSliceIdx];
-	SBitStringAux* pBs					= pCurSlice->pSliceBsa;
-	const int32_t kiDynamicSliceFlag	= (pEncCtx->pSvcParam->sDependencyLayers[pEncCtx->uiDependencyId].sMso.uiSliceMode == SM_DYN_SLICE);
-
-	assert( kiSliceIdx == pCurSlice->uiSliceIdx );
-
-	if ( I_SLICE == pEncCtx->eSliceType )
-	{
-		pNalHeadExt->bIdrFlag = 1;
-		pCurSlice->sScaleShift = 0;
-	}
-	else
-	{
-		const uint32_t kuiTemporalId = pNalHeadExt->uiTemporalId;
-		pCurSlice->sScaleShift = kuiTemporalId ? (kuiTemporalId - pEncCtx->pRefPic->uiTemporalId) : 0;
-	}
-
-	WelsSliceHeaderExtInit( pEncCtx, pCurLayer, pCurSlice );	
-
-
-	g_pWelsWriteSliceHeader[pCurSlice->bSliceHeaderExtFlag]( pBs, pCurLayer, pCurSlice, &(pEncCtx->sPSOVector.sParaSetOffsetVariable[PARA_SET_TYPE_PPS].iParaSetIdDelta[0]) );
-#if _DEBUG 
-	if ( pEncCtx->sPSOVector.bEnableSpsPpsIdAddition )
-	{
-		const int32_t kiEncoderPpsId    = pCurSlice->sSliceHeaderExt.sSliceHeader.pPps->iPpsId;
-		const int32_t kiTmpPpsIdInBs = kiEncoderPpsId + pEncCtx->sPSOVector.sParaSetOffsetVariable[PARA_SET_TYPE_PPS].iParaSetIdDelta[ kiEncoderPpsId ];
-		assert ( MAX_PPS_COUNT > kiTmpPpsIdInBs );
-		
-		//when activated need to sure there is avialable PPS
-		assert ( pEncCtx->sPSOVector.sParaSetOffsetVariable[PARA_SET_TYPE_PPS].bUsedParaSetIdInBs[kiTmpPpsIdInBs] );
-	}
-#endif
-
-	pCurSlice->uiLastMbQp = pCurLayer->sLayerInfo.pPpsP->iPicInitQp + pCurSlice->sSliceHeaderExt.sSliceHeader.iSliceQpDelta;	
-
-	g_pWelsSliceCoding[pNalHeadExt->bIdrFlag][kiDynamicSliceFlag]( pEncCtx, pCurSlice );
-
-	BsRbspTrailingBits( pBs );
-
-	BsFlush( pBs );
-}
-
-//pFunc: UpdateMbNeighbourInfoForNextSlice()
-void UpdateMbNeighbourInfoForNextSlice(	SSliceCtx *pSliceCtx,
-											 SMB *pMbList,
-											 const int32_t kiFirstMbIdxOfNextSlice,
-											 const int32_t kiLastMbIdxInPartition	)
-{	
-	const int32_t kiMbWidth					= pSliceCtx->iMbWidth;
-	int32_t iIdx								= kiFirstMbIdxOfNextSlice;
-	int32_t	iNextSliceFirstMbIdxRowStart= (( kiFirstMbIdxOfNextSlice % kiMbWidth ) ? 1:0);
-	int32_t iCountMbUpdate					= kiMbWidth + iNextSliceFirstMbIdxRowStart; //need to update MB(iMbXY+1) to MB(iMbXY+1+row) in common case
-	const int32_t kiEndMbNeedUpdate		= kiFirstMbIdxOfNextSlice + iCountMbUpdate;
-	SMB *pMb									= &pMbList[iIdx];
-	
-	do {
-        uint32_t uiNeighborAvailFlag	= 0;
-		const int32_t kiMbXY				= pMb->iMbXY;
-		const int32_t kiMbX				= pMb->iMbX;
-		const int32_t kiMbY				= pMb->iMbY;
-		BOOL_T     bLeft;
-		BOOL_T     bTop;
-		BOOL_T     bLeftTop;
-		BOOL_T     bRightTop;		
-		int32_t   iLeftXY, iTopXY, iLeftTopXY, iRightTopXY;
-		const uint8_t  kuiSliceIdc		= WelsMbToSliceIdc(pSliceCtx, kiMbXY);
-		
-		pMb->uiSliceIdc	= kuiSliceIdc;
-		iLeftXY = kiMbXY - 1;
-		iTopXY = kiMbXY - kiMbWidth;
-		iLeftTopXY = iTopXY - 1;
-		iRightTopXY = iTopXY + 1;
-		
-		bLeft = (kiMbX > 0) && (kuiSliceIdc == WelsMbToSliceIdc(pSliceCtx, iLeftXY));
-		bTop = (kiMbY > 0) && (kuiSliceIdc == WelsMbToSliceIdc(pSliceCtx, iTopXY));
-		bLeftTop = (kiMbX > 0) && (kiMbY > 0) && (kuiSliceIdc == WelsMbToSliceIdc(pSliceCtx, iLeftTopXY));
-		bRightTop = (kiMbX < (kiMbWidth-1)) && (kiMbY > 0) && (kuiSliceIdc == WelsMbToSliceIdc(pSliceCtx, iRightTopXY));		
-		
-		if( bLeft ){
-			uiNeighborAvailFlag |= LEFT_MB_POS;
-		}
-		if( bTop ){
-			uiNeighborAvailFlag |= TOP_MB_POS;
-		}
-		if( bLeftTop ){
-			uiNeighborAvailFlag |= TOPLEFT_MB_POS;
-		}
-		if( bRightTop ){
-			uiNeighborAvailFlag |= TOPRIGHT_MB_POS;
-		}
-		pMb->uiNeighborAvail	= (uint8_t)uiNeighborAvailFlag;
-		
-		++ pMb;
-		++ iIdx;
-	}while (	( iIdx < kiEndMbNeedUpdate) && 
-				( iIdx <= kiLastMbIdxInPartition ) );
-} 
-
-
-void AddSliceBoundary(sWelsEncCtx* pEncCtx, SSlice * pCurSlice, SSliceCtx *pSliceCtx, SMB* pCurMb, int32_t iFirstMbIdxOfNextSlice, const int32_t kiLastMbIdxInPartition )
-{
-	SDqLayer*	pCurLayer = pEncCtx->pCurDqLayer;
-	int32_t		iCurMbIdx		= pCurMb->iMbXY;
-	int32_t		iCurSliceIdc	= pSliceCtx->pOverallMbMap[ iCurMbIdx ];
-	const int32_t kiSliceIdxStep= pEncCtx->iActiveThreadsNum;
-	int32_t		iNextSliceIdc	= iCurSliceIdc + kiSliceIdxStep;
-	SSlice		*pNextSlice		= NULL;
-
-	SMB *pMbList					= pCurLayer->sMbDataP;	
-
-	//update cur pSlice info 	
-	pCurSlice->sSliceHeaderExt.uiNumMbsInSlice	= 1 + iCurMbIdx - pCurSlice->sSliceHeaderExt.sSliceHeader.iFirstMbInSlice;
-	
-	//pNextSlice pointer/initialization
-		pNextSlice = &( pCurLayer->sLayerInfo.pSliceInLayer[ iNextSliceIdc ] );
-
-#if _DEBUG
-	assert( NULL != pNextSlice );
-	// now ( pSliceCtx->iSliceNumInFrame < pSliceCtx->iMaxSliceNumConstraint ) always true by the call of this pFunc
-#endif
-
-	//init next pSlice info
-	pNextSlice->bSliceHeaderExtFlag = 
-		(NAL_UNIT_CODED_SLICE_EXT == pCurLayer->sLayerInfo.sNalHeaderExt.sNalHeader.eNalUnitType);
-	memcpy( &pNextSlice->sSliceHeaderExt, &pCurSlice->sSliceHeaderExt, sizeof(SSliceHeaderExt) );	// confirmed_safe_unsafe_usage
-
-	pSliceCtx->pFirstMbInSlice[iNextSliceIdc] = iFirstMbIdxOfNextSlice;
-
-#if !defined(MT_ENABLED)
-	pNextSlice->uiSliceIdx = iNextSliceIdc;
-	pNextSlice->pSliceBsa = &(pEncCtx->pOut->sBsWrite);
-#endif//!MT_ENABLED
-
-	memset(pSliceCtx->pOverallMbMap+iFirstMbIdxOfNextSlice, (uint8_t)iNextSliceIdc, (kiLastMbIdxInPartition-iFirstMbIdxOfNextSlice+1)*sizeof(uint8_t));
-
-	//DYNAMIC_SLICING_ONE_THREAD: update pMbList slice_neighbor_info
-	UpdateMbNeighbourInfoForNextSlice( pSliceCtx, pMbList, iFirstMbIdxOfNextSlice, kiLastMbIdxInPartition );
-}
-
-BOOL_T DynSlcJudgeSliceBoundaryStepBack(void* pCtx, void *pSlice, SSliceCtx *pSliceCtx, SMB* pCurMb, SDynamicSlicingStack* pDss )
-{
-	sWelsEncCtx *pEncCtx = (sWelsEncCtx*)pCtx;
-	SSlice * pCurSlice = (SSlice *)pSlice;
-	int32_t		   iCurMbIdx  = pCurMb->iMbXY;
-	uint32_t        uiLen = 0;
-	int32_t		   iPosBitOffset = 0;
-	const int32_t  kiActiveThreadsNum = pEncCtx->iActiveThreadsNum;
-	const int32_t  kiPartitaionId = pCurSlice->uiSliceIdx % kiActiveThreadsNum;
-	const int32_t  kiLastMbIdxInPartition	= pEncCtx->pCurDqLayer->pLastMbIdxOfPartition[kiPartitaionId];
-
-	const BOOL_T    kbCurMbNotFirstMbOfCurSlice      = (pSliceCtx->pOverallMbMap[iCurMbIdx] == pSliceCtx->pOverallMbMap[iCurMbIdx-1]);
-	const BOOL_T    kbCurMbNotLastMbOfCurPartition = iCurMbIdx < kiLastMbIdxInPartition;
-	const BOOL_T    kbSliceNumNotExceedConstraint       = pSliceCtx->iSliceNumInFrame < pSliceCtx->iMaxSliceNumConstraint; /*tmp choice to avoid complex memory operation, 100520, to be modify*/
-	const BOOL_T    kbSliceNumReachConstraint               = (pSliceCtx->iSliceNumInFrame == pSliceCtx->iMaxSliceNumConstraint);
-
-	if ( pCurSlice->bDynamicSlicingSliceSizeCtrlFlag ) 
-		return false;
-
-	iPosBitOffset = ( pDss->iCurrentPos - pDss->iStartPos );
-#if _DEBUG
-	assert(iPosBitOffset>=0);
-#endif
-	uiLen = ( ( iPosBitOffset>>3 ) + (( iPosBitOffset & 0x07 )? 1: 0) );	
-
-#ifdef MT_ENABLED
-	if ( pEncCtx->pSvcParam->iMultipleThreadIdc > 1 )
-		WelsMutexLock( &pEncCtx->pSliceThreading->mutexSliceNumUpdate );
-#endif//MT_ENABLED
-
-	//DYNAMIC_SLICING_ONE_THREAD: judge jump_avoiding_pack_exceed
-	if (
-		( ( kbCurMbNotFirstMbOfCurSlice
-		&& JUMPPACKETSIZE_JUDGE(uiLen,iCurMbIdx,pSliceCtx->uiSliceSizeConstraint) )/*jump_avoiding_pack_exceed*/ 
-		&& kbCurMbNotLastMbOfCurPartition )//decide to add new pSlice
-		&& ( kbSliceNumNotExceedConstraint
-#ifdef MT_ENABLED
-		&& ( ( pCurSlice->uiSliceIdx + kiActiveThreadsNum ) < pSliceCtx->iMaxSliceNumConstraint )
-#endif//MT_ENABLED	
-		)//able to add new pSlice
-
-		)
-	{	
-		
-		AddSliceBoundary( pEncCtx, pCurSlice, pSliceCtx, pCurMb, iCurMbIdx, kiLastMbIdxInPartition );
-
-		++ pSliceCtx->iSliceNumInFrame;
-
-#ifdef MT_ENABLED
-		if (pEncCtx->pSvcParam->iMultipleThreadIdc > 1)
-			WelsMutexUnlock( &pEncCtx->pSliceThreading->mutexSliceNumUpdate );
-#endif//MT_ENABLED
-
-		return TRUE;
-	}
-
-	if (
-		( kbSliceNumReachConstraint
-#ifdef MT_ENABLED
-		|| ( ( pCurSlice->uiSliceIdx + kiActiveThreadsNum ) >= pSliceCtx->iMaxSliceNumConstraint )
-#endif//MT_ENABLED
-		)
-		&& ( ( JUMPPACKETSIZE_JUDGE(uiLen,	iCurMbIdx,
-		pSliceCtx->uiSliceSizeConstraint - ( ( kiLastMbIdxInPartition - iCurMbIdx ) << ( pCurSlice->uiAssumeLog2BytePerMb ) /* assume each MB consumes two byte under largest QP */) ) )
-		&& kbCurMbNotLastMbOfCurPartition )//risk of exceeding the size constraint when pSlice num reaches constraint
-		)
-	{		
-		pCurSlice->bDynamicSlicingSliceSizeCtrlFlag = true;
-	}
-
-#ifdef MT_ENABLED
-	if (pEncCtx->pSvcParam->iMultipleThreadIdc > 1)
-		WelsMutexUnlock( &pEncCtx->pSliceThreading->mutexSliceNumUpdate );
-#endif//MT_ENABLED
-
-	return FALSE;
-}
-
-///////////////
-//  pMb loop
-///////////////
-// for inter non-dynamic pSlice
-void WelsMdInterMbLoop( sWelsEncCtx* pEncCtx, SSlice *pSlice, void* pWelsMd, const int32_t kiSliceFirstMbXY )
-{
-	SWelsMD* pMd					= (SWelsMD*)pWelsMd;
-	SBitStringAux* pBs			= pSlice->pSliceBsa;
-	SDqLayer *pCurLayer			= pEncCtx->pCurDqLayer;
-	SSliceCtx *pSliceCtx	= pCurLayer->pSliceEncCtx;
-	SMbCache *pMbCache			= &pSlice->sMbCacheInfo;
-	SMB *pMbList					= pCurLayer->sMbDataP;
-	SMB *pCurMb					= NULL;
-	int32_t iNumMbCoded		= 0;
-	int32_t	iNextMbIdx			= kiSliceFirstMbXY;
-	int32_t	iCurMbIdx			= -1;	
-	int32_t	iMbSkipRun			= 0;
-	const int32_t kiTotalNumMb	= pCurLayer->iMbWidth * pCurLayer->iMbHeight;
-	const int32_t kiMvdInterTableSize	= (pEncCtx->pSvcParam->iNumDependencyLayer == 1 ? 648: 972);
-	const int32_t kiMvdInterTableStride= 1+(kiMvdInterTableSize<<1);
-	uint16_t *pMvdCostTableInter		= &pEncCtx->pMvdCostTableInter[kiMvdInterTableSize];
-	const int32_t kiSliceIdx				= pSlice->uiSliceIdx;
-	const uint8_t kuiChromaQpIndexOffset= pCurLayer->sLayerInfo.pPpsP->uiChromaQpIndexOffset;
-
-	for(;;)
-	{
-		//point to current pMb
-		iCurMbIdx	= iNextMbIdx;
-		pCurMb = &pMbList[ iCurMbIdx ];		
-
-		//step(1): set QP for the current MB
-		pEncCtx->pFuncList->pfRc.pfWelsRcMbInit(pEncCtx, pCurMb, pSlice);
-		
-        //step (2). save some vale for future use, initial pWelsMd
-		pMd->iLambda = g_kiQpCostTable[pCurMb->uiLumaQp];
-		pMd->pMvdCost = &pMvdCostTableInter[pCurMb->uiLumaQp*kiMvdInterTableStride];
-		WelsMdIntraInit(pEncCtx, pCurMb, pMbCache, kiSliceFirstMbXY);
-        WelsMdInterInit(pEncCtx, pSlice, pCurMb, kiSliceFirstMbXY);
-		pEncCtx->pFuncList->pfInterMd(pEncCtx, pMd, pSlice, pCurMb, pMbCache);
-		//mb_qp
-
-		//step (4): save from the MD process from future use
-		WelsMdInterSaveSadAndRefMbType( (pCurLayer->pDecPic->uiRefMbType), pMbCache, pCurMb, pMd);
-
-		pEncCtx->pFuncList->pfInterMdBackgroundInfoUpdate( pCurLayer, pCurMb, pMbCache->bCollocatedPredFlag, pEncCtx->pRefPic->iPictureType );
-
-		//step (5): update cache
-		UpdateNonZeroCountCache( pCurMb, pMbCache );
-
-		//step (6): begin to write bit stream; if the pSlice size is controlled, the writing may be skipped
-		if( IS_SKIP (pCurMb->uiMbType) )
-		{
-			pCurMb->uiLumaQp	= pSlice->uiLastMbQp;
-			pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51(pCurMb->uiLumaQp + kuiChromaQpIndexOffset)];
-			
-			iMbSkipRun++;
-		}
-		else
-		{
-			BsWriteUE( pBs, iMbSkipRun );
-			iMbSkipRun = 0;
-			WelsSpatialWriteMbSyn( pEncCtx, pSlice, pCurMb );
-		}
-		
-		//step (7): reconstruct current MB
-		pCurMb->uiSliceIdc = kiSliceIdx;
-		OutputPMbWithoutConstructCsRsNoCopy( pEncCtx, pCurLayer, pSlice, pCurMb );
-		
-        #if defined(MB_TYPES_CHECK) 
-		WelsCountMbType( pEncCtx->sPerInfo.iMbCount, P_SLICE, pCurMb );		
-        #endif//MB_TYPES_CHECK			
-
-		//step (8): update status and other parameters
-		pEncCtx->pFuncList->pfRc.pfWelsRcMbInfoUpdate(pEncCtx,pCurMb,pMd->iCostLuma,pSlice);
-		
-		/*judge if all pMb in cur pSlice has been encoded*/
-		++ iNumMbCoded;
-		iNextMbIdx = WelsGetNextMbOfSlice( pSliceCtx, iCurMbIdx );
-		//whether all of MB in current pSlice encoded or not
-		if ( iNextMbIdx == -1 || iNextMbIdx >= kiTotalNumMb || iNumMbCoded >= kiTotalNumMb )
-		{
-			break;
-		}
-	}
-
-	if ( iMbSkipRun )
-	{
-		BsWriteUE( pBs, iMbSkipRun );
-	}
-}
-
-// Only for inter dynamic slicing
-void WelsMdInterMbLoopOverDynamicSlice( sWelsEncCtx* pEncCtx, SSlice *pSlice, void* pWelsMd, const int32_t kiSliceFirstMbXY )
-{
-	SWelsMD* pMd					= (SWelsMD*)pWelsMd;
-	SBitStringAux* pBs			= pSlice->pSliceBsa;
-	SDqLayer *pCurLayer			= pEncCtx->pCurDqLayer;
-	SSliceCtx *pSliceCtx	= pCurLayer->pSliceEncCtx;
-	SMbCache *pMbCache			= &pSlice->sMbCacheInfo;
-	SMB *pMbList					= pCurLayer->sMbDataP;
-	SMB *pCurMb					= NULL;
-	int32_t iNumMbCoded		= 0;
-	const int32_t kiTotalNumMb	= pCurLayer->iMbWidth * pCurLayer->iMbHeight;
-	int32_t	iNextMbIdx			= kiSliceFirstMbXY;
-	int32_t	iCurMbIdx			= -1;
-	int32_t	iMbSkipRun			= 0;	
-	const int32_t kiMvdInterTableSize	= (pEncCtx->pSvcParam->iNumDependencyLayer == 1 ? 648: 972);
-	const int32_t kiMvdInterTableStride= 1+(kiMvdInterTableSize<<1);
-	uint16_t *pMvdCostTableInter		= &pEncCtx->pMvdCostTableInter[kiMvdInterTableSize];
-	const int32_t kiSliceIdx				= pSlice->uiSliceIdx;
-	const int32_t kiPartitionId			= (kiSliceIdx % pEncCtx->iActiveThreadsNum);
-	const uint8_t kuiChromaQpIndexOffset= pCurLayer->sLayerInfo.pPpsP->uiChromaQpIndexOffset;
-
-	SDynamicSlicingStack sDss;
-	sDss.iStartPos = BsGetBitsPos(pBs);
-	for(;;)
-	{
-		//point to current pMb
-		iCurMbIdx	= iNextMbIdx;
-		pCurMb = &pMbList[ iCurMbIdx ];		
-
-		//step(1): set QP for the current MB
-		pEncCtx->pFuncList->pfRc.pfWelsRcMbInit(pEncCtx, pCurMb, pSlice);
-		// if already reaches the largest number of slices, set QPs to the upper bound
-		if (pSlice->bDynamicSlicingSliceSizeCtrlFlag)
-		{
-			//a clearer logic may be: 
-			//if there is no need from size control from the pSlice size, the QP will be decided by RC; else it will be set to the max QP
-			//    however, there are some parameter updating in the rc_mb_init() function, so it cannot be skipped?
-			pCurMb->uiLumaQp = pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId].iMaxQp;
-			pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51(pCurMb->uiLumaQp + kuiChromaQpIndexOffset)];
-		}
-
-		//step (2). save some vale for future use, initial pWelsMd
-		pMd->iLambda = g_kiQpCostTable[pCurMb->uiLumaQp];
-		pMd->pMvdCost = &pMvdCostTableInter[pCurMb->uiLumaQp*kiMvdInterTableStride];
-		
-		WelsMdIntraInit(pEncCtx, pCurMb, pMbCache, kiSliceFirstMbXY);
-		WelsMdInterInit(pEncCtx, pSlice, pCurMb, kiSliceFirstMbXY);
-		pEncCtx->pFuncList->pfInterMd(pEncCtx, pMd, pSlice, pCurMb, pMbCache);
-		//mb_qp
-
-		//step (4): save from the MD process from future use
-		WelsMdInterSaveSadAndRefMbType( (pCurLayer->pDecPic->uiRefMbType), pMbCache, pCurMb, pMd);
-
-		pEncCtx->pFuncList->pfInterMdBackgroundInfoUpdate( pCurLayer, pCurMb, pMbCache->bCollocatedPredFlag, pEncCtx->pRefPic->iPictureType );
-
-		//step (5): update cache
-		UpdateNonZeroCountCache( pCurMb, pMbCache );
-
-		//step (6): begin to write bit stream; if the pSlice size is controlled, the writing may be skipped
-
-		//DYNAMIC_SLICING_ONE_THREAD - MultiD
-		//stack pBs pointer
-		sDss.pBsStackBufPtr	= pBs->pBufPtr;
-		sDss.uiBsStackCurBits	= pBs->uiCurBits;
-		sDss.iBsStackLeftBits	= pBs->iLeftBits;
-		//stack Pskip status
-		sDss.iMbSkipRunStack = iMbSkipRun;
-		//DYNAMIC_SLICING_ONE_THREAD - MultiD
-
-		if( IS_SKIP (pCurMb->uiMbType) )
-		{
-			pCurMb->uiLumaQp	= pSlice->uiLastMbQp;
-			pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51(pCurMb->uiLumaQp + kuiChromaQpIndexOffset)];
-
-			iMbSkipRun++;
-		}
-		else
-		{
-			BsWriteUE( pBs, iMbSkipRun );
-			iMbSkipRun = 0;
-			WelsSpatialWriteMbSyn( pEncCtx, pSlice, pCurMb );
-		}		
-
-		//DYNAMIC_SLICING_ONE_THREAD - MultiD
-		sDss.iCurrentPos = BsGetBitsPos(pBs);
-		if ( DynSlcJudgeSliceBoundaryStepBack( pEncCtx, pSlice, pSliceCtx, pCurMb, &sDss ) )
-		{
-			//stack pBs pointer
-			pBs->pBufPtr		= sDss.pBsStackBufPtr;
-			pBs->uiCurBits	= sDss.uiBsStackCurBits;
-			pBs->iLeftBits	= sDss.iBsStackLeftBits;
-
-			iMbSkipRun = sDss.iMbSkipRunStack;
-
-			pCurLayer->pLastCodedMbIdxOfPartition[kiPartitionId] = iCurMbIdx-1;	// update pLastCodedMbIdxOfPartition, need to -1 due to stepping back
-			++ pCurLayer->pNumSliceCodedOfPartition[kiPartitionId];
-
-			break;
-		}
-
-		//step (7): reconstruct current MB
-		pCurMb->uiSliceIdc = kiSliceIdx;
-		OutputPMbWithoutConstructCsRsNoCopy( pEncCtx, pCurLayer, pSlice, pCurMb );
-
-#if defined(MB_TYPES_CHECK) 
-		WelsCountMbType( pEncCtx->sPerInfo.iMbCount, P_SLICE, pCurMb );		
-#endif//MB_TYPES_CHECK			
-
-		//step (8): update status and other parameters
-		pEncCtx->pFuncList->pfRc.pfWelsRcMbInfoUpdate(pEncCtx,pCurMb,pMd->iCostLuma,pSlice);
-
-		/*judge if all pMb in cur pSlice has been encoded*/
-		++ iNumMbCoded;
-		iNextMbIdx = WelsGetNextMbOfSlice( pSliceCtx, iCurMbIdx );
-		//whether all of MB in current pSlice encoded or not
-		if ( iNextMbIdx == -1 || iNextMbIdx >= kiTotalNumMb || iNumMbCoded >= kiTotalNumMb )
-		{
-			pCurLayer->pLastCodedMbIdxOfPartition[kiPartitionId] = iCurMbIdx;	// update pLastCodedMbIdxOfPartition, finish coding, use pCurMb_idx directly				
-			break;
-		}
-	}
-
-	if ( iMbSkipRun )
-	{
-		BsWriteUE( pBs, iMbSkipRun );
-	}
-}
-
-}//namespace WelsSVCEnc
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	svc_encode_slice.c
+ *
+ * \brief	svc encoding slice
+ *
+ * \date	2009.07.27 Created
+ *
+ *************************************************************************************
+ */
+
+#include <string.h>
+#include <assert.h>
+#include "ls_defines.h"
+#include "svc_encode_slice.h"
+#include "svc_enc_golomb.h"
+#include "svc_base_layer_md.h"
+#include "svc_encode_mb.h"
+#include "mv_pred.h"
+#include "svc_set_mb_syn_cavlc.h"
+#include "encode_mb_aux.h"
+#include "decode_mb_aux.h"
+#include "svc_mode_decision.h"
+#include "cpu_core.h"
+#include "svc_motion_estimate.h"
+#include "sample.h"
+#include "wels_func_ptr_def.h"
+#include "utils.h"
+
+namespace WelsSVCEnc {
+//#define ENC_TRACE
+
+typedef void (*PWelsCodingSliceFunc) (sWelsEncCtx* pCtx, SSlice* pSlice);  // same signature as e.g. WelsISliceMdEnc
+typedef void (*PWelsSliceHeaderWriteFunc) (SBitStringAux* pBs, SDqLayer* pCurLayer, SSlice* pSlice,
+    int32_t* pPpsIdDelta);  // same signature as WelsSliceHeaderWrite and WelsSliceHeaderExtWrite
+
+void UpdateNonZeroCountCache (SMB* pMb, SMbCache* pMbCache) {
+  // Entries 0..15 of pNonZeroCount go out as four LD32/ST32 moves to cache offsets 9, 17, 25, 33.
+  for (int32_t iRow = 0; iRow < 4; ++iRow) {
+    ST32 (&pMbCache->iNonZeroCoeffCount[9 + (iRow << 3)], LD32 (&pMb->pNonZeroCount[iRow << 2]));
+  }
+  // Entries 16..23 go out as four LD16/ST16 moves to cache offsets 14, 38, 22, 46 (in that order).
+  static const int32_t kiDstOffset[4] = { 14, 38, 22, 46 };
+  for (int32_t iPair = 0; iPair < 4; ++iPair) {
+    ST16 (&pMbCache->iNonZeroCoeffCount[kiDstOffset[iPair]], LD16 (&pMb->pNonZeroCount[16 + (iPair << 1)]));
+  }
+}
+
+void WelsSliceHeaderScalExtInit (SDqLayer* pCurLayer, SSlice* pSlice) {
+  // Clears bSliceSkipFlag unconditionally; the adaptive/default prediction flags are cleared
+  // only when the layer's NAL header extension carries a dependency id above 0 (spatial EL).
+  SSliceHeaderExt* pExt = &pSlice->sSliceHeaderExt;
+  const uint8_t kuiDid  = pCurLayer->sLayerInfo.sNalHeaderExt.uiDependencyId;
+
+  pExt->bSliceSkipFlag = false;
+
+  if (0 == kuiDid)
+    return;
+
+  // Both the adaptive and the default flags should be equal to 0.
+  pExt->bAdaptiveResidualPredFlag = false;
+  pExt->bAdaptiveMotionPredFlag   = false;
+  pExt->bAdaptiveBaseModeFlag     = false;
+  pExt->bDefaultResidualPredFlag  = false;
+  pExt->bDefaultMotionPredFlag    = false;
+  pExt->bDefaultBaseModeFlag      = false;
+}
+
+void WelsSliceHeaderExtInit (sWelsEncCtx* pEncCtx, SDqLayer* pCurLayer, SSlice* pSlice) {
+  // Populates pSlice's slice header and header extension from the encoder context and the
+  // current layer: slice type, first MB, frame/IDR/POC ids, active reference count, QP delta,
+  // deblocking parameters and the prediction flags.
+  SSliceHeaderExt* pExt = &pSlice->sSliceHeaderExt;
+  SSliceHeader* pHdr    = &pExt->sSliceHeader;
+
+  pHdr->eSliceType = pEncCtx->eSliceType;
+  pExt->bStoreRefBasePicFlag = false;
+  pHdr->iFirstMbInSlice = WelsGetFirstMbOfSlice (pCurLayer->pSliceEncCtx, pSlice->uiSliceIdx);
+
+  pHdr->iFrameNum       = pEncCtx->iFrameNum;
+  pHdr->uiIdrPicId      = pEncCtx->sPSOVector.uiIdrPicId;
+  pHdr->iPicOrderCntLsb = pEncCtx->pEncPic->iFramePoc;
+
+  if (P_SLICE == pEncCtx->eSliceType) {
+    // One active L0 reference by default; override with uiRefCount when it is positive and
+    // below the SPS iNumRefFrames. The flag is assigned on both paths (the original code
+    // notes this was needed "to solve mismatch between debug&release").
+    const bool kbOverride = (pHdr->uiRefCount > 0) &&
+                            (pHdr->uiRefCount < pCurLayer->sLayerInfo.pSpsP->iNumRefFrames);
+
+    pHdr->uiNumRefIdxL0Active          = 1;
+    pHdr->bNumRefIdxActiveOverrideFlag = kbOverride;
+    if (kbOverride)
+      pHdr->uiNumRefIdxL0Active = pHdr->uiRefCount;
+  }
+
+  pHdr->iSliceQpDelta = pEncCtx->iGlobalQp - pCurLayer->sLayerInfo.pPpsP->iPicInitQp;
+
+  // Deblocking: the alpha/beta offsets are copied as well because the slice header needs them
+  // whenever the loop filter idc is not 1.
+  pHdr->uiDisableDeblockingFilterIdc           = pCurLayer->iLoopFilterDisableIdc;
+  pHdr->iSliceAlphaC0Offset                    = pCurLayer->iLoopFilterAlphaC0Offset;
+  pHdr->iSliceBetaOffset                       = pCurLayer->iLoopFilterBetaOffset;
+  pExt->uiDisableInterLayerDeblockingFilterIdc = pCurLayer->uiDisableInterLayerDeblockingFilterIdc;
+
+  if (pSlice->bSliceHeaderExtFlag) {
+    WelsSliceHeaderScalExtInit (pCurLayer, pSlice);
+    return;
+  }
+
+  // Without a header extension both the adaptive and the default flags should be equal to 0.
+  pExt->bAdaptiveResidualPredFlag = false;
+  pExt->bAdaptiveMotionPredFlag   = false;
+  pExt->bAdaptiveBaseModeFlag     = false;
+  pExt->bDefaultResidualPredFlag  = false;
+  pExt->bDefaultMotionPredFlag    = false;
+  pExt->bDefaultBaseModeFlag      = false;
+}
+
+/* Count MB types per slice type (only compiled when MB_TYPES_CHECK is defined, for FRAME_INFO_OUTPUT). */
+#if defined(MB_TYPES_CHECK)
+void WelsCountMbType (int32_t (*iMbCount)[18], const EWelsSliceType keSt, const SMB* kpMb) {
+  // iMbCount[keSt][...] is the counter row for slice type keSt; a NULL table disables counting.
+  if (NULL == iMbCount)
+    return;
+  switch (kpMb->uiMbType) {
+  case MB_TYPE_INTRA4x4:
+    ++ iMbCount[keSt][Intra4x4];
+    break;
+  case MB_TYPE_INTRA16x16:
+    ++ iMbCount[keSt][Intra16x16];
+    break;
+  case MB_TYPE_SKIP:
+    ++ iMbCount[keSt][PSkip];
+    break;
+  case MB_TYPE_16x16:
+    ++ iMbCount[keSt][Inter16x16];
+    break;
+  case MB_TYPE_16x8:
+    ++ iMbCount[keSt][Inter16x8];
+    break;
+  case MB_TYPE_8x16:
+    ++ iMbCount[keSt][Inter8x16];  // row index is the keSt parameter (was the undeclared name eSt)
+    break;
+  case MB_TYPE_8x8:
+    ++ iMbCount[keSt][Inter8x8];
+    break;
+  case MB_TYPE_INTRA_BL:
+    ++ iMbCount[keSt][7];  // INTRA_BL uses the literal column 7; no named column is used here
+    break;
+  default:  // any other MB type is not tallied
+    break;
+  }
+}
+#endif//MB_TYPES_CHECK
+
+/*!
+* \brief write the reference picture list reordering syntax of a slice header (nothing for I/SI)
+*/
+void WriteReferenceReorder (SBitStringAux* pBs, SSliceHeader* sSliceHeader) {
+  SRefPicListReorderSyntax* pReorder = &sSliceHeader->sRefReordering;
+  const uint8_t kuiSliceType         = sSliceHeader->eSliceType % 5;
+
+  if (I_SLICE == kuiSliceType || SI_SLICE == kuiSliceType)
+    return;
+
+  BsWriteOneBit (pBs, true);
+  // Write entries until the one whose idc is 3 has been emitted; that entry ends the list.
+  for (int16_t n = 0; ; ++n) {
+    const uint16_t kuiIdc = pReorder->SReorderingSyntax[n].uiReorderingOfPicNumsIdc;
+    BsWriteUE (pBs, kuiIdc);
+    if (kuiIdc < 2) {
+      BsWriteUE (pBs, pReorder->SReorderingSyntax[n].uiAbsDiffPicNumMinus1);
+    } else if (2 == kuiIdc) {
+      BsWriteUE (pBs, pReorder->SReorderingSyntax[n].iLongTermPicNum);
+    }
+
+    if (3 == kuiIdc)
+      break;
+  }
+}
+
+/*!
+* \brief write the reference picture marking syntax of a slice header
+*/
+void WriteRefPicMarking (SBitStringAux* pBs, SSliceHeader* pSliceHeader, SNalUnitHeaderExt* pNalHdrExt) {
+  SRefPicMarking* pMarking = &pSliceHeader->sRefMarking;
+
+  if (pNalHdrExt->bIdrFlag) {
+    BsWriteOneBit (pBs, pMarking->bNoOutputOfPriorPicsFlag);
+    BsWriteOneBit (pBs, pMarking->bLongTermRefFlag);
+    return;
+  }
+
+  BsWriteOneBit (pBs, pMarking->bAdaptiveRefPicMarkingModeFlag);
+  if (!pMarking->bAdaptiveRefPicMarkingModeFlag)
+    return;
+
+  // Write each MMCO entry with the operands its type calls for; the entry of type 0 ends the list.
+  for (int16_t n = 0; ; ++n) {
+    const int32_t kiMmcoType = pMarking->SMmcoRef[n].iMmcoType;
+    BsWriteUE (pBs, kiMmcoType);
+    if (1 == kiMmcoType) {
+      BsWriteUE (pBs, pMarking->SMmcoRef[n].iDiffOfPicNum - 1);
+    } else if (2 == kiMmcoType) {
+      BsWriteUE (pBs, pMarking->SMmcoRef[n].iLongTermPicNum);
+    } else if (3 == kiMmcoType) {
+      BsWriteUE (pBs, pMarking->SMmcoRef[n].iDiffOfPicNum - 1);
+      BsWriteUE (pBs, pMarking->SMmcoRef[n].iLongTermFrameIdx);
+    } else if (4 == kiMmcoType) {
+      BsWriteUE (pBs, pMarking->SMmcoRef[n].iMaxLongTermFrameIdx + 1);
+    } else if (6 == kiMmcoType) {
+      BsWriteUE (pBs, pMarking->SMmcoRef[n].iLongTermFrameIdx);
+    }
+    if (0 == kiMmcoType)
+      break;
+  }
+}
+
+void WelsSliceHeaderWrite (SBitStringAux* pBs, SDqLayer* pCurLayer, SSlice* pSlice, int32_t* pPpsIdDelta) {
+  // Writes the slice header fields of pSlice into pBs in the order given below.
+  // pPpsIdDelta is indexed by the PPS id and its entry is added to the id that gets written.
+  SSliceHeader* pHdr         = &pSlice->sSliceHeaderExt.sSliceHeader;
+  SNalUnitHeaderExt* pNalExt = &pCurLayer->sLayerInfo.sNalHeaderExt;
+  SWelsSPS* pSps             = pCurLayer->sLayerInfo.pSpsP;
+  SWelsPPS* pPps             = pCurLayer->sLayerInfo.pPpsP;
+
+  // first MB, slice type, PPS id
+  BsWriteUE (pBs, pHdr->iFirstMbInSlice);
+  BsWriteUE (pBs, pHdr->eSliceType);
+  BsWriteUE (pBs, pHdr->pPps->iPpsId + pPpsIdDelta[pHdr->pPps->iPpsId]);
+
+  // frame number, IDR picture id (IDR NAL only), POC lsb
+  BsWriteBits (pBs, pSps->uiLog2MaxFrameNum, pHdr->iFrameNum);
+  if (pNalExt->bIdrFlag) {
+    BsWriteUE (pBs, pHdr->uiIdrPicId);
+  }
+  BsWriteBits (pBs, pSps->iLog2MaxPocLsb, pHdr->iPicOrderCntLsb);
+
+  // active reference count override (P slices only)
+  if (P_SLICE == pHdr->eSliceType) {
+    BsWriteOneBit (pBs, pHdr->bNumRefIdxActiveOverrideFlag);
+    if (pHdr->bNumRefIdxActiveOverrideFlag) {
+      BsWriteUE (pBs, pHdr->uiNumRefIdxL0Active - 1);
+    }
+  }
+
+  // reordering when the NAL is not IDR, marking when uiNalRefIdc is non-zero
+  if (!pNalExt->bIdrFlag) {
+    WriteReferenceReorder (pBs, pHdr);
+  }
+  if (pNalExt->sNalHeader.uiNalRefIdc) {
+    WriteRefPicMarking (pBs, pHdr, pNalExt);
+  }
+
+  BsWriteSE (pBs, pHdr->iSliceQpDelta);
+
+  if (!pPps->bDeblockingFilterControlPresentFlag) {
+    return;
+  }
+
+  const int32_t kiIdc = pHdr->uiDisableDeblockingFilterIdc;
+  // The stored idc is folded to the value written: {0,3,4,6} -> 0, 1 -> 1, {2,5} -> 2.
+  // Any other value is reported on stderr and nothing is written for it.
+  if (1 == kiIdc) {
+    BsWriteUE (pBs, 1);
+  } else if (2 == kiIdc || 5 == kiIdc) {
+    BsWriteUE (pBs, 2);
+  } else if (0 == kiIdc || 3 == kiIdc || 4 == kiIdc || 6 == kiIdc) {
+    BsWriteUE (pBs, 0);
+  } else {
+    fprintf (stderr, "pData error for deblocking");
+  }
+
+  if (1 != kiIdc) {
+    BsWriteSE (pBs, pHdr->iSliceAlphaC0Offset >> 1);
+    BsWriteSE (pBs, pHdr->iSliceBetaOffset >> 1);
+  }
+}
+
+void WelsSliceHeaderExtWrite (SBitStringAux* pBs, SDqLayer* pCurLayer, SSlice* pSlice, int32_t* pPpsIdDelta) {
+  // Writes the slice header of pSlice, including the extension fields, into pBs.
+  // pPpsIdDelta is indexed by the PPS id and its entry is added to the id that gets written.
+  SSliceHeaderExt* pExt      = &pSlice->sSliceHeaderExt;
+  SSliceHeader* pHdr         = &pExt->sSliceHeader;
+  SNalUnitHeaderExt* pNalExt = &pCurLayer->sLayerInfo.sNalHeaderExt;
+  SWelsSPS* pSps             = pCurLayer->sLayerInfo.pSpsP;
+  SWelsPPS* pPps             = pCurLayer->sLayerInfo.pPpsP;
+  SSubsetSps* pSubsetSps     = pCurLayer->sLayerInfo.pSubsetSpsP;
+
+  // first MB, slice type, PPS id
+  BsWriteUE (pBs, pHdr->iFirstMbInSlice);
+  BsWriteUE (pBs, pHdr->eSliceType);
+  BsWriteUE (pBs, pHdr->pPps->iPpsId + pPpsIdDelta[pHdr->pPps->iPpsId]);
+
+  // frame number, IDR picture id (IDR NAL only), POC lsb
+  BsWriteBits (pBs, pSps->uiLog2MaxFrameNum, pHdr->iFrameNum);
+  if (pNalExt->bIdrFlag) {
+    BsWriteUE (pBs, pHdr->uiIdrPicId);
+  }
+  BsWriteBits (pBs, pSps->iLog2MaxPocLsb, pHdr->iPicOrderCntLsb);
+
+  // active reference count override (P slices only)
+  if (P_SLICE == pHdr->eSliceType) {
+    BsWriteOneBit (pBs, pHdr->bNumRefIdxActiveOverrideFlag);
+    if (pHdr->bNumRefIdxActiveOverrideFlag) {
+      BsWriteUE (pBs, pHdr->uiNumRefIdxL0Active - 1);
+    }
+  }
+
+  if (!pNalExt->bIdrFlag) {
+    WriteReferenceReorder (pBs, pHdr);
+  }
+
+  if (pNalExt->sNalHeader.uiNalRefIdc) {
+    WriteRefPicMarking (pBs, pHdr, pNalExt);
+
+    if (!pSubsetSps->sSpsSvcExt.bSliceHeaderRestrictionFlag) {
+      BsWriteOneBit (pBs, pExt->bStoreRefBasePicFlag);
+    }
+  }
+
+  BsWriteSE (pBs, pHdr->iSliceQpDelta);
+
+  // Unlike WelsSliceHeaderWrite, the deblocking idc is written here exactly as stored.
+  if (pPps->bDeblockingFilterControlPresentFlag) {
+    BsWriteUE (pBs, pHdr->uiDisableDeblockingFilterIdc);
+    if (1 != pHdr->uiDisableDeblockingFilterIdc) {
+      BsWriteSE (pBs, pHdr->iSliceAlphaC0Offset >> 1);
+      BsWriteSE (pBs, pHdr->iSliceBetaOffset >> 1);
+    }
+  }
+
+#if !defined(DISABLE_FMO_FEATURE)
+  if (pPps->uiNumSliceGroups > 1 && pPps->uiSliceGroupMapType >= 3 && pPps->uiSliceGroupMapType <= 5 &&
+      pPps->uiSliceGroupChangeRate) {
+    const int32_t kiNumBits = WELS_CEILLOG2 (1 + pPps->uiPicSizeInMapUnits / pPps->uiSliceGroupChangeRate);
+    BsWriteBits (pBs, kiNumBits, pHdr->iSliceGroupChangeCycle);
+  }
+#endif//!DISABLE_FMO_FEATURE
+
+  // Disabled section: the condition is the constant false, so none of these fields are written.
+  if (false) {
+    BsWriteOneBit (pBs, pExt->bSliceSkipFlag);
+    if (pExt->bSliceSkipFlag) {
+      BsWriteUE (pBs, pExt->uiNumMbsInSlice - 1);
+    } else {
+      BsWriteOneBit (pBs, pExt->bAdaptiveBaseModeFlag);
+      if (!pExt->bAdaptiveBaseModeFlag) {
+        BsWriteOneBit (pBs, pExt->bDefaultBaseModeFlag);
+      }
+
+      if (!pExt->bDefaultBaseModeFlag) {
+        BsWriteOneBit (pBs, 0);
+        BsWriteOneBit (pBs, 0);
+      }
+
+      BsWriteOneBit (pBs, pExt->bAdaptiveResidualPredFlag);
+      if (!pExt->bAdaptiveResidualPredFlag) {
+        BsWriteOneBit (pBs, 0);
+      }
+    }
+    if (1 == pSubsetSps->sSpsSvcExt.bAdaptiveTcoeffLevelPredFlag) {
+      BsWriteOneBit (pBs, pExt->bTcoeffLevelPredFlag);
+    }
+  }
+
+  // Two 4-bit fields with the values 0 and 15 follow unless the subset SPS restricts the header.
+  if (!pSubsetSps->sSpsSvcExt.bSliceHeaderRestrictionFlag) {
+    BsWriteBits (pBs, 4, 0);
+    BsWriteBits (pBs, 4, 15);
+  }
+}
+
+// Per the original note: called only for inter MBs of the base layer and of spatial layers (uiQualityId = 0).
+// Inter part only: WelsDctMb on pEncMb[0] against pMemPredLuma, followed by WelsEncInterY.
+void WelsInterMbEncode (sWelsEncCtx* pEncCtx, SSlice* pSlice, SMB* pCurMb) {
+  SWelsFuncPtrList* pFuncs = pEncCtx->pFuncList;
+  SMbCache* pCache         = &pSlice->sMbCacheInfo;
+  const int32_t kiStride   = pEncCtx->pCurDqLayer->iEncStride[0];
+  WelsDctMb (pCache->pCoeffLevel, pCache->SPicData.pEncMb[0], kiStride, pCache->pMemPredLuma, pFuncs->pfDctFourT4);
+  WelsEncInterY (pFuncs, pCurMb, pCache);
+}
+
+
+// Per the original note: called only for MBs of the base layer and of spatial layers (uiQualityId = 0).
+// Used for I slices only.
+void WelsIMbChromaEncode (sWelsEncCtx* pEncCtx, SMB* pCurMb, SMbCache* pMbCache) {
+  SWelsFuncPtrList* pFuncs  = pEncCtx->pFuncList;
+  SDqLayer* pLayer          = pEncCtx->pCurDqLayer;
+  const int32_t kiSrcStride = pLayer->iEncStride[1];
+  const int32_t kiRecStride = pLayer->iCsStride[1];
+  int16_t* pCoeff           = pMbCache->pCoeffLevel;
+  uint8_t* pPredBase        = pMbCache->pBestPredIntraChroma;
+  uint8_t* pRec[2]          = { pMbCache->SPicData.pCsMb[1], pMbCache->SPicData.pCsMb[2] };
+
+  // Plane 1 (cb) first, then plane 2 (cr). The second plane sits 64 entries further into both
+  // the coefficient buffer and the prediction buffer.
+  for (int32_t iPlane = 1; iPlane <= 2; ++iPlane) {
+    int16_t* pRes  = pCoeff + 64 * (iPlane - 1);
+    uint8_t* pPred = pPredBase + 64 * (iPlane - 1);
+
+    pFuncs->pfDctFourT4 (pRes, pMbCache->SPicData.pEncMb[iPlane], kiSrcStride, pPred, 8);
+    WelsEncRecUV (pFuncs, pCurMb, pMbCache, pRes, iPlane);
+    pFuncs->pfIDctFourT4 (pRec[iPlane - 1], kiRecStride, pPred, 8, pRes);
+  }
+}
+
+
+// Per the original note: called only for inter MBs of the base layer and of spatial layers (uiQualityId = 0).
+// Used for P slices (intra part + inter part).
+void WelsPMbChromaEncode (sWelsEncCtx* pEncCtx, SSlice* pSlice, SMB* pCurMb) {
+  SMbCache* pCache         = &pSlice->sMbCacheInfo;
+  SWelsFuncPtrList* pFuncs = pEncCtx->pFuncList;
+  const int32_t kiStride   = pEncCtx->pCurDqLayer->iEncStride[1];
+  int16_t* pResFirst       = pCache->pCoeffLevel + 256;
+  int16_t* pResSecond      = pResFirst + 64;
+  uint8_t* pPredFirst      = pCache->pMemPredChroma;
+  uint8_t* pPredSecond     = pPredFirst + 64;
+  // Both planes (pEncMb[1], pEncMb[2]) are transformed first; WelsEncRecUV then runs on each in turn.
+  pFuncs->pfDctFourT4 (pResFirst, pCache->SPicData.pEncMb[1], kiStride, pPredFirst, 8);
+  pFuncs->pfDctFourT4 (pResSecond, pCache->SPicData.pEncMb[2], kiStride, pPredSecond, 8);
+  WelsEncRecUV (pFuncs, pCurMb, pCache, pResFirst, 1);
+  WelsEncRecUV (pFuncs, pCurMb, pCache, pResSecond, 2);
+}
+
+void OutputPMbWithoutConstructCsRsNoCopy (sWelsEncCtx* pCtx, SDqLayer* pDq, SSlice* pSlice, SMB* pMb) {
+  // Only inter and I_BL MBs are processed; per the original note intra MBs are already reconstructed.
+  if (! (IS_INTER (pMb->uiMbType) || IS_I_BL (pMb->uiMbType)))
+    return;
+  SMbCache* pCache        = &pSlice->sMbCacheInfo;
+  uint8_t* pY             = pCache->SPicData.pDecMb[0];
+  uint8_t* pU             = pCache->SPicData.pDecMb[1];
+  uint8_t* pV             = pCache->SPicData.pDecMb[2];
+  int16_t* pCoeff         = pCache->pCoeffLevel;
+  PIDctFunc pfIdct        = pCtx->pFuncList->pfIDctFourT4;
+  const int32_t kiStrideY = pDq->pDecPic->iLineSize[0];
+  const int32_t kiStrideC = pDq->pDecPic->iLineSize[1];
+  WelsIDctT4RecOnMb (pY, kiStrideY, pY, kiStrideY, pCoeff, pfIdct);  // each plane is both prediction and output
+  pfIdct (pU, kiStrideC, pU, kiStrideC, pCoeff + 256);
+  pfIdct (pV, kiStrideC, pV, kiStrideC, pCoeff + 320);
+}
+
+// Intra slices without dynamic slicing: mode decision plus encoding of every MB in pSlice.
+// Per the original notes this encapsulates two kinds of reconstruction:
+//  first. the base or highest Dependency Layer with only one quality (without CS RS reconstruction)
+//  second. layers below the highest Dependency Layer, each with one quality layer (single layer)
+void WelsISliceMdEnc (sWelsEncCtx* pEncCtx, SSlice* pSlice) { //pMd + encoding
+  SDqLayer* pLayer          = pEncCtx->pCurDqLayer;
+  SSliceCtx* pSliceCtx      = pLayer->pSliceEncCtx;
+  SMbCache* pCache          = &pSlice->sMbCacheInfo;
+  SMB* pMbList              = pLayer->sMbDataP;
+  const int32_t kiFirstMbXY = pSlice->sSliceHeaderExt.sSliceHeader.iFirstMbInSlice;
+  const int32_t kiMbTotal   = pLayer->iMbWidth * pLayer->iMbHeight;
+  const int32_t kiSliceIdx  = pSlice->uiSliceIdx;
+  const uint8_t kuiChromaQpOffset = pLayer->sLayerInfo.pPpsP->uiChromaQpIndexOffset;
+
+  int32_t iMbIdx            = kiFirstMbXY;
+  int32_t iMbCoded          = 0;
+  SWelsMD sMd;
+
+  // The body runs at least once. It stops when the next-MB lookup yields -1 or an index at or
+  // beyond the MB total of the layer, or when that many MBs have been coded.
+  do {
+    SMB* pCurMb = &pMbList[iMbIdx];
+
+    // QP: seeded from the global QP, then handed to the per-MB rate-control init callback
+    pCurMb->uiLumaQp   = pEncCtx->iGlobalQp;
+    pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51 (pCurMb->uiLumaQp + kuiChromaQpOffset)];
+    pEncCtx->pFuncList->pfRc.pfWelsRcMbInit (pEncCtx, pCurMb, pSlice);
+
+    // cost lambda looked up by the MB's luma QP
+    sMd.iLambda = g_kiQpCostTable[pCurMb->uiLumaQp];
+
+    // intra mode decision, cache update, then MB syntax output
+    WelsMdIntraInit (pEncCtx, pCurMb, pCache, kiFirstMbXY);
+    WelsMdIntraMb (pEncCtx, &sMd, pCurMb, pCache);
+    UpdateNonZeroCountCache (pCurMb, pCache);
+    WelsSpatialWriteMbSyn (pEncCtx, pSlice, pCurMb);
+
+    // tag the MB with the index of the slice that coded it
+    pCurMb->uiSliceIdc = kiSliceIdx;
+
+#if defined(MB_TYPES_CHECK)
+    WelsCountMbType (pEncCtx->sPerInfo.iMbCount, I_SLICE, pCurMb);
+#endif//MB_TYPES_CHECK
+
+    pEncCtx->pFuncList->pfRc.pfWelsRcMbInfoUpdate (pEncCtx, pCurMb, sMd.iCostLuma, pSlice);
+
+    // advance to the next MB of this slice
+    ++iMbCoded;
+    iMbIdx = WelsGetNextMbOfSlice (pSliceCtx, iMbIdx);
+  } while (iMbIdx != -1 && iMbIdx < kiMbTotal && iMbCoded < kiMbTotal);
+}
+
+// Intra (I) slice mode decision + MB syntax writing, used only for dynamic slicing (the slice may end early on a size boundary)
+void WelsISliceMdEncDynamic (sWelsEncCtx* pEncCtx, SSlice* pSlice) { // mode decision (MD) + encoding
+  SBitStringAux* pBs				= pSlice->pSliceBsa;
+  SDqLayer* pCurLayer				= pEncCtx->pCurDqLayer;
+  SSliceCtx* pSliceCtx		= pCurLayer->pSliceEncCtx;
+  SMbCache* pMbCache				= &pSlice->sMbCacheInfo;
+  SSliceHeaderExt* pSliceHdExt	= &pSlice->sSliceHeaderExt;
+  SMB* pMbList						= pCurLayer->sMbDataP;
+  SMB* pCurMb						= NULL;
+  const int32_t kiSliceFirstMbXY	= pSliceHdExt->sSliceHeader.iFirstMbInSlice;
+  int32_t iNextMbIdx				= kiSliceFirstMbXY;
+  const int32_t kiTotalNumMb		= pCurLayer->iMbWidth * pCurLayer->iMbHeight;
+  int32_t iCurMbIdx				= 0, iNumMbCoded = 0;
+  const int32_t kiSliceIdx				= pSlice->uiSliceIdx;
+  const int32_t kiPartitionId			= (kiSliceIdx % pEncCtx->iActiveThreadsNum); // index into the per-partition arrays of pCurLayer
+  const uint8_t kuiChromaQpIndexOffset = pCurLayer->sLayerInfo.pPpsP->uiChromaQpIndexOffset;
+
+  SWelsMD sMd;
+  SDynamicSlicingStack sDss;
+  sDss.iStartPos = BsGetBitsPos (pBs); // bit position at slice start; the boundary judge measures the slice size from here
+
+  for (; ;) {
+    iCurMbIdx	= iNextMbIdx;
+    pCurMb = &pMbList[ iCurMbIdx ];
+    pCurMb->uiLumaQp   = pEncCtx->iGlobalQp;
+    pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51 (pCurMb->uiLumaQp + kuiChromaQpIndexOffset)];
+
+    pEncCtx->pFuncList->pfRc.pfWelsRcMbInit (pEncCtx, pCurMb, pSlice);
+    // if the slice-size control flag is set (slice number reached its constraint), force the QPs to the layer's iMaxQp
+    if (pSlice->bDynamicSlicingSliceSizeCtrlFlag) {
+      pCurMb->uiLumaQp = pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId].iMaxQp;
+      pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51 (pCurMb->uiLumaQp + kuiChromaQpIndexOffset)];
+    }
+
+    sMd.iLambda = g_kiQpCostTable[pCurMb->uiLumaQp];
+
+    WelsMdIntraInit (pEncCtx, pCurMb, pMbCache, kiSliceFirstMbXY);
+    WelsMdIntraMb (pEncCtx, &sMd, pCurMb, pMbCache);
+    UpdateNonZeroCountCache (pCurMb, pMbCache);
+    // save the bit-writer state so the bits of this MB can be discarded if the MB has to move to the next slice
+    sDss.pBsStackBufPtr	= pBs->pBufPtr;
+    sDss.uiBsStackCurBits	= pBs->uiCurBits;
+    sDss.iBsStackLeftBits	= pBs->iLeftBits;
+
+    WelsSpatialWriteMbSyn (pEncCtx, pSlice, pCurMb);
+
+    sDss.iCurrentPos = BsGetBitsPos (pBs);
+
+    if (DynSlcJudgeSliceBoundaryStepBack (pEncCtx, pSlice, pSliceCtx, pCurMb, &sDss)) { // I slice: a boundary was added at pCurMb
+      // restore the saved bit-writer state: drops the bits just written for this MB
+      pBs->pBufPtr		= sDss.pBsStackBufPtr;
+      pBs->uiCurBits	= sDss.uiBsStackCurBits;
+      pBs->iLeftBits	= sDss.iBsStackLeftBits;
+
+      pCurLayer->pLastCodedMbIdxOfPartition[kiPartitionId] = iCurMbIdx -
+          1;	// update pLastCodedMbIdxOfPartition, need to -1 due to stepping back
+      ++ pCurLayer->pNumSliceCodedOfPartition[kiPartitionId];
+
+      break;
+    }
+
+    pCurMb->uiSliceIdc = kiSliceIdx;
+
+#if defined(MB_TYPES_CHECK)
+    WelsCountMbType (pEncCtx->sPerInfo.iMbCount, I_SLICE, pCurMb);
+#endif//MB_TYPES_CHECK
+
+    pEncCtx->pFuncList->pfRc.pfWelsRcMbInfoUpdate (pEncCtx, pCurMb, sMd.iCostLuma, pSlice);
+
+    ++iNumMbCoded;
+
+    iNextMbIdx = WelsGetNextMbOfSlice (pSliceCtx, iCurMbIdx);
+    // stop when the slice has no further MB (-1), the index leaves the picture, or every MB of the picture was coded
+    if (iNextMbIdx == -1 || iNextMbIdx >= kiTotalNumMb || iNumMbCoded >= kiTotalNumMb) {
+      pSliceCtx->pCountMbNumInSlice[kiSliceIdx]	= iCurMbIdx - pCurLayer->pLastCodedMbIdxOfPartition[kiPartitionId];
+      pCurLayer->pLastCodedMbIdxOfPartition[kiPartitionId] =
+        iCurMbIdx;	// update pLastCodedMbIdxOfPartition, finish coding, use iCurMbIdx directly
+      break;
+    }
+  }
+}
+
+// One entry point that covers two kinds of reconstruction:
+//  1. base layer, or the highest dependency layer, with only one quality layer (without CS/RS reconstruction);
+//  2. a dependency layer lower than the highest one, each dependency layer having a single quality layer.
+void WelsPSliceMdEnc (sWelsEncCtx* pEncCtx, SSlice* pSlice,  const bool_t kbIsHighestDlayerFlag) { // MD + encoding
+  const SSliceHeader* pHeader = &pSlice->sSliceHeaderExt.sSliceHeader;
+  SWelsMD sModeDecision;
+
+  // seed the mode-decision state from the slice header and the layer flag
+  sModeDecision.uiRef       = pHeader->uiRefIndex;
+  sModeDecision.bMdUsingSad = kbIsHighestDlayerFlag;
+
+  // the ME state is zeroed unless a base layer is available AND this is the highest dependency layer
+  if (! (pEncCtx->pCurDqLayer->bBaseLayerAvailableFlag && kbIsHighestDlayerFlag)) {
+    memset (&sModeDecision.sMe, 0, sizeof (sModeDecision.sMe));
+  }
+  WelsMdInterMbLoop (pEncCtx, pSlice, &sModeDecision, pHeader->iFirstMbInSlice); // per-MB loop
+}
+
+void WelsPSliceMdEncDynamic (sWelsEncCtx* pEncCtx, SSlice* pSlice, const bool_t kbIsHighestDlayerFlag) {
+  // P-slice MD + encoding for dynamic slicing: prepares the MD state, then runs the dynamic-slice MB loop
+  const SSliceHeader* pHeader = &pSlice->sSliceHeaderExt.sSliceHeader;
+  SWelsMD sModeDecision;
+
+  sModeDecision.uiRef       = pHeader->uiRefIndex;
+  sModeDecision.bMdUsingSad = kbIsHighestDlayerFlag;
+
+  // the ME state is zeroed unless a base layer is available AND this is the highest dependency layer
+  if (! (pEncCtx->pCurDqLayer->bBaseLayerAvailableFlag && kbIsHighestDlayerFlag)) {
+    memset (&sModeDecision.sMe, 0, sizeof (sModeDecision.sMe));
+  }
+  WelsMdInterMbLoopOverDynamicSlice (pEncCtx, pSlice, &sModeDecision, pHeader->iFirstMbInSlice); // per-MB loop
+}
+
+void WelsCodePSlice (sWelsEncCtx* pEncCtx, SSlice* pSlice) {
+  // Codes one P slice without dynamic slicing; slice-level init must be done by the caller before this call.
+  SDqLayer* pLayer = pEncCtx->pCurDqLayer;
+  const bool_t kbHasBaseLayer = pLayer->bBaseLayerAvailableFlag;
+  const bool_t kbIsTopSpatialLayer = pEncCtx->pSvcParam->iNumDependencyLayer ==
+                                     (pLayer->sLayerInfo.sNalHeaderExt.uiDependencyId + 1);
+  PInterMdFunc pfSelectedMd = (PInterMdFunc)WelsMdInterMb; // default inter MD routine
+
+  // MD switch: the enhancement-layer routine is used only on the top spatial layer when a base layer is available
+  if (kbHasBaseLayer && kbIsTopSpatialLayer) {
+    pfSelectedMd = (PInterMdFunc)WelsMdInterMbEnhancelayer;
+  }
+  pEncCtx->pFuncList->pfInterMd = pfSelectedMd;
+
+  // run mode decision + encoding over the MBs of the slice
+  WelsPSliceMdEnc (pEncCtx, pSlice, kbIsTopSpatialLayer);
+}
+
+void WelsCodePOverDynamicSlice (sWelsEncCtx* pEncCtx, SSlice* pSlice) {
+  // Codes one P slice under dynamic slicing; slice-level init must be done by the caller before this call.
+  SDqLayer* pLayer = pEncCtx->pCurDqLayer;
+  const bool_t kbHasBaseLayer = pLayer->bBaseLayerAvailableFlag;
+  const bool_t kbIsTopSpatialLayer = pEncCtx->pSvcParam->iNumDependencyLayer ==
+                                     (pLayer->sLayerInfo.sNalHeaderExt.uiDependencyId + 1);
+  PInterMdFunc pfSelectedMd = (PInterMdFunc)WelsMdInterMb; // default inter MD routine
+
+  // MD switch: the enhancement-layer routine is used only on the top spatial layer when a base layer is available
+  if (kbHasBaseLayer && kbIsTopSpatialLayer) {
+    pfSelectedMd = (PInterMdFunc)WelsMdInterMbEnhancelayer;
+  }
+  pEncCtx->pFuncList->pfInterMd = pfSelectedMd;
+
+  // run mode decision + encoding over the MBs of the slice, with slice-boundary step-back
+  WelsPSliceMdEncDynamic (pEncCtx, pSlice, kbIsTopSpatialLayer);
+}
+
+// 1st index (bIdrFlag in WelsCodeOneSlice): 0: for P pSlice; 1: for I pSlice;
+// 2nd index (dynamic-slicing flag): 0: for non-dynamic pSlice; 1: for dynamic pSlice (P or I);
+PWelsCodingSliceFunc	g_pWelsSliceCoding[2][2] = {
+  { WelsCodePSlice, WelsCodePOverDynamicSlice },	// P SSlice
+  { WelsISliceMdEnc, WelsISliceMdEncDynamic }	// I SSlice
+};
+PWelsSliceHeaderWriteFunc		g_pWelsWriteSliceHeader[2] = {	// indexed by bSliceHeaderExtFlag: 0: for base; 1: for ext;
+  WelsSliceHeaderWrite,
+  WelsSliceHeaderExtWrite
+};
+
+// Codes slice kiSliceIdx of the current layer: slice header, all MBs, RBSP trailing bits, flush. kiNalType is not used in this body.
+void WelsCodeOneSlice (sWelsEncCtx* pEncCtx, const int32_t kiSliceIdx, const int32_t kiNalType) {
+  SDqLayer* pCurLayer					= pEncCtx->pCurDqLayer;
+  SNalUnitHeaderExt* pNalHeadExt	= &pCurLayer->sLayerInfo.sNalHeaderExt;
+  SSlice* pCurSlice					= &pCurLayer->sLayerInfo.pSliceInLayer[kiSliceIdx];
+  SBitStringAux* pBs					= pCurSlice->pSliceBsa;
+  const int32_t kiDynamicSliceFlag	= (pEncCtx->pSvcParam->sDependencyLayers[pEncCtx->uiDependencyId].sMso.uiSliceMode ==
+                                       SM_DYN_SLICE); // 1 when the current dependency layer uses dynamic slicing, else 0
+
+  assert (kiSliceIdx == pCurSlice->uiSliceIdx);
+
+  if (I_SLICE == pEncCtx->eSliceType) {
+    pNalHeadExt->bIdrFlag = 1; // every I slice is flagged as IDR here
+    pCurSlice->sScaleShift = 0;
+  } else { // NOTE(review): bIdrFlag is not cleared on this path; presumably it is initialised elsewhere - confirm
+    const uint32_t kuiTemporalId = pNalHeadExt->uiTemporalId;
+    pCurSlice->sScaleShift = kuiTemporalId ? (kuiTemporalId - pEncCtx->pRefPic->uiTemporalId) : 0;
+  }
+
+  WelsSliceHeaderExtInit (pEncCtx, pCurLayer, pCurSlice);
+
+  // write the slice header with the base or the extension writer, selected by bSliceHeaderExtFlag
+  g_pWelsWriteSliceHeader[pCurSlice->bSliceHeaderExtFlag] (pBs, pCurLayer, pCurSlice,
+      & (pEncCtx->sPSOVector.sParaSetOffsetVariable[PARA_SET_TYPE_PPS].iParaSetIdDelta[0]));
+#if _DEBUG
+  if (pEncCtx->sPSOVector.bEnableSpsPpsIdAddition) {
+    const int32_t kiEncoderPpsId    = pCurSlice->sSliceHeaderExt.sSliceHeader.pPps->iPpsId;
+    const int32_t kiTmpPpsIdInBs = kiEncoderPpsId +
+                                   pEncCtx->sPSOVector.sParaSetOffsetVariable[PARA_SET_TYPE_PPS].iParaSetIdDelta[ kiEncoderPpsId ];
+    assert (MAX_PPS_COUNT > kiTmpPpsIdInBs);
+
+    // when the id offset is enabled, make sure the offset PPS id is marked as used in the bitstream
+    assert (pEncCtx->sPSOVector.sParaSetOffsetVariable[PARA_SET_TYPE_PPS].bUsedParaSetIdInBs[kiTmpPpsIdInBs]);
+  }
+#endif
+
+  pCurSlice->uiLastMbQp = pCurLayer->sLayerInfo.pPpsP->iPicInitQp + pCurSlice->sSliceHeaderExt.sSliceHeader.iSliceQpDelta;
+  // dispatch to the slice coder: [bIdrFlag: 0 = P, 1 = I][kiDynamicSliceFlag]
+  g_pWelsSliceCoding[pNalHeadExt->bIdrFlag][kiDynamicSliceFlag] (pEncCtx, pCurSlice);
+
+  BsRbspTrailingBits (pBs);
+
+  BsFlush (pBs);
+}
+
+// Recomputes uiSliceIdc and uiNeighborAvail for the MBs that start the next slice (about one MB row), bounded by kiLastMbIdxInPartition
+void UpdateMbNeighbourInfoForNextSlice (SSliceCtx* pSliceCtx,
+                                        SMB* pMbList,
+                                        const int32_t kiFirstMbIdxOfNextSlice,
+                                        const int32_t kiLastMbIdxInPartition) {
+  const int32_t kiMbWidth					= pSliceCtx->iMbWidth;
+  int32_t iIdx								= kiFirstMbIdxOfNextSlice;
+  int32_t	iNextSliceFirstMbIdxRowStart = ((kiFirstMbIdxOfNextSlice % kiMbWidth) ? 1 : 0); // 1 when the first MB is NOT at the start of a row
+  int32_t iCountMbUpdate					= kiMbWidth +
+                                    iNextSliceFirstMbIdxRowStart; //need to update MB(iMbXY+1) to MB(iMbXY+1+row) in common case
+  const int32_t kiEndMbNeedUpdate		= kiFirstMbIdxOfNextSlice + iCountMbUpdate; // exclusive end of the update range
+  SMB* pMb									= &pMbList[iIdx];
+
+  do {
+    uint32_t uiNeighborAvailFlag	= 0;
+    const int32_t kiMbXY				= pMb->iMbXY;
+    const int32_t kiMbX				= pMb->iMbX;
+    const int32_t kiMbY				= pMb->iMbY;
+    BOOL_T     bLeft;
+    BOOL_T     bTop;
+    BOOL_T     bLeftTop;
+    BOOL_T     bRightTop;
+    int32_t   iLeftXY, iTopXY, iLeftTopXY, iRightTopXY;
+    const uint8_t  kuiSliceIdc		= WelsMbToSliceIdc (pSliceCtx, kiMbXY);
+
+    pMb->uiSliceIdc	= kuiSliceIdc;
+    iLeftXY = kiMbXY - 1;
+    iTopXY = kiMbXY - kiMbWidth;
+    iLeftTopXY = iTopXY - 1;
+    iRightTopXY = iTopXY + 1;
+
+    // a neighbour is available only if it lies inside the picture and maps to the same slice; the position test comes first so the lookup is skipped at the picture edge
+    bLeft = (kiMbX > 0) && (kuiSliceIdc == WelsMbToSliceIdc (pSliceCtx, iLeftXY));
+    bTop = (kiMbY > 0) && (kuiSliceIdc == WelsMbToSliceIdc (pSliceCtx, iTopXY));
+    bLeftTop = (kiMbX > 0) && (kiMbY > 0) && (kuiSliceIdc == WelsMbToSliceIdc (pSliceCtx, iLeftTopXY));
+    bRightTop = (kiMbX < (kiMbWidth - 1)) && (kiMbY > 0) && (kuiSliceIdc == WelsMbToSliceIdc (pSliceCtx, iRightTopXY));
+
+    if (bLeft) {
+      uiNeighborAvailFlag |= LEFT_MB_POS;
+    }
+    if (bTop) {
+      uiNeighborAvailFlag |= TOP_MB_POS;
+    }
+    if (bLeftTop) {
+      uiNeighborAvailFlag |= TOPLEFT_MB_POS;
+    }
+    if (bRightTop) {
+      uiNeighborAvailFlag |= TOPRIGHT_MB_POS;
+    }
+    pMb->uiNeighborAvail	= (uint8_t)uiNeighborAvailFlag;
+
+    ++ pMb;
+    ++ iIdx;
+  } while ((iIdx < kiEndMbNeedUpdate) &&
+           (iIdx <= kiLastMbIdxInPartition)); // do-while: the first MB is always updated, even if already past either bound
+}
+
+// Closes pCurSlice at pCurMb and sets up the next slice of the partition to start at iFirstMbIdxOfNextSlice (header copy, MB map, neighbour info)
+void AddSliceBoundary (sWelsEncCtx* pEncCtx, SSlice* pCurSlice, SSliceCtx* pSliceCtx, SMB* pCurMb,
+                       int32_t iFirstMbIdxOfNextSlice, const int32_t kiLastMbIdxInPartition) {
+  SDqLayer*	pCurLayer = pEncCtx->pCurDqLayer;
+  int32_t		iCurMbIdx		= pCurMb->iMbXY;
+  int32_t		iCurSliceIdc	= pSliceCtx->pOverallMbMap[ iCurMbIdx ];
+  const int32_t kiSliceIdxStep = pEncCtx->iActiveThreadsNum;
+  int32_t		iNextSliceIdc	= iCurSliceIdc + kiSliceIdxStep; // slice indices within one partition advance by the active thread count
+  SSlice*		pNextSlice		= NULL;
+
+  SMB* pMbList					= pCurLayer->sMbDataP;
+
+  //update cur pSlice info
+  pCurSlice->sSliceHeaderExt.uiNumMbsInSlice	= 1 + iCurMbIdx - pCurSlice->sSliceHeaderExt.sSliceHeader.iFirstMbInSlice;
+
+  //pNextSlice pointer/initialization
+  pNextSlice = & (pCurLayer->sLayerInfo.pSliceInLayer[ iNextSliceIdc ]);
+
+#if _DEBUG
+  assert (NULL != pNextSlice);
+  // now ( pSliceCtx->iSliceNumInFrame < pSliceCtx->iMaxSliceNumConstraint ) always true by the call of this pFunc
+#endif
+
+  //init next pSlice info: header-ext flag follows the NAL unit type, header contents are copied from the current slice
+  pNextSlice->bSliceHeaderExtFlag =
+    (NAL_UNIT_CODED_SLICE_EXT == pCurLayer->sLayerInfo.sNalHeaderExt.sNalHeader.eNalUnitType);
+  memcpy (&pNextSlice->sSliceHeaderExt, &pCurSlice->sSliceHeaderExt,
+          sizeof (SSliceHeaderExt));	// confirmed_safe_unsafe_usage
+
+  pSliceCtx->pFirstMbInSlice[iNextSliceIdc] = iFirstMbIdxOfNextSlice;
+
+#if !defined(MT_ENABLED)
+  pNextSlice->uiSliceIdx = iNextSliceIdc;
+  pNextSlice->pSliceBsa = & (pEncCtx->pOut->sBsWrite);
+#endif//!MT_ENABLED
+
+  // remap every MB from the new slice start to the end of the partition onto the next slice index (stored as uint8_t)
+  memset (pSliceCtx->pOverallMbMap + iFirstMbIdxOfNextSlice, (uint8_t)iNextSliceIdc,
+          (kiLastMbIdxInPartition - iFirstMbIdxOfNextSlice + 1)*sizeof (uint8_t));
+
+  //DYNAMIC_SLICING_ONE_THREAD: update pMbList slice_neighbor_info
+  UpdateMbNeighbourInfoForNextSlice (pSliceCtx, pMbList, iFirstMbIdxOfNextSlice, kiLastMbIdxInPartition);
+}
+
+BOOL_T DynSlcJudgeSliceBoundaryStepBack (void* pCtx, void* pSlice, SSliceCtx* pSliceCtx, SMB* pCurMb, // TRUE: a boundary was added at pCurMb, caller must roll back its bits; FALSE otherwise
+    SDynamicSlicingStack* pDss) { // pDss->iStartPos / iCurrentPos are the bit positions used to measure the slice size so far
+  sWelsEncCtx* pEncCtx = (sWelsEncCtx*)pCtx;
+  SSlice* pCurSlice = (SSlice*)pSlice;
+  int32_t		   iCurMbIdx  = pCurMb->iMbXY;
+  uint32_t        uiLen = 0;
+  int32_t		   iPosBitOffset = 0;
+  const int32_t  kiActiveThreadsNum = pEncCtx->iActiveThreadsNum;
+  const int32_t  kiPartitaionId = pCurSlice->uiSliceIdx % kiActiveThreadsNum;
+  const int32_t  kiLastMbIdxInPartition	= pEncCtx->pCurDqLayer->pLastMbIdxOfPartition[kiPartitaionId];
+  // MB 0 has no predecessor: guard the [iCurMbIdx - 1] lookup so index -1 is never read (MB 0 is then the first MB of its slice)
+  const BOOL_T    kbCurMbNotFirstMbOfCurSlice      = (iCurMbIdx > 0) && (pSliceCtx->pOverallMbMap[iCurMbIdx] ==
+      pSliceCtx->pOverallMbMap[iCurMbIdx - 1]);
+  const BOOL_T    kbCurMbNotLastMbOfCurPartition = iCurMbIdx < kiLastMbIdxInPartition;
+  const BOOL_T    kbSliceNumNotExceedConstraint       = pSliceCtx->iSliceNumInFrame <
+      pSliceCtx->iMaxSliceNumConstraint; /*tmp choice to avoid complex memory operation, 100520, to be modify*/
+  const BOOL_T    kbSliceNumReachConstraint               = (pSliceCtx->iSliceNumInFrame ==
+      pSliceCtx->iMaxSliceNumConstraint); // NOTE(review): iSliceNumInFrame is sampled here, before the mutex below is taken - confirm this is intended
+
+  if (pCurSlice->bDynamicSlicingSliceSizeCtrlFlag) // size control already active for this slice: never split again
+    return false;
+
+  iPosBitOffset = (pDss->iCurrentPos - pDss->iStartPos);
+#if _DEBUG
+  assert (iPosBitOffset >= 0);
+#endif
+  uiLen = ((iPosBitOffset >> 3) + ((iPosBitOffset & 0x07) ? 1 : 0)); // slice size so far in bytes, rounded up
+
+#ifdef MT_ENABLED
+  if (pEncCtx->pSvcParam->iMultipleThreadIdc > 1)
+    WelsMutexLock (&pEncCtx->pSliceThreading->mutexSliceNumUpdate);
+#endif//MT_ENABLED
+
+  //DYNAMIC_SLICING_ONE_THREAD: judge jump_avoiding_pack_exceed
+  if (
+    ((kbCurMbNotFirstMbOfCurSlice
+      && JUMPPACKETSIZE_JUDGE (uiLen, iCurMbIdx, pSliceCtx->uiSliceSizeConstraint)) /*jump_avoiding_pack_exceed*/
+     && kbCurMbNotLastMbOfCurPartition) //decide to add new pSlice
+    && (kbSliceNumNotExceedConstraint
+#ifdef MT_ENABLED
+        && ((pCurSlice->uiSliceIdx + kiActiveThreadsNum) < pSliceCtx->iMaxSliceNumConstraint)
+#endif//MT_ENABLED
+       )//able to add new pSlice
+
+  ) {
+    // start a new slice at the current MB and count it
+    AddSliceBoundary (pEncCtx, pCurSlice, pSliceCtx, pCurMb, iCurMbIdx, kiLastMbIdxInPartition);
+
+    ++ pSliceCtx->iSliceNumInFrame;
+
+#ifdef MT_ENABLED
+    if (pEncCtx->pSvcParam->iMultipleThreadIdc > 1)
+      WelsMutexUnlock (&pEncCtx->pSliceThreading->mutexSliceNumUpdate);
+#endif//MT_ENABLED
+
+    return TRUE;
+  }
+
+  if (
+    (kbSliceNumReachConstraint
+#ifdef MT_ENABLED
+     || ((pCurSlice->uiSliceIdx + kiActiveThreadsNum) >= pSliceCtx->iMaxSliceNumConstraint)
+#endif//MT_ENABLED
+    )
+    && ((JUMPPACKETSIZE_JUDGE (uiLen,	iCurMbIdx,
+                               pSliceCtx->uiSliceSizeConstraint - ((kiLastMbIdxInPartition - iCurMbIdx) <<
+                                   (pCurSlice->uiAssumeLog2BytePerMb) /* assume each MB consumes two byte under largest QP */)))
+        && kbCurMbNotLastMbOfCurPartition) //risk of exceeding the size constraint when pSlice num reaches constraint
+  ) {
+    pCurSlice->bDynamicSlicingSliceSizeCtrlFlag = true; // no more slices can be added: callers switch the remaining MBs to the maximum QP
+  }
+
+#ifdef MT_ENABLED
+  if (pEncCtx->pSvcParam->iMultipleThreadIdc > 1)
+    WelsMutexUnlock (&pEncCtx->pSliceThreading->mutexSliceNumUpdate);
+#endif//MT_ENABLED
+
+  return FALSE;
+}
+
+///////////////
+//  pMb loop
+///////////////
+// Per-MB loop for an inter (P) slice without dynamic slicing: QP init, mode decision, syntax writing, reconstruction, RC update
+void WelsMdInterMbLoop (sWelsEncCtx* pEncCtx, SSlice* pSlice, void* pWelsMd, const int32_t kiSliceFirstMbXY) {
+  SWelsMD* pMd					= (SWelsMD*)pWelsMd;
+  SBitStringAux* pBs			= pSlice->pSliceBsa;
+  SDqLayer* pCurLayer			= pEncCtx->pCurDqLayer;
+  SSliceCtx* pSliceCtx	= pCurLayer->pSliceEncCtx;
+  SMbCache* pMbCache			= &pSlice->sMbCacheInfo;
+  SMB* pMbList					= pCurLayer->sMbDataP;
+  SMB* pCurMb					= NULL;
+  int32_t iNumMbCoded		= 0;
+  int32_t	iNextMbIdx			= kiSliceFirstMbXY;
+  int32_t	iCurMbIdx			= -1;
+  int32_t	iMbSkipRun			= 0;
+  const int32_t kiTotalNumMb	= pCurLayer->iMbWidth * pCurLayer->iMbHeight;
+  const int32_t kiMvdInterTableSize	= (pEncCtx->pSvcParam->iNumDependencyLayer == 1 ? 648 : 972);
+  const int32_t kiMvdInterTableStride = 1 + (kiMvdInterTableSize << 1); // entries per QP row: 2 * size + 1
+  uint16_t* pMvdCostTableInter		= &pEncCtx->pMvdCostTableInter[kiMvdInterTableSize]; // offset by size - presumably the centre of row 0; TODO confirm
+  const int32_t kiSliceIdx				= pSlice->uiSliceIdx;
+  const uint8_t kuiChromaQpIndexOffset = pCurLayer->sLayerInfo.pPpsP->uiChromaQpIndexOffset;
+
+  for (;;) {
+    //point to current pMb
+    iCurMbIdx	= iNextMbIdx;
+    pCurMb = &pMbList[ iCurMbIdx ];
+
+    //step(1): set QP for the current MB
+    pEncCtx->pFuncList->pfRc.pfWelsRcMbInit (pEncCtx, pCurMb, pSlice);
+
+    //step (2). save some values for future use, initialise pWelsMd (lambda and MVD cost row selected by the luma QP)
+    pMd->iLambda = g_kiQpCostTable[pCurMb->uiLumaQp];
+    pMd->pMvdCost = &pMvdCostTableInter[pCurMb->uiLumaQp * kiMvdInterTableStride];
+    WelsMdIntraInit (pEncCtx, pCurMb, pMbCache, kiSliceFirstMbXY);
+    WelsMdInterInit (pEncCtx, pSlice, pCurMb, kiSliceFirstMbXY);
+    pEncCtx->pFuncList->pfInterMd (pEncCtx, pMd, pSlice, pCurMb, pMbCache);
+    //mb_qp
+
+    //step (4): save results of the MD process for future use
+    WelsMdInterSaveSadAndRefMbType ((pCurLayer->pDecPic->uiRefMbType), pMbCache, pCurMb, pMd);
+
+    pEncCtx->pFuncList->pfInterMdBackgroundInfoUpdate (pCurLayer, pCurMb, pMbCache->bCollocatedPredFlag,
+        pEncCtx->pRefPic->iPictureType);
+
+    //step (5): update cache
+    UpdateNonZeroCountCache (pCurMb, pMbCache);
+
+    //step (6): write the bit stream; a skipped MB writes no syntax and only extends the skip run
+    if (IS_SKIP (pCurMb->uiMbType)) {
+      pCurMb->uiLumaQp	= pSlice->uiLastMbQp;
+      pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51 (pCurMb->uiLumaQp + kuiChromaQpIndexOffset)];
+
+      iMbSkipRun++;
+    } else {
+      BsWriteUE (pBs, iMbSkipRun); // emit the pending skip-run count before the coded MB
+      iMbSkipRun = 0;
+      WelsSpatialWriteMbSyn (pEncCtx, pSlice, pCurMb);
+    }
+
+    //step (7): reconstruct current MB
+    pCurMb->uiSliceIdc = kiSliceIdx;
+    OutputPMbWithoutConstructCsRsNoCopy (pEncCtx, pCurLayer, pSlice, pCurMb);
+
+#if defined(MB_TYPES_CHECK)
+    WelsCountMbType (pEncCtx->sPerInfo.iMbCount, P_SLICE, pCurMb);
+#endif//MB_TYPES_CHECK
+
+    //step (8): update status and other parameters
+    pEncCtx->pFuncList->pfRc.pfWelsRcMbInfoUpdate (pEncCtx, pCurMb, pMd->iCostLuma, pSlice);
+
+    /*judge if all pMb in cur pSlice has been encoded*/
+    ++ iNumMbCoded;
+    iNextMbIdx = WelsGetNextMbOfSlice (pSliceCtx, iCurMbIdx);
+    // stop when the slice has no further MB (-1), the index leaves the picture, or every MB of the picture was coded
+    if (iNextMbIdx == -1 || iNextMbIdx >= kiTotalNumMb || iNumMbCoded >= kiTotalNumMb) {
+      break;
+    }
+  }
+
+  if (iMbSkipRun) { // the slice ended on skipped MBs: write the trailing skip run
+    BsWriteUE (pBs, iMbSkipRun);
+  }
+}
+
+// Per-MB loop for an inter (P) slice under dynamic slicing: as the non-dynamic loop, plus bit-writer rollback when a slice boundary is added
+void WelsMdInterMbLoopOverDynamicSlice (sWelsEncCtx* pEncCtx, SSlice* pSlice, void* pWelsMd,
+                                        const int32_t kiSliceFirstMbXY) {
+  SWelsMD* pMd					= (SWelsMD*)pWelsMd;
+  SBitStringAux* pBs			= pSlice->pSliceBsa;
+  SDqLayer* pCurLayer			= pEncCtx->pCurDqLayer;
+  SSliceCtx* pSliceCtx	= pCurLayer->pSliceEncCtx;
+  SMbCache* pMbCache			= &pSlice->sMbCacheInfo;
+  SMB* pMbList					= pCurLayer->sMbDataP;
+  SMB* pCurMb					= NULL;
+  int32_t iNumMbCoded		= 0;
+  const int32_t kiTotalNumMb	= pCurLayer->iMbWidth * pCurLayer->iMbHeight;
+  int32_t	iNextMbIdx			= kiSliceFirstMbXY;
+  int32_t	iCurMbIdx			= -1;
+  int32_t	iMbSkipRun			= 0;
+  const int32_t kiMvdInterTableSize	= (pEncCtx->pSvcParam->iNumDependencyLayer == 1 ? 648 : 972);
+  const int32_t kiMvdInterTableStride = 1 + (kiMvdInterTableSize << 1); // entries per QP row: 2 * size + 1
+  uint16_t* pMvdCostTableInter		= &pEncCtx->pMvdCostTableInter[kiMvdInterTableSize]; // offset by size - presumably the centre of row 0; TODO confirm
+  const int32_t kiSliceIdx				= pSlice->uiSliceIdx;
+  const int32_t kiPartitionId			= (kiSliceIdx % pEncCtx->iActiveThreadsNum); // index into the per-partition arrays of pCurLayer
+  const uint8_t kuiChromaQpIndexOffset = pCurLayer->sLayerInfo.pPpsP->uiChromaQpIndexOffset;
+
+  SDynamicSlicingStack sDss;
+  sDss.iStartPos = BsGetBitsPos (pBs); // bit position at slice start; the boundary judge measures the slice size from here
+  for (;;) {
+    //point to current pMb
+    iCurMbIdx	= iNextMbIdx;
+    pCurMb = &pMbList[ iCurMbIdx ];
+
+    //step(1): set QP for the current MB
+    pEncCtx->pFuncList->pfRc.pfWelsRcMbInit (pEncCtx, pCurMb, pSlice);
+    // if already reaches the largest number of slices, set QPs to the upper bound
+    if (pSlice->bDynamicSlicingSliceSizeCtrlFlag) {
+      //a clearer logic may be:
+      //if there is no need from size control from the pSlice size, the QP will be decided by RC; else it will be set to the max QP
+      //    however, there are some parameter updating in the rc_mb_init() function, so it cannot be skipped?
+      pCurMb->uiLumaQp = pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId].iMaxQp;
+      pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51 (pCurMb->uiLumaQp + kuiChromaQpIndexOffset)];
+    }
+
+    //step (2). save some values for future use, initialise pWelsMd (lambda and MVD cost row selected by the luma QP)
+    pMd->iLambda = g_kiQpCostTable[pCurMb->uiLumaQp];
+    pMd->pMvdCost = &pMvdCostTableInter[pCurMb->uiLumaQp * kiMvdInterTableStride];
+
+    WelsMdIntraInit (pEncCtx, pCurMb, pMbCache, kiSliceFirstMbXY);
+    WelsMdInterInit (pEncCtx, pSlice, pCurMb, kiSliceFirstMbXY);
+    pEncCtx->pFuncList->pfInterMd (pEncCtx, pMd, pSlice, pCurMb, pMbCache);
+    //mb_qp
+
+    //step (4): save results of the MD process for future use
+    WelsMdInterSaveSadAndRefMbType ((pCurLayer->pDecPic->uiRefMbType), pMbCache, pCurMb, pMd);
+
+    pEncCtx->pFuncList->pfInterMdBackgroundInfoUpdate (pCurLayer, pCurMb, pMbCache->bCollocatedPredFlag,
+        pEncCtx->pRefPic->iPictureType);
+
+    //step (5): update cache
+    UpdateNonZeroCountCache (pCurMb, pMbCache);
+
+    //step (6): begin to write bit stream; if the pSlice size is controlled, the writing may be skipped
+
+    //DYNAMIC_SLICING_ONE_THREAD - MultiD
+    // save the bit-writer state so the bits of this MB can be discarded if the MB has to move to the next slice
+    sDss.pBsStackBufPtr	= pBs->pBufPtr;
+    sDss.uiBsStackCurBits	= pBs->uiCurBits;
+    sDss.iBsStackLeftBits	= pBs->iLeftBits;
+    // save the P-skip run as well: it is changed below and must be restored on step-back
+    sDss.iMbSkipRunStack = iMbSkipRun;
+    //DYNAMIC_SLICING_ONE_THREAD - MultiD
+
+    if (IS_SKIP (pCurMb->uiMbType)) {
+      pCurMb->uiLumaQp	= pSlice->uiLastMbQp;
+      pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51 (pCurMb->uiLumaQp + kuiChromaQpIndexOffset)];
+
+      iMbSkipRun++;
+    } else {
+      BsWriteUE (pBs, iMbSkipRun); // emit the pending skip-run count before the coded MB
+      iMbSkipRun = 0;
+      WelsSpatialWriteMbSyn (pEncCtx, pSlice, pCurMb);
+    }
+
+    //DYNAMIC_SLICING_ONE_THREAD - MultiD
+    sDss.iCurrentPos = BsGetBitsPos (pBs);
+    if (DynSlcJudgeSliceBoundaryStepBack (pEncCtx, pSlice, pSliceCtx, pCurMb, &sDss)) {
+      // restore the saved bit-writer state: drops the bits just written for this MB
+      pBs->pBufPtr		= sDss.pBsStackBufPtr;
+      pBs->uiCurBits	= sDss.uiBsStackCurBits;
+      pBs->iLeftBits	= sDss.iBsStackLeftBits;
+
+      iMbSkipRun = sDss.iMbSkipRunStack;
+
+      pCurLayer->pLastCodedMbIdxOfPartition[kiPartitionId] = iCurMbIdx -
+          1;	// update pLastCodedMbIdxOfPartition, need to -1 due to stepping back
+      ++ pCurLayer->pNumSliceCodedOfPartition[kiPartitionId];
+
+      break;
+    }
+
+    //step (7): reconstruct current MB
+    pCurMb->uiSliceIdc = kiSliceIdx;
+    OutputPMbWithoutConstructCsRsNoCopy (pEncCtx, pCurLayer, pSlice, pCurMb);
+
+#if defined(MB_TYPES_CHECK)
+    WelsCountMbType (pEncCtx->sPerInfo.iMbCount, P_SLICE, pCurMb);
+#endif//MB_TYPES_CHECK
+
+    //step (8): update status and other parameters
+    pEncCtx->pFuncList->pfRc.pfWelsRcMbInfoUpdate (pEncCtx, pCurMb, pMd->iCostLuma, pSlice);
+
+    /*judge if all pMb in cur pSlice has been encoded*/
+    ++ iNumMbCoded;
+    iNextMbIdx = WelsGetNextMbOfSlice (pSliceCtx, iCurMbIdx);
+    // stop when the slice has no further MB (-1), the index leaves the picture, or every MB of the picture was coded
+    if (iNextMbIdx == -1 || iNextMbIdx >= kiTotalNumMb || iNumMbCoded >= kiTotalNumMb) {
+      pCurLayer->pLastCodedMbIdxOfPartition[kiPartitionId] =
+        iCurMbIdx;	// update pLastCodedMbIdxOfPartition, finish coding, use iCurMbIdx directly
+      break;
+    }
+  }
+
+  if (iMbSkipRun) { // pending skip run (restored value after a step-back): write it before the slice is closed
+    BsWriteUE (pBs, iMbSkipRun);
+  }
+}
+
+}//namespace WelsSVCEnc
--- a/codec/encoder/core/src/svc_mode_decision.cpp
+++ b/codec/encoder/core/src/svc_mode_decision.cpp
@@ -1,189 +1,178 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	svc_mode_decision.c
- *
- * \brief	SVC Spatial Enhancement Layer MD
- *
- * \date	2009.7.29
- *
-		  
- **************************************************************************************
- */
-#include <assert.h>
-#include <string.h>
-#include "decode_mb_aux.h"
-#include "svc_enc_golomb.h"
-#include "ls_defines.h"
-#include "md.h"
-#include "mv_pred.h"
-#include "sample.h"
-#include "svc_base_layer_md.h"
-#include "svc_encode_mb.h"
-#include "svc_encode_slice.h"
-#include "mb_cache.h"
-
-#include "svc_mode_decision.h"
-#include "svc_motion_estimate.h"
-
-#include "svc_set_mb_syn_cavlc.h"
-#include "cpu_core.h"
-#include "encode_mb_aux.h"
-#include "utils.h"
-namespace WelsSVCEnc {
-
-//
-// md in enhancement layer
-///
-void WelsMdSpatialelInterMbIlfmdNoilp(	sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice *pSlice,
-										    SMB* pCurMb, const Mb_Type kuiRefMbType)
-{
-	SDqLayer* pCurDqLayer = pEncCtx->pCurDqLayer;
-	SMbCache *pMbCache = &pSlice->sMbCacheInfo;
-
-	const uint32_t kuiNeighborAvail = pCurMb->uiNeighborAvail;
-	const int32_t kiMbWidth = pCurDqLayer->iMbWidth;
-	const  SMB* kpTopMb = pCurMb-kiMbWidth;
-	const bool_t kbMbLeftAvailPskip	= ((kuiNeighborAvail&LEFT_MB_POS) ? IS_SKIP((pCurMb-1)->uiMbType) : false );
-	const bool_t kbMbTopAvailPskip			= ((kuiNeighborAvail&TOP_MB_POS) ? IS_SKIP(kpTopMb->uiMbType) : false );
-	const bool_t kbMbTopLeftAvailPskip		= ((kuiNeighborAvail&TOPLEFT_MB_POS) ? IS_SKIP((kpTopMb -1)->uiMbType) : false );
-	const bool_t kbMbTopRightAvailPskip	= ((kuiNeighborAvail&TOPRIGHT_MB_POS) ? IS_SKIP((kpTopMb +1)->uiMbType) : false );
-
-	BOOL_T bTrySkip  = kbMbLeftAvailPskip|kbMbTopAvailPskip|kbMbTopLeftAvailPskip|kbMbTopRightAvailPskip;
-	BOOL_T bKeepSkip = kbMbLeftAvailPskip&kbMbTopAvailPskip&kbMbTopRightAvailPskip;
-	BOOL_T bSkip = FALSE;
-
-	if ( pEncCtx->pFuncList->pfInterMdBackgroundDecision( pEncCtx, pWelsMd, pSlice, pCurMb, pMbCache, &bKeepSkip ) )
-	{
-		return;
-	}
-
-	//step 1: try SKIP
-	bSkip = WelsMdInterJudgePskip( pEncCtx, pWelsMd, pSlice, pCurMb, pMbCache, bTrySkip ); 
-
-	if (  bSkip && bKeepSkip )
-	{
-		WelsMdInterDecidedPskip(pEncCtx,  pSlice,  pCurMb, pMbCache);
-		return;
-	}
-
-	if ( ! IS_SVC_INTRA(kuiRefMbType) )
-	{
-		if ( !bSkip )
-		{	
-			PredictSad( pMbCache->sMvComponents.iRefIndexCache, pMbCache->iSadCost, 0, &pWelsMd->iSadPredMb );
-			
-			//step 2: P_16x16
-			pWelsMd->iCostLuma = WelsMdP16x16(pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice, pCurMb);
-			pCurMb->uiMbType = MB_TYPE_16x16;
-		}
-		
-		WelsMdInterSecondaryModesEnc( pEncCtx, pWelsMd, pSlice, pCurMb, pMbCache, bSkip );
-	}
-	else //BLMODE == SVC_INTRA
-	{
-		//initial prediction memory for I_16x16
-		const int32_t kiCostI16x16 = WelsMdI16x16(pEncCtx->pFuncList, pEncCtx->pCurDqLayer, pMbCache, pWelsMd->iLambda);
-		if ( bSkip && (pWelsMd->iCostLuma <= kiCostI16x16) )
-		{
-			WelsMdInterDecidedPskip(pEncCtx,  pSlice,  pCurMb, pMbCache);
-		}
-		else
-		{
-			pWelsMd->iCostLuma = kiCostI16x16;		
-			pCurMb->uiMbType = MB_TYPE_INTRA16x16;
-			
-			WelsMdIntraSecondaryModesEnc( pEncCtx, pWelsMd, pCurMb, pMbCache );
-		}			
-	}		
-}
-
-
-
-void WelsMdInterMbEnhancelayer( void* pEnc, void* pMd, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache )
-{
-	sWelsEncCtx *pEncCtx	= (sWelsEncCtx*)pEnc;
-	SDqLayer *pCurLayer				= pEncCtx->pCurDqLayer;
-	SWelsMD *pWelsMd					= (SWelsMD*)pMd;
-	const SMB* kpInterLayerRefMb		= GetRefMb( pCurLayer, pCurMb );
-	const Mb_Type kuiInterLayerRefMbType	= kpInterLayerRefMb->uiMbType;
-
-	SetMvBaseEnhancelayer( pWelsMd, pCurMb, kpInterLayerRefMb );// initial sMvBase here only when pRef mb type is inter, if not sMvBase will be not used! 	
-	//step (3): do the MD process
-	WelsMdSpatialelInterMbIlfmdNoilp(pEncCtx, pWelsMd, pSlice, pCurMb, kuiInterLayerRefMbType);//MD process
-}
-
-//////////////////////////
-//
-//SUPPORTING FUNCS
-//
-//////////////////////////
-
-///////////////////////
-// do initiation for noILP (needed by ILFMD)
-////////////////////////
-
-SMB* GetRefMb( SDqLayer *pCurLayer, SMB *pCurMb )
-{
-    const SDqLayer  *kpRefLayer		= pCurLayer->pRefLayer;
-	const int32_t  kiRefMbIdx = (pCurMb->iMbY>>1) * kpRefLayer->iMbWidth + (pCurMb->iMbX>>1);//because current lower layer is half size on both vertical and horizontal
-	return (&kpRefLayer->sMbDataP[kiRefMbIdx]);    
-}
-
-void SetMvBaseEnhancelayer( SWelsMD* pMd, SMB *pCurMb, const SMB *kpRefMb )
-{
-	const Mb_Type kuiRefMbType = kpRefMb->uiMbType;
-
-	if ( ! IS_SVC_INTRA( kuiRefMbType ))
-	{
-        SMVUnitXY sMv;
-        int32_t iRefMbPartIdx = ((pCurMb->iMbY&0x01)<<1) + (pCurMb->iMbX&0x01); //may be need modified
-        int32_t iScan4RefPartIdx = g_kuiMbCountScan4Idx[(iRefMbPartIdx<<2)];	
-        sMv.iMvX = kpRefMb->sMv[iScan4RefPartIdx].iMvX << 1;
-        sMv.iMvY = kpRefMb->sMv[iScan4RefPartIdx].iMvY << 1;
-
-		pMd->sMe.sMe16x16.sMvBase = sMv;
-
-		pMd->sMe.sMe8x8[0].sMvBase =
-		pMd->sMe.sMe8x8[1].sMvBase =
-		pMd->sMe.sMe8x8[2].sMvBase =
-		pMd->sMe.sMe8x8[3].sMvBase = sMv;
-        
- 		pMd->sMe.sMe16x8[0].sMvBase =
- 		pMd->sMe.sMe16x8[1].sMvBase =
-		pMd->sMe.sMe8x16[0].sMvBase =
- 		pMd->sMe.sMe8x16[1].sMvBase = sMv; 				
-	}
-}
-
-
-
-} // namespace WelsSVCEnc
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	svc_mode_decision.c
+ *
+ * \brief	SVC Spatial Enhancement Layer MD
+ *
+ * \date	2009.7.29
+ *
+
+ **************************************************************************************
+ */
+#include <assert.h>
+#include <string.h>
+#include "decode_mb_aux.h"
+#include "svc_enc_golomb.h"
+#include "ls_defines.h"
+#include "md.h"
+#include "mv_pred.h"
+#include "sample.h"
+#include "svc_base_layer_md.h"
+#include "svc_encode_mb.h"
+#include "svc_encode_slice.h"
+#include "mb_cache.h"
+
+#include "svc_mode_decision.h"
+#include "svc_motion_estimate.h"
+
+#include "svc_set_mb_syn_cavlc.h"
+#include "cpu_core.h"
+#include "encode_mb_aux.h"
+#include "utils.h"
+namespace WelsSVCEnc {
+
+// Mode decision for one inter MB of a spatial enhancement layer. Returns early when pfInterMdBackgroundDecision
+// returns true or when P-skip is decided; otherwise tries P_16x16 + secondary inter modes if kuiRefMbType is not
+// SVC-intra, else I_16x16 (or P-skip when P-skip was judged and is not more costly) + secondary intra modes.
+void WelsMdSpatialelInterMbIlfmdNoilp (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice* pSlice,
+                                       SMB* pCurMb, const Mb_Type kuiRefMbType) {
+  SDqLayer* pCurDqLayer = pEncCtx->pCurDqLayer;
+  SMbCache* pMbCache = &pSlice->sMbCacheInfo;
+
+  const uint32_t kuiNeighborAvail = pCurMb->uiNeighborAvail;
+  const int32_t kiMbWidth = pCurDqLayer->iMbWidth;
+  const  SMB* kpTopMb = pCurMb - kiMbWidth;
+  const bool_t kbMbLeftAvailPskip	= ((kuiNeighborAvail & LEFT_MB_POS) ? IS_SKIP ((pCurMb - 1)->uiMbType) : false);
+  const bool_t kbMbTopAvailPskip			= ((kuiNeighborAvail & TOP_MB_POS) ? IS_SKIP (kpTopMb->uiMbType) : false);
+  const bool_t kbMbTopLeftAvailPskip		= ((kuiNeighborAvail & TOPLEFT_MB_POS) ? IS_SKIP ((kpTopMb - 1)->uiMbType) : false);
+  const bool_t kbMbTopRightAvailPskip	= ((kuiNeighborAvail & TOPRIGHT_MB_POS) ? IS_SKIP ((
+      kpTopMb + 1)->uiMbType) : false);
+
+  BOOL_T bTrySkip  = kbMbLeftAvailPskip | kbMbTopAvailPskip | kbMbTopLeftAvailPskip | kbMbTopRightAvailPskip; // any available neighbor is skip
+  BOOL_T bKeepSkip = kbMbLeftAvailPskip & kbMbTopAvailPskip & kbMbTopRightAvailPskip; // left, top and top-right are all skip (top-left not considered)
+  BOOL_T bSkip = FALSE;
+
+  if (pEncCtx->pFuncList->pfInterMdBackgroundDecision (pEncCtx, pWelsMd, pSlice, pCurMb, pMbCache, &bKeepSkip)) {
+    return;
+  }
+
+  //step 1: try SKIP
+  bSkip = WelsMdInterJudgePskip (pEncCtx, pWelsMd, pSlice, pCurMb, pMbCache, bTrySkip);
+
+  if (bSkip && bKeepSkip) {
+    WelsMdInterDecidedPskip (pEncCtx,  pSlice,  pCurMb, pMbCache);
+    return;
+  }
+
+  if (! IS_SVC_INTRA (kuiRefMbType)) {
+    if (!bSkip) {
+      PredictSad (pMbCache->sMvComponents.iRefIndexCache, pMbCache->iSadCost, 0, &pWelsMd->iSadPredMb);
+
+      //step 2: P_16x16
+      pWelsMd->iCostLuma = WelsMdP16x16 (pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice, pCurMb);
+      pCurMb->uiMbType = MB_TYPE_16x16;
+    }
+
+    WelsMdInterSecondaryModesEnc (pEncCtx, pWelsMd, pSlice, pCurMb, pMbCache, bSkip);
+  } else { //BLMODE == SVC_INTRA
+    //initial prediction memory for I_16x16
+    const int32_t kiCostI16x16 = WelsMdI16x16 (pEncCtx->pFuncList, pEncCtx->pCurDqLayer, pMbCache, pWelsMd->iLambda);
+    if (bSkip && (pWelsMd->iCostLuma <= kiCostI16x16)) {
+      WelsMdInterDecidedPskip (pEncCtx,  pSlice,  pCurMb, pMbCache);
+    } else {
+      pWelsMd->iCostLuma = kiCostI16x16;
+      pCurMb->uiMbType = MB_TYPE_INTRA16x16;
+
+      WelsMdIntraSecondaryModesEnc (pEncCtx, pWelsMd, pCurMb, pMbCache);
+    }
+  }
+}
+
+
+// Enhancement-layer inter MB entry: casts pEnc/pMd, fetches the inter-layer reference MB, seeds sMvBase, then calls WelsMdSpatialelInterMbIlfmdNoilp. pMbCache is not referenced in this body.
+void WelsMdInterMbEnhancelayer (void* pEnc, void* pMd, SSlice* pSlice, SMB* pCurMb, SMbCache* pMbCache) {
+  sWelsEncCtx* pEncCtx	= (sWelsEncCtx*)pEnc;
+  SDqLayer* pCurLayer				= pEncCtx->pCurDqLayer;
+  SWelsMD* pWelsMd					= (SWelsMD*)pMd;
+  const SMB* kpInterLayerRefMb		= GetRefMb (pCurLayer, pCurMb);
+  const Mb_Type kuiInterLayerRefMbType	= kpInterLayerRefMb->uiMbType;
+
+  SetMvBaseEnhancelayer (pWelsMd, pCurMb,
+                         kpInterLayerRefMb); // sMvBase is initialized only when the reference MB type is inter; otherwise sMvBase is not used
+  //step (3): do the MD process
+  WelsMdSpatialelInterMbIlfmdNoilp (pEncCtx, pWelsMd, pSlice, pCurMb, kuiInterLayerRefMbType); //MD process
+}
+
+//////////////////////////
+//
+//SUPPORTING FUNCS
+//
+//////////////////////////
+
+///////////////////////
+// do initiation for noILP (needed by ILFMD)
+////////////////////////
+// Returns a pointer to the MB at (iMbX >> 1, iMbY >> 1) in pCurLayer->pRefLayer's sMbDataP array; pRefLayer is dereferenced without a null check.
+SMB* GetRefMb (SDqLayer* pCurLayer, SMB* pCurMb) {
+  const SDqLayer*  kpRefLayer		= pCurLayer->pRefLayer;
+  const int32_t  kiRefMbIdx = (pCurMb->iMbY >> 1) * kpRefLayer->iMbWidth + (pCurMb->iMbX >>
+                              1); // the reference (lower) layer is half the size both vertically and horizontally
+  return (&kpRefLayer->sMbDataP[kiRefMbIdx]);
+}
+// When kpRefMb is not SVC-intra: picks one MV of kpRefMb, doubles both components, and stores it as sMvBase of every 16x16/8x8/16x8/8x16 ME entry in pMd. Does nothing otherwise.
+void SetMvBaseEnhancelayer (SWelsMD* pMd, SMB* pCurMb, const SMB* kpRefMb) {
+  const Mb_Type kuiRefMbType = kpRefMb->uiMbType;
+
+  if (! IS_SVC_INTRA (kuiRefMbType)) {
+    SMVUnitXY sMv;
+    int32_t iRefMbPartIdx = ((pCurMb->iMbY & 0x01) << 1) + (pCurMb->iMbX & 0x01); // 0..3 from the parity of iMbY/iMbX; original note: may need to be modified
+    int32_t iScan4RefPartIdx = g_kuiMbCountScan4Idx[ (iRefMbPartIdx << 2)]; // presumably the first 4x4 block of that 8x8 quadrant -- TODO confirm against the table
+    sMv.iMvX = kpRefMb->sMv[iScan4RefPartIdx].iMvX << 1;
+    sMv.iMvY = kpRefMb->sMv[iScan4RefPartIdx].iMvY << 1;
+
+    pMd->sMe.sMe16x16.sMvBase = sMv;
+
+    pMd->sMe.sMe8x8[0].sMvBase =
+      pMd->sMe.sMe8x8[1].sMvBase =
+        pMd->sMe.sMe8x8[2].sMvBase =
+          pMd->sMe.sMe8x8[3].sMvBase = sMv;
+
+    pMd->sMe.sMe16x8[0].sMvBase =
+      pMd->sMe.sMe16x8[1].sMvBase =
+        pMd->sMe.sMe8x16[0].sMvBase =
+          pMd->sMe.sMe8x16[1].sMvBase = sMv;
+  }
+}
+
+
+
+} // namespace WelsSVCEnc
--- a/codec/encoder/core/src/svc_motion_estimate.cpp
+++ b/codec/encoder/core/src/svc_motion_estimate.cpp
@@ -1,253 +1,242 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	svc motion estimate.c
- *
- * \brief	Interfaces introduced in svc mb motion estimation
- *
- * \date	08/11/2009 Created
- *
- *************************************************************************************
- */
-
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "svc_motion_estimate.h"
-#include "svc_enc_golomb.h"
-#include "macros.h"
-#include "sample.h"
-#include "array_stack_align.h"
-#include "cpu_core.h"	// WELS_CPU_SSE41
-
-namespace WelsSVCEnc {	
-/*!
- * \brief	BL mb motion estimate search
- *
- * \param	enc			Wels encoder context
- * \param	pMe	        Wels me information
- *
- * \return	NONE
- */
-
-void WelsMotionEstimateSearchSatd (SWelsFuncPtrList *pFuncList, void* pLplayer, void* pLpme, void* pLpslice)
-{
-	SDqLayer* pCurDqLayer			= (SDqLayer *)pLplayer;
-	SWelsME* pMe						= (SWelsME *)pLpme;
-	SSlice* pSlice					= (SSlice *)pLpslice;
-	int32_t iStrideEnc = pCurDqLayer->iEncStride[0];
-	int32_t iStrideRef = pCurDqLayer->pRefPic->iLineSize[0];
-
-	//  Step 1: Initial point prediction
-	WelsMotionEstimateInitialPoint ( pFuncList, pMe, pSlice, iStrideEnc, iStrideRef );
-
-	pMe->uSadPredISatd.uiSatd = pFuncList->sSampleDealingFuncs.pfSampleSatd[pMe->uiPixel]( pMe->pEncMb, iStrideEnc, pMe->pRefMb, iStrideRef );
-	pMe->uiSatdCost = pMe->uSadPredISatd.uiSatd + COST_MVD(pMe->pMvdCost, pMe->sMv.iMvX - pMe->sMvp.iMvX, pMe->sMv.iMvY - pMe->sMvp.iMvY);	
-}
-
-
-void WelsMotionEstimateSearchSad (SWelsFuncPtrList *pFuncList, void* pLplayer, void* pLpme, void* pLpslice)
-{
-	SDqLayer* pCurDqLayer			= (SDqLayer *)pLplayer;
-	SWelsME* pMe						= (SWelsME *)pLpme;
-	SSlice* slice					= (SSlice *)pLpslice;
-	int32_t iStrideEnc			= pCurDqLayer->iEncStride[0];
-	int32_t iStrideRef			= pCurDqLayer->pRefPic->iLineSize[0];
-
-	//  Step 1: Initial point prediction
-	WelsMotionEstimateInitialPoint ( pFuncList, pMe, slice, iStrideEnc, iStrideRef );
-}
-
-/*!
- * \brief	EL mb motion estimate initial point testing
- *
- * \param	pix_pFuncList	SSampleDealingFunc
- * \param	pMe	        Wels me information
- * \param	mv_range	search range in motion estimate
- * \param	point	    the best match point in motion estimation
- *
- * \return	NONE
- */
-void WelsMotionEstimateInitialPoint(SWelsFuncPtrList *pFuncList, SWelsME * pMe, SSlice *pSlice, int32_t iStrideEnc, int32_t iStrideRef )
-{   
-	PSampleSadSatdCostFunc pSad		= pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiPixel];
-	const uint16_t *kpMvdCost	= pMe->pMvdCost;
-	uint8_t* const kpEncMb		= pMe->pEncMb;	
-	int16_t iMvc0, iMvc1;
-	int32_t iSadCost;
-	int32_t iBestSadCost;
-	uint8_t *pRefMb;
-	uint8_t *pFref2;
-	uint32_t i;
-	const uint32_t kuiMvcNum		= pSlice->uiMvcNum;
-	const SMVUnitXY *kpMvcList	= &pSlice->sMvc[0];
-	const SMVUnitXY ksMvMin		= pSlice->sMvMin;
-	const SMVUnitXY ksMvMax		= pSlice->sMvMax;
-	const SMVUnitXY ksMvp		= pMe->sMvp;
-	SMVUnitXY sMv;
-	
-	//  Step 1: Initial point prediction
-    // init with sMvp
-	sMv.iMvX	= WELS_CLIP3( (2 + ksMvp.iMvX) >> 2, ksMvMin.iMvX, ksMvMax.iMvX );
-	sMv.iMvY	= WELS_CLIP3( (2 + ksMvp.iMvY) >> 2, ksMvMin.iMvY, ksMvMax.iMvY );
-
-    pRefMb = &pMe->pRefMb[sMv.iMvY * iStrideRef + sMv.iMvX];
-	
-	iBestSadCost = pSad( kpEncMb, iStrideEnc, pRefMb, iStrideRef );
-    iBestSadCost += COST_MVD(kpMvdCost, ((sMv.iMvX)<<2) - ksMvp.iMvX, ((sMv.iMvY)<<2) - ksMvp.iMvY);
-	
-	for (i = 0; i < kuiMvcNum; i++)
-	{
-		//clipping here is essential since some pOut-of-range MVC may happen here (i.e., refer to baseMV)
-		iMvc0 = WELS_CLIP3( ( 2 + kpMvcList[i].iMvX ) >> 2, ksMvMin.iMvX, ksMvMax.iMvX );
-		iMvc1 = WELS_CLIP3( ( 2 + kpMvcList[i].iMvY ) >> 2, ksMvMin.iMvY, ksMvMax.iMvY );
-		
-		if( ((iMvc0-sMv.iMvX) || (iMvc1-sMv.iMvY)) )
-		{
-			pFref2 = &pMe->pRefMb[iMvc1*iStrideRef+iMvc0];
-
-			iSadCost = pSad( kpEncMb, iStrideEnc, pFref2, iStrideRef ) +
-				COST_MVD(kpMvdCost, (iMvc0<<2) - ksMvp.iMvX, (iMvc1<<2) - ksMvp.iMvY);		
-			
-			if( iSadCost < iBestSadCost )
-			{
-				sMv.iMvX = iMvc0;
-				sMv.iMvY = iMvc1;				
-				pRefMb = pFref2;				
-				iBestSadCost = iSadCost;				
-			}
-		}
-	}
-
-	pMe->sMv = sMv;
-	pMe->uiSadCost = iBestSadCost;
-	if ( iBestSadCost < pMe->uSadPredISatd.uiSadPred )
-	{
-        	//  Step 2: Initial early Stop	
-		/* -> qpel mv */
-		pMe->sMv.iMvX <<= 2;
-		pMe->sMv.iMvY <<= 2;		
-		/* -> pRef */
-		pMe->pRefMb = pRefMb;
-		/* compute the real cost */  
- 		pMe->uiSatdCost = iBestSadCost;
-	}
-    else
-    {
-        //  Step 3: Fast search pattern
-        WelsMotionEstimateIterativeSearch ( pFuncList, pMe, iStrideEnc, iStrideRef, pRefMb );
-    }
-}
-
-bool_t WelsMeSadCostSelect( int32_t *iSadCost, const uint16_t *kpMvdCost, int32_t *pBestCost, const int32_t kiDx, const int32_t kiDy, int32_t *pIx, int32_t *pIy)
-{
-	int32_t iTempSadCost[4];
-	int32_t iInputSadCost=*pBestCost;
-	iTempSadCost[0] = iSadCost[0]+COST_MVD(kpMvdCost, kiDx, kiDy - 4);
-	iTempSadCost[1] = iSadCost[1]+COST_MVD(kpMvdCost, kiDx, kiDy + 4);
-	iTempSadCost[2] = iSadCost[2]+COST_MVD(kpMvdCost, kiDx - 4, kiDy);
-	iTempSadCost[3] = iSadCost[3]+COST_MVD(kpMvdCost, kiDx + 4, kiDy);
-
-	if (iTempSadCost[0]<*pBestCost)
-	{
-		*pBestCost = iTempSadCost[0];
-		*pIx = 0;
-		*pIy = 1;
-	}
-
-	if (iTempSadCost[1]<*pBestCost)
-	{
-		*pBestCost = iTempSadCost[1];
-		*pIx = 0;
-		*pIy = -1;
-	}
-
-	if (iTempSadCost[2]<*pBestCost)
-	{
-		*pBestCost = iTempSadCost[2];
-		*pIx = 1;
-		*pIy = 0;
-	}
-
-	if (iTempSadCost[3]<*pBestCost)
-	{
-		*pBestCost = iTempSadCost[3];
-		*pIx = -1;
-		*pIy = 0;
-	}
-
-
-	return (*pBestCost==iInputSadCost);
-}
-
-void WelsMotionEstimateIterativeSearch( SWelsFuncPtrList *pFuncList, SWelsME *pMe, const int32_t kiStrideEnc, const int32_t kiStrideRef, uint8_t *pFref )
-{
-	PSample4SadCostFunc			pSad					=  pFuncList->sSampleDealingFuncs.pfSample4Sad[pMe->uiPixel];
-
-	uint8_t* const kpEncMb = pMe->pEncMb;
-	const uint16_t *kpMvdCost = pMe->pMvdCost;
-
-	int32_t iMvDx = ((pMe->sMv.iMvX)<<2) - pMe->sMvp.iMvX;
-	int32_t iMvDy = ((pMe->sMv.iMvY)<<2) - pMe->sMvp.iMvY;
-
-	uint8_t *pRefMb = pFref;
-	int32_t iBestCost = (pMe->uiSadCost);
-
-	int32_t iTimeThreshold = ITERATIVE_TIMES;
-	ENFORCE_STACK_ALIGN_1D(int32_t, iSadCosts, 4, 16)	
-
-	while(iTimeThreshold--)
-	{
-		pSad( kpEncMb,kiStrideEnc,pRefMb,kiStrideRef,&iSadCosts[0] );
-
-		int32_t iX,iY;
-
-		const bool_t kbIsBestCostWorse = WelsMeSadCostSelect( iSadCosts, kpMvdCost, &iBestCost,iMvDx, iMvDy,&iX,&iY );
-		if (kbIsBestCostWorse)
-			break;
-
-		iMvDx -= iX<<2 ;	
-		iMvDy -= iY<<2 ;
-
-		pRefMb -= (iX+iY*kiStrideRef);
-
-	}
-
-    /* -> qpel mv */
-	pMe->sMv.iMvX = (iMvDx + pMe->sMvp.iMvX) & 0xFFFC;
-	pMe->sMv.iMvY = (iMvDy + pMe->sMvp.iMvY) & 0xFFFC;
-	pMe->uiSatdCost = pMe->uiSadCost = (iBestCost);
-	pMe->pRefMb = pRefMb;
-}
-
-} // namespace WelsSVCEnc
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	svc_motion_estimate.cpp
+ *
+ * \brief	Interfaces introduced in svc mb motion estimation
+ *
+ * \date	08/11/2009 Created
+ *
+ *************************************************************************************
+ */
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "svc_motion_estimate.h"
+#include "svc_enc_golomb.h"
+#include "macros.h"
+#include "sample.h"
+#include "array_stack_align.h"
+#include "cpu_core.h"	// WELS_CPU_SSE41
+
+namespace WelsSVCEnc {
+/*!
+ * \brief	BL mb motion estimate search: runs the initial-point search, then recomputes the cost with SATD
+ *
+ * \param	pFuncList	function list; pLplayer, pLpme, pLpslice are cast to SDqLayer, SWelsME and SSlice pointers
+ * \param	pLpme	        Wels me information (uSadPredISatd.uiSatd and uiSatdCost are written here)
+ *
+ * \return	NONE
+ */
+
+void WelsMotionEstimateSearchSatd (SWelsFuncPtrList* pFuncList, void* pLplayer, void* pLpme, void* pLpslice) {
+  SDqLayer* pCurDqLayer			= (SDqLayer*)pLplayer;
+  SWelsME* pMe						= (SWelsME*)pLpme;
+  SSlice* pSlice					= (SSlice*)pLpslice;
+  int32_t iStrideEnc = pCurDqLayer->iEncStride[0];
+  int32_t iStrideRef = pCurDqLayer->pRefPic->iLineSize[0];
+
+  //  Step 1: Initial point prediction
+  WelsMotionEstimateInitialPoint (pFuncList, pMe, pSlice, iStrideEnc, iStrideRef);
+
+  pMe->uSadPredISatd.uiSatd = pFuncList->sSampleDealingFuncs.pfSampleSatd[pMe->uiPixel] (pMe->pEncMb, iStrideEnc,
+                              pMe->pRefMb, iStrideRef);
+  pMe->uiSatdCost = pMe->uSadPredISatd.uiSatd + COST_MVD (pMe->pMvdCost, pMe->sMv.iMvX - pMe->sMvp.iMvX,
+                    pMe->sMv.iMvY - pMe->sMvp.iMvY);
+}
+
+// SAD-only variant: casts the void* arguments and runs WelsMotionEstimateInitialPoint; results stay in pMe (no SATD recomputation).
+void WelsMotionEstimateSearchSad (SWelsFuncPtrList* pFuncList, void* pLplayer, void* pLpme, void* pLpslice) {
+  SDqLayer* pCurDqLayer			= (SDqLayer*)pLplayer;
+  SWelsME* pMe						= (SWelsME*)pLpme;
+  SSlice* slice					= (SSlice*)pLpslice;
+  int32_t iStrideEnc			= pCurDqLayer->iEncStride[0];
+  int32_t iStrideRef			= pCurDqLayer->pRefPic->iLineSize[0];
+
+  //  Step 1: Initial point prediction
+  WelsMotionEstimateInitialPoint (pFuncList, pMe, slice, iStrideEnc, iStrideRef);
+}
+
+/*!
+ * \brief	EL mb motion estimate initial point testing
+ *
+ * \param	pFuncList	function list; pfSampleSad[pMe->uiPixel] is used as the SAD function
+ * \param	pMe	        Wels me information (reads sMvp, pEncMb, pRefMb; writes sMv, costs, pRefMb)
+ * \param	pSlice	    supplies the MV candidates (sMvc, uiMvcNum) and the clip range (sMvMin, sMvMax)
+ * \param	iStrideEnc, iStrideRef	strides passed to the SAD function for pEncMb and the reference block
+ *
+ * \return	NONE
+ */
+void WelsMotionEstimateInitialPoint (SWelsFuncPtrList* pFuncList, SWelsME* pMe, SSlice* pSlice, int32_t iStrideEnc,
+                                     int32_t iStrideRef) {
+  PSampleSadSatdCostFunc pSad		= pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiPixel];
+  const uint16_t* kpMvdCost	= pMe->pMvdCost;
+  uint8_t* const kpEncMb		= pMe->pEncMb;
+  int16_t iMvc0, iMvc1;
+  int32_t iSadCost;
+  int32_t iBestSadCost;
+  uint8_t* pRefMb;
+  uint8_t* pFref2;
+  uint32_t i;
+  const uint32_t kuiMvcNum		= pSlice->uiMvcNum;
+  const SMVUnitXY* kpMvcList	= &pSlice->sMvc[0];
+  const SMVUnitXY ksMvMin		= pSlice->sMvMin;
+  const SMVUnitXY ksMvMax		= pSlice->sMvMax;
+  const SMVUnitXY ksMvp		= pMe->sMvp;
+  SMVUnitXY sMv;
+
+  //  Step 1: Initial point prediction
+  // init with sMvp: (2 + v) >> 2 rounds the quarter-pel predictor to integer-pel, then clip to [sMvMin, sMvMax]
+  sMv.iMvX	= WELS_CLIP3 ((2 + ksMvp.iMvX) >> 2, ksMvMin.iMvX, ksMvMax.iMvX);
+  sMv.iMvY	= WELS_CLIP3 ((2 + ksMvp.iMvY) >> 2, ksMvMin.iMvY, ksMvMax.iMvY);
+
+  pRefMb = &pMe->pRefMb[sMv.iMvY * iStrideRef + sMv.iMvX];
+
+  iBestSadCost = pSad (kpEncMb, iStrideEnc, pRefMb, iStrideRef);
+  iBestSadCost += COST_MVD (kpMvdCost, ((sMv.iMvX) << 2) - ksMvp.iMvX, ((sMv.iMvY) << 2) - ksMvp.iMvY);
+
+  for (i = 0; i < kuiMvcNum; i++) {
+    // clipping here is essential since some out-of-range MVC may occur here (e.g., one derived from the base MV)
+    iMvc0 = WELS_CLIP3 ((2 + kpMvcList[i].iMvX) >> 2, ksMvMin.iMvX, ksMvMax.iMvX);
+    iMvc1 = WELS_CLIP3 ((2 + kpMvcList[i].iMvY) >> 2, ksMvMin.iMvY, ksMvMax.iMvY);
+
+    if (((iMvc0 - sMv.iMvX) || (iMvc1 - sMv.iMvY))) { // skip a candidate equal to the current best MV
+      pFref2 = &pMe->pRefMb[iMvc1 * iStrideRef + iMvc0];
+
+      iSadCost = pSad (kpEncMb, iStrideEnc, pFref2, iStrideRef) +
+                 COST_MVD (kpMvdCost, (iMvc0 << 2) - ksMvp.iMvX, (iMvc1 << 2) - ksMvp.iMvY);
+
+      if (iSadCost < iBestSadCost) {
+        sMv.iMvX = iMvc0;
+        sMv.iMvY = iMvc1;
+        pRefMb = pFref2;
+        iBestSadCost = iSadCost;
+      }
+    }
+  }
+
+  pMe->sMv = sMv;
+  pMe->uiSadCost = iBestSadCost;
+  if (iBestSadCost < pMe->uSadPredISatd.uiSadPred) {
+    //  Step 2: Initial early Stop
+    /* -> qpel mv */
+    pMe->sMv.iMvX <<= 2;
+    pMe->sMv.iMvY <<= 2;
+    /* -> pRef */
+    pMe->pRefMb = pRefMb;
+    /* the best SAD cost is stored as the SATD cost (no SATD is computed here) */
+    pMe->uiSatdCost = iBestSadCost;
+  } else {
+    //  Step 3: Fast search pattern
+    WelsMotionEstimateIterativeSearch (pFuncList, pMe, iStrideEnc, iStrideRef, pRefMb);
+  }
+}
+// Adds the MVD cost to four neighbor SADs; on improvement updates *pBestCost and (*pIx, *pIy). Returns true when none improved (then *pIx, *pIy are left untouched).
+bool_t WelsMeSadCostSelect (int32_t* iSadCost, const uint16_t* kpMvdCost, int32_t* pBestCost, const int32_t kiDx,
+                            const int32_t kiDy, int32_t* pIx, int32_t* pIy) {
+  int32_t iTempSadCost[4];
+  int32_t iInputSadCost = *pBestCost;
+  iTempSadCost[0] = iSadCost[0] + COST_MVD (kpMvdCost, kiDx, kiDy - 4);
+  iTempSadCost[1] = iSadCost[1] + COST_MVD (kpMvdCost, kiDx, kiDy + 4);
+  iTempSadCost[2] = iSadCost[2] + COST_MVD (kpMvdCost, kiDx - 4, kiDy);
+  iTempSadCost[3] = iSadCost[3] + COST_MVD (kpMvdCost, kiDx + 4, kiDy);
+
+  if (iTempSadCost[0] < *pBestCost) {
+    *pBestCost = iTempSadCost[0];
+    *pIx = 0;
+    *pIy = 1;
+  }
+
+  if (iTempSadCost[1] < *pBestCost) {
+    *pBestCost = iTempSadCost[1];
+    *pIx = 0;
+    *pIy = -1;
+  }
+
+  if (iTempSadCost[2] < *pBestCost) {
+    *pBestCost = iTempSadCost[2];
+    *pIx = 1;
+    *pIy = 0;
+  }
+
+  if (iTempSadCost[3] < *pBestCost) {
+    *pBestCost = iTempSadCost[3];
+    *pIx = -1;
+    *pIy = 0;
+  }
+
+
+  return (*pBestCost == iInputSadCost);
+}
+// Starting at pFref, repeatedly steps to the cheapest of four neighbors (at most ITERATIVE_TIMES steps), then stores the qpel MV, the cost and the final pRefMb in pMe.
+void WelsMotionEstimateIterativeSearch (SWelsFuncPtrList* pFuncList, SWelsME* pMe, const int32_t kiStrideEnc,
+                                        const int32_t kiStrideRef, uint8_t* pFref) {
+  PSample4SadCostFunc			pSad					=  pFuncList->sSampleDealingFuncs.pfSample4Sad[pMe->uiPixel];
+
+  uint8_t* const kpEncMb = pMe->pEncMb;
+  const uint16_t* kpMvdCost = pMe->pMvdCost;
+
+  int32_t iMvDx = ((pMe->sMv.iMvX) << 2) - pMe->sMvp.iMvX;
+  int32_t iMvDy = ((pMe->sMv.iMvY) << 2) - pMe->sMvp.iMvY;
+
+  uint8_t* pRefMb = pFref;
+  int32_t iBestCost = (pMe->uiSadCost);
+
+  int32_t iTimeThreshold = ITERATIVE_TIMES;
+  ENFORCE_STACK_ALIGN_1D (int32_t, iSadCosts, 4, 16)
+
+  while (iTimeThreshold--) {
+    pSad (kpEncMb, kiStrideEnc, pRefMb, kiStrideRef, &iSadCosts[0]);
+
+    int32_t iX, iY;
+
+    const bool_t kbIsBestCostWorse = WelsMeSadCostSelect (iSadCosts, kpMvdCost, &iBestCost, iMvDx, iMvDy, &iX, &iY);
+    if (kbIsBestCostWorse) // true when no neighbor improved the cost; iX/iY are unset in that case
+      break;
+
+    iMvDx -= iX << 2 ;
+    iMvDy -= iY << 2 ;
+
+    pRefMb -= (iX + iY * kiStrideRef);
+
+  }
+
+  /* -> qpel mv */
+  pMe->sMv.iMvX = (iMvDx + pMe->sMvp.iMvX) & 0xFFFC; // mask clears the two low bits
+  pMe->sMv.iMvY = (iMvDy + pMe->sMvp.iMvY) & 0xFFFC;
+  pMe->uiSatdCost = pMe->uiSadCost = (iBestCost);
+  pMe->pRefMb = pRefMb;
+}
+
+} // namespace WelsSVCEnc
--- a/codec/encoder/core/src/svc_set_mb_syn_cavlc.cpp
+++ b/codec/encoder/core/src/svc_set_mb_syn_cavlc.cpp
@@ -1,385 +1,347 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	svc_set_mb_syn_cavlc.h
- *
- * \brief	Seting all syntax elements of mb and decoding residual with cavlc
- *
- * \date	2009.8.12 Created 
- *
- *************************************************************************************
- */
-
-#include "svc_enc_golomb.h"
-#include "vlc_encoder.h"
-#include "ls_defines.h"
-#include "svc_set_mb_syn_cavlc.h"
-
-namespace WelsSVCEnc {
-const uint32_t g_kuiIntra4x4CbpMap[48] =
-{
-	3, 29, 30, 17, 31, 18, 37,  8, 32, 38, 19,  9, 20, 10, 11, 2, //15
-	16, 33, 34, 21, 35, 22, 39,  4, 36, 40, 23,  5, 24,  6,  7, 1, //31
-	41, 42, 43, 25, 44, 26, 46, 12, 45, 47, 27, 13, 28, 14, 15, 0  //47
-};
-
-const uint32_t g_kuiInterCbpMap[48] = 
-{
-	0,  2,  3,  7,  4,  8, 17, 13,  5, 18,  9, 14, 10, 15, 16, 11, //15
-	1, 32, 33, 36, 34, 37, 44, 40, 35, 45, 38, 41, 39, 42, 43, 19, //31
-	6, 24, 25, 20, 26, 21, 46, 28, 27, 47, 22, 29, 23, 30, 31, 12 //47
-};	
-
-//============================Enhance Layer CAVLC Writing===========================
-void WelsSpatialWriteMbPred( sWelsEncCtx *pEncCtx, SSlice *pSlice, SMB *pCurMb )
-{
-	SMbCache* pMbCache	= &pSlice->sMbCacheInfo;
-	SBitStringAux *pBs	= pSlice->pSliceBsa;
-	SSliceHeaderExt* pSliceHeadExt = &pSlice->sSliceHeaderExt;
-	int32_t iNumRefIdxl0ActiveMinus1 = pSliceHeadExt->sSliceHeader.uiNumRefIdxL0Active - 1;
-
-	Mb_Type uiMbType = pCurMb->uiMbType;
-	int32_t iCbpChroma = pCurMb->uiCbp >> 4;
-	int32_t iCbpLuma   = pCurMb->uiCbp & 15;
-	int32_t i = 0;
-
-	SMVUnitXY sMvd[2];
-    bool_t* pPredFlag;
-    int8_t* pRemMode;
-
-	int32_t iMbOffset = 0;
-
-	switch( pSliceHeadExt->sSliceHeader.eSliceType )
-    {
-        case I_SLICE:
-            iMbOffset = 0;
-            break;
-        case P_SLICE:
-            iMbOffset = 5;
-            break;
-        default:
-            return;
-    }	
-
-	switch ( uiMbType )
-	{		
-	case MB_TYPE_INTRA4x4:			
-		/* mb type */
-        BsWriteUE( pBs, iMbOffset + 0 );
-		
-        /* prediction: luma */
-        pPredFlag = &pMbCache->pPrevIntra4x4PredModeFlag[0];
-        pRemMode  = &pMbCache->pRemIntra4x4PredModeFlag[0];
-		do
-        {
-            BsWriteOneBit( pBs, *pPredFlag );  /* b_prev_intra4x4_pred_mode */
- 
-			if ( !*pPredFlag )
-            {
-                BsWriteBits( pBs, 3, *pRemMode );
-            }
-			
-			pPredFlag++;
-			pRemMode++;
-			++ i;
-        } while (i < 16);
-
-        /* prediction: chroma */		
-		BsWriteUE( pBs, g_kiMapModeIntraChroma[pMbCache->uiChmaI8x8Mode] );
-
-		break;
-
-	case MB_TYPE_INTRA16x16:		
-		/* mb type */
-		BsWriteUE( pBs, 1 + iMbOffset + g_kiMapModeI16x16[pMbCache->uiLumaI16x16Mode] + (iCbpChroma << 2) + ( iCbpLuma == 0 ? 0 : 12 ) );
-		
-        /* prediction: chroma */		
-		BsWriteUE( pBs, g_kiMapModeIntraChroma[pMbCache->uiChmaI8x8Mode] );
-
-		break;
-
-	case MB_TYPE_16x16:	
-        BsWriteUE( pBs, 0 );//uiMbType
-		sMvd[0].sDeltaMv(pCurMb->sMv[0], pMbCache->sMbMvp[0]);
-
-		if ( iNumRefIdxl0ActiveMinus1 > 0 )
-		{
-			BsWriteTE( pBs, iNumRefIdxl0ActiveMinus1, pCurMb->pRefIndex[0] );
-		}
-
-		BsWriteSE( pBs, sMvd[0].iMvX);
-		BsWriteSE( pBs, sMvd[0].iMvY);
-		
-		break;
-
-	case MB_TYPE_16x8:
-		BsWriteUE( pBs, 1 );//uiMbType
-		
-		sMvd[0].sDeltaMv(pCurMb->sMv[0], pMbCache->sMbMvp[0]);
-		sMvd[1].sDeltaMv(pCurMb->sMv[8], pMbCache->sMbMvp[1]);
-
-		if ( iNumRefIdxl0ActiveMinus1 > 0 )
-		{
-			BsWriteTE( pBs, iNumRefIdxl0ActiveMinus1, pCurMb->pRefIndex[0] );
-			BsWriteTE( pBs, iNumRefIdxl0ActiveMinus1, pCurMb->pRefIndex[2] );
-		}
-		BsWriteSE( pBs, sMvd[0].iMvX );//block0
-		BsWriteSE( pBs, sMvd[0].iMvY );
-		BsWriteSE( pBs, sMvd[1].iMvX );//block1
-		BsWriteSE( pBs, sMvd[1].iMvY );
-		
-		break;
-
-	case MB_TYPE_8x16:		
-		BsWriteUE( pBs, 2 );//uiMbType
-		
-		sMvd[0].sDeltaMv(pCurMb->sMv[0], pMbCache->sMbMvp[0]);
-		sMvd[1].sDeltaMv(pCurMb->sMv[2], pMbCache->sMbMvp[1]);
-
-		if ( iNumRefIdxl0ActiveMinus1 > 0 )
-		{
-			BsWriteTE( pBs, iNumRefIdxl0ActiveMinus1, pCurMb->pRefIndex[0] );
-			BsWriteTE( pBs, iNumRefIdxl0ActiveMinus1, pCurMb->pRefIndex[1] );
-		}
-		BsWriteSE( pBs, sMvd[0].iMvX );//block0
-		BsWriteSE( pBs, sMvd[0].iMvY );
-		BsWriteSE( pBs, sMvd[1].iMvX );//block1
-		BsWriteSE( pBs, sMvd[1].iMvY );
-		
-		break;
-	}
-}
-
-void WelsSpatialWriteSubMbPred( sWelsEncCtx *pEncCtx, SSlice *pSlice, SMB *pCurMb )
-{
-	SMbCache* pMbCache	= &pSlice->sMbCacheInfo;
-	SBitStringAux *pBs	= pSlice->pSliceBsa;
-	SSliceHeaderExt* pSliceHeadExt = &pSlice->sSliceHeaderExt;
-
-	int32_t iNumRefIdxl0ActiveMinus1 = pSliceHeadExt->sSliceHeader.uiNumRefIdxL0Active - 1;
-	int32_t i;
-
-	bool_t bSubRef0 = false;	
-	const uint8_t* kpScan4 = &(g_kuiMbCountScan4Idx[0]);
-
-	/* mb type */
-	if ( LD32(pCurMb->pRefIndex) == 0 )
-	{
-		BsWriteUE( pBs, 4 );
-		bSubRef0 = false;
-	}
-	else
-	{
-		BsWriteUE( pBs, 3 );
-		bSubRef0 = true;
-	}
-
-	//step 1: sub_mb_type
-	for ( i = 0; i < 4; i++ )
-	{
-		BsWriteUE( pBs, 0 );
-	}
-
-	//step 2: get and write uiRefIndex and sMvd
-	if ( iNumRefIdxl0ActiveMinus1 > 0 && bSubRef0 ) 
-	{
-		BsWriteTE( pBs, iNumRefIdxl0ActiveMinus1, pCurMb->pRefIndex[0] );
-		BsWriteTE( pBs, iNumRefIdxl0ActiveMinus1, pCurMb->pRefIndex[1] );
-		BsWriteTE( pBs, iNumRefIdxl0ActiveMinus1, pCurMb->pRefIndex[2] );
-		BsWriteTE( pBs, iNumRefIdxl0ActiveMinus1, pCurMb->pRefIndex[3] );
-	}			
-	//write sMvd
-	for ( i = 0; i < 4; i++ )
-	{
-		BsWriteSE( pBs, pCurMb->sMv[*kpScan4].iMvX - pMbCache->sMbMvp[i].iMvX );
-		BsWriteSE( pBs, pCurMb->sMv[*kpScan4].iMvY - pMbCache->sMbMvp[i].iMvY );
-		kpScan4 += 4;
-	}
-}
-
-//============================Base Layer CAVLC Writing===============================
-void WelsSpatialWriteMbSyn( sWelsEncCtx *pEncCtx, SSlice *pSlice, SMB *pCurMb )
-{
-	SBitStringAux *pBs = pSlice->pSliceBsa;
-	SMbCache* pMbCache = &pSlice->sMbCacheInfo;
-	
-	/* Step 1: write mb type and pred */
-	if ( IS_Inter_8x8(pCurMb->uiMbType))
-	{
-		WelsSpatialWriteSubMbPred( pEncCtx, pSlice, pCurMb );
-	}
-	else
-	{
-		WelsSpatialWriteMbPred( pEncCtx, pSlice, pCurMb );
-	}
-
-	/* Step 2: write coded block patern */	
-    if( IS_INTRA4x4 ( pCurMb->uiMbType ) )
-    {
-        BsWriteUE( pBs, g_kuiIntra4x4CbpMap[pCurMb->uiCbp] );
-    }
-    else if( !IS_INTRA16x16(pCurMb->uiMbType) )
-    {
-        BsWriteUE( pBs, g_kuiInterCbpMap[pCurMb->uiCbp] );
-    }
-
-	/* Step 3: write QP and residual */
-	if( pCurMb->uiCbp > 0 || IS_INTRA16x16(pCurMb->uiMbType) )
-	{
-		const int32_t kiDeltaQp = pCurMb->uiLumaQp - pSlice->uiLastMbQp;
-		pSlice->uiLastMbQp = pCurMb->uiLumaQp;		
-
-        BsWriteSE( pBs, kiDeltaQp );		
-		WelsWriteMbResidual( pMbCache, pCurMb, pBs );
-	}
-	else
-	{
-		pCurMb->uiLumaQp = pSlice->uiLastMbQp;
-		pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51(pCurMb->uiLumaQp + pEncCtx->pCurDqLayer->sLayerInfo.pPpsP->uiChromaQpIndexOffset)];
-	}
-}
-
-void WelsWriteMbResidual( SMbCache* sMbCacheInfo, SMB *pCurMb, SBitStringAux *pBs )
-{
-	int32_t i;
-	Mb_Type uiMbType					= pCurMb->uiMbType;	
-	const int32_t kiCbpChroma		= pCurMb->uiCbp >> 4;
-	const int32_t kiCbpLuma			= pCurMb->uiCbp & 0x0F;
-	int8_t *pNonZeroCoeffCount	= sMbCacheInfo->iNonZeroCoeffCount;
-	int16_t *pBlock;
-	int8_t iA, iB, iC;
-
-	if ( IS_INTRA16x16(uiMbType) )
-	{		
-        /* DC luma */
-		iA = pNonZeroCoeffCount[8];
-		iB = pNonZeroCoeffCount[ 1];
-		WELS_NON_ZERO_COUNT_AVERAGE(iC,iA,iB);
-		WriteBlockResidualCavlc( sMbCacheInfo->pDct->iLumaI16x16Dc, 15, 1, LUMA_4x4, iC, pBs);
-
-		/* AC Luma */
-        if( kiCbpLuma )
-        {
-			pBlock = sMbCacheInfo->pDct->iLumaBlock[0];		
-		
-			for( i=0;i<16;i++ )
-            {
-				int32_t iIdx = g_kuiCache48CountScan4Idx[i];
-				iA = pNonZeroCoeffCount[iIdx-1];
-				iB = pNonZeroCoeffCount[iIdx-8];
-				WELS_NON_ZERO_COUNT_AVERAGE(iC,iA,iB);
-				WriteBlockResidualCavlc( pBlock, 14, pNonZeroCoeffCount[iIdx]>0, LUMA_AC, iC, pBs);
-				pBlock += 16;				
-            } 
-        }		
-	}
-	else
-	{
-        /* Luma DC AC */
-        if ( kiCbpLuma )
-		{			
-			pBlock = sMbCacheInfo->pDct->iLumaBlock[0];
-			
-			for( i=0; i<16; i+=4 )
-            {
-				if( kiCbpLuma & (1 << (i >> 2)) )
-				{
-					int32_t iIdx = g_kuiCache48CountScan4Idx[i];
-					const int8_t kiA = pNonZeroCoeffCount[iIdx];
-					const int8_t kiB = pNonZeroCoeffCount[iIdx+1];
-					const int8_t kiC = pNonZeroCoeffCount[iIdx+8];
-					const int8_t kiD = pNonZeroCoeffCount[iIdx+9];
-					iA = pNonZeroCoeffCount[iIdx-1];
-					iB = pNonZeroCoeffCount[iIdx-8];
-					WELS_NON_ZERO_COUNT_AVERAGE(iC,iA,iB);
-					WriteBlockResidualCavlc( pBlock, 15, kiA>0, LUMA_4x4, iC, pBs );
-
-					iA = kiA;
-					iB = pNonZeroCoeffCount[iIdx-7];
-					WELS_NON_ZERO_COUNT_AVERAGE(iC,iA,iB);
-					WriteBlockResidualCavlc( pBlock + 16, 15, kiB>0, LUMA_4x4, iC, pBs );
-
-					iA = pNonZeroCoeffCount[iIdx+7];
-					iB = kiA;
-					WELS_NON_ZERO_COUNT_AVERAGE(iC,iA,iB);
-					WriteBlockResidualCavlc( pBlock + 32, 15, kiC>0, LUMA_4x4, iC, pBs );
-
-					iA = kiC;
-					iB = kiB;
-					WELS_NON_ZERO_COUNT_AVERAGE(iC,iA,iB);
-					WriteBlockResidualCavlc( pBlock + 48, 15, kiD>0, LUMA_4x4, iC, pBs );
-				}
-				pBlock += 64;				
-           } 
-        }				
-	}
-
-    if( kiCbpChroma )
-    {
-        /* Chroma DC residual present */
-		pBlock = sMbCacheInfo->pDct->iChromaDc[0]; // Cb
-        WriteBlockResidualCavlc( pBlock, 3, 1, CHROMA_DC, CHROMA_DC_NC_OFFSET, pBs );
-		
-		pBlock += 4; // Cr
-		WriteBlockResidualCavlc( pBlock, 3, 1, CHROMA_DC, CHROMA_DC_NC_OFFSET, pBs );
- 
-		/* Chroma AC residual present */
-        if( kiCbpChroma & 0x02 ) 
-        {
-			const uint8_t *kCache48CountScan4Idx16base = &g_kuiCache48CountScan4Idx[16];
-			pBlock = sMbCacheInfo->pDct->iChromaBlock[0]; // Cb
-			
-			for( i=0; i<4; i++ )
-            {
-				int32_t iIdx = kCache48CountScan4Idx16base[i];
-				iA = pNonZeroCoeffCount[iIdx-1];
-				iB = pNonZeroCoeffCount[iIdx-8];
-				WELS_NON_ZERO_COUNT_AVERAGE(iC,iA,iB);
-				WriteBlockResidualCavlc( pBlock, 14, pNonZeroCoeffCount[iIdx]>0, CHROMA_AC, iC, pBs );
-				pBlock += 16;			
-            }
-
-			pBlock = sMbCacheInfo->pDct->iChromaBlock[4]; // Cr
-		
-			for( i=0;i<4;i++ )
-            {
-				int32_t iIdx = 24+kCache48CountScan4Idx16base[i];
-				iA = pNonZeroCoeffCount[iIdx-1];
-				iB = pNonZeroCoeffCount[iIdx-8];
-				WELS_NON_ZERO_COUNT_AVERAGE(iC,iA,iB);
-				WriteBlockResidualCavlc( pBlock, 14,pNonZeroCoeffCount[iIdx]>0, CHROMA_AC, iC, pBs );
-				pBlock += 16;			
-            }
-        }
-    }	
-}
-
-} // namespace WelsSVCEnc
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	svc_set_mb_syn_cavlc.h
+ *
+ * \brief	Setting all syntax elements of mb and encoding residual with cavlc
+ *
+ * \date	2009.8.12 Created
+ *
+ *************************************************************************************
+ */
+
+#include "svc_enc_golomb.h"
+#include "vlc_encoder.h"
+#include "ls_defines.h"
+#include "svc_set_mb_syn_cavlc.h"
+
+namespace WelsSVCEnc {
+const uint32_t g_kuiIntra4x4CbpMap[48] = { // indexed by uiCbp; entry is the value passed to BsWriteUE for Intra4x4 MBs
+  3, 29, 30, 17, 31, 18, 37,  8, 32, 38, 19,  9, 20, 10, 11, 2, // uiCbp 0..15
+  16, 33, 34, 21, 35, 22, 39,  4, 36, 40, 23,  5, 24,  6,  7, 1, // uiCbp 16..31
+  41, 42, 43, 25, 44, 26, 46, 12, 45, 47, 27, 13, 28, 14, 15, 0  // uiCbp 32..47
+}; // NOTE(review): looks like the intra column of H.264 Table 9-4 inverted (cbp -> codeNum) - confirm against the spec
+
+const uint32_t g_kuiInterCbpMap[48] = { // indexed by uiCbp; entry is the value passed to BsWriteUE for MBs that are neither Intra4x4 nor Intra16x16
+  0,  2,  3,  7,  4,  8, 17, 13,  5, 18,  9, 14, 10, 15, 16, 11, // uiCbp 0..15
+  1, 32, 33, 36, 34, 37, 44, 40, 35, 45, 38, 41, 39, 42, 43, 19, // uiCbp 16..31
+  6, 24, 25, 20, 26, 21, 46, 28, 27, 47, 22, 29, 23, 30, 31, 12 // uiCbp 32..47
+}; // NOTE(review): looks like the inter column of H.264 Table 9-4 inverted (cbp -> codeNum) - confirm against the spec
+
+//============================Enhance Layer CAVLC Writing===========================
+void WelsSpatialWriteMbPred (sWelsEncCtx* pEncCtx, SSlice* pSlice, SMB* pCurMb) { // writes mb type plus intra pred modes or ref index / MV difference; pEncCtx is not used here
+  SMbCache* pMbCache	= &pSlice->sMbCacheInfo;
+  SBitStringAux* pBs	= pSlice->pSliceBsa; // target of every BsWrite* call below
+  SSliceHeaderExt* pSliceHeadExt = &pSlice->sSliceHeaderExt;
+  int32_t iNumRefIdxl0ActiveMinus1 = pSliceHeadExt->sSliceHeader.uiNumRefIdxL0Active - 1;
+
+  Mb_Type uiMbType = pCurMb->uiMbType;
+  int32_t iCbpChroma = pCurMb->uiCbp >> 4; // bits 4 and up of uiCbp
+  int32_t iCbpLuma   = pCurMb->uiCbp & 15; // low four bits of uiCbp
+  int32_t i = 0;
+
+  SMVUnitXY sMvd[2]; // MV differences for at most two partitions
+  bool_t* pPredFlag;
+  int8_t* pRemMode;
+
+  int32_t iMbOffset = 0; // added to the intra mb type values only: 0 in I slices, 5 in P slices
+
+  switch (pSliceHeadExt->sSliceHeader.eSliceType) {
+  case I_SLICE:
+    iMbOffset = 0;
+    break;
+  case P_SLICE:
+    iMbOffset = 5;
+    break;
+  default:
+    return; // any other slice type: nothing is written
+  }
+
+  switch (uiMbType) { // MB types without a case below write nothing
+  case MB_TYPE_INTRA4x4:
+    /* mb type */
+    BsWriteUE (pBs, iMbOffset + 0);
+
+    /* prediction: luma, 16 entries: one flag each, followed by a 3-bit mode when the flag is 0 */
+    pPredFlag = &pMbCache->pPrevIntra4x4PredModeFlag[0];
+    pRemMode  = &pMbCache->pRemIntra4x4PredModeFlag[0];
+    do {
+      BsWriteOneBit (pBs, *pPredFlag);   /* b_prev_intra4x4_pred_mode */
+
+      if (!*pPredFlag) {
+        BsWriteBits (pBs, 3, *pRemMode);
+      }
+
+      pPredFlag++;
+      pRemMode++;
+      ++ i;
+    } while (i < 16);
+
+    /* prediction: chroma */
+    BsWriteUE (pBs, g_kiMapModeIntraChroma[pMbCache->uiChmaI8x8Mode]);
+
+    break;
+
+  case MB_TYPE_INTRA16x16:
+    /* mb type: 1 + slice offset + mapped I16x16 mode + 4 * chroma cbp, plus 12 when any luma cbp bit is set */
+    BsWriteUE (pBs, 1 + iMbOffset + g_kiMapModeI16x16[pMbCache->uiLumaI16x16Mode] + (iCbpChroma << 2) +
+               (iCbpLuma == 0 ? 0 : 12));
+
+    /* prediction: chroma */
+    BsWriteUE (pBs, g_kiMapModeIntraChroma[pMbCache->uiChmaI8x8Mode]);
+
+    break;
+
+  case MB_TYPE_16x16:
+    BsWriteUE (pBs, 0); //uiMbType
+    sMvd[0].sDeltaMv (pCurMb->sMv[0], pMbCache->sMbMvp[0]); // presumably sMvd = sMv - sMbMvp; sDeltaMv is defined elsewhere - TODO confirm
+
+    if (iNumRefIdxl0ActiveMinus1 > 0) { // ref index is written only when more than one L0 reference is active
+      BsWriteTE (pBs, iNumRefIdxl0ActiveMinus1, pCurMb->pRefIndex[0]);
+    }
+
+    BsWriteSE (pBs, sMvd[0].iMvX);
+    BsWriteSE (pBs, sMvd[0].iMvY);
+
+    break;
+
+  case MB_TYPE_16x8:
+    BsWriteUE (pBs, 1); //uiMbType
+
+    sMvd[0].sDeltaMv (pCurMb->sMv[0], pMbCache->sMbMvp[0]);
+    sMvd[1].sDeltaMv (pCurMb->sMv[8], pMbCache->sMbMvp[1]); // second partition reads sMv[8] and pRefIndex[2]
+
+    if (iNumRefIdxl0ActiveMinus1 > 0) { // both ref indices come before any MV difference
+      BsWriteTE (pBs, iNumRefIdxl0ActiveMinus1, pCurMb->pRefIndex[0]);
+      BsWriteTE (pBs, iNumRefIdxl0ActiveMinus1, pCurMb->pRefIndex[2]);
+    }
+    BsWriteSE (pBs, sMvd[0].iMvX); //block0
+    BsWriteSE (pBs, sMvd[0].iMvY);
+    BsWriteSE (pBs, sMvd[1].iMvX); //block1
+    BsWriteSE (pBs, sMvd[1].iMvY);
+
+    break;
+
+  case MB_TYPE_8x16:
+    BsWriteUE (pBs, 2); //uiMbType
+
+    sMvd[0].sDeltaMv (pCurMb->sMv[0], pMbCache->sMbMvp[0]);
+    sMvd[1].sDeltaMv (pCurMb->sMv[2], pMbCache->sMbMvp[1]); // second partition reads sMv[2] and pRefIndex[1]
+
+    if (iNumRefIdxl0ActiveMinus1 > 0) { // both ref indices come before any MV difference
+      BsWriteTE (pBs, iNumRefIdxl0ActiveMinus1, pCurMb->pRefIndex[0]);
+      BsWriteTE (pBs, iNumRefIdxl0ActiveMinus1, pCurMb->pRefIndex[1]);
+    }
+    BsWriteSE (pBs, sMvd[0].iMvX); //block0
+    BsWriteSE (pBs, sMvd[0].iMvY);
+    BsWriteSE (pBs, sMvd[1].iMvX); //block1
+    BsWriteSE (pBs, sMvd[1].iMvY);
+
+    break;
+  }
+}
+
+void WelsSpatialWriteSubMbPred (sWelsEncCtx* pEncCtx, SSlice* pSlice, SMB* pCurMb) { // writes mb type, four sub mb types, optional ref indices and four MV differences; pEncCtx is not used here
+  SMbCache* pMbCache	= &pSlice->sMbCacheInfo;
+  SBitStringAux* pBs	= pSlice->pSliceBsa; // target of every BsWrite* call below
+  SSliceHeaderExt* pSliceHeadExt = &pSlice->sSliceHeaderExt;
+
+  int32_t iNumRefIdxl0ActiveMinus1 = pSliceHeadExt->sSliceHeader.uiNumRefIdxL0Active - 1;
+  int32_t i;
+
+  bool_t bSubRef0 = false; // despite the name, true means mb type 3 was written and ref indices may follow
+  const uint8_t* kpScan4 = & (g_kuiMbCountScan4Idx[0]); // advanced by 4 per partition below, i.e. entries 0, 4, 8, 12
+
+  /* mb type: 4 when the LD32 value is zero, otherwise 3 */
+  if (LD32 (pCurMb->pRefIndex) == 0) { // presumably one 32-bit load covering the four pRefIndex entries - TODO confirm LD32
+    BsWriteUE (pBs, 4);
+    bSubRef0 = false;
+  } else {
+    BsWriteUE (pBs, 3);
+    bSubRef0 = true;
+  }
+
+  //step 1: sub_mb_type, always 0 for each of the four partitions
+  for (i = 0; i < 4; i++) {
+    BsWriteUE (pBs, 0);
+  }
+
+  //step 2: write uiRefIndex (only for mb type 3 with more than one active L0 reference) and sMvd
+  if (iNumRefIdxl0ActiveMinus1 > 0 && bSubRef0) {
+    BsWriteTE (pBs, iNumRefIdxl0ActiveMinus1, pCurMb->pRefIndex[0]);
+    BsWriteTE (pBs, iNumRefIdxl0ActiveMinus1, pCurMb->pRefIndex[1]);
+    BsWriteTE (pBs, iNumRefIdxl0ActiveMinus1, pCurMb->pRefIndex[2]);
+    BsWriteTE (pBs, iNumRefIdxl0ActiveMinus1, pCurMb->pRefIndex[3]);
+  }
+  //write sMvd: motion vector minus its predictor sMbMvp[i], X then Y
+  for (i = 0; i < 4; i++) {
+    BsWriteSE (pBs, pCurMb->sMv[*kpScan4].iMvX - pMbCache->sMbMvp[i].iMvX);
+    BsWriteSE (pBs, pCurMb->sMv[*kpScan4].iMvY - pMbCache->sMbMvp[i].iMvY);
+    kpScan4 += 4;
+  }
+}
+
+//============================Base Layer CAVLC Writing===============================
+void WelsSpatialWriteMbSyn (sWelsEncCtx* pEncCtx, SSlice* pSlice, SMB* pCurMb) { // writes one MB: type/prediction, coded block pattern, then delta QP and residual
+  SBitStringAux* pBs = pSlice->pSliceBsa;
+  SMbCache* pMbCache = &pSlice->sMbCacheInfo;
+
+  /* Step 1: write mb type and pred */
+  if (IS_Inter_8x8 (pCurMb->uiMbType)) {
+    WelsSpatialWriteSubMbPred (pEncCtx, pSlice, pCurMb);
+  } else {
+    WelsSpatialWriteMbPred (pEncCtx, pSlice, pCurMb);
+  }
+
+  /* Step 2: write coded block pattern (not written for Intra16x16, whose mb type value already includes the cbp) */
+  if (IS_INTRA4x4 (pCurMb->uiMbType)) {
+    BsWriteUE (pBs, g_kuiIntra4x4CbpMap[pCurMb->uiCbp]);
+  } else if (!IS_INTRA16x16 (pCurMb->uiMbType)) {
+    BsWriteUE (pBs, g_kuiInterCbpMap[pCurMb->uiCbp]);
+  }
+
+  /* Step 3: write QP and residual */
+  if (pCurMb->uiCbp > 0 || IS_INTRA16x16 (pCurMb->uiMbType)) {
+    const int32_t kiDeltaQp = pCurMb->uiLumaQp - pSlice->uiLastMbQp; // QP is sent as a difference to the last written MB QP of this slice
+    pSlice->uiLastMbQp = pCurMb->uiLumaQp;
+
+    BsWriteSE (pBs, kiDeltaQp);
+    WelsWriteMbResidual (pMbCache, pCurMb, pBs);
+  } else { // nothing more is written: the MB takes over the last written QP and its chroma QP is re-derived from it
+    pCurMb->uiLumaQp = pSlice->uiLastMbQp;
+    pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51 (pCurMb->uiLumaQp +
+                                            pEncCtx->pCurDqLayer->sLayerInfo.pPpsP->uiChromaQpIndexOffset)];
+  }
+}
+
+void WelsWriteMbResidual (SMbCache* sMbCacheInfo, SMB* pCurMb, SBitStringAux* pBs) { // writes luma then chroma residual blocks of one MB through WriteBlockResidualCavlc
+  int32_t i;
+  Mb_Type uiMbType					= pCurMb->uiMbType;
+  const int32_t kiCbpChroma		= pCurMb->uiCbp >> 4; // bits 4 and up of uiCbp
+  const int32_t kiCbpLuma			= pCurMb->uiCbp & 0x0F; // low four bits; bit k is tested for the k-th group of four luma blocks below
+  int8_t* pNonZeroCoeffCount	= sMbCacheInfo->iNonZeroCoeffCount; // count cache; the +1/+8/+9 quad below implies rows 8 entries apart
+  int16_t* pBlock;
+  int8_t iA, iB, iC; // iA/iB: counts at iIdx - 1 / iIdx - 8 (presumably left / above); iC: result of WELS_NON_ZERO_COUNT_AVERAGE (macro defined elsewhere)
+
+  if (IS_INTRA16x16 (uiMbType)) {
+    /* DC luma: always written for Intra16x16 */
+    iA = pNonZeroCoeffCount[8];
+    iB = pNonZeroCoeffCount[ 1];
+    WELS_NON_ZERO_COUNT_AVERAGE (iC, iA, iB);
+    WriteBlockResidualCavlc (sMbCacheInfo->pDct->iLumaI16x16Dc, 15, 1, LUMA_4x4, iC, pBs);
+
+    /* AC Luma: all 16 blocks when any luma cbp bit is set */
+    if (kiCbpLuma) {
+      pBlock = sMbCacheInfo->pDct->iLumaBlock[0];
+
+      for (i = 0; i < 16; i++) {
+        int32_t iIdx = g_kuiCache48CountScan4Idx[i]; // position of block i in the count cache
+        iA = pNonZeroCoeffCount[iIdx - 1];
+        iB = pNonZeroCoeffCount[iIdx - 8];
+        WELS_NON_ZERO_COUNT_AVERAGE (iC, iA, iB);
+        WriteBlockResidualCavlc (pBlock, 14, pNonZeroCoeffCount[iIdx] > 0, LUMA_AC, iC, pBs);
+        pBlock += 16; // 16 coefficients per block
+      }
+    }
+  } else {
+    /* Luma DC AC */
+    if (kiCbpLuma) {
+      pBlock = sMbCacheInfo->pDct->iLumaBlock[0];
+
+      for (i = 0; i < 16; i += 4) { // one iteration per group of four blocks
+        if (kiCbpLuma & (1 << (i >> 2))) {
+          int32_t iIdx = g_kuiCache48CountScan4Idx[i]; // cache position of the group's first block
+          const int8_t kiA = pNonZeroCoeffCount[iIdx]; // counts of the group's four blocks: iIdx, +1, +8, +9
+          const int8_t kiB = pNonZeroCoeffCount[iIdx + 1];
+          const int8_t kiC = pNonZeroCoeffCount[iIdx + 8];
+          const int8_t kiD = pNonZeroCoeffCount[iIdx + 9];
+          iA = pNonZeroCoeffCount[iIdx - 1]; // first block: neighbours at iIdx - 1 and iIdx - 8
+          iB = pNonZeroCoeffCount[iIdx - 8];
+          WELS_NON_ZERO_COUNT_AVERAGE (iC, iA, iB);
+          WriteBlockResidualCavlc (pBlock, 15, kiA > 0, LUMA_4x4, iC, pBs);
+
+          iA = kiA; // second block (iIdx + 1): neighbours are the first block and iIdx - 7
+          iB = pNonZeroCoeffCount[iIdx - 7];
+          WELS_NON_ZERO_COUNT_AVERAGE (iC, iA, iB);
+          WriteBlockResidualCavlc (pBlock + 16, 15, kiB > 0, LUMA_4x4, iC, pBs);
+
+          iA = pNonZeroCoeffCount[iIdx + 7]; // third block (iIdx + 8): neighbours are iIdx + 7 and the first block
+          iB = kiA;
+          WELS_NON_ZERO_COUNT_AVERAGE (iC, iA, iB);
+          WriteBlockResidualCavlc (pBlock + 32, 15, kiC > 0, LUMA_4x4, iC, pBs);
+
+          iA = kiC; // fourth block (iIdx + 9): neighbours are the third and second blocks
+          iB = kiB;
+          WELS_NON_ZERO_COUNT_AVERAGE (iC, iA, iB);
+          WriteBlockResidualCavlc (pBlock + 48, 15, kiD > 0, LUMA_4x4, iC, pBs);
+        }
+        pBlock += 64; // advance past the whole group even when it was not written
+      }
+    }
+  }
+
+  if (kiCbpChroma) {
+    /* Chroma DC residual present: written whenever kiCbpChroma is non-zero */
+    pBlock = sMbCacheInfo->pDct->iChromaDc[0]; // Cb
+    WriteBlockResidualCavlc (pBlock, 3, 1, CHROMA_DC, CHROMA_DC_NC_OFFSET, pBs);
+
+    pBlock += 4; // Cr
+    WriteBlockResidualCavlc (pBlock, 3, 1, CHROMA_DC, CHROMA_DC_NC_OFFSET, pBs);
+
+    /* Chroma AC residual present: only when bit 1 of kiCbpChroma is set */
+    if (kiCbpChroma & 0x02) {
+      const uint8_t* kCache48CountScan4Idx16base = &g_kuiCache48CountScan4Idx[16]; // scan entries 16..19 are used for chroma
+      pBlock = sMbCacheInfo->pDct->iChromaBlock[0]; // Cb
+
+      for (i = 0; i < 4; i++) {
+        int32_t iIdx = kCache48CountScan4Idx16base[i];
+        iA = pNonZeroCoeffCount[iIdx - 1];
+        iB = pNonZeroCoeffCount[iIdx - 8];
+        WELS_NON_ZERO_COUNT_AVERAGE (iC, iA, iB);
+        WriteBlockResidualCavlc (pBlock, 14, pNonZeroCoeffCount[iIdx] > 0, CHROMA_AC, iC, pBs);
+        pBlock += 16;
+      }
+
+      pBlock = sMbCacheInfo->pDct->iChromaBlock[4]; // Cr
+
+      for (i = 0; i < 4; i++) {
+        int32_t iIdx = 24 + kCache48CountScan4Idx16base[i]; // Cr counts sit 24 entries after the Cb ones
+        iA = pNonZeroCoeffCount[iIdx - 1];
+        iB = pNonZeroCoeffCount[iIdx - 8];
+        WELS_NON_ZERO_COUNT_AVERAGE (iC, iA, iB);
+        WriteBlockResidualCavlc (pBlock, 14, pNonZeroCoeffCount[iIdx] > 0, CHROMA_AC, iC, pBs);
+        pBlock += 16;
+      }
+    }
+  }
+}
+
+} // namespace WelsSVCEnc
--- a/codec/encoder/core/src/utils.cpp
+++ b/codec/encoder/core/src/utils.cpp
@@ -1,513 +1,482 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	utils.c
- *
- * \brief	common tool/function utilization
- *
- * \date	03/10/2009 Created
- *
- *************************************************************************************
- */
-#include <string.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <math.h>
-#include <time.h>
-#if defined(WIN32)
-#include <windows.h>
-#include <sys/types.h>
-#include <sys/timeb.h>
-#else
-#include <sys/time.h>
-#endif
-
-#include "utils.h"
-#include "macros.h"
-#include "wels_const.h"
-#include "property.h"
-#include "cpu_core.h"
-#include "encoder_context.h"
-#include "as264_common.h"
-#include "property.h"
-#include "crt_util_safe_x.h"	// Safe CRT routines like utils for cross platforms
-
-
-namespace WelsSVCEnc {
-
-void WelsLogDefault( void *pCtx, const int32_t kiLevel, const str_t *kpFmtStr, va_list argv );
-void WelsLogNil( void *pCtx, const int32_t kiLevel, const str_t *kpFmtStr, va_list argv );
-
-real32_t WelsCalcPsnr(	const void *kpTarPic,
-							const int32_t kiTarStride,
-							const void *kpRefPic,
-							const int32_t kiRefStride,
-							const int32_t kiWidth,
-							const int32_t kiHeight	);
-
-// to fill default routines
-#ifdef ENABLE_TRACE_FILE
-PWelsLogCallbackFunc wlog	= WelsLogDefault;
-#else
-PWelsLogCallbackFunc wlog	= WelsLogNil;
-#endif
-
-iWelsLogLevel		g_iLevelLog	= WELS_LOG_DEFAULT;	// default log iLevel
-int32_t			g_iSizeLogBuf	= 1024;			// pBuffer size for each log output
-
-/*
- *	Log output routines
- */
-
-/*!
- * \brief	get log tag
- * \param	kiLevel		log iLevel
- * \return  tag of log iLevel
- */
-static inline str_t *GetLogTag( const int32_t kiLevel, int32_t *pBit )
-{	
-	int32_t iShift	= 0;
-	int32_t iVal		= 0;
-	bool_t	bFound	= false;
-
-	if ( kiLevel <= 0 || kiLevel > (1 << (WELS_LOG_LEVEL_COUNT-1)) || NULL == pBit )
-		return NULL;
-
-	for(;;)
-	{
-		if ( iShift >= WELS_LOG_LEVEL_COUNT )
-			break;
-		iVal	= (1 << iShift);
-		if ( iVal == kiLevel )
-		{
-			bFound	= true;
-			break;
-		}
-		++ iShift;
-	}
-
-	if ( bFound )
-	{
-		*pBit	= iShift;
-		return (str_t *)g_sWelsLogTags[iShift];
-	}
-	return NULL;
-}
-
-/*! 
- *************************************************************************************
- * \brief	System trace log output in Wels
- *
- * \param	pCtx	instance pointer
- * \param	kiLevel	log iLevel ( WELS_LOG_QUIET, ERROR, WARNING, INFO, DEBUG )
- * \param	kpFmtStr	formated string to mount
- * \param 	argv	pData string argument
- *
- * \return	NONE
- *
- * \note	N/A
- *************************************************************************************
- */
-void WelsLogDefault( void *pCtx, const int32_t kiLevel, const str_t *kpFmtStr, va_list argv )
-{
-	sWelsEncCtx *pEncCtx	= (sWelsEncCtx *)pCtx;
-	iWelsLogLevel		 iVal	= (kiLevel & g_iLevelLog);
-
-	if ( 0 == iVal || NULL == pEncCtx )	// such iLevel not enabled
-	{
-		return;
-	}
-	else
-	{
-		str_t pBuf[WELS_LOG_BUF_SIZE+1] = {0};		
-		const int32_t kiBufSize = sizeof(pBuf) / sizeof(pBuf[0]) - 1;
-		int32_t iCurUsed = 0;
-		int32_t iBufUsed = 0;
-		int32_t iBufLeft = kiBufSize - iBufUsed;
-		
-		if ( pEncCtx ){
-			time_t l_time;
-#if defined(WIN32)
-#if defined(_MSC_VER)
-#if _MSC_VER >= 1500
-			struct tm t_now;
-#else//VC6
-			struct tm* t_now;
-#endif//_MSC_VER >= 1500
-#endif//_MSC_VER
-#else//__GNUC__
-			struct tm* t_now;
-#endif//WIN32			
-			
-#if defined( WIN32 )
-			struct _timeb tb;
-			
-			time(&l_time);
-#ifdef _MSC_VER
-#if _MSC_VER >= 1500
-			LOCALTIME(&t_now, &l_time);
-#else
-			t_now = LOCALTIME(&l_time);
-			if ( NULL == t_now )
-			{
-				return;
-			}
-#endif//_MSC_VER >= 1500
-#endif//_MSC_VER			
-			FTIME(&tb);
-#elif defined( __GNUC__ )
-			struct timeval tv;
-			time(&l_time);
-			t_now = (struct tm *)LOCALTIME(&l_time);
-			gettimeofday(&tv,NULL);
-#endif//WIN32
-			if (iBufLeft > 0){
-#ifdef _MSC_VER
-#if _MSC_VER >= 1500
-				iCurUsed = SNPRINTF( &pBuf[iBufUsed], iBufLeft, iBufLeft, "[0x%p @ ", pEncCtx );	// confirmed_safe_unsafe_usage
-#else
-				iCurUsed = SNPRINTF( &pBuf[iBufUsed], iBufLeft, "[0x%p @ ", pEncCtx );	// confirmed_safe_unsafe_usage
-#endif//_MSC_VER >= 1500
-#endif//_MSC_VER
-				if (iCurUsed >= 0){
-					iBufUsed += iCurUsed;
-					iBufLeft -= iCurUsed;
-				}				
-			}
-			else{
-				return;
-			}
-
-			if ( iBufLeft > 0 ){			
-				iCurUsed = GetCodeName( &pBuf[iBufUsed], iBufLeft );
-				if ( iCurUsed > 0 ){
-					iBufUsed += iCurUsed;
-					iBufLeft -= iCurUsed;
-				}
-				pBuf[iBufUsed] = ' ';
-				++ iBufUsed;
-				-- iBufLeft;
-				
-				iCurUsed = GetLibName( &pBuf[iBufUsed], iBufLeft );
-				if ( iCurUsed > 0 ){
-					iBufUsed += iCurUsed;
-					iBufLeft -= iCurUsed;
-				}
-				pBuf[iBufUsed] = ' ';
-				++ iBufUsed;
-				-- iBufLeft;
-
-				pBuf[iBufUsed] = 'v';
-				++ iBufUsed;
-				-- iBufLeft;		
-				iCurUsed = GetVerNum( &pBuf[iBufUsed], iBufLeft );
-				if ( iCurUsed > 0 ){
-					iBufUsed += iCurUsed;
-					iBufLeft -= iCurUsed;
-				}
-				pBuf[iBufUsed] = ' ';
-				++ iBufUsed;
-				-- iBufLeft;				
-			}
-
-			if (iBufLeft > 0){
-#if defined(WIN32) && defined(_MSC_VER) && (_MSC_VER >= 1500)
-				iCurUsed = strftime(&pBuf[iBufUsed], iBufLeft, "%y-%m-%d %H:%M:%S", &t_now);
-#else
-				iCurUsed = strftime(&pBuf[iBufUsed], iBufLeft, "%y-%m-%d %H:%M:%S", t_now);
-#endif//WIN32..
-				if (iCurUsed > 0){
-					iBufUsed += iCurUsed;
-					iBufLeft -= iCurUsed;
-				}
-			}
-			else{
-				return;
-			}
-
-			if (iBufLeft > 0){
-#if defined (WIN32)
-#ifdef _MSC_VER
-#if _MSC_VER >= 1500
-				iCurUsed = SNPRINTF(&pBuf[iBufUsed], iBufLeft, iBufLeft, ".%03.3u]: ", tb.millitm);	// confirmed_safe_unsafe_usage
-#else
-				iCurUsed = SNPRINTF(&pBuf[iBufUsed], iBufLeft, ".%3.3u]: ", tb.millitm);	// confirmed_safe_unsafe_usage
-#endif//_MSC_VER >= 1500
-#endif//_MSC_VER
-#elif defined (__GNUC__)
-				iCurUsed = SNPRINTF(&pBuf[iBufUsed], iBufLeft, ".%3.3u]: ", tv.tv_usec/1000);	// confirmed_safe_unsafe_usage
-#endif//WIN32
-				if (iCurUsed >= 0){
-					iBufUsed += iCurUsed;
-					iBufLeft -= iCurUsed;
-				}
-			}
-			else{
-				return;
-			}
-		}
-
-		// fixed stack corruption issue on vs2008
-		if ( iBufLeft > 0 ){
-			int32_t i_shift = 0;			
-			str_t *pStr = NULL;
-			pStr	= GetLogTag( kiLevel, &i_shift );
-			if ( NULL != pCtx){
-				int32_t iLenTag = STRNLEN( pStr, 8 );	// confirmed_safe_unsafe_usage
-				STRCAT( &pBuf[iBufUsed], iBufLeft, pStr );	// confirmed_safe_unsafe_usage
-				iBufUsed += iLenTag;
-				pBuf[iBufUsed] = ' ';
-				iBufUsed++;
-				++iLenTag;
-				iBufLeft -= iLenTag;
-			}			
-		}
-		if (iBufLeft > 0){
-#if defined(WIN32) && defined(_MSC_VER) && (_MSC_VER >= 1500)
-			int32_t len = 0;
-			len = _vscprintf( kpFmtStr, argv ) // _vscprintf doesn't count
-					+ 1; // terminating '\0'
-			iCurUsed = VSPRINTF(&pBuf[iBufUsed], len, kpFmtStr, argv);	// confirmed_safe_unsafe_usage
-#else
-			iCurUsed = VSPRINTF(&pBuf[iBufUsed], kpFmtStr, argv);	// confirmed_safe_unsafe_usage
-#endif//WIN32..
-			if (iCurUsed > 0){
-				iBufUsed += iCurUsed;
-				iBufLeft -= iCurUsed;
-			}
-		}
-#ifdef ENABLE_TRACE_FILE
-		if (NULL != pEncCtx && NULL != pEncCtx->pFileLog){
-			if ( pEncCtx->uiSizeLog > MAX_TRACE_LOG_SIZE){
-				if (0 == fseek(pEncCtx->pFileLog, 0L, SEEK_SET))
-					pEncCtx->uiSizeLog = 0;
-			}
-			if ( iBufUsed > 0 && iBufUsed < WELS_LOG_BUF_SIZE )
-			{
-				iCurUsed = fwrite(pBuf, 1, iBufUsed, pEncCtx->pFileLog);
-				fflush( pEncCtx->pFileLog );
-				if ( iCurUsed == iBufUsed )
-					pEncCtx->uiSizeLog += iBufUsed;
-			}			
-		}
-		else{
-#if defined(WIN32) && defined(_DEBUG)
-			OutputDebugStringA(pBuf);
-#endif
-		}
-#endif//ENABLE_TRACE_FILE
-	}	
-}
-void WelsLogNil( void *pCtx, const int32_t kiLevel, const str_t *kpFmtStr, va_list argv )
-{
-	// NULL implementation
-}
-
-/*! 
-*************************************************************************************
-* \brief	reopen log file when finish setting current path
-*
-* \param	pCtx		context pCtx
-* \param	pCurPath	current path string
-*
-* \return	NONE
-*
-* \note	N/A
-*************************************************************************************
-*/
-void WelsReopenTraceFile( void *pCtx, str_t *pCurPath )
-{
-#ifdef ENABLE_TRACE_FILE
-	sWelsEncCtx *pEncCtx	= (sWelsEncCtx *)pCtx;
-	if (wlog == WelsLogDefault)
-	{
-		str_t strTraceFile[MAX_FNAME_LEN] = {0};
-		int32_t len = 0;
-		if (pEncCtx->pFileLog != NULL)
-		{
-			fclose(pEncCtx->pFileLog);
-			pEncCtx->pFileLog = NULL;
-		}
-		pEncCtx->uiSizeLog	= 0;
-		len = STRNLEN( pCurPath, MAX_FNAME_LEN-1 );	// confirmed_safe_unsafe_usage
-		if (len >= MAX_FNAME_LEN)
-			return;
-		STRNCPY(strTraceFile, MAX_FNAME_LEN, pCurPath, len);	// confirmed_safe_unsafe_usage
-#ifdef __GNUC__		
-		STRCAT(strTraceFile, MAX_FNAME_LEN-len, "/wels_encoder_trace.txt");	// confirmed_safe_unsafe_usage
-		pEncCtx->pFileLog	= FOPEN(strTraceFile, "wt+");	// confirmed_safe_unsafe_usage
-#elif WIN32
-		STRCAT(strTraceFile, MAX_FNAME_LEN-len, "\\wels_encoder_trace.txt");// confirmed_safe_unsafe_usage
-#if _MSC_VER >= 1500
-		FOPEN(&pEncCtx->pFileLog, strTraceFile, "wt+");	// confirmed_safe_unsafe_usage
-#else
-		pEncCtx->pFileLog	= FOPEN(strTraceFile, "wt+");	// confirmed_safe_unsafe_usage
-#endif//_MSC_VER>=1500
-#else		
-#endif//__GNUC__
-	}
-#endif//ENABLE_TRACE_FILE
-}
-
-/*! 
- *************************************************************************************
- * \brief	set log iLevel from external call
- *
- * \param	iLevel	iLevel of log 
- *
- * \return	NONE
- *
- * \note	can be able to control log iLevel dynamically
- *************************************************************************************
- */
-void WelsSetLogLevel( const int32_t kiLevel )
-{
-	iWelsLogLevel iVal = 0;
-	if ( kiLevel & WELS_LOG_ERROR )
-	{
-		iVal |= WELS_LOG_ERROR;
-	}
-	if ( kiLevel & WELS_LOG_WARNING )
-	{
-		iVal |= WELS_LOG_WARNING;
-	}
-	if ( kiLevel & WELS_LOG_INFO )
-	{
-		iVal |= WELS_LOG_INFO;
-	}
-	if ( kiLevel & WELS_LOG_DEBUG )
-	{
-		iVal |= WELS_LOG_DEBUG;
-	}
-	g_iLevelLog	= iVal;	
-}
-
-/*! 
- *************************************************************************************
- * \brief	get log iLevel from external call
- *
- * \param	N/A
- *
- * \return	current iLevel of log used in codec internal
- *
- * \note	can be able to get log iLevel of internal codec applicable
- *************************************************************************************
- */
-int32_t WelsGetLogLevel( void )
-{
-	return g_iLevelLog;
-}
-
-/*! 
- *************************************************************************************
- * \brief	set log callback from external call
- *
- * \param	_log	log function routine
- *
- * \return	NONE
- *
- * \note	N/A
- *************************************************************************************
- */
-void WelsSetLogCallback( PWelsLogCallbackFunc _log )
-{
-	wlog	= _log;
-}
-
-void WelsLogCall(void *pCtx, int32_t iLevel, const str_t *kpFmt, va_list vl)
-{
-    wlog(pCtx, iLevel, kpFmt, vl);
-}
-
-void WelsLog(void *pCtx, int32_t iLevel, const str_t *kpFmt, ...)
-{
-    va_list vl;
-    va_start(vl, kpFmt);
-    WelsLogCall(pCtx, iLevel, kpFmt, vl);
-    va_end(vl);
-}
-
-#ifndef CALC_PSNR
-#define CONST_FACTOR_PSNR	(10.0 / log(10.0))	// for good computation
-#define CALC_PSNR(w, h, s)	((real32_t)(CONST_FACTOR_PSNR * log( 65025.0 * w * h / iSqe )))
-#endif//CALC_PSNR
-
-/*
- *	PSNR calculation routines
- */
-/*! 
- *************************************************************************************
- * \brief	PSNR calculation utilization in Wels
- *
- * \param	pTarPic		target picture to be calculated in Picture pData format
- * \param	iTarStride	stride of target picture pData pBuffer
- * \param 	pRefPic		base referencing picture samples
- * \param	iRefStride	stride of reference picture pData pBuffer
- * \param	iWidth		picture iWidth in pixel
- * \param	iHeight		picture iHeight in pixel
- *
- * \return	actual PSNR result;
- *
- * \note	N/A
- *************************************************************************************
- */
-real32_t WelsCalcPsnr(	const void *kpTarPic,
-							const int32_t kiTarStride,
-							const void *kpRefPic,
-							const int32_t kiRefStride,
-							const int32_t kiWidth,
-							const int32_t kiHeight )
-{
-	int64_t	iSqe = 0;
-	int32_t x, y;
-	uint8_t *pTar = (uint8_t *)kpTarPic;
-	uint8_t *pRef = (uint8_t *)kpRefPic;
-
-	if ( NULL == pTar || NULL == pRef )
-		return (-1.0f);
-
-	for ( y = 0; y < kiHeight; ++ y )	// OPTable !!
-	{
-		for ( x = 0; x < kiWidth; ++ x )
-		{
-			const int32_t kiT = pTar[y*kiTarStride+x] - pRef[y*kiRefStride+x];
-			iSqe	+= kiT * kiT;
-		}
-	}
-	if ( 0 == iSqe )
-	{
-		return (99.99f);
-	}
-	return CALC_PSNR( kiWidth, kiHeight, iSqe );
-}
-
-
-}
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	utils.c
+ *
+ * \brief	common tool/function utilization
+ *
+ * \date	03/10/2009 Created
+ *
+ *************************************************************************************
+ */
+#include <string.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <math.h>
+#include <time.h>
+#if defined(WIN32)
+#include <windows.h>
+#include <sys/types.h>
+#include <sys/timeb.h>
+#else
+#include <sys/time.h>
+#endif
+
+#include "utils.h"
+#include "macros.h"
+#include "wels_const.h"
+#include "property.h"
+#include "cpu_core.h"
+#include "encoder_context.h"
+#include "as264_common.h"
+#include "property.h"
+#include "crt_util_safe_x.h"	// Safe CRT routines like utils for cross platforms
+
+
+namespace WelsSVCEnc {
+
+void WelsLogDefault (void* pCtx, const int32_t kiLevel, const str_t* kpFmtStr, va_list argv); // forward declarations of the two log routines selectable as wlog
+void WelsLogNil (void* pCtx, const int32_t kiLevel, const str_t* kpFmtStr, va_list argv);
+
+real32_t WelsCalcPsnr (const void* kpTarPic,
+                       const int32_t kiTarStride,
+                       const void* kpRefPic,
+                       const int32_t kiRefStride,
+                       const int32_t kiWidth,
+                       const int32_t kiHeight);
+
+// initial log callback: WelsLogDefault when ENABLE_TRACE_FILE is defined, otherwise WelsLogNil
+#ifdef ENABLE_TRACE_FILE
+PWelsLogCallbackFunc wlog	= WelsLogDefault;
+#else
+PWelsLogCallbackFunc wlog	= WelsLogNil;
+#endif
+
+iWelsLogLevel		g_iLevelLog	= WELS_LOG_DEFAULT;	// log level in effect at start-up
+int32_t			g_iSizeLogBuf	= 1024;			// pBuffer size for each log output
+
+/*
+ *	Log output routines
+ */
+
+/*!
+ * \brief	get log tag
+ *
+ * \param	kiLevel		log level; a tag exists only for a single-bit value (1 << n)
+ * \param	pBit		output: bit position n of kiLevel, written only on success
+ *
+ * \return	tag of the log level, or NULL if pBit is NULL or kiLevel has no tag
+ */
+static inline str_t* GetLogTag (const int32_t kiLevel, int32_t* pBit) {
+  const int32_t kiHighestLevel = (1 << (WELS_LOG_LEVEL_COUNT - 1));
+  int32_t iBitPos = 0;
+
+  // reject a missing output pointer and anything outside (0, highest level]
+  if (NULL == pBit)
+    return NULL;
+  if (kiLevel <= 0 || kiLevel > kiHighestLevel)
+    return NULL;
+
+  // look for the one bit position whose value equals kiLevel
+  while (iBitPos < WELS_LOG_LEVEL_COUNT) {
+    if (kiLevel == (1 << iBitPos)) {
+      *pBit = iBitPos;
+      return (str_t*)g_sWelsLogTags[iBitPos];
+    }
+    ++ iBitPos;
+  }
+
+  // in range but not a power of two: no tag, *pBit left untouched
+  return NULL;
+}
+
+/*!
+ *************************************************************************************
+ * \brief	System trace log output in Wels
+ *
+ * \param	pCtx	instance pointer
+ * \param	kiLevel	log iLevel ( WELS_LOG_QUIET, ERROR, WARNING, INFO, DEBUG )
+ * \param	kpFmtStr	printf-style format string of the message
+ * \param	argv	argument list consumed by kpFmtStr
+ *
+ * \return	NONE
+ *
+ * \note	N/A
+ *************************************************************************************
+ */
+void WelsLogDefault (void* pCtx, const int32_t kiLevel, const str_t* kpFmtStr, va_list argv) {
+  sWelsEncCtx* pEncCtx = (sWelsEncCtx*)pCtx;
+  iWelsLogLevel iVal = (kiLevel & g_iLevelLog);  // non-zero only when kiLevel shares a bit with the enabled mask
+
+  if (0 == iVal || NULL == pEncCtx) {  // level not enabled, or no context to log for
+    return;
+  } else {
+    str_t pBuf[WELS_LOG_BUF_SIZE + 1] = {0};  // zero-filled, so the text stays terminated while it is assembled
+    const int32_t kiBufSize = sizeof (pBuf) / sizeof (pBuf[0]) - 1;
+    int32_t iCurUsed = 0;  // characters produced by the most recent step
+    int32_t iBufUsed = 0;  // characters written so far
+    int32_t iBufLeft = kiBufSize - iBufUsed;  // characters still available
+
+    if (pEncCtx) {
+      time_t l_time;
+#if defined(WIN32)
+#if defined(_MSC_VER)
+#if _MSC_VER >= 1500
+      struct tm t_now;
+#else//VC6
+      struct tm* t_now;
+#endif//_MSC_VER >= 1500
+#endif//_MSC_VER
+#else//__GNUC__
+      struct tm* t_now;
+#endif//WIN32
+
+#if defined( WIN32 )
+      struct _timeb tb;
+
+      time (&l_time);
+#ifdef _MSC_VER
+#if _MSC_VER >= 1500
+      LOCALTIME (&t_now, &l_time);
+#else
+      t_now = LOCALTIME (&l_time);
+      if (NULL == t_now) {
+        return;
+      }
+#endif//_MSC_VER >= 1500
+#endif//_MSC_VER
+      FTIME (&tb);
+#elif defined( __GNUC__ )
+      struct timeval tv;
+      time (&l_time);
+      t_now = (struct tm*)LOCALTIME (&l_time);
+      gettimeofday (&tv, NULL);
+#endif//WIN32
+      if (iBufLeft > 0) {  // context prefix; only compiled in for MSVC, other compilers leave iCurUsed at 0
+#ifdef _MSC_VER
+#if _MSC_VER >= 1500
+        iCurUsed = SNPRINTF (&pBuf[iBufUsed], iBufLeft, iBufLeft, "[0x%p @ ", pEncCtx);	// confirmed_safe_unsafe_usage
+#else
+        iCurUsed = SNPRINTF (&pBuf[iBufUsed], iBufLeft, "[0x%p @ ", pEncCtx);	// confirmed_safe_unsafe_usage
+#endif//_MSC_VER >= 1500
+#endif//_MSC_VER
+        if (iCurUsed >= 0) {
+          iBufUsed += iCurUsed;
+          iBufLeft -= iCurUsed;
+        }
+      } else {
+        return;
+      }
+
+      if (iBufLeft > 0) {  // "<code name> <lib name> v<version> "
+        iCurUsed = GetCodeName (&pBuf[iBufUsed], iBufLeft);
+        if (iCurUsed > 0) {
+          iBufUsed += iCurUsed;
+          iBufLeft -= iCurUsed;
+        }
+        pBuf[iBufUsed] = ' ';
+        ++ iBufUsed;
+        -- iBufLeft;
+
+        iCurUsed = GetLibName (&pBuf[iBufUsed], iBufLeft);
+        if (iCurUsed > 0) {
+          iBufUsed += iCurUsed;
+          iBufLeft -= iCurUsed;
+        }
+        pBuf[iBufUsed] = ' ';
+        ++ iBufUsed;
+        -- iBufLeft;
+
+        pBuf[iBufUsed] = 'v';
+        ++ iBufUsed;
+        -- iBufLeft;
+        iCurUsed = GetVerNum (&pBuf[iBufUsed], iBufLeft);
+        if (iCurUsed > 0) {
+          iBufUsed += iCurUsed;
+          iBufLeft -= iCurUsed;
+        }
+        pBuf[iBufUsed] = ' ';
+        ++ iBufUsed;
+        -- iBufLeft;
+      }
+
+      if (iBufLeft > 0) {  // date and time down to the second
+#if defined(WIN32) && defined(_MSC_VER) && (_MSC_VER >= 1500)
+        iCurUsed = strftime (&pBuf[iBufUsed], iBufLeft, "%y-%m-%d %H:%M:%S", &t_now);
+#else
+        iCurUsed = strftime (&pBuf[iBufUsed], iBufLeft, "%y-%m-%d %H:%M:%S", t_now);
+#endif//WIN32..
+        if (iCurUsed > 0) {
+          iBufUsed += iCurUsed;
+          iBufLeft -= iCurUsed;
+        }
+      } else {
+        return;
+      }
+
+      if (iBufLeft > 0) {  // milliseconds and the closing "]: "
+#if defined (WIN32)
+#ifdef _MSC_VER
+#if _MSC_VER >= 1500
+        iCurUsed = SNPRINTF (&pBuf[iBufUsed], iBufLeft, iBufLeft, ".%03.3u]: ", tb.millitm);	// confirmed_safe_unsafe_usage
+#else
+        iCurUsed = SNPRINTF (&pBuf[iBufUsed], iBufLeft, ".%3.3u]: ", tb.millitm);	// confirmed_safe_unsafe_usage
+#endif//_MSC_VER >= 1500
+#endif//_MSC_VER
+#elif defined (__GNUC__)
+        iCurUsed = SNPRINTF (&pBuf[iBufUsed], iBufLeft, ".%3.3u]: ", tv.tv_usec / 1000);	// confirmed_safe_unsafe_usage
+#endif//WIN32
+        if (iCurUsed >= 0) {
+          iBufUsed += iCurUsed;
+          iBufLeft -= iCurUsed;
+        }
+      } else {
+        return;
+      }
+    }
+
+    // fixed stack corruption issue on vs2008
+    if (iBufLeft > 0) {
+      int32_t i_shift = 0;
+      str_t* pStr = NULL;
+      pStr = GetLogTag (kiLevel, &i_shift);
+      if (NULL != pStr) {  // GetLogTag yields NULL unless kiLevel is exactly one known level bit; skip the tag then
+        int32_t iLenTag = STRNLEN (pStr, 8);	// confirmed_safe_unsafe_usage
+        STRCAT (&pBuf[iBufUsed], iBufLeft, pStr);	// confirmed_safe_unsafe_usage
+        iBufUsed += iLenTag;
+        pBuf[iBufUsed] = ' ';
+        iBufUsed++;
+        ++iLenTag;  // account for the blank just appended
+        iBufLeft -= iLenTag;
+      }
+    }
+    if (iBufLeft > 0) {  // NOTE(review): neither VSPRINTF call below is limited to iBufLeft - confirm messages cannot overrun pBuf
+#if defined(WIN32) && defined(_MSC_VER) && (_MSC_VER >= 1500)
+      int32_t len = 0;
+      len = _vscprintf (kpFmtStr, argv)  // _vscprintf doesn't count
+            + 1; // terminating '\0'
+      iCurUsed = VSPRINTF (&pBuf[iBufUsed], len, kpFmtStr, argv);	// confirmed_safe_unsafe_usage
+#else
+      iCurUsed = VSPRINTF (&pBuf[iBufUsed], kpFmtStr, argv);	// confirmed_safe_unsafe_usage
+#endif//WIN32..
+      if (iCurUsed > 0) {
+        iBufUsed += iCurUsed;
+        iBufLeft -= iCurUsed;
+      }
+    }
+#ifdef ENABLE_TRACE_FILE
+    if (NULL != pEncCtx && NULL != pEncCtx->pFileLog) {
+      if (pEncCtx->uiSizeLog > MAX_TRACE_LOG_SIZE) {  // size limit reached: rewind and overwrite from the start
+        if (0 == fseek (pEncCtx->pFileLog, 0L, SEEK_SET))
+          pEncCtx->uiSizeLog = 0;
+      }
+      if (iBufUsed > 0 && iBufUsed < WELS_LOG_BUF_SIZE) {
+        iCurUsed = fwrite (pBuf, 1, iBufUsed, pEncCtx->pFileLog);
+        fflush (pEncCtx->pFileLog);
+        if (iCurUsed == iBufUsed)  // count only complete writes
+          pEncCtx->uiSizeLog += iBufUsed;
+      }
+    } else {
+#if defined(WIN32) && defined(_DEBUG)
+      OutputDebugStringA (pBuf);
+#endif
+    }
+#endif//ENABLE_TRACE_FILE
+  }
+}
+void WelsLogNil (void* pCtx, const int32_t kiLevel, const str_t* kpFmtStr, va_list argv) {
+  (void)pCtx, (void)kiLevel, (void)kpFmtStr, (void)argv;  // no-op sink: every argument is deliberately ignored
+}
+
+/*!
+*************************************************************************************
+* \brief	reopen log file when finish setting current path
+*
+* \param	pCtx		context pCtx
+* \param	pCurPath	current path string
+*
+* \return	NONE
+*
+* \note	N/A
+*************************************************************************************
+*/
+void WelsReopenTraceFile (void* pCtx, str_t* pCurPath) {
+#ifdef ENABLE_TRACE_FILE
+  sWelsEncCtx* pEncCtx = (sWelsEncCtx*)pCtx;
+  if (NULL != pEncCtx && NULL != pCurPath && wlog == WelsLogDefault) {  // needs a context, a path and the default logger
+    str_t strTraceFile[MAX_FNAME_LEN] = {0};  // zero-filled so the STRNCPY result below is terminated
+    int32_t len = 0;
+    if (pEncCtx->pFileLog != NULL) {  // close the previous trace file first
+      fclose (pEncCtx->pFileLog);
+      pEncCtx->pFileLog = NULL;
+    }
+    pEncCtx->uiSizeLog = 0;
+    len = STRNLEN (pCurPath, MAX_FNAME_LEN - 1);	// confirmed_safe_unsafe_usage
+    if (len + (int32_t)sizeof ("/wels_encoder_trace.txt") > MAX_FNAME_LEN)  // directory + file name + '\0' must fit; both separators give the same length
+      return;
+    STRNCPY (strTraceFile, MAX_FNAME_LEN, pCurPath, len);	// confirmed_safe_unsafe_usage
+#ifdef __GNUC__
+    STRCAT (strTraceFile, MAX_FNAME_LEN - len, "/wels_encoder_trace.txt");	// confirmed_safe_unsafe_usage
+    pEncCtx->pFileLog = FOPEN (strTraceFile, "wt+");	// confirmed_safe_unsafe_usage
+#elif WIN32
+    STRCAT (strTraceFile, MAX_FNAME_LEN - len, "\\wels_encoder_trace.txt"); // confirmed_safe_unsafe_usage
+#if _MSC_VER >= 1500
+    FOPEN (&pEncCtx->pFileLog, strTraceFile, "wt+");	// confirmed_safe_unsafe_usage
+#else
+    pEncCtx->pFileLog = FOPEN (strTraceFile, "wt+");	// confirmed_safe_unsafe_usage
+#endif//_MSC_VER>=1500
+#else
+#endif//__GNUC__
+  }
+#endif//ENABLE_TRACE_FILE
+}
+
+/*!
+ *************************************************************************************
+ * \brief	set log iLevel from external call
+ *
+ * \param	iLevel	iLevel of log
+ *
+ * \return	NONE
+ *
+ * \note	can be able to control log iLevel dynamically
+ *************************************************************************************
+ */
+void WelsSetLogLevel (const int32_t kiLevel) {
+  // only these four levels can be enabled; any other bit of kiLevel is ignored
+  const int32_t kiKnownLevels[] = {
+    WELS_LOG_ERROR,
+    WELS_LOG_WARNING,
+    WELS_LOG_INFO,
+    WELS_LOG_DEBUG
+  };
+  const int32_t kiCount = sizeof (kiKnownLevels) / sizeof (kiKnownLevels[0]);
+  iWelsLogLevel iMask = 0;
+  for (int32_t iIdx = 0; iIdx < kiCount; ++ iIdx) {
+    if (kiLevel & kiKnownLevels[iIdx])
+      iMask |= kiKnownLevels[iIdx];
+  }
+  g_iLevelLog = iMask;
+}
+
+/*!
+ *************************************************************************************
+ * \brief	get log iLevel from external call
+ *
+ * \param	N/A
+ *
+ * \return	current iLevel of log used in codec internal
+ *
+ * \note	can be able to get log iLevel of internal codec applicable
+ *************************************************************************************
+ */
+int32_t WelsGetLogLevel (void) {
+  return (int32_t)g_iLevelLog;  // mask of the currently enabled log levels
+}
+
+/*!
+ *************************************************************************************
+ * \brief	set log callback from external call
+ * \param	_log	log function routine; NULL selects the no-op routine WelsLogNil
+ * \return	NONE
+ * \note	wlog is called without a NULL check, so it is never left NULL here
+ *************************************************************************************
+ */
+void WelsSetLogCallback (PWelsLogCallbackFunc _log) {
+  if (NULL != _log)
+    wlog = _log;
+  else
+    wlog = WelsLogNil;
+}
+
+void WelsLogCall (void* pCtx, int32_t iLevel, const str_t* kpFmt, va_list vl) {
+  (*wlog) (pCtx, iLevel, kpFmt, vl);  // forward to whichever log routine is currently installed
+}
+
+void WelsLog (void* pCtx, int32_t iLevel, const str_t* kpFmt, ...) {
+  va_list pArgList;  // the variadic arguments that follow kpFmt
+  va_start (pArgList, kpFmt);
+  WelsLogCall (pCtx, iLevel, kpFmt, pArgList);
+  va_end (pArgList);
+}
+
+#ifndef CALC_PSNR
+#define CONST_FACTOR_PSNR	(10.0 / log(10.0))	// turns a natural logarithm into 10 * log10
+#define CALC_PSNR(w, h, s)	((real32_t)(CONST_FACTOR_PSNR * log( 65025.0 * (w) * (h) / (s) )))	// 65025 = 255 * 255; s = sum of squared errors
+#endif//CALC_PSNR
+
+/*
+ *	PSNR calculation routines
+ */
+/*!
+ *************************************************************************************
+ * \brief	PSNR calculation utilization in Wels
+ *
+ * \param	kpTarPic	target picture samples to be measured
+ * \param	kiTarStride	stride of the target picture buffer
+ * \param	kpRefPic	reference picture samples
+ * \param	kiRefStride	stride of the reference picture buffer
+ * \param	kiWidth		picture width in pixels
+ * \param	kiHeight	picture height in pixels
+ *
+ * \return	actual PSNR result;
+ *
+ * \note	N/A
+ *************************************************************************************
+ */
+real32_t WelsCalcPsnr (const void* kpTarPic,
+                       const int32_t kiTarStride,
+                       const void* kpRefPic,
+                       const int32_t kiRefStride,
+                       const int32_t kiWidth,
+                       const int32_t kiHeight) {
+  const uint8_t* kpTar = (const uint8_t*)kpTarPic;
+  const uint8_t* kpRef = (const uint8_t*)kpRefPic;
+  int64_t iSqe = 0;  // sum of squared sample differences; name kept because CALC_PSNR may refer to iSqe directly
+
+  if (NULL == kpTar || NULL == kpRef)
+    return (-1.0f);
+
+  for (int32_t iRow = 0; iRow < kiHeight; ++ iRow) {
+    const int32_t kiTarRowOffset = iRow * kiTarStride;
+    const int32_t kiRefRowOffset = iRow * kiRefStride;
+    for (int32_t iCol = 0; iCol < kiWidth; ++ iCol) {
+      const int32_t kiDiff = kpTar[kiTarRowOffset + iCol] - kpRef[kiRefRowOffset + iCol];
+      iSqe += kiDiff * kiDiff;
+    }
+  }
+  if (0 != iSqe)
+    return CALC_PSNR (kiWidth, kiHeight, iSqe);
+  return (99.99f);  // no difference accumulated
+}
+
+
+}
--- a/codec/encoder/core/src/wels_preprocess.cpp
+++ b/codec/encoder/core/src/wels_preprocess.cpp
@@ -1,1146 +1,1060 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#if defined(WIN32)
-#include <windows.h>
-#elif defined(MACOS)
-#include "bundleloader.h"
-#elif defined(__GNUC__)
-#include <dlfcn.h>
-#endif
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <assert.h>
-#include "wels_preprocess.h"
-#include "memory_align.h"
-#include "encoder.h"
-#include "extern.h"
-#include "picture_handle.h"
-#include "encoder_context.h"
-#include "utils.h"
-
-#ifdef NO_DYNAMIC_VP
-EResult WELSAPI CreateVpInterface  (void **ppCtx, int iVersion);
-EResult WELSAPI DestroyVpInterface  (void **ppCtx, int iVersion);
-#endif
-
-namespace WelsSVCEnc {
-
-#define WelsSafeDelete(p) if(p){ delete (p); (p) = NULL; }
-
-
-//***** entry API declaration ************************************************************************//
-typedef EResult (WELSAPI *pfnCreateVpInterface)  (void **, int );
-typedef EResult (WELSAPI *pfnDestroyVpInterface) (void * , int );
-
-int32_t WelsInitScaledPic( SWelsSvcCodingParam *pParam,  Scaled_Picture  *pScaledPic, CMemoryAlign *pMemoryAlign );
-bool_t  JudgeNeedOfScaling( SWelsSvcCodingParam *pParam, Scaled_Picture * pScaledPic );
-void    FreeScaledPic( Scaled_Picture  *pScaledPic, CMemoryAlign *pMemoryAlign );
-
-//******* table definition ***********************************************************************//
-const uint8_t g_kuiRefTemporalIdx[MAX_TEMPORAL_LEVEL][MAX_GOP_SIZE] =
-{
-	{  0, }, // 0
-	{  0,  0, }, // 1
-	{  0,  0,  0,  1, }, // 2
-	{  0,  0,  0,  2,  0,  1,  1,  2, }, // 3
-	{  0,  0,  0,  3,  0,  2,  2,  3,  0,  1,  1,  3,  1,  2,  2,  3 }  // 4
-};
-
-const int32_t g_kiPixMapSizeInBits = sizeof(uint8_t) * 8;
-
-
-inline  void   WelsUpdateSpatialIdxMap(sWelsEncCtx * pEncCtx, int32_t iPos, SPicture * pPic, int32_t iDidx)
-{
-    pEncCtx->sSpatialIndexMap[iPos].pSrc = pPic;
-	pEncCtx->sSpatialIndexMap[iPos].iDid = iDidx;
-}
-
-
-//***************************************************************************************************//
-CWelsLib::CWelsLib(void *pEncCtx)
-{
-	m_pInterface[0] = m_pInterface[1] = NULL;
-
-#ifndef NO_DYNAMIC_VP
-#if defined(WIN32)
-	const str_t WelsVPLib[] = "welsvp.dll";
-	HMODULE shModule = LoadLibrary(WelsVPLib);
-	if(!shModule)
-		WelsLog( pEncCtx, WELS_LOG_ERROR, "welsvp load lib dynamic failed module=%x\n", shModule );
-
-#elif defined(MACOS)
-	const str_t WelsVPLib[] = "welsvp.bundle";
-	str_t pCurPath[256];
-	GetCurrentModulePath(pCurPath, 256);
-	strlcat(pCurPath, WelsVPLib, 256);	
-	CFBundleRef shModule = LoadBundle(pCurPath);
-	if(!shModule)
-		WelsLog( pEncCtx, WELS_LOG_ERROR, "welsvp load lib dynamic failed module=%x\n", shModule );
-
-#elif defined(__GNUC__)
-	const str_t WelsVPLib[] = "./libwelsvp.so";
-	void* shModule = NULL;
-	shModule = dlopen(WelsVPLib, RTLD_LAZY);
-	if (shModule == NULL)
-		printf("dlopen %s iRet=%x, err=%s\n", WelsVPLib, shModule, dlerror());
-#endif
-
-	m_pVpLib = (void *)shModule;
-#endif
-}
-
-CWelsLib::~CWelsLib()
-{
-	if (m_pVpLib)
-	{
-#if defined(WIN32)
-		HMODULE shModule = (HMODULE)m_pVpLib;
-		FreeLibrary(shModule);
-
-#elif defined(MACOS)
-		CFBundleRef shModule = (CFBundleRef)m_pVpLib;
-		FreeBundle(shModule);
-
-#elif defined(__GNUC__)
-		void* shModule = m_pVpLib;
-		dlclose(shModule);
-#endif
-		m_pVpLib = NULL;
-	}
-}
-
-void* CWelsLib::QueryFunction(const str_t *pName)
-{
-	void *pFunc = NULL;
-
-	if (m_pVpLib)
-	{
-#if defined(WIN32)
-		HMODULE shModule = (HMODULE)m_pVpLib;
-		pFunc = (void *)GetProcAddress(shModule, pName);
-
-#elif defined(MACOS)
-		CFBundleRef shModule = (CFBundleRef)m_pVpLib;
-		pFunc = (void *)GetProcessAddress(shModule, pName);
-
-#elif defined(__GNUC__)
-		void* shModule = m_pVpLib;
-		pFunc = (void *)dlsym(shModule, pName);
-		if (pFunc == NULL)
-			printf("dlsym %s iRet=%p, err=%s\n", shModule, pFunc, dlerror());
-#endif
-	}
-	return pFunc;
-}
-
-int32_t CWelsLib::CreateIface(void **ppEncCtx)
-{
-#ifndef NO_DYNAMIC_VP
-	if (m_pVpLib)
-	{
-
-#endif
-		pfnCreateVpInterface  pCreateVpInterface  = NULL;
-		pfnDestroyVpInterface pDestroyVpInterface = NULL;
-
-#ifndef NO_DYNAMIC_VP
-		pCreateVpInterface  = (pfnCreateVpInterface)  QueryFunction("CreateVpInterface");
-		pDestroyVpInterface = (pfnDestroyVpInterface) QueryFunction("DestroyVpInterface");
-#else
-		pCreateVpInterface  = CreateVpInterface;
-		// TODO([email protected]): This cast corrects a signature difference... This is a potential real problem
-		pDestroyVpInterface = (pfnDestroyVpInterface)DestroyVpInterface;
-#endif
-
-		m_pInterface[0] = (void *)pCreateVpInterface;
-		m_pInterface[1] = (void *)pDestroyVpInterface;
-
-		if (m_pInterface[0] && m_pInterface[1])
-			pCreateVpInterface(ppEncCtx, WELSVP_INTERFACE_VERION);
-#ifndef NO_DYNAMIC_VP
-	}
-	else
-	{
-	}	
-#endif
-
-	return ppEncCtx ? 0 : 1;
-}
-
-int32_t CWelsLib::DestroyIface(void *pEncCtx)
-{
-	if (pEncCtx)
-	{
-		pfnDestroyVpInterface pDestroyVpInterface = (pfnDestroyVpInterface) m_pInterface[1];
-		if (pDestroyVpInterface)
-		{
-			pDestroyVpInterface(pEncCtx, WELSVP_INTERFACE_VERION);
-		}
-		else
-		{
-		}
-	}
-
-	return 0;
-}
-
-/***************************************************************************
-*	
-*	implement of the interface
-*	
-***************************************************************************/
-
-CWelsPreProcess::CWelsPreProcess(void *pEncCtx)
-{
-	m_pInterfaceVp = NULL;
-	m_pEncLib = NULL;
-	m_bInitDone = false;
-	m_bOfficialBranch  = FALSE;
-	m_pEncCtx = pEncCtx;
-	memset(&m_sScaledPicture, 0, sizeof(m_sScaledPicture));	
-}
-
-CWelsPreProcess::~CWelsPreProcess()
-{
-	FreeScaledPic(&m_sScaledPicture, static_cast<sWelsEncCtx *>(m_pEncCtx)->pMemAlign);
-	WelsPreprocessDestroy();
-}
-
-int32_t CWelsPreProcess::WelsPreprocessCreate()
-{
-	if (m_pEncLib == NULL && m_pInterfaceVp == NULL)
-	{
-		m_pEncLib  = new CWelsLib(m_pEncCtx);
-		if (!m_pEncLib)
-			goto exit;
-
-		m_pEncLib->CreateIface((void **)&m_pInterfaceVp);
-		if (!m_pInterfaceVp)
-			goto exit;
-	}
-	else 
-		goto exit;
-
-	return 0;
-
-exit:
-	WelsPreprocessDestroy();
-	return 1;
-}
-
-int32_t CWelsPreProcess::WelsPreprocessDestroy()
-{
-	if (m_pEncLib)
-	{	
-		m_pEncLib->DestroyIface((void *)m_pInterfaceVp);
-		m_pInterfaceVp = NULL;
-		WelsSafeDelete(m_pEncLib);
-	}
-
-	return 0;
-}
-
-int32_t CWelsPreProcess::WelsPreprocessReset ( void *pCtx )
-{
-	sWelsEncCtx *pEncCtx = (sWelsEncCtx *)pCtx;
-	int32_t iRet = -1;
-
-	if (pEncCtx)
-	{
-		FreeScaledPic(&m_sScaledPicture, pEncCtx->pMemAlign);
-        iRet = InitLastSpatialPictures(pEncCtx);
-		iRet = WelsInitScaledPic(pEncCtx->pSvcParam, &m_sScaledPicture, pEncCtx->pMemAlign);
-	}
-
-	return iRet;
-}
-
-int32_t CWelsPreProcess::WelsPreprocessStep1( void *pCtx, const SSourcePicture **kppSrcPicList, const int32_t kiConfiguredLayerNum )
-{
-	sWelsEncCtx *pEncCtx = (sWelsEncCtx *)pCtx;
-	SWelsSvcCodingParam *pSvcParam = pEncCtx->pSvcParam;
-	int32_t	iNumDependencyLayer = (int32_t)pSvcParam->iNumDependencyLayer;
-	int32_t iSpatialNum = 0;
-
-	if (!m_bInitDone)
-	{
-		if (WelsPreprocessCreate() != 0)
-			return -1;
-		if (WelsPreprocessReset(pEncCtx) != 0)
-			return -1;	
-
-		m_bOfficialBranch  = (iNumDependencyLayer != kiConfiguredLayerNum);
-		if ( !m_bOfficialBranch && (iNumDependencyLayer == 1) ) 
-		{
-			// check the input source uiSize to decide if need switch to official branch 
-			// NOTICE: the layernum=1 case is confused in official/non-official cases!
-			SSourcePicture **pic_queue = (SSourcePicture **)kppSrcPicList;
-			for (int32_t i=0; i<iNumDependencyLayer; i++)
-			{			
-				if ( pSvcParam->sDependencyLayers[i].iFrameWidth != pic_queue[i]->iPicWidth ||
-					pSvcParam->sDependencyLayers[i].iFrameHeight != pic_queue[i]->iPicHeight )
-				{
-					m_bOfficialBranch = TRUE;
-					break;
-				}		
-			}		
-		}
-		m_bInitDone = TRUE;
-	}
-
-	if (m_pInterfaceVp == NULL)
-		return -1;
-
-	if ( kiConfiguredLayerNum <= 0 )
-		return -1;	
-
-    pEncCtx->pVaa->bSceneChangeFlag = pEncCtx->pVaa->bIdrPeriodFlag = false;
-	if( pSvcParam->uiIntraPeriod )
-		pEncCtx->pVaa->bIdrPeriodFlag = ( 1 + pEncCtx->iFrameIndex >= (int32_t)pSvcParam->uiIntraPeriod ) ? true : false;		
-
-	if ( m_bOfficialBranch )	// Perform Down Sampling potentially due to application
-	{
-		assert( kiConfiguredLayerNum == 1 );
-		iSpatialNum	= SingleLayerPreprocess( pEncCtx, kppSrcPicList[0], &m_sScaledPicture );
-	}
-	else // for console each spatial pictures are available there
-	{
-		iSpatialNum	= kiConfiguredLayerNum;
-		MultiLayerPreprocess( pEncCtx, kppSrcPicList, iSpatialNum );	
-	}
-
-	return iSpatialNum;
-}
-
-int32_t CWelsPreProcess::WelsPreprocessStep3( void *pCtx, const int32_t kiDidx )
-{
-	sWelsEncCtx *pEncCtx = (sWelsEncCtx *)pCtx;
-	SWelsSvcCodingParam *pSvcParam = pEncCtx->pSvcParam;	
-	bool_t bNeededMbAq = (pSvcParam->bEnableAdaptiveQuant && (pEncCtx->eSliceType == P_SLICE));
-	bool_t bCalculateBGD = (pEncCtx->eSliceType == P_SLICE && pSvcParam->bEnableBackgroundDetection);
-		
-	int32_t iCurTemporalIdx  = pEncCtx->uiSpatialLayersInTemporal[kiDidx] - 1;
-
-	int32_t iRefTemporalIdx = (int32_t)g_kuiRefTemporalIdx[pSvcParam->iDecompStages][pEncCtx->iCodingIndex & (pSvcParam->uiGopSize-1)];
-	if ( pEncCtx->uiTemporalId == 0 && pEncCtx->pLtr[pEncCtx->uiDependencyId].bReceivedT0LostFlag )	
-		iRefTemporalIdx = pEncCtx->uiSpatialLayersInTemporal[kiDidx] + pEncCtx->pVaa->uiValidLongTermPicIdx;
-
-	SPicture *pCurPic = pEncCtx->pSpatialPic[kiDidx][iCurTemporalIdx];
-	SPicture *pRefPic = pEncCtx->pSpatialPic[kiDidx][iRefTemporalIdx];	
-	{			
-		SPicture *pLastPic= m_pLastSpatialPicture[kiDidx][0];
-		bool_t bCalculateSQDiff = ((pLastPic->pData[0] == pRefPic->pData[0]) && bNeededMbAq);
-		bool_t bCalculateVar = (pSvcParam->iRCMode == RC_MODE1 && pEncCtx->eSliceType == I_SLICE);
-
-		VaaCalculation( pEncCtx->pVaa, pCurPic, pRefPic, bCalculateSQDiff, bCalculateVar, bCalculateBGD);
-	}
-
-	if (pSvcParam->bEnableBackgroundDetection)
-	{
-		BackgroundDetection(pEncCtx->pVaa, pCurPic, pRefPic, bCalculateBGD && pRefPic->iPictureType != I_SLICE);
-	}
-
-	if ( bNeededMbAq )
-	{
-		SPicture *pCurPic = m_pLastSpatialPicture[kiDidx][1];
-		SPicture *pRefPic = m_pLastSpatialPicture[kiDidx][0];
-
-		AdaptiveQuantCalculation( pEncCtx->pVaa, pCurPic, pRefPic );           
-	}	
-
-	if ( pSvcParam->bEnableRc )
-	{
-		AnalyzePictureComplexity( pEncCtx, pCurPic, pRefPic, kiDidx, bCalculateBGD );	
-	}
-
-	WelsExchangeSpatialPictures( &m_pLastSpatialPicture[kiDidx][1], &m_pLastSpatialPicture[kiDidx][0] );
-
-	return 0;
-}
-
-
-/*
-*	SingleLayerPreprocess: down sampling if applicable
-*  @return:	exact number of spatial layers need to encoder indeed
-*/
-int32_t CWelsPreProcess::SingleLayerPreprocess( void *pCtx, const SSourcePicture *kpSrc, Scaled_Picture * pScaledPicture )
-{
-	sWelsEncCtx *pEncCtx = (sWelsEncCtx *)pCtx;
-	SWelsSvcCodingParam *pSvcParam    = pEncCtx->pSvcParam;	
-	int8_t	iDependencyId			= pSvcParam->iNumDependencyLayer - 1;
-	int32_t iPicturePos	                    = pEncCtx->uiSpatialLayersInTemporal[iDependencyId] - 1;
-
-	SPicture *pSrcPic					= NULL;	// large
-	SPicture *pDstPic					= NULL;	// small
-	SDLayerParam *pDlayerParam					= NULL;
-	int32_t iSpatialNum					= 0;
-	int32_t iSrcWidth					= 0;
-	int32_t iSrcHeight					= 0;
-	int32_t iTargetWidth					= 0;
-	int32_t iTargetHeight					= 0;		
-	int32_t iTemporalId = 0;
-	int32_t iActualSpatialLayerNum      = 0;
-
-	pDlayerParam = &pSvcParam->sDependencyLayers[iDependencyId];
-	iTargetWidth	  = pDlayerParam->iFrameWidth;
-	iTargetHeight  = pDlayerParam->iFrameHeight;	
-	iTemporalId    = pDlayerParam->uiCodingIdx2TemporalId[pEncCtx->iCodingIndex & (pSvcParam->uiGopSize-1)];	
-	iSrcWidth   = pSvcParam->SUsedPicRect.iWidth;
-	iSrcHeight  = pSvcParam->SUsedPicRect.iHeight;
-	
-	pSrcPic = pScaledPicture->pScaledInputPicture ? pScaledPicture->pScaledInputPicture : pEncCtx->pSpatialPic[iDependencyId][iPicturePos];
-
-	WelsMoveMemoryWrapper( pSvcParam, pSrcPic, kpSrc, iSrcWidth, iSrcHeight );
-
-	if( pSvcParam->bEnableDenoise )
-		BilateralDenoising(pSrcPic, iSrcWidth, iSrcHeight);
-
-	// different scaling in between input picture and dst highest spatial picture. 
-	int32_t iShrinkWidth  = iSrcWidth;
-	int32_t iShrinkHeight = iSrcHeight;
-	pDstPic = pSrcPic;
-	if ( pScaledPicture->pScaledInputPicture )
-	{	
-		// for highest downsampling				
-		pDstPic		= pEncCtx->pSpatialPic[iDependencyId][iPicturePos];			
-		iShrinkWidth = pScaledPicture->iScaledWidth[iDependencyId];
-		iShrinkHeight = pScaledPicture->iScaledHeight[iDependencyId];			
-	}
-	DownsamplePadding(pSrcPic, pDstPic, iSrcWidth, iSrcHeight, iShrinkWidth, iShrinkHeight, iTargetWidth, iTargetHeight);	
-
-	if(pSvcParam->bEnableSceneChangeDetect && !pEncCtx->pVaa->bIdrPeriodFlag && !(pEncCtx->iCodingIndex & (pSvcParam->uiGopSize-1))){
-		SPicture *pRefPic = pEncCtx->pLtr[iDependencyId].bReceivedT0LostFlag ? 
-			pEncCtx->pSpatialPic[iDependencyId][pEncCtx->uiSpatialLayersInTemporal[iDependencyId] + pEncCtx->pVaa->uiValidLongTermPicIdx] : m_pLastSpatialPicture[iDependencyId][0];
-
-		pEncCtx->pVaa->bSceneChangeFlag = DetectSceneChange(pDstPic, pRefPic);		
-	}
-
-	for( int32_t i=0;i<pSvcParam->iNumDependencyLayer;i++ ){
-		if( pSvcParam->sDependencyLayers[i].uiCodingIdx2TemporalId[pEncCtx->iCodingIndex & (pSvcParam->uiGopSize-1)]
-			!= INVALID_TEMPORAL_ID ){
-			++ iActualSpatialLayerNum;
-		}
-	}
-
-	if ( iTemporalId != INVALID_TEMPORAL_ID )
-	{
-		WelsUpdateSpatialIdxMap(pEncCtx, iActualSpatialLayerNum - 1, pDstPic, iDependencyId);	
-		++ iSpatialNum;
-		-- iActualSpatialLayerNum;
-	}	
-
-	m_pLastSpatialPicture[iDependencyId][1]	= pEncCtx->pSpatialPic[iDependencyId][iPicturePos];	
-	-- iDependencyId;
-
-	// generate other spacial layer
-	// pSrc is 
-	//	-- padded input pic, if downsample should be applied to generate highest layer, [if] block above
-	//	-- highest layer, if no downsampling, [else] block above
-	if ( pSvcParam->iNumDependencyLayer > 1 )
-	{
-		while (iDependencyId >= 0) 
-		{
-			pDlayerParam			= &pSvcParam->sDependencyLayers[iDependencyId];
-			iTargetWidth	= pDlayerParam->iFrameWidth;
-			iTargetHeight	= pDlayerParam->iFrameHeight;					
-			iTemporalId = pDlayerParam->uiCodingIdx2TemporalId[pEncCtx->iCodingIndex & (pSvcParam->uiGopSize-1)];
-			iPicturePos		= pEncCtx->uiSpatialLayersInTemporal[iDependencyId] - 1;
-
-			// NOT work for CGS, FIXME
-			// spatial layer is able to encode indeed
-			if ( (iTemporalId != INVALID_TEMPORAL_ID) )
-			{ 
-				// down sampling performed
-				if( NULL == pSrcPic )
-					return -1;
-
-				pDstPic	= pEncCtx->pSpatialPic[iDependencyId][iPicturePos];	// small
-				iShrinkWidth = pScaledPicture->iScaledWidth[iDependencyId];
-				iShrinkHeight = pScaledPicture->iScaledHeight[iDependencyId];
-				DownsamplePadding(pSrcPic, pDstPic, iSrcWidth, iSrcHeight, iShrinkWidth, iShrinkHeight, iTargetWidth, iTargetHeight);
-
-				WelsUpdateSpatialIdxMap(pEncCtx, iActualSpatialLayerNum - 1, pDstPic, iDependencyId);				
-
-				-- iActualSpatialLayerNum;
-				++ iSpatialNum;				
-
-				m_pLastSpatialPicture[iDependencyId][1]	= pEncCtx->pSpatialPic[iDependencyId][iPicturePos];	
-			}
-			-- iDependencyId;
-		}		
-	}
-
-	return iSpatialNum;
-}
-
-int32_t CWelsPreProcess::MultiLayerPreprocess( void *pCtx, const SSourcePicture **kppSrcPicList, const int32_t kiSpatialNum )
-{
-	sWelsEncCtx *pEncCtx = (sWelsEncCtx *)pCtx;
-	SWelsSvcCodingParam *pSvcParam	= pEncCtx->pSvcParam;		
-	const SSourcePicture *pSrc			= NULL;
-	SPicture *pDstPic						= NULL;
-	const int32_t iSpatialLayersCfgCount = pSvcParam->iNumDependencyLayer;	// count number of spatial layers to be encoded in cfg
-	int32_t i							= 0;
-	int32_t j							= -1;
-
-	do {
-		pSrc	= kppSrcPicList[i];
-
-		// do not clear j, just let it continue to save complexity
-		do {
-			++ j;
-			if ( pSvcParam->sDependencyLayers[j].iFrameWidth == pSrc->iPicWidth &&
-				pSvcParam->sDependencyLayers[j].iFrameHeight== pSrc->iPicHeight )
-			{
-				break;
-			}			
-		} while( j < iSpatialLayersCfgCount );
-
-		assert( j < iSpatialLayersCfgCount );
-		pDstPic = pEncCtx->pSpatialPic[j][pEncCtx->uiSpatialLayersInTemporal[j]-1];
-		
-		WelsUpdateSpatialIdxMap(pEncCtx, i, pDstPic, j);		
-
-		WelsMoveMemoryWrapper( pSvcParam, pDstPic, pSrc, pSrc->iPicWidth, pSrc->iPicHeight );
-
-		if(pSvcParam->bEnableDenoise)
-			BilateralDenoising(pDstPic, pSrc->iPicWidth, pSrc->iPicHeight);
-
-		m_pLastSpatialPicture[j][1]	= pDstPic;
-		++ i;
-	} while( i < kiSpatialNum );
-
-	if( pSvcParam->bEnableSceneChangeDetect && (kiSpatialNum == pSvcParam->iNumDependencyLayer) && !pEncCtx->pVaa->bIdrPeriodFlag )
-	{
-		SPicture *pRef = pEncCtx->pLtr[0].bReceivedT0LostFlag ? 
-			pEncCtx->pSpatialPic[0][pEncCtx->uiSpatialLayersInTemporal[0] + pEncCtx->pVaa->uiValidLongTermPicIdx] : m_pLastSpatialPicture[0][0];
-
-		pEncCtx->pVaa->bSceneChangeFlag = DetectSceneChange(pDstPic, pRef);
-	}
-
-	return 0;
-}
-
-/*!
- * \brief	Whether input picture need be scaled?	
- */
-bool_t JudgeNeedOfScaling( SWelsSvcCodingParam *pParam, Scaled_Picture * pScaledPicture )
-{
-	const int32_t kiInputPicWidth	= pParam->SUsedPicRect.iWidth;
-	const int32_t kiInputPicHeight = pParam->SUsedPicRect.iHeight;
-	const int32_t kiDstPicWidth		= pParam->sDependencyLayers[pParam->iNumDependencyLayer-1].iActualWidth;
-	const int32_t kiDstPicHeight	= pParam->sDependencyLayers[pParam->iNumDependencyLayer-1].iActualHeight;
-	bool_t bNeedDownsampling = true;
-
-	int32_t iSpatialIdx = pParam->iNumDependencyLayer-1;
-
-	if ( kiDstPicWidth >= kiInputPicWidth && kiDstPicHeight >= kiInputPicHeight )
-	{
-		iSpatialIdx --;  // highest D layer do not need downsampling
-		bNeedDownsampling = false;
-	}
-
-	for(; iSpatialIdx >= 0; iSpatialIdx --)
-	{
-		SDLayerParam *pCurLayer = &pParam->sDependencyLayers[iSpatialIdx];
-		int32_t iCurDstWidth			= pCurLayer->iActualWidth; 
-		int32_t iCurDstHeight			= pCurLayer->iActualHeight;
-		int32_t iInputWidthXDstHeight	= kiInputPicWidth * iCurDstHeight;
-		int32_t iInputHeightXDstWidth	= kiInputPicHeight * iCurDstWidth;
-
-		if (iInputWidthXDstHeight > iInputHeightXDstWidth)
-		{
-			pScaledPicture->iScaledWidth[iSpatialIdx] = iCurDstWidth;
-			pScaledPicture->iScaledHeight[iSpatialIdx] = iInputHeightXDstWidth / kiInputPicWidth;
-		}else {
-			pScaledPicture->iScaledWidth[iSpatialIdx] = iInputWidthXDstHeight / kiInputPicHeight;
-			pScaledPicture->iScaledHeight[iSpatialIdx] = iCurDstHeight;
-		}
-	}
-
-	return bNeedDownsampling;
-}
-
-int32_t  WelsInitScaledPic( SWelsSvcCodingParam *pParam,  Scaled_Picture  *pScaledPicture, CMemoryAlign *pMemoryAlign )
-{
-	bool_t bInputPicNeedScaling = JudgeNeedOfScaling( pParam, pScaledPicture );
-    if( bInputPicNeedScaling )
-    {
-        pScaledPicture->pScaledInputPicture = AllocPicture(pMemoryAlign, pParam->SUsedPicRect.iWidth, pParam->SUsedPicRect.iHeight, false);
-        if( pScaledPicture->pScaledInputPicture == NULL )           
-            return -1;
-    }
-    return 0;
-}
-
-void  FreeScaledPic(Scaled_Picture  *pScaledPicture, CMemoryAlign *pMemoryAlign)
-{
-	if ( pScaledPicture->pScaledInputPicture )
-	{
-		FreePicture( pMemoryAlign, &pScaledPicture->pScaledInputPicture );	
-		pScaledPicture->pScaledInputPicture = NULL;
-	}			
-}
-
-int32_t CWelsPreProcess::InitLastSpatialPictures( void *pCtx )
-{
-	sWelsEncCtx *pEncCtx         = (sWelsEncCtx *)pCtx;
-	SWelsSvcCodingParam *pParam	= pEncCtx->pSvcParam;
-	const int32_t kiDlayerCount			= pParam->iNumDependencyLayer;
-	int32_t iDlayerIndex					= 0;
-
-	for (; iDlayerIndex<kiDlayerCount; iDlayerIndex++)
-	{
-		const int32_t kiLayerInTemporal = pEncCtx->uiSpatialLayersInTemporal[iDlayerIndex];
-		m_pLastSpatialPicture[iDlayerIndex][0]	= pEncCtx->pSpatialPic[iDlayerIndex][kiLayerInTemporal - 2];
-		m_pLastSpatialPicture[iDlayerIndex][1]	= NULL;
-	}
-	for (; iDlayerIndex<MAX_DEPENDENCY_LAYER; iDlayerIndex++)
-	{
-		m_pLastSpatialPicture[iDlayerIndex][0]	= m_pLastSpatialPicture[iDlayerIndex][1] = NULL;
-	}
-
-	return 0;
-}
-//*********************************************************************************************************/
-
-int32_t CWelsPreProcess::ColorspaceConvert(SWelsSvcCodingParam * pSvcParam, SPicture *pDstPic, const SSourcePicture *kpSrc, const int32_t kiWidth, const int32_t kiHeight )
-{
-	return 1;
-	//not support yet
-}
-
-void CWelsPreProcess::BilateralDenoising ( SPicture *pSrc, const int32_t kiWidth, const int32_t kiHeight )
-{
-	int32_t iMethodIdx = METHOD_DENOISE;
-	SPixMap sSrcPixMap = {0};
-
-	sSrcPixMap.pPixel[0] = pSrc->pData[0];
-	sSrcPixMap.pPixel[1] = pSrc->pData[1];
-	sSrcPixMap.pPixel[2] = pSrc->pData[2];
-	sSrcPixMap.iSizeInBits = g_kiPixMapSizeInBits;
-	sSrcPixMap.sRect.iRectWidth = kiWidth;
-	sSrcPixMap.sRect.iRectHeight = kiHeight;
-	sSrcPixMap.iStride[0] = pSrc->iLineSize[0];
-	sSrcPixMap.iStride[1] = pSrc->iLineSize[1];
-	sSrcPixMap.iStride[2] = pSrc->iLineSize[2];
-	sSrcPixMap.eFormat = VIDEO_FORMAT_I420;
-
-	m_pInterfaceVp->Process(iMethodIdx, &sSrcPixMap, NULL);
-}
-
-bool_t CWelsPreProcess::DetectSceneChange( SPicture *pCurPicture, SPicture *pRefPicture )
-{
-	bool_t bSceneChangeFlag = false;
-	int32_t iMethodIdx = METHOD_SCENE_CHANGE_DETECTION;
-	SSceneChangeResult sSceneChangeDetectResult = {0};
-	SPixMap sSrcPixMap = {0};
-	SPixMap sRefPixMap = {0};
-
-	sSrcPixMap.pPixel[0] = pCurPicture->pData[0];
-	sSrcPixMap.iSizeInBits = g_kiPixMapSizeInBits;
-	sSrcPixMap.iStride[0] = pCurPicture->iLineSize[0];
-	sSrcPixMap.sRect.iRectWidth = pCurPicture->iWidthInPixel;
-	sSrcPixMap.sRect.iRectHeight = pCurPicture->iHeightInPixel;
-	sSrcPixMap.eFormat = VIDEO_FORMAT_I420;
-
-
-	sRefPixMap.pPixel[0] = pRefPicture->pData[0]; 
-	sRefPixMap.iSizeInBits = g_kiPixMapSizeInBits;
-	sRefPixMap.iStride[0] = pRefPicture->iLineSize[0];
-	sRefPixMap.sRect.iRectWidth = pRefPicture->iWidthInPixel;
-	sRefPixMap.sRect.iRectHeight = pRefPicture->iHeightInPixel;
-	sRefPixMap.eFormat = VIDEO_FORMAT_I420;
-
-	int32_t iRet = m_pInterfaceVp->Process(iMethodIdx, &sSrcPixMap, &sRefPixMap);
-	if (iRet == 0)
-	{
-		m_pInterfaceVp->Get(iMethodIdx, (void*)&sSceneChangeDetectResult);
-		bSceneChangeFlag = sSceneChangeDetectResult.bSceneChangeFlag ? true : false;
-	}
-
-	return bSceneChangeFlag;
-}
-
-int32_t CWelsPreProcess::DownsamplePadding( SPicture *pSrc, SPicture *pDstPic,  int32_t iSrcWidth, int32_t iSrcHeight,
-											int32_t iShrinkWidth, int32_t iShrinkHeight, int32_t iTargetWidth, int32_t iTargetHeight )
-{
-	int32_t iRet = 0;
-	SPixMap sSrcPixMap = {0};
-	SPixMap sDstPicMap = {0};
-
-	sSrcPixMap.pPixel[0]   = pSrc->pData[0];
-	sSrcPixMap.pPixel[1]   = pSrc->pData[1];
-	sSrcPixMap.pPixel[2]   = pSrc->pData[2];
-	sSrcPixMap.iSizeInBits = g_kiPixMapSizeInBits;
-	sSrcPixMap.sRect.iRectWidth  = iSrcWidth;
-	sSrcPixMap.sRect.iRectHeight = iSrcHeight;
-	sSrcPixMap.iStride[0]  = pSrc->iLineSize[0];
-	sSrcPixMap.iStride[1]  = pSrc->iLineSize[1];
-	sSrcPixMap.iStride[2]  = pSrc->iLineSize[2];
-	sSrcPixMap.eFormat     = VIDEO_FORMAT_I420;	
-
-	if (iSrcWidth != iShrinkWidth || iSrcHeight != iShrinkHeight)
-	{
-		int32_t iMethodIdx = METHOD_DOWNSAMPLE;
-		sDstPicMap.pPixel[0]   = pDstPic->pData[0];
-		sDstPicMap.pPixel[1]   = pDstPic->pData[1];
-		sDstPicMap.pPixel[2]   = pDstPic->pData[2];
-		sDstPicMap.iSizeInBits = g_kiPixMapSizeInBits;
-		sDstPicMap.sRect.iRectWidth  = iShrinkWidth;
-		sDstPicMap.sRect.iRectHeight = iShrinkHeight;
-		sDstPicMap.iStride[0]  = pDstPic->iLineSize[0];
-		sDstPicMap.iStride[1]  = pDstPic->iLineSize[1];
-		sDstPicMap.iStride[2]  = pDstPic->iLineSize[2];
-		sDstPicMap.eFormat     = VIDEO_FORMAT_I420;
-
-		iRet = m_pInterfaceVp->Process(iMethodIdx, &sSrcPixMap, &sDstPicMap);
-	}	
-	else
-	{
-        memcpy(&sDstPicMap, &sSrcPixMap, sizeof(sDstPicMap));	// confirmed_safe_unsafe_usage
-	}
-
-	// get rid of odd line
-	iShrinkWidth -= (iShrinkWidth & 1);
-	iShrinkHeight -= (iShrinkHeight & 1);
-	Padding( (uint8_t *)sDstPicMap.pPixel[0], (uint8_t *)sDstPicMap.pPixel[1], (uint8_t *)sDstPicMap.pPixel[2], 
-		sDstPicMap.iStride[0], sDstPicMap.iStride[1],	iShrinkWidth, iTargetWidth, iShrinkHeight, iTargetHeight);
-
-	return iRet;
-}
-
-//*********************************************************************************************************/
-void CWelsPreProcess::VaaCalculation(SVAAFrameInfo *pVaaInfo, SPicture *pCurPicture, SPicture *pRefPicture,
-                                     bool_t bCalculateSQDiff, bool_t bCalculateVar, bool_t bCalculateBGD)
-{
-	pVaaInfo->sVaaCalcInfo.pCurY = pCurPicture->pData[0];
-	pVaaInfo->sVaaCalcInfo.pRefY = pRefPicture->pData[0];
-	{
-		int32_t iMethodIdx = METHOD_VAA_STATISTICS;
-		SPixMap sCurPixMap = {0};
-		SPixMap sRefPixMap = {0};
-		SVAACalcParam calc_param = {0};
-
-		sCurPixMap.pPixel[0] = pCurPicture->pData[0];
-		sCurPixMap.iSizeInBits = g_kiPixMapSizeInBits;
-		sCurPixMap.sRect.iRectWidth = pCurPicture->iWidthInPixel;
-		sCurPixMap.sRect.iRectHeight = pCurPicture->iHeightInPixel;
-		sCurPixMap.iStride[0] = pCurPicture->iLineSize[0];
-		sCurPixMap.eFormat = VIDEO_FORMAT_I420;
-
-		sRefPixMap.pPixel[0] = pRefPicture->pData[0];
-		sRefPixMap.iSizeInBits = g_kiPixMapSizeInBits;
-		sRefPixMap.sRect.iRectWidth = pRefPicture->iWidthInPixel;
-		sRefPixMap.sRect.iRectHeight = pRefPicture->iHeightInPixel;
-		sRefPixMap.iStride[0] = pRefPicture->iLineSize[0];
-		sRefPixMap.eFormat = VIDEO_FORMAT_I420;
-
-		calc_param.iCalcVar	= bCalculateVar;
-		calc_param.iCalcBgd	= bCalculateBGD;
-		calc_param.iCalcSsd	= bCalculateSQDiff;
-		calc_param.pCalcResult = &pVaaInfo->sVaaCalcInfo;
-
-		m_pInterfaceVp->Set(iMethodIdx, &calc_param);
-		m_pInterfaceVp->Process(iMethodIdx, &sCurPixMap, &sRefPixMap);
-	} 
-}
-
-void CWelsPreProcess::BackgroundDetection( SVAAFrameInfo *pVaaInfo, SPicture *pCurPicture, SPicture *pRefPicture, bool_t bDetectFlag )
-{
-	if (bDetectFlag)
-	{
-		pVaaInfo->iPicWidth     = pCurPicture->iWidthInPixel;
-		pVaaInfo->iPicHeight    = pCurPicture->iHeightInPixel;
-
-		pVaaInfo->iPicStride	= pCurPicture->iLineSize[0];
-		pVaaInfo->iPicStrideUV	= pCurPicture->iLineSize[1];
-		pVaaInfo->pCurY			= pCurPicture->pData[0];
-		pVaaInfo->pRefY			= pRefPicture->pData[0];	
-		pVaaInfo->pCurU			= pCurPicture->pData[1];
-		pVaaInfo->pRefU			= pRefPicture->pData[1];	
-		pVaaInfo->pCurV			= pCurPicture->pData[2];
-		pVaaInfo->pRefV			= pRefPicture->pData[2];	
-
-		int32_t iMethodIdx = METHOD_BACKGROUND_DETECTION;
-		SPixMap sSrcPixMap = {0};
-		SPixMap sRefPixMap = {0};
-		SBGDInterface BGDParam = {0};
-
-		sSrcPixMap.pPixel[0] = pCurPicture->pData[0];
-		sSrcPixMap.pPixel[1] = pCurPicture->pData[1];
-		sSrcPixMap.pPixel[2] = pCurPicture->pData[2];
-		sSrcPixMap.iSizeInBits = g_kiPixMapSizeInBits;
-		sSrcPixMap.iStride[0] = pCurPicture->iLineSize[0];
-		sSrcPixMap.iStride[1] = pCurPicture->iLineSize[1];
-		sSrcPixMap.iStride[2] = pCurPicture->iLineSize[2];
-		sSrcPixMap.sRect.iRectWidth = pCurPicture->iWidthInPixel;
-		sSrcPixMap.sRect.iRectHeight = pCurPicture->iHeightInPixel;
-		sSrcPixMap.eFormat = VIDEO_FORMAT_I420;
-
-		sRefPixMap.pPixel[0] = pRefPicture->pData[0];
-		sRefPixMap.pPixel[1] = pRefPicture->pData[1];
-		sRefPixMap.pPixel[2] = pRefPicture->pData[2];
-		sRefPixMap.iSizeInBits = g_kiPixMapSizeInBits;
-		sRefPixMap.iStride[0] = pRefPicture->iLineSize[0];
-		sRefPixMap.iStride[1] = pRefPicture->iLineSize[1];
-		sRefPixMap.iStride[2] = pRefPicture->iLineSize[2];
-		sRefPixMap.sRect.iRectWidth = pRefPicture->iWidthInPixel;
-		sRefPixMap.sRect.iRectHeight = pRefPicture->iHeightInPixel;
-		sRefPixMap.eFormat = VIDEO_FORMAT_I420;
-
-		BGDParam.pBackgroundMbFlag = pVaaInfo->pVaaBackgroundMbFlag;
-		BGDParam.pCalcRes = &(pVaaInfo->sVaaCalcInfo);
-		m_pInterfaceVp->Set(iMethodIdx, (void*)&BGDParam);
-		m_pInterfaceVp->Process(iMethodIdx, &sSrcPixMap, &sRefPixMap);
-	} 
-	else
-	{
-		int32_t	iPicWidthInMb	= (pCurPicture->iWidthInPixel + 15) >> 4;
-		int32_t	iPicHeightInMb= (pCurPicture->iHeightInPixel+ 15) >> 4;
-		memset(pVaaInfo->pVaaBackgroundMbFlag, 0, iPicWidthInMb * iPicHeightInMb);
-	}
-}
-
-void CWelsPreProcess::AdaptiveQuantCalculation( SVAAFrameInfo *pVaaInfo, SPicture *pCurPicture, SPicture *pRefPicture )
-{
-	pVaaInfo->sAdaptiveQuantParam.pCalcResult = &(pVaaInfo->sVaaCalcInfo); 
-	pVaaInfo->sAdaptiveQuantParam.dAverMotionTextureIndexToDeltaQp = 0;
-
-	{
-		int32_t iMethodIdx = METHOD_ADAPTIVE_QUANT;
-		SPixMap pSrc = {0};
-		SPixMap pRef = {0};
-		int32_t iRet = 0;
-
-		pSrc.pPixel[0] = pCurPicture->pData[0];
-		pSrc.iSizeInBits = g_kiPixMapSizeInBits;
-		pSrc.iStride[0] = pCurPicture->iLineSize[0];
-		pSrc.sRect.iRectWidth = pCurPicture->iWidthInPixel;
-		pSrc.sRect.iRectHeight = pCurPicture->iHeightInPixel;
-		pSrc.eFormat = VIDEO_FORMAT_I420;
-
-		pRef.pPixel[0] = pRefPicture->pData[0]; 
-		pRef.iSizeInBits = g_kiPixMapSizeInBits;
-		pRef.iStride[0] = pRefPicture->iLineSize[0];
-		pRef.sRect.iRectWidth = pRefPicture->iWidthInPixel;
-		pRef.sRect.iRectHeight = pRefPicture->iHeightInPixel;
-		pRef.eFormat = VIDEO_FORMAT_I420;
-
-		iRet = m_pInterfaceVp->Set(iMethodIdx, (void*)&(pVaaInfo->sAdaptiveQuantParam));
-		iRet = m_pInterfaceVp->Process(iMethodIdx, &pSrc, &pRef);
-		if (iRet == 0)
-			m_pInterfaceVp->Get(iMethodIdx, (void*)&(pVaaInfo->sAdaptiveQuantParam));
-	}
-}
-
-void CWelsPreProcess::SetRefMbType( void *pCtx, uint32_t **pRefMbTypeArray, int32_t iRefPicType )
-{
-  sWelsEncCtx *pEncCtx	    = (sWelsEncCtx *)pCtx;
-  const uint8_t uiTid		    = pEncCtx->uiTemporalId;	
-  const uint8_t uiDid          = pEncCtx->uiDependencyId;
-  SRefList *pRefPicLlist				= pEncCtx->ppRefPicListExt[uiDid];	
-  SLTRState* pLtr				= &pEncCtx->pLtr[uiDid];
-  uint8_t i							= 0;
-
-  if (pEncCtx->pSvcParam->bEnableLongTermReference && pLtr->bReceivedT0LostFlag && uiTid == 0)
-  {
-    for ( i = 0;i <pRefPicLlist->uiLongRefCount;i++ )	
-    {
-      SPicture *pRef = pRefPicLlist->pLongRefList[i];
-      if ( pRef != NULL && pRef->uiRecieveConfirmed == 1/*RECIEVE_SUCCESS*/)	
-      {
-        *pRefMbTypeArray = pRef->uiRefMbType;
-        break;
-      }
-    }
-  }
-  else
-  {
-    for ( i = 0; i < pRefPicLlist->uiShortRefCount; i++ )
-    {
-      SPicture *pRef = pRefPicLlist->pShortRefList[i];
-      if ( pRef != NULL && pRef->bUsedAsRef && pRef->iFramePoc >= 0 && pRef->uiTemporalId <= uiTid)
-      {		
-        *pRefMbTypeArray = pRef->uiRefMbType;
-        break;	
-      }
-    }
-  }
-}
-
-
-void CWelsPreProcess::AnalyzePictureComplexity( void *pCtx, SPicture *pCurPicture, SPicture *pRefPicture, const int32_t kiDependencyId, const bool_t bCalculateBGD )
-{
-	sWelsEncCtx *pEncCtx	= (sWelsEncCtx *)pCtx;
-	SWelsSvcCodingParam *pSvcParam= pEncCtx->pSvcParam;
-	SVAAFrameInfo *pVaaInfo			= pEncCtx->pVaa;
-
-	SComplexityAnalysisParam *sComplexityAnalysisParam = &(pVaaInfo->sComplexityAnalysisParam);
-	SWelsSvcRc *SWelsSvcRc = &pEncCtx->pWelsSvcRc[kiDependencyId];
-	int32_t iComplexityAnalysisMode = 0;
-
-	if( pSvcParam->iRCMode == RC_MODE0 && pEncCtx->eSliceType == P_SLICE )
-	{
-		iComplexityAnalysisMode = FRAME_SAD;
-	}
-	else if ( pSvcParam->iRCMode == RC_MODE1 && pEncCtx->eSliceType == P_SLICE )
-	{
-		iComplexityAnalysisMode = GOM_SAD;
-	}
-	else if ( pSvcParam->iRCMode == RC_MODE1 && pEncCtx->eSliceType == I_SLICE )
-	{
-		iComplexityAnalysisMode = GOM_VAR;
-	}
-	else
-	{
-		return;
-	}
-
-	sComplexityAnalysisParam->iComplexityAnalysisMode = iComplexityAnalysisMode;
-	sComplexityAnalysisParam->pCalcResult = &(pVaaInfo->sVaaCalcInfo); 
-	sComplexityAnalysisParam->pBackgroundMbFlag = pVaaInfo->pVaaBackgroundMbFlag;
-    SetRefMbType(pEncCtx, &(sComplexityAnalysisParam->uiRefMbType), pRefPicture->iPictureType);
-	sComplexityAnalysisParam->iCalcBgd = bCalculateBGD; 
-	sComplexityAnalysisParam->iFrameComplexity = 0;
-
-	memset(SWelsSvcRc->pGomForegroundBlockNum, 0, SWelsSvcRc->iGomSize*sizeof(int32_t));
-	if ( iComplexityAnalysisMode != FRAME_SAD )
-		memset( SWelsSvcRc->pCurrentFrameGomSad, 0, SWelsSvcRc->iGomSize*sizeof(int32_t) );
-
-	sComplexityAnalysisParam->pGomComplexity = SWelsSvcRc->pCurrentFrameGomSad;
-	sComplexityAnalysisParam->pGomForegroundBlockNum = SWelsSvcRc->pGomForegroundBlockNum;
-	sComplexityAnalysisParam->iMbNumInGom = SWelsSvcRc->iNumberMbGom;
-
-	{
-		int32_t iMethodIdx = METHOD_COMPLEXITY_ANALYSIS;
-		SPixMap sSrcPixMap = {0};
-		SPixMap sRefPixMap = {0};
-		int32_t iRet = 0;
-
-		sSrcPixMap.pPixel[0] = pCurPicture->pData[0];
-		sSrcPixMap.iSizeInBits = g_kiPixMapSizeInBits;
-		sSrcPixMap.iStride[0] = pCurPicture->iLineSize[0];
-		sSrcPixMap.sRect.iRectWidth = pCurPicture->iWidthInPixel;
-		sSrcPixMap.sRect.iRectHeight = pCurPicture->iHeightInPixel;
-		sSrcPixMap.eFormat = VIDEO_FORMAT_I420;
-
-		sRefPixMap.pPixel[0] = pRefPicture->pData[0]; 
-		sRefPixMap.iSizeInBits = g_kiPixMapSizeInBits;
-		sRefPixMap.iStride[0] = pRefPicture->iLineSize[0];
-		sRefPixMap.sRect.iRectWidth = pRefPicture->iWidthInPixel;
-		sRefPixMap.sRect.iRectHeight = pRefPicture->iHeightInPixel;
-		sRefPixMap.eFormat = VIDEO_FORMAT_I420;
-
-		iRet = m_pInterfaceVp->Set(iMethodIdx, (void*)sComplexityAnalysisParam);
-		iRet = m_pInterfaceVp->Process(iMethodIdx, &sSrcPixMap, &sRefPixMap);
-		if (iRet == 0)
-			m_pInterfaceVp->Get(iMethodIdx, (void*)sComplexityAnalysisParam);
-	}
-}
-
-void  CWelsPreProcess::Padding(uint8_t * pSrcY, uint8_t * pSrcU, uint8_t * pSrcV, int32_t iStrideY, int32_t iStrideUV,
-			  int32_t iActualWidth, int32_t iPaddingWidth, int32_t iActualHeight, int32_t iPaddingHeight)
-{
-	int32_t i;
-
-	if( iPaddingHeight > iActualHeight ){
-		for( i=iActualHeight;i<iPaddingHeight;i++ ){
-			memset(pSrcY + i*iStrideY, 0, iActualWidth);			
-
-			if( !(i&1) ){
-				memset(pSrcU + i/2*iStrideUV, 0x80, iActualWidth/2);
-				memset(pSrcV + i/2*iStrideUV, 0x80, iActualWidth/2);	
-			}
-		}		
-	}
-
-	if( iPaddingWidth > iActualWidth ){
-		for( i=0;i<iPaddingHeight;i++ ){
-			memset(pSrcY + i*iStrideY + iActualWidth, 0, iPaddingWidth - iActualWidth);
-			if( !(i&1) ){
-				memset(pSrcU + i/2*iStrideUV + iActualWidth/2, 0x80, (iPaddingWidth - iActualWidth)/2);
-				memset(pSrcV + i/2*iStrideUV + iActualWidth/2, 0x80, (iPaddingWidth - iActualWidth)/2);
-			}
-		}        
-	}
-}
-
-
-//TODO: may opti later
-//TODO: not use this func?
-void * WelsMemcpy( void *dst, const void *kpSrc, uint32_t uiSize)
-{
-	return ::memcpy(dst, kpSrc, uiSize);
-}
-void * WelsMemset( void * p, int32_t val, uint32_t uiSize)
-{
-	return ::memset(p, val, uiSize);
-}
-
-//i420_to_i420_c
-void  WelsMoveMemory_c(uint8_t * pDstY, uint8_t * pDstU, uint8_t * pDstV,  int32_t iDstStrideY, int32_t iDstStrideUV,  
-                               uint8_t * pSrcY, uint8_t * pSrcU, uint8_t * pSrcV, int32_t iSrcStrideY, int32_t iSrcStrideUV, int32_t iWidth, int32_t iHeight )
-{
-	int32_t   iWidth2 = iWidth >> 1;
-	int32_t   iHeight2 = iHeight >> 1;
-	int32_t   j;
-
-	for( j=iHeight;j;j-- )
-	{
-		WelsMemcpy(pDstY, pSrcY, iWidth);
-		pDstY += iDstStrideY;
-		pSrcY += iSrcStrideY;
-	}
-
-	for( j=iHeight2;j;j-- )
-	{
-		WelsMemcpy(pDstU, pSrcU, iWidth2);
-		WelsMemcpy(pDstV, pSrcV, iWidth2);
-		pDstU += iDstStrideUV;
-		pDstV += iDstStrideUV;
-		pSrcU += iSrcStrideUV;
-		pSrcV += iSrcStrideUV;
-	}
-}
-//vp's padding
-void  VPpadding(uint8_t * pSrcPtr, int32_t iCurWidth, int32_t iTargetWidth, int32_t iCurHeight, int32_t iTargetHeight, 
-				int32_t iStride, uint8_t uiStuffValue)
-{
-	uint8_t *pTmp;	
-	if( iTargetWidth > iCurWidth )
-	{
-		pTmp = pSrcPtr + iCurWidth;
-		for( int32_t i = 0; i < iCurHeight; i++ )
-		{
-			WelsMemset(pTmp, uiStuffValue, iTargetWidth - iCurWidth);
-			pTmp += iStride;
-		}        
-	}
-	
-	if( iTargetHeight > iCurHeight )
-	{
-		pTmp = pSrcPtr + iCurHeight * iStride;
-		for( int32_t i = iCurHeight; i < iTargetHeight;i++ )
-		{
-			WelsMemset(pTmp, uiStuffValue, iTargetWidth);
-			pTmp += iStride;
-		}		
-	}
-}
-
-
-void  CWelsPreProcess::WelsMoveMemoryWrapper(SWelsSvcCodingParam * pSvcParam, SPicture *pDstPic, const SSourcePicture *kpSrc, 
-                                             const int32_t kiTargetWidth, const int32_t kiTargetHeight )
-{
-    if (VIDEO_FORMAT_I420!=(kpSrc->iColorFormat & (~VIDEO_FORMAT_VFlip)))
-        return;
-
-    int32_t  iSrcWidth       = kpSrc->iPicWidth;
-    int32_t  iSrcHeight      = kpSrc->iPicHeight;
-
-    if ( iSrcHeight > kiTargetHeight ) 	iSrcHeight = kiTargetHeight;
-    if ( iSrcWidth > kiTargetWidth )		iSrcWidth  = kiTargetWidth;
-
-    // copy from fr26 to fix the odd uiSize failed issue 
-    if( iSrcWidth & 0x1 )		-- iSrcWidth;
-    if( iSrcHeight & 0x1 )		-- iSrcHeight;	
-
-    const int32_t kiSrcTopOffsetY = pSvcParam->SUsedPicRect.iTop;
-    const int32_t kiSrcTopOffsetUV = (kiSrcTopOffsetY>>1);
-    const int32_t kiSrcLeftOffsetY = pSvcParam->SUsedPicRect.iLeft;
-    const int32_t kiSrcLeftOffsetUV = (kiSrcLeftOffsetY>>1);
-    int32_t  iSrcOffset[3]       = {0,0,0};
-    iSrcOffset[0] = kpSrc->iStride[0]*kiSrcTopOffsetY + kiSrcLeftOffsetY;
-    iSrcOffset[1] = kpSrc->iStride[1]*kiSrcTopOffsetUV + kiSrcLeftOffsetUV ;
-    iSrcOffset[2] = kpSrc->iStride[2]*kiSrcTopOffsetUV + kiSrcLeftOffsetUV;
-
-    uint8_t * pSrcY = kpSrc->pData[0] + iSrcOffset[0];
-    uint8_t * pSrcU = kpSrc->pData[1] + iSrcOffset[1];
-    uint8_t * pSrcV = kpSrc->pData[2] + iSrcOffset[2];
-    const int32_t kiSrcStrideY = kpSrc->iStride[0];
-    const int32_t kiSrcStrideUV= kpSrc->iStride[1];
-    
-    uint8_t * pDstY = pDstPic->pData[0];
-    uint8_t * pDstU = pDstPic->pData[1];
-    uint8_t * pDstV = pDstPic->pData[2];
-    const int32_t kiDstStrideY = pDstPic->iLineSize[0];
-    const int32_t kiDstStrideUV = pDstPic->iLineSize[1];
-
-#define MAX_WIDTH      (4096)
-#define MAX_HEIGHT     (2304)//MAX_FS_LEVEL51 (36864); MAX_FS_LEVEL51*256/4096 = 2304
-    if (pSrcY)
-    {
-        if (iSrcWidth <= 0 || iSrcWidth > MAX_WIDTH || iSrcHeight <= 0 || iSrcHeight > MAX_HEIGHT)
-            return;
-        if (kiSrcTopOffsetY >= iSrcHeight || kiSrcLeftOffsetY>= iSrcWidth || iSrcWidth > kiSrcStrideY )
-            return;
-    }
-    if (pDstY)
-    {
-        if (kiTargetWidth <= 0 || kiTargetWidth > MAX_WIDTH || kiTargetHeight<= 0 || kiTargetHeight> MAX_HEIGHT)
-            return;
-        if (kiTargetWidth > kiDstStrideY)
-            return;
-    }
-
-    if (pSrcY == NULL || pSrcU == NULL || pSrcV == NULL || pDstY == NULL || pDstU == NULL || pDstV == NULL
-        || (iSrcWidth & 1) || (iSrcHeight & 1) )
-    {}
-    else
-    { 
-        //i420_to_i420_c
-        WelsMoveMemory_c( pDstY,  pDstU,  pDstV,  kiDstStrideY, kiDstStrideUV,  
-            pSrcY,  pSrcU,  pSrcV, kiSrcStrideY, kiSrcStrideUV, iSrcWidth, iSrcHeight );
-
-        //in VP Process
-        if ( kiTargetWidth > iSrcWidth || kiTargetHeight > iSrcHeight )
-        {
-            const int32_t kiTargetWidthC  = (kiTargetWidth>>1);
-            const int32_t kiTargetHeightC = (kiTargetHeight>>1);
-            const int32_t kiSrcWidthC        = (iSrcWidth>>1);
-            const int32_t kiSrcHeightC       = (iSrcHeight>>1);
-
-            // padding pDstPic I420
-            VPpadding((uint8_t *)pDstY, iSrcWidth, kiTargetWidth, iSrcHeight, kiTargetHeight, kiDstStrideY, 0);
-            VPpadding((uint8_t *)pDstU, kiSrcWidthC, kiTargetWidthC, kiSrcHeightC, kiTargetHeightC, kiDstStrideUV, 0x80);
-            VPpadding((uint8_t *)pDstV, kiSrcWidthC, kiTargetWidthC, kiSrcHeightC, kiTargetHeightC, kiDstStrideUV, 0x80);
-        }
-    }
-
-}
-
-//*********************************************************************************************************/
-} // namespace WelsSVCEnc
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#if defined(WIN32)
+#include <windows.h>
+#elif defined(MACOS)
+#include "bundleloader.h"
+#elif defined(__GNUC__)
+#include <dlfcn.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include "wels_preprocess.h"
+#include "memory_align.h"
+#include "encoder.h"
+#include "extern.h"
+#include "picture_handle.h"
+#include "encoder_context.h"
+#include "utils.h"
+
+#ifdef NO_DYNAMIC_VP
+EResult WELSAPI CreateVpInterface (void** ppCtx, int iVersion);
+EResult WELSAPI DestroyVpInterface (void** ppCtx, int iVersion);
+#endif
+
+namespace WelsSVCEnc {
+
+#define WelsSafeDelete(p) do { delete (p); (p) = NULL; } while (0) // single statement: safe inside if/else; delete on NULL is a no-op
+
+
+//***** entry API declaration ************************************************************************//
+typedef EResult (WELSAPI* pfnCreateVpInterface) (void**, int);
+typedef EResult (WELSAPI* pfnDestroyVpInterface) (void*, int);
+
+int32_t WelsInitScaledPic (SWelsSvcCodingParam* pParam,  Scaled_Picture*  pScaledPic, CMemoryAlign* pMemoryAlign);
+bool_t  JudgeNeedOfScaling (SWelsSvcCodingParam* pParam, Scaled_Picture* pScaledPic);
+void    FreeScaledPic (Scaled_Picture*  pScaledPic, CMemoryAlign* pMemoryAlign);
+
+//******* table definition ***********************************************************************//
+const uint8_t g_kuiRefTemporalIdx[MAX_TEMPORAL_LEVEL][MAX_GOP_SIZE] = {
+  {  0, }, // 0
+  {  0,  0, }, // 1
+  {  0,  0,  0,  1, }, // 2
+  {  0,  0,  0,  2,  0,  1,  1,  2, }, // 3
+  {  0,  0,  0,  3,  0,  2,  2,  3,  0,  1,  1,  3,  1,  2,  2,  3 }  // 4
+};
+
+const int32_t g_kiPixMapSizeInBits = sizeof (uint8_t) * 8;
+
+
+inline  void   WelsUpdateSpatialIdxMap (sWelsEncCtx* pEncCtx, int32_t iPos, SPicture* pPic, int32_t iDidx) { // fill slot iPos of pEncCtx->sSpatialIndexMap
+  pEncCtx->sSpatialIndexMap[iPos].pSrc = pPic;   // picture recorded for this slot
+  pEncCtx->sSpatialIndexMap[iPos].iDid = iDidx;  // stored as iDid; presumably the dependency (spatial layer) id - confirm against callers
+}
+
+
+//***************************************************************************************************//
+CWelsLib::CWelsLib (void* pEncCtx) { // Clears the interface slots and, unless NO_DYNAMIC_VP is defined, loads the VP shared library.
+  m_pInterface[0] = m_pInterface[1] = NULL;
+  m_pVpLib = NULL; // always initialised: QueryFunction() reads it even when NO_DYNAMIC_VP skips the loading code below
+#ifndef NO_DYNAMIC_VP
+#if defined(WIN32)
+  const str_t WelsVPLib[] = "welsvp.dll";
+  HMODULE shModule = LoadLibrary (WelsVPLib);
+  if (!shModule)
+    WelsLog (pEncCtx, WELS_LOG_ERROR, "welsvp load lib dynamic failed module=%x\n", shModule);
+
+#elif defined(MACOS)
+  const str_t WelsVPLib[] = "welsvp.bundle";
+  str_t pCurPath[256];
+  GetCurrentModulePath (pCurPath, 256);
+  strlcat (pCurPath, WelsVPLib, 256);
+  CFBundleRef shModule = LoadBundle (pCurPath);
+  if (!shModule)
+    WelsLog (pEncCtx, WELS_LOG_ERROR, "welsvp load lib dynamic failed module=%x\n", shModule);
+
+#elif defined(__GNUC__)
+  const str_t WelsVPLib[] = "./libwelsvp.so";
+  void* shModule = NULL;
+  shModule = dlopen (WelsVPLib, RTLD_LAZY);
+  if (shModule == NULL) // %p, not %x: the handle is a pointer and may be wider than unsigned int
+    printf ("dlopen %s iRet=%p, err=%s\n", WelsVPLib, shModule, dlerror());
+#endif
+
+  m_pVpLib = (void*)shModule; // may still be NULL when loading failed; the other members test for that
+#endif
+}
+
+CWelsLib::~CWelsLib() { // Releases the library handle held in m_pVpLib, if any; compiled to a no-op when NO_DYNAMIC_VP is defined.
+#ifndef NO_DYNAMIC_VP
+  if (m_pVpLib) {
+#if defined(WIN32)
+    HMODULE shModule = (HMODULE)m_pVpLib;
+    FreeLibrary (shModule);
+
+#elif defined(MACOS)
+    CFBundleRef shModule = (CFBundleRef)m_pVpLib;
+    FreeBundle (shModule);
+
+#elif defined(__GNUC__)
+    void* shModule = m_pVpLib;
+    dlclose (shModule);
+#endif
+    m_pVpLib = NULL; // forget the handle once it has been handed to the platform release call
+  }
+#endif
+}
+
+void* CWelsLib::QueryFunction (const str_t* pName) { // Looks up symbol pName in the loaded library; returns NULL if no library is loaded or the lookup fails.
+  void* pFunc = NULL;
+
+  if (m_pVpLib) {
+#if defined(WIN32)
+    HMODULE shModule = (HMODULE)m_pVpLib;
+    pFunc = (void*)GetProcAddress (shModule, pName);
+
+#elif defined(MACOS)
+    CFBundleRef shModule = (CFBundleRef)m_pVpLib;
+    pFunc = (void*)GetProcessAddress (shModule, pName);
+
+#elif defined(__GNUC__)
+    void* shModule = m_pVpLib;
+    pFunc = (void*)dlsym (shModule, pName);
+    if (pFunc == NULL) // print the symbol name: the library handle is not a C string and must not be passed to %s
+      printf ("dlsym %s iRet=%p, err=%s\n", pName, pFunc, dlerror());
+#endif
+  }
+  return pFunc;
+}
+
+int32_t CWelsLib::CreateIface (void** ppEncCtx) { // Resolves the create/destroy entry points and creates the VP interface into *ppEncCtx; returns 0 on success, 1 otherwise.
+#ifndef NO_DYNAMIC_VP
+  if (m_pVpLib) {
+
+#endif
+    pfnCreateVpInterface  pCreateVpInterface  = NULL;
+    pfnDestroyVpInterface pDestroyVpInterface = NULL;
+
+#ifndef NO_DYNAMIC_VP
+    pCreateVpInterface  = (pfnCreateVpInterface)  QueryFunction ("CreateVpInterface");
+    pDestroyVpInterface = (pfnDestroyVpInterface) QueryFunction ("DestroyVpInterface");
+#else
+    pCreateVpInterface  = CreateVpInterface;
+    // TODO([email protected]): This cast corrects a signature difference... This is a potential real problem
+    pDestroyVpInterface = (pfnDestroyVpInterface)DestroyVpInterface;
+#endif
+
+    m_pInterface[0] = (void*)pCreateVpInterface; // slot 0: create entry point
+    m_pInterface[1] = (void*)pDestroyVpInterface; // slot 1: destroy entry point, used later by DestroyIface()
+
+    if (ppEncCtx && m_pInterface[0] && m_pInterface[1]) // both entry points and an output slot are required
+      pCreateVpInterface (ppEncCtx, WELSVP_INTERFACE_VERION);
+#ifndef NO_DYNAMIC_VP
+  } else {
+  }
+#endif
+  // Success means an interface was actually produced; assumes the caller pre-clears *ppEncCtx to NULL.
+  return (ppEncCtx && *ppEncCtx) ? 0 : 1;
+}
+
+int32_t CWelsLib::DestroyIface (void* pEncCtx) {
+  // Hand pEncCtx to the destroy entry point stored in m_pInterface[1]; always returns 0.
+  pfnDestroyVpInterface pfnDestroy = (pfnDestroyVpInterface) m_pInterface[1];
+
+  // Nothing to release without a context or without a destroy entry point.
+  if (pEncCtx == NULL || pfnDestroy == NULL)
+    return 0;
+
+  pfnDestroy (pEncCtx, WELSVP_INTERFACE_VERION);
+  return 0;
+}
+
+/***************************************************************************
+*
+*	implementation of the interface
+*
+***************************************************************************/
+
+CWelsPreProcess::CWelsPreProcess (void* pEncCtx) {
+  m_pEncCtx = pEncCtx; // only stored here; the library wrapper is created later by WelsPreprocessCreate()
+  m_pEncLib = NULL;
+  m_pInterfaceVp = NULL;
+  m_bOfficialBranch = FALSE;
+  m_bInitDone = false;
+  memset (&m_sScaledPicture, 0, sizeof m_sScaledPicture); // start with an all-zero scaled-picture record
+}
+
+CWelsPreProcess::~CWelsPreProcess() { // Calls FreeScaledPic on m_sScaledPicture, then WelsPreprocessDestroy().
+  FreeScaledPic (&m_sScaledPicture, static_cast<sWelsEncCtx*> (m_pEncCtx)->pMemAlign); // NOTE(review): dereferences m_pEncCtx unchecked - confirm it is never NULL and outlives this object
+  WelsPreprocessDestroy();
+}
+
+int32_t CWelsPreProcess::WelsPreprocessCreate() {
+  // Creates the library wrapper and the VP interface once; returns 0 on success, 1 otherwise.
+  bool bCreated = false;
+  if (m_pEncLib == NULL && m_pInterfaceVp == NULL) {
+    m_pEncLib = new CWelsLib (m_pEncCtx);
+    if (m_pEncLib) {
+      m_pEncLib->CreateIface ((void**)&m_pInterfaceVp);
+      bCreated = (m_pInterfaceVp != NULL);
+    }
+  }
+
+  if (bCreated)
+    return 0;
+
+  // Creation failed, or something already existed on entry: tear down and report an error.
+  WelsPreprocessDestroy();
+  return 1;
+}
+
+int32_t CWelsPreProcess::WelsPreprocessDestroy() {
+  // Releases the VP interface through the library wrapper, then deletes the wrapper; always returns 0.
+  if (m_pEncLib == NULL)
+    return 0;
+  m_pEncLib->DestroyIface ((void*)m_pInterfaceVp);
+  m_pInterfaceVp = NULL;
+  WelsSafeDelete (m_pEncLib);
+  return 0;
+}
+
+int32_t CWelsPreProcess::WelsPreprocessReset (void* pCtx) {
+  sWelsEncCtx* pEncCtx = (sWelsEncCtx*)pCtx;
+  int32_t iRet = -1; // returned as-is when pCtx is NULL
+  // Frees the scaled picture, then runs InitLastSpatialPictures and WelsInitScaledPic; returns the first non-zero status, else 0.
+  if (pEncCtx) {
+    FreeScaledPic (&m_sScaledPicture, pEncCtx->pMemAlign);
+    iRet = InitLastSpatialPictures (pEncCtx);
+    if (iRet == 0) // keep a failure from the previous step instead of overwriting it with the next status
+      iRet = WelsInitScaledPic (pEncCtx->pSvcParam, &m_sScaledPicture, pEncCtx->pMemAlign);
+  }
+  return iRet;
+}
+
+int32_t CWelsPreProcess::WelsPreprocessStep1 (void* pCtx, const SSourcePicture** kppSrcPicList,
+    const int32_t kiConfiguredLayerNum) {
+  sWelsEncCtx* pEncCtx = (sWelsEncCtx*)pCtx;
+  SWelsSvcCodingParam* pSvcParam = pEncCtx->pSvcParam;
+  int32_t iNumDependencyLayer = (int32_t)pSvcParam->iNumDependencyLayer;
+  int32_t iSpatialNum = 0;
+  // One-time VP setup on the first call, then pre-processes the source picture(s); returns the spatial layer count, or -1 on failure.
+  // Reject unusable input first, so a bad first call cannot latch m_bOfficialBranch / m_bInitDone from it.
+  if (kiConfiguredLayerNum <= 0 || kppSrcPicList == NULL)
+    return -1;
+
+  if (!m_bInitDone) {
+    if (WelsPreprocessCreate() != 0)
+      return -1;
+    if (WelsPreprocessReset (pEncCtx) != 0)
+      return -1;
+
+    m_bOfficialBranch  = (iNumDependencyLayer != kiConfiguredLayerNum);
+    if (!m_bOfficialBranch && (iNumDependencyLayer == 1)) {
+      // check the input source uiSize to decide if need switch to official branch
+      // NOTICE: the layernum=1 case is confused in official/non-official cases!
+      for (int32_t i = 0; i < iNumDependencyLayer; i++) {
+        if (pSvcParam->sDependencyLayers[i].iFrameWidth != kppSrcPicList[i]->iPicWidth ||
+            pSvcParam->sDependencyLayers[i].iFrameHeight != kppSrcPicList[i]->iPicHeight) {
+          m_bOfficialBranch = TRUE;
+          break;
+        }
+      }
+    }
+    m_bInitDone = TRUE;
+  }
+
+  if (m_pInterfaceVp == NULL)
+    return -1;
+
+  pEncCtx->pVaa->bSceneChangeFlag = pEncCtx->pVaa->bIdrPeriodFlag = false;
+  if (pSvcParam->uiIntraPeriod)
+    pEncCtx->pVaa->bIdrPeriodFlag = (1 + pEncCtx->iFrameIndex >= (int32_t)pSvcParam->uiIntraPeriod) ? true : false;
+
+  if (m_bOfficialBranch) {	// Perform Down Sampling potentially due to application
+    assert (kiConfiguredLayerNum == 1);
+    iSpatialNum = SingleLayerPreprocess (pEncCtx, kppSrcPicList[0], &m_sScaledPicture);
+  } else { // for console each spatial pictures are available there
+    iSpatialNum = kiConfiguredLayerNum;
+    MultiLayerPreprocess (pEncCtx, kppSrcPicList, iSpatialNum);
+  }
+
+  return iSpatialNum;
+}
+
+int32_t CWelsPreProcess::WelsPreprocessStep3 (void* pCtx, const int32_t kiDidx) { // Runs the per-layer analysis passes (VAA, background detection, adaptive quant, complexity) for layer kiDidx; always returns 0.
+  sWelsEncCtx* pEncCtx = (sWelsEncCtx*)pCtx;
+  SWelsSvcCodingParam* pSvcParam = pEncCtx->pSvcParam;
+  bool_t bNeededMbAq = (pSvcParam->bEnableAdaptiveQuant && (pEncCtx->eSliceType == P_SLICE));
+  bool_t bCalculateBGD = (pEncCtx->eSliceType == P_SLICE && pSvcParam->bEnableBackgroundDetection);
+  // Current picture: the last of the uiSpatialLayersInTemporal[kiDidx] slots of this layer.
+  int32_t iCurTemporalIdx  = pEncCtx->uiSpatialLayersInTemporal[kiDidx] - 1;
+  // Reference slot: table lookup by iDecompStages and iCodingIndex masked with uiGopSize - 1 (presumably uiGopSize is a power of two - confirm).
+  int32_t iRefTemporalIdx = (int32_t)g_kuiRefTemporalIdx[pSvcParam->iDecompStages][pEncCtx->iCodingIndex &
+                            (pSvcParam->uiGopSize - 1)];
+  if (pEncCtx->uiTemporalId == 0 && pEncCtx->pLtr[pEncCtx->uiDependencyId].bReceivedT0LostFlag) // NOTE(review): pLtr is indexed by uiDependencyId, other lookups by kiDidx - confirm they always match
+    iRefTemporalIdx = pEncCtx->uiSpatialLayersInTemporal[kiDidx] + pEncCtx->pVaa->uiValidLongTermPicIdx;
+  // Both pictures come from this layer's pSpatialPic array.
+  SPicture* pCurPic = pEncCtx->pSpatialPic[kiDidx][iCurTemporalIdx];
+  SPicture* pRefPic = pEncCtx->pSpatialPic[kiDidx][iRefTemporalIdx];
+  {
+    SPicture* pLastPic = m_pLastSpatialPicture[kiDidx][0];
+    bool_t bCalculateSQDiff = ((pLastPic->pData[0] == pRefPic->pData[0]) && bNeededMbAq); // only when the reference shares its luma buffer with the tracked last picture
+    bool_t bCalculateVar = (pSvcParam->iRCMode == RC_MODE1 && pEncCtx->eSliceType == I_SLICE);
+
+    VaaCalculation (pEncCtx->pVaa, pCurPic, pRefPic, bCalculateSQDiff, bCalculateVar, bCalculateBGD);
+  }
+  // The detect flag passed below is additionally false when the reference picture type is I_SLICE.
+  if (pSvcParam->bEnableBackgroundDetection) {
+    BackgroundDetection (pEncCtx->pVaa, pCurPic, pRefPic, bCalculateBGD && pRefPic->iPictureType != I_SLICE);
+  }
+  // These two locals shadow the outer pCurPic/pRefPic inside this block only; they are the tracked [1] and [0] pictures.
+  if (bNeededMbAq) {
+    SPicture* pCurPic = m_pLastSpatialPicture[kiDidx][1];
+    SPicture* pRefPic = m_pLastSpatialPicture[kiDidx][0];
+
+    AdaptiveQuantCalculation (pEncCtx->pVaa, pCurPic, pRefPic);
+  }
+  // Uses the outer pCurPic/pRefPic (the pSpatialPic entries), not the shadowing pair above.
+  if (pSvcParam->bEnableRc) {
+    AnalyzePictureComplexity (pEncCtx, pCurPic, pRefPic, kiDidx, bCalculateBGD);
+  }
+  // Presumably swaps the two tracked pictures of this layer for the next call - confirm in WelsExchangeSpatialPictures.
+  WelsExchangeSpatialPictures (&m_pLastSpatialPicture[kiDidx][1], &m_pLastSpatialPicture[kiDidx][0]);
+
+  return 0;
+}
+
+
+/*
+*	SingleLayerPreprocess: copies the single input picture in and derives each coded spatial layer from it (down sampling if applicable)
+*  @return:	exact number of spatial layers that are actually to be encoded, or -1 if the source picture is NULL
+*/
+int32_t CWelsPreProcess::SingleLayerPreprocess (void* pCtx, const SSourcePicture* kpSrc,
+    Scaled_Picture* pScaledPicture) {
+  sWelsEncCtx* pEncCtx = (sWelsEncCtx*)pCtx;
+  SWelsSvcCodingParam* pSvcParam    = pEncCtx->pSvcParam;
+  int8_t	iDependencyId			= pSvcParam->iNumDependencyLayer - 1;
+  int32_t iPicturePos	                    = pEncCtx->uiSpatialLayersInTemporal[iDependencyId] - 1;
+
+  SPicture* pSrcPic					= NULL;	// large: picture every layer is derived from
+  SPicture* pDstPic					= NULL;	// small: layer picture being produced
+  SDLayerParam* pDlayerParam					= NULL;
+  int32_t iSpatialNum					= 0;
+  int32_t iSrcWidth					= 0;
+  int32_t iSrcHeight					= 0;
+  int32_t iTargetWidth					= 0;
+  int32_t iTargetHeight					= 0;
+  int32_t iTemporalId = 0;
+  int32_t iActualSpatialLayerNum      = 0;
+
+  pDlayerParam = &pSvcParam->sDependencyLayers[iDependencyId];
+  iTargetWidth	  = pDlayerParam->iFrameWidth;
+  iTargetHeight  = pDlayerParam->iFrameHeight;
+  iTemporalId    = pDlayerParam->uiCodingIdx2TemporalId[pEncCtx->iCodingIndex & (pSvcParam->uiGopSize - 1)];
+  iSrcWidth   = pSvcParam->SUsedPicRect.iWidth;
+  iSrcHeight  = pSvcParam->SUsedPicRect.iHeight;
+  // Copy target: the staging picture if one is set, otherwise the highest layer's own slot.
+  pSrcPic = pScaledPicture->pScaledInputPicture ? pScaledPicture->pScaledInputPicture :
+            pEncCtx->pSpatialPic[iDependencyId][iPicturePos];
+
+  WelsMoveMemoryWrapper (pSvcParam, pSrcPic, kpSrc, iSrcWidth, iSrcHeight);
+
+  if (pSvcParam->bEnableDenoise)
+    BilateralDenoising (pSrcPic, iSrcWidth, iSrcHeight);
+
+  // highest layer: shrink to the scaled size only when a staging picture is set; otherwise pSrcPic is padded in place.
+  int32_t iShrinkWidth  = iSrcWidth;
+  int32_t iShrinkHeight = iSrcHeight;
+  pDstPic = pSrcPic;
+  if (pScaledPicture->pScaledInputPicture) {
+    // highest layer is downsampled from the staging picture
+    pDstPic		= pEncCtx->pSpatialPic[iDependencyId][iPicturePos];
+    iShrinkWidth = pScaledPicture->iScaledWidth[iDependencyId];
+    iShrinkHeight = pScaledPicture->iScaledHeight[iDependencyId];
+  }
+  DownsamplePadding (pSrcPic, pDstPic, iSrcWidth, iSrcHeight, iShrinkWidth, iShrinkHeight, iTargetWidth, iTargetHeight);
+  // Scene-change detection: only when enabled, not on an IDR-period frame, and when the masked coding index is 0.
+  if (pSvcParam->bEnableSceneChangeDetect && !pEncCtx->pVaa->bIdrPeriodFlag
+      && ! (pEncCtx->iCodingIndex & (pSvcParam->uiGopSize - 1))) {
+    SPicture* pRefPic = pEncCtx->pLtr[iDependencyId].bReceivedT0LostFlag ?
+                        pEncCtx->pSpatialPic[iDependencyId][pEncCtx->uiSpatialLayersInTemporal[iDependencyId] +
+                            pEncCtx->pVaa->uiValidLongTermPicIdx] : m_pLastSpatialPicture[iDependencyId][0];
+
+    pEncCtx->pVaa->bSceneChangeFlag = DetectSceneChange (pDstPic, pRefPic);
+  }
+  // Count the layers whose temporal id is valid at this position of the GOP.
+  for (int32_t i = 0; i < pSvcParam->iNumDependencyLayer; i++) {
+    if (pSvcParam->sDependencyLayers[i].uiCodingIdx2TemporalId[pEncCtx->iCodingIndex & (pSvcParam->uiGopSize - 1)]
+        != INVALID_TEMPORAL_ID) {
+      ++ iActualSpatialLayerNum;
+    }
+  }
+
+  if (iTemporalId != INVALID_TEMPORAL_ID) {
+    WelsUpdateSpatialIdxMap (pEncCtx, iActualSpatialLayerNum - 1, pDstPic, iDependencyId);
+    ++ iSpatialNum;
+    -- iActualSpatialLayerNum;
+  }
+
+  m_pLastSpatialPicture[iDependencyId][1]	= pEncCtx->pSpatialPic[iDependencyId][iPicturePos];
+  -- iDependencyId;
+
+  // generate the other (lower) spatial layers
+  // pSrcPic is
+  //	-- the padded input picture, if a staging picture is set (the highest layer was downsampled from it)
+  //	-- the highest layer itself, if no staging picture is set
+  if (pSvcParam->iNumDependencyLayer > 1) {
+    while (iDependencyId >= 0) {
+      pDlayerParam			= &pSvcParam->sDependencyLayers[iDependencyId];
+      iTargetWidth	= pDlayerParam->iFrameWidth;
+      iTargetHeight	= pDlayerParam->iFrameHeight;
+      iTemporalId = pDlayerParam->uiCodingIdx2TemporalId[pEncCtx->iCodingIndex & (pSvcParam->uiGopSize - 1)];
+      iPicturePos		= pEncCtx->uiSpatialLayersInTemporal[iDependencyId] - 1;
+
+      // FIXME: does NOT work for CGS
+      // this spatial layer is actually coded for the current frame
+      if ((iTemporalId != INVALID_TEMPORAL_ID)) {
+        // downsample from pSrcPic into this layer's slot
+        if (NULL == pSrcPic)	// NOTE(review): pSrcPic was already dereferenced by DownsamplePadding above
+          return -1;
+
+        pDstPic	= pEncCtx->pSpatialPic[iDependencyId][iPicturePos];	// small
+        iShrinkWidth = pScaledPicture->iScaledWidth[iDependencyId];
+        iShrinkHeight = pScaledPicture->iScaledHeight[iDependencyId];
+        DownsamplePadding (pSrcPic, pDstPic, iSrcWidth, iSrcHeight, iShrinkWidth, iShrinkHeight, iTargetWidth, iTargetHeight);
+
+        WelsUpdateSpatialIdxMap (pEncCtx, iActualSpatialLayerNum - 1, pDstPic, iDependencyId);
+
+        -- iActualSpatialLayerNum;
+        ++ iSpatialNum;
+
+        m_pLastSpatialPicture[iDependencyId][1]	= pEncCtx->pSpatialPic[iDependencyId][iPicturePos];
+      }
+      -- iDependencyId;
+    }
+  }
+
+  return iSpatialNum;
+}
+
+// Multi-input path: copies each caller-supplied picture into the spatial layer whose configured
+// size matches it, optionally denoises it, then may run scene-change detection on the last one.
+// Pictures must come in the same order as sDependencyLayers. Returns 0, or -1 if one fits no layer.
+int32_t CWelsPreProcess::MultiLayerPreprocess (void* pCtx, const SSourcePicture** kppSrcPicList,
+    const int32_t kiSpatialNum) {
+  sWelsEncCtx* pEncCtx = (sWelsEncCtx*)pCtx;
+  SWelsSvcCodingParam* pSvcParam = pEncCtx->pSvcParam;
+  const SSourcePicture* pSrc = NULL;
+  SPicture* pDstPic = NULL;
+  const int32_t iSpatialLayersCfgCount = pSvcParam->iNumDependencyLayer;  // layers configured
+  int32_t i = 0;   // index into kppSrcPicList
+  int32_t j = -1;  // index into sDependencyLayers, deliberately carried over between pictures
+
+  do {
+    pSrc = kppSrcPicList[i];
+
+    // Advance j to the next configured layer with this picture's size. The bound is tested before
+    // sDependencyLayers[j] is read, so an unmatched picture cannot index past the configured layers.
+    for (++ j; j < iSpatialLayersCfgCount; ++ j) {
+      if (pSvcParam->sDependencyLayers[j].iFrameWidth == pSrc->iPicWidth &&
+          pSvcParam->sDependencyLayers[j].iFrameHeight == pSrc->iPicHeight)
+        break;
+    }
+
+    assert (j < iSpatialLayersCfgCount);
+    if (j >= iSpatialLayersCfgCount)  // assert() is compiled out under NDEBUG; never index pSpatialPic[j] then
+      return -1;
+    pDstPic = pEncCtx->pSpatialPic[j][pEncCtx->uiSpatialLayersInTemporal[j] - 1];
+    WelsUpdateSpatialIdxMap (pEncCtx, i, pDstPic, j);
+    WelsMoveMemoryWrapper (pSvcParam, pDstPic, pSrc, pSrc->iPicWidth, pSrc->iPicHeight);
+    if (pSvcParam->bEnableDenoise)
+      BilateralDenoising (pDstPic, pSrc->iPicWidth, pSrc->iPicHeight);
+
+    m_pLastSpatialPicture[j][1] = pDstPic;
+    ++ i;
+  } while (i < kiSpatialNum);
+
+  // NOTE(review): pDstPic is the last picture copied while pRef comes from layer 0 - confirm this pairing.
+  if (pSvcParam->bEnableSceneChangeDetect && (kiSpatialNum == pSvcParam->iNumDependencyLayer)
+      && !pEncCtx->pVaa->bIdrPeriodFlag) {
+    SPicture* pRef = pEncCtx->pLtr[0].bReceivedT0LostFlag ?
+                     pEncCtx->pSpatialPic[0][pEncCtx->uiSpatialLayersInTemporal[0] + pEncCtx->pVaa->uiValidLongTermPicIdx] :
+                     m_pLastSpatialPicture[0][0];
+    pEncCtx->pVaa->bSceneChangeFlag = DetectSceneChange (pDstPic, pRef);
+  }
+
+  return 0;
+}
+
+/*!
+ * \brief	Decide whether the input picture must be downsampled for the highest dependency layer,
+ *        and store the aspect-ratio-preserving scaled size of every layer that is downsampled.
+ * \return	true when the highest layer is smaller than the input in width or in height
+ * \note	Layers are visited from the highest index down to 0.
+ */
+bool_t JudgeNeedOfScaling (SWelsSvcCodingParam* pParam, Scaled_Picture* pScaledPicture) {
+  const int32_t kiSrcW     = pParam->SUsedPicRect.iWidth;
+  const int32_t kiSrcH     = pParam->SUsedPicRect.iHeight;
+  const int32_t kiTopLayer = pParam->iNumDependencyLayer - 1;
+  const int32_t kiTopW     = pParam->sDependencyLayers[kiTopLayer].iActualWidth;
+  const int32_t kiTopH     = pParam->sDependencyLayers[kiTopLayer].iActualHeight;
+
+  // A highest layer at least as large as the input needs no downsampling; its
+  // iScaledWidth/iScaledHeight entries are not written.
+  const bool_t kbNeedDownsampling = ! (kiTopW >= kiSrcW && kiTopH >= kiSrcH);
+  int32_t iLayer = kbNeedDownsampling ? kiTopLayer : kiTopLayer - 1;
+
+  while (iLayer >= 0) {
+    const SDLayerParam* pLayer = &pParam->sDependencyLayers[iLayer];
+    const int32_t kiDstW       = pLayer->iActualWidth;
+    const int32_t kiDstH       = pLayer->iActualHeight;
+    const int32_t kiSrcWxDstH  = kiSrcW * kiDstH;
+    const int32_t kiSrcHxDstW  = kiSrcH * kiDstW;
+
+    // Cross-multiplied aspect-ratio comparison: one target dimension is kept and the other is
+    // derived from the input's proportions.
+    const bool_t kbKeepWidth = (kiSrcWxDstH > kiSrcHxDstW);
+    pScaledPicture->iScaledWidth[iLayer]  = kbKeepWidth ? kiDstW : kiSrcWxDstH / kiSrcH;
+    pScaledPicture->iScaledHeight[iLayer] = kbKeepWidth ? kiSrcHxDstW / kiSrcW : kiDstH;
+    -- iLayer;
+  }
+
+  return kbNeedDownsampling;
+}
+
+// Computes the per-layer scaled sizes and, only when the input needs downsampling, allocates the
+// input-sized pScaledInputPicture. Returns 0 on success and -1 if that allocation fails.
+int32_t  WelsInitScaledPic (SWelsSvcCodingParam* pParam,  Scaled_Picture*  pScaledPicture, CMemoryAlign* pMemoryAlign) {
+  if (!JudgeNeedOfScaling (pParam, pScaledPicture))
+    return 0;  // pScaledInputPicture is not written on this path
+
+  pScaledPicture->pScaledInputPicture = AllocPicture (pMemoryAlign, pParam->SUsedPicRect.iWidth,
+                                        pParam->SUsedPicRect.iHeight, false);
+  return (pScaledPicture->pScaledInputPicture == NULL) ? -1 : 0;
+}
+
+// Frees pScaledInputPicture through FreePicture when it is set, then clears the pointer.
+void  FreeScaledPic (Scaled_Picture*  pScaledPicture, CMemoryAlign* pMemoryAlign) {
+  if (pScaledPicture->pScaledInputPicture == NULL) return;
+  FreePicture (pMemoryAlign, &pScaledPicture->pScaledInputPicture);
+  pScaledPicture->pScaledInputPicture = NULL;
+}
+
+// Seeds m_pLastSpatialPicture. For each configured layer, slot 0 takes the pSpatialPic entry at
+// index (uiSpatialLayersInTemporal - 2) and slot 1 is cleared; every remaining layer up to
+// MAX_DEPENDENCY_LAYER has both slots cleared. Always returns 0.
+int32_t CWelsPreProcess::InitLastSpatialPictures (void* pCtx) {
+  sWelsEncCtx* pEncCtx       = (sWelsEncCtx*)pCtx;
+  const int32_t kiConfigured = pEncCtx->pSvcParam->iNumDependencyLayer;
+  int32_t iLayer             = 0;
+  while (iLayer < kiConfigured) {
+    const int32_t kiSlot = pEncCtx->uiSpatialLayersInTemporal[iLayer] - 2;
+    m_pLastSpatialPicture[iLayer][0] = pEncCtx->pSpatialPic[iLayer][kiSlot];
+    m_pLastSpatialPicture[iLayer][1] = NULL;
+    ++ iLayer;
+  }
+  for (; iLayer < MAX_DEPENDENCY_LAYER; ++ iLayer)
+    m_pLastSpatialPicture[iLayer][0] = m_pLastSpatialPicture[iLayer][1] = NULL;
+  return 0;
+}
+//*********************************************************************************************************/
+
+// Colour-space conversion is not supported yet: this stub ignores its arguments and returns 1.
+int32_t CWelsPreProcess::ColorspaceConvert (SWelsSvcCodingParam* pSvcParam, SPicture* pDstPic,
+    const SSourcePicture* kpSrc, const int32_t kiWidth, const int32_t kiHeight) {
+  return 1;
+}
+
+// Runs the VP METHOD_DENOISE pass over the kiWidth x kiHeight area of pSrc (all three planes).
+void CWelsPreProcess::BilateralDenoising (SPicture* pSrc, const int32_t kiWidth, const int32_t kiHeight) {
+  SPixMap sPixMap = {0};
+
+  // Describe the three planes of pSrc to the video-processing module.
+  for (int32_t iPlane = 0; iPlane < 3; ++ iPlane) {
+    sPixMap.pPixel[iPlane]  = pSrc->pData[iPlane];
+    sPixMap.iStride[iPlane] = pSrc->iLineSize[iPlane];
+  }
+  sPixMap.iSizeInBits       = g_kiPixMapSizeInBits;
+  sPixMap.sRect.iRectWidth  = kiWidth;
+  sPixMap.sRect.iRectHeight = kiHeight;
+  sPixMap.eFormat           = VIDEO_FORMAT_I420;
+
+  // Only a source map is supplied: Process() receives NULL as its third argument.
+  m_pInterfaceVp->Process (METHOD_DENOISE, &sPixMap, NULL);
+}
+
+// Runs the VP scene-change detector on plane 0 of the current and reference pictures.
+// Returns the detector's flag, or false when Process() returns a non-zero status.
+bool_t CWelsPreProcess::DetectSceneChange (SPicture* pCurPicture, SPicture* pRefPicture) {
+  const int32_t kiMethod = METHOD_SCENE_CHANGE_DETECTION;
+  SSceneChangeResult sResult = {0};
+  SPixMap sCurMap = {0};
+  SPixMap sRefMap = {0};
+
+  // Only plane 0 is described in each map; the other plane entries stay zeroed.
+  sCurMap.pPixel[0]         = pCurPicture->pData[0];
+  sCurMap.iStride[0]        = pCurPicture->iLineSize[0];
+  sCurMap.sRect.iRectWidth  = pCurPicture->iWidthInPixel;
+  sCurMap.sRect.iRectHeight = pCurPicture->iHeightInPixel;
+  sCurMap.iSizeInBits       = g_kiPixMapSizeInBits;
+  sCurMap.eFormat           = VIDEO_FORMAT_I420;
+
+  sRefMap.pPixel[0]         = pRefPicture->pData[0];
+  sRefMap.iStride[0]        = pRefPicture->iLineSize[0];
+  sRefMap.sRect.iRectWidth  = pRefPicture->iWidthInPixel;
+  sRefMap.sRect.iRectHeight = pRefPicture->iHeightInPixel;
+  sRefMap.iSizeInBits       = g_kiPixMapSizeInBits;
+  sRefMap.eFormat           = VIDEO_FORMAT_I420;
+
+  if (m_pInterfaceVp->Process (kiMethod, &sCurMap, &sRefMap) != 0)
+    return false;
+
+  // The result is fetched only after Process() returned 0.
+  m_pInterfaceVp->Get (kiMethod, (void*)&sResult);
+  return sResult.bSceneChangeFlag ? true : false;
+}
+
+// Produces one spatial layer from pSrc: when the shrink size differs from the source size the
+// picture is downsampled into pDstPic, otherwise the source planes themselves are used; the
+// result is then padded to the target size. Returns the Process() status (0 if not downsampled).
+int32_t CWelsPreProcess::DownsamplePadding (SPicture* pSrc, SPicture* pDstPic,  int32_t iSrcWidth, int32_t iSrcHeight,
+    int32_t iShrinkWidth, int32_t iShrinkHeight, int32_t iTargetWidth, int32_t iTargetHeight) {
+  int32_t iStatus = 0;
+  SPixMap sSrcMap = {0};
+  SPixMap sOutMap = {0};
+
+  // Describe the three source planes.
+  for (int32_t iPlane = 0; iPlane < 3; ++ iPlane) {
+    sSrcMap.pPixel[iPlane]  = pSrc->pData[iPlane];
+    sSrcMap.iStride[iPlane] = pSrc->iLineSize[iPlane];
+  }
+  sSrcMap.iSizeInBits       = g_kiPixMapSizeInBits;
+  sSrcMap.sRect.iRectWidth  = iSrcWidth;
+  sSrcMap.sRect.iRectHeight = iSrcHeight;
+  sSrcMap.eFormat           = VIDEO_FORMAT_I420;
+
+  if (iSrcWidth == iShrinkWidth && iSrcHeight == iShrinkHeight) {
+    // Same size: no scaling, the padding below operates directly on the source planes.
+    memcpy (&sOutMap, &sSrcMap, sizeof (sOutMap));	// confirmed_safe_unsafe_usage
+  } else {
+    for (int32_t iPlane = 0; iPlane < 3; ++ iPlane) {
+      sOutMap.pPixel[iPlane]  = pDstPic->pData[iPlane];
+      sOutMap.iStride[iPlane] = pDstPic->iLineSize[iPlane];
+    }
+    sOutMap.iSizeInBits       = g_kiPixMapSizeInBits;
+    sOutMap.sRect.iRectWidth  = iShrinkWidth;
+    sOutMap.sRect.iRectHeight = iShrinkHeight;
+    sOutMap.eFormat           = VIDEO_FORMAT_I420;
+
+    iStatus = m_pInterfaceVp->Process (METHOD_DOWNSAMPLE, &sSrcMap, &sOutMap);
+  }
+
+  // Padding starts from even dimensions, so a trailing odd column/row is dropped first.
+  iShrinkWidth  -= (iShrinkWidth & 1);
+  iShrinkHeight -= (iShrinkHeight & 1);
+  Padding ((uint8_t*)sOutMap.pPixel[0], (uint8_t*)sOutMap.pPixel[1], (uint8_t*)sOutMap.pPixel[2],
+           sOutMap.iStride[0], sOutMap.iStride[1], iShrinkWidth, iTargetWidth, iShrinkHeight, iTargetHeight);
+
+  return iStatus;
+}
+
+//*********************************************************************************************************/
+// Stores the two luma pointers in pVaaInfo->sVaaCalcInfo and runs the VP VAA-statistics method
+// on plane 0 of the current/reference pictures; the three flags select what is calculated.
+void CWelsPreProcess::VaaCalculation (SVAAFrameInfo* pVaaInfo, SPicture* pCurPicture, SPicture* pRefPicture,
+                                      bool_t bCalculateSQDiff, bool_t bCalculateVar, bool_t bCalculateBGD) {
+  const int32_t kiMethod = METHOD_VAA_STATISTICS;
+  SPixMap sCurMap = {0};
+  SPixMap sRefMap = {0};
+  SVAACalcParam sParam = {0};
+
+  pVaaInfo->sVaaCalcInfo.pCurY = pCurPicture->pData[0];
+  pVaaInfo->sVaaCalcInfo.pRefY = pRefPicture->pData[0];
+
+  sCurMap.pPixel[0]         = pCurPicture->pData[0];
+  sCurMap.iStride[0]        = pCurPicture->iLineSize[0];
+  sCurMap.sRect.iRectWidth  = pCurPicture->iWidthInPixel;
+  sCurMap.sRect.iRectHeight = pCurPicture->iHeightInPixel;
+  sCurMap.iSizeInBits       = g_kiPixMapSizeInBits;
+  sCurMap.eFormat           = VIDEO_FORMAT_I420;
+
+  sRefMap.pPixel[0]         = pRefPicture->pData[0];
+  sRefMap.iStride[0]        = pRefPicture->iLineSize[0];
+  sRefMap.sRect.iRectWidth  = pRefPicture->iWidthInPixel;
+  sRefMap.sRect.iRectHeight = pRefPicture->iHeightInPixel;
+  sRefMap.iSizeInBits       = g_kiPixMapSizeInBits;
+  sRefMap.eFormat           = VIDEO_FORMAT_I420;
+
+  sParam.pCalcResult = &pVaaInfo->sVaaCalcInfo;
+  sParam.iCalcSsd    = bCalculateSQDiff;
+  sParam.iCalcVar    = bCalculateVar;
+  sParam.iCalcBgd    = bCalculateBGD;
+  m_pInterfaceVp->Set (kiMethod, &sParam);
+  m_pInterfaceVp->Process (kiMethod, &sCurMap, &sRefMap);
+}
+
+// With bDetectFlag set: records the picture geometry and plane pointers in pVaaInfo and runs the
+// VP background-detection method on all three planes. Without it: zeroes pVaaBackgroundMbFlag.
+void CWelsPreProcess::BackgroundDetection (SVAAFrameInfo* pVaaInfo, SPicture* pCurPicture, SPicture* pRefPicture,
+    bool_t bDetectFlag) {
+  if (!bDetectFlag) {
+    // One flag byte per 16x16 block, with both dimensions rounded up.
+    const int32_t kiWidthInMb  = (pCurPicture->iWidthInPixel + 15) >> 4;
+    const int32_t kiHeightInMb = (pCurPicture->iHeightInPixel + 15) >> 4;
+    memset (pVaaInfo->pVaaBackgroundMbFlag, 0, kiWidthInMb * kiHeightInMb);
+    return;
+  }
+
+  // Geometry and plane pointers are also stored in pVaaInfo itself.
+  pVaaInfo->iPicWidth    = pCurPicture->iWidthInPixel;
+  pVaaInfo->iPicHeight   = pCurPicture->iHeightInPixel;
+  pVaaInfo->iPicStride   = pCurPicture->iLineSize[0];
+  pVaaInfo->iPicStrideUV = pCurPicture->iLineSize[1];
+  pVaaInfo->pCurY        = pCurPicture->pData[0];
+  pVaaInfo->pCurU        = pCurPicture->pData[1];
+  pVaaInfo->pCurV        = pCurPicture->pData[2];
+  pVaaInfo->pRefY        = pRefPicture->pData[0];
+  pVaaInfo->pRefU        = pRefPicture->pData[1];
+  pVaaInfo->pRefV        = pRefPicture->pData[2];
+
+  // Describe all three planes of both pictures to the video-processing module.
+  SPixMap sCurMap = {0};
+  SPixMap sRefMap = {0};
+  for (int32_t iPlane = 0; iPlane < 3; ++ iPlane) {
+    sCurMap.pPixel[iPlane]  = pCurPicture->pData[iPlane];
+    sCurMap.iStride[iPlane] = pCurPicture->iLineSize[iPlane];
+    sRefMap.pPixel[iPlane]  = pRefPicture->pData[iPlane];
+    sRefMap.iStride[iPlane] = pRefPicture->iLineSize[iPlane];
+  }
+  sCurMap.iSizeInBits       = g_kiPixMapSizeInBits;
+  sCurMap.sRect.iRectWidth  = pCurPicture->iWidthInPixel;
+  sCurMap.sRect.iRectHeight = pCurPicture->iHeightInPixel;
+  sCurMap.eFormat           = VIDEO_FORMAT_I420;
+
+  sRefMap.iSizeInBits       = g_kiPixMapSizeInBits;
+  sRefMap.sRect.iRectWidth  = pRefPicture->iWidthInPixel;
+  sRefMap.sRect.iRectHeight = pRefPicture->iHeightInPixel;
+  sRefMap.eFormat           = VIDEO_FORMAT_I420;
+
+  // Hand the flag buffer and the VAA statistics block to the detector.
+  SBGDInterface sBgdParam = {0};
+  sBgdParam.pBackgroundMbFlag = pVaaInfo->pVaaBackgroundMbFlag;
+  sBgdParam.pCalcRes          = & (pVaaInfo->sVaaCalcInfo);
+
+  const int32_t kiMethod = METHOD_BACKGROUND_DETECTION;
+  m_pInterfaceVp->Set (kiMethod, (void*)&sBgdParam);
+  m_pInterfaceVp->Process (kiMethod, &sCurMap, &sRefMap);
+}
+
+// Runs the VP adaptive-quant method on plane 0 of two pictures. Parameters are passed, and the
+// result read back (only if Process() returns 0), through pVaaInfo->sAdaptiveQuantParam.
+void CWelsPreProcess::AdaptiveQuantCalculation (SVAAFrameInfo* pVaaInfo, SPicture* pCurPicture, SPicture* pRefPicture) {
+  const int32_t kiMethod = METHOD_ADAPTIVE_QUANT;
+  SPixMap sCurMap = {0};
+  SPixMap sRefMap = {0};
+
+  pVaaInfo->sAdaptiveQuantParam.pCalcResult = & (pVaaInfo->sVaaCalcInfo);
+  pVaaInfo->sAdaptiveQuantParam.dAverMotionTextureIndexToDeltaQp = 0;
+
+  // Only plane 0 is described in each map.
+  sCurMap.pPixel[0]         = pCurPicture->pData[0];
+  sCurMap.iStride[0]        = pCurPicture->iLineSize[0];
+  sCurMap.sRect.iRectWidth  = pCurPicture->iWidthInPixel;
+  sCurMap.sRect.iRectHeight = pCurPicture->iHeightInPixel;
+  sCurMap.iSizeInBits       = g_kiPixMapSizeInBits;
+  sCurMap.eFormat           = VIDEO_FORMAT_I420;
+
+  sRefMap.pPixel[0]         = pRefPicture->pData[0];
+  sRefMap.iStride[0]        = pRefPicture->iLineSize[0];
+  sRefMap.sRect.iRectWidth  = pRefPicture->iWidthInPixel;
+  sRefMap.sRect.iRectHeight = pRefPicture->iHeightInPixel;
+  sRefMap.iSizeInBits       = g_kiPixMapSizeInBits;
+  sRefMap.eFormat           = VIDEO_FORMAT_I420;
+
+  // Set()'s status is not checked; only Process() gates the Get() read-back.
+  m_pInterfaceVp->Set (kiMethod, (void*) & (pVaaInfo->sAdaptiveQuantParam));
+  if (m_pInterfaceVp->Process (kiMethod, &sCurMap, &sRefMap) == 0)
+    m_pInterfaceVp->Get (kiMethod, (void*) & (pVaaInfo->sAdaptiveQuantParam));
+}
+
+// Points *pRefMbTypeArray at the uiRefMbType of the first qualifying reference picture of the
+// current dependency layer; leaves it untouched when none qualifies. iRefPicType is not used.
+void CWelsPreProcess::SetRefMbType (void* pCtx, uint32_t** pRefMbTypeArray, int32_t iRefPicType) {
+  sWelsEncCtx* pEncCtx = (sWelsEncCtx*)pCtx;
+  const uint8_t kuiTid = pEncCtx->uiTemporalId;
+  const uint8_t kuiDid = pEncCtx->uiDependencyId;
+  SRefList* pRefList   = pEncCtx->ppRefPicListExt[kuiDid];
+  if (pEncCtx->pSvcParam->bEnableLongTermReference && pEncCtx->pLtr[kuiDid].bReceivedT0LostFlag && kuiTid == 0) {
+    for (uint8_t i = 0; i < pRefList->uiLongRefCount; i++) {
+      SPicture* pCand = pRefList->pLongRefList[i];
+      if (pCand == NULL || pCand->uiRecieveConfirmed != 1/*RECIEVE_SUCCESS*/)
+        continue;
+      *pRefMbTypeArray = pCand->uiRefMbType;
+      return;
+    }
+    return;
+  }
+
+  for (uint8_t i = 0; i < pRefList->uiShortRefCount; i++) {
+    SPicture* pCand = pRefList->pShortRefList[i];
+    if (pCand != NULL && pCand->bUsedAsRef && pCand->iFramePoc >= 0 && pCand->uiTemporalId <= kuiTid) {
+      *pRefMbTypeArray = pCand->uiRefMbType;
+      return;
+    }
+  }
+}
+
+
+// Rate-control complexity analysis for one dependency layer. The mode follows iRCMode and slice
+// type (RC_MODE0+P: FRAME_SAD, RC_MODE1+P: GOM_SAD, RC_MODE1+I: GOM_VAR); any other combination
+// returns without doing anything. Parameters and results travel in pVaa->sComplexityAnalysisParam.
+void CWelsPreProcess::AnalyzePictureComplexity (void* pCtx, SPicture* pCurPicture, SPicture* pRefPicture,
+    const int32_t kiDependencyId, const bool_t bCalculateBGD) {
+  sWelsEncCtx* pEncCtx             = (sWelsEncCtx*)pCtx;
+  SWelsSvcCodingParam* pSvcParam   = pEncCtx->pSvcParam;
+  SVAAFrameInfo* pVaaInfo          = pEncCtx->pVaa;
+  SComplexityAnalysisParam* pParam = & (pVaaInfo->sComplexityAnalysisParam);
+  SWelsSvcRc* pRc                  = &pEncCtx->pWelsSvcRc[kiDependencyId];
+  const bool_t kbPSlice            = (pEncCtx->eSliceType == P_SLICE);
+  const bool_t kbISlice            = (pEncCtx->eSliceType == I_SLICE);
+  int32_t iMode                    = 0;
+
+  if (pSvcParam->iRCMode == RC_MODE0 && kbPSlice)
+    iMode = FRAME_SAD;
+  else if (pSvcParam->iRCMode == RC_MODE1 && kbPSlice)
+    iMode = GOM_SAD;
+  else if (pSvcParam->iRCMode == RC_MODE1 && kbISlice)
+    iMode = GOM_VAR;
+  else
+    return;
+
+  pParam->iComplexityAnalysisMode = iMode;
+  pParam->pCalcResult             = & (pVaaInfo->sVaaCalcInfo);
+  pParam->pBackgroundMbFlag       = pVaaInfo->pVaaBackgroundMbFlag;
+  SetRefMbType (pEncCtx, & (pParam->uiRefMbType), pRefPicture->iPictureType);
+  pParam->iCalcBgd                = bCalculateBGD;
+  pParam->iFrameComplexity        = 0;
+
+  // Reset the per-GOM buffers; the SAD buffer is left untouched in FRAME_SAD mode.
+  memset (pRc->pGomForegroundBlockNum, 0, pRc->iGomSize * sizeof (int32_t));
+  if (iMode != FRAME_SAD)
+    memset (pRc->pCurrentFrameGomSad, 0, pRc->iGomSize * sizeof (int32_t));
+
+  pParam->pGomComplexity         = pRc->pCurrentFrameGomSad;
+  pParam->pGomForegroundBlockNum = pRc->pGomForegroundBlockNum;
+  pParam->iMbNumInGom            = pRc->iNumberMbGom;
+
+  const int32_t kiMethod = METHOD_COMPLEXITY_ANALYSIS;
+  SPixMap sCurMap = {0};
+  SPixMap sRefMap = {0};
+  sCurMap.pPixel[0]         = pCurPicture->pData[0];
+  sCurMap.iStride[0]        = pCurPicture->iLineSize[0];
+  sCurMap.sRect.iRectWidth  = pCurPicture->iWidthInPixel;
+  sCurMap.sRect.iRectHeight = pCurPicture->iHeightInPixel;
+  sCurMap.iSizeInBits       = g_kiPixMapSizeInBits;
+  sCurMap.eFormat           = VIDEO_FORMAT_I420;
+
+  sRefMap.pPixel[0]         = pRefPicture->pData[0];
+  sRefMap.iStride[0]        = pRefPicture->iLineSize[0];
+  sRefMap.sRect.iRectWidth  = pRefPicture->iWidthInPixel;
+  sRefMap.sRect.iRectHeight = pRefPicture->iHeightInPixel;
+  sRefMap.iSizeInBits       = g_kiPixMapSizeInBits;
+  sRefMap.eFormat           = VIDEO_FORMAT_I420;
+
+  // Set()'s status is not checked; only Process() gates the Get() read-back.
+  m_pInterfaceVp->Set (kiMethod, (void*)pParam);
+  if (m_pInterfaceVp->Process (kiMethod, &sCurMap, &sRefMap) == 0)
+    m_pInterfaceVp->Get (kiMethod, (void*)pParam);
+}
+
+// Pads an I420 picture from its actual size to the padded size: luma with 0, chroma with 0x80.
+// Rows below iActualHeight come first (actual width only), then the columns right of iActualWidth.
+void  CWelsPreProcess::Padding (uint8_t* pSrcY, uint8_t* pSrcU, uint8_t* pSrcV, int32_t iStrideY, int32_t iStrideUV,
+                                int32_t iActualWidth, int32_t iPaddingWidth, int32_t iActualHeight, int32_t iPaddingHeight) {
+  // Bottom strip; the loop body never runs when iPaddingHeight <= iActualHeight.
+  for (int32_t iRow = iActualHeight; iRow < iPaddingHeight; ++ iRow) {
+    memset (pSrcY + iRow * iStrideY, 0, iActualWidth);
+    if ((iRow & 1) == 0) {  // one chroma row for every two luma rows
+      memset (pSrcU + iRow / 2 * iStrideUV, 0x80, iActualWidth / 2);
+      memset (pSrcV + iRow / 2 * iStrideUV, 0x80, iActualWidth / 2);
+    }
+  }
+
+  if (iPaddingWidth <= iActualWidth)
+    return;
+
+  const int32_t kiExtraY = iPaddingWidth - iActualWidth;
+  for (int32_t iRow = 0; iRow < iPaddingHeight; ++ iRow) {
+    memset (pSrcY + iRow * iStrideY + iActualWidth, 0, kiExtraY);
+    if ((iRow & 1) == 0) {
+      memset (pSrcU + iRow / 2 * iStrideUV + iActualWidth / 2, 0x80, kiExtraY / 2);
+      memset (pSrcV + iRow / 2 * iStrideUV + iActualWidth / 2, 0x80, kiExtraY / 2);
+    }
+  }
+}
+
+
+//TODO: may be optimized later
+//TODO: are these wrappers still needed?
+void* WelsMemcpy (void* dst, const void* kpSrc, uint32_t uiSize) {
+  return ::memcpy (dst, kpSrc, uiSize);	// plain forwarder to the global memcpy
+}
+void* WelsMemset (void* p, int32_t val, uint32_t uiSize) {
+  return ::memset (p, val, uiSize);	// plain forwarder to the global memset
+}
+
+// i420_to_i420_c: plain C row-by-row copy of an I420 picture (Y at full size, U and V at
+// half width and half height) between buffers whose strides may differ.
+void  WelsMoveMemory_c (uint8_t* pDstY, uint8_t* pDstU, uint8_t* pDstV,  int32_t iDstStrideY, int32_t iDstStrideUV,
+                        uint8_t* pSrcY, uint8_t* pSrcU, uint8_t* pSrcV, int32_t iSrcStrideY, int32_t iSrcStrideUV, int32_t iWidth,
+                        int32_t iHeight) {
+  const int32_t kiChromaWidth = iWidth >> 1;
+  int32_t iRowsLeft           = iHeight;
+  while (iRowsLeft != 0) {
+    WelsMemcpy (pDstY, pSrcY, iWidth);
+    pDstY += iDstStrideY;
+    pSrcY += iSrcStrideY;
+    -- iRowsLeft;
+  }
+
+  for (iRowsLeft = iHeight >> 1; iRowsLeft != 0; -- iRowsLeft) {
+    WelsMemcpy (pDstU, pSrcU, kiChromaWidth);
+    pDstU += iDstStrideUV;
+    pSrcU += iSrcStrideUV;
+    WelsMemcpy (pDstV, pSrcV, kiChromaWidth);
+    pDstV += iDstStrideUV;
+    pSrcV += iSrcStrideUV;
+  }
+}
+// VP padding of a single plane with uiStuffValue:
+//  - columns [iCurWidth, iTargetWidth) of the first iCurHeight rows, then
+//  - rows [iCurHeight, iTargetHeight) over the full iTargetWidth.
+void  VPpadding (uint8_t* pSrcPtr, int32_t iCurWidth, int32_t iTargetWidth, int32_t iCurHeight, int32_t iTargetHeight,
+                 int32_t iStride, uint8_t uiStuffValue) {
+  // Right-hand strip of the rows that already hold picture data.
+  if (iTargetWidth > iCurWidth) {
+    const int32_t kiExtraCols = iTargetWidth - iCurWidth;
+    uint8_t* pRowTail         = pSrcPtr + iCurWidth;
+    for (int32_t iRow = 0; iRow < iCurHeight; ++ iRow, pRowTail += iStride)
+      WelsMemset (pRowTail, uiStuffValue, kiExtraCols);
+  }
+
+  // Additional rows below the picture data.
+  if (iTargetHeight > iCurHeight) {
+    uint8_t* pNewRow = pSrcPtr + iCurHeight * iStride;
+    for (int32_t iRow = iCurHeight; iRow < iTargetHeight; ++ iRow, pNewRow += iStride)
+      WelsMemset (pNewRow, uiStuffValue, iTargetWidth);
+  }
+}
+
+
+// Copies the used rectangle of an I420 source picture (origin taken from pSvcParam->SUsedPicRect)
+// into pDstPic and pads up to the target size with luma 0 / chroma 0x80. The copied size is the
+// source size clamped to the target and rounded down to even. Returns without touching pDstPic
+// on a non-I420 format, a NULL plane pointer, or sizes/strides that fail the checks below.
+void  CWelsPreProcess::WelsMoveMemoryWrapper (SWelsSvcCodingParam* pSvcParam, SPicture* pDstPic,
+    const SSourcePicture* kpSrc,
+    const int32_t kiTargetWidth, const int32_t kiTargetHeight) {
+  if (VIDEO_FORMAT_I420 != (kpSrc->iColorFormat & (~VIDEO_FORMAT_VFlip)))
+    return;
+
+  // Test the plane pointers before any offset is added: a NULL base plus a non-zero crop offset
+  // is no longer NULL and would slip through a check made on the offset pointer.
+  if (kpSrc->pData[0] == NULL || kpSrc->pData[1] == NULL || kpSrc->pData[2] == NULL
+      || pDstPic->pData[0] == NULL || pDstPic->pData[1] == NULL || pDstPic->pData[2] == NULL)
+    return;
+
+  int32_t  iSrcWidth       = kpSrc->iPicWidth;
+  int32_t  iSrcHeight      = kpSrc->iPicHeight;
+
+  if (iSrcHeight > kiTargetHeight)  iSrcHeight = kiTargetHeight;
+  if (iSrcWidth > kiTargetWidth)    iSrcWidth  = kiTargetWidth;
+
+  // Round the copied area down to even dimensions (the chroma planes are copied at half size).
+  if (iSrcWidth & 0x1)    -- iSrcWidth;
+  if (iSrcHeight & 0x1)   -- iSrcHeight;
+
+  const int32_t kiSrcTopOffsetY   = pSvcParam->SUsedPicRect.iTop;
+  const int32_t kiSrcLeftOffsetY  = pSvcParam->SUsedPicRect.iLeft;
+  const int32_t kiSrcTopOffsetUV  = (kiSrcTopOffsetY >> 1);
+  const int32_t kiSrcLeftOffsetUV = (kiSrcLeftOffsetY >> 1);
+  const int32_t kiSrcStrideY      = kpSrc->iStride[0];
+  const int32_t kiSrcStrideUV     = kpSrc->iStride[1];
+  const int32_t kiDstStrideY      = pDstPic->iLineSize[0];
+  const int32_t kiDstStrideUV     = pDstPic->iLineSize[1];
+
+#define MAX_WIDTH      (4096)
+#define MAX_HEIGHT     (2304)//MAX_FS_LEVEL51 (36864); MAX_FS_LEVEL51*256/4096 = 2304
+  // Reject sizes outside the supported range or wider than the corresponding stride.
+  if (iSrcWidth <= 0 || iSrcWidth > MAX_WIDTH || iSrcHeight <= 0 || iSrcHeight > MAX_HEIGHT)
+    return;
+  if (kiSrcTopOffsetY >= iSrcHeight || kiSrcLeftOffsetY >= iSrcWidth || iSrcWidth > kiSrcStrideY)
+    return;
+  if (kiTargetWidth <= 0 || kiTargetWidth > MAX_WIDTH || kiTargetHeight <= 0 || kiTargetHeight > MAX_HEIGHT)
+    return;
+  if (kiTargetWidth > kiDstStrideY)
+    return;
+
+  // NOTE(review): the V offset uses iStride[2] while the copy below uses iStride[1] for both
+  // chroma planes - confirm the two strides are always equal.
+  uint8_t* pSrcY = kpSrc->pData[0] + (kpSrc->iStride[0] * kiSrcTopOffsetY + kiSrcLeftOffsetY);
+  uint8_t* pSrcU = kpSrc->pData[1] + (kpSrc->iStride[1] * kiSrcTopOffsetUV + kiSrcLeftOffsetUV);
+  uint8_t* pSrcV = kpSrc->pData[2] + (kpSrc->iStride[2] * kiSrcTopOffsetUV + kiSrcLeftOffsetUV);
+
+  uint8_t* pDstY = pDstPic->pData[0];
+  uint8_t* pDstU = pDstPic->pData[1];
+  uint8_t* pDstV = pDstPic->pData[2];
+
+  //i420_to_i420_c
+  WelsMoveMemory_c (pDstY,  pDstU,  pDstV,  kiDstStrideY, kiDstStrideUV,
+                    pSrcY,  pSrcU,  pSrcV, kiSrcStrideY, kiSrcStrideUV, iSrcWidth, iSrcHeight);
+
+  //in VP Process
+  if (kiTargetWidth > iSrcWidth || kiTargetHeight > iSrcHeight) {
+    const int32_t kiTargetWidthC  = (kiTargetWidth >> 1);
+    const int32_t kiTargetHeightC = (kiTargetHeight >> 1);
+    const int32_t kiSrcWidthC     = (iSrcWidth >> 1);
+    const int32_t kiSrcHeightC    = (iSrcHeight >> 1);
+
+    // padding pDstPic I420
+    VPpadding (pDstY, iSrcWidth, kiTargetWidth, iSrcHeight, kiTargetHeight, kiDstStrideY, 0);
+    VPpadding (pDstU, kiSrcWidthC, kiTargetWidthC, kiSrcHeightC, kiTargetHeightC, kiDstStrideUV, 0x80);
+    VPpadding (pDstV, kiSrcWidthC, kiTargetWidthC, kiSrcHeightC, kiTargetHeightC, kiDstStrideUV, 0x80);
+  }
+}
+
+//*********************************************************************************************************/
+} // namespace WelsSVCEnc
--- a/codec/encoder/plus/inc/welsCodecTrace.h
+++ b/codec/encoder/plus/inc/welsCodecTrace.h
@@ -37,42 +37,41 @@
 #include "typedefs.h"
 
 #ifdef WIN32
-typedef int32_t ( *CM_WELS_TRACE)( const str_t* format, ...);
+typedef int32_t (*CM_WELS_TRACE) (const str_t* format, ...);
 #else
-typedef int32_t ( *CM_WELS_TRACE2)( const str_t* dllname, const str_t* format, ...);
+typedef int32_t (*CM_WELS_TRACE2) (const str_t* dllname, const str_t* format, ...);
 #endif
 
-class welsCodecTrace
-{
-public:
-	welsCodecTrace();
-	~welsCodecTrace();
-	
-	static void TraceString(int32_t iLevel, const str_t* kpStrFormat);
-	static void CODEC_TRACE(void* pIgnore, const int32_t kiLevel, const str_t* kpStrFormat, va_list vl);
+class welsCodecTrace {
+ public:
+  welsCodecTrace();
+  ~welsCodecTrace();
 
-	void SetTraceLevel(const int32_t kiLevel);
-	int32_t WelsTraceModuleIsExist();
+  static void TraceString (int32_t iLevel, const str_t* kpStrFormat);
+  static void CODEC_TRACE (void* pIgnore, const int32_t kiLevel, const str_t* kpStrFormat, va_list vl);
 
-private:	
-	
-	int32_t m_WelsTraceExistFlag;
-	void* m_hTraceHandle;
+  void SetTraceLevel (const int32_t kiLevel);
+  int32_t WelsTraceModuleIsExist();
 
-public:
-	static int32_t	m_iTraceLevel;
+ private:
+
+  int32_t m_WelsTraceExistFlag;
+  void* m_hTraceHandle;
+
+ public:
+  static int32_t	m_iTraceLevel;
 #if defined WIN32
-	static CM_WELS_TRACE m_fpDebugTrace;
-	static CM_WELS_TRACE m_fpInfoTrace;
-	static CM_WELS_TRACE m_fpWarnTrace;
-	static CM_WELS_TRACE m_fpErrorTrace;
-#else	
-	static CM_WELS_TRACE2 m_fpDebugTrace;
-	static CM_WELS_TRACE2 m_fpInfoTrace;
-	static CM_WELS_TRACE2 m_fpWarnTrace;
-	static CM_WELS_TRACE2 m_fpErrorTrace;
+  static CM_WELS_TRACE m_fpDebugTrace;
+  static CM_WELS_TRACE m_fpInfoTrace;
+  static CM_WELS_TRACE m_fpWarnTrace;
+  static CM_WELS_TRACE m_fpErrorTrace;
+#else
+  static CM_WELS_TRACE2 m_fpDebugTrace;
+  static CM_WELS_TRACE2 m_fpInfoTrace;
+  static CM_WELS_TRACE2 m_fpWarnTrace;
+  static CM_WELS_TRACE2 m_fpErrorTrace;
 #endif
-	
+
 };
 
 #endif //WELS_CODEC_TRACE
--- a/codec/encoder/plus/inc/welsEncoderExt.h
+++ b/codec/encoder/plus/inc/welsEncoderExt.h
@@ -56,75 +56,74 @@
 
 class ISVCEncoder;
 namespace WelsSVCEnc {
-class CWelsH264SVCEncoder : public ISVCEncoder  
-{
-public:
-	CWelsH264SVCEncoder();
-	virtual ~CWelsH264SVCEncoder();
+class CWelsH264SVCEncoder : public ISVCEncoder {
+ public:
+  CWelsH264SVCEncoder();
+  virtual ~CWelsH264SVCEncoder();
 
-	/* Interfaces override from ISVCEncoder */
-	/*
-	 * return: CM_RETURN: 0 - success; otherwise - failed;
-	 */
-	virtual int Initialize(SVCEncodingParam* argv, const INIT_TYPE init_type);
-	virtual int Initialize(void* argv, const INIT_TYPE init_type);
+  /* Interfaces override from ISVCEncoder */
+  /*
+   * return: CM_RETURN: 0 - success; otherwise - failed;
+   */
+  virtual int Initialize (SVCEncodingParam* argv, const INIT_TYPE init_type);
+  virtual int Initialize (void* argv, const INIT_TYPE init_type);
 
-	virtual int Unintialize();
-	
-	/*
-	 * return: EVideoFrameType [IDR: videoFrameTypeIDR; P: videoFrameTypeP; ERROR: videoFrameTypeInvalid]
-	 */
-	virtual int EncodeFrame(const unsigned char* kpSrc, SFrameBSInfo* pBsInfo);
-	virtual int EncodeFrame(const SSourcePicture ** kppSrcPicList, int nSrcPicNum, SFrameBSInfo * pBsInfo);
-	
-	/*
-	 * return: 0 - success; otherwise - failed;
-	 */
-	virtual int PauseFrame(const unsigned char* pSrc, SFrameBSInfo* pBsInfo);	
-	
-	/*
-	 * return: 0 - success; otherwise - failed;
-	 */
-	virtual int ForceIntraFrame(bool bIDR);		
-	
-	/************************************************************************
-	 * InDataFormat, IDRInterval, SVC Encode Param, Frame Rate, Bitrate,..
-	 ************************************************************************/
-	/*
-	 * return: CM_RETURN: 0 - success; otherwise - failed;
-	 */
-	virtual int SetOption(ENCODER_OPTION opt_id, void* option);
-	virtual int GetOption(ENCODER_OPTION opt_id, void* option);	
+  virtual int Uninitialize();
 
-private:	
-	sWelsEncCtx	*m_pEncContext;
+  /*
+   * return: EVideoFrameType [IDR: videoFrameTypeIDR; P: videoFrameTypeP; ERROR: videoFrameTypeInvalid]
+   */
+  virtual int EncodeFrame (const unsigned char* kpSrc, SFrameBSInfo* pBsInfo);
+  virtual int EncodeFrame (const SSourcePicture** kppSrcPicList, int nSrcPicNum, SFrameBSInfo* pBsInfo);
 
-#if defined(WIN32)||defined(_MACH_PLATFORM)||defined(__GNUC__) 
-	welsCodecTrace			*m_pWelsTrace;
-#endif	
-	SSourcePicture			**m_pSrcPicList;
-	int32_t						m_iSrcListSize;
+  /*
+   * return: 0 - success; otherwise - failed;
+   */
+  virtual int PauseFrame (const unsigned char* pSrc, SFrameBSInfo* pBsInfo);
 
-	int32_t						m_iMaxPicWidth;
-	int32_t						m_iMaxPicHeight;
-	
-	int32_t						m_iCspInternal;
-	BOOL_T					m_bInitialFlag;	
+  /*
+   * return: 0 - success; otherwise - failed;
+   */
+  virtual int ForceIntraFrame (bool bIDR);
 
+  /************************************************************************
+   * InDataFormat, IDRInterval, SVC Encode Param, Frame Rate, Bitrate,..
+   ************************************************************************/
+  /*
+   * return: CM_RETURN: 0 - success; otherwise - failed;
+   */
+  virtual int SetOption (ENCODER_OPTION opt_id, void* option);
+  virtual int GetOption (ENCODER_OPTION opt_id, void* option);
+
+ private:
+  sWelsEncCtx*	m_pEncContext;
+
+#if defined(WIN32)||defined(_MACH_PLATFORM)||defined(__GNUC__)
+  welsCodecTrace*			m_pWelsTrace;
+#endif
+  SSourcePicture**			m_pSrcPicList;
+  int32_t						m_iSrcListSize;
+
+  int32_t						m_iMaxPicWidth;
+  int32_t						m_iMaxPicHeight;
+
+  int32_t						m_iCspInternal;
+  BOOL_T					m_bInitialFlag;
+
 #ifdef OUTPUT_BIT_STREAM
-	FILE*				m_pFileBs;
-	FILE*               m_pFileBsSize;
-	BOOL_T				m_bSwitch;
-	int32_t					m_iSwitchTimes;
+  FILE*				m_pFileBs;
+  FILE*               m_pFileBsSize;
+  BOOL_T				m_bSwitch;
+  int32_t					m_iSwitchTimes;
 #endif//OUTPUT_BIT_STREAM
 
 #ifdef REC_FRAME_COUNT
-   int32_t		m_uiCountFrameNum;
+  int32_t		m_uiCountFrameNum;
 #endif//REC_FRAME_COUNT
-	
-	void    InitEncoder( void );	
-	int32_t RawData2SrcPic(const uint8_t * pSrc);
-	void    DumpSrcPicture(const uint8_t *pSrc);
+
+  void    InitEncoder (void);
+  int32_t RawData2SrcPic (const uint8_t* pSrc);
+  void    DumpSrcPicture (const uint8_t* pSrc);
 };
 }
 #endif // !defined(AFX_WELSH264ENCODER_H__D9FAA1D1_5403_47E1_8E27_78F11EE65F02__INCLUDED_)
--- a/codec/encoder/plus/res/resource.h
+++ b/codec/encoder/plus/res/resource.h
@@ -4,7 +4,7 @@
 //
 
 // Next default values for new objects
-// 
+//
 #ifdef APSTUDIO_INVOKED
 #ifndef APSTUDIO_READONLY_SYMBOLS
 #define _APS_NEXT_RESOURCE_VALUE        101
--- a/codec/encoder/plus/res/welsenc.rc
+++ b/codec/encoder/plus/res/welsenc.rc
@@ -7,7 +7,7 @@
 //
 // Generated from the TEXTINCLUDE 2 resource.
 //
-#include "afxres.h"
+#include "windows.h"
 
 /////////////////////////////////////////////////////////////////////////////
 #undef APSTUDIO_READONLY_SYMBOLS
@@ -27,18 +27,18 @@
 // TEXTINCLUDE
 //
 
-1 TEXTINCLUDE 
+1 TEXTINCLUDE
 BEGIN
     "resource.h\0"
 END
 
-2 TEXTINCLUDE 
+2 TEXTINCLUDE
 BEGIN
-    "#include ""afxres.h""\r\n"
+    "#include ""windows.h""\r\n"
     "\0"
 END
 
-3 TEXTINCLUDE 
+3 TEXTINCLUDE
 BEGIN
     "\r\n"
     "\0"
--- a/codec/encoder/plus/src/DllEntry.cpp
+++ b/codec/encoder/plus/src/DllEntry.cpp
@@ -36,16 +36,14 @@
 // DLL Entry Point
 HANDLE g_hInstDll;
 
-BOOL WINAPI DllEntryPoint(HINSTANCE hInstance, DWORD dwReason, LPVOID lpReserved)
-{
-	g_hInstDll = hInstance;
-    if (DLL_PROCESS_ATTACH == dwReason)
-    {
-		DisableThreadLibraryCalls(hInstance);
-	}
+BOOL WINAPI DllEntryPoint (HINSTANCE hInstance, DWORD dwReason, LPVOID lpReserved) {
+  g_hInstDll = hInstance;
+  if (DLL_PROCESS_ATTACH == dwReason) {
+    DisableThreadLibraryCalls (hInstance);
+  }
 //	else if (DLL_PROCESS_DETACH == dwReason)
 //	{
-//	
+//
 //	}
-    return TRUE;
+  return TRUE;
 }
\ No newline at end of file
--- a/codec/encoder/plus/src/welsCodecTrace.cpp
+++ b/codec/encoder/plus/src/welsCodecTrace.cpp
@@ -56,6 +56,8 @@
 extern HANDLE g_hInstDll;
 #endif
 
+#include "logging.h"
+
 //#define CODEC_TRACE_ERROR 0
 //#define CODEC_TRACE_WARNING 1
 //#define CODEC_TRACE_INFO 2
@@ -64,85 +66,73 @@
 using namespace WelsSVCEnc;
 
 #ifdef MACOS
-static CFBundleRef LoadLibrary(const str_t* lpszbundle)
-{
-	// 1.get bundle path
-	str_t cBundlePath[PATH_MAX];
-	memset(cBundlePath, 0, PATH_MAX);
-	
-	Dl_info 	dlInfo;
-	static int32_t  sDummy;
-	dladdr((void*)&sDummy, &dlInfo);
-	
-	strlcpy(cBundlePath, dlInfo.dli_fname, PATH_MAX);	// confirmed_safe_unsafe_usage
-	
-	str_t * pPath = NULL;
-	for(int32_t i = 4; i > 0; i--)
-	{
-		pPath = strrchr(cBundlePath,'/');	// confirmed_safe_unsafe_usage
-		if(pPath)
-		{
-			*pPath = 0;
-		}
-		else
-		{
-			break;
-		}
-	}
-	if(pPath)
-	{
-		strlcat(cBundlePath, "/", PATH_MAX);	// confirmed_safe_unsafe_usage
-	}
-	else
-	{
-		return NULL;
-	}
-	
-	strlcat(cBundlePath, lpszbundle, PATH_MAX);	// confirmed_safe_unsafe_usage
-	
-	FSRef bundlePath;
-	OSStatus iStatus = FSPathMakeRef((uint8_t*)cBundlePath, &bundlePath, NULL);
-	if(noErr != iStatus)
-		return NULL;
-	
-	CFURLRef bundleURL = CFURLCreateFromFSRef(kCFAllocatorSystemDefault, &bundlePath);
-	if(NULL == bundleURL)
-		return NULL;
-	
-	// 2.get bundle pRef
-	CFBundleRef bundleRef = CFBundleCreate(kCFAllocatorSystemDefault, bundleURL);
-	CFRelease(bundleURL);
-	
+// Resolves lpszbundle relative to the on-disk location of this image and returns a CFBundle for it (NULL on failure).
+static CFBundleRef LoadLibrary (const str_t* lpszbundle) {
+  // 1.get bundle path
+  str_t cBundlePath[PATH_MAX];
+  memset (cBundlePath, 0, PATH_MAX);
+  Dl_info 	dlInfo;
+  static int32_t  sDummy;
+  if (0 == dladdr ((void*)&sDummy, &dlInfo) || NULL == dlInfo.dli_fname)
+    return NULL;	// dlInfo is only filled in on success; without a file name there is no base path
+  strlcpy (cBundlePath, dlInfo.dli_fname, PATH_MAX);	// confirmed_safe_unsafe_usage
+
+  str_t* pPath = NULL;
+  for (int32_t i = 4; i > 0; i--) {
+    pPath = strrchr (cBundlePath, '/');	// confirmed_safe_unsafe_usage
+    if (pPath) {
+      *pPath = 0;
+    } else {
+      break;
+    }
+  }
+  if (pPath) {
+    strlcat (cBundlePath, "/", PATH_MAX);	// confirmed_safe_unsafe_usage
+  } else {
+    return NULL;
+  }
+
+  strlcat (cBundlePath, lpszbundle, PATH_MAX);	// confirmed_safe_unsafe_usage
+
+  FSRef bundlePath;
+  OSStatus iStatus = FSPathMakeRef ((uint8_t*)cBundlePath, &bundlePath, NULL);
+  if (noErr != iStatus)
+    return NULL;
+
+  CFURLRef bundleURL = CFURLCreateFromFSRef (kCFAllocatorSystemDefault, &bundlePath);
+  if (NULL == bundleURL)
+    return NULL;
+
+  // 2.get bundle pRef
+  CFBundleRef bundleRef = CFBundleCreate (kCFAllocatorSystemDefault, bundleURL);
+  CFRelease (bundleURL);
+
 //	Boolean bReturn = FALSE;
-	if(NULL != bundleRef)
-	{
-		//	bReturn = CFBundleLoadExecutable(bundleRef);
-	}
-	
-	return bundleRef;
+  if (NULL != bundleRef) {
+    //	bReturn = CFBundleLoadExecutable(bundleRef);
+  }
+
+  return bundleRef;
 }
 
-static Boolean FreeLibrary(CFBundleRef bundle)
-{	
-	if(NULL != bundle)
-	{
-		//	CFBundleUnloadExecutable(bundle);
-		CFRelease(bundle);
-	}
-	
-	return TRUE;
+static Boolean FreeLibrary (CFBundleRef bundle) {
+  if (NULL != bundle) {
+    //	CFBundleUnloadExecutable(bundle);
+    CFRelease (bundle);
+  }
+
+  return TRUE;
 }
 
-static void* GetProcessAddress(CFBundleRef bundle, const str_t* lpszprocname)
-{
-	if(NULL == bundle)
-		return NULL;
-	
-	CFStringRef cfprocname = CFStringCreateWithCString(NULL,lpszprocname,CFStringGetSystemEncoding());
-	void *processAddress = CFBundleGetFunctionPointerForName(bundle,cfprocname);
-	CFRelease(cfprocname);
-	
-	return processAddress;
+static void* GetProcessAddress (CFBundleRef bundle, const str_t* lpszprocname) {
+  // Resolves an exported symbol by name; NULL means the bundle/name is missing or the symbol was not found.
+  if (NULL == bundle || NULL == lpszprocname)
+    return NULL;
+  CFStringRef cfprocname = CFStringCreateWithCString (NULL, lpszprocname, CFStringGetSystemEncoding());
+  void* processAddress = cfprocname ? CFBundleGetFunctionPointerForName (bundle, cfprocname) : NULL;
+  if (cfprocname)	// string creation can fail, and CFRelease must not be handed NULL
+    CFRelease (cfprocname);
+  return processAddress;
 }
 #endif
 
@@ -153,225 +143,219 @@
 CM_WELS_TRACE welsCodecTrace::m_fpWarnTrace	= NULL;
 CM_WELS_TRACE welsCodecTrace::m_fpErrorTrace	= NULL;
 #else
-CM_WELS_TRACE2 welsCodecTrace::m_fpDebugTrace= NULL;
+CM_WELS_TRACE2 welsCodecTrace::m_fpDebugTrace   = NULL;
 CM_WELS_TRACE2 welsCodecTrace::m_fpInfoTrace	= NULL;
 CM_WELS_TRACE2 welsCodecTrace::m_fpWarnTrace	= NULL;
-CM_WELS_TRACE2 welsCodecTrace::m_fpErrorTrace= NULL;
+CM_WELS_TRACE2 welsCodecTrace::m_fpErrorTrace   = NULL;
 #endif//WIN32
 
-welsCodecTrace::welsCodecTrace()
-{
-	m_hTraceHandle = NULL;
-	m_fpDebugTrace = NULL;
-	m_fpInfoTrace = NULL;
-	m_fpWarnTrace = NULL;
-	m_fpErrorTrace = NULL;
-	m_WelsTraceExistFlag	= false;
-	
-#if defined WIN32	
-	HMODULE handle = ::GetModuleHandle("welstrace.dll");
-//	HMODULE handle = ::GetModuleHandle("contrace.dll"); // for c7 
-	if ( NULL == handle )
-		return;
+welsCodecTrace::welsCodecTrace() {
+  m_hTraceHandle = NULL;
+  m_fpDebugTrace = NULL;
+  m_fpInfoTrace = NULL;
+  m_fpWarnTrace = NULL;
+  m_fpErrorTrace = NULL;
+  m_WelsTraceExistFlag	= false;
+#ifdef NO_DYNAMIC_VP
+  m_fpDebugTrace = welsStderrTrace<WELS_LOG_DEBUG>;
+  m_fpInfoTrace = welsStderrTrace<WELS_LOG_INFO>;
+  m_fpWarnTrace = welsStderrTrace<WELS_LOG_WARNING>;
+  m_fpErrorTrace = welsStderrTrace<WELS_LOG_ERROR>;
 
-	CHAR achPath[ _MAX_PATH]= {0};
-	GetModuleFileName( (HMODULE)handle, achPath, _MAX_PATH);
+  m_WelsTraceExistFlag = true;
+#else
+#if defined WIN32
+  HMODULE handle = ::GetModuleHandle ("welstrace.dll");
+//	HMODULE handle = ::GetModuleHandle("contrace.dll"); // for c7
+  if (NULL == handle)
+    return;
 
-	m_hTraceHandle = ::LoadLibrary(achPath);
-	
-	OutputDebugStringA(achPath);
-	if( m_hTraceHandle) {
-		m_fpDebugTrace = ( CM_WELS_TRACE)::GetProcAddress( ( HMODULE)m_hTraceHandle, "WELSDEBUGA");
-		m_fpInfoTrace = ( CM_WELS_TRACE)::GetProcAddress( ( HMODULE)m_hTraceHandle, "WELSINFOA");
-		m_fpWarnTrace = ( CM_WELS_TRACE)::GetProcAddress( ( HMODULE)m_hTraceHandle, "WELSWARNA");
-		m_fpErrorTrace = ( CM_WELS_TRACE)::GetProcAddress( ( HMODULE)m_hTraceHandle, "WELSERRORA");
-	}
+  CHAR achPath[ _MAX_PATH] = {0};
+  GetModuleFileName ((HMODULE)handle, achPath, _MAX_PATH);
+
+  m_hTraceHandle = ::LoadLibrary (achPath);
+
+  OutputDebugStringA (achPath);
+  if (m_hTraceHandle) {
+    m_fpDebugTrace = (CM_WELS_TRACE)::GetProcAddress ((HMODULE)m_hTraceHandle, "WELSDEBUGA");
+    m_fpInfoTrace = (CM_WELS_TRACE)::GetProcAddress ((HMODULE)m_hTraceHandle, "WELSINFOA");
+    m_fpWarnTrace = (CM_WELS_TRACE)::GetProcAddress ((HMODULE)m_hTraceHandle, "WELSWARNA");
+    m_fpErrorTrace = (CM_WELS_TRACE)::GetProcAddress ((HMODULE)m_hTraceHandle, "WELSERRORA");
+  }
 #elif defined MACOS
-	m_hTraceHandle = LoadLibrary("welstrace.bundle");
-	if(m_hTraceHandle) {
-		m_fpDebugTrace = ( CM_WELS_TRACE2)GetProcessAddress( (CFBundleRef)m_hTraceHandle, "WELSDEBUG2");
-		m_fpInfoTrace = ( CM_WELS_TRACE2)GetProcessAddress( (CFBundleRef)m_hTraceHandle, "WELSINFO2");
-		m_fpWarnTrace = ( CM_WELS_TRACE2)GetProcessAddress( (CFBundleRef)m_hTraceHandle, "WELSWARN2");
-		m_fpErrorTrace = ( CM_WELS_TRACE2)GetProcessAddress( (CFBundleRef)m_hTraceHandle, "WELSERROR2");
-	}
+  m_hTraceHandle = LoadLibrary ("welstrace.bundle");
+  if (m_hTraceHandle) {
+    m_fpDebugTrace = (CM_WELS_TRACE2)GetProcessAddress ((CFBundleRef)m_hTraceHandle, "WELSDEBUG2");
+    m_fpInfoTrace = (CM_WELS_TRACE2)GetProcessAddress ((CFBundleRef)m_hTraceHandle, "WELSINFO2");
+    m_fpWarnTrace = (CM_WELS_TRACE2)GetProcessAddress ((CFBundleRef)m_hTraceHandle, "WELSWARN2");
+    m_fpErrorTrace = (CM_WELS_TRACE2)GetProcessAddress ((CFBundleRef)m_hTraceHandle, "WELSERROR2");
+  }
 #elif defined LINUX || defined SOLARIS || defined UNIX
 //#else
 //	CCmString	cmPath;
-	str_t achPath[255]= {0};
-	Dl_info		DlInfo;
-	static int32_t	nMmTPAddress;
-    dladdr( &nMmTPAddress, &DlInfo);
+  str_t achPath[255] = {0};
+  Dl_info		DlInfo;
+  static int32_t	nMmTPAddress;
+  dladdr (&nMmTPAddress, &DlInfo);
 
-	if (NULL == DlInfo.dli_fname)
-		return;
-	STRNCPY(achPath, 255, DlInfo.dli_fname, STRNLEN(DlInfo.dli_fname, 255));	// confirmed_safe_unsafe_usage
-	str_t* p = strrchr(achPath, '/');	// confirmed_safe_unsafe_usage
-	if ( NULL == p )
-		return;
-	const int32_t kiLenTraceName = STRNLEN("/libwelstrace.so", 15);	// confirmed_safe_unsafe_usage
-	const int32_t kiCurPos = p - achPath;
-	if ( kiCurPos + kiLenTraceName < 255 )
-		STRNCPY(p, 254-kiCurPos, "/libwelstrace.so", kiLenTraceName );	// confirmed_safe_unsafe_usage
-	else
-		return;
+  if (NULL == DlInfo.dli_fname)
+    return;
+  STRNCPY (achPath, 255, DlInfo.dli_fname, STRNLEN (DlInfo.dli_fname, 254));	// at most 254 bytes, so the zeroed 255-byte buffer stays terminated
+  str_t* p = strrchr (achPath, '/');	// confirmed_safe_unsafe_usage
+  if (NULL == p)
+    return;
+  const int32_t kiLenTraceName = STRNLEN ("/libwelstrace.so", 255);	// measure the whole 16-character name (the old cap of 15 cut it short)
+  const int32_t kiCurPos = p - achPath;
+  if (kiCurPos + kiLenTraceName >= 255)	// name plus terminator would not fit behind the directory part
+    return;
+  STRNCPY (p, 255 - kiCurPos, "/libwelstrace.so", kiLenTraceName);	// replaces this library's own file name
+  p[kiLenTraceName] = '\0';	// drop whatever remained of the previous, possibly longer, file name
 
-	m_hTraceHandle = dlopen( achPath, RTLD_LAZY);
-	if (m_hTraceHandle == NULL)
-	{
-		FILE* fp = fopen("/tmp/trace.txt", "a");
-		if(fp)
-		{
-			fprintf(fp, "welsCodecTrace::welsCodecTrace ===> dlopen %s fail, %s\n", achPath, dlerror());
-			fclose(fp);
-		}
-		return;
-	}
-	if (m_hTraceHandle) {
-		m_fpDebugTrace = ( CM_WELS_TRACE2)dlsym( m_hTraceHandle, "WELSDEBUG2");
-		m_fpInfoTrace = ( CM_WELS_TRACE2)dlsym( m_hTraceHandle, "WELSINFO2");
-		m_fpWarnTrace = ( CM_WELS_TRACE2)dlsym( m_hTraceHandle, "WELSWARN2");
-		m_fpErrorTrace = ( CM_WELS_TRACE2)dlsym( m_hTraceHandle, "WELSERROR2");
-		if(m_fpDebugTrace == NULL)
-		{
-			FILE* fp = fopen("/tmp/trace.txt", "a");
-			if(fp)
-			{
-				printf("welsCodecTrace::welsCodecTrace ===> dlsym failed (WELSDEBUG2) , dlerror = %s\n", dlerror());
-				fclose(fp);
-			}
-			return;
-		}
-	}
+  m_hTraceHandle = dlopen (achPath, RTLD_LAZY);
+  if (m_hTraceHandle == NULL) {
+    FILE* fp = fopen ("/tmp/trace.txt", "a");
+    if (fp) {
+      fprintf (fp, "welsCodecTrace::welsCodecTrace ===> dlopen %s fail, %s\n", achPath, dlerror());
+      fclose (fp);
+    }
+    return;
+  }
+  if (m_hTraceHandle) {
+    m_fpDebugTrace = (CM_WELS_TRACE2)dlsym (m_hTraceHandle, "WELSDEBUG2");
+    m_fpInfoTrace = (CM_WELS_TRACE2)dlsym (m_hTraceHandle, "WELSINFO2");
+    m_fpWarnTrace = (CM_WELS_TRACE2)dlsym (m_hTraceHandle, "WELSWARN2");
+    m_fpErrorTrace = (CM_WELS_TRACE2)dlsym (m_hTraceHandle, "WELSERROR2");
+    if (m_fpDebugTrace == NULL) {	// symbol lookup failed: record the reason and give up
+      FILE* fp = fopen ("/tmp/trace.txt", "a");
+      if (fp) {
+        fprintf (fp, "welsCodecTrace::welsCodecTrace ===> dlsym failed (WELSDEBUG2) , dlerror = %s\n", dlerror());	// write to the opened file, as the dlopen failure path does
+        fclose (fp);
+      }
+      return;
+    }
+  }
 #endif
-	if(m_hTraceHandle != NULL)
-	{
-		m_WelsTraceExistFlag	= true;
-	}
+  if (m_hTraceHandle != NULL) {
+    m_WelsTraceExistFlag	= true;
+  }
+#endif
 }
 
-welsCodecTrace::~welsCodecTrace()
-{
+welsCodecTrace::~welsCodecTrace() {
 #if defined WIN32
-	if( m_hTraceHandle) {
-		::FreeLibrary( ( HMODULE)m_hTraceHandle);
-	}
+  if (m_hTraceHandle) {
+    ::FreeLibrary ((HMODULE)m_hTraceHandle);
+  }
 #elif defined MACOS
-	if (m_hTraceHandle) {
-		FreeLibrary( (CFBundleRef)m_hTraceHandle);
-	}
+  if (m_hTraceHandle) {
+    FreeLibrary ((CFBundleRef)m_hTraceHandle);
+  }
 #elif defined LINUX || defined SOLARIS || defined UNIX
-	if (m_hTraceHandle) {
-		::dlclose( m_hTraceHandle);
-	}
+  if (m_hTraceHandle) {
+    ::dlclose (m_hTraceHandle);
+  }
 #endif
 
-	m_hTraceHandle = NULL;
-	m_fpDebugTrace = NULL;
-	m_fpInfoTrace = NULL;
-	m_fpWarnTrace = NULL;
-	m_fpErrorTrace = NULL;
+  m_hTraceHandle = NULL;
+  m_fpDebugTrace = NULL;
+  m_fpInfoTrace = NULL;
+  m_fpWarnTrace = NULL;
+  m_fpErrorTrace = NULL;
 //	g_bWelsLibLoaded = false;
-	m_WelsTraceExistFlag = false;
+  m_WelsTraceExistFlag = false;
 }
 
-int32_t welsCodecTrace::WelsTraceModuleIsExist()
-{
-	return m_WelsTraceExistFlag;
+int32_t welsCodecTrace::WelsTraceModuleIsExist() {
+  return m_WelsTraceExistFlag;
 }
 
-void welsCodecTrace::TraceString(int32_t iLevel, const str_t* str)
-{
+void welsCodecTrace::TraceString (int32_t iLevel, const str_t* str) {
 #ifdef WIN32
-	switch(iLevel)
-	{
-	case WELS_LOG_ERROR:
-		if(m_fpErrorTrace)
-			m_fpErrorTrace("%s", str);
-		break;
-	case WELS_LOG_WARNING:
-		if(m_fpWarnTrace)
-			m_fpWarnTrace("%s", str);
-		break;
-	case WELS_LOG_INFO:
-		if(m_fpInfoTrace)
-			m_fpInfoTrace("%s", str);
-		break;
-	case WELS_LOG_DEBUG:
-		if(m_fpDebugTrace)
-			m_fpDebugTrace("%s", str);
-		break;
-	default:
-		if(m_fpDebugTrace)
-			m_fpInfoTrace("%s", str);
-		break;
-	}
+  switch (iLevel) {	// forward the message to the sink for its level, if that sink was resolved
+  case WELS_LOG_ERROR:
+    if (m_fpErrorTrace)
+      m_fpErrorTrace ("%s", str);
+    break;
+  case WELS_LOG_WARNING:
+    if (m_fpWarnTrace)
+      m_fpWarnTrace ("%s", str);
+    break;
+  case WELS_LOG_INFO:
+    if (m_fpInfoTrace)
+      m_fpInfoTrace ("%s", str);
+    break;
+  case WELS_LOG_DEBUG:
+    if (m_fpDebugTrace)
+      m_fpDebugTrace ("%s", str);
+    break;
+  default:	// unrecognised levels are reported through the info sink
+    if (m_fpInfoTrace)	// test the pointer that is actually called
+      m_fpInfoTrace ("%s", str);
+    break;
+  }
 #else
-	switch(iLevel)
-	{
-	case WELS_LOG_ERROR:
-		if(m_fpErrorTrace)
-			m_fpErrorTrace("CODEC", "%s", str);
-		break;
-	case WELS_LOG_WARNING:
-		if(m_fpWarnTrace)
-			m_fpWarnTrace("CODEC", "%s",  str);
-		break;
-	case WELS_LOG_INFO:
-		if(m_fpInfoTrace)
-			m_fpInfoTrace("CODEC", "%s",  str);
-		break;
-	case WELS_LOG_DEBUG:
-		if(m_fpInfoTrace)
-			m_fpInfoTrace("CODEC", "%s",  str);
-		break;
-	default:
-		if(m_fpInfoTrace)
-			m_fpInfoTrace("CODEC", "%s",  str);
-		break;
-	}
+  switch (iLevel) {	// non-WIN32 sinks take a module tag ("CODEC") ahead of the format string
+  case WELS_LOG_ERROR:
+    if (m_fpErrorTrace)
+      m_fpErrorTrace ("CODEC", "%s", str);
+    break;
+  case WELS_LOG_WARNING:
+    if (m_fpWarnTrace)
+      m_fpWarnTrace ("CODEC", "%s",  str);
+    break;
+  case WELS_LOG_INFO:
+    if (m_fpInfoTrace)
+      m_fpInfoTrace ("CODEC", "%s",  str);
+    break;
+  case WELS_LOG_DEBUG:	// NOTE(review): debug messages use the info sink here, while the WIN32 branch calls m_fpDebugTrace - confirm this is intended
+    if (m_fpInfoTrace)
+      m_fpInfoTrace ("CODEC", "%s",  str);
+    break;
+  default:	// any other level is also reported through the info sink
+    if (m_fpInfoTrace)
+      m_fpInfoTrace ("CODEC", "%s",  str);
+    break;
+  }
 #endif
 }
 
 #define MAX_LOG_SIZE	1024
 
-void welsCodecTrace::CODEC_TRACE(void* ignore, const int32_t iLevel, const str_t* Str_Format, va_list vl)
-{
+void welsCodecTrace::CODEC_TRACE (void* ignore, const int32_t iLevel, const str_t* Str_Format, va_list vl) {
 //		if(g_traceLevel < iLevel)
-		if ( m_iTraceLevel < iLevel )
-		{
-			return;
-		}
+  if (m_iTraceLevel < iLevel) {
+    return;
+  }
 
-		str_t WStr_Format[MAX_LOG_SIZE] = {0};
-		str_t pBuf[MAX_LOG_SIZE] = {0};
-		str_t cResult[MAX_LOG_SIZE] = {0};
-		const int32_t len	= STRNLEN("[ENCODER]: ", MAX_LOG_SIZE);	// confirmed_safe_unsafe_usage
-		
-		STRNCPY(WStr_Format, MAX_LOG_SIZE, Str_Format, STRNLEN(Str_Format, MAX_LOG_SIZE));	// confirmed_safe_unsafe_usage	
-		
-		STRNCPY(pBuf, MAX_LOG_SIZE, "[ENCODER]: ", len);	// confirmed_safe_unsafe_usage
+  str_t WStr_Format[MAX_LOG_SIZE] = {0};
+  str_t pBuf[MAX_LOG_SIZE] = {0};
+  str_t cResult[MAX_LOG_SIZE] = {0};
+  const int32_t len	= STRNLEN ("[ENCODER]: ", MAX_LOG_SIZE);	// confirmed_safe_unsafe_usage
+
+  STRNCPY (WStr_Format, MAX_LOG_SIZE, Str_Format, STRNLEN (Str_Format, MAX_LOG_SIZE));	// confirmed_safe_unsafe_usage
+
+  STRNCPY (pBuf, MAX_LOG_SIZE, "[ENCODER]: ", len);	// confirmed_safe_unsafe_usage
 #if defined(WIN32)
 #if defined(_MSC_VER)
-#if _MSC_VER>=1500		
-		VSPRINTF(pBuf + len, MAX_LOG_SIZE-len, WStr_Format, vl);	// confirmed_safe_unsafe_usage		
-#else		
-		VSPRINTF(pBuf + len, WStr_Format, vl);	// confirmed_safe_unsafe_usage
+#if _MSC_VER>=1500
+  VSPRINTF (pBuf + len, MAX_LOG_SIZE - len, WStr_Format, vl);	// confirmed_safe_unsafe_usage
+#else
+  VSPRINTF (pBuf + len, WStr_Format, vl);	// confirmed_safe_unsafe_usage
 #endif//_MSC_VER>=1500
 #endif//_MSC_VER
 #else//__GNUC__
-		VSPRINTF(pBuf + len, WStr_Format, vl);	// confirmed_safe_unsafe_usage
+  VSPRINTF (pBuf + len, WStr_Format, vl);	// confirmed_safe_unsafe_usage -- NOTE(review): no size bound is passed here, unlike the _MSC_VER>=1500 call above; confirm the expanded message cannot exceed pBuf (MAX_LOG_SIZE)
 #endif//WIN32
-		STRNCPY(cResult, MAX_LOG_SIZE, pBuf, STRNLEN(pBuf, MAX_LOG_SIZE));	// confirmed_safe_unsafe_usage
+  STRNCPY (cResult, MAX_LOG_SIZE, pBuf, STRNLEN (pBuf, MAX_LOG_SIZE));	// confirmed_safe_unsafe_usage
 
-//		g_WelsCodecTrace.TraceString(iLevel, cResult);		
-		welsCodecTrace::TraceString(iLevel, cResult);
+//		g_WelsCodecTrace.TraceString(iLevel, cResult);
+  welsCodecTrace::TraceString (iLevel, cResult);
 }
 
-void welsCodecTrace::SetTraceLevel(const int32_t iLevel)
-{
+void welsCodecTrace::SetTraceLevel (const int32_t iLevel) {
 //	g_traceLevel	= iLevel;
-	if ( iLevel >= 0 )
-		m_iTraceLevel	= iLevel;
+  if (iLevel >= 0)
+    m_iTraceLevel	= iLevel;
 }
 
 
--- a/codec/encoder/plus/src/welsEncoderExt.cpp
+++ b/codec/encoder/plus/src/welsEncoderExt.cpp
@@ -1,1255 +1,1226 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <assert.h>
-#include "welsEncoderExt.h"
-#include "welsCodecTrace.h"
-#include "typedefs.h"
-#include "wels_const.h"
-#include "utils.h"
-#include "macros.h"
-
-#include "crt_util_safe_x.h"	// Safe CRT routines like util for cross platforms
-#include "ref_list_mgr_svc.h"
-
-#include <time.h>
-#if defined(WIN32) /*&& defined(_DEBUG)*/
-
-#include <windows.h>
-#include <stdio.h>
-#include <stdarg.h>
-#include <sys/types.h>
-#include <sys/timeb.h>
-#else
-#include <sys/time.h>
-#endif
-
-namespace WelsSVCEnc {
-
-/*
- *	CWelsH264SVCEncoder class implementation
- */
-CWelsH264SVCEncoder::CWelsH264SVCEncoder()
-:	m_pEncContext		( NULL ),
-#if defined(WIN32)||defined(_MACH_PLATFORM)||defined(__GNUC__) 
-	m_pWelsTrace			( NULL ),
-#endif
-	m_pSrcPicList		( NULL ),
-	m_iSrcListSize		( 0 ),
-	m_iMaxPicWidth		( 0 ),
-	m_iMaxPicHeight		( 0 ),
-	m_iCspInternal		( 0 ),
-	m_bInitialFlag		( FALSE )
-{
-#ifdef REC_FRAME_COUNT
-	int32_t m_uiCountFrameNum = 0;
-#endif//REC_FRAME_COUNT
-
-#ifdef OUTPUT_BIT_STREAM
-	str_t strStreamFileName[1024] = { 0 };  //for .264
-	int32_t iBufferUsed = 0;
-	int32_t iBufferLeft = 1023;
-	
-	str_t strLenFileName[1024] = { 0 }; //for .len
-	int32_t iBufferUsedSize = 0;
-	int32_t iBufferLeftSize = 1023;
-#endif//OUTPUT_BIT_STREAM
-
-#ifdef OUTPUT_BIT_STREAM
-	time_t tTime;
-				
-#if defined( WIN32 )
-#if defined(_MSC_VER)
-#if _MSC_VER>=1500
-	struct tm tTimeNow;
-#else
-	struct tm *tTimeNow;
-#endif//_MSC_VER>=1500
-#endif//_MSC_VER
-	struct _timeb tTimeb;
-	
-	time(&tTime);
-#if defined(_MSC_VER)
-#if _MSC_VER>=1500
-	LOCALTIME(&tTimeNow, &tTime);
-#else
-	tTimeNow = LOCALTIME(&tTime);
-	if ( NULL == tTimeNow )
-		return;
-#endif//_MSC_VER>=1500
-#endif//_MSC_VER
-	FTIME(&tTimeb);
-#elif defined( __GNUC__ )
-	struct tm* tTimeNow;
-	struct timeval tTimev;
-	time(&tTime);
-	tTimeNow = (struct tm *)localtime(&tTime);
-	gettimeofday(&tTimev,NULL);
-#endif//WIN32	
-	
-#ifdef WIN32
-#if defined(_MSC_VER)
-#if _MSC_VER>=1500
-	iBufferUsed      += SNPRINTF(strStreamFileName,      iBufferLeft, iBufferLeft,      "enc_bs_0x%p_",   (void*)this);
-	iBufferUsedSize += SNPRINTF(strLenFileName, iBufferLeftSize, iBufferLeftSize, "enc_size_0x%p_", (void*)this);
-#else
-	iBufferUsed      += SNPRINTF(strStreamFileName,      iBufferLeft,      "enc_bs_0x%p_",   (void*)this);
-	iBufferUsedSize += SNPRINTF(strLenFileName, iBufferLeftSize, "enc_size_0x%p_", (void*)this);
-#endif//_MSC_VER>=1500
-#endif//_MSC_VER
-#else
-	iBufferUsed      += SNPRINTF(strStreamFileName,      iBufferLeft,      "/tmp/enc_bs_0x%p_",  (void*)this);
-	iBufferUsedSize += SNPRINTF(strLenFileName, iBufferLeftSize, "/tmp/enc_size_0x%p", (void*)this);
-#endif//WIN32
-    
-	
-	iBufferLeft -= iBufferUsed;
-	if ( iBufferLeft > iBufferUsed )
-	{		
-#if defined(_GNUC__)
-		iBufferUsed += strftime(&strStreamFileName[iBufferUsed], iBufferLeft, "%y%m%d%H%M%S", tTimeNow);
-#else		
-#if defined(_MSC_VER)
-		iBufferUsed += strftime(&strStreamFileName[iBufferUsed], iBufferLeft, "%y%m%d%H%M%S", 
-#if _MSC_VER>=1500
-			&tTimeNow
-#else
-			tTimeNow
-#endif//_MSC_VER>=1500
-			);
-#endif//_MSC_VER			
-#endif//__GNUC__
-		iBufferLeft -= iBufferUsed;
-	}
-	
-	iBufferLeftSize -= iBufferUsedSize;
-	if ( iBufferLeftSize> iBufferUsedSize )
-	{		
-#if defined(_GNUC__)
-		iBufferUsedSize += strftime(&strLenFileName[iBufferUsedSize], iBufferLeftSize, "%y%m%d%H%M%S", tTimeNow);
-#else
-#if defined(_MSC_VER)
-		iBufferUsedSize += strftime(&strLenFileName[iBufferUsedSize], iBufferLeftSize, "%y%m%d%H%M%S", 
-#if _MSC_VER>=1500
-			&tTimeNow
-#else
-			tTimeNow
-#endif//_MSC_VER>=1500
-			);
-#endif//_MSC_VER
-#endif//__GNUC__
-		iBufferLeftSize -= iBufferUsedSize;
-	}
-	
-	if ( iBufferLeft > iBufferUsed )
-	{
-#ifdef WIN32
-#if defined(_MSC_VER)
-#if _MSC_VER>=1500
-		iBufferUsed += SNPRINTF(&strStreamFileName[iBufferUsed], iBufferLeft, iBufferLeft, ".%03.3u.264", tTimeb.millitm);
-#else
-		iBufferUsed += SNPRINTF(&strStreamFileName[iBufferUsed], iBufferLeft, ".%03.3u.264", tTimeb.millitm);
-#endif//_MSC_VER>=1500
-#endif//_MSC_VER
-#else
-		iBufferUsed += SNPRINTF(&strStreamFileName[iBufferUsed], iBufferLeft, ".%03.3u.264", tTimev.tv_usec/1000);
-#endif//WIN32
-		iBufferLeft -= iBufferUsed;
-	}
-	
-	if ( iBufferLeftSize > iBufferUsedSize )
-	{
-#ifdef WIN32
-#if defined(_MSC_VER)
-#if _MSC_VER>=1500
-		iBufferUsedSize += SNPRINTF(&strLenFileName[iBufferUsedSize], iBufferLeftSize, iBufferLeftSize, ".%03.3u.len", tTimeb.millitm);
-#else
-		iBufferUsedSize += SNPRINTF(&strLenFileName[iBufferUsedSize], iBufferLeftSize, ".%03.3u.len", tTimeb.millitm);
-#endif//_MSC_VER>=1500
-#endif//_MSC_VER
-#else
-		iBufferUsedSize += SNPRINTF(&strLenFileName[iBufferUsedSize], iBufferLeftSize, ".%03.3u.len", tTimev.tv_usec/1000);
-#endif//WIN32
-		iBufferLeftSize -= iBufferUsedSize;
-	}
-
-#if defined(__GNUC__)
-	m_pFileBs       = FOPEN(strStreamFileName,      "wb");
-	m_pFileBsSize	= FOPEN(strLenFileName, "wb");
-#else
-#if defined(_MSC_VER)
-#if _MSC_VER>=1500
-	FOPEN(&m_pFileBs, strStreamFileName,      "wb");
-	FOPEN(&m_pFileBsSize, strLenFileName, "wb");
-#else
-	m_pFileBs       = FOPEN(strStreamFileName,      "wb");
-	m_pFileBsSize	= FOPEN(strLenFileName, "wb");
-#endif//_MSC_VER>=1500
-#endif//_MSC_VER
-#endif//__GNUC__
-
-	m_bSwitch	= FALSE;
-	m_iSwitchTimes	= 0;
-#endif//OUTPUT_BIT_STREAM
-	
-	InitEncoder();
-}
-
-/*
- *	Destructor: drops the trace object, resets the debug frame counter,
- *	closes the optional bitstream dump files and finally releases the
- *	encoder through Unintialize().
- */
-CWelsH264SVCEncoder::~CWelsH264SVCEncoder() {
-    WelsLog (NULL, WELS_LOG_INFO, "CWelsH264SVCEncoder::~CWelsH264SVCEncoder()\n");
-
-#if defined(WIN32)||defined(_MACH_PLATFORM)||defined(__GNUC__)
-    // delete on a NULL pointer is a no-op, so no explicit guard is needed
-    delete m_pWelsTrace;
-    m_pWelsTrace = NULL;
-#endif
-
-#ifdef REC_FRAME_COUNT
-    // report the counter before clearing it
-    WelsLog (m_pEncContext, WELS_LOG_INFO,
-             "CWelsH264SVCEncoder::~CWelsH264SVCEncoder(), m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n",
-             m_uiCountFrameNum, m_iCspInternal);
-    m_uiCountFrameNum = 0;
-#endif//REC_FRAME_COUNT
-
-#ifdef OUTPUT_BIT_STREAM
-    if (NULL != m_pFileBs) {
-        fclose (m_pFileBs);
-        m_pFileBs = NULL;
-    }
-    if (NULL != m_pFileBsSize) {
-        fclose (m_pFileBsSize);
-        m_pFileBsSize = NULL;
-    }
-    m_iSwitchTimes = 0;
-    m_bSwitch      = FALSE;
-#endif//OUTPUT_BIT_STREAM
-
-    Unintialize();
-}
-
-/*
- *	Creates the codec trace object and, when the trace module reports that
- *	it exists, routes encoder logging through it. The global log level is
- *	set to WELS_LOG_DEFAULT either way. Compiled to an empty body on
- *	platforms other than WIN32 / _MACH_PLATFORM / __GNUC__.
- */
-void CWelsH264SVCEncoder::InitEncoder (void) {
-#if defined(WIN32)||defined(_MACH_PLATFORM)||defined(__GNUC__)
-
-#ifdef REC_FRAME_COUNT
-    WelsLog (m_pEncContext, WELS_LOG_INFO,
-             "CWelsH264SVCEncoder::InitEncoder, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n",
-             m_uiCountFrameNum, m_iCspInternal);
-#endif
-
-    m_pWelsTrace = new welsCodecTrace();
-    if (NULL != m_pWelsTrace) {
-        const int32_t kiTraceAvailable = m_pWelsTrace->WelsTraceModuleIsExist();
-        if (0 != kiTraceAvailable) {
-            // trace module present: use it as the log sink
-            m_pWelsTrace->SetTraceLevel (WELS_LOG_DEFAULT);
-            WelsSetLogCallback (welsCodecTrace::CODEC_TRACE);
-        }
-    }
-
-    // default log level (the original note mentions WELS_LOG_QUIET for no output)
-    WelsSetLogLevel (WELS_LOG_DEFAULT);
-#endif
-}
-
-/* Interfaces override from ISVCEncoder */
-
-/*
- *	SVC Encoder Initialization (parameter based)
- *
- *	argv      - public encoding parameters supplied by the caller; copied
- *	            into a local before use.
- *	iInitType - must be INIT_TYPE_PARAMETER_BASED.
- *
- *	The parameters are transcoded into an SWelsSvcCodingParam and handed to
- *	the config based Initialize() overload, whose result is returned.
- *	Returns cmInitParaError on a bad init type, NULL argv or failed
- *	transcoding.
- */
-int CWelsH264SVCEncoder::Initialize (SVCEncodingParam* argv, const INIT_TYPE iInitType) {
-    if (INIT_TYPE_PARAMETER_BASED != iInitType || NULL == argv) {
-        WelsLog (m_pEncContext, WELS_LOG_ERROR,
-                 "CWelsH264SVCEncoder::Initialize(), invalid iInitType= %d, argv= 0x%p\n", iInitType, (void*)argv);
-        return cmInitParaError;
-    }
-
-    if (m_bInitialFlag) {
-        WelsLog (m_pEncContext, WELS_LOG_WARNING,
-                 "CWelsH264SVCEncoder::Initialize(), reinitialize, m_bInitialFlag= %d\n", m_bInitialFlag);
-        Unintialize();
-    }
-
-    SVCEncodingParam    sEncodingParam;
-    SWelsSvcCodingParam sConfig (true);
-
-    memcpy (&sEncodingParam, argv, sizeof (SVCEncodingParam));	// confirmed_safe_unsafe_usage
-
-#ifdef REC_FRAME_COUNT
-    WelsLog (m_pEncContext, WELS_LOG_INFO,
-             "CWelsH264SVCEncoder::Initialize, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n",
-             m_uiCountFrameNum, m_iCspInternal);
-    WelsLog (m_pEncContext, WELS_LOG_INFO,
-             "coding_param->iPicWidth= %d;coding_param->iPicHeight= %d;coding_param->iTargetBitrate= %d;coding_param->iRCMode= %d;coding_param->iTemporalLayerNum= %d;coding_param->iSpatialLayerNum= %d;coding_param->fFrameRate= %.6ff;coding_param->iInputCsp= %d;coding_param->iKeyPicCodingMode= %d;coding_param->uiIntraPeriod= %d;coding_param->bEnableSpsPpsIdAddition = %d;coding_param->bPrefixNalAddingCtrl = %d;coding_param->bEnableDenoise= %d;coding_param->bEnableBackgroundDetection= %d;coding_param->bEnableAdaptiveQuant= %d;coding_param->bEnableCropPic= %d;coding_param->bEnableLongTermReference= %d;coding_param->iLtrMarkPeriod= %d;\n",
-             sEncodingParam.iPicWidth,
-             sEncodingParam.iPicHeight,
-             sEncodingParam.iTargetBitrate,
-             sEncodingParam.iRCMode,
-             sEncodingParam.iTemporalLayerNum,
-             sEncodingParam.iSpatialLayerNum,
-             sEncodingParam.fFrameRate,
-             sEncodingParam.iInputCsp,
-             sEncodingParam.iKeyPicCodingMode,
-             sEncodingParam.iIntraPeriod,
-             sEncodingParam.bEnableSpsPpsIdAddition,
-             sEncodingParam.bPrefixNalAddingCtrl,
-             sEncodingParam.bEnableDenoise,
-             sEncodingParam.bEnableBackgroundDetection,
-             sEncodingParam.bEnableAdaptiveQuant,
-             sEncodingParam.bEnableCropPic,
-             sEncodingParam.bEnableLongTermReference,
-             sEncodingParam.iLtrMarkPeriod);
-
-    // iSpatialLayerNum comes straight from the caller and has not been
-    // validated yet; never walk sSpatialLayers[] past MAX_SPATIAL_LAYER_NUM
-    // entries just to produce log output.
-    const int32_t kiLayersToLog = (sEncodingParam.iSpatialLayerNum > MAX_SPATIAL_LAYER_NUM)
-                                  ? MAX_SPATIAL_LAYER_NUM : sEncodingParam.iSpatialLayerNum;
-    for (int32_t i = 0; i < kiLayersToLog; ++i) {
-        SSpatialLayerConfig* pSpatialCfg = &sEncodingParam.sSpatialLayers[i];
-        WelsLog (m_pEncContext, WELS_LOG_INFO,
-                 "coding_param->sSpatialLayers[%d]: .iVideoWidth= %d; .iVideoHeight= %d; .fFrameRate= %.6ff; .iQualityLayerNum= %d; .iSpatialBitrate= %d; .iCgsSnrRefined= %d; .iInterSpatialLayerPredFlag= %d; .sSliceCfg.uiSliceMode= %d; .sSliceCfg.sSliceArgument.uiSliceNum= %d; .sSliceCfg.sSliceArgument.uiSliceSizeConstraint= %d;\n",
-                 i, pSpatialCfg->iVideoWidth,
-                 pSpatialCfg->iVideoHeight,
-                 pSpatialCfg->fFrameRate,
-                 pSpatialCfg->iQualityLayerNum,
-                 pSpatialCfg->iSpatialBitrate,
-                 pSpatialCfg->iCgsSnrRefined,
-                 pSpatialCfg->iInterSpatialLayerPredFlag,
-                 pSpatialCfg->sSliceCfg.uiSliceMode,
-                 pSpatialCfg->sSliceCfg.sSliceArgument.uiSliceNum,
-                 pSpatialCfg->sSliceCfg.sSliceArgument.uiSliceSizeConstraint
-                );
-    }
-#endif//REC_FRAME_COUNT
-
-    // Convert SVCEncodingParam into the internal SWelsSvcCodingParam here
-    if (sConfig.ParamTranscode (sEncodingParam, true)) {
-        WelsLog (m_pEncContext, WELS_LOG_ERROR,
-                 "CWelsH264SVCEncoder::Initialize(), parameter_translation failed.\n");
-        Unintialize();
-        return cmInitParaError;
-    }
-
-    m_iSrcListSize = 1;
-
-    return Initialize ((void*)&sConfig, INIT_TYPE_CONFIG_BASED);
-}
-
-/*
- *	Releases a partially or fully built source picture list: the first
- *	kiCount entries and then the pointer array itself.
- */
-static void FreeSrcPicList (SSourcePicture** ppList, const int32_t kiCount) {
-    if (NULL == ppList)
-        return;
-    for (int32_t i = 0; i < kiCount; ++i)
-        delete ppList[i];
-    delete [] ppList;
-}
-
-/*
- *	SVC Encoder Initialization (config based)
- *
- *	argv      - points to an SWelsSvcCodingParam; a few of its fields are
- *	            normalised in place (temporal layer count, reference frame
- *	            counts, LTR mark period, loop filter offsets).
- *	iInitType - must be INIT_TYPE_CONFIG_BASED.
- *
- *	Validates the configuration, allocates one SSourcePicture per
- *	dependency layer and creates the core encoder.
- *	Returns cmResultSuccess on success, cmInitParaError for an invalid
- *	configuration or core initialisation failure, cmMallocMemeError when
- *	the source picture list cannot be allocated.
- *
- *	Unintialize() does nothing while m_bInitialFlag is false, so every
- *	failure path after the source picture list has been allocated frees the
- *	list explicitly instead of relying on it.
- */
-int CWelsH264SVCEncoder::Initialize (void* argv, const INIT_TYPE iInitType) {
-    if (INIT_TYPE_CONFIG_BASED != iInitType || NULL == argv) {
-        WelsLog (m_pEncContext, WELS_LOG_ERROR,
-                 "CWelsH264SVCEncoder::Initialize(), invalid iInitType= %d, argv= 0x%p.\n", iInitType, (void*)argv);
-        return cmInitParaError;
-    }
-
-    if (m_bInitialFlag) {
-        WelsLog (m_pEncContext, WELS_LOG_WARNING,
-                 "CWelsH264SVCEncoder::Initialize(), reinitialize, m_bInitialFlag= %d.\n", m_bInitialFlag);
-        Unintialize();
-    }
-
-    SWelsSvcCodingParam* pCfg = static_cast<SWelsSvcCodingParam*> (argv);
-
-    const int32_t iColorspace = pCfg->iInputCsp;
-    if (0 == iColorspace) {
-        WelsLog (m_pEncContext, WELS_LOG_ERROR,
-                 "CWelsH264SVCEncoder::Initialize(), invalid iInputCsp= %d.\n", iColorspace);
-        Unintialize();
-        return cmInitParaError;
-    }
-
-    // Check valid parameters
-    const int32_t iNumOfLayers = pCfg->iNumDependencyLayer;
-    if (iNumOfLayers < 1 || iNumOfLayers > MAX_DEPENDENCY_LAYER) {
-        WelsLog (m_pEncContext, WELS_LOG_ERROR,
-                 "CWelsH264SVCEncoder::Initialize(), invalid iNumDependencyLayer= %d, valid at range of [1, %d].\n",
-                 iNumOfLayers, MAX_DEPENDENCY_LAYER);
-        Unintialize();
-        return cmInitParaError;
-    }
-    if (pCfg->iNumTemporalLayer < 1)
-        pCfg->iNumTemporalLayer = 1;
-    if (pCfg->iNumTemporalLayer > MAX_TEMPORAL_LEVEL) {
-        WelsLog (m_pEncContext, WELS_LOG_ERROR,
-                 "CWelsH264SVCEncoder::Initialize(), invalid iNumTemporalLayer= %d, valid at range of [1, %d].\n",
-                 pCfg->iNumTemporalLayer, MAX_TEMPORAL_LEVEL);
-        Unintialize();
-        return cmInitParaError;
-    }
-
-    // GOP size must lie in [1, MAX_GOP_SIZE] and be a power of two
-    if (pCfg->uiGopSize < 1 || pCfg->uiGopSize > MAX_GOP_SIZE) {
-        WelsLog (m_pEncContext, WELS_LOG_ERROR,
-                 "CWelsH264SVCEncoder::Initialize(), invalid uiGopSize= %d, valid at range of [1, %d].\n",
-                 pCfg->uiGopSize, MAX_GOP_SIZE);
-        Unintialize();
-        return cmInitParaError;
-    }
-
-    if (!WELS_POWER2_IF (pCfg->uiGopSize)) {
-        WelsLog (m_pEncContext, WELS_LOG_ERROR,
-                 "CWelsH264SVCEncoder::Initialize(), invalid uiGopSize= %d, valid at range of [1, %d] and yield to power of 2.\n",
-                 pCfg->uiGopSize, MAX_GOP_SIZE);
-        Unintialize();
-        return cmInitParaError;
-    }
-
-    // A non-zero intra period must be at least one GOP long ...
-    if (pCfg->uiIntraPeriod && pCfg->uiIntraPeriod < pCfg->uiGopSize) {
-        WelsLog (m_pEncContext, WELS_LOG_ERROR,
-                 "CWelsH264SVCEncoder::Initialize(), invalid uiIntraPeriod= %d, valid in case it equals to 0 for unlimited intra period or exceeds specified uiGopSize= %d.\n",
-                 pCfg->uiIntraPeriod, pCfg->uiGopSize);
-        Unintialize();
-        return cmInitParaError;
-    }
-
-    // ... and a multiple of the GOP size (mask test is valid because the
-    // GOP size was just checked to be a power of two)
-    if ((pCfg->uiIntraPeriod && (pCfg->uiIntraPeriod & (pCfg->uiGopSize - 1)) != 0)) {
-        WelsLog (m_pEncContext, WELS_LOG_ERROR,
-                 "CWelsH264SVCEncoder::Initialize(), invalid uiIntraPeriod= %d, valid in case it equals to 0 for unlimited intra period or exceeds specified uiGopSize= %d also multiple of it.\n",
-                 pCfg->uiIntraPeriod, pCfg->uiGopSize);
-        Unintialize();
-        return cmInitParaError;
-    }
-
-    // Fine tune the number of reference frames
-    pCfg->iLTRRefNum = pCfg->bEnableLongTermReference ? LONG_TERM_REF_NUM : 0;
-    pCfg->iNumRefFrame = ((pCfg->uiGopSize >> 1) > 1)
-                         ? ((pCfg->uiGopSize >> 1) + pCfg->iLTRRefNum)
-                         : (MIN_REF_PIC_COUNT + pCfg->iLTRRefNum);
-    pCfg->iNumRefFrame = WELS_CLIP3 (pCfg->iNumRefFrame, MIN_REF_PIC_COUNT, MAX_REFERENCE_PICTURE_COUNT_NUM);
-
-    if (pCfg->uiLtrMarkPeriod == 0) {
-        pCfg->uiLtrMarkPeriod = 30;
-    }
-
-    const int32_t kiDecStages = WELS_LOG2 (pCfg->uiGopSize);
-    pCfg->iInputCsp                = iColorspace;
-    pCfg->iNumTemporalLayer        = (int8_t) (1 + kiDecStages);
-    pCfg->iLoopFilterAlphaC0Offset = WELS_CLIP3 (pCfg->iLoopFilterAlphaC0Offset, -6, 6);
-    pCfg->iLoopFilterBetaOffset    = WELS_CLIP3 (pCfg->iLoopFilterBetaOffset, -6, 6);
-
-    // new/delete is used instead of WelsMalloc because the encoder
-    // initialization stage has not started yet and the cache line size
-    // (16 or 64) has not been detected here.
-    // NOTE(review): a plain new-expression normally throws on failure, so
-    // the NULL checks below only fire if the build uses a non-throwing
-    // allocator - confirm.
-    m_pSrcPicList = new SSourcePicture* [iNumOfLayers];
-
-    if (NULL == m_pSrcPicList) {
-        WelsLog (m_pEncContext, WELS_LOG_ERROR,
-                 "CWelsH264SVCEncoder::Initialize(), pOut of memory due m_pSrcPicList memory request.\n");
-        Unintialize();
-        return cmMallocMemeError;
-    }
-
-    // decide property list size between INIT_TYPE_PARAMETER_BASED/INIT_TYPE_CONFIG_BASED
-    m_iMaxPicWidth  = pCfg->iActualPicWidth;
-    m_iMaxPicHeight = pCfg->iActualPicHeight;
-    m_iSrcListSize  = iNumOfLayers;
-
-    for (int32_t i = 0; i < m_iSrcListSize; ++ i) {
-        m_pSrcPicList[i] = new SSourcePicture;
-
-        if (NULL == m_pSrcPicList[i]) {
-            WelsLog (m_pEncContext, WELS_LOG_ERROR,
-                     "CWelsH264SVCEncoder::Initialize(), pOut of memory due m_pSrcPicList[%d] memory request.\n", i);
-            // entries [0, i) were allocated, the rest are still uninitialised
-            FreeSrcPicList (m_pSrcPicList, i);
-            m_pSrcPicList  = NULL;
-            m_iSrcListSize = 0;
-            return cmMallocMemeError;
-        }
-        InitPic (m_pSrcPicList[i], iColorspace, m_iMaxPicWidth, m_iMaxPicHeight);
-    }
-
-#if defined(OUTPUT_BIT_STREAM) || defined(ENABLE_TRACE_FILE)
-    // directory prefix used for trace / bitstream dump files
-    str_t fpath[MAX_FNAME_LEN] = {0};
-#if defined(__GNUC__)
-    SNPRINTF (fpath, MAX_FNAME_LEN, "/tmp/");		// confirmed_safe_unsafe_usage
-#elif defined(_MSC_VER) && _MSC_VER>=1500
-    SNPRINTF (fpath, MAX_FNAME_LEN, MAX_FNAME_LEN, ".\\");		// confirmed_safe_unsafe_usage
-#elif defined(_MSC_VER)
-    SNPRINTF (fpath, MAX_FNAME_LEN, ".\\");		// confirmed_safe_unsafe_usage
-#endif
-
-    strcpy (pCfg->sTracePath, fpath);		// confirmed_safe_unsafe_usage
-
-#endif //#if defined(OUTPUT_BIT_STREAM) || defined(ENABLE_TRACE_FILE)
-
-    if (WelsInitEncoderExt (&m_pEncContext, pCfg)) {
-        WelsLog (m_pEncContext, WELS_LOG_ERROR,
-                 "CWelsH264SVCEncoder::Initialize(), WelsInitEncoderExt failed.\n");
-        // m_bInitialFlag is still false here, so Unintialize() would not
-        // release the source pictures; free them directly.
-        FreeSrcPicList (m_pSrcPicList, m_iSrcListSize);
-        m_pSrcPicList  = NULL;
-        m_iSrcListSize = 0;
-        return cmInitParaError;
-    }
-
-    m_iCspInternal = iColorspace;
-    m_bInitialFlag = TRUE;
-
-    return cmResultSuccess;
-}
-
-/*
- *	SVC Encoder Uninitialization
- *
- *	Does nothing unless the encoder is flagged as initialized. Otherwise,
- *	when an encoder context exists, frees the source picture list and the
- *	core encoder, then clears the initialized flag. Always returns 0.
- */
-int32_t CWelsH264SVCEncoder::Unintialize() {
-    if (!m_bInitialFlag)
-        return 0;
-
-    WelsLog (m_pEncContext, WELS_LOG_INFO, "CWelsH264SVCEncoder::Unintialize()..\n");
-
-#ifdef REC_FRAME_COUNT
-    WelsLog (m_pEncContext, WELS_LOG_INFO,
-             "CWelsH264SVCEncoder::Unintialize, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n",
-             m_uiCountFrameNum, m_iCspInternal);
-#endif//REC_FRAME_COUNT
-
-    if (m_pEncContext != NULL) {
-        if (m_pSrcPicList != NULL) {
-            // The list was built with new/new[] (not WelsMalloc), so it is
-            // released with delete/delete[]; delete ignores NULL entries.
-            for (int32_t iIdx = 0; iIdx < m_iSrcListSize; ++iIdx) {
-                delete m_pSrcPicList[iIdx];
-            }
-            delete [] m_pSrcPicList;
-
-            m_iSrcListSize = 0;
-            m_pSrcPicList  = NULL;
-        }
-
-        WelsUninitEncoderExt (&m_pEncContext);
-        m_pEncContext = NULL;
-    }
-
-    m_bInitialFlag = FALSE;
-    return 0;
-}
-	
-
-/*
- *	Points the plane pointers of the first source picture at a raw frame
- *	buffer, according to the current internal color space (vertical flip
- *	bit ignored).
- *
- *	Packed YUV / RGB formats use pData[0] only. I420 / YV12 are laid out as
- *	consecutive planes: the second plane starts width*height bytes in, the
- *	third a further quarter of that.
- *
- *	return: 0 - success; 1 - unsupported color space
- */
-int32_t CWelsH264SVCEncoder::RawData2SrcPic (const uint8_t* pSrc) {
-    assert (m_iSrcListSize > 0);
-
-    SSourcePicture* pFirstPic = m_pSrcPicList[0];
-    const int32_t kiLumaSize  = m_iMaxPicWidth * m_iMaxPicHeight;
-
-    pFirstPic->pData[0] = const_cast<uint8_t*> (pSrc);
-
-    switch (m_iCspInternal & (~videoFormatVFlip)) {
-    case videoFormatI420:
-    case videoFormatYV12:
-        pFirstPic->pData[1] = pFirstPic->pData[0] + kiLumaSize;
-        pFirstPic->pData[2] = pFirstPic->pData[1] + (kiLumaSize >> 2);
-        return 0;
-
-    case videoFormatYVYU:
-    case videoFormatUYVY:
-    case videoFormatYUY2:
-    case videoFormatRGB:
-    case videoFormatBGR:
-    case videoFormatBGRA:
-    case videoFormatRGBA:
-    case videoFormatARGB:
-    case videoFormatABGR:
-        pFirstPic->pData[1] = NULL;
-        pFirstPic->pData[2] = NULL;
-        return 0;
-
-    default:
-        return 1;
-    }
-}
-
-
-/*
- *	SVC core encoding from a single raw frame buffer.
- *
- *	Maps pSrc onto the first source picture and forwards to the picture
- *	list overload. Returns the resulting video frame type, or
- *	videoFrameTypeInvalid when the encoder is not ready, pSrc is NULL or
- *	the color space cannot be mapped.
- */
-int CWelsH264SVCEncoder::EncodeFrame (const unsigned char* pSrc, SFrameBSInfo* pBsInfo) {
-    if (NULL == pSrc || NULL == m_pEncContext || !m_bInitialFlag)
-        return videoFrameTypeInvalid;
-
-    const bool kbMapped = (0 == RawData2SrcPic ((uint8_t*)pSrc));
-    const int32_t kiFrameType = kbMapped
-                                ? EncodeFrame (const_cast<const SSourcePicture**> (m_pSrcPicList), 1, pBsInfo)
-                                : videoFrameTypeInvalid;
-
-#ifdef REC_FRAME_COUNT
-    ++ m_uiCountFrameNum;
-    WelsLog (m_pEncContext, WELS_LOG_INFO,
-             "CWelsH264SVCEncoder::EncodeFrame(), m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n",
-             m_uiCountFrameNum, m_iCspInternal);
-#endif//REC_FRAME_COUNT
-
-#ifdef DUMP_SRC_PICTURE
-    DumpSrcPicture (pSrc);
-#endif // DUMP_SRC_PICTURE
-
-    return kiFrameType;
-}
-
-
-/*
- *	SVC core encoding from a list of source pictures.
- *
- *	Calls WelsEncoderEncodeExt() and translates the core frame type into the
- *	public videoFrameType* value; AUTO, B and unknown types map to
- *	videoFrameTypeInvalid. A non-positive nSrcPicNum asserts and returns
- *	videoFrameTypeInvalid.
- *
- *	With OUTPUT_BIT_STREAM defined, every coded (non-skip) frame is also
- *	appended to a debug bitstream file and its total byte count to a
- *	companion size file; both files are reopened under a new name after a
- *	switch was flagged through m_bSwitch.
- */
-int CWelsH264SVCEncoder::EncodeFrame (const SSourcePicture** pSrcPicList, int nSrcPicNum, SFrameBSInfo* pBsInfo) {
-    if (NULL == pSrcPicList || NULL == m_pEncContext || !m_bInitialFlag)
-        return videoFrameTypeInvalid;
-
-    if (nSrcPicNum <= 0) {
-        assert (0);
-        return videoFrameTypeInvalid;
-    }
-
-    const int32_t kiCoreFrameType = WelsEncoderEncodeExt (m_pEncContext, pBsInfo, pSrcPicList, nSrcPicNum);
-
-    int32_t iFrameType = videoFrameTypeInvalid;
-    switch (kiCoreFrameType) {
-    case WELS_FRAME_TYPE_IDR:
-        iFrameType = videoFrameTypeIDR;
-        break;
-    case WELS_FRAME_TYPE_I:
-        iFrameType = videoFrameTypeI;
-        break;
-    case WELS_FRAME_TYPE_P:
-        iFrameType = videoFrameTypeP;
-        break;
-    case WELS_FRAME_TYPE_SKIP:
-        iFrameType = videoFrameTypeSkip;
-        break;
-    default:	// WELS_FRAME_TYPE_AUTO, WELS_FRAME_TYPE_B (not supported) and anything else
-        break;
-    }
-
-    ///////////////////for test
-#ifdef OUTPUT_BIT_STREAM
-    if (iFrameType != videoFrameTypeInvalid && iFrameType != videoFrameTypeSkip) {
-        if (m_bSwitch) {
-            // start a fresh pair of dump files
-            if (NULL != m_pFileBs) {
-                fclose (m_pFileBs);
-                m_pFileBs = NULL;
-            }
-            if (NULL != m_pFileBsSize) {
-                fclose (m_pFileBsSize);
-                m_pFileBsSize = NULL;
-            }
-
-            str_t strDumpName[128] = {0};
-#if defined(__GNUC__)
-            SNPRINTF (strDumpName, 128, "%sadj%d_w%d.264", m_pEncContext->sTracePath, m_iSwitchTimes,
-                      m_pEncContext->pSvcParam->iActualPicWidth);
-            m_pFileBs = FOPEN (strDumpName, "wb");
-            SNPRINTF (strDumpName, 128, "%sadj%d_w%d_size.iLen", m_pEncContext->sTracePath, m_iSwitchTimes,
-                      m_pEncContext->pSvcParam->iActualPicWidth);
-            m_pFileBsSize = FOPEN (strDumpName, "wb");
-#elif defined(_MSC_VER) && _MSC_VER>=1500
-            SNPRINTF (strDumpName, 128, 128, "adj%d_w%d.264", m_iSwitchTimes,
-                      m_pEncContext->pSvcParam->iActualPicWidth);
-            FOPEN (&m_pFileBs, strDumpName, "wb");
-            SNPRINTF (strDumpName, 128, 128, "adj%d_w%d_size.iLen", m_iSwitchTimes,
-                      m_pEncContext->pSvcParam->iActualPicWidth);
-            FOPEN (&m_pFileBsSize, strDumpName, "wb");
-#elif defined(_MSC_VER)
-            SNPRINTF (strDumpName, 128, "adj%d_w%d.264", m_iSwitchTimes,
-                      m_pEncContext->pSvcParam->iActualPicWidth);
-            m_pFileBs = FOPEN (strDumpName, "wb");
-            SNPRINTF (strDumpName, 128, "adj%d_w%d_size.iLen", m_iSwitchTimes,
-                      m_pEncContext->pSvcParam->iActualPicWidth);
-            m_pFileBsSize = FOPEN (strDumpName, "wb");
-#endif
-
-            m_bSwitch = FALSE;
-        }
-
-        int32_t iFrameBytes = 0;
-        for (int32_t iLayer = 0; iLayer < pBsInfo->iLayerNum; ++iLayer) {
-            SLayerBSInfo* pLayer = &pBsInfo->sLayerInfo[iLayer];
-
-            // a layer's payload is the sum of its NAL unit lengths
-            int32_t iLayerBytes = 0;
-            for (int32_t iNal = 0; iNal < pLayer->iNalCount; ++iNal)
-                iLayerBytes += pLayer->iNalLengthInByte[iNal];
-
-            iFrameBytes += iLayerBytes;
-            if (NULL != m_pFileBs)
-                fwrite (pLayer->pBsBuf, 1, iLayerBytes, m_pFileBs);
-        }
-
-        if (NULL != m_pFileBsSize)
-            fwrite (&iFrameBytes, sizeof (int32_t), 1, m_pFileBsSize);
-    }
-#endif //OUTPUT_BIT_STREAM
-#ifdef DUMP_SRC_PICTURE
-    DumpSrcPicture (pSrcPicList[0]->pData[0]);
-#endif // DUMP_SRC_PICTURE
-
-    return iFrameType;
-}
-
-/*
- *	Encodes kpSrc as a forced intra frame and requests another intra frame
- *	for whatever is encoded next.
- *
- *	return: 0 - success; otherwise - failed;
- */
-int CWelsH264SVCEncoder::PauseFrame (const unsigned char* kpSrc, SFrameBSInfo* pBsInfo) {
-    ForceIntraFrame (true);
-
-    const bool kbEncoded = (EncodeFrame (kpSrc, pBsInfo) != videoFrameTypeInvalid);
-
-    // Request a key frame again so the pause frame bitstream and the normal
-    // bitstream that follows do not depend on each other (the original note
-    // refers to them using different video channels).
-    ForceIntraFrame (true);
-
-    return kbEncoded ? 0 : 1;
-}
-
-
-/*
- *	Force key frame
- *
- *	Asks the core encoder to code an IDR picture. bIDR is only used for
- *	logging (REC_FRAME_COUNT builds).
- *
- *	return: 0 - request issued; 1 - encoder not initialized
- */
-int CWelsH264SVCEncoder::ForceIntraFrame (bool bIDR) {
-    if (NULL == m_pEncContext || !m_bInitialFlag)
-        return 1;
-
-#ifdef REC_FRAME_COUNT
-    WelsLog (m_pEncContext, WELS_LOG_INFO,
-             "CWelsH264SVCEncoder::ForceIntraFrame(), bIDR= %d, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n",
-             bIDR, m_uiCountFrameNum, m_iCspInternal);
-#endif//REC_FRAME_COUNT
-
-    ForceCodingIDR (m_pEncContext);
-    return 0;
-}
-
-/************************************************************************
-* SetOption: change a run-time option of an initialized encoder
-* (InDataFormat, IDRInterval, SVC Encode Param, Frame Rate, Bitrate,..)
-*
-* eOptionId - option selector
-* pOption   - points to the option value; the pointee type depends on
-*             eOptionId (see the individual cases)
-*
-* return: 0 on success (cmResultSuccess when the IDR interval is
-*         unchanged), cmInitParaError for a NULL/invalid value or unknown
-*         option, cmInitExpected when the encoder is not initialized.
-************************************************************************/
-int CWelsH264SVCEncoder::SetOption (ENCODER_OPTION eOptionId, void* pOption) {
-    if (NULL == pOption) {
-        return cmInitParaError;
-    }
-
-    if (NULL == m_pEncContext || FALSE == m_bInitialFlag) {
-        return cmInitExpected;
-    }
-
-    switch (eOptionId) {
-    case ENCODER_OPTION_INTER_SPATIAL_PRED: {	// Inter spatial layer prediction flag
-        WelsLog (m_pEncContext, WELS_LOG_INFO,
-                 "ENCODER_OPTION_INTER_SPATIAL_PRED, this feature not supported at present.\n");
-    }
-    break;
-    case ENCODER_OPTION_DATAFORMAT: {	// Input color space, pOption: int32_t*
-        int32_t iValue = * ((int32_t*)pOption);
-        int32_t iColorspace = iValue;
-        if (iColorspace == 0) {
-            return cmInitParaError;
-        }
-
-#ifdef REC_FRAME_COUNT
-        WelsLog (m_pEncContext, WELS_LOG_INFO,
-                 "CWelsH264SVCEncoder::SetOption():ENCODER_OPTION_DATAFORMAT, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x, iValue= %d\n",
-                 m_uiCountFrameNum, m_iCspInternal, iValue);
-#endif//REC_FRAME_COUNT
-
-        // Walk the list from the last picture to the first, re-initializing
-        // every picture whose format differs. Each index is visited exactly
-        // once, so the loop ends regardless of what InitPic() does to
-        // iColorFormat.
-        for (int32_t iPicIdx = m_iSrcListSize - 1; iPicIdx >= 0; -- iPicIdx) {
-            SSourcePicture* pPic = m_pSrcPicList[iPicIdx];
-            if (NULL == pPic) {
-                // a missing first picture is an error; other gaps are skipped
-                if (0 == iPicIdx)
-                    return cmInitParaError;
-                continue;
-            }
-            if (pPic->iColorFormat != iColorspace) {
-                InitPic (pPic, iColorspace, m_iMaxPicWidth, m_iMaxPicHeight);
-            }
-        }
-        m_iCspInternal = iColorspace;
-#ifdef REC_FRAME_COUNT
-        WelsLog (m_pEncContext, WELS_LOG_INFO,
-                 "CWelsH264SVCEncoder::SetOption():ENCODER_OPTION_DATAFORMAT, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n",
-                 m_uiCountFrameNum, m_iCspInternal);
-#endif//REC_FRAME_COUNT
-    }
-    break;
-    case ENCODER_OPTION_IDR_INTERVAL: {	// IDR Interval, pOption: int32_t*
-        int32_t iValue = * ((int32_t*)pOption);
-#ifdef REC_FRAME_COUNT
-        WelsLog (m_pEncContext, WELS_LOG_INFO,
-                 "CWelsH264SVCEncoder::SetOption():ENCODER_OPTION_IDR_INTERVAL, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x, iValue= %d\n",
-                 m_uiCountFrameNum, m_iCspInternal, iValue);
-#endif//REC_FRAME_COUNT
-
-        // 0 and anything below -1 fall back to 1; -1 is passed through
-        if (iValue < -1 || iValue == 0)
-            iValue = 1;
-        if (iValue == (int32_t)m_pEncContext->pSvcParam->uiIntraPeriod) {
-            return cmResultSuccess;
-        }
-
-        m_pEncContext->pSvcParam->uiIntraPeriod = (uint32_t)iValue;
-    }
-    break;
-    case ENCODER_OPTION_SVC_ENCODE_PARAM: {	// SVC Encoding Parameter, pOption: SVCEncodingParam*
-        SVCEncodingParam    sEncodingParam;
-        SWelsSvcCodingParam sConfig (true);
-        int32_t iInputColorspace = 0;
-        int32_t iTargetWidth     = 0;
-        int32_t iTargetHeight    = 0;
-
-        memcpy (&sEncodingParam, pOption, sizeof (SVCEncodingParam));	// confirmed_safe_unsafe_usage
-        WelsLog (m_pEncContext, WELS_LOG_INFO,
-                 "ENCODER_OPTION_SVC_ENCODE_PARAM, sEncodingParam.iInputCsp= 0x%x\n", sEncodingParam.iInputCsp);
-        WelsLog (m_pEncContext, WELS_LOG_INFO,
-                 "coding_param->iPicWidth= %d;coding_param->iPicHeight= %d;coding_param->iTargetBitrate= %d;coding_param->iRCMode= %d;coding_param->iPaddingFlag= %d;coding_param->iTemporalLayerNum= %d;coding_param->iSpatialLayerNum= %d;coding_param->fFrameRate= %.6ff;coding_param->iInputCsp= %d;coding_param->iKeyPicCodingMode= %d;coding_param->uiIntraPeriod= %d;coding_param->bEnableSpsPpsIdAddition = %d;coding_param->bPrefixNalAddingCtrl = %d;coding_param->bEnableDenoise= %d;coding_param->bEnableBackgroundDetection= %d;coding_param->bEnableAdaptiveQuant= %d;coding_param->bEnableCropPic= %d;coding_param->bEnableLongTermReference= %d;coding_param->iLtrMarkPeriod= %d;\n",
-                 sEncodingParam.iPicWidth,
-                 sEncodingParam.iPicHeight,
-                 sEncodingParam.iTargetBitrate,
-                 sEncodingParam.iRCMode,
-                 sEncodingParam.iPaddingFlag,
-                 sEncodingParam.iTemporalLayerNum,
-                 sEncodingParam.iSpatialLayerNum,
-                 sEncodingParam.fFrameRate,
-                 sEncodingParam.iInputCsp,
-                 sEncodingParam.iKeyPicCodingMode,
-                 sEncodingParam.iIntraPeriod,
-                 sEncodingParam.bEnableSpsPpsIdAddition,
-                 sEncodingParam.bPrefixNalAddingCtrl,
-                 sEncodingParam.bEnableDenoise,
-                 sEncodingParam.bEnableBackgroundDetection,
-                 sEncodingParam.bEnableAdaptiveQuant,
-                 sEncodingParam.bEnableCropPic,
-                 sEncodingParam.bEnableLongTermReference,
-                 sEncodingParam.iLtrMarkPeriod);
-
-        // Verify the number of spatial layers BEFORE it is used as an index
-        // into sSpatialLayers[] (per-layer logging and the switch detection
-        // below both depend on it).
-        if (sEncodingParam.iSpatialLayerNum < 1 || sEncodingParam.iSpatialLayerNum > MAX_SPATIAL_LAYER_NUM) {
-            return cmInitParaError;
-        }
-
-        for (int32_t i = 0; i < sEncodingParam.iSpatialLayerNum; ++ i) {
-            SSpatialLayerConfig* pSpatialCfg = &sEncodingParam.sSpatialLayers[i];
-            WelsLog (m_pEncContext, WELS_LOG_INFO,
-                     "coding_param->sSpatialLayers[%d]: .iVideoWidth= %d; .iVideoHeight= %d; .fFrameRate= %.6ff; .iQualityLayerNum= %d; .iSpatialBitrate= %d; .iCgsSnrRefined= %d; .iInterSpatialLayerPredFlag= %d; .sSliceCfg.uiSliceMode= %d; .sSliceCfg.sSliceArgument.iSliceNum= %d; .sSliceCfg.sSliceArgument.uiSliceSizeConstraint= %d;\n",
-                     i, pSpatialCfg->iVideoWidth,
-                     pSpatialCfg->iVideoHeight,
-                     pSpatialCfg->fFrameRate,
-                     pSpatialCfg->iQualityLayerNum,
-                     pSpatialCfg->iSpatialBitrate,
-                     pSpatialCfg->iCgsSnrRefined,
-                     pSpatialCfg->iInterSpatialLayerPredFlag,
-                     pSpatialCfg->sSliceCfg.uiSliceMode,
-                     pSpatialCfg->sSliceCfg.sSliceArgument.uiSliceNum,
-                     pSpatialCfg->sSliceCfg.sSliceArgument.uiSliceSizeConstraint
-                    );
-        }
-#ifdef OUTPUT_BIT_STREAM
-        // a change of the top layer width starts a new pair of dump files
-        if (sEncodingParam.sSpatialLayers[sEncodingParam.iSpatialLayerNum - 1].iVideoWidth !=
-                m_pEncContext->pSvcParam->sDependencyLayers[m_pEncContext->pSvcParam->iNumDependencyLayer - 1].iFrameWidth) {
-            ++ m_iSwitchTimes;
-            m_bSwitch = TRUE;
-        }
-#endif//OUTPUT_BIT_STREAM
-
-        iInputColorspace = sEncodingParam.iInputCsp;
-        if (sConfig.ParamTranscode (sEncodingParam, true)) {
-            return cmInitParaError;
-        }
-        if (sConfig.iNumDependencyLayer < 1) {
-            return cmInitParaError;
-        }
-        iTargetWidth  = sConfig.iActualPicWidth;
-        iTargetHeight = sConfig.iActualPicHeight;
-        if (NULL == m_pSrcPicList || m_pSrcPicList[0] == NULL) {
-            return cmInitParaError;
-        }
-        if (m_iCspInternal != iInputColorspace || m_iMaxPicWidth != iTargetWidth
-                || m_iMaxPicHeight != iTargetHeight) {	// color space or picture size changed
-            InitPic (m_pSrcPicList[0], iInputColorspace, iTargetWidth, iTargetHeight);
-            m_iMaxPicWidth  = iTargetWidth;
-            m_iMaxPicHeight = iTargetHeight;
-            m_iCspInternal  = iInputColorspace;
-        }
-#ifdef REC_FRAME_COUNT
-        WelsLog (m_pEncContext, WELS_LOG_INFO,
-                 "CWelsH264SVCEncoder::SetOption():ENCODER_OPTION_SVC_ENCODE_PARAM, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n",
-                 m_uiCountFrameNum, m_iCspInternal);
-#endif//REC_FRAME_COUNT
-
-        /* New configuration available here */
-        sConfig.iInputCsp = m_iCspInternal;	// I420 in default designed for presentation in encoder used internal
-        sConfig.DetermineTemporalSettings();
-
-        /* Check every field whether there is new request for memory block changed or else, Oct. 24, 2008 */
-        WelsEncoderParamAdjust (&m_pEncContext, &sConfig);
-    }
-    break;
-    case ENCODER_OPTION_FRAME_RATE: {	// Maximal input frame rate, pOption: float*
-        float fValue = * ((float*)pOption);
-#ifdef REC_FRAME_COUNT
-        // fValue is promoted to double in the variadic call, so it must be
-        // printed with %f (the former %d was undefined behaviour)
-        WelsLog (m_pEncContext, WELS_LOG_INFO,
-                 "CWelsH264SVCEncoder::SetOption():ENCODER_OPTION_FRAME_RATE, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x, iValue= %f\n",
-                 m_uiCountFrameNum, m_iCspInternal, fValue);
-#endif//REC_FRAME_COUNT
-        m_pEncContext->pSvcParam->fMaxFrameRate = fValue;
-    }
-    break;
-    case ENCODER_OPTION_iBitRate: {	// Target bit-rate, pOption: int32_t*
-        int32_t iValue = * ((int32_t*)pOption);
-#ifdef REC_FRAME_COUNT
-        WelsLog (m_pEncContext, WELS_LOG_INFO,
-                 "CWelsH264SVCEncoder::SetOption():ENCODER_OPTION_iBitRate, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x, iValue= %d\n",
-                 m_uiCountFrameNum, m_iCspInternal, iValue);
-#endif//REC_FRAME_COUNT
-        m_pEncContext->pSvcParam->iTargetBitrate = iValue;
-    }
-    break;
-    case ENCODER_OPTION_RC_MODE: {	// 0:quality mode;1:bit-rate mode
-        int32_t iValue = * ((int32_t*)pOption);
-        m_pEncContext->pSvcParam->iRCMode = iValue;
-    }
-    break;
-    case ENCODER_PADDING_PADDING: {	// 0:disable padding;1:padding
-        int32_t iValue = * ((int32_t*)pOption);
-        m_pEncContext->pSvcParam->iPaddingFlag = iValue;
-    }
-    break;
-    case ENCODER_LTR_RECOVERY_REQUEST: {	// pOption: SLTRRecoverRequest*
-        SLTRRecoverRequest* pLTR_Recover_Request = (SLTRRecoverRequest*) (pOption);
-        FilterLTRRecoveryRequest (m_pEncContext, pLTR_Recover_Request);
-    }
-    break;
-    case ENCODER_LTR_MARKING_FEEDBACK: {	// pOption: SLTRMarkingFeedback*
-        SLTRMarkingFeedback* fb = (SLTRMarkingFeedback*) (pOption);
-        FilterLTRMarkingFeedback (m_pEncContext, fb);
-    }
-    break;
-    case ENCOCER_LTR_MARKING_PERIOD: {	// pOption: uint32_t*
-        uint32_t iValue = * ((uint32_t*) (pOption));
-        m_pEncContext->pSvcParam->uiLtrMarkPeriod = iValue;
-    }
-    break;
-    case ENCODER_OPTION_LTR: {	// pOption: uint32_t*, non-zero enables long term reference
-        uint32_t iValue = * ((uint32_t*) (pOption));
-        m_pEncContext->pSvcParam->bEnableLongTermReference = iValue ? true : false;
-        WelsLog (m_pEncContext, WELS_LOG_WARNING, " CWelsH264SVCEncoder::SetOption enable LTR = %d",
-                 m_pEncContext->pSvcParam->bEnableLongTermReference);
-    }
-    break;
-    case ENCODER_OPTION_ENABLE_SSEI: {	// pOption: bool_t*
-        bool_t iValue = * ((bool_t*)pOption);
-        m_pEncContext->pSvcParam->bEnableSSEI = iValue;
-        WelsLog (m_pEncContext, WELS_LOG_INFO, " CWelsH264SVCEncoder::SetOption enable SSEI = %d \n",
-                 m_pEncContext->pSvcParam->bEnableSSEI);
-    }
-    break;
-    case ENCODER_OPTION_ENABLE_PREFIX_NAL_ADDING: {	// pOption: bool_t*
-        bool_t iValue = * ((bool_t*)pOption);
-        m_pEncContext->pSvcParam->bPrefixNalAddingCtrl = iValue;
-        WelsLog (m_pEncContext, WELS_LOG_INFO, " CWelsH264SVCEncoder::SetOption bPrefixNalAddingCtrl = %d \n",
-                 m_pEncContext->pSvcParam->bPrefixNalAddingCtrl);
-    }
-    break;
-    case ENCODER_OPTION_ENABLE_SPS_PPS_ID_ADDITION: {	// pOption: bool_t*
-        bool_t iValue = * ((bool_t*)pOption);
-
-        m_pEncContext->pSvcParam->bEnableSpsPpsIdAddition = iValue;
-        WelsLog (m_pEncContext, WELS_LOG_INFO, " CWelsH264SVCEncoder::SetOption enable SPS/PPS ID = %d \n",
-                 m_pEncContext->pSvcParam->bEnableSpsPpsIdAddition);
-    }
-    break;
-    case ENCODER_OPTION_CURRENT_PATH: {	// pOption: str_t*
-        if (m_pEncContext->pSvcParam != NULL) {
-            // only the pointer is stored; the string itself is not copied
-            str_t* path = static_cast<str_t*> (pOption);
-            m_pEncContext->pSvcParam->pCurPath = path;
-        }
-    }
-    break;
-    default:
-        return cmInitParaError;
-    }
-
-    return 0;
-}
-
-/*
- *	GetOption: read back a run-time option of an initialized encoder.
- *
- *	eOptionId - option selector
- *	pOption   - receives the value; the pointee type depends on eOptionId
- *
- *	return: 0 on success, cmInitParaError for a NULL pOption or an option
- *	        that cannot be queried, cmInitExpected when the encoder is not
- *	        initialized.
- */
-int CWelsH264SVCEncoder::GetOption (ENCODER_OPTION eOptionId, void* pOption) {
-    if (NULL == pOption)
-        return cmInitParaError;
-
-    if (NULL == m_pEncContext || FALSE == m_bInitialFlag)
-        return cmInitExpected;
-
-    switch (eOptionId) {
-    case ENCODER_OPTION_INTER_SPATIAL_PRED:	// Inter spatial layer prediction flag
-        WelsLog (m_pEncContext, WELS_LOG_INFO,
-                 "ENCODER_OPTION_INTER_SPATIAL_PRED, this feature not supported at present.\n");
-        break;
-
-    case ENCODER_OPTION_DATAFORMAT: {	// Input color space, written as int32_t
-#ifdef REC_FRAME_COUNT
-        WelsLog (m_pEncContext, WELS_LOG_INFO,
-                 "CWelsH264SVCEncoder::GetOption():ENCODER_OPTION_DATAFORMAT, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n",
-                 m_uiCountFrameNum, m_iCspInternal);
-#endif//REC_FRAME_COUNT
-        int32_t* pCspOut = (int32_t*)pOption;
-        *pCspOut = m_iCspInternal;
-        break;
-    }
-
-    case ENCODER_OPTION_IDR_INTERVAL: {	// IDR Interval, written as int32_t
-#ifdef REC_FRAME_COUNT
-        WelsLog (m_pEncContext, WELS_LOG_INFO,
-                 "CWelsH264SVCEncoder::GetOption():ENCODER_OPTION_IDR_INTERVAL, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n",
-                 m_uiCountFrameNum, m_iCspInternal);
-#endif//REC_FRAME_COUNT
-        int32_t* pIntervalOut = (int32_t*)pOption;
-        *pIntervalOut = m_pEncContext->pSvcParam->uiIntraPeriod;
-        break;
-    }
-
-    case ENCODER_OPTION_SVC_ENCODE_PARAM: {	// SVC Encoding Parameter
-#ifdef REC_FRAME_COUNT
-        WelsLog (m_pEncContext, WELS_LOG_INFO,
-                 "CWelsH264SVCEncoder::GetOption():ENCODER_OPTION_SVC_ENCODE_PARAM, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n",
-                 m_uiCountFrameNum, m_iCspInternal);
-#endif//REC_FRAME_COUNT
-        // NOTE(review): this copies sizeof(SWelsSvcCodingParam) bytes, while
-        // SetOption() reads an SVCEncodingParam for the same option id -
-        // confirm callers pass a buffer of the internal struct's size.
-        memcpy (pOption, m_pEncContext->pSvcParam, sizeof (SWelsSvcCodingParam));	// confirmed_safe_unsafe_usage
-        break;
-    }
-
-    case ENCODER_OPTION_FRAME_RATE: {	// Maximal input frame rate, written as float
-#ifdef REC_FRAME_COUNT
-        WelsLog (m_pEncContext, WELS_LOG_INFO,
-                 "CWelsH264SVCEncoder::GetOption():ENCODER_OPTION_FRAME_RATE, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n",
-                 m_uiCountFrameNum, m_iCspInternal);
-#endif//REC_FRAME_COUNT
-        float* pRateOut = (float*)pOption;
-        *pRateOut = m_pEncContext->pSvcParam->fMaxFrameRate;
-        break;
-    }
-
-    case ENCODER_OPTION_iBitRate: {	// Target bit-rate, written as int32_t
-#ifdef REC_FRAME_COUNT
-        WelsLog (m_pEncContext, WELS_LOG_INFO,
-                 "CWelsH264SVCEncoder::GetOption():ENCODER_OPTION_iBitRate, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n",
-                 m_uiCountFrameNum, m_iCspInternal);
-#endif//REC_FRAME_COUNT
-        int32_t* pBitrateOut = (int32_t*)pOption;
-        *pBitrateOut = m_pEncContext->pSvcParam->iTargetBitrate;
-        break;
-    }
-
-    default:
-        return cmInitParaError;
-    }
-
-    return 0;
-}
-
-void CWelsH264SVCEncoder::DumpSrcPicture(const uint8_t *pSrc)
-{
-#ifdef DUMP_SRC_PICTURE
-	FILE *pFile = NULL;
-	str_t strFileName[256] = {0};
-	const int32_t iDataLength = m_iMaxPicWidth * m_iMaxPicHeight;
-
-#if defined(__GNUC__)
-	STRNCPY(strFileName, 256, "/tmp/pic_in_", STRNLEN("/tmp/pic_in_", 255));	// confirmed_safe_unsafe_usage
-#else
-	STRNCPY(strFileName, 256, "d:\\incoming\\mosaic_st\\pic_in_", STRNLEN("d:\\incoming\\mosaic_st\\pic_in_", 255));	// confirmed_safe_unsafe_usage
-#endif//__GNUC__
-    
-	if ( m_iMaxPicWidth == 640 )
-	{
-		STRCAT(strFileName, 256, "360p.");	// confirmed_safe_unsafe_usage
-	}
-	else if ( m_iMaxPicWidth == 320  )
-	{
-		STRCAT(strFileName, 256, "180p.");	// confirmed_safe_unsafe_usage
-	}
-	else if ( m_iMaxPicWidth == 160 )
-	{
-		STRCAT(strFileName, 256, "90p.");	// confirmed_safe_unsafe_usage
-	}		
-    
-	switch( m_iCspInternal) {
-		case videoFormatI420:
-		case videoFormatYV12:
-			STRCAT(strFileName, 256, "yuv");	// confirmed_safe_unsafe_usage
-#if defined(__GNUC__)
-			pFile = FOPEN(strFileName, "ab+");
-#else
-#if defined(_MSC_VER)
-#if _MSC_VER>=1500
-			FOPEN(&pFile, strFileName, "ab+");
-#else
-			pFile = FOPEN(strFileName, "ab+");
-#endif//_MSC_VER>=1500
-#endif//_MSC_VER			
-#endif//__GNUC__
-			//				WelsLog( m_pEncContext, WELS_LOG_INFO, "WELS_CSP_I420, m_iCspInternal= 0x%x\n", m_iCspInternal);
-			if (NULL != pFile)
-			{			
-				fwrite( pSrc, sizeof(uint8_t), (iDataLength * 3)>>1, pFile );
-				fflush( pFile );
-				fclose(pFile);
-			}
-			break;
-		case videoFormatRGB:
-			STRCAT(strFileName, 256, "rgb");	// confirmed_safe_unsafe_usage
-#if defined(__GNUC__)
-			pFile = FOPEN(strFileName, "ab+");
-#else
-#if defined(_MSC_VER)
-#if _MSC_VER>=1500
-			FOPEN(&pFile, strFileName, "ab+");
-#else
-			pFile = FOPEN(strFileName, "ab+");
-#endif//_MSC_VER>=1500
-#endif//_MSC_VER			
-#endif//__GNUC__
-			if ( NULL != pFile )
-			{			
-				fwrite( pSrc, sizeof(uint8_t), iDataLength * 3, pFile );
-				fflush( pFile );
-				fclose( pFile );
-			}
-		case videoFormatBGR:
-			STRCAT(strFileName, 256, "bgr");	// confirmed_safe_unsafe_usage
-#if defined(__GNUC__)
-			pFile = FOPEN(strFileName, "ab+");
-#else
-#if defined(_MSC_VER)
-#if _MSC_VER>=1500
-			FOPEN(&pFile, strFileName, "ab+");
-#else
-			pFile = FOPEN(strFileName, "ab+");
-#endif//_MSC_VER>=1500
-#endif//_MSC_VER			
-#endif//__GNUC__
-			//				WelsLog( m_pEncContext, WELS_LOG_INFO, "WELS_CSP_BGR, m_iCspInternal= 0x%x\n", m_iCspInternal);
-			if ( NULL != pFile )
-			{
-				fwrite( pSrc, sizeof(uint8_t), iDataLength * 3, pFile );
-				fflush( pFile );
-				fclose( pFile );
-			}			
-			break;
-		case videoFormatYUY2:
-			STRCAT(strFileName, 256, "yuy2");	// confirmed_safe_unsafe_usage
-#if defined(__GNUC__)
-			pFile = FOPEN(strFileName, "ab+");
-#else
-#if defined(_MSC_VER)
-#if _MSC_VER>=1500
-			FOPEN(&pFile, strFileName, "ab+");
-#else
-			pFile = FOPEN(strFileName, "ab+");
-#endif//_MSC_VER>=1500
-#endif//_MSC_VER			
-#endif//__GNUC__
-			if ( NULL != pFile )
-			{
-				fwrite( pSrc, sizeof(uint8_t), (CALC_BI_STRIDE(m_iMaxPicWidth,  16)) * m_iMaxPicHeight, pFile );
-				fflush( pFile );
-				fclose( pFile );
-			}			
-			break;
-		default:
-			WelsLog( m_pEncContext, WELS_LOG_INFO, "Exclusive case, m_iCspInternal= 0x%x\n", m_iCspInternal);
-			break;
-	}
-#endif//DUMP_SRC_PICTURE
-	return;
-}
-}
-
-using namespace WelsSVCEnc;
-
-int32_t CreateSVCEncoder(ISVCEncoder** ppEncoder)
-{
-	assert( ppEncoder );
-
-	if ( NULL == ppEncoder )
-		return 1;	
-
-	if( ( *ppEncoder = new CWelsH264SVCEncoder() ) != NULL )
-	{		
-		return 0;
-	}
-
-	return 1;
-}
-
-void DestroySVCEncoder(ISVCEncoder* pEncoder)
-{
-	CWelsH264SVCEncoder *pSVCEncoder = (CWelsH264SVCEncoder*)pEncoder;
-
-	if( pSVCEncoder ){		
-		delete pSVCEncoder;
-		pSVCEncoder = NULL;
-	}
-}
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <assert.h>
+#include "welsEncoderExt.h"
+#include "welsCodecTrace.h"
+#include "typedefs.h"
+#include "wels_const.h"
+#include "utils.h"
+#include "macros.h"
+
+#include "crt_util_safe_x.h"	// Safe CRT routines like util for cross platforms
+#include "ref_list_mgr_svc.h"
+
+#include <time.h>
+#if defined(WIN32) /*&& defined(_DEBUG)*/
+
+#include <windows.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <sys/types.h>
+#include <sys/timeb.h>
+#else
+#include <sys/time.h>
+#endif
+
+namespace WelsSVCEnc {
+
+/*
+ *	CWelsH264SVCEncoder class implementation
+ */
+/*
+ *	Constructor: NULL/zero-initialises the members, optionally (OUTPUT_BIT_STREAM) opens the
+ *	time-stamped bitstream (.264) and length (.len) dump files, then calls InitEncoder().
+ */
+CWelsH264SVCEncoder::CWelsH264SVCEncoder()
+  :	m_pEncContext (NULL),
+#if defined(WIN32)||defined(_MACH_PLATFORM)||defined(__GNUC__)
+    m_pWelsTrace (NULL),
+#endif
+    m_pSrcPicList (NULL),
+    m_iSrcListSize (0),
+    m_iMaxPicWidth (0),
+    m_iMaxPicHeight (0),
+    m_iCspInternal (0),
+    m_bInitialFlag (FALSE) {
+#ifdef REC_FRAME_COUNT
+  m_uiCountFrameNum = 0;  // reset the member counter; declaring a local of the same name here would only shadow it
+#endif//REC_FRAME_COUNT
+
+#ifdef OUTPUT_BIT_STREAM
+  str_t strStreamFileName[1024] = { 0 };  //for .264
+  int32_t iBufferUsed = 0;
+  int32_t iBufferLeft = 1023;
+
+  str_t strLenFileName[1024] = { 0 }; //for .len
+  int32_t iBufferUsedSize = 0;
+  int32_t iBufferLeftSize = 1023;
+  time_t tTime;
+
+#if defined( WIN32 )
+#if defined(_MSC_VER)
+#if _MSC_VER>=1500
+  struct tm tTimeNow;
+#else
+  struct tm* tTimeNow;
+#endif//_MSC_VER>=1500
+#endif//_MSC_VER
+  struct _timeb tTimeb;
+
+  time (&tTime);
+#if defined(_MSC_VER)
+#if _MSC_VER>=1500
+  LOCALTIME (&tTimeNow, &tTime);
+#else
+  tTimeNow = LOCALTIME (&tTime);
+  if (NULL == tTimeNow)
+    return;
+#endif//_MSC_VER>=1500
+#endif//_MSC_VER
+  FTIME (&tTimeb);
+#elif defined( __GNUC__ )
+  struct tm* tTimeNow;
+  struct timeval tTimev;
+  time (&tTime);
+  tTimeNow = (struct tm*)localtime (&tTime);
+  gettimeofday (&tTimev, NULL);
+#endif//WIN32
+
+#ifdef WIN32
+#if defined(_MSC_VER)
+#if _MSC_VER>=1500
+  iBufferUsed      += SNPRINTF (strStreamFileName,      iBufferLeft, iBufferLeft,      "enc_bs_0x%p_", (void*)this);
+  iBufferUsedSize += SNPRINTF (strLenFileName, iBufferLeftSize, iBufferLeftSize, "enc_size_0x%p_", (void*)this);
+#else
+  iBufferUsed      += SNPRINTF (strStreamFileName,      iBufferLeft,      "enc_bs_0x%p_", (void*)this);
+  iBufferUsedSize += SNPRINTF (strLenFileName, iBufferLeftSize, "enc_size_0x%p_", (void*)this);
+#endif//_MSC_VER>=1500
+#endif//_MSC_VER
+#else
+  iBufferUsed      += SNPRINTF (strStreamFileName,      iBufferLeft,      "/tmp/enc_bs_0x%p_", (void*)this);
+  iBufferUsedSize += SNPRINTF (strLenFileName, iBufferLeftSize, "/tmp/enc_size_0x%p", (void*)this);
+#endif//WIN32
+
+  iBufferLeft -= iBufferUsed;
+  if (iBufferLeft > iBufferUsed) {
+#if defined(__GNUC__) && !defined(WIN32)  // same condition under which tTimeNow is declared as a pointer above
+    iBufferUsed += strftime (&strStreamFileName[iBufferUsed], iBufferLeft, "%y%m%d%H%M%S", tTimeNow);
+#else
+#if defined(_MSC_VER)
+    iBufferUsed += strftime (&strStreamFileName[iBufferUsed], iBufferLeft, "%y%m%d%H%M%S",
+#if _MSC_VER>=1500
+                             & tTimeNow
+#else
+                             tTimeNow
+#endif//_MSC_VER>=1500
+                            );
+#endif//_MSC_VER
+#endif//__GNUC__
+    iBufferLeft -= iBufferUsed;
+  }
+
+  iBufferLeftSize -= iBufferUsedSize;
+  if (iBufferLeftSize > iBufferUsedSize) {
+#if defined(__GNUC__) && !defined(WIN32)  // same condition under which tTimeNow is declared as a pointer above
+    iBufferUsedSize += strftime (&strLenFileName[iBufferUsedSize], iBufferLeftSize, "%y%m%d%H%M%S", tTimeNow);
+#else
+#if defined(_MSC_VER)
+    iBufferUsedSize += strftime (&strLenFileName[iBufferUsedSize], iBufferLeftSize, "%y%m%d%H%M%S",
+#if _MSC_VER>=1500
+                                 & tTimeNow
+#else
+                                 tTimeNow
+#endif//_MSC_VER>=1500
+                                );
+#endif//_MSC_VER
+#endif//__GNUC__
+    iBufferLeftSize -= iBufferUsedSize;
+  }
+
+  if (iBufferLeft > iBufferUsed) {
+#ifdef WIN32
+#if defined(_MSC_VER)
+#if _MSC_VER>=1500
+    iBufferUsed += SNPRINTF (&strStreamFileName[iBufferUsed], iBufferLeft, iBufferLeft, ".%03.3u.264", tTimeb.millitm);
+#else
+    iBufferUsed += SNPRINTF (&strStreamFileName[iBufferUsed], iBufferLeft, ".%03.3u.264", tTimeb.millitm);
+#endif//_MSC_VER>=1500
+#endif//_MSC_VER
+#else
+    iBufferUsed += SNPRINTF (&strStreamFileName[iBufferUsed], iBufferLeft, ".%03.3u.264", (unsigned int) (tTimev.tv_usec / 1000));  // cast: %u expects unsigned int
+#endif//WIN32
+    iBufferLeft -= iBufferUsed;
+  }
+
+  if (iBufferLeftSize > iBufferUsedSize) {
+#ifdef WIN32
+#if defined(_MSC_VER)
+#if _MSC_VER>=1500
+    iBufferUsedSize += SNPRINTF (&strLenFileName[iBufferUsedSize], iBufferLeftSize, iBufferLeftSize, ".%03.3u.len",
+                                 tTimeb.millitm);
+#else
+    iBufferUsedSize += SNPRINTF (&strLenFileName[iBufferUsedSize], iBufferLeftSize, ".%03.3u.len", tTimeb.millitm);
+#endif//_MSC_VER>=1500
+#endif//_MSC_VER
+#else
+    iBufferUsedSize += SNPRINTF (&strLenFileName[iBufferUsedSize], iBufferLeftSize, ".%03.3u.len", (unsigned int) (tTimev.tv_usec / 1000));  // cast: %u expects unsigned int
+#endif//WIN32
+    iBufferLeftSize -= iBufferUsedSize;
+  }
+
+#if defined(__GNUC__)
+  m_pFileBs       = FOPEN (strStreamFileName,      "wb");
+  m_pFileBsSize	= FOPEN (strLenFileName, "wb");
+#else
+#if defined(_MSC_VER)
+#if _MSC_VER>=1500
+  FOPEN (&m_pFileBs, strStreamFileName,      "wb");
+  FOPEN (&m_pFileBsSize, strLenFileName, "wb");
+#else
+  m_pFileBs       = FOPEN (strStreamFileName,      "wb");
+  m_pFileBsSize	= FOPEN (strLenFileName, "wb");
+#endif//_MSC_VER>=1500
+#endif//_MSC_VER
+#endif//__GNUC__
+
+  m_bSwitch	= FALSE;
+  m_iSwitchTimes	= 0;
+#endif//OUTPUT_BIT_STREAM
+
+  InitEncoder();
+}
+
+CWelsH264SVCEncoder::~CWelsH264SVCEncoder() {  // logs, frees the tracer, closes any dump files, then calls Uninitialize()
+  WelsLog (NULL, WELS_LOG_INFO, "CWelsH264SVCEncoder::~CWelsH264SVCEncoder()\n");
+#if defined(WIN32)||defined(_MACH_PLATFORM)||defined(__GNUC__)
+  // Release the trace object that InitEncoder() allocates under the same platform guard.
+  if (m_pWelsTrace != NULL) {
+    delete m_pWelsTrace;
+    m_pWelsTrace = NULL;
+  }
+#endif
+#ifdef REC_FRAME_COUNT
+  WelsLog (m_pEncContext, WELS_LOG_INFO,
+           "CWelsH264SVCEncoder::~CWelsH264SVCEncoder(), m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n", m_uiCountFrameNum,
+           m_iCspInternal);
+#endif
+
+#ifdef REC_FRAME_COUNT
+  m_uiCountFrameNum = 0;
+#endif//REC_FRAME_COUNT
+
+#ifdef OUTPUT_BIT_STREAM
+  if (m_pFileBs) {  // bitstream dump file, if one is open
+    fclose (m_pFileBs);
+    m_pFileBs = NULL;
+  }
+  if (m_pFileBsSize) {  // per-frame length dump file, if one is open
+    fclose (m_pFileBsSize);
+    m_pFileBsSize = NULL;
+  }
+  m_bSwitch	= FALSE;
+  m_iSwitchTimes	= 0;
+#endif//OUTPUT_BIT_STREAM
+
+  Uninitialize();  // returns immediately unless m_bInitialFlag is set
+}
+
+void CWelsH264SVCEncoder::InitEncoder (void) {  // called from the constructor; body is empty outside the platform guard below
+#if defined(WIN32)||defined(_MACH_PLATFORM)||defined(__GNUC__)
+  // Creates the trace object and, when its trace module exists, routes the log callback to it.
+#ifdef REC_FRAME_COUNT
+  WelsLog (m_pEncContext, WELS_LOG_INFO,
+           "CWelsH264SVCEncoder::InitEncoder, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n", m_uiCountFrameNum, m_iCspInternal);
+#endif
+
+  m_pWelsTrace	= new welsCodecTrace();
+  if (m_pWelsTrace != NULL) {
+    const int32_t iWelsTraceExistingFlag = m_pWelsTrace->WelsTraceModuleIsExist();
+    if (iWelsTraceExistingFlag) {  // only hook the callback when the trace module reports it exists
+      m_pWelsTrace->SetTraceLevel (WELS_LOG_DEFAULT);
+      WelsSetLogCallback (welsCodecTrace::CODEC_TRACE);
+    }
+  }
+
+  // initialization
+  WelsSetLogLevel (WELS_LOG_DEFAULT);	// default level; the original note suggests WELS_LOG_QUIET for no output
+#endif
+}
+
+/* Interfaces override from ISVCEncoder */
+
+/*
+ *	SVC Encoder Initialization (parameter based): copies *argv, transcodes the copy with sConfig.ParamTranscode() and forwards to the config-based Initialize(); returns cmInitParaError on invalid arguments or failed transcoding.
+ */
+int CWelsH264SVCEncoder::Initialize (SVCEncodingParam* argv, const INIT_TYPE iInitType) {
+  if (INIT_TYPE_PARAMETER_BASED != iInitType || NULL == argv) {
+    WelsLog (m_pEncContext, WELS_LOG_ERROR, "CWelsH264SVCEncoder::Initialize(), invalid iInitType= %d, argv= 0x%p\n",
+             iInitType, (void*)argv);
+    return cmInitParaError;
+  }
+
+  if (m_bInitialFlag) {  // already initialized: tear down first, then initialize again
+    WelsLog (m_pEncContext, WELS_LOG_WARNING, "CWelsH264SVCEncoder::Initialize(), reinitialize, m_bInitialFlag= %d\n",
+             m_bInitialFlag);
+    Uninitialize();
+  }
+
+  SVCEncodingParam		sEncodingParam;
+  SWelsSvcCodingParam	sConfig (true);
+  // Work on a private copy so the caller's structure is never modified.
+  memcpy (&sEncodingParam, argv, sizeof (SVCEncodingParam));	// confirmed_safe_unsafe_usage
+
+#ifdef REC_FRAME_COUNT
+  WelsLog (m_pEncContext, WELS_LOG_INFO, "CWelsH264SVCEncoder::Initialize, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n",
+           m_uiCountFrameNum, m_iCspInternal);
+  WelsLog (m_pEncContext, WELS_LOG_INFO,
+           "coding_param->iPicWidth= %d;coding_param->iPicHeight= %d;coding_param->iTargetBitrate= %d;coding_param->iRCMode= %d;coding_param->iTemporalLayerNum= %d;coding_param->iSpatialLayerNum= %d;coding_param->fFrameRate= %.6ff;coding_param->iInputCsp= %d;coding_param->iKeyPicCodingMode= %d;coding_param->uiIntraPeriod= %d;coding_param->bEnableSpsPpsIdAddition = %d;coding_param->bPrefixNalAddingCtrl = %d;coding_param->bEnableDenoise= %d;coding_param->bEnableBackgroundDetection= %d;coding_param->bEnableAdaptiveQuant= %d;coding_param->bEnableCropPic= %d;coding_param->bEnableLongTermReference= %d;coding_param->iLtrMarkPeriod= %d;\n",
+           sEncodingParam.iPicWidth,
+           sEncodingParam.iPicHeight,
+           sEncodingParam.iTargetBitrate,
+           sEncodingParam.iRCMode,
+           sEncodingParam.iTemporalLayerNum,
+           sEncodingParam.iSpatialLayerNum,
+           sEncodingParam.fFrameRate,
+           sEncodingParam.iInputCsp,
+           sEncodingParam.iKeyPicCodingMode,
+           sEncodingParam.iIntraPeriod,
+           sEncodingParam.bEnableSpsPpsIdAddition,
+           sEncodingParam.bPrefixNalAddingCtrl,
+           sEncodingParam.bEnableDenoise,
+           sEncodingParam.bEnableBackgroundDetection,
+           sEncodingParam.bEnableAdaptiveQuant,
+           sEncodingParam.bEnableCropPic,
+           sEncodingParam.bEnableLongTermReference,
+           sEncodingParam.iLtrMarkPeriod);
+  int32_t i = 0;
+  while (i < sEncodingParam.iSpatialLayerNum) {  // one log line per configured spatial layer
+    SSpatialLayerConfig* spatial_cfg = &sEncodingParam.sSpatialLayers[i];
+    WelsLog (m_pEncContext, WELS_LOG_INFO,
+             "coding_param->sSpatialLayers[%d]: .iVideoWidth= %d; .iVideoHeight= %d; .fFrameRate= %.6ff; .iQualityLayerNum= %d; .iSpatialBitrate= %d; .iCgsSnrRefined= %d; .iInterSpatialLayerPredFlag= %d; .sSliceCfg.uiSliceMode= %d; .sSliceCfg.sSliceArgument.uiSliceNum= %d; .sSliceCfg.sSliceArgument.uiSliceSizeConstraint= %d;\n",
+             i, spatial_cfg->iVideoWidth,
+             spatial_cfg->iVideoHeight,
+             spatial_cfg->fFrameRate,
+             spatial_cfg->iQualityLayerNum,
+             spatial_cfg->iSpatialBitrate,
+             spatial_cfg->iCgsSnrRefined,
+             spatial_cfg->iInterSpatialLayerPredFlag,
+             spatial_cfg->sSliceCfg.uiSliceMode,
+             spatial_cfg->sSliceCfg.sSliceArgument.uiSliceNum,
+             spatial_cfg->sSliceCfg.sSliceArgument.uiSliceSizeConstraint
+            );
+    ++ i;
+  }
+#endif//REC_FRAME_COUNT
+
+  // Convert SVCEncodingParam into WelsSVCParamConfig here..
+  if (sConfig.ParamTranscode (sEncodingParam, true)) {  // a non-zero result is treated as failure
+    WelsLog (m_pEncContext, WELS_LOG_ERROR, "CWelsH264SVCEncoder::Initialize(), parameter_translation failed.\n");
+    Uninitialize();
+    return cmInitParaError;
+  }
+
+  m_iSrcListSize  = 1;
+  // The config-based overload below assigns m_iSrcListSize again once the layer count is validated.
+  return Initialize ((void*)&sConfig, INIT_TYPE_CONFIG_BASED);
+}
+
+/*
+ *	SVC Encoder Initialization (config based): validates and normalises *argv (an SWelsSvcCodingParam), allocates the source picture list and creates the encoder context; returns cmResultSuccess, cmInitParaError or cmMallocMemeError.
+ */
+int CWelsH264SVCEncoder::Initialize (void* argv, const INIT_TYPE iInitType) {
+  if (INIT_TYPE_CONFIG_BASED != iInitType || NULL == argv) {
+    WelsLog (m_pEncContext, WELS_LOG_ERROR, "CWelsH264SVCEncoder::Initialize(), invalid iInitType= %d, argv= 0x%p.\n",
+             iInitType, (void*)argv);
+    return cmInitParaError;
+  }
+
+  if (m_bInitialFlag) {
+    WelsLog (m_pEncContext, WELS_LOG_WARNING, "CWelsH264SVCEncoder::Initialize(), reinitialize, m_bInitialFlag= %d.\n",
+             m_bInitialFlag);
+    Uninitialize();
+  }
+
+  SWelsSvcCodingParam*  pCfg = static_cast<SWelsSvcCodingParam*> (argv);
+
+  const int32_t iColorspace = pCfg->iInputCsp;
+  if (0 == iColorspace) {
+    WelsLog (m_pEncContext, WELS_LOG_ERROR, "CWelsH264SVCEncoder::Initialize(), invalid iInputCsp= %d.\n", iColorspace);
+    Uninitialize();
+    return cmInitParaError;
+  }
+
+  // Check valid parameters
+  const int32_t iNumOfLayers = pCfg->iNumDependencyLayer;
+  if (iNumOfLayers < 1 || iNumOfLayers > MAX_DEPENDENCY_LAYER) {
+    WelsLog (m_pEncContext, WELS_LOG_ERROR,
+             "CWelsH264SVCEncoder::Initialize(), invalid iNumDependencyLayer= %d, valid at range of [1, %d].\n", iNumOfLayers,
+             MAX_DEPENDENCY_LAYER);
+    Uninitialize();
+    return cmInitParaError;
+  }
+  if (pCfg->iNumTemporalLayer < 1)
+    pCfg->iNumTemporalLayer	= 1;
+  if (pCfg->iNumTemporalLayer > MAX_TEMPORAL_LEVEL) {
+    WelsLog (m_pEncContext, WELS_LOG_ERROR,
+             "CWelsH264SVCEncoder::Initialize(), invalid iNumTemporalLayer= %d, valid at range of [1, %d].\n",
+             pCfg->iNumTemporalLayer, MAX_TEMPORAL_LEVEL);
+    Uninitialize();
+    return cmInitParaError;
+  }
+
+  if (pCfg->uiGopSize < 1 || pCfg->uiGopSize > MAX_GOP_SIZE) {
+    WelsLog (m_pEncContext, WELS_LOG_ERROR,
+             "CWelsH264SVCEncoder::Initialize(), invalid uiGopSize= %d, valid at range of [1, %d].\n", pCfg->uiGopSize,
+             MAX_GOP_SIZE);
+    Uninitialize();
+    return cmInitParaError;
+  }
+
+  if (!WELS_POWER2_IF (pCfg->uiGopSize)) {
+    WelsLog (m_pEncContext, WELS_LOG_ERROR,
+             "CWelsH264SVCEncoder::Initialize(), invalid uiGopSize= %d, valid at range of [1, %d] and yield to power of 2.\n",
+             pCfg->uiGopSize, MAX_GOP_SIZE);
+    Uninitialize();
+    return cmInitParaError;
+  }
+
+  if (pCfg->uiIntraPeriod && pCfg->uiIntraPeriod < pCfg->uiGopSize) {
+    WelsLog (m_pEncContext, WELS_LOG_ERROR,
+             "CWelsH264SVCEncoder::Initialize(), invalid uiIntraPeriod= %d, valid in case it equals to 0 for unlimited intra period or exceeds specified uiGopSize= %d.\n",
+             pCfg->uiIntraPeriod, pCfg->uiGopSize);
+    Uninitialize();
+    return cmInitParaError;
+  }
+
+  if ((pCfg->uiIntraPeriod && (pCfg->uiIntraPeriod & (pCfg->uiGopSize - 1)) != 0)) {  // GOP size is a power of 2, so the mask tests "multiple of"
+    WelsLog (m_pEncContext, WELS_LOG_ERROR,
+             "CWelsH264SVCEncoder::Initialize(), invalid uiIntraPeriod= %d, valid in case it equals to 0 for unlimited intra period or exceeds specified uiGopSize= %d also multiple of it.\n",
+             pCfg->uiIntraPeriod, pCfg->uiGopSize);
+    Uninitialize();
+    return cmInitParaError;
+  }
+
+  // Fine tune num_ref_num
+  if (pCfg->bEnableLongTermReference) {
+    pCfg->iLTRRefNum = LONG_TERM_REF_NUM;
+  } else {
+    pCfg->iLTRRefNum = 0;
+  }
+  pCfg->iNumRefFrame = ((pCfg->uiGopSize >> 1) > 1) ? ((pCfg->uiGopSize >> 1) + pCfg->iLTRRefNum) :
+                       (MIN_REF_PIC_COUNT + pCfg->iLTRRefNum);
+
+  pCfg->iNumRefFrame = WELS_CLIP3 (pCfg->iNumRefFrame, MIN_REF_PIC_COUNT, MAX_REFERENCE_PICTURE_COUNT_NUM);
+
+  if (pCfg->uiLtrMarkPeriod == 0) {
+    pCfg->uiLtrMarkPeriod = 30;
+  }
+
+  const int32_t kiDecStages = WELS_LOG2 (pCfg->uiGopSize);
+  pCfg->iInputCsp			= iColorspace;
+  pCfg->iNumTemporalLayer	= (int8_t) (1 + kiDecStages);
+  pCfg->iLoopFilterAlphaC0Offset	= WELS_CLIP3 (pCfg->iLoopFilterAlphaC0Offset, -6, 6);
+  pCfg->iLoopFilterBetaOffset		= WELS_CLIP3 (pCfg->iLoopFilterBetaOffset, -6, 6);
+
+  // use the new/delete pair (not WelsMalloc/WelsFree) for the list and its pictures: encoder initialization has not started yet, so CacheLineSize (16 or 64) is not detected here
+  m_pSrcPicList	= new SSourcePicture* [iNumOfLayers];
+  if (NULL == m_pSrcPicList) {
+    WelsLog (m_pEncContext, WELS_LOG_ERROR,
+             "CWelsH264SVCEncoder::Initialize(), pOut of memory due m_pSrcPicList memory request.\n");
+    Uninitialize();
+    return cmMallocMemeError;
+  }
+
+  // decide property list size between INIT_TYPE_PARAMETER_BASED/INIT_TYPE_CONFIG_BASED
+  m_iMaxPicWidth	= pCfg->iActualPicWidth;
+  m_iMaxPicHeight	= pCfg->iActualPicHeight;
+  m_iSrcListSize  = iNumOfLayers;
+
+  for (int32_t i = 0; i < m_iSrcListSize; ++ i) {
+    m_pSrcPicList[i]	= new SSourcePicture;
+    if (NULL == m_pSrcPicList[i]) {
+      WelsLog (m_pEncContext, WELS_LOG_ERROR,
+               "CWelsH264SVCEncoder::Initialize(), pOut of memory due m_pSrcPicList[%d] memory request.\n", i);
+      for (int32_t j = 0; j < i; ++ j)  // Uninitialize() is a no-op while m_bInitialFlag is FALSE,
+        delete m_pSrcPicList[j];        // so release the partially built list here instead of leaking it
+      delete [] m_pSrcPicList;
+      m_pSrcPicList = NULL;
+      m_iSrcListSize = 0;
+      return cmMallocMemeError;
+    }
+    InitPic (m_pSrcPicList[i], iColorspace, m_iMaxPicWidth, m_iMaxPicHeight);
+  }
+
+#if defined(OUTPUT_BIT_STREAM) || defined(ENABLE_TRACE_FILE)
+  str_t fpath[MAX_FNAME_LEN] = {0};
+#if defined(__GNUC__)
+  SNPRINTF (fpath, MAX_FNAME_LEN, "/tmp/");		// confirmed_safe_unsafe_usage
+#else//__GNUC__
+#if defined (_MSC_VER)
+#if _MSC_VER>=1500
+  SNPRINTF (fpath, MAX_FNAME_LEN, MAX_FNAME_LEN, ".\\");		// confirmed_safe_unsafe_usage
+#else
+  SNPRINTF (fpath, MAX_FNAME_LEN, ".\\");		// confirmed_safe_unsafe_usage
+#endif//_MSC_VER>=1500
+#endif//_MSC_VER
+#endif //__GNUC__
+
+  strcpy (pCfg->sTracePath, fpath);		// confirmed_safe_unsafe_usage
+#endif //#if defined(OUTPUT_BIT_STREAM) || defined(ENABLE_TRACE_FILE)
+
+  if (WelsInitEncoderExt (&m_pEncContext, pCfg)) {
+    WelsLog (m_pEncContext, WELS_LOG_ERROR, "CWelsH264SVCEncoder::Initialize(), WelsInitEncoderExt failed.\n");
+    for (int32_t iIdx = 0; iIdx < m_iSrcListSize; ++ iIdx)  // Uninitialize() is a no-op while m_bInitialFlag is FALSE,
+      delete m_pSrcPicList[iIdx];                           // so free the picture list here instead of leaking it
+    delete [] m_pSrcPicList;
+    m_pSrcPicList = NULL;
+    m_iSrcListSize = 0;
+    return cmInitParaError;
+  }
+
+  m_iCspInternal	= iColorspace;
+  m_bInitialFlag  = TRUE;
+
+  return cmResultSuccess;
+}
+
+/*
+ *	SVC Encoder Uninitialization: frees the source picture list and the encoder context, then clears m_bInitialFlag; always returns 0.
+ */
+int32_t CWelsH264SVCEncoder::Uninitialize() {
+  if (!m_bInitialFlag) {  // nothing is released unless a previous Initialize() completed successfully
+    return 0;
+  }
+
+  WelsLog (m_pEncContext, WELS_LOG_INFO, "CWelsH264SVCEncoder::Uninitialize()..\n");
+
+#ifdef REC_FRAME_COUNT
+  WelsLog (m_pEncContext, WELS_LOG_INFO,
+           "CWelsH264SVCEncoder::Uninitialize, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n", m_uiCountFrameNum, m_iCspInternal);
+#endif//REC_FRAME_COUNT
+
+  if (NULL != m_pEncContext) {  // the picture list is only released together with a live encoder context
+    if (NULL != m_pSrcPicList) {
+      for (int32_t i = 0; i < m_iSrcListSize; i++) {
+        SSourcePicture* pic = m_pSrcPicList[i];
+        if (NULL != pic) {
+//					WelsFree( pic, "m_pSrcPicList[]" );
+          // use the new/delete pair: the list was allocated with new before encoder initialization started (CacheLineSize, 16 or 64, not yet detected)
+          delete pic;
+
+          pic = NULL;  // clears only the local copy; the array itself is deleted below
+        }
+      }
+//			WelsFree( m_pSrcPicList, "m_pSrcPicList" );
+      // use the new/delete pair here as well, for the same reason as for the individual pictures above
+      delete [] m_pSrcPicList;
+
+      m_pSrcPicList = NULL;
+      m_iSrcListSize = 0;
+    }
+
+    WelsUninitEncoderExt (&m_pEncContext);
+    m_pEncContext	= NULL;
+  }
+
+  m_bInitialFlag = FALSE;
+
+  return 0;
+}
+
+
+int32_t CWelsH264SVCEncoder::RawData2SrcPic (const uint8_t* pSrc) {  // points m_pSrcPicList[0] at pSrc; returns 0 on success, 1 for an unhandled colour format
+  assert (m_iSrcListSize > 0);
+  // Size in bytes of the first plane, computed from the maximum picture dimensions.
+  int32_t y_length = m_iMaxPicWidth * m_iMaxPicHeight;
+  m_pSrcPicList[0]->pData[0] = const_cast<uint8_t*> (pSrc);
+  // Ignore the vertical-flip bit when selecting the plane layout.
+  switch (m_iCspInternal & (~videoFormatVFlip)) {
+  case videoFormatYVYU:
+  case videoFormatUYVY:
+  case videoFormatYUY2:
+  case videoFormatRGB:
+  case videoFormatBGR:
+  case videoFormatBGRA:
+  case videoFormatRGBA:
+  case videoFormatARGB:
+  case videoFormatABGR:
+    m_pSrcPicList[0]->pData[1] = m_pSrcPicList[0]->pData[2] = NULL;  // these formats use pData[0] only
+    break;
+  case videoFormatI420:
+  case videoFormatYV12:
+    m_pSrcPicList[0]->pData[1] = m_pSrcPicList[0]->pData[0] + y_length;  // second plane directly follows the first
+    m_pSrcPicList[0]->pData[2] = m_pSrcPicList[0]->pData[1] + (y_length >> 2);  // third plane follows a quarter-size second plane
+    break;
+  default:
+    return 1;
+  }
+
+  return 0;
+}
+
+
+/*
+ *	SVC core encoding from one raw buffer: maps pSrc onto m_pSrcPicList[0] and encodes it through the picture-list overload; returns a videoFrameType* value (videoFrameTypeInvalid on failure).
+ */
+int CWelsH264SVCEncoder::EncodeFrame (const unsigned char* pSrc, SFrameBSInfo* pBsInfo) {
+  if (! (pSrc && m_pEncContext && m_bInitialFlag)) {  // needs a source buffer and an initialized encoder
+    return videoFrameTypeInvalid;
+  }
+
+  int32_t uiFrameType = videoFrameTypeInvalid;
+  // Encode only when the colour format could be mapped onto the source picture.
+  if (RawData2SrcPic ((uint8_t*)pSrc) == 0) {
+    uiFrameType = EncodeFrame (const_cast<const SSourcePicture**> (m_pSrcPicList), 1, pBsInfo);
+  }
+
+#ifdef REC_FRAME_COUNT
+  ++ m_uiCountFrameNum;
+  WelsLog (m_pEncContext, WELS_LOG_INFO,
+           "CWelsH264SVCEncoder::EncodeFrame(), m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n", m_uiCountFrameNum, m_iCspInternal);
+#endif//REC_FRAME_COUNT		
+
+#ifdef DUMP_SRC_PICTURE
+  DumpSrcPicture (pSrc);  // NOTE(review): the picture-list overload also dumps its first picture - confirm the double dump is intended
+#endif // DUMP_SRC_PICTURE	
+
+  return uiFrameType;
+}
+
+
+int CWelsH264SVCEncoder::EncodeFrame (const SSourcePicture**   pSrcPicList, int nSrcPicNum, SFrameBSInfo* pBsInfo) {  // encodes nSrcPicNum pictures; returns a videoFrameType* value
+  if (! (pSrcPicList && m_pEncContext && m_bInitialFlag)) {
+    return videoFrameTypeInvalid;
+  }
+
+  int32_t iFrameTypeReturned = 0;
+  int32_t iFrameType = videoFrameTypeInvalid;
+  // At least one source picture is required; anything else is a caller error.
+  if (nSrcPicNum > 0) {
+    iFrameTypeReturned = WelsEncoderEncodeExt (m_pEncContext, pBsInfo, pSrcPicList, nSrcPicNum);
+  } else {
+    assert (0);
+    return videoFrameTypeInvalid;
+  }
+  // Translate the WELS_FRAME_TYPE_* result into the videoFrameType* value returned to the caller.
+  switch (iFrameTypeReturned) {
+  case WELS_FRAME_TYPE_P:
+    iFrameType	= videoFrameTypeP;
+    break;
+  case WELS_FRAME_TYPE_IDR:
+    iFrameType	= videoFrameTypeIDR;
+    break;
+  case WELS_FRAME_TYPE_SKIP:
+    iFrameType	= videoFrameTypeSkip;
+    break;
+  case WELS_FRAME_TYPE_I:
+    iFrameType	= videoFrameTypeI;
+    break;
+  case WELS_FRAME_TYPE_AUTO:
+  case WELS_FRAME_TYPE_B: // not support B pictures
+    iFrameType	= videoFrameTypeInvalid;
+    break;
+  default:  // any other value leaves iFrameType at videoFrameTypeInvalid
+    break;
+  }
+
+
+
+  // Test aid: with OUTPUT_BIT_STREAM, write the encoded layers and the per-frame byte count to the dump files.
+#ifdef OUTPUT_BIT_STREAM
+  if (iFrameType != videoFrameTypeInvalid && iFrameType != videoFrameTypeSkip) {
+    SLayerBSInfo* pLayer = NULL;
+    int32_t i = 0, j = 0, iCurLayerBits = 0, total_bits = 0;
+
+    if (m_bSwitch) {  // a switch was requested: close the current dump files and open new ones
+      if (m_pFileBs) {
+        fclose (m_pFileBs);
+        m_pFileBs = NULL;
+      }
+      if (m_pFileBsSize) {
+        fclose (m_pFileBsSize);
+        m_pFileBsSize = NULL;
+      }
+      str_t strStreamFileName[128] = {0};
+#if defined(__GNUC__)
+
+      int32_t iLen = SNPRINTF (strStreamFileName, 128, "%sadj%d_w%d.264", m_pEncContext->sTracePath,  m_iSwitchTimes,
+                               m_pEncContext->pSvcParam->iActualPicWidth);
+      m_pFileBs = FOPEN (strStreamFileName, "wb");
+      SNPRINTF (strStreamFileName, 128, "%sadj%d_w%d_size.iLen", m_pEncContext->sTracePath, m_iSwitchTimes,
+                m_pEncContext->pSvcParam->iActualPicWidth);
+      m_pFileBsSize = FOPEN (strStreamFileName, "wb");
+
+#else//__GNUC__
+
+#if defined (_MSC_VER)
+#if _MSC_VER>=1500
+      int32_t iLen = SNPRINTF (strStreamFileName, 128, 128, "adj%d_w%d.264", m_iSwitchTimes,
+                               m_pEncContext->pSvcParam->iActualPicWidth);
+      FOPEN (&m_pFileBs, strStreamFileName, "wb");
+      SNPRINTF (strStreamFileName, 128, 128, "adj%d_w%d_size.iLen", m_iSwitchTimes,
+                m_pEncContext->pSvcParam->iActualPicWidth);
+      FOPEN (&m_pFileBsSize, strStreamFileName, "wb");
+#else
+      int32_t iLen = SNPRINTF (strStreamFileName, 128, "adj%d_w%d.264", m_iSwitchTimes,
+                               m_pEncContext->pSvcParam->iActualPicWidth);
+      m_pFileBs = FOPEN (strStreamFileName, "wb");
+      SNPRINTF (strStreamFileName, 128, "adj%d_w%d_size.iLen", m_iSwitchTimes, m_pEncContext->pSvcParam->iActualPicWidth);
+      m_pFileBsSize = FOPEN (strStreamFileName, "wb");
+#endif//_MSC_VER>=1500
+#endif//_MSC_VER
+
+
+#endif//__GNUC__
+
+      m_bSwitch = FALSE;
+    }
+
+    for (i = 0; i < pBsInfo->iLayerNum; i++) {
+      pLayer = &pBsInfo->sLayerInfo[i];
+
+      iCurLayerBits = 0;  // despite the name, this sums iNalLengthInByte, i.e. bytes
+      for (j = 0; j < pLayer->iNalCount; j++) {
+        iCurLayerBits += pLayer->iNalLengthInByte[j];
+      }
+      total_bits += iCurLayerBits;
+      if (m_pFileBs != NULL)
+        fwrite (pLayer->pBsBuf, 1, iCurLayerBits, m_pFileBs);
+    }
+
+    if (m_pFileBsSize != NULL)
+      fwrite (&total_bits, sizeof (int32_t), 1, m_pFileBsSize);  // one int32_t total per encoded frame
+  }
+#endif //OUTPUT_BIT_STREAM
+#ifdef DUMP_SRC_PICTURE
+  DumpSrcPicture (pSrcPicList[0]->pData[0]);
+#endif // DUMP_SRC_PICTURE	
+
+  return iFrameType;
+
+}
+
+/*
+ * Encodes kpSrc as a pause frame, requesting an IDR both before and after it. return: 0 - success; otherwise - failed;
+ */
+int CWelsH264SVCEncoder::PauseFrame (const unsigned char* kpSrc, SFrameBSInfo* pBsInfo) {
+  int32_t  iReturn = 1;
+  // Request an IDR before encoding the pause frame.
+  ForceIntraFrame (true);
+
+  if (EncodeFrame (kpSrc, pBsInfo) != videoFrameTypeInvalid) {  // any valid frame type counts as success
+    iReturn = 0;
+  }
+
+  // Request an IDR again after the pause frame (original note: avoids the pause
+  // frame bitstream and the normal bitstream using different video channels).
+  ForceIntraFrame (true);
+
+  return (int)iReturn;
+}
+
+
+/*
+ *	Force key frame: calls ForceCodingIDR() on the encoder context. Returns 0 on success, 1 when the encoder is not initialized.
+ */
+int CWelsH264SVCEncoder::ForceIntraFrame (bool bIDR) {  // bIDR is only logged here; ForceCodingIDR() is called regardless of its value
+  if (! (m_pEncContext && m_bInitialFlag)) {
+    return 1;
+  }
+
+#ifdef REC_FRAME_COUNT
+  WelsLog (m_pEncContext, WELS_LOG_INFO,
+           "CWelsH264SVCEncoder::ForceIntraFrame(), bIDR= %d, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n", bIDR,
+           m_uiCountFrameNum, m_iCspInternal);
+#endif//REC_FRAME_COUNT
+
+  ForceCodingIDR (m_pEncContext);
+
+  return 0;
+}
+
+/************************************************************************
+* InDataFormat, IDRInterval, SVC Encode Param, Frame Rate, Bitrate,..
+************************************************************************/
+/*
+ * Applies one run-time encoder option; pOption points at the new value, whose
+ * type depends on eOptionId (see the cast in each case).
+ * Returns cmInitParaError for a NULL pOption, an unhandled option id or a
+ * rejected value; cmInitExpected when the encoder context is missing or not
+ * initialised; otherwise 0 (cmResultSuccess if the IDR interval is unchanged).
+ */
+int CWelsH264SVCEncoder::SetOption (ENCODER_OPTION eOptionId, void* pOption) {
+  if (NULL == pOption) {
+    return cmInitParaError;
+  }
+
+  if (NULL == m_pEncContext || FALSE == m_bInitialFlag) {
+    return cmInitExpected;
+  }
+
+  switch (eOptionId) {
+  case ENCODER_OPTION_INTER_SPATIAL_PRED: {  // Inter spatial layer prediction flag
+    WelsLog (m_pEncContext, WELS_LOG_INFO, "ENCODER_OPTION_INTER_SPATIAL_PRED, this feature not supported at present.\n");
+  }
+  break;
+  case ENCODER_OPTION_DATAFORMAT: {  // Input color space
+    int32_t iValue = * ((int32_t*)pOption);
+    int32_t iColorspace = iValue;
+    if (iColorspace == 0) {
+      return cmInitParaError;
+    }
+
+#ifdef REC_FRAME_COUNT
+    WelsLog (m_pEncContext, WELS_LOG_INFO,
+             "CWelsH264SVCEncoder::SetOption():ENCODER_OPTION_DATAFORMAT, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x, iValue= %d\n",
+             m_uiCountFrameNum, m_iCspInternal, iValue);
+#endif//REC_FRAME_COUNT
+
+    // Visit each slot exactly once so the walk ends even if InitPic leaves iColorFormat unchanged.
+    for (int32_t iPicIdx = m_iSrcListSize - 1; iPicIdx >= 0; -- iPicIdx) {
+      if (m_pSrcPicList[iPicIdx] == NULL) {
+        if (iPicIdx == 0) return cmInitParaError;  // an empty first slot is rejected
+        continue;
+      }
+      if (m_pSrcPicList[iPicIdx]->iColorFormat != iColorspace) {
+        InitPic (m_pSrcPicList[iPicIdx], iColorspace, m_iMaxPicWidth, m_iMaxPicHeight);
+      }
+    }
+    m_iCspInternal = iColorspace;
+#ifdef REC_FRAME_COUNT
+    WelsLog (m_pEncContext, WELS_LOG_INFO,
+             "CWelsH264SVCEncoder::SetOption():ENCODER_OPTION_DATAFORMAT, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n",
+             m_uiCountFrameNum, m_iCspInternal);
+#endif//REC_FRAME_COUNT
+  }
+  break;
+  case ENCODER_OPTION_IDR_INTERVAL: {  // IDR Interval
+    int32_t iValue = * ((int32_t*)pOption);
+#ifdef REC_FRAME_COUNT
+    WelsLog (m_pEncContext, WELS_LOG_INFO,
+             "CWelsH264SVCEncoder::SetOption():ENCODER_OPTION_IDR_INTERVAL, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x, iValue= %d\n",
+             m_uiCountFrameNum, m_iCspInternal, iValue);
+#endif//REC_FRAME_COUNT
+
+    // Values below -1 and zero fall back to an interval of 1; -1 is stored as given.
+    if (iValue < -1 || iValue == 0)
+      iValue = 1;
+    if (iValue == (int32_t)m_pEncContext->pSvcParam->uiIntraPeriod) {
+      return cmResultSuccess;
+    }
+
+    m_pEncContext->pSvcParam->uiIntraPeriod = (uint32_t)iValue;
+  }
+  break;
+  case ENCODER_OPTION_SVC_ENCODE_PARAM: {  // SVC Encoding Parameter
+    SVCEncodingParam sEncodingParam;
+    SWelsSvcCodingParam sConfig (true);
+    int32_t iInputColorspace = 0;
+    int32_t iTargetWidth = 0;
+    int32_t iTargetHeight = 0;
+
+    memcpy (&sEncodingParam, pOption, sizeof (SVCEncodingParam));  // confirmed_safe_unsafe_usage
+    WelsLog (m_pEncContext, WELS_LOG_INFO, "ENCODER_OPTION_SVC_ENCODE_PARAM, sEncodingParam.iInputCsp= 0x%x\n",
+             sEncodingParam.iInputCsp);
+    WelsLog (m_pEncContext, WELS_LOG_INFO,
+             "coding_param->iPicWidth= %d;coding_param->iPicHeight= %d;coding_param->iTargetBitrate= %d;coding_param->iRCMode= %d;coding_param->iPaddingFlag= %d;coding_param->iTemporalLayerNum= %d;coding_param->iSpatialLayerNum= %d;coding_param->fFrameRate= %.6ff;coding_param->iInputCsp= %d;coding_param->iKeyPicCodingMode= %d;coding_param->uiIntraPeriod= %d;coding_param->bEnableSpsPpsIdAddition = %d;coding_param->bPrefixNalAddingCtrl = %d;coding_param->bEnableDenoise= %d;coding_param->bEnableBackgroundDetection= %d;coding_param->bEnableAdaptiveQuant= %d;coding_param->bEnableCropPic= %d;coding_param->bEnableLongTermReference= %d;coding_param->iLtrMarkPeriod= %d;\n",
+             sEncodingParam.iPicWidth,
+             sEncodingParam.iPicHeight,
+             sEncodingParam.iTargetBitrate,
+             sEncodingParam.iRCMode,
+             sEncodingParam.iPaddingFlag,
+             sEncodingParam.iTemporalLayerNum,
+             sEncodingParam.iSpatialLayerNum,
+             sEncodingParam.fFrameRate,
+             sEncodingParam.iInputCsp,
+             sEncodingParam.iKeyPicCodingMode,
+             sEncodingParam.iIntraPeriod,
+             sEncodingParam.bEnableSpsPpsIdAddition,
+             sEncodingParam.bPrefixNalAddingCtrl,
+             sEncodingParam.bEnableDenoise,
+             sEncodingParam.bEnableBackgroundDetection,
+             sEncodingParam.bEnableAdaptiveQuant,
+             sEncodingParam.bEnableCropPic,
+             sEncodingParam.bEnableLongTermReference,
+             sEncodingParam.iLtrMarkPeriod);
+    // Check the layer count before it is used as a bound/index into sSpatialLayers below;
+    // a count above MAX_SPATIAL_LAYER_NUM would otherwise walk past the layers
+    // (assumes the array holds MAX_SPATIAL_LAYER_NUM entries - TODO confirm).
+    if (sEncodingParam.iSpatialLayerNum < 1
+        || sEncodingParam.iSpatialLayerNum > MAX_SPATIAL_LAYER_NUM) {  // verify number of spatial layer
+      return cmInitParaError;
+    }
+    for (int32_t i = 0; i < sEncodingParam.iSpatialLayerNum; ++ i) {
+      SSpatialLayerConfig* pSpatialCfg = &sEncodingParam.sSpatialLayers[i];
+      WelsLog (m_pEncContext, WELS_LOG_INFO,
+               "coding_param->sSpatialLayers[%d]: .iVideoWidth= %d; .iVideoHeight= %d; .fFrameRate= %.6ff; .iQualityLayerNum= %d; .iSpatialBitrate= %d; .iCgsSnrRefined= %d; .iInterSpatialLayerPredFlag= %d; .sSliceCfg.uiSliceMode= %d; .sSliceCfg.sSliceArgument.iSliceNum= %d; .sSliceCfg.sSliceArgument.uiSliceSizeConstraint= %d;\n",
+               i, pSpatialCfg->iVideoWidth,
+               pSpatialCfg->iVideoHeight,
+               pSpatialCfg->fFrameRate,
+               pSpatialCfg->iQualityLayerNum,
+               pSpatialCfg->iSpatialBitrate,
+               pSpatialCfg->iCgsSnrRefined,
+               pSpatialCfg->iInterSpatialLayerPredFlag,
+               pSpatialCfg->sSliceCfg.uiSliceMode,
+               pSpatialCfg->sSliceCfg.sSliceArgument.uiSliceNum,
+               pSpatialCfg->sSliceCfg.sSliceArgument.uiSliceSizeConstraint
+              );
+    }
+#ifdef OUTPUT_BIT_STREAM
+    if (sEncodingParam.sSpatialLayers[sEncodingParam.iSpatialLayerNum - 1].iVideoWidth !=
+        m_pEncContext->pSvcParam->sDependencyLayers[m_pEncContext->pSvcParam->iNumDependencyLayer - 1].iFrameWidth) {
+      ++ m_iSwitchTimes;
+      m_bSwitch = TRUE;
+    }
+#endif//OUTPUT_BIT_STREAM
+
+    iInputColorspace = sEncodingParam.iInputCsp;
+    if (sConfig.ParamTranscode (sEncodingParam, true)) {
+      return cmInitParaError;
+    }
+    if (sConfig.iNumDependencyLayer < 1) {
+      return cmInitParaError;
+    }
+    iTargetWidth = sConfig.iActualPicWidth;
+    iTargetHeight = sConfig.iActualPicHeight;
+    if (m_pSrcPicList[0] == NULL) {
+      return cmInitParaError;
+    }
+    if (m_iCspInternal != iInputColorspace || m_iMaxPicWidth != iTargetWidth
+        || m_iMaxPicHeight != iTargetHeight) {  // for color space due to changed
+      InitPic (m_pSrcPicList[0], iInputColorspace, iTargetWidth, iTargetHeight);
+      m_iMaxPicWidth = iTargetWidth;
+      m_iMaxPicHeight = iTargetHeight;
+      m_iCspInternal = iInputColorspace;
+    }
+#ifdef REC_FRAME_COUNT
+    WelsLog (m_pEncContext, WELS_LOG_INFO,
+             "CWelsH264SVCEncoder::SetOption():ENCODER_OPTION_SVC_ENCODE_PARAM, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n",
+             m_uiCountFrameNum, m_iCspInternal);
+#endif//REC_FRAME_COUNT
+
+    /* New configuration available here */
+    sConfig.iInputCsp = m_iCspInternal;  // I420 in default designed for presentation in encoder used internal
+    sConfig.DetermineTemporalSettings();
+
+    /* Check every field whether there is new request for memory block changed or else, Oct. 24, 2008 */
+    WelsEncoderParamAdjust (&m_pEncContext, &sConfig);
+  }
+  break;
+  case ENCODER_OPTION_FRAME_RATE: {  // Maximal input frame rate
+    float fValue = * ((float*)pOption);
+#ifdef REC_FRAME_COUNT
+    WelsLog (m_pEncContext, WELS_LOG_INFO,
+             "CWelsH264SVCEncoder::SetOption():ENCODER_OPTION_FRAME_RATE, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x, iValue= %f\n",
+             m_uiCountFrameNum, m_iCspInternal, fValue);
+#endif//REC_FRAME_COUNT
+    m_pEncContext->pSvcParam->fMaxFrameRate = fValue;
+  }
+  break;
+  case ENCODER_OPTION_iBitRate: {  // Target bit-rate
+    int32_t iValue = * ((int32_t*)pOption);
+#ifdef REC_FRAME_COUNT
+    WelsLog (m_pEncContext, WELS_LOG_INFO,
+             "CWelsH264SVCEncoder::SetOption():ENCODER_OPTION_iBitRate, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x, iValue= %d\n",
+             m_uiCountFrameNum, m_iCspInternal, iValue);
+#endif//REC_FRAME_COUNT
+    m_pEncContext->pSvcParam->iTargetBitrate = iValue;
+  }
+  break;
+  case ENCODER_OPTION_RC_MODE: {  // 0:quality mode;1:bit-rate mode
+    int32_t iValue = * ((int32_t*)pOption);
+    m_pEncContext->pSvcParam->iRCMode = iValue;
+  }
+  break;
+  case ENCODER_PADDING_PADDING: {  // 0:disable padding;1:padding
+    int32_t iValue = * ((int32_t*)pOption);
+    m_pEncContext->pSvcParam->iPaddingFlag = iValue;
+  }
+  break;
+  case ENCODER_LTR_RECOVERY_REQUEST: {
+    SLTRRecoverRequest* pLTR_Recover_Request = (SLTRRecoverRequest*) (pOption);
+    FilterLTRRecoveryRequest (m_pEncContext, pLTR_Recover_Request);
+  }
+  break;
+  case ENCODER_LTR_MARKING_FEEDBACK: {
+    SLTRMarkingFeedback* fb = (SLTRMarkingFeedback*) (pOption);
+    FilterLTRMarkingFeedback (m_pEncContext, fb);
+  }
+  break;
+  case ENCOCER_LTR_MARKING_PERIOD: {
+    uint32_t iValue = * ((uint32_t*) (pOption));
+    m_pEncContext->pSvcParam->uiLtrMarkPeriod = iValue;
+  }
+  break;
+  case ENCODER_OPTION_LTR: {
+    uint32_t iValue = * ((uint32_t*) (pOption));
+    m_pEncContext->pSvcParam->bEnableLongTermReference = iValue ? true : false;
+    WelsLog (m_pEncContext, WELS_LOG_WARNING, " CWelsH264SVCEncoder::SetOption enable LTR = %d",
+             m_pEncContext->pSvcParam->bEnableLongTermReference);
+  }
+  break;
+  case ENCODER_OPTION_ENABLE_SSEI: {
+    bool_t iValue = * ((bool_t*)pOption);
+    m_pEncContext->pSvcParam->bEnableSSEI = iValue;
+    WelsLog (m_pEncContext, WELS_LOG_INFO, " CWelsH264SVCEncoder::SetOption enable SSEI = %d \n",
+             m_pEncContext->pSvcParam->bEnableSSEI);
+  }
+  break;
+  case ENCODER_OPTION_ENABLE_PREFIX_NAL_ADDING: {
+    bool_t iValue = * ((bool_t*)pOption);
+    m_pEncContext->pSvcParam->bPrefixNalAddingCtrl = iValue;
+    WelsLog (m_pEncContext, WELS_LOG_INFO, " CWelsH264SVCEncoder::SetOption bPrefixNalAddingCtrl = %d \n",
+             m_pEncContext->pSvcParam->bPrefixNalAddingCtrl);
+  }
+  break;
+  case ENCODER_OPTION_ENABLE_SPS_PPS_ID_ADDITION: {
+    bool_t iValue = * ((bool_t*)pOption);
+    m_pEncContext->pSvcParam->bEnableSpsPpsIdAddition = iValue;
+    WelsLog (m_pEncContext, WELS_LOG_INFO, " CWelsH264SVCEncoder::SetOption enable SPS/PPS ID = %d \n",
+             m_pEncContext->pSvcParam->bEnableSpsPpsIdAddition);
+  }
+  break;
+  case ENCODER_OPTION_CURRENT_PATH: {
+    if (m_pEncContext->pSvcParam != NULL) {
+      // Only the pointer is stored; the string it points at is not copied here.
+      str_t* path = static_cast<str_t*> (pOption);
+      m_pEncContext->pSvcParam->pCurPath = path;
+    }
+  }
+  break;
+  default:
+    return cmInitParaError;
+  }
+
+  return 0;
+}
+
+/*
+ * Reads back one encoder option into the buffer at pOption (type per case).
+ * Returns cmInitParaError for a NULL pOption or an unhandled option id,
+ * cmInitExpected when the encoder is not initialised, and 0 otherwise.
+ */
+int CWelsH264SVCEncoder::GetOption (ENCODER_OPTION eOptionId, void* pOption) {
+  if (pOption == NULL) {
+    return cmInitParaError;
+  }
+  if (m_pEncContext == NULL || m_bInitialFlag == FALSE) {
+    return cmInitExpected;
+  }
+
+  switch (eOptionId) {
+  case ENCODER_OPTION_INTER_SPATIAL_PRED:  // only logged; pOption is left untouched
+    WelsLog (m_pEncContext, WELS_LOG_INFO, "ENCODER_OPTION_INTER_SPATIAL_PRED, this feature not supported at present.\n");
+    break;
+  case ENCODER_OPTION_DATAFORMAT:  // Input color space, written as int32_t
+#ifdef REC_FRAME_COUNT
+    WelsLog (m_pEncContext, WELS_LOG_INFO,
+             "CWelsH264SVCEncoder::GetOption():ENCODER_OPTION_DATAFORMAT, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n",
+             m_uiCountFrameNum, m_iCspInternal);
+#endif//REC_FRAME_COUNT
+    *static_cast<int32_t*> (pOption) = m_iCspInternal;
+    break;
+  case ENCODER_OPTION_IDR_INTERVAL:  // IDR interval, written as int32_t
+#ifdef REC_FRAME_COUNT
+    WelsLog (m_pEncContext, WELS_LOG_INFO,
+             "CWelsH264SVCEncoder::GetOption():ENCODER_OPTION_IDR_INTERVAL, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n",
+             m_uiCountFrameNum, m_iCspInternal);
+#endif//REC_FRAME_COUNT
+    *static_cast<int32_t*> (pOption) = m_pEncContext->pSvcParam->uiIntraPeriod;
+    break;
+  case ENCODER_OPTION_SVC_ENCODE_PARAM:  // SVC encoding parameter, raw copy
+#ifdef REC_FRAME_COUNT
+    WelsLog (m_pEncContext, WELS_LOG_INFO,
+             "CWelsH264SVCEncoder::GetOption():ENCODER_OPTION_SVC_ENCODE_PARAM, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n",
+             m_uiCountFrameNum, m_iCspInternal);
+#endif//REC_FRAME_COUNT
+    // NOTE(review): this copies sizeof (SWelsSvcCodingParam) bytes, while SetOption reads
+    // sizeof (SVCEncodingParam) for the same option id - confirm how callers size pOption.
+    memcpy (pOption, m_pEncContext->pSvcParam, sizeof (SWelsSvcCodingParam));  // confirmed_safe_unsafe_usage
+    break;
+  case ENCODER_OPTION_FRAME_RATE:  // Maximal input frame rate, written as float
+#ifdef REC_FRAME_COUNT
+    WelsLog (m_pEncContext, WELS_LOG_INFO,
+             "CWelsH264SVCEncoder::GetOption():ENCODER_OPTION_FRAME_RATE, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n",
+             m_uiCountFrameNum, m_iCspInternal);
+#endif//REC_FRAME_COUNT
+    *static_cast<float*> (pOption) = m_pEncContext->pSvcParam->fMaxFrameRate;
+    break;
+  case ENCODER_OPTION_iBitRate:  // Target bit-rate, written as int32_t
+#ifdef REC_FRAME_COUNT
+    WelsLog (m_pEncContext, WELS_LOG_INFO,
+             "CWelsH264SVCEncoder::GetOption():ENCODER_OPTION_iBitRate, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n",
+             m_uiCountFrameNum, m_iCspInternal);
+#endif//REC_FRAME_COUNT
+    *static_cast<int32_t*> (pOption) = m_pEncContext->pSvcParam->iTargetBitrate;
+    break;
+  default:
+    return cmInitParaError;
+  }
+
+  return 0;
+}
+
+/*
+ * Debug aid whose body is compiled only when DUMP_SRC_PICTURE is defined.
+ * Appends the frame at pSrc to a dump file: the name is a fixed prefix, an
+ * optional resolution label chosen from m_iMaxPicWidth and an extension chosen
+ * from m_iCspInternal; the byte count written is derived from m_iMaxPicWidth,
+ * m_iMaxPicHeight and the color space. Unhandled color spaces are only logged.
+ */
+void CWelsH264SVCEncoder::DumpSrcPicture (const uint8_t* pSrc) {
+#ifdef DUMP_SRC_PICTURE
+  FILE* pFile = NULL;
+  str_t strFileName[256] = {0};
+  const int32_t iDataLength = m_iMaxPicWidth * m_iMaxPicHeight;
+
+#if defined(__GNUC__)
+  STRNCPY (strFileName, 256, "/tmp/pic_in_", STRNLEN ("/tmp/pic_in_", 255));	// confirmed_safe_unsafe_usage
+#else
+  STRNCPY (strFileName, 256, "d:\\incoming\\mosaic_st\\pic_in_", STRNLEN ("d:\\incoming\\mosaic_st\\pic_in_",
+           255));	// confirmed_safe_unsafe_usage
+#endif//__GNUC__
+
+  // Resolution label for the three recognised widths; other widths get none.
+  if (m_iMaxPicWidth == 640) {
+    STRCAT (strFileName, 256, "360p.");	// confirmed_safe_unsafe_usage
+  } else if (m_iMaxPicWidth == 320) {
+    STRCAT (strFileName, 256, "180p.");	// confirmed_safe_unsafe_usage
+  } else if (m_iMaxPicWidth == 160) {
+    STRCAT (strFileName, 256, "90p.");	// confirmed_safe_unsafe_usage
+  }
+
+  // Extension and byte count depend on the internal color space.
+  switch (m_iCspInternal) {
+  case videoFormatI420:
+  case videoFormatYV12:
+    STRCAT (strFileName, 256, "yuv");	// confirmed_safe_unsafe_usage
+#if defined(__GNUC__)
+    pFile = FOPEN (strFileName, "ab+");
+#elif defined(_MSC_VER)
+#if _MSC_VER>=1500
+    FOPEN (&pFile, strFileName, "ab+");
+#else
+    pFile = FOPEN (strFileName, "ab+");
+#endif//_MSC_VER>=1500
+#endif//__GNUC__
+    if (NULL != pFile) {
+      fwrite (pSrc, sizeof (uint8_t), (iDataLength * 3) >> 1, pFile);
+      fflush (pFile);
+      fclose (pFile);
+    }
+    break;
+  case videoFormatRGB:
+    STRCAT (strFileName, 256, "rgb");	// confirmed_safe_unsafe_usage
+#if defined(__GNUC__)
+    pFile = FOPEN (strFileName, "ab+");
+#elif defined(_MSC_VER)
+#if _MSC_VER>=1500
+    FOPEN (&pFile, strFileName, "ab+");
+#else
+    pFile = FOPEN (strFileName, "ab+");
+#endif//_MSC_VER>=1500
+#endif//__GNUC__
+    if (NULL != pFile) {
+      fwrite (pSrc, sizeof (uint8_t), iDataLength * 3, pFile);
+      fflush (pFile);
+      fclose (pFile);
+    }
+    break;  // keep RGB from falling through into the BGR case
+  case videoFormatBGR:
+    STRCAT (strFileName, 256, "bgr");	// confirmed_safe_unsafe_usage
+#if defined(__GNUC__)
+    pFile = FOPEN (strFileName, "ab+");
+#elif defined(_MSC_VER)
+#if _MSC_VER>=1500
+    FOPEN (&pFile, strFileName, "ab+");
+#else
+    pFile = FOPEN (strFileName, "ab+");
+#endif//_MSC_VER>=1500
+#endif//__GNUC__
+    if (NULL != pFile) {
+      fwrite (pSrc, sizeof (uint8_t), iDataLength * 3, pFile);
+      fflush (pFile);
+      fclose (pFile);
+    }
+    break;
+  case videoFormatYUY2:
+    STRCAT (strFileName, 256, "yuy2");	// confirmed_safe_unsafe_usage
+#if defined(__GNUC__)
+    pFile = FOPEN (strFileName, "ab+");
+#elif defined(_MSC_VER)
+#if _MSC_VER>=1500
+    FOPEN (&pFile, strFileName, "ab+");
+#else
+    pFile = FOPEN (strFileName, "ab+");
+#endif//_MSC_VER>=1500
+#endif//__GNUC__
+    if (NULL != pFile) {
+      fwrite (pSrc, sizeof (uint8_t), (CALC_BI_STRIDE (m_iMaxPicWidth,  16)) * m_iMaxPicHeight, pFile);
+      fflush (pFile);
+      fclose (pFile);
+    }
+    break;
+  default:
+    WelsLog (m_pEncContext, WELS_LOG_INFO, "Exclusive case, m_iCspInternal= 0x%x\n", m_iCspInternal);
+    break;
+  }
+#endif//DUMP_SRC_PICTURE
+  return;
+}
+}
+
+using namespace WelsSVCEnc;
+
+/* Stores a new CWelsH264SVCEncoder in *ppEncoder; returns 0, or 1 when ppEncoder or the new object is NULL. */
+int32_t CreateSVCEncoder (ISVCEncoder** ppEncoder) {
+  assert (ppEncoder);
+
+  if (ppEncoder == NULL) {  // still checked at run time when assert is compiled out
+    return 1;
+  }
+
+  ISVCEncoder* pCreated = new CWelsH264SVCEncoder();
+  *ppEncoder = pCreated;
+  return (pCreated != NULL) ? 0 : 1;
+}
+
+/* Deletes pEncoder through a CWelsH264SVCEncoder pointer; a NULL pEncoder is a no-op. */
+void DestroySVCEncoder (ISVCEncoder* pEncoder) {
+  if (pEncoder == NULL) {
+    return;
+  }
+
+  delete (CWelsH264SVCEncoder*)pEncoder;
+}
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
--- a/processing/build/linux/makefile
+++ b/processing/build/linux/makefile
@@ -1,94 +1,94 @@
-NASM = 1
-NAME      = libwelsvp
-
-OUTDIR    = ../../../bin/linux
-BINDIR    = ../../bin
-OBJDIR    = ../../obj  
-SRCDIRS   = ../../src/asm \
-            ../../src/common \
-            ../../src/adaptivequantization \
-            ../../src/backgounddetection \
-            ../../src/denoise \
-            ../../src/downsample \
-            ../../src/scenechangedetection \
-            ../../src/vaacalc \
-            ../../src/complexityanalysis 
-SRCDIRS  += ../../src/imagerotate
-
-
-TARGETLIB =  $(BINDIR)/$(NAME).so
-
-CC        = $(shell which gcc)
-AS        = $(shell which nasm)
-GCC       = gcc -m32
-
-CPPFLAGS  = -Wall -g -O3
-ifeq ($(NASM), 1)
-CPPFLAGS += -DX86_ASM
-endif
-ASMFLAGS  = -f elf -DNOPREFIX  -I ../../src/asm/
-LDFLAGS   = -lstdc++ -ldl
-          
-SRCEXTS  = .cpp
-ifeq ($(NASM), 1)
-SRCEXTS += .asm
-endif
-HDREXTS  = .h
-SOURCES  = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(SRCEXTS))))
-HEADERS  = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(HDREXTS))))
-SRC_CPP  = $(filter %.cpp,$(SOURCES))
-SRC_ASM  = $(filter %.asm,$(SOURCES))
-OBJS     = $(addsuffix .o, $(basename $(SOURCES)))
-DEPS     = $(OBJS:.o=.d)
-
-DEP_OPT  = $(shell if `$(CC) --version | grep "GCC" >/dev/null`; then \
-                  echo "-MM -MP"; else echo "-M"; fi )
-DEPEND_cpp.d  = $(subst -g ,,$(CC) $(DEP_OPT) $(CPPFLAGS))
-DEPEND_asm.d  = $(subst -g ,,$(AS) $(DEP_OPT) $(ASMFLAGS))
-COMPILE.cpp   = $(GCC) $(CPPFLAGS) -c
-COMPILE.asm   = $(AS)  $(ASMFLAGS)
-LINK          = $(GCC) $(LDFLAGS)
-
-.PHONY: all objs tags ctags clean distclean
-
-.SUFFIXES:
-
-all: $(TARGETLIB)
-	
-%.d:%.cpp
-	@echo -n $(dir $<) > $@
-	@$(DEPEND_cpp.d) $< >> $@
-	
-%.d:%.asm
-	@echo -n $(dir $<) > $@
-	@$(DEPEND_asm.d) $< >> $@
-
-objs:$(OBJS)
-
-%.o:%.cpp
-	$(COMPILE.cpp) $< -o $@
-	
-%.o:%.asm
-	$(COMPILE.asm) $< -o $@	
-
-tags: $(HEADERS) $(SOURCES)
-	etags $(HEADERS) $(SOURCES)
-
-ctags: $(HEADERS) $(SOURCES)
-	ctags $(HEADERS) $(SOURCES)
-
-$(TARGETLIB):$(OBJS)
-	@if test ! -d $(BINDIR) ; then mkdir -p $(BINDIR) ; fi
-	$(LINK) $(OBJS) -shared -Wl,-Bsymbolic -o $@
-	@echo produce the lib to $(TARGETLIB).
-	@if test ! -d $(OUTDIR) ; then mkdir -p $(OUTDIR) ; fi
-	@cp -f $(TARGETLIB) $(OUTDIR)
-	@cp -f $(TARGETLIB) ../../../testbin
-	@echo copy the lib to $(OUTDIR).
-
-clean:
-	rm -f $(OBJS) $(TARGETLIB)
-
-distclean: clean
-	rm -f $(DEPS) TAGS
-
+NASM = 1
+NAME      = libwelsvp
+# Output directories and the source directories scanned below (OBJDIR is not referenced by any rule).
+OUTDIR    = ../../../bin/linux
+BINDIR    = ../../bin
+OBJDIR    = ../../obj
+SRCDIRS   = ../../src/asm \
+            ../../src/common \
+            ../../src/adaptivequantization \
+            ../../src/backgounddetection \
+            ../../src/denoise \
+            ../../src/downsample \
+            ../../src/scenechangedetection \
+            ../../src/vaacalc \
+            ../../src/complexityanalysis
+SRCDIRS  += ../../src/imagerotate
+
+# Shared library built by the default goal.
+TARGETLIB =  $(BINDIR)/$(NAME).so
+# CC and AS are looked up once with `which`; GCC is the 32-bit driver used to compile and link.
+CC        := $(shell which gcc)
+AS        := $(shell which nasm)
+GCC       = gcc -m32
+
+CPPFLAGS  = -Wall -g -O3
+ifeq ($(NASM), 1)
+CPPFLAGS += -DX86_ASM
+endif
+ASMFLAGS  = -f elf -DNOPREFIX  -I ../../src/asm/
+LDFLAGS   = -lstdc++ -ldl
+# Sources and headers are globbed once at parse time; objects and .d files sit next to their sources.
+SRCEXTS  = .cpp
+ifeq ($(NASM), 1)
+SRCEXTS += .asm
+endif
+HDREXTS  = .h
+SOURCES  := $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(SRCEXTS))))
+HEADERS  := $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(HDREXTS))))
+SRC_CPP  := $(filter %.cpp,$(SOURCES))
+SRC_ASM  := $(filter %.asm,$(SOURCES))
+OBJS     := $(addsuffix .o, $(basename $(SOURCES)))
+DEPS     := $(OBJS:.o=.d)
+# Dependency options and command templates (the generated .d files are not included by this makefile).
+DEP_OPT  := $(shell if `$(CC) --version | grep "GCC" >/dev/null`; then \
+                  echo "-MM -MP"; else echo "-M"; fi )
+DEPEND_cpp.d  = $(subst -g ,,$(CC) $(DEP_OPT) $(CPPFLAGS))
+DEPEND_asm.d  = $(subst -g ,,$(AS) $(DEP_OPT) $(ASMFLAGS))
+COMPILE.cpp   = $(GCC) $(CPPFLAGS) -c
+COMPILE.asm   = $(AS)  $(ASMFLAGS)
+LINK          = $(GCC) $(LDFLAGS)
+
+.PHONY: all objs tags ctags clean distclean
+
+.SUFFIXES:
+
+all: $(TARGETLIB)
+# Each .d file holds the source directory followed by the dependency output of the tool.
+%.d:%.cpp
+	@echo -n $(dir $<) > $@
+	@$(DEPEND_cpp.d) $< >> $@
+
+%.d:%.asm
+	@echo -n $(dir $<) > $@
+	@$(DEPEND_asm.d) $< >> $@
+
+objs:$(OBJS)
+# Compile C++ sources and assemble .asm sources into objects.
+%.o:%.cpp
+	$(COMPILE.cpp) $< -o $@
+
+%.o:%.asm
+	$(COMPILE.asm) $< -o $@
+
+tags: $(HEADERS) $(SOURCES)
+	etags $(HEADERS) $(SOURCES)
+
+ctags: $(HEADERS) $(SOURCES)
+	ctags $(HEADERS) $(SOURCES)
+# Link the shared library into BINDIR, then copy it to OUTDIR and to ../../../testbin.
+$(TARGETLIB):$(OBJS)
+	@if test ! -d $(BINDIR) ; then mkdir -p $(BINDIR) ; fi
+	$(LINK) $(OBJS) -shared -Wl,-Bsymbolic -o $@
+	@echo produce the lib to $(TARGETLIB).
+	@if test ! -d $(OUTDIR) ; then mkdir -p $(OUTDIR) ; fi
+	@cp -f $(TARGETLIB) $(OUTDIR)
+	@cp -f $(TARGETLIB) ../../../testbin
+	@echo copy the lib to $(OUTDIR).
+
+clean:
+	rm -f $(OBJS) $(TARGETLIB)
+
+distclean: clean
+	rm -f $(DEPS) TAGS
+
--- a/processing/interface/IWelsVP.h
+++ b/processing/interface/IWelsVP.h
@@ -1,304 +1,286 @@
-/*!
- * \copy
- *     Copyright (c)  2004-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	    :  IWelsVP.h
- *
- * \brief	    :  Interface of wels video processor class
- *
- * \date        :  2011/01/04
- *
- * \description :  1. should support both C/C++ style interface
- *                 2. should concern with the feature extension requirement 
- *                 3. should care the usage of "char"==>
- *                     1) value char  : signed char/unsigned char
- *                     2) string char : char
- *
- *************************************************************************************
- */
-
-#ifndef _IWELSVP_H_
-#define _IWELSVP_H_ 
-
-#ifdef _WIN32
-#define WELSAPI __stdcall
-#else
-#define WELSAPI 
-#endif
-
-#define WELSVP_MAJOR_VERSION   1
-#define WELSVP_MINOR_VERSION   1
-#define WELSVP_VERSION         ((WELSVP_MAJOR_VERSION << 8) + WELSVP_MINOR_VERSION)
-
-typedef enum 
-{
-	RET_SUCCESS          =  0,
-	RET_FAILED           = -1,
-	RET_INVALIDPARAM     = -2,
-	RET_OUTOFMEMORY      = -3,
-	RET_NOTSUPPORTED       = -4,
-	RET_UNEXPECTED       = -5,
-	RET_NEEDREINIT		  = -6
-} EResult;
-
-typedef enum 
-{ 
-	VIDEO_FORMAT_NULL       = 0,   /* invalid format   */
-	/*rgb color formats*/
-	VIDEO_FORMAT_RGB        = 1,   /* rgb 24bits       */
-	VIDEO_FORMAT_RGBA       = 2,   /* rgba             */
-	VIDEO_FORMAT_RGB555     = 3,   /* rgb555           */
-	VIDEO_FORMAT_RGB565     = 4,   /* rgb565           */
-	VIDEO_FORMAT_BGR        = 5,   /* bgr 24bits       */
-	VIDEO_FORMAT_BGRA       = 6,   /* bgr 32bits       */
-	VIDEO_FORMAT_ABGR       = 7,   /* abgr             */
-	VIDEO_FORMAT_ARGB       = 8,   /* argb             */
-
-	/*yuv color formats*/
-	VIDEO_FORMAT_YUY2       = 20,   /* yuy2             */
-	VIDEO_FORMAT_YVYU       = 21,   /* yvyu             */
-	VIDEO_FORMAT_UYVY       = 22,   /* uyvy             */
-	VIDEO_FORMAT_I420       = 23,   /* yuv 4:2:0 planar */              
-	VIDEO_FORMAT_YV12       = 24,   /* yuv 4:2:0 planar */
-	VIDEO_FORMAT_INTERNAL   = 25,   /* Only Used for SVC decoder testbed */ 
-	VIDEO_FORMAT_NV12		= 26,	/* y planar + uv packed */
-	VIDEO_FORMAT_I422       = 27,   /* yuv 4:2:2 planar */
-	VIDEO_FORMAT_I444       = 28,   /* yuv 4:4:4 planar */
-	VIDEO_FORMAT_YUYV       = 20,   /* yuv 4:2:2 packed */
-
-	VIDEO_FORMAT_RGB24      = 1,
-	VIDEO_FORMAT_RGB32      = 2,
-	VIDEO_FORMAT_RGB24_INV  = 5,
-	VIDEO_FORMAT_RGB32_INV  = 6,
-	VIDEO_FORMAT_RGB555_INV = 7,
-	VIDEO_FORMAT_RGB565_INV = 8,
-	VIDEO_FORMAT_YUV2       = 21,
-	VIDEO_FORMAT_420        = 23,
-
-	VIDEO_FORMAT_VFlip      = 0x80000000 
-} EVideoFormat;
-
-typedef enum 
-{ 
-	BUFFER_HOSTMEM  = 0,
-	BUFFER_SURFACE
-} EPixMapBufferProperty;
-
-typedef struct
-{
-  int iRectTop;
-  int iRectLeft;
-  int iRectWidth;
-  int iRectHeight;
-} SRect;
-
-typedef struct
-{
-	void        *pPixel[3]; 
-	int          iSizeInBits;
-	int          iStride[3];
-	SRect        sRect;	
-	EVideoFormat eFormat;
-	EPixMapBufferProperty eProperty;//not use? to remove? but how about the size of SPixMap?
-} SPixMap;
-
-typedef enum
-{	
-	METHOD_NULL              = 0,
-	METHOD_COLORSPACE_CONVERT    ,//not support yet
-	METHOD_DENOISE              ,
-	METHOD_SCENE_CHANGE_DETECTION ,
-	METHOD_DOWNSAMPLE			  ,
-	METHOD_VAA_STATISTICS        ,
-    METHOD_BACKGROUND_DETECTION  ,
-	METHOD_ADAPTIVE_QUANT ,
-	METHOD_COMPLEXITY_ANALYSIS   ,
-	METHOD_IMAGE_ROTATE		  ,
-	METHOD_MASK                 
-} EMethods;
-
-//-----------------------------------------------------------------//
-//  Algorithm parameters define
-//-----------------------------------------------------------------//
-
-typedef struct
-{
-	int bSceneChangeFlag; // 0:false ; 1:true
-} SSceneChangeResult;
-
-typedef enum
-{
-	SIMILAR_SCENE,      //similar scene 
-	MEDIUM_CHANGED_SCENE,   //medium changed scene
-	LARGE_CHANGED_SCENE,   //large changed scene
-} ESceneChangeIdc;
-
-typedef struct
-{
-	unsigned char *pCurY;					// Y data of current frame
-	unsigned char *pRefY;					// Y data of pRef frame for diff calc
-	int (*pSad8x8)[4];				// sad of 8x8, every 4 in the same 16x16 get together
-	int *pSsd16x16;					// sum of square difference of 16x16
-	int *pSum16x16;					// sum of 16x16
-	int *pSumOfSquare16x16;					// sum of square of 16x16
-	int	(*pSumOfDiff8x8)[4];
-	unsigned char	(*pMad8x8)[4];
-	int iFrameSad;					// sad of frame
-} SVAACalcResult;
-
-typedef struct
-{
-	int iCalcVar;
-	int iCalcBgd;
-	int iCalcSsd;
-	int iReserved;
-	SVAACalcResult	*pCalcResult;
-} SVAACalcParam;
-
-typedef struct
-{
-	signed char		*pBackgroundMbFlag;
-	SVAACalcResult  *pCalcRes;
-} SBGDInterface;
-
-typedef enum
-{
-	AQ_QUALITY_MODE,   //Quality mode
-	AQ_BITRATE_MODE,   //Bitrate mode
-}EAQModes;
-
-typedef struct 
-{
-	unsigned short    uiMotionIndex;
-	unsigned short    uiTextureIndex;
-} SMotionTextureUnit;
-
-typedef struct
-{
-	int					iAdaptiveQuantMode; // 0:quality mode, 1:bitrates mode
-	SVAACalcResult		*pCalcResult;
-	SMotionTextureUnit  *pMotionTextureUnit;
-
-	signed char			*pMotionTextureIndexToDeltaQp;	
-	double				dAverMotionTextureIndexToDeltaQp;
-} SAdaptiveQuantizationParam;
-
-typedef enum 
-{
-	FRAME_SAD     =  0,
-	GOM_SAD       = -1,
-	GOM_VAR       = -2
-} EComplexityAnalysisMode;
-
-typedef struct
-{
-	int  iComplexityAnalysisMode;
-	int  iCalcBgd;
-	int  iMbNumInGom;		
-	int  iFrameComplexity;
-	int  *pGomComplexity;
-	int  *pGomForegroundBlockNum;
-	signed char  *pBackgroundMbFlag;
-	unsigned int *uiRefMbType;
-	SVAACalcResult  *pCalcResult;
-} SComplexityAnalysisParam;
-
-/////////////////////////////////////////////////////////////////////////////////////////////
-
-typedef struct 
-{
-	void    *pCtx;
-	EResult (*Init)    (void *pCtx, int iType, void *pCfg);
-	EResult (*Uninit)  (void *pCtx, int iType);
-	EResult (*Flush)   (void *pCtx, int iType);
-	EResult (*Process) (void *pCtx, int iType, SPixMap *pSrc, SPixMap *dst); 
-	EResult (*Get)     (void *pCtx, int iType, void *pParam); 
-	EResult (*Set)     (void *pCtx, int iType, void *pParam); 
-	EResult (*SpecialFeature) (void *pCtx, int iType, void *pIn, void *pOut);
-} IWelsVPc;
-
-#if defined(__cplusplus) && !defined(CINTERFACE)  /* C++ style interface */
-
-class IWelsVP
-{
-public:
-	virtual ~IWelsVP() {}
-
-public:		
-	virtual EResult Init    (int iType, void *pCfg) = 0; 
-	virtual EResult Uninit  (int iType) = 0;
-	virtual EResult Flush   (int iType) = 0;
-	virtual EResult Process (int iType, SPixMap *pSrc, SPixMap *dst) = 0; 
-	virtual EResult Get     (int iType, void *pParam) = 0; 
-	virtual EResult Set     (int iType, void *pParam) = 0; 
-	virtual EResult SpecialFeature (int iType, void *pIn, void *pOut) = 0;
-};
-
-/* Recommend to invoke the interface via the micro for convenient */
-#define IWelsVPFunc_Init(p, a, b)                  (p)->Init(a, b)              
-#define IWelsVPFunc_Uninit(p, a)                   (p)->Uninit(a)               
-#define IWelsVPFunc_Flush(p, a)                    (p)->Flush(a)                
-#define IWelsVPFunc_Process(p, a, b, c)            (p)->Process(a, b, c)        
-#define IWelsVPFunc_Get(p, a, b)                   (p)->Get(a, b)               
-#define IWelsVPFunc_Set(p, a, b)                   (p)->Set(a, b)               
-#define IWelsVPFunc_SpecialFeature(p, a, b, c)     (p)->SpecialFeature(a, b, c)
-
-/* C++ interface version */
-#define WELSVP_INTERFACE_VERION                    (0x8000 + (WELSVP_VERSION & 0x7fff)) 
-#define WELSVP_EXTERNC_BEGIN                       extern "C" {
-#define WELSVP_EXTERNC_END                         }
-
-#else    /* C style interface */
-
-/* Recommend to invoke the interface via the micro for convenient */
-#define IWelsVPFunc_Init(p, a, b)                  (p)->Init(p->h, a, b)              
-#define IWelsVPFunc_Uninit(p, a)                   (p)->Uninit(p->h, a)               
-#define IWelsVPFunc_Flush(p, a)                    (p)->Flush(p->h, a)                
-#define IWelsVPFunc_Process(p, a, b, c)            (p)->Process(p->h, a, b, c)        
-#define IWelsVPFunc_Get(p, a, b)                   (p)->Get(p->h, a, b)               
-#define IWelsVPFunc_Set(p, a, b)                   (p)->Set(p->h, a, b)               
-#define IWelsVPFunc_SpecialFeature(p, a, b, c)     (p)->SpecialFeature(p->h, a, b, c) 
-
-/* C interface version */
-#define WELSVP_INTERFACE_VERION                    (0x0001 + (WELSVP_VERSION & 0x7fff)) 
-#define WELSVP_EXTERNC_BEGIN                      
-#define WELSVP_EXTERNC_END                       
-
-#endif
-
-WELSVP_EXTERNC_BEGIN
-EResult WELSAPI CreateVpInterface   (void **ppCtx, int iVersion /*= WELSVP_INTERFACE_VERION*/);
-EResult WELSAPI DestroyVpInterface  (void *pCtx , int iVersion /*= WELSVP_INTERFACE_VERION*/);
-WELSVP_EXTERNC_END
-
-//////////////////////////////////////////////////////////////////////////////////////////////
-#endif // _IWELSVP_H_
-
-
+/*!
+ * \copy
+ *     Copyright (c)  2004-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	    :  IWelsVP.h
+ *
+ * \brief	    :  Interface of wels video processor class
+ *
+ * \date        :  2011/01/04
+ *
+ * \description :  1. should support both C and C++ style interfaces
+ *                 2. should take feature-extension requirements into account
+ *                 3. should be careful with the usage of "char":
+ *                     1) char used as a value  : signed char/unsigned char
+ *                     2) char used in a string : char
+ *
+ *************************************************************************************
+ */
+
+#ifndef _IWELSVP_H_
+#define _IWELSVP_H_
+
+#ifdef _WIN32
+#define WELSAPI __stdcall
+#else
+#define WELSAPI
+#endif
+
+#define WELSVP_MAJOR_VERSION   1
+#define WELSVP_MINOR_VERSION   1
+#define WELSVP_VERSION         ((WELSVP_MAJOR_VERSION << 8) + WELSVP_MINOR_VERSION)
+
+typedef enum {
+  RET_SUCCESS          =  0,
+  RET_FAILED           = -1,
+  RET_INVALIDPARAM     = -2,
+  RET_OUTOFMEMORY      = -3,
+  RET_NOTSUPPORTED       = -4,
+  RET_UNEXPECTED       = -5,
+  RET_NEEDREINIT		  = -6
+} EResult;
+
+typedef enum {  /* pixel format identifiers; several names below deliberately share one numeric value */
+  VIDEO_FORMAT_NULL       = 0,   /* invalid format   */
+  /*rgb color formats*/
+  VIDEO_FORMAT_RGB        = 1,   /* rgb 24bits       */
+  VIDEO_FORMAT_RGBA       = 2,   /* rgba             */
+  VIDEO_FORMAT_RGB555     = 3,   /* rgb555           */
+  VIDEO_FORMAT_RGB565     = 4,   /* rgb565           */
+  VIDEO_FORMAT_BGR        = 5,   /* bgr 24bits       */
+  VIDEO_FORMAT_BGRA       = 6,   /* bgr 32bits       */
+  VIDEO_FORMAT_ABGR       = 7,   /* abgr             */
+  VIDEO_FORMAT_ARGB       = 8,   /* argb             */
+
+  /*yuv color formats*/
+  VIDEO_FORMAT_YUY2       = 20,   /* yuy2             */
+  VIDEO_FORMAT_YVYU       = 21,   /* yvyu             */
+  VIDEO_FORMAT_UYVY       = 22,   /* uyvy             */
+  VIDEO_FORMAT_I420       = 23,   /* yuv 4:2:0 planar */
+  VIDEO_FORMAT_YV12       = 24,   /* yuv 4:2:0 planar */
+  VIDEO_FORMAT_INTERNAL   = 25,   /* Only Used for SVC decoder testbed */
+  VIDEO_FORMAT_NV12		= 26,	/* y planar + uv packed */
+  VIDEO_FORMAT_I422       = 27,   /* yuv 4:2:2 planar */
+  VIDEO_FORMAT_I444       = 28,   /* yuv 4:4:4 planar */
+  VIDEO_FORMAT_YUYV       = 20,   /* yuv 4:2:2 packed; same value as VIDEO_FORMAT_YUY2 */
+
+  VIDEO_FORMAT_RGB24      = 1,   /* same value as VIDEO_FORMAT_RGB  */
+  VIDEO_FORMAT_RGB32      = 2,   /* same value as VIDEO_FORMAT_RGBA */
+  VIDEO_FORMAT_RGB24_INV  = 5,   /* same value as VIDEO_FORMAT_BGR  */
+  VIDEO_FORMAT_RGB32_INV  = 6,   /* same value as VIDEO_FORMAT_BGRA */
+  VIDEO_FORMAT_RGB555_INV = 7,   /* same value as VIDEO_FORMAT_ABGR */
+  VIDEO_FORMAT_RGB565_INV = 8,   /* same value as VIDEO_FORMAT_ARGB */
+  VIDEO_FORMAT_YUV2       = 21,  /* same value as VIDEO_FORMAT_YVYU */
+  VIDEO_FORMAT_420        = 23,  /* same value as VIDEO_FORMAT_I420 */
+
+  VIDEO_FORMAT_VFlip      = 0x80000000  /* only bit 31 set; NOTE(review): presumably OR-ed onto a format to request a vertical flip - confirm */
+} EVideoFormat;
+
+typedef enum {
+  BUFFER_HOSTMEM  = 0,
+  BUFFER_SURFACE
+} EPixMapBufferProperty;
+
+typedef struct {
+  int iRectTop;
+  int iRectLeft;
+  int iRectWidth;
+  int iRectHeight;
+} SRect;
+
+typedef struct {
+  void*        pPixel[3];
+  int          iSizeInBits;
+  int          iStride[3];
+  SRect        sRect;
+  EVideoFormat eFormat;
+  EPixMapBufferProperty eProperty;//not use? to remove? but how about the size of SPixMap?
+} SPixMap;
+
+typedef enum {
+  METHOD_NULL              = 0,
+  METHOD_COLORSPACE_CONVERT    ,//not support yet
+  METHOD_DENOISE              ,
+  METHOD_SCENE_CHANGE_DETECTION ,
+  METHOD_DOWNSAMPLE			  ,
+  METHOD_VAA_STATISTICS        ,
+  METHOD_BACKGROUND_DETECTION  ,
+  METHOD_ADAPTIVE_QUANT ,
+  METHOD_COMPLEXITY_ANALYSIS   ,
+  METHOD_IMAGE_ROTATE		  ,
+  METHOD_MASK
+} EMethods;
+
+//-----------------------------------------------------------------//
+//  Algorithm parameters define
+//-----------------------------------------------------------------//
+
+typedef struct {
+  int bSceneChangeFlag; // 0:false ; 1:true
+} SSceneChangeResult;
+
+typedef enum {
+  SIMILAR_SCENE,      //similar scene
+  MEDIUM_CHANGED_SCENE,   //medium changed scene
+  LARGE_CHANGED_SCENE,   //large changed scene
+} ESceneChangeIdc;
+
+typedef struct {
+  unsigned char* pCurY;					// Y data of current frame
+  unsigned char* pRefY;					// Y data of pRef frame for diff calc
+  int (*pSad8x8)[4];				// sad of 8x8, every 4 in the same 16x16 get together
+  int* pSsd16x16;					// sum of square difference of 16x16
+  int* pSum16x16;					// sum of 16x16
+  int* pSumOfSquare16x16;					// sum of square of 16x16
+  int	(*pSumOfDiff8x8)[4];
+  unsigned char (*pMad8x8)[4];
+  int iFrameSad;					// sad of frame
+} SVAACalcResult;
+
+typedef struct {
+  int iCalcVar;
+  int iCalcBgd;
+  int iCalcSsd;
+  int iReserved;
+  SVAACalcResult*	pCalcResult;
+} SVAACalcParam;
+
+typedef struct {
+  signed char*		pBackgroundMbFlag;
+  SVAACalcResult*  pCalcRes;
+} SBGDInterface;
+
+typedef enum {
+  AQ_QUALITY_MODE,   //Quality mode
+  AQ_BITRATE_MODE,   //Bitrate mode
+} EAQModes;
+
+typedef struct {
+  unsigned short    uiMotionIndex;
+  unsigned short    uiTextureIndex;
+} SMotionTextureUnit;
+
+typedef struct {
+  int					iAdaptiveQuantMode; // 0:quality mode, 1:bitrates mode
+  SVAACalcResult*		pCalcResult;
+  SMotionTextureUnit*  pMotionTextureUnit;
+
+  signed char*			pMotionTextureIndexToDeltaQp;
+  double				dAverMotionTextureIndexToDeltaQp;
+} SAdaptiveQuantizationParam;
+
+typedef enum {
+  FRAME_SAD     =  0,
+  GOM_SAD       = -1,
+  GOM_VAR       = -2
+} EComplexityAnalysisMode;
+
+typedef struct {
+  int  iComplexityAnalysisMode;
+  int  iCalcBgd;
+  int  iMbNumInGom;
+  int  iFrameComplexity;
+  int*  pGomComplexity;
+  int*  pGomForegroundBlockNum;
+  signed char*  pBackgroundMbFlag;
+  unsigned int* uiRefMbType;
+  SVAACalcResult*  pCalcResult;
+} SComplexityAnalysisParam;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+
+typedef struct {  /* C-style interface: one context pointer plus one function pointer per operation */
+  void*    pCtx;  /* NOTE(review): presumably the value to hand to each callback's leading pCtx parameter - confirm */
+  EResult (*Init) (void* pCtx, int iType, void* pCfg);  /* every callback takes the context first, then the arguments of the matching IWelsVP method */
+  EResult (*Uninit) (void* pCtx, int iType);
+  EResult (*Flush) (void* pCtx, int iType);
+  EResult (*Process) (void* pCtx, int iType, SPixMap* pSrc, SPixMap* dst);
+  EResult (*Get) (void* pCtx, int iType, void* pParam);
+  EResult (*Set) (void* pCtx, int iType, void* pParam);
+  EResult (*SpecialFeature) (void* pCtx, int iType, void* pIn, void* pOut);
+} IWelsVPc;
+
+#if defined(__cplusplus) && !defined(CINTERFACE)  /* C++ style interface */
+
+class IWelsVP {  // abstract C++ interface of the video processor: every operation is pure virtual
+ public:
+  virtual ~IWelsVP() {}  // virtual, so an implementation can be destroyed through an IWelsVP pointer
+
+ public:  // NOTE(review): iType presumably carries an EMethods value selecting the algorithm - confirm
+  virtual EResult Init (int iType, void* pCfg) = 0;
+  virtual EResult Uninit (int iType) = 0;
+  virtual EResult Flush (int iType) = 0;
+  virtual EResult Process (int iType, SPixMap* pSrc, SPixMap* dst) = 0;
+  virtual EResult Get (int iType, void* pParam) = 0;  // pParam is untyped; its layout is up to the implementation
+  virtual EResult Set (int iType, void* pParam) = 0;
+  virtual EResult SpecialFeature (int iType, void* pIn, void* pOut) = 0;
+};
+
+/* It is recommended to invoke the interface via these macros for convenience */
+#define IWelsVPFunc_Init(p, a, b)                  (p)->Init(a, b)
+#define IWelsVPFunc_Uninit(p, a)                   (p)->Uninit(a)
+#define IWelsVPFunc_Flush(p, a)                    (p)->Flush(a)
+#define IWelsVPFunc_Process(p, a, b, c)            (p)->Process(a, b, c)
+#define IWelsVPFunc_Get(p, a, b)                   (p)->Get(a, b)
+#define IWelsVPFunc_Set(p, a, b)                   (p)->Set(a, b)
+#define IWelsVPFunc_SpecialFeature(p, a, b, c)     (p)->SpecialFeature(a, b, c)
+
+/* C++ interface version */
+#define WELSVP_INTERFACE_VERION                    (0x8000 + (WELSVP_VERSION & 0x7fff))
+#define WELSVP_EXTERNC_BEGIN                       extern "C" {
+#define WELSVP_EXTERNC_END                         }
+
+#else    /* C style interface */
+
+/* It is recommended to invoke the interface via these macros; each passes the IWelsVPc context member (pCtx) as the first argument */
+#define IWelsVPFunc_Init(p, a, b)                  (p)->Init((p)->pCtx, a, b)
+#define IWelsVPFunc_Uninit(p, a)                   (p)->Uninit((p)->pCtx, a)
+#define IWelsVPFunc_Flush(p, a)                    (p)->Flush((p)->pCtx, a)
+#define IWelsVPFunc_Process(p, a, b, c)            (p)->Process((p)->pCtx, a, b, c)
+#define IWelsVPFunc_Get(p, a, b)                   (p)->Get((p)->pCtx, a, b)
+#define IWelsVPFunc_Set(p, a, b)                   (p)->Set((p)->pCtx, a, b)
+#define IWelsVPFunc_SpecialFeature(p, a, b, c)     (p)->SpecialFeature((p)->pCtx, a, b, c)
+
+/* C interface version */
+#define WELSVP_INTERFACE_VERION                    (0x0001 + (WELSVP_VERSION & 0x7fff))
+#define WELSVP_EXTERNC_BEGIN
+#define WELSVP_EXTERNC_END
+
+#endif
+
+WELSVP_EXTERNC_BEGIN
+EResult WELSAPI CreateVpInterface (void** ppCtx, int iVersion /*= WELSVP_INTERFACE_VERION*/);
+EResult WELSAPI DestroyVpInterface (void* pCtx , int iVersion /*= WELSVP_INTERFACE_VERION*/);
+WELSVP_EXTERNC_END
+
+//////////////////////////////////////////////////////////////////////////////////////////////
+#endif // _IWELSVP_H_
+
+
--- a/processing/src/adaptivequantization/AdaptiveQuantization.cpp
+++ b/processing/src/adaptivequantization/AdaptiveQuantization.cpp
@@ -1,281 +1,256 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#include "AdaptiveQuantization.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-
-
-#define AVERAGE_TIME_MOTION                   (0.3) //0.3046875 // 1/4 + 1/16 - 1/128 ~ 0.3
-#define AVERAGE_TIME_TEXTURE_QUALITYMODE  (1.0) //0.5 // 1/2
-#define AVERAGE_TIME_TEXTURE_BITRATEMODE  (0.875) //0.5 // 1/2
-#define MODEL_ALPHA                           (0.9910) //1.5 //1.1102
-#define MODEL_TIME                            (5.8185) //9.0 //5.9842
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-CAdaptiveQuantization::CAdaptiveQuantization(int32_t iCpuFlag)
-{
-	m_CPUFlag = iCpuFlag;
-	m_eMethod   = METHOD_ADAPTIVE_QUANT;
-	m_pfVar   = NULL;
-	WelsMemset( &m_sAdaptiveQuantParam, 0, sizeof(m_sAdaptiveQuantParam) );
-	WelsInitVarFunc(m_pfVar, m_CPUFlag);
-}
-
-CAdaptiveQuantization::~CAdaptiveQuantization()
-{	
-}
-
-EResult CAdaptiveQuantization::Process(int32_t iType, SPixMap *pSrcPixMap, SPixMap *pRefPixMap)
-{
-	EResult eReturn = RET_INVALIDPARAM;	
-
-	int32_t iWidth     = pSrcPixMap->sRect.iRectWidth;
-	int32_t iHeight    = pSrcPixMap->sRect.iRectHeight;	
-	int32_t iMbWidth  = iWidth  >> 4;
-	int32_t iMbHeight = iHeight >> 4;
-	int32_t iMbTotalNum    = iMbWidth * iMbHeight;
-
-	SMotionTextureUnit *pMotionTexture = NULL;
-	SVAACalcResult     *pVaaCalcResults = NULL;
-	int8_t   iMotionTextureIndexToDeltaQp = 0;	
-	int32_t	 iAverMotionTextureIndexToDeltaQp = 0;	// double to uint32
-	double_t dAverageMotionIndex = 0.0;	// double to float
-	double_t dAverageTextureIndex = 0.0;
-
-	double_t dQStep = 0.0;
-	double_t dLumaMotionDeltaQp = 0;
-	double_t dLumaTextureDeltaQp = 0;
-
-	uint8_t *pRefFrameY = NULL, *pCurFrameY = NULL;
-	int32_t iRefStride = 0, iCurStride = 0;
-
-	uint8_t *pRefFrameTmp = NULL, *pCurFrameTmp = NULL;
-	int32_t i = 0, j = 0;
-
-	pRefFrameY = (uint8_t *)pRefPixMap->pPixel[0];
-	pCurFrameY = (uint8_t *)pSrcPixMap->pPixel[0];
-
-	iRefStride  = pRefPixMap->iStride[0];
-	iCurStride  = pSrcPixMap->iStride[0];
-
-	/////////////////////////////////////// motion //////////////////////////////////
-	//  motion MB residual variance
-	dAverageMotionIndex = 0.0;
-	dAverageTextureIndex = 0.0;
-	pMotionTexture = m_sAdaptiveQuantParam.pMotionTextureUnit;
-	pVaaCalcResults = m_sAdaptiveQuantParam.pCalcResult;
-
-	if ( pVaaCalcResults->pRefY == pRefFrameY && pVaaCalcResults->pCurY == pCurFrameY )
-	{
-		int32_t iMbIndex = 0;
-		int32_t iSumDiff, iSQDiff, uiSum, iSQSum;
-		for ( j = 0; j < iMbHeight; j ++ ) 
-		{
-			pRefFrameTmp  = pRefFrameY;
-			pCurFrameTmp  = pCurFrameY;	
-			for ( i = 0; i < iMbWidth; i++ )
-			{
-				iSumDiff =  pVaaCalcResults->pSad8x8[iMbIndex][0];
-				iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][1];
-				iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][2];
-				iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][3];
-
-				iSQDiff = pVaaCalcResults->pSsd16x16[iMbIndex];
-				uiSum = pVaaCalcResults->pSum16x16[iMbIndex];
-				iSQSum = pVaaCalcResults->pSumOfSquare16x16[iMbIndex];
-
-				iSumDiff = iSumDiff>>8;
-				pMotionTexture->uiMotionIndex = (iSQDiff>>8) - (iSumDiff * iSumDiff);
-
-				uiSum = uiSum>>8;
-				pMotionTexture->uiTextureIndex = (iSQSum>>8) - (uiSum * uiSum);
-
-				dAverageMotionIndex += pMotionTexture->uiMotionIndex;
-				dAverageTextureIndex += pMotionTexture->uiTextureIndex;
-				pMotionTexture++;
-				++iMbIndex;
-				pRefFrameTmp += MB_WIDTH_LUMA;
-				pCurFrameTmp += MB_WIDTH_LUMA;
-			}
-			pRefFrameY += (iRefStride)<<4;
-			pCurFrameY += (iCurStride)<<4;
-		}
-	}
-	else 
-	{
-		for ( j = 0; j < iMbHeight; j ++ ) 
-		{
-			pRefFrameTmp  = pRefFrameY;
-			pCurFrameTmp  = pCurFrameY;	
-			for ( i = 0; i < iMbWidth; i++ )
-			{
-				m_pfVar( pRefFrameTmp, iRefStride, pCurFrameTmp, iCurStride, pMotionTexture);
-				dAverageMotionIndex += pMotionTexture->uiMotionIndex;
-				dAverageTextureIndex += pMotionTexture->uiTextureIndex;
-				pMotionTexture++;
-				pRefFrameTmp += MB_WIDTH_LUMA;
-				pCurFrameTmp += MB_WIDTH_LUMA;
-
-			}
-			pRefFrameY += (iRefStride)<<4;
-			pCurFrameY += (iCurStride)<<4;
-		}
-	}
-	dAverageMotionIndex = dAverageMotionIndex / iMbTotalNum;
-	dAverageTextureIndex = dAverageTextureIndex / iMbTotalNum;
-	if ( (dAverageMotionIndex <= PESN) && (dAverageMotionIndex >= -PESN) )
-	{
-		dAverageMotionIndex = 1.0;
-	}
-	if ( (dAverageTextureIndex <= PESN) && (dAverageTextureIndex >= -PESN) )
-	{
-		dAverageTextureIndex = 1.0;
-	}
-	//  motion mb residual map to QP
-	//  texture mb original map to QP	
-	iAverMotionTextureIndexToDeltaQp = 0;
-	dAverageMotionIndex = AVERAGE_TIME_MOTION * dAverageMotionIndex;
-
-	if ( m_sAdaptiveQuantParam.iAdaptiveQuantMode == AQ_QUALITY_MODE )
-	{
-		dAverageTextureIndex = AVERAGE_TIME_TEXTURE_QUALITYMODE * dAverageTextureIndex;
-	}
-	else
-	{
-		dAverageTextureIndex = AVERAGE_TIME_TEXTURE_BITRATEMODE * dAverageTextureIndex;
-	}
-
-	pMotionTexture = m_sAdaptiveQuantParam.pMotionTextureUnit;
-	for ( j = 0; j < iMbHeight; j ++ ) 
-	{
-		for ( i = 0; i < iMbWidth; i++ )
-		{
-			double_t a = pMotionTexture->uiTextureIndex / dAverageTextureIndex;
-			dQStep = (a - 1) / (a + MODEL_ALPHA); 		
-			dLumaTextureDeltaQp = MODEL_TIME * dQStep;// range +- 6
-
-			iMotionTextureIndexToDeltaQp = (int8_t)dLumaTextureDeltaQp;
-
-			a = pMotionTexture->uiMotionIndex / dAverageMotionIndex;
-			dQStep = (a - 1) / (a + MODEL_ALPHA); 			
-			dLumaMotionDeltaQp = MODEL_TIME * dQStep;// range +- 6
-
-			if ((m_sAdaptiveQuantParam.iAdaptiveQuantMode == AQ_QUALITY_MODE && dLumaMotionDeltaQp < -PESN) || (m_sAdaptiveQuantParam.iAdaptiveQuantMode == AQ_BITRATE_MODE))
-			{
-				iMotionTextureIndexToDeltaQp += (int8_t)dLumaMotionDeltaQp;
-			}
-
-			m_sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp[j * iMbWidth + i] = iMotionTextureIndexToDeltaQp;
-			iAverMotionTextureIndexToDeltaQp += iMotionTextureIndexToDeltaQp;
-			pMotionTexture++;
-		}
-	}
-	m_sAdaptiveQuantParam.dAverMotionTextureIndexToDeltaQp = (1.0 * iAverMotionTextureIndexToDeltaQp) / iMbTotalNum;
-
-	eReturn = RET_SUCCESS;
-
-	return eReturn;
-}
-
-
-
-EResult CAdaptiveQuantization::Set(int32_t iType, void *pParam)
-{
-	if (pParam == NULL)
-	{
-		return RET_INVALIDPARAM;
-	}
-
-	m_sAdaptiveQuantParam = *(SAdaptiveQuantizationParam *)pParam;
-
-	return RET_SUCCESS;
-}
-
-EResult CAdaptiveQuantization::Get(int32_t iType, void *pParam)
-{
-	if (pParam == NULL)
-	{
-		return RET_INVALIDPARAM;
-	}
-
-	SAdaptiveQuantizationParam * sAdaptiveQuantParam = (SAdaptiveQuantizationParam *)pParam;
-
-	sAdaptiveQuantParam->dAverMotionTextureIndexToDeltaQp = m_sAdaptiveQuantParam.dAverMotionTextureIndexToDeltaQp;
-
-	return RET_SUCCESS;
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-
-void CAdaptiveQuantization::WelsInitVarFunc(PVarFunc &pfVar,  int32_t iCpuFlag)
-{
-	pfVar = SampleVariance16x16_c;
-
-#ifdef X86_ASM	
-	if ( iCpuFlag & WELS_CPU_SSE2 )
-	{
-		pfVar = SampleVariance16x16_sse2;
-	}
-#endif
-}
-
-void SampleVariance16x16_c( uint8_t * pRefY, int32_t iRefStride, uint8_t * pSrcY, int32_t iSrcStride, SMotionTextureUnit* pMotionTexture )
-{
-	uint32_t uiCurSquare = 0,  uiSquare = 0;
-	uint16_t uiCurSum = 0,  uiSum = 0;
-
-	for( int32_t y = 0; y < MB_WIDTH_LUMA; y++ )
-	{
-		for( int32_t x = 0; x < MB_WIDTH_LUMA; x++ )
-		{
-			uint32_t uiDiff = WELS_ABS(pRefY[x] - pSrcY[x]);	
-			uiSum += uiDiff;
-			uiSquare += uiDiff * uiDiff;
-
-			uiCurSum += pSrcY[x];
-			uiCurSquare += pSrcY[x] * pSrcY[x];
-		}
-		pRefY += iRefStride;
-		pSrcY += iSrcStride;
-	}
-
-	uiSum = uiSum>>8;
-	pMotionTexture->uiMotionIndex = (uiSquare>>8) - (uiSum * uiSum);
-
-	uiCurSum = uiCurSum>>8;
-	pMotionTexture->uiTextureIndex = (uiCurSquare>>8) - (uiCurSum * uiCurSum);
-}
-
-WELSVP_NAMESPACE_END
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include "AdaptiveQuantization.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+
+
+#define AVERAGE_TIME_MOTION                   (0.3) //0.3046875 // 1/4 + 1/16 - 1/128 ~ 0.3
+#define AVERAGE_TIME_TEXTURE_QUALITYMODE  (1.0) //0.5 // 1/2
+#define AVERAGE_TIME_TEXTURE_BITRATEMODE  (0.875) //0.5 // 1/2
+#define MODEL_ALPHA                           (0.9910) //1.5 //1.1102
+#define MODEL_TIME                            (5.8185) //9.0 //5.9842
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+CAdaptiveQuantization::CAdaptiveQuantization (int32_t iCpuFlag) {  // iCpuFlag: CPU feature bits, used only to pick the variance routine
+  m_CPUFlag = iCpuFlag;
+  m_eMethod   = METHOD_ADAPTIVE_QUANT;  // tag this strategy with its EMethods id
+  m_pfVar   = NULL;
+  WelsMemset (&m_sAdaptiveQuantParam, 0, sizeof (m_sAdaptiveQuantParam));  // assuming memset semantics: parameter pointers start out NULL until Set()
+  WelsInitVarFunc (m_pfVar, m_CPUFlag);  // m_pfVar is passed by reference and always receives a routine
+}
+
+CAdaptiveQuantization::~CAdaptiveQuantization() {  // empty: this function releases nothing
+}
+// Computes per-MB motion/texture indices and the delta-QP map; returns RET_INVALIDPARAM for NULL or sub-MB-sized input.
+EResult CAdaptiveQuantization::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
+  EResult eReturn = RET_INVALIDPARAM;
+  if (pSrcPixMap == NULL || pRefPixMap == NULL) return eReturn;  // both pixmaps are dereferenced below
+  int32_t iWidth     = pSrcPixMap->sRect.iRectWidth;
+  int32_t iHeight    = pSrcPixMap->sRect.iRectHeight;
+  int32_t iMbWidth  = iWidth  >> 4;
+  int32_t iMbHeight = iHeight >> 4;
+  int32_t iMbTotalNum    = iMbWidth * iMbHeight;
+  if (iMbWidth <= 0 || iMbHeight <= 0) return eReturn;  // no whole 16x16 MB: the averages below would divide by zero
+  SMotionTextureUnit* pMotionTexture = NULL;
+  SVAACalcResult*     pVaaCalcResults = NULL;
+  int8_t   iMotionTextureIndexToDeltaQp = 0;
+  int32_t  iAverMotionTextureIndexToDeltaQp = 0;  // running sum of the per-MB delta QPs
+  double_t dAverageMotionIndex = 0.0;  // sum, then mean, of uiMotionIndex over all MBs
+  double_t dAverageTextureIndex = 0.0;  // sum, then mean, of uiTextureIndex over all MBs
+
+  double_t dQStep = 0.0;
+  double_t dLumaMotionDeltaQp = 0;
+  double_t dLumaTextureDeltaQp = 0;
+
+  uint8_t* pRefFrameY = NULL, *pCurFrameY = NULL;
+  int32_t iRefStride = 0, iCurStride = 0;
+
+  uint8_t* pRefFrameTmp = NULL, *pCurFrameTmp = NULL;
+  int32_t i = 0, j = 0;
+
+  pRefFrameY = (uint8_t*)pRefPixMap->pPixel[0];
+  pCurFrameY = (uint8_t*)pSrcPixMap->pPixel[0];
+
+  iRefStride  = pRefPixMap->iStride[0];
+  iCurStride  = pSrcPixMap->iStride[0];
+  if (m_sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp == NULL) return eReturn;  // the delta-QP output map is written below
+  /////////////////////////////////////// motion //////////////////////////////////
+  //  motion MB residual variance
+  dAverageMotionIndex = 0.0;
+  dAverageTextureIndex = 0.0;
+  pMotionTexture = m_sAdaptiveQuantParam.pMotionTextureUnit;
+  pVaaCalcResults = m_sAdaptiveQuantParam.pCalcResult;
+  if (pMotionTexture == NULL || pVaaCalcResults == NULL) return eReturn;  // Set() has not supplied usable buffers
+  if (pVaaCalcResults->pRefY == pRefFrameY && pVaaCalcResults->pCurY == pCurFrameY) {  // pCalcResult refers to these same luma buffers: use its stored sums
+    int32_t iMbIndex = 0;
+    int32_t iSumDiff, iSQDiff, uiSum, iSQSum;
+    for (j = 0; j < iMbHeight; j ++) {
+      pRefFrameTmp  = pRefFrameY;
+      pCurFrameTmp  = pCurFrameY;
+      for (i = 0; i < iMbWidth; i++) {
+        iSumDiff =  pVaaCalcResults->pSad8x8[iMbIndex][0];
+        iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][1];
+        iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][2];
+        iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][3];
+
+        iSQDiff = pVaaCalcResults->pSsd16x16[iMbIndex];
+        uiSum = pVaaCalcResults->pSum16x16[iMbIndex];
+        iSQSum = pVaaCalcResults->pSumOfSquare16x16[iMbIndex];
+
+        iSumDiff = iSumDiff >> 8;  // >> 8 divides by the 256 samples of a 16x16 block
+        pMotionTexture->uiMotionIndex = (iSQDiff >> 8) - (iSumDiff * iSumDiff);  // mean of squares minus squared mean
+
+        uiSum = uiSum >> 8;
+        pMotionTexture->uiTextureIndex = (iSQSum >> 8) - (uiSum * uiSum);
+
+        dAverageMotionIndex += pMotionTexture->uiMotionIndex;
+        dAverageTextureIndex += pMotionTexture->uiTextureIndex;
+        pMotionTexture++;
+        ++iMbIndex;
+        pRefFrameTmp += MB_WIDTH_LUMA;
+        pCurFrameTmp += MB_WIDTH_LUMA;
+      }
+      pRefFrameY += (iRefStride) << 4;
+      pCurFrameY += (iCurStride) << 4;
+    }
+  } else {  // otherwise measure every block directly from the pixels through m_pfVar
+    for (j = 0; j < iMbHeight; j ++) {
+      pRefFrameTmp  = pRefFrameY;
+      pCurFrameTmp  = pCurFrameY;
+      for (i = 0; i < iMbWidth; i++) {
+        m_pfVar (pRefFrameTmp, iRefStride, pCurFrameTmp, iCurStride, pMotionTexture);
+        dAverageMotionIndex += pMotionTexture->uiMotionIndex;
+        dAverageTextureIndex += pMotionTexture->uiTextureIndex;
+        pMotionTexture++;
+        pRefFrameTmp += MB_WIDTH_LUMA;
+        pCurFrameTmp += MB_WIDTH_LUMA;
+
+      }
+      pRefFrameY += (iRefStride) << 4;  // advance 16 rows to the next MB row
+      pCurFrameY += (iCurStride) << 4;
+    }
+  }
+  dAverageMotionIndex = dAverageMotionIndex / iMbTotalNum;
+  dAverageTextureIndex = dAverageTextureIndex / iMbTotalNum;
+  if ((dAverageMotionIndex <= PESN) && (dAverageMotionIndex >= -PESN)) {  // a near-zero mean would be a zero divisor in the ratios below
+    dAverageMotionIndex = 1.0;
+  }
+  if ((dAverageTextureIndex <= PESN) && (dAverageTextureIndex >= -PESN)) {
+    dAverageTextureIndex = 1.0;
+  }
+  //  motion mb residual map to QP
+  //  texture mb original map to QP
+  iAverMotionTextureIndexToDeltaQp = 0;
+  dAverageMotionIndex = AVERAGE_TIME_MOTION * dAverageMotionIndex;
+
+  if (m_sAdaptiveQuantParam.iAdaptiveQuantMode == AQ_QUALITY_MODE) {
+    dAverageTextureIndex = AVERAGE_TIME_TEXTURE_QUALITYMODE * dAverageTextureIndex;
+  } else {
+    dAverageTextureIndex = AVERAGE_TIME_TEXTURE_BITRATEMODE * dAverageTextureIndex;
+  }
+
+  pMotionTexture = m_sAdaptiveQuantParam.pMotionTextureUnit;  // rewind to the first MB for the mapping pass
+  for (j = 0; j < iMbHeight; j ++) {
+    for (i = 0; i < iMbWidth; i++) {
+      double_t a = pMotionTexture->uiTextureIndex / dAverageTextureIndex;  // index relative to the scaled frame mean
+      dQStep = (a - 1) / (a + MODEL_ALPHA);
+      dLumaTextureDeltaQp = MODEL_TIME * dQStep;// range +- 6
+
+      iMotionTextureIndexToDeltaQp = (int8_t)dLumaTextureDeltaQp;
+
+      a = pMotionTexture->uiMotionIndex / dAverageMotionIndex;
+      dQStep = (a - 1) / (a + MODEL_ALPHA);
+      dLumaMotionDeltaQp = MODEL_TIME * dQStep;// range +- 6
+
+      if ((m_sAdaptiveQuantParam.iAdaptiveQuantMode == AQ_QUALITY_MODE && dLumaMotionDeltaQp < -PESN)  // quality mode: only negative motion terms
+          || (m_sAdaptiveQuantParam.iAdaptiveQuantMode == AQ_BITRATE_MODE)) {  // bitrate mode: motion term always added
+        iMotionTextureIndexToDeltaQp += (int8_t)dLumaMotionDeltaQp;
+      }
+
+      m_sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp[j * iMbWidth + i] = iMotionTextureIndexToDeltaQp;
+      iAverMotionTextureIndexToDeltaQp += iMotionTextureIndexToDeltaQp;
+      pMotionTexture++;
+    }
+  }
+  m_sAdaptiveQuantParam.dAverMotionTextureIndexToDeltaQp = (1.0 * iAverMotionTextureIndexToDeltaQp) / iMbTotalNum;
+
+  eReturn = RET_SUCCESS;
+
+  return eReturn;
+}
+
+
+
+EResult CAdaptiveQuantization::Set (int32_t iType, void* pParam) {  // stores the caller's parameter block; iType is not examined
+  if (pParam == NULL) {
+    return RET_INVALIDPARAM;
+  }
+
+  m_sAdaptiveQuantParam = * (SAdaptiveQuantizationParam*)pParam;  // struct copy: pointer members still refer to the caller's buffers
+
+  return RET_SUCCESS;
+}
+
+EResult CAdaptiveQuantization::Get (int32_t iType, void* pParam) {  // reports the average delta QP; iType is not examined
+  if (pParam == NULL) {
+    return RET_INVALIDPARAM;
+  }
+
+  SAdaptiveQuantizationParam* sAdaptiveQuantParam = (SAdaptiveQuantizationParam*)pParam;  // pParam is treated as the caller's SAdaptiveQuantizationParam
+
+  sAdaptiveQuantParam->dAverMotionTextureIndexToDeltaQp = m_sAdaptiveQuantParam.dAverMotionTextureIndexToDeltaQp;  // the only field written back
+
+  return RET_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+
+void CAdaptiveQuantization::WelsInitVarFunc (PVarFunc& pfVar,  int32_t iCpuFlag) {  // selects the 16x16 variance routine into pfVar
+  pfVar = SampleVariance16x16_c;  // C implementation is the default
+
+#ifdef X86_ASM
+  if (iCpuFlag & WELS_CPU_SSE2) {  // WELS_CPU_SSE2 bit present in the flags: use the SSE2 routine instead
+    pfVar = SampleVariance16x16_sse2;
+  }
+#endif
+}
+
+void SampleVariance16x16_c (uint8_t* pRefY, int32_t iRefStride, uint8_t* pSrcY, int32_t iSrcStride,
+                            SMotionTextureUnit* pMotionTexture) {  // fills uiMotionIndex and uiTextureIndex for one block
+  uint32_t uiCurSquare = 0,  uiSquare = 0;  // sums of squares: source pixels / absolute ref-src differences
+  uint16_t uiCurSum = 0,  uiSum = 0;  // plain sums: source pixels / absolute ref-src differences
+
+  for (int32_t y = 0; y < MB_WIDTH_LUMA; y++) {
+    for (int32_t x = 0; x < MB_WIDTH_LUMA; x++) {
+      uint32_t uiDiff = WELS_ABS (pRefY[x] - pSrcY[x]);
+      uiSum += uiDiff;
+      uiSquare += uiDiff * uiDiff;
+
+      uiCurSum += pSrcY[x];
+      uiCurSquare += pSrcY[x] * pSrcY[x];
+    }
+    pRefY += iRefStride;  // step both pointers to the next row
+    pSrcY += iSrcStride;
+  }
+
+  uiSum = uiSum >> 8;  // assumes MB_WIDTH_LUMA is 16, so >> 8 divides by the 256 samples - TODO confirm
+  pMotionTexture->uiMotionIndex = (uiSquare >> 8) - (uiSum * uiSum);  // mean of squared differences minus squared mean difference
+
+  uiCurSum = uiCurSum >> 8;
+  pMotionTexture->uiTextureIndex = (uiCurSquare >> 8) - (uiCurSum * uiCurSum);  // same formula applied to the source pixels
+}
+
+WELSVP_NAMESPACE_END
--- a/processing/src/adaptivequantization/AdaptiveQuantization.h
+++ b/processing/src/adaptivequantization/AdaptiveQuantization.h
@@ -1,85 +1,85 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	        :  AdaptiveQuantization.h
- *
- * \brief	    :  adaptive quantization class of wels video processor class
- *
- * \date         :  2011/03/21
- *
- * \description  :  1. rewrite the package code of scene change detection class  
- *
- */
-
-#ifndef _WELSVP_ADAPTIVEQUANTIZATION_H
-#define _WELSVP_ADAPTIVEQUANTIZATION_H
-
-#include "../common/util.h"
-#include "../common/memory.h"
-#include "../common/WelsFrameWork.h"
-#include "../../interface/IWelsVP.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-typedef void (VarFunc) ( uint8_t * pRefY, int32_t iRefStrideY, uint8_t * pSrc, int32_t iSrcStrideY, SMotionTextureUnit* pMotionTexture );
-
-typedef VarFunc  * PVarFunc;
-
-VarFunc      SampleVariance16x16_c;
-
-#ifdef X86_ASM
-WELSVP_EXTERN_C_BEGIN
-VarFunc      SampleVariance16x16_sse2;
-WELSVP_EXTERN_C_END
-#endif
-
-
-class CAdaptiveQuantization : public IStrategy
-{			  
-public:
-	CAdaptiveQuantization(int32_t iCpuFlag);
-	~CAdaptiveQuantization();
-
-	EResult Process(int32_t iType, SPixMap *pSrc, SPixMap *pRef);
-	EResult Set(int32_t iType, void *pParam);
-	EResult Get(int32_t iType, void *pParam);
-
-private:
-	void WelsInitVarFunc(PVarFunc &pfVar, int32_t iCpuFlag);
-
-private:
-	PVarFunc			                   m_pfVar;
-	int32_t                                  m_CPUFlag;
-	SAdaptiveQuantizationParam    m_sAdaptiveQuantParam;
-};	
-
-WELSVP_NAMESPACE_END
-
-#endif
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	        :  AdaptiveQuantization.h
+ *
+ * \brief	    :  adaptive quantization class of wels video processor class
+ *
+ * \date         :  2011/03/21
+ *
+ * \description  :  1. rewrite the package code of adaptive quantization class
+ *
+ */
+
+#ifndef _WELSVP_ADAPTIVEQUANTIZATION_H
+#define _WELSVP_ADAPTIVEQUANTIZATION_H
+
+#include "../common/util.h"
+#include "../common/memory.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+typedef void (VarFunc) (uint8_t* pRefY, int32_t iRefStrideY, uint8_t* pSrc, int32_t iSrcStrideY,
+                        SMotionTextureUnit* pMotionTexture);  // function type: reference block, source block, output unit
+
+typedef VarFunc*   PVarFunc;  // pointer to a VarFunc, as stored in CAdaptiveQuantization::m_pfVar
+
+VarFunc      SampleVariance16x16_c;  // declares the C implementation
+
+#ifdef X86_ASM
+WELSVP_EXTERN_C_BEGIN
+VarFunc      SampleVariance16x16_sse2;  // declared only when X86_ASM is defined; NOTE(review): presumably implemented in assembly - confirm
+WELSVP_EXTERN_C_END
+#endif
+
+
+class CAdaptiveQuantization : public IStrategy {  // adaptive quantization strategy of the video processor
+ public:
+  CAdaptiveQuantization (int32_t iCpuFlag);  // iCpuFlag is forwarded to WelsInitVarFunc
+  ~CAdaptiveQuantization();
+
+  EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pRef);  // second pixmap is the reference frame
+  EResult Set (int32_t iType, void* pParam);  // pParam is read as SAdaptiveQuantizationParam
+  EResult Get (int32_t iType, void* pParam);  // pParam is written as SAdaptiveQuantizationParam
+
+ private:
+  void WelsInitVarFunc (PVarFunc& pfVar, int32_t iCpuFlag);  // chooses the variance routine
+
+ private:
+  PVarFunc			                   m_pfVar;  // variance routine chosen at construction
+  int32_t                                  m_CPUFlag;  // copy of the constructor's iCpuFlag
+  SAdaptiveQuantizationParam    m_sAdaptiveQuantParam;  // copy of the block last given to Set()
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- a/processing/src/asm/asm_inc.asm
+++ b/processing/src/asm/asm_inc.asm
@@ -43,7 +43,7 @@
 ; Options, for DEBUG
 ;***********************************************************************
 
-%if 1 
+%if 1
 	%define MOVDQ movdqa
 %else
 	%define MOVDQ movdqu
@@ -58,7 +58,7 @@
 BITS 32
 
 ;***********************************************************************
-; Macros 
+; Macros
 ;***********************************************************************
 
 %macro WELS_EXTERN 1
@@ -74,7 +74,7 @@
 	pxor        %2, %2
     psubw       %2, %1
     pmaxsw      %1, %2
-%endmacro 	
+%endmacro
 
 %macro MMX_XSwap  4
     movq		%4, %2
@@ -105,7 +105,7 @@
     SSE2_XSawp qdq, %5, %2, %3
 %endmacro
 
-;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4 
+;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4
 %macro SSE2_TransTwo4x4W 5
     SSE2_XSawp wd,  %1, %2, %5
     SSE2_XSawp wd,  %3, %4, %2
@@ -125,26 +125,26 @@
 	movdqa	%6, %9
 	movdqa	%9, %4
 	SSE2_XSawp bw,  %7, %6, %4
-	
-	SSE2_XSawp wd,  %1, %3, %6	
+
+	SSE2_XSawp wd,  %1, %3, %6
 	SSE2_XSawp wd,  %8, %2, %3
 	SSE2_XSawp wd,  %5, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %3	
+	movdqa	%9, %3
 	SSE2_XSawp wd,  %7, %4, %3
-	
-	SSE2_XSawp dq,  %1, %5, %4	
+
+	SSE2_XSawp dq,  %1, %5, %4
 	SSE2_XSawp dq,  %6, %2, %5
 	SSE2_XSawp dq,  %8, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %5		
+	movdqa	%9, %5
 	SSE2_XSawp dq,  %7, %3, %5
-	
+
 	SSE2_XSawp qdq,  %1, %8, %3
 	SSE2_XSawp qdq,  %4, %2, %8
 	SSE2_XSawp qdq,  %6, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %1		
+	movdqa	%9, %1
 	SSE2_XSawp qdq,  %7, %5, %1
 	movdqa	%5, %9
 %endmacro
@@ -170,9 +170,9 @@
 %macro butterfly_1to16_sse	3	; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
 	mov %3h, %3l
 	movd %1, e%3x		; i.e, 1% = eax (=b0)
-	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0	
-	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0	
-%endmacro  
+	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0
+	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
+%endmacro
 
 ;copy a dw into a xmm for 8 times
 %macro  SSE2_Copy8Times 2
--- a/processing/src/asm/cpuid.asm
+++ b/processing/src/asm/cpuid.asm
@@ -84,12 +84,12 @@
 ;   void WelsCPUId( int32_t uiIndex, int32_t *pFeatureA, int32_t *pFeatureB, int32_t *pFeatureC, int32_t *pFeatureD )
 ;****************************************************************************************************
 WelsCPUId:
-	push	ebx	
+	push	ebx
 	push	edi
-	
+
 	mov     eax, [esp+12]	; operating index
     cpuid					; cpuid
-	
+
 	; processing various information return
 	mov     edi, [esp+16]
     mov     [edi], eax
@@ -100,10 +100,10 @@
     mov     edi, [esp+28]
     mov     [edi], edx
 
-	pop		edi	
+	pop		edi
     pop     ebx
 	ret
-	
+
 WELS_EXTERN WelsCPUSupportAVX
 ; need call after cpuid=1 and eax, ecx flag got then
 ALIGN 16
@@ -139,7 +139,7 @@
 WelsCPUSupportFMA:
 	mov eax, [esp+4]
 	mov ecx, [esp+8]
-	
+
 	; refer to detection of FMA addressed in INTEL AVX manual document
 	and ecx, 018001000H
 	cmp ecx, 018001000H		; check OSXSAVE, AVX, FMA feature flags
@@ -153,7 +153,7 @@
 	mov eax, 1
 	ret
 fma_not_supported:
-	mov eax, 0	
+	mov eax, 0
 	ret
 
 WELS_EXTERN WelsEmms
--- a/processing/src/asm/denoisefilter.asm
+++ b/processing/src/asm/denoisefilter.asm
@@ -1,263 +1,263 @@
-;*!
-;* \copy
-;*     Copyright (c)  2010-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  predenoise.asm
-;*
-;*  Abstract
-;*      denoise for SVC2.1
-;*  History
-;*      4/13/2010 Created
-;*      7/30/2010 Modified
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Constant
-;***********************************************************************
-SECTION .rodata align=16
-
-sse2_32 times 8 dw 32
-sse2_20 times 8 dw 20
-
-
-BITS 32
-;***********************************************************************
-; Code
-;***********************************************************************
-SECTION .text
-	
-%macro	WEIGHT_LINE	9
-		movq		%2,	%9
-		punpcklbw	%2,	%7
-		movdqa		%8,	%2
-		
-		movdqa		%1,	%6
-		psubusb		%1,	%8
-		psubusb		%8,	%6
-		por			%8,	%1		; ABS(curPixel - centerPixel);
-		
-		movdqa		%1,	%3
-		psubusb		%1,	%8
-
-		pmullw		%1,	%1
-		psrlw		%1,	5
-		pmullw		%2,	%1		
-		paddusw		%4,	%1
-		paddusw		%5,	%2	
-%endmacro
-
-%macro	WEIGHT_LINE1_UV	4
-		movdqa		%2,	%1
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	1
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	2
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
-		
-		movdqa		%2,	%1
-		psrldq		%2,	3
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-		
-		movdqa		%2,	%1
-		psrldq		%2,	4
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-%endmacro
-
-%macro	WEIGHT_LINE2_UV	4
-		movdqa		%2,	%1
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	1
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	2
-		punpcklbw	%2,	%4
-		psllw		%2,	2
-		paddw		%3,	%2
-		
-		movdqa		%2,	%1
-		psrldq		%2,	3
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
-		
-		movdqa		%2,	%1
-		psrldq		%2,	4
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-%endmacro
-
-%macro	WEIGHT_LINE3_UV	4
-		movdqa		%2,	%1
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	1
-		punpcklbw	%2,	%4
-		psllw		%2,	2
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	2
-		punpcklbw	%2,	%4
-		pmullw		%2,	[sse2_20]
-		paddw		%3,	%2
-		
-		movdqa		%2,	%1
-		psrldq		%2,	3
-		punpcklbw	%2,	%4
-		psllw		%2,	2
-		paddw		%3,	%2
-		
-		movdqa		%2,	%1
-		psrldq		%2,	4
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
-%endmacro
-
-ALIGN 16
-WELS_EXTERN BilateralLumaFilter8_sse2
-;***********************************************************************
-;  BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
-;***********************************************************************
-;	1	2	3
-;	4	0	5
-;	6	7	8
-;	0:	the center point
-%define		pushsize	4
-%define		pixel		esp + pushsize + 4
-%define		stride		esp + pushsize + 8
-BilateralLumaFilter8_sse2:
-		push		ebx
-		
-		pxor		xmm7,	xmm7
-		mov			eax,	[pixel]
-		mov			ebx,	eax
-		movq		xmm6,	[eax]
-		punpcklbw	xmm6,	xmm7
-		movdqa		xmm3,	[sse2_32]
-		pxor		xmm4,	xmm4		; nTotWeight
-		pxor		xmm5,	xmm5		; nSum
-		
-		dec			eax
-		mov			ecx,	[stride]
-		
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 4
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 5
-		
-		sub			eax,	ecx
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 1
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 1]		; pixel 2
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 3
-		
-		lea			eax,	[eax + ecx * 2]
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 6
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 1]		; pixel 7
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 8
-		
-		pcmpeqw		xmm0,	xmm0
-		psrlw		xmm0,	15
-		psllw		xmm0,	8
-		psubusw		xmm0,	xmm4
-		pmullw		xmm0,	xmm6
-		paddusw		xmm5,	xmm0
-		psrlw		xmm5,	8
-		packuswb	xmm5,	xmm5
-		movq		[ebx],	xmm5		
-		
-		pop ebx
-		ret	
-
-WELS_EXTERN WaverageChromaFilter8_sse2
-;***********************************************************************
-; void		WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
-;***********************************************************************
-;5x5 filter:
-;1	1	2	1	1
-;1	2	4	2	1
-;2	4	20	4	2
-;1	2	4	2	1
-;1	1	2	1	1
-
-ALIGN 16
-WaverageChromaFilter8_sse2:
-		mov		edx,	[esp + 4]	; pixels
-		mov		ecx,	[esp + 8]	; stride
-		
-		mov		eax,	ecx
-		add		eax,	eax
-		sub		edx,	eax			; pixels - 2 * stride
-		sub		edx,	2
-			
-		pxor	xmm0,	xmm0	
-		pxor	xmm3,	xmm3
-	
-		movdqu		xmm1,	[edx]
-		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0
-		
-		movdqu		xmm1,	[edx + ecx]
-		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0	
-		
-		add		edx,	eax	
-		movdqu		xmm1,	[edx]
-		WEIGHT_LINE3_UV	xmm1,	xmm2,	xmm3,	xmm0
-		
-		movdqu		xmm1,	[edx + ecx]
-		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0	
-		
-		movdqu		xmm1,	[edx + ecx * 2]
-		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0		
-	
-		psrlw		xmm3,		6
-		packuswb	xmm3,		xmm3
-		movq		[edx + 2],		xmm3			
-
-		ret	
\ No newline at end of file
+;*!
+;* \copy
+;*     Copyright (c)  2010-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  denoisefilter.asm
+;*
+;*  Abstract
+;*      denoise for SVC2.1
+;*  History
+;*      4/13/2010 Created
+;*      7/30/2010 Modified
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Constant
+;***********************************************************************
+SECTION .rodata align=16
+
+sse2_32 times 8 dw 32
+sse2_20 times 8 dw 20
+
+
+BITS 32
+;***********************************************************************
+; Code
+;***********************************************************************
+SECTION .text
+
+%macro	WEIGHT_LINE	9	; one bilateral tap for 8 pixels: arg4 += w, arg5 += w * neighbour (args 1, 2 and 8 are scratch registers)
+		movq		%2,	%9	; arg9: memory operand holding 8 neighbour bytes
+		punpcklbw	%2,	%7	; widen bytes to words; the callers in this file pass a zeroed register as arg7
+		movdqa		%8,	%2
+		; absolute difference against arg6 (centre pixels, widened to words by the caller)
+		movdqa		%1,	%6
+		psubusb		%1,	%8
+		psubusb		%8,	%6
+		por			%8,	%1		; ABS(curPixel - centerPixel);
+		; w = (arg3 - absdiff) squared, then >> 5; the callers in this file pass 8 words of 32 as arg3
+		movdqa		%1,	%3
+		psubusb		%1,	%8	; saturating subtract: with arg3 = 32, an absdiff of 32 or more yields w = 0
+
+		pmullw		%1,	%1
+		psrlw		%1,	5
+		pmullw		%2,	%1	; neighbour * w (low 16 bits of each product)
+		paddusw		%4,	%1	; running weight total, saturating add
+		paddusw		%5,	%2	; running weighted sum, saturating add
+%endmacro
+
+%macro	WEIGHT_LINE1_UV	4	; arg3 += 5-tap row sum, weights 1 1 2 1 1; arg1 = 16 source bytes, arg2 = scratch, arg4 = register the callers zero
+		movdqa		%2,	%1	; tap at byte offset 0, weight 1
+		punpcklbw	%2,	%4	; low 8 bytes widened to words
+		paddw		%3,	%2
+		; tap at byte offset 1, weight 1
+		movdqa		%2,	%1
+		psrldq		%2,	1
+		punpcklbw	%2,	%4
+		paddw		%3,	%2
+		; tap at byte offset 2, weight 2 (shift left by 1)
+		movdqa		%2,	%1
+		psrldq		%2,	2
+		punpcklbw	%2,	%4
+		psllw		%2,	1
+		paddw		%3,	%2
+		; tap at byte offset 3, weight 1
+		movdqa		%2,	%1
+		psrldq		%2,	3
+		punpcklbw	%2,	%4
+		paddw		%3,	%2
+		; tap at byte offset 4, weight 1
+		movdqa		%2,	%1
+		psrldq		%2,	4
+		punpcklbw	%2,	%4
+		paddw		%3,	%2
+%endmacro
+
+%macro	WEIGHT_LINE2_UV	4	; arg3 += 5-tap row sum, weights 1 2 4 2 1; arg1 = 16 source bytes, arg2 = scratch, arg4 = register the callers zero
+		movdqa		%2,	%1	; tap at byte offset 0, weight 1
+		punpcklbw	%2,	%4	; low 8 bytes widened to words
+		paddw		%3,	%2
+		; tap at byte offset 1, weight 2 (shift left by 1)
+		movdqa		%2,	%1
+		psrldq		%2,	1
+		punpcklbw	%2,	%4
+		psllw		%2,	1
+		paddw		%3,	%2
+		; tap at byte offset 2, weight 4 (shift left by 2)
+		movdqa		%2,	%1
+		psrldq		%2,	2
+		punpcklbw	%2,	%4
+		psllw		%2,	2
+		paddw		%3,	%2
+		; tap at byte offset 3, weight 2 (shift left by 1)
+		movdqa		%2,	%1
+		psrldq		%2,	3
+		punpcklbw	%2,	%4
+		psllw		%2,	1
+		paddw		%3,	%2
+		; tap at byte offset 4, weight 1
+		movdqa		%2,	%1
+		psrldq		%2,	4
+		punpcklbw	%2,	%4
+		paddw		%3,	%2
+%endmacro
+
+%macro	WEIGHT_LINE3_UV	4	; arg3 += 5-tap row sum, weights 2 4 20 4 2; arg1 = 16 source bytes, arg2 = scratch, arg4 = register the callers zero
+		movdqa		%2,	%1	; tap at byte offset 0, weight 2 (shift left by 1)
+		punpcklbw	%2,	%4	; low 8 bytes widened to words
+		psllw		%2,	1
+		paddw		%3,	%2
+		; tap at byte offset 1, weight 4 (shift left by 2)
+		movdqa		%2,	%1
+		psrldq		%2,	1
+		punpcklbw	%2,	%4
+		psllw		%2,	2
+		paddw		%3,	%2
+		; tap at byte offset 2, weight 20 (multiply by the sse2_20 constant)
+		movdqa		%2,	%1
+		psrldq		%2,	2
+		punpcklbw	%2,	%4
+		pmullw		%2,	[sse2_20]
+		paddw		%3,	%2
+		; tap at byte offset 3, weight 4 (shift left by 2)
+		movdqa		%2,	%1
+		psrldq		%2,	3
+		punpcklbw	%2,	%4
+		psllw		%2,	2
+		paddw		%3,	%2
+		; tap at byte offset 4, weight 2 (shift left by 1)
+		movdqa		%2,	%1
+		psrldq		%2,	4
+		punpcklbw	%2,	%4
+		psllw		%2,	1
+		paddw		%3,	%2
+%endmacro
+
+ALIGN 16
+WELS_EXTERN BilateralLumaFilter8_sse2
+;***********************************************************************
+;  BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);  -- filters the 8 bytes at pixels in place
+;***********************************************************************
+;	1	2	3
+;	4	0	5
+;	6	7	8
+;	0:	the center point; 1..8 are its neighbours, numbered as in the "pixel N" comments below
+%define		pushsize	4
+%define		pixel		esp + pushsize + 4
+%define		stride		esp + pushsize + 8
+BilateralLumaFilter8_sse2:
+		push		ebx	; one dword pushed: this is what pushsize accounts for in the argument offsets above
+
+		pxor		xmm7,	xmm7	; zero, used to widen bytes to words
+		mov			eax,	[pixel]
+		mov			ebx,	eax	; keep the unmodified pointer for the final store
+		movq		xmm6,	[eax]	; the 8 centre pixels
+		punpcklbw	xmm6,	xmm7
+		movdqa		xmm3,	[sse2_32]
+		pxor		xmm4,	xmm4		; nTotWeight
+		pxor		xmm5,	xmm5		; nSum
+
+		dec			eax	; eax = pixels - 1, the left-hand column of the 3x3 window
+		mov			ecx,	[stride]
+
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 4
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 5
+
+		sub			eax,	ecx	; move to the row above
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 1
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 1]		; pixel 2
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 3
+
+		lea			eax,	[eax + ecx * 2]	; move to the row below
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 6
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 1]		; pixel 7
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 8
+		; centre weight = 256 - nTotWeight (saturating); result = (nSum + centre * centre weight) >> 8
+		pcmpeqw		xmm0,	xmm0	; all bits set
+		psrlw		xmm0,	15	; 1 in every word
+		psllw		xmm0,	8	; 256 in every word
+		psubusw		xmm0,	xmm4
+		pmullw		xmm0,	xmm6
+		paddusw		xmm5,	xmm0
+		psrlw		xmm5,	8
+		packuswb	xmm5,	xmm5	; back to bytes
+		movq		[ebx],	xmm5	; overwrite the 8 source pixels
+
+		pop ebx
+		ret
+
+WELS_EXTERN WaverageChromaFilter8_sse2
+;***********************************************************************
+; void		WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
+;***********************************************************************
+;5x5 filter (the weights sum to 64, matching the final shift right by 6); the 8 bytes at pixels are replaced in place:
+;1	1	2	1	1
+;1	2	4	2	1
+;2	4	20	4	2
+;1	2	4	2	1
+;1	1	2	1	1
+
+ALIGN 16
+WaverageChromaFilter8_sse2:
+		mov		edx,	[esp + 4]	; pixels
+		mov		ecx,	[esp + 8]	; stride
+
+		mov		eax,	ecx
+		add		eax,	eax	; eax = 2 * stride
+		sub		edx,	eax			; pixels - 2 * stride
+		sub		edx,	2	; and 2 bytes to the left: top-left corner of the 5x5 window
+
+		pxor	xmm0,	xmm0	; zero, used by the macros to widen bytes to words
+		pxor	xmm3,	xmm3	; word accumulators for the 8 output pixels
+
+		movdqu		xmm1,	[edx]	; row -2 (16 bytes loaded, unaligned)
+		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0
+
+		movdqu		xmm1,	[edx + ecx]	; row -1
+		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0
+
+		add		edx,	eax	; edx = pixels - 2, i.e. the centre row
+		movdqu		xmm1,	[edx]	; row 0
+		WEIGHT_LINE3_UV	xmm1,	xmm2,	xmm3,	xmm0
+
+		movdqu		xmm1,	[edx + ecx]	; row +1
+		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0
+
+		movdqu		xmm1,	[edx + ecx * 2]	; row +2
+		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0
+
+		psrlw		xmm3,		6	; divide by 64 (no rounding term is added)
+		packuswb	xmm3,		xmm3	; back to bytes
+		movq		[edx + 2],		xmm3	; edx + 2 == pixels: the result overwrites the source pixels
+
+		ret
\ No newline at end of file
--- a/processing/src/asm/downsample_bilinear.asm
+++ b/processing/src/asm/downsample_bilinear.asm
@@ -1,1225 +1,1225 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*	upsampling.asm
-;*
-;*  Abstract
-;*		SIMD for pixel domain down sampling
-;*
-;*  History
-;*		10/22/2009	Created
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-
-;***********************************************************************
-; Some constants
-;***********************************************************************
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-ALIGN 16
-shufb_mask_low:
-	db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
-shufb_mask_high:
-	db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
-
-
-ALIGN 16
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx32_sse(	unsigned char* pDst, const int iDstStride,
-;					unsigned char* pSrc, const int iSrcStride,
-;					const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx32_sse:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride	
-	mov ebp, [esp+44]	; iSrcHeight
-	
-	sar ebp, $1			; iSrcHeight >> 1	
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $1			; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $4			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
-	neg ebx				; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 32 bytes
-.xloops:
-	; 1st part horizonal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
-	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
-	movq mm0, [esi]			; 1st pSrc line
-	movq mm1, [esi+8]		; 1st pSrc line + 8
-	movq mm2, [esi+ecx]		; 2nd pSrc line
-	movq mm3, [esi+ecx+8]	; 2nd pSrc line + 8
-
-	; to handle mm0, mm1, mm2, mm3
-	pshufw mm4, mm0, 0d8h	; d D b B c C a A ; 11011000 B
-	pshufw mm5, mm4, 04eh	; c C a A d D b B ; 01001110 B
-	punpcklbw mm4, mm5		; d c D C b a B A
-	pshufw mm4, mm4, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
-
-	pshufw mm5, mm1, 0d8h	; h H f F g G e E ; 11011000 B
-	pshufw mm6, mm5, 04eh	; g G e E h H f F ; 01001110 B
-	punpcklbw mm5, mm6		; h g H G f e F E
-	pshufw mm5, mm5, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
-
-	pshufw mm6, mm2, 0d8h	; l L j J k K i I ; 11011000 B
-	pshufw mm7, mm6, 04eh	; k K i I l L j J ; 01001110 B
-	punpcklbw mm6, mm7		; l k L K j i J I
-	pshufw mm6, mm6, 0d8h  	; l k j i L K J I ; 11011000 B: mm6
-
-	pshufw mm7, mm3, 0d8h	; p P n N o O m M ; 11011000 B
-	pshufw mm0, mm7, 04eh	; o O m M p P n N ; 01001110 B
-	punpcklbw mm7, mm0 		; p o P O n m N M
-	pshufw mm7, mm7, 0d8h  	; p o n m P O N M ; 11011000 B: mm7
-
-	; to handle mm4, mm5, mm6, mm7
-	movq mm0, mm4		; 
-	punpckldq mm0, mm5 	; H G F E D C B A
-	punpckhdq mm4, mm5 	; h g f e d c b a
-
-	movq mm1, mm6
-	punpckldq mm1, mm7 	; P O N M L K J I
-	punpckhdq mm6, mm7 	; p o n m l k j i
-
-	; avg within MB horizon width (16 x 2 lines)
-	pavgb mm0, mm4		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
-	pavgb mm1, mm6		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
-	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
-	
-	; 2nd part horizonal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
-	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movq mm1, [esi+16]		; 1st pSrc line + 16
-	movq mm2, [esi+24]		; 1st pSrc line + 24
-	movq mm3, [esi+ecx+16]	; 2nd pSrc line + 16
-	movq mm4, [esi+ecx+24]	; 2nd pSrc line + 24
-
-	; to handle mm1, mm2, mm3, mm4
-	pshufw mm5, mm1, 0d8h	; d D b B c C a A ; 11011000 B
-	pshufw mm6, mm5, 04eh	; c C a A d D b B ; 01001110 B
-	punpcklbw mm5, mm6		; d c D C b a B A
-	pshufw mm5, mm5, 0d8h  	; d c b a D C B A ; 11011000 B: mm5
-
-	pshufw mm6, mm2, 0d8h	; h H f F g G e E ; 11011000 B
-	pshufw mm7, mm6, 04eh	; g G e E h H f F ; 01001110 B
-	punpcklbw mm6, mm7		; h g H G f e F E
-	pshufw mm6, mm6, 0d8h  	; h g f e H G F E ; 11011000 B: mm6
-
-	pshufw mm7, mm3, 0d8h	; l L j J k K i I ; 11011000 B
-	pshufw mm1, mm7, 04eh	; k K i I l L j J ; 01001110 B
-	punpcklbw mm7, mm1		; l k L K j i J I
-	pshufw mm7, mm7, 0d8h  	; l k j i L K J I ; 11011000 B: mm7
-
-	pshufw mm1, mm4, 0d8h	; p P n N o O m M ; 11011000 B
-	pshufw mm2, mm1, 04eh	; o O m M p P n N ; 01001110 B
-	punpcklbw mm1, mm2 		; p o P O n m N M
-	pshufw mm1, mm1, 0d8h  	; p o n m P O N M ; 11011000 B: mm1
-
-	; to handle mm5, mm6, mm7, mm1
-	movq mm2, mm5
-	punpckldq mm2, mm6 	; H G F E D C B A
-	punpckhdq mm5, mm6 	; h g f e d c b a
-
-	movq mm3, mm7
-	punpckldq mm3, mm1 	; P O N M L K J I
-	punpckhdq mm7, mm1 	; p o n m l k j i
-
-	; avg within MB horizon width (16 x 2 lines)
-	pavgb mm2, mm5		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
-	pavgb mm3, mm7		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
-	pavgb mm2, mm3		; (temp_row1+temp_row2+1)>>1, done in another 2nd horizonal part
-
-	movq [edi  ], mm0
-	movq [edi+8], mm2
-
-	; next SMB
-	lea esi, [esi+32]
-	lea edi, [edi+16]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-
-	WELSEMMS
-	pop ebp
-	pop	edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
-;					  unsigned char* pSrc, const int iSrcStride,
-;					  const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx16_sse:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride	
-	mov ebp, [esp+44]	; iSrcHeight
-	
-	sar ebp, $1		; iSrcHeight >> 1	
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $1		; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $3		; (iSrcWidth >> 1) / 8		; loop count = num_of_mb
-	neg ebx			; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 16 bytes
-.xloops:
-	; 1st part horizonal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
-	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
-	movq mm0, [esi]			; 1st pSrc line
-	movq mm1, [esi+8]		; 1st pSrc line + 8
-	movq mm2, [esi+ecx]		; 2nd pSrc line
-	movq mm3, [esi+ecx+8]	; 2nd pSrc line + 8
-
-	; to handle mm0, mm1, mm2, mm3
-	pshufw mm4, mm0, 0d8h	; d D b B c C a A ; 11011000 B
-	pshufw mm5, mm4, 04eh	; c C a A d D b B ; 01001110 B
-	punpcklbw mm4, mm5		; d c D C b a B A
-	pshufw mm4, mm4, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
-
-	pshufw mm5, mm1, 0d8h	; h H f F g G e E ; 11011000 B
-	pshufw mm6, mm5, 04eh	; g G e E h H f F ; 01001110 B
-	punpcklbw mm5, mm6		; h g H G f e F E
-	pshufw mm5, mm5, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
-
-	pshufw mm6, mm2, 0d8h	; l L j J k K i I ; 11011000 B
-	pshufw mm7, mm6, 04eh	; k K i I l L j J ; 01001110 B
-	punpcklbw mm6, mm7		; l k L K j i J I
-	pshufw mm6, mm6, 0d8h  	; l k j i L K J I ; 11011000 B: mm6
-
-	pshufw mm7, mm3, 0d8h	; p P n N o O m M ; 11011000 B
-	pshufw mm0, mm7, 04eh	; o O m M p P n N ; 01001110 B
-	punpcklbw mm7, mm0 		; p o P O n m N M
-	pshufw mm7, mm7, 0d8h  	; p o n m P O N M ; 11011000 B: mm7
-
-	; to handle mm4, mm5, mm6, mm7
-	movq mm0, mm4		; 
-	punpckldq mm0, mm5 	; H G F E D C B A
-	punpckhdq mm4, mm5 	; h g f e d c b a
-
-	movq mm1, mm6
-	punpckldq mm1, mm7 	; P O N M L K J I
-	punpckhdq mm6, mm7 	; p o n m l k j i
-
-	; avg within MB horizon width (16 x 2 lines)
-	pavgb mm0, mm4		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
-	pavgb mm1, mm6		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
-	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
-
-	movq [edi  ], mm0	
-
-	; next SMB
-	lea esi, [esi+16]
-	lea edi, [edi+8]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-
-	WELSEMMS
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
-;					  unsigned char* pSrc, const int iSrcStride,
-;					  const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx8_sse:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride	
-	mov ebp, [esp+44]	; iSrcHeight
-	
-	sar ebp, $1		; iSrcHeight >> 1	
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $1		; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $2		; (iSrcWidth >> 1) / 4		; loop count = num_of_mb
-	neg ebx			; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 8 bytes
-.xloops:
-	; 1st part horizonal loop: x8 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	mm0: d D c C b B a A
-	;2nd Line Src:	mm1: h H g G f F e E
-	;=> target:
-	;: H G F E D C B A
-	;: h g f e d c b a
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
-	movq mm0, [esi]			; 1st pSrc line	
-	movq mm1, [esi+ecx]		; 2nd pSrc line	
-
-	; to handle mm0, mm1, mm2, mm3
-	pshufw mm2, mm0, 0d8h	; d D b B c C a A ; 11011000 B
-	pshufw mm3, mm2, 04eh	; c C a A d D b B ; 01001110 B
-	punpcklbw mm2, mm3		; d c D C b a B A
-	pshufw mm2, mm2, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
-
-	pshufw mm4, mm1, 0d8h	; h H f F g G e E ; 11011000 B
-	pshufw mm5, mm4, 04eh	; g G e E h H f F ; 01001110 B
-	punpcklbw mm4, mm5		; h g H G f e F E
-	pshufw mm4, mm4, 0d8h  	; h g f e H G F E ; 11011000 B: mm5	
-
-	; to handle mm2, mm4
-	movq mm0, mm2		; 
-	punpckldq mm0, mm4 	; H G F E D C B A
-	punpckhdq mm2, mm4 	; h g f e d c b a
-
-	; avg within MB horizon width (16 x 2 lines)
-	pavgb mm0, mm2		; (H+h+1)>>1, .., (A+a+1)>>1, temp_row1, 2
-	pshufw mm1, mm0, 04eh	; 01001110 B	
-	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
-
-	movd [edi],	mm0	
-
-	; next unit
-	lea esi, [esi+8]
-	lea edi, [edi+4]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-
-	WELSEMMS
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-
-
-; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx32_ssse3(	unsigned char* pDst, const int iDstStride,
-;					unsigned char* pSrc, const int iSrcStride,
-;					const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx32_ssse3:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride	
-	mov ebp, [esp+44]	; iSrcHeight
-	
-	sar ebp, $1			; iSrcHeight >> 1	
-
-	movdqa xmm7, [shufb_mask_low]	; mask low
-	movdqa xmm6, [shufb_mask_high]	; mask high
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $1			; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $4			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
-	neg ebx				; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 32 bytes
-.xloops:
-	; 1st part horizonal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	xmm0: h H g G f F e E d D c C b B a A
-	;				xmm1: p P o O n N m M l L k K j J i I
-	;2nd Line Src:	xmm2: h H g G f F e E d D c C b B a A
-	;				xmm3: p P o O n N m M l L k K j J i I
-	;=> target:
-	;: P O N M L K J I H G F E D C B A
-	;: p o n m l k j i h g f e d c b a
-	;: P ..                          A
-	;: p ..                          a
-	
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
-	movdqa xmm0, [esi]			; 1st_src_line
-	movdqa xmm1, [esi+16]		; 1st_src_line + 16
-	movdqa xmm2, [esi+ecx]		; 2nd_src_line
-	movdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16	
-	
-	; packing & avg
-	movdqa xmm4, xmm0			; h H g G f F e E d D c C b B a A
-	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-	pshufb xmm4, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	; another implementation for xmm4 high bits
-;	psubb xmm4, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;	psrlw xmm4, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	pavgb xmm0, xmm4
-
-	movdqa xmm5, xmm1
-	pshufb xmm1, xmm7
-	pshufb xmm5, xmm6
-;	psubb xmm5, xmm1
-;	psrlw xmm5, 8	
-	pavgb xmm1, xmm5
-
-	movdqa xmm4, xmm2
-	pshufb xmm2, xmm7
-	pshufb xmm4, xmm6
-;	psubb xmm4, xmm2
-;	psrlw xmm4, 8	
-	pavgb xmm2, xmm4
-
-	movdqa xmm5, xmm3
-	pshufb xmm3, xmm7
-	pshufb xmm5, xmm6
-;	psubb xmm5, xmm3
-;	psrlw xmm5, 8	
-	pavgb xmm3, xmm5
-	
-	packuswb xmm0, xmm1	
-	packuswb xmm2, xmm3	
-	pavgb xmm0, xmm2	
-
-	; write pDst
-	movdqa [edi], xmm0
-
-	; next SMB
-	lea esi, [esi+32]
-	lea edi, [edi+16]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-	
-	pop ebp
-	pop	edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
-;					  unsigned char* pSrc, const int iSrcStride,
-;					  const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx16_ssse3:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride	
-	mov ebp, [esp+44]	; iSrcHeight
-	
-	sar ebp, $1		; iSrcHeight >> 1	
-	movdqa xmm7, [shufb_mask_low]	; mask low	
-	movdqa xmm6, [shufb_mask_high]	; mask high
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $1		; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $3		; (iSrcWidth >> 1) / 8		; loop count = num_of_mb
-	neg ebx			; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 16 bytes
-.xloops:
-	; horizonal loop: x16 bytes by source
-	;               mem  hi<-       ->lo
-	;1st line pSrc:	xmm0: h H g G f F e E d D c C b B a A
-	;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
-	movdqa xmm0, [esi]			; 1st_src_line	
-	movdqa xmm1, [esi+ecx]		; 2nd_src_line	
-	
-	; packing & avg
-	movdqa xmm2, xmm0			; h H g G f F e E d D c C b B a A
-	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-	pshufb xmm2, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	; another implementation for xmm2 high bits
-;	psubb xmm2, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;	psrlw xmm2, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a	
-	pavgb xmm0, xmm2
-
-	movdqa xmm3, xmm1
-	pshufb xmm1, xmm7
-	pshufb xmm3, xmm6
-;	psubb xmm3, xmm1
-;	psrlw xmm3, 8	
-	pavgb xmm1, xmm3
-
-	pavgb xmm0, xmm1	
-	packuswb xmm0, xmm1	
-
-	; write pDst
-	movq [edi], xmm0	
-
-	; next SMB
-	lea esi, [esi+16]
-	lea edi, [edi+8]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-	
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx32_sse4(	unsigned char* pDst, const int iDstStride,
-;					unsigned char* pSrc, const int iSrcStride,
-;					const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx32_sse4:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride	
-	mov ebp, [esp+44]	; iSrcHeight
-	
-	sar ebp, $1			; iSrcHeight >> 1	
-
-	movdqa xmm7, [shufb_mask_low]	; mask low	
-	movdqa xmm6, [shufb_mask_high]	; mask high
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $1			; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $4			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
-	neg ebx				; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 32 bytes
-.xloops:
-	; 1st part horizonal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	xmm0: h H g G f F e E d D c C b B a A
-	;				xmm1: p P o O n N m M l L k K j J i I
-	;2nd Line Src:	xmm2: h H g G f F e E d D c C b B a A
-	;				xmm3: p P o O n N m M l L k K j J i I
-	;=> target:
-	;: P O N M L K J I H G F E D C B A
-	;: p o n m l k j i h g f e d c b a
-	;: P ..                          A
-	;: p ..                          a
-	
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
-	movntdqa xmm0, [esi]			; 1st_src_line
-	movntdqa xmm1, [esi+16]		; 1st_src_line + 16
-	movntdqa xmm2, [esi+ecx]		; 2nd_src_line
-	movntdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16	
-	
-	; packing & avg
-	movdqa xmm4, xmm0			; h H g G f F e E d D c C b B a A
-	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-	pshufb xmm4, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-;	psubb xmm4, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;	psrlw xmm4, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	pavgb xmm0, xmm4
-
-	movdqa xmm5, xmm1
-	pshufb xmm1, xmm7
-	pshufb xmm5, xmm6
-;	psubb xmm5, xmm1
-;	psrlw xmm5, 8
-	pavgb xmm1, xmm5
-
-	movdqa xmm4, xmm2
-	pshufb xmm2, xmm7
-	pshufb xmm4, xmm6
-;	psubb xmm4, xmm2
-;	psrlw xmm4, 8
-	pavgb xmm2, xmm4
-
-	movdqa xmm5, xmm3
-	pshufb xmm3, xmm7
-	pshufb xmm5, xmm6
-;	psubb xmm5, xmm3
-;	psrlw xmm5, 8
-	pavgb xmm3, xmm5
-	
-	packuswb xmm0, xmm1	
-	packuswb xmm2, xmm3	
-	pavgb xmm0, xmm2	
-
-	; write pDst
-	movdqa [edi], xmm0
-
-	; next SMB
-	lea esi, [esi+32]
-	lea edi, [edi+16]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-	
-	pop ebp
-	pop	edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
-;					  unsigned char* pSrc, const int iSrcStride,
-;					  const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx16_sse4:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride	
-	mov ebp, [esp+44]	; iSrcHeight
-	
-	sar ebp, $1		; iSrcHeight >> 1	
-	movdqa xmm7, [shufb_mask_low]	; mask low
-	movdqa xmm6, [shufb_mask_high]	; mask high
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $1		; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $3		; (iSrcWidth >> 1) / 8		; loop count = num_of_mb
-	neg ebx			; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 16 bytes
-.xloops:
-	; horizonal loop: x16 bytes by source
-	;               mem  hi<-       ->lo
-	;1st line pSrc:	xmm0: h H g G f F e E d D c C b B a A
-	;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
-	movntdqa xmm0, [esi]			; 1st_src_line	
-	movntdqa xmm1, [esi+ecx]		; 2nd_src_line	
-	
-	; packing & avg
-	movdqa xmm2, xmm0			; h H g G f F e E d D c C b B a A
-	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-	pshufb xmm2, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-;	psubb xmm2, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;	psrlw xmm2, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	pavgb xmm0, xmm2
-
-	movdqa xmm3, xmm1
-	pshufb xmm1, xmm7
-	pshufb xmm3, xmm6
-;	psubb xmm3, xmm1
-;	psrlw xmm3, 8
-	pavgb xmm1, xmm3
-
-	pavgb xmm0, xmm1	
-	packuswb xmm0, xmm1	
-
-	; write pDst
-	movq [edi], xmm0	
-
-	; next SMB
-	lea esi, [esi+16]
-	lea edi, [edi+8]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-	
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-
-
-
-
-WELS_EXTERN	GeneralBilinearAccurateDownsampler_sse2
-;**************************************************************************************************************
-;int GeneralBilinearAccurateDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
-;							unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
-;                           unsigned int uiScaleX, unsigned int uiScaleY );
-;{
-;**************************************************************************************************************
-
-ALIGN 16
-GeneralBilinearAccurateDownsampler_sse2:
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-%define		pushsize	16
-%define		localsize	28
-%define		pDstData		esp + pushsize + localsize + 4
-%define		dwDstStride		esp + pushsize + localsize + 8
-%define		dwDstWidth		esp + pushsize + localsize + 12
-%define		dwDstHeight		esp + pushsize + localsize + 16
-%define		pSrcData		esp + pushsize + localsize + 20
-%define		dwSrcStride		esp + pushsize + localsize + 24
-%define		dwSrcWidth		esp + pushsize + localsize + 28
-%define		dwSrcHeight		esp + pushsize + localsize + 32
-%define		scale			esp + 0
-%define		uiScaleX			esp + pushsize + localsize + 36
-%define		uiScaleY			esp + pushsize + localsize + 40
-%define		tmpHeight		esp + 12
-%define		yInverse		esp + 16
-%define		xInverse		esp + 20
-%define		dstStep			esp + 24
-	sub		esp,			localsize
-	
-	pxor	xmm0,	xmm0
-	mov		edx,	32767
-	mov		eax,	[uiScaleX]
-	and		eax,	32767
-	mov		ebx,	eax
-	neg		ebx
-	and		ebx,	32767
-	movd	xmm1,		eax						; uinc(uiScaleX mod 32767)
-	movd	xmm2,		ebx						; -uinc
-	psllq	xmm1,		32
-	por		xmm1,		xmm2					; 0 0  uinc  -uinc   (dword)
-	pshufd	xmm7,		xmm1,	01000100b		; xmm7: uinc -uinc uinc -uinc
-	
-	mov		eax,	[uiScaleY]
-	and		eax,	32767
-	mov		ebx,	eax
-	neg		ebx
-	and		ebx,	32767
-	movd	xmm6,		eax						; vinc(uiScaleY mod 32767)
-	movd	xmm2,		ebx						; -vinc
-	psllq	xmm6,		32
-	por		xmm6,		xmm2					; 0 0 vinc -vinc (dword)
-	pshufd	xmm6,		xmm6,	01010000b		; xmm6: vinc vinc -vinc -vinc
-	
-	mov		edx,		40003fffh
-	movd	xmm5,		edx
-	punpcklwd	xmm5,	xmm0					; 16384 16383
-	pshufd	xmm5,		xmm5,	01000100b		; xmm5: 16384 16383 16384 16383
-	
-
-DOWNSAMPLE:
-	
-	mov		eax,			[dwDstHeight]
-	mov		edi,			[pDstData]
-	mov		edx,			[dwDstStride]
-	mov		ecx,			[dwDstWidth]
-	sub		edx,			ecx
-	mov		[dstStep],	edx				; stride - width
-	dec		eax
-	mov		[tmpHeight],	eax
-	mov		eax,			16384
-	mov		[yInverse],		eax
-	
-	pshufd	xmm4,		xmm5,	01010000b	; initial v to 16384 16384 16383 16383
-	
-HEIGHT:	
-	mov		eax,	[yInverse]
-	mov		esi,	[pSrcData]
-	shr		eax,	15
-	mul		dword [dwSrcStride]
-	add		esi,	eax					; get current row address
-	mov		ebp,	esi
-	add		ebp,	[dwSrcStride]
-	
-	mov		eax,		16384
-	mov		[xInverse],		eax
-	mov		ecx,			[dwDstWidth]
-	dec		ecx
-	
-	movdqa	xmm3,		xmm5			; initial u to 16384 16383 16384 16383
-	
-WIDTH:
-	mov		eax,		[xInverse]
-	shr		eax,		15
-	
-	movd	xmm1,		[esi+eax]		; xxxxxxba
-	movd	xmm2,		[ebp+eax]		; xxxxxxdc
-	pxor	xmm0,		xmm0
-	punpcklwd	xmm1,	xmm2			; xxxxdcba
-	punpcklbw	xmm1,	xmm0			; 0d0c0b0a
-	punpcklwd	xmm1,	xmm0			; 000d000c000b000a
-	
-	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
-	pmaddwd	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2
-	movdqa	xmm0,	xmm2
-	pmuludq	xmm2,	xmm1
-	psrlq	xmm0,	32
-	psrlq	xmm1,	32
-	pmuludq	xmm0,	xmm1
-	paddq	xmm2,	xmm0
-	pshufd	xmm1,	xmm2,	00001110b
-	paddq	xmm2,	xmm1
-	psrlq	xmm2,	29
-	
-	movd	eax,	xmm2
-	inc		eax
-	shr		eax,	1
-	mov		[edi],	al
-	inc		edi
-	
-	mov		eax,		[uiScaleX]
-	add		[xInverse],	eax
-	
-	paddw	xmm3,		xmm7			; inc u
-	psllw	xmm3,		1
-	psrlw	xmm3,		1
-	
-	loop	WIDTH
-
-WIDTH_END:
-	mov		eax,		[xInverse]
-	shr		eax,		15
-	mov		cl,			[esi+eax]
-	mov		[edi],		cl
-	inc		edi
-	
-	mov		eax,		[uiScaleY]
-	add		[yInverse],	eax
-	add		edi,		[dstStep]
-	
-	paddw	xmm4,	xmm6				; inc v
-	psllw	xmm4,	1
-	psrlw	xmm4,	1
-	
-	dec		dword [tmpHeight]
-	jg		HEIGHT
-
-
-LAST_ROW:	
-	mov		eax,	[yInverse]
-	mov		esi,	[pSrcData]
-	shr		eax,	15
-	mul		dword [dwSrcStride]
-	add		esi,	eax					; get current row address
-	
-	mov		eax,		16384
-	mov		[xInverse],		eax
-	mov		ecx,			[dwDstWidth]
-	
-LAST_ROW_WIDTH:
-	mov		eax,		[xInverse]
-	shr		eax,		15
-	
-	mov		al,			[esi+eax]
-	mov		[edi],	al
-	inc		edi
-	
-	mov		eax,		[uiScaleX]
-	add		[xInverse],	eax
-	
-	loop	LAST_ROW_WIDTH
-
-LAST_ROW_END:
-
-	add		esp,			localsize
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		pushsize
-%undef		localsize
-%undef		pSrcData
-%undef		dwSrcWidth
-%undef		dwSrcHeight
-%undef		dwSrcStride
-%undef		pDstData
-%undef		dwDstWidth
-%undef		dwDstHeight
-%undef		dwDstStride
-%undef		scale
-%undef		uiScaleX
-%undef		uiScaleY
-%undef		tmpHeight
-%undef		yInverse
-%undef		xInverse
-%undef		dstStep
-	ret
-	
-	
-	
-	
-WELS_EXTERN	GeneralBilinearFastDownsampler_sse2
-;**************************************************************************************************************
-;int GeneralBilinearFastDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
-;				unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
-;               unsigned int uiScaleX, unsigned int uiScaleY );
-;{
-;**************************************************************************************************************
-
-ALIGN 16
-GeneralBilinearFastDownsampler_sse2:
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-%define		pushsize	16
-%define		localsize	28
-%define		pDstData		esp + pushsize + localsize + 4
-%define		dwDstStride		esp + pushsize + localsize + 8
-%define		dwDstWidth		esp + pushsize + localsize + 12
-%define		dwDstHeight		esp + pushsize + localsize + 16
-%define		pSrcData		esp + pushsize + localsize + 20
-%define		dwSrcStride		esp + pushsize + localsize + 24
-%define		dwSrcWidth		esp + pushsize + localsize + 28
-%define		dwSrcHeight		esp + pushsize + localsize + 32
-%define		scale			esp + 0
-%define		uiScaleX			esp + pushsize + localsize + 36
-%define		uiScaleY			esp + pushsize + localsize + 40
-%define		tmpHeight		esp + 12
-%define		yInverse		esp + 16
-%define		xInverse		esp + 20
-%define		dstStep			esp + 24
-	sub		esp,			localsize
-	
-	pxor	xmm0,	xmm0
-	mov		edx,	65535
-	mov		eax,	[uiScaleX]
-	and		eax,	edx
-	mov		ebx,	eax
-	neg		ebx
-	and		ebx,	65535
-	movd	xmm1,		eax						; uinc(uiScaleX mod 65536)
-	movd	xmm2,		ebx						; -uinc
-	psllq	xmm1,		32
-	por		xmm1,		xmm2					; 0 uinc 0 -uinc
-	pshuflw	xmm7,		xmm1,	10001000b		; xmm7: uinc -uinc uinc -uinc
-	
-	mov		eax,	[uiScaleY]
-	and		eax,	32767
-	mov		ebx,	eax
-	neg		ebx
-	and		ebx,	32767
-	movd	xmm6,		eax						; vinc(uiScaleY mod 32767)
-	movd	xmm2,		ebx						; -vinc
-	psllq	xmm6,		32
-	por		xmm6,		xmm2					; 0 vinc 0 -vinc
-	pshuflw	xmm6,		xmm6,	10100000b		; xmm6: vinc vinc -vinc -vinc
-	
-	mov		edx,		80007fffh				; 32768 32767
-	movd	xmm5,		edx					
-	pshuflw	xmm5,		xmm5,		01000100b	; 32768 32767 32768 32767
-	mov		ebx,		16384
-	
-
-FAST_DOWNSAMPLE:
-	
-	mov		eax,			[dwDstHeight]
-	mov		edi,			[pDstData]
-	mov		edx,			[dwDstStride]
-	mov		ecx,			[dwDstWidth]
-	sub		edx,			ecx
-	mov		[dstStep],	edx				; stride - width
-	dec		eax
-	mov		[tmpHeight],	eax
-	mov		eax,		16384
-	mov		[yInverse],		eax
-	
-	pshuflw	xmm4,		xmm5,	01010000b
-	psrlw	xmm4,		1				; initial v to 16384 16384 16383 16383
-	
-FAST_HEIGHT:	
-	mov		eax,	[yInverse]
-	mov		esi,	[pSrcData]
-	shr		eax,	15
-	mul		dword [dwSrcStride]
-	add		esi,	eax					; get current row address
-	mov		ebp,	esi
-	add		ebp,	[dwSrcStride]
-	
-	mov		eax,		32768
-	mov		[xInverse],		eax
-	mov		ecx,			[dwDstWidth]
-	dec		ecx
-	
-	movdqa	xmm3,		xmm5			; initial u to 32768 32767 32768 32767
-	
-FAST_WIDTH:
-	mov		eax,		[xInverse]
-	shr		eax,		16
-	
-	movd	xmm1,		[esi+eax]		; xxxxxxba
-	movd	xmm2,		[ebp+eax]		; xxxxxxdc
-	punpcklwd	xmm1,	xmm2			; xxxxdcba
-	punpcklbw	xmm1,	xmm0			; 0d0c0b0a
-	
-	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
-	pmulhuw	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2
-	pmaddwd		xmm2,	xmm1
-	pshufd	xmm1,	xmm2,	00000001b
-	paddd	xmm2,	xmm1
-	movd	xmm1,	ebx
-	paddd	xmm2,	xmm1
-	psrld	xmm2,	15
-	
-	packuswb	xmm2,	xmm0
-	movd	eax,	xmm2
-	mov		[edi],	al
-	inc		edi
-	
-	mov		eax,		[uiScaleX]
-	add		[xInverse],	eax
-	
-	paddw	xmm3,		xmm7			; inc u
-	
-	loop	FAST_WIDTH
-
-FAST_WIDTH_END:
-	mov		eax,		[xInverse]
-	shr		eax,		16
-	mov		cl,			[esi+eax]
-	mov		[edi],		cl
-	inc		edi
-	
-	mov		eax,		[uiScaleY]
-	add		[yInverse],	eax
-	add		edi,		[dstStep]
-	
-	paddw	xmm4,	xmm6				; inc v
-	psllw	xmm4,	1
-	psrlw	xmm4,	1
-	
-	dec		dword [tmpHeight]
-	jg		FAST_HEIGHT
-
-
-FAST_LAST_ROW:	
-	mov		eax,	[yInverse]
-	mov		esi,	[pSrcData]
-	shr		eax,	15
-	mul		dword [dwSrcStride]
-	add		esi,	eax					; get current row address
-	
-	mov		eax,		32768
-	mov		[xInverse],		eax
-	mov		ecx,			[dwDstWidth]
-	
-FAST_LAST_ROW_WIDTH:
-	mov		eax,		[xInverse]
-	shr		eax,		16
-	
-	mov		al,			[esi+eax]
-	mov		[edi],	al
-	inc		edi
-	
-	mov		eax,		[uiScaleX]
-	add		[xInverse],	eax
-	
-	loop	FAST_LAST_ROW_WIDTH
-
-FAST_LAST_ROW_END:
-
-	add		esp,			localsize
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		pushsize
-%undef		localsize
-%undef		pSrcData
-%undef		dwSrcWidth
-%undef		dwSrcHeight
-%undef		dwSrcStride
-%undef		pDstData
-%undef		dwDstWidth
-%undef		dwDstHeight
-%undef		dwDstStride
-%undef		scale
-%undef		uiScaleX
-%undef		uiScaleY
-%undef		tmpHeight
-%undef		yInverse
-%undef		xInverse
-%undef		dstStep
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*	upsampling.asm
+;*
+;*  Abstract
+;*		SIMD for pixel domain down sampling
+;*
+;*  History
+;*		10/22/2009	Created
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+BITS 32
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+
+;***********************************************************************
+; Some constants
+;***********************************************************************
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Various memory constants (pshufb byte-selection masks)
+;***********************************************************************
+
+ALIGN 16
+shufb_mask_low:		; pshufb control: picks source bytes 0,2,..,14 into the low byte of each word; 80h entries zero the high byte
+	db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
+shufb_mask_high:		; pshufb control: picks source bytes 1,3,..,15 into the low byte of each word; 80h entries zero the high byte
+	db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
+
+
+ALIGN 16
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx32_sse(	unsigned char* pDst, const int iDstStride,
+;					unsigned char* pSrc, const int iSrcStride,
+;					const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx32_sse:	; 2:1 downsample in x and y: each dst byte is the pavgb-rounded average of a 2x2 source block
+	push ebx			; save every register this routine overwrites
+	push edx
+	push esi
+	push edi
+	push ebp
+
+	mov edi, [esp+24]	; pDst (first argument: 5 pushes + return address = 24 bytes above esp)
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+
+	sar ebp, $1			; iSrcHeight >> 1 = number of dst rows
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth (re-read per row: eax is consumed as the x counter)
+	sar eax, $1			; iSrcWidth >> 1 = iDstWidth
+	mov ebx, eax		; keep iDstWidth in ebx
+	sar eax, $4			; iDstWidth / 16 = inner-loop count (16 dst bytes per iteration)
+	neg ebx				; ebx = -iDstWidth, used after the x loop to rewind both pointers
+	; each loop = source bandwidth: 32 bytes from each of two rows
+.xloops:
+	; 1st part of the horizontal loop: first 16 source bytes of each row
+	;               mem  hi<-       ->lo
+	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
+	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
+	;=> target:
+	;: H G F E D C B A, P O N M L K J I
+	;: h g f e d c b a, p o n m l k j i
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movq mm0, [esi]			; 1st pSrc line
+	movq mm1, [esi+8]		; 1st pSrc line + 8
+	movq mm2, [esi+ecx]		; 2nd pSrc line
+	movq mm3, [esi+ecx+8]	; 2nd pSrc line + 8
+
+	; de-interleave mm0, mm1, mm2, mm3: even-position bytes (upper case) to the low dword, odd-position bytes (lower case) to the high dword
+	pshufw mm4, mm0, 0d8h	; d D b B c C a A ; 11011000 B
+	pshufw mm5, mm4, 04eh	; c C a A d D b B ; 01001110 B
+	punpcklbw mm4, mm5		; d c D C b a B A
+	pshufw mm4, mm4, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
+
+	pshufw mm5, mm1, 0d8h	; h H f F g G e E ; 11011000 B
+	pshufw mm6, mm5, 04eh	; g G e E h H f F ; 01001110 B
+	punpcklbw mm5, mm6		; h g H G f e F E
+	pshufw mm5, mm5, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
+
+	pshufw mm6, mm2, 0d8h	; l L j J k K i I ; 11011000 B
+	pshufw mm7, mm6, 04eh	; k K i I l L j J ; 01001110 B
+	punpcklbw mm6, mm7		; l k L K j i J I
+	pshufw mm6, mm6, 0d8h  	; l k j i L K J I ; 11011000 B: mm6
+
+	pshufw mm7, mm3, 0d8h	; p P n N o O m M ; 11011000 B
+	pshufw mm0, mm7, 04eh	; o O m M p P n N ; 01001110 B
+	punpcklbw mm7, mm0 		; p o P O n m N M
+	pshufw mm7, mm7, 0d8h  	; p o n m P O N M ; 11011000 B: mm7
+
+	; gather the even bytes and the odd bytes of each row from mm4, mm5, mm6, mm7
+	movq mm0, mm4		; copy first: mm4 is still needed for its high dword
+	punpckldq mm0, mm5 	; H G F E D C B A
+	punpckhdq mm4, mm5 	; h g f e d c b a
+
+	movq mm1, mm6
+	punpckldq mm1, mm7 	; P O N M L K J I
+	punpckhdq mm6, mm7 	; p o n m l k j i
+
+	; avg within MB horizon width (16 x 2 lines)
+	pavgb mm0, mm4		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+	pavgb mm1, mm6		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, held in mm0 until the 2nd horizontal part is done, then both are stored
+
+	; 2nd part of the horizontal loop: source bytes 16..31 of each row
+	;               mem  hi<-       ->lo
+	;1st Line Src:	mm1: d D c C b B a A	mm2: h H g G f F e E
+	;2nd Line Src:	mm3: l L k K j J i I   	mm4: p P o O n N m M
+	;=> target:
+	;: H G F E D C B A, P O N M L K J I
+	;: h g f e d c b a, p o n m l k j i
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movq mm1, [esi+16]		; 1st pSrc line + 16
+	movq mm2, [esi+24]		; 1st pSrc line + 24
+	movq mm3, [esi+ecx+16]	; 2nd pSrc line + 16
+	movq mm4, [esi+ecx+24]	; 2nd pSrc line + 24
+
+	; de-interleave mm1, mm2, mm3, mm4 the same way (mm0 holds the 1st-part result and must not be touched)
+	pshufw mm5, mm1, 0d8h	; d D b B c C a A ; 11011000 B
+	pshufw mm6, mm5, 04eh	; c C a A d D b B ; 01001110 B
+	punpcklbw mm5, mm6		; d c D C b a B A
+	pshufw mm5, mm5, 0d8h  	; d c b a D C B A ; 11011000 B: mm5
+
+	pshufw mm6, mm2, 0d8h	; h H f F g G e E ; 11011000 B
+	pshufw mm7, mm6, 04eh	; g G e E h H f F ; 01001110 B
+	punpcklbw mm6, mm7		; h g H G f e F E
+	pshufw mm6, mm6, 0d8h  	; h g f e H G F E ; 11011000 B: mm6
+
+	pshufw mm7, mm3, 0d8h	; l L j J k K i I ; 11011000 B
+	pshufw mm1, mm7, 04eh	; k K i I l L j J ; 01001110 B
+	punpcklbw mm7, mm1		; l k L K j i J I
+	pshufw mm7, mm7, 0d8h  	; l k j i L K J I ; 11011000 B: mm7
+
+	pshufw mm1, mm4, 0d8h	; p P n N o O m M ; 11011000 B
+	pshufw mm2, mm1, 04eh	; o O m M p P n N ; 01001110 B
+	punpcklbw mm1, mm2 		; p o P O n m N M
+	pshufw mm1, mm1, 0d8h  	; p o n m P O N M ; 11011000 B: mm1
+
+	; gather the even bytes and the odd bytes of each row from mm5, mm6, mm7, mm1
+	movq mm2, mm5
+	punpckldq mm2, mm6 	; H G F E D C B A
+	punpckhdq mm5, mm6 	; h g f e d c b a
+
+	movq mm3, mm7
+	punpckldq mm3, mm1 	; P O N M L K J I
+	punpckhdq mm7, mm1 	; p o n m l k j i
+
+	; avg within MB horizon width (16 x 2 lines)
+	pavgb mm2, mm5		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+	pavgb mm3, mm7		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+	pavgb mm2, mm3		; (temp_row1+temp_row2+1)>>1, result of the 2nd horizontal part
+
+	movq [edi  ], mm0		; dst bytes 0..7 of this iteration (1st part)
+	movq [edi+8], mm2		; dst bytes 8..15 of this iteration (2nd part)
+
+	; advance to the next 32 source bytes / 16 dst bytes
+	lea esi, [esi+32]
+	lea edi, [edi+16]
+
+	dec eax				; bottom-tested loop: the body always runs at least once per row
+	jg near .xloops
+
+	; next line
+	lea esi, [esi+2*ecx]	; skip the two source rows just consumed
+	lea esi, [esi+2*ebx]	; rewind to column 0 [- 2 * iDstWidth]; exact only if iDstWidth is a multiple of 16 - NOTE(review): confirm callers guarantee this
+	lea edi, [edi+edx]		; advance one dst row
+	lea edi, [edi+ebx]		; rewind to column 0 [- iDstWidth]
+
+	dec ebp				; one dst row done
+	jg near .yloops
+
+	WELSEMMS
+	pop ebp				; restore registers in reverse push order (WELSEMMS above is a macro defined outside this chunk - presumably emms, verify)
+	pop	edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
+;					  unsigned char* pSrc, const int iSrcStride,
+;					  const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx16_sse:	; 2:1 downsample in x and y: each dst byte is the pavgb-rounded average of a 2x2 source block
+	push ebx			; save every register this routine overwrites
+	push edx
+	push esi
+	push edi
+	push ebp
+
+	mov edi, [esp+24]	; pDst (first argument: 5 pushes + return address = 24 bytes above esp)
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+
+	sar ebp, $1		; iSrcHeight >> 1 = number of dst rows
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth (re-read per row: eax is consumed as the x counter)
+	sar eax, $1		; iSrcWidth >> 1 = iDstWidth
+	mov ebx, eax		; keep iDstWidth in ebx
+	sar eax, $3		; iDstWidth / 8 = inner-loop count (8 dst bytes per iteration)
+	neg ebx			; ebx = -iDstWidth, used after the x loop to rewind both pointers
+	; each loop = source bandwidth: 16 bytes from each of two rows
+.xloops:
+	; horizontal loop: 16 source bytes of each row per iteration
+	;               mem  hi<-       ->lo
+	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
+	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
+	;=> target:
+	;: H G F E D C B A, P O N M L K J I
+	;: h g f e d c b a, p o n m l k j i
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movq mm0, [esi]			; 1st pSrc line
+	movq mm1, [esi+8]		; 1st pSrc line + 8
+	movq mm2, [esi+ecx]		; 2nd pSrc line
+	movq mm3, [esi+ecx+8]	; 2nd pSrc line + 8
+
+	; de-interleave mm0, mm1, mm2, mm3: even-position bytes (upper case) to the low dword, odd-position bytes (lower case) to the high dword
+	pshufw mm4, mm0, 0d8h	; d D b B c C a A ; 11011000 B
+	pshufw mm5, mm4, 04eh	; c C a A d D b B ; 01001110 B
+	punpcklbw mm4, mm5		; d c D C b a B A
+	pshufw mm4, mm4, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
+
+	pshufw mm5, mm1, 0d8h	; h H f F g G e E ; 11011000 B
+	pshufw mm6, mm5, 04eh	; g G e E h H f F ; 01001110 B
+	punpcklbw mm5, mm6		; h g H G f e F E
+	pshufw mm5, mm5, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
+
+	pshufw mm6, mm2, 0d8h	; l L j J k K i I ; 11011000 B
+	pshufw mm7, mm6, 04eh	; k K i I l L j J ; 01001110 B
+	punpcklbw mm6, mm7		; l k L K j i J I
+	pshufw mm6, mm6, 0d8h  	; l k j i L K J I ; 11011000 B: mm6
+
+	pshufw mm7, mm3, 0d8h	; p P n N o O m M ; 11011000 B
+	pshufw mm0, mm7, 04eh	; o O m M p P n N ; 01001110 B
+	punpcklbw mm7, mm0 		; p o P O n m N M
+	pshufw mm7, mm7, 0d8h  	; p o n m P O N M ; 11011000 B: mm7
+
+	; gather the even bytes and the odd bytes of each row from mm4, mm5, mm6, mm7
+	movq mm0, mm4		; copy first: mm4 is still needed for its high dword
+	punpckldq mm0, mm5 	; H G F E D C B A
+	punpckhdq mm4, mm5 	; h g f e d c b a
+
+	movq mm1, mm6
+	punpckldq mm1, mm7 	; P O N M L K J I
+	punpckhdq mm6, mm7 	; p o n m l k j i
+
+	; avg within MB horizon width (16 x 2 lines)
+	pavgb mm0, mm4		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+	pavgb mm1, mm6		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1 = the 8 dst bytes of this iteration
+
+	movq [edi  ], mm0		; store 8 dst bytes
+
+	; advance to the next 16 source bytes / 8 dst bytes
+	lea esi, [esi+16]
+	lea edi, [edi+8]
+
+	dec eax				; bottom-tested loop: the body always runs at least once per row
+	jg near .xloops
+
+	; next line
+	lea esi, [esi+2*ecx]	; skip the two source rows just consumed
+	lea esi, [esi+2*ebx]	; rewind to column 0 [- 2 * iDstWidth]; exact only if iDstWidth is a multiple of 8 - NOTE(review): confirm callers guarantee this
+	lea edi, [edi+edx]		; advance one dst row
+	lea edi, [edi+ebx]		; rewind to column 0 [- iDstWidth]
+
+	dec ebp				; one dst row done
+	jg near .yloops
+
+	WELSEMMS
+	pop ebp				; restore registers in reverse push order (WELSEMMS above is a macro defined outside this chunk - presumably emms, verify)
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
+;					  unsigned char* pSrc, const int iSrcStride,
+;					  const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx8_sse:	; 2:1 downsample in x and y: each dst byte is the pavgb-rounded average of a 2x2 source block
+	push ebx			; save every register this routine overwrites
+	push edx
+	push esi
+	push edi
+	push ebp
+
+	mov edi, [esp+24]	; pDst (first argument: 5 pushes + return address = 24 bytes above esp)
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+
+	sar ebp, $1		; iSrcHeight >> 1 = number of dst rows
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth (re-read per row: eax is consumed as the x counter)
+	sar eax, $1		; iSrcWidth >> 1 = iDstWidth
+	mov ebx, eax		; keep iDstWidth in ebx
+	sar eax, $2		; iDstWidth / 4 = inner-loop count (4 dst bytes per iteration)
+	neg ebx			; ebx = -iDstWidth, used after the x loop to rewind both pointers
+	; each loop = source bandwidth: 8 bytes from each of two rows
+.xloops:
+	; horizontal loop: 8 source bytes of each row per iteration
+	;               mem  hi<-       ->lo
+	;1st Line Src:	mm0: d D c C b B a A
+	;2nd Line Src:	mm1: h H g G f F e E
+	;=> target:
+	;: H G F E D C B A
+	;: h g f e d c b a
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movq mm0, [esi]			; 1st pSrc line
+	movq mm1, [esi+ecx]		; 2nd pSrc line
+
+	; de-interleave mm0, mm1: even-position bytes (upper case) to the low dword, odd-position bytes (lower case) to the high dword
+	pshufw mm2, mm0, 0d8h	; d D b B c C a A ; 11011000 B
+	pshufw mm3, mm2, 04eh	; c C a A d D b B ; 01001110 B
+	punpcklbw mm2, mm3		; d c D C b a B A
+	pshufw mm2, mm2, 0d8h  	; d c b a D C B A ; 11011000 B: mm2
+
+	pshufw mm4, mm1, 0d8h	; h H f F g G e E ; 11011000 B
+	pshufw mm5, mm4, 04eh	; g G e E h H f F ; 01001110 B
+	punpcklbw mm4, mm5		; h g H G f e F E
+	pshufw mm4, mm4, 0d8h  	; h g f e H G F E ; 11011000 B: mm4
+
+	; gather the even bytes and the odd bytes of both rows from mm2, mm4
+	movq mm0, mm2		; copy first: mm2 is still needed for its high dword
+	punpckldq mm0, mm4 	; H G F E D C B A
+	punpckhdq mm2, mm4 	; h g f e d c b a
+
+	; avg within the 8 x 2 source block
+	pavgb mm0, mm2		; (H+h+1)>>1, .., (A+a+1)>>1: low dword = temp_row1, high dword = temp_row2
+	pshufw mm1, mm0, 04eh	; 01001110 B: swap dwords so temp_row2 lines up with temp_row1
+	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1 in both dwords; only the low dword is stored
+
+	movd [edi],	mm0		; store 4 dst bytes
+
+	; next unit
+	lea esi, [esi+8]
+	lea edi, [edi+4]
+
+	dec eax				; bottom-tested loop: the body always runs at least once per row
+	jg near .xloops
+
+	; next line
+	lea esi, [esi+2*ecx]	; skip the two source rows just consumed
+	lea esi, [esi+2*ebx]	; rewind to column 0 [- 2 * iDstWidth]; exact only if iDstWidth is a multiple of 4 - NOTE(review): confirm callers guarantee this
+	lea edi, [edi+edx]		; advance one dst row
+	lea edi, [edi+ebx]		; rewind to column 0 [- iDstWidth]
+
+	dec ebp				; one dst row done
+	jg near .yloops
+
+	WELSEMMS
+	pop ebp				; restore registers in reverse push order (WELSEMMS above is a macro defined outside this chunk - presumably emms, verify)
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+
+
+; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx32_ssse3(	unsigned char* pDst, const int iDstStride,
+;					unsigned char* pSrc, const int iSrcStride,
+;					const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx32_ssse3:	; 2:1 bilinear downsample; each inner iteration turns 32 source bytes from each of two rows into 16 destination bytes
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp		; 5 pushes = 20 bytes, so with the return address the 1st argument is at [esp+24]
+
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+
+	sar ebp, $1			; iSrcHeight >> 1 = outer loop count ($1 is a NASM $-prefixed hex literal, i.e. 1)
+
+	movdqa xmm7, [shufb_mask_low]	; mask low (table defined outside this chunk)
+	movdqa xmm6, [shufb_mask_high]	; mask high (table defined outside this chunk)
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $1			; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth kept in ebx
+	sar eax, $4			; (iSrcWidth >> 1) / 16 = inner loop count (16 destination bytes per iteration)
+	neg ebx				; - (iSrcWidth >> 1), used below to rewind the row pointers
+	; each loop = source bandwidth: 32 bytes
+.xloops:
+	; horizontal loop: 2 x 16 source bytes from each of the two rows per iteration
+	;               mem  hi<-       ->lo
+	;1st Line Src:	xmm0: h H g G f F e E d D c C b B a A
+	;				xmm1: p P o O n N m M l L k K j J i I
+	;2nd Line Src:	xmm2: h H g G f F e E d D c C b B a A
+	;				xmm3: p P o O n N m M l L k K j J i I
+	;=> target:
+	;: P O N M L K J I H G F E D C B A
+	;: p o n m l k j i h g f e d c b a
+	;: P ..                          A
+	;: p ..                          a
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movdqa xmm0, [esi]			; 1st_src_line (movdqa: the address must be 16-byte aligned)
+	movdqa xmm1, [esi+16]		; 1st_src_line + 16
+	movdqa xmm2, [esi+ecx]		; 2nd_src_line
+	movdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16
+
+	; packing & avg
+	movdqa xmm4, xmm0			; h H g G f F e E d D c C b B a A
+	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+	pshufb xmm4, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	; another implementation for xmm4 high bits
+;	psubb xmm4, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;	psrlw xmm4, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	pavgb xmm0, xmm4			; row 1, bytes 0-15: rounded average of each neighbouring pair, one result per word
+
+	movdqa xmm5, xmm1
+	pshufb xmm1, xmm7
+	pshufb xmm5, xmm6
+;	psubb xmm5, xmm1
+;	psrlw xmm5, 8
+	pavgb xmm1, xmm5			; row 1, bytes 16-31: pair averages
+
+	movdqa xmm4, xmm2
+	pshufb xmm2, xmm7
+	pshufb xmm4, xmm6
+;	psubb xmm4, xmm2
+;	psrlw xmm4, 8
+	pavgb xmm2, xmm4			; row 2, bytes 0-15: pair averages
+
+	movdqa xmm5, xmm3
+	pshufb xmm3, xmm7
+	pshufb xmm5, xmm6
+;	psubb xmm5, xmm3
+;	psrlw xmm5, 8
+	pavgb xmm3, xmm5			; row 2, bytes 16-31: pair averages
+
+	packuswb xmm0, xmm1			; row 1: pack the 16 word results into 16 bytes
+	packuswb xmm2, xmm3			; row 2: likewise
+	pavgb xmm0, xmm2			; vertical rounded average of the two rows
+
+	; write pDst
+	movdqa [edi], xmm0			; aligned 16-byte store: edi must be 16-byte aligned
+
+	; next SMB
+	lea esi, [esi+32]
+	lea edi, [edi+16]
+
+	dec eax
+	jg near .xloops				; repeat while the count is still > 0 (the body always runs at least once)
+
+	; next line
+	lea esi, [esi+2*ecx]	; advance two source rows
+	lea esi, [esi+2*ebx]	; subtract 2 * iDstWidth; this returns to column 0 when (iSrcWidth >> 1) is a non-zero multiple of 16
+	lea edi, [edi+edx]		; advance one destination row
+	lea edi, [edi+ebx]		; subtract iDstWidth (same condition as above)
+
+	dec ebp
+	jg near .yloops
+
+	pop ebp
+	pop	edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
+;					  unsigned char* pSrc, const int iSrcStride,
+;					  const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx16_ssse3:	; 2:1 bilinear downsample; each inner iteration turns 16 source bytes from each of two rows into 8 destination bytes
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp		; 5 pushes = 20 bytes, so with the return address the 1st argument is at [esp+24]
+
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+
+	sar ebp, $1		; iSrcHeight >> 1 = outer loop count ($1 is a NASM $-prefixed hex literal, i.e. 1)
+	movdqa xmm7, [shufb_mask_low]	; mask low (table defined outside this chunk)
+	movdqa xmm6, [shufb_mask_high]	; mask high (table defined outside this chunk)
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $1		; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth kept in ebx
+	sar eax, $3		; (iSrcWidth >> 1) / 8 = inner loop count (8 destination bytes per iteration)
+	neg ebx			; - (iSrcWidth >> 1), used below to rewind the row pointers
+	; each loop = source bandwidth: 16 bytes
+.xloops:
+	; horizontal loop: x16 bytes by source
+	;               mem  hi<-       ->lo
+	;1st line pSrc:	xmm0: h H g G f F e E d D c C b B a A
+	;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
+	;=> target:
+	;: H G F E D C B A, P O N M L K J I
+	;: h g f e d c b a, p o n m l k j i
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movdqa xmm0, [esi]			; 1st_src_line (movdqa: the address must be 16-byte aligned)
+	movdqa xmm1, [esi+ecx]		; 2nd_src_line
+
+	; packing & avg
+	movdqa xmm2, xmm0			; h H g G f F e E d D c C b B a A
+	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+	pshufb xmm2, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	; another implementation for xmm2 high bits
+;	psubb xmm2, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;	psrlw xmm2, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	pavgb xmm0, xmm2			; row 1: rounded average of each neighbouring pair, one result per word
+
+	movdqa xmm3, xmm1
+	pshufb xmm1, xmm7
+	pshufb xmm3, xmm6
+;	psubb xmm3, xmm1
+;	psrlw xmm3, 8
+	pavgb xmm1, xmm3			; row 2: pair averages
+
+	pavgb xmm0, xmm1			; vertical rounded average of the two rows (still one result per word)
+	packuswb xmm0, xmm1			; low qword = the 8 results packed to bytes; the high qword (from xmm1) is not stored
+
+	; write pDst
+	movq [edi], xmm0			; store 8 destination bytes
+
+	; next SMB
+	lea esi, [esi+16]
+	lea edi, [edi+8]
+
+	dec eax
+	jg near .xloops				; repeat while the count is still > 0 (the body always runs at least once)
+
+	; next line
+	lea esi, [esi+2*ecx]	; advance two source rows
+	lea esi, [esi+2*ebx]	; subtract 2 * iDstWidth; this returns to column 0 when (iSrcWidth >> 1) is a non-zero multiple of 8
+	lea edi, [edi+edx]		; advance one destination row
+	lea edi, [edi+ebx]		; subtract iDstWidth (same condition as above)
+
+	dec ebp
+	jg near .yloops
+
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx32_sse4(	unsigned char* pDst, const int iDstStride,
+;					unsigned char* pSrc, const int iSrcStride,
+;					const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx32_sse4:	; 2:1 bilinear downsample; same arithmetic as the _ssse3 variant but the source is read with movntdqa
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp		; 5 pushes = 20 bytes, so with the return address the 1st argument is at [esp+24]
+
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+
+	sar ebp, $1			; iSrcHeight >> 1 = outer loop count ($1 is a NASM $-prefixed hex literal, i.e. 1)
+
+	movdqa xmm7, [shufb_mask_low]	; mask low (table defined outside this chunk)
+	movdqa xmm6, [shufb_mask_high]	; mask high (table defined outside this chunk)
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $1			; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth kept in ebx
+	sar eax, $4			; (iSrcWidth >> 1) / 16 = inner loop count (16 destination bytes per iteration)
+	neg ebx				; - (iSrcWidth >> 1), used below to rewind the row pointers
+	; each loop = source bandwidth: 32 bytes
+.xloops:
+	; horizontal loop: 2 x 16 source bytes from each of the two rows per iteration
+	;               mem  hi<-       ->lo
+	;1st Line Src:	xmm0: h H g G f F e E d D c C b B a A
+	;				xmm1: p P o O n N m M l L k K j J i I
+	;2nd Line Src:	xmm2: h H g G f F e E d D c C b B a A
+	;				xmm3: p P o O n N m M l L k K j J i I
+	;=> target:
+	;: P O N M L K J I H G F E D C B A
+	;: p o n m l k j i h g f e d c b a
+	;: P ..                          A
+	;: p ..                          a
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movntdqa xmm0, [esi]			; 1st_src_line (SSE4.1 non-temporal load hint; the address must be 16-byte aligned)
+	movntdqa xmm1, [esi+16]		; 1st_src_line + 16
+	movntdqa xmm2, [esi+ecx]		; 2nd_src_line
+	movntdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16
+
+	; packing & avg
+	movdqa xmm4, xmm0			; h H g G f F e E d D c C b B a A
+	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+	pshufb xmm4, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+;	psubb xmm4, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;	psrlw xmm4, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	pavgb xmm0, xmm4			; row 1, bytes 0-15: rounded average of each neighbouring pair, one result per word
+
+	movdqa xmm5, xmm1
+	pshufb xmm1, xmm7
+	pshufb xmm5, xmm6
+;	psubb xmm5, xmm1
+;	psrlw xmm5, 8
+	pavgb xmm1, xmm5			; row 1, bytes 16-31: pair averages
+
+	movdqa xmm4, xmm2
+	pshufb xmm2, xmm7
+	pshufb xmm4, xmm6
+;	psubb xmm4, xmm2
+;	psrlw xmm4, 8
+	pavgb xmm2, xmm4			; row 2, bytes 0-15: pair averages
+
+	movdqa xmm5, xmm3
+	pshufb xmm3, xmm7
+	pshufb xmm5, xmm6
+;	psubb xmm5, xmm3
+;	psrlw xmm5, 8
+	pavgb xmm3, xmm5			; row 2, bytes 16-31: pair averages
+
+	packuswb xmm0, xmm1			; row 1: pack the 16 word results into 16 bytes
+	packuswb xmm2, xmm3			; row 2: likewise
+	pavgb xmm0, xmm2			; vertical rounded average of the two rows
+
+	; write pDst
+	movdqa [edi], xmm0			; aligned 16-byte store: edi must be 16-byte aligned
+
+	; next SMB
+	lea esi, [esi+32]
+	lea edi, [edi+16]
+
+	dec eax
+	jg near .xloops				; repeat while the count is still > 0 (the body always runs at least once)
+
+	; next line
+	lea esi, [esi+2*ecx]	; advance two source rows
+	lea esi, [esi+2*ebx]	; subtract 2 * iDstWidth; this returns to column 0 when (iSrcWidth >> 1) is a non-zero multiple of 16
+	lea edi, [edi+edx]		; advance one destination row
+	lea edi, [edi+ebx]		; subtract iDstWidth (same condition as above)
+
+	dec ebp
+	jg near .yloops
+
+	pop ebp
+	pop	edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
+;					  unsigned char* pSrc, const int iSrcStride,
+;					  const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx16_sse4:	; 2:1 bilinear downsample; same arithmetic as the x16 _ssse3 variant but the source is read with movntdqa
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp		; 5 pushes = 20 bytes, so with the return address the 1st argument is at [esp+24]
+
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+
+	sar ebp, $1		; iSrcHeight >> 1 = outer loop count ($1 is a NASM $-prefixed hex literal, i.e. 1)
+	movdqa xmm7, [shufb_mask_low]	; mask low (table defined outside this chunk)
+	movdqa xmm6, [shufb_mask_high]	; mask high (table defined outside this chunk)
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $1		; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth kept in ebx
+	sar eax, $3		; (iSrcWidth >> 1) / 8 = inner loop count (8 destination bytes per iteration)
+	neg ebx			; - (iSrcWidth >> 1), used below to rewind the row pointers
+	; each loop = source bandwidth: 16 bytes
+.xloops:
+	; horizontal loop: x16 bytes by source
+	;               mem  hi<-       ->lo
+	;1st line pSrc:	xmm0: h H g G f F e E d D c C b B a A
+	;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
+	;=> target:
+	;: H G F E D C B A, P O N M L K J I
+	;: h g f e d c b a, p o n m l k j i
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movntdqa xmm0, [esi]			; 1st_src_line (SSE4.1 non-temporal load hint; the address must be 16-byte aligned)
+	movntdqa xmm1, [esi+ecx]		; 2nd_src_line
+
+	; packing & avg
+	movdqa xmm2, xmm0			; h H g G f F e E d D c C b B a A
+	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+	pshufb xmm2, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+;	psubb xmm2, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;	psrlw xmm2, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	pavgb xmm0, xmm2			; row 1: rounded average of each neighbouring pair, one result per word
+
+	movdqa xmm3, xmm1
+	pshufb xmm1, xmm7
+	pshufb xmm3, xmm6
+;	psubb xmm3, xmm1
+;	psrlw xmm3, 8
+	pavgb xmm1, xmm3			; row 2: pair averages
+
+	pavgb xmm0, xmm1			; vertical rounded average of the two rows (still one result per word)
+	packuswb xmm0, xmm1			; low qword = the 8 results packed to bytes; the high qword (from xmm1) is not stored
+
+	; write pDst
+	movq [edi], xmm0			; store 8 destination bytes
+
+	; next SMB
+	lea esi, [esi+16]
+	lea edi, [edi+8]
+
+	dec eax
+	jg near .xloops				; repeat while the count is still > 0 (the body always runs at least once)
+
+	; next line
+	lea esi, [esi+2*ecx]	; advance two source rows
+	lea esi, [esi+2*ebx]	; subtract 2 * iDstWidth; this returns to column 0 when (iSrcWidth >> 1) is a non-zero multiple of 8
+	lea edi, [edi+edx]		; advance one destination row
+	lea edi, [edi+ebx]		; subtract iDstWidth (same condition as above)
+
+	dec ebp
+	jg near .yloops
+
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+
+
+
+
+WELS_EXTERN	GeneralBilinearAccurateDownsampler_sse2
+;**************************************************************************************************************
+;int GeneralBilinearAccurateDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
+;							unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+;                           unsigned int uiScaleX, unsigned int uiScaleY );
+; Bilinear resampler: the source position advances by uiScaleX / uiScaleY (Q15 fixed point) per destination pixel / row.
+;**************************************************************************************************************
+
+ALIGN 16
+GeneralBilinearAccurateDownsampler_sse2:
+	push	ebp
+	push	esi
+	push	edi
+	push	ebx
+%define		pushsize	16
+%define		localsize	28
+%define		pDstData		esp + pushsize + localsize + 4
+%define		dwDstStride		esp + pushsize + localsize + 8
+%define		dwDstWidth		esp + pushsize + localsize + 12
+%define		dwDstHeight		esp + pushsize + localsize + 16
+%define		pSrcData		esp + pushsize + localsize + 20
+%define		dwSrcStride		esp + pushsize + localsize + 24
+%define		dwSrcWidth		esp + pushsize + localsize + 28
+%define		dwSrcHeight		esp + pushsize + localsize + 32
+%define		scale			esp + 0
+%define		uiScaleX			esp + pushsize + localsize + 36
+%define		uiScaleY			esp + pushsize + localsize + 40
+%define		tmpHeight		esp + 12
+%define		yInverse		esp + 16
+%define		xInverse		esp + 20
+%define		dstStep			esp + 24
+	sub		esp,			localsize
+	; locals: tmpHeight, yInverse, xInverse, dstStep; 'scale', dwSrcWidth and dwSrcHeight are defined but not referenced by the code below
+	pxor	xmm0,	xmm0
+	mov		edx,	32767					; NOTE(review): dead store - edx is reloaded (mov edx, 40003fffh) before it is ever read
+	mov		eax,	[uiScaleX]
+	and		eax,	32767					; keep the low 15 bits (fractional part of the Q15 step)
+	mov		ebx,	eax
+	neg		ebx
+	and		ebx,	32767					; (-uinc) mod 32768
+	movd	xmm1,		eax						; uinc = uiScaleX mod 32768 (low 15 bits)
+	movd	xmm2,		ebx						; -uinc (mod 32768)
+	psllq	xmm1,		32						; move uinc up to dword 1
+	por		xmm1,		xmm2					; 0 0  uinc  -uinc   (dword)
+	pshufd	xmm7,		xmm1,	01000100b		; xmm7: uinc -uinc uinc -uinc (dwords)
+
+	mov		eax,	[uiScaleY]
+	and		eax,	32767					; keep the low 15 bits (fractional part of the Q15 step)
+	mov		ebx,	eax
+	neg		ebx
+	and		ebx,	32767					; (-vinc) mod 32768
+	movd	xmm6,		eax						; vinc = uiScaleY mod 32768 (low 15 bits)
+	movd	xmm2,		ebx						; -vinc (mod 32768)
+	psllq	xmm6,		32						; move vinc up to dword 1
+	por		xmm6,		xmm2					; 0 0 vinc -vinc (dword)
+	pshufd	xmm6,		xmm6,	01010000b		; xmm6: vinc vinc -vinc -vinc (dwords)
+
+	mov		edx,		40003fffh				; words: 4000h (16384) above 3fffh (16383)
+	movd	xmm5,		edx
+	punpcklwd	xmm5,	xmm0					; 16384 16383 (zero-extended to dwords)
+	pshufd	xmm5,		xmm5,	01000100b		; xmm5: 16384 16383 16384 16383
+
+
+DOWNSAMPLE:
+
+	mov		eax,			[dwDstHeight]
+	mov		edi,			[pDstData]
+	mov		edx,			[dwDstStride]
+	mov		ecx,			[dwDstWidth]
+	sub		edx,			ecx
+	mov		[dstStep],	edx				; stride - width
+	dec		eax
+	mov		[tmpHeight],	eax				; HEIGHT loop counter = dwDstHeight - 1; the final row is written by LAST_ROW
+	mov		eax,			16384			; 0.5 in Q15: initial vertical source position
+	mov		[yInverse],		eax
+
+	pshufd	xmm4,		xmm5,	01010000b	; initial v to 16384 16384 16383 16383
+
+HEIGHT:
+	mov		eax,	[yInverse]
+	mov		esi,	[pSrcData]
+	shr		eax,	15						; integer source row = yInverse >> 15
+	mul		dword [dwSrcStride]				; edx:eax = row * stride (edx is clobbered)
+	add		esi,	eax					; get current row address
+	mov		ebp,	esi
+	add		ebp,	[dwSrcStride]			; ebp = the source row below
+
+	mov		eax,		16384				; 0.5 in Q15: initial horizontal source position
+	mov		[xInverse],		eax
+	mov		ecx,			[dwDstWidth]
+	dec		ecx							; interpolate dwDstWidth - 1 pixels; the last column is copied in WIDTH_END
+
+	movdqa	xmm3,		xmm5			; initial u to 16384 16383 16384 16383
+
+WIDTH:
+	mov		eax,		[xInverse]
+	shr		eax,		15				; integer source column = xInverse >> 15
+
+	movd	xmm1,		[esi+eax]		; xxxxxxba (4 bytes are read, only a and b are used)
+	movd	xmm2,		[ebp+eax]		; xxxxxxdc (4 bytes are read, only c and d are used)
+	pxor	xmm0,		xmm0			; re-zero xmm0: it is reused as scratch further down in this loop
+	punpcklwd	xmm1,	xmm2			; xxxxdcba
+	punpcklbw	xmm1,	xmm0			; 0d0c0b0a
+	punpcklwd	xmm1,	xmm0			; 000d000c000b000a
+
+	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
+	pmaddwd	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2: four dword weights, one per pixel d c b a
+	movdqa	xmm0,	xmm2	; copy of the weights for dwords 1 and 3
+	pmuludq	xmm2,	xmm1	; dwords 0 and 2: weight * pixel (a and c) as two qwords
+	psrlq	xmm0,	32		; bring weight dwords 1 and 3 down
+	psrlq	xmm1,	32		; bring pixels b and d down
+	pmuludq	xmm0,	xmm1	; weight * pixel for b and d
+	paddq	xmm2,	xmm0	; two partial sums (qwords)
+	pshufd	xmm1,	xmm2,	00001110b	; high qword -> low qword
+	paddq	xmm2,	xmm1	; low qword = sum of the four weighted pixels
+	psrlq	xmm2,	29		; drop 29 bits, keeping one extra bit for the rounding below
+
+	movd	eax,	xmm2
+	inc		eax				; +1 then >> 1: rounded shift by 30 in total
+	shr		eax,	1
+	mov		[edi],	al
+	inc		edi
+
+	mov		eax,		[uiScaleX]
+	add		[xInverse],	eax			; advance the horizontal source position
+
+	paddw	xmm3,		xmm7			; inc u
+	psllw	xmm3,		1				; shift left then right by 1: clears bit 15 of each word (u mod 32768)
+	psrlw	xmm3,		1
+
+	loop	WIDTH						; NOTE(review): if dwDstWidth == 1, ecx is 0 on entry and `loop` wraps to 0xFFFFFFFF - confirm callers never pass width 1
+
+WIDTH_END:
+	mov		eax,		[xInverse]
+	shr		eax,		15
+	mov		cl,			[esi+eax]	; last column: copy the source pixel of the upper row, no interpolation
+	mov		[edi],		cl
+	inc		edi
+
+	mov		eax,		[uiScaleY]
+	add		[yInverse],	eax			; advance the vertical source position
+	add		edi,		[dstStep]	; skip the stride padding to the start of the next destination row
+
+	paddw	xmm4,	xmm6				; inc v
+	psllw	xmm4,	1					; clears bit 15 of each word (v mod 32768)
+	psrlw	xmm4,	1
+
+	dec		dword [tmpHeight]
+	jg		HEIGHT						; NOTE(review): the body runs before this test, so dwDstHeight == 1 still writes one row here plus LAST_ROW
+
+
+LAST_ROW:
+	mov		eax,	[yInverse]
+	mov		esi,	[pSrcData]
+	shr		eax,	15
+	mul		dword [dwSrcStride]
+	add		esi,	eax					; get current row address
+
+	mov		eax,		16384
+	mov		[xInverse],		eax
+	mov		ecx,			[dwDstWidth]	; the last row copies all dwDstWidth pixels without interpolation
+
+LAST_ROW_WIDTH:
+	mov		eax,		[xInverse]
+	shr		eax,		15
+
+	mov		al,			[esi+eax]	; source pixel at the integer column
+	mov		[edi],	al
+	inc		edi
+
+	mov		eax,		[uiScaleX]
+	add		[xInverse],	eax
+
+	loop	LAST_ROW_WIDTH
+
+LAST_ROW_END:
+
+	add		esp,			localsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		pushsize
+%undef		localsize
+%undef		pSrcData
+%undef		dwSrcWidth
+%undef		dwSrcHeight
+%undef		dwSrcStride
+%undef		pDstData
+%undef		dwDstWidth
+%undef		dwDstHeight
+%undef		dwDstStride
+%undef		scale
+%undef		uiScaleX
+%undef		uiScaleY
+%undef		tmpHeight
+%undef		yInverse
+%undef		xInverse
+%undef		dstStep
+	ret									; NOTE(review): eax is never set to a result; it still holds uiScaleX here although the prototype says int
+
+
+
+
+WELS_EXTERN	GeneralBilinearFastDownsampler_sse2
+;**************************************************************************************************************
+;int GeneralBilinearFastDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
+;				unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+;               unsigned int uiScaleX, unsigned int uiScaleY );
+; Bilinear resampler with 16-bit weights: the horizontal position is Q16 (uiScaleX), the vertical position is Q15 (uiScaleY).
+;**************************************************************************************************************
+
+ALIGN 16
+GeneralBilinearFastDownsampler_sse2:
+	push	ebp
+	push	esi
+	push	edi
+	push	ebx
+%define		pushsize	16
+%define		localsize	28
+%define		pDstData		esp + pushsize + localsize + 4
+%define		dwDstStride		esp + pushsize + localsize + 8
+%define		dwDstWidth		esp + pushsize + localsize + 12
+%define		dwDstHeight		esp + pushsize + localsize + 16
+%define		pSrcData		esp + pushsize + localsize + 20
+%define		dwSrcStride		esp + pushsize + localsize + 24
+%define		dwSrcWidth		esp + pushsize + localsize + 28
+%define		dwSrcHeight		esp + pushsize + localsize + 32
+%define		scale			esp + 0
+%define		uiScaleX			esp + pushsize + localsize + 36
+%define		uiScaleY			esp + pushsize + localsize + 40
+%define		tmpHeight		esp + 12
+%define		yInverse		esp + 16
+%define		xInverse		esp + 20
+%define		dstStep			esp + 24
+	sub		esp,			localsize
+	; locals: tmpHeight, yInverse, xInverse, dstStep; 'scale', dwSrcWidth and dwSrcHeight are defined but not referenced by the code below
+	pxor	xmm0,	xmm0					; xmm0 = 0 and is never written again: zero operand for punpcklbw / packuswb below
+	mov		edx,	65535
+	mov		eax,	[uiScaleX]
+	and		eax,	edx						; keep the low 16 bits (fractional part of the Q16 step)
+	mov		ebx,	eax
+	neg		ebx
+	and		ebx,	65535					; (-uinc) mod 65536
+	movd	xmm1,		eax						; uinc(uiScaleX mod 65536)
+	movd	xmm2,		ebx						; -uinc
+	psllq	xmm1,		32						; move uinc up to dword 1
+	por		xmm1,		xmm2					; 0 uinc 0 -uinc
+	pshuflw	xmm7,		xmm1,	10001000b		; xmm7: uinc -uinc uinc -uinc
+
+	mov		eax,	[uiScaleY]
+	and		eax,	32767					; keep the low 15 bits (fractional part of the Q15 step)
+	mov		ebx,	eax
+	neg		ebx
+	and		ebx,	32767					; (-vinc) mod 32768
+	movd	xmm6,		eax						; vinc = uiScaleY mod 32768 (low 15 bits)
+	movd	xmm2,		ebx						; -vinc
+	psllq	xmm6,		32						; move vinc up to dword 1
+	por		xmm6,		xmm2					; 0 vinc 0 -vinc
+	pshuflw	xmm6,		xmm6,	10100000b		; xmm6: vinc vinc -vinc -vinc
+
+	mov		edx,		80007fffh				; 32768 32767
+	movd	xmm5,		edx
+	pshuflw	xmm5,		xmm5,		01000100b	; 32768 32767 32768 32767
+	mov		ebx,		16384					; rounding constant added before the >> 15 in FAST_WIDTH; ebx is not modified afterwards
+
+
+FAST_DOWNSAMPLE:
+
+	mov		eax,			[dwDstHeight]
+	mov		edi,			[pDstData]
+	mov		edx,			[dwDstStride]
+	mov		ecx,			[dwDstWidth]
+	sub		edx,			ecx
+	mov		[dstStep],	edx				; stride - width
+	dec		eax
+	mov		[tmpHeight],	eax				; FAST_HEIGHT loop counter = dwDstHeight - 1; the final row is written by FAST_LAST_ROW
+	mov		eax,		16384				; 0.5 in Q15: initial vertical source position
+	mov		[yInverse],		eax
+
+	pshuflw	xmm4,		xmm5,	01010000b	; 32768 32768 32767 32767
+	psrlw	xmm4,		1				; initial v to 16384 16384 16383 16383
+
+FAST_HEIGHT:
+	mov		eax,	[yInverse]
+	mov		esi,	[pSrcData]
+	shr		eax,	15						; integer source row = yInverse >> 15
+	mul		dword [dwSrcStride]				; edx:eax = row * stride (edx is clobbered)
+	add		esi,	eax					; get current row address
+	mov		ebp,	esi
+	add		ebp,	[dwSrcStride]			; ebp = the source row below
+
+	mov		eax,		32768				; 0.5 in Q16: initial horizontal source position
+	mov		[xInverse],		eax
+	mov		ecx,			[dwDstWidth]
+	dec		ecx							; interpolate dwDstWidth - 1 pixels; the last column is copied in FAST_WIDTH_END
+
+	movdqa	xmm3,		xmm5			; initial u to 32768 32767 32768 32767
+
+FAST_WIDTH:
+	mov		eax,		[xInverse]
+	shr		eax,		16				; integer source column = xInverse >> 16
+
+	movd	xmm1,		[esi+eax]		; xxxxxxba (4 bytes are read, only a and b are used)
+	movd	xmm2,		[ebp+eax]		; xxxxxxdc (4 bytes are read, only c and d are used)
+	punpcklwd	xmm1,	xmm2			; xxxxdcba
+	punpcklbw	xmm1,	xmm0			; 0d0c0b0a
+
+	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
+	pmulhuw	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2: high 16 bits of each unsigned product = per-pixel weight
+	pmaddwd		xmm2,	xmm1	; weight * pixel, adjacent pairs summed into two dwords
+	pshufd	xmm1,	xmm2,	00000001b	; bring dword 1 down to dword 0
+	paddd	xmm2,	xmm1	; dword 0 = sum of the four weighted pixels
+	movd	xmm1,	ebx		; rounding constant 16384
+	paddd	xmm2,	xmm1
+	psrld	xmm2,	15		; rounded shift back to pixel range
+
+	packuswb	xmm2,	xmm0	; narrow with unsigned saturation; the result byte ends up in byte 0
+	movd	eax,	xmm2
+	mov		[edi],	al
+	inc		edi
+
+	mov		eax,		[uiScaleX]
+	add		[xInverse],	eax			; advance the horizontal source position
+
+	paddw	xmm3,		xmm7			; inc u (16-bit words wrap modulo 65536)
+
+	loop	FAST_WIDTH					; NOTE(review): if dwDstWidth == 1, ecx is 0 on entry and `loop` wraps to 0xFFFFFFFF - confirm callers never pass width 1
+
+FAST_WIDTH_END:
+	mov		eax,		[xInverse]
+	shr		eax,		16
+	mov		cl,			[esi+eax]	; last column: copy the source pixel of the upper row, no interpolation
+	mov		[edi],		cl
+	inc		edi
+
+	mov		eax,		[uiScaleY]
+	add		[yInverse],	eax			; advance the vertical source position
+	add		edi,		[dstStep]	; skip the stride padding to the start of the next destination row
+
+	paddw	xmm4,	xmm6				; inc v
+	psllw	xmm4,	1					; shift left then right by 1: clears bit 15 of each word (v mod 32768)
+	psrlw	xmm4,	1
+
+	dec		dword [tmpHeight]
+	jg		FAST_HEIGHT					; NOTE(review): the body runs before this test, so dwDstHeight == 1 still writes one row here plus FAST_LAST_ROW
+
+
+FAST_LAST_ROW:
+	mov		eax,	[yInverse]
+	mov		esi,	[pSrcData]
+	shr		eax,	15
+	mul		dword [dwSrcStride]
+	add		esi,	eax					; get current row address
+
+	mov		eax,		32768
+	mov		[xInverse],		eax
+	mov		ecx,			[dwDstWidth]	; the last row copies all dwDstWidth pixels without interpolation
+
+FAST_LAST_ROW_WIDTH:
+	mov		eax,		[xInverse]
+	shr		eax,		16
+
+	mov		al,			[esi+eax]	; source pixel at the integer column
+	mov		[edi],	al
+	inc		edi
+
+	mov		eax,		[uiScaleX]
+	add		[xInverse],	eax
+
+	loop	FAST_LAST_ROW_WIDTH
+
+FAST_LAST_ROW_END:
+
+	add		esp,			localsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp						; NOTE(review): eax is never set to a result; it still holds uiScaleX at the ret although the prototype says int
+%undef		pushsize
+%undef		localsize
+%undef		pSrcData
+%undef		dwSrcWidth
+%undef		dwSrcHeight
+%undef		dwSrcStride
+%undef		pDstData
+%undef		dwDstWidth
+%undef		dwDstHeight
+%undef		dwDstStride
+%undef		scale
+%undef		uiScaleX
+%undef		uiScaleY
+%undef		tmpHeight
+%undef		yInverse
+%undef		xInverse
+%undef		dstStep
 	ret
\ No newline at end of file
--- a/processing/src/asm/intra_pred.asm
+++ b/processing/src/asm/intra_pred.asm
@@ -1,145 +1,145 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  intra_pred.asm
-;*
-;*  Abstract
-;*      sse2 function for intra predict operations
-;*
-;*  History
-;*      18/09/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "../../src/asm/asm_inc.asm"
-
-BITS 32
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-%ifdef FORMAT_COFF
-SECTION .rodata data
-%else
-SECTION .rodata align=16
-%endif
-
-
-align 16
-mmx_01bytes:		times 16	db 1
-
-;***********************************************************************
-; macros
-;***********************************************************************
-%macro  COPY_16_TIMES 2
-		movdqa		%2,	[%1-16]
-		psrldq		%2,	15
-		pmuludq		%2,	[mmx_01bytes]
-		pshufd		%2,	%2, 0
-%endmacro
-
-%macro  COPY_16_TIMESS 3
-		movdqa		%2,	[%1+%3-16]
-		psrldq		%2,	15
-		pmuludq		%2,	[mmx_01bytes]
-		pshufd		%2,	%2, 0
-%endmacro
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-;***********************************************************************
-; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-
-%macro SSE2_PRED_H_16X16_TWO_LINE 1
-    lea     eax,	[eax+ecx*2]
-    
-    COPY_16_TIMES eax,	xmm0
-    movdqa  [edx+%1],	xmm0
-    COPY_16_TIMESS eax,	xmm0,	ecx
-    movdqa  [edx+%1+0x10],	xmm0
-%endmacro
-
-WELS_EXTERN WelsI16x16LumaPredH_sse2
-WelsI16x16LumaPredH_sse2:
-    mov     edx, [esp+4]    ; pred
-    mov     eax, [esp+8]	; pRef
-    mov     ecx, [esp+12]   ; stride
-    
-    COPY_16_TIMES eax,	xmm0
-    movdqa  [edx],		xmm0
-    COPY_16_TIMESS eax,	xmm0,	ecx
-    movdqa  [edx+0x10],	xmm0
-    
-	SSE2_PRED_H_16X16_TWO_LINE   0x20 
-	SSE2_PRED_H_16X16_TWO_LINE   0x40
-	SSE2_PRED_H_16X16_TWO_LINE   0x60
-	SSE2_PRED_H_16X16_TWO_LINE   0x80
-	SSE2_PRED_H_16X16_TWO_LINE   0xa0
-	SSE2_PRED_H_16X16_TWO_LINE   0xc0
-	SSE2_PRED_H_16X16_TWO_LINE   0xe0
-   
-    ret
-    
-;***********************************************************************
-; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-WELS_EXTERN WelsI16x16LumaPredV_sse2
-WelsI16x16LumaPredV_sse2:
-    mov     edx, [esp+4]    ; pred
-    mov     eax, [esp+8]	; pRef
-    mov     ecx, [esp+12]   ; stride
-    
-    sub     eax, ecx
-    movdqa  xmm0, [eax]
-    
-    movdqa  [edx], xmm0
-    movdqa  [edx+10h], xmm0
-    movdqa  [edx+20h], xmm0
-    movdqa  [edx+30h], xmm0
-    movdqa  [edx+40h], xmm0
-    movdqa  [edx+50h], xmm0
-    movdqa  [edx+60h], xmm0
-    movdqa  [edx+70h], xmm0
-    movdqa  [edx+80h], xmm0
-    movdqa  [edx+90h], xmm0
-    movdqa  [edx+160], xmm0 
-	movdqa  [edx+176], xmm0
-    movdqa  [edx+192], xmm0
-    movdqa  [edx+208], xmm0
-    movdqa  [edx+224], xmm0
-    movdqa  [edx+240], xmm0
-    
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  intra_pred.asm
+;*
+;*  Abstract
+;*      sse2 function for intra predict operations
+;*
+;*  History
+;*      18/09/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "../../src/asm/asm_inc.asm"
+
+BITS 32
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+%ifdef FORMAT_COFF
+SECTION .rodata data
+%else
+SECTION .rodata align=16
+%endif
+
+
+align 16
+mmx_01bytes:		times 16	db 1
+
+;***********************************************************************
+; macros
+;***********************************************************************
+%macro  COPY_16_TIMES 2
+		movdqa		%2,	[%1-16]			; %1 = address register, %2 = xmm register: load the 16 bytes ending just below %1 (%1 must be 16-byte aligned)
+		psrldq		%2,	15				; keep only the byte at [%1-1], now in byte 0; everything else is zero
+		pmuludq		%2,	[mmx_01bytes]	; byte * 01010101h: the byte is replicated across dword 0
+		pshufd		%2,	%2, 0			; broadcast dword 0: all 16 bytes of %2 equal the byte at [%1-1]
+%endmacro
+
+%macro  COPY_16_TIMESS 3
+		movdqa		%2,	[%1+%3-16]		; %1 = base register, %2 = xmm register, %3 = offset register: load the 16 bytes ending just below %1+%3 (16-byte aligned)
+		psrldq		%2,	15				; keep only the byte at [%1+%3-1], now in byte 0; everything else is zero
+		pmuludq		%2,	[mmx_01bytes]	; byte * 01010101h: the byte is replicated across dword 0
+		pshufd		%2,	%2, 0			; broadcast dword 0: all 16 bytes of %2 equal the byte at [%1+%3-1]
+%endmacro
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+;***********************************************************************
+; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+
+%macro SSE2_PRED_H_16X16_TWO_LINE 1
+    lea     eax,	[eax+ecx*2]		; %1 = byte offset into pred; advance the reference pointer (eax) by two rows of stride ecx
+
+    COPY_16_TIMES eax,	xmm0		; xmm0 = 16 copies of the byte at [eax-1]
+    movdqa  [edx+%1],	xmm0		; store the first of the two 16-byte rows at pred+%1
+    COPY_16_TIMESS eax,	xmm0,	ecx	; xmm0 = 16 copies of the byte at [eax+ecx-1] (one row further down)
+    movdqa  [edx+%1+0x10],	xmm0	; store the second row, 16 bytes after the first
+%endmacro
+
+WELS_EXTERN WelsI16x16LumaPredH_sse2
+WelsI16x16LumaPredH_sse2:			; fills 16 rows of pred (16 bytes each, contiguous); each row is the byte at [pRef + row*stride - 1] repeated 16 times
+    mov     edx, [esp+4]    ; pred
+    mov     eax, [esp+8]	; pRef
+    mov     ecx, [esp+12]   ; stride
+
+    COPY_16_TIMES eax,	xmm0		; row 0: 16 copies of the byte at [pRef-1]
+    movdqa  [edx],		xmm0		; aligned store: pred must be 16-byte aligned
+    COPY_16_TIMESS eax,	xmm0,	ecx	; row 1: 16 copies of the byte at [pRef+stride-1]
+    movdqa  [edx+0x10],	xmm0
+
+	SSE2_PRED_H_16X16_TWO_LINE   0x20	; rows 2-3
+	SSE2_PRED_H_16X16_TWO_LINE   0x40	; rows 4-5
+	SSE2_PRED_H_16X16_TWO_LINE   0x60	; rows 6-7
+	SSE2_PRED_H_16X16_TWO_LINE   0x80	; rows 8-9
+	SSE2_PRED_H_16X16_TWO_LINE   0xa0	; rows 10-11
+	SSE2_PRED_H_16X16_TWO_LINE   0xc0	; rows 12-13
+	SSE2_PRED_H_16X16_TWO_LINE   0xe0	; rows 14-15
+
+    ret
+
+;***********************************************************************
+; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+WELS_EXTERN WelsI16x16LumaPredV_sse2
+WelsI16x16LumaPredV_sse2:			; copies the 16 bytes of the row directly above pRef into each of the 16 rows of pred (16 bytes each, contiguous)
+    mov     edx, [esp+4]    ; pred
+    mov     eax, [esp+8]	; pRef
+    mov     ecx, [esp+12]   ; stride
+
+    sub     eax, ecx			; eax = pRef - stride: the row above
+    movdqa  xmm0, [eax]			; aligned 16-byte load of that row
+
+    movdqa  [edx], xmm0			; row 0 (aligned stores: pred must be 16-byte aligned)
+    movdqa  [edx+10h], xmm0
+    movdqa  [edx+20h], xmm0
+    movdqa  [edx+30h], xmm0
+    movdqa  [edx+40h], xmm0
+    movdqa  [edx+50h], xmm0
+    movdqa  [edx+60h], xmm0
+    movdqa  [edx+70h], xmm0
+    movdqa  [edx+80h], xmm0
+    movdqa  [edx+90h], xmm0		; row 9 (offset 144)
+    movdqa  [edx+160], xmm0		; row 10 - the offsets switch from hex to decimal here (160 = 0a0h)
+	movdqa  [edx+176], xmm0
+    movdqa  [edx+192], xmm0
+    movdqa  [edx+208], xmm0
+    movdqa  [edx+224], xmm0
+    movdqa  [edx+240], xmm0		; row 15
+
     ret
\ No newline at end of file
--- a/processing/src/asm/sad.asm
+++ b/processing/src/asm/sad.asm
@@ -1,79 +1,79 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  pixel_sse2.asm
-;*
-;*  Abstract
-;*      WelsSampleSad8x8_sse21
-;*
-;*  History
-;*      8/5/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-%macro SAD_8x4 0
-	movq   xmm0,   [eax]
-	movq   xmm1,   [eax+ebx]
-	lea    eax,    [eax+2*ebx]
-	movhps xmm0,   [eax]
-	movhps xmm1,   [eax+ebx]
-
-	movq   xmm2,   [ecx]
-	movq   xmm3,   [ecx+edx]
-	lea    ecx,    [ecx+2*edx]
-	movhps xmm2,   [ecx]
-	movhps xmm3,   [ecx+edx]
-	psadbw xmm0,   xmm2
-	psadbw xmm1,   xmm3
-	paddw  xmm6,   xmm0
-	paddw  xmm6,   xmm1
-%endmacro
-
-
-  
-%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
-and    %1,  0x1f|(%3>>1)
-cmp    %1,  (32-%2)|(%3>>1)
-%endmacro
-
-
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  sad.asm
+;*
+;*  Abstract
+;*      WelsSampleSad8x8_sse21
+;*
+;*  History
+;*      8/5/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+BITS 32
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+%macro SAD_8x4 0  ; SAD of two 8-byte-wide, 4-row blocks: A at eax (row step ebx), B at ecx (row step edx); result is added into xmm6
+	movq   xmm0,   [eax]  ; A row 0 -> low qword of xmm0
+	movq   xmm1,   [eax+ebx]  ; A row 1 -> low qword of xmm1
+	lea    eax,    [eax+2*ebx]  ; eax moves down two rows and is left there when the macro ends
+	movhps xmm0,   [eax]  ; A row 2 -> high qword of xmm0
+	movhps xmm1,   [eax+ebx]  ; A row 3 -> high qword of xmm1
+
+	movq   xmm2,   [ecx]  ; B row 0 -> low qword of xmm2
+	movq   xmm3,   [ecx+edx]  ; B row 1 -> low qword of xmm3
+	lea    ecx,    [ecx+2*edx]  ; ecx moves down two rows and is left there when the macro ends
+	movhps xmm2,   [ecx]  ; B row 2 -> high qword of xmm2
+	movhps xmm3,   [ecx+edx]  ; B row 3 -> high qword of xmm3
+	psadbw xmm0,   xmm2  ; one absolute-difference sum per qword: rows 0 and 2
+	psadbw xmm1,   xmm3  ; one absolute-difference sum per qword: rows 1 and 3
+	paddw  xmm6,   xmm0  ; accumulate; xmm6 is not cleared here, so the caller must initialise it
+	paddw  xmm6,   xmm1  ; the low and high qwords of xmm6 each hold a partial sum
+%endmacro
+
+
+
+%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline -- only sets flags; the address operand is overwritten
+and    %1,  0x1f|(%3>>1)  ; keep the low address bits: mask is 0x1f for cacheline 32 and 0x3f for cacheline 64
+cmp    %1,  (32-%2)|(%3>>1)  ; compare the masked offset with the threshold; the caller branches on the resulting flags
+%endmacro
+
+
 %macro SSE2_GetSad8x4 0
 	movq   xmm0,   [eax]
 	movq   xmm1,   [eax+ebx]
@@ -90,12 +90,12 @@
 	psadbw xmm1,   xmm3
 	paddw  xmm6,   xmm0
 	paddw  xmm6,   xmm1
-%endmacro
+%endmacro
 
 
-;***********************************************************************
-; Code
-;***********************************************************************
+;***********************************************************************
+; Code
+;***********************************************************************
 SECTION .text
 
 WELS_EXTERN WelsSampleSad8x8_sse21
@@ -108,15 +108,15 @@
 	push   edi
 	mov    eax,    [esp+12]
 	mov    ebx,    [esp+16]
-    
+
     pxor   xmm7,   xmm7
-    
+
     mov    edi,    ecx
     and    edi,    0x07
-    sub    ecx,    edi   
+    sub    ecx,    edi
     mov    edx,    8
     sub    edx,    edi
-    
+
     shl    edi,    3
     shl    edx,    3
     movd   xmm5,   edi
@@ -124,10 +124,10 @@
 	mov    edi,    8
 	add    edi,    ecx
     mov    edx,    [esp+24]
-    
+
     movq   xmm0,   [eax]
 	movhps xmm0,   [eax+ebx]
-		
+
 	movq   xmm1,   [ecx]
 	movq   xmm2,   [edi]
 	movhps xmm1,   [ecx+edx]
@@ -135,17 +135,17 @@
 	psrlq  xmm1,   xmm5
 	psllq  xmm2,   xmm6
 	por    xmm1,   xmm2
-	
+
 	psadbw xmm0,   xmm1
 	paddw  xmm7,   xmm0
-	
+
 	lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
 	lea    edi,    [edi+2*edx]
-	 
+
     movq   xmm0,   [eax]
 	movhps xmm0,   [eax+ebx]
-		
+
 	movq   xmm1,   [ecx]
 	movq   xmm2,   [edi]
 	movhps xmm1,   [ecx+edx]
@@ -153,7 +153,7 @@
 	psrlq  xmm1,   xmm5
 	psllq  xmm2,   xmm6
 	por    xmm1,   xmm2
-	
+
 	psadbw xmm0,   xmm1
 	paddw  xmm7,   xmm0
 
@@ -160,10 +160,10 @@
 	lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
 	lea    edi,    [edi+2*edx]
-	 
+
     movq   xmm0,   [eax]
 	movhps xmm0,   [eax+ebx]
-		
+
 	movq   xmm1,   [ecx]
 	movq   xmm2,   [edi]
 	movhps xmm1,   [ecx+edx]
@@ -171,17 +171,17 @@
 	psrlq  xmm1,   xmm5
 	psllq  xmm2,   xmm6
 	por    xmm1,   xmm2
-	
+
 	psadbw xmm0,   xmm1
 	paddw  xmm7,   xmm0
-	
+
 	lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
 	lea    edi,    [edi+2*edx]
-	 
+
     movq   xmm0,   [eax]
 	movhps xmm0,   [eax+ebx]
-		
+
 	movq   xmm1,   [ecx]
 	movq   xmm2,   [edi]
 	movhps xmm1,   [ecx+edx]
@@ -189,10 +189,10 @@
 	psrlq  xmm1,   xmm5
 	psllq  xmm2,   xmm6
 	por    xmm1,   xmm2
-	
+
 	psadbw xmm0,   xmm1
 	paddw  xmm7,   xmm0
-	
+
     movhlps    xmm0, xmm7
 	paddw      xmm0, xmm7
 	movd       eax,  xmm0
@@ -202,12 +202,12 @@
     push   ebx
     mov    eax,    [esp+8]
 	mov    ebx,    [esp+12]
-	mov    edx,    [esp+20]    
+	mov    edx,    [esp+20]
 	pxor   xmm6,   xmm6
 	SSE2_GetSad8x4
     lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
-    SSE2_GetSad8x4    
+    SSE2_GetSad8x4
     movhlps    xmm0, xmm6
 	paddw      xmm0, xmm6
 	movd       eax,  xmm0
--- a/processing/src/asm/vaa.asm
+++ b/processing/src/asm/vaa.asm
@@ -1,1589 +1,1589 @@
-;*!
-;* \copy
-;*     Copyright (c)  2010-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*	vaa.asm
-;*
-;*	Abstract
-;*      sse2 for pVaa routines
-;*
-;*  History
-;*      04/14/2010	Created
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-;%macro SUM_SSE2	4	; dst, pSrc, zero, pack1_8x2
-;	movdqa %1, %2
-;	punpcklbw %1, %3
-;	punpckhbw %2, %3
-;	paddw %1, %2
-;	pmaddwd %1, %4
-;	pshufd %2, %1, 04Eh	; 01001110 B
-;	paddd %1, %2
-;	pshufd %2, %1, 0B1h	; 10110001 B
-;	paddd %1, %2
-;%endmacro	; END OF SUM_SSE2
-
-; by comparing it outperforms than phaddw(SSSE3) sets
-%macro SUM_WORD_8x2_SSE2	2	; dst(pSrc), tmp
-	; @sum_8x2 begin
-	pshufd %2, %1, 04Eh	; 01001110 B
-	paddw %1, %2
-	pshuflw %2, %1, 04Eh	; 01001110 B
-	paddw %1, %2
-	pshuflw %2, %1, 0B1h	; 10110001 B
-	paddw %1, %2
-	; end of @sum_8x2
-%endmacro	; END of SUM_WORD_8x2_SSE2
-
-%macro SUM_SQR_SSE2	3	; dst, pSrc, zero
-	movdqa %1, %2
-	punpcklbw %1, %3
-	punpckhbw %2, %3
-	pmaddwd %1, %1
-	pmaddwd %2, %2
-	paddd %1, %2
-	pshufd %2, %1, 04Eh	; 01001110 B
-	paddd %1, %2
-	pshufd %2, %1, 0B1h	; 10110001 B
-	paddd %1, %2
-%endmacro	; END OF SUM_SQR_SSE2
-
-%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4
-	movdqa %1, [esi    ]	; line 0
-	movdqa %2, [esi+ecx]	; line 1
-	movdqa %3, %1
-	punpcklbw %1, xmm7
-	punpckhbw %3, xmm7
-	movdqa %4, %2
-	punpcklbw %4, xmm7
-	punpckhbw %2, xmm7
-	paddw %1, %4
-	paddw %2, %3
-	movdqa %3, [esi+ebx]	; line 2
-	movdqa %4, [esi+edx]	; line 3
-	movdqa %5, %3
-	punpcklbw %3, xmm7
-	punpckhbw %5, xmm7
-	movdqa %6, %4
-	punpcklbw %6, xmm7
-	punpckhbw %4, xmm7
-	paddw %3, %6
-	paddw %4, %5
-	paddw %1, %3	; block 0, 1
-	paddw %2, %4	; block 2, 3
-	pshufd %3, %1, 0B1h
-	pshufd %4, %2, 0B1h
-	paddw %1, %3
-	paddw %2, %4
-	movdqa %3, %1
-	movdqa %4, %2
-	pshuflw %5, %1, 0B1h
-	pshufhw %6, %3, 0B1h
-	paddw %1, %5
-	paddw %3, %6
-	pshuflw %5, %2, 0B1h
-	pshufhw %6, %4, 0B1h
-	paddw %2, %5
-	paddw %4, %6
-	punpcklwd %1, %2
-	punpckhwd %3, %4
-	punpcklwd %1, %3
-	psraw %1, $4
-%endmacro
-
-%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4
-	movdqa %1, [esi    ]	; line 0
-	movdqa %2, [esi+ecx]	; line 1
-	movdqa %3, %1
-	punpcklbw %1, xmm7
-	punpckhbw %3, xmm7
-	movdqa %4, %2
-	punpcklbw %4, xmm7
-	punpckhbw %2, xmm7
-	paddw %1, %4
-	paddw %2, %3
-	movdqa %3, [esi+ebx]	; line 2
-	movdqa %4, [esi+edx]	; line 3
-	movdqa %5, %3
-	punpcklbw %3, xmm7
-	punpckhbw %5, xmm7
-	movdqa %6, %4
-	punpcklbw %6, xmm7
-	punpckhbw %4, xmm7
-	paddw %3, %6
-	paddw %4, %5
-	paddw %1, %3	; block 0, 1
-	paddw %2, %4	; block 2, 3
-	phaddw %1, %2	; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
-	phaddw %1, xmm7	; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
-	psraw %1, $4
-%endmacro
-
-%macro WELS_SAD_16x2_SSE2  0
-	movdqa	xmm1,	[esi]
-	movdqa	xmm2,	[edi]
-	movdqa	xmm3,	[esi+ebx]
-	movdqa	xmm4,	[edi+ebx]
-	psadbw	xmm1,	xmm2
-	psadbw	xmm3,	xmm4
-	paddd	xmm6,	xmm1
-	paddd	xmm6,	xmm3
-	lea		esi,	[esi+ebx*2]
-	lea		edi,	[edi+ebx*2]	
-%endmacro
-
-%macro	WELS_SAD_SUM_SQSUM_16x1_SSE2 0
-	movdqa	xmm1,	[esi]
-	movdqa	xmm2,	[edi]
-	movdqa	xmm3,	xmm1
-	psadbw	xmm3,	xmm2
-	paddd	xmm6,	xmm3
-	
-	movdqa	xmm3,	xmm1
-	psadbw	xmm3,	xmm0
-	paddd	xmm5,	xmm3
-	
-	movdqa		xmm2,	xmm1
-	punpcklbw	xmm1,	xmm0
-	punpckhbw	xmm2,	xmm0
-	pmaddwd		xmm1,	xmm1
-	pmaddwd		xmm2,	xmm2
-	paddd		xmm4,	xmm1
-	paddd		xmm4,	xmm2
-	
-	add		esi,	ebx
-	add		edi,	ebx
-%endmacro
-
-%macro	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 0
-	movdqa	xmm1,	[esi]
-	movdqa	xmm2,	[edi]
-	movdqa	xmm3,	xmm1
-	psadbw	xmm3,	xmm2
-	paddd	xmm7,	xmm3	; sad
-	
-	movdqa	xmm3,	xmm1
-	pmaxub	xmm3,	xmm2
-	pminub	xmm2,	xmm1
-	psubb	xmm3,	xmm2	; diff
-	
-	movdqa	xmm2,	xmm1
-	psadbw	xmm2,	xmm0
-	paddd	xmm6,	xmm2	; sum
-	
-	movdqa		xmm2,	xmm1
-	punpcklbw	xmm1,	xmm0
-	punpckhbw	xmm2,	xmm0
-	pmaddwd		xmm1,	xmm1
-	pmaddwd		xmm2,	xmm2
-	paddd		xmm5,	xmm1
-	paddd		xmm5,	xmm2	; sqsum
-	
-	movdqa		xmm1,	xmm3
-	punpcklbw	xmm1,	xmm0
-	punpckhbw	xmm3,	xmm0
-	pmaddwd		xmm1,	xmm1
-	pmaddwd		xmm3,	xmm3
-	paddd		xmm4,	xmm1
-	paddd		xmm4,	xmm3	; sqdiff
-	
-	add		esi,	ebx
-	add		edi,	ebx
-%endmacro
-
-%macro	WELS_SAD_SD_MAD_16x1_SSE2	4
-%define sad_reg			%1
-%define	sum_cur_reg		%2
-%define sum_ref_reg		%3
-%define	mad_reg			%4
-	movdqa	xmm1,		[esi]
-	movdqa	xmm2,		[edi]
-	movdqa	xmm3,		xmm1
-	psadbw	xmm3,		xmm0
-	paddd	sum_cur_reg,			xmm3	; sum_cur
-	movdqa	xmm3,		xmm2
-	psadbw	xmm3,		xmm0
-	paddd	sum_ref_reg,			xmm3	; sum_ref
-	
-	movdqa	xmm3,		xmm1
-	pmaxub	xmm3,		xmm2
-	pminub	xmm2,		xmm1
-	psubb	xmm3,		xmm2	; abs diff
-	pmaxub	mad_reg,	xmm3	; max abs diff
-	
-	psadbw	xmm3,		xmm0
-	paddd	sad_reg,	xmm3	; sad
-	
-	add			esi,		ebx
-	add			edi,		ebx
-%endmacro
-
-
-%macro	WELS_MAX_REG_SSE2	1	; xmm1, xmm2, xmm3 can be used
-%define max_reg  %1
-	movdqa	xmm1,		max_reg
-	psrldq	xmm1,		4
-	pmaxub	max_reg,	xmm1
-	movdqa	xmm1,		max_reg
-	psrldq	xmm1,		2
-	pmaxub	max_reg,	xmm1
-	movdqa	xmm1,		max_reg
-	psrldq	xmm1,		1
-	pmaxub	max_reg,	xmm1
-%endmacro
-
-%macro	WELS_SAD_BGD_SQDIFF_16x1_SSE2	4
-%define sad_reg		%1
-%define	sum_reg		%2
-%define mad_reg		%3
-%define sqdiff_reg	%4
-	movdqa		xmm1,		[esi]
-	movdqa		xmm2,		xmm1
-	movdqa		xmm3,		xmm1
-	punpcklbw	xmm2,		xmm0
-	punpckhbw	xmm3,		xmm0
-	pmaddwd		xmm2,		xmm2
-	pmaddwd		xmm3,		xmm3
-	paddd		xmm2,		xmm3
-	movdqa		xmm3,		xmm2
-	psllq		xmm2,		32
-	psrlq		xmm3,		32
-	psllq		xmm3,		32
-	paddd		xmm2,		xmm3
-	paddd		sad_reg,	xmm2		; sqsum
-	
-	movdqa	xmm2,		[edi]
-	movdqa	xmm3,		xmm1
-	psadbw	xmm3,		xmm0
-	paddd	sum_reg,			xmm3	; sum_cur
-	movdqa	xmm3,		xmm2
-	psadbw	xmm3,		xmm0
-	pslldq	xmm3,		4
-	paddd	sum_reg,			xmm3	; sum_ref
-	
-	movdqa	xmm3,		xmm1
-	pmaxub	xmm3,		xmm2
-	pminub	xmm2,		xmm1
-	psubb	xmm3,		xmm2	; abs diff
-	pmaxub	mad_reg,	xmm3	; max abs diff
-	
-	movdqa	xmm1,		xmm3
-	psadbw	xmm3,		xmm0
-	paddd	sad_reg,	xmm3	; sad
-
-	movdqa		xmm3,	xmm1
-	punpcklbw	xmm1,	xmm0
-	punpckhbw	xmm3,	xmm0
-	pmaddwd		xmm1,	xmm1
-	pmaddwd		xmm3,	xmm3
-	paddd		sqdiff_reg,	xmm1
-	paddd		sqdiff_reg,	xmm3	; sqdiff
-	
-	add		esi,	ebx
-	add		edi,	ebx
-%endmacro
-
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-;SECTION .rodata align=16
-
-;ALIGN 16
-;pack1_8x2:
-;	dw 1, 1, 1, 1, 1, 1, 1, 1
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-WELS_EXTERN rc_sad_frame_sse2
-;***********************************************************************
-;	uint32_t rc_sad_frame_sse2(	uint8_t *ref_orig, uint8_t *cur_orig, const int mb_width, const int iPicHeight, const int iPicStride );
-;***********************************************************************
-ALIGN 16
-rc_sad_frame_sse2:
-	push esi
-	push edi
-	push ebp
-	push ebx
-	push edx
-
-	mov esi, [esp+24]
-	mov edi, [esp+28]
-	mov ebx, [esp+32]
-	mov ecx, [esp+36]
-	mov edx, [esp+40]
-	pxor xmm0, xmm0	
-.hloop:
-	mov eax, ebx
-	mov ebp, $0
-.wloop:
-	movdqa xmm1, [esi+ebp]
-	movdqa xmm2, [edi+ebp]
-	psadbw xmm1, xmm2
-	pshufd xmm2, xmm1, 0f6h	; 11110110 B ; movhlps for float
-	paddd xmm1, xmm2
-	paddd xmm0, xmm1	
-	add ebp, 010h
-	dec eax
-	jnz near .wloop
-	lea esi, [esi+edx]
-	lea edi, [edi+edx]
-	dec ecx
-	jnz near .hloop
-
-	movd eax, xmm0
-	pop edx
-	pop ebx
-	pop ebp
-	pop edi
-	pop esi
-	ret
-
-
-WELS_EXTERN SampleVariance16x16_sse2
-;***********************************************************************
-;   void SampleVariance16x16_sse2(	uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
-;***********************************************************************
-ALIGN 16
-SampleVariance16x16_sse2:	
-	push esi
-	push edi
-	push ebx
-	
-	sub esp, 16
-	%define SUM			[esp]
-	%define SUM_CUR		[esp+4]
-	%define SQR			[esp+8]
-	%define SQR_CUR		[esp+12]
-	%define PUSH_SIZE	28	; 12 + 16	
-
-	mov edi, [esp+PUSH_SIZE+4]	; y_ref
-	mov edx, [esp+PUSH_SIZE+8]	; y_ref_stride	
-	mov esi, [esp+PUSH_SIZE+12]	; y_src
-	mov eax, [esp+PUSH_SIZE+16]	; y_src_stride
-	mov ecx, 010h				; height = 16
-
-	pxor xmm7, xmm7
-	movdqu SUM, xmm7
-
-.hloops:
-	movdqa xmm0, [edi]		; y_ref
-	movdqa xmm1, [esi]		; y_src
-	movdqa xmm2, xmm0		; store first for future process
-	movdqa xmm3, xmm1
-	; sum += diff;
-	movdqa xmm4, xmm0
-	psadbw xmm4, xmm1		; 2 parts, [0,..,15], [64,..,79]
-	; to be continued for sum
-	pshufd xmm5, xmm4, 0C6h	; 11000110 B
-	paddw xmm4, xmm5
-	movd ebx, xmm4
-	add SUM, ebx
-
-	; sqr += diff * diff;
-	pmaxub xmm0, xmm1
-	pminub xmm1, xmm2
-	psubb xmm0, xmm1				; diff	
-	SUM_SQR_SSE2 xmm1, xmm0, xmm7	; dst, pSrc, zero
-	movd ebx, xmm1
-	add SQR, ebx
-
-	; sum_cur += y_src[x];
-	movdqa xmm0, xmm3		; cur_orig
-	movdqa xmm1, xmm0
-	punpcklbw xmm0, xmm7
-	punpckhbw xmm1, xmm7
-	paddw xmm0, xmm1		; 8x2
-	SUM_WORD_8x2_SSE2 xmm0, xmm1	
-	movd ebx, xmm0
-	and ebx, 0ffffh
-	add SUM_CUR, ebx
-
-	; sqr_cur += y_src[x] * y_src[x];
-	SUM_SQR_SSE2 xmm0, xmm3, xmm7	; dst, pSrc, zero
-	movd ebx, xmm0
-	add SQR_CUR, ebx
-	
-	lea edi, [edi+edx]
-	lea esi, [esi+eax]
-	dec ecx
-	jnz near .hloops
-	
-	mov ebx, 0
-	mov bx, word SUM
-	sar ebx, 8
-	imul ebx, ebx
-	mov ecx, SQR
-	sar ecx, 8
-	sub ecx, ebx
-	mov edi, [esp+PUSH_SIZE+20]	; pMotionTexture
-	mov [edi], cx				; to store uiMotionIndex
-	mov ebx, 0
-	mov bx, word SUM_CUR
-	sar ebx, 8
-	imul ebx, ebx
-	mov ecx, SQR_CUR
-	sar ecx, 8
-	sub ecx, ebx
-	mov [edi+2], cx				; to store uiTextureIndex
-	
-	%undef SUM
-	%undef SUM_CUR
-	%undef SQR
-	%undef SQR_CUR
-	%undef PUSH_SIZE
-
-	add esp, 16	
-	pop ebx
-	pop edi
-	pop esi	
-
-	ret
-
-; , 6/7/2010
-
-%ifndef NO_DYNAMIC_VP
-WELS_EXTERN AnalysisVaaInfoIntra_sse2
-;***********************************************************************
-;	int32_t AnalysisVaaInfoIntra_sse2(	uint8_t *pDataY, const int32_t linesize );
-;***********************************************************************
-ALIGN 16
-AnalysisVaaInfoIntra_sse2:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov ebp, esp
-	and ebp, 0fh
-	sub esp, ebp
-	sub esp, 32	
-	%define PUSH_SIZE	52	; 20 + 32
-
-	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
-	mov ecx, [esp+ebp+PUSH_SIZE+8]	; linesize
-
-	mov ebx, ecx
-	sal ebx, $1			; linesize x 2 [ebx]
-	mov edx, ebx
-	add edx, ecx		; linesize x 3 [edx]
-	mov eax, ebx
-	sal eax, $1			; linesize x 4 [eax]
-	
-	pxor xmm7, xmm7
-	
-	; loops
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp], xmm0	
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+8], xmm0	
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+16], xmm0	
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+24], xmm0
-		
-	movdqa xmm0, [esp]		; block 0~7
-	movdqa xmm1, [esp+16]	; block 8~15
-	movdqa xmm2, xmm0
-	paddw xmm0, xmm1
-	SUM_WORD_8x2_SSE2 xmm0, xmm3
-	
-	pmullw xmm1, xmm1
-	pmullw xmm2, xmm2
-	movdqa xmm3, xmm1
-	movdqa xmm4, xmm2
-	punpcklwd xmm1, xmm7
-	punpckhwd xmm3, xmm7
-	punpcklwd xmm2, xmm7
-	punpckhwd xmm4, xmm7
-	paddd xmm1, xmm2
-	paddd xmm3, xmm4
-	paddd xmm1, xmm3
-	pshufd xmm2, xmm1, 01Bh
-	paddd xmm1, xmm2
-	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2
-	
-	movd ebx, xmm0
-	and ebx, 0ffffh		; effective low word truncated
-	mov ecx, ebx
-	imul ebx, ecx
-	sar ebx, $4
-	movd eax, xmm1
-	sub eax, ebx
-	
-	%undef PUSH_SIZE
-	add esp, 32
-	add esp, ebp
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-        
-WELS_EXTERN AnalysisVaaInfoIntra_ssse3
-;***********************************************************************
-;	int32_t AnalysisVaaInfoIntra_ssse3(	uint8_t *pDataY, const int32_t linesize );
-;***********************************************************************
-ALIGN 16
-AnalysisVaaInfoIntra_ssse3:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov ebp, esp
-	and ebp, 0fh
-	sub esp, ebp
-	sub esp, 32	
-	%define PUSH_SIZE	52	; 20 + 32
-
-	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
-	mov ecx, [esp+ebp+PUSH_SIZE+8]	; linesize
-
-	mov ebx, ecx
-	sal ebx, $1			; linesize x 2 [ebx]
-	mov edx, ebx
-	add edx, ecx		; linesize x 3 [edx]
-	mov eax, ebx
-	sal eax, $1			; linesize x 4 [eax]
-	
-	pxor xmm7, xmm7
-	
-	; loops
-	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp], xmm0	
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
-	movq [esp+8], xmm1	
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+16], xmm0	
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
-	movq [esp+24], xmm1
-		
-	movdqa xmm0, [esp]		; block 0~7
-	movdqa xmm1, [esp+16]	; block 8~15
-	movdqa xmm2, xmm0
-	paddw xmm0, xmm1
-	SUM_WORD_8x2_SSE2 xmm0, xmm3	; better performance than that of phaddw sets
-
-	pmullw xmm1, xmm1
-	pmullw xmm2, xmm2
-	movdqa xmm3, xmm1
-	movdqa xmm4, xmm2
-	punpcklwd xmm1, xmm7
-	punpckhwd xmm3, xmm7
-	punpcklwd xmm2, xmm7
-	punpckhwd xmm4, xmm7
-	paddd xmm1, xmm2
-	paddd xmm3, xmm4
-	paddd xmm1, xmm3
-	pshufd xmm2, xmm1, 01Bh
-	paddd xmm1, xmm2
-	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2
-	
-	movd ebx, xmm0
-	and ebx, 0ffffh		; effective low work truncated
-	mov ecx, ebx
-	imul ebx, ecx
-	sar ebx, $4
-	movd eax, xmm1
-	sub eax, ebx
-	
-	%undef PUSH_SIZE
-	add esp, 32
-	add esp, ebp
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-%endif
-	
-	
-
-WELS_EXTERN abs_difference_mbrow_sse2
-;*************************************************************************************************************
-;void abs_difference_mbrow_sse2( uint8_t *ref_orig, uint8_t *cur_orig, int32_t iPicStride, 
-;								 int32_t gom_pixel_num, int32_t *pSum)
-;*************************************************************************************************************
-ALIGN 16
-abs_difference_mbrow_sse2:
-%define		ref_orig			esp + pushsize + 4
-%define		cur_orig			esp + pushsize + 8
-%define		iPicStride			esp + pushsize + 12
-%define		gom_pixel_num		esp + pushsize + 16
-%define		pSum				esp + pushsize + 20
-%define		pushsize	12
-	push	esi
-	push	edi
-	push	ebx
-	mov		esi,	[ref_orig]
-	mov		edi,	[cur_orig]
-	mov		ebx,	[iPicStride]
-	mov		eax,	[gom_pixel_num]
-	mov		ecx,	16					;MB_WIDTH_LUMA
-	pxor	xmm0,	xmm0
-mb_width_loop_p:
-	mov		edx,	esi
-	add		edx,	eax			; end address
-gom_row_loop_p:
-	movdqa	xmm1,	[esi]
-	movdqa	xmm2,	[edi]
-	psadbw	xmm1,	xmm2
-	paddd	xmm0,	xmm1
-	add		esi,	16
-	add		edi,	16
-	cmp		esi,	edx
-	jl		gom_row_loop_p
-	
-	sub		esi,	eax
-	sub		edi,	eax
-	add		esi,	ebx
-	add		edi,	ebx
-	loop	mb_width_loop_p
-	
-	movdqa	xmm1,	xmm0
-	psrldq	xmm1,	8
-	paddd	xmm1,	xmm0
-	movd	eax,	xmm1
-	mov		edx,	[pSum]	; pSum
-	add		[edx],	eax
-
-%undef		ref_orig
-%undef		cur_orig
-%undef		iPicStride
-%undef		gom_pixel_num
-%undef		pSum
-%undef		pushsize	
-	pop		ebx
-	pop		edi
-	pop		esi
-	ret
-
-
-
-
-WELS_EXTERN sum_sqrsum_mbrow_sse2
-;*************************************************************************************************************
-;void sum_sqrsum_mbrow_sse2( uint8_t *cur_orig, int32_t iPicStride, 
-;							 int32_t gom_pixel_num, int32_t *pSum, int32_t *pSqrSum)
-;*************************************************************************************************************
-ALIGN 16
-sum_sqrsum_mbrow_sse2:
-%define		cur_orig			esp + pushsize + 4
-%define		iPicStride			esp + pushsize + 8
-%define		gom_pixel_num		esp + pushsize + 12
-%define		pSum				esp + pushsize + 16
-%define		pSqrSum				esp + pushsize + 20
-%define		pushsize			8
-	push		esi
-	push		ebx
-	mov			esi,	[cur_orig]
-	mov			eax,	[gom_pixel_num]
-	mov			ebx,	[iPicStride]
-	mov			ecx,	16					;MB_WIDTH_LUMA
-	pxor		xmm0,	xmm0				; zero
-	pxor		xmm1,	xmm1				; sum
-	pxor		xmm2,	xmm2				; sqr sum
-mb_width_loop_i:
-	mov			edx,	esi
-	add			edx,	eax			; end address
-gom_row_loop_i:
-	movdqa		xmm3,	[esi]
-	movdqa		xmm4,	xmm3
-	psadbw		xmm4,	xmm0
-	paddd		xmm1,	xmm4
-	movdqa		xmm4,	xmm3
-	punpcklbw	xmm4,	xmm0
-	punpckhbw	xmm3,	xmm0
-	pmaddwd		xmm4,	xmm4
-	pmaddwd		xmm3,	xmm3
-	paddd		xmm2,	xmm3
-	paddd		xmm2,	xmm4
-	add			esi,	16
-	cmp			esi,	edx
-	jl			gom_row_loop_i
-	
-	sub			esi,	eax
-	add			esi,	ebx
-	loop		mb_width_loop_i
-	
-	movdqa		xmm3,	xmm1
-	psrldq		xmm3,	8
-	paddd		xmm1,	xmm3
-	movd		eax,	xmm1
-	mov			edx,	[pSum]
-	add			[edx],	eax
-	
-	movdqa		xmm3,	xmm2
-	psrldq		xmm3,	8
-	paddd		xmm2,	xmm3
-	movdqa		xmm3,	xmm2
-	psrldq		xmm3,	4
-	paddd		xmm2,	xmm3
-	movd		eax,	xmm2
-	mov			edx,	[pSqrSum]
-	add			[edx],	eax
-
-
-%undef		cur_orig
-%undef		iPicStride
-%undef		gom_pixel_num
-%undef		pSum
-%undef		pSqrSum
-%undef		pushsize	
-	pop			ebx
-	pop			esi
-	ret
-
-
-
-WELS_EXTERN VAACalcSad_sse2
-;*************************************************************************************************************
-;void VAACalcSad_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
-;								int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
-;*************************************************************************************************************
-
-
-ALIGN 16
-VAACalcSad_sse2:
-%define		cur_data			esp + pushsize + 4
-%define		ref_data			esp + pushsize + 8
-%define		iPicWidth			esp + pushsize + 12
-%define		iPicHeight			esp + pushsize + 16
-%define		iPicStride			esp + pushsize + 20
-%define		psadframe			esp + pushsize + 24
-%define		psad8x8				esp + pushsize + 28
-%define		pushsize	12
-	push	esi
-	push	edi
-	push	ebx
-	mov		esi,	[cur_data]
-	mov		edi,	[ref_data]
-	mov		ebx,	[iPicStride]
-	mov		edx,	[psad8x8]
-	mov		eax,	ebx
-	
-	shr		dword [iPicWidth],	4					; iPicWidth/16
-	shr		dword [iPicHeight],	4					; iPicHeight/16
-	shl		eax,	4								; iPicStride*16
-	pxor	xmm0,	xmm0
-	pxor	xmm7,	xmm7		; iFrameSad
-height_loop:
-	mov		ecx,	dword [iPicWidth]
-	push	esi
-	push	edi
-width_loop:
-	pxor	xmm6,	xmm6		;
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	paddd	xmm7,		xmm6
-	movd	[edx],		xmm6
-	psrldq	xmm6,		8
-	movd	[edx+4],	xmm6
-	
-	pxor	xmm6,	xmm6
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	paddd	xmm7,		xmm6
-	movd	[edx+8],	xmm6
-	psrldq	xmm6,		8
-	movd	[edx+12],	xmm6
-	
-	add		edx,	16
-	sub		esi,	eax
-	sub		edi,	eax
-	add		esi,	16
-	add		edi,	16
-	
-	dec		ecx
-	jnz		width_loop
-	
-	pop		edi
-	pop		esi
-	add		esi,	eax
-	add		edi,	eax
-	
-	dec	dword [iPicHeight]
-	jnz		height_loop
-	
-	mov		edx,	[psadframe]
-	movdqa	xmm5,	xmm7
-	psrldq	xmm7,	8
-	paddd	xmm7,	xmm5
-	movd	[edx],	xmm7
-
-%undef		cur_data
-%undef		ref_data
-%undef		iPicWidth
-%undef		iPicHeight
-%undef		iPicStride
-%undef		psadframe
-%undef		psad8x8
-%undef		pushsize	
-	pop		ebx
-	pop		edi
-	pop		esi
-	ret
-	
-	
-WELS_EXTERN VAACalcSadVar_sse2
-;*************************************************************************************************************
-;void VAACalcSadVar_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight 
-;		int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
-;*************************************************************************************************************
-
-
-ALIGN 16
-VAACalcSadVar_sse2:
-%define		localsize		8
-%define		cur_data			esp + pushsize + localsize + 4
-%define		ref_data			esp + pushsize + localsize + 8
-%define		iPicWidth			esp + pushsize + localsize + 12
-%define		iPicHeight			esp + pushsize + localsize + 16
-%define		iPicStride			esp + pushsize + localsize + 20
-%define		psadframe			esp + pushsize + localsize + 24
-%define		psad8x8				esp + pushsize + localsize + 28
-%define		psum16x16			esp + pushsize + localsize + 32
-%define		psqsum16x16			esp + pushsize + localsize + 36
-%define		tmp_esi				esp + 0
-%define		tmp_edi				esp + 4
-%define		pushsize		16
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-	sub		esp,	localsize
-	mov		esi,	[cur_data]
-	mov		edi,	[ref_data]
-	mov		ebx,	[iPicStride]
-	mov		edx,	[psad8x8]
-	mov		eax,	ebx
-	
-	shr		dword [iPicWidth],	4					; iPicWidth/16
-	shr		dword [iPicHeight],	4					; iPicHeight/16
-	shl		eax,	4							; iPicStride*16
-	pxor	xmm0,	xmm0
-	pxor	xmm7,	xmm7		; iFrameSad
-var_height_loop:
-	mov		ecx,	dword [iPicWidth]
-	mov		[tmp_esi],	esi
-	mov		[tmp_edi],	edi
-var_width_loop:
-	pxor	xmm6,	xmm6		; hiQuad_loQuad pSad8x8
-	pxor	xmm5,	xmm5		; pSum16x16
-	pxor	xmm4,	xmm4		; sqsum_16x16
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	paddd	xmm7,		xmm6
-	movd	[edx],		xmm6
-	psrldq	xmm6,		8
-	movd	[edx+4],	xmm6
-	
-	pxor	xmm6,	xmm6
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	paddd	xmm7,		xmm6
-	movd	[edx+8],	xmm6
-	psrldq	xmm6,		8
-	movd	[edx+12],	xmm6
-	
-	mov		ebp,	[psum16x16]
-	movdqa	xmm1,	xmm5
-	psrldq	xmm1,	8
-	paddd	xmm5,	xmm1
-	movd	[ebp],	xmm5
-	add		dword [psum16x16], 4
-	
-	movdqa	xmm5,	xmm4
-	psrldq	xmm5,	8
-	paddd	xmm4,	xmm5
-	movdqa	xmm3,	xmm4
-	psrldq	xmm3,	4
-	paddd	xmm4,	xmm3
-	
-	mov		ebp,	[psqsum16x16]
-	movd	[ebp],	xmm4
-	add		dword [psqsum16x16], 4
-	
-	add		edx,	16
-	sub		esi,	eax
-	sub		edi,	eax
-	add		esi,	16
-	add		edi,	16
-	
-	dec		ecx
-	jnz		var_width_loop
-	
-	mov		esi,	[tmp_esi]
-	mov		edi,	[tmp_edi]
-	add		esi,	eax
-	add		edi,	eax
-	
-	dec	dword [iPicHeight]
-	jnz		var_height_loop
-	
-	mov		edx,	[psadframe]
-	movdqa	xmm5,	xmm7
-	psrldq	xmm7,	8
-	paddd	xmm7,	xmm5
-	movd	[edx],	xmm7
-
-	add		esp,	localsize	
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		cur_data
-%undef		ref_data
-%undef		iPicWidth
-%undef		iPicHeight
-%undef		iPicStride
-%undef		psadframe
-%undef		psad8x8
-%undef		psum16x16
-%undef		psqsum16x16
-%undef		tmp_esi
-%undef		tmp_edi
-%undef		pushsize
-%undef		localsize
-	ret
-	
-	
-
-WELS_EXTERN VAACalcSadSsd_sse2
-;*************************************************************************************************************
-;void VAACalcSadSsd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,  
-;	int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
-;*************************************************************************************************************
-
-
-ALIGN 16
-VAACalcSadSsd_sse2:
-%define		localsize		12
-%define		cur_data			esp + pushsize + localsize + 4
-%define		ref_data			esp + pushsize + localsize + 8
-%define		iPicWidth			esp + pushsize + localsize + 12
-%define		iPicHeight			esp + pushsize + localsize + 16
-%define		iPicStride			esp + pushsize + localsize + 20
-%define		psadframe			esp + pushsize + localsize + 24
-%define		psad8x8				esp + pushsize + localsize + 28
-%define		psum16x16			esp + pushsize + localsize + 32
-%define		psqsum16x16			esp + pushsize + localsize + 36
-%define		psqdiff16x16		esp + pushsize + localsize + 40
-%define		tmp_esi				esp + 0
-%define		tmp_edi				esp + 4
-%define		tmp_sadframe		esp + 8
-%define		pushsize		16
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-	sub		esp,	localsize
-	mov		ecx,	[iPicWidth]
-	mov		ecx,	[iPicHeight]
-	mov		esi,	[cur_data]
-	mov		edi,	[ref_data]
-	mov		ebx,	[iPicStride]
-	mov		edx,	[psad8x8]
-	mov		eax,	ebx
-	
-	shr		dword [iPicWidth],	4					; iPicWidth/16
-	shr		dword [iPicHeight],	4					; iPicHeight/16
-	shl		eax,	4							; iPicStride*16
-	mov		ecx,	[iPicWidth]
-	mov		ecx,	[iPicHeight]
-	pxor	xmm0,	xmm0
-	movd	[tmp_sadframe],	xmm0
-sqdiff_height_loop:
-	mov		ecx,	dword [iPicWidth]
-	mov		[tmp_esi],	esi
-	mov		[tmp_edi],	edi
-sqdiff_width_loop:
-	pxor	xmm7,	xmm7		; hiQuad_loQuad pSad8x8
-	pxor	xmm6,	xmm6		; pSum16x16
-	pxor	xmm5,	xmm5		; sqsum_16x16  four dword
-	pxor	xmm4,	xmm4		; sqdiff_16x16	four Dword
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	movdqa	xmm1,		xmm7
-	movd	[edx],		xmm7
-	psrldq	xmm7,		8
-	paddd	xmm1,		xmm7
-	movd	[edx+4],	xmm7
-	movd	ebp,		xmm1
-	add		[tmp_sadframe],	ebp
-	
-	pxor	xmm7,	xmm7
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	movdqa	xmm1,		xmm7
-	movd	[edx+8],	xmm7
-	psrldq	xmm7,		8
-	paddd	xmm1,		xmm7
-	movd	[edx+12],	xmm7
-	movd	ebp,		xmm1
-	add		[tmp_sadframe],	ebp
-	
-	mov		ebp,	[psum16x16]
-	movdqa	xmm1,	xmm6
-	psrldq	xmm1,	8
-	paddd	xmm6,	xmm1
-	movd	[ebp],	xmm6
-	add		dword [psum16x16], 4
-	
-	mov		ebp,	[psqsum16x16]
-	pshufd	xmm6,	xmm5,	14 ;00001110
-	paddd	xmm6,	xmm5
-	pshufd	xmm5,	xmm6,	1  ;00000001
-	paddd	xmm5,	xmm6
-	movd	[ebp],	xmm5
-	add		dword [psqsum16x16], 4
-	
-	mov		ebp,	[psqdiff16x16]
-	pshufd	xmm5,	xmm4,	14	; 00001110
-	paddd	xmm5,	xmm4
-	pshufd	xmm4,	xmm5,	1	; 00000001
-	paddd	xmm4,	xmm5
-	movd	[ebp],	xmm4
-	add		dword	[psqdiff16x16],	4
-	
-	add		edx,	16
-	sub		esi,	eax
-	sub		edi,	eax
-	add		esi,	16
-	add		edi,	16
-	
-	dec		ecx
-	jnz		sqdiff_width_loop
-	
-	mov		esi,	[tmp_esi]
-	mov		edi,	[tmp_edi]
-	add		esi,	eax
-	add		edi,	eax
-	
-	dec	dword [iPicHeight]
-	jnz		sqdiff_height_loop
-	
-	mov		ebx,	[tmp_sadframe]
-	mov		eax,	[psadframe]
-	mov		[eax],	ebx
-
-	add		esp,	localsize	
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		cur_data
-%undef		ref_data
-%undef		iPicWidth
-%undef		iPicHeight
-%undef		iPicStride
-%undef		psadframe
-%undef		psad8x8
-%undef		psum16x16
-%undef		psqsum16x16
-%undef		psqdiff16x16
-%undef		tmp_esi
-%undef		tmp_edi
-%undef		tmp_sadframe
-%undef		pushsize
-%undef		localsize
-	ret
-	
-	
-	
-	
-
-WELS_EXTERN VAACalcSadBgd_sse2
-;*************************************************************************************************************
-;void VAACalcSadBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, 
-;				int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
-;*************************************************************************************************************
-
-
-ALIGN 16
-VAACalcSadBgd_sse2:
-%define		localsize		12
-%define		cur_data			esp + pushsize + localsize + 4
-%define		ref_data			esp + pushsize + localsize + 8
-%define		iPicWidth			esp + pushsize + localsize + 12
-%define		iPicHeight			esp + pushsize + localsize + 16
-%define		iPicStride			esp + pushsize + localsize + 20
-%define		psadframe			esp + pushsize + localsize + 24
-%define		psad8x8				esp + pushsize + localsize + 28
-%define		p_sd8x8				esp + pushsize + localsize + 32
-%define		p_mad8x8			esp + pushsize + localsize + 36
-%define		tmp_esi				esp + 0
-%define		tmp_edi				esp + 4
-%define		tmp_ecx				esp + 8
-%define		pushsize		16
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-	sub		esp,	localsize
-	mov		esi,	[cur_data]
-	mov		edi,	[ref_data]
-	mov		ebx,	[iPicStride]
-	mov		eax,	ebx
-	
-	shr		dword [iPicWidth],	4					; iPicWidth/16
-	shr		dword [iPicHeight],	4					; iPicHeight/16
-	shl		eax,	4							; iPicStride*16
-	xor		ebp,	ebp
-	pxor	xmm0,	xmm0
-bgd_height_loop:
-	mov		ecx,	dword [iPicWidth]
-	mov		[tmp_esi],	esi
-	mov		[tmp_edi],	edi
-bgd_width_loop:
-	pxor	xmm7,	xmm7		; pSad8x8
-	pxor	xmm6,	xmm6		; sum_cur_8x8
-	pxor	xmm5,	xmm5		; sum_ref_8x8
-	pxor	xmm4,	xmm4		; pMad8x8
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	
-	
-	mov			edx,		[p_mad8x8]
-	WELS_MAX_REG_SSE2	xmm4
-	
-	;movdqa		xmm1,	xmm4
-	;punpcklbw	xmm1,	xmm0
-	;punpcklwd	xmm1,	xmm0
-	;movd		[edx],	xmm1
-	;punpckhbw	xmm4,	xmm0
-	;punpcklwd	xmm4,	xmm0
-	;movd		[edx+4],	xmm4
-	;add			edx,		8
-	;mov			[p_mad8x8],	edx	
-	mov			[tmp_ecx],	ecx
-	movhlps		xmm1,	xmm4
-	movd		ecx,	xmm4
-	mov			[edx],	cl
-	movd		ecx,	xmm1
-	mov			[edx+1],cl
-	add			edx,	2
-	mov			[p_mad8x8],	edx
-
-	
-	pslldq		xmm7,	4
-	pslldq		xmm6,	4
-	pslldq		xmm5,	4
-	
-	
-	pxor	xmm4,	xmm4		; pMad8x8
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	
-	mov			edx,		[p_mad8x8]
-	WELS_MAX_REG_SSE2	xmm4
-	
-	;movdqa		xmm1,	xmm4
-	;punpcklbw	xmm1,	xmm0
-	;punpcklwd	xmm1,	xmm0
-	;movd		[edx],	xmm1
-	;punpckhbw	xmm4,	xmm0
-	;punpcklwd	xmm4,	xmm0
-	;movd		[edx+4],	xmm4
-	;add			edx,		8
-	;mov			[p_mad8x8],	edx	
-	movhlps		xmm1,	xmm4
-	movd		ecx,	xmm4
-	mov			[edx],	cl
-	movd		ecx,	xmm1
-	mov			[edx+1],cl
-	add			edx,	2
-	mov			[p_mad8x8],	edx
-	
-	; data in xmm7, xmm6, xmm5:  D1 D3 D0 D2
-	
-	mov		edx,	[psad8x8]
-	pshufd	xmm1,	xmm7,	10001101b		; D3 D2 D1 D0
-	movdqa	[edx],	xmm1					
-	add		edx,	16
-	mov		[psad8x8],	edx					; sad8x8
-	
-	paddd	xmm1,	xmm7					; D1+3 D3+2 D0+1 D2+0
-	pshufd	xmm2,	xmm1,	00000011b
-	paddd	xmm1,	xmm2
-	movd	edx,	xmm1
-	add		ebp,	edx						; sad frame
-	
-	mov		edx,	[p_sd8x8]
-	psubd	xmm6,	xmm5
-	pshufd	xmm1,	xmm6,	10001101b
-	movdqa	[edx],	xmm1
-	add		edx,	16
-	mov		[p_sd8x8],	edx
-	
-	
-	add		edx,	16
-	sub		esi,	eax
-	sub		edi,	eax
-	add		esi,	16
-	add		edi,	16
-	
-	mov		ecx,	[tmp_ecx]
-	dec		ecx
-	jnz		bgd_width_loop
-	
-	mov		esi,	[tmp_esi]
-	mov		edi,	[tmp_edi]
-	add		esi,	eax
-	add		edi,	eax
-	
-	dec		dword [iPicHeight]
-	jnz		bgd_height_loop
-	
-	mov		edx,	[psadframe]
-	mov		[edx],	ebp
-
-	add		esp,	localsize	
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		cur_data
-%undef		ref_data
-%undef		iPicWidth
-%undef		iPicHeight
-%undef		iPicStride
-%undef		psadframe
-%undef		psad8x8
-%undef		p_sd8x8
-%undef		p_mad8x8
-%undef		tmp_esi
-%undef		tmp_edi
-%undef		pushsize
-%undef		localsize
-	ret
-
-
-
-WELS_EXTERN VAACalcSadSsdBgd_sse2
-;*************************************************************************************************************
-;void VAACalcSadSsdBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, 
-;		 int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, 
-;			int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
-;*************************************************************************************************************
-
-
-ALIGN 16
-VAACalcSadSsdBgd_sse2:
-%define		localsize		16
-%define		cur_data			esp + pushsize + localsize + 4
-%define		ref_data			esp + pushsize + localsize + 8
-%define		iPicWidth			esp + pushsize + localsize + 12
-%define		iPicHeight			esp + pushsize + localsize + 16
-%define		iPicStride			esp + pushsize + localsize + 20
-%define		psadframe			esp + pushsize + localsize + 24
-%define		psad8x8				esp + pushsize + localsize + 28
-%define		psum16x16			esp + pushsize + localsize + 32
-%define		psqsum16x16			esp + pushsize + localsize + 36
-%define		psqdiff16x16		esp + pushsize + localsize + 40
-%define		p_sd8x8				esp + pushsize + localsize + 44
-%define		p_mad8x8			esp + pushsize + localsize + 48
-%define		tmp_esi				esp + 0
-%define		tmp_edi				esp + 4
-%define		tmp_sadframe		esp + 8
-%define		tmp_ecx				esp + 12
-%define		pushsize		16
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-	sub		esp,	localsize
-	mov		esi,	[cur_data]
-	mov		edi,	[ref_data]
-	mov		ebx,	[iPicStride]
-	mov		eax,	ebx
-	
-	shr		dword [iPicWidth],	4					; iPicWidth/16
-	shr		dword [iPicHeight],	4					; iPicHeight/16
-	shl		eax,	4							; iPicStride*16
-	pxor	xmm0,	xmm0
-	movd	[tmp_sadframe],	xmm0
-sqdiff_bgd_height_loop:
-	mov		ecx,	dword [iPicWidth]
-	mov		[tmp_esi],	esi
-	mov		[tmp_edi],	edi
-sqdiff_bgd_width_loop:
-	pxor	xmm7,	xmm7		; pSad8x8 interleaves sqsum16x16:  sqsum1 sad1 sqsum0 sad0
-	pxor	xmm6,	xmm6		; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
-	pxor	xmm5,	xmm5		; pMad8x8
-	pxor	xmm4,	xmm4		; sqdiff_16x16	four Dword
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	
-	mov		edx,		[psad8x8]
-	movdqa	xmm2,		xmm7
-	pshufd	xmm1,		xmm2,		00001110b
-	movd	[edx],		xmm2
-	movd	[edx+4],	xmm1
-	add		edx,		8
-	mov		[psad8x8],	edx			; sad8x8
-	
-	paddd	xmm1,				xmm2
-	movd	edx,				xmm1
-	add		[tmp_sadframe],		edx			; iFrameSad
-	
-	mov		edx,		[psum16x16]
-	movdqa	xmm1,		xmm6
-	pshufd	xmm2,		xmm1,		00001110b
-	paddd	xmm1,		xmm2
-	movd	[edx],		xmm1				; sum
-	
-	mov		edx,		[p_sd8x8]
-	pshufd	xmm1,		xmm6,		11110101b			; Sref1 Sref1 Sref0 Sref0
-	psubd	xmm6,		xmm1		; 00 diff1 00 diff0
-	pshufd	xmm1,		xmm6,		00001000b			;  xx xx diff1 diff0
-	movq	[edx],		xmm1
-	add		edx,		8
-	mov		[p_sd8x8],	edx
-	
-	mov			edx,		[p_mad8x8]
-	WELS_MAX_REG_SSE2	xmm5
-	;movdqa		xmm1,	xmm5
-	;punpcklbw	xmm1,	xmm0
-	;punpcklwd	xmm1,	xmm0
-	;movd		[edx],	xmm1
-	;punpckhbw	xmm5,	xmm0
-	;punpcklwd	xmm5,	xmm0
-	;movd		[edx+4],	xmm5
-	;add			edx,		8
-	;mov			[p_mad8x8],	edx
-	mov			[tmp_ecx],	ecx
-	movhlps		xmm1,	xmm5
-	movd		ecx,	xmm5
-	mov			[edx],	cl
-	movd		ecx,	xmm1
-	mov			[edx+1],cl
-	add			edx,	2
-	mov			[p_mad8x8],	edx
-	
-	psrlq	xmm7,	32
-	psllq	xmm7,	32			; clear sad
-	pxor	xmm6,	xmm6		; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
-	pxor	xmm5,	xmm5		; pMad8x8
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	
-	mov		edx,		[psad8x8]
-	movdqa	xmm2,		xmm7
-	pshufd	xmm1,		xmm2,		00001110b
-	movd	[edx],		xmm2
-	movd	[edx+4],	xmm1
-	add		edx,		8
-	mov		[psad8x8],	edx			; sad8x8
-	
-	paddd	xmm1,				xmm2
-	movd	edx,				xmm1
-	add		[tmp_sadframe],		edx			; iFrameSad
-	
-	mov		edx,			[psum16x16]
-	movdqa	xmm1,			xmm6
-	pshufd	xmm2,			xmm1,		00001110b
-	paddd	xmm1,			xmm2
-	movd	ebp,			xmm1				; sum
-	add		[edx],			ebp
-	add		edx,			4
-	mov		[psum16x16],	edx
-	
-	mov		edx,			[psqsum16x16]
-	psrlq	xmm7,			32
-	pshufd	xmm2,			xmm7,		00001110b
-	paddd	xmm2,			xmm7
-	movd	[edx],			xmm2				; sqsum
-	add		edx,			4
-	mov		[psqsum16x16],	edx
-	
-	mov		edx,		[p_sd8x8]
-	pshufd	xmm1,		xmm6,		11110101b			; Sref1 Sref1 Sref0 Sref0
-	psubd	xmm6,		xmm1		; 00 diff1 00 diff0
-	pshufd	xmm1,		xmm6,		00001000b			;  xx xx diff1 diff0
-	movq	[edx],		xmm1
-	add		edx,		8
-	mov		[p_sd8x8],	edx
-	
-	mov		edx,		[p_mad8x8]
-	WELS_MAX_REG_SSE2	xmm5
-	;movdqa		xmm1,	xmm5
-	;punpcklbw	xmm1,	xmm0
-	;punpcklwd	xmm1,	xmm0
-	;movd		[edx],	xmm1
-	;punpckhbw	xmm5,	xmm0
-	;punpcklwd	xmm5,	xmm0
-	;movd		[edx+4],	xmm5
-	;add			edx,		8
-	;mov			[p_mad8x8],	edx	
-	movhlps		xmm1,	xmm5
-	movd		ecx,	xmm5
-	mov			[edx],	cl
-	movd		ecx,	xmm1
-	mov			[edx+1],cl
-	add			edx,	2
-	mov			[p_mad8x8],	edx
-	
-	mov		edx,		[psqdiff16x16]
-	pshufd	xmm1,		xmm4,		00001110b
-	paddd	xmm4,		xmm1
-	pshufd	xmm1,		xmm4,		00000001b
-	paddd	xmm4,		xmm1
-	movd	[edx],		xmm4
-	add		edx,		4
-	mov		[psqdiff16x16],	edx
-	
-	add		edx,	16
-	sub		esi,	eax
-	sub		edi,	eax
-	add		esi,	16
-	add		edi,	16
-	
-	mov		ecx,	[tmp_ecx]
-	dec		ecx
-	jnz		sqdiff_bgd_width_loop
-	
-	mov		esi,	[tmp_esi]
-	mov		edi,	[tmp_edi]
-	add		esi,	eax
-	add		edi,	eax
-	
-	dec	dword [iPicHeight]
-	jnz		sqdiff_bgd_height_loop
-	
-	mov		edx,	[psadframe]
-	mov		ebp,	[tmp_sadframe]
-	mov		[edx],	ebp
-
-	add		esp,	localsize	
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		cur_data
-%undef		ref_data
-%undef		iPicWidth
-%undef		iPicHeight
-%undef		iPicStride
-%undef		psadframe
-%undef		psad8x8
-%undef		psum16x16
-%undef		psqsum16x16
-%undef		psqdiff16x16
-%undef		p_sd8x8
-%undef		p_mad8x8
-%undef		tmp_esi
-%undef		tmp_edi
-%undef		pushsize
-%undef		localsize
-	ret
+;*!
+;* \copy
+;*     Copyright (c)  2010-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*	vaa.asm
+;*
+;*	Abstract
+;*      sse2 for pVaa routines
+;*
+;*  History
+;*      04/14/2010	Created
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+BITS 32
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+;%macro SUM_SSE2	4	; dst, pSrc, zero, pack1_8x2
+;	movdqa %1, %2
+;	punpcklbw %1, %3
+;	punpckhbw %2, %3
+;	paddw %1, %2
+;	pmaddwd %1, %4
+;	pshufd %2, %1, 04Eh	; 01001110 B
+;	paddd %1, %2
+;	pshufd %2, %1, 0B1h	; 10110001 B
+;	paddd %1, %2
+;%endmacro	; END OF SUM_SSE2
+
+; by comparison, this outperforms the equivalent phaddw (SSSE3) sequence
+%macro SUM_WORD_8x2_SSE2	2	; dst(pSrc), tmp -- horizontal add of the 8 words in %1; total ends up in word 0 of %1, %2 is scratch
+	; @sum_8x2 begin
+	pshufd %2, %1, 04Eh	; 01001110 B: swap the two qwords
+	paddw %1, %2	; words 0..3 = w[i] + w[i+4]
+	pshuflw %2, %1, 04Eh	; 01001110 B: swap the two dwords of the low qword
+	paddw %1, %2	; words 0..1 each hold the sum of four inputs
+	pshuflw %2, %1, 0B1h	; 10110001 B: swap adjacent words of the low qword
+	paddw %1, %2	; word 0 = sum of all eight input words (16-bit arithmetic)
+	; end of @sum_8x2
+%endmacro	; END of SUM_WORD_8x2_SSE2
+
+%macro SUM_SQR_SSE2	3	; dst, pSrc, zero -- sum of squares of the 16 bytes in pSrc; total is left in every dword of dst, pSrc is clobbered
+	movdqa %1, %2
+	punpcklbw %1, %3	; low 8 bytes -> words (%3 is expected to be all-zero)
+	punpckhbw %2, %3	; high 8 bytes -> words
+	pmaddwd %1, %1	; square each word and add adjacent pairs -> 4 dwords
+	pmaddwd %2, %2
+	paddd %1, %2
+	pshufd %2, %1, 04Eh	; 01001110 B: swap qwords
+	paddd %1, %2
+	pshufd %2, %1, 0B1h	; 10110001 B: swap adjacent dwords
+	paddd %1, %2	; all four dwords now hold the full sum
+%endmacro	; END OF SUM_SQR_SSE2
+
+%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4 -- means of the four 4x4 blocks of the 16x4 strip at esi; needs xmm7 = 0 and ecx/ebx/edx = byte offsets of lines 1/2/3
+	movdqa %1, [esi    ]	; line 0
+	movdqa %2, [esi+ecx]	; line 1
+	movdqa %3, %1
+	punpcklbw %1, xmm7	; line 0, columns 0..7 as words
+	punpckhbw %3, xmm7	; line 0, columns 8..15 as words
+	movdqa %4, %2
+	punpcklbw %4, xmm7	; line 1, columns 0..7
+	punpckhbw %2, xmm7	; line 1, columns 8..15
+	paddw %1, %4	; columns 0..7: line 0 + line 1
+	paddw %2, %3	; columns 8..15: line 0 + line 1
+	movdqa %3, [esi+ebx]	; line 2
+	movdqa %4, [esi+edx]	; line 3
+	movdqa %5, %3
+	punpcklbw %3, xmm7
+	punpckhbw %5, xmm7
+	movdqa %6, %4
+	punpcklbw %6, xmm7
+	punpckhbw %4, xmm7
+	paddw %3, %6	; columns 0..7: line 2 + line 3
+	paddw %4, %5	; columns 8..15: line 2 + line 3
+	paddw %1, %3	; block 0, 1 (per-column sums over the 4 lines)
+	paddw %2, %4	; block 2, 3
+	pshufd %3, %1, 0B1h	; swap adjacent dwords
+	pshufd %4, %2, 0B1h
+	paddw %1, %3	; each word = sum of two columns of the same block
+	paddw %2, %4
+	movdqa %3, %1
+	movdqa %4, %2
+	pshuflw %5, %1, 0B1h	; swap adjacent words, low qword
+	pshufhw %6, %3, 0B1h	; swap adjacent words, high qword
+	paddw %1, %5	; low words of %1 = block 0 total
+	paddw %3, %6	; high words of %3 = block 1 total
+	pshuflw %5, %2, 0B1h
+	pshufhw %6, %4, 0B1h
+	paddw %2, %5	; low words of %2 = block 2 total
+	paddw %4, %6	; high words of %4 = block 3 total
+	punpcklwd %1, %2	; b0 b2 b0 b2 ...
+	punpckhwd %3, %4	; b1 b3 b1 b3 ...
+	punpcklwd %1, %3	; b0 b1 b2 b3 b0 b1 b2 b3
+	psraw %1, $4	; divide each 16-pixel total by 16 -> block mean
+%endmacro
+
+%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4 -- same result as VAA_AVG_BLOCK_SSE2 using phaddw; needs xmm7 = 0 and ecx/ebx/edx = byte offsets of lines 1/2/3
+	movdqa %1, [esi    ]	; line 0
+	movdqa %2, [esi+ecx]	; line 1
+	movdqa %3, %1
+	punpcklbw %1, xmm7	; line 0, columns 0..7 as words
+	punpckhbw %3, xmm7	; line 0, columns 8..15 as words
+	movdqa %4, %2
+	punpcklbw %4, xmm7	; line 1, columns 0..7
+	punpckhbw %2, xmm7	; line 1, columns 8..15
+	paddw %1, %4	; columns 0..7: line 0 + line 1
+	paddw %2, %3	; columns 8..15: line 0 + line 1
+	movdqa %3, [esi+ebx]	; line 2
+	movdqa %4, [esi+edx]	; line 3
+	movdqa %5, %3
+	punpcklbw %3, xmm7
+	punpckhbw %5, xmm7
+	movdqa %6, %4
+	punpcklbw %6, xmm7
+	punpckhbw %4, xmm7
+	paddw %3, %6	; columns 0..7: line 2 + line 3
+	paddw %4, %5	; columns 8..15: line 2 + line 3
+	paddw %1, %3	; block 0, 1
+	paddw %2, %4	; block 2, 3
+	phaddw %1, %2	; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
+	phaddw %1, xmm7	; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; .... (upper 4 words become 0)
+	psraw %1, $4	; divide each 16-pixel total by 16 -> block mean
+%endmacro
+
+%macro WELS_SAD_16x2_SSE2  0	; adds the SAD of two 16-byte rows ([esi] vs [edi], stride ebx) to xmm6; clobbers xmm1-xmm4
+	movdqa	xmm1,	[esi]
+	movdqa	xmm2,	[edi]
+	movdqa	xmm3,	[esi+ebx]	; second row
+	movdqa	xmm4,	[edi+ebx]
+	psadbw	xmm1,	xmm2	; one partial SAD per 8-byte half
+	psadbw	xmm3,	xmm4
+	paddd	xmm6,	xmm1
+	paddd	xmm6,	xmm3
+	lea		esi,	[esi+ebx*2]	; advance both pointers by two rows
+	lea		edi,	[edi+ebx*2]
+%endmacro
+
+%macro	WELS_SAD_SUM_SQSUM_16x1_SSE2 0	; one 16-byte row: xmm6 += SAD([esi],[edi]), xmm5 += sum([esi]), xmm4 += sum of squares([esi]); needs xmm0 = 0
+	movdqa	xmm1,	[esi]
+	movdqa	xmm2,	[edi]
+	movdqa	xmm3,	xmm1
+	psadbw	xmm3,	xmm2
+	paddd	xmm6,	xmm3	; sad
+	; psadbw against zero yields the plain byte sum of each 8-byte half
+	movdqa	xmm3,	xmm1
+	psadbw	xmm3,	xmm0
+	paddd	xmm5,	xmm3	; sum
+	; widen bytes to words, then pmaddwd squares and pair-adds into dwords
+	movdqa		xmm2,	xmm1
+	punpcklbw	xmm1,	xmm0
+	punpckhbw	xmm2,	xmm0
+	pmaddwd		xmm1,	xmm1
+	pmaddwd		xmm2,	xmm2
+	paddd		xmm4,	xmm1
+	paddd		xmm4,	xmm2	; sqsum (four dword partial sums)
+	; step both pointers to the next row
+	add		esi,	ebx
+	add		edi,	ebx
+%endmacro
+
+%macro	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 0	; one 16-byte row: xmm7 += sad, xmm6 += sum([esi]), xmm5 += sqsum([esi]), xmm4 += squared |[esi]-[edi]|; needs xmm0 = 0
+	movdqa	xmm1,	[esi]
+	movdqa	xmm2,	[edi]
+	movdqa	xmm3,	xmm1
+	psadbw	xmm3,	xmm2
+	paddd	xmm7,	xmm3	; sad
+	; per-byte absolute difference = max(a,b) - min(a,b)
+	movdqa	xmm3,	xmm1
+	pmaxub	xmm3,	xmm2
+	pminub	xmm2,	xmm1
+	psubb	xmm3,	xmm2	; diff
+	; psadbw against zero yields the plain byte sum of each 8-byte half
+	movdqa	xmm2,	xmm1
+	psadbw	xmm2,	xmm0
+	paddd	xmm6,	xmm2	; sum
+	; widen bytes to words, then pmaddwd squares and pair-adds into dwords
+	movdqa		xmm2,	xmm1
+	punpcklbw	xmm1,	xmm0
+	punpckhbw	xmm2,	xmm0
+	pmaddwd		xmm1,	xmm1
+	pmaddwd		xmm2,	xmm2
+	paddd		xmm5,	xmm1
+	paddd		xmm5,	xmm2	; sqsum
+	; same squaring step applied to the absolute differences
+	movdqa		xmm1,	xmm3
+	punpcklbw	xmm1,	xmm0
+	punpckhbw	xmm3,	xmm0
+	pmaddwd		xmm1,	xmm1
+	pmaddwd		xmm3,	xmm3
+	paddd		xmm4,	xmm1
+	paddd		xmm4,	xmm3	; sqdiff
+	; step both pointers to the next row
+	add		esi,	ebx
+	add		edi,	ebx
+%endmacro
+
+%macro	WELS_SAD_SD_MAD_16x1_SSE2	4	; sad, sum_cur, sum_ref, mad accumulators; processes one 16-byte row of [esi] vs [edi]; needs xmm0 = 0; clobbers xmm1-xmm3
+%define sad_reg			%1
+%define	sum_cur_reg		%2
+%define sum_ref_reg		%3
+%define	mad_reg			%4
+	movdqa	xmm1,		[esi]
+	movdqa	xmm2,		[edi]
+	movdqa	xmm3,		xmm1
+	psadbw	xmm3,		xmm0	; byte sum of the [esi] row, one per 8-byte half
+	paddd	sum_cur_reg,			xmm3	; sum_cur
+	movdqa	xmm3,		xmm2
+	psadbw	xmm3,		xmm0	; byte sum of the [edi] row
+	paddd	sum_ref_reg,			xmm3	; sum_ref
+	; per-byte absolute difference = max(a,b) - min(a,b)
+	movdqa	xmm3,		xmm1
+	pmaxub	xmm3,		xmm2
+	pminub	xmm2,		xmm1
+	psubb	xmm3,		xmm2	; abs diff
+	pmaxub	mad_reg,	xmm3	; max abs diff (running per-byte-lane maximum)
+	; summing the absolute differences gives the SAD
+	psadbw	xmm3,		xmm0
+	paddd	sad_reg,	xmm3	; sad
+	; step both pointers to the next row
+	add			esi,		ebx
+	add			edi,		ebx
+%endmacro
+
+
+%macro	WELS_MAX_REG_SSE2	1	; xmm1, xmm2, xmm3 can be used (only xmm1 is actually clobbered); leaves max of bytes 0..7 in byte 0 and max of bytes 8..15 in byte 8
+%define max_reg  %1
+	movdqa	xmm1,		max_reg
+	psrldq	xmm1,		4
+	pmaxub	max_reg,	xmm1	; byte i = max(b[i], b[i+4])
+	movdqa	xmm1,		max_reg
+	psrldq	xmm1,		2
+	pmaxub	max_reg,	xmm1	; byte i = max over b[i], b[i+2], b[i+4], b[i+6]
+	movdqa	xmm1,		max_reg
+	psrldq	xmm1,		1
+	pmaxub	max_reg,	xmm1	; byte i = max over b[i..i+7] (zeros are shifted in past byte 15)
+%endmacro
+
+%macro	WELS_SAD_BGD_SQDIFF_16x1_SSE2	4	; sad(+sqsum), sum, mad, sqdiff accumulators; one 16-byte row of [esi] vs [edi]; needs xmm0 = 0; clobbers xmm1-xmm3
+%define sad_reg		%1
+%define	sum_reg		%2
+%define mad_reg		%3
+%define sqdiff_reg	%4
+	movdqa		xmm1,		[esi]
+	movdqa		xmm2,		xmm1
+	movdqa		xmm3,		xmm1
+	punpcklbw	xmm2,		xmm0
+	punpckhbw	xmm3,		xmm0
+	pmaddwd		xmm2,		xmm2
+	pmaddwd		xmm3,		xmm3
+	paddd		xmm2,		xmm3	; four dword partial sums of squares of the [esi] row
+	movdqa		xmm3,		xmm2
+	psllq		xmm2,		32	; move the even dwords up into the odd positions
+	psrlq		xmm3,		32
+	psllq		xmm3,		32	; keep only the odd dwords
+	paddd		xmm2,		xmm3	; odd dwords = pairwise totals, even dwords = 0
+	paddd		sad_reg,	xmm2		; sqsum (accumulated in the odd dwords of sad_reg)
+	; byte sums: [esi] row into the even dwords of sum_reg, [edi] row into the odd dwords
+	movdqa	xmm2,		[edi]
+	movdqa	xmm3,		xmm1
+	psadbw	xmm3,		xmm0
+	paddd	sum_reg,			xmm3	; sum_cur
+	movdqa	xmm3,		xmm2
+	psadbw	xmm3,		xmm0
+	pslldq	xmm3,		4	; shift by one dword so it lands beside sum_cur
+	paddd	sum_reg,			xmm3	; sum_ref
+	; per-byte absolute difference = max(a,b) - min(a,b)
+	movdqa	xmm3,		xmm1
+	pmaxub	xmm3,		xmm2
+	pminub	xmm2,		xmm1
+	psubb	xmm3,		xmm2	; abs diff
+	pmaxub	mad_reg,	xmm3	; max abs diff
+	; keep a copy of the abs diff in xmm1 for the squared-difference step below
+	movdqa	xmm1,		xmm3
+	psadbw	xmm3,		xmm0
+	paddd	sad_reg,	xmm3	; sad (accumulated in the even dwords of sad_reg)
+	; square the absolute differences and accumulate
+	movdqa		xmm3,	xmm1
+	punpcklbw	xmm1,	xmm0
+	punpckhbw	xmm3,	xmm0
+	pmaddwd		xmm1,	xmm1
+	pmaddwd		xmm3,	xmm3
+	paddd		sqdiff_reg,	xmm1
+	paddd		sqdiff_reg,	xmm3	; sqdiff
+	; step both pointers to the next row
+	add		esi,	ebx
+	add		edi,	ebx
+%endmacro
+
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+;SECTION .rodata align=16
+
+;ALIGN 16
+;pack1_8x2:
+;	dw 1, 1, 1, 1, 1, 1, 1, 1
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+WELS_EXTERN rc_sad_frame_sse2
+;***********************************************************************
+;	uint32_t rc_sad_frame_sse2(	uint8_t *ref_orig, uint8_t *cur_orig, const int mb_width, const int iPicHeight, const int iPicStride );
+;	Returns (in eax) the SAD over iPicHeight rows of mb_width 16-byte chunks; rows are iPicStride bytes apart and are read with movdqa (16-byte alignment required).
+ALIGN 16
+rc_sad_frame_sse2:
+	push esi
+	push edi
+	push ebp
+	push ebx
+	push edx
+	; five pushes (20 bytes) plus the return address: first argument is at esp+24
+	mov esi, [esp+24]	; ref_orig
+	mov edi, [esp+28]	; cur_orig
+	mov ebx, [esp+32]	; mb_width: 16-byte chunks per row
+	mov ecx, [esp+36]	; iPicHeight: outer-loop (row) count
+	mov edx, [esp+40]	; iPicStride: bytes between rows
+	pxor xmm0, xmm0	; running SAD total (low dword is the result)
+.hloop:
+	mov eax, ebx	; chunks left in this row
+	mov ebp, $0	; byte offset within the row
+.wloop:
+	movdqa xmm1, [esi+ebp]
+	movdqa xmm2, [edi+ebp]
+	psadbw xmm1, xmm2	; one partial SAD per 8-byte half (dwords 0 and 2)
+	pshufd xmm2, xmm1, 0f6h	; 11110110 B ; movhlps for float -- brings the upper-half SAD down to dword 0
+	paddd xmm1, xmm2	; dword 0 = SAD of the whole 16-byte chunk
+	paddd xmm0, xmm1
+	add ebp, 010h
+	dec eax
+	jnz near .wloop
+	lea esi, [esi+edx]	; next row
+	lea edi, [edi+edx]
+	dec ecx
+	jnz near .hloop
+	; only the low dword of xmm0 is meaningful
+	movd eax, xmm0
+	pop edx
+	pop ebx
+	pop ebp
+	pop edi
+	pop esi
+	ret
+
+
+WELS_EXTERN SampleVariance16x16_sse2
+;***********************************************************************
+;   void SampleVariance16x16_sse2(	uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
+;   Over a 16x16 block, stores two 16-bit values of the form (sum of squares >> 8) - ((sum >> 8)^2): at [pMotionTexture] for |y_ref - y_src|, at [pMotionTexture+2] for y_src.
+ALIGN 16
+SampleVariance16x16_sse2:
+	push esi
+	push edi
+	push ebx
+	; reserve 16 bytes of locals: four dword accumulators (SUM, SUM_CUR, SQR, SQR_CUR)
+	sub esp, 16
+	%define SUM			[esp]
+	%define SUM_CUR		[esp+4]
+	%define SQR			[esp+8]
+	%define SQR_CUR		[esp+12]
+	%define PUSH_SIZE	28	; 12 + 16
+	; arguments start just above the return address, i.e. at esp+PUSH_SIZE+4
+	mov edi, [esp+PUSH_SIZE+4]	; y_ref
+	mov edx, [esp+PUSH_SIZE+8]	; y_ref_stride
+	mov esi, [esp+PUSH_SIZE+12]	; y_src
+	mov eax, [esp+PUSH_SIZE+16]	; y_src_stride
+	mov ecx, 010h				; height = 16
+	; xmm7 stays zero for the whole routine; the store below clears all four accumulators at once
+	pxor xmm7, xmm7
+	movdqu SUM, xmm7
+	; one iteration per 16-pixel row
+.hloops:
+	movdqa xmm0, [edi]		; y_ref
+	movdqa xmm1, [esi]		; y_src
+	movdqa xmm2, xmm0		; store first for future process
+	movdqa xmm3, xmm1
+	; sum += diff;
+	movdqa xmm4, xmm0
+	psadbw xmm4, xmm1		; 2 parts, [0,..,15], [64,..,79]
+	; to be continued for sum
+	pshufd xmm5, xmm4, 0C6h	; 11000110 B -- exchanges dwords 0 and 2
+	paddw xmm4, xmm5	; low dword = SAD of the whole row
+	movd ebx, xmm4
+	add SUM, ebx
+
+	; sqr += diff * diff;
+	pmaxub xmm0, xmm1
+	pminub xmm1, xmm2
+	psubb xmm0, xmm1				; diff
+	SUM_SQR_SSE2 xmm1, xmm0, xmm7	; dst, pSrc, zero
+	movd ebx, xmm1
+	add SQR, ebx
+
+	; sum_cur += y_src[x];
+	movdqa xmm0, xmm3		; cur_orig
+	movdqa xmm1, xmm0
+	punpcklbw xmm0, xmm7
+	punpckhbw xmm1, xmm7
+	paddw xmm0, xmm1		; 8x2
+	SUM_WORD_8x2_SSE2 xmm0, xmm1
+	movd ebx, xmm0
+	and ebx, 0ffffh	; only word 0 holds the row total
+	add SUM_CUR, ebx
+
+	; sqr_cur += y_src[x] * y_src[x];
+	SUM_SQR_SSE2 xmm0, xmm3, xmm7	; dst, pSrc, zero
+	movd ebx, xmm0
+	add SQR_CUR, ebx
+
+	lea edi, [edi+edx]	; next y_ref row
+	lea esi, [esi+eax]	; next y_src row
+	dec ecx
+	jnz near .hloops
+	; first output: (SQR >> 8) - ((SUM >> 8)^2), using the low 16 bits of SUM
+	mov ebx, 0
+	mov bx, word SUM
+	sar ebx, 8	; divide by 256 (pixels in the block)
+	imul ebx, ebx
+	mov ecx, SQR
+	sar ecx, 8
+	sub ecx, ebx
+	mov edi, [esp+PUSH_SIZE+20]	; pMotionTexture
+	mov [edi], cx				; to store uiMotionIndex
+	mov ebx, 0
+	mov bx, word SUM_CUR
+	sar ebx, 8
+	imul ebx, ebx
+	mov ecx, SQR_CUR
+	sar ecx, 8
+	sub ecx, ebx	; (SQR_CUR >> 8) - ((SUM_CUR >> 8)^2)
+	mov [edi+2], cx				; to store uiTextureIndex
+
+	%undef SUM
+	%undef SUM_CUR
+	%undef SQR
+	%undef SQR_CUR
+	%undef PUSH_SIZE
+	; release the locals and restore callee-saved registers
+	add esp, 16
+	pop ebx
+	pop edi
+	pop esi
+
+	ret
+
+; , 6/7/2010
+
+%ifndef NO_DYNAMIC_VP
+WELS_EXTERN AnalysisVaaInfoIntra_sse2
+;***********************************************************************
+;	int32_t AnalysisVaaInfoIntra_sse2(	uint8_t *pDataY, const int32_t linesize );
+;	Averages each of the sixteen 4x4 blocks of a 16x16 area, then returns sum(avg^2) - (sum(avg)^2 >> 4) in eax.
+ALIGN 16
+AnalysisVaaInfoIntra_sse2:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+	; align esp down to 16 bytes (ebp remembers the padding) so the 32-byte scratch area can be read with movdqa
+	mov ebp, esp
+	and ebp, 0fh
+	sub esp, ebp
+	sub esp, 32
+	%define PUSH_SIZE	52	; 20 + 32
+	; arguments sit above locals + padding + pushes + return address
+	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
+	mov ecx, [esp+ebp+PUSH_SIZE+8]	; linesize
+	; precompute the row offsets used by VAA_AVG_BLOCK_SSE2
+	mov ebx, ecx
+	sal ebx, $1			; linesize x 2 [ebx]
+	mov edx, ebx
+	add edx, ecx		; linesize x 3 [edx]
+	mov eax, ebx
+	sal eax, $1			; linesize x 4 [eax]
+	; the macro needs xmm7 = 0
+	pxor xmm7, xmm7
+
+	; loops (unrolled: four 16x4 strips, four block averages each)
+	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp], xmm0
+
+	lea esi, [esi+eax]	; down four lines
+	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp+8], xmm0
+
+	lea esi, [esi+eax]
+	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp+16], xmm0
+
+	lea esi, [esi+eax]
+	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp+24], xmm0
+
+	movdqa xmm0, [esp]		; block 0~7
+	movdqa xmm1, [esp+16]	; block 8~15
+	movdqa xmm2, xmm0
+	paddw xmm0, xmm1
+	SUM_WORD_8x2_SSE2 xmm0, xmm3	; word 0 of xmm0 = sum of the 16 block averages
+	; square every average and widen to dwords before summing
+	pmullw xmm1, xmm1
+	pmullw xmm2, xmm2
+	movdqa xmm3, xmm1
+	movdqa xmm4, xmm2
+	punpcklwd xmm1, xmm7
+	punpckhwd xmm3, xmm7
+	punpcklwd xmm2, xmm7
+	punpckhwd xmm4, xmm7
+	paddd xmm1, xmm2
+	paddd xmm3, xmm4
+	paddd xmm1, xmm3
+	pshufd xmm2, xmm1, 01Bh	; reverse dword order
+	paddd xmm1, xmm2
+	pshufd xmm2, xmm1, 0B1h	; swap adjacent dwords
+	paddd xmm1, xmm2	; every dword = sum of squared averages
+	; result = sum(avg^2) - (sum(avg)^2 >> 4)
+	movd ebx, xmm0
+	and ebx, 0ffffh		; effective low word truncated
+	mov ecx, ebx
+	imul ebx, ecx
+	sar ebx, $4
+	movd eax, xmm1
+	sub eax, ebx
+
+	%undef PUSH_SIZE
+	add esp, 32
+	add esp, ebp	; undo the alignment padding
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+WELS_EXTERN AnalysisVaaInfoIntra_ssse3
+;***********************************************************************
+;	int32_t AnalysisVaaInfoIntra_ssse3(	uint8_t *pDataY, const int32_t linesize );
+;	Same computation as AnalysisVaaInfoIntra_sse2, with the block averages produced by VAA_AVG_BLOCK_SSSE3 (phaddw).
+ALIGN 16
+AnalysisVaaInfoIntra_ssse3:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+	; align esp down to 16 bytes (ebp remembers the padding) so the 32-byte scratch area can be read with movdqa
+	mov ebp, esp
+	and ebp, 0fh
+	sub esp, ebp
+	sub esp, 32
+	%define PUSH_SIZE	52	; 20 + 32
+	; arguments sit above locals + padding + pushes + return address
+	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
+	mov ecx, [esp+ebp+PUSH_SIZE+8]	; linesize
+	; precompute the row offsets used by VAA_AVG_BLOCK_SSSE3
+	mov ebx, ecx
+	sal ebx, $1			; linesize x 2 [ebx]
+	mov edx, ebx
+	add edx, ecx		; linesize x 3 [edx]
+	mov eax, ebx
+	sal eax, $1			; linesize x 4 [eax]
+	; the macro needs xmm7 = 0
+	pxor xmm7, xmm7
+
+	; loops (unrolled: four 16x4 strips, alternating between two register sets)
+	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp], xmm0
+
+	lea esi, [esi+eax]	; down four lines
+	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+	movq [esp+8], xmm1
+
+	lea esi, [esi+eax]
+	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp+16], xmm0
+
+	lea esi, [esi+eax]
+	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+	movq [esp+24], xmm1
+
+	movdqa xmm0, [esp]		; block 0~7
+	movdqa xmm1, [esp+16]	; block 8~15
+	movdqa xmm2, xmm0
+	paddw xmm0, xmm1
+	SUM_WORD_8x2_SSE2 xmm0, xmm3	; better performance than that of phaddw sets
+	; square every average and widen to dwords before summing
+	pmullw xmm1, xmm1
+	pmullw xmm2, xmm2
+	movdqa xmm3, xmm1
+	movdqa xmm4, xmm2
+	punpcklwd xmm1, xmm7
+	punpckhwd xmm3, xmm7
+	punpcklwd xmm2, xmm7
+	punpckhwd xmm4, xmm7
+	paddd xmm1, xmm2
+	paddd xmm3, xmm4
+	paddd xmm1, xmm3
+	pshufd xmm2, xmm1, 01Bh	; reverse dword order
+	paddd xmm1, xmm2
+	pshufd xmm2, xmm1, 0B1h	; swap adjacent dwords
+	paddd xmm1, xmm2	; every dword = sum of squared averages
+	; result = sum(avg^2) - (sum(avg)^2 >> 4)
+	movd ebx, xmm0
+	and ebx, 0ffffh		; effective low word truncated
+	mov ecx, ebx
+	imul ebx, ecx
+	sar ebx, $4
+	movd eax, xmm1
+	sub eax, ebx
+
+	%undef PUSH_SIZE
+	add esp, 32
+	add esp, ebp	; undo the alignment padding
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+%endif
+
+
+
+WELS_EXTERN abs_difference_mbrow_sse2
+;*************************************************************************************************************
+;void abs_difference_mbrow_sse2( uint8_t *ref_orig, uint8_t *cur_orig, int32_t iPicStride,
+;								 int32_t gom_pixel_num, int32_t *pSum)
+; Adds to *pSum the SAD of 16 rows, each gom_pixel_num bytes wide (read in 16-byte aligned chunks), rows iPicStride apart.
+ALIGN 16
+abs_difference_mbrow_sse2:
+%define		ref_orig			esp + pushsize + 4
+%define		cur_orig			esp + pushsize + 8
+%define		iPicStride			esp + pushsize + 12
+%define		gom_pixel_num		esp + pushsize + 16
+%define		pSum				esp + pushsize + 20
+%define		pushsize	12
+	push	esi
+	push	edi
+	push	ebx
+	mov		esi,	[ref_orig]
+	mov		edi,	[cur_orig]
+	mov		ebx,	[iPicStride]
+	mov		eax,	[gom_pixel_num]
+	mov		ecx,	16					;MB_WIDTH_LUMA
+	pxor	xmm0,	xmm0				; SAD accumulator (two dword partial sums)
+mb_width_loop_p:
+	mov		edx,	esi
+	add		edx,	eax			; end address
+gom_row_loop_p:
+	movdqa	xmm1,	[esi]
+	movdqa	xmm2,	[edi]
+	psadbw	xmm1,	xmm2
+	paddd	xmm0,	xmm1
+	add		esi,	16
+	add		edi,	16
+	cmp		esi,	edx
+	jb		gom_row_loop_p		; pointers are unsigned: a signed jl ends the row early when it straddles 0x80000000
+	; rewind to the start of the row, then step down one line
+	sub		esi,	eax
+	sub		edi,	eax
+	add		esi,	ebx
+	add		edi,	ebx
+	loop	mb_width_loop_p		; ecx counts the 16 rows
+	; fold the two 8-byte-half partial sums into one dword
+	movdqa	xmm1,	xmm0
+	psrldq	xmm1,	8
+	paddd	xmm1,	xmm0
+	movd	eax,	xmm1
+	mov		edx,	[pSum]	; pSum
+	add		[edx],	eax			; accumulate, do not overwrite
+
+%undef		ref_orig
+%undef		cur_orig
+%undef		iPicStride
+%undef		gom_pixel_num
+%undef		pSum
+%undef		pushsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	ret
+
+
+
+
+WELS_EXTERN sum_sqrsum_mbrow_sse2
+;*************************************************************************************************************
+;void sum_sqrsum_mbrow_sse2( uint8_t *cur_orig, int32_t iPicStride,
+;							 int32_t gom_pixel_num, int32_t *pSum, int32_t *pSqrSum)
+; Over 16 rows of gom_pixel_num bytes (16-byte aligned chunks, rows iPicStride apart): *pSum += pixel sum, *pSqrSum += sum of squares.
+ALIGN 16
+sum_sqrsum_mbrow_sse2:
+%define		cur_orig			esp + pushsize + 4
+%define		iPicStride			esp + pushsize + 8
+%define		gom_pixel_num		esp + pushsize + 12
+%define		pSum				esp + pushsize + 16
+%define		pSqrSum				esp + pushsize + 20
+%define		pushsize			8
+	push		esi
+	push		ebx
+	mov			esi,	[cur_orig]
+	mov			eax,	[gom_pixel_num]
+	mov			ebx,	[iPicStride]
+	mov			ecx,	16					;MB_WIDTH_LUMA
+	pxor		xmm0,	xmm0				; zero
+	pxor		xmm1,	xmm1				; sum
+	pxor		xmm2,	xmm2				; sqr sum
+mb_width_loop_i:
+	mov			edx,	esi
+	add			edx,	eax			; end address
+gom_row_loop_i:
+	movdqa		xmm3,	[esi]
+	movdqa		xmm4,	xmm3
+	psadbw		xmm4,	xmm0			; byte sum of each 8-byte half
+	paddd		xmm1,	xmm4
+	movdqa		xmm4,	xmm3
+	punpcklbw	xmm4,	xmm0
+	punpckhbw	xmm3,	xmm0
+	pmaddwd		xmm4,	xmm4			; square each widened byte and pair-add into dwords
+	pmaddwd		xmm3,	xmm3
+	paddd		xmm2,	xmm3
+	paddd		xmm2,	xmm4
+	add			esi,	16
+	cmp			esi,	edx
+	jb			gom_row_loop_i		; pointers are unsigned: a signed jl ends the row early when it straddles 0x80000000
+	; rewind to the start of the row, then step down one line
+	sub			esi,	eax
+	add			esi,	ebx
+	loop		mb_width_loop_i		; ecx counts the 16 rows
+	; fold the two partial sums and accumulate into *pSum
+	movdqa		xmm3,	xmm1
+	psrldq		xmm3,	8
+	paddd		xmm1,	xmm3
+	movd		eax,	xmm1
+	mov			edx,	[pSum]
+	add			[edx],	eax
+	; fold the four dword partial square sums and accumulate into *pSqrSum
+	movdqa		xmm3,	xmm2
+	psrldq		xmm3,	8
+	paddd		xmm2,	xmm3
+	movdqa		xmm3,	xmm2
+	psrldq		xmm3,	4
+	paddd		xmm2,	xmm3
+	movd		eax,	xmm2
+	mov			edx,	[pSqrSum]
+	add			[edx],	eax
+
+
+%undef		cur_orig
+%undef		iPicStride
+%undef		gom_pixel_num
+%undef		pSum
+%undef		pSqrSum
+%undef		pushsize
+	pop			ebx
+	pop			esi
+	ret
+
+
+
+WELS_EXTERN VAACalcSad_sse2
+;*************************************************************************************************************
+;void VAACalcSad_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
+;								int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
+;*************************************************************************************************************
+; Visits the picture in 16x16 blocks (iPicWidth>>4 per row, iPicHeight>>4 rows), writing four
+; dwords per block to psad8x8 and the 32-bit total of all of them to *psadframe.
+ALIGN 16
+VAACalcSad_sse2:
+%define		cur_data			esp + pushsize + 4
+%define		ref_data			esp + pushsize + 8
+%define		iPicWidth			esp + pushsize + 12
+%define		iPicHeight			esp + pushsize + 16
+%define		iPicStride			esp + pushsize + 20
+%define		psadframe			esp + pushsize + 24
+%define		psad8x8				esp + pushsize + 28
+%define		pushsize	12
+	push	esi					; three pushes below = pushsize (12) bytes
+	push	edi
+	push	ebx
+	mov		esi,	[cur_data]
+	mov		edi,	[ref_data]
+	mov		ebx,	[iPicStride]
+	mov		edx,	[psad8x8]		; edx = output cursor, advanced 16 bytes per 16x16 block
+	mov		eax,	ebx
+
+	shr		dword [iPicWidth],	4					; iPicWidth/16  (rewrites the stack argument in place)
+	shr		dword [iPicHeight],	4					; iPicHeight/16 (rewrites the stack argument in place)
+	shl		eax,	4								; iPicStride*16
+	pxor	xmm0,	xmm0
+	pxor	xmm7,	xmm7		; iFrameSad
+height_loop:
+	mov		ecx,	dword [iPicWidth]		; ecx = 16x16 blocks left in this block row
+	push	esi					; save row-start pointers; the esp-relative argument names are
+	push	edi					; 8 bytes off until the matching pops after width_loop
+width_loop:
+	pxor	xmm6,	xmm6		; accumulator for the first four macro invocations of this block
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	paddd	xmm7,		xmm6
+	movd	[edx],		xmm6		; dword 0 of xmm6 (low 64-bit half)
+	psrldq	xmm6,		8
+	movd	[edx+4],	xmm6		; dword 2 of xmm6 (high 64-bit half)
+; NOTE(review): WELS_SAD_16x2_SSE2 is defined elsewhere - presumably SAD of two 16-pixel rows into xmm6, esi/edi += 2*ebx; confirm.
+	pxor	xmm6,	xmm6
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	paddd	xmm7,		xmm6
+	movd	[edx+8],	xmm6
+	psrldq	xmm6,		8
+	movd	[edx+12],	xmm6
+
+	add		edx,	16			; next block's four output dwords
+	sub		esi,	eax			; back up 16 rows (eax = iPicStride*16) ...
+	sub		edi,	eax
+	add		esi,	16			; ... and move 16 bytes to the right
+	add		edi,	16
+
+	dec		ecx
+	jnz		width_loop
+
+	pop		edi					; restore the row-start pointers
+	pop		esi
+	add		esi,	eax			; advance to the next row of 16x16 blocks
+	add		edi,	eax
+
+	dec	dword [iPicHeight]		; remaining block rows, counted in the argument slot itself
+	jnz		height_loop
+
+	mov		edx,	[psadframe]
+	movdqa	xmm5,	xmm7
+	psrldq	xmm7,	8
+	paddd	xmm7,	xmm5		; low dword = dword 0 + dword 2 of the frame accumulator
+	movd	[edx],	xmm7
+
+%undef		cur_data
+%undef		ref_data
+%undef		iPicWidth
+%undef		iPicHeight
+%undef		iPicStride
+%undef		psadframe
+%undef		psad8x8
+%undef		pushsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	ret
+
+
+WELS_EXTERN VAACalcSadVar_sse2
+;*************************************************************************************************************
+;void VAACalcSadVar_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
+;		int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
+;*************************************************************************************************************
+; Per 16x16 block: writes four dwords to psad8x8 and one dword each to psum16x16 and psqsum16x16;
+; the total of the psad8x8 values goes to *psadframe. Block counts are iPicWidth>>4 by iPicHeight>>4.
+ALIGN 16
+VAACalcSadVar_sse2:
+%define		localsize		8
+%define		cur_data			esp + pushsize + localsize + 4
+%define		ref_data			esp + pushsize + localsize + 8
+%define		iPicWidth			esp + pushsize + localsize + 12
+%define		iPicHeight			esp + pushsize + localsize + 16
+%define		iPicStride			esp + pushsize + localsize + 20
+%define		psadframe			esp + pushsize + localsize + 24
+%define		psad8x8				esp + pushsize + localsize + 28
+%define		psum16x16			esp + pushsize + localsize + 32
+%define		psqsum16x16			esp + pushsize + localsize + 36
+%define		tmp_esi				esp + 0
+%define		tmp_edi				esp + 4
+%define		pushsize		16
+	push	ebp					; four pushes below = pushsize (16) bytes
+	push	esi
+	push	edi
+	push	ebx
+	sub		esp,	localsize	; room for tmp_esi / tmp_edi
+	mov		esi,	[cur_data]
+	mov		edi,	[ref_data]
+	mov		ebx,	[iPicStride]
+	mov		edx,	[psad8x8]		; edx = psad8x8 output cursor, advanced 16 bytes per block
+	mov		eax,	ebx
+
+	shr		dword [iPicWidth],	4					; iPicWidth/16  (rewrites the stack argument in place)
+	shr		dword [iPicHeight],	4					; iPicHeight/16 (rewrites the stack argument in place)
+	shl		eax,	4							; iPicStride*16
+	pxor	xmm0,	xmm0
+	pxor	xmm7,	xmm7		; iFrameSad
+var_height_loop:
+	mov		ecx,	dword [iPicWidth]		; ecx = 16x16 blocks left in this block row
+	mov		[tmp_esi],	esi		; remember where this block row starts
+	mov		[tmp_edi],	edi
+var_width_loop:
+	pxor	xmm6,	xmm6		; hiQuad_loQuad pSad8x8
+	pxor	xmm5,	xmm5		; pSum16x16
+	pxor	xmm4,	xmm4		; sqsum_16x16
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	paddd	xmm7,		xmm6
+	movd	[edx],		xmm6		; dword 0 of xmm6 (low 64-bit half)
+	psrldq	xmm6,		8
+	movd	[edx+4],	xmm6		; dword 2 of xmm6 (high 64-bit half)
+; NOTE(review): WELS_SAD_SUM_SQSUM_16x1_SSE2 is defined elsewhere - presumably one 16-pixel row per call, esi/edi += ebx; confirm.
+	pxor	xmm6,	xmm6		; only the SAD accumulator restarts; xmm5/xmm4 keep running over all 16 rows
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	paddd	xmm7,		xmm6
+	movd	[edx+8],	xmm6
+	psrldq	xmm6,		8
+	movd	[edx+12],	xmm6
+
+	mov		ebp,	[psum16x16]
+	movdqa	xmm1,	xmm5
+	psrldq	xmm1,	8
+	paddd	xmm5,	xmm1		; low dword = dword 0 + dword 2 of xmm5
+	movd	[ebp],	xmm5
+	add		dword [psum16x16], 4	; the output cursor lives in the argument slot
+; Fold all four dwords of xmm4 into its low dword (8-byte shift, then 4-byte shift).
+	movdqa	xmm5,	xmm4
+	psrldq	xmm5,	8
+	paddd	xmm4,	xmm5
+	movdqa	xmm3,	xmm4
+	psrldq	xmm3,	4
+	paddd	xmm4,	xmm3
+
+	mov		ebp,	[psqsum16x16]
+	movd	[ebp],	xmm4
+	add		dword [psqsum16x16], 4	; the output cursor lives in the argument slot
+
+	add		edx,	16			; next block's four psad8x8 dwords
+	sub		esi,	eax			; back up 16 rows (eax = iPicStride*16) ...
+	sub		edi,	eax
+	add		esi,	16			; ... and move 16 bytes to the right
+	add		edi,	16
+
+	dec		ecx
+	jnz		var_width_loop
+
+	mov		esi,	[tmp_esi]	; back to the start of this block row
+	mov		edi,	[tmp_edi]
+	add		esi,	eax			; then down to the next row of 16x16 blocks
+	add		edi,	eax
+
+	dec	dword [iPicHeight]		; remaining block rows, counted in the argument slot itself
+	jnz		var_height_loop
+
+	mov		edx,	[psadframe]
+	movdqa	xmm5,	xmm7
+	psrldq	xmm7,	8
+	paddd	xmm7,	xmm5		; low dword = dword 0 + dword 2 of the frame accumulator
+	movd	[edx],	xmm7
+
+	add		esp,	localsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		cur_data
+%undef		ref_data
+%undef		iPicWidth
+%undef		iPicHeight
+%undef		iPicStride
+%undef		psadframe
+%undef		psad8x8
+%undef		psum16x16
+%undef		psqsum16x16
+%undef		tmp_esi
+%undef		tmp_edi
+%undef		pushsize
+%undef		localsize
+	ret
+
+
+
+WELS_EXTERN VAACalcSadSsd_sse2
+;*************************************************************************************************************
+;void VAACalcSadSsd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;	int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
+;*************************************************************************************************************
+; Per 16x16 block: writes four dwords to psad8x8 and one dword each to psum16x16, psqsum16x16 and
+; psqdiff16x16; the total of the psad8x8 values goes to *psadframe. Block counts are width>>4 by height>>4.
+ALIGN 16
+VAACalcSadSsd_sse2:
+%define		localsize		12
+%define		cur_data			esp + pushsize + localsize + 4
+%define		ref_data			esp + pushsize + localsize + 8
+%define		iPicWidth			esp + pushsize + localsize + 12
+%define		iPicHeight			esp + pushsize + localsize + 16
+%define		iPicStride			esp + pushsize + localsize + 20
+%define		psadframe			esp + pushsize + localsize + 24
+%define		psad8x8				esp + pushsize + localsize + 28
+%define		psum16x16			esp + pushsize + localsize + 32
+%define		psqsum16x16			esp + pushsize + localsize + 36
+%define		psqdiff16x16		esp + pushsize + localsize + 40
+%define		tmp_esi				esp + 0
+%define		tmp_edi				esp + 4
+%define		tmp_sadframe		esp + 8
+%define		pushsize		16
+	push	ebp					; four pushes below = pushsize (16) bytes
+	push	esi
+	push	edi
+	push	ebx
+	sub		esp,	localsize
+; Locals at [esp+0..11]: the saved row-start pointers (tmp_esi, tmp_edi) and the
+; running frame SAD (tmp_sadframe).
+	mov		esi,	[cur_data]
+	mov		edi,	[ref_data]
+	mov		ebx,	[iPicStride]
+	mov		edx,	[psad8x8]		; edx = psad8x8 output cursor, advanced 16 bytes per block
+	mov		eax,	ebx
+
+	shr		dword [iPicWidth],	4					; iPicWidth/16  (rewrites the stack argument in place)
+	shr		dword [iPicHeight],	4					; iPicHeight/16 (rewrites the stack argument in place)
+	shl		eax,	4							; iPicStride*16
+; ecx (blocks left in the current block row) is loaded at the top of
+; sqdiff_height_loop, so it needs no initialisation here.
+	pxor	xmm0,	xmm0
+	movd	[tmp_sadframe],	xmm0	; frame SAD starts at zero
+sqdiff_height_loop:
+	mov		ecx,	dword [iPicWidth]
+	mov		[tmp_esi],	esi		; remember where this block row starts
+	mov		[tmp_edi],	edi
+sqdiff_width_loop:
+	pxor	xmm7,	xmm7		; hiQuad_loQuad pSad8x8
+	pxor	xmm6,	xmm6		; pSum16x16
+	pxor	xmm5,	xmm5		; sqsum_16x16  four dword
+	pxor	xmm4,	xmm4		; sqdiff_16x16	four Dword
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	movdqa	xmm1,		xmm7
+	movd	[edx],		xmm7		; dword 0 of xmm7 (low 64-bit half)
+	psrldq	xmm7,		8
+	paddd	xmm1,		xmm7
+	movd	[edx+4],	xmm7		; dword 2 of xmm7 (high 64-bit half)
+	movd	ebp,		xmm1		; sum of the two values just stored
+	add		[tmp_sadframe],	ebp
+; NOTE(review): the 16x1 macro is defined elsewhere - presumably one 16-pixel row per call, esi/edi += ebx; confirm.
+	pxor	xmm7,	xmm7		; only the SAD accumulator restarts; xmm6/xmm5/xmm4 run over all 16 rows
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	movdqa	xmm1,		xmm7
+	movd	[edx+8],	xmm7
+	psrldq	xmm7,		8
+	paddd	xmm1,		xmm7
+	movd	[edx+12],	xmm7
+	movd	ebp,		xmm1
+	add		[tmp_sadframe],	ebp
+
+	mov		ebp,	[psum16x16]
+	movdqa	xmm1,	xmm6
+	psrldq	xmm1,	8
+	paddd	xmm6,	xmm1		; low dword = dword 0 + dword 2 of xmm6
+	movd	[ebp],	xmm6
+	add		dword [psum16x16], 4	; the output cursor lives in the argument slot
+; Fold the four dwords of xmm5: bring dwords 2,3 down and add, then bring dword 1 down and add.
+	mov		ebp,	[psqsum16x16]
+	pshufd	xmm6,	xmm5,	14 ;00001110
+	paddd	xmm6,	xmm5
+	pshufd	xmm5,	xmm6,	1  ;00000001
+	paddd	xmm5,	xmm6
+	movd	[ebp],	xmm5
+	add		dword [psqsum16x16], 4
+; Same four-dword fold for xmm4.
+	mov		ebp,	[psqdiff16x16]
+	pshufd	xmm5,	xmm4,	14	; 00001110
+	paddd	xmm5,	xmm4
+	pshufd	xmm4,	xmm5,	1	; 00000001
+	paddd	xmm4,	xmm5
+	movd	[ebp],	xmm4
+	add		dword	[psqdiff16x16],	4
+
+	add		edx,	16			; next block's four psad8x8 dwords
+	sub		esi,	eax			; back up 16 rows (eax = iPicStride*16) ...
+	sub		edi,	eax
+	add		esi,	16			; ... and move 16 bytes to the right
+	add		edi,	16
+
+	dec		ecx
+	jnz		sqdiff_width_loop
+
+	mov		esi,	[tmp_esi]	; back to the start of this block row
+	mov		edi,	[tmp_edi]
+	add		esi,	eax			; then down to the next row of 16x16 blocks
+	add		edi,	eax
+
+	dec	dword [iPicHeight]		; remaining block rows, counted in the argument slot itself
+	jnz		sqdiff_height_loop
+
+	mov		ebx,	[tmp_sadframe]
+	mov		eax,	[psadframe]
+	mov		[eax],	ebx			; *psadframe = accumulated frame SAD
+
+	add		esp,	localsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		cur_data
+%undef		ref_data
+%undef		iPicWidth
+%undef		iPicHeight
+%undef		iPicStride
+%undef		psadframe
+%undef		psad8x8
+%undef		psum16x16
+%undef		psqsum16x16
+%undef		psqdiff16x16
+%undef		tmp_esi
+%undef		tmp_edi
+%undef		tmp_sadframe
+%undef		pushsize
+%undef		localsize
+	ret
+
+
+
+
+
+WELS_EXTERN VAACalcSadBgd_sse2
+;*************************************************************************************************************
+;void VAACalcSadBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;				int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;*************************************************************************************************************
+; Per 16x16 block: writes four dwords to psad8x8, four dwords (sum_cur - sum_ref) to p_sd8x8 and
+; four bytes to p_mad8x8; the total of the psad8x8 values goes to *psadframe.
+ALIGN 16
+VAACalcSadBgd_sse2:
+%define		localsize		12
+%define		cur_data			esp + pushsize + localsize + 4
+%define		ref_data			esp + pushsize + localsize + 8
+%define		iPicWidth			esp + pushsize + localsize + 12
+%define		iPicHeight			esp + pushsize + localsize + 16
+%define		iPicStride			esp + pushsize + localsize + 20
+%define		psadframe			esp + pushsize + localsize + 24
+%define		psad8x8				esp + pushsize + localsize + 28
+%define		p_sd8x8				esp + pushsize + localsize + 32
+%define		p_mad8x8			esp + pushsize + localsize + 36
+%define		tmp_esi				esp + 0
+%define		tmp_edi				esp + 4
+%define		tmp_ecx				esp + 8
+%define		pushsize		16
+	push	ebp					; four pushes below = pushsize (16) bytes
+	push	esi
+	push	edi
+	push	ebx
+	sub		esp,	localsize	; room for tmp_esi / tmp_edi / tmp_ecx
+	mov		esi,	[cur_data]
+	mov		edi,	[ref_data]
+	mov		ebx,	[iPicStride]
+	mov		eax,	ebx
+
+	shr		dword [iPicWidth],	4					; iPicWidth/16  (rewrites the stack argument in place)
+	shr		dword [iPicHeight],	4					; iPicHeight/16 (rewrites the stack argument in place)
+	shl		eax,	4							; iPicStride*16
+	xor		ebp,	ebp			; ebp = running frame SAD
+	pxor	xmm0,	xmm0
+bgd_height_loop:
+	mov		ecx,	dword [iPicWidth]		; ecx = 16x16 blocks left in this block row
+	mov		[tmp_esi],	esi		; remember where this block row starts
+	mov		[tmp_edi],	edi
+bgd_width_loop:
+	pxor	xmm7,	xmm7		; pSad8x8
+	pxor	xmm6,	xmm6		; sum_cur_8x8
+	pxor	xmm5,	xmm5		; sum_ref_8x8
+	pxor	xmm4,	xmm4		; pMad8x8
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+; NOTE(review): WELS_SAD_SD_MAD_16x1_SSE2 and WELS_MAX_REG_SSE2 are defined elsewhere. Presumably the first handles
+; one 16-pixel row (esi/edi += ebx) and the second reduces each 8-byte half to its maximum byte - confirm.
+	mov			edx,		[p_mad8x8]
+	WELS_MAX_REG_SSE2	xmm4
+; The active code below stores one byte per 64-bit half of xmm4 (2 bytes in total).
+	;movdqa		xmm1,	xmm4
+	;punpcklbw	xmm1,	xmm0
+	;punpcklwd	xmm1,	xmm0
+	;movd		[edx],	xmm1
+	;punpckhbw	xmm4,	xmm0
+	;punpcklwd	xmm4,	xmm0
+	;movd		[edx+4],	xmm4
+	;add			edx,		8
+	;mov			[p_mad8x8],	edx
+	mov			[tmp_ecx],	ecx		; ecx/cl is about to be used as scratch; restored before "dec ecx"
+	movhlps		xmm1,	xmm4		; high 64-bit half of xmm4 -> low half of xmm1
+	movd		ecx,	xmm4
+	mov			[edx],	cl			; lowest byte of the low half
+	movd		ecx,	xmm1
+	mov			[edx+1],cl			; lowest byte of the high half
+	add			edx,	2
+	mov			[p_mad8x8],	edx		; the output cursor lives in the argument slot
+; Shift the three accumulators up one dword: the rows-0..7 results move to dwords 1 and 3,
+; leaving dwords 0 and 2 clear for rows 8..15.
+	pslldq		xmm7,	4
+	pslldq		xmm6,	4
+	pslldq		xmm5,	4
+
+
+	pxor	xmm4,	xmm4		; pMad8x8
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+
+	mov			edx,		[p_mad8x8]
+	WELS_MAX_REG_SSE2	xmm4
+; Same two-byte store as after the first eight rows.
+	;movdqa		xmm1,	xmm4
+	;punpcklbw	xmm1,	xmm0
+	;punpcklwd	xmm1,	xmm0
+	;movd		[edx],	xmm1
+	;punpckhbw	xmm4,	xmm0
+	;punpcklwd	xmm4,	xmm0
+	;movd		[edx+4],	xmm4
+	;add			edx,		8
+	;mov			[p_mad8x8],	edx
+	movhlps		xmm1,	xmm4
+	movd		ecx,	xmm4
+	mov			[edx],	cl
+	movd		ecx,	xmm1
+	mov			[edx+1],cl
+	add			edx,	2
+	mov			[p_mad8x8],	edx
+
+	; data in xmm7, xmm6, xmm5:  D1 D3 D0 D2
+
+	mov		edx,	[psad8x8]
+	pshufd	xmm1,	xmm7,	10001101b		; D3 D2 D1 D0
+	movdqa	[edx],	xmm1					; movdqa store: psad8x8 must be 16-byte aligned
+	add		edx,	16
+	mov		[psad8x8],	edx					; sad8x8
+; Low dword of xmm1 after the two adds below = D0+D1+D2+D3 for this block.
+	paddd	xmm1,	xmm7					; D1+3 D3+2 D0+1 D2+0
+	pshufd	xmm2,	xmm1,	00000011b
+	paddd	xmm1,	xmm2
+	movd	edx,	xmm1
+	add		ebp,	edx						; sad frame
+
+	mov		edx,	[p_sd8x8]
+	psubd	xmm6,	xmm5					; sum_cur - sum_ref, per dword
+	pshufd	xmm1,	xmm6,	10001101b		; same reordering as used for psad8x8
+	movdqa	[edx],	xmm1					; movdqa store: p_sd8x8 must be 16-byte aligned
+	add		edx,	16
+	mov		[p_sd8x8],	edx
+
+; NOTE(review): edx is reloaded before its next visible use, so the add below looks dead unless a macro reads edx.
+	add		edx,	16
+	sub		esi,	eax			; back up 16 rows (eax = iPicStride*16) ...
+	sub		edi,	eax
+	add		esi,	16			; ... and move 16 bytes to the right
+	add		edi,	16
+
+	mov		ecx,	[tmp_ecx]	; recover the block counter saved before cl was used as scratch
+	dec		ecx
+	jnz		bgd_width_loop
+
+	mov		esi,	[tmp_esi]	; back to the start of this block row
+	mov		edi,	[tmp_edi]
+	add		esi,	eax			; then down to the next row of 16x16 blocks
+	add		edi,	eax
+
+	dec		dword [iPicHeight]	; remaining block rows, counted in the argument slot itself
+	jnz		bgd_height_loop
+
+	mov		edx,	[psadframe]
+	mov		[edx],	ebp			; *psadframe = accumulated frame SAD
+; NOTE(review): tmp_ecx is %define'd above but has no matching %undef in the list below.
+	add		esp,	localsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		cur_data
+%undef		ref_data
+%undef		iPicWidth
+%undef		iPicHeight
+%undef		iPicStride
+%undef		psadframe
+%undef		psad8x8
+%undef		p_sd8x8
+%undef		p_mad8x8
+%undef		tmp_esi
+%undef		tmp_edi
+%undef		pushsize
+%undef		localsize
+	ret
+
+
+
+WELS_EXTERN VAACalcSadSsdBgd_sse2
+;*************************************************************************************************************
+;void VAACalcSadSsdBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;		 int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
+;			int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;*************************************************************************************************************
+; Per 16x16 block: four dwords each to psad8x8 and p_sd8x8, four bytes to p_mad8x8, and one dword each
+; to psum16x16, psqsum16x16 and psqdiff16x16; the total of the psad8x8 values goes to *psadframe.
+ALIGN 16
+VAACalcSadSsdBgd_sse2:
+%define		localsize		16
+%define		cur_data			esp + pushsize + localsize + 4
+%define		ref_data			esp + pushsize + localsize + 8
+%define		iPicWidth			esp + pushsize + localsize + 12
+%define		iPicHeight			esp + pushsize + localsize + 16
+%define		iPicStride			esp + pushsize + localsize + 20
+%define		psadframe			esp + pushsize + localsize + 24
+%define		psad8x8				esp + pushsize + localsize + 28
+%define		psum16x16			esp + pushsize + localsize + 32
+%define		psqsum16x16			esp + pushsize + localsize + 36
+%define		psqdiff16x16		esp + pushsize + localsize + 40
+%define		p_sd8x8				esp + pushsize + localsize + 44
+%define		p_mad8x8			esp + pushsize + localsize + 48
+%define		tmp_esi				esp + 0
+%define		tmp_edi				esp + 4
+%define		tmp_sadframe		esp + 8
+%define		tmp_ecx				esp + 12
+%define		pushsize		16
+	push	ebp					; four pushes below = pushsize (16) bytes
+	push	esi
+	push	edi
+	push	ebx
+	sub		esp,	localsize	; room for tmp_esi / tmp_edi / tmp_sadframe / tmp_ecx
+	mov		esi,	[cur_data]
+	mov		edi,	[ref_data]
+	mov		ebx,	[iPicStride]
+	mov		eax,	ebx
+
+	shr		dword [iPicWidth],	4					; iPicWidth/16  (rewrites the stack argument in place)
+	shr		dword [iPicHeight],	4					; iPicHeight/16 (rewrites the stack argument in place)
+	shl		eax,	4							; iPicStride*16
+	pxor	xmm0,	xmm0
+	movd	[tmp_sadframe],	xmm0	; frame SAD starts at zero
+sqdiff_bgd_height_loop:
+	mov		ecx,	dword [iPicWidth]		; ecx = 16x16 blocks left in this block row
+	mov		[tmp_esi],	esi		; remember where this block row starts
+	mov		[tmp_edi],	edi
+sqdiff_bgd_width_loop:
+	pxor	xmm7,	xmm7		; pSad8x8 interleaves sqsum16x16:  sqsum1 sad1 sqsum0 sad0
+	pxor	xmm6,	xmm6		; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
+	pxor	xmm5,	xmm5		; pMad8x8
+	pxor	xmm4,	xmm4		; sqdiff_16x16	four Dword
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+; NOTE(review): the 16x1 macro and WELS_MAX_REG_SSE2 are defined elsewhere; their exact effect should be confirmed there.
+	mov		edx,		[psad8x8]
+	movdqa	xmm2,		xmm7
+	pshufd	xmm1,		xmm2,		00001110b	; bring dword 2 (sad1) down to dword 0
+	movd	[edx],		xmm2					; sad0
+	movd	[edx+4],	xmm1					; sad1
+	add		edx,		8
+	mov		[psad8x8],	edx			; sad8x8
+
+	paddd	xmm1,				xmm2			; low dword = sad0 + sad1
+	movd	edx,				xmm1
+	add		[tmp_sadframe],		edx			; iFrameSad
+
+	mov		edx,		[psum16x16]
+	movdqa	xmm1,		xmm6
+	pshufd	xmm2,		xmm1,		00001110b
+	paddd	xmm1,		xmm2					; low dword = Scur0 + Scur1
+	movd	[edx],		xmm1				; sum
+; The store above writes the rows-0..7 part of the sum; the rows-8..15 part is added to it further down.
+	mov		edx,		[p_sd8x8]
+	pshufd	xmm1,		xmm6,		11110101b			; Sref1 Sref1 Sref0 Sref0
+	psubd	xmm6,		xmm1		; 00 diff1 00 diff0
+	pshufd	xmm1,		xmm6,		00001000b			;  xx xx diff1 diff0
+	movq	[edx],		xmm1					; two dwords: diff0, diff1
+	add		edx,		8
+	mov		[p_sd8x8],	edx
+
+	mov			edx,		[p_mad8x8]
+	WELS_MAX_REG_SSE2	xmm5
+	;movdqa		xmm1,	xmm5
+	;punpcklbw	xmm1,	xmm0
+	;punpcklwd	xmm1,	xmm0
+	;movd		[edx],	xmm1
+	;punpckhbw	xmm5,	xmm0
+	;punpcklwd	xmm5,	xmm0
+	;movd		[edx+4],	xmm5
+	;add			edx,		8
+	;mov			[p_mad8x8],	edx
+	mov			[tmp_ecx],	ecx		; ecx/cl is about to be used as scratch; restored before "dec ecx"
+	movhlps		xmm1,	xmm5		; high 64-bit half of xmm5 -> low half of xmm1
+	movd		ecx,	xmm5
+	mov			[edx],	cl			; lowest byte of the low half
+	movd		ecx,	xmm1
+	mov			[edx+1],cl			; lowest byte of the high half
+	add			edx,	2
+	mov			[p_mad8x8],	edx
+; Rows 8..15: the SAD dwords restart while the sqsum dwords of xmm7 and all of xmm4 keep accumulating.
+	psrlq	xmm7,	32
+	psllq	xmm7,	32			; clear sad
+	pxor	xmm6,	xmm6		; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
+	pxor	xmm5,	xmm5		; pMad8x8
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+
+	mov		edx,		[psad8x8]
+	movdqa	xmm2,		xmm7
+	pshufd	xmm1,		xmm2,		00001110b
+	movd	[edx],		xmm2
+	movd	[edx+4],	xmm1
+	add		edx,		8
+	mov		[psad8x8],	edx			; sad8x8
+
+	paddd	xmm1,				xmm2
+	movd	edx,				xmm1
+	add		[tmp_sadframe],		edx			; iFrameSad
+
+	mov		edx,			[psum16x16]
+	movdqa	xmm1,			xmm6
+	pshufd	xmm2,			xmm1,		00001110b
+	paddd	xmm1,			xmm2
+	movd	ebp,			xmm1				; sum
+	add		[edx],			ebp					; add the rows-8..15 part to the value stored earlier
+	add		edx,			4
+	mov		[psum16x16],	edx
+
+	mov		edx,			[psqsum16x16]
+	psrlq	xmm7,			32					; move the sqsum dwords down to dwords 0 and 2
+	pshufd	xmm2,			xmm7,		00001110b
+	paddd	xmm2,			xmm7				; low dword = sqsum0 + sqsum1
+	movd	[edx],			xmm2				; sqsum
+	add		edx,			4
+	mov		[psqsum16x16],	edx
+
+	mov		edx,		[p_sd8x8]
+	pshufd	xmm1,		xmm6,		11110101b			; Sref1 Sref1 Sref0 Sref0
+	psubd	xmm6,		xmm1		; 00 diff1 00 diff0
+	pshufd	xmm1,		xmm6,		00001000b			;  xx xx diff1 diff0
+	movq	[edx],		xmm1
+	add		edx,		8
+	mov		[p_sd8x8],	edx
+
+	mov		edx,		[p_mad8x8]
+	WELS_MAX_REG_SSE2	xmm5
+	;movdqa		xmm1,	xmm5
+	;punpcklbw	xmm1,	xmm0
+	;punpcklwd	xmm1,	xmm0
+	;movd		[edx],	xmm1
+	;punpckhbw	xmm5,	xmm0
+	;punpcklwd	xmm5,	xmm0
+	;movd		[edx+4],	xmm5
+	;add			edx,		8
+	;mov			[p_mad8x8],	edx
+	movhlps		xmm1,	xmm5
+	movd		ecx,	xmm5
+	mov			[edx],	cl
+	movd		ecx,	xmm1
+	mov			[edx+1],cl
+	add			edx,	2
+	mov			[p_mad8x8],	edx
+; Fold the four dwords of xmm4 into its low dword and store it.
+	mov		edx,		[psqdiff16x16]
+	pshufd	xmm1,		xmm4,		00001110b
+	paddd	xmm4,		xmm1
+	pshufd	xmm1,		xmm4,		00000001b
+	paddd	xmm4,		xmm1
+	movd	[edx],		xmm4
+	add		edx,		4
+	mov		[psqdiff16x16],	edx
+; NOTE(review): edx is reloaded before its next visible use, so the add below looks dead unless a macro reads edx.
+	add		edx,	16
+	sub		esi,	eax			; back up 16 rows (eax = iPicStride*16) ...
+	sub		edi,	eax
+	add		esi,	16			; ... and move 16 bytes to the right
+	add		edi,	16
+
+	mov		ecx,	[tmp_ecx]	; recover the block counter saved before cl was used as scratch
+	dec		ecx
+	jnz		sqdiff_bgd_width_loop
+
+	mov		esi,	[tmp_esi]	; back to the start of this block row
+	mov		edi,	[tmp_edi]
+	add		esi,	eax			; then down to the next row of 16x16 blocks
+	add		edi,	eax
+
+	dec	dword [iPicHeight]		; remaining block rows, counted in the argument slot itself
+	jnz		sqdiff_bgd_height_loop
+
+	mov		edx,	[psadframe]
+	mov		ebp,	[tmp_sadframe]
+	mov		[edx],	ebp			; *psadframe = accumulated frame SAD
+; NOTE(review): tmp_sadframe and tmp_ecx are %define'd above but have no matching %undef in the list below.
+	add		esp,	localsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		cur_data
+%undef		ref_data
+%undef		iPicWidth
+%undef		iPicHeight
+%undef		iPicStride
+%undef		psadframe
+%undef		psad8x8
+%undef		psum16x16
+%undef		psqsum16x16
+%undef		psqdiff16x16
+%undef		p_sd8x8
+%undef		p_mad8x8
+%undef		tmp_esi
+%undef		tmp_edi
+%undef		pushsize
+%undef		localsize
+	ret
--- a/processing/src/backgounddetection/BackgroundDetection.cpp
+++ b/processing/src/backgounddetection/BackgroundDetection.cpp
@@ -1,419 +1,389 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "BackgroundDetection.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-#define LOG2_BGD_OU_SIZE    (4)
-#define LOG2_BGD_OU_SIZE_UV (LOG2_BGD_OU_SIZE-1)
-#define BGD_OU_SIZE         (1<<LOG2_BGD_OU_SIZE)
-#define BGD_OU_SIZE_UV      (BGD_OU_SIZE>>1)
-#define BGD_THD_SAD         (2*BGD_OU_SIZE*BGD_OU_SIZE)
-#define	BGD_THD_ASD_UV      (4*BGD_OU_SIZE_UV)
-#define LOG2_MB_SIZE        (4)
-#define OU_SIZE_IN_MB       (BGD_OU_SIZE >> 4)
-#define Q_FACTOR            (8)
-#define BGD_DELTA_QP_THD    (3)
-
-#define OU_LEFT		(0x01)
-#define OU_RIGHT	(0x02)
-#define OU_TOP		(0x04)
-#define OU_BOTTOM	(0x08)
-
-CBackgroundDetection::CBackgroundDetection(int32_t iCpuFlag)
-{
-	m_eMethod = METHOD_BACKGROUND_DETECTION;
-	WelsMemset(&m_BgdParam, 0, sizeof(m_BgdParam));
-	m_iLargestFrameSize = 0;
-}
-
-CBackgroundDetection::~CBackgroundDetection()
-{
-	FreeOUArrayMemory();
-}
-
-EResult CBackgroundDetection::Process(int32_t iType, SPixMap *pSrcPixMap, SPixMap *pRefPixMap)
-{
-	EResult eReturn = RET_INVALIDPARAM;	
-
-	if (pSrcPixMap==NULL || pRefPixMap==NULL)
-		return eReturn;
-
-	m_BgdParam.pCur[0] = (uint8_t *)pSrcPixMap->pPixel[0];
-	m_BgdParam.pCur[1] = (uint8_t *)pSrcPixMap->pPixel[1];
-	m_BgdParam.pCur[2] = (uint8_t *)pSrcPixMap->pPixel[2];
-	m_BgdParam.pRef[0] = (uint8_t *)pRefPixMap->pPixel[0];
-	m_BgdParam.pRef[1] = (uint8_t *)pRefPixMap->pPixel[1];
-	m_BgdParam.pRef[2] = (uint8_t *)pRefPixMap->pPixel[2];
-	m_BgdParam.iBgdWidth = pSrcPixMap->sRect.iRectWidth;
-	m_BgdParam.iBgdHeight = pSrcPixMap->sRect.iRectHeight;
-	m_BgdParam.iStride[0] = pSrcPixMap->iStride[0];
-	m_BgdParam.iStride[1] = pSrcPixMap->iStride[1];
-	m_BgdParam.iStride[2] = pSrcPixMap->iStride[2];
-
-	int32_t iCurFrameSize = m_BgdParam.iBgdWidth * m_BgdParam.iBgdHeight;
-	if (m_BgdParam.pOU_array == NULL || iCurFrameSize > m_iLargestFrameSize)
-	{
-		FreeOUArrayMemory();
-		m_BgdParam.pOU_array = AllocateOUArrayMemory(m_BgdParam.iBgdWidth, m_BgdParam.iBgdHeight);
-		m_iLargestFrameSize = iCurFrameSize;
-	} 
-
-	if (m_BgdParam.pOU_array == NULL)
-		return eReturn;
-
-	BackgroundDetection(&m_BgdParam);
-
-	return RET_SUCCESS;
-}
-
-EResult CBackgroundDetection::Set(int32_t iType, void *pParam)
-{
-	if (pParam == NULL)
-	{
-		return RET_INVALIDPARAM;
-	}
-
-	SBGDInterface *pInterface = (SBGDInterface *)pParam;
-
-	m_BgdParam.pBackgroundMbFlag = (int8_t *)pInterface->pBackgroundMbFlag;
-	m_BgdParam.pCalcRes = pInterface->pCalcRes;
-
-	return RET_SUCCESS;
-}
-
-inline SBackgroundOU* CBackgroundDetection::AllocateOUArrayMemory(int32_t iWidth, int32_t iHeight)
-{
-	int32_t	iMaxOUWidth	= (BGD_OU_SIZE-1+iWidth)>>LOG2_BGD_OU_SIZE;
-	int32_t	iMaxOUHeight	= (BGD_OU_SIZE-1+iHeight)>>LOG2_BGD_OU_SIZE;
-	return (SBackgroundOU *)WelsMalloc( iMaxOUWidth * iMaxOUHeight * sizeof(SBackgroundOU) );
-}
-
-inline void CBackgroundDetection::FreeOUArrayMemory()
-{
-	_SafeFree(m_BgdParam.pOU_array);
-}
-
-void CBackgroundDetection::GetOUParameters( SVAACalcResult *sVaaCalcInfo, int32_t iMbIndex, int32_t iMbWidth, SBackgroundOU* pBgdOU)
-{
-	int32_t	iSubSD[4];
-	uint8_t	iSubMAD[4];
-	int32_t	iSubSAD[4];
-
-	uint8_t (*pMad8x8)[4];
-	int32_t (*pSad8x8)[4];
-	int32_t (*pSd8x8)[4];
-
-	pSad8x8 = sVaaCalcInfo->pSad8x8;
-	pMad8x8 = sVaaCalcInfo->pMad8x8;
-	pSd8x8  = sVaaCalcInfo->pSumOfDiff8x8;
-
-	iSubSAD[0] = pSad8x8[iMbIndex][0];
-	iSubSAD[1] = pSad8x8[iMbIndex][1];
-	iSubSAD[2] = pSad8x8[iMbIndex][2];
-	iSubSAD[3] = pSad8x8[iMbIndex][3];
-
-	iSubSD[0] = pSd8x8[iMbIndex][0];
-	iSubSD[1] = pSd8x8[iMbIndex][1];
-	iSubSD[2] = pSd8x8[iMbIndex][2];
-	iSubSD[3] = pSd8x8[iMbIndex][3];
-
-	iSubMAD[0] = pMad8x8[iMbIndex][0];
-	iSubMAD[1] = pMad8x8[iMbIndex][1];
-	iSubMAD[2] = pMad8x8[iMbIndex][2];
-	iSubMAD[3] = pMad8x8[iMbIndex][3];
-
-	pBgdOU->iSD	= iSubSD[0] + iSubSD[1] + iSubSD[2] + iSubSD[3];
-	pBgdOU->iSAD	= iSubSAD[0] + iSubSAD[1] + iSubSAD[2] + iSubSAD[3];
-	pBgdOU->iSD	= WELS_ABS(pBgdOU->iSD);
-
-	// get the max absolute difference (MAD) of OU and min value of the MAD of sub-blocks of OU
-	pBgdOU->iMAD = WELS_MAX(WELS_MAX(iSubMAD[0],iSubMAD[1]), WELS_MAX(iSubMAD[2],iSubMAD[3]));
-	pBgdOU->iMinSubMad = WELS_MIN(WELS_MIN(iSubMAD[0],iSubMAD[1]), WELS_MIN(iSubMAD[2],iSubMAD[3]));
-
-	// get difference between the max and min SD of the SDs of sub-blocks of OU
-	pBgdOU->iMaxDiffSubSd = WELS_MAX(WELS_MAX(iSubSD[0],iSubSD[1]), WELS_MAX(iSubSD[2],iSubSD[3])) -
-		WELS_MIN(WELS_MIN(iSubSD[0],iSubSD[1]), WELS_MIN(iSubSD[2],iSubSD[3]));
-}
-
-void CBackgroundDetection::ForegroundBackgroundDivision(vBGDParam *pBgdParam)
-{
-	int32_t iPicWidthInOU	= pBgdParam->iBgdWidth  >> LOG2_BGD_OU_SIZE;
-	int32_t iPicHeightInOU	= pBgdParam->iBgdHeight >> LOG2_BGD_OU_SIZE;
-	int32_t iPicWidthInMb	= (15+pBgdParam->iBgdWidth)>>4;
-
-	SBackgroundOU *pBackgroundOU = pBgdParam->pOU_array;
-
-	for (int32_t j = 0; j < iPicHeightInOU; j ++ ) 
-	{
-		for (int32_t i = 0; i < iPicWidthInOU; i++ )
-		{
-			GetOUParameters( pBgdParam->pCalcRes, (j*iPicWidthInMb+i)<<(LOG2_BGD_OU_SIZE-LOG2_MB_SIZE), iPicWidthInMb, pBackgroundOU);
-
-			pBackgroundOU->iBackgroundFlag = 0;
-			if (pBackgroundOU->iMAD>63)
-			{
-				pBackgroundOU++;
-				continue;
-			}
-			if ((pBackgroundOU->iMaxDiffSubSd<=pBackgroundOU->iSAD>>3 || pBackgroundOU->iMaxDiffSubSd<=(BGD_OU_SIZE*Q_FACTOR)) && pBackgroundOU->iSAD < (BGD_THD_SAD<<1)) //BGD_OU_SIZE*BGD_OU_SIZE>>2
-			{
-				if (pBackgroundOU->iSAD<=BGD_OU_SIZE*Q_FACTOR)
-				{
-					pBackgroundOU->iBackgroundFlag = 1;
-				} 
-				else
-				{
-					pBackgroundOU->iBackgroundFlag = pBackgroundOU->iSAD < BGD_THD_SAD ?
-						(pBackgroundOU->iSD < (pBackgroundOU->iSAD*3)>>2) : 
-					(pBackgroundOU->iSD<<1 < pBackgroundOU->iSAD);
-				}
-			}
-			pBackgroundOU++;
-		}
-	}
-}
-inline int32_t CBackgroundDetection::CalculateAsdChromaEdge( uint8_t *pOriRef, uint8_t *pOriCur, int32_t iStride )
-{
-	int32_t	ASD = 0;
-	int32_t	idx;
-	for( idx = 0; idx < BGD_OU_SIZE_UV; idx++ )
-	{
-		ASD += *pOriCur - *pOriRef;
-		pOriRef += iStride;
-		pOriCur += iStride;
-	}
-	return WELS_ABS(ASD);
-}
-
-inline bool_t CBackgroundDetection::ForegroundDilation23Luma(SBackgroundOU *pBackgroundOU, SBackgroundOU *pOUNeighbours[])
-{
-	SBackgroundOU *pOU_L	= pOUNeighbours[0];
-	SBackgroundOU *pOU_R	= pOUNeighbours[1];
-	SBackgroundOU *pOU_U	= pOUNeighbours[2];
-	SBackgroundOU *pOU_D	= pOUNeighbours[3];
-
-	if (pBackgroundOU->iMAD > pBackgroundOU->iMinSubMad<<1)
-	{
-		int32_t iMaxNbrForegroundMad;
-		int32_t iMaxNbrBackgroundMad;
-		int32_t	aBackgroundMad[4];
-		int32_t	aForegroundMad[4];
-
-		aForegroundMad[0] = (pOU_L->iBackgroundFlag - 1) & pOU_L->iMAD;
-		aForegroundMad[1] = (pOU_R->iBackgroundFlag - 1) & pOU_R->iMAD;
-		aForegroundMad[2] = (pOU_U->iBackgroundFlag - 1) & pOU_U->iMAD;
-		aForegroundMad[3] = (pOU_D->iBackgroundFlag - 1) & pOU_D->iMAD;
-		iMaxNbrForegroundMad = WELS_MAX(WELS_MAX(aForegroundMad[0],aForegroundMad[1]), WELS_MAX(aForegroundMad[2],aForegroundMad[3]));
-
-		aBackgroundMad[0] = ((!pOU_L->iBackgroundFlag) - 1) & pOU_L->iMAD;
-		aBackgroundMad[1] = ((!pOU_R->iBackgroundFlag) - 1) & pOU_R->iMAD;
-		aBackgroundMad[2] = ((!pOU_U->iBackgroundFlag) - 1) & pOU_U->iMAD;
-		aBackgroundMad[3] = ((!pOU_D->iBackgroundFlag) - 1) & pOU_D->iMAD;
-		iMaxNbrBackgroundMad = WELS_MAX(WELS_MAX(aBackgroundMad[0],aBackgroundMad[1]), WELS_MAX(aBackgroundMad[2],aBackgroundMad[3]));
-
-		return ((iMaxNbrForegroundMad > pBackgroundOU->iMinSubMad<<2) || (pBackgroundOU->iMAD > iMaxNbrBackgroundMad<<1 && pBackgroundOU->iMAD <= (iMaxNbrForegroundMad*3)>>1));
-	}
-	return 0;
-}
-
-inline bool_t CBackgroundDetection::ForegroundDilation23Chroma(int8_t iNeighbourForegroundFlags, int32_t iStartSamplePos, int32_t iPicStrideUV, vBGDParam *pBgdParam)
-{
-	static const int8_t kaOUPos[4]	= {OU_LEFT, OU_RIGHT, OU_TOP, OU_BOTTOM};
-	int32_t	aEdgeOffset[4]	= {0, BGD_OU_SIZE_UV-1, 0, iPicStrideUV*(BGD_OU_SIZE_UV-1)};
-	int32_t	iStride[4]		= {iPicStrideUV, iPicStrideUV, 1, 1};
-
-	// V component first, high probability because V stands for red color and human skin colors have more weight on this component
-	for (int32_t i=0;i<4;i++)
-	{
-		if (iNeighbourForegroundFlags & kaOUPos[i])
-		{
-			uint8_t *pRefC = pBgdParam->pRef[2] + iStartSamplePos + aEdgeOffset[i];
-			uint8_t *pCurC = pBgdParam->pCur[2] + iStartSamplePos + aEdgeOffset[i];
-			if (CalculateAsdChromaEdge(pRefC, pCurC, iStride[i]) > BGD_THD_ASD_UV)
-			{
-				return 1;
-			}
-		}
-	}
-	// U component, which stands for blue color, low probability
-	for (int32_t i=0;i<4;i++)
-	{
-		if (iNeighbourForegroundFlags & kaOUPos[i])
-		{
-			uint8_t *pRefC = pBgdParam->pRef[1] + iStartSamplePos + aEdgeOffset[i];
-			uint8_t *pCurC = pBgdParam->pCur[1] + iStartSamplePos + aEdgeOffset[i];
-			if (CalculateAsdChromaEdge(pRefC, pCurC, iStride[i]) > BGD_THD_ASD_UV)
-			{
-				return 1;
-			}
-		}
-	}
-
-	return 0;
-}
-
-inline void CBackgroundDetection::ForegroundDilation(SBackgroundOU *pBackgroundOU, SBackgroundOU *pOUNeighbours[], vBGDParam *pBgdParam, int32_t	iChromaSampleStartPos)
-{
-	int32_t iPicStrideUV	= pBgdParam->iStride[1];
-	int32_t iSumNeighBackgroundFlags	= pOUNeighbours[0]->iBackgroundFlag + pOUNeighbours[1]->iBackgroundFlag + pOUNeighbours[2]->iBackgroundFlag + pOUNeighbours[3]->iBackgroundFlag;
-
-	if (pBackgroundOU->iSAD>BGD_OU_SIZE*Q_FACTOR)
-	{
-		switch (iSumNeighBackgroundFlags)
-		{
-		case 0:
-		case 1:
-			pBackgroundOU->iBackgroundFlag = 0;
-			break;
-		case 2:
-		case 3:
-			pBackgroundOU->iBackgroundFlag = !ForegroundDilation23Luma(pBackgroundOU, pOUNeighbours);
-
-			// chroma component check
-			if (pBackgroundOU->iBackgroundFlag==1)
-			{
-				int8_t	iNeighbourForegroundFlags = !pOUNeighbours[0]->iBackgroundFlag | ((!pOUNeighbours[1]->iBackgroundFlag)<<1)
-					| ((!pOUNeighbours[2]->iBackgroundFlag)<<2) | ((!pOUNeighbours[3]->iBackgroundFlag)<<3);
-				pBackgroundOU->iBackgroundFlag = !ForegroundDilation23Chroma(iNeighbourForegroundFlags, iChromaSampleStartPos, iPicStrideUV, pBgdParam);
-			}
-			break;
-		default:
-			break;
-		}
-	}
-}
-inline void CBackgroundDetection::BackgroundErosion(SBackgroundOU *pBackgroundOU, SBackgroundOU *pOUNeighbours[])
-{
-	if (pBackgroundOU->iMaxDiffSubSd <= (BGD_OU_SIZE*Q_FACTOR)) //BGD_OU_SIZE*BGD_OU_SIZE>>2
-	{
-		int32_t	iSumNeighBackgroundFlags = pOUNeighbours[0]->iBackgroundFlag + pOUNeighbours[1]->iBackgroundFlag + pOUNeighbours[2]->iBackgroundFlag + pOUNeighbours[3]->iBackgroundFlag;
-		int32_t	sumNbrBGsad = (pOUNeighbours[0]->iSAD&(-pOUNeighbours[0]->iBackgroundFlag)) + (pOUNeighbours[2]->iSAD&(-pOUNeighbours[2]->iBackgroundFlag))
-			+ (pOUNeighbours[1]->iSAD&(-pOUNeighbours[1]->iBackgroundFlag)) + (pOUNeighbours[3]->iSAD&(-pOUNeighbours[3]->iBackgroundFlag));
-		if (pBackgroundOU->iSAD*iSumNeighBackgroundFlags <= (3*sumNbrBGsad)>>1)
-		{
-			if (iSumNeighBackgroundFlags==4)
-			{
-				pBackgroundOU->iBackgroundFlag = 1;
-			} 
-			else
-			{
-				if ((pOUNeighbours[0]->iBackgroundFlag & pOUNeighbours[1]->iBackgroundFlag) || (pOUNeighbours[2]->iBackgroundFlag & pOUNeighbours[3]->iBackgroundFlag))
-				{
-					pBackgroundOU->iBackgroundFlag = !ForegroundDilation23Luma(pBackgroundOU, pOUNeighbours);
-				}
-			}
-		}
-	}
-}
-
-inline void CBackgroundDetection::SetBackgroundMbFlag(int8_t *pBackgroundMbFlag,int32_t iPicWidthInMb, int32_t iBackgroundMbFlag)
-{
-	*pBackgroundMbFlag = iBackgroundMbFlag;
-}
-
-inline void CBackgroundDetection::UpperOUForegroundCheck(SBackgroundOU *pCurOU, int8_t *pBackgroundMbFlag, int32_t iPicWidthInOU, int32_t iPicWidthInMb)
-{
-	if (pCurOU->iSAD > BGD_OU_SIZE*Q_FACTOR)
-	{
-		SBackgroundOU	*pOU_L = pCurOU-1;
-		SBackgroundOU	*pOU_R = pCurOU+1;
-		SBackgroundOU	*pOU_U = pCurOU-iPicWidthInOU;
-		SBackgroundOU	*pOU_D = pCurOU+iPicWidthInOU; 
-		if (pOU_L->iBackgroundFlag + pOU_R->iBackgroundFlag + pOU_U->iBackgroundFlag + pOU_D->iBackgroundFlag <= 1)
-		{
-			SetBackgroundMbFlag(pBackgroundMbFlag,iPicWidthInMb,0);
-			pCurOU->iBackgroundFlag = 0;
-		}
-	}
-}
-
-void CBackgroundDetection::ForegroundDilationAndBackgroundErosion(vBGDParam *pBgdParam)
-{
-	int32_t iPicStrideUV		= pBgdParam->iStride[1];
-	int32_t iPicWidthInOU	= pBgdParam->iBgdWidth  >> LOG2_BGD_OU_SIZE;
-	int32_t iPicHeightInOU	= pBgdParam->iBgdHeight >> LOG2_BGD_OU_SIZE;
-	int32_t iOUStrideUV		= iPicStrideUV << (LOG2_BGD_OU_SIZE-1);
-	int32_t iPicWidthInMb	= (15+pBgdParam->iBgdWidth)>>4;
-
-	SBackgroundOU *pBackgroundOU= pBgdParam->pOU_array;
-	int8_t	*pVaaBackgroundMbFlag   = (int8_t *)pBgdParam->pBackgroundMbFlag;
-	SBackgroundOU	*pOUNeighbours[4];//0: left; 1: right; 2: top; 3: bottom
-
-	pBackgroundOU	= pBgdParam->pOU_array;
-	pOUNeighbours[2]	= pBackgroundOU;//top OU
-	for (int32_t j = 0; j < iPicHeightInOU; j ++ )
-	{
-		int8_t *pRowSkipFlag = pVaaBackgroundMbFlag;
-		pOUNeighbours[0]	= pBackgroundOU;//left OU
-		pOUNeighbours[3]	= pBackgroundOU + (iPicWidthInOU & ((j == iPicHeightInOU-1) - 1));//bottom OU
-		for (int32_t i = 0; i < iPicWidthInOU; i++ )
-		{
-			pOUNeighbours[1] = pBackgroundOU + (i < iPicWidthInOU-1);//right OU
-
-			if (pBackgroundOU->iBackgroundFlag)
-				ForegroundDilation(pBackgroundOU, pOUNeighbours, pBgdParam, j*iOUStrideUV+(i<<LOG2_BGD_OU_SIZE_UV));
-			else 
-				BackgroundErosion(pBackgroundOU, pOUNeighbours);
-
-			// check the up OU
-			if (j>1 && i>0 && i<iPicWidthInOU-1 && pOUNeighbours[2]->iBackgroundFlag==1)
-			{
-				UpperOUForegroundCheck(pOUNeighbours[2], pRowSkipFlag-OU_SIZE_IN_MB*iPicWidthInMb, iPicWidthInOU, iPicWidthInMb);
-			}
-
-			SetBackgroundMbFlag(pRowSkipFlag,iPicWidthInMb,pBackgroundOU->iBackgroundFlag);
-
-			// preparation for the next OU
-			pRowSkipFlag += OU_SIZE_IN_MB;
-			pOUNeighbours[0] = pBackgroundOU;
-			pOUNeighbours[2]++;
-			pOUNeighbours[3]++;
-			pBackgroundOU++;
-		}
-		pOUNeighbours[2]	= pBackgroundOU - iPicWidthInOU;
-		pVaaBackgroundMbFlag += OU_SIZE_IN_MB*iPicWidthInMb;
-	}
-}
-
-void CBackgroundDetection::BackgroundDetection( vBGDParam *pBgdParam )
-{
-	// 1st step: foreground/background coarse division
-	ForegroundBackgroundDivision(pBgdParam);
-
-	// 2nd step: foreground dilation and background erosion
-	ForegroundDilationAndBackgroundErosion(pBgdParam);
-}
-
-WELSVP_NAMESPACE_END
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "BackgroundDetection.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+#define LOG2_BGD_OU_SIZE    (4)
+#define LOG2_BGD_OU_SIZE_UV (LOG2_BGD_OU_SIZE-1)
+#define BGD_OU_SIZE         (1<<LOG2_BGD_OU_SIZE)
+#define BGD_OU_SIZE_UV      (BGD_OU_SIZE>>1)
+#define BGD_THD_SAD         (2*BGD_OU_SIZE*BGD_OU_SIZE)
+#define	BGD_THD_ASD_UV      (4*BGD_OU_SIZE_UV)
+#define LOG2_MB_SIZE        (4)
+#define OU_SIZE_IN_MB       (BGD_OU_SIZE >> 4)
+#define Q_FACTOR            (8)
+#define BGD_DELTA_QP_THD    (3)
+
+#define OU_LEFT		(0x01)
+#define OU_RIGHT	(0x02)
+#define OU_TOP		(0x04)
+#define OU_BOTTOM	(0x08)
+
+CBackgroundDetection::CBackgroundDetection (int32_t iCpuFlag) : m_iLargestFrameSize (0) {
+  // iCpuFlag is not consulted here. Zeroing m_BgdParam leaves the plane, OU-array and result pointers NULL.
+  WelsMemset (&m_BgdParam, 0, sizeof (m_BgdParam));
+  m_eMethod = METHOD_BACKGROUND_DETECTION;
+}
+
+CBackgroundDetection::~CBackgroundDetection() {
+  FreeOUArrayMemory(); // release the OU array held in m_BgdParam.pOU_array
+}
+
+// Runs background detection on the current frame (pSrcPixMap) against the reference frame (pRefPixMap).
+// Returns RET_INVALIDPARAM when a pixmap is NULL, when Set() has not supplied the result buffers, or
+// when the OU array cannot be allocated; RET_SUCCESS otherwise. iType is not used here.
+EResult CBackgroundDetection::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
+  if (pSrcPixMap == NULL || pRefPixMap == NULL)
+    return RET_INVALIDPARAM;
+
+  // The detection passes read pCalcRes and write pBackgroundMbFlag unconditionally; both come from Set().
+  if (m_BgdParam.pCalcRes == NULL || m_BgdParam.pBackgroundMbFlag == NULL)
+    return RET_INVALIDPARAM;
+
+  for (int32_t iPlane = 0; iPlane < 3; iPlane++) {
+    m_BgdParam.pCur[iPlane]    = (uint8_t*)pSrcPixMap->pPixel[iPlane];
+    m_BgdParam.pRef[iPlane]    = (uint8_t*)pRefPixMap->pPixel[iPlane];
+    m_BgdParam.iStride[iPlane] = pSrcPixMap->iStride[iPlane];
+  }
+  m_BgdParam.iBgdWidth  = pSrcPixMap->sRect.iRectWidth;
+  m_BgdParam.iBgdHeight = pSrcPixMap->sRect.iRectHeight;
+
+  // (Re)allocate the OU array on first use or when the frame area exceeds the largest one seen so far.
+  int32_t iCurFrameSize = m_BgdParam.iBgdWidth * m_BgdParam.iBgdHeight;
+  if (m_BgdParam.pOU_array == NULL || iCurFrameSize > m_iLargestFrameSize) {
+    FreeOUArrayMemory();
+    m_BgdParam.pOU_array = AllocateOUArrayMemory (m_BgdParam.iBgdWidth, m_BgdParam.iBgdHeight);
+    m_iLargestFrameSize = iCurFrameSize;
+  }
+  if (m_BgdParam.pOU_array == NULL)
+    return RET_INVALIDPARAM;
+
+  BackgroundDetection (&m_BgdParam);
+  return RET_SUCCESS;
+}
+
+// Stores the caller-supplied buffers used by Process(). pParam is read as an SBGDInterface;
+// returns RET_INVALIDPARAM when it is NULL and RET_SUCCESS otherwise. iType is not used here.
+EResult CBackgroundDetection::Set (int32_t iType, void* pParam) {
+  SBGDInterface* pBgdInterface = (SBGDInterface*)pParam;
+  if (pBgdInterface == NULL)
+    return RET_INVALIDPARAM;
+
+  // pCalcRes is read by GetOUParameters(); pBackgroundMbFlag receives the per-OU result flags.
+  m_BgdParam.pCalcRes          = pBgdInterface->pCalcRes;
+  m_BgdParam.pBackgroundMbFlag = (int8_t*)pBgdInterface->pBackgroundMbFlag;
+  return RET_SUCCESS;
+}
+
+// Allocates one SBackgroundOU per OU (BGD_OU_SIZE pixels square), rounding both dimensions up to whole OUs.
+inline SBackgroundOU* CBackgroundDetection::AllocateOUArrayMemory (int32_t iWidth, int32_t iHeight) {
+  return (SBackgroundOU*)WelsMalloc (((BGD_OU_SIZE - 1 + iWidth) >> LOG2_BGD_OU_SIZE) *
+                                     ((BGD_OU_SIZE - 1 + iHeight) >> LOG2_BGD_OU_SIZE) * sizeof (SBackgroundOU));
+}
+
+inline void CBackgroundDetection::FreeOUArrayMemory() {
+  _SafeFree (m_BgdParam.pOU_array); // NOTE(review): presumably frees and NULLs the pointer (Process() re-checks it for NULL) - confirm
+}
+
+// Derives the statistics of one OU from the four sub-block entries that sVaaCalcInfo keeps for
+// macroblock iMbIndex (pSad8x8, pSumOfDiff8x8 and pMad8x8), and stores them in pBgdOU:
+//   iSAD          - sum of the four pSad8x8 entries
+//   iSD           - absolute value of the sum of the four pSumOfDiff8x8 entries
+//   iMAD          - largest of the four pMad8x8 entries
+//   iMinSubMad    - smallest of the four pMad8x8 entries
+//   iMaxDiffSubSd - largest minus smallest of the four pSumOfDiff8x8 entries
+// iSD is stored as a magnitude, while iMaxDiffSubSd is computed from the signed sub-block values.
+// iMbWidth is not used, and pBgdOU->iBackgroundFlag is not modified.
+void CBackgroundDetection::GetOUParameters (SVAACalcResult* sVaaCalcInfo, int32_t iMbIndex, int32_t iMbWidth,
+    SBackgroundOU* pBgdOU) {
+  int32_t (*pSad8x8)[4] = sVaaCalcInfo->pSad8x8;
+  int32_t (*pSd8x8)[4]  = sVaaCalcInfo->pSumOfDiff8x8;
+  uint8_t (*pMad8x8)[4] = sVaaCalcInfo->pMad8x8;
+
+  // Seed every accumulator from sub-block 0, then fold in sub-blocks 1..3.
+  int32_t iSadSum = pSad8x8[iMbIndex][0];
+  int32_t iSdSum  = pSd8x8[iMbIndex][0];
+  int32_t iSdMax  = iSdSum;
+  int32_t iSdMin  = iSdSum;
+  int32_t iMadMax = pMad8x8[iMbIndex][0];
+  int32_t iMadMin = iMadMax;
+
+  for (int32_t k = 1; k < 4; k++) {
+    const int32_t kiSd  = pSd8x8[iMbIndex][k];
+    const int32_t kiMad = pMad8x8[iMbIndex][k];
+    iSadSum += pSad8x8[iMbIndex][k];
+    iSdSum  += kiSd;
+    iSdMax  = WELS_MAX (iSdMax, kiSd);
+    iSdMin  = WELS_MIN (iSdMin, kiSd);
+    iMadMax = WELS_MAX (iMadMax, kiMad);
+    iMadMin = WELS_MIN (iMadMin, kiMad);
+  }
+
+  // Publish the folded statistics.
+  pBgdOU->iSAD          = iSadSum;
+  pBgdOU->iSD           = WELS_ABS (iSdSum);
+  pBgdOU->iMAD          = iMadMax;
+  pBgdOU->iMinSubMad    = iMadMin;
+  pBgdOU->iMaxDiffSubSd = iSdMax - iSdMin;
+}
+
+// First pass: fills every OU of pBgdParam->pOU_array (raster order) via GetOUParameters() and sets its
+// iBackgroundFlag from the OU's own statistics only. An OU stays foreground (0) when its iMAD exceeds 63,
+// when its sub-block SD spread is too large, or when its iSAD reaches twice BGD_THD_SAD.
+void CBackgroundDetection::ForegroundBackgroundDivision (vBGDParam* pBgdParam) {
+  const int32_t kiOUCols = pBgdParam->iBgdWidth  >> LOG2_BGD_OU_SIZE;
+  const int32_t kiOURows = pBgdParam->iBgdHeight >> LOG2_BGD_OU_SIZE;
+  const int32_t kiMbCols = (pBgdParam->iBgdWidth + 15) >> 4;
+  SBackgroundOU* pOU = pBgdParam->pOU_array;
+
+  for (int32_t iRow = 0; iRow < kiOURows; iRow++) {
+    for (int32_t iCol = 0; iCol < kiOUCols; iCol++, pOU++) {
+      const int32_t kiMbIndex = (iRow * kiMbCols + iCol) << (LOG2_BGD_OU_SIZE - LOG2_MB_SIZE);
+      GetOUParameters (pBgdParam->pCalcRes, kiMbIndex, kiMbCols, pOU);
+
+      pOU->iBackgroundFlag = 0;
+      // A large maximum absolute difference keeps the OU as foreground outright.
+      if (pOU->iMAD > 63)
+        continue;
+
+      const bool kbFlatSd = (pOU->iMaxDiffSubSd <= (pOU->iSAD >> 3)) || (pOU->iMaxDiffSubSd <= BGD_OU_SIZE * Q_FACTOR);
+      if (!kbFlatSd || pOU->iSAD >= (BGD_THD_SAD << 1))
+        continue;
+
+      if (pOU->iSAD <= BGD_OU_SIZE * Q_FACTOR)
+        pOU->iBackgroundFlag = 1;
+      else if (pOU->iSAD < BGD_THD_SAD)
+        pOU->iBackgroundFlag = (pOU->iSD < ((pOU->iSAD * 3) >> 2));
+      else
+        pOU->iBackgroundFlag = ((pOU->iSD << 1) < pOU->iSAD);
+    }
+  }
+}
+// Returns the absolute value of the summed signed differences (current minus reference) over
+// BGD_OU_SIZE_UV samples. Consecutive samples are iStride bytes apart in both buffers, so the same
+// routine walks a column (iStride = picture stride) or a row (iStride = 1).
+inline int32_t CBackgroundDetection::CalculateAsdChromaEdge (uint8_t* pOriRef, uint8_t* pOriCur, int32_t iStride) {
+  int32_t iSignedSum = 0;
+  for (int32_t k = 0, iOffset = 0; k < BGD_OU_SIZE_UV; k++, iOffset += iStride) {
+    iSignedSum += pOriCur[iOffset] - pOriRef[iOffset];
+  }
+  return WELS_ABS (iSignedSum);
+}
+
+// Luma test used when an OU has two or three background neighbours. A non-zero result makes the
+// callers clear the OU's background flag (they store the logical negation of the return value).
+//
+// pOUNeighbours holds the left, right, top and bottom OUs in that order. Each neighbour's iMAD is
+// counted towards the "foreground" maximum when its iBackgroundFlag is 0 and towards the
+// "background" maximum when it is 1; the other maximum sees 0 for that neighbour.
+//
+// Returns 0 straight away unless the OU's iMAD exceeds twice its iMinSubMad. Otherwise returns
+// non-zero when either
+//   - the foreground maximum exceeds four times the OU's iMinSubMad, or
+//   - the OU's iMAD exceeds twice the background maximum and is at most 3/2 of the foreground maximum.
+inline bool_t CBackgroundDetection::ForegroundDilation23Luma (SBackgroundOU* pBackgroundOU,
+    SBackgroundOU* pOUNeighbours[]) {
+  if (pBackgroundOU->iMAD <= (pBackgroundOU->iMinSubMad << 1))
+    return 0;
+
+  // (flag - 1) is all ones for flag 0 and zero for flag 1, so the AND keeps or drops the MAD.
+  int32_t iMaxFgMad = (pOUNeighbours[0]->iBackgroundFlag - 1) & pOUNeighbours[0]->iMAD;
+  int32_t iMaxBgMad = ((!pOUNeighbours[0]->iBackgroundFlag) - 1) & pOUNeighbours[0]->iMAD;
+  for (int32_t k = 1; k < 4; k++) {
+    const SBackgroundOU* kpNbr = pOUNeighbours[k];
+    const int32_t kiFgMad = (kpNbr->iBackgroundFlag - 1) & kpNbr->iMAD;
+    const int32_t kiBgMad = ((!kpNbr->iBackgroundFlag) - 1) & kpNbr->iMAD;
+    iMaxFgMad = WELS_MAX (iMaxFgMad, kiFgMad);
+    iMaxBgMad = WELS_MAX (iMaxBgMad, kiBgMad);
+  }
+
+  const bool kbStrongFgNeighbour = iMaxFgMad > (pBackgroundOU->iMinSubMad << 2);
+  const bool kbBetweenMaxima = pBackgroundOU->iMAD > (iMaxBgMad << 1)
+                               && pBackgroundOU->iMAD <= ((iMaxFgMad * 3) >> 1);
+  return kbStrongFgNeighbour || kbBetweenMaxima;
+}
+
+// Chroma test run after the luma test kept the OU as background. For every neighbour whose bit is set
+// in iNeighbourForegroundFlags (OU_LEFT, OU_RIGHT, OU_TOP, OU_BOTTOM) the OU edge facing that
+// neighbour is compared between the reference and current chroma planes with CalculateAsdChromaEdge().
+// Returns 1 as soon as one edge exceeds BGD_THD_ASD_UV, 0 when none does.
+//
+// iStartSamplePos is the offset of the OU's first sample inside a chroma plane; iPicStrideUV is the
+// stride applied to both chroma planes.
+inline bool_t CBackgroundDetection::ForegroundDilation23Chroma (int8_t iNeighbourForegroundFlags,
+    int32_t iStartSamplePos, int32_t iPicStrideUV, vBGDParam* pBgdParam) {
+  static const int8_t kaEdgeFlag[4] = {OU_LEFT, OU_RIGHT, OU_TOP, OU_BOTTOM};
+  // V (plane 2) is tested first: it stands for red and skin colours weigh more on it; U (plane 1, blue) follows.
+  static const int32_t kaPlaneOrder[2] = {2, 1};
+  // First sample and step of the left column, right column, top row and bottom row of the OU.
+  const int32_t kaEdgeStart[4] = {0, BGD_OU_SIZE_UV - 1, 0, iPicStrideUV * (BGD_OU_SIZE_UV - 1)};
+  const int32_t kaEdgeStep[4]  = {iPicStrideUV, iPicStrideUV, 1, 1};
+
+  for (int32_t p = 0; p < 2; p++) {
+    const int32_t kiPlane = kaPlaneOrder[p];
+    for (int32_t e = 0; e < 4; e++) {
+      if (! (iNeighbourForegroundFlags & kaEdgeFlag[e]))
+        continue;
+      const int32_t kiPos = iStartSamplePos + kaEdgeStart[e];
+      if (CalculateAsdChromaEdge (pBgdParam->pRef[kiPlane] + kiPos, pBgdParam->pCur[kiPlane] + kiPos,
+                                  kaEdgeStep[e]) > BGD_THD_ASD_UV)
+        return 1;
+    }
+  }
+  return 0;
+}
+
+// Re-examines an OU currently flagged as background. With a large enough iSAD it is turned into
+// foreground when at most one neighbour is background, or (2 or 3 such neighbours) when a luma/chroma test says so.
+inline void CBackgroundDetection::ForegroundDilation (SBackgroundOU* pBackgroundOU, SBackgroundOU* pOUNeighbours[],
+    vBGDParam* pBgdParam, int32_t iChromaSampleStartPos) {
+  const int32_t kiStrideUV = pBgdParam->iStride[1];
+  int32_t iBgNeighbours = 0;
+  for (int32_t k = 0; k < 4; k++)
+    iBgNeighbours += pOUNeighbours[k]->iBackgroundFlag;
+
+  if (pBackgroundOU->iSAD <= BGD_OU_SIZE * Q_FACTOR)
+    return;
+
+  if (iBgNeighbours == 0 || iBgNeighbours == 1) {
+    pBackgroundOU->iBackgroundFlag = 0;
+    return;
+  }
+  if (iBgNeighbours != 2 && iBgNeighbours != 3)
+    return;
+
+  pBackgroundOU->iBackgroundFlag = !ForegroundDilation23Luma (pBackgroundOU, pOUNeighbours);
+  if (pBackgroundOU->iBackgroundFlag != 1)
+    return;
+  // chroma component check: bit k of the mask is set when neighbour k is foreground
+  int8_t iFgNeighbourMask = 0;
+  for (int32_t k = 0; k < 4; k++)
+    iFgNeighbourMask |= (!pOUNeighbours[k]->iBackgroundFlag) << k;
+  pBackgroundOU->iBackgroundFlag = !ForegroundDilation23Chroma (iFgNeighbourMask, iChromaSampleStartPos,
+                                   kiStrideUV, pBgdParam);
+}
+// Re-examines an OU currently flagged as foreground: it may become background when its sub-block SDs are
+// close together and its iSAD is small relative to the SADs of its background neighbours.
+inline void CBackgroundDetection::BackgroundErosion (SBackgroundOU* pBackgroundOU, SBackgroundOU* pOUNeighbours[]) {
+  if (pBackgroundOU->iMaxDiffSubSd > (BGD_OU_SIZE * Q_FACTOR))
+    return;
+
+  int32_t iBgNeighbours = 0, iBgNeighbourSad = 0;
+  for (int32_t k = 0; k < 4; k++) {
+    iBgNeighbours   += pOUNeighbours[k]->iBackgroundFlag;
+    iBgNeighbourSad += pOUNeighbours[k]->iSAD & (-pOUNeighbours[k]->iBackgroundFlag); // flag 1 keeps the SAD, flag 0 drops it
+  }
+  if (pBackgroundOU->iSAD * iBgNeighbours > ((3 * iBgNeighbourSad) >> 1))
+    return;
+  if (iBgNeighbours == 4) {
+    pBackgroundOU->iBackgroundFlag = 1;
+  } else if ((pOUNeighbours[0]->iBackgroundFlag & pOUNeighbours[1]->iBackgroundFlag)
+             || (pOUNeighbours[2]->iBackgroundFlag & pOUNeighbours[3]->iBackgroundFlag)) {
+    pBackgroundOU->iBackgroundFlag = !ForegroundDilation23Luma (pBackgroundOU, pOUNeighbours);
+  }
+}
+
+inline void CBackgroundDetection::SetBackgroundMbFlag (int8_t* pBackgroundMbFlag, int32_t iPicWidthInMb,
+    int32_t iBackgroundMbFlag) {
+  *pBackgroundMbFlag = iBackgroundMbFlag; // writes a single flag entry; iPicWidthInMb is not used
+}
+
+// Clears the background flag of pCurOU (and the entry at pBackgroundMbFlag) when its iSAD is large and
+// at most one of its left/right/upper/lower OUs is background. All four neighbours must exist in the array.
+inline void CBackgroundDetection::UpperOUForegroundCheck (SBackgroundOU* pCurOU, int8_t* pBackgroundMbFlag,
+    int32_t iPicWidthInOU, int32_t iPicWidthInMb) {
+  if (pCurOU->iSAD <= BGD_OU_SIZE * Q_FACTOR)
+    return;
+  const int32_t kiBgNeighbours = pCurOU[-1].iBackgroundFlag + pCurOU[1].iBackgroundFlag +
+                                 pCurOU[-iPicWidthInOU].iBackgroundFlag + pCurOU[iPicWidthInOU].iBackgroundFlag;
+  if (kiBgNeighbours <= 1) {
+    SetBackgroundMbFlag (pBackgroundMbFlag, iPicWidthInMb, 0);
+    pCurOU->iBackgroundFlag = 0;
+  }
+}
+
+// Second pass: visits the OUs in raster order and refines the flags produced by the first pass.
+//   - An OU flagged as background goes through ForegroundDilation(), any other through BackgroundErosion().
+//   - From the third OU row on, and away from the left/right picture border, the OU directly above is
+//     re-checked with UpperOUForegroundCheck() if it is still flagged as background; this may clear its
+//     flag and its entry in pBackgroundMbFlag.
+//   - The current OU's flag is then written to pBackgroundMbFlag.
+// Neighbours that fall outside the picture are replaced by the OU itself. Each OU row advances
+// pBackgroundMbFlag by OU_SIZE_IN_MB * (picture width in macroblocks) entries.
+void CBackgroundDetection::ForegroundDilationAndBackgroundErosion (vBGDParam* pBgdParam) {
+  const int32_t kiOUCols      = pBgdParam->iBgdWidth  >> LOG2_BGD_OU_SIZE;
+  const int32_t kiOURows      = pBgdParam->iBgdHeight >> LOG2_BGD_OU_SIZE;
+  const int32_t kiMbCols      = (15 + pBgdParam->iBgdWidth) >> 4;
+  const int32_t kiOURowStepUV = pBgdParam->iStride[1] << (LOG2_BGD_OU_SIZE - 1); // offset between OU rows in a chroma plane
+  const int32_t kiFlagRowStep = OU_SIZE_IN_MB * kiMbCols;
+
+  SBackgroundOU* pOU = pBgdParam->pOU_array;
+  int8_t* pRowFlags  = (int8_t*)pBgdParam->pBackgroundMbFlag;
+  SBackgroundOU* pOUNeighbours[4]; //0: left; 1: right; 2: top; 3: bottom
+
+  for (int32_t iRow = 0; iRow < kiOURows; iRow++, pRowFlags += kiFlagRowStep) {
+    const int32_t kiTopStep    = (iRow > 0) ? kiOUCols : 0;
+    const int32_t kiBottomStep = (iRow < kiOURows - 1) ? kiOUCols : 0;
+    int8_t* pFlag = pRowFlags;
+
+    for (int32_t iCol = 0; iCol < kiOUCols; iCol++, pOU++, pFlag += OU_SIZE_IN_MB) {
+      // Picture-border OUs use themselves in place of the missing neighbour.
+      pOUNeighbours[0] = pOU - (iCol > 0);
+      pOUNeighbours[1] = pOU + (iCol < kiOUCols - 1);
+      pOUNeighbours[2] = pOU - kiTopStep;
+      pOUNeighbours[3] = pOU + kiBottomStep;
+
+      if (pOU->iBackgroundFlag)
+        ForegroundDilation (pOU, pOUNeighbours, pBgdParam, iRow * kiOURowStepUV + (iCol << LOG2_BGD_OU_SIZE_UV));
+      else
+        BackgroundErosion (pOU, pOUNeighbours);
+
+      // check the up OU: it needs a row above it and a column on either side
+      if (iRow > 1 && iCol > 0 && iCol < kiOUCols - 1 && pOUNeighbours[2]->iBackgroundFlag == 1)
+        UpperOUForegroundCheck (pOUNeighbours[2], pFlag - kiFlagRowStep, kiOUCols, kiMbCols);
+
+      SetBackgroundMbFlag (pFlag, kiMbCols, pOU->iBackgroundFlag);
+    }
+  }
+}
+
+void CBackgroundDetection::BackgroundDetection (vBGDParam* pBgdParam) {
+  // 1st step: coarse foreground/background division, using each OU's own statistics only
+  ForegroundBackgroundDivision (pBgdParam);
+
+  // 2nd step: foreground dilation and background erosion using neighbouring OUs; also writes pBackgroundMbFlag
+  ForegroundDilationAndBackgroundErosion (pBgdParam);
+}
+
+WELSVP_NAMESPACE_END
--- a/processing/src/backgounddetection/BackgroundDetection.h
+++ b/processing/src/backgounddetection/BackgroundDetection.h
@@ -1,104 +1,106 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	       :  BackgroundDetection.h
- *
- * \brief	     :  background detection class of wels video processor class
- *
- * \date        :  2011/03/17
- *
- * \description :  1. rewrite the package code of background detection class  
- *
- */
-
-#ifndef _WELSVP_BACKGROUNDDETECTION_H
-#define _WELSVP_BACKGROUNDDETECTION_H
-
-#include "../common/util.h"
-#include "../common/memory.h"
-#include "../common/WelsFrameWork.h"
-#include "../../interface/IWelsVP.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-typedef struct
-{
-	int32_t	iBackgroundFlag;
-	int32_t	iSAD;
-	int32_t	iSD;
-	int32_t	iMAD;			
-	int32_t	iMinSubMad;		
-	int32_t	iMaxDiffSubSd;	
-} SBackgroundOU;
-
-class CBackgroundDetection : public IStrategy
-{			  
-public:
-	CBackgroundDetection(int32_t iCpuFlag);
-	~CBackgroundDetection();
-
-	EResult Process(int32_t iType, SPixMap *pSrc, SPixMap *pRef);
-	EResult Set    (int32_t iType, void *pParam); 
-
-private:
-	struct vBGDParam
-	{
-		uint8_t   *pCur[3];
-		uint8_t   *pRef[3];
-		int32_t	   iBgdWidth;			
-		int32_t	   iBgdHeight;			
-		int32_t    iStride[3];
-		SBackgroundOU	  *pOU_array;
-		int8_t	  *pBackgroundMbFlag;
-		SVAACalcResult  *pCalcRes;
-	}m_BgdParam;
-
-	int32_t     m_iLargestFrameSize;
-
-private:
-	inline SBackgroundOU* AllocateOUArrayMemory(int32_t iWidth, int32_t iHeight);
-	inline void     FreeOUArrayMemory();
-	inline int32_t  CalculateAsdChromaEdge( uint8_t *pOriRef, uint8_t *pOriCur, int32_t iStride );
-	inline bool_t   ForegroundDilation23Luma(SBackgroundOU *pBackgroundOU, SBackgroundOU *pOUNeighbours[]);//Foreground_Dilation_2_3_Luma
-	inline bool_t   ForegroundDilation23Chroma(int8_t iNeighbourForegroundFlags, int32_t iStartSamplePos, int32_t iPicStrideUV, vBGDParam *pBgdParam);//Foreground_Dilation_2_3_Chroma
-	inline void     ForegroundDilation(SBackgroundOU *pBackgroundOU, SBackgroundOU *pOUNeighbours[], vBGDParam *pBgdParam, int32_t	iChromaSampleStartPos);
-	inline void     BackgroundErosion(SBackgroundOU *pBackgroundOU, SBackgroundOU *pOUNeighbours[]);
-	inline void     SetBackgroundMbFlag(int8_t *pBackgroundMbFlag,int32_t iPicWidthInMb, int32_t iBackgroundMbFlag);
-	inline void     UpperOUForegroundCheck(SBackgroundOU *pCurOU, int8_t *pBackgroundMbFlag, int32_t iPicWidthInOU, int32_t iPicWidthInMb);
-
-	void    GetOUParameters( SVAACalcResult *sVaaCalcInfo, int32_t iMbIndex, int32_t iMbWidth, SBackgroundOU* pBackgroundOU);
-	void    ForegroundBackgroundDivision(vBGDParam *pBgdParam);
-	void    ForegroundDilationAndBackgroundErosion(vBGDParam *pBgdParam);
-	void    BackgroundDetection( vBGDParam *pBgdParam );
-};	
-
-WELSVP_NAMESPACE_END
-
-#endif
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	       :  BackgroundDetection.h
+ *
+ * \brief	     :  background detection class of wels video processor class
+ *
+ * \date        :  2011/03/17
+ *
+ * \description :  1. rewrite the package code of background detection class
+ *
+ */
+
+#ifndef _WELSVP_BACKGROUNDDETECTION_H
+#define _WELSVP_BACKGROUNDDETECTION_H
+
+#include "../common/util.h"
+#include "../common/memory.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+typedef struct {
+  int32_t	iBackgroundFlag; // 1: OU classified as background, 0: foreground
+  int32_t	iSAD;            // sum of the four sub-block SADs of the OU
+  int32_t	iSD;             // absolute value of the sum of the four sub-block sums of differences
+  int32_t	iMAD;            // largest of the four sub-block MADs
+  int32_t	iMinSubMad;      // smallest of the four sub-block MADs
+  int32_t	iMaxDiffSubSd;   // largest minus smallest sub-block sum of differences
+} SBackgroundOU;
+
+class CBackgroundDetection : public IStrategy {
+ public:
+  CBackgroundDetection (int32_t iCpuFlag);
+  ~CBackgroundDetection();
+
+  EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pRef); // runs the detection on pSrc against pRef
+  EResult Set (int32_t iType, void* pParam); // pParam is read as an SBGDInterface (result buffers)
+
+ private:
+  struct vBGDParam {
+    uint8_t*   pCur[3]; // planes of the current frame (index 1 and 2 are used as chroma)
+    uint8_t*   pRef[3]; // planes of the reference frame
+    int32_t	   iBgdWidth;
+    int32_t	   iBgdHeight;
+    int32_t    iStride[3];
+    SBackgroundOU*  	pOU_array; // one entry per OU, raster order; owned by this class
+    int8_t*  	pBackgroundMbFlag; // output flags supplied through Set()
+    SVAACalcResult*  pCalcRes; // input statistics supplied through Set()
+  } m_BgdParam;
+
+  int32_t     m_iLargestFrameSize; // largest width*height seen so far; a larger frame triggers re-allocation of pOU_array
+
+ private:
+  inline SBackgroundOU* AllocateOUArrayMemory (int32_t iWidth, int32_t iHeight);
+  inline void     FreeOUArrayMemory();
+  inline int32_t  CalculateAsdChromaEdge (uint8_t* pOriRef, uint8_t* pOriCur, int32_t iStride);
+  inline bool_t   ForegroundDilation23Luma (SBackgroundOU* pBackgroundOU,
+      SBackgroundOU* pOUNeighbours[]); //Foreground_Dilation_2_3_Luma
+  inline bool_t   ForegroundDilation23Chroma (int8_t iNeighbourForegroundFlags, int32_t iStartSamplePos,
+      int32_t iPicStrideUV, vBGDParam* pBgdParam);//Foreground_Dilation_2_3_Chroma
+  inline void     ForegroundDilation (SBackgroundOU* pBackgroundOU, SBackgroundOU* pOUNeighbours[], vBGDParam* pBgdParam,
+                                      int32_t	iChromaSampleStartPos);
+  inline void     BackgroundErosion (SBackgroundOU* pBackgroundOU, SBackgroundOU* pOUNeighbours[]);
+  inline void     SetBackgroundMbFlag (int8_t* pBackgroundMbFlag, int32_t iPicWidthInMb, int32_t iBackgroundMbFlag);
+  inline void     UpperOUForegroundCheck (SBackgroundOU* pCurOU, int8_t* pBackgroundMbFlag, int32_t iPicWidthInOU,
+                                          int32_t iPicWidthInMb);
+
+  void    GetOUParameters (SVAACalcResult* sVaaCalcInfo, int32_t iMbIndex, int32_t iMbWidth,
+                           SBackgroundOU* pBackgroundOU); // fills one OU's statistics from the per-MB results
+  void    ForegroundBackgroundDivision (vBGDParam* pBgdParam); // 1st pass: per-OU classification
+  void    ForegroundDilationAndBackgroundErosion (vBGDParam* pBgdParam); // 2nd pass: neighbour-based refinement
+  void    BackgroundDetection (vBGDParam* pBgdParam); // runs both passes in order
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- a/processing/src/common/WelsFrameWork.cpp
+++ b/processing/src/common/WelsFrameWork.cpp
@@ -1,322 +1,301 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "WelsFrameWork.h"
-#include "cpu.h"
-#include "../denoise/denoise.h"
-#include "../downsample/downsample.h"
-#include "../scenechangedetection/SceneChangeDetection.h"
-#include "../vaacalc/vaacalculation.h"
-#include "../backgounddetection/BackgroundDetection.h"
-#include "../adaptivequantization/AdaptiveQuantization.h"
-#include "../complexityanalysis/ComplexityAnalysis.h"
-#include "../imagerotate/imagerotate.h"
-
-
-/* interface API implement */
-
-EResult WELSAPI CreateVpInterface  (void **ppCtx, int iVersion)
-{
-	if (iVersion & 0x8000)
-		return nsWelsVP::CreateSpecificVpInterface((IWelsVP **)ppCtx);
-	else if (iVersion & 0x7fff)
-		return nsWelsVP::CreateSpecificVpInterface((IWelsVPc **)ppCtx);
-	else
-		return RET_INVALIDPARAM;
-}
-
-EResult WELSAPI DestroyVpInterface  (void *pCtx, int iVersion)
-{
-	if (iVersion & 0x8000)
-		return nsWelsVP::DestroySpecificVpInterface((IWelsVP *)pCtx);
-	else if (iVersion & 0x7fff)
-		return nsWelsVP::DestroySpecificVpInterface((IWelsVPc *)pCtx);
-	else
-		return RET_INVALIDPARAM;
-}
-
-WELSVP_NAMESPACE_BEGIN
-
-///////////////////////////////////////////////////////////////////////
-
-EResult CreateSpecificVpInterface(IWelsVP **ppCtx)
-{
-	EResult  eReturn = RET_FAILED;
-
-	CVpFrameWork *pFr = new CVpFrameWork(1, eReturn);  
-	if (pFr)
-	{
-		*ppCtx  = (IWelsVP *)pFr;
-		eReturn = RET_SUCCESS;
-	}
-
-	return eReturn;
-}
-
-EResult DestroySpecificVpInterface  (IWelsVP *pCtx)
-{
-	_SafeDelete(pCtx);
-
-	return RET_SUCCESS;
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-CVpFrameWork::CVpFrameWork(uint32_t uiThreadsNum, EResult &eReturn)
-{
-	int32_t iCoreNum = 1;
-#ifndef X86_ASM
-	uint32_t uiCPUFlag = 0;
-#else
-	uint32_t uiCPUFlag = WelsCPUFeatureDetect(&iCoreNum);
-#endif
-
-	for (int32_t i = 0; i < MAX_STRATEGY_NUM; i++)
-	{
-		IStrategy *pStrategy = m_pStgChain[i];
-		pStrategy = CreateStrategy(WelsStaticCast(EMethods, i + 1), uiCPUFlag);
-		m_pStgChain[i] = pStrategy;	
-	}
-	
-	WelsMutexInit(&m_mutes);
-
-	eReturn = RET_SUCCESS;	
-}
-
-CVpFrameWork::~CVpFrameWork()
-{
-	for (int32_t i = 0; i < MAX_STRATEGY_NUM; i++)
-	{
-		if (m_pStgChain[i])
-		{
-			Uninit(m_pStgChain[i]->m_eMethod);
-			_SafeDelete(m_pStgChain[i]);
-		}		
-	}
- 
-	WelsMutexDestroy(&m_mutes);
-}
-
-EResult CVpFrameWork::Init(int32_t iType, void *pCfg)
-{
-	EResult eReturn   = RET_SUCCESS;
-	int32_t iCurIdx    = WelsStaticCast(int32_t, WelsVpGetValidMethod(iType)) - 1;	
-
-	Uninit(iType);
-
-	WelsMutexLock(&m_mutes);
-
-	IStrategy *pStrategy = m_pStgChain[iCurIdx];
-	if (pStrategy)
-		eReturn = pStrategy->Init(0, pCfg);
-
-	WelsMutexUnlock(&m_mutes);
-
-	return eReturn;
-}
-
-EResult CVpFrameWork::Uninit(int32_t iType)
-{
-	EResult eReturn        = RET_SUCCESS;
-	int32_t iCurIdx    = WelsStaticCast(int32_t, WelsVpGetValidMethod(iType)) - 1;
-
-	WelsMutexLock(&m_mutes);
-
-	IStrategy *pStrategy = m_pStgChain[iCurIdx];
-	if (pStrategy)
-		eReturn = pStrategy->Uninit(0);
-
-	WelsMutexUnlock(&m_mutes);
-
-	return eReturn;
-}
-
-EResult CVpFrameWork::Flush(int32_t iType)
-{
-	EResult eReturn        = RET_SUCCESS;
-
-	return eReturn;
-}
-
-EResult CVpFrameWork::Process(int32_t iType, SPixMap *pSrcPixMap, SPixMap *pDstPixMap)
-{
-	EResult eReturn        = RET_NOTSUPPORTED;
-	EMethods eMethod    = WelsVpGetValidMethod(iType);
-	int32_t iCurIdx    = WelsStaticCast(int32_t, eMethod) - 1;
-	SPixMap sSrcPic;
-	SPixMap sDstPic;
-    memset(&sSrcPic, 0, sizeof(sSrcPic));// confirmed_safe_unsafe_usage
-    memset(&sDstPic, 0, sizeof(sDstPic));// confirmed_safe_unsafe_usage
-
-	if (pSrcPixMap) sSrcPic = *pSrcPixMap;
-	if (pDstPixMap) sDstPic = *pDstPixMap;
-	if (!CheckValid(eMethod, sSrcPic, sDstPic))
-		return RET_INVALIDPARAM;
-
-	WelsMutexLock(&m_mutes);
-
-	IStrategy *pStrategy = m_pStgChain[iCurIdx];
-	if (pStrategy)
-		eReturn = pStrategy->Process(0, &sSrcPic, &sDstPic);
-
-	WelsMutexUnlock(&m_mutes);
-
-	return eReturn;
-}
-
-EResult CVpFrameWork::Get(int32_t iType, void *pParam)
-{
-	EResult eReturn        = RET_SUCCESS;
-	int32_t iCurIdx    = WelsStaticCast(int32_t, WelsVpGetValidMethod(iType)) - 1;
-
-	if (!pParam)
-		return RET_INVALIDPARAM;
-
-	WelsMutexLock(&m_mutes);
-
-	IStrategy *pStrategy = m_pStgChain[iCurIdx];
-	if (pStrategy)
-		eReturn = pStrategy->Get(0, pParam);
-
-	WelsMutexUnlock(&m_mutes);
-
-	return eReturn;
-}
-
-EResult CVpFrameWork::Set(int32_t iType, void *pParam)
-{
-	EResult eReturn        = RET_SUCCESS;
-	int32_t iCurIdx    = WelsStaticCast(int32_t, WelsVpGetValidMethod(iType)) - 1;
-
-	if (!pParam)
-		return RET_INVALIDPARAM;
-
-	WelsMutexLock(&m_mutes);
-
-	IStrategy *pStrategy = m_pStgChain[iCurIdx];
-	if (pStrategy)
-		eReturn = pStrategy->Set(0, pParam);
-
-	WelsMutexUnlock(&m_mutes);
-
-	return eReturn;
-}
-
-EResult CVpFrameWork::SpecialFeature(int32_t iType, void *pIn, void *pOut)
-{
-	EResult eReturn        = RET_SUCCESS;
-
-	return eReturn;
-}
-
-bool_t  CVpFrameWork::CheckValid(EMethods eMethod, SPixMap &pSrcPixMap, SPixMap &pDstPixMap)
-{
-	bool_t eReturn = FALSE;
-
-	if (eMethod == METHOD_NULL)
-		goto exit;
-
-	if (eMethod != METHOD_COLORSPACE_CONVERT)
-	{
-		if (pSrcPixMap.pPixel[0])
-		{
-			if (pSrcPixMap.eFormat != VIDEO_FORMAT_I420 && pSrcPixMap.eFormat != VIDEO_FORMAT_YV12)
-				goto exit;
-		}
-		if (pSrcPixMap.pPixel[0] && pDstPixMap.pPixel[0])
-		{
-			if (pDstPixMap.eFormat != pSrcPixMap.eFormat)
-				goto exit;
-		}
-	}
-
-	if (pSrcPixMap.pPixel[0])
-	{
-		if (pSrcPixMap.sRect.iRectWidth <= 0 || pSrcPixMap.sRect.iRectWidth > MAX_WIDTH || pSrcPixMap.sRect.iRectHeight <= 0 || pSrcPixMap.sRect.iRectHeight > MAX_HEIGHT)
-			goto exit;
-		if (pSrcPixMap.sRect.iRectTop >= pSrcPixMap.sRect.iRectHeight || pSrcPixMap.sRect.iRectLeft >= pSrcPixMap.sRect.iRectWidth || pSrcPixMap.sRect.iRectWidth > pSrcPixMap.iStride[0])
-			goto exit;
-	}
-	if (pDstPixMap.pPixel[0])
-	{
-		if (pDstPixMap.sRect.iRectWidth <= 0 || pDstPixMap.sRect.iRectWidth > MAX_WIDTH || pDstPixMap.sRect.iRectHeight <= 0 || pDstPixMap.sRect.iRectHeight > MAX_HEIGHT)
-			goto exit;
-		if (pDstPixMap.sRect.iRectTop >= pDstPixMap.sRect.iRectHeight || pDstPixMap.sRect.iRectLeft >= pDstPixMap.sRect.iRectWidth || pDstPixMap.sRect.iRectWidth > pDstPixMap.iStride[0])
-			goto exit;
-	}
-	eReturn = TRUE;
-
-exit:
-	return eReturn;
-}
-
-IStrategy* CVpFrameWork::CreateStrategy(EMethods m_eMethod, int32_t iCpuFlag)
-{
-	IStrategy *pStrategy = NULL;
-
-	switch (m_eMethod)
-	{
-	case METHOD_COLORSPACE_CONVERT:
-		//not support yet
-		break;
-	case METHOD_DENOISE:
-		pStrategy = WelsDynamicCast(IStrategy *, new CDenoiser(iCpuFlag));
-		break;
-	case METHOD_SCENE_CHANGE_DETECTION:
-		pStrategy = WelsDynamicCast(IStrategy *, new CSceneChangeDetection(iCpuFlag));
-		break;
-	case METHOD_DOWNSAMPLE:
-		pStrategy = WelsDynamicCast(IStrategy *, new CDownsampling(iCpuFlag));
-		break;
-	case METHOD_VAA_STATISTICS:
-		pStrategy = WelsDynamicCast(IStrategy *, new CVAACalculation(iCpuFlag));
-		break;
-	case METHOD_BACKGROUND_DETECTION:
-		pStrategy = WelsDynamicCast(IStrategy *, new CBackgroundDetection(iCpuFlag));
-		break;
-	case METHOD_ADAPTIVE_QUANT:
-		pStrategy = WelsDynamicCast(IStrategy *, new CAdaptiveQuantization(iCpuFlag));
-		break;
-	case METHOD_COMPLEXITY_ANALYSIS:
-		pStrategy = WelsDynamicCast(IStrategy *, new CComplexityAnalysis(iCpuFlag));
-		break;
-	case METHOD_IMAGE_ROTATE:
-		pStrategy = WelsDynamicCast(IStrategy *, new CImageRotating(iCpuFlag));
-		break;
-	default:
-		break;
-	}
-
-	return pStrategy;
-}
-
-WELSVP_NAMESPACE_END
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "WelsFrameWork.h"
+#include "cpu.h"
+#include "../denoise/denoise.h"
+#include "../downsample/downsample.h"
+#include "../scenechangedetection/SceneChangeDetection.h"
+#include "../vaacalc/vaacalculation.h"
+#include "../backgounddetection/BackgroundDetection.h"
+#include "../adaptivequantization/AdaptiveQuantization.h"
+#include "../complexityanalysis/ComplexityAnalysis.h"
+#include "../imagerotate/imagerotate.h"
+
+
+/* interface API implementation */
+
+EResult WELSAPI CreateVpInterface (void** ppCtx, int iVersion) {
+  // Bit 15 of iVersion selects the IWelsVP object, bits 0-14 the IWelsVPc function table.
+  if (ppCtx == NULL)
+    return RET_INVALIDPARAM;  // nowhere to store the created interface
+  if (iVersion & 0x8000)
+    return nsWelsVP::CreateSpecificVpInterface ((IWelsVP**)ppCtx);
+  return (iVersion & 0x7fff) ? nsWelsVP::CreateSpecificVpInterface ((IWelsVPc**)ppCtx) : RET_INVALIDPARAM;
+}
+
+EResult WELSAPI DestroyVpInterface (void* pCtx, int iVersion) {
+  // Bit 15 of iVersion: pCtx is an IWelsVP; else bits 0-14: an IWelsVPc; else invalid.
+  if (iVersion & 0x8000)
+    return nsWelsVP::DestroySpecificVpInterface ((IWelsVP*)pCtx);
+  if (! (iVersion & 0x7fff))
+    return RET_INVALIDPARAM;
+  return nsWelsVP::DestroySpecificVpInterface ((IWelsVPc*)pCtx);
+}
+
+WELSVP_NAMESPACE_BEGIN
+
+///////////////////////////////////////////////////////////////////////
+
+EResult CreateSpecificVpInterface (IWelsVP** ppCtx) {
+  // The constructor writes its status into eStatus; it is returned only when
+  // no object was obtained, otherwise the call reports RET_SUCCESS.
+  EResult eStatus = RET_FAILED;
+  CVpFrameWork* pFrameWork = new CVpFrameWork (1, eStatus);
+  if (!pFrameWork)
+    return eStatus;
+
+  *ppCtx = (IWelsVP*)pFrameWork;
+  return RET_SUCCESS;
+}
+
+EResult DestroySpecificVpInterface (IWelsVP* pCtx) {
+  _SafeDelete (pCtx); // project helper defined outside this file; presumably deletes pCtx - TODO confirm
+
+  return RET_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+CVpFrameWork::CVpFrameWork (uint32_t uiThreadsNum, EResult& eReturn) {
+  // uiThreadsNum is accepted for interface compatibility but is not used here.
+  int32_t iCoreNum = 1;
+#ifndef X86_ASM
+  uint32_t uiCPUFlag = 0;  // no CPU feature detection without X86_ASM
+#else
+  uint32_t uiCPUFlag = WelsCPUFeatureDetect (&iCoreNum);
+#endif
+
+  // Slot i holds the strategy for method id i + 1; CreateStrategy may return
+  // NULL, which leaves the slot empty. The slots are assigned directly because
+  // m_pStgChain is uninitialized here and must not be read before being written.
+  for (int32_t i = 0; i < MAX_STRATEGY_NUM; i++)
+    m_pStgChain[i] = CreateStrategy (WelsStaticCast (EMethods, i + 1), uiCPUFlag);
+
+  WelsMutexInit (&m_mutes);
+  eReturn = RET_SUCCESS;
+}
+
+CVpFrameWork::~CVpFrameWork() {
+  // Uninit every strategy that exists and hand it to _SafeDelete, then destroy the mutex.
+  for (int32_t iIdx = 0; iIdx < MAX_STRATEGY_NUM; iIdx++) {
+    if (!m_pStgChain[iIdx])
+      continue;
+    Uninit (m_pStgChain[iIdx]->m_eMethod);
+    _SafeDelete (m_pStgChain[iIdx]);
+  }
+  WelsMutexDestroy (&m_mutes);
+}
+
+EResult CVpFrameWork::Init (int32_t iType, void* pCfg) {
+  // Map the requested method type to its slot in the strategy table.
+  const int32_t kiSlot = WelsStaticCast (int32_t, WelsVpGetValidMethod (iType)) - 1;
+  EResult eResult = RET_SUCCESS;
+
+  // Reset any previous state first. Uninit takes and releases the mutex itself,
+  // so it is called outside the locked region below.
+  Uninit (iType);
+
+  WelsMutexLock (&m_mutes);
+  if (m_pStgChain[kiSlot])
+    eResult = m_pStgChain[kiSlot]->Init (0, pCfg);  // an empty slot keeps RET_SUCCESS
+  WelsMutexUnlock (&m_mutes);
+
+  return eResult;
+}
+
+EResult CVpFrameWork::Uninit (int32_t iType) {
+  // Forwards to the strategy registered for iType while holding the mutex.
+  // An empty slot is not an error: RET_SUCCESS is returned unchanged.
+  const int32_t kiSlot = WelsStaticCast (int32_t, WelsVpGetValidMethod (iType)) - 1;
+  EResult eResult = RET_SUCCESS;
+
+  WelsMutexLock (&m_mutes);
+  IStrategy* pTarget = m_pStgChain[kiSlot];
+  if (pTarget != NULL)
+    eResult = pTarget->Uninit (0);
+  WelsMutexUnlock (&m_mutes);
+
+  return eResult;
+}
+
+EResult CVpFrameWork::Flush (int32_t iType) {
+  // No-op at framework level: iType is ignored, no strategy is invoked and
+  // the call always reports success.
+  return RET_SUCCESS;
+}
+
+EResult CVpFrameWork::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDstPixMap) {
+  const EMethods keMethod = WelsVpGetValidMethod (iType);
+  const int32_t  kiSlot   = WelsStaticCast (int32_t, keMethod) - 1;
+
+  // Validate and process local copies; a NULL picture pointer is represented by
+  // an all-zero SPixMap, and the strategy receives pointers to these copies.
+  SPixMap sSrcCopy;
+  SPixMap sDstCopy;
+  memset (&sSrcCopy, 0, sizeof (sSrcCopy)); // confirmed_safe_unsafe_usage
+  memset (&sDstCopy, 0, sizeof (sDstCopy)); // confirmed_safe_unsafe_usage
+  if (pSrcPixMap != NULL) sSrcCopy = *pSrcPixMap;
+  if (pDstPixMap != NULL) sDstCopy = *pDstPixMap;
+
+  if (!CheckValid (keMethod, sSrcCopy, sDstCopy))
+    return RET_INVALIDPARAM;
+
+  // Stays RET_NOTSUPPORTED when no strategy is registered for the method.
+  EResult eResult = RET_NOTSUPPORTED;
+  WelsMutexLock (&m_mutes);
+  if (m_pStgChain[kiSlot])
+    eResult = m_pStgChain[kiSlot]->Process (0, &sSrcCopy, &sDstCopy);
+  WelsMutexUnlock (&m_mutes);
+  return eResult;
+}
+
+EResult CVpFrameWork::Get (int32_t iType, void* pParam) {
+  // Forwards the query to the strategy selected by iType; what is written to
+  // pParam is defined by that strategy. A NULL pParam is rejected up front.
+  const int32_t kiSlot = WelsStaticCast (int32_t, WelsVpGetValidMethod (iType)) - 1;
+
+  if (pParam == NULL)
+    return RET_INVALIDPARAM;
+
+  EResult eResult = RET_SUCCESS;  // kept when the slot holds no strategy
+  WelsMutexLock (&m_mutes);
+  IStrategy* pTarget = m_pStgChain[kiSlot];
+  if (pTarget != NULL)
+    eResult = pTarget->Get (0, pParam);
+  WelsMutexUnlock (&m_mutes);
+
+  return eResult;
+}
+
+EResult CVpFrameWork::Set (int32_t iType, void* pParam) {
+  // Forwards the setting to the strategy selected by iType; how pParam is
+  // interpreted is defined by that strategy. A NULL pParam is rejected up front.
+  const int32_t kiSlot = WelsStaticCast (int32_t, WelsVpGetValidMethod (iType)) - 1;
+
+  if (pParam == NULL)
+    return RET_INVALIDPARAM;
+
+  EResult eResult = RET_SUCCESS;  // kept when the slot holds no strategy
+  WelsMutexLock (&m_mutes);
+  IStrategy* pTarget = m_pStgChain[kiSlot];
+  if (pTarget != NULL)
+    eResult = pTarget->Set (0, pParam);
+  WelsMutexUnlock (&m_mutes);
+
+  return eResult;
+}
+
+EResult CVpFrameWork::SpecialFeature (int32_t iType, void* pIn, void* pOut) {
+  // No-op at framework level: all three arguments are ignored, no strategy is
+  // invoked and the call always reports success.
+  return RET_SUCCESS;
+}
+
+bool_t  CVpFrameWork::CheckValid (EMethods eMethod, SPixMap& pSrcPixMap, SPixMap& pDstPixMap) {
+  // A picture only takes part in the checks when its first plane pointer is set.
+  const bool bHasSrc = pSrcPixMap.pPixel[0] != NULL;
+  const bool bHasDst = pDstPixMap.pPixel[0] != NULL;
+
+  if (eMethod == METHOD_NULL)
+    return FALSE;
+
+  // Every method except colour-space conversion needs I420/YV12 input and,
+  // when both pictures are present, the same format on both sides.
+  if (eMethod != METHOD_COLORSPACE_CONVERT) {
+    if (bHasSrc && pSrcPixMap.eFormat != VIDEO_FORMAT_I420 && pSrcPixMap.eFormat != VIDEO_FORMAT_YV12)
+      return FALSE;
+    if (bHasSrc && bHasDst && pDstPixMap.eFormat != pSrcPixMap.eFormat)
+      return FALSE;
+  }
+
+  // Rectangle checks: size in (0, MAX], top < height, left < width, width <= stride of plane 0.
+  if (bHasSrc) {
+    if (pSrcPixMap.sRect.iRectWidth <= 0 || pSrcPixMap.sRect.iRectWidth > MAX_WIDTH
+        || pSrcPixMap.sRect.iRectHeight <= 0 || pSrcPixMap.sRect.iRectHeight > MAX_HEIGHT)
+      return FALSE;
+    if (pSrcPixMap.sRect.iRectTop >= pSrcPixMap.sRect.iRectHeight
+        || pSrcPixMap.sRect.iRectLeft >= pSrcPixMap.sRect.iRectWidth
+        || pSrcPixMap.sRect.iRectWidth > pSrcPixMap.iStride[0])
+      return FALSE;
+  }
+  if (bHasDst) {
+    if (pDstPixMap.sRect.iRectWidth <= 0 || pDstPixMap.sRect.iRectWidth > MAX_WIDTH
+        || pDstPixMap.sRect.iRectHeight <= 0 || pDstPixMap.sRect.iRectHeight > MAX_HEIGHT)
+      return FALSE;
+    if (pDstPixMap.sRect.iRectTop >= pDstPixMap.sRect.iRectHeight
+        || pDstPixMap.sRect.iRectLeft >= pDstPixMap.sRect.iRectWidth
+        || pDstPixMap.sRect.iRectWidth > pDstPixMap.iStride[0])
+      return FALSE;
+  }
+  return TRUE;
+}
+
+IStrategy* CVpFrameWork::CreateStrategy (EMethods m_eMethod, int32_t iCpuFlag) {
+  // Factory for the per-method strategy objects; iCpuFlag is passed on to the
+  // constructor of the created class. Returns NULL when the method has no
+  // implementation here (colour-space conversion and any other id).
+  switch (m_eMethod) {
+  case METHOD_DENOISE:
+    return WelsDynamicCast (IStrategy*, new CDenoiser (iCpuFlag));
+
+  case METHOD_SCENE_CHANGE_DETECTION:
+    return WelsDynamicCast (IStrategy*, new CSceneChangeDetection (iCpuFlag));
+
+  case METHOD_DOWNSAMPLE:
+    return WelsDynamicCast (IStrategy*, new CDownsampling (iCpuFlag));
+
+  case METHOD_VAA_STATISTICS:
+    return WelsDynamicCast (IStrategy*, new CVAACalculation (iCpuFlag));
+
+  case METHOD_BACKGROUND_DETECTION:
+    return WelsDynamicCast (IStrategy*, new CBackgroundDetection (iCpuFlag));
+
+  case METHOD_ADAPTIVE_QUANT:
+    return WelsDynamicCast (IStrategy*, new CAdaptiveQuantization (iCpuFlag));
+
+  case METHOD_COMPLEXITY_ANALYSIS:
+    return WelsDynamicCast (IStrategy*, new CComplexityAnalysis (iCpuFlag));
+
+  case METHOD_IMAGE_ROTATE:
+    return WelsDynamicCast (IStrategy*, new CImageRotating (iCpuFlag));
+
+  case METHOD_COLORSPACE_CONVERT:
+    // not supported yet
+  default:
+    break;
+  }
+
+  return NULL;
+}
+
+WELSVP_NAMESPACE_END
--- a/processing/src/common/WelsFrameWork.h
+++ b/processing/src/common/WelsFrameWork.h
@@ -1,121 +1,130 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	    :  WelsFrameWork.h
- *
- * \brief	    :  framework of wels video processor class
- *
- * \date        :  2011/01/04
- *
- * \description :  
- *
- *************************************************************************************
- */
-
-#ifndef _WELSVP_WELSFRAMEWORK_H
-#define _WELSVP_WELSFRAMEWORK_H
-
-#include "../../interface/IWelsVP.h"
-#include "util.h"
-#include "thread.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-EResult CreateSpecificVpInterface (IWelsVP **ppCtx);
-EResult DestroySpecificVpInterface(IWelsVP *pCtx );
-
-EResult CreateSpecificVpInterface (IWelsVPc **ppCtx);
-EResult DestroySpecificVpInterface(IWelsVPc *pCtx );
-
-#define MAX_STRATEGY_NUM (METHOD_MASK - 1)
-
-class IStrategy : public IWelsVP
-{
-public:		
-	IStrategy() 
-	{
-		m_eMethod  = METHOD_NULL;
-		m_eFormat  = VIDEO_FORMAT_I420;
-		m_iIndex   = 0;		
-		m_bInit    = FALSE;
-	};
-
-	virtual ~IStrategy() {}
-
-public:
-	virtual EResult Init(int32_t iType, void *pCfg)  { return RET_SUCCESS; } 
-	virtual EResult Uninit(int32_t iType)              { return RET_SUCCESS; }
-	virtual EResult Flush(int32_t iType)               { return RET_SUCCESS; }		
-	virtual EResult Get(int32_t iType, void *pParam) { return RET_SUCCESS; } 
-	virtual EResult Set(int32_t iType, void *pParam) { return RET_SUCCESS; } 
-	virtual EResult SpecialFeature(int32_t iType, void *pIn, void *pOut) { return RET_SUCCESS; }
-	virtual EResult Process(int32_t iType, SPixMap *pSrc, SPixMap *pDst) = 0; 		
-
-public:
-	EMethods       m_eMethod;
-	EVideoFormat m_eFormat;
-	int32_t           m_iIndex;		
-	bool_t            m_bInit;			
-};
-
-class CVpFrameWork : public IWelsVP
-{
-public:
-	CVpFrameWork(uint32_t uiThreadsNum, EResult &ret);
-	~CVpFrameWork();
-
-public:
-	EResult Init(int32_t iType, void *pCfg); 
-
-	EResult Uninit(int32_t iType);
-
-	EResult Flush(int32_t iType);
-
-	EResult Process(int32_t iType, SPixMap *pSrc, SPixMap *pDst); 
-
-	EResult Get(int32_t iType, void *pParam); 
-
-	EResult Set(int32_t iType, void *pParam); 
-
-	EResult SpecialFeature(int32_t iType, void *pIn, void *pOut);
-
-private:
-	bool_t  CheckValid(EMethods eMethod, SPixMap &sSrc, SPixMap &sDst);
-	IStrategy *CreateStrategy(EMethods eMethod, int32_t iCpuFlag);	
-
-private:
-	IStrategy *m_pStgChain[MAX_STRATEGY_NUM];
-
-	WELS_MUTEX m_mutes;
-};
-
-WELSVP_NAMESPACE_END
-
-#endif
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	    :  WelsFrameWork.h
+ *
+ * \brief	    :  framework of wels video processor class
+ *
+ * \date        :  2011/01/04
+ *
+ * \description :
+ *
+ *************************************************************************************
+ */
+
+#ifndef _WELSVP_WELSFRAMEWORK_H
+#define _WELSVP_WELSFRAMEWORK_H
+
+#include "../../interface/IWelsVP.h"
+#include "util.h"
+#include "thread.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+EResult CreateSpecificVpInterface (IWelsVP** ppCtx);   // creates the IWelsVP object interface
+EResult DestroySpecificVpInterface (IWelsVP* pCtx);    // counterpart of the IWelsVP creator
+
+EResult CreateSpecificVpInterface (IWelsVPc** ppCtx);  // creates the IWelsVPc function-table interface
+EResult DestroySpecificVpInterface (IWelsVPc* pCtx);   // counterpart of the IWelsVPc creator
+
+#define MAX_STRATEGY_NUM (METHOD_MASK - 1)
+
+class IStrategy : public IWelsVP {
+ public:
+  // Defaults: bound to no method, I420 format, index 0, not initialised.
+  IStrategy()
+    : m_eMethod (METHOD_NULL), m_eFormat (VIDEO_FORMAT_I420), m_iIndex (0), m_bInit (FALSE) {
+  }
+
+  virtual ~IStrategy() {}
+
+ public:
+  // Default bodies for the optional operations: arguments are ignored and RET_SUCCESS is returned.
+  virtual EResult Init (int32_t iType, void* pCfg) {
+    return RET_SUCCESS;
+  }
+  virtual EResult Uninit (int32_t iType) {
+    return RET_SUCCESS;
+  }
+  virtual EResult Flush (int32_t iType) {
+    return RET_SUCCESS;
+  }
+  virtual EResult Get (int32_t iType, void* pParam) {
+    return RET_SUCCESS;
+  }
+  virtual EResult Set (int32_t iType, void* pParam) {
+    return RET_SUCCESS;
+  }
+  virtual EResult SpecialFeature (int32_t iType, void* pIn, void* pOut) {
+    return RET_SUCCESS;
+  }
+  // Process has no default body, so every concrete strategy must provide it.
+  virtual EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pDst) = 0;
+
+ public:
+  EMethods     m_eMethod;
+  EVideoFormat m_eFormat;
+  int32_t      m_iIndex;
+  bool_t       m_bInit;
+};
+
+class CVpFrameWork : public IWelsVP { // dispatches IWelsVP calls to one IStrategy per method
+ public:
+  CVpFrameWork (uint32_t uiThreadsNum, EResult& ret); // ret receives the construction status
+  ~CVpFrameWork();
+
+ public:
+  EResult Init (int32_t iType, void* pCfg); // iType selects the strategy; pCfg is passed through to it
+
+  EResult Uninit (int32_t iType); // iType selects the strategy
+
+  EResult Flush (int32_t iType); // framework-level implementation only returns success
+
+  EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pDst); // validates the pictures, then runs the strategy
+
+  EResult Get (int32_t iType, void* pParam); // pParam must not be NULL
+
+  EResult Set (int32_t iType, void* pParam); // pParam must not be NULL
+
+  EResult SpecialFeature (int32_t iType, void* pIn, void* pOut); // framework-level implementation only returns success
+
+ private:
+  bool_t  CheckValid (EMethods eMethod, SPixMap& sSrc, SPixMap& sDst); // format and rectangle sanity checks
+  IStrategy* CreateStrategy (EMethods eMethod, int32_t iCpuFlag); // may return NULL for unsupported methods
+
+ private:
+  IStrategy* m_pStgChain[MAX_STRATEGY_NUM]; // slot i holds the strategy for method id i + 1, or NULL
+
+  WELS_MUTEX m_mutes; // held around every call forwarded to a strategy
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- a/processing/src/common/WelsFrameWorkEx.cpp
+++ b/processing/src/common/WelsFrameWorkEx.cpp
@@ -1,109 +1,96 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "WelsFrameWork.h"
-
-///////////////////////////////////////////////////////////////////////
-
-WELSVP_NAMESPACE_BEGIN
-
-EResult Init (void *pCtx, int32_t iType, void *pCfg)
-{
-	return pCtx ? WelsStaticCast(IWelsVP *, pCtx)->Init(iType, pCfg) : RET_INVALIDPARAM;
-}
-EResult Uninit (void *pCtx, int32_t iType)
-{
-	return pCtx ? WelsStaticCast(IWelsVP *, pCtx)->Uninit(iType) : RET_INVALIDPARAM;
-}
-EResult Flush (void *pCtx, int32_t iType)
-{
-	return pCtx ? WelsStaticCast(IWelsVP *, pCtx)->Flush(iType) : RET_INVALIDPARAM;
-}
-EResult Process (void *pCtx, int32_t iType, SPixMap *pSrc, SPixMap *dst)
-{
-	return pCtx ? WelsStaticCast(IWelsVP *, pCtx)->Process(iType, pSrc, dst) : RET_INVALIDPARAM;
-}
-EResult Get (void *pCtx, int32_t iType, void *pParam)
-{
-	return pCtx ? WelsStaticCast(IWelsVP *, pCtx)->Get(iType, pParam) : RET_INVALIDPARAM;
-}
-EResult Set (void *pCtx, int32_t iType, void *pParam)
-{
-	return pCtx ? WelsStaticCast(IWelsVP *, pCtx)->Set(iType, pParam) : RET_INVALIDPARAM;
-}
-EResult SpecialFeature (void *pCtx, int32_t iType, void *pIn, void *pOut)
-{
-	return pCtx ? WelsStaticCast(IWelsVP *, pCtx)->SpecialFeature(iType, pIn, pOut) : RET_INVALIDPARAM;
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-EResult CreateSpecificVpInterface(IWelsVPc **pCtx)
-{
-	EResult  ret     = RET_FAILED;
-	IWelsVP *pWelsVP = NULL;
-
-	ret = CreateSpecificVpInterface(&pWelsVP);
-	if (ret == RET_SUCCESS)
-	{
-		IWelsVPc *pVPc = new IWelsVPc;
-		if (pVPc)
-		{
-			pVPc->Init    = Init;
-			pVPc->Uninit  = Uninit;
-			pVPc->Flush   = Flush;
-			pVPc->Process = Process;
-			pVPc->Get     = Get;
-			pVPc->Set     = Set;
-			pVPc->SpecialFeature = SpecialFeature;
-			pVPc->pCtx       = WelsStaticCast(void *, pWelsVP);
-			*pCtx            = pVPc;
-		}
-		else 
-			ret = RET_OUTOFMEMORY;
-	}
-
-	return ret;
-}
-
-EResult DestroySpecificVpInterface(IWelsVPc *pCtx)
-{
-	if (pCtx)
-	{
-		DestroySpecificVpInterface(WelsStaticCast(IWelsVP *, pCtx->pCtx));
-		_SafeDelete(pCtx);
-	}
-
-	return RET_SUCCESS;
-}
-
-WELSVP_NAMESPACE_END
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "WelsFrameWork.h"
+
+///////////////////////////////////////////////////////////////////////
+
+WELSVP_NAMESPACE_BEGIN
+
+EResult Init (void* pCtx, int32_t iType, void* pCfg) { // C entry: pCtx is the wrapped IWelsVP
+  return (pCtx == NULL) ? RET_INVALIDPARAM : WelsStaticCast (IWelsVP*, pCtx)->Init (iType, pCfg);
+}
+EResult Uninit (void* pCtx, int32_t iType) { // NULL pCtx is rejected, otherwise forwarded
+  return (pCtx == NULL) ? RET_INVALIDPARAM : WelsStaticCast (IWelsVP*, pCtx)->Uninit (iType);
+}
+EResult Flush (void* pCtx, int32_t iType) { // NULL pCtx is rejected, otherwise forwarded
+  return (pCtx == NULL) ? RET_INVALIDPARAM : WelsStaticCast (IWelsVP*, pCtx)->Flush (iType);
+}
+EResult Process (void* pCtx, int32_t iType, SPixMap* pSrc, SPixMap* dst) { // NULL pCtx is rejected
+  return (pCtx == NULL) ? RET_INVALIDPARAM : WelsStaticCast (IWelsVP*, pCtx)->Process (iType, pSrc, dst);
+}
+EResult Get (void* pCtx, int32_t iType, void* pParam) { // NULL pCtx is rejected, otherwise forwarded
+  return (pCtx == NULL) ? RET_INVALIDPARAM : WelsStaticCast (IWelsVP*, pCtx)->Get (iType, pParam);
+}
+EResult Set (void* pCtx, int32_t iType, void* pParam) { // NULL pCtx is rejected, otherwise forwarded
+  return (pCtx == NULL) ? RET_INVALIDPARAM : WelsStaticCast (IWelsVP*, pCtx)->Set (iType, pParam);
+}
+EResult SpecialFeature (void* pCtx, int32_t iType, void* pIn, void* pOut) { // NULL pCtx is rejected
+  return (pCtx == NULL) ? RET_INVALIDPARAM : WelsStaticCast (IWelsVP*, pCtx)->SpecialFeature (iType, pIn, pOut);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+EResult CreateSpecificVpInterface (IWelsVPc** pCtx) {
+  // Creates an IWelsVP object and an IWelsVPc function table that forwards to it via pVPc->pCtx.
+  IWelsVP* pWelsVP = NULL;
+  EResult  ret     = CreateSpecificVpInterface (&pWelsVP);
+  if (ret != RET_SUCCESS)
+    return ret;
+  IWelsVPc* pVPc = new IWelsVPc;
+  if (!pVPc) {
+    // Only reachable where operator new returns NULL instead of throwing; do not leak pWelsVP.
+    DestroySpecificVpInterface (pWelsVP);
+    return RET_OUTOFMEMORY;
+  }
+  pVPc->Init    = Init;
+  pVPc->Uninit  = Uninit;
+  pVPc->Flush   = Flush;
+  pVPc->Process = Process;
+  pVPc->Get     = Get;
+  pVPc->Set     = Set;
+  pVPc->SpecialFeature = SpecialFeature;
+  pVPc->pCtx    = WelsStaticCast (void*, pWelsVP);
+  *pCtx         = pVPc;
+  return ret;
+}
+
+EResult DestroySpecificVpInterface (IWelsVPc* pCtx) {
+  // Destroys the wrapped IWelsVP first, then the wrapper itself; NULL is accepted.
+  if (pCtx == NULL)
+    return RET_SUCCESS;
+  DestroySpecificVpInterface (WelsStaticCast (IWelsVP*, pCtx->pCtx));
+  _SafeDelete (pCtx);
+  return RET_SUCCESS;
+}
+
+WELSVP_NAMESPACE_END
--- a/processing/src/common/WelsVP.def
+++ b/processing/src/common/WelsVP.def
@@ -1,36 +1,36 @@
-;*!
-;* \copy
-;*     Copyright (c)  2011-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-
-LIBRARY		    welsvp.dll
-EXPORTS
-                CreateVpInterface    PRIVATE
-                DestroyVpInterface   PRIVATE      
\ No newline at end of file
+;*!
+;* \copy
+;*     Copyright (c)  2011-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+
+LIBRARY		    welsvp.dll
+EXPORTS
+                CreateVpInterface    PRIVATE
+                DestroyVpInterface   PRIVATE
\ No newline at end of file
--- a/processing/src/common/WelsVP.rc
+++ b/processing/src/common/WelsVP.rc
@@ -7,7 +7,7 @@
 //
 // Generated from the TEXTINCLUDE 2 resource.
 //
-#include "afxres.h"
+#include "windows.h"
 
 /////////////////////////////////////////////////////////////////////////////
 #undef APSTUDIO_READONLY_SYMBOLS
@@ -27,18 +27,18 @@
 // TEXTINCLUDE
 //
 
-1 TEXTINCLUDE 
+1 TEXTINCLUDE
 BEGIN
     "resource.h\0"
 END
 
-2 TEXTINCLUDE 
+2 TEXTINCLUDE
 BEGIN
-    "#include ""afxres.h""\r\n"
+    "#include ""windows.h""\r\n"
     "\0"
 END
 
-3 TEXTINCLUDE 
+3 TEXTINCLUDE
 BEGIN
     "\r\n"
     "\0"
--- a/processing/src/common/cpu.cpp
+++ b/processing/src/common/cpu.cpp
@@ -1,213 +1,196 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	cpu.c
- *
- * \brief	CPU compatibility detection
- *
- * \date	04/29/2009 Created
- *
- *************************************************************************************
- */
-
-#include "util.h"
-#include "cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-#define    CPU_Vender_AMD    "AuthenticAMD"
-#define    CPU_Vender_INTEL  "GenuineIntel"
-#define    CPU_Vender_CYRIX  "CyrixInstead"
-
-
-#if defined(X86_ASM)
-
-uint32_t WelsCPUFeatureDetect( int32_t *pNumberOfLogicProcessors )
-{
-    uint32_t uiCPU = 0;	
-    uint32_t uiFeatureA = 0, uiFeatureB = 0, uiFeatureC = 0, uiFeatureD = 0;
-	int32_t  CacheLineSize = 0;
-	int8_t   chVenderName[16] = { 0 };	
-	
-    if( !WelsCPUIdVerify() )
-    {
-        /* cpuid is not supported in cpu */
-        return 0;
-    }
-	
-	WelsCPUId( 0, &uiFeatureA, (uint32_t*)&chVenderName[0],(uint32_t*)&chVenderName[8],(uint32_t*)&chVenderName[4] );
-    if( uiFeatureA == 0 )
-    {
-		/* maximum input value for basic cpuid information */
-        return 0;
-    }
-	
-	WelsCPUId( 1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD );
-    if( (uiFeatureD & 0x00800000) == 0 )
-    {
-        /* Basic MMX technology is not support in cpu, mean nothing for us so return here */
-        return 0;
-    }
-	
-    uiCPU = WELS_CPU_MMX;
-    if( uiFeatureD & 0x02000000 )
-    {
-        /* SSE technology is identical to AMD MMX extensions */
-        uiCPU |= WELS_CPU_MMXEXT|WELS_CPU_SSE;
-    }
-    if( uiFeatureD & 0x04000000 )
-    {
-        /* SSE2 support here */
-        uiCPU |= WELS_CPU_SSE2;
-    }
-	if ( uiFeatureD & 0x00000001 )
-	{
-		/* x87 FPU on-chip checking */
-		uiCPU |= WELS_CPU_FPU;
-	}
-	if ( uiFeatureD & 0x00008000 )
-	{
-		/* CMOV instruction checking */
-		uiCPU |= WELS_CPU_CMOV;
-	}
-	if ( !strcmp((const str_t*)chVenderName,CPU_Vender_INTEL) )	// confirmed_safe_unsafe_usage
-	{
-		if ( uiFeatureD & 0x10000000 )
-		{
-			/* Multi-Threading checking: contains of multiple logic processors */
-			uiCPU |= WELS_CPU_HTT;
-		}
-	}	
-
-	if( uiFeatureC & 0x00000001 ){
-		/* SSE3 support here */
-		uiCPU |= WELS_CPU_SSE3;
-	}
-	if( uiFeatureC & 0x00000200 ){
-		/* SSSE3 support here */
-		uiCPU |= WELS_CPU_SSSE3;
-	}
-	if( uiFeatureC & 0x00080000 ){
-		/* SSE4.1 support here, 45nm Penryn processor */
-		uiCPU |= WELS_CPU_SSE41; 
-	}
-	if( uiFeatureC & 0x00100000 ){
-		/* SSE4.2 support here, next generation Nehalem processor */
-		uiCPU |= WELS_CPU_SSE42;
-	}
-	if ( WelsCPUSupportAVX( uiFeatureA, uiFeatureC ) )	// 
-	{
-		/* AVX supported */
-		uiCPU |= WELS_CPU_AVX;
-	}
-	if ( WelsCPUSupportFMA( uiFeatureA, uiFeatureC ) )	// 
-	{
-		/* AVX FMA supported */
-		uiCPU |= WELS_CPU_FMA;
-	}
-	if ( uiFeatureC & 0x02000000 )
-	{
-		/* AES checking */
-		uiCPU |= WELS_CPU_AES;
-	}
-	if ( uiFeatureC & 0x00400000 )
-	{
-		/* MOVBE checking */
-		uiCPU |= WELS_CPU_MOVBE;
-	}
-
-	if ( pNumberOfLogicProcessors != NULL )
-	{
-		// HTT enabled on chip
-		*pNumberOfLogicProcessors = (uiFeatureB & 0x00ff0000) >> 16; // feature bits: 23-16 on returned EBX		
-	}	
-	
-    WelsCPUId( 0x80000000, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD );
-
-	if( (!strcmp((const str_t*)chVenderName,CPU_Vender_AMD)) && (uiFeatureA>=0x80000001) ){	// confirmed_safe_unsafe_usage
-		WelsCPUId(0x80000001, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD );
-		if( uiFeatureD&0x00400000 ){
-			uiCPU |= WELS_CPU_MMXEXT;
-		}
-		if( uiFeatureD&0x80000000 ){
-			uiCPU |= WELS_CPU_3DNOW;
-		}
-	}
-
-	if( !strcmp((const str_t*)chVenderName,CPU_Vender_INTEL) ){	// confirmed_safe_unsafe_usage
-		int32_t  family, model;
-
-		WelsCPUId(1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
-		family = ((uiFeatureA>>8)&0xf) + ((uiFeatureA>>20)&0xff);
-        model  = ((uiFeatureA>>4)&0xf) + ((uiFeatureA>>12)&0xf0);
-
-		if( (family==6) && (model==9 || model==13 || model==14) ){
-			uiCPU &= ~(WELS_CPU_SSE2|WELS_CPU_SSE3);
-		}
-	}
-
-	// get cache line size
-	if( (!strcmp((const str_t*)chVenderName,CPU_Vender_INTEL)) || !(strcmp((const str_t*)chVenderName,CPU_Vender_CYRIX)) ){	// confirmed_safe_unsafe_usage
-		WelsCPUId(1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
-
-		CacheLineSize = (uiFeatureB&0xff00)>>5;	// ((clflush_line_size >> 8) << 3), CLFLUSH_line_size * 8 = CacheLineSize_in_byte
-
-		if( CacheLineSize == 128 ){
-			uiCPU |= WELS_CPU_CACHELINE_128;
-		}
-		else if( CacheLineSize == 64 ){
-			uiCPU |= WELS_CPU_CACHELINE_64;
-		}
-		else if( CacheLineSize == 32 ){
-			uiCPU |= WELS_CPU_CACHELINE_32;
-		}
-		else if( CacheLineSize == 16 ){
-			uiCPU |= WELS_CPU_CACHELINE_16;
-		}
-	}
-	
-    return uiCPU;
-}
-
-
-void WelsCPURestore( const uint32_t kuiCPU )
-{
-    if( kuiCPU & (WELS_CPU_MMX|WELS_CPU_MMXEXT|WELS_CPU_3DNOW|WELS_CPU_3DNOWEXT) )
-    {
-        WelsEmms();
-    }
-}
-
-#endif
-
-
-WELSVP_NAMESPACE_END
-
-
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	cpu.cpp
+ *
+ * \brief	CPU compatibility detection
+ *
+ * \date	04/29/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#include "util.h"
+#include "cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+#define    CPU_Vender_AMD    "AuthenticAMD"
+#define    CPU_Vender_INTEL  "GenuineIntel"
+#define    CPU_Vender_CYRIX  "CyrixInstead"
+
+
+#if defined(X86_ASM)
+
+uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) { // builds a WELS_CPU_* bitmask from cpuid; returns 0 if cpuid or MMX is missing
+  uint32_t uiCPU = 0; // WELS_CPU_* flags collected so far
+  uint32_t uiFeatureA = 0, uiFeatureB = 0, uiFeatureC = 0, uiFeatureD = 0; // outputs of the latest WelsCPUId call
+  int32_t  CacheLineSize = 0;
+  int8_t   chVenderName[16] = { 0 }; // 12 vendor bytes are written below; the zero fill leaves it NUL-terminated
+
+  if (!WelsCPUIdVerify()) {
+    /* the cpuid instruction is not supported by this cpu */
+    return 0;
+  }
+
+  WelsCPUId (0, &uiFeatureA, (uint32_t*)&chVenderName[0], (uint32_t*)&chVenderName[8], (uint32_t*)&chVenderName[4]);
+  if (uiFeatureA == 0) {
+    /* leaf 0 reported 0 as the maximum input value for basic cpuid information, so leaf 1 cannot be queried */
+    return 0;
+  }
+
+  WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD); // leaf 1: feature bits tested below
+  if ((uiFeatureD & 0x00800000) == 0) {
+    /* Basic MMX technology is not supported by this cpu; nothing else is useful to us, so return here */
+    return 0;
+  }
+
+  uiCPU = WELS_CPU_MMX;
+  if (uiFeatureD & 0x02000000) {
+    /* SSE present; per the original note it also covers the AMD MMX extensions, so both flags are set */
+    uiCPU |= WELS_CPU_MMXEXT | WELS_CPU_SSE;
+  }
+  if (uiFeatureD & 0x04000000) {
+    /* SSE2 support here */
+    uiCPU |= WELS_CPU_SSE2;
+  }
+  if (uiFeatureD & 0x00000001) {
+    /* x87 FPU on-chip checking */
+    uiCPU |= WELS_CPU_FPU;
+  }
+  if (uiFeatureD & 0x00008000) {
+    /* CMOV instruction checking */
+    uiCPU |= WELS_CPU_CMOV;
+  }
+  if (!strcmp ((const str_t*)chVenderName, CPU_Vender_INTEL)) {	// confirmed_safe_unsafe_usage
+    if (uiFeatureD & 0x10000000) {
+      /* Multi-threading (HTT) check: multiple logical processors; only recorded for the Intel vendor string */
+      uiCPU |= WELS_CPU_HTT;
+    }
+  }
+
+  if (uiFeatureC & 0x00000001) {
+    /* SSE3 support here */
+    uiCPU |= WELS_CPU_SSE3;
+  }
+  if (uiFeatureC & 0x00000200) {
+    /* SSSE3 support here */
+    uiCPU |= WELS_CPU_SSSE3;
+  }
+  if (uiFeatureC & 0x00080000) {
+    /* SSE4.1 support here, 45nm Penryn processor */
+    uiCPU |= WELS_CPU_SSE41;
+  }
+  if (uiFeatureC & 0x00100000) {
+    /* SSE4.2 support here, next generation Nehalem processor */
+    uiCPU |= WELS_CPU_SSE42;
+  }
+  if (WelsCPUSupportAVX (uiFeatureA, uiFeatureC)) {	// helper receives the leaf-1 eax and ecx values
+    /* AVX supported */
+    uiCPU |= WELS_CPU_AVX;
+  }
+  if (WelsCPUSupportFMA (uiFeatureA, uiFeatureC)) {	// helper receives the leaf-1 eax and ecx values
+    /* AVX FMA supported */
+    uiCPU |= WELS_CPU_FMA;
+  }
+  if (uiFeatureC & 0x02000000) {
+    /* AES checking */
+    uiCPU |= WELS_CPU_AES;
+  }
+  if (uiFeatureC & 0x00400000) {
+    /* MOVBE checking */
+    uiCPU |= WELS_CPU_MOVBE;
+  }
+
+  if (pNumberOfLogicProcessors != NULL) {
+    // optional out-parameter: logical processor count, stored whether or not the HTT flag was set above
+    *pNumberOfLogicProcessors = (uiFeatureB & 0x00ff0000) >> 16; // feature bits: 23-16 on returned EBX
+  }
+
+  WelsCPUId (0x80000000, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD); // uiFeatureA is checked against 0x80000001 below
+
+  if ((!strcmp ((const str_t*)chVenderName, CPU_Vender_AMD))
+      && (uiFeatureA >= 0x80000001)) {	// confirmed_safe_unsafe_usage
+    WelsCPUId (0x80000001, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+    if (uiFeatureD & 0x00400000) {
+      uiCPU |= WELS_CPU_MMXEXT;
+    }
+    if (uiFeatureD & 0x80000000) {
+      uiCPU |= WELS_CPU_3DNOW;
+    }
+  }
+
+  if (!strcmp ((const str_t*)chVenderName, CPU_Vender_INTEL)) {	// confirmed_safe_unsafe_usage
+    int32_t  family, model;
+
+    WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+    family = ((uiFeatureA >> 8) & 0xf) + ((uiFeatureA >> 20) & 0xff); // bits 11-8 plus bits 27-20
+    model  = ((uiFeatureA >> 4) & 0xf) + ((uiFeatureA >> 12) & 0xf0); // bits 7-4 plus (bits 19-16 << 4)
+
+    if ((family == 6) && (model == 9 || model == 13 || model == 14)) { // NOTE(review): presumably parts with slow SSE2/SSE3 - confirm
+      uiCPU &= ~ (WELS_CPU_SSE2 | WELS_CPU_SSE3);
+    }
+  }
+
+  // get cache line size (only attempted for the Intel and Cyrix vendor strings)
+  if ((!strcmp ((const str_t*)chVenderName, CPU_Vender_INTEL))
+      || ! (strcmp ((const str_t*)chVenderName, CPU_Vender_CYRIX))) {	// confirmed_safe_unsafe_usage
+    WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+
+    CacheLineSize = (uiFeatureB & 0xff00) >>
+                    5;	// ((clflush_line_size >> 8) << 3), CLFLUSH_line_size * 8 = CacheLineSize_in_byte
+
+    if (CacheLineSize == 128) {
+      uiCPU |= WELS_CPU_CACHELINE_128;
+    } else if (CacheLineSize == 64) {
+      uiCPU |= WELS_CPU_CACHELINE_64;
+    } else if (CacheLineSize == 32) {
+      uiCPU |= WELS_CPU_CACHELINE_32;
+    } else if (CacheLineSize == 16) {
+      uiCPU |= WELS_CPU_CACHELINE_16;
+    }
+  }
+
+  return uiCPU;
+}
+
+
+void WelsCPURestore (const uint32_t kuiCPU) {   // calls WelsEmms() when any MMX/3DNow-family flag is set in kuiCPU
+  const uint32_t kuiMmxFamily = WELS_CPU_MMX | WELS_CPU_MMXEXT | WELS_CPU_3DNOW | WELS_CPU_3DNOWEXT;
+  if ((kuiCPU & kuiMmxFamily) != 0)
+    WelsEmms();
+}
+
+#endif
+
+
+WELSVP_NAMESPACE_END
+
+
--- a/processing/src/common/cpu.h
+++ b/processing/src/common/cpu.h
@@ -1,52 +1,52 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	cpu.h
- *
- * \brief	CPU feature compatibility detection
- *
- * \date	04/29/2009 Created
- *
- *************************************************************************************
- */
-
-#ifndef _WELSVP_CPU_H
-#define _WELSVP_CPU_H
-
-#include "typedef.h"
-
-WELSVP_NAMESPACE_BEGIN
-
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	cpu.h
+ *
+ * \brief	CPU feature compatibility detection
+ *
+ * \date	04/29/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#ifndef _WELSVP_CPU_H
+#define _WELSVP_CPU_H
+
+#include "typedef.h"
+
+WELSVP_NAMESPACE_BEGIN
+
 /*
  *	WELS CPU feature flags
- */ 
+ */
 #define WELS_CPU_MMX        0x00000001    /* mmx */
 #define WELS_CPU_MMXEXT     0x00000002    /* mmx-ext*/
 #define WELS_CPU_SSE        0x00000004    /* sse */
@@ -76,27 +76,27 @@
 #define WELS_CPU_CACHELINE_32    0x20000000    /* CacheLine Size 32 */
 #define WELS_CPU_CACHELINE_64    0x40000000    /* CacheLine Size 64 */
 #define WELS_CPU_CACHELINE_128   0x80000000    /* CacheLine Size 128 */
-
-/*
- *	Interfaces for CPU core feature detection as below
- */
-
-#ifdef X86_ASM
-WELSVP_EXTERN_C_BEGIN
-
-int32_t WelsCPUIdVerify();
-
-void  WelsCPUId( uint32_t uiIndex, uint32_t *pFeatureA, uint32_t *pFeatureB, uint32_t *pFeatureC, uint32_t *pFeatureD );
-int32_t WelsCPUSupportAVX( uint32_t eax, uint32_t ecx );
-int32_t WelsCPUSupportFMA( uint32_t eax, uint32_t ecx );
-
-void  WelsEmms();
-
-WELSVP_EXTERN_C_END
-#endif
-
-uint32_t WelsCPUFeatureDetect( int32_t *pNumberOfLogicProcessors );
-
-WELSVP_NAMESPACE_END
-
-#endif
+
+/*
+ *	Interfaces for CPU core feature detection as below
+ */
+
+#ifdef X86_ASM
+WELSVP_EXTERN_C_BEGIN
+
+int32_t WelsCPUIdVerify(); // non-zero when cpuid can be used; cpu.cpp gives up on detection when this returns 0
+
+void  WelsCPUId (uint32_t uiIndex, uint32_t* pFeatureA, uint32_t* pFeatureB, uint32_t* pFeatureC, uint32_t* pFeatureD); // cpuid query for leaf uiIndex; cpu.cpp reads A..D as EAX, EBX, ECX, EDX
+int32_t WelsCPUSupportAVX (uint32_t eax, uint32_t ecx); // non-zero => WELS_CPU_AVX is set; called with the leaf-1 values
+int32_t WelsCPUSupportFMA (uint32_t eax, uint32_t ecx); // non-zero => WELS_CPU_FMA is set; called with the leaf-1 values
+
+void  WelsEmms(); // invoked by WelsCPURestore when MMX/3DNow-family flags are present
+
+WELSVP_EXTERN_C_END
+#endif
+
+uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors);
+
+WELSVP_NAMESPACE_END
+
+#endif
--- a/processing/src/common/memory.cpp
+++ b/processing/src/common/memory.cpp
@@ -1,128 +1,117 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "memory.h"
-
-WELSVP_NAMESPACE_BEGIN
-/////////////////////////////////////////////////////////////////////////////////
-
- void * WelsMalloc( const uint32_t kuiSize, str_t *pTag )
- {
-	 const int32_t kiSizeVoidPointer	= sizeof( void ** );
-	 const int32_t kiSizeInt32		= sizeof( int32_t );
-	 const int32_t kiAlignedBytes	= ALIGNBYTES - 1;
-
-	 uint8_t* pBuf		= (uint8_t *) ::malloc( kuiSize + kiAlignedBytes + kiSizeVoidPointer + kiSizeInt32 );
-	 uint8_t* pAlignedBuf = NULL;
-
-	 if ( NULL == pBuf )
-		 return NULL;
-
-	 // to fill zero values
-	 WelsMemset( pBuf, 0, kuiSize + kiAlignedBytes + kiSizeVoidPointer + kiSizeInt32 );
-
-	 pAlignedBuf = pBuf + kiAlignedBytes + kiSizeVoidPointer + kiSizeInt32;
-	 pAlignedBuf -= WelsCastFromPointer(pAlignedBuf) & kiAlignedBytes;
-	 *( (void **) ( pAlignedBuf - kiSizeVoidPointer ) ) = pBuf;
-	 *( (int32_t *) ( pAlignedBuf - (kiSizeVoidPointer + kiSizeInt32) ) ) = kuiSize;
-
-	 return (pAlignedBuf);
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- void WelsFree( void* pPointer, str_t *pTag )
- {
-	 if( pPointer )
-	 {
-		 ::free( *( ( ( void **) pPointer ) - 1 ) );
-	 }
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- void* InternalReallocate( void *pPointer, const uint32_t kuiSize, str_t *pTag )
- {
-	 uint32_t iOldSize = 0;
-	 uint8_t* pNew = NULL;
-	 if ( pPointer != NULL ) 
-		 iOldSize = *( (int32_t*) ( (uint8_t*) pPointer - sizeof( void ** ) - sizeof( int32_t ) ) ); 
-	 else
-		 return WelsMalloc( kuiSize, pTag );
-
-	 pNew = (uint8_t*)WelsMalloc( kuiSize, pTag );
-	 if (0 == pNew)
-	 {
-		 if (iOldSize > 0 && kuiSize > 0 && iOldSize >= kuiSize)
-			 return (pPointer);
-		 return 0;
-	 }
-	 else 
-		 if( iOldSize > 0 && kuiSize > 0 )
-			 memcpy( pNew, pPointer, ( iOldSize < kuiSize ) ? iOldSize : kuiSize );
-		 else
-			 return 0;
-
-	 WelsFree( pPointer, pTag );
-	 return (pNew);
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- void* WelsRealloc( void *pPointer, uint32_t *pRealSize, const uint32_t kuiSize, str_t *pTag )
- {
-	 const uint32_t kuiOldSize = *pRealSize;
-	 uint32_t kuiNewSize = 0;
-	 void *pLocalPointer = NULL;
-	 if ( kuiOldSize >= kuiSize )	// large enough of original block, so do nothing
-		 return (pPointer);
-
-	 // new request
-	 kuiNewSize = kuiSize + 15;
-	 kuiNewSize -= (kuiNewSize & 15);
-	 kuiNewSize += 32;
-
-	 pLocalPointer = InternalReallocate( pPointer, kuiNewSize, pTag );
-	 if ( NULL != pLocalPointer )
-	 {
-		 *pRealSize	= kuiNewSize;
-		 return (pLocalPointer);
-	 }
-	 else
-	 {
-		 return NULL;
-	 }
-
-	 return NULL;	// something wrong
- }
-
- WELSVP_NAMESPACE_END
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "memory.h"
+
+WELSVP_NAMESPACE_BEGIN
+/////////////////////////////////////////////////////////////////////////////////
+
+// Returns kuiSize zero-filled bytes whose address has the low (ALIGNBYTES - 1) bits cleared, or NULL if malloc
+// fails or the padded size would overflow 32 bits. The raw malloc pointer and kuiSize sit just below the result.
+void* WelsMalloc (const uint32_t kuiSize, str_t* pTag) {
+  const int32_t kiSizeVoidPointer = sizeof (void**);
+  const int32_t kiSizeInt32       = sizeof (int32_t);
+  const int32_t kiAlignedBytes    = ALIGNBYTES - 1;
+  const uint32_t kuiOverhead      = kiAlignedBytes + kiSizeVoidPointer + kiSizeInt32;   // padding + bookkeeping
+
+  if (kuiSize > 0xFFFFFFFFu - kuiOverhead)   // fix: kuiSize + kuiOverhead used to wrap and under-allocate
+    return NULL;
+  uint8_t* pBuf = (uint8_t*) ::malloc (kuiSize + kuiOverhead);
+  if (NULL == pBuf)
+    return NULL;
+  WelsMemset (pBuf, 0, kuiSize + kuiOverhead);   // to fill zero values
+
+  uint8_t* pAlignedBuf = pBuf + kuiOverhead;
+  pAlignedBuf -= WelsCastFromPointer (pAlignedBuf) & kiAlignedBytes;   // round the address down
+  * ((void**) (pAlignedBuf - kiSizeVoidPointer)) = pBuf;   // read back by WelsFree
+  * ((int32_t*) (pAlignedBuf - (kiSizeVoidPointer + kiSizeInt32))) = kuiSize;   // read back by InternalReallocate
+  return (pAlignedBuf);
+}
+
+/////////////////////////////////////////////////////////////////////////////
+
+void WelsFree (void* pPointer, str_t* pTag) {   // releases a block laid out by WelsMalloc; NULL is ignored
+  if (NULL == pPointer)
+    return;
+  ::free (((void**) pPointer)[-1]);   // the raw malloc pointer is stored one pointer slot below pPointer
+}
+
+/////////////////////////////////////////////////////////////////////////////
+
+// Allocates a new WelsMalloc block of kuiSize bytes, copies min(old size, kuiSize) bytes from pPointer
+// into it and frees pPointer. A NULL pPointer simply becomes WelsMalloc (kuiSize, pTag).
+// Returns the new block; pPointer itself when allocation fails but the old block is already large
+// enough; otherwise 0, in which case pPointer is left untouched.
+void* InternalReallocate (void* pPointer, const uint32_t kuiSize, str_t* pTag) {
+  if (NULL == pPointer)
+    return WelsMalloc (kuiSize, pTag);
+  // WelsMalloc keeps the requested size right below the saved raw pointer
+  const uint32_t kuiOldSize = * ((int32_t*) ((uint8_t*) pPointer - sizeof (void**) - sizeof (int32_t)));
+  uint8_t* pNew = (uint8_t*)WelsMalloc (kuiSize, pTag);
+  if (NULL == pNew)   // keep serving the old block if it already covers the request
+    return (kuiOldSize > 0 && kuiSize > 0 && kuiOldSize >= kuiSize) ? pPointer : 0;
+  if (0 == kuiOldSize || 0 == kuiSize) {
+    WelsFree (pNew, pTag);   // fix: the freshly allocated block used to be leaked on this path
+    return 0;
+  }
+
+  memcpy (pNew, pPointer, (kuiOldSize < kuiSize) ? kuiOldSize : kuiSize);
+  WelsFree (pPointer, pTag);
+  return (pNew);
+}
+
+/////////////////////////////////////////////////////////////////////////////
+
+// Ensures the block at pPointer can hold at least kuiSize bytes.
+//   pPointer  - current block, handed on to InternalReallocate when growth is needed
+//   pRealSize - in: current capacity in bytes; out: new capacity (written only on success)
+//   kuiSize   - number of bytes the caller needs
+//   pTag      - forwarded unchanged to InternalReallocate
+// Returns pPointer itself when *pRealSize already covers kuiSize, the reallocated block
+// after a successful grow, or NULL when InternalReallocate fails.
+void* WelsRealloc (void* pPointer, uint32_t* pRealSize, const uint32_t kuiSize, str_t* pTag) {
+  if (*pRealSize >= kuiSize)   // large enough of original block, so do nothing
+    return (pPointer);
+
+  // new request: round up to a multiple of 16, then add 32 extra bytes
+  const uint32_t kuiRounded  = (kuiSize + 15) & ~ (uint32_t)15;
+  const uint32_t kuiCapacity = kuiRounded + 32;
+
+  void* pGrown = InternalReallocate (pPointer, kuiCapacity, pTag);
+  if (NULL == pGrown)
+    return NULL;
+
+  *pRealSize = kuiCapacity;
+  return (pGrown);
+}
+
+WELSVP_NAMESPACE_END
--- a/processing/src/common/memory.h
+++ b/processing/src/common/memory.h
@@ -1,113 +1,110 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	    :  memory.h
- *
- * \brief	    :  memory definition for wels video processor class
- *
- * \date        :  2011/02/22
- *
- * \description :  
- *
- *************************************************************************************
- */
-
-#ifndef _WELSVP_MEMORY_H
-#define _WELSVP_MEMORY_H
-
-#include "util.h"
-#include "typedef.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-inline_t void * WelsMemset( void * pPointer, int32_t iValue, uint32_t uiSize)
-{
-	return ::memset(pPointer, iValue, uiSize);
-}
-
-inline_t void * WelsMemcpy( void *pDst, const void *kpSrc, uint32_t uiSize)
-{
-	return ::memcpy(pDst, kpSrc, uiSize);
-}
-
-inline_t int32_t WelsMemcmp( const void *kpBuf1, const void *kpBuf2, uint32_t uiSize)
-{
-	return ::memcmp( kpBuf1, kpBuf2, uiSize);
-}
-
-/*! 
-*************************************************************************************
-* \brief	malloc with zero filled utilization in Wels
-*
-* \param 	i_size	uiSize of memory block required
-*
-* \return	allocated memory pointer exactly, failed in case of NULL return
-*
-* \note	N/A
-*************************************************************************************
-*/
-void * WelsMalloc( const uint32_t kuiSize, str_t *pTag = NULL );
-
-/*! 
-*************************************************************************************
-* \brief	free utilization in Wels
-*
-* \param 	p	data pointer to be free. 
-*			i.e, uint8_t *p = actual data to be free, argv = &p.
-*
-* \return	NONE
-*
-* \note	N/A
-*************************************************************************************
-*/
-void WelsFree( void * pPointer, str_t *pTag = NULL );
-
-/*! 
-*************************************************************************************
-* \brief	reallocation in Wels. Do nothing and continue using old block 
-*		in case the block is large enough currently
-*
-* \param 	p	    memory block required in old time
-* \param	i_size	new uiSize of memory block requested
-* \param	sz_real	pointer to the old uiSize of memory block
-*
-* \return	reallocated memory pointer exactly, failed in case of NULL return
-*
-* \note	N/A
-*************************************************************************************
-*/
-void * WelsRealloc( void  *pPointer, uint32_t *pRealSize, const uint32_t kuiSize, str_t *pTag = NULL );
-
-//////////////////////////////////////////////////////////////////////////////////////
-WELSVP_NAMESPACE_END
-
-#endif
-
-
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	    :  memory.h
+ *
+ * \brief	    :  memory definition for wels video processor class
+ *
+ * \date        :  2011/02/22
+ *
+ * \description :
+ *
+ *************************************************************************************
+ */
+
+#ifndef _WELSVP_MEMORY_H
+#define _WELSVP_MEMORY_H
+
+#include "util.h"
+#include "typedef.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+inline_t void* WelsMemset (void* pPointer, int32_t iValue, uint32_t uiSize) {
+  return ::memset (pPointer, iValue, uiSize);  // plain forward to the global memset, which returns pPointer
+}
+
+inline_t void* WelsMemcpy (void* pDst, const void* kpSrc, uint32_t uiSize) {
+  return ::memcpy (pDst, kpSrc, uiSize);  // plain forward to the global memcpy (regions must not overlap); returns pDst
+}
+
+inline_t int32_t WelsMemcmp (const void* kpBuf1, const void* kpBuf2, uint32_t uiSize) {
+  return ::memcmp (kpBuf1, kpBuf2, uiSize);  // plain forward to the global memcmp: <0, 0 or >0
+}
+
+/*!
+*************************************************************************************
+* \brief	malloc with zero filled utilization in Wels
+*
+* \param 	kuiSize	size in bytes of memory block required
+*
+* \return	pointer to the allocated memory block, or NULL on failure
+*
+* \note	N/A
+*************************************************************************************
+*/
+void* WelsMalloc (const uint32_t kuiSize, str_t* pTag = NULL);
+
+/*!
+*************************************************************************************
+* \brief	free utilization in Wels
+*
+* \param 	pPointer	memory block to be freed, passed by value (not its address); NULL is ignored
+* \param 	pTag	not used by WelsFree
+*
+* \return	NONE
+*
+* \note	N/A
+*************************************************************************************
+*/
+void WelsFree (void* pPointer, str_t* pTag = NULL);
+
+/*!
+*************************************************************************************
+* \brief	reallocation in Wels. Do nothing and continue using old block
+*		in case the block is large enough currently
+*
+* \param 	pPointer	previously allocated memory block (may be NULL)
+* \param	pRealSize	in: current size of the block; out: new size if the block was reallocated
+* \param	kuiSize	new size of memory block requested
+*
+* \return	pointer to the memory block to use from now on, or NULL on failure
+*
+* \note	N/A
+*************************************************************************************
+*/
+void* WelsRealloc (void*  pPointer, uint32_t* pRealSize, const uint32_t kuiSize, str_t* pTag = NULL);
+
+//////////////////////////////////////////////////////////////////////////////////////
+WELSVP_NAMESPACE_END
+
+#endif
+
+
--- a/processing/src/common/resource.h
+++ b/processing/src/common/resource.h
@@ -1,15 +1,15 @@
-//{{NO_DEPENDENCIES}}
-// Microsoft Visual C++ generated include file.
-// Used by WelsVP.rc
-//
-
-// Next default values for new objects
-// 
-#ifdef APSTUDIO_INVOKED
-#ifndef APSTUDIO_READONLY_SYMBOLS
-#define _APS_NEXT_RESOURCE_VALUE        101
-#define _APS_NEXT_COMMAND_VALUE         40001
-#define _APS_NEXT_CONTROL_VALUE         1000
-#define _APS_NEXT_SYMED_VALUE           101
-#endif
-#endif
+//{{NO_DEPENDENCIES}}
+// Microsoft Visual C++ generated include file.
+// Used by WelsVP.rc
+//
+
+// Next default values for new objects
+//
+#ifdef APSTUDIO_INVOKED
+#ifndef APSTUDIO_READONLY_SYMBOLS
+#define _APS_NEXT_RESOURCE_VALUE        101
+#define _APS_NEXT_COMMAND_VALUE         40001
+#define _APS_NEXT_CONTROL_VALUE         1000
+#define _APS_NEXT_SYMED_VALUE           101
+#endif
+#endif
--- a/processing/src/common/thread.cpp
+++ b/processing/src/common/thread.cpp
@@ -1,101 +1,93 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	thread.cpp
- *
- * \brief	Interfaces introduced in thread programming
- *
- * \date	11/17/2009 Created
- *
- *************************************************************************************
- */
-
-#include "thread.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-#if defined(WIN32)
-
-WELS_THREAD_ERROR_CODE    WelsMutexInit( WELS_MUTEX   * mutex )
-{
-	InitializeCriticalSection(mutex);
-
-	return WELS_THREAD_ERROR_OK;
-}
-
-WELS_THREAD_ERROR_CODE    WelsMutexLock( WELS_MUTEX   * mutex )
-{
-	EnterCriticalSection(mutex);
-
-	return WELS_THREAD_ERROR_OK;
-}
-
-WELS_THREAD_ERROR_CODE    WelsMutexUnlock( WELS_MUTEX * mutex )
-{
-	LeaveCriticalSection(mutex);
-
-	return WELS_THREAD_ERROR_OK;
-}
-
-WELS_THREAD_ERROR_CODE    WelsMutexDestroy( WELS_MUTEX * mutex )
-{
-    DeleteCriticalSection(mutex);
-
-	return WELS_THREAD_ERROR_OK;
-}
-
-#elif  defined(__GNUC__)
-
-WELS_THREAD_ERROR_CODE    WelsMutexInit( WELS_MUTEX   * mutex )
-{
-	return pthread_mutex_init(mutex, NULL);
-}
-
-WELS_THREAD_ERROR_CODE    WelsMutexLock( WELS_MUTEX   * mutex )
-{
-	return pthread_mutex_lock(mutex);
-}
-
-WELS_THREAD_ERROR_CODE    WelsMutexUnlock( WELS_MUTEX * mutex )
-{
-	return pthread_mutex_unlock(mutex);
-}
-
-WELS_THREAD_ERROR_CODE    WelsMutexDestroy( WELS_MUTEX * mutex )
-{
-    return pthread_mutex_destroy(mutex);
-}
-
-#endif
-
-WELSVP_NAMESPACE_END
-
-
-
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	thread.cpp
+ *
+ * \brief	Interfaces introduced in thread programming
+ *
+ * \date	11/17/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#include "thread.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+#if defined(WIN32)
+
+WELS_THREAD_ERROR_CODE    WelsMutexInit (WELS_MUTEX*    mutex) {
+  InitializeCriticalSection (mutex);  // Win32 call returns no status
+
+  return WELS_THREAD_ERROR_OK;  // hence success is reported unconditionally
+}
+
+WELS_THREAD_ERROR_CODE    WelsMutexLock (WELS_MUTEX*    mutex) {
+  EnterCriticalSection (mutex);  // blocks until the critical section is owned; returns no status
+
+  return WELS_THREAD_ERROR_OK;  // hence success is reported unconditionally
+}
+
+WELS_THREAD_ERROR_CODE    WelsMutexUnlock (WELS_MUTEX* mutex) {
+  LeaveCriticalSection (mutex);  // Win32 call returns no status
+
+  return WELS_THREAD_ERROR_OK;  // hence success is reported unconditionally
+}
+
+WELS_THREAD_ERROR_CODE    WelsMutexDestroy (WELS_MUTEX* mutex) {
+  DeleteCriticalSection (mutex);  // Win32 call returns no status
+
+  return WELS_THREAD_ERROR_OK;  // hence success is reported unconditionally
+}
+
+#elif  defined(__GNUC__)
+
+WELS_THREAD_ERROR_CODE    WelsMutexInit (WELS_MUTEX*    mutex) {
+  return pthread_mutex_init (mutex, NULL);  // NULL attributes = default mutex; 0 on success, else an error number
+}
+
+WELS_THREAD_ERROR_CODE    WelsMutexLock (WELS_MUTEX*    mutex) {
+  return pthread_mutex_lock (mutex);  // pthread status is passed through: 0 on success, else an error number
+}
+
+WELS_THREAD_ERROR_CODE    WelsMutexUnlock (WELS_MUTEX* mutex) {
+  return pthread_mutex_unlock (mutex);  // pthread status is passed through: 0 on success, else an error number
+}
+
+WELS_THREAD_ERROR_CODE    WelsMutexDestroy (WELS_MUTEX* mutex) {
+  return pthread_mutex_destroy (mutex);  // pthread status is passed through: 0 on success, else an error number
+}
+
+#endif
+
+WELSVP_NAMESPACE_END
+
+
+
--- a/processing/src/common/thread.h
+++ b/processing/src/common/thread.h
@@ -1,89 +1,89 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	thread.h
- *
- * \brief	Interfaces introduced in thread programming
- *
- * \date	11/17/2009 Created 
- *
- *************************************************************************************
- */
-
-#ifndef _WELSVP_THREAD_H
-#define _WELSVP_THREAD_H
-
-#include "typedef.h"
-
-#if defined(WIN32)
-
-#include <windows.h>
-
-#elif defined(__GNUC__) 
-
-#include <stdlib.h>
-#include <unistd.h>
-#include <pthread.h>
-#include <semaphore.h>
-#include <signal.h>
-#include <errno.h>
-
-#endif//WIN32
-
-WELSVP_NAMESPACE_BEGIN
-
-#if defined(WIN32)
-
-typedef  HANDLE            WELS_THREAD_HANDLE;
-typedef  CRITICAL_SECTION  WELS_MUTEX;
-
-#elif defined(__GNUC__) 
-
-typedef   pthread_t         WELS_THREAD_HANDLE;
-typedef   pthread_mutex_t   WELS_MUTEX;
-
-#endif
-
-typedef long_t WELS_THREAD_ERROR_CODE;
-
-#define   WELS_THREAD_ERROR_OK					0
-#define   WELS_THREAD_ERROR_GENERIAL			((unsigned long)(-1))
-#define   WELS_THREAD_ERROR_WAIT_OBJECT_0		0
-#define	  WELS_THREAD_ERROR_WAIT_TIMEOUT		((unsigned long)0x00000102L)  
-#define	  WELS_THREAD_ERROR_WAIT_FAILED		    WELS_THREAD_ERROR_GENERIAL
-
-WELS_THREAD_ERROR_CODE   WelsMutexInit( WELS_MUTEX   * mutex );
-WELS_THREAD_ERROR_CODE   WelsMutexLock( WELS_MUTEX   * mutex );
-WELS_THREAD_ERROR_CODE   WelsMutexUnlock( WELS_MUTEX * mutex );
-WELS_THREAD_ERROR_CODE   WelsMutexDestroy( WELS_MUTEX * mutex );
-
-WELSVP_NAMESPACE_END
-
-#endif
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	thread.h
+ *
+ * \brief	Interfaces introduced in thread programming
+ *
+ * \date	11/17/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#ifndef _WELSVP_THREAD_H
+#define _WELSVP_THREAD_H
+
+#include "typedef.h"
+
+#if defined(WIN32)
+
+#include <windows.h>
+
+#elif defined(__GNUC__)
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <semaphore.h>
+#include <signal.h>
+#include <errno.h>
+
+#endif//WIN32
+
+WELSVP_NAMESPACE_BEGIN
+
+#if defined(WIN32)
+
+typedef  HANDLE            WELS_THREAD_HANDLE;
+typedef  CRITICAL_SECTION  WELS_MUTEX;
+
+#elif defined(__GNUC__)
+
+typedef   pthread_t         WELS_THREAD_HANDLE;
+typedef   pthread_mutex_t   WELS_MUTEX;
+
+#endif
+
+typedef long_t WELS_THREAD_ERROR_CODE;
+
+#define   WELS_THREAD_ERROR_OK					0
+#define   WELS_THREAD_ERROR_GENERIAL			((unsigned long)(-1))
+#define   WELS_THREAD_ERROR_WAIT_OBJECT_0		0
+#define	  WELS_THREAD_ERROR_WAIT_TIMEOUT		((unsigned long)0x00000102L)
+#define	  WELS_THREAD_ERROR_WAIT_FAILED		    WELS_THREAD_ERROR_GENERIAL
+
+WELS_THREAD_ERROR_CODE   WelsMutexInit (WELS_MUTEX*    mutex);
+WELS_THREAD_ERROR_CODE   WelsMutexLock (WELS_MUTEX*    mutex);
+WELS_THREAD_ERROR_CODE   WelsMutexUnlock (WELS_MUTEX* mutex);
+WELS_THREAD_ERROR_CODE   WelsMutexDestroy (WELS_MUTEX* mutex);
+
+WELSVP_NAMESPACE_END
+
+#endif
--- a/processing/src/common/typedef.h
+++ b/processing/src/common/typedef.h
@@ -1,102 +1,102 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	    :  typedef.h
- *
- * \brief	    :  basic type definition 
- *
- * \date        :  2011/01/04
- *
- * \description :  1. Define basic type with platform-independent;
- *                 2. Define specific namespace to avoid name pollution;
- *                 3. C++ ONLY;             
- *
- *************************************************************************************
- */
-
-#ifndef _WELSVP_TYPEDEF_H
-#define _WELSVP_TYPEDEF_H
-
-#define WELSVP_EXTERN_C_BEGIN       extern "C" {
-#define WELSVP_EXTERN_C_END         }
-
-#define WELSVP_NAMESPACE_BEGIN      namespace nsWelsVP {
-#define WELSVP_NAMESPACE_END        }
-
-WELSVP_NAMESPACE_BEGIN
-
-#if defined(WIN32) || defined(_WIN32) || defined(_MSC_VER)
-
-typedef char               int8_t   ;
-typedef unsigned char      uint8_t  ;
-typedef short              int16_t  ;
-typedef unsigned short     uint16_t ;
-typedef int                int32_t  ;
-typedef unsigned int       uint32_t ;
-typedef __int64            int64_t  ;
-typedef unsigned __int64   uint64_t ;
-#define inline_t           _inline
-
-#else	// GCC
-
-typedef signed char        int8_t   ; // [comment]: some compilers may identify the type "char" as "unsigned char" as default, so declare it explicit 
-typedef unsigned char      uint8_t  ;
-typedef signed short       int16_t  ;
-typedef unsigned short     uint16_t ;
-typedef signed int         int32_t  ;
-typedef unsigned int       uint32_t ;
-typedef long long          int64_t  ;
-typedef unsigned long long uint64_t ;
-#define inline_t           inline
-
-#endif 
-
-typedef char    str_t    ; // [comment]: specific use plain char only for character parameters
-typedef long    long_t   ;
-typedef int32_t bool_t   ;
-
-#if defined(WIN32) || defined(_MACH_PLATFORM) || defined(__GNUC__)
-typedef float   float_t  ;
-typedef double  double_t ; 
-#endif
-
-#ifndef NULL
-#define NULL    0
-#endif
-
-enum
-{
-   FALSE = 0,
-   TRUE  = !FALSE
-};
-
-WELSVP_NAMESPACE_END
-
-#endif
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	    :  typedef.h
+ *
+ * \brief	    :  basic type definition
+ *
+ * \date        :  2011/01/04
+ *
+ * \description :  1. Define basic type with platform-independent;
+ *                 2. Define specific namespace to avoid name pollution;
+ *                 3. C++ ONLY;
+ *
+ *************************************************************************************
+ */
+
+#ifndef _WELSVP_TYPEDEF_H
+#define _WELSVP_TYPEDEF_H
+
+#define WELSVP_EXTERN_C_BEGIN       extern "C" {
+#define WELSVP_EXTERN_C_END         }
+
+#define WELSVP_NAMESPACE_BEGIN      namespace nsWelsVP {
+#define WELSVP_NAMESPACE_END        }
+
+WELSVP_NAMESPACE_BEGIN
+
+#if defined(WIN32) || defined(_WIN32) || defined(_MSC_VER)
+
+typedef char               int8_t   ;
+typedef unsigned char      uint8_t  ;
+typedef short              int16_t  ;
+typedef unsigned short     uint16_t ;
+typedef int                int32_t  ;
+typedef unsigned int       uint32_t ;
+typedef __int64            int64_t  ;
+typedef unsigned __int64   uint64_t ;
+#define inline_t           _inline
+
+#else	// GCC
+
+// [comment]: plain "char" may be unsigned by default on some compilers, so the signedness is spelled out
+typedef signed char        int8_t   ;
+typedef unsigned char      uint8_t  ;
+typedef signed short       int16_t  ;
+typedef unsigned short     uint16_t ;
+typedef signed int         int32_t  ;
+typedef unsigned int       uint32_t ;
+typedef long long          int64_t  ;
+typedef unsigned long long uint64_t ;
+#define inline_t           inline
+
+#endif
+
+typedef char    str_t    ; // [comment]: specific use plain char only for character parameters
+typedef long    long_t   ;
+typedef int32_t bool_t   ;
+
+#if defined(WIN32) || defined(_MACH_PLATFORM) || defined(__GNUC__)
+typedef float   float_t  ;
+typedef double  double_t ;
+#endif
+
+#ifndef NULL
+#define NULL    0
+#endif
+
+enum {
+  FALSE = 0,
+  TRUE  = !FALSE
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- a/processing/src/common/util.cpp
+++ b/processing/src/common/util.cpp
@@ -1,46 +1,45 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "util.h"
-
-WELSVP_NAMESPACE_BEGIN
-/////////////////////////////////////////////////////////////////////////////////
-
-
-int32_t  WelsStrCmp(const str_t * kpStr1, const str_t * kpStr2)
-{
-	return ::strcmp(kpStr1, kpStr2);
-}
-
-
-/////////////////////////////////////////////////////////////////////////////////
-WELSVP_NAMESPACE_END
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "util.h"
+
+WELSVP_NAMESPACE_BEGIN
+/////////////////////////////////////////////////////////////////////////////////
+
+
+int32_t  WelsStrCmp (const str_t* kpStr1, const str_t* kpStr2) {
+  return ::strcmp (kpStr1, kpStr2);  // <0, 0 or >0 exactly as the global strcmp; both strings must be non-NULL
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////
+WELSVP_NAMESPACE_END
--- a/processing/src/common/util.h
+++ b/processing/src/common/util.h
@@ -1,108 +1,107 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	    :  util.h
- *
- * \brief	    :  utils for wels video processor class
- *
- * \date        :  2011/01/04
- *
- * \description :  
- *
- *************************************************************************************
- */
-
-#ifndef _WELSVP_UTIL_H
-#define _WELSVP_UTIL_H
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdarg.h>
-#include <assert.h>
-
-#include "typedef.h"
-#include "memory.h"
-#include "../../interface/IWelsVP.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-#define MAX_WIDTH      (4096)
-#define MAX_HEIGHT     (2304)//MAX_FS_LEVEL51 (36864); MAX_FS_LEVEL51*256/4096 = 2304
-#define MB_WIDTH_LUMA  (16)
-#define PESN		   (1e-6)	// desired float precision
-
-#define MB_TYPE_INTRA4x4		0x00000001
-#define MB_TYPE_INTRA16x16	0x00000002
-#define MB_TYPE_INTRA_PCM		0x00000004
-#define MB_TYPE_INTRA			  (MB_TYPE_INTRA4x4 | MB_TYPE_INTRA16x16 | MB_TYPE_INTRA_PCM)
-#define IS_INTRA(type) ((type)&MB_TYPE_INTRA)
-
-#define WELS_MAX(x, y)	((x) > (y) ? (x) : (y))
-#define WELS_MIN(x, y)	((x) < (y) ? (x) : (y))
-#define WELS_SIGN(a)	((long_t)(a) >> 31)
-#define WELS_ABS(a)		((WELS_SIGN(a) ^ (long_t)(a)) - WELS_SIGN(a))
-#define WELS_CLAMP(x, minv, maxv)  WELS_MIN(WELS_MAX(x, minv), maxv)
-
-#define ALIGNBYTES         (16)       /* Worst case is requiring alignment to an 16 byte boundary */
-#define WELS_ALIGN(iInput)   ((iInput+(ALIGNMENT-1)) & ~(ALIGNMENT-1))
-#define WELS_ALIGN2(iInput)  ((iInput+1) & ~1)
-#define WELS_ALIGN4(iInput)  ((iInput+3) & ~3)
-#define WELS_ALIGN8(iInput)  ((iInput+7) & ~7)
-
-#define WelsCastFromPointer(p)      (reinterpret_cast<long_t>(p))
-#define WelsStaticCast(type, p)  (static_cast<type>(p))
-#define WelsDynamicCast(type, p) (dynamic_cast<type>(p))
-
-#define GET_METHOD(x)  ((x) & 0xff)          // mask method as the lowest 8bits
-#define GET_SPECIAL(x) (((x) >> 8) & 0xff)   // mask special flag as 8bits
-
-inline_t EMethods WelsVpGetValidMethod(int32_t a)
-{
-   int32_t iMethod = GET_METHOD(a);
-   return WelsStaticCast(EMethods, WELS_CLAMP(iMethod, METHOD_NULL+1, METHOD_MASK-1));
-}
-
-
-#define _SafeFree(p)		if (p) { WelsFree(p); (p) = NULL; }
-#define _SafeDelete(p)		if (p) { delete (p); (p) = NULL; }
-
-
-//////////////////////////////////////////////////////////////////////////////////////
-
-int32_t   WelsStrCmp(const str_t * kpStr1, const str_t * kpStr2);
-
-
-//////////////////////////////////////////////////////////////////////////////////////
-WELSVP_NAMESPACE_END
-
-#endif
-
-
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	    :  util.h
+ *
+ * \brief	    :  utils for wels video processor class
+ *
+ * \date        :  2011/01/04
+ *
+ * \description :
+ *
+ *************************************************************************************
+ */
+
+#ifndef _WELSVP_UTIL_H
+#define _WELSVP_UTIL_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <assert.h>
+
+#include "typedef.h"
+#include "memory.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+#define MAX_WIDTH      (4096)
+#define MAX_HEIGHT     (2304)  // MAX_FS_LEVEL51 (36864); MAX_FS_LEVEL51*256/4096 = 2304
+#define MB_WIDTH_LUMA  (16)
+#define PESN           (1e-6)  // desired float precision
+// Macroblock type bits; IS_INTRA(type) is non-zero when any of the three intra bits is set.
+#define MB_TYPE_INTRA4x4    0x00000001
+#define MB_TYPE_INTRA16x16  0x00000002
+#define MB_TYPE_INTRA_PCM   0x00000004
+#define MB_TYPE_INTRA       (MB_TYPE_INTRA4x4 | MB_TYPE_INTRA16x16 | MB_TYPE_INTRA_PCM)
+#define IS_INTRA(type) ((type)&MB_TYPE_INTRA)
+// Arithmetic helpers. Arguments can be evaluated more than once: pass side-effect-free expressions only.
+#define WELS_MAX(x, y)  ((x) > (y) ? (x) : (y))
+#define WELS_MIN(x, y)  ((x) < (y) ? (x) : (y))
+#define WELS_SIGN(a)    ((long_t)(a) >> 31)  // NOTE(review): 0 / -1 only for an arithmetic shift of a 32-bit-range value
+#define WELS_ABS(a)     ((WELS_SIGN(a) ^ (long_t)(a)) - WELS_SIGN(a))
+#define WELS_CLAMP(x, minv, maxv)  WELS_MIN(WELS_MAX(x, minv), maxv)
+// Round-up helpers. The argument is parenthesized so that compound expressions such as (a | b) expand correctly.
+#define ALIGNBYTES         (16)       /* Worst case is requiring alignment to a 16 byte boundary */
+#define WELS_ALIGN(iInput)   (((iInput)+(ALIGNMENT-1)) & ~(ALIGNMENT-1))  // NOTE(review): ALIGNMENT is not defined in this header
+#define WELS_ALIGN2(iInput)  (((iInput)+1) & ~1)
+#define WELS_ALIGN4(iInput)  (((iInput)+3) & ~3)
+#define WELS_ALIGN8(iInput)  (((iInput)+7) & ~7)
+// Cast wrappers around the C++ named casts.
+#define WelsCastFromPointer(p)      (reinterpret_cast<long_t>(p))
+#define WelsStaticCast(type, p)  (static_cast<type>(p))
+#define WelsDynamicCast(type, p) (dynamic_cast<type>(p))
+// Field extractors for a packed method word: bits 0-7 and bits 8-15.
+#define GET_METHOD(x)  ((x) & 0xff)          // mask method as the lowest 8bits
+#define GET_SPECIAL(x) (((x) >> 8) & 0xff)   // mask special flag as 8bits
+
+// Keeps the low 8 bits of `a` (GET_METHOD) and clamps them to [METHOD_NULL + 1, METHOD_MASK - 1].
+inline_t EMethods WelsVpGetValidMethod (int32_t a) {
+  return WelsStaticCast (EMethods, WELS_CLAMP (GET_METHOD (a), METHOD_NULL + 1, METHOD_MASK - 1));
+}
+
+
+#define _SafeFree(p)		if (p) { WelsFree(p); (p) = NULL; }   // WelsFree p when non-NULL, then reset it to NULL
+#define _SafeDelete(p)		if (p) { delete (p); (p) = NULL; }   // same with delete; both are a bare `if`: mind a following `else`
+
+
+//////////////////////////////////////////////////////////////////////////////////////
+
+int32_t   WelsStrCmp (const str_t* kpStr1, const str_t* kpStr2);   // result has the sign convention of strcmp
+
+
+//////////////////////////////////////////////////////////////////////////////////////
+WELSVP_NAMESPACE_END
+
+#endif
+
+
--- a/processing/src/complexityanalysis/ComplexityAnalysis.cpp
+++ b/processing/src/complexityanalysis/ComplexityAnalysis.cpp
@@ -1,325 +1,304 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "ComplexityAnalysis.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-CComplexityAnalysis::CComplexityAnalysis(int32_t iCpuFlag)
-{
-	m_eMethod   = METHOD_COMPLEXITY_ANALYSIS;
-	m_pfGomSad   = NULL;
-	WelsMemset( &m_sComplexityAnalysisParam, 0, sizeof(m_sComplexityAnalysisParam) );
-}
-
-CComplexityAnalysis::~CComplexityAnalysis()
-{	
-}
-
-EResult CComplexityAnalysis::Process(int32_t iType, SPixMap *pSrcPixMap, SPixMap *pRefPixMap)
-{
-	EResult eReturn = RET_SUCCESS;	
-
-	switch (m_sComplexityAnalysisParam.iComplexityAnalysisMode)
-	{
-	case FRAME_SAD:
-		AnalyzeFrameComplexityViaSad( pSrcPixMap, pRefPixMap );
-		break;
-	case GOM_SAD:
-		AnalyzeGomComplexityViaSad( pSrcPixMap, pRefPixMap );
-		break;
-	case GOM_VAR:
-		AnalyzeGomComplexityViaVar( pSrcPixMap, pRefPixMap );
-		break;
-	default:
-		eReturn = RET_INVALIDPARAM;
-		break;
-	}	
-
-	return eReturn;
-}
-
-
-EResult CComplexityAnalysis::Set(int32_t iType, void *pParam)
-{
-	if (pParam == NULL)
-	{
-		return RET_INVALIDPARAM;
-	}
-
-	m_sComplexityAnalysisParam = *(SComplexityAnalysisParam *)pParam;
-
-	return RET_SUCCESS;
-}
-
-EResult CComplexityAnalysis::Get(int32_t iType, void *pParam)
-{
-	if (pParam == NULL)
-	{
-		return RET_INVALIDPARAM;
-	}
-
-	SComplexityAnalysisParam * sComplexityAnalysisParam = (SComplexityAnalysisParam *)pParam;
-
-	sComplexityAnalysisParam->iFrameComplexity = m_sComplexityAnalysisParam.iFrameComplexity;
-
-	return RET_SUCCESS;
-}
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-void CComplexityAnalysis::AnalyzeFrameComplexityViaSad( SPixMap *pSrcPixMap, SPixMap *pRefPixMap )
-{
-	SVAACalcResult     *pVaaCalcResults = NULL;
-	pVaaCalcResults = m_sComplexityAnalysisParam.pCalcResult;
-
-	m_sComplexityAnalysisParam.iFrameComplexity = pVaaCalcResults->iFrameSad;
-
-	if ( m_sComplexityAnalysisParam.iCalcBgd ) //BGD control
-	{
-		m_sComplexityAnalysisParam.iFrameComplexity = (int32_t)GetFrameSadExcludeBackground( pSrcPixMap, pRefPixMap );
-	}
-}
-
-int32_t CComplexityAnalysis::GetFrameSadExcludeBackground( SPixMap *pSrcPixMap, SPixMap *pRefPixMap )
-{
-	int32_t iWidth     = pSrcPixMap->sRect.iRectWidth;
-	int32_t iHeight    = pSrcPixMap->sRect.iRectHeight;	
-	int32_t iMbWidth  = iWidth  >> 4;
-	int32_t iMbHeight = iHeight >> 4;
-	int32_t iMbNum    = iMbWidth * iMbHeight;
-
-	int32_t iMbNumInGom = m_sComplexityAnalysisParam.iMbNumInGom;
-	int32_t iGomMbNum = (iMbNum + iMbNumInGom - 1 ) / iMbNumInGom;
-	int32_t iGomMbStartIndex = 0, iGomMbEndIndex = 0;
-
-	uint8_t *pBackgroundMbFlag = (uint8_t *)m_sComplexityAnalysisParam.pBackgroundMbFlag;
-	uint32_t*uiRefMbType = (uint32_t *)m_sComplexityAnalysisParam.uiRefMbType;
-	SVAACalcResult *pVaaCalcResults = m_sComplexityAnalysisParam.pCalcResult;
-	int32_t  *pGomForegroundBlockNum = m_sComplexityAnalysisParam.pGomForegroundBlockNum;
-
-	uint32_t uiFrameSad = 0;
-	for ( int32_t j = 0; j < iGomMbNum; j ++ )
-	{
-		iGomMbStartIndex = j * iMbNumInGom;
-		iGomMbEndIndex = WELS_MIN( (j + 1) * iMbNumInGom, iMbNum);
-
-		for ( int32_t i = iGomMbStartIndex; i < iGomMbEndIndex; i ++)
-		{	
-			if ( pBackgroundMbFlag[i] == 0 || IS_INTRA(uiRefMbType[i]) )
-			{
-				pGomForegroundBlockNum[j]++;
-				uiFrameSad += pVaaCalcResults->pSad8x8[i][0];
-				uiFrameSad += pVaaCalcResults->pSad8x8[i][1];
-				uiFrameSad += pVaaCalcResults->pSad8x8[i][2];
-				uiFrameSad += pVaaCalcResults->pSad8x8[i][3];
-			}
-		}
-	}
-
-	return (uiFrameSad);
-}
-
-
-void InitGomSadFunc(PGOMSadFunc &pfGomSad, uint8_t iCalcBgd)
-{
-	pfGomSad = GomSampleSad;
-
-	if ( iCalcBgd )
-	{
-		pfGomSad = GomSampleSadExceptBackground;
-	}
-}
-
-void GomSampleSad(uint32_t *pGomSad, int32_t *pGomForegroundBlockNum, int32_t *pSad8x8, uint8_t pBackgroundMbFlag)
-{
-  (*pGomForegroundBlockNum) ++;
-  *pGomSad += pSad8x8[0];
-  *pGomSad += pSad8x8[1];
-  *pGomSad += pSad8x8[2];
-  *pGomSad += pSad8x8[3];
-}
-
-void GomSampleSadExceptBackground(uint32_t *pGomSad, int32_t *pGomForegroundBlockNum, int32_t *pSad8x8, uint8_t pBackgroundMbFlag)
-{
-  if ( pBackgroundMbFlag == 0 )
-  {
-    (*pGomForegroundBlockNum) ++;
-    *pGomSad += pSad8x8[0];
-    *pGomSad += pSad8x8[1];
-    *pGomSad += pSad8x8[2];
-    *pGomSad += pSad8x8[3];
-  }
-}
-
-void CComplexityAnalysis::AnalyzeGomComplexityViaSad( SPixMap *pSrcPixMap, SPixMap *pRefPixMap )
-{
-	int32_t iWidth     = pSrcPixMap->sRect.iRectWidth;
-	int32_t iHeight    = pSrcPixMap->sRect.iRectHeight;	
-	int32_t iMbWidth  = iWidth  >> 4;
-	int32_t iMbHeight = iHeight >> 4;
-	int32_t iMbNum    = iMbWidth * iMbHeight;
-
-	int32_t iMbNumInGom = m_sComplexityAnalysisParam.iMbNumInGom;
-	int32_t iGomMbNum = (iMbNum + iMbNumInGom - 1 ) / iMbNumInGom;
-
-	int32_t iGomMbStartIndex = 0, iGomMbEndIndex = 0, iGomMbRowNum = 0;
-	int32_t iMbStartIndex = 0, iMbEndIndex = 0;
-	int32_t iStartSampleIndex = 0;
-
-	uint8_t *pBackgroundMbFlag = (uint8_t *)m_sComplexityAnalysisParam.pBackgroundMbFlag;
-	uint32_t*uiRefMbType = (uint32_t *)m_sComplexityAnalysisParam.uiRefMbType;
-	SVAACalcResult *pVaaCalcResults = m_sComplexityAnalysisParam.pCalcResult;
-	int32_t  *pGomForegroundBlockNum = (int32_t *)m_sComplexityAnalysisParam.pGomForegroundBlockNum;
-	int32_t  *pGomComplexity = (int32_t *)m_sComplexityAnalysisParam.pGomComplexity;
-
-	uint8_t *pRefY = NULL, *pSrcY = NULL;
-	int32_t iRefStride = 0, iCurStride = 0;
-
-	uint8_t *pRefTmp = NULL, *pCurTmp = NULL;
-	uint32_t uiGomSad = 0, uiFrameSad = 0;
-
-	pRefY = (uint8_t *)pRefPixMap->pPixel[0];
-	pSrcY = (uint8_t *)pSrcPixMap->pPixel[0];
-
-	iRefStride  = pRefPixMap->iStride[0];
-	iCurStride  = pSrcPixMap->iStride[0];
-
-	InitGomSadFunc( m_pfGomSad, m_sComplexityAnalysisParam.iCalcBgd );
-
-	for ( int32_t j = 0; j < iGomMbNum; j ++ )
-	{
-		uiGomSad = 0;
-
-		iGomMbStartIndex = j * iMbNumInGom;
-		iGomMbEndIndex = WELS_MIN( (j + 1) * iMbNumInGom, iMbNum);
-		iGomMbRowNum = (iGomMbEndIndex + iMbWidth - 1 ) / iMbWidth  - iGomMbStartIndex / iMbWidth;
-
-		iMbStartIndex = iGomMbStartIndex;
-		iMbEndIndex = WELS_MIN( (iMbStartIndex / iMbWidth + 1) * iMbWidth, iGomMbEndIndex);
-
-		iStartSampleIndex  = ( iMbStartIndex / iMbWidth ) * MB_WIDTH_LUMA * iRefStride + ( iMbStartIndex % iMbWidth ) * MB_WIDTH_LUMA;
-
-		do 
-		{   
-			pRefTmp = pRefY + iStartSampleIndex;
-			pCurTmp = pSrcY + iStartSampleIndex;
-
-			for ( int32_t i = iMbStartIndex; i < iMbEndIndex; i ++)
-			{
-				m_pfGomSad(&uiGomSad, pGomForegroundBlockNum + j, pVaaCalcResults->pSad8x8[i], pBackgroundMbFlag[i] && !IS_INTRA(uiRefMbType[i]) );
-			}
-
-			iMbStartIndex = iMbEndIndex;
-			iMbEndIndex = WELS_MIN( iMbEndIndex + iMbWidth , iGomMbEndIndex);
-
-			iStartSampleIndex  = ( iMbStartIndex / iMbWidth ) * MB_WIDTH_LUMA * iRefStride + ( iMbStartIndex % iMbWidth ) * MB_WIDTH_LUMA;
-
-		} while ( --iGomMbRowNum );
-
-		pGomComplexity[j] = uiGomSad;
-		uiFrameSad += pGomComplexity[j];
-	}
-
-	m_sComplexityAnalysisParam.iFrameComplexity = uiFrameSad;
-}
-
-
-void CComplexityAnalysis::AnalyzeGomComplexityViaVar( SPixMap *pSrcPixMap, SPixMap *pRefPixMap )
-{
-	int32_t iWidth     = pSrcPixMap->sRect.iRectWidth;
-	int32_t iHeight    = pSrcPixMap->sRect.iRectHeight;	
-	int32_t iMbWidth  = iWidth  >> 4;
-	int32_t iMbHeight = iHeight >> 4;
-	int32_t iMbNum    = iMbWidth * iMbHeight;
-
-	int32_t iMbNumInGom = m_sComplexityAnalysisParam.iMbNumInGom;
-	int32_t iGomMbNum = (iMbNum + iMbNumInGom - 1 ) / iMbNumInGom;
-	int32_t iGomSampleNum = 0;
-
-	int32_t iGomMbStartIndex = 0, iGomMbEndIndex = 0, iGomMbRowNum = 0;
-	int32_t iMbStartIndex = 0, iMbEndIndex = 0;
-	int32_t iStartSampleIndex = 0;
-
-	SVAACalcResult *pVaaCalcResults = m_sComplexityAnalysisParam.pCalcResult;
-	int32_t  *pGomComplexity = (int32_t *)m_sComplexityAnalysisParam.pGomComplexity;
-
-	uint8_t *pSrcY = NULL;
-	int32_t iCurStride = 0;
-
-	uint8_t *pCurTmp = NULL;
-	uint32_t uiSampleSum = 0, uiSquareSum = 0;
-
-	pSrcY = (uint8_t *)pSrcPixMap->pPixel[0];
-	iCurStride  = pSrcPixMap->iStride[0];
-
-	for ( int32_t j = 0; j < iGomMbNum; j ++ )
-	{
-		uiSampleSum = 0;
-		uiSquareSum = 0;
-
-		iGomMbStartIndex = j * iMbNumInGom;
-		iGomMbEndIndex = WELS_MIN( (j + 1) * iMbNumInGom, iMbNum);
-		iGomMbRowNum = (iGomMbEndIndex + iMbWidth - 1 ) / iMbWidth  - iGomMbStartIndex / iMbWidth;
-
-		iMbStartIndex = iGomMbStartIndex;
-		iMbEndIndex = WELS_MIN( (iMbStartIndex / iMbWidth + 1) * iMbWidth, iGomMbEndIndex);
-
-		iStartSampleIndex  = ( iMbStartIndex / iMbWidth ) * MB_WIDTH_LUMA * iCurStride + ( iMbStartIndex % iMbWidth ) * MB_WIDTH_LUMA;
-		iGomSampleNum = (iMbEndIndex - iMbStartIndex) * MB_WIDTH_LUMA * MB_WIDTH_LUMA;
-
-		do 
-		{
-			pCurTmp = pSrcY + iStartSampleIndex;
-
-			for ( int32_t i = iMbStartIndex; i < iMbEndIndex; i ++ )
-			{
-				uiSampleSum += pVaaCalcResults->pSum16x16[i];
-				uiSquareSum += pVaaCalcResults->pSumOfSquare16x16[i];
-			}
-
-			iMbStartIndex = iMbEndIndex;
-			iMbEndIndex = WELS_MIN( iMbEndIndex + iMbWidth, iGomMbEndIndex);
-
-			iStartSampleIndex  = ( iMbStartIndex / iMbWidth ) * MB_WIDTH_LUMA * iCurStride + ( iMbStartIndex % iMbWidth ) * MB_WIDTH_LUMA;
-		} while ( --iGomMbRowNum );
-	
-		pGomComplexity[j] = uiSquareSum - (uiSampleSum * uiSampleSum / iGomSampleNum);
-	}
-}
-
-
-WELSVP_NAMESPACE_END
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "ComplexityAnalysis.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+CComplexityAnalysis::CComplexityAnalysis (int32_t iCpuFlag) {   // iCpuFlag is not used
+  WelsMemset (&m_sComplexityAnalysisParam, 0, sizeof (m_sComplexityAnalysisParam));   // all settings start at zero
+  m_pfGomSad = NULL;                         // selected later by InitGomSadFunc()
+  m_eMethod  = METHOD_COMPLEXITY_ANALYSIS;
+}
+
+CComplexityAnalysis::~CComplexityAnalysis() {   // empty body: no cleanup is performed here
+}
+
+// Runs the analysis selected by m_sComplexityAnalysisParam.iComplexityAnalysisMode on the two pictures.
+// iType is not used. Returns RET_SUCCESS, or RET_INVALIDPARAM when the mode is none of the three known ones.
+EResult CComplexityAnalysis::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
+  switch (m_sComplexityAnalysisParam.iComplexityAnalysisMode) {
+  case FRAME_SAD:
+    AnalyzeFrameComplexityViaSad (pSrcPixMap, pRefPixMap);
+    return RET_SUCCESS;
+  case GOM_SAD:
+    AnalyzeGomComplexityViaSad (pSrcPixMap, pRefPixMap);
+    return RET_SUCCESS;
+  case GOM_VAR:
+    AnalyzeGomComplexityViaVar (pSrcPixMap, pRefPixMap);
+    return RET_SUCCESS;
+  default:
+    break;
+  }
+
+  // Unrecognised mode: no analysis was run.
+  return RET_INVALIDPARAM;
+}
+
+
+// Replaces the stored settings with a copy of *pParam (read as SComplexityAnalysisParam); NULL yields RET_INVALIDPARAM.
+EResult CComplexityAnalysis::Set (int32_t iType, void* pParam) {
+  SComplexityAnalysisParam* pNewParam = (SComplexityAnalysisParam*)pParam;
+  if (NULL == pNewParam) {
+    return RET_INVALIDPARAM;
+  }
+  m_sComplexityAnalysisParam = *pNewParam;
+  return RET_SUCCESS;
+}
+
+// Copies the stored iFrameComplexity into pParam->iFrameComplexity (pParam read as SComplexityAnalysisParam);
+// no other field is written and iType is not used. Returns RET_INVALIDPARAM for a NULL pParam.
+EResult CComplexityAnalysis::Get (int32_t iType, void* pParam) {
+  SComplexityAnalysisParam* pOutParam = (SComplexityAnalysisParam*)pParam;
+  if (NULL == pOutParam) {
+    return RET_INVALIDPARAM;
+  }
+
+  pOutParam->iFrameComplexity = m_sComplexityAnalysisParam.iFrameComplexity;
+  return RET_SUCCESS;
+}
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+// Frame complexity is pCalcResult->iFrameSad; when iCalcBgd is set it is replaced by the SAD returned by
+// GetFrameSadExcludeBackground().
+void CComplexityAnalysis::AnalyzeFrameComplexityViaSad (SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
+  // Default: the frame SAD already stored in the calculation results.
+  m_sComplexityAnalysisParam.iFrameComplexity = m_sComplexityAnalysisParam.pCalcResult->iFrameSad;
+
+  if (0 != m_sComplexityAnalysisParam.iCalcBgd) { //BGD control
+    m_sComplexityAnalysisParam.iFrameComplexity = (int32_t)GetFrameSadExcludeBackground (pSrcPixMap, pRefPixMap);
+  }
+}
+
+// Adds up the four 8x8 SADs of every macroblock that is not flagged as background or whose reference macroblock
+// type is intra, and increments pGomForegroundBlockNum[g] once for each such macroblock of GOM g.
+// Returns the total (accumulated as uint32_t, returned as int32_t). pRefPixMap is not read.
+int32_t CComplexityAnalysis::GetFrameSadExcludeBackground (SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
+  // Rectangle size in 16-pixel (>> 4) macroblock units.
+  const int32_t kiMbCols  = ((int32_t)pSrcPixMap->sRect.iRectWidth)  >> 4;
+  const int32_t kiMbRows  = ((int32_t)pSrcPixMap->sRect.iRectHeight) >> 4;
+  const int32_t kiMbTotal = kiMbCols * kiMbRows;
+
+  // GOMs are runs of kiGomSize consecutive macroblocks; the count is rounded up for a shorter last run.
+  const int32_t kiGomSize  = m_sComplexityAnalysisParam.iMbNumInGom;
+  const int32_t kiGomCount = (kiMbTotal + kiGomSize - 1) / kiGomSize;
+
+  uint8_t*        pBgdFlag  = (uint8_t*)m_sComplexityAnalysisParam.pBackgroundMbFlag;
+  uint32_t*       pRefType  = (uint32_t*)m_sComplexityAnalysisParam.uiRefMbType;
+  SVAACalcResult* pCalc     = m_sComplexityAnalysisParam.pCalcResult;
+  int32_t*        pFgdCount = m_sComplexityAnalysisParam.pGomForegroundBlockNum;
+
+  uint32_t uiSadTotal = 0;
+  for (int32_t iGom = 0; iGom < kiGomCount; ++iGom) {
+    const int32_t kiFirstMb = iGom * kiGomSize;
+    const int32_t kiPastMb  = WELS_MIN ((iGom + 1) * kiGomSize, kiMbTotal);
+    for (int32_t iMb = kiFirstMb; iMb < kiPastMb; ++iMb) {
+      if (pBgdFlag[iMb] != 0 && !IS_INTRA (pRefType[iMb])) {
+        continue; // background macroblock with a non-intra reference: left out
+      }
+      ++pFgdCount[iGom];
+      for (int32_t iBlk = 0; iBlk < 4; ++iBlk) {
+        uiSadTotal += pCalc->pSad8x8[iMb][iBlk];
+      }
+    }
+  }
+  return (uiSadTotal);
+}
+
+
+// Points pfGomSad at the per-macroblock SAD accumulator to use for this pass:
+//   iCalcBgd != 0 -> GomSampleSadExceptBackground
+//   iCalcBgd == 0 -> GomSampleSad
+// Only pfGomSad is modified.
+void InitGomSadFunc (PGOMSadFunc& pfGomSad, uint8_t iCalcBgd) {
+  pfGomSad = (0 != iCalcBgd) ? GomSampleSadExceptBackground : GomSampleSad;
+}
+
+// Unconditional accumulator: bumps *pGomForegroundBlockNum and adds the four SADs to *pGomSad (flag is ignored).
+void GomSampleSad (uint32_t* pGomSad, int32_t* pGomForegroundBlockNum, int32_t* pSad8x8, uint8_t pBackgroundMbFlag) {
+  ++ (*pGomForegroundBlockNum);
+  for (int32_t iBlk = 0; iBlk < 4; ++iBlk) {
+    *pGomSad += pSad8x8[iBlk];
+  }
+}
+
+// Accumulates like GomSampleSad, but only when pBackgroundMbFlag is 0; otherwise both outputs are left untouched.
+void GomSampleSadExceptBackground (uint32_t* pGomSad, int32_t* pGomForegroundBlockNum, int32_t* pSad8x8,
+                                   uint8_t pBackgroundMbFlag) {
+  if (0 == pBackgroundMbFlag) {
+    ++ (*pGomForegroundBlockNum);
+    for (int32_t iBlk = 0; iBlk < 4; ++iBlk) {
+      *pGomSad += pSad8x8[iBlk];
+    }
+  }
+}
+
+/*!
+ * \brief  GOM-level SAD analysis: fills pGomComplexity[] and stores their sum in iFrameComplexity.
+ *
+ * The macroblocks of the frame are split, in index order, into GOMs of iMbNumInGom macroblocks
+ * (the last GOM may be shorter). Each macroblock is handed to the routine chosen by
+ * InitGomSadFunc(), which adds its four 8x8 SADs to the GOM total and increments the GOM's
+ * foreground counter. With iCalcBgd set, a macroblock that is flagged as background and whose
+ * reference macroblock type is not intra is left out of both.
+ *
+ * Read from m_sComplexityAnalysisParam:
+ *   - iMbNumInGom, iCalcBgd
+ *   - pBackgroundMbFlag[], uiRefMbType[], pCalcResult->pSad8x8[]   (one entry per macroblock)
+ *
+ * Written through m_sComplexityAnalysisParam:
+ *   - pGomComplexity[g]          SAD total of GOM g
+ *   - pGomForegroundBlockNum[g]  incremented per counted macroblock (not cleared here)
+ *   - iFrameComplexity           sum of all pGomComplexity[g]
+ * The member m_pfGomSad is re-selected on every call.
+ *
+ * \param  pSrcPixMap  source picture; only sRect.iRectWidth / iRectHeight are read
+ * \param  pRefPixMap  reference picture; not read, since the SADs come precomputed from pCalcResult
+ *
+ * \note   Nothing is validated here: the parameter block's pointers must be usable and
+ *         iMbNumInGom must be non-zero because it is used as a divisor.
+ */
+void CComplexityAnalysis::AnalyzeGomComplexityViaSad (SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
+  // Rectangle size in 16-pixel (>> 4) macroblock units.
+  int32_t iWidth    = pSrcPixMap->sRect.iRectWidth;
+  int32_t iHeight   = pSrcPixMap->sRect.iRectHeight;
+  int32_t iMbWidth  = iWidth  >> 4;
+  int32_t iMbHeight = iHeight >> 4;
+  int32_t iMbNum    = iMbWidth * iMbHeight;
+
+  // Number of GOMs, rounded up so that a shorter trailing GOM is still analysed.
+  int32_t iMbNumInGom = m_sComplexityAnalysisParam.iMbNumInGom;
+  int32_t iGomMbNum   = (iMbNum + iMbNumInGom - 1) / iMbNumInGom;
+
+  // Per-macroblock inputs and per-GOM outputs, all taken from the stored parameter block.
+  uint8_t*        pBackgroundMbFlag      = (uint8_t*)m_sComplexityAnalysisParam.pBackgroundMbFlag;
+  uint32_t*       uiRefMbType            = (uint32_t*)m_sComplexityAnalysisParam.uiRefMbType;
+  SVAACalcResult* pVaaCalcResults        = m_sComplexityAnalysisParam.pCalcResult;
+  int32_t*        pGomForegroundBlockNum = (int32_t*)m_sComplexityAnalysisParam.pGomForegroundBlockNum;
+  int32_t*        pGomComplexity         = (int32_t*)m_sComplexityAnalysisParam.pGomComplexity;
+
+  uint32_t uiFrameSad = 0;
+
+  // Plain accumulation, or the variant that skips background macroblocks when iCalcBgd is set.
+  InitGomSadFunc (m_pfGomSad, m_sComplexityAnalysisParam.iCalcBgd);
+
+  for (int32_t j = 0; j < iGomMbNum; j ++) {
+    uint32_t uiGomSad = 0;
+
+    // Macroblock index range [start, end) of GOM j; the end is clipped to the frame for the last GOM.
+    int32_t iGomMbStartIndex = j * iMbNumInGom;
+    int32_t iGomMbEndIndex   = WELS_MIN ((j + 1) * iMbNumInGom, iMbNum);
+
+    // One linear pass is enough: every input table is indexed by macroblock number and no pixel
+    // data is read, so there is no row or stride bookkeeping to do.
+    for (int32_t i = iGomMbStartIndex; i < iGomMbEndIndex; i ++) {
+      // The flag passed on is non-zero only for a background macroblock with a non-intra reference.
+      m_pfGomSad (&uiGomSad, pGomForegroundBlockNum + j, pVaaCalcResults->pSad8x8[i],
+                  pBackgroundMbFlag[i] && !IS_INTRA (uiRefMbType[i]));
+    }
+
+    pGomComplexity[j] = uiGomSad;
+    uiFrameSad += pGomComplexity[j];
+  }
+
+  m_sComplexityAnalysisParam.iFrameComplexity = uiFrameSad;
+}
+
+
+/*!
+ * \brief  GOM-level variance analysis: fills pGomComplexity[] from the per-macroblock sums.
+ *
+ * The macroblocks of the frame are split, in index order, into GOMs of iMbNumInGom macroblocks
+ * (the last GOM may be shorter). For every GOM g the tables pCalcResult->pSum16x16[] and
+ * pCalcResult->pSumOfSquare16x16[] are added up over its macroblocks and combined into
+ *
+ *     pGomComplexity[g] = sum(x^2) - sum(x)^2 / N        (integer division)
+ *
+ * where N = (macroblocks in the GOM) * MB_WIDTH_LUMA * MB_WIDTH_LUMA is the number of samples
+ * behind the two sums, so the value is N times the variance of those samples.
+ *
+ * Read from m_sComplexityAnalysisParam: iMbNumInGom, pCalcResult. Written: pGomComplexity[] only;
+ * iFrameComplexity and pGomForegroundBlockNum[] are left untouched in this mode.
+ *
+ * \param  pSrcPixMap  source picture; only sRect.iRectWidth / iRectHeight are read
+ * \param  pRefPixMap  not read
+ *
+ * \note   Nothing is validated here: pCalcResult and pGomComplexity must be usable and
+ *         iMbNumInGom must be non-zero because it is used as a divisor.
+ * \note   NOTE(review): N assumes each pSum16x16[] entry covers one full 16x16 block - confirm.
+ */
+void CComplexityAnalysis::AnalyzeGomComplexityViaVar (SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
+  // Rectangle size in 16-pixel (>> 4) macroblock units.
+  int32_t iWidth    = pSrcPixMap->sRect.iRectWidth;
+  int32_t iHeight   = pSrcPixMap->sRect.iRectHeight;
+  int32_t iMbWidth  = iWidth  >> 4;
+  int32_t iMbHeight = iHeight >> 4;
+  int32_t iMbNum    = iMbWidth * iMbHeight;
+
+  // Number of GOMs, rounded up so that a shorter trailing GOM is still analysed.
+  int32_t iMbNumInGom = m_sComplexityAnalysisParam.iMbNumInGom;
+  int32_t iGomMbNum   = (iMbNum + iMbNumInGom - 1) / iMbNumInGom;
+
+  SVAACalcResult* pVaaCalcResults = m_sComplexityAnalysisParam.pCalcResult;
+  int32_t*        pGomComplexity  = (int32_t*)m_sComplexityAnalysisParam.pGomComplexity;
+
+  for (int32_t j = 0; j < iGomMbNum; j ++) {
+    // Macroblock index range [start, end) of GOM j; the end is clipped to the frame for the last GOM.
+    int32_t iGomMbStartIndex = j * iMbNumInGom;
+    int32_t iGomMbEndIndex   = WELS_MIN ((j + 1) * iMbNumInGom, iMbNum);
+
+    // N has to cover every macroblock summed below, i.e. the whole GOM and not only the part of
+    // it that lies in its first macroblock row.
+    int32_t iGomSampleNum = (iGomMbEndIndex - iGomMbStartIndex) * MB_WIDTH_LUMA * MB_WIDTH_LUMA;
+
+    // 64-bit accumulators: assuming 8-bit samples one macroblock can already sum to 255 * 256, so
+    // sum(x)^2 can leave the 32-bit range as soon as a GOM holds more than one macroblock.
+    uint64_t uiSampleSum = 0;
+    uint64_t uiSquareSum = 0;
+
+    for (int32_t i = iGomMbStartIndex; i < iGomMbEndIndex; i ++) {
+      uiSampleSum += pVaaCalcResults->pSum16x16[i];
+      uiSquareSum += pVaaCalcResults->pSumOfSquare16x16[i];
+    }
+
+    // Narrowed back to the int32_t element type of pGomComplexity[].
+    pGomComplexity[j] = (int32_t) (uiSquareSum - uiSampleSum * uiSampleSum / iGomSampleNum);
+  }
+}
+
+
+WELSVP_NAMESPACE_END
--- a/processing/src/complexityanalysis/ComplexityAnalysis.h
+++ b/processing/src/complexityanalysis/ComplexityAnalysis.h
@@ -1,83 +1,83 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
-* \file	        :  ComplexityAnalysis.h
-*
-* \brief	    :  complexity analysis class of wels video processor class
-*
-* \date         :  2011/03/28
-*
-* \description  :  1. rewrite the package code of complexity analysis class  
-*
-*************************************************************************************
-*/
-
-#ifndef _WELSVP_COMPLEXITYANALYSIS_H
-#define _WELSVP_COMPLEXITYANALYSIS_H
-
-#include "../common/util.h"
-#include "../common/memory.h"
-#include "../common/WelsFrameWork.h"
-#include "../../interface/IWelsVP.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-typedef  void (GOMSadFunc) (uint32_t *pGomSad, int32_t *pGomForegroundBlockNum, int32_t *pSad8x8, uint8_t pBackgroundMbFlag);
-
-typedef GOMSadFunc  * PGOMSadFunc;
-
-GOMSadFunc      GomSampleSad;
-GOMSadFunc      GomSampleSadExceptBackground;
-
-class CComplexityAnalysis : public IStrategy
-{			  
-public:
-	CComplexityAnalysis(int32_t iCpuFlag);
-	~CComplexityAnalysis();
-
-	EResult Process(int32_t iType, SPixMap *pSrc, SPixMap *pRef);
-	EResult Set(int32_t iType, void *pParam);
-	EResult Get(int32_t iType, void *pParam);
-
-private:
-	void AnalyzeFrameComplexityViaSad(SPixMap *pSrc, SPixMap *pRef);
-	int32_t GetFrameSadExcludeBackground( SPixMap *pSrc, SPixMap *pRef );
-
-	void AnalyzeGomComplexityViaSad(SPixMap *pSrc, SPixMap *pRef);
-	void AnalyzeGomComplexityViaVar(SPixMap *pSrc, SPixMap *pRef);
-
-private:
-	PGOMSadFunc m_pfGomSad;
-	SComplexityAnalysisParam m_sComplexityAnalysisParam;
-};	
-
-WELSVP_NAMESPACE_END
-
-#endif
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+* \file	        :  ComplexityAnalysis.h
+*
+* \brief	    :  complexity analysis class of wels video processor class
+*
+* \date         :  2011/03/28
+*
+* \description  :  1. rewrite the package code of complexity analysis class
+*
+*************************************************************************************
+*/
+
+#ifndef _WELSVP_COMPLEXITYANALYSIS_H
+#define _WELSVP_COMPLEXITYANALYSIS_H
+
+#include "../common/util.h"
+#include "../common/memory.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+typedef  void (GOMSadFunc) (uint32_t* pGomSad, int32_t* pGomForegroundBlockNum, int32_t* pSad8x8,
+                            uint8_t pBackgroundMbFlag);
+
+typedef GOMSadFunc*   PGOMSadFunc;
+
+GOMSadFunc      GomSampleSad;
+GOMSadFunc      GomSampleSadExceptBackground;
+
+class CComplexityAnalysis : public IStrategy {  // complexity analysis strategy of the video processor
+ public:
+  CComplexityAnalysis (int32_t iCpuFlag);
+  ~CComplexityAnalysis();
+
+  EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pRef);
+  EResult Set (int32_t iType, void* pParam);  // NOTE(review): presumably stores *pParam into m_sComplexityAnalysisParam - confirm
+  EResult Get (int32_t iType, void* pParam);  // NOTE(review): presumably the read-back counterpart of Set - confirm
+
+ private:
+  void AnalyzeFrameComplexityViaSad (SPixMap* pSrc, SPixMap* pRef);
+  int32_t GetFrameSadExcludeBackground (SPixMap* pSrc, SPixMap* pRef);
+
+  void AnalyzeGomComplexityViaSad (SPixMap* pSrc, SPixMap* pRef);
+  void AnalyzeGomComplexityViaVar (SPixMap* pSrc, SPixMap* pRef);
+
+ private:
+  PGOMSadFunc m_pfGomSad;  // routine with the GOMSadFunc signature; presumably one of the two GomSampleSad* functions
+  SComplexityAnalysisParam m_sComplexityAnalysisParam;  // parameter block (holds pCalcResult and pGomComplexity, among others)
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- a/processing/src/denoise/denoise.cpp
+++ b/processing/src/denoise/denoise.cpp
@@ -1,138 +1,124 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "denoise.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-#define CALC_BI_STRIDE(iWidth, iBitcount)  ((((iWidth) * (iBitcount) + 31) & ~31) >> 3)
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-CDenoiser::CDenoiser(int32_t iCpuFlag)
-{
-	m_CPUFlag = iCpuFlag;
-	m_eMethod   = METHOD_DENOISE;
-	WelsMemset(&m_pfDenoise, 0, sizeof(m_pfDenoise));
-
-	m_uiSpaceRadius = DENOISE_GRAY_RADIUS;
-	m_fSigmaGrey  = DENOISE_GRAY_SIGMA;
-	m_uiType		 = DENOISE_ALL_COMPONENT;
-	InitDenoiseFunc(m_pfDenoise, m_CPUFlag);
-}
-
-CDenoiser::~CDenoiser()
-{	
-}
-
-void CDenoiser::InitDenoiseFunc(SDenoiseFuncs &denoiser,  int32_t iCpuFlag)
-{
-		denoiser.pfBilateralLumaFilter8 = BilateralLumaFilter8_c;
-		denoiser.pfWaverageChromaFilter8 = WaverageChromaFilter8_c;
-#if defined(X86_ASM)
-	if ( iCpuFlag & WELS_CPU_SSE2 )
-	{
-		denoiser.pfBilateralLumaFilter8 = BilateralLumaFilter8_sse2;
-		denoiser.pfWaverageChromaFilter8 = WaverageChromaFilter8_sse2;	
-	}
-#endif
-}
-
-EResult CDenoiser::Process(int32_t iType, SPixMap *pSrc, SPixMap *dst)
-{
-	uint8_t *pSrcY = (uint8_t *)pSrc->pPixel[0];
-	uint8_t *pSrcU = (uint8_t *)pSrc->pPixel[1];
-	uint8_t *pSrcV = (uint8_t *)pSrc->pPixel[2];
-	if (pSrcY == NULL || pSrcU == NULL || pSrcV == NULL)
-	{
-		return RET_INVALIDPARAM;
-	}
-
-	int32_t iWidthY = pSrc->sRect.iRectWidth;
-	int32_t iHeightY = pSrc->sRect.iRectHeight;
-	int32_t iWidthUV = iWidthY >> 1;
-	int32_t iHeightUV = iHeightY >> 1;
-
-	if(m_uiType & DENOISE_Y_COMPONENT)
-		BilateralDenoiseLuma(pSrcY, iWidthY, iHeightY, pSrc->iStride[0]);
-
-	if(m_uiType & DENOISE_U_COMPONENT)
-		WaverageDenoiseChroma(pSrcU, iWidthUV, iHeightUV, pSrc->iStride[1]);
-
-	if(m_uiType & DENOISE_V_COMPONENT)
-		WaverageDenoiseChroma(pSrcV, iWidthUV, iHeightUV, pSrc->iStride[2]);
-
-	return RET_SUCCESS;
-}
-
-void CDenoiser::BilateralDenoiseLuma(uint8_t * pSrcY, int32_t iWidth, int32_t iHeight, int32_t iStride)
-{
-	int32_t w;
-
-	pSrcY = pSrcY + m_uiSpaceRadius * iStride;
-	for(int32_t h = m_uiSpaceRadius;h < iHeight - m_uiSpaceRadius; h++)
-	{
-		for(w = m_uiSpaceRadius; w < iWidth - m_uiSpaceRadius - TAIL_OF_LINE8; w+=8)
-		{	
-			m_pfDenoise.pfBilateralLumaFilter8(pSrcY + w, iStride);
-		}
-		for(w = w + TAIL_OF_LINE8; w < iWidth - m_uiSpaceRadius; w++)
-		{
-			Gauss3x3Filter(pSrcY + w, iStride);
-		}
-		pSrcY += iStride;
-	}
-}
-
-void CDenoiser::WaverageDenoiseChroma(uint8_t *pSrcUV, int32_t iWidth, int32_t iHeight, int32_t iStride)
-{
-	int32_t w;
-
-	pSrcUV = pSrcUV + UV_WINDOWS_RADIUS * iStride;
-	for(int32_t h = UV_WINDOWS_RADIUS; h < iHeight - UV_WINDOWS_RADIUS; h++)
-	{
-		for( w = UV_WINDOWS_RADIUS; w < iWidth - UV_WINDOWS_RADIUS - TAIL_OF_LINE8; w+=8)
-		{
-			m_pfDenoise.pfWaverageChromaFilter8(pSrcUV + w, iStride);		
-		}
-
-		for(w = w + TAIL_OF_LINE8; w < iWidth - UV_WINDOWS_RADIUS; w++)
-		{
-			Gauss3x3Filter(pSrcUV + w,iStride);
-		}
-		pSrcUV += iStride;
-	}
-}
-
-
-WELSVP_NAMESPACE_END
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "denoise.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+#define CALC_BI_STRIDE(iWidth, iBitcount)  ((((iWidth) * (iBitcount) + 31) & ~31) >> 3)
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+CDenoiser::CDenoiser (int32_t iCpuFlag) {  // iCpuFlag: CPU feature bits, later tested against WELS_CPU_SSE2
+  m_CPUFlag = iCpuFlag;
+  m_eMethod   = METHOD_DENOISE;
+  WelsMemset (&m_pfDenoise, 0, sizeof (m_pfDenoise));  // clear the function table before InitDenoiseFunc fills it
+  // NOTE(review): m_uiFilterWindow and m_pGreyWeightTable (declared in denoise.h) are not assigned here - confirm unused
+  m_uiSpaceRadius = DENOISE_GRAY_RADIUS;
+  m_fSigmaGrey  = DENOISE_GRAY_SIGMA;
+  m_uiType		 = DENOISE_ALL_COMPONENT;  // Y | U | V: denoise all three planes by default
+  InitDenoiseFunc (m_pfDenoise, m_CPUFlag);
+}
+
+CDenoiser::~CDenoiser() {  // intentionally empty: the constructor allocates nothing that needs releasing
+}
+
+void CDenoiser::InitDenoiseFunc (SDenoiseFuncs& denoiser,  int32_t iCpuFlag) {  // fills the filter function table
+  denoiser.pfBilateralLumaFilter8 = BilateralLumaFilter8_c;  // portable C defaults, always installed first
+  denoiser.pfWaverageChromaFilter8 = WaverageChromaFilter8_c;
+#if defined(X86_ASM)
+  if (iCpuFlag & WELS_CPU_SSE2) {  // replace the defaults with the SSE2 routines when the CPU flag is set
+    denoiser.pfBilateralLumaFilter8 = BilateralLumaFilter8_sse2;
+    denoiser.pfWaverageChromaFilter8 = WaverageChromaFilter8_sse2;
+  }
+#endif
+}
+
+EResult CDenoiser::Process (int32_t iType, SPixMap* pSrc, SPixMap* dst) {  // denoises the planes of pSrc
+  uint8_t* pSrcY = (uint8_t*)pSrc->pPixel[0];  // NOTE(review): pSrc itself is dereferenced without a NULL check
+  uint8_t* pSrcU = (uint8_t*)pSrc->pPixel[1];
+  uint8_t* pSrcV = (uint8_t*)pSrc->pPixel[2];
+  if (pSrcY == NULL || pSrcU == NULL || pSrcV == NULL) {  // all three planes are required
+    return RET_INVALIDPARAM;
+  }
+
+  int32_t iWidthY = pSrc->sRect.iRectWidth;
+  int32_t iHeightY = pSrc->sRect.iRectHeight;
+  int32_t iWidthUV = iWidthY >> 1;  // chroma planes are treated as half the luma width and height
+  int32_t iHeightUV = iHeightY >> 1;
+
+  if (m_uiType & DENOISE_Y_COMPONENT)  // m_uiType is a bit mask selecting which planes to filter
+    BilateralDenoiseLuma (pSrcY, iWidthY, iHeightY, pSrc->iStride[0]);
+
+  if (m_uiType & DENOISE_U_COMPONENT)
+    WaverageDenoiseChroma (pSrcU, iWidthUV, iHeightUV, pSrc->iStride[1]);
+
+  if (m_uiType & DENOISE_V_COMPONENT)
+    WaverageDenoiseChroma (pSrcV, iWidthUV, iHeightUV, pSrc->iStride[2]);
+
+  return RET_SUCCESS;  // iType and dst are not used by this body
+}
+
+void CDenoiser::BilateralDenoiseLuma (uint8_t* pSrcY, int32_t iWidth, int32_t iHeight, int32_t iStride) {
+  int32_t w;
+  // Filters the luma plane through pSrcY; a border of m_uiSpaceRadius rows/columns on each side is left untouched.
+  pSrcY = pSrcY + m_uiSpaceRadius * iStride;
+  for (int32_t h = m_uiSpaceRadius; h < iHeight - m_uiSpaceRadius; h++) {
+    for (w = m_uiSpaceRadius; w < iWidth - m_uiSpaceRadius - TAIL_OF_LINE8; w += 8) {
+      m_pfDenoise.pfBilateralLumaFilter8 (pSrcY + w, iStride);  // handles 8 samples per call
+    }
+    for (; w < iWidth - m_uiSpaceRadius; w++) {  // tail: resume at w itself, any extra offset would skip the whole tail
+      Gauss3x3Filter (pSrcY + w, iStride);
+    }
+    pSrcY += iStride;
+  }
+}
+
+void CDenoiser::WaverageDenoiseChroma (uint8_t* pSrcUV, int32_t iWidth, int32_t iHeight, int32_t iStride) {
+  int32_t w;
+  // Filters one chroma plane through pSrcUV; a border of UV_WINDOWS_RADIUS rows/columns per side is left untouched.
+  pSrcUV = pSrcUV + UV_WINDOWS_RADIUS * iStride;
+  for (int32_t h = UV_WINDOWS_RADIUS; h < iHeight - UV_WINDOWS_RADIUS; h++) {
+    for (w = UV_WINDOWS_RADIUS; w < iWidth - UV_WINDOWS_RADIUS - TAIL_OF_LINE8; w += 8) {
+      m_pfDenoise.pfWaverageChromaFilter8 (pSrcUV + w, iStride);  // handles 8 samples per call
+    }
+
+    for (; w < iWidth - UV_WINDOWS_RADIUS; w++) {  // tail: resume at w itself, any extra offset would skip the whole tail
+      Gauss3x3Filter (pSrcUV + w, iStride);
+    }
+    pSrcUV += iStride;
+  }
+}
+
+
+WELSVP_NAMESPACE_END
--- a/processing/src/denoise/denoise.h
+++ b/processing/src/denoise/denoise.h
@@ -1,113 +1,111 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	    :  denoise.h
- *
- * \brief	    :  denoise class of wels video processor class
- *
- * \date        :  2011/03/15
- *
- * \description :  1. rewrite the package code of denoise class  
- *
- *************************************************************************************
- */
-
-#ifndef _WELSVP_DENOISE_H
-#define _WELSVP_DENOISE_H
-
-#include "../common/util.h"
-#include "../common/memory.h"
-#include "../common/WelsFrameWork.h"
-#include "../../interface/IWelsVP.h"
-
-
-#define DENOISE_GRAY_RADIUS (1)
-#define DENOISE_GRAY_SIGMA  (2)
-
-#define UV_WINDOWS_RADIUS   (2)
-#define TAIL_OF_LINE8		(7)
-
-#define DENOISE_Y_COMPONENT (1)
-#define DENOISE_U_COMPONENT (2)
-#define DENOISE_V_COMPONENT (4)
-#define DENOISE_ALL_COMPONENT (7)
-
-
-WELSVP_NAMESPACE_BEGIN
-
-void Gauss3x3Filter(uint8_t *pixels, int32_t stride);
-
-typedef void (DenoiseFilterFunc)(uint8_t *pixels, int32_t stride);
-
-typedef DenoiseFilterFunc *DenoiseFilterFuncPtr;
-
-DenoiseFilterFunc     BilateralLumaFilter8_c;
-DenoiseFilterFunc     WaverageChromaFilter8_c;
-
-#ifdef X86_ASM
-WELSVP_EXTERN_C_BEGIN
-	DenoiseFilterFunc     BilateralLumaFilter8_sse2 ;
-	DenoiseFilterFunc     WaverageChromaFilter8_sse2 ;
-WELSVP_EXTERN_C_END
-#endif
-
-typedef  struct TagDenoiseFuncs 
-{
-	DenoiseFilterFuncPtr	pfBilateralLumaFilter8;//on 8 samples
-	DenoiseFilterFuncPtr	pfWaverageChromaFilter8;//on 8 samples
-} SDenoiseFuncs;
-
-class CDenoiser : public IStrategy
-{			  
-public:
-	CDenoiser(int32_t iCpuFlag);
-	~CDenoiser();
-
-	EResult Process(int32_t iType, SPixMap *pSrc, SPixMap *dst);
-
-private:
-	void InitDenoiseFunc(SDenoiseFuncs &pf, int32_t cpu);
-	void BilateralDenoiseLuma(uint8_t * p_y_data, int32_t width, int32_t height, int32_t stride);
-	void WaverageDenoiseChroma(uint8_t *pSrcUV, int32_t width, int32_t height, int32_t stride);
-
-private:
-	float_t	 m_fSigmaGrey;			//sigma for grey scale similarity, suggestion 2.5-3
-	uint32_t  m_uiFilterWindow;				//filter window diameter
-	uint16_t	 m_uiSpaceRadius;			//filter windows radius: 1-3x3, 2-5x5,3-7x7. Larger size, slower speed
-	uint16_t	 m_uiType;					//do denoising on which component 1-Y, 2-U, 4-V; 7-YUV, 3-YU, 5-YV, 6-UV
-	uint32_t  *m_pGreyWeightTable;		//weight table for grey scale
-
-	SDenoiseFuncs m_pfDenoise;
-	int32_t      m_CPUFlag;
-};	
-
-WELSVP_NAMESPACE_END
-
-#endif
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	    :  denoise.h
+ *
+ * \brief	    :  denoise class of wels video processor class
+ *
+ * \date        :  2011/03/15
+ *
+ * \description :  1. rewrite the package code of denoise class
+ *
+ *************************************************************************************
+ */
+
+#ifndef _WELSVP_DENOISE_H
+#define _WELSVP_DENOISE_H
+
+#include "../common/util.h"
+#include "../common/memory.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+
+#define DENOISE_GRAY_RADIUS (1)
+#define DENOISE_GRAY_SIGMA  (2)
+
+#define UV_WINDOWS_RADIUS   (2)
+#define TAIL_OF_LINE8		(7)
+
+#define DENOISE_Y_COMPONENT (1)
+#define DENOISE_U_COMPONENT (2)
+#define DENOISE_V_COMPONENT (4)
+#define DENOISE_ALL_COMPONENT (7)
+
+
+WELSVP_NAMESPACE_BEGIN
+
+void Gauss3x3Filter (uint8_t* pixels, int32_t stride);
+
+typedef void (DenoiseFilterFunc) (uint8_t* pixels, int32_t stride);
+
+typedef DenoiseFilterFunc* DenoiseFilterFuncPtr;
+
+DenoiseFilterFunc     BilateralLumaFilter8_c;
+DenoiseFilterFunc     WaverageChromaFilter8_c;
+
+#ifdef X86_ASM
+WELSVP_EXTERN_C_BEGIN
+DenoiseFilterFunc     BilateralLumaFilter8_sse2 ;
+DenoiseFilterFunc     WaverageChromaFilter8_sse2 ;
+WELSVP_EXTERN_C_END
+#endif
+
+typedef  struct TagDenoiseFuncs {  // filter function table, filled by CDenoiser::InitDenoiseFunc
+  DenoiseFilterFuncPtr	pfBilateralLumaFilter8;  // luma filter, works on 8 samples per call
+  DenoiseFilterFuncPtr	pfWaverageChromaFilter8;  // chroma filter, works on 8 samples per call
+} SDenoiseFuncs;
+
+class CDenoiser : public IStrategy {  // denoise strategy of the video processor
+ public:
+  CDenoiser (int32_t iCpuFlag);
+  ~CDenoiser();
+
+  EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* dst);
+
+ private:
+  void InitDenoiseFunc (SDenoiseFuncs& pf, int32_t cpu);
+  void BilateralDenoiseLuma (uint8_t* p_y_data, int32_t width, int32_t height, int32_t stride);
+  void WaverageDenoiseChroma (uint8_t* pSrcUV, int32_t width, int32_t height, int32_t stride);
+
+ private:
+  float_t	 m_fSigmaGrey;  // sigma for grey scale similarity, suggestion 2.5-3
+  uint32_t  m_uiFilterWindow;  // filter window diameter; NOTE(review): appears unused and unset by the constructor - confirm
+  uint16_t	 m_uiSpaceRadius;  // filter window radius: 1-3x3, 2-5x5, 3-7x7. Larger size, slower speed
+  uint16_t	 m_uiType;  // bit mask of components to denoise: 1-Y, 2-U, 4-V; 7-YUV, 3-YU, 5-YV, 6-UV
+  uint32_t*  m_pGreyWeightTable;  // weight table for grey scale; NOTE(review): appears unused and unset - confirm
+
+  SDenoiseFuncs m_pfDenoise;  // filter routines chosen from the CPU flags
+  int32_t      m_CPUFlag;  // CPU feature bits passed to the constructor
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- a/processing/src/denoise/denoise_filter.cpp
+++ b/processing/src/denoise/denoise_filter.cpp
@@ -1,134 +1,127 @@
-/*!
- * \copy
- *     Copyright (c)  2010-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	svc_preprocess.h
- *
- * \brief	svc denoising
- *
- * \date	4/1/2010 Created
- *
- */
-
-#include "denoise.h"
-#include "../common/typedef.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-void	BilateralLumaFilter8_c(uint8_t *pSample, int32_t iStride)
-{
-	int32_t nSum = 0, nTotWeight = 0;
-	int32_t iCenterSample = *pSample;
-	uint8_t * pCurLine = pSample- iStride - DENOISE_GRAY_RADIUS;
-	int32_t x, y;
-	int32_t iCurSample, iCurWeight, iGreyDiff;
-	uint8_t aSample[8];
-
-	for(int32_t i = 0; i < 8; i++)
-	{		
-		nSum = 0;
-		nTotWeight = 0;
-		iCenterSample = *pSample;
-		pCurLine = pSample- iStride - DENOISE_GRAY_RADIUS;
-		for (y = 0; y < 3; y++)
-		{
-			for (x = 0; x < 3; x++) 
-			{
-				if(x == 1 && y == 1) continue;			// except center point
-				iCurSample = pCurLine[x];
-				iCurWeight = WELS_ABS(iCurSample - iCenterSample);
-				iGreyDiff = 32 - iCurWeight;				
-				if(iGreyDiff < 0)	continue; 
-				else iCurWeight = (iGreyDiff * iGreyDiff) >> 5;
-				nSum += iCurSample * iCurWeight;
-				nTotWeight +=  iCurWeight;
-			}
-			pCurLine += iStride;
-		}
-		nTotWeight = 256 - nTotWeight;
-		nSum += iCenterSample * nTotWeight;
-		aSample[i] = nSum >> 8;
-		pSample++;
-	}
-	WelsMemcpy(pSample - 8, aSample, 8);
-}
-
-
-/***************************************************************************
-5x5 filter:
-1	1	2	1	1
-1	2	4	2	1
-2	4	20	4	2
-1	2	4	2	1
-1	1	2	1	1
-***************************************************************************/
-#define SUM_LINE1(pSample)	(pSample[0] +(pSample[1]) +(pSample[2]<<1)  + pSample[3] + pSample[4])
-#define SUM_LINE2(pSample)	(pSample[0] +(pSample[1]<<1) +(pSample[2]<<2)  +(pSample[3]<<1) +pSample[4])
-#define SUM_LINE3(pSample)	((pSample[0]<<1) +(pSample[1]<<2) +(pSample[2]*20)  +(pSample[3]<<2) +(pSample[4]<<1))
-void	WaverageChromaFilter8_c(uint8_t *pSample, int32_t iStride)
-{
-	int32_t sum;
-	uint8_t * pStartPixels = pSample- UV_WINDOWS_RADIUS * iStride - UV_WINDOWS_RADIUS;
-	uint8_t * pCurLine1 = pStartPixels;
-	uint8_t * pCurLine2 = pCurLine1 + iStride;
-	uint8_t * pCurLine3 = pCurLine2 + iStride;
-	uint8_t * pCurLine4 = pCurLine3 + iStride;
-	uint8_t * pCurLine5 = pCurLine4 + iStride;
-	uint8_t aSample[8];
-
-	for(int32_t i = 0; i < 8; i++)
-	{
-		sum = SUM_LINE1((pCurLine1+i)) + SUM_LINE2((pCurLine2+i)) + SUM_LINE3((pCurLine3+i)) 
-			+ SUM_LINE2((pCurLine4+i)) + SUM_LINE1((pCurLine5+i));
-		aSample[i] =  (sum >>6);
-		pSample++;
-	}
-	WelsMemcpy(pSample - 8, aSample, 8);
-}
-
-/***************************************************************************
-edge of y/uv use a 3x3 Gauss filter, radius = 1:
-1	2	1
-2	4	2	
-1	2	1
-***************************************************************************/
-void	Gauss3x3Filter(uint8_t *pSrc, int32_t iStride)
-{
-	int32_t nSum = 0;
-	uint8_t * pCurLine1 = pSrc - iStride - 1;		
-	uint8_t * pCurLine2 = pCurLine1 + iStride;
-	uint8_t * pCurLine3 = pCurLine2 + iStride;
-
-	nSum =	 pCurLine1[0]		+ (pCurLine1[1]<<1) +  pCurLine1[2]		+ 
-			(pCurLine2[0]<<1)	+ (pCurLine2[1]<<2) + (pCurLine2[2]<<1) + 
-			 pCurLine3[0]		+ (pCurLine3[1]<<1) +  pCurLine3[2]; 
-	*pSrc = nSum >> 4;
-}
-
-WELSVP_NAMESPACE_END
+/*!
+ * \copy
+ *     Copyright (c)  2010-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	denoise_filter.cpp
+ *
+ * \brief	svc denoising
+ *
+ * \date	4/1/2010 Created
+ *
+ */
+
+#include "denoise.h"
+#include "../common/typedef.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+void	BilateralLumaFilter8_c (uint8_t* pSample, int32_t iStride) {  // filters 8 consecutive samples at pSample, in place
+  int32_t nSum = 0, nTotWeight = 0;
+  int32_t iCenterSample = *pSample;
+  uint8_t* pCurLine = pSample - iStride - DENOISE_GRAY_RADIUS;
+  int32_t x, y;
+  int32_t iCurSample, iCurWeight, iGreyDiff;
+  uint8_t aSample[8];  // outputs are buffered so later samples of this group still read the original earlier ones
+
+  for (int32_t i = 0; i < 8; i++) {
+    nSum = 0;
+    nTotWeight = 0;
+    iCenterSample = *pSample;
+    pCurLine = pSample - iStride - DENOISE_GRAY_RADIUS;  // top-left corner of the 3x3 neighbourhood
+    for (y = 0; y < 3; y++) {
+      for (x = 0; x < 3; x++) {
+        if (x == 1 && y == 1) continue;			// except center point
+        iCurSample = pCurLine[x];
+        iCurWeight = WELS_ABS (iCurSample - iCenterSample);
+        iGreyDiff = 32 - iCurWeight;
+        if (iGreyDiff < 0)	continue;  // neighbours differing by more than 32 get no weight
+        else iCurWeight = (iGreyDiff * iGreyDiff) >> 5;  // (32 - |diff|)^2 / 32: 32 for equal samples, 0 at |diff| == 32
+        nSum += iCurSample * iCurWeight;
+        nTotWeight +=  iCurWeight;
+      }
+      pCurLine += iStride;
+    }
+    nTotWeight = 256 - nTotWeight;  // the centre sample takes what is left of 256 (8 neighbours * 32 at most)
+    nSum += iCenterSample * nTotWeight;
+    aSample[i] = nSum >> 8;  // all weights add up to 256
+    pSample++;
+  }
+  WelsMemcpy (pSample - 8, aSample, 8);  // pSample was advanced 8 times, so this writes back to the start of the group
+}
+
+
+/***************************************************************************
+5x5 filter:
+1	1	2	1	1
+1	2	4	2	1
+2	4	20	4	2
+1	2	4	2	1
+1	1	2	1	1
+***************************************************************************/
+#define SUM_LINE1(pSample)	(pSample[0] +(pSample[1]) +(pSample[2]<<1)  + pSample[3] + pSample[4])
+#define SUM_LINE2(pSample)	(pSample[0] +(pSample[1]<<1) +(pSample[2]<<2)  +(pSample[3]<<1) +pSample[4])
+#define SUM_LINE3(pSample)	((pSample[0]<<1) +(pSample[1]<<2) +(pSample[2]*20)  +(pSample[3]<<2) +(pSample[4]<<1))
+void	WaverageChromaFilter8_c (uint8_t* pSample, int32_t iStride) {  // 5x5 weighted average of 8 samples, in place
+  int32_t sum;
+  uint8_t* pStartPixels = pSample - UV_WINDOWS_RADIUS * iStride - UV_WINDOWS_RADIUS;  // top-left of the first 5x5 window
+  uint8_t* pCurLine1 = pStartPixels;
+  uint8_t* pCurLine2 = pCurLine1 + iStride;
+  uint8_t* pCurLine3 = pCurLine2 + iStride;  // the row that contains pSample
+  uint8_t* pCurLine4 = pCurLine3 + iStride;
+  uint8_t* pCurLine5 = pCurLine4 + iStride;
+  uint8_t aSample[8];  // outputs are buffered and written back after the loop
+
+  for (int32_t i = 0; i < 8; i++) {
+    sum = SUM_LINE1 ((pCurLine1 + i)) + SUM_LINE2 ((pCurLine2 + i)) + SUM_LINE3 ((pCurLine3 + i))
+          + SUM_LINE2 ((pCurLine4 + i)) + SUM_LINE1 ((pCurLine5 + i));
+    aSample[i] = (sum >> 6);  // the kernel weights add up to 64 (6 + 10 + 32 + 10 + 6)
+    pSample++;  // only used to form pSample - 8 below; the row pointers are indexed with i
+  }
+  WelsMemcpy (pSample - 8, aSample, 8);  // store the 8 results at the original pSample
+}
+
+/***************************************************************************
+edge of y/uv use a 3x3 Gauss filter, radius = 1:
+1	2	1
+2	4	2
+1	2	1
+***************************************************************************/
+void	Gauss3x3Filter (uint8_t* pSrc, int32_t iStride) {  // replaces *pSrc with the 3x3 weighted average around it
+  int32_t nSum = 0;
+  uint8_t* pCurLine1 = pSrc - iStride - 1;  // top-left of the 3x3 neighbourhood
+  uint8_t* pCurLine2 = pCurLine1 + iStride;
+  uint8_t* pCurLine3 = pCurLine2 + iStride;
+
+  nSum =	 pCurLine1[0]		+ (pCurLine1[1] << 1) +  pCurLine1[2]		+
+           (pCurLine2[0] << 1)	+ (pCurLine2[1] << 2) + (pCurLine2[2] << 1) +
+           pCurLine3[0]		+ (pCurLine3[1] << 1) +  pCurLine3[2];
+  *pSrc = nSum >> 4;  // the weights add up to 16
+}
+
+WELSVP_NAMESPACE_END
--- a/processing/src/downsample/downsample.cpp
+++ b/processing/src/downsample/downsample.cpp
@@ -1,145 +1,135 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "downsample.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-CDownsampling::CDownsampling(int32_t iCpuFlag)
-{
-	m_iCPUFlag = iCpuFlag;
-	m_eMethod   = METHOD_DOWNSAMPLE;
-	WelsMemset(&m_pfDownsample, 0, sizeof(m_pfDownsample));
-	InitDownsampleFuncs(m_pfDownsample, m_iCPUFlag);
-}
-
-CDownsampling::~CDownsampling()
-{	
-}
-
-void CDownsampling::InitDownsampleFuncs(SDownsampleFuncs &sDownsampleFunc,  int32_t iCpuFlag)
-{
-	sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsampler_c;
-	sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_c;
-	sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_c;
-	sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_c;
-	sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsampler_c;
-	sDownsampleFunc.pfGeneralRatioLuma	 = GeneralBilinearFastDownsampler_c;
-#if defined(X86_ASM)
-	if ( iCpuFlag & WELS_CPU_SSE )
-	{
-		sDownsampleFunc.pfHalfAverage[0]	= DyadicBilinearDownsamplerWidthx32_sse;
-		sDownsampleFunc.pfHalfAverage[1]	= DyadicBilinearDownsamplerWidthx16_sse;
-		sDownsampleFunc.pfHalfAverage[2]	= DyadicBilinearDownsamplerWidthx8_sse;
-	}
-	if ( iCpuFlag & WELS_CPU_SSE2 )
-	{
-		sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse2;
-		sDownsampleFunc.pfGeneralRatioLuma   = GeneralBilinearFastDownsamplerWrap_sse2;
-	}
-	if ( iCpuFlag & WELS_CPU_SSSE3 )
-	{
-		sDownsampleFunc.pfHalfAverage[0]	= DyadicBilinearDownsamplerWidthx32_ssse3;
-		sDownsampleFunc.pfHalfAverage[1]	= DyadicBilinearDownsamplerWidthx16_ssse3;
-	}
-	if ( iCpuFlag & WELS_CPU_SSE41 )
-	{
-		sDownsampleFunc.pfHalfAverage[0]	= DyadicBilinearDownsamplerWidthx32_sse4;
-		sDownsampleFunc.pfHalfAverage[1]	= DyadicBilinearDownsamplerWidthx16_sse4;
-	}
-#endif//X86_ASM
-	
-}
-
-EResult CDownsampling::Process(int32_t iType, SPixMap *pSrcPixMap, SPixMap *pDstPixMap)
-{
-	int32_t iSrcWidthY = pSrcPixMap->sRect.iRectWidth;
-	int32_t iSrcHeightY = pSrcPixMap->sRect.iRectHeight;
-	int32_t iDstWidthY = pDstPixMap->sRect.iRectWidth;
-	int32_t iDstHeightY = pDstPixMap->sRect.iRectHeight;
-
-	int32_t iSrcWidthUV = iSrcWidthY >> 1;
-	int32_t iSrcHeightUV = iSrcHeightY >> 1;
-	int32_t iDstWidthUV = iDstWidthY >> 1;
-	int32_t iDstHeightUV = iDstHeightY >> 1;
-
-	if (iSrcWidthY <= iDstWidthY || iSrcHeightY <= iDstHeightY)
-	{
-		return RET_INVALIDPARAM;
-	}
-
-	if ((iSrcWidthY >>1) == iDstWidthY && (iSrcHeightY >> 1) == iDstHeightY)
-	{
-		// use half average functions
-		uint8_t iAlignIndex = 3;
-
-		iAlignIndex = GetAlignedIndex(iSrcWidthY);
-		m_pfDownsample.pfHalfAverage[iAlignIndex]((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0], (uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iSrcHeightY);
-
-		iAlignIndex = GetAlignedIndex(iSrcWidthUV);
-		m_pfDownsample.pfHalfAverage[iAlignIndex]((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1], (uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iSrcHeightUV);
-		m_pfDownsample.pfHalfAverage[iAlignIndex]((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2], (uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iSrcHeightUV);
-	}
-	else 
-	{
-		m_pfDownsample.pfGeneralRatioLuma((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0], iDstWidthY, iDstHeightY, 
-			(uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iSrcHeightY);
-
-		m_pfDownsample.pfGeneralRatioChroma((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1], iDstWidthUV, iDstHeightUV,
-			(uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iSrcHeightUV);
-
-		m_pfDownsample.pfGeneralRatioChroma((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2], iDstWidthUV, iDstHeightUV,
-			(uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iSrcHeightUV);
-	}
-	return RET_SUCCESS;
-}
-
-int32_t CDownsampling::GetAlignedIndex( const int32_t kiSrcWidth )
-{
-	int32_t iAlignIndex = 3;
-	if ( (kiSrcWidth & 0x1f) == 0 )	// x32	
-		iAlignIndex	= 0;
-	else if ( (kiSrcWidth & 0x0f) == 0 )	// x16
-		iAlignIndex	= 1;
-	else if ( (kiSrcWidth & 0x07) == 0 )	// x8
-		iAlignIndex	= 2;
-	else
-		iAlignIndex	= 3;
-	return iAlignIndex;
-}
-
-
-WELSVP_NAMESPACE_END
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "downsample.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+CDownsampling::CDownsampling (int32_t iCpuFlag) {
+  m_eMethod  = METHOD_DOWNSAMPLE;
+  m_iCPUFlag = iCpuFlag;
+  // Clear the function table first, then fill it according to the CPU feature bits.
+  WelsMemset (&m_pfDownsample, 0, sizeof (m_pfDownsample));
+  InitDownsampleFuncs (m_pfDownsample, iCpuFlag);
+}
+
+CDownsampling::~CDownsampling() {}
+
+// Installs the portable C routines in every slot of sDownsampleFunc, then (X86_ASM builds only)
+// replaces individual slots with SIMD routines according to the feature bits set in iCpuFlag.
+void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc,  int32_t iCpuFlag) {
+  for (int32_t iSlot = 0; iSlot < 4; ++iSlot)
+    sDownsampleFunc.pfHalfAverage[iSlot] = DyadicBilinearDownsampler_c;
+  sDownsampleFunc.pfGeneralRatioLuma   = GeneralBilinearFastDownsampler_c;
+  sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsampler_c;
+#if defined(X86_ASM)
+  // The tests are independent, so a later match overwrites slots 0 and 1 set by an earlier one.
+  if (iCpuFlag & WELS_CPU_SSE) {
+    sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsamplerWidthx8_sse;
+    sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse;
+    sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_sse;
+  }
+  if (iCpuFlag & WELS_CPU_SSE2) {
+    sDownsampleFunc.pfGeneralRatioLuma   = GeneralBilinearFastDownsamplerWrap_sse2;
+    sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse2;
+  }
+  if (iCpuFlag & WELS_CPU_SSSE3) {
+    sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_ssse3;
+    sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_ssse3;
+  }
+  if (iCpuFlag & WELS_CPU_SSE41) {
+    sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse4;
+    sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_sse4;
+  }
+#endif//X86_ASM
+}
+
+// Downsamples the three planes of pSrcPixMap into pDstPixMap. The half-average routines are used
+// when the target is exactly half the source size in both directions, the general-ratio routines
+// otherwise. iType is not used. Returns RET_INVALIDPARAM for an empty or not strictly smaller target.
+EResult CDownsampling::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDstPixMap) {
+  const int32_t iSrcWidthY  = pSrcPixMap->sRect.iRectWidth;
+  const int32_t iSrcHeightY = pSrcPixMap->sRect.iRectHeight;
+  const int32_t iDstWidthY  = pDstPixMap->sRect.iRectWidth;
+  const int32_t iDstHeightY = pDstPixMap->sRect.iRectHeight;
+
+  // A target without pixels is rejected as well, so it never reaches the ratio-based routines.
+  if (iDstWidthY <= 0 || iDstHeightY <= 0 || iSrcWidthY <= iDstWidthY || iSrcHeightY <= iDstHeightY) {
+    return RET_INVALIDPARAM;
+  }
+
+  // Planes 1 and 2 are handled at half the plane-0 dimensions.
+  const int32_t iSrcWidthUV  = iSrcWidthY >> 1;
+  const int32_t iSrcHeightUV = iSrcHeightY >> 1;
+  const int32_t iDstWidthUV  = iDstWidthY >> 1;
+  const int32_t iDstHeightUV = iDstHeightY >> 1;
+
+  if ((iSrcWidthY >> 1) == iDstWidthY && (iSrcHeightY >> 1) == iDstHeightY) {
+    // Exact 2:1 case: the routine is chosen by source-width alignment, separately for luma and chroma.
+    const int32_t iLumaIdx   = GetAlignedIndex (iSrcWidthY);
+    const int32_t iChromaIdx = GetAlignedIndex (iSrcWidthUV);
+    m_pfDownsample.pfHalfAverage[iLumaIdx] ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],
+                                            (uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iSrcHeightY);
+    m_pfDownsample.pfHalfAverage[iChromaIdx] ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],
+        (uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iSrcHeightUV);
+    m_pfDownsample.pfHalfAverage[iChromaIdx] ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],
+        (uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iSrcHeightUV);
+  } else {
+    m_pfDownsample.pfGeneralRatioLuma ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0], iDstWidthY, iDstHeightY,
+                                       (uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iSrcHeightY);
+    m_pfDownsample.pfGeneralRatioChroma ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1], iDstWidthUV, iDstHeightUV,
+                                         (uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iSrcHeightUV);
+    m_pfDownsample.pfGeneralRatioChroma ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2], iDstWidthUV, iDstHeightUV,
+                                         (uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iSrcHeightUV);
+  }
+  return RET_SUCCESS;
+}
+
+// Maps a source width to the pfHalfAverage slot for its alignment:
+// 0 = multiple of 32, 1 = multiple of 16, 2 = multiple of 8, 3 = any other width.
+// The widest alignment that divides kiSrcWidth wins.
+int32_t CDownsampling::GetAlignedIndex (const int32_t kiSrcWidth) {
+  if (kiSrcWidth & 0x07)  // some of the low three bits set: not a multiple of 8
+    return 3;
+  if (kiSrcWidth & 0x08)  // multiple of 8 but not of 16
+    return 2;
+  if (kiSrcWidth & 0x10)  // multiple of 16 but not of 32
+    return 1;
+  return 0;               // multiple of 32
+}
+
+
+WELSVP_NAMESPACE_END
--- a/processing/src/downsample/downsample.h
+++ b/processing/src/downsample/downsample.h
@@ -1,126 +1,128 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	    :  downsample.h
- *
- * \brief	    :  downsample class of wels video processor class
- *
- * \date        :  2011/03/33
- *
- * \description :  1. rewrite the package code of downsample class  
- *
- *************************************************************************************
- */
-
-#ifndef _WELSVP_DOWNSAMPLE_H
-#define _WELSVP_DOWNSAMPLE_H
-
-#include "../common/util.h"
-#include "../common/WelsFrameWork.h"
-#include "../../interface/IWelsVP.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-
-typedef void (HalveDownsampleFunc)(	uint8_t* pDst, const int32_t kiDstStride,
-								   uint8_t* pSrc, const int32_t kiSrcStride,
-								   const int32_t kiSrcWidth, const int32_t kiSrcHeight );
-
-typedef void (GeneralDownsampleFunc)(uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight,
-									 uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight );
-
-typedef HalveDownsampleFunc		*PHalveDownsampleFunc;
-typedef GeneralDownsampleFunc	*PGeneralDownsampleFunc;
-
-HalveDownsampleFunc   DyadicBilinearDownsampler_c;
-GeneralDownsampleFunc GeneralBilinearFastDownsampler_c;
-GeneralDownsampleFunc GeneralBilinearAccurateDownsampler_c;
-
-typedef struct {
-	// align_index: 0 = x32; 1 = x16; 2 = x8; 3 = common case left;
-	PHalveDownsampleFunc			pfHalfAverage[4];
-	PGeneralDownsampleFunc		pfGeneralRatioLuma;
-	PGeneralDownsampleFunc		pfGeneralRatioChroma;
-}SDownsampleFuncs;
-
-
-#ifdef X86_ASM
-WELSVP_EXTERN_C_BEGIN
-// used for scr width is multipler of 8 pixels
-HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx8_sse;
-// iSrcWidth= x16 pixels
-HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx16_sse;
-// iSrcWidth= x32 pixels
-HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx32_sse;
-// used for scr width is multipler of 16 pixels
-HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx16_ssse3;
-// iSrcWidth= x32 pixels
-HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx32_ssse3;
-// iSrcWidth= x16 pixels
-HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx16_sse4;
-// iSrcWidth= x32 pixels
-HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx32_sse4;
-
-GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_sse2;
-GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;
-
-void GeneralBilinearFastDownsampler_sse2( uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight,
-	uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight,
-	const uint32_t kuiScaleX, const uint32_t kuiScaleY );
-void GeneralBilinearAccurateDownsampler_sse2( uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight,
-	uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight,
-	const uint32_t kuiScaleX, const uint32_t kuiScaleY );
-WELSVP_EXTERN_C_END
-#endif
-
-
-
-
-class CDownsampling : public IStrategy
-{			  
-public:
-	CDownsampling(int32_t iCpuFlag);
-	~CDownsampling();
-
-	EResult Process(int32_t iType, SPixMap *pSrc, SPixMap *pDst);
-
-private:
-	void InitDownsampleFuncs(SDownsampleFuncs &sDownsampleFunc, int32_t iCpuFlag);
-
-	int32_t GetAlignedIndex( const int32_t kiSrcWidth );
-
-private:
-	SDownsampleFuncs m_pfDownsample;
-	int32_t  m_iCPUFlag;
-};	
-
-WELSVP_NAMESPACE_END
-
-#endif
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	    :  downsample.h
+ *
+ * \brief	    :  downsample class of wels video processor class
+ *
+ * \date        :  2011/03/33
+ *
+ * \description :  1. rewrite the package code of downsample class
+ *
+ *************************************************************************************
+ */
+
+#ifndef _WELSVP_DOWNSAMPLE_H
+#define _WELSVP_DOWNSAMPLE_H
+
+#include "../common/util.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+// Signature of a half-size downsampler: only the source size is passed.
+typedef void (HalveDownsampleFunc) (uint8_t* pDst, const int32_t kiDstStride,
+                                    uint8_t* pSrc, const int32_t kiSrcStride,
+                                    const int32_t kiSrcWidth, const int32_t kiSrcHeight);
+// Signature of a general-ratio downsampler: destination and source sizes are both passed.
+typedef void (GeneralDownsampleFunc) (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
+                                      const int32_t kiDstHeight,
+                                      uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight);
+// Pointer forms of the two signatures, as stored in SDownsampleFuncs.
+typedef HalveDownsampleFunc*		PHalveDownsampleFunc;
+typedef GeneralDownsampleFunc*	PGeneralDownsampleFunc;
+// Portable C implementations.
+HalveDownsampleFunc   DyadicBilinearDownsampler_c;
+GeneralDownsampleFunc GeneralBilinearFastDownsampler_c;
+GeneralDownsampleFunc GeneralBilinearAccurateDownsampler_c;
+
+typedef struct {
+  // Half-size routines indexed by source-width alignment: 0 = x32; 1 = x16; 2 = x8; 3 = any other width.
+  PHalveDownsampleFunc			pfHalfAverage[4];
+  PGeneralDownsampleFunc		pfGeneralRatioLuma;    // general-ratio routine applied to plane 0
+  PGeneralDownsampleFunc		pfGeneralRatioChroma;  // general-ratio routine applied to planes 1 and 2
+} SDownsampleFuncs;
+
+
+#ifdef X86_ASM
+WELSVP_EXTERN_C_BEGIN
+// SSE: used when the source width is a multiple of 8 pixels
+HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx8_sse;
+// SSE: source width is a multiple of 16 pixels
+HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx16_sse;
+// SSE: source width is a multiple of 32 pixels
+HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx32_sse;
+// SSSE3: used when the source width is a multiple of 16 pixels
+HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx16_ssse3;
+// SSSE3: source width is a multiple of 32 pixels
+HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx32_ssse3;
+// SSE4: source width is a multiple of 16 pixels
+HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx16_sse4;
+// SSE4: source width is a multiple of 32 pixels
+HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx32_sse4;
+// General-ratio entry points that share the GeneralDownsampleFunc signature.
+GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_sse2;
+GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;
+// SSE2 routines that additionally take the scale factors kuiScaleX and kuiScaleY.
+void GeneralBilinearFastDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
+    const int32_t kiDstHeight,
+    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight,
+    const uint32_t kuiScaleX, const uint32_t kuiScaleY);
+void GeneralBilinearAccurateDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
+    const int32_t kiDstHeight,
+    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight,
+    const uint32_t kuiScaleX, const uint32_t kuiScaleY);
+WELSVP_EXTERN_C_END
+#endif
+
+
+
+// Downsampling strategy: scales a source picture into a strictly smaller destination picture.
+class CDownsampling : public IStrategy {
+ public:
+  CDownsampling (int32_t iCpuFlag);  // builds the function table for the given CPU feature flags
+  ~CDownsampling();
+  // Downsamples pSrc into pDst; returns RET_INVALIDPARAM when pDst is not smaller than pSrc.
+  EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pDst);
+
+ private:
+  void InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int32_t iCpuFlag);
+  // Returns the pfHalfAverage index that matches the alignment of kiSrcWidth.
+  int32_t GetAlignedIndex (const int32_t kiSrcWidth);
+
+ private:
+  SDownsampleFuncs m_pfDownsample;  // routines selected by InitDownsampleFuncs
+  int32_t  m_iCPUFlag;              // CPU feature flags passed to the constructor
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- a/processing/src/downsample/downsamplefuncs.cpp
+++ b/processing/src/downsample/downsamplefuncs.cpp
@@ -1,241 +1,234 @@
-/*!
- * \copy
- *     Copyright (c)  2008-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *  downsample_yuv.c
- *
- *  Abstract
- *      Implementation for source yuv data downsampling used before spatial encoding.
- *
- *  History
- *      10/24/2008 Created
- *
- *****************************************************************************/
-
-#include "../common/typedef.h"
-#include "../common/util.h"
-#include "downsample.h"
-
-
-WELSVP_NAMESPACE_BEGIN
-
-
-void DyadicBilinearDownsampler_c( uint8_t* pDst, const int32_t kiDstStride,
-						  uint8_t* pSrc, const int32_t kiSrcStride,
-						  const int32_t kiSrcWidth, const int32_t kiSrcHeight )
-								   
-{
-	uint8_t *pDstLine	= pDst;
-	uint8_t *pSrcLine	= pSrc;
-	const int32_t kiSrcStridex2	= kiSrcStride << 1;
-	const int32_t kiDstWidth		= kiSrcWidth >> 1;
-	const int32_t kiDstHeight	= kiSrcHeight >> 1;
-
-	for( int32_t j = 0; j < kiDstHeight; j ++ )
-	{
-		for( int32_t i = 0; i < kiDstWidth; i ++ )
-		{
-			const int32_t kiSrcX = i<<1;
-			const int32_t kiTempRow1 = (pSrcLine[kiSrcX] + pSrcLine[kiSrcX+1] + 1) >> 1;
-			const int32_t kiTempRow2 = (pSrcLine[kiSrcX+kiSrcStride] + pSrcLine[kiSrcX+kiSrcStride+1] + 1) >> 1;
-
-			pDstLine[i] = (uint8_t)((kiTempRow1 + kiTempRow2 + 1) >> 1);
-		}
-		pDstLine	+= kiDstStride;
-		pSrcLine	+= kiSrcStridex2;
-	}	
-}
-
-void GeneralBilinearFastDownsampler_c(uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight, 
-								uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight )
-{
-	const uint32_t kuiScaleBitWidth = 16, kuiScaleBitHeight = 15;
-	const uint32_t kuiScaleWidth = (1 << kuiScaleBitWidth), kuiScaleHeight = (1 << kuiScaleBitHeight);
-	int32_t fScalex = (int32_t)((float_t)kiSrcWidth / (float_t)kiDstWidth * kuiScaleWidth);
-	int32_t fScaley = (int32_t)((float_t)kiSrcHeight / (float_t)kiDstHeight * kuiScaleHeight);
-	uint32_t x;
-	int32_t iYInverse, iXInverse;
-
-	uint8_t* pByDst = pDst;
-	uint8_t* pByLineDst = pDst;
-
-	iYInverse = 1 << (kuiScaleBitHeight - 1);
-	for(int32_t i = 0; i < kiDstHeight - 1; i++)
-	{
-		int32_t iYy = iYInverse >> kuiScaleBitHeight;
-		int32_t fv = iYInverse & (kuiScaleHeight - 1);
-
-		uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
-
-		pByDst = pByLineDst;
-		iXInverse = 1 << (kuiScaleBitWidth - 1);
-		for(int32_t j = 0; j < kiDstWidth - 1; j++)
-		{
-			int32_t iXx = iXInverse >> kuiScaleBitWidth;
-			int32_t iFu = iXInverse & (kuiScaleWidth - 1);
-
-			uint8_t* pByCurrent = pBySrc + iXx;
-			uint8_t a, b, c, d;
-
-			a = *pByCurrent;
-			b = *(pByCurrent + 1 );
-			c = *(pByCurrent + kiSrcStride);
-			d = *(pByCurrent + kiSrcStride + 1 );
-
-			x  = (((uint32_t)(kuiScaleWidth - 1 - iFu))*(kuiScaleHeight - 1 - fv) >> kuiScaleBitWidth) * a;
-			x += (((uint32_t)(iFu))*(kuiScaleHeight - 1 - fv) >> kuiScaleBitWidth) * b;
-			x += (((uint32_t)(kuiScaleWidth - 1 - iFu))*(fv) >> kuiScaleBitWidth) * c;
-			x += (((uint32_t)(iFu))*(fv) >> kuiScaleBitWidth) * d;
-			x >>= (kuiScaleBitHeight - 1);
-			x += 1;
-			x >>= 1;
-			//x = (((__int64)(SCALE_BIG - 1 - iFu))*(SCALE_BIG - 1 - fv)*a + ((__int64)iFu)*(SCALE_BIG - 1 -fv)*b + ((__int64)(SCALE_BIG - 1 -iFu))*fv*c + 
-			//		 ((__int64)iFu)*fv*d + (1 << (2*SCALE_BIT_BIG-1)) ) >> (2*SCALE_BIT_BIG);
-			x = WELS_CLAMP(x, 0, 255);
-			*pByDst++ = (uint8_t)x;
-
-			iXInverse += fScalex;
-		}
-		*pByDst = *(pBySrc + (iXInverse >> kuiScaleBitWidth));
-		pByLineDst += kiDstStride;
-		iYInverse += fScaley;
-	}
-
-	// last row special
-	{
-		int32_t iYy = iYInverse >> kuiScaleBitHeight;
-		uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
-
-		pByDst = pByLineDst;
-		iXInverse = 1 << (kuiScaleBitWidth - 1);
-		for(int32_t j = 0; j < kiDstWidth; j++)
-		{
-			int32_t iXx = iXInverse >> kuiScaleBitWidth;
-			*pByDst++ = *(pBySrc + iXx);
-
-			iXInverse += fScalex;
-		}
-	}
-}
-
-void GeneralBilinearAccurateDownsampler_c(uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight, 
-									uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight )
-{
-	const int32_t kiScaleBit = 15;
-	const int32_t kiScale = (1 << kiScaleBit);
-	int32_t iScalex = (int32_t)((float_t)kiSrcWidth / (float_t)kiDstWidth * kiScale);
-	int32_t iScaley = (int32_t)((float_t)kiSrcHeight / (float_t)kiDstHeight * kiScale);
-	int64_t x;
-	int32_t iYInverse, iXInverse;
-
-	uint8_t* pByDst = pDst;
-	uint8_t* pByLineDst = pDst;
-
-	iYInverse = 1 << (kiScaleBit - 1);
-	for(int32_t i = 0; i < kiDstHeight - 1; i++)
-	{
-		int32_t iYy = iYInverse >> kiScaleBit;
-		int32_t iFv = iYInverse & (kiScale - 1);
-
-		uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
-
-		pByDst = pByLineDst;
-		iXInverse = 1 << (kiScaleBit - 1);
-		for(int32_t j = 0; j < kiDstWidth - 1; j++)
-		{
-			int32_t iXx = iXInverse >> kiScaleBit;
-			int32_t iFu = iXInverse & (kiScale - 1);
-
-			uint8_t* pByCurrent = pBySrc + iXx;
-			uint8_t a, b, c, d;
-
-			a = *pByCurrent;
-			b = *(pByCurrent + 1 );
-			c = *(pByCurrent + kiSrcStride);
-			d = *(pByCurrent + kiSrcStride + 1 );
-
-			x = (((int64_t)(kiScale - 1 - iFu))*(kiScale - 1 - iFv)*a + ((int64_t)iFu)*(kiScale - 1 -iFv)*b + ((int64_t)(kiScale - 1 -iFu))*iFv*c + 
-				((int64_t)iFu)*iFv*d + (int64_t)(1 << (2*kiScaleBit-1)) ) >> (2*kiScaleBit);
-			x = WELS_CLAMP(x, 0, 255);
-			*pByDst++ = (uint8_t)x;
-
-			iXInverse += iScalex;
-		}
-		*pByDst = *(pBySrc + (iXInverse >> kiScaleBit));
-		pByLineDst += kiDstStride;
-		iYInverse += iScaley;
-	}
-
-	// last row special
-	{
-		int32_t iYy = iYInverse >> kiScaleBit;
-		uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
-
-		pByDst = pByLineDst;
-		iXInverse = 1 << (kiScaleBit - 1);
-		for(int32_t j = 0; j < kiDstWidth; j++)
-		{
-			int32_t iXx = iXInverse >> kiScaleBit;
-			*pByDst++ = *(pBySrc + iXx);
-
-			iXInverse += iScalex;
-		}
-	}
-}
-
-
-#ifdef X86_ASM
-void GeneralBilinearFastDownsamplerWrap_sse2(uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight, 
-						uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight)
-{
-	const int32_t kiScaleBitWidth = 16, kiScaleBitHeight = 15;
-	const uint32_t kuiScaleWidth = (1 << kiScaleBitWidth), kuiScaleHeight = (1 << kiScaleBitHeight);
-
-	uint32_t uiScalex = (uint32_t)((float_t)kiSrcWidth / (float_t)kiDstWidth * kuiScaleWidth);
-	uint32_t uiScaley = (uint32_t)((float_t)kiSrcHeight / (float_t)kiDstHeight * kuiScaleHeight);
-
-	GeneralBilinearFastDownsampler_sse2(pDst, kiDstStride, kiDstWidth, kiDstHeight, 
-		pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, uiScalex, uiScaley);
-}
-
-void GeneralBilinearAccurateDownsamplerWrap_sse2(uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight, 
-									uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight )
-{
-	const int32_t kiScaleBit = 15;
-	const uint32_t kuiScale = (1 << kiScaleBit);
-
-	uint32_t uiScalex = (uint32_t)((float_t)kiSrcWidth / (float_t)kiDstWidth * kuiScale);
-	uint32_t uiScaley = (uint32_t)((float_t)kiSrcHeight / (float_t)kiDstHeight * kuiScale);
-
-	GeneralBilinearAccurateDownsampler_sse2(pDst, kiDstStride, kiDstWidth, kiDstHeight, 
-		pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, uiScalex, uiScaley);
-}
-#endif //X86_ASM
-
-WELSVP_NAMESPACE_END
+/*!
+ * \copy
+ *     Copyright (c)  2008-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *  downsample_yuv.c
+ *
+ *  Abstract
+ *      Implementation for source yuv data downsampling used before spatial encoding.
+ *
+ *  History
+ *      10/24/2008 Created
+ *
+ *****************************************************************************/
+
+#include "../common/typedef.h"
+#include "../common/util.h"
+#include "downsample.h"
+
+
+WELSVP_NAMESPACE_BEGIN
+
+
+void DyadicBilinearDownsampler_c (uint8_t* pDst, const int32_t kiDstStride,
+                                  uint8_t* pSrc, const int32_t kiSrcStride,
+                                  const int32_t kiSrcWidth, const int32_t kiSrcHeight)
+// Halves a plane in both directions: each output pixel is the rounded mean of a 2x2 source block.
+{
+  const int32_t kiOutWidth  = kiSrcWidth >> 1;
+  const int32_t kiOutHeight = kiSrcHeight >> 1;
+  uint8_t* pOutRow = pDst;
+  uint8_t* pInRow  = pSrc;
+
+  for (int32_t iRow = 0; iRow < kiOutHeight; ++iRow) {
+    for (int32_t iCol = 0; iCol < kiOutWidth; ++iCol) {
+      const int32_t kiLeft = iCol << 1;
+      // Average each source row horizontally first, then the two results vertically (halves round up both times).
+      const int32_t kiUpper = (pInRow[kiLeft] + pInRow[kiLeft + 1] + 1) >> 1;
+      const int32_t kiLower = (pInRow[kiLeft + kiSrcStride] + pInRow[kiLeft + kiSrcStride + 1] + 1) >> 1;
+
+      pOutRow[iCol] = (uint8_t) ((kiUpper + kiLower + 1) >> 1);
+    }
+    pOutRow += kiDstStride;
+    pInRow  += kiSrcStride << 1;
+  }
+}
+// Fixed-point bilinear downscale of one 8-bit plane; the last column and the last row are copied, not interpolated.
+void GeneralBilinearFastDownsampler_c (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
+                                       const int32_t kiDstHeight,
+                                       uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
+  const uint32_t kuiScaleBitWidth = 16, kuiScaleBitHeight = 15;
+  const uint32_t kuiScaleWidth = (1 << kuiScaleBitWidth), kuiScaleHeight = (1 << kuiScaleBitHeight);
+  int32_t fScalex = (int32_t) ((float_t)kiSrcWidth / (float_t)kiDstWidth * kuiScaleWidth);
+  int32_t fScaley = (int32_t) ((float_t)kiSrcHeight / (float_t)kiDstHeight * kuiScaleHeight);
+  uint32_t x;
+  int32_t iYInverse, iXInverse;
+  // fScalex / fScaley: source step per destination pixel, with 16 (x) and 15 (y) fractional bits.
+  uint8_t* pByDst = pDst;
+  uint8_t* pByLineDst = pDst;
+  // Both source coordinates start at 0.5 in their respective fixed-point formats.
+  iYInverse = 1 << (kuiScaleBitHeight - 1);
+  for (int32_t i = 0; i < kiDstHeight - 1; i++) {
+    int32_t iYy = iYInverse >> kuiScaleBitHeight;
+    int32_t fv = iYInverse & (kuiScaleHeight - 1);
+    // iYy is the integer source row, fv the 15-bit vertical fraction within it.
+    uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
+
+    pByDst = pByLineDst;
+    iXInverse = 1 << (kuiScaleBitWidth - 1);
+    for (int32_t j = 0; j < kiDstWidth - 1; j++) {
+      int32_t iXx = iXInverse >> kuiScaleBitWidth;
+      int32_t iFu = iXInverse & (kuiScaleWidth - 1);
+
+      uint8_t* pByCurrent = pBySrc + iXx;
+      uint8_t a, b, c, d;
+      // a b / c d: the 2x2 source neighbourhood (current row and the row below) that gets blended.
+      a = *pByCurrent;
+      b = * (pByCurrent + 1);
+      c = * (pByCurrent + kiSrcStride);
+      d = * (pByCurrent + kiSrcStride + 1);
+      // Each weight is (x fraction * y fraction) >> 16; the weighted sum is then divided by 2^15 with rounding.
+      x  = (((uint32_t) (kuiScaleWidth - 1 - iFu)) * (kuiScaleHeight - 1 - fv) >> kuiScaleBitWidth) * a;
+      x += (((uint32_t) (iFu)) * (kuiScaleHeight - 1 - fv) >> kuiScaleBitWidth) * b;
+      x += (((uint32_t) (kuiScaleWidth - 1 - iFu)) * (fv) >> kuiScaleBitWidth) * c;
+      x += (((uint32_t) (iFu)) * (fv) >> kuiScaleBitWidth) * d;
+      x >>= (kuiScaleBitHeight - 1);
+      x += 1;
+      x >>= 1;
+      //x = (((__int64)(SCALE_BIG - 1 - iFu))*(SCALE_BIG - 1 - fv)*a + ((__int64)iFu)*(SCALE_BIG - 1 -fv)*b + ((__int64)(SCALE_BIG - 1 -iFu))*fv*c +
+      //		 ((__int64)iFu)*fv*d + (1 << (2*SCALE_BIT_BIG-1)) ) >> (2*SCALE_BIT_BIG);
+      x = WELS_CLAMP (x, 0, 255);
+      *pByDst++ = (uint8_t)x;
+
+      iXInverse += fScalex;
+    }
+    *pByDst = * (pBySrc + (iXInverse >> kuiScaleBitWidth)); // last column: nearest source sample, no blending
+    pByLineDst += kiDstStride;
+    iYInverse += fScaley;
+  }
+  // NOTE(review): presumably the edges are copied so the +1 / +stride reads above stay inside the source -- confirm.
+  // last row special
+  {
+    int32_t iYy = iYInverse >> kuiScaleBitHeight;
+    uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
+
+    pByDst = pByLineDst;
+    iXInverse = 1 << (kuiScaleBitWidth - 1);
+    for (int32_t j = 0; j < kiDstWidth; j++) {
+      int32_t iXx = iXInverse >> kuiScaleBitWidth;
+      *pByDst++ = * (pBySrc + iXx);
+
+      iXInverse += fScalex;
+    }
+  }
+}
+// Bilinear downscale of one 8-bit plane with 64-bit weight products; last column and last row are copied.
+void GeneralBilinearAccurateDownsampler_c (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
+    const int32_t kiDstHeight,
+    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
+  const int32_t kiScaleBit = 15;
+  const int32_t kiScale = (1 << kiScaleBit);
+  int32_t iScalex = (int32_t) ((float_t)kiSrcWidth / (float_t)kiDstWidth * kiScale);
+  int32_t iScaley = (int32_t) ((float_t)kiSrcHeight / (float_t)kiDstHeight * kiScale);
+  int64_t x;
+  int32_t iYInverse, iXInverse;
+  // iScalex / iScaley: source step per destination pixel, with 15 fractional bits in both directions.
+  uint8_t* pByDst = pDst;
+  uint8_t* pByLineDst = pDst;
+  // Both source coordinates start at 0.5 in fixed point.
+  iYInverse = 1 << (kiScaleBit - 1);
+  for (int32_t i = 0; i < kiDstHeight - 1; i++) {
+    int32_t iYy = iYInverse >> kiScaleBit;
+    int32_t iFv = iYInverse & (kiScale - 1);
+    // iYy is the integer source row, iFv the 15-bit vertical fraction within it.
+    uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
+
+    pByDst = pByLineDst;
+    iXInverse = 1 << (kiScaleBit - 1);
+    for (int32_t j = 0; j < kiDstWidth - 1; j++) {
+      int32_t iXx = iXInverse >> kiScaleBit;
+      int32_t iFu = iXInverse & (kiScale - 1);
+
+      uint8_t* pByCurrent = pBySrc + iXx;
+      uint8_t a, b, c, d;
+      // a b / c d: the 2x2 source neighbourhood (current row and the row below) that gets blended.
+      a = *pByCurrent;
+      b = * (pByCurrent + 1);
+      c = * (pByCurrent + kiSrcStride);
+      d = * (pByCurrent + kiSrcStride + 1);
+      // Weighted sum kept at full 30-bit weight precision in 64 bits, then rounded (+ 2^29) and shifted right by 30.
+      x = (((int64_t) (kiScale - 1 - iFu)) * (kiScale - 1 - iFv) * a + ((int64_t)iFu) * (kiScale - 1 - iFv) * b + ((int64_t) (
+             kiScale - 1 - iFu)) * iFv * c +
+           ((int64_t)iFu) * iFv * d + (int64_t) (1 << (2 * kiScaleBit - 1))) >> (2 * kiScaleBit);
+      x = WELS_CLAMP (x, 0, 255);
+      *pByDst++ = (uint8_t)x;
+
+      iXInverse += iScalex;
+    }
+    *pByDst = * (pBySrc + (iXInverse >> kiScaleBit)); // last column: nearest source sample, no blending
+    pByLineDst += kiDstStride;
+    iYInverse += iScaley;
+  }
+  // NOTE(review): presumably the edges are copied so the +1 / +stride reads above stay inside the source -- confirm.
+  // last row special
+  {
+    int32_t iYy = iYInverse >> kiScaleBit;
+    uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
+
+    pByDst = pByLineDst;
+    iXInverse = 1 << (kiScaleBit - 1);
+    for (int32_t j = 0; j < kiDstWidth; j++) {
+      int32_t iXx = iXInverse >> kiScaleBit;
+      *pByDst++ = * (pBySrc + iXx);
+
+      iXInverse += iScalex;
+    }
+  }
+}
+
+
+#ifdef X86_ASM
+void GeneralBilinearFastDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
+    const int32_t kiDstHeight,
+    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
+  // Fixed-point unit values for the steps passed to the SSE2 kernel: 16 fractional bits in x, 15 in y.
+  const uint32_t kuiUnitX = (1 << 16);
+  const uint32_t kuiUnitY = (1 << 15);
+  uint32_t uiStepX = (uint32_t) ((float_t)kiSrcWidth / (float_t)kiDstWidth * kuiUnitX);
+  uint32_t uiStepY = (uint32_t) ((float_t)kiSrcHeight / (float_t)kiDstHeight * kuiUnitY);
+
+  GeneralBilinearFastDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,
+                                       pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, uiStepX, uiStepY);
+}
+
+void GeneralBilinearAccurateDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
+    const int32_t kiDstHeight,
+    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
+  // Both steps passed to the SSE2 kernel use 15 fractional bits.
+  const uint32_t kuiUnit = (1 << 15);
+  uint32_t uiStepX = (uint32_t) ((float_t)kiSrcWidth / (float_t)kiDstWidth * kuiUnit);
+  uint32_t uiStepY = (uint32_t) ((float_t)kiSrcHeight / (float_t)kiDstHeight * kuiUnit);
+
+  GeneralBilinearAccurateDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,
+      pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight,
+      uiStepX, uiStepY);
+}
+#endif //X86_ASM
+
+WELSVP_NAMESPACE_END
--- a/processing/src/imagerotate/imagerotate.cpp
+++ b/processing/src/imagerotate/imagerotate.cpp
@@ -1,105 +1,93 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "imagerotate.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-CImageRotating::CImageRotating(int32_t iCpuFlag)
-{
-	m_iCPUFlag = iCpuFlag;
-	m_eMethod   = METHOD_IMAGE_ROTATE;
-	WelsMemset(&m_pfRotateImage, 0, sizeof(m_pfRotateImage));
-	InitImageRotateFuncs(m_pfRotateImage, m_iCPUFlag);
-}
-
-CImageRotating::~CImageRotating()
-{	
-}
-
-void CImageRotating::InitImageRotateFuncs(SImageRotateFuncs &sImageRotateFuncs, int32_t iCpuFlag)
-{
-	sImageRotateFuncs.pfImageRotate90D = ImageRotate90D_c;
-	sImageRotateFuncs.pfImageRotate180D = ImageRotate180D_c;
-	sImageRotateFuncs.pfImageRotate270D = ImageRotate270D_c;
-}
-EResult CImageRotating::ProcessImageRotate(int32_t iType, uint8_t *pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight, uint8_t *pDst)
-{
-	if (iType == 90)
-	{
-		m_pfRotateImage.pfImageRotate90D(pSrc, uiBytesPerPixel, iWidth, iHeight, pDst);
-	}
-	else if (iType == 180)
-	{
-		m_pfRotateImage.pfImageRotate180D(pSrc, uiBytesPerPixel, iWidth, iHeight, pDst);
-	}
-	else if (iType == 270)
-	{
-		m_pfRotateImage.pfImageRotate270D(pSrc, uiBytesPerPixel, iWidth, iHeight, pDst);
-	}
-	else
-	{	
-		return RET_NOTSUPPORTED;
-	}
-	return RET_SUCCESS;
-}
-
-EResult CImageRotating::Process(int32_t iType, SPixMap *pSrc, SPixMap *pDst)
-{
-	EResult eReturn = RET_INVALIDPARAM;
-
-	if ((pSrc->eFormat == VIDEO_FORMAT_RGBA) ||
-		(pSrc->eFormat == VIDEO_FORMAT_BGRA) ||
-		(pSrc->eFormat == VIDEO_FORMAT_ABGR) ||
-		(pSrc->eFormat == VIDEO_FORMAT_ARGB))
-	{
-		eReturn = ProcessImageRotate(iType, (uint8_t *)pSrc->pPixel[0], pSrc->iSizeInBits*8, pSrc->sRect.iRectWidth, pSrc->sRect.iRectHeight, (uint8_t *)pDst->pPixel[0]);
-	}
-	else if (pSrc->eFormat == VIDEO_FORMAT_I420)
-	{
-		ProcessImageRotate(iType, (uint8_t *)pSrc->pPixel[0], pSrc->iSizeInBits*8, pSrc->sRect.iRectWidth, pSrc->sRect.iRectHeight, (uint8_t *)pDst->pPixel[0]);
-		ProcessImageRotate(iType, (uint8_t *)pSrc->pPixel[1], pSrc->iSizeInBits*8, (pSrc->sRect.iRectWidth >> 1), (pSrc->sRect.iRectHeight >> 1), (uint8_t *)pDst->pPixel[1]);
-		eReturn = ProcessImageRotate(iType, (uint8_t *)pSrc->pPixel[2], pSrc->iSizeInBits*8, (pSrc->sRect.iRectWidth >> 1), (pSrc->sRect.iRectHeight >> 1), (uint8_t *)pDst->pPixel[2]);
-	}
-	else
-	{
-		eReturn = RET_NOTSUPPORTED;
-	}
-
-	return eReturn;
-}
-
-
-WELSVP_NAMESPACE_END
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "imagerotate.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+CImageRotating::CImageRotating (int32_t iCpuFlag) : m_iCPUFlag (iCpuFlag) {
+  m_eMethod = METHOD_IMAGE_ROTATE;
+  // Zero the dispatch table first, then bind the rotation kernels into it.
+  WelsMemset (&m_pfRotateImage, 0, sizeof (m_pfRotateImage));
+  InitImageRotateFuncs (m_pfRotateImage, iCpuFlag);
+}
+
+CImageRotating::~CImageRotating() { // empty: the constructor only fills in kernel pointers and flags
+}
+
+void CImageRotating::InitImageRotateFuncs (SImageRotateFuncs& sImageRotateFuncs, int32_t iCpuFlag) {
+  sImageRotateFuncs.pfImageRotate270D = ImageRotate270D_c; // iCpuFlag is not consulted:
+  sImageRotateFuncs.pfImageRotate180D = ImageRotate180D_c; // every slot is bound to its
+  sImageRotateFuncs.pfImageRotate90D  = ImageRotate90D_c;  // plain C kernel
+}
+EResult CImageRotating::ProcessImageRotate (int32_t iType, uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth,
+    uint32_t iHeight, uint8_t* pDst) {
+  // iType is the rotation angle in degrees; pick the matching kernel or reject the request.
+  ImageRotateFuncPtr pfRotate = 0;
+  switch (iType) {
+  case 90:  pfRotate = m_pfRotateImage.pfImageRotate90D;  break;
+  case 180: pfRotate = m_pfRotateImage.pfImageRotate180D; break;
+  case 270: pfRotate = m_pfRotateImage.pfImageRotate270D; break;
+  default:  return RET_NOTSUPPORTED;
+  }
+  pfRotate (pSrc, uiBytesPerPixel, iWidth, iHeight, pDst);
+  return RET_SUCCESS;
+}
+
+EResult CImageRotating::Process (int32_t iType, SPixMap* pSrc, SPixMap* pDst) {
+  // NOTE(review): a field named iSizeInBits is multiplied by 8 and passed as uiBytesPerPixel; a bits-to-bytes
+  // conversion would normally divide by 8 -- confirm against the SPixMap definition before relying on this.
+  const bool kbPackedRgb = (pSrc->eFormat == VIDEO_FORMAT_RGBA) || (pSrc->eFormat == VIDEO_FORMAT_BGRA) ||
+                           (pSrc->eFormat == VIDEO_FORMAT_ABGR) || (pSrc->eFormat == VIDEO_FORMAT_ARGB);
+  if (!kbPackedRgb && pSrc->eFormat != VIDEO_FORMAT_I420)
+    return RET_NOTSUPPORTED;
+
+  // The four RGB-style formats are rotated from plane 0 only; I420 rotates planes 0..2,
+  // with planes 1 and 2 handled at half the rectangle width and height.
+  const int32_t kiPlaneCount = kbPackedRgb ? 1 : 3;
+  EResult eReturn = RET_INVALIDPARAM;
+  for (int32_t iPlane = 0; iPlane < kiPlaneCount; iPlane++) {
+    const int32_t kiShift = (iPlane > 0) ? 1 : 0;
+    // Only the status of the last plane processed is reported to the caller.
+    eReturn = ProcessImageRotate (iType, (uint8_t*)pSrc->pPixel[iPlane], pSrc->iSizeInBits * 8,
+                                  (pSrc->sRect.iRectWidth >> kiShift), (pSrc->sRect.iRectHeight >> kiShift),
+                                  (uint8_t*)pDst->pPixel[iPlane]);
+  }
+
+  return eReturn;
+}
+
+
+WELSVP_NAMESPACE_END
--- a/processing/src/imagerotate/imagerotate.h
+++ b/processing/src/imagerotate/imagerotate.h
@@ -1,84 +1,85 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	    :  downsample.h
- *
- * \brief	    :  image rotate class of wels video processor class
- *
- * \date        :  2011/04/06
- *
- * \description :  
- *
- *************************************************************************************
- */
-
-#ifndef _WELSVP_IMAGEROTATE_H
-#define _WELSVP_IMAGEROTATE_H
-
-#include "../common/util.h"
-#include "../common/WelsFrameWork.h"
-#include "../../interface/IWelsVP.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-typedef void (ImageRotateFunc)( uint8_t *pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight, uint8_t *pDst);
-
-typedef ImageRotateFunc		*ImageRotateFuncPtr;
-
-ImageRotateFunc   ImageRotate90D_c;
-ImageRotateFunc   ImageRotate180D_c;
-ImageRotateFunc   ImageRotate270D_c;
-
-typedef struct {
-	ImageRotateFuncPtr		pfImageRotate90D;
-	ImageRotateFuncPtr		pfImageRotate180D;
-	ImageRotateFuncPtr		pfImageRotate270D;
-}SImageRotateFuncs;
-
-class CImageRotating : public IStrategy
-{			  
-public:
-	CImageRotating(int32_t iCpuFlag);
-	~CImageRotating();
-
-	EResult Process(int32_t iType, SPixMap *pSrc, SPixMap *pDst);
-
-private:
-	void InitImageRotateFuncs(SImageRotateFuncs &pf, int32_t iCpuFlag);
-	EResult ProcessImageRotate(int32_t iType, uint8_t *pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight, uint8_t *pDst);
-
-private:
-	SImageRotateFuncs m_pfRotateImage;
-	int32_t          m_iCPUFlag;
-};	
-
-WELSVP_NAMESPACE_END
-
-#endif
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	    :  imagerotate.h
+ *
+ * \brief	    :  image rotate class of wels video processor class
+ *
+ * \date        :  2011/04/06
+ *
+ * \description :
+ *
+ *************************************************************************************
+ */
+
+#ifndef _WELSVP_IMAGEROTATE_H
+#define _WELSVP_IMAGEROTATE_H
+
+#include "../common/util.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+// Signature shared by the rotation kernels: (source, bytes per pixel, source width, source height, destination).
+typedef void (ImageRotateFunc) (uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight,
+                                uint8_t* pDst);
+// Pointer form of the kernel signature, used for run-time dispatch.
+typedef ImageRotateFunc*		ImageRotateFuncPtr;
+// Kernels for 90, 180 and 270 degree rotation.
+ImageRotateFunc   ImageRotate90D_c;
+ImageRotateFunc   ImageRotate180D_c;
+ImageRotateFunc   ImageRotate270D_c;
+// Dispatch table holding one kernel pointer per supported rotation angle.
+typedef struct {
+  ImageRotateFuncPtr		pfImageRotate90D;
+  ImageRotateFuncPtr		pfImageRotate180D;
+  ImageRotateFuncPtr		pfImageRotate270D;
+} SImageRotateFuncs;
+// Image rotation strategy: rotates the planes of an SPixMap by 90, 180 or 270 degrees.
+class CImageRotating : public IStrategy {
+ public:
+  CImageRotating (int32_t iCpuFlag);
+  ~CImageRotating();
+  // iType is the rotation angle in degrees; returns RET_NOTSUPPORTED for other angles or pixel formats.
+  EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pDst);
+
+ private:
+  void InitImageRotateFuncs (SImageRotateFuncs& pf, int32_t iCpuFlag); // fills pf with the kernel pointers
+  EResult ProcessImageRotate (int32_t iType, uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight,
+                              uint8_t* pDst); // rotates a single plane
+
+ private:
+  SImageRotateFuncs m_pfRotateImage; // kernel dispatch table
+  int32_t          m_iCPUFlag;       // CPU feature flags given to the constructor
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- a/processing/src/imagerotate/imagerotatefuncs.cpp
+++ b/processing/src/imagerotate/imagerotatefuncs.cpp
@@ -1,75 +1,66 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *  image_rotate.c
- *
- *  Created on 11-2-21.
- *
- */
-
-#include "imagerotate.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-void ImageRotate90D_c( uint8_t *pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight, uint8_t *pDst)
-{
-	for (uint32_t j=0; j<iHeight; j++) 
-	{	
-		for (uint32_t i=0; i<iWidth; i++) 
-		{
-			for(uint32_t n = 0; n< uiBytesPerPixel; n++)				
-				pDst[(i*iHeight + iHeight-1-j)*uiBytesPerPixel + n] = pSrc[(iWidth*j+i)*uiBytesPerPixel+n];
-		}
-	}
-}
-void ImageRotate180D_c( uint8_t *pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight, uint8_t *pDst)
-{
-	for (uint32_t j=0; j<iHeight; j++) 
-	{	
-		for (uint32_t i=0; i<iWidth; i++) 
-		{
-			for(uint32_t n = 0; n< uiBytesPerPixel; n++)
-				pDst[((iHeight-1-j)*iWidth + iWidth-1-i)*uiBytesPerPixel + n] = pSrc[(iWidth*j+i)*uiBytesPerPixel+n];
-		}
-	}
-}
-void ImageRotate270D_c( uint8_t *pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight, uint8_t *pDst)
-{
-	for (uint32_t j=0; j<iWidth; j++) 
-	{	
-		for (uint32_t i=0; i<iHeight; i++) 
-		{
-			for(uint32_t n = 0; n< uiBytesPerPixel; n++)
-				pDst[((iWidth - 1-j)*iHeight + i)*uiBytesPerPixel + n] = pSrc[(iWidth*i+j)*uiBytesPerPixel+n];
-		}
-	}
-}
-WELSVP_NAMESPACE_END
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *  imagerotatefuncs.cpp
+ *
+ *  Created on 11-2-21.
+ *
+ */
+
+#include "imagerotate.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+void ImageRotate90D_c (uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight, uint8_t* pDst) {
+  for (uint32_t uiRow = 0; uiRow < iHeight; ++uiRow)      // source pixel (row, col) is copied to
+    for (uint32_t uiCol = 0; uiCol < iWidth; ++uiCol) {   // output (col, iHeight - 1 - row): clockwise turn
+      const uint32_t kuiFrom = (iWidth * uiRow + uiCol) * uiBytesPerPixel;
+      const uint32_t kuiTo = (uiCol * iHeight + iHeight - 1 - uiRow) * uiBytesPerPixel;
+      for (uint32_t uiByte = 0; uiByte < uiBytesPerPixel; ++uiByte) pDst[kuiTo + uiByte] = pSrc[kuiFrom + uiByte];
+    }
+}
+void ImageRotate180D_c (uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight, uint8_t* pDst) {
+  for (uint32_t uiRow = 0; uiRow < iHeight; ++uiRow)      // source pixel (row, col) is copied to
+    for (uint32_t uiCol = 0; uiCol < iWidth; ++uiCol) {   // output (iHeight - 1 - row, iWidth - 1 - col)
+      const uint32_t kuiFrom = (iWidth * uiRow + uiCol) * uiBytesPerPixel;
+      const uint32_t kuiTo = ((iHeight - 1 - uiRow) * iWidth + iWidth - 1 - uiCol) * uiBytesPerPixel;
+      for (uint32_t uiByte = 0; uiByte < uiBytesPerPixel; ++uiByte) pDst[kuiTo + uiByte] = pSrc[kuiFrom + uiByte];
+    }
+}
+void ImageRotate270D_c (uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight, uint8_t* pDst) {
+  for (uint32_t uiCol = 0; uiCol < iWidth; ++uiCol)        // source pixel (row, col) is copied to
+    for (uint32_t uiRow = 0; uiRow < iHeight; ++uiRow) {   // output (iWidth - 1 - col, row)
+      const uint32_t kuiFrom = (iWidth * uiRow + uiCol) * uiBytesPerPixel;
+      const uint32_t kuiTo = ((iWidth - 1 - uiCol) * iHeight + uiRow) * uiBytesPerPixel;
+      for (uint32_t uiByte = 0; uiByte < uiBytesPerPixel; ++uiByte) pDst[kuiTo + uiByte] = pSrc[kuiFrom + uiByte];
+    }
+}
+WELSVP_NAMESPACE_END
--- a/processing/src/scenechangedetection/SceneChangeDetection.cpp
+++ b/processing/src/scenechangedetection/SceneChangeDetection.cpp
@@ -1,146 +1,136 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "SceneChangeDetection.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-#define HIGH_MOTION_BLOCK_THRESHOLD 320
-#define SCENE_CHANGE_MOTION_RATIO	0.85f
-
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-CSceneChangeDetection::CSceneChangeDetection(int32_t iCpuFlag)
-{
-	m_iCpuFlag = iCpuFlag;
-	m_eMethod   = METHOD_SCENE_CHANGE_DETECTION;
-	m_pfSad   = NULL;
-	WelsMemset( &m_sSceneChangeParam, 0, sizeof(m_sSceneChangeParam) );
-	InitSadFuncs(m_pfSad, m_iCpuFlag);
-}
-
-CSceneChangeDetection::~CSceneChangeDetection()
-{	
-}
-
-EResult CSceneChangeDetection::Process(int32_t iType, SPixMap *pSrcPixMap, SPixMap *pRefPixMap)
-{
-	EResult eReturn = RET_INVALIDPARAM;
-
-	int32_t iWidth                  = pSrcPixMap->sRect.iRectWidth;
-	int32_t iHeight                 = pSrcPixMap->sRect.iRectHeight;	
-	int32_t iBlock8x8Width      = iWidth  >> 3;
-	int32_t iBlock8x8Height	 = iHeight >> 3;
-	int32_t iBlock8x8Num       = iBlock8x8Width * iBlock8x8Height;
-	int32_t iSceneChangeThreshold = WelsStaticCast(int32_t, SCENE_CHANGE_MOTION_RATIO * iBlock8x8Num + 0.5f + PESN);
-
-	int32_t iBlockSad = 0;
-	int32_t iMotionBlockNum = 0;
-
-	uint8_t *pRefY = NULL, *pCurY = NULL;
-	int32_t iRefStride = 0, iCurStride = 0;
-	int32_t iRefRowStride = 0, iCurRowStride = 0;
-
-	uint8_t *pRefTmp = NULL, *pCurTmp = NULL;
-
-	pRefY = (uint8_t *)pRefPixMap->pPixel[0];
-	pCurY = (uint8_t *)pSrcPixMap->pPixel[0];
-
-	iRefStride  = pRefPixMap->iStride[0];
-	iCurStride  = pSrcPixMap->iStride[0];
-
-	iRefRowStride  = pRefPixMap->iStride[0] << 3;
-	iCurRowStride  = pSrcPixMap->iStride[0] << 3;
-
-	m_sSceneChangeParam.bSceneChangeFlag = 0;
-
-	for (int32_t j = 0; j < iBlock8x8Height; j ++ ) 
-	{
-		pRefTmp	= pRefY;
-		pCurTmp 	= pCurY;
-
-		for (int32_t i = 0; i < iBlock8x8Width; i++ )
-		{
-			iBlockSad = m_pfSad(pRefTmp, iRefStride, pCurTmp, iCurStride);
-
-			iMotionBlockNum += (iBlockSad > HIGH_MOTION_BLOCK_THRESHOLD);
-			
-			pRefTmp += 8;
-			pCurTmp += 8;
-		}
-
-		pRefY += iRefRowStride;
-		pCurY += iCurRowStride;
-	}
-
-	if ( iMotionBlockNum >= iSceneChangeThreshold ) 
-	{ 
-		m_sSceneChangeParam.bSceneChangeFlag = 1;
-	}
-
-	eReturn = RET_SUCCESS;
-
-	return eReturn;
-}
-
-
-EResult CSceneChangeDetection::Get(int32_t iType, void *pParam)
-{
-	if (pParam == NULL)
-	{
-		return RET_INVALIDPARAM;
-	}
-
-	 *(SSceneChangeResult *)pParam = m_sSceneChangeParam;
-
-	return RET_SUCCESS;
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-
-void CSceneChangeDetection::InitSadFuncs(SadFuncPtr &pfSad,  int32_t iCpuFlag)
-{
-	pfSad = WelsSampleSad8x8_c;
-
-#ifdef X86_ASM	
-	if ( iCpuFlag & WELS_CPU_SSE2 )
-	{
-		pfSad = WelsSampleSad8x8_sse21;
-	}
-#endif
-}
-
-
-WELSVP_NAMESPACE_END
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "SceneChangeDetection.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+#define HIGH_MOTION_BLOCK_THRESHOLD 320
+#define SCENE_CHANGE_MOTION_RATIO	0.85f
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+CSceneChangeDetection::CSceneChangeDetection (int32_t iCpuFlag) { // iCpuFlag: CPU feature bits, tested against WELS_CPU_SSE2 in InitSadFuncs
+  m_iCpuFlag = iCpuFlag;
+  m_eMethod   = METHOD_SCENE_CHANGE_DETECTION;
+  m_pfSad   = NULL;
+  WelsMemset (&m_sSceneChangeParam, 0, sizeof (m_sSceneChangeParam)); // start from a zeroed result
+  InitSadFuncs (m_pfSad, m_iCpuFlag); // replaces the NULL assigned above with a real SAD routine
+}
+
+CSceneChangeDetection::~CSceneChangeDetection() { // empty body: no cleanup is performed here
+}
+
+EResult CSceneChangeDetection::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) { // sets bSceneChangeFlag when enough 8x8 blocks differ strongly between the two pixmaps
+  EResult eReturn = RET_INVALIDPARAM; // always overwritten with RET_SUCCESS before returning; iType is not used in this body
+
+  int32_t iWidth                  = pSrcPixMap->sRect.iRectWidth; // NOTE(review): pSrcPixMap and pRefPixMap are dereferenced without a NULL check
+  int32_t iHeight                 = pSrcPixMap->sRect.iRectHeight; // only the source rectangle is consulted; the reference size is not checked here
+  int32_t iBlock8x8Width      = iWidth  >> 3; // whole 8x8 blocks per row; a partial block at the right edge is skipped
+  int32_t iBlock8x8Height	 = iHeight >> 3; // whole rows of 8x8 blocks; a partial row at the bottom is skipped
+  int32_t iBlock8x8Num       = iBlock8x8Width * iBlock8x8Height;
+  int32_t iSceneChangeThreshold = WelsStaticCast (int32_t, SCENE_CHANGE_MOTION_RATIO * iBlock8x8Num + 0.5f + PESN); // number of high-motion blocks needed; PESN is defined elsewhere (presumably a small epsilon - confirm)
+
+  int32_t iBlockSad = 0;
+  int32_t iMotionBlockNum = 0; // running count of blocks whose SAD exceeds HIGH_MOTION_BLOCK_THRESHOLD
+
+  uint8_t* pRefY = NULL, *pCurY = NULL; // start of the current block row in plane 0 of each pixmap
+  int32_t iRefStride = 0, iCurStride = 0;
+  int32_t iRefRowStride = 0, iCurRowStride = 0;
+
+  uint8_t* pRefTmp = NULL, *pCurTmp = NULL; // cursor that walks across one block row
+
+  pRefY = (uint8_t*)pRefPixMap->pPixel[0];
+  pCurY = (uint8_t*)pSrcPixMap->pPixel[0];
+
+  iRefStride  = pRefPixMap->iStride[0];
+  iCurStride  = pSrcPixMap->iStride[0];
+
+  iRefRowStride  = pRefPixMap->iStride[0] << 3; // eight times the plane stride: the step from one block row to the next
+  iCurRowStride  = pSrcPixMap->iStride[0] << 3;
+
+  m_sSceneChangeParam.bSceneChangeFlag = 0; // cleared on every call, set again below only if the threshold is reached
+
+  for (int32_t j = 0; j < iBlock8x8Height; j ++) { // one iteration per row of 8x8 blocks
+    pRefTmp	= pRefY;
+    pCurTmp 	= pCurY;
+
+    for (int32_t i = 0; i < iBlock8x8Width; i++) { // one iteration per 8x8 block in the row
+      iBlockSad = m_pfSad (pRefTmp, iRefStride, pCurTmp, iCurStride); // SAD routine chosen by InitSadFuncs in the constructor
+
+      iMotionBlockNum += (iBlockSad > HIGH_MOTION_BLOCK_THRESHOLD); // the comparison yields 0 or 1, so this counts qualifying blocks
+
+      pRefTmp += 8; // next block to the right
+      pCurTmp += 8;
+    }
+
+    pRefY += iRefRowStride; // next block row
+    pCurY += iCurRowStride;
+  }
+
+  if (iMotionBlockNum >= iSceneChangeThreshold) { // roughly SCENE_CHANGE_MOTION_RATIO (0.85) of all blocks, after rounding
+    m_sSceneChangeParam.bSceneChangeFlag = 1;
+  }
+
+  eReturn = RET_SUCCESS;
+
+  return eReturn;
+}
+
+
+EResult CSceneChangeDetection::Get (int32_t iType, void* pParam) { // copies the stored result into caller-provided storage; iType is not used in this body
+  if (pParam == NULL) { // the only validation performed
+    return RET_INVALIDPARAM;
+  }
+
+  * (SSceneChangeResult*)pParam = m_sSceneChangeParam; // unchecked cast: the caller must pass a pointer to an SSceneChangeResult
+
+  return RET_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+
+void CSceneChangeDetection::InitSadFuncs (SadFuncPtr& pfSad,  int32_t iCpuFlag) { // writes the selected 8x8 SAD routine into pfSad
+  pfSad = WelsSampleSad8x8_c; // default, used whenever the branch below does not override it
+
+#ifdef X86_ASM
+  if (iCpuFlag & WELS_CPU_SSE2) { // this check exists only in builds that define X86_ASM
+    pfSad = WelsSampleSad8x8_sse21; // selected when the WELS_CPU_SSE2 bit is set in iCpuFlag
+  }
+#endif
+}
+
+
+WELSVP_NAMESPACE_END
--- a/processing/src/scenechangedetection/SceneChangeDetection.h
+++ b/processing/src/scenechangedetection/SceneChangeDetection.h
@@ -1,73 +1,72 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
-* \file	        :  SceneChangeDetection.h
-*
-* \brief	    :  scene change detection class of wels video processor class
-*
-* \date         :  2011/03/14
-*
-* \description  :  1. rewrite the package code of scene change detection class  
-*
-*************************************************************************************
-*/
-
-#ifndef _WELSVP_SCENECHANGEDETECTION_H
-#define _WELSVP_SCENECHANGEDETECTION_H
-
-#include "../common/util.h"
-#include "../common/memory.h"
-#include "../common/WelsFrameWork.h"
-#include "../../interface/IWelsVP.h"
-#include "SceneChangeDetectionCommon.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-class CSceneChangeDetection : public IStrategy
-{			  
-public:
-	CSceneChangeDetection(int32_t iCpuFlag);
-	~CSceneChangeDetection();
-
-	EResult Process(int32_t iType, SPixMap *pSrc, SPixMap *pRef);
-	EResult Get(int32_t iType, void *pParam);
-
-private:
-	void InitSadFuncs(SadFuncPtr &pfSadFunc, int32_t iCpuFlag);
-
-private:
-	SadFuncPtr m_pfSad;
-	int32_t    m_iCpuFlag;
-	SSceneChangeResult m_sSceneChangeParam;
-};	
-
-WELSVP_NAMESPACE_END
-
-#endif
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+* \file	        :  SceneChangeDetection.h
+*
+* \brief	    :  scene change detection class of wels video processor class
+*
+* \date         :  2011/03/14
+*
+* \description  :  1. rewrite the package code of scene change detection class
+*
+*************************************************************************************
+*/
+
+#ifndef _WELSVP_SCENECHANGEDETECTION_H
+#define _WELSVP_SCENECHANGEDETECTION_H
+
+#include "../common/util.h"
+#include "../common/memory.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+#include "SceneChangeDetectionCommon.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+class CSceneChangeDetection : public IStrategy { // scene change detection strategy; holds one result and the SAD routine used to compute it
+ public:
+  CSceneChangeDetection (int32_t iCpuFlag); // iCpuFlag: CPU feature bits, stored and passed on to InitSadFuncs
+  ~CSceneChangeDetection();
+
+  EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pRef); // takes a source and a reference pixmap; see the .cpp for the comparison performed
+  EResult Get (int32_t iType, void* pParam); // pParam receives an SSceneChangeResult (per the cast in the .cpp)
+
+ private:
+  void InitSadFuncs (SadFuncPtr& pfSadFunc, int32_t iCpuFlag); // assigns a SAD routine to pfSadFunc
+
+ private:
+  SadFuncPtr m_pfSad; // SAD routine in use
+  int32_t    m_iCpuFlag; // value given to the constructor
+  SSceneChangeResult m_sSceneChangeParam; // result handed out by Get
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- a/processing/src/scenechangedetection/SceneChangeDetectionCommon.cpp
+++ b/processing/src/scenechangedetection/SceneChangeDetectionCommon.cpp
@@ -1,62 +1,60 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "SceneChangeDetectionCommon.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-
-int32_t WelsSampleSad8x8_c( uint8_t * pSrcY, int32_t iSrcStrideY, uint8_t * pRefY, int32_t iRefStrideY )
-{
-	int32_t iSadSum = 0;
-	uint8_t* pSrcA = pSrcY;
-	uint8_t* pSrcB = pRefY;
-	for (int32_t i = 0; i < 8; i++ )
-	{
-		iSadSum += WELS_ABS( ( pSrcA[0] - pSrcB[0] ) );
-		iSadSum += WELS_ABS( ( pSrcA[1] - pSrcB[1] ) );
-		iSadSum += WELS_ABS( ( pSrcA[2] - pSrcB[2] ) );
-		iSadSum += WELS_ABS( ( pSrcA[3] - pSrcB[3] ) );
-		iSadSum += WELS_ABS( ( pSrcA[4] - pSrcB[4] ) );
-		iSadSum += WELS_ABS( ( pSrcA[5] - pSrcB[5] ) );
-		iSadSum += WELS_ABS( ( pSrcA[6] - pSrcB[6] ) );
-		iSadSum += WELS_ABS( ( pSrcA[7] - pSrcB[7] ) );
-
-		pSrcA += iSrcStrideY;
-		pSrcB += iRefStrideY;
-	}
-
-	return iSadSum;
-} 
-
-WELSVP_NAMESPACE_END
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "SceneChangeDetectionCommon.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+
+int32_t WelsSampleSad8x8_c (uint8_t* pSrcY, int32_t iSrcStrideY, uint8_t* pRefY, int32_t iRefStrideY) { // accumulates WELS_ABS of byte differences over an 8x8 block
+  int32_t iSadSum = 0;
+  uint8_t* pSrcA = pSrcY; // row cursor in the first block
+  uint8_t* pSrcB = pRefY; // row cursor in the second block
+  for (int32_t i = 0; i < 8; i++) { // eight rows; the eight columns of each row are written out by hand below
+    iSadSum += WELS_ABS ((pSrcA[0] - pSrcB[0])); // uint8_t operands promote to int, so the difference may be negative before WELS_ABS (macro defined elsewhere)
+    iSadSum += WELS_ABS ((pSrcA[1] - pSrcB[1]));
+    iSadSum += WELS_ABS ((pSrcA[2] - pSrcB[2]));
+    iSadSum += WELS_ABS ((pSrcA[3] - pSrcB[3]));
+    iSadSum += WELS_ABS ((pSrcA[4] - pSrcB[4]));
+    iSadSum += WELS_ABS ((pSrcA[5] - pSrcB[5]));
+    iSadSum += WELS_ABS ((pSrcA[6] - pSrcB[6]));
+    iSadSum += WELS_ABS ((pSrcA[7] - pSrcB[7]));
+
+    pSrcA += iSrcStrideY; // each cursor advances by its own stride to reach the next row
+    pSrcB += iRefStrideY;
+  }
+
+  return iSadSum;
+}
+
+WELSVP_NAMESPACE_END
--- a/processing/src/scenechangedetection/SceneChangeDetectionCommon.h
+++ b/processing/src/scenechangedetection/SceneChangeDetectionCommon.h
@@ -1,65 +1,65 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	        :  SceneChangeDetectionCommon.h
- *
- * \brief	    :  scene change detection class of wels video processor class
- *
- * \date         :  2011/03/14
- *
- * \description  :  1. rewrite the package code of scene change detection class  
- *
- */
-
-#ifndef _WELSVP_SCENECHANGEDETECTIONCOMMON_H
-#define _WELSVP_SCENECHANGEDETECTIONCOMMON_H
-
-#include "../common/util.h"
-#include "../common/memory.h"
-#include "../common/WelsFrameWork.h"
-#include "../../interface/IWelsVP.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-typedef  int32_t (SadFunc) ( uint8_t * pSrcY, int32_t iSrcStrideY, uint8_t * pRefY, int32_t iRefStrideY );
-
-typedef SadFunc  * SadFuncPtr;
-
-SadFunc      WelsSampleSad8x8_c;
-
-#ifdef X86_ASM
-WELSVP_EXTERN_C_BEGIN
-SadFunc      WelsSampleSad8x8_sse21;
-WELSVP_EXTERN_C_END
-#endif
-
-WELSVP_NAMESPACE_END
-
-#endif
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	        :  SceneChangeDetectionCommon.h
+ *
+ * \brief	    :  scene change detection class of wels video processor class
+ *
+ * \date         :  2011/03/14
+ *
+ * \description  :  1. rewrite the package code of scene change detection class
+ *
+ */
+
+#ifndef _WELSVP_SCENECHANGEDETECTIONCOMMON_H
+#define _WELSVP_SCENECHANGEDETECTIONCOMMON_H
+
+#include "../common/util.h"
+#include "../common/memory.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+typedef  int32_t (SadFunc) (uint8_t* pSrcY, int32_t iSrcStrideY, uint8_t* pRefY, int32_t iRefStrideY); // function type: two byte pointers, each with its own stride, returning an int32_t
+
+typedef SadFunc*   SadFuncPtr; // pointer to a function of type SadFunc
+
+SadFunc      WelsSampleSad8x8_c; // function declaration written through the SadFunc typedef
+
+#ifdef X86_ASM
+WELSVP_EXTERN_C_BEGIN
+SadFunc      WelsSampleSad8x8_sse21; // declared only when X86_ASM is defined; NOTE(review): presumably implemented in assembly - confirm
+WELSVP_EXTERN_C_END
+#endif
+
+WELSVP_NAMESPACE_END
+
+#endif
--- a/processing/src/testbed/WelsVideoProcessor.cpp
+++ b/processing/src/testbed/WelsVideoProcessor.cpp
@@ -1,464 +1,382 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-// WelsVideoProcessor.cpp : Defines the entry point for the console application.
-//
-
-#include <tchar.h>
-#include "stdafx.h"
-#include "wels_process.h"
-
-//////////////////////////////////////////////////////////////////////////
-typedef struct
-{
-	FILE    *srcfile;
-	FILE    *dstfile;
-	vPixMap  src;
-	vPixMap  dst;
-	vMethods methods[vMethods_Mask];
-} VpConfigure;
-//////////////////////////////////////////////////////////////////////////
-
-void PrintHelp(TCHAR *strAppName, TCHAR *strError)
-{
-	if (strError)
-	{
-		_tprintf(_T("Error: %s\n"), strError);
-	} 
-	else 
-	{
-		_tprintf(_T("Welsvp Sample Console\n"));
-	}
-
-	_tprintf(_T("Usage1: %s [Options] -i InputFile -o OutputFile -w 640 -h 480\n"), strAppName);
-	_tprintf(_T("Options: \n"));
-
-	_tprintf(_T("   [-sx  x]       - cropX  of src video (def: 0)\n"));
-	_tprintf(_T("   [-sy  y]       - cropY  of src video (def: 0)\n"));
-	_tprintf(_T("   [-sw  width]   - cropW  of src video (def: width)\n"));
-	_tprintf(_T("   [-sh  height]  - cropH  of src video (def: height)\n"));
-	_tprintf(_T("   [-scc format]  - format (FourCC) of src video (def: support yv12|yuy2|rgb3|rgb4)\n"));
-
-	_tprintf(_T("   [-dx  x]       - cropX  of dst video (def: 0)\n"));
-	_tprintf(_T("   [-dy  y]       - cropY  of dst video (def: 0)\n"));
-	_tprintf(_T("   [-dw  width]   - cropW  of dst video (def: width)\n"));
-	_tprintf(_T("   [-dh  height]  - cropH  of dst video (def: height)\n"));
-	_tprintf(_T("   [-dcc format]  - format (FourCC) of dst video (def: nv12. support nv12|yuy2)\n"));
-
-	_tprintf(_T("   Video Processing Algorithms\n"));
-	_tprintf(_T("   [-vaa]         - enable video analysis algorithm \n"));
-	_tprintf(_T("   [-bgd]         - enable background detection algorithm \n"));
-	_tprintf(_T("   [-scd]         - enable scene change detection algorithm \n"));
-	_tprintf(_T("   [-denoise]     - enable denoise algorithm \n"));
-	_tprintf(_T("   [-downsample]  - enable downsample algorithm \n"));
-
-	_tprintf(_T("   [-n frames]    - number of frames to VP process\n\n"));
-	_tprintf(_T("\n"));
-
-	_tprintf(_T("Usage2: %s -sw 640 -sh 480 -scc rgb3 -dw 320 -dh 240 -dcc i420 -denoise -vaa -i in.rgb -o out.yuv\n"), strAppName);
-	_tprintf(_T("\n"));
-} 
-
-vVideoFormat Str2FourCC( TCHAR* strInput )
-{
-	vVideoFormat format = vVideoFormat_I420; // as default
-
-	if ( 0 == _tcscmp(strInput, _T("yv12")) ) 
-	{
-		format = vVideoFormat_YV12;
-	} 
-	else if ( 0 == _tcscmp(strInput, _T("i420")) ) 
-	{
-		format = vVideoFormat_I420;
-	} 
-	else if ( 0 == _tcscmp(strInput, _T("rgb24")) ) 
-	{
-		format = vVideoFormat_RGB24;
-	} 
-	else if ( 0 == _tcscmp(strInput, _T("rgb32")) ) 
-	{
-		format = vVideoFormat_RGB32;
-	} 
-	else if ( 0 == _tcscmp(strInput, _T("yuy2")) )
-	{
-		format = vVideoFormat_YUY2;
-	} 
-	else if ( 0 == _tcscmp(strInput, _T("nv12")) ) 
-	{
-		format = vVideoFormat_NV12;
-	} 
-
-	return format;
-}
-
-int ReadFile(vPixMap &pixmap, FILE *fp)
-{
-	int ret = 0;
-
-	int size = pixmap.Rect.width * pixmap.Rect.height;
-	switch (pixmap.eFormat)
-	{
-	case vVideoFormat_I420:
-	case vVideoFormat_YV12:
-		{
-			if ( fread(pixmap.pPixel[0], pixmap.nSizeInBits/8, (3*size)>>1, fp) <= 0 )
-				ret = 1;		
-		}
-		break;
-	case vVideoFormat_YUY2:
-		{
-			if ( fread(pixmap.pPixel[0], pixmap.nSizeInBits/8, 2*size, fp) <= 0 )
-				ret = 1;
-		}
-		break;
-	case vVideoFormat_RGB24:
-		{
-			if ( fread(pixmap.pPixel[0], pixmap.nSizeInBits/8, 3*size, fp) <= 0 )
-				ret = 1;
-		}
-		break;
-	case vVideoFormat_RGB32:
-		{
-			if ( fread(pixmap.pPixel[0], pixmap.nSizeInBits/8, 4*size, fp) <= 0 )
-				ret = 1;
-		}
-		break;
-	default:
-		ret = 1;
-		break;
-	}
-	return ret;
-}
-
-int WriteFile(vPixMap &pixmap, FILE *fp)
-{
-	int ret = 0;
-	int size = pixmap.Rect.width * pixmap.Rect.height;
-	switch (pixmap.eFormat)
-	{
-	case vVideoFormat_I420:
-	case vVideoFormat_YV12:
-		{
-			if ( fwrite(pixmap.pPixel[0], pixmap.nSizeInBits/8, (3*size)>>1, fp) <= 0 )
-				ret = 1;		
-		}
-		break;
-	case vVideoFormat_YUY2:
-		{
-			if ( fwrite(pixmap.pPixel[0], pixmap.nSizeInBits/8, 2*size, fp) <= 0 )
-				ret = 1;
-		}
-		break;
-	case vVideoFormat_RGB24:
-		{
-			if ( fwrite(pixmap.pPixel[0], pixmap.nSizeInBits/8, 3*size, fp) <= 0 )
-				ret = 1;
-		}
-		break;
-	case vVideoFormat_RGB32:
-		{
-			if ( fwrite(pixmap.pPixel[0], pixmap.nSizeInBits/8, 4*size, fp) <= 0 )
-				ret = 1;
-		}
-		break;
-	default:
-		ret = 1;
-		break;
-	}
-	return ret;
-}
-
-
-int AllocPixMap(vPixMap &pixmap)
-{
-	pixmap.nSizeInBits = sizeof(unsigned char) * 8;
-
-	switch (pixmap.eFormat)
-	{
-	case vVideoFormat_I420:
-	case vVideoFormat_YV12:
-		{
-			pixmap.nStride[0]  = pixmap.Rect.width;
-			pixmap.nStride[1]  = pixmap.nStride[2]  = pixmap.Rect.width / 2;
-			pixmap.pPixel[0]   = new void *[pixmap.nStride[0] * pixmap.Rect.height * pixmap.nSizeInBits / 8 * 3 / 2];
-			pixmap.pPixel[1]   = (unsigned char *)pixmap.pPixel[0] + pixmap.nStride[0] * pixmap.Rect.height * pixmap.nSizeInBits / 8;
-			pixmap.pPixel[2]   = (unsigned char *)pixmap.pPixel[0] + pixmap.nStride[0] * pixmap.Rect.height * pixmap.nSizeInBits / 8 * 5 / 4;
-		}
-		break;
-
-	case vVideoFormat_YUY2:
-		{
-			pixmap.nStride[0]  = pixmap.nStride[1]  = pixmap.nStride[2]  = pixmap.Rect.width * 2;
-			pixmap.pPixel[0]   = new void *[pixmap.nStride[0] * pixmap.Rect.height * pixmap.nSizeInBits / 8 * 2];
-			pixmap.pPixel[1]   = pixmap.pPixel[2] = NULL;
-		}
-		break;
-
-	case vVideoFormat_RGB24:
-		{
-			pixmap.nStride[0]  = pixmap.nStride[1]  = pixmap.nStride[2]  = pixmap.Rect.width * 3;
-			pixmap.pPixel[0]   = new void *[pixmap.nStride[0] * pixmap.Rect.height * pixmap.nSizeInBits / 8 * 3];
-			pixmap.pPixel[1]   = pixmap.pPixel[2] = NULL;
-		}
-		break;
-
-	case vVideoFormat_RGB32:
-		{
-			pixmap.nStride[0]  = pixmap.nStride[1]  = pixmap.nStride[2]  = pixmap.Rect.width * 4;
-			pixmap.pPixel[0]   = new void *[pixmap.nStride[0] * pixmap.Rect.height * pixmap.nSizeInBits / 8 * 4];
-			pixmap.pPixel[1]   = pixmap.pPixel[2] = NULL;
-		}
-		break;
-
-	default:
-		return 1;
-	}
-	
-	return (pixmap.pPixel[0]) ? 0 : 1;
-}
-
-void FreePixMap(vPixMap &pixmap)
-{
-	if (pixmap.pPixel[0])
-	{
-		free(pixmap.pPixel[0]);
-		pixmap.pPixel[0] = pixmap.pPixel[1] = pixmap.pPixel[2] = NULL;
-	}
-}
-
-int InitResource(TCHAR *strAppName, VpConfigure &cfg)
-{
-	if (0 == cfg.srcfile) 
-	{
-		PrintHelp(strAppName, _T("Source file can not found!\n"));
-		goto exit;
-	};
-
-	if (0 == cfg.dstfile) 
-	{
-		PrintHelp(strAppName, _T("Destination file name not found"));
-		goto exit;
-	};
-
-	if (cfg.dst.Rect.width == 0)
-		cfg.dst.Rect.width = cfg.src.Rect.width;
-	if (cfg.dst.Rect.height == 0)
-		cfg.dst.Rect.height = cfg.src.Rect.height;
-
-	cfg.methods[vMethods_ColorSpaceConvert] = vMethods_ColorSpaceConvert;
-
-	if (AllocPixMap(cfg.src))
-		goto exit;
-
-	if (AllocPixMap(cfg.dst))
-		goto exit;
-
-	return 0;
-
-exit:
-	FreePixMap(cfg.src);
-	FreePixMap(cfg.dst);
-	return 1;	
-}
-
-int ParseCommond(TCHAR* strInput[], int nArgNum, VpConfigure &cfg)
-{
-	if (nArgNum < 9)
-	{
-		PrintHelp(strInput[0], _T("please specify all necessary parameters!"));
-		return 1;
-	}
-
-	int width = 0, height = 0;
-	for (int i = 1; i < nArgNum; i++ ) 
-	{
-		if (strInput[i])
-		{ 
-			if ( 0 == _tcscmp(strInput[i], _T("-i")) ) 
-			{          
-				i++;
-				_tfopen_s(&cfg.srcfile, strInput[i], _T("rb"));
-			}
-			else if (0 == _tcscmp(strInput[i], _T("-o")))
-			{
-				i++;
-				_tfopen_s(&cfg.dstfile, strInput[i], _T("wb"));
-			}
-			else if (0 == _tcscmp(strInput[i], _T("-w")))
-			{
-				i++;
-				_stscanf_s(strInput[i], _T("%d"), &width);
-			}
-			else if (0 == _tcscmp(strInput[i], _T("-h")))
-			{
-				i++;
-				_stscanf_s(strInput[i], _T("%d"), &height);
-			}
-            //-----------------------------------------------------------------------------------
-			else if (0 == _tcscmp(strInput[i], _T("-sx")))
-			{
-				i++;
-				_stscanf_s(strInput[i], _T("%hd"), &cfg.src.Rect.top);
-			}
-			else if (0 == _tcscmp(strInput[i], _T("-sy")))
-			{
-				i++;
-				_stscanf_s(strInput[i], _T("%hd"), &cfg.src.Rect.left);
-			}
-			else if (0 == _tcscmp(strInput[i], _T("-sw")))
-			{
-				i++;
-				TCHAR *a = strInput[i];
-				_stscanf_s(strInput[i], _T("%hd"), &cfg.src.Rect.width);
-			}
-			else if (0 == _tcscmp(strInput[i], _T("-sh")))
-			{
-				i++;
-				_stscanf_s(strInput[i], _T("%hd"), &cfg.src.Rect.height);
-			}
-			else if (0 == _tcscmp(strInput[i], _T("-scc")))
-			{
-				i++;
-				cfg.src.eFormat = Str2FourCC( strInput[i] );
-			}
-            //-----------------------------------------------------------------------------------
-			else if (0 == _tcscmp(strInput[i], _T("-dx")))
-			{
-				i++;
-				_stscanf_s(strInput[i], _T("%hd"), &cfg.dst.Rect.top);
-			}
-			else if (0 == _tcscmp(strInput[i], _T("-dy")))
-			{
-				i++;
-				_stscanf_s(strInput[i], _T("%hd"), &cfg.dst.Rect.left);
-			}
-			else if (0 == _tcscmp(strInput[i], _T("-dw")))
-			{
-				i++;
-				_stscanf_s(strInput[i], _T("%hd"), &cfg.dst.Rect.width);
-			}
-			else if (0 == _tcscmp(strInput[i], _T("-dh")))
-			{
-				i++;
-				_stscanf_s(strInput[i], _T("%hd"), &cfg.dst.Rect.height);
-			}
-			else if (0 == _tcscmp(strInput[i], _T("-dcc")))
-			{
-				i++;
-				cfg.dst.eFormat = Str2FourCC( strInput[i] );
-			}
-			//-----------------------------------------------------------------------------------
-			else if (0 == _tcscmp(strInput[i], _T("-denoise")))
-			{
-				cfg.methods[vMethods_Denoise] = vMethods_Denoise;
-			}	
-			else if (0 == _tcscmp(strInput[i], _T("-scd")))
-			{
-				cfg.methods[vMethods_SceneChangeDetection] = vMethods_SceneChangeDetection;
-			}
-			else if (0 == _tcscmp(strInput[i], _T("-downsample")))
-			{
-			}
-			else if (0 == _tcscmp(strInput[i], _T("-vaa")))
-			{
-			}
-			else if (0 == _tcscmp(strInput[i], _T("-bgd")))
-			{
-			}
-			else if (0 == _tcscmp(strInput[i], _T("-aq")))
-			{
-			}
-		}
-	}	
-
-	if (cfg.src.Rect.width == 0)  cfg.src.Rect.width  = width;
-	if (cfg.src.Rect.height == 0) cfg.src.Rect.height = height;
-	if (cfg.dst.Rect.width == 0)  cfg.dst.Rect.width  = width;
-	if (cfg.dst.Rect.height == 0) cfg.dst.Rect.height = height;
-
-	return InitResource(strInput[0], cfg);
-}
-
-int _tmain(int argc, _TCHAR* argv[])
-{
-	int   ret           = 0;
-	VpConfigure cfg     = {0};
-	IWelsVpPlugin *pVpp = NULL;
-
-	ret = ParseCommond(argv, argc, cfg);
-	if (ret)
-		goto exit;
-
-	pVpp = new IWelsVpPlugin(ret);
-	if (pVpp && ret == 0)
-	{
-		vResult vret = vRet_Success;
-		while (1)
-		{
-			if (feof(cfg.srcfile))
-				break;
-
-			if (ReadFile(cfg.src, cfg.srcfile))
-				break;
-
-			vret = pVpp->Process(cfg.methods[vMethods_ColorSpaceConvert], &cfg.src, &cfg.dst);
-			if (vret)
-				break;
-
-			vret = pVpp->Process(cfg.methods[vMethods_Denoise], &cfg.dst, NULL);
-			if (vret)
-				break;
-
-			if (WriteFile(cfg.dst, cfg.dstfile))
-				break;
-		}		
-	}
-
-exit:
-
-	if (pVpp)
-	{
-		delete pVpp;
-		pVpp = NULL;
-	}
-
-	if (cfg.srcfile)
-		fclose(cfg.srcfile);
-	if (cfg.dstfile)
-		fclose(cfg.dstfile);
-
-	FreePixMap(cfg.src);
-	FreePixMap(cfg.dst);	
-
-	return 0;
-}
-
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+// WelsVideoProcessor.cpp : Defines the entry point for the console application.
+//
+
+#include <tchar.h>
+#include "stdafx.h"
+#include "wels_process.h"
+
+//////////////////////////////////////////////////////////////////////////
+// Run-time state of the sample tool: the two file streams, the source and
+// destination picture buffers, and the table of processing methods to run.
+typedef struct {
+  FILE*    srcfile;                 // input stream, opened "rb" by the -i option
+  FILE*    dstfile;                 // output stream, opened "wb" by the -o option
+  vPixMap  src;                     // source picture descriptor and buffer
+  vPixMap  dst;                     // destination picture descriptor and buffer
+  vMethods methods[vMethods_Mask];  // slot [m] is set to m when method m is enabled
+} VpConfigure;
+//////////////////////////////////////////////////////////////////////////
+
+/*
+ * Print the usage text of the sample console.
+ *
+ * strAppName  program name substituted into the two usage lines
+ * strError    optional message; when non-NULL it is printed as "Error: ..."
+ *             instead of the banner line
+ *
+ * The format names shown here are the ones Str2FourCC() actually recognizes
+ * ("rgb24"/"rgb32"); any other spelling silently selects I420.
+ * NOTE(review): the text still advertises "-n frames" and an nv12 destination
+ * default, neither of which is handled by the code visible in this file.
+ */
+void PrintHelp (TCHAR* strAppName, TCHAR* strError) {
+  if (strError) {
+    _tprintf (_T ("Error: %s\n"), strError);
+  } else {
+    _tprintf (_T ("Welsvp Sample Console\n"));
+  }
+
+  _tprintf (_T ("Usage1: %s [Options] -i InputFile -o OutputFile -w 640 -h 480\n"), strAppName);
+  _tprintf (_T ("Options: \n"));
+
+  _tprintf (_T ("   [-sx  x]       - cropX  of src video (def: 0)\n"));
+  _tprintf (_T ("   [-sy  y]       - cropY  of src video (def: 0)\n"));
+  _tprintf (_T ("   [-sw  width]   - cropW  of src video (def: width)\n"));
+  _tprintf (_T ("   [-sh  height]  - cropH  of src video (def: height)\n"));
+  _tprintf (_T ("   [-scc format]  - format (FourCC) of src video (def: support yv12|yuy2|rgb24|rgb32)\n"));
+
+  _tprintf (_T ("   [-dx  x]       - cropX  of dst video (def: 0)\n"));
+  _tprintf (_T ("   [-dy  y]       - cropY  of dst video (def: 0)\n"));
+  _tprintf (_T ("   [-dw  width]   - cropW  of dst video (def: width)\n"));
+  _tprintf (_T ("   [-dh  height]  - cropH  of dst video (def: height)\n"));
+  _tprintf (_T ("   [-dcc format]  - format (FourCC) of dst video (def: nv12. support nv12|yuy2)\n"));
+
+  _tprintf (_T ("   Video Processing Algorithms\n"));
+  _tprintf (_T ("   [-vaa]         - enable video analysis algorithm \n"));
+  _tprintf (_T ("   [-bgd]         - enable background detection algorithm \n"));
+  _tprintf (_T ("   [-scd]         - enable scene change detection algorithm \n"));
+  _tprintf (_T ("   [-denoise]     - enable denoise algorithm \n"));
+  _tprintf (_T ("   [-downsample]  - enable downsample algorithm \n"));
+
+  _tprintf (_T ("   [-n frames]    - number of frames to VP process\n\n"));
+  _tprintf (_T ("\n"));
+
+  _tprintf (_T ("Usage2: %s -sw 640 -sh 480 -scc rgb24 -dw 320 -dh 240 -dcc i420 -denoise -vaa -i in.rgb -o out.yuv\n"),
+            strAppName);
+  _tprintf (_T ("\n"));
+}
+
+/*
+ * Translate a format name from the command line into a vVideoFormat value.
+ * The comparison is an exact _tcscmp() match against yv12, i420, rgb24, rgb32,
+ * yuy2 and nv12; every other string yields vVideoFormat_I420.
+ */
+vVideoFormat Str2FourCC (TCHAR* strInput) {
+  static const struct {
+    const TCHAR* pName;
+    vVideoFormat eFormat;
+  } kFormatNames[] = {
+    { _T ("yv12"),  vVideoFormat_YV12  },
+    { _T ("i420"),  vVideoFormat_I420  },
+    { _T ("rgb24"), vVideoFormat_RGB24 },
+    { _T ("rgb32"), vVideoFormat_RGB32 },
+    { _T ("yuy2"),  vVideoFormat_YUY2  },
+    { _T ("nv12"),  vVideoFormat_NV12  }
+  };
+  const unsigned int kNumFormats = sizeof (kFormatNames) / sizeof (kFormatNames[0]);
+
+  for (unsigned int idx = 0; idx < kNumFormats; idx++) {
+    if (0 == _tcscmp (strInput, kFormatNames[idx].pName))
+      return kFormatNames[idx].eFormat;
+  }
+
+  return vVideoFormat_I420; // as default
+}
+
+/*
+ * Read one frame from fp into pixmap.pPixel[0].
+ *
+ * The frame is read as elements of pixmap.nSizeInBits / 8 bytes; the element
+ * count is width*height scaled by 3/2 (I420, YV12), 2 (YUY2), 3 (RGB24) or
+ * 4 (RGB32).
+ *
+ * Returns 0 only when the complete frame was read.  Returns 1 for an
+ * unsupported format, a non-positive picture size, end of file, a read error
+ * or a truncated trailing frame.
+ */
+int ReadFile (vPixMap& pixmap, FILE* fp) {
+  const int size = pixmap.Rect.width * pixmap.Rect.height;
+  if (size <= 0)
+    return 1; // nothing to read; also keeps a negative size out of the size_t count
+
+  size_t count = 0;
+  switch (pixmap.eFormat) {
+  case vVideoFormat_I420:
+  case vVideoFormat_YV12:
+    count = (size_t) ((3 * size) >> 1);
+    break;
+  case vVideoFormat_YUY2:
+    count = (size_t) (2 * size);
+    break;
+  case vVideoFormat_RGB24:
+    count = (size_t) (3 * size);
+    break;
+  case vVideoFormat_RGB32:
+    count = (size_t) (4 * size);
+    break;
+  default:
+    return 1;
+  }
+
+  // fread() returns the number of complete elements read and is never
+  // negative, so the result has to be compared with the requested count;
+  // anything shorter means the frame buffer was not completely filled.
+  return (fread (pixmap.pPixel[0], pixmap.nSizeInBits / 8, count, fp) == count) ? 0 : 1;
+}
+
+/*
+ * Write one frame from pixmap.pPixel[0] to fp.
+ *
+ * The frame is written as elements of pixmap.nSizeInBits / 8 bytes; the
+ * element count is width*height scaled by 3/2 (I420, YV12), 2 (YUY2),
+ * 3 (RGB24) or 4 (RGB32).
+ *
+ * Returns 0 only when the complete frame was written.  Returns 1 for an
+ * unsupported format, a non-positive picture size or a short write
+ * (for example when the disk is full).
+ */
+int WriteFile (vPixMap& pixmap, FILE* fp) {
+  const int size = pixmap.Rect.width * pixmap.Rect.height;
+  if (size <= 0)
+    return 1; // nothing to write; also keeps a negative size out of the size_t count
+
+  size_t count = 0;
+  switch (pixmap.eFormat) {
+  case vVideoFormat_I420:
+  case vVideoFormat_YV12:
+    count = (size_t) ((3 * size) >> 1);
+    break;
+  case vVideoFormat_YUY2:
+    count = (size_t) (2 * size);
+    break;
+  case vVideoFormat_RGB24:
+    count = (size_t) (3 * size);
+    break;
+  case vVideoFormat_RGB32:
+    count = (size_t) (4 * size);
+    break;
+  default:
+    return 1;
+  }
+
+  // fwrite() returns the number of complete elements written and is never
+  // negative; a value below the requested count is a failed (partial) write.
+  return (fwrite (pixmap.pPixel[0], pixmap.nSizeInBits / 8, count, fp) == count) ? 0 : 1;
+}
+
+
+/*
+ * Set up strides and allocate the pixel buffer of pixmap for its eFormat and
+ * Rect size.
+ *
+ * nSizeInBits is set to 8.  One contiguous buffer is allocated and stored in
+ * pPixel[0]; for I420/YV12 pPixel[1] and pPixel[2] point into that buffer
+ * after the first plane, for the packed formats they are NULL.
+ *
+ * The buffer is obtained with malloc() because FreePixMap() releases it with
+ * free(); malloc() also reports failure by returning NULL, which is what the
+ * return value is derived from.
+ *
+ * Returns 0 on success, 1 for an unsupported format, a non-positive picture
+ * size or an allocation failure.
+ */
+int AllocPixMap (vPixMap& pixmap) {
+  pixmap.nSizeInBits = sizeof (unsigned char) * 8;
+
+  if (pixmap.Rect.width <= 0 || pixmap.Rect.height <= 0)
+    return 1;
+
+  size_t nPlaneBytes = 0; // stride[0] * height, in bytes
+  size_t nTotalBytes = 0; // size of the whole allocation, in bytes
+  bool   bPlanar     = false;
+
+  switch (pixmap.eFormat) {
+  case vVideoFormat_I420:
+  case vVideoFormat_YV12:
+    pixmap.nStride[0] = pixmap.Rect.width;
+    pixmap.nStride[1] = pixmap.nStride[2] = pixmap.Rect.width / 2;
+    nPlaneBytes = pixmap.nStride[0] * pixmap.Rect.height * pixmap.nSizeInBits / 8;
+    nTotalBytes = nPlaneBytes * 3 / 2;
+    bPlanar     = true;
+    break;
+
+  case vVideoFormat_YUY2:
+    pixmap.nStride[0] = pixmap.nStride[1] = pixmap.nStride[2] = pixmap.Rect.width * 2;
+    nPlaneBytes = pixmap.nStride[0] * pixmap.Rect.height * pixmap.nSizeInBits / 8;
+    // Same element count as before; the stride already contains the factor,
+    // so this is a generous upper bound rather than the exact frame size.
+    nTotalBytes = nPlaneBytes * 2;
+    break;
+
+  case vVideoFormat_RGB24:
+    pixmap.nStride[0] = pixmap.nStride[1] = pixmap.nStride[2] = pixmap.Rect.width * 3;
+    nPlaneBytes = pixmap.nStride[0] * pixmap.Rect.height * pixmap.nSizeInBits / 8;
+    nTotalBytes = nPlaneBytes * 3;
+    break;
+
+  case vVideoFormat_RGB32:
+    pixmap.nStride[0] = pixmap.nStride[1] = pixmap.nStride[2] = pixmap.Rect.width * 4;
+    nPlaneBytes = pixmap.nStride[0] * pixmap.Rect.height * pixmap.nSizeInBits / 8;
+    nTotalBytes = nPlaneBytes * 4;
+    break;
+
+  default:
+    return 1;
+  }
+
+  pixmap.pPixel[0] = malloc (nTotalBytes);
+  pixmap.pPixel[1] = pixmap.pPixel[2] = NULL;
+  if (NULL == pixmap.pPixel[0])
+    return 1;
+
+  if (bPlanar) {
+    // The two chroma planes follow the first plane inside the same buffer.
+    pixmap.pPixel[1] = (unsigned char*)pixmap.pPixel[0] + nPlaneBytes;
+    pixmap.pPixel[2] = (unsigned char*)pixmap.pPixel[0] + nPlaneBytes * 5 / 4;
+  }
+
+  return 0;
+}
+
+/*
+ * Release the pixel buffer of pixmap, if there is one.
+ * Only pPixel[0] is handed to free(); pPixel[1] and pPixel[2] are simply
+ * cleared together with it.  A pixmap whose pPixel[0] is NULL is left alone.
+ */
+void FreePixMap (vPixMap& pixmap) {
+  if (NULL == pixmap.pPixel[0])
+    return;
+
+  free (pixmap.pPixel[0]);
+  pixmap.pPixel[2] = NULL;
+  pixmap.pPixel[1] = NULL;
+  pixmap.pPixel[0] = NULL;
+}
+
+/*
+ * Validate the parsed configuration and allocate both picture buffers.
+ *
+ * Requires cfg.srcfile and cfg.dstfile to be open (help is printed otherwise),
+ * lets a zero destination width/height default to the source value, enables
+ * the colour-space-convert slot and allocates cfg.src then cfg.dst.
+ *
+ * Returns 0 on success.  On any failure both pixmaps are passed to
+ * FreePixMap() and 1 is returned; the file handles are not closed here.
+ */
+int InitResource (TCHAR* strAppName, VpConfigure& cfg) {
+  bool bReady = false;
+
+  if (NULL == cfg.srcfile) {
+    PrintHelp (strAppName, _T ("Source file can not found!\n"));
+  } else if (NULL == cfg.dstfile) {
+    PrintHelp (strAppName, _T ("Destination file name not found"));
+  } else {
+    if (0 == cfg.dst.Rect.width)
+      cfg.dst.Rect.width = cfg.src.Rect.width;
+    if (0 == cfg.dst.Rect.height)
+      cfg.dst.Rect.height = cfg.src.Rect.height;
+
+    cfg.methods[vMethods_ColorSpaceConvert] = vMethods_ColorSpaceConvert;
+
+    // The destination is only allocated once the source allocation succeeded.
+    bReady = (0 == AllocPixMap (cfg.src)) && (0 == AllocPixMap (cfg.dst));
+  }
+
+  if (bReady)
+    return 0;
+
+  FreePixMap (cfg.src);
+  FreePixMap (cfg.dst);
+  return 1;
+}
+
+/*
+ * Parse the command line into cfg and then call InitResource().
+ *
+ * strInput  argument vector; strInput[0] is the program name used in help
+ * nArgNum   number of entries in strInput; fewer than 9 is rejected
+ * cfg       receives the opened files, picture rectangles, formats and the
+ *           enabled processing methods
+ *
+ * Options listed in kValueOptions consume the following argument as their
+ * value.  If that argument is missing, help is printed and 1 is returned
+ * instead of reading past the end of the argument vector.  -denoise and -scd
+ * set their slot in cfg.methods; every other argument (including -downsample,
+ * -vaa, -bgd and -aq) is accepted and ignored.
+ *
+ * -w / -h supply the width and height used for any source or destination
+ * dimension that is still 0 after parsing.
+ *
+ * Returns 1 on a usage error, otherwise the result of InitResource().
+ * Files opened before an error is detected are left open in cfg.
+ *
+ * NOTE(review): -sx/-dx are stored in Rect.top and -sy/-dy in Rect.left, which
+ * looks swapped; kept as is because the consumer of these fields is not
+ * visible here - confirm.
+ */
+int ParseCommond (TCHAR* strInput[], int nArgNum, VpConfigure& cfg) {
+  static const TCHAR* const kValueOptions[] = {
+    _T ("-i"),  _T ("-o"),  _T ("-w"),  _T ("-h"),
+    _T ("-sx"), _T ("-sy"), _T ("-sw"), _T ("-sh"), _T ("-scc"),
+    _T ("-dx"), _T ("-dy"), _T ("-dw"), _T ("-dh"), _T ("-dcc")
+  };
+  const int kNumValueOptions = sizeof (kValueOptions) / sizeof (kValueOptions[0]);
+
+  if (nArgNum < 9) {
+    PrintHelp (strInput[0], _T ("please specify all necessary parameters!"));
+    return 1;
+  }
+
+  int width = 0, height = 0;
+  for (int i = 1; i < nArgNum; i++) {
+    TCHAR* strOpt = strInput[i];
+    if (NULL == strOpt)
+      continue;
+
+    bool bTakesValue = false;
+    for (int k = 0; k < kNumValueOptions && !bTakesValue; k++)
+      bTakesValue = (0 == _tcscmp (strOpt, kValueOptions[k]));
+
+    TCHAR* strValue = NULL;
+    if (bTakesValue) {
+      // The value must exist before it is handed to _tfopen_s / _stscanf_s.
+      if (i + 1 >= nArgNum || NULL == strInput[i + 1]) {
+        PrintHelp (strInput[0], _T ("option is missing its value!"));
+        return 1;
+      }
+      strValue = strInput[++i];
+    }
+
+    if (0 == _tcscmp (strOpt, _T ("-i"))) {
+      _tfopen_s (&cfg.srcfile, strValue, _T ("rb"));
+    } else if (0 == _tcscmp (strOpt, _T ("-o"))) {
+      _tfopen_s (&cfg.dstfile, strValue, _T ("wb"));
+    } else if (0 == _tcscmp (strOpt, _T ("-w"))) {
+      _stscanf_s (strValue, _T ("%d"), &width);
+    } else if (0 == _tcscmp (strOpt, _T ("-h"))) {
+      _stscanf_s (strValue, _T ("%d"), &height);
+    }
+    //-----------------------------------------------------------------------------------
+    else if (0 == _tcscmp (strOpt, _T ("-sx"))) {
+      _stscanf_s (strValue, _T ("%hd"), &cfg.src.Rect.top);
+    } else if (0 == _tcscmp (strOpt, _T ("-sy"))) {
+      _stscanf_s (strValue, _T ("%hd"), &cfg.src.Rect.left);
+    } else if (0 == _tcscmp (strOpt, _T ("-sw"))) {
+      _stscanf_s (strValue, _T ("%hd"), &cfg.src.Rect.width);
+    } else if (0 == _tcscmp (strOpt, _T ("-sh"))) {
+      _stscanf_s (strValue, _T ("%hd"), &cfg.src.Rect.height);
+    } else if (0 == _tcscmp (strOpt, _T ("-scc"))) {
+      cfg.src.eFormat = Str2FourCC (strValue);
+    }
+    //-----------------------------------------------------------------------------------
+    else if (0 == _tcscmp (strOpt, _T ("-dx"))) {
+      _stscanf_s (strValue, _T ("%hd"), &cfg.dst.Rect.top);
+    } else if (0 == _tcscmp (strOpt, _T ("-dy"))) {
+      _stscanf_s (strValue, _T ("%hd"), &cfg.dst.Rect.left);
+    } else if (0 == _tcscmp (strOpt, _T ("-dw"))) {
+      _stscanf_s (strValue, _T ("%hd"), &cfg.dst.Rect.width);
+    } else if (0 == _tcscmp (strOpt, _T ("-dh"))) {
+      _stscanf_s (strValue, _T ("%hd"), &cfg.dst.Rect.height);
+    } else if (0 == _tcscmp (strOpt, _T ("-dcc"))) {
+      cfg.dst.eFormat = Str2FourCC (strValue);
+    }
+    //-----------------------------------------------------------------------------------
+    else if (0 == _tcscmp (strOpt, _T ("-denoise"))) {
+      cfg.methods[vMethods_Denoise] = vMethods_Denoise;
+    } else if (0 == _tcscmp (strOpt, _T ("-scd"))) {
+      cfg.methods[vMethods_SceneChangeDetection] = vMethods_SceneChangeDetection;
+    }
+    // -downsample, -vaa, -bgd, -aq and unknown arguments: accepted, no effect.
+  }
+
+  if (cfg.src.Rect.width == 0)  cfg.src.Rect.width  = width;
+  if (cfg.src.Rect.height == 0) cfg.src.Rect.height = height;
+  if (cfg.dst.Rect.width == 0)  cfg.dst.Rect.width  = width;
+  if (cfg.dst.Rect.height == 0) cfg.dst.Rect.height = height;
+
+  return InitResource (strInput[0], cfg);
+}
+
+/*
+ * Entry point of the sample console.
+ *
+ * Parses the command line, creates the IWelsVpPlugin wrapper and then, frame
+ * by frame, reads the source, runs the colour-space-convert slot (src -> dst)
+ * and the denoise slot (on dst), and writes the destination.  The loop stops
+ * at end of input or at the first step that reports a failure.
+ *
+ * All resources (plugin, files, pixel buffers) are released before returning.
+ * The function returns 0 on every path.
+ */
+int _tmain (int argc, _TCHAR* argv[]) {
+  VpConfigure cfg     = {0};
+  IWelsVpPlugin* pVpp = NULL;
+  int   ret           = ParseCommond (argv, argc, cfg);
+
+  if (0 == ret) {
+    pVpp = new IWelsVpPlugin (ret);
+    if (pVpp && ret == 0) {
+      vResult vret = vRet_Success;
+      while (!feof (cfg.srcfile)) {
+        if (ReadFile (cfg.src, cfg.srcfile))
+          break;
+
+        vret = pVpp->Process (cfg.methods[vMethods_ColorSpaceConvert], &cfg.src, &cfg.dst);
+        if (vret)
+          break;
+
+        vret = pVpp->Process (cfg.methods[vMethods_Denoise], &cfg.dst, NULL);
+        if (vret)
+          break;
+
+        if (WriteFile (cfg.dst, cfg.dstfile))
+          break;
+      }
+    }
+  }
+
+  // Common cleanup, reached whether or not parsing and processing succeeded.
+  delete pVpp; // deleting a NULL pointer is a no-op
+  pVpp = NULL;
+
+  if (cfg.srcfile)
+    fclose (cfg.srcfile);
+  if (cfg.dstfile)
+    fclose (cfg.dstfile);
+
+  FreePixMap (cfg.src);
+  FreePixMap (cfg.dst);
+
+  return 0;
+}
+
--- a/processing/src/testbed/bundleloader.h
+++ b/processing/src/testbed/bundleloader.h
@@ -1,95 +1,87 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifndef WELS_BOUNDLELOAD_H
-#define WELS_BOUNDLELOAD_H
-
-#if defined(MACOS)
-
-#include <dlfcn.h>
-#include <string>
-
-CFBundleRef LoadBundle(const char* lpBundlePath)
-{
-	if(lpBundlePath == NULL)
-	{
-		return NULL;
-	}
-	CFStringRef bundlePath = CFStringCreateWithCString(kCFAllocatorSystemDefault, lpBundlePath, CFStringGetSystemEncoding());
-	if(NULL == bundlePath)
-	{
-		return NULL;
-	}
-
-	CFURLRef bundleURL = CFURLCreateWithString(kCFAllocatorSystemDefault, bundlePath, NULL);
-	if(NULL == bundleURL)
-	{
-		return NULL;
-	}
-
-	// 2.get bundle ref
-	CFBundleRef bundleRef = CFBundleCreate(kCFAllocatorSystemDefault, bundleURL);
-	CFRelease(bundleURL);
-
-	//	Boolean bReturn = FALSE;
-	if(NULL != bundleRef)
-	{
-		//	bReturn = CFBundleLoadExecutable(bundleRef);
-	}
-
-	return bundleRef;
-}
-
-Boolean FreeBundle(CFBundleRef bundleRef)
-{
-	if(NULL != bundleRef)
-	{
-		//	CFBundleUnloadExecutable(bundleRef);
-		CFRelease(bundleRef);
-	}
-	return TRUE;
-}
-
-void* GetProcessAddress(CFBundleRef bundleRef, const char* lpProcName)
-{
-	void *processAddress = NULL;
-	if(NULL != bundleRef)
-	{
-		CFStringRef cfProcName = CFStringCreateWithCString(kCFAllocatorSystemDefault, lpProcName, CFStringGetSystemEncoding());
-		processAddress = CFBundleGetFunctionPointerForName(bundleRef, cfProcName);
-		CFRelease(cfProcName);
-	}
-	return processAddress;
-}
-#endif
-
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef WELS_BOUNDLELOAD_H
+#define WELS_BOUNDLELOAD_H
+
+#if defined(MACOS)
+
+#include <dlfcn.h>
+#include <string>
+
+// Create a CFBundle for lpBundlePath; returns NULL if the path is NULL or any CF object cannot be
+// created. The returned reference is owned by the caller and can be released with FreeBundle().
+CFBundleRef LoadBundle (const char* lpBundlePath) {
+  if (lpBundlePath == NULL) {
+    return NULL;
+  }
+  CFStringRef bundlePath = CFStringCreateWithCString (kCFAllocatorSystemDefault, lpBundlePath,
+                           CFStringGetSystemEncoding());
+  if (NULL == bundlePath) {
+    return NULL;
+  }
+
+  CFURLRef bundleURL = CFURLCreateWithString (kCFAllocatorSystemDefault, bundlePath, NULL);
+  // The string was created here and is not needed past this point: release it on every path.
+  CFRelease (bundlePath);
+  if (NULL == bundleURL) {
+    return NULL;
+  }
+
+  // 2.get bundle ref
+  CFBundleRef bundleRef = CFBundleCreate (kCFAllocatorSystemDefault, bundleURL);
+  CFRelease (bundleURL);
+
+  // The bundle executable is not loaded explicitly here; bundleRef may be NULL on failure.
+  return bundleRef;
+}
+
+// Release the reference held on bundleRef; a NULL bundleRef is ignored. Always returns TRUE.
+Boolean FreeBundle (CFBundleRef bundleRef) {
+  if (bundleRef == NULL)
+    return TRUE;
+  CFRelease (bundleRef);
+  return TRUE;
+}
+
+// Look up the function lpProcName in bundleRef; returns NULL when an argument is NULL or on failure.
+void* GetProcessAddress (CFBundleRef bundleRef, const char* lpProcName) {
+  if (NULL == bundleRef || NULL == lpProcName) return NULL;
+  CFStringRef cfProcName = CFStringCreateWithCString (kCFAllocatorSystemDefault, lpProcName, CFStringGetSystemEncoding());
+  if (NULL == cfProcName) return NULL; // creation failed: CFRelease (NULL) below would crash
+  void* processAddress = CFBundleGetFunctionPointerForName (bundleRef, cfProcName);
+  CFRelease (cfProcName);
+  return processAddress;
+}
+#endif
+
 #endif
\ No newline at end of file
--- a/processing/src/testbed/stdafx.cpp
+++ b/processing/src/testbed/stdafx.cpp
@@ -1,8 +1,8 @@
-// stdafx.cpp : source file that includes just the standard includes
-// WelsVideoProcessor.pch will be the pre-compiled header
-// stdafx.obj will contain the pre-compiled type information
-
-#include "stdafx.h"
-
-// TODO: reference any additional headers you need in STDAFX.H
-// and not in this file
+// stdafx.cpp : source file that includes just the standard includes
+// WelsVideoProcessor.pch will be the pre-compiled header
+// stdafx.obj will contain the pre-compiled type information
+
+#include "stdafx.h"
+
+// TODO: reference any additional headers you need in STDAFX.H
+// and not in this file
--- a/processing/src/testbed/stdafx.h
+++ b/processing/src/testbed/stdafx.h
@@ -1,20 +1,20 @@
-// stdafx.h : include file for standard system include files,
-// or project specific include files that are used frequently, but
-// are changed infrequently
-//
-
-#ifndef _WELSVP_STDAFX_H
-#define _WELSVP_STDAFX_H
-
-#include "targetver.h"
-
-#if defined (WIN32)
-#include <windows.h>
-#include <tchar.h>
-#else
-#include <string.h>
-#endif
-#include <stdio.h>
-#include <stdlib.h>
-
-#endif
+// stdafx.h : include file for standard system include files,
+// or project specific include files that are used frequently, but
+// are changed infrequently
+//
+
+#ifndef _WELSVP_STDAFX_H
+#define _WELSVP_STDAFX_H
+
+#include "targetver.h"
+
+#if defined (WIN32)
+#include <windows.h>
+#include <tchar.h>
+#else
+#include <string.h>
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+
+#endif
--- a/processing/src/testbed/targetver.h
+++ b/processing/src/testbed/targetver.h
@@ -1,16 +1,16 @@
-#ifndef _WELSVP_TARGETVER_H
-#define _WELSVP_TARGETVER_H
-
-// The following macros define the minimum required platform.  The minimum required platform
-// is the earliest version of Windows, Internet Explorer etc. that has the necessary features to run 
-// your application.  The macros work by enabling all features available on platform versions up to and 
-// including the version specified.
-
-// Modify the following defines if you have to target a platform prior to the ones specified below.
-// Refer to MSDN for the latest info on corresponding values for different platforms.
-#ifndef _WIN32_WINNT            // Specifies that the minimum required platform is Windows Vista.
-#define _WIN32_WINNT 0x0600     // Change this to the appropriate value to target other versions of Windows.
-#endif
-
-#endif
-
+#ifndef _WELSVP_TARGETVER_H
+#define _WELSVP_TARGETVER_H
+
+// The following macros define the minimum required platform.  The minimum required platform
+// is the earliest version of Windows, Internet Explorer etc. that has the necessary features to run
+// your application.  The macros work by enabling all features available on platform versions up to and
+// including the version specified.
+
+// Modify the following defines if you have to target a platform prior to the ones specified below.
+// Refer to MSDN for the latest info on corresponding values for different platforms.
+#ifndef _WIN32_WINNT            // Specifies that the minimum required platform is Windows Vista.
+#define _WIN32_WINNT 0x0600     // Change this to the appropriate value to target other versions of Windows.
+#endif
+
+#endif
+
--- a/processing/src/testbed/wels_process.cpp
+++ b/processing/src/testbed/wels_process.cpp
@@ -1,195 +1,181 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <windows.h>
-#include "wels_process.h"
-#include "bundleloader.h"
-
-// entry API declaration
-typedef vResult (WELSAPI *pfnCreateVpInterface)  (void **, int );
-typedef vResult (WELSAPI *pfnDestroyVpInterface) (void * , int );
-
-////////////////////////////////////////////////////////
-void *loadlib()
-{
-#if defined(WIN32)
-	HMODULE shModule = LoadLibraryA("WelsVP.dll");
-	if (shModule == NULL)
-		shModule = LoadLibraryA("../WelsVP.dll");
-
-#elif defined(MACOS)
-	const char WelsVPLib[] = "WelsVP.bundle";
-	CFBundleRef shModule = LoadBundle(WelsVPLib);
-
-#elif defined(UNIX)
-	const char WelsVPLib[] = "WelsVP.so";
-	void* shModule = dlopen(WelsVPLib, RTLD_LAZY);
-#endif
-
-	return (void *)shModule;
-}
-
-void freelib(void *lib)
-{
-	if (lib)
-	{
-#ifdef WIN32
-		HMODULE shModule = (HMODULE)lib;
-		FreeLibrary(shModule);
-
-#elif defined(MACOS)
-		CFBundleRef shModule = (CFBundleRef)lib;
-		FreeBundle(shModule);
-
-#elif defined(UNIX)
-		void* shModule = lib;
-		dlclose(shModule);
-#endif
-	}
-}
-
-void *queryfunc(void *lib, const char *name)
-{
-    void *pFunc = NULL;
-#ifdef WIN32
-	HMODULE shModule = (HMODULE)lib;
-	pFunc = (void *)GetProcAddress(shModule, name);
-#elif defined(MACOS)
-	CFBundleRef shModule = (CFBundleRef)lib;
-	pFunc = (void *)GetProcessAddress(shModule, name);
-#elif defined(UNIX)
-	void* shModule = lib;
-	pFunc = (void *)dlsym(shModule, name);
-#endif
-
-	return pFunc;
-}
-
-IWelsVpPlugin::IWelsVpPlugin(int &ret)
-: flag(0)
-, ivp(NULL)
-, hlib(NULL)
-{
-	pfnCreateVpInterface  pCreateVpInterface  = NULL;
-	pfnDestroyVpInterface pDestroyVpInterface = NULL;
-	iface[0] = iface[1] = NULL;
-
-	hlib  = loadlib();
-	if (!hlib)
-		goto exit;
-
-	pCreateVpInterface  = (pfnCreateVpInterface)  queryfunc(hlib, ("CreateVpInterface"));
-	pDestroyVpInterface = (pfnDestroyVpInterface) queryfunc(hlib, ("DestroyVpInterface"));
-	if (!pCreateVpInterface || !pDestroyVpInterface)
-		goto exit;
-    
-	iface[0] = (void *) pCreateVpInterface;
-	iface[1] = (void *) pDestroyVpInterface;
-	pCreateVpInterface((void **)&ivp, WELSVP_INTERFACE_VERION);
-	if (!iface)
-		goto exit;
-
-	ret = 0;
-	return;
-
-exit:
-	ret = 1;
-}
-
-IWelsVpPlugin::~IWelsVpPlugin()
-{
-	if (hlib)
-	{
-		pfnDestroyVpInterface pDestroyVpInterface = (pfnDestroyVpInterface) iface[1];
-		if (pDestroyVpInterface)
-			pDestroyVpInterface((void *)ivp, WELSVP_INTERFACE_VERION);
-
-		freelib(hlib);
-		hlib = NULL;
-	}
-}
-
-vResult IWelsVpPlugin::Init (int nType, void *pCfg)
-{
-	vResult ret = vRet_NotSupport;
-	if (hlib && nType > 0)
-		ret = ivp->Init(nType, pCfg);
-	return ret;
-}
-
-vResult IWelsVpPlugin::Uninit (int nType)
-{
-	vResult ret = vRet_NotSupport;
-	if (hlib && nType > 0)
-		ret = ivp->Uninit(nType);
-	return ret; 
-}
-
-vResult IWelsVpPlugin::Flush (int nType)
-{
-	vResult ret = vRet_NotSupport;
-	if (hlib && nType > 0)
-		ret = ivp->Flush(nType);
-	return ret; 	
-}
-
-vResult IWelsVpPlugin::Process (int nType, vPixMap *src, vPixMap *dst)
-{
-	vResult ret = vRet_NotSupport;
-	if (hlib && nType > 0)
-		ret = ivp->Process(nType, src, dst);
-	return ret; 
-}
-
-vResult IWelsVpPlugin::Get (int nType, void *pParam)
-{
-	vResult ret = vRet_NotSupport;
-	if (hlib && nType > 0)
-		ret = ivp->Get(nType, pParam);
-	return ret; 
-}
-
-vResult IWelsVpPlugin::Set (int nType, void *pParam)
-{
-	vResult ret = vRet_NotSupport;
-	if (hlib && nType > 0)
-		ret = ivp->Set(nType, pParam);
-	return ret; 
-}
-
-vResult IWelsVpPlugin::SpecialFeature (int nType, void *pIn, void *pOut)
-{
-	vResult ret = vRet_NotSupport;
-	if (hlib && nType > 0)
-		ret = ivp->SpecialFeature(nType, pIn, pOut);
-	return ret; 
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <windows.h>
+#include "wels_process.h"
+#include "bundleloader.h"
+
+// entry API declaration
+typedef vResult (WELSAPI* pfnCreateVpInterface) (void**, int);
+typedef vResult (WELSAPI* pfnDestroyVpInterface) (void*, int);
+
+////////////////////////////////////////////////////////
+// Loads the WelsVP plugin library under its platform-specific file name and
+// returns the platform handle cast to void*.
+void* loadlib() {
+#if defined(WIN32)
+  HMODULE hPlugin = LoadLibraryA ("WelsVP.dll");
+  if (!hPlugin)  // first lookup failed: retry through the relative path "../"
+    hPlugin = LoadLibraryA ("../WelsVP.dll");
+#elif defined(MACOS)
+  const char kPluginName[] = "WelsVP.bundle";
+  CFBundleRef hPlugin = LoadBundle (kPluginName);
+#elif defined(UNIX)
+  const char kPluginName[] = "WelsVP.so";
+  void* hPlugin = dlopen (kPluginName, RTLD_LAZY);
+#endif
+
+  return (void*)hPlugin;
+}
+
+// Releases a library handle obtained from loadlib().
+// A NULL handle is ignored. The handle is cast back to the platform type
+// (HMODULE or CFBundleRef; the UNIX branch hands it to dlclose as is)
+// before it is released.
+void freelib (void* lib) {
+  if (!lib)
+    return;
+
+#ifdef WIN32
+  FreeLibrary ((HMODULE)lib);
+#elif defined(MACOS)
+  FreeBundle ((CFBundleRef)lib);
+#elif defined(UNIX)
+  dlclose (lib);
+#endif
+}
+
+// Looks up the exported symbol `name` in the library handle `lib`.
+// Returns the address reported by the platform lookup cast to void*,
+// or NULL when none of the platform macros (WIN32 / MACOS / UNIX)
+// is defined.
+void* queryfunc (void* lib, const char* name) {
+#ifdef WIN32
+  return (void*)GetProcAddress ((HMODULE)lib, name);
+#elif defined(MACOS)
+  return (void*)GetProcessAddress ((CFBundleRef)lib, name);
+#elif defined(UNIX)
+  return (void*)dlsym (lib, name);
+#else
+  return NULL;
+#endif
+}
+
+// Loads the WelsVP library, resolves its entry points and creates the IWelsVP
+// instance. ret is set to 0 on success and 1 on failure; on failure the library
+// is released again and hlib is reset to NULL.
+IWelsVpPlugin::IWelsVpPlugin (int& ret)
+  : flag (0)
+  , ivp (NULL)
+  , hlib (NULL) {
+  pfnCreateVpInterface  pCreateVpInterface  = NULL;
+  pfnDestroyVpInterface pDestroyVpInterface = NULL;
+  iface[0] = iface[1] = NULL;
+  ret = 1;
+
+  hlib = loadlib();
+  if (!hlib)
+    return;
+
+  pCreateVpInterface  = (pfnCreateVpInterface)  queryfunc (hlib, ("CreateVpInterface"));
+  pDestroyVpInterface = (pfnDestroyVpInterface) queryfunc (hlib, ("DestroyVpInterface"));
+  if (pCreateVpInterface && pDestroyVpInterface)
+    pCreateVpInterface ((void**)&ivp, WELSVP_INTERFACE_VERION);
+  if (!ivp) {  // the old test was `!iface`, an array, which can never be NULL
+    freelib (hlib);
+    hlib = NULL;
+    return;
+  }
+  iface[0] = (void*) pCreateVpInterface;
+  iface[1] = (void*) pDestroyVpInterface;
+  ret = 0;
+}
+
+// Destroys the plugin instance via iface[1] (if resolved), then unloads the library.
+IWelsVpPlugin::~IWelsVpPlugin() {
+  if (!hlib)
+    return;
+  pfnDestroyVpInterface destroyFn = (pfnDestroyVpInterface) iface[1];
+  if (destroyFn)
+    destroyFn ((void*)ivp, WELSVP_INTERFACE_VERION);
+  freelib (hlib);
+  hlib = NULL;
+}
+
+// Forwards to IWelsVP::Init; vRet_NotSupport without a plugin instance or if nType <= 0.
+vResult IWelsVpPlugin::Init (int nType, void* pCfg) {
+  if (!hlib || !ivp || nType <= 0)  // ivp can still be NULL after a failed constructor
+    return vRet_NotSupport;
+  return ivp->Init (nType, pCfg);
+}
+
+// Forwards to IWelsVP::Uninit; vRet_NotSupport without a plugin instance or if nType <= 0.
+vResult IWelsVpPlugin::Uninit (int nType) {
+  if (!hlib || !ivp || nType <= 0)  // ivp can still be NULL after a failed constructor
+    return vRet_NotSupport;
+  return ivp->Uninit (nType);
+}
+
+// Forwards to IWelsVP::Flush; vRet_NotSupport without a plugin instance or if nType <= 0.
+vResult IWelsVpPlugin::Flush (int nType) {
+  if (!hlib || !ivp || nType <= 0)  // ivp can still be NULL after a failed constructor
+    return vRet_NotSupport;
+  return ivp->Flush (nType);
+}
+
+// Forwards to IWelsVP::Process; vRet_NotSupport without a plugin instance or if nType <= 0.
+vResult IWelsVpPlugin::Process (int nType, vPixMap* src, vPixMap* dst) {
+  if (!hlib || !ivp || nType <= 0)  // ivp can still be NULL after a failed constructor
+    return vRet_NotSupport;
+  return ivp->Process (nType, src, dst);
+}
+
+// Forwards to IWelsVP::Get; vRet_NotSupport without a plugin instance or if nType <= 0.
+vResult IWelsVpPlugin::Get (int nType, void* pParam) {
+  if (!hlib || !ivp || nType <= 0)  // ivp can still be NULL after a failed constructor
+    return vRet_NotSupport;
+  return ivp->Get (nType, pParam);
+}
+
+// Forwards to IWelsVP::Set; vRet_NotSupport without a plugin instance or if nType <= 0.
+vResult IWelsVpPlugin::Set (int nType, void* pParam) {
+  if (!hlib || !ivp || nType <= 0)  // ivp can still be NULL after a failed constructor
+    return vRet_NotSupport;
+  return ivp->Set (nType, pParam);
+}
+
+// Forwards to IWelsVP::SpecialFeature; vRet_NotSupport without a plugin instance or if nType <= 0.
+vResult IWelsVpPlugin::SpecialFeature (int nType, void* pIn, void* pOut) {
+  if (!hlib || !ivp || nType <= 0)  // ivp can still be NULL after a failed constructor
+    return vRet_NotSupport;
+  return ivp->SpecialFeature (nType, pIn, pOut);
 }
\ No newline at end of file
--- a/processing/src/testbed/wels_process.h
+++ b/processing/src/testbed/wels_process.h
@@ -1,79 +1,81 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	wels_process.h
- *
- * \brief	interface of video pre-process plugins
- *
- * \date	03/21/2011
- *
- * \description : this class is designed as an interface to unify video pre-processing 
- *                class implement sets such as denoise,colorspace conversion etc...
- *
- *************************************************************************************
- */
-
-#ifndef WELS_PREPROCESS_H
-#define WELS_PREPROCESS_H
-
-#include "../../interface/IWelsVP.h"
-
-class IWelsVpPlugin
-{
-public:
-	IWelsVpPlugin(int &ret);
-	~IWelsVpPlugin();
-
-	enum
-	{
-		STATE_BEFOREENC = 0, /* before picture encoding */
-		STATE_AFTERENC     , /* after picture encoded */
-	};
-
-public:
-	vResult Init    (int nType, void *pCfg); 
-	vResult Uninit  (int nType);
-	vResult Flush   (int nType);
-	vResult Process (int nType, vPixMap *src, vPixMap *dst); 
-	vResult Get     (int nType, void *pParam); 
-	vResult Set     (int nType, void *pParam); 
-	vResult SpecialFeature (int nType, void *pIn, void *pOut);
-
-	void SetFlag(int a)   { flag = a; }
-	void GetFlag(int &a)  { a = flag; }
-
-private:
-	int      flag;
-	IWelsVP  *ivp;	
-	void     *hlib;
-	void     *iface[2];
-};
-
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	wels_process.h
+ *
+ * \brief	interface of video pre-process plugins
+ *
+ * \date	03/21/2011
+ *
+ * \description : this class is designed as an interface to unify video pre-processing
+ *                class implement sets such as denoise,colorspace conversion etc...
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELS_PREPROCESS_H
+#define WELS_PREPROCESS_H
+
+#include "../../interface/IWelsVP.h"
+
+// Thin wrapper around the WelsVP plugin library: forwards every call to the
+// IWelsVP instance created through the library's CreateVpInterface export.
+class IWelsVpPlugin {
+ public:
+  // ret receives 0 on success and 1 on failure.
+  IWelsVpPlugin (int& ret);
+  ~IWelsVpPlugin();
+
+  enum {
+    STATE_BEFOREENC = 0, /* before picture encoding */
+    STATE_AFTERENC     , /* after picture encoded */
+  };
+
+  // Forwarders to the IWelsVP methods of the same name.
+  vResult Init (int nType, void* pCfg);
+  vResult Uninit (int nType);
+  vResult Flush (int nType);
+  vResult Process (int nType, vPixMap* src, vPixMap* dst);
+  vResult Get (int nType, void* pParam);
+  vResult Set (int nType, void* pParam);
+  vResult SpecialFeature (int nType, void* pIn, void* pOut);
+
+  // Accessors for the caller-defined flag value.
+  void SetFlag (int a)  { flag = a; }
+  void GetFlag (int& a) { a = flag; }
+
+ private:
+  int      flag;      // caller-defined value, see SetFlag/GetFlag
+  IWelsVP* ivp;       // plugin instance created via iface[0]
+  void*    hlib;      // library handle from loadlib()
+  void*    iface[2];  // [0] CreateVpInterface, [1] DestroyVpInterface entry points
+};
+
 #endif
\ No newline at end of file
--- a/processing/src/vaacalc/vaacalcfuncs.cpp
+++ b/processing/src/vaacalc/vaacalcfuncs.cpp
@@ -1,655 +1,595 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "../common/typedef.h"
-#include "../common/util.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-void VAACalcSadSsd_c(uint8_t *pCurData, uint8_t *pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride, 
-						int32_t *pFrameSad, int32_t *pSad8x8, int32_t *pSum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
-{
-	uint8_t *tmp_ref = pRefData;
-	uint8_t *tmp_cur = pCurData;
-	int32_t iMbWidth = (iPicWidth >> 4);
-	int32_t mb_heigth = (iPicHeight >> 4);
-	int32_t mb_index = 0;
-	int32_t pic_stride_x8 = iPicStride << 3;
-	int32_t step = (iPicStride << 4) - iPicWidth;
-
-	*pFrameSad = 0;
-	for (int32_t i = 0; i < mb_heigth; i ++)
-	{
-		for (int32_t j = 0; j < iMbWidth; j ++)
-		{
-			int32_t k, l;
-			int32_t l_sad, l_sqdiff, l_sum, l_sqsum;
-			uint8_t *tmp_cur_row;
-			uint8_t *tmp_ref_row;
-
-			pSum16x16[mb_index] = 0;
-			psqsum16x16[mb_index] = 0;
-			psqdiff16x16[mb_index] = 0;
-			
-			l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
-			tmp_cur_row = tmp_cur;
-			tmp_ref_row = tmp_ref;
-			for (k = 0; k < 8; k ++)
-			{
-				for (l = 0; l < 8; l ++)
-				{
-					int32_t diff = WELS_ABS(tmp_cur_row[l] - tmp_ref_row[l]);
-					l_sad += diff;
-					l_sqdiff += diff*diff;
-					l_sum += tmp_cur_row[l];
-					l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
-				}
-				tmp_cur_row += iPicStride;
-				tmp_ref_row += iPicStride;
-			}
-			*pFrameSad += l_sad;
-			pSad8x8[(mb_index << 2) + 0] = l_sad;
-			pSum16x16[mb_index] += l_sum;
-			psqsum16x16[mb_index] += l_sqsum;
-			psqdiff16x16[mb_index] += l_sqdiff;
-
-			l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
-			tmp_cur_row = tmp_cur + 8;
-			tmp_ref_row = tmp_ref + 8;
-			for (k = 0; k < 8; k ++)
-			{
-				for (l = 0; l < 8; l ++)
-				{
-					int32_t diff = WELS_ABS(tmp_cur_row[l] - tmp_ref_row[l]);
-					l_sad += diff;
-					l_sqdiff += diff*diff;
-					l_sum += tmp_cur_row[l];
-					l_sqsum += tmp_cur_row[l]*tmp_cur_row[l];
-				}
-				tmp_cur_row += iPicStride;
-				tmp_ref_row += iPicStride;
-			}
-			*pFrameSad += l_sad;
-			pSad8x8[(mb_index << 2) + 1] = l_sad;
-			pSum16x16[mb_index] += l_sum;
-			psqsum16x16[mb_index] += l_sqsum;
-			psqdiff16x16[mb_index] += l_sqdiff;
-
-			l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
-			tmp_cur_row = tmp_cur + pic_stride_x8;
-			tmp_ref_row = tmp_ref + pic_stride_x8;
-			for (k = 0; k < 8; k ++)
-			{
-				for (l = 0; l < 8; l ++)
-				{
-					int32_t diff = WELS_ABS(tmp_cur_row[l] - tmp_ref_row[l]);
-					l_sad += diff;
-					l_sqdiff += diff*diff;
-					l_sum += tmp_cur_row[l];
-					l_sqsum += tmp_cur_row[l]*tmp_cur_row[l];
-				}
-				tmp_cur_row += iPicStride;
-				tmp_ref_row += iPicStride;
-			}
-			*pFrameSad += l_sad;
-			pSad8x8[(mb_index << 2) + 2] = l_sad;
-			pSum16x16[mb_index] += l_sum;
-			psqsum16x16[mb_index] += l_sqsum;
-			psqdiff16x16[mb_index] += l_sqdiff;
-			
-			l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
-			tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
-			tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
-			for (k = 0; k < 8; k ++)
-			{
-				for (l = 0; l < 8; l ++)
-				{
-					int32_t diff = WELS_ABS(tmp_cur_row[l] - tmp_ref_row[l]);
-					l_sad += diff;
-					l_sqdiff += diff*diff;
-					l_sum += tmp_cur_row[l];
-					l_sqsum += tmp_cur_row[l]*tmp_cur_row[l];
-				}
-				tmp_cur_row += iPicStride;
-				tmp_ref_row += iPicStride;
-			}
-			*pFrameSad += l_sad;
-			pSad8x8[(mb_index << 2) + 3] = l_sad;
-			pSum16x16[mb_index] += l_sum;
-			psqsum16x16[mb_index] += l_sqsum;
-			psqdiff16x16[mb_index] += l_sqdiff;
-			
-			
-			tmp_ref += 16;
-			tmp_cur += 16;
-			++mb_index;
-		}
-		tmp_ref += step;
-		tmp_cur += step;
-	}
-}
-void VAACalcSadVar_c(uint8_t *pCurData, uint8_t *pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride, 
-						int32_t *pFrameSad, int32_t *pSad8x8, int32_t *pSum16x16, int32_t *psqsum16x16)
-{
-	uint8_t *tmp_ref = pRefData;
-	uint8_t *tmp_cur = pCurData;
-	int32_t iMbWidth = (iPicWidth >> 4);
-	int32_t mb_heigth = (iPicHeight >> 4);
-	int32_t mb_index = 0;
-	int32_t pic_stride_x8 = iPicStride << 3;
-	int32_t step = (iPicStride << 4) - iPicWidth;
-
-	*pFrameSad = 0;
-	for (int32_t i = 0; i < mb_heigth; i ++)
-	{
-		for (int32_t j = 0; j < iMbWidth; j ++)
-		{
-			int32_t k, l;
-			int32_t l_sad, l_sum, l_sqsum;
-			uint8_t *tmp_cur_row;
-			uint8_t *tmp_ref_row;
-
-			pSum16x16[mb_index] = 0;
-			psqsum16x16[mb_index] = 0;
-			
-			l_sad =  l_sum =  l_sqsum = 0;
-			tmp_cur_row = tmp_cur;
-			tmp_ref_row = tmp_ref;
-			for (k = 0; k < 8; k ++)
-			{
-				for (l = 0; l < 8; l ++)
-				{
-					int32_t diff = WELS_ABS(tmp_cur_row[l] - tmp_ref_row[l]);
-					l_sad += diff;
-					l_sum += tmp_cur_row[l];
-					l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
-				}
-				tmp_cur_row += iPicStride;
-				tmp_ref_row += iPicStride;
-			}
-			*pFrameSad += l_sad;
-			pSad8x8[(mb_index << 2) + 0] = l_sad;
-			pSum16x16[mb_index] += l_sum;
-			psqsum16x16[mb_index] += l_sqsum;
-
-			l_sad =  l_sum =  l_sqsum = 0;
-			tmp_cur_row = tmp_cur + 8;
-			tmp_ref_row = tmp_ref + 8;
-			for (k = 0; k < 8; k ++)
-			{
-				for (l = 0; l < 8; l ++)
-				{
-					int32_t diff = WELS_ABS(tmp_cur_row[l] - tmp_ref_row[l]);
-					l_sad += diff;
-					l_sum += tmp_cur_row[l];
-					l_sqsum += tmp_cur_row[l]*tmp_cur_row[l];
-				}
-				tmp_cur_row += iPicStride;
-				tmp_ref_row += iPicStride;
-			}
-			*pFrameSad += l_sad;
-			pSad8x8[(mb_index << 2) + 1] = l_sad;
-			pSum16x16[mb_index] += l_sum;
-			psqsum16x16[mb_index] += l_sqsum;
-
-			l_sad =  l_sum =  l_sqsum = 0;
-			tmp_cur_row = tmp_cur + pic_stride_x8;
-			tmp_ref_row = tmp_ref + pic_stride_x8;
-			for (k = 0; k < 8; k ++)
-			{
-				for (l = 0; l < 8; l ++)
-				{
-					int32_t diff = WELS_ABS(tmp_cur_row[l] - tmp_ref_row[l]);
-					l_sad += diff;
-					l_sum += tmp_cur_row[l];
-					l_sqsum += tmp_cur_row[l]*tmp_cur_row[l];
-				}
-				tmp_cur_row += iPicStride;
-				tmp_ref_row += iPicStride;
-			}
-			*pFrameSad += l_sad;
-			pSad8x8[(mb_index << 2) + 2] = l_sad;
-			pSum16x16[mb_index] += l_sum;
-			psqsum16x16[mb_index] += l_sqsum;
-			
-			l_sad =  l_sum =  l_sqsum = 0;
-			tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
-			tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
-			for (k = 0; k < 8; k ++)
-			{
-				for (l = 0; l < 8; l ++)
-				{
-					int32_t diff = WELS_ABS(tmp_cur_row[l] - tmp_ref_row[l]);
-					l_sad += diff;
-					l_sum += tmp_cur_row[l];
-					l_sqsum += tmp_cur_row[l]*tmp_cur_row[l];
-				}
-				tmp_cur_row += iPicStride;
-				tmp_ref_row += iPicStride;
-			}
-			*pFrameSad += l_sad;
-			pSad8x8[(mb_index << 2) + 3] = l_sad;
-			pSum16x16[mb_index] += l_sum;
-			psqsum16x16[mb_index] += l_sqsum;
-			
-			
-			tmp_ref += 16;
-			tmp_cur += 16;
-			++mb_index;
-		}
-		tmp_ref += step;
-		tmp_cur += step;
-	}
-}
-
-
-void VAACalcSad_c(uint8_t *pCurData, uint8_t *pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride, 
-						int32_t *pFrameSad, int32_t *pSad8x8)
-{
-	uint8_t *tmp_ref = pRefData;
-	uint8_t *tmp_cur = pCurData;
-	int32_t iMbWidth = (iPicWidth >> 4);
-	int32_t mb_heigth = (iPicHeight >> 4);
-	int32_t mb_index = 0;
-	int32_t pic_stride_x8 = iPicStride << 3;
-	int32_t step = (iPicStride << 4) - iPicWidth;
-
-	*pFrameSad = 0;
-	for (int32_t i = 0; i < mb_heigth; i ++)
-	{
-		for (int32_t j = 0; j < iMbWidth; j ++)
-		{
-			int32_t k, l;
-			int32_t l_sad;
-			uint8_t *tmp_cur_row;
-			uint8_t *tmp_ref_row;
-			
-			l_sad =  0;
-			tmp_cur_row = tmp_cur;
-			tmp_ref_row = tmp_ref;
-			for (k = 0; k < 8; k ++)
-			{
-				for (l = 0; l < 8; l ++)
-				{
-					int32_t diff = WELS_ABS(tmp_cur_row[l] - tmp_ref_row[l]);
-					l_sad += diff;
-				}
-				tmp_cur_row += iPicStride;
-				tmp_ref_row += iPicStride;
-			}
-			*pFrameSad += l_sad;
-			pSad8x8[(mb_index << 2) + 0] = l_sad;
-
-			l_sad =  0;
-			tmp_cur_row = tmp_cur + 8;
-			tmp_ref_row = tmp_ref + 8;
-			for (k = 0; k < 8; k ++)
-			{
-				for (l = 0; l < 8; l ++)
-				{
-					int32_t diff = WELS_ABS(tmp_cur_row[l] - tmp_ref_row[l]);
-					l_sad += diff;
-				}
-				tmp_cur_row += iPicStride;
-				tmp_ref_row += iPicStride;
-			}
-			*pFrameSad += l_sad;
-			pSad8x8[(mb_index << 2) + 1] = l_sad;
-
-			l_sad =  0;
-			tmp_cur_row = tmp_cur + pic_stride_x8;
-			tmp_ref_row = tmp_ref + pic_stride_x8;
-			for (k = 0; k < 8; k ++)
-			{
-				for (l = 0; l < 8; l ++)
-				{
-					int32_t diff = WELS_ABS(tmp_cur_row[l] - tmp_ref_row[l]);
-					l_sad += diff;
-				}
-				tmp_cur_row += iPicStride;
-				tmp_ref_row += iPicStride;
-			}
-			*pFrameSad += l_sad;
-			pSad8x8[(mb_index << 2) + 2] = l_sad;
-			
-			l_sad =  0;
-			tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
-			tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
-			for (k = 0; k < 8; k ++)
-			{
-				for (l = 0; l < 8; l ++)
-				{
-					int32_t diff = WELS_ABS(tmp_cur_row[l] - tmp_ref_row[l]);
-					l_sad += diff;
-				}
-				tmp_cur_row += iPicStride;
-				tmp_ref_row += iPicStride;
-			}
-			*pFrameSad += l_sad;
-			pSad8x8[(mb_index << 2) + 3] = l_sad;
-			
-			tmp_ref += 16;
-			tmp_cur += 16;
-			++mb_index;
-		}
-		tmp_ref += step;
-		tmp_cur += step;
-	}
-}
-
-void VAACalcSadSsdBgd_c(uint8_t *pCurData, uint8_t *pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride, 
-							int32_t *pFrameSad, int32_t *pSad8x8, int32_t *pSum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16, int32_t *pSd8x8, uint8_t *pMad8x8)
-
-{
-	uint8_t *tmp_ref = pRefData;
-	uint8_t *tmp_cur = pCurData;
-	int32_t iMbWidth = (iPicWidth >> 4);
-	int32_t mb_heigth = (iPicHeight >> 4);
-	int32_t mb_index = 0;
-	int32_t pic_stride_x8 = iPicStride << 3;
-	int32_t step = (iPicStride << 4) - iPicWidth;
-
-	*pFrameSad = 0;
-	for (int32_t i = 0; i < mb_heigth; i ++)
-	{
-		for (int32_t j = 0; j < iMbWidth; j ++)
-		{
-			int32_t k, l;
-			int32_t l_sad, l_sqdiff, l_sum, l_sqsum, l_sd, l_mad;
-			uint8_t *tmp_cur_row;
-			uint8_t *tmp_ref_row;
-
-			pSum16x16[mb_index] = 0;
-			psqsum16x16[mb_index] = 0;
-			psqdiff16x16[mb_index] = 0;
-
-			l_sd = l_mad = l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
-			tmp_cur_row = tmp_cur;
-			tmp_ref_row = tmp_ref;
-			for (k = 0; k < 8; k ++)
-			{
-				for (l = 0; l < 8; l ++)
-				{
-					int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
-					int32_t abs_diff = WELS_ABS(diff);
-
-					l_sd += diff;
-					if (abs_diff>l_mad)
-					{
-						l_mad = abs_diff;
-					}
-					l_sad += abs_diff;
-					l_sqdiff += abs_diff*abs_diff;
-					l_sum += tmp_cur_row[l];
-					l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
-				}
-				tmp_cur_row += iPicStride;
-				tmp_ref_row += iPicStride;
-			}
-			*pFrameSad += l_sad;
-			pSad8x8[(mb_index << 2) + 0] = l_sad;
-			pSum16x16[mb_index] += l_sum;
-			psqsum16x16[mb_index] += l_sqsum;
-			psqdiff16x16[mb_index] += l_sqdiff;
-			pSd8x8[(mb_index << 2) + 0] = l_sd;
-			pMad8x8[(mb_index << 2) + 0] = l_mad;
-
-
-			l_sd = l_mad = l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
-			tmp_cur_row = tmp_cur + 8;
-			tmp_ref_row = tmp_ref + 8;
-			for (k = 0; k < 8; k ++)
-			{
-				for (l = 0; l < 8; l ++)
-				{
-					int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
-					int32_t abs_diff = WELS_ABS(diff);
-
-					l_sd += diff;
-					if (abs_diff>l_mad)
-					{
-						l_mad = abs_diff;
-					}
-					l_sad += abs_diff;
-					l_sqdiff += abs_diff*abs_diff;
-					l_sum += tmp_cur_row[l];
-					l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
-				}
-				tmp_cur_row += iPicStride;
-				tmp_ref_row += iPicStride;
-			}
-			*pFrameSad += l_sad;
-			pSad8x8[(mb_index << 2) + 1] = l_sad;
-			pSum16x16[mb_index] += l_sum;
-			psqsum16x16[mb_index] += l_sqsum;
-			psqdiff16x16[mb_index] += l_sqdiff;
-			pSd8x8[(mb_index << 2) + 1] = l_sd;
-			pMad8x8[(mb_index << 2) + 1] = l_mad;
-
-			l_sd = l_mad = l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
-			tmp_cur_row = tmp_cur + pic_stride_x8;
-			tmp_ref_row = tmp_ref + pic_stride_x8;
-			for (k = 0; k < 8; k ++)
-			{
-				for (l = 0; l < 8; l ++)
-				{
-					int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
-					int32_t abs_diff = WELS_ABS(diff);
-
-					l_sd += diff;
-					if (abs_diff>l_mad)
-					{
-						l_mad = abs_diff;
-					}
-					l_sad += abs_diff;
-					l_sqdiff += abs_diff*abs_diff;
-					l_sum += tmp_cur_row[l];
-					l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
-				}
-				tmp_cur_row += iPicStride;
-				tmp_ref_row += iPicStride;
-			}
-			*pFrameSad += l_sad;
-			pSad8x8[(mb_index << 2) + 2] = l_sad;
-			pSum16x16[mb_index] += l_sum;
-			psqsum16x16[mb_index] += l_sqsum;
-			psqdiff16x16[mb_index] += l_sqdiff;
-			pSd8x8[(mb_index << 2) + 2] = l_sd;
-			pMad8x8[(mb_index << 2) + 2] = l_mad;
-
-			l_sd = l_mad = l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
-			tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
-			tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
-			for (k = 0; k < 8; k ++)
-			{
-				for (l = 0; l < 8; l ++)
-				{
-					int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
-					int32_t abs_diff = WELS_ABS(diff);
-
-					l_sd += diff;
-					if (abs_diff>l_mad)
-					{
-						l_mad = abs_diff;
-					}
-					l_sad += abs_diff;
-					l_sqdiff += abs_diff*abs_diff;
-					l_sum += tmp_cur_row[l];
-					l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
-				}
-				tmp_cur_row += iPicStride;
-				tmp_ref_row += iPicStride;
-			}
-			*pFrameSad += l_sad;
-			pSad8x8[(mb_index << 2) + 3] = l_sad;
-			pSum16x16[mb_index] += l_sum;
-			psqsum16x16[mb_index] += l_sqsum;
-			psqdiff16x16[mb_index] += l_sqdiff;
-			pSd8x8[(mb_index << 2) + 3] = l_sd;
-			pMad8x8[(mb_index << 2) + 3] = l_mad;
-
-			tmp_ref += 16;
-			tmp_cur += 16;
-			++mb_index;
-		}
-		tmp_ref += step;
-		tmp_cur += step;
-	}
-}
-
-void VAACalcSadBgd_c(uint8_t *pCurData, uint8_t *pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride, 
-						int32_t *pFrameSad, int32_t *pSad8x8, int32_t *pSd8x8, uint8_t *pMad8x8)
-{
-	uint8_t *tmp_ref = pRefData;
-	uint8_t *tmp_cur = pCurData;
-	int32_t iMbWidth = (iPicWidth >> 4);
-	int32_t mb_heigth = (iPicHeight >> 4);
-	int32_t mb_index = 0;
-	int32_t pic_stride_x8 = iPicStride << 3;
-	int32_t step = (iPicStride << 4) - iPicWidth;
-
-	*pFrameSad = 0;
-	for (int32_t i = 0; i < mb_heigth; i ++)
-	{
-		for (int32_t j = 0; j < iMbWidth; j ++)
-		{
-			int32_t k, l;
-			int32_t l_sad,l_sd,l_mad;
-			uint8_t *tmp_cur_row;
-			uint8_t *tmp_ref_row;
-
-			l_mad = l_sd = l_sad =  0;
-			tmp_cur_row = tmp_cur;
-			tmp_ref_row = tmp_ref;
-			for (k = 0; k < 8; k ++)
-			{
-				for (l = 0; l < 8; l ++)
-				{
-					int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
-					int32_t abs_diff = WELS_ABS(diff);
-					l_sd += diff;
-					l_sad += abs_diff;
-					if (abs_diff>l_mad)
-					{
-						l_mad = abs_diff;
-					}
-				}
-				tmp_cur_row += iPicStride;
-				tmp_ref_row += iPicStride;
-			}
-			*pFrameSad += l_sad;
-			pSad8x8[(mb_index << 2) + 0] = l_sad;
-			pSd8x8[(mb_index << 2) + 0] = l_sd;
-			pMad8x8[(mb_index << 2) + 0] = l_mad;
-
-			l_mad = l_sd = l_sad =  0;
-			tmp_cur_row = tmp_cur + 8;
-			tmp_ref_row = tmp_ref + 8;
-			for (k = 0; k < 8; k ++)
-			{
-				for (l = 0; l < 8; l ++)
-				{
-					int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
-					int32_t abs_diff = WELS_ABS(diff);
-					l_sd += diff;
-					l_sad += abs_diff;
-					if (abs_diff>l_mad)
-					{
-						l_mad = abs_diff;
-					}
-				}
-				tmp_cur_row += iPicStride;
-				tmp_ref_row += iPicStride;
-			}
-			*pFrameSad += l_sad;
-			pSad8x8[(mb_index << 2) + 1] = l_sad;
-			pSd8x8[(mb_index << 2) + 1] = l_sd;
-			pMad8x8[(mb_index << 2) + 1] = l_mad;
-
-			l_mad = l_sd = l_sad =  0;
-			tmp_cur_row = tmp_cur + pic_stride_x8;
-			tmp_ref_row = tmp_ref + pic_stride_x8;
-			for (k = 0; k < 8; k ++)
-			{
-				for (l = 0; l < 8; l ++)
-				{
-					int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
-					int32_t abs_diff = WELS_ABS(diff);
-					l_sd += diff;
-					l_sad += abs_diff;
-					if (abs_diff>l_mad)
-					{
-						l_mad = abs_diff;
-					}
-				}
-				tmp_cur_row += iPicStride;
-				tmp_ref_row += iPicStride;
-			}
-			*pFrameSad += l_sad;
-			pSad8x8[(mb_index << 2) + 2] = l_sad;
-			pSd8x8[(mb_index << 2) + 2] = l_sd;
-			pMad8x8[(mb_index << 2) + 2] = l_mad;
-
-			l_mad = l_sd = l_sad =  0;
-			tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
-			tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
-			for (k = 0; k < 8; k ++)
-			{
-				for (l = 0; l < 8; l ++)
-				{
-					int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
-					int32_t abs_diff = WELS_ABS(diff);
-					l_sd += diff;
-					l_sad += abs_diff;
-					if (abs_diff>l_mad)
-					{
-						l_mad = abs_diff;
-					}
-				}
-				tmp_cur_row += iPicStride;
-				tmp_ref_row += iPicStride;
-			}
-			*pFrameSad += l_sad;
-			pSad8x8[(mb_index << 2) + 3] = l_sad;
-			pSd8x8[(mb_index << 2) + 3] = l_sd;
-			pMad8x8[(mb_index << 2) + 3] = l_mad;
-
-			tmp_ref += 16;
-			tmp_cur += 16;
-			++mb_index;
-		}
-		tmp_ref += step;
-		tmp_cur += step;
-	}
-}
-
-WELSVP_NAMESPACE_END
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "../common/typedef.h"
+#include "../common/util.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+void VAACalcSadSsd_c (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
+                      int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* psqsum16x16, int32_t* psqdiff16x16) {
+  uint8_t* tmp_ref = pRefData;
+  uint8_t* tmp_cur = pCurData;
+  int32_t iMbWidth = (iPicWidth >> 4);   // whole 16x16 macroblocks per row
+  int32_t mb_heigth = (iPicHeight >> 4); // whole 16x16 macroblock rows
+  int32_t mb_index = 0;                  // raster index of the current macroblock
+  int32_t pic_stride_x8 = iPicStride << 3;
+  int32_t step = (iPicStride << 4) - iMbWidth * 16; // to the next macroblock row; uses the walked width, not iPicWidth, so a width not divisible by 16 cannot skew rows
+  // Per macroblock mb and 8x8 quadrant q: SAD -> pSad8x8[4 * mb + q]; pixel sum, squared-pixel sum and squared-difference sum -> pSum16x16 / psqsum16x16 / psqdiff16x16[mb].
+  *pFrameSad = 0;
+  for (int32_t i = 0; i < mb_heigth; i ++) {
+    for (int32_t j = 0; j < iMbWidth; j ++) {
+      int32_t k, l;
+      int32_t l_sad, l_sqdiff, l_sum, l_sqsum;
+      uint8_t* tmp_cur_row;
+      uint8_t* tmp_ref_row;
+      // The 16x16 totals are accumulated from the four quadrants below, so clear them first.
+      pSum16x16[mb_index] = 0;
+      psqsum16x16[mb_index] = 0;
+      psqdiff16x16[mb_index] = 0;
+      // Quadrant 0: top-left 8x8.
+      l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
+      tmp_cur_row = tmp_cur;
+      tmp_ref_row = tmp_ref;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+          l_sqdiff += diff * diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 0] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      psqdiff16x16[mb_index] += l_sqdiff;
+      // Quadrant 1: top-right 8x8 (8 pixels to the right).
+      l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
+      tmp_cur_row = tmp_cur + 8;
+      tmp_ref_row = tmp_ref + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+          l_sqdiff += diff * diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 1] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      psqdiff16x16[mb_index] += l_sqdiff;
+      // Quadrant 2: bottom-left 8x8 (8 rows down).
+      l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
+      tmp_cur_row = tmp_cur + pic_stride_x8;
+      tmp_ref_row = tmp_ref + pic_stride_x8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+          l_sqdiff += diff * diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 2] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      psqdiff16x16[mb_index] += l_sqdiff;
+      // Quadrant 3: bottom-right 8x8 (8 rows down, 8 pixels to the right).
+      l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
+      tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
+      tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+          l_sqdiff += diff * diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 3] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      psqdiff16x16[mb_index] += l_sqdiff;
+
+      // Move on to the next macroblock in this row.
+      tmp_ref += 16;
+      tmp_cur += 16;
+      ++mb_index;
+    }
+    tmp_ref += step;
+    tmp_cur += step;
+  }
+}
+void VAACalcSadVar_c (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
+                      int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* psqsum16x16) {
+  uint8_t* tmp_ref = pRefData;
+  uint8_t* tmp_cur = pCurData;
+  int32_t iMbWidth = (iPicWidth >> 4);   // whole 16x16 macroblocks per row
+  int32_t mb_heigth = (iPicHeight >> 4); // whole 16x16 macroblock rows
+  int32_t mb_index = 0;                  // raster index of the current macroblock
+  int32_t pic_stride_x8 = iPicStride << 3;
+  int32_t step = (iPicStride << 4) - iMbWidth * 16; // to the next macroblock row; uses the walked width, not iPicWidth, so a width not divisible by 16 cannot skew rows
+  // Per macroblock mb and 8x8 quadrant q: SAD -> pSad8x8[4 * mb + q]; pixel sum and squared-pixel sum of the current picture -> pSum16x16 / psqsum16x16[mb].
+  *pFrameSad = 0;
+  for (int32_t i = 0; i < mb_heigth; i ++) {
+    for (int32_t j = 0; j < iMbWidth; j ++) {
+      int32_t k, l;
+      int32_t l_sad, l_sum, l_sqsum;
+      uint8_t* tmp_cur_row;
+      uint8_t* tmp_ref_row;
+      // The 16x16 totals are accumulated from the four quadrants below, so clear them first.
+      pSum16x16[mb_index] = 0;
+      psqsum16x16[mb_index] = 0;
+      // Quadrant 0: top-left 8x8.
+      l_sad =  l_sum =  l_sqsum = 0;
+      tmp_cur_row = tmp_cur;
+      tmp_ref_row = tmp_ref;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 0] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      // Quadrant 1: top-right 8x8 (8 pixels to the right).
+      l_sad =  l_sum =  l_sqsum = 0;
+      tmp_cur_row = tmp_cur + 8;
+      tmp_ref_row = tmp_ref + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 1] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      // Quadrant 2: bottom-left 8x8 (8 rows down).
+      l_sad =  l_sum =  l_sqsum = 0;
+      tmp_cur_row = tmp_cur + pic_stride_x8;
+      tmp_ref_row = tmp_ref + pic_stride_x8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 2] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      // Quadrant 3: bottom-right 8x8 (8 rows down, 8 pixels to the right).
+      l_sad =  l_sum =  l_sqsum = 0;
+      tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
+      tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 3] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+
+      // Move on to the next macroblock in this row.
+      tmp_ref += 16;
+      tmp_cur += 16;
+      ++mb_index;
+    }
+    tmp_ref += step;
+    tmp_cur += step;
+  }
+}
+
+
+void VAACalcSad_c (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
+                   int32_t* pFrameSad, int32_t* pSad8x8) {
+  uint8_t* tmp_ref = pRefData;
+  uint8_t* tmp_cur = pCurData;
+  int32_t iMbWidth = (iPicWidth >> 4);   // whole 16x16 macroblocks per row
+  int32_t mb_heigth = (iPicHeight >> 4); // whole 16x16 macroblock rows
+  int32_t mb_index = 0;                  // raster index of the current macroblock
+  int32_t pic_stride_x8 = iPicStride << 3;
+  int32_t step = (iPicStride << 4) - iMbWidth * 16; // to the next macroblock row; uses the walked width, not iPicWidth, so a width not divisible by 16 cannot skew rows
+  // Per macroblock mb and 8x8 quadrant q: SAD of current vs reference -> pSad8x8[4 * mb + q]; *pFrameSad is the sum of all of them.
+  *pFrameSad = 0;
+  for (int32_t i = 0; i < mb_heigth; i ++) {
+    for (int32_t j = 0; j < iMbWidth; j ++) {
+      int32_t k, l;
+      int32_t l_sad;
+      uint8_t* tmp_cur_row;
+      uint8_t* tmp_ref_row;
+      // Quadrant 0: top-left 8x8.
+      l_sad =  0;
+      tmp_cur_row = tmp_cur;
+      tmp_ref_row = tmp_ref;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 0] = l_sad;
+      // Quadrant 1: top-right 8x8 (8 pixels to the right).
+      l_sad =  0;
+      tmp_cur_row = tmp_cur + 8;
+      tmp_ref_row = tmp_ref + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 1] = l_sad;
+      // Quadrant 2: bottom-left 8x8 (8 rows down).
+      l_sad =  0;
+      tmp_cur_row = tmp_cur + pic_stride_x8;
+      tmp_ref_row = tmp_ref + pic_stride_x8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 2] = l_sad;
+      // Quadrant 3: bottom-right 8x8 (8 rows down, 8 pixels to the right).
+      l_sad =  0;
+      tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
+      tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 3] = l_sad;
+      // Move on to the next macroblock in this row.
+      tmp_ref += 16;
+      tmp_cur += 16;
+      ++mb_index;
+    }
+    tmp_ref += step;
+    tmp_cur += step;
+  }
+}
+
+void VAACalcSadSsdBgd_c (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
+                         int32_t iPicStride,
+                         int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* psqsum16x16, int32_t* psqdiff16x16, int32_t* pSd8x8,
+                         uint8_t* pMad8x8)
+// Per 8x8 quadrant: SAD, signed difference sum and maximum absolute difference; per 16x16 macroblock: pixel sum, squared-pixel sum and squared-difference sum.
+{
+  uint8_t* tmp_ref = pRefData;
+  uint8_t* tmp_cur = pCurData;
+  int32_t iMbWidth = (iPicWidth >> 4);   // whole 16x16 macroblocks per row
+  int32_t mb_heigth = (iPicHeight >> 4); // whole 16x16 macroblock rows
+  int32_t mb_index = 0;                  // raster index of the current macroblock
+  int32_t pic_stride_x8 = iPicStride << 3;
+  int32_t step = (iPicStride << 4) - iMbWidth * 16; // to the next macroblock row; uses the walked width, not iPicWidth, so a width not divisible by 16 cannot skew rows
+  // 8x8 outputs are indexed [4 * mb + q]; 16x16 outputs are indexed [mb]; *pFrameSad is the sum of every 8x8 SAD.
+  *pFrameSad = 0;
+  for (int32_t i = 0; i < mb_heigth; i ++) {
+    for (int32_t j = 0; j < iMbWidth; j ++) {
+      int32_t k, l;
+      int32_t l_sad, l_sqdiff, l_sum, l_sqsum, l_sd, l_mad;
+      uint8_t* tmp_cur_row;
+      uint8_t* tmp_ref_row;
+      // The 16x16 totals are accumulated from the four quadrants below, so clear them first.
+      pSum16x16[mb_index] = 0;
+      psqsum16x16[mb_index] = 0;
+      psqdiff16x16[mb_index] = 0;
+      // Quadrant 0: top-left 8x8.
+      l_sd = l_mad = l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
+      tmp_cur_row = tmp_cur;
+      tmp_ref_row = tmp_ref;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
+          int32_t abs_diff = WELS_ABS (diff);
+          // l_sd keeps the sign (current minus reference); l_mad tracks the largest absolute difference.
+          l_sd += diff;
+          if (abs_diff > l_mad) {
+            l_mad = abs_diff;
+          }
+          l_sad += abs_diff;
+          l_sqdiff += abs_diff * abs_diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 0] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      psqdiff16x16[mb_index] += l_sqdiff;
+      pSd8x8[ (mb_index << 2) + 0] = l_sd;
+      pMad8x8[ (mb_index << 2) + 0] = l_mad;
+
+      // Quadrant 1: top-right 8x8 (8 pixels to the right).
+      l_sd = l_mad = l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
+      tmp_cur_row = tmp_cur + 8;
+      tmp_ref_row = tmp_ref + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
+          int32_t abs_diff = WELS_ABS (diff);
+
+          l_sd += diff;
+          if (abs_diff > l_mad) {
+            l_mad = abs_diff;
+          }
+          l_sad += abs_diff;
+          l_sqdiff += abs_diff * abs_diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 1] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      psqdiff16x16[mb_index] += l_sqdiff;
+      pSd8x8[ (mb_index << 2) + 1] = l_sd;
+      pMad8x8[ (mb_index << 2) + 1] = l_mad;
+      // Quadrant 2: bottom-left 8x8 (8 rows down).
+      l_sd = l_mad = l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
+      tmp_cur_row = tmp_cur + pic_stride_x8;
+      tmp_ref_row = tmp_ref + pic_stride_x8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
+          int32_t abs_diff = WELS_ABS (diff);
+
+          l_sd += diff;
+          if (abs_diff > l_mad) {
+            l_mad = abs_diff;
+          }
+          l_sad += abs_diff;
+          l_sqdiff += abs_diff * abs_diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 2] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      psqdiff16x16[mb_index] += l_sqdiff;
+      pSd8x8[ (mb_index << 2) + 2] = l_sd;
+      pMad8x8[ (mb_index << 2) + 2] = l_mad;
+      // Quadrant 3: bottom-right 8x8 (8 rows down, 8 pixels to the right).
+      l_sd = l_mad = l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
+      tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
+      tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
+          int32_t abs_diff = WELS_ABS (diff);
+
+          l_sd += diff;
+          if (abs_diff > l_mad) {
+            l_mad = abs_diff;
+          }
+          l_sad += abs_diff;
+          l_sqdiff += abs_diff * abs_diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 3] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      psqdiff16x16[mb_index] += l_sqdiff;
+      pSd8x8[ (mb_index << 2) + 3] = l_sd;
+      pMad8x8[ (mb_index << 2) + 3] = l_mad;
+      // Move on to the next macroblock in this row.
+      tmp_ref += 16;
+      tmp_cur += 16;
+      ++mb_index;
+    }
+    tmp_ref += step;
+    tmp_cur += step;
+  }
+}
+
+void VAACalcSadBgd_c (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
+                      int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSd8x8, uint8_t* pMad8x8) {
+  uint8_t* tmp_ref = pRefData;
+  uint8_t* tmp_cur = pCurData;
+  int32_t iMbWidth = (iPicWidth >> 4);   // whole 16x16 macroblocks per row
+  int32_t mb_heigth = (iPicHeight >> 4); // whole 16x16 macroblock rows
+  int32_t mb_index = 0;                  // raster index of the current macroblock
+  int32_t pic_stride_x8 = iPicStride << 3;
+  int32_t step = (iPicStride << 4) - iMbWidth * 16; // to the next macroblock row; uses the walked width, not iPicWidth, so a width not divisible by 16 cannot skew rows
+  // Per macroblock mb and 8x8 quadrant q, at index [4 * mb + q]: SAD -> pSad8x8, signed difference sum -> pSd8x8, maximum absolute difference -> pMad8x8.
+  *pFrameSad = 0;
+  for (int32_t i = 0; i < mb_heigth; i ++) {
+    for (int32_t j = 0; j < iMbWidth; j ++) {
+      int32_t k, l;
+      int32_t l_sad, l_sd, l_mad;
+      uint8_t* tmp_cur_row;
+      uint8_t* tmp_ref_row;
+      // Quadrant 0: top-left 8x8.
+      l_mad = l_sd = l_sad =  0;
+      tmp_cur_row = tmp_cur;
+      tmp_ref_row = tmp_ref;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
+          int32_t abs_diff = WELS_ABS (diff);
+          l_sd += diff;
+          l_sad += abs_diff;
+          if (abs_diff > l_mad) {
+            l_mad = abs_diff;
+          }
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 0] = l_sad;
+      pSd8x8[ (mb_index << 2) + 0] = l_sd;
+      pMad8x8[ (mb_index << 2) + 0] = l_mad;
+      // Quadrant 1: top-right 8x8 (8 pixels to the right).
+      l_mad = l_sd = l_sad =  0;
+      tmp_cur_row = tmp_cur + 8;
+      tmp_ref_row = tmp_ref + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
+          int32_t abs_diff = WELS_ABS (diff);
+          l_sd += diff;
+          l_sad += abs_diff;
+          if (abs_diff > l_mad) {
+            l_mad = abs_diff;
+          }
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 1] = l_sad;
+      pSd8x8[ (mb_index << 2) + 1] = l_sd;
+      pMad8x8[ (mb_index << 2) + 1] = l_mad;
+      // Quadrant 2: bottom-left 8x8 (8 rows down).
+      l_mad = l_sd = l_sad =  0;
+      tmp_cur_row = tmp_cur + pic_stride_x8;
+      tmp_ref_row = tmp_ref + pic_stride_x8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
+          int32_t abs_diff = WELS_ABS (diff);
+          l_sd += diff;
+          l_sad += abs_diff;
+          if (abs_diff > l_mad) {
+            l_mad = abs_diff;
+          }
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 2] = l_sad;
+      pSd8x8[ (mb_index << 2) + 2] = l_sd;
+      pMad8x8[ (mb_index << 2) + 2] = l_mad;
+      // Quadrant 3: bottom-right 8x8 (8 rows down, 8 pixels to the right).
+      l_mad = l_sd = l_sad =  0;
+      tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
+      tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
+          int32_t abs_diff = WELS_ABS (diff);
+          l_sd += diff;
+          l_sad += abs_diff;
+          if (abs_diff > l_mad) {
+            l_mad = abs_diff;
+          }
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 3] = l_sad;
+      pSd8x8[ (mb_index << 2) + 3] = l_sd;
+      pMad8x8[ (mb_index << 2) + 3] = l_mad;
+      // Move on to the next macroblock in this row.
+      tmp_ref += 16;
+      tmp_cur += 16;
+      ++mb_index;
+    }
+    tmp_ref += step;
+    tmp_cur += step;
+  }
+}
+
+WELSVP_NAMESPACE_END
--- a/processing/src/vaacalc/vaacalculation.cpp
+++ b/processing/src/vaacalc/vaacalculation.cpp
@@ -1,139 +1,123 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "vaacalculation.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-CVAACalculation::CVAACalculation(int32_t iCpuFlag)
-{
-	m_iCPUFlag = iCpuFlag;
-	m_eMethod   = METHOD_VAA_STATISTICS;
-
-	WelsMemset(&m_sCalcParam, 0, sizeof(m_sCalcParam));
-	WelsMemset(&m_sVaaFuncs, 0, sizeof(m_sVaaFuncs));
-	InitVaaFuncs(m_sVaaFuncs, m_iCPUFlag);
-}
-
-CVAACalculation::~CVAACalculation()
-{	
-}
-
-void CVAACalculation::InitVaaFuncs(SVaaFuncs &sVaaFuncs, int32_t iCpuFlag)
-{
-	sVaaFuncs.pfVAACalcSad				= VAACalcSad_c;
-	sVaaFuncs.pfVAACalcSadBgd			= VAACalcSadBgd_c;
-	sVaaFuncs.pfVAACalcSadSsd			= VAACalcSadSsd_c;
-	sVaaFuncs.pfVAACalcSadSsdBgd		= VAACalcSadSsdBgd_c;
-	sVaaFuncs.pfVAACalcSadVar			= VAACalcSadVar_c;
-#ifdef X86_ASM
-	if ( (iCpuFlag & WELS_CPU_SSE2) == WELS_CPU_SSE2 )
-	{
-		sVaaFuncs.pfVAACalcSad			= VAACalcSad_sse2;
-		sVaaFuncs.pfVAACalcSadBgd		= VAACalcSadBgd_sse2;
-		sVaaFuncs.pfVAACalcSadSsd		= VAACalcSadSsd_sse2;
-		sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_sse2;
-		sVaaFuncs.pfVAACalcSadVar		= VAACalcSadVar_sse2;
-	}
-#endif//X86_ASM
-}
-
-EResult CVAACalculation::Process(int32_t iType, SPixMap *pSrcPixMap, SPixMap *pRefPixMap)
-{
-	uint8_t *pCurData	= (uint8_t *)pSrcPixMap->pPixel[0];
-	uint8_t *pRefData	= (uint8_t *)pRefPixMap->pPixel[0];
-	int32_t iPicWidth	= pSrcPixMap->sRect.iRectWidth;
-	int32_t iPicHeight	= pSrcPixMap->sRect.iRectHeight;
-	int32_t iPicStride	= pSrcPixMap->iStride[0];
-	
-	SVAACalcResult *pResult = m_sCalcParam.pCalcResult;
-
-	if (pCurData == NULL || pRefData == NULL)
-	{
-		return RET_INVALIDPARAM;
-	}
-
-	pResult->pCurY = pCurData;
-	pResult->pRefY = pRefData;
-	if (m_sCalcParam.iCalcBgd)
-	{
-		if (m_sCalcParam.iCalcSsd)
-		{
-			m_sVaaFuncs.pfVAACalcSadSsdBgd(pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad, 
-				(int32_t*)pResult->pSad8x8, pResult->pSum16x16, pResult->pSumOfSquare16x16, pResult->pSsd16x16, 
-				(int32_t*)pResult->pSumOfDiff8x8, (uint8_t*)pResult->pMad8x8);
-		}
-		else
-		{
-			m_sVaaFuncs.pfVAACalcSadBgd(pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
-				(int32_t*)(pResult->pSad8x8), (int32_t*)(pResult->pSumOfDiff8x8), (uint8_t*)pResult->pMad8x8);
-		}
-	}
-	else
-	{
-		if (m_sCalcParam.iCalcSsd)
-		{
-			m_sVaaFuncs.pfVAACalcSadSsd(pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
-				(int32_t*)pResult->pSad8x8, pResult->pSum16x16, pResult->pSumOfSquare16x16, pResult->pSsd16x16);
-		}else{
-			if (m_sCalcParam.iCalcVar)
-			{
-				m_sVaaFuncs.pfVAACalcSadVar(pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
-					(int32_t*)pResult->pSad8x8, pResult->pSum16x16, pResult->pSumOfSquare16x16);
-			}else{
-				m_sVaaFuncs.pfVAACalcSad(pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
-					(int32_t*)pResult->pSad8x8);
-			}			
-		}
-	}
-
-	return RET_SUCCESS;
-}
-
-EResult CVAACalculation::Set(int32_t iType, void *pParam)
-{
-	if (pParam == NULL || ((SVAACalcParam*)pParam)->pCalcResult == NULL)
-	{
-		return RET_INVALIDPARAM;
-	}
-
-	m_sCalcParam = *(SVAACalcParam*)pParam;
-
-	return RET_SUCCESS;
-}
-
-
-WELSVP_NAMESPACE_END
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "vaacalculation.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+CVAACalculation::CVAACalculation (int32_t iCpuFlag) {
+  m_iCPUFlag = iCpuFlag;
+  m_eMethod   = METHOD_VAA_STATISTICS;
+  // Start with an all-zero parameter block and kernel table, then let InitVaaFuncs fill the table for iCpuFlag.
+  WelsMemset (&m_sCalcParam, 0, sizeof (m_sCalcParam));
+  WelsMemset (&m_sVaaFuncs, 0, sizeof (m_sVaaFuncs));
+  InitVaaFuncs (m_sVaaFuncs, m_iCPUFlag);
+}
+
+CVAACalculation::~CVAACalculation() { // empty body: no cleanup is performed here
+}
+
+void CVAACalculation::InitVaaFuncs (SVaaFuncs& sVaaFuncs, int32_t iCpuFlag) { // fill the kernel table for this CPU
+  sVaaFuncs.pfVAACalcSad = VAACalcSad_c; // the plain C kernels are always installed first
+  sVaaFuncs.pfVAACalcSadVar = VAACalcSadVar_c;
+  sVaaFuncs.pfVAACalcSadSsd = VAACalcSadSsd_c;
+  sVaaFuncs.pfVAACalcSadBgd = VAACalcSadBgd_c;
+  sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_c;
+#ifdef X86_ASM
+  if ((iCpuFlag & WELS_CPU_SSE2) == WELS_CPU_SSE2) { // all bits of WELS_CPU_SSE2 set: override every slot with its SSE2 kernel
+    sVaaFuncs.pfVAACalcSad = VAACalcSad_sse2;
+    sVaaFuncs.pfVAACalcSadVar = VAACalcSadVar_sse2;
+    sVaaFuncs.pfVAACalcSadSsd = VAACalcSadSsd_sse2;
+    sVaaFuncs.pfVAACalcSadBgd = VAACalcSadBgd_sse2;
+    sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_sse2;
+  }
+#endif//X86_ASM
+}
+
+EResult CVAACalculation::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
+  uint8_t* pCurData = (uint8_t*)pSrcPixMap->pPixel[0];
+  uint8_t* pRefData = (uint8_t*)pRefPixMap->pPixel[0];
+  int32_t iPicWidth = pSrcPixMap->sRect.iRectWidth;   // geometry is read from the source map only;
+  int32_t iPicHeight = pSrcPixMap->sRect.iRectHeight; // the reference plane is walked with the same
+  int32_t iPicStride = pSrcPixMap->iStride[0];        // width, height and stride
+  // Runs one statistics kernel over plane 0 of both maps, selected by iCalcBgd / iCalcSsd / iCalcVar. iType is not used here.
+  SVAACalcResult* pResult = m_sCalcParam.pCalcResult;
+  // pCalcResult is still NULL if Set() has not accepted a parameter block yet (the constructor zeroes m_sCalcParam): reject that instead of dereferencing it.
+  if (pCurData == NULL || pRefData == NULL || pResult == NULL) {
+    return RET_INVALIDPARAM;
+  }
+  // Record which planes the statistics refer to before filling in the result buffers.
+  pResult->pCurY = pCurData;
+  pResult->pRefY = pRefData;
+  if (m_sCalcParam.iCalcBgd) { // Bgd kernels additionally fill pSumOfDiff8x8 and pMad8x8
+    if (m_sCalcParam.iCalcSsd) {
+      m_sVaaFuncs.pfVAACalcSadSsdBgd (pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
+                                      (int32_t*)pResult->pSad8x8, pResult->pSum16x16, pResult->pSumOfSquare16x16, pResult->pSsd16x16,
+                                      (int32_t*)pResult->pSumOfDiff8x8, (uint8_t*)pResult->pMad8x8);
+    } else {
+      m_sVaaFuncs.pfVAACalcSadBgd (pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
+                                   (int32_t*) (pResult->pSad8x8), (int32_t*) (pResult->pSumOfDiff8x8), (uint8_t*)pResult->pMad8x8);
+    }
+  } else {
+    if (m_sCalcParam.iCalcSsd) { // iCalcSsd takes priority over iCalcVar
+      m_sVaaFuncs.pfVAACalcSadSsd (pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
+                                   (int32_t*)pResult->pSad8x8, pResult->pSum16x16, pResult->pSumOfSquare16x16, pResult->pSsd16x16);
+    } else {
+      if (m_sCalcParam.iCalcVar) {
+        m_sVaaFuncs.pfVAACalcSadVar (pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
+                                     (int32_t*)pResult->pSad8x8, pResult->pSum16x16, pResult->pSumOfSquare16x16);
+      } else {
+        m_sVaaFuncs.pfVAACalcSad (pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
+                                  (int32_t*)pResult->pSad8x8);
+      }
+    }
+  }
+
+  return RET_SUCCESS;
+}
+
+EResult CVAACalculation::Set (int32_t iType, void* pParam) {
+  // Copies the caller's SVAACalcParam into m_sCalcParam; a NULL param or a NULL pCalcResult is rejected. iType is not used here.
+  SVAACalcParam* pCalcParam = (SVAACalcParam*)pParam;
+  if (pCalcParam == NULL || pCalcParam->pCalcResult == NULL) {
+    return RET_INVALIDPARAM;
+  }
+  m_sCalcParam = *pCalcParam;
+  return RET_SUCCESS;
+}
+
+
+WELSVP_NAMESPACE_END
--- a/processing/src/vaacalc/vaacalculation.h
+++ b/processing/src/vaacalc/vaacalculation.h
@@ -1,122 +1,125 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	    :  vaacalculation.h
- *
- * \brief	    :  pVaa calculation class of wels video processor class
- *
- * \date        :  2011/03/18
- *
- * \description :  1. rewrite the package code of pVaa calculation class  
- *
- *************************************************************************************
- */
-
-#ifndef _WELSVP_VAACALCULATION_H
-#define _WELSVP_VAACALCULATION_H
-
-#include "../common/util.h"
-#include "../common/memory.h"
-#include "../common/WelsFrameWork.h"
-#include "../../interface/IWelsVP.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-typedef void (VAACalcSadBgdFunc)( uint8_t *pCurData, uint8_t *pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
-								int32_t *pFrameSad, int32_t *pSad8x8, int32_t *pSd8x8, uint8_t *pMad8x8);
-
-typedef void (VAACalcSadSsdBgdFunc)(uint8_t *pCurData, uint8_t *pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
-								int32_t *pFrameSad, int32_t *pSad8x8, int32_t *pSum16x16, int32_t *pSumSquare16x16, 
-												int32_t *pSsd16x16, int32_t *pSd8x8, uint8_t *pMad8x8);
-
-typedef void (VAACalcSadFunc)( uint8_t *pCurData, uint8_t *pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
-								int32_t *pFrameSad, int32_t *pSad8x8);
-
-typedef void (VAACalcSadVarFunc)( uint8_t *pCurData, uint8_t *pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
-								int32_t *pFrameSad, int32_t *pSad8x8, int32_t *pSum16x16, int32_t *pSumSquare16x16);
-
-typedef void (VAACalcSadSsdFunc)(uint8_t *pCurData, uint8_t *pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
-								int32_t *pFrameSad, int32_t *pSad8x8, int32_t *pSum16x16, int32_t *pSumSquare16x16, int32_t *pSsd16x16);
-
-
-typedef VAACalcSadBgdFunc		* PVAACalcSadBgdFunc;
-typedef VAACalcSadSsdBgdFunc	* PVAACalcSadSsdBgdFunc;
-typedef VAACalcSadFunc			* PVAACalcSadFunc;
-typedef VAACalcSadVarFunc		* PVAACalcSadVarFunc;
-typedef VAACalcSadSsdFunc		* PVAACalcSadSsdFunc;
-
-typedef  struct TagVaaFuncs 
-{
-	PVAACalcSadBgdFunc		pfVAACalcSadBgd;
-	PVAACalcSadSsdBgdFunc	pfVAACalcSadSsdBgd;
-	PVAACalcSadFunc			pfVAACalcSad;
-	PVAACalcSadVarFunc		pfVAACalcSadVar;
-	PVAACalcSadSsdFunc		pfVAACalcSadSsd;
-} SVaaFuncs;
-
-
-VAACalcSadBgdFunc		VAACalcSadBgd_c;
-VAACalcSadSsdBgdFunc	VAACalcSadSsdBgd_c;
-VAACalcSadFunc			    VAACalcSad_c;
-VAACalcSadVarFunc		VAACalcSadVar_c;
-VAACalcSadSsdFunc		VAACalcSadSsd_c;
-
-
-#ifdef X86_ASM
-WELSVP_EXTERN_C_BEGIN
-	VAACalcSadBgdFunc		VAACalcSadBgd_sse2;
-	VAACalcSadSsdBgdFunc	VAACalcSadSsdBgd_sse2;
-	VAACalcSadFunc			    VAACalcSad_sse2;
-	VAACalcSadVarFunc		VAACalcSadVar_sse2;
-	VAACalcSadSsdFunc		VAACalcSadSsd_sse2;
-WELSVP_EXTERN_C_END
-#endif
-
-class CVAACalculation : public IStrategy
-{			  
-public:
-	CVAACalculation(int32_t iCpuFlag);
-	~CVAACalculation();
-
-	EResult Process(int32_t iType, SPixMap *pCurPixMap, SPixMap *pRefPixMap);
-	EResult Set    (int32_t iType, void *pParam); 
-
-private:
-	void InitVaaFuncs(SVaaFuncs &sVaaFunc, int32_t iCpuFlag);
-
-private:
-	SVaaFuncs      m_sVaaFuncs;
-	int32_t       m_iCPUFlag;
-	SVAACalcParam m_sCalcParam;
-};	
-
-WELSVP_NAMESPACE_END
-
-#endif
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	    :  vaacalculation.h
+ *
+ * \brief	    :  pVaa calculation class of wels video processor class
+ *
+ * \date        :  2011/03/18
+ *
+ * \description :  1. rewrite the package code of pVaa calculation class
+ *
+ *************************************************************************************
+ */
+
+#ifndef _WELSVP_VAACALCULATION_H
+#define _WELSVP_VAACALCULATION_H
+
+#include "../common/util.h"
+#include "../common/memory.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+typedef void (VAACalcSadBgdFunc) (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
+                                  int32_t iPicStride, // signature of VAACalcSadBgd_c (and VAACalcSadBgd_sse2 under X86_ASM)
+                                  int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSd8x8, uint8_t* pMad8x8); // Process passes pResult->pSumOfDiff8x8 as pSd8x8
+
+typedef void (VAACalcSadSsdBgdFunc) (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
+                                     int32_t iPicStride, // signature of VAACalcSadSsdBgd_c (and VAACalcSadSsdBgd_sse2 under X86_ASM)
+                                     int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* pSumSquare16x16, // Process passes pResult->pSumOfSquare16x16 as pSumSquare16x16
+                                     int32_t* pSsd16x16, int32_t* pSd8x8, uint8_t* pMad8x8); // Process passes pResult->pSumOfDiff8x8 as pSd8x8
+
+typedef void (VAACalcSadFunc) (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
+                               int32_t iPicStride, // signature of VAACalcSad_c (and VAACalcSad_sse2 under X86_ASM)
+                               int32_t* pFrameSad, int32_t* pSad8x8); // Process passes &pResult->iFrameSad and pResult->pSad8x8 cast to int32_t*
+
+typedef void (VAACalcSadVarFunc) (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
+                                  int32_t iPicStride, // signature of VAACalcSadVar_c (and VAACalcSadVar_sse2 under X86_ASM)
+                                  int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* pSumSquare16x16); // Process passes pResult->pSumOfSquare16x16 as pSumSquare16x16
+
+typedef void (VAACalcSadSsdFunc) (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
+                                  int32_t iPicStride, // signature of VAACalcSadSsd_c (and VAACalcSadSsd_sse2 under X86_ASM)
+                                  int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* pSumSquare16x16, int32_t* pSsd16x16); // Process passes pResult->pSumOfSquare16x16 as pSumSquare16x16
+
+
+typedef VAACalcSadBgdFunc*		 PVAACalcSadBgdFunc;
+typedef VAACalcSadSsdBgdFunc*	 PVAACalcSadSsdBgdFunc;
+typedef VAACalcSadFunc*			 PVAACalcSadFunc;
+typedef VAACalcSadVarFunc*		 PVAACalcSadVarFunc;
+typedef VAACalcSadSsdFunc*		 PVAACalcSadSsdFunc;
+
+typedef  struct TagVaaFuncs { // one pointer per calculation variant; each note gives the m_sCalcParam flags under which Process calls it
+  PVAACalcSadBgdFunc		pfVAACalcSadBgd; // iCalcBgd set, iCalcSsd clear
+  PVAACalcSadSsdBgdFunc	pfVAACalcSadSsdBgd; // iCalcBgd and iCalcSsd both set
+  PVAACalcSadFunc			pfVAACalcSad; // iCalcBgd, iCalcSsd and iCalcVar all clear
+  PVAACalcSadVarFunc		pfVAACalcSadVar; // iCalcVar set, iCalcBgd and iCalcSsd clear
+  PVAACalcSadSsdFunc		pfVAACalcSadSsd; // iCalcSsd set, iCalcBgd clear
+} SVaaFuncs;
+
+
+VAACalcSadBgdFunc		VAACalcSadBgd_c;
+VAACalcSadSsdBgdFunc	VAACalcSadSsdBgd_c;
+VAACalcSadFunc			    VAACalcSad_c;
+VAACalcSadVarFunc		VAACalcSadVar_c;
+VAACalcSadSsdFunc		VAACalcSadSsd_c;
+
+
+#ifdef X86_ASM
+WELSVP_EXTERN_C_BEGIN
+VAACalcSadBgdFunc		VAACalcSadBgd_sse2;
+VAACalcSadSsdBgdFunc	VAACalcSadSsdBgd_sse2;
+VAACalcSadFunc			    VAACalcSad_sse2;
+VAACalcSadVarFunc		VAACalcSadVar_sse2;
+VAACalcSadSsdFunc		VAACalcSadSsd_sse2;
+WELSVP_EXTERN_C_END
+#endif
+
+class CVAACalculation : public IStrategy { // IStrategy implementation that dispatches each Process call to one routine in m_sVaaFuncs
+ public:
+  CVAACalculation (int32_t iCpuFlag);
+  ~CVAACalculation();
+
+  EResult Process (int32_t iType, SPixMap* pCurPixMap, SPixMap* pRefPixMap); // routine is picked from m_sCalcParam.iCalcBgd / iCalcSsd / iCalcVar
+  EResult Set (int32_t iType, void* pParam); // pParam is read as SVAACalcParam*; RET_INVALIDPARAM if it or its pCalcResult is NULL
+
+ private:
+  void InitVaaFuncs (SVaaFuncs& sVaaFunc, int32_t iCpuFlag); // NOTE(review): presumably selects _c vs _sse2 entries from iCpuFlag - confirm in the .cpp
+
+ private:
+  SVaaFuncs      m_sVaaFuncs; // function pointers invoked by Process
+  int32_t       m_iCPUFlag;
+  SVAACalcParam m_sCalcParam; // by-value copy of the parameter block last accepted by Set
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/test/simple_test.cpp
@@ -1,0 +1,41 @@
+#include <gtest/gtest.h>
+#if defined (WIN32)
+#include <windows.h>
+#include <tchar.h>
+#else
+#include <string.h>
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+
+#include "codec_def.h"
+#include "codec_app_def.h"
+#include "codec_api.h"
+
+class CodecTest : public ::testing::Test { // gtest fixture holding one ISVCDecoder
+ public:
+  CodecTest() : decoder_ (NULL) {} // NULL until SetUp obtains a decoder, so the destructor can tell
+
+  ~CodecTest() {
+    if (decoder_) DestroyDecoder (decoder_); // skipped when CreateDecoder never produced a decoder
+  }
+
+  void SetUp() {
+    long rv = CreateDecoder (&decoder_);
+    ASSERT_EQ (0, rv); // tests using this fixture require a zero return code...
+    ASSERT_TRUE (decoder_); // ...and a non-NULL decoder
+  }
+
+ protected:
+  ISVCDecoder* decoder_; // created in SetUp, destroyed in the destructor
+};
+
+TEST_F (CodecTest, JustInit) { // empty body: only the fixture's SetUp assertions are exercised
+}
+
+int main (int argc, char** argv) {
+  testing::InitGoogleTest (&argc, argv); // gtest is handed the command line before any test runs
+
+  return RUN_ALL_TESTS(); // the macro's result is used as the process exit code
+}
--- /dev/null
+++ b/test/targets.mk
@@ -1,0 +1,21 @@
+CODEC_UNITTEST_PREFIX=CODEC_UNITTEST
+CODEC_UNITTEST_SRCDIR=test
+CODEC_UNITTEST_CPP_SRCS=\
+	$(CODEC_UNITTEST_SRCDIR)/./simple_test.cpp\
+
+CODEC_UNITTEST_OBJS += $(CODEC_UNITTEST_CPP_SRCS:.cpp=.o)
+ifeq ($(USE_ASM), Yes)
+CODEC_UNITTEST_ASM_SRCS=\
+
+CODEC_UNITTEST_OBJS += $(CODEC_UNITTEST_ASM_SRCS:.asm=.o)
+endif
+
+OBJS += $(CODEC_UNITTEST_OBJS)
+$(CODEC_UNITTEST_SRCDIR)/./simple_test.o: $(CODEC_UNITTEST_SRCDIR)/./simple_test.cpp # recipe refers to these two paths through automatic variables
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(CODEC_UNITTEST_CFLAGS) $(CODEC_UNITTEST_INCLUDES) -c -o $@ $<
+
+codec_unittest: $(CODEC_UNITTEST_OBJS) $(LIBS) $(CODEC_UNITTEST_LIBS) # test binary; the libraries are prerequisites as well as link inputs
+	$(CXX) -o $@  $(CODEC_UNITTEST_OBJS) $(CODEC_UNITTEST_LDFLAGS) $(CODEC_UNITTEST_LIBS) $(LDFLAGS) $(LIBS)
+
+binaries: codec_unittest
+BINARIES += codec_unittest
--- a/testbin/AutoBuild_Windows_VS2008.bat
+++ b/testbin/AutoBuild_Windows_VS2008.bat
@@ -23,7 +23,7 @@
 rem call VP build
 echo "Welsvp Building....."
 cd %VPProjectDir%
-rem vcclean 
+rem vcclean
 %VCBUILDEXE% WelsVP_2008.vcproj
 
 
@@ -33,7 +33,7 @@
 
 cd %CurDir%
 cd %EncoderProjectDir%
-rem vcclean 
+rem vcclean
 %VCBUILDEXE% WelsEncCore.vcproj
 %VCBUILDEXE% WelsEncPlus.vcproj
 %VCBUILDEXE% encConsole.vcproj
@@ -44,7 +44,7 @@
 
 cd %CurDir%
 cd %DecoderProjectDir%
-rem vcclean 
+rem vcclean
 %VCBUILDEXE% WelsDecCore.vcproj
 %VCBUILDEXE% WelsDecPlus.vcproj
 %VCBUILDEXE% decConsole.vcproj
--- a/testbin/AutoBuild_Windows_VS2010.bat
+++ b/testbin/AutoBuild_Windows_VS2010.bat
@@ -36,7 +36,7 @@
 cd %CurDir%
 cd %EncoderProjectDir%
 echo current directory is %EncoderProjectDir%
-rem vcclean 
+rem vcclean
 
 echo %VCMSBUILDEXE_RELEASE% WelsEncoder_2010.sln
 %VCMSBUILDEXE_RELEASE% WelsEncoder_2010.sln
@@ -49,7 +49,7 @@
 cd %CurDir%
 cd %DecoderProjectDir%
 echo current directory is %DecoderProjectDir%
-rem vcclean 
+rem vcclean
 
 echo %VCMSBUILDEXE_RELEASE% WelsDecoder_2010.sln
 
--- a/testbin/AutoBuild_Windows_VS2012.bat
+++ b/testbin/AutoBuild_Windows_VS2012.bat
@@ -36,7 +36,7 @@
 cd %CurDir%
 cd %EncoderProjectDir%
 echo current directory is %EncoderProjectDir%
-rem vcclean 
+rem vcclean
 
 echo %VCMSBUILDEXE_RELEASE% WelsEncoder_2012.sln
 %VCMSBUILDEXE_RELEASE% WelsEncoder_2012.sln
@@ -49,7 +49,7 @@
 cd %CurDir%
 cd %DecoderProjectDir%
 echo current directory is %DecoderProjectDir%
-rem vcclean 
+rem vcclean
 
 echo %VCMSBUILDEXE_RELEASE% WelsDecoder_2012.sln
 
--- a/testbin/layer2.cfg
+++ b/testbin/layer2.cfg
@@ -1,39 +1,39 @@
-# Layer Configuration File
-
-
-#============================== INPUT / OUTPUT ==============================
-SourceWidth     320                     # Input  frame width
-SourceHeight    192                    # Input  frame height
-FrameRateIn     12                      # Input  frame rate [Hz]
-FrameRateOut    12                     # Output frame rate [Hz]
-InputFile       CiscoVT2people_320x192_12fps.yuv # Input  file
-ReconFile       rec_layer2.yuv          # Reconstructed file
-
-#============================== CODING ==============================
-ProfileIdc      66          # value of profile_idc (or 0 for auto detection)
-
-InitialQP       24			# Quantization parameters for base quality layer
-#================================ RATE CONTROL ===============================
-SpatialBitrate		600		# Unit: kbps, controled by DisableRC also
-#============================== MultiSlice Slice Argument ==============================
-# for S/M Slice(s) mode settings
-SliceMode			0		# 0: sigle slice mode; >0: multiple slices mode, see below;
-SliceSize			1500
-SliceNum			1		# multiple slices number specified
-
-SlicesAssign0		960		# count number of MBs in slice #0
-SlicesAssign1		0		# count number of MBs in slice #1
-SlicesAssign2		0		# count number of MBs in slice #2
-SlicesAssign3		0		# count number of MBs in slice #3 -- seting here is for better testing
-SlicesAssign4		0		# count number of MBs in slice #4
-SlicesAssign5		0		# count number of MBs in slice #5
-SlicesAssign6		0		# count number of MBs in slice #6
-SlicesAssign7		0		# count number of MBs in slice #7
-
-### DESIGN OF SLICE MODE ####
-# 0 SM_SINGLE_SLICE			| SliceNum==1
-# 1 SM_FIXEDSLCNUM_SLICE	| according to SliceNum			| Enabled dynamic slicing for multi-thread
-# 2 SM_RASTER_SLICE			| according to SlicesAssign		| Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
-# 3 SM_ROWMB_SLICE			| according to PictureMBHeight	| Typical of single row of mbs each slice?+ slice size constraint which including re-encoding
-# 4 SM_DYN_SLICE			| according to SliceSize		| Dynamic slicing (have no idea about slice_nums until encoding current frame)
-
+# Layer Configuration File
+
+
+#============================== INPUT / OUTPUT ==============================
+SourceWidth     320                     # Input  frame width
+SourceHeight    192                    # Input  frame height
+FrameRateIn     12                      # Input  frame rate [Hz]
+FrameRateOut    12                     # Output frame rate [Hz]
+InputFile       CiscoVT2people_320x192_12fps.yuv # Input  file
+ReconFile       rec_layer2.yuv          # Reconstructed file
+
+#============================== CODING ==============================
+ProfileIdc      66          # value of profile_idc (or 0 for auto detection)
+
+InitialQP       24			# Quantization parameters for base quality layer
+#================================ RATE CONTROL ===============================
+SpatialBitrate		600		# Unit: kbps, controlled by DisableRC also
+#============================== MultiSlice Slice Argument ==============================
+# for S/M Slice(s) mode settings
+SliceMode			0		# 0: single slice mode; >0: multiple slices mode, see below;
+SliceSize			1500
+SliceNum			1		# multiple slices number specified
+
+SlicesAssign0		960		# count number of MBs in slice #0
+SlicesAssign1		0		# count number of MBs in slice #1
+SlicesAssign2		0		# count number of MBs in slice #2
+SlicesAssign3		0		# count number of MBs in slice #3 -- setting here is for better testing
+SlicesAssign4		0		# count number of MBs in slice #4
+SlicesAssign5		0		# count number of MBs in slice #5
+SlicesAssign6		0		# count number of MBs in slice #6
+SlicesAssign7		0		# count number of MBs in slice #7
+
+### DESIGN OF SLICE MODE ####
+# 0 SM_SINGLE_SLICE			| SliceNum==1
+# 1 SM_FIXEDSLCNUM_SLICE	| according to SliceNum			| Enabled dynamic slicing for multi-thread
+# 2 SM_RASTER_SLICE			| according to SlicesAssign		| Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
+# 3 SM_ROWMB_SLICE			| according to PictureMBHeight	| Typical of single row of mbs each slice?+ slice size constraint which including re-encoding
+# 4 SM_DYN_SLICE			| according to SliceSize		| Dynamic slicing (have no idea about slice_nums until encoding current frame)
+
--- a/testbin/layer2_vd.cfg
+++ b/testbin/layer2_vd.cfg
@@ -1,39 +1,39 @@
-# Layer Configuration File
-
-
-#============================== INPUT / OUTPUT ==============================
-SourceWidth     320                     # Input  frame width
-SourceHeight    192                    # Input  frame height
-FrameRateIn     12                      # Input  frame rate [Hz]
-FrameRateOut    12                     # Output frame rate [Hz]
-InputFile       CiscoVT2people_320x192_12fps.yuv # Input  file
-ReconFile       rec_layer2.yuv          # Reconstructed file
-
-#============================== CODING ==============================
-ProfileIdc      66          # value of profile_idc (or 0 for auto detection)
-
-InitialQP       24			# Quantization parameters for base quality layer
-#================================ RATE CONTROL ===============================
-SpatialBitrate		600		# Unit: kbps, controled by DisableRC also
-#============================== MultiSlice Slice Argument ==============================
-# for S/M Slice(s) mode settings
-SliceMode			0		# 0: sigle slice mode; >0: multiple slices mode, see below;
-SliceSize			1500
-SliceNum			1		# multiple slices number specified
-
-SlicesAssign0		960		# count number of MBs in slice #0
-SlicesAssign1		0		# count number of MBs in slice #1
-SlicesAssign2		0		# count number of MBs in slice #2
-SlicesAssign3		0		# count number of MBs in slice #3 -- seting here is for better testing
-SlicesAssign4		0		# count number of MBs in slice #4
-SlicesAssign5		0		# count number of MBs in slice #5
-SlicesAssign6		0		# count number of MBs in slice #6
-SlicesAssign7		0		# count number of MBs in slice #7
-
-### DESIGN OF SLICE MODE, 100804, Sijia ####
-# 0 SM_SINGLE_SLICE			| SliceNum==1
-# 1 SM_FIXEDSLCNUM_SLICE	| according to SliceNum			| Enabled dynamic slicing for multi-thread
-# 2 SM_RASTER_SLICE			| according to SlicesAssign		| Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
-# 3 SM_ROWMB_SLICE			| according to PictureMBHeight	| Specially for TP. Typical of single row of mbs each slice?+ slice size constraint which including re-encoding
-# 4 SM_DYN_SLICE			| according to SliceSize		| Dynamic slicing (have no idea about slice_nums until encoding current frame)
-
+# Layer Configuration File
+
+
+#============================== INPUT / OUTPUT ==============================
+SourceWidth     320                     # Input  frame width
+SourceHeight    192                    # Input  frame height
+FrameRateIn     12                      # Input  frame rate [Hz]
+FrameRateOut    12                     # Output frame rate [Hz]
+InputFile       CiscoVT2people_320x192_12fps.yuv # Input  file
+ReconFile       rec_layer2.yuv          # Reconstructed file
+
+#============================== CODING ==============================
+ProfileIdc      66          # value of profile_idc (or 0 for auto detection)
+
+InitialQP       24			# Quantization parameters for base quality layer
+#================================ RATE CONTROL ===============================
+SpatialBitrate		600		# Unit: kbps, controlled by DisableRC also
+#============================== MultiSlice Slice Argument ==============================
+# for S/M Slice(s) mode settings
+SliceMode			0		# 0: single slice mode; >0: multiple slices mode, see below;
+SliceSize			1500
+SliceNum			1		# multiple slices number specified
+
+SlicesAssign0		960		# count number of MBs in slice #0
+SlicesAssign1		0		# count number of MBs in slice #1
+SlicesAssign2		0		# count number of MBs in slice #2
+SlicesAssign3		0		# count number of MBs in slice #3 -- setting here is for better testing
+SlicesAssign4		0		# count number of MBs in slice #4
+SlicesAssign5		0		# count number of MBs in slice #5
+SlicesAssign6		0		# count number of MBs in slice #6
+SlicesAssign7		0		# count number of MBs in slice #7
+
+### DESIGN OF SLICE MODE, 100804, Sijia ####
+# 0 SM_SINGLE_SLICE			| SliceNum==1
+# 1 SM_FIXEDSLCNUM_SLICE	| according to SliceNum			| Enabled dynamic slicing for multi-thread
+# 2 SM_RASTER_SLICE			| according to SlicesAssign		| Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
+# 3 SM_ROWMB_SLICE			| according to PictureMBHeight	| Specially for TP. Typical of single row of mbs each slice?+ slice size constraint which including re-encoding
+# 4 SM_DYN_SLICE			| according to SliceSize		| Dynamic slicing (have no idea about slice_nums until encoding current frame)
+
--- a/testbin/layer2_vd_rc.cfg
+++ b/testbin/layer2_vd_rc.cfg
@@ -1,39 +1,39 @@
-# Layer Configuration File
-
-
-#============================== INPUT / OUTPUT ==============================
-SourceWidth     320                     # Input  frame width
-SourceHeight    192                    # Input  frame height
-FrameRateIn     12                      # Input  frame rate [Hz]
-FrameRateOut    12                     # Output frame rate [Hz]
-InputFile       CiscoVT2people_320x192_12fps.yuv # Input  file
-ReconFile       rec_layer2.yuv          # Reconstructed file
-
-#============================== CODING ==============================
-ProfileIdc      66          # value of profile_idc (or 0 for auto detection)
-
-InitialQP       24			# Quantization parameters for base quality layer
-#================================ RATE CONTROL ===============================
-SpatialBitrate		600		# Unit: kbps, controled by DisableRC also
-#============================== MultiSlice Slice Argument ==============================
-# for S/M Slice(s) mode settings
-SliceMode			0		# 0: sigle slice mode; >0: multiple slices mode, see below;
-SliceSize			1500
-SliceNum			1		# multiple slices number specified
-
-SlicesAssign0		960		# count number of MBs in slice #0
-SlicesAssign1		0		# count number of MBs in slice #1
-SlicesAssign2		0		# count number of MBs in slice #2
-SlicesAssign3		0		# count number of MBs in slice #3 -- seting here is for better testing
-SlicesAssign4		0		# count number of MBs in slice #4
-SlicesAssign5		0		# count number of MBs in slice #5
-SlicesAssign6		0		# count number of MBs in slice #6
-SlicesAssign7		0		# count number of MBs in slice #7
-
-### DESIGN OF SLICE MODE, 100804, Sijia ####
-# 0 SM_SINGLE_SLICE			| SliceNum==1
-# 1 SM_FIXEDSLCNUM_SLICE	| according to SliceNum			| Enabled dynamic slicing for multi-thread
-# 2 SM_RASTER_SLICE			| according to SlicesAssign		| Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
-# 3 SM_ROWMB_SLICE			| according to PictureMBHeight	| Specially for TP. Typical of single row of mbs each slice?+ slice size constraint which including re-encoding
-# 4 SM_DYN_SLICE			| according to SliceSize		| Dynamic slicing (have no idea about slice_nums until encoding current frame)
-
+# Layer Configuration File
+
+
+#============================== INPUT / OUTPUT ==============================
+SourceWidth     320                     # Input  frame width
+SourceHeight    192                    # Input  frame height
+FrameRateIn     12                      # Input  frame rate [Hz]
+FrameRateOut    12                     # Output frame rate [Hz]
+InputFile       CiscoVT2people_320x192_12fps.yuv # Input  file
+ReconFile       rec_layer2.yuv          # Reconstructed file
+
+#============================== CODING ==============================
+ProfileIdc      66          # value of profile_idc (or 0 for auto detection)
+
+InitialQP       24			# Quantization parameters for base quality layer
+#================================ RATE CONTROL ===============================
+SpatialBitrate		600		# Unit: kbps, controlled by DisableRC also
+#============================== MultiSlice Slice Argument ==============================
+# for S/M Slice(s) mode settings
+SliceMode			0		# 0: single slice mode; >0: multiple slices mode, see below;
+SliceSize			1500
+SliceNum			1		# multiple slices number specified
+
+SlicesAssign0		960		# count number of MBs in slice #0
+SlicesAssign1		0		# count number of MBs in slice #1
+SlicesAssign2		0		# count number of MBs in slice #2
+SlicesAssign3		0		# count number of MBs in slice #3 -- setting here is for better testing
+SlicesAssign4		0		# count number of MBs in slice #4
+SlicesAssign5		0		# count number of MBs in slice #5
+SlicesAssign6		0		# count number of MBs in slice #6
+SlicesAssign7		0		# count number of MBs in slice #7
+
+### DESIGN OF SLICE MODE, 100804, Sijia ####
+# 0 SM_SINGLE_SLICE			| SliceNum==1
+# 1 SM_FIXEDSLCNUM_SLICE	| according to SliceNum			| Enabled dynamic slicing for multi-thread
+# 2 SM_RASTER_SLICE			| according to SlicesAssign		| Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
+# 3 SM_ROWMB_SLICE			| according to PictureMBHeight	| Specially for TP. Typical of single row of mbs each slice?+ slice size constraint which including re-encoding
+# 4 SM_DYN_SLICE			| according to SliceSize		| Dynamic slicing (have no idea about slice_nums until encoding current frame)
+
--- a/testbin/welsenc.cfg
+++ b/testbin/welsenc.cfg
@@ -1,63 +1,63 @@
-# Cisco Scalable H.264/AVC Extension Encoder Configuration File
-
-#============================== GENERAL ==============================
-OutputFile              test.264               # Bitstream file
-MaxFrameRate            30                     # Maximum frame rate [Hz]
-FramesToBeEncoded       -1                    # Number of frames (at input frame rate)
-
-GOPSize                 4                     # GOP Size (at maximum frame rate), 16
-IntraPeriod            0                    # Intra Period ( multipler of GoP size or -1)
-EnableSpsPpsIDAddition  1
-
-EnableFrameCropping 	1 		       # enable frame cropping flag
-
-#============================== LOOP FILTER ==============================
-LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off, 
-                                               # 2: on except for slice boundaries,
-                                               # 3: two stage. slice boundries on in second stage
-                                               # 4: Luma on but Chroma off (w.r.t. idc=0)  
-                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
-                                               # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-LoopFilterAlphaC0Offset	0                      # AlphaOffset(-6..+6): valid range
-LoopFilterBetaOffset	0                      # BetaOffset (-6..+6): valid range
-
-InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off, 
-                                               # 2: on except for slice boundaries,
-                                               # 3: two stage. slice boundries on in second stage
-                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)  
-                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
-                                               # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-InterLayerLoopFilterAlphaC0Offset 0            # AlphaOffset for inter-layer deblocking
-InterLayerLoopFilterBetaOffset    0            # BetaOffset for inter-layer deblocking
-
-#============================== SOFTWARE IMPLEMENTATION ==============================
-MultipleThreadIdc			    1	# 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
-
-#============================== RATE CONTROL ==============================
-EnableRC				1						# ENABLE RC
-TargetBitrate			5000				    # Unit: kbps, controled by EnableRC also
-
-#============================== DENOISE CONTROL ==============================
-EnableDenoise                   0              # Enable Denoise (1: enable, 0: disable)
-
-#============================== SCENE CHANGE DETECTION CONTROL =======================
-EnableSceneChangeDetection			1			# Enable Scene Change Detection (1: enable, 0: disable)
-
-#============================== BACKGROUND DETECTION CONTROL ==============================
-EnableBackgroundDetection		 1     # BGD control(1: enable, 0: disable)
-
-#============================== ADAPTIVE QUANTIZATION CONTROL =======================
-EnableAdaptiveQuantization			1			# Enable Adaptive Quantization (1: enable, 0: disable)
-
-#============================== LONG TERM REFERENCE CONTROL ==============================
-EnableLongTermReference             0              # Enable Long Term Reference (1: enable, 0: disable)
-LtrMarkPeriod                       30             # Long Term Reference Marking Period 
-
-#============================== LAYER DEFINITION ==============================
-PrefixNALAddingCtrl		0						# Control flag of adding prefix unit (0: off, 1: on)
-												# It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
-												# Can be disabled when no inter spatial layer prediction in case of its value as 0
-NumLayers              1                      # Number of layers
-//LayerCfg                layer0.cfg		# Layer 0 configuration file
-//LayerCfg                layer1.cfg		# Layer 1 configuration file
-LayerCfg                layer2.cfg		# Layer 2 configuration file
+# Cisco Scalable H.264/AVC Extension Encoder Configuration File
+
+#============================== GENERAL ==============================
+OutputFile              test.264               # Bitstream file
+MaxFrameRate            30                     # Maximum frame rate [Hz]
+FramesToBeEncoded       -1                    # Number of frames (at input frame rate)
+
+GOPSize                 4                     # GOP Size (at maximum frame rate), 16
+IntraPeriod            0                    # Intra Period (multiple of GOP size, or -1)
+EnableSpsPpsIDAddition  1
+
+EnableFrameCropping 	1 		       # enable frame cropping flag
+
+#============================== LOOP FILTER ==============================
+LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off,
+                                               # 2: on except for slice boundaries,
+                                               # 3: two stage. slice boundaries on in second stage
+                                               # 4: Luma on but Chroma off (w.r.t. idc=0)
+                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
+                                               # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3))
+LoopFilterAlphaC0Offset	0                      # AlphaOffset(-6..+6): valid range
+LoopFilterBetaOffset	0                      # BetaOffset (-6..+6): valid range
+
+InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off,
+                                               # 2: on except for slice boundaries,
+                                               # 3: two stage. slice boundaries on in second stage
+                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)
+                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
+                                               # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3))
+InterLayerLoopFilterAlphaC0Offset 0            # AlphaOffset for inter-layer deblocking
+InterLayerLoopFilterBetaOffset    0            # BetaOffset for inter-layer deblocking
+
+#============================== SOFTWARE IMPLEMENTATION ==============================
+MultipleThreadIdc			    1	# 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
+
+#============================== RATE CONTROL ==============================
+EnableRC				1						# ENABLE RC
+TargetBitrate			5000				    # Unit: kbps, also controlled by EnableRC
+
+#============================== DENOISE CONTROL ==============================
+EnableDenoise                   0              # Enable Denoise (1: enable, 0: disable)
+
+#============================== SCENE CHANGE DETECTION CONTROL =======================
+EnableSceneChangeDetection			1			# Enable Scene Change Detection (1: enable, 0: disable)
+
+#============================== BACKGROUND DETECTION CONTROL ==============================
+EnableBackgroundDetection		 1     # BGD control(1: enable, 0: disable)
+
+#============================== ADAPTIVE QUANTIZATION CONTROL =======================
+EnableAdaptiveQuantization			1			# Enable Adaptive Quantization (1: enable, 0: disable)
+
+#============================== LONG TERM REFERENCE CONTROL ==============================
+EnableLongTermReference             0              # Enable Long Term Reference (1: enable, 0: disable)
+LtrMarkPeriod                       30             # Long Term Reference Marking Period
+
+#============================== LAYER DEFINITION ==============================
+PrefixNALAddingCtrl		0						# Control flag of adding prefix unit (0: off, 1: on)
+												# It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
+												# Can be disabled when no inter spatial layer prediction in case of its value as 0
+NumLayers              1                      # Number of layers
+//LayerCfg                layer0.cfg		# Layer 0 configuration file
+//LayerCfg                layer1.cfg		# Layer 1 configuration file
+LayerCfg                layer2.cfg		# Layer 2 configuration file
--- a/testbin/welsenc_vd_1d.cfg
+++ b/testbin/welsenc_vd_1d.cfg
@@ -1,63 +1,63 @@
-# Cisco Scalable H.264/AVC Extension Encoder Configuration File
-
-#============================== GENERAL ==============================
-OutputFile              test_vd_1d.264               # Bitstream file
-MaxFrameRate            30                     # Maximum frame rate [Hz]
-FramesToBeEncoded       -1                    # Number of frames (at input frame rate)
-
-GOPSize                 4                     # GOP Size (at maximum frame rate), 16
-IntraPeriod            0                    # Intra Period ( multipler of GoP size or -1)
-EnableSpsPpsIDAddition  1
-
-EnableFrameCropping 	1 		       # enable frame cropping flag
-
-#============================== LOOP FILTER ==============================
-LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off, 
-                                               # 2: on except for slice boundaries,
-                                               # 3: two stage. slice boundries on in second stage
-                                               # 4: Luma on but Chroma off (w.r.t. idc=0)  
-                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
-                                               # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-LoopFilterAlphaC0Offset	0                      # AlphaOffset(-6..+6): valid range
-LoopFilterBetaOffset	0                      # BetaOffset (-6..+6): valid range
-
-InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off, 
-                                               # 2: on except for slice boundaries,
-                                               # 3: two stage. slice boundries on in second stage
-                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)  
-                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
-                                               # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-InterLayerLoopFilterAlphaC0Offset 0            # AlphaOffset for inter-layer deblocking
-InterLayerLoopFilterBetaOffset    0            # BetaOffset for inter-layer deblocking
-
-#============================== SOFTWARE IMPLEMENTATION ==============================
-MultipleThreadIdc			    1	# 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
-
-#============================== RATE CONTROL ==============================
-EnableRC				0						# ENABLE RC
-TargetBitrate			5000				    # Unit: kbps, controled by EnableRC also
-
-#============================== DENOISE CONTROL ==============================
-EnableDenoise                   0              # Enable Denoise (1: enable, 0: disable)
-
-#============================== SCENE CHANGE DETECTION CONTROL =======================
-EnableSceneChangeDetection			1			# Enable Scene Change Detection (1: enable, 0: disable)
-
-#============================== BACKGROUND DETECTION CONTROL ==============================
-EnableBackgroundDetection		 1     # BGD control(1: enable, 0: disable)
-
-#============================== ADAPTIVE QUANTIZATION CONTROL =======================
-EnableAdaptiveQuantization			0			# Enable Adaptive Quantization (1: enable, 0: disable)
-
-#============================== LONG TERM REFERENCE CONTROL ==============================
-EnableLongTermReference             1              # Enable Long Term Reference (1: enable, 0: disable)
-LtrMarkPeriod                       30             # Long Term Reference Marking Period 
-
-#============================== LAYER DEFINITION ==============================
-PrefixNALAddingCtrl		0						# Control flag of adding prefix unit (0: off, 1: on)
-												# It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
-												# Can be disabled when no inter spatial layer prediction in case of its value as 0
-NumLayers              1                      # Number of layers
-//LayerCfg                layer0_vd.cfg		# Layer 0 configuration file
-//LayerCfg                layer1_vd.cfg		# Layer 1 configuration file
-LayerCfg                layer2_vd.cfg		# Layer 2 configuration file
+# Cisco Scalable H.264/AVC Extension Encoder Configuration File
+
+#============================== GENERAL ==============================
+OutputFile              test_vd_1d.264               # Bitstream file
+MaxFrameRate            30                     # Maximum frame rate [Hz]
+FramesToBeEncoded       -1                    # Number of frames (at input frame rate)
+
+GOPSize                 4                     # GOP Size (at maximum frame rate), 16
+IntraPeriod            0                    # Intra Period (multiple of GOP size, or -1)
+EnableSpsPpsIDAddition  1
+
+EnableFrameCropping 	1 		       # enable frame cropping flag
+
+#============================== LOOP FILTER ==============================
+LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off,
+                                               # 2: on except for slice boundaries,
+                                               # 3: two stage. slice boundaries on in second stage
+                                               # 4: Luma on but Chroma off (w.r.t. idc=0)
+                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
+                                               # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3))
+LoopFilterAlphaC0Offset	0                      # AlphaOffset(-6..+6): valid range
+LoopFilterBetaOffset	0                      # BetaOffset (-6..+6): valid range
+
+InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off,
+                                               # 2: on except for slice boundaries,
+                                               # 3: two stage. slice boundaries on in second stage
+                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)
+                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
+                                               # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3))
+InterLayerLoopFilterAlphaC0Offset 0            # AlphaOffset for inter-layer deblocking
+InterLayerLoopFilterBetaOffset    0            # BetaOffset for inter-layer deblocking
+
+#============================== SOFTWARE IMPLEMENTATION ==============================
+MultipleThreadIdc			    1	# 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
+
+#============================== RATE CONTROL ==============================
+EnableRC				0						# ENABLE RC
+TargetBitrate			5000				    # Unit: kbps, also controlled by EnableRC
+
+#============================== DENOISE CONTROL ==============================
+EnableDenoise                   0              # Enable Denoise (1: enable, 0: disable)
+
+#============================== SCENE CHANGE DETECTION CONTROL =======================
+EnableSceneChangeDetection			1			# Enable Scene Change Detection (1: enable, 0: disable)
+
+#============================== BACKGROUND DETECTION CONTROL ==============================
+EnableBackgroundDetection		 1     # BGD control(1: enable, 0: disable)
+
+#============================== ADAPTIVE QUANTIZATION CONTROL =======================
+EnableAdaptiveQuantization			0			# Enable Adaptive Quantization (1: enable, 0: disable)
+
+#============================== LONG TERM REFERENCE CONTROL ==============================
+EnableLongTermReference             1              # Enable Long Term Reference (1: enable, 0: disable)
+LtrMarkPeriod                       30             # Long Term Reference Marking Period
+
+#============================== LAYER DEFINITION ==============================
+PrefixNALAddingCtrl		0						# Control flag of adding prefix unit (0: off, 1: on)
+												# It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
+												# Can be disabled when no inter spatial layer prediction in case of its value as 0
+NumLayers              1                      # Number of layers
+//LayerCfg                layer0_vd.cfg		# Layer 0 configuration file
+//LayerCfg                layer1_vd.cfg		# Layer 1 configuration file
+LayerCfg                layer2_vd.cfg		# Layer 2 configuration file
--- a/testbin/welsenc_vd_rc.cfg
+++ b/testbin/welsenc_vd_rc.cfg
@@ -1,63 +1,63 @@
-# Cisco Scalable H.264/AVC Extension Encoder Configuration File
-
-#============================== GENERAL ==============================
-OutputFile              test_vd_rc.264               # Bitstream file
-MaxFrameRate            30                     # Maximum frame rate [Hz]
-FramesToBeEncoded       -1                    # Number of frames (at input frame rate), -1
-
-GOPSize                 8                     # GOP Size (at maximum frame rate), 16
-IntraPeriod            0                    # Intra Period ( multipler of GoP size or -1)
-EnableSpsPpsIDAddition  1
-
-EnableFrameCropping 	1 		       # enable frame cropping flag
-
-#============================== LOOP FILTER ==============================
-LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off, 
-                                               # 2: on except for slice boundaries,
-                                               # 3: two stage. slice boundries on in second stage
-                                               # 4: Luma on but Chroma off (w.r.t. idc=0)  
-                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
-                                               # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-LoopFilterAlphaC0Offset	0                      # AlphaOffset(-6..+6): valid range
-LoopFilterBetaOffset	0                      # BetaOffset (-6..+6): valid range
-
-InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off, 
-                                               # 2: on except for slice boundaries,
-                                               # 3: two stage. slice boundries on in second stage
-                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)  
-                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
-                                               # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-InterLayerLoopFilterAlphaC0Offset 0            # AlphaOffset for inter-layer deblocking
-InterLayerLoopFilterBetaOffset    0            # BetaOffset for inter-layer deblocking
-
-#============================== SOFTWARE IMPLEMENTATION ==============================
-MultipleThreadIdc			    1	# 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
-
-#============================== RATE CONTROL ==============================
-EnableRC			1						# ENABLE RC
-TargetBitrate			600				    # Unit: kbps, controled by EnableRC also
-
-#============================== DENOISE CONTROL ==============================
-EnableDenoise                   1              # Enable Denoise (1: enable, 0: disable)
-
-#============================== SCENE CHANGE DETECTION CONTROL =======================
-EnableSceneChangeDetection			1			# Enable Scene Change Detection (1: enable, 0: disable)
-
-#============================== BACKGROUND DETECTION CONTROL ==============================
-EnableBackgroundDetection		 1     # BGD control(1: enable, 0: disable)
-
-#============================== ADAPTIVE QUANTIZATION CONTROL =======================
-EnableAdaptiveQuantization			1			# Enable Adaptive Quantization (1: enable, 0: disable)
-
-#============================== LONG TERM REFERENCE CONTROL ==============================
-EnableLongTermReference             1              # Enable Long Term Reference (1: enable, 0: disable)
-LtrMarkPeriod                       30             # Long Term Reference Marking Period 
-
-#============================== LAYER DEFINITION ==============================
-PrefixNALAddingCtrl		0						# Control flag of adding prefix unit (0: off, 1: on)
-												# It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
-												# Can be disabled when no inter spatial layer prediction in case of its value as 0
-NumLayers              1                      # Number of layers
-//LayerCfg                layer0_vd.cfg		# Layer 0 configuration file
-//LayerCfg                layer1_vd.cfg		# Layer 1 configuration file
-LayerCfg                layer2_vd_rc.cfg		# Layer 2 configuration file
+# Cisco Scalable H.264/AVC Extension Encoder Configuration File
+
+#============================== GENERAL ==============================
+OutputFile              test_vd_rc.264               # Bitstream file
+MaxFrameRate            30                     # Maximum frame rate [Hz]
+FramesToBeEncoded       -1                    # Number of frames (at input frame rate), -1
+
+GOPSize                 8                     # GOP Size (at maximum frame rate), 16
+IntraPeriod            0                    # Intra Period (multiple of GOP size, or -1)
+EnableSpsPpsIDAddition  1
+
+EnableFrameCropping 	1 		       # enable frame cropping flag
+
+#============================== LOOP FILTER ==============================
+LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off,
+                                               # 2: on except for slice boundaries,
+                                               # 3: two stage. slice boundaries on in second stage
+                                               # 4: Luma on but Chroma off (w.r.t. idc=0)
+                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
+                                               # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3))
+LoopFilterAlphaC0Offset	0                      # AlphaOffset(-6..+6): valid range
+LoopFilterBetaOffset	0                      # BetaOffset (-6..+6): valid range
+
+InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off,
+                                               # 2: on except for slice boundaries,
+                                               # 3: two stage. slice boundaries on in second stage
+                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)
+                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
+                                               # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3))
+InterLayerLoopFilterAlphaC0Offset 0            # AlphaOffset for inter-layer deblocking
+InterLayerLoopFilterBetaOffset    0            # BetaOffset for inter-layer deblocking
+
+#============================== SOFTWARE IMPLEMENTATION ==============================
+MultipleThreadIdc			    1	# 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
+
+#============================== RATE CONTROL ==============================
+EnableRC			1						# ENABLE RC
+TargetBitrate			600				    # Unit: kbps, also controlled by EnableRC
+
+#============================== DENOISE CONTROL ==============================
+EnableDenoise                   1              # Enable Denoise (1: enable, 0: disable)
+
+#============================== SCENE CHANGE DETECTION CONTROL =======================
+EnableSceneChangeDetection			1			# Enable Scene Change Detection (1: enable, 0: disable)
+
+#============================== BACKGROUND DETECTION CONTROL ==============================
+EnableBackgroundDetection		 1     # BGD control(1: enable, 0: disable)
+
+#============================== ADAPTIVE QUANTIZATION CONTROL =======================
+EnableAdaptiveQuantization			1			# Enable Adaptive Quantization (1: enable, 0: disable)
+
+#============================== LONG TERM REFERENCE CONTROL ==============================
+EnableLongTermReference             1              # Enable Long Term Reference (1: enable, 0: disable)
+LtrMarkPeriod                       30             # Long Term Reference Marking Period
+
+#============================== LAYER DEFINITION ==============================
+PrefixNALAddingCtrl		0						# Control flag of adding prefix unit (0: off, 1: on)
+												# It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
+												# Can be disabled when no inter spatial layer prediction in case of its value as 0
+NumLayers              1                      # Number of layers
+//LayerCfg                layer0_vd.cfg		# Layer 0 configuration file
+//LayerCfg                layer1_vd.cfg		# Layer 1 configuration file
+LayerCfg                layer2_vd_rc.cfg		# Layer 2 configuration file